
aarch64: Fold vget_low_* intrinsics to BIT_FIELD_REF [PR102171]

Message ID 20240514012926.29048-1-quic_pzheng@quicinc.com
State New
Series aarch64: Fold vget_low_* intrinsics to BIT_FIELD_REF [PR102171]

Commit Message

Pengxuan Zheng May 14, 2024, 1:29 a.m. UTC
This patch folds vget_low_* intrinsics to BIT_FIELD_REF to open up more
optimization opportunities for gimple optimizers.

While we are here, we also remove the vget_low_* definitions from arm_neon.h and
use the new intrinsics framework.
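
For illustration only (an editorial sketch, not part of the patch): on a
little-endian target the fold turns an intrinsic call like the one below into
BIT_FIELD_REF <__a, 64, 0> in gimple (offset 64 on big-endian), which is what
the new vget_low_2.c / vget_low_2_be.c tests scan for in the "optimized" dump.

  #include <arm_neon.h>

  int16x4_t
  low_half (int16x8_t __a)
  {
    /* Folded by aarch64_general_fold_builtin to a 64-bit
       BIT_FIELD_REF of __a instead of a target builtin call.  */
    return vget_low_s16 (__a);
  }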

        PR target/102171

gcc/ChangeLog:

	* config/aarch64/aarch64-builtins.cc (AARCH64_SIMD_VGET_LOW_BUILTINS):
	New macro to create definitions for all vget_low intrinsics.
	(VGET_LOW_BUILTIN): Likewise.
	(enum aarch64_builtins): Add vget_low function codes.
	(aarch64_general_fold_builtin): Fold vget_low calls.
	* config/aarch64/aarch64-simd-builtins.def: Delete vget_low builtins.
	* config/aarch64/aarch64-simd.md (aarch64_get_low<mode>): Delete.
	(aarch64_vget_lo_halfv8bf): Likewise.
	* config/aarch64/arm_neon.h (__attribute__): Delete.
	(vget_low_f16): Likewise.
	(vget_low_f32): Likewise.
	(vget_low_f64): Likewise.
	(vget_low_p8): Likewise.
	(vget_low_p16): Likewise.
	(vget_low_p64): Likewise.
	(vget_low_s8): Likewise.
	(vget_low_s16): Likewise.
	(vget_low_s32): Likewise.
	(vget_low_s64): Likewise.
	(vget_low_u8): Likewise.
	(vget_low_u16): Likewise.
	(vget_low_u32): Likewise.
	(vget_low_u64): Likewise.
	(vget_low_bf16): Likewise.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/pr113573.c: Replace __builtin_aarch64_get_lowv8hi
	with vget_low_s16.
	* gcc.target/aarch64/vget_low_2.c: New test.
	* gcc.target/aarch64/vget_low_2_be.c: New test.

Signed-off-by: Pengxuan Zheng <quic_pzheng@quicinc.com>
---
 gcc/config/aarch64/aarch64-builtins.cc        |  60 ++++++++++
 gcc/config/aarch64/aarch64-simd-builtins.def  |   5 +-
 gcc/config/aarch64/aarch64-simd.md            |  23 +---
 gcc/config/aarch64/arm_neon.h                 | 105 ------------------
 gcc/testsuite/gcc.target/aarch64/pr113573.c   |   2 +-
 gcc/testsuite/gcc.target/aarch64/vget_low_2.c |  30 +++++
 .../gcc.target/aarch64/vget_low_2_be.c        |  31 ++++++
 7 files changed, 124 insertions(+), 132 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/vget_low_2.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/vget_low_2_be.c

Comments

Richard Sandiford May 20, 2024, 9:56 a.m. UTC | #1
Pengxuan Zheng <quic_pzheng@quicinc.com> writes:
> This patch folds vget_low_* intrinsics to BIT_FIELD_REF to open up more
> optimization opportunities for gimple optimizers.
>
> While we are here, we also remove the vget_low_* definitions from arm_neon.h and
> use the new intrinsics framework.
>
>         PR target/102171
>
> gcc/ChangeLog:
>
> 	* config/aarch64/aarch64-builtins.cc (AARCH64_SIMD_VGET_LOW_BUILTINS):
> 	New macro to create definitions for all vget_low intrinsics.
> 	(VGET_LOW_BUILTIN): Likewise.
> 	(enum aarch64_builtins): Add vget_low function codes.
> 	(aarch64_general_fold_builtin): Fold vget_low calls.
> 	* config/aarch64/aarch64-simd-builtins.def: Delete vget_low builtins.
> 	* config/aarch64/aarch64-simd.md (aarch64_get_low<mode>): Delete.
> 	(aarch64_vget_lo_halfv8bf): Likewise.
> 	* config/aarch64/arm_neon.h (__attribute__): Delete.
> 	(vget_low_f16): Likewise.
> 	(vget_low_f32): Likewise.
> 	(vget_low_f64): Likewise.
> 	(vget_low_p8): Likewise.
> 	(vget_low_p16): Likewise.
> 	(vget_low_p64): Likewise.
> 	(vget_low_s8): Likewise.
> 	(vget_low_s16): Likewise.
> 	(vget_low_s32): Likewise.
> 	(vget_low_s64): Likewise.
> 	(vget_low_u8): Likewise.
> 	(vget_low_u16): Likewise.
> 	(vget_low_u32): Likewise.
> 	(vget_low_u64): Likewise.
> 	(vget_low_bf16): Likewise.
>
> gcc/testsuite/ChangeLog:
>
> 	* gcc.target/aarch64/pr113573.c: Replace __builtin_aarch64_get_lowv8hi
> 	with vget_low_s16.
> 	* gcc.target/aarch64/vget_low_2.c: New test.
> 	* gcc.target/aarch64/vget_low_2_be.c: New test.

Ok, thanks.  I suppose the patch has the side effect of allowing
vget_low_bf16 to be called without +bf16.  IMO that's the correct
behaviour though, and is consistent with how we handle reinterprets.
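
For concreteness, an editorial sketch (not from the original thread) of the
side effect mentioned above: the builtin is now folded in gimple rather than
expanded through the +bf16-gated aarch64_vget_lo_halfv8bf pattern, so a call
such as the following is accepted without +bf16, much like the vreinterpret
intrinsics.

  #include <arm_neon.h>

  /* Assumes a compiler with this patch applied; no +bf16 in the
     target options is needed for the call itself.  */
  bfloat16x4_t
  low_bf16 (bfloat16x8_t __a)
  {
    return vget_low_bf16 (__a);
  }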

Richard

> Signed-off-by: Pengxuan Zheng <quic_pzheng@quicinc.com>
> ---
>  gcc/config/aarch64/aarch64-builtins.cc        |  60 ++++++++++
>  gcc/config/aarch64/aarch64-simd-builtins.def  |   5 +-
>  gcc/config/aarch64/aarch64-simd.md            |  23 +---
>  gcc/config/aarch64/arm_neon.h                 | 105 ------------------
>  gcc/testsuite/gcc.target/aarch64/pr113573.c   |   2 +-
>  gcc/testsuite/gcc.target/aarch64/vget_low_2.c |  30 +++++
>  .../gcc.target/aarch64/vget_low_2_be.c        |  31 ++++++
>  7 files changed, 124 insertions(+), 132 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/vget_low_2.c
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/vget_low_2_be.c
>
> diff --git a/gcc/config/aarch64/aarch64-builtins.cc b/gcc/config/aarch64/aarch64-builtins.cc
> index 75d21de1401..4afe7c86ae3 100644
> --- a/gcc/config/aarch64/aarch64-builtins.cc
> +++ b/gcc/config/aarch64/aarch64-builtins.cc
> @@ -658,6 +658,23 @@ static aarch64_simd_builtin_datum aarch64_simd_builtin_data[] = {
>    VREINTERPRET_BUILTINS \
>    VREINTERPRETQ_BUILTINS
>  
> +#define AARCH64_SIMD_VGET_LOW_BUILTINS \
> +  VGET_LOW_BUILTIN(f16) \
> +  VGET_LOW_BUILTIN(f32) \
> +  VGET_LOW_BUILTIN(f64) \
> +  VGET_LOW_BUILTIN(p8) \
> +  VGET_LOW_BUILTIN(p16) \
> +  VGET_LOW_BUILTIN(p64) \
> +  VGET_LOW_BUILTIN(s8) \
> +  VGET_LOW_BUILTIN(s16) \
> +  VGET_LOW_BUILTIN(s32) \
> +  VGET_LOW_BUILTIN(s64) \
> +  VGET_LOW_BUILTIN(u8) \
> +  VGET_LOW_BUILTIN(u16) \
> +  VGET_LOW_BUILTIN(u32) \
> +  VGET_LOW_BUILTIN(u64) \
> +  VGET_LOW_BUILTIN(bf16)
> +
>  typedef struct
>  {
>    const char *name;
> @@ -697,6 +714,9 @@ typedef struct
>  #define VREINTERPRET_BUILTIN(A, B, L) \
>    AARCH64_SIMD_BUILTIN_VREINTERPRET##L##_##A##_##B,
>  
> +#define VGET_LOW_BUILTIN(A) \
> +  AARCH64_SIMD_BUILTIN_VGET_LOW_##A,
> +
>  #undef VAR1
>  #define VAR1(T, N, MAP, FLAG, A) \
>    AARCH64_SIMD_BUILTIN_##T##_##N##A,
> @@ -732,6 +752,7 @@ enum aarch64_builtins
>    AARCH64_CRC32_BUILTIN_MAX,
>    /* SIMD intrinsic builtins.  */
>    AARCH64_SIMD_VREINTERPRET_BUILTINS
> +  AARCH64_SIMD_VGET_LOW_BUILTINS
>    /* ARMv8.3-A Pointer Authentication Builtins.  */
>    AARCH64_PAUTH_BUILTIN_AUTIA1716,
>    AARCH64_PAUTH_BUILTIN_PACIA1716,
> @@ -823,8 +844,37 @@ static aarch64_fcmla_laneq_builtin_datum aarch64_fcmla_lane_builtin_data[] = {
>       && SIMD_INTR_QUAL(A) == SIMD_INTR_QUAL(B) \
>    },
>  
> +#undef VGET_LOW_BUILTIN
> +#define VGET_LOW_BUILTIN(A) \
> +  {"vget_low_" #A, \
> +   AARCH64_SIMD_BUILTIN_VGET_LOW_##A, \
> +   2, \
> +   { SIMD_INTR_MODE(A, d), SIMD_INTR_MODE(A, q) }, \
> +   { SIMD_INTR_QUAL(A), SIMD_INTR_QUAL(A) }, \
> +   FLAG_AUTO_FP, \
> +   false \
> +  },
> +
> +#define AARCH64_SIMD_VGET_LOW_BUILTINS \
> +  VGET_LOW_BUILTIN(f16) \
> +  VGET_LOW_BUILTIN(f32) \
> +  VGET_LOW_BUILTIN(f64) \
> +  VGET_LOW_BUILTIN(p8) \
> +  VGET_LOW_BUILTIN(p16) \
> +  VGET_LOW_BUILTIN(p64) \
> +  VGET_LOW_BUILTIN(s8) \
> +  VGET_LOW_BUILTIN(s16) \
> +  VGET_LOW_BUILTIN(s32) \
> +  VGET_LOW_BUILTIN(s64) \
> +  VGET_LOW_BUILTIN(u8) \
> +  VGET_LOW_BUILTIN(u16) \
> +  VGET_LOW_BUILTIN(u32) \
> +  VGET_LOW_BUILTIN(u64) \
> +  VGET_LOW_BUILTIN(bf16)
> +
>  static const aarch64_simd_intrinsic_datum aarch64_simd_intrinsic_data[] = {
>    AARCH64_SIMD_VREINTERPRET_BUILTINS
> +  AARCH64_SIMD_VGET_LOW_BUILTINS
>  };
>  
>  
> @@ -3216,6 +3266,9 @@ aarch64_fold_builtin_lane_check (tree arg0, tree arg1, tree arg2)
>  #define VREINTERPRET_BUILTIN(A, B, L) \
>    case AARCH64_SIMD_BUILTIN_VREINTERPRET##L##_##A##_##B:
>  
> +#undef VGET_LOW_BUILTIN
> +#define VGET_LOW_BUILTIN(A) \
> +  case AARCH64_SIMD_BUILTIN_VGET_LOW_##A:
>  
>  /* Try to fold a call to the built-in function with subcode FCODE.  The
>     function is passed the N_ARGS arguments in ARGS and it returns a value
> @@ -3235,6 +3288,13 @@ aarch64_general_fold_builtin (unsigned int fcode, tree type,
>  	return fold_build1 (FLOAT_EXPR, type, args[0]);
>        AARCH64_SIMD_VREINTERPRET_BUILTINS
>  	return fold_build1 (VIEW_CONVERT_EXPR, type, args[0]);
> +      AARCH64_SIMD_VGET_LOW_BUILTINS
> +        {
> +          auto pos = BYTES_BIG_ENDIAN ? 64 : 0;
> +
> +          return fold_build3 (BIT_FIELD_REF, type, args[0], bitsize_int (64),
> +                              bitsize_int (pos));
> +        }
>        case AARCH64_SIMD_BUILTIN_LANE_CHECK:
>  	gcc_assert (n_args == 3);
>  	if (aarch64_fold_builtin_lane_check (args[0], args[1], args[2]))
> diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
> index da16f602a55..a9f0558f8b6 100644
> --- a/gcc/config/aarch64/aarch64-simd-builtins.def
> +++ b/gcc/config/aarch64/aarch64-simd-builtins.def
> @@ -65,8 +65,6 @@
>    BUILTIN_VS (UNOP, ctz, 2, NONE)
>    BUILTIN_VB (UNOP, popcount, 2, NONE)
>  
> -  /* Implemented by aarch64_get_low<mode>.  */
> -  BUILTIN_VQMOV (UNOP, get_low, 0, AUTO_FP)
>    /* Implemented by aarch64_get_high<mode>.  */
>    BUILTIN_VQMOV (UNOP, get_high, 0, AUTO_FP)
>  
> @@ -960,8 +958,7 @@
>    VAR1 (QUADOP_LANE, bfmlalb_lane_q, 0, FP, v4sf)
>    VAR1 (QUADOP_LANE, bfmlalt_lane_q, 0, FP, v4sf)
>  
> -  /* Implemented by aarch64_vget_lo/hi_halfv8bf.  */
> -  VAR1 (UNOP, vget_lo_half, 0, AUTO_FP, v8bf)
> +  /* Implemented by aarch64_vget_hi_halfv8bf.  */
>    VAR1 (UNOP, vget_hi_half, 0, AUTO_FP, v8bf)
>  
>    /* Implemented by aarch64_simd_<sur>mmlav16qi.  */
> diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
> index f8bb973a278..5a28a8e3c6a 100644
> --- a/gcc/config/aarch64/aarch64-simd.md
> +++ b/gcc/config/aarch64/aarch64-simd.md
> @@ -288,17 +288,6 @@ (define_expand "aarch64_get_half<mode>"
>    }
>  )
>  
> -(define_expand "aarch64_get_low<mode>"
> -  [(match_operand:<VHALF> 0 "register_operand")
> -   (match_operand:VQMOV 1 "register_operand")]
> -  "TARGET_FLOAT"
> -  {
> -    rtx lo = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, false);
> -    emit_insn (gen_aarch64_get_half<mode> (operands[0], operands[1], lo));
> -    DONE;
> -  }
> -)
> -
>  (define_expand "aarch64_get_high<mode>"
>    [(match_operand:<VHALF> 0 "register_operand")
>     (match_operand:VQMOV 1 "register_operand")]
> @@ -9774,17 +9763,7 @@ (define_insn "aarch64_bfdot_lane<VBF:isquadop><VDQSF:mode>"
>    [(set_attr "type" "neon_dot<VDQSF:q>")]
>  )
>  
> -;; vget_low/high_bf16
> -(define_expand "aarch64_vget_lo_halfv8bf"
> -  [(match_operand:V4BF 0 "register_operand")
> -   (match_operand:V8BF 1 "register_operand")]
> -  "TARGET_BF16_SIMD"
> -{
> -  rtx p = aarch64_simd_vect_par_cnst_half (V8BFmode, 8, false);
> -  emit_insn (gen_aarch64_get_halfv8bf (operands[0], operands[1], p));
> -  DONE;
> -})
> -
> +;; vget_high_bf16
>  (define_expand "aarch64_vget_hi_halfv8bf"
>    [(match_operand:V4BF 0 "register_operand")
>     (match_operand:V8BF 1 "register_operand")]
> diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
> index 0ee325dccad..92c2c5361cd 100644
> --- a/gcc/config/aarch64/arm_neon.h
> +++ b/gcc/config/aarch64/arm_neon.h
> @@ -3027,104 +3027,6 @@ vsetq_lane_u64 (uint64_t __elem, uint64x2_t __vec, const int __index)
>    return __aarch64_vset_lane_any (__elem, __vec, __index);
>  }
>  
> -__extension__ extern __inline float16x4_t
> -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> -vget_low_f16 (float16x8_t __a)
> -{
> -  return __builtin_aarch64_get_lowv8hf (__a);
> -}
> -
> -__extension__ extern __inline float32x2_t
> -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> -vget_low_f32 (float32x4_t __a)
> -{
> -  return __builtin_aarch64_get_lowv4sf (__a);
> -}
> -
> -__extension__ extern __inline float64x1_t
> -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> -vget_low_f64 (float64x2_t __a)
> -{
> -  return (float64x1_t) {__builtin_aarch64_get_lowv2df (__a)};
> -}
> -
> -__extension__ extern __inline poly8x8_t
> -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> -vget_low_p8 (poly8x16_t __a)
> -{
> -  return (poly8x8_t) __builtin_aarch64_get_lowv16qi ((int8x16_t) __a);
> -}
> -
> -__extension__ extern __inline poly16x4_t
> -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> -vget_low_p16 (poly16x8_t __a)
> -{
> -  return (poly16x4_t) __builtin_aarch64_get_lowv8hi ((int16x8_t) __a);
> -}
> -
> -__extension__ extern __inline poly64x1_t
> -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> -vget_low_p64 (poly64x2_t __a)
> -{
> -  return (poly64x1_t) __builtin_aarch64_get_lowv2di ((int64x2_t) __a);
> -}
> -
> -__extension__ extern __inline int8x8_t
> -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> -vget_low_s8 (int8x16_t __a)
> -{
> -  return  __builtin_aarch64_get_lowv16qi (__a);
> -}
> -
> -__extension__ extern __inline int16x4_t
> -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> -vget_low_s16 (int16x8_t __a)
> -{
> -  return  __builtin_aarch64_get_lowv8hi (__a);
> -}
> -
> -__extension__ extern __inline int32x2_t
> -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> -vget_low_s32 (int32x4_t __a)
> -{
> -  return  __builtin_aarch64_get_lowv4si (__a);
> -}
> -
> -__extension__ extern __inline int64x1_t
> -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> -vget_low_s64 (int64x2_t __a)
> -{
> -  return  (int64x1_t) {__builtin_aarch64_get_lowv2di (__a)};
> -}
> -
> -__extension__ extern __inline uint8x8_t
> -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> -vget_low_u8 (uint8x16_t __a)
> -{
> -  return (uint8x8_t) __builtin_aarch64_get_lowv16qi ((int8x16_t) __a);
> -}
> -
> -__extension__ extern __inline uint16x4_t
> -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> -vget_low_u16 (uint16x8_t __a)
> -{
> -  return (uint16x4_t) __builtin_aarch64_get_lowv8hi ((int16x8_t) __a);
> -}
> -
> -__extension__ extern __inline uint32x2_t
> -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> -vget_low_u32 (uint32x4_t __a)
> -{
> -  return (uint32x2_t) __builtin_aarch64_get_lowv4si ((int32x4_t) __a);
> -}
> -
> -__extension__ extern __inline uint64x1_t
> -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> -vget_low_u64 (uint64x2_t __a)
> -{
> -  return (uint64x1_t) {__builtin_aarch64_get_lowv2di ((int64x2_t) __a)};
> -}
> -
>  __extension__ extern __inline float16x4_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  vget_high_f16 (float16x8_t __a)
> @@ -28479,13 +28381,6 @@ vbfmlaltq_laneq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b,
>    return __builtin_aarch64_bfmlalt_lane_qv4sf (__r, __a, __b, __index);
>  }
>  
> -__extension__ extern __inline bfloat16x4_t
> -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> -vget_low_bf16 (bfloat16x8_t __a)
> -{
> -  return __builtin_aarch64_vget_lo_halfv8bf (__a);
> -}
> -
>  __extension__ extern __inline bfloat16x4_t
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
>  vget_high_bf16 (bfloat16x8_t __a)
> diff --git a/gcc/testsuite/gcc.target/aarch64/pr113573.c b/gcc/testsuite/gcc.target/aarch64/pr113573.c
> index a8e445c6e19..fc8607f7218 100644
> --- a/gcc/testsuite/gcc.target/aarch64/pr113573.c
> +++ b/gcc/testsuite/gcc.target/aarch64/pr113573.c
> @@ -26,7 +26,7 @@ void jsimd_extbgrx_ycc_convert_neon() {
>        int y_l = vmull_laneq_u16(r);
>        uint16x8_t __a = g;
>        jsimd_extbgrx_ycc_convert_neon___trans_tmp_2 =
> -          (uint16x4_t)__builtin_aarch64_get_lowv8hi((int16x8_t)__a);
> +          (uint16x4_t)vget_low_s16((int16x8_t)__a);
>        __a = b;
>        int cb_l = scaled_128_5;
>        int cb_h = scaled_128_5;
> diff --git a/gcc/testsuite/gcc.target/aarch64/vget_low_2.c b/gcc/testsuite/gcc.target/aarch64/vget_low_2.c
> new file mode 100644
> index 00000000000..44414e1c043
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/vget_low_2.c
> @@ -0,0 +1,30 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -fdump-tree-optimized" } */
> +
> +#include <arm_neon.h>
> +
> +#define VARIANTS				\
> +VARIANT (uint8x8_t, uint8x16_t, u8)		\
> +VARIANT (uint16x4_t, uint16x8_t, u16)		\
> +VARIANT (uint32x2_t, uint32x4_t, u32)		\
> +VARIANT (uint64x1_t, uint64x2_t, u64)		\
> +VARIANT (int8x8_t, int8x16_t, s8)		\
> +VARIANT (int16x4_t, int16x8_t, s16)		\
> +VARIANT (int32x2_t, int32x4_t, s32)		\
> +VARIANT (int64x1_t, int64x2_t, s64)		\
> +VARIANT (float16x4_t, float16x8_t, f16)		\
> +VARIANT (float32x2_t, float32x4_t, f32)		\
> +VARIANT (float64x1_t, float64x2_t, f64)		\
> +VARIANT (bfloat16x4_t, bfloat16x8_t, bf16)
> +
> +/* vget_low_* intrinsics should become BIT_FIELD_REF. */
> +#define VARIANT(TYPE64, TYPE128, SUFFIX)	\
> +TYPE64						\
> +test_vget_low_##SUFFIX (TYPE128 vec)		\
> +{						\
> +  return vget_low_##SUFFIX (vec);		\
> +}
> +
> +VARIANTS
> +
> +/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF <vec_\[0-9\]*\\\(D\\\), 64, 0>" 12 "optimized" } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/vget_low_2_be.c b/gcc/testsuite/gcc.target/aarch64/vget_low_2_be.c
> new file mode 100644
> index 00000000000..c3f4c4f0e0d
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/vget_low_2_be.c
> @@ -0,0 +1,31 @@
> +/* { dg-do compile } */
> +/* { dg-require-effective-target stdint_types_mbig_endian } */
> +/* { dg-options "-O3 -fdump-tree-optimized -mbig-endian" } */
> +
> +#include <arm_neon.h>
> +
> +#define VARIANTS				\
> +VARIANT (uint8x8_t, uint8x16_t, u8)		\
> +VARIANT (uint16x4_t, uint16x8_t, u16)		\
> +VARIANT (uint32x2_t, uint32x4_t, u32)		\
> +VARIANT (uint64x1_t, uint64x2_t, u64)		\
> +VARIANT (int8x8_t, int8x16_t, s8)		\
> +VARIANT (int16x4_t, int16x8_t, s16)		\
> +VARIANT (int32x2_t, int32x4_t, s32)		\
> +VARIANT (int64x1_t, int64x2_t, s64)		\
> +VARIANT (float16x4_t, float16x8_t, f16)		\
> +VARIANT (float32x2_t, float32x4_t, f32)		\
> +VARIANT (float64x1_t, float64x2_t, f64)		\
> +VARIANT (bfloat16x4_t, bfloat16x8_t, bf16)
> +
> +/* vget_low_* intrinsics should become BIT_FIELD_REF. */
> +#define VARIANT(TYPE64, TYPE128, SUFFIX)	\
> +TYPE64						\
> +test_vget_low_##SUFFIX (TYPE128 vec)		\
> +{						\
> +  return vget_low_##SUFFIX (vec);		\
> +}
> +
> +VARIANTS
> +
> +/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF <vec_\[0-9\]*\\\(D\\\), 64, 64>" 12 "optimized" } } */
Andrew Pinski May 20, 2024, 8:15 p.m. UTC | #2
On Mon, May 20, 2024 at 2:57 AM Richard Sandiford
<richard.sandiford@arm.com> wrote:
>
> Pengxuan Zheng <quic_pzheng@quicinc.com> writes:
> > This patch folds vget_low_* intrinsics to BIT_FIELD_REF to open up more
> > optimization opportunities for gimple optimizers.
> >
> > While we are here, we also remove the vget_low_* definitions from arm_neon.h and
> > use the new intrinsics framework.
> >
> >         PR target/102171
> >
> > gcc/ChangeLog:
> >
> >       * config/aarch64/aarch64-builtins.cc (AARCH64_SIMD_VGET_LOW_BUILTINS):
> >       New macro to create definitions for all vget_low intrinsics.
> >       (VGET_LOW_BUILTIN): Likewise.
> >       (enum aarch64_builtins): Add vget_low function codes.
> >       (aarch64_general_fold_builtin): Fold vget_low calls.
> >       * config/aarch64/aarch64-simd-builtins.def: Delete vget_low builtins.
> >       * config/aarch64/aarch64-simd.md (aarch64_get_low<mode>): Delete.
> >       (aarch64_vget_lo_halfv8bf): Likewise.
> >       * config/aarch64/arm_neon.h (__attribute__): Delete.
> >       (vget_low_f16): Likewise.
> >       (vget_low_f32): Likewise.
> >       (vget_low_f64): Likewise.
> >       (vget_low_p8): Likewise.
> >       (vget_low_p16): Likewise.
> >       (vget_low_p64): Likewise.
> >       (vget_low_s8): Likewise.
> >       (vget_low_s16): Likewise.
> >       (vget_low_s32): Likewise.
> >       (vget_low_s64): Likewise.
> >       (vget_low_u8): Likewise.
> >       (vget_low_u16): Likewise.
> >       (vget_low_u32): Likewise.
> >       (vget_low_u64): Likewise.
> >       (vget_low_bf16): Likewise.
> >
> > gcc/testsuite/ChangeLog:
> >
> >       * gcc.target/aarch64/pr113573.c: Replace __builtin_aarch64_get_lowv8hi
> >       with vget_low_s16.
> >       * gcc.target/aarch64/vget_low_2.c: New test.
> >       * gcc.target/aarch64/vget_low_2_be.c: New test.
>
> Ok, thanks.  I suppose the patch has the side effect of allowing
> vget_low_bf16 to be called without +bf16.  IMO that's the correct
> behaviour though, and is consistent with how we handle reinterprets.

Pushed as r15-697-ga2e4fe5a53cf75cd055f64e745ebd51253e42254 .

Thanks,
Andrew

>
> Richard
>
> > Signed-off-by: Pengxuan Zheng <quic_pzheng@quicinc.com>
> > ---
> >  gcc/config/aarch64/aarch64-builtins.cc        |  60 ++++++++++
> >  gcc/config/aarch64/aarch64-simd-builtins.def  |   5 +-
> >  gcc/config/aarch64/aarch64-simd.md            |  23 +---
> >  gcc/config/aarch64/arm_neon.h                 | 105 ------------------
> >  gcc/testsuite/gcc.target/aarch64/pr113573.c   |   2 +-
> >  gcc/testsuite/gcc.target/aarch64/vget_low_2.c |  30 +++++
> >  .../gcc.target/aarch64/vget_low_2_be.c        |  31 ++++++
> >  7 files changed, 124 insertions(+), 132 deletions(-)
> >  create mode 100644 gcc/testsuite/gcc.target/aarch64/vget_low_2.c
> >  create mode 100644 gcc/testsuite/gcc.target/aarch64/vget_low_2_be.c
> >
> > diff --git a/gcc/config/aarch64/aarch64-builtins.cc b/gcc/config/aarch64/aarch64-builtins.cc
> > index 75d21de1401..4afe7c86ae3 100644
> > --- a/gcc/config/aarch64/aarch64-builtins.cc
> > +++ b/gcc/config/aarch64/aarch64-builtins.cc
> > @@ -658,6 +658,23 @@ static aarch64_simd_builtin_datum aarch64_simd_builtin_data[] = {
> >    VREINTERPRET_BUILTINS \
> >    VREINTERPRETQ_BUILTINS
> >
> > +#define AARCH64_SIMD_VGET_LOW_BUILTINS \
> > +  VGET_LOW_BUILTIN(f16) \
> > +  VGET_LOW_BUILTIN(f32) \
> > +  VGET_LOW_BUILTIN(f64) \
> > +  VGET_LOW_BUILTIN(p8) \
> > +  VGET_LOW_BUILTIN(p16) \
> > +  VGET_LOW_BUILTIN(p64) \
> > +  VGET_LOW_BUILTIN(s8) \
> > +  VGET_LOW_BUILTIN(s16) \
> > +  VGET_LOW_BUILTIN(s32) \
> > +  VGET_LOW_BUILTIN(s64) \
> > +  VGET_LOW_BUILTIN(u8) \
> > +  VGET_LOW_BUILTIN(u16) \
> > +  VGET_LOW_BUILTIN(u32) \
> > +  VGET_LOW_BUILTIN(u64) \
> > +  VGET_LOW_BUILTIN(bf16)
> > +
> >  typedef struct
> >  {
> >    const char *name;
> > @@ -697,6 +714,9 @@ typedef struct
> >  #define VREINTERPRET_BUILTIN(A, B, L) \
> >    AARCH64_SIMD_BUILTIN_VREINTERPRET##L##_##A##_##B,
> >
> > +#define VGET_LOW_BUILTIN(A) \
> > +  AARCH64_SIMD_BUILTIN_VGET_LOW_##A,
> > +
> >  #undef VAR1
> >  #define VAR1(T, N, MAP, FLAG, A) \
> >    AARCH64_SIMD_BUILTIN_##T##_##N##A,
> > @@ -732,6 +752,7 @@ enum aarch64_builtins
> >    AARCH64_CRC32_BUILTIN_MAX,
> >    /* SIMD intrinsic builtins.  */
> >    AARCH64_SIMD_VREINTERPRET_BUILTINS
> > +  AARCH64_SIMD_VGET_LOW_BUILTINS
> >    /* ARMv8.3-A Pointer Authentication Builtins.  */
> >    AARCH64_PAUTH_BUILTIN_AUTIA1716,
> >    AARCH64_PAUTH_BUILTIN_PACIA1716,
> > @@ -823,8 +844,37 @@ static aarch64_fcmla_laneq_builtin_datum aarch64_fcmla_lane_builtin_data[] = {
> >       && SIMD_INTR_QUAL(A) == SIMD_INTR_QUAL(B) \
> >    },
> >
> > +#undef VGET_LOW_BUILTIN
> > +#define VGET_LOW_BUILTIN(A) \
> > +  {"vget_low_" #A, \
> > +   AARCH64_SIMD_BUILTIN_VGET_LOW_##A, \
> > +   2, \
> > +   { SIMD_INTR_MODE(A, d), SIMD_INTR_MODE(A, q) }, \
> > +   { SIMD_INTR_QUAL(A), SIMD_INTR_QUAL(A) }, \
> > +   FLAG_AUTO_FP, \
> > +   false \
> > +  },
> > +
> > +#define AARCH64_SIMD_VGET_LOW_BUILTINS \
> > +  VGET_LOW_BUILTIN(f16) \
> > +  VGET_LOW_BUILTIN(f32) \
> > +  VGET_LOW_BUILTIN(f64) \
> > +  VGET_LOW_BUILTIN(p8) \
> > +  VGET_LOW_BUILTIN(p16) \
> > +  VGET_LOW_BUILTIN(p64) \
> > +  VGET_LOW_BUILTIN(s8) \
> > +  VGET_LOW_BUILTIN(s16) \
> > +  VGET_LOW_BUILTIN(s32) \
> > +  VGET_LOW_BUILTIN(s64) \
> > +  VGET_LOW_BUILTIN(u8) \
> > +  VGET_LOW_BUILTIN(u16) \
> > +  VGET_LOW_BUILTIN(u32) \
> > +  VGET_LOW_BUILTIN(u64) \
> > +  VGET_LOW_BUILTIN(bf16)
> > +
> >  static const aarch64_simd_intrinsic_datum aarch64_simd_intrinsic_data[] = {
> >    AARCH64_SIMD_VREINTERPRET_BUILTINS
> > +  AARCH64_SIMD_VGET_LOW_BUILTINS
> >  };
> >
> >
> > @@ -3216,6 +3266,9 @@ aarch64_fold_builtin_lane_check (tree arg0, tree arg1, tree arg2)
> >  #define VREINTERPRET_BUILTIN(A, B, L) \
> >    case AARCH64_SIMD_BUILTIN_VREINTERPRET##L##_##A##_##B:
> >
> > +#undef VGET_LOW_BUILTIN
> > +#define VGET_LOW_BUILTIN(A) \
> > +  case AARCH64_SIMD_BUILTIN_VGET_LOW_##A:
> >
> >  /* Try to fold a call to the built-in function with subcode FCODE.  The
> >     function is passed the N_ARGS arguments in ARGS and it returns a value
> > @@ -3235,6 +3288,13 @@ aarch64_general_fold_builtin (unsigned int fcode, tree type,
> >       return fold_build1 (FLOAT_EXPR, type, args[0]);
> >        AARCH64_SIMD_VREINTERPRET_BUILTINS
> >       return fold_build1 (VIEW_CONVERT_EXPR, type, args[0]);
> > +      AARCH64_SIMD_VGET_LOW_BUILTINS
> > +        {
> > +          auto pos = BYTES_BIG_ENDIAN ? 64 : 0;
> > +
> > +          return fold_build3 (BIT_FIELD_REF, type, args[0], bitsize_int (64),
> > +                              bitsize_int (pos));
> > +        }
> >        case AARCH64_SIMD_BUILTIN_LANE_CHECK:
> >       gcc_assert (n_args == 3);
> >       if (aarch64_fold_builtin_lane_check (args[0], args[1], args[2]))
> > diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
> > index da16f602a55..a9f0558f8b6 100644
> > --- a/gcc/config/aarch64/aarch64-simd-builtins.def
> > +++ b/gcc/config/aarch64/aarch64-simd-builtins.def
> > @@ -65,8 +65,6 @@
> >    BUILTIN_VS (UNOP, ctz, 2, NONE)
> >    BUILTIN_VB (UNOP, popcount, 2, NONE)
> >
> > -  /* Implemented by aarch64_get_low<mode>.  */
> > -  BUILTIN_VQMOV (UNOP, get_low, 0, AUTO_FP)
> >    /* Implemented by aarch64_get_high<mode>.  */
> >    BUILTIN_VQMOV (UNOP, get_high, 0, AUTO_FP)
> >
> > @@ -960,8 +958,7 @@
> >    VAR1 (QUADOP_LANE, bfmlalb_lane_q, 0, FP, v4sf)
> >    VAR1 (QUADOP_LANE, bfmlalt_lane_q, 0, FP, v4sf)
> >
> > -  /* Implemented by aarch64_vget_lo/hi_halfv8bf.  */
> > -  VAR1 (UNOP, vget_lo_half, 0, AUTO_FP, v8bf)
> > +  /* Implemented by aarch64_vget_hi_halfv8bf.  */
> >    VAR1 (UNOP, vget_hi_half, 0, AUTO_FP, v8bf)
> >
> >    /* Implemented by aarch64_simd_<sur>mmlav16qi.  */
> > diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
> > index f8bb973a278..5a28a8e3c6a 100644
> > --- a/gcc/config/aarch64/aarch64-simd.md
> > +++ b/gcc/config/aarch64/aarch64-simd.md
> > @@ -288,17 +288,6 @@ (define_expand "aarch64_get_half<mode>"
> >    }
> >  )
> >
> > -(define_expand "aarch64_get_low<mode>"
> > -  [(match_operand:<VHALF> 0 "register_operand")
> > -   (match_operand:VQMOV 1 "register_operand")]
> > -  "TARGET_FLOAT"
> > -  {
> > -    rtx lo = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, false);
> > -    emit_insn (gen_aarch64_get_half<mode> (operands[0], operands[1], lo));
> > -    DONE;
> > -  }
> > -)
> > -
> >  (define_expand "aarch64_get_high<mode>"
> >    [(match_operand:<VHALF> 0 "register_operand")
> >     (match_operand:VQMOV 1 "register_operand")]
> > @@ -9774,17 +9763,7 @@ (define_insn "aarch64_bfdot_lane<VBF:isquadop><VDQSF:mode>"
> >    [(set_attr "type" "neon_dot<VDQSF:q>")]
> >  )
> >
> > -;; vget_low/high_bf16
> > -(define_expand "aarch64_vget_lo_halfv8bf"
> > -  [(match_operand:V4BF 0 "register_operand")
> > -   (match_operand:V8BF 1 "register_operand")]
> > -  "TARGET_BF16_SIMD"
> > -{
> > -  rtx p = aarch64_simd_vect_par_cnst_half (V8BFmode, 8, false);
> > -  emit_insn (gen_aarch64_get_halfv8bf (operands[0], operands[1], p));
> > -  DONE;
> > -})
> > -
> > +;; vget_high_bf16
> >  (define_expand "aarch64_vget_hi_halfv8bf"
> >    [(match_operand:V4BF 0 "register_operand")
> >     (match_operand:V8BF 1 "register_operand")]
> > diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
> > index 0ee325dccad..92c2c5361cd 100644
> > --- a/gcc/config/aarch64/arm_neon.h
> > +++ b/gcc/config/aarch64/arm_neon.h
> > @@ -3027,104 +3027,6 @@ vsetq_lane_u64 (uint64_t __elem, uint64x2_t __vec, const int __index)
> >    return __aarch64_vset_lane_any (__elem, __vec, __index);
> >  }
> >
> > -__extension__ extern __inline float16x4_t
> > -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> > -vget_low_f16 (float16x8_t __a)
> > -{
> > -  return __builtin_aarch64_get_lowv8hf (__a);
> > -}
> > -
> > -__extension__ extern __inline float32x2_t
> > -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> > -vget_low_f32 (float32x4_t __a)
> > -{
> > -  return __builtin_aarch64_get_lowv4sf (__a);
> > -}
> > -
> > -__extension__ extern __inline float64x1_t
> > -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> > -vget_low_f64 (float64x2_t __a)
> > -{
> > -  return (float64x1_t) {__builtin_aarch64_get_lowv2df (__a)};
> > -}
> > -
> > -__extension__ extern __inline poly8x8_t
> > -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> > -vget_low_p8 (poly8x16_t __a)
> > -{
> > -  return (poly8x8_t) __builtin_aarch64_get_lowv16qi ((int8x16_t) __a);
> > -}
> > -
> > -__extension__ extern __inline poly16x4_t
> > -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> > -vget_low_p16 (poly16x8_t __a)
> > -{
> > -  return (poly16x4_t) __builtin_aarch64_get_lowv8hi ((int16x8_t) __a);
> > -}
> > -
> > -__extension__ extern __inline poly64x1_t
> > -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> > -vget_low_p64 (poly64x2_t __a)
> > -{
> > -  return (poly64x1_t) __builtin_aarch64_get_lowv2di ((int64x2_t) __a);
> > -}
> > -
> > -__extension__ extern __inline int8x8_t
> > -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> > -vget_low_s8 (int8x16_t __a)
> > -{
> > -  return  __builtin_aarch64_get_lowv16qi (__a);
> > -}
> > -
> > -__extension__ extern __inline int16x4_t
> > -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> > -vget_low_s16 (int16x8_t __a)
> > -{
> > -  return  __builtin_aarch64_get_lowv8hi (__a);
> > -}
> > -
> > -__extension__ extern __inline int32x2_t
> > -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> > -vget_low_s32 (int32x4_t __a)
> > -{
> > -  return  __builtin_aarch64_get_lowv4si (__a);
> > -}
> > -
> > -__extension__ extern __inline int64x1_t
> > -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> > -vget_low_s64 (int64x2_t __a)
> > -{
> > -  return  (int64x1_t) {__builtin_aarch64_get_lowv2di (__a)};
> > -}
> > -
> > -__extension__ extern __inline uint8x8_t
> > -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> > -vget_low_u8 (uint8x16_t __a)
> > -{
> > -  return (uint8x8_t) __builtin_aarch64_get_lowv16qi ((int8x16_t) __a);
> > -}
> > -
> > -__extension__ extern __inline uint16x4_t
> > -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> > -vget_low_u16 (uint16x8_t __a)
> > -{
> > -  return (uint16x4_t) __builtin_aarch64_get_lowv8hi ((int16x8_t) __a);
> > -}
> > -
> > -__extension__ extern __inline uint32x2_t
> > -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> > -vget_low_u32 (uint32x4_t __a)
> > -{
> > -  return (uint32x2_t) __builtin_aarch64_get_lowv4si ((int32x4_t) __a);
> > -}
> > -
> > -__extension__ extern __inline uint64x1_t
> > -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> > -vget_low_u64 (uint64x2_t __a)
> > -{
> > -  return (uint64x1_t) {__builtin_aarch64_get_lowv2di ((int64x2_t) __a)};
> > -}
> > -
> >  __extension__ extern __inline float16x4_t
> >  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> >  vget_high_f16 (float16x8_t __a)
> > @@ -28479,13 +28381,6 @@ vbfmlaltq_laneq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b,
> >    return __builtin_aarch64_bfmlalt_lane_qv4sf (__r, __a, __b, __index);
> >  }
> >
> > -__extension__ extern __inline bfloat16x4_t
> > -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> > -vget_low_bf16 (bfloat16x8_t __a)
> > -{
> > -  return __builtin_aarch64_vget_lo_halfv8bf (__a);
> > -}
> > -
> >  __extension__ extern __inline bfloat16x4_t
> >  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> >  vget_high_bf16 (bfloat16x8_t __a)
> > diff --git a/gcc/testsuite/gcc.target/aarch64/pr113573.c b/gcc/testsuite/gcc.target/aarch64/pr113573.c
> > index a8e445c6e19..fc8607f7218 100644
> > --- a/gcc/testsuite/gcc.target/aarch64/pr113573.c
> > +++ b/gcc/testsuite/gcc.target/aarch64/pr113573.c
> > @@ -26,7 +26,7 @@ void jsimd_extbgrx_ycc_convert_neon() {
> >        int y_l = vmull_laneq_u16(r);
> >        uint16x8_t __a = g;
> >        jsimd_extbgrx_ycc_convert_neon___trans_tmp_2 =
> > -          (uint16x4_t)__builtin_aarch64_get_lowv8hi((int16x8_t)__a);
> > +          (uint16x4_t)vget_low_s16((int16x8_t)__a);
> >        __a = b;
> >        int cb_l = scaled_128_5;
> >        int cb_h = scaled_128_5;
> > diff --git a/gcc/testsuite/gcc.target/aarch64/vget_low_2.c b/gcc/testsuite/gcc.target/aarch64/vget_low_2.c
> > new file mode 100644
> > index 00000000000..44414e1c043
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/vget_low_2.c
> > @@ -0,0 +1,30 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O3 -fdump-tree-optimized" } */
> > +
> > +#include <arm_neon.h>
> > +
> > +#define VARIANTS                             \
> > +VARIANT (uint8x8_t, uint8x16_t, u8)          \
> > +VARIANT (uint16x4_t, uint16x8_t, u16)                \
> > +VARIANT (uint32x2_t, uint32x4_t, u32)                \
> > +VARIANT (uint64x1_t, uint64x2_t, u64)                \
> > +VARIANT (int8x8_t, int8x16_t, s8)            \
> > +VARIANT (int16x4_t, int16x8_t, s16)          \
> > +VARIANT (int32x2_t, int32x4_t, s32)          \
> > +VARIANT (int64x1_t, int64x2_t, s64)          \
> > +VARIANT (float16x4_t, float16x8_t, f16)              \
> > +VARIANT (float32x2_t, float32x4_t, f32)              \
> > +VARIANT (float64x1_t, float64x2_t, f64)              \
> > +VARIANT (bfloat16x4_t, bfloat16x8_t, bf16)
> > +
> > +/* vget_low_* intrinsics should become BIT_FIELD_REF. */
> > +#define VARIANT(TYPE64, TYPE128, SUFFIX)     \
> > +TYPE64                                               \
> > +test_vget_low_##SUFFIX (TYPE128 vec)         \
> > +{                                            \
> > +  return vget_low_##SUFFIX (vec);            \
> > +}
> > +
> > +VARIANTS
> > +
> > +/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF <vec_\[0-9\]*\\\(D\\\), 64, 0>" 12 "optimized" } } */
> > diff --git a/gcc/testsuite/gcc.target/aarch64/vget_low_2_be.c b/gcc/testsuite/gcc.target/aarch64/vget_low_2_be.c
> > new file mode 100644
> > index 00000000000..c3f4c4f0e0d
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/vget_low_2_be.c
> > @@ -0,0 +1,31 @@
> > +/* { dg-do compile } */
> > +/* { dg-require-effective-target stdint_types_mbig_endian } */
> > +/* { dg-options "-O3 -fdump-tree-optimized -mbig-endian" } */
> > +
> > +#include <arm_neon.h>
> > +
> > +#define VARIANTS                             \
> > +VARIANT (uint8x8_t, uint8x16_t, u8)          \
> > +VARIANT (uint16x4_t, uint16x8_t, u16)                \
> > +VARIANT (uint32x2_t, uint32x4_t, u32)                \
> > +VARIANT (uint64x1_t, uint64x2_t, u64)                \
> > +VARIANT (int8x8_t, int8x16_t, s8)            \
> > +VARIANT (int16x4_t, int16x8_t, s16)          \
> > +VARIANT (int32x2_t, int32x4_t, s32)          \
> > +VARIANT (int64x1_t, int64x2_t, s64)          \
> > +VARIANT (float16x4_t, float16x8_t, f16)              \
> > +VARIANT (float32x2_t, float32x4_t, f32)              \
> > +VARIANT (float64x1_t, float64x2_t, f64)              \
> > +VARIANT (bfloat16x4_t, bfloat16x8_t, bf16)
> > +
> > +/* vget_low_* intrinsics should become BIT_FIELD_REF. */
> > +#define VARIANT(TYPE64, TYPE128, SUFFIX)     \
> > +TYPE64                                               \
> > +test_vget_low_##SUFFIX (TYPE128 vec)         \
> > +{                                            \
> > +  return vget_low_##SUFFIX (vec);            \
> > +}
> > +
> > +VARIANTS
> > +
> > +/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF <vec_\[0-9\]*\\\(D\\\), 64, 64>" 12 "optimized" } } */
Pengxuan Zheng May 20, 2024, 8:36 p.m. UTC | #3
> On Mon, May 20, 2024 at 2:57 AM Richard Sandiford
> <richard.sandiford@arm.com> wrote:
> >
> > Pengxuan Zheng <quic_pzheng@quicinc.com> writes:
> > > This patch folds vget_low_* intrinsics to BIT_FIELD_REF to open up
> > > more optimization opportunities for gimple optimizers.
> > >
> > > While we are here, we also remove the vget_low_* definitions from
> > > arm_neon.h and use the new intrinsics framework.
> > >
> > >         PR target/102171
> > >
> > > gcc/ChangeLog:
> > >
> > >       * config/aarch64/aarch64-builtins.cc
> (AARCH64_SIMD_VGET_LOW_BUILTINS):
> > >       New macro to create definitions for all vget_low intrinsics.
> > >       (VGET_LOW_BUILTIN): Likewise.
> > >       (enum aarch64_builtins): Add vget_low function codes.
> > >       (aarch64_general_fold_builtin): Fold vget_low calls.
> > >       * config/aarch64/aarch64-simd-builtins.def: Delete vget_low builtins.
> > >       * config/aarch64/aarch64-simd.md (aarch64_get_low<mode>): Delete.
> > >       (aarch64_vget_lo_halfv8bf): Likewise.
> > >       * config/aarch64/arm_neon.h (__attribute__): Delete.
> > >       (vget_low_f16): Likewise.
> > >       (vget_low_f32): Likewise.
> > >       (vget_low_f64): Likewise.
> > >       (vget_low_p8): Likewise.
> > >       (vget_low_p16): Likewise.
> > >       (vget_low_p64): Likewise.
> > >       (vget_low_s8): Likewise.
> > >       (vget_low_s16): Likewise.
> > >       (vget_low_s32): Likewise.
> > >       (vget_low_s64): Likewise.
> > >       (vget_low_u8): Likewise.
> > >       (vget_low_u16): Likewise.
> > >       (vget_low_u32): Likewise.
> > >       (vget_low_u64): Likewise.
> > >       (vget_low_bf16): Likewise.
> > >
> > > gcc/testsuite/ChangeLog:
> > >
> > >       * gcc.target/aarch64/pr113573.c: Replace
> __builtin_aarch64_get_lowv8hi
> > >       with vget_low_s16.
> > >       * gcc.target/aarch64/vget_low_2.c: New test.
> > >       * gcc.target/aarch64/vget_low_2_be.c: New test.
> >
> > Ok, thanks.  I suppose the patch has the side effect of allowing
> > vget_low_bf16 to be called without +bf16.  IMO that's the correct
> > behaviour though, and is consistent with how we handle reinterprets.

Thanks, Richard! Yes, it does have the side effect you pointed out and is consistent with how reinterprets are handled currently.
> 
> Pushed as r15-697-ga2e4fe5a53cf75cd055f64e745ebd51253e42254 .

Thanks, Andrew!
> 
> Thanks,
> Andrew
> 
> >
> > Richard
> >
> > > Signed-off-by: Pengxuan Zheng <quic_pzheng@quicinc.com>
> > > ---
> > >  gcc/config/aarch64/aarch64-builtins.cc        |  60 ++++++++++
> > >  gcc/config/aarch64/aarch64-simd-builtins.def  |   5 +-
> > >  gcc/config/aarch64/aarch64-simd.md            |  23 +---
> > >  gcc/config/aarch64/arm_neon.h                 | 105 ------------------
> > >  gcc/testsuite/gcc.target/aarch64/pr113573.c   |   2 +-
> > >  gcc/testsuite/gcc.target/aarch64/vget_low_2.c |  30 +++++
> > >  .../gcc.target/aarch64/vget_low_2_be.c        |  31 ++++++
> > >  7 files changed, 124 insertions(+), 132 deletions(-)  create mode
> > > 100644 gcc/testsuite/gcc.target/aarch64/vget_low_2.c
> > >  create mode 100644 gcc/testsuite/gcc.target/aarch64/vget_low_2_be.c
> > >
> > > diff --git a/gcc/config/aarch64/aarch64-builtins.cc
> > > b/gcc/config/aarch64/aarch64-builtins.cc
> > > index 75d21de1401..4afe7c86ae3 100644
> > > --- a/gcc/config/aarch64/aarch64-builtins.cc
> > > +++ b/gcc/config/aarch64/aarch64-builtins.cc
> > > @@ -658,6 +658,23 @@ static aarch64_simd_builtin_datum
> aarch64_simd_builtin_data[] = {
> > >    VREINTERPRET_BUILTINS \
> > >    VREINTERPRETQ_BUILTINS
> > >
> > > +#define AARCH64_SIMD_VGET_LOW_BUILTINS \
> > > +  VGET_LOW_BUILTIN(f16) \
> > > +  VGET_LOW_BUILTIN(f32) \
> > > +  VGET_LOW_BUILTIN(f64) \
> > > +  VGET_LOW_BUILTIN(p8) \
> > > +  VGET_LOW_BUILTIN(p16) \
> > > +  VGET_LOW_BUILTIN(p64) \
> > > +  VGET_LOW_BUILTIN(s8) \
> > > +  VGET_LOW_BUILTIN(s16) \
> > > +  VGET_LOW_BUILTIN(s32) \
> > > +  VGET_LOW_BUILTIN(s64) \
> > > +  VGET_LOW_BUILTIN(u8) \
> > > +  VGET_LOW_BUILTIN(u16) \
> > > +  VGET_LOW_BUILTIN(u32) \
> > > +  VGET_LOW_BUILTIN(u64) \
> > > +  VGET_LOW_BUILTIN(bf16)
> > > +
> > >  typedef struct
> > >  {
> > >    const char *name;
> > > @@ -697,6 +714,9 @@ typedef struct
> > >  #define VREINTERPRET_BUILTIN(A, B, L) \
> > >    AARCH64_SIMD_BUILTIN_VREINTERPRET##L##_##A##_##B,
> > >
> > > +#define VGET_LOW_BUILTIN(A) \
> > > +  AARCH64_SIMD_BUILTIN_VGET_LOW_##A,
> > > +
> > >  #undef VAR1
> > >  #define VAR1(T, N, MAP, FLAG, A) \
> > >    AARCH64_SIMD_BUILTIN_##T##_##N##A,
> > > @@ -732,6 +752,7 @@ enum aarch64_builtins
> > >    AARCH64_CRC32_BUILTIN_MAX,
> > >    /* SIMD intrinsic builtins.  */
> > >    AARCH64_SIMD_VREINTERPRET_BUILTINS
> > > +  AARCH64_SIMD_VGET_LOW_BUILTINS
> > >    /* ARMv8.3-A Pointer Authentication Builtins.  */
> > >    AARCH64_PAUTH_BUILTIN_AUTIA1716,
> > >    AARCH64_PAUTH_BUILTIN_PACIA1716,
> > > @@ -823,8 +844,37 @@ static aarch64_fcmla_laneq_builtin_datum
> aarch64_fcmla_lane_builtin_data[] = {
> > >       && SIMD_INTR_QUAL(A) == SIMD_INTR_QUAL(B) \
> > >    },
> > >
> > > +#undef VGET_LOW_BUILTIN
> > > +#define VGET_LOW_BUILTIN(A) \
> > > +  {"vget_low_" #A, \
> > > +   AARCH64_SIMD_BUILTIN_VGET_LOW_##A, \
> > > +   2, \
> > > +   { SIMD_INTR_MODE(A, d), SIMD_INTR_MODE(A, q) }, \
> > > +   { SIMD_INTR_QUAL(A), SIMD_INTR_QUAL(A) }, \
> > > +   FLAG_AUTO_FP, \
> > > +   false \
> > > +  },
> > > +
> > > +#define AARCH64_SIMD_VGET_LOW_BUILTINS \
> > > +  VGET_LOW_BUILTIN(f16) \
> > > +  VGET_LOW_BUILTIN(f32) \
> > > +  VGET_LOW_BUILTIN(f64) \
> > > +  VGET_LOW_BUILTIN(p8) \
> > > +  VGET_LOW_BUILTIN(p16) \
> > > +  VGET_LOW_BUILTIN(p64) \
> > > +  VGET_LOW_BUILTIN(s8) \
> > > +  VGET_LOW_BUILTIN(s16) \
> > > +  VGET_LOW_BUILTIN(s32) \
> > > +  VGET_LOW_BUILTIN(s64) \
> > > +  VGET_LOW_BUILTIN(u8) \
> > > +  VGET_LOW_BUILTIN(u16) \
> > > +  VGET_LOW_BUILTIN(u32) \
> > > +  VGET_LOW_BUILTIN(u64) \
> > > +  VGET_LOW_BUILTIN(bf16)
> > > +
> > >  static const aarch64_simd_intrinsic_datum aarch64_simd_intrinsic_data[]
> = {
> > >    AARCH64_SIMD_VREINTERPRET_BUILTINS
> > > +  AARCH64_SIMD_VGET_LOW_BUILTINS
> > >  };
> > >
> > >
> > > @@ -3216,6 +3266,9 @@ aarch64_fold_builtin_lane_check (tree arg0,
> > > tree arg1, tree arg2)  #define VREINTERPRET_BUILTIN(A, B, L) \
> > >    case AARCH64_SIMD_BUILTIN_VREINTERPRET##L##_##A##_##B:
> > >
> > > +#undef VGET_LOW_BUILTIN
> > > +#define VGET_LOW_BUILTIN(A) \
> > > +  case AARCH64_SIMD_BUILTIN_VGET_LOW_##A:
> > >
> > >  /* Try to fold a call to the built-in function with subcode FCODE.  The
> > >     function is passed the N_ARGS arguments in ARGS and it returns a
> > > value @@ -3235,6 +3288,13 @@ aarch64_general_fold_builtin (unsigned
> int fcode, tree type,
> > >       return fold_build1 (FLOAT_EXPR, type, args[0]);
> > >        AARCH64_SIMD_VREINTERPRET_BUILTINS
> > >       return fold_build1 (VIEW_CONVERT_EXPR, type, args[0]);
> > > +      AARCH64_SIMD_VGET_LOW_BUILTINS
> > > +        {
> > > +          auto pos = BYTES_BIG_ENDIAN ? 64 : 0;
> > > +
> > > +          return fold_build3 (BIT_FIELD_REF, type, args[0], bitsize_int (64),
> > > +                              bitsize_int (pos));
> > > +        }
> > >        case AARCH64_SIMD_BUILTIN_LANE_CHECK:
> > >       gcc_assert (n_args == 3);
> > >       if (aarch64_fold_builtin_lane_check (args[0], args[1],
> > > args[2])) diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def
> > > b/gcc/config/aarch64/aarch64-simd-builtins.def
> > > index da16f602a55..a9f0558f8b6 100644
> > > --- a/gcc/config/aarch64/aarch64-simd-builtins.def
> > > +++ b/gcc/config/aarch64/aarch64-simd-builtins.def
> > > @@ -65,8 +65,6 @@
> > >    BUILTIN_VS (UNOP, ctz, 2, NONE)
> > >    BUILTIN_VB (UNOP, popcount, 2, NONE)
> > >
> > > -  /* Implemented by aarch64_get_low<mode>.  */
> > > -  BUILTIN_VQMOV (UNOP, get_low, 0, AUTO_FP)
> > >    /* Implemented by aarch64_get_high<mode>.  */
> > >    BUILTIN_VQMOV (UNOP, get_high, 0, AUTO_FP)
> > >
> > > @@ -960,8 +958,7 @@
> > >    VAR1 (QUADOP_LANE, bfmlalb_lane_q, 0, FP, v4sf)
> > >    VAR1 (QUADOP_LANE, bfmlalt_lane_q, 0, FP, v4sf)
> > >
> > > -  /* Implemented by aarch64_vget_lo/hi_halfv8bf.  */
> > > -  VAR1 (UNOP, vget_lo_half, 0, AUTO_FP, v8bf)
> > > +  /* Implemented by aarch64_vget_hi_halfv8bf.  */
> > >    VAR1 (UNOP, vget_hi_half, 0, AUTO_FP, v8bf)
> > >
> > >    /* Implemented by aarch64_simd_<sur>mmlav16qi.  */ diff --git
> > > a/gcc/config/aarch64/aarch64-simd.md
> > > b/gcc/config/aarch64/aarch64-simd.md
> > > index f8bb973a278..5a28a8e3c6a 100644
> > > --- a/gcc/config/aarch64/aarch64-simd.md
> > > +++ b/gcc/config/aarch64/aarch64-simd.md
> > > @@ -288,17 +288,6 @@ (define_expand "aarch64_get_half<mode>"
> > >    }
> > >  )
> > >
> > > -(define_expand "aarch64_get_low<mode>"
> > > -  [(match_operand:<VHALF> 0 "register_operand")
> > > -   (match_operand:VQMOV 1 "register_operand")]
> > > -  "TARGET_FLOAT"
> > > -  {
> > > -    rtx lo = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>,
> false);
> > > -    emit_insn (gen_aarch64_get_half<mode> (operands[0], operands[1],
> lo));
> > > -    DONE;
> > > -  }
> > > -)
> > > -
> > >  (define_expand "aarch64_get_high<mode>"
> > >    [(match_operand:<VHALF> 0 "register_operand")
> > >     (match_operand:VQMOV 1 "register_operand")] @@ -9774,17 +9763,7
> > > @@ (define_insn "aarch64_bfdot_lane<VBF:isquadop><VDQSF:mode>"
> > >    [(set_attr "type" "neon_dot<VDQSF:q>")]
> > >  )
> > >
> > > -;; vget_low/high_bf16
> > > -(define_expand "aarch64_vget_lo_halfv8bf"
> > > -  [(match_operand:V4BF 0 "register_operand")
> > > -   (match_operand:V8BF 1 "register_operand")]
> > > -  "TARGET_BF16_SIMD"
> > > -{
> > > -  rtx p = aarch64_simd_vect_par_cnst_half (V8BFmode, 8, false);
> > > -  emit_insn (gen_aarch64_get_halfv8bf (operands[0], operands[1],
> > > p));
> > > -  DONE;
> > > -})
> > > -
> > > +;; vget_high_bf16
> > >  (define_expand "aarch64_vget_hi_halfv8bf"
> > >    [(match_operand:V4BF 0 "register_operand")
> > >     (match_operand:V8BF 1 "register_operand")] diff --git
> > > a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
> > > index 0ee325dccad..92c2c5361cd 100644
> > > --- a/gcc/config/aarch64/arm_neon.h
> > > +++ b/gcc/config/aarch64/arm_neon.h
> > > @@ -3027,104 +3027,6 @@ vsetq_lane_u64 (uint64_t __elem, uint64x2_t
> __vec, const int __index)
> > >    return __aarch64_vset_lane_any (__elem, __vec, __index);  }
> > >
> > > -__extension__ extern __inline float16x4_t -__attribute__
> > > ((__always_inline__, __gnu_inline__, __artificial__))
> > > -vget_low_f16 (float16x8_t __a)
> > > -{
> > > -  return __builtin_aarch64_get_lowv8hf (__a); -}
> > > -
> > > -__extension__ extern __inline float32x2_t -__attribute__
> > > ((__always_inline__, __gnu_inline__, __artificial__))
> > > -vget_low_f32 (float32x4_t __a)
> > > -{
> > > -  return __builtin_aarch64_get_lowv4sf (__a); -}
> > > -
> > > -__extension__ extern __inline float64x1_t -__attribute__
> > > ((__always_inline__, __gnu_inline__, __artificial__))
> > > -vget_low_f64 (float64x2_t __a)
> > > -{
> > > -  return (float64x1_t) {__builtin_aarch64_get_lowv2df (__a)}; -}
> > > -
> > > -__extension__ extern __inline poly8x8_t -__attribute__
> > > ((__always_inline__, __gnu_inline__, __artificial__))
> > > -vget_low_p8 (poly8x16_t __a)
> > > -{
> > > -  return (poly8x8_t) __builtin_aarch64_get_lowv16qi ((int8x16_t)
> > > __a); -}
> > > -
> > > -__extension__ extern __inline poly16x4_t -__attribute__
> > > ((__always_inline__, __gnu_inline__, __artificial__))
> > > -vget_low_p16 (poly16x8_t __a)
> > > -{
> > > -  return (poly16x4_t) __builtin_aarch64_get_lowv8hi ((int16x8_t)
> > > __a); -}
> > > -
> > > -__extension__ extern __inline poly64x1_t -__attribute__
> > > ((__always_inline__, __gnu_inline__, __artificial__))
> > > -vget_low_p64 (poly64x2_t __a)
> > > -{
> > > -  return (poly64x1_t) __builtin_aarch64_get_lowv2di ((int64x2_t)
> > > __a); -}
> > > -
> > > -__extension__ extern __inline int8x8_t -__attribute__
> > > ((__always_inline__, __gnu_inline__, __artificial__))
> > > -vget_low_s8 (int8x16_t __a)
> > > -{
> > > -  return  __builtin_aarch64_get_lowv16qi (__a); -}
> > > -
> > > -__extension__ extern __inline int16x4_t -__attribute__
> > > ((__always_inline__, __gnu_inline__, __artificial__))
> > > -vget_low_s16 (int16x8_t __a)
> > > -{
> > > -  return  __builtin_aarch64_get_lowv8hi (__a); -}
> > > -
> > > -__extension__ extern __inline int32x2_t -__attribute__
> > > ((__always_inline__, __gnu_inline__, __artificial__))
> > > -vget_low_s32 (int32x4_t __a)
> > > -{
> > > -  return  __builtin_aarch64_get_lowv4si (__a); -}
> > > -
> > > -__extension__ extern __inline int64x1_t -__attribute__
> > > ((__always_inline__, __gnu_inline__, __artificial__))
> > > -vget_low_s64 (int64x2_t __a)
> > > -{
> > > -  return  (int64x1_t) {__builtin_aarch64_get_lowv2di (__a)}; -}
> > > -
> > > -__extension__ extern __inline uint8x8_t -__attribute__
> > > ((__always_inline__, __gnu_inline__, __artificial__))
> > > -vget_low_u8 (uint8x16_t __a)
> > > -{
> > > -  return (uint8x8_t) __builtin_aarch64_get_lowv16qi ((int8x16_t)
> > > __a); -}
> > > -
> > > -__extension__ extern __inline uint16x4_t -__attribute__
> > > ((__always_inline__, __gnu_inline__, __artificial__))
> > > -vget_low_u16 (uint16x8_t __a)
> > > -{
> > > -  return (uint16x4_t) __builtin_aarch64_get_lowv8hi ((int16x8_t)
> > > __a); -}
> > > -
> > > -__extension__ extern __inline uint32x2_t -__attribute__
> > > ((__always_inline__, __gnu_inline__, __artificial__))
> > > -vget_low_u32 (uint32x4_t __a)
> > > -{
> > > -  return (uint32x2_t) __builtin_aarch64_get_lowv4si ((int32x4_t)
> > > __a); -}
> > > -
> > > -__extension__ extern __inline uint64x1_t -__attribute__
> > > ((__always_inline__, __gnu_inline__, __artificial__))
> > > -vget_low_u64 (uint64x2_t __a)
> > > -{
> > > -  return (uint64x1_t) {__builtin_aarch64_get_lowv2di ((int64x2_t)
> > > __a)}; -}
> > > -
> > >  __extension__ extern __inline float16x4_t  __attribute__
> > > ((__always_inline__, __gnu_inline__, __artificial__))
> > >  vget_high_f16 (float16x8_t __a)
> > > @@ -28479,13 +28381,6 @@ vbfmlaltq_laneq_f32 (float32x4_t __r,
> bfloat16x8_t __a, bfloat16x8_t __b,
> > >    return __builtin_aarch64_bfmlalt_lane_qv4sf (__r, __a, __b,
> > > __index);  }
> > >
> > > -__extension__ extern __inline bfloat16x4_t -__attribute__
> > > ((__always_inline__, __gnu_inline__, __artificial__))
> > > -vget_low_bf16 (bfloat16x8_t __a)
> > > -{
> > > -  return __builtin_aarch64_vget_lo_halfv8bf (__a); -}
> > > -
> > >  __extension__ extern __inline bfloat16x4_t  __attribute__
> > > ((__always_inline__, __gnu_inline__, __artificial__))
> > >  vget_high_bf16 (bfloat16x8_t __a)
> > > diff --git a/gcc/testsuite/gcc.target/aarch64/pr113573.c
> > > b/gcc/testsuite/gcc.target/aarch64/pr113573.c
> > > index a8e445c6e19..fc8607f7218 100644
> > > --- a/gcc/testsuite/gcc.target/aarch64/pr113573.c
> > > +++ b/gcc/testsuite/gcc.target/aarch64/pr113573.c
> > > @@ -26,7 +26,7 @@ void jsimd_extbgrx_ycc_convert_neon() {
> > >        int y_l = vmull_laneq_u16(r);
> > >        uint16x8_t __a = g;
> > >        jsimd_extbgrx_ycc_convert_neon___trans_tmp_2 =
> > > -          (uint16x4_t)__builtin_aarch64_get_lowv8hi((int16x8_t)__a);
> > > +          (uint16x4_t)vget_low_s16((int16x8_t)__a);
> > >        __a = b;
> > >        int cb_l = scaled_128_5;
> > >        int cb_h = scaled_128_5;
> > > diff --git a/gcc/testsuite/gcc.target/aarch64/vget_low_2.c
> > > b/gcc/testsuite/gcc.target/aarch64/vget_low_2.c
> > > new file mode 100644
> > > index 00000000000..44414e1c043
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.target/aarch64/vget_low_2.c
> > > @@ -0,0 +1,30 @@
> > > +/* { dg-do compile } */
> > > +/* { dg-options "-O3 -fdump-tree-optimized" } */
> > > +
> > > +#include <arm_neon.h>
> > > +
> > > +#define VARIANTS                             \
> > > +VARIANT (uint8x8_t, uint8x16_t, u8)          \
> > > +VARIANT (uint16x4_t, uint16x8_t, u16)                \
> > > +VARIANT (uint32x2_t, uint32x4_t, u32)                \
> > > +VARIANT (uint64x1_t, uint64x2_t, u64)                \
> > > +VARIANT (int8x8_t, int8x16_t, s8)            \
> > > +VARIANT (int16x4_t, int16x8_t, s16)          \
> > > +VARIANT (int32x2_t, int32x4_t, s32)          \
> > > +VARIANT (int64x1_t, int64x2_t, s64)          \
> > > +VARIANT (float16x4_t, float16x8_t, f16)              \
> > > +VARIANT (float32x2_t, float32x4_t, f32)              \
> > > +VARIANT (float64x1_t, float64x2_t, f64)              \
> > > +VARIANT (bfloat16x4_t, bfloat16x8_t, bf16)
> > > +
> > > +/* vget_low_* intrinsics should become BIT_FIELD_REF. */
> > > +#define VARIANT(TYPE64, TYPE128, SUFFIX)     \
> > > +TYPE64                                               \
> > > +test_vget_low_##SUFFIX (TYPE128 vec)         \
> > > +{                                            \
> > > +  return vget_low_##SUFFIX (vec);            \
> > > +}
> > > +
> > > +VARIANTS
> > > +
> > > +/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF
> > > +<vec_\[0-9\]*\\\(D\\\), 64, 0>" 12 "optimized" } } */
> > > diff --git a/gcc/testsuite/gcc.target/aarch64/vget_low_2_be.c
> > > b/gcc/testsuite/gcc.target/aarch64/vget_low_2_be.c
> > > new file mode 100644
> > > index 00000000000..c3f4c4f0e0d
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.target/aarch64/vget_low_2_be.c
> > > @@ -0,0 +1,31 @@
> > > +/* { dg-do compile } */
> > > +/* { dg-require-effective-target stdint_types_mbig_endian } */
> > > +/* { dg-options "-O3 -fdump-tree-optimized -mbig-endian" } */
> > > +
> > > +#include <arm_neon.h>
> > > +
> > > +#define VARIANTS                             \
> > > +VARIANT (uint8x8_t, uint8x16_t, u8)          \
> > > +VARIANT (uint16x4_t, uint16x8_t, u16)                \
> > > +VARIANT (uint32x2_t, uint32x4_t, u32)                \
> > > +VARIANT (uint64x1_t, uint64x2_t, u64)                \
> > > +VARIANT (int8x8_t, int8x16_t, s8)            \
> > > +VARIANT (int16x4_t, int16x8_t, s16)          \
> > > +VARIANT (int32x2_t, int32x4_t, s32)          \
> > > +VARIANT (int64x1_t, int64x2_t, s64)          \
> > > +VARIANT (float16x4_t, float16x8_t, f16)              \
> > > +VARIANT (float32x2_t, float32x4_t, f32)              \
> > > +VARIANT (float64x1_t, float64x2_t, f64)              \
> > > +VARIANT (bfloat16x4_t, bfloat16x8_t, bf16)
> > > +
> > > +/* vget_low_* intrinsics should become BIT_FIELD_REF. */
> > > +#define VARIANT(TYPE64, TYPE128, SUFFIX)     \
> > > +TYPE64                                               \
> > > +test_vget_low_##SUFFIX (TYPE128 vec)         \
> > > +{                                            \
> > > +  return vget_low_##SUFFIX (vec);            \
> > > +}
> > > +
> > > +VARIANTS
> > > +
> > > +/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF
> > > +<vec_\[0-9\]*\\\(D\\\), 64, 64>" 12 "optimized" } } */

Patch

diff --git a/gcc/config/aarch64/aarch64-builtins.cc b/gcc/config/aarch64/aarch64-builtins.cc
index 75d21de1401..4afe7c86ae3 100644
--- a/gcc/config/aarch64/aarch64-builtins.cc
+++ b/gcc/config/aarch64/aarch64-builtins.cc
@@ -658,6 +658,23 @@  static aarch64_simd_builtin_datum aarch64_simd_builtin_data[] = {
   VREINTERPRET_BUILTINS \
   VREINTERPRETQ_BUILTINS
 
+#define AARCH64_SIMD_VGET_LOW_BUILTINS \
+  VGET_LOW_BUILTIN(f16) \
+  VGET_LOW_BUILTIN(f32) \
+  VGET_LOW_BUILTIN(f64) \
+  VGET_LOW_BUILTIN(p8) \
+  VGET_LOW_BUILTIN(p16) \
+  VGET_LOW_BUILTIN(p64) \
+  VGET_LOW_BUILTIN(s8) \
+  VGET_LOW_BUILTIN(s16) \
+  VGET_LOW_BUILTIN(s32) \
+  VGET_LOW_BUILTIN(s64) \
+  VGET_LOW_BUILTIN(u8) \
+  VGET_LOW_BUILTIN(u16) \
+  VGET_LOW_BUILTIN(u32) \
+  VGET_LOW_BUILTIN(u64) \
+  VGET_LOW_BUILTIN(bf16)
+
 typedef struct
 {
   const char *name;
@@ -697,6 +714,9 @@  typedef struct
 #define VREINTERPRET_BUILTIN(A, B, L) \
   AARCH64_SIMD_BUILTIN_VREINTERPRET##L##_##A##_##B,
 
+#define VGET_LOW_BUILTIN(A) \
+  AARCH64_SIMD_BUILTIN_VGET_LOW_##A,
+
 #undef VAR1
 #define VAR1(T, N, MAP, FLAG, A) \
   AARCH64_SIMD_BUILTIN_##T##_##N##A,
@@ -732,6 +752,7 @@  enum aarch64_builtins
   AARCH64_CRC32_BUILTIN_MAX,
   /* SIMD intrinsic builtins.  */
   AARCH64_SIMD_VREINTERPRET_BUILTINS
+  AARCH64_SIMD_VGET_LOW_BUILTINS
   /* ARMv8.3-A Pointer Authentication Builtins.  */
   AARCH64_PAUTH_BUILTIN_AUTIA1716,
   AARCH64_PAUTH_BUILTIN_PACIA1716,
@@ -823,8 +844,37 @@  static aarch64_fcmla_laneq_builtin_datum aarch64_fcmla_lane_builtin_data[] = {
      && SIMD_INTR_QUAL(A) == SIMD_INTR_QUAL(B) \
   },
 
+#undef VGET_LOW_BUILTIN
+#define VGET_LOW_BUILTIN(A) \
+  {"vget_low_" #A, \
+   AARCH64_SIMD_BUILTIN_VGET_LOW_##A, \
+   2, \
+   { SIMD_INTR_MODE(A, d), SIMD_INTR_MODE(A, q) }, \
+   { SIMD_INTR_QUAL(A), SIMD_INTR_QUAL(A) }, \
+   FLAG_AUTO_FP, \
+   false \
+  },
+
+#define AARCH64_SIMD_VGET_LOW_BUILTINS \
+  VGET_LOW_BUILTIN(f16) \
+  VGET_LOW_BUILTIN(f32) \
+  VGET_LOW_BUILTIN(f64) \
+  VGET_LOW_BUILTIN(p8) \
+  VGET_LOW_BUILTIN(p16) \
+  VGET_LOW_BUILTIN(p64) \
+  VGET_LOW_BUILTIN(s8) \
+  VGET_LOW_BUILTIN(s16) \
+  VGET_LOW_BUILTIN(s32) \
+  VGET_LOW_BUILTIN(s64) \
+  VGET_LOW_BUILTIN(u8) \
+  VGET_LOW_BUILTIN(u16) \
+  VGET_LOW_BUILTIN(u32) \
+  VGET_LOW_BUILTIN(u64) \
+  VGET_LOW_BUILTIN(bf16)
+
 static const aarch64_simd_intrinsic_datum aarch64_simd_intrinsic_data[] = {
   AARCH64_SIMD_VREINTERPRET_BUILTINS
+  AARCH64_SIMD_VGET_LOW_BUILTINS
 };
 
 
@@ -3216,6 +3266,9 @@  aarch64_fold_builtin_lane_check (tree arg0, tree arg1, tree arg2)
 #define VREINTERPRET_BUILTIN(A, B, L) \
   case AARCH64_SIMD_BUILTIN_VREINTERPRET##L##_##A##_##B:
 
+#undef VGET_LOW_BUILTIN
+#define VGET_LOW_BUILTIN(A) \
+  case AARCH64_SIMD_BUILTIN_VGET_LOW_##A:
 
 /* Try to fold a call to the built-in function with subcode FCODE.  The
    function is passed the N_ARGS arguments in ARGS and it returns a value
@@ -3235,6 +3288,13 @@  aarch64_general_fold_builtin (unsigned int fcode, tree type,
 	return fold_build1 (FLOAT_EXPR, type, args[0]);
       AARCH64_SIMD_VREINTERPRET_BUILTINS
 	return fold_build1 (VIEW_CONVERT_EXPR, type, args[0]);
+      AARCH64_SIMD_VGET_LOW_BUILTINS
+        {
+          auto pos = BYTES_BIG_ENDIAN ? 64 : 0;
+
+          return fold_build3 (BIT_FIELD_REF, type, args[0], bitsize_int (64),
+                              bitsize_int (pos));
+        }
       case AARCH64_SIMD_BUILTIN_LANE_CHECK:
 	gcc_assert (n_args == 3);
 	if (aarch64_fold_builtin_lane_check (args[0], args[1], args[2]))
diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index da16f602a55..a9f0558f8b6 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -65,8 +65,6 @@ 
   BUILTIN_VS (UNOP, ctz, 2, NONE)
   BUILTIN_VB (UNOP, popcount, 2, NONE)
 
-  /* Implemented by aarch64_get_low<mode>.  */
-  BUILTIN_VQMOV (UNOP, get_low, 0, AUTO_FP)
   /* Implemented by aarch64_get_high<mode>.  */
   BUILTIN_VQMOV (UNOP, get_high, 0, AUTO_FP)
 
@@ -960,8 +958,7 @@ 
   VAR1 (QUADOP_LANE, bfmlalb_lane_q, 0, FP, v4sf)
   VAR1 (QUADOP_LANE, bfmlalt_lane_q, 0, FP, v4sf)
 
-  /* Implemented by aarch64_vget_lo/hi_halfv8bf.  */
-  VAR1 (UNOP, vget_lo_half, 0, AUTO_FP, v8bf)
+  /* Implemented by aarch64_vget_hi_halfv8bf.  */
   VAR1 (UNOP, vget_hi_half, 0, AUTO_FP, v8bf)
 
   /* Implemented by aarch64_simd_<sur>mmlav16qi.  */
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index f8bb973a278..5a28a8e3c6a 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -288,17 +288,6 @@  (define_expand "aarch64_get_half<mode>"
   }
 )
 
-(define_expand "aarch64_get_low<mode>"
-  [(match_operand:<VHALF> 0 "register_operand")
-   (match_operand:VQMOV 1 "register_operand")]
-  "TARGET_FLOAT"
-  {
-    rtx lo = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, false);
-    emit_insn (gen_aarch64_get_half<mode> (operands[0], operands[1], lo));
-    DONE;
-  }
-)
-
 (define_expand "aarch64_get_high<mode>"
   [(match_operand:<VHALF> 0 "register_operand")
    (match_operand:VQMOV 1 "register_operand")]
@@ -9774,17 +9763,7 @@  (define_insn "aarch64_bfdot_lane<VBF:isquadop><VDQSF:mode>"
   [(set_attr "type" "neon_dot<VDQSF:q>")]
 )
 
-;; vget_low/high_bf16
-(define_expand "aarch64_vget_lo_halfv8bf"
-  [(match_operand:V4BF 0 "register_operand")
-   (match_operand:V8BF 1 "register_operand")]
-  "TARGET_BF16_SIMD"
-{
-  rtx p = aarch64_simd_vect_par_cnst_half (V8BFmode, 8, false);
-  emit_insn (gen_aarch64_get_halfv8bf (operands[0], operands[1], p));
-  DONE;
-})
-
+;; vget_high_bf16
 (define_expand "aarch64_vget_hi_halfv8bf"
   [(match_operand:V4BF 0 "register_operand")
    (match_operand:V8BF 1 "register_operand")]
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 0ee325dccad..92c2c5361cd 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -3027,104 +3027,6 @@  vsetq_lane_u64 (uint64_t __elem, uint64x2_t __vec, const int __index)
   return __aarch64_vset_lane_any (__elem, __vec, __index);
 }
 
-__extension__ extern __inline float16x4_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vget_low_f16 (float16x8_t __a)
-{
-  return __builtin_aarch64_get_lowv8hf (__a);
-}
-
-__extension__ extern __inline float32x2_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vget_low_f32 (float32x4_t __a)
-{
-  return __builtin_aarch64_get_lowv4sf (__a);
-}
-
-__extension__ extern __inline float64x1_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vget_low_f64 (float64x2_t __a)
-{
-  return (float64x1_t) {__builtin_aarch64_get_lowv2df (__a)};
-}
-
-__extension__ extern __inline poly8x8_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vget_low_p8 (poly8x16_t __a)
-{
-  return (poly8x8_t) __builtin_aarch64_get_lowv16qi ((int8x16_t) __a);
-}
-
-__extension__ extern __inline poly16x4_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vget_low_p16 (poly16x8_t __a)
-{
-  return (poly16x4_t) __builtin_aarch64_get_lowv8hi ((int16x8_t) __a);
-}
-
-__extension__ extern __inline poly64x1_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vget_low_p64 (poly64x2_t __a)
-{
-  return (poly64x1_t) __builtin_aarch64_get_lowv2di ((int64x2_t) __a);
-}
-
-__extension__ extern __inline int8x8_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vget_low_s8 (int8x16_t __a)
-{
-  return  __builtin_aarch64_get_lowv16qi (__a);
-}
-
-__extension__ extern __inline int16x4_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vget_low_s16 (int16x8_t __a)
-{
-  return  __builtin_aarch64_get_lowv8hi (__a);
-}
-
-__extension__ extern __inline int32x2_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vget_low_s32 (int32x4_t __a)
-{
-  return  __builtin_aarch64_get_lowv4si (__a);
-}
-
-__extension__ extern __inline int64x1_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vget_low_s64 (int64x2_t __a)
-{
-  return  (int64x1_t) {__builtin_aarch64_get_lowv2di (__a)};
-}
-
-__extension__ extern __inline uint8x8_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vget_low_u8 (uint8x16_t __a)
-{
-  return (uint8x8_t) __builtin_aarch64_get_lowv16qi ((int8x16_t) __a);
-}
-
-__extension__ extern __inline uint16x4_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vget_low_u16 (uint16x8_t __a)
-{
-  return (uint16x4_t) __builtin_aarch64_get_lowv8hi ((int16x8_t) __a);
-}
-
-__extension__ extern __inline uint32x2_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vget_low_u32 (uint32x4_t __a)
-{
-  return (uint32x2_t) __builtin_aarch64_get_lowv4si ((int32x4_t) __a);
-}
-
-__extension__ extern __inline uint64x1_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vget_low_u64 (uint64x2_t __a)
-{
-  return (uint64x1_t) {__builtin_aarch64_get_lowv2di ((int64x2_t) __a)};
-}
-
 __extension__ extern __inline float16x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vget_high_f16 (float16x8_t __a)
@@ -28479,13 +28381,6 @@  vbfmlaltq_laneq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b,
   return __builtin_aarch64_bfmlalt_lane_qv4sf (__r, __a, __b, __index);
 }
 
-__extension__ extern __inline bfloat16x4_t
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vget_low_bf16 (bfloat16x8_t __a)
-{
-  return __builtin_aarch64_vget_lo_halfv8bf (__a);
-}
-
 __extension__ extern __inline bfloat16x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vget_high_bf16 (bfloat16x8_t __a)
diff --git a/gcc/testsuite/gcc.target/aarch64/pr113573.c b/gcc/testsuite/gcc.target/aarch64/pr113573.c
index a8e445c6e19..fc8607f7218 100644
--- a/gcc/testsuite/gcc.target/aarch64/pr113573.c
+++ b/gcc/testsuite/gcc.target/aarch64/pr113573.c
@@ -26,7 +26,7 @@  void jsimd_extbgrx_ycc_convert_neon() {
       int y_l = vmull_laneq_u16(r);
       uint16x8_t __a = g;
       jsimd_extbgrx_ycc_convert_neon___trans_tmp_2 =
-          (uint16x4_t)__builtin_aarch64_get_lowv8hi((int16x8_t)__a);
+          (uint16x4_t)vget_low_s16((int16x8_t)__a);
       __a = b;
       int cb_l = scaled_128_5;
       int cb_h = scaled_128_5;
diff --git a/gcc/testsuite/gcc.target/aarch64/vget_low_2.c b/gcc/testsuite/gcc.target/aarch64/vget_low_2.c
new file mode 100644
index 00000000000..44414e1c043
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vget_low_2.c
@@ -0,0 +1,30 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O3 -fdump-tree-optimized" } */
+
+#include <arm_neon.h>
+
+#define VARIANTS				\
+VARIANT (uint8x8_t, uint8x16_t, u8)		\
+VARIANT (uint16x4_t, uint16x8_t, u16)		\
+VARIANT (uint32x2_t, uint32x4_t, u32)		\
+VARIANT (uint64x1_t, uint64x2_t, u64)		\
+VARIANT (int8x8_t, int8x16_t, s8)		\
+VARIANT (int16x4_t, int16x8_t, s16)		\
+VARIANT (int32x2_t, int32x4_t, s32)		\
+VARIANT (int64x1_t, int64x2_t, s64)		\
+VARIANT (float16x4_t, float16x8_t, f16)		\
+VARIANT (float32x2_t, float32x4_t, f32)		\
+VARIANT (float64x1_t, float64x2_t, f64)		\
+VARIANT (bfloat16x4_t, bfloat16x8_t, bf16)
+
+/* vget_low_* intrinsics should become BIT_FIELD_REF. */
+#define VARIANT(TYPE64, TYPE128, SUFFIX)	\
+TYPE64						\
+test_vget_low_##SUFFIX (TYPE128 vec)		\
+{						\
+  return vget_low_##SUFFIX (vec);		\
+}
+
+VARIANTS
+
+/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF <vec_\[0-9\]*\\\(D\\\), 64, 0>" 12 "optimized" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/vget_low_2_be.c b/gcc/testsuite/gcc.target/aarch64/vget_low_2_be.c
new file mode 100644
index 00000000000..c3f4c4f0e0d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vget_low_2_be.c
@@ -0,0 +1,31 @@ 
+/* { dg-do compile } */
+/* { dg-require-effective-target stdint_types_mbig_endian } */
+/* { dg-options "-O3 -fdump-tree-optimized -mbig-endian" } */
+
+#include <arm_neon.h>
+
+#define VARIANTS				\
+VARIANT (uint8x8_t, uint8x16_t, u8)		\
+VARIANT (uint16x4_t, uint16x8_t, u16)		\
+VARIANT (uint32x2_t, uint32x4_t, u32)		\
+VARIANT (uint64x1_t, uint64x2_t, u64)		\
+VARIANT (int8x8_t, int8x16_t, s8)		\
+VARIANT (int16x4_t, int16x8_t, s16)		\
+VARIANT (int32x2_t, int32x4_t, s32)		\
+VARIANT (int64x1_t, int64x2_t, s64)		\
+VARIANT (float16x4_t, float16x8_t, f16)		\
+VARIANT (float32x2_t, float32x4_t, f32)		\
+VARIANT (float64x1_t, float64x2_t, f64)		\
+VARIANT (bfloat16x4_t, bfloat16x8_t, bf16)
+
+/* vget_low_* intrinsics should become BIT_FIELD_REF. */
+#define VARIANT(TYPE64, TYPE128, SUFFIX)	\
+TYPE64						\
+test_vget_low_##SUFFIX (TYPE128 vec)		\
+{						\
+  return vget_low_##SUFFIX (vec);		\
+}
+
+VARIANTS
+
+/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF <vec_\[0-9\]*\\\(D\\\), 64, 64>" 12 "optimized" } } */
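A minimal illustration of the effect of the fold (not part of the patch; the function and SSA names below are made up): with the aarch64_get_low<mode> expander and the arm_neon.h wrappers gone, a vget_low_* call is now folded by aarch64_general_fold_builtin into a BIT_FIELD_REF of the low 64 bits of its argument (bit position 0 on little-endian, 64 on big-endian), so the gimple optimizers can simplify it together with surrounding vector code.

#include <arm_neon.h>

/* Illustrative function; compile with -O2 -fdump-tree-optimized.  */
int16x4_t
low_half (int16x8_t v)
{
  return vget_low_s16 (v);
}

/* Expected shape of the "optimized" dump on little-endian
   (SSA names are illustrative):
     _2 = BIT_FIELD_REF <v_1(D), 64, 0>;
     return _2;  */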