@@ -62,6 +62,10 @@ ADJUST_ALIGNMENT (VNx8BI, 2);
ADJUST_ALIGNMENT (VNx4BI, 2);
ADJUST_ALIGNMENT (VNx2BI, 2);
+/* Bfloat16 modes. */
+FLOAT_MODE (BF, 2, 0);
+ADJUST_FLOAT_FORMAT (BF, &arm_bfloat_half_format);
+
VECTOR_MODES (INT, 8); /* V8QI V4HI V2SI. */
VECTOR_MODES (INT, 16); /* V16QI V8HI V4SI V2DI. */
VECTOR_MODES (FLOAT, 8); /* V2SF. */
@@ -69,13 +73,6 @@ VECTOR_MODES (FLOAT, 16); /* V4SF V2DF. */
VECTOR_MODE (FLOAT, DF, 1); /* V1DF. */
VECTOR_MODE (FLOAT, HF, 2); /* V2HF. */
-/* Bfloat16 modes. */
-FLOAT_MODE (BF, 2, 0);
-ADJUST_FLOAT_FORMAT (BF, &arm_bfloat_half_format);
-
-VECTOR_MODE (FLOAT, BF, 4); /* V4BF. */
-VECTOR_MODE (FLOAT, BF, 8); /* V8BF. */
-
/* Oct Int: 256-bit integer mode needed for 32-byte vector arguments. */
INT_MODE (OI, 32);
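Note: FLOAT_MODE (BF, 2, 0) registers a 2-byte scalar floating-point mode, and
ADJUST_FLOAT_FORMAT binds it to arm_bfloat_half_format: 1 sign bit, 8 exponent
bits and 7 mantissa bits, i.e. the high half of an IEEE single.  Moving the
definition above the VECTOR_MODES lines lets V4BF and V8BF be generated there,
which is why the explicit VECTOR_MODE (FLOAT, BF, ...) entries are dropped
above.  A minimal illustrative sketch of the format, not part of the patch:

#include <stdint.h>
#include <string.h>

/* bfloat16 is the top 16 bits of an IEEE single, so a truncating
   conversion is just a shift.  Real code should use the dedicated
   conversion instructions or intrinsics instead.  */
static uint16_t
float_to_bf16_truncate (float f)
{
  uint32_t bits;
  memcpy (&bits, &f, sizeof bits);
  return (uint16_t) (bits >> 16);
}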
@@ -96,6 +93,7 @@ INT_MODE (XI, 64);
ADJUST_NUNITS (VH##HI, aarch64_sve_vg * NVECS * 4); \
ADJUST_NUNITS (VS##SI, aarch64_sve_vg * NVECS * 2); \
ADJUST_NUNITS (VD##DI, aarch64_sve_vg * NVECS); \
+ ADJUST_NUNITS (VH##BF, aarch64_sve_vg * NVECS * 4); \
ADJUST_NUNITS (VH##HF, aarch64_sve_vg * NVECS * 4); \
ADJUST_NUNITS (VS##SF, aarch64_sve_vg * NVECS * 2); \
ADJUST_NUNITS (VD##DF, aarch64_sve_vg * NVECS); \
@@ -104,6 +102,7 @@ INT_MODE (XI, 64);
ADJUST_ALIGNMENT (VH##HI, 16); \
ADJUST_ALIGNMENT (VS##SI, 16); \
ADJUST_ALIGNMENT (VD##DI, 16); \
+ ADJUST_ALIGNMENT (VH##BF, 16); \
ADJUST_ALIGNMENT (VH##HF, 16); \
ADJUST_ALIGNMENT (VS##SF, 16); \
ADJUST_ALIGNMENT (VD##DF, 16);
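Note: aarch64_sve_vg is the runtime number of 64-bit granules per vector, and
each granule holds four 16-bit elements, so the new VH##BF rows give VNx8BF
the same variable element count as VNx8HF.  Arithmetic sketch only, nothing
in the hunk depends on it:

/* nunits (VNx8BF) = vg * NVECS * 4
   VL = 128 bits:  vg = 2, NVECS = 1  =>  8 elements
   VL = 256 bits:  vg = 4, NVECS = 1  =>  16 elements
   VNx16BF (NVECS = 2) doubles those counts.  */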
@@ -18,15 +18,15 @@
<http://www.gnu.org/licenses/>. */
#define REQUIRED_EXTENSIONS 0
-DEF_SVE_FUNCTION (svabd, binary_opt_n, all_data, mxz)
+DEF_SVE_FUNCTION (svabd, binary_opt_n, all_arith, mxz)
DEF_SVE_FUNCTION (svabs, unary, all_float_and_signed, mxz)
DEF_SVE_FUNCTION (svacge, compare_opt_n, all_float, implicit)
DEF_SVE_FUNCTION (svacgt, compare_opt_n, all_float, implicit)
DEF_SVE_FUNCTION (svacle, compare_opt_n, all_float, implicit)
DEF_SVE_FUNCTION (svaclt, compare_opt_n, all_float, implicit)
-DEF_SVE_FUNCTION (svadd, binary_opt_n, all_data, mxz)
+DEF_SVE_FUNCTION (svadd, binary_opt_n, all_arith, mxz)
DEF_SVE_FUNCTION (svadda, fold_left, all_float, implicit)
-DEF_SVE_FUNCTION (svaddv, reduction_wide, all_data, implicit)
+DEF_SVE_FUNCTION (svaddv, reduction_wide, all_arith, implicit)
DEF_SVE_FUNCTION (svadrb, adr_offset, none, none)
DEF_SVE_FUNCTION (svadrd, adr_index, none, none)
DEF_SVE_FUNCTION (svadrh, adr_index, none, none)
@@ -51,17 +51,17 @@ DEF_SVE_FUNCTION (svcls, unary_to_uint, all_signed, mxz)
DEF_SVE_FUNCTION (svclz, unary_to_uint, all_integer, mxz)
DEF_SVE_FUNCTION (svcmla, ternary_rotate, all_float, mxz)
DEF_SVE_FUNCTION (svcmla_lane, ternary_lane_rotate, hs_float, none)
-DEF_SVE_FUNCTION (svcmpeq, compare_opt_n, all_data, implicit)
+DEF_SVE_FUNCTION (svcmpeq, compare_opt_n, all_arith, implicit)
DEF_SVE_FUNCTION (svcmpeq_wide, compare_wide_opt_n, bhs_signed, implicit)
-DEF_SVE_FUNCTION (svcmpge, compare_opt_n, all_data, implicit)
+DEF_SVE_FUNCTION (svcmpge, compare_opt_n, all_arith, implicit)
DEF_SVE_FUNCTION (svcmpge_wide, compare_wide_opt_n, bhs_integer, implicit)
-DEF_SVE_FUNCTION (svcmpgt, compare_opt_n, all_data, implicit)
+DEF_SVE_FUNCTION (svcmpgt, compare_opt_n, all_arith, implicit)
DEF_SVE_FUNCTION (svcmpgt_wide, compare_wide_opt_n, bhs_integer, implicit)
-DEF_SVE_FUNCTION (svcmple, compare_opt_n, all_data, implicit)
+DEF_SVE_FUNCTION (svcmple, compare_opt_n, all_arith, implicit)
DEF_SVE_FUNCTION (svcmple_wide, compare_wide_opt_n, bhs_integer, implicit)
-DEF_SVE_FUNCTION (svcmplt, compare_opt_n, all_data, implicit)
+DEF_SVE_FUNCTION (svcmplt, compare_opt_n, all_arith, implicit)
DEF_SVE_FUNCTION (svcmplt_wide, compare_wide_opt_n, bhs_integer, implicit)
-DEF_SVE_FUNCTION (svcmpne, compare_opt_n, all_data, implicit)
+DEF_SVE_FUNCTION (svcmpne, compare_opt_n, all_arith, implicit)
DEF_SVE_FUNCTION (svcmpne_wide, compare_wide_opt_n, bhs_signed, implicit)
DEF_SVE_FUNCTION (svcmpuo, compare_opt_n, all_float, implicit)
DEF_SVE_FUNCTION (svcnot, unary, all_integer, mxz)
@@ -160,23 +160,23 @@ DEF_SVE_FUNCTION (svlsl, binary_uint_opt_n, all_integer, mxz)
DEF_SVE_FUNCTION (svlsl_wide, binary_uint64_opt_n, bhs_integer, mxz)
DEF_SVE_FUNCTION (svlsr, binary_uint_opt_n, all_unsigned, mxz)
DEF_SVE_FUNCTION (svlsr_wide, binary_uint64_opt_n, bhs_unsigned, mxz)
-DEF_SVE_FUNCTION (svmad, ternary_opt_n, all_data, mxz)
-DEF_SVE_FUNCTION (svmax, binary_opt_n, all_data, mxz)
+DEF_SVE_FUNCTION (svmad, ternary_opt_n, all_arith, mxz)
+DEF_SVE_FUNCTION (svmax, binary_opt_n, all_arith, mxz)
DEF_SVE_FUNCTION (svmaxnm, binary_opt_n, all_float, mxz)
DEF_SVE_FUNCTION (svmaxnmv, reduction, all_float, implicit)
-DEF_SVE_FUNCTION (svmaxv, reduction, all_data, implicit)
-DEF_SVE_FUNCTION (svmin, binary_opt_n, all_data, mxz)
+DEF_SVE_FUNCTION (svmaxv, reduction, all_arith, implicit)
+DEF_SVE_FUNCTION (svmin, binary_opt_n, all_arith, mxz)
DEF_SVE_FUNCTION (svminnm, binary_opt_n, all_float, mxz)
DEF_SVE_FUNCTION (svminnmv, reduction, all_float, implicit)
-DEF_SVE_FUNCTION (svminv, reduction, all_data, implicit)
-DEF_SVE_FUNCTION (svmla, ternary_opt_n, all_data, mxz)
+DEF_SVE_FUNCTION (svminv, reduction, all_arith, implicit)
+DEF_SVE_FUNCTION (svmla, ternary_opt_n, all_arith, mxz)
DEF_SVE_FUNCTION (svmla_lane, ternary_lane, all_float, none)
-DEF_SVE_FUNCTION (svmls, ternary_opt_n, all_data, mxz)
+DEF_SVE_FUNCTION (svmls, ternary_opt_n, all_arith, mxz)
DEF_SVE_FUNCTION (svmls_lane, ternary_lane, all_float, none)
DEF_SVE_FUNCTION (svmmla, mmla, none, none)
DEF_SVE_FUNCTION (svmov, unary, b, z)
-DEF_SVE_FUNCTION (svmsb, ternary_opt_n, all_data, mxz)
-DEF_SVE_FUNCTION (svmul, binary_opt_n, all_data, mxz)
+DEF_SVE_FUNCTION (svmsb, ternary_opt_n, all_arith, mxz)
+DEF_SVE_FUNCTION (svmul, binary_opt_n, all_arith, mxz)
DEF_SVE_FUNCTION (svmul_lane, binary_lane, all_float, none)
DEF_SVE_FUNCTION (svmulh, binary_opt_n, all_integer, mxz)
DEF_SVE_FUNCTION (svmulx, binary_opt_n, all_float, mxz)
@@ -287,8 +287,8 @@ DEF_SVE_FUNCTION (svst2, store, all_data, implicit)
DEF_SVE_FUNCTION (svst3, store, all_data, implicit)
DEF_SVE_FUNCTION (svst4, store, all_data, implicit)
DEF_SVE_FUNCTION (svstnt1, store, all_data, implicit)
-DEF_SVE_FUNCTION (svsub, binary_opt_n, all_data, mxz)
-DEF_SVE_FUNCTION (svsubr, binary_opt_n, all_data, mxz)
+DEF_SVE_FUNCTION (svsub, binary_opt_n, all_arith, mxz)
+DEF_SVE_FUNCTION (svsubr, binary_opt_n, all_arith, mxz)
DEF_SVE_FUNCTION (svtbl, binary_uint, all_data, none)
DEF_SVE_FUNCTION (svtmad, tmad, all_float, none)
DEF_SVE_FUNCTION (svtrn1, binary, all_data, none)
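Note: the all_data -> all_arith renaming above is the core of the builtin
changes.  SVE has no bf16 arithmetic instructions, so arithmetic builtins
(svadd, svmax, svmla, the compares, ...) move to the new all_arith list,
while data-movement builtins (svld*/svst*, svtbl, svtrn1, ...) stay on
all_data, which now also covers bf16.  A hedged usage sketch, using only
intrinsics this patch generates:

#include <arm_sve.h>

/* Data movement is available for bf16 ...  */
svbfloat16_t
permute (svbool_t pg, const bfloat16_t *src, svuint16_t idx)
{
  svbfloat16_t v = svld1_bf16 (pg, src);   /* load: all_data */
  return svtbl_bf16 (v, idx);              /* permute: all_data */
}

/* ... but arithmetic is not: svadd_bf16_m is deliberately undefined,
   since svadd is now registered with all_arith.  */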
@@ -31,7 +31,7 @@ DEF_SVE_FUNCTION (svabdlt, binary_long_opt_n, hsd_integer, none)
DEF_SVE_FUNCTION (svaddlb, binary_long_opt_n, hsd_integer, none)
DEF_SVE_FUNCTION (svaddlbt, binary_long_opt_n, hsd_signed, none)
DEF_SVE_FUNCTION (svaddlt, binary_long_opt_n, hsd_integer, none)
-DEF_SVE_FUNCTION (svaddp, binary, all_data, mx)
+DEF_SVE_FUNCTION (svaddp, binary, all_arith, mx)
DEF_SVE_FUNCTION (svaddwb, binary_wide_opt_n, hsd_integer, none)
DEF_SVE_FUNCTION (svaddwt, binary_wide_opt_n, hsd_integer, none)
DEF_SVE_FUNCTION (svbcax, ternary_opt_n, all_integer, none)
@@ -69,7 +69,7 @@ DEF_SVE_FUNCTION (svldnt1uw_gather, load_ext_gather_offset_restricted, d_integer
DEF_SVE_FUNCTION (svldnt1uw_gather, load_ext_gather_index_restricted, d_integer, implicit)
DEF_SVE_FUNCTION (svlogb, unary_to_int, all_float, mxz)
DEF_SVE_FUNCTION (svmatch, compare, bh_integer, implicit)
-DEF_SVE_FUNCTION (svmaxp, binary, all_data, mx)
+DEF_SVE_FUNCTION (svmaxp, binary, all_arith, mx)
DEF_SVE_FUNCTION (svmaxnmp, binary, all_float, mx)
DEF_SVE_FUNCTION (svmla_lane, ternary_lane, hsd_integer, none)
DEF_SVE_FUNCTION (svmlalb, ternary_long_opt_n, s_float_hsd_integer, none)
@@ -81,7 +81,7 @@ DEF_SVE_FUNCTION (svmlslb, ternary_long_opt_n, s_float_hsd_integer, none)
DEF_SVE_FUNCTION (svmlslb_lane, ternary_long_lane, s_float_sd_integer, none)
DEF_SVE_FUNCTION (svmlslt, ternary_long_opt_n, s_float_hsd_integer, none)
DEF_SVE_FUNCTION (svmlslt_lane, ternary_long_lane, s_float_sd_integer, none)
-DEF_SVE_FUNCTION (svminp, binary, all_data, mx)
+DEF_SVE_FUNCTION (svminp, binary, all_arith, mx)
DEF_SVE_FUNCTION (svminnmp, binary, all_float, mx)
DEF_SVE_FUNCTION (svmovlb, unary_long, hsd_integer, none)
DEF_SVE_FUNCTION (svmovlt, unary_long, hsd_integer, none)
@@ -184,9 +184,16 @@ CONSTEXPR const type_suffix_info type_suffixes[NUM_TYPE_SUFFIXES + 1] = {
/* _f16 _f32 _f64
_s8 _s16 _s32 _s64
_u8 _u16 _u32 _u64. */
-#define TYPES_all_data(S, D) \
+#define TYPES_all_arith(S, D) \
TYPES_all_float (S, D), TYPES_all_integer (S, D)
+/* _bf16
+ _f16 _f32 _f64
+ _s8 _s16 _s32 _s64
+ _u8 _u16 _u32 _u64. */
+#define TYPES_all_data(S, D) \
+ S (bf16), TYPES_all_arith (S, D)
+
/* _b only. */
#define TYPES_b(S, D) \
S (b)
@@ -371,14 +378,17 @@ CONSTEXPR const type_suffix_info type_suffixes[NUM_TYPE_SUFFIXES + 1] = {
TYPES_inc_dec_n1 (D, u32), \
TYPES_inc_dec_n1 (D, u64)
-/* { _f16 _f32 _f64 } { _f16 _f32 _f64 }
- { _s8 _s16 _s32 _s64 } x { _s8 _s16 _s32 _s64 }
- { _u8 _u16 _u32 _u64 } { _u8 _u16 _u32 _u64 }. */
+/* { _bf16 } { _bf16 }
+ { _f16 _f32 _f64 } { _f16 _f32 _f64 }
+ { _s8 _s16 _s32 _s64 } x { _s8 _s16 _s32 _s64 }
+ { _u8 _u16 _u32 _u64 } { _u8 _u16 _u32 _u64 }. */
#define TYPES_reinterpret1(D, A) \
+ D (A, bf16), \
D (A, f16), D (A, f32), D (A, f64), \
D (A, s8), D (A, s16), D (A, s32), D (A, s64), \
D (A, u8), D (A, u16), D (A, u32), D (A, u64)
#define TYPES_reinterpret(S, D) \
+ TYPES_reinterpret1 (D, bf16), \
TYPES_reinterpret1 (D, f16), \
TYPES_reinterpret1 (D, f32), \
TYPES_reinterpret1 (D, f64), \
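Note: with the bf16 row added to TYPES_reinterpret1 and the bf16
instantiation added to TYPES_reinterpret, svreinterpret now exists in both
directions between svbfloat16_t and every other vector type.  Two of the
generated intrinsics:

#include <arm_sve.h>

svbfloat16_t from_u16 (svuint16_t x) { return svreinterpret_bf16_u16 (x); }
svuint16_t to_u16 (svbfloat16_t x) { return svreinterpret_u16_bf16 (x); }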
@@ -428,6 +438,7 @@ DEF_SVE_TYPES_ARRAY (all_signed);
DEF_SVE_TYPES_ARRAY (all_float_and_signed);
DEF_SVE_TYPES_ARRAY (all_unsigned);
DEF_SVE_TYPES_ARRAY (all_integer);
+DEF_SVE_TYPES_ARRAY (all_arith);
DEF_SVE_TYPES_ARRAY (all_data);
DEF_SVE_TYPES_ARRAY (b);
DEF_SVE_TYPES_ARRAY (b_unsigned);
@@ -3351,7 +3362,7 @@ register_tuple_type (unsigned int num_vectors, vector_type_index type)
&& TYPE_ALIGN (tuple_type) == 128);
/* Work out the structure name. */
- char buffer[sizeof ("svfloat64x4_t")];
+ char buffer[sizeof ("svbfloat16x4_t")];
const char *vector_type_name = vector_types[type].acle_name;
snprintf (buffer, sizeof (buffer), "%.*sx%d_t",
(int) strlen (vector_type_name) - 2, vector_type_name,
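Note: the buffer bump is needed because "svbfloat16x4_t" is now the longest
ACLE tuple name.  An illustrative check, not in the patch:

/* sizeof counts the terminating NUL.  */
_Static_assert (sizeof ("svbfloat16x4_t") == 15, "new longest tuple name");
_Static_assert (sizeof ("svfloat64x4_t") == 14, "old longest tuple name");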
@@ -61,6 +61,7 @@ DEF_SVE_MODE (u64offset, none, svuint64_t, bytes)
DEF_SVE_MODE (vnum, none, none, vectors)
DEF_SVE_TYPE (svbool_t, 10, __SVBool_t, boolean_type_node)
+DEF_SVE_TYPE (svbfloat16_t, 14, __SVBfloat16_t, aarch64_bf16_type_node)
DEF_SVE_TYPE (svfloat16_t, 13, __SVFloat16_t, aarch64_fp16_type_node)
DEF_SVE_TYPE (svfloat32_t, 13, __SVFloat32_t, float_type_node)
DEF_SVE_TYPE (svfloat64_t, 13, __SVFloat64_t, double_type_node)
@@ -81,6 +82,7 @@ DEF_SVE_TYPE_SUFFIX (b8, svbool_t, bool, 8, VNx16BImode)
DEF_SVE_TYPE_SUFFIX (b16, svbool_t, bool, 16, VNx8BImode)
DEF_SVE_TYPE_SUFFIX (b32, svbool_t, bool, 32, VNx4BImode)
DEF_SVE_TYPE_SUFFIX (b64, svbool_t, bool, 64, VNx2BImode)
+DEF_SVE_TYPE_SUFFIX (bf16, svbfloat16_t, bfloat, 16, VNx8BFmode)
DEF_SVE_TYPE_SUFFIX (f16, svfloat16_t, float, 16, VNx8HFmode)
DEF_SVE_TYPE_SUFFIX (f32, svfloat32_t, float, 32, VNx4SFmode)
DEF_SVE_TYPE_SUFFIX (f64, svfloat64_t, float, 64, VNx2DFmode)
@@ -150,6 +150,7 @@ enum predication_index
enum type_class_index
{
TYPE_bool,
+ TYPE_bfloat,
TYPE_float,
TYPE_signed,
TYPE_unsigned,
@@ -1656,6 +1656,7 @@ aarch64_classify_vector_mode (machine_mode mode)
case E_VNx8HImode:
case E_VNx4SImode:
case E_VNx2DImode:
+ case E_VNx8BFmode:
case E_VNx8HFmode:
case E_VNx4SFmode:
case E_VNx2DFmode:
@@ -1666,6 +1667,7 @@ aarch64_classify_vector_mode (machine_mode mode)
case E_VNx16HImode:
case E_VNx8SImode:
case E_VNx4DImode:
+ case E_VNx16BFmode:
case E_VNx16HFmode:
case E_VNx8SFmode:
case E_VNx4DFmode:
@@ -1674,6 +1676,7 @@ aarch64_classify_vector_mode (machine_mode mode)
case E_VNx24HImode:
case E_VNx12SImode:
case E_VNx6DImode:
+ case E_VNx24BFmode:
case E_VNx24HFmode:
case E_VNx12SFmode:
case E_VNx6DFmode:
@@ -1682,6 +1685,7 @@ aarch64_classify_vector_mode (machine_mode mode)
case E_VNx32HImode:
case E_VNx16SImode:
case E_VNx8DImode:
+ case E_VNx32BFmode:
case E_VNx32HFmode:
case E_VNx16SFmode:
case E_VNx8DFmode:
@@ -16109,8 +16113,10 @@ aarch64_full_sve_mode (scalar_mode mode)
return VNx4SFmode;
case E_HFmode:
return VNx8HFmode;
+ case E_BFmode:
+ return VNx8BFmode;
case E_DImode:
      return VNx2DImode;
case E_SImode:
return VNx4SImode;
case E_HImode:
@@ -26,6 +26,7 @@
#define _ARM_SVE_H_
#include <stdint.h>
+#include <arm_bf16.h>
typedef __fp16 float16_t;
typedef float float32_t;
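Note: arm_bf16.h supplies the scalar bfloat16_t typedef that the new
svbfloat16_t intrinsics and the tests below depend on, mirroring the existing
float16_t/float32_t typedefs.  A short scalar/vector example with an
intrinsic added by this patch:

#include <arm_sve.h>

bfloat16_t
after_last_active (svbool_t pg, svbfloat16_t x)
{
  return svlasta_bf16 (pg, x);   /* LASTA: element after the last active lane */
}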
@@ -321,19 +321,15 @@ (define_mode_iterator VNx2DF_ONLY [VNx2DF])
;; All SVE vector structure modes.
(define_mode_iterator SVE_STRUCT [VNx32QI VNx16HI VNx8SI VNx4DI
- VNx16HF VNx8SF VNx4DF
+ VNx16BF VNx16HF VNx8SF VNx4DF
VNx48QI VNx24HI VNx12SI VNx6DI
- VNx24HF VNx12SF VNx6DF
+ VNx24BF VNx24HF VNx12SF VNx6DF
VNx64QI VNx32HI VNx16SI VNx8DI
- VNx32HF VNx16SF VNx8DF])
-
-;; SVE_STRUCT restricted to 2-vector tuples.
-(define_mode_iterator SVE_STRUCT2 [VNx32QI VNx16HI VNx8SI VNx4DI
- VNx16HF VNx8SF VNx4DF])
+ VNx32BF VNx32HF VNx16SF VNx8DF])
;; All fully-packed SVE vector modes.
(define_mode_iterator SVE_FULL [VNx16QI VNx8HI VNx4SI VNx2DI
- VNx8HF VNx4SF VNx2DF])
+ VNx8BF VNx8HF VNx4SF VNx2DF])
;; All fully-packed SVE integer vector modes.
(define_mode_iterator SVE_FULL_I [VNx16QI VNx8HI VNx4SI VNx2DI])
@@ -349,7 +345,8 @@ (define_mode_iterator SVE_FULL_BHI [VNx16QI VNx8HI])
(define_mode_iterator SVE_FULL_BHSI [VNx16QI VNx8HI VNx4SI])
;; Fully-packed SVE vector modes that have 16-bit, 32-bit or 64-bit elements.
-(define_mode_iterator SVE_FULL_HSD [VNx8HI VNx4SI VNx2DI VNx8HF VNx4SF VNx2DF])
+(define_mode_iterator SVE_FULL_HSD [VNx8HI VNx4SI VNx2DI
+ VNx8BF VNx8HF VNx4SF VNx2DF])
;; Fully-packed SVE integer vector modes that have 16-bit, 32-bit or 64-bit
;; elements.
@@ -395,6 +392,7 @@ (define_mode_iterator SVE_PARTIAL_I [VNx8QI VNx4QI VNx2QI
(define_mode_iterator SVE_ALL [VNx16QI VNx8QI VNx4QI VNx2QI
VNx8HI VNx4HI VNx2HI
VNx8HF VNx4HF VNx2HF
+ VNx8BF
VNx4SI VNx2SI
VNx4SF VNx2SF
VNx2DI
@@ -1005,6 +1003,7 @@ (define_mode_attr Vetype [(V8QI "b") (V16QI "b")
(VNx16QI "b") (VNx8QI "b") (VNx4QI "b") (VNx2QI "b")
(VNx8HI "h") (VNx4HI "h") (VNx2HI "h")
(VNx8HF "h") (VNx4HF "h") (VNx2HF "h")
+ (VNx8BF "h")
(VNx4SI "s") (VNx2SI "s")
(VNx4SF "s") (VNx2SF "s")
(VNx2DI "d")
@@ -1021,6 +1020,7 @@ (define_mode_attr Vetype_fourth [(VNx4SI "b") (VNx2DI "h")])
(define_mode_attr Vesize [(VNx16QI "b") (VNx8QI "b") (VNx4QI "b") (VNx2QI "b")
(VNx8HI "h") (VNx4HI "h") (VNx2HI "h")
(VNx8HF "h") (VNx4HF "h") (VNx2HF "h")
+ (VNx8BF "h")
(VNx4SI "w") (VNx2SI "w")
(VNx4SF "w") (VNx2SF "w")
(VNx2DI "d")
@@ -1028,6 +1028,7 @@ (define_mode_attr Vesize [(VNx16QI "b") (VNx8QI "b") (VNx4QI "b") (VNx2QI "b")
(VNx32QI "b") (VNx48QI "b") (VNx64QI "b")
(VNx16HI "h") (VNx24HI "h") (VNx32HI "h")
(VNx16HF "h") (VNx24HF "h") (VNx32HF "h")
+ (VNx16BF "h") (VNx24BF "h") (VNx32BF "h")
(VNx8SI "w") (VNx12SI "w") (VNx16SI "w")
(VNx8SF "w") (VNx12SF "w") (VNx16SF "w")
(VNx4DI "d") (VNx6DI "d") (VNx8DI "d")
@@ -1038,6 +1039,7 @@ (define_mode_attr Vesize [(VNx16QI "b") (VNx8QI "b") (VNx4QI "b") (VNx2QI "b")
(define_mode_attr Vctype [(VNx16QI "b") (VNx8QI "h") (VNx4QI "s") (VNx2QI "d")
(VNx8HI "h") (VNx4HI "s") (VNx2HI "d")
(VNx8HF "h") (VNx4HF "s") (VNx2HF "d")
+ (VNx8BF "h")
(VNx4SI "s") (VNx2SI "d")
(VNx4SF "s") (VNx2SF "d")
(VNx2DI "d")
@@ -1077,6 +1079,7 @@ (define_mode_attr VEL [(V8QI "QI") (V16QI "QI")
(VNx16QI "QI") (VNx8QI "QI") (VNx4QI "QI") (VNx2QI "QI")
(VNx8HI "HI") (VNx4HI "HI") (VNx2HI "HI")
(VNx8HF "HF") (VNx4HF "HF") (VNx2HF "HF")
+ (VNx8BF "BF")
(VNx4SI "SI") (VNx2SI "SI")
(VNx4SF "SF") (VNx2SF "SF")
(VNx2DI "DI")
@@ -1095,6 +1098,7 @@ (define_mode_attr Vel [(V8QI "qi") (V16QI "qi")
(VNx16QI "qi") (VNx8QI "qi") (VNx4QI "qi") (VNx2QI "qi")
(VNx8HI "hi") (VNx4HI "hi") (VNx2HI "hi")
(VNx8HF "hf") (VNx4HF "hf") (VNx2HF "hf")
+ (VNx8BF "bf")
(VNx4SI "si") (VNx2SI "si")
(VNx4SF "sf") (VNx2SF "sf")
(VNx2DI "di")
@@ -1102,19 +1106,19 @@ (define_mode_attr Vel [(V8QI "qi") (V16QI "qi")
;; Element mode with floating-point values replaced by like-sized integers.
(define_mode_attr VEL_INT [(VNx16QI "QI")
- (VNx8HI "HI") (VNx8HF "HI")
+ (VNx8HI "HI") (VNx8HF "HI") (VNx8BF "HI")
(VNx4SI "SI") (VNx4SF "SI")
(VNx2DI "DI") (VNx2DF "DI")])
;; Gives the mode of the 128-bit lowpart of an SVE vector.
(define_mode_attr V128 [(VNx16QI "V16QI")
- (VNx8HI "V8HI") (VNx8HF "V8HF")
+ (VNx8HI "V8HI") (VNx8HF "V8HF") (VNx8BF "V8BF")
(VNx4SI "V4SI") (VNx4SF "V4SF")
(VNx2DI "V2DI") (VNx2DF "V2DF")])
;; ...and again in lower case.
(define_mode_attr v128 [(VNx16QI "v16qi")
- (VNx8HI "v8hi") (VNx8HF "v8hf")
+ (VNx8HI "v8hi") (VNx8HF "v8hf") (VNx8BF "v8bf")
(VNx4SI "v4si") (VNx4SF "v4sf")
(VNx2DI "v2di") (VNx2DF "v2df")])
@@ -1277,6 +1281,7 @@ (define_mode_attr vwcore [(V8QI "w") (V16QI "w")
(VNx16QI "w") (VNx8QI "w") (VNx4QI "w") (VNx2QI "w")
(VNx8HI "w") (VNx4HI "w") (VNx2HI "w")
(VNx8HF "w") (VNx4HF "w") (VNx2HF "w")
+ (VNx8BF "w")
(VNx4SI "w") (VNx2SI "w")
(VNx4SF "w") (VNx2SF "w")
(VNx2DI "x")
@@ -1303,6 +1308,7 @@ (define_mode_attr V_INT_EQUIV [(V8QI "V8QI") (V16QI "V16QI")
(HF "HI")
(VNx16QI "VNx16QI")
(VNx8HI "VNx8HI") (VNx8HF "VNx8HI")
+ (VNx8BF "VNx8HI")
(VNx4SI "VNx4SI") (VNx4SF "VNx4SI")
(VNx2DI "VNx2DI") (VNx2DF "VNx2DI")
])
@@ -1318,15 +1324,18 @@ (define_mode_attr v_int_equiv [(V8QI "v8qi") (V16QI "v16qi")
(SF "si")
(VNx16QI "vnx16qi")
(VNx8HI "vnx8hi") (VNx8HF "vnx8hi")
+ (VNx8BF "vnx8hi")
(VNx4SI "vnx4si") (VNx4SF "vnx4si")
(VNx2DI "vnx2di") (VNx2DF "vnx2di")
])
;; Floating-point equivalent of selected modes.
(define_mode_attr V_FP_EQUIV [(VNx8HI "VNx8HF") (VNx8HF "VNx8HF")
+ (VNx8BF "VNx8HF")
(VNx4SI "VNx4SF") (VNx4SF "VNx4SF")
(VNx2DI "VNx2DF") (VNx2DF "VNx2DF")])
(define_mode_attr v_fp_equiv [(VNx8HI "vnx8hf") (VNx8HF "vnx8hf")
+ (VNx8BF "vnx8hf")
(VNx4SI "vnx4sf") (VNx4SF "vnx4sf")
(VNx2DI "vnx2df") (VNx2DF "vnx2df")])
@@ -1508,51 +1517,63 @@ (define_code_attr MAX_OPP [(smax "SMIN") (umax "UMIN")])
;; The number of subvectors in an SVE_STRUCT.
(define_mode_attr vector_count [(VNx32QI "2") (VNx16HI "2")
(VNx8SI "2") (VNx4DI "2")
+ (VNx16BF "2")
(VNx16HF "2") (VNx8SF "2") (VNx4DF "2")
(VNx48QI "3") (VNx24HI "3")
(VNx12SI "3") (VNx6DI "3")
+ (VNx24BF "3")
(VNx24HF "3") (VNx12SF "3") (VNx6DF "3")
(VNx64QI "4") (VNx32HI "4")
(VNx16SI "4") (VNx8DI "4")
+ (VNx32BF "4")
(VNx32HF "4") (VNx16SF "4") (VNx8DF "4")])
;; The number of instruction bytes needed for an SVE_STRUCT move. This is
;; equal to vector_count * 4.
(define_mode_attr insn_length [(VNx32QI "8") (VNx16HI "8")
(VNx8SI "8") (VNx4DI "8")
+ (VNx16BF "8")
(VNx16HF "8") (VNx8SF "8") (VNx4DF "8")
(VNx48QI "12") (VNx24HI "12")
(VNx12SI "12") (VNx6DI "12")
+ (VNx24BF "12")
(VNx24HF "12") (VNx12SF "12") (VNx6DF "12")
(VNx64QI "16") (VNx32HI "16")
(VNx16SI "16") (VNx8DI "16")
+ (VNx32BF "16")
(VNx32HF "16") (VNx16SF "16") (VNx8DF "16")])
;; The type of a subvector in an SVE_STRUCT.
(define_mode_attr VSINGLE [(VNx32QI "VNx16QI")
(VNx16HI "VNx8HI") (VNx16HF "VNx8HF")
+ (VNx16BF "VNx8BF")
(VNx8SI "VNx4SI") (VNx8SF "VNx4SF")
(VNx4DI "VNx2DI") (VNx4DF "VNx2DF")
(VNx48QI "VNx16QI")
(VNx24HI "VNx8HI") (VNx24HF "VNx8HF")
+ (VNx24BF "VNx8BF")
(VNx12SI "VNx4SI") (VNx12SF "VNx4SF")
(VNx6DI "VNx2DI") (VNx6DF "VNx2DF")
(VNx64QI "VNx16QI")
(VNx32HI "VNx8HI") (VNx32HF "VNx8HF")
+ (VNx32BF "VNx8BF")
(VNx16SI "VNx4SI") (VNx16SF "VNx4SF")
(VNx8DI "VNx2DI") (VNx8DF "VNx2DF")])
;; ...and again in lower case.
(define_mode_attr vsingle [(VNx32QI "vnx16qi")
(VNx16HI "vnx8hi") (VNx16HF "vnx8hf")
+ (VNx16BF "vnx8bf")
(VNx8SI "vnx4si") (VNx8SF "vnx4sf")
(VNx4DI "vnx2di") (VNx4DF "vnx2df")
(VNx48QI "vnx16qi")
(VNx24HI "vnx8hi") (VNx24HF "vnx8hf")
+ (VNx24BF "vnx8bf")
(VNx12SI "vnx4si") (VNx12SF "vnx4sf")
(VNx6DI "vnx2di") (VNx6DF "vnx2df")
(VNx64QI "vnx16qi")
(VNx32HI "vnx8hi") (VNx32HF "vnx8hf")
+ (VNx32BF "vnx8bf")
(VNx16SI "vnx4si") (VNx16SF "vnx4sf")
(VNx8DI "vnx2di") (VNx8DF "vnx2df")])
@@ -1562,20 +1583,24 @@ (define_mode_attr VPRED [(VNx16QI "VNx16BI") (VNx8QI "VNx8BI")
(VNx4QI "VNx4BI") (VNx2QI "VNx2BI")
(VNx8HI "VNx8BI") (VNx4HI "VNx4BI") (VNx2HI "VNx2BI")
(VNx8HF "VNx8BI") (VNx4HF "VNx4BI") (VNx2HF "VNx2BI")
+ (VNx8BF "VNx8BI")
(VNx4SI "VNx4BI") (VNx2SI "VNx2BI")
(VNx4SF "VNx4BI") (VNx2SF "VNx2BI")
(VNx2DI "VNx2BI")
(VNx2DF "VNx2BI")
(VNx32QI "VNx16BI")
(VNx16HI "VNx8BI") (VNx16HF "VNx8BI")
+ (VNx16BF "VNx8BI")
(VNx8SI "VNx4BI") (VNx8SF "VNx4BI")
(VNx4DI "VNx2BI") (VNx4DF "VNx2BI")
(VNx48QI "VNx16BI")
(VNx24HI "VNx8BI") (VNx24HF "VNx8BI")
+ (VNx24BF "VNx8BI")
(VNx12SI "VNx4BI") (VNx12SF "VNx4BI")
(VNx6DI "VNx2BI") (VNx6DF "VNx2BI")
(VNx64QI "VNx16BI")
(VNx32HI "VNx8BI") (VNx32HF "VNx8BI")
+ (VNx32BF "VNx8BI")
(VNx16SI "VNx4BI") (VNx16SF "VNx4BI")
(VNx8DI "VNx2BI") (VNx8DF "VNx2BI")])
@@ -1584,25 +1609,30 @@ (define_mode_attr vpred [(VNx16QI "vnx16bi") (VNx8QI "vnx8bi")
(VNx4QI "vnx4bi") (VNx2QI "vnx2bi")
(VNx8HI "vnx8bi") (VNx4HI "vnx4bi") (VNx2HI "vnx2bi")
(VNx8HF "vnx8bi") (VNx4HF "vnx4bi") (VNx2HF "vnx2bi")
+ (VNx8BF "vnx8bi")
(VNx4SI "vnx4bi") (VNx2SI "vnx2bi")
(VNx4SF "vnx4bi") (VNx2SF "vnx2bi")
(VNx2DI "vnx2bi")
(VNx2DF "vnx2bi")
(VNx32QI "vnx16bi")
(VNx16HI "vnx8bi") (VNx16HF "vnx8bi")
+ (VNx16BF "vnx8bi")
(VNx8SI "vnx4bi") (VNx8SF "vnx4bi")
(VNx4DI "vnx2bi") (VNx4DF "vnx2bi")
(VNx48QI "vnx16bi")
(VNx24HI "vnx8bi") (VNx24HF "vnx8bi")
+ (VNx24BF "vnx8bi")
(VNx12SI "vnx4bi") (VNx12SF "vnx4bi")
(VNx6DI "vnx2bi") (VNx6DF "vnx2bi")
(VNx64QI "vnx16bi")
(VNx32HI "vnx8bi") (VNx32HF "vnx4bi")
+ (VNx32BF "vnx8bi")
(VNx16SI "vnx4bi") (VNx16SF "vnx4bi")
(VNx8DI "vnx2bi") (VNx8DF "vnx2bi")])
(define_mode_attr VDOUBLE [(VNx16QI "VNx32QI")
(VNx8HI "VNx16HI") (VNx8HF "VNx16HF")
+ (VNx8BF "VNx16BF")
(VNx4SI "VNx8SI") (VNx4SF "VNx8SF")
(VNx2DI "VNx4DI") (VNx2DF "VNx4DF")])
@@ -14,6 +14,7 @@ void f9(svuint64_t) {}
void f10(svfloat16_t) {}
void f11(svfloat32_t) {}
void f12(svfloat64_t) {}
+void f13(svbfloat16_t) {}
/* { dg-final { scan-assembler "_Z2f110__SVBool_t:" } } */
/* { dg-final { scan-assembler "_Z2f210__SVInt8_t:" } } */
@@ -27,3 +28,4 @@ void f12(svfloat64_t) {}
/* { dg-final { scan-assembler "_Z3f1013__SVFloat16_t:" } } */
/* { dg-final { scan-assembler "_Z3f1113__SVFloat32_t:" } } */
/* { dg-final { scan-assembler "_Z3f1213__SVFloat64_t:" } } */
+/* { dg-final { scan-assembler "_Z3f1314__SVBfloat16_t:" } } */
@@ -12,6 +12,7 @@ void f9(__SVUint64_t) {}
void f10(__SVFloat16_t) {}
void f11(__SVFloat32_t) {}
void f12(__SVFloat64_t) {}
+void f13(__SVBfloat16_t) {}
/* { dg-final { scan-assembler "_Z2f110__SVBool_t:" } } */
/* { dg-final { scan-assembler "_Z2f210__SVInt8_t:" } } */
@@ -25,3 +26,4 @@ void f12(__SVFloat64_t) {}
/* { dg-final { scan-assembler "_Z3f1013__SVFloat16_t:" } } */
/* { dg-final { scan-assembler "_Z3f1113__SVFloat32_t:" } } */
/* { dg-final { scan-assembler "_Z3f1213__SVFloat64_t:" } } */
+/* { dg-final { scan-assembler "_Z3f1314__SVBfloat16_t:" } } */
new file mode 100644
@@ -0,0 +1,52 @@
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+/*
+** clasta_bf16_tied1:
+** clasta z0\.h, p0, z0\.h, z1\.h
+** ret
+*/
+TEST_UNIFORM_Z (clasta_bf16_tied1, svbfloat16_t,
+ z0 = svclasta_bf16 (p0, z0, z1),
+ z0 = svclasta (p0, z0, z1))
+
+/*
+** clasta_bf16_tied2:
+** mov (z[0-9]+)\.d, z0\.d
+** movprfx z0, z1
+** clasta z0\.h, p0, z0\.h, \1\.h
+** ret
+*/
+TEST_UNIFORM_Z (clasta_bf16_tied2, svbfloat16_t,
+ z0 = svclasta_bf16 (p0, z1, z0),
+ z0 = svclasta (p0, z1, z0))
+
+/*
+** clasta_bf16_untied:
+** movprfx z0, z1
+** clasta z0\.h, p0, z0\.h, z2\.h
+** ret
+*/
+TEST_UNIFORM_Z (clasta_bf16_untied, svbfloat16_t,
+ z0 = svclasta_bf16 (p0, z1, z2),
+ z0 = svclasta (p0, z1, z2))
+
+/*
+** clasta_d0_bf16:
+** clasta h0, p0, h0, z2\.h
+** ret
+*/
+TEST_FOLD_LEFT_D (clasta_d0_bf16, bfloat16_t, svbfloat16_t,
+ d0 = svclasta_n_bf16 (p0, d0, z2),
+ d0 = svclasta (p0, d0, z2))
+
+/*
+** clasta_d1_bf16:
+** mov v0\.h\[0\], v1\.h\[0\]
+** clasta h0, p0, h0, z2\.h
+** ret
+*/
+TEST_FOLD_LEFT_D (clasta_d1_bf16, bfloat16_t, svbfloat16_t,
+ d0 = svclasta_n_bf16 (p0, d1, z2),
+ d0 = svclasta (p0, d1, z2))
new file mode 100644
@@ -0,0 +1,52 @@
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+/*
+** clastb_bf16_tied1:
+** clastb z0\.h, p0, z0\.h, z1\.h
+** ret
+*/
+TEST_UNIFORM_Z (clastb_bf16_tied1, svbfloat16_t,
+ z0 = svclastb_bf16 (p0, z0, z1),
+ z0 = svclastb (p0, z0, z1))
+
+/*
+** clastb_bf16_tied2:
+** mov (z[0-9]+)\.d, z0\.d
+** movprfx z0, z1
+** clastb z0\.h, p0, z0\.h, \1\.h
+** ret
+*/
+TEST_UNIFORM_Z (clastb_bf16_tied2, svbfloat16_t,
+ z0 = svclastb_bf16 (p0, z1, z0),
+ z0 = svclastb (p0, z1, z0))
+
+/*
+** clastb_bf16_untied:
+** movprfx z0, z1
+** clastb z0\.h, p0, z0\.h, z2\.h
+** ret
+*/
+TEST_UNIFORM_Z (clastb_bf16_untied, svbfloat16_t,
+ z0 = svclastb_bf16 (p0, z1, z2),
+ z0 = svclastb (p0, z1, z2))
+
+/*
+** clastb_d0_bf16:
+** clastb h0, p0, h0, z2\.h
+** ret
+*/
+TEST_FOLD_LEFT_D (clastb_d0_bf16, bfloat16_t, svbfloat16_t,
+ d0 = svclastb_n_bf16 (p0, d0, z2),
+ d0 = svclastb (p0, d0, z2))
+
+/*
+** clastb_d1_bf16:
+** mov v0\.h\[0\], v1\.h\[0\]
+** clastb h0, p0, h0, z2\.h
+** ret
+*/
+TEST_FOLD_LEFT_D (clastb_d1_bf16, bfloat16_t, svbfloat16_t,
+ d0 = svclastb_n_bf16 (p0, d1, z2),
+ d0 = svclastb (p0, d1, z2))
new file mode 100644
@@ -0,0 +1,52 @@
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+/*
+** cnt_bf16_m_tied1:
+** cnt z0\.h, p0/m, z4\.h
+** ret
+*/
+TEST_DUAL_Z (cnt_bf16_m_tied1, svuint16_t, svbfloat16_t,
+ z0 = svcnt_bf16_m (z0, p0, z4),
+ z0 = svcnt_m (z0, p0, z4))
+
+/*
+** cnt_bf16_m_untied:
+** movprfx z0, z1
+** cnt z0\.h, p0/m, z4\.h
+** ret
+*/
+TEST_DUAL_Z (cnt_bf16_m_untied, svuint16_t, svbfloat16_t,
+ z0 = svcnt_bf16_m (z1, p0, z4),
+ z0 = svcnt_m (z1, p0, z4))
+
+/*
+** cnt_bf16_z:
+** movprfx z0\.h, p0/z, z4\.h
+** cnt z0\.h, p0/m, z4\.h
+** ret
+*/
+TEST_DUAL_Z (cnt_bf16_z, svuint16_t, svbfloat16_t,
+ z0 = svcnt_bf16_z (p0, z4),
+ z0 = svcnt_z (p0, z4))
+
+/*
+** cnt_bf16_x:
+** cnt z0\.h, p0/m, z4\.h
+** ret
+*/
+TEST_DUAL_Z (cnt_bf16_x, svuint16_t, svbfloat16_t,
+ z0 = svcnt_bf16_x (p0, z4),
+ z0 = svcnt_x (p0, z4))
+
+/*
+** ptrue_cnt_bf16_x:
+** ...
+** ptrue p[0-9]+\.b[^\n]*
+** ...
+** ret
+*/
+TEST_DUAL_Z (ptrue_cnt_bf16_x, svuint16_t, svbfloat16_t,
+ z0 = svcnt_bf16_x (svptrue_b16 (), z4),
+ z0 = svcnt_x (svptrue_b16 (), z4))
@@ -42,6 +42,16 @@ TEST_CREATE (create2_u16, svuint16x2_t, svuint16_t,
z0 = svcreate2_u16 (z6, z5),
z0 = svcreate2 (z6, z5))
+/*
+** create2_bf16:
+** mov z0\.d, z4\.d
+** mov z1\.d, z5\.d
+** ret
+*/
+TEST_CREATE (create2_bf16, svbfloat16x2_t, svbfloat16_t,
+ z0 = svcreate2_bf16 (z4, z5),
+ z0 = svcreate2 (z4, z5))
+
/*
** create2_f16:
** mov z0\.d, z4\.d
@@ -46,6 +46,17 @@ TEST_CREATE (create3_u16, svuint16x3_t, svuint16_t,
z0 = svcreate3_u16 (z6, z5, z4),
z0 = svcreate3 (z6, z5, z4))
+/*
+** create3_bf16:
+** mov z0\.d, z4\.d
+** mov z1\.d, z5\.d
+** mov z2\.d, z6\.d
+** ret
+*/
+TEST_CREATE (create3_bf16, svbfloat16x3_t, svbfloat16_t,
+ z0 = svcreate3_bf16 (z4, z5, z6),
+ z0 = svcreate3 (z4, z5, z6))
+
/*
** create3_f16:
** mov z0\.d, z4\.d
@@ -50,6 +50,18 @@ TEST_CREATE (create4_u16, svuint16x4_t, svuint16_t,
z0 = svcreate4_u16 (z6, z5, z4, z7),
z0 = svcreate4 (z6, z5, z4, z7))
+/*
+** create4_bf16:
+** mov z0\.d, z4\.d
+** mov z1\.d, z5\.d
+** mov z2\.d, z6\.d
+** mov z3\.d, z7\.d
+** ret
+*/
+TEST_CREATE (create4_bf16, svbfloat16x4_t, svbfloat16_t,
+ z0 = svcreate4_bf16 (z4, z5, z6, z7),
+ z0 = svcreate4 (z4, z5, z6, z7))
+
/*
** create4_f16:
** mov z0\.d, z4\.d
new file mode 100644
@@ -0,0 +1,41 @@
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+/*
+** dup_h4_bf16:
+** mov z0\.h, h4
+** ret
+*/
+TEST_UNIFORM_ZD (dup_h4_bf16, svbfloat16_t, __bf16,
+ z0 = svdup_n_bf16 (d4),
+ z0 = svdup_bf16 (d4))
+
+/*
+** dup_h4_bf16_m:
+** movprfx z0, z1
+** mov z0\.h, p0/m, h4
+** ret
+*/
+TEST_UNIFORM_ZD (dup_h4_bf16_m, svbfloat16_t, __bf16,
+ z0 = svdup_n_bf16_m (z1, p0, d4),
+ z0 = svdup_bf16_m (z1, p0, d4))
+
+/*
+** dup_h4_bf16_z:
+** movprfx z0\.h, p0/z, z0\.h
+** mov z0\.h, p0/m, h4
+** ret
+*/
+TEST_UNIFORM_ZD (dup_h4_bf16_z, svbfloat16_t, __bf16,
+ z0 = svdup_n_bf16_z (p0, d4),
+ z0 = svdup_bf16_z (p0, d4))
+
+/*
+** dup_h4_bf16_x:
+** mov z0\.h, h4
+** ret
+*/
+TEST_UNIFORM_ZD (dup_h4_bf16_x, svbfloat16_t, __bf16,
+ z0 = svdup_n_bf16_x (p0, d4),
+ z0 = svdup_bf16_x (p0, d4))
new file mode 100644
@@ -0,0 +1,108 @@
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+/*
+** dup_lane_w0_bf16_tied1:
+** mov (z[0-9]+\.h), w0
+** tbl z0\.h, z0\.h, \1
+** ret
+*/
+TEST_UNIFORM_ZX (dup_lane_w0_bf16_tied1, svbfloat16_t, uint16_t,
+ z0 = svdup_lane_bf16 (z0, x0),
+ z0 = svdup_lane (z0, x0))
+
+/*
+** dup_lane_w0_bf16_untied:
+** mov (z[0-9]+\.h), w0
+** tbl z0\.h, z1\.h, \1
+** ret
+*/
+TEST_UNIFORM_ZX (dup_lane_w0_bf16_untied, svbfloat16_t, uint16_t,
+ z0 = svdup_lane_bf16 (z1, x0),
+ z0 = svdup_lane (z1, x0))
+
+/*
+** dup_lane_0_bf16_tied1:
+** dup z0\.h, z0\.h\[0\]
+** ret
+*/
+TEST_UNIFORM_Z (dup_lane_0_bf16_tied1, svbfloat16_t,
+ z0 = svdup_lane_bf16 (z0, 0),
+ z0 = svdup_lane (z0, 0))
+
+/*
+** dup_lane_0_bf16_untied:
+** dup z0\.h, z1\.h\[0\]
+** ret
+*/
+TEST_UNIFORM_Z (dup_lane_0_bf16_untied, svbfloat16_t,
+ z0 = svdup_lane_bf16 (z1, 0),
+ z0 = svdup_lane (z1, 0))
+
+/*
+** dup_lane_15_bf16:
+** dup z0\.h, z0\.h\[15\]
+** ret
+*/
+TEST_UNIFORM_Z (dup_lane_15_bf16, svbfloat16_t,
+ z0 = svdup_lane_bf16 (z0, 15),
+ z0 = svdup_lane (z0, 15))
+
+/*
+** dup_lane_16_bf16:
+** dup z0\.h, z0\.h\[16\]
+** ret
+*/
+TEST_UNIFORM_Z (dup_lane_16_bf16, svbfloat16_t,
+ z0 = svdup_lane_bf16 (z0, 16),
+ z0 = svdup_lane (z0, 16))
+
+/*
+** dup_lane_31_bf16:
+** dup z0\.h, z0\.h\[31\]
+** ret
+*/
+TEST_UNIFORM_Z (dup_lane_31_bf16, svbfloat16_t,
+ z0 = svdup_lane_bf16 (z0, 31),
+ z0 = svdup_lane (z0, 31))
+
+/*
+** dup_lane_32_bf16:
+** mov (z[0-9]+\.h), #32
+** tbl z0\.h, z0\.h, \1
+** ret
+*/
+TEST_UNIFORM_Z (dup_lane_32_bf16, svbfloat16_t,
+ z0 = svdup_lane_bf16 (z0, 32),
+ z0 = svdup_lane (z0, 32))
+
+/*
+** dup_lane_63_bf16:
+** mov (z[0-9]+\.h), #63
+** tbl z0\.h, z0\.h, \1
+** ret
+*/
+TEST_UNIFORM_Z (dup_lane_63_bf16, svbfloat16_t,
+ z0 = svdup_lane_bf16 (z0, 63),
+ z0 = svdup_lane (z0, 63))
+
+/*
+** dup_lane_64_bf16:
+** mov (z[0-9]+\.h), #64
+** tbl z0\.h, z0\.h, \1
+** ret
+*/
+TEST_UNIFORM_Z (dup_lane_64_bf16, svbfloat16_t,
+ z0 = svdup_lane_bf16 (z0, 64),
+ z0 = svdup_lane (z0, 64))
+
+/*
+** dup_lane_255_bf16:
+** mov (z[0-9]+\.h), #255
+** tbl z0\.h, z0\.h, \1
+** ret
+*/
+TEST_UNIFORM_Z (dup_lane_255_bf16, svbfloat16_t,
+ z0 = svdup_lane_bf16 (z0, 255),
+ z0 = svdup_lane (z0, 255))
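Note: the 31/32 cutover in these tests is the architectural limit: DUP
(indexed) encodes .h lane numbers 0-31, and anything larger falls back to TBL
with a constant index vector.  The two shapes side by side:

#include <arm_sve.h>

svbfloat16_t splat31 (svbfloat16_t x) { return svdup_lane_bf16 (x, 31); } /* dup */
svbfloat16_t splat32 (svbfloat16_t x) { return svdup_lane_bf16 (x, 32); } /* tbl */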
new file mode 100644
@@ -0,0 +1,48 @@
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+/*
+** dupq_lane_0_bf16_tied:
+** dup z0\.q, z0\.q\[0\]
+** ret
+*/
+TEST_UNIFORM_Z (dupq_lane_0_bf16_tied, svbfloat16_t,
+ z0 = svdupq_lane_bf16 (z0, 0),
+ z0 = svdupq_lane (z0, 0))
+
+/*
+** dupq_lane_0_bf16_untied:
+** dup z0\.q, z1\.q\[0\]
+** ret
+*/
+TEST_UNIFORM_Z (dupq_lane_0_bf16_untied, svbfloat16_t,
+ z0 = svdupq_lane_bf16 (z1, 0),
+ z0 = svdupq_lane (z1, 0))
+
+/*
+** dupq_lane_1_bf16:
+** dup z0\.q, z0\.q\[1\]
+** ret
+*/
+TEST_UNIFORM_Z (dupq_lane_1_bf16, svbfloat16_t,
+ z0 = svdupq_lane_bf16 (z0, 1),
+ z0 = svdupq_lane (z0, 1))
+
+/*
+** dupq_lane_2_bf16:
+** dup z0\.q, z0\.q\[2\]
+** ret
+*/
+TEST_UNIFORM_Z (dupq_lane_2_bf16, svbfloat16_t,
+ z0 = svdupq_lane_bf16 (z0, 2),
+ z0 = svdupq_lane (z0, 2))
+
+/*
+** dupq_lane_3_bf16:
+** dup z0\.q, z0\.q\[3\]
+** ret
+*/
+TEST_UNIFORM_Z (dupq_lane_3_bf16, svbfloat16_t,
+ z0 = svdupq_lane_bf16 (z0, 3),
+ z0 = svdupq_lane (z0, 3))
new file mode 100644
@@ -0,0 +1,73 @@
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+/*
+** ext_0_bf16_tied1:
+** ext z0\.b, z0\.b, z1\.b, #0
+** ret
+*/
+TEST_UNIFORM_Z (ext_0_bf16_tied1, svbfloat16_t,
+ z0 = svext_bf16 (z0, z1, 0),
+ z0 = svext (z0, z1, 0))
+
+/*
+** ext_0_bf16_tied2:
+** mov (z[0-9]+)\.d, z0\.d
+** movprfx z0, z1
+** ext z0\.b, z0\.b, \1\.b, #0
+** ret
+*/
+TEST_UNIFORM_Z (ext_0_bf16_tied2, svbfloat16_t,
+ z0 = svext_bf16 (z1, z0, 0),
+ z0 = svext (z1, z0, 0))
+
+/*
+** ext_0_bf16_untied:
+** movprfx z0, z1
+** ext z0\.b, z0\.b, z2\.b, #0
+** ret
+*/
+TEST_UNIFORM_Z (ext_0_bf16_untied, svbfloat16_t,
+ z0 = svext_bf16 (z1, z2, 0),
+ z0 = svext (z1, z2, 0))
+
+/*
+** ext_1_bf16:
+** movprfx z0, z1
+** ext z0\.b, z0\.b, z2\.b, #2
+** ret
+*/
+TEST_UNIFORM_Z (ext_1_bf16, svbfloat16_t,
+ z0 = svext_bf16 (z1, z2, 1),
+ z0 = svext (z1, z2, 1))
+
+/*
+** ext_2_bf16:
+** movprfx z0, z1
+** ext z0\.b, z0\.b, z2\.b, #4
+** ret
+*/
+TEST_UNIFORM_Z (ext_2_bf16, svbfloat16_t,
+ z0 = svext_bf16 (z1, z2, 2),
+ z0 = svext (z1, z2, 2))
+
+/*
+** ext_3_bf16:
+** movprfx z0, z1
+** ext z0\.b, z0\.b, z2\.b, #6
+** ret
+*/
+TEST_UNIFORM_Z (ext_3_bf16, svbfloat16_t,
+ z0 = svext_bf16 (z1, z2, 3),
+ z0 = svext (z1, z2, 3))
+
+/*
+** ext_127_bf16:
+** movprfx z0, z1
+** ext z0\.b, z0\.b, z2\.b, #254
+** ret
+*/
+TEST_UNIFORM_Z (ext_127_bf16, svbfloat16_t,
+ z0 = svext_bf16 (z1, z2, 127),
+ z0 = svext (z1, z2, 127))
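Note: EXT counts in bytes, so for 16-bit elements the intrinsic's element
offset is doubled in the immediate: index 1 becomes #2 and index 127 becomes
#254, the largest offset the tests exercise.  For example:

#include <arm_sve.h>

svbfloat16_t
shift_one (svbfloat16_t a, svbfloat16_t b)
{
  return svext_bf16 (a, b, 1);   /* emits ext ..., #2 */
}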
new file mode 100644
@@ -0,0 +1,55 @@
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+/*
+** get2_bf16_z0_0:
+** mov z0\.d, z4\.d
+** ret
+*/
+TEST_GET (get2_bf16_z0_0, svbfloat16x2_t, svbfloat16_t,
+ z0 = svget2_bf16 (z4, 0),
+ z0 = svget2 (z4, 0))
+
+/*
+** get2_bf16_z0_1:
+** mov z0\.d, z5\.d
+** ret
+*/
+TEST_GET (get2_bf16_z0_1, svbfloat16x2_t, svbfloat16_t,
+ z0 = svget2_bf16 (z4, 1),
+ z0 = svget2 (z4, 1))
+
+/*
+** get2_bf16_z4_0:
+** ret
+*/
+TEST_GET (get2_bf16_z4_0, svbfloat16x2_t, svbfloat16_t,
+ z4_res = svget2_bf16 (z4, 0),
+ z4_res = svget2 (z4, 0))
+
+/*
+** get2_bf16_z4_1:
+** mov z4\.d, z5\.d
+** ret
+*/
+TEST_GET (get2_bf16_z4_1, svbfloat16x2_t, svbfloat16_t,
+ z4_res = svget2_bf16 (z4, 1),
+ z4_res = svget2 (z4, 1))
+
+/*
+** get2_bf16_z5_0:
+** mov z5\.d, z4\.d
+** ret
+*/
+TEST_GET (get2_bf16_z5_0, svbfloat16x2_t, svbfloat16_t,
+ z5_res = svget2_bf16 (z4, 0),
+ z5_res = svget2 (z4, 0))
+
+/*
+** get2_bf16_z5_1:
+** ret
+*/
+TEST_GET (get2_bf16_z5_1, svbfloat16x2_t, svbfloat16_t,
+ z5_res = svget2_bf16 (z4, 1),
+ z5_res = svget2 (z4, 1))
new file mode 100644
@@ -0,0 +1,108 @@
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+/*
+** get3_bf16_z0_0:
+** mov z0\.d, z4\.d
+** ret
+*/
+TEST_GET (get3_bf16_z0_0, svbfloat16x3_t, svbfloat16_t,
+ z0 = svget3_bf16 (z4, 0),
+ z0 = svget3 (z4, 0))
+
+/*
+** get3_bf16_z0_1:
+** mov z0\.d, z5\.d
+** ret
+*/
+TEST_GET (get3_bf16_z0_1, svbfloat16x3_t, svbfloat16_t,
+ z0 = svget3_bf16 (z4, 1),
+ z0 = svget3 (z4, 1))
+
+/*
+** get3_bf16_z0_2:
+** mov z0\.d, z6\.d
+** ret
+*/
+TEST_GET (get3_bf16_z0_2, svbfloat16x3_t, svbfloat16_t,
+ z0 = svget3_bf16 (z4, 2),
+ z0 = svget3 (z4, 2))
+
+/*
+** get3_bf16_z4_0:
+** ret
+*/
+TEST_GET (get3_bf16_z4_0, svbfloat16x3_t, svbfloat16_t,
+ z4_res = svget3_bf16 (z4, 0),
+ z4_res = svget3 (z4, 0))
+
+/*
+** get3_bf16_z4_1:
+** mov z4\.d, z5\.d
+** ret
+*/
+TEST_GET (get3_bf16_z4_1, svbfloat16x3_t, svbfloat16_t,
+ z4_res = svget3_bf16 (z4, 1),
+ z4_res = svget3 (z4, 1))
+
+/*
+** get3_bf16_z4_2:
+** mov z4\.d, z6\.d
+** ret
+*/
+TEST_GET (get3_bf16_z4_2, svbfloat16x3_t, svbfloat16_t,
+ z4_res = svget3_bf16 (z4, 2),
+ z4_res = svget3 (z4, 2))
+
+/*
+** get3_bf16_z5_0:
+** mov z5\.d, z4\.d
+** ret
+*/
+TEST_GET (get3_bf16_z5_0, svbfloat16x3_t, svbfloat16_t,
+ z5_res = svget3_bf16 (z4, 0),
+ z5_res = svget3 (z4, 0))
+
+/*
+** get3_bf16_z5_1:
+** ret
+*/
+TEST_GET (get3_bf16_z5_1, svbfloat16x3_t, svbfloat16_t,
+ z5_res = svget3_bf16 (z4, 1),
+ z5_res = svget3 (z4, 1))
+
+/*
+** get3_bf16_z5_2:
+** mov z5\.d, z6\.d
+** ret
+*/
+TEST_GET (get3_bf16_z5_2, svbfloat16x3_t, svbfloat16_t,
+ z5_res = svget3_bf16 (z4, 2),
+ z5_res = svget3 (z4, 2))
+
+/*
+** get3_bf16_z6_0:
+** mov z6\.d, z4\.d
+** ret
+*/
+TEST_GET (get3_bf16_z6_0, svbfloat16x3_t, svbfloat16_t,
+ z6_res = svget3_bf16 (z4, 0),
+ z6_res = svget3 (z4, 0))
+
+/*
+** get3_bf16_z6_1:
+** mov z6\.d, z5\.d
+** ret
+*/
+TEST_GET (get3_bf16_z6_1, svbfloat16x3_t, svbfloat16_t,
+ z6_res = svget3_bf16 (z4, 1),
+ z6_res = svget3 (z4, 1))
+
+/*
+** get3_bf16_z6_2:
+** ret
+*/
+TEST_GET (get3_bf16_z6_2, svbfloat16x3_t, svbfloat16_t,
+ z6_res = svget3_bf16 (z4, 2),
+ z6_res = svget3 (z4, 2))
new file mode 100644
@@ -0,0 +1,179 @@
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+/*
+** get4_bf16_z0_0:
+** mov z0\.d, z4\.d
+** ret
+*/
+TEST_GET (get4_bf16_z0_0, svbfloat16x4_t, svbfloat16_t,
+ z0 = svget4_bf16 (z4, 0),
+ z0 = svget4 (z4, 0))
+
+/*
+** get4_bf16_z0_1:
+** mov z0\.d, z5\.d
+** ret
+*/
+TEST_GET (get4_bf16_z0_1, svbfloat16x4_t, svbfloat16_t,
+ z0 = svget4_bf16 (z4, 1),
+ z0 = svget4 (z4, 1))
+
+/*
+** get4_bf16_z0_2:
+** mov z0\.d, z6\.d
+** ret
+*/
+TEST_GET (get4_bf16_z0_2, svbfloat16x4_t, svbfloat16_t,
+ z0 = svget4_bf16 (z4, 2),
+ z0 = svget4 (z4, 2))
+
+/*
+** get4_bf16_z0_3:
+** mov z0\.d, z7\.d
+** ret
+*/
+TEST_GET (get4_bf16_z0_3, svbfloat16x4_t, svbfloat16_t,
+ z0 = svget4_bf16 (z4, 3),
+ z0 = svget4 (z4, 3))
+
+/*
+** get4_bf16_z4_0:
+** ret
+*/
+TEST_GET (get4_bf16_z4_0, svbfloat16x4_t, svbfloat16_t,
+ z4_res = svget4_bf16 (z4, 0),
+ z4_res = svget4 (z4, 0))
+
+/*
+** get4_bf16_z4_1:
+** mov z4\.d, z5\.d
+** ret
+*/
+TEST_GET (get4_bf16_z4_1, svbfloat16x4_t, svbfloat16_t,
+ z4_res = svget4_bf16 (z4, 1),
+ z4_res = svget4 (z4, 1))
+
+/*
+** get4_bf16_z4_2:
+** mov z4\.d, z6\.d
+** ret
+*/
+TEST_GET (get4_bf16_z4_2, svbfloat16x4_t, svbfloat16_t,
+ z4_res = svget4_bf16 (z4, 2),
+ z4_res = svget4 (z4, 2))
+
+/*
+** get4_bf16_z4_3:
+** mov z4\.d, z7\.d
+** ret
+*/
+TEST_GET (get4_bf16_z4_3, svbfloat16x4_t, svbfloat16_t,
+ z4_res = svget4_bf16 (z4, 3),
+ z4_res = svget4 (z4, 3))
+
+/*
+** get4_bf16_z5_0:
+** mov z5\.d, z4\.d
+** ret
+*/
+TEST_GET (get4_bf16_z5_0, svbfloat16x4_t, svbfloat16_t,
+ z5_res = svget4_bf16 (z4, 0),
+ z5_res = svget4 (z4, 0))
+
+/*
+** get4_bf16_z5_1:
+** ret
+*/
+TEST_GET (get4_bf16_z5_1, svbfloat16x4_t, svbfloat16_t,
+ z5_res = svget4_bf16 (z4, 1),
+ z5_res = svget4 (z4, 1))
+
+/*
+** get4_bf16_z5_2:
+** mov z5\.d, z6\.d
+** ret
+*/
+TEST_GET (get4_bf16_z5_2, svbfloat16x4_t, svbfloat16_t,
+ z5_res = svget4_bf16 (z4, 2),
+ z5_res = svget4 (z4, 2))
+
+/*
+** get4_bf16_z5_3:
+** mov z5\.d, z7\.d
+** ret
+*/
+TEST_GET (get4_bf16_z5_3, svbfloat16x4_t, svbfloat16_t,
+ z5_res = svget4_bf16 (z4, 3),
+ z5_res = svget4 (z4, 3))
+
+/*
+** get4_bf16_z6_0:
+** mov z6\.d, z4\.d
+** ret
+*/
+TEST_GET (get4_bf16_z6_0, svbfloat16x4_t, svbfloat16_t,
+ z6_res = svget4_bf16 (z4, 0),
+ z6_res = svget4 (z4, 0))
+
+/*
+** get4_bf16_z6_1:
+** mov z6\.d, z5\.d
+** ret
+*/
+TEST_GET (get4_bf16_z6_1, svbfloat16x4_t, svbfloat16_t,
+ z6_res = svget4_bf16 (z4, 1),
+ z6_res = svget4 (z4, 1))
+
+/*
+** get4_bf16_z6_2:
+** ret
+*/
+TEST_GET (get4_bf16_z6_2, svbfloat16x4_t, svbfloat16_t,
+ z6_res = svget4_bf16 (z4, 2),
+ z6_res = svget4 (z4, 2))
+
+/*
+** get4_bf16_z6_3:
+** mov z6\.d, z7\.d
+** ret
+*/
+TEST_GET (get4_bf16_z6_3, svbfloat16x4_t, svbfloat16_t,
+ z6_res = svget4_bf16 (z4, 3),
+ z6_res = svget4 (z4, 3))
+
+/*
+** get4_bf16_z7_0:
+** mov z7\.d, z4\.d
+** ret
+*/
+TEST_GET (get4_bf16_z7_0, svbfloat16x4_t, svbfloat16_t,
+ z7_res = svget4_bf16 (z4, 0),
+ z7_res = svget4 (z4, 0))
+
+/*
+** get4_bf16_z7_1:
+** mov z7\.d, z5\.d
+** ret
+*/
+TEST_GET (get4_bf16_z7_1, svbfloat16x4_t, svbfloat16_t,
+ z7_res = svget4_bf16 (z4, 1),
+ z7_res = svget4 (z4, 1))
+
+/*
+** get4_bf16_z7_2:
+** mov z7\.d, z6\.d
+** ret
+*/
+TEST_GET (get4_bf16_z7_2, svbfloat16x4_t, svbfloat16_t,
+ z7_res = svget4_bf16 (z4, 2),
+ z7_res = svget4 (z4, 2))
+
+/*
+** get4_bf16_z7_3:
+** ret
+*/
+TEST_GET (get4_bf16_z7_3, svbfloat16x4_t, svbfloat16_t,
+ z7_res = svget4_bf16 (z4, 3),
+ z7_res = svget4 (z4, 3))
new file mode 100644
@@ -0,0 +1,22 @@
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+/*
+** insr_h4_bf16_tied1:
+** insr z0\.h, h4
+** ret
+*/
+TEST_UNIFORM_ZD (insr_h4_bf16_tied1, svbfloat16_t, bfloat16_t,
+ z0 = svinsr_n_bf16 (z0, d4),
+ z0 = svinsr (z0, d4))
+
+/*
+** insr_h4_bf16_untied:
+** movprfx z0, z1
+** insr z0\.h, h4
+** ret
+*/
+TEST_UNIFORM_ZD (insr_h4_bf16_untied, svbfloat16_t, bfloat16_t,
+ z0 = svinsr_n_bf16 (z1, d4),
+ z0 = svinsr (z1, d4))
new file mode 100644
@@ -0,0 +1,21 @@
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+/*
+** lasta_d0_bf16_tied:
+** lasta h0, p0, z0\.h
+** ret
+*/
+TEST_REDUCTION_D (lasta_d0_bf16_tied, bfloat16_t, svbfloat16_t,
+ d0 = svlasta_bf16 (p0, z0),
+ d0 = svlasta (p0, z0))
+
+/*
+** lasta_d0_bf16_untied:
+** lasta h0, p0, z1\.h
+** ret
+*/
+TEST_REDUCTION_D (lasta_d0_bf16_untied, bfloat16_t, svbfloat16_t,
+ d0 = svlasta_bf16 (p0, z1),
+ d0 = svlasta (p0, z1))
new file mode 100644
@@ -0,0 +1,21 @@
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+/*
+** lastb_d0_bf16_tied:
+** lastb h0, p0, z0\.h
+** ret
+*/
+TEST_REDUCTION_D (lastb_d0_bf16_tied, bfloat16_t, svbfloat16_t,
+ d0 = svlastb_bf16 (p0, z0),
+ d0 = svlastb (p0, z0))
+
+/*
+** lastb_d0_bf16_untied:
+** lastb h0, p0, z1\.h
+** ret
+*/
+TEST_REDUCTION_D (lastb_d0_bf16_untied, bfloat16_t, svbfloat16_t,
+ d0 = svlastb_bf16 (p0, z1),
+ d0 = svlastb (p0, z1))
new file mode 100644
@@ -0,0 +1,158 @@
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */
+
+#include "test_sve_acle.h"
+
+/*
+** ld1_bf16_base:
+** ld1h z0\.h, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ld1_bf16_base, svbfloat16_t, bfloat16_t,
+ z0 = svld1_bf16 (p0, x0),
+ z0 = svld1 (p0, x0))
+
+/*
+** ld1_bf16_index:
+** ld1h z0\.h, p0/z, \[x0, x1, lsl 1\]
+** ret
+*/
+TEST_LOAD (ld1_bf16_index, svbfloat16_t, bfloat16_t,
+ z0 = svld1_bf16 (p0, x0 + x1),
+ z0 = svld1 (p0, x0 + x1))
+
+/*
+** ld1_bf16_1:
+** ld1h z0\.h, p0/z, \[x0, #1, mul vl\]
+** ret
+*/
+TEST_LOAD (ld1_bf16_1, svbfloat16_t, bfloat16_t,
+ z0 = svld1_bf16 (p0, x0 + svcnth ()),
+ z0 = svld1 (p0, x0 + svcnth ()))
+
+/*
+** ld1_bf16_7:
+** ld1h z0\.h, p0/z, \[x0, #7, mul vl\]
+** ret
+*/
+TEST_LOAD (ld1_bf16_7, svbfloat16_t, bfloat16_t,
+ z0 = svld1_bf16 (p0, x0 + svcnth () * 7),
+ z0 = svld1 (p0, x0 + svcnth () * 7))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** ld1_bf16_8:
+** incb x0, all, mul #8
+** ld1h z0\.h, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ld1_bf16_8, svbfloat16_t, bfloat16_t,
+ z0 = svld1_bf16 (p0, x0 + svcnth () * 8),
+ z0 = svld1 (p0, x0 + svcnth () * 8))
+
+/*
+** ld1_bf16_m1:
+** ld1h z0\.h, p0/z, \[x0, #-1, mul vl\]
+** ret
+*/
+TEST_LOAD (ld1_bf16_m1, svbfloat16_t, bfloat16_t,
+ z0 = svld1_bf16 (p0, x0 - svcnth ()),
+ z0 = svld1 (p0, x0 - svcnth ()))
+
+/*
+** ld1_bf16_m8:
+** ld1h z0\.h, p0/z, \[x0, #-8, mul vl\]
+** ret
+*/
+TEST_LOAD (ld1_bf16_m8, svbfloat16_t, bfloat16_t,
+ z0 = svld1_bf16 (p0, x0 - svcnth () * 8),
+ z0 = svld1 (p0, x0 - svcnth () * 8))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** ld1_bf16_m9:
+** decb x0, all, mul #9
+** ld1h z0\.h, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ld1_bf16_m9, svbfloat16_t, bfloat16_t,
+ z0 = svld1_bf16 (p0, x0 - svcnth () * 9),
+ z0 = svld1 (p0, x0 - svcnth () * 9))
+
+/*
+** ld1_vnum_bf16_0:
+** ld1h z0\.h, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ld1_vnum_bf16_0, svbfloat16_t, bfloat16_t,
+ z0 = svld1_vnum_bf16 (p0, x0, 0),
+ z0 = svld1_vnum (p0, x0, 0))
+
+/*
+** ld1_vnum_bf16_1:
+** ld1h z0\.h, p0/z, \[x0, #1, mul vl\]
+** ret
+*/
+TEST_LOAD (ld1_vnum_bf16_1, svbfloat16_t, bfloat16_t,
+ z0 = svld1_vnum_bf16 (p0, x0, 1),
+ z0 = svld1_vnum (p0, x0, 1))
+
+/*
+** ld1_vnum_bf16_7:
+** ld1h z0\.h, p0/z, \[x0, #7, mul vl\]
+** ret
+*/
+TEST_LOAD (ld1_vnum_bf16_7, svbfloat16_t, bfloat16_t,
+ z0 = svld1_vnum_bf16 (p0, x0, 7),
+ z0 = svld1_vnum (p0, x0, 7))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** ld1_vnum_bf16_8:
+** incb x0, all, mul #8
+** ld1h z0\.h, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ld1_vnum_bf16_8, svbfloat16_t, bfloat16_t,
+ z0 = svld1_vnum_bf16 (p0, x0, 8),
+ z0 = svld1_vnum (p0, x0, 8))
+
+/*
+** ld1_vnum_bf16_m1:
+** ld1h z0\.h, p0/z, \[x0, #-1, mul vl\]
+** ret
+*/
+TEST_LOAD (ld1_vnum_bf16_m1, svbfloat16_t, bfloat16_t,
+ z0 = svld1_vnum_bf16 (p0, x0, -1),
+ z0 = svld1_vnum (p0, x0, -1))
+
+/*
+** ld1_vnum_bf16_m8:
+** ld1h z0\.h, p0/z, \[x0, #-8, mul vl\]
+** ret
+*/
+TEST_LOAD (ld1_vnum_bf16_m8, svbfloat16_t, bfloat16_t,
+ z0 = svld1_vnum_bf16 (p0, x0, -8),
+ z0 = svld1_vnum (p0, x0, -8))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** ld1_vnum_bf16_m9:
+** decb x0, all, mul #9
+** ld1h z0\.h, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ld1_vnum_bf16_m9, svbfloat16_t, bfloat16_t,
+ z0 = svld1_vnum_bf16 (p0, x0, -9),
+ z0 = svld1_vnum (p0, x0, -9))
+
+/* Using MUL to calculate an index would also be OK. */
+/*
+** ld1_vnum_bf16_x1:
+** cntb (x[0-9]+)
+** madd (x[0-9]+), (x1, \1|\1, x1), x0
+** ld1h z0\.h, p0/z, \[\2\]
+** ret
+*/
+TEST_LOAD (ld1_vnum_bf16_x1, svbfloat16_t, bfloat16_t,
+ z0 = svld1_vnum_bf16 (p0, x0, x1),
+ z0 = svld1_vnum (p0, x0, x1))
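Note: these patterns pin down the reg + immediate window of ld1h: offsets of
-8 to +7 vector lengths use the "#n, mul vl" form, while +8 or -9 force the
address to be materialised with incb/decb.  A hedged sketch that stays inside
the window, assuming the svst1_bf16 counterpart this patch also enables:

#include <arm_sve.h>

void
copy_two_vectors (svbool_t pg, const bfloat16_t *src, bfloat16_t *dst)
{
  svst1_bf16 (pg, dst, svld1_bf16 (pg, src));                         /* #0 */
  svst1_bf16 (pg, dst + svcnth (), svld1_bf16 (pg, src + svcnth ())); /* #1, mul vl */
}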
new file mode 100644
@@ -0,0 +1,120 @@
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */
+/* { dg-additional-options "-march=armv8.6-a+f64mm" } */
+/* { dg-require-effective-target aarch64_asm_f64mm_ok } */
+
+#include "test_sve_acle.h"
+
+/*
+** ld1ro_bf16_base:
+** ld1roh z0\.h, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ld1ro_bf16_base, svbfloat16_t, bfloat16_t,
+ z0 = svld1ro_bf16 (p0, x0),
+ z0 = svld1ro (p0, x0))
+
+/*
+** ld1ro_bf16_index:
+** ld1roh z0\.h, p0/z, \[x0, x1, lsl 1\]
+** ret
+*/
+TEST_LOAD (ld1ro_bf16_index, svbfloat16_t, bfloat16_t,
+ z0 = svld1ro_bf16 (p0, x0 + x1),
+ z0 = svld1ro (p0, x0 + x1))
+
+/*
+** ld1ro_bf16_1:
+** add (x[0-9]+), x0, #?2
+** ld1roh z0\.h, p0/z, \[\1\]
+** ret
+*/
+TEST_LOAD (ld1ro_bf16_1, svbfloat16_t, bfloat16_t,
+ z0 = svld1ro_bf16 (p0, x0 + 1),
+ z0 = svld1ro (p0, x0 + 1))
+
+/*
+** ld1ro_bf16_8:
+** add (x[0-9]+), x0, #?16
+** ld1roh z0\.h, p0/z, \[\1\]
+** ret
+*/
+TEST_LOAD (ld1ro_bf16_8, svbfloat16_t, bfloat16_t,
+ z0 = svld1ro_bf16 (p0, x0 + 8),
+ z0 = svld1ro (p0, x0 + 8))
+
+/*
+** ld1ro_bf16_128:
+** add (x[0-9]+), x0, #?256
+** ld1roh z0\.h, p0/z, \[\1\]
+** ret
+*/
+TEST_LOAD (ld1ro_bf16_128, svbfloat16_t, bfloat16_t,
+ z0 = svld1ro_bf16 (p0, x0 + 128),
+ z0 = svld1ro (p0, x0 + 128))
+
+/*
+** ld1ro_bf16_m1:
+** sub (x[0-9]+), x0, #?2
+** ld1roh z0\.h, p0/z, \[\1\]
+** ret
+*/
+TEST_LOAD (ld1ro_bf16_m1, svbfloat16_t, bfloat16_t,
+ z0 = svld1ro_bf16 (p0, x0 - 1),
+ z0 = svld1ro (p0, x0 - 1))
+
+/*
+** ld1ro_bf16_m8:
+** sub (x[0-9]+), x0, #?16
+** ld1roh z0\.h, p0/z, \[\1\]
+** ret
+*/
+TEST_LOAD (ld1ro_bf16_m8, svbfloat16_t, bfloat16_t,
+ z0 = svld1ro_bf16 (p0, x0 - 8),
+ z0 = svld1ro (p0, x0 - 8))
+
+/*
+** ld1ro_bf16_m144:
+** sub (x[0-9]+), x0, #?288
+** ld1roh z0\.h, p0/z, \[\1\]
+** ret
+*/
+TEST_LOAD (ld1ro_bf16_m144, svbfloat16_t, bfloat16_t,
+ z0 = svld1ro_bf16 (p0, x0 - 144),
+ z0 = svld1ro (p0, x0 - 144))
+
+/*
+** ld1ro_bf16_16:
+** ld1roh z0\.h, p0/z, \[x0, #?32\]
+** ret
+*/
+TEST_LOAD (ld1ro_bf16_16, svbfloat16_t, bfloat16_t,
+ z0 = svld1ro_bf16 (p0, x0 + 16),
+ z0 = svld1ro (p0, x0 + 16))
+
+/*
+** ld1ro_bf16_112:
+** ld1roh z0\.h, p0/z, \[x0, #?224\]
+** ret
+*/
+TEST_LOAD (ld1ro_bf16_112, svbfloat16_t, bfloat16_t,
+ z0 = svld1ro_bf16 (p0, x0 + 112),
+ z0 = svld1ro (p0, x0 + 112))
+
+/*
+** ld1ro_bf16_m16:
+** ld1roh z0\.h, p0/z, \[x0, #?-32\]
+** ret
+*/
+TEST_LOAD (ld1ro_bf16_m16, svbfloat16_t, bfloat16_t,
+ z0 = svld1ro_bf16 (p0, x0 - 16),
+ z0 = svld1ro (p0, x0 - 16))
+
+/*
+** ld1ro_bf16_m128:
+** ld1roh z0\.h, p0/z, \[x0, #?-256\]
+** ret
+*/
+TEST_LOAD (ld1ro_bf16_m128, svbfloat16_t, bfloat16_t,
+ z0 = svld1ro_bf16 (p0, x0 - 128),
+ z0 = svld1ro (p0, x0 - 128))
+
new file mode 100644
@@ -0,0 +1,137 @@
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */
+
+#include "test_sve_acle.h"
+
+/*
+** ld1rq_bf16_base:
+** ld1rqh z0\.h, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ld1rq_bf16_base, svbfloat16_t, bfloat16_t,
+ z0 = svld1rq_bf16 (p0, x0),
+ z0 = svld1rq (p0, x0))
+
+/*
+** ld1rq_bf16_index:
+** ld1rqh z0\.h, p0/z, \[x0, x1, lsl 1\]
+** ret
+*/
+TEST_LOAD (ld1rq_bf16_index, svbfloat16_t, bfloat16_t,
+ z0 = svld1rq_bf16 (p0, x0 + x1),
+ z0 = svld1rq (p0, x0 + x1))
+
+/*
+** ld1rq_bf16_1:
+** add (x[0-9]+), x0, #?2
+** ld1rqh z0\.h, p0/z, \[\1\]
+** ret
+*/
+TEST_LOAD (ld1rq_bf16_1, svbfloat16_t, bfloat16_t,
+ z0 = svld1rq_bf16 (p0, x0 + 1),
+ z0 = svld1rq (p0, x0 + 1))
+
+/*
+** ld1rq_bf16_4:
+** add (x[0-9]+), x0, #?8
+** ld1rqh z0\.h, p0/z, \[\1\]
+** ret
+*/
+TEST_LOAD (ld1rq_bf16_4, svbfloat16_t, bfloat16_t,
+ z0 = svld1rq_bf16 (p0, x0 + 4),
+ z0 = svld1rq (p0, x0 + 4))
+
+/*
+** ld1rq_bf16_7:
+** add (x[0-9]+), x0, #?14
+** ld1rqh z0\.h, p0/z, \[\1\]
+** ret
+*/
+TEST_LOAD (ld1rq_bf16_7, svbfloat16_t, bfloat16_t,
+ z0 = svld1rq_bf16 (p0, x0 + 7),
+ z0 = svld1rq (p0, x0 + 7))
+
+/*
+** ld1rq_bf16_8:
+** ld1rqh z0\.h, p0/z, \[x0, #?16\]
+** ret
+*/
+TEST_LOAD (ld1rq_bf16_8, svbfloat16_t, bfloat16_t,
+ z0 = svld1rq_bf16 (p0, x0 + 8),
+ z0 = svld1rq (p0, x0 + 8))
+
+/*
+** ld1rq_bf16_56:
+** ld1rqh z0\.h, p0/z, \[x0, #?112\]
+** ret
+*/
+TEST_LOAD (ld1rq_bf16_56, svbfloat16_t, bfloat16_t,
+ z0 = svld1rq_bf16 (p0, x0 + 56),
+ z0 = svld1rq (p0, x0 + 56))
+
+/*
+** ld1rq_bf16_64:
+** add (x[0-9]+), x0, #?128
+** ld1rqh z0\.h, p0/z, \[\1\]
+** ret
+*/
+TEST_LOAD (ld1rq_bf16_64, svbfloat16_t, bfloat16_t,
+ z0 = svld1rq_bf16 (p0, x0 + 64),
+ z0 = svld1rq (p0, x0 + 64))
+
+/*
+** ld1rq_bf16_m1:
+** sub (x[0-9]+), x0, #?2
+** ld1rqh z0\.h, p0/z, \[\1\]
+** ret
+*/
+TEST_LOAD (ld1rq_bf16_m1, svbfloat16_t, bfloat16_t,
+ z0 = svld1rq_bf16 (p0, x0 - 1),
+ z0 = svld1rq (p0, x0 - 1))
+
+/*
+** ld1rq_bf16_m4:
+** sub (x[0-9]+), x0, #?8
+** ld1rqh z0\.h, p0/z, \[\1\]
+** ret
+*/
+TEST_LOAD (ld1rq_bf16_m4, svbfloat16_t, bfloat16_t,
+ z0 = svld1rq_bf16 (p0, x0 - 4),
+ z0 = svld1rq (p0, x0 - 4))
+
+/*
+** ld1rq_bf16_m7:
+** sub (x[0-9]+), x0, #?14
+** ld1rqh z0\.h, p0/z, \[\1\]
+** ret
+*/
+TEST_LOAD (ld1rq_bf16_m7, svbfloat16_t, bfloat16_t,
+ z0 = svld1rq_bf16 (p0, x0 - 7),
+ z0 = svld1rq (p0, x0 - 7))
+
+/*
+** ld1rq_bf16_m8:
+** ld1rqh z0\.h, p0/z, \[x0, #?-16\]
+** ret
+*/
+TEST_LOAD (ld1rq_bf16_m8, svbfloat16_t, bfloat16_t,
+ z0 = svld1rq_bf16 (p0, x0 - 8),
+ z0 = svld1rq (p0, x0 - 8))
+
+/*
+** ld1rq_bf16_m64:
+** ld1rqh z0\.h, p0/z, \[x0, #?-128\]
+** ret
+*/
+TEST_LOAD (ld1rq_bf16_m64, svbfloat16_t, bfloat16_t,
+ z0 = svld1rq_bf16 (p0, x0 - 64),
+ z0 = svld1rq (p0, x0 - 64))
+
+/*
+** ld1rq_bf16_m72:
+** sub (x[0-9]+), x0, #?144
+** ld1rqh z0\.h, p0/z, \[\1\]
+** ret
+*/
+TEST_LOAD (ld1rq_bf16_m72, svbfloat16_t, bfloat16_t,
+ z0 = svld1rq_bf16 (p0, x0 - 72),
+ z0 = svld1rq (p0, x0 - 72))
new file mode 100644
@@ -0,0 +1,200 @@
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */
+
+#include "test_sve_acle.h"
+
+/*
+** ld2_bf16_base:
+** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ld2_bf16_base, svbfloat16x2_t, bfloat16_t,
+ z0 = svld2_bf16 (p0, x0),
+ z0 = svld2 (p0, x0))
+
+/*
+** ld2_bf16_index:
+** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0, x1, lsl 1\]
+** ret
+*/
+TEST_LOAD (ld2_bf16_index, svbfloat16x2_t, bfloat16_t,
+ z0 = svld2_bf16 (p0, x0 + x1),
+ z0 = svld2 (p0, x0 + x1))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** ld2_bf16_1:
+** incb x0
+** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ld2_bf16_1, svbfloat16x2_t, bfloat16_t,
+ z0 = svld2_bf16 (p0, x0 + svcnth ()),
+ z0 = svld2 (p0, x0 + svcnth ()))
+
+/*
+** ld2_bf16_2:
+** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0, #2, mul vl\]
+** ret
+*/
+TEST_LOAD (ld2_bf16_2, svbfloat16x2_t, bfloat16_t,
+ z0 = svld2_bf16 (p0, x0 + svcnth () * 2),
+ z0 = svld2 (p0, x0 + svcnth () * 2))
+
+/*
+** ld2_bf16_14:
+** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0, #14, mul vl\]
+** ret
+*/
+TEST_LOAD (ld2_bf16_14, svbfloat16x2_t, bfloat16_t,
+ z0 = svld2_bf16 (p0, x0 + svcnth () * 14),
+ z0 = svld2 (p0, x0 + svcnth () * 14))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** ld2_bf16_16:
+** incb x0, all, mul #16
+** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ld2_bf16_16, svbfloat16x2_t, bfloat16_t,
+ z0 = svld2_bf16 (p0, x0 + svcnth () * 16),
+ z0 = svld2 (p0, x0 + svcnth () * 16))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** ld2_bf16_m1:
+** decb x0
+** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ld2_bf16_m1, svbfloat16x2_t, bfloat16_t,
+ z0 = svld2_bf16 (p0, x0 - svcnth ()),
+ z0 = svld2 (p0, x0 - svcnth ()))
+
+/*
+** ld2_bf16_m2:
+** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0, #-2, mul vl\]
+** ret
+*/
+TEST_LOAD (ld2_bf16_m2, svbfloat16x2_t, bfloat16_t,
+ z0 = svld2_bf16 (p0, x0 - svcnth () * 2),
+ z0 = svld2 (p0, x0 - svcnth () * 2))
+
+/*
+** ld2_bf16_m16:
+** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0, #-16, mul vl\]
+** ret
+*/
+TEST_LOAD (ld2_bf16_m16, svbfloat16x2_t, bfloat16_t,
+ z0 = svld2_bf16 (p0, x0 - svcnth () * 16),
+ z0 = svld2 (p0, x0 - svcnth () * 16))
+
+/*
+** ld2_bf16_m18:
+** addvl (x[0-9]+), x0, #-18
+** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[\1\]
+** ret
+*/
+TEST_LOAD (ld2_bf16_m18, svbfloat16x2_t, bfloat16_t,
+ z0 = svld2_bf16 (p0, x0 - svcnth () * 18),
+ z0 = svld2 (p0, x0 - svcnth () * 18))
+
+/*
+** ld2_vnum_bf16_0:
+** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ld2_vnum_bf16_0, svbfloat16x2_t, bfloat16_t,
+ z0 = svld2_vnum_bf16 (p0, x0, 0),
+ z0 = svld2_vnum (p0, x0, 0))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** ld2_vnum_bf16_1:
+** incb x0
+** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ld2_vnum_bf16_1, svbfloat16x2_t, bfloat16_t,
+ z0 = svld2_vnum_bf16 (p0, x0, 1),
+ z0 = svld2_vnum (p0, x0, 1))
+
+/*
+** ld2_vnum_bf16_2:
+** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0, #2, mul vl\]
+** ret
+*/
+TEST_LOAD (ld2_vnum_bf16_2, svbfloat16x2_t, bfloat16_t,
+ z0 = svld2_vnum_bf16 (p0, x0, 2),
+ z0 = svld2_vnum (p0, x0, 2))
+
+/*
+** ld2_vnum_bf16_14:
+** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0, #14, mul vl\]
+** ret
+*/
+TEST_LOAD (ld2_vnum_bf16_14, svbfloat16x2_t, bfloat16_t,
+ z0 = svld2_vnum_bf16 (p0, x0, 14),
+ z0 = svld2_vnum (p0, x0, 14))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** ld2_vnum_bf16_16:
+** incb x0, all, mul #16
+** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ld2_vnum_bf16_16, svbfloat16x2_t, bfloat16_t,
+ z0 = svld2_vnum_bf16 (p0, x0, 16),
+ z0 = svld2_vnum (p0, x0, 16))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** ld2_vnum_bf16_m1:
+** decb x0
+** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ld2_vnum_bf16_m1, svbfloat16x2_t, bfloat16_t,
+ z0 = svld2_vnum_bf16 (p0, x0, -1),
+ z0 = svld2_vnum (p0, x0, -1))
+
+/*
+** ld2_vnum_bf16_m2:
+** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0, #-2, mul vl\]
+** ret
+*/
+TEST_LOAD (ld2_vnum_bf16_m2, svbfloat16x2_t, bfloat16_t,
+ z0 = svld2_vnum_bf16 (p0, x0, -2),
+ z0 = svld2_vnum (p0, x0, -2))
+
+/*
+** ld2_vnum_bf16_m16:
+** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0, #-16, mul vl\]
+** ret
+*/
+TEST_LOAD (ld2_vnum_bf16_m16, svbfloat16x2_t, bfloat16_t,
+ z0 = svld2_vnum_bf16 (p0, x0, -16),
+ z0 = svld2_vnum (p0, x0, -16))
+
+/*
+** ld2_vnum_bf16_m18:
+** addvl (x[0-9]+), x0, #-18
+** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[\1\]
+** ret
+*/
+TEST_LOAD (ld2_vnum_bf16_m18, svbfloat16x2_t, bfloat16_t,
+ z0 = svld2_vnum_bf16 (p0, x0, -18),
+ z0 = svld2_vnum (p0, x0, -18))
+
+/* Using MUL to calculate an index would also be OK. */
+/*
+** ld2_vnum_bf16_x1:
+** cntb (x[0-9]+)
+** madd (x[0-9]+), (x1, \1|\1, x1), x0
+** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[\2\]
+** ret
+*/
+TEST_LOAD (ld2_vnum_bf16_x1, svbfloat16x2_t, bfloat16_t,
+ z0 = svld2_vnum_bf16 (p0, x0, x1),
+ z0 = svld2_vnum (p0, x0, x1))
new file mode 100644
@@ -0,0 +1,242 @@
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */
+
+#include "test_sve_acle.h"
+
+/*
+** ld3_bf16_base:
+** ld3h {z0\.h - z2\.h}, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ld3_bf16_base, svbfloat16x3_t, bfloat16_t,
+ z0 = svld3_bf16 (p0, x0),
+ z0 = svld3 (p0, x0))
+
+/*
+** ld3_bf16_index:
+** ld3h {z0\.h - z2\.h}, p0/z, \[x0, x1, lsl 1\]
+** ret
+*/
+TEST_LOAD (ld3_bf16_index, svbfloat16x3_t, bfloat16_t,
+ z0 = svld3_bf16 (p0, x0 + x1),
+ z0 = svld3 (p0, x0 + x1))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** ld3_bf16_1:
+** incb x0
+** ld3h {z0\.h - z2\.h}, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ld3_bf16_1, svbfloat16x3_t, bfloat16_t,
+ z0 = svld3_bf16 (p0, x0 + svcnth ()),
+ z0 = svld3 (p0, x0 + svcnth ()))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** ld3_bf16_2:
+** incb x0, all, mul #2
+** ld3h {z0\.h - z2\.h}, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ld3_bf16_2, svbfloat16x3_t, bfloat16_t,
+ z0 = svld3_bf16 (p0, x0 + svcnth () * 2),
+ z0 = svld3 (p0, x0 + svcnth () * 2))
+
+/*
+** ld3_bf16_3:
+** ld3h {z0\.h - z2\.h}, p0/z, \[x0, #3, mul vl\]
+** ret
+*/
+TEST_LOAD (ld3_bf16_3, svbfloat16x3_t, bfloat16_t,
+ z0 = svld3_bf16 (p0, x0 + svcnth () * 3),
+ z0 = svld3 (p0, x0 + svcnth () * 3))
+
+/*
+** ld3_bf16_21:
+** ld3h {z0\.h - z2\.h}, p0/z, \[x0, #21, mul vl\]
+** ret
+*/
+TEST_LOAD (ld3_bf16_21, svbfloat16x3_t, bfloat16_t,
+ z0 = svld3_bf16 (p0, x0 + svcnth () * 21),
+ z0 = svld3 (p0, x0 + svcnth () * 21))
+
+/*
+** ld3_bf16_24:
+** addvl (x[0-9]+), x0, #24
+** ld3h {z0\.h - z2\.h}, p0/z, \[\1\]
+** ret
+*/
+TEST_LOAD (ld3_bf16_24, svbfloat16x3_t, bfloat16_t,
+ z0 = svld3_bf16 (p0, x0 + svcnth () * 24),
+ z0 = svld3 (p0, x0 + svcnth () * 24))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** ld3_bf16_m1:
+** decb x0
+** ld3h {z0\.h - z2\.h}, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ld3_bf16_m1, svbfloat16x3_t, bfloat16_t,
+ z0 = svld3_bf16 (p0, x0 - svcnth ()),
+ z0 = svld3 (p0, x0 - svcnth ()))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** ld3_bf16_m2:
+** decb x0, all, mul #2
+** ld3h {z0\.h - z2\.h}, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ld3_bf16_m2, svbfloat16x3_t, bfloat16_t,
+ z0 = svld3_bf16 (p0, x0 - svcnth () * 2),
+ z0 = svld3 (p0, x0 - svcnth () * 2))
+
+/*
+** ld3_bf16_m3:
+** ld3h {z0\.h - z2\.h}, p0/z, \[x0, #-3, mul vl\]
+** ret
+*/
+TEST_LOAD (ld3_bf16_m3, svbfloat16x3_t, bfloat16_t,
+ z0 = svld3_bf16 (p0, x0 - svcnth () * 3),
+ z0 = svld3 (p0, x0 - svcnth () * 3))
+
+/*
+** ld3_bf16_m24:
+** ld3h {z0\.h - z2\.h}, p0/z, \[x0, #-24, mul vl\]
+** ret
+*/
+TEST_LOAD (ld3_bf16_m24, svbfloat16x3_t, bfloat16_t,
+ z0 = svld3_bf16 (p0, x0 - svcnth () * 24),
+ z0 = svld3 (p0, x0 - svcnth () * 24))
+
+/*
+** ld3_bf16_m27:
+** addvl (x[0-9]+), x0, #-27
+** ld3h {z0\.h - z2\.h}, p0/z, \[\1\]
+** ret
+*/
+TEST_LOAD (ld3_bf16_m27, svbfloat16x3_t, bfloat16_t,
+ z0 = svld3_bf16 (p0, x0 - svcnth () * 27),
+ z0 = svld3 (p0, x0 - svcnth () * 27))
+
+/*
+** ld3_vnum_bf16_0:
+** ld3h {z0\.h - z2\.h}, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ld3_vnum_bf16_0, svbfloat16x3_t, bfloat16_t,
+ z0 = svld3_vnum_bf16 (p0, x0, 0),
+ z0 = svld3_vnum (p0, x0, 0))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** ld3_vnum_bf16_1:
+** incb x0
+** ld3h {z0\.h - z2\.h}, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ld3_vnum_bf16_1, svbfloat16x3_t, bfloat16_t,
+ z0 = svld3_vnum_bf16 (p0, x0, 1),
+ z0 = svld3_vnum (p0, x0, 1))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** ld3_vnum_bf16_2:
+** incb x0, all, mul #2
+** ld3h {z0\.h - z2\.h}, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ld3_vnum_bf16_2, svbfloat16x3_t, bfloat16_t,
+ z0 = svld3_vnum_bf16 (p0, x0, 2),
+ z0 = svld3_vnum (p0, x0, 2))
+
+/*
+** ld3_vnum_bf16_3:
+** ld3h {z0\.h - z2\.h}, p0/z, \[x0, #3, mul vl\]
+** ret
+*/
+TEST_LOAD (ld3_vnum_bf16_3, svbfloat16x3_t, bfloat16_t,
+ z0 = svld3_vnum_bf16 (p0, x0, 3),
+ z0 = svld3_vnum (p0, x0, 3))
+
+/*
+** ld3_vnum_bf16_21:
+** ld3h {z0\.h - z2\.h}, p0/z, \[x0, #21, mul vl\]
+** ret
+*/
+TEST_LOAD (ld3_vnum_bf16_21, svbfloat16x3_t, bfloat16_t,
+ z0 = svld3_vnum_bf16 (p0, x0, 21),
+ z0 = svld3_vnum (p0, x0, 21))
+
+/*
+** ld3_vnum_bf16_24:
+** addvl (x[0-9]+), x0, #24
+** ld3h {z0\.h - z2\.h}, p0/z, \[\1\]
+** ret
+*/
+TEST_LOAD (ld3_vnum_bf16_24, svbfloat16x3_t, bfloat16_t,
+ z0 = svld3_vnum_bf16 (p0, x0, 24),
+ z0 = svld3_vnum (p0, x0, 24))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** ld3_vnum_bf16_m1:
+** decb x0
+** ld3h {z0\.h - z2\.h}, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ld3_vnum_bf16_m1, svbfloat16x3_t, bfloat16_t,
+ z0 = svld3_vnum_bf16 (p0, x0, -1),
+ z0 = svld3_vnum (p0, x0, -1))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** ld3_vnum_bf16_m2:
+** decb x0, all, mul #2
+** ld3h {z0\.h - z2\.h}, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ld3_vnum_bf16_m2, svbfloat16x3_t, bfloat16_t,
+ z0 = svld3_vnum_bf16 (p0, x0, -2),
+ z0 = svld3_vnum (p0, x0, -2))
+
+/*
+** ld3_vnum_bf16_m3:
+** ld3h {z0\.h - z2\.h}, p0/z, \[x0, #-3, mul vl\]
+** ret
+*/
+TEST_LOAD (ld3_vnum_bf16_m3, svbfloat16x3_t, bfloat16_t,
+ z0 = svld3_vnum_bf16 (p0, x0, -3),
+ z0 = svld3_vnum (p0, x0, -3))
+
+/*
+** ld3_vnum_bf16_m24:
+** ld3h {z0\.h - z2\.h}, p0/z, \[x0, #-24, mul vl\]
+** ret
+*/
+TEST_LOAD (ld3_vnum_bf16_m24, svbfloat16x3_t, bfloat16_t,
+ z0 = svld3_vnum_bf16 (p0, x0, -24),
+ z0 = svld3_vnum (p0, x0, -24))
+
+/*
+** ld3_vnum_bf16_m27:
+** addvl (x[0-9]+), x0, #-27
+** ld3h {z0\.h - z2\.h}, p0/z, \[\1\]
+** ret
+*/
+TEST_LOAD (ld3_vnum_bf16_m27, svbfloat16x3_t, bfloat16_t,
+ z0 = svld3_vnum_bf16 (p0, x0, -27),
+ z0 = svld3_vnum (p0, x0, -27))
+
+/* Using MUL to calculate an index would also be OK. */
+/*
+** ld3_vnum_bf16_x1:
+** cntb (x[0-9]+)
+** madd (x[0-9]+), (?:x1, \1|\1, x1), x0
+** ld3h {z0\.h - z2\.h}, p0/z, \[\2\]
+** ret
+*/
+TEST_LOAD (ld3_vnum_bf16_x1, svbfloat16x3_t, bfloat16_t,
+ z0 = svld3_vnum_bf16 (p0, x0, x1),
+ z0 = svld3_vnum (p0, x0, x1))
new file mode 100644
@@ -0,0 +1,286 @@
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */
+
+#include "test_sve_acle.h"
+
+/*
+** ld4_bf16_base:
+** ld4h {z0\.h - z3\.h}, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ld4_bf16_base, svbfloat16x4_t, bfloat16_t,
+ z0 = svld4_bf16 (p0, x0),
+ z0 = svld4 (p0, x0))
+
+/*
+** ld4_bf16_index:
+** ld4h {z0\.h - z3\.h}, p0/z, \[x0, x1, lsl 1\]
+** ret
+*/
+TEST_LOAD (ld4_bf16_index, svbfloat16x4_t, bfloat16_t,
+ z0 = svld4_bf16 (p0, x0 + x1),
+ z0 = svld4 (p0, x0 + x1))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** ld4_bf16_1:
+** incb x0
+** ld4h {z0\.h - z3\.h}, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ld4_bf16_1, svbfloat16x4_t, bfloat16_t,
+ z0 = svld4_bf16 (p0, x0 + svcnth ()),
+ z0 = svld4 (p0, x0 + svcnth ()))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** ld4_bf16_2:
+** incb x0, all, mul #2
+** ld4h {z0\.h - z3\.h}, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ld4_bf16_2, svbfloat16x4_t, bfloat16_t,
+ z0 = svld4_bf16 (p0, x0 + svcnth () * 2),
+ z0 = svld4 (p0, x0 + svcnth () * 2))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** ld4_bf16_3:
+** incb x0, all, mul #3
+** ld4h {z0\.h - z3\.h}, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ld4_bf16_3, svbfloat16x4_t, bfloat16_t,
+ z0 = svld4_bf16 (p0, x0 + svcnth () * 3),
+ z0 = svld4 (p0, x0 + svcnth () * 3))
+
+/*
+** ld4_bf16_4:
+** ld4h {z0\.h - z3\.h}, p0/z, \[x0, #4, mul vl\]
+** ret
+*/
+TEST_LOAD (ld4_bf16_4, svbfloat16x4_t, bfloat16_t,
+ z0 = svld4_bf16 (p0, x0 + svcnth () * 4),
+ z0 = svld4 (p0, x0 + svcnth () * 4))
+
+/*
+** ld4_bf16_28:
+** ld4h {z0\.h - z3\.h}, p0/z, \[x0, #28, mul vl\]
+** ret
+*/
+TEST_LOAD (ld4_bf16_28, svbfloat16x4_t, bfloat16_t,
+ z0 = svld4_bf16 (p0, x0 + svcnth () * 28),
+ z0 = svld4 (p0, x0 + svcnth () * 28))
+
+/*
+** ld4_bf16_32:
+** [^{]*
+** ld4h {z0\.h - z3\.h}, p0/z, \[x[0-9]+\]
+** ret
+*/
+TEST_LOAD (ld4_bf16_32, svbfloat16x4_t, bfloat16_t,
+ z0 = svld4_bf16 (p0, x0 + svcnth () * 32),
+ z0 = svld4 (p0, x0 + svcnth () * 32))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** ld4_bf16_m1:
+** decb x0
+** ld4h {z0\.h - z3\.h}, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ld4_bf16_m1, svbfloat16x4_t, bfloat16_t,
+ z0 = svld4_bf16 (p0, x0 - svcnth ()),
+ z0 = svld4 (p0, x0 - svcnth ()))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** ld4_bf16_m2:
+** decb x0, all, mul #2
+** ld4h {z0\.h - z3\.h}, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ld4_bf16_m2, svbfloat16x4_t, bfloat16_t,
+ z0 = svld4_bf16 (p0, x0 - svcnth () * 2),
+ z0 = svld4 (p0, x0 - svcnth () * 2))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** ld4_bf16_m3:
+** decb x0, all, mul #3
+** ld4h {z0\.h - z3\.h}, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ld4_bf16_m3, svbfloat16x4_t, bfloat16_t,
+ z0 = svld4_bf16 (p0, x0 - svcnth () * 3),
+ z0 = svld4 (p0, x0 - svcnth () * 3))
+
+/*
+** ld4_bf16_m4:
+** ld4h {z0\.h - z3\.h}, p0/z, \[x0, #-4, mul vl\]
+** ret
+*/
+TEST_LOAD (ld4_bf16_m4, svbfloat16x4_t, bfloat16_t,
+ z0 = svld4_bf16 (p0, x0 - svcnth () * 4),
+ z0 = svld4 (p0, x0 - svcnth () * 4))
+
+/*
+** ld4_bf16_m32:
+** ld4h {z0\.h - z3\.h}, p0/z, \[x0, #-32, mul vl\]
+** ret
+*/
+TEST_LOAD (ld4_bf16_m32, svbfloat16x4_t, bfloat16_t,
+ z0 = svld4_bf16 (p0, x0 - svcnth () * 32),
+ z0 = svld4 (p0, x0 - svcnth () * 32))
+
+/*
+** ld4_bf16_m36:
+** [^{]*
+** ld4h {z0\.h - z3\.h}, p0/z, \[x[0-9]+\]
+** ret
+*/
+TEST_LOAD (ld4_bf16_m36, svbfloat16x4_t, bfloat16_t,
+ z0 = svld4_bf16 (p0, x0 - svcnth () * 36),
+ z0 = svld4 (p0, x0 - svcnth () * 36))
+
+/*
+** ld4_vnum_bf16_0:
+** ld4h {z0\.h - z3\.h}, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ld4_vnum_bf16_0, svbfloat16x4_t, bfloat16_t,
+ z0 = svld4_vnum_bf16 (p0, x0, 0),
+ z0 = svld4_vnum (p0, x0, 0))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** ld4_vnum_bf16_1:
+** incb x0
+** ld4h {z0\.h - z3\.h}, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ld4_vnum_bf16_1, svbfloat16x4_t, bfloat16_t,
+ z0 = svld4_vnum_bf16 (p0, x0, 1),
+ z0 = svld4_vnum (p0, x0, 1))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** ld4_vnum_bf16_2:
+** incb x0, all, mul #2
+** ld4h {z0\.h - z3\.h}, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ld4_vnum_bf16_2, svbfloat16x4_t, bfloat16_t,
+ z0 = svld4_vnum_bf16 (p0, x0, 2),
+ z0 = svld4_vnum (p0, x0, 2))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** ld4_vnum_bf16_3:
+** incb x0, all, mul #3
+** ld4h {z0\.h - z3\.h}, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ld4_vnum_bf16_3, svbfloat16x4_t, bfloat16_t,
+ z0 = svld4_vnum_bf16 (p0, x0, 3),
+ z0 = svld4_vnum (p0, x0, 3))
+
+/*
+** ld4_vnum_bf16_4:
+** ld4h {z0\.h - z3\.h}, p0/z, \[x0, #4, mul vl\]
+** ret
+*/
+TEST_LOAD (ld4_vnum_bf16_4, svbfloat16x4_t, bfloat16_t,
+ z0 = svld4_vnum_bf16 (p0, x0, 4),
+ z0 = svld4_vnum (p0, x0, 4))
+
+/*
+** ld4_vnum_bf16_28:
+** ld4h {z0\.h - z3\.h}, p0/z, \[x0, #28, mul vl\]
+** ret
+*/
+TEST_LOAD (ld4_vnum_bf16_28, svbfloat16x4_t, bfloat16_t,
+ z0 = svld4_vnum_bf16 (p0, x0, 28),
+ z0 = svld4_vnum (p0, x0, 28))
+
+/*
+** ld4_vnum_bf16_32:
+** [^{]*
+** ld4h {z0\.h - z3\.h}, p0/z, \[x[0-9]+\]
+** ret
+*/
+TEST_LOAD (ld4_vnum_bf16_32, svbfloat16x4_t, bfloat16_t,
+ z0 = svld4_vnum_bf16 (p0, x0, 32),
+ z0 = svld4_vnum (p0, x0, 32))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** ld4_vnum_bf16_m1:
+** decb x0
+** ld4h {z0\.h - z3\.h}, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ld4_vnum_bf16_m1, svbfloat16x4_t, bfloat16_t,
+ z0 = svld4_vnum_bf16 (p0, x0, -1),
+ z0 = svld4_vnum (p0, x0, -1))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** ld4_vnum_bf16_m2:
+** decb x0, all, mul #2
+** ld4h {z0\.h - z3\.h}, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ld4_vnum_bf16_m2, svbfloat16x4_t, bfloat16_t,
+ z0 = svld4_vnum_bf16 (p0, x0, -2),
+ z0 = svld4_vnum (p0, x0, -2))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** ld4_vnum_bf16_m3:
+** decb x0, all, mul #3
+** ld4h {z0\.h - z3\.h}, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ld4_vnum_bf16_m3, svbfloat16x4_t, bfloat16_t,
+ z0 = svld4_vnum_bf16 (p0, x0, -3),
+ z0 = svld4_vnum (p0, x0, -3))
+
+/*
+** ld4_vnum_bf16_m4:
+** ld4h {z0\.h - z3\.h}, p0/z, \[x0, #-4, mul vl\]
+** ret
+*/
+TEST_LOAD (ld4_vnum_bf16_m4, svbfloat16x4_t, bfloat16_t,
+ z0 = svld4_vnum_bf16 (p0, x0, -4),
+ z0 = svld4_vnum (p0, x0, -4))
+
+/*
+** ld4_vnum_bf16_m32:
+** ld4h {z0\.h - z3\.h}, p0/z, \[x0, #-32, mul vl\]
+** ret
+*/
+TEST_LOAD (ld4_vnum_bf16_m32, svbfloat16x4_t, bfloat16_t,
+ z0 = svld4_vnum_bf16 (p0, x0, -32),
+ z0 = svld4_vnum (p0, x0, -32))
+
+/*
+** ld4_vnum_bf16_m36:
+** [^{]*
+** ld4h {z0\.h - z3\.h}, p0/z, \[x[0-9]+\]
+** ret
+*/
+TEST_LOAD (ld4_vnum_bf16_m36, svbfloat16x4_t, bfloat16_t,
+ z0 = svld4_vnum_bf16 (p0, x0, -36),
+ z0 = svld4_vnum (p0, x0, -36))
+
+/* Using MUL to calculate an index would also be OK. */
+/*
+** ld4_vnum_bf16_x1:
+** cntb (x[0-9]+)
+** madd (x[0-9]+), (?:x1, \1|\1, x1), x0
+** ld4h {z0\.h - z3\.h}, p0/z, \[\2\]
+** ret
+*/
+TEST_LOAD (ld4_vnum_bf16_x1, svbfloat16x4_t, bfloat16_t,
+ z0 = svld4_vnum_bf16 (p0, x0, x1),
+ z0 = svld4_vnum (p0, x0, x1))
new file mode 100644
@@ -0,0 +1,86 @@
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */
+
+#include "test_sve_acle.h"
+
+/*
+** ldff1_bf16_base:
+** ldff1h z0\.h, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ldff1_bf16_base, svbfloat16_t, bfloat16_t,
+ z0 = svldff1_bf16 (p0, x0),
+ z0 = svldff1 (p0, x0))
+
+/*
+** ldff1_bf16_index:
+** ldff1h z0\.h, p0/z, \[x0, x1, lsl 1\]
+** ret
+*/
+TEST_LOAD (ldff1_bf16_index, svbfloat16_t, bfloat16_t,
+ z0 = svldff1_bf16 (p0, x0 + x1),
+ z0 = svldff1 (p0, x0 + x1))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** ldff1_bf16_1:
+** incb x0
+** ldff1h z0\.h, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ldff1_bf16_1, svbfloat16_t, bfloat16_t,
+ z0 = svldff1_bf16 (p0, x0 + svcnth ()),
+ z0 = svldff1 (p0, x0 + svcnth ()))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** ldff1_bf16_m1:
+** decb x0
+** ldff1h z0\.h, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ldff1_bf16_m1, svbfloat16_t, bfloat16_t,
+ z0 = svldff1_bf16 (p0, x0 - svcnth ()),
+ z0 = svldff1 (p0, x0 - svcnth ()))
+
+/*
+** ldff1_vnum_bf16_0:
+** ldff1h z0\.h, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ldff1_vnum_bf16_0, svbfloat16_t, bfloat16_t,
+ z0 = svldff1_vnum_bf16 (p0, x0, 0),
+ z0 = svldff1_vnum (p0, x0, 0))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** ldff1_vnum_bf16_1:
+** incb x0
+** ldff1h z0\.h, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ldff1_vnum_bf16_1, svbfloat16_t, bfloat16_t,
+ z0 = svldff1_vnum_bf16 (p0, x0, 1),
+ z0 = svldff1_vnum (p0, x0, 1))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** ldff1_vnum_bf16_m1:
+** decb x0
+** ldff1h z0\.h, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ldff1_vnum_bf16_m1, svbfloat16_t, bfloat16_t,
+ z0 = svldff1_vnum_bf16 (p0, x0, -1),
+ z0 = svldff1_vnum (p0, x0, -1))
+
+/* Using MUL to calculate an index would also be OK. */
+/*
+** ldff1_vnum_bf16_x1:
+** cntb (x[0-9]+)
+** madd (x[0-9]+), (?:x1, \1|\1, x1), x0
+** ldff1h z0\.h, p0/z, \[\2\]
+** ret
+*/
+TEST_LOAD (ldff1_vnum_bf16_x1, svbfloat16_t, bfloat16_t,
+ z0 = svldff1_vnum_bf16 (p0, x0, x1),
+ z0 = svldff1_vnum (p0, x0, x1))
new file mode 100644
@@ -0,0 +1,154 @@
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */
+
+#include "test_sve_acle.h"
+
+/*
+** ldnf1_bf16_base:
+** ldnf1h z0\.h, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ldnf1_bf16_base, svbfloat16_t, bfloat16_t,
+ z0 = svldnf1_bf16 (p0, x0),
+ z0 = svldnf1 (p0, x0))
+
+/*
+** ldnf1_bf16_index:
+** add (x[0-9]+), x0, x1, lsl 1
+** ldnf1h z0\.h, p0/z, \[\1\]
+** ret
+*/
+TEST_LOAD (ldnf1_bf16_index, svbfloat16_t, bfloat16_t,
+ z0 = svldnf1_bf16 (p0, x0 + x1),
+ z0 = svldnf1 (p0, x0 + x1))
+
+/*
+** ldnf1_bf16_1:
+** ldnf1h z0\.h, p0/z, \[x0, #1, mul vl\]
+** ret
+*/
+TEST_LOAD (ldnf1_bf16_1, svbfloat16_t, bfloat16_t,
+ z0 = svldnf1_bf16 (p0, x0 + svcnth ()),
+ z0 = svldnf1 (p0, x0 + svcnth ()))
+
+/*
+** ldnf1_bf16_7:
+** ldnf1h z0\.h, p0/z, \[x0, #7, mul vl\]
+** ret
+*/
+TEST_LOAD (ldnf1_bf16_7, svbfloat16_t, bfloat16_t,
+ z0 = svldnf1_bf16 (p0, x0 + svcnth () * 7),
+ z0 = svldnf1 (p0, x0 + svcnth () * 7))
+
+/*
+** ldnf1_bf16_8:
+** incb x0, all, mul #8
+** ldnf1h z0\.h, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ldnf1_bf16_8, svbfloat16_t, bfloat16_t,
+ z0 = svldnf1_bf16 (p0, x0 + svcnth () * 8),
+ z0 = svldnf1 (p0, x0 + svcnth () * 8))
+
+/*
+** ldnf1_bf16_m1:
+** ldnf1h z0\.h, p0/z, \[x0, #-1, mul vl\]
+** ret
+*/
+TEST_LOAD (ldnf1_bf16_m1, svbfloat16_t, bfloat16_t,
+ z0 = svldnf1_bf16 (p0, x0 - svcnth ()),
+ z0 = svldnf1 (p0, x0 - svcnth ()))
+
+/*
+** ldnf1_bf16_m8:
+** ldnf1h z0\.h, p0/z, \[x0, #-8, mul vl\]
+** ret
+*/
+TEST_LOAD (ldnf1_bf16_m8, svbfloat16_t, bfloat16_t,
+ z0 = svldnf1_bf16 (p0, x0 - svcnth () * 8),
+ z0 = svldnf1 (p0, x0 - svcnth () * 8))
+
+/*
+** ldnf1_bf16_m9:
+** decb x0, all, mul #9
+** ldnf1h z0\.h, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ldnf1_bf16_m9, svbfloat16_t, bfloat16_t,
+ z0 = svldnf1_bf16 (p0, x0 - svcnth () * 9),
+ z0 = svldnf1 (p0, x0 - svcnth () * 9))
+
+/*
+** ldnf1_vnum_bf16_0:
+** ldnf1h z0\.h, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ldnf1_vnum_bf16_0, svbfloat16_t, bfloat16_t,
+ z0 = svldnf1_vnum_bf16 (p0, x0, 0),
+ z0 = svldnf1_vnum (p0, x0, 0))
+
+/*
+** ldnf1_vnum_bf16_1:
+** ldnf1h z0\.h, p0/z, \[x0, #1, mul vl\]
+** ret
+*/
+TEST_LOAD (ldnf1_vnum_bf16_1, svbfloat16_t, bfloat16_t,
+ z0 = svldnf1_vnum_bf16 (p0, x0, 1),
+ z0 = svldnf1_vnum (p0, x0, 1))
+
+/*
+** ldnf1_vnum_bf16_7:
+** ldnf1h z0\.h, p0/z, \[x0, #7, mul vl\]
+** ret
+*/
+TEST_LOAD (ldnf1_vnum_bf16_7, svbfloat16_t, bfloat16_t,
+ z0 = svldnf1_vnum_bf16 (p0, x0, 7),
+ z0 = svldnf1_vnum (p0, x0, 7))
+
+/*
+** ldnf1_vnum_bf16_8:
+** incb x0, all, mul #8
+** ldnf1h z0\.h, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ldnf1_vnum_bf16_8, svbfloat16_t, bfloat16_t,
+ z0 = svldnf1_vnum_bf16 (p0, x0, 8),
+ z0 = svldnf1_vnum (p0, x0, 8))
+
+/*
+** ldnf1_vnum_bf16_m1:
+** ldnf1h z0\.h, p0/z, \[x0, #-1, mul vl\]
+** ret
+*/
+TEST_LOAD (ldnf1_vnum_bf16_m1, svbfloat16_t, bfloat16_t,
+ z0 = svldnf1_vnum_bf16 (p0, x0, -1),
+ z0 = svldnf1_vnum (p0, x0, -1))
+
+/*
+** ldnf1_vnum_bf16_m8:
+** ldnf1h z0\.h, p0/z, \[x0, #-8, mul vl\]
+** ret
+*/
+TEST_LOAD (ldnf1_vnum_bf16_m8, svbfloat16_t, bfloat16_t,
+ z0 = svldnf1_vnum_bf16 (p0, x0, -8),
+ z0 = svldnf1_vnum (p0, x0, -8))
+
+/*
+** ldnf1_vnum_bf16_m9:
+** decb x0, all, mul #9
+** ldnf1h z0\.h, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ldnf1_vnum_bf16_m9, svbfloat16_t, bfloat16_t,
+ z0 = svldnf1_vnum_bf16 (p0, x0, -9),
+ z0 = svldnf1_vnum (p0, x0, -9))
+
+/*
+** ldnf1_vnum_bf16_x1:
+** cntb (x[0-9]+)
+** madd (x[0-9]+), (?:x1, \1|\1, x1), x0
+** ldnf1h z0\.h, p0/z, \[\2\]
+** ret
+*/
+TEST_LOAD (ldnf1_vnum_bf16_x1, svbfloat16_t, bfloat16_t,
+ z0 = svldnf1_vnum_bf16 (p0, x0, x1),
+ z0 = svldnf1_vnum (p0, x0, x1))
new file mode 100644
@@ -0,0 +1,158 @@
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */
+
+#include "test_sve_acle.h"
+
+/*
+** ldnt1_bf16_base:
+** ldnt1h z0\.h, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ldnt1_bf16_base, svbfloat16_t, bfloat16_t,
+ z0 = svldnt1_bf16 (p0, x0),
+ z0 = svldnt1 (p0, x0))
+
+/*
+** ldnt1_bf16_index:
+** ldnt1h z0\.h, p0/z, \[x0, x1, lsl 1\]
+** ret
+*/
+TEST_LOAD (ldnt1_bf16_index, svbfloat16_t, bfloat16_t,
+ z0 = svldnt1_bf16 (p0, x0 + x1),
+ z0 = svldnt1 (p0, x0 + x1))
+
+/*
+** ldnt1_bf16_1:
+** ldnt1h z0\.h, p0/z, \[x0, #1, mul vl\]
+** ret
+*/
+TEST_LOAD (ldnt1_bf16_1, svbfloat16_t, bfloat16_t,
+ z0 = svldnt1_bf16 (p0, x0 + svcnth ()),
+ z0 = svldnt1 (p0, x0 + svcnth ()))
+
+/*
+** ldnt1_bf16_7:
+** ldnt1h z0\.h, p0/z, \[x0, #7, mul vl\]
+** ret
+*/
+TEST_LOAD (ldnt1_bf16_7, svbfloat16_t, bfloat16_t,
+ z0 = svldnt1_bf16 (p0, x0 + svcnth () * 7),
+ z0 = svldnt1 (p0, x0 + svcnth () * 7))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** ldnt1_bf16_8:
+** incb x0, all, mul #8
+** ldnt1h z0\.h, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ldnt1_bf16_8, svbfloat16_t, bfloat16_t,
+ z0 = svldnt1_bf16 (p0, x0 + svcnth () * 8),
+ z0 = svldnt1 (p0, x0 + svcnth () * 8))
+
+/*
+** ldnt1_bf16_m1:
+** ldnt1h z0\.h, p0/z, \[x0, #-1, mul vl\]
+** ret
+*/
+TEST_LOAD (ldnt1_bf16_m1, svbfloat16_t, bfloat16_t,
+ z0 = svldnt1_bf16 (p0, x0 - svcnth ()),
+ z0 = svldnt1 (p0, x0 - svcnth ()))
+
+/*
+** ldnt1_bf16_m8:
+** ldnt1h z0\.h, p0/z, \[x0, #-8, mul vl\]
+** ret
+*/
+TEST_LOAD (ldnt1_bf16_m8, svbfloat16_t, bfloat16_t,
+ z0 = svldnt1_bf16 (p0, x0 - svcnth () * 8),
+ z0 = svldnt1 (p0, x0 - svcnth () * 8))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** ldnt1_bf16_m9:
+** decb x0, all, mul #9
+** ldnt1h z0\.h, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ldnt1_bf16_m9, svbfloat16_t, bfloat16_t,
+ z0 = svldnt1_bf16 (p0, x0 - svcnth () * 9),
+ z0 = svldnt1 (p0, x0 - svcnth () * 9))
+
+/*
+** ldnt1_vnum_bf16_0:
+** ldnt1h z0\.h, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ldnt1_vnum_bf16_0, svbfloat16_t, bfloat16_t,
+ z0 = svldnt1_vnum_bf16 (p0, x0, 0),
+ z0 = svldnt1_vnum (p0, x0, 0))
+
+/*
+** ldnt1_vnum_bf16_1:
+** ldnt1h z0\.h, p0/z, \[x0, #1, mul vl\]
+** ret
+*/
+TEST_LOAD (ldnt1_vnum_bf16_1, svbfloat16_t, bfloat16_t,
+ z0 = svldnt1_vnum_bf16 (p0, x0, 1),
+ z0 = svldnt1_vnum (p0, x0, 1))
+
+/*
+** ldnt1_vnum_bf16_7:
+** ldnt1h z0\.h, p0/z, \[x0, #7, mul vl\]
+** ret
+*/
+TEST_LOAD (ldnt1_vnum_bf16_7, svbfloat16_t, bfloat16_t,
+ z0 = svldnt1_vnum_bf16 (p0, x0, 7),
+ z0 = svldnt1_vnum (p0, x0, 7))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** ldnt1_vnum_bf16_8:
+** incb x0, all, mul #8
+** ldnt1h z0\.h, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ldnt1_vnum_bf16_8, svbfloat16_t, bfloat16_t,
+ z0 = svldnt1_vnum_bf16 (p0, x0, 8),
+ z0 = svldnt1_vnum (p0, x0, 8))
+
+/*
+** ldnt1_vnum_bf16_m1:
+** ldnt1h z0\.h, p0/z, \[x0, #-1, mul vl\]
+** ret
+*/
+TEST_LOAD (ldnt1_vnum_bf16_m1, svbfloat16_t, bfloat16_t,
+ z0 = svldnt1_vnum_bf16 (p0, x0, -1),
+ z0 = svldnt1_vnum (p0, x0, -1))
+
+/*
+** ldnt1_vnum_bf16_m8:
+** ldnt1h z0\.h, p0/z, \[x0, #-8, mul vl\]
+** ret
+*/
+TEST_LOAD (ldnt1_vnum_bf16_m8, svbfloat16_t, bfloat16_t,
+ z0 = svldnt1_vnum_bf16 (p0, x0, -8),
+ z0 = svldnt1_vnum (p0, x0, -8))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** ldnt1_vnum_bf16_m9:
+** decb x0, all, mul #9
+** ldnt1h z0\.h, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ldnt1_vnum_bf16_m9, svbfloat16_t, bfloat16_t,
+ z0 = svldnt1_vnum_bf16 (p0, x0, -9),
+ z0 = svldnt1_vnum (p0, x0, -9))
+
+/* Using MUL to calculate an index would also be OK. */
+/*
+** ldnt1_vnum_bf16_x1:
+** cntb (x[0-9]+)
+** madd (x[0-9]+), (?:x1, \1|\1, x1), x0
+** ldnt1h z0\.h, p0/z, \[\2\]
+** ret
+*/
+TEST_LOAD (ldnt1_vnum_bf16_x1, svbfloat16_t, bfloat16_t,
+ z0 = svldnt1_vnum_bf16 (p0, x0, x1),
+ z0 = svldnt1_vnum (p0, x0, x1))
new file mode 100644
@@ -0,0 +1,12 @@
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+/*
+** len_x0_bf16:
+** cnth x0
+** ret
+*/
+TEST_REDUCTION_X (len_x0_bf16, uint64_t, svbfloat16_t,
+ x0 = svlen_bf16 (z0),
+ x0 = svlen (z0))
new file mode 100644
@@ -0,0 +1,207 @@
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+/*
+** reinterpret_bf16_bf16_tied1:
+** ret
+*/
+TEST_DUAL_Z_REV (reinterpret_bf16_bf16_tied1, svbfloat16_t, svbfloat16_t,
+ z0_res = svreinterpret_bf16_bf16 (z0),
+ z0_res = svreinterpret_bf16 (z0))
+
+/*
+** reinterpret_bf16_bf16_untied:
+** mov z0\.d, z4\.d
+** ret
+*/
+TEST_DUAL_Z (reinterpret_bf16_bf16_untied, svbfloat16_t, svbfloat16_t,
+ z0 = svreinterpret_bf16_bf16 (z4),
+ z0 = svreinterpret_bf16 (z4))
+
+/*
+** reinterpret_bf16_f16_tied1:
+** ret
+*/
+TEST_DUAL_Z_REV (reinterpret_bf16_f16_tied1, svbfloat16_t, svfloat16_t,
+ z0_res = svreinterpret_bf16_f16 (z0),
+ z0_res = svreinterpret_bf16 (z0))
+
+/*
+** reinterpret_bf16_f16_untied:
+** mov z0\.d, z4\.d
+** ret
+*/
+TEST_DUAL_Z (reinterpret_bf16_f16_untied, svbfloat16_t, svfloat16_t,
+ z0 = svreinterpret_bf16_f16 (z4),
+ z0 = svreinterpret_bf16 (z4))
+
+/*
+** reinterpret_bf16_f32_tied1:
+** ret
+*/
+TEST_DUAL_Z_REV (reinterpret_bf16_f32_tied1, svbfloat16_t, svfloat32_t,
+ z0_res = svreinterpret_bf16_f32 (z0),
+ z0_res = svreinterpret_bf16 (z0))
+
+/*
+** reinterpret_bf16_f32_untied:
+** mov z0\.d, z4\.d
+** ret
+*/
+TEST_DUAL_Z (reinterpret_bf16_f32_untied, svbfloat16_t, svfloat32_t,
+ z0 = svreinterpret_bf16_f32 (z4),
+ z0 = svreinterpret_bf16 (z4))
+
+/*
+** reinterpret_bf16_f64_tied1:
+** ret
+*/
+TEST_DUAL_Z_REV (reinterpret_bf16_f64_tied1, svbfloat16_t, svfloat64_t,
+ z0_res = svreinterpret_bf16_f64 (z0),
+ z0_res = svreinterpret_bf16 (z0))
+
+/*
+** reinterpret_bf16_f64_untied:
+** mov z0\.d, z4\.d
+** ret
+*/
+TEST_DUAL_Z (reinterpret_bf16_f64_untied, svbfloat16_t, svfloat64_t,
+ z0 = svreinterpret_bf16_f64 (z4),
+ z0 = svreinterpret_bf16 (z4))
+
+/*
+** reinterpret_bf16_s8_tied1:
+** ret
+*/
+TEST_DUAL_Z_REV (reinterpret_bf16_s8_tied1, svbfloat16_t, svint8_t,
+ z0_res = svreinterpret_bf16_s8 (z0),
+ z0_res = svreinterpret_bf16 (z0))
+
+/*
+** reinterpret_bf16_s8_untied:
+** mov z0\.d, z4\.d
+** ret
+*/
+TEST_DUAL_Z (reinterpret_bf16_s8_untied, svbfloat16_t, svint8_t,
+ z0 = svreinterpret_bf16_s8 (z4),
+ z0 = svreinterpret_bf16 (z4))
+
+/*
+** reinterpret_bf16_s16_tied1:
+** ret
+*/
+TEST_DUAL_Z_REV (reinterpret_bf16_s16_tied1, svbfloat16_t, svint16_t,
+ z0_res = svreinterpret_bf16_s16 (z0),
+ z0_res = svreinterpret_bf16 (z0))
+
+/*
+** reinterpret_bf16_s16_untied:
+** mov z0\.d, z4\.d
+** ret
+*/
+TEST_DUAL_Z (reinterpret_bf16_s16_untied, svbfloat16_t, svint16_t,
+ z0 = svreinterpret_bf16_s16 (z4),
+ z0 = svreinterpret_bf16 (z4))
+
+/*
+** reinterpret_bf16_s32_tied1:
+** ret
+*/
+TEST_DUAL_Z_REV (reinterpret_bf16_s32_tied1, svbfloat16_t, svint32_t,
+ z0_res = svreinterpret_bf16_s32 (z0),
+ z0_res = svreinterpret_bf16 (z0))
+
+/*
+** reinterpret_bf16_s32_untied:
+** mov z0\.d, z4\.d
+** ret
+*/
+TEST_DUAL_Z (reinterpret_bf16_s32_untied, svbfloat16_t, svint32_t,
+ z0 = svreinterpret_bf16_s32 (z4),
+ z0 = svreinterpret_bf16 (z4))
+
+/*
+** reinterpret_bf16_s64_tied1:
+** ret
+*/
+TEST_DUAL_Z_REV (reinterpret_bf16_s64_tied1, svbfloat16_t, svint64_t,
+ z0_res = svreinterpret_bf16_s64 (z0),
+ z0_res = svreinterpret_bf16 (z0))
+
+/*
+** reinterpret_bf16_s64_untied:
+** mov z0\.d, z4\.d
+** ret
+*/
+TEST_DUAL_Z (reinterpret_bf16_s64_untied, svbfloat16_t, svint64_t,
+ z0 = svreinterpret_bf16_s64 (z4),
+ z0 = svreinterpret_bf16 (z4))
+
+/*
+** reinterpret_bf16_u8_tied1:
+** ret
+*/
+TEST_DUAL_Z_REV (reinterpret_bf16_u8_tied1, svbfloat16_t, svuint8_t,
+ z0_res = svreinterpret_bf16_u8 (z0),
+ z0_res = svreinterpret_bf16 (z0))
+
+/*
+** reinterpret_bf16_u8_untied:
+** mov z0\.d, z4\.d
+** ret
+*/
+TEST_DUAL_Z (reinterpret_bf16_u8_untied, svbfloat16_t, svuint8_t,
+ z0 = svreinterpret_bf16_u8 (z4),
+ z0 = svreinterpret_bf16 (z4))
+
+/*
+** reinterpret_bf16_u16_tied1:
+** ret
+*/
+TEST_DUAL_Z_REV (reinterpret_bf16_u16_tied1, svbfloat16_t, svuint16_t,
+ z0_res = svreinterpret_bf16_u16 (z0),
+ z0_res = svreinterpret_bf16 (z0))
+
+/*
+** reinterpret_bf16_u16_untied:
+** mov z0\.d, z4\.d
+** ret
+*/
+TEST_DUAL_Z (reinterpret_bf16_u16_untied, svbfloat16_t, svuint16_t,
+ z0 = svreinterpret_bf16_u16 (z4),
+ z0 = svreinterpret_bf16 (z4))
+
+/*
+** reinterpret_bf16_u32_tied1:
+** ret
+*/
+TEST_DUAL_Z_REV (reinterpret_bf16_u32_tied1, svbfloat16_t, svuint32_t,
+ z0_res = svreinterpret_bf16_u32 (z0),
+ z0_res = svreinterpret_bf16 (z0))
+
+/*
+** reinterpret_bf16_u32_untied:
+** mov z0\.d, z4\.d
+** ret
+*/
+TEST_DUAL_Z (reinterpret_bf16_u32_untied, svbfloat16_t, svuint32_t,
+ z0 = svreinterpret_bf16_u32 (z4),
+ z0 = svreinterpret_bf16 (z4))
+
+/*
+** reinterpret_bf16_u64_tied1:
+** ret
+*/
+TEST_DUAL_Z_REV (reinterpret_bf16_u64_tied1, svbfloat16_t, svuint64_t,
+ z0_res = svreinterpret_bf16_u64 (z0),
+ z0_res = svreinterpret_bf16 (z0))
+
+/*
+** reinterpret_bf16_u64_untied:
+** mov z0\.d, z4\.d
+** ret
+*/
+TEST_DUAL_Z (reinterpret_bf16_u64_untied, svbfloat16_t, svuint64_t,
+ z0 = svreinterpret_bf16_u64 (z4),
+ z0 = svreinterpret_bf16 (z4))
@@ -2,6 +2,23 @@

#include "test_sve_acle.h"

+/*
+** reinterpret_f16_bf16_tied1:
+** ret
+*/
+TEST_DUAL_Z_REV (reinterpret_f16_bf16_tied1, svfloat16_t, svbfloat16_t,
+ z0_res = svreinterpret_f16_bf16 (z0),
+ z0_res = svreinterpret_f16 (z0))
+
+/*
+** reinterpret_f16_bf16_untied:
+** mov z0\.d, z4\.d
+** ret
+*/
+TEST_DUAL_Z (reinterpret_f16_bf16_untied, svfloat16_t, svbfloat16_t,
+ z0 = svreinterpret_f16_bf16 (z4),
+ z0 = svreinterpret_f16 (z4))
+
/*
** reinterpret_f16_f16_tied1:
** ret
@@ -2,6 +2,23 @@

#include "test_sve_acle.h"

+/*
+** reinterpret_f32_bf16_tied1:
+** ret
+*/
+TEST_DUAL_Z_REV (reinterpret_f32_bf16_tied1, svfloat32_t, svbfloat16_t,
+ z0_res = svreinterpret_f32_bf16 (z0),
+ z0_res = svreinterpret_f32 (z0))
+
+/*
+** reinterpret_f32_bf16_untied:
+** mov z0\.d, z4\.d
+** ret
+*/
+TEST_DUAL_Z (reinterpret_f32_bf16_untied, svfloat32_t, svbfloat16_t,
+ z0 = svreinterpret_f32_bf16 (z4),
+ z0 = svreinterpret_f32 (z4))
+
/*
** reinterpret_f32_f16_tied1:
** ret
@@ -2,6 +2,23 @@

#include "test_sve_acle.h"

+/*
+** reinterpret_f64_bf16_tied1:
+** ret
+*/
+TEST_DUAL_Z_REV (reinterpret_f64_bf16_tied1, svfloat64_t, svbfloat16_t,
+ z0_res = svreinterpret_f64_bf16 (z0),
+ z0_res = svreinterpret_f64 (z0))
+
+/*
+** reinterpret_f64_bf16_untied:
+** mov z0\.d, z4\.d
+** ret
+*/
+TEST_DUAL_Z (reinterpret_f64_bf16_untied, svfloat64_t, svbfloat16_t,
+ z0 = svreinterpret_f64_bf16 (z4),
+ z0 = svreinterpret_f64 (z4))
+
/*
** reinterpret_f64_f16_tied1:
** ret
@@ -2,6 +2,23 @@

#include "test_sve_acle.h"

+/*
+** reinterpret_s16_bf16_tied1:
+** ret
+*/
+TEST_DUAL_Z_REV (reinterpret_s16_bf16_tied1, svint16_t, svbfloat16_t,
+ z0_res = svreinterpret_s16_bf16 (z0),
+ z0_res = svreinterpret_s16 (z0))
+
+/*
+** reinterpret_s16_bf16_untied:
+** mov z0\.d, z4\.d
+** ret
+*/
+TEST_DUAL_Z (reinterpret_s16_bf16_untied, svint16_t, svbfloat16_t,
+ z0 = svreinterpret_s16_bf16 (z4),
+ z0 = svreinterpret_s16 (z4))
+
/*
** reinterpret_s16_f16_tied1:
** ret
@@ -2,6 +2,23 @@

#include "test_sve_acle.h"

+/*
+** reinterpret_s32_bf16_tied1:
+** ret
+*/
+TEST_DUAL_Z_REV (reinterpret_s32_bf16_tied1, svint32_t, svbfloat16_t,
+ z0_res = svreinterpret_s32_bf16 (z0),
+ z0_res = svreinterpret_s32 (z0))
+
+/*
+** reinterpret_s32_bf16_untied:
+** mov z0\.d, z4\.d
+** ret
+*/
+TEST_DUAL_Z (reinterpret_s32_bf16_untied, svint32_t, svbfloat16_t,
+ z0 = svreinterpret_s32_bf16 (z4),
+ z0 = svreinterpret_s32 (z4))
+
/*
** reinterpret_s32_f16_tied1:
** ret
@@ -2,6 +2,23 @@

#include "test_sve_acle.h"

+/*
+** reinterpret_s64_bf16_tied1:
+** ret
+*/
+TEST_DUAL_Z_REV (reinterpret_s64_bf16_tied1, svint64_t, svbfloat16_t,
+ z0_res = svreinterpret_s64_bf16 (z0),
+ z0_res = svreinterpret_s64 (z0))
+
+/*
+** reinterpret_s64_bf16_untied:
+** mov z0\.d, z4\.d
+** ret
+*/
+TEST_DUAL_Z (reinterpret_s64_bf16_untied, svint64_t, svbfloat16_t,
+ z0 = svreinterpret_s64_bf16 (z4),
+ z0 = svreinterpret_s64 (z4))
+
/*
** reinterpret_s64_f16_tied1:
** ret
@@ -2,6 +2,23 @@

#include "test_sve_acle.h"

+/*
+** reinterpret_s8_bf16_tied1:
+** ret
+*/
+TEST_DUAL_Z_REV (reinterpret_s8_bf16_tied1, svint8_t, svbfloat16_t,
+ z0_res = svreinterpret_s8_bf16 (z0),
+ z0_res = svreinterpret_s8 (z0))
+
+/*
+** reinterpret_s8_bf16_untied:
+** mov z0\.d, z4\.d
+** ret
+*/
+TEST_DUAL_Z (reinterpret_s8_bf16_untied, svint8_t, svbfloat16_t,
+ z0 = svreinterpret_s8_bf16 (z4),
+ z0 = svreinterpret_s8 (z4))
+
/*
** reinterpret_s8_f16_tied1:
** ret
@@ -2,6 +2,23 @@

#include "test_sve_acle.h"

+/*
+** reinterpret_u16_bf16_tied1:
+** ret
+*/
+TEST_DUAL_Z_REV (reinterpret_u16_bf16_tied1, svuint16_t, svbfloat16_t,
+ z0_res = svreinterpret_u16_bf16 (z0),
+ z0_res = svreinterpret_u16 (z0))
+
+/*
+** reinterpret_u16_bf16_untied:
+** mov z0\.d, z4\.d
+** ret
+*/
+TEST_DUAL_Z (reinterpret_u16_bf16_untied, svuint16_t, svbfloat16_t,
+ z0 = svreinterpret_u16_bf16 (z4),
+ z0 = svreinterpret_u16 (z4))
+
/*
** reinterpret_u16_f16_tied1:
** ret
@@ -2,6 +2,23 @@

#include "test_sve_acle.h"

+/*
+** reinterpret_u32_bf16_tied1:
+** ret
+*/
+TEST_DUAL_Z_REV (reinterpret_u32_bf16_tied1, svuint32_t, svbfloat16_t,
+ z0_res = svreinterpret_u32_bf16 (z0),
+ z0_res = svreinterpret_u32 (z0))
+
+/*
+** reinterpret_u32_bf16_untied:
+** mov z0\.d, z4\.d
+** ret
+*/
+TEST_DUAL_Z (reinterpret_u32_bf16_untied, svuint32_t, svbfloat16_t,
+ z0 = svreinterpret_u32_bf16 (z4),
+ z0 = svreinterpret_u32 (z4))
+
/*
** reinterpret_u32_f16_tied1:
** ret
@@ -2,6 +2,23 @@

#include "test_sve_acle.h"

+/*
+** reinterpret_u64_bf16_tied1:
+** ret
+*/
+TEST_DUAL_Z_REV (reinterpret_u64_bf16_tied1, svuint64_t, svbfloat16_t,
+ z0_res = svreinterpret_u64_bf16 (z0),
+ z0_res = svreinterpret_u64 (z0))
+
+/*
+** reinterpret_u64_bf16_untied:
+** mov z0\.d, z4\.d
+** ret
+*/
+TEST_DUAL_Z (reinterpret_u64_bf16_untied, svuint64_t, svbfloat16_t,
+ z0 = svreinterpret_u64_bf16 (z4),
+ z0 = svreinterpret_u64 (z4))
+
/*
** reinterpret_u64_f16_tied1:
** ret
@@ -2,6 +2,23 @@

#include "test_sve_acle.h"

+/*
+** reinterpret_u8_bf16_tied1:
+** ret
+*/
+TEST_DUAL_Z_REV (reinterpret_u8_bf16_tied1, svuint8_t, svbfloat16_t,
+ z0_res = svreinterpret_u8_bf16 (z0),
+ z0_res = svreinterpret_u8 (z0))
+
+/*
+** reinterpret_u8_bf16_untied:
+** mov z0\.d, z4\.d
+** ret
+*/
+TEST_DUAL_Z (reinterpret_u8_bf16_untied, svuint8_t, svbfloat16_t,
+ z0 = svreinterpret_u8_bf16 (z4),
+ z0 = svreinterpret_u8 (z4))
+
/*
** reinterpret_u8_f16_tied1:
** ret
new file mode 100644
@@ -0,0 +1,21 @@
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+/*
+** rev_bf16_tied1:
+** rev z0\.h, z0\.h
+** ret
+*/
+TEST_UNIFORM_Z (rev_bf16_tied1, svbfloat16_t,
+ z0 = svrev_bf16 (z0),
+ z0 = svrev (z0))
+
+/*
+** rev_bf16_untied:
+** rev z0\.h, z1\.h
+** ret
+*/
+TEST_UNIFORM_Z (rev_bf16_untied, svbfloat16_t,
+ z0 = svrev_bf16 (z1),
+ z0 = svrev (z1))
new file mode 100644
@@ -0,0 +1,30 @@
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+/*
+** sel_bf16_tied1:
+** sel z0\.h, p0, z0\.h, z1\.h
+** ret
+*/
+TEST_UNIFORM_Z (sel_bf16_tied1, svbfloat16_t,
+ z0 = svsel_bf16 (p0, z0, z1),
+ z0 = svsel (p0, z0, z1))
+
+/*
+** sel_bf16_tied2:
+** sel z0\.h, p0, z1\.h, z0\.h
+** ret
+*/
+TEST_UNIFORM_Z (sel_bf16_tied2, svbfloat16_t,
+ z0 = svsel_bf16 (p0, z1, z0),
+ z0 = svsel (p0, z1, z0))
+
+/*
+** sel_bf16_untied:
+** sel z0\.h, p0, z1\.h, z2\.h
+** ret
+*/
+TEST_UNIFORM_Z (sel_bf16_untied, svbfloat16_t,
+ z0 = svsel_bf16 (p0, z1, z2),
+ z0 = svsel (p0, z1, z2))
new file mode 100644
@@ -0,0 +1,41 @@
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+/*
+** set2_bf16_z24_0:
+** mov z25\.d, z5\.d
+** mov z24\.d, z0\.d
+** ret
+*/
+TEST_SET (set2_bf16_z24_0, svbfloat16x2_t, svbfloat16_t,
+ z24 = svset2_bf16 (z4, 0, z0),
+ z24 = svset2 (z4, 0, z0))
+
+/*
+** set2_bf16_z24_1:
+** mov z24\.d, z4\.d
+** mov z25\.d, z0\.d
+** ret
+*/
+TEST_SET (set2_bf16_z24_1, svbfloat16x2_t, svbfloat16_t,
+ z24 = svset2_bf16 (z4, 1, z0),
+ z24 = svset2 (z4, 1, z0))
+
+/*
+** set2_bf16_z4_0:
+** mov z4\.d, z0\.d
+** ret
+*/
+TEST_SET (set2_bf16_z4_0, svbfloat16x2_t, svbfloat16_t,
+ z4 = svset2_bf16 (z4, 0, z0),
+ z4 = svset2 (z4, 0, z0))
+
+/*
+** set2_bf16_z4_1:
+** mov z5\.d, z0\.d
+** ret
+*/
+TEST_SET (set2_bf16_z4_1, svbfloat16x2_t, svbfloat16_t,
+ z4 = svset2_bf16 (z4, 1, z0),
+ z4 = svset2 (z4, 1, z0))
new file mode 100644
@@ -0,0 +1,63 @@
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+/*
+** set3_bf16_z24_0:
+** mov z25\.d, z5\.d
+** mov z26\.d, z6\.d
+** mov z24\.d, z0\.d
+** ret
+*/
+TEST_SET (set3_bf16_z24_0, svbfloat16x3_t, svbfloat16_t,
+ z24 = svset3_bf16 (z4, 0, z0),
+ z24 = svset3 (z4, 0, z0))
+
+/*
+** set3_bf16_z24_1:
+** mov z24\.d, z4\.d
+** mov z26\.d, z6\.d
+** mov z25\.d, z0\.d
+** ret
+*/
+TEST_SET (set3_bf16_z24_1, svbfloat16x3_t, svbfloat16_t,
+ z24 = svset3_bf16 (z4, 1, z0),
+ z24 = svset3 (z4, 1, z0))
+
+/*
+** set3_bf16_z24_2:
+** mov z24\.d, z4\.d
+** mov z25\.d, z5\.d
+** mov z26\.d, z0\.d
+** ret
+*/
+TEST_SET (set3_bf16_z24_2, svbfloat16x3_t, svbfloat16_t,
+ z24 = svset3_bf16 (z4, 2, z0),
+ z24 = svset3 (z4, 2, z0))
+
+/*
+** set3_bf16_z4_0:
+** mov z4\.d, z0\.d
+** ret
+*/
+TEST_SET (set3_bf16_z4_0, svbfloat16x3_t, svbfloat16_t,
+ z4 = svset3_bf16 (z4, 0, z0),
+ z4 = svset3 (z4, 0, z0))
+
+/*
+** set3_bf16_z4_1:
+** mov z5\.d, z0\.d
+** ret
+*/
+TEST_SET (set3_bf16_z4_1, svbfloat16x3_t, svbfloat16_t,
+ z4 = svset3_bf16 (z4, 1, z0),
+ z4 = svset3 (z4, 1, z0))
+
+/*
+** set3_bf16_z4_2:
+** mov z6\.d, z0\.d
+** ret
+*/
+TEST_SET (set3_bf16_z4_2, svbfloat16x3_t, svbfloat16_t,
+ z4 = svset3_bf16 (z4, 2, z0),
+ z4 = svset3 (z4, 2, z0))
new file mode 100644
@@ -0,0 +1,87 @@
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+/*
+** set4_bf16_z24_0:
+** mov z25\.d, z5\.d
+** mov z26\.d, z6\.d
+** mov z27\.d, z7\.d
+** mov z24\.d, z0\.d
+** ret
+*/
+TEST_SET (set4_bf16_z24_0, svbfloat16x4_t, svbfloat16_t,
+ z24 = svset4_bf16 (z4, 0, z0),
+ z24 = svset4 (z4, 0, z0))
+
+/*
+** set4_bf16_z24_1:
+** mov z24\.d, z4\.d
+** mov z26\.d, z6\.d
+** mov z27\.d, z7\.d
+** mov z25\.d, z0\.d
+** ret
+*/
+TEST_SET (set4_bf16_z24_1, svbfloat16x4_t, svbfloat16_t,
+ z24 = svset4_bf16 (z4, 1, z0),
+ z24 = svset4 (z4, 1, z0))
+
+/*
+** set4_bf16_z24_2:
+** mov z24\.d, z4\.d
+** mov z25\.d, z5\.d
+** mov z27\.d, z7\.d
+** mov z26\.d, z0\.d
+** ret
+*/
+TEST_SET (set4_bf16_z24_2, svbfloat16x4_t, svbfloat16_t,
+ z24 = svset4_bf16 (z4, 2, z0),
+ z24 = svset4 (z4, 2, z0))
+
+/*
+** set4_bf16_z24_3:
+** mov z24\.d, z4\.d
+** mov z25\.d, z5\.d
+** mov z26\.d, z6\.d
+** mov z27\.d, z0\.d
+** ret
+*/
+TEST_SET (set4_bf16_z24_3, svbfloat16x4_t, svbfloat16_t,
+ z24 = svset4_bf16 (z4, 3, z0),
+ z24 = svset4 (z4, 3, z0))
+
+/*
+** set4_bf16_z4_0:
+** mov z4\.d, z0\.d
+** ret
+*/
+TEST_SET (set4_bf16_z4_0, svbfloat16x4_t, svbfloat16_t,
+ z4 = svset4_bf16 (z4, 0, z0),
+ z4 = svset4 (z4, 0, z0))
+
+/*
+** set4_bf16_z4_1:
+** mov z5\.d, z0\.d
+** ret
+*/
+TEST_SET (set4_bf16_z4_1, svbfloat16x4_t, svbfloat16_t,
+ z4 = svset4_bf16 (z4, 1, z0),
+ z4 = svset4 (z4, 1, z0))
+
+/*
+** set4_bf16_z4_2:
+** mov z6\.d, z0\.d
+** ret
+*/
+TEST_SET (set4_bf16_z4_2, svbfloat16x4_t, svbfloat16_t,
+ z4 = svset4_bf16 (z4, 2, z0),
+ z4 = svset4 (z4, 2, z0))
+
+/*
+** set4_bf16_z4_3:
+** mov z7\.d, z0\.d
+** ret
+*/
+TEST_SET (set4_bf16_z4_3, svbfloat16x4_t, svbfloat16_t,
+ z4 = svset4_bf16 (z4, 3, z0),
+ z4 = svset4 (z4, 3, z0))
new file mode 100644
@@ -0,0 +1,33 @@
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+/*
+** splice_bf16_tied1:
+** splice z0\.h, p0, z0\.h, z1\.h
+** ret
+*/
+TEST_UNIFORM_Z (splice_bf16_tied1, svbfloat16_t,
+ z0 = svsplice_bf16 (p0, z0, z1),
+ z0 = svsplice (p0, z0, z1))
+
+/*
+** splice_bf16_tied2:
+** mov (z[0-9]+)\.d, z0\.d
+** movprfx z0, z1
+** splice z0\.h, p0, z0\.h, \1\.h
+** ret
+*/
+TEST_UNIFORM_Z (splice_bf16_tied2, svbfloat16_t,
+ z0 = svsplice_bf16 (p0, z1, z0),
+ z0 = svsplice (p0, z1, z0))
+
+/*
+** splice_bf16_untied:
+** movprfx z0, z1
+** splice z0\.h, p0, z0\.h, z2\.h
+** ret
+*/
+TEST_UNIFORM_Z (splice_bf16_untied, svbfloat16_t,
+ z0 = svsplice_bf16 (p0, z1, z2),
+ z0 = svsplice (p0, z1, z2))
new file mode 100644
@@ -0,0 +1,158 @@
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */
+
+#include "test_sve_acle.h"
+
+/*
+** st1_bf16_base:
+** st1h z0\.h, p0, \[x0\]
+** ret
+*/
+TEST_STORE (st1_bf16_base, svbfloat16_t, bfloat16_t,
+ svst1_bf16 (p0, x0, z0),
+ svst1 (p0, x0, z0))
+
+/*
+** st1_bf16_index:
+** st1h z0\.h, p0, \[x0, x1, lsl 1\]
+** ret
+*/
+TEST_STORE (st1_bf16_index, svbfloat16_t, bfloat16_t,
+ svst1_bf16 (p0, x0 + x1, z0),
+ svst1 (p0, x0 + x1, z0))
+
+/*
+** st1_bf16_1:
+** st1h z0\.h, p0, \[x0, #1, mul vl\]
+** ret
+*/
+TEST_STORE (st1_bf16_1, svbfloat16_t, bfloat16_t,
+ svst1_bf16 (p0, x0 + svcnth (), z0),
+ svst1 (p0, x0 + svcnth (), z0))
+
+/*
+** st1_bf16_7:
+** st1h z0\.h, p0, \[x0, #7, mul vl\]
+** ret
+*/
+TEST_STORE (st1_bf16_7, svbfloat16_t, bfloat16_t,
+ svst1_bf16 (p0, x0 + svcnth () * 7, z0),
+ svst1 (p0, x0 + svcnth () * 7, z0))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** st1_bf16_8:
+** incb x0, all, mul #8
+** st1h z0\.h, p0, \[x0\]
+** ret
+*/
+TEST_STORE (st1_bf16_8, svbfloat16_t, bfloat16_t,
+ svst1_bf16 (p0, x0 + svcnth () * 8, z0),
+ svst1 (p0, x0 + svcnth () * 8, z0))
+
+/*
+** st1_bf16_m1:
+** st1h z0\.h, p0, \[x0, #-1, mul vl\]
+** ret
+*/
+TEST_STORE (st1_bf16_m1, svbfloat16_t, bfloat16_t,
+ svst1_bf16 (p0, x0 - svcnth (), z0),
+ svst1 (p0, x0 - svcnth (), z0))
+
+/*
+** st1_bf16_m8:
+** st1h z0\.h, p0, \[x0, #-8, mul vl\]
+** ret
+*/
+TEST_STORE (st1_bf16_m8, svbfloat16_t, bfloat16_t,
+ svst1_bf16 (p0, x0 - svcnth () * 8, z0),
+ svst1 (p0, x0 - svcnth () * 8, z0))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** st1_bf16_m9:
+** decb x0, all, mul #9
+** st1h z0\.h, p0, \[x0\]
+** ret
+*/
+TEST_STORE (st1_bf16_m9, svbfloat16_t, bfloat16_t,
+ svst1_bf16 (p0, x0 - svcnth () * 9, z0),
+ svst1 (p0, x0 - svcnth () * 9, z0))
+
+/*
+** st1_vnum_bf16_0:
+** st1h z0\.h, p0, \[x0\]
+** ret
+*/
+TEST_STORE (st1_vnum_bf16_0, svbfloat16_t, bfloat16_t,
+ svst1_vnum_bf16 (p0, x0, 0, z0),
+ svst1_vnum (p0, x0, 0, z0))
+
+/*
+** st1_vnum_bf16_1:
+** st1h z0\.h, p0, \[x0, #1, mul vl\]
+** ret
+*/
+TEST_STORE (st1_vnum_bf16_1, svbfloat16_t, bfloat16_t,
+ svst1_vnum_bf16 (p0, x0, 1, z0),
+ svst1_vnum (p0, x0, 1, z0))
+
+/*
+** st1_vnum_bf16_7:
+** st1h z0\.h, p0, \[x0, #7, mul vl\]
+** ret
+*/
+TEST_STORE (st1_vnum_bf16_7, svbfloat16_t, bfloat16_t,
+ svst1_vnum_bf16 (p0, x0, 7, z0),
+ svst1_vnum (p0, x0, 7, z0))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** st1_vnum_bf16_8:
+** incb x0, all, mul #8
+** st1h z0\.h, p0, \[x0\]
+** ret
+*/
+TEST_STORE (st1_vnum_bf16_8, svbfloat16_t, bfloat16_t,
+ svst1_vnum_bf16 (p0, x0, 8, z0),
+ svst1_vnum (p0, x0, 8, z0))
+
+/*
+** st1_vnum_bf16_m1:
+** st1h z0\.h, p0, \[x0, #-1, mul vl\]
+** ret
+*/
+TEST_STORE (st1_vnum_bf16_m1, svbfloat16_t, bfloat16_t,
+ svst1_vnum_bf16 (p0, x0, -1, z0),
+ svst1_vnum (p0, x0, -1, z0))
+
+/*
+** st1_vnum_bf16_m8:
+** st1h z0\.h, p0, \[x0, #-8, mul vl\]
+** ret
+*/
+TEST_STORE (st1_vnum_bf16_m8, svbfloat16_t, bfloat16_t,
+ svst1_vnum_bf16 (p0, x0, -8, z0),
+ svst1_vnum (p0, x0, -8, z0))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** st1_vnum_bf16_m9:
+** decb x0, all, mul #9
+** st1h z0\.h, p0, \[x0\]
+** ret
+*/
+TEST_STORE (st1_vnum_bf16_m9, svbfloat16_t, bfloat16_t,
+ svst1_vnum_bf16 (p0, x0, -9, z0),
+ svst1_vnum (p0, x0, -9, z0))
+
+/* Using MUL to calculate an index would also be OK. */
+/*
+** st1_vnum_bf16_x1:
+** cntb (x[0-9]+)
+** madd (x[0-9]+), (?:x1, \1|\1, x1), x0
+** st1h z0\.h, p0, \[\2\]
+** ret
+*/
+TEST_STORE (st1_vnum_bf16_x1, svbfloat16_t, bfloat16_t,
+ svst1_vnum_bf16 (p0, x0, x1, z0),
+ svst1_vnum (p0, x0, x1, z0))
new file mode 100644
@@ -0,0 +1,200 @@
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */
+
+#include "test_sve_acle.h"
+
+/*
+** st2_bf16_base:
+** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0\]
+** ret
+*/
+TEST_STORE (st2_bf16_base, svbfloat16x2_t, bfloat16_t,
+ svst2_bf16 (p0, x0, z0),
+ svst2 (p0, x0, z0))
+
+/*
+** st2_bf16_index:
+** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0, x1, lsl 1\]
+** ret
+*/
+TEST_STORE (st2_bf16_index, svbfloat16x2_t, bfloat16_t,
+ svst2_bf16 (p0, x0 + x1, z0),
+ svst2 (p0, x0 + x1, z0))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** st2_bf16_1:
+** incb x0
+** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0\]
+** ret
+*/
+TEST_STORE (st2_bf16_1, svbfloat16x2_t, bfloat16_t,
+ svst2_bf16 (p0, x0 + svcnth (), z0),
+ svst2 (p0, x0 + svcnth (), z0))
+
+/*
+** st2_bf16_2:
+** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0, #2, mul vl\]
+** ret
+*/
+TEST_STORE (st2_bf16_2, svbfloat16x2_t, bfloat16_t,
+ svst2_bf16 (p0, x0 + svcnth () * 2, z0),
+ svst2 (p0, x0 + svcnth () * 2, z0))
+
+/*
+** st2_bf16_14:
+** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0, #14, mul vl\]
+** ret
+*/
+TEST_STORE (st2_bf16_14, svbfloat16x2_t, bfloat16_t,
+ svst2_bf16 (p0, x0 + svcnth () * 14, z0),
+ svst2 (p0, x0 + svcnth () * 14, z0))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** st2_bf16_16:
+** incb x0, all, mul #16
+** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0\]
+** ret
+*/
+TEST_STORE (st2_bf16_16, svbfloat16x2_t, bfloat16_t,
+ svst2_bf16 (p0, x0 + svcnth () * 16, z0),
+ svst2 (p0, x0 + svcnth () * 16, z0))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** st2_bf16_m1:
+** decb x0
+** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0\]
+** ret
+*/
+TEST_STORE (st2_bf16_m1, svbfloat16x2_t, bfloat16_t,
+ svst2_bf16 (p0, x0 - svcnth (), z0),
+ svst2 (p0, x0 - svcnth (), z0))
+
+/*
+** st2_bf16_m2:
+** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0, #-2, mul vl\]
+** ret
+*/
+TEST_STORE (st2_bf16_m2, svbfloat16x2_t, bfloat16_t,
+ svst2_bf16 (p0, x0 - svcnth () * 2, z0),
+ svst2 (p0, x0 - svcnth () * 2, z0))
+
+/*
+** st2_bf16_m16:
+** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0, #-16, mul vl\]
+** ret
+*/
+TEST_STORE (st2_bf16_m16, svbfloat16x2_t, bfloat16_t,
+ svst2_bf16 (p0, x0 - svcnth () * 16, z0),
+ svst2 (p0, x0 - svcnth () * 16, z0))
+
+/*
+** st2_bf16_m18:
+** addvl (x[0-9]+), x0, #-18
+** st2h {z0\.h(?: - |, )z1\.h}, p0, \[\1\]
+** ret
+*/
+TEST_STORE (st2_bf16_m18, svbfloat16x2_t, bfloat16_t,
+ svst2_bf16 (p0, x0 - svcnth () * 18, z0),
+ svst2 (p0, x0 - svcnth () * 18, z0))
+
+/*
+** st2_vnum_bf16_0:
+** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0\]
+** ret
+*/
+TEST_STORE (st2_vnum_bf16_0, svbfloat16x2_t, bfloat16_t,
+ svst2_vnum_bf16 (p0, x0, 0, z0),
+ svst2_vnum (p0, x0, 0, z0))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** st2_vnum_bf16_1:
+** incb x0
+** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0\]
+** ret
+*/
+TEST_STORE (st2_vnum_bf16_1, svbfloat16x2_t, bfloat16_t,
+ svst2_vnum_bf16 (p0, x0, 1, z0),
+ svst2_vnum (p0, x0, 1, z0))
+
+/*
+** st2_vnum_bf16_2:
+** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0, #2, mul vl\]
+** ret
+*/
+TEST_STORE (st2_vnum_bf16_2, svbfloat16x2_t, bfloat16_t,
+ svst2_vnum_bf16 (p0, x0, 2, z0),
+ svst2_vnum (p0, x0, 2, z0))
+
+/*
+** st2_vnum_bf16_14:
+** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0, #14, mul vl\]
+** ret
+*/
+TEST_STORE (st2_vnum_bf16_14, svbfloat16x2_t, bfloat16_t,
+ svst2_vnum_bf16 (p0, x0, 14, z0),
+ svst2_vnum (p0, x0, 14, z0))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** st2_vnum_bf16_16:
+** incb x0, all, mul #16
+** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0\]
+** ret
+*/
+TEST_STORE (st2_vnum_bf16_16, svbfloat16x2_t, bfloat16_t,
+ svst2_vnum_bf16 (p0, x0, 16, z0),
+ svst2_vnum (p0, x0, 16, z0))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** st2_vnum_bf16_m1:
+** decb x0
+** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0\]
+** ret
+*/
+TEST_STORE (st2_vnum_bf16_m1, svbfloat16x2_t, bfloat16_t,
+ svst2_vnum_bf16 (p0, x0, -1, z0),
+ svst2_vnum (p0, x0, -1, z0))
+
+/*
+** st2_vnum_bf16_m2:
+** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0, #-2, mul vl\]
+** ret
+*/
+TEST_STORE (st2_vnum_bf16_m2, svbfloat16x2_t, bfloat16_t,
+ svst2_vnum_bf16 (p0, x0, -2, z0),
+ svst2_vnum (p0, x0, -2, z0))
+
+/*
+** st2_vnum_bf16_m16:
+** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0, #-16, mul vl\]
+** ret
+*/
+TEST_STORE (st2_vnum_bf16_m16, svbfloat16x2_t, bfloat16_t,
+ svst2_vnum_bf16 (p0, x0, -16, z0),
+ svst2_vnum (p0, x0, -16, z0))
+
+/*
+** st2_vnum_bf16_m18:
+** addvl (x[0-9]+), x0, #-18
+** st2h {z0\.h(?: - |, )z1\.h}, p0, \[\1\]
+** ret
+*/
+TEST_STORE (st2_vnum_bf16_m18, svbfloat16x2_t, bfloat16_t,
+ svst2_vnum_bf16 (p0, x0, -18, z0),
+ svst2_vnum (p0, x0, -18, z0))
+
+/* Using MUL to calculate an index would also be OK. */
+/*
+** st2_vnum_bf16_x1:
+** cntb (x[0-9]+)
+** madd (x[0-9]+), (?:x1, \1|\1, x1), x0
+** st2h {z0\.h(?: - |, )z1\.h}, p0, \[\2\]
+** ret
+*/
+TEST_STORE (st2_vnum_bf16_x1, svbfloat16x2_t, bfloat16_t,
+ svst2_vnum_bf16 (p0, x0, x1, z0),
+ svst2_vnum (p0, x0, x1, z0))
new file mode 100644
@@ -0,0 +1,242 @@
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */
+
+#include "test_sve_acle.h"
+
+/*
+** st3_bf16_base:
+** st3h {z0\.h - z2\.h}, p0, \[x0\]
+** ret
+*/
+TEST_STORE (st3_bf16_base, svbfloat16x3_t, bfloat16_t,
+ svst3_bf16 (p0, x0, z0),
+ svst3 (p0, x0, z0))
+
+/*
+** st3_bf16_index:
+** st3h {z0\.h - z2\.h}, p0, \[x0, x1, lsl 1\]
+** ret
+*/
+TEST_STORE (st3_bf16_index, svbfloat16x3_t, bfloat16_t,
+ svst3_bf16 (p0, x0 + x1, z0),
+ svst3 (p0, x0 + x1, z0))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** st3_bf16_1:
+** incb x0
+** st3h {z0\.h - z2\.h}, p0, \[x0\]
+** ret
+*/
+TEST_STORE (st3_bf16_1, svbfloat16x3_t, bfloat16_t,
+ svst3_bf16 (p0, x0 + svcnth (), z0),
+ svst3 (p0, x0 + svcnth (), z0))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** st3_bf16_2:
+** incb x0, all, mul #2
+** st3h {z0\.h - z2\.h}, p0, \[x0\]
+** ret
+*/
+TEST_STORE (st3_bf16_2, svbfloat16x3_t, bfloat16_t,
+ svst3_bf16 (p0, x0 + svcnth () * 2, z0),
+ svst3 (p0, x0 + svcnth () * 2, z0))
+
+/*
+** st3_bf16_3:
+** st3h {z0\.h - z2\.h}, p0, \[x0, #3, mul vl\]
+** ret
+*/
+TEST_STORE (st3_bf16_3, svbfloat16x3_t, bfloat16_t,
+ svst3_bf16 (p0, x0 + svcnth () * 3, z0),
+ svst3 (p0, x0 + svcnth () * 3, z0))
+
+/*
+** st3_bf16_21:
+** st3h {z0\.h - z2\.h}, p0, \[x0, #21, mul vl\]
+** ret
+*/
+TEST_STORE (st3_bf16_21, svbfloat16x3_t, bfloat16_t,
+ svst3_bf16 (p0, x0 + svcnth () * 21, z0),
+ svst3 (p0, x0 + svcnth () * 21, z0))
+
+/*
+** st3_bf16_24:
+** addvl (x[0-9]+), x0, #24
+** st3h {z0\.h - z2\.h}, p0, \[\1\]
+** ret
+*/
+TEST_STORE (st3_bf16_24, svbfloat16x3_t, bfloat16_t,
+ svst3_bf16 (p0, x0 + svcnth () * 24, z0),
+ svst3 (p0, x0 + svcnth () * 24, z0))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** st3_bf16_m1:
+** decb x0
+** st3h {z0\.h - z2\.h}, p0, \[x0\]
+** ret
+*/
+TEST_STORE (st3_bf16_m1, svbfloat16x3_t, bfloat16_t,
+ svst3_bf16 (p0, x0 - svcnth (), z0),
+ svst3 (p0, x0 - svcnth (), z0))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** st3_bf16_m2:
+** decb x0, all, mul #2
+** st3h {z0\.h - z2\.h}, p0, \[x0\]
+** ret
+*/
+TEST_STORE (st3_bf16_m2, svbfloat16x3_t, bfloat16_t,
+ svst3_bf16 (p0, x0 - svcnth () * 2, z0),
+ svst3 (p0, x0 - svcnth () * 2, z0))
+
+/*
+** st3_bf16_m3:
+** st3h {z0\.h - z2\.h}, p0, \[x0, #-3, mul vl\]
+** ret
+*/
+TEST_STORE (st3_bf16_m3, svbfloat16x3_t, bfloat16_t,
+ svst3_bf16 (p0, x0 - svcnth () * 3, z0),
+ svst3 (p0, x0 - svcnth () * 3, z0))
+
+/*
+** st3_bf16_m24:
+** st3h {z0\.h - z2\.h}, p0, \[x0, #-24, mul vl\]
+** ret
+*/
+TEST_STORE (st3_bf16_m24, svbfloat16x3_t, bfloat16_t,
+ svst3_bf16 (p0, x0 - svcnth () * 24, z0),
+ svst3 (p0, x0 - svcnth () * 24, z0))
+
+/*
+** st3_bf16_m27:
+** addvl (x[0-9]+), x0, #-27
+** st3h {z0\.h - z2\.h}, p0, \[\1\]
+** ret
+*/
+TEST_STORE (st3_bf16_m27, svbfloat16x3_t, bfloat16_t,
+ svst3_bf16 (p0, x0 - svcnth () * 27, z0),
+ svst3 (p0, x0 - svcnth () * 27, z0))
+
+/*
+** st3_vnum_bf16_0:
+** st3h {z0\.h - z2\.h}, p0, \[x0\]
+** ret
+*/
+TEST_STORE (st3_vnum_bf16_0, svbfloat16x3_t, bfloat16_t,
+ svst3_vnum_bf16 (p0, x0, 0, z0),
+ svst3_vnum (p0, x0, 0, z0))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** st3_vnum_bf16_1:
+** incb x0
+** st3h {z0\.h - z2\.h}, p0, \[x0\]
+** ret
+*/
+TEST_STORE (st3_vnum_bf16_1, svbfloat16x3_t, bfloat16_t,
+ svst3_vnum_bf16 (p0, x0, 1, z0),
+ svst3_vnum (p0, x0, 1, z0))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** st3_vnum_bf16_2:
+** incb x0, all, mul #2
+** st3h {z0\.h - z2\.h}, p0, \[x0\]
+** ret
+*/
+TEST_STORE (st3_vnum_bf16_2, svbfloat16x3_t, bfloat16_t,
+ svst3_vnum_bf16 (p0, x0, 2, z0),
+ svst3_vnum (p0, x0, 2, z0))
+
+/*
+** st3_vnum_bf16_3:
+** st3h {z0\.h - z2\.h}, p0, \[x0, #3, mul vl\]
+** ret
+*/
+TEST_STORE (st3_vnum_bf16_3, svbfloat16x3_t, bfloat16_t,
+ svst3_vnum_bf16 (p0, x0, 3, z0),
+ svst3_vnum (p0, x0, 3, z0))
+
+/*
+** st3_vnum_bf16_21:
+** st3h {z0\.h - z2\.h}, p0, \[x0, #21, mul vl\]
+** ret
+*/
+TEST_STORE (st3_vnum_bf16_21, svbfloat16x3_t, bfloat16_t,
+ svst3_vnum_bf16 (p0, x0, 21, z0),
+ svst3_vnum (p0, x0, 21, z0))
+
+/*
+** st3_vnum_bf16_24:
+** addvl (x[0-9]+), x0, #24
+** st3h {z0\.h - z2\.h}, p0, \[\1\]
+** ret
+*/
+TEST_STORE (st3_vnum_bf16_24, svbfloat16x3_t, bfloat16_t,
+ svst3_vnum_bf16 (p0, x0, 24, z0),
+ svst3_vnum (p0, x0, 24, z0))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** st3_vnum_bf16_m1:
+** decb x0
+** st3h {z0\.h - z2\.h}, p0, \[x0\]
+** ret
+*/
+TEST_STORE (st3_vnum_bf16_m1, svbfloat16x3_t, bfloat16_t,
+ svst3_vnum_bf16 (p0, x0, -1, z0),
+ svst3_vnum (p0, x0, -1, z0))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** st3_vnum_bf16_m2:
+** decb x0, all, mul #2
+** st3h {z0\.h - z2\.h}, p0, \[x0\]
+** ret
+*/
+TEST_STORE (st3_vnum_bf16_m2, svbfloat16x3_t, bfloat16_t,
+ svst3_vnum_bf16 (p0, x0, -2, z0),
+ svst3_vnum (p0, x0, -2, z0))
+
+/*
+** st3_vnum_bf16_m3:
+** st3h {z0\.h - z2\.h}, p0, \[x0, #-3, mul vl\]
+** ret
+*/
+TEST_STORE (st3_vnum_bf16_m3, svbfloat16x3_t, bfloat16_t,
+ svst3_vnum_bf16 (p0, x0, -3, z0),
+ svst3_vnum (p0, x0, -3, z0))
+
+/*
+** st3_vnum_bf16_m24:
+** st3h {z0\.h - z2\.h}, p0, \[x0, #-24, mul vl\]
+** ret
+*/
+TEST_STORE (st3_vnum_bf16_m24, svbfloat16x3_t, bfloat16_t,
+ svst3_vnum_bf16 (p0, x0, -24, z0),
+ svst3_vnum (p0, x0, -24, z0))
+
+/*
+** st3_vnum_bf16_m27:
+** addvl (x[0-9]+), x0, #-27
+** st3h {z0\.h - z2\.h}, p0, \[\1\]
+** ret
+*/
+TEST_STORE (st3_vnum_bf16_m27, svbfloat16x3_t, bfloat16_t,
+ svst3_vnum_bf16 (p0, x0, -27, z0),
+ svst3_vnum (p0, x0, -27, z0))
+
+/* Using MUL to calculate an index would also be OK. */
+/*
+** st3_vnum_bf16_x1:
+** cntb (x[0-9]+)
+** madd (x[0-9]+), (?:x1, \1|\1, x1), x0
+** st3h {z0\.h - z2\.h}, p0, \[\2\]
+** ret
+*/
+TEST_STORE (st3_vnum_bf16_x1, svbfloat16x3_t, bfloat16_t,
+ svst3_vnum_bf16 (p0, x0, x1, z0),
+ svst3_vnum (p0, x0, x1, z0))
new file mode 100644
@@ -0,0 +1,286 @@
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */
+
+#include "test_sve_acle.h"
+
+/*
+** st4_bf16_base:
+** st4h {z0\.h - z3\.h}, p0, \[x0\]
+** ret
+*/
+TEST_STORE (st4_bf16_base, svbfloat16x4_t, bfloat16_t,
+ svst4_bf16 (p0, x0, z0),
+ svst4 (p0, x0, z0))
+
+/*
+** st4_bf16_index:
+** st4h {z0\.h - z3\.h}, p0, \[x0, x1, lsl 1\]
+** ret
+*/
+TEST_STORE (st4_bf16_index, svbfloat16x4_t, bfloat16_t,
+ svst4_bf16 (p0, x0 + x1, z0),
+ svst4 (p0, x0 + x1, z0))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** st4_bf16_1:
+** incb x0
+** st4h {z0\.h - z3\.h}, p0, \[x0\]
+** ret
+*/
+TEST_STORE (st4_bf16_1, svbfloat16x4_t, bfloat16_t,
+ svst4_bf16 (p0, x0 + svcnth (), z0),
+ svst4 (p0, x0 + svcnth (), z0))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** st4_bf16_2:
+** incb x0, all, mul #2
+** st4h {z0\.h - z3\.h}, p0, \[x0\]
+** ret
+*/
+TEST_STORE (st4_bf16_2, svbfloat16x4_t, bfloat16_t,
+ svst4_bf16 (p0, x0 + svcnth () * 2, z0),
+ svst4 (p0, x0 + svcnth () * 2, z0))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** st4_bf16_3:
+** incb x0, all, mul #3
+** st4h {z0\.h - z3\.h}, p0, \[x0\]
+** ret
+*/
+TEST_STORE (st4_bf16_3, svbfloat16x4_t, bfloat16_t,
+ svst4_bf16 (p0, x0 + svcnth () * 3, z0),
+ svst4 (p0, x0 + svcnth () * 3, z0))
+
+/*
+** st4_bf16_4:
+** st4h {z0\.h - z3\.h}, p0, \[x0, #4, mul vl\]
+** ret
+*/
+TEST_STORE (st4_bf16_4, svbfloat16x4_t, bfloat16_t,
+ svst4_bf16 (p0, x0 + svcnth () * 4, z0),
+ svst4 (p0, x0 + svcnth () * 4, z0))
+
+/*
+** st4_bf16_28:
+** st4h {z0\.h - z3\.h}, p0, \[x0, #28, mul vl\]
+** ret
+*/
+TEST_STORE (st4_bf16_28, svbfloat16x4_t, bfloat16_t,
+ svst4_bf16 (p0, x0 + svcnth () * 28, z0),
+ svst4 (p0, x0 + svcnth () * 28, z0))
+
+/*
+** st4_bf16_32:
+** [^{]*
+** st4h {z0\.h - z3\.h}, p0, \[x[0-9]+\]
+** ret
+*/
+TEST_STORE (st4_bf16_32, svbfloat16x4_t, bfloat16_t,
+ svst4_bf16 (p0, x0 + svcnth () * 32, z0),
+ svst4 (p0, x0 + svcnth () * 32, z0))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** st4_bf16_m1:
+** decb x0
+** st4h {z0\.h - z3\.h}, p0, \[x0\]
+** ret
+*/
+TEST_STORE (st4_bf16_m1, svbfloat16x4_t, bfloat16_t,
+ svst4_bf16 (p0, x0 - svcnth (), z0),
+ svst4 (p0, x0 - svcnth (), z0))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** st4_bf16_m2:
+** decb x0, all, mul #2
+** st4h {z0\.h - z3\.h}, p0, \[x0\]
+** ret
+*/
+TEST_STORE (st4_bf16_m2, svbfloat16x4_t, bfloat16_t,
+ svst4_bf16 (p0, x0 - svcnth () * 2, z0),
+ svst4 (p0, x0 - svcnth () * 2, z0))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** st4_bf16_m3:
+** decb x0, all, mul #3
+** st4h {z0\.h - z3\.h}, p0, \[x0\]
+** ret
+*/
+TEST_STORE (st4_bf16_m3, svbfloat16x4_t, bfloat16_t,
+ svst4_bf16 (p0, x0 - svcnth () * 3, z0),
+ svst4 (p0, x0 - svcnth () * 3, z0))
+
+/*
+** st4_bf16_m4:
+** st4h {z0\.h - z3\.h}, p0, \[x0, #-4, mul vl\]
+** ret
+*/
+TEST_STORE (st4_bf16_m4, svbfloat16x4_t, bfloat16_t,
+ svst4_bf16 (p0, x0 - svcnth () * 4, z0),
+ svst4 (p0, x0 - svcnth () * 4, z0))
+
+/*
+** st4_bf16_m32:
+** st4h {z0\.h - z3\.h}, p0, \[x0, #-32, mul vl\]
+** ret
+*/
+TEST_STORE (st4_bf16_m32, svbfloat16x4_t, bfloat16_t,
+ svst4_bf16 (p0, x0 - svcnth () * 32, z0),
+ svst4 (p0, x0 - svcnth () * 32, z0))
+
+/*
+** st4_bf16_m36:
+** [^{]*
+** st4h {z0\.h - z3\.h}, p0, \[x[0-9]+\]
+** ret
+*/
+TEST_STORE (st4_bf16_m36, svbfloat16x4_t, bfloat16_t,
+ svst4_bf16 (p0, x0 - svcnth () * 36, z0),
+ svst4 (p0, x0 - svcnth () * 36, z0))
+
+/*
+** st4_vnum_bf16_0:
+** st4h {z0\.h - z3\.h}, p0, \[x0\]
+** ret
+*/
+TEST_STORE (st4_vnum_bf16_0, svbfloat16x4_t, bfloat16_t,
+ svst4_vnum_bf16 (p0, x0, 0, z0),
+ svst4_vnum (p0, x0, 0, z0))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** st4_vnum_bf16_1:
+** incb x0
+** st4h {z0\.h - z3\.h}, p0, \[x0\]
+** ret
+*/
+TEST_STORE (st4_vnum_bf16_1, svbfloat16x4_t, bfloat16_t,
+ svst4_vnum_bf16 (p0, x0, 1, z0),
+ svst4_vnum (p0, x0, 1, z0))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** st4_vnum_bf16_2:
+** incb x0, all, mul #2
+** st4h {z0\.h - z3\.h}, p0, \[x0\]
+** ret
+*/
+TEST_STORE (st4_vnum_bf16_2, svbfloat16x4_t, bfloat16_t,
+ svst4_vnum_bf16 (p0, x0, 2, z0),
+ svst4_vnum (p0, x0, 2, z0))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** st4_vnum_bf16_3:
+** incb x0, all, mul #3
+** st4h {z0\.h - z3\.h}, p0, \[x0\]
+** ret
+*/
+TEST_STORE (st4_vnum_bf16_3, svbfloat16x4_t, bfloat16_t,
+ svst4_vnum_bf16 (p0, x0, 3, z0),
+ svst4_vnum (p0, x0, 3, z0))
+
+/*
+** st4_vnum_bf16_4:
+** st4h {z0\.h - z3\.h}, p0, \[x0, #4, mul vl\]
+** ret
+*/
+TEST_STORE (st4_vnum_bf16_4, svbfloat16x4_t, bfloat16_t,
+ svst4_vnum_bf16 (p0, x0, 4, z0),
+ svst4_vnum (p0, x0, 4, z0))
+
+/*
+** st4_vnum_bf16_28:
+** st4h {z0\.h - z3\.h}, p0, \[x0, #28, mul vl\]
+** ret
+*/
+TEST_STORE (st4_vnum_bf16_28, svbfloat16x4_t, bfloat16_t,
+ svst4_vnum_bf16 (p0, x0, 28, z0),
+ svst4_vnum (p0, x0, 28, z0))
+
+/*
+** st4_vnum_bf16_32:
+** [^{]*
+** st4h {z0\.h - z3\.h}, p0, \[x[0-9]+\]
+** ret
+*/
+TEST_STORE (st4_vnum_bf16_32, svbfloat16x4_t, bfloat16_t,
+ svst4_vnum_bf16 (p0, x0, 32, z0),
+ svst4_vnum (p0, x0, 32, z0))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** st4_vnum_bf16_m1:
+** decb x0
+** st4h {z0\.h - z3\.h}, p0, \[x0\]
+** ret
+*/
+TEST_STORE (st4_vnum_bf16_m1, svbfloat16x4_t, bfloat16_t,
+ svst4_vnum_bf16 (p0, x0, -1, z0),
+ svst4_vnum (p0, x0, -1, z0))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** st4_vnum_bf16_m2:
+** decb x0, all, mul #2
+** st4h {z0\.h - z3\.h}, p0, \[x0\]
+** ret
+*/
+TEST_STORE (st4_vnum_bf16_m2, svbfloat16x4_t, bfloat16_t,
+ svst4_vnum_bf16 (p0, x0, -2, z0),
+ svst4_vnum (p0, x0, -2, z0))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** st4_vnum_bf16_m3:
+** decb x0, all, mul #3
+** st4h {z0\.h - z3\.h}, p0, \[x0\]
+** ret
+*/
+TEST_STORE (st4_vnum_bf16_m3, svbfloat16x4_t, bfloat16_t,
+ svst4_vnum_bf16 (p0, x0, -3, z0),
+ svst4_vnum (p0, x0, -3, z0))
+
+/*
+** st4_vnum_bf16_m4:
+** st4h {z0\.h - z3\.h}, p0, \[x0, #-4, mul vl\]
+** ret
+*/
+TEST_STORE (st4_vnum_bf16_m4, svbfloat16x4_t, bfloat16_t,
+ svst4_vnum_bf16 (p0, x0, -4, z0),
+ svst4_vnum (p0, x0, -4, z0))
+
+/*
+** st4_vnum_bf16_m32:
+** st4h {z0\.h - z3\.h}, p0, \[x0, #-32, mul vl\]
+** ret
+*/
+TEST_STORE (st4_vnum_bf16_m32, svbfloat16x4_t, bfloat16_t,
+ svst4_vnum_bf16 (p0, x0, -32, z0),
+ svst4_vnum (p0, x0, -32, z0))
+
+/*
+** st4_vnum_bf16_m36:
+** [^{]*
+** st4h {z0\.h - z3\.h}, p0, \[x[0-9]+\]
+** ret
+*/
+TEST_STORE (st4_vnum_bf16_m36, svbfloat16x4_t, bfloat16_t,
+ svst4_vnum_bf16 (p0, x0, -36, z0),
+ svst4_vnum (p0, x0, -36, z0))
+
+/* Using MUL to calculate an index would also be OK. */
+/*
+** st4_vnum_bf16_x1:
+** cntb (x[0-9]+)
+** madd (x[0-9]+), (x1, \1|\1, x1), x0
+** st4h {z0\.h - z3\.h}, p0, \[\2\]
+** ret
+*/
+TEST_STORE (st4_vnum_bf16_x1, svbfloat16x4_t, bfloat16_t,
+ svst4_vnum_bf16 (p0, x0, x1, z0),
+ svst4_vnum (p0, x0, x1, z0))
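
For reference, the _vnum forms tested above step the base address in whole vector lengths, so any in-range multiple folds directly into the st4h addressing mode. A minimal sketch of the equivalence the tests rely on (not from the patch itself; it assumes a BF16-enabled toolchain):

#include <arm_sve.h>

void
store_two_ways (svbool_t pg, bfloat16_t *base, svbfloat16x4_t data)
{
  /* Both stores target base + 4 vector lengths of bfloat16_t elements,
     so both can use the "[x0, #4, mul vl]" form of st4h.  */
  svst4_vnum_bf16 (pg, base, 4, data);
  svst4_bf16 (pg, base + svcnth () * 4, data);
}
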
new file mode 100644
@@ -0,0 +1,158 @@
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */
+
+#include "test_sve_acle.h"
+
+/*
+** stnt1_bf16_base:
+** stnt1h z0\.h, p0, \[x0\]
+** ret
+*/
+TEST_STORE (stnt1_bf16_base, svbfloat16_t, bfloat16_t,
+ svstnt1_bf16 (p0, x0, z0),
+ svstnt1 (p0, x0, z0))
+
+/*
+** stnt1_bf16_index:
+** stnt1h z0\.h, p0, \[x0, x1, lsl 1\]
+** ret
+*/
+TEST_STORE (stnt1_bf16_index, svbfloat16_t, bfloat16_t,
+ svstnt1_bf16 (p0, x0 + x1, z0),
+ svstnt1 (p0, x0 + x1, z0))
+
+/*
+** stnt1_bf16_1:
+** stnt1h z0\.h, p0, \[x0, #1, mul vl\]
+** ret
+*/
+TEST_STORE (stnt1_bf16_1, svbfloat16_t, bfloat16_t,
+ svstnt1_bf16 (p0, x0 + svcnth (), z0),
+ svstnt1 (p0, x0 + svcnth (), z0))
+
+/*
+** stnt1_bf16_7:
+** stnt1h z0\.h, p0, \[x0, #7, mul vl\]
+** ret
+*/
+TEST_STORE (stnt1_bf16_7, svbfloat16_t, bfloat16_t,
+ svstnt1_bf16 (p0, x0 + svcnth () * 7, z0),
+ svstnt1 (p0, x0 + svcnth () * 7, z0))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** stnt1_bf16_8:
+** incb x0, all, mul #8
+** stnt1h z0\.h, p0, \[x0\]
+** ret
+*/
+TEST_STORE (stnt1_bf16_8, svbfloat16_t, bfloat16_t,
+ svstnt1_bf16 (p0, x0 + svcnth () * 8, z0),
+ svstnt1 (p0, x0 + svcnth () * 8, z0))
+
+/*
+** stnt1_bf16_m1:
+** stnt1h z0\.h, p0, \[x0, #-1, mul vl\]
+** ret
+*/
+TEST_STORE (stnt1_bf16_m1, svbfloat16_t, bfloat16_t,
+ svstnt1_bf16 (p0, x0 - svcnth (), z0),
+ svstnt1 (p0, x0 - svcnth (), z0))
+
+/*
+** stnt1_bf16_m8:
+** stnt1h z0\.h, p0, \[x0, #-8, mul vl\]
+** ret
+*/
+TEST_STORE (stnt1_bf16_m8, svbfloat16_t, bfloat16_t,
+ svstnt1_bf16 (p0, x0 - svcnth () * 8, z0),
+ svstnt1 (p0, x0 - svcnth () * 8, z0))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** stnt1_bf16_m9:
+** decb x0, all, mul #9
+** stnt1h z0\.h, p0, \[x0\]
+** ret
+*/
+TEST_STORE (stnt1_bf16_m9, svbfloat16_t, bfloat16_t,
+ svstnt1_bf16 (p0, x0 - svcnth () * 9, z0),
+ svstnt1 (p0, x0 - svcnth () * 9, z0))
+
+/*
+** stnt1_vnum_bf16_0:
+** stnt1h z0\.h, p0, \[x0\]
+** ret
+*/
+TEST_STORE (stnt1_vnum_bf16_0, svbfloat16_t, bfloat16_t,
+ svstnt1_vnum_bf16 (p0, x0, 0, z0),
+ svstnt1_vnum (p0, x0, 0, z0))
+
+/*
+** stnt1_vnum_bf16_1:
+** stnt1h z0\.h, p0, \[x0, #1, mul vl\]
+** ret
+*/
+TEST_STORE (stnt1_vnum_bf16_1, svbfloat16_t, bfloat16_t,
+ svstnt1_vnum_bf16 (p0, x0, 1, z0),
+ svstnt1_vnum (p0, x0, 1, z0))
+
+/*
+** stnt1_vnum_bf16_7:
+** stnt1h z0\.h, p0, \[x0, #7, mul vl\]
+** ret
+*/
+TEST_STORE (stnt1_vnum_bf16_7, svbfloat16_t, bfloat16_t,
+ svstnt1_vnum_bf16 (p0, x0, 7, z0),
+ svstnt1_vnum (p0, x0, 7, z0))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** stnt1_vnum_bf16_8:
+** incb x0, all, mul #8
+** stnt1h z0\.h, p0, \[x0\]
+** ret
+*/
+TEST_STORE (stnt1_vnum_bf16_8, svbfloat16_t, bfloat16_t,
+ svstnt1_vnum_bf16 (p0, x0, 8, z0),
+ svstnt1_vnum (p0, x0, 8, z0))
+
+/*
+** stnt1_vnum_bf16_m1:
+** stnt1h z0\.h, p0, \[x0, #-1, mul vl\]
+** ret
+*/
+TEST_STORE (stnt1_vnum_bf16_m1, svbfloat16_t, bfloat16_t,
+ svstnt1_vnum_bf16 (p0, x0, -1, z0),
+ svstnt1_vnum (p0, x0, -1, z0))
+
+/*
+** stnt1_vnum_bf16_m8:
+** stnt1h z0\.h, p0, \[x0, #-8, mul vl\]
+** ret
+*/
+TEST_STORE (stnt1_vnum_bf16_m8, svbfloat16_t, bfloat16_t,
+ svstnt1_vnum_bf16 (p0, x0, -8, z0),
+ svstnt1_vnum (p0, x0, -8, z0))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** stnt1_vnum_bf16_m9:
+** decb x0, all, mul #9
+** stnt1h z0\.h, p0, \[x0\]
+** ret
+*/
+TEST_STORE (stnt1_vnum_bf16_m9, svbfloat16_t, bfloat16_t,
+ svstnt1_vnum_bf16 (p0, x0, -9, z0),
+ svstnt1_vnum (p0, x0, -9, z0))
+
+/* Using MUL to calculate an index would also be OK. */
+/*
+** stnt1_vnum_bf16_x1:
+** cntb (x[0-9]+)
+** madd (x[0-9]+), (x1, \1|\1, x1), x0
+** stnt1h z0\.h, p0, \[\2\]
+** ret
+*/
+TEST_STORE (stnt1_vnum_bf16_x1, svbfloat16_t, bfloat16_t,
+ svstnt1_vnum_bf16 (p0, x0, x1, z0),
+ svstnt1_vnum (p0, x0, x1, z0))
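
stnt1h is the non-temporal variant of st1h: the same store semantics, plus a hint that the data will not be reused soon. Its reg+imm form covers -8 to 7 vector lengths, which is exactly the boundary the _7/_8 and _m8/_m9 tests probe. A sketch of that boundary (an illustration, not patch code):

#include <arm_sve.h>

void
nt_edge_cases (svbool_t pg, bfloat16_t *base, svbfloat16_t data)
{
  svstnt1_vnum_bf16 (pg, base, 7, data);  /* in range: [x0, #7, mul vl] */
  svstnt1_vnum_bf16 (pg, base, 8, data);  /* out of range: needs separate
                                             address arithmetic first */
}
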
new file mode 100644
@@ -0,0 +1,30 @@
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+/*
+** tbl_bf16_tied1:
+** tbl z0\.h, z0\.h, z4\.h
+** ret
+*/
+TEST_DUAL_Z (tbl_bf16_tied1, svbfloat16_t, svuint16_t,
+ z0 = svtbl_bf16 (z0, z4),
+ z0 = svtbl (z0, z4))
+
+/*
+** tbl_bf16_tied2:
+** tbl z0\.h, z4\.h, z0\.h
+** ret
+*/
+TEST_DUAL_Z_REV (tbl_bf16_tied2, svbfloat16_t, svuint16_t,
+ z0_res = svtbl_bf16 (z4, z0),
+ z0_res = svtbl (z4, z0))
+
+/*
+** tbl_bf16_untied:
+** tbl z0\.h, z1\.h, z4\.h
+** ret
+*/
+TEST_DUAL_Z (tbl_bf16_untied, svbfloat16_t, svuint16_t,
+ z0 = svtbl_bf16 (z1, z4),
+ z0 = svtbl (z1, z4))
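
svtbl is a table lookup: each result element is taken from the first operand at the position given by the corresponding unsigned index, and out-of-range indices read as zero. A small usage sketch using the standard svindex_u16 intrinsic (illustrative only):

#include <arm_sve.h>

/* Gather lanes 0, 2, 4, ... of DATA; indices past the vector length
   produce zero in the upper part of the result.  */
svbfloat16_t
even_lanes (svbfloat16_t data)
{
  return svtbl_bf16 (data, svindex_u16 (0, 2));
}
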
new file mode 100644
@@ -0,0 +1,30 @@
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+/*
+** trn1_bf16_tied1:
+** trn1 z0\.h, z0\.h, z1\.h
+** ret
+*/
+TEST_UNIFORM_Z (trn1_bf16_tied1, svbfloat16_t,
+ z0 = svtrn1_bf16 (z0, z1),
+ z0 = svtrn1 (z0, z1))
+
+/*
+** trn1_bf16_tied2:
+** trn1 z0\.h, z1\.h, z0\.h
+** ret
+*/
+TEST_UNIFORM_Z (trn1_bf16_tied2, svbfloat16_t,
+ z0 = svtrn1_bf16 (z1, z0),
+ z0 = svtrn1 (z1, z0))
+
+/*
+** trn1_bf16_untied:
+** trn1 z0\.h, z1\.h, z2\.h
+** ret
+*/
+TEST_UNIFORM_Z (trn1_bf16_untied, svbfloat16_t,
+ z0 = svtrn1_bf16 (z1, z2),
+ z0 = svtrn1 (z1, z2))
new file mode 100644
@@ -0,0 +1,32 @@
+/* { dg-require-effective-target aarch64_asm_f64mm_ok } */
+/* { dg-additional-options "-march=armv8.2-a+f64mm" } */
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+/*
+** trn1q_bf16_tied1:
+** trn1 z0\.q, z0\.q, z1\.q
+** ret
+*/
+TEST_UNIFORM_Z (trn1q_bf16_tied1, svbfloat16_t,
+ z0 = svtrn1q_bf16 (z0, z1),
+ z0 = svtrn1q (z0, z1))
+
+/*
+** trn1q_bf16_tied2:
+** trn1 z0\.q, z1\.q, z0\.q
+** ret
+*/
+TEST_UNIFORM_Z (trn1q_bf16_tied2, svbfloat16_t,
+ z0 = svtrn1q_bf16 (z1, z0),
+ z0 = svtrn1q (z1, z0))
+
+/*
+** trn1q_bf16_untied:
+** trn1 z0\.q, z1\.q, z2\.q
+** ret
+*/
+TEST_UNIFORM_Z (trn1q_bf16_untied, svbfloat16_t,
+ z0 = svtrn1q_bf16 (z1, z2),
+ z0 = svtrn1q (z1, z2))
new file mode 100644
@@ -0,0 +1,30 @@
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+/*
+** trn2_bf16_tied1:
+** trn2 z0\.h, z0\.h, z1\.h
+** ret
+*/
+TEST_UNIFORM_Z (trn2_bf16_tied1, svbfloat16_t,
+ z0 = svtrn2_bf16 (z0, z1),
+ z0 = svtrn2 (z0, z1))
+
+/*
+** trn2_bf16_tied2:
+** trn2 z0\.h, z1\.h, z0\.h
+** ret
+*/
+TEST_UNIFORM_Z (trn2_bf16_tied2, svbfloat16_t,
+ z0 = svtrn2_bf16 (z1, z0),
+ z0 = svtrn2 (z1, z0))
+
+/*
+** trn2_bf16_untied:
+** trn2 z0\.h, z1\.h, z2\.h
+** ret
+*/
+TEST_UNIFORM_Z (trn2_bf16_untied, svbfloat16_t,
+ z0 = svtrn2_bf16 (z1, z2),
+ z0 = svtrn2 (z1, z2))
new file mode 100644
@@ -0,0 +1,32 @@
+/* { dg-require-effective-target aarch64_asm_f64mm_ok } */
+/* { dg-additional-options "-march=armv8.2-a+f64mm" } */
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+/*
+** trn2q_bf16_tied1:
+** trn2 z0\.q, z0\.q, z1\.q
+** ret
+*/
+TEST_UNIFORM_Z (trn2q_bf16_tied1, svbfloat16_t,
+ z0 = svtrn2q_bf16 (z0, z1),
+ z0 = svtrn2q (z0, z1))
+
+/*
+** trn2q_bf16_tied2:
+** trn2 z0\.q, z1\.q, z0\.q
+** ret
+*/
+TEST_UNIFORM_Z (trn2q_bf16_tied2, svbfloat16_t,
+ z0 = svtrn2q_bf16 (z1, z0),
+ z0 = svtrn2q (z1, z0))
+
+/*
+** trn2q_bf16_untied:
+** trn2 z0\.q, z1\.q, z2\.q
+** ret
+*/
+TEST_UNIFORM_Z (trn2q_bf16_untied, svbfloat16_t,
+ z0 = svtrn2q_bf16 (z1, z2),
+ z0 = svtrn2q (z1, z2))
@@ -37,6 +37,13 @@ TEST_UNDEF (uint16, svuint16x2_t,
TEST_UNDEF (float16, svfloat16x2_t,
z0 = svundef2_f16 ())
+/*
+** bfloat16:
+** ret
+*/
+TEST_UNDEF (bfloat16, svbfloat16x2_t,
+ z0 = svundef2_bf16 ())
+
/*
** int32:
** ret
@@ -37,6 +37,13 @@ TEST_UNDEF (uint16, svuint16x3_t,
TEST_UNDEF (float16, svfloat16x3_t,
z0 = svundef3_f16 ())
+/*
+** bfloat16:
+** ret
+*/
+TEST_UNDEF (bfloat16, svbfloat16x3_t,
+ z0 = svundef3_bf16 ())
+
/*
** int32:
** ret
@@ -37,6 +37,13 @@ TEST_UNDEF (uint16, svuint16x4_t,
TEST_UNDEF (float16, svfloat16x4_t,
z0 = svundef4_f16 ())
+/*
+** bfloat16:
+** ret
+*/
+TEST_UNDEF (bfloat16, svbfloat16x4_t,
+ z0 = svundef4_bf16 ())
+
/*
** int32:
** ret
@@ -37,6 +37,13 @@ TEST_UNDEF (uint16, svuint16_t,
TEST_UNDEF (float16, svfloat16_t,
z0 = svundef_f16 ())
+/*
+** bfloat16:
+** ret
+*/
+TEST_UNDEF (bfloat16, svbfloat16_t,
+ z0 = svundef_bf16 ())
+
/*
** int32:
** ret
new file mode 100644
@@ -0,0 +1,30 @@
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+/*
+** uzp1_bf16_tied1:
+** uzp1 z0\.h, z0\.h, z1\.h
+** ret
+*/
+TEST_UNIFORM_Z (uzp1_bf16_tied1, svbfloat16_t,
+ z0 = svuzp1_bf16 (z0, z1),
+ z0 = svuzp1 (z0, z1))
+
+/*
+** uzp1_bf16_tied2:
+** uzp1 z0\.h, z1\.h, z0\.h
+** ret
+*/
+TEST_UNIFORM_Z (uzp1_bf16_tied2, svbfloat16_t,
+ z0 = svuzp1_bf16 (z1, z0),
+ z0 = svuzp1 (z1, z0))
+
+/*
+** uzp1_bf16_untied:
+** uzp1 z0\.h, z1\.h, z2\.h
+** ret
+*/
+TEST_UNIFORM_Z (uzp1_bf16_untied, svbfloat16_t,
+ z0 = svuzp1_bf16 (z1, z2),
+ z0 = svuzp1 (z1, z2))
new file mode 100644
@@ -0,0 +1,32 @@
+/* { dg-require-effective-target aarch64_asm_f64mm_ok } */
+/* { dg-additional-options "-march=armv8.2-a+f64mm" } */
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+/*
+** uzp1q_bf16_tied1:
+** uzp1 z0\.q, z0\.q, z1\.q
+** ret
+*/
+TEST_UNIFORM_Z (uzp1q_bf16_tied1, svbfloat16_t,
+ z0 = svuzp1q_bf16 (z0, z1),
+ z0 = svuzp1q (z0, z1))
+
+/*
+** uzp1q_bf16_tied2:
+** uzp1 z0\.q, z1\.q, z0\.q
+** ret
+*/
+TEST_UNIFORM_Z (uzp1q_bf16_tied2, svbfloat16_t,
+ z0 = svuzp1q_bf16 (z1, z0),
+ z0 = svuzp1q (z1, z0))
+
+/*
+** uzp1q_bf16_untied:
+** uzp1 z0\.q, z1\.q, z2\.q
+** ret
+*/
+TEST_UNIFORM_Z (uzp1q_bf16_untied, svbfloat16_t,
+ z0 = svuzp1q_bf16 (z1, z2),
+ z0 = svuzp1q (z1, z2))
new file mode 100644
@@ -0,0 +1,30 @@
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+/*
+** uzp2_bf16_tied1:
+** uzp2 z0\.h, z0\.h, z1\.h
+** ret
+*/
+TEST_UNIFORM_Z (uzp2_bf16_tied1, svbfloat16_t,
+ z0 = svuzp2_bf16 (z0, z1),
+ z0 = svuzp2 (z0, z1))
+
+/*
+** uzp2_bf16_tied2:
+** uzp2 z0\.h, z1\.h, z0\.h
+** ret
+*/
+TEST_UNIFORM_Z (uzp2_bf16_tied2, svbfloat16_t,
+ z0 = svuzp2_bf16 (z1, z0),
+ z0 = svuzp2 (z1, z0))
+
+/*
+** uzp2_bf16_untied:
+** uzp2 z0\.h, z1\.h, z2\.h
+** ret
+*/
+TEST_UNIFORM_Z (uzp2_bf16_untied, svbfloat16_t,
+ z0 = svuzp2_bf16 (z1, z2),
+ z0 = svuzp2 (z1, z2))
new file mode 100644
@@ -0,0 +1,32 @@
+/* { dg-require-effective-target aarch64_asm_f64mm_ok } */
+/* { dg-additional-options "-march=armv8.2-a+f64mm" } */
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+/*
+** uzp2q_bf16_tied1:
+** uzp2 z0\.q, z0\.q, z1\.q
+** ret
+*/
+TEST_UNIFORM_Z (uzp2q_bf16_tied1, svbfloat16_t,
+ z0 = svuzp2q_bf16 (z0, z1),
+ z0 = svuzp2q (z0, z1))
+
+/*
+** uzp2q_bf16_tied2:
+** uzp2 z0\.q, z1\.q, z0\.q
+** ret
+*/
+TEST_UNIFORM_Z (uzp2q_bf16_tied2, svbfloat16_t,
+ z0 = svuzp2q_bf16 (z1, z0),
+ z0 = svuzp2q (z1, z0))
+
+/*
+** uzp2q_bf16_untied:
+** uzp2 z0\.q, z1\.q, z2\.q
+** ret
+*/
+TEST_UNIFORM_Z (uzp2q_bf16_untied, svbfloat16_t,
+ z0 = svuzp2q_bf16 (z1, z2),
+ z0 = svuzp2q (z1, z2))
new file mode 100644
@@ -0,0 +1,30 @@
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+/*
+** zip1_bf16_tied1:
+** zip1 z0\.h, z0\.h, z1\.h
+** ret
+*/
+TEST_UNIFORM_Z (zip1_bf16_tied1, svbfloat16_t,
+ z0 = svzip1_bf16 (z0, z1),
+ z0 = svzip1 (z0, z1))
+
+/*
+** zip1_bf16_tied2:
+** zip1 z0\.h, z1\.h, z0\.h
+** ret
+*/
+TEST_UNIFORM_Z (zip1_bf16_tied2, svbfloat16_t,
+ z0 = svzip1_bf16 (z1, z0),
+ z0 = svzip1 (z1, z0))
+
+/*
+** zip1_bf16_untied:
+** zip1 z0\.h, z1\.h, z2\.h
+** ret
+*/
+TEST_UNIFORM_Z (zip1_bf16_untied, svbfloat16_t,
+ z0 = svzip1_bf16 (z1, z2),
+ z0 = svzip1 (z1, z2))
new file mode 100644
@@ -0,0 +1,32 @@
+/* { dg-require-effective-target aarch64_asm_f64mm_ok } */
+/* { dg-additional-options "-march=armv8.2-a+f64mm" } */
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+/*
+** zip1q_bf16_tied1:
+** zip1 z0\.q, z0\.q, z1\.q
+** ret
+*/
+TEST_UNIFORM_Z (zip1q_bf16_tied1, svbfloat16_t,
+ z0 = svzip1q_bf16 (z0, z1),
+ z0 = svzip1q (z0, z1))
+
+/*
+** zip1q_bf16_tied2:
+** zip1 z0\.q, z1\.q, z0\.q
+** ret
+*/
+TEST_UNIFORM_Z (zip1q_bf16_tied2, svbfloat16_t,
+ z0 = svzip1q_bf16 (z1, z0),
+ z0 = svzip1q (z1, z0))
+
+/*
+** zip1q_bf16_untied:
+** zip1 z0\.q, z1\.q, z2\.q
+** ret
+*/
+TEST_UNIFORM_Z (zip1q_bf16_untied, svbfloat16_t,
+ z0 = svzip1q_bf16 (z1, z2),
+ z0 = svzip1q (z1, z2))
new file mode 100644
@@ -0,0 +1,30 @@
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+/*
+** zip2_bf16_tied1:
+** zip2 z0\.h, z0\.h, z1\.h
+** ret
+*/
+TEST_UNIFORM_Z (zip2_bf16_tied1, svbfloat16_t,
+ z0 = svzip2_bf16 (z0, z1),
+ z0 = svzip2 (z0, z1))
+
+/*
+** zip2_bf16_tied2:
+** zip2 z0\.h, z1\.h, z0\.h
+** ret
+*/
+TEST_UNIFORM_Z (zip2_bf16_tied2, svbfloat16_t,
+ z0 = svzip2_bf16 (z1, z0),
+ z0 = svzip2 (z1, z0))
+
+/*
+** zip2_bf16_untied:
+** zip2 z0\.h, z1\.h, z2\.h
+** ret
+*/
+TEST_UNIFORM_Z (zip2_bf16_untied, svbfloat16_t,
+ z0 = svzip2_bf16 (z1, z2),
+ z0 = svzip2 (z1, z2))
new file mode 100644
@@ -0,0 +1,32 @@
+/* { dg-require-effective-target aarch64_asm_f64mm_ok } */
+/* { dg-additional-options "-march=armv8.2-a+f64mm" } */
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+/*
+** zip2q_bf16_tied1:
+** zip2 z0\.q, z0\.q, z1\.q
+** ret
+*/
+TEST_UNIFORM_Z (zip2q_bf16_tied1, svbfloat16_t,
+ z0 = svzip2q_bf16 (z0, z1),
+ z0 = svzip2q (z0, z1))
+
+/*
+** zip2q_bf16_tied2:
+** zip2 z0\.q, z1\.q, z0\.q
+** ret
+*/
+TEST_UNIFORM_Z (zip2q_bf16_tied2, svbfloat16_t,
+ z0 = svzip2q_bf16 (z1, z0),
+ z0 = svzip2q (z1, z0))
+
+/*
+** zip2q_bf16_untied:
+** zip2 z0\.q, z1\.q, z2\.q
+** ret
+*/
+TEST_UNIFORM_Z (zip2q_bf16_untied, svbfloat16_t,
+ z0 = svzip2q_bf16 (z1, z2),
+ z0 = svzip2q (z1, z2))
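
Note the split between the two groups above: the .h-element forms of trn/uzp/zip are base SVE, whereas the .q forms permute 128-bit quadwords and come from the F64MM extension, hence the extra dg-require/-march lines on those tests. The element-level semantics, sketched for the zip pair (illustration, not patch code):

#include <arm_sve.h>

void
interleave (svbfloat16_t a, svbfloat16_t b,
            svbfloat16_t *lo, svbfloat16_t *hi)
{
  *lo = svzip1_bf16 (a, b);  /* a0 b0 a1 b1 ... from the low halves    */
  *hi = svzip2_bf16 (a, b);  /* the same interleave of the high halves */
}
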
@@ -12,6 +12,7 @@ svuint8_t ret_u8 (void) { return svdup_u8 (0); }
svuint16_t ret_u16 (void) { return svdup_u16 (0); }
svuint32_t ret_u32 (void) { return svdup_u32 (0); }
svuint64_t ret_u64 (void) { return svdup_u64 (0); }
+svbfloat16_t ret_bf16 (void) { return svundef_bf16 (); }
svfloat16_t ret_f16 (void) { return svdup_f16 (0); }
svfloat32_t ret_f32 (void) { return svdup_f32 (0); }
svfloat64_t ret_f64 (void) { return svdup_f64 (0); }
@@ -24,6 +25,7 @@ svuint8x2_t ret_u8x2 (void) { return svundef2_u8 (); }
svuint16x2_t ret_u16x2 (void) { return svundef2_u16 (); }
svuint32x2_t ret_u32x2 (void) { return svundef2_u32 (); }
svuint64x2_t ret_u64x2 (void) { return svundef2_u64 (); }
+svbfloat16x2_t ret_bf16x2 (void) { return svundef2_bf16 (); }
svfloat16x2_t ret_f16x2 (void) { return svundef2_f16 (); }
svfloat32x2_t ret_f32x2 (void) { return svundef2_f32 (); }
svfloat64x2_t ret_f64x2 (void) { return svundef2_f64 (); }
@@ -36,6 +38,7 @@ svuint8x3_t ret_u8x3 (void) { return svundef3_u8 (); }
svuint16x3_t ret_u16x3 (void) { return svundef3_u16 (); }
svuint32x3_t ret_u32x3 (void) { return svundef3_u32 (); }
svuint64x3_t ret_u64x3 (void) { return svundef3_u64 (); }
+svbfloat16x3_t ret_bf16x3 (void) { return svundef3_bf16 (); }
svfloat16x3_t ret_f16x3 (void) { return svundef3_f16 (); }
svfloat32x3_t ret_f32x3 (void) { return svundef3_f32 (); }
svfloat64x3_t ret_f64x3 (void) { return svundef3_f64 (); }
@@ -48,6 +51,7 @@ svuint8x4_t ret_u8x4 (void) { return svundef4_u8 (); }
svuint16x4_t ret_u16x4 (void) { return svundef4_u16 (); }
svuint32x4_t ret_u32x4 (void) { return svundef4_u32 (); }
svuint64x4_t ret_u64x4 (void) { return svundef4_u64 (); }
+svbfloat16x4_t ret_bf16x4 (void) { return svundef4_bf16 (); }
svfloat16x4_t ret_f16x4 (void) { return svundef4_f16 (); }
svfloat32x4_t ret_f32x4 (void) { return svundef4_f32 (); }
svfloat64x4_t ret_f64x4 (void) { return svundef4_f64 (); }
@@ -62,6 +66,7 @@ svfloat64x4_t ret_f64x4 (void) { return svundef4_f64 (); }
/* { dg-final { scan-assembler {\t\.variant_pcs\tret_u16\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tret_u32\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tret_u64\n} } } */
+/* { dg-final { scan-assembler {\t\.variant_pcs\tret_bf16\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tret_f16\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tret_f32\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tret_f64\n} } } */
@@ -74,6 +79,7 @@ svfloat64x4_t ret_f64x4 (void) { return svundef4_f64 (); }
/* { dg-final { scan-assembler {\t\.variant_pcs\tret_u16x2\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tret_u32x2\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tret_u64x2\n} } } */
+/* { dg-final { scan-assembler {\t\.variant_pcs\tret_bf16x2\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tret_f16x2\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tret_f32x2\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tret_f64x2\n} } } */
@@ -87,6 +93,7 @@ svfloat64x4_t ret_f64x4 (void) { return svundef4_f64 (); }
/* { dg-final { scan-assembler {\t\.variant_pcs\tret_u16x3\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tret_u32x3\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tret_u64x3\n} } } */
+/* { dg-final { scan-assembler {\t\.variant_pcs\tret_bf16x3\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tret_f16x3\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tret_f32x3\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tret_f64x3\n} } } */
@@ -99,6 +106,7 @@ svfloat64x4_t ret_f64x4 (void) { return svundef4_f64 (); }
/* { dg-final { scan-assembler {\t\.variant_pcs\tret_u16x4\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tret_u32x4\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tret_u64x4\n} } } */
+/* { dg-final { scan-assembler {\t\.variant_pcs\tret_bf16x4\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tret_f16x4\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tret_f32x4\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tret_f64x4\n} } } */
@@ -12,6 +12,7 @@ void fn_u8 (svuint8_t x) {}
void fn_u16 (svuint16_t x) {}
void fn_u32 (svuint32_t x) {}
void fn_u64 (svuint64_t x) {}
+void fn_bf16 (svbfloat16_t x) {}
void fn_f16 (svfloat16_t x) {}
void fn_f32 (svfloat32_t x) {}
void fn_f64 (svfloat64_t x) {}
@@ -24,6 +25,7 @@ void fn_u8x2 (svuint8x2_t x) {}
void fn_u16x2 (svuint16x2_t x) {}
void fn_u32x2 (svuint32x2_t x) {}
void fn_u64x2 (svuint64x2_t x) {}
+void fn_bf16x2 (svbfloat16x2_t x) {}
void fn_f16x2 (svfloat16x2_t x) {}
void fn_f32x2 (svfloat32x2_t x) {}
void fn_f64x2 (svfloat64x2_t x) {}
@@ -36,6 +38,7 @@ void fn_u8x3 (svuint8x3_t x) {}
void fn_u16x3 (svuint16x3_t x) {}
void fn_u32x3 (svuint32x3_t x) {}
void fn_u64x3 (svuint64x3_t x) {}
+void fn_bf16x3 (svbfloat16x3_t x) {}
void fn_f16x3 (svfloat16x3_t x) {}
void fn_f32x3 (svfloat32x3_t x) {}
void fn_f64x3 (svfloat64x3_t x) {}
@@ -48,6 +51,7 @@ void fn_u8x4 (svuint8x4_t x) {}
void fn_u16x4 (svuint16x4_t x) {}
void fn_u32x4 (svuint32x4_t x) {}
void fn_u64x4 (svuint64x4_t x) {}
+void fn_bf16x4 (svbfloat16x4_t x) {}
void fn_f16x4 (svfloat16x4_t x) {}
void fn_f32x4 (svfloat32x4_t x) {}
void fn_f64x4 (svfloat64x4_t x) {}
@@ -62,6 +66,7 @@ void fn_f64x4 (svfloat64x4_t x) {}
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u16\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u32\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u64\n} } } */
+/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_bf16\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f16\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f32\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f64\n} } } */
@@ -74,6 +79,7 @@ void fn_f64x4 (svfloat64x4_t x) {}
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u16x2\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u32x2\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u64x2\n} } } */
+/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_bf16x2\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f16x2\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f32x2\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f64x2\n} } } */
@@ -86,6 +92,7 @@ void fn_f64x4 (svfloat64x4_t x) {}
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u16x3\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u32x3\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u64x3\n} } } */
+/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_bf16x3\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f16x3\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f32x3\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f64x3\n} } } */
@@ -98,6 +105,7 @@ void fn_f64x4 (svfloat64x4_t x) {}
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u16x4\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u32x4\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u64x4\n} } } */
+/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_bf16x4\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f16x4\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f32x4\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f64x4\n} } } */
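
The directive being scanned for here marks symbols that follow a variant procedure call standard: any function that takes or returns an SVE value in registers uses the SVE vector PCS, and GCC emits .variant_pcs for it so that linkers know the base-PCS assumptions (for example around lazy PLT resolution) do not apply. The minimal trigger, sketched (assuming a BF16-enabled toolchain):

#include <arm_sve.h>

/* An SVE vector argument or return value is enough to get
   ".variant_pcs keep" on the symbol.  */
svbfloat16_t
keep (svbfloat16_t x)
{
  return x;
}
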
@@ -10,6 +10,7 @@ void fn_u8 (float d0, float d1, float d2, float d3, svuint8_t x) {}
void fn_u16 (float d0, float d1, float d2, float d3, svuint16_t x) {}
void fn_u32 (float d0, float d1, float d2, float d3, svuint32_t x) {}
void fn_u64 (float d0, float d1, float d2, float d3, svuint64_t x) {}
+void fn_bf16 (float d0, float d1, float d2, float d3, svbfloat16_t x) {}
void fn_f16 (float d0, float d1, float d2, float d3, svfloat16_t x) {}
void fn_f32 (float d0, float d1, float d2, float d3, svfloat32_t x) {}
void fn_f64 (float d0, float d1, float d2, float d3, svfloat64_t x) {}
@@ -22,6 +23,7 @@ void fn_u8x2 (float d0, float d1, float d2, float d3, svuint8x2_t x) {}
void fn_u16x2 (float d0, float d1, float d2, float d3, svuint16x2_t x) {}
void fn_u32x2 (float d0, float d1, float d2, float d3, svuint32x2_t x) {}
void fn_u64x2 (float d0, float d1, float d2, float d3, svuint64x2_t x) {}
+void fn_bf16x2 (float d0, float d1, float d2, float d3, svbfloat16x2_t x) {}
void fn_f16x2 (float d0, float d1, float d2, float d3, svfloat16x2_t x) {}
void fn_f32x2 (float d0, float d1, float d2, float d3, svfloat32x2_t x) {}
void fn_f64x2 (float d0, float d1, float d2, float d3, svfloat64x2_t x) {}
@@ -34,6 +36,7 @@ void fn_u8x3 (float d0, float d1, float d2, float d3, svuint8x3_t x) {}
void fn_u16x3 (float d0, float d1, float d2, float d3, svuint16x3_t x) {}
void fn_u32x3 (float d0, float d1, float d2, float d3, svuint32x3_t x) {}
void fn_u64x3 (float d0, float d1, float d2, float d3, svuint64x3_t x) {}
+void fn_bf16x3 (float d0, float d1, float d2, float d3, svbfloat16x3_t x) {}
void fn_f16x3 (float d0, float d1, float d2, float d3, svfloat16x3_t x) {}
void fn_f32x3 (float d0, float d1, float d2, float d3, svfloat32x3_t x) {}
void fn_f64x3 (float d0, float d1, float d2, float d3, svfloat64x3_t x) {}
@@ -46,6 +49,7 @@ void fn_u8x4 (float d0, float d1, float d2, float d3, svuint8x4_t x) {}
void fn_u16x4 (float d0, float d1, float d2, float d3, svuint16x4_t x) {}
void fn_u32x4 (float d0, float d1, float d2, float d3, svuint32x4_t x) {}
void fn_u64x4 (float d0, float d1, float d2, float d3, svuint64x4_t x) {}
+void fn_bf16x4 (float d0, float d1, float d2, float d3, svbfloat16x4_t x) {}
void fn_f16x4 (float d0, float d1, float d2, float d3, svfloat16x4_t x) {}
void fn_f32x4 (float d0, float d1, float d2, float d3, svfloat32x4_t x) {}
void fn_f64x4 (float d0, float d1, float d2, float d3, svfloat64x4_t x) {}
@@ -58,6 +62,7 @@ void fn_f64x4 (float d0, float d1, float d2, float d3, svfloat64x4_t x) {}
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u16\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u32\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u64\n} } } */
+/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_bf16\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f16\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f32\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f64\n} } } */
@@ -70,6 +75,7 @@ void fn_f64x4 (float d0, float d1, float d2, float d3, svfloat64x4_t x) {}
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u16x2\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u32x2\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u64x2\n} } } */
+/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_bf16x2\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f16x2\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f32x2\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f64x2\n} } } */
@@ -82,6 +88,7 @@ void fn_f64x4 (float d0, float d1, float d2, float d3, svfloat64x4_t x) {}
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u16x3\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u32x3\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u64x3\n} } } */
+/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_bf16x3\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f16x3\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f32x3\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f64x3\n} } } */
@@ -94,6 +101,7 @@ void fn_f64x4 (float d0, float d1, float d2, float d3, svfloat64x4_t x) {}
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u16x4\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u32x4\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u64x4\n} } } */
+/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_bf16x4\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f16x4\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f32x4\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f64x4\n} } } */
@@ -18,6 +18,8 @@ void fn_u32 (float d0, float d1, float d2, float d3,
float d4, svuint32_t x) {}
void fn_u64 (float d0, float d1, float d2, float d3,
float d4, svuint64_t x) {}
+void fn_bf16 (float d0, float d1, float d2, float d3,
+ float d4, svbfloat16_t x) {}
void fn_f16 (float d0, float d1, float d2, float d3,
float d4, svfloat16_t x) {}
void fn_f32 (float d0, float d1, float d2, float d3,
@@ -41,6 +43,8 @@ void fn_u32x2 (float d0, float d1, float d2, float d3,
float d4, svuint32x2_t x) {}
void fn_u64x2 (float d0, float d1, float d2, float d3,
float d4, svuint64x2_t x) {}
+void fn_bf16x2 (float d0, float d1, float d2, float d3,
+ float d4, svbfloat16x2_t x) {}
void fn_f16x2 (float d0, float d1, float d2, float d3,
float d4, svfloat16x2_t x) {}
void fn_f32x2 (float d0, float d1, float d2, float d3,
@@ -64,6 +68,8 @@ void fn_u32x3 (float d0, float d1, float d2, float d3,
float d4, svuint32x3_t x) {}
void fn_u64x3 (float d0, float d1, float d2, float d3,
float d4, svuint64x3_t x) {}
+void fn_bf16x3 (float d0, float d1, float d2, float d3,
+ float d4, svbfloat16x3_t x) {}
void fn_f16x3 (float d0, float d1, float d2, float d3,
float d4, svfloat16x3_t x) {}
void fn_f32x3 (float d0, float d1, float d2, float d3,
@@ -87,6 +93,8 @@ void fn_u32x4 (float d0, float d1, float d2, float d3,
float d4, svuint32x4_t x) {}
void fn_u64x4 (float d0, float d1, float d2, float d3,
float d4, svuint64x4_t x) {}
+void fn_bf16x4 (float d0, float d1, float d2, float d3,
+ float d4, svbfloat16x4_t x) {}
void fn_f16x4 (float d0, float d1, float d2, float d3,
float d4, svfloat16x4_t x) {}
void fn_f32x4 (float d0, float d1, float d2, float d3,
@@ -102,6 +110,7 @@ void fn_f64x4 (float d0, float d1, float d2, float d3,
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u16\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u32\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u64\n} } } */
+/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_bf16\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f16\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f32\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f64\n} } } */
@@ -114,6 +123,7 @@ void fn_f64x4 (float d0, float d1, float d2, float d3,
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u16x2\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u32x2\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u64x2\n} } } */
+/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_bf16x2\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f16x2\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f32x2\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f64x2\n} } } */
@@ -126,6 +136,7 @@ void fn_f64x4 (float d0, float d1, float d2, float d3,
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u16x3\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u32x3\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u64x3\n} } } */
+/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_bf16x3\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f16x3\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f32x3\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f64x3\n} } } */
@@ -138,6 +149,7 @@ void fn_f64x4 (float d0, float d1, float d2, float d3,
/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_u16x4\n} } } */
/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_u32x4\n} } } */
/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_u64x4\n} } } */
+/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_bf16x4\n} } } */
/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_f16x4\n} } } */
/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_f32x4\n} } } */
/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_f64x4\n} } } */
@@ -18,6 +18,8 @@ void fn_u32 (float d0, float d1, float d2, float d3,
float d4, float d5, svuint32_t x) {}
void fn_u64 (float d0, float d1, float d2, float d3,
float d4, float d5, svuint64_t x) {}
+void fn_bf16 (float d0, float d1, float d2, float d3,
+ float d4, float d5, svbfloat16_t x) {}
void fn_f16 (float d0, float d1, float d2, float d3,
float d4, float d5, svfloat16_t x) {}
void fn_f32 (float d0, float d1, float d2, float d3,
@@ -41,6 +43,8 @@ void fn_u32x2 (float d0, float d1, float d2, float d3,
float d4, float d5, svuint32x2_t x) {}
void fn_u64x2 (float d0, float d1, float d2, float d3,
float d4, float d5, svuint64x2_t x) {}
+void fn_bf16x2 (float d0, float d1, float d2, float d3,
+ float d4, float d5, svbfloat16x2_t x) {}
void fn_f16x2 (float d0, float d1, float d2, float d3,
float d4, float d5, svfloat16x2_t x) {}
void fn_f32x2 (float d0, float d1, float d2, float d3,
@@ -64,6 +68,8 @@ void fn_u32x3 (float d0, float d1, float d2, float d3,
float d4, float d5, svuint32x3_t x) {}
void fn_u64x3 (float d0, float d1, float d2, float d3,
float d4, float d5, svuint64x3_t x) {}
+void fn_bf16x3 (float d0, float d1, float d2, float d3,
+ float d4, float d5, svbfloat16x3_t x) {}
void fn_f16x3 (float d0, float d1, float d2, float d3,
float d4, float d5, svfloat16x3_t x) {}
void fn_f32x3 (float d0, float d1, float d2, float d3,
@@ -87,6 +93,8 @@ void fn_u32x4 (float d0, float d1, float d2, float d3,
float d4, float d5, svuint32x4_t x) {}
void fn_u64x4 (float d0, float d1, float d2, float d3,
float d4, float d5, svuint64x4_t x) {}
+void fn_bf16x4 (float d0, float d1, float d2, float d3,
+ float d4, float d5, svbfloat16x4_t x) {}
void fn_f16x4 (float d0, float d1, float d2, float d3,
float d4, float d5, svfloat16x4_t x) {}
void fn_f32x4 (float d0, float d1, float d2, float d3,
@@ -102,6 +110,7 @@ void fn_f64x4 (float d0, float d1, float d2, float d3,
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u16\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u32\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u64\n} } } */
+/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_bf16\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f16\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f32\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f64\n} } } */
@@ -114,6 +123,7 @@ void fn_f64x4 (float d0, float d1, float d2, float d3,
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u16x2\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u32x2\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u64x2\n} } } */
+/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_bf16x2\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f16x2\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f32x2\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f64x2\n} } } */
@@ -126,6 +136,7 @@ void fn_f64x4 (float d0, float d1, float d2, float d3,
/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_u16x3\n} } } */
/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_u32x3\n} } } */
/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_u64x3\n} } } */
+/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_bf16x3\n} } } */
/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_f16x3\n} } } */
/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_f32x3\n} } } */
/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_f64x3\n} } } */
@@ -138,6 +149,7 @@ void fn_f64x4 (float d0, float d1, float d2, float d3,
/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_u16x4\n} } } */
/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_u32x4\n} } } */
/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_u64x4\n} } } */
+/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_bf16x4\n} } } */
/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_f16x4\n} } } */
/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_f32x4\n} } } */
/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_f64x4\n} } } */
@@ -18,6 +18,8 @@ void fn_u32 (float d0, float d1, float d2, float d3,
float d4, float d5, float d6, svuint32_t x) {}
void fn_u64 (float d0, float d1, float d2, float d3,
float d4, float d5, float d6, svuint64_t x) {}
+void fn_bf16 (float d0, float d1, float d2, float d3,
+ float d4, float d5, float d6, svbfloat16_t x) {}
void fn_f16 (float d0, float d1, float d2, float d3,
float d4, float d5, float d6, svfloat16_t x) {}
void fn_f32 (float d0, float d1, float d2, float d3,
@@ -41,6 +43,8 @@ void fn_u32x2 (float d0, float d1, float d2, float d3,
float d4, float d5, float d6, svuint32x2_t x) {}
void fn_u64x2 (float d0, float d1, float d2, float d3,
float d4, float d5, float d6, svuint64x2_t x) {}
+void fn_bf16x2 (float d0, float d1, float d2, float d3,
+ float d4, float d5, float d6, svbfloat16x2_t x) {}
void fn_f16x2 (float d0, float d1, float d2, float d3,
float d4, float d5, float d6, svfloat16x2_t x) {}
void fn_f32x2 (float d0, float d1, float d2, float d3,
@@ -64,6 +68,8 @@ void fn_u32x3 (float d0, float d1, float d2, float d3,
float d4, float d5, float d6, svuint32x3_t x) {}
void fn_u64x3 (float d0, float d1, float d2, float d3,
float d4, float d5, float d6, svuint64x3_t x) {}
+void fn_bf16x3 (float d0, float d1, float d2, float d3,
+ float d4, float d5, float d6, svbfloat16x3_t x) {}
void fn_f16x3 (float d0, float d1, float d2, float d3,
float d4, float d5, float d6, svfloat16x3_t x) {}
void fn_f32x3 (float d0, float d1, float d2, float d3,
@@ -87,6 +93,8 @@ void fn_u32x4 (float d0, float d1, float d2, float d3,
float d4, float d5, float d6, svuint32x4_t x) {}
void fn_u64x4 (float d0, float d1, float d2, float d3,
float d4, float d5, float d6, svuint64x4_t x) {}
+void fn_bf16x4 (float d0, float d1, float d2, float d3,
+ float d4, float d5, float d6, svbfloat16x4_t x) {}
void fn_f16x4 (float d0, float d1, float d2, float d3,
float d4, float d5, float d6, svfloat16x4_t x) {}
void fn_f32x4 (float d0, float d1, float d2, float d3,
@@ -102,6 +110,7 @@ void fn_f64x4 (float d0, float d1, float d2, float d3,
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u16\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u32\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u64\n} } } */
+/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_bf16\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f16\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f32\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f64\n} } } */
@@ -114,6 +123,7 @@ void fn_f64x4 (float d0, float d1, float d2, float d3,
/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_u16x2\n} } } */
/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_u32x2\n} } } */
/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_u64x2\n} } } */
+/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_bf16x2\n} } } */
/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_f16x2\n} } } */
/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_f32x2\n} } } */
/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_f64x2\n} } } */
@@ -126,6 +136,7 @@ void fn_f64x4 (float d0, float d1, float d2, float d3,
/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_u16x3\n} } } */
/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_u32x3\n} } } */
/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_u64x3\n} } } */
+/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_bf16x3\n} } } */
/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_f16x3\n} } } */
/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_f32x3\n} } } */
/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_f64x3\n} } } */
@@ -138,6 +149,7 @@ void fn_f64x4 (float d0, float d1, float d2, float d3,
/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_u16x4\n} } } */
/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_u32x4\n} } } */
/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_u64x4\n} } } */
+/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_bf16x4\n} } } */
/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_f16x4\n} } } */
/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_f32x4\n} } } */
/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_f64x4\n} } } */
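
The progression across these files is the point of the tests: scalar FP arguments and SVE vector arguments share v0-v7/z0-z7, so each extra float consumes one slot, and a tuple that no longer fits in the remaining z registers is passed by reference instead. Once every SVE argument is by reference, the function needs no SVE registers at all and the .variant_pcs marker disappears, which is what the scan-assembler-not lines check. With d0-d6 occupied, for instance, only z7 is left (sketch, not patch code):

#include <arm_sve.h>

/* The single vector still arrives in z7 under the SVE PCS; an
   svbfloat16x2_t in its place would be passed by reference and the
   function would revert to the base PCS.  */
void
last_slot (float d0, float d1, float d2, float d3,
           float d4, float d5, float d6, svbfloat16_t x)
{
}
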
@@ -18,6 +18,8 @@ void fn_u32 (float d0, float d1, float d2, float d3,
float d4, float d5, float d6, float d7, svuint32_t x) {}
void fn_u64 (float d0, float d1, float d2, float d3,
float d4, float d5, float d6, float d7, svuint64_t x) {}
+void fn_bf16 (float d0, float d1, float d2, float d3,
+ float d4, float d5, float d6, float d7, svbfloat16_t x) {}
void fn_f16 (float d0, float d1, float d2, float d3,
float d4, float d5, float d6, float d7, svfloat16_t x) {}
void fn_f32 (float d0, float d1, float d2, float d3,
@@ -41,6 +43,8 @@ void fn_u32x2 (float d0, float d1, float d2, float d3,
float d4, float d5, float d6, float d7, svuint32x2_t x) {}
void fn_u64x2 (float d0, float d1, float d2, float d3,
float d4, float d5, float d6, float d7, svuint64x2_t x) {}
+void fn_bf16x2 (float d0, float d1, float d2, float d3,
+ float d4, float d5, float d6, float d7, svbfloat16x2_t x) {}
void fn_f16x2 (float d0, float d1, float d2, float d3,
float d4, float d5, float d6, float d7, svfloat16x2_t x) {}
void fn_f32x2 (float d0, float d1, float d2, float d3,
@@ -64,6 +68,8 @@ void fn_u32x3 (float d0, float d1, float d2, float d3,
float d4, float d5, float d6, float d7, svuint32x3_t x) {}
void fn_u64x3 (float d0, float d1, float d2, float d3,
float d4, float d5, float d6, float d7, svuint64x3_t x) {}
+void fn_bf16x3 (float d0, float d1, float d2, float d3,
+ float d4, float d5, float d6, float d7, svbfloat16x3_t x) {}
void fn_f16x3 (float d0, float d1, float d2, float d3,
float d4, float d5, float d6, float d7, svfloat16x3_t x) {}
void fn_f32x3 (float d0, float d1, float d2, float d3,
@@ -87,6 +93,8 @@ void fn_u32x4 (float d0, float d1, float d2, float d3,
float d4, float d5, float d6, float d7, svuint32x4_t x) {}
void fn_u64x4 (float d0, float d1, float d2, float d3,
float d4, float d5, float d6, float d7, svuint64x4_t x) {}
+void fn_bf16x4 (float d0, float d1, float d2, float d3,
+ float d4, float d5, float d6, float d7, svbfloat16x4_t x) {}
void fn_f16x4 (float d0, float d1, float d2, float d3,
float d4, float d5, float d6, float d7, svfloat16x4_t x) {}
void fn_f32x4 (float d0, float d1, float d2, float d3,
new file mode 100644
@@ -0,0 +1,63 @@
+/* { dg-do compile { target lp64 } } */
+/* { dg-options "-O -mbig-endian -fno-stack-clash-protection -g" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#pragma GCC aarch64 "arm_sve.h"
+
+/*
+** callee:
+** addvl sp, sp, #-1
+** str p4, \[sp\]
+** ptrue p4\.b, all
+** (
+** ld1h (z[0-9]+\.h), p4/z, \[x1, #1, mul vl\]
+** ld1h (z[0-9]+\.h), p4/z, \[x1\]
+** st2h {\2 - \1}, p0, \[x0\]
+** |
+** ld1h (z[0-9]+\.h), p4/z, \[x1\]
+** ld1h (z[0-9]+\.h), p4/z, \[x1, #1, mul vl\]
+** st2h {\3 - \4}, p0, \[x0\]
+** )
+** st4h {z0\.h - z3\.h}, p1, \[x0\]
+** st3h {z4\.h - z6\.h}, p2, \[x0\]
+** st1h z7\.h, p3, \[x0\]
+** ldr p4, \[sp\]
+** addvl sp, sp, #1
+** ret
+*/
+void __attribute__((noipa))
+callee (void *x0, svbfloat16x4_t z0, svbfloat16x3_t z4, svbfloat16x2_t stack,
+ svbfloat16_t z7, svbool_t p0, svbool_t p1, svbool_t p2, svbool_t p3)
+{
+ svst2 (p0, x0, stack);
+ svst4 (p1, x0, z0);
+ svst3 (p2, x0, z4);
+ svst1_bf16 (p3, x0, z7);
+}
+
+void __attribute__((noipa))
+caller (void *x0)
+{
+ svbool_t pg;
+ pg = svptrue_b8 ();
+ callee (x0,
+ svld4_vnum_bf16 (pg, x0, -8),
+ svld3_vnum_bf16 (pg, x0, -3),
+ svld2_vnum_bf16 (pg, x0, 0),
+ svld1_vnum_bf16 (pg, x0, 2),
+ svptrue_pat_b8 (SV_VL1),
+ svptrue_pat_b16 (SV_VL2),
+ svptrue_pat_b32 (SV_VL3),
+ svptrue_pat_b64 (SV_VL4));
+}
+
+/* { dg-final { scan-assembler {\tld4h\t{z0\.h - z3\.h}, p[0-7]/z, \[x0, #-8, mul vl\]\n} } } */
+/* { dg-final { scan-assembler {\tld3h\t{z4\.h - z6\.h}, p[0-7]/z, \[x0, #-3, mul vl\]\n} } } */
+/* { dg-final { scan-assembler {\tld1h\tz7\.h, p[0-7]/z, \[x0, #2, mul vl\]\n} } } */
+/* { dg-final { scan-assembler {\tmov\tx1, sp\n} } } */
+/* { dg-final { scan-assembler {\tld2h\t{(z[0-9]+\.h) - z[0-9]+\.h}.*\tst1h\t\1, p[0-7], \[x1\]\n} } } */
+/* { dg-final { scan-assembler {\tld2h\t{z[0-9]+\.h - (z[0-9]+\.h)}.*\tst1h\t\1, p[0-7], \[x1, #1, mul vl\]\n} } } */
+/* { dg-final { scan-assembler {\tptrue\tp0\.b, vl1\n} } } */
+/* { dg-final { scan-assembler {\tptrue\tp1\.h, vl2\n} } } */
+/* { dg-final { scan-assembler {\tptrue\tp2\.s, vl3\n} } } */
+/* { dg-final { scan-assembler {\tptrue\tp3\.d, vl4\n} } } */
new file mode 100644
@@ -0,0 +1,58 @@
+/* { dg-do compile { target lp64 } } */
+/* { dg-options "-O -mlittle-endian -fno-stack-clash-protection -g" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#pragma GCC aarch64 "arm_sve.h"
+
+/*
+** callee:
+** (
+** ldr (z[0-9]+), \[x1, #1, mul vl\]
+** ldr (z[0-9]+), \[x1\]
+** st2h {\2\.h - \1\.h}, p0, \[x0\]
+** |
+** ldr (z[0-9]+), \[x1\]
+** ldr (z[0-9]+), \[x1, #1, mul vl\]
+** st2h {\3\.h - \4\.h}, p0, \[x0\]
+** )
+** st4h {z0\.h - z3\.h}, p1, \[x0\]
+** st3h {z4\.h - z6\.h}, p2, \[x0\]
+** st1h z7\.h, p3, \[x0\]
+** ret
+*/
+void __attribute__((noipa))
+callee (void *x0, svbfloat16x4_t z0, svbfloat16x3_t z4, svbfloat16x2_t stack,
+ svbfloat16_t z7, svbool_t p0, svbool_t p1, svbool_t p2, svbool_t p3)
+{
+ svst2 (p0, x0, stack);
+ svst4 (p1, x0, z0);
+ svst3 (p2, x0, z4);
+ svst1_bf16 (p3, x0, z7);
+}
+
+void __attribute__((noipa))
+caller (void *x0)
+{
+ svbool_t pg;
+ pg = svptrue_b8 ();
+ callee (x0,
+ svld4_vnum_bf16 (pg, x0, -8),
+ svld3_vnum_bf16 (pg, x0, -3),
+ svld2_vnum_bf16 (pg, x0, 0),
+ svld1_vnum_bf16 (pg, x0, 2),
+ svptrue_pat_b8 (SV_VL1),
+ svptrue_pat_b16 (SV_VL2),
+ svptrue_pat_b32 (SV_VL3),
+ svptrue_pat_b64 (SV_VL4));
+}
+
+/* { dg-final { scan-assembler {\tld4h\t{z0\.h - z3\.h}, p[0-7]/z, \[x0, #-8, mul vl\]\n} } } */
+/* { dg-final { scan-assembler {\tld3h\t{z4\.h - z6\.h}, p[0-7]/z, \[x0, #-3, mul vl\]\n} } } */
+/* { dg-final { scan-assembler {\tld1h\tz7\.h, p[0-7]/z, \[x0, #2, mul vl\]\n} } } */
+/* { dg-final { scan-assembler {\tmov\tx1, sp\n} } } */
+/* { dg-final { scan-assembler {\tld2h\t{(z[0-9]+)\.h - z[0-9]+\.h}.*\tstr\t\1, \[x1\]\n} } } */
+/* { dg-final { scan-assembler {\tld2h\t{z[0-9]+\.h - (z[0-9]+)\.h}.*\tstr\t\1, \[x1, #1, mul vl\]\n} } } */
+/* { dg-final { scan-assembler {\tptrue\tp0\.b, vl1\n} } } */
+/* { dg-final { scan-assembler {\tptrue\tp1\.h, vl2\n} } } */
+/* { dg-final { scan-assembler {\tptrue\tp2\.s, vl3\n} } } */
+/* { dg-final { scan-assembler {\tptrue\tp3\.d, vl4\n} } } */
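
These two tests pin down the same situation on both endiannesses: z0-z3 and z4-z6 take seven of the eight SVE argument registers, the two-vector tuple no longer fits and is passed by reference (the pointer arrives in x1), and the only codegen difference is the reload. On big-endian, LDR/STR move the register image in a byte order that does not match .h element numbering, so a governing predicate plus ld1h is used instead. A sketch of an element-order-preserving reload (hypothetical helper, for illustration):

#include <arm_sve.h>

/* Valid on either endianness; on little-endian a plain LDR of the
   register image would serve equally well.  */
svbfloat16_t
reload (const bfloat16_t *incoming)
{
  return svld1_bf16 (svptrue_b16 (), incoming);
}
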
new file mode 100644
@@ -0,0 +1,71 @@
+/* { dg-do compile { target lp64 } } */
+/* { dg-options "-O -mbig-endian -fno-stack-clash-protection -g" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#pragma GCC aarch64 "arm_sve.h"
+
+/*
+** callee1:
+** ptrue p3\.b, all
+** ...
+** ld1h (z[0-9]+\.h), p3/z, \[x1, #3, mul vl\]
+** ...
+** st4h {z[0-9]+\.h - \1}, p0, \[x0\]
+** st2h {z3\.h - z4\.h}, p1, \[x0\]
+** st3h {z5\.h - z7\.h}, p2, \[x0\]
+** ret
+*/
+void __attribute__((noipa))
+callee1 (void *x0, svbfloat16x3_t z0, svbfloat16x2_t z3, svbfloat16x3_t z5,
+ svbfloat16x4_t stack1, svbfloat16_t stack2, svbool_t p0,
+ svbool_t p1, svbool_t p2)
+{
+ svst4_bf16 (p0, x0, stack1);
+ svst2_bf16 (p1, x0, z3);
+ svst3_bf16 (p2, x0, z5);
+}
+
+/*
+** callee2:
+** ptrue p3\.b, all
+** ld1h (z[0-9]+\.h), p3/z, \[x2\]
+** st1h \1, p0, \[x0\]
+** st2h {z3\.h - z4\.h}, p1, \[x0\]
+** st3h {z0\.h - z2\.h}, p2, \[x0\]
+** ret
+*/
+void __attribute__((noipa))
+callee2 (void *x0, svbfloat16x3_t z0, svbfloat16x2_t z3, svbfloat16x3_t z5,
+ svbfloat16x4_t stack1, svbfloat16_t stack2, svbool_t p0,
+ svbool_t p1, svbool_t p2)
+{
+ svst1_bf16 (p0, x0, stack2);
+ svst2_bf16 (p1, x0, z3);
+ svst3_bf16 (p2, x0, z0);
+}
+
+void __attribute__((noipa))
+caller (void *x0)
+{
+ svbool_t pg;
+ pg = svptrue_b8 ();
+ callee1 (x0,
+ svld3_vnum_bf16 (pg, x0, -9),
+ svld2_vnum_bf16 (pg, x0, -2),
+ svld3_vnum_bf16 (pg, x0, 0),
+ svld4_vnum_bf16 (pg, x0, 8),
+ svld1_vnum_bf16 (pg, x0, 5),
+ svptrue_pat_b8 (SV_VL1),
+ svptrue_pat_b16 (SV_VL2),
+ svptrue_pat_b32 (SV_VL3));
+}
+
+/* { dg-final { scan-assembler {\tld3h\t{z0\.h - z2\.h}, p[0-7]/z, \[x0, #-9, mul vl\]\n} } } */
+/* { dg-final { scan-assembler {\tld2h\t{z3\.h - z4\.h}, p[0-7]/z, \[x0, #-2, mul vl\]\n} } } */
+/* { dg-final { scan-assembler {\tld3h\t{z5\.h - z7\.h}, p[0-7]/z, \[x0\]\n} } } */
+/* { dg-final { scan-assembler {\tld4h\t{(z[0-9]+\.h) - z[0-9]+\.h}.*\tst1h\t\1, p[0-7], \[x1\]\n} } } */
+/* { dg-final { scan-assembler {\tld4h\t{z[0-9]+\.h - (z[0-9]+\.h)}.*\tst1h\t\1, p[0-7], \[x1, #3, mul vl\]\n} } } */
+/* { dg-final { scan-assembler {\tld1h\t(z[0-9]+\.h), p[0-7]/z, \[x0, #5, mul vl\]\n.*\tst1h\t\1, p[0-7], \[x2\]\n} } } */
+/* { dg-final { scan-assembler {\tptrue\tp0\.b, vl1\n} } } */
+/* { dg-final { scan-assembler {\tptrue\tp1\.h, vl2\n} } } */
+/* { dg-final { scan-assembler {\tptrue\tp2\.s, vl3\n} } } */
new file mode 100644
@@ -0,0 +1,70 @@
+/* { dg-do compile { target lp64 } } */
+/* { dg-options "-O -mlittle-endian -fno-stack-clash-protection -g" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#pragma GCC aarch64 "arm_sve.h"
+
+/*
+** callee1:
+** ...
+** ldr (z[0-9]+), \[x1, #3, mul vl\]
+** ...
+** st4h {z[0-9]+\.h - \1\.h}, p0, \[x0\]
+** st2h {z3\.h - z4\.h}, p1, \[x0\]
+** st3h {z5\.h - z7\.h}, p2, \[x0\]
+** ret
+*/
+void __attribute__((noipa))
+callee1 (void *x0, svbfloat16x3_t z0, svbfloat16x2_t z3, svbfloat16x3_t z5,
+ svbfloat16x4_t stack1, svbfloat16_t stack2, svbool_t p0,
+ svbool_t p1, svbool_t p2)
+{
+ svst4_bf16 (p0, x0, stack1);
+ svst2_bf16 (p1, x0, z3);
+ svst3_bf16 (p2, x0, z5);
+}
+
+/*
+** callee2:
+** ptrue p3\.b, all
+** ld1h (z[0-9]+\.h), p3/z, \[x2\]
+** st1h \1, p0, \[x0\]
+** st2h {z3\.h - z4\.h}, p1, \[x0\]
+** st3h {z0\.h - z2\.h}, p2, \[x0\]
+** ret
+*/
+void __attribute__((noipa))
+callee2 (void *x0, svbfloat16x3_t z0, svbfloat16x2_t z3, svbfloat16x3_t z5,
+ svbfloat16x4_t stack1, svbfloat16_t stack2, svbool_t p0,
+ svbool_t p1, svbool_t p2)
+{
+ svst1_bf16 (p0, x0, stack2);
+ svst2_bf16 (p1, x0, z3);
+ svst3_bf16 (p2, x0, z0);
+}
+
+void __attribute__((noipa))
+caller (void *x0)
+{
+ svbool_t pg;
+ pg = svptrue_b8 ();
+ callee1 (x0,
+ svld3_vnum_bf16 (pg, x0, -9),
+ svld2_vnum_bf16 (pg, x0, -2),
+ svld3_vnum_bf16 (pg, x0, 0),
+ svld4_vnum_bf16 (pg, x0, 8),
+ svld1_vnum_bf16 (pg, x0, 5),
+ svptrue_pat_b8 (SV_VL1),
+ svptrue_pat_b16 (SV_VL2),
+ svptrue_pat_b32 (SV_VL3));
+}
+
+/* { dg-final { scan-assembler {\tld3h\t{z0\.h - z2\.h}, p[0-7]/z, \[x0, #-9, mul vl\]\n} } } */
+/* { dg-final { scan-assembler {\tld2h\t{z3\.h - z4\.h}, p[0-7]/z, \[x0, #-2, mul vl\]\n} } } */
+/* { dg-final { scan-assembler {\tld3h\t{z5\.h - z7\.h}, p[0-7]/z, \[x0\]\n} } } */
+/* { dg-final { scan-assembler {\tld4h\t{(z[0-9]+)\.h - z[0-9]+\.h}.*\tstr\t\1, \[x1\]\n} } } */
+/* { dg-final { scan-assembler {\tld4h\t{z[0-9]+\.h - (z[0-9]+)\.h}.*\tstr\t\1, \[x1, #3, mul vl\]\n} } } */
+/* { dg-final { scan-assembler {\tld1h\t(z[0-9]+\.h), p[0-7]/z, \[x0, #5, mul vl\]\n.*\tst1h\t\1, p[0-7], \[x2\]\n} } } */
+/* { dg-final { scan-assembler {\tptrue\tp0\.b, vl1\n} } } */
+/* { dg-final { scan-assembler {\tptrue\tp1\.h, vl2\n} } } */
+/* { dg-final { scan-assembler {\tptrue\tp2\.s, vl3\n} } } */
@@ -2,6 +2,7 @@
#include <arm_sve.h>
+typedef bfloat16_t bfloat16x16_t __attribute__((vector_size (32)));
typedef float16_t float16x16_t __attribute__((vector_size (32)));
typedef float32_t float32x8_t __attribute__((vector_size (32)));
typedef float64_t float64x4_t __attribute__((vector_size (32)));
@@ -14,6 +15,7 @@ typedef uint16_t uint16x16_t __attribute__((vector_size (32)));
typedef uint32_t uint32x8_t __attribute__((vector_size (32)));
typedef uint64_t uint64x4_t __attribute__((vector_size (32)));
+void bfloat16_callee (bfloat16x16_t);
void float16_callee (float16x16_t);
void float32_callee (float32x8_t);
void float64_callee (float64x4_t);
@@ -26,6 +28,12 @@ void uint16_callee (uint16x16_t);
void uint32_callee (uint32x8_t);
void uint64_callee (uint64x4_t);
+void
+bfloat16_caller (bfloat16_t val)
+{
+ bfloat16_callee (svdup_bf16 (val));
+}
+
void
float16_caller (void)
{
@@ -93,7 +101,7 @@ uint64_caller (void)
}
/* { dg-final { scan-assembler-times {\tst1b\tz[0-9]+\.b, p[0-7], \[x0\]} 2 } } */
-/* { dg-final { scan-assembler-times {\tst1h\tz[0-9]+\.h, p[0-7], \[x0\]} 3 } } */
+/* { dg-final { scan-assembler-times {\tst1h\tz[0-9]+\.h, p[0-7], \[x0\]} 4 } } */
/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s, p[0-7], \[x0\]} 3 } } */
/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d, p[0-7], \[x0\]} 3 } } */
-/* { dg-final { scan-assembler-times {\tadd\tx0, sp, #?16\n} 11 } } */
+/* { dg-final { scan-assembler-times {\tadd\tx0, sp, #?16\n} 12 } } */
@@ -2,6 +2,7 @@
#include <arm_sve.h>
+typedef bfloat16_t bfloat16x16_t __attribute__((vector_size (32)));
typedef float16_t float16x16_t __attribute__((vector_size (32)));
typedef float32_t float32x8_t __attribute__((vector_size (32)));
typedef float64_t float64x4_t __attribute__((vector_size (32)));
@@ -14,6 +15,7 @@ typedef uint16_t uint16x16_t __attribute__((vector_size (32)));
typedef uint32_t uint32x8_t __attribute__((vector_size (32)));
typedef uint64_t uint64x4_t __attribute__((vector_size (32)));
+void bfloat16_callee (svbfloat16_t);
void float16_callee (svfloat16_t);
void float32_callee (svfloat32_t);
void float64_callee (svfloat64_t);
@@ -26,6 +28,12 @@ void uint16_callee (svuint16_t);
void uint32_callee (svuint32_t);
void uint64_callee (svuint64_t);
+void
+bfloat16_caller (bfloat16x16_t arg)
+{
+ bfloat16_callee (arg);
+}
+
void
float16_caller (float16x16_t arg)
{
@@ -93,7 +101,7 @@ uint64_caller (uint64x4_t arg)
}
/* { dg-final { scan-assembler-times {\tld1b\tz0\.b, p[0-7]/z, \[x0\]} 2 } } */
-/* { dg-final { scan-assembler-times {\tld1h\tz0\.h, p[0-7]/z, \[x0\]} 3 } } */
+/* { dg-final { scan-assembler-times {\tld1h\tz0\.h, p[0-7]/z, \[x0\]} 4 } } */
/* { dg-final { scan-assembler-times {\tld1w\tz0\.s, p[0-7]/z, \[x0\]} 3 } } */
/* { dg-final { scan-assembler-times {\tld1d\tz0\.d, p[0-7]/z, \[x0\]} 3 } } */
/* { dg-final { scan-assembler-not {\tst1[bhwd]\t} } } */
@@ -49,6 +49,14 @@ CALLEE (u16, __SVUint16_t)
*/
CALLEE (f16, __SVFloat16_t)
+/*
+** callee_bf16:
+** ptrue (p[0-7])\.b, all
+** ld1h z0\.h, \1/z, \[x0\]
+** ret
+*/
+CALLEE (bf16, __SVBfloat16_t)
+
/*
** callee_s32:
** ptrue (p[0-7])\.b, all
@@ -107,6 +115,14 @@ CALLEE (f64, __SVFloat64_t)
return svaddv (svptrue_b8 (), callee_##SUFFIX (ptr1)); \
}
+#define CALLER_BF16(SUFFIX, TYPE) \
+ typeof (svlasta (svptrue_b8 (), *(TYPE *) 0)) \
+ __attribute__((noipa)) \
+ caller_##SUFFIX (TYPE *ptr1) \
+ { \
+ return svlasta (svptrue_b8 (), callee_##SUFFIX (ptr1)); \
+ }
+
/*
** caller_s8:
** ...
@@ -166,6 +182,17 @@ CALLER (u16, __SVUint16_t)
*/
CALLER (f16, __SVFloat16_t)
+/*
+** caller_bf16:
+** ...
+** bl callee_bf16
+** ptrue (p[0-7])\.b, all
+** lasta h0, \1, z0\.h
+** ldp x29, x30, \[sp\], 16
+** ret
+*/
+CALLER_BF16 (bf16, __SVBfloat16_t)
+
/*
** caller_s32:
** ...
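
The pre-existing CALLER macro folds the callee's result with svaddv, which this series does not provide for bf16 (SVE BF16 defines no general arithmetic), so the tests add a CALLER_BF16 variant that consumes the returned vector with svlasta instead. Expanded by hand, the instantiation above is roughly:

/* What CALLER_BF16 (bf16, __SVBfloat16_t) expands to, reformatted;
   typeof (svlasta (...)) resolves to bfloat16_t.  */
bfloat16_t __attribute__((noipa))
caller_bf16 (__SVBfloat16_t *ptr1)
{
  return svlasta (svptrue_b8 (), callee_bf16 (ptr1));
}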
@@ -49,6 +49,14 @@ CALLEE (u16, __SVUint16_t)
*/
CALLEE (f16, __SVFloat16_t)
+/*
+** callee_bf16:
+** ptrue (p[0-7])\.b, vl128
+** ld1h z0\.h, \1/z, \[x0\]
+** ret
+*/
+CALLEE (bf16, __SVBfloat16_t)
+
/*
** callee_s32:
** ptrue (p[0-7])\.b, vl128
@@ -107,6 +115,14 @@ CALLEE (f64, __SVFloat64_t)
return svaddv (svptrue_b8 (), callee_##SUFFIX (ptr1)); \
}
+#define CALLER_BF16(SUFFIX, TYPE) \
+ typeof (svlasta (svptrue_b8 (), *(TYPE *) 0)) \
+ __attribute__((noipa)) \
+ caller_##SUFFIX (TYPE *ptr1) \
+ { \
+ return svlasta (svptrue_b8 (), callee_##SUFFIX (ptr1)); \
+ }
+
/*
** caller_s8:
** ...
@@ -166,6 +182,17 @@ CALLER (u16, __SVUint16_t)
*/
CALLER (f16, __SVFloat16_t)
+/*
+** caller_bf16:
+** ...
+** bl callee_bf16
+** ptrue (p[0-7])\.b, vl128
+** lasta h0, \1, z0\.h
+** ldp x29, x30, \[sp\], 16
+** ret
+*/
+CALLER_BF16 (bf16, __SVBfloat16_t)
+
/*
** caller_s32:
** ...
@@ -49,6 +49,14 @@ CALLEE (u16, __SVUint16_t)
*/
CALLEE (f16, __SVFloat16_t)
+/*
+** callee_bf16:
+** ptrue (p[0-7])\.b, vl16
+** ld1h z0\.h, \1/z, \[x0\]
+** ret
+*/
+CALLEE (bf16, __SVBfloat16_t)
+
/*
** callee_s32:
** ptrue (p[0-7])\.b, vl16
@@ -107,6 +115,14 @@ CALLEE (f64, __SVFloat64_t)
return svaddv (svptrue_b8 (), callee_##SUFFIX (ptr1)); \
}
+#define CALLER_BF16(SUFFIX, TYPE) \
+ typeof (svlasta (svptrue_b8 (), *(TYPE *) 0)) \
+ __attribute__((noipa)) \
+ caller_##SUFFIX (TYPE *ptr1) \
+ { \
+ return svlasta (svptrue_b8 (), callee_##SUFFIX (ptr1)); \
+ }
+
/*
** caller_s8:
** ...
@@ -166,6 +182,17 @@ CALLER (u16, __SVUint16_t)
*/
CALLER (f16, __SVFloat16_t)
+/*
+** caller_bf16:
+** ...
+** bl callee_bf16
+** ptrue (p[0-7])\.b, vl16
+** lasta h0, \1, z0\.h
+** ldp x29, x30, \[sp\], 16
+** ret
+*/
+CALLER_BF16 (bf16, __SVBfloat16_t)
+
/*
** caller_s32:
** ...
@@ -49,6 +49,14 @@ CALLEE (u16, __SVUint16_t)
*/
CALLEE (f16, __SVFloat16_t)
+/*
+** callee_bf16:
+** ptrue (p[0-7])\.b, vl256
+** ld1h z0\.h, \1/z, \[x0\]
+** ret
+*/
+CALLEE (bf16, __SVBfloat16_t)
+
/*
** callee_s32:
** ptrue (p[0-7])\.b, vl256
@@ -107,6 +115,14 @@ CALLEE (f64, __SVFloat64_t)
return svaddv (svptrue_b8 (), callee_##SUFFIX (ptr1)); \
}
+#define CALLER_BF16(SUFFIX, TYPE) \
+ typeof (svlasta (svptrue_b8 (), *(TYPE *) 0)) \
+ __attribute__((noipa)) \
+ caller_##SUFFIX (TYPE *ptr1) \
+ { \
+ return svlasta (svptrue_b8 (), callee_##SUFFIX (ptr1)); \
+ }
+
/*
** caller_s8:
** ...
@@ -166,6 +182,17 @@ CALLER (u16, __SVUint16_t)
*/
CALLER (f16, __SVFloat16_t)
+/*
+** caller_bf16:
+** ...
+** bl callee_bf16
+** ptrue (p[0-7])\.b, vl256
+** lasta h0, \1, z0\.h
+** ldp x29, x30, \[sp\], 16
+** ret
+*/
+CALLER_BF16 (bf16, __SVBfloat16_t)
+
/*
** caller_s32:
** ...
@@ -49,6 +49,14 @@ CALLEE (u16, __SVUint16_t)
*/
CALLEE (f16, __SVFloat16_t)
+/*
+** callee_bf16:
+** ptrue (p[0-7])\.b, vl32
+** ld1h z0\.h, \1/z, \[x0\]
+** ret
+*/
+CALLEE (bf16, __SVBfloat16_t)
+
/*
** callee_s32:
** ptrue (p[0-7])\.b, vl32
@@ -107,6 +115,14 @@ CALLEE (f64, __SVFloat64_t)
return svaddv (svptrue_b8 (), callee_##SUFFIX (ptr1)); \
}
+#define CALLER_BF16(SUFFIX, TYPE) \
+ typeof (svlasta (svptrue_b8 (), *(TYPE *) 0)) \
+ __attribute__((noipa)) \
+ caller_##SUFFIX (TYPE *ptr1) \
+ { \
+ return svlasta (svptrue_b8 (), callee_##SUFFIX (ptr1)); \
+ }
+
/*
** caller_s8:
** ...
@@ -166,6 +182,17 @@ CALLER (u16, __SVUint16_t)
*/
CALLER (f16, __SVFloat16_t)
+/*
+** caller_bf16:
+** ...
+** bl callee_bf16
+** ptrue (p[0-7])\.b, vl32
+** lasta h0, \1, z0\.h
+** ldp x29, x30, \[sp\], 16
+** ret
+*/
+CALLER_BF16 (bf16, __SVBfloat16_t)
+
/*
** caller_s32:
** ...
@@ -49,6 +49,14 @@ CALLEE (u16, __SVUint16_t)
*/
CALLEE (f16, __SVFloat16_t)
+/*
+** callee_bf16:
+** ptrue (p[0-7])\.b, vl64
+** ld1h z0\.h, \1/z, \[x0\]
+** ret
+*/
+CALLEE (bf16, __SVBfloat16_t)
+
/*
** callee_s32:
** ptrue (p[0-7])\.b, vl64
@@ -107,6 +115,14 @@ CALLEE (f64, __SVFloat64_t)
return svaddv (svptrue_b8 (), callee_##SUFFIX (ptr1)); \
}
+#define CALLER_BF16(SUFFIX, TYPE) \
+ typeof (svlasta (svptrue_b8 (), *(TYPE *) 0)) \
+ __attribute__((noipa)) \
+ caller_##SUFFIX (TYPE *ptr1) \
+ { \
+ return svlasta (svptrue_b8 (), callee_##SUFFIX (ptr1)); \
+ }
+
/*
** caller_s8:
** ...
@@ -166,6 +182,17 @@ CALLER (u16, __SVUint16_t)
*/
CALLER (f16, __SVFloat16_t)
+/*
+** caller_bf16:
+** ...
+** bl callee_bf16
+** ptrue (p[0-7])\.b, vl64
+** lasta h0, \1, z0\.h
+** ldp x29, x30, \[sp\], 16
+** ret
+*/
+CALLER_BF16 (bf16, __SVBfloat16_t)
+
/*
** caller_s32:
** ...
@@ -51,6 +51,14 @@ CALLEE (u16, svuint16_t)
*/
CALLEE (f16, svfloat16_t)
+/*
+** callee_bf16:
+** ptrue (p[0-7])\.b, all
+** ld1h z0\.h, \1/z, \[x0\]
+** ret
+*/
+CALLEE (bf16, svbfloat16_t)
+
/*
** callee_s32:
** ptrue (p[0-7])\.b, all
@@ -107,6 +115,14 @@ CALLEE (f64, svfloat64_t)
return svaddv (svptrue_b8 (), callee_##SUFFIX (ptr1)); \
}
+#define CALLER_BF16(SUFFIX, TYPE) \
+ typeof (svlasta (svptrue_b8 (), *(TYPE *) 0)) \
+ __attribute__((noipa)) \
+ caller_##SUFFIX (TYPE *ptr1) \
+ { \
+ return svlasta (svptrue_b8 (), callee_##SUFFIX (ptr1)); \
+ }
+
/*
** caller_s8:
** ...
@@ -166,6 +182,17 @@ CALLER (u16, svuint16_t)
*/
CALLER (f16, svfloat16_t)
+/*
+** caller_bf16:
+** ...
+** bl callee_bf16
+** ptrue (p[0-7])\.b, all
+** lasta h0, \1, z0\.h
+** ldp x29, x30, \[sp\], 16
+** ret
+*/
+CALLER_BF16 (bf16, svbfloat16_t)
+
/*
** caller_s32:
** ...
@@ -51,6 +51,14 @@ CALLEE (u16, svuint16_t)
*/
CALLEE (f16, svfloat16_t)
+/*
+** callee_bf16:
+** ptrue (p[0-7])\.b, vl128
+** ld1h z0\.h, \1/z, \[x0\]
+** ret
+*/
+CALLEE (bf16, svbfloat16_t)
+
/*
** callee_s32:
** ptrue (p[0-7])\.b, vl128
@@ -107,6 +115,14 @@ CALLEE (f64, svfloat64_t)
return svaddv (svptrue_b8 (), callee_##SUFFIX (ptr1)); \
}
+#define CALLER_BF16(SUFFIX, TYPE) \
+ typeof (svlasta (svptrue_b8 (), *(TYPE *) 0)) \
+ __attribute__((noipa)) \
+ caller_##SUFFIX (TYPE *ptr1) \
+ { \
+ return svlasta (svptrue_b8 (), callee_##SUFFIX (ptr1)); \
+ }
+
/*
** caller_s8:
** ...
@@ -166,6 +182,17 @@ CALLER (u16, svuint16_t)
*/
CALLER (f16, svfloat16_t)
+/*
+** caller_bf16:
+** ...
+** bl callee_bf16
+** ptrue (p[0-7])\.b, vl128
+** lasta h0, \1, z0\.h
+** ldp x29, x30, \[sp\], 16
+** ret
+*/
+CALLER_BF16 (bf16, svbfloat16_t)
+
/*
** caller_s32:
** ...
@@ -51,6 +51,14 @@ CALLEE (u16, svuint16_t)
*/
CALLEE (f16, svfloat16_t)
+/*
+** callee_bf16:
+** ptrue (p[0-7])\.b, vl16
+** ld1h z0\.h, \1/z, \[x0\]
+** ret
+*/
+CALLEE (bf16, svbfloat16_t)
+
/*
** callee_s32:
** ptrue (p[0-7])\.b, vl16
@@ -107,6 +115,14 @@ CALLEE (f64, svfloat64_t)
return svaddv (svptrue_b8 (), callee_##SUFFIX (ptr1)); \
}
+#define CALLER_BF16(SUFFIX, TYPE) \
+ typeof (svlasta (svptrue_b8 (), *(TYPE *) 0)) \
+ __attribute__((noipa)) \
+ caller_##SUFFIX (TYPE *ptr1) \
+ { \
+ return svlasta (svptrue_b8 (), callee_##SUFFIX (ptr1)); \
+ }
+
/*
** caller_s8:
** ...
@@ -166,6 +182,17 @@ CALLER (u16, svuint16_t)
*/
CALLER (f16, svfloat16_t)
+/*
+** caller_bf16:
+** ...
+** bl callee_bf16
+** ptrue (p[0-7])\.b, vl16
+** lasta h0, \1, z0\.h
+** ldp x29, x30, \[sp\], 16
+** ret
+*/
+CALLER_BF16 (bf16, svbfloat16_t)
+
/*
** caller_s32:
** ...
@@ -51,6 +51,14 @@ CALLEE (u16, svuint16_t)
*/
CALLEE (f16, svfloat16_t)
+/*
+** callee_bf16:
+** ptrue (p[0-7])\.b, vl256
+** ld1h z0\.h, \1/z, \[x0\]
+** ret
+*/
+CALLEE (bf16, svbfloat16_t)
+
/*
** callee_s32:
** ptrue (p[0-7])\.b, vl256
@@ -107,6 +115,14 @@ CALLEE (f64, svfloat64_t)
return svaddv (svptrue_b8 (), callee_##SUFFIX (ptr1)); \
}
+#define CALLER_BF16(SUFFIX, TYPE) \
+ typeof (svlasta (svptrue_b8 (), *(TYPE *) 0)) \
+ __attribute__((noipa)) \
+ caller_##SUFFIX (TYPE *ptr1) \
+ { \
+ return svlasta (svptrue_b8 (), callee_##SUFFIX (ptr1)); \
+ }
+
/*
** caller_s8:
** ...
@@ -166,6 +182,17 @@ CALLER (u16, svuint16_t)
*/
CALLER (f16, svfloat16_t)
+/*
+** caller_bf16:
+** ...
+** bl callee_bf16
+** ptrue (p[0-7])\.b, vl256
+** lasta h0, \1, z0\.h
+** ldp x29, x30, \[sp\], 16
+** ret
+*/
+CALLER_BF16 (bf16, svbfloat16_t)
+
/*
** caller_s32:
** ...
@@ -51,6 +51,14 @@ CALLEE (u16, svuint16_t)
*/
CALLEE (f16, svfloat16_t)
+/*
+** callee_bf16:
+** ptrue (p[0-7])\.b, vl32
+** ld1h z0\.h, \1/z, \[x0\]
+** ret
+*/
+CALLEE (bf16, svbfloat16_t)
+
/*
** callee_s32:
** ptrue (p[0-7])\.b, vl32
@@ -107,6 +115,14 @@ CALLEE (f64, svfloat64_t)
return svaddv (svptrue_b8 (), callee_##SUFFIX (ptr1)); \
}
+#define CALLER_BF16(SUFFIX, TYPE) \
+ typeof (svlasta (svptrue_b8 (), *(TYPE *) 0)) \
+ __attribute__((noipa)) \
+ caller_##SUFFIX (TYPE *ptr1) \
+ { \
+ return svlasta (svptrue_b8 (), callee_##SUFFIX (ptr1)); \
+ }
+
/*
** caller_s8:
** ...
@@ -166,6 +182,17 @@ CALLER (u16, svuint16_t)
*/
CALLER (f16, svfloat16_t)
+/*
+** caller_bf16:
+** ...
+** bl callee_bf16
+** ptrue (p[0-7])\.b, vl32
+** lasta h0, \1, z0\.h
+** ldp x29, x30, \[sp\], 16
+** ret
+*/
+CALLER_BF16 (bf16, svbfloat16_t)
+
/*
** caller_s32:
** ...
@@ -51,6 +51,14 @@ CALLEE (u16, svuint16_t)
*/
CALLEE (f16, svfloat16_t)
+/*
+** callee_bf16:
+** ptrue (p[0-7])\.b, vl64
+** ld1h z0\.h, \1/z, \[x0\]
+** ret
+*/
+CALLEE (bf16, svbfloat16_t)
+
/*
** callee_s32:
** ptrue (p[0-7])\.b, vl64
@@ -107,6 +115,14 @@ CALLEE (f64, svfloat64_t)
return svaddv (svptrue_b8 (), callee_##SUFFIX (ptr1)); \
}
+#define CALLER_BF16(SUFFIX, TYPE) \
+ typeof (svlasta (svptrue_b8 (), *(TYPE *) 0)) \
+ __attribute__((noipa)) \
+ caller_##SUFFIX (TYPE *ptr1) \
+ { \
+ return svlasta (svptrue_b8 (), callee_##SUFFIX (ptr1)); \
+ }
+
/*
** caller_s8:
** ...
@@ -166,6 +182,17 @@ CALLER (u16, svuint16_t)
*/
CALLER (f16, svfloat16_t)
+/*
+** caller_bf16:
+** ...
+** bl callee_bf16
+** ptrue (p[0-7])\.b, vl64
+** lasta h0, \1, z0\.h
+** ldp x29, x30, \[sp\], 16
+** ret
+*/
+CALLER_BF16 (bf16, svbfloat16_t)
+
/*
** caller_s32:
** ...
@@ -10,6 +10,7 @@ typedef uint8_t svuint8_t __attribute__ ((vector_size (32)));
typedef int16_t svint16_t __attribute__ ((vector_size (32)));
typedef uint16_t svuint16_t __attribute__ ((vector_size (32)));
typedef __fp16 svfloat16_t __attribute__ ((vector_size (32)));
+typedef __bf16 svbfloat16_t __attribute__ ((vector_size (32)));
typedef int32_t svint32_t __attribute__ ((vector_size (32)));
typedef uint32_t svuint32_t __attribute__ ((vector_size (32)));
@@ -81,6 +82,9 @@ CALLEE (u16, svuint16_t)
/* Currently we scalarize this. */
CALLEE (f16, svfloat16_t)
+/* Currently we scalarize this. */
+CALLEE (bf16, svbfloat16_t)
+
/*
** callee_s32:
** (
@@ -197,6 +201,16 @@ CALLER (u16, svuint16_t)
*/
CALLER (f16, svfloat16_t)
+/*
+** caller_bf16:
+** ...
+** bl callee_bf16
+** ldr h0, \[sp, 16\]
+** ldp x29, x30, \[sp\], 48
+** ret
+*/
+CALLER (bf16, svbfloat16_t)
+
/*
** caller_s32:
** ...
@@ -10,6 +10,7 @@ typedef uint8_t svuint8_t __attribute__ ((vector_size (128)));
typedef int16_t svint16_t __attribute__ ((vector_size (128)));
typedef uint16_t svuint16_t __attribute__ ((vector_size (128)));
typedef __fp16 svfloat16_t __attribute__ ((vector_size (128)));
+typedef __bf16 svbfloat16_t __attribute__ ((vector_size (128)));
typedef int32_t svint32_t __attribute__ ((vector_size (128)));
typedef uint32_t svuint32_t __attribute__ ((vector_size (128)));
@@ -71,6 +72,15 @@ CALLEE (u16, svuint16_t)
*/
CALLEE (f16, svfloat16_t)
+/*
+** callee_bf16:
+** ptrue (p[0-7])\.b, vl128
+** ld1h z0\.h, \1/z, \[x0\]
+** st1h z0\.h, \1, \[x8\]
+** ret
+*/
+CALLEE (bf16, svbfloat16_t)
+
/*
** callee_s32:
** ptrue (p[0-7])\.b, vl128
@@ -192,6 +202,18 @@ CALLER (u16, svuint16_t)
*/
CALLER (f16, svfloat16_t)
+/*
+** caller_bf16:
+** ...
+** bl callee_bf16
+** ...
+** ld1h (z[0-9]+\.h), (p[0-7])/z, \[[^]]*\]
+** st1h \1, \2, \[[^]]*\]
+** ...
+** ret
+*/
+CALLER (bf16, svbfloat16_t)
+
/*
** caller_s32:
** ...
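
A 128-byte GNU vector is not an AAPCS64 short vector, so it is returned indirectly: the caller passes a buffer address in x8 and the callee stores through it, which is the st1h ... [x8] the pattern above expects. A standalone illustration (load_big is an illustrative name; this assumes a compiler with __bf16 support and -msve-vector-bits=1024 so the copy stays a single vector store):

typedef __bf16 vbf16x64 __attribute__ ((vector_size (128)));

/* Returned in memory via x8 rather than in a Z register.  */
vbf16x64
load_big (const vbf16x64 *ptr)
{
  return *ptr;
}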
@@ -10,6 +10,7 @@ typedef uint8_t svuint8_t __attribute__ ((vector_size (16)));
typedef int16_t svint16_t __attribute__ ((vector_size (16)));
typedef uint16_t svuint16_t __attribute__ ((vector_size (16)));
typedef __fp16 svfloat16_t __attribute__ ((vector_size (16)));
+typedef __bf16 svbfloat16_t __attribute__ ((vector_size (16)));
typedef int32_t svint32_t __attribute__ ((vector_size (16)));
typedef uint32_t svuint32_t __attribute__ ((vector_size (16)));
@@ -61,6 +62,13 @@ CALLEE (u16, svuint16_t)
*/
CALLEE (f16, svfloat16_t)
+/*
+** callee_bf16:
+** ldr q0, \[x0\]
+** ret
+*/
+CALLEE (bf16, svbfloat16_t)
+
/*
** callee_s32:
** ldr q0, \[x0\]
@@ -165,6 +173,17 @@ CALLER (u16, svuint16_t)
*/
CALLER (f16, svfloat16_t)
+/*
+** caller_bf16:
+** ...
+** bl callee_bf16
+** ...
+** str q0, \[[^]]*\]
+** ...
+** ret
+*/
+CALLER (bf16, svbfloat16_t)
+
/*
** caller_s32:
** ...
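
The 16-byte case needs no SVE instructions at all: a 16-byte GNU vector is an AAPCS64 short vector and travels in a single Q register, hence the plain ldr q0 above. In isolation (pass_through is an illustrative name; any aarch64 compiler with __bf16 support suffices):

typedef __bf16 vbf16x8 __attribute__ ((vector_size (16)));

/* Short vectors are returned in v0, so this is just "ldr q0, [x0]".  */
vbf16x8
pass_through (const vbf16x8 *ptr)
{
  return *ptr;
}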
@@ -10,6 +10,7 @@ typedef uint8_t svuint8_t __attribute__ ((vector_size (256)));
typedef int16_t svint16_t __attribute__ ((vector_size (256)));
typedef uint16_t svuint16_t __attribute__ ((vector_size (256)));
typedef __fp16 svfloat16_t __attribute__ ((vector_size (256)));
+typedef __bf16 svbfloat16_t __attribute__ ((vector_size (256)));
typedef int32_t svint32_t __attribute__ ((vector_size (256)));
typedef uint32_t svuint32_t __attribute__ ((vector_size (256)));
@@ -71,6 +72,15 @@ CALLEE (u16, svuint16_t)
*/
CALLEE (f16, svfloat16_t)
+/*
+** callee_bf16:
+** ptrue (p[0-7])\.b, vl256
+** ld1h z0\.h, \1/z, \[x0\]
+** st1h z0\.h, \1, \[x8\]
+** ret
+*/
+CALLEE (bf16, svbfloat16_t)
+
/*
** callee_s32:
** ptrue (p[0-7])\.b, vl256
@@ -192,6 +202,18 @@ CALLER (u16, svuint16_t)
*/
CALLER (f16, svfloat16_t)
+/*
+** caller_bf16:
+** ...
+** bl callee_bf16
+** ...
+** ld1h (z[0-9]+\.h), (p[0-7])/z, \[[^]]*\]
+** st1h \1, \2, \[[^]]*\]
+** ...
+** ret
+*/
+CALLER (bf16, svbfloat16_t)
+
/*
** caller_s32:
** ...
@@ -10,6 +10,7 @@ typedef uint8_t svuint8_t __attribute__ ((vector_size (32)));
typedef int16_t svint16_t __attribute__ ((vector_size (32)));
typedef uint16_t svuint16_t __attribute__ ((vector_size (32)));
typedef __fp16 svfloat16_t __attribute__ ((vector_size (32)));
+typedef __bf16 svbfloat16_t __attribute__ ((vector_size (32)));
typedef int32_t svint32_t __attribute__ ((vector_size (32)));
typedef uint32_t svuint32_t __attribute__ ((vector_size (32)));
@@ -71,6 +72,15 @@ CALLEE (u16, svuint16_t)
*/
CALLEE (f16, svfloat16_t)
+/*
+** callee_bf16:
+** ptrue (p[0-7])\.b, vl32
+** ld1h z0\.h, \1/z, \[x0\]
+** st1h z0\.h, \1, \[x8\]
+** ret
+*/
+CALLEE (bf16, svbfloat16_t)
+
/*
** callee_s32:
** ptrue (p[0-7])\.b, vl32
@@ -192,6 +202,18 @@ CALLER (u16, svuint16_t)
*/
CALLER (f16, svfloat16_t)
+/*
+** caller_bf16:
+** ...
+** bl callee_bf16
+** ...
+** ld1h (z[0-9]+\.h), (p[0-7])/z, \[[^]]*\]
+** st1h \1, \2, \[[^]]*\]
+** ...
+** ret
+*/
+CALLER (bf16, svbfloat16_t)
+
/*
** caller_s32:
** ...
@@ -10,6 +10,7 @@ typedef uint8_t svuint8_t __attribute__ ((vector_size (64)));
typedef int16_t svint16_t __attribute__ ((vector_size (64)));
typedef uint16_t svuint16_t __attribute__ ((vector_size (64)));
typedef __fp16 svfloat16_t __attribute__ ((vector_size (64)));
+typedef __bf16 svbfloat16_t __attribute__ ((vector_size (64)));
typedef int32_t svint32_t __attribute__ ((vector_size (64)));
typedef uint32_t svuint32_t __attribute__ ((vector_size (64)));
@@ -71,6 +72,15 @@ CALLEE (u16, svuint16_t)
*/
CALLEE (f16, svfloat16_t)
+/*
+** callee_bf16:
+** ptrue (p[0-7])\.b, vl64
+** ld1h z0\.h, \1/z, \[x0\]
+** st1h z0\.h, \1, \[x8\]
+** ret
+*/
+CALLEE (bf16, svbfloat16_t)
+
/*
** callee_s32:
** ptrue (p[0-7])\.b, vl64
@@ -192,6 +202,18 @@ CALLER (u16, svuint16_t)
*/
CALLER (f16, svfloat16_t)
+/*
+** caller_bf16:
+** ...
+** bl callee_bf16
+** ...
+** ld1h (z[0-9]+\.h), (p[0-7])/z, \[[^]]*\]
+** st1h \1, \2, \[[^]]*\]
+** ...
+** ret
+*/
+CALLER (bf16, svbfloat16_t)
+
/*
** caller_s32:
** ...
@@ -144,6 +144,34 @@ caller_f16 (void)
return svzip1 (svget2 (res, 1), svget2 (res, 0));
}
+/*
+** callee_bf16:
+** mov z0\.h, h2
+** mov z1\.h, h3
+** ret
+*/
+svbfloat16x2_t __attribute__((noipa))
+callee_bf16 (bfloat16_t h0, bfloat16_t h1, bfloat16_t h2, bfloat16_t h3)
+{
+ return svcreate2 (svdup_bf16 (h2), svdup_bf16 (h3));
+}
+
+/*
+** caller_bf16:
+** ...
+** bl callee_bf16
+** zip2 z0\.h, z1\.h, z0\.h
+** ldp x29, x30, \[sp\], 16
+** ret
+*/
+svbfloat16_t __attribute__((noipa))
+caller_bf16 (bfloat16_t h0, bfloat16_t h1, bfloat16_t h2, bfloat16_t h3)
+{
+ svbfloat16x2_t res;
+ res = callee_bf16 (h0, h1, h2, h3);
+ return svzip2 (svget2 (res, 1), svget2 (res, 0));
+}
+
/*
** callee_s32:
** mov z0\.s, #1
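
This pins down the PCS for svbfloat16x2_t: the tuple's halves occupy consecutive Z registers (z0/z1) on both return and use. The same tuple type is what the structured loads produce; a brief sketch, assuming the two-register bf16 load added elsewhere in this series and with even_lanes as an illustrative name:

#include <arm_sve.h>

/* svld2 deinterleaves element pairs: even-indexed bf16 values land
   in half 0 of the tuple, odd-indexed ones in half 1.  */
svbfloat16_t
even_lanes (svbool_t pg, const bfloat16_t *ptr)
{
  svbfloat16x2_t pair = svld2 (pg, ptr);
  return svget2 (pair, 0);
}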
@@ -159,6 +159,35 @@ caller_f16 (void)
svget3 (res, 0), svget3 (res, 1), svget3 (res, 2));
}
+/*
+** callee_bf16:
+** mov z0\.h, h0
+** mov z1\.h, h1
+** mov z2\.h, h2
+** ret
+*/
+svbfloat16x3_t __attribute__((noipa))
+callee_bf16 (bfloat16_t h0, bfloat16_t h1, bfloat16_t h2)
+{
+ return svcreate3 (svdup_bf16 (h0), svdup_bf16 (h1), svdup_bf16 (h2));
+}
+
+/*
+** caller_bf16:
+** ...
+** bl callee_bf16
+** trn2 z0\.h, z0\.h, z2\.h
+** ldp x29, x30, \[sp\], 16
+** ret
+*/
+svbfloat16_t __attribute__((noipa))
+caller_bf16 (bfloat16_t h0, bfloat16_t h1, bfloat16_t h2)
+{
+ svbfloat16x3_t res;
+ res = callee_bf16 (h0, h1, h2);
+ return svtrn2 (svget3 (res, 0), svget3 (res, 2));
+}
+
/*
** callee_s32:
** mov z0\.s, #1
@@ -184,6 +184,39 @@ caller_f16 (void)
svget4 (res, 3));
}
+/*
+** callee_bf16:
+** mov z0\.h, h4
+** mov z1\.h, h5
+** mov z2\.h, h6
+** mov z3\.h, h7
+** ret
+*/
+svbfloat16x4_t __attribute__((noipa))
+callee_bf16 (bfloat16_t h0, bfloat16_t h1, bfloat16_t h2, bfloat16_t h3,
+ bfloat16_t h4, bfloat16_t h5, bfloat16_t h6, bfloat16_t h7)
+{
+ return svcreate4 (svdup_bf16 (h4), svdup_bf16 (h5),
+ svdup_bf16 (h6), svdup_bf16 (h7));
+}
+
+/*
+** caller_bf16:
+** ...
+** bl callee_bf16
+** trn2 z0\.h, z0\.h, z3\.h
+** ldp x29, x30, \[sp\], 16
+** ret
+*/
+svbfloat16_t __attribute__((noipa))
+caller_bf16 (bfloat16_t h0, bfloat16_t h1, bfloat16_t h2, bfloat16_t h3,
+ bfloat16_t h4, bfloat16_t h5, bfloat16_t h6, bfloat16_t h7)
+{
+ svbfloat16x4_t res;
+ res = callee_bf16 (h0, h1, h2, h3, h4, h5, h6, h7);
+ return svtrn2 (svget4 (res, 0), svget4 (res, 3));
+}
+
/*
** callee_s32:
** mov z0\.s, #1
new file mode 100644
@@ -0,0 +1,30 @@
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+/*
+** tbl2_bf16_tied1:
+** tbl z0\.h, {z0\.h(?:, | - )z1\.h}, z4\.h
+** ret
+*/
+TEST_TBL2 (tbl2_bf16_tied1, svbfloat16x2_t, svbfloat16_t, svuint16_t,
+ z0_res = svtbl2_bf16 (z0, z4),
+ z0_res = svtbl2 (z0, z4))
+
+/*
+** tbl2_bf16_tied2:
+** tbl z0\.h, {z1\.h(?:, | - )z2\.h}, z0\.h
+** ret
+*/
+TEST_TBL2_REV (tbl2_bf16_tied2, svbfloat16x2_t, svbfloat16_t, svuint16_t,
+ z0_res = svtbl2_bf16 (z1, z0),
+ z0_res = svtbl2 (z1, z0))
+
+/*
+** tbl2_bf16_untied:
+** tbl z0\.h, {z2\.h(?:, | - )z3\.h}, z4\.h
+** ret
+*/
+TEST_TBL2 (tbl2_bf16_untied, svbfloat16x2_t, svbfloat16_t, svuint16_t,
+ z0_res = svtbl2_bf16 (z2, z4),
+ z0_res = svtbl2 (z2, z4))
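
svtbl2 treats the tuple operand as a single double-length table indexed by the svuint16_t operand, which is what the {z0.h - z1.h} register-pair syntax above reflects. A small usage sketch (even_across_pair and the index pattern are illustrative):

#include <arm_sve.h>

/* Indices 0, 2, 4, ... address the concatenation of both halves,
   selecting the even-indexed elements across the pair.  */
svbfloat16_t
even_across_pair (svbfloat16x2_t table)
{
  svuint16_t indices = svindex_u16 (0, 2);
  return svtbl2 (table, indices);
}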
new file mode 100644
@@ -0,0 +1,37 @@
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+/*
+** tbx_bf16_tied1:
+** tbx z0\.h, z1\.h, z4\.h
+** ret
+*/
+TEST_DUAL_Z (tbx_bf16_tied1, svbfloat16_t, svuint16_t,
+ z0 = svtbx_bf16 (z0, z1, z4),
+ z0 = svtbx (z0, z1, z4))
+
+/* Bad RA choice: no preferred output sequence. */
+TEST_DUAL_Z (tbx_bf16_tied2, svbfloat16_t, svuint16_t,
+ z0 = svtbx_bf16 (z1, z0, z4),
+ z0 = svtbx (z1, z0, z4))
+
+/* Bad RA choice: no preferred output sequence. */
+TEST_DUAL_Z_REV (tbx_bf16_tied3, svbfloat16_t, svuint16_t,
+ z0_res = svtbx_bf16 (z4, z5, z0),
+ z0_res = svtbx (z4, z5, z0))
+
+/*
+** tbx_bf16_untied:
+** (
+** mov z0\.d, z1\.d
+** tbx z0\.h, z2\.h, z4\.h
+** |
+** tbx z1\.h, z2\.h, z4\.h
+** mov z0\.d, z1\.d
+** )
+** ret
+*/
+TEST_DUAL_Z (tbx_bf16_untied, svbfloat16_t, svuint16_t,
+ z0 = svtbx_bf16 (z1, z2, z4),
+ z0 = svtbx (z1, z2, z4))
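
svtbx is the merging form: in-range indices select from the table, while out-of-range lanes keep the corresponding element of the first operand, which is why the untied case above must first mov the fallback into z0. A minimal sketch (lookup_with_default is an illustrative name):

#include <arm_sve.h>

/* Lanes whose index is out of range for TABLE are left holding the
   matching element of FALLBACK.  */
svbfloat16_t
lookup_with_default (svbfloat16_t fallback, svbfloat16_t table,
		     svuint16_t indices)
{
  return svtbx (fallback, table, indices);
}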
new file mode 100644
@@ -0,0 +1,50 @@
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */
+
+#include "test_sve_acle.h"
+
+/*
+** whilerw_rr_bf16:
+** whilerw p0\.h, x0, x1
+** ret
+*/
+TEST_COMPARE_S (whilerw_rr_bf16, const bfloat16_t *,
+ p0 = svwhilerw_bf16 (x0, x1),
+ p0 = svwhilerw (x0, x1))
+
+/*
+** whilerw_0r_bf16:
+** whilerw p0\.h, xzr, x1
+** ret
+*/
+TEST_COMPARE_S (whilerw_0r_bf16, const bfloat16_t *,
+ p0 = svwhilerw_bf16 ((const bfloat16_t *) 0, x1),
+ p0 = svwhilerw ((const bfloat16_t *) 0, x1))
+
+/*
+** whilerw_cr_bf16:
+** mov (x[0-9]+), #?1073741824
+** whilerw p0\.h, \1, x1
+** ret
+*/
+TEST_COMPARE_S (whilerw_cr_bf16, const bfloat16_t *,
+ p0 = svwhilerw_bf16 ((const bfloat16_t *) 1073741824, x1),
+ p0 = svwhilerw ((const bfloat16_t *) 1073741824, x1))
+
+/*
+** whilerw_r0_bf16:
+** whilerw p0\.h, x0, xzr
+** ret
+*/
+TEST_COMPARE_S (whilerw_r0_bf16, const bfloat16_t *,
+ p0 = svwhilerw_bf16 (x0, (const bfloat16_t *) 0),
+ p0 = svwhilerw (x0, (const bfloat16_t *) 0))
+
+/*
+** whilerw_rc_bf16:
+** mov (x[0-9]+), #?1073741824
+** whilerw p0\.h, x0, \1
+** ret
+*/
+TEST_COMPARE_S (whilerw_rc_bf16, const bfloat16_t *,
+ p0 = svwhilerw_bf16 (x0, (const bfloat16_t *) 1073741824),
+ p0 = svwhilerw (x0, (const bfloat16_t *) 1073741824))
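
svwhilerw builds a predicate whose active .h lanes are those the two addresses allow to be processed together without a read-after-write hazard; the constant-address cases above only check that a non-register operand is first materialized in a GPR. Its direct use is a one-liner (safe_bf16_lanes is an illustrative name):

#include <arm_sve.h>

/* The result can serve as the governing predicate of a vector loop
   whose load and store buffers may overlap.  */
svbool_t
safe_bf16_lanes (const bfloat16_t *load_ptr, const bfloat16_t *store_ptr)
{
  return svwhilerw (load_ptr, store_ptr);
}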
new file mode 100644
@@ -0,0 +1,50 @@
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */
+
+#include "test_sve_acle.h"
+
+/*
+** whilewr_rr_bf16:
+** whilewr p0\.h, x0, x1
+** ret
+*/
+TEST_COMPARE_S (whilewr_rr_bf16, const bfloat16_t *,
+ p0 = svwhilewr_bf16 (x0, x1),
+ p0 = svwhilewr (x0, x1))
+
+/*
+** whilewr_0r_bf16:
+** whilewr p0\.h, xzr, x1
+** ret
+*/
+TEST_COMPARE_S (whilewr_0r_bf16, const bfloat16_t *,
+ p0 = svwhilewr_bf16 ((const bfloat16_t *) 0, x1),
+ p0 = svwhilewr ((const bfloat16_t *) 0, x1))
+
+/*
+** whilewr_cr_bf16:
+** mov (x[0-9]+), #?1073741824
+** whilewr p0\.h, \1, x1
+** ret
+*/
+TEST_COMPARE_S (whilewr_cr_bf16, const bfloat16_t *,
+ p0 = svwhilewr_bf16 ((const bfloat16_t *) 1073741824, x1),
+ p0 = svwhilewr ((const bfloat16_t *) 1073741824, x1))
+
+/*
+** whilewr_r0_bf16:
+** whilewr p0\.h, x0, xzr
+** ret
+*/
+TEST_COMPARE_S (whilewr_r0_bf16, const bfloat16_t *,
+ p0 = svwhilewr_bf16 (x0, (const bfloat16_t *) 0),
+ p0 = svwhilewr (x0, (const bfloat16_t *) 0))
+
+/*
+** whilewr_rc_bf16:
+** mov (x[0-9]+), #?1073741824
+** whilewr p0\.h, x0, \1
+** ret
+*/
+TEST_COMPARE_S (whilewr_rc_bf16, const bfloat16_t *,
+ p0 = svwhilewr_bf16 (x0, (const bfloat16_t *) 1073741824),
+ p0 = svwhilewr (x0, (const bfloat16_t *) 1073741824))
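
svwhilewr is the write-after-read counterpart, and its test matrix is deliberately identical to the whilerw one. The usage sketch differs only in the hazard direction being guarded (safe_bf16_store_lanes is an illustrative name):

#include <arm_sve.h>

/* As above, but limiting lanes so a write through one pointer cannot
   conflict with a read through the other in the same iteration.  */
svbool_t
safe_bf16_store_lanes (const bfloat16_t *read_ptr,
		       const bfloat16_t *write_ptr)
{
  return svwhilewr (read_ptr, write_ptr);
}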