@@ -10250,6 +10250,8 @@ aarch64_simd_container_mode (machine_mode mode, unsigned width)
return V2DFmode;
case SFmode:
return V4SFmode;
+ case HFmode:
+ return V8HFmode;
case SImode:
return V4SImode;
case HImode:
@@ -10266,6 +10268,8 @@ aarch64_simd_container_mode (machine_mode mode, unsigned width)
{
case SFmode:
return V2SFmode;
+ case HFmode:
+ return V4HFmode;
case SImode:
return V2SImode;
case HImode:
@@ -10469,7 +10473,12 @@ sizetochar (int size)
/* Return true iff x is a uniform vector of floating-point
constants, and the constant can be represented in
quarter-precision form. Note, as aarch64_float_const_representable
- rejects both +0.0 and -0.0, we will also reject +0.0 and -0.0. */
+ rejects both +0.0 and -0.0, we will also reject +0.0 and -0.0.
+ Also note that this won't ever be called for V*HFmode vectors,
+ since in aarch64_simd_valid_immediate () we check for the mode
+ and handle these vector types differently from other floating
+ point vector modes. */
+
static bool
aarch64_vect_float_const_representable_p (rtx x)
{
@@ -10505,7 +10514,10 @@ aarch64_simd_valid_immediate (rtx op, machine_mode mode, bool inverse,
unsigned int invmask = inverse ? 0xff : 0;
int eshift, emvn;
- if (GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
+ /* Ignore V*HFmode vectors, they are handled below with the integer
+ code. */
+ if (GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT
+ && GET_MODE_INNER (mode) != HFmode)
{
if (! (aarch64_simd_imm_zero_p (op, mode)
|| aarch64_vect_float_const_representable_p (op)))
@@ -10530,15 +10542,26 @@ aarch64_simd_valid_immediate (rtx op, machine_mode mode, bool inverse,
rtx el = CONST_VECTOR_ELT (op, BYTES_BIG_ENDIAN ? (n_elts - 1 - i) : i);
unsigned HOST_WIDE_INT elpart;
- gcc_assert (CONST_INT_P (el));
- elpart = INTVAL (el);
+ if (CONST_INT_P (el))
+ elpart = INTVAL (el);
+ /* Convert HFmode vector element to bit pattern. Logic below will catch
+ most common constants since for FP16 the sign and exponent are in the
+ top 6 bits and a movi with a left shift of 8 will catch all powers
+ of 2 that fit in a 16 bit floating point, and the 2 extra bits left
+ for the mantissa can cover some more non-power of 2 constants. With
+ a 0 left shift, we can cover constants of the form 1.xxx since we have
+ 8 bits only for the mantissa. */
+ else if (CONST_DOUBLE_P (el) && GET_MODE_INNER (mode) == HFmode)
+ elpart =
+ real_to_target (NULL, CONST_DOUBLE_REAL_VALUE (el), HFmode);
+ else
+ gcc_unreachable ();
for (unsigned int byte = 0; byte < innersize; byte++)
{
bytes[idx++] = (elpart & 0xff) ^ invmask;
elpart >>= BITS_PER_UNIT;
}
-
}
/* Sanity check. */
@@ -11913,7 +11936,10 @@ aarch64_output_simd_mov_immediate (rtx const_vector,
lane_count = width / info.element_width;
mode = GET_MODE_INNER (mode);
- if (GET_MODE_CLASS (mode) == MODE_FLOAT)
+ /* We handle HFmode vectors separately from the other floating point
+ vector modes. See aarch64_simd_valid_immediate (), but in short
+ we use a movi instruction rather than a fmov. */
+ if (GET_MODE_CLASS (mode) == MODE_FLOAT && mode != HFmode)
{
gcc_assert (info.shift == 0 && ! info.mvn);
/* For FP zero change it to a CONST_INT 0 and use the integer SIMD
new file mode 100644
@@ -0,0 +1,262 @@
+/* { dg-do run } */
+/* { dg-options "-O3 -save-temps" } */
+
+#include "arm_neon.h"
+
+extern void abort ();
+
+#define VAL_4(val) \
+ val, val, val, val \
+
+#define VAL_8(val) \
+ val, val, val, val, val, val, val, val \
+
+
+#define CHECK_1(LEN, val) \
+ do \
+ { \
+ float16x##LEN##_t a = {VAL_##LEN (val)}; \
+ for (int i = 0; i < LEN; ++i) \
+ if (a[i] != val) \
+ abort (); \
+ } \
+ while (0) \
+
+void __attribute__ ((noinline))
+f1 ()
+{
+ CHECK_1 (4, -64.0);
+ CHECK_1 (4, -32.0);
+ CHECK_1 (4, -16.0);
+ CHECK_1 (4, -8.0);
+ CHECK_1 (4, -4.0);
+ CHECK_1 (4, -2.0);
+ CHECK_1 (4, -1.0);
+
+ CHECK_1 (4, 0.0);
+
+ CHECK_1 (4, 1.0);
+ CHECK_1 (4, 2.0);
+ CHECK_1 (4, 4.0);
+ CHECK_1 (4, 8.0);
+ CHECK_1 (4, 16.0);
+ CHECK_1 (4, 32.0);
+ CHECK_1 (4, 64.0);
+
+ CHECK_1 (8, -64.0);
+ CHECK_1 (8, -32.0);
+ CHECK_1 (8, -16.0);
+ CHECK_1 (8, -8.0);
+ CHECK_1 (8, -4.0);
+ CHECK_1 (8, -2.0);
+ CHECK_1 (8, -1.0);
+
+ CHECK_1 (8, 0.0);
+
+ CHECK_1 (8, 1.0);
+ CHECK_1 (8, 2.0);
+ CHECK_1 (8, 4.0);
+ CHECK_1 (8, 8.0);
+ CHECK_1 (8, 16.0);
+ CHECK_1 (8, 32.0);
+ CHECK_1 (8, 64.0);
+}
+
+#define CHECK_2(LEN, val1, val2) \
+ do \
+ { \
+ float16x##LEN##_t a = \
+ vcreate_f16 (__AARCH64_UINT64_C (val1)); \
+ for (int i = 0; i < LEN; ++i) \
+ if (vget_lane_f16 (a, i) != val2) \
+ abort (); \
+ } \
+ while (0) \
+
+void __attribute__ ((noinline))
+f2 ()
+{
+ CHECK_2 (4, 0xd400d400d400d400, -64.0);
+ CHECK_2 (4, 0xd000d000d000d000, -32.0);
+ CHECK_2 (4, 0xcc00cc00cc00cc00, -16.0);
+ CHECK_2 (4, 0xc800c800c800c800, -8.0);
+ CHECK_2 (4, 0xc400c400c400c400, -4.0);
+ CHECK_2 (4, 0xc000c000c000c000, -2.0);
+ CHECK_2 (4, 0xbc00bc00bc00bc00, -1.0);
+
+ CHECK_2 (4, 0, 0.0);
+
+ CHECK_2 (4, 0x3c003c003c003c00, 1.0);
+ CHECK_2 (4, 0x4000400040004000, 2.0);
+ CHECK_2 (4, 0x4400440044004400, 4.0);
+ CHECK_2 (4, 0x4800480048004800, 8.0);
+ CHECK_2 (4, 0x4c004c004c004c00, 16.0);
+ CHECK_2 (4, 0x5000500050005000, 32.0);
+ CHECK_2 (4, 0x5400540054005400, 64.0);
+}
+
+#define VGET_LANE_F16_4(a, i) \
+ vget_lane_f16 (a, i) \
+
+#define VGET_LANE_F16_8(a, i) \
+ vgetq_lane_f16 (a, i) \
+
+#define VLD1_F16_4(x) \
+ vld1_f16 (x) \
+
+#define VLD1_F16_8(x) \
+ vld1q_f16 (x) \
+
+#define CHECK_3(LEN, val) \
+ do \
+ { \
+ float16_t x[] = {VAL_##LEN (val)}; \
+ float16x##LEN##_t a = \
+ VLD1_F16_##LEN (x); \
+ for (int i = 0; i < LEN; ++i) \
+ if (VGET_LANE_F16_##LEN (a, i) != val) \
+ abort (); \
+ } \
+ while (0) \
+
+void __attribute__ ((noinline))
+f3 ()
+{
+ CHECK_3 (4, -64.0);
+ CHECK_3 (4, -32.0);
+ CHECK_3 (4, -16.0);
+ CHECK_3 (4, -8.0);
+ CHECK_3 (4, -4.0);
+ CHECK_3 (4, -2.0);
+ CHECK_3 (4, -1.0);
+
+ CHECK_3 (4, 0.0);
+
+ CHECK_3 (4, 1.0);
+ CHECK_3 (4, 2.0);
+ CHECK_3 (4, 4.0);
+ CHECK_3 (4, 8.0);
+ CHECK_3 (4, 16.0);
+ CHECK_3 (4, 32.0);
+ CHECK_3 (4, 64.0);
+
+ CHECK_3 (8, -64.0);
+ CHECK_3 (8, -32.0);
+ CHECK_3 (8, -16.0);
+ CHECK_3 (8, -8.0);
+ CHECK_3 (8, -4.0);
+ CHECK_3 (8, -2.0);
+ CHECK_3 (8, -1.0);
+
+ CHECK_3 (8, 0.0);
+
+ CHECK_3 (8, 1.0);
+ CHECK_3 (8, 2.0);
+ CHECK_3 (8, 4.0);
+ CHECK_3 (8, 8.0);
+ CHECK_3 (8, 16.0);
+ CHECK_3 (8, 32.0);
+ CHECK_3 (8, 64.0);
+}
+
+#define VLD1_DUP_F16_4(x) \
+ vld1_dup_f16 (x) \
+
+#define VLD1_DUP_F16_8(x) \
+ vld1q_dup_f16 (x) \
+
+
+#define CHECK_4(LEN, val) \
+ do \
+ { \
+ float16_t x = val; \
+ float16x##LEN##_t a = \
+ VLD1_DUP_F16_##LEN (&x); \
+ for (int i = 0; i < LEN; ++i) \
+ if (VGET_LANE_F16_##LEN (a, i) != val) \
+ abort (); \
+ } \
+ while (0) \
+
+void __attribute__ ((noinline))
+f4 ()
+{
+ CHECK_4 (4, -64.0);
+ CHECK_4 (4, -32.0);
+ CHECK_4 (4, -16.0);
+ CHECK_4 (4, -8.0);
+ CHECK_4 (4, -4.0);
+ CHECK_4 (4, -2.0);
+ CHECK_4 (4, -1.0);
+
+ CHECK_4 (4, 0.0);
+
+ CHECK_4 (4, 1.0);
+ CHECK_4 (4, 2.0);
+ CHECK_4 (4, 4.0);
+ CHECK_4 (4, 8.0);
+ CHECK_4 (4, 16.0);
+ CHECK_4 (4, 32.0);
+ CHECK_4 (4, 64.0);
+
+ CHECK_4 (8, -64.0);
+ CHECK_4 (8, -32.0);
+ CHECK_4 (8, -16.0);
+ CHECK_4 (8, -8.0);
+ CHECK_4 (8, -4.0);
+ CHECK_4 (8, -2.0);
+ CHECK_4 (8, -1.0);
+
+ CHECK_4 (8, 0.0);
+
+ CHECK_4 (8, 1.0);
+ CHECK_4 (8, 2.0);
+ CHECK_4 (8, 4.0);
+ CHECK_4 (8, 8.0);
+ CHECK_4 (8, 16.0);
+ CHECK_4 (8, 32.0);
+ CHECK_4 (8, 64.0);
+}
+
+int
+main ()
+{
+ f1 ();
+ f2 ();
+ f3 ();
+ f4 ();
+ return 0;
+}
+
+/* We are searching for 7 movi for each constant except zero. The functions f1
+ (), f3 (), f4 () check for both V4HF and V8HF modes, while f2 () checks
+ only for V4HF, hence the 14 directive lines. The constants are in hex,
+ and the list is here:
+ 0xd4 -> -64
+ 0xd0 -> -32
+ 0xcc -> -16
+ etc...
+ 0x4c -> 16
+ 0x50 -> 32
+ 0x54 -> 64. */
+
+/* { dg-final { scan-assembler-times "movi\t\[vV\]\[0-9\]+\.\[48\]\[hH\], ?0xd4, ?lsl 8\n" 7 } } */
+/* { dg-final { scan-assembler-times "movi\t\[vV\]\[0-9\]+\.\[48\]\[hH\], ?0xd0, ?lsl 8\n" 7 } } */
+/* { dg-final { scan-assembler-times "movi\t\[vV\]\[0-9\]+\.\[48\]\[hH\], ?0xcc, ?lsl 8\n" 7 } } */
+/* { dg-final { scan-assembler-times "movi\t\[vV\]\[0-9\]+\.\[48\]\[hH\], ?0xc8, ?lsl 8\n" 7 } } */
+/* { dg-final { scan-assembler-times "movi\t\[vV\]\[0-9\]+\.\[48\]\[hH\], ?0xc4, ?lsl 8\n" 7 } } */
+/* { dg-final { scan-assembler-times "movi\t\[vV\]\[0-9\]+\.\[48\]\[hH\], ?0xc0, ?lsl 8\n" 7 } } */
+/* { dg-final { scan-assembler-times "movi\t\[vV\]\[0-9\]+\.\[48\]\[hH\], ?0xbc, ?lsl 8\n" 7 } } */
+/* { dg-final { scan-assembler-times "movi\t\[vV\]\[0-9\]+\.\[48\]\[hH\], ?0x3c, ?lsl 8\n" 7 } } */
+/* { dg-final { scan-assembler-times "movi\t\[vV\]\[0-9\]+\.\[48\]\[hH\], ?0x40, ?lsl 8\n" 7 } } */
+/* { dg-final { scan-assembler-times "movi\t\[vV\]\[0-9\]+\.\[48\]\[hH\], ?0x44, ?lsl 8\n" 7 } } */
+/* { dg-final { scan-assembler-times "movi\t\[vV\]\[0-9\]+\.\[48\]\[hH\], ?0x48, ?lsl 8\n" 7 } } */
+/* { dg-final { scan-assembler-times "movi\t\[vV\]\[0-9\]+\.\[48\]\[hH\], ?0x4c, ?lsl 8\n" 7 } } */
+/* { dg-final { scan-assembler-times "movi\t\[vV\]\[0-9\]+\.\[48\]\[hH\], ?0x50, ?lsl 8\n" 7 } } */
+/* { dg-final { scan-assembler-times "movi\t\[vV\]\[0-9\]+\.\[48\]\[hH\], ?0x54, ?lsl 8\n" 7 } } */
+
+/* For the constant zero, the instruction emitted is a movi but with a different
+ size for the vector lane. Also, since f2 () only tests V4HF, we have 1 less
+ case to check for V8HF. V4HF mode emits v*.2s, V8HF emits v*.4s. */
+/* { dg-final { scan-assembler-times "movi\t\[vV\]\[0-9\]+\.\[2\]\[sS\], ?0\n" 4 } } */
+/* { dg-final { scan-assembler-times "movi\t\[vV\]\[0-9\]+\.\[4\]\[sS\], ?0\n" 3 } } */