diff mbox series

[AArch64] Vectorise __builtin_signbit on aarch64

Message ID VI1PR0801MB2062E38C8E95A28B004FCFF4E4420@VI1PR0801MB2062.eurprd08.prod.outlook.com
State New
Headers show
Series [AArch64] Vectorise __builtin_signbit on aarch64 | expand

Commit Message

Przemyslaw Wirkus March 21, 2019, 9:32 a.m. UTC
Hi all,

Vectorise __builtin_signbit (v4sf) using the vector unsigned shift right
(USHR) instruction.

Bootstrapped and tested on aarch64-none-linux-gnu.

Assembly output for:
$ aarch64-elf-gcc -S -O3 signbitv4sf.c -dp

Before patch:

foo:
	adrp	x3, in	// 37	[c=4 l=4]  *movdi_aarch64/12
	adrp	x2, out	// 40	[c=4 l=4]  *movdi_aarch64/12
	add	x3, x3, :lo12:in	// 39	[c=4 l=4]  add_losym_di
	add	x2, x2, :lo12:out	// 42	[c=4 l=4]  add_losym_di
	mov	x0, 0	// 3	[c=4 l=4]  *movdi_aarch64/3
	.p2align 3,,7
.L2:
	ldr	w1, [x3, x0]	// 10	[c=16 l=4]  *zero_extendsidi2_aarch64/1
	and	w1, w1, -2147483648	// 11	[c=4 l=4]  andsi3/1
	str	w1, [x2, x0]	// 16	[c=4 l=4]  *movsi_aarch64/8
	add	x0, x0, 4	// 17	[c=4 l=4]  *adddi3_aarch64/0
	cmp	x0, 4096	// 19	[c=4 l=4]  cmpdi/1
	bne	.L2		// 20	[c=4 l=4]  condjump
	ret		// 50	[c=0 l=4]  *do_return

After patch:

foo:
	adrp	x2, in	// 36	[c=4 l=4]  *movdi_aarch64/12
	adrp	x1, out	// 39	[c=4 l=4]  *movdi_aarch64/12
	add	x2, x2, :lo12:in	// 38	[c=4 l=4]  add_losym_di
	add	x1, x1, :lo12:out	// 41	[c=4 l=4]  add_losym_di
	mov	x0, 0	// 3	[c=4 l=4]  *movdi_aarch64/3
	.p2align 3,,7
.L2:
	ldr	q0, [x2, x0]	// 10	[c=8 l=4]  *aarch64_simd_movv4sf/0
	ushr	v0.4s, v0.4s, 31	// 11	[c=12 l=4]  aarch64_simd_lshrv4si
	str	q0, [x1, x0]	// 15	[c=4 l=4]  *aarch64_simd_movv4si/2
	add	x0, x0, 16	// 16	[c=4 l=4]  *adddi3_aarch64/0
	cmp	x0, 4096	// 18	[c=4 l=4]  cmpdi/1
	bne	.L2		// 19	[c=4 l=4]  condjump
	ret		// 49	[c=0 l=4]  *do_return

Thanks,
Przemyslaw

gcc/ChangeLog:

2019-03-20  Przemyslaw Wirkus  <przemyslaw.wirkus@arm.com>

	* config/aarch64/aarch64-builtins.c
	(aarch64_builtin_vectorized_function): Add CASE_CFN_SIGNBIT.
	* config/aarch64/aarch64-simd-builtins.def (signbit): Extend to
	V4SF mode.
	* config/aarch64/aarch64-simd.md (signbitv4sf2): New expand
	defined.

gcc/testsuite/ChangeLog:

2019-02-28  Przemyslaw Wirkus  <przemyslaw.wirkus@arm.com>

	* gcc.target/aarch64/signbitv4sf.c: New test.

Comments

Richard Sandiford March 22, 2019, 11:18 a.m. UTC | #1
Hi,

Przemyslaw Wirkus <Przemyslaw.Wirkus@arm.com> writes:
> Hi all,
>
> Vectorise __builtin_signbit (v4sf) with unsigned shift right vector
> instruction.
>
> Bootstrapped and tested on aarch64-none-linux-gnu.
>
> Assembly output for:
> $ aarch64-elf-gcc -S -O3 signbitv4sf.c -dp
>
> Before patch:
>
> foo:
> 	adrp	x3, in	// 37	[c=4 l=4]  *movdi_aarch64/12
> 	adrp	x2, out	// 40	[c=4 l=4]  *movdi_aarch64/12
> 	add	x3, x3, :lo12:in	// 39	[c=4 l=4]  add_losym_di
> 	add	x2, x2, :lo12:out	// 42	[c=4 l=4]  add_losym_di
> 	mov	x0, 0	// 3	[c=4 l=4]  *movdi_aarch64/3
> 	.p2align 3,,7
> .L2:
> 	ldr	w1, [x3, x0]	// 10	[c=16 l=4]  *zero_extendsidi2_aarch64/1
> 	and	w1, w1, -2147483648	// 11	[c=4 l=4]  andsi3/1
> 	str	w1, [x2, x0]	// 16	[c=4 l=4]  *movsi_aarch64/8
> 	add	x0, x0, 4	// 17	[c=4 l=4]  *adddi3_aarch64/0
> 	cmp	x0, 4096	// 19	[c=4 l=4]  cmpdi/1
> 	bne	.L2		// 20	[c=4 l=4]  condjump
> 	ret		// 50	[c=0 l=4]  *do_return
>
> After patch:
>
> foo:
> 	adrp	x2, in	// 36	[c=4 l=4]  *movdi_aarch64/12
> 	adrp	x1, out	// 39	[c=4 l=4]  *movdi_aarch64/12
> 	add	x2, x2, :lo12:in	// 38	[c=4 l=4]  add_losym_di
> 	add	x1, x1, :lo12:out	// 41	[c=4 l=4]  add_losym_di
> 	mov	x0, 0	// 3	[c=4 l=4]  *movdi_aarch64/3
> 	.p2align 3,,7
> .L2:
> 	ldr	q0, [x2, x0]	// 10	[c=8 l=4]  *aarch64_simd_movv4sf/0
> 	ushr	v0.4s, v0.4s, 31	// 11	[c=12 l=4]  aarch64_simd_lshrv4si
> 	str	q0, [x1, x0]	// 15	[c=4 l=4]  *aarch64_simd_movv4si/2
> 	add	x0, x0, 16	// 16	[c=4 l=4]  *adddi3_aarch64/0
> 	cmp	x0, 4096	// 18	[c=4 l=4]  cmpdi/1
> 	bne	.L2		// 19	[c=4 l=4]  condjump
> 	ret		// 49	[c=0 l=4]  *do_return
>
> Thanks,
> Przemyslaw
>
> gcc/ChangeLog:
>
> 2019-03-20  Przemyslaw Wirkus  <przemyslaw.wirkus@arm.com>
>
> 	* config/aarch64/aarch64-builtins.c
> 	(aarch64_builtin_vectorized_function): Added CASE_CFN_SIGNBIT.
> 	* config/aarch64/aarch64-simd-builtins.def: (signbit)
> 	Extend to V4SF mode.
> 	* config/aarch64/aarch64-simd.md (signbitv4sf2): New expand
> 	defined.

I think it'd be better to add a new IFN_SIGNBIT internal function
that maps to signbit_optab.  That way the compiler will know what
the vector function does and there'll be no need to add a new
built-in function.

Thanks,
Richard
Przemyslaw Wirkus May 3, 2019, 8:46 a.m. UTC | #2
Hi Richard,
New patch adds a new IFN_SIGNBIT internal function that maps
to signbit_optab.

gcc/ChangeLog:

2019-05-05  Przemyslaw Wirkus  <przemyslaw.wirkus@arm.com>

	* gcc/internal-fn.def (SIGNBIT): New.
	* gcc/config/aarch64/aarch64-simd.md (signbitv4sf2): New expand
	defined.

gcc/testsuite/ChangeLog:

2019-05-05  Przemyslaw Wirkus  <przemyslaw.wirkus@arm.com>

	* gcc/testsuite/gcc.target/aarch64/signbitv4sf.c: New test.
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index e3852c5d182b70978d7603225fce55c0b8ee2894..3374ce95b912cceaca49660df0579467f758974d 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -935,6 +935,21 @@
   [(set_attr "type" "neon_ins<q>")]
 )
 
+(define_expand "signbitv4sf2"
+  [(use (match_operand:V4SI 0 "register_operand"))
+   (use (match_operand:V4SF 1 "register_operand"))]
+  "TARGET_SIMD"
+{
+  int shift_amount = GET_MODE_UNIT_BITSIZE (V4SImode) - 1;
+  rtx shift_vector = aarch64_simd_gen_const_vector_dup (V4SImode,
+                          shift_amount);
+  operands[1] = lowpart_subreg (V4SImode, operands[1], V4SFmode);
+
+  emit_insn (gen_aarch64_simd_lshrv4si (operands[0], operands[1],
+                  shift_vector));
+  DONE;
+})
+
 (define_insn "aarch64_simd_lshr<mode>"
  [(set (match_operand:VDQ_I 0 "register_operand" "=w")
        (lshiftrt:VDQ_I (match_operand:VDQ_I 1 "register_operand" "w")
diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
index e370eaa84767839c827b6ebd0c86303bcc36fa54..016301a58d83d7128817824d7c7ef92825c7e03e 100644
--- a/gcc/internal-fn.def
+++ b/gcc/internal-fn.def
@@ -217,6 +217,7 @@ DEF_INTERNAL_FLT_FN (LOG10, ECF_CONST, log10, unary)
 DEF_INTERNAL_FLT_FN (LOG1P, ECF_CONST, log1p, unary)
 DEF_INTERNAL_FLT_FN (LOG2, ECF_CONST, log2, unary)
 DEF_INTERNAL_FLT_FN (LOGB, ECF_CONST, logb, unary)
+DEF_INTERNAL_FLT_FN (SIGNBIT, ECF_CONST, signbit, unary)
 DEF_INTERNAL_FLT_FN (SIGNIFICAND, ECF_CONST, significand, unary)
 DEF_INTERNAL_FLT_FN (SIN, ECF_CONST, sin, unary)
 DEF_INTERNAL_FLT_FN (SINH, ECF_CONST, sinh, unary)
diff --git a/gcc/testsuite/gcc.target/aarch64/signbitv4sf.c b/gcc/testsuite/gcc.target/aarch64/signbitv4sf.c
new file mode 100644
index 0000000000000000000000000000000000000000..aa06a5df1dbb3e295355d485b39963127a828b68
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/signbitv4sf.c
@@ -0,0 +1,35 @@
+/* { dg-do run } */
+/* { dg-additional-options "-O3 --save-temps" } */
+
+extern void abort ();
+
+#define N 1024
+float in[N] = {1.0, -1.0, -2.0, 3.0, -5.0, -8.0, 13.0, 21.0};
+int out[N];
+
+void
+foo ()
+{
+  for (int i = 0; i < N; i++)
+    out[i] = __builtin_signbit (in[i]);
+}
+
+/* { dg-final { scan-assembler-not {-2147483648} } } */
+/* { dg-final { scan-assembler {\tushr\tv[0-9]+.4s, v[0-9]+.4s, 31} } } */
+
+int
+main ()
+{
+  foo ();
+
+  for (int i = 0; i < N; i++)
+  {
+    if (in[i] >= 0.0 && out[i])
+      abort ();
+    if (in[i] < 0.0 && !out[i])
+      abort ();
+  }
+
+  return 0;
+}
+
Richard Sandiford May 4, 2019, 9:23 a.m. UTC | #3
Przemyslaw Wirkus <Przemyslaw.Wirkus@arm.com> writes:
> Hi Richard,
> New patch adds a new IFN_SIGNBIT internal function that maps
> to signbit_optab.

Thanks.

> gcc/ChangeLog:
>
> 2019-05-05  Przemyslaw Wirkus  <przemyslaw.wirkus@arm.com>
>
> 	* gcc/internal-fn.def (SIGNBIT): New.
> 	* gcc/config/aarch64/aarch64-simd.md (signbitv4sf2): New expand
> 	defined.

Sorry for the nitpicks (I'm not really a fan of ChangeLogs), but:
the filenames are relative to the changelog file, so no "gcc/" here and

> gcc/testsuite/ChangeLog:
>
> 2019-05-05  Przemyslaw Wirkus  <przemyslaw.wirkus@arm.com>
>
> 	* gcc/testsuite/gcc.target/aarch64/signbitv4sf.c: New test.

no "gcc/testsuite/" here.

> diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
> index e3852c5d182b70978d7603225fce55c0b8ee2894..3374ce95b912cceaca49660df0579467f758974d 100644
> --- a/gcc/config/aarch64/aarch64-simd.md
> +++ b/gcc/config/aarch64/aarch64-simd.md
> @@ -935,6 +935,21 @@
>    [(set_attr "type" "neon_ins<q>")]
>  )
>  
> +(define_expand "signbitv4sf2"
> +  [(use (match_operand:V4SI 0 "register_operand"))
> +   (use (match_operand:V4SF 1 "register_operand"))]
> +  "TARGET_SIMD"
> +{
> +  int shift_amount = GET_MODE_UNIT_BITSIZE (V4SImode) - 1;
> +  rtx shift_vector = aarch64_simd_gen_const_vector_dup (V4SImode,
> +                          shift_amount);
> +  operands[1] = lowpart_subreg (V4SImode, operands[1], V4SFmode);
> +
> +  emit_insn (gen_aarch64_simd_lshrv4si (operands[0], operands[1],
> +                  shift_vector));

Formatting nit: argument should be indented to the column after the
innermost unclosed "(".

> +  DONE;
> +})
> +

Looks good, but I think it can be generalised to handle v2sf if you use:

- :VDQSF instead of :V4SF
- <MODE> instead of other instances of V4SF (and <mode> instead of v4sf)
- <V_INT_EQUIV> instead of V4SI (and <v_int_equiv> instead of v4si)

E.g. this will handle SLP instances like:

void
f (int *i, float *f)
{
  i[0] = __builtin_signbitf (f[0]);
  i[1] = __builtin_signbitf (f[1]);
}

It could also be used for epilogue loop vectorisation, if we ever
turn that on by default for AArch64.

Thanks,
Richard
Przemyslaw Wirkus May 13, 2019, 10:53 a.m. UTC | #4
Hi all,

Vectorise __builtin_signbit (v2sf, v4sf) using the vector unsigned shift right
(USHR) instruction.

Bootstrapped and tested on aarch64-none-linux-gnu.

Assembly output for:
$ aarch64-elf-gcc -S -O3 signbitv2sf.c -dp

Before patch:

foo:
	ldp	w2, w1, [x1]	// 37	[c=0 l=4]  *load_pair_zero_extendsidi2_aarch64/0
	and	w2, w2, -2147483648	// 8	[c=4 l=4]  andsi3/1
	and	w1, w1, -2147483648	// 12	[c=4 l=4]  andsi3/1
	stp	w2, w1, [x0]	// 38	[c=0 l=4]  store_pair_sw_sisi/0
	ret		// 32	[c=0 l=4]  *do_return

After patch:

foo:
	ldr	d0, [x1]	// 7	[c=8 l=4]  *aarch64_simd_movv2sf/0
	ushr	v0.2s, v0.2s, 31	// 8	[c=12 l=4]  aarch64_simd_lshrv2si
	str	d0, [x0]	// 9	[c=4 l=4]  *aarch64_simd_movv2si/2
	ret		// 28	[c=0 l=4]  *do_return

Assembly output for:
$ aarch64-elf-gcc -S -O3 signbitv4sf.c -dp

Before patch:

foo:
	adrp	x3, in	// 38	[c=4 l=4]  *movdi_aarch64/12
	adrp	x2, out	// 41	[c=4 l=4]  *movdi_aarch64/12
	add	x3, x3, :lo12:in	// 40	[c=4 l=4]  add_losym_di
	add	x2, x2, :lo12:out	// 43	[c=4 l=4]  add_losym_di
	mov	x0, 0	// 3	[c=4 l=4]  *movdi_aarch64/3
	.p2align 3,,7
.L2:
	ldr	w1, [x3, x0]	// 10	[c=16 l=4]  *zero_extendsidi2_aarch64/1
	and	w1, w1, -2147483648	// 11	[c=4 l=4]  andsi3/1
	str	w1, [x2, x0]	// 16	[c=4 l=4]  *movsi_aarch64/8
	add	x0, x0, 4	// 17	[c=4 l=4]  *adddi3_aarch64/0
	cmp	x0, 4096	// 19	[c=4 l=4]  cmpdi/1
	bne	.L2		// 20	[c=4 l=4]  condjump
	ret		// 51	[c=0 l=4]  *do_return

After patch:

foo:
	adrp	x2, in	// 37	[c=4 l=4]  *movdi_aarch64/12
	adrp	x1, out	// 40	[c=4 l=4]  *movdi_aarch64/12
	add	x2, x2, :lo12:in	// 39	[c=4 l=4]  add_losym_di
	add	x1, x1, :lo12:out	// 42	[c=4 l=4]  add_losym_di
	mov	x0, 0	// 3	[c=4 l=4]  *movdi_aarch64/3
	.p2align 3,,7
.L2:
	ldr	q0, [x2, x0]	// 10	[c=8 l=4]  *aarch64_simd_movv4sf/0
	ushr	v0.4s, v0.4s, 31	// 11	[c=12 l=4]  aarch64_simd_lshrv4si
	str	q0, [x1, x0]	// 15	[c=4 l=4]  *aarch64_simd_movv4si/2
	add	x0, x0, 16	// 16	[c=4 l=4]  *adddi3_aarch64/0
	cmp	x0, 4096	// 18	[c=4 l=4]  cmpdi/1
	bne	.L2		// 19	[c=4 l=4]  condjump
	ret		// 50	[c=0 l=4]  *do_return

OK for trunk?

Thanks,
Przemyslaw

gcc/ChangeLog:

2019-05-13  Przemyslaw Wirkus  <przemyslaw.wirkus@arm.com\>

	* internal-fn.def (SIGNBIT): New.
	* config/aarch64/aarch64-simd.md (signbitv2sf2): New expand
	defined.
	(signbitv4sf2): Likewise.

gcc/testsuite/ChangeLog:

2019-05-13  Przemyslaw Wirkus  <przemyslaw.wirkus@arm.com\>

	* gcc.target/aarch64/signbitv4sf.c: New test.
	* gcc.target/aarch64/signbitv2sf.c: New test.
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index e3852c5d182b70978d7603225fce55c0b8ee2894..8f7227327cb960fb34c7b88e1bf283f8f17a3be9 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -935,6 +935,21 @@
   [(set_attr "type" "neon_ins<q>")]
 )
 
+(define_expand "signbit<mode>2"
+  [(use (match_operand:<V_INT_EQUIV> 0 "register_operand"))
+   (use (match_operand:VDQSF 1 "register_operand"))]
+  "TARGET_SIMD"
+{
+  int shift_amount = GET_MODE_UNIT_BITSIZE (<V_INT_EQUIV>mode) - 1;
+  rtx shift_vector = aarch64_simd_gen_const_vector_dup (<V_INT_EQUIV>mode,
+                                                        shift_amount);
+  operands[1] = lowpart_subreg (<V_INT_EQUIV>mode, operands[1], <MODE>mode);
+
+  emit_insn (gen_aarch64_simd_lshr<v_int_equiv> (operands[0], operands[1],
+                                                 shift_vector));
+  DONE;
+})
+
 (define_insn "aarch64_simd_lshr<mode>"
  [(set (match_operand:VDQ_I 0 "register_operand" "=w")
        (lshiftrt:VDQ_I (match_operand:VDQ_I 1 "register_operand" "w")
diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
index e370eaa84767839c827b6ebd0c86303bcc36fa54..016301a58d83d7128817824d7c7ef92825c7e03e 100644
--- a/gcc/internal-fn.def
+++ b/gcc/internal-fn.def
@@ -217,6 +217,7 @@ DEF_INTERNAL_FLT_FN (LOG10, ECF_CONST, log10, unary)
 DEF_INTERNAL_FLT_FN (LOG1P, ECF_CONST, log1p, unary)
 DEF_INTERNAL_FLT_FN (LOG2, ECF_CONST, log2, unary)
 DEF_INTERNAL_FLT_FN (LOGB, ECF_CONST, logb, unary)
+DEF_INTERNAL_FLT_FN (SIGNBIT, ECF_CONST, signbit, unary)
 DEF_INTERNAL_FLT_FN (SIGNIFICAND, ECF_CONST, significand, unary)
 DEF_INTERNAL_FLT_FN (SIN, ECF_CONST, sin, unary)
 DEF_INTERNAL_FLT_FN (SINH, ECF_CONST, sinh, unary)
diff --git a/gcc/testsuite/gcc.target/aarch64/signbitv2sf.c b/gcc/testsuite/gcc.target/aarch64/signbitv2sf.c
new file mode 100644
index 0000000000000000000000000000000000000000..2587bfedd538f30a018cf827ea57cd583b2fa084
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/signbitv2sf.c
@@ -0,0 +1,40 @@
+/* { dg-do run } */
+/* { dg-additional-options "-O3 --save-temps" } */
+
+extern void abort ();
+
+#define N 8
+float in[N] = {1.0, -1.0, -2.0, 3.0, -5.0, -8.0, 13.0, 21.0};
+int out[N];
+
+void
+foo (int *i, float *f)
+{
+  i[0] = __builtin_signbit (f[0]);
+  i[1] = __builtin_signbit (f[1]);
+}
+
+/* { dg-final { scan-assembler-not {-2147483648} } } */
+/* { dg-final { scan-assembler {\tushr\tv[0-9]+.2s, v[0-9]+.2s, 31} } } */
+
+int
+main ()
+{
+  int i;
+
+  foo (out, in);
+  foo (out + 2, in + 2);
+  foo (out + 4, in + 4);
+  foo (out + 6, in + 6);
+
+  for (i = 0; i < N; i++)
+  {
+    if (in[i] >= 0.0 && out[i])
+      abort ();
+    if (in[i] < 0.0 && !out[i])
+      abort ();
+  }
+
+  return 0;
+}
+
diff --git a/gcc/testsuite/gcc.target/aarch64/signbitv4sf.c b/gcc/testsuite/gcc.target/aarch64/signbitv4sf.c
new file mode 100644
index 0000000000000000000000000000000000000000..18cffdc7d5b2701a1bbf23f9f7d27b7a31568758
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/signbitv4sf.c
@@ -0,0 +1,38 @@
+/* { dg-do run } */
+/* { dg-additional-options "-O3 --save-temps" } */
+
+extern void abort ();
+
+#define N 1024
+float in[N] = {1.0, -1.0, -2.0, 3.0, -5.0, -8.0, 13.0, 21.0};
+int out[N];
+
+void
+foo ()
+{
+  int i;
+  for (i = 0; i < N; i++)
+    out[i] = __builtin_signbit (in[i]);
+}
+
+/* { dg-final { scan-assembler-not {-2147483648} } } */
+/* { dg-final { scan-assembler {\tushr\tv[0-9]+.4s, v[0-9]+.4s, 31} } } */
+
+int
+main ()
+{
+  int i;
+
+  foo ();
+
+  for (i = 0; i < N; i++)
+  {
+    if (in[i] >= 0.0 && out[i])
+      abort ();
+    if (in[i] < 0.0 && !out[i])
+      abort ();
+  }
+
+  return 0;
+}
+
Richard Sandiford May 14, 2019, 8:08 a.m. UTC | #5
Przemyslaw Wirkus <Przemyslaw.Wirkus@arm.com> writes:
> Hi all,
>
> Vectorise __builtin_signbit (v2sf, v4sf) with unsigned shift right vector
> instruction.
>
> Bootstrapped and tested on aarch64-none-linux-gnu.
>
> Assembly output for:
> $ aarch64-elf-gcc -S -O3 signbitv2sf.c -dp
>
> Before patch:
>
> foo:
> 	ldp	w2, w1, [x1]	// 37	[c=0 l=4]  *load_pair_zero_extendsidi2_aarch64/0
> 	and	w2, w2, -2147483648	// 8	[c=4 l=4]  andsi3/1
> 	and	w1, w1, -2147483648	// 12	[c=4 l=4]  andsi3/1
> 	stp	w2, w1, [x0]	// 38	[c=0 l=4]  store_pair_sw_sisi/0
> 	ret		// 32	[c=0 l=4]  *do_return
>
> After patch:
>
> foo:
> 	ldr	d0, [x1]	// 7	[c=8 l=4]  *aarch64_simd_movv2sf/0
> 	ushr	v0.2s, v0.2s, 31	// 8	[c=12 l=4]  aarch64_simd_lshrv2si
> 	str	d0, [x0]	// 9	[c=4 l=4]  *aarch64_simd_movv2si/2
> 	ret		// 28	[c=0 l=4]  *do_return
>
> Assembly output for:
> $ aarch64-elf-gcc -S -O3 signbitv4sf.c -dp
>
> Before patch:
>
> foo:
> 	adrp	x3, in	// 38	[c=4 l=4]  *movdi_aarch64/12
> 	adrp	x2, out	// 41	[c=4 l=4]  *movdi_aarch64/12
> 	add	x3, x3, :lo12:in	// 40	[c=4 l=4]  add_losym_di
> 	add	x2, x2, :lo12:out	// 43	[c=4 l=4]  add_losym_di
> 	mov	x0, 0	// 3	[c=4 l=4]  *movdi_aarch64/3
> 	.p2align 3,,7
> .L2:
> 	ldr	w1, [x3, x0]	// 10	[c=16 l=4]  *zero_extendsidi2_aarch64/1
> 	and	w1, w1, -2147483648	// 11	[c=4 l=4]  andsi3/1
> 	str	w1, [x2, x0]	// 16	[c=4 l=4]  *movsi_aarch64/8
> 	add	x0, x0, 4	// 17	[c=4 l=4]  *adddi3_aarch64/0
> 	cmp	x0, 4096	// 19	[c=4 l=4]  cmpdi/1
> 	bne	.L2		// 20	[c=4 l=4]  condjump
> 	ret		// 51	[c=0 l=4]  *do_return
>
> After patch:
>
> foo:
> 	adrp	x2, in	// 37	[c=4 l=4]  *movdi_aarch64/12
> 	adrp	x1, out	// 40	[c=4 l=4]  *movdi_aarch64/12
> 	add	x2, x2, :lo12:in	// 39	[c=4 l=4]  add_losym_di
> 	add	x1, x1, :lo12:out	// 42	[c=4 l=4]  add_losym_di
> 	mov	x0, 0	// 3	[c=4 l=4]  *movdi_aarch64/3
> 	.p2align 3,,7
> .L2:
> 	ldr	q0, [x2, x0]	// 10	[c=8 l=4]  *aarch64_simd_movv4sf/0
> 	ushr	v0.4s, v0.4s, 31	// 11	[c=12 l=4]  aarch64_simd_lshrv4si
> 	str	q0, [x1, x0]	// 15	[c=4 l=4]  *aarch64_simd_movv4si/2
> 	add	x0, x0, 16	// 16	[c=4 l=4]  *adddi3_aarch64/0
> 	cmp	x0, 4096	// 18	[c=4 l=4]  cmpdi/1
> 	bne	.L2		// 19	[c=4 l=4]  condjump
> 	ret		// 50	[c=0 l=4]  *do_return
>
> OK for Trunk ?
>
> Thanks,
> Przemyslaw
>
> gcc/ChangeLog:
>
> 2019-05-13  Przemyslaw Wirkus  <przemyslaw.wirkus@arm.com\>
>
> 	* internal-fn.def (SIGNBIT): New.
> 	* config/aarch64/aarch64-simd.md (signbitv2sf2): New expand
> 	defined.
> 	(signbitv4sf2): Likewise.
>
> gcc/testsuite/ChangeLog:
>
> 2019-05-13  Przemyslaw Wirkus  <przemyslaw.wirkus@arm.com\>
>
> 	* gcc.target/aarch64/signbitv4sf.c: New test.
> 	* gcc.target/aarch64/signbitv2sf.c: New test.

Thanks, applied as r271149.

Richard
Jakub Jelinek May 14, 2019, 8:14 a.m. UTC | #6
On Tue, May 14, 2019 at 09:08:28AM +0100, Richard Sandiford wrote:
> > 2019-05-13  Przemyslaw Wirkus  <przemyslaw.wirkus@arm.com\>

What is that backslash in \> doing in the ChangeLog entries?

	Jakub
Przemyslaw Wirkus May 14, 2019, 8:39 a.m. UTC | #7
> What is that backslash in \> doing in the ChangeLog entries?

>        Jakub

My bad: the tool I use for code review inserted the backslash because it wrongly
assumed I wanted Markdown. An early sign that machines want to take over, I guess...
I promise I will be more diligent next time.

kind regards
Przemyslaw
diff mbox series

Patch

diff --git a/gcc/config/aarch64/aarch64-builtins.c b/gcc/config/aarch64/aarch64-builtins.c
index 04063e5ed134d2e64487db23b8fa7794817b2739..86f8345848abd1515cef61824db525dc26ec9bdb 100644
--- a/gcc/config/aarch64/aarch64-builtins.c
+++ b/gcc/config/aarch64/aarch64-builtins.c
@@ -1709,6 +1709,13 @@  aarch64_builtin_vectorized_function (unsigned int fn, tree type_out,
 
 	return aarch64_builtin_decls[builtin];
       }
+    CASE_CFN_SIGNBIT:
+      {
+	if (AARCH64_CHECK_BUILTIN_MODE (4, S))
+	  return aarch64_builtin_decls[AARCH64_SIMD_BUILTIN_UNOP_signbitv4sf];
+	else
+	  return NULL_TREE;
+      }
     case CFN_BUILT_IN_BSWAP16:
 #undef AARCH64_CHECK_BUILTIN_MODE
 #define AARCH64_CHECK_BUILTIN_MODE(C, N) \
diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index 17bb0c4869b12ede2fc51a8f89d841ded8fac230..d568f0ba4e61febf0590b22789b006f3bfe11ccd 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -324,6 +324,9 @@ 
   VAR1 (UNOP, rint, 2, hf)
   VAR1 (UNOP, round, 2, hf)
 
+  /* Implemented by signbit<mode>2 pattern */
+  VAR1 (UNOP, signbit, 2, v4sf)
+
   /* Implemented by l<fcvt_pattern><su_optab><VQDF:mode><vcvt_target>2.  */
   VAR1 (UNOP, lbtruncv4hf, 2, v4hi)
   VAR1 (UNOP, lbtruncv8hf, 2, v8hi)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index be6c27d319a1ca6fee581d8f8856a4dff8f4a060..87e2a58649c3e5d490c499115cf6b7495d448c29 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -915,6 +915,21 @@ 
   [(set_attr "type" "neon_ins<q>")]
 )
 
+(define_expand "signbitv4sf2"
+    [(use (match_operand:V4SI 0 "register_operand"))
+     (use (match_operand:V4SF 1 "register_operand"))]
+     "TARGET_SIMD"
+{
+  int shift_amount = GET_MODE_UNIT_BITSIZE (V4SImode) - 1;
+  rtx shift_vector = aarch64_simd_gen_const_vector_dup (V4SImode,
+                          shift_amount);
+  operands[1] = lowpart_subreg (V4SImode, operands[1], V4SFmode);
+
+  emit_insn (gen_aarch64_simd_lshrv4si (operands[0], operands[1],
+                  shift_vector));
+  DONE;
+})
+
 (define_insn "aarch64_simd_lshr<mode>"
  [(set (match_operand:VDQ_I 0 "register_operand" "=w")
        (lshiftrt:VDQ_I (match_operand:VDQ_I 1 "register_operand" "w")
diff --git a/gcc/testsuite/gcc.target/aarch64/signbitv4sf.c b/gcc/testsuite/gcc.target/aarch64/signbitv4sf.c
new file mode 100644
index 0000000000000000000000000000000000000000..aa06a5df1dbb3e295355d485b39963127a828b68
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/signbitv4sf.c
@@ -0,0 +1,35 @@ 
+/* { dg-do run } */
+/* { dg-additional-options "-O3 --save-temps" } */
+
+extern void abort ();
+
+#define N 1024
+float in[N] = {1.0, -1.0, -2.0, 3.0, -5.0, -8.0, 13.0, 21.0};
+int out[N];
+
+void
+foo ()
+{
+  for (int i = 0; i < N; i++)
+    out[i] = __builtin_signbit (in[i]);
+}
+
+/* { dg-final { scan-assembler-not {-2147483648} } } */
+/* { dg-final { scan-assembler {\tushr\tv[0-9]+.4s, v[0-9]+.4s, 31} } } */
+
+int
+main ()
+{
+  foo ();
+
+  for (int i = 0; i < N; i++)
+  {
+    if (in[i] >= 0.0 && out[i])
+      abort ();
+    if (in[i] < 0.0 && !out[i])
+      abort ();
+  }
+
+  return 0;
+}
+