diff mbox

[AArch64] Use 128-bit vectors when autovectorizing 16-bit float types

Message ID 1485170628-13366-1-git-send-email-james.greenhalgh@arm.com
State New
Headers show

Commit Message

James Greenhalgh Jan. 23, 2017, 11:23 a.m. UTC
Hi,

As subject, we have an oversight in aarch64_simd_container_mode for
HFmode inputs. This results in trunk only autovectorizing to a 64-bit vector,
rather than a full 128-bit vector.

The fix is obvious: we just need to handle HFmode and return an
appropriate vector mode.

Tested on aarch64-none-elf with no issues. This patch looks low risk
for this development stage to me, though it fixes an oversight rather
than a regression.

OK?

Thanks,
James

---
gcc/

2017-01-23  James Greenhalgh  <james.greenhalgh@arm.com>

	* config/aarch64/aarch64.c (aarch64_simd_container_mode): Handle
	HFmode.

gcc/testsuite/

2017-01-23  James Greenhalgh  <james.greenhalgh@arm.com>

	* gcc.target/aarch64/vect_fp16_1.c: New.

Comments

James Greenhalgh Feb. 2, 2017, 3:52 p.m. UTC | #1
On Mon, Jan 23, 2017 at 11:23:48AM +0000, James Greenhalgh wrote:
> 
> Hi,
> 
> As subject, we have an oversight in aarch64_simd_container_mode for
> HFmode inputs. This results in trunk only autovectorizing to a 64-bit vector,
> rather than a full 128-bit vector.
> 
> The fix is obvious, we just need to handle HFmode, and return an
> appropriate vector mode.
> 
> Tested on aarch64-none-elf with no issues. This patch looks low risk
> for this development stage to me, though it fixes an oversight rather
> than a regression.

*Ping*

Thanks,
James

> gcc/
> 
> 2017-01-23  James Greenhalgh  <james.greenhalgh@arm.com>
> 
> 	* config/aarch64/aarch64.c (aarch64_simd_container_mode): Handle
> 	HFmode.
> 
> gcc/testsuite/
> 
> 2017-01-23  James Greenhalgh  <james.greenhalgh@arm.com>
> 
> 	* gcc.target/aarch64/vect_fp16_1.c: New.
> 

> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> index 0cf7d12..7efc1f2 100644
> --- a/gcc/config/aarch64/aarch64.c
> +++ b/gcc/config/aarch64/aarch64.c
> @@ -10777,6 +10777,8 @@ aarch64_simd_container_mode (machine_mode mode, unsigned width)
>  	    return V2DFmode;
>  	  case SFmode:
>  	    return V4SFmode;
> +	  case HFmode:
> +	    return V8HFmode;
>  	  case SImode:
>  	    return V4SImode;
>  	  case HImode:
> @@ -10793,6 +10795,8 @@ aarch64_simd_container_mode (machine_mode mode, unsigned width)
>  	  {
>  	  case SFmode:
>  	    return V2SFmode;
> +	  case HFmode:
> +	    return V4HFmode;
>  	  case SImode:
>  	    return V2SImode;
>  	  case HImode:
> diff --git a/gcc/testsuite/gcc.target/aarch64/vect_fp16_1.c b/gcc/testsuite/gcc.target/aarch64/vect_fp16_1.c
> new file mode 100644
> index 0000000..da0cd81
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/vect_fp16_1.c
> @@ -0,0 +1,30 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -fno-vect-cost-model" } */
> +
> +/* Check that we vectorize to a full 128-bit vector for _Float16 and __fp16
> +   types.  */
> +
> +/* Enable ARMv8.2-A+fp16 so we have access to the vector instructions.  */
> +#pragma GCC target ("arch=armv8.2-a+fp16")
> +
> +_Float16
> +sum_Float16 (_Float16 *__restrict__ __attribute__ ((__aligned__ (16))) a,
> +	     _Float16 *__restrict__ __attribute__ ((__aligned__ (16))) b,
> +	     _Float16 *__restrict__ __attribute__ ((__aligned__ (16))) c)
> +{
> +  for (int i = 0; i < 256; i++)
> +    a[i] = b[i] + c[i];
> +}
> +
> +_Float16
> +sum_fp16 (__fp16 *__restrict__ __attribute__ ((__aligned__ (16))) a,
> +	  __fp16 *__restrict__ __attribute__ ((__aligned__ (16))) b,
> +	  __fp16 *__restrict__ __attribute__ ((__aligned__ (16))) c)
> +{
> +  for (int i = 0; i < 256; i++)
> +    a[i] = b[i] + c[i];
> +}
> +
> +/* Two FADD operations on "8h" data widths, one from sum_Float16, one from
> +   sum_fp16.  */
> +/* { dg-final { scan-assembler-times "fadd\tv\[0-9\]\+.8h" 2 } } */
Richard Earnshaw (lists) Feb. 14, 2017, 2:19 p.m. UTC | #2
On 23/01/17 11:23, James Greenhalgh wrote:
> 
> Hi,
> 
> As subject, we have an oversight in aarch64_simd_container_mode for
> HFmode inputs. This results in trunk only autovectorizing to a 64-bit vector,
> rather than a full 128-bit vector.
> 
> The fix is obvious, we just need to handle HFmode, and return an
> appropriate vector mode.
> 
> Tested on aarch64-none-elf with no issues. This patch looks low risk
> for this development stage to me, though it fixes an oversight rather
> than a regression.
> 
> OK?
> 

OK.

R.

> Thanks,
> James
> 
> ---
> gcc/
> 
> 2017-01-23  James Greenhalgh  <james.greenhalgh@arm.com>
> 
> 	* config/aarch64/aarch64.c (aarch64_simd_container_mode): Handle
> 	HFmode.
> 
> gcc/testsuite/
> 
> 2017-01-23  James Greenhalgh  <james.greenhalgh@arm.com>
> 
> 	* gcc.target/aarch64/vect_fp16_1.c: New.
> 
> 
> 0001-Patch-AArch64-Use-128-bit-vectors-when-autovectorizi.patch
> 
> 
> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> index 0cf7d12..7efc1f2 100644
> --- a/gcc/config/aarch64/aarch64.c
> +++ b/gcc/config/aarch64/aarch64.c
> @@ -10777,6 +10777,8 @@ aarch64_simd_container_mode (machine_mode mode, unsigned width)
>  	    return V2DFmode;
>  	  case SFmode:
>  	    return V4SFmode;
> +	  case HFmode:
> +	    return V8HFmode;
>  	  case SImode:
>  	    return V4SImode;
>  	  case HImode:
> @@ -10793,6 +10795,8 @@ aarch64_simd_container_mode (machine_mode mode, unsigned width)
>  	  {
>  	  case SFmode:
>  	    return V2SFmode;
> +	  case HFmode:
> +	    return V4HFmode;
>  	  case SImode:
>  	    return V2SImode;
>  	  case HImode:
> diff --git a/gcc/testsuite/gcc.target/aarch64/vect_fp16_1.c b/gcc/testsuite/gcc.target/aarch64/vect_fp16_1.c
> new file mode 100644
> index 0000000..da0cd81
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/vect_fp16_1.c
> @@ -0,0 +1,30 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -fno-vect-cost-model" } */
> +
> +/* Check that we vectorize to a full 128-bit vector for _Float16 and __fp16
> +   types.  */
> +
> +/* Enable ARMv8.2-A+fp16 so we have access to the vector instructions.  */
> +#pragma GCC target ("arch=armv8.2-a+fp16")
> +
> +_Float16
> +sum_Float16 (_Float16 *__restrict__ __attribute__ ((__aligned__ (16))) a,
> +	     _Float16 *__restrict__ __attribute__ ((__aligned__ (16))) b,
> +	     _Float16 *__restrict__ __attribute__ ((__aligned__ (16))) c)
> +{
> +  for (int i = 0; i < 256; i++)
> +    a[i] = b[i] + c[i];
> +}
> +
> +_Float16
> +sum_fp16 (__fp16 *__restrict__ __attribute__ ((__aligned__ (16))) a,
> +	  __fp16 *__restrict__ __attribute__ ((__aligned__ (16))) b,
> +	  __fp16 *__restrict__ __attribute__ ((__aligned__ (16))) c)
> +{
> +  for (int i = 0; i < 256; i++)
> +    a[i] = b[i] + c[i];
> +}
> +
> +/* Two FADD operations on "8h" data widths, one from sum_Float16, one from
> +   sum_fp16.  */
> +/* { dg-final { scan-assembler-times "fadd\tv\[0-9\]\+.8h" 2 } } */
>
diff mbox

Patch

diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 0cf7d12..7efc1f2 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -10777,6 +10777,8 @@  aarch64_simd_container_mode (machine_mode mode, unsigned width)
 	    return V2DFmode;
 	  case SFmode:
 	    return V4SFmode;
+	  case HFmode:
+	    return V8HFmode;
 	  case SImode:
 	    return V4SImode;
 	  case HImode:
@@ -10793,6 +10795,8 @@  aarch64_simd_container_mode (machine_mode mode, unsigned width)
 	  {
 	  case SFmode:
 	    return V2SFmode;
+	  case HFmode:
+	    return V4HFmode;
 	  case SImode:
 	    return V2SImode;
 	  case HImode:
diff --git a/gcc/testsuite/gcc.target/aarch64/vect_fp16_1.c b/gcc/testsuite/gcc.target/aarch64/vect_fp16_1.c
new file mode 100644
index 0000000..da0cd81
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vect_fp16_1.c
@@ -0,0 +1,33 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O3 -fno-vect-cost-model" } */
+
+/* Check that we vectorize to a full 128-bit vector for _Float16 and __fp16
+   types.  */
+
+/* Enable ARMv8.2-A+fp16 so we have access to the vector instructions.  */
+#pragma GCC target ("arch=armv8.2-a+fp16")
+
+/* Store b[i] + c[i] into a[i] for 256 _Float16 elements.  Nothing is
+   returned; this compile-only test just scans the generated assembly.  */
+void
+sum_Float16 (_Float16 *__restrict__ __attribute__ ((__aligned__ (16))) a,
+	     _Float16 *__restrict__ __attribute__ ((__aligned__ (16))) b,
+	     _Float16 *__restrict__ __attribute__ ((__aligned__ (16))) c)
+{
+  for (int i = 0; i < 256; i++)
+    a[i] = b[i] + c[i];
+}
+
+/* Same loop as sum_Float16, but over __fp16 elements.  */
+void
+sum_fp16 (__fp16 *__restrict__ __attribute__ ((__aligned__ (16))) a,
+	  __fp16 *__restrict__ __attribute__ ((__aligned__ (16))) b,
+	  __fp16 *__restrict__ __attribute__ ((__aligned__ (16))) c)
+{
+  for (int i = 0; i < 256; i++)
+    a[i] = b[i] + c[i];
+}
+
+/* Two FADD operations on "8h" data widths, one from sum_Float16, one from
+   sum_fp16.  */
+/* { dg-final { scan-assembler-times "fadd\tv\[0-9\]\+.8h" 2 } } */