diff mbox series

Add single_use to simplification (uncond_op + vec_cond -> cond_op).

Message ID 20220211013851.6479-1-hongtao.liu@intel.com
State New
Headers show
Series Add single_use to simplification (uncond_op + vec_cond -> cond_op). | expand

Commit Message

liuhongt Feb. 11, 2022, 1:38 a.m. UTC
>>> Confirmed.  When uncond_op is expensive (there's *div amongst them) that's
>>> definitely unwanted.  OTOH when it is cheap then combining will reduce
>>> latency.
>>> 
>>> GIMPLE wise it's a neutral transform if uncond_op is not single-use unless
>>> we need two v_c_es.
>> 
>> We can leave it to rtl combine/fwprop which will consider rtx_cost for them.
>>
>
>That certainly makes sense for the !single_use case.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,} and
aarch64-unknown-linux-gnu.
Also bootstrapped and regtested on CLX with GCC configured using
--with-arch=native --with-cpu=native.

Ok for trunk?

gcc/ChangeLog:

	PR tree-optimization/104479
	* match.pd (uncond_op + vec_cond -> cond_op): Add single_use
	for the dest of uncond_op.

gcc/testsuite/ChangeLog:

	* gcc.target/i386/pr104479.c: New test.
	* gcc.target/i386/cond_op_shift_w-1.c: Adjust testcase.
---
 gcc/match.pd                                  | 12 ++++---
 .../gcc.target/i386/cond_op_shift_w-1.c       |  3 +-
 gcc/testsuite/gcc.target/i386/pr104479.c      | 33 +++++++++++++++++++
 3 files changed, 42 insertions(+), 6 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr104479.c

Comments

Richard Biener Feb. 11, 2022, 7:14 a.m. UTC | #1
On Fri, Feb 11, 2022 at 2:38 AM liuhongt <hongtao.liu@intel.com> wrote:
>
> >>> Confirmed.  When uncond_op is expensive (there's *div amongst them) that's
> >>> definitely unwanted.  OTOH when it is cheap then combining will reduce
> >>> latency.
> >>>
> >>> GIMPLE wise it's a neutral transform if uncond_op is not single-use unless
> >>> we need two v_c_es.
> >>
> >> We can leave it to rtl combine/fwprop which will consider rtx_cost for them.
> >>
> >
> >That certainly makes sense for the !single_use case.
>
> Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,} and
> aarch64-unknown-linux-gnu.
> Also Bootstrapped and regtested on CLX with gcc configure --with-arch=native
> --with-cpu=native.
>
> Ok for trunk?

OK.

Thanks,
Richard.

> gcc/ChangeLog:
>
>         PR tree-optimization/104479
>         * match.pd (uncond_op + vec_cond -> cond_op): Add single_use
>         for the dest of uncond_op.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/i386/pr104479.c: New test.
>         * gcc.target/i386/cond_op_shift_w-1.c: Adjust testcase.
> ---
>  gcc/match.pd                                  | 12 ++++---
>  .../gcc.target/i386/cond_op_shift_w-1.c       |  3 +-
>  gcc/testsuite/gcc.target/i386/pr104479.c      | 33 +++++++++++++++++++
>  3 files changed, 42 insertions(+), 6 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr104479.c
>
> diff --git a/gcc/match.pd b/gcc/match.pd
> index 7bbb80172fc..c195c8cc882 100644
> --- a/gcc/match.pd
> +++ b/gcc/match.pd
> @@ -7385,13 +7385,15 @@ and,
>    (vec_cond @0 (view_convert? (uncond_op@4 @1 @2)) @3)
>    (with { tree op_type = TREE_TYPE (@4); }
>     (if (vectorized_internal_fn_supported_p (as_internal_fn (cond_op), op_type)
> -       && is_truth_type_for (op_type, TREE_TYPE (@0)))
> +       && is_truth_type_for (op_type, TREE_TYPE (@0))
> +       && single_use (@4))
>      (view_convert (cond_op @0 @1 @2 (view_convert:op_type @3))))))
>   (simplify
>    (vec_cond @0 @1 (view_convert? (uncond_op@4 @2 @3)))
>    (with { tree op_type = TREE_TYPE (@4); }
>     (if (vectorized_internal_fn_supported_p (as_internal_fn (cond_op), op_type)
> -       && is_truth_type_for (op_type, TREE_TYPE (@0)))
> +       && is_truth_type_for (op_type, TREE_TYPE (@0))
> +       && single_use (@4))
>      (view_convert (cond_op (bit_not @0) @2 @3 (view_convert:op_type @1)))))))
>
>  /* Same for ternary operations.  */
> @@ -7401,13 +7403,15 @@ and,
>    (vec_cond @0 (view_convert? (uncond_op@5 @1 @2 @3)) @4)
>    (with { tree op_type = TREE_TYPE (@5); }
>     (if (vectorized_internal_fn_supported_p (as_internal_fn (cond_op), op_type)
> -       && is_truth_type_for (op_type, TREE_TYPE (@0)))
> +       && is_truth_type_for (op_type, TREE_TYPE (@0))
> +       && single_use (@5))
>      (view_convert (cond_op @0 @1 @2 @3 (view_convert:op_type @4))))))
>   (simplify
>    (vec_cond @0 @1 (view_convert? (uncond_op@5 @2 @3 @4)))
>    (with { tree op_type = TREE_TYPE (@5); }
>     (if (vectorized_internal_fn_supported_p (as_internal_fn (cond_op), op_type)
> -       && is_truth_type_for (op_type, TREE_TYPE (@0)))
> +       && is_truth_type_for (op_type, TREE_TYPE (@0))
> +       && single_use (@5))
>      (view_convert (cond_op (bit_not @0) @2 @3 @4
>                   (view_convert:op_type @1)))))))
>  #endif
> diff --git a/gcc/testsuite/gcc.target/i386/cond_op_shift_w-1.c b/gcc/testsuite/gcc.target/i386/cond_op_shift_w-1.c
> index 54c854f2f37..23ab8fa166f 100644
> --- a/gcc/testsuite/gcc.target/i386/cond_op_shift_w-1.c
> +++ b/gcc/testsuite/gcc.target/i386/cond_op_shift_w-1.c
> @@ -1,7 +1,6 @@
>  /* { dg-do compile } */
>  /* { dg-options "-O2 -march=skylake-avx512 -fdump-tree-optimized -DTYPE=int16" } */
> -/* { dg-final { scan-tree-dump-times ".COND_SHR" 1 "optimized" } } */
> -/* { dg-final { scan-tree-dump-times ".COND_SHL" 1 "optimized" } } */
> +/* { dg-final { scan-tree-dump-times "\.COND_" 4 "optimized" } } */
>  /* { dg-final { scan-assembler-times "vpsraw"  1 } } */
>  /* { dg-final { scan-assembler-times "vpsllw"  1 } } */
>
> diff --git a/gcc/testsuite/gcc.target/i386/pr104479.c b/gcc/testsuite/gcc.target/i386/pr104479.c
> new file mode 100644
> index 00000000000..4ca4c482542
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr104479.c
> @@ -0,0 +1,33 @@
> +/* { dg-do compile } */
> +/* { dg-options "-march=icelake-server -Ofast -fdump-tree-optimized" } */
> +/* { dg-final { scan-tree-dump-not "\.COND_SHR" "optimized" } } */
> +/* { dg-final { scan-tree-dump-not "\.COND_FMA" "optimized" } } */
> +
> +void
> +cond_shr (unsigned int* __restrict dst,
> +         unsigned int* __restrict src,
> +         unsigned int* __restrict y,
> +         int i_width)
> +{
> +  for(int x = 0; x < i_width; x++)
> +    {
> +      unsigned int temp = src[x] >> 3;
> +      dst[x] =  temp > 255 ? temp : y[x];
> +    }
> +}
> +
> +
> +void
> +cond_fma (float* __restrict dst,
> +         float* __restrict src1,
> +         float* __restrict src2,
> +         float* __restrict src3,
> +         unsigned int* __restrict y,
> +         int i_width)
> +{
> +  for(int x = 0; x < i_width; x++)
> +    {
> +      float temp = __builtin_fmaf (src1[x], src2[x], src3[x]);
> +      dst[x] = temp > 0.0f ? temp : y[x];
> +    }
> +}
> --
> 2.18.1
>
diff mbox series

Patch

diff --git a/gcc/match.pd b/gcc/match.pd
index 7bbb80172fc..c195c8cc882 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -7385,13 +7385,15 @@  and,
   (vec_cond @0 (view_convert? (uncond_op@4 @1 @2)) @3)
   (with { tree op_type = TREE_TYPE (@4); }
    (if (vectorized_internal_fn_supported_p (as_internal_fn (cond_op), op_type)
-	&& is_truth_type_for (op_type, TREE_TYPE (@0)))
+	&& is_truth_type_for (op_type, TREE_TYPE (@0))
+	&& single_use (@4))
     (view_convert (cond_op @0 @1 @2 (view_convert:op_type @3))))))
  (simplify
   (vec_cond @0 @1 (view_convert? (uncond_op@4 @2 @3)))
   (with { tree op_type = TREE_TYPE (@4); }
    (if (vectorized_internal_fn_supported_p (as_internal_fn (cond_op), op_type)
-	&& is_truth_type_for (op_type, TREE_TYPE (@0)))
+	&& is_truth_type_for (op_type, TREE_TYPE (@0))
+	&& single_use (@4))
     (view_convert (cond_op (bit_not @0) @2 @3 (view_convert:op_type @1)))))))
 
 /* Same for ternary operations.  */
@@ -7401,13 +7403,15 @@  and,
   (vec_cond @0 (view_convert? (uncond_op@5 @1 @2 @3)) @4)
   (with { tree op_type = TREE_TYPE (@5); }
    (if (vectorized_internal_fn_supported_p (as_internal_fn (cond_op), op_type)
-	&& is_truth_type_for (op_type, TREE_TYPE (@0)))
+	&& is_truth_type_for (op_type, TREE_TYPE (@0))
+	&& single_use (@5))
     (view_convert (cond_op @0 @1 @2 @3 (view_convert:op_type @4))))))
  (simplify
   (vec_cond @0 @1 (view_convert? (uncond_op@5 @2 @3 @4)))
   (with { tree op_type = TREE_TYPE (@5); }
    (if (vectorized_internal_fn_supported_p (as_internal_fn (cond_op), op_type)
-	&& is_truth_type_for (op_type, TREE_TYPE (@0)))
+	&& is_truth_type_for (op_type, TREE_TYPE (@0))
+	&& single_use (@5))
     (view_convert (cond_op (bit_not @0) @2 @3 @4
 		  (view_convert:op_type @1)))))))
 #endif
diff --git a/gcc/testsuite/gcc.target/i386/cond_op_shift_w-1.c b/gcc/testsuite/gcc.target/i386/cond_op_shift_w-1.c
index 54c854f2f37..23ab8fa166f 100644
--- a/gcc/testsuite/gcc.target/i386/cond_op_shift_w-1.c
+++ b/gcc/testsuite/gcc.target/i386/cond_op_shift_w-1.c
@@ -1,7 +1,6 @@ 
 /* { dg-do compile } */
 /* { dg-options "-O2 -march=skylake-avx512 -fdump-tree-optimized -DTYPE=int16" } */
-/* { dg-final { scan-tree-dump-times ".COND_SHR" 1 "optimized" } } */
-/* { dg-final { scan-tree-dump-times ".COND_SHL" 1 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "\.COND_" 4 "optimized" } } */
 /* { dg-final { scan-assembler-times "vpsraw"  1 } } */
 /* { dg-final { scan-assembler-times "vpsllw"  1 } } */
 
diff --git a/gcc/testsuite/gcc.target/i386/pr104479.c b/gcc/testsuite/gcc.target/i386/pr104479.c
new file mode 100644
index 00000000000..4ca4c482542
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr104479.c
@@ -0,0 +1,33 @@ 
+/* { dg-do compile } */
+/* { dg-options "-march=icelake-server -Ofast -fdump-tree-optimized" } */
+/* { dg-final { scan-tree-dump-not "\.COND_SHR" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "\.COND_FMA" "optimized" } } */
+
+void
+cond_shr (unsigned int* __restrict dst,
+	  unsigned int* __restrict src,
+	  unsigned int* __restrict y,
+	  int i_width)
+{
+  for(int x = 0; x < i_width; x++)
+    {
+      unsigned int temp = src[x] >> 3;
+      dst[x] =  temp > 255 ? temp : y[x];
+    }
+}
+
+
+void
+cond_fma (float* __restrict dst,
+	  float* __restrict src1,
+	  float* __restrict src2,
+	  float* __restrict src3,
+	  unsigned int* __restrict y,
+	  int i_width)
+{
+  for(int x = 0; x < i_width; x++)
+    {
+      float temp = __builtin_fmaf (src1[x], src2[x], src3[x]);
+      dst[x] = temp > 0.0f ? temp : y[x];
+    }
+}