[2/5] AArch64 sve: combine nested if predicates

Message ID patch-14777-tamar@arm.com
State New
Series [1/5] AArch64 sve: combine inverted masks into NOTs

Commit Message

Tamar Christina Aug. 31, 2021, 1:31 p.m. UTC
Hi All,

The following example

void f5(float * restrict z0, float * restrict z1, float *restrict x,
	float * restrict y, float c, int n)
{
    for (int i = 0; i < n; i++) {
        float a = x[i];
        float b = y[i];
        if (a > b) {
            z0[i] = a + b;
            if (a > c) {
                z1[i] = a - b;
            }
        }
    }
}

generates currently:

        ptrue   p3.b, all
        ld1w    z1.s, p1/z, [x2, x5, lsl 2]
        ld1w    z2.s, p1/z, [x3, x5, lsl 2]
        fcmgt   p0.s, p3/z, z1.s, z0.s
        fcmgt   p2.s, p1/z, z1.s, z2.s
        fcmgt   p0.s, p0/z, z1.s, z2.s

The conditions for a > b and a > c become separate comparisons.

After this patch using a 2 -> 2 split we generate:

        ld1w    z1.s, p0/z, [x2, x5, lsl 2]
        ld1w    z2.s, p0/z, [x3, x5, lsl 2]
        fcmgt   p1.s, p0/z, z1.s, z2.s
        fcmgt   p1.s, p1/z, z1.s, z0.s

Here the conditions a > b && a > c are folded by using the predicate result of
the previous compare, which allows one of the compares to be removed.
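
In other words, the store to z1 is guarded by a > b && a > c, and because an
SVE compare zeroes the lanes where its governing predicate is false, the second
compare can simply be governed by the result of the first.  A rough ACLE-level
sketch of the shape we are aiming for (illustrative only, not part of the
patch):

  #include <arm_sve.h>

  /* Illustrative only: compute the guard for the z1 store by chaining the
     compares through the governing predicate.  */
  svbool_t
  nested_guard (svbool_t pg, svfloat32_t a, svfloat32_t b, float c)
  {
    svbool_t p_ab  = svcmpgt_f32 (pg, a, b);      /* a > b, under the loop mask.  */
    svbool_t p_abc = svcmpgt_n_f32 (p_ab, a, c);  /* (a > b) && (a > c).  */
    return p_abc;
  }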

Note: This patch series is working incrementally towards generating the most
      efficient code for this and other loops in small steps.

Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	* config/aarch64/aarch64-sve.md (*mask_cmp_and_combine): New.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/sve/pred-combine-and.c: New test.

--- inline copy of patch -- 
diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
index 2c23c6b12bafb038d82920e7141a418e078a2c65..ee9d32c0a5534209689d9d3abaa560ee5b66347d 100644


--

Comments

Richard Sandiford Sept. 3, 2021, 11:15 a.m. UTC | #1
Tamar Christina <tamar.christina@arm.com> writes:
> Hi All,
>
> The following example
>
> void f5(float * restrict z0, float * restrict z1, float *restrict x,
> 	float * restrict y, float c, int n)
> {
>     for (int i = 0; i < n; i++) {
>         float a = x[i];
>         float b = y[i];
>         if (a > b) {
>             z0[i] = a + b;
>             if (a > c) {
>                 z1[i] = a - b;
>             }
>         }
>     }
> }
>
> generates currently:
>
>         ptrue   p3.b, all
>         ld1w    z1.s, p1/z, [x2, x5, lsl 2]
>         ld1w    z2.s, p1/z, [x3, x5, lsl 2]
>         fcmgt   p0.s, p3/z, z1.s, z0.s
>         fcmgt   p2.s, p1/z, z1.s, z2.s
>         fcmgt   p0.s, p0/z, z1.s, z2.s
>
> The conditions for a > b and a > c become separate comparisons.
>
> After this patch using a 2 -> 2 split we generate:
>
>         ld1w    z1.s, p0/z, [x2, x5, lsl 2]
>         ld1w    z2.s, p0/z, [x3, x5, lsl 2]
>         fcmgt   p1.s, p0/z, z1.s, z2.s
>         fcmgt   p1.s, p1/z, z1.s, z0.s
>
> Here the conditions a > b && a > c are folded by using the predicate result of
> the previous compare, which allows one of the compares to be removed.
>
> Note: This patch series is working incrementally towards generating the most
>       efficient code for this and other loops in small steps.

It looks like this could be done in the vectoriser via an extension
of the scalar_cond_masked_set mechanism.  We have:

  mask__54.13_59 = vect_a_15.9_55 > vect_b_17.12_58;
  vec_mask_and_60 = loop_mask_32 & mask__54.13_59;
  …
  mask__30.17_67 = vect_a_15.9_55 > vect_cst__66;
  mask__29.18_68 = mask__54.13_59 & mask__30.17_67;
  vec_mask_and_69 = loop_mask_32 & mask__29.18_68;

When vectorising mask__29.18_68, we could test whether each side
of the "&" is already in scalar_cond_masked_set and AND in the loop
mask if so, like we do in vectorizable_condition.  We could then
separately record that the & result includes the loop mask.
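
Roughly, the vectorised form would then come out as something like:

  vec_mask_and_60 = loop_mask_32 & mask__54.13_59;
  …
  mask__30.17_67 = vect_a_15.9_55 > vect_cst__66;
  mask__29.18_68 = vec_mask_and_60 & mask__30.17_67;

with mask__29.18_68 recorded as already including loop_mask_32, so that the
separate vec_mask_and_69 statement (and the extra AND at the inner masked
store) is no longer needed.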

Thanks,
Richard

> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
>
> Ok for master?
>
> Thanks,
> Tamar
>
> gcc/ChangeLog:
>
> 	* config/aarch64/aarch64-sve.md (*mask_cmp_and_combine): New.
>
> gcc/testsuite/ChangeLog:
>
> 	* gcc.target/aarch64/sve/pred-combine-and.c: New test.
>
> --- inline copy of patch -- 
> diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
> index 2c23c6b12bafb038d82920e7141a418e078a2c65..ee9d32c0a5534209689d9d3abaa560ee5b66347d 100644
> --- a/gcc/config/aarch64/aarch64-sve.md
> +++ b/gcc/config/aarch64/aarch64-sve.md
> @@ -8162,6 +8162,48 @@ (define_insn_and_split "*mask_inv_combine"
>  }
>  )
>  
> +;; Combine multiple masks where the comparison operators are the same and
> +;; each comparison has one parameter shared. e.g. combine a > b && a > c
> +(define_insn_and_split "*mask_cmp_and_combine"
> +  [(set (match_operand:<VPRED> 0 "register_operand" "=Upa")
> +	(and:<VPRED>
> +	  (and:<VPRED>
> +	    (unspec:<VPRED>
> +	      [(match_operand:<VPRED> 1)
> +	       (const_int SVE_KNOWN_PTRUE)
> +	       (match_operand:SVE_FULL_F 2 "register_operand" "w")
> +	       (match_operand:SVE_FULL_F 3 "aarch64_simd_reg_or_zero" "wDz")]
> +	      SVE_COND_FP_CMP_I0)
> +	    (unspec:<VPRED>
> +	      [(match_dup 1)
> +	       (const_int SVE_KNOWN_PTRUE)
> +	       (match_dup 2)
> +	       (match_operand:SVE_FULL_F 4 "aarch64_simd_reg_or_zero" "wDz")]
> +	      SVE_COND_FP_CMP_I0))
> +	    (match_operand:<VPRED> 5 "register_operand" "Upa")))
> +   (clobber (match_scratch:<VPRED> 6 "=&Upa"))]
> +  "TARGET_SVE"
> +  "#"
> +  "&& 1"
> +  [(set (match_dup 6)
> +	(unspec:<VPRED>
> +	  [(match_dup 5)
> +	   (const_int SVE_MAYBE_NOT_PTRUE)
> +	   (match_dup 2)
> +	   (match_dup 3)]
> +	  SVE_COND_FP_CMP_I0))
> +   (set (match_dup 0)
> +	(unspec:<VPRED>
> +	  [(match_dup 6)
> +	   (const_int SVE_MAYBE_NOT_PTRUE)
> +	   (match_dup 2)
> +	   (match_dup 4)]
> +	  SVE_COND_FP_CMP_I0))]
> +{
> +  operands[6] = gen_reg_rtx (<VPRED>mode);
> +}
> +)
> +
>  ;; -------------------------------------------------------------------------
>  ;; ---- [FP] Absolute comparisons
>  ;; -------------------------------------------------------------------------
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pred-combine-and.c b/gcc/testsuite/gcc.target/aarch64/sve/pred-combine-and.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..d395b7f84bb15b588493611df5a47549726ac24a
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/pred-combine-and.c
> @@ -0,0 +1,18 @@
> +/* { dg-do assemble { target aarch64_asm_sve_ok } } */
> +/* { dg-options "-O3 --save-temps" } */
> +
> +void f5(float * restrict z0, float * restrict z1, float *restrict x, float * restrict y, float c, int n)
> +{
> +    for (int i = 0; i < n; i++) {
> +        float a = x[i];
> +        float b = y[i];
> +        if (a > b) {
> +            z0[i] = a + b;
> +            if (a > c) {
> +                z1[i] = a - b;
> +            }
> +        }
> +    }
> +}
> +
> +/* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.s, p[0-9]+/z, z[0-9]+\.s, z[0-9]+\.s} 2 } } */
Tamar Christina Sept. 21, 2021, 4:54 p.m. UTC | #2
Hi Richard,

Thanks for the feedback, here's the new version of the patch:

> > Note: This patch series is working incrementally towards generating the
> most
> >       efficient code for this and other loops in small steps.
> 
> It looks like this could be done in the vectoriser via an extension of the
> scalar_cond_masked_set mechanism.  We have:
> 
>   mask__54.13_59 = vect_a_15.9_55 > vect_b_17.12_58;
>   vec_mask_and_60 = loop_mask_32 & mask__54.13_59;
>   …
>   mask__30.17_67 = vect_a_15.9_55 > vect_cst__66;
>   mask__29.18_68 = mask__54.13_59 & mask__30.17_67;
>   vec_mask_and_69 = loop_mask_32 & mask__29.18_68;
> 
> When vectorising mask__29.18_68, we could test whether each side of the
> "&" is already in scalar_cond_masked_set and AND in the loop mask if so, like
> we do in vectorizable_condition.  We could then separately record that the &
> result includes the loop mask.

Whenever a mask is being generated from a BIT_AND we mask the operands of
the AND instead and then just AND the results.

This allows us to CSE the masks and generate the right combination.
However, because re-assoc will try to re-order the masks in the &, we now have
to perform a small local CSE on the vectorized loop if vectorization is
successful.
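
Concretely, for the example above this means that instead of

  vec_mask_and_69 = loop_mask_32 & mask__29.18_68;

we now emit something like (SSA names invented for illustration)

  vec_mask_and_a = mask__54.13_59 & loop_mask_32;
  vec_mask_and_b = mask__30.17_67 & loop_mask_32;
  vec_mask_and_69 = vec_mask_and_a & vec_mask_and_b;

where vec_mask_and_a can then be CSEd with the AND already created for the
z0[i] store.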

Note: This patch series is working incrementally towards generating the most
      efficient code for this and other loops in small steps.

Bootstrapped Regtested on aarch64-none-linux-gnu, x86_64-linux-gnu and no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	* tree-vectorizer.c (vectorize_loops): Do local CSE through RPVN upon
	successful vectorization.
	* tree-vect-stmts.c (prepare_load_store_mask): When combining two masks
	mask the operands instead of the combined operation.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/sve/pred-combine-and.c: New test.

--- inline copy of patch ---

diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pred-combine-and.c b/gcc/testsuite/gcc.target/aarch64/sve/pred-combine-and.c
new file mode 100644
index 0000000000000000000000000000000000000000..d395b7f84bb15b588493611df5a47549726ac24a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/pred-combine-and.c
@@ -0,0 +1,18 @@
+/* { dg-do assemble { target aarch64_asm_sve_ok } } */
+/* { dg-options "-O3 --save-temps" } */
+
+void f5(float * restrict z0, float * restrict z1, float *restrict x, float * restrict y, float c, int n)
+{
+    for (int i = 0; i < n; i++) {
+        float a = x[i];
+        float b = y[i];
+        if (a > b) {
+            z0[i] = a + b;
+            if (a > c) {
+                z1[i] = a - b;
+            }
+        }
+    }
+}
+
+/* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.s, p[0-9]+/z, z[0-9]+\.s, z[0-9]+\.s} 2 } } */
diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
index 4e0b2adf1dc2404bc345af30cfeb9c819084894e..717a25f46aa72534eebeb382c92b9145d7d44d04 100644
--- a/gcc/tree-vect-stmts.c
+++ b/gcc/tree-vect-stmts.c
@@ -1799,6 +1799,19 @@ prepare_load_store_mask (tree mask_type, tree loop_mask, tree vec_mask,
     return vec_mask;
 
   gcc_assert (TREE_TYPE (loop_mask) == mask_type);
+
+  /* Check if the mask is a combination of two different masks.  */
+  gimple *def_stmt = SSA_NAME_DEF_STMT (vec_mask);
+  if (is_gimple_assign (def_stmt)
+      && gimple_assign_rhs_code (def_stmt) == BIT_AND_EXPR)
+    {
+      tree lhs1 = gimple_assign_rhs1 (def_stmt);
+      tree lhs2 = gimple_assign_rhs2 (def_stmt);
+
+      vec_mask = prepare_load_store_mask (mask_type, loop_mask, lhs1, gsi);
+      loop_mask = prepare_load_store_mask (mask_type, loop_mask, lhs2, gsi);
+    }
+
   tree and_res = make_temp_ssa_name (mask_type, NULL, "vec_mask_and");
   gimple *and_stmt = gimple_build_assign (and_res, BIT_AND_EXPR,
 					  vec_mask, loop_mask);
diff --git a/gcc/tree-vectorizer.c b/gcc/tree-vectorizer.c
index 3aa3e2a678328baccc4869fe2c6546e700b92255..84bcd146af7c4dedf6acdd7317954010ad3f281b 100644
--- a/gcc/tree-vectorizer.c
+++ b/gcc/tree-vectorizer.c
@@ -81,7 +81,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "gimple-pretty-print.h"
 #include "opt-problem.h"
 #include "internal-fn.h"
-
+#include "tree-ssa-sccvn.h"
 
 /* Loop or bb location, with hotness information.  */
 dump_user_location_t vect_location;
@@ -1321,6 +1321,27 @@ vectorize_loops (void)
 	 ???  Also while we try hard to update loop-closed SSA form we fail
 	 to properly do this in some corner-cases (see PR56286).  */
       rewrite_into_loop_closed_ssa (NULL, TODO_update_ssa_only_virtuals);
+
+      for (i = 1; i < number_of_loops (cfun); i++)
+	{
+	  loop = get_loop (cfun, i);
+	  if (!loop || !single_exit (loop))
+	    continue;
+
+	  bitmap exit_bbs;
+	  /* Perform local CSE, this esp. helps because we emit code for
+	     predicates that need to be shared for optimal predicate usage.
+	     However reassoc will re-order them and prevent CSE from working
+	     as it should.  CSE only the loop body, not the entry.  */
+	  exit_bbs = BITMAP_ALLOC (NULL);
+	  bitmap_set_bit (exit_bbs, single_exit (loop)->dest->index);
+	  bitmap_set_bit (exit_bbs, loop->latch->index);
+
+	  do_rpo_vn (cfun, loop_preheader_edge (loop), exit_bbs);
+
+	  BITMAP_FREE (exit_bbs);
+	}
+
       return TODO_cleanup_cfg;
     }
Richard Sandiford Oct. 11, 2021, 4:40 p.m. UTC | #3
Tamar Christina <Tamar.Christina@arm.com> writes:
>> > Note: This patch series is working incrementally towards generating the
>> most
>> >       efficient code for this and other loops in small steps.
>> 
>> It looks like this could be done in the vectoriser via an extension of the
>> scalar_cond_masked_set mechanism.  We have:
>> 
>>   mask__54.13_59 = vect_a_15.9_55 > vect_b_17.12_58;
>>   vec_mask_and_60 = loop_mask_32 & mask__54.13_59;
>>   …
>>   mask__30.17_67 = vect_a_15.9_55 > vect_cst__66;
>>   mask__29.18_68 = mask__54.13_59 & mask__30.17_67;
>>   vec_mask_and_69 = loop_mask_32 & mask__29.18_68;
>> 
>> When vectorising mask__29.18_68, we could test whether each side of the
>> "&" is already in scalar_cond_masked_set and AND in the loop mask if so, like
>> we do in vectorizable_condition.  We could then separately record that the &
>> result includes the loop mask.
>
> Whenever a mask is being generated from a BIT_AND we mask the operands of
> the AND instead and then just AND the results.
>
> This allows us to CSE the masks and generate the right combination.
> However, because re-assoc will try to re-order the masks in the &, we now have
> to perform a small local CSE on the vectorized loop if vectorization is
> successful.
>
> Note: This patch series is working incrementally towards generating the most
>       efficient code for this and other loops in small steps.
>
> Bootstrapped Regtested on aarch64-none-linux-gnu, x86_64-linux-gnu and no issues.
>
> Ok for master?
>
> Thanks,
> Tamar
>
> gcc/ChangeLog:
>
> 	* tree-vectorizer.c (vectorize_loops): Do local CSE through RPVN upon
> 	successful vectorization.
> 	* tree-vect-stmts.c (prepare_load_store_mask): When combining two masks
> 	mask the operands instead of the combined operation.
>
> gcc/testsuite/ChangeLog:
>
> 	* gcc.target/aarch64/sve/pred-combine-and.c: New test.
>
> --- inline copy of patch ---
>
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pred-combine-and.c b/gcc/testsuite/gcc.target/aarch64/sve/pred-combine-and.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..d395b7f84bb15b588493611df5a47549726ac24a
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/pred-combine-and.c
> @@ -0,0 +1,18 @@
> +/* { dg-do assemble { target aarch64_asm_sve_ok } } */
> +/* { dg-options "-O3 --save-temps" } */
> +
> +void f5(float * restrict z0, float * restrict z1, float *restrict x, float * restrict y, float c, int n)
> +{
> +    for (int i = 0; i < n; i++) {
> +        float a = x[i];
> +        float b = y[i];
> +        if (a > b) {
> +            z0[i] = a + b;
> +            if (a > c) {
> +                z1[i] = a - b;
> +            }
> +        }
> +    }
> +}
> +
> +/* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.s, p[0-9]+/z, z[0-9]+\.s, z[0-9]+\.s} 2 } } */
> diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
> index 4e0b2adf1dc2404bc345af30cfeb9c819084894e..717a25f46aa72534eebeb382c92b9145d7d44d04 100644
> --- a/gcc/tree-vect-stmts.c
> +++ b/gcc/tree-vect-stmts.c
> @@ -1799,6 +1799,19 @@ prepare_load_store_mask (tree mask_type, tree loop_mask, tree vec_mask,
>      return vec_mask;
>  
>    gcc_assert (TREE_TYPE (loop_mask) == mask_type);
> +
> +  /* Check if the mask is a combination of two different masks.  */
> +  gimple *def_stmt = SSA_NAME_DEF_STMT (vec_mask);
> +  if (is_gimple_assign (def_stmt)
> +      && gimple_assign_rhs_code (def_stmt) == BIT_AND_EXPR)
> +    {
> +      tree lhs1 = gimple_assign_rhs1 (def_stmt);
> +      tree lhs2 = gimple_assign_rhs2 (def_stmt);
> +
> +      vec_mask = prepare_load_store_mask (mask_type, loop_mask, lhs1, gsi);
> +      loop_mask = prepare_load_store_mask (mask_type, loop_mask, lhs2, gsi);
> +    }
> +

I think this is doing something different from what I suggested above.

I was thinking that we should do this when vectorising the AND itself
(mask__29.18_68 in the example above), using scalar_cond_masked_set
to check whether either side is or is going to be ANDed with the
loop mask.

That way we never generate more loop masks than we need to,
whereas the above version could.

Thanks,
Richard
Tamar Christina Nov. 2, 2021, 1:49 p.m. UTC | #4
Hi All,

Here’s a respin of the patch.

The following example

void f5(float * restrict z0, float * restrict z1, float *restrict x,
	float * restrict y, float c, int n)
{
    for (int i = 0; i < n; i++) {
        float a = x[i];
        float b = y[i];
        if (a > b) {
            z0[i] = a + b;
            if (a > c) {
                z1[i] = a - b;
            }
        }
    }
}

generates currently:

        ptrue   p3.b, all
        ld1w    z1.s, p1/z, [x2, x5, lsl 2]
        ld1w    z2.s, p1/z, [x3, x5, lsl 2]
        fcmgt   p0.s, p3/z, z1.s, z0.s
        fcmgt   p2.s, p1/z, z1.s, z2.s
        fcmgt   p0.s, p0/z, z1.s, z2.s
        and     p0.b, p0/z, p1.b, p1.b

The conditions for a > b and a > c become separate comparisons.

After this patch we generate:

        ld1w    z1.s, p0/z, [x2, x5, lsl 2]
        ld1w    z2.s, p0/z, [x3, x5, lsl 2]
        fcmgt   p1.s, p0/z, z1.s, z2.s
        fcmgt   p1.s, p1/z, z1.s, z0.s

Here the conditions a > b && a > c are folded by using the predicate result of
the previous compare, which allows one of the compares to be removed.

Whenever a mask is being generated from a BIT_AND we mask the operands of
the AND instead and then just AND the results.

This allows us to CSE the masks and generate the right combination.
However, because re-assoc will try to re-order the masks in the &, we now have
to perform a small local CSE on the vectorized loop if vectorization is
successful.

Note: This patch series is working incrementally towards generating the most
      efficient code for this and other loops in small steps.

Bootstrapped Regtested on aarch64-none-linux-gnu, x86_64-linux-gnu and no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	* tree-vect-stmts.c (prepare_load_store_mask): When combining two masks
	mask the operands instead of the combined operation.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/sve/pred-combine-and.c: New test.

--- inline copy of patch ---

diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pred-combine-and.c b/gcc/testsuite/gcc.target/aarch64/sve/pred-combine-and.c
new file mode 100644
index 0000000000000000000000000000000000000000..ed7fb591ec69dbdafe27fc9aa08a0b0910c94003
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/pred-combine-and.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 --save-temps" } */
+
+void f5(float * restrict z0, float * restrict z1, float *restrict x, float * restrict y, float c, int n)
+{
+    for (int i = 0; i < n; i++) {
+        float a = x[i];
+        float b = y[i];
+        if (a > b) {
+            z0[i] = a + b;
+            if (a > c) {
+                z1[i] = a - b;
+            }
+        }
+    }
+}
+
+/* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.s, p[0-9]+/z, z[0-9]+\.s, z[0-9]+\.s} 2 } } */
diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
index 1f56e10709e8f27d768c04f7ef914e2cd9347c36..27ee48aea429810a37777d907435a92b8fd1817d 100644
--- a/gcc/tree-vect-stmts.c
+++ b/gcc/tree-vect-stmts.c
@@ -6302,10 +6302,39 @@ vectorizable_operation (vec_info *vinfo,
 	}
       else
 	{
+	  /* When combining two masks check if either of them has already been
+	     combined with a loop mask, if that's the case we can mark that the
+	     new combined mask doesn't need to be combined with a loop mask.  */
+	  if (masked_loop_p && code == BIT_AND_EXPR)
+	    {
+	      scalar_cond_masked_key cond1 (op0, ncopies);
+	      if (loop_vinfo->scalar_cond_masked_set.contains (cond1))
+		{
+		  tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
+						  vectype, i);
+
+		  vop0 = prepare_load_store_mask (TREE_TYPE (mask), mask, vop0, gsi);
+		  scalar_cond_masked_key cond (scalar_dest, ncopies);
+		  loop_vinfo->scalar_cond_masked_set.add (cond);
+		}
+
+	      scalar_cond_masked_key cond2 (op1, ncopies);
+	      if (loop_vinfo->scalar_cond_masked_set.contains (cond2))
+		{
+		  tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
+						  vectype, i);
+
+		  vop1 = prepare_load_store_mask (TREE_TYPE (mask), mask, vop1, gsi);
+		  scalar_cond_masked_key cond (scalar_dest, ncopies);
+		  loop_vinfo->scalar_cond_masked_set.add (cond);
+		}
+	    }
+
 	  new_stmt = gimple_build_assign (vec_dest, code, vop0, vop1, vop2);
 	  new_temp = make_ssa_name (vec_dest, new_stmt);
 	  gimple_assign_set_lhs (new_stmt, new_temp);
 	  vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
+
 	  if (vec_cvt_dest)
 	    {
 	      new_temp = build1 (VIEW_CONVERT_EXPR, vectype_out, new_temp);
Richard Sandiford Nov. 2, 2021, 3:04 p.m. UTC | #5
Tamar Christina <Tamar.Christina@arm.com> writes:
> Hi All,
>
> Here’s a respin of the patch.
>
> The following example
>
> void f5(float * restrict z0, float * restrict z1, float *restrict x,
> 	float * restrict y, float c, int n)
> {
>     for (int i = 0; i < n; i++) {
>         float a = x[i];
>         float b = y[i];
>         if (a > b) {
>             z0[i] = a + b;
>             if (a > c) {
>                 z1[i] = a - b;
>             }
>         }
>     }
> }
>
> generates currently:
>
>         ptrue   p3.b, all
>         ld1w    z1.s, p1/z, [x2, x5, lsl 2]
>         ld1w    z2.s, p1/z, [x3, x5, lsl 2]
>         fcmgt   p0.s, p3/z, z1.s, z0.s
>         fcmgt   p2.s, p1/z, z1.s, z2.s
>         fcmgt   p0.s, p0/z, z1.s, z2.s
>         and     p0.b, p0/z, p1.b, p1.b
>
> The conditions for a > b and a > c become separate comparisons.
>
> After this patch we generate:
>
>         ld1w    z1.s, p0/z, [x2, x5, lsl 2]
>         ld1w    z2.s, p0/z, [x3, x5, lsl 2]
>         fcmgt   p1.s, p0/z, z1.s, z2.s
>         fcmgt   p1.s, p1/z, z1.s, z0.s
>
> Here the conditions a > b && a > c are folded by using the predicate result of
> the previous compare, which allows one of the compares to be removed.
>
> Whenever a mask is being generated from a BIT_AND we mask the operands of
> the AND instead and then just AND the results.
>
> This allows us to CSE the masks and generate the right combination.
> However, because re-assoc will try to re-order the masks in the &, we now have
> to perform a small local CSE on the vectorized loop if vectorization is
> successful.
>
> Note: This patch series is working incrementally towards generating the most
>       efficient code for this and other loops in small steps.
>
> Bootstrapped Regtested on aarch64-none-linux-gnu, x86_64-linux-gnu and no issues.
>
> Ok for master?
>
> Thanks,
> Tamar
>
> gcc/ChangeLog:
>
> 	* tree-vect-stmts.c (prepare_load_store_mask): When combining two masks
> 	mask the operands instead of the combined operation.
>
> gcc/testsuite/ChangeLog:
>
> 	* gcc.target/aarch64/sve/pred-combine-and.c: New test.
>
> --- inline copy of patch ---
>
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pred-combine-and.c b/gcc/testsuite/gcc.target/aarch64/sve/pred-combine-and.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..ed7fb591ec69dbdafe27fc9aa08a0b0910c94003
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/pred-combine-and.c
> @@ -0,0 +1,18 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 --save-temps" } */
> +
> +void f5(float * restrict z0, float * restrict z1, float *restrict x, float * restrict y, float c, int n)
> +{
> +    for (int i = 0; i < n; i++) {
> +        float a = x[i];
> +        float b = y[i];
> +        if (a > b) {
> +            z0[i] = a + b;
> +            if (a > c) {
> +                z1[i] = a - b;
> +            }
> +        }
> +    }
> +}
> +
> +/* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.s, p[0-9]+/z, z[0-9]+\.s, z[0-9]+\.s} 2 } } */
> diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
> index 1f56e10709e8f27d768c04f7ef914e2cd9347c36..27ee48aea429810a37777d907435a92b8fd1817d 100644
> --- a/gcc/tree-vect-stmts.c
> +++ b/gcc/tree-vect-stmts.c
> @@ -6302,10 +6302,39 @@ vectorizable_operation (vec_info *vinfo,
>  	}
>        else
>  	{
> +	  /* When combining two masks check if either of them has already been
> +	     combined with a loop mask, if that's the case we can mark that the

I think something like:
   s/has already been combined with/is elsewhere combined with/
might be clearer.  The other statement that ANDs with the loop mask might
come before or after this one.  In the latter case it won't have been
vectorised yet.

> +	     new combined mask doesn't need to be combined with a loop mask.  */

The way I imagined this last part working is that, after vectorising
a BIT_AND_EXPR like this:

  vop0' = vop & loop_mask_N
  vres = vop0' & vop1

we should record that vres is effectively already ANDed with loop_mask_N,
and so there's no need to create:

  vres' = vres & loop_mask_N

when vectorising the innermost IFN_MASK_STORE.

This could be a new hash_set in the loop_vec_info, containing
(condition, loop_mask) pairs for which condition & loop_mask == condition.
prepare_load_store_mask could check whether (vec_mask, loop_mask) is in
this set and return vec_mask unaltered if so.
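
I.e. roughly (sketch only; the exact names and spelling don't matter):

  /* In prepare_load_store_mask, before emitting the new AND:  */
  vec_cond_masked_key cond (vec_mask, loop_mask);
  if (loop_vinfo->vec_cond_masked_set.contains (cond))
    return vec_mask;

  /* ...and when vectorising the BIT_AND_EXPR itself, record the result
     once it has been ANDed with the loop mask:  */
  vec_cond_masked_key rec (vres, loop_mask);
  loop_vinfo->vec_cond_masked_set.add (rec);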

This new set would in a sense have the opposite effect of
scalar_cond_masked_key.  scalar_cond_masked_key encourage statements
to create ANDs with the loop mask in cases where they might not have
done otherwise.  The new set would instead tell statements (and
specifically prepare_load_store_mask) that such ANDs aren't necessary.

I guess we should also pass other ANDs with the loop mask through
prepare_load_store_mask (and rename it), so that they get the benefit too.

It looks like the patch tried to do that by entering scalar_dest
into the scalar_cond_masked_set.  I think that's likely to be
counter-productive though, since it would tell other statements
in the loop that they should AND the scalar dest with the loop mask
even if they wouldn't have done normally.

> +	  if (masked_loop_p && code == BIT_AND_EXPR)
> +	    {
> +	      scalar_cond_masked_key cond1 (op0, ncopies);

Nit, but: calling them cond0 and cond1 might be better, to avoid
having cond1 describe op2 rather than op1.

The patch looks good otherwise.  I think we're there in terms of
deciding when to generate the ANDs.  It's just a case of what
to do with the result.

Thanks,
Richard

> +	      if (loop_vinfo->scalar_cond_masked_set.contains (cond1))
> +		{
> +		  tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
> +						  vectype, i);
> +
> +		  vop0 = prepare_load_store_mask (TREE_TYPE (mask), mask, vop0, gsi);
> +		  scalar_cond_masked_key cond (scalar_dest, ncopies);
> +		  loop_vinfo->scalar_cond_masked_set.add (cond);
> +		}
> +
> +	      scalar_cond_masked_key cond2 (op1, ncopies);
> +	      if (loop_vinfo->scalar_cond_masked_set.contains (cond2))
> +		{
> +		  tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
> +						  vectype, i);
> +
> +		  vop1 = prepare_load_store_mask (TREE_TYPE (mask), mask, vop1, gsi);
> +		  scalar_cond_masked_key cond (scalar_dest, ncopies);
> +		  loop_vinfo->scalar_cond_masked_set.add (cond);
> +		}
> +	    }
> +
>  	  new_stmt = gimple_build_assign (vec_dest, code, vop0, vop1, vop2);
>  	  new_temp = make_ssa_name (vec_dest, new_stmt);
>  	  gimple_assign_set_lhs (new_stmt, new_temp);
>  	  vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
> +
>  	  if (vec_cvt_dest)
>  	    {
>  	      new_temp = build1 (VIEW_CONVERT_EXPR, vectype_out, new_temp);
Tamar Christina Nov. 15, 2021, 10:47 a.m. UTC | #6
Bootstrapped Regtested on aarch64-none-linux-gnu, x86_64-linux-gnu and no issues.

gcc/ChangeLog:

	* tree-vect-stmts.c (prepare_load_store_mask): Rename to...
	(prepare_vec_mask): ...This and record operations that have already been
	masked.
	(vectorizable_call): Use it.
	(vectorizable_operation): Likewise.
	(vectorizable_store): Likewise.
	(vectorizable_load): Likewise.
	* tree-vectorizer.c (vec_cond_masked_key::get_cond_ops_from_tree): New.
	* tree-vectorizer.h (struct vec_cond_masked_key): New.
	(class _loop_vec_info): Add vec_cond_masked_set.
	(vec_cond_masked_set_type): New.
	(struct default_hash_traits<vec_cond_masked_key>): New.


gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/sve/pred-combine-and.c: New test.

--- inline copy of patch ---

diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pred-combine-and.c b/gcc/testsuite/gcc.target/aarch64/sve/pred-combine-and.c
new file mode 100644
index 0000000000000000000000000000000000000000..ee927346abe518caa3cba397b11dfd1ee7e93630
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/pred-combine-and.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+
+void f5(float * restrict z0, float * restrict z1, float *restrict x, float * restrict y, float c, int n)
+{
+    for (int i = 0; i < n; i++) {
+        float a = x[i];
+        float b = y[i];
+        if (a > b) {
+            z0[i] = a + b;
+            if (a > c) {
+                z1[i] = a - b;
+            }
+        }
+    }
+}
+
+/* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.s, p[0-9]+/z, z[0-9]+\.s, z[0-9]+\.s} 2 } } */
diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
index 2284ad069e4d521f4e0bd43d34181a258cd636ef..b1946b589043312a9b29d832f9b8398e24787a5f 100644
--- a/gcc/tree-vect-stmts.c
+++ b/gcc/tree-vect-stmts.c
@@ -1796,23 +1796,30 @@ check_load_store_for_partial_vectors (loop_vec_info loop_vinfo, tree vectype,
 /* Return the mask input to a masked load or store.  VEC_MASK is the vectorized
    form of the scalar mask condition and LOOP_MASK, if nonnull, is the mask
    that needs to be applied to all loads and stores in a vectorized loop.
-   Return VEC_MASK if LOOP_MASK is null, otherwise return VEC_MASK & LOOP_MASK.
+   Return VEC_MASK if LOOP_MASK is null or if VEC_MASK is already masked,
+   otherwise return VEC_MASK & LOOP_MASK.
 
    MASK_TYPE is the type of both masks.  If new statements are needed,
    insert them before GSI.  */
 
 static tree
-prepare_load_store_mask (tree mask_type, tree loop_mask, tree vec_mask,
-			 gimple_stmt_iterator *gsi)
+prepare_vec_mask (tree mask_type, loop_vec_info loop_vinfo, tree loop_mask,
+		  tree vec_mask, gimple_stmt_iterator *gsi)
 {
   gcc_assert (useless_type_conversion_p (mask_type, TREE_TYPE (vec_mask)));
   if (!loop_mask)
     return vec_mask;
 
   gcc_assert (TREE_TYPE (loop_mask) == mask_type);
+
+  vec_cond_masked_key cond (vec_mask, loop_mask);
+  if (loop_vinfo->vec_cond_masked_set.contains (cond))
+    return vec_mask;
+
   tree and_res = make_temp_ssa_name (mask_type, NULL, "vec_mask_and");
   gimple *and_stmt = gimple_build_assign (and_res, BIT_AND_EXPR,
 					  vec_mask, loop_mask);
+
   gsi_insert_before (gsi, and_stmt, GSI_SAME_STMT);
   return and_res;
 }
@@ -3526,8 +3533,9 @@ vectorizable_call (vec_info *vinfo,
 			  gcc_assert (ncopies == 1);
 			  tree mask = vect_get_loop_mask (gsi, masks, vec_num,
 							  vectype_out, i);
-			  vargs[mask_opno] = prepare_load_store_mask
-			    (TREE_TYPE (mask), mask, vargs[mask_opno], gsi);
+			  vargs[mask_opno] = prepare_vec_mask
+			    (TREE_TYPE (mask), loop_vinfo, mask,
+			     vargs[mask_opno], gsi);
 			}
 
 		      gcall *call;
@@ -3564,8 +3572,8 @@ vectorizable_call (vec_info *vinfo,
 	      tree mask = vect_get_loop_mask (gsi, masks, ncopies,
 					      vectype_out, j);
 	      vargs[mask_opno]
-		= prepare_load_store_mask (TREE_TYPE (mask), mask,
-					   vargs[mask_opno], gsi);
+		= prepare_vec_mask (TREE_TYPE (mask), loop_vinfo, mask,
+				    vargs[mask_opno], gsi);
 	    }
 
 	  gimple *new_stmt;
@@ -6302,10 +6310,46 @@ vectorizable_operation (vec_info *vinfo,
 	}
       else
 	{
+	  tree mask = NULL_TREE;
+	  /* When combining two masks check if either of them is elsewhere
+	     combined with a loop mask, if that's the case we can mark that the
+	     new combined mask doesn't need to be combined with a loop mask.  */
+	  if (masked_loop_p && code == BIT_AND_EXPR)
+	    {
+	      scalar_cond_masked_key cond0 (op0, ncopies);
+	      if (loop_vinfo->scalar_cond_masked_set.contains (cond0))
+		{
+		  mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
+						  vectype, i);
+
+		  vop0 = prepare_vec_mask (TREE_TYPE (mask), loop_vinfo, mask,
+					   vop0, gsi);
+		}
+
+	      scalar_cond_masked_key cond1 (op1, ncopies);
+	      if (loop_vinfo->scalar_cond_masked_set.contains (cond1))
+		{
+		  mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
+						  vectype, i);
+
+		  vop1 = prepare_vec_mask (TREE_TYPE (mask), loop_vinfo, mask,
+					   vop1, gsi);
+		}
+	    }
+
 	  new_stmt = gimple_build_assign (vec_dest, code, vop0, vop1, vop2);
 	  new_temp = make_ssa_name (vec_dest, new_stmt);
 	  gimple_assign_set_lhs (new_stmt, new_temp);
 	  vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
+
+	  /* Enter the combined value into the vector cond hash so we don't
+	     AND it with a loop mask again.  */
+	  if (mask)
+	    {
+	      vec_cond_masked_key cond (new_temp, mask);
+	      loop_vinfo->vec_cond_masked_set.add (cond);
+	    }
+
 	  if (vec_cvt_dest)
 	    {
 	      new_temp = build1 (VIEW_CONVERT_EXPR, vectype_out, new_temp);
@@ -8166,8 +8210,8 @@ vectorizable_store (vec_info *vinfo,
 	    final_mask = vect_get_loop_mask (gsi, loop_masks, ncopies,
 					     vectype, j);
 	  if (vec_mask)
-	    final_mask = prepare_load_store_mask (mask_vectype, final_mask,
-						  vec_mask, gsi);
+	    final_mask = prepare_vec_mask (mask_vectype, loop_vinfo,
+					   final_mask, vec_mask, gsi);
 
 	  gcall *call;
 	  if (final_mask)
@@ -8221,8 +8265,8 @@ vectorizable_store (vec_info *vinfo,
 						 vec_num * ncopies,
 						 vectype, vec_num * j + i);
 	      if (vec_mask)
-		final_mask = prepare_load_store_mask (mask_vectype, final_mask,
-						      vec_mask, gsi);
+		final_mask = prepare_vec_mask (mask_vectype, loop_vinfo,
+					       final_mask, vec_mask, gsi);
 
 	      if (memory_access_type == VMAT_GATHER_SCATTER)
 		{
@@ -9442,8 +9486,8 @@ vectorizable_load (vec_info *vinfo,
 	    final_mask = vect_get_loop_mask (gsi, loop_masks, ncopies,
 					     vectype, j);
 	  if (vec_mask)
-	    final_mask = prepare_load_store_mask (mask_vectype, final_mask,
-						  vec_mask, gsi);
+	    final_mask = prepare_vec_mask (mask_vectype, loop_vinfo,
+					   final_mask, vec_mask, gsi);
 
 	  gcall *call;
 	  if (final_mask)
@@ -9494,8 +9538,8 @@ vectorizable_load (vec_info *vinfo,
 						 vec_num * ncopies,
 						 vectype, vec_num * j + i);
 	      if (vec_mask)
-		final_mask = prepare_load_store_mask (mask_vectype, final_mask,
-						      vec_mask, gsi);
+		final_mask = prepare_vec_mask (mask_vectype, loop_vinfo,
+					       final_mask, vec_mask, gsi);
 
 	      if (i > 0)
 		dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr,
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index d466be6ffcb7b2f7724332367cce2eee75245240..894f30b35fa5491e31c58ecb8a46717ef3598a9a 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -327,6 +327,80 @@ struct default_hash_traits<scalar_cond_masked_key>
 
 typedef hash_set<scalar_cond_masked_key> scalar_cond_masked_set_type;
 
+/* Key for map that records association between
+   vector conditions and corresponding loop mask, and
+   is populated by prepare_vec_mask.  */
+
+struct vec_cond_masked_key
+{
+  vec_cond_masked_key (tree t, tree loop_mask_)
+    : loop_mask (loop_mask_)
+  {
+    get_cond_ops_from_tree (t);
+  }
+
+  void get_cond_ops_from_tree (tree);
+
+  tree loop_mask;
+  tree_code code;
+  tree op0;
+  tree op1;
+};
+
+template<>
+struct default_hash_traits<vec_cond_masked_key>
+{
+  typedef vec_cond_masked_key compare_type;
+  typedef vec_cond_masked_key value_type;
+
+  static inline hashval_t
+  hash (value_type v)
+  {
+    inchash::hash h;
+    h.add_int (v.code);
+    inchash::add_expr (v.op0, h, 0);
+    inchash::add_expr (v.op1, h, 0);
+    h.add_ptr (v.loop_mask);
+    return h.end ();
+  }
+
+  static inline bool
+  equal (value_type existing, value_type candidate)
+  {
+    return (existing.code == candidate.code
+	    && existing.loop_mask == candidate.loop_mask
+	    && operand_equal_p (existing.op0, candidate.op0, 0)
+	    && operand_equal_p (existing.op1, candidate.op1, 0));
+  }
+
+  static const bool empty_zero_p = true;
+
+  static inline void
+  mark_empty (value_type &v)
+  {
+    v.loop_mask = NULL_TREE;
+  }
+
+  static inline bool
+  is_empty (value_type v)
+  {
+    return v.loop_mask == NULL_TREE;
+  }
+
+  static inline void mark_deleted (value_type &) {}
+
+  static inline bool is_deleted (const value_type &)
+  {
+    return false;
+  }
+
+  static inline void remove (value_type &) {}
+};
+
+typedef hash_set<vec_cond_masked_key> vec_cond_masked_set_type;
+
+
+
 /* Describes two objects whose addresses must be unequal for the vectorized
    loop to be valid.  */
 typedef std::pair<tree, tree> vec_object_pair;
@@ -643,6 +717,9 @@ public:
   /* Set of scalar conditions that have loop mask applied.  */
   scalar_cond_masked_set_type scalar_cond_masked_set;
 
+  /* Set of vector conditions that have loop mask applied.  */
+  vec_cond_masked_set_type vec_cond_masked_set;
+
   /* If we are using a loop mask to align memory addresses, this variable
      contains the number of vector elements that we should skip in the
      first iteration of the vector loop (i.e. the number of leading
diff --git a/gcc/tree-vectorizer.c b/gcc/tree-vectorizer.c
index 5c93961869db08e5ef4e2d22deffdaba8554d78f..4e4d3ca49b46dc051c9084372e8501457e23c632 100644
--- a/gcc/tree-vectorizer.c
+++ b/gcc/tree-vectorizer.c
@@ -1720,6 +1720,40 @@ scalar_cond_masked_key::get_cond_ops_from_tree (tree t)
   this->inverted_p = false;
 }
 
+/* If the condition represented by T is a comparison or the SSA name
+   result of a comparison, extract the comparison's operands.  Represent
+   T as NE_EXPR <T, 0> otherwise.  */
+
+void
+vec_cond_masked_key::get_cond_ops_from_tree (tree t)
+{
+  if (TREE_CODE_CLASS (TREE_CODE (t)) == tcc_comparison)
+    {
+      this->code = TREE_CODE (t);
+      this->op0 = TREE_OPERAND (t, 0);
+      this->op1 = TREE_OPERAND (t, 1);
+      return;
+    }
+
+  if (TREE_CODE (t) == SSA_NAME)
+    if (gassign *stmt = dyn_cast<gassign *> (SSA_NAME_DEF_STMT (t)))
+      {
+	tree_code code = gimple_assign_rhs_code (stmt);
+	if (TREE_CODE_CLASS (code) == tcc_comparison)
+	  {
+	    this->code = code;
+	    this->op0 = gimple_assign_rhs1 (stmt);
+	    this->op1 = gimple_assign_rhs2 (stmt);
+	    return;
+	  }
+      }
+
+  this->code = NE_EXPR;
+  this->op0 = t;
+  this->op1 = build_zero_cst (TREE_TYPE (t));
+}
+
+
 /* See the comment above the declaration for details.  */
 
 unsigned int
Richard Sandiford Nov. 30, 2021, 4:24 p.m. UTC | #7
Tamar Christina <Tamar.Christina@arm.com> writes:
> Bootstrapped Regtested on aarch64-none-linux-gnu, x86_64-linux-gnu and no issues.
>
> gcc/ChangeLog:
>
> 	* tree-vect-stmts.c (prepare_load_store_mask): Rename to...
> 	(prepare_vec_mask): ...This and record operations that have already been
> 	masked.
> 	(vectorizable_call): Use it.
> 	(vectorizable_operation): Likewise.
> 	(vectorizable_store): Likewise.
> 	(vectorizable_load): Likewise.
> 	* tree-vectorizer.c (vec_cond_masked_key::get_cond_ops_from_tree): New.
> 	* tree-vectorizer.h (struct vec_cond_masked_key): New.
> 	(class _loop_vec_info): Add vec_cond_masked_set.
> 	(vec_cond_masked_set_type): New.
> 	(struct default_hash_traits<vec_cond_masked_key>): New.
>
>
> gcc/testsuite/ChangeLog:
>
> 	* gcc.target/aarch64/sve/pred-combine-and.c: New test.
>
> --- inline copy of patch ---
>
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pred-combine-and.c b/gcc/testsuite/gcc.target/aarch64/sve/pred-combine-and.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..ee927346abe518caa3cba397b11dfd1ee7e93630
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/pred-combine-and.c
> @@ -0,0 +1,18 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3" } */
> +
> +void f5(float * restrict z0, float * restrict z1, float *restrict x, float * restrict y, float c, int n)
> +{
> +    for (int i = 0; i < n; i++) {
> +        float a = x[i];
> +        float b = y[i];
> +        if (a > b) {
> +            z0[i] = a + b;
> +            if (a > c) {
> +                z1[i] = a - b;
> +            }
> +        }
> +    }
> +}
> +
> +/* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.s, p[0-9]+/z, z[0-9]+\.s, z[0-9]+\.s} 2 } } */
> diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
> index 2284ad069e4d521f4e0bd43d34181a258cd636ef..b1946b589043312a9b29d832f9b8398e24787a5f 100644
> --- a/gcc/tree-vect-stmts.c
> +++ b/gcc/tree-vect-stmts.c
> @@ -1796,23 +1796,30 @@ check_load_store_for_partial_vectors (loop_vec_info loop_vinfo, tree vectype,
>  /* Return the mask input to a masked load or store.  VEC_MASK is the vectorized
>     form of the scalar mask condition and LOOP_MASK, if nonnull, is the mask
>     that needs to be applied to all loads and stores in a vectorized loop.
> -   Return VEC_MASK if LOOP_MASK is null, otherwise return VEC_MASK & LOOP_MASK.
> +   Return VEC_MASK if LOOP_MASK is null or if VEC_MASK is already masked,
> +   otherwise return VEC_MASK & LOOP_MASK.
>  
>     MASK_TYPE is the type of both masks.  If new statements are needed,
>     insert them before GSI.  */
>  
>  static tree
> -prepare_load_store_mask (tree mask_type, tree loop_mask, tree vec_mask,
> -			 gimple_stmt_iterator *gsi)
> +prepare_vec_mask (tree mask_type, loop_vec_info loop_vinfo, tree loop_mask,
> +		  tree vec_mask, gimple_stmt_iterator *gsi)

Minor, but: loop_vinfo normally comes first when present.

>  {
>    gcc_assert (useless_type_conversion_p (mask_type, TREE_TYPE (vec_mask)));
>    if (!loop_mask)
>      return vec_mask;
>  
>    gcc_assert (TREE_TYPE (loop_mask) == mask_type);
> +
> +  vec_cond_masked_key cond (vec_mask, loop_mask);
> +  if (loop_vinfo->vec_cond_masked_set.contains (cond))
> +    return vec_mask;
> +
>    tree and_res = make_temp_ssa_name (mask_type, NULL, "vec_mask_and");
>    gimple *and_stmt = gimple_build_assign (and_res, BIT_AND_EXPR,
>  					  vec_mask, loop_mask);
> +
>    gsi_insert_before (gsi, and_stmt, GSI_SAME_STMT);
>    return and_res;
>  }
> @@ -3526,8 +3533,9 @@ vectorizable_call (vec_info *vinfo,
>  			  gcc_assert (ncopies == 1);
>  			  tree mask = vect_get_loop_mask (gsi, masks, vec_num,
>  							  vectype_out, i);
> -			  vargs[mask_opno] = prepare_load_store_mask
> -			    (TREE_TYPE (mask), mask, vargs[mask_opno], gsi);
> +			  vargs[mask_opno] = prepare_vec_mask
> +			    (TREE_TYPE (mask), loop_vinfo, mask,
> +			     vargs[mask_opno], gsi);
>  			}
>  
>  		      gcall *call;
> @@ -3564,8 +3572,8 @@ vectorizable_call (vec_info *vinfo,
>  	      tree mask = vect_get_loop_mask (gsi, masks, ncopies,
>  					      vectype_out, j);
>  	      vargs[mask_opno]
> -		= prepare_load_store_mask (TREE_TYPE (mask), mask,
> -					   vargs[mask_opno], gsi);
> +		= prepare_vec_mask (TREE_TYPE (mask), loop_vinfo, mask,
> +				    vargs[mask_opno], gsi);
>  	    }
>  
>  	  gimple *new_stmt;
> @@ -6302,10 +6310,46 @@ vectorizable_operation (vec_info *vinfo,
>  	}
>        else
>  	{
> +	  tree mask = NULL_TREE;
> +	  /* When combining two masks check if either of them is elsewhere
> +	     combined with a loop mask, if that's the case we can mark that the
> +	     new combined mask doesn't need to be combined with a loop mask.  */
> +	  if (masked_loop_p && code == BIT_AND_EXPR)
> +	    {
> +	      scalar_cond_masked_key cond0 (op0, ncopies);
> +	      if (loop_vinfo->scalar_cond_masked_set.contains (cond0))
> +		{
> +		  mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
> +						  vectype, i);
> +
> +		  vop0 = prepare_vec_mask (TREE_TYPE (mask), loop_vinfo, mask,
> +					   vop0, gsi);
> +		}
> +
> +	      scalar_cond_masked_key cond1 (op1, ncopies);
> +	      if (loop_vinfo->scalar_cond_masked_set.contains (cond1))
> +		{
> +		  mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
> +						  vectype, i);
> +
> +		  vop1 = prepare_vec_mask (TREE_TYPE (mask), loop_vinfo, mask,
> +					   vop1, gsi);
> +		}
> +	    }
> +
>  	  new_stmt = gimple_build_assign (vec_dest, code, vop0, vop1, vop2);
>  	  new_temp = make_ssa_name (vec_dest, new_stmt);
>  	  gimple_assign_set_lhs (new_stmt, new_temp);
>  	  vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
> +
> +	  /* Enter the combined value into the vector cond hash so we don't
> +	     AND it with a loop mask again.  */
> +	  if (mask)
> +	    {
> +	      vec_cond_masked_key cond (new_temp, mask);
> +	      loop_vinfo->vec_cond_masked_set.add (cond);
> +	    }
> +
>  	  if (vec_cvt_dest)
>  	    {
>  	      new_temp = build1 (VIEW_CONVERT_EXPR, vectype_out, new_temp);
> @@ -8166,8 +8210,8 @@ vectorizable_store (vec_info *vinfo,
>  	    final_mask = vect_get_loop_mask (gsi, loop_masks, ncopies,
>  					     vectype, j);
>  	  if (vec_mask)
> -	    final_mask = prepare_load_store_mask (mask_vectype, final_mask,
> -						  vec_mask, gsi);
> +	    final_mask = prepare_vec_mask (mask_vectype, loop_vinfo,
> +					   final_mask, vec_mask, gsi);
>  
>  	  gcall *call;
>  	  if (final_mask)
> @@ -8221,8 +8265,8 @@ vectorizable_store (vec_info *vinfo,
>  						 vec_num * ncopies,
>  						 vectype, vec_num * j + i);
>  	      if (vec_mask)
> -		final_mask = prepare_load_store_mask (mask_vectype, final_mask,
> -						      vec_mask, gsi);
> +		final_mask = prepare_vec_mask (mask_vectype, loop_vinfo,
> +					       final_mask, vec_mask, gsi);
>  
>  	      if (memory_access_type == VMAT_GATHER_SCATTER)
>  		{
> @@ -9442,8 +9486,8 @@ vectorizable_load (vec_info *vinfo,
>  	    final_mask = vect_get_loop_mask (gsi, loop_masks, ncopies,
>  					     vectype, j);
>  	  if (vec_mask)
> -	    final_mask = prepare_load_store_mask (mask_vectype, final_mask,
> -						  vec_mask, gsi);
> +	    final_mask = prepare_vec_mask (mask_vectype, loop_vinfo,
> +					   final_mask, vec_mask, gsi);
>  
>  	  gcall *call;
>  	  if (final_mask)
> @@ -9494,8 +9538,8 @@ vectorizable_load (vec_info *vinfo,
>  						 vec_num * ncopies,
>  						 vectype, vec_num * j + i);
>  	      if (vec_mask)
> -		final_mask = prepare_load_store_mask (mask_vectype, final_mask,
> -						      vec_mask, gsi);
> +		final_mask = prepare_vec_mask (mask_vectype, loop_vinfo,
> +					       final_mask, vec_mask, gsi);
>  
>  	      if (i > 0)
>  		dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr,
> diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
> index d466be6ffcb7b2f7724332367cce2eee75245240..894f30b35fa5491e31c58ecb8a46717ef3598a9a 100644
> --- a/gcc/tree-vectorizer.h
> +++ b/gcc/tree-vectorizer.h
> @@ -327,6 +327,80 @@ struct default_hash_traits<scalar_cond_masked_key>
>  
>  typedef hash_set<scalar_cond_masked_key> scalar_cond_masked_set_type;
>  
> +/* Key for map that records association between
> +   vector conditions and corresponding loop mask, and
> +   is populated by prepare_vec_mask.  */
> +
> +struct vec_cond_masked_key
> +{
> +  vec_cond_masked_key (tree t, tree loop_mask_)
> +    : loop_mask (loop_mask_)
> +  {
> +    get_cond_ops_from_tree (t);
> +  }
> +
> +  void get_cond_ops_from_tree (tree);
> +
> +  tree loop_mask;
> +  tree_code code;
> +  tree op0;
> +  tree op1;
> +};
> +
> +template<>
> +struct default_hash_traits<vec_cond_masked_key>
> +{
> +  typedef vec_cond_masked_key compare_type;
> +  typedef vec_cond_masked_key value_type;
> +
> +  static inline hashval_t
> +  hash (value_type v)
> +  {
> +    inchash::hash h;
> +    h.add_int (v.code);
> +    inchash::add_expr (v.op0, h, 0);
> +    inchash::add_expr (v.op1, h, 0);
> +    h.add_ptr (v.loop_mask);
> +    return h.end ();
> +  }
> +
> +  static inline bool
> +  equal (value_type existing, value_type candidate)
> +  {
> +    return (existing.code == candidate.code
> +	    && existing.loop_mask == candidate.loop_mask
> +	    && operand_equal_p (existing.op0, candidate.op0, 0)
> +	    && operand_equal_p (existing.op1, candidate.op1, 0));
> +  }
> +
> +  static const bool empty_zero_p = true;
> +
> +  static inline void
> +  mark_empty (value_type &v)
> +  {
> +    v.loop_mask = NULL_TREE;
> +  }
> +
> +  static inline bool
> +  is_empty (value_type v)
> +  {
> +    return v.loop_mask == NULL_TREE;
> +  }
> +
> +  static inline void mark_deleted (value_type &) {}
> +
> +  static inline bool is_deleted (const value_type &)
> +  {
> +    return false;
> +  }
> +
> +  static inline void remove (value_type &) {}
> +};
> +
> +typedef hash_set<vec_cond_masked_key> vec_cond_masked_set_type;
> +
> +
> +
>  /* Describes two objects whose addresses must be unequal for the vectorized
>     loop to be valid.  */
>  typedef std::pair<tree, tree> vec_object_pair;
> @@ -643,6 +717,9 @@ public:
>    /* Set of scalar conditions that have loop mask applied.  */
>    scalar_cond_masked_set_type scalar_cond_masked_set;
>  
> +  /* Set of vector conditions that have loop mask applied.  */
> +  vec_cond_masked_set_type vec_cond_masked_set;
> +
>    /* If we are using a loop mask to align memory addresses, this variable
>       contains the number of vector elements that we should skip in the
>       first iteration of the vector loop (i.e. the number of leading
> diff --git a/gcc/tree-vectorizer.c b/gcc/tree-vectorizer.c
> index 5c93961869db08e5ef4e2d22deffdaba8554d78f..4e4d3ca49b46dc051c9084372e8501457e23c632 100644
> --- a/gcc/tree-vectorizer.c
> +++ b/gcc/tree-vectorizer.c
> @@ -1720,6 +1720,40 @@ scalar_cond_masked_key::get_cond_ops_from_tree (tree t)
>    this->inverted_p = false;
>  }
>  
> +/* If the condition represented by T is a comparison or the SSA name
> +   result of a comparison, extract the comparison's operands.  Represent
> +   T as NE_EXPR <T, 0> otherwise.  */
> +
> +void
> +vec_cond_masked_key::get_cond_ops_from_tree (tree t)
> +{
> +  if (TREE_CODE_CLASS (TREE_CODE (t)) == tcc_comparison)
> +    {
> +      this->code = TREE_CODE (t);
> +      this->op0 = TREE_OPERAND (t, 0);
> +      this->op1 = TREE_OPERAND (t, 1);
> +      return;
> +    }
> +
> +  if (TREE_CODE (t) == SSA_NAME)
> +    if (gassign *stmt = dyn_cast<gassign *> (SSA_NAME_DEF_STMT (t)))
> +      {
> +	tree_code code = gimple_assign_rhs_code (stmt);
> +	if (TREE_CODE_CLASS (code) == tcc_comparison)
> +	  {
> +	    this->code = code;
> +	    this->op0 = gimple_assign_rhs1 (stmt);
> +	    this->op1 = gimple_assign_rhs2 (stmt);
> +	    return;
> +	  }
> +      }
> +
> +  this->code = NE_EXPR;
> +  this->op0 = t;
> +  this->op1 = build_zero_cst (TREE_TYPE (t));
> +}
> +
> +

This hashing looks unnecessarily complex.  The values we're hashing are
vector SSA_NAMEs, so I think we should be able to hash and compare them
as a plain pair of pointers.

The type could then be std::pair and the hashing could be done using
pair_hash from hash-traits.h.

Thanks,
Richard
Tamar Christina Dec. 2, 2021, 9:33 p.m. UTC | #8
> This hashing looks unnecessarily complex.  The values we're hashing are
> vector SSA_NAMEs, so I think we should be able to hash and compare them
> as a plain pair of pointers.
> 
> The type could then be std::pair and the hashing could be done using
> pair_hash from hash-traits.h.
> 

Fancy.. TIL...

Here's the respun patch.

Bootstrapped Regtested on aarch64-none-linux-gnu, x86_64-linux-gnu and no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	* tree-vect-stmts.c (prepare_load_store_mask): Rename to...
	(prepare_vec_mask): ...This and record operations that have already been
	masked.
	(vectorizable_call): Use it.
	(vectorizable_operation): Likewise.
	(vectorizable_store): Likewise.
	(vectorizable_load): Likewise.
	* tree-vectorizer.h (class _loop_vec_info): Add vec_cond_masked_set.
	(vec_cond_masked_set_type, tree_cond_mask_hash,
	vec_cond_masked_key): New.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/sve/pred-combine-and.c: New test.

--- inline copy of patch ---

diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pred-combine-and.c b/gcc/testsuite/gcc.target/aarch64/sve/pred-combine-and.c
new file mode 100644
index 0000000000000000000000000000000000000000..ee927346abe518caa3cba397b11dfd1ee7e93630
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/pred-combine-and.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+
+void f5(float * restrict z0, float * restrict z1, float *restrict x, float * restrict y, float c, int n)
+{
+    for (int i = 0; i < n; i++) {
+        float a = x[i];
+        float b = y[i];
+        if (a > b) {
+            z0[i] = a + b;
+            if (a > c) {
+                z1[i] = a - b;
+            }
+        }
+    }
+}
+
+/* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.s, p[0-9]+/z, z[0-9]+\.s, z[0-9]+\.s} 2 } } */
diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
index 2284ad069e4d521f4e0bd43d34181a258cd636ef..2a02ff0b1e53be6eda49770b240f8f58f3928a87 100644
--- a/gcc/tree-vect-stmts.c
+++ b/gcc/tree-vect-stmts.c
@@ -1796,23 +1796,30 @@ check_load_store_for_partial_vectors (loop_vec_info loop_vinfo, tree vectype,
 /* Return the mask input to a masked load or store.  VEC_MASK is the vectorized
    form of the scalar mask condition and LOOP_MASK, if nonnull, is the mask
    that needs to be applied to all loads and stores in a vectorized loop.
-   Return VEC_MASK if LOOP_MASK is null, otherwise return VEC_MASK & LOOP_MASK.
+   Return VEC_MASK if LOOP_MASK is null or if VEC_MASK is already masked,
+   otherwise return VEC_MASK & LOOP_MASK.
 
    MASK_TYPE is the type of both masks.  If new statements are needed,
    insert them before GSI.  */
 
 static tree
-prepare_load_store_mask (tree mask_type, tree loop_mask, tree vec_mask,
-			 gimple_stmt_iterator *gsi)
+prepare_vec_mask (loop_vec_info loop_vinfo, tree mask_type, tree loop_mask,
+		  tree vec_mask, gimple_stmt_iterator *gsi)
 {
   gcc_assert (useless_type_conversion_p (mask_type, TREE_TYPE (vec_mask)));
   if (!loop_mask)
     return vec_mask;
 
   gcc_assert (TREE_TYPE (loop_mask) == mask_type);
+
+  vec_cond_masked_key cond (vec_mask, loop_mask);
+  if (loop_vinfo->vec_cond_masked_set.contains (cond))
+    return vec_mask;
+
   tree and_res = make_temp_ssa_name (mask_type, NULL, "vec_mask_and");
   gimple *and_stmt = gimple_build_assign (and_res, BIT_AND_EXPR,
 					  vec_mask, loop_mask);
+
   gsi_insert_before (gsi, and_stmt, GSI_SAME_STMT);
   return and_res;
 }
@@ -3526,8 +3533,9 @@ vectorizable_call (vec_info *vinfo,
 			  gcc_assert (ncopies == 1);
 			  tree mask = vect_get_loop_mask (gsi, masks, vec_num,
 							  vectype_out, i);
-			  vargs[mask_opno] = prepare_load_store_mask
-			    (TREE_TYPE (mask), mask, vargs[mask_opno], gsi);
+			  vargs[mask_opno] = prepare_vec_mask
+			    (loop_vinfo, TREE_TYPE (mask), mask,
+			     vargs[mask_opno], gsi);
 			}
 
 		      gcall *call;
@@ -3564,8 +3572,8 @@ vectorizable_call (vec_info *vinfo,
 	      tree mask = vect_get_loop_mask (gsi, masks, ncopies,
 					      vectype_out, j);
 	      vargs[mask_opno]
-		= prepare_load_store_mask (TREE_TYPE (mask), mask,
-					   vargs[mask_opno], gsi);
+		= prepare_vec_mask (loop_vinfo, TREE_TYPE (mask), mask,
+				    vargs[mask_opno], gsi);
 	    }
 
 	  gimple *new_stmt;
@@ -6302,10 +6310,46 @@ vectorizable_operation (vec_info *vinfo,
 	}
       else
 	{
+	  tree mask = NULL_TREE;
+	  /* When combining two masks check if either of them is elsewhere
+	     combined with a loop mask, if that's the case we can mark that the
+	     new combined mask doesn't need to be combined with a loop mask.  */
+	  if (masked_loop_p && code == BIT_AND_EXPR)
+	    {
+	      scalar_cond_masked_key cond0 (op0, ncopies);
+	      if (loop_vinfo->scalar_cond_masked_set.contains (cond0))
+		{
+		  mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
+						  vectype, i);
+
+		  vop0 = prepare_vec_mask (loop_vinfo, TREE_TYPE (mask), mask,
+					   vop0, gsi);
+		}
+
+	      scalar_cond_masked_key cond1 (op1, ncopies);
+	      if (loop_vinfo->scalar_cond_masked_set.contains (cond1))
+		{
+		  mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
+						  vectype, i);
+
+		  vop1 = prepare_vec_mask (loop_vinfo, TREE_TYPE (mask), mask,
+					   vop1, gsi);
+		}
+	    }
+
 	  new_stmt = gimple_build_assign (vec_dest, code, vop0, vop1, vop2);
 	  new_temp = make_ssa_name (vec_dest, new_stmt);
 	  gimple_assign_set_lhs (new_stmt, new_temp);
 	  vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
+
+	  /* Enter the combined value into the vector cond hash so we don't
+	     AND it with a loop mask again.  */
+	  if (mask)
+	    {
+	      vec_cond_masked_key cond (new_temp, mask);
+	      loop_vinfo->vec_cond_masked_set.add (cond);
+	    }
+
 	  if (vec_cvt_dest)
 	    {
 	      new_temp = build1 (VIEW_CONVERT_EXPR, vectype_out, new_temp);
@@ -8166,8 +8210,8 @@ vectorizable_store (vec_info *vinfo,
 	    final_mask = vect_get_loop_mask (gsi, loop_masks, ncopies,
 					     vectype, j);
 	  if (vec_mask)
-	    final_mask = prepare_load_store_mask (mask_vectype, final_mask,
-						  vec_mask, gsi);
+	    final_mask = prepare_vec_mask (loop_vinfo, mask_vectype,
+					   final_mask, vec_mask, gsi);
 
 	  gcall *call;
 	  if (final_mask)
@@ -8221,8 +8265,8 @@ vectorizable_store (vec_info *vinfo,
 						 vec_num * ncopies,
 						 vectype, vec_num * j + i);
 	      if (vec_mask)
-		final_mask = prepare_load_store_mask (mask_vectype, final_mask,
-						      vec_mask, gsi);
+		final_mask = prepare_vec_mask (loop_vinfo, mask_vectype,
+					       final_mask, vec_mask, gsi);
 
 	      if (memory_access_type == VMAT_GATHER_SCATTER)
 		{
@@ -9442,8 +9486,8 @@ vectorizable_load (vec_info *vinfo,
 	    final_mask = vect_get_loop_mask (gsi, loop_masks, ncopies,
 					     vectype, j);
 	  if (vec_mask)
-	    final_mask = prepare_load_store_mask (mask_vectype, final_mask,
-						  vec_mask, gsi);
+	    final_mask = prepare_vec_mask (loop_vinfo, mask_vectype,
+					   final_mask, vec_mask, gsi);
 
 	  gcall *call;
 	  if (final_mask)
@@ -9494,8 +9538,8 @@ vectorizable_load (vec_info *vinfo,
 						 vec_num * ncopies,
 						 vectype, vec_num * j + i);
 	      if (vec_mask)
-		final_mask = prepare_load_store_mask (mask_vectype, final_mask,
-						      vec_mask, gsi);
+		final_mask = prepare_vec_mask (loop_vinfo, mask_vectype,
+					       final_mask, vec_mask, gsi);
 
 	      if (i > 0)
 		dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr,
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index bd6f334d15fae4ffbe8c5ffb496ed0a820971638..d587356a32ff4c7230678e69162d639a31ea4baa 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -327,6 +327,13 @@ struct default_hash_traits<scalar_cond_masked_key>
 
 typedef hash_set<scalar_cond_masked_key> scalar_cond_masked_set_type;
 
+/* Key and map that records association between vector conditions and
+   corresponding loop mask, and is populated by prepare_vec_mask.  */
+
+typedef std::pair<tree, tree> vec_cond_masked_key;
+typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_cond_mask_hash;
+typedef hash_set<tree_cond_mask_hash> vec_cond_masked_set_type;
+
 /* Describes two objects whose addresses must be unequal for the vectorized
    loop to be valid.  */
 typedef std::pair<tree, tree> vec_object_pair;
@@ -646,6 +653,9 @@ public:
   /* Set of scalar conditions that have loop mask applied.  */
   scalar_cond_masked_set_type scalar_cond_masked_set;
 
+  /* Set of vector conditions that have loop mask applied.  */
+  vec_cond_masked_set_type vec_cond_masked_set;
+
   /* If we are using a loop mask to align memory addresses, this variable
      contains the number of vector elements that we should skip in the
      first iteration of the vector loop (i.e. the number of leading
Richard Sandiford Dec. 3, 2021, 11:55 a.m. UTC | #9
Tamar Christina <Tamar.Christina@arm.com> writes:
>> This hashing looks unnecessarily complex.  The values we're hashing are
>> vector SSA_NAMEs, so I think we should be able to hash and compare them
>> as a plain pair of pointers.
>> 
>> The type could then be std::pair and the hashing could be done using
>> pair_hash from hash-traits.h.
>> 
>
> Fancy.. TIL...
>
> Here's the respun patch.
>
> Bootstrapped Regtested on aarch64-none-linux-gnu, x86_64-linux-gnu and no issues.

LGTM, just some very minor nits…

> Ok for master?
>
> Thanks,
> Tamar
>
> gcc/ChangeLog:
>
> 	* tree-vect-stmts.c (prepare_load_store_mask): Rename to...
> 	(prepare_vec_mask): ...This and record operations that have already been
> 	masked.
> 	(vectorizable_call): Use it.
> 	(vectorizable_operation): Likewise.
> 	(vectorizable_store): Likewise.
> 	(vectorizable_load): Likewise.
> 	* tree-vectorizer.h (class _loop_vec_info): Add vec_cond_masked_set.
> 	(vec_cond_masked_set_type, tree_cond_mask_hash,
> 	vec_cond_masked_key): New.
>
> gcc/testsuite/ChangeLog:
>
> 	* gcc.target/aarch64/sve/pred-combine-and.c: New test.
>
> --- inline copy of patch ---
>
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pred-combine-and.c b/gcc/testsuite/gcc.target/aarch64/sve/pred-combine-and.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..ee927346abe518caa3cba397b11dfd1ee7e93630
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/pred-combine-and.c
> @@ -0,0 +1,18 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3" } */
> +
> +void f5(float * restrict z0, float * restrict z1, float *restrict x, float * restrict y, float c, int n)
> +{
> +    for (int i = 0; i < n; i++) {
> +        float a = x[i];
> +        float b = y[i];
> +        if (a > b) {
> +            z0[i] = a + b;
> +            if (a > c) {
> +                z1[i] = a - b;
> +            }
> +        }
> +    }
> +}
> +
> +/* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.s, p[0-9]+/z, z[0-9]+\.s, z[0-9]+\.s} 2 } } */
> diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
> index 2284ad069e4d521f4e0bd43d34181a258cd636ef..2a02ff0b1e53be6eda49770b240f8f58f3928a87 100644
> --- a/gcc/tree-vect-stmts.c
> +++ b/gcc/tree-vect-stmts.c
> @@ -1796,23 +1796,30 @@ check_load_store_for_partial_vectors (loop_vec_info loop_vinfo, tree vectype,
>  /* Return the mask input to a masked load or store.  VEC_MASK is the vectorized
>     form of the scalar mask condition and LOOP_MASK, if nonnull, is the mask
>     that needs to be applied to all loads and stores in a vectorized loop.
> -   Return VEC_MASK if LOOP_MASK is null, otherwise return VEC_MASK & LOOP_MASK.
> +   Return VEC_MASK if LOOP_MASK is null or if VEC_MASK is already masked,
> +   otherwise return VEC_MASK & LOOP_MASK.
>  
>     MASK_TYPE is the type of both masks.  If new statements are needed,
>     insert them before GSI.  */
>  
>  static tree
> -prepare_load_store_mask (tree mask_type, tree loop_mask, tree vec_mask,
> -			 gimple_stmt_iterator *gsi)
> +prepare_vec_mask (loop_vec_info loop_vinfo, tree mask_type, tree loop_mask,
> +		  tree vec_mask, gimple_stmt_iterator *gsi)
>  {
>    gcc_assert (useless_type_conversion_p (mask_type, TREE_TYPE (vec_mask)));
>    if (!loop_mask)
>      return vec_mask;
>  
>    gcc_assert (TREE_TYPE (loop_mask) == mask_type);
> +
> +  vec_cond_masked_key cond (vec_mask, loop_mask);
> +  if (loop_vinfo->vec_cond_masked_set.contains (cond))
> +    return vec_mask;
> +

Guess this is pushing a personal preference, sorry, but now that
the code is C++11, I think we should use:

  if (loop_vinfo->vec_cond_masked_set.contains ({ vec_mask, loop_mask }))
    return vec_mask;

for cases like this, avoiding the need for the separate “cond” variable.

>    tree and_res = make_temp_ssa_name (mask_type, NULL, "vec_mask_and");
>    gimple *and_stmt = gimple_build_assign (and_res, BIT_AND_EXPR,
>  					  vec_mask, loop_mask);
> +
>    gsi_insert_before (gsi, and_stmt, GSI_SAME_STMT);
>    return and_res;
>  }
> @@ -3526,8 +3533,9 @@ vectorizable_call (vec_info *vinfo,
>  			  gcc_assert (ncopies == 1);
>  			  tree mask = vect_get_loop_mask (gsi, masks, vec_num,
>  							  vectype_out, i);
> -			  vargs[mask_opno] = prepare_load_store_mask
> -			    (TREE_TYPE (mask), mask, vargs[mask_opno], gsi);
> +			  vargs[mask_opno] = prepare_vec_mask
> +			    (loop_vinfo, TREE_TYPE (mask), mask,
> +			     vargs[mask_opno], gsi);
>  			}
>  
>  		      gcall *call;
> @@ -3564,8 +3572,8 @@ vectorizable_call (vec_info *vinfo,
>  	      tree mask = vect_get_loop_mask (gsi, masks, ncopies,
>  					      vectype_out, j);
>  	      vargs[mask_opno]
> -		= prepare_load_store_mask (TREE_TYPE (mask), mask,
> -					   vargs[mask_opno], gsi);
> +		= prepare_vec_mask (loop_vinfo, TREE_TYPE (mask), mask,
> +				    vargs[mask_opno], gsi);
>  	    }
>  
>  	  gimple *new_stmt;
> @@ -6302,10 +6310,46 @@ vectorizable_operation (vec_info *vinfo,
>  	}
>        else
>  	{
> +	  tree mask = NULL_TREE;
> +	  /* When combining two masks check is either of them is elsewhere

s/is either/if either/

didn't notice last time, sorry

> +	     combined with a loop mask, if that's the case we can mark that the
> +	     new combined mask doesn't need to be combined with a loop mask.  */
> +	  if (masked_loop_p && code == BIT_AND_EXPR)
> +	    {
> +	      scalar_cond_masked_key cond0 (op0, ncopies);
> +	      if (loop_vinfo->scalar_cond_masked_set.contains (cond0))
> +		{
> +		  mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
> +						  vectype, i);

The indentation of this line is off; it should line up with “gsi”.
Same for the call below.

> +
> +		  vop0 = prepare_vec_mask (loop_vinfo, TREE_TYPE (mask), mask,
> +					   vop0, gsi);
> +		}
> +
> +	      scalar_cond_masked_key cond1 (op1, ncopies);
> +	      if (loop_vinfo->scalar_cond_masked_set.contains (cond1))
> +		{
> +		  mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
> +						  vectype, i);
> +
> +		  vop1 = prepare_vec_mask (loop_vinfo, TREE_TYPE (mask), mask,
> +					   vop1, gsi);
> +		}
> +	    }
> +
>  	  new_stmt = gimple_build_assign (vec_dest, code, vop0, vop1, vop2);
>  	  new_temp = make_ssa_name (vec_dest, new_stmt);
>  	  gimple_assign_set_lhs (new_stmt, new_temp);
>  	  vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
> +
> +	  /* Enter the combined value into the vector cond hash so we don't
> +	     AND it with a loop mask again.  */
> +	  if (mask)
> +	    {
> +	      vec_cond_masked_key cond (new_temp, mask);
> +	      loop_vinfo->vec_cond_masked_set.add (cond);
> +	    }
> +

Similarly to the above, I think it'd be neater to do:

	  if (mask)
	    loop_vinfo->vec_cond_masked_set.add ({ new_temp, mask });

> […]
> diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
> index bd6f334d15fae4ffbe8c5ffb496ed0a820971638..d587356a32ff4c7230678e69162d639a31ea4baa 100644
> --- a/gcc/tree-vectorizer.h
> +++ b/gcc/tree-vectorizer.h
> @@ -327,6 +327,13 @@ struct default_hash_traits<scalar_cond_masked_key>
>  
>  typedef hash_set<scalar_cond_masked_key> scalar_cond_masked_set_type;
>  
> +/* Key and map that records association between vector conditions and
> +   corresponding loop mask, and is populated by prepare_vec_mask.  */
> +
> +typedef std::pair<tree, tree> vec_cond_masked_key;
> +typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_cond_mask_hash;

Even more petty, but dropping the space before the “<” would make these
three lines consistent.

OK with those changes, thanks.

Richard

> +typedef hash_set<tree_cond_mask_hash> vec_cond_masked_set_type;
> +
>  /* Describes two objects whose addresses must be unequal for the vectorized
>     loop to be valid.  */
>  typedef std::pair<tree, tree> vec_object_pair;
> @@ -646,6 +653,9 @@ public:
>    /* Set of scalar conditions that have loop mask applied.  */
>    scalar_cond_masked_set_type scalar_cond_masked_set;
>  
> +  /* Set of vector conditions that have loop mask applied.  */
> +  vec_cond_masked_set_type vec_cond_masked_set;
> +
>    /* If we are using a loop mask to align memory addresses, this variable
>       contains the number of vector elements that we should skip in the
>       first iteration of the vector loop (i.e. the number of leading
diff mbox series

Patch

diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
index 2c23c6b12bafb038d82920e7141a418e078a2c65..ee9d32c0a5534209689d9d3abaa560ee5b66347d 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -8162,6 +8162,48 @@  (define_insn_and_split "*mask_inv_combine"
 }
 )
 
+;; Combine multiple masks where the comparisons operators are the same and
+;; each comparison has one parameter shared. e.g. combine a > b && a > c
+(define_insn_and_split "*mask_cmp_and_combine"
+  [(set (match_operand:<VPRED> 0 "register_operand" "=Upa")
+	(and:<VPRED>
+	  (and:<VPRED>
+	    (unspec:<VPRED>
+	      [(match_operand:<VPRED> 1)
+	       (const_int SVE_KNOWN_PTRUE)
+	       (match_operand:SVE_FULL_F 2 "register_operand" "w")
+	       (match_operand:SVE_FULL_F 3 "aarch64_simd_reg_or_zero" "wDz")]
+	      SVE_COND_FP_CMP_I0)
+	    (unspec:<VPRED>
+	      [(match_dup 1)
+	       (const_int SVE_KNOWN_PTRUE)
+	       (match_dup 2)
+	       (match_operand:SVE_FULL_F 4 "aarch64_simd_reg_or_zero" "wDz")]
+	      SVE_COND_FP_CMP_I0))
+	    (match_operand:<VPRED> 5 "register_operand" "Upa")))
+   (clobber (match_scratch:<VPRED> 6 "=&Upa"))]
+  "TARGET_SVE"
+  "#"
+  "&& 1"
+  [(set (match_dup 6)
+	(unspec:<VPRED>
+	  [(match_dup 5)
+	   (const_int SVE_MAYBE_NOT_PTRUE)
+	   (match_dup 2)
+	   (match_dup 3)]
+	  SVE_COND_FP_CMP_I0))
+   (set (match_dup 0)
+	(unspec:<VPRED>
+	  [(match_dup 6)
+	   (const_int SVE_MAYBE_NOT_PTRUE)
+	   (match_dup 2)
+	   (match_dup 4)]
+	  SVE_COND_FP_CMP_I0))]
+{
+  operands[6] = gen_reg_rtx (<VPRED>mode);
+}
+)
+
 ;; -------------------------------------------------------------------------
 ;; ---- [FP] Absolute comparisons
 ;; -------------------------------------------------------------------------
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pred-combine-and.c b/gcc/testsuite/gcc.target/aarch64/sve/pred-combine-and.c
new file mode 100644
index 0000000000000000000000000000000000000000..d395b7f84bb15b588493611df5a47549726ac24a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/pred-combine-and.c
@@ -0,0 +1,18 @@ 
+/* { dg-do assemble { target aarch64_asm_sve_ok } } */
+/* { dg-options "-O3 --save-temps" } */
+
+void f5(float * restrict z0, float * restrict z1, float *restrict x, float * restrict y, float c, int n)
+{
+    for (int i = 0; i < n; i++) {
+        float a = x[i];
+        float b = y[i];
+        if (a > b) {
+            z0[i] = a + b;
+            if (a > c) {
+                z1[i] = a - b;
+            }
+        }
+    }
+}
+
+/* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.s, p[0-9]+/z, z[0-9]+\.s, z[0-9]+\.s} 2 } } */