
[2/2,RFC] Add loop masking support for x86

Message ID 73rrp0p-859r-oq2n-pss7-6744807s3qr5@fhfr.qr
State New
Series [1/2] Streamline vect_gen_while

Commit Message

Richard Biener July 15, 2021, 10:30 a.m. UTC
The following extends the existing loop masking support using
SVE WHILE_ULT to x86 by providing an alternate way to produce the
mask using VEC_COND_EXPRs.  So with --param vect-partial-vector-usage
you can now enable masked vectorized epilogues (=1) or fully
masked vector loops (=2).

What's missing is using a scalar IV for the loop control
(in principle AVX512 can use the mask here - the patch just
doesn't seem to work for AVX512 yet for some reason, likely
because expand_vec_cond_expr_p doesn't work there).  What's also
missing is providing more support for predicated operations in the
case of reductions, either via VEC_COND_EXPRs or via implementing
some of the .COND_{ADD,SUB,MUL,...} internal functions as mapping
to masked AVX512 operations.
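
As a per-lane illustration of what the emulated mask computation does
(a scalar model only, not code from the patch, assuming an unsigned
comparison type and the eight lanes of the AVX2 V8SI case below):

/* Models
     _3 = { start, start, ... } + { 0, 1, 2, ... };
     _4 = _3 < { end, end, ... };
     _5 = VEC_COND_EXPR <_4, { -1, -1, ... }, { 0, 0, ... }>;  */
void
emulated_while_ult (unsigned start_index, unsigned end_index, int mask[8])
{
  for (int lane = 0; lane < 8; ++lane)
    mask[lane] = (start_index + lane < end_index) ? -1 : 0;
}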

For AVX2 and

int foo (unsigned *a, unsigned * __restrict b, int n)
{
  unsigned sum = 1;
  for (int i = 0; i < n; ++i)
    b[i] += a[i];
  return sum;
}

we get

.L3:
        vpmaskmovd      (%rsi,%rax), %ymm0, %ymm3
        vpmaskmovd      (%rdi,%rax), %ymm0, %ymm1
        addl    $8, %edx
        vpaddd  %ymm3, %ymm1, %ymm1
        vpmaskmovd      %ymm1, %ymm0, (%rsi,%rax)
        vmovd   %edx, %xmm1
        vpsubd  %ymm15, %ymm2, %ymm0
        addq    $32, %rax
        vpbroadcastd    %xmm1, %ymm1
        vpaddd  %ymm4, %ymm1, %ymm1
        vpsubd  %ymm15, %ymm1, %ymm1
        vpcmpgtd        %ymm1, %ymm0, %ymm0
        vptest  %ymm0, %ymm0
        jne     .L3

for the fully masked loop body and for the masked epilogue
we see

.L4:
        vmovdqu (%rsi,%rax), %ymm3
        vpaddd  (%rdi,%rax), %ymm3, %ymm0
        vmovdqu %ymm0, (%rsi,%rax)
        addq    $32, %rax
        cmpq    %rax, %rcx
        jne     .L4
        movl    %edx, %eax
        andl    $-8, %eax
        testb   $7, %dl
        je      .L11
.L3:
        subl    %eax, %edx
        vmovdqa .LC0(%rip), %ymm1
        salq    $2, %rax
        vmovd   %edx, %xmm0
        movl    $-2147483648, %edx
        addq    %rax, %rsi
        vmovd   %edx, %xmm15
        vpbroadcastd    %xmm0, %ymm0
        vpbroadcastd    %xmm15, %ymm15
        vpsubd  %ymm15, %ymm1, %ymm1
        vpsubd  %ymm15, %ymm0, %ymm0
        vpcmpgtd        %ymm1, %ymm0, %ymm0
        vpmaskmovd      (%rsi), %ymm0, %ymm1
        vpmaskmovd      (%rdi,%rax), %ymm0, %ymm2
        vpaddd  %ymm2, %ymm1, %ymm1
        vpmaskmovd      %ymm1, %ymm0, (%rsi)
.L11:
        vzeroupper

compared to

.L3:
        movl    %edx, %r8d
        subl    %eax, %r8d
        leal    -1(%r8), %r9d
        cmpl    $2, %r9d
        jbe     .L6
        leaq    (%rcx,%rax,4), %r9
        vmovdqu (%rdi,%rax,4), %xmm2
        movl    %r8d, %eax
        andl    $-4, %eax
        vpaddd  (%r9), %xmm2, %xmm0
        addl    %eax, %esi
        andl    $3, %r8d
        vmovdqu %xmm0, (%r9)
        je      .L2
.L6:
        movslq  %esi, %r8
        leaq    0(,%r8,4), %rax
        movl    (%rdi,%r8,4), %r8d
        addl    %r8d, (%rcx,%rax)
        leal    1(%rsi), %r8d
        cmpl    %r8d, %edx
        jle     .L2
        addl    $2, %esi
        movl    4(%rdi,%rax), %r8d
        addl    %r8d, 4(%rcx,%rax)
        cmpl    %esi, %edx
        jle     .L2
        movl    8(%rdi,%rax), %edx
        addl    %edx, 8(%rcx,%rax)
.L2:

I'm giving this a little testing right now but will dig into why
I don't get masked loops when AVX512 is enabled.

Still, comments are appreciated.

Thanks,
Richard.

2021-07-15  Richard Biener  <rguenther@suse.de>

	* tree-vect-loop.c (can_produce_all_loop_masks_p): Also allow
	producing the masks with VEC_COND_EXPRs.
	* tree-vect-stmts.c (vect_gen_while): Generate the mask
	with a VEC_COND_EXPR in case WHILE_ULT is not supported.
---
 gcc/tree-vect-loop.c  |  8 ++++++-
 gcc/tree-vect-stmts.c | 50 ++++++++++++++++++++++++++++++++++---------
 2 files changed, 47 insertions(+), 11 deletions(-)

Comments

Richard Biener July 15, 2021, 10:45 a.m. UTC | #1
On Thu, Jul 15, 2021 at 12:30 PM Richard Biener <rguenther@suse.de> wrote:
>
> The following extends the existing loop masking support using
> SVE WHILE_ULT to x86 by proving an alternate way to produce the
> mask using VEC_COND_EXPRs.  So with --param vect-partial-vector-usage
> you can now enable masked vectorized epilogues (=1) or fully
> masked vector loops (=2).
>
> What's missing is using a scalar IV for the loop control
> (but in principle AVX512 can use the mask here - just the patch
> doesn't seem to work for AVX512 yet for some reason - likely
> expand_vec_cond_expr_p doesn't work there).  What's also missing
> is providing more support for predicated operations in the case
> of reductions either via VEC_COND_EXPRs or via implementing
> some of the .COND_{ADD,SUB,MUL...} internal functions as mapping
> to masked AVX512 operations.
>
> For AVX2 and
>
> int foo (unsigned *a, unsigned * __restrict b, int n)
> {
>   unsigned sum = 1;
>   for (int i = 0; i < n; ++i)
>     b[i] += a[i];
>   return sum;
> }
>
> we get
>
> .L3:
>         vpmaskmovd      (%rsi,%rax), %ymm0, %ymm3
>         vpmaskmovd      (%rdi,%rax), %ymm0, %ymm1
>         addl    $8, %edx
>         vpaddd  %ymm3, %ymm1, %ymm1
>         vpmaskmovd      %ymm1, %ymm0, (%rsi,%rax)
>         vmovd   %edx, %xmm1
>         vpsubd  %ymm15, %ymm2, %ymm0
>         addq    $32, %rax
>         vpbroadcastd    %xmm1, %ymm1
>         vpaddd  %ymm4, %ymm1, %ymm1
>         vpsubd  %ymm15, %ymm1, %ymm1
>         vpcmpgtd        %ymm1, %ymm0, %ymm0
>         vptest  %ymm0, %ymm0
>         jne     .L3
>
> for the fully masked loop body and for the masked epilogue
> we see
>
> .L4:
>         vmovdqu (%rsi,%rax), %ymm3
>         vpaddd  (%rdi,%rax), %ymm3, %ymm0
>         vmovdqu %ymm0, (%rsi,%rax)
>         addq    $32, %rax
>         cmpq    %rax, %rcx
>         jne     .L4
>         movl    %edx, %eax
>         andl    $-8, %eax
>         testb   $7, %dl
>         je      .L11
> .L3:
>         subl    %eax, %edx
>         vmovdqa .LC0(%rip), %ymm1
>         salq    $2, %rax
>         vmovd   %edx, %xmm0
>         movl    $-2147483648, %edx
>         addq    %rax, %rsi
>         vmovd   %edx, %xmm15
>         vpbroadcastd    %xmm0, %ymm0
>         vpbroadcastd    %xmm15, %ymm15
>         vpsubd  %ymm15, %ymm1, %ymm1
>         vpsubd  %ymm15, %ymm0, %ymm0
>         vpcmpgtd        %ymm1, %ymm0, %ymm0
>         vpmaskmovd      (%rsi), %ymm0, %ymm1
>         vpmaskmovd      (%rdi,%rax), %ymm0, %ymm2
>         vpaddd  %ymm2, %ymm1, %ymm1
>         vpmaskmovd      %ymm1, %ymm0, (%rsi)
> .L11:
>         vzeroupper
>
> compared to
>
> .L3:
>         movl    %edx, %r8d
>         subl    %eax, %r8d
>         leal    -1(%r8), %r9d
>         cmpl    $2, %r9d
>         jbe     .L6
>         leaq    (%rcx,%rax,4), %r9
>         vmovdqu (%rdi,%rax,4), %xmm2
>         movl    %r8d, %eax
>         andl    $-4, %eax
>         vpaddd  (%r9), %xmm2, %xmm0
>         addl    %eax, %esi
>         andl    $3, %r8d
>         vmovdqu %xmm0, (%r9)
>         je      .L2
> .L6:
>         movslq  %esi, %r8
>         leaq    0(,%r8,4), %rax
>         movl    (%rdi,%r8,4), %r8d
>         addl    %r8d, (%rcx,%rax)
>         leal    1(%rsi), %r8d
>         cmpl    %r8d, %edx
>         jle     .L2
>         addl    $2, %esi
>         movl    4(%rdi,%rax), %r8d
>         addl    %r8d, 4(%rcx,%rax)
>         cmpl    %esi, %edx
>         jle     .L2
>         movl    8(%rdi,%rax), %edx
>         addl    %edx, 8(%rcx,%rax)
> .L2:
>
> I'm giving this a little testing right now but will dig on why
> I don't get masked loops when AVX512 is enabled.

Ah, a simple thinko - rgroup_controls vectypes seem to be
always VECTOR_BOOLEAN_TYPE_P and thus we can
use expand_vec_cmp_expr_p.  The AVX512 fully masked
loop then looks like

.L3:
        vmovdqu32       (%rsi,%rax,4), %ymm2{%k1}
        vmovdqu32       (%rdi,%rax,4), %ymm1{%k1}
        vpaddd  %ymm2, %ymm1, %ymm0
        vmovdqu32       %ymm0, (%rsi,%rax,4){%k1}
        addq    $8, %rax
        vpbroadcastd    %eax, %ymm0
        vpaddd  %ymm4, %ymm0, %ymm0
        vpcmpud $6, %ymm0, %ymm3, %k1
        kortestb        %k1, %k1
        jne     .L3

I guess for x86 it's not worth preserving the VEC_COND_EXPR
mask generation but other archs may not provide all required vec_cmp
expanders.
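
For reference, the vpbroadcastd/vpaddd/vpcmpud mask update above roughly
corresponds to the following intrinsics - just an illustrative translation
of the generated code, not something taken from the patch; iota holds
{ 0, 1, ..., 7 } and niters the broadcast trip count:

#include <immintrin.h>

/* Requires AVX512F + AVX512VL for the compare-into-mask.  */
static __mmask8
next_mask (unsigned iv, __m256i iota, __m256i niters)
{
  /* lanes = { iv, iv, ... } + { 0, 1, ..., 7 };  k = lanes < niters.  */
  __m256i lanes = _mm256_add_epi32 (_mm256_set1_epi32 (iv), iota);
  return _mm256_cmplt_epu32_mask (lanes, niters);
}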

Richard.

> Still comments are appreciated.
>
> Thanks,
> Richard.
>
> 2021-07-15  Richard Biener  <rguenther@suse.de>
>
>         * tree-vect-stmts.c (can_produce_all_loop_masks_p): We
>         also can produce masks with VEC_COND_EXPRs.
>         * tree-vect-loop.c (vect_gen_while): Generate the mask
>         with a VEC_COND_EXPR in case WHILE_ULT is not supported.
> ---
>  gcc/tree-vect-loop.c  |  8 ++++++-
>  gcc/tree-vect-stmts.c | 50 ++++++++++++++++++++++++++++++++++---------
>  2 files changed, 47 insertions(+), 11 deletions(-)
>
> diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
> index fc3dab0d143..2214ed11dfb 100644
> --- a/gcc/tree-vect-loop.c
> +++ b/gcc/tree-vect-loop.c
> @@ -975,11 +975,17 @@ can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
>  {
>    rgroup_controls *rgm;
>    unsigned int i;
> +  tree cmp_vectype;
>    FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
>      if (rgm->type != NULL_TREE
>         && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
>                                             cmp_type, rgm->type,
> -                                           OPTIMIZE_FOR_SPEED))
> +                                           OPTIMIZE_FOR_SPEED)
> +       && ((cmp_vectype
> +              = truth_type_for (build_vector_type
> +                                (cmp_type, TYPE_VECTOR_SUBPARTS (rgm->type)))),
> +           true)
> +       && !expand_vec_cond_expr_p (rgm->type, cmp_vectype, LT_EXPR))
>        return false;
>    return true;
>  }
> diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
> index 6a25d661800..216986399b1 100644
> --- a/gcc/tree-vect-stmts.c
> +++ b/gcc/tree-vect-stmts.c
> @@ -12007,16 +12007,46 @@ vect_gen_while (gimple_seq *seq, tree mask_type, tree start_index,
>                 tree end_index, const char *name)
>  {
>    tree cmp_type = TREE_TYPE (start_index);
> -  gcc_checking_assert (direct_internal_fn_supported_p (IFN_WHILE_ULT,
> -                                                      cmp_type, mask_type,
> -                                                      OPTIMIZE_FOR_SPEED));
> -  gcall *call = gimple_build_call_internal (IFN_WHILE_ULT, 3,
> -                                           start_index, end_index,
> -                                           build_zero_cst (mask_type));
> -  tree tmp = make_temp_ssa_name (mask_type, NULL, name);
> -  gimple_call_set_lhs (call, tmp);
> -  gimple_seq_add_stmt (seq, call);
> -  return tmp;
> +  if (direct_internal_fn_supported_p (IFN_WHILE_ULT,
> +                                     cmp_type, mask_type,
> +                                     OPTIMIZE_FOR_SPEED))
> +    {
> +      gcall *call = gimple_build_call_internal (IFN_WHILE_ULT, 3,
> +                                               start_index, end_index,
> +                                               build_zero_cst (mask_type));
> +      tree tmp = make_temp_ssa_name (mask_type, NULL, name);
> +      gimple_call_set_lhs (call, tmp);
> +      gimple_seq_add_stmt (seq, call);
> +      return tmp;
> +    }
> +  else
> +    {
> +      /* Generate
> +          _1 = { start_index, start_index, ... };
> +          _2 = { end_index, end_index, ... };
> +          _3 = _1 + { 0, 1, 2 ... };
> +          _4 = _3 < _2;
> +          _5 = VEC_COND_EXPR <_4, { -1, -1, ... } : { 0, 0, ... }>;   */
> +      tree cvectype = build_vector_type (cmp_type,
> +                                        TYPE_VECTOR_SUBPARTS (mask_type));
> +      tree si = make_ssa_name (cvectype);
> +      gassign *ass = gimple_build_assign
> +                       (si, build_vector_from_val (cvectype, start_index));
> +      gimple_seq_add_stmt (seq, ass);
> +      tree ei = make_ssa_name (cvectype);
> +      ass = gimple_build_assign (ei,
> +                                build_vector_from_val (cvectype, end_index));
> +      gimple_seq_add_stmt (seq, ass);
> +      tree incr = build_vec_series (cvectype, build_zero_cst (cmp_type),
> +                                   build_one_cst (cmp_type));
> +      si = gimple_build (seq, PLUS_EXPR, cvectype, si, incr);
> +      tree cmp = gimple_build (seq, LT_EXPR, truth_type_for (cvectype),
> +                              si, ei);
> +      tree mask = gimple_build (seq, VEC_COND_EXPR, mask_type, cmp,
> +                               build_all_ones_cst (mask_type),
> +                               build_zero_cst (mask_type));
> +      return mask;
> +    }
>  }
>
>  /* Generate a vector mask of type MASK_TYPE for which index I is false iff
> --
> 2.26.2
Hongtao Liu July 15, 2021, 11:20 a.m. UTC | #2
On Thu, Jul 15, 2021 at 6:45 PM Richard Biener via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> On Thu, Jul 15, 2021 at 12:30 PM Richard Biener <rguenther@suse.de> wrote:
> >
> > The following extends the existing loop masking support using
> > SVE WHILE_ULT to x86 by proving an alternate way to produce the
> > mask using VEC_COND_EXPRs.  So with --param vect-partial-vector-usage
> > you can now enable masked vectorized epilogues (=1) or fully
> > masked vector loops (=2).
> >
> > What's missing is using a scalar IV for the loop control
> > (but in principle AVX512 can use the mask here - just the patch
> > doesn't seem to work for AVX512 yet for some reason - likely
> > expand_vec_cond_expr_p doesn't work there).  What's also missing
> > is providing more support for predicated operations in the case
> > of reductions either via VEC_COND_EXPRs or via implementing
> > some of the .COND_{ADD,SUB,MUL...} internal functions as mapping
> > to masked AVX512 operations.
> >
> > For AVX2 and
> >
> > int foo (unsigned *a, unsigned * __restrict b, int n)
> > {
> >   unsigned sum = 1;
> >   for (int i = 0; i < n; ++i)
> >     b[i] += a[i];
> >   return sum;
> > }
> >
> > we get
> >
> > .L3:
> >         vpmaskmovd      (%rsi,%rax), %ymm0, %ymm3
> >         vpmaskmovd      (%rdi,%rax), %ymm0, %ymm1
> >         addl    $8, %edx
> >         vpaddd  %ymm3, %ymm1, %ymm1
> >         vpmaskmovd      %ymm1, %ymm0, (%rsi,%rax)
> >         vmovd   %edx, %xmm1
> >         vpsubd  %ymm15, %ymm2, %ymm0
> >         addq    $32, %rax
> >         vpbroadcastd    %xmm1, %ymm1
> >         vpaddd  %ymm4, %ymm1, %ymm1
> >         vpsubd  %ymm15, %ymm1, %ymm1
> >         vpcmpgtd        %ymm1, %ymm0, %ymm0
> >         vptest  %ymm0, %ymm0
> >         jne     .L3
> >
> > for the fully masked loop body and for the masked epilogue
> > we see
> >
> > .L4:
> >         vmovdqu (%rsi,%rax), %ymm3
> >         vpaddd  (%rdi,%rax), %ymm3, %ymm0
> >         vmovdqu %ymm0, (%rsi,%rax)
> >         addq    $32, %rax
> >         cmpq    %rax, %rcx
> >         jne     .L4
> >         movl    %edx, %eax
> >         andl    $-8, %eax
> >         testb   $7, %dl
> >         je      .L11
> > .L3:
> >         subl    %eax, %edx
> >         vmovdqa .LC0(%rip), %ymm1
> >         salq    $2, %rax
> >         vmovd   %edx, %xmm0
> >         movl    $-2147483648, %edx
> >         addq    %rax, %rsi
> >         vmovd   %edx, %xmm15
> >         vpbroadcastd    %xmm0, %ymm0
> >         vpbroadcastd    %xmm15, %ymm15
> >         vpsubd  %ymm15, %ymm1, %ymm1
> >         vpsubd  %ymm15, %ymm0, %ymm0
> >         vpcmpgtd        %ymm1, %ymm0, %ymm0
> >         vpmaskmovd      (%rsi), %ymm0, %ymm1
> >         vpmaskmovd      (%rdi,%rax), %ymm0, %ymm2
> >         vpaddd  %ymm2, %ymm1, %ymm1
> >         vpmaskmovd      %ymm1, %ymm0, (%rsi)
> > .L11:
> >         vzeroupper
> >
> > compared to
> >
> > .L3:
> >         movl    %edx, %r8d
> >         subl    %eax, %r8d
> >         leal    -1(%r8), %r9d
> >         cmpl    $2, %r9d
> >         jbe     .L6
> >         leaq    (%rcx,%rax,4), %r9
> >         vmovdqu (%rdi,%rax,4), %xmm2
> >         movl    %r8d, %eax
> >         andl    $-4, %eax
> >         vpaddd  (%r9), %xmm2, %xmm0
> >         addl    %eax, %esi
> >         andl    $3, %r8d
> >         vmovdqu %xmm0, (%r9)
> >         je      .L2
> > .L6:
> >         movslq  %esi, %r8
> >         leaq    0(,%r8,4), %rax
> >         movl    (%rdi,%r8,4), %r8d
> >         addl    %r8d, (%rcx,%rax)
> >         leal    1(%rsi), %r8d
> >         cmpl    %r8d, %edx
> >         jle     .L2
> >         addl    $2, %esi
> >         movl    4(%rdi,%rax), %r8d
> >         addl    %r8d, 4(%rcx,%rax)
> >         cmpl    %esi, %edx
> >         jle     .L2
> >         movl    8(%rdi,%rax), %edx
> >         addl    %edx, 8(%rcx,%rax)
> > .L2:
> >
> > I'm giving this a little testing right now but will dig on why
> > I don't get masked loops when AVX512 is enabled.
>
> Ah, a simple thinko - rgroup_controls vectypes seem to be
> always VECTOR_BOOLEAN_TYPE_P and thus we can
> use expand_vec_cmp_expr_p.  The AVX512 fully masked
> loop then looks like
>
> .L3:
>         vmovdqu32       (%rsi,%rax,4), %ymm2{%k1}
>         vmovdqu32       (%rdi,%rax,4), %ymm1{%k1}
>         vpaddd  %ymm2, %ymm1, %ymm0
>         vmovdqu32       %ymm0, (%rsi,%rax,4){%k1}
>         addq    $8, %rax
>         vpbroadcastd    %eax, %ymm0
>         vpaddd  %ymm4, %ymm0, %ymm0
>         vpcmpud $6, %ymm0, %ymm3, %k1
>         kortestb        %k1, %k1
>         jne     .L3
>
> I guess for x86 it's not worth preserving the VEC_COND_EXPR
> mask generation but other archs may not provide all required vec_cmp
> expanders.

For the main loop, the full-masked loop's codegen seems much worse.
Basically, we need at least 4 instructions to do what while_ult on arm does:

         vpbroadcastd    %eax, %ymm0
         vpaddd  %ymm4, %ymm0, %ymm0
         vpcmpud $6, %ymm0, %ymm3, %k1
         kortestb        %k1, %k1
vs
       whilelo (or some other while<op>)

More instructions are needed for avx2 since there's no direct
instruction for .COND_{ADD,SUB..}; one possible emulation is sketched
after the code comparisons below.

original
.L4:
        vmovdqu (%rcx,%rax), %ymm1
        vpaddd (%rdi,%rax), %ymm1, %ymm0
        vmovdqu %ymm0, (%rcx,%rax)
        addq $32, %rax
        cmpq %rax, %rsi
        jne .L4

vs
avx512 full-masked loop
.L3:
         vmovdqu32       (%rsi,%rax,4), %ymm2{%k1}
         vmovdqu32       (%rdi,%rax,4), %ymm1{%k1}
         vpaddd  %ymm2, %ymm1, %ymm0
         vmovdqu32       %ymm0, (%rsi,%rax,4){%k1}
         addq    $8, %rax
         vpbroadcastd    %eax, %ymm0
         vpaddd  %ymm4, %ymm0, %ymm0
         vpcmpud $6, %ymm0, %ymm3, %k1
         kortestb        %k1, %k1
         jne     .L3

vs
avx2 full-masked loop
.L3:
         vpmaskmovd      (%rsi,%rax), %ymm0, %ymm3
         vpmaskmovd      (%rdi,%rax), %ymm0, %ymm1
         addl    $8, %edx
         vpaddd  %ymm3, %ymm1, %ymm1
         vpmaskmovd      %ymm1, %ymm0, (%rsi,%rax)
         vmovd   %edx, %xmm1
         vpsubd  %ymm15, %ymm2, %ymm0
         addq    $32, %rax
         vpbroadcastd    %xmm1, %ymm1
         vpaddd  %ymm4, %ymm1, %ymm1
         vpsubd  %ymm15, %ymm1, %ymm1
         vpcmpgtd        %ymm1, %ymm0, %ymm0
        vptest  %ymm0, %ymm0
         jne     .L3

vs  arm64's code

.L3:
    ld1w z1.s, p0/z, [x1, x3, lsl 2]
    ld1w z0.s, p0/z, [x0, x3, lsl 2]
    add z0.s, z0.s, z1.s
    st1w z0.s, p0, [x1, x3, lsl 2]
    add x3, x3, x4
    whilelo p0.s, w3, w2
    b.any .L3
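
As mentioned above, one way .COND_ADD could be open-coded on AVX2
(illustration only - cond_add_epi32 is a made-up helper, not an existing
intrinsic and not something the vectorizer emits today; it assumes an
all-ones/all-zeros lane mask like the one vpcmpgtd produces):

#include <immintrin.h>

/* mask ? a + b : a  -- the reduction-style conditional add, emulated
   with an unconditional add followed by a blend on the mask.  */
static __m256i
cond_add_epi32 (__m256i mask, __m256i a, __m256i b)
{
  __m256i sum = _mm256_add_epi32 (a, b);
  return _mm256_blendv_epi8 (a, sum, mask);
}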

> Richard.
>
> > Still comments are appreciated.
> >
> > Thanks,
> > Richard.
> >
> > 2021-07-15  Richard Biener  <rguenther@suse.de>
> >
> >         * tree-vect-stmts.c (can_produce_all_loop_masks_p): We
> >         also can produce masks with VEC_COND_EXPRs.
> >         * tree-vect-loop.c (vect_gen_while): Generate the mask
> >         with a VEC_COND_EXPR in case WHILE_ULT is not supported.
> > ---
> >  gcc/tree-vect-loop.c  |  8 ++++++-
> >  gcc/tree-vect-stmts.c | 50 ++++++++++++++++++++++++++++++++++---------
> >  2 files changed, 47 insertions(+), 11 deletions(-)
> >
> > diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
> > index fc3dab0d143..2214ed11dfb 100644
> > --- a/gcc/tree-vect-loop.c
> > +++ b/gcc/tree-vect-loop.c
> > @@ -975,11 +975,17 @@ can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
> >  {
> >    rgroup_controls *rgm;
> >    unsigned int i;
> > +  tree cmp_vectype;
> >    FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
> >      if (rgm->type != NULL_TREE
> >         && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
> >                                             cmp_type, rgm->type,
> > -                                           OPTIMIZE_FOR_SPEED))
> > +                                           OPTIMIZE_FOR_SPEED)
> > +       && ((cmp_vectype
> > +              = truth_type_for (build_vector_type
> > +                                (cmp_type, TYPE_VECTOR_SUBPARTS (rgm->type)))),
> > +           true)
> > +       && !expand_vec_cond_expr_p (rgm->type, cmp_vectype, LT_EXPR))
> >        return false;
> >    return true;
> >  }
> > diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
> > index 6a25d661800..216986399b1 100644
> > --- a/gcc/tree-vect-stmts.c
> > +++ b/gcc/tree-vect-stmts.c
> > @@ -12007,16 +12007,46 @@ vect_gen_while (gimple_seq *seq, tree mask_type, tree start_index,
> >                 tree end_index, const char *name)
> >  {
> >    tree cmp_type = TREE_TYPE (start_index);
> > -  gcc_checking_assert (direct_internal_fn_supported_p (IFN_WHILE_ULT,
> > -                                                      cmp_type, mask_type,
> > -                                                      OPTIMIZE_FOR_SPEED));
> > -  gcall *call = gimple_build_call_internal (IFN_WHILE_ULT, 3,
> > -                                           start_index, end_index,
> > -                                           build_zero_cst (mask_type));
> > -  tree tmp = make_temp_ssa_name (mask_type, NULL, name);
> > -  gimple_call_set_lhs (call, tmp);
> > -  gimple_seq_add_stmt (seq, call);
> > -  return tmp;
> > +  if (direct_internal_fn_supported_p (IFN_WHILE_ULT,
> > +                                     cmp_type, mask_type,
> > +                                     OPTIMIZE_FOR_SPEED))
> > +    {
> > +      gcall *call = gimple_build_call_internal (IFN_WHILE_ULT, 3,
> > +                                               start_index, end_index,
> > +                                               build_zero_cst (mask_type));
> > +      tree tmp = make_temp_ssa_name (mask_type, NULL, name);
> > +      gimple_call_set_lhs (call, tmp);
> > +      gimple_seq_add_stmt (seq, call);
> > +      return tmp;
> > +    }
> > +  else
> > +    {
> > +      /* Generate
> > +          _1 = { start_index, start_index, ... };
> > +          _2 = { end_index, end_index, ... };
> > +          _3 = _1 + { 0, 1, 2 ... };
> > +          _4 = _3 < _2;
> > +          _5 = VEC_COND_EXPR <_4, { -1, -1, ... } : { 0, 0, ... }>;   */
> > +      tree cvectype = build_vector_type (cmp_type,
> > +                                        TYPE_VECTOR_SUBPARTS (mask_type));
> > +      tree si = make_ssa_name (cvectype);
> > +      gassign *ass = gimple_build_assign
> > +                       (si, build_vector_from_val (cvectype, start_index));
> > +      gimple_seq_add_stmt (seq, ass);
> > +      tree ei = make_ssa_name (cvectype);
> > +      ass = gimple_build_assign (ei,
> > +                                build_vector_from_val (cvectype, end_index));
> > +      gimple_seq_add_stmt (seq, ass);
> > +      tree incr = build_vec_series (cvectype, build_zero_cst (cmp_type),
> > +                                   build_one_cst (cmp_type));
> > +      si = gimple_build (seq, PLUS_EXPR, cvectype, si, incr);
> > +      tree cmp = gimple_build (seq, LT_EXPR, truth_type_for (cvectype),
> > +                              si, ei);
> > +      tree mask = gimple_build (seq, VEC_COND_EXPR, mask_type, cmp,
> > +                               build_all_ones_cst (mask_type),
> > +                               build_zero_cst (mask_type));
> > +      return mask;
> > +    }
> >  }
> >
> >  /* Generate a vector mask of type MASK_TYPE for which index I is false iff
> > --
> > 2.26.2
Richard Biener July 15, 2021, 11:48 a.m. UTC | #3
On Thu, 15 Jul 2021, Hongtao Liu wrote:

> On Thu, Jul 15, 2021 at 6:45 PM Richard Biener via Gcc-patches
> <gcc-patches@gcc.gnu.org> wrote:
> >
> > On Thu, Jul 15, 2021 at 12:30 PM Richard Biener <rguenther@suse.de> wrote:
> > >
> > > The following extends the existing loop masking support using
> > > SVE WHILE_ULT to x86 by proving an alternate way to produce the
> > > mask using VEC_COND_EXPRs.  So with --param vect-partial-vector-usage
> > > you can now enable masked vectorized epilogues (=1) or fully
> > > masked vector loops (=2).
> > >
> > > What's missing is using a scalar IV for the loop control
> > > (but in principle AVX512 can use the mask here - just the patch
> > > doesn't seem to work for AVX512 yet for some reason - likely
> > > expand_vec_cond_expr_p doesn't work there).  What's also missing
> > > is providing more support for predicated operations in the case
> > > of reductions either via VEC_COND_EXPRs or via implementing
> > > some of the .COND_{ADD,SUB,MUL...} internal functions as mapping
> > > to masked AVX512 operations.
> > >
> > > For AVX2 and
> > >
> > > int foo (unsigned *a, unsigned * __restrict b, int n)
> > > {
> > >   unsigned sum = 1;
> > >   for (int i = 0; i < n; ++i)
> > >     b[i] += a[i];
> > >   return sum;
> > > }
> > >
> > > we get
> > >
> > > .L3:
> > >         vpmaskmovd      (%rsi,%rax), %ymm0, %ymm3
> > >         vpmaskmovd      (%rdi,%rax), %ymm0, %ymm1
> > >         addl    $8, %edx
> > >         vpaddd  %ymm3, %ymm1, %ymm1
> > >         vpmaskmovd      %ymm1, %ymm0, (%rsi,%rax)
> > >         vmovd   %edx, %xmm1
> > >         vpsubd  %ymm15, %ymm2, %ymm0
> > >         addq    $32, %rax
> > >         vpbroadcastd    %xmm1, %ymm1
> > >         vpaddd  %ymm4, %ymm1, %ymm1
> > >         vpsubd  %ymm15, %ymm1, %ymm1
> > >         vpcmpgtd        %ymm1, %ymm0, %ymm0
> > >         vptest  %ymm0, %ymm0
> > >         jne     .L3
> > >
> > > for the fully masked loop body and for the masked epilogue
> > > we see
> > >
> > > .L4:
> > >         vmovdqu (%rsi,%rax), %ymm3
> > >         vpaddd  (%rdi,%rax), %ymm3, %ymm0
> > >         vmovdqu %ymm0, (%rsi,%rax)
> > >         addq    $32, %rax
> > >         cmpq    %rax, %rcx
> > >         jne     .L4
> > >         movl    %edx, %eax
> > >         andl    $-8, %eax
> > >         testb   $7, %dl
> > >         je      .L11
> > > .L3:
> > >         subl    %eax, %edx
> > >         vmovdqa .LC0(%rip), %ymm1
> > >         salq    $2, %rax
> > >         vmovd   %edx, %xmm0
> > >         movl    $-2147483648, %edx
> > >         addq    %rax, %rsi
> > >         vmovd   %edx, %xmm15
> > >         vpbroadcastd    %xmm0, %ymm0
> > >         vpbroadcastd    %xmm15, %ymm15
> > >         vpsubd  %ymm15, %ymm1, %ymm1
> > >         vpsubd  %ymm15, %ymm0, %ymm0
> > >         vpcmpgtd        %ymm1, %ymm0, %ymm0
> > >         vpmaskmovd      (%rsi), %ymm0, %ymm1
> > >         vpmaskmovd      (%rdi,%rax), %ymm0, %ymm2
> > >         vpaddd  %ymm2, %ymm1, %ymm1
> > >         vpmaskmovd      %ymm1, %ymm0, (%rsi)
> > > .L11:
> > >         vzeroupper
> > >
> > > compared to
> > >
> > > .L3:
> > >         movl    %edx, %r8d
> > >         subl    %eax, %r8d
> > >         leal    -1(%r8), %r9d
> > >         cmpl    $2, %r9d
> > >         jbe     .L6
> > >         leaq    (%rcx,%rax,4), %r9
> > >         vmovdqu (%rdi,%rax,4), %xmm2
> > >         movl    %r8d, %eax
> > >         andl    $-4, %eax
> > >         vpaddd  (%r9), %xmm2, %xmm0
> > >         addl    %eax, %esi
> > >         andl    $3, %r8d
> > >         vmovdqu %xmm0, (%r9)
> > >         je      .L2
> > > .L6:
> > >         movslq  %esi, %r8
> > >         leaq    0(,%r8,4), %rax
> > >         movl    (%rdi,%r8,4), %r8d
> > >         addl    %r8d, (%rcx,%rax)
> > >         leal    1(%rsi), %r8d
> > >         cmpl    %r8d, %edx
> > >         jle     .L2
> > >         addl    $2, %esi
> > >         movl    4(%rdi,%rax), %r8d
> > >         addl    %r8d, 4(%rcx,%rax)
> > >         cmpl    %esi, %edx
> > >         jle     .L2
> > >         movl    8(%rdi,%rax), %edx
> > >         addl    %edx, 8(%rcx,%rax)
> > > .L2:
> > >
> > > I'm giving this a little testing right now but will dig on why
> > > I don't get masked loops when AVX512 is enabled.
> >
> > Ah, a simple thinko - rgroup_controls vectypes seem to be
> > always VECTOR_BOOLEAN_TYPE_P and thus we can
> > use expand_vec_cmp_expr_p.  The AVX512 fully masked
> > loop then looks like
> >
> > .L3:
> >         vmovdqu32       (%rsi,%rax,4), %ymm2{%k1}
> >         vmovdqu32       (%rdi,%rax,4), %ymm1{%k1}
> >         vpaddd  %ymm2, %ymm1, %ymm0
> >         vmovdqu32       %ymm0, (%rsi,%rax,4){%k1}
> >         addq    $8, %rax
> >         vpbroadcastd    %eax, %ymm0
> >         vpaddd  %ymm4, %ymm0, %ymm0
> >         vpcmpud $6, %ymm0, %ymm3, %k1
> >         kortestb        %k1, %k1
> >         jne     .L3
> >
> > I guess for x86 it's not worth preserving the VEC_COND_EXPR
> > mask generation but other archs may not provide all required vec_cmp
> > expanders.
> 
> For the main loop, the full-masked loop's codegen seems much worse.
> Basically, we need at least 4 instructions to do what while_ult in arm does.
> 
>          vpbroadcastd    %eax, %ymm0
>          vpaddd  %ymm4, %ymm0, %ymm0
>          vpcmpud $6, %ymm0, %ymm3, %k1
>          kortestb        %k1, %k1
> vs
>        whilelo(or some other while<op>)
> 
> more instructions are needed for avx2 since there's no direct
> instruction for .COND_{ADD,SUB..}
> 
> original
> .L4:
>         vmovdqu (%rcx,%rax), %ymm1
>         vpaddd (%rdi,%rax), %ymm1, %ymm0
>         vmovdqu %ymm0, (%rcx,%rax)
>         addq $32, %rax
>         cmpq %rax, %rsi
>         jne .L4
> 
> vs
> avx512 full-masked loop
> .L3:
>          vmovdqu32       (%rsi,%rax,4), %ymm2{%k1}
>          vmovdqu32       (%rdi,%rax,4), %ymm1{%k1}
>          vpaddd  %ymm2, %ymm1, %ymm0
>          vmovdqu32       %ymm0, (%rsi,%rax,4){%k1}
>          addq    $8, %rax
>          vpbroadcastd    %eax, %ymm0
>          vpaddd  %ymm4, %ymm0, %ymm0
>          vpcmpud $6, %ymm0, %ymm3, %k1
>          kortestb        %k1, %k1
>          jne     .L3
> 
> vs
> avx2 full-masked loop
> .L3:
>          vpmaskmovd      (%rsi,%rax), %ymm0, %ymm3
>          vpmaskmovd      (%rdi,%rax), %ymm0, %ymm1
>          addl    $8, %edx
>          vpaddd  %ymm3, %ymm1, %ymm1
>          vpmaskmovd      %ymm1, %ymm0, (%rsi,%rax)
>          vmovd   %edx, %xmm1
>          vpsubd  %ymm15, %ymm2, %ymm0
>          addq    $32, %rax
>          vpbroadcastd    %xmm1, %ymm1
>          vpaddd  %ymm4, %ymm1, %ymm1
>          vpsubd  %ymm15, %ymm1, %ymm1
>          vpcmpgtd        %ymm1, %ymm0, %ymm0
>         vptest  %ymm0, %ymm0
>          jne     .L3
> 
> vs  arm64's code
> 
> .L3:
>     ld1w z1.s, p0/z, [x1, x3, lsl 2]
>     ld1w z0.s, p0/z, [x0, x3, lsl 2]
>     add z0.s, z0.s, z1.s
>     st1w z0.s, p0, [x1, x3, lsl 2]
>     add x3, x3, x4
>     whilelo p0.s, w3, w2
>     b.any .L3

Yes, that's true - it might still be OK for vectorizing epilogues
and thus --param vect-partial-vector-usage=1

Can AVX512 do any better than this?

        vpbroadcastd    %eax, %ymm0
        vpaddd  %ymm4, %ymm0, %ymm0
        vpcmpud $6, %ymm0, %ymm3, %k1

Note with multiple types involved things get even worse
since you need masks for each vector mode.  But as far as
I can see that's the same for SVE (but there we have the
single-instruction whilelo).  I guess we'll also generate
wrong code at the moment for the case where we need
multiple vectors to hold the full mask - vect_gen_while
doesn't seem to be prepared for this?

So with

int foo (unsigned long *a, unsigned * __restrict b, int n)
{
  unsigned sum = 1;
  for (int i = 0; i < n; ++i)
    {
      b[i] += a[i];
    }
  return sum;
}

SVE uses

.L3:
        ld1d    z0.d, p0/z, [x1, x3, lsl 3]
        ld1d    z1.d, p0/z, [x0, x3, lsl 3]
        adr     z0.d, [z0.d, z1.d, lsl 2]
        st1d    z0.d, p0, [x1, x3, lsl 3]
        add     x3, x3, x4
        whilelo p0.d, w3, w2
        b.any   .L3

so p0 vs. p0/z, whatever that means and it looks like
the vector add can somehow concatenate z0.d and z1.d.
Truly fascinating ;)

It looks like --param vect-partial-vector-usage defaults to 2;
the Power backend forces it to 1 (power10) or 0 otherwise.

I think we'd need a target hook that toggles this per mode
so we could tune this depending on AVX512 vectorization vs. AVX2.
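
A very rough sketch of such a hook, purely for illustration - the hook
name, signature and implementation below are made up and nothing like
this exists yet:

/* Hypothetical target hook: return the preferred
   --param vect-partial-vector-usage setting for loops using MODE.  */
static unsigned
ix86_preferred_partial_vector_usage (machine_mode mode ATTRIBUTE_UNUSED)
{
  /* With AVX512 mask registers fully masked loops are plausible,
     with plain AVX2 restrict partial vectors to the epilogue.  */
  if (TARGET_AVX512F && TARGET_AVX512VL)
    return 2;
  return TARGET_AVX2 ? 1 : 0;
}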

The reason I even started looking at this is that we now have
so many vector modes and end up with quite big code for
vectorized epilogues.  And I do remember Intel folks contributing
patches to do fully masked AVX512 loops as well.

Bootstrap/testing on x86_64-unknown-linux-gnu (with a slightly
altered patch) reveals no fails besides some assembler scans.

For reference the tested patch is below.

Thanks,
Richard.

commit 221110851fafe17d5a351f1b2da3fc3a40e3b61a
Author: Richard Biener <rguenther@suse.de>
Date:   Thu Jul 15 12:15:18 2021 +0200

    Add loop masking support for x86
    
    The following extends the existing loop masking support using
    SVE WHILE_ULT to x86 by providing an alternate way to produce the
    mask using VEC_COND_EXPRs.  So with --param vect-partial-vector-usage
    you can now enable masked vectorized epilogues (=1) or fully
    masked vector loops (=2).
    
    What's missing is using a scalar IV for the loop control in
    case that's profitable - the mask generation can then move
    from preheader + latch to the header.  But AVX2 and AVX512
    can use vptest and kortestb just fine.
    
    What's also missing is providing more support for predicated
    operations in the case of reductions either via VEC_COND_EXPRs
    or via implementing some of the .COND_{ADD,SUB,MUL...} internal
    functions as mapping to masked AVX512 operations.
    
    For AVX2 and
    
    int foo (unsigned *a, unsigned * __restrict b, int n)
    {
      unsigned sum = 1;
      for (int i = 0; i < n; ++i)
        b[i] += a[i];
      return sum;
    }
    
    we get
    
    .L3:
            vpmaskmovd      (%rsi,%rax), %ymm0, %ymm3
            vpmaskmovd      (%rdi,%rax), %ymm0, %ymm1
            addl    $8, %edx
            vpaddd  %ymm3, %ymm1, %ymm1
            vpmaskmovd      %ymm1, %ymm0, (%rsi,%rax)
            vmovd   %edx, %xmm1
            vpsubd  %ymm15, %ymm2, %ymm0
            addq    $32, %rax
            vpbroadcastd    %xmm1, %ymm1
            vpaddd  %ymm4, %ymm1, %ymm1
            vpsubd  %ymm15, %ymm1, %ymm1
            vpcmpgtd        %ymm1, %ymm0, %ymm0
            vptest  %ymm0, %ymm0
            jne     .L3
    
    for the fully masked loop body and for the masked epilogue
    we see
    
    .L4:
            vmovdqu (%rsi,%rax), %ymm3
            vpaddd  (%rdi,%rax), %ymm3, %ymm0
            vmovdqu %ymm0, (%rsi,%rax)
            addq    $32, %rax
            cmpq    %rax, %rcx
            jne     .L4
            movl    %edx, %eax
            andl    $-8, %eax
            testb   $7, %dl
            je      .L11
    .L3:
            subl    %eax, %edx
            vmovdqa .LC0(%rip), %ymm1
            salq    $2, %rax
            vmovd   %edx, %xmm0
            movl    $-2147483648, %edx
            addq    %rax, %rsi
            vmovd   %edx, %xmm15
            vpbroadcastd    %xmm0, %ymm0
            vpbroadcastd    %xmm15, %ymm15
            vpsubd  %ymm15, %ymm1, %ymm1
            vpsubd  %ymm15, %ymm0, %ymm0
            vpcmpgtd        %ymm1, %ymm0, %ymm0
            vpmaskmovd      (%rsi), %ymm0, %ymm1
            vpmaskmovd      (%rdi,%rax), %ymm0, %ymm2
            vpaddd  %ymm2, %ymm1, %ymm1
            vpmaskmovd      %ymm1, %ymm0, (%rsi)
    .L11:
            vzeroupper
    
    compared to
    
    .L3:
            movl    %edx, %r8d
            subl    %eax, %r8d
            leal    -1(%r8), %r9d
            cmpl    $2, %r9d
            jbe     .L6
            leaq    (%rcx,%rax,4), %r9
            vmovdqu (%rdi,%rax,4), %xmm2
            movl    %r8d, %eax
            andl    $-4, %eax
            vpaddd  (%r9), %xmm2, %xmm0
            addl    %eax, %esi
            andl    $3, %r8d
            vmovdqu %xmm0, (%r9)
            je      .L2
    .L6:
            movslq  %esi, %r8
            leaq    0(,%r8,4), %rax
            movl    (%rdi,%r8,4), %r8d
            addl    %r8d, (%rcx,%rax)
            leal    1(%rsi), %r8d
            cmpl    %r8d, %edx
            jle     .L2
            addl    $2, %esi
            movl    4(%rdi,%rax), %r8d
            addl    %r8d, 4(%rcx,%rax)
            cmpl    %esi, %edx
            jle     .L2
            movl    8(%rdi,%rax), %edx
            addl    %edx, 8(%rcx,%rax)
    .L2:
    
    The AVX512 fully masked loop would be
    
            vmovdqa .LC0(%rip), %ymm4
            vpbroadcastd    %edx, %ymm3
            vpcmpud $6, %ymm4, %ymm3, %k1
            xorl    %eax, %eax
            .p2align 4,,10
            .p2align 3
    .L3:
            vmovdqu32       (%rsi,%rax,4), %ymm2{%k1}
            vmovdqu32       (%rdi,%rax,4), %ymm1{%k1}
            vpaddd  %ymm2, %ymm1, %ymm0
            vmovdqu32       %ymm0, (%rsi,%rax,4){%k1}
            addq    $8, %rax
            vpbroadcastd    %eax, %ymm0
            vpaddd  %ymm4, %ymm0, %ymm0
            vpcmpud $6, %ymm0, %ymm3, %k1
            kortestb        %k1, %k1
            jne     .L3
    
    loop control using %rax would likely be more latency friendly
    here and the mask generation could be unified to a single place.
    
    2021-07-15  Richard Biener  <rguenther@suse.de>
    
            * tree-vect-loop.c (can_produce_all_loop_masks_p): Also
            allow producing the masks with vector comparisons.
            * tree-vect-stmts.c (vect_gen_while): Generate the mask
            with a vector comparison in case WHILE_ULT is not supported.

diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index fc3dab0d143..230d6e34208 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -975,11 +975,17 @@ can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
 {
   rgroup_controls *rgm;
   unsigned int i;
+  tree cmp_vectype;
   FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
     if (rgm->type != NULL_TREE
 	&& !direct_internal_fn_supported_p (IFN_WHILE_ULT,
 					    cmp_type, rgm->type,
-					    OPTIMIZE_FOR_SPEED))
+					    OPTIMIZE_FOR_SPEED)
+	&& ((cmp_vectype = build_vector_type
+			     (cmp_type, TYPE_VECTOR_SUBPARTS (rgm->type))),
+	    true)
+	&& !(VECTOR_BOOLEAN_TYPE_P (rgm->type)
+	     && expand_vec_cmp_expr_p (cmp_vectype, rgm->type, LT_EXPR)))
       return false;
   return true;
 }
diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
index 6a25d661800..18c4c66cb2d 100644
--- a/gcc/tree-vect-stmts.c
+++ b/gcc/tree-vect-stmts.c
@@ -12007,16 +12007,43 @@ vect_gen_while (gimple_seq *seq, tree mask_type, tree start_index,
 		tree end_index, const char *name)
 {
   tree cmp_type = TREE_TYPE (start_index);
-  gcc_checking_assert (direct_internal_fn_supported_p (IFN_WHILE_ULT,
-						       cmp_type, mask_type,
-						       OPTIMIZE_FOR_SPEED));
-  gcall *call = gimple_build_call_internal (IFN_WHILE_ULT, 3,
-					    start_index, end_index,
-					    build_zero_cst (mask_type));
-  tree tmp = make_temp_ssa_name (mask_type, NULL, name);
-  gimple_call_set_lhs (call, tmp);
-  gimple_seq_add_stmt (seq, call);
-  return tmp;
+  if (direct_internal_fn_supported_p (IFN_WHILE_ULT,
+				      cmp_type, mask_type,
+				      OPTIMIZE_FOR_SPEED))
+    {
+      gcall *call = gimple_build_call_internal (IFN_WHILE_ULT, 3,
+						start_index, end_index,
+						build_zero_cst (mask_type));
+      tree tmp = make_temp_ssa_name (mask_type, NULL, name);
+      gimple_call_set_lhs (call, tmp);
+      gimple_seq_add_stmt (seq, call);
+      return tmp;
+    }
+  else
+    {
+      /* Generate
+	   _1 = { start_index, start_index, ... };
+	   _2 = { end_index, end_index, ... };
+	   _3 = _1 + { 0, 1, 2 ... };
+	   _4 = _3 < _2;  */
+      tree cvectype = build_vector_type (cmp_type,
+					 TYPE_VECTOR_SUBPARTS (mask_type));
+      gcc_assert (VECTOR_BOOLEAN_TYPE_P (mask_type)
+		  && expand_vec_cmp_expr_p (cvectype, mask_type, LT_EXPR));
+      tree si = make_ssa_name (cvectype);
+      gassign *ass = gimple_build_assign
+			(si, build_vector_from_val (cvectype, start_index));
+      gimple_seq_add_stmt (seq, ass);
+      tree ei = make_ssa_name (cvectype);
+      ass = gimple_build_assign (ei,
+				 build_vector_from_val (cvectype, end_index));
+      gimple_seq_add_stmt (seq, ass);
+      tree incr = build_vec_series (cvectype, build_zero_cst (cmp_type),
+				    build_one_cst (cmp_type));
+      si = gimple_build (seq, PLUS_EXPR, cvectype, si, incr);
+      return gimple_build (seq, LT_EXPR, truth_type_for (cvectype),
+			   si, ei);
+    }
 }
 
 /* Generate a vector mask of type MASK_TYPE for which index I is false iff
Richard Sandiford July 15, 2021, 1:49 p.m. UTC | #4
Richard Biener <rguenther@suse.de> writes:
> The following extends the existing loop masking support using
> SVE WHILE_ULT to x86 by proving an alternate way to produce the
> mask using VEC_COND_EXPRs.  So with --param vect-partial-vector-usage
> you can now enable masked vectorized epilogues (=1) or fully
> masked vector loops (=2).

As mentioned on IRC, WHILE_ULT is supposed to ensure that every
element after the first zero is also zero.  That happens naturally
for power-of-2 vectors if the start index is a multiple of the VF.
(And at the moment, variable-length vectors are the only way of
supporting non-power-of-2 vectors.)

This probably works fine for =2 and =1 as things stand, since the
vector IVs always start at zero.  But if in future we have a single
IV counting scalar iterations, and use it even for peeled prologue
iterations, we could end up with a situation where the approximation
is no longer safe.

E.g. suppose we had a uint32_t scalar IV with a limit of (uint32_t)-3.
If we peeled 2 iterations for alignment and then had a VF of 8,
the final vector would have a start index of (uint32_t)-6 and the
vector would be { -1, -1, -1, 0, 0, 0, -1, -1 }.

So I think it would be safer to handle this as an alternative to
using while, rather than as a direct emulation, so that we can take
the extra restrictions into account.  Alternatively, we could probably
do { 0, 1, 2, ... } < { end - start, end - start, ... }.
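
Spelled out as a scalar model with those numbers (illustration only):

#include <stdio.h>

int
main (void)
{
  unsigned start = -6u, end = -3u;   /* limit (uint32_t)-3, 2 peeled, VF 8 */
  for (int lane = 0; lane < 8; ++lane)
    /* start + lane wraps back to 0 and 1 for lanes 6 and 7,
       re-enabling them after the first zero.  */
    printf ("%d ", start + lane < end ? -1 : 0);
  return 0;                          /* prints: -1 -1 -1 0 0 0 -1 -1 */
}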

Thanks,
Richard



>
> What's missing is using a scalar IV for the loop control
> (but in principle AVX512 can use the mask here - just the patch
> doesn't seem to work for AVX512 yet for some reason - likely
> expand_vec_cond_expr_p doesn't work there).  What's also missing
> is providing more support for predicated operations in the case
> of reductions either via VEC_COND_EXPRs or via implementing
> some of the .COND_{ADD,SUB,MUL...} internal functions as mapping
> to masked AVX512 operations.
>
> For AVX2 and
>
> int foo (unsigned *a, unsigned * __restrict b, int n)
> {
>   unsigned sum = 1;
>   for (int i = 0; i < n; ++i)
>     b[i] += a[i];
>   return sum;
> }
>
> we get
>
> .L3:
>         vpmaskmovd      (%rsi,%rax), %ymm0, %ymm3
>         vpmaskmovd      (%rdi,%rax), %ymm0, %ymm1
>         addl    $8, %edx
>         vpaddd  %ymm3, %ymm1, %ymm1
>         vpmaskmovd      %ymm1, %ymm0, (%rsi,%rax)
>         vmovd   %edx, %xmm1
>         vpsubd  %ymm15, %ymm2, %ymm0
>         addq    $32, %rax
>         vpbroadcastd    %xmm1, %ymm1
>         vpaddd  %ymm4, %ymm1, %ymm1
>         vpsubd  %ymm15, %ymm1, %ymm1
>         vpcmpgtd        %ymm1, %ymm0, %ymm0
>         vptest  %ymm0, %ymm0
>         jne     .L3
>
> for the fully masked loop body and for the masked epilogue
> we see
>
> .L4:
>         vmovdqu (%rsi,%rax), %ymm3
>         vpaddd  (%rdi,%rax), %ymm3, %ymm0
>         vmovdqu %ymm0, (%rsi,%rax)
>         addq    $32, %rax
>         cmpq    %rax, %rcx
>         jne     .L4
>         movl    %edx, %eax
>         andl    $-8, %eax
>         testb   $7, %dl
>         je      .L11
> .L3:
>         subl    %eax, %edx
>         vmovdqa .LC0(%rip), %ymm1
>         salq    $2, %rax
>         vmovd   %edx, %xmm0
>         movl    $-2147483648, %edx
>         addq    %rax, %rsi
>         vmovd   %edx, %xmm15
>         vpbroadcastd    %xmm0, %ymm0
>         vpbroadcastd    %xmm15, %ymm15
>         vpsubd  %ymm15, %ymm1, %ymm1
>         vpsubd  %ymm15, %ymm0, %ymm0
>         vpcmpgtd        %ymm1, %ymm0, %ymm0
>         vpmaskmovd      (%rsi), %ymm0, %ymm1
>         vpmaskmovd      (%rdi,%rax), %ymm0, %ymm2
>         vpaddd  %ymm2, %ymm1, %ymm1
>         vpmaskmovd      %ymm1, %ymm0, (%rsi)
> .L11:
>         vzeroupper
>
> compared to
>
> .L3:
>         movl    %edx, %r8d
>         subl    %eax, %r8d
>         leal    -1(%r8), %r9d
>         cmpl    $2, %r9d
>         jbe     .L6
>         leaq    (%rcx,%rax,4), %r9
>         vmovdqu (%rdi,%rax,4), %xmm2
>         movl    %r8d, %eax
>         andl    $-4, %eax
>         vpaddd  (%r9), %xmm2, %xmm0
>         addl    %eax, %esi
>         andl    $3, %r8d
>         vmovdqu %xmm0, (%r9)
>         je      .L2
> .L6:
>         movslq  %esi, %r8
>         leaq    0(,%r8,4), %rax
>         movl    (%rdi,%r8,4), %r8d
>         addl    %r8d, (%rcx,%rax)
>         leal    1(%rsi), %r8d
>         cmpl    %r8d, %edx
>         jle     .L2
>         addl    $2, %esi
>         movl    4(%rdi,%rax), %r8d
>         addl    %r8d, 4(%rcx,%rax)
>         cmpl    %esi, %edx
>         jle     .L2
>         movl    8(%rdi,%rax), %edx
>         addl    %edx, 8(%rcx,%rax)
> .L2:
>
> I'm giving this a little testing right now but will dig on why
> I don't get masked loops when AVX512 is enabled.
>
> Still comments are appreciated.
>
> Thanks,
> Richard.
>
> 2021-07-15  Richard Biener  <rguenther@suse.de>
>
> 	* tree-vect-stmts.c (can_produce_all_loop_masks_p): We
> 	also can produce masks with VEC_COND_EXPRs.
> 	* tree-vect-loop.c (vect_gen_while): Generate the mask
> 	with a VEC_COND_EXPR in case WHILE_ULT is not supported.
> ---
>  gcc/tree-vect-loop.c  |  8 ++++++-
>  gcc/tree-vect-stmts.c | 50 ++++++++++++++++++++++++++++++++++---------
>  2 files changed, 47 insertions(+), 11 deletions(-)
>
> diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
> index fc3dab0d143..2214ed11dfb 100644
> --- a/gcc/tree-vect-loop.c
> +++ b/gcc/tree-vect-loop.c
> @@ -975,11 +975,17 @@ can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
>  {
>    rgroup_controls *rgm;
>    unsigned int i;
> +  tree cmp_vectype;
>    FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
>      if (rgm->type != NULL_TREE
>  	&& !direct_internal_fn_supported_p (IFN_WHILE_ULT,
>  					    cmp_type, rgm->type,
> -					    OPTIMIZE_FOR_SPEED))
> +					    OPTIMIZE_FOR_SPEED)
> +	&& ((cmp_vectype
> +	       = truth_type_for (build_vector_type
> +				 (cmp_type, TYPE_VECTOR_SUBPARTS (rgm->type)))),
> +	    true)
> +	&& !expand_vec_cond_expr_p (rgm->type, cmp_vectype, LT_EXPR))
>        return false;
>    return true;
>  }
> diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
> index 6a25d661800..216986399b1 100644
> --- a/gcc/tree-vect-stmts.c
> +++ b/gcc/tree-vect-stmts.c
> @@ -12007,16 +12007,46 @@ vect_gen_while (gimple_seq *seq, tree mask_type, tree start_index,
>  		tree end_index, const char *name)
>  {
>    tree cmp_type = TREE_TYPE (start_index);
> -  gcc_checking_assert (direct_internal_fn_supported_p (IFN_WHILE_ULT,
> -						       cmp_type, mask_type,
> -						       OPTIMIZE_FOR_SPEED));
> -  gcall *call = gimple_build_call_internal (IFN_WHILE_ULT, 3,
> -					    start_index, end_index,
> -					    build_zero_cst (mask_type));
> -  tree tmp = make_temp_ssa_name (mask_type, NULL, name);
> -  gimple_call_set_lhs (call, tmp);
> -  gimple_seq_add_stmt (seq, call);
> -  return tmp;
> +  if (direct_internal_fn_supported_p (IFN_WHILE_ULT,
> +				      cmp_type, mask_type,
> +				      OPTIMIZE_FOR_SPEED))
> +    {
> +      gcall *call = gimple_build_call_internal (IFN_WHILE_ULT, 3,
> +						start_index, end_index,
> +						build_zero_cst (mask_type));
> +      tree tmp = make_temp_ssa_name (mask_type, NULL, name);
> +      gimple_call_set_lhs (call, tmp);
> +      gimple_seq_add_stmt (seq, call);
> +      return tmp;
> +    }
> +  else
> +    {
> +      /* Generate
> +	   _1 = { start_index, start_index, ... };
> +	   _2 = { end_index, end_index, ... };
> +	   _3 = _1 + { 0, 1, 2 ... };
> +	   _4 = _3 < _2;
> +	   _5 = VEC_COND_EXPR <_4, { -1, -1, ... } : { 0, 0, ... }>;   */
> +      tree cvectype = build_vector_type (cmp_type,
> +					 TYPE_VECTOR_SUBPARTS (mask_type));
> +      tree si = make_ssa_name (cvectype);
> +      gassign *ass = gimple_build_assign
> +			(si, build_vector_from_val (cvectype, start_index));
> +      gimple_seq_add_stmt (seq, ass);
> +      tree ei = make_ssa_name (cvectype);
> +      ass = gimple_build_assign (ei,
> +				 build_vector_from_val (cvectype, end_index));
> +      gimple_seq_add_stmt (seq, ass);
> +      tree incr = build_vec_series (cvectype, build_zero_cst (cmp_type),
> +				    build_one_cst (cmp_type));
> +      si = gimple_build (seq, PLUS_EXPR, cvectype, si, incr);
> +      tree cmp = gimple_build (seq, LT_EXPR, truth_type_for (cvectype),
> +			       si, ei);
> +      tree mask = gimple_build (seq, VEC_COND_EXPR, mask_type, cmp,
> +				build_all_ones_cst (mask_type),
> +				build_zero_cst (mask_type));
> +      return mask;
> +    }
>  }
>  
>  /* Generate a vector mask of type MASK_TYPE for which index I is false iff
Richard Biener July 15, 2021, 1:54 p.m. UTC | #5
On Thu, 15 Jul 2021, Richard Sandiford wrote:

> Richard Biener <rguenther@suse.de> writes:
> > The following extends the existing loop masking support using
> > SVE WHILE_ULT to x86 by proving an alternate way to produce the
> > mask using VEC_COND_EXPRs.  So with --param vect-partial-vector-usage
> > you can now enable masked vectorized epilogues (=1) or fully
> > masked vector loops (=2).
> 
> As mentioned on IRC, WHILE_ULT is supposed to ensure that every
> element after the first zero is also zero.  That happens naturally
> for power-of-2 vectors if the start index is a multiple of the VF.
> (And at the moment, variable-length vectors are the only way of
> supporting non-power-of-2 vectors.)
> 
> This probably works fine for =2 and =1 as things stand, since the
> vector IVs always start at zero.  But if in future we have a single
> IV counting scalar iterations, and use it even for peeled prologue
> iterations, we could end up with a situation where the approximation
> is no longer safe.
> 
> E.g. suppose we had a uint32_t scalar IV with a limit of (uint32_t)-3.
> If we peeled 2 iterations for alignment and then had a VF of 8,
> the final vector would have a start index of (uint32_t)-6 and the
> vector would be { -1, -1, -1, 0, 0, 0, -1, -1 }.

Ah, I didn't think of overflow, yeah.  Guess the add of
{ 0, 1, 2, 3 ... } would need to be saturating ;)

> So I think it would be safer to handle this as an alternative to
> using while, rather than as a direct emulation, so that we can take
> the extra restrictions into account.  Alternatively, we could probably
> do { 0, 1, 2, ... } < { end - start, end - start, ... }.

Or this, that looks correct and not worse from a complexity point
of view.

I'll see if I can come up with a testcase and even a fix.
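
For reference, a scalar model of the { 0, 1, 2, ... } < { end - start, ... }
variant on the same numbers (illustration only, not the eventual fix):

void
alt_mask (unsigned start_index, unsigned end_index, int mask[8])
{
  unsigned cnt = end_index - start_index;   /* 3 for start = -6u, end = -3u */
  for (int lane = 0; lane < 8; ++lane)
    mask[lane] = ((unsigned) lane < cnt) ? -1 : 0;
  /* Gives { -1, -1, -1, 0, 0, 0, 0, 0 } - no lane is re-enabled
     after the first zero.  */
}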

Thanks,
Richard.

> Thanks,
> Richard
> 
> 
> 
> >
> > What's missing is using a scalar IV for the loop control
> > (but in principle AVX512 can use the mask here - just the patch
> > doesn't seem to work for AVX512 yet for some reason - likely
> > expand_vec_cond_expr_p doesn't work there).  What's also missing
> > is providing more support for predicated operations in the case
> > of reductions either via VEC_COND_EXPRs or via implementing
> > some of the .COND_{ADD,SUB,MUL...} internal functions as mapping
> > to masked AVX512 operations.
> >
> > For AVX2 and
> >
> > int foo (unsigned *a, unsigned * __restrict b, int n)
> > {
> >   unsigned sum = 1;
> >   for (int i = 0; i < n; ++i)
> >     b[i] += a[i];
> >   return sum;
> > }
> >
> > we get
> >
> > .L3:
> >         vpmaskmovd      (%rsi,%rax), %ymm0, %ymm3
> >         vpmaskmovd      (%rdi,%rax), %ymm0, %ymm1
> >         addl    $8, %edx
> >         vpaddd  %ymm3, %ymm1, %ymm1
> >         vpmaskmovd      %ymm1, %ymm0, (%rsi,%rax)
> >         vmovd   %edx, %xmm1
> >         vpsubd  %ymm15, %ymm2, %ymm0
> >         addq    $32, %rax
> >         vpbroadcastd    %xmm1, %ymm1
> >         vpaddd  %ymm4, %ymm1, %ymm1
> >         vpsubd  %ymm15, %ymm1, %ymm1
> >         vpcmpgtd        %ymm1, %ymm0, %ymm0
> >         vptest  %ymm0, %ymm0
> >         jne     .L3
> >
> > for the fully masked loop body and for the masked epilogue
> > we see
> >
> > .L4:
> >         vmovdqu (%rsi,%rax), %ymm3
> >         vpaddd  (%rdi,%rax), %ymm3, %ymm0
> >         vmovdqu %ymm0, (%rsi,%rax)
> >         addq    $32, %rax
> >         cmpq    %rax, %rcx
> >         jne     .L4
> >         movl    %edx, %eax
> >         andl    $-8, %eax
> >         testb   $7, %dl
> >         je      .L11
> > .L3:
> >         subl    %eax, %edx
> >         vmovdqa .LC0(%rip), %ymm1
> >         salq    $2, %rax
> >         vmovd   %edx, %xmm0
> >         movl    $-2147483648, %edx
> >         addq    %rax, %rsi
> >         vmovd   %edx, %xmm15
> >         vpbroadcastd    %xmm0, %ymm0
> >         vpbroadcastd    %xmm15, %ymm15
> >         vpsubd  %ymm15, %ymm1, %ymm1
> >         vpsubd  %ymm15, %ymm0, %ymm0
> >         vpcmpgtd        %ymm1, %ymm0, %ymm0
> >         vpmaskmovd      (%rsi), %ymm0, %ymm1
> >         vpmaskmovd      (%rdi,%rax), %ymm0, %ymm2
> >         vpaddd  %ymm2, %ymm1, %ymm1
> >         vpmaskmovd      %ymm1, %ymm0, (%rsi)
> > .L11:
> >         vzeroupper
> >
> > compared to
> >
> > .L3:
> >         movl    %edx, %r8d
> >         subl    %eax, %r8d
> >         leal    -1(%r8), %r9d
> >         cmpl    $2, %r9d
> >         jbe     .L6
> >         leaq    (%rcx,%rax,4), %r9
> >         vmovdqu (%rdi,%rax,4), %xmm2
> >         movl    %r8d, %eax
> >         andl    $-4, %eax
> >         vpaddd  (%r9), %xmm2, %xmm0
> >         addl    %eax, %esi
> >         andl    $3, %r8d
> >         vmovdqu %xmm0, (%r9)
> >         je      .L2
> > .L6:
> >         movslq  %esi, %r8
> >         leaq    0(,%r8,4), %rax
> >         movl    (%rdi,%r8,4), %r8d
> >         addl    %r8d, (%rcx,%rax)
> >         leal    1(%rsi), %r8d
> >         cmpl    %r8d, %edx
> >         jle     .L2
> >         addl    $2, %esi
> >         movl    4(%rdi,%rax), %r8d
> >         addl    %r8d, 4(%rcx,%rax)
> >         cmpl    %esi, %edx
> >         jle     .L2
> >         movl    8(%rdi,%rax), %edx
> >         addl    %edx, 8(%rcx,%rax)
> > .L2:
> >
> > I'm giving this a little testing right now but will dig on why
> > I don't get masked loops when AVX512 is enabled.
> >
> > Still comments are appreciated.
> >
> > Thanks,
> > Richard.
> >
> > 2021-07-15  Richard Biener  <rguenther@suse.de>
> >
> > 	* tree-vect-stmts.c (can_produce_all_loop_masks_p): We
> > 	also can produce masks with VEC_COND_EXPRs.
> > 	* tree-vect-loop.c (vect_gen_while): Generate the mask
> > 	with a VEC_COND_EXPR in case WHILE_ULT is not supported.
> > ---
> >  gcc/tree-vect-loop.c  |  8 ++++++-
> >  gcc/tree-vect-stmts.c | 50 ++++++++++++++++++++++++++++++++++---------
> >  2 files changed, 47 insertions(+), 11 deletions(-)
> >
> > diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
> > index fc3dab0d143..2214ed11dfb 100644
> > --- a/gcc/tree-vect-loop.c
> > +++ b/gcc/tree-vect-loop.c
> > @@ -975,11 +975,17 @@ can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
> >  {
> >    rgroup_controls *rgm;
> >    unsigned int i;
> > +  tree cmp_vectype;
> >    FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
> >      if (rgm->type != NULL_TREE
> >  	&& !direct_internal_fn_supported_p (IFN_WHILE_ULT,
> >  					    cmp_type, rgm->type,
> > -					    OPTIMIZE_FOR_SPEED))
> > +					    OPTIMIZE_FOR_SPEED)
> > +	&& ((cmp_vectype
> > +	       = truth_type_for (build_vector_type
> > +				 (cmp_type, TYPE_VECTOR_SUBPARTS (rgm->type)))),
> > +	    true)
> > +	&& !expand_vec_cond_expr_p (rgm->type, cmp_vectype, LT_EXPR))
> >        return false;
> >    return true;
> >  }
> > diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
> > index 6a25d661800..216986399b1 100644
> > --- a/gcc/tree-vect-stmts.c
> > +++ b/gcc/tree-vect-stmts.c
> > @@ -12007,16 +12007,46 @@ vect_gen_while (gimple_seq *seq, tree mask_type, tree start_index,
> >  		tree end_index, const char *name)
> >  {
> >    tree cmp_type = TREE_TYPE (start_index);
> > -  gcc_checking_assert (direct_internal_fn_supported_p (IFN_WHILE_ULT,
> > -						       cmp_type, mask_type,
> > -						       OPTIMIZE_FOR_SPEED));
> > -  gcall *call = gimple_build_call_internal (IFN_WHILE_ULT, 3,
> > -					    start_index, end_index,
> > -					    build_zero_cst (mask_type));
> > -  tree tmp = make_temp_ssa_name (mask_type, NULL, name);
> > -  gimple_call_set_lhs (call, tmp);
> > -  gimple_seq_add_stmt (seq, call);
> > -  return tmp;
> > +  if (direct_internal_fn_supported_p (IFN_WHILE_ULT,
> > +				      cmp_type, mask_type,
> > +				      OPTIMIZE_FOR_SPEED))
> > +    {
> > +      gcall *call = gimple_build_call_internal (IFN_WHILE_ULT, 3,
> > +						start_index, end_index,
> > +						build_zero_cst (mask_type));
> > +      tree tmp = make_temp_ssa_name (mask_type, NULL, name);
> > +      gimple_call_set_lhs (call, tmp);
> > +      gimple_seq_add_stmt (seq, call);
> > +      return tmp;
> > +    }
> > +  else
> > +    {
> > +      /* Generate
> > +	   _1 = { start_index, start_index, ... };
> > +	   _2 = { end_index, end_index, ... };
> > +	   _3 = _1 + { 0, 1, 2 ... };
> > +	   _4 = _3 < _2;
> > +	   _5 = VEC_COND_EXPR <_4, { -1, -1, ... } : { 0, 0, ... }>;   */
> > +      tree cvectype = build_vector_type (cmp_type,
> > +					 TYPE_VECTOR_SUBPARTS (mask_type));
> > +      tree si = make_ssa_name (cvectype);
> > +      gassign *ass = gimple_build_assign
> > +			(si, build_vector_from_val (cvectype, start_index));
> > +      gimple_seq_add_stmt (seq, ass);
> > +      tree ei = make_ssa_name (cvectype);
> > +      ass = gimple_build_assign (ei,
> > +				 build_vector_from_val (cvectype, end_index));
> > +      gimple_seq_add_stmt (seq, ass);
> > +      tree incr = build_vec_series (cvectype, build_zero_cst (cmp_type),
> > +				    build_one_cst (cmp_type));
> > +      si = gimple_build (seq, PLUS_EXPR, cvectype, si, incr);
> > +      tree cmp = gimple_build (seq, LT_EXPR, truth_type_for (cvectype),
> > +			       si, ei);
> > +      tree mask = gimple_build (seq, VEC_COND_EXPR, mask_type, cmp,
> > +				build_all_ones_cst (mask_type),
> > +				build_zero_cst (mask_type));
> > +      return mask;
> > +    }
> >  }
> >  
> >  /* Generate a vector mask of type MASK_TYPE for which index I is false iff
>
Richard Sandiford July 15, 2021, 2:57 p.m. UTC | #6
Richard Biener <rguenther@suse.de> writes:
> On Thu, 15 Jul 2021, Hongtao Liu wrote:
>
>> On Thu, Jul 15, 2021 at 6:45 PM Richard Biener via Gcc-patches
>> <gcc-patches@gcc.gnu.org> wrote:
>> >
>> > On Thu, Jul 15, 2021 at 12:30 PM Richard Biener <rguenther@suse.de> wrote:
>> > >
>> > > The following extends the existing loop masking support using
>> > > SVE WHILE_ULT to x86 by proving an alternate way to produce the
>> > > mask using VEC_COND_EXPRs.  So with --param vect-partial-vector-usage
>> > > you can now enable masked vectorized epilogues (=1) or fully
>> > > masked vector loops (=2).
>> > >
>> > > What's missing is using a scalar IV for the loop control
>> > > (but in principle AVX512 can use the mask here - just the patch
>> > > doesn't seem to work for AVX512 yet for some reason - likely
>> > > expand_vec_cond_expr_p doesn't work there).  What's also missing
>> > > is providing more support for predicated operations in the case
>> > > of reductions either via VEC_COND_EXPRs or via implementing
>> > > some of the .COND_{ADD,SUB,MUL...} internal functions as mapping
>> > > to masked AVX512 operations.
>> > >
>> > > For AVX2 and
>> > >
>> > > int foo (unsigned *a, unsigned * __restrict b, int n)
>> > > {
>> > >   unsigned sum = 1;
>> > >   for (int i = 0; i < n; ++i)
>> > >     b[i] += a[i];
>> > >   return sum;
>> > > }
>> > >
>> > > we get
>> > >
>> > > .L3:
>> > >         vpmaskmovd      (%rsi,%rax), %ymm0, %ymm3
>> > >         vpmaskmovd      (%rdi,%rax), %ymm0, %ymm1
>> > >         addl    $8, %edx
>> > >         vpaddd  %ymm3, %ymm1, %ymm1
>> > >         vpmaskmovd      %ymm1, %ymm0, (%rsi,%rax)
>> > >         vmovd   %edx, %xmm1
>> > >         vpsubd  %ymm15, %ymm2, %ymm0
>> > >         addq    $32, %rax
>> > >         vpbroadcastd    %xmm1, %ymm1
>> > >         vpaddd  %ymm4, %ymm1, %ymm1
>> > >         vpsubd  %ymm15, %ymm1, %ymm1
>> > >         vpcmpgtd        %ymm1, %ymm0, %ymm0
>> > >         vptest  %ymm0, %ymm0
>> > >         jne     .L3
>> > >
>> > > for the fully masked loop body and for the masked epilogue
>> > > we see
>> > >
>> > > .L4:
>> > >         vmovdqu (%rsi,%rax), %ymm3
>> > >         vpaddd  (%rdi,%rax), %ymm3, %ymm0
>> > >         vmovdqu %ymm0, (%rsi,%rax)
>> > >         addq    $32, %rax
>> > >         cmpq    %rax, %rcx
>> > >         jne     .L4
>> > >         movl    %edx, %eax
>> > >         andl    $-8, %eax
>> > >         testb   $7, %dl
>> > >         je      .L11
>> > > .L3:
>> > >         subl    %eax, %edx
>> > >         vmovdqa .LC0(%rip), %ymm1
>> > >         salq    $2, %rax
>> > >         vmovd   %edx, %xmm0
>> > >         movl    $-2147483648, %edx
>> > >         addq    %rax, %rsi
>> > >         vmovd   %edx, %xmm15
>> > >         vpbroadcastd    %xmm0, %ymm0
>> > >         vpbroadcastd    %xmm15, %ymm15
>> > >         vpsubd  %ymm15, %ymm1, %ymm1
>> > >         vpsubd  %ymm15, %ymm0, %ymm0
>> > >         vpcmpgtd        %ymm1, %ymm0, %ymm0
>> > >         vpmaskmovd      (%rsi), %ymm0, %ymm1
>> > >         vpmaskmovd      (%rdi,%rax), %ymm0, %ymm2
>> > >         vpaddd  %ymm2, %ymm1, %ymm1
>> > >         vpmaskmovd      %ymm1, %ymm0, (%rsi)
>> > > .L11:
>> > >         vzeroupper
>> > >
>> > > compared to
>> > >
>> > > .L3:
>> > >         movl    %edx, %r8d
>> > >         subl    %eax, %r8d
>> > >         leal    -1(%r8), %r9d
>> > >         cmpl    $2, %r9d
>> > >         jbe     .L6
>> > >         leaq    (%rcx,%rax,4), %r9
>> > >         vmovdqu (%rdi,%rax,4), %xmm2
>> > >         movl    %r8d, %eax
>> > >         andl    $-4, %eax
>> > >         vpaddd  (%r9), %xmm2, %xmm0
>> > >         addl    %eax, %esi
>> > >         andl    $3, %r8d
>> > >         vmovdqu %xmm0, (%r9)
>> > >         je      .L2
>> > > .L6:
>> > >         movslq  %esi, %r8
>> > >         leaq    0(,%r8,4), %rax
>> > >         movl    (%rdi,%r8,4), %r8d
>> > >         addl    %r8d, (%rcx,%rax)
>> > >         leal    1(%rsi), %r8d
>> > >         cmpl    %r8d, %edx
>> > >         jle     .L2
>> > >         addl    $2, %esi
>> > >         movl    4(%rdi,%rax), %r8d
>> > >         addl    %r8d, 4(%rcx,%rax)
>> > >         cmpl    %esi, %edx
>> > >         jle     .L2
>> > >         movl    8(%rdi,%rax), %edx
>> > >         addl    %edx, 8(%rcx,%rax)
>> > > .L2:
>> > >
>> > > I'm giving this a little testing right now but will dig on why
>> > > I don't get masked loops when AVX512 is enabled.
>> >
>> > Ah, a simple thinko - rgroup_controls vectypes seem to be
>> > always VECTOR_BOOLEAN_TYPE_P and thus we can
>> > use expand_vec_cmp_expr_p.  The AVX512 fully masked
>> > loop then looks like
>> >
>> > .L3:
>> >         vmovdqu32       (%rsi,%rax,4), %ymm2{%k1}
>> >         vmovdqu32       (%rdi,%rax,4), %ymm1{%k1}
>> >         vpaddd  %ymm2, %ymm1, %ymm0
>> >         vmovdqu32       %ymm0, (%rsi,%rax,4){%k1}
>> >         addq    $8, %rax
>> >         vpbroadcastd    %eax, %ymm0
>> >         vpaddd  %ymm4, %ymm0, %ymm0
>> >         vpcmpud $6, %ymm0, %ymm3, %k1
>> >         kortestb        %k1, %k1
>> >         jne     .L3
>> >
>> > I guess for x86 it's not worth preserving the VEC_COND_EXPR
>> > mask generation but other archs may not provide all required vec_cmp
>> > expanders.
>> 
>> For the main loop, the full-masked loop's codegen seems much worse.
>> Basically, we need at least 4 instructions to do what while_ult in arm does.
>> 
>>          vpbroadcastd    %eax, %ymm0
>>          vpaddd  %ymm4, %ymm0, %ymm0
>>          vpcmpud $6, %ymm0, %ymm3, %k1
>>          kortestb        %k1, %k1
>> vs
>>        whilelo(or some other while<op>)
>> 
>> more instructions are needed for avx2 since there's no direct
>> instruction for .COND_{ADD,SUB..}
>> 
>> original
>> .L4:
>>         vmovdqu (%rcx,%rax), %ymm1
>>         vpaddd (%rdi,%rax), %ymm1, %ymm0
>>         vmovdqu %ymm0, (%rcx,%rax)
>>         addq $32, %rax
>>         cmpq %rax, %rsi
>>         jne .L4
>> 
>> vs
>> avx512 full-masked loop
>> .L3:
>>          vmovdqu32       (%rsi,%rax,4), %ymm2{%k1}
>>          vmovdqu32       (%rdi,%rax,4), %ymm1{%k1}
>>          vpaddd  %ymm2, %ymm1, %ymm0
>>          vmovdqu32       %ymm0, (%rsi,%rax,4){%k1}
>>          addq    $8, %rax
>>          vpbroadcastd    %eax, %ymm0
>>          vpaddd  %ymm4, %ymm0, %ymm0
>>          vpcmpud $6, %ymm0, %ymm3, %k1
>>          kortestb        %k1, %k1
>>          jne     .L3
>> 
>> vs
>> avx2 full-masked loop
>> .L3:
>>          vpmaskmovd      (%rsi,%rax), %ymm0, %ymm3
>>          vpmaskmovd      (%rdi,%rax), %ymm0, %ymm1
>>          addl    $8, %edx
>>          vpaddd  %ymm3, %ymm1, %ymm1
>>          vpmaskmovd      %ymm1, %ymm0, (%rsi,%rax)
>>          vmovd   %edx, %xmm1
>>          vpsubd  %ymm15, %ymm2, %ymm0
>>          addq    $32, %rax
>>          vpbroadcastd    %xmm1, %ymm1
>>          vpaddd  %ymm4, %ymm1, %ymm1
>>          vpsubd  %ymm15, %ymm1, %ymm1
>>          vpcmpgtd        %ymm1, %ymm0, %ymm0
>>         vptest  %ymm0, %ymm0
>>          jne     .L3
>> 
>> vs  arm64's code
>> 
>> .L3:
>>     ld1w z1.s, p0/z, [x1, x3, lsl 2]
>>     ld1w z0.s, p0/z, [x0, x3, lsl 2]
>>     add z0.s, z0.s, z1.s
>>     st1w z0.s, p0, [x1, x3, lsl 2]
>>     add x3, x3, x4
>>     whilelo p0.s, w3, w2
>>     b.any .L3
>
> Yes, that's true - it might still be OK for vectorizing epilogues
> and thus --param vect-partial-vector-usage=1
>
> Can AVX512 do any better than this?
>
>         vpbroadcastd    %eax, %ymm0
>         vpaddd  %ymm4, %ymm0, %ymm0
>         vpcmpud $6, %ymm0, %ymm3, %k1
>
> Note with multiple types involved things get even worse
> since you need masks for each vector mode.  But as far as
> I can see that's the same for SVE (but there we have the
> single-instruction whilelo).  I guess we'll also generate
> wrong code at the moment for the case where we need
> multiple vectors to hold the full mask - vect_gen_while
> doesn't seem to be prepared for this?
>
> So with
>
> int foo (unsigned long *a, unsigned * __restrict b, int n)
> {
>   unsigned sum = 1;
>   for (int i = 0; i < n; ++i)
>     {
>       b[i] += a[i];
>     }
>   return sum;
> }
>
> SVE uses
>
> .L3:
>         ld1d    z0.d, p0/z, [x1, x3, lsl 3]
>         ld1d    z1.d, p0/z, [x0, x3, lsl 3]
>         adr     z0.d, [z0.d, z1.d, lsl 2]
>         st1d    z0.d, p0, [x1, x3, lsl 3]
>         add     x3, x3, x4
>         whilelo p0.d, w3, w2
>         b.any   .L3
>
> so p0 vs. p0/z, whatever that means and it looks like
> the vector add can somehow concatenate z0.d and z1.d.
> Truly fascinating ;)

It looks like this is from a version in which “b” was also long.
For the loop above I get:

.L3:
        ld1d    z0.d, p0/z, [x0, x3, lsl 3]
        ld1w    z1.d, p0/z, [x1, x3, lsl 2]
        add     z0.s, z0.s, z1.s
        st1w    z0.d, p0, [x1, x3, lsl 2]
        add     x3, x3, x4
        whilelo p0.d, w3, w2
        b.any   .L3

where the ld1w is an extending load and the st1w is a truncating store.

But yeah, there's a hard-coded assumption that a mask for 2 SIs
can also control 1 DI, etc.  For example:

void foo (unsigned long *a, unsigned * __restrict b, int n)
{
  for (int i = 0; i < n; ++i)
    {
      a[i * 2] += 1;
      a[i * 2 + 1] += 2;
      b[i * 4] += 1;
      b[i * 4 + 1] += 2;
      b[i * 4 + 2] += 3;
      b[i * 4 + 3] += 4;
    }
}

becomes:

.L3:
        ld1d    z0.d, p0/z, [x0]
        add     z0.d, z0.d, z2.d
        st1d    z0.d, p0, [x0]
        ld1w    z0.s, p0/z, [x1, x3, lsl 2]
        add     z0.s, z0.s, z1.s
        st1w    z0.s, p0, [x1, x3, lsl 2]
        add     x0, x0, x4
        add     x3, x3, x5
        whilelo p0.s, x3, x2
        b.any   .L3

This is explained more in the big comment above rgroup_controls
(guess you've already seen it).

> It looks like --param vect_partial_vector_usage defaults to 2,
> power forces it to 1 (power10) or 0 otherwise.
>
> I think we'd need a target hook that toggles this per mode
> so we could tune this dependent on AVX512 vectorization vs. AVX2.

Yeah, keying it off the architecture made sense for Power because the
only purpose of len_load/store is to support partial vectorisation.
But I agree a hook is needed now that we have MASK_LOAD/STORE for
general use and are emulating WHILE_ULT.

Thanks,
Richard
Richard Biener July 15, 2021, 3:15 p.m. UTC | #7
On Thu, 15 Jul 2021, Richard Sandiford wrote:

> Richard Biener <rguenther@suse.de> writes:
> > On Thu, 15 Jul 2021, Hongtao Liu wrote:
> >
> >> On Thu, Jul 15, 2021 at 6:45 PM Richard Biener via Gcc-patches
> >> <gcc-patches@gcc.gnu.org> wrote:
> >> >
> >> > On Thu, Jul 15, 2021 at 12:30 PM Richard Biener <rguenther@suse.de> wrote:
> >> > >
> >> > > The following extends the existing loop masking support using
> >> > > SVE WHILE_ULT to x86 by proving an alternate way to produce the
> >> > > mask using VEC_COND_EXPRs.  So with --param vect-partial-vector-usage
> >> > > you can now enable masked vectorized epilogues (=1) or fully
> >> > > masked vector loops (=2).
> >> > >
> >> > > What's missing is using a scalar IV for the loop control
> >> > > (but in principle AVX512 can use the mask here - just the patch
> >> > > doesn't seem to work for AVX512 yet for some reason - likely
> >> > > expand_vec_cond_expr_p doesn't work there).  What's also missing
> >> > > is providing more support for predicated operations in the case
> >> > > of reductions either via VEC_COND_EXPRs or via implementing
> >> > > some of the .COND_{ADD,SUB,MUL...} internal functions as mapping
> >> > > to masked AVX512 operations.
> >> > >
> >> > > For AVX2 and
> >> > >
> >> > > int foo (unsigned *a, unsigned * __restrict b, int n)
> >> > > {
> >> > >   unsigned sum = 1;
> >> > >   for (int i = 0; i < n; ++i)
> >> > >     b[i] += a[i];
> >> > >   return sum;
> >> > > }
> >> > >
> >> > > we get
> >> > >
> >> > > .L3:
> >> > >         vpmaskmovd      (%rsi,%rax), %ymm0, %ymm3
> >> > >         vpmaskmovd      (%rdi,%rax), %ymm0, %ymm1
> >> > >         addl    $8, %edx
> >> > >         vpaddd  %ymm3, %ymm1, %ymm1
> >> > >         vpmaskmovd      %ymm1, %ymm0, (%rsi,%rax)
> >> > >         vmovd   %edx, %xmm1
> >> > >         vpsubd  %ymm15, %ymm2, %ymm0
> >> > >         addq    $32, %rax
> >> > >         vpbroadcastd    %xmm1, %ymm1
> >> > >         vpaddd  %ymm4, %ymm1, %ymm1
> >> > >         vpsubd  %ymm15, %ymm1, %ymm1
> >> > >         vpcmpgtd        %ymm1, %ymm0, %ymm0
> >> > >         vptest  %ymm0, %ymm0
> >> > >         jne     .L3
> >> > >
> >> > > for the fully masked loop body and for the masked epilogue
> >> > > we see
> >> > >
> >> > > .L4:
> >> > >         vmovdqu (%rsi,%rax), %ymm3
> >> > >         vpaddd  (%rdi,%rax), %ymm3, %ymm0
> >> > >         vmovdqu %ymm0, (%rsi,%rax)
> >> > >         addq    $32, %rax
> >> > >         cmpq    %rax, %rcx
> >> > >         jne     .L4
> >> > >         movl    %edx, %eax
> >> > >         andl    $-8, %eax
> >> > >         testb   $7, %dl
> >> > >         je      .L11
> >> > > .L3:
> >> > >         subl    %eax, %edx
> >> > >         vmovdqa .LC0(%rip), %ymm1
> >> > >         salq    $2, %rax
> >> > >         vmovd   %edx, %xmm0
> >> > >         movl    $-2147483648, %edx
> >> > >         addq    %rax, %rsi
> >> > >         vmovd   %edx, %xmm15
> >> > >         vpbroadcastd    %xmm0, %ymm0
> >> > >         vpbroadcastd    %xmm15, %ymm15
> >> > >         vpsubd  %ymm15, %ymm1, %ymm1
> >> > >         vpsubd  %ymm15, %ymm0, %ymm0
> >> > >         vpcmpgtd        %ymm1, %ymm0, %ymm0
> >> > >         vpmaskmovd      (%rsi), %ymm0, %ymm1
> >> > >         vpmaskmovd      (%rdi,%rax), %ymm0, %ymm2
> >> > >         vpaddd  %ymm2, %ymm1, %ymm1
> >> > >         vpmaskmovd      %ymm1, %ymm0, (%rsi)
> >> > > .L11:
> >> > >         vzeroupper
> >> > >
> >> > > compared to
> >> > >
> >> > > .L3:
> >> > >         movl    %edx, %r8d
> >> > >         subl    %eax, %r8d
> >> > >         leal    -1(%r8), %r9d
> >> > >         cmpl    $2, %r9d
> >> > >         jbe     .L6
> >> > >         leaq    (%rcx,%rax,4), %r9
> >> > >         vmovdqu (%rdi,%rax,4), %xmm2
> >> > >         movl    %r8d, %eax
> >> > >         andl    $-4, %eax
> >> > >         vpaddd  (%r9), %xmm2, %xmm0
> >> > >         addl    %eax, %esi
> >> > >         andl    $3, %r8d
> >> > >         vmovdqu %xmm0, (%r9)
> >> > >         je      .L2
> >> > > .L6:
> >> > >         movslq  %esi, %r8
> >> > >         leaq    0(,%r8,4), %rax
> >> > >         movl    (%rdi,%r8,4), %r8d
> >> > >         addl    %r8d, (%rcx,%rax)
> >> > >         leal    1(%rsi), %r8d
> >> > >         cmpl    %r8d, %edx
> >> > >         jle     .L2
> >> > >         addl    $2, %esi
> >> > >         movl    4(%rdi,%rax), %r8d
> >> > >         addl    %r8d, 4(%rcx,%rax)
> >> > >         cmpl    %esi, %edx
> >> > >         jle     .L2
> >> > >         movl    8(%rdi,%rax), %edx
> >> > >         addl    %edx, 8(%rcx,%rax)
> >> > > .L2:
> >> > >
> >> > > I'm giving this a little testing right now but will dig on why
> >> > > I don't get masked loops when AVX512 is enabled.
> >> >
> >> > Ah, a simple thinko - rgroup_controls vectypes seem to be
> >> > always VECTOR_BOOLEAN_TYPE_P and thus we can
> >> > use expand_vec_cmp_expr_p.  The AVX512 fully masked
> >> > loop then looks like
> >> >
> >> > .L3:
> >> >         vmovdqu32       (%rsi,%rax,4), %ymm2{%k1}
> >> >         vmovdqu32       (%rdi,%rax,4), %ymm1{%k1}
> >> >         vpaddd  %ymm2, %ymm1, %ymm0
> >> >         vmovdqu32       %ymm0, (%rsi,%rax,4){%k1}
> >> >         addq    $8, %rax
> >> >         vpbroadcastd    %eax, %ymm0
> >> >         vpaddd  %ymm4, %ymm0, %ymm0
> >> >         vpcmpud $6, %ymm0, %ymm3, %k1
> >> >         kortestb        %k1, %k1
> >> >         jne     .L3
> >> >
> >> > I guess for x86 it's not worth preserving the VEC_COND_EXPR
> >> > mask generation but other archs may not provide all required vec_cmp
> >> > expanders.
> >> 
> >> For the main loop, the full-masked loop's codegen seems much worse.
> >> Basically, we need at least 4 instructions to do what while_ult in arm does.
> >> 
> >>          vpbroadcastd    %eax, %ymm0
> >>          vpaddd  %ymm4, %ymm0, %ymm0
> >>          vpcmpud $6, %ymm0, %ymm3, %k1
> >>          kortestb        %k1, %k1
> >> vs
> >>        whilelo(or some other while<op>)
> >> 
> >> more instructions are needed for avx2 since there's no direct
> >> instruction for .COND_{ADD,SUB..}
> >> 
> >> original
> >> .L4:
> >>         vmovdqu (%rcx,%rax), %ymm1
> >>         vpaddd (%rdi,%rax), %ymm1, %ymm0
> >>         vmovdqu %ymm0, (%rcx,%rax)
> >>         addq $32, %rax
> >>         cmpq %rax, %rsi
> >>         jne .L4
> >> 
> >> vs
> >> avx512 full-masked loop
> >> .L3:
> >>          vmovdqu32       (%rsi,%rax,4), %ymm2{%k1}
> >>          vmovdqu32       (%rdi,%rax,4), %ymm1{%k1}
> >>          vpaddd  %ymm2, %ymm1, %ymm0
> >>          vmovdqu32       %ymm0, (%rsi,%rax,4){%k1}
> >>          addq    $8, %rax
> >>          vpbroadcastd    %eax, %ymm0
> >>          vpaddd  %ymm4, %ymm0, %ymm0
> >>          vpcmpud $6, %ymm0, %ymm3, %k1
> >>          kortestb        %k1, %k1
> >>          jne     .L3
> >> 
> >> vs
> >> avx2 full-masked loop
> >> .L3:
> >>          vpmaskmovd      (%rsi,%rax), %ymm0, %ymm3
> >>          vpmaskmovd      (%rdi,%rax), %ymm0, %ymm1
> >>          addl    $8, %edx
> >>          vpaddd  %ymm3, %ymm1, %ymm1
> >>          vpmaskmovd      %ymm1, %ymm0, (%rsi,%rax)
> >>          vmovd   %edx, %xmm1
> >>          vpsubd  %ymm15, %ymm2, %ymm0
> >>          addq    $32, %rax
> >>          vpbroadcastd    %xmm1, %ymm1
> >>          vpaddd  %ymm4, %ymm1, %ymm1
> >>          vpsubd  %ymm15, %ymm1, %ymm1
> >>          vpcmpgtd        %ymm1, %ymm0, %ymm0
> >>         vptest  %ymm0, %ymm0
> >>          jne     .L3
> >> 
> >> vs  arm64's code
> >> 
> >> .L3:
> >>     ld1w z1.s, p0/z, [x1, x3, lsl 2]
> >>     ld1w z0.s, p0/z, [x0, x3, lsl 2]
> >>     add z0.s, z0.s, z1.s
> >>     st1w z0.s, p0, [x1, x3, lsl 2]
> >>     add x3, x3, x4
> >>     whilelo p0.s, w3, w2
> >>     b.any .L3
> >
> > Yes, that's true - it might still be OK for vectorizing epilogues
> > and thus --param vect-partial-vector-usage=1
> >
> > Can AVX512 do any better than this?
> >
> >         vpbroadcastd    %eax, %ymm0
> >         vpaddd  %ymm4, %ymm0, %ymm0
> >         vpcmpud $6, %ymm0, %ymm3, %k1
> >
> > Note with multiple types involved things get even worse
> > since you need masks for each vector mode.  But as far as
> > I can see that's the same for SVE (but there we have the
> > single-instruction whilelo).  I guess we'll also generate
> > wrong code at the moment for the case where we need
> > multiple vectors to hold the full mask - vect_gen_while
> > doesn't seem to be prepared for this?
> >
> > So with
> >
> > int foo (unsigned long *a, unsigned * __restrict b, int n)
> > {
> >   unsigned sum = 1;
> >   for (int i = 0; i < n; ++i)
> >     {
> >       b[i] += a[i];
> >     }
> >   return sum;
> > }
> >
> > SVE uses
> >
> > .L3:
> >         ld1d    z0.d, p0/z, [x1, x3, lsl 3]
> >         ld1d    z1.d, p0/z, [x0, x3, lsl 3]
> >         adr     z0.d, [z0.d, z1.d, lsl 2]
> >         st1d    z0.d, p0, [x1, x3, lsl 3]
> >         add     x3, x3, x4
> >         whilelo p0.d, w3, w2
> >         b.any   .L3
> >
> > so p0 vs. p0/z, whatever that means and it looks like
> > the vector add can somehow concatenate z0.d and z1.d.
> > Truly fascinating ;)
> 
> It looks like this is from a version in which “b” was also long.
> For the loop above I get:
> 
> .L3:
>         ld1d    z0.d, p0/z, [x0, x3, lsl 3]
>         ld1w    z1.d, p0/z, [x1, x3, lsl 2]
>         add     z0.s, z0.s, z1.s
>         st1w    z0.d, p0, [x1, x3, lsl 2]
>         add     x3, x3, x4
>         whilelo p0.d, w3, w2
>         b.any   .L3
> 
> where the ld1w is an extending load and the st1w is a truncating store.
> 
> But yeah, there's a hard-coded assumption that a mask for 2 SIs
> can also control 1 DI, etc.  For example:
> 
> void foo (unsigned long *a, unsigned * __restrict b, int n)
> {
>   for (int i = 0; i < n; ++i)
>     {
>       a[i * 2] += 1;
>       a[i * 2 + 1] += 2;
>       b[i * 4] += 1;
>       b[i * 4 + 1] += 2;
>       b[i * 4 + 2] += 3;
>       b[i * 4 + 3] += 4;
>     }
> }
> 
> becomes:
> 
> .L3:
>         ld1d    z0.d, p0/z, [x0]
>         add     z0.d, z0.d, z2.d
>         st1d    z0.d, p0, [x0]
>         ld1w    z0.s, p0/z, [x1, x3, lsl 2]
>         add     z0.s, z0.s, z1.s
>         st1w    z0.s, p0, [x1, x3, lsl 2]
>         add     x0, x0, x4
>         add     x3, x3, x5
>         whilelo p0.s, x3, x2
>         b.any   .L3
> 
> This is explained more in the big comment above rgroup_controls
> (guess you've already seen it).

OK, guess I was more looking at

#define N 32
int foo (unsigned long *a, unsigned long * __restrict b,
         unsigned int *c, unsigned int * __restrict d,
         int n)
{
  unsigned sum = 1;
  for (int i = 0; i < n; ++i)
    {
      b[i] += a[i];
      d[i] += c[i];
    }
  return sum;
}

where on x86 with AVX512 we vectorize with V8DI and V16SI and
generate two masks for the two copies of V8DI (VF is 16) and one
mask for V16SI.  With SVE I see

        punpklo p1.h, p0.b
        punpkhi p2.h, p0.b

that's something I expected to see for AVX512 as well, using the V16SI
mask and unpacking that to two V8DI ones.  But I see

        vpbroadcastd    %eax, %ymm0
        vpaddd  %ymm12, %ymm0, %ymm0
        vpcmpud $6, %ymm0, %ymm11, %k3
        vpbroadcastd    %eax, %xmm0
        vpaddd  %xmm10, %xmm0, %xmm0
        vpcmpud $1, %xmm7, %xmm0, %k1
        vpcmpud $6, %xmm0, %xmm8, %k2
        kortestb        %k1, %k1
        jne     .L3

so three %k masks are generated by vpcmpud.  I'll have to look at what
the magic for SVE is and why that doesn't trigger for x86 here.

Richard.

> > It looks like --param vect_partial_vector_usage defaults to 2,
> > power forces it to 1 (power10) or 0 otherwise.
> >
> > I think we'd need a target hook that toggles this per mode
> > so we could tune this dependent on AVX512 vectorization vs. AVX2.
> 
> Yeah, keying it off the architecture made sense for Power because the
> only purpose of len_load/store is to support partial vectorisation.
> But I agree a hook is needed now that we have MASK_LOAD/STORE for
> general use and are emulating WHILE_ULT.
> 
> Thanks,
> Richard
>
Richard Biener July 15, 2021, 3:31 p.m. UTC | #8
On Thu, 15 Jul 2021, Richard Biener wrote:

> On Thu, 15 Jul 2021, Richard Sandiford wrote:
> 
> > Richard Biener <rguenther@suse.de> writes:
> > > On Thu, 15 Jul 2021, Hongtao Liu wrote:
> > >
> > >> On Thu, Jul 15, 2021 at 6:45 PM Richard Biener via Gcc-patches
> > >> <gcc-patches@gcc.gnu.org> wrote:
> > >> >
> > >> > On Thu, Jul 15, 2021 at 12:30 PM Richard Biener <rguenther@suse.de> wrote:
> > >> > >
> > >> > > The following extends the existing loop masking support using
> > >> > > SVE WHILE_ULT to x86 by proving an alternate way to produce the
> > >> > > mask using VEC_COND_EXPRs.  So with --param vect-partial-vector-usage
> > >> > > you can now enable masked vectorized epilogues (=1) or fully
> > >> > > masked vector loops (=2).
> > >> > >
> > >> > > What's missing is using a scalar IV for the loop control
> > >> > > (but in principle AVX512 can use the mask here - just the patch
> > >> > > doesn't seem to work for AVX512 yet for some reason - likely
> > >> > > expand_vec_cond_expr_p doesn't work there).  What's also missing
> > >> > > is providing more support for predicated operations in the case
> > >> > > of reductions either via VEC_COND_EXPRs or via implementing
> > >> > > some of the .COND_{ADD,SUB,MUL...} internal functions as mapping
> > >> > > to masked AVX512 operations.
> > >> > >
> > >> > > For AVX2 and
> > >> > >
> > >> > > int foo (unsigned *a, unsigned * __restrict b, int n)
> > >> > > {
> > >> > >   unsigned sum = 1;
> > >> > >   for (int i = 0; i < n; ++i)
> > >> > >     b[i] += a[i];
> > >> > >   return sum;
> > >> > > }
> > >> > >
> > >> > > we get
> > >> > >
> > >> > > .L3:
> > >> > >         vpmaskmovd      (%rsi,%rax), %ymm0, %ymm3
> > >> > >         vpmaskmovd      (%rdi,%rax), %ymm0, %ymm1
> > >> > >         addl    $8, %edx
> > >> > >         vpaddd  %ymm3, %ymm1, %ymm1
> > >> > >         vpmaskmovd      %ymm1, %ymm0, (%rsi,%rax)
> > >> > >         vmovd   %edx, %xmm1
> > >> > >         vpsubd  %ymm15, %ymm2, %ymm0
> > >> > >         addq    $32, %rax
> > >> > >         vpbroadcastd    %xmm1, %ymm1
> > >> > >         vpaddd  %ymm4, %ymm1, %ymm1
> > >> > >         vpsubd  %ymm15, %ymm1, %ymm1
> > >> > >         vpcmpgtd        %ymm1, %ymm0, %ymm0
> > >> > >         vptest  %ymm0, %ymm0
> > >> > >         jne     .L3
> > >> > >
> > >> > > for the fully masked loop body and for the masked epilogue
> > >> > > we see
> > >> > >
> > >> > > .L4:
> > >> > >         vmovdqu (%rsi,%rax), %ymm3
> > >> > >         vpaddd  (%rdi,%rax), %ymm3, %ymm0
> > >> > >         vmovdqu %ymm0, (%rsi,%rax)
> > >> > >         addq    $32, %rax
> > >> > >         cmpq    %rax, %rcx
> > >> > >         jne     .L4
> > >> > >         movl    %edx, %eax
> > >> > >         andl    $-8, %eax
> > >> > >         testb   $7, %dl
> > >> > >         je      .L11
> > >> > > .L3:
> > >> > >         subl    %eax, %edx
> > >> > >         vmovdqa .LC0(%rip), %ymm1
> > >> > >         salq    $2, %rax
> > >> > >         vmovd   %edx, %xmm0
> > >> > >         movl    $-2147483648, %edx
> > >> > >         addq    %rax, %rsi
> > >> > >         vmovd   %edx, %xmm15
> > >> > >         vpbroadcastd    %xmm0, %ymm0
> > >> > >         vpbroadcastd    %xmm15, %ymm15
> > >> > >         vpsubd  %ymm15, %ymm1, %ymm1
> > >> > >         vpsubd  %ymm15, %ymm0, %ymm0
> > >> > >         vpcmpgtd        %ymm1, %ymm0, %ymm0
> > >> > >         vpmaskmovd      (%rsi), %ymm0, %ymm1
> > >> > >         vpmaskmovd      (%rdi,%rax), %ymm0, %ymm2
> > >> > >         vpaddd  %ymm2, %ymm1, %ymm1
> > >> > >         vpmaskmovd      %ymm1, %ymm0, (%rsi)
> > >> > > .L11:
> > >> > >         vzeroupper
> > >> > >
> > >> > > compared to
> > >> > >
> > >> > > .L3:
> > >> > >         movl    %edx, %r8d
> > >> > >         subl    %eax, %r8d
> > >> > >         leal    -1(%r8), %r9d
> > >> > >         cmpl    $2, %r9d
> > >> > >         jbe     .L6
> > >> > >         leaq    (%rcx,%rax,4), %r9
> > >> > >         vmovdqu (%rdi,%rax,4), %xmm2
> > >> > >         movl    %r8d, %eax
> > >> > >         andl    $-4, %eax
> > >> > >         vpaddd  (%r9), %xmm2, %xmm0
> > >> > >         addl    %eax, %esi
> > >> > >         andl    $3, %r8d
> > >> > >         vmovdqu %xmm0, (%r9)
> > >> > >         je      .L2
> > >> > > .L6:
> > >> > >         movslq  %esi, %r8
> > >> > >         leaq    0(,%r8,4), %rax
> > >> > >         movl    (%rdi,%r8,4), %r8d
> > >> > >         addl    %r8d, (%rcx,%rax)
> > >> > >         leal    1(%rsi), %r8d
> > >> > >         cmpl    %r8d, %edx
> > >> > >         jle     .L2
> > >> > >         addl    $2, %esi
> > >> > >         movl    4(%rdi,%rax), %r8d
> > >> > >         addl    %r8d, 4(%rcx,%rax)
> > >> > >         cmpl    %esi, %edx
> > >> > >         jle     .L2
> > >> > >         movl    8(%rdi,%rax), %edx
> > >> > >         addl    %edx, 8(%rcx,%rax)
> > >> > > .L2:
> > >> > >
> > >> > > I'm giving this a little testing right now but will dig on why
> > >> > > I don't get masked loops when AVX512 is enabled.
> > >> >
> > >> > Ah, a simple thinko - rgroup_controls vectypes seem to be
> > >> > always VECTOR_BOOLEAN_TYPE_P and thus we can
> > >> > use expand_vec_cmp_expr_p.  The AVX512 fully masked
> > >> > loop then looks like
> > >> >
> > >> > .L3:
> > >> >         vmovdqu32       (%rsi,%rax,4), %ymm2{%k1}
> > >> >         vmovdqu32       (%rdi,%rax,4), %ymm1{%k1}
> > >> >         vpaddd  %ymm2, %ymm1, %ymm0
> > >> >         vmovdqu32       %ymm0, (%rsi,%rax,4){%k1}
> > >> >         addq    $8, %rax
> > >> >         vpbroadcastd    %eax, %ymm0
> > >> >         vpaddd  %ymm4, %ymm0, %ymm0
> > >> >         vpcmpud $6, %ymm0, %ymm3, %k1
> > >> >         kortestb        %k1, %k1
> > >> >         jne     .L3
> > >> >
> > >> > I guess for x86 it's not worth preserving the VEC_COND_EXPR
> > >> > mask generation but other archs may not provide all required vec_cmp
> > >> > expanders.
> > >> 
> > >> For the main loop, the full-masked loop's codegen seems much worse.
> > >> Basically, we need at least 4 instructions to do what while_ult in arm does.
> > >> 
> > >>          vpbroadcastd    %eax, %ymm0
> > >>          vpaddd  %ymm4, %ymm0, %ymm0
> > >>          vpcmpud $6, %ymm0, %ymm3, %k1
> > >>          kortestb        %k1, %k1
> > >> vs
> > >>        whilelo(or some other while<op>)
> > >> 
> > >> more instructions are needed for avx2 since there's no direct
> > >> instruction for .COND_{ADD,SUB..}
> > >> 
> > >> original
> > >> .L4:
> > >>         vmovdqu (%rcx,%rax), %ymm1
> > >>         vpaddd (%rdi,%rax), %ymm1, %ymm0
> > >>         vmovdqu %ymm0, (%rcx,%rax)
> > >>         addq $32, %rax
> > >>         cmpq %rax, %rsi
> > >>         jne .L4
> > >> 
> > >> vs
> > >> avx512 full-masked loop
> > >> .L3:
> > >>          vmovdqu32       (%rsi,%rax,4), %ymm2{%k1}
> > >>          vmovdqu32       (%rdi,%rax,4), %ymm1{%k1}
> > >>          vpaddd  %ymm2, %ymm1, %ymm0
> > >>          vmovdqu32       %ymm0, (%rsi,%rax,4){%k1}
> > >>          addq    $8, %rax
> > >>          vpbroadcastd    %eax, %ymm0
> > >>          vpaddd  %ymm4, %ymm0, %ymm0
> > >>          vpcmpud $6, %ymm0, %ymm3, %k1
> > >>          kortestb        %k1, %k1
> > >>          jne     .L3
> > >> 
> > >> vs
> > >> avx2 full-masked loop
> > >> .L3:
> > >>          vpmaskmovd      (%rsi,%rax), %ymm0, %ymm3
> > >>          vpmaskmovd      (%rdi,%rax), %ymm0, %ymm1
> > >>          addl    $8, %edx
> > >>          vpaddd  %ymm3, %ymm1, %ymm1
> > >>          vpmaskmovd      %ymm1, %ymm0, (%rsi,%rax)
> > >>          vmovd   %edx, %xmm1
> > >>          vpsubd  %ymm15, %ymm2, %ymm0
> > >>          addq    $32, %rax
> > >>          vpbroadcastd    %xmm1, %ymm1
> > >>          vpaddd  %ymm4, %ymm1, %ymm1
> > >>          vpsubd  %ymm15, %ymm1, %ymm1
> > >>          vpcmpgtd        %ymm1, %ymm0, %ymm0
> > >>         vptest  %ymm0, %ymm0
> > >>          jne     .L3
> > >> 
> > >> vs  arm64's code
> > >> 
> > >> .L3:
> > >>     ld1w z1.s, p0/z, [x1, x3, lsl 2]
> > >>     ld1w z0.s, p0/z, [x0, x3, lsl 2]
> > >>     add z0.s, z0.s, z1.s
> > >>     st1w z0.s, p0, [x1, x3, lsl 2]
> > >>     add x3, x3, x4
> > >>     whilelo p0.s, w3, w2
> > >>     b.any .L3
> > >
> > > Yes, that's true - it might still be OK for vectorizing epilogues
> > > and thus --param vect-partial-vector-usage=1
> > >
> > > Can AVX512 do any better than this?
> > >
> > >         vpbroadcastd    %eax, %ymm0
> > >         vpaddd  %ymm4, %ymm0, %ymm0
> > >         vpcmpud $6, %ymm0, %ymm3, %k1
> > >
> > > Note with multiple types involved things get even worse
> > > since you need masks for each vector mode.  But as far as
> > > I can see that's the same for SVE (but there we have the
> > > single-instruction whilelo).  I guess we'll also generate
> > > wrong code at the moment for the case where we need
> > > multiple vectors to hold the full mask - vect_gen_while
> > > doesn't seem to be prepared for this?
> > >
> > > So with
> > >
> > > int foo (unsigned long *a, unsigned * __restrict b, int n)
> > > {
> > >   unsigned sum = 1;
> > >   for (int i = 0; i < n; ++i)
> > >     {
> > >       b[i] += a[i];
> > >     }
> > >   return sum;
> > > }
> > >
> > > SVE uses
> > >
> > > .L3:
> > >         ld1d    z0.d, p0/z, [x1, x3, lsl 3]
> > >         ld1d    z1.d, p0/z, [x0, x3, lsl 3]
> > >         adr     z0.d, [z0.d, z1.d, lsl 2]
> > >         st1d    z0.d, p0, [x1, x3, lsl 3]
> > >         add     x3, x3, x4
> > >         whilelo p0.d, w3, w2
> > >         b.any   .L3
> > >
> > > so p0 vs. p0/z, whatever that means and it looks like
> > > the vector add can somehow concatenate z0.d and z1.d.
> > > Truly fascinating ;)
> > 
> > It looks like this is from a version in which “b” was also long.
> > For the loop above I get:
> > 
> > .L3:
> >         ld1d    z0.d, p0/z, [x0, x3, lsl 3]
> >         ld1w    z1.d, p0/z, [x1, x3, lsl 2]
> >         add     z0.s, z0.s, z1.s
> >         st1w    z0.d, p0, [x1, x3, lsl 2]
> >         add     x3, x3, x4
> >         whilelo p0.d, w3, w2
> >         b.any   .L3
> > 
> > where the ld1w is an extending load and the st1w is a truncating store.
> > 
> > But yeah, there's a hard-coded assumption that a mask for 2 SIs
> > can also control 1 DI, etc.  For example:
> > 
> > void foo (unsigned long *a, unsigned * __restrict b, int n)
> > {
> >   for (int i = 0; i < n; ++i)
> >     {
> >       a[i * 2] += 1;
> >       a[i * 2 + 1] += 2;
> >       b[i * 4] += 1;
> >       b[i * 4 + 1] += 2;
> >       b[i * 4 + 2] += 3;
> >       b[i * 4 + 3] += 4;
> >     }
> > }
> > 
> > becomes:
> > 
> > .L3:
> >         ld1d    z0.d, p0/z, [x0]
> >         add     z0.d, z0.d, z2.d
> >         st1d    z0.d, p0, [x0]
> >         ld1w    z0.s, p0/z, [x1, x3, lsl 2]
> >         add     z0.s, z0.s, z1.s
> >         st1w    z0.s, p0, [x1, x3, lsl 2]
> >         add     x0, x0, x4
> >         add     x3, x3, x5
> >         whilelo p0.s, x3, x2
> >         b.any   .L3
> > 
> > This is explained more in the big comment above rgroup_controls
> > (guess you've already seen it).
> 
> OK, guess I was more looking at
> 
> #define N 32
> int foo (unsigned long *a, unsigned long * __restrict b,
>          unsigned int *c, unsigned int * __restrict d,
>          int n)
> {
>   unsigned sum = 1;
>   for (int i = 0; i < n; ++i)
>     {
>       b[i] += a[i];
>       d[i] += c[i];
>     }
>   return sum;
> }
> 
> where on x86 with AVX512 we vectorize with V8DI and V16SI and
> generate two masks for the two copies of V8DI (VF is 16) and one
> mask for V16SI.  With SVE I see
> 
>         punpklo p1.h, p0.b
>         punpkhi p2.h, p0.b
> 
> that's something I expected to see for AVX512 as well, using the V16SI
> mask and unpacking that to two V8DI ones.  But I see
> 
>         vpbroadcastd    %eax, %ymm0
>         vpaddd  %ymm12, %ymm0, %ymm0
>         vpcmpud $6, %ymm0, %ymm11, %k3
>         vpbroadcastd    %eax, %xmm0
>         vpaddd  %xmm10, %xmm0, %xmm0
>         vpcmpud $1, %xmm7, %xmm0, %k1
>         vpcmpud $6, %xmm0, %xmm8, %k2
>         kortestb        %k1, %k1
>         jne     .L3
> 
> so three %k masks are generated by vpcmpud.  I'll have to look at what
> the magic for SVE is and why that doesn't trigger for x86 here.

So, to answer myself: vect_maybe_permute_loop_masks looks for
vec_unpacku_hi/lo_optab, but with AVX512 the vector bools have
QImode, so that doesn't play well here.  I'm not sure whether there
are proper mask instructions to use (I guess there's a shift,
and the low part is free).  This is a QI:8 to two QI:4 (bits) mask
conversion.  I'm not sure how best to ask the target here - again,
VnBImode might have been easier.
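
A plain C sketch of that conversion, just to make concrete that the low
part is free and the high part is a single shift (this only models the
mask bits, assuming the usual one-bit-per-lane kmask layout; it is not
a proposal for the optab interface):

#include <stdint.h>

/* QI:8 loop mask (one bit per scalar iteration) -> the two QI:4 masks
   for the two DI-mode vector copies.  The low part needs no instruction
   because the consumers only look at the low 4 bits; the high part is
   a single right shift (kshiftrb-style).  */

static inline void
split_qi8_mask (uint8_t k, uint8_t *lo, uint8_t *hi)
{
  *lo = k;
  *hi = (uint8_t) (k >> 4);
}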

Something makes the loop not get handled (masked) at all with AVX2.

Richard.

> Richard.
> 
> > > It looks like --param vect_partial_vector_usage defaults to 2,
> > > power forces it to 1 (power10) or 0 otherwise.
> > >
> > > I think we'd need a target hook that toggles this per mode
> > > so we could tune this dependent on AVX512 vectorization vs. AVX2.
> > 
> > Yeah, keying it off the architecture made sense for Power because the
> > only purpose of len_load/store is to support partial vectorisation.
> > But I agree a hook is needed now that we have MASK_LOAD/STORE for
> > general use and are emulating WHILE_ULT.
> > 
> > Thanks,
> > Richard
> > 
> 
>
Hongtao Liu July 16, 2021, 1:46 a.m. UTC | #9
On Thu, Jul 15, 2021 at 7:48 PM Richard Biener <rguenther@suse.de> wrote:
>
> On Thu, 15 Jul 2021, Hongtao Liu wrote:
>
> > On Thu, Jul 15, 2021 at 6:45 PM Richard Biener via Gcc-patches
> > <gcc-patches@gcc.gnu.org> wrote:
> > >
> > > On Thu, Jul 15, 2021 at 12:30 PM Richard Biener <rguenther@suse.de> wrote:
> > > >
> > > > The following extends the existing loop masking support using
> > > > SVE WHILE_ULT to x86 by proving an alternate way to produce the
> > > > mask using VEC_COND_EXPRs.  So with --param vect-partial-vector-usage
> > > > you can now enable masked vectorized epilogues (=1) or fully
> > > > masked vector loops (=2).
> > > >
> > > > What's missing is using a scalar IV for the loop control
> > > > (but in principle AVX512 can use the mask here - just the patch
> > > > doesn't seem to work for AVX512 yet for some reason - likely
> > > > expand_vec_cond_expr_p doesn't work there).  What's also missing
> > > > is providing more support for predicated operations in the case
> > > > of reductions either via VEC_COND_EXPRs or via implementing
> > > > some of the .COND_{ADD,SUB,MUL...} internal functions as mapping
> > > > to masked AVX512 operations.
> > > >
> > > > For AVX2 and
> > > >
> > > > int foo (unsigned *a, unsigned * __restrict b, int n)
> > > > {
> > > >   unsigned sum = 1;
> > > >   for (int i = 0; i < n; ++i)
> > > >     b[i] += a[i];
> > > >   return sum;
> > > > }
> > > >
> > > > we get
> > > >
> > > > .L3:
> > > >         vpmaskmovd      (%rsi,%rax), %ymm0, %ymm3
> > > >         vpmaskmovd      (%rdi,%rax), %ymm0, %ymm1
> > > >         addl    $8, %edx
> > > >         vpaddd  %ymm3, %ymm1, %ymm1
> > > >         vpmaskmovd      %ymm1, %ymm0, (%rsi,%rax)
> > > >         vmovd   %edx, %xmm1
> > > >         vpsubd  %ymm15, %ymm2, %ymm0
> > > >         addq    $32, %rax
> > > >         vpbroadcastd    %xmm1, %ymm1
> > > >         vpaddd  %ymm4, %ymm1, %ymm1
> > > >         vpsubd  %ymm15, %ymm1, %ymm1
> > > >         vpcmpgtd        %ymm1, %ymm0, %ymm0
> > > >         vptest  %ymm0, %ymm0
> > > >         jne     .L3
> > > >
> > > > for the fully masked loop body and for the masked epilogue
> > > > we see
> > > >
> > > > .L4:
> > > >         vmovdqu (%rsi,%rax), %ymm3
> > > >         vpaddd  (%rdi,%rax), %ymm3, %ymm0
> > > >         vmovdqu %ymm0, (%rsi,%rax)
> > > >         addq    $32, %rax
> > > >         cmpq    %rax, %rcx
> > > >         jne     .L4
> > > >         movl    %edx, %eax
> > > >         andl    $-8, %eax
> > > >         testb   $7, %dl
> > > >         je      .L11
> > > > .L3:
> > > >         subl    %eax, %edx
> > > >         vmovdqa .LC0(%rip), %ymm1
> > > >         salq    $2, %rax
> > > >         vmovd   %edx, %xmm0
> > > >         movl    $-2147483648, %edx
> > > >         addq    %rax, %rsi
> > > >         vmovd   %edx, %xmm15
> > > >         vpbroadcastd    %xmm0, %ymm0
> > > >         vpbroadcastd    %xmm15, %ymm15
> > > >         vpsubd  %ymm15, %ymm1, %ymm1
> > > >         vpsubd  %ymm15, %ymm0, %ymm0
> > > >         vpcmpgtd        %ymm1, %ymm0, %ymm0
> > > >         vpmaskmovd      (%rsi), %ymm0, %ymm1
> > > >         vpmaskmovd      (%rdi,%rax), %ymm0, %ymm2
> > > >         vpaddd  %ymm2, %ymm1, %ymm1
> > > >         vpmaskmovd      %ymm1, %ymm0, (%rsi)
> > > > .L11:
> > > >         vzeroupper
> > > >
> > > > compared to
> > > >
> > > > .L3:
> > > >         movl    %edx, %r8d
> > > >         subl    %eax, %r8d
> > > >         leal    -1(%r8), %r9d
> > > >         cmpl    $2, %r9d
> > > >         jbe     .L6
> > > >         leaq    (%rcx,%rax,4), %r9
> > > >         vmovdqu (%rdi,%rax,4), %xmm2
> > > >         movl    %r8d, %eax
> > > >         andl    $-4, %eax
> > > >         vpaddd  (%r9), %xmm2, %xmm0
> > > >         addl    %eax, %esi
> > > >         andl    $3, %r8d
> > > >         vmovdqu %xmm0, (%r9)
> > > >         je      .L2
> > > > .L6:
> > > >         movslq  %esi, %r8
> > > >         leaq    0(,%r8,4), %rax
> > > >         movl    (%rdi,%r8,4), %r8d
> > > >         addl    %r8d, (%rcx,%rax)
> > > >         leal    1(%rsi), %r8d
> > > >         cmpl    %r8d, %edx
> > > >         jle     .L2
> > > >         addl    $2, %esi
> > > >         movl    4(%rdi,%rax), %r8d
> > > >         addl    %r8d, 4(%rcx,%rax)
> > > >         cmpl    %esi, %edx
> > > >         jle     .L2
> > > >         movl    8(%rdi,%rax), %edx
> > > >         addl    %edx, 8(%rcx,%rax)
> > > > .L2:
> > > >
> > > > I'm giving this a little testing right now but will dig on why
> > > > I don't get masked loops when AVX512 is enabled.
> > >
> > > Ah, a simple thinko - rgroup_controls vectypes seem to be
> > > always VECTOR_BOOLEAN_TYPE_P and thus we can
> > > use expand_vec_cmp_expr_p.  The AVX512 fully masked
> > > loop then looks like
> > >
> > > .L3:
> > >         vmovdqu32       (%rsi,%rax,4), %ymm2{%k1}
> > >         vmovdqu32       (%rdi,%rax,4), %ymm1{%k1}
> > >         vpaddd  %ymm2, %ymm1, %ymm0
> > >         vmovdqu32       %ymm0, (%rsi,%rax,4){%k1}
> > >         addq    $8, %rax
> > >         vpbroadcastd    %eax, %ymm0
> > >         vpaddd  %ymm4, %ymm0, %ymm0
> > >         vpcmpud $6, %ymm0, %ymm3, %k1
> > >         kortestb        %k1, %k1
> > >         jne     .L3
> > >
> > > I guess for x86 it's not worth preserving the VEC_COND_EXPR
> > > mask generation but other archs may not provide all required vec_cmp
> > > expanders.
> >
> > For the main loop, the full-masked loop's codegen seems much worse.
> > Basically, we need at least 4 instructions to do what while_ult in arm does.
> >
> >          vpbroadcastd    %eax, %ymm0
> >          vpaddd  %ymm4, %ymm0, %ymm0
> >          vpcmpud $6, %ymm0, %ymm3, %k1
> >          kortestb        %k1, %k1
> > vs
> >        whilelo(or some other while<op>)
> >
> > more instructions are needed for avx2 since there's no direct
> > instruction for .COND_{ADD,SUB..}
> >
> > original
> > .L4:
> >         vmovdqu (%rcx,%rax), %ymm1
> >         vpaddd (%rdi,%rax), %ymm1, %ymm0
> >         vmovdqu %ymm0, (%rcx,%rax)
> >         addq $32, %rax
> >         cmpq %rax, %rsi
> >         jne .L4
> >
> > vs
> > avx512 full-masked loop
> > .L3:
> >          vmovdqu32       (%rsi,%rax,4), %ymm2{%k1}
> >          vmovdqu32       (%rdi,%rax,4), %ymm1{%k1}
> >          vpaddd  %ymm2, %ymm1, %ymm0
> >          vmovdqu32       %ymm0, (%rsi,%rax,4){%k1}
> >          addq    $8, %rax
> >          vpbroadcastd    %eax, %ymm0
> >          vpaddd  %ymm4, %ymm0, %ymm0
> >          vpcmpud $6, %ymm0, %ymm3, %k1
> >          kortestb        %k1, %k1
> >          jne     .L3
> >
> > vs
> > avx2 full-masked loop
> > .L3:
> >          vpmaskmovd      (%rsi,%rax), %ymm0, %ymm3
> >          vpmaskmovd      (%rdi,%rax), %ymm0, %ymm1
> >          addl    $8, %edx
> >          vpaddd  %ymm3, %ymm1, %ymm1
> >          vpmaskmovd      %ymm1, %ymm0, (%rsi,%rax)
> >          vmovd   %edx, %xmm1
> >          vpsubd  %ymm15, %ymm2, %ymm0
> >          addq    $32, %rax
> >          vpbroadcastd    %xmm1, %ymm1
> >          vpaddd  %ymm4, %ymm1, %ymm1
> >          vpsubd  %ymm15, %ymm1, %ymm1
> >          vpcmpgtd        %ymm1, %ymm0, %ymm0
> >         vptest  %ymm0, %ymm0
> >          jne     .L3
> >
> > vs  arm64's code
> >
> > .L3:
> >     ld1w z1.s, p0/z, [x1, x3, lsl 2]
> >     ld1w z0.s, p0/z, [x0, x3, lsl 2]
> >     add z0.s, z0.s, z1.s
> >     st1w z0.s, p0, [x1, x3, lsl 2]
> >     add x3, x3, x4
> >     whilelo p0.s, w3, w2
> >     b.any .L3
>
> Yes, that's true - it might still be OK for vectorizing epilogues
> and thus --param vect-partial-vector-usage=1
>
> Can AVX512 do any better than this?
>
>         vpbroadcastd    %eax, %ymm0
>         vpaddd  %ymm4, %ymm0, %ymm0
>         vpcmpud $6, %ymm0, %ymm3, %k1
>

We can hoist the vpbroadcastd out of the loop by defining
iv_vector as { start_index, start_index + 1, start_index + 2, ... }
and doing only the add and cmp in the loop body.
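
Roughly this shape, using GCC's generic vector extensions purely as a
model of what stays in the preheader versus the loop body (the names
and the elided masked memory accesses are just for illustration):

typedef unsigned int v8su __attribute__ ((vector_size (32)));
typedef int v8si __attribute__ ((vector_size (32)));

void
foo (unsigned int *a, unsigned int *__restrict b, int n)
{
  (void) a; (void) b;       /* loads/stores are elided in this sketch */
  /* Hoisted to the preheader: the series, the step and the broadcast
     of n.  */
  v8su iv = { 0, 1, 2, 3, 4, 5, 6, 7 };
  v8su step = { 8, 8, 8, 8, 8, 8, 8, 8 };
  v8su vn = { n, n, n, n, n, n, n, n };
  for (int i = 0; i < n; i += 8)
    {
      v8si mask = iv < vn;  /* the only compare left in the body */
      (void) mask;          /* the masked loads, add and store would
                               all be predicated on MASK */
      iv += step;           /* the only vector add left in the body */
    }
}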

> Note with multiple types involved things get even worse
> since you need masks for each vector mode.  But as far as
> I can see that's the same for SVE (but there we have the
> single-instruction whilelo).  I guess we'll also generate
> wrong code at the moment for the case where we need
> multiple vectors to hold the full mask - vect_gen_while
> doesn't seem to be prepared for this?
>
> So with
>
> int foo (unsigned long *a, unsigned * __restrict b, int n)
> {
>   unsigned sum = 1;
>   for (int i = 0; i < n; ++i)
>     {
>       b[i] += a[i];
>     }
>   return sum;
> }
>
> SVE uses
>
> .L3:
>         ld1d    z0.d, p0/z, [x1, x3, lsl 3]
>         ld1d    z1.d, p0/z, [x0, x3, lsl 3]
>         adr     z0.d, [z0.d, z1.d, lsl 2]
>         st1d    z0.d, p0, [x1, x3, lsl 3]
>         add     x3, x3, x4
>         whilelo p0.d, w3, w2
>         b.any   .L3
>
> so p0 vs. p0/z, whatever that means and it looks like
> the vector add can somehow concatenate z0.d and z1.d.
> Truly fascinating ;)
>
> It looks like --param vect_partial_vector_usage defaults to 2,
> power forces it to 1 (power10) or 0 otherwise.
>
> I think we'd need a target hook that toggles this per mode
> so we could tune this dependent on AVX512 vectorization vs. AVX2.
>
> The reason I even started looking at this is that we now have
> so many vector modes and end up with quite big code for
> vectorized epilogues.  And I do remember Intel folks contributing
> patches to do fully masked AVX512 loops as well.
>
> Boostrap/testing on x86_64-unknown-linux-gnu (with a slightly
> altered patch) reveals no fails besides some assembler scans.
>
> For reference the tested patch is below.
>
> Thanks,
> Richard.
>
> commit 221110851fafe17d5a351f1b2da3fc3a40e3b61a
> Author: Richard Biener <rguenther@suse.de>
> Date:   Thu Jul 15 12:15:18 2021 +0200
>
>     Add loop masking support for x86
>
>     The following extends the existing loop masking support using
>     SVE WHILE_ULT to x86 by proving an alternate way to produce the
>     mask using VEC_COND_EXPRs.  So with --param vect-partial-vector-usage
>     you can now enable masked vectorized epilogues (=1) or fully
>     masked vector loops (=2).
>
>     What's missing is using a scalar IV for the loop control in
>     case that's profitable - the mask generation can then move
>     from preheader + latch to the header.  But AVX2 and AVX512
>     can use vptest and kortestb just fine.
>
>     What's also missing is providing more support for predicated
>     operations in the case of reductions either via VEC_COND_EXPRs
>     or via implementing some of the .COND_{ADD,SUB,MUL...} internal
>     functions as mapping to masked AVX512 operations.
>
>     For AVX2 and
>
>     int foo (unsigned *a, unsigned * __restrict b, int n)
>     {
>       unsigned sum = 1;
>       for (int i = 0; i < n; ++i)
>         b[i] += a[i];
>       return sum;
>     }
>
>     we get
>
>     .L3:
>             vpmaskmovd      (%rsi,%rax), %ymm0, %ymm3
>             vpmaskmovd      (%rdi,%rax), %ymm0, %ymm1
>             addl    $8, %edx
>             vpaddd  %ymm3, %ymm1, %ymm1
>             vpmaskmovd      %ymm1, %ymm0, (%rsi,%rax)
>             vmovd   %edx, %xmm1
>             vpsubd  %ymm15, %ymm2, %ymm0
>             addq    $32, %rax
>             vpbroadcastd    %xmm1, %ymm1
>             vpaddd  %ymm4, %ymm1, %ymm1
>             vpsubd  %ymm15, %ymm1, %ymm1
>             vpcmpgtd        %ymm1, %ymm0, %ymm0
>             vptest  %ymm0, %ymm0
>             jne     .L3
>
>     for the fully masked loop body and for the masked epilogue
>     we see
>
>     .L4:
>             vmovdqu (%rsi,%rax), %ymm3
>             vpaddd  (%rdi,%rax), %ymm3, %ymm0
>             vmovdqu %ymm0, (%rsi,%rax)
>             addq    $32, %rax
>             cmpq    %rax, %rcx
>             jne     .L4
>             movl    %edx, %eax
>             andl    $-8, %eax
>             testb   $7, %dl
>             je      .L11
>     .L3:
>             subl    %eax, %edx
>             vmovdqa .LC0(%rip), %ymm1
>             salq    $2, %rax
>             vmovd   %edx, %xmm0
>             movl    $-2147483648, %edx
>             addq    %rax, %rsi
>             vmovd   %edx, %xmm15
>             vpbroadcastd    %xmm0, %ymm0
>             vpbroadcastd    %xmm15, %ymm15
>             vpsubd  %ymm15, %ymm1, %ymm1
>             vpsubd  %ymm15, %ymm0, %ymm0
>             vpcmpgtd        %ymm1, %ymm0, %ymm0
>             vpmaskmovd      (%rsi), %ymm0, %ymm1
>             vpmaskmovd      (%rdi,%rax), %ymm0, %ymm2
>             vpaddd  %ymm2, %ymm1, %ymm1
>             vpmaskmovd      %ymm1, %ymm0, (%rsi)
>     .L11:
>             vzeroupper
>
>     compared to
>
>     .L3:
>             movl    %edx, %r8d
>             subl    %eax, %r8d
>             leal    -1(%r8), %r9d
>             cmpl    $2, %r9d
>             jbe     .L6
>             leaq    (%rcx,%rax,4), %r9
>             vmovdqu (%rdi,%rax,4), %xmm2
>             movl    %r8d, %eax
>             andl    $-4, %eax
>             vpaddd  (%r9), %xmm2, %xmm0
>             addl    %eax, %esi
>             andl    $3, %r8d
>             vmovdqu %xmm0, (%r9)
>             je      .L2
>     .L6:
>             movslq  %esi, %r8
>             leaq    0(,%r8,4), %rax
>             movl    (%rdi,%r8,4), %r8d
>             addl    %r8d, (%rcx,%rax)
>             leal    1(%rsi), %r8d
>             cmpl    %r8d, %edx
>             jle     .L2
>             addl    $2, %esi
>             movl    4(%rdi,%rax), %r8d
>             addl    %r8d, 4(%rcx,%rax)
>             cmpl    %esi, %edx
>             jle     .L2
>             movl    8(%rdi,%rax), %edx
>             addl    %edx, 8(%rcx,%rax)
>     .L2:
>
>     The AVX512 fully masked loop would be
>
>             vmovdqa .LC0(%rip), %ymm4
>             vpbroadcastd    %edx, %ymm3
>             vpcmpud $6, %ymm4, %ymm3, %k1
>             xorl    %eax, %eax
>             .p2align 4,,10
>             .p2align 3
>     .L3:
>             vmovdqu32       (%rsi,%rax,4), %ymm2{%k1}
>             vmovdqu32       (%rdi,%rax,4), %ymm1{%k1}
>             vpaddd  %ymm2, %ymm1, %ymm0
>             vmovdqu32       %ymm0, (%rsi,%rax,4){%k1}
>             addq    $8, %rax
>             vpbroadcastd    %eax, %ymm0
>             vpaddd  %ymm4, %ymm0, %ymm0
>             vpcmpud $6, %ymm0, %ymm3, %k1
>             kortestb        %k1, %k1
>             jne     .L3
>
>     loop control using %rax would likely be more latency friendly
>     here and the mask generation could be unified to a single place.
>
>     2021-07-15  Richard Biener  <rguenther@suse.de>
>
>             * tree-vect-loop.c (can_produce_all_loop_masks_p): We
>             can also produce masks with VEC_COND_EXPRs.
>             * tree-vect-stmts.c (vect_gen_while): Generate the mask
>             with a VEC_COND_EXPR in case WHILE_ULT is not supported.
>
> diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
> index fc3dab0d143..230d6e34208 100644
> --- a/gcc/tree-vect-loop.c
> +++ b/gcc/tree-vect-loop.c
> @@ -975,11 +975,17 @@ can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
>  {
>    rgroup_controls *rgm;
>    unsigned int i;
> +  tree cmp_vectype;
>    FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
>      if (rgm->type != NULL_TREE
>         && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
>                                             cmp_type, rgm->type,
> -                                           OPTIMIZE_FOR_SPEED))
> +                                           OPTIMIZE_FOR_SPEED)
> +       && ((cmp_vectype = build_vector_type
> +                            (cmp_type, TYPE_VECTOR_SUBPARTS (rgm->type))),
> +           true)
> +       && !(VECTOR_BOOLEAN_TYPE_P (rgm->type)
> +            && expand_vec_cmp_expr_p (cmp_vectype, rgm->type, LT_EXPR)))
>        return false;
>    return true;
>  }
> diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
> index 6a25d661800..18c4c66cb2d 100644
> --- a/gcc/tree-vect-stmts.c
> +++ b/gcc/tree-vect-stmts.c
> @@ -12007,16 +12007,43 @@ vect_gen_while (gimple_seq *seq, tree mask_type, tree start_index,
>                 tree end_index, const char *name)
>  {
>    tree cmp_type = TREE_TYPE (start_index);
> -  gcc_checking_assert (direct_internal_fn_supported_p (IFN_WHILE_ULT,
> -                                                      cmp_type, mask_type,
> -                                                      OPTIMIZE_FOR_SPEED));
> -  gcall *call = gimple_build_call_internal (IFN_WHILE_ULT, 3,
> -                                           start_index, end_index,
> -                                           build_zero_cst (mask_type));
> -  tree tmp = make_temp_ssa_name (mask_type, NULL, name);
> -  gimple_call_set_lhs (call, tmp);
> -  gimple_seq_add_stmt (seq, call);
> -  return tmp;
> +  if (direct_internal_fn_supported_p (IFN_WHILE_ULT,
> +                                     cmp_type, mask_type,
> +                                     OPTIMIZE_FOR_SPEED))
> +    {
> +      gcall *call = gimple_build_call_internal (IFN_WHILE_ULT, 3,
> +                                               start_index, end_index,
> +                                               build_zero_cst (mask_type));
> +      tree tmp = make_temp_ssa_name (mask_type, NULL, name);
> +      gimple_call_set_lhs (call, tmp);
> +      gimple_seq_add_stmt (seq, call);
> +      return tmp;
> +    }
> +  else
> +    {
> +      /* Generate
> +          _1 = { start_index, start_index, ... };
> +          _2 = { end_index, end_index, ... };
> +          _3 = _1 + { 0, 1, 2 ... };
> +          _4 = _3 < _2;  */
> +      tree cvectype = build_vector_type (cmp_type,
> +                                        TYPE_VECTOR_SUBPARTS (mask_type));
> +      gcc_assert (VECTOR_BOOLEAN_TYPE_P (mask_type)
> +                 && expand_vec_cmp_expr_p (cvectype, mask_type, LT_EXPR));
> +      tree si = make_ssa_name (cvectype);
> +      gassign *ass = gimple_build_assign
> +                       (si, build_vector_from_val (cvectype, start_index));
> +      gimple_seq_add_stmt (seq, ass);
> +      tree ei = make_ssa_name (cvectype);
> +      ass = gimple_build_assign (ei,
> +                                build_vector_from_val (cvectype, end_index));
> +      gimple_seq_add_stmt (seq, ass);
> +      tree incr = build_vec_series (cvectype, build_zero_cst (cmp_type),
> +                                   build_one_cst (cmp_type));
> +      si = gimple_build (seq, PLUS_EXPR, cvectype, si, incr);
> +      return gimple_build (seq, LT_EXPR, truth_type_for (cvectype),
> +                          si, ei);
> +    }
>  }
>
>  /* Generate a vector mask of type MASK_TYPE for which index I is false iff
Richard Biener July 16, 2021, 6:09 a.m. UTC | #10
On Fri, 16 Jul 2021, Hongtao Liu wrote:

> On Thu, Jul 15, 2021 at 7:48 PM Richard Biener <rguenther@suse.de> wrote:
> >
> > On Thu, 15 Jul 2021, Hongtao Liu wrote:
> >
> > > On Thu, Jul 15, 2021 at 6:45 PM Richard Biener via Gcc-patches
> > > <gcc-patches@gcc.gnu.org> wrote:
> > > >
> > > > On Thu, Jul 15, 2021 at 12:30 PM Richard Biener <rguenther@suse.de> wrote:
> > > > >
> > > > > The following extends the existing loop masking support using
> > > > > SVE WHILE_ULT to x86 by proving an alternate way to produce the
> > > > > mask using VEC_COND_EXPRs.  So with --param vect-partial-vector-usage
> > > > > you can now enable masked vectorized epilogues (=1) or fully
> > > > > masked vector loops (=2).
> > > > >
> > > > > What's missing is using a scalar IV for the loop control
> > > > > (but in principle AVX512 can use the mask here - just the patch
> > > > > doesn't seem to work for AVX512 yet for some reason - likely
> > > > > expand_vec_cond_expr_p doesn't work there).  What's also missing
> > > > > is providing more support for predicated operations in the case
> > > > > of reductions either via VEC_COND_EXPRs or via implementing
> > > > > some of the .COND_{ADD,SUB,MUL...} internal functions as mapping
> > > > > to masked AVX512 operations.
> > > > >
> > > > > For AVX2 and
> > > > >
> > > > > int foo (unsigned *a, unsigned * __restrict b, int n)
> > > > > {
> > > > >   unsigned sum = 1;
> > > > >   for (int i = 0; i < n; ++i)
> > > > >     b[i] += a[i];
> > > > >   return sum;
> > > > > }
> > > > >
> > > > > we get
> > > > >
> > > > > .L3:
> > > > >         vpmaskmovd      (%rsi,%rax), %ymm0, %ymm3
> > > > >         vpmaskmovd      (%rdi,%rax), %ymm0, %ymm1
> > > > >         addl    $8, %edx
> > > > >         vpaddd  %ymm3, %ymm1, %ymm1
> > > > >         vpmaskmovd      %ymm1, %ymm0, (%rsi,%rax)
> > > > >         vmovd   %edx, %xmm1
> > > > >         vpsubd  %ymm15, %ymm2, %ymm0
> > > > >         addq    $32, %rax
> > > > >         vpbroadcastd    %xmm1, %ymm1
> > > > >         vpaddd  %ymm4, %ymm1, %ymm1
> > > > >         vpsubd  %ymm15, %ymm1, %ymm1
> > > > >         vpcmpgtd        %ymm1, %ymm0, %ymm0
> > > > >         vptest  %ymm0, %ymm0
> > > > >         jne     .L3
> > > > >
> > > > > for the fully masked loop body and for the masked epilogue
> > > > > we see
> > > > >
> > > > > .L4:
> > > > >         vmovdqu (%rsi,%rax), %ymm3
> > > > >         vpaddd  (%rdi,%rax), %ymm3, %ymm0
> > > > >         vmovdqu %ymm0, (%rsi,%rax)
> > > > >         addq    $32, %rax
> > > > >         cmpq    %rax, %rcx
> > > > >         jne     .L4
> > > > >         movl    %edx, %eax
> > > > >         andl    $-8, %eax
> > > > >         testb   $7, %dl
> > > > >         je      .L11
> > > > > .L3:
> > > > >         subl    %eax, %edx
> > > > >         vmovdqa .LC0(%rip), %ymm1
> > > > >         salq    $2, %rax
> > > > >         vmovd   %edx, %xmm0
> > > > >         movl    $-2147483648, %edx
> > > > >         addq    %rax, %rsi
> > > > >         vmovd   %edx, %xmm15
> > > > >         vpbroadcastd    %xmm0, %ymm0
> > > > >         vpbroadcastd    %xmm15, %ymm15
> > > > >         vpsubd  %ymm15, %ymm1, %ymm1
> > > > >         vpsubd  %ymm15, %ymm0, %ymm0
> > > > >         vpcmpgtd        %ymm1, %ymm0, %ymm0
> > > > >         vpmaskmovd      (%rsi), %ymm0, %ymm1
> > > > >         vpmaskmovd      (%rdi,%rax), %ymm0, %ymm2
> > > > >         vpaddd  %ymm2, %ymm1, %ymm1
> > > > >         vpmaskmovd      %ymm1, %ymm0, (%rsi)
> > > > > .L11:
> > > > >         vzeroupper
> > > > >
> > > > > compared to
> > > > >
> > > > > .L3:
> > > > >         movl    %edx, %r8d
> > > > >         subl    %eax, %r8d
> > > > >         leal    -1(%r8), %r9d
> > > > >         cmpl    $2, %r9d
> > > > >         jbe     .L6
> > > > >         leaq    (%rcx,%rax,4), %r9
> > > > >         vmovdqu (%rdi,%rax,4), %xmm2
> > > > >         movl    %r8d, %eax
> > > > >         andl    $-4, %eax
> > > > >         vpaddd  (%r9), %xmm2, %xmm0
> > > > >         addl    %eax, %esi
> > > > >         andl    $3, %r8d
> > > > >         vmovdqu %xmm0, (%r9)
> > > > >         je      .L2
> > > > > .L6:
> > > > >         movslq  %esi, %r8
> > > > >         leaq    0(,%r8,4), %rax
> > > > >         movl    (%rdi,%r8,4), %r8d
> > > > >         addl    %r8d, (%rcx,%rax)
> > > > >         leal    1(%rsi), %r8d
> > > > >         cmpl    %r8d, %edx
> > > > >         jle     .L2
> > > > >         addl    $2, %esi
> > > > >         movl    4(%rdi,%rax), %r8d
> > > > >         addl    %r8d, 4(%rcx,%rax)
> > > > >         cmpl    %esi, %edx
> > > > >         jle     .L2
> > > > >         movl    8(%rdi,%rax), %edx
> > > > >         addl    %edx, 8(%rcx,%rax)
> > > > > .L2:
> > > > >
> > > > > I'm giving this a little testing right now but will dig on why
> > > > > I don't get masked loops when AVX512 is enabled.
> > > >
> > > > Ah, a simple thinko - rgroup_controls vectypes seem to be
> > > > always VECTOR_BOOLEAN_TYPE_P and thus we can
> > > > use expand_vec_cmp_expr_p.  The AVX512 fully masked
> > > > loop then looks like
> > > >
> > > > .L3:
> > > >         vmovdqu32       (%rsi,%rax,4), %ymm2{%k1}
> > > >         vmovdqu32       (%rdi,%rax,4), %ymm1{%k1}
> > > >         vpaddd  %ymm2, %ymm1, %ymm0
> > > >         vmovdqu32       %ymm0, (%rsi,%rax,4){%k1}
> > > >         addq    $8, %rax
> > > >         vpbroadcastd    %eax, %ymm0
> > > >         vpaddd  %ymm4, %ymm0, %ymm0
> > > >         vpcmpud $6, %ymm0, %ymm3, %k1
> > > >         kortestb        %k1, %k1
> > > >         jne     .L3
> > > >
> > > > I guess for x86 it's not worth preserving the VEC_COND_EXPR
> > > > mask generation but other archs may not provide all required vec_cmp
> > > > expanders.
> > >
> > > For the main loop, the full-masked loop's codegen seems much worse.
> > > Basically, we need at least 4 instructions to do what while_ult in arm does.
> > >
> > >          vpbroadcastd    %eax, %ymm0
> > >          vpaddd  %ymm4, %ymm0, %ymm0
> > >          vpcmpud $6, %ymm0, %ymm3, %k1
> > >          kortestb        %k1, %k1
> > > vs
> > >        whilelo(or some other while<op>)
> > >
> > > more instructions are needed for avx2 since there's no direct
> > > instruction for .COND_{ADD,SUB..}
> > >
> > > original
> > > .L4:
> > >         vmovdqu (%rcx,%rax), %ymm1
> > >         vpaddd (%rdi,%rax), %ymm1, %ymm0
> > >         vmovdqu %ymm0, (%rcx,%rax)
> > >         addq $32, %rax
> > >         cmpq %rax, %rsi
> > >         jne .L4
> > >
> > > vs
> > > avx512 full-masked loop
> > > .L3:
> > >          vmovdqu32       (%rsi,%rax,4), %ymm2{%k1}
> > >          vmovdqu32       (%rdi,%rax,4), %ymm1{%k1}
> > >          vpaddd  %ymm2, %ymm1, %ymm0
> > >          vmovdqu32       %ymm0, (%rsi,%rax,4){%k1}
> > >          addq    $8, %rax
> > >          vpbroadcastd    %eax, %ymm0
> > >          vpaddd  %ymm4, %ymm0, %ymm0
> > >          vpcmpud $6, %ymm0, %ymm3, %k1
> > >          kortestb        %k1, %k1
> > >          jne     .L3
> > >
> > > vs
> > > avx2 full-masked loop
> > > .L3:
> > >          vpmaskmovd      (%rsi,%rax), %ymm0, %ymm3
> > >          vpmaskmovd      (%rdi,%rax), %ymm0, %ymm1
> > >          addl    $8, %edx
> > >          vpaddd  %ymm3, %ymm1, %ymm1
> > >          vpmaskmovd      %ymm1, %ymm0, (%rsi,%rax)
> > >          vmovd   %edx, %xmm1
> > >          vpsubd  %ymm15, %ymm2, %ymm0
> > >          addq    $32, %rax
> > >          vpbroadcastd    %xmm1, %ymm1
> > >          vpaddd  %ymm4, %ymm1, %ymm1
> > >          vpsubd  %ymm15, %ymm1, %ymm1
> > >          vpcmpgtd        %ymm1, %ymm0, %ymm0
> > >         vptest  %ymm0, %ymm0
> > >          jne     .L3
> > >
> > > vs  arm64's code
> > >
> > > .L3:
> > >     ld1w z1.s, p0/z, [x1, x3, lsl 2]
> > >     ld1w z0.s, p0/z, [x0, x3, lsl 2]
> > >     add z0.s, z0.s, z1.s
> > >     st1w z0.s, p0, [x1, x3, lsl 2]
> > >     add x3, x3, x4
> > >     whilelo p0.s, w3, w2
> > >     b.any .L3
> >
> > Yes, that's true - it might still be OK for vectorizing epilogues
> > and thus --param vect-partial-vector-usage=1
> >
> > Can AVX512 do any better than this?
> >
> >         vpbroadcastd    %eax, %ymm0
> >         vpaddd  %ymm4, %ymm0, %ymm0
> >         vpcmpud $6, %ymm0, %ymm3, %k1
> >
> 
> we can hoist vpbroadcastd out of the loop by define
> iv_vector as {start_index, start_index + 1, start_index +2, ... }
> and do add and cmp in loop body.

For the fully masked case I'm also thinking about biasing the
loop to perform masked pre-loop iterations to align the
last vector iteration to cover a full vector.  Then we'd have
initial iteration mask setup in the preheader and in the loop
itself we'd use the scalar IV and simply set the masks to
all ones before the jump.  Looks like kxnor %k, %k, %k should
do the trick here.
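
To make that concrete, here is a rough hand-written C sketch of such
a biased loop for the foo example, using AVX-512VL intrinsics (the
foo_biased name and the exact shape are just for illustration, not
what the vectorizer would emit):

#include <immintrin.h>

void foo_biased (unsigned *a, unsigned * __restrict b, int n)
{
  int rem = n % 8;      /* assumes n >= 0 and VF == 8 */
  int i = 0;
  if (rem)
    {
      /* Masked pre-loop iteration covering the first rem elements.  */
      __mmask8 k = (__mmask8) ((1u << rem) - 1);
      __m256i va = _mm256_maskz_loadu_epi32 (k, a);
      __m256i vb = _mm256_maskz_loadu_epi32 (k, b);
      _mm256_mask_storeu_epi32 (b, k, _mm256_add_epi32 (va, vb));
      i = rem;
    }
  /* The remaining trip count is a multiple of 8, so the loop body
     only ever needs an all-ones mask - that is what kxnor would
     provide in the masked variant.  */
  for (; i < n; i += 8)
    {
      __m256i va = _mm256_loadu_si256 ((__m256i const *) (a + i));
      __m256i vb = _mm256_loadu_si256 ((__m256i const *) (b + i));
      _mm256_storeu_si256 ((__m256i *) (b + i),
                           _mm256_add_epi32 (va, vb));
    }
}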

We will of course misalign all known aligned accesses that way.

Anyway, it looks like AVX512 wasn't designed with full masking in
mind but more to cover additional cases of traditional if-conversion.
But maybe there's hope that an instruction like kwhileult will
appear in some future incarnation of it ;)

Just for fun I benchmarked the current patch on Zen2 with SPEC
CPU 2017, where the FP rate score drops about 10% overall, with the
worst case (503.bwaves, without -flto) being twice as slow.

Richard.

> > Note with multiple types involved things get even worse
> > since you need masks for each vector mode.  But as far as
> > I can see that's the same for SVE (but there we have the
> > single-instruction whilelo).  I guess we'll also generate
> > wrong code at the moment for the case where we need
> > multiple vectors to hold the full mask - vect_gen_while
> > doesn't seem to be prepared for this?
> >
> > So with
> >
> > int foo (unsigned long *a, unsigned * __restrict b, int n)
> > {
> >   unsigned sum = 1;
> >   for (int i = 0; i < n; ++i)
> >     {
> >       b[i] += a[i];
> >     }
> >   return sum;
> > }
> >
> > SVE uses
> >
> > .L3:
> >         ld1d    z0.d, p0/z, [x1, x3, lsl 3]
> >         ld1d    z1.d, p0/z, [x0, x3, lsl 3]
> >         adr     z0.d, [z0.d, z1.d, lsl 2]
> >         st1d    z0.d, p0, [x1, x3, lsl 3]
> >         add     x3, x3, x4
> >         whilelo p0.d, w3, w2
> >         b.any   .L3
> >
> > so p0 vs. p0/z, whatever that means and it looks like
> > the vector add can somehow concatenate z0.d and z1.d.
> > Truly fascinating ;)
> >
> > It looks like --param vect_partial_vector_usage defaults to 2,
> > power forces it to 1 (power10) or 0 otherwise.
> >
> > I think we'd need a target hook that toggles this per mode
> > so we could tune this dependent on AVX512 vectorization vs. AVX2.
> >
> > The reason I even started looking at this is that we now have
> > so many vector modes and end up with quite big code for
> > vectorized epilogues.  And I do remember Intel folks contributing
> > patches to do fully masked AVX512 loops as well.
> >
> > Boostrap/testing on x86_64-unknown-linux-gnu (with a slightly
> > altered patch) reveals no fails besides some assembler scans.
> >
> > For reference the tested patch is below.
> >
> > Thanks,
> > Richard.
> >
> > commit 221110851fafe17d5a351f1b2da3fc3a40e3b61a
> > Author: Richard Biener <rguenther@suse.de>
> > Date:   Thu Jul 15 12:15:18 2021 +0200
> >
> >     Add loop masking support for x86
> >
> >     The following extends the existing loop masking support using
> >     SVE WHILE_ULT to x86 by proving an alternate way to produce the
> >     mask using VEC_COND_EXPRs.  So with --param vect-partial-vector-usage
> >     you can now enable masked vectorized epilogues (=1) or fully
> >     masked vector loops (=2).
> >
> >     What's missing is using a scalar IV for the loop control in
> >     case that's profitable - the mask generation can then move
> >     from preheader + latch to the header.  But AVX2 and AVX512
> >     can use vptest and kortestb just fine.
> >
> >     What's also missing is providing more support for predicated
> >     operations in the case of reductions either via VEC_COND_EXPRs
> >     or via implementing some of the .COND_{ADD,SUB,MUL...} internal
> >     functions as mapping to masked AVX512 operations.
> >
> >     For AVX2 and
> >
> >     int foo (unsigned *a, unsigned * __restrict b, int n)
> >     {
> >       unsigned sum = 1;
> >       for (int i = 0; i < n; ++i)
> >         b[i] += a[i];
> >       return sum;
> >     }
> >
> >     we get
> >
> >     .L3:
> >             vpmaskmovd      (%rsi,%rax), %ymm0, %ymm3
> >             vpmaskmovd      (%rdi,%rax), %ymm0, %ymm1
> >             addl    $8, %edx
> >             vpaddd  %ymm3, %ymm1, %ymm1
> >             vpmaskmovd      %ymm1, %ymm0, (%rsi,%rax)
> >             vmovd   %edx, %xmm1
> >             vpsubd  %ymm15, %ymm2, %ymm0
> >             addq    $32, %rax
> >             vpbroadcastd    %xmm1, %ymm1
> >             vpaddd  %ymm4, %ymm1, %ymm1
> >             vpsubd  %ymm15, %ymm1, %ymm1
> >             vpcmpgtd        %ymm1, %ymm0, %ymm0
> >             vptest  %ymm0, %ymm0
> >             jne     .L3
> >
> >     for the fully masked loop body and for the masked epilogue
> >     we see
> >
> >     .L4:
> >             vmovdqu (%rsi,%rax), %ymm3
> >             vpaddd  (%rdi,%rax), %ymm3, %ymm0
> >             vmovdqu %ymm0, (%rsi,%rax)
> >             addq    $32, %rax
> >             cmpq    %rax, %rcx
> >             jne     .L4
> >             movl    %edx, %eax
> >             andl    $-8, %eax
> >             testb   $7, %dl
> >             je      .L11
> >     .L3:
> >             subl    %eax, %edx
> >             vmovdqa .LC0(%rip), %ymm1
> >             salq    $2, %rax
> >             vmovd   %edx, %xmm0
> >             movl    $-2147483648, %edx
> >             addq    %rax, %rsi
> >             vmovd   %edx, %xmm15
> >             vpbroadcastd    %xmm0, %ymm0
> >             vpbroadcastd    %xmm15, %ymm15
> >             vpsubd  %ymm15, %ymm1, %ymm1
> >             vpsubd  %ymm15, %ymm0, %ymm0
> >             vpcmpgtd        %ymm1, %ymm0, %ymm0
> >             vpmaskmovd      (%rsi), %ymm0, %ymm1
> >             vpmaskmovd      (%rdi,%rax), %ymm0, %ymm2
> >             vpaddd  %ymm2, %ymm1, %ymm1
> >             vpmaskmovd      %ymm1, %ymm0, (%rsi)
> >     .L11:
> >             vzeroupper
> >
> >     compared to
> >
> >     .L3:
> >             movl    %edx, %r8d
> >             subl    %eax, %r8d
> >             leal    -1(%r8), %r9d
> >             cmpl    $2, %r9d
> >             jbe     .L6
> >             leaq    (%rcx,%rax,4), %r9
> >             vmovdqu (%rdi,%rax,4), %xmm2
> >             movl    %r8d, %eax
> >             andl    $-4, %eax
> >             vpaddd  (%r9), %xmm2, %xmm0
> >             addl    %eax, %esi
> >             andl    $3, %r8d
> >             vmovdqu %xmm0, (%r9)
> >             je      .L2
> >     .L6:
> >             movslq  %esi, %r8
> >             leaq    0(,%r8,4), %rax
> >             movl    (%rdi,%r8,4), %r8d
> >             addl    %r8d, (%rcx,%rax)
> >             leal    1(%rsi), %r8d
> >             cmpl    %r8d, %edx
> >             jle     .L2
> >             addl    $2, %esi
> >             movl    4(%rdi,%rax), %r8d
> >             addl    %r8d, 4(%rcx,%rax)
> >             cmpl    %esi, %edx
> >             jle     .L2
> >             movl    8(%rdi,%rax), %edx
> >             addl    %edx, 8(%rcx,%rax)
> >     .L2:
> >
> >     The AVX512 fully masked loop would be
> >
> >             vmovdqa .LC0(%rip), %ymm4
> >             vpbroadcastd    %edx, %ymm3
> >             vpcmpud $6, %ymm4, %ymm3, %k1
> >             xorl    %eax, %eax
> >             .p2align 4,,10
> >             .p2align 3
> >     .L3:
> >             vmovdqu32       (%rsi,%rax,4), %ymm2{%k1}
> >             vmovdqu32       (%rdi,%rax,4), %ymm1{%k1}
> >             vpaddd  %ymm2, %ymm1, %ymm0
> >             vmovdqu32       %ymm0, (%rsi,%rax,4){%k1}
> >             addq    $8, %rax
> >             vpbroadcastd    %eax, %ymm0
> >             vpaddd  %ymm4, %ymm0, %ymm0
> >             vpcmpud $6, %ymm0, %ymm3, %k1
> >             kortestb        %k1, %k1
> >             jne     .L3
> >
> >     loop control using %rax would likely be more latency friendly
> >     here and the mask generation could be unified to a single place.
> >
> >     2021-07-15  Richard Biener  <rguenther@suse.de>
> >
> >             * tree-vect-stmts.c (can_produce_all_loop_masks_p): We
> >             also can produce masks with VEC_COND_EXPRs.
> >             * tree-vect-loop.c (vect_gen_while): Generate the mask
> >             with a VEC_COND_EXPR in case WHILE_ULT is not supported.
> >
> > diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
> > index fc3dab0d143..230d6e34208 100644
> > --- a/gcc/tree-vect-loop.c
> > +++ b/gcc/tree-vect-loop.c
> > @@ -975,11 +975,17 @@ can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
> >  {
> >    rgroup_controls *rgm;
> >    unsigned int i;
> > +  tree cmp_vectype;
> >    FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
> >      if (rgm->type != NULL_TREE
> >         && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
> >                                             cmp_type, rgm->type,
> > -                                           OPTIMIZE_FOR_SPEED))
> > +                                           OPTIMIZE_FOR_SPEED)
> > +       && ((cmp_vectype = build_vector_type
> > +                            (cmp_type, TYPE_VECTOR_SUBPARTS (rgm->type))),
> > +           true)
> > +       && !(VECTOR_BOOLEAN_TYPE_P (rgm->type)
> > +            && expand_vec_cmp_expr_p (cmp_vectype, rgm->type, LT_EXPR)))
> >        return false;
> >    return true;
> >  }
> > diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
> > index 6a25d661800..18c4c66cb2d 100644
> > --- a/gcc/tree-vect-stmts.c
> > +++ b/gcc/tree-vect-stmts.c
> > @@ -12007,16 +12007,43 @@ vect_gen_while (gimple_seq *seq, tree mask_type, tree start_index,
> >                 tree end_index, const char *name)
> >  {
> >    tree cmp_type = TREE_TYPE (start_index);
> > -  gcc_checking_assert (direct_internal_fn_supported_p (IFN_WHILE_ULT,
> > -                                                      cmp_type, mask_type,
> > -                                                      OPTIMIZE_FOR_SPEED));
> > -  gcall *call = gimple_build_call_internal (IFN_WHILE_ULT, 3,
> > -                                           start_index, end_index,
> > -                                           build_zero_cst (mask_type));
> > -  tree tmp = make_temp_ssa_name (mask_type, NULL, name);
> > -  gimple_call_set_lhs (call, tmp);
> > -  gimple_seq_add_stmt (seq, call);
> > -  return tmp;
> > +  if (direct_internal_fn_supported_p (IFN_WHILE_ULT,
> > +                                     cmp_type, mask_type,
> > +                                     OPTIMIZE_FOR_SPEED))
> > +    {
> > +      gcall *call = gimple_build_call_internal (IFN_WHILE_ULT, 3,
> > +                                               start_index, end_index,
> > +                                               build_zero_cst (mask_type));
> > +      tree tmp = make_temp_ssa_name (mask_type, NULL, name);
> > +      gimple_call_set_lhs (call, tmp);
> > +      gimple_seq_add_stmt (seq, call);
> > +      return tmp;
> > +    }
> > +  else
> > +    {
> > +      /* Generate
> > +          _1 = { start_index, start_index, ... };
> > +          _2 = { end_index, end_index, ... };
> > +          _3 = _1 + { 0, 1, 2 ... };
> > +          _4 = _3 < _2;  */
> > +      tree cvectype = build_vector_type (cmp_type,
> > +                                        TYPE_VECTOR_SUBPARTS (mask_type));
> > +      gcc_assert (VECTOR_BOOLEAN_TYPE_P (mask_type)
> > +                 && expand_vec_cmp_expr_p (cvectype, mask_type, LT_EXPR));
> > +      tree si = make_ssa_name (cvectype);
> > +      gassign *ass = gimple_build_assign
> > +                       (si, build_vector_from_val (cvectype, start_index));
> > +      gimple_seq_add_stmt (seq, ass);
> > +      tree ei = make_ssa_name (cvectype);
> > +      ass = gimple_build_assign (ei,
> > +                                build_vector_from_val (cvectype, end_index));
> > +      gimple_seq_add_stmt (seq, ass);
> > +      tree incr = build_vec_series (cvectype, build_zero_cst (cmp_type),
> > +                                   build_one_cst (cmp_type));
> > +      si = gimple_build (seq, PLUS_EXPR, cvectype, si, incr);
> > +      return gimple_build (seq, LT_EXPR, truth_type_for (cvectype),
> > +                          si, ei);
> > +    }
> >  }
> >
> >  /* Generate a vector mask of type MASK_TYPE for which index I is false iff
> 
> 
> 
>
Richard Biener July 16, 2021, 9:11 a.m. UTC | #11
On Thu, 15 Jul 2021, Richard Biener wrote:

> On Thu, 15 Jul 2021, Richard Biener wrote:
>
> > OK, guess I was more looking at
> > 
> > #define N 32
> > int foo (unsigned long *a, unsigned long * __restrict b,
> >          unsigned int *c, unsigned int * __restrict d,
> >          int n)
> > {
> >   unsigned sum = 1;
> >   for (int i = 0; i < n; ++i)
> >     {
> >       b[i] += a[i];
> >       d[i] += c[i];
> >     }
> >   return sum;
> > }
> > 
> > where we on x86 AVX512 vectorize with V8DI and V16SI and we
> > generate two masks for the two copies of V8DI (VF is 16) and one
> > mask for V16SI.  With SVE I see
> > 
> >         punpklo p1.h, p0.b
> >         punpkhi p2.h, p0.b
> > 
> > that's sth I expected to see for AVX512 as well, using the V16SI
> > mask and unpacking that to two V8DI ones.  But I see
> > 
> >         vpbroadcastd    %eax, %ymm0
> >         vpaddd  %ymm12, %ymm0, %ymm0
> >         vpcmpud $6, %ymm0, %ymm11, %k3
> >         vpbroadcastd    %eax, %xmm0
> >         vpaddd  %xmm10, %xmm0, %xmm0
> >         vpcmpud $1, %xmm7, %xmm0, %k1
> >         vpcmpud $6, %xmm0, %xmm8, %k2
> >         kortestb        %k1, %k1
> >         jne     .L3
> > 
> > so three %k masks generated by vpcmpud.  I'll have to look what's
> > the magic for SVE and why that doesn't trigger for x86 here.
> 
> So answer myself, vect_maybe_permute_loop_masks looks for
> vec_unpacku_hi/lo_optab, but with AVX512 the vector bools have
> QImode so that doesn't play well here.  Not sure if there
> are proper mask instructions to use (I guess there's a shift
> and lopart is free).  This is QI:8 to two QI:4 (bits) mask
> conversion.  Not sure how to better ask the target here - again
> VnBImode might have been easier here.

So I've managed to "emulate" the unpack_lo/hi for the case of
!VECTOR_MODE_P masks by using sub-vector select (we're asking
to turn vector(8) <signed-boolean:1> into two
vector(4) <signed-boolean:1>) via BIT_FIELD_REF.  That then
produces the desired single mask producer and

  loop_mask_38 = VIEW_CONVERT_EXPR<vector(4) <signed-boolean:1>>(loop_mask_54);
  loop_mask_37 = BIT_FIELD_REF <loop_mask_54, 4, 4>;
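
In scalar mask terms the two statements amount to roughly this (a
sketch using the intrinsics mask type, not the generated code):

#include <immintrin.h>

/* Split an 8-lane mask into two 4-lane halves.  */
static inline void
split_mask (__mmask8 k, __mmask8 *lo, __mmask8 *hi)
{
  *lo = k;                    /* low half: just keep the low 4 bits */
  *hi = (__mmask8) (k >> 4);  /* high half: a 4-bit right shift */
}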

Note that for the lowpart we can just view-convert away the excess
bits, fully re-using the mask.  We generate surprisingly "good" code:

        kmovb   %k1, %edi
        shrb    $4, %dil
        kmovb   %edi, %k2

besides the lack of using kshiftrb.  I guess we're just lacking
a mask register alternative for

(insn 22 20 25 4 (parallel [
            (set (reg:QI 94 [ loop_mask_37 ])
                (lshiftrt:QI (reg:QI 98 [ loop_mask_54 ])
                    (const_int 4 [0x4])))
            (clobber (reg:CC 17 flags))
        ]) 724 {*lshrqi3_1}
     (expr_list:REG_UNUSED (reg:CC 17 flags)
        (nil)))

and so we reload.  For the above cited loop the AVX512 vectorization
with --param vect-partial-vector-usage=1 does look quite sensible
to me.  Instead of an SSE vectorized epilogue plus a scalar
epilogue we get a single fully masked AVX512 "iteration" for both.
I suppose it's still mostly a code-size optimization (384 bytes
with the masked epilogue vs. 474 bytes with trunk) since it will
likely be slower for very low iteration counts, but it's good for
icache usage and puts less pressure on the branch predictor.

That said, I have to set up SPEC on an AVX512 machine to do
any meaningful measurements (I suspect with just AVX2 we're not
going to see any benefit from masking).  Hints/help on how to fix
the missing kshiftrb would be appreciated.

Oh, and if there's only V4DImode and V16HImode data then
we don't go the vect_maybe_permute_loop_masks path - that is,
we don't generate the (not used) intermediate mask but end up
generating two while_ult parts.

Thanks,
Richard.
Hongtao Liu July 20, 2021, 4:20 a.m. UTC | #12
On Fri, Jul 16, 2021 at 5:11 PM Richard Biener <rguenther@suse.de> wrote:
>
> On Thu, 15 Jul 2021, Richard Biener wrote:
>
> > On Thu, 15 Jul 2021, Richard Biener wrote:
> >
> > > OK, guess I was more looking at
> > >
> > > #define N 32
> > > int foo (unsigned long *a, unsigned long * __restrict b,
> > >          unsigned int *c, unsigned int * __restrict d,
> > >          int n)
> > > {
> > >   unsigned sum = 1;
> > >   for (int i = 0; i < n; ++i)
> > >     {
> > >       b[i] += a[i];
> > >       d[i] += c[i];
> > >     }
> > >   return sum;
> > > }
> > >
> > > where we on x86 AVX512 vectorize with V8DI and V16SI and we
> > > generate two masks for the two copies of V8DI (VF is 16) and one
> > > mask for V16SI.  With SVE I see
> > >
> > >         punpklo p1.h, p0.b
> > >         punpkhi p2.h, p0.b
> > >
> > > that's sth I expected to see for AVX512 as well, using the V16SI
> > > mask and unpacking that to two V8DI ones.  But I see
> > >
> > >         vpbroadcastd    %eax, %ymm0
> > >         vpaddd  %ymm12, %ymm0, %ymm0
> > >         vpcmpud $6, %ymm0, %ymm11, %k3
> > >         vpbroadcastd    %eax, %xmm0
> > >         vpaddd  %xmm10, %xmm0, %xmm0
> > >         vpcmpud $1, %xmm7, %xmm0, %k1
> > >         vpcmpud $6, %xmm0, %xmm8, %k2
> > >         kortestb        %k1, %k1
> > >         jne     .L3
> > >
> > > so three %k masks generated by vpcmpud.  I'll have to look what's
> > > the magic for SVE and why that doesn't trigger for x86 here.
> >
> > So answer myself, vect_maybe_permute_loop_masks looks for
> > vec_unpacku_hi/lo_optab, but with AVX512 the vector bools have
> > QImode so that doesn't play well here.  Not sure if there
> > are proper mask instructions to use (I guess there's a shift
> > and lopart is free).  This is QI:8 to two QI:4 (bits) mask
Yes, for 16-bit and wider masks we have KUNPCKBW/D/Q, but for 8-bit
unpack_lo/hi there is only a shift.
> > conversion.  Not sure how to better ask the target here - again
> > VnBImode might have been easier here.
>
> So I've managed to "emulate" the unpack_lo/hi for the case of
> !VECTOR_MODE_P masks by using sub-vector select (we're asking
> to turn vector(8) <signed-boolean:1> into two
> vector(4) <signed-boolean:1>) via BIT_FIELD_REF.  That then
> produces the desired single mask producer and
>
>   loop_mask_38 = VIEW_CONVERT_EXPR<vector(4) <signed-boolean:1>>(loop_mask_54);
>   loop_mask_37 = BIT_FIELD_REF <loop_mask_54, 4, 4>;
>
> note for the lowpart we can just view-convert away the excess bits,
> fully re-using the mask.  We generate surprisingly "good" code:
>
>         kmovb   %k1, %edi
>         shrb    $4, %dil
>         kmovb   %edi, %k2
>
> besides the lack of using kshiftrb.  I guess we're just lacking
> a mask register alternative for
Yes, we can do it similarly to kor/kand/kxor.
>
> (insn 22 20 25 4 (parallel [
>             (set (reg:QI 94 [ loop_mask_37 ])
>                 (lshiftrt:QI (reg:QI 98 [ loop_mask_54 ])
>                     (const_int 4 [0x4])))
>             (clobber (reg:CC 17 flags))
>         ]) 724 {*lshrqi3_1}
>      (expr_list:REG_UNUSED (reg:CC 17 flags)
>         (nil)))
>
> and so we reload.  For the above cited loop the AVX512 vectorization
> with --param vect-partial-vector-usage=1 does look quite sensible
> to me.  Instead of a SSE vectorized epilogue plus a scalar
> epilogue we get a single fully masked AVX512 "iteration" for both.
> I suppose it's still mostly a code-size optimization (384 bytes
> with the masked epiloge vs. 474 bytes with trunk) since it will
> be likely slower for very low iteration counts but it's good
> for icache usage then and good for less branch predictor usage.
>
> That said, I have to set up SPEC on a AVX512 machine to do
Did the patch land in trunk already?  I can run a test on CLX.
> any meaningful measurements (I suspect with just AVX2 we're not
> going to see any benefit from masking).  Hints/help how to fix
> the missing kshiftrb appreciated.
>
> Oh, and if there's only V4DImode and V16HImode data then
> we don't go the vect_maybe_permute_loop_masks path - that is,
> we don't generate the (not used) intermediate mask but end up
> generating two while_ult parts.
>
> Thanks,
> Richard.
Richard Biener July 20, 2021, 7:38 a.m. UTC | #13
On Tue, 20 Jul 2021, Hongtao Liu wrote:

> On Fri, Jul 16, 2021 at 5:11 PM Richard Biener <rguenther@suse.de> wrote:
> >
> > On Thu, 15 Jul 2021, Richard Biener wrote:
> >
> > > On Thu, 15 Jul 2021, Richard Biener wrote:
> > >
> > > > OK, guess I was more looking at
> > > >
> > > > #define N 32
> > > > int foo (unsigned long *a, unsigned long * __restrict b,
> > > >          unsigned int *c, unsigned int * __restrict d,
> > > >          int n)
> > > > {
> > > >   unsigned sum = 1;
> > > >   for (int i = 0; i < n; ++i)
> > > >     {
> > > >       b[i] += a[i];
> > > >       d[i] += c[i];
> > > >     }
> > > >   return sum;
> > > > }
> > > >
> > > > where we on x86 AVX512 vectorize with V8DI and V16SI and we
> > > > generate two masks for the two copies of V8DI (VF is 16) and one
> > > > mask for V16SI.  With SVE I see
> > > >
> > > >         punpklo p1.h, p0.b
> > > >         punpkhi p2.h, p0.b
> > > >
> > > > that's sth I expected to see for AVX512 as well, using the V16SI
> > > > mask and unpacking that to two V8DI ones.  But I see
> > > >
> > > >         vpbroadcastd    %eax, %ymm0
> > > >         vpaddd  %ymm12, %ymm0, %ymm0
> > > >         vpcmpud $6, %ymm0, %ymm11, %k3
> > > >         vpbroadcastd    %eax, %xmm0
> > > >         vpaddd  %xmm10, %xmm0, %xmm0
> > > >         vpcmpud $1, %xmm7, %xmm0, %k1
> > > >         vpcmpud $6, %xmm0, %xmm8, %k2
> > > >         kortestb        %k1, %k1
> > > >         jne     .L3
> > > >
> > > > so three %k masks generated by vpcmpud.  I'll have to look what's
> > > > the magic for SVE and why that doesn't trigger for x86 here.
> > >
> > > So answer myself, vect_maybe_permute_loop_masks looks for
> > > vec_unpacku_hi/lo_optab, but with AVX512 the vector bools have
> > > QImode so that doesn't play well here.  Not sure if there
> > > are proper mask instructions to use (I guess there's a shift
> > > and lopart is free).  This is QI:8 to two QI:4 (bits) mask
> Yes, for 16bit and more, we have KUNPCKBW/D/Q. but for 8bit
> unpack_lo/hi, only shift.
> > > conversion.  Not sure how to better ask the target here - again
> > > VnBImode might have been easier here.
> >
> > So I've managed to "emulate" the unpack_lo/hi for the case of
> > !VECTOR_MODE_P masks by using sub-vector select (we're asking
> > to turn vector(8) <signed-boolean:1> into two
> > vector(4) <signed-boolean:1>) via BIT_FIELD_REF.  That then
> > produces the desired single mask producer and
> >
> >   loop_mask_38 = VIEW_CONVERT_EXPR<vector(4) <signed-boolean:1>>(loop_mask_54);
> >   loop_mask_37 = BIT_FIELD_REF <loop_mask_54, 4, 4>;
> >
> > note for the lowpart we can just view-convert away the excess bits,
> > fully re-using the mask.  We generate surprisingly "good" code:
> >
> >         kmovb   %k1, %edi
> >         shrb    $4, %dil
> >         kmovb   %edi, %k2
> >
> > besides the lack of using kshiftrb.  I guess we're just lacking
> > a mask register alternative for
> Yes, we can do it similar as kor/kand/kxor.
> >
> > (insn 22 20 25 4 (parallel [
> >             (set (reg:QI 94 [ loop_mask_37 ])
> >                 (lshiftrt:QI (reg:QI 98 [ loop_mask_54 ])
> >                     (const_int 4 [0x4])))
> >             (clobber (reg:CC 17 flags))
> >         ]) 724 {*lshrqi3_1}
> >      (expr_list:REG_UNUSED (reg:CC 17 flags)
> >         (nil)))
> >
> > and so we reload.  For the above cited loop the AVX512 vectorization
> > with --param vect-partial-vector-usage=1 does look quite sensible
> > to me.  Instead of a SSE vectorized epilogue plus a scalar
> > epilogue we get a single fully masked AVX512 "iteration" for both.
> > I suppose it's still mostly a code-size optimization (384 bytes
> > with the masked epiloge vs. 474 bytes with trunk) since it will
> > be likely slower for very low iteration counts but it's good
> > for icache usage then and good for less branch predictor usage.
> >
> > That said, I have to set up SPEC on a AVX512 machine to do
> Does patch  land in trunk already, i can have a test on CLX.

I'm still experimenting a bit right now but hope to get something
trunk-ready at the end of this week or the beginning of next.  Since
it's disabled by default we can work on improving it during stage1
then.

I'm mostly struggling with the GIMPLE IL to be used for the
mask unpacking since we currently reject both the BIT_FIELD_REF
and the VIEW_CONVERT we generate (why do AVX512 masks not all have
SImode but sometimes QImode and sometimes HImode ...).  Unfortunately
we've dropped whole-vector shifts in favor of VEC_PERM but that
doesn't work well either for integer mode vectors.  So I'm still
playing with my options here and looking for something that doesn't
require too much surgery on the RTL side to recover good mask
register code ...

Another part missing is expanders for the various cond_* patterns

OPTAB_D (cond_add_optab, "cond_add$a")
OPTAB_D (cond_sub_optab, "cond_sub$a")
OPTAB_D (cond_smul_optab, "cond_mul$a")
OPTAB_D (cond_sdiv_optab, "cond_div$a")
OPTAB_D (cond_smod_optab, "cond_mod$a")
OPTAB_D (cond_udiv_optab, "cond_udiv$a")
OPTAB_D (cond_umod_optab, "cond_umod$a")
OPTAB_D (cond_and_optab, "cond_and$a")
OPTAB_D (cond_ior_optab, "cond_ior$a")
OPTAB_D (cond_xor_optab, "cond_xor$a")
OPTAB_D (cond_ashl_optab, "cond_ashl$a")
OPTAB_D (cond_ashr_optab, "cond_ashr$a")
OPTAB_D (cond_lshr_optab, "cond_lshr$a")
OPTAB_D (cond_smin_optab, "cond_smin$a")
OPTAB_D (cond_smax_optab, "cond_smax$a")
OPTAB_D (cond_umin_optab, "cond_umin$a")
OPTAB_D (cond_umax_optab, "cond_umax$a")
OPTAB_D (cond_fma_optab, "cond_fma$a")
OPTAB_D (cond_fms_optab, "cond_fms$a")
OPTAB_D (cond_fnma_optab, "cond_fnma$a")
OPTAB_D (cond_fnms_optab, "cond_fnms$a")

I think the most useful are those for possibly trapping ops
(will be used by if-conversion) and those for reduction operations
(add,min,max) which would enable a masked reduction epilogue.
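
For example .COND_ADD on V8SF should map to a single masked add; a
sketch of the semantics with AVX-512VL intrinsics (the real thing
would be a cond_add expander in sse.md, this is only to show the
intended mapping):

#include <immintrin.h>

/* Active lanes get a + b, inactive lanes pass through the else
   value, matching the IFN_COND_ADD semantics.  */
static inline __m256
cond_add_v8sf (__mmask8 mask, __m256 a, __m256 b, __m256 else_vals)
{
  return _mm256_mask_add_ps (else_vals, mask, a, b);
}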

The good thing is that I've been able to get my hands on a
Cascadelake system so I can at least test things for correctness.

Richard.

> > any meaningful measurements (I suspect with just AVX2 we're not
> > going to see any benefit from masking).  Hints/help how to fix
> > the missing kshiftrb appreciated.
> >
> > Oh, and if there's only V4DImode and V16HImode data then
> > we don't go the vect_maybe_permute_loop_masks path - that is,
> > we don't generate the (not used) intermediate mask but end up
> > generating two while_ult parts.
> >
> > Thanks,
> > Richard.
Hongtao Liu July 20, 2021, 11:07 a.m. UTC | #14
On Tue, Jul 20, 2021 at 3:38 PM Richard Biener <rguenther@suse.de> wrote:
>
> On Tue, 20 Jul 2021, Hongtao Liu wrote:
>
> > On Fri, Jul 16, 2021 at 5:11 PM Richard Biener <rguenther@suse.de> wrote:
> > >
> > > On Thu, 15 Jul 2021, Richard Biener wrote:
> > >
> > > > On Thu, 15 Jul 2021, Richard Biener wrote:
> > > >
> > > > > OK, guess I was more looking at
> > > > >
> > > > > #define N 32
> > > > > int foo (unsigned long *a, unsigned long * __restrict b,
> > > > >          unsigned int *c, unsigned int * __restrict d,
> > > > >          int n)
> > > > > {
> > > > >   unsigned sum = 1;
> > > > >   for (int i = 0; i < n; ++i)
> > > > >     {
> > > > >       b[i] += a[i];
> > > > >       d[i] += c[i];
> > > > >     }
> > > > >   return sum;
> > > > > }
> > > > >
> > > > > where we on x86 AVX512 vectorize with V8DI and V16SI and we
> > > > > generate two masks for the two copies of V8DI (VF is 16) and one
> > > > > mask for V16SI.  With SVE I see
> > > > >
> > > > >         punpklo p1.h, p0.b
> > > > >         punpkhi p2.h, p0.b
> > > > >
> > > > > that's sth I expected to see for AVX512 as well, using the V16SI
> > > > > mask and unpacking that to two V8DI ones.  But I see
> > > > >
> > > > >         vpbroadcastd    %eax, %ymm0
> > > > >         vpaddd  %ymm12, %ymm0, %ymm0
> > > > >         vpcmpud $6, %ymm0, %ymm11, %k3
> > > > >         vpbroadcastd    %eax, %xmm0
> > > > >         vpaddd  %xmm10, %xmm0, %xmm0
> > > > >         vpcmpud $1, %xmm7, %xmm0, %k1
> > > > >         vpcmpud $6, %xmm0, %xmm8, %k2
> > > > >         kortestb        %k1, %k1
> > > > >         jne     .L3
> > > > >
> > > > > so three %k masks generated by vpcmpud.  I'll have to look what's
> > > > > the magic for SVE and why that doesn't trigger for x86 here.
> > > >
> > > > So answer myself, vect_maybe_permute_loop_masks looks for
> > > > vec_unpacku_hi/lo_optab, but with AVX512 the vector bools have
> > > > QImode so that doesn't play well here.  Not sure if there
> > > > are proper mask instructions to use (I guess there's a shift
> > > > and lopart is free).  This is QI:8 to two QI:4 (bits) mask
> > Yes, for 16bit and more, we have KUNPCKBW/D/Q. but for 8bit
> > unpack_lo/hi, only shift.
> > > > conversion.  Not sure how to better ask the target here - again
> > > > VnBImode might have been easier here.
> > >
> > > So I've managed to "emulate" the unpack_lo/hi for the case of
> > > !VECTOR_MODE_P masks by using sub-vector select (we're asking
> > > to turn vector(8) <signed-boolean:1> into two
> > > vector(4) <signed-boolean:1>) via BIT_FIELD_REF.  That then
> > > produces the desired single mask producer and
> > >
> > >   loop_mask_38 = VIEW_CONVERT_EXPR<vector(4) <signed-boolean:1>>(loop_mask_54);
> > >   loop_mask_37 = BIT_FIELD_REF <loop_mask_54, 4, 4>;
> > >
> > > note for the lowpart we can just view-convert away the excess bits,
> > > fully re-using the mask.  We generate surprisingly "good" code:
> > >
> > >         kmovb   %k1, %edi
> > >         shrb    $4, %dil
> > >         kmovb   %edi, %k2
> > >
> > > besides the lack of using kshiftrb.  I guess we're just lacking
> > > a mask register alternative for
> > Yes, we can do it similar as kor/kand/kxor.
> > >
> > > (insn 22 20 25 4 (parallel [
> > >             (set (reg:QI 94 [ loop_mask_37 ])
> > >                 (lshiftrt:QI (reg:QI 98 [ loop_mask_54 ])
> > >                     (const_int 4 [0x4])))
> > >             (clobber (reg:CC 17 flags))
> > >         ]) 724 {*lshrqi3_1}
> > >      (expr_list:REG_UNUSED (reg:CC 17 flags)
> > >         (nil)))
> > >
> > > and so we reload.  For the above cited loop the AVX512 vectorization
> > > with --param vect-partial-vector-usage=1 does look quite sensible
> > > to me.  Instead of a SSE vectorized epilogue plus a scalar
> > > epilogue we get a single fully masked AVX512 "iteration" for both.
> > > I suppose it's still mostly a code-size optimization (384 bytes
> > > with the masked epiloge vs. 474 bytes with trunk) since it will
> > > be likely slower for very low iteration counts but it's good
> > > for icache usage then and good for less branch predictor usage.
> > >
> > > That said, I have to set up SPEC on a AVX512 machine to do
> > Does patch  land in trunk already, i can have a test on CLX.
>
> I'm still experimenting a bit right now but hope to get something
> trunk ready at the end of this or beginning next week.  Since it's
> disabled by default we can work on improving it during stage1 then.
>
> I'm mostly struggling with the GIMPLE IL to be used for the
> mask unpacking since we currently reject both the BIT_FIELD_REF
> and the VIEW_CONVERT we generate (why do AVX512 masks not all have
> SImode but sometimes QImode and sometimes HImode ...).  Unfortunately
We have instructions like ktestb which only care about the low 8
bits; if we used SImode for all masks the implementation could
become complex.

> we've dropped whole-vector shifts in favor of VEC_PERM but that
> doesn't work well either for integer mode vectors.  So I'm still
> playing with my options here and looking for something that doesn't
> require too much surgery on the RTL side to recover good mask
> register code ...
>
> Another part missing is expanders for the various cond_* patterns
>
> OPTAB_D (cond_add_optab, "cond_add$a")
> OPTAB_D (cond_sub_optab, "cond_sub$a")
> OPTAB_D (cond_smul_optab, "cond_mul$a")
> OPTAB_D (cond_sdiv_optab, "cond_div$a")
> OPTAB_D (cond_smod_optab, "cond_mod$a")
> OPTAB_D (cond_udiv_optab, "cond_udiv$a")
> OPTAB_D (cond_umod_optab, "cond_umod$a")
> OPTAB_D (cond_and_optab, "cond_and$a")
> OPTAB_D (cond_ior_optab, "cond_ior$a")
> OPTAB_D (cond_xor_optab, "cond_xor$a")
> OPTAB_D (cond_ashl_optab, "cond_ashl$a")
> OPTAB_D (cond_ashr_optab, "cond_ashr$a")
> OPTAB_D (cond_lshr_optab, "cond_lshr$a")
> OPTAB_D (cond_smin_optab, "cond_smin$a")
> OPTAB_D (cond_smax_optab, "cond_smax$a")
> OPTAB_D (cond_umin_optab, "cond_umin$a")
> OPTAB_D (cond_umax_optab, "cond_umax$a")
> OPTAB_D (cond_fma_optab, "cond_fma$a")
> OPTAB_D (cond_fms_optab, "cond_fms$a")
> OPTAB_D (cond_fnma_optab, "cond_fnma$a")
> OPTAB_D (cond_fnms_optab, "cond_fnms$a")
I guess there's no need for scalar modes; although AVX512 masking
also covers scalar instructions, it's a bit awkward to generate a
mask from scalar operands (we need to compare, set the flag into a
GPR, and move the GPR to a mask register).
>
> I think the most useful are those for possibly trapping ops
> (will be used by if-conversion) and those for reduction operations
> (add,min,max) which would enable a masked reduction epilogue.
>
> The good thing is that I've been able to get my hands on a
> Cascadelake system so I can at least test things for correctness.
>
> Richard.
>
> > > any meaningful measurements (I suspect with just AVX2 we're not
> > > going to see any benefit from masking).  Hints/help how to fix
> > > the missing kshiftrb appreciated.
> > >
> > > Oh, and if there's only V4DImode and V16HImode data then
> > > we don't go the vect_maybe_permute_loop_masks path - that is,
> > > we don't generate the (not used) intermediate mask but end up
> > > generating two while_ult parts.
> > >
> > > Thanks,
> > > Richard.
Richard Biener July 20, 2021, 11:09 a.m. UTC | #15
On Tue, 20 Jul 2021, Hongtao Liu wrote:

> On Tue, Jul 20, 2021 at 3:38 PM Richard Biener <rguenther@suse.de> wrote:
> >
> > On Tue, 20 Jul 2021, Hongtao Liu wrote:
> >
> > > On Fri, Jul 16, 2021 at 5:11 PM Richard Biener <rguenther@suse.de> wrote:
> > > >
> > > > On Thu, 15 Jul 2021, Richard Biener wrote:
> > > >
> > > > > On Thu, 15 Jul 2021, Richard Biener wrote:
> > > > >
> > > > > > OK, guess I was more looking at
> > > > > >
> > > > > > #define N 32
> > > > > > int foo (unsigned long *a, unsigned long * __restrict b,
> > > > > >          unsigned int *c, unsigned int * __restrict d,
> > > > > >          int n)
> > > > > > {
> > > > > >   unsigned sum = 1;
> > > > > >   for (int i = 0; i < n; ++i)
> > > > > >     {
> > > > > >       b[i] += a[i];
> > > > > >       d[i] += c[i];
> > > > > >     }
> > > > > >   return sum;
> > > > > > }
> > > > > >
> > > > > > where we on x86 AVX512 vectorize with V8DI and V16SI and we
> > > > > > generate two masks for the two copies of V8DI (VF is 16) and one
> > > > > > mask for V16SI.  With SVE I see
> > > > > >
> > > > > >         punpklo p1.h, p0.b
> > > > > >         punpkhi p2.h, p0.b
> > > > > >
> > > > > > that's sth I expected to see for AVX512 as well, using the V16SI
> > > > > > mask and unpacking that to two V8DI ones.  But I see
> > > > > >
> > > > > >         vpbroadcastd    %eax, %ymm0
> > > > > >         vpaddd  %ymm12, %ymm0, %ymm0
> > > > > >         vpcmpud $6, %ymm0, %ymm11, %k3
> > > > > >         vpbroadcastd    %eax, %xmm0
> > > > > >         vpaddd  %xmm10, %xmm0, %xmm0
> > > > > >         vpcmpud $1, %xmm7, %xmm0, %k1
> > > > > >         vpcmpud $6, %xmm0, %xmm8, %k2
> > > > > >         kortestb        %k1, %k1
> > > > > >         jne     .L3
> > > > > >
> > > > > > so three %k masks generated by vpcmpud.  I'll have to look what's
> > > > > > the magic for SVE and why that doesn't trigger for x86 here.
> > > > >
> > > > > So answer myself, vect_maybe_permute_loop_masks looks for
> > > > > vec_unpacku_hi/lo_optab, but with AVX512 the vector bools have
> > > > > QImode so that doesn't play well here.  Not sure if there
> > > > > are proper mask instructions to use (I guess there's a shift
> > > > > and lopart is free).  This is QI:8 to two QI:4 (bits) mask
> > > Yes, for 16bit and more, we have KUNPCKBW/D/Q. but for 8bit
> > > unpack_lo/hi, only shift.
> > > > > conversion.  Not sure how to better ask the target here - again
> > > > > VnBImode might have been easier here.
> > > >
> > > > So I've managed to "emulate" the unpack_lo/hi for the case of
> > > > !VECTOR_MODE_P masks by using sub-vector select (we're asking
> > > > to turn vector(8) <signed-boolean:1> into two
> > > > vector(4) <signed-boolean:1>) via BIT_FIELD_REF.  That then
> > > > produces the desired single mask producer and
> > > >
> > > >   loop_mask_38 = VIEW_CONVERT_EXPR<vector(4) <signed-boolean:1>>(loop_mask_54);
> > > >   loop_mask_37 = BIT_FIELD_REF <loop_mask_54, 4, 4>;
> > > >
> > > > note for the lowpart we can just view-convert away the excess bits,
> > > > fully re-using the mask.  We generate surprisingly "good" code:
> > > >
> > > >         kmovb   %k1, %edi
> > > >         shrb    $4, %dil
> > > >         kmovb   %edi, %k2
> > > >
> > > > besides the lack of using kshiftrb.  I guess we're just lacking
> > > > a mask register alternative for
> > > Yes, we can do it similar as kor/kand/kxor.
> > > >
> > > > (insn 22 20 25 4 (parallel [
> > > >             (set (reg:QI 94 [ loop_mask_37 ])
> > > >                 (lshiftrt:QI (reg:QI 98 [ loop_mask_54 ])
> > > >                     (const_int 4 [0x4])))
> > > >             (clobber (reg:CC 17 flags))
> > > >         ]) 724 {*lshrqi3_1}
> > > >      (expr_list:REG_UNUSED (reg:CC 17 flags)
> > > >         (nil)))
> > > >
> > > > and so we reload.  For the above cited loop the AVX512 vectorization
> > > > with --param vect-partial-vector-usage=1 does look quite sensible
> > > > to me.  Instead of a SSE vectorized epilogue plus a scalar
> > > > epilogue we get a single fully masked AVX512 "iteration" for both.
> > > > I suppose it's still mostly a code-size optimization (384 bytes
> > > > with the masked epiloge vs. 474 bytes with trunk) since it will
> > > > be likely slower for very low iteration counts but it's good
> > > > for icache usage then and good for less branch predictor usage.
> > > >
> > > > That said, I have to set up SPEC on a AVX512 machine to do
> > > Does patch  land in trunk already, i can have a test on CLX.
> >
> > I'm still experimenting a bit right now but hope to get something
> > trunk ready at the end of this or beginning next week.  Since it's
> > disabled by default we can work on improving it during stage1 then.
> >
> > I'm mostly struggling with the GIMPLE IL to be used for the
> > mask unpacking since we currently reject both the BIT_FIELD_REF
> > and the VIEW_CONVERT we generate (why do AVX512 masks not all have
> > SImode but sometimes QImode and sometimes HImode ...).  Unfortunately
> We have  instruction like ktestb which only cases about the low 8
> bits, if we use SImode for all masks, code implementation can become
> complex.
> 
> > we've dropped whole-vector shifts in favor of VEC_PERM but that
> > doesn't work well either for integer mode vectors.  So I'm still
> > playing with my options here and looking for something that doesn't
> > require too much surgery on the RTL side to recover good mask
> > register code ...
> >
> > Another part missing is expanders for the various cond_* patterns
> >
> > OPTAB_D (cond_add_optab, "cond_add$a")
> > OPTAB_D (cond_sub_optab, "cond_sub$a")
> > OPTAB_D (cond_smul_optab, "cond_mul$a")
> > OPTAB_D (cond_sdiv_optab, "cond_div$a")
> > OPTAB_D (cond_smod_optab, "cond_mod$a")
> > OPTAB_D (cond_udiv_optab, "cond_udiv$a")
> > OPTAB_D (cond_umod_optab, "cond_umod$a")
> > OPTAB_D (cond_and_optab, "cond_and$a")
> > OPTAB_D (cond_ior_optab, "cond_ior$a")
> > OPTAB_D (cond_xor_optab, "cond_xor$a")
> > OPTAB_D (cond_ashl_optab, "cond_ashl$a")
> > OPTAB_D (cond_ashr_optab, "cond_ashr$a")
> > OPTAB_D (cond_lshr_optab, "cond_lshr$a")
> > OPTAB_D (cond_smin_optab, "cond_smin$a")
> > OPTAB_D (cond_smax_optab, "cond_smax$a")
> > OPTAB_D (cond_umin_optab, "cond_umin$a")
> > OPTAB_D (cond_umax_optab, "cond_umax$a")
> > OPTAB_D (cond_fma_optab, "cond_fma$a")
> > OPTAB_D (cond_fms_optab, "cond_fms$a")
> > OPTAB_D (cond_fnma_optab, "cond_fnma$a")
> > OPTAB_D (cond_fnms_optab, "cond_fnms$a")
> I guess there's no need for scalar modes, although avx512 mask support
> scalar instructions, it's a bit awkward to generate mask from scalar
> operands.(we need to compare, set flag to gpr, and mov gpr to mask
> register).

Yes, I think if-conversion has all the "scalar" if-converted code
dominated by an if (.IFN_VECTORIZED ()) conditional and thus it will
only be used vectorized.  See ifcvt_can_predicate where it checks

  internal_fn cond_fn = get_conditional_internal_fn (code);
  return (cond_fn != IFN_LAST
          && vectorized_internal_fn_supported_p (cond_fn, lhs_type));

as said this is really orthogonal to the fully masked loop/epilogue
and would benefit vectorization when FP ops can trap.
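
A typical candidate is a loop like the following, where the FP add
may trap and so must only execute in the lanes where the condition
holds (just an illustrative example):

void f (double *x, double *y, int *c, int n)
{
  for (int i = 0; i < n; ++i)
    if (c[i])
      x[i] += y[i];
}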

Richard.
Richard Biener July 20, 2021, 1:48 p.m. UTC | #16
On Thu, 15 Jul 2021, Richard Sandiford wrote:

> Richard Biener <rguenther@suse.de> writes:
> > The following extends the existing loop masking support using
> > SVE WHILE_ULT to x86 by proving an alternate way to produce the
> > mask using VEC_COND_EXPRs.  So with --param vect-partial-vector-usage
> > you can now enable masked vectorized epilogues (=1) or fully
> > masked vector loops (=2).
> 
> As mentioned on IRC, WHILE_ULT is supposed to ensure that every
> element after the first zero is also zero.  That happens naturally
> for power-of-2 vectors if the start index is a multiple of the VF.
> (And at the moment, variable-length vectors are the only way of
> supporting non-power-of-2 vectors.)
> 
> This probably works fine for =2 and =1 as things stand, since the
> vector IVs always start at zero.  But if in future we have a single
> IV counting scalar iterations, and use it even for peeled prologue
> iterations, we could end up with a situation where the approximation
> is no longer safe.
> 
> E.g. suppose we had a uint32_t scalar IV with a limit of (uint32_t)-3.
> If we peeled 2 iterations for alignment and then had a VF of 8,
> the final vector would have a start index of (uint32_t)-6 and the
> vector would be { -1, -1, -1, 0, 0, 0, -1, -1 }.
> 
> So I think it would be safer to handle this as an alternative to
> using while, rather than as a direct emulation, so that we can take
> the extra restrictions into account.  Alternatively, we could probably
> do { 0, 1, 2, ... } < { end - start, end - start, ... }.

That doesn't end up working since in the last iteration with a
non-zero mask we'll compare with all underflowed values (start
will be > end).  So while we compute a correct mask we cannot use
that for loop control anymore.
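
A tiny self-contained illustration with made-up numbers (n = 10,
VF = 8, so after the second increment start = 16 is already past
end = 10):

#include <stdio.h>
#include <stdint.h>

int main (void)
{
  uint32_t start = 16, end = 10;
  uint32_t diff = end - start;        /* wraps to 4294967290 */
  int any = 0;
  for (uint32_t lane = 0; lane < 8; ++lane)
    any |= (lane < diff);             /* true for every lane */
  /* Prints 1: the mask is all-ones even though no work is left,
     so using it for loop control would never exit the loop.  */
  printf ("%d\n", any);
  return 0;
}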

Richard.

> Thanks,
> Richard
> 
> 
> 
> >
> > What's missing is using a scalar IV for the loop control
> > (but in principle AVX512 can use the mask here - just the patch
> > doesn't seem to work for AVX512 yet for some reason - likely
> > expand_vec_cond_expr_p doesn't work there).  What's also missing
> > is providing more support for predicated operations in the case
> > of reductions either via VEC_COND_EXPRs or via implementing
> > some of the .COND_{ADD,SUB,MUL...} internal functions as mapping
> > to masked AVX512 operations.
> >
> > For AVX2 and
> >
> > int foo (unsigned *a, unsigned * __restrict b, int n)
> > {
> >   unsigned sum = 1;
> >   for (int i = 0; i < n; ++i)
> >     b[i] += a[i];
> >   return sum;
> > }
> >
> > we get
> >
> > .L3:
> >         vpmaskmovd      (%rsi,%rax), %ymm0, %ymm3
> >         vpmaskmovd      (%rdi,%rax), %ymm0, %ymm1
> >         addl    $8, %edx
> >         vpaddd  %ymm3, %ymm1, %ymm1
> >         vpmaskmovd      %ymm1, %ymm0, (%rsi,%rax)
> >         vmovd   %edx, %xmm1
> >         vpsubd  %ymm15, %ymm2, %ymm0
> >         addq    $32, %rax
> >         vpbroadcastd    %xmm1, %ymm1
> >         vpaddd  %ymm4, %ymm1, %ymm1
> >         vpsubd  %ymm15, %ymm1, %ymm1
> >         vpcmpgtd        %ymm1, %ymm0, %ymm0
> >         vptest  %ymm0, %ymm0
> >         jne     .L3
> >
> > for the fully masked loop body and for the masked epilogue
> > we see
> >
> > .L4:
> >         vmovdqu (%rsi,%rax), %ymm3
> >         vpaddd  (%rdi,%rax), %ymm3, %ymm0
> >         vmovdqu %ymm0, (%rsi,%rax)
> >         addq    $32, %rax
> >         cmpq    %rax, %rcx
> >         jne     .L4
> >         movl    %edx, %eax
> >         andl    $-8, %eax
> >         testb   $7, %dl
> >         je      .L11
> > .L3:
> >         subl    %eax, %edx
> >         vmovdqa .LC0(%rip), %ymm1
> >         salq    $2, %rax
> >         vmovd   %edx, %xmm0
> >         movl    $-2147483648, %edx
> >         addq    %rax, %rsi
> >         vmovd   %edx, %xmm15
> >         vpbroadcastd    %xmm0, %ymm0
> >         vpbroadcastd    %xmm15, %ymm15
> >         vpsubd  %ymm15, %ymm1, %ymm1
> >         vpsubd  %ymm15, %ymm0, %ymm0
> >         vpcmpgtd        %ymm1, %ymm0, %ymm0
> >         vpmaskmovd      (%rsi), %ymm0, %ymm1
> >         vpmaskmovd      (%rdi,%rax), %ymm0, %ymm2
> >         vpaddd  %ymm2, %ymm1, %ymm1
> >         vpmaskmovd      %ymm1, %ymm0, (%rsi)
> > .L11:
> >         vzeroupper
> >
> > compared to
> >
> > .L3:
> >         movl    %edx, %r8d
> >         subl    %eax, %r8d
> >         leal    -1(%r8), %r9d
> >         cmpl    $2, %r9d
> >         jbe     .L6
> >         leaq    (%rcx,%rax,4), %r9
> >         vmovdqu (%rdi,%rax,4), %xmm2
> >         movl    %r8d, %eax
> >         andl    $-4, %eax
> >         vpaddd  (%r9), %xmm2, %xmm0
> >         addl    %eax, %esi
> >         andl    $3, %r8d
> >         vmovdqu %xmm0, (%r9)
> >         je      .L2
> > .L6:
> >         movslq  %esi, %r8
> >         leaq    0(,%r8,4), %rax
> >         movl    (%rdi,%r8,4), %r8d
> >         addl    %r8d, (%rcx,%rax)
> >         leal    1(%rsi), %r8d
> >         cmpl    %r8d, %edx
> >         jle     .L2
> >         addl    $2, %esi
> >         movl    4(%rdi,%rax), %r8d
> >         addl    %r8d, 4(%rcx,%rax)
> >         cmpl    %esi, %edx
> >         jle     .L2
> >         movl    8(%rdi,%rax), %edx
> >         addl    %edx, 8(%rcx,%rax)
> > .L2:
> >
> > I'm giving this a little testing right now but will dig on why
> > I don't get masked loops when AVX512 is enabled.
> >
> > Still comments are appreciated.
> >
> > Thanks,
> > Richard.
> >
> > 2021-07-15  Richard Biener  <rguenther@suse.de>
> >
> > 	* tree-vect-stmts.c (can_produce_all_loop_masks_p): We
> > 	also can produce masks with VEC_COND_EXPRs.
> > 	* tree-vect-loop.c (vect_gen_while): Generate the mask
> > 	with a VEC_COND_EXPR in case WHILE_ULT is not supported.
> > ---
> >  gcc/tree-vect-loop.c  |  8 ++++++-
> >  gcc/tree-vect-stmts.c | 50 ++++++++++++++++++++++++++++++++++---------
> >  2 files changed, 47 insertions(+), 11 deletions(-)
> >
> > diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
> > index fc3dab0d143..2214ed11dfb 100644
> > --- a/gcc/tree-vect-loop.c
> > +++ b/gcc/tree-vect-loop.c
> > @@ -975,11 +975,17 @@ can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
> >  {
> >    rgroup_controls *rgm;
> >    unsigned int i;
> > +  tree cmp_vectype;
> >    FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
> >      if (rgm->type != NULL_TREE
> >  	&& !direct_internal_fn_supported_p (IFN_WHILE_ULT,
> >  					    cmp_type, rgm->type,
> > -					    OPTIMIZE_FOR_SPEED))
> > +					    OPTIMIZE_FOR_SPEED)
> > +	&& ((cmp_vectype
> > +	       = truth_type_for (build_vector_type
> > +				 (cmp_type, TYPE_VECTOR_SUBPARTS (rgm->type)))),
> > +	    true)
> > +	&& !expand_vec_cond_expr_p (rgm->type, cmp_vectype, LT_EXPR))
> >        return false;
> >    return true;
> >  }
> > diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
> > index 6a25d661800..216986399b1 100644
> > --- a/gcc/tree-vect-stmts.c
> > +++ b/gcc/tree-vect-stmts.c
> > @@ -12007,16 +12007,46 @@ vect_gen_while (gimple_seq *seq, tree mask_type, tree start_index,
> >  		tree end_index, const char *name)
> >  {
> >    tree cmp_type = TREE_TYPE (start_index);
> > -  gcc_checking_assert (direct_internal_fn_supported_p (IFN_WHILE_ULT,
> > -						       cmp_type, mask_type,
> > -						       OPTIMIZE_FOR_SPEED));
> > -  gcall *call = gimple_build_call_internal (IFN_WHILE_ULT, 3,
> > -					    start_index, end_index,
> > -					    build_zero_cst (mask_type));
> > -  tree tmp = make_temp_ssa_name (mask_type, NULL, name);
> > -  gimple_call_set_lhs (call, tmp);
> > -  gimple_seq_add_stmt (seq, call);
> > -  return tmp;
> > +  if (direct_internal_fn_supported_p (IFN_WHILE_ULT,
> > +				      cmp_type, mask_type,
> > +				      OPTIMIZE_FOR_SPEED))
> > +    {
> > +      gcall *call = gimple_build_call_internal (IFN_WHILE_ULT, 3,
> > +						start_index, end_index,
> > +						build_zero_cst (mask_type));
> > +      tree tmp = make_temp_ssa_name (mask_type, NULL, name);
> > +      gimple_call_set_lhs (call, tmp);
> > +      gimple_seq_add_stmt (seq, call);
> > +      return tmp;
> > +    }
> > +  else
> > +    {
> > +      /* Generate
> > +	   _1 = { start_index, start_index, ... };
> > +	   _2 = { end_index, end_index, ... };
> > +	   _3 = _1 + { 0, 1, 2 ... };
> > +	   _4 = _3 < _2;
> > +	   _5 = VEC_COND_EXPR <_4, { -1, -1, ... } : { 0, 0, ... }>;   */
> > +      tree cvectype = build_vector_type (cmp_type,
> > +					 TYPE_VECTOR_SUBPARTS (mask_type));
> > +      tree si = make_ssa_name (cvectype);
> > +      gassign *ass = gimple_build_assign
> > +			(si, build_vector_from_val (cvectype, start_index));
> > +      gimple_seq_add_stmt (seq, ass);
> > +      tree ei = make_ssa_name (cvectype);
> > +      ass = gimple_build_assign (ei,
> > +				 build_vector_from_val (cvectype, end_index));
> > +      gimple_seq_add_stmt (seq, ass);
> > +      tree incr = build_vec_series (cvectype, build_zero_cst (cmp_type),
> > +				    build_one_cst (cmp_type));
> > +      si = gimple_build (seq, PLUS_EXPR, cvectype, si, incr);
> > +      tree cmp = gimple_build (seq, LT_EXPR, truth_type_for (cvectype),
> > +			       si, ei);
> > +      tree mask = gimple_build (seq, VEC_COND_EXPR, mask_type, cmp,
> > +				build_all_ones_cst (mask_type),
> > +				build_zero_cst (mask_type));
> > +      return mask;
> > +    }
> >  }
> >  
> >  /* Generate a vector mask of type MASK_TYPE for which index I is false iff
>
Richard Biener July 21, 2021, 6:17 a.m. UTC | #17
On Tue, 20 Jul 2021, Richard Biener wrote:

> On Thu, 15 Jul 2021, Richard Sandiford wrote:
> 
> > Richard Biener <rguenther@suse.de> writes:
> > > The following extends the existing loop masking support using
> > > SVE WHILE_ULT to x86 by proving an alternate way to produce the
> > > mask using VEC_COND_EXPRs.  So with --param vect-partial-vector-usage
> > > you can now enable masked vectorized epilogues (=1) or fully
> > > masked vector loops (=2).
> > 
> > As mentioned on IRC, WHILE_ULT is supposed to ensure that every
> > element after the first zero is also zero.  That happens naturally
> > for power-of-2 vectors if the start index is a multiple of the VF.
> > (And at the moment, variable-length vectors are the only way of
> > supporting non-power-of-2 vectors.)
> > 
> > This probably works fine for =2 and =1 as things stand, since the
> > vector IVs always start at zero.  But if in future we have a single
> > IV counting scalar iterations, and use it even for peeled prologue
> > iterations, we could end up with a situation where the approximation
> > is no longer safe.
> > 
> > E.g. suppose we had a uint32_t scalar IV with a limit of (uint32_t)-3.
> > If we peeled 2 iterations for alignment and then had a VF of 8,
> > the final vector would have a start index of (uint32_t)-6 and the
> > vector would be { -1, -1, -1, 0, 0, 0, -1, -1 }.
> > 
> > So I think it would be safer to handle this as an alternative to
> > using while, rather than as a direct emulation, so that we can take
> > the extra restrictions into account.  Alternatively, we could probably
> > do { 0, 1, 2, ... } < { end - start, end - start, ... }.
> 
> That doesn't end up working since in the last iteration with a
> non-zero mask we'll compare with all underflowed values (start
> will be > end).  So while we compute a correct mask we cannot use
> that for loop control anymore.

Of course I can just use a signed comparison here (until we get
V128QI and a QImode iterator).
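
To make that concrete, here is a plain scalar C sketch of the per-lane
test (just an illustration, not the generated IL; the numbers assume
the IV has already stepped past end):

#include <stdio.h>

#define VF 8

int
main (void)
{
  unsigned int start = 16, end = 10;  /* IV has run past end */
  unsigned int rem = end - start;     /* underflows to 0xfffffffa */
  for (int lane = 0; lane < VF; ++lane)
    {
      int uns = (unsigned int) lane < rem;  /* 1 for every lane */
      int sgn = lane < (int) rem;           /* 0 for every lane */
      printf ("lane %d: unsigned %d, signed %d\n", lane, uns, sgn);
    }
  return 0;
}

With the unsigned compare every lane stays active once end - start
underflows, so the all-zero-mask exit test never triggers; interpreting
the same value as signed drops all lanes as intended.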

Richard.
Hongtao Liu July 21, 2021, 7:57 a.m. UTC | #18
On Tue, Jul 20, 2021 at 3:38 PM Richard Biener <rguenther@suse.de> wrote:
>
> On Tue, 20 Jul 2021, Hongtao Liu wrote:
>
> > On Fri, Jul 16, 2021 at 5:11 PM Richard Biener <rguenther@suse.de> wrote:
> > >
> > > On Thu, 15 Jul 2021, Richard Biener wrote:
> > >
> > > > On Thu, 15 Jul 2021, Richard Biener wrote:
> > > >
> > > > > OK, guess I was more looking at
> > > > >
> > > > > #define N 32
> > > > > int foo (unsigned long *a, unsigned long * __restrict b,
> > > > >          unsigned int *c, unsigned int * __restrict d,
> > > > >          int n)
> > > > > {
> > > > >   unsigned sum = 1;
> > > > >   for (int i = 0; i < n; ++i)
> > > > >     {
> > > > >       b[i] += a[i];
> > > > >       d[i] += c[i];
> > > > >     }
> > > > >   return sum;
> > > > > }
> > > > >
> > > > > where we on x86 AVX512 vectorize with V8DI and V16SI and we
> > > > > generate two masks for the two copies of V8DI (VF is 16) and one
> > > > > mask for V16SI.  With SVE I see
> > > > >
> > > > >         punpklo p1.h, p0.b
> > > > >         punpkhi p2.h, p0.b
> > > > >
> > > > > that's sth I expected to see for AVX512 as well, using the V16SI
> > > > > mask and unpacking that to two V8DI ones.  But I see
> > > > >
> > > > >         vpbroadcastd    %eax, %ymm0
> > > > >         vpaddd  %ymm12, %ymm0, %ymm0
> > > > >         vpcmpud $6, %ymm0, %ymm11, %k3
> > > > >         vpbroadcastd    %eax, %xmm0
> > > > >         vpaddd  %xmm10, %xmm0, %xmm0
> > > > >         vpcmpud $1, %xmm7, %xmm0, %k1
> > > > >         vpcmpud $6, %xmm0, %xmm8, %k2
> > > > >         kortestb        %k1, %k1
> > > > >         jne     .L3
> > > > >
> > > > > so three %k masks generated by vpcmpud.  I'll have to look what's
> > > > > the magic for SVE and why that doesn't trigger for x86 here.
> > > >
> > > > So answer myself, vect_maybe_permute_loop_masks looks for
> > > > vec_unpacku_hi/lo_optab, but with AVX512 the vector bools have
> > > > QImode so that doesn't play well here.  Not sure if there
> > > > are proper mask instructions to use (I guess there's a shift
> > > > and lopart is free).  This is QI:8 to two QI:4 (bits) mask
> > Yes, for 16bit and more, we have KUNPCKBW/D/Q. but for 8bit
> > unpack_lo/hi, only shift.
> > > > conversion.  Not sure how to better ask the target here - again
> > > > VnBImode might have been easier here.
> > >
> > > So I've managed to "emulate" the unpack_lo/hi for the case of
> > > !VECTOR_MODE_P masks by using sub-vector select (we're asking
> > > to turn vector(8) <signed-boolean:1> into two
> > > vector(4) <signed-boolean:1>) via BIT_FIELD_REF.  That then
> > > produces the desired single mask producer and
> > >
> > >   loop_mask_38 = VIEW_CONVERT_EXPR<vector(4) <signed-boolean:1>>(loop_mask_54);
> > >   loop_mask_37 = BIT_FIELD_REF <loop_mask_54, 4, 4>;
> > >
> > > note for the lowpart we can just view-convert away the excess bits,
> > > fully re-using the mask.  We generate surprisingly "good" code:
> > >
> > >         kmovb   %k1, %edi
> > >         shrb    $4, %dil
> > >         kmovb   %edi, %k2
> > >
> > > besides the lack of using kshiftrb.  I guess we're just lacking
> > > a mask register alternative for
> > Yes, we can do it similar as kor/kand/kxor.
> > >
> > > (insn 22 20 25 4 (parallel [
> > >             (set (reg:QI 94 [ loop_mask_37 ])
> > >                 (lshiftrt:QI (reg:QI 98 [ loop_mask_54 ])
> > >                     (const_int 4 [0x4])))
> > >             (clobber (reg:CC 17 flags))
> > >         ]) 724 {*lshrqi3_1}
> > >      (expr_list:REG_UNUSED (reg:CC 17 flags)
> > >         (nil)))
> > >
> > > and so we reload.  For the above cited loop the AVX512 vectorization
> > > with --param vect-partial-vector-usage=1 does look quite sensible
> > > to me.  Instead of a SSE vectorized epilogue plus a scalar
> > > epilogue we get a single fully masked AVX512 "iteration" for both.
> > > I suppose it's still mostly a code-size optimization (384 bytes
> > > with the masked epiloge vs. 474 bytes with trunk) since it will
> > > be likely slower for very low iteration counts but it's good
> > > for icache usage then and good for less branch predictor usage.
> > >
> > > That said, I have to set up SPEC on a AVX512 machine to do
> > Does patch  land in trunk already, i can have a test on CLX.
>
> I'm still experimenting a bit right now but hope to get something
> trunk ready at the end of this or beginning next week.  Since it's
> disabled by default we can work on improving it during stage1 then.
>
> I'm mostly struggling with the GIMPLE IL to be used for the
> mask unpacking since we currently reject both the BIT_FIELD_REF
> and the VIEW_CONVERT we generate (why do AVX512 masks not all have
> SImode but sometimes QImode and sometimes HImode ...).  Unfortunately
> we've dropped whole-vector shifts in favor of VEC_PERM but that
> doesn't work well either for integer mode vectors.  So I'm still
> playing with my options here and looking for something that doesn't
> require too much surgery on the RTL side to recover good mask
> register code ...
>
> Another part missing is expanders for the various cond_* patterns
>
> OPTAB_D (cond_add_optab, "cond_add$a")
> OPTAB_D (cond_sub_optab, "cond_sub$a")
> OPTAB_D (cond_smul_optab, "cond_mul$a")
> OPTAB_D (cond_sdiv_optab, "cond_div$a")
> OPTAB_D (cond_smod_optab, "cond_mod$a")
> OPTAB_D (cond_udiv_optab, "cond_udiv$a")
> OPTAB_D (cond_umod_optab, "cond_umod$a")
> OPTAB_D (cond_and_optab, "cond_and$a")
> OPTAB_D (cond_ior_optab, "cond_ior$a")
> OPTAB_D (cond_xor_optab, "cond_xor$a")
> OPTAB_D (cond_ashl_optab, "cond_ashl$a")
> OPTAB_D (cond_ashr_optab, "cond_ashr$a")
> OPTAB_D (cond_lshr_optab, "cond_lshr$a")
> OPTAB_D (cond_smin_optab, "cond_smin$a")
> OPTAB_D (cond_smax_optab, "cond_smax$a")
> OPTAB_D (cond_umin_optab, "cond_umin$a")
> OPTAB_D (cond_umax_optab, "cond_umax$a")
> OPTAB_D (cond_fma_optab, "cond_fma$a")
> OPTAB_D (cond_fms_optab, "cond_fms$a")
> OPTAB_D (cond_fnma_optab, "cond_fnma$a")
> OPTAB_D (cond_fnms_optab, "cond_fnms$a")
>
> I think the most useful are those for possibly trapping ops
> (will be used by if-conversion) and those for reduction operations
> (add,min,max) which would enable a masked reduction epilogue.
I've added cond_add/sub/max/min/smax/smin with my local patch, but I
can't figure out testcases to validate them.
Any ideas?
>
> The good thing is that I've been able to get my hands on a
> Cascadelake system so I can at least test things for correctness.
>
> Richard.
>
> > > any meaningful measurements (I suspect with just AVX2 we're not
> > > going to see any benefit from masking).  Hints/help how to fix
> > > the missing kshiftrb appreciated.
> > >
> > > Oh, and if there's only V4DImode and V16HImode data then
> > > we don't go the vect_maybe_permute_loop_masks path - that is,
> > > we don't generate the (not used) intermediate mask but end up
> > > generating two while_ult parts.
> > >
> > > Thanks,
> > > Richard.
Richard Biener July 21, 2021, 8:16 a.m. UTC | #19
On Wed, 21 Jul 2021, Hongtao Liu wrote:

> On Tue, Jul 20, 2021 at 3:38 PM Richard Biener <rguenther@suse.de> wrote:
> >
> > On Tue, 20 Jul 2021, Hongtao Liu wrote:
> >
> > > On Fri, Jul 16, 2021 at 5:11 PM Richard Biener <rguenther@suse.de> wrote:
> > > >
> > > > On Thu, 15 Jul 2021, Richard Biener wrote:
> > > >
> > > > > On Thu, 15 Jul 2021, Richard Biener wrote:
> > > > >
> > > > > > OK, guess I was more looking at
> > > > > >
> > > > > > #define N 32
> > > > > > int foo (unsigned long *a, unsigned long * __restrict b,
> > > > > >          unsigned int *c, unsigned int * __restrict d,
> > > > > >          int n)
> > > > > > {
> > > > > >   unsigned sum = 1;
> > > > > >   for (int i = 0; i < n; ++i)
> > > > > >     {
> > > > > >       b[i] += a[i];
> > > > > >       d[i] += c[i];
> > > > > >     }
> > > > > >   return sum;
> > > > > > }
> > > > > >
> > > > > > where we on x86 AVX512 vectorize with V8DI and V16SI and we
> > > > > > generate two masks for the two copies of V8DI (VF is 16) and one
> > > > > > mask for V16SI.  With SVE I see
> > > > > >
> > > > > >         punpklo p1.h, p0.b
> > > > > >         punpkhi p2.h, p0.b
> > > > > >
> > > > > > that's sth I expected to see for AVX512 as well, using the V16SI
> > > > > > mask and unpacking that to two V8DI ones.  But I see
> > > > > >
> > > > > >         vpbroadcastd    %eax, %ymm0
> > > > > >         vpaddd  %ymm12, %ymm0, %ymm0
> > > > > >         vpcmpud $6, %ymm0, %ymm11, %k3
> > > > > >         vpbroadcastd    %eax, %xmm0
> > > > > >         vpaddd  %xmm10, %xmm0, %xmm0
> > > > > >         vpcmpud $1, %xmm7, %xmm0, %k1
> > > > > >         vpcmpud $6, %xmm0, %xmm8, %k2
> > > > > >         kortestb        %k1, %k1
> > > > > >         jne     .L3
> > > > > >
> > > > > > so three %k masks generated by vpcmpud.  I'll have to look what's
> > > > > > the magic for SVE and why that doesn't trigger for x86 here.
> > > > >
> > > > > So answer myself, vect_maybe_permute_loop_masks looks for
> > > > > vec_unpacku_hi/lo_optab, but with AVX512 the vector bools have
> > > > > QImode so that doesn't play well here.  Not sure if there
> > > > > are proper mask instructions to use (I guess there's a shift
> > > > > and lopart is free).  This is QI:8 to two QI:4 (bits) mask
> > > Yes, for 16bit and more, we have KUNPCKBW/D/Q. but for 8bit
> > > unpack_lo/hi, only shift.
> > > > > conversion.  Not sure how to better ask the target here - again
> > > > > VnBImode might have been easier here.
> > > >
> > > > So I've managed to "emulate" the unpack_lo/hi for the case of
> > > > !VECTOR_MODE_P masks by using sub-vector select (we're asking
> > > > to turn vector(8) <signed-boolean:1> into two
> > > > vector(4) <signed-boolean:1>) via BIT_FIELD_REF.  That then
> > > > produces the desired single mask producer and
> > > >
> > > >   loop_mask_38 = VIEW_CONVERT_EXPR<vector(4) <signed-boolean:1>>(loop_mask_54);
> > > >   loop_mask_37 = BIT_FIELD_REF <loop_mask_54, 4, 4>;
> > > >
> > > > note for the lowpart we can just view-convert away the excess bits,
> > > > fully re-using the mask.  We generate surprisingly "good" code:
> > > >
> > > >         kmovb   %k1, %edi
> > > >         shrb    $4, %dil
> > > >         kmovb   %edi, %k2
> > > >
> > > > besides the lack of using kshiftrb.  I guess we're just lacking
> > > > a mask register alternative for
> > > Yes, we can do it similar as kor/kand/kxor.
> > > >
> > > > (insn 22 20 25 4 (parallel [
> > > >             (set (reg:QI 94 [ loop_mask_37 ])
> > > >                 (lshiftrt:QI (reg:QI 98 [ loop_mask_54 ])
> > > >                     (const_int 4 [0x4])))
> > > >             (clobber (reg:CC 17 flags))
> > > >         ]) 724 {*lshrqi3_1}
> > > >      (expr_list:REG_UNUSED (reg:CC 17 flags)
> > > >         (nil)))
> > > >
> > > > and so we reload.  For the above cited loop the AVX512 vectorization
> > > > with --param vect-partial-vector-usage=1 does look quite sensible
> > > > to me.  Instead of a SSE vectorized epilogue plus a scalar
> > > > epilogue we get a single fully masked AVX512 "iteration" for both.
> > > > I suppose it's still mostly a code-size optimization (384 bytes
> > > > with the masked epiloge vs. 474 bytes with trunk) since it will
> > > > be likely slower for very low iteration counts but it's good
> > > > for icache usage then and good for less branch predictor usage.
> > > >
> > > > That said, I have to set up SPEC on a AVX512 machine to do
> > > Does patch  land in trunk already, i can have a test on CLX.
> >
> > I'm still experimenting a bit right now but hope to get something
> > trunk ready at the end of this or beginning next week.  Since it's
> > disabled by default we can work on improving it during stage1 then.
> >
> > I'm mostly struggling with the GIMPLE IL to be used for the
> > mask unpacking since we currently reject both the BIT_FIELD_REF
> > and the VIEW_CONVERT we generate (why do AVX512 masks not all have
> > SImode but sometimes QImode and sometimes HImode ...).  Unfortunately
> > we've dropped whole-vector shifts in favor of VEC_PERM but that
> > doesn't work well either for integer mode vectors.  So I'm still
> > playing with my options here and looking for something that doesn't
> > require too much surgery on the RTL side to recover good mask
> > register code ...
> >
> > Another part missing is expanders for the various cond_* patterns
> >
> > OPTAB_D (cond_add_optab, "cond_add$a")
> > OPTAB_D (cond_sub_optab, "cond_sub$a")
> > OPTAB_D (cond_smul_optab, "cond_mul$a")
> > OPTAB_D (cond_sdiv_optab, "cond_div$a")
> > OPTAB_D (cond_smod_optab, "cond_mod$a")
> > OPTAB_D (cond_udiv_optab, "cond_udiv$a")
> > OPTAB_D (cond_umod_optab, "cond_umod$a")
> > OPTAB_D (cond_and_optab, "cond_and$a")
> > OPTAB_D (cond_ior_optab, "cond_ior$a")
> > OPTAB_D (cond_xor_optab, "cond_xor$a")
> > OPTAB_D (cond_ashl_optab, "cond_ashl$a")
> > OPTAB_D (cond_ashr_optab, "cond_ashr$a")
> > OPTAB_D (cond_lshr_optab, "cond_lshr$a")
> > OPTAB_D (cond_smin_optab, "cond_smin$a")
> > OPTAB_D (cond_smax_optab, "cond_smax$a")
> > OPTAB_D (cond_umin_optab, "cond_umin$a")
> > OPTAB_D (cond_umax_optab, "cond_umax$a")
> > OPTAB_D (cond_fma_optab, "cond_fma$a")
> > OPTAB_D (cond_fms_optab, "cond_fms$a")
> > OPTAB_D (cond_fnma_optab, "cond_fnma$a")
> > OPTAB_D (cond_fnms_optab, "cond_fnms$a")
> >
> > I think the most useful are those for possibly trapping ops
> > (will be used by if-conversion) and those for reduction operations
> > (add,min,max) which would enable a masked reduction epilogue.
> I've added cond_add/sub/max/min/smax/smin with my local patch, but I
> can't figure out testcases to validate them.
> Any ideas?

For example

double a[1024], b[1024];

void foo ()
{
  for (int i = 0; i < 1024; ++i)
    if (b[i] < 3.)
      a[i] = b[i] + 3.;
}

cannot be if-converted with -O3 due to -ftrapping-math and the
add possibly trapping.  But with cond_add it should be if-converted
and thus vectorized by making the add masked (in addition to the
masked store).
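
For the masked reduction epilogue mentioned above, an integer sum
reduction should be a usable starting point (untested sketch, names
arbitrary; needs --param vect-partial-vector-usage=1 or =2 to be
interesting):

unsigned int a[1024];

unsigned int
bar (int n)
{
  unsigned int sum = 0;
  for (int i = 0; i < n; ++i)
    sum += a[i];
  return sum;
}

Here the accumulation in a partially masked iteration must not pick up
the inactive lanes, so the add wants to become a .COND_ADD with the
previous accumulator as the else value (or a VEC_COND_EXPR selecting
zeros).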

Richard.
Hongtao Liu July 21, 2021, 9:38 a.m. UTC | #20
On Wed, Jul 21, 2021 at 4:16 PM Richard Biener <rguenther@suse.de> wrote:
>
> On Wed, 21 Jul 2021, Hongtao Liu wrote:
>
> > On Tue, Jul 20, 2021 at 3:38 PM Richard Biener <rguenther@suse.de> wrote:
> > >
> > > On Tue, 20 Jul 2021, Hongtao Liu wrote:
> > >
> > > > On Fri, Jul 16, 2021 at 5:11 PM Richard Biener <rguenther@suse.de> wrote:
> > > > >
> > > > > On Thu, 15 Jul 2021, Richard Biener wrote:
> > > > >
> > > > > > On Thu, 15 Jul 2021, Richard Biener wrote:
> > > > > >
> > > > > > > OK, guess I was more looking at
> > > > > > >
> > > > > > > #define N 32
> > > > > > > int foo (unsigned long *a, unsigned long * __restrict b,
> > > > > > >          unsigned int *c, unsigned int * __restrict d,
> > > > > > >          int n)
> > > > > > > {
> > > > > > >   unsigned sum = 1;
> > > > > > >   for (int i = 0; i < n; ++i)
> > > > > > >     {
> > > > > > >       b[i] += a[i];
> > > > > > >       d[i] += c[i];
> > > > > > >     }
> > > > > > >   return sum;
> > > > > > > }
> > > > > > >
> > > > > > > where we on x86 AVX512 vectorize with V8DI and V16SI and we
> > > > > > > generate two masks for the two copies of V8DI (VF is 16) and one
> > > > > > > mask for V16SI.  With SVE I see
> > > > > > >
> > > > > > >         punpklo p1.h, p0.b
> > > > > > >         punpkhi p2.h, p0.b
> > > > > > >
> > > > > > > that's sth I expected to see for AVX512 as well, using the V16SI
> > > > > > > mask and unpacking that to two V8DI ones.  But I see
> > > > > > >
> > > > > > >         vpbroadcastd    %eax, %ymm0
> > > > > > >         vpaddd  %ymm12, %ymm0, %ymm0
> > > > > > >         vpcmpud $6, %ymm0, %ymm11, %k3
> > > > > > >         vpbroadcastd    %eax, %xmm0
> > > > > > >         vpaddd  %xmm10, %xmm0, %xmm0
> > > > > > >         vpcmpud $1, %xmm7, %xmm0, %k1
> > > > > > >         vpcmpud $6, %xmm0, %xmm8, %k2
> > > > > > >         kortestb        %k1, %k1
> > > > > > >         jne     .L3
> > > > > > >
> > > > > > > so three %k masks generated by vpcmpud.  I'll have to look what's
> > > > > > > the magic for SVE and why that doesn't trigger for x86 here.
> > > > > >
> > > > > > So answer myself, vect_maybe_permute_loop_masks looks for
> > > > > > vec_unpacku_hi/lo_optab, but with AVX512 the vector bools have
> > > > > > QImode so that doesn't play well here.  Not sure if there
> > > > > > are proper mask instructions to use (I guess there's a shift
> > > > > > and lopart is free).  This is QI:8 to two QI:4 (bits) mask
> > > > Yes, for 16bit and more, we have KUNPCKBW/D/Q. but for 8bit
> > > > unpack_lo/hi, only shift.
> > > > > > conversion.  Not sure how to better ask the target here - again
> > > > > > VnBImode might have been easier here.
> > > > >
> > > > > So I've managed to "emulate" the unpack_lo/hi for the case of
> > > > > !VECTOR_MODE_P masks by using sub-vector select (we're asking
> > > > > to turn vector(8) <signed-boolean:1> into two
> > > > > vector(4) <signed-boolean:1>) via BIT_FIELD_REF.  That then
> > > > > produces the desired single mask producer and
> > > > >
> > > > >   loop_mask_38 = VIEW_CONVERT_EXPR<vector(4) <signed-boolean:1>>(loop_mask_54);
> > > > >   loop_mask_37 = BIT_FIELD_REF <loop_mask_54, 4, 4>;
> > > > >
> > > > > note for the lowpart we can just view-convert away the excess bits,
> > > > > fully re-using the mask.  We generate surprisingly "good" code:
> > > > >
> > > > >         kmovb   %k1, %edi
> > > > >         shrb    $4, %dil
> > > > >         kmovb   %edi, %k2
> > > > >
> > > > > besides the lack of using kshiftrb.  I guess we're just lacking
> > > > > a mask register alternative for
> > > > Yes, we can do it similar as kor/kand/kxor.
> > > > >
> > > > > (insn 22 20 25 4 (parallel [
> > > > >             (set (reg:QI 94 [ loop_mask_37 ])
> > > > >                 (lshiftrt:QI (reg:QI 98 [ loop_mask_54 ])
> > > > >                     (const_int 4 [0x4])))
> > > > >             (clobber (reg:CC 17 flags))
> > > > >         ]) 724 {*lshrqi3_1}
> > > > >      (expr_list:REG_UNUSED (reg:CC 17 flags)
> > > > >         (nil)))
> > > > >
> > > > > and so we reload.  For the above cited loop the AVX512 vectorization
> > > > > with --param vect-partial-vector-usage=1 does look quite sensible
> > > > > to me.  Instead of a SSE vectorized epilogue plus a scalar
> > > > > epilogue we get a single fully masked AVX512 "iteration" for both.
> > > > > I suppose it's still mostly a code-size optimization (384 bytes
> > > > > with the masked epiloge vs. 474 bytes with trunk) since it will
> > > > > be likely slower for very low iteration counts but it's good
> > > > > for icache usage then and good for less branch predictor usage.
> > > > >
> > > > > That said, I have to set up SPEC on a AVX512 machine to do
> > > > Does patch  land in trunk already, i can have a test on CLX.
> > >
> > > I'm still experimenting a bit right now but hope to get something
> > > trunk ready at the end of this or beginning next week.  Since it's
> > > disabled by default we can work on improving it during stage1 then.
> > >
> > > I'm mostly struggling with the GIMPLE IL to be used for the
> > > mask unpacking since we currently reject both the BIT_FIELD_REF
> > > and the VIEW_CONVERT we generate (why do AVX512 masks not all have
> > > SImode but sometimes QImode and sometimes HImode ...).  Unfortunately
> > > we've dropped whole-vector shifts in favor of VEC_PERM but that
> > > doesn't work well either for integer mode vectors.  So I'm still
> > > playing with my options here and looking for something that doesn't
> > > require too much surgery on the RTL side to recover good mask
> > > register code ...
> > >
> > > Another part missing is expanders for the various cond_* patterns
> > >
> > > OPTAB_D (cond_add_optab, "cond_add$a")
> > > OPTAB_D (cond_sub_optab, "cond_sub$a")
> > > OPTAB_D (cond_smul_optab, "cond_mul$a")
> > > OPTAB_D (cond_sdiv_optab, "cond_div$a")
> > > OPTAB_D (cond_smod_optab, "cond_mod$a")
> > > OPTAB_D (cond_udiv_optab, "cond_udiv$a")
> > > OPTAB_D (cond_umod_optab, "cond_umod$a")
> > > OPTAB_D (cond_and_optab, "cond_and$a")
> > > OPTAB_D (cond_ior_optab, "cond_ior$a")
> > > OPTAB_D (cond_xor_optab, "cond_xor$a")
> > > OPTAB_D (cond_ashl_optab, "cond_ashl$a")
> > > OPTAB_D (cond_ashr_optab, "cond_ashr$a")
> > > OPTAB_D (cond_lshr_optab, "cond_lshr$a")
> > > OPTAB_D (cond_smin_optab, "cond_smin$a")
> > > OPTAB_D (cond_smax_optab, "cond_smax$a")
> > > OPTAB_D (cond_umin_optab, "cond_umin$a")
> > > OPTAB_D (cond_umax_optab, "cond_umax$a")
> > > OPTAB_D (cond_fma_optab, "cond_fma$a")
> > > OPTAB_D (cond_fms_optab, "cond_fms$a")
> > > OPTAB_D (cond_fnma_optab, "cond_fnma$a")
> > > OPTAB_D (cond_fnms_optab, "cond_fnms$a")
> > >
> > > I think the most useful are those for possibly trapping ops
> > > (will be used by if-conversion) and those for reduction operations
> > > (add,min,max) which would enable a masked reduction epilogue.
> > I've added cond_add/sub/max/min/smax/smin with my local patch, but I
> > can't figure out testcases to validate them.
> > Any ideas?
>
> For example
>
> double a[1024], b[1024];
>
> void foo ()
> {
>   for (int i = 0; i < 1024; ++i)
>     if (b[i] < 3.)
>       a[i] = b[i] + 3.;
> }
>
Oh, thanks.
The loop is successfully vectorized with the cond_add expanders:

        .cfi_startproc
        vbroadcastsd        .LC1(%rip), %ymm1
        xorl        %eax, %eax
        jmp        .L3
        .p2align 4,,10
        .p2align 3
.L2:
        addq        $32, %rax
        cmpq        $8192, %rax
        je        .L9
.L3:
        vmovapd        b(%rax), %ymm0
        vcmppd        $1, %ymm1, %ymm0, %k1
        kortestb        %k1, %k1
        je        .L2
        vaddpd        %ymm1, %ymm0, %ymm2{%k1}{z}
        vmovapd        %ymm2, a(%rax){%k1}
        addq        $32, %rax
        cmpq        $8192, %rax
        jne        .L3
.L9:
vzeroupper

Here's the dump:

  vector(4) double * vectp_a.10;
  vector(4) double * vectp_a.9;
  vector(4) double vect__2.8;
  vector(4) <signed-boolean:1> mask__22.7;
  vector(4) double vect__1.6;
  vector(4) double * vectp_b.5;
  vector(4) double * vectp_b.4;
  int i;
  double _1;
  double _2;
  unsigned int ivtmp_3;
  unsigned int ivtmp_5;
  _Bool _11;
  _Bool _22;
  double * _23;
  vector(4) double vect_cst__28;
  vector(4) double vect_cst__30;
  vector(4) double vect_cst__31;
  unsigned int ivtmp_36;
  unsigned int ivtmp_37;

  <bb 2> [local count: 10737416]:
  _11 = 1;
  vect_cst__28 = { 3.0e+0, 3.0e+0, 3.0e+0, 3.0e+0 };
  vect_cst__30 = { 3.0e+0, 3.0e+0, 3.0e+0, 3.0e+0 };
  vect_cst__31 = { 0.0, 0.0, 0.0, 0.0 };

  <bb 3> [local count: 268435396]:
  # i_12 = PHI <i_8(7), 0(2)>
  # ivtmp_5 = PHI <ivtmp_3(7), 1024(2)>
  # vectp_b.4_25 = PHI <vectp_b.4_26(7), &b(2)>
  # vectp_a.9_33 = PHI <vectp_a.9_34(7), &a(2)>
  # ivtmp_36 = PHI <ivtmp_37(7), 0(2)>
  vect__1.6_27 = MEM <vector(4) double> [(double *)vectp_b.4_25];
  _1 = b[i_12];
  mask__22.7_29 = vect__1.6_27 < vect_cst__28;
  if (mask__22.7_29 == { 0, 0, 0, 0 })
    goto <bb 20>; [100.00%]
  else
    goto <bb 21>; [20.00%]

  <bb 21> [local count: 53687078]:
  vect__2.8_32 = .COND_ADD (mask__22.7_29, vect__1.6_27, vect_cst__30,
vect_cst__31);  <--- Here.
  .MASK_STORE (vectp_a.9_33, 256B, mask__22.7_29, vect__2.8_32);

  <bb 20> [local count: 268435396]:
  i_8 = i_12 + 1;
  ivtmp_3 = ivtmp_5 - 1;
  vectp_b.4_26 = vectp_b.4_25 + 32;
  vectp_a.9_34 = vectp_a.9_33 + 32;
  ivtmp_37 = ivtmp_36 + 1;
  if (ivtmp_37 < 256)
    goto <bb 7>; [96.00%]
  else
    goto <bb 17>; [4.00%]

  <bb 7> [local count: 257697980]:
  goto <bb 3>; [100.00%]

  <bb 17> [local count: 10737416]:
  return;


> cannot be if-converted with -O3 due to -ftrapping-math and the
> add possibly trapping.  But with cond_add it should be if-converted
> and thus vectorized by making the add masked (in addition to the
> masked store).
>
> Richard.
Richard Biener July 21, 2021, 10:13 a.m. UTC | #21
On Wed, 21 Jul 2021, Hongtao Liu wrote:

> On Wed, Jul 21, 2021 at 4:16 PM Richard Biener <rguenther@suse.de> wrote:
> >
> > On Wed, 21 Jul 2021, Hongtao Liu wrote:
> >
> > > On Tue, Jul 20, 2021 at 3:38 PM Richard Biener <rguenther@suse.de> wrote:
> > > >
> > > > On Tue, 20 Jul 2021, Hongtao Liu wrote:
> > > >
> > > > > On Fri, Jul 16, 2021 at 5:11 PM Richard Biener <rguenther@suse.de> wrote:
> > > > > >
> > > > > > On Thu, 15 Jul 2021, Richard Biener wrote:
> > > > > >
> > > > > > > On Thu, 15 Jul 2021, Richard Biener wrote:
> > > > > > >
> > > > > > > > OK, guess I was more looking at
> > > > > > > >
> > > > > > > > #define N 32
> > > > > > > > int foo (unsigned long *a, unsigned long * __restrict b,
> > > > > > > >          unsigned int *c, unsigned int * __restrict d,
> > > > > > > >          int n)
> > > > > > > > {
> > > > > > > >   unsigned sum = 1;
> > > > > > > >   for (int i = 0; i < n; ++i)
> > > > > > > >     {
> > > > > > > >       b[i] += a[i];
> > > > > > > >       d[i] += c[i];
> > > > > > > >     }
> > > > > > > >   return sum;
> > > > > > > > }
> > > > > > > >
> > > > > > > > where we on x86 AVX512 vectorize with V8DI and V16SI and we
> > > > > > > > generate two masks for the two copies of V8DI (VF is 16) and one
> > > > > > > > mask for V16SI.  With SVE I see
> > > > > > > >
> > > > > > > >         punpklo p1.h, p0.b
> > > > > > > >         punpkhi p2.h, p0.b
> > > > > > > >
> > > > > > > > that's sth I expected to see for AVX512 as well, using the V16SI
> > > > > > > > mask and unpacking that to two V8DI ones.  But I see
> > > > > > > >
> > > > > > > >         vpbroadcastd    %eax, %ymm0
> > > > > > > >         vpaddd  %ymm12, %ymm0, %ymm0
> > > > > > > >         vpcmpud $6, %ymm0, %ymm11, %k3
> > > > > > > >         vpbroadcastd    %eax, %xmm0
> > > > > > > >         vpaddd  %xmm10, %xmm0, %xmm0
> > > > > > > >         vpcmpud $1, %xmm7, %xmm0, %k1
> > > > > > > >         vpcmpud $6, %xmm0, %xmm8, %k2
> > > > > > > >         kortestb        %k1, %k1
> > > > > > > >         jne     .L3
> > > > > > > >
> > > > > > > > so three %k masks generated by vpcmpud.  I'll have to look what's
> > > > > > > > the magic for SVE and why that doesn't trigger for x86 here.
> > > > > > >
> > > > > > > So answer myself, vect_maybe_permute_loop_masks looks for
> > > > > > > vec_unpacku_hi/lo_optab, but with AVX512 the vector bools have
> > > > > > > QImode so that doesn't play well here.  Not sure if there
> > > > > > > are proper mask instructions to use (I guess there's a shift
> > > > > > > and lopart is free).  This is QI:8 to two QI:4 (bits) mask
> > > > > Yes, for 16bit and more, we have KUNPCKBW/D/Q. but for 8bit
> > > > > unpack_lo/hi, only shift.
> > > > > > > conversion.  Not sure how to better ask the target here - again
> > > > > > > VnBImode might have been easier here.
> > > > > >
> > > > > > So I've managed to "emulate" the unpack_lo/hi for the case of
> > > > > > !VECTOR_MODE_P masks by using sub-vector select (we're asking
> > > > > > to turn vector(8) <signed-boolean:1> into two
> > > > > > vector(4) <signed-boolean:1>) via BIT_FIELD_REF.  That then
> > > > > > produces the desired single mask producer and
> > > > > >
> > > > > >   loop_mask_38 = VIEW_CONVERT_EXPR<vector(4) <signed-boolean:1>>(loop_mask_54);
> > > > > >   loop_mask_37 = BIT_FIELD_REF <loop_mask_54, 4, 4>;
> > > > > >
> > > > > > note for the lowpart we can just view-convert away the excess bits,
> > > > > > fully re-using the mask.  We generate surprisingly "good" code:
> > > > > >
> > > > > >         kmovb   %k1, %edi
> > > > > >         shrb    $4, %dil
> > > > > >         kmovb   %edi, %k2
> > > > > >
> > > > > > besides the lack of using kshiftrb.  I guess we're just lacking
> > > > > > a mask register alternative for
> > > > > Yes, we can do it similar as kor/kand/kxor.
> > > > > >
> > > > > > (insn 22 20 25 4 (parallel [
> > > > > >             (set (reg:QI 94 [ loop_mask_37 ])
> > > > > >                 (lshiftrt:QI (reg:QI 98 [ loop_mask_54 ])
> > > > > >                     (const_int 4 [0x4])))
> > > > > >             (clobber (reg:CC 17 flags))
> > > > > >         ]) 724 {*lshrqi3_1}
> > > > > >      (expr_list:REG_UNUSED (reg:CC 17 flags)
> > > > > >         (nil)))
> > > > > >
> > > > > > and so we reload.  For the above cited loop the AVX512 vectorization
> > > > > > with --param vect-partial-vector-usage=1 does look quite sensible
> > > > > > to me.  Instead of a SSE vectorized epilogue plus a scalar
> > > > > > epilogue we get a single fully masked AVX512 "iteration" for both.
> > > > > > I suppose it's still mostly a code-size optimization (384 bytes
> > > > > > with the masked epiloge vs. 474 bytes with trunk) since it will
> > > > > > be likely slower for very low iteration counts but it's good
> > > > > > for icache usage then and good for less branch predictor usage.
> > > > > >
> > > > > > That said, I have to set up SPEC on a AVX512 machine to do
> > > > > Does patch  land in trunk already, i can have a test on CLX.
> > > >
> > > > I'm still experimenting a bit right now but hope to get something
> > > > trunk ready at the end of this or beginning next week.  Since it's
> > > > disabled by default we can work on improving it during stage1 then.
> > > >
> > > > I'm mostly struggling with the GIMPLE IL to be used for the
> > > > mask unpacking since we currently reject both the BIT_FIELD_REF
> > > > and the VIEW_CONVERT we generate (why do AVX512 masks not all have
> > > > SImode but sometimes QImode and sometimes HImode ...).  Unfortunately
> > > > we've dropped whole-vector shifts in favor of VEC_PERM but that
> > > > doesn't work well either for integer mode vectors.  So I'm still
> > > > playing with my options here and looking for something that doesn't
> > > > require too much surgery on the RTL side to recover good mask
> > > > register code ...
> > > >
> > > > Another part missing is expanders for the various cond_* patterns
> > > >
> > > > OPTAB_D (cond_add_optab, "cond_add$a")
> > > > OPTAB_D (cond_sub_optab, "cond_sub$a")
> > > > OPTAB_D (cond_smul_optab, "cond_mul$a")
> > > > OPTAB_D (cond_sdiv_optab, "cond_div$a")
> > > > OPTAB_D (cond_smod_optab, "cond_mod$a")
> > > > OPTAB_D (cond_udiv_optab, "cond_udiv$a")
> > > > OPTAB_D (cond_umod_optab, "cond_umod$a")
> > > > OPTAB_D (cond_and_optab, "cond_and$a")
> > > > OPTAB_D (cond_ior_optab, "cond_ior$a")
> > > > OPTAB_D (cond_xor_optab, "cond_xor$a")
> > > > OPTAB_D (cond_ashl_optab, "cond_ashl$a")
> > > > OPTAB_D (cond_ashr_optab, "cond_ashr$a")
> > > > OPTAB_D (cond_lshr_optab, "cond_lshr$a")
> > > > OPTAB_D (cond_smin_optab, "cond_smin$a")
> > > > OPTAB_D (cond_smax_optab, "cond_smax$a")
> > > > OPTAB_D (cond_umin_optab, "cond_umin$a")
> > > > OPTAB_D (cond_umax_optab, "cond_umax$a")
> > > > OPTAB_D (cond_fma_optab, "cond_fma$a")
> > > > OPTAB_D (cond_fms_optab, "cond_fms$a")
> > > > OPTAB_D (cond_fnma_optab, "cond_fnma$a")
> > > > OPTAB_D (cond_fnms_optab, "cond_fnms$a")
> > > >
> > > > I think the most useful are those for possibly trapping ops
> > > > (will be used by if-conversion) and those for reduction operations
> > > > (add,min,max) which would enable a masked reduction epilogue.
> > > I've added cond_add/sub/max/min/smax/smin with my local patch, but I
> > > can't figure out testcases to validate them.
> > > Any ideas?
> >
> > For example
> >
> > double a[1024], b[1024];
> >
> > void foo ()
> > {
> >   for (int i = 0; i < 1024; ++i)
> >     if (b[i] < 3.)
> >       a[i] = b[i] + 3.;
> > }
> >
> Oh, thanks.
> The loop is successfully vectorized with the cond_add expanders:
> 
>         .cfi_startproc
>         vbroadcastsd        .LC1(%rip), %ymm1
>         xorl        %eax, %eax
>         jmp        .L3
>         .p2align 4,,10
>         .p2align 3
> .L2:
>         addq        $32, %rax
>         cmpq        $8192, %rax
>         je        .L9
> .L3:
>         vmovapd        b(%rax), %ymm0
>         vcmppd        $1, %ymm1, %ymm0, %k1
>         kortestb        %k1, %k1
>         je        .L2
>         vaddpd        %ymm1, %ymm0, %ymm2{%k1}{z}
>         vmovapd        %ymm2, a(%rax){%k1}
>         addq        $32, %rax
>         cmpq        $8192, %rax
>         jne        .L3
> .L9:
> vzeroupper
> 
> Here's the dump:
> 
>   vector(4) double * vectp_a.10;
>   vector(4) double * vectp_a.9;
>   vector(4) double vect__2.8;
>   vector(4) <signed-boolean:1> mask__22.7;
>   vector(4) double vect__1.6;
>   vector(4) double * vectp_b.5;
>   vector(4) double * vectp_b.4;
>   int i;
>   double _1;
>   double _2;
>   unsigned int ivtmp_3;
>   unsigned int ivtmp_5;
>   _Bool _11;
>   _Bool _22;
>   double * _23;
>   vector(4) double vect_cst__28;
>   vector(4) double vect_cst__30;
>   vector(4) double vect_cst__31;
>   unsigned int ivtmp_36;
>   unsigned int ivtmp_37;
> 
>   <bb 2> [local count: 10737416]:
>   _11 = 1;
>   vect_cst__28 = { 3.0e+0, 3.0e+0, 3.0e+0, 3.0e+0 };
>   vect_cst__30 = { 3.0e+0, 3.0e+0, 3.0e+0, 3.0e+0 };
>   vect_cst__31 = { 0.0, 0.0, 0.0, 0.0 };
> 
>   <bb 3> [local count: 268435396]:
>   # i_12 = PHI <i_8(7), 0(2)>
>   # ivtmp_5 = PHI <ivtmp_3(7), 1024(2)>
>   # vectp_b.4_25 = PHI <vectp_b.4_26(7), &b(2)>
>   # vectp_a.9_33 = PHI <vectp_a.9_34(7), &a(2)>
>   # ivtmp_36 = PHI <ivtmp_37(7), 0(2)>
>   vect__1.6_27 = MEM <vector(4) double> [(double *)vectp_b.4_25];
>   _1 = b[i_12];
>   mask__22.7_29 = vect__1.6_27 < vect_cst__28;
>   if (mask__22.7_29 == { 0, 0, 0, 0 })
>     goto <bb 20>; [100.00%]
>   else
>     goto <bb 21>; [20.00%]
> 
>   <bb 21> [local count: 53687078]:
>   vect__2.8_32 = .COND_ADD (mask__22.7_29, vect__1.6_27, vect_cst__30,
> vect_cst__31);  <--- Here.
>   .MASK_STORE (vectp_a.9_33, 256B, mask__22.7_29, vect__2.8_32);
> 
>   <bb 20> [local count: 268435396]:
>   i_8 = i_12 + 1;
>   ivtmp_3 = ivtmp_5 - 1;
>   vectp_b.4_26 = vectp_b.4_25 + 32;
>   vectp_a.9_34 = vectp_a.9_33 + 32;
>   ivtmp_37 = ivtmp_36 + 1;
>   if (ivtmp_37 < 256)
>     goto <bb 7>; [96.00%]
>   else
>     goto <bb 17>; [4.00%]
> 
>   <bb 7> [local count: 257697980]:
>   goto <bb 3>; [100.00%]
> 
>   <bb 17> [local count: 10737416]:
>   return;

Looks great!

Thanks,
Richard.

Patch

diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index fc3dab0d143..2214ed11dfb 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -975,11 +975,17 @@  can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
 {
   rgroup_controls *rgm;
   unsigned int i;
+  tree cmp_vectype;
   FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
     if (rgm->type != NULL_TREE
 	&& !direct_internal_fn_supported_p (IFN_WHILE_ULT,
 					    cmp_type, rgm->type,
-					    OPTIMIZE_FOR_SPEED))
+					    OPTIMIZE_FOR_SPEED)
+	&& ((cmp_vectype
+	       = truth_type_for (build_vector_type
+				 (cmp_type, TYPE_VECTOR_SUBPARTS (rgm->type)))),
+	    true)
+	&& !expand_vec_cond_expr_p (rgm->type, cmp_vectype, LT_EXPR))
       return false;
   return true;
 }
diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
index 6a25d661800..216986399b1 100644
--- a/gcc/tree-vect-stmts.c
+++ b/gcc/tree-vect-stmts.c
@@ -12007,16 +12007,46 @@  vect_gen_while (gimple_seq *seq, tree mask_type, tree start_index,
 		tree end_index, const char *name)
 {
   tree cmp_type = TREE_TYPE (start_index);
-  gcc_checking_assert (direct_internal_fn_supported_p (IFN_WHILE_ULT,
-						       cmp_type, mask_type,
-						       OPTIMIZE_FOR_SPEED));
-  gcall *call = gimple_build_call_internal (IFN_WHILE_ULT, 3,
-					    start_index, end_index,
-					    build_zero_cst (mask_type));
-  tree tmp = make_temp_ssa_name (mask_type, NULL, name);
-  gimple_call_set_lhs (call, tmp);
-  gimple_seq_add_stmt (seq, call);
-  return tmp;
+  if (direct_internal_fn_supported_p (IFN_WHILE_ULT,
+				      cmp_type, mask_type,
+				      OPTIMIZE_FOR_SPEED))
+    {
+      gcall *call = gimple_build_call_internal (IFN_WHILE_ULT, 3,
+						start_index, end_index,
+						build_zero_cst (mask_type));
+      tree tmp = make_temp_ssa_name (mask_type, NULL, name);
+      gimple_call_set_lhs (call, tmp);
+      gimple_seq_add_stmt (seq, call);
+      return tmp;
+    }
+  else
+    {
+      /* Generate
+	   _1 = { start_index, start_index, ... };
+	   _2 = { end_index, end_index, ... };
+	   _3 = _1 + { 0, 1, 2 ... };
+	   _4 = _3 < _2;
+	   _5 = VEC_COND_EXPR <_4, { -1, -1, ... } : { 0, 0, ... }>;   */
+      tree cvectype = build_vector_type (cmp_type,
+					 TYPE_VECTOR_SUBPARTS (mask_type));
+      tree si = make_ssa_name (cvectype);
+      gassign *ass = gimple_build_assign
+			(si, build_vector_from_val (cvectype, start_index));
+      gimple_seq_add_stmt (seq, ass);
+      tree ei = make_ssa_name (cvectype);
+      ass = gimple_build_assign (ei,
+				 build_vector_from_val (cvectype, end_index));
+      gimple_seq_add_stmt (seq, ass);
+      tree incr = build_vec_series (cvectype, build_zero_cst (cmp_type),
+				    build_one_cst (cmp_type));
+      si = gimple_build (seq, PLUS_EXPR, cvectype, si, incr);
+      tree cmp = gimple_build (seq, LT_EXPR, truth_type_for (cvectype),
+			       si, ei);
+      tree mask = gimple_build (seq, VEC_COND_EXPR, mask_type, cmp,
+				build_all_ones_cst (mask_type),
+				build_zero_cst (mask_type));
+      return mask;
+    }
 }
 
 /* Generate a vector mask of type MASK_TYPE for which index I is false iff