diff mbox series

RISC-V: Support POPCOUNT auto-vectorization

Message ID 20230731120120.651197-1-juzhe.zhong@rivai.ai
State New
Headers show
Series RISC-V: Support POPCOUNT auto-vectorization | expand

Commit Message

juzhe.zhong@rivai.ai July 31, 2023, 12:01 p.m. UTC
This patch is inspired by "lowerCTPOP" in LLVM.
Support popcount auto-vectorization by following LLVM approach.
https://godbolt.org/z/3K3GzvY7f

Before this patch:

<source>:7:21: missed: couldn't vectorize loop
<source>:8:14: missed: not vectorized: relevant stmt not supported: _5 = __builtin_popcount (_4);

After this patch:

popcount_32:
	ble	a2,zero,.L5
	li	t3,1431654400
	li	a7,858992640
	li	t1,252645376
	li	a6,16711680
	li	a3,65536
	addiw	t3,t3,1365
	addiw	a7,a7,819
	addiw	t1,t1,-241
	addiw	a6,a6,255
	addiw	a3,a3,-1
.L3:
	vsetvli	a5,a2,e8,mf4,ta,ma
	vle32.v	v1,0(a1)
	vsetivli	zero,4,e32,m1,ta,ma
	vsrl.vi	v2,v1,1
	vand.vx	v2,v2,t3
	vsub.vv	v1,v1,v2
	vsrl.vi	v2,v1,2
	vand.vx	v2,v2,a7
	vand.vx	v1,v1,a7
	vadd.vv	v1,v1,v2
	vsrl.vi	v2,v1,4
	vadd.vv	v1,v1,v2
	vand.vx	v1,v1,t1
	vsrl.vi	v2,v1,8
	vand.vx	v2,v2,a6
	slli	a4,a5,2
	vand.vx	v1,v1,a6
	vadd.vv	v1,v1,v2
	vsrl.vi	v2,v1,16
	vand.vx	v1,v1,a3
	vand.vx	v2,v2,a3
	vadd.vv	v1,v1,v2
	vmv.v.v	v1,v1
	vsetvli	zero,a2,e32,m1,ta,ma
	sub	a2,a2,a5
	vse32.v	v1,0(a0)
	add	a1,a1,a4
	add	a0,a0,a4
	bne	a2,zero,.L3
.L5:
	ret

gcc/ChangeLog:

	* config/riscv/autovec.md (popcount<mode>2): New pattern.
	* config/riscv/riscv-protos.h (expand_popcount): New function.
	* config/riscv/riscv-v.cc (expand_popcount): Ditto.

gcc/testsuite/ChangeLog:

	* gcc.target/riscv/rvv/autovec/widen/popcount-1.c: New test.
	* gcc.target/riscv/rvv/autovec/widen/popcount_run-1.c: New test.

---
 gcc/config/riscv/autovec.md                   | 13 +++
 gcc/config/riscv/riscv-protos.h               |  1 +
 gcc/config/riscv/riscv-v.cc                   | 95 +++++++++++++++++++
 .../riscv/rvv/autovec/widen/popcount-1.c      | 23 +++++
 .../riscv/rvv/autovec/widen/popcount_run-1.c  | 50 ++++++++++
 5 files changed, 182 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/widen/popcount-1.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/widen/popcount_run-1.c

Comments

Kito Cheng July 31, 2023, 1:38 p.m. UTC | #1
On Mon, Jul 31, 2023 at 8:03 PM Juzhe-Zhong <juzhe.zhong@rivai.ai> wrote:
>
> This patch is inspired by "lowerCTPOP" in LLVM.
> Support popcount auto-vectorization by following LLVM approach.
> https://godbolt.org/z/3K3GzvY7f
>
> Before this patch:
>
> <source>:7:21: missed: couldn't vectorize loop
> <source>:8:14: missed: not vectorized: relevant stmt not supported: _5 = __builtin_popcount (_4);
>
> After this patch:
>
> popcount_32:
>         ble     a2,zero,.L5
>         li      t3,1431654400
>         li      a7,858992640
>         li      t1,252645376
>         li      a6,16711680
>         li      a3,65536
>         addiw   t3,t3,1365
>         addiw   a7,a7,819
>         addiw   t1,t1,-241
>         addiw   a6,a6,255
>         addiw   a3,a3,-1
> .L3:
>         vsetvli a5,a2,e8,mf4,ta,ma
>         vle32.v v1,0(a1)
>         vsetivli        zero,4,e32,m1,ta,ma
>         vsrl.vi v2,v1,1
>         vand.vx v2,v2,t3
>         vsub.vv v1,v1,v2
>         vsrl.vi v2,v1,2
>         vand.vx v2,v2,a7
>         vand.vx v1,v1,a7
>         vadd.vv v1,v1,v2
>         vsrl.vi v2,v1,4
>         vadd.vv v1,v1,v2
>         vand.vx v1,v1,t1
>         vsrl.vi v2,v1,8
>         vand.vx v2,v2,a6
>         slli    a4,a5,2
>         vand.vx v1,v1,a6
>         vadd.vv v1,v1,v2
>         vsrl.vi v2,v1,16
>         vand.vx v1,v1,a3
>         vand.vx v2,v2,a3
>         vadd.vv v1,v1,v2
>         vmv.v.v v1,v1
>         vsetvli zero,a2,e32,m1,ta,ma
>         sub     a2,a2,a5
>         vse32.v v1,0(a0)
>         add     a1,a1,a4
>         add     a0,a0,a4
>         bne     a2,zero,.L3
> .L5:
>         ret
>
> gcc/ChangeLog:
>
>         * config/riscv/autovec.md (popcount<mode>2): New pattern.
>         * config/riscv/riscv-protos.h (expand_popcount): New function.
>         * config/riscv/riscv-v.cc (expand_popcount): Ditto.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/riscv/rvv/autovec/widen/popcount-1.c: New test.
>         * gcc.target/riscv/rvv/autovec/widen/popcount_run-1.c: New test.
>
> ---
>  gcc/config/riscv/autovec.md                   | 13 +++
>  gcc/config/riscv/riscv-protos.h               |  1 +
>  gcc/config/riscv/riscv-v.cc                   | 95 +++++++++++++++++++
>  .../riscv/rvv/autovec/widen/popcount-1.c      | 23 +++++
>  .../riscv/rvv/autovec/widen/popcount_run-1.c  | 50 ++++++++++
>  5 files changed, 182 insertions(+)
>  create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/widen/popcount-1.c
>  create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/widen/popcount_run-1.c
>
> diff --git a/gcc/config/riscv/autovec.md b/gcc/config/riscv/autovec.md
> index b5152bc91fd..9d32b91bdca 100644
> --- a/gcc/config/riscv/autovec.md
> +++ b/gcc/config/riscv/autovec.md
> @@ -922,6 +922,19 @@
>    DONE;
>  })
>
> +;; -------------------------------------------------------------------------------
> +;; - [INT] POPCOUNT.
> +;; -------------------------------------------------------------------------------
> +
> +(define_expand "popcount<mode>2"
> +  [(match_operand:VI 0 "register_operand")
> +   (match_operand:VI 1 "register_operand")]
> +  "TARGET_VECTOR"
> +{
> +  riscv_vector::expand_popcount (operands);
> +  DONE;
> +})
> +
>  ;; -------------------------------------------------------------------------------
>  ;; ---- [FP] Unary operations
>  ;; -------------------------------------------------------------------------------
> diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
> index a729db44c32..ae40fbb4b53 100644
> --- a/gcc/config/riscv/riscv-protos.h
> +++ b/gcc/config/riscv/riscv-protos.h
> @@ -321,6 +321,7 @@ void expand_select_vl (rtx *);
>  void expand_load_store (rtx *, bool);
>  void expand_gather_scatter (rtx *, bool);
>  void expand_cond_len_ternop (unsigned, rtx *);
> +void expand_popcount (rtx *);
>
>  /* Rounding mode bitfield for fixed point VXRM.  */
>  enum fixed_point_rounding_mode
> diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc
> index c10e51b362e..b3caa4b188d 100644
> --- a/gcc/config/riscv/riscv-v.cc
> +++ b/gcc/config/riscv/riscv-v.cc
> @@ -3614,4 +3614,99 @@ expand_reduction (rtx_code code, rtx *ops, rtx init, reduction_type type)
>    emit_insn (gen_pred_extract_first (m1_mode, ops[0], m1_tmp2));
>  }
>
> +/* Expand Vector POPCOUNT by parallel popcnt:
> +
> +   int parallel_popcnt(uint32_t n) {
> +   #define POW2(c)      (1U << (c))
> +   #define MASK(c)      (static_cast<uint32_t>(-1) / (POW2(POW2(c)) + 1U))
> +   #define COUNT(x, c)  ((x) & MASK(c)) + (((x)>>(POW2(c))) & MASK(c))
> +       n = COUNT(n, 0);
> +       n = COUNT(n, 1);
> +       n = COUNT(n, 2);
> +       n = COUNT(n, 3);
> +       n = COUNT(n, 4);
> +   //  n = COUNT(n, 5);  // uncomment this line for 64-bit integers
> +       return n;
> +   #undef COUNT
> +   #undef MASK
> +   #undef POW2
> +   }
> +*/
> +void
> +expand_popcount (rtx *ops)
> +{
> +  rtx dst = ops[0];
> +  rtx src = ops[1];
> +  machine_mode mode = GET_MODE (dst);
> +  scalar_mode smode = GET_MODE_INNER (mode);
> +  static const uint64_t mask_values[6]
> +    = {0x5555555555555555ULL, 0x3333333333333333ULL, 0x0F0F0F0F0F0F0F0FULL,
> +       0x00FF00FF00FF00FFULL, 0x0000FFFF0000FFFFULL, 0x00000000FFFFFFFFULL};
> +
> +  unsigned bit_size = GET_MODE_BITSIZE (smode);

bit_size is always euqal or less than 64 since we don't have TI mode
for vector modes.

> +  unsigned word_size
> +    = (bit_size + LONG_LONG_TYPE_SIZE - 1) / LONG_LONG_TYPE_SIZE;

So this is always 1, or you were trying to handle rv32 with ELEN64 here?

> +  rtx count = CONST0_RTX (mode);
> +
> +  for (unsigned n = 0; n < word_size; ++n)

Drop outer loop if word_size never larger than 1?

> +    {
> +      rtx part_value = src;
> +      for (unsigned i = 1, ct = 0;
> +          i
> +          < (bit_size > LONG_LONG_TYPE_SIZE ? LONG_LONG_TYPE_SIZE : bit_size);

Just bit_size should be fine since LONG_LONG_TYPE_SIZE is constant 64
and bit_size <=64?

> +          i <<= 1, ++ct)
> +       {
> +         rtx mask_cst = gen_int_mode (mask_values[ct], smode);
> +
> +         rtx vshift = expand_binop (mode, lshr_optab, part_value,
> +                                    gen_int_mode (i, smode), NULL_RTX, true,
> +                                    OPTAB_DIRECT);
> +
> +         if (i == 4)
> +           {
> +             /* Optimize ((X & MASK) + ((X >> 4) & MASK))
> +
> +                -> (X + (X >> 4)) & MASK  */
> +             rtx rhs = expand_binop (mode, add_optab, part_value, vshift,
> +                                     NULL_RTX, false, OPTAB_DIRECT);
> +             part_value = gen_reg_rtx (mode);
> +             rtx part_ops[] = {part_value, rhs, mask_cst};
> +             emit_vlmax_insn (code_for_pred_scalar (AND, mode), RVV_BINOP,
> +                              part_ops);
> +           }
> +         else
> +           {
> +             rtx rhs = gen_reg_rtx (mode);
> +             rtx rhs_ops[] = {rhs, vshift, mask_cst};
> +             emit_vlmax_insn (code_for_pred_scalar (AND, mode), RVV_BINOP,
> +                              rhs_ops);
> +             if (i == 1)
> +               part_value = expand_binop (mode, sub_optab, part_value, rhs,
> +                                          NULL_RTX, false, OPTAB_DIRECT);
> +             else
> +               {
> +                 rtx lhs = gen_reg_rtx (mode);
> +                 rtx lhs_ops[] = {lhs, part_value, mask_cst};
> +                 emit_vlmax_insn (code_for_pred_scalar (AND, mode), RVV_BINOP,
> +                                  lhs_ops);
> +
> +                 part_value = expand_binop (mode, add_optab, lhs, rhs,
> +                                            NULL_RTX, false, OPTAB_DIRECT);
> +               }
> +           }
> +       }
> +
> +      count = expand_binop (mode, add_optab, part_value, count, NULL_RTX, false,
> +                           OPTAB_DIRECT);
> +      if (bit_size > LONG_LONG_TYPE_SIZE)

No need this if word_size is constant 1.

> +       {
> +         src = expand_binop (mode, lshr_optab, src,
> +                             gen_int_mode (LONG_LONG_TYPE_SIZE, smode),
> +                             NULL_RTX, true, OPTAB_DIRECT);
> +         bit_size -= LONG_LONG_TYPE_SIZE;
> +       }
> +    }
> +  emit_move_insn (dst, count);
> +}
> +
>  } // namespace riscv_vector
>
juzhe.zhong@rivai.ai July 31, 2023, 1:49 p.m. UTC | #2
>> Drop outer loop if word_size never larger than 1?
Yeah. we never have TI vector modes for now.

The codes I just directly copy from LLVM in generic intrinsic handling.... :)
since LLVM generic code is considering handling INT128 vector

I will remove all redundant code for INT128 vector mode in V2.
Thanks.


juzhe.zhong@rivai.ai
 
From: Kito Cheng
Date: 2023-07-31 21:38
To: Juzhe-Zhong
CC: gcc-patches; kito.cheng; jeffreyalaw; rdapp.gcc
Subject: Re: [PATCH] RISC-V: Support POPCOUNT auto-vectorization
On Mon, Jul 31, 2023 at 8:03 PM Juzhe-Zhong <juzhe.zhong@rivai.ai> wrote:
>
> This patch is inspired by "lowerCTPOP" in LLVM.
> Support popcount auto-vectorization by following LLVM approach.
> https://godbolt.org/z/3K3GzvY7f
>
> Before this patch:
>
> <source>:7:21: missed: couldn't vectorize loop
> <source>:8:14: missed: not vectorized: relevant stmt not supported: _5 = __builtin_popcount (_4);
>
> After this patch:
>
> popcount_32:
>         ble     a2,zero,.L5
>         li      t3,1431654400
>         li      a7,858992640
>         li      t1,252645376
>         li      a6,16711680
>         li      a3,65536
>         addiw   t3,t3,1365
>         addiw   a7,a7,819
>         addiw   t1,t1,-241
>         addiw   a6,a6,255
>         addiw   a3,a3,-1
> .L3:
>         vsetvli a5,a2,e8,mf4,ta,ma
>         vle32.v v1,0(a1)
>         vsetivli        zero,4,e32,m1,ta,ma
>         vsrl.vi v2,v1,1
>         vand.vx v2,v2,t3
>         vsub.vv v1,v1,v2
>         vsrl.vi v2,v1,2
>         vand.vx v2,v2,a7
>         vand.vx v1,v1,a7
>         vadd.vv v1,v1,v2
>         vsrl.vi v2,v1,4
>         vadd.vv v1,v1,v2
>         vand.vx v1,v1,t1
>         vsrl.vi v2,v1,8
>         vand.vx v2,v2,a6
>         slli    a4,a5,2
>         vand.vx v1,v1,a6
>         vadd.vv v1,v1,v2
>         vsrl.vi v2,v1,16
>         vand.vx v1,v1,a3
>         vand.vx v2,v2,a3
>         vadd.vv v1,v1,v2
>         vmv.v.v v1,v1
>         vsetvli zero,a2,e32,m1,ta,ma
>         sub     a2,a2,a5
>         vse32.v v1,0(a0)
>         add     a1,a1,a4
>         add     a0,a0,a4
>         bne     a2,zero,.L3
> .L5:
>         ret
>
> gcc/ChangeLog:
>
>         * config/riscv/autovec.md (popcount<mode>2): New pattern.
>         * config/riscv/riscv-protos.h (expand_popcount): New function.
>         * config/riscv/riscv-v.cc (expand_popcount): Ditto.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/riscv/rvv/autovec/widen/popcount-1.c: New test.
>         * gcc.target/riscv/rvv/autovec/widen/popcount_run-1.c: New test.
>
> ---
>  gcc/config/riscv/autovec.md                   | 13 +++
>  gcc/config/riscv/riscv-protos.h               |  1 +
>  gcc/config/riscv/riscv-v.cc                   | 95 +++++++++++++++++++
>  .../riscv/rvv/autovec/widen/popcount-1.c      | 23 +++++
>  .../riscv/rvv/autovec/widen/popcount_run-1.c  | 50 ++++++++++
>  5 files changed, 182 insertions(+)
>  create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/widen/popcount-1.c
>  create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/widen/popcount_run-1.c
>
> diff --git a/gcc/config/riscv/autovec.md b/gcc/config/riscv/autovec.md
> index b5152bc91fd..9d32b91bdca 100644
> --- a/gcc/config/riscv/autovec.md
> +++ b/gcc/config/riscv/autovec.md
> @@ -922,6 +922,19 @@
>    DONE;
>  })
>
> +;; -------------------------------------------------------------------------------
> +;; - [INT] POPCOUNT.
> +;; -------------------------------------------------------------------------------
> +
> +(define_expand "popcount<mode>2"
> +  [(match_operand:VI 0 "register_operand")
> +   (match_operand:VI 1 "register_operand")]
> +  "TARGET_VECTOR"
> +{
> +  riscv_vector::expand_popcount (operands);
> +  DONE;
> +})
> +
>  ;; -------------------------------------------------------------------------------
>  ;; ---- [FP] Unary operations
>  ;; -------------------------------------------------------------------------------
> diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
> index a729db44c32..ae40fbb4b53 100644
> --- a/gcc/config/riscv/riscv-protos.h
> +++ b/gcc/config/riscv/riscv-protos.h
> @@ -321,6 +321,7 @@ void expand_select_vl (rtx *);
>  void expand_load_store (rtx *, bool);
>  void expand_gather_scatter (rtx *, bool);
>  void expand_cond_len_ternop (unsigned, rtx *);
> +void expand_popcount (rtx *);
>
>  /* Rounding mode bitfield for fixed point VXRM.  */
>  enum fixed_point_rounding_mode
> diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc
> index c10e51b362e..b3caa4b188d 100644
> --- a/gcc/config/riscv/riscv-v.cc
> +++ b/gcc/config/riscv/riscv-v.cc
> @@ -3614,4 +3614,99 @@ expand_reduction (rtx_code code, rtx *ops, rtx init, reduction_type type)
>    emit_insn (gen_pred_extract_first (m1_mode, ops[0], m1_tmp2));
>  }
>
> +/* Expand Vector POPCOUNT by parallel popcnt:
> +
> +   int parallel_popcnt(uint32_t n) {
> +   #define POW2(c)      (1U << (c))
> +   #define MASK(c)      (static_cast<uint32_t>(-1) / (POW2(POW2(c)) + 1U))
> +   #define COUNT(x, c)  ((x) & MASK(c)) + (((x)>>(POW2(c))) & MASK(c))
> +       n = COUNT(n, 0);
> +       n = COUNT(n, 1);
> +       n = COUNT(n, 2);
> +       n = COUNT(n, 3);
> +       n = COUNT(n, 4);
> +   //  n = COUNT(n, 5);  // uncomment this line for 64-bit integers
> +       return n;
> +   #undef COUNT
> +   #undef MASK
> +   #undef POW2
> +   }
> +*/
> +void
> +expand_popcount (rtx *ops)
> +{
> +  rtx dst = ops[0];
> +  rtx src = ops[1];
> +  machine_mode mode = GET_MODE (dst);
> +  scalar_mode smode = GET_MODE_INNER (mode);
> +  static const uint64_t mask_values[6]
> +    = {0x5555555555555555ULL, 0x3333333333333333ULL, 0x0F0F0F0F0F0F0F0FULL,
> +       0x00FF00FF00FF00FFULL, 0x0000FFFF0000FFFFULL, 0x00000000FFFFFFFFULL};
> +
> +  unsigned bit_size = GET_MODE_BITSIZE (smode);
 
bit_size is always euqal or less than 64 since we don't have TI mode
for vector modes.
 
> +  unsigned word_size
> +    = (bit_size + LONG_LONG_TYPE_SIZE - 1) / LONG_LONG_TYPE_SIZE;
 
So this is always 1, or you were trying to handle rv32 with ELEN64 here?
 
> +  rtx count = CONST0_RTX (mode);
> +
> +  for (unsigned n = 0; n < word_size; ++n)
 
Drop outer loop if word_size never larger than 1?
 
> +    {
> +      rtx part_value = src;
> +      for (unsigned i = 1, ct = 0;
> +          i
> +          < (bit_size > LONG_LONG_TYPE_SIZE ? LONG_LONG_TYPE_SIZE : bit_size);
 
Just bit_size should be fine since LONG_LONG_TYPE_SIZE is constant 64
and bit_size <=64?
 
> +          i <<= 1, ++ct)
> +       {
> +         rtx mask_cst = gen_int_mode (mask_values[ct], smode);
> +
> +         rtx vshift = expand_binop (mode, lshr_optab, part_value,
> +                                    gen_int_mode (i, smode), NULL_RTX, true,
> +                                    OPTAB_DIRECT);
> +
> +         if (i == 4)
> +           {
> +             /* Optimize ((X & MASK) + ((X >> 4) & MASK))
> +
> +                -> (X + (X >> 4)) & MASK  */
> +             rtx rhs = expand_binop (mode, add_optab, part_value, vshift,
> +                                     NULL_RTX, false, OPTAB_DIRECT);
> +             part_value = gen_reg_rtx (mode);
> +             rtx part_ops[] = {part_value, rhs, mask_cst};
> +             emit_vlmax_insn (code_for_pred_scalar (AND, mode), RVV_BINOP,
> +                              part_ops);
> +           }
> +         else
> +           {
> +             rtx rhs = gen_reg_rtx (mode);
> +             rtx rhs_ops[] = {rhs, vshift, mask_cst};
> +             emit_vlmax_insn (code_for_pred_scalar (AND, mode), RVV_BINOP,
> +                              rhs_ops);
> +             if (i == 1)
> +               part_value = expand_binop (mode, sub_optab, part_value, rhs,
> +                                          NULL_RTX, false, OPTAB_DIRECT);
> +             else
> +               {
> +                 rtx lhs = gen_reg_rtx (mode);
> +                 rtx lhs_ops[] = {lhs, part_value, mask_cst};
> +                 emit_vlmax_insn (code_for_pred_scalar (AND, mode), RVV_BINOP,
> +                                  lhs_ops);
> +
> +                 part_value = expand_binop (mode, add_optab, lhs, rhs,
> +                                            NULL_RTX, false, OPTAB_DIRECT);
> +               }
> +           }
> +       }
> +
> +      count = expand_binop (mode, add_optab, part_value, count, NULL_RTX, false,
> +                           OPTAB_DIRECT);
> +      if (bit_size > LONG_LONG_TYPE_SIZE)
 
No need this if word_size is constant 1.
 
> +       {
> +         src = expand_binop (mode, lshr_optab, src,
> +                             gen_int_mode (LONG_LONG_TYPE_SIZE, smode),
> +                             NULL_RTX, true, OPTAB_DIRECT);
> +         bit_size -= LONG_LONG_TYPE_SIZE;
> +       }
> +    }
> +  emit_move_insn (dst, count);
> +}
> +
>  } // namespace riscv_vector
>
juzhe.zhong@rivai.ai July 31, 2023, 2:31 p.m. UTC | #3
Address comment and fix on V2:
https://gcc.gnu.org/pipermail/gcc-patches/2023-July/625870.html 
Ok for trunk?



juzhe.zhong@rivai.ai
 
From: Kito Cheng
Date: 2023-07-31 21:38
To: Juzhe-Zhong
CC: gcc-patches; kito.cheng; jeffreyalaw; rdapp.gcc
Subject: Re: [PATCH] RISC-V: Support POPCOUNT auto-vectorization
On Mon, Jul 31, 2023 at 8:03 PM Juzhe-Zhong <juzhe.zhong@rivai.ai> wrote:
>
> This patch is inspired by "lowerCTPOP" in LLVM.
> Support popcount auto-vectorization by following LLVM approach.
> https://godbolt.org/z/3K3GzvY7f
>
> Before this patch:
>
> <source>:7:21: missed: couldn't vectorize loop
> <source>:8:14: missed: not vectorized: relevant stmt not supported: _5 = __builtin_popcount (_4);
>
> After this patch:
>
> popcount_32:
>         ble     a2,zero,.L5
>         li      t3,1431654400
>         li      a7,858992640
>         li      t1,252645376
>         li      a6,16711680
>         li      a3,65536
>         addiw   t3,t3,1365
>         addiw   a7,a7,819
>         addiw   t1,t1,-241
>         addiw   a6,a6,255
>         addiw   a3,a3,-1
> .L3:
>         vsetvli a5,a2,e8,mf4,ta,ma
>         vle32.v v1,0(a1)
>         vsetivli        zero,4,e32,m1,ta,ma
>         vsrl.vi v2,v1,1
>         vand.vx v2,v2,t3
>         vsub.vv v1,v1,v2
>         vsrl.vi v2,v1,2
>         vand.vx v2,v2,a7
>         vand.vx v1,v1,a7
>         vadd.vv v1,v1,v2
>         vsrl.vi v2,v1,4
>         vadd.vv v1,v1,v2
>         vand.vx v1,v1,t1
>         vsrl.vi v2,v1,8
>         vand.vx v2,v2,a6
>         slli    a4,a5,2
>         vand.vx v1,v1,a6
>         vadd.vv v1,v1,v2
>         vsrl.vi v2,v1,16
>         vand.vx v1,v1,a3
>         vand.vx v2,v2,a3
>         vadd.vv v1,v1,v2
>         vmv.v.v v1,v1
>         vsetvli zero,a2,e32,m1,ta,ma
>         sub     a2,a2,a5
>         vse32.v v1,0(a0)
>         add     a1,a1,a4
>         add     a0,a0,a4
>         bne     a2,zero,.L3
> .L5:
>         ret
>
> gcc/ChangeLog:
>
>         * config/riscv/autovec.md (popcount<mode>2): New pattern.
>         * config/riscv/riscv-protos.h (expand_popcount): New function.
>         * config/riscv/riscv-v.cc (expand_popcount): Ditto.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/riscv/rvv/autovec/widen/popcount-1.c: New test.
>         * gcc.target/riscv/rvv/autovec/widen/popcount_run-1.c: New test.
>
> ---
>  gcc/config/riscv/autovec.md                   | 13 +++
>  gcc/config/riscv/riscv-protos.h               |  1 +
>  gcc/config/riscv/riscv-v.cc                   | 95 +++++++++++++++++++
>  .../riscv/rvv/autovec/widen/popcount-1.c      | 23 +++++
>  .../riscv/rvv/autovec/widen/popcount_run-1.c  | 50 ++++++++++
>  5 files changed, 182 insertions(+)
>  create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/widen/popcount-1.c
>  create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/widen/popcount_run-1.c
>
> diff --git a/gcc/config/riscv/autovec.md b/gcc/config/riscv/autovec.md
> index b5152bc91fd..9d32b91bdca 100644
> --- a/gcc/config/riscv/autovec.md
> +++ b/gcc/config/riscv/autovec.md
> @@ -922,6 +922,19 @@
>    DONE;
>  })
>
> +;; -------------------------------------------------------------------------------
> +;; - [INT] POPCOUNT.
> +;; -------------------------------------------------------------------------------
> +
> +(define_expand "popcount<mode>2"
> +  [(match_operand:VI 0 "register_operand")
> +   (match_operand:VI 1 "register_operand")]
> +  "TARGET_VECTOR"
> +{
> +  riscv_vector::expand_popcount (operands);
> +  DONE;
> +})
> +
>  ;; -------------------------------------------------------------------------------
>  ;; ---- [FP] Unary operations
>  ;; -------------------------------------------------------------------------------
> diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
> index a729db44c32..ae40fbb4b53 100644
> --- a/gcc/config/riscv/riscv-protos.h
> +++ b/gcc/config/riscv/riscv-protos.h
> @@ -321,6 +321,7 @@ void expand_select_vl (rtx *);
>  void expand_load_store (rtx *, bool);
>  void expand_gather_scatter (rtx *, bool);
>  void expand_cond_len_ternop (unsigned, rtx *);
> +void expand_popcount (rtx *);
>
>  /* Rounding mode bitfield for fixed point VXRM.  */
>  enum fixed_point_rounding_mode
> diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc
> index c10e51b362e..b3caa4b188d 100644
> --- a/gcc/config/riscv/riscv-v.cc
> +++ b/gcc/config/riscv/riscv-v.cc
> @@ -3614,4 +3614,99 @@ expand_reduction (rtx_code code, rtx *ops, rtx init, reduction_type type)
>    emit_insn (gen_pred_extract_first (m1_mode, ops[0], m1_tmp2));
>  }
>
> +/* Expand Vector POPCOUNT by parallel popcnt:
> +
> +   int parallel_popcnt(uint32_t n) {
> +   #define POW2(c)      (1U << (c))
> +   #define MASK(c)      (static_cast<uint32_t>(-1) / (POW2(POW2(c)) + 1U))
> +   #define COUNT(x, c)  ((x) & MASK(c)) + (((x)>>(POW2(c))) & MASK(c))
> +       n = COUNT(n, 0);
> +       n = COUNT(n, 1);
> +       n = COUNT(n, 2);
> +       n = COUNT(n, 3);
> +       n = COUNT(n, 4);
> +   //  n = COUNT(n, 5);  // uncomment this line for 64-bit integers
> +       return n;
> +   #undef COUNT
> +   #undef MASK
> +   #undef POW2
> +   }
> +*/
> +void
> +expand_popcount (rtx *ops)
> +{
> +  rtx dst = ops[0];
> +  rtx src = ops[1];
> +  machine_mode mode = GET_MODE (dst);
> +  scalar_mode smode = GET_MODE_INNER (mode);
> +  static const uint64_t mask_values[6]
> +    = {0x5555555555555555ULL, 0x3333333333333333ULL, 0x0F0F0F0F0F0F0F0FULL,
> +       0x00FF00FF00FF00FFULL, 0x0000FFFF0000FFFFULL, 0x00000000FFFFFFFFULL};
> +
> +  unsigned bit_size = GET_MODE_BITSIZE (smode);
 
bit_size is always euqal or less than 64 since we don't have TI mode
for vector modes.
 
> +  unsigned word_size
> +    = (bit_size + LONG_LONG_TYPE_SIZE - 1) / LONG_LONG_TYPE_SIZE;
 
So this is always 1, or you were trying to handle rv32 with ELEN64 here?
 
> +  rtx count = CONST0_RTX (mode);
> +
> +  for (unsigned n = 0; n < word_size; ++n)
 
Drop outer loop if word_size never larger than 1?
 
> +    {
> +      rtx part_value = src;
> +      for (unsigned i = 1, ct = 0;
> +          i
> +          < (bit_size > LONG_LONG_TYPE_SIZE ? LONG_LONG_TYPE_SIZE : bit_size);
 
Just bit_size should be fine since LONG_LONG_TYPE_SIZE is constant 64
and bit_size <=64?
 
> +          i <<= 1, ++ct)
> +       {
> +         rtx mask_cst = gen_int_mode (mask_values[ct], smode);
> +
> +         rtx vshift = expand_binop (mode, lshr_optab, part_value,
> +                                    gen_int_mode (i, smode), NULL_RTX, true,
> +                                    OPTAB_DIRECT);
> +
> +         if (i == 4)
> +           {
> +             /* Optimize ((X & MASK) + ((X >> 4) & MASK))
> +
> +                -> (X + (X >> 4)) & MASK  */
> +             rtx rhs = expand_binop (mode, add_optab, part_value, vshift,
> +                                     NULL_RTX, false, OPTAB_DIRECT);
> +             part_value = gen_reg_rtx (mode);
> +             rtx part_ops[] = {part_value, rhs, mask_cst};
> +             emit_vlmax_insn (code_for_pred_scalar (AND, mode), RVV_BINOP,
> +                              part_ops);
> +           }
> +         else
> +           {
> +             rtx rhs = gen_reg_rtx (mode);
> +             rtx rhs_ops[] = {rhs, vshift, mask_cst};
> +             emit_vlmax_insn (code_for_pred_scalar (AND, mode), RVV_BINOP,
> +                              rhs_ops);
> +             if (i == 1)
> +               part_value = expand_binop (mode, sub_optab, part_value, rhs,
> +                                          NULL_RTX, false, OPTAB_DIRECT);
> +             else
> +               {
> +                 rtx lhs = gen_reg_rtx (mode);
> +                 rtx lhs_ops[] = {lhs, part_value, mask_cst};
> +                 emit_vlmax_insn (code_for_pred_scalar (AND, mode), RVV_BINOP,
> +                                  lhs_ops);
> +
> +                 part_value = expand_binop (mode, add_optab, lhs, rhs,
> +                                            NULL_RTX, false, OPTAB_DIRECT);
> +               }
> +           }
> +       }
> +
> +      count = expand_binop (mode, add_optab, part_value, count, NULL_RTX, false,
> +                           OPTAB_DIRECT);
> +      if (bit_size > LONG_LONG_TYPE_SIZE)
 
No need this if word_size is constant 1.
 
> +       {
> +         src = expand_binop (mode, lshr_optab, src,
> +                             gen_int_mode (LONG_LONG_TYPE_SIZE, smode),
> +                             NULL_RTX, true, OPTAB_DIRECT);
> +         bit_size -= LONG_LONG_TYPE_SIZE;
> +       }
> +    }
> +  emit_move_insn (dst, count);
> +}
> +
>  } // namespace riscv_vector
>
diff mbox series

Patch

diff --git a/gcc/config/riscv/autovec.md b/gcc/config/riscv/autovec.md
index b5152bc91fd..9d32b91bdca 100644
--- a/gcc/config/riscv/autovec.md
+++ b/gcc/config/riscv/autovec.md
@@ -922,6 +922,19 @@ 
   DONE;
 })
 
+;; -------------------------------------------------------------------------------
+;; - [INT] POPCOUNT.
+;; -------------------------------------------------------------------------------
+
+(define_expand "popcount<mode>2"
+  [(match_operand:VI 0 "register_operand")
+   (match_operand:VI 1 "register_operand")]
+  "TARGET_VECTOR"
+{
+  riscv_vector::expand_popcount (operands);
+  DONE;
+})
+
 ;; -------------------------------------------------------------------------------
 ;; ---- [FP] Unary operations
 ;; -------------------------------------------------------------------------------
diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
index a729db44c32..ae40fbb4b53 100644
--- a/gcc/config/riscv/riscv-protos.h
+++ b/gcc/config/riscv/riscv-protos.h
@@ -321,6 +321,7 @@  void expand_select_vl (rtx *);
 void expand_load_store (rtx *, bool);
 void expand_gather_scatter (rtx *, bool);
 void expand_cond_len_ternop (unsigned, rtx *);
+void expand_popcount (rtx *);
 
 /* Rounding mode bitfield for fixed point VXRM.  */
 enum fixed_point_rounding_mode
diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc
index c10e51b362e..b3caa4b188d 100644
--- a/gcc/config/riscv/riscv-v.cc
+++ b/gcc/config/riscv/riscv-v.cc
@@ -3614,4 +3614,99 @@  expand_reduction (rtx_code code, rtx *ops, rtx init, reduction_type type)
   emit_insn (gen_pred_extract_first (m1_mode, ops[0], m1_tmp2));
 }
 
+/* Expand Vector POPCOUNT by parallel popcnt:
+
+   int parallel_popcnt(uint32_t n) {
+   #define POW2(c)      (1U << (c))
+   #define MASK(c)      (static_cast<uint32_t>(-1) / (POW2(POW2(c)) + 1U))
+   #define COUNT(x, c)  ((x) & MASK(c)) + (((x)>>(POW2(c))) & MASK(c))
+	n = COUNT(n, 0);
+	n = COUNT(n, 1);
+	n = COUNT(n, 2);
+	n = COUNT(n, 3);
+	n = COUNT(n, 4);
+   //	n = COUNT(n, 5);  // uncomment this line for 64-bit integers
+	return n;
+   #undef COUNT
+   #undef MASK
+   #undef POW2
+   }
+*/
+void
+expand_popcount (rtx *ops)
+{
+  rtx dst = ops[0];
+  rtx src = ops[1];
+  machine_mode mode = GET_MODE (dst);
+  scalar_mode smode = GET_MODE_INNER (mode);
+  static const uint64_t mask_values[6]
+    = {0x5555555555555555ULL, 0x3333333333333333ULL, 0x0F0F0F0F0F0F0F0FULL,
+       0x00FF00FF00FF00FFULL, 0x0000FFFF0000FFFFULL, 0x00000000FFFFFFFFULL};
+
+  unsigned bit_size = GET_MODE_BITSIZE (smode);
+  unsigned word_size
+    = (bit_size + LONG_LONG_TYPE_SIZE - 1) / LONG_LONG_TYPE_SIZE;
+  rtx count = CONST0_RTX (mode);
+
+  for (unsigned n = 0; n < word_size; ++n)
+    {
+      rtx part_value = src;
+      for (unsigned i = 1, ct = 0;
+	   i
+	   < (bit_size > LONG_LONG_TYPE_SIZE ? LONG_LONG_TYPE_SIZE : bit_size);
+	   i <<= 1, ++ct)
+	{
+	  rtx mask_cst = gen_int_mode (mask_values[ct], smode);
+
+	  rtx vshift = expand_binop (mode, lshr_optab, part_value,
+				     gen_int_mode (i, smode), NULL_RTX, true,
+				     OPTAB_DIRECT);
+
+	  if (i == 4)
+	    {
+	      /* Optimize ((X & MASK) + ((X >> 4) & MASK))
+
+		 -> (X + (X >> 4)) & MASK  */
+	      rtx rhs = expand_binop (mode, add_optab, part_value, vshift,
+				      NULL_RTX, false, OPTAB_DIRECT);
+	      part_value = gen_reg_rtx (mode);
+	      rtx part_ops[] = {part_value, rhs, mask_cst};
+	      emit_vlmax_insn (code_for_pred_scalar (AND, mode), RVV_BINOP,
+			       part_ops);
+	    }
+	  else
+	    {
+	      rtx rhs = gen_reg_rtx (mode);
+	      rtx rhs_ops[] = {rhs, vshift, mask_cst};
+	      emit_vlmax_insn (code_for_pred_scalar (AND, mode), RVV_BINOP,
+			       rhs_ops);
+	      if (i == 1)
+		part_value = expand_binop (mode, sub_optab, part_value, rhs,
+					   NULL_RTX, false, OPTAB_DIRECT);
+	      else
+		{
+		  rtx lhs = gen_reg_rtx (mode);
+		  rtx lhs_ops[] = {lhs, part_value, mask_cst};
+		  emit_vlmax_insn (code_for_pred_scalar (AND, mode), RVV_BINOP,
+				   lhs_ops);
+
+		  part_value = expand_binop (mode, add_optab, lhs, rhs,
+					     NULL_RTX, false, OPTAB_DIRECT);
+		}
+	    }
+	}
+
+      count = expand_binop (mode, add_optab, part_value, count, NULL_RTX, false,
+			    OPTAB_DIRECT);
+      if (bit_size > LONG_LONG_TYPE_SIZE)
+	{
+	  src = expand_binop (mode, lshr_optab, src,
+			      gen_int_mode (LONG_LONG_TYPE_SIZE, smode),
+			      NULL_RTX, true, OPTAB_DIRECT);
+	  bit_size -= LONG_LONG_TYPE_SIZE;
+	}
+    }
+  emit_move_insn (dst, count);
+}
+
 } // namespace riscv_vector
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/widen/popcount-1.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/widen/popcount-1.c
new file mode 100644
index 00000000000..bcb4a6e8571
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/widen/popcount-1.c
@@ -0,0 +1,23 @@ 
+/* { dg-do compile } */
+/* { dg-additional-options "-march=rv32gcv_zvfh -mabi=ilp32d --param=riscv-autovec-preference=scalable -fno-vect-cost-model -fdump-tree-vect-details" } */
+
+#include <stdint-gcc.h>
+
+void __attribute__ ((noinline, noclone))
+popcount_32 (unsigned int *restrict dst, uint32_t *restrict src, int size)
+{
+  for (int i = 0; i < size; ++i)
+    dst[i] = __builtin_popcount (src[i]);
+}
+
+void __attribute__ ((noinline, noclone))
+popcount_64 (unsigned int *restrict dst, uint64_t *restrict src, int size)
+{
+  for (int i = 0; i < size; ++i)
+    dst[i] = __builtin_popcountll (src[i]);
+}
+
+/* FIXME: We don't allow vectorize "__builtin_popcountll" yet since it needs "vec_pack_trunc" support
+          and such pattern may cause inferior codegen.
+	  We will enable "vec_pack_trunc" when we support reasonable vector cost model.  */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 1 "vect" } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/widen/popcount_run-1.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/widen/popcount_run-1.c
new file mode 100644
index 00000000000..f6be709639a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/widen/popcount_run-1.c
@@ -0,0 +1,50 @@ 
+/* { dg-do run { target { riscv_vector } } } */
+/* { dg-additional-options "--param=riscv-autovec-preference=scalable" } */
+
+#include "popcount-1.c"
+
+extern void abort (void) __attribute__ ((noreturn));
+
+unsigned int data[] = {
+  0x11111100, 6,
+  0xe0e0f0f0, 14,
+  0x9900aab3, 13,
+  0x00040003, 3,
+  0x000e000c, 5,
+  0x22227777, 16,
+  0x12341234, 10,
+  0x0, 0
+};
+
+int __attribute__ ((optimize (1)))
+main (void)
+{
+  unsigned int count = sizeof (data) / sizeof (data[0]) / 2;
+
+  uint32_t in32[count];
+  unsigned int out32[count];
+  for (unsigned int i = 0; i < count; ++i)
+    {
+      in32[i] = data[i * 2];
+      asm volatile ("" ::: "memory");
+    }
+  popcount_32 (out32, in32, count);
+  for (unsigned int i = 0; i < count; ++i)
+    if (out32[i] != data[i * 2 + 1])
+      abort ();
+
+  count /= 2;
+  uint64_t in64[count];
+  unsigned int out64[count];
+  for (unsigned int i = 0; i < count; ++i)
+    {
+      in64[i] = ((uint64_t) data[i * 4] << 32) | data[i * 4 + 2];
+      asm volatile ("" ::: "memory");
+    }
+  popcount_64 (out64, in64, count);
+  for (unsigned int i = 0; i < count; ++i)
+    if (out64[i] != data[i * 4 + 1] + data[i * 4 + 3])
+      abort ();
+
+  return 0;
+}