Message ID | 20230731120120.651197-1-juzhe.zhong@rivai.ai |
---|---|
State | New |
Headers | show |
Series | RISC-V: Support POPCOUNT auto-vectorization | expand |
On Mon, Jul 31, 2023 at 8:03 PM Juzhe-Zhong <juzhe.zhong@rivai.ai> wrote: > > This patch is inspired by "lowerCTPOP" in LLVM. > Support popcount auto-vectorization by following LLVM approach. > https://godbolt.org/z/3K3GzvY7f > > Before this patch: > > <source>:7:21: missed: couldn't vectorize loop > <source>:8:14: missed: not vectorized: relevant stmt not supported: _5 = __builtin_popcount (_4); > > After this patch: > > popcount_32: > ble a2,zero,.L5 > li t3,1431654400 > li a7,858992640 > li t1,252645376 > li a6,16711680 > li a3,65536 > addiw t3,t3,1365 > addiw a7,a7,819 > addiw t1,t1,-241 > addiw a6,a6,255 > addiw a3,a3,-1 > .L3: > vsetvli a5,a2,e8,mf4,ta,ma > vle32.v v1,0(a1) > vsetivli zero,4,e32,m1,ta,ma > vsrl.vi v2,v1,1 > vand.vx v2,v2,t3 > vsub.vv v1,v1,v2 > vsrl.vi v2,v1,2 > vand.vx v2,v2,a7 > vand.vx v1,v1,a7 > vadd.vv v1,v1,v2 > vsrl.vi v2,v1,4 > vadd.vv v1,v1,v2 > vand.vx v1,v1,t1 > vsrl.vi v2,v1,8 > vand.vx v2,v2,a6 > slli a4,a5,2 > vand.vx v1,v1,a6 > vadd.vv v1,v1,v2 > vsrl.vi v2,v1,16 > vand.vx v1,v1,a3 > vand.vx v2,v2,a3 > vadd.vv v1,v1,v2 > vmv.v.v v1,v1 > vsetvli zero,a2,e32,m1,ta,ma > sub a2,a2,a5 > vse32.v v1,0(a0) > add a1,a1,a4 > add a0,a0,a4 > bne a2,zero,.L3 > .L5: > ret > > gcc/ChangeLog: > > * config/riscv/autovec.md (popcount<mode>2): New pattern. > * config/riscv/riscv-protos.h (expand_popcount): New function. > * config/riscv/riscv-v.cc (expand_popcount): Ditto. > > gcc/testsuite/ChangeLog: > > * gcc.target/riscv/rvv/autovec/widen/popcount-1.c: New test. > * gcc.target/riscv/rvv/autovec/widen/popcount_run-1.c: New test. > > --- > gcc/config/riscv/autovec.md | 13 +++ > gcc/config/riscv/riscv-protos.h | 1 + > gcc/config/riscv/riscv-v.cc | 95 +++++++++++++++++++ > .../riscv/rvv/autovec/widen/popcount-1.c | 23 +++++ > .../riscv/rvv/autovec/widen/popcount_run-1.c | 50 ++++++++++ > 5 files changed, 182 insertions(+) > create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/widen/popcount-1.c > create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/widen/popcount_run-1.c > > diff --git a/gcc/config/riscv/autovec.md b/gcc/config/riscv/autovec.md > index b5152bc91fd..9d32b91bdca 100644 > --- a/gcc/config/riscv/autovec.md > +++ b/gcc/config/riscv/autovec.md > @@ -922,6 +922,19 @@ > DONE; > }) > > +;; ------------------------------------------------------------------------------- > +;; - [INT] POPCOUNT. > +;; ------------------------------------------------------------------------------- > + > +(define_expand "popcount<mode>2" > + [(match_operand:VI 0 "register_operand") > + (match_operand:VI 1 "register_operand")] > + "TARGET_VECTOR" > +{ > + riscv_vector::expand_popcount (operands); > + DONE; > +}) > + > ;; ------------------------------------------------------------------------------- > ;; ---- [FP] Unary operations > ;; ------------------------------------------------------------------------------- > diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h > index a729db44c32..ae40fbb4b53 100644 > --- a/gcc/config/riscv/riscv-protos.h > +++ b/gcc/config/riscv/riscv-protos.h > @@ -321,6 +321,7 @@ void expand_select_vl (rtx *); > void expand_load_store (rtx *, bool); > void expand_gather_scatter (rtx *, bool); > void expand_cond_len_ternop (unsigned, rtx *); > +void expand_popcount (rtx *); > > /* Rounding mode bitfield for fixed point VXRM. */ > enum fixed_point_rounding_mode > diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc > index c10e51b362e..b3caa4b188d 100644 > --- a/gcc/config/riscv/riscv-v.cc > +++ b/gcc/config/riscv/riscv-v.cc > @@ -3614,4 +3614,99 @@ expand_reduction (rtx_code code, rtx *ops, rtx init, reduction_type type) > emit_insn (gen_pred_extract_first (m1_mode, ops[0], m1_tmp2)); > } > > +/* Expand Vector POPCOUNT by parallel popcnt: > + > + int parallel_popcnt(uint32_t n) { > + #define POW2(c) (1U << (c)) > + #define MASK(c) (static_cast<uint32_t>(-1) / (POW2(POW2(c)) + 1U)) > + #define COUNT(x, c) ((x) & MASK(c)) + (((x)>>(POW2(c))) & MASK(c)) > + n = COUNT(n, 0); > + n = COUNT(n, 1); > + n = COUNT(n, 2); > + n = COUNT(n, 3); > + n = COUNT(n, 4); > + // n = COUNT(n, 5); // uncomment this line for 64-bit integers > + return n; > + #undef COUNT > + #undef MASK > + #undef POW2 > + } > +*/ > +void > +expand_popcount (rtx *ops) > +{ > + rtx dst = ops[0]; > + rtx src = ops[1]; > + machine_mode mode = GET_MODE (dst); > + scalar_mode smode = GET_MODE_INNER (mode); > + static const uint64_t mask_values[6] > + = {0x5555555555555555ULL, 0x3333333333333333ULL, 0x0F0F0F0F0F0F0F0FULL, > + 0x00FF00FF00FF00FFULL, 0x0000FFFF0000FFFFULL, 0x00000000FFFFFFFFULL}; > + > + unsigned bit_size = GET_MODE_BITSIZE (smode); bit_size is always euqal or less than 64 since we don't have TI mode for vector modes. > + unsigned word_size > + = (bit_size + LONG_LONG_TYPE_SIZE - 1) / LONG_LONG_TYPE_SIZE; So this is always 1, or you were trying to handle rv32 with ELEN64 here? > + rtx count = CONST0_RTX (mode); > + > + for (unsigned n = 0; n < word_size; ++n) Drop outer loop if word_size never larger than 1? > + { > + rtx part_value = src; > + for (unsigned i = 1, ct = 0; > + i > + < (bit_size > LONG_LONG_TYPE_SIZE ? LONG_LONG_TYPE_SIZE : bit_size); Just bit_size should be fine since LONG_LONG_TYPE_SIZE is constant 64 and bit_size <=64? > + i <<= 1, ++ct) > + { > + rtx mask_cst = gen_int_mode (mask_values[ct], smode); > + > + rtx vshift = expand_binop (mode, lshr_optab, part_value, > + gen_int_mode (i, smode), NULL_RTX, true, > + OPTAB_DIRECT); > + > + if (i == 4) > + { > + /* Optimize ((X & MASK) + ((X >> 4) & MASK)) > + > + -> (X + (X >> 4)) & MASK */ > + rtx rhs = expand_binop (mode, add_optab, part_value, vshift, > + NULL_RTX, false, OPTAB_DIRECT); > + part_value = gen_reg_rtx (mode); > + rtx part_ops[] = {part_value, rhs, mask_cst}; > + emit_vlmax_insn (code_for_pred_scalar (AND, mode), RVV_BINOP, > + part_ops); > + } > + else > + { > + rtx rhs = gen_reg_rtx (mode); > + rtx rhs_ops[] = {rhs, vshift, mask_cst}; > + emit_vlmax_insn (code_for_pred_scalar (AND, mode), RVV_BINOP, > + rhs_ops); > + if (i == 1) > + part_value = expand_binop (mode, sub_optab, part_value, rhs, > + NULL_RTX, false, OPTAB_DIRECT); > + else > + { > + rtx lhs = gen_reg_rtx (mode); > + rtx lhs_ops[] = {lhs, part_value, mask_cst}; > + emit_vlmax_insn (code_for_pred_scalar (AND, mode), RVV_BINOP, > + lhs_ops); > + > + part_value = expand_binop (mode, add_optab, lhs, rhs, > + NULL_RTX, false, OPTAB_DIRECT); > + } > + } > + } > + > + count = expand_binop (mode, add_optab, part_value, count, NULL_RTX, false, > + OPTAB_DIRECT); > + if (bit_size > LONG_LONG_TYPE_SIZE) No need this if word_size is constant 1. > + { > + src = expand_binop (mode, lshr_optab, src, > + gen_int_mode (LONG_LONG_TYPE_SIZE, smode), > + NULL_RTX, true, OPTAB_DIRECT); > + bit_size -= LONG_LONG_TYPE_SIZE; > + } > + } > + emit_move_insn (dst, count); > +} > + > } // namespace riscv_vector >
>> Drop outer loop if word_size never larger than 1? Yeah. we never have TI vector modes for now. The codes I just directly copy from LLVM in generic intrinsic handling.... :) since LLVM generic code is considering handling INT128 vector I will remove all redundant code for INT128 vector mode in V2. Thanks. juzhe.zhong@rivai.ai From: Kito Cheng Date: 2023-07-31 21:38 To: Juzhe-Zhong CC: gcc-patches; kito.cheng; jeffreyalaw; rdapp.gcc Subject: Re: [PATCH] RISC-V: Support POPCOUNT auto-vectorization On Mon, Jul 31, 2023 at 8:03 PM Juzhe-Zhong <juzhe.zhong@rivai.ai> wrote: > > This patch is inspired by "lowerCTPOP" in LLVM. > Support popcount auto-vectorization by following LLVM approach. > https://godbolt.org/z/3K3GzvY7f > > Before this patch: > > <source>:7:21: missed: couldn't vectorize loop > <source>:8:14: missed: not vectorized: relevant stmt not supported: _5 = __builtin_popcount (_4); > > After this patch: > > popcount_32: > ble a2,zero,.L5 > li t3,1431654400 > li a7,858992640 > li t1,252645376 > li a6,16711680 > li a3,65536 > addiw t3,t3,1365 > addiw a7,a7,819 > addiw t1,t1,-241 > addiw a6,a6,255 > addiw a3,a3,-1 > .L3: > vsetvli a5,a2,e8,mf4,ta,ma > vle32.v v1,0(a1) > vsetivli zero,4,e32,m1,ta,ma > vsrl.vi v2,v1,1 > vand.vx v2,v2,t3 > vsub.vv v1,v1,v2 > vsrl.vi v2,v1,2 > vand.vx v2,v2,a7 > vand.vx v1,v1,a7 > vadd.vv v1,v1,v2 > vsrl.vi v2,v1,4 > vadd.vv v1,v1,v2 > vand.vx v1,v1,t1 > vsrl.vi v2,v1,8 > vand.vx v2,v2,a6 > slli a4,a5,2 > vand.vx v1,v1,a6 > vadd.vv v1,v1,v2 > vsrl.vi v2,v1,16 > vand.vx v1,v1,a3 > vand.vx v2,v2,a3 > vadd.vv v1,v1,v2 > vmv.v.v v1,v1 > vsetvli zero,a2,e32,m1,ta,ma > sub a2,a2,a5 > vse32.v v1,0(a0) > add a1,a1,a4 > add a0,a0,a4 > bne a2,zero,.L3 > .L5: > ret > > gcc/ChangeLog: > > * config/riscv/autovec.md (popcount<mode>2): New pattern. > * config/riscv/riscv-protos.h (expand_popcount): New function. > * config/riscv/riscv-v.cc (expand_popcount): Ditto. > > gcc/testsuite/ChangeLog: > > * gcc.target/riscv/rvv/autovec/widen/popcount-1.c: New test. > * gcc.target/riscv/rvv/autovec/widen/popcount_run-1.c: New test. > > --- > gcc/config/riscv/autovec.md | 13 +++ > gcc/config/riscv/riscv-protos.h | 1 + > gcc/config/riscv/riscv-v.cc | 95 +++++++++++++++++++ > .../riscv/rvv/autovec/widen/popcount-1.c | 23 +++++ > .../riscv/rvv/autovec/widen/popcount_run-1.c | 50 ++++++++++ > 5 files changed, 182 insertions(+) > create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/widen/popcount-1.c > create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/widen/popcount_run-1.c > > diff --git a/gcc/config/riscv/autovec.md b/gcc/config/riscv/autovec.md > index b5152bc91fd..9d32b91bdca 100644 > --- a/gcc/config/riscv/autovec.md > +++ b/gcc/config/riscv/autovec.md > @@ -922,6 +922,19 @@ > DONE; > }) > > +;; ------------------------------------------------------------------------------- > +;; - [INT] POPCOUNT. > +;; ------------------------------------------------------------------------------- > + > +(define_expand "popcount<mode>2" > + [(match_operand:VI 0 "register_operand") > + (match_operand:VI 1 "register_operand")] > + "TARGET_VECTOR" > +{ > + riscv_vector::expand_popcount (operands); > + DONE; > +}) > + > ;; ------------------------------------------------------------------------------- > ;; ---- [FP] Unary operations > ;; ------------------------------------------------------------------------------- > diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h > index a729db44c32..ae40fbb4b53 100644 > --- a/gcc/config/riscv/riscv-protos.h > +++ b/gcc/config/riscv/riscv-protos.h > @@ -321,6 +321,7 @@ void expand_select_vl (rtx *); > void expand_load_store (rtx *, bool); > void expand_gather_scatter (rtx *, bool); > void expand_cond_len_ternop (unsigned, rtx *); > +void expand_popcount (rtx *); > > /* Rounding mode bitfield for fixed point VXRM. */ > enum fixed_point_rounding_mode > diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc > index c10e51b362e..b3caa4b188d 100644 > --- a/gcc/config/riscv/riscv-v.cc > +++ b/gcc/config/riscv/riscv-v.cc > @@ -3614,4 +3614,99 @@ expand_reduction (rtx_code code, rtx *ops, rtx init, reduction_type type) > emit_insn (gen_pred_extract_first (m1_mode, ops[0], m1_tmp2)); > } > > +/* Expand Vector POPCOUNT by parallel popcnt: > + > + int parallel_popcnt(uint32_t n) { > + #define POW2(c) (1U << (c)) > + #define MASK(c) (static_cast<uint32_t>(-1) / (POW2(POW2(c)) + 1U)) > + #define COUNT(x, c) ((x) & MASK(c)) + (((x)>>(POW2(c))) & MASK(c)) > + n = COUNT(n, 0); > + n = COUNT(n, 1); > + n = COUNT(n, 2); > + n = COUNT(n, 3); > + n = COUNT(n, 4); > + // n = COUNT(n, 5); // uncomment this line for 64-bit integers > + return n; > + #undef COUNT > + #undef MASK > + #undef POW2 > + } > +*/ > +void > +expand_popcount (rtx *ops) > +{ > + rtx dst = ops[0]; > + rtx src = ops[1]; > + machine_mode mode = GET_MODE (dst); > + scalar_mode smode = GET_MODE_INNER (mode); > + static const uint64_t mask_values[6] > + = {0x5555555555555555ULL, 0x3333333333333333ULL, 0x0F0F0F0F0F0F0F0FULL, > + 0x00FF00FF00FF00FFULL, 0x0000FFFF0000FFFFULL, 0x00000000FFFFFFFFULL}; > + > + unsigned bit_size = GET_MODE_BITSIZE (smode); bit_size is always euqal or less than 64 since we don't have TI mode for vector modes. > + unsigned word_size > + = (bit_size + LONG_LONG_TYPE_SIZE - 1) / LONG_LONG_TYPE_SIZE; So this is always 1, or you were trying to handle rv32 with ELEN64 here? > + rtx count = CONST0_RTX (mode); > + > + for (unsigned n = 0; n < word_size; ++n) Drop outer loop if word_size never larger than 1? > + { > + rtx part_value = src; > + for (unsigned i = 1, ct = 0; > + i > + < (bit_size > LONG_LONG_TYPE_SIZE ? LONG_LONG_TYPE_SIZE : bit_size); Just bit_size should be fine since LONG_LONG_TYPE_SIZE is constant 64 and bit_size <=64? > + i <<= 1, ++ct) > + { > + rtx mask_cst = gen_int_mode (mask_values[ct], smode); > + > + rtx vshift = expand_binop (mode, lshr_optab, part_value, > + gen_int_mode (i, smode), NULL_RTX, true, > + OPTAB_DIRECT); > + > + if (i == 4) > + { > + /* Optimize ((X & MASK) + ((X >> 4) & MASK)) > + > + -> (X + (X >> 4)) & MASK */ > + rtx rhs = expand_binop (mode, add_optab, part_value, vshift, > + NULL_RTX, false, OPTAB_DIRECT); > + part_value = gen_reg_rtx (mode); > + rtx part_ops[] = {part_value, rhs, mask_cst}; > + emit_vlmax_insn (code_for_pred_scalar (AND, mode), RVV_BINOP, > + part_ops); > + } > + else > + { > + rtx rhs = gen_reg_rtx (mode); > + rtx rhs_ops[] = {rhs, vshift, mask_cst}; > + emit_vlmax_insn (code_for_pred_scalar (AND, mode), RVV_BINOP, > + rhs_ops); > + if (i == 1) > + part_value = expand_binop (mode, sub_optab, part_value, rhs, > + NULL_RTX, false, OPTAB_DIRECT); > + else > + { > + rtx lhs = gen_reg_rtx (mode); > + rtx lhs_ops[] = {lhs, part_value, mask_cst}; > + emit_vlmax_insn (code_for_pred_scalar (AND, mode), RVV_BINOP, > + lhs_ops); > + > + part_value = expand_binop (mode, add_optab, lhs, rhs, > + NULL_RTX, false, OPTAB_DIRECT); > + } > + } > + } > + > + count = expand_binop (mode, add_optab, part_value, count, NULL_RTX, false, > + OPTAB_DIRECT); > + if (bit_size > LONG_LONG_TYPE_SIZE) No need this if word_size is constant 1. > + { > + src = expand_binop (mode, lshr_optab, src, > + gen_int_mode (LONG_LONG_TYPE_SIZE, smode), > + NULL_RTX, true, OPTAB_DIRECT); > + bit_size -= LONG_LONG_TYPE_SIZE; > + } > + } > + emit_move_insn (dst, count); > +} > + > } // namespace riscv_vector >
Address comment and fix on V2: https://gcc.gnu.org/pipermail/gcc-patches/2023-July/625870.html Ok for trunk? juzhe.zhong@rivai.ai From: Kito Cheng Date: 2023-07-31 21:38 To: Juzhe-Zhong CC: gcc-patches; kito.cheng; jeffreyalaw; rdapp.gcc Subject: Re: [PATCH] RISC-V: Support POPCOUNT auto-vectorization On Mon, Jul 31, 2023 at 8:03 PM Juzhe-Zhong <juzhe.zhong@rivai.ai> wrote: > > This patch is inspired by "lowerCTPOP" in LLVM. > Support popcount auto-vectorization by following LLVM approach. > https://godbolt.org/z/3K3GzvY7f > > Before this patch: > > <source>:7:21: missed: couldn't vectorize loop > <source>:8:14: missed: not vectorized: relevant stmt not supported: _5 = __builtin_popcount (_4); > > After this patch: > > popcount_32: > ble a2,zero,.L5 > li t3,1431654400 > li a7,858992640 > li t1,252645376 > li a6,16711680 > li a3,65536 > addiw t3,t3,1365 > addiw a7,a7,819 > addiw t1,t1,-241 > addiw a6,a6,255 > addiw a3,a3,-1 > .L3: > vsetvli a5,a2,e8,mf4,ta,ma > vle32.v v1,0(a1) > vsetivli zero,4,e32,m1,ta,ma > vsrl.vi v2,v1,1 > vand.vx v2,v2,t3 > vsub.vv v1,v1,v2 > vsrl.vi v2,v1,2 > vand.vx v2,v2,a7 > vand.vx v1,v1,a7 > vadd.vv v1,v1,v2 > vsrl.vi v2,v1,4 > vadd.vv v1,v1,v2 > vand.vx v1,v1,t1 > vsrl.vi v2,v1,8 > vand.vx v2,v2,a6 > slli a4,a5,2 > vand.vx v1,v1,a6 > vadd.vv v1,v1,v2 > vsrl.vi v2,v1,16 > vand.vx v1,v1,a3 > vand.vx v2,v2,a3 > vadd.vv v1,v1,v2 > vmv.v.v v1,v1 > vsetvli zero,a2,e32,m1,ta,ma > sub a2,a2,a5 > vse32.v v1,0(a0) > add a1,a1,a4 > add a0,a0,a4 > bne a2,zero,.L3 > .L5: > ret > > gcc/ChangeLog: > > * config/riscv/autovec.md (popcount<mode>2): New pattern. > * config/riscv/riscv-protos.h (expand_popcount): New function. > * config/riscv/riscv-v.cc (expand_popcount): Ditto. > > gcc/testsuite/ChangeLog: > > * gcc.target/riscv/rvv/autovec/widen/popcount-1.c: New test. > * gcc.target/riscv/rvv/autovec/widen/popcount_run-1.c: New test. > > --- > gcc/config/riscv/autovec.md | 13 +++ > gcc/config/riscv/riscv-protos.h | 1 + > gcc/config/riscv/riscv-v.cc | 95 +++++++++++++++++++ > .../riscv/rvv/autovec/widen/popcount-1.c | 23 +++++ > .../riscv/rvv/autovec/widen/popcount_run-1.c | 50 ++++++++++ > 5 files changed, 182 insertions(+) > create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/widen/popcount-1.c > create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/widen/popcount_run-1.c > > diff --git a/gcc/config/riscv/autovec.md b/gcc/config/riscv/autovec.md > index b5152bc91fd..9d32b91bdca 100644 > --- a/gcc/config/riscv/autovec.md > +++ b/gcc/config/riscv/autovec.md > @@ -922,6 +922,19 @@ > DONE; > }) > > +;; ------------------------------------------------------------------------------- > +;; - [INT] POPCOUNT. > +;; ------------------------------------------------------------------------------- > + > +(define_expand "popcount<mode>2" > + [(match_operand:VI 0 "register_operand") > + (match_operand:VI 1 "register_operand")] > + "TARGET_VECTOR" > +{ > + riscv_vector::expand_popcount (operands); > + DONE; > +}) > + > ;; ------------------------------------------------------------------------------- > ;; ---- [FP] Unary operations > ;; ------------------------------------------------------------------------------- > diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h > index a729db44c32..ae40fbb4b53 100644 > --- a/gcc/config/riscv/riscv-protos.h > +++ b/gcc/config/riscv/riscv-protos.h > @@ -321,6 +321,7 @@ void expand_select_vl (rtx *); > void expand_load_store (rtx *, bool); > void expand_gather_scatter (rtx *, bool); > void expand_cond_len_ternop (unsigned, rtx *); > +void expand_popcount (rtx *); > > /* Rounding mode bitfield for fixed point VXRM. */ > enum fixed_point_rounding_mode > diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc > index c10e51b362e..b3caa4b188d 100644 > --- a/gcc/config/riscv/riscv-v.cc > +++ b/gcc/config/riscv/riscv-v.cc > @@ -3614,4 +3614,99 @@ expand_reduction (rtx_code code, rtx *ops, rtx init, reduction_type type) > emit_insn (gen_pred_extract_first (m1_mode, ops[0], m1_tmp2)); > } > > +/* Expand Vector POPCOUNT by parallel popcnt: > + > + int parallel_popcnt(uint32_t n) { > + #define POW2(c) (1U << (c)) > + #define MASK(c) (static_cast<uint32_t>(-1) / (POW2(POW2(c)) + 1U)) > + #define COUNT(x, c) ((x) & MASK(c)) + (((x)>>(POW2(c))) & MASK(c)) > + n = COUNT(n, 0); > + n = COUNT(n, 1); > + n = COUNT(n, 2); > + n = COUNT(n, 3); > + n = COUNT(n, 4); > + // n = COUNT(n, 5); // uncomment this line for 64-bit integers > + return n; > + #undef COUNT > + #undef MASK > + #undef POW2 > + } > +*/ > +void > +expand_popcount (rtx *ops) > +{ > + rtx dst = ops[0]; > + rtx src = ops[1]; > + machine_mode mode = GET_MODE (dst); > + scalar_mode smode = GET_MODE_INNER (mode); > + static const uint64_t mask_values[6] > + = {0x5555555555555555ULL, 0x3333333333333333ULL, 0x0F0F0F0F0F0F0F0FULL, > + 0x00FF00FF00FF00FFULL, 0x0000FFFF0000FFFFULL, 0x00000000FFFFFFFFULL}; > + > + unsigned bit_size = GET_MODE_BITSIZE (smode); bit_size is always euqal or less than 64 since we don't have TI mode for vector modes. > + unsigned word_size > + = (bit_size + LONG_LONG_TYPE_SIZE - 1) / LONG_LONG_TYPE_SIZE; So this is always 1, or you were trying to handle rv32 with ELEN64 here? > + rtx count = CONST0_RTX (mode); > + > + for (unsigned n = 0; n < word_size; ++n) Drop outer loop if word_size never larger than 1? > + { > + rtx part_value = src; > + for (unsigned i = 1, ct = 0; > + i > + < (bit_size > LONG_LONG_TYPE_SIZE ? LONG_LONG_TYPE_SIZE : bit_size); Just bit_size should be fine since LONG_LONG_TYPE_SIZE is constant 64 and bit_size <=64? > + i <<= 1, ++ct) > + { > + rtx mask_cst = gen_int_mode (mask_values[ct], smode); > + > + rtx vshift = expand_binop (mode, lshr_optab, part_value, > + gen_int_mode (i, smode), NULL_RTX, true, > + OPTAB_DIRECT); > + > + if (i == 4) > + { > + /* Optimize ((X & MASK) + ((X >> 4) & MASK)) > + > + -> (X + (X >> 4)) & MASK */ > + rtx rhs = expand_binop (mode, add_optab, part_value, vshift, > + NULL_RTX, false, OPTAB_DIRECT); > + part_value = gen_reg_rtx (mode); > + rtx part_ops[] = {part_value, rhs, mask_cst}; > + emit_vlmax_insn (code_for_pred_scalar (AND, mode), RVV_BINOP, > + part_ops); > + } > + else > + { > + rtx rhs = gen_reg_rtx (mode); > + rtx rhs_ops[] = {rhs, vshift, mask_cst}; > + emit_vlmax_insn (code_for_pred_scalar (AND, mode), RVV_BINOP, > + rhs_ops); > + if (i == 1) > + part_value = expand_binop (mode, sub_optab, part_value, rhs, > + NULL_RTX, false, OPTAB_DIRECT); > + else > + { > + rtx lhs = gen_reg_rtx (mode); > + rtx lhs_ops[] = {lhs, part_value, mask_cst}; > + emit_vlmax_insn (code_for_pred_scalar (AND, mode), RVV_BINOP, > + lhs_ops); > + > + part_value = expand_binop (mode, add_optab, lhs, rhs, > + NULL_RTX, false, OPTAB_DIRECT); > + } > + } > + } > + > + count = expand_binop (mode, add_optab, part_value, count, NULL_RTX, false, > + OPTAB_DIRECT); > + if (bit_size > LONG_LONG_TYPE_SIZE) No need this if word_size is constant 1. > + { > + src = expand_binop (mode, lshr_optab, src, > + gen_int_mode (LONG_LONG_TYPE_SIZE, smode), > + NULL_RTX, true, OPTAB_DIRECT); > + bit_size -= LONG_LONG_TYPE_SIZE; > + } > + } > + emit_move_insn (dst, count); > +} > + > } // namespace riscv_vector >
diff --git a/gcc/config/riscv/autovec.md b/gcc/config/riscv/autovec.md index b5152bc91fd..9d32b91bdca 100644 --- a/gcc/config/riscv/autovec.md +++ b/gcc/config/riscv/autovec.md @@ -922,6 +922,19 @@ DONE; }) +;; ------------------------------------------------------------------------------- +;; - [INT] POPCOUNT. +;; ------------------------------------------------------------------------------- + +(define_expand "popcount<mode>2" + [(match_operand:VI 0 "register_operand") + (match_operand:VI 1 "register_operand")] + "TARGET_VECTOR" +{ + riscv_vector::expand_popcount (operands); + DONE; +}) + ;; ------------------------------------------------------------------------------- ;; ---- [FP] Unary operations ;; ------------------------------------------------------------------------------- diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h index a729db44c32..ae40fbb4b53 100644 --- a/gcc/config/riscv/riscv-protos.h +++ b/gcc/config/riscv/riscv-protos.h @@ -321,6 +321,7 @@ void expand_select_vl (rtx *); void expand_load_store (rtx *, bool); void expand_gather_scatter (rtx *, bool); void expand_cond_len_ternop (unsigned, rtx *); +void expand_popcount (rtx *); /* Rounding mode bitfield for fixed point VXRM. */ enum fixed_point_rounding_mode diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc index c10e51b362e..b3caa4b188d 100644 --- a/gcc/config/riscv/riscv-v.cc +++ b/gcc/config/riscv/riscv-v.cc @@ -3614,4 +3614,99 @@ expand_reduction (rtx_code code, rtx *ops, rtx init, reduction_type type) emit_insn (gen_pred_extract_first (m1_mode, ops[0], m1_tmp2)); } +/* Expand Vector POPCOUNT by parallel popcnt: + + int parallel_popcnt(uint32_t n) { + #define POW2(c) (1U << (c)) + #define MASK(c) (static_cast<uint32_t>(-1) / (POW2(POW2(c)) + 1U)) + #define COUNT(x, c) ((x) & MASK(c)) + (((x)>>(POW2(c))) & MASK(c)) + n = COUNT(n, 0); + n = COUNT(n, 1); + n = COUNT(n, 2); + n = COUNT(n, 3); + n = COUNT(n, 4); + // n = COUNT(n, 5); // uncomment this line for 64-bit integers + return n; + #undef COUNT + #undef MASK + #undef POW2 + } +*/ +void +expand_popcount (rtx *ops) +{ + rtx dst = ops[0]; + rtx src = ops[1]; + machine_mode mode = GET_MODE (dst); + scalar_mode smode = GET_MODE_INNER (mode); + static const uint64_t mask_values[6] + = {0x5555555555555555ULL, 0x3333333333333333ULL, 0x0F0F0F0F0F0F0F0FULL, + 0x00FF00FF00FF00FFULL, 0x0000FFFF0000FFFFULL, 0x00000000FFFFFFFFULL}; + + unsigned bit_size = GET_MODE_BITSIZE (smode); + unsigned word_size + = (bit_size + LONG_LONG_TYPE_SIZE - 1) / LONG_LONG_TYPE_SIZE; + rtx count = CONST0_RTX (mode); + + for (unsigned n = 0; n < word_size; ++n) + { + rtx part_value = src; + for (unsigned i = 1, ct = 0; + i + < (bit_size > LONG_LONG_TYPE_SIZE ? LONG_LONG_TYPE_SIZE : bit_size); + i <<= 1, ++ct) + { + rtx mask_cst = gen_int_mode (mask_values[ct], smode); + + rtx vshift = expand_binop (mode, lshr_optab, part_value, + gen_int_mode (i, smode), NULL_RTX, true, + OPTAB_DIRECT); + + if (i == 4) + { + /* Optimize ((X & MASK) + ((X >> 4) & MASK)) + + -> (X + (X >> 4)) & MASK */ + rtx rhs = expand_binop (mode, add_optab, part_value, vshift, + NULL_RTX, false, OPTAB_DIRECT); + part_value = gen_reg_rtx (mode); + rtx part_ops[] = {part_value, rhs, mask_cst}; + emit_vlmax_insn (code_for_pred_scalar (AND, mode), RVV_BINOP, + part_ops); + } + else + { + rtx rhs = gen_reg_rtx (mode); + rtx rhs_ops[] = {rhs, vshift, mask_cst}; + emit_vlmax_insn (code_for_pred_scalar (AND, mode), RVV_BINOP, + rhs_ops); + if (i == 1) + part_value = expand_binop (mode, sub_optab, part_value, rhs, + NULL_RTX, false, OPTAB_DIRECT); + else + { + rtx lhs = gen_reg_rtx (mode); + rtx lhs_ops[] = {lhs, part_value, mask_cst}; + emit_vlmax_insn (code_for_pred_scalar (AND, mode), RVV_BINOP, + lhs_ops); + + part_value = expand_binop (mode, add_optab, lhs, rhs, + NULL_RTX, false, OPTAB_DIRECT); + } + } + } + + count = expand_binop (mode, add_optab, part_value, count, NULL_RTX, false, + OPTAB_DIRECT); + if (bit_size > LONG_LONG_TYPE_SIZE) + { + src = expand_binop (mode, lshr_optab, src, + gen_int_mode (LONG_LONG_TYPE_SIZE, smode), + NULL_RTX, true, OPTAB_DIRECT); + bit_size -= LONG_LONG_TYPE_SIZE; + } + } + emit_move_insn (dst, count); +} + } // namespace riscv_vector diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/widen/popcount-1.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/widen/popcount-1.c new file mode 100644 index 00000000000..bcb4a6e8571 --- /dev/null +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/widen/popcount-1.c @@ -0,0 +1,23 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-march=rv32gcv_zvfh -mabi=ilp32d --param=riscv-autovec-preference=scalable -fno-vect-cost-model -fdump-tree-vect-details" } */ + +#include <stdint-gcc.h> + +void __attribute__ ((noinline, noclone)) +popcount_32 (unsigned int *restrict dst, uint32_t *restrict src, int size) +{ + for (int i = 0; i < size; ++i) + dst[i] = __builtin_popcount (src[i]); +} + +void __attribute__ ((noinline, noclone)) +popcount_64 (unsigned int *restrict dst, uint64_t *restrict src, int size) +{ + for (int i = 0; i < size; ++i) + dst[i] = __builtin_popcountll (src[i]); +} + +/* FIXME: We don't allow vectorize "__builtin_popcountll" yet since it needs "vec_pack_trunc" support + and such pattern may cause inferior codegen. + We will enable "vec_pack_trunc" when we support reasonable vector cost model. */ +/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 1 "vect" } } */ diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/widen/popcount_run-1.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/widen/popcount_run-1.c new file mode 100644 index 00000000000..f6be709639a --- /dev/null +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/widen/popcount_run-1.c @@ -0,0 +1,50 @@ +/* { dg-do run { target { riscv_vector } } } */ +/* { dg-additional-options "--param=riscv-autovec-preference=scalable" } */ + +#include "popcount-1.c" + +extern void abort (void) __attribute__ ((noreturn)); + +unsigned int data[] = { + 0x11111100, 6, + 0xe0e0f0f0, 14, + 0x9900aab3, 13, + 0x00040003, 3, + 0x000e000c, 5, + 0x22227777, 16, + 0x12341234, 10, + 0x0, 0 +}; + +int __attribute__ ((optimize (1))) +main (void) +{ + unsigned int count = sizeof (data) / sizeof (data[0]) / 2; + + uint32_t in32[count]; + unsigned int out32[count]; + for (unsigned int i = 0; i < count; ++i) + { + in32[i] = data[i * 2]; + asm volatile ("" ::: "memory"); + } + popcount_32 (out32, in32, count); + for (unsigned int i = 0; i < count; ++i) + if (out32[i] != data[i * 2 + 1]) + abort (); + + count /= 2; + uint64_t in64[count]; + unsigned int out64[count]; + for (unsigned int i = 0; i < count; ++i) + { + in64[i] = ((uint64_t) data[i * 4] << 32) | data[i * 4 + 2]; + asm volatile ("" ::: "memory"); + } + popcount_64 (out64, in64, count); + for (unsigned int i = 0; i < count; ++i) + if (out64[i] != data[i * 4 + 1] + data[i * 4 + 3]) + abort (); + + return 0; +}