
[v6,18/61] target/riscv: vector single-width integer multiply instructions

Message ID 20200317150653.9008-19-zhiwei_liu@c-sky.com
State New
Series target/riscv: support vector extension v0.7.1

Commit Message

LIU Zhiwei March 17, 2020, 3:06 p.m. UTC
Signed-off-by: LIU Zhiwei <zhiwei_liu@c-sky.com>
---
 target/riscv/helper.h                   |  33 +++++
 target/riscv/insn32.decode              |   8 ++
 target/riscv/insn_trans/trans_rvv.inc.c |  10 ++
 target/riscv/vector_helper.c            | 156 ++++++++++++++++++++++++
 4 files changed, 207 insertions(+)

Comments

Alistair Francis March 25, 2020, 5:36 p.m. UTC | #1
On Tue, Mar 17, 2020 at 8:43 AM LIU Zhiwei <zhiwei_liu@c-sky.com> wrote:
>
> Signed-off-by: LIU Zhiwei <zhiwei_liu@c-sky.com>

Reviewed-by: Alistair Francis <alistair.francis@wdc.com>

Alistair

Richard Henderson March 28, 2020, 12:06 a.m. UTC | #2
On 3/17/20 8:06 AM, LIU Zhiwei wrote:
> +static int64_t do_mulhsu_d(int64_t s2, uint64_t s1)
> +{
> +    uint64_t hi_64, lo_64, abs_s2 = s2;
> +
> +    if (s2 < 0) {
> +        abs_s2 = -s2;
> +    }
> +    mulu64(&lo_64, &hi_64, abs_s2, s1);
> +    if (s2 < 0) {
> +        lo_64 = ~lo_64;
> +        hi_64 = ~hi_64;
> +        if (lo_64 == UINT64_MAX) {
> +            lo_64 = 0;
> +            hi_64 += 1;
> +        } else {
> +            lo_64 += 1;
> +        }
> +    }
> +
> +    return hi_64;
> +}

Missed the improvement here.  See tcg_gen_mulsu2_i64.

Otherwise,
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>


r~
LIU Zhiwei March 28, 2020, 3:17 p.m. UTC | #3
On 2020/3/28 8:06, Richard Henderson wrote:
> On 3/17/20 8:06 AM, LIU Zhiwei wrote:
>> +static int64_t do_mulhsu_d(int64_t s2, uint64_t s1)
>> +{
>> +    uint64_t hi_64, lo_64, abs_s2 = s2;
>> +
>> +    if (s2 < 0) {
>> +        abs_s2 = -s2;
>> +    }
>> +    mulu64(&lo_64, &hi_64, abs_s2, s1);
>> +    if (s2 < 0) {
>> +        lo_64 = ~lo_64;
>> +        hi_64 = ~hi_64;
>> +        if (lo_64 == UINT64_MAX) {
>> +            lo_64 = 0;
>> +            hi_64 += 1;
>> +        } else {
>> +            lo_64 += 1;
>> +        }
>> +    }
>> +
>> +    return hi_64;
>> +}
> Missed the improvement here.  See tcg_gen_mulsu2_i64.
Though I have not fully grasped the principle, the code in tcg_gen_mulsu2_i64
is much tidier.

Thanks for pointing that out.

Zhiwei
Richard Henderson March 28, 2020, 3:47 p.m. UTC | #4
On 3/28/20 8:17 AM, LIU Zhiwei wrote:
>> Missed the improvement here.  See tcg_gen_mulsu2_i64.
> Though I have not fully grasped the principle, the code in
> tcg_gen_mulsu2_i64 is much tidier.

Let A = signed operand,
    B = unsigned operand
    P = unsigned product

If the sign bit of A is set, then P is too large.
In that case we subtract 2**64 * B to fix that:

    HI_P -= (A < 0 ? B : 0)

where the conditional is computed as (A >> 63) & B.


r~
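
As a standalone sketch of that identity (illustrative code, not the tcg
implementation; mulhsu64_ref and mulhsu64_fixup are made-up names, and both
rely on the GCC/Clang __int128 extension and on arithmetic right shift of
negative values, as GCC/Clang implement it):

#include <assert.h>
#include <stdint.h>

/* Reference: exact signed x unsigned product via 128-bit arithmetic;
 * return the high 64 bits of the 128-bit product. */
static int64_t mulhsu64_ref(int64_t a, uint64_t b)
{
    return (int64_t)(((__int128)a * (__int128)b) >> 64);
}

/* The fixup: multiply the unsigned bit patterns, then subtract B from
 * the high half when the sign bit of A is set, i.e. HI_P -= (A >> 63) & B. */
static int64_t mulhsu64_fixup(int64_t a, uint64_t b)
{
    uint64_t p_hi = (uint64_t)(((unsigned __int128)(uint64_t)a * b) >> 64);
    p_hi -= (uint64_t)(a >> 63) & b;
    return (int64_t)p_hi;
}

int main(void)
{
    assert(mulhsu64_fixup(-1, 3) == mulhsu64_ref(-1, 3));
    assert(mulhsu64_fixup(INT64_MIN, UINT64_MAX) ==
           mulhsu64_ref(INT64_MIN, UINT64_MAX));
    assert(mulhsu64_fixup(12345, 67890) == 0);
    return 0;
}

For a = -1, b = 3, the unsigned product of 0xffff...ffff and 3 has a high
half of 2; subtracting B = 3 gives -1, the correct high half of -3.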
LIU Zhiwei March 28, 2020, 4:13 p.m. UTC | #5
On 2020/3/28 23:47, Richard Henderson wrote:
> On 3/28/20 8:17 AM, LIU Zhiwei wrote:
>>> Missed the improvement here.  See tcg_gen_mulsu2_i64.
>> Though I have not fully grasped the principle, the code in
>> tcg_gen_mulsu2_i64 is much tidier.
> Let A = signed operand,
>      B = unsigned operand
>      P = unsigned product
>
> If the sign bit of A is set, then P is too large.
> In that case we subtract 2**64 * B to fix that:
>
>      HI_P -= (A < 0 ? B : 0)
>
> where the conditional is computed as (A >> 63) & B.

I think I get it.

LET  A = 2**64 - X

THEN

X = 2**64 - A
SIGNED_P = -X * B

if (A * B == P) then

(2**64 - X) * B == P
2**64 * B - X * B == P

-X * B == P - 2**64 * B

HI_P -= (A < 0 ? B : 0)

Zhiwei
LIU Zhiwei March 29, 2020, 4 a.m. UTC | #6
On 2020/3/29 0:13, LIU Zhiwei wrote:
>
>
> On 2020/3/28 23:47, Richard Henderson wrote:
>> On 3/28/20 8:17 AM, LIU Zhiwei wrote:
>>>> Missed the improvement here.  See tcg_gen_mulsu2_i64.
>>> Though I have not fully grasped the principle, the code in
>>> tcg_gen_mulsu2_i64 is much tidier.
>> Let A = signed operand,
>>      B = unsigned operand
>>      P = unsigned product
>>
>> If the sign bit of A is set, then P is too large.
>> In that case we subtract 2**64 * B to fix that:
>>
>>      HI_P -= (A < 0 ? B : 0)
>>
>> where the conditional is computed as (A >> 63) & B.
>
> I think I get it.
>
> LET  A = 2**64 - X
>
> THEN
>
> X = 2**64 - A
> SIGNED_P = -X * B
>
> if (A * B == P) then
>
> (2**64 - X) * B == P
> 2**64 * B - X * B == P
>
> -X * B == P - 2**64 * B
>
> HI_P -= (A < 0 ? B : 0)
>
That was confusing. Let me paste a clearer version.

/*
 * Let  A = signed operand,
 *      B = unsigned operand,
 *      P = mulu64(A, B), the unsigned product
 *
 * LET  X = 2**64 - A, the two's complement of A
 *      SP = signed product
 * THEN
 *      IF A < 0
 *          SP = -X * B
 *             = -(2**64 - A) * B
 *             = A * B - 2**64 * B
 *             = P - 2**64 * B
 *      ELSE
 *          SP = P
 * SO
 *      HI_P -= (A < 0 ? B : 0)
 */

static int64_t do_mulhsu_d(int64_t s2, uint64_t s1)
{
     uint64_t hi_64, lo_64;

     mulu64(&lo_64, &hi_64, s2, s1);

     hi_64 -= s2 < 0 ? s1 : 0;
     return hi_64;
}

Zhiwei
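
A standalone sanity check of that final version (a sketch: this mulu64 is a
local stand-in for QEMU's mulu64 helper, written with the GCC/Clang
__int128 extension):

#include <assert.h>
#include <stdint.h>

/* Stand-in for QEMU's mulu64: full 64x64 -> 128-bit unsigned multiply. */
static void mulu64(uint64_t *plow, uint64_t *phigh, uint64_t a, uint64_t b)
{
    unsigned __int128 p = (unsigned __int128)a * b;
    *plow = (uint64_t)p;
    *phigh = (uint64_t)(p >> 64);
}

static int64_t do_mulhsu_d(int64_t s2, uint64_t s1)
{
    uint64_t hi_64, lo_64;

    mulu64(&lo_64, &hi_64, s2, s1);
    hi_64 -= s2 < 0 ? s1 : 0;
    return hi_64;
}

int main(void)
{
    /* -1 * 2**63 = -2**63; the high half of the 128-bit product is -1. */
    assert(do_mulhsu_d(-1, UINT64_C(1) << 63) == -1);
    /* INT64_MIN * UINT64_MAX = -(2**127 - 2**63); high half is INT64_MIN. */
    assert(do_mulhsu_d(INT64_MIN, UINT64_MAX) == INT64_MIN);
    /* A nonnegative s2 needs no fixup. */
    assert(do_mulhsu_d(3, 5) == 0);
    return 0;
}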

Patch

diff --git a/target/riscv/helper.h b/target/riscv/helper.h
index c7d4ff185a..f42a12eef3 100644
--- a/target/riscv/helper.h
+++ b/target/riscv/helper.h
@@ -525,3 +525,36 @@  DEF_HELPER_6(vmax_vx_b, void, ptr, ptr, tl, ptr, env, i32)
 DEF_HELPER_6(vmax_vx_h, void, ptr, ptr, tl, ptr, env, i32)
 DEF_HELPER_6(vmax_vx_w, void, ptr, ptr, tl, ptr, env, i32)
 DEF_HELPER_6(vmax_vx_d, void, ptr, ptr, tl, ptr, env, i32)
+
+DEF_HELPER_6(vmul_vv_b, void, ptr, ptr, ptr, ptr, env, i32)
+DEF_HELPER_6(vmul_vv_h, void, ptr, ptr, ptr, ptr, env, i32)
+DEF_HELPER_6(vmul_vv_w, void, ptr, ptr, ptr, ptr, env, i32)
+DEF_HELPER_6(vmul_vv_d, void, ptr, ptr, ptr, ptr, env, i32)
+DEF_HELPER_6(vmulh_vv_b, void, ptr, ptr, ptr, ptr, env, i32)
+DEF_HELPER_6(vmulh_vv_h, void, ptr, ptr, ptr, ptr, env, i32)
+DEF_HELPER_6(vmulh_vv_w, void, ptr, ptr, ptr, ptr, env, i32)
+DEF_HELPER_6(vmulh_vv_d, void, ptr, ptr, ptr, ptr, env, i32)
+DEF_HELPER_6(vmulhu_vv_b, void, ptr, ptr, ptr, ptr, env, i32)
+DEF_HELPER_6(vmulhu_vv_h, void, ptr, ptr, ptr, ptr, env, i32)
+DEF_HELPER_6(vmulhu_vv_w, void, ptr, ptr, ptr, ptr, env, i32)
+DEF_HELPER_6(vmulhu_vv_d, void, ptr, ptr, ptr, ptr, env, i32)
+DEF_HELPER_6(vmulhsu_vv_b, void, ptr, ptr, ptr, ptr, env, i32)
+DEF_HELPER_6(vmulhsu_vv_h, void, ptr, ptr, ptr, ptr, env, i32)
+DEF_HELPER_6(vmulhsu_vv_w, void, ptr, ptr, ptr, ptr, env, i32)
+DEF_HELPER_6(vmulhsu_vv_d, void, ptr, ptr, ptr, ptr, env, i32)
+DEF_HELPER_6(vmul_vx_b, void, ptr, ptr, tl, ptr, env, i32)
+DEF_HELPER_6(vmul_vx_h, void, ptr, ptr, tl, ptr, env, i32)
+DEF_HELPER_6(vmul_vx_w, void, ptr, ptr, tl, ptr, env, i32)
+DEF_HELPER_6(vmul_vx_d, void, ptr, ptr, tl, ptr, env, i32)
+DEF_HELPER_6(vmulh_vx_b, void, ptr, ptr, tl, ptr, env, i32)
+DEF_HELPER_6(vmulh_vx_h, void, ptr, ptr, tl, ptr, env, i32)
+DEF_HELPER_6(vmulh_vx_w, void, ptr, ptr, tl, ptr, env, i32)
+DEF_HELPER_6(vmulh_vx_d, void, ptr, ptr, tl, ptr, env, i32)
+DEF_HELPER_6(vmulhu_vx_b, void, ptr, ptr, tl, ptr, env, i32)
+DEF_HELPER_6(vmulhu_vx_h, void, ptr, ptr, tl, ptr, env, i32)
+DEF_HELPER_6(vmulhu_vx_w, void, ptr, ptr, tl, ptr, env, i32)
+DEF_HELPER_6(vmulhu_vx_d, void, ptr, ptr, tl, ptr, env, i32)
+DEF_HELPER_6(vmulhsu_vx_b, void, ptr, ptr, tl, ptr, env, i32)
+DEF_HELPER_6(vmulhsu_vx_h, void, ptr, ptr, tl, ptr, env, i32)
+DEF_HELPER_6(vmulhsu_vx_w, void, ptr, ptr, tl, ptr, env, i32)
+DEF_HELPER_6(vmulhsu_vx_d, void, ptr, ptr, tl, ptr, env, i32)
diff --git a/target/riscv/insn32.decode b/target/riscv/insn32.decode
index aafbdc6be7..abfed469bc 100644
--- a/target/riscv/insn32.decode
+++ b/target/riscv/insn32.decode
@@ -363,6 +363,14 @@  vmaxu_vv        000110 . ..... ..... 000 ..... 1010111 @r_vm
 vmaxu_vx        000110 . ..... ..... 100 ..... 1010111 @r_vm
 vmax_vv         000111 . ..... ..... 000 ..... 1010111 @r_vm
 vmax_vx         000111 . ..... ..... 100 ..... 1010111 @r_vm
+vmul_vv         100101 . ..... ..... 010 ..... 1010111 @r_vm
+vmul_vx         100101 . ..... ..... 110 ..... 1010111 @r_vm
+vmulh_vv        100111 . ..... ..... 010 ..... 1010111 @r_vm
+vmulh_vx        100111 . ..... ..... 110 ..... 1010111 @r_vm
+vmulhu_vv       100100 . ..... ..... 010 ..... 1010111 @r_vm
+vmulhu_vx       100100 . ..... ..... 110 ..... 1010111 @r_vm
+vmulhsu_vv      100110 . ..... ..... 010 ..... 1010111 @r_vm
+vmulhsu_vx      100110 . ..... ..... 110 ..... 1010111 @r_vm
 
 vsetvli         0 ........... ..... 111 ..... 1010111  @r2_zimm
 vsetvl          1000000 ..... ..... 111 ..... 1010111  @r
diff --git a/target/riscv/insn_trans/trans_rvv.inc.c b/target/riscv/insn_trans/trans_rvv.inc.c
index 53c49ee15c..c276beabd6 100644
--- a/target/riscv/insn_trans/trans_rvv.inc.c
+++ b/target/riscv/insn_trans/trans_rvv.inc.c
@@ -1452,3 +1452,13 @@  GEN_OPIVX_TRANS(vminu_vx, opivx_check)
 GEN_OPIVX_TRANS(vmin_vx,  opivx_check)
 GEN_OPIVX_TRANS(vmaxu_vx, opivx_check)
 GEN_OPIVX_TRANS(vmax_vx,  opivx_check)
+
+/* Vector Single-Width Integer Multiply Instructions */
+GEN_OPIVV_GVEC_TRANS(vmul_vv,  mul)
+GEN_OPIVV_TRANS(vmulh_vv, opivv_check)
+GEN_OPIVV_TRANS(vmulhu_vv, opivv_check)
+GEN_OPIVV_TRANS(vmulhsu_vv, opivv_check)
+GEN_OPIVX_GVEC_TRANS(vmul_vx,  muls)
+GEN_OPIVX_TRANS(vmulh_vx, opivx_check)
+GEN_OPIVX_TRANS(vmulhu_vx, opivx_check)
+GEN_OPIVX_TRANS(vmulhsu_vx, opivx_check)
diff --git a/target/riscv/vector_helper.c b/target/riscv/vector_helper.c
index 32c2760a8a..56ba9a7422 100644
--- a/target/riscv/vector_helper.c
+++ b/target/riscv/vector_helper.c
@@ -852,6 +852,10 @@  GEN_VEXT_AMO(vamomaxuw_v_w, uint32_t, uint32_t, idx_w, clearl)
 #define OP_UUU_H uint16_t, uint16_t, uint16_t, uint16_t, uint16_t
 #define OP_UUU_W uint32_t, uint32_t, uint32_t, uint32_t, uint32_t
 #define OP_UUU_D uint64_t, uint64_t, uint64_t, uint64_t, uint64_t
+#define OP_SUS_B int8_t, uint8_t, int8_t, uint8_t, int8_t
+#define OP_SUS_H int16_t, uint16_t, int16_t, uint16_t, int16_t
+#define OP_SUS_W int32_t, uint32_t, int32_t, uint32_t, int32_t
+#define OP_SUS_D int64_t, uint64_t, int64_t, uint64_t, int64_t
 
 /* operation of two vector elements */
 typedef void opivv2_fn(void *vd, void *vs1, void *vs2, int i);
@@ -1585,3 +1589,155 @@  GEN_VEXT_VX(vmax_vx_b, 1, 1, clearb)
 GEN_VEXT_VX(vmax_vx_h, 2, 2, clearh)
 GEN_VEXT_VX(vmax_vx_w, 4, 4, clearl)
 GEN_VEXT_VX(vmax_vx_d, 8, 8, clearq)
+
+/* Vector Single-Width Integer Multiply Instructions */
+#define DO_MUL(N, M) (N * M)
+RVVCALL(OPIVV2, vmul_vv_b, OP_SSS_B, H1, H1, H1, DO_MUL)
+RVVCALL(OPIVV2, vmul_vv_h, OP_SSS_H, H2, H2, H2, DO_MUL)
+RVVCALL(OPIVV2, vmul_vv_w, OP_SSS_W, H4, H4, H4, DO_MUL)
+RVVCALL(OPIVV2, vmul_vv_d, OP_SSS_D, H8, H8, H8, DO_MUL)
+GEN_VEXT_VV(vmul_vv_b, 1, 1, clearb)
+GEN_VEXT_VV(vmul_vv_h, 2, 2, clearh)
+GEN_VEXT_VV(vmul_vv_w, 4, 4, clearl)
+GEN_VEXT_VV(vmul_vv_d, 8, 8, clearq)
+
+static int8_t do_mulh_b(int8_t s2, int8_t s1)
+{
+    return (int16_t)s2 * (int16_t)s1 >> 8;
+}
+
+static int16_t do_mulh_h(int16_t s2, int16_t s1)
+{
+    return (int32_t)s2 * (int32_t)s1 >> 16;
+}
+
+static int32_t do_mulh_w(int32_t s2, int32_t s1)
+{
+    return (int64_t)s2 * (int64_t)s1 >> 32;
+}
+
+static int64_t do_mulh_d(int64_t s2, int64_t s1)
+{
+    uint64_t hi_64, lo_64;
+
+    muls64(&lo_64, &hi_64, s1, s2);
+    return hi_64;
+}
+
+static uint8_t do_mulhu_b(uint8_t s2, uint8_t s1)
+{
+    return (uint16_t)s2 * (uint16_t)s1 >> 8;
+}
+
+static uint16_t do_mulhu_h(uint16_t s2, uint16_t s1)
+{
+    return (uint32_t)s2 * (uint32_t)s1 >> 16;
+}
+
+static uint32_t do_mulhu_w(uint32_t s2, uint32_t s1)
+{
+    return (uint64_t)s2 * (uint64_t)s1 >> 32;
+}
+
+static uint64_t do_mulhu_d(uint64_t s2, uint64_t s1)
+{
+    uint64_t hi_64, lo_64;
+
+    mulu64(&lo_64, &hi_64, s2, s1);
+    return hi_64;
+}
+
+static int8_t do_mulhsu_b(int8_t s2, uint8_t s1)
+{
+    return (int16_t)s2 * (uint16_t)s1 >> 8;
+}
+
+static int16_t do_mulhsu_h(int16_t s2, uint16_t s1)
+{
+    return (int32_t)s2 * (uint32_t)s1 >> 16;
+}
+
+static int32_t do_mulhsu_w(int32_t s2, uint32_t s1)
+{
+    return (int64_t)s2 * (uint64_t)s1 >> 32;
+}
+
+static int64_t do_mulhsu_d(int64_t s2, uint64_t s1)
+{
+    uint64_t hi_64, lo_64, abs_s2 = s2;
+
+    if (s2 < 0) {
+        abs_s2 = -s2;
+    }
+    mulu64(&lo_64, &hi_64, abs_s2, s1);
+    if (s2 < 0) {
+        lo_64 = ~lo_64;
+        hi_64 = ~hi_64;
+        if (lo_64 == UINT64_MAX) {
+            lo_64 = 0;
+            hi_64 += 1;
+        } else {
+            lo_64 += 1;
+        }
+    }
+
+    return hi_64;
+}
+
+RVVCALL(OPIVV2, vmulh_vv_b, OP_SSS_B, H1, H1, H1, do_mulh_b)
+RVVCALL(OPIVV2, vmulh_vv_h, OP_SSS_H, H2, H2, H2, do_mulh_h)
+RVVCALL(OPIVV2, vmulh_vv_w, OP_SSS_W, H4, H4, H4, do_mulh_w)
+RVVCALL(OPIVV2, vmulh_vv_d, OP_SSS_D, H8, H8, H8, do_mulh_d)
+RVVCALL(OPIVV2, vmulhu_vv_b, OP_UUU_B, H1, H1, H1, do_mulhu_b)
+RVVCALL(OPIVV2, vmulhu_vv_h, OP_UUU_H, H2, H2, H2, do_mulhu_h)
+RVVCALL(OPIVV2, vmulhu_vv_w, OP_UUU_W, H4, H4, H4, do_mulhu_w)
+RVVCALL(OPIVV2, vmulhu_vv_d, OP_UUU_D, H8, H8, H8, do_mulhu_d)
+RVVCALL(OPIVV2, vmulhsu_vv_b, OP_SUS_B, H1, H1, H1, do_mulhsu_b)
+RVVCALL(OPIVV2, vmulhsu_vv_h, OP_SUS_H, H2, H2, H2, do_mulhsu_h)
+RVVCALL(OPIVV2, vmulhsu_vv_w, OP_SUS_W, H4, H4, H4, do_mulhsu_w)
+RVVCALL(OPIVV2, vmulhsu_vv_d, OP_SUS_D, H8, H8, H8, do_mulhsu_d)
+GEN_VEXT_VV(vmulh_vv_b, 1, 1, clearb)
+GEN_VEXT_VV(vmulh_vv_h, 2, 2, clearh)
+GEN_VEXT_VV(vmulh_vv_w, 4, 4, clearl)
+GEN_VEXT_VV(vmulh_vv_d, 8, 8, clearq)
+GEN_VEXT_VV(vmulhu_vv_b, 1, 1, clearb)
+GEN_VEXT_VV(vmulhu_vv_h, 2, 2, clearh)
+GEN_VEXT_VV(vmulhu_vv_w, 4, 4, clearl)
+GEN_VEXT_VV(vmulhu_vv_d, 8, 8, clearq)
+GEN_VEXT_VV(vmulhsu_vv_b, 1, 1, clearb)
+GEN_VEXT_VV(vmulhsu_vv_h, 2, 2, clearh)
+GEN_VEXT_VV(vmulhsu_vv_w, 4, 4, clearl)
+GEN_VEXT_VV(vmulhsu_vv_d, 8, 8, clearq)
+
+RVVCALL(OPIVX2, vmul_vx_b, OP_SSS_B, H1, H1, DO_MUL)
+RVVCALL(OPIVX2, vmul_vx_h, OP_SSS_H, H2, H2, DO_MUL)
+RVVCALL(OPIVX2, vmul_vx_w, OP_SSS_W, H4, H4, DO_MUL)
+RVVCALL(OPIVX2, vmul_vx_d, OP_SSS_D, H8, H8, DO_MUL)
+RVVCALL(OPIVX2, vmulh_vx_b, OP_SSS_B, H1, H1, do_mulh_b)
+RVVCALL(OPIVX2, vmulh_vx_h, OP_SSS_H, H2, H2, do_mulh_h)
+RVVCALL(OPIVX2, vmulh_vx_w, OP_SSS_W, H4, H4, do_mulh_w)
+RVVCALL(OPIVX2, vmulh_vx_d, OP_SSS_D, H8, H8, do_mulh_d)
+RVVCALL(OPIVX2, vmulhu_vx_b, OP_UUU_B, H1, H1, do_mulhu_b)
+RVVCALL(OPIVX2, vmulhu_vx_h, OP_UUU_H, H2, H2, do_mulhu_h)
+RVVCALL(OPIVX2, vmulhu_vx_w, OP_UUU_W, H4, H4, do_mulhu_w)
+RVVCALL(OPIVX2, vmulhu_vx_d, OP_UUU_D, H8, H8, do_mulhu_d)
+RVVCALL(OPIVX2, vmulhsu_vx_b, OP_SUS_B, H1, H1, do_mulhsu_b)
+RVVCALL(OPIVX2, vmulhsu_vx_h, OP_SUS_H, H2, H2, do_mulhsu_h)
+RVVCALL(OPIVX2, vmulhsu_vx_w, OP_SUS_W, H4, H4, do_mulhsu_w)
+RVVCALL(OPIVX2, vmulhsu_vx_d, OP_SUS_D, H8, H8, do_mulhsu_d)
+GEN_VEXT_VX(vmul_vx_b, 1, 1, clearb)
+GEN_VEXT_VX(vmul_vx_h, 2, 2, clearh)
+GEN_VEXT_VX(vmul_vx_w, 4, 4, clearl)
+GEN_VEXT_VX(vmul_vx_d, 8, 8, clearq)
+GEN_VEXT_VX(vmulh_vx_b, 1, 1, clearb)
+GEN_VEXT_VX(vmulh_vx_h, 2, 2, clearh)
+GEN_VEXT_VX(vmulh_vx_w, 4, 4, clearl)
+GEN_VEXT_VX(vmulh_vx_d, 8, 8, clearq)
+GEN_VEXT_VX(vmulhu_vx_b, 1, 1, clearb)
+GEN_VEXT_VX(vmulhu_vx_h, 2, 2, clearh)
+GEN_VEXT_VX(vmulhu_vx_w, 4, 4, clearl)
+GEN_VEXT_VX(vmulhu_vx_d, 8, 8, clearq)
+GEN_VEXT_VX(vmulhsu_vx_b, 1, 1, clearb)
+GEN_VEXT_VX(vmulhsu_vx_h, 2, 2, clearh)
+GEN_VEXT_VX(vmulhsu_vx_w, 4, 4, clearl)
+GEN_VEXT_VX(vmulhsu_vx_d, 8, 8, clearq)
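
The sub-64-bit helpers above (do_mulh_b/h/w, do_mulhu_*, do_mulhsu_*) all
use the same widen-and-shift pattern: promote both operands to the next
element width so the full product fits, then keep the upper half. A minimal
standalone sketch of the pattern (mulh32 is an illustrative name, not from
the patch):

#include <inttypes.h>
#include <stdio.h>

/* High 32 bits of a signed 32x32 multiply: widen both operands to
 * 64 bits so the full product fits, then shift it down. */
static int32_t mulh32(int32_t s2, int32_t s1)
{
    return (int64_t)s2 * (int64_t)s1 >> 32;
}

int main(void)
{
    /* 0x40000000 * 4 = 2**32, so the high word of the product is 1. */
    printf("%" PRId32 "\n", mulh32(0x40000000, 4));   /* prints 1 */
    /* -1 * 1 = -1; the high word is the sign extension, -1. */
    printf("%" PRId32 "\n", mulh32(-1, 1));           /* prints -1 */
    return 0;
}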