
[v4,5/5] target/riscv: add vector amo operations

Message ID 20200225103508.7651-6-zhiwei_liu@c-sky.com
State New
Series target/riscv: support vector extension part 2

Commit Message

LIU Zhiwei Feb. 25, 2020, 10:35 a.m. UTC
Vector AMOs operate as if aq and rl bits were zero on each element
with regard to ordering relative to other instructions in the same hart.
Vector AMOs provide no ordering guarantee between element operations
in the same vector AMO instruction.

Signed-off-by: LIU Zhiwei <zhiwei_liu@c-sky.com>
---
 target/riscv/helper.h                   |  56 +++++
 target/riscv/insn32-64.decode           |  11 +
 target/riscv/insn32.decode              |  13 ++
 target/riscv/insn_trans/trans_rvv.inc.c | 154 +++++++++++++
 target/riscv/vector_helper.c            | 280 +++++++++++++++++++++++-
 5 files changed, 513 insertions(+), 1 deletion(-)

Comments

Richard Henderson Feb. 28, 2020, 5:38 a.m. UTC | #1
On 2/25/20 2:35 AM, LIU Zhiwei wrote:
> +    if (s->sew < 2) {
> +        return false;
> +    }

This could just as easily be in amo_check?

> +
> +    if (tb_cflags(s->base.tb) & CF_PARALLEL) {
> +#ifdef CONFIG_ATOMIC64
> +        fn = fns[0][seq][s->sew - 2];
> +#else
> +        gen_helper_exit_atomic(cpu_env);
> +        s->base.is_jmp = DISAS_NORETURN;
> +        return true;
> +#endif

Why are you raising exit_atomic without first checking that s->sew == 3?  We
can do 32-bit atomic operations always.

> +    } else {
> +        fn = fns[1][seq][s->sew - 2];
> +    }
> +    if (fn == NULL) {
> +        return false;
> +    }
> +
> +    return amo_trans(a->rd, a->rs1, a->rs2, data, fn, s);
> +}
> +
> +static bool amo_check(DisasContext *s, arg_rwdvm* a)
> +{
> +    return (vext_check_isa_ill(s, RVV | RVA) &&
> +            (a->wd ? vext_check_overlap_mask(s, a->rd, a->vm) : 1) &&
> +            vext_check_reg(s, a->rd, false) &&
> +            vext_check_reg(s, a->rs2, false));
> +}

I guess the "If SEW is greater than XLEN, an illegal instruction exception is
raised" requirement is currently in the column of NULLs in the !CONFIG_RISCV64
block.  But it might be better to have it explicit and save the column of NULLs.

It makes sense to me to do both sew checks together, whether in amo_check or in
amo_op.

> +#define GEN_VEXT_AMO_NOATOMIC_OP(NAME, ETYPE, MTYPE, H, DO_OP, SUF)      \
> +static void vext_##NAME##_noatomic_op(void *vs3, target_ulong addr,      \
> +        uint32_t wd, uint32_t idx, CPURISCVState *env, uintptr_t retaddr)\
> +{                                                                        \
> +    ETYPE ret;                                                           \
> +    target_ulong tmp;                                                    \
> +    int mmu_idx = cpu_mmu_index(env, false);                             \
> +    tmp = cpu_ld##SUF##_mmuidx_ra(env, addr, mmu_idx, retaddr);          \
> +    ret = DO_OP((ETYPE)(MTYPE)tmp, *((ETYPE *)vs3 + H(idx)));            \
> +    cpu_st##SUF##_mmuidx_ra(env, addr, ret, mmu_idx, retaddr);           \
> +    if (wd) {                                                            \
> +        *((ETYPE *)vs3 + H(idx)) = (target_long)(MTYPE)tmp;              \

The target_long cast is wrong; should be ETYPE.

You can use cpu_ldX/stX_data (no mmu_idx or retaddr argument).  There should be
no faults, since you've already checked for read+write.
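
For example, the 32-bit add case could boil down to something like this
(a sketch only; retaddr dropped and the write-back done in ETYPE, per the
comments above):

static void vext_vamoaddw_v_w_noatomic_op(void *vs3, target_ulong addr,
                                          uint32_t wd, uint32_t idx,
                                          CPURISCVState *env)
{
    int32_t *pe3 = (int32_t *)vs3 + H4(idx);
    int32_t tmp = cpu_ldl_data(env, addr);   /* no fault expected here */

    cpu_stl_data(env, addr, tmp + *pe3);
    if (wd) {
        *pe3 = tmp;
    }
}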

> +/* atomic operation for vector atomic instructions */
> +#ifndef CONFIG_USER_ONLY
> +#define GEN_VEXT_ATOMIC_OP(NAME, ETYPE, MTYPE, MOFLAG, H, AMO)           \
> +static void vext_##NAME##_atomic_op(void *vs3, target_ulong addr,        \
> +        uint32_t wd, uint32_t idx, CPURISCVState *env)                   \
> +{                                                                        \
> +    target_ulong tmp;                                                    \
> +    int mem_idx = cpu_mmu_index(env, false);                             \
> +    tmp = helper_atomic_##AMO##_le(env, addr, *((ETYPE *)vs3 + H(idx)),  \
> +            make_memop_idx(MO_ALIGN | MOFLAG, mem_idx));                 \
> +    if (wd) {                                                            \
> +        *((ETYPE *)vs3 + H(idx)) = (target_long)(MTYPE)tmp;              \
> +    }                                                                    \
> +}
> +#else
> +#define GEN_VEXT_ATOMIC_OP(NAME, ETYPE, MTYPE, MOFLAG, H, AMO)           \
> +static void vext_##NAME##_atomic_op(void *vs3, target_ulong addr,        \
> +        uint32_t wd, uint32_t idx, CPURISCVState *env)                   \
> +{                                                                        \
> +    target_ulong tmp;                                                    \
> +    tmp = helper_atomic_##AMO##_le(env, addr, *((ETYPE *)vs3 + H(idx))); \
> +    if (wd) {                                                            \
> +        *((ETYPE *)vs3 + H(idx)) = (target_long)(MTYPE)tmp;              \
> +    }                                                                    \
> +}
> +#endif

This is not right.  It is not legal to call these helpers from another helper
-- they will use the wrong GETPC() and will not unwind properly.

> +static inline void vext_amo_atomic(void *vs3, void *v0, target_ulong base,
> +        void *vs2, CPURISCVState *env, uint32_t desc,
> +        vext_get_index_addr get_index_addr,
> +        vext_amo_atomic_fn atomic_op,
> +        vext_ld_clear_elem clear_elem,
> +        uint32_t esz, uint32_t msz, uintptr_t ra)
> +{
> +    uint32_t i;
> +    target_long addr;
> +    uint32_t wd = vext_wd(desc);
> +    uint32_t vm = vext_vm(desc);
> +    uint32_t mlen = vext_mlen(desc);
> +    uint32_t vlmax = vext_maxsz(desc) / esz;
> +
> +    for (i = 0; i < env->vl; i++) {
> +        if (!vm && !vext_elem_mask(v0, mlen, i)) {
> +            continue;
> +        }
> +        probe_read_access(env, get_index_addr(base, i, vs2), msz, ra);
> +        probe_write_access(env, get_index_addr(base, i, vs2), msz, ra);
> +    }

You probably need to check for aligned address here too, probably first so an
unaligned fault has priority over an invalid page fault.

The missing aligned address check is the only remaining exception that the
helper_atomic_* functions would raise, since you have properly checked for
read+write.  So it might be possible to get away with using the helpers, but I
don't like it.

But I do think it would be better to write your own helpers for the atomic
paths.  They need not check quite so much, since we have already done the
validation above.  You pretty much only need to use tlb_vaddr_to_host.

If that gets too ugly, we can talk about rearranging
accel/tcg/atomic_template.h so that it could be reused.

Alternately, we could simply *always* use the non-atomic helpers, and raise
exit_atomic if PARALLEL.
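
In amo_op() that would reduce to something like this (a sketch, with fns[]
collapsed to just the non-atomic helpers):

    if (tb_cflags(s->base.tb) & CF_PARALLEL) {
        gen_helper_exit_atomic(cpu_env);
        s->base.is_jmp = DISAS_NORETURN;
        return true;
    }
    fn = fns[seq][s->sew - 2];
    if (fn == NULL) {
        return false;
    }
    return amo_trans(a->rd, a->rs1, a->rs2, data, fn, s);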

> +static inline void vext_amo_noatomic(void *vs3, void *v0, target_ulong base,
> +        void *vs2, CPURISCVState *env, uint32_t desc,
> +        vext_get_index_addr get_index_addr,
> +        vext_amo_noatomic_fn noatomic_op,
> +        vext_ld_clear_elem clear_elem,
> +        uint32_t esz, uint32_t msz, uintptr_t ra)

Without the retaddr argument to the noatomic_fn, as described above,
vext_amo_noatomic and vext_amo_atomic are identical.


r~
LIU Zhiwei Feb. 28, 2020, 9:19 a.m. UTC | #2
On 2020/2/28 13:38, Richard Henderson wrote:
> On 2/25/20 2:35 AM, LIU Zhiwei wrote:
>> +    if (s->sew < 2) {
>> +        return false;
>> +    }
> This could just as easily be in amo_check?
Yes, it can be done in amo_check.
>> +
>> +    if (tb_cflags(s->base.tb) & CF_PARALLEL) {
>> +#ifdef CONFIG_ATOMIC64
>> +        fn = fns[0][seq][s->sew - 2];
>> +#else
>> +        gen_helper_exit_atomic(cpu_env);
>> +        s->base.is_jmp = DISAS_NORETURN;
>> +        return true;
>> +#endif
> Why are you raising exit_atomic without first checking that s->sew == 3?
Yes, it should be

    if (s->sew == 3) {
#ifdef CONFIG_ATOMIC64
        fn = fns[0][seq][1];
#else
        gen_helper_exit_atomic(cpu_env);
        s->base.is_jmp = DISAS_NORETURN;
        return true;
#endif
    } else {
#ifdef TARGET_RISCV64
        fn = fns[0][seq][0];
#else
        fn = fns[0][seq];
#endif
    }

Is it OK?
> We
> can do 32-bit atomic operations always.
Good.
>
>> +    } else {
>> +        fn = fns[1][seq][s->sew - 2];
>> +    }
>> +    if (fn == NULL) {
>> +        return false;
>> +    }
>> +
>> +    return amo_trans(a->rd, a->rs1, a->rs2, data, fn, s);
>> +}
>> +
>> +static bool amo_check(DisasContext *s, arg_rwdvm* a)
>> +{
>> +    return (vext_check_isa_ill(s, RVV | RVA) &&
>> +            (a->wd ? vext_check_overlap_mask(s, a->rd, a->vm) : 1) &&
>> +            vext_check_reg(s, a->rd, false) &&
>> +            vext_check_reg(s, a->rs2, false));
>> +}
> I guess the "If SEW is greater than XLEN, an illegal instruction exception is
> raised" requirement is currently in the column of NULLs in the !CONFIG_RISCV64
> block.  But it might be better to have it explicit and save the column of NULLs.
Maybe add

    (1 << s->sew) <= sizeof(target_ulong) &&

in amo_check.
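
As a sketch (just folding both SEW checks into the existing predicate;
names as in the patch):

static bool amo_check(DisasContext *s, arg_rwdvm *a)
{
    return (vext_check_isa_ill(s, RVV | RVA) &&
            (1 << s->sew) <= sizeof(target_ulong) &&   /* SEW <= XLEN */
            (s->sew >= 2) &&                           /* SEW >= 32   */
            (a->wd ? vext_check_overlap_mask(s, a->rd, a->vm) : 1) &&
            vext_check_reg(s, a->rd, false) &&
            vext_check_reg(s, a->rs2, false));
}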
> It makes sense to me to do both sew checks together, whether in amo_check or in
> amo_op.
>
>> +#define GEN_VEXT_AMO_NOATOMIC_OP(NAME, ETYPE, MTYPE, H, DO_OP, SUF)      \
>> +static void vext_##NAME##_noatomic_op(void *vs3, target_ulong addr,      \
>> +        uint32_t wd, uint32_t idx, CPURISCVState *env, uintptr_t retaddr)\
>> +{                                                                        \
>> +    ETYPE ret;                                                           \
>> +    target_ulong tmp;                                                    \
>> +    int mmu_idx = cpu_mmu_index(env, false);                             \
>> +    tmp = cpu_ld##SUF##_mmuidx_ra(env, addr, mmu_idx, retaddr);          \
>> +    ret = DO_OP((ETYPE)(MTYPE)tmp, *((ETYPE *)vs3 + H(idx)));            \
>> +    cpu_st##SUF##_mmuidx_ra(env, addr, ret, mmu_idx, retaddr);           \
>> +    if (wd) {                                                            \
>> +        *((ETYPE *)vs3 + H(idx)) = (target_long)(MTYPE)tmp;              \
> The target_long cast is wrong; should be ETYPE.
"If the AMO memory element width is less than SEW, the value returned 
from memory
  is sign-extended to fill SEW"

So just use (target_long) to sign-extended. As you see, instructions like

vamominud

have the uint64_t as ETYPE.  And it can't sign-extend the value from 
memory by (ETYPE)(MTYPE)tmp.
> You can use cpu_ldX/stX_data (no mmu_idx or retaddr argument).  There should be
> no faults, since you've already checked for read+write.
Good idea.
>> +/* atomic operation for vector atomic instructions */
>> +#ifndef CONFIG_USER_ONLY
>> +#define GEN_VEXT_ATOMIC_OP(NAME, ETYPE, MTYPE, MOFLAG, H, AMO)           \
>> +static void vext_##NAME##_atomic_op(void *vs3, target_ulong addr,        \
>> +        uint32_t wd, uint32_t idx, CPURISCVState *env)                   \
>> +{                                                                        \
>> +    target_ulong tmp;                                                    \
>> +    int mem_idx = cpu_mmu_index(env, false);                             \
>> +    tmp = helper_atomic_##AMO##_le(env, addr, *((ETYPE *)vs3 + H(idx)),  \
>> +            make_memop_idx(MO_ALIGN | MOFLAG, mem_idx));                 \
>> +    if (wd) {                                                            \
>> +        *((ETYPE *)vs3 + H(idx)) = (target_long)(MTYPE)tmp;              \
>> +    }                                                                    \
>> +}
>> +#else
>> +#define GEN_VEXT_ATOMIC_OP(NAME, ETYPE, MTYPE, MOFLAG, H, AMO)           \
>> +static void vext_##NAME##_atomic_op(void *vs3, target_ulong addr,        \
>> +        uint32_t wd, uint32_t idx, CPURISCVState *env)                   \
>> +{                                                                        \
>> +    target_ulong tmp;                                                    \
>> +    tmp = helper_atomic_##AMO##_le(env, addr, *((ETYPE *)vs3 + H(idx))); \
>> +    if (wd) {                                                            \
>> +        *((ETYPE *)vs3 + H(idx)) = (target_long)(MTYPE)tmp;              \
>> +    }                                                                    \
>> +}
>> +#endif
> This is not right.  It is not legal to call these helpers from another helper
> -- they will use the wrong GETPC() and will not unwind properly.
Oh. I didn't notice it.
>> +static inline void vext_amo_atomic(void *vs3, void *v0, target_ulong base,
>> +        void *vs2, CPURISCVState *env, uint32_t desc,
>> +        vext_get_index_addr get_index_addr,
>> +        vext_amo_atomic_fn atomic_op,
>> +        vext_ld_clear_elem clear_elem,
>> +        uint32_t esz, uint32_t msz, uintptr_t ra)
>> +{
>> +    uint32_t i;
>> +    target_long addr;
>> +    uint32_t wd = vext_wd(desc);
>> +    uint32_t vm = vext_vm(desc);
>> +    uint32_t mlen = vext_mlen(desc);
>> +    uint32_t vlmax = vext_maxsz(desc) / esz;
>> +
>> +    for (i = 0; i < env->vl; i++) {
>> +        if (!vm && !vext_elem_mask(v0, mlen, i)) {
>> +            continue;
>> +        }
>> +        probe_read_access(env, get_index_addr(base, i, vs2), msz, ra);
>> +        probe_write_access(env, get_index_addr(base, i, vs2), msz, ra);
>> +    }
> You probably need to check for aligned address here too, probably first so an
> unaligned fault has priority over an invalid page fault.
OK, I will use cpu_unaligned_access() to check for unaligned accesses.
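
For example (a sketch; assuming msz is a power of two and the usual
cpu_unaligned_access() hook, raising the unaligned fault before probing
the page):

    for (i = 0; i < env->vl; i++) {
        if (!vm && !vext_elem_mask(v0, mlen, i)) {
            continue;
        }
        addr = get_index_addr(base, i, vs2);
        if (addr & (msz - 1)) {
            /* misaligned fault takes priority over a page fault */
            cpu_unaligned_access(env_cpu(env), addr, MMU_DATA_STORE,
                                 cpu_mmu_index(env, false), ra);
        }
        probe_read_access(env, addr, msz, ra);
        probe_write_access(env, addr, msz, ra);
    }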
> The missing aligned address check is the only remaining exception that the
> helper_atomic_* functions would raise, since you have properly checked for
> read+write.  So it might be possible to get away with using the helpers, but I
> don't like it.
Do you mean I should write my own helpers to implement the atomic operations?

What do you mean by "but I don't like it"?
> But I do think it would be better to write your own helpers for the atomic
> paths.  They need not check quite so much, since we have already done the
> validation above.  You pretty much only need to use tlb_vaddr_to_host.
>
> If that gets too ugly, we can talk about rearranging
> accel/tcg/atomic_template.h so that it could be reused.
Good idea.  Perhaps use tlb_vaddr_to_host instead of atomic_mmu_lookup
to define another macro like GEN_ATOMIC_HELPER?
> Alternately, we could simply *always* use the non-atomic helpers, and raise
> exit_atomic if PARALLEL.
Yes, that's the simplest way. However, I'd prefer to try to define
something like GEN_ATOMIC_HELPER in vector_helper.c.
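
For instance, a per-element atomic op might look roughly like this (just a
sketch of the idea; the probe loop has already validated read+write, so a
NULL host pointer should only mean MMIO-like memory, and host/guest
endianness differences are glossed over here):

static void vext_vamoaddw_v_w_atomic_op(void *vs3, target_ulong addr,
                                        uint32_t wd, uint32_t idx,
                                        CPURISCVState *env)
{
    int32_t *pe3 = (int32_t *)vs3 + H4(idx);
    int32_t *host = tlb_vaddr_to_host(env, addr, MMU_DATA_STORE,
                                      cpu_mmu_index(env, false));
    int32_t old;

    if (host) {
        old = atomic_fetch_add(host, *pe3);
    } else {
        /* no direct host pointer (e.g. MMIO): plain read-modify-write */
        old = cpu_ldl_data(env, addr);
        cpu_stl_data(env, addr, old + *pe3);
    }
    if (wd) {
        *pe3 = old;
    }
}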
>> +static inline void vext_amo_noatomic(void *vs3, void *v0, target_ulong base,
>> +        void *vs2, CPURISCVState *env, uint32_t desc,
>> +        vext_get_index_addr get_index_addr,
>> +        vext_amo_noatomic_fn noatomic_op,
>> +        vext_ld_clear_elem clear_elem,
>> +        uint32_t esz, uint32_t msz, uintptr_t ra)
> Without the retaddr argument to the noatomic_fn, as described above,
> vext_amo_noatomic and vext_amo_atomic are identical.
Yes.

It's very kind of you to spend so much time reviewing this.
Thanks very much.

Best Regards,
Zhiwei
>
> r~
Richard Henderson Feb. 28, 2020, 6:46 p.m. UTC | #3
On 2/28/20 1:19 AM, LIU Zhiwei wrote:
>>> +#define GEN_VEXT_AMO_NOATOMIC_OP(NAME, ETYPE, MTYPE, H, DO_OP, SUF)      \
>>> +static void vext_##NAME##_noatomic_op(void *vs3, target_ulong addr,      \
>>> +        uint32_t wd, uint32_t idx, CPURISCVState *env, uintptr_t retaddr)\
>>> +{                                                                        \
>>> +    ETYPE ret;                                                           \
>>> +    target_ulong tmp;                                                    \
>>> +    int mmu_idx = cpu_mmu_index(env, false);                             \
>>> +    tmp = cpu_ld##SUF##_mmuidx_ra(env, addr, mmu_idx, retaddr);          \
>>> +    ret = DO_OP((ETYPE)(MTYPE)tmp, *((ETYPE *)vs3 + H(idx)));            \
>>> +    cpu_st##SUF##_mmuidx_ra(env, addr, ret, mmu_idx, retaddr);           \
>>> +    if (wd) {                                                            \
>>> +        *((ETYPE *)vs3 + H(idx)) = (target_long)(MTYPE)tmp;              \
>> The target_long cast is wrong; should be ETYPE.
> "If the AMO memory element width is less than SEW, the value returned from memory
>  is sign-extended to fill SEW"
> 
> So just use (target_long) to sign-extended. As you see, instructions like
> 
> vamominud
> 
> have the uint64_t as ETYPE.  And it can't sign-extend the value from memory by
> (ETYPE)(MTYPE)tmp.

Casting to target_long doesn't help -- it becomes signed at a variable size,
possibly larger than MTYPE.

In addition, I think you're performing the operation at the wrong length.  The
text of the ISA document could be clearer, but

  # If SEW > 32 bits, the value returned from memory
  # is sign-extended to fill SEW.

You are performing the operation in ETYPE, but it should be done in MTYPE and
only afterward extended to ETYPE.

For minu/maxu, you're right that you need an unsigned for the operation.  But
then you need a signed type of the same width for the extension.

One possibility is to *always* make MTYPE a signed type, but for the two cases
that require an unsigned type, provide it.  E.g.

#define GEN_VEXT_AMO_NOATOMIC_OP(NAME, ESZ, MSZ, H, DO_OP, SUF)
static void vext_##NAME##_noatomic_op(void *vs3,
    target_ulong addr, uint32_t wd, uint32_t idx,
    CPURISCVState *env, uintptr_t retaddr)
{
    typedef int##ESZ##_t ETYPE;
    typedef int##MSZ##_t MTYPE;
    typedef uint##MSZ##_t UMTYPE;
    ETYPE *pe3 = (ETYPE *)vs3 + H(idx);
    MTYPE a = *pe3, b = cpu_ld##SUF##_data(env, addr);
    a = DO_OP(a, b);
    cpu_st##SUF##_data(env, addr, a);
    if (wd) {
        *pe3 = a;
    }
}

/* Signed min/max */
#define DO_MAX(N, M)  ((N) >= (M) ? (N) : (M))
#define DO_MIN(N, M)  ((N) >= (M) ? (M) : (N))

/* Unsigned min/max */
#define DO_MAXU(N, M) DO_MAX((UMTYPE)N, (UMTYPE)M)
#define DO_MINU(N, M) DO_MIN((UMTYPE)N, (UMTYPE)M)

GEN_VEXT_AMO_NOATOMIC_OP(vamomaxuw_v_d, 64, 32, H8, DO_MAXU, l)
GEN_VEXT_AMO_NOATOMIC_OP(vamomaxud_v_d, 64, 64, H8, DO_MAXU, q)


>> The missing aligned address check is the only remaining exception that the
>> helper_atomic_* functions would raise, since you have properly checked for
>> read+write.  So it might be possible to get away with using the helpers, but I
>> don't like it.
> Do you mean I should write my own helpers to implement the atomic operations?
>
> What do you mean by "but I don't like it"?

I don't like re-using helpers in an incorrect way.

>> But I do think it would be better to write your own helpers for the atomic
>> paths.  They need not check quite so much, since we have already done the
>> validation above.  You pretty much only need to use tlb_vaddr_to_host.
>>
>> If that gets too ugly, we can talk about rearranging
>> accel/tcg/atomic_template.h so that it could be reused.
> Good idea.  Perhaps use tlb_vaddr_to_host instead of atomic_mmu_lookup
> to define another macro like GEN_ATOMIC_HELPER?
>> Alternately, we could simply *always* use the non-atomic helpers, and raise
>> exit_atomic if PARALLEL.
> Yes, that's the simplest way. However, I'd prefer to try to define
> something like GEN_ATOMIC_HELPER in vector_helper.c.

I'll think about this some more.
In the short-term, I think non-atomic is the best we can do.


r~
LIU Zhiwei Feb. 29, 2020, 1:16 p.m. UTC | #4
On 2020/2/29 2:46, Richard Henderson wrote:
> On 2/28/20 1:19 AM, LIU Zhiwei wrote:
>>>> +#define GEN_VEXT_AMO_NOATOMIC_OP(NAME, ETYPE, MTYPE, H, DO_OP, SUF)      \
>>>> +static void vext_##NAME##_noatomic_op(void *vs3, target_ulong addr,      \
>>>> +        uint32_t wd, uint32_t idx, CPURISCVState *env, uintptr_t retaddr)\
>>>> +{                                                                        \
>>>> +    ETYPE ret;                                                           \
>>>> +    target_ulong tmp;                                                    \
>>>> +    int mmu_idx = cpu_mmu_index(env, false);                             \
>>>> +    tmp = cpu_ld##SUF##_mmuidx_ra(env, addr, mmu_idx, retaddr);          \
>>>> +    ret = DO_OP((ETYPE)(MTYPE)tmp, *((ETYPE *)vs3 + H(idx)));            \
>>>> +    cpu_st##SUF##_mmuidx_ra(env, addr, ret, mmu_idx, retaddr);           \
>>>> +    if (wd) {                                                            \
>>>> +        *((ETYPE *)vs3 + H(idx)) = (target_long)(MTYPE)tmp;              \
>>> The target_long cast is wrong; should be ETYPE.
>> "If the AMO memory element width is less than SEW, the value returned from memory
>>   is sign-extended to fill SEW"
>>
>> So just use (target_long) to sign-extended. As you see, instructions like
>>
>> vamominud
>>
>> have the uint64_t as ETYPE.  And it can't sign-extend the value from memory by
>> (ETYPE)(MTYPE)tmp.
> Casting to target_long doesn't help -- it becomes signed at a variable size,
> possibly larger than MTYPE.
>
> In addition, I think you're performing the operation at the wrong length.  The
> text of the ISA document could be clearer, but
>
>    # If SEW > 32 bits, the value returned from memory
>    # is sign-extended to fill SEW.
>
> You are performing the operation in ETYPE, but it should be done in MTYPE and
> only afterward extended to ETYPE.
Yes, I made a mistake. It should be MTYPE.
> For minu/maxu, you're right that you need an unsigned for the operation.  But
> then you need a signed type of the same width for the extension.
>
> One possibility is to *always* make MTYPE a signed type, but for the two cases
> that require an unsigned type, provide it.  E.g.
>
> #define GEN_VEXT_AMO_NOATOMIC_OP(NAME, ESZ, MSZ, H, DO_OP, SUF)
> static void vext_##NAME##_noatomic_op(void *vs3,
>      target_ulong addr, uint32_t wd, uint32_t idx,
>      CPURISCVState *env, uintptr_t retaddr)
> {
>      typedef int##ESZ##_t ETYPE;
>      typedef int##MSZ##_t MTYPE;
>      typedef uint##MSZ##_t UMTYPE;
>      ETYPE *pe3 = (ETYPE *)vs3 + H(idx);
>      MTYPE a = *pe3, b = cpu_ld##SUF##_data(env, addr);
>      a = DO_OP(a, b);
>      cpu_st##SUF##_data(env, addr, a);
>      if (wd) {
>          *pe3 = a;
>      }
> }
>
> /* Signed min/max */
> #define DO_MAX(N, M)  ((N) >= (M) ? (N) : (M))
> #define DO_MIN(N, M)  ((N) >= (M) ? (M) : (N))
>
> /* Unsigned min/max */
> #define DO_MAXU(N, M) DO_MAX((UMTYPE)N, (UMTYPE)M)
> #define DO_MINU(N, M) DO_MIN((UMTYPE)N, (UMTYPE)M)
>
> GEN_VEXT_AMO_NOATOMIC_OP(vamomaxuw_v_d, 64, 32, H8, DO_MAXU, l)
> GEN_VEXT_AMO_NOATOMIC_OP(vamomaxud_v_d, 64, 64, H8, DO_MAXU, q)
Perfect. I will try it.

>
>>> The missing aligned address check is the only remaining exception that the
>>> helper_atomic_* functions would raise, since you have properly checked for
>>> read+write.  So it might be possible to get away with using the helpers, but I
>>> don't like it.
>> Do you mean I should write my own helpers to implement the atomic operations?
>>
>> What do you mean by "but I don't like it"?
> I don't like re-using helpers in an incorrect way.
>
>>> But I do think it would be better to write your own helpers for the atomic
>>> paths.  They need not check quite so much, since we have already done the
>>> validation above.  You pretty much only need to use tlb_vaddr_to_host.
>>>
>>> If that gets too ugly, we can talk about rearranging
>>> accel/tcg/atomic_template.h so that it could be reused.
>> Good idea.  Perhaps use tlb_vaddr_to_host instead of atomic_mmu_lookup
>> to define another macro like GEN_ATOMIC_HELPER?
>>> Alternately, we could simply *always* use the non-atomic helpers, and raise
>>> exit_atomic if PARALLEL.
>> Yes, that's the simplest way. However, I'd prefer to try to define
>> something like GEN_ATOMIC_HELPER in vector_helper.c.
> I'll think about this some more.
> In the short-term, I think non-atomic is the best we can do.
I will accept your advice. Thanks.

Best Regards,
Zhiwei
>
> r~

Patch

diff --git a/target/riscv/helper.h b/target/riscv/helper.h
index 72ba4d9bdb..cbe0d107c0 100644
--- a/target/riscv/helper.h
+++ b/target/riscv/helper.h
@@ -240,3 +240,59 @@  DEF_HELPER_5(vlhuff_v_w, void, ptr, ptr, tl, env, i32)
 DEF_HELPER_5(vlhuff_v_d, void, ptr, ptr, tl, env, i32)
 DEF_HELPER_5(vlwuff_v_w, void, ptr, ptr, tl, env, i32)
 DEF_HELPER_5(vlwuff_v_d, void, ptr, ptr, tl, env, i32)
+#ifdef TARGET_RISCV64
+DEF_HELPER_6(vamoswapw_v_d_a, void, ptr, ptr, tl, ptr, env, i32)
+DEF_HELPER_6(vamoswapd_v_d_a, void, ptr, ptr, tl, ptr, env, i32)
+DEF_HELPER_6(vamoaddw_v_d_a,  void, ptr, ptr, tl, ptr, env, i32)
+DEF_HELPER_6(vamoaddd_v_d_a,  void, ptr, ptr, tl, ptr, env, i32)
+DEF_HELPER_6(vamoxorw_v_d_a,  void, ptr, ptr, tl, ptr, env, i32)
+DEF_HELPER_6(vamoxord_v_d_a,  void, ptr, ptr, tl, ptr, env, i32)
+DEF_HELPER_6(vamoandw_v_d_a,  void, ptr, ptr, tl, ptr, env, i32)
+DEF_HELPER_6(vamoandd_v_d_a,  void, ptr, ptr, tl, ptr, env, i32)
+DEF_HELPER_6(vamoorw_v_d_a,   void, ptr, ptr, tl, ptr, env, i32)
+DEF_HELPER_6(vamoord_v_d_a,   void, ptr, ptr, tl, ptr, env, i32)
+DEF_HELPER_6(vamominw_v_d_a,  void, ptr, ptr, tl, ptr, env, i32)
+DEF_HELPER_6(vamomind_v_d_a,  void, ptr, ptr, tl, ptr, env, i32)
+DEF_HELPER_6(vamomaxw_v_d_a,  void, ptr, ptr, tl, ptr, env, i32)
+DEF_HELPER_6(vamomaxd_v_d_a,  void, ptr, ptr, tl, ptr, env, i32)
+DEF_HELPER_6(vamominuw_v_d_a, void, ptr, ptr, tl, ptr, env, i32)
+DEF_HELPER_6(vamominud_v_d_a, void, ptr, ptr, tl, ptr, env, i32)
+DEF_HELPER_6(vamomaxuw_v_d_a, void, ptr, ptr, tl, ptr, env, i32)
+DEF_HELPER_6(vamomaxud_v_d_a, void, ptr, ptr, tl, ptr, env, i32)
+DEF_HELPER_6(vamoswapw_v_d, void, ptr, ptr, tl, ptr, env, i32)
+DEF_HELPER_6(vamoswapd_v_d, void, ptr, ptr, tl, ptr, env, i32)
+DEF_HELPER_6(vamoaddw_v_d,  void, ptr, ptr, tl, ptr, env, i32)
+DEF_HELPER_6(vamoaddd_v_d,  void, ptr, ptr, tl, ptr, env, i32)
+DEF_HELPER_6(vamoxorw_v_d,  void, ptr, ptr, tl, ptr, env, i32)
+DEF_HELPER_6(vamoxord_v_d,  void, ptr, ptr, tl, ptr, env, i32)
+DEF_HELPER_6(vamoandw_v_d,  void, ptr, ptr, tl, ptr, env, i32)
+DEF_HELPER_6(vamoandd_v_d,  void, ptr, ptr, tl, ptr, env, i32)
+DEF_HELPER_6(vamoorw_v_d,   void, ptr, ptr, tl, ptr, env, i32)
+DEF_HELPER_6(vamoord_v_d,   void, ptr, ptr, tl, ptr, env, i32)
+DEF_HELPER_6(vamominw_v_d,  void, ptr, ptr, tl, ptr, env, i32)
+DEF_HELPER_6(vamomind_v_d,  void, ptr, ptr, tl, ptr, env, i32)
+DEF_HELPER_6(vamomaxw_v_d,  void, ptr, ptr, tl, ptr, env, i32)
+DEF_HELPER_6(vamomaxd_v_d,  void, ptr, ptr, tl, ptr, env, i32)
+DEF_HELPER_6(vamominuw_v_d, void, ptr, ptr, tl, ptr, env, i32)
+DEF_HELPER_6(vamominud_v_d, void, ptr, ptr, tl, ptr, env, i32)
+DEF_HELPER_6(vamomaxuw_v_d, void, ptr, ptr, tl, ptr, env, i32)
+DEF_HELPER_6(vamomaxud_v_d, void, ptr, ptr, tl, ptr, env, i32)
+#endif
+DEF_HELPER_6(vamoswapw_v_w_a, void, ptr, ptr, tl, ptr, env, i32)
+DEF_HELPER_6(vamoaddw_v_w_a,  void, ptr, ptr, tl, ptr, env, i32)
+DEF_HELPER_6(vamoxorw_v_w_a,  void, ptr, ptr, tl, ptr, env, i32)
+DEF_HELPER_6(vamoandw_v_w_a,  void, ptr, ptr, tl, ptr, env, i32)
+DEF_HELPER_6(vamoorw_v_w_a,   void, ptr, ptr, tl, ptr, env, i32)
+DEF_HELPER_6(vamominw_v_w_a,  void, ptr, ptr, tl, ptr, env, i32)
+DEF_HELPER_6(vamomaxw_v_w_a,  void, ptr, ptr, tl, ptr, env, i32)
+DEF_HELPER_6(vamominuw_v_w_a, void, ptr, ptr, tl, ptr, env, i32)
+DEF_HELPER_6(vamomaxuw_v_w_a, void, ptr, ptr, tl, ptr, env, i32)
+DEF_HELPER_6(vamoswapw_v_w, void, ptr, ptr, tl, ptr, env, i32)
+DEF_HELPER_6(vamoaddw_v_w,  void, ptr, ptr, tl, ptr, env, i32)
+DEF_HELPER_6(vamoxorw_v_w,  void, ptr, ptr, tl, ptr, env, i32)
+DEF_HELPER_6(vamoandw_v_w,  void, ptr, ptr, tl, ptr, env, i32)
+DEF_HELPER_6(vamoorw_v_w,   void, ptr, ptr, tl, ptr, env, i32)
+DEF_HELPER_6(vamominw_v_w,  void, ptr, ptr, tl, ptr, env, i32)
+DEF_HELPER_6(vamomaxw_v_w,  void, ptr, ptr, tl, ptr, env, i32)
+DEF_HELPER_6(vamominuw_v_w, void, ptr, ptr, tl, ptr, env, i32)
+DEF_HELPER_6(vamomaxuw_v_w, void, ptr, ptr, tl, ptr, env, i32)
diff --git a/target/riscv/insn32-64.decode b/target/riscv/insn32-64.decode
index 380bf791bc..86153d93fa 100644
--- a/target/riscv/insn32-64.decode
+++ b/target/riscv/insn32-64.decode
@@ -57,6 +57,17 @@  amomax_d   10100 . . ..... ..... 011 ..... 0101111 @atom_st
 amominu_d  11000 . . ..... ..... 011 ..... 0101111 @atom_st
 amomaxu_d  11100 . . ..... ..... 011 ..... 0101111 @atom_st
 
+#*** Vector AMO operations (in addition to Zvamo) ***
+vamoswapd_v     00001 . . ..... ..... 111 ..... 0101111 @r_wdvm
+vamoaddd_v      00000 . . ..... ..... 111 ..... 0101111 @r_wdvm
+vamoxord_v      00100 . . ..... ..... 111 ..... 0101111 @r_wdvm
+vamoandd_v      01100 . . ..... ..... 111 ..... 0101111 @r_wdvm
+vamoord_v       01000 . . ..... ..... 111 ..... 0101111 @r_wdvm
+vamomind_v      10000 . . ..... ..... 111 ..... 0101111 @r_wdvm
+vamomaxd_v      10100 . . ..... ..... 111 ..... 0101111 @r_wdvm
+vamominud_v     11000 . . ..... ..... 111 ..... 0101111 @r_wdvm
+vamomaxud_v     11100 . . ..... ..... 111 ..... 0101111 @r_wdvm
+
 # *** RV64F Standard Extension (in addition to RV32F) ***
 fcvt_l_s   1100000  00010 ..... ... ..... 1010011 @r2_rm
 fcvt_lu_s  1100000  00011 ..... ... ..... 1010011 @r2_rm
diff --git a/target/riscv/insn32.decode b/target/riscv/insn32.decode
index 973ac63fda..077551dd13 100644
--- a/target/riscv/insn32.decode
+++ b/target/riscv/insn32.decode
@@ -43,6 +43,7 @@ 
 &u    imm rd
 &shift     shamt rs1 rd
 &atomic    aq rl rs2 rs1 rd
+&rwdvm     vm wd rd rs1 rs2
 &r2nfvm    vm rd rs1 nf
 &rnfvm     vm rd rs1 rs2 nf
 
@@ -64,6 +65,7 @@ 
 @r_rm    .......   ..... ..... ... ..... ....... %rs2 %rs1 %rm %rd
 @r2_rm   .......   ..... ..... ... ..... ....... %rs1 %rm %rd
 @r2      .......   ..... ..... ... ..... ....... %rs1 %rd
+@r_wdvm  ..... wd:1 vm:1 ..... ..... ... ..... ....... &rwdvm %rs2 %rs1 %rd
 @r2_nfvm nf:3 ... vm:1 ..... ..... ... ..... ....... &r2nfvm %rs1 %rd
 @r_nfvm  nf:3 ... vm:1 ..... ..... ... ..... ....... &rnfvm %rs2 %rs1 %rd
 @r2_zimm . zimm:11  ..... ... ..... ....... %rs1 %rd
@@ -259,6 +261,17 @@  vsuxh_v    ... 111 . ..... ..... 101 ..... 0100111 @r_nfvm
 vsuxw_v    ... 111 . ..... ..... 110 ..... 0100111 @r_nfvm
 vsuxe_v    ... 111 . ..... ..... 111 ..... 0100111 @r_nfvm
 
+#*** Vector AMO operations are encoded under the standard AMO major opcode ***
+vamoswapw_v     00001 . . ..... ..... 110 ..... 0101111 @r_wdvm
+vamoaddw_v      00000 . . ..... ..... 110 ..... 0101111 @r_wdvm
+vamoxorw_v      00100 . . ..... ..... 110 ..... 0101111 @r_wdvm
+vamoandw_v      01100 . . ..... ..... 110 ..... 0101111 @r_wdvm
+vamoorw_v       01000 . . ..... ..... 110 ..... 0101111 @r_wdvm
+vamominw_v      10000 . . ..... ..... 110 ..... 0101111 @r_wdvm
+vamomaxw_v      10100 . . ..... ..... 110 ..... 0101111 @r_wdvm
+vamominuw_v     11000 . . ..... ..... 110 ..... 0101111 @r_wdvm
+vamomaxuw_v     11100 . . ..... ..... 110 ..... 0101111 @r_wdvm
+
 # *** new major opcode OP-V ***
 vsetvli         0 ........... ..... 111 ..... 1010111  @r2_zimm
 vsetvl          1000000 ..... ..... 111 ..... 1010111  @r
diff --git a/target/riscv/insn_trans/trans_rvv.inc.c b/target/riscv/insn_trans/trans_rvv.inc.c
index dda3ba555c..a0e1e496f2 100644
--- a/target/riscv/insn_trans/trans_rvv.inc.c
+++ b/target/riscv/insn_trans/trans_rvv.inc.c
@@ -576,3 +576,157 @@  GEN_VEXT_TRANS(vleff_v, 3, r2nfvm, ldff_op, ld_us_check)
 GEN_VEXT_TRANS(vlbuff_v, 4, r2nfvm, ldff_op, ld_us_check)
 GEN_VEXT_TRANS(vlhuff_v, 5, r2nfvm, ldff_op, ld_us_check)
 GEN_VEXT_TRANS(vlwuff_v, 6, r2nfvm, ldff_op, ld_us_check)
+
+/*
+ *** vector atomic operation
+ */
+typedef void gen_helper_amo(TCGv_ptr, TCGv_ptr, TCGv, TCGv_ptr,
+        TCGv_env, TCGv_i32);
+
+static bool amo_trans(uint32_t vd, uint32_t rs1, uint32_t vs2,
+        uint32_t data, gen_helper_amo *fn, DisasContext *s)
+{
+    TCGv_ptr dest, mask, index;
+    TCGv base;
+    TCGv_i32 desc;
+
+    dest = tcg_temp_new_ptr();
+    mask = tcg_temp_new_ptr();
+    index = tcg_temp_new_ptr();
+    base = tcg_temp_new();
+    desc = tcg_const_i32(simd_desc(0, s->vlen / 8, data));
+
+    gen_get_gpr(base, rs1);
+    tcg_gen_addi_ptr(dest, cpu_env, vreg_ofs(s, vd));
+    tcg_gen_addi_ptr(index, cpu_env, vreg_ofs(s, vs2));
+    tcg_gen_addi_ptr(mask, cpu_env, vreg_ofs(s, 0));
+
+    fn(dest, mask, base, index, cpu_env, desc);
+
+    tcg_temp_free_ptr(dest);
+    tcg_temp_free_ptr(mask);
+    tcg_temp_free_ptr(index);
+    tcg_temp_free(base);
+    tcg_temp_free_i32(desc);
+    return true;
+}
+
+static bool amo_op(DisasContext *s, arg_rwdvm *a, uint8_t seq)
+{
+    uint32_t data = s->mlen | (a->vm << 8) | (s->lmul << 9) | (a->wd << 11);
+    gen_helper_amo *fn;
+#ifdef TARGET_RISCV64
+    static gen_helper_amo *const fns[2][18][2] = {
+        /* atomic operation */
+        { { gen_helper_vamoswapw_v_w_a, gen_helper_vamoswapw_v_d_a },
+          { gen_helper_vamoaddw_v_w_a,  gen_helper_vamoaddw_v_d_a },
+          { gen_helper_vamoxorw_v_w_a,  gen_helper_vamoxorw_v_d_a },
+          { gen_helper_vamoandw_v_w_a,  gen_helper_vamoandw_v_d_a },
+          { gen_helper_vamoorw_v_w_a,   gen_helper_vamoorw_v_d_a },
+          { gen_helper_vamominw_v_w_a,  gen_helper_vamominw_v_d_a },
+          { gen_helper_vamomaxw_v_w_a,  gen_helper_vamomaxw_v_d_a },
+          { gen_helper_vamominuw_v_w_a, gen_helper_vamominuw_v_d_a },
+          { gen_helper_vamomaxuw_v_w_a, gen_helper_vamomaxuw_v_d_a },
+          { NULL,                       gen_helper_vamoswapd_v_d_a },
+          { NULL,                       gen_helper_vamoaddd_v_d_a },
+          { NULL,                       gen_helper_vamoxord_v_d_a },
+          { NULL,                       gen_helper_vamoandd_v_d_a },
+          { NULL,                       gen_helper_vamoord_v_d_a },
+          { NULL,                       gen_helper_vamomind_v_d_a },
+          { NULL,                       gen_helper_vamomaxd_v_d_a },
+          { NULL,                       gen_helper_vamominud_v_d_a },
+          { NULL,                       gen_helper_vamomaxud_v_d_a } },
+        /* no atomic operation */
+        { { gen_helper_vamoswapw_v_w, gen_helper_vamoswapw_v_d },
+          { gen_helper_vamoaddw_v_w,  gen_helper_vamoaddw_v_d },
+          { gen_helper_vamoxorw_v_w,  gen_helper_vamoxorw_v_d },
+          { gen_helper_vamoandw_v_w,  gen_helper_vamoandw_v_d },
+          { gen_helper_vamoorw_v_w,   gen_helper_vamoorw_v_d },
+          { gen_helper_vamominw_v_w,  gen_helper_vamominw_v_d },
+          { gen_helper_vamomaxw_v_w,  gen_helper_vamomaxw_v_d },
+          { gen_helper_vamominuw_v_w, gen_helper_vamominuw_v_d },
+          { gen_helper_vamomaxuw_v_w, gen_helper_vamomaxuw_v_d },
+          { NULL,                     gen_helper_vamoswapd_v_d },
+          { NULL,                     gen_helper_vamoaddd_v_d },
+          { NULL,                     gen_helper_vamoxord_v_d },
+          { NULL,                     gen_helper_vamoandd_v_d },
+          { NULL,                     gen_helper_vamoord_v_d },
+          { NULL,                     gen_helper_vamomind_v_d },
+          { NULL,                     gen_helper_vamomaxd_v_d },
+          { NULL,                     gen_helper_vamominud_v_d },
+          { NULL,                     gen_helper_vamomaxud_v_d } }
+    };
+#else
+    static gen_helper_amo *const fns[2][9][2] = {
+        /* atomic operation */
+        { { gen_helper_vamoswapw_v_w_a, NULL },
+          { gen_helper_vamoaddw_v_w_a,  NULL },
+          { gen_helper_vamoxorw_v_w_a,  NULL },
+          { gen_helper_vamoandw_v_w_a,  NULL },
+          { gen_helper_vamoorw_v_w_a,   NULL },
+          { gen_helper_vamominw_v_w_a,  NULL },
+          { gen_helper_vamomaxw_v_w_a,  NULL },
+          { gen_helper_vamominuw_v_w_a, NULL },
+          { gen_helper_vamomaxuw_v_w_a, NULL } },
+        /* no atomic operation */
+        { { gen_helper_vamoswapw_v_w, NULL },
+          { gen_helper_vamoaddw_v_w,  NULL },
+          { gen_helper_vamoxorw_v_w,  NULL },
+          { gen_helper_vamoandw_v_w,  NULL },
+          { gen_helper_vamoorw_v_w,   NULL },
+          { gen_helper_vamominw_v_w,  NULL },
+          { gen_helper_vamomaxw_v_w,  NULL },
+          { gen_helper_vamominuw_v_w, NULL },
+          { gen_helper_vamomaxuw_v_w, NULL } }
+    };
+#endif
+    if (s->sew < 2) {
+        return false;
+    }
+
+    if (tb_cflags(s->base.tb) & CF_PARALLEL) {
+#ifdef CONFIG_ATOMIC64
+        fn = fns[0][seq][s->sew - 2];
+#else
+        gen_helper_exit_atomic(cpu_env);
+        s->base.is_jmp = DISAS_NORETURN;
+        return true;
+#endif
+    } else {
+        fn = fns[1][seq][s->sew - 2];
+    }
+    if (fn == NULL) {
+        return false;
+    }
+
+    return amo_trans(a->rd, a->rs1, a->rs2, data, fn, s);
+}
+
+static bool amo_check(DisasContext *s, arg_rwdvm* a)
+{
+    return (vext_check_isa_ill(s, RVV | RVA) &&
+            (a->wd ? vext_check_overlap_mask(s, a->rd, a->vm) : 1) &&
+            vext_check_reg(s, a->rd, false) &&
+            vext_check_reg(s, a->rs2, false));
+}
+
+GEN_VEXT_TRANS(vamoswapw_v, 0, rwdvm, amo_op, amo_check)
+GEN_VEXT_TRANS(vamoaddw_v, 1, rwdvm, amo_op, amo_check)
+GEN_VEXT_TRANS(vamoxorw_v, 2, rwdvm, amo_op, amo_check)
+GEN_VEXT_TRANS(vamoandw_v, 3, rwdvm, amo_op, amo_check)
+GEN_VEXT_TRANS(vamoorw_v, 4, rwdvm, amo_op, amo_check)
+GEN_VEXT_TRANS(vamominw_v, 5, rwdvm, amo_op, amo_check)
+GEN_VEXT_TRANS(vamomaxw_v, 6, rwdvm, amo_op, amo_check)
+GEN_VEXT_TRANS(vamominuw_v, 7, rwdvm, amo_op, amo_check)
+GEN_VEXT_TRANS(vamomaxuw_v, 8, rwdvm, amo_op, amo_check)
+#ifdef TARGET_RISCV64
+GEN_VEXT_TRANS(vamoswapd_v, 9, rwdvm, amo_op, amo_check)
+GEN_VEXT_TRANS(vamoaddd_v, 10, rwdvm, amo_op, amo_check)
+GEN_VEXT_TRANS(vamoxord_v, 11, rwdvm, amo_op, amo_check)
+GEN_VEXT_TRANS(vamoandd_v, 12, rwdvm, amo_op, amo_check)
+GEN_VEXT_TRANS(vamoord_v, 13, rwdvm, amo_op, amo_check)
+GEN_VEXT_TRANS(vamomind_v, 14, rwdvm, amo_op, amo_check)
+GEN_VEXT_TRANS(vamomaxd_v, 15, rwdvm, amo_op, amo_check)
+GEN_VEXT_TRANS(vamominud_v, 16, rwdvm, amo_op, amo_check)
+GEN_VEXT_TRANS(vamomaxud_v, 17, rwdvm, amo_op, amo_check)
+#endif
diff --git a/target/riscv/vector_helper.c b/target/riscv/vector_helper.c
index 6772e31ecb..fe07a29cd2 100644
--- a/target/riscv/vector_helper.c
+++ b/target/riscv/vector_helper.c
@@ -94,6 +94,11 @@  static inline uint32_t vext_lmul(uint32_t desc)
     return (simd_data(desc) >> 9) & 0x3;
 }
 
+static uint32_t vext_wd(uint32_t desc)
+{
+    return (simd_data(desc) >> 11) & 0x1;
+}
+
 /*
  * Get vector group length in bytes. Its range is [64, 2048].
  *
@@ -166,9 +171,22 @@  static void vext_clear(void *tail, uint32_t cnt, uint32_t tot)
 }
 #endif
 
-static inline int vext_elem_mask(void *v0, int mlen, int index)
+static void vext_clearl(void *vd, uint32_t idx, uint32_t cnt, uint32_t tot)
+{
+    int32_t *cur = ((int32_t *)vd + H4(idx));
+    vext_clear(cur, cnt, tot);
+}
+
+#ifdef TARGET_RISCV64
+static void vext_clearq(void *vd, uint32_t idx, uint32_t cnt, uint32_t tot)
 {
+    int64_t *cur = (int64_t *)vd + idx;
+    vext_clear(cur, cnt, tot);
+}
+#endif
 
+static inline int vext_elem_mask(void *v0, int mlen, int index)
+{
     int idx = (index * mlen) / 8;
     int pos = (index * mlen) % 8;
 
@@ -976,3 +994,263 @@  GEN_VEXT_LDFF(vlhuff_v_w, uint16_t, uint32_t, MO_LEUW)
 GEN_VEXT_LDFF(vlhuff_v_d, uint16_t, uint64_t, MO_LEUW)
 GEN_VEXT_LDFF(vlwuff_v_w, uint32_t, uint32_t, MO_LEUL)
 GEN_VEXT_LDFF(vlwuff_v_d, uint32_t, uint64_t, MO_LEUL)
+
+/*
+ *** Vector AMO Operations (Zvamo)
+ */
+typedef void (*vext_amo_noatomic_fn)(void *vs3, target_ulong addr,
+        uint32_t wd, uint32_t idx, CPURISCVState *env, uintptr_t retaddr);
+typedef void (*vext_amo_atomic_fn)(void *vs3, target_ulong addr,
+        uint32_t wd, uint32_t idx, CPURISCVState *env);
+
+#ifdef TARGET_RISCV64
+GEN_VEXT_GET_INDEX_ADDR(vamoswapw_v_d, int64_t, H8)
+GEN_VEXT_GET_INDEX_ADDR(vamoswapd_v_d, int64_t, H8)
+GEN_VEXT_GET_INDEX_ADDR(vamoaddw_v_d,  int64_t, H8)
+GEN_VEXT_GET_INDEX_ADDR(vamoaddd_v_d,  int64_t, H8)
+GEN_VEXT_GET_INDEX_ADDR(vamoxorw_v_d,  int64_t, H8)
+GEN_VEXT_GET_INDEX_ADDR(vamoxord_v_d,  int64_t, H8)
+GEN_VEXT_GET_INDEX_ADDR(vamoandw_v_d,  int64_t, H8)
+GEN_VEXT_GET_INDEX_ADDR(vamoandd_v_d,  int64_t, H8)
+GEN_VEXT_GET_INDEX_ADDR(vamoorw_v_d,   int64_t, H8)
+GEN_VEXT_GET_INDEX_ADDR(vamoord_v_d,   int64_t, H8)
+GEN_VEXT_GET_INDEX_ADDR(vamominw_v_d,  int64_t, H8)
+GEN_VEXT_GET_INDEX_ADDR(vamomind_v_d,  int64_t, H8)
+GEN_VEXT_GET_INDEX_ADDR(vamomaxw_v_d,  int64_t, H8)
+GEN_VEXT_GET_INDEX_ADDR(vamomaxd_v_d,  int64_t, H8)
+GEN_VEXT_GET_INDEX_ADDR(vamominuw_v_d, int64_t, H8)
+GEN_VEXT_GET_INDEX_ADDR(vamominud_v_d, int64_t, H8)
+GEN_VEXT_GET_INDEX_ADDR(vamomaxuw_v_d, int64_t, H8)
+GEN_VEXT_GET_INDEX_ADDR(vamomaxud_v_d, int64_t, H8)
+#endif
+GEN_VEXT_GET_INDEX_ADDR(vamoswapw_v_w, int32_t, H4)
+GEN_VEXT_GET_INDEX_ADDR(vamoaddw_v_w,  int32_t, H4)
+GEN_VEXT_GET_INDEX_ADDR(vamoxorw_v_w,  int32_t, H4)
+GEN_VEXT_GET_INDEX_ADDR(vamoandw_v_w,  int32_t, H4)
+GEN_VEXT_GET_INDEX_ADDR(vamoorw_v_w,   int32_t, H4)
+GEN_VEXT_GET_INDEX_ADDR(vamominw_v_w,  int32_t, H4)
+GEN_VEXT_GET_INDEX_ADDR(vamomaxw_v_w,  int32_t, H4)
+GEN_VEXT_GET_INDEX_ADDR(vamominuw_v_w, int32_t, H4)
+GEN_VEXT_GET_INDEX_ADDR(vamomaxuw_v_w, int32_t, H4)
+
+/* no atomic operation for vector atomic instructions */
+#define DO_SWAP(N, M) (M)
+#define DO_AND(N, M)  (N & M)
+#define DO_XOR(N, M)  (N ^ M)
+#define DO_OR(N, M)   (N | M)
+#define DO_ADD(N, M)  (N + M)
+#define DO_MAX(N, M)  ((N) >= (M) ? (N) : (M))
+#define DO_MIN(N, M)  ((N) >= (M) ? (M) : (N))
+
+#define GEN_VEXT_AMO_NOATOMIC_OP(NAME, ETYPE, MTYPE, H, DO_OP, SUF)      \
+static void vext_##NAME##_noatomic_op(void *vs3, target_ulong addr,      \
+        uint32_t wd, uint32_t idx, CPURISCVState *env, uintptr_t retaddr)\
+{                                                                        \
+    ETYPE ret;                                                           \
+    target_ulong tmp;                                                    \
+    int mmu_idx = cpu_mmu_index(env, false);                             \
+    tmp = cpu_ld##SUF##_mmuidx_ra(env, addr, mmu_idx, retaddr);          \
+    ret = DO_OP((ETYPE)(MTYPE)tmp, *((ETYPE *)vs3 + H(idx)));            \
+    cpu_st##SUF##_mmuidx_ra(env, addr, ret, mmu_idx, retaddr);           \
+    if (wd) {                                                            \
+        *((ETYPE *)vs3 + H(idx)) = (target_long)(MTYPE)tmp;              \
+    }                                                                    \
+}
+
+GEN_VEXT_AMO_NOATOMIC_OP(vamoswapw_v_w, int32_t,  int32_t, H4, DO_SWAP, l)
+GEN_VEXT_AMO_NOATOMIC_OP(vamoaddw_v_w,  int32_t,  int32_t, H4, DO_ADD,  l)
+GEN_VEXT_AMO_NOATOMIC_OP(vamoxorw_v_w,  int32_t,  int32_t, H4, DO_XOR,  l)
+GEN_VEXT_AMO_NOATOMIC_OP(vamoandw_v_w,  int32_t,  int32_t, H4, DO_AND,  l)
+GEN_VEXT_AMO_NOATOMIC_OP(vamoorw_v_w,   int32_t,  int32_t, H4, DO_OR,   l)
+GEN_VEXT_AMO_NOATOMIC_OP(vamominw_v_w,  int32_t,  int32_t, H4, DO_MIN,  l)
+GEN_VEXT_AMO_NOATOMIC_OP(vamomaxw_v_w,  int32_t,  int32_t, H4, DO_MAX,  l)
+GEN_VEXT_AMO_NOATOMIC_OP(vamominuw_v_w, uint32_t, int32_t, H4, DO_MIN,  l)
+GEN_VEXT_AMO_NOATOMIC_OP(vamomaxuw_v_w, uint32_t, int32_t, H4, DO_MAX,  l)
+#ifdef TARGET_RISCV64
+GEN_VEXT_AMO_NOATOMIC_OP(vamoswapw_v_d, int64_t,  int32_t, H8, DO_SWAP, l)
+GEN_VEXT_AMO_NOATOMIC_OP(vamoswapd_v_d, int64_t,  int64_t, H8, DO_SWAP, q)
+GEN_VEXT_AMO_NOATOMIC_OP(vamoaddw_v_d,  int64_t,  int32_t, H8, DO_ADD,  l)
+GEN_VEXT_AMO_NOATOMIC_OP(vamoaddd_v_d,  int64_t,  int64_t, H8, DO_ADD,  q)
+GEN_VEXT_AMO_NOATOMIC_OP(vamoxorw_v_d,  int64_t,  int32_t, H8, DO_XOR,  l)
+GEN_VEXT_AMO_NOATOMIC_OP(vamoxord_v_d,  int64_t,  int64_t, H8, DO_XOR,  q)
+GEN_VEXT_AMO_NOATOMIC_OP(vamoandw_v_d,  int64_t,  int32_t, H8, DO_AND,  l)
+GEN_VEXT_AMO_NOATOMIC_OP(vamoandd_v_d,  int64_t,  int64_t, H8, DO_AND,  q)
+GEN_VEXT_AMO_NOATOMIC_OP(vamoorw_v_d,   int64_t,  int32_t, H8, DO_OR,   l)
+GEN_VEXT_AMO_NOATOMIC_OP(vamoord_v_d,   int64_t,  int64_t, H8, DO_OR,   q)
+GEN_VEXT_AMO_NOATOMIC_OP(vamominw_v_d,  int64_t,  int32_t, H8, DO_MIN,  l)
+GEN_VEXT_AMO_NOATOMIC_OP(vamomind_v_d,  int64_t,  int64_t, H8, DO_MIN,  q)
+GEN_VEXT_AMO_NOATOMIC_OP(vamomaxw_v_d,  int64_t,  int32_t, H8, DO_MAX,  l)
+GEN_VEXT_AMO_NOATOMIC_OP(vamomaxd_v_d,  int64_t,  int64_t, H8, DO_MAX,  q)
+GEN_VEXT_AMO_NOATOMIC_OP(vamominuw_v_d, uint64_t, int32_t, H8, DO_MIN,  l)
+GEN_VEXT_AMO_NOATOMIC_OP(vamominud_v_d, uint64_t, int64_t, H8, DO_MIN,  q)
+GEN_VEXT_AMO_NOATOMIC_OP(vamomaxuw_v_d, uint64_t, int32_t, H8, DO_MAX,  l)
+GEN_VEXT_AMO_NOATOMIC_OP(vamomaxud_v_d, uint64_t, int64_t, H8, DO_MAX,  q)
+#endif
+
+/* atomic operation for vector atomic instructions */
+#ifndef CONFIG_USER_ONLY
+#define GEN_VEXT_ATOMIC_OP(NAME, ETYPE, MTYPE, MOFLAG, H, AMO)           \
+static void vext_##NAME##_atomic_op(void *vs3, target_ulong addr,        \
+        uint32_t wd, uint32_t idx, CPURISCVState *env)                   \
+{                                                                        \
+    target_ulong tmp;                                                    \
+    int mem_idx = cpu_mmu_index(env, false);                             \
+    tmp = helper_atomic_##AMO##_le(env, addr, *((ETYPE *)vs3 + H(idx)),  \
+            make_memop_idx(MO_ALIGN | MOFLAG, mem_idx));                 \
+    if (wd) {                                                            \
+        *((ETYPE *)vs3 + H(idx)) = (target_long)(MTYPE)tmp;              \
+    }                                                                    \
+}
+#else
+#define GEN_VEXT_ATOMIC_OP(NAME, ETYPE, MTYPE, MOFLAG, H, AMO)           \
+static void vext_##NAME##_atomic_op(void *vs3, target_ulong addr,        \
+        uint32_t wd, uint32_t idx, CPURISCVState *env)                   \
+{                                                                        \
+    target_ulong tmp;                                                    \
+    tmp = helper_atomic_##AMO##_le(env, addr, *((ETYPE *)vs3 + H(idx))); \
+    if (wd) {                                                            \
+        *((ETYPE *)vs3 + H(idx)) = (target_long)(MTYPE)tmp;              \
+    }                                                                    \
+}
+#endif
+
+GEN_VEXT_ATOMIC_OP(vamoswapw_v_w, int32_t,  int32_t,  MO_TESL, H4, xchgl)
+GEN_VEXT_ATOMIC_OP(vamoaddw_v_w,  int32_t,  int32_t,  MO_TESL, H4, fetch_addl)
+GEN_VEXT_ATOMIC_OP(vamoxorw_v_w,  int32_t,  int32_t,  MO_TESL, H4, fetch_xorl)
+GEN_VEXT_ATOMIC_OP(vamoandw_v_w,  int32_t,  int32_t,  MO_TESL, H4, fetch_andl)
+GEN_VEXT_ATOMIC_OP(vamoorw_v_w,   int32_t,  int32_t,  MO_TESL, H4, fetch_orl)
+GEN_VEXT_ATOMIC_OP(vamominw_v_w,  int32_t,  int32_t,  MO_TESL, H4, fetch_sminl)
+GEN_VEXT_ATOMIC_OP(vamomaxw_v_w,  int32_t,  int32_t,  MO_TESL, H4, fetch_smaxl)
+GEN_VEXT_ATOMIC_OP(vamominuw_v_w, uint32_t, int32_t,  MO_TEUL, H4, fetch_uminl)
+GEN_VEXT_ATOMIC_OP(vamomaxuw_v_w, uint32_t, int32_t,  MO_TEUL, H4, fetch_umaxl)
+#ifdef TARGET_RISCV64
+GEN_VEXT_ATOMIC_OP(vamoswapw_v_d, int64_t,  int32_t,  MO_TESL, H8, xchgl)
+GEN_VEXT_ATOMIC_OP(vamoswapd_v_d, int64_t,  int64_t,  MO_TEQ,  H8, xchgq)
+GEN_VEXT_ATOMIC_OP(vamoaddw_v_d,  int64_t,  int32_t,  MO_TESL, H8, fetch_addl)
+GEN_VEXT_ATOMIC_OP(vamoaddd_v_d,  int64_t,  int64_t,  MO_TEQ,  H8, fetch_addq)
+GEN_VEXT_ATOMIC_OP(vamoxorw_v_d,  int64_t,  int32_t,  MO_TESL, H8, fetch_xorl)
+GEN_VEXT_ATOMIC_OP(vamoxord_v_d,  int64_t,  int64_t,  MO_TEQ,  H8, fetch_xorq)
+GEN_VEXT_ATOMIC_OP(vamoandw_v_d,  int64_t,  int32_t,  MO_TESL, H8, fetch_andl)
+GEN_VEXT_ATOMIC_OP(vamoandd_v_d,  int64_t,  int64_t,  MO_TEQ,  H8, fetch_andq)
+GEN_VEXT_ATOMIC_OP(vamoorw_v_d,   int64_t,  int32_t,  MO_TESL, H8, fetch_orl)
+GEN_VEXT_ATOMIC_OP(vamoord_v_d,   int64_t,  int64_t,  MO_TEQ,  H8, fetch_orq)
+GEN_VEXT_ATOMIC_OP(vamominw_v_d,  int64_t,  int32_t,  MO_TESL, H8, fetch_sminl)
+GEN_VEXT_ATOMIC_OP(vamomind_v_d,  int64_t,  int64_t,  MO_TEQ,  H8, fetch_sminq)
+GEN_VEXT_ATOMIC_OP(vamomaxw_v_d,  int64_t,  int32_t,  MO_TESL, H8, fetch_smaxl)
+GEN_VEXT_ATOMIC_OP(vamomaxd_v_d,  int64_t,  int64_t,  MO_TEQ,  H8, fetch_smaxq)
+GEN_VEXT_ATOMIC_OP(vamominuw_v_d, uint64_t, int32_t,  MO_TEUL, H8, fetch_uminl)
+GEN_VEXT_ATOMIC_OP(vamominud_v_d, uint64_t, int64_t,  MO_TEQ,  H8, fetch_uminq)
+GEN_VEXT_ATOMIC_OP(vamomaxuw_v_d, uint64_t, int32_t,  MO_TEUL, H8, fetch_umaxl)
+GEN_VEXT_ATOMIC_OP(vamomaxud_v_d, uint64_t, int64_t,  MO_TEQ,  H8, fetch_umaxq)
+#endif
+
+static inline void vext_amo_atomic(void *vs3, void *v0, target_ulong base,
+        void *vs2, CPURISCVState *env, uint32_t desc,
+        vext_get_index_addr get_index_addr,
+        vext_amo_atomic_fn atomic_op,
+        vext_ld_clear_elem clear_elem,
+        uint32_t esz, uint32_t msz, uintptr_t ra)
+{
+    uint32_t i;
+    target_long addr;
+    uint32_t wd = vext_wd(desc);
+    uint32_t vm = vext_vm(desc);
+    uint32_t mlen = vext_mlen(desc);
+    uint32_t vlmax = vext_maxsz(desc) / esz;
+
+    for (i = 0; i < env->vl; i++) {
+        if (!vm && !vext_elem_mask(v0, mlen, i)) {
+            continue;
+        }
+        probe_read_access(env, get_index_addr(base, i, vs2), msz, ra);
+        probe_write_access(env, get_index_addr(base, i, vs2), msz, ra);
+    }
+    for (i = 0; i < env->vl; i++) {
+        if (!vm && !vext_elem_mask(v0, mlen, i)) {
+            continue;
+        }
+        addr = get_index_addr(base, i, vs2);
+        atomic_op(vs3, addr, wd, i, env);
+    }
+    clear_elem(vs3, env->vl, env->vl * esz, vlmax * esz);
+}
+
+static inline void vext_amo_noatomic(void *vs3, void *v0, target_ulong base,
+        void *vs2, CPURISCVState *env, uint32_t desc,
+        vext_get_index_addr get_index_addr,
+        vext_amo_noatomic_fn noatomic_op,
+        vext_ld_clear_elem clear_elem,
+        uint32_t esz, uint32_t msz, uintptr_t ra)
+{
+    uint32_t i;
+    target_long addr;
+    uint32_t wd = vext_wd(desc);
+    uint32_t vm = vext_vm(desc);
+    uint32_t mlen = vext_mlen(desc);
+    uint32_t vlmax = vext_maxsz(desc) / esz;
+
+    for (i = 0; i < env->vl; i++) {
+        if (!vm && !vext_elem_mask(v0, mlen, i)) {
+            continue;
+        }
+        probe_read_access(env, get_index_addr(base, i, vs2), msz, ra);
+        probe_write_access(env, get_index_addr(base, i, vs2), msz, ra);
+    }
+    for (i = 0; i < env->vl; i++) {
+        if (!vm && !vext_elem_mask(v0, mlen, i)) {
+            continue;
+        }
+        addr = get_index_addr(base, i, vs2);
+        noatomic_op(vs3, addr, wd, i, env, ra);
+    }
+    clear_elem(vs3, env->vl, env->vl * esz, vlmax * esz);
+}
+
+#define GEN_VEXT_AMO(NAME, MTYPE, ETYPE, CLEAR_FN)              \
+void HELPER(NAME##_a)(void *vs3, void *v0, target_ulong base,   \
+        void *vs2, CPURISCVState *env, uint32_t desc)           \
+{                                                               \
+    vext_amo_atomic(vs3, v0, base, vs2, env, desc,              \
+        vext_##NAME##_get_addr,                                 \
+        vext_##NAME##_atomic_op,                                \
+        CLEAR_FN, sizeof(ETYPE), sizeof(MTYPE), GETPC());       \
+}                                                               \
+                                                                \
+void HELPER(NAME)(void *vs3, void *v0, target_ulong base,       \
+        void *vs2, CPURISCVState *env, uint32_t desc)           \
+{                                                               \
+    vext_amo_noatomic(vs3, v0, base, vs2, env, desc,            \
+        vext_##NAME##_get_addr,                                 \
+        vext_##NAME##_noatomic_op,                              \
+        CLEAR_FN, sizeof(ETYPE), sizeof(MTYPE), GETPC());       \
+}
+
+#ifdef TARGET_RISCV64
+GEN_VEXT_AMO(vamoswapw_v_d, int32_t,  int64_t, vext_clearq)
+GEN_VEXT_AMO(vamoswapd_v_d, int64_t, int64_t, vext_clearq)
+GEN_VEXT_AMO(vamoaddw_v_d,  int32_t,  int64_t, vext_clearq)
+GEN_VEXT_AMO(vamoaddd_v_d,  int64_t, int64_t, vext_clearq)
+GEN_VEXT_AMO(vamoxorw_v_d,  int32_t,  int64_t, vext_clearq)
+GEN_VEXT_AMO(vamoxord_v_d,  int64_t, int64_t, vext_clearq)
+GEN_VEXT_AMO(vamoandw_v_d,  int32_t,  int64_t, vext_clearq)
+GEN_VEXT_AMO(vamoandd_v_d,  int64_t, int64_t, vext_clearq)
+GEN_VEXT_AMO(vamoorw_v_d,   int32_t,  int64_t, vext_clearq)
+GEN_VEXT_AMO(vamoord_v_d,   int64_t, int64_t, vext_clearq)
+GEN_VEXT_AMO(vamominw_v_d,  int32_t,  int64_t, vext_clearq)
+GEN_VEXT_AMO(vamomind_v_d,  int64_t, int64_t, vext_clearq)
+GEN_VEXT_AMO(vamomaxw_v_d,  int32_t,  int64_t, vext_clearq)
+GEN_VEXT_AMO(vamomaxd_v_d,  int64_t, int64_t, vext_clearq)
+GEN_VEXT_AMO(vamominuw_v_d, uint32_t,  uint64_t, vext_clearq)
+GEN_VEXT_AMO(vamominud_v_d, uint64_t, uint64_t, vext_clearq)
+GEN_VEXT_AMO(vamomaxuw_v_d, uint32_t, uint64_t, vext_clearq)
+GEN_VEXT_AMO(vamomaxud_v_d, uint64_t, uint64_t, vext_clearq)
+#endif
+GEN_VEXT_AMO(vamoswapw_v_w, int32_t, int32_t, vext_clearl)
+GEN_VEXT_AMO(vamoaddw_v_w,  int32_t, int32_t, vext_clearl)
+GEN_VEXT_AMO(vamoxorw_v_w,  int32_t, int32_t, vext_clearl)
+GEN_VEXT_AMO(vamoandw_v_w,  int32_t, int32_t, vext_clearl)
+GEN_VEXT_AMO(vamoorw_v_w,   int32_t, int32_t, vext_clearl)
+GEN_VEXT_AMO(vamominw_v_w,  int32_t, int32_t, vext_clearl)
+GEN_VEXT_AMO(vamomaxw_v_w,  int32_t, int32_t, vext_clearl)
+GEN_VEXT_AMO(vamominuw_v_w, uint32_t, uint32_t, vext_clearl)
+GEN_VEXT_AMO(vamomaxuw_v_w, uint32_t, uint32_t, vext_clearl)