Message ID | 164845204233.25323.14607469451359734000-5@git.sr.ht |
---|---|
State | New |
Headers | show |
Series | Add tail agnostic behavior for rvv instructions | expand |
在 2022/3/7 下午3:10, ~eopxd 写道: > From: eopXD <eop.chen@sifive.com> > > Signed-off-by: eop Chen <eop.chen@sifive.com> > Reviewed-by: Frank Chang <frank.chang@sifive.com> > --- > target/riscv/insn_trans/trans_rvv.c.inc | 9 +++++++ > target/riscv/vector_helper.c | 32 +++++++++++++++++++++++++ > 2 files changed, 41 insertions(+) > > diff --git a/target/riscv/insn_trans/trans_rvv.c.inc b/target/riscv/insn_trans/trans_rvv.c.inc > index cc80bf00ff..66cfc8c603 100644 > --- a/target/riscv/insn_trans/trans_rvv.c.inc > +++ b/target/riscv/insn_trans/trans_rvv.c.inc > @@ -711,6 +711,7 @@ static bool ld_us_op(DisasContext *s, arg_r2nfvm *a, uint8_t eew) > data = FIELD_DP32(data, VDATA, VM, a->vm); > data = FIELD_DP32(data, VDATA, LMUL, emul); > data = FIELD_DP32(data, VDATA, NF, a->nf); > + data = FIELD_DP32(data, VDATA, VTA, s->vta); > return ldst_us_trans(a->rd, a->rs1, data, fn, s, false); > } > > @@ -748,6 +749,7 @@ static bool st_us_op(DisasContext *s, arg_r2nfvm *a, uint8_t eew) > data = FIELD_DP32(data, VDATA, VM, a->vm); > data = FIELD_DP32(data, VDATA, LMUL, emul); > data = FIELD_DP32(data, VDATA, NF, a->nf); > + data = FIELD_DP32(data, VDATA, VTA, s->vta); > return ldst_us_trans(a->rd, a->rs1, data, fn, s, true); > } > > @@ -774,6 +776,7 @@ static bool ld_us_mask_op(DisasContext *s, arg_vlm_v *a, uint8_t eew) > /* EMUL = 1, NFIELDS = 1 */ > data = FIELD_DP32(data, VDATA, LMUL, 0); > data = FIELD_DP32(data, VDATA, NF, 1); > + data = FIELD_DP32(data, VDATA, VTA, s->vta); > return ldst_us_trans(a->rd, a->rs1, data, fn, s, false); > } > > @@ -791,6 +794,7 @@ static bool st_us_mask_op(DisasContext *s, arg_vsm_v *a, uint8_t eew) > /* EMUL = 1, NFIELDS = 1 */ > data = FIELD_DP32(data, VDATA, LMUL, 0); > data = FIELD_DP32(data, VDATA, NF, 1); > + data = FIELD_DP32(data, VDATA, VTA, s->vta); > return ldst_us_trans(a->rd, a->rs1, data, fn, s, true); > } > > @@ -862,6 +866,7 @@ static bool ld_stride_op(DisasContext *s, arg_rnfvm *a, uint8_t eew) > data = FIELD_DP32(data, VDATA, 
VM, a->vm); > data = FIELD_DP32(data, VDATA, LMUL, emul); > data = FIELD_DP32(data, VDATA, NF, a->nf); > + data = FIELD_DP32(data, VDATA, VTA, s->vta); > return ldst_stride_trans(a->rd, a->rs1, a->rs2, data, fn, s, false); > } > > @@ -891,6 +896,7 @@ static bool st_stride_op(DisasContext *s, arg_rnfvm *a, uint8_t eew) > data = FIELD_DP32(data, VDATA, VM, a->vm); > data = FIELD_DP32(data, VDATA, LMUL, emul); > data = FIELD_DP32(data, VDATA, NF, a->nf); > + data = FIELD_DP32(data, VDATA, VTA, s->vta); > fn = fns[eew]; > if (fn == NULL) { > return false; > @@ -991,6 +997,7 @@ static bool ld_index_op(DisasContext *s, arg_rnfvm *a, uint8_t eew) > data = FIELD_DP32(data, VDATA, VM, a->vm); > data = FIELD_DP32(data, VDATA, LMUL, emul); > data = FIELD_DP32(data, VDATA, NF, a->nf); > + data = FIELD_DP32(data, VDATA, VTA, s->vta); > return ldst_index_trans(a->rd, a->rs1, a->rs2, data, fn, s, false); > } > > @@ -1043,6 +1050,7 @@ static bool st_index_op(DisasContext *s, arg_rnfvm *a, uint8_t eew) > data = FIELD_DP32(data, VDATA, VM, a->vm); > data = FIELD_DP32(data, VDATA, LMUL, emul); > data = FIELD_DP32(data, VDATA, NF, a->nf); > + data = FIELD_DP32(data, VDATA, VTA, s->vta); > return ldst_index_trans(a->rd, a->rs1, a->rs2, data, fn, s, true); > } > > @@ -1108,6 +1116,7 @@ static bool ldff_op(DisasContext *s, arg_r2nfvm *a, uint8_t eew) > data = FIELD_DP32(data, VDATA, VM, a->vm); > data = FIELD_DP32(data, VDATA, LMUL, emul); > data = FIELD_DP32(data, VDATA, NF, a->nf); > + data = FIELD_DP32(data, VDATA, VTA, s->vta); > return ldff_trans(a->rd, a->rs1, data, fn, s); > } > > diff --git a/target/riscv/vector_helper.c b/target/riscv/vector_helper.c > index 39c79c59c2..1c7015e917 100644 > --- a/target/riscv/vector_helper.c > +++ b/target/riscv/vector_helper.c > @@ -289,6 +289,9 @@ vext_ldst_stride(void *vd, void *v0, target_ulong base, > uint32_t i, k; > uint32_t nf = vext_nf(desc); > uint32_t max_elems = vext_max_elems(desc, log2_esz); > + uint32_t esz = 1 << log2_esz; > + 
uint32_t total_elems = vext_get_total_elems(desc, esz); > + uint32_t vta = vext_vta(desc); > > for (i = env->vstart; i < env->vl; i++, env->vstart++) { > if (!vm && !vext_elem_mask(v0, i)) { > @@ -303,6 +306,11 @@ vext_ldst_stride(void *vd, void *v0, target_ulong base, > } > } > env->vstart = 0; > + /* set tail elements to 1s */ > + for (k = 0; k < nf; ++k) { > + vext_set_elems_1s_fns[log2_esz](vd, vta, env->vl + k * total_elems, > + env->vl * esz, total_elems * esz); > + } > } > > #define GEN_VEXT_LD_STRIDE(NAME, ETYPE, LOAD_FN) \ > @@ -348,6 +356,9 @@ vext_ldst_us(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc, > uint32_t i, k; > uint32_t nf = vext_nf(desc); > uint32_t max_elems = vext_max_elems(desc, log2_esz); > + uint32_t esz = 1 << log2_esz; > + uint32_t total_elems = vext_get_total_elems(desc, esz); > + uint32_t vta = vext_vta(desc); > > /* load bytes from guest memory */ > for (i = env->vstart; i < evl; i++, env->vstart++) { > @@ -359,6 +370,11 @@ vext_ldst_us(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc, > } > } > env->vstart = 0; > + /* set tail elements to 1s */ > + for (k = 0; k < nf; ++k) { > + vext_set_elems_1s_fns[log2_esz](vd, vta, env->vl + k * total_elems, > + env->vl * esz, total_elems * esz); > + } > } > It seems incorrect here. similar to following load/store helper. In above instructions, following elements are loaded: 0 * max_elems ... 0 *max_elems + vl - 1 1 * max_elems ... 1 *max_elems + vl - 1 ....... (nf-1)* max_elems ... (nf-1)*max_elems + vl - 1 So, the elements[vl .. max_elems - 1] are tail elements, however elements[vl ... 1* total_elems - 1] may not: elements from max_elems to total_elems - 1 are active elements, If total_elems > max_elems(LMUL< 1) Or LMUL should be equal or greater than 1 here? I didn't find any description about this from the spec. 
I also have another question about the tail elements for these load/store instructions: when nf = 3, LMUL = 1, and vl = vlmax, then reg, reg+1, and reg+2 will be loaded. In that case, are the elements in reg+3 (if they belong to the same register group) tail elements? Regards, Weiwei Li
> Weiwei Li <liweiwei@iscas.ac.cn> 於 2022年3月28日 下午7:56 寫道: > > > 在 2022/3/7 下午3:10, ~eopxd 写道: >> From: eopXD <eop.chen@sifive.com> >> >> Signed-off-by: eop Chen <eop.chen@sifive.com> >> Reviewed-by: Frank Chang <frank.chang@sifive.com> >> --- >> target/riscv/insn_trans/trans_rvv.c.inc | 9 +++++++ >> target/riscv/vector_helper.c | 32 +++++++++++++++++++++++++ >> 2 files changed, 41 insertions(+) >> >> diff --git a/target/riscv/vector_helper.c b/target/riscv/vector_helper.c >> index 39c79c59c2..1c7015e917 100644 >> --- a/target/riscv/vector_helper.c >> +++ b/target/riscv/vector_helper.c >> @@ -289,6 +289,9 @@ vext_ldst_stride(void *vd, void *v0, target_ulong base, >> uint32_t i, k; >> uint32_t nf = vext_nf(desc); >> uint32_t max_elems = vext_max_elems(desc, log2_esz); >> + uint32_t esz = 1 << log2_esz; >> + uint32_t total_elems = vext_get_total_elems(desc, esz); >> + uint32_t vta = vext_vta(desc); >> for (i = env->vstart; i < env->vl; i++, env->vstart++) { >> if (!vm && !vext_elem_mask(v0, i)) { >> @@ -303,6 +306,11 @@ vext_ldst_stride(void *vd, void *v0, target_ulong base, >> } >> } >> env->vstart = 0; >> + /* set tail elements to 1s */ >> + for (k = 0; k < nf; ++k) { >> + vext_set_elems_1s_fns[log2_esz](vd, vta, env->vl + k * total_elems, >> + env->vl * esz, total_elems * esz); >> + } >> } >> #define GEN_VEXT_LD_STRIDE(NAME, ETYPE, LOAD_FN) \ >> @@ -348,6 +356,9 @@ vext_ldst_us(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc, >> uint32_t i, k; >> uint32_t nf = vext_nf(desc); >> uint32_t max_elems = vext_max_elems(desc, log2_esz); >> + uint32_t esz = 1 << log2_esz; >> + uint32_t total_elems = vext_get_total_elems(desc, esz); >> + uint32_t vta = vext_vta(desc); >> /* load bytes from guest memory */ >> for (i = env->vstart; i < evl; i++, env->vstart++) { >> @@ -359,6 +370,11 @@ vext_ldst_us(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc, >> } >> } >> env->vstart = 0; >> + /* set tail elements to 1s */ >> + for (k = 0; k < nf; ++k) { >> 
+ vext_set_elems_1s_fns[log2_esz](vd, vta, env->vl + k * total_elems, >> + env->vl * esz, total_elems * esz); >> + } >> } >> > > It seems incorrect here. similar to following load/store helper. > > In above instructions, following elements are loaded: > > 0 * max_elems ... 0 *max_elems + vl - 1 > > 1 * max_elems ... 1 *max_elems + vl - 1 > > ....... > > (nf-1)* max_elems ... (nf-1)*max_elems + vl - 1 > > So, the elements[vl .. max_elems - 1] are tail elements, however elements[vl ... 1* total_elems - 1] may not: > > elements from max_elems to total_elems - 1 are active elements, If total_elems > max_elems(LMUL< 1) > > Or LMUL should be equal or greater than 1 here? I didn't find any description about this from the spec. > > I also have another question about the tail elements for these load/store instructions: > > when nf = 3, LMUL = 1, vl=vlmax, reg, reg+1, reg+2 will be loaded, then whether elements in reg+3 > > (if they belong to the same register group) are tail elements? > > Regards, > > Weiwei Li > The LMUL sent into vector helper function from `trans_rvv.c.inc` takes EMUL (effective LMUL) instead of LMUL. Take trans_rvv.c.inc::ld_us_op for example, ``` /* * Vector load/store instructions have the EEW encoded * directly in the instructions. The maximum vector size is * calculated with EMUL rather than LMUL. */ uint8_t emul = vext_get_emul(s, eew); data = FIELD_DP32(data, VDATA, VM, a->vm); data = FIELD_DP32(data, VDATA, LMUL, emul); data = FIELD_DP32(data, VDATA, NF, a->nf); return ldst_us_trans(a->rd, a->rs1, data, fn, s, false); ``` And vext_get_emul always return something at least the length of a vector register: ``` static uint8_t vext_get_emul(DisasContext *s, uint8_t eew) { int8_t emul = eew - s->sew + s->lmul; return emul < 0 ? 
0 : emul; } ``` In this case I guess the naming is a little bit misleading: `vext_max_elems` is equivalent to `vext_get_total_elems` for all load/store instructions, since for them the LMUL passed to the helper is guaranteed to be greater than or equal to 1. In conclusion, the behavior is correct here. I don’t understand your second question though. If nf = 3, there will be 3 registers involved with the instruction (namely reg, reg+1, reg+2). Why do we care about (reg+3)? Thanks for raising this question and for all your effort in reviewing. I really appreciate it. Regards, eop Chen
在 2022/3/30 下午3:42, 陳約廷 写道: > >> Weiwei Li <liweiwei@iscas.ac.cn <mailto:liweiwei@iscas.ac.cn>> 於 >> 2022年3月28日 下午7:56 寫道: >> >> >> 在 2022/3/7 下午3:10, ~eopxd 写道: >>> From: eopXD <eop.chen@sifive.com <mailto:eop.chen@sifive.com>> >>> >>> Signed-off-by: eop Chen <eop.chen@sifive.com >>> <mailto:eop.chen@sifive.com>> >>> Reviewed-by: Frank Chang <frank.chang@sifive.com >>> <mailto:frank.chang@sifive.com>> >>> --- >>> target/riscv/insn_trans/trans_rvv.c.inc | 9 +++++++ >>> target/riscv/vector_helper.c | 32 +++++++++++++++++++++++++ >>> 2 files changed, 41 insertions(+) >>> >>> diff --git a/target/riscv/vector_helper.c >>> b/target/riscv/vector_helper.c >>> index 39c79c59c2..1c7015e917 100644 >>> --- a/target/riscv/vector_helper.c >>> +++ b/target/riscv/vector_helper.c >>> @@ -289,6 +289,9 @@ vext_ldst_stride(void *vd, void *v0, >>> target_ulong base, >>> uint32_t i, k; >>> uint32_t nf = vext_nf(desc); >>> uint32_t max_elems = vext_max_elems(desc, log2_esz); >>> + uint32_t esz = 1 << log2_esz; >>> + uint32_t total_elems = vext_get_total_elems(desc, esz); >>> + uint32_t vta = vext_vta(desc); >>> for (i = env->vstart; i < env->vl; i++, env->vstart++) { >>> if (!vm && !vext_elem_mask(v0, i)) { >>> @@ -303,6 +306,11 @@ vext_ldst_stride(void *vd, void *v0, >>> target_ulong base, >>> } >>> } >>> env->vstart = 0; >>> + /* set tail elements to 1s */ >>> + for (k = 0; k < nf; ++k) { >>> + vext_set_elems_1s_fns[log2_esz](vd, vta, env->vl + k * >>> total_elems, >>> + env->vl * esz, total_elems >>> * esz); >>> + } >>> } >>> #define GEN_VEXT_LD_STRIDE(NAME, ETYPE, LOAD_FN) >>> \ >>> @@ -348,6 +356,9 @@ vext_ldst_us(void *vd, target_ulong base, >>> CPURISCVState *env, uint32_t desc, >>> uint32_t i, k; >>> uint32_t nf = vext_nf(desc); >>> uint32_t max_elems = vext_max_elems(desc, log2_esz); >>> + uint32_t esz = 1 << log2_esz; >>> + uint32_t total_elems = vext_get_total_elems(desc, esz); >>> + uint32_t vta = vext_vta(desc); >>> /* load bytes from guest memory */ >>> for (i = 
env->vstart; i < evl; i++, env->vstart++) { >>> @@ -359,6 +370,11 @@ vext_ldst_us(void *vd, target_ulong base, >>> CPURISCVState *env, uint32_t desc, >>> } >>> } >>> env->vstart = 0; >>> + /* set tail elements to 1s */ >>> + for (k = 0; k < nf; ++k) { >>> + vext_set_elems_1s_fns[log2_esz](vd, vta, env->vl + k * >>> total_elems, >>> + env->vl * esz, total_elems >>> * esz); >>> + } >>> } >>> >> >> It seems incorrect here. similar to following load/store helper. >> >> In above instructions, following elements are loaded: >> >> 0 * max_elems ... 0 *max_elems + vl - 1 >> >> 1 * max_elems ... 1 *max_elems + vl - 1 >> >> ....... >> >> (nf-1)* max_elems ... (nf-1)*max_elems + vl - 1 >> >> So, the elements[vl .. max_elems - 1] are tail elements, however >> elements[vl ... 1* total_elems - 1] may not: >> >> elements from max_elems to total_elems - 1 are active elements, If >> total_elems > max_elems(LMUL< 1) >> >> Or LMUL should be equal or greater than 1 here? I didn't find any >> description about this from the spec. >> >> I also have another question about the tail elements for these >> load/store instructions: >> >> when nf = 3, LMUL = 1, vl=vlmax, reg, reg+1, reg+2 will be loaded, >> then whether elements in reg+3 >> >> (if they belong to the same register group) are tail elements? >> >> Regards, >> >> Weiwei Li >> > > The LMUL sent into vector helper function from `trans_rvv.c.inc` takes > EMUL > (effective LMUL) instead of LMUL. Take trans_rvv.c.inc::ld_us_op for > example, > > ``` > /* > * Vector load/store instructions have the EEW encoded > * directly in the instructions. The maximum vector size is > * calculated with EMUL rather than LMUL. 
> */ > uint8_t emul = vext_get_emul(s, eew); > data = FIELD_DP32(data, VDATA, VM, a->vm); > data = FIELD_DP32(data, VDATA, LMUL, emul); > data = FIELD_DP32(data, VDATA, NF, a->nf); > return ldst_us_trans(a->rd, a->rs1, data, fn, s, false); > ``` > > And vext_get_emul always return something at least the length of a > vector register: > > ``` > static uint8_t vext_get_emul(DisasContext *s, uint8_t eew) > { > int8_t emul = eew - s->sew + s->lmul; > return emul < 0 ? 0 : emul; > } > ``` > > In this case I guess the naming is a little bit misleading, > `vext_max_elems` would be > equivalent to `vext_get_total_elems` for all load / store > instructions, which guarantees > That LMUL is always equal or greater to 1. In conclusion, the behavior > is correct here. OK. Thanks for your patient explaining. Another question: max_elems is equal to total_elems when lmul >= 0. So max_elems can be reused here instead of caculating total_elems again. > > I don’t understand your second question though. If nf = 3, there will > be 3 registers > involved with the instruction (namely reg, reg+1, reg+2). Why do we > care about > (reg+3)? > I just consider register group here. Reg, reg+1, reg+2 and reg+3 may belong to the same register group. Regards, Weiwei Li > Thanks for pointing out this question and all your efforts for > reviewing. I really > appreciate it. > > Regards, > > eop Chen
> Weiwei Li <liweiwei@iscas.ac.cn> 於 2022年3月30日 下午4:27 寫道: > 在 2022/3/30 下午3:42, 陳約廷 写道: >> >>> Weiwei Li <liweiwei@iscas.ac.cn <mailto:liweiwei@iscas.ac.cn>> 於 2022年3月28日 下午7:56 寫道: >>> >>> >>> 在 2022/3/7 下午3:10, ~eopxd 写道: >>>> From: eopXD <eop.chen@sifive.com <mailto:eop.chen@sifive.com>> > Another question: max_elems is equal to total_elems when lmul >= 0. > > So max_elems can be reused here instead of caculating total_elems again. > >> >> I don’t understand your second question though. If nf = 3, there will be 3 registers >> involved with the instruction (namely reg, reg+1, reg+2). Why do we care about >> (reg+3)? >> > I just consider register group here. Reg, reg+1, reg+2 and reg+3 may belong to the same register group. > > Regards, > > Weiwei Li > According to v-spec (under section 7.8): Each field will be held in successively numbered vector register groups. When EMUL>1 each field will occupy a vector register group held in multiple successively numbered vector registers, and the vector register group for each field must follow the usual vector register alignment constraints (e.g., when EMUL=2 and NFIELDS=4, each field’s vector register group must start at an even vector register, but does not have to start at a multiple of 8 vector register number). I think the spec has explained itself that NFIELDS represents the number of register groups involved in this instruction. Therefore in a register group of 4 (LMUL = m2), NFIELD should be no more than 2. The `vlmax` here would be (VLEN * 4 / EEW). In this sense, if the `vl` provided for the vector instruction is within the range 2 * vlmax / 4 <= vl <= 3 * vlmax / 4, the elements in the 4th register (namely reg+3) will all be counted as tail elements. I hope this answers your question. Regards, eop Chen
在 2022/3/30 下午6:02, eop Chen 写道: > > >> Weiwei Li <liweiwei@iscas.ac.cn <mailto:liweiwei@iscas.ac.cn>> 於 >> 2022年3月30日 下午4:27 寫道: >> 在 2022/3/30 下午3:42, 陳約廷 写道: >>> >>>> Weiwei Li <liweiwei@iscas.ac.cn <mailto:liweiwei@iscas.ac.cn>> 於 >>>> 2022年3月28日 下午7:56 寫道: >>>> >>>> >>>> 在 2022/3/7 下午3:10, ~eopxd 写道: >>>>> From: eopXD <eop.chen@sifive.com <mailto:eop.chen@sifive.com>> >> >> Another question: max_elems is equal to total_elems when lmul >= 0. >> >> So max_elems can be reused here instead of caculating total_elems again. >> >>> >>> I don’t understand your second question though. If nf = 3, there >>> will be 3 registers >>> involved with the instruction (namely reg, reg+1, reg+2). Why do we >>> care about >>> (reg+3)? >>> >> I just consider register group here. Reg, reg+1, reg+2 and reg+3 may >> belong to the same register group. >> >> Regards, >> >> Weiwei Li >> > > According to v-spec (under section 7.8): > > Each field will be held in successively numbered vector register > groups. When EMUL>1 > each field will occupy a vector register group held in multiple > successively numbered > vector registers, and the vector register group for each field > must follow the usual vector > register alignment constraints (e.g., when EMUL=2 and NFIELDS=4, > each field’s vector > register group must start at an even vector register, but does not > have to start at a multiple > of 8 vector register number). > > > I think the spec has explained itself that NFIELDS represents the > number of register groups involved > in this instruction. Therefore in a register group of 4 (LMUL = m2), > NFIELD should be no more than 2. > The `vlmax` here would be (VLEN * 4 / EEW). In this sense, if the `vl` > provided for the vector instruction > is within the range 2 * vlmax / 4 <= vl <= 3 * vlmax / 4, the elements > in the 4th register (namely reg+3) > will all be counted as tail elements. > > I hope this answers your question. OK, Thanks a lot. 
This truly answers my question, even though what I really wanted to know is the case of EMUL=1 and NFIELDS=3. Since NFIELDS represents the number of register groups, rather than the total of EMUL * NFIELDS registers being treated as one register group, reg+3 should not be counted as tail elements in my case. Regards, Weiwei Li > > Regards, > > eop Chen > >
diff --git a/target/riscv/insn_trans/trans_rvv.c.inc b/target/riscv/insn_trans/trans_rvv.c.inc index cc80bf00ff..66cfc8c603 100644 --- a/target/riscv/insn_trans/trans_rvv.c.inc +++ b/target/riscv/insn_trans/trans_rvv.c.inc @@ -711,6 +711,7 @@ static bool ld_us_op(DisasContext *s, arg_r2nfvm *a, uint8_t eew) data = FIELD_DP32(data, VDATA, VM, a->vm); data = FIELD_DP32(data, VDATA, LMUL, emul); data = FIELD_DP32(data, VDATA, NF, a->nf); + data = FIELD_DP32(data, VDATA, VTA, s->vta); return ldst_us_trans(a->rd, a->rs1, data, fn, s, false); } @@ -748,6 +749,7 @@ static bool st_us_op(DisasContext *s, arg_r2nfvm *a, uint8_t eew) data = FIELD_DP32(data, VDATA, VM, a->vm); data = FIELD_DP32(data, VDATA, LMUL, emul); data = FIELD_DP32(data, VDATA, NF, a->nf); + data = FIELD_DP32(data, VDATA, VTA, s->vta); return ldst_us_trans(a->rd, a->rs1, data, fn, s, true); } @@ -774,6 +776,7 @@ static bool ld_us_mask_op(DisasContext *s, arg_vlm_v *a, uint8_t eew) /* EMUL = 1, NFIELDS = 1 */ data = FIELD_DP32(data, VDATA, LMUL, 0); data = FIELD_DP32(data, VDATA, NF, 1); + data = FIELD_DP32(data, VDATA, VTA, s->vta); return ldst_us_trans(a->rd, a->rs1, data, fn, s, false); } @@ -791,6 +794,7 @@ static bool st_us_mask_op(DisasContext *s, arg_vsm_v *a, uint8_t eew) /* EMUL = 1, NFIELDS = 1 */ data = FIELD_DP32(data, VDATA, LMUL, 0); data = FIELD_DP32(data, VDATA, NF, 1); + data = FIELD_DP32(data, VDATA, VTA, s->vta); return ldst_us_trans(a->rd, a->rs1, data, fn, s, true); } @@ -862,6 +866,7 @@ static bool ld_stride_op(DisasContext *s, arg_rnfvm *a, uint8_t eew) data = FIELD_DP32(data, VDATA, VM, a->vm); data = FIELD_DP32(data, VDATA, LMUL, emul); data = FIELD_DP32(data, VDATA, NF, a->nf); + data = FIELD_DP32(data, VDATA, VTA, s->vta); return ldst_stride_trans(a->rd, a->rs1, a->rs2, data, fn, s, false); } @@ -891,6 +896,7 @@ static bool st_stride_op(DisasContext *s, arg_rnfvm *a, uint8_t eew) data = FIELD_DP32(data, VDATA, VM, a->vm); data = FIELD_DP32(data, VDATA, LMUL, emul); data = 
FIELD_DP32(data, VDATA, NF, a->nf); + data = FIELD_DP32(data, VDATA, VTA, s->vta); fn = fns[eew]; if (fn == NULL) { return false; @@ -991,6 +997,7 @@ static bool ld_index_op(DisasContext *s, arg_rnfvm *a, uint8_t eew) data = FIELD_DP32(data, VDATA, VM, a->vm); data = FIELD_DP32(data, VDATA, LMUL, emul); data = FIELD_DP32(data, VDATA, NF, a->nf); + data = FIELD_DP32(data, VDATA, VTA, s->vta); return ldst_index_trans(a->rd, a->rs1, a->rs2, data, fn, s, false); } @@ -1043,6 +1050,7 @@ static bool st_index_op(DisasContext *s, arg_rnfvm *a, uint8_t eew) data = FIELD_DP32(data, VDATA, VM, a->vm); data = FIELD_DP32(data, VDATA, LMUL, emul); data = FIELD_DP32(data, VDATA, NF, a->nf); + data = FIELD_DP32(data, VDATA, VTA, s->vta); return ldst_index_trans(a->rd, a->rs1, a->rs2, data, fn, s, true); } @@ -1108,6 +1116,7 @@ static bool ldff_op(DisasContext *s, arg_r2nfvm *a, uint8_t eew) data = FIELD_DP32(data, VDATA, VM, a->vm); data = FIELD_DP32(data, VDATA, LMUL, emul); data = FIELD_DP32(data, VDATA, NF, a->nf); + data = FIELD_DP32(data, VDATA, VTA, s->vta); return ldff_trans(a->rd, a->rs1, data, fn, s); } diff --git a/target/riscv/vector_helper.c b/target/riscv/vector_helper.c index 39c79c59c2..1c7015e917 100644 --- a/target/riscv/vector_helper.c +++ b/target/riscv/vector_helper.c @@ -289,6 +289,9 @@ vext_ldst_stride(void *vd, void *v0, target_ulong base, uint32_t i, k; uint32_t nf = vext_nf(desc); uint32_t max_elems = vext_max_elems(desc, log2_esz); + uint32_t esz = 1 << log2_esz; + uint32_t total_elems = vext_get_total_elems(desc, esz); + uint32_t vta = vext_vta(desc); for (i = env->vstart; i < env->vl; i++, env->vstart++) { if (!vm && !vext_elem_mask(v0, i)) { @@ -303,6 +306,11 @@ vext_ldst_stride(void *vd, void *v0, target_ulong base, } } env->vstart = 0; + /* set tail elements to 1s */ + for (k = 0; k < nf; ++k) { + vext_set_elems_1s_fns[log2_esz](vd, vta, env->vl + k * total_elems, + env->vl * esz, total_elems * esz); + } } #define GEN_VEXT_LD_STRIDE(NAME, ETYPE, 
LOAD_FN) \ @@ -348,6 +356,9 @@ vext_ldst_us(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc, uint32_t i, k; uint32_t nf = vext_nf(desc); uint32_t max_elems = vext_max_elems(desc, log2_esz); + uint32_t esz = 1 << log2_esz; + uint32_t total_elems = vext_get_total_elems(desc, esz); + uint32_t vta = vext_vta(desc); /* load bytes from guest memory */ for (i = env->vstart; i < evl; i++, env->vstart++) { @@ -359,6 +370,11 @@ vext_ldst_us(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc, } } env->vstart = 0; + /* set tail elements to 1s */ + for (k = 0; k < nf; ++k) { + vext_set_elems_1s_fns[log2_esz](vd, vta, env->vl + k * total_elems, + env->vl * esz, total_elems * esz); + } } /* @@ -458,6 +474,9 @@ vext_ldst_index(void *vd, void *v0, target_ulong base, uint32_t nf = vext_nf(desc); uint32_t vm = vext_vm(desc); uint32_t max_elems = vext_max_elems(desc, log2_esz); + uint32_t esz = 1 << log2_esz; + uint32_t total_elems = vext_get_total_elems(desc, esz); + uint32_t vta = vext_vta(desc); /* load bytes from guest memory */ for (i = env->vstart; i < env->vl; i++, env->vstart++) { @@ -473,6 +492,11 @@ vext_ldst_index(void *vd, void *v0, target_ulong base, } } env->vstart = 0; + /* set tail elements to 1s */ + for (k = 0; k < nf; ++k) { + vext_set_elems_1s_fns[log2_esz](vd, vta, env->vl + k * total_elems, + env->vl * esz, total_elems * esz); + } } #define GEN_VEXT_LD_INDEX(NAME, ETYPE, INDEX_FN, LOAD_FN) \ @@ -540,6 +564,9 @@ vext_ldff(void *vd, void *v0, target_ulong base, uint32_t nf = vext_nf(desc); uint32_t vm = vext_vm(desc); uint32_t max_elems = vext_max_elems(desc, log2_esz); + uint32_t esz = 1 << log2_esz; + uint32_t total_elems = vext_get_total_elems(desc, esz); + uint32_t vta = vext_vta(desc); target_ulong addr, offset, remain; /* probe every access*/ @@ -595,6 +622,11 @@ ProbeSuccess: } } env->vstart = 0; + /* set tail elements to 1s */ + for (k = 0; k < nf; ++k) { + vext_set_elems_1s_fns[log2_esz](vd, vta, env->vl + k * total_elems, + 
env->vl * esz, total_elems * esz); + } } #define GEN_VEXT_LDFF(NAME, ETYPE, LOAD_FN) \