diff mbox series

[RFC,v2,41/44] target/loongarch: Implement vilvl vilvh vextrins vshuf

Message ID 20230328030631.3117129-42-gaosong@loongson.cn
State New
Headers show
Series Add LoongArch LSX instructions | expand

Commit Message

gaosong March 28, 2023, 3:06 a.m. UTC
This patch includes:
- VILV{L/H}.{B/H/W/D};
- VSHUF.{B/H/W/D};
- VSHUF4I.{B/H/W/D};
- VPERMI.W;
- VEXTRINS.{B/H/W/D}.

Signed-off-by: Song Gao <gaosong@loongson.cn>
---
 target/loongarch/disas.c                    |  25 +++
 target/loongarch/helper.h                   |  25 +++
 target/loongarch/insn_trans/trans_lsx.c.inc |  25 +++
 target/loongarch/insns.decode               |  25 +++
 target/loongarch/lsx_helper.c               | 163 ++++++++++++++++++++
 5 files changed, 263 insertions(+)

Comments

Richard Henderson April 4, 2023, 1:31 a.m. UTC | #1
On 3/27/23 20:06, Song Gao wrote:
> +void HELPER(vshuf_b)(CPULoongArchState *env,
> +                     uint32_t vd, uint32_t vj, uint32_t vk, uint32_t va)
> +{
> +    int i, m, k;
> +    VReg temp;
> +    VReg *Vd = &(env->fpr[vd].vreg);
> +    VReg *Vj = &(env->fpr[vj].vreg);
> +    VReg *Vk = &(env->fpr[vk].vreg);
> +    VReg *Va = &(env->fpr[va].vreg);
> +
> +    m = LSX_LEN/8;
> +    for (i = 0; i < m ; i++) {
> +        k = (Va->B(i)& 0x3f) % (2 * m);

Eh?  Double masking?

> +        temp.B(i) = (Va->B(i) & 0xc0) ? 0 : k < m ? Vk->B(k) : Vj->B(k - m);

Triple masking?

I would have expected something like

     k = Va->B(i) % N;
     temp.B(i) = (k < m ? Vj : k < 2 * m ? Vk : 0);

> +#define VSHUF(NAME, BIT, E)                                                  \
> +void HELPER(NAME)(CPULoongArchState *env,                                    \
> +                  uint32_t vd, uint32_t vj, uint32_t vk)                     \
> +{                                                                            \
> +    int i, m, k;                                                             \
> +    VReg temp;                                                               \
> +    VReg *Vd = &(env->fpr[vd].vreg);                                         \
> +    VReg *Vj = &(env->fpr[vj].vreg);                                         \
> +    VReg *Vk = &(env->fpr[vk].vreg);                                         \
> +                                                                             \
> +    m = LSX_LEN/BIT;                                                         \
> +    for (i = 0; i < m; i++) {                                                \
> +        k  = (Vd->E(i) & 0x3f) % (2 * m);                                    \
> +        temp.E(i) = (Vd->E(i) & 0xc0) ? 0 : k < m ? Vk->E(k) : Vj->E(k - m); \
> +    }                                                                        \
> +    Vd->D(0) = temp.D(0);                                                    \
> +    Vd->D(1) = temp.D(1);                                                    \
> +}

Likewise.

> +#define SHF_POS(i, imm) (((i) & 0xfc) + (((imm) >> (2 * ((i) & 0x03))) & 0x03))
> +
> +#define VSHUF4I(NAME, BIT, E)                             \
> +void HELPER(NAME)(CPULoongArchState *env,                 \
> +                  uint32_t vd, uint32_t vj, uint32_t imm) \
> +{                                                         \
> +    int i;                                                \
> +    VReg temp;                                            \
> +    VReg *Vd = &(env->fpr[vd].vreg);                      \
> +    VReg *Vj = &(env->fpr[vj].vreg);                      \
> +                                                          \
> +    for (i = 0; i < LSX_LEN/BIT; i++) {                   \
> +         temp.E(i) = Vj->E(SHF_POS(i, imm));              \
> +    }                                                     \
> +    Vd->D[0] = temp.D[0];                                 \
> +    Vd->D[1] = temp.D[1];                                 \
> +}

Merge SHF_POS unless you expect it to be used again?

> +void HELPER(vshuf4i_d)(CPULoongArchState *env,
> +                       uint32_t vd, uint32_t vj, uint32_t imm)
> +{
> +    VReg *Vd = &(env->fpr[vd].vreg);
> +    VReg *Vj = &(env->fpr[vj].vreg);
> +
> +    VReg temp;
> +    temp.D(0) = ((imm & 0x03) == 0x00) ? Vd->D(0):
> +                ((imm & 0x03) == 0x01) ? Vd->D(1):
> +                ((imm & 0x03) == 0x02) ? Vj->D(0): Vj->D(1);
> +
> +    temp.D(1) = ((imm & 0x0c) == 0x00) ? Vd->D(0):
> +                ((imm & 0x0c) == 0x04) ? Vd->D(1):
> +                ((imm & 0x0c) == 0x08) ? Vj->D(0): Vj->D(1);
> +
> +    Vd->D[0] = temp.D[0];
> +    Vd->D[1] = temp.D[1];
> +}

Perhaps

     temp.D(0) = (imm & 2 ? Vj : Vd)->D(imm & 1);
     temp.D(1) = (imm & 8 ? Vj : Vd)->D((imm >> 2) & 1);


r~
diff mbox series

Patch

diff --git a/target/loongarch/disas.c b/target/loongarch/disas.c
index c6cf782725..0b62bbb8be 100644
--- a/target/loongarch/disas.c
+++ b/target/loongarch/disas.c
@@ -1629,3 +1629,28 @@  INSN_LSX(vpickod_b,        vvv)
 INSN_LSX(vpickod_h,        vvv)
 INSN_LSX(vpickod_w,        vvv)
 INSN_LSX(vpickod_d,        vvv)
+
+INSN_LSX(vilvl_b,          vvv)
+INSN_LSX(vilvl_h,          vvv)
+INSN_LSX(vilvl_w,          vvv)
+INSN_LSX(vilvl_d,          vvv)
+INSN_LSX(vilvh_b,          vvv)
+INSN_LSX(vilvh_h,          vvv)
+INSN_LSX(vilvh_w,          vvv)
+INSN_LSX(vilvh_d,          vvv)
+
+INSN_LSX(vshuf_b,          vvvv)
+INSN_LSX(vshuf_h,          vvv)
+INSN_LSX(vshuf_w,          vvv)
+INSN_LSX(vshuf_d,          vvv)
+INSN_LSX(vshuf4i_b,        vv_i)
+INSN_LSX(vshuf4i_h,        vv_i)
+INSN_LSX(vshuf4i_w,        vv_i)
+INSN_LSX(vshuf4i_d,        vv_i)
+
+INSN_LSX(vpermi_w,         vv_i)
+
+INSN_LSX(vextrins_d,       vv_i)
+INSN_LSX(vextrins_w,       vv_i)
+INSN_LSX(vextrins_h,       vv_i)
+INSN_LSX(vextrins_b,       vv_i)
diff --git a/target/loongarch/helper.h b/target/loongarch/helper.h
index bf03a16afd..86c7eeeae1 100644
--- a/target/loongarch/helper.h
+++ b/target/loongarch/helper.h
@@ -686,3 +686,28 @@  DEF_HELPER_4(vpickod_b, void, env, i32, i32, i32)
 DEF_HELPER_4(vpickod_h, void, env, i32, i32, i32)
 DEF_HELPER_4(vpickod_w, void, env, i32, i32, i32)
 DEF_HELPER_4(vpickod_d, void, env, i32, i32, i32)
+
+DEF_HELPER_4(vilvl_b, void, env, i32, i32, i32)
+DEF_HELPER_4(vilvl_h, void, env, i32, i32, i32)
+DEF_HELPER_4(vilvl_w, void, env, i32, i32, i32)
+DEF_HELPER_4(vilvl_d, void, env, i32, i32, i32)
+DEF_HELPER_4(vilvh_b, void, env, i32, i32, i32)
+DEF_HELPER_4(vilvh_h, void, env, i32, i32, i32)
+DEF_HELPER_4(vilvh_w, void, env, i32, i32, i32)
+DEF_HELPER_4(vilvh_d, void, env, i32, i32, i32)
+
+DEF_HELPER_5(vshuf_b, void, env, i32, i32, i32, i32)
+DEF_HELPER_4(vshuf_h, void, env, i32, i32, i32)
+DEF_HELPER_4(vshuf_w, void, env, i32, i32, i32)
+DEF_HELPER_4(vshuf_d, void, env, i32, i32, i32)
+DEF_HELPER_4(vshuf4i_b, void, env, i32, i32, i32)
+DEF_HELPER_4(vshuf4i_h, void, env, i32, i32, i32)
+DEF_HELPER_4(vshuf4i_w, void, env, i32, i32, i32)
+DEF_HELPER_4(vshuf4i_d, void, env, i32, i32, i32)
+
+DEF_HELPER_4(vpermi_w, void, env, i32, i32, i32)
+
+DEF_HELPER_4(vextrins_b, void, env, i32, i32, i32)
+DEF_HELPER_4(vextrins_h, void, env, i32, i32, i32)
+DEF_HELPER_4(vextrins_w, void, env, i32, i32, i32)
+DEF_HELPER_4(vextrins_d, void, env, i32, i32, i32)
diff --git a/target/loongarch/insn_trans/trans_lsx.c.inc b/target/loongarch/insn_trans/trans_lsx.c.inc
index 66cb67a19c..0ea7c65445 100644
--- a/target/loongarch/insn_trans/trans_lsx.c.inc
+++ b/target/loongarch/insn_trans/trans_lsx.c.inc
@@ -3485,3 +3485,28 @@  TRANS(vpickod_b, gen_vvv, gen_helper_vpickod_b)
 TRANS(vpickod_h, gen_vvv, gen_helper_vpickod_h)
 TRANS(vpickod_w, gen_vvv, gen_helper_vpickod_w)
 TRANS(vpickod_d, gen_vvv, gen_helper_vpickod_d)
+
+TRANS(vilvl_b, gen_vvv, gen_helper_vilvl_b)
+TRANS(vilvl_h, gen_vvv, gen_helper_vilvl_h)
+TRANS(vilvl_w, gen_vvv, gen_helper_vilvl_w)
+TRANS(vilvl_d, gen_vvv, gen_helper_vilvl_d)
+TRANS(vilvh_b, gen_vvv, gen_helper_vilvh_b)
+TRANS(vilvh_h, gen_vvv, gen_helper_vilvh_h)
+TRANS(vilvh_w, gen_vvv, gen_helper_vilvh_w)
+TRANS(vilvh_d, gen_vvv, gen_helper_vilvh_d)
+
+TRANS(vshuf_b, gen_vvvv, gen_helper_vshuf_b)
+TRANS(vshuf_h, gen_vvv, gen_helper_vshuf_h)
+TRANS(vshuf_w, gen_vvv, gen_helper_vshuf_w)
+TRANS(vshuf_d, gen_vvv, gen_helper_vshuf_d)
+TRANS(vshuf4i_b, gen_vv_i, gen_helper_vshuf4i_b)
+TRANS(vshuf4i_h, gen_vv_i, gen_helper_vshuf4i_h)
+TRANS(vshuf4i_w, gen_vv_i, gen_helper_vshuf4i_w)
+TRANS(vshuf4i_d, gen_vv_i, gen_helper_vshuf4i_d)
+
+TRANS(vpermi_w, gen_vv_i, gen_helper_vpermi_w)
+
+TRANS(vextrins_b, gen_vv_i, gen_helper_vextrins_b)
+TRANS(vextrins_h, gen_vv_i, gen_helper_vextrins_h)
+TRANS(vextrins_w, gen_vv_i, gen_helper_vextrins_w)
+TRANS(vextrins_d, gen_vv_i, gen_helper_vextrins_d)
diff --git a/target/loongarch/insns.decode b/target/loongarch/insns.decode
index ab9e9e422f..0263bce28e 100644
--- a/target/loongarch/insns.decode
+++ b/target/loongarch/insns.decode
@@ -1231,3 +1231,28 @@  vpickod_b        0111 00010010 00000 ..... ..... .....    @vvv
 vpickod_h        0111 00010010 00001 ..... ..... .....    @vvv
 vpickod_w        0111 00010010 00010 ..... ..... .....    @vvv
 vpickod_d        0111 00010010 00011 ..... ..... .....    @vvv
+
+vilvl_b          0111 00010001 10100 ..... ..... .....    @vvv
+vilvl_h          0111 00010001 10101 ..... ..... .....    @vvv
+vilvl_w          0111 00010001 10110 ..... ..... .....    @vvv
+vilvl_d          0111 00010001 10111 ..... ..... .....    @vvv
+vilvh_b          0111 00010001 11000 ..... ..... .....    @vvv
+vilvh_h          0111 00010001 11001 ..... ..... .....    @vvv
+vilvh_w          0111 00010001 11010 ..... ..... .....    @vvv
+vilvh_d          0111 00010001 11011 ..... ..... .....    @vvv
+
+vshuf_b          0000 11010101 ..... ..... ..... .....    @vvvv
+vshuf_h          0111 00010111 10101 ..... ..... .....    @vvv
+vshuf_w          0111 00010111 10110 ..... ..... .....    @vvv
+vshuf_d          0111 00010111 10111 ..... ..... .....    @vvv
+vshuf4i_b        0111 00111001 00 ........ ..... .....    @vv_ui8
+vshuf4i_h        0111 00111001 01 ........ ..... .....    @vv_ui8
+vshuf4i_w        0111 00111001 10 ........ ..... .....    @vv_ui8
+vshuf4i_d        0111 00111001 11 ........ ..... .....    @vv_ui8
+
+vpermi_w         0111 00111110 01 ........ ..... .....    @vv_ui8
+
+vextrins_d       0111 00111000 00 ........ ..... .....    @vv_ui8
+vextrins_w       0111 00111000 01 ........ ..... .....    @vv_ui8
+vextrins_h       0111 00111000 10 ........ ..... .....    @vv_ui8
+vextrins_b       0111 00111000 11 ........ ..... .....    @vv_ui8
diff --git a/target/loongarch/lsx_helper.c b/target/loongarch/lsx_helper.c
index b8e0aa9d3b..56faa8684d 100644
--- a/target/loongarch/lsx_helper.c
+++ b/target/loongarch/lsx_helper.c
@@ -3121,3 +3121,166 @@  VPICKOD(vpickod_b, 16, B)
 VPICKOD(vpickod_h, 32, H)
 VPICKOD(vpickod_w, 64, W)
 VPICKOD(vpickod_d, 128, D)
+
+#define VILVL(NAME, BIT, E)                              \
+void HELPER(NAME)(CPULoongArchState *env,                \
+                  uint32_t vd, uint32_t vj, uint32_t vk) \
+{                                                        \
+    int i;                                               \
+    VReg temp;                                           \
+    VReg *Vd = &(env->fpr[vd].vreg);                     \
+    VReg *Vj = &(env->fpr[vj].vreg);                     \
+    VReg *Vk = &(env->fpr[vk].vreg);                     \
+                                                         \
+    for (i = 0; i < LSX_LEN/BIT; i++) {                  \
+        temp.E(2 * i + 1) = Vj->E(i);                    \
+        temp.E(2 * i) = Vk->E(i);                        \
+    }                                                    \
+    Vd->D(0) = temp.D(0);                                \
+    Vd->D(1) = temp.D(1);                                \
+}
+
+VILVL(vilvl_b, 16, B)
+VILVL(vilvl_h, 32, H)
+VILVL(vilvl_w, 64, W)
+VILVL(vilvl_d, 128, D)
+
+#define VILVH(NAME, BIT, E)                              \
+void HELPER(NAME)(CPULoongArchState *env,                \
+                  uint32_t vd, uint32_t vj, uint32_t vk) \
+{                                                        \
+    int i;                                               \
+    VReg temp;                                           \
+    VReg *Vd = &(env->fpr[vd].vreg);                     \
+    VReg *Vj = &(env->fpr[vj].vreg);                     \
+    VReg *Vk = &(env->fpr[vk].vreg);                     \
+                                                         \
+    for (i = 0; i < LSX_LEN/BIT; i++) {                  \
+        temp.E(2 * i + 1) = Vj->E(i + LSX_LEN/BIT);      \
+        temp.E(2 * i) = Vk->E(i + LSX_LEN/BIT);          \
+    }                                                    \
+    Vd->D(0) = temp.D(0);                                \
+    Vd->D(1) = temp.D(1);                                \
+}
+
+VILVH(vilvh_b, 16, B)
+VILVH(vilvh_h, 32, H)
+VILVH(vilvh_w, 64, W)
+VILVH(vilvh_d, 128, D)
+
+void HELPER(vshuf_b)(CPULoongArchState *env,
+                     uint32_t vd, uint32_t vj, uint32_t vk, uint32_t va)
+{
+    int i, m, k;
+    VReg temp;
+    VReg *Vd = &(env->fpr[vd].vreg);
+    VReg *Vj = &(env->fpr[vj].vreg);
+    VReg *Vk = &(env->fpr[vk].vreg);
+    VReg *Va = &(env->fpr[va].vreg);
+
+    m = LSX_LEN/8;
+    for (i = 0; i < m ; i++) {
+        k = (Va->B(i)& 0x3f) % (2 * m);
+        temp.B(i) = (Va->B(i) & 0xc0) ? 0 : k < m ? Vk->B(k) : Vj->B(k - m);
+    }
+    Vd->D(0) = temp.D(0);
+    Vd->D(1) = temp.D(1);
+}
+
+#define VSHUF(NAME, BIT, E)                                                  \
+void HELPER(NAME)(CPULoongArchState *env,                                    \
+                  uint32_t vd, uint32_t vj, uint32_t vk)                     \
+{                                                                            \
+    int i, m, k;                                                             \
+    VReg temp;                                                               \
+    VReg *Vd = &(env->fpr[vd].vreg);                                         \
+    VReg *Vj = &(env->fpr[vj].vreg);                                         \
+    VReg *Vk = &(env->fpr[vk].vreg);                                         \
+                                                                             \
+    m = LSX_LEN/BIT;                                                         \
+    for (i = 0; i < m; i++) {                                                \
+        k  = (Vd->E(i) & 0x3f) % (2 * m);                                    \
+        temp.E(i) = (Vd->E(i) & 0xc0) ? 0 : k < m ? Vk->E(k) : Vj->E(k - m); \
+    }                                                                        \
+    Vd->D(0) = temp.D(0);                                                    \
+    Vd->D(1) = temp.D(1);                                                    \
+}
+
+VSHUF(vshuf_h, 16, H)
+VSHUF(vshuf_w, 32, W)
+VSHUF(vshuf_d, 64, D)
+
+#define SHF_POS(i, imm) (((i) & 0xfc) + (((imm) >> (2 * ((i) & 0x03))) & 0x03))
+
+#define VSHUF4I(NAME, BIT, E)                             \
+void HELPER(NAME)(CPULoongArchState *env,                 \
+                  uint32_t vd, uint32_t vj, uint32_t imm) \
+{                                                         \
+    int i;                                                \
+    VReg temp;                                            \
+    VReg *Vd = &(env->fpr[vd].vreg);                      \
+    VReg *Vj = &(env->fpr[vj].vreg);                      \
+                                                          \
+    for (i = 0; i < LSX_LEN/BIT; i++) {                   \
+         temp.E(i) = Vj->E(SHF_POS(i, imm));              \
+    }                                                     \
+    Vd->D[0] = temp.D[0];                                 \
+    Vd->D[1] = temp.D[1];                                 \
+}
+
+VSHUF4I(vshuf4i_b, 8, B)
+VSHUF4I(vshuf4i_h, 16, H)
+VSHUF4I(vshuf4i_w, 32, W)
+
+void HELPER(vshuf4i_d)(CPULoongArchState *env,
+                       uint32_t vd, uint32_t vj, uint32_t imm)
+{
+    VReg *Vd = &(env->fpr[vd].vreg);
+    VReg *Vj = &(env->fpr[vj].vreg);
+
+    VReg temp;
+    temp.D(0) = ((imm & 0x03) == 0x00) ? Vd->D(0):
+                ((imm & 0x03) == 0x01) ? Vd->D(1):
+                ((imm & 0x03) == 0x02) ? Vj->D(0): Vj->D(1);
+
+    temp.D(1) = ((imm & 0x0c) == 0x00) ? Vd->D(0):
+                ((imm & 0x0c) == 0x04) ? Vd->D(1):
+                ((imm & 0x0c) == 0x08) ? Vj->D(0): Vj->D(1);
+
+    Vd->D[0] = temp.D[0];
+    Vd->D[1] = temp.D[1];
+}
+
+void HELPER(vpermi_w)(CPULoongArchState *env,
+                      uint32_t vd, uint32_t vj, uint32_t imm)
+{
+    VReg temp;
+    VReg *Vd = &(env->fpr[vd].vreg);
+    VReg *Vj = &(env->fpr[vj].vreg);
+
+    temp.W(0) = Vj->W(imm & 0x3);
+    temp.W(1) = Vj->W((imm >> 2) & 0x3);
+    temp.W(2) = Vd->W((imm >> 4) & 0x3);
+    temp.W(3) = Vd->W((imm >> 6) & 0x3);
+
+    Vd->D(0) = temp.D(0);
+    Vd->D(1) = temp.D(1);
+}
+
+#define VEXTRINS(NAME, BIT, E, MASK)                      \
+void HELPER(NAME)(CPULoongArchState *env,                 \
+                  uint32_t vd, uint32_t vj, uint32_t imm) \
+{                                                         \
+    int ins, extr;                                        \
+    VReg *Vd = &(env->fpr[vd].vreg);                      \
+    VReg *Vj = &(env->fpr[vj].vreg);                      \
+                                                          \
+    ins = (imm >> 4) & MASK;                              \
+    extr = imm & MASK;                                    \
+    Vd->E(ins) = Vj->E(extr);                             \
+}
+
+VEXTRINS(vextrins_b, 8, B, 0xf)
+VEXTRINS(vextrins_h, 16, H, 0x7)
+VEXTRINS(vextrins_w, 32, W, 0x3)
+VEXTRINS(vextrins_d, 64, D, 0x1)