diff mbox

[16/16,v1] target-tilegx: Implement additional instructions in normal working flow

Message ID BLU436-SMTP121196FC184264F6146E853B9660@phx.gbl
State New
Headers show

Commit Message

Chen Gang Aug. 20, 2015, 9:48 p.m. UTC
They are the vector instructions, plus pcnt, revbytes, icoh, and drain.

Signed-off-by: Chen Gang <gang.chen.5i5j@gmail.com>
---
 target-tilegx/helper.c    |  12 +-
 target-tilegx/helper.h    |   4 +-
 target-tilegx/translate.c | 431 ++++++++++++++++++++++++++++++++++------------
 3 files changed, 331 insertions(+), 116 deletions(-)
diff mbox

Patch

diff --git a/target-tilegx/helper.c b/target-tilegx/helper.c
index 5ab41cd..98dd805 100644
--- a/target-tilegx/helper.c
+++ b/target-tilegx/helper.c
@@ -24,7 +24,7 @@ 
 
 #define SIGNBIT32 0x80000000
 
-int64_t helper_add_saturate(CPUTLGState *env, uint64_t rsrc, uint64_t rsrcb)
+int64_t helper_add_saturate(uint64_t rsrc, uint64_t rsrcb)
 {
     uint32_t rdst = rsrc + rsrcb;
 
@@ -53,6 +53,16 @@  uint64_t helper_cnttz(uint64_t arg)
     return ctz64(arg);
 }
 
+uint64_t helper_pcnt(uint64_t arg)
+{
+    return ctpop64(arg);
+}
+
+uint64_t helper_revbytes(uint64_t arg)
+{
+    return bswap64(arg);
+}
+
 /*
  * Functional Description
  *     uint64_t a = rf[SrcA];
diff --git a/target-tilegx/helper.h b/target-tilegx/helper.h
index 1411c19..fbd995a 100644
--- a/target-tilegx/helper.h
+++ b/target-tilegx/helper.h
@@ -1,5 +1,7 @@ 
 DEF_HELPER_2(exception, noreturn, env, i32)
 DEF_HELPER_FLAGS_1(cntlz, TCG_CALL_NO_RWG_SE, i64, i64)
 DEF_HELPER_FLAGS_1(cnttz, TCG_CALL_NO_RWG_SE, i64, i64)
+DEF_HELPER_FLAGS_1(pcnt, TCG_CALL_NO_RWG_SE, i64, i64)
+DEF_HELPER_FLAGS_1(revbytes, TCG_CALL_NO_RWG_SE, i64, i64)
 DEF_HELPER_FLAGS_3(shufflebytes, TCG_CALL_NO_RWG_SE, i64, i64, i64, i64)
-DEF_HELPER_3(add_saturate, s64, env, i64, i64)
+DEF_HELPER_FLAGS_2(add_saturate, TCG_CALL_NO_RWG_SE, s64, i64, i64)
diff --git a/target-tilegx/translate.c b/target-tilegx/translate.c
index 9ae1c6d..6d993b4 100644
--- a/target-tilegx/translate.c
+++ b/target-tilegx/translate.c
@@ -302,20 +302,104 @@  static void gen_mtspr(struct DisasContext *dc, uint8_t rsrc, uint16_t imm14)
     set_exception(dc, TILEGX_EXCP_OPCODE_UNIMPLEMENTED);
 }
 
-static void extract_v1(TCGv out, TCGv in, unsigned byte)
+static void extract_v(TCGv out, TCGv in, int count, int v)
 {
-    tcg_gen_shri_i64(out, in, byte * 8);
-    tcg_gen_ext8u_i64(out, out);
+    tcg_gen_shri_i64(out, in, count * v * 8);
+    switch (v) {
+    case 1:
+        tcg_gen_ext8u_i64(out, out);
+        break;
+    case 2:
+        tcg_gen_ext16u_i64(out, out);
+        break;
+    case 4:
+        tcg_gen_ext32u_i64(out, out);
+        break;
+    default:
+        g_assert_not_reached();
+    }
 }
 
-static void insert_v1(TCGv out, TCGv in, unsigned byte)
+static void insert_v(TCGv out, TCGv in, int count, int v)
 {
-    tcg_gen_deposit_i64(out, out, in, byte * 8, 8);
+    tcg_gen_deposit_i64(out, out, in, count * v * 8, v * 8);
 }
 
-static void gen_v1cmpi(struct DisasContext *dc,
-                       uint8_t rdst, uint8_t rsrc, int8_t imm8,
-                       TCGCond cond, const char *code)
+static void gen_vadd(struct DisasContext *dc,
+                     uint8_t rdst, uint8_t rsrc, uint8_t rsrcb, int v)
+{
+    int count;
+    TCGv vdst = dest_gr(dc, rdst);
+    TCGv vsrc = load_gr(dc, rsrc);
+    TCGv vsrcb = load_gr(dc, rsrcb);
+    TCGv tmp = tcg_temp_new_i64();
+    TCGv tmpb = tcg_temp_new_i64();
+
+    qemu_log_mask(CPU_LOG_TB_IN_ASM, "v%dadd r%d, r%d, r%d\n",
+                  v, rdst, rsrc, rsrcb);
+
+    tcg_gen_movi_i64(vdst, 0);
+
+    for (count = 0; count < sizeof(uint64_t) / v; count++) {
+        extract_v(tmp, vsrc, count, v);
+        extract_v(tmpb, vsrcb, count, v);
+        tcg_gen_add_i64(tmp, tmp, tmpb);
+        insert_v(vdst, tmp, count, v);
+    }
+
+    tcg_temp_free_i64(tmpb);
+    tcg_temp_free_i64(tmp);
+}
+
+static void gen_vsub(struct DisasContext *dc,
+                     uint8_t rdst, uint8_t rsrc, uint8_t rsrcb, int v)
+{
+    int count;
+    TCGv vdst = dest_gr(dc, rdst);
+    TCGv vsrc = load_gr(dc, rsrc);
+    TCGv vsrcb = load_gr(dc, rsrcb);
+    TCGv tmp = tcg_temp_new_i64();
+    TCGv tmpb = tcg_temp_new_i64();
+
+    qemu_log_mask(CPU_LOG_TB_IN_ASM, "v%dsub r%d, r%d, r%d\n",
+                  v, rdst, rsrc, rsrcb);
+
+    tcg_gen_movi_i64(vdst, 0);
+
+    for (count = 0; count < sizeof(uint64_t) / v; count++) {
+        extract_v(tmp, vsrc, count, v);
+        extract_v(tmpb, vsrcb, count, v);
+        tcg_gen_sub_i64(tmp, tmp, tmpb);
+        insert_v(vdst, tmp, count, v);
+    }
+
+    tcg_temp_free_i64(tmpb);
+    tcg_temp_free_i64(tmp);
+}
+
+static void gen_vaddi(struct DisasContext *dc,
+                      uint8_t rdst, uint8_t rsrc, int8_t imm8, int v)
+{
+    int count;
+    TCGv vdst = dest_gr(dc, rdst);
+    TCGv vsrc = load_gr(dc, rsrc);
+    TCGv tmp = tcg_temp_new_i64();
+
+    qemu_log_mask(CPU_LOG_TB_IN_ASM, "v%daddi r%d, r%d, %d\n",
+                  v, rdst, rsrc, imm8);
+
+    tcg_gen_movi_i64(vdst, 0);
+    for (count = 0; count < sizeof(uint64_t) / v; count++) {
+        extract_v(tmp, vsrc, count, v);
+        tcg_gen_addi_i64(tmp, tmp, imm8);
+        insert_v(vdst, tmp, count, v);
+    }
+    tcg_temp_free_i64(tmp);
+}
+
+static void gen_vcmpi(struct DisasContext *dc,
+                      uint8_t rdst, uint8_t rsrc, int8_t imm8, int v,
+                      TCGCond cond, const char *code)
 {
     int count;
     TCGv vdst = dest_gr(dc, rdst);
@@ -326,17 +410,17 @@  static void gen_v1cmpi(struct DisasContext *dc,
                   code, rdst, rsrc, imm8);
 
     tcg_gen_movi_i64(vdst, 0);
-    for (count = 0; count < 8; count++) {
-        extract_v1(tmp, vsrc, count);
+    for (count = 0; count < sizeof(uint64_t) / v; count++) {
+        extract_v(tmp, vsrc, count, v);
         tcg_gen_setcondi_i64(cond, tmp, tmp, imm8);
-        insert_v1(vdst, tmp, count);
+        insert_v(vdst, tmp, count, v);
     }
     tcg_temp_free_i64(tmp);
 }
 
-static void gen_v1cmp(struct DisasContext *dc,
-                      uint8_t rdst, uint8_t rsrc, uint8_t rsrcb,
-                      TCGCond cond, const char *code)
+static void gen_vcmp(struct DisasContext *dc,
+                     uint8_t rdst, uint8_t rsrc, uint8_t rsrcb, int v,
+                     TCGCond cond, const char *code)
 {
     int count;
     TCGv vdst = dest_gr(dc, rdst);
@@ -349,33 +433,33 @@  static void gen_v1cmp(struct DisasContext *dc,
                   code, rdst, rsrc, rsrcb);
 
     tcg_gen_movi_i64(vdst, 0);
-    for (count = 0; count < 8; count++) {
-        extract_v1(tmp, vsrc, count);
-        extract_v1(tmp2, vsrcb, count);
+    for (count = 0; count < sizeof(uint64_t) / v; count++) {
+        extract_v(tmp, vsrc, count, v);
+        extract_v(tmp2, vsrcb, count, v);
         tcg_gen_setcond_i64(cond, tmp, tmp, tmp2);
-        insert_v1(vdst, tmp, count);
+        insert_v(vdst, tmp, count, v);
     }
     tcg_temp_free_i64(tmp2);
     tcg_temp_free_i64(tmp);
 }
 
-static void gen_v1shrui(struct DisasContext *dc,
-                      uint8_t rdst, uint8_t rsrc, uint8_t shamt)
+static void gen_vshrui(struct DisasContext *dc,
+                       uint8_t rdst, uint8_t rsrc, uint8_t shamt, int v)
 {
     int count;
     TCGv vdst = dest_gr(dc, rdst);
     TCGv vsrc = load_gr(dc, rsrc);
     TCGv tmp = tcg_temp_new_i64();
 
-    qemu_log_mask(CPU_LOG_TB_IN_ASM, "v1shrui r%d, r%d, %u\n",
-                  rdst, rsrc, shamt);
+    qemu_log_mask(CPU_LOG_TB_IN_ASM, "v%dshrui r%d, r%d, %u\n",
+                  v, rdst, rsrc, shamt);
 
     shamt &= 7;
     tcg_gen_movi_i64(vdst, 0);
-    for (count = 0; count < 8; count++) {
-        extract_v1(tmp, vsrc, count);
+    for (count = 0; count < sizeof(uint64_t) / v; count++) {
+        extract_v(tmp, vsrc, count, v);
         tcg_gen_shri_i64(tmp, tmp, shamt);
-        insert_v1(vdst, tmp, count);
+        insert_v(vdst, tmp, count, v);
     }
     tcg_temp_free_i64(tmp);
 }
@@ -404,8 +488,8 @@  static void gen_v1shrui(struct DisasContext *dc,
  *          }
  *        rf[Dest] = output;
  */
-static void gen_v1int_l(struct DisasContext *dc,
-                        uint8_t rdst, uint8_t rsrc, uint8_t rsrcb)
+static void gen_vint_l(struct DisasContext *dc,
+                       uint8_t rdst, uint8_t rsrc, uint8_t rsrcb, int v)
 {
     int count;
     TCGv vdst = dest_gr(dc, rdst);
@@ -413,19 +497,20 @@  static void gen_v1int_l(struct DisasContext *dc,
     TCGv vsrcb = load_gr(dc, rsrcb);
     TCGv tmp = tcg_temp_new_i64();
 
-    qemu_log_mask(CPU_LOG_TB_IN_ASM, "v1int_l r%d, r%d, r%d\n",
-                  rdst, rsrc, rsrcb);
+    qemu_log_mask(CPU_LOG_TB_IN_ASM, "v%dint_l r%d, r%d, r%d\n",
+                  v, rdst, rsrc, rsrcb);
 
     tcg_gen_movi_i64(vdst, 0);
-    for (count = 0; count < 4; count++) {
-        extract_v1(tmp, vsrc, count);
-        insert_v1(vdst, tmp, 2 * count + 1);
-        extract_v1(tmp, vsrcb, count);
-        insert_v1(vdst, tmp, 2 * count);
+    for (count = 0; count < sizeof(uint64_t) / (v * 2); count++) {
+        extract_v(tmp, vsrc, count, v);
+        insert_v(vdst, tmp, 2 * count + 1, v);
+        extract_v(tmp, vsrcb, count, v);
+        insert_v(vdst, tmp, 2 * count, v);
     }
     tcg_temp_free_i64(tmp);
 }
 
+/* Keep the specialized v4int_l implementation as a faster path. */
 static void gen_v4int_l(struct DisasContext *dc,
                         uint8_t rdst, uint8_t rsrc, uint8_t rsrcb)
 {
@@ -496,7 +581,7 @@  static void gen_addxsc(struct DisasContext *dc,
 {
     qemu_log_mask(CPU_LOG_TB_IN_ASM, "addxsc r%d, r%d, r%d\n",
                   rdst, rsrc, rsrcb);
-    gen_helper_add_saturate(dest_gr(dc, rdst), cpu_env,
+    gen_helper_add_saturate(dest_gr(dc, rdst),
                             load_gr(dc, rsrc), load_gr(dc, rsrcb));
 }
 
@@ -910,6 +995,18 @@  static void gen_cnttz(struct DisasContext *dc, uint8_t rdst, uint8_t rsrc)
     gen_helper_cnttz(dest_gr(dc, rdst), load_gr(dc, rsrc));
 }
 
+static void gen_pcnt(struct DisasContext *dc, uint8_t rdst, uint8_t rsrc)
+{
+    qemu_log_mask(CPU_LOG_TB_IN_ASM, "pcnt r%d, r%d\n", rdst, rsrc);
+    gen_helper_pcnt(dest_gr(dc, rdst), load_gr(dc, rsrc));
+}
+
+static void gen_revbytes(struct DisasContext *dc, uint8_t rdst, uint8_t rsrc)
+{
+    qemu_log_mask(CPU_LOG_TB_IN_ASM, "revbytes r%d, r%d\n", rdst, rsrc);
+    gen_helper_revbytes(dest_gr(dc, rdst), load_gr(dc, rsrc));
+}
+
 static void gen_ld(struct DisasContext *dc,
                    uint8_t rdst, uint8_t rsrc,
                    TCGMemOp ops, const char *code)
@@ -1008,6 +1105,18 @@  static void gen_wh64(struct DisasContext *dc, uint8_t rsrc)
     /* FIXME: Do we need any implementation for it? I guess no. */
 }
 
+static void gen_icoh(struct DisasContext *dc, uint8_t rsrc)
+{
+    qemu_log_mask(CPU_LOG_TB_IN_ASM, "icoh r%d\n", rsrc);
+    /* FIXME: Do we need any implementation for it? I guess no. */
+}
+
+static void gen_drain(struct DisasContext *dc)
+{
+    qemu_log_mask(CPU_LOG_TB_IN_ASM, "drain\n");
+    /* FIXME: Do we need any implementation for it? I guess no. */
+}
+
 static void gen_jr(struct DisasContext *dc, uint8_t rsrc)
 {
     qemu_log_mask(CPU_LOG_TB_IN_ASM, "jr(p) r%d\n", rsrc);
@@ -1096,6 +1205,12 @@  static void decode_u_opcode_ex_y0(struct DisasContext *dc,
     case CNTTZ_UNARY_OPCODE_Y0:
         gen_cnttz(dc, rdst, rsrc);
         return;
+    case PCNT_UNARY_OPCODE_Y0:
+        gen_pcnt(dc, rdst, rsrc);
+        return;
+    case REVBYTES_UNARY_OPCODE_Y0:
+        gen_revbytes(dc, rdst, rsrc);
+        return;
     case FNOP_UNARY_OPCODE_Y0:
     case NOP_UNARY_OPCODE_Y0:
         if (!rsrc && !rdst) {
@@ -1104,9 +1219,7 @@  static void decode_u_opcode_ex_y0(struct DisasContext *dc,
         }
         /* Fall through */
     case FSINGLE_PACK1_UNARY_OPCODE_Y0:
-    case PCNT_UNARY_OPCODE_Y0:
     case REVBITS_UNARY_OPCODE_Y0:
-    case REVBYTES_UNARY_OPCODE_Y0:
     case TBLIDXB0_UNARY_OPCODE_Y0:
     case TBLIDXB1_UNARY_OPCODE_Y0:
     case TBLIDXB2_UNARY_OPCODE_Y0:
@@ -1603,9 +1716,7 @@  static void decode_ldst2_opcode_y2(struct DisasContext *dc,
         gen_ld(dc, rsrcbdst, rsrc, MO_LEUL, "ld4u");
         return;
     case MODE_OPCODE_YA2:
-        qemu_log_mask(LOG_UNIMP,
-                      "UNIMP ldst2_opcode_y2, [" FMT64X "]\n", bundle);
-        set_exception(dc, TILEGX_EXCP_OPCODE_UNIMPLEMENTED);
+        gen_ld(dc, rsrcbdst, rsrc, MO_LESW, "ld2s");
         return;
     default:
         g_assert_not_reached();
@@ -1690,25 +1801,35 @@  static void decode_imm8_opcode_x0(struct DisasContext *dc,
     case ORI_IMM8_OPCODE_X0:
         gen_ori(dc, rdst, rsrc, imm8);
         return;
+    case V1ADDI_IMM8_OPCODE_X0:
+        gen_vaddi(dc, rdst, rsrc, imm8, 1);
+        return;
     case V1CMPEQI_IMM8_OPCODE_X0:
-        gen_v1cmpi(dc, rdst, rsrc, imm8, TCG_COND_EQ, "v1cmpeqi");
+        gen_vcmpi(dc, rdst, rsrc, imm8, 1, TCG_COND_EQ, "v1cmpeqi");
         return;
     case V1CMPLTSI_IMM8_OPCODE_X0:
-        gen_v1cmpi(dc, rdst, rsrc, imm8, TCG_COND_LT, "v1cmpltsi");
+        gen_vcmpi(dc, rdst, rsrc, imm8, 1, TCG_COND_LT, "v1cmpltsi");
         return;
     case V1CMPLTUI_IMM8_OPCODE_X0:
-        gen_v1cmpi(dc, rdst, rsrc, imm8, TCG_COND_LTU, "v1cmpltui");
+        gen_vcmpi(dc, rdst, rsrc, imm8, 1, TCG_COND_LTU, "v1cmpltui");
+        return;
+    case V2ADDI_IMM8_OPCODE_X0:
+        gen_vaddi(dc, rdst, rsrc, imm8, 2);
+        return;
+    case V2CMPEQI_IMM8_OPCODE_X0:
+        gen_vcmpi(dc, rdst, rsrc, imm8, 2, TCG_COND_EQ, "v2cmpeqi");
+        return;
+    case V2CMPLTSI_IMM8_OPCODE_X0:
+        gen_vcmpi(dc, rdst, rsrc, imm8, 2, TCG_COND_LT, "v2cmpltsi");
+        return;
+    case V2CMPLTUI_IMM8_OPCODE_X0:
+        gen_vcmpi(dc, rdst, rsrc, imm8, 2, TCG_COND_LTU, "v2cmpltui");
         return;
     case XORI_IMM8_OPCODE_X0:
         gen_xori(dc, rdst, rsrc, imm8);
         return;
-    case V1ADDI_IMM8_OPCODE_X0:
     case V1MAXUI_IMM8_OPCODE_X0:
     case V1MINUI_IMM8_OPCODE_X0:
-    case V2ADDI_IMM8_OPCODE_X0:
-    case V2CMPEQI_IMM8_OPCODE_X0:
-    case V2CMPLTSI_IMM8_OPCODE_X0:
-    case V2CMPLTUI_IMM8_OPCODE_X0:
     case V2MAXSI_IMM8_OPCODE_X0:
     case V2MINSI_IMM8_OPCODE_X0:
         qemu_log_mask(LOG_UNIMP,
@@ -1733,6 +1854,12 @@  static void decode_u_opcode_ex_x0(struct DisasContext *dc,
     case CNTTZ_UNARY_OPCODE_X0:
         gen_cnttz(dc, rdst, rsrc);
         return;
+    case PCNT_UNARY_OPCODE_X0:
+        gen_pcnt(dc, rdst, rsrc);
+        return;
+    case REVBYTES_UNARY_OPCODE_X0:
+        gen_revbytes(dc, rdst, rsrc);
+        return;
     case FNOP_UNARY_OPCODE_X0:
     case NOP_UNARY_OPCODE_X0:
         if (!rsrc && !rdst) {
@@ -1741,9 +1868,7 @@  static void decode_u_opcode_ex_x0(struct DisasContext *dc,
         }
         /* Fall through */
     case FSINGLE_PACK1_UNARY_OPCODE_X0:
-    case PCNT_UNARY_OPCODE_X0:
     case REVBITS_UNARY_OPCODE_X0:
-    case REVBYTES_UNARY_OPCODE_X0:
     case TBLIDXB0_UNARY_OPCODE_X0:
     case TBLIDXB1_UNARY_OPCODE_X0:
     case TBLIDXB2_UNARY_OPCODE_X0:
@@ -1950,28 +2075,68 @@  static void decode_rrr_0_opcode_x0(struct DisasContext *dc,
     case UNARY_RRR_0_OPCODE_X0:
         return decode_u_opcode_ex_x0(dc, bundle);
     case V1INT_L_RRR_0_OPCODE_X0:
-        gen_v1int_l(dc, rdst, rsrc, rsrcb);
+        gen_vint_l(dc, rdst, rsrc, rsrcb, 1);
         return;
-    case V4INT_L_RRR_0_OPCODE_X0:
-        gen_v4int_l(dc, rdst, rsrc, rsrcb);
+    case V1ADD_RRR_0_OPCODE_X0:
+        gen_vadd(dc, rdst, rsrc, rsrcb, 1);
+        return;
+    case V1SUB_RRR_0_OPCODE_X0:
+        gen_vsub(dc, rdst, rsrc, rsrcb, 1);
+        return;
+    case V2ADD_RRR_0_OPCODE_X0:
+        gen_vadd(dc, rdst, rsrc, rsrcb, 2);
+        return;
+    case V2SUB_RRR_0_OPCODE_X0:
+        gen_vsub(dc, rdst, rsrc, rsrcb, 2);
+        return;
+    case V4ADD_RRR_0_OPCODE_X0:
+        gen_vadd(dc, rdst, rsrc, rsrcb, 4);
+        return;
+    case V4SUB_RRR_0_OPCODE_X0:
+        gen_vsub(dc, rdst, rsrc, rsrcb, 4);
         return;
     case V1CMPEQ_RRR_0_OPCODE_X0:
-        gen_v1cmp(dc, rdst, rsrc, rsrcb, TCG_COND_EQ, "v1cmpeq");
+        gen_vcmp(dc, rdst, rsrc, rsrcb, 1, TCG_COND_EQ, "v1cmpeq");
         return;
     case V1CMPLES_RRR_0_OPCODE_X0:
-        gen_v1cmp(dc, rdst, rsrc, rsrcb, TCG_COND_LE, "v1cmples");
+        gen_vcmp(dc, rdst, rsrc, rsrcb, 1, TCG_COND_LE, "v1cmples");
         return;
     case V1CMPLEU_RRR_0_OPCODE_X0:
-        gen_v1cmp(dc, rdst, rsrc, rsrcb, TCG_COND_LEU, "v1cmpleu");
+        gen_vcmp(dc, rdst, rsrc, rsrcb, 1, TCG_COND_LEU, "v1cmpleu");
         return;
     case V1CMPLTS_RRR_0_OPCODE_X0:
-        gen_v1cmp(dc, rdst, rsrc, rsrcb, TCG_COND_LT, "v1cmplts");
+        gen_vcmp(dc, rdst, rsrc, rsrcb, 1, TCG_COND_LT, "v1cmplts");
         return;
     case V1CMPLTU_RRR_0_OPCODE_X0:
-        gen_v1cmp(dc, rdst, rsrc, rsrcb, TCG_COND_LTU, "v1cmpltu");
+        gen_vcmp(dc, rdst, rsrc, rsrcb, 1, TCG_COND_LTU, "v1cmpltu");
         return;
     case V1CMPNE_RRR_0_OPCODE_X0:
-        gen_v1cmp(dc, rdst, rsrc, rsrcb, TCG_COND_NE, "v1cmpne");
+        gen_vcmp(dc, rdst, rsrc, rsrcb, 1, TCG_COND_NE, "v1cmpne");
+        return;
+    case V2CMPEQ_RRR_0_OPCODE_X0:
+        gen_vcmp(dc, rdst, rsrc, rsrcb, 2, TCG_COND_EQ, "v2cmpeq");
+        return;
+    case V2CMPLES_RRR_0_OPCODE_X0:
+        gen_vcmp(dc, rdst, rsrc, rsrcb, 2, TCG_COND_LE, "v2cmples");
+        return;
+    case V2CMPLEU_RRR_0_OPCODE_X0:
+        gen_vcmp(dc, rdst, rsrc, rsrcb, 2, TCG_COND_LEU, "v2cmpleu");
+        return;
+    case V2CMPLTS_RRR_0_OPCODE_X0:
+        gen_vcmp(dc, rdst, rsrc, rsrcb, 2, TCG_COND_LT, "v2cmplts");
+        return;
+    case V2CMPLTU_RRR_0_OPCODE_X0:
+        gen_vcmp(dc, rdst, rsrc, rsrcb, 2, TCG_COND_LTU, "v2cmpltu");
+        return;
+    case V2CMPNE_RRR_0_OPCODE_X0:
+        gen_vcmp(dc, rdst, rsrc, rsrcb, 2, TCG_COND_NE, "v2cmpne");
+        return;
+    case V2INT_L_RRR_0_OPCODE_X0:
+        gen_vint_l(dc, rdst, rsrc, rsrcb, 2);
+        return;
+    case V4INT_L_RRR_0_OPCODE_X0:
+        /* v4int_l is a little faster than generic vint_l */
+        gen_v4int_l(dc, rdst, rsrc, rsrcb);
         return;
     case XOR_RRR_0_OPCODE_X0:
         gen_xor(dc, rdst, rsrc, rsrcb);
@@ -1988,8 +2153,8 @@  static void decode_rrr_0_opcode_x0(struct DisasContext *dc,
     case DBLALIGN2_RRR_0_OPCODE_X0:
     case DBLALIGN4_RRR_0_OPCODE_X0:
     case DBLALIGN6_RRR_0_OPCODE_X0:
-    case FDOUBLE_ADDSUB_RRR_0_OPCODE_X0:
     case FDOUBLE_ADD_FLAGS_RRR_0_OPCODE_X0:
+    case FDOUBLE_ADDSUB_RRR_0_OPCODE_X0:
     case FDOUBLE_MUL_FLAGS_RRR_0_OPCODE_X0:
     case FDOUBLE_PACK1_RRR_0_OPCODE_X0:
     case FDOUBLE_PACK2_RRR_0_OPCODE_X0:
@@ -2000,11 +2165,10 @@  static void decode_rrr_0_opcode_x0(struct DisasContext *dc,
     case FSINGLE_ADDSUB2_RRR_0_OPCODE_X0:
     case FSINGLE_MUL1_RRR_0_OPCODE_X0:
     case FSINGLE_MUL2_RRR_0_OPCODE_X0:
-    case FSINGLE_PACK2_RRR_0_OPCODE_X0:
     case FSINGLE_SUB1_RRR_0_OPCODE_X0:
+    case FSINGLE_PACK2_RRR_0_OPCODE_X0:
     case SUBXSC_RRR_0_OPCODE_X0:
     case V1ADDUC_RRR_0_OPCODE_X0:
-    case V1ADD_RRR_0_OPCODE_X0:
     case V1ADIFFU_RRR_0_OPCODE_X0:
     case V1AVGU_RRR_0_OPCODE_X0:
     case V1DDOTPUSA_RRR_0_OPCODE_X0:
@@ -2026,23 +2190,13 @@  static void decode_rrr_0_opcode_x0(struct DisasContext *dc,
     case V1SHRS_RRR_0_OPCODE_X0:
     case V1SHRU_RRR_0_OPCODE_X0:
     case V1SUBUC_RRR_0_OPCODE_X0:
-    case V1SUB_RRR_0_OPCODE_X0:
     case V1INT_H_RRR_0_OPCODE_X0:
-    case V2INT_H_RRR_0_OPCODE_X0:
-    case V2INT_L_RRR_0_OPCODE_X0:
-    case V4INT_H_RRR_0_OPCODE_X0:
     case V2ADDSC_RRR_0_OPCODE_X0:
-    case V2ADD_RRR_0_OPCODE_X0:
     case V2ADIFFS_RRR_0_OPCODE_X0:
     case V2AVGS_RRR_0_OPCODE_X0:
-    case V2CMPEQ_RRR_0_OPCODE_X0:
-    case V2CMPLES_RRR_0_OPCODE_X0:
-    case V2CMPLEU_RRR_0_OPCODE_X0:
-    case V2CMPLTS_RRR_0_OPCODE_X0:
-    case V2CMPLTU_RRR_0_OPCODE_X0:
-    case V2CMPNE_RRR_0_OPCODE_X0:
     case V2DOTPA_RRR_0_OPCODE_X0:
     case V2DOTP_RRR_0_OPCODE_X0:
+    case V2INT_H_RRR_0_OPCODE_X0:
     case V2MAXS_RRR_0_OPCODE_X0:
     case V2MINS_RRR_0_OPCODE_X0:
     case V2MNZ_RRR_0_OPCODE_X0:
@@ -2062,16 +2216,14 @@  static void decode_rrr_0_opcode_x0(struct DisasContext *dc,
     case V2SHRS_RRR_0_OPCODE_X0:
     case V2SHRU_RRR_0_OPCODE_X0:
     case V2SUBSC_RRR_0_OPCODE_X0:
-    case V2SUB_RRR_0_OPCODE_X0:
     case V4ADDSC_RRR_0_OPCODE_X0:
-    case V4ADD_RRR_0_OPCODE_X0:
+    case V4INT_H_RRR_0_OPCODE_X0:
     case V4PACKSC_RRR_0_OPCODE_X0:
     case V4SHLSC_RRR_0_OPCODE_X0:
     case V4SHL_RRR_0_OPCODE_X0:
     case V4SHRS_RRR_0_OPCODE_X0:
     case V4SHRU_RRR_0_OPCODE_X0:
     case V4SUBSC_RRR_0_OPCODE_X0:
-    case V4SUB_RRR_0_OPCODE_X0:
     case V1DDOTPUA_RRR_0_OPCODE_X0:
     case V1DDOTPU_RRR_0_OPCODE_X0:
     case V1DOTPUA_RRR_0_OPCODE_X0:
@@ -2112,13 +2264,15 @@  static void decode_shift_opcode_x0(struct DisasContext *dc,
         gen_shruxi(dc, rdst, rsrc, shamt);
         return;
     case V1SHRUI_SHIFT_OPCODE_X0:
-        gen_v1shrui(dc, rdst, rsrc, shamt);
+        gen_vshrui(dc, rdst, rsrc, shamt, 1);
+        return;
+    case V2SHRUI_SHIFT_OPCODE_X0:
+        gen_vshrui(dc, rdst, rsrc, shamt, 2);
         return;
     case V1SHLI_SHIFT_OPCODE_X0:
     case V1SHRSI_SHIFT_OPCODE_X0:
     case V2SHLI_SHIFT_OPCODE_X0:
     case V2SHRSI_SHIFT_OPCODE_X0:
-    case V2SHRUI_SHIFT_OPCODE_X0:
         qemu_log_mask(LOG_UNIMP,
                       "UNIMP shift_opcode_x0, [" FMT64X "]\n", bundle);
         set_exception(dc, TILEGX_EXCP_OPCODE_UNIMPLEMENTED);
@@ -2242,14 +2396,29 @@  static void decode_imm8_opcode_x1(struct DisasContext *dc,
     case ST4_ADD_IMM8_OPCODE_X1:
         gen_st_add(dc, rsrc, rsrcb, dimm8, MO_LEUL, "st4_add");
         return;
+    case V1ADDI_IMM8_OPCODE_X1:
+        gen_vaddi(dc, rdst, rsrc, imm8, 1);
+        return;
     case V1CMPEQI_IMM8_OPCODE_X1:
-        gen_v1cmpi(dc, rdst, rsrc, imm8, TCG_COND_EQ, "v1cmpeqi");
+        gen_vcmpi(dc, rdst, rsrc, imm8, 1, TCG_COND_EQ, "v1cmpeqi");
         return;
     case V1CMPLTSI_IMM8_OPCODE_X1:
-        gen_v1cmpi(dc, rdst, rsrc, imm8, TCG_COND_LT, "v1cmpltsi");
+        gen_vcmpi(dc, rdst, rsrc, imm8, 1, TCG_COND_LT, "v1cmpltsi");
         return;
     case V1CMPLTUI_IMM8_OPCODE_X1:
-        gen_v1cmpi(dc, rdst, rsrc, imm8, TCG_COND_LTU, "v1cmpltui");
+        gen_vcmpi(dc, rdst, rsrc, imm8, 1, TCG_COND_LTU, "v1cmpltui");
+        return;
+    case V2ADDI_IMM8_OPCODE_X1:
+        gen_vaddi(dc, rdst, rsrc, imm8, 2);
+        return;
+    case V2CMPEQI_IMM8_OPCODE_X1:
+        gen_vcmpi(dc, rdst, rsrc, imm8, 2, TCG_COND_EQ, "v2cmpeqi");
+        return;
+    case V2CMPLTSI_IMM8_OPCODE_X1:
+        gen_vcmpi(dc, rdst, rsrc, imm8, 2, TCG_COND_LT, "v2cmpltsi");
+        return;
+    case V2CMPLTUI_IMM8_OPCODE_X1:
+        gen_vcmpi(dc, rdst, rsrc, imm8, 2, TCG_COND_LTU, "v2cmpltui");
         return;
     case XORI_IMM8_OPCODE_X1:
         gen_xori(dc, rdst, rsrc, imm8);
@@ -2266,13 +2435,8 @@  static void decode_imm8_opcode_x1(struct DisasContext *dc,
     case STNT2_ADD_IMM8_OPCODE_X1:
     case STNT4_ADD_IMM8_OPCODE_X1:
     case STNT_ADD_IMM8_OPCODE_X1:
-    case V1ADDI_IMM8_OPCODE_X1:
     case V1MAXUI_IMM8_OPCODE_X1:
     case V1MINUI_IMM8_OPCODE_X1:
-    case V2ADDI_IMM8_OPCODE_X1:
-    case V2CMPEQI_IMM8_OPCODE_X1:
-    case V2CMPLTSI_IMM8_OPCODE_X1:
-    case V2CMPLTUI_IMM8_OPCODE_X1:
     case V2MAXSI_IMM8_OPCODE_X1:
     case V2MINSI_IMM8_OPCODE_X1:
         qemu_log_mask(LOG_UNIMP,
@@ -2308,6 +2472,12 @@  static void decode_u_opcode_ex_x1(struct DisasContext *dc,
     uint8_t rdst = get_Dest_X1(bundle);
 
     switch (get_UnaryOpcodeExtension_X1(bundle)) {
+    case DRAIN_UNARY_OPCODE_X1:
+        if (!rdst && !rsrc) {
+            gen_drain(dc);
+            return;
+        }
+        break;
     case NOP_UNARY_OPCODE_X1:
     case FNOP_UNARY_OPCODE_X1:
         if (!rdst && !rsrc) {
@@ -2315,6 +2485,12 @@  static void decode_u_opcode_ex_x1(struct DisasContext *dc,
             return;
         }
         break;
+    case ICOH_UNARY_OPCODE_X1:
+        if (!rdst) {
+            gen_icoh(dc, rsrc);
+            return;
+        }
+        break;
     case JALRP_UNARY_OPCODE_X1:
     case JALR_UNARY_OPCODE_X1:
         if (!rdst) {
@@ -2381,12 +2557,10 @@  static void decode_u_opcode_ex_x1(struct DisasContext *dc,
             return;
         }
         break;
-    case DRAIN_UNARY_OPCODE_X1:
     case DTLBPR_UNARY_OPCODE_X1:
     case FINV_UNARY_OPCODE_X1:
     case FLUSHWB_UNARY_OPCODE_X1:
     case FLUSH_UNARY_OPCODE_X1:
-    case ICOH_UNARY_OPCODE_X1:
     case ILL_UNARY_OPCODE_X1:
     case INV_UNARY_OPCODE_X1:
     case LDNT1S_UNARY_OPCODE_X1:
@@ -2576,29 +2750,69 @@  static void decode_rrr_0_opcode_x1(struct DisasContext *dc,
         break;
     case UNARY_RRR_0_OPCODE_X1:
         return decode_u_opcode_ex_x1(dc, bundle);
-    case V1INT_L_RRR_0_OPCODE_X1:
-        gen_v1int_l(dc, rdst, rsrc, rsrcb);
-        return;
-    case V4INT_L_RRR_0_OPCODE_X1:
-        gen_v4int_l(dc, rdst, rsrc, rsrcb);
+    case V1ADD_RRR_0_OPCODE_X1:
+        gen_vadd(dc, rdst, rsrc, rsrcb, 1);
         return;
     case V1CMPEQ_RRR_0_OPCODE_X1:
-        gen_v1cmp(dc, rdst, rsrc, rsrcb, TCG_COND_EQ, "v1cmpeq");
+        gen_vcmp(dc, rdst, rsrc, rsrcb, 1, TCG_COND_EQ, "v1cmpeq");
         return;
     case V1CMPLES_RRR_0_OPCODE_X1:
-        gen_v1cmp(dc, rdst, rsrc, rsrcb, TCG_COND_LE, "v1cmples");
+        gen_vcmp(dc, rdst, rsrc, rsrcb, 1, TCG_COND_LE, "v1cmples");
         return;
     case V1CMPLEU_RRR_0_OPCODE_X1:
-        gen_v1cmp(dc, rdst, rsrc, rsrcb, TCG_COND_LEU, "v1cmpleu");
+        gen_vcmp(dc, rdst, rsrc, rsrcb, 1, TCG_COND_LEU, "v1cmpleu");
         return;
     case V1CMPLTS_RRR_0_OPCODE_X1:
-        gen_v1cmp(dc, rdst, rsrc, rsrcb, TCG_COND_LT, "v1cmplts");
+        gen_vcmp(dc, rdst, rsrc, rsrcb, 1, TCG_COND_LT, "v1cmplts");
         return;
     case V1CMPLTU_RRR_0_OPCODE_X1:
-        gen_v1cmp(dc, rdst, rsrc, rsrcb, TCG_COND_LTU, "v1cmpltu");
+        gen_vcmp(dc, rdst, rsrc, rsrcb, 1, TCG_COND_LTU, "v1cmpltu");
         return;
     case V1CMPNE_RRR_0_OPCODE_X1:
-        gen_v1cmp(dc, rdst, rsrc, rsrcb, TCG_COND_NE, "v1cmpne");
+        gen_vcmp(dc, rdst, rsrc, rsrcb, 1, TCG_COND_NE, "v1cmpne");
+        return;
+    case V1INT_L_RRR_0_OPCODE_X1:
+        gen_vint_l(dc, rdst, rsrc, rsrcb, 1);
+        return;
+    case V1SUB_RRR_0_OPCODE_X1:
+        gen_vsub(dc, rdst, rsrc, rsrcb, 1);
+        return;
+    case V2ADD_RRR_0_OPCODE_X1:
+        gen_vadd(dc, rdst, rsrc, rsrcb, 2);
+        return;
+    case V2CMPEQ_RRR_0_OPCODE_X1:
+        gen_vcmp(dc, rdst, rsrc, rsrcb, 2, TCG_COND_EQ, "v2cmpeq");
+        return;
+    case V2CMPLES_RRR_0_OPCODE_X1:
+        gen_vcmp(dc, rdst, rsrc, rsrcb, 2, TCG_COND_LE, "v2cmples");
+        return;
+    case V2CMPLEU_RRR_0_OPCODE_X1:
+        gen_vcmp(dc, rdst, rsrc, rsrcb, 2, TCG_COND_LEU, "v2cmpleu");
+        return;
+    case V2CMPLTS_RRR_0_OPCODE_X1:
+        gen_vcmp(dc, rdst, rsrc, rsrcb, 2, TCG_COND_LT, "v2cmplts");
+        return;
+    case V2CMPLTU_RRR_0_OPCODE_X1:
+        gen_vcmp(dc, rdst, rsrc, rsrcb, 2, TCG_COND_LTU, "v2cmpltu");
+        return;
+    case V2CMPNE_RRR_0_OPCODE_X1:
+        gen_vcmp(dc, rdst, rsrc, rsrcb, 2, TCG_COND_NE, "v2cmpne");
+        return;
+    case V2INT_L_RRR_0_OPCODE_X1:
+        gen_vint_l(dc, rdst, rsrc, rsrcb, 2);
+        return;
+    case V2SUB_RRR_0_OPCODE_X1:
+        gen_vsub(dc, rdst, rsrc, rsrcb, 2);
+        return;
+    case V4ADD_RRR_0_OPCODE_X1:
+        gen_vadd(dc, rdst, rsrc, rsrcb, 4);
+        return;
+    case V4INT_L_RRR_0_OPCODE_X1:
+        /* v4int_l is a little faster than generic vint_l */
+        gen_v4int_l(dc, rdst, rsrc, rsrcb);
+        return;
+    case V4SUB_RRR_0_OPCODE_X1:
+        gen_vsub(dc, rdst, rsrc, rsrcb, 4);
         return;
     case XOR_RRR_0_OPCODE_X1:
         gen_xor(dc, rdst, rsrc, rsrcb);
@@ -2613,10 +2827,8 @@  static void decode_rrr_0_opcode_x1(struct DisasContext *dc,
     case SUBXSC_RRR_0_OPCODE_X1:
     case V1INT_H_RRR_0_OPCODE_X1:
     case V2INT_H_RRR_0_OPCODE_X1:
-    case V2INT_L_RRR_0_OPCODE_X1:
     case V4INT_H_RRR_0_OPCODE_X1:
     case V1ADDUC_RRR_0_OPCODE_X1:
-    case V1ADD_RRR_0_OPCODE_X1:
     case V1MAXU_RRR_0_OPCODE_X1:
     case V1MINU_RRR_0_OPCODE_X1:
     case V1MNZ_RRR_0_OPCODE_X1:
@@ -2625,15 +2837,7 @@  static void decode_rrr_0_opcode_x1(struct DisasContext *dc,
     case V1SHRS_RRR_0_OPCODE_X1:
     case V1SHRU_RRR_0_OPCODE_X1:
     case V1SUBUC_RRR_0_OPCODE_X1:
-    case V1SUB_RRR_0_OPCODE_X1:
     case V2ADDSC_RRR_0_OPCODE_X1:
-    case V2ADD_RRR_0_OPCODE_X1:
-    case V2CMPEQ_RRR_0_OPCODE_X1:
-    case V2CMPLES_RRR_0_OPCODE_X1:
-    case V2CMPLEU_RRR_0_OPCODE_X1:
-    case V2CMPLTS_RRR_0_OPCODE_X1:
-    case V2CMPLTU_RRR_0_OPCODE_X1:
-    case V2CMPNE_RRR_0_OPCODE_X1:
     case V2MAXS_RRR_0_OPCODE_X1:
     case V2MINS_RRR_0_OPCODE_X1:
     case V2MNZ_RRR_0_OPCODE_X1:
@@ -2646,16 +2850,13 @@  static void decode_rrr_0_opcode_x1(struct DisasContext *dc,
     case V2SHRS_RRR_0_OPCODE_X1:
     case V2SHRU_RRR_0_OPCODE_X1:
     case V2SUBSC_RRR_0_OPCODE_X1:
-    case V2SUB_RRR_0_OPCODE_X1:
     case V4ADDSC_RRR_0_OPCODE_X1:
-    case V4ADD_RRR_0_OPCODE_X1:
     case V4PACKSC_RRR_0_OPCODE_X1:
     case V4SHLSC_RRR_0_OPCODE_X1:
     case V4SHL_RRR_0_OPCODE_X1:
     case V4SHRS_RRR_0_OPCODE_X1:
     case V4SHRU_RRR_0_OPCODE_X1:
     case V4SUBSC_RRR_0_OPCODE_X1:
-    case V4SUB_RRR_0_OPCODE_X1:
         break;
     default:
         g_assert_not_reached();
@@ -2692,13 +2893,15 @@  static void decode_shift_opcode_x1(struct DisasContext *dc,
         gen_shruxi(dc, rdst, rsrc, shamt);
         return;
     case V1SHRUI_SHIFT_OPCODE_X1:
-        gen_v1shrui(dc, rdst, rsrc, shamt);
+        gen_vshrui(dc, rdst, rsrc, shamt, 1);
+        return;
+    case V2SHRUI_SHIFT_OPCODE_X1:
+        gen_vshrui(dc, rdst, rsrc, shamt, 2);
         return;
     case V1SHLI_SHIFT_OPCODE_X1:
     case V1SHRSI_SHIFT_OPCODE_X1:
     case V2SHLI_SHIFT_OPCODE_X1:
     case V2SHRSI_SHIFT_OPCODE_X1:
-    case V2SHRUI_SHIFT_OPCODE_X1:
         qemu_log_mask(LOG_UNIMP,
                       "UNIMP shift_opcode_x1, [" FMT64X "]\n", bundle);
         set_exception(dc, TILEGX_EXCP_OPCODE_UNIMPLEMENTED);