Patchwork target-mips: Implement Loongson Multimedia Instructions

login
register
mail settings
Submitter Richard Henderson
Date Sept. 19, 2012, 4:59 a.m.
Message ID <1348030784-14178-1-git-send-email-rth@twiddle.net>
Download mbox | patch
Permalink /patch/184935/
State New
Headers show

Comments

Richard Henderson - Sept. 19, 2012, 4:59 a.m.
Implements all of the COP2 instructions except for the S<cond>
family of comparisons.  The documentation is unclear for those.

Signed-off-by: Richard Henderson <rth@twiddle.net>
---

It turns out that the previous patch was totally independent of the
other 6 patches in the previous series.  That said, I've pulled the
patch out to its own branch:

  git://repo.or.cz/qemu/rth.git rth/mips/lmi


r~


 target-mips/Makefile.objs |   2 +-
 target-mips/helper.h      |  59 ++++
 target-mips/lmi_helper.c  | 744 ++++++++++++++++++++++++++++++++++++++++++++++
 target-mips/translate.c   | 379 ++++++++++++++++++++++-
 4 files changed, 1180 insertions(+), 4 deletions(-)
 create mode 100644 target-mips/lmi_helper.c
Aurelien Jarno - Sept. 19, 2012, 7:51 p.m.
On Tue, Sep 18, 2012 at 09:59:44PM -0700, Richard Henderson wrote:
> Implements all of the COP2 instructions except for the S<cond>
> family of comparisons.  The documentation is unclear for those.
> 
> Signed-off-by: Richard Henderson <rth@twiddle.net>
> ---
> 
> It turns out that the previous patch was totally independent of the
> other 6 patches in the previous series.  That said, I've pulled the
> patch out to its own branch:
> 
>   git://repo.or.cz/qemu/rth.git rth/mips/lmi
> 
> 
> r~
> 
> 
>  target-mips/Makefile.objs |   2 +-
>  target-mips/helper.h      |  59 ++++
>  target-mips/lmi_helper.c  | 744 ++++++++++++++++++++++++++++++++++++++++++++++
>  target-mips/translate.c   | 379 ++++++++++++++++++++++-
>  4 files changed, 1180 insertions(+), 4 deletions(-)
>  create mode 100644 target-mips/lmi_helper.c
> 
> diff --git a/target-mips/Makefile.objs b/target-mips/Makefile.objs
> index ca20f21..3eeeeac 100644
> --- a/target-mips/Makefile.objs
> +++ b/target-mips/Makefile.objs
> @@ -1,2 +1,2 @@
> -obj-y += translate.o op_helper.o helper.o cpu.o
> +obj-y += translate.o op_helper.o lmi_helper.o helper.o cpu.o
>  obj-$(CONFIG_SOFTMMU) += machine.o
> diff --git a/target-mips/helper.h b/target-mips/helper.h
> index 109ac37..f35ed78 100644
> --- a/target-mips/helper.h
> +++ b/target-mips/helper.h
> @@ -303,4 +303,63 @@ DEF_HELPER_1(rdhwr_ccres, tl, env)
>  DEF_HELPER_2(pmon, void, env, int)
>  DEF_HELPER_1(wait, void, env)
>  
> +/* Loongson multimedia functions.  */
> +DEF_HELPER_FLAGS_2(paddsh, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
> +DEF_HELPER_FLAGS_2(paddush, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
> +DEF_HELPER_FLAGS_2(paddh, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
> +DEF_HELPER_FLAGS_2(paddw, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
> +DEF_HELPER_FLAGS_2(paddsb, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
> +DEF_HELPER_FLAGS_2(paddusb, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
> +DEF_HELPER_FLAGS_2(paddb, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
> +
> +DEF_HELPER_FLAGS_2(psubsh, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
> +DEF_HELPER_FLAGS_2(psubush, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
> +DEF_HELPER_FLAGS_2(psubh, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
> +DEF_HELPER_FLAGS_2(psubw, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
> +DEF_HELPER_FLAGS_2(psubsb, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
> +DEF_HELPER_FLAGS_2(psubusb, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
> +DEF_HELPER_FLAGS_2(psubb, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
> +
> +DEF_HELPER_FLAGS_2(pshufh, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
> +DEF_HELPER_FLAGS_2(packsswh, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
> +DEF_HELPER_FLAGS_2(packsshb, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
> +DEF_HELPER_FLAGS_2(packushb, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
> +
> +DEF_HELPER_FLAGS_2(punpcklhw, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
> +DEF_HELPER_FLAGS_2(punpckhhw, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
> +DEF_HELPER_FLAGS_2(punpcklbh, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
> +DEF_HELPER_FLAGS_2(punpckhbh, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
> +DEF_HELPER_FLAGS_2(punpcklwd, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
> +DEF_HELPER_FLAGS_2(punpckhwd, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
> +
> +DEF_HELPER_FLAGS_2(pavgh, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
> +DEF_HELPER_FLAGS_2(pavgb, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
> +DEF_HELPER_FLAGS_2(pmaxsh, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
> +DEF_HELPER_FLAGS_2(pminsh, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
> +DEF_HELPER_FLAGS_2(pmaxub, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
> +DEF_HELPER_FLAGS_2(pminub, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
> +
> +DEF_HELPER_FLAGS_2(pcmpeqw, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
> +DEF_HELPER_FLAGS_2(pcmpgtw, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
> +DEF_HELPER_FLAGS_2(pcmpeqh, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
> +DEF_HELPER_FLAGS_2(pcmpgth, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
> +DEF_HELPER_FLAGS_2(pcmpeqb, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
> +DEF_HELPER_FLAGS_2(pcmpgtb, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
> +
> +DEF_HELPER_FLAGS_2(psllw, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
> +DEF_HELPER_FLAGS_2(psllh, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
> +DEF_HELPER_FLAGS_2(psrlw, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
> +DEF_HELPER_FLAGS_2(psrlh, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
> +DEF_HELPER_FLAGS_2(psraw, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
> +DEF_HELPER_FLAGS_2(psrah, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
> +
> +DEF_HELPER_FLAGS_2(pmullh, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
> +DEF_HELPER_FLAGS_2(pmulhh, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
> +DEF_HELPER_FLAGS_2(pmulhuh, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
> +DEF_HELPER_FLAGS_2(pmaddhw, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
> +
> +DEF_HELPER_FLAGS_2(pasubub, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
> +DEF_HELPER_FLAGS_1(biadd, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64)
> +DEF_HELPER_FLAGS_1(pmovmskb, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64)
> +
>  #include "def-helper.h"
> diff --git a/target-mips/lmi_helper.c b/target-mips/lmi_helper.c
> new file mode 100644
> index 0000000..1b24353
> --- /dev/null
> +++ b/target-mips/lmi_helper.c
> @@ -0,0 +1,744 @@
> +/*
> + *  Loongson Multimedia Instruction emulation helpers for QEMU.
> + *
> + *  Copyright (c) 2011  Richard Henderson <rth@twiddle.net>
> + *
> + * This library is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2 of the License, or (at your option) any later version.
> + *
> + * This library is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with this library; if not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +#include "cpu.h"
> +#include "helper.h"
> +
> +/* If the byte ordering doesn't matter, i.e. all columns are treated
> +   identically, then this union can be used directly.  If byte ordering
> +   does matter, we generally ignore dumping to memory.  */
> +typedef union {
> +    uint8_t  ub[8];
> +    int8_t   sb[8];
> +    uint16_t uh[4];
> +    int16_t  sh[4];
> +    uint32_t uw[2];
> +    int32_t  sw[2];
> +    uint64_t d;
> +} LMIValue;
> +
> +/* Some byte ordering issues can be mitigated by XORing in the following.  */
> +#ifdef HOST_WORDS_BIGENDIAN
> +# define BYTE_ORDER_XOR(N) N
> +#else
> +# define BYTE_ORDER_XOR(N) 0
> +#endif
> +
> +#define SATSB(x)  ((x) < -0x80 ? -0x80 : (x) > 0x7f ? 0x7f : (x))
> +#define SATUB(x)  ((x) < 0 ? 0 : (x) > 0xff ? 0xff : (x))
> +/* NB: every SAT* macro evaluates its argument more than once.  */
> +#define SATSH(x)  ((x) < -0x8000 ? -0x8000 : (x) > 0x7fff ? 0x7fff : (x))
> +#define SATUH(x)  ((x) < 0 ? 0 : (x) > 0xffff ? 0xffff : (x))
> +/* Unsigned forms also clamp below at 0 so saturating subtracts don't wrap.  */
> +#define SATSW(x) \
> +    ((x) < -0x80000000ll ? -0x80000000ll : (x) > 0x7fffffff ? 0x7fffffff : (x))
> +#define SATUW(x)  ((x) < 0 ? 0 : (x) > 0xffffffffull ? 0xffffffffull : (x))
> +
> +uint64_t helper_paddsb(uint64_t fs, uint64_t ft)
> +{
> +    LMIValue vs, vt;
> +    unsigned int i;
> +
> +    vs.d = fs;
> +    vt.d = ft;
> +    for (i = 0; i < 8; ++i) {
> +        int r = vs.sb[i] + vt.sb[i];
> +        vs.sb[i] = SATSB(r);
> +    }
> +    return vs.d;
> +}
> +
> +uint64_t helper_paddusb(uint64_t fs, uint64_t ft)
> +{
> +    LMIValue vs, vt;
> +    unsigned int i;
> +
> +    vs.d = fs;
> +    vt.d = ft;
> +    for (i = 0; i < 8; ++i) {
> +        int r = vs.ub[i] + vt.ub[i];
> +        vs.ub[i] = SATUB(r);
> +    }
> +    return vs.d;
> +}
> +
> +uint64_t helper_paddsh(uint64_t fs, uint64_t ft)
> +{
> +    LMIValue vs, vt;
> +    unsigned int i;
> +
> +    vs.d = fs;
> +    vt.d = ft;
> +    for (i = 0; i < 4; ++i) {
> +        int r = vs.sh[i] + vt.sh[i];
> +        vs.sh[i] = SATSH(r);
> +    }
> +    return vs.d;
> +}
> +
> +uint64_t helper_paddush(uint64_t fs, uint64_t ft)
> +{
> +    LMIValue vs, vt;
> +    unsigned int i;
> +
> +    vs.d = fs;
> +    vt.d = ft;
> +    for (i = 0; i < 4; ++i) {
> +        int r = vs.uh[i] + vt.uh[i];
> +        vs.uh[i] = SATUH(r);
> +    }
> +    return vs.d;
> +}
> +
> +uint64_t helper_paddb(uint64_t fs, uint64_t ft)
> +{
> +    LMIValue vs, vt;
> +    unsigned int i;
> +
> +    vs.d = fs;
> +    vt.d = ft;
> +    for (i = 0; i < 8; ++i) {
> +        vs.ub[i] += vt.ub[i];
> +    }
> +    return vs.d;
> +}
> +
> +uint64_t helper_paddh(uint64_t fs, uint64_t ft)
> +{
> +    LMIValue vs, vt;
> +    unsigned int i;
> +
> +    vs.d = fs;
> +    vt.d = ft;
> +    for (i = 0; i < 4; ++i) {
> +        vs.uh[i] += vt.uh[i];
> +    }
> +    return vs.d;
> +}
> +
> +uint64_t helper_paddw(uint64_t fs, uint64_t ft)
> +{
> +    LMIValue vs, vt;
> +    unsigned int i;
> +
> +    vs.d = fs;
> +    vt.d = ft;
> +    for (i = 0; i < 2; ++i) {
> +        vs.uw[i] += vt.uw[i];
> +    }
> +    return vs.d;
> +}
> +
> +uint64_t helper_psubsb(uint64_t fs, uint64_t ft)
> +{
> +    LMIValue vs, vt;
> +    unsigned int i;
> +
> +    vs.d = fs;
> +    vt.d = ft;
> +    for (i = 0; i < 8; ++i) {
> +        int r = vs.sb[i] - vt.sb[i];
> +        vs.sb[i] = SATSB(r);
> +    }
> +    return vs.d;
> +}
> +
> +uint64_t helper_psubusb(uint64_t fs, uint64_t ft)
> +{
> +    LMIValue vs, vt;
> +    unsigned int i;
> +
> +    vs.d = fs;
> +    vt.d = ft;
> +    for (i = 0; i < 8; ++i) {
> +        int r = vs.ub[i] - vt.ub[i];
> +        vs.ub[i] = SATUB(r);
> +    }
> +    return vs.d;
> +}
> +
> +uint64_t helper_psubsh(uint64_t fs, uint64_t ft)
> +{
> +    LMIValue vs, vt;
> +    unsigned int i;
> +
> +    vs.d = fs;
> +    vt.d = ft;
> +    for (i = 0; i < 4; ++i) {
> +        int r = vs.sh[i] - vt.sh[i];
> +        vs.sh[i] = SATSH(r);
> +    }
> +    return vs.d;
> +}
> +
> +uint64_t helper_psubush(uint64_t fs, uint64_t ft)
> +{
> +    LMIValue vs, vt;
> +    unsigned int i;
> +
> +    vs.d = fs;
> +    vt.d = ft;
> +    for (i = 0; i < 4; ++i) {
> +        int r = vs.uh[i] - vt.uh[i];
> +        vs.uh[i] = SATUH(r);
> +    }
> +    return vs.d;
> +}
> +
> +uint64_t helper_psubb(uint64_t fs, uint64_t ft)
> +{
> +    LMIValue vs, vt;
> +    unsigned int i;
> +
> +    vs.d = fs;
> +    vt.d = ft;
> +    for (i = 0; i < 8; ++i) {
> +        vs.ub[i] -= vt.ub[i];
> +    }
> +    return vs.d;
> +}
> +
> +uint64_t helper_psubh(uint64_t fs, uint64_t ft)
> +{
> +    LMIValue vs, vt;
> +    unsigned int i;
> +
> +    vs.d = fs;
> +    vt.d = ft;
> +    for (i = 0; i < 4; ++i) {
> +        vs.uh[i] -= vt.uh[i];
> +    }
> +    return vs.d;
> +}
> +
> +uint64_t helper_psubw(uint64_t fs, uint64_t ft)
> +{
> +    LMIValue vs, vt;
> +    unsigned int i;
> +
> +    vs.d = fs;
> +    vt.d = ft;
> +    for (i = 0; i < 2; ++i) {
> +        vs.uw[i] -= vt.uw[i];
> +    }
> +    return vs.d;
> +}
> +
> +uint64_t helper_pshufh(uint64_t fs, uint64_t ft)
> +{
> +    unsigned host = BYTE_ORDER_XOR(3);
> +    LMIValue vd, vs;
> +    unsigned i;
> +
> +    vs.d = fs;
> +    vd.d = 0;
> +    for (i = 0; i < 4; i++, ft >>= 2) {
> +        vd.uh[i ^ host] = vs.uh[(ft & 3) ^ host];
> +    }
> +    return vd.d;
> +}
> +
> +uint64_t helper_packsswh(uint64_t fs, uint64_t ft)
> +{
> +    uint64_t fd = 0;
> +    int64_t tmp;
> +
> +    tmp = (int32_t)(fs >> 0);
> +    tmp = SATSH(tmp);
> +    fd |= (tmp & 0xffff) << 0;
> +
> +    tmp = (int32_t)(fs >> 32);
> +    tmp = SATSH(tmp);
> +    fd |= (tmp & 0xffff) << 16;
> +
> +    tmp = (int32_t)(ft >> 0);
> +    tmp = SATSH(tmp);
> +    fd |= (tmp & 0xffff) << 32;
> +
> +    tmp = (int32_t)(ft >> 32);
> +    tmp = SATSH(tmp);
> +    fd |= (tmp & 0xffff) << 48;
> +
> +    return fd;
> +}
> +
> +uint64_t helper_packsshb(uint64_t fs, uint64_t ft)
> +{
> +    uint64_t fd = 0;
> +    unsigned int i;
> +
> +    for (i = 0; i < 4; ++i) {
> +        int16_t tmp = fs >> (i * 16);
> +        tmp = SATSB(tmp);
> +        fd |= (uint64_t)(tmp & 0xff) << (i * 8);
> +    }
> +    for (i = 0; i < 4; ++i) {
> +        int16_t tmp = ft >> (i * 16);
> +        tmp = SATSB(tmp);
> +        fd |= (uint64_t)(tmp & 0xff) << (i * 8 + 32);
> +    }
> +
> +    return fd;
> +}
> +
> +uint64_t helper_packushb(uint64_t fs, uint64_t ft)
> +{
> +    uint64_t fd = 0;
> +    unsigned int i;
> +
> +    for (i = 0; i < 4; ++i) {
> +        int16_t tmp = fs >> (i * 16);
> +        tmp = SATUB(tmp);
> +        fd |= (uint64_t)(tmp & 0xff) << (i * 8);
> +    }
> +    for (i = 0; i < 4; ++i) {
> +        int16_t tmp = ft >> (i * 16);
> +        tmp = SATUB(tmp);
> +        fd |= (uint64_t)(tmp & 0xff) << (i * 8 + 32);
> +    }
> +
> +    return fd;
> +}
> +
> +uint64_t helper_punpcklwd(uint64_t fs, uint64_t ft)
> +{
> +    return (fs & 0xffffffff) | (ft << 32);
> +}
> +
> +uint64_t helper_punpckhwd(uint64_t fs, uint64_t ft)
> +{
> +    return (fs >> 32) | (ft & ~0xffffffffull);
> +}
> +
> +uint64_t helper_punpcklhw(uint64_t fs, uint64_t ft)
> +{
> +    unsigned host = BYTE_ORDER_XOR(3);
> +    LMIValue vd, vs, vt;
> +
> +    vs.d = fs;
> +    vt.d = ft;
> +    vd.uh[0 ^ host] = vs.uh[0 ^ host];
> +    vd.uh[1 ^ host] = vt.uh[0 ^ host];
> +    vd.uh[2 ^ host] = vs.uh[1 ^ host];
> +    vd.uh[3 ^ host] = vt.uh[1 ^ host];
> +
> +    return vd.d;
> +}
> +
> +uint64_t helper_punpckhhw(uint64_t fs, uint64_t ft)
> +{
> +    unsigned host = BYTE_ORDER_XOR(3);
> +    LMIValue vd, vs, vt;
> +
> +    vs.d = fs;
> +    vt.d = ft;
> +    vd.uh[0 ^ host] = vs.uh[2 ^ host];
> +    vd.uh[1 ^ host] = vt.uh[2 ^ host];
> +    vd.uh[2 ^ host] = vs.uh[3 ^ host];
> +    vd.uh[3 ^ host] = vt.uh[3 ^ host];
> +
> +    return vd.d;
> +}
> +
> +uint64_t helper_punpcklbh(uint64_t fs, uint64_t ft)
> +{
> +    unsigned host = BYTE_ORDER_XOR(7);
> +    LMIValue vd, vs, vt;
> +
> +    vs.d = fs;
> +    vt.d = ft;
> +    vd.ub[0 ^ host] = vs.ub[0 ^ host];
> +    vd.ub[1 ^ host] = vt.ub[0 ^ host];
> +    vd.ub[2 ^ host] = vs.ub[1 ^ host];
> +    vd.ub[3 ^ host] = vt.ub[1 ^ host];
> +    vd.ub[4 ^ host] = vs.ub[2 ^ host];
> +    vd.ub[5 ^ host] = vt.ub[2 ^ host];
> +    vd.ub[6 ^ host] = vs.ub[3 ^ host];
> +    vd.ub[7 ^ host] = vt.ub[3 ^ host];
> +
> +    return vd.d;
> +}
> +
> +uint64_t helper_punpckhbh(uint64_t fs, uint64_t ft)
> +{
> +    unsigned host = BYTE_ORDER_XOR(7);
> +    LMIValue vd, vs, vt;
> +
> +    vs.d = fs;
> +    vt.d = ft;
> +    vd.ub[0 ^ host] = vs.ub[4 ^ host];
> +    vd.ub[1 ^ host] = vt.ub[4 ^ host];
> +    vd.ub[2 ^ host] = vs.ub[5 ^ host];
> +    vd.ub[3 ^ host] = vt.ub[5 ^ host];
> +    vd.ub[4 ^ host] = vs.ub[6 ^ host];
> +    vd.ub[5 ^ host] = vt.ub[6 ^ host];
> +    vd.ub[6 ^ host] = vs.ub[7 ^ host];
> +    vd.ub[7 ^ host] = vt.ub[7 ^ host];
> +
> +    return vd.d;
> +}
> +
> +uint64_t helper_pavgh(uint64_t fs, uint64_t ft)
> +{
> +    LMIValue vs, vt;
> +    unsigned i;
> +
> +    vs.d = fs;
> +    vt.d = ft;
> +    for (i = 0; i < 4; i++) {
> +        vs.uh[i] = (vs.uh[i] + vt.uh[i] + 1) >> 1;
> +    }
> +    return vs.d;
> +}
> +
> +uint64_t helper_pavgb(uint64_t fs, uint64_t ft)
> +{
> +    LMIValue vs, vt;
> +    unsigned i;
> +
> +    vs.d = fs;
> +    vt.d = ft;
> +    for (i = 0; i < 8; i++) {
> +        vs.ub[i] = (vs.ub[i] + vt.ub[i] + 1) >> 1;
> +    }
> +    return vs.d;
> +}
> +
> +uint64_t helper_pmaxsh(uint64_t fs, uint64_t ft)
> +{
> +    LMIValue vs, vt;
> +    unsigned i;
> +
> +    vs.d = fs;
> +    vt.d = ft;
> +    for (i = 0; i < 4; i++) {
> +        vs.sh[i] = (vs.sh[i] >= vt.sh[i] ? vs.sh[i] : vt.sh[i]);
> +    }
> +    return vs.d;
> +}
> +
> +uint64_t helper_pminsh(uint64_t fs, uint64_t ft)
> +{
> +    LMIValue vs, vt;
> +    unsigned i;
> +
> +    vs.d = fs;
> +    vt.d = ft;
> +    for (i = 0; i < 4; i++) {
> +        vs.sh[i] = (vs.sh[i] <= vt.sh[i] ? vs.sh[i] : vt.sh[i]);
> +    }
> +    return vs.d;
> +}
> +
> +uint64_t helper_pmaxub(uint64_t fs, uint64_t ft)
> +{
> +    LMIValue vs, vt;
> +    unsigned i;
> +    /* Unsigned maximum of each of the eight byte lanes of fs and ft.  */
> +    vs.d = fs;
> +    vt.d = ft;
> +    for (i = 0; i < 8; i++) {   /* ub[] has 8 lanes, not 4 */
> +        vs.ub[i] = (vs.ub[i] >= vt.ub[i] ? vs.ub[i] : vt.ub[i]);
> +    }
> +    return vs.d;
> +}
> +
> +uint64_t helper_pminub(uint64_t fs, uint64_t ft)
> +{
> +    LMIValue vs, vt;
> +    unsigned i;
> +    /* Unsigned minimum of each of the eight byte lanes of fs and ft.  */
> +    vs.d = fs;
> +    vt.d = ft;
> +    for (i = 0; i < 8; i++) {   /* ub[] has 8 lanes, not 4 */
> +        vs.ub[i] = (vs.ub[i] <= vt.ub[i] ? vs.ub[i] : vt.ub[i]);
> +    }
> +    return vs.d;
> +}
> +
> +uint64_t helper_pcmpeqw(uint64_t fs, uint64_t ft)
> +{
> +    LMIValue vs, vt;
> +    unsigned i;
> +
> +    vs.d = fs;
> +    vt.d = ft;
> +    for (i = 0; i < 2; i++) {
> +        vs.uw[i] = -(vs.uw[i] == vt.uw[i]);
> +    }
> +    return vs.d;
> +}
> +
> +uint64_t helper_pcmpgtw(uint64_t fs, uint64_t ft)
> +{
> +    LMIValue vs, vt;
> +    unsigned i;
> +
> +    vs.d = fs;
> +    vt.d = ft;
> +    for (i = 0; i < 2; i++) {
> +        vs.uw[i] = -(vs.uw[i] > vt.uw[i]);
> +    }
> +    return vs.d;
> +}
> +
> +uint64_t helper_pcmpeqh(uint64_t fs, uint64_t ft)
> +{
> +    LMIValue vs, vt;
> +    unsigned i;
> +
> +    vs.d = fs;
> +    vt.d = ft;
> +    for (i = 0; i < 4; i++) {
> +        vs.uh[i] = -(vs.uh[i] == vt.uh[i]);
> +    }
> +    return vs.d;
> +}
> +
> +uint64_t helper_pcmpgth(uint64_t fs, uint64_t ft)
> +{
> +    LMIValue vs, vt;
> +    unsigned i;
> +
> +    vs.d = fs;
> +    vt.d = ft;
> +    for (i = 0; i < 4; i++) {
> +        vs.uh[i] = -(vs.uh[i] > vt.uh[i]);
> +    }
> +    return vs.d;
> +}
> +
> +uint64_t helper_pcmpeqb(uint64_t fs, uint64_t ft)
> +{
> +    LMIValue vs, vt;
> +    unsigned i;
> +
> +    vs.d = fs;
> +    vt.d = ft;
> +    for (i = 0; i < 8; i++) {
> +        vs.ub[i] = -(vs.ub[i] == vt.ub[i]);
> +    }
> +    return vs.d;
> +}
> +
> +uint64_t helper_pcmpgtb(uint64_t fs, uint64_t ft)
> +{
> +    LMIValue vs, vt;
> +    unsigned i;
> +
> +    vs.d = fs;
> +    vt.d = ft;
> +    for (i = 0; i < 8; i++) {
> +        vs.ub[i] = -(vs.ub[i] > vt.ub[i]);
> +    }
> +    return vs.d;
> +}
> +
> +uint64_t helper_psllw(uint64_t fs, uint64_t ft)
> +{
> +    LMIValue vs;
> +    unsigned i;
> +
> +    ft &= 0x7f;
> +    if (ft > 31) {
> +        return 0;
> +    }
> +    vs.d = fs;
> +    for (i = 0; i < 2; ++i) {
> +        vs.uw[i] <<= ft;
> +    }
> +    return vs.d;
> +}
> +
> +uint64_t helper_psrlw(uint64_t fs, uint64_t ft)
> +{
> +    LMIValue vs;
> +    unsigned i;
> +
> +    ft &= 0x7f;
> +    if (ft > 31) {
> +        return 0;
> +    }
> +    vs.d = fs;
> +    for (i = 0; i < 2; ++i) {
> +        vs.uw[i] >>= ft;
> +    }
> +    return vs.d;
> +}
> +
> +uint64_t helper_psraw(uint64_t fs, uint64_t ft)
> +{
> +    LMIValue vs;
> +    unsigned i;
> +
> +    ft &= 0x7f;
> +    if (ft > 31) {
> +        ft = 31;
> +    }
> +    vs.d = fs;
> +    for (i = 0; i < 2; ++i) {
> +        vs.sw[i] >>= ft;
> +    }
> +    return vs.d;
> +}
> +
> +uint64_t helper_psllh(uint64_t fs, uint64_t ft)
> +{
> +    LMIValue vs;
> +    unsigned i;
> +
> +    ft &= 0x7f;
> +    if (ft > 15) {
> +        return 0;
> +    }
> +    vs.d = fs;
> +    for (i = 0; i < 4; ++i) {
> +        vs.uh[i] <<= ft;
> +    }
> +    return vs.d;
> +}
> +
> +uint64_t helper_psrlh(uint64_t fs, uint64_t ft)
> +{
> +    LMIValue vs;
> +    unsigned i;
> +
> +    ft &= 0x7f;
> +    if (ft > 15) {
> +        return 0;
> +    }
> +    vs.d = fs;
> +    for (i = 0; i < 4; ++i) {
> +        vs.uh[i] >>= ft;
> +    }
> +    return vs.d;
> +}
> +
> +uint64_t helper_psrah(uint64_t fs, uint64_t ft)
> +{
> +    LMIValue vs;
> +    unsigned i;
> +
> +    ft &= 0x7f;
> +    if (ft > 15) {
> +        ft = 15;
> +    }
> +    vs.d = fs;
> +    for (i = 0; i < 4; ++i) {
> +        vs.sh[i] >>= ft;
> +    }
> +    return vs.d;
> +}
> +
> +uint64_t helper_pmullh(uint64_t fs, uint64_t ft)
> +{
> +    LMIValue vs, vt;
> +    unsigned i;
> +
> +    vs.d = fs;
> +    vt.d = ft;
> +    for (i = 0; i < 4; ++i) {
> +        vs.sh[i] *= vt.sh[i];
> +    }
> +    return vs.d;
> +}
> +
> +uint64_t helper_pmulhh(uint64_t fs, uint64_t ft)
> +{
> +    LMIValue vs, vt;
> +    unsigned i;
> +
> +    vs.d = fs;
> +    vt.d = ft;
> +    for (i = 0; i < 4; ++i) {
> +        int32_t r = vs.sh[i] * vt.sh[i];
> +        vs.sh[i] = r >> 16;
> +    }
> +    return vs.d;
> +}
> +
> +uint64_t helper_pmulhuh(uint64_t fs, uint64_t ft)
> +{
> +    LMIValue vs, vt;
> +    unsigned i;
> +    /* High 16 bits of each of the four unsigned 16x16-bit lane products.  */
> +    vs.d = fs;
> +    vt.d = ft;
> +    for (i = 0; i < 4; ++i) {
> +        uint32_t r = (uint32_t)vs.uh[i] * vt.uh[i]; /* avoid int overflow */
> +        vs.uh[i] = r >> 16;
> +    }
> +    return vs.d;
> +}
> +
> +uint64_t helper_pmaddhw(uint64_t fs, uint64_t ft)
> +{
> +    unsigned host = BYTE_ORDER_XOR(3);
> +    LMIValue vs, vt;
> +    uint32_t p0, p1;
> +
> +    vs.d = fs;
> +    vt.d = ft;
> +    p0  = vs.sh[0 ^ host] * vt.sh[0 ^ host];
> +    p0 += vs.sh[1 ^ host] * vt.sh[1 ^ host];
> +    p1  = vs.sh[2 ^ host] * vt.sh[2 ^ host];
> +    p1 += vs.sh[3 ^ host] * vt.sh[3 ^ host];
> +
> +    return ((uint64_t)p1 << 32) | p0;
> +}
> +
> +uint64_t helper_pasubub(uint64_t fs, uint64_t ft)
> +{
> +    LMIValue vs, vt;
> +    unsigned i;
> +
> +    vs.d = fs;
> +    vt.d = ft;
> +    for (i = 0; i < 8; ++i) {
> +        int r = vs.ub[i] - vt.ub[i];
> +        vs.ub[i] = (r < 0 ? -r : r);
> +    }
> +    return vs.d;
> +}
> +
> +uint64_t helper_biadd(uint64_t fs)
> +{
> +    unsigned i, fd;
> +
> +    for (i = fd = 0; i < 8; ++i) {
> +        fd += (fs >> (i * 8)) & 0xff;
> +    }
> +    return fd & 0xffff;
> +}
> +
> +uint64_t helper_pmovmskb(uint64_t fs)
> +{
> +    unsigned fd = 0;
> +
> +    fd |= ((fs >>  7) & 1) << 0;
> +    fd |= ((fs >> 15) & 1) << 1;
> +    fd |= ((fs >> 23) & 1) << 2;
> +    fd |= ((fs >> 31) & 1) << 3;
> +    fd |= ((fs >> 39) & 1) << 4;
> +    fd |= ((fs >> 47) & 1) << 5;
> +    fd |= ((fs >> 55) & 1) << 6;
> +    fd |= ((fs >> 63) & 1) << 7;
> +
> +    return fd & 0xff;
> +}
> diff --git a/target-mips/translate.c b/target-mips/translate.c
> index 52eeb2b..f61cc6e 100644
> --- a/target-mips/translate.c
> +++ b/target-mips/translate.c
> @@ -446,6 +446,103 @@ enum {
>      OPC_BC2     = (0x08 << 21) | OPC_CP2,
>  };
>  
> +#define MASK_LMI(op)  (MASK_OP_MAJOR(op) | (op & (0x1F << 21)) | (op & 0x1F))
> +
> +enum {
> +    OPC_PADDSH  = (24 << 21) | (0x00) | OPC_CP2,
> +    OPC_PADDUSH = (25 << 21) | (0x00) | OPC_CP2,
> +    OPC_PADDH   = (26 << 21) | (0x00) | OPC_CP2,
> +    OPC_PADDW   = (27 << 21) | (0x00) | OPC_CP2,
> +    OPC_PADDSB  = (28 << 21) | (0x00) | OPC_CP2,
> +    OPC_PADDUSB = (29 << 21) | (0x00) | OPC_CP2,
> +    OPC_PADDB   = (30 << 21) | (0x00) | OPC_CP2,
> +    OPC_PADDD   = (31 << 21) | (0x00) | OPC_CP2,
> +
> +    OPC_PSUBSH  = (24 << 21) | (0x01) | OPC_CP2,
> +    OPC_PSUBUSH = (25 << 21) | (0x01) | OPC_CP2,
> +    OPC_PSUBH   = (26 << 21) | (0x01) | OPC_CP2,
> +    OPC_PSUBW   = (27 << 21) | (0x01) | OPC_CP2,
> +    OPC_PSUBSB  = (28 << 21) | (0x01) | OPC_CP2,
> +    OPC_PSUBUSB = (29 << 21) | (0x01) | OPC_CP2,
> +    OPC_PSUBB   = (30 << 21) | (0x01) | OPC_CP2,
> +    OPC_PSUBD   = (31 << 21) | (0x01) | OPC_CP2,
> +
> +    OPC_PSHUFH   = (24 << 21) | (0x02) | OPC_CP2,
> +    OPC_PACKSSWH = (25 << 21) | (0x02) | OPC_CP2,
> +    OPC_PACKSSHB = (26 << 21) | (0x02) | OPC_CP2,
> +    OPC_PACKUSHB = (27 << 21) | (0x02) | OPC_CP2,
> +    OPC_XOR_CP2  = (28 << 21) | (0x02) | OPC_CP2,
> +    OPC_NOR_CP2  = (29 << 21) | (0x02) | OPC_CP2,
> +    OPC_AND_CP2  = (30 << 21) | (0x02) | OPC_CP2,
> +    OPC_PANDN    = (31 << 21) | (0x02) | OPC_CP2,
> +
> +    OPC_PUNPCKLHW = (24 << 21) | (0x03) | OPC_CP2,
> +    OPC_PUNPCKHHW = (25 << 21) | (0x03) | OPC_CP2,
> +    OPC_PUNPCKLBH = (26 << 21) | (0x03) | OPC_CP2,
> +    OPC_PUNPCKHBH = (27 << 21) | (0x03) | OPC_CP2,
> +    OPC_PINSRH_0  = (28 << 21) | (0x03) | OPC_CP2,
> +    OPC_PINSRH_1  = (29 << 21) | (0x03) | OPC_CP2,
> +    OPC_PINSRH_2  = (30 << 21) | (0x03) | OPC_CP2,
> +    OPC_PINSRH_3  = (31 << 21) | (0x03) | OPC_CP2,
> +
> +    OPC_PAVGH   = (24 << 21) | (0x08) | OPC_CP2,
> +    OPC_PAVGB   = (25 << 21) | (0x08) | OPC_CP2,
> +    OPC_PMAXSH  = (26 << 21) | (0x08) | OPC_CP2,
> +    OPC_PMINSH  = (27 << 21) | (0x08) | OPC_CP2,
> +    OPC_PMAXUB  = (28 << 21) | (0x08) | OPC_CP2,
> +    OPC_PMINUB  = (29 << 21) | (0x08) | OPC_CP2,
> +
> +    OPC_PCMPEQW = (24 << 21) | (0x09) | OPC_CP2,
> +    OPC_PCMPGTW = (25 << 21) | (0x09) | OPC_CP2,
> +    OPC_PCMPEQH = (26 << 21) | (0x09) | OPC_CP2,
> +    OPC_PCMPGTH = (27 << 21) | (0x09) | OPC_CP2,
> +    OPC_PCMPEQB = (28 << 21) | (0x09) | OPC_CP2,
> +    OPC_PCMPGTB = (29 << 21) | (0x09) | OPC_CP2,
> +
> +    OPC_PSLLW   = (24 << 21) | (0x0A) | OPC_CP2,
> +    OPC_PSLLH   = (25 << 21) | (0x0A) | OPC_CP2,
> +    OPC_PMULLH  = (26 << 21) | (0x0A) | OPC_CP2,
> +    OPC_PMULHH  = (27 << 21) | (0x0A) | OPC_CP2,
> +    OPC_PMULUW  = (28 << 21) | (0x0A) | OPC_CP2,
> +    OPC_PMULHUH = (29 << 21) | (0x0A) | OPC_CP2,
> +
> +    OPC_PSRLW     = (24 << 21) | (0x0B) | OPC_CP2,
> +    OPC_PSRLH     = (25 << 21) | (0x0B) | OPC_CP2,
> +    OPC_PSRAW     = (26 << 21) | (0x0B) | OPC_CP2,
> +    OPC_PSRAH     = (27 << 21) | (0x0B) | OPC_CP2,
> +    OPC_PUNPCKLWD = (28 << 21) | (0x0B) | OPC_CP2,
> +    OPC_PUNPCKHWD = (29 << 21) | (0x0B) | OPC_CP2,
> +
> +    OPC_ADDU_CP2 = (24 << 21) | (0x0C) | OPC_CP2,
> +    OPC_OR_CP2   = (25 << 21) | (0x0C) | OPC_CP2,
> +    OPC_ADD_CP2  = (26 << 21) | (0x0C) | OPC_CP2,
> +    OPC_DADD_CP2 = (27 << 21) | (0x0C) | OPC_CP2,
> +    OPC_SEQU_CP2 = (28 << 21) | (0x0C) | OPC_CP2,
> +    OPC_SEQ_CP2  = (29 << 21) | (0x0C) | OPC_CP2,
> +
> +    OPC_SUBU_CP2 = (24 << 21) | (0x0D) | OPC_CP2,
> +    OPC_PASUBUB  = (25 << 21) | (0x0D) | OPC_CP2,
> +    OPC_SUB_CP2  = (26 << 21) | (0x0D) | OPC_CP2,
> +    OPC_DSUB_CP2 = (27 << 21) | (0x0D) | OPC_CP2,
> +    OPC_SLTU_CP2 = (28 << 21) | (0x0D) | OPC_CP2,
> +    OPC_SLT_CP2  = (29 << 21) | (0x0D) | OPC_CP2,
> +
> +    OPC_SLL_CP2  = (24 << 21) | (0x0E) | OPC_CP2,
> +    OPC_DSLL_CP2 = (25 << 21) | (0x0E) | OPC_CP2,
> +    OPC_PEXTRH   = (26 << 21) | (0x0E) | OPC_CP2,
> +    OPC_PMADDHW  = (27 << 21) | (0x0E) | OPC_CP2,
> +    OPC_SLEU_CP2 = (28 << 21) | (0x0E) | OPC_CP2,
> +    OPC_SLE_CP2  = (29 << 21) | (0x0E) | OPC_CP2,
> +
> +    OPC_SRL_CP2  = (24 << 21) | (0x0F) | OPC_CP2,
> +    OPC_DSRL_CP2 = (25 << 21) | (0x0F) | OPC_CP2,
> +    OPC_SRA_CP2  = (26 << 21) | (0x0F) | OPC_CP2,
> +    OPC_DSRA_CP2 = (27 << 21) | (0x0F) | OPC_CP2,
> +    OPC_BIADD    = (28 << 21) | (0x0F) | OPC_CP2,
> +    OPC_PMOVMSKB = (29 << 21) | (0x0F) | OPC_CP2,
> +};
> +
> +
>  #define MASK_CP3(op)       MASK_OP_MAJOR(op) | (op & 0x3F)
>  
>  enum {
> @@ -2380,8 +2477,8 @@ static void gen_cl (DisasContext *ctx, uint32_t opc,
>  }
>  
>  /* Godson integer instructions */
> -static void gen_loongson_integer (DisasContext *ctx, uint32_t opc,
> -                                int rd, int rs, int rt)
> +static void gen_loongson_integer(DisasContext *ctx, uint32_t opc,
> +                                 int rd, int rs, int rt)
>  {
>      const char *opn = "loongson";
>      TCGv t0, t1;
> @@ -2594,6 +2691,278 @@ static void gen_loongson_integer (DisasContext *ctx, uint32_t opc,
>      tcg_temp_free(t1);
>  }
>  
> +/* Loongson multimedia instructions */
> +static void gen_loongson_multimedia(DisasContext *ctx, int rd, int rs, int rt)
> +{
> +    const char *opn = "loongson_cp2";
> +    uint32_t opc, shift_max;
> +    TCGv_i64 t0, t1;
> +
> +    opc = MASK_LMI(ctx->opcode);
> +    switch (opc) {
> +    case OPC_ADD_CP2:
> +    case OPC_SUB_CP2:
> +    case OPC_DADD_CP2:
> +    case OPC_DSUB_CP2:
> +        t0 = tcg_temp_local_new_i64();
> +        t1 = tcg_temp_local_new_i64();
> +        break;
> +    default:
> +        t0 = tcg_temp_new_i64();
> +        t1 = tcg_temp_new_i64();
> +        break;
> +    }
> +
> +    gen_load_fpr64(ctx, t0, rs);
> +    gen_load_fpr64(ctx, t1, rt);
> +
> +#define LMI_HELPER(UP, LO) \
> +    case OPC_##UP: gen_helper_##LO(t0, t0, t1); opn = #LO; break
> +#define LMI_HELPER_1(UP, LO) \
> +    case OPC_##UP: gen_helper_##LO(t0, t0); opn = #LO; break
> +#define LMI_DIRECT(UP, LO, OP) \
> +    case OPC_##UP: tcg_gen_##OP##_i64(t0, t0, t1); opn = #LO; break
> +
> +    switch (opc) {
> +    LMI_HELPER(PADDSH, paddsh);
> +    LMI_HELPER(PADDUSH, paddush);
> +    LMI_HELPER(PADDH, paddh);
> +    LMI_HELPER(PADDW, paddw);
> +    LMI_HELPER(PADDSB, paddsb);
> +    LMI_HELPER(PADDUSB, paddusb);
> +    LMI_HELPER(PADDB, paddb);
> +
> +    LMI_HELPER(PSUBSH, psubsh);
> +    LMI_HELPER(PSUBUSH, psubush);
> +    LMI_HELPER(PSUBH, psubh);
> +    LMI_HELPER(PSUBW, psubw);
> +    LMI_HELPER(PSUBSB, psubsb);
> +    LMI_HELPER(PSUBUSB, psubusb);
> +    LMI_HELPER(PSUBB, psubb);
> +
> +    LMI_HELPER(PSHUFH, pshufh);
> +    LMI_HELPER(PACKSSWH, packsswh);
> +    LMI_HELPER(PACKSSHB, packsshb);
> +    LMI_HELPER(PACKUSHB, packushb);
> +
> +    LMI_HELPER(PUNPCKLHW, punpcklhw);
> +    LMI_HELPER(PUNPCKHHW, punpckhhw);
> +    LMI_HELPER(PUNPCKLBH, punpcklbh);
> +    LMI_HELPER(PUNPCKHBH, punpckhbh);
> +    LMI_HELPER(PUNPCKLWD, punpcklwd);
> +    LMI_HELPER(PUNPCKHWD, punpckhwd);
> +
> +    LMI_HELPER(PAVGH, pavgh);
> +    LMI_HELPER(PAVGB, pavgb);
> +    LMI_HELPER(PMAXSH, pmaxsh);
> +    LMI_HELPER(PMINSH, pminsh);
> +    LMI_HELPER(PMAXUB, pmaxub);
> +    LMI_HELPER(PMINUB, pminub);
> +
> +    LMI_HELPER(PCMPEQW, pcmpeqw);
> +    LMI_HELPER(PCMPGTW, pcmpgtw);
> +    LMI_HELPER(PCMPEQH, pcmpeqh);
> +    LMI_HELPER(PCMPGTH, pcmpgth);
> +    LMI_HELPER(PCMPEQB, pcmpeqb);
> +    LMI_HELPER(PCMPGTB, pcmpgtb);
> +
> +    LMI_HELPER(PSLLW, psllw);
> +    LMI_HELPER(PSLLH, psllh);
> +    LMI_HELPER(PSRLW, psrlw);
> +    LMI_HELPER(PSRLH, psrlh);
> +    LMI_HELPER(PSRAW, psraw);
> +    LMI_HELPER(PSRAH, psrah);
> +
> +    LMI_HELPER(PMULLH, pmullh);
> +    LMI_HELPER(PMULHH, pmulhh);
> +    LMI_HELPER(PMULHUH, pmulhuh);
> +    LMI_HELPER(PMADDHW, pmaddhw);
> +
> +    LMI_HELPER(PASUBUB, pasubub);
> +    LMI_HELPER_1(BIADD, biadd);
> +    LMI_HELPER_1(PMOVMSKB, pmovmskb);
> +
> +    LMI_DIRECT(PADDD, paddd, add);
> +    LMI_DIRECT(PSUBD, psubd, sub);
> +    LMI_DIRECT(XOR_CP2, xor, xor);
> +    LMI_DIRECT(NOR_CP2, nor, nor);
> +    LMI_DIRECT(AND_CP2, and, and);
> +    LMI_DIRECT(PANDN, pandn, andc);
> +    LMI_DIRECT(OR, or, or);
> +
> +    case OPC_PINSRH_0:
> +        tcg_gen_deposit_i64(t0, t0, t1, 0, 16);
> +        opn = "pinsrh_0";
> +        break;
> +    case OPC_PINSRH_1:
> +        tcg_gen_deposit_i64(t0, t0, t1, 16, 16);
> +        opn = "pinsrh_1";
> +        break;
> +    case OPC_PINSRH_2:
> +        tcg_gen_deposit_i64(t0, t0, t1, 32, 16);
> +        opn = "pinsrh_2";
> +        break;
> +    case OPC_PINSRH_3:
> +        tcg_gen_deposit_i64(t0, t0, t1, 48, 16);
> +        opn = "pinsrh_3";
> +        break;
> +
> +    case OPC_PEXTRH:
> +        tcg_gen_andi_i64(t1, t1, 3);
> +        tcg_gen_shli_i64(t1, t1, 4);
> +        tcg_gen_shr_i64(t0, t0, t1);
> +        tcg_gen_ext16u_i64(t0, t0);
> +        opn = "pextrh";
> +        break;
> +
> +    case OPC_ADDU_CP2:
> +        tcg_gen_add_i64(t0, t0, t1);
> +        tcg_gen_ext32s_i64(t0, t0);
> +        opn = "addu";
> +        break;
> +    case OPC_SUBU_CP2:
> +        tcg_gen_sub_i64(t0, t0, t1);
> +        tcg_gen_ext32s_i64(t0, t0);
> +        opn = "addu";
> +        break;
> +
> +    case OPC_SLL_CP2:
> +        opn = "sll";
> +        shift_max = 32;
> +        goto do_shift;
> +    case OPC_SRL_CP2:
> +        opn = "srl";
> +        shift_max = 32;
> +        goto do_shift;
> +    case OPC_SRA_CP2:
> +        opn = "sra";
> +        shift_max = 32;
> +        goto do_shift;
> +    case OPC_DSLL_CP2:
> +        opn = "dsll";
> +        shift_max = 64;
> +        goto do_shift;
> +    case OPC_DSRL_CP2:
> +        opn = "dsrl";
> +        shift_max = 64;
> +        goto do_shift;
> +    case OPC_DSRA_CP2:
> +        opn = "dsra";
> +        shift_max = 64;
> +        goto do_shift;
> +    do_shift:
> +        /* Make sure shift count isn't TCG undefined behaviour.  */
> +        tcg_gen_andi_i64(t1, t1, shift_max - 1);
> +
> +        switch (opc) {
> +        case OPC_SLL_CP2:
> +        case OPC_DSLL_CP2:
> +            tcg_gen_shl_i64(t0, t0, t1);
> +            break;
> +        case OPC_SRA_CP2:
> +        case OPC_DSRA_CP2:
> +            /* Since SRA is UndefinedResult without sign-extended inputs,
> +               we can treat SRA and DSRA the same.  */
> +            tcg_gen_sar_i64(t0, t0, t1);
> +            break;
> +        case OPC_SRL_CP2:
> +            /* We want to shift in zeros for SRL; zero-extend first.  */
> +            tcg_gen_ext32u_i64(t0, t0);
> +            /* FALLTHRU */
> +        case OPC_DSRL_CP2:
> +            tcg_gen_shr_i64(t0, t0, t1);
> +            break;
> +        }
> +
> +        if (shift_max == 32) {
> +            tcg_gen_ext32s_i64(t0, t0);
> +        }
> +
> +        /* Shifts larger than MAX produce zero.  */
> +        tcg_gen_setcondi_i64(TCG_COND_LTU, t1, t1, shift_max);
> +        tcg_gen_neg_i64(t1, t1);
> +        tcg_gen_and_i64(t0, t0, t1);
> +        break;
> +
> +    case OPC_ADD_CP2:
> +    case OPC_DADD_CP2:
> +        {
> +            TCGv_i64 t2 = tcg_temp_new_i64();
> +            int lab = gen_new_label();
> +
> +            tcg_gen_mov_i64(t2, t0);
> +            tcg_gen_add_i64(t0, t1, t2);
> +            if (opc == OPC_ADD_CP2) {
> +                tcg_gen_ext32s_i64(t0, t0);
> +            }
> +            tcg_gen_xor_i64(t1, t1, t2);
> +            tcg_gen_xor_i64(t2, t2, t0);
> +            tcg_gen_andc_i64(t1, t2, t1);
> +            tcg_temp_free_i64(t2);
> +            tcg_gen_brcondi_i64(TCG_COND_GE, t1, 0, lab);
> +            generate_exception(ctx, EXCP_OVERFLOW);
> +            gen_set_label(lab);
> +
> +            opn = (opc == OPC_ADD_CP2 ? "add" : "dadd");
> +            break;
> +        }
> +
> +    case OPC_SUB_CP2:
> +    case OPC_DSUB_CP2:
> +        {
> +            TCGv_i64 t2 = tcg_temp_new_i64();
> +            int lab = gen_new_label();
> +
> +            tcg_gen_mov_i64(t2, t0);
> +            tcg_gen_sub_i64(t0, t1, t2);
> +            if (opc == OPC_SUB_CP2) {
> +                tcg_gen_ext32s_i64(t0, t0);
> +            }
> +            tcg_gen_xor_i64(t1, t1, t2);
> +            tcg_gen_xor_i64(t2, t2, t0);
> +            tcg_gen_and_i64(t1, t1, t2);
> +            tcg_temp_free_i64(t2);
> +            tcg_gen_brcondi_i64(TCG_COND_GE, t1, 0, lab);
> +            generate_exception(ctx, EXCP_OVERFLOW);
> +            gen_set_label(lab);
> +
> +            opn = (opc == OPC_SUB_CP2 ? "sub" : "dsub");
> +            break;
> +        }
> +
> +    case OPC_PMULUW:
> +        tcg_gen_ext32u_i64(t0, t0);
> +        tcg_gen_ext32u_i64(t1, t1);
> +        tcg_gen_mul_i64(t0, t0, t1);
> +        opn = "pmuluw";
> +        break;
> +
> +    case OPC_SEQU_CP2:
> +    case OPC_SEQ_CP2:
> +    case OPC_SLTU_CP2:
> +    case OPC_SLT_CP2:
> +    case OPC_SLEU_CP2:
> +    case OPC_SLE_CP2:
> +        /* ??? Document is unclear: Set FCC[CC].  Does that mean the
> +           FD field is the CC field?  */
> +    default:
> +        MIPS_INVAL(opn);
> +        generate_exception(ctx, EXCP_RI);
> +        return;
> +    }
> +
> +#undef LMI_HELPER
> +#undef LMI_DIRECT
> +
> +    gen_store_fpr64(ctx, t0, rd);
> +
> +    (void)opn; /* avoid a compiler warning */
> +    MIPS_DEBUG("%s %s, %s, %s", opn,
> +               fregnames[rd], fregnames[rs], fregnames[rt]);
> +    tcg_temp_free_i64(t0);
> +    tcg_temp_free_i64(t1);
> +}
> +
>  /* Traps */
>  static void gen_trap (DisasContext *ctx, uint32_t opc,
>                        int rs, int rt, int16_t imm)
> @@ -12316,10 +12685,14 @@ static void decode_opc (CPUMIPSState *env, DisasContext *ctx, int *is_branch)
>      case OPC_LDC2:
>      case OPC_SWC2:
>      case OPC_SDC2:
> -    case OPC_CP2:
>          /* COP2: Not implemented. */
>          generate_exception_err(ctx, EXCP_CpU, 2);
>          break;
> +    case OPC_CP2:
> +        check_insn(env, ctx, INSN_LOONGSON2F);
> +        /* Note that these instructions use different fields.  */
> +        gen_loongson_multimedia(ctx, sa, rd, rt);
> +        break;
>  
>      case OPC_CP3:
>          if (env->CP0_Config1 & (1 << CP0C1_FP)) {

Thanks, applied.

Patch

diff --git a/target-mips/Makefile.objs b/target-mips/Makefile.objs
index ca20f21..3eeeeac 100644
--- a/target-mips/Makefile.objs
+++ b/target-mips/Makefile.objs
@@ -1,2 +1,2 @@ 
-obj-y += translate.o op_helper.o helper.o cpu.o
+obj-y += translate.o op_helper.o lmi_helper.o helper.o cpu.o
 obj-$(CONFIG_SOFTMMU) += machine.o
diff --git a/target-mips/helper.h b/target-mips/helper.h
index 109ac37..f35ed78 100644
--- a/target-mips/helper.h
+++ b/target-mips/helper.h
@@ -303,4 +303,63 @@  DEF_HELPER_1(rdhwr_ccres, tl, env)
 DEF_HELPER_2(pmon, void, env, int)
 DEF_HELPER_1(wait, void, env)
 
+/* Loongson multimedia functions.  */
+DEF_HELPER_FLAGS_2(paddsh, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+DEF_HELPER_FLAGS_2(paddush, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+DEF_HELPER_FLAGS_2(paddh, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+DEF_HELPER_FLAGS_2(paddw, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+DEF_HELPER_FLAGS_2(paddsb, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+DEF_HELPER_FLAGS_2(paddusb, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+DEF_HELPER_FLAGS_2(paddb, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+
+DEF_HELPER_FLAGS_2(psubsh, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+DEF_HELPER_FLAGS_2(psubush, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+DEF_HELPER_FLAGS_2(psubh, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+DEF_HELPER_FLAGS_2(psubw, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+DEF_HELPER_FLAGS_2(psubsb, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+DEF_HELPER_FLAGS_2(psubusb, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+DEF_HELPER_FLAGS_2(psubb, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+
+DEF_HELPER_FLAGS_2(pshufh, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+DEF_HELPER_FLAGS_2(packsswh, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+DEF_HELPER_FLAGS_2(packsshb, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+DEF_HELPER_FLAGS_2(packushb, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+
+DEF_HELPER_FLAGS_2(punpcklhw, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+DEF_HELPER_FLAGS_2(punpckhhw, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+DEF_HELPER_FLAGS_2(punpcklbh, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+DEF_HELPER_FLAGS_2(punpckhbh, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+DEF_HELPER_FLAGS_2(punpcklwd, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+DEF_HELPER_FLAGS_2(punpckhwd, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+
+DEF_HELPER_FLAGS_2(pavgh, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+DEF_HELPER_FLAGS_2(pavgb, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+DEF_HELPER_FLAGS_2(pmaxsh, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+DEF_HELPER_FLAGS_2(pminsh, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+DEF_HELPER_FLAGS_2(pmaxub, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+DEF_HELPER_FLAGS_2(pminub, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+
+DEF_HELPER_FLAGS_2(pcmpeqw, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+DEF_HELPER_FLAGS_2(pcmpgtw, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+DEF_HELPER_FLAGS_2(pcmpeqh, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+DEF_HELPER_FLAGS_2(pcmpgth, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+DEF_HELPER_FLAGS_2(pcmpeqb, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+DEF_HELPER_FLAGS_2(pcmpgtb, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+
+DEF_HELPER_FLAGS_2(psllw, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+DEF_HELPER_FLAGS_2(psllh, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+DEF_HELPER_FLAGS_2(psrlw, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+DEF_HELPER_FLAGS_2(psrlh, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+DEF_HELPER_FLAGS_2(psraw, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+DEF_HELPER_FLAGS_2(psrah, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+
+DEF_HELPER_FLAGS_2(pmullh, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+DEF_HELPER_FLAGS_2(pmulhh, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+DEF_HELPER_FLAGS_2(pmulhuh, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+DEF_HELPER_FLAGS_2(pmaddhw, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+
+DEF_HELPER_FLAGS_2(pasubub, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+DEF_HELPER_FLAGS_1(biadd, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64)
+DEF_HELPER_FLAGS_1(pmovmskb, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64)
+
 #include "def-helper.h"
diff --git a/target-mips/lmi_helper.c b/target-mips/lmi_helper.c
new file mode 100644
index 0000000..1b24353
--- /dev/null
+++ b/target-mips/lmi_helper.c
@@ -0,0 +1,744 @@ 
+/*
+ *  Loongson Multimedia Instruction emulation helpers for QEMU.
+ *
+ *  Copyright (c) 2011  Richard Henderson <rth@twiddle.net>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "cpu.h"
+#include "helper.h"
+
+/* If the byte ordering doesn't matter, i.e. all columns are treated
+   identically, then this union can be used directly.  If byte ordering
+   does matter, we generally ignore dumping to memory.  */
+typedef union {
+    uint8_t  ub[8];
+    int8_t   sb[8];
+    uint16_t uh[4];
+    int16_t  sh[4];
+    uint32_t uw[2];
+    int32_t  sw[2];
+    uint64_t d;
+} LMIValue;
+
+/* Some byte ordering issues can be mitigated by XORing in the following.  */
+#ifdef HOST_WORDS_BIGENDIAN
+# define BYTE_ORDER_XOR(N) N
+#else
+# define BYTE_ORDER_XOR(N) 0
+#endif
+
+#define SATSB(x)  (x < -0x80 ? -0x80 : x > 0x7f ? 0x7f : x)
+#define SATUB(x)  (x > 0xff ? 0xff : x)
+
+#define SATSH(x)  (x < -0x8000 ? -0x8000 : x > 0x7fff ? 0x7fff : x)
+#define SATUH(x)  (x > 0xffff ? 0xffff : x)
+
+#define SATSW(x) \
+    (x < -0x80000000ll ? -0x80000000ll : x > 0x7fffffff ? 0x7fffffff : x)
+#define SATUW(x)  (x > 0xffffffffull ? 0xffffffffull : x)
+
+uint64_t helper_paddsb(uint64_t fs, uint64_t ft)
+{
+    LMIValue vs, vt;
+    unsigned int i;
+
+    vs.d = fs;
+    vt.d = ft;
+    for (i = 0; i < 8; ++i) {
+        int r = vs.sb[i] + vt.sb[i];
+        vs.sb[i] = SATSB(r);
+    }
+    return vs.d;
+}
+
+uint64_t helper_paddusb(uint64_t fs, uint64_t ft)
+{
+    LMIValue vs, vt;
+    unsigned int i;
+
+    vs.d = fs;
+    vt.d = ft;
+    for (i = 0; i < 8; ++i) {
+        int r = vs.ub[i] + vt.ub[i];
+        vs.ub[i] = SATUB(r);
+    }
+    return vs.d;
+}
+
+uint64_t helper_paddsh(uint64_t fs, uint64_t ft)
+{
+    LMIValue vs, vt;
+    unsigned int i;
+
+    vs.d = fs;
+    vt.d = ft;
+    for (i = 0; i < 4; ++i) {
+        int r = vs.sh[i] + vt.sh[i];
+        vs.sh[i] = SATSH(r);
+    }
+    return vs.d;
+}
+
+uint64_t helper_paddush(uint64_t fs, uint64_t ft)
+{
+    LMIValue vs, vt;
+    unsigned int i;
+
+    vs.d = fs;
+    vt.d = ft;
+    for (i = 0; i < 4; ++i) {
+        int r = vs.uh[i] + vt.uh[i];
+        vs.uh[i] = SATUH(r);
+    }
+    return vs.d;
+}
+
+uint64_t helper_paddb(uint64_t fs, uint64_t ft)
+{
+    LMIValue vs, vt;
+    unsigned int i;
+
+    vs.d = fs;
+    vt.d = ft;
+    for (i = 0; i < 8; ++i) {
+        vs.ub[i] += vt.ub[i];
+    }
+    return vs.d;
+}
+
+uint64_t helper_paddh(uint64_t fs, uint64_t ft)
+{
+    LMIValue vs, vt;
+    unsigned int i;
+
+    vs.d = fs;
+    vt.d = ft;
+    for (i = 0; i < 4; ++i) {
+        vs.uh[i] += vt.uh[i];
+    }
+    return vs.d;
+}
+
+uint64_t helper_paddw(uint64_t fs, uint64_t ft)
+{
+    LMIValue vs, vt;
+    unsigned int i;
+
+    vs.d = fs;
+    vt.d = ft;
+    for (i = 0; i < 2; ++i) {
+        vs.uw[i] += vt.uw[i];
+    }
+    return vs.d;
+}
+
+uint64_t helper_psubsb(uint64_t fs, uint64_t ft)
+{
+    LMIValue vs, vt;
+    unsigned int i;
+
+    vs.d = fs;
+    vt.d = ft;
+    for (i = 0; i < 8; ++i) {
+        int r = vs.sb[i] - vt.sb[i];
+        vs.sb[i] = SATSB(r);
+    }
+    return vs.d;
+}
+
+uint64_t helper_psubusb(uint64_t fs, uint64_t ft)
+{
+    LMIValue vs, vt;
+    unsigned int i;
+
+    vs.d = fs;
+    vt.d = ft;
+    for (i = 0; i < 8; ++i) {
+        int r = vs.ub[i] - vt.ub[i];    /* difference lies in -0xff..0xff */
+        vs.ub[i] = (r < 0 ? 0 : r);     /* unsigned saturation: floor at 0 */
+    }
+    return vs.d;
+}
+
+uint64_t helper_psubsh(uint64_t fs, uint64_t ft)
+{
+    LMIValue vs, vt;
+    unsigned int i;
+
+    vs.d = fs;
+    vt.d = ft;
+    for (i = 0; i < 4; ++i) {
+        int r = vs.sh[i] - vt.sh[i];
+        vs.sh[i] = SATSH(r);
+    }
+    return vs.d;
+}
+
+uint64_t helper_psubush(uint64_t fs, uint64_t ft)
+{
+    LMIValue vs, vt;
+    unsigned int i;
+
+    vs.d = fs;
+    vt.d = ft;
+    for (i = 0; i < 4; ++i) {
+        int r = vs.uh[i] - vt.uh[i];    /* difference lies in -0xffff..0xffff */
+        vs.uh[i] = (r < 0 ? 0 : r);     /* unsigned saturation: floor at 0 */
+    }
+    return vs.d;
+}
+
+uint64_t helper_psubb(uint64_t fs, uint64_t ft)
+{
+    LMIValue vs, vt;
+    unsigned int i;
+
+    vs.d = fs;
+    vt.d = ft;
+    for (i = 0; i < 8; ++i) {
+        vs.ub[i] -= vt.ub[i];
+    }
+    return vs.d;
+}
+
+uint64_t helper_psubh(uint64_t fs, uint64_t ft)
+{
+    LMIValue vs, vt;
+    unsigned int i;
+
+    vs.d = fs;
+    vt.d = ft;
+    for (i = 0; i < 4; ++i) {
+        vs.uh[i] -= vt.uh[i];
+    }
+    return vs.d;
+}
+
+uint64_t helper_psubw(uint64_t fs, uint64_t ft)
+{
+    LMIValue vs, vt;
+    unsigned int i;
+
+    vs.d = fs;
+    vt.d = ft;
+    for (i = 0; i < 2; ++i) {
+        vs.uw[i] -= vt.uw[i];
+    }
+    return vs.d;
+}
+
+uint64_t helper_pshufh(uint64_t fs, uint64_t ft)
+{
+    unsigned host = BYTE_ORDER_XOR(3);
+    LMIValue vd, vs;
+    unsigned i;
+
+    vs.d = fs;
+    vd.d = 0;
+    for (i = 0; i < 4; i++, ft >>= 2) {
+        vd.uh[i ^ host] = vs.uh[(ft & 3) ^ host];
+    }
+    return vd.d;
+}
+
+uint64_t helper_packsswh(uint64_t fs, uint64_t ft)
+{
+    uint64_t fd = 0;
+    int64_t tmp;
+
+    tmp = (int32_t)(fs >> 0);
+    tmp = SATSH(tmp);
+    fd |= (tmp & 0xffff) << 0;
+
+    tmp = (int32_t)(fs >> 32);
+    tmp = SATSH(tmp);
+    fd |= (tmp & 0xffff) << 16;
+
+    tmp = (int32_t)(ft >> 0);
+    tmp = SATSH(tmp);
+    fd |= (tmp & 0xffff) << 32;
+
+    tmp = (int32_t)(ft >> 32);
+    tmp = SATSH(tmp);
+    fd |= (tmp & 0xffff) << 48;
+
+    return fd;
+}
+
+uint64_t helper_packsshb(uint64_t fs, uint64_t ft)
+{
+    uint64_t fd = 0;
+    unsigned int i;
+
+    for (i = 0; i < 4; ++i) {
+        int16_t tmp = fs >> (i * 16);
+        tmp = SATSB(tmp);
+        fd |= (uint64_t)(tmp & 0xff) << (i * 8);
+    }
+    for (i = 0; i < 4; ++i) {
+        int16_t tmp = ft >> (i * 16);
+        tmp = SATSB(tmp);
+        fd |= (uint64_t)(tmp & 0xff) << (i * 8 + 32);
+    }
+
+    return fd;
+}
+
+uint64_t helper_packushb(uint64_t fs, uint64_t ft)
+{
+    uint64_t fd = 0;
+    unsigned int i;
+
+    for (i = 0; i < 4; ++i) {
+        int16_t tmp = fs >> (i * 16);       /* signed halfword lane of fs */
+        tmp = (tmp < 0 ? 0 : SATUB(tmp));   /* clamp to 0..0xff */
+        fd |= (uint64_t)(tmp & 0xff) << (i * 8);
+    }
+    for (i = 0; i < 4; ++i) {
+        int16_t tmp = ft >> (i * 16);       /* signed halfword lane of ft */
+        tmp = (tmp < 0 ? 0 : SATUB(tmp));   /* clamp to 0..0xff */
+        fd |= (uint64_t)(tmp & 0xff) << (i * 8 + 32);
+    }
+
+    return fd;
+}
+
+uint64_t helper_punpcklwd(uint64_t fs, uint64_t ft)
+{
+    return (fs & 0xffffffff) | (ft << 32);
+}
+
+uint64_t helper_punpckhwd(uint64_t fs, uint64_t ft)
+{
+    return (fs >> 32) | (ft & ~0xffffffffull);
+}
+
+uint64_t helper_punpcklhw(uint64_t fs, uint64_t ft)
+{
+    unsigned host = BYTE_ORDER_XOR(3);
+    LMIValue vd, vs, vt;
+
+    vs.d = fs;
+    vt.d = ft;
+    vd.uh[0 ^ host] = vs.uh[0 ^ host];
+    vd.uh[1 ^ host] = vt.uh[0 ^ host];
+    vd.uh[2 ^ host] = vs.uh[1 ^ host];
+    vd.uh[3 ^ host] = vt.uh[1 ^ host];
+
+    return vd.d;
+}
+
+uint64_t helper_punpckhhw(uint64_t fs, uint64_t ft)
+{
+    unsigned host = BYTE_ORDER_XOR(3);
+    LMIValue vd, vs, vt;
+
+    vs.d = fs;
+    vt.d = ft;
+    vd.uh[0 ^ host] = vs.uh[2 ^ host];
+    vd.uh[1 ^ host] = vt.uh[2 ^ host];
+    vd.uh[2 ^ host] = vs.uh[3 ^ host];
+    vd.uh[3 ^ host] = vt.uh[3 ^ host];
+
+    return vd.d;
+}
+
+uint64_t helper_punpcklbh(uint64_t fs, uint64_t ft)
+{
+    unsigned host = BYTE_ORDER_XOR(7);
+    LMIValue vd, vs, vt;
+
+    vs.d = fs;
+    vt.d = ft;
+    vd.ub[0 ^ host] = vs.ub[0 ^ host];
+    vd.ub[1 ^ host] = vt.ub[0 ^ host];
+    vd.ub[2 ^ host] = vs.ub[1 ^ host];
+    vd.ub[3 ^ host] = vt.ub[1 ^ host];
+    vd.ub[4 ^ host] = vs.ub[2 ^ host];
+    vd.ub[5 ^ host] = vt.ub[2 ^ host];
+    vd.ub[6 ^ host] = vs.ub[3 ^ host];
+    vd.ub[7 ^ host] = vt.ub[3 ^ host];
+
+    return vd.d;
+}
+
+uint64_t helper_punpckhbh(uint64_t fs, uint64_t ft)
+{
+    unsigned host = BYTE_ORDER_XOR(7);
+    LMIValue vd, vs, vt;
+
+    vs.d = fs;
+    vt.d = ft;
+    vd.ub[0 ^ host] = vs.ub[4 ^ host];
+    vd.ub[1 ^ host] = vt.ub[4 ^ host];
+    vd.ub[2 ^ host] = vs.ub[5 ^ host];
+    vd.ub[3 ^ host] = vt.ub[5 ^ host];
+    vd.ub[4 ^ host] = vs.ub[6 ^ host];
+    vd.ub[5 ^ host] = vt.ub[6 ^ host];
+    vd.ub[6 ^ host] = vs.ub[7 ^ host];
+    vd.ub[7 ^ host] = vt.ub[7 ^ host];
+
+    return vd.d;
+}
+
+uint64_t helper_pavgh(uint64_t fs, uint64_t ft)
+{
+    LMIValue vs, vt;
+    unsigned i;
+
+    vs.d = fs;
+    vt.d = ft;
+    for (i = 0; i < 4; i++) {
+        vs.uh[i] = (vs.uh[i] + vt.uh[i] + 1) >> 1;
+    }
+    return vs.d;
+}
+
+uint64_t helper_pavgb(uint64_t fs, uint64_t ft)
+{
+    LMIValue vs, vt;
+    unsigned i;
+
+    vs.d = fs;
+    vt.d = ft;
+    for (i = 0; i < 8; i++) {
+        vs.ub[i] = (vs.ub[i] + vt.ub[i] + 1) >> 1;
+    }
+    return vs.d;
+}
+
+uint64_t helper_pmaxsh(uint64_t fs, uint64_t ft)
+{
+    LMIValue vs, vt;
+    unsigned i;
+
+    vs.d = fs;
+    vt.d = ft;
+    for (i = 0; i < 4; i++) {
+        vs.sh[i] = (vs.sh[i] >= vt.sh[i] ? vs.sh[i] : vt.sh[i]);
+    }
+    return vs.d;
+}
+
+uint64_t helper_pminsh(uint64_t fs, uint64_t ft)
+{
+    LMIValue vs, vt;
+    unsigned i;
+
+    vs.d = fs;
+    vt.d = ft;
+    for (i = 0; i < 4; i++) {
+        vs.sh[i] = (vs.sh[i] <= vt.sh[i] ? vs.sh[i] : vt.sh[i]);
+    }
+    return vs.d;
+}
+
+uint64_t helper_pmaxub(uint64_t fs, uint64_t ft)
+{
+    LMIValue vs, vt;
+    unsigned i;
+
+    vs.d = fs;
+    vt.d = ft;
+    for (i = 0; i < 8; i++) {   /* all eight byte lanes of ub[] */
+        vs.ub[i] = (vs.ub[i] >= vt.ub[i] ? vs.ub[i] : vt.ub[i]);
+    }
+    return vs.d;
+}
+
+uint64_t helper_pminub(uint64_t fs, uint64_t ft)
+{
+    LMIValue vs, vt;
+    unsigned i;
+
+    vs.d = fs;
+    vt.d = ft;
+    for (i = 0; i < 8; i++) {   /* all eight byte lanes of ub[] */
+        vs.ub[i] = (vs.ub[i] <= vt.ub[i] ? vs.ub[i] : vt.ub[i]);
+    }
+    return vs.d;
+}
+
+uint64_t helper_pcmpeqw(uint64_t fs, uint64_t ft)
+{
+    LMIValue vs, vt;
+    unsigned i;
+
+    vs.d = fs;
+    vt.d = ft;
+    for (i = 0; i < 2; i++) {
+        vs.uw[i] = -(vs.uw[i] == vt.uw[i]);
+    }
+    return vs.d;
+}
+
+uint64_t helper_pcmpgtw(uint64_t fs, uint64_t ft)
+{
+    LMIValue vs, vt;
+    unsigned i;
+
+    vs.d = fs;
+    vt.d = ft;
+    for (i = 0; i < 2; i++) {
+        vs.uw[i] = -(vs.uw[i] > vt.uw[i]);
+    }
+    return vs.d;
+}
+
+uint64_t helper_pcmpeqh(uint64_t fs, uint64_t ft)
+{
+    LMIValue vs, vt;
+    unsigned i;
+
+    vs.d = fs;
+    vt.d = ft;
+    for (i = 0; i < 4; i++) {
+        vs.uh[i] = -(vs.uh[i] == vt.uh[i]);
+    }
+    return vs.d;
+}
+
+uint64_t helper_pcmpgth(uint64_t fs, uint64_t ft)
+{
+    LMIValue vs, vt;
+    unsigned i;
+
+    vs.d = fs;
+    vt.d = ft;
+    for (i = 0; i < 4; i++) {
+        vs.uh[i] = -(vs.uh[i] > vt.uh[i]);
+    }
+    return vs.d;
+}
+
+uint64_t helper_pcmpeqb(uint64_t fs, uint64_t ft)
+{
+    LMIValue vs, vt;
+    unsigned i;
+
+    vs.d = fs;
+    vt.d = ft;
+    for (i = 0; i < 8; i++) {
+        vs.ub[i] = -(vs.ub[i] == vt.ub[i]);
+    }
+    return vs.d;
+}
+
+uint64_t helper_pcmpgtb(uint64_t fs, uint64_t ft)
+{
+    LMIValue vs, vt;
+    unsigned i;
+
+    vs.d = fs;
+    vt.d = ft;
+    for (i = 0; i < 8; i++) {
+        vs.ub[i] = -(vs.ub[i] > vt.ub[i]);
+    }
+    return vs.d;
+}
+
+uint64_t helper_psllw(uint64_t fs, uint64_t ft)
+{
+    LMIValue vs;
+    unsigned i;
+
+    ft &= 0x7f;
+    if (ft > 31) {
+        return 0;
+    }
+    vs.d = fs;
+    for (i = 0; i < 2; ++i) {
+        vs.uw[i] <<= ft;
+    }
+    return vs.d;
+}
+
+uint64_t helper_psrlw(uint64_t fs, uint64_t ft)
+{
+    LMIValue vs;
+    unsigned i;
+
+    ft &= 0x7f;
+    if (ft > 31) {
+        return 0;
+    }
+    vs.d = fs;
+    for (i = 0; i < 2; ++i) {
+        vs.uw[i] >>= ft;
+    }
+    return vs.d;
+}
+
+uint64_t helper_psraw(uint64_t fs, uint64_t ft)
+{
+    LMIValue vs;
+    unsigned i;
+
+    ft &= 0x7f;
+    if (ft > 31) {
+        ft = 31;
+    }
+    vs.d = fs;
+    for (i = 0; i < 2; ++i) {
+        vs.sw[i] >>= ft;
+    }
+    return vs.d;
+}
+
+uint64_t helper_psllh(uint64_t fs, uint64_t ft)
+{
+    LMIValue vs;
+    unsigned i;
+
+    ft &= 0x7f;
+    if (ft > 15) {
+        return 0;
+    }
+    vs.d = fs;
+    for (i = 0; i < 4; ++i) {
+        vs.uh[i] <<= ft;
+    }
+    return vs.d;
+}
+
+uint64_t helper_psrlh(uint64_t fs, uint64_t ft)
+{
+    LMIValue vs;
+    unsigned i;
+
+    ft &= 0x7f;
+    if (ft > 15) {
+        return 0;
+    }
+    vs.d = fs;
+    for (i = 0; i < 4; ++i) {
+        vs.uh[i] >>= ft;
+    }
+    return vs.d;
+}
+
+uint64_t helper_psrah(uint64_t fs, uint64_t ft)
+{
+    LMIValue vs;
+    unsigned i;
+
+    ft &= 0x7f;
+    if (ft > 15) {
+        ft = 15;
+    }
+    vs.d = fs;
+    for (i = 0; i < 4; ++i) {
+        vs.sh[i] >>= ft;
+    }
+    return vs.d;
+}
+
+uint64_t helper_pmullh(uint64_t fs, uint64_t ft)
+{
+    LMIValue vs, vt;
+    unsigned i;
+
+    vs.d = fs;
+    vt.d = ft;
+    for (i = 0; i < 4; ++i) {
+        vs.sh[i] *= vt.sh[i];
+    }
+    return vs.d;
+}
+
+uint64_t helper_pmulhh(uint64_t fs, uint64_t ft)
+{
+    LMIValue vs, vt;
+    unsigned i;
+
+    vs.d = fs;
+    vt.d = ft;
+    for (i = 0; i < 4; ++i) {
+        int32_t r = vs.sh[i] * vt.sh[i];
+        vs.sh[i] = r >> 16;
+    }
+    return vs.d;
+}
+
+uint64_t helper_pmulhuh(uint64_t fs, uint64_t ft)
+{
+    LMIValue vs, vt;
+    unsigned i;
+
+    vs.d = fs;
+    vt.d = ft;
+    for (i = 0; i < 4; ++i) {
+        uint32_t r = (uint32_t)vs.uh[i] * vt.uh[i];  /* no int overflow */
+        vs.uh[i] = r >> 16;                          /* keep the high half */
+    }
+    return vs.d;
+}
+
+uint64_t helper_pmaddhw(uint64_t fs, uint64_t ft)
+{
+    unsigned host = BYTE_ORDER_XOR(3);
+    LMIValue vs, vt;
+    uint32_t p0, p1;
+
+    vs.d = fs;
+    vt.d = ft;
+    p0  = vs.sh[0 ^ host] * vt.sh[0 ^ host];
+    p0 += vs.sh[1 ^ host] * vt.sh[1 ^ host];
+    p1  = vs.sh[2 ^ host] * vt.sh[2 ^ host];
+    p1 += vs.sh[3 ^ host] * vt.sh[3 ^ host];
+
+    return ((uint64_t)p1 << 32) | p0;
+}
+
+uint64_t helper_pasubub(uint64_t fs, uint64_t ft)
+{
+    LMIValue vs, vt;
+    unsigned i;
+
+    vs.d = fs;
+    vt.d = ft;
+    for (i = 0; i < 8; ++i) {
+        int r = vs.ub[i] - vt.ub[i];
+        vs.ub[i] = (r < 0 ? -r : r);
+    }
+    return vs.d;
+}
+
+uint64_t helper_biadd(uint64_t fs)
+{
+    unsigned i, fd;
+
+    for (i = fd = 0; i < 8; ++i) {
+        fd += (fs >> (i * 8)) & 0xff;
+    }
+    return fd & 0xffff;
+}
+
+uint64_t helper_pmovmskb(uint64_t fs)
+{
+    unsigned fd = 0;
+
+    fd |= ((fs >>  7) & 1) << 0;
+    fd |= ((fs >> 15) & 1) << 1;
+    fd |= ((fs >> 23) & 1) << 2;
+    fd |= ((fs >> 31) & 1) << 3;
+    fd |= ((fs >> 39) & 1) << 4;
+    fd |= ((fs >> 47) & 1) << 5;
+    fd |= ((fs >> 55) & 1) << 6;
+    fd |= ((fs >> 63) & 1) << 7;
+
+    return fd & 0xff;
+}
diff --git a/target-mips/translate.c b/target-mips/translate.c
index 52eeb2b..f61cc6e 100644
--- a/target-mips/translate.c
+++ b/target-mips/translate.c
@@ -446,6 +446,103 @@  enum {
     OPC_BC2     = (0x08 << 21) | OPC_CP2,
 };
 
+#define MASK_LMI(op)  (MASK_OP_MAJOR(op) | (op & (0x1F << 21)) | (op & 0x1F))
+
+enum {
+    OPC_PADDSH  = (24 << 21) | (0x00) | OPC_CP2,
+    OPC_PADDUSH = (25 << 21) | (0x00) | OPC_CP2,
+    OPC_PADDH   = (26 << 21) | (0x00) | OPC_CP2,
+    OPC_PADDW   = (27 << 21) | (0x00) | OPC_CP2,
+    OPC_PADDSB  = (28 << 21) | (0x00) | OPC_CP2,
+    OPC_PADDUSB = (29 << 21) | (0x00) | OPC_CP2,
+    OPC_PADDB   = (30 << 21) | (0x00) | OPC_CP2,
+    OPC_PADDD   = (31 << 21) | (0x00) | OPC_CP2,
+
+    OPC_PSUBSH  = (24 << 21) | (0x01) | OPC_CP2,
+    OPC_PSUBUSH = (25 << 21) | (0x01) | OPC_CP2,
+    OPC_PSUBH   = (26 << 21) | (0x01) | OPC_CP2,
+    OPC_PSUBW   = (27 << 21) | (0x01) | OPC_CP2,
+    OPC_PSUBSB  = (28 << 21) | (0x01) | OPC_CP2,
+    OPC_PSUBUSB = (29 << 21) | (0x01) | OPC_CP2,
+    OPC_PSUBB   = (30 << 21) | (0x01) | OPC_CP2,
+    OPC_PSUBD   = (31 << 21) | (0x01) | OPC_CP2,
+
+    OPC_PSHUFH   = (24 << 21) | (0x02) | OPC_CP2,
+    OPC_PACKSSWH = (25 << 21) | (0x02) | OPC_CP2,
+    OPC_PACKSSHB = (26 << 21) | (0x02) | OPC_CP2,
+    OPC_PACKUSHB = (27 << 21) | (0x02) | OPC_CP2,
+    OPC_XOR_CP2  = (28 << 21) | (0x02) | OPC_CP2,
+    OPC_NOR_CP2  = (29 << 21) | (0x02) | OPC_CP2,
+    OPC_AND_CP2  = (30 << 21) | (0x02) | OPC_CP2,
+    OPC_PANDN    = (31 << 21) | (0x02) | OPC_CP2,
+
+    OPC_PUNPCKLHW = (24 << 21) | (0x03) | OPC_CP2,
+    OPC_PUNPCKHHW = (25 << 21) | (0x03) | OPC_CP2,
+    OPC_PUNPCKLBH = (26 << 21) | (0x03) | OPC_CP2,
+    OPC_PUNPCKHBH = (27 << 21) | (0x03) | OPC_CP2,
+    OPC_PINSRH_0  = (28 << 21) | (0x03) | OPC_CP2,
+    OPC_PINSRH_1  = (29 << 21) | (0x03) | OPC_CP2,
+    OPC_PINSRH_2  = (30 << 21) | (0x03) | OPC_CP2,
+    OPC_PINSRH_3  = (31 << 21) | (0x03) | OPC_CP2,
+
+    OPC_PAVGH   = (24 << 21) | (0x08) | OPC_CP2,
+    OPC_PAVGB   = (25 << 21) | (0x08) | OPC_CP2,
+    OPC_PMAXSH  = (26 << 21) | (0x08) | OPC_CP2,
+    OPC_PMINSH  = (27 << 21) | (0x08) | OPC_CP2,
+    OPC_PMAXUB  = (28 << 21) | (0x08) | OPC_CP2,
+    OPC_PMINUB  = (29 << 21) | (0x08) | OPC_CP2,
+
+    OPC_PCMPEQW = (24 << 21) | (0x09) | OPC_CP2,
+    OPC_PCMPGTW = (25 << 21) | (0x09) | OPC_CP2,
+    OPC_PCMPEQH = (26 << 21) | (0x09) | OPC_CP2,
+    OPC_PCMPGTH = (27 << 21) | (0x09) | OPC_CP2,
+    OPC_PCMPEQB = (28 << 21) | (0x09) | OPC_CP2,
+    OPC_PCMPGTB = (29 << 21) | (0x09) | OPC_CP2,
+
+    OPC_PSLLW   = (24 << 21) | (0x0A) | OPC_CP2,
+    OPC_PSLLH   = (25 << 21) | (0x0A) | OPC_CP2,
+    OPC_PMULLH  = (26 << 21) | (0x0A) | OPC_CP2,
+    OPC_PMULHH  = (27 << 21) | (0x0A) | OPC_CP2,
+    OPC_PMULUW  = (28 << 21) | (0x0A) | OPC_CP2,
+    OPC_PMULHUH = (29 << 21) | (0x0A) | OPC_CP2,
+
+    OPC_PSRLW     = (24 << 21) | (0x0B) | OPC_CP2,
+    OPC_PSRLH     = (25 << 21) | (0x0B) | OPC_CP2,
+    OPC_PSRAW     = (26 << 21) | (0x0B) | OPC_CP2,
+    OPC_PSRAH     = (27 << 21) | (0x0B) | OPC_CP2,
+    OPC_PUNPCKLWD = (28 << 21) | (0x0B) | OPC_CP2,
+    OPC_PUNPCKHWD = (29 << 21) | (0x0B) | OPC_CP2,
+
+    OPC_ADDU_CP2 = (24 << 21) | (0x0C) | OPC_CP2,
+    OPC_OR_CP2   = (25 << 21) | (0x0C) | OPC_CP2,
+    OPC_ADD_CP2  = (26 << 21) | (0x0C) | OPC_CP2,
+    OPC_DADD_CP2 = (27 << 21) | (0x0C) | OPC_CP2,
+    OPC_SEQU_CP2 = (28 << 21) | (0x0C) | OPC_CP2,
+    OPC_SEQ_CP2  = (29 << 21) | (0x0C) | OPC_CP2,
+
+    OPC_SUBU_CP2 = (24 << 21) | (0x0D) | OPC_CP2,
+    OPC_PASUBUB  = (25 << 21) | (0x0D) | OPC_CP2,
+    OPC_SUB_CP2  = (26 << 21) | (0x0D) | OPC_CP2,
+    OPC_DSUB_CP2 = (27 << 21) | (0x0D) | OPC_CP2,
+    OPC_SLTU_CP2 = (28 << 21) | (0x0D) | OPC_CP2,
+    OPC_SLT_CP2  = (29 << 21) | (0x0D) | OPC_CP2,
+
+    OPC_SLL_CP2  = (24 << 21) | (0x0E) | OPC_CP2,
+    OPC_DSLL_CP2 = (25 << 21) | (0x0E) | OPC_CP2,
+    OPC_PEXTRH   = (26 << 21) | (0x0E) | OPC_CP2,
+    OPC_PMADDHW  = (27 << 21) | (0x0E) | OPC_CP2,
+    OPC_SLEU_CP2 = (28 << 21) | (0x0E) | OPC_CP2,
+    OPC_SLE_CP2  = (29 << 21) | (0x0E) | OPC_CP2,
+
+    OPC_SRL_CP2  = (24 << 21) | (0x0F) | OPC_CP2,
+    OPC_DSRL_CP2 = (25 << 21) | (0x0F) | OPC_CP2,
+    OPC_SRA_CP2  = (26 << 21) | (0x0F) | OPC_CP2,
+    OPC_DSRA_CP2 = (27 << 21) | (0x0F) | OPC_CP2,
+    OPC_BIADD    = (28 << 21) | (0x0F) | OPC_CP2,
+    OPC_PMOVMSKB = (29 << 21) | (0x0F) | OPC_CP2,
+};
+
+
 #define MASK_CP3(op)       MASK_OP_MAJOR(op) | (op & 0x3F)
 
 enum {
@@ -2380,8 +2477,8 @@  static void gen_cl (DisasContext *ctx, uint32_t opc,
 }
 
 /* Godson integer instructions */
-static void gen_loongson_integer (DisasContext *ctx, uint32_t opc,
-                                int rd, int rs, int rt)
+static void gen_loongson_integer(DisasContext *ctx, uint32_t opc,
+                                 int rd, int rs, int rt)
 {
     const char *opn = "loongson";
     TCGv t0, t1;
@@ -2594,6 +2691,302 @@  static void gen_loongson_integer (DisasContext *ctx, uint32_t opc,
     tcg_temp_free(t1);
 }
 
+/* Loongson multimedia instructions.
+   The operation is decoded from ctx->opcode with MASK_LMI.  FPRs RS and
+   RT are loaded as the two 64-bit source operands and the 64-bit result
+   is stored to FPR RD.  Encodings that are not handled, including the
+   S<cond> comparisons, raise EXCP_RI and do not write RD.  */
+static void gen_loongson_multimedia(DisasContext *ctx, int rd, int rs, int rt)
+{
+    const char *opn = "loongson_cp2";
+    uint32_t opc, shift_max;
+    TCGv_i64 t0, t1;
+
+    opc = MASK_LMI(ctx->opcode);
+    switch (opc) {
+    case OPC_ADD_CP2:
+    case OPC_SUB_CP2:
+    case OPC_DADD_CP2:
+    case OPC_DSUB_CP2:
+        /* These cases emit a branch and a label below, hence the local
+           temporaries.  */
+        t0 = tcg_temp_local_new_i64();
+        t1 = tcg_temp_local_new_i64();
+        break;
+    default:
+        t0 = tcg_temp_new_i64();
+        t1 = tcg_temp_new_i64();
+        break;
+    }
+
+    gen_load_fpr64(ctx, t0, rs);
+    gen_load_fpr64(ctx, t1, rt);
+
+    /* Case generators: two-operand helper, one-operand helper, and a
+       single inline TCG op; each computes into t0.  */
+#define LMI_HELPER(UP, LO) \
+    case OPC_##UP: gen_helper_##LO(t0, t0, t1); opn = #LO; break
+#define LMI_HELPER_1(UP, LO) \
+    case OPC_##UP: gen_helper_##LO(t0, t0); opn = #LO; break
+#define LMI_DIRECT(UP, LO, OP) \
+    case OPC_##UP: tcg_gen_##OP##_i64(t0, t0, t1); opn = #LO; break
+
+    switch (opc) {
+    LMI_HELPER(PADDSH, paddsh);
+    LMI_HELPER(PADDUSH, paddush);
+    LMI_HELPER(PADDH, paddh);
+    LMI_HELPER(PADDW, paddw);
+    LMI_HELPER(PADDSB, paddsb);
+    LMI_HELPER(PADDUSB, paddusb);
+    LMI_HELPER(PADDB, paddb);
+
+    LMI_HELPER(PSUBSH, psubsh);
+    LMI_HELPER(PSUBUSH, psubush);
+    LMI_HELPER(PSUBH, psubh);
+    LMI_HELPER(PSUBW, psubw);
+    LMI_HELPER(PSUBSB, psubsb);
+    LMI_HELPER(PSUBUSB, psubusb);
+    LMI_HELPER(PSUBB, psubb);
+
+    LMI_HELPER(PSHUFH, pshufh);
+    LMI_HELPER(PACKSSWH, packsswh);
+    LMI_HELPER(PACKSSHB, packsshb);
+    LMI_HELPER(PACKUSHB, packushb);
+
+    LMI_HELPER(PUNPCKLHW, punpcklhw);
+    LMI_HELPER(PUNPCKHHW, punpckhhw);
+    LMI_HELPER(PUNPCKLBH, punpcklbh);
+    LMI_HELPER(PUNPCKHBH, punpckhbh);
+    LMI_HELPER(PUNPCKLWD, punpcklwd);
+    LMI_HELPER(PUNPCKHWD, punpckhwd);
+
+    LMI_HELPER(PAVGH, pavgh);
+    LMI_HELPER(PAVGB, pavgb);
+    LMI_HELPER(PMAXSH, pmaxsh);
+    LMI_HELPER(PMINSH, pminsh);
+    LMI_HELPER(PMAXUB, pmaxub);
+    LMI_HELPER(PMINUB, pminub);
+
+    LMI_HELPER(PCMPEQW, pcmpeqw);
+    LMI_HELPER(PCMPGTW, pcmpgtw);
+    LMI_HELPER(PCMPEQH, pcmpeqh);
+    LMI_HELPER(PCMPGTH, pcmpgth);
+    LMI_HELPER(PCMPEQB, pcmpeqb);
+    LMI_HELPER(PCMPGTB, pcmpgtb);
+
+    LMI_HELPER(PSLLW, psllw);
+    LMI_HELPER(PSLLH, psllh);
+    LMI_HELPER(PSRLW, psrlw);
+    LMI_HELPER(PSRLH, psrlh);
+    LMI_HELPER(PSRAW, psraw);
+    LMI_HELPER(PSRAH, psrah);
+
+    LMI_HELPER(PMULLH, pmullh);
+    LMI_HELPER(PMULHH, pmulhh);
+    LMI_HELPER(PMULHUH, pmulhuh);
+    LMI_HELPER(PMADDHW, pmaddhw);
+
+    LMI_HELPER(PASUBUB, pasubub);
+    LMI_HELPER_1(BIADD, biadd);
+    LMI_HELPER_1(PMOVMSKB, pmovmskb);
+
+    LMI_DIRECT(PADDD, paddd, add);
+    LMI_DIRECT(PSUBD, psubd, sub);
+    LMI_DIRECT(XOR_CP2, xor, xor);
+    LMI_DIRECT(NOR_CP2, nor, nor);
+    LMI_DIRECT(AND_CP2, and, and);
+    LMI_DIRECT(OR_CP2, or, or);
+
+    case OPC_PANDN:
+        /* PANDN complements its first source operand: ~rs & rt.  */
+        tcg_gen_andc_i64(t0, t1, t0);
+        opn = "pandn";
+        break;
+
+    case OPC_PINSRH_0:
+        tcg_gen_deposit_i64(t0, t0, t1, 0, 16);
+        opn = "pinsrh_0";
+        break;
+    case OPC_PINSRH_1:
+        tcg_gen_deposit_i64(t0, t0, t1, 16, 16);
+        opn = "pinsrh_1";
+        break;
+    case OPC_PINSRH_2:
+        tcg_gen_deposit_i64(t0, t0, t1, 32, 16);
+        opn = "pinsrh_2";
+        break;
+    case OPC_PINSRH_3:
+        tcg_gen_deposit_i64(t0, t0, t1, 48, 16);
+        opn = "pinsrh_3";
+        break;
+
+    case OPC_PEXTRH:
+        tcg_gen_andi_i64(t1, t1, 3);
+        tcg_gen_shli_i64(t1, t1, 4);
+        tcg_gen_shr_i64(t0, t0, t1);
+        tcg_gen_ext16u_i64(t0, t0);
+        opn = "pextrh";
+        break;
+
+    case OPC_ADDU_CP2:
+        tcg_gen_add_i64(t0, t0, t1);
+        tcg_gen_ext32s_i64(t0, t0);
+        opn = "addu";
+        break;
+    case OPC_SUBU_CP2:
+        tcg_gen_sub_i64(t0, t0, t1);
+        tcg_gen_ext32s_i64(t0, t0);
+        opn = "subu";
+        break;
+
+    case OPC_SLL_CP2:
+        opn = "sll";
+        shift_max = 32;
+        goto do_shift;
+    case OPC_SRL_CP2:
+        opn = "srl";
+        shift_max = 32;
+        goto do_shift;
+    case OPC_SRA_CP2:
+        opn = "sra";
+        shift_max = 32;
+        goto do_shift;
+    case OPC_DSLL_CP2:
+        opn = "dsll";
+        shift_max = 64;
+        goto do_shift;
+    case OPC_DSRL_CP2:
+        opn = "dsrl";
+        shift_max = 64;
+        goto do_shift;
+    case OPC_DSRA_CP2:
+        opn = "dsra";
+        shift_max = 64;
+        goto do_shift;
+    do_shift:
+        {
+            TCGv_i64 t2 = tcg_temp_new_i64();
+
+            /* Make sure shift count isn't TCG undefined behaviour.  The
+               unmasked count stays in t1 for the range check below.  */
+            tcg_gen_andi_i64(t2, t1, shift_max - 1);
+
+            switch (opc) {
+            case OPC_SLL_CP2:
+            case OPC_DSLL_CP2:
+                tcg_gen_shl_i64(t0, t0, t2);
+                break;
+            case OPC_SRA_CP2:
+            case OPC_DSRA_CP2:
+                /* Since SRA is UndefinedResult without sign-extended inputs,
+                   we can treat SRA and DSRA the same.  */
+                tcg_gen_sar_i64(t0, t0, t2);
+                break;
+            case OPC_SRL_CP2:
+                /* We want to shift in zeros for SRL; zero-extend first.  */
+                tcg_gen_ext32u_i64(t0, t0);
+                /* FALLTHRU */
+            case OPC_DSRL_CP2:
+                tcg_gen_shr_i64(t0, t0, t2);
+                break;
+            }
+            tcg_temp_free_i64(t2);
+
+            if (shift_max == 32) {
+                tcg_gen_ext32s_i64(t0, t0);
+            }
+
+            /* Shifts larger than MAX produce zero.  */
+            tcg_gen_setcondi_i64(TCG_COND_LTU, t1, t1, shift_max);
+            tcg_gen_neg_i64(t1, t1);
+            tcg_gen_and_i64(t0, t0, t1);
+            break;
+        }
+
+    case OPC_ADD_CP2:
+    case OPC_DADD_CP2:
+        {
+            TCGv_i64 t2 = tcg_temp_new_i64();
+            int lab = gen_new_label();
+
+            tcg_gen_mov_i64(t2, t0);
+            tcg_gen_add_i64(t0, t1, t2);
+            if (opc == OPC_ADD_CP2) {
+                tcg_gen_ext32s_i64(t0, t0);
+            }
+            tcg_gen_xor_i64(t1, t1, t2);
+            tcg_gen_xor_i64(t2, t2, t0);
+            tcg_gen_andc_i64(t1, t2, t1);
+            tcg_temp_free_i64(t2);
+            tcg_gen_brcondi_i64(TCG_COND_GE, t1, 0, lab);
+            generate_exception(ctx, EXCP_OVERFLOW);
+            gen_set_label(lab);
+
+            opn = (opc == OPC_ADD_CP2 ? "add" : "dadd");
+            break;
+        }
+
+    case OPC_SUB_CP2:
+    case OPC_DSUB_CP2:
+        {
+            TCGv_i64 t2 = tcg_temp_new_i64();
+            int lab = gen_new_label();
+
+            tcg_gen_mov_i64(t2, t0);
+            /* t2 is the minuend (FPR rs) and t1 the subtrahend (FPR rt);
+               the overflow test below relies on that order.  */
+            tcg_gen_sub_i64(t0, t2, t1);
+            if (opc == OPC_SUB_CP2) {
+                tcg_gen_ext32s_i64(t0, t0);
+            }
+            tcg_gen_xor_i64(t1, t1, t2);
+            tcg_gen_xor_i64(t2, t2, t0);
+            tcg_gen_and_i64(t1, t1, t2);
+            tcg_temp_free_i64(t2);
+            tcg_gen_brcondi_i64(TCG_COND_GE, t1, 0, lab);
+            generate_exception(ctx, EXCP_OVERFLOW);
+            gen_set_label(lab);
+
+            opn = (opc == OPC_SUB_CP2 ? "sub" : "dsub");
+            break;
+        }
+
+    case OPC_PMULUW:
+        tcg_gen_ext32u_i64(t0, t0);
+        tcg_gen_ext32u_i64(t1, t1);
+        tcg_gen_mul_i64(t0, t0, t1);
+        opn = "pmuluw";
+        break;
+
+    case OPC_SEQU_CP2:
+    case OPC_SEQ_CP2:
+    case OPC_SLTU_CP2:
+    case OPC_SLT_CP2:
+    case OPC_SLEU_CP2:
+    case OPC_SLE_CP2:
+        /* ??? Document is unclear: Set FCC[CC].  Does that mean the
+           FD field is the CC field?  */
+    default:
+        MIPS_INVAL(opn);
+        generate_exception(ctx, EXCP_RI);
+        tcg_temp_free_i64(t0);
+        tcg_temp_free_i64(t1);
+        return;
+    }
+
+#undef LMI_HELPER
+#undef LMI_HELPER_1
+#undef LMI_DIRECT
+
+    gen_store_fpr64(ctx, t0, rd);
+
+    (void)opn; /* avoid a compiler warning */
+    MIPS_DEBUG("%s %s, %s, %s", opn,
+               fregnames[rd], fregnames[rs], fregnames[rt]);
+    tcg_temp_free_i64(t0);
+    tcg_temp_free_i64(t1);
+}
+
 /* Traps */
 static void gen_trap (DisasContext *ctx, uint32_t opc,
                       int rs, int rt, int16_t imm)
@@ -12316,10 +12685,14 @@  static void decode_opc (CPUMIPSState *env, DisasContext *ctx, int *is_branch)
     case OPC_LDC2:
     case OPC_SWC2:
     case OPC_SDC2:
-    case OPC_CP2:
         /* COP2: Not implemented. */
         generate_exception_err(ctx, EXCP_CpU, 2);
         break;
+    case OPC_CP2:
+        check_insn(env, ctx, INSN_LOONGSON2F);
+        /* Note that these instructions use different fields.  */
+        gen_loongson_multimedia(ctx, sa, rd, rt);
+        break;
 
     case OPC_CP3:
         if (env->CP0_Config1 & (1 << CP0C1_FP)) {