Patchwork [7/7] target-mips: Implement Loongson Multimedia Instructions

login
register
mail settings
Submitter Richard Henderson
Date Sept. 17, 2012, 9:35 p.m.
Message ID <1347917713-23343-8-git-send-email-rth@twiddle.net>
Download mbox | patch
Permalink /patch/184565/
State New
Headers show

Comments

Richard Henderson - Sept. 17, 2012, 9:35 p.m.
Implements all of the COP2 instructions except for the S<cond>
family of comparisons.  The documentation is unclear for those.

Signed-off-by: Richard Henderson <rth@twiddle.net>
---
 target-mips/Makefile.objs |   2 +-
 target-mips/helper.h      |  59 ++++
 target-mips/lmi_helper.c  | 744 ++++++++++++++++++++++++++++++++++++++++++++++
 target-mips/translate.c   | 379 ++++++++++++++++++++++-
 4 files changed, 1180 insertions(+), 4 deletions(-)
 create mode 100644 target-mips/lmi_helper.c
Aurelien Jarno - Sept. 18, 2012, 4:39 p.m.
On Mon, Sep 17, 2012 at 02:35:13PM -0700, Richard Henderson wrote:
> Implements all of the COP2 instructions except for the S<cond>
> family of comparisons.  The documentation is unclear for those.
> 
> Signed-off-by: Richard Henderson <rth@twiddle.net>
> ---
>  target-mips/Makefile.objs |   2 +-
>  target-mips/helper.h      |  59 ++++
>  target-mips/lmi_helper.c  | 744 ++++++++++++++++++++++++++++++++++++++++++++++
>  target-mips/translate.c   | 379 ++++++++++++++++++++++-
>  4 files changed, 1180 insertions(+), 4 deletions(-)
>  create mode 100644 target-mips/lmi_helper.c
> 
> diff --git a/target-mips/Makefile.objs b/target-mips/Makefile.objs
> index ca20f21..3eeeeac 100644
> --- a/target-mips/Makefile.objs
> +++ b/target-mips/Makefile.objs
> @@ -1,2 +1,2 @@
> -obj-y += translate.o op_helper.o helper.o cpu.o
> +obj-y += translate.o op_helper.o lmi_helper.o helper.o cpu.o
>  obj-$(CONFIG_SOFTMMU) += machine.o
> diff --git a/target-mips/helper.h b/target-mips/helper.h
> index 109ac37..f35ed78 100644
> --- a/target-mips/helper.h
> +++ b/target-mips/helper.h
> @@ -303,4 +303,63 @@ DEF_HELPER_1(rdhwr_ccres, tl, env)
>  DEF_HELPER_2(pmon, void, env, int)
>  DEF_HELPER_1(wait, void, env)
>  
> +/* Loongson multimedia functions.  */
> +DEF_HELPER_FLAGS_2(paddsh, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
> +DEF_HELPER_FLAGS_2(paddush, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
> +DEF_HELPER_FLAGS_2(paddh, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
> +DEF_HELPER_FLAGS_2(paddw, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
> +DEF_HELPER_FLAGS_2(paddsb, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
> +DEF_HELPER_FLAGS_2(paddusb, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
> +DEF_HELPER_FLAGS_2(paddb, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
> +
> +DEF_HELPER_FLAGS_2(psubsh, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
> +DEF_HELPER_FLAGS_2(psubush, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
> +DEF_HELPER_FLAGS_2(psubh, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
> +DEF_HELPER_FLAGS_2(psubw, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
> +DEF_HELPER_FLAGS_2(psubsb, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
> +DEF_HELPER_FLAGS_2(psubusb, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
> +DEF_HELPER_FLAGS_2(psubb, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
> +
> +DEF_HELPER_FLAGS_2(pshufh, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
> +DEF_HELPER_FLAGS_2(packsswh, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
> +DEF_HELPER_FLAGS_2(packsshb, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
> +DEF_HELPER_FLAGS_2(packushb, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
> +
> +DEF_HELPER_FLAGS_2(punpcklhw, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
> +DEF_HELPER_FLAGS_2(punpckhhw, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
> +DEF_HELPER_FLAGS_2(punpcklbh, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
> +DEF_HELPER_FLAGS_2(punpckhbh, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
> +DEF_HELPER_FLAGS_2(punpcklwd, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
> +DEF_HELPER_FLAGS_2(punpckhwd, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
> +
> +DEF_HELPER_FLAGS_2(pavgh, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
> +DEF_HELPER_FLAGS_2(pavgb, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
> +DEF_HELPER_FLAGS_2(pmaxsh, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
> +DEF_HELPER_FLAGS_2(pminsh, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
> +DEF_HELPER_FLAGS_2(pmaxub, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
> +DEF_HELPER_FLAGS_2(pminub, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
> +
> +DEF_HELPER_FLAGS_2(pcmpeqw, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
> +DEF_HELPER_FLAGS_2(pcmpgtw, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
> +DEF_HELPER_FLAGS_2(pcmpeqh, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
> +DEF_HELPER_FLAGS_2(pcmpgth, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
> +DEF_HELPER_FLAGS_2(pcmpeqb, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
> +DEF_HELPER_FLAGS_2(pcmpgtb, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
> +
> +DEF_HELPER_FLAGS_2(psllw, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
> +DEF_HELPER_FLAGS_2(psllh, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
> +DEF_HELPER_FLAGS_2(psrlw, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
> +DEF_HELPER_FLAGS_2(psrlh, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
> +DEF_HELPER_FLAGS_2(psraw, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
> +DEF_HELPER_FLAGS_2(psrah, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
> +
> +DEF_HELPER_FLAGS_2(pmullh, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
> +DEF_HELPER_FLAGS_2(pmulhh, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
> +DEF_HELPER_FLAGS_2(pmulhuh, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
> +DEF_HELPER_FLAGS_2(pmaddhw, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
> +
> +DEF_HELPER_FLAGS_2(pasubub, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
> +DEF_HELPER_FLAGS_1(biadd, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64)
> +DEF_HELPER_FLAGS_1(pmovmskb, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64)
> +
>  #include "def-helper.h"
> diff --git a/target-mips/lmi_helper.c b/target-mips/lmi_helper.c
> new file mode 100644
> index 0000000..1b24353
> --- /dev/null
> +++ b/target-mips/lmi_helper.c
> @@ -0,0 +1,744 @@
> +/*
> + *  Loongson Multimedia Instruction emulation helpers for QEMU.
> + *
> + *  Copyright (c) 2011  Richard Henderson <rth@twiddle.net>
> + *
> + * This library is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2 of the License, or (at your option) any later version.
> + *
> + * This library is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with this library; if not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +#include "cpu.h"
> +#include "helper.h"
> +
/*
 * Lane views of one 64-bit multimedia register value.
 *
 * When every lane is treated identically the host byte order is
 * irrelevant and the union can be indexed directly.  When the lane
 * position matters, index with BYTE_ORDER_XOR() (below) or use plain
 * shifts on the 64-bit value instead.
 */
typedef union {
    uint8_t  ub[8];
    int8_t   sb[8];
    uint16_t uh[4];
    int16_t  sh[4];
    uint32_t uw[2];
    int32_t  sw[2];
    uint64_t d;
} LMIValue;

/*
 * XOR a lane index with BYTE_ORDER_XOR(lanes - 1) so that index 0 always
 * names the least significant lane: on big-endian hosts the index is
 * mirrored, on little-endian hosts it is left alone.
 */
#ifdef HOST_WORDS_BIGENDIAN
# define BYTE_ORDER_XOR(N) (N)
#else
# define BYTE_ORDER_XOR(N) 0
#endif

/*
 * Saturation helpers.  The argument is evaluated more than once, so pass
 * a side-effect-free expression of a signed type wide enough to hold the
 * unsaturated result.  The unsigned variants clamp negative values to 0
 * (previously a negative input fell through unchanged and wrapped when
 * stored into an unsigned lane).
 */
#define SATSB(x)  ((x) < -0x80 ? -0x80 : (x) > 0x7f ? 0x7f : (x))
#define SATUB(x)  ((x) < 0 ? 0 : (x) > 0xff ? 0xff : (x))

#define SATSH(x)  ((x) < -0x8000 ? -0x8000 : (x) > 0x7fff ? 0x7fff : (x))
#define SATUH(x)  ((x) < 0 ? 0 : (x) > 0xffff ? 0xffff : (x))

#define SATSW(x) \
    ((x) < -0x80000000ll ? -0x80000000ll : (x) > 0x7fffffff ? 0x7fffffff : (x))
#define SATUW(x)  ((x) < 0 ? 0 : (x) > 0xffffffffull ? 0xffffffffull : (x))
> +
> +uint64_t helper_paddsb(uint64_t fs, uint64_t ft)
> +{
> +    LMIValue vs, vt;
> +    unsigned int i;
> +
> +    vs.d = fs;
> +    vt.d = ft;
> +    for (i = 0; i < 8; ++i) {
> +        int r = vs.sb[i] + vt.sb[i];
> +        vs.sb[i] = SATSB(r);
> +    }
> +    return vs.d;
> +}
> +
> +uint64_t helper_paddusb(uint64_t fs, uint64_t ft)
> +{
> +    LMIValue vs, vt;
> +    unsigned int i;
> +
> +    vs.d = fs;
> +    vt.d = ft;
> +    for (i = 0; i < 8; ++i) {
> +        int r = vs.ub[i] + vt.ub[i];
> +        vs.ub[i] = SATUB(r);
> +    }
> +    return vs.d;
> +}
> +
> +uint64_t helper_paddsh(uint64_t fs, uint64_t ft)
> +{
> +    LMIValue vs, vt;
> +    unsigned int i;
> +
> +    vs.d = fs;
> +    vt.d = ft;
> +    for (i = 0; i < 4; ++i) {
> +        int r = vs.sh[i] + vt.sh[i];
> +        vs.sh[i] = SATSH(r);
> +    }
> +    return vs.d;
> +}
> +
> +uint64_t helper_paddush(uint64_t fs, uint64_t ft)
> +{
> +    LMIValue vs, vt;
> +    unsigned int i;
> +
> +    vs.d = fs;
> +    vt.d = ft;
> +    for (i = 0; i < 4; ++i) {
> +        int r = vs.uh[i] + vt.uh[i];
> +        vs.uh[i] = SATUH(r);
> +    }
> +    return vs.d;
> +}
> +
> +uint64_t helper_paddb(uint64_t fs, uint64_t ft)
> +{
> +    LMIValue vs, vt;
> +    unsigned int i;
> +
> +    vs.d = fs;
> +    vt.d = ft;
> +    for (i = 0; i < 8; ++i) {
> +        vs.ub[i] += vt.ub[i];
> +    }
> +    return vs.d;
> +}
> +
> +uint64_t helper_paddh(uint64_t fs, uint64_t ft)
> +{
> +    LMIValue vs, vt;
> +    unsigned int i;
> +
> +    vs.d = fs;
> +    vt.d = ft;
> +    for (i = 0; i < 4; ++i) {
> +        vs.uh[i] += vt.uh[i];
> +    }
> +    return vs.d;
> +}
> +
> +uint64_t helper_paddw(uint64_t fs, uint64_t ft)
> +{
> +    LMIValue vs, vt;
> +    unsigned int i;
> +
> +    vs.d = fs;
> +    vt.d = ft;
> +    for (i = 0; i < 2; ++i) {
> +        vs.uw[i] += vt.uw[i];
> +    }
> +    return vs.d;
> +}
> +
> +uint64_t helper_psubsb(uint64_t fs, uint64_t ft)
> +{
> +    LMIValue vs, vt;
> +    unsigned int i;
> +
> +    vs.d = fs;
> +    vt.d = ft;
> +    for (i = 0; i < 8; ++i) {
> +        int r = vs.sb[i] - vt.sb[i];
> +        vs.sb[i] = SATSB(r);
> +    }
> +    return vs.d;
> +}
> +
/*
 * Packed subtract of eight unsigned bytes with unsigned saturation.
 *
 * For each byte lane the result is fs - ft, clamped to 0 when ft is the
 * larger operand.  (The previous code only clamped at 0xff, which can
 * never trigger for a subtraction, so negative differences wrapped.)
 *
 * Works on shifted lanes of the 64-bit values, so it is independent of
 * host byte order.
 */
uint64_t helper_psubusb(uint64_t fs, uint64_t ft)
{
    uint64_t fd = 0;
    unsigned int shift;

    for (shift = 0; shift < 64; shift += 8) {
        uint64_t a = (fs >> shift) & 0xff;
        uint64_t b = (ft >> shift) & 0xff;

        /* A difference below zero saturates to zero.  */
        fd |= (a > b ? a - b : 0) << shift;
    }
    return fd;
}
> +
> +uint64_t helper_psubsh(uint64_t fs, uint64_t ft)
> +{
> +    LMIValue vs, vt;
> +    unsigned int i;
> +
> +    vs.d = fs;
> +    vt.d = ft;
> +    for (i = 0; i < 4; ++i) {
> +        int r = vs.sh[i] - vt.sh[i];
> +        vs.sh[i] = SATSH(r);
> +    }
> +    return vs.d;
> +}
> +
/*
 * Packed subtract of four unsigned halfwords with unsigned saturation.
 *
 * For each 16-bit lane the result is fs - ft, clamped to 0 when ft is
 * the larger operand.  (The previous code only clamped at 0xffff, which
 * can never trigger for a subtraction, so negative differences wrapped.)
 *
 * Works on shifted lanes of the 64-bit values, so it is independent of
 * host byte order.
 */
uint64_t helper_psubush(uint64_t fs, uint64_t ft)
{
    uint64_t fd = 0;
    unsigned int shift;

    for (shift = 0; shift < 64; shift += 16) {
        uint64_t a = (fs >> shift) & 0xffff;
        uint64_t b = (ft >> shift) & 0xffff;

        /* A difference below zero saturates to zero.  */
        fd |= (a > b ? a - b : 0) << shift;
    }
    return fd;
}
> +
> +uint64_t helper_psubb(uint64_t fs, uint64_t ft)
> +{
> +    LMIValue vs, vt;
> +    unsigned int i;
> +
> +    vs.d = fs;
> +    vt.d = ft;
> +    for (i = 0; i < 8; ++i) {
> +        vs.ub[i] -= vt.ub[i];
> +    }
> +    return vs.d;
> +}
> +
> +uint64_t helper_psubh(uint64_t fs, uint64_t ft)
> +{
> +    LMIValue vs, vt;
> +    unsigned int i;
> +
> +    vs.d = fs;
> +    vt.d = ft;
> +    for (i = 0; i < 4; ++i) {
> +        vs.uh[i] -= vt.uh[i];
> +    }
> +    return vs.d;
> +}
> +
> +uint64_t helper_psubw(uint64_t fs, uint64_t ft)
> +{
> +    LMIValue vs, vt;
> +    unsigned int i;
> +
> +    vs.d = fs;
> +    vt.d = ft;
> +    for (i = 0; i < 2; ++i) {
> +        vs.uw[i] -= vt.uw[i];
> +    }
> +    return vs.d;
> +}
> +
> +uint64_t helper_pshufh(uint64_t fs, uint64_t ft)
> +{
> +    unsigned host = BYTE_ORDER_XOR(3);
> +    LMIValue vd, vs;
> +    unsigned i;
> +
> +    vs.d = fs;
> +    vd.d = 0;
> +    for (i = 0; i < 4; i++, ft >>= 2) {
> +        vd.uh[i ^ host] = vs.uh[(ft & 3) ^ host];
> +    }
> +    return vd.d;
> +}
> +
/*
 * Pack four signed words into four signed halfwords with saturation.
 *
 * Result halfwords, least significant first: low word of fs, high word
 * of fs, low word of ft, high word of ft.  Each word is clamped to
 * [-0x8000, 0x7fff] before being narrowed.
 *
 * The lane is widened to uint64_t before shifting into place; the
 * previous code shifted a signed 64-bit value, which overflows (undefined
 * behaviour) when a lane of 0x8000 or above is moved to bits 48..63.
 */
uint64_t helper_packsswh(uint64_t fs, uint64_t ft)
{
    const uint64_t src[2] = { fs, ft };
    uint64_t fd = 0;
    unsigned int n;

    for (n = 0; n < 4; n++) {
        int32_t w = (int32_t)(uint32_t)(src[n / 2] >> ((n % 2) * 32));

        if (w < -0x8000) {
            w = -0x8000;
        } else if (w > 0x7fff) {
            w = 0x7fff;
        }
        fd |= (uint64_t)(uint16_t)w << (n * 16);
    }
    return fd;
}
> +
/* Pack eight signed halfwords into eight bytes, passing each through
   SATSB.  The four halfwords of fs fill the low 32 bits of the result
   and the four halfwords of ft fill the high 32 bits.  */
uint64_t helper_packsshb(uint64_t fs, uint64_t ft)
{
    const uint64_t src[2] = { fs, ft };
    uint64_t packed = 0;
    unsigned int n;

    for (n = 0; n < 8; n++) {
        int16_t half = src[n / 4] >> ((n % 4) * 16);

        half = SATSB(half);
        packed |= (uint64_t)(half & 0xff) << (n * 8);
    }
    return packed;
}
> +
/*
 * Pack eight signed halfwords into eight unsigned bytes with saturation.
 *
 * The four halfwords of fs fill the low 32 bits of the result and the
 * four halfwords of ft fill the high 32 bits.  Each halfword is read as
 * a signed 16-bit value and clamped to [0, 0xff]: negative inputs become
 * 0 (previously they were only masked, so e.g. -1 turned into 0xff).
 */
uint64_t helper_packushb(uint64_t fs, uint64_t ft)
{
    const uint64_t src[2] = { fs, ft };
    uint64_t fd = 0;
    unsigned int n;

    for (n = 0; n < 8; n++) {
        int16_t half = (int16_t)(uint16_t)(src[n / 4] >> ((n % 4) * 16));
        uint64_t byte;

        if (half < 0) {
            byte = 0;
        } else if (half > 0xff) {
            byte = 0xff;
        } else {
            byte = (uint64_t)half;
        }
        fd |= byte << (n * 8);
    }
    return fd;
}
> +
/* Combine the low words: the low word of fs stays in bits 0..31 and the
   low word of ft is placed in bits 32..63.  */
uint64_t helper_punpcklwd(uint64_t fs, uint64_t ft)
{
    uint64_t low = (uint32_t)fs;
    uint64_t high = ft << 32;

    return high | low;
}
> +
/* Combine the high words: the high word of fs moves to bits 0..31 and
   the high word of ft stays in bits 32..63.  */
uint64_t helper_punpckhwd(uint64_t fs, uint64_t ft)
{
    uint64_t low = fs >> 32;
    uint64_t high = (ft >> 32) << 32;

    return high | low;
}
> +
> +uint64_t helper_punpcklhw(uint64_t fs, uint64_t ft)
> +{
> +    unsigned host = BYTE_ORDER_XOR(3);
> +    LMIValue vd, vs, vt;
> +
> +    vs.d = fs;
> +    vt.d = ft;
> +    vd.uh[0 ^ host] = vs.uh[0 ^ host];
> +    vd.uh[1 ^ host] = vt.uh[0 ^ host];
> +    vd.uh[2 ^ host] = vs.uh[1 ^ host];
> +    vd.uh[3 ^ host] = vt.uh[1 ^ host];
> +
> +    return vd.d;
> +}
> +
> +uint64_t helper_punpckhhw(uint64_t fs, uint64_t ft)
> +{
> +    unsigned host = BYTE_ORDER_XOR(3);
> +    LMIValue vd, vs, vt;
> +
> +    vs.d = fs;
> +    vt.d = ft;
> +    vd.uh[0 ^ host] = vs.uh[2 ^ host];
> +    vd.uh[1 ^ host] = vt.uh[2 ^ host];
> +    vd.uh[2 ^ host] = vs.uh[3 ^ host];
> +    vd.uh[3 ^ host] = vt.uh[3 ^ host];
> +
> +    return vd.d;
> +}
> +
> +uint64_t helper_punpcklbh(uint64_t fs, uint64_t ft)
> +{
> +    unsigned host = BYTE_ORDER_XOR(7);
> +    LMIValue vd, vs, vt;
> +
> +    vs.d = fs;
> +    vt.d = ft;
> +    vd.ub[0 ^ host] = vs.ub[0 ^ host];
> +    vd.ub[1 ^ host] = vt.ub[0 ^ host];
> +    vd.ub[2 ^ host] = vs.ub[1 ^ host];
> +    vd.ub[3 ^ host] = vt.ub[1 ^ host];
> +    vd.ub[4 ^ host] = vs.ub[2 ^ host];
> +    vd.ub[5 ^ host] = vt.ub[2 ^ host];
> +    vd.ub[6 ^ host] = vs.ub[3 ^ host];
> +    vd.ub[7 ^ host] = vt.ub[3 ^ host];
> +
> +    return vd.d;
> +}
> +
> +uint64_t helper_punpckhbh(uint64_t fs, uint64_t ft)
> +{
> +    unsigned host = BYTE_ORDER_XOR(7);
> +    LMIValue vd, vs, vt;
> +
> +    vs.d = fs;
> +    vt.d = ft;
> +    vd.ub[0 ^ host] = vs.ub[4 ^ host];
> +    vd.ub[1 ^ host] = vt.ub[4 ^ host];
> +    vd.ub[2 ^ host] = vs.ub[5 ^ host];
> +    vd.ub[3 ^ host] = vt.ub[5 ^ host];
> +    vd.ub[4 ^ host] = vs.ub[6 ^ host];
> +    vd.ub[5 ^ host] = vt.ub[6 ^ host];
> +    vd.ub[6 ^ host] = vs.ub[7 ^ host];
> +    vd.ub[7 ^ host] = vt.ub[7 ^ host];
> +
> +    return vd.d;
> +}
> +
> +uint64_t helper_pavgh(uint64_t fs, uint64_t ft)
> +{
> +    LMIValue vs, vt;
> +    unsigned i;
> +
> +    vs.d = fs;
> +    vt.d = ft;
> +    for (i = 0; i < 4; i++) {
> +        vs.uh[i] = (vs.uh[i] + vt.uh[i] + 1) >> 1;
> +    }
> +    return vs.d;
> +}
> +
> +uint64_t helper_pavgb(uint64_t fs, uint64_t ft)
> +{
> +    LMIValue vs, vt;
> +    unsigned i;
> +
> +    vs.d = fs;
> +    vt.d = ft;
> +    for (i = 0; i < 8; i++) {
> +        vs.ub[i] = (vs.ub[i] + vt.ub[i] + 1) >> 1;
> +    }
> +    return vs.d;
> +}
> +
> +uint64_t helper_pmaxsh(uint64_t fs, uint64_t ft)
> +{
> +    LMIValue vs, vt;
> +    unsigned i;
> +
> +    vs.d = fs;
> +    vt.d = ft;
> +    for (i = 0; i < 4; i++) {
> +        vs.sh[i] = (vs.sh[i] >= vt.sh[i] ? vs.sh[i] : vt.sh[i]);
> +    }
> +    return vs.d;
> +}
> +
> +uint64_t helper_pminsh(uint64_t fs, uint64_t ft)
> +{
> +    LMIValue vs, vt;
> +    unsigned i;
> +
> +    vs.d = fs;
> +    vt.d = ft;
> +    for (i = 0; i < 4; i++) {
> +        vs.sh[i] = (vs.sh[i] <= vt.sh[i] ? vs.sh[i] : vt.sh[i]);
> +    }
> +    return vs.d;
> +}
> +
/*
 * Per-lane maximum of the eight unsigned bytes of fs and ft.
 *
 * All eight byte lanes are processed; the previous loop stopped after
 * four, leaving half of the result as an unmodified copy of fs.
 *
 * Works on shifted lanes of the 64-bit values, so it is independent of
 * host byte order.
 */
uint64_t helper_pmaxub(uint64_t fs, uint64_t ft)
{
    uint64_t fd = 0;
    unsigned int shift;

    for (shift = 0; shift < 64; shift += 8) {
        uint64_t a = (fs >> shift) & 0xff;
        uint64_t b = (ft >> shift) & 0xff;

        fd |= (a >= b ? a : b) << shift;
    }
    return fd;
}
> +
/*
 * Per-lane minimum of the eight unsigned bytes of fs and ft.
 *
 * All eight byte lanes are processed; the previous loop stopped after
 * four, leaving half of the result as an unmodified copy of fs.
 *
 * Works on shifted lanes of the 64-bit values, so it is independent of
 * host byte order.
 */
uint64_t helper_pminub(uint64_t fs, uint64_t ft)
{
    uint64_t fd = 0;
    unsigned int shift;

    for (shift = 0; shift < 64; shift += 8) {
        uint64_t a = (fs >> shift) & 0xff;
        uint64_t b = (ft >> shift) & 0xff;

        fd |= (a <= b ? a : b) << shift;
    }
    return fd;
}
> +
> +uint64_t helper_pcmpeqw(uint64_t fs, uint64_t ft)
> +{
> +    LMIValue vs, vt;
> +    unsigned i;
> +
> +    vs.d = fs;
> +    vt.d = ft;
> +    for (i = 0; i < 2; i++) {
> +        vs.uw[i] = -(vs.uw[i] == vt.uw[i]);
> +    }
> +    return vs.d;
> +}
> +
> +uint64_t helper_pcmpgtw(uint64_t fs, uint64_t ft)
> +{
> +    LMIValue vs, vt;
> +    unsigned i;
> +
> +    vs.d = fs;
> +    vt.d = ft;
> +    for (i = 0; i < 2; i++) {
> +        vs.uw[i] = -(vs.uw[i] > vt.uw[i]);
> +    }
> +    return vs.d;
> +}
> +
> +uint64_t helper_pcmpeqh(uint64_t fs, uint64_t ft)
> +{
> +    LMIValue vs, vt;
> +    unsigned i;
> +
> +    vs.d = fs;
> +    vt.d = ft;
> +    for (i = 0; i < 4; i++) {
> +        vs.uh[i] = -(vs.uh[i] == vt.uh[i]);
> +    }
> +    return vs.d;
> +}
> +
> +uint64_t helper_pcmpgth(uint64_t fs, uint64_t ft)
> +{
> +    LMIValue vs, vt;
> +    unsigned i;
> +
> +    vs.d = fs;
> +    vt.d = ft;
> +    for (i = 0; i < 4; i++) {
> +        vs.uh[i] = -(vs.uh[i] > vt.uh[i]);
> +    }
> +    return vs.d;
> +}
> +
> +uint64_t helper_pcmpeqb(uint64_t fs, uint64_t ft)
> +{
> +    LMIValue vs, vt;
> +    unsigned i;
> +
> +    vs.d = fs;
> +    vt.d = ft;
> +    for (i = 0; i < 8; i++) {
> +        vs.ub[i] = -(vs.ub[i] == vt.ub[i]);
> +    }
> +    return vs.d;
> +}
> +
> +uint64_t helper_pcmpgtb(uint64_t fs, uint64_t ft)
> +{
> +    LMIValue vs, vt;
> +    unsigned i;
> +
> +    vs.d = fs;
> +    vt.d = ft;
> +    for (i = 0; i < 8; i++) {
> +        vs.ub[i] = -(vs.ub[i] > vt.ub[i]);
> +    }
> +    return vs.d;
> +}
> +
> +uint64_t helper_psllw(uint64_t fs, uint64_t ft)
> +{
> +    LMIValue vs;
> +    unsigned i;
> +
> +    ft &= 0x7f;
> +    if (ft > 31) {
> +        return 0;
> +    }
> +    vs.d = fs;
> +    for (i = 0; i < 2; ++i) {
> +        vs.uw[i] <<= ft;
> +    }
> +    return vs.d;
> +}
> +
> +uint64_t helper_psrlw(uint64_t fs, uint64_t ft)
> +{
> +    LMIValue vs;
> +    unsigned i;
> +
> +    ft &= 0x7f;
> +    if (ft > 31) {
> +        return 0;
> +    }
> +    vs.d = fs;
> +    for (i = 0; i < 2; ++i) {
> +        vs.uw[i] >>= ft;
> +    }
> +    return vs.d;
> +}
> +
> +uint64_t helper_psraw(uint64_t fs, uint64_t ft)
> +{
> +    LMIValue vs;
> +    unsigned i;
> +
> +    ft &= 0x7f;
> +    if (ft > 31) {
> +        ft = 31;
> +    }
> +    vs.d = fs;
> +    for (i = 0; i < 2; ++i) {
> +        vs.sw[i] >>= ft;
> +    }
> +    return vs.d;
> +}
> +
> +uint64_t helper_psllh(uint64_t fs, uint64_t ft)
> +{
> +    LMIValue vs;
> +    unsigned i;
> +
> +    ft &= 0x7f;
> +    if (ft > 15) {
> +        return 0;
> +    }
> +    vs.d = fs;
> +    for (i = 0; i < 4; ++i) {
> +        vs.uh[i] <<= ft;
> +    }
> +    return vs.d;
> +}
> +
> +uint64_t helper_psrlh(uint64_t fs, uint64_t ft)
> +{
> +    LMIValue vs;
> +    unsigned i;
> +
> +    ft &= 0x7f;
> +    if (ft > 15) {
> +        return 0;
> +    }
> +    vs.d = fs;
> +    for (i = 0; i < 4; ++i) {
> +        vs.uh[i] >>= ft;
> +    }
> +    return vs.d;
> +}
> +
> +uint64_t helper_psrah(uint64_t fs, uint64_t ft)
> +{
> +    LMIValue vs;
> +    unsigned i;
> +
> +    ft &= 0x7f;
> +    if (ft > 15) {
> +        ft = 15;
> +    }
> +    vs.d = fs;
> +    for (i = 0; i < 4; ++i) {
> +        vs.sh[i] >>= ft;
> +    }
> +    return vs.d;
> +}
> +
> +uint64_t helper_pmullh(uint64_t fs, uint64_t ft)
> +{
> +    LMIValue vs, vt;
> +    unsigned i;
> +
> +    vs.d = fs;
> +    vt.d = ft;
> +    for (i = 0; i < 4; ++i) {
> +        vs.sh[i] *= vt.sh[i];
> +    }
> +    return vs.d;
> +}
> +
> +uint64_t helper_pmulhh(uint64_t fs, uint64_t ft)
> +{
> +    LMIValue vs, vt;
> +    unsigned i;
> +
> +    vs.d = fs;
> +    vt.d = ft;
> +    for (i = 0; i < 4; ++i) {
> +        int32_t r = vs.sh[i] * vt.sh[i];
> +        vs.sh[i] = r >> 16;
> +    }
> +    return vs.d;
> +}
> +
/*
 * Multiply the four unsigned halfword lanes of fs and ft and keep the
 * high 16 bits of each 32-bit product.
 *
 * The operands are widened to uint32_t before multiplying.  The previous
 * code multiplied two uint16_t values directly; those promote to (signed)
 * int, so products of 0x80000000 or above overflowed, which is undefined
 * behaviour in C.
 *
 * Works on shifted lanes of the 64-bit values, so it is independent of
 * host byte order.
 */
uint64_t helper_pmulhuh(uint64_t fs, uint64_t ft)
{
    uint64_t fd = 0;
    unsigned int shift;

    for (shift = 0; shift < 64; shift += 16) {
        uint32_t a = (uint32_t)((fs >> shift) & 0xffff);
        uint32_t b = (uint32_t)((ft >> shift) & 0xffff);
        uint32_t product = a * b;

        fd |= (uint64_t)(product >> 16) << shift;
    }
    return fd;
}
> +
> +uint64_t helper_pmaddhw(uint64_t fs, uint64_t ft)
> +{
> +    unsigned host = BYTE_ORDER_XOR(3);
> +    LMIValue vs, vt;
> +    uint32_t p0, p1;
> +
> +    vs.d = fs;
> +    vt.d = ft;
> +    p0  = vs.sh[0 ^ host] * vt.sh[0 ^ host];
> +    p0 += vs.sh[1 ^ host] * vt.sh[1 ^ host];
> +    p1  = vs.sh[2 ^ host] * vt.sh[2 ^ host];
> +    p1 += vs.sh[3 ^ host] * vt.sh[3 ^ host];
> +
> +    return ((uint64_t)p1 << 32) | p0;
> +}
> +
> +uint64_t helper_pasubub(uint64_t fs, uint64_t ft)
> +{
> +    LMIValue vs, vt;
> +    unsigned i;
> +
> +    vs.d = fs;
> +    vt.d = ft;
> +    for (i = 0; i < 8; ++i) {
> +        int r = vs.ub[i] - vt.ub[i];
> +        vs.ub[i] = (r < 0 ? -r : r);
> +    }
> +    return vs.d;
> +}
> +
/* Sum the eight bytes of fs; the total is returned masked to 16 bits.  */
uint64_t helper_biadd(uint64_t fs)
{
    unsigned total = 0;

    while (fs != 0) {
        total += fs & 0xff;
        fs >>= 8;
    }
    return total & 0xffff;
}
> +
/* Gather the top bit of each byte of fs into an 8-bit mask: bit N of the
   result is bit 7 of byte N.  */
uint64_t helper_pmovmskb(uint64_t fs)
{
    unsigned mask = 0;
    unsigned lane;

    for (lane = 0; lane < 8; lane++) {
        mask |= ((fs >> (8 * lane + 7)) & 1) << lane;
    }

    return mask & 0xff;
}
> diff --git a/target-mips/translate.c b/target-mips/translate.c
> index 57454f0..ac941e6 100644
> --- a/target-mips/translate.c
> +++ b/target-mips/translate.c
> @@ -446,6 +446,103 @@ enum {
>      OPC_BC2     = (0x08 << 21) | OPC_CP2,
>  };
>  
/* Keep the major opcode plus the 5-bit field at bit 21 and the low 5 bits
   of the instruction word; these are the fields the OPC_*_CP2 / LMI
   constants below are built from.  The argument is parenthesized so that
   an expression argument is masked as a whole.  */
#define MASK_LMI(op) \
    (MASK_OP_MAJOR(op) | ((op) & (0x1F << 21)) | ((op) & 0x1F))
> +
> +enum {
> +    OPC_PADDSH  = (24 << 21) | (0x00) | OPC_CP2,
> +    OPC_PADDUSH = (25 << 21) | (0x00) | OPC_CP2,
> +    OPC_PADDH   = (26 << 21) | (0x00) | OPC_CP2,
> +    OPC_PADDW   = (27 << 21) | (0x00) | OPC_CP2,
> +    OPC_PADDSB  = (28 << 21) | (0x00) | OPC_CP2,
> +    OPC_PADDUSB = (29 << 21) | (0x00) | OPC_CP2,
> +    OPC_PADDB   = (30 << 21) | (0x00) | OPC_CP2,
> +    OPC_PADDD   = (31 << 21) | (0x00) | OPC_CP2,
> +
> +    OPC_PSUBSH  = (24 << 21) | (0x01) | OPC_CP2,
> +    OPC_PSUBUSH = (25 << 21) | (0x01) | OPC_CP2,
> +    OPC_PSUBH   = (26 << 21) | (0x01) | OPC_CP2,
> +    OPC_PSUBW   = (27 << 21) | (0x01) | OPC_CP2,
> +    OPC_PSUBSB  = (28 << 21) | (0x01) | OPC_CP2,
> +    OPC_PSUBUSB = (29 << 21) | (0x01) | OPC_CP2,
> +    OPC_PSUBB   = (30 << 21) | (0x01) | OPC_CP2,
> +    OPC_PSUBD   = (31 << 21) | (0x01) | OPC_CP2,
> +
> +    OPC_PSHUFH   = (24 << 21) | (0x02) | OPC_CP2,
> +    OPC_PACKSSWH = (25 << 21) | (0x02) | OPC_CP2,
> +    OPC_PACKSSHB = (26 << 21) | (0x02) | OPC_CP2,
> +    OPC_PACKUSHB = (27 << 21) | (0x02) | OPC_CP2,
> +    OPC_XOR_CP2  = (28 << 21) | (0x02) | OPC_CP2,
> +    OPC_NOR_CP2  = (29 << 21) | (0x02) | OPC_CP2,
> +    OPC_AND_CP2  = (30 << 21) | (0x02) | OPC_CP2,
> +    OPC_PANDN    = (31 << 21) | (0x02) | OPC_CP2,
> +
> +    OPC_PUNPCKLHW = (24 << 21) | (0x03) | OPC_CP2,
> +    OPC_PUNPCKHHW = (25 << 21) | (0x03) | OPC_CP2,
> +    OPC_PUNPCKLBH = (26 << 21) | (0x03) | OPC_CP2,
> +    OPC_PUNPCKHBH = (27 << 21) | (0x03) | OPC_CP2,
> +    OPC_PINSRH_0  = (28 << 21) | (0x03) | OPC_CP2,
> +    OPC_PINSRH_1  = (29 << 21) | (0x03) | OPC_CP2,
> +    OPC_PINSRH_2  = (30 << 21) | (0x03) | OPC_CP2,
> +    OPC_PINSRH_3  = (31 << 21) | (0x03) | OPC_CP2,
> +
> +    OPC_PAVGH   = (24 << 21) | (0x08) | OPC_CP2,
> +    OPC_PAVGB   = (25 << 21) | (0x08) | OPC_CP2,
> +    OPC_PMAXSH  = (26 << 21) | (0x08) | OPC_CP2,
> +    OPC_PMINSH  = (27 << 21) | (0x08) | OPC_CP2,
> +    OPC_PMAXUB  = (28 << 21) | (0x08) | OPC_CP2,
> +    OPC_PMINUB  = (29 << 21) | (0x08) | OPC_CP2,
> +
> +    OPC_PCMPEQW = (24 << 21) | (0x09) | OPC_CP2,
> +    OPC_PCMPGTW = (25 << 21) | (0x09) | OPC_CP2,
> +    OPC_PCMPEQH = (26 << 21) | (0x09) | OPC_CP2,
> +    OPC_PCMPGTH = (27 << 21) | (0x09) | OPC_CP2,
> +    OPC_PCMPEQB = (28 << 21) | (0x09) | OPC_CP2,
> +    OPC_PCMPGTB = (29 << 21) | (0x09) | OPC_CP2,
> +
> +    OPC_PSLLW   = (24 << 21) | (0x0A) | OPC_CP2,
> +    OPC_PSLLH   = (25 << 21) | (0x0A) | OPC_CP2,
> +    OPC_PMULLH  = (26 << 21) | (0x0A) | OPC_CP2,
> +    OPC_PMULHH  = (27 << 21) | (0x0A) | OPC_CP2,
> +    OPC_PMULUW  = (28 << 21) | (0x0A) | OPC_CP2,
> +    OPC_PMULHUH = (29 << 21) | (0x0A) | OPC_CP2,
> +
> +    OPC_PSRLW     = (24 << 21) | (0x0B) | OPC_CP2,
> +    OPC_PSRLH     = (25 << 21) | (0x0B) | OPC_CP2,
> +    OPC_PSRAW     = (26 << 21) | (0x0B) | OPC_CP2,
> +    OPC_PSRAH     = (27 << 21) | (0x0B) | OPC_CP2,
> +    OPC_PUNPCKLWD = (28 << 21) | (0x0B) | OPC_CP2,
> +    OPC_PUNPCKHWD = (29 << 21) | (0x0B) | OPC_CP2,
> +
> +    OPC_ADDU_CP2 = (24 << 21) | (0x0C) | OPC_CP2,
> +    OPC_OR_CP2   = (25 << 21) | (0x0C) | OPC_CP2,
> +    OPC_ADD_CP2  = (26 << 21) | (0x0C) | OPC_CP2,
> +    OPC_DADD_CP2 = (27 << 21) | (0x0C) | OPC_CP2,
> +    OPC_SEQU_CP2 = (28 << 21) | (0x0C) | OPC_CP2,
> +    OPC_SEQ_CP2  = (29 << 21) | (0x0C) | OPC_CP2,
> +
> +    OPC_SUBU_CP2 = (24 << 21) | (0x0D) | OPC_CP2,
> +    OPC_PASUBUB  = (25 << 21) | (0x0D) | OPC_CP2,
> +    OPC_SUB_CP2  = (26 << 21) | (0x0D) | OPC_CP2,
> +    OPC_DSUB_CP2 = (27 << 21) | (0x0D) | OPC_CP2,
> +    OPC_SLTU_CP2 = (28 << 21) | (0x0D) | OPC_CP2,
> +    OPC_SLT_CP2  = (29 << 21) | (0x0D) | OPC_CP2,
> +
> +    OPC_SLL_CP2  = (24 << 21) | (0x0E) | OPC_CP2,
> +    OPC_DSLL_CP2 = (25 << 21) | (0x0E) | OPC_CP2,
> +    OPC_PEXTRH   = (26 << 21) | (0x0E) | OPC_CP2,
> +    OPC_PMADDHW  = (27 << 21) | (0x0E) | OPC_CP2,
> +    OPC_SLEU_CP2 = (28 << 21) | (0x0E) | OPC_CP2,
> +    OPC_SLE_CP2  = (29 << 21) | (0x0E) | OPC_CP2,
> +
> +    OPC_SRL_CP2  = (24 << 21) | (0x0F) | OPC_CP2,
> +    OPC_DSRL_CP2 = (25 << 21) | (0x0F) | OPC_CP2,
> +    OPC_SRA_CP2  = (26 << 21) | (0x0F) | OPC_CP2,
> +    OPC_DSRA_CP2 = (27 << 21) | (0x0F) | OPC_CP2,
> +    OPC_BIADD    = (28 << 21) | (0x0F) | OPC_CP2,
> +    OPC_PMOVMSKB = (29 << 21) | (0x0F) | OPC_CP2,
> +};
> +
> +
>  #define MASK_CP3(op)       MASK_OP_MAJOR(op) | (op & 0x3F)
>  
>  enum {
> @@ -2424,8 +2521,8 @@ static void gen_cl (DisasContext *ctx, uint32_t opc,
>  }
>  
>  /* Godson integer instructions */
> -static void gen_loongson_integer (DisasContext *ctx, uint32_t opc,
> -                                int rd, int rs, int rt)
> +static void gen_loongson_integer(DisasContext *ctx, uint32_t opc,
> +                                 int rd, int rs, int rt)
>  {
>      const char *opn = "loongson";
>      TCGv t0, t1;
> @@ -2637,6 +2734,278 @@ static void gen_loongson_integer (DisasContext *ctx, uint32_t opc,
>      tcg_temp_free(t1);
>  }
>  
> +/* Loongson multimedia instructions */
> +static void gen_loongson_multimedia(DisasContext *ctx, int rd, int rs, int rt)
> +{
> +    const char *opn = "loongson_cp2";
> +    uint32_t opc, shift_max;
> +    TCGv_i64 t0, t1;
> +
> +    opc = MASK_LMI(ctx->opcode);
> +    switch (opc) {
> +    case OPC_ADD_CP2:
> +    case OPC_SUB_CP2:
> +    case OPC_DADD_CP2:
> +    case OPC_DSUB_CP2:
> +        t0 = tcg_temp_local_new_i64();
> +        t1 = tcg_temp_local_new_i64();
> +        break;
> +    default:
> +        t0 = tcg_temp_new_i64();
> +        t1 = tcg_temp_new_i64();
> +        break;
> +    }
> +
> +    gen_load_fpr64(ctx, t0, rs);
> +    gen_load_fpr64(ctx, t1, rt);
> +
> +#define LMI_HELPER(UP, LO) \
> +    case OPC_##UP: gen_helper_##LO(t0, t0, t1); opn = #LO; break
> +#define LMI_HELPER_1(UP, LO) \
> +    case OPC_##UP: gen_helper_##LO(t0, t0); opn = #LO; break
> +#define LMI_DIRECT(UP, LO, OP) \
> +    case OPC_##UP: tcg_gen_##OP##_i64(t0, t0, t1); opn = #LO; break
> +
> +    switch (opc) {
> +    LMI_HELPER(PADDSH, paddsh);
> +    LMI_HELPER(PADDUSH, paddush);
> +    LMI_HELPER(PADDH, paddh);
> +    LMI_HELPER(PADDW, paddw);
> +    LMI_HELPER(PADDSB, paddsb);
> +    LMI_HELPER(PADDUSB, paddusb);
> +    LMI_HELPER(PADDB, paddb);
> +
> +    LMI_HELPER(PSUBSH, psubsh);
> +    LMI_HELPER(PSUBUSH, psubush);
> +    LMI_HELPER(PSUBH, psubh);
> +    LMI_HELPER(PSUBW, psubw);
> +    LMI_HELPER(PSUBSB, psubsb);
> +    LMI_HELPER(PSUBUSB, psubusb);
> +    LMI_HELPER(PSUBB, psubb);
> +
> +    LMI_HELPER(PSHUFH, pshufh);
> +    LMI_HELPER(PACKSSWH, packsswh);
> +    LMI_HELPER(PACKSSHB, packsshb);
> +    LMI_HELPER(PACKUSHB, packushb);
> +
> +    LMI_HELPER(PUNPCKLHW, punpcklhw);
> +    LMI_HELPER(PUNPCKHHW, punpckhhw);
> +    LMI_HELPER(PUNPCKLBH, punpcklbh);
> +    LMI_HELPER(PUNPCKHBH, punpckhbh);
> +    LMI_HELPER(PUNPCKLWD, punpcklwd);
> +    LMI_HELPER(PUNPCKHWD, punpckhwd);
> +
> +    LMI_HELPER(PAVGH, pavgh);
> +    LMI_HELPER(PAVGB, pavgb);
> +    LMI_HELPER(PMAXSH, pmaxsh);
> +    LMI_HELPER(PMINSH, pminsh);
> +    LMI_HELPER(PMAXUB, pmaxub);
> +    LMI_HELPER(PMINUB, pminub);
> +
> +    LMI_HELPER(PCMPEQW, pcmpeqw);
> +    LMI_HELPER(PCMPGTW, pcmpgtw);
> +    LMI_HELPER(PCMPEQH, pcmpeqh);
> +    LMI_HELPER(PCMPGTH, pcmpgth);
> +    LMI_HELPER(PCMPEQB, pcmpeqb);
> +    LMI_HELPER(PCMPGTB, pcmpgtb);
> +
> +    LMI_HELPER(PSLLW, psllw);
> +    LMI_HELPER(PSLLH, psllh);
> +    LMI_HELPER(PSRLW, psrlw);
> +    LMI_HELPER(PSRLH, psrlh);
> +    LMI_HELPER(PSRAW, psraw);
> +    LMI_HELPER(PSRAH, psrah);
> +
> +    LMI_HELPER(PMULLH, pmullh);
> +    LMI_HELPER(PMULHH, pmulhh);
> +    LMI_HELPER(PMULHUH, pmulhuh);
> +    LMI_HELPER(PMADDHW, pmaddhw);
> +
> +    LMI_HELPER(PASUBUB, pasubub);
> +    LMI_HELPER_1(BIADD, biadd);
> +    LMI_HELPER_1(PMOVMSKB, pmovmskb);
> +
> +    LMI_DIRECT(PADDD, paddd, add);
> +    LMI_DIRECT(PSUBD, psubd, sub);
> +    LMI_DIRECT(XOR_CP2, xor, xor);
> +    LMI_DIRECT(NOR_CP2, nor, nor);
> +    LMI_DIRECT(AND_CP2, and, and);
> +    LMI_DIRECT(PANDN, pandn, andc);
> +    LMI_DIRECT(OR, or, or);
> +
> +    case OPC_PINSRH_0:
> +        tcg_gen_deposit_i64(t0, t0, t1, 0, 16);
> +        opn = "pinsrh_0";
> +        break;
> +    case OPC_PINSRH_1:
> +        tcg_gen_deposit_i64(t0, t0, t1, 16, 16);
> +        opn = "pinsrh_1";
> +        break;
> +    case OPC_PINSRH_2:
> +        tcg_gen_deposit_i64(t0, t0, t1, 32, 16);
> +        opn = "pinsrh_2";
> +        break;
> +    case OPC_PINSRH_3:
> +        tcg_gen_deposit_i64(t0, t0, t1, 48, 16);
> +        opn = "pinsrh_3";
> +        break;
> +
> +    case OPC_PEXTRH:
> +        tcg_gen_andi_i64(t1, t1, 3);
> +        tcg_gen_shli_i64(t1, t1, 4);
> +        tcg_gen_shr_i64(t0, t0, t1);
> +        tcg_gen_ext16u_i64(t0, t0);
> +        opn = "pextrh";
> +        break;
> +
> +    case OPC_ADDU_CP2:
> +        tcg_gen_add_i64(t0, t0, t1);
> +        tcg_gen_ext32s_i64(t0, t0);
> +        opn = "addu";
> +        break;
> +    case OPC_SUBU_CP2:
> +        tcg_gen_sub_i64(t0, t0, t1);
> +        tcg_gen_ext32s_i64(t0, t0);
> +        opn = "addu";
> +        break;
> +
> +    case OPC_SLL_CP2:
> +        opn = "sll";
> +        shift_max = 32;
> +        goto do_shift;
> +    case OPC_SRL_CP2:
> +        opn = "srl";
> +        shift_max = 32;
> +        goto do_shift;
> +    case OPC_SRA_CP2:
> +        opn = "sra";
> +        shift_max = 32;
> +        goto do_shift;
> +    case OPC_DSLL_CP2:
> +        opn = "dsll";
> +        shift_max = 64;
> +        goto do_shift;
> +    case OPC_DSRL_CP2:
> +        opn = "dsrl";
> +        shift_max = 64;
> +        goto do_shift;
> +    case OPC_DSRA_CP2:
> +        opn = "dsra";
> +        shift_max = 64;
> +        goto do_shift;
> +    do_shift:
> +        /* Make sure shift count isn't TCG undefined behaviour.  */
> +        tcg_gen_andi_i64(t1, t1, shift_max - 1);
> +
> +        switch (opc) {
> +        case OPC_SLL_CP2:
> +        case OPC_DSLL_CP2:
> +            tcg_gen_shl_i64(t0, t0, t1);
> +            break;
> +        case OPC_SRA_CP2:
> +        case OPC_DSRA_CP2:
> +            /* Since SRA is UndefinedResult without sign-extended inputs,
> +               we can treat SRA and DSRA the same.  */
> +            tcg_gen_sar_i64(t0, t0, t1);
> +            break;
> +        case OPC_SRL_CP2:
> +            /* We want to shift in zeros for SRL; zero-extend first.  */
> +            tcg_gen_ext32u_i64(t0, t0);
> +            /* FALLTHRU */
> +        case OPC_DSRL_CP2:
> +            tcg_gen_shr_i64(t0, t0, t1);
> +            break;
> +        }
> +
> +        if (shift_max == 32) {
> +            tcg_gen_ext32s_i64(t0, t0);
> +        }
> +
> +        /* Shifts larger than MAX produce zero.  */
> +        tcg_gen_setcondi_i64(TCG_COND_LTU, t1, t1, shift_max);
> +        tcg_gen_neg_i64(t1, t1);
> +        tcg_gen_and_i64(t0, t0, t1);
> +        break;
> +
> +    case OPC_ADD_CP2:
> +    case OPC_DADD_CP2:
> +        {
> +            TCGv_i64 t2 = tcg_temp_new_i64();
> +            int lab = gen_new_label();
> +
> +            tcg_gen_mov_i64(t2, t0);
> +            tcg_gen_add_i64(t0, t1, t2);
> +            if (opc == OPC_ADD_CP2) {
> +                tcg_gen_ext32s_i64(t0, t0);
> +            }
> +            tcg_gen_xor_i64(t1, t1, t2);
> +            tcg_gen_xor_i64(t2, t2, t0);
> +            tcg_gen_andc_i64(t1, t2, t1);
> +            tcg_temp_free_i64(t2);
> +            tcg_gen_brcondi_i64(TCG_COND_GE, t1, 0, lab);
> +            generate_exception(ctx, EXCP_OVERFLOW);
> +            gen_set_label(lab);
> +
> +            opn = (opc == OPC_ADD_CP2 ? "add" : "dadd");
> +            break;
> +        }
> +
> +    case OPC_SUB_CP2:
> +    case OPC_DSUB_CP2:
> +        {
> +            TCGv_i64 t2 = tcg_temp_new_i64();
> +            int lab = gen_new_label();
> +
> +            tcg_gen_mov_i64(t2, t0);
> +            tcg_gen_sub_i64(t0, t1, t2);
> +            if (opc == OPC_SUB_CP2) {
> +                tcg_gen_ext32s_i64(t0, t0);
> +            }
> +            tcg_gen_xor_i64(t1, t1, t2);
> +            tcg_gen_xor_i64(t2, t2, t0);
> +            tcg_gen_and_i64(t1, t1, t2);
> +            tcg_temp_free_i64(t2);
> +            tcg_gen_brcondi_i64(TCG_COND_GE, t1, 0, lab);
> +            generate_exception(ctx, EXCP_OVERFLOW);
> +            gen_set_label(lab);
> +
> +            opn = (opc == OPC_SUB_CP2 ? "sub" : "dsub");
> +            break;
> +        }
> +
> +    case OPC_PMULUW:
> +        tcg_gen_ext32u_i64(t0, t0);
> +        tcg_gen_ext32u_i64(t1, t1);
> +        tcg_gen_mul_i64(t0, t0, t1);
> +        opn = "pmuluw";
> +        break;
> +
> +    case OPC_SEQU_CP2:
> +    case OPC_SEQ_CP2:
> +    case OPC_SLTU_CP2:
> +    case OPC_SLT_CP2:
> +    case OPC_SLEU_CP2:
> +    case OPC_SLE_CP2:
> +        /* ??? Document is unclear: Set FCC[CC].  Does that mean the
> +           FD field is the CC field?  */
> +    default:
> +        MIPS_INVAL(opn);
> +        generate_exception(ctx, EXCP_RI);
> +        return;
> +    }
> +
> +#undef LMI_HELPER
> +#undef LMI_DIRECT
> +
> +    gen_store_fpr64(ctx, t0, rd);
> +
> +    (void)opn; /* avoid a compiler warning */
> +    MIPS_DEBUG("%s %s, %s, %s", opn,
> +               fregnames[rd], fregnames[rs], fregnames[rt]);
> +    tcg_temp_free_i64(t0);
> +    tcg_temp_free_i64(t1);
> +}
> +
>  /* Traps */
>  static void gen_trap (DisasContext *ctx, uint32_t opc,
>                        int rs, int rt, int16_t imm)
> @@ -12344,10 +12713,14 @@ static void decode_opc (CPUMIPSState *env, DisasContext *ctx, int *is_branch)
>      case OPC_LDC2:
>      case OPC_SWC2:
>      case OPC_SDC2:
> -    case OPC_CP2:
>          /* COP2: Not implemented. */
>          generate_exception_err(ctx, EXCP_CpU, 2);
>          break;
> +    case OPC_CP2:
> +        check_insn(env, ctx, INSN_LOONGSON2F);
> +        /* Note that these instructions use different fields.  */
> +        gen_loongson_multimedia(ctx, sa, rd, rt);
> +        break;
>  
>      case OPC_CP3:
>          if (env->CP0_Config1 & (1 << CP0C1_FP)) {
> -- 
> 1.7.11.4
> 

I haven't looked at all the instructions in detail, but it looks fine to me.
Would it be possible to repost this patch without requiring the FPR TCG
patches to be applied first, so that it can be merged separately?

Patch

diff --git a/target-mips/Makefile.objs b/target-mips/Makefile.objs
index ca20f21..3eeeeac 100644
--- a/target-mips/Makefile.objs
+++ b/target-mips/Makefile.objs
@@ -1,2 +1,2 @@ 
-obj-y += translate.o op_helper.o helper.o cpu.o
+obj-y += translate.o op_helper.o lmi_helper.o helper.o cpu.o
 obj-$(CONFIG_SOFTMMU) += machine.o
diff --git a/target-mips/helper.h b/target-mips/helper.h
index 109ac37..f35ed78 100644
--- a/target-mips/helper.h
+++ b/target-mips/helper.h
@@ -303,4 +303,63 @@  DEF_HELPER_1(rdhwr_ccres, tl, env)
 DEF_HELPER_2(pmon, void, env, int)
 DEF_HELPER_1(wait, void, env)
 
+/* Loongson multimedia functions.  */
+DEF_HELPER_FLAGS_2(paddsh, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+DEF_HELPER_FLAGS_2(paddush, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+DEF_HELPER_FLAGS_2(paddh, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+DEF_HELPER_FLAGS_2(paddw, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+DEF_HELPER_FLAGS_2(paddsb, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+DEF_HELPER_FLAGS_2(paddusb, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+DEF_HELPER_FLAGS_2(paddb, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+
+DEF_HELPER_FLAGS_2(psubsh, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+DEF_HELPER_FLAGS_2(psubush, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+DEF_HELPER_FLAGS_2(psubh, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+DEF_HELPER_FLAGS_2(psubw, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+DEF_HELPER_FLAGS_2(psubsb, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+DEF_HELPER_FLAGS_2(psubusb, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+DEF_HELPER_FLAGS_2(psubb, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+
+DEF_HELPER_FLAGS_2(pshufh, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+DEF_HELPER_FLAGS_2(packsswh, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+DEF_HELPER_FLAGS_2(packsshb, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+DEF_HELPER_FLAGS_2(packushb, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+
+DEF_HELPER_FLAGS_2(punpcklhw, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+DEF_HELPER_FLAGS_2(punpckhhw, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+DEF_HELPER_FLAGS_2(punpcklbh, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+DEF_HELPER_FLAGS_2(punpckhbh, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+DEF_HELPER_FLAGS_2(punpcklwd, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+DEF_HELPER_FLAGS_2(punpckhwd, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+
+DEF_HELPER_FLAGS_2(pavgh, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+DEF_HELPER_FLAGS_2(pavgb, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+DEF_HELPER_FLAGS_2(pmaxsh, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+DEF_HELPER_FLAGS_2(pminsh, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+DEF_HELPER_FLAGS_2(pmaxub, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+DEF_HELPER_FLAGS_2(pminub, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+
+DEF_HELPER_FLAGS_2(pcmpeqw, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+DEF_HELPER_FLAGS_2(pcmpgtw, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+DEF_HELPER_FLAGS_2(pcmpeqh, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+DEF_HELPER_FLAGS_2(pcmpgth, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+DEF_HELPER_FLAGS_2(pcmpeqb, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+DEF_HELPER_FLAGS_2(pcmpgtb, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+
+DEF_HELPER_FLAGS_2(psllw, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+DEF_HELPER_FLAGS_2(psllh, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+DEF_HELPER_FLAGS_2(psrlw, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+DEF_HELPER_FLAGS_2(psrlh, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+DEF_HELPER_FLAGS_2(psraw, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+DEF_HELPER_FLAGS_2(psrah, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+
+DEF_HELPER_FLAGS_2(pmullh, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+DEF_HELPER_FLAGS_2(pmulhh, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+DEF_HELPER_FLAGS_2(pmulhuh, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+DEF_HELPER_FLAGS_2(pmaddhw, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+
+DEF_HELPER_FLAGS_2(pasubub, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64, i64)
+DEF_HELPER_FLAGS_1(biadd, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64)
+DEF_HELPER_FLAGS_1(pmovmskb, TCG_CALL_CONST | TCG_CALL_PURE, i64, i64)
+
 #include "def-helper.h"
diff --git a/target-mips/lmi_helper.c b/target-mips/lmi_helper.c
new file mode 100644
index 0000000..1b24353
--- /dev/null
+++ b/target-mips/lmi_helper.c
@@ -0,0 +1,744 @@ 
+/*
+ *  Loongson Multimedia Instruction emulation helpers for QEMU.
+ *
+ *  Copyright (c) 2011  Richard Henderson <rth@twiddle.net>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "cpu.h"
+#include "helper.h"
+
+/* If the byte ordering doesn't matter, i.e. all columns are treated
+   identically, then this union can be used directly.  If byte ordering
+   does matter, we generally ignore dumping to memory.  */
+typedef union {
+    uint8_t  ub[8];    /* eight unsigned bytes */
+    int8_t   sb[8];    /* eight signed bytes */
+    uint16_t uh[4];    /* four unsigned halfwords */
+    int16_t  sh[4];    /* four signed halfwords */
+    uint32_t uw[2];    /* two unsigned words */
+    int32_t  sw[2];    /* two signed words */
+    uint64_t d;        /* the whole 64-bit value */
+} LMIValue;
+
+/* Some byte ordering issues can be mitigated by XORing in the following.  */
+#ifdef HOST_WORDS_BIGENDIAN
+# define BYTE_ORDER_XOR(N) N    /* big-endian host: XOR indices with N */
+#else
+# define BYTE_ORDER_XOR(N) 0    /* otherwise: indices are used unchanged */
+#endif
+
+/* Clamp X to the range of a signed/unsigned byte, halfword or word.  The
+   unsigned forms clamp the upper bound only.  X is evaluated repeatedly.  */
+#define SATSB(x)  ((x) < -0x80 ? -0x80 : (x) > 0x7f ? 0x7f : (x))
+#define SATUB(x)  ((x) > 0xff ? 0xff : (x))
+#define SATSH(x)  ((x) < -0x8000 ? -0x8000 : (x) > 0x7fff ? 0x7fff : (x))
+#define SATUH(x)  ((x) > 0xffff ? 0xffff : (x))
+#define SATSW(x) \
+    ((x) < -0x80000000ll ? -0x80000000ll : (x) > 0x7fffffff ? 0x7fffffff : (x))
+#define SATUW(x)  ((x) > 0xffffffffull ? 0xffffffffull : (x))
+
+/* Add the eight signed byte lanes of FS and FT, saturating each sum to
+   the int8_t range (SATSB).  Returns the packed 64-bit result.  */
+uint64_t helper_paddsb(uint64_t fs, uint64_t ft)
+{
+    LMIValue lhs = { .d = fs }, rhs = { .d = ft };
+    unsigned int n;
+
+    for (n = 0; n < 8; n++) {
+        int sum = lhs.sb[n] + rhs.sb[n];
+        lhs.sb[n] = SATSB(sum);
+    }
+    return lhs.d;
+}
+
+/* Add the eight unsigned byte lanes of FS and FT; sums above 0xff are
+   clamped to 0xff (SATUB).  Returns the packed 64-bit result.  */
+uint64_t helper_paddusb(uint64_t fs, uint64_t ft)
+{
+    LMIValue lhs = { .d = fs }, rhs = { .d = ft };
+    unsigned int n;
+
+    for (n = 0; n < 8; n++) {
+        int sum = lhs.ub[n] + rhs.ub[n];
+        lhs.ub[n] = SATUB(sum);
+    }
+    return lhs.d;
+}
+
+/* Add the four signed halfword lanes of FS and FT, saturating each sum
+   to the int16_t range (SATSH).  Returns the packed 64-bit result.  */
+uint64_t helper_paddsh(uint64_t fs, uint64_t ft)
+{
+    LMIValue lhs = { .d = fs }, rhs = { .d = ft };
+    unsigned int n;
+
+    for (n = 0; n < 4; n++) {
+        int sum = lhs.sh[n] + rhs.sh[n];
+        lhs.sh[n] = SATSH(sum);
+    }
+    return lhs.d;
+}
+
+/* Add the four unsigned halfword lanes of FS and FT; sums above 0xffff
+   are clamped to 0xffff (SATUH).  Returns the packed 64-bit result.  */
+uint64_t helper_paddush(uint64_t fs, uint64_t ft)
+{
+    LMIValue lhs = { .d = fs }, rhs = { .d = ft };
+    unsigned int n;
+
+    for (n = 0; n < 4; n++) {
+        int sum = lhs.uh[n] + rhs.uh[n];
+        lhs.uh[n] = SATUH(sum);
+    }
+    return lhs.d;
+}
+
+/* Add the eight byte lanes of FS and FT with wrap-around (no
+   saturation).  Returns the packed 64-bit result.  */
+uint64_t helper_paddb(uint64_t fs, uint64_t ft)
+{
+    LMIValue lhs = { .d = fs }, rhs = { .d = ft };
+    unsigned int n;
+
+    for (n = 0; n < 8; n++) {
+        lhs.ub[n] = lhs.ub[n] + rhs.ub[n];
+    }
+    return lhs.d;
+}
+
+/* Add the four halfword lanes of FS and FT with wrap-around (no
+   saturation).  Returns the packed 64-bit result.  */
+uint64_t helper_paddh(uint64_t fs, uint64_t ft)
+{
+    LMIValue lhs = { .d = fs }, rhs = { .d = ft };
+    unsigned int n;
+
+    for (n = 0; n < 4; n++) {
+        lhs.uh[n] = lhs.uh[n] + rhs.uh[n];
+    }
+    return lhs.d;
+}
+
+/* Add the two word lanes of FS and FT with wrap-around (no
+   saturation).  Returns the packed 64-bit result.  */
+uint64_t helper_paddw(uint64_t fs, uint64_t ft)
+{
+    LMIValue lhs = { .d = fs }, rhs = { .d = ft };
+    unsigned int n;
+
+    for (n = 0; n < 2; n++) {
+        lhs.uw[n] = lhs.uw[n] + rhs.uw[n];
+    }
+    return lhs.d;
+}
+
+/* Subtract the eight signed byte lanes of FT from those of FS,
+   saturating each difference to the int8_t range (SATSB).  */
+uint64_t helper_psubsb(uint64_t fs, uint64_t ft)
+{
+    LMIValue lhs = { .d = fs }, rhs = { .d = ft };
+    unsigned int n;
+
+    for (n = 0; n < 8; n++) {
+        int diff = lhs.sb[n] - rhs.sb[n];
+        lhs.sb[n] = SATSB(diff);
+    }
+    return lhs.d;
+}
+
+/* Subtract the eight unsigned byte lanes of FT from those of FS with
+   unsigned saturation: a lane that would go negative yields zero.  */
+uint64_t helper_psubusb(uint64_t fs, uint64_t ft)
+{
+    LMIValue vs = { .d = fs }, vt = { .d = ft };
+    unsigned int i;
+
+    for (i = 0; i < 8; ++i) {
+        /* SATUB only clamps the upper bound, so clamp at zero here.  */
+        vs.ub[i] = (vs.ub[i] > vt.ub[i] ? vs.ub[i] - vt.ub[i] : 0);
+    }
+    return vs.d;
+}
+
+/* Subtract the four signed halfword lanes of FT from those of FS,
+   saturating each difference to the int16_t range (SATSH).  */
+uint64_t helper_psubsh(uint64_t fs, uint64_t ft)
+{
+    LMIValue lhs = { .d = fs }, rhs = { .d = ft };
+    unsigned int n;
+
+    for (n = 0; n < 4; n++) {
+        int diff = lhs.sh[n] - rhs.sh[n];
+        lhs.sh[n] = SATSH(diff);
+    }
+    return lhs.d;
+}
+
+/* Subtract the four unsigned halfword lanes of FT from those of FS with
+   unsigned saturation: a lane that would go negative yields zero.  */
+uint64_t helper_psubush(uint64_t fs, uint64_t ft)
+{
+    LMIValue vs = { .d = fs }, vt = { .d = ft };
+    unsigned int i;
+
+    for (i = 0; i < 4; ++i) {
+        /* SATUH only clamps the upper bound, so clamp at zero here.  */
+        vs.uh[i] = (vs.uh[i] > vt.uh[i] ? vs.uh[i] - vt.uh[i] : 0);
+    }
+    return vs.d;
+}
+
+/* Subtract the eight byte lanes of FT from those of FS with
+   wrap-around (no saturation).  Returns the packed 64-bit result.  */
+uint64_t helper_psubb(uint64_t fs, uint64_t ft)
+{
+    LMIValue lhs = { .d = fs }, rhs = { .d = ft };
+    unsigned int n;
+
+    for (n = 0; n < 8; n++) {
+        lhs.ub[n] = lhs.ub[n] - rhs.ub[n];
+    }
+    return lhs.d;
+}
+
+/* Subtract the four halfword lanes of FT from those of FS with
+   wrap-around (no saturation).  Returns the packed 64-bit result.  */
+uint64_t helper_psubh(uint64_t fs, uint64_t ft)
+{
+    LMIValue lhs = { .d = fs }, rhs = { .d = ft };
+    unsigned int n;
+
+    for (n = 0; n < 4; n++) {
+        lhs.uh[n] = lhs.uh[n] - rhs.uh[n];
+    }
+    return lhs.d;
+}
+
+/* Subtract the two word lanes of FT from those of FS with
+   wrap-around (no saturation).  Returns the packed 64-bit result.  */
+uint64_t helper_psubw(uint64_t fs, uint64_t ft)
+{
+    LMIValue lhs = { .d = fs }, rhs = { .d = ft };
+    unsigned int n;
+
+    for (n = 0; n < 2; n++) {
+        lhs.uw[n] = lhs.uw[n] - rhs.uw[n];
+    }
+    return lhs.d;
+}
+
+/* Permute the four halfwords of FS: result lane N is the FS lane picked
+   by bits 2N+1..2N of FT.  Lane 0 is the least significant halfword.  */
+uint64_t helper_pshufh(uint64_t fs, uint64_t ft)
+{
+    const unsigned flip = BYTE_ORDER_XOR(3);
+    LMIValue src = { .d = fs }, dst = { .d = 0 };
+    unsigned n;
+
+    for (n = 0; n < 4; n++) {
+        dst.uh[n ^ flip] = src.uh[((ft >> (2 * n)) & 3) ^ flip];
+    }
+    return dst.d;
+}
+
+/* Pack the four signed words of FS and FT into signed halfwords with
+   saturation (SATSH).  Result halfwords 0..1 come from the low and high
+   words of FS, halfwords 2..3 from the low and high words of FT.
+   Returns the packed 64-bit result.  */
+uint64_t helper_packsswh(uint64_t fs, uint64_t ft)
+{
+    uint64_t fd = 0;
+    unsigned int i;
+
+    for (i = 0; i < 4; ++i) {
+        /* I < 2 selects FS, otherwise FT; I & 1 selects the high word.  */
+        uint64_t src = (i < 2 ? fs : ft);
+        int64_t tmp = (int32_t)(src >> ((i & 1) * 32));
+
+        tmp = SATSH(tmp);
+        /* Shift as unsigned: moving a set bit into bit 63 of a signed
+           value is undefined behaviour, and lane 3 can do exactly that.
+           This matches the casts used by the other pack helpers.  */
+        fd |= (uint64_t)(tmp & 0xffff) << (i * 16);
+    }
+
+    return fd;
+}
+
+/* Pack the eight signed halfwords of FS and FT into signed bytes with
+   saturation.  FS fills result bytes 0..3 and FT fills bytes 4..7.  */
+uint64_t helper_packsshb(uint64_t fs, uint64_t ft)
+{
+    uint64_t packed = 0;
+    unsigned int n;
+
+    for (n = 0; n < 8; ++n) {
+        /* Halfword N & 3 of FS (N < 4) or of FT (N >= 4).  */
+        uint64_t src = (n < 4 ? fs : ft);
+        int16_t half = src >> ((n & 3) * 16);
+
+        half = SATSB(half);
+        packed |= (uint64_t)(half & 0xff) << (n * 8);
+    }
+
+    return packed;
+}
+
+/* Pack the eight signed halfwords of FS and FT into unsigned bytes with
+   saturation: negative inputs give 0, inputs above 0xff give 0xff.
+   FS fills result bytes 0..3 and FT fills bytes 4..7.  */
+uint64_t helper_packushb(uint64_t fs, uint64_t ft)
+{
+    uint64_t fd = 0;
+    unsigned int i;
+
+    for (i = 0; i < 8; ++i) {
+        int16_t tmp = (i < 4 ? fs : ft) >> ((i & 3) * 16);
+
+        /* SATUB has no lower bound, so clamp negative values here.  */
+        tmp = (tmp < 0 ? 0 : tmp > 0xff ? 0xff : tmp);
+        fd |= (uint64_t)tmp << (i * 8);
+    }
+
+    return fd;
+}
+
+uint64_t helper_punpcklwd(uint64_t fs, uint64_t ft)
+{
+    return (ft << 32) | (uint32_t)fs;   /* low words: FT above FS */
+}
+
+uint64_t helper_punpckhwd(uint64_t fs, uint64_t ft)
+{
+    return ((ft >> 32) << 32) | (fs >> 32);   /* high words: FT above FS */
+}
+
+uint64_t helper_punpcklhw(uint64_t fs, uint64_t ft)
+{
+    unsigned host = BYTE_ORDER_XOR(3);  /* 3 on big-endian hosts, else 0 */
+    LMIValue vd, vs, vt;
+
+    vs.d = fs;
+    vt.d = ft;
+    vd.uh[0 ^ host] = vs.uh[0 ^ host];  /* interleave the two low halfwords */
+    vd.uh[1 ^ host] = vt.uh[0 ^ host];  /* of FS and FT, FS element first */
+    vd.uh[2 ^ host] = vs.uh[1 ^ host];
+    vd.uh[3 ^ host] = vt.uh[1 ^ host];
+
+    return vd.d;
+}
+
+uint64_t helper_punpckhhw(uint64_t fs, uint64_t ft)
+{
+    unsigned host = BYTE_ORDER_XOR(3);  /* 3 on big-endian hosts, else 0 */
+    LMIValue vd, vs, vt;
+
+    vs.d = fs;
+    vt.d = ft;
+    vd.uh[0 ^ host] = vs.uh[2 ^ host];  /* interleave the two high halfwords */
+    vd.uh[1 ^ host] = vt.uh[2 ^ host];  /* of FS and FT, FS element first */
+    vd.uh[2 ^ host] = vs.uh[3 ^ host];
+    vd.uh[3 ^ host] = vt.uh[3 ^ host];
+
+    return vd.d;
+}
+
+uint64_t helper_punpcklbh(uint64_t fs, uint64_t ft)
+{
+    unsigned host = BYTE_ORDER_XOR(7);  /* 7 on big-endian hosts, else 0 */
+    LMIValue vd, vs, vt;
+
+    vs.d = fs;
+    vt.d = ft;
+    vd.ub[0 ^ host] = vs.ub[0 ^ host];  /* interleave the four low bytes */
+    vd.ub[1 ^ host] = vt.ub[0 ^ host];  /* of FS and FT, FS element first */
+    vd.ub[2 ^ host] = vs.ub[1 ^ host];
+    vd.ub[3 ^ host] = vt.ub[1 ^ host];
+    vd.ub[4 ^ host] = vs.ub[2 ^ host];
+    vd.ub[5 ^ host] = vt.ub[2 ^ host];
+    vd.ub[6 ^ host] = vs.ub[3 ^ host];
+    vd.ub[7 ^ host] = vt.ub[3 ^ host];
+
+    return vd.d;
+}
+
+uint64_t helper_punpckhbh(uint64_t fs, uint64_t ft)
+{
+    unsigned host = BYTE_ORDER_XOR(7);  /* 7 on big-endian hosts, else 0 */
+    LMIValue vd, vs, vt;
+
+    vs.d = fs;
+    vt.d = ft;
+    vd.ub[0 ^ host] = vs.ub[4 ^ host];  /* interleave the four high bytes */
+    vd.ub[1 ^ host] = vt.ub[4 ^ host];  /* of FS and FT, FS element first */
+    vd.ub[2 ^ host] = vs.ub[5 ^ host];
+    vd.ub[3 ^ host] = vt.ub[5 ^ host];
+    vd.ub[4 ^ host] = vs.ub[6 ^ host];
+    vd.ub[5 ^ host] = vt.ub[6 ^ host];
+    vd.ub[6 ^ host] = vs.ub[7 ^ host];
+    vd.ub[7 ^ host] = vt.ub[7 ^ host];
+
+    return vd.d;
+}
+
+/* Average the four unsigned halfword lanes of FS and FT, rounding
+   halves upwards: (a + b + 1) / 2 per lane.  */
+uint64_t helper_pavgh(uint64_t fs, uint64_t ft)
+{
+    LMIValue lhs = { .d = fs }, rhs = { .d = ft };
+    unsigned n;
+
+    for (n = 0; n < 4; n++) {
+        lhs.uh[n] = (lhs.uh[n] + rhs.uh[n] + 1) / 2;
+    }
+    return lhs.d;
+}
+
+/* Average the eight unsigned byte lanes of FS and FT, rounding
+   halves upwards: (a + b + 1) / 2 per lane.  */
+uint64_t helper_pavgb(uint64_t fs, uint64_t ft)
+{
+    LMIValue lhs = { .d = fs }, rhs = { .d = ft };
+    unsigned n;
+
+    for (n = 0; n < 8; n++) {
+        lhs.ub[n] = (lhs.ub[n] + rhs.ub[n] + 1) / 2;
+    }
+    return lhs.d;
+}
+
+/* Lane-wise maximum of the four signed halfwords of FS and FT.
+   Returns the packed 64-bit result.  */
+uint64_t helper_pmaxsh(uint64_t fs, uint64_t ft)
+{
+    LMIValue lhs = { .d = fs }, rhs = { .d = ft };
+    unsigned n;
+
+    for (n = 0; n < 4; n++) {
+        lhs.sh[n] = (rhs.sh[n] > lhs.sh[n] ? rhs.sh[n] : lhs.sh[n]);
+    }
+    return lhs.d;
+}
+
+/* Lane-wise minimum of the four signed halfwords of FS and FT.
+   Returns the packed 64-bit result.  */
+uint64_t helper_pminsh(uint64_t fs, uint64_t ft)
+{
+    LMIValue lhs = { .d = fs }, rhs = { .d = ft };
+    unsigned n;
+
+    for (n = 0; n < 4; n++) {
+        lhs.sh[n] = (rhs.sh[n] < lhs.sh[n] ? rhs.sh[n] : lhs.sh[n]);
+    }
+    return lhs.d;
+}
+
+/* Lane-wise maximum of the eight unsigned bytes of FS and FT.  All eight
+   lanes are processed: a 64-bit value holds eight bytes, not four.  */
+uint64_t helper_pmaxub(uint64_t fs, uint64_t ft)
+{
+    LMIValue vs = { .d = fs }, vt = { .d = ft };
+    unsigned i;
+
+    for (i = 0; i < 8; i++) {
+        vs.ub[i] = (vs.ub[i] >= vt.ub[i] ? vs.ub[i] : vt.ub[i]);
+    }
+    return vs.d;
+}
+
+/* Lane-wise minimum of the eight unsigned bytes of FS and FT.  All eight
+   lanes are processed: a 64-bit value holds eight bytes, not four.  */
+uint64_t helper_pminub(uint64_t fs, uint64_t ft)
+{
+    LMIValue vs = { .d = fs }, vt = { .d = ft };
+    unsigned i;
+
+    for (i = 0; i < 8; i++) {
+        vs.ub[i] = (vs.ub[i] <= vt.ub[i] ? vs.ub[i] : vt.ub[i]);
+    }
+    return vs.d;
+}
+
+/* Compare the two word lanes of FS and FT for equality; an equal lane
+   becomes all-ones, an unequal lane becomes zero.  */
+uint64_t helper_pcmpeqw(uint64_t fs, uint64_t ft)
+{
+    LMIValue lhs = { .d = fs }, rhs = { .d = ft };
+    unsigned n;
+
+    for (n = 0; n < 2; n++) {
+        lhs.uw[n] = (lhs.uw[n] == rhs.uw[n] ? 0xffffffffu : 0);
+    }
+    return lhs.d;
+}
+
+/* Set each word lane to all-ones where FS > FT, else to zero.
+   NOTE(review): lanes are compared as unsigned; confirm against the ISA.  */
+uint64_t helper_pcmpgtw(uint64_t fs, uint64_t ft)
+{
+    LMIValue lhs = { .d = fs }, rhs = { .d = ft };
+    unsigned n;
+
+    for (n = 0; n < 2; n++) {
+        lhs.uw[n] = (lhs.uw[n] > rhs.uw[n] ? 0xffffffffu : 0);
+    }
+    return lhs.d;
+}
+
+/* Compare the four halfword lanes of FS and FT for equality; an equal
+   lane becomes all-ones, an unequal lane becomes zero.  */
+uint64_t helper_pcmpeqh(uint64_t fs, uint64_t ft)
+{
+    LMIValue lhs = { .d = fs }, rhs = { .d = ft };
+    unsigned n;
+
+    for (n = 0; n < 4; n++) {
+        lhs.uh[n] = (lhs.uh[n] == rhs.uh[n] ? 0xffff : 0);
+    }
+    return lhs.d;
+}
+
+/* Set each halfword lane to all-ones where FS > FT, else to zero.
+   NOTE(review): lanes are compared as unsigned; confirm against the ISA.  */
+uint64_t helper_pcmpgth(uint64_t fs, uint64_t ft)
+{
+    LMIValue lhs = { .d = fs }, rhs = { .d = ft };
+    unsigned n;
+
+    for (n = 0; n < 4; n++) {
+        lhs.uh[n] = (lhs.uh[n] > rhs.uh[n] ? 0xffff : 0);
+    }
+    return lhs.d;
+}
+
+/* Compare the eight byte lanes of FS and FT for equality; an equal
+   lane becomes all-ones, an unequal lane becomes zero.  */
+uint64_t helper_pcmpeqb(uint64_t fs, uint64_t ft)
+{
+    LMIValue lhs = { .d = fs }, rhs = { .d = ft };
+    unsigned n;
+
+    for (n = 0; n < 8; n++) {
+        lhs.ub[n] = (lhs.ub[n] == rhs.ub[n] ? 0xff : 0);
+    }
+    return lhs.d;
+}
+
+/* Set each byte lane to all-ones where FS > FT, else to zero.
+   NOTE(review): lanes are compared as unsigned; confirm against the ISA.  */
+uint64_t helper_pcmpgtb(uint64_t fs, uint64_t ft)
+{
+    LMIValue lhs = { .d = fs }, rhs = { .d = ft };
+    unsigned n;
+
+    for (n = 0; n < 8; n++) {
+        lhs.ub[n] = (lhs.ub[n] > rhs.ub[n] ? 0xff : 0);
+    }
+    return lhs.d;
+}
+
+/* Shift both 32-bit lanes of FS left by the count in the low seven bits
+   of FT.  A count above 31 gives an all-zero result.  Returns the packed
+   64-bit result.  */
+uint64_t helper_psllw(uint64_t fs, uint64_t ft)
+{
+    LMIValue val = { .d = fs };
+    unsigned count = ft & 0x7f;
+
+    if (count >= 32) {
+        return 0;
+    }
+    val.uw[0] = val.uw[0] << count;
+    val.uw[1] = val.uw[1] << count;
+    return val.d;
+}
+
+/* Logical right shift of both 32-bit lanes of FS by the count in the low
+   seven bits of FT.  A count above 31 gives an all-zero result.  Returns
+   the packed 64-bit result.  */
+uint64_t helper_psrlw(uint64_t fs, uint64_t ft)
+{
+    LMIValue val = { .d = fs };
+    unsigned count = ft & 0x7f;
+
+    if (count >= 32) {
+        return 0;
+    }
+    val.uw[0] = val.uw[0] >> count;
+    val.uw[1] = val.uw[1] >> count;
+    return val.d;
+}
+
+/* Right shift of both signed 32-bit lanes of FS by the count in the low
+   seven bits of FT.  A count above 31 is treated as 31.  Returns the
+   packed 64-bit result.  */
+uint64_t helper_psraw(uint64_t fs, uint64_t ft)
+{
+    LMIValue val = { .d = fs };
+    unsigned count = ft & 0x7f;
+
+    if (count >= 32) {
+        count = 31;
+    }
+    val.sw[0] = val.sw[0] >> count;
+    val.sw[1] = val.sw[1] >> count;
+    return val.d;
+}
+
+/* Shift each 16-bit lane of FS left by FT & 0x7f; counts above 15 give 0.  */
+uint64_t helper_psllh(uint64_t fs, uint64_t ft)
+{
+    LMIValue val = { .d = fs };
+    unsigned count = ft & 0x7f;
+    unsigned n;
+
+    if (count >= 16) {
+        return 0;
+    }
+    for (n = 0; n < 4; n++) {
+        val.uh[n] = val.uh[n] << count;
+    }
+    return val.d;
+}
+
+/* Shift each 16-bit lane of FS right by FT & 0x7f; counts above 15 give 0.  */
+uint64_t helper_psrlh(uint64_t fs, uint64_t ft)
+{
+    LMIValue val = { .d = fs };
+    unsigned count = ft & 0x7f;
+    unsigned n;
+
+    if (count >= 16) {
+        return 0;
+    }
+    for (n = 0; n < 4; n++) {
+        val.uh[n] = val.uh[n] >> count;
+    }
+    return val.d;
+}
+
+/* Right-shift each signed 16-bit lane of FS by FT & 0x7f, capped at 15.  */
+uint64_t helper_psrah(uint64_t fs, uint64_t ft)
+{
+    LMIValue val = { .d = fs };
+    unsigned count = ft & 0x7f;
+    unsigned n;
+
+    if (count >= 16) {
+        count = 15;
+    }
+    for (n = 0; n < 4; n++) {
+        val.sh[n] = val.sh[n] >> count;
+    }
+    return val.d;
+}
+
+/* Multiply the four signed halfword lanes of FS and FT, keeping the low
+   16 bits of each product.  Returns the packed 64-bit result.  */
+uint64_t helper_pmullh(uint64_t fs, uint64_t ft)
+{
+    LMIValue lhs = { .d = fs }, rhs = { .d = ft };
+    unsigned n;
+
+    for (n = 0; n < 4; n++) {
+        lhs.sh[n] = lhs.sh[n] * rhs.sh[n];
+    }
+    return lhs.d;
+}
+
+/* Multiply the four signed halfword lanes of FS and FT, keeping the high
+   16 bits of each 32-bit product.  Returns the packed 64-bit result.  */
+uint64_t helper_pmulhh(uint64_t fs, uint64_t ft)
+{
+    LMIValue lhs = { .d = fs }, rhs = { .d = ft };
+    unsigned n;
+
+    for (n = 0; n < 4; n++) {
+        int32_t prod = lhs.sh[n] * rhs.sh[n];
+        lhs.sh[n] = prod >> 16;
+    }
+    return lhs.d;
+}
+
+/* Multiply the four unsigned halfword lanes of FS and FT and keep the
+   high 16 bits of each 32-bit product.  */
+uint64_t helper_pmulhuh(uint64_t fs, uint64_t ft)
+{
+    LMIValue vs = { .d = fs }, vt = { .d = ft };
+    unsigned i;
+
+    for (i = 0; i < 4; ++i) {
+        /* Widen first: uint16_t promotes to int, which could overflow.  */
+        vs.uh[i] = ((uint32_t)vs.uh[i] * vt.uh[i]) >> 16;
+    }
+    return vs.d;
+}
+
+uint64_t helper_pmaddhw(uint64_t fs, uint64_t ft)
+{
+    unsigned host = BYTE_ORDER_XOR(3);  /* 3 on big-endian hosts, else 0 */
+    LMIValue vs, vt;
+    uint32_t p0, p1;
+
+    vs.d = fs;
+    vt.d = ft;
+    p0  = vs.sh[0 ^ host] * vt.sh[0 ^ host];  /* products of lanes 0 and 1 */
+    p0 += vs.sh[1 ^ host] * vt.sh[1 ^ host];  /* summed, modulo 2**32 */
+    p1  = vs.sh[2 ^ host] * vt.sh[2 ^ host];  /* likewise for lanes 2 and 3 */
+    p1 += vs.sh[3 ^ host] * vt.sh[3 ^ host];
+
+    return ((uint64_t)p1 << 32) | p0;         /* p1 is the upper result word */
+}
+
+/* Absolute difference of the eight unsigned byte lanes of FS and FT.
+   Returns the packed 64-bit result.  */
+uint64_t helper_pasubub(uint64_t fs, uint64_t ft)
+{
+    LMIValue lhs = { .d = fs }, rhs = { .d = ft };
+    unsigned n;
+
+    for (n = 0; n < 8; n++) {
+        uint8_t x = lhs.ub[n], y = rhs.ub[n];
+        lhs.ub[n] = (x < y ? y - x : x - y);
+    }
+    return lhs.d;
+}
+
+/* Sum the eight bytes of FS; the total is returned masked to 16 bits.  */
+uint64_t helper_biadd(uint64_t fs)
+{
+    unsigned total = 0;
+    for (; fs != 0; fs >>= 8) {
+        total += fs & 0xff;
+    }
+    return total & 0xffff;
+}
+
+/* Gather the most significant bit of each byte of FS into bits 0..7 of
+   the result, with byte 0 mapping to bit 0.  */
+uint64_t helper_pmovmskb(uint64_t fs)
+{
+    unsigned mask = 0;
+    unsigned lane;
+
+    for (lane = 0; lane < 8; lane++) {
+        unsigned msb = (fs >> (lane * 8 + 7)) & 1;
+
+        mask |= msb << lane;
+    }
+
+    return mask & 0xff;
+}
diff --git a/target-mips/translate.c b/target-mips/translate.c
index 57454f0..ac941e6 100644
--- a/target-mips/translate.c
+++ b/target-mips/translate.c
@@ -446,6 +446,103 @@  enum {
     OPC_BC2     = (0x08 << 21) | OPC_CP2,
 };
 
+/* Keep the major opcode plus bits 25..21 and 4..0 of OP (LMI decode key).  */
+#define MASK_LMI(op)  (MASK_OP_MAJOR(op) | ((op) & ((0x1F << 21) | 0x1F)))
+enum {   /* Loongson multimedia opcodes, in the form matched by MASK_LMI() */
+    OPC_PADDSH  = (24 << 21) | (0x00) | OPC_CP2,
+    OPC_PADDUSH = (25 << 21) | (0x00) | OPC_CP2,
+    OPC_PADDH   = (26 << 21) | (0x00) | OPC_CP2,
+    OPC_PADDW   = (27 << 21) | (0x00) | OPC_CP2,
+    OPC_PADDSB  = (28 << 21) | (0x00) | OPC_CP2,
+    OPC_PADDUSB = (29 << 21) | (0x00) | OPC_CP2,
+    OPC_PADDB   = (30 << 21) | (0x00) | OPC_CP2,
+    OPC_PADDD   = (31 << 21) | (0x00) | OPC_CP2,
+
+    OPC_PSUBSH  = (24 << 21) | (0x01) | OPC_CP2,
+    OPC_PSUBUSH = (25 << 21) | (0x01) | OPC_CP2,
+    OPC_PSUBH   = (26 << 21) | (0x01) | OPC_CP2,
+    OPC_PSUBW   = (27 << 21) | (0x01) | OPC_CP2,
+    OPC_PSUBSB  = (28 << 21) | (0x01) | OPC_CP2,
+    OPC_PSUBUSB = (29 << 21) | (0x01) | OPC_CP2,
+    OPC_PSUBB   = (30 << 21) | (0x01) | OPC_CP2,
+    OPC_PSUBD   = (31 << 21) | (0x01) | OPC_CP2,
+
+    OPC_PSHUFH   = (24 << 21) | (0x02) | OPC_CP2,
+    OPC_PACKSSWH = (25 << 21) | (0x02) | OPC_CP2,
+    OPC_PACKSSHB = (26 << 21) | (0x02) | OPC_CP2,
+    OPC_PACKUSHB = (27 << 21) | (0x02) | OPC_CP2,
+    OPC_XOR_CP2  = (28 << 21) | (0x02) | OPC_CP2,
+    OPC_NOR_CP2  = (29 << 21) | (0x02) | OPC_CP2,
+    OPC_AND_CP2  = (30 << 21) | (0x02) | OPC_CP2,
+    OPC_PANDN    = (31 << 21) | (0x02) | OPC_CP2,
+
+    OPC_PUNPCKLHW = (24 << 21) | (0x03) | OPC_CP2,
+    OPC_PUNPCKHHW = (25 << 21) | (0x03) | OPC_CP2,
+    OPC_PUNPCKLBH = (26 << 21) | (0x03) | OPC_CP2,
+    OPC_PUNPCKHBH = (27 << 21) | (0x03) | OPC_CP2,
+    OPC_PINSRH_0  = (28 << 21) | (0x03) | OPC_CP2,
+    OPC_PINSRH_1  = (29 << 21) | (0x03) | OPC_CP2,
+    OPC_PINSRH_2  = (30 << 21) | (0x03) | OPC_CP2,
+    OPC_PINSRH_3  = (31 << 21) | (0x03) | OPC_CP2,
+
+    OPC_PAVGH   = (24 << 21) | (0x08) | OPC_CP2,
+    OPC_PAVGB   = (25 << 21) | (0x08) | OPC_CP2,
+    OPC_PMAXSH  = (26 << 21) | (0x08) | OPC_CP2,
+    OPC_PMINSH  = (27 << 21) | (0x08) | OPC_CP2,
+    OPC_PMAXUB  = (28 << 21) | (0x08) | OPC_CP2,
+    OPC_PMINUB  = (29 << 21) | (0x08) | OPC_CP2,
+
+    OPC_PCMPEQW = (24 << 21) | (0x09) | OPC_CP2,
+    OPC_PCMPGTW = (25 << 21) | (0x09) | OPC_CP2,
+    OPC_PCMPEQH = (26 << 21) | (0x09) | OPC_CP2,
+    OPC_PCMPGTH = (27 << 21) | (0x09) | OPC_CP2,
+    OPC_PCMPEQB = (28 << 21) | (0x09) | OPC_CP2,
+    OPC_PCMPGTB = (29 << 21) | (0x09) | OPC_CP2,
+
+    OPC_PSLLW   = (24 << 21) | (0x0A) | OPC_CP2,
+    OPC_PSLLH   = (25 << 21) | (0x0A) | OPC_CP2,
+    OPC_PMULLH  = (26 << 21) | (0x0A) | OPC_CP2,
+    OPC_PMULHH  = (27 << 21) | (0x0A) | OPC_CP2,
+    OPC_PMULUW  = (28 << 21) | (0x0A) | OPC_CP2,
+    OPC_PMULHUH = (29 << 21) | (0x0A) | OPC_CP2,
+
+    OPC_PSRLW     = (24 << 21) | (0x0B) | OPC_CP2,
+    OPC_PSRLH     = (25 << 21) | (0x0B) | OPC_CP2,
+    OPC_PSRAW     = (26 << 21) | (0x0B) | OPC_CP2,
+    OPC_PSRAH     = (27 << 21) | (0x0B) | OPC_CP2,
+    OPC_PUNPCKLWD = (28 << 21) | (0x0B) | OPC_CP2,
+    OPC_PUNPCKHWD = (29 << 21) | (0x0B) | OPC_CP2,
+
+    OPC_ADDU_CP2 = (24 << 21) | (0x0C) | OPC_CP2,
+    OPC_OR_CP2   = (25 << 21) | (0x0C) | OPC_CP2,
+    OPC_ADD_CP2  = (26 << 21) | (0x0C) | OPC_CP2,
+    OPC_DADD_CP2 = (27 << 21) | (0x0C) | OPC_CP2,
+    OPC_SEQU_CP2 = (28 << 21) | (0x0C) | OPC_CP2,
+    OPC_SEQ_CP2  = (29 << 21) | (0x0C) | OPC_CP2,
+
+    OPC_SUBU_CP2 = (24 << 21) | (0x0D) | OPC_CP2,
+    OPC_PASUBUB  = (25 << 21) | (0x0D) | OPC_CP2,
+    OPC_SUB_CP2  = (26 << 21) | (0x0D) | OPC_CP2,
+    OPC_DSUB_CP2 = (27 << 21) | (0x0D) | OPC_CP2,
+    OPC_SLTU_CP2 = (28 << 21) | (0x0D) | OPC_CP2,
+    OPC_SLT_CP2  = (29 << 21) | (0x0D) | OPC_CP2,
+
+    OPC_SLL_CP2  = (24 << 21) | (0x0E) | OPC_CP2,
+    OPC_DSLL_CP2 = (25 << 21) | (0x0E) | OPC_CP2,
+    OPC_PEXTRH   = (26 << 21) | (0x0E) | OPC_CP2,
+    OPC_PMADDHW  = (27 << 21) | (0x0E) | OPC_CP2,
+    OPC_SLEU_CP2 = (28 << 21) | (0x0E) | OPC_CP2,
+    OPC_SLE_CP2  = (29 << 21) | (0x0E) | OPC_CP2,
+
+    OPC_SRL_CP2  = (24 << 21) | (0x0F) | OPC_CP2,
+    OPC_DSRL_CP2 = (25 << 21) | (0x0F) | OPC_CP2,
+    OPC_SRA_CP2  = (26 << 21) | (0x0F) | OPC_CP2,
+    OPC_DSRA_CP2 = (27 << 21) | (0x0F) | OPC_CP2,
+    OPC_BIADD    = (28 << 21) | (0x0F) | OPC_CP2,
+    OPC_PMOVMSKB = (29 << 21) | (0x0F) | OPC_CP2,
+};
+
+
 #define MASK_CP3(op)       MASK_OP_MAJOR(op) | (op & 0x3F)
 
 enum {
@@ -2424,8 +2521,8 @@  static void gen_cl (DisasContext *ctx, uint32_t opc,
 }
 
 /* Godson integer instructions */
-static void gen_loongson_integer (DisasContext *ctx, uint32_t opc,
-                                int rd, int rs, int rt)
+static void gen_loongson_integer(DisasContext *ctx, uint32_t opc,
+                                 int rd, int rs, int rt)
 {
     const char *opn = "loongson";
     TCGv t0, t1;
@@ -2637,6 +2734,307 @@  static void gen_loongson_integer (DisasContext *ctx, uint32_t opc,
     tcg_temp_free(t1);
 }
 
+/* Loongson multimedia instructions.
+
+   Decodes ctx->opcode with MASK_LMI() and emits the TCG ops for it.
+   FPRs rs and rt are loaded as 64-bit source operands (rt is loaded but
+   ignored by the single-operand helpers) and the result is stored to
+   FPR rd.  Encodings not handled here, including the S<cond> comparisons,
+   raise a Reserved Instruction exception and store nothing.  */
+static void gen_loongson_multimedia(DisasContext *ctx, int rd, int rs, int rt)
+{
+    const char *opn = "loongson_cp2";
+    uint32_t opc, shift_max;
+    TCGv_i64 t0, t1;
+
+    opc = MASK_LMI(ctx->opcode);
+    switch (opc) {
+    case OPC_ADD_CP2:
+    case OPC_SUB_CP2:
+    case OPC_DADD_CP2:
+    case OPC_DSUB_CP2:
+        /* These emit a branch around the overflow exception and use t0
+           after the label, hence the local temporaries.  */
+        t0 = tcg_temp_local_new_i64();
+        t1 = tcg_temp_local_new_i64();
+        break;
+    default:
+        t0 = tcg_temp_new_i64();
+        t1 = tcg_temp_new_i64();
+        break;
+    }
+
+    gen_load_fpr64(ctx, t0, rs);
+    gen_load_fpr64(ctx, t1, rt);
+
+#define LMI_HELPER(UP, LO) \
+    case OPC_##UP: gen_helper_##LO(t0, t0, t1); opn = #LO; break
+#define LMI_HELPER_1(UP, LO) \
+    case OPC_##UP: gen_helper_##LO(t0, t0); opn = #LO; break
+#define LMI_DIRECT(UP, LO, OP) \
+    case OPC_##UP: tcg_gen_##OP##_i64(t0, t0, t1); opn = #LO; break
+
+    switch (opc) {
+    LMI_HELPER(PADDSH, paddsh);
+    LMI_HELPER(PADDUSH, paddush);
+    LMI_HELPER(PADDH, paddh);
+    LMI_HELPER(PADDW, paddw);
+    LMI_HELPER(PADDSB, paddsb);
+    LMI_HELPER(PADDUSB, paddusb);
+    LMI_HELPER(PADDB, paddb);
+
+    LMI_HELPER(PSUBSH, psubsh);
+    LMI_HELPER(PSUBUSH, psubush);
+    LMI_HELPER(PSUBH, psubh);
+    LMI_HELPER(PSUBW, psubw);
+    LMI_HELPER(PSUBSB, psubsb);
+    LMI_HELPER(PSUBUSB, psubusb);
+    LMI_HELPER(PSUBB, psubb);
+
+    LMI_HELPER(PSHUFH, pshufh);
+    LMI_HELPER(PACKSSWH, packsswh);
+    LMI_HELPER(PACKSSHB, packsshb);
+    LMI_HELPER(PACKUSHB, packushb);
+
+    LMI_HELPER(PUNPCKLHW, punpcklhw);
+    LMI_HELPER(PUNPCKHHW, punpckhhw);
+    LMI_HELPER(PUNPCKLBH, punpcklbh);
+    LMI_HELPER(PUNPCKHBH, punpckhbh);
+    LMI_HELPER(PUNPCKLWD, punpcklwd);
+    LMI_HELPER(PUNPCKHWD, punpckhwd);
+
+    LMI_HELPER(PAVGH, pavgh);
+    LMI_HELPER(PAVGB, pavgb);
+    LMI_HELPER(PMAXSH, pmaxsh);
+    LMI_HELPER(PMINSH, pminsh);
+    LMI_HELPER(PMAXUB, pmaxub);
+    LMI_HELPER(PMINUB, pminub);
+
+    LMI_HELPER(PCMPEQW, pcmpeqw);
+    LMI_HELPER(PCMPGTW, pcmpgtw);
+    LMI_HELPER(PCMPEQH, pcmpeqh);
+    LMI_HELPER(PCMPGTH, pcmpgth);
+    LMI_HELPER(PCMPEQB, pcmpeqb);
+    LMI_HELPER(PCMPGTB, pcmpgtb);
+
+    LMI_HELPER(PSLLW, psllw);
+    LMI_HELPER(PSLLH, psllh);
+    LMI_HELPER(PSRLW, psrlw);
+    LMI_HELPER(PSRLH, psrlh);
+    LMI_HELPER(PSRAW, psraw);
+    LMI_HELPER(PSRAH, psrah);
+
+    LMI_HELPER(PMULLH, pmullh);
+    LMI_HELPER(PMULHH, pmulhh);
+    LMI_HELPER(PMULHUH, pmulhuh);
+    LMI_HELPER(PMADDHW, pmaddhw);
+
+    LMI_HELPER(PASUBUB, pasubub);
+    LMI_HELPER_1(BIADD, biadd);
+    LMI_HELPER_1(PMOVMSKB, pmovmskb);
+
+    LMI_DIRECT(PADDD, paddd, add);
+    LMI_DIRECT(PSUBD, psubd, sub);
+    LMI_DIRECT(XOR_CP2, xor, xor);
+    LMI_DIRECT(NOR_CP2, nor, nor);
+    LMI_DIRECT(AND_CP2, and, and);
+    LMI_DIRECT(OR_CP2, or, or);
+
+    case OPC_PANDN:
+        /* PANDN is (~fs) & ft, so the complemented operand is t0.  */
+        tcg_gen_andc_i64(t0, t1, t0);
+        opn = "pandn";
+        break;
+
+    case OPC_PINSRH_0:
+        tcg_gen_deposit_i64(t0, t0, t1, 0, 16);
+        opn = "pinsrh_0";
+        break;
+    case OPC_PINSRH_1:
+        tcg_gen_deposit_i64(t0, t0, t1, 16, 16);
+        opn = "pinsrh_1";
+        break;
+    case OPC_PINSRH_2:
+        tcg_gen_deposit_i64(t0, t0, t1, 32, 16);
+        opn = "pinsrh_2";
+        break;
+    case OPC_PINSRH_3:
+        tcg_gen_deposit_i64(t0, t0, t1, 48, 16);
+        opn = "pinsrh_3";
+        break;
+
+    case OPC_PEXTRH:
+        tcg_gen_andi_i64(t1, t1, 3);
+        tcg_gen_shli_i64(t1, t1, 4);
+        tcg_gen_shr_i64(t0, t0, t1);
+        tcg_gen_ext16u_i64(t0, t0);
+        opn = "pextrh";
+        break;
+
+    case OPC_ADDU_CP2:
+        tcg_gen_add_i64(t0, t0, t1);
+        tcg_gen_ext32s_i64(t0, t0);
+        opn = "addu";
+        break;
+    case OPC_SUBU_CP2:
+        tcg_gen_sub_i64(t0, t0, t1);
+        tcg_gen_ext32s_i64(t0, t0);
+        opn = "subu";
+        break;
+
+    case OPC_SLL_CP2:
+        opn = "sll";
+        shift_max = 32;
+        goto do_shift;
+    case OPC_SRL_CP2:
+        opn = "srl";
+        shift_max = 32;
+        goto do_shift;
+    case OPC_SRA_CP2:
+        opn = "sra";
+        shift_max = 32;
+        goto do_shift;
+    case OPC_DSLL_CP2:
+        opn = "dsll";
+        shift_max = 64;
+        goto do_shift;
+    case OPC_DSRL_CP2:
+        opn = "dsrl";
+        shift_max = 64;
+        goto do_shift;
+    case OPC_DSRA_CP2:
+        opn = "dsra";
+        shift_max = 64;
+        goto do_shift;
+    do_shift:
+        {
+            TCGv_i64 mask = tcg_temp_new_i64();
+
+            /* Shift counts of MAX or more produce zero.  Build the
+               all-ones/all-zeros result mask from the unmodified count,
+               before the count is truncated below.  */
+            tcg_gen_setcondi_i64(TCG_COND_LTU, mask, t1, shift_max);
+            tcg_gen_neg_i64(mask, mask);
+
+            /* Make sure shift count isn't TCG undefined behaviour.  */
+            tcg_gen_andi_i64(t1, t1, shift_max - 1);
+
+            switch (opc) {
+            case OPC_SLL_CP2:
+            case OPC_DSLL_CP2:
+                tcg_gen_shl_i64(t0, t0, t1);
+                break;
+            case OPC_SRA_CP2:
+            case OPC_DSRA_CP2:
+                /* Since SRA is UndefinedResult without sign-extended inputs,
+                   we can treat SRA and DSRA the same.  */
+                tcg_gen_sar_i64(t0, t0, t1);
+                break;
+            case OPC_SRL_CP2:
+                /* We want to shift in zeros for SRL; zero-extend first.  */
+                tcg_gen_ext32u_i64(t0, t0);
+                /* FALLTHRU */
+            case OPC_DSRL_CP2:
+                tcg_gen_shr_i64(t0, t0, t1);
+                break;
+            }
+
+            if (shift_max == 32) {
+                tcg_gen_ext32s_i64(t0, t0);
+            }
+
+            tcg_gen_and_i64(t0, t0, mask);
+            tcg_temp_free_i64(mask);
+            break;
+        }
+
+    case OPC_ADD_CP2:
+    case OPC_DADD_CP2:
+        {
+            TCGv_i64 t2 = tcg_temp_new_i64();
+            int lab = gen_new_label();
+
+            tcg_gen_mov_i64(t2, t0);
+            tcg_gen_add_i64(t0, t1, t2);
+            if (opc == OPC_ADD_CP2) {
+                tcg_gen_ext32s_i64(t0, t0);
+            }
+            /* Overflow iff both inputs have the same sign and the sign
+               of the result differs from it.  */
+            tcg_gen_xor_i64(t1, t1, t2);
+            tcg_gen_xor_i64(t2, t2, t0);
+            tcg_gen_andc_i64(t1, t2, t1);
+            tcg_temp_free_i64(t2);
+            tcg_gen_brcondi_i64(TCG_COND_GE, t1, 0, lab);
+            generate_exception(ctx, EXCP_OVERFLOW);
+            gen_set_label(lab);
+
+            opn = (opc == OPC_ADD_CP2 ? "add" : "dadd");
+            break;
+        }
+
+    case OPC_SUB_CP2:
+    case OPC_DSUB_CP2:
+        {
+            TCGv_i64 t2 = tcg_temp_new_i64();
+            int lab = gen_new_label();
+
+            /* t2 keeps the minuend (fs); compute fs - ft as SUBU does.  */
+            tcg_gen_mov_i64(t2, t0);
+            tcg_gen_sub_i64(t0, t2, t1);
+            if (opc == OPC_SUB_CP2) {
+                tcg_gen_ext32s_i64(t0, t0);
+            }
+            /* Overflow iff the inputs have different signs and the sign
+               of the result differs from the minuend's.  */
+            tcg_gen_xor_i64(t1, t1, t2);
+            tcg_gen_xor_i64(t2, t2, t0);
+            tcg_gen_and_i64(t1, t1, t2);
+            tcg_temp_free_i64(t2);
+            tcg_gen_brcondi_i64(TCG_COND_GE, t1, 0, lab);
+            generate_exception(ctx, EXCP_OVERFLOW);
+            gen_set_label(lab);
+
+            opn = (opc == OPC_SUB_CP2 ? "sub" : "dsub");
+            break;
+        }
+
+    case OPC_PMULUW:
+        tcg_gen_ext32u_i64(t0, t0);
+        tcg_gen_ext32u_i64(t1, t1);
+        tcg_gen_mul_i64(t0, t0, t1);
+        opn = "pmuluw";
+        break;
+
+    case OPC_SEQU_CP2:
+    case OPC_SEQ_CP2:
+    case OPC_SLTU_CP2:
+    case OPC_SLT_CP2:
+    case OPC_SLEU_CP2:
+    case OPC_SLE_CP2:
+        /* ??? Document is unclear: Set FCC[CC].  Does that mean the
+           FD field is the CC field?  */
+    default:
+        MIPS_INVAL(opn);
+        generate_exception(ctx, EXCP_RI);
+        tcg_temp_free_i64(t0);
+        tcg_temp_free_i64(t1);
+        return;
+    }
+
+#undef LMI_HELPER
+#undef LMI_HELPER_1
+#undef LMI_DIRECT
+
+    gen_store_fpr64(ctx, t0, rd);
+
+    (void)opn; /* avoid a compiler warning */
+    MIPS_DEBUG("%s %s, %s, %s", opn,
+               fregnames[rd], fregnames[rs], fregnames[rt]);
+    tcg_temp_free_i64(t0);
+    tcg_temp_free_i64(t1);
+}
+
 /* Traps */
 static void gen_trap (DisasContext *ctx, uint32_t opc,
                       int rs, int rt, int16_t imm)
@@ -12344,10 +12713,14 @@  static void decode_opc (CPUMIPSState *env, DisasContext *ctx, int *is_branch)
     case OPC_LDC2:
     case OPC_SWC2:
     case OPC_SDC2:
-    case OPC_CP2:
         /* COP2: Not implemented. */
         generate_exception_err(ctx, EXCP_CpU, 2);
         break;
+    case OPC_CP2:
+        check_insn(env, ctx, INSN_LOONGSON2F);
+        /* Note that these instructions use different fields.  */
+        gen_loongson_multimedia(ctx, sa, rd, rt);
+        break;
 
     case OPC_CP3:
         if (env->CP0_Config1 & (1 << CP0C1_FP)) {