Patchwork [14/17] s390x: Implement opcode helpers

login
register
mail settings
Submitter Alexander Graf
Date March 24, 2011, 3:58 p.m.
Message ID <1300982333-12802-15-git-send-email-agraf@suse.de>
Download mbox | patch
Permalink /patch/88239/
State New
Headers show

Comments

Alexander Graf - March 24, 2011, 3:58 p.m.
There are some instructions that can't (or shouldn't) be expressed by pure
tcg code. For those, we call into externally compiled C functions.

This patch implements those C functions.

Signed-off-by: Alexander Graf <agraf@suse.de>
---
 target-s390x/helpers.h   |  151 +++
 target-s390x/op_helper.c | 2891 +++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 3040 insertions(+), 2 deletions(-)
 create mode 100644 target-s390x/helpers.h
Peter Maydell - March 24, 2011, 5:29 p.m.
On 24 March 2011 15:58, Alexander Graf <agraf@suse.de> wrote:

This is more random comments in passing than a thorough review; sorry.

> +#if HOST_LONG_BITS == 64 && defined(__GNUC__)
> +        /* assuming 64-bit hosts have __uint128_t */
> +        __uint128_t dividend = (((__uint128_t)env->regs[r1]) << 64) |
> +                               (env->regs[r1+1]);
> +        __uint128_t quotient = dividend / divisor;
> +        env->regs[r1+1] = quotient;
> +        __uint128_t remainder = dividend % divisor;
> +        env->regs[r1] = remainder;
> +#else
> +        /* 32-bit hosts would need special wrapper functionality - just abort if
> +           we encounter such a case; it's very unlikely anyways. */
> +        cpu_abort(env, "128 -> 64/64 division not implemented\n");
> +#endif

...I'm still using a 32 bit system :-)

> +/* condition codes for binary FP ops */
> +static uint32_t set_cc_f32(float32 v1, float32 v2)
> +{
> +    if (float32_is_any_nan(v1) || float32_is_any_nan(v2)) {
> +        return 3;
> +    } else if (float32_eq(v1, v2, &env->fpu_status)) {
> +        return 0;
> +    } else if (float32_lt(v1, v2, &env->fpu_status)) {
> +        return 1;
> +    } else {
> +        return 2;
> +    }
> +}

Can you not use float32_compare_quiet() (returns a value
telling you if it's less/equal/greater/unordered)?
If not, needs a comment saying why you need to do it the hard way.

> +/* negative absolute of 32-bit float */
> +uint32_t HELPER(lcebr)(uint32_t f1, uint32_t f2)
> +{
> +    env->fregs[f1].l.upper = float32_sub(float32_zero, env->fregs[f2].l.upper,
> +                                         &env->fpu_status);
> +    return set_cc_nz_f32(env->fregs[f1].l.upper);
> +}

Google suggests this is wrong:
http://publib.boulder.ibm.com/cgi-bin/bookmgr/BOOKS/DZ9AR006/19.4.10?SHELF=&DT=19990630131355&CASE=
says for lcebr that:
"The sign is inverted for any operand, including a QNaN or SNaN,
without causing an arithmetic exception."

but float32_sub will raise exceptions for NaNs. You want
float32_chs() (and similarly for the other types).

> +/* convert 64-bit float to 128-bit float */
> +uint32_t HELPER(lcxbr)(uint32_t f1, uint32_t f2)

Wrong comment? Looks like another invert-sign op from
the online POO.

> +/* 128-bit FP compare RR */
> +uint32_t HELPER(cxbr)(uint32_t f1, uint32_t f2)
> +{
> +    CPU_QuadU v1;
> +    v1.ll.upper = env->fregs[f1].ll;
> +    v1.ll.lower = env->fregs[f1 + 2].ll;
> +    CPU_QuadU v2;
> +    v2.ll.upper = env->fregs[f2].ll;
> +    v2.ll.lower = env->fregs[f2 + 2].ll;
> +    if (float128_is_any_nan(v1.q) || float128_is_any_nan(v2.q)) {
> +        return 3;
> +    } else if (float128_eq(v1.q, v2.q, &env->fpu_status)) {
> +        return 0;
> +    } else if (float128_lt(v1.q, v2.q, &env->fpu_status)) {
> +        return 1;
> +    } else {
> +        return 2;
> +    }
> +}

float128_compare_quiet() again?

> +/* convert 32-bit float to 64-bit int */
> +uint32_t HELPER(cgebr)(uint32_t r1, uint32_t f2, uint32_t m3)
> +{
> +    float32 v2 = env->fregs[f2].l.upper;
> +    set_round_mode(m3);
> +    env->regs[r1] = float32_to_int64(v2, &env->fpu_status);
> +    return set_cc_nz_f32(v2);
> +}

Should this really be permanently setting the rounding mode
for future instructions as well as for the op it does itself?

> +/* load 32-bit FP zero */
> +void HELPER(lzer)(uint32_t f1)
> +{
> +    env->fregs[f1].l.upper = float32_zero;
> +}

Surely this is trivial enough to inline rather than
calling a helper function...

> +/* load 128-bit FP zero */
> +void HELPER(lzxr)(uint32_t f1)
> +{
> +    CPU_QuadU x;
> +    x.q = float64_to_float128(float64_zero, &env->fpu_status);

Yuck. Just define a float128_zero if we need one.

> +uint32_t HELPER(tceb)(uint32_t f1, uint64_t m2)
> +{
> +    float32 v1 = env->fregs[f1].l.upper;
> +    int neg = float32_is_neg(v1);
> +    uint32_t cc = 0;
> +
> +    HELPER_LOG("%s: v1 0x%lx m2 0x%lx neg %d\n", __FUNCTION__, (long)v1, m2, neg);
> +    if ((float32_is_zero(v1) && (m2 & (1 << (11-neg)))) ||
> +        (float32_is_infinity(v1) && (m2 & (1 << (5-neg)))) ||
> +        (float32_is_any_nan(v1) && (m2 & (1 << (3-neg)))) ||
> +        (float32_is_signaling_nan(v1) && (m2 & (1 << (1-neg))))) {
> +        cc = 1;
> +    } else if (m2 & (1 << (9-neg))) {
> +        /* assume normalized number */
> +        cc = 1;
> +    }
> +
> +    /* FIXME: denormalized? */
> +    return cc;
> +}

There's a float32_is_zero_or_denormal(); if you need a
float32_is_denormal() which is false for real zero we
could add it, I guess.

> +static inline uint32_t cc_calc_nabs_32(CPUState *env, int32_t dst)
> +{
> +    return !!dst;
> +}

Another candidate for inlining.

-- PMM
Richard Henderson - March 24, 2011, 5:41 p.m.
On 03/24/2011 10:29 AM, Peter Maydell wrote:
> On 24 March 2011 15:58, Alexander Graf <agraf@suse.de> wrote:
> 
> This is more random comments in passing than a thorough review; sorry.
> 
>> +#if HOST_LONG_BITS == 64 && defined(__GNUC__)
>> +        /* assuming 64-bit hosts have __uint128_t */
>> +        __uint128_t dividend = (((__uint128_t)env->regs[r1]) << 64) |
>> +                               (env->regs[r1+1]);
>> +        __uint128_t quotient = dividend / divisor;
>> +        env->regs[r1+1] = quotient;
>> +        __uint128_t remainder = dividend % divisor;
>> +        env->regs[r1] = remainder;
>> +#else
>> +        /* 32-bit hosts would need special wrapper functionality - just abort if
>> +           we encounter such a case; it's very unlikely anyways. */
>> +        cpu_abort(env, "128 -> 64/64 division not implemented\n");
>> +#endif
> 
> ...I'm still using a 32 bit system :-)

A couple of options:

(1) Steal code from gcc's __[u]divdi3 for implementing double-word division via
    single-word division.  In this case, your "single-word" will be long long.

(2) Implement a simple bit reduction loop.  This is probably easiest.

(3) Reuse some of the softfloat code that manipulates 128bit quantities.  This
    is probably the best option, particularly if the availability of __uint128
    is taught to softfloat so that it doesn't always open-code stuff that the
    compiler could take care of.


r~
Alexander Graf - March 24, 2011, 6:09 p.m.
Am 24.03.2011 um 18:41 schrieb Richard Henderson <rth@twiddle.net>:

> On 03/24/2011 10:29 AM, Peter Maydell wrote:
>> On 24 March 2011 15:58, Alexander Graf <agraf@suse.de> wrote:
>> 
>> This is more random comments in passing than a thorough review; sorry.
>> 
>>> +#if HOST_LONG_BITS == 64 && defined(__GNUC__)
>>> +        /* assuming 64-bit hosts have __uint128_t */
>>> +        __uint128_t dividend = (((__uint128_t)env->regs[r1]) << 64) |
>>> +                               (env->regs[r1+1]);
>>> +        __uint128_t quotient = dividend / divisor;
>>> +        env->regs[r1+1] = quotient;
>>> +        __uint128_t remainder = dividend % divisor;
>>> +        env->regs[r1] = remainder;
>>> +#else
>>> +        /* 32-bit hosts would need special wrapper functionality - just abort if
>>> +           we encounter such a case; it's very unlikely anyways. */
>>> +        cpu_abort(env, "128 -> 64/64 division not implemented\n");
>>> +#endif
>> 
>> ...I'm still using a 32 bit system :-)
> 
> A couple of options:
> 
> (1) Steal code from gcc's __[u]divdi3 for implementing double-word division via
>    single-word division.  In this case, your "single-word" will be long long.
> 
> (2) Implement a simple bit reduction loop.  This is probably easiest.
> 
> (3) Reuse some of the softfloat code that manipulates 128bit quantities.  This
>    is probably the best option, particularly if the availability of __uint128
>    is taught to softfloat so that it doesn't always open-code stuff that the
>    compiler could take care of.

In all applications I've run so far this abort has _never_ fired. I'm not sure gcc even emits it.

So IMHO this abort doesn't hurt. Once we get a bug report of a user hitting it, we can think of ways to implement the missing bits. For now, it's not worse than a not implemented opcode (of which we still have quite a number).


Alex

>
Alexander Graf - March 24, 2011, 6:13 p.m.
Am 24.03.2011 um 18:29 schrieb Peter Maydell <peter.maydell@linaro.org>:

> On 24 March 2011 15:58, Alexander Graf <agraf@suse.de> wrote:
> 
> This is more random comments in passing than a thorough review; sorry.
> 
>> +#if HOST_LONG_BITS == 64 && defined(__GNUC__)
>> +        /* assuming 64-bit hosts have __uint128_t */
>> +        __uint128_t dividend = (((__uint128_t)env->regs[r1]) << 64) |
>> +                               (env->regs[r1+1]);
>> +        __uint128_t quotient = dividend / divisor;
>> +        env->regs[r1+1] = quotient;
>> +        __uint128_t remainder = dividend % divisor;
>> +        env->regs[r1] = remainder;
>> +#else
>> +        /* 32-bit hosts would need special wrapper functionality - just abort if
>> +           we encounter such a case; it's very unlikely anyways. */
>> +        cpu_abort(env, "128 -> 64/64 division not implemented\n");
>> +#endif
> 
> ...I'm still using a 32 bit system :-)
> 
>> +/* condition codes for binary FP ops */
>> +static uint32_t set_cc_f32(float32 v1, float32 v2)
>> +{
>> +    if (float32_is_any_nan(v1) || float32_is_any_nan(v2)) {
>> +        return 3;
>> +    } else if (float32_eq(v1, v2, &env->fpu_status)) {
>> +        return 0;
>> +    } else if (float32_lt(v1, v2, &env->fpu_status)) {
>> +        return 1;
>> +    } else {
>> +        return 2;
>> +    }
>> +}
> 
> Can you not use float32_compare_quiet() (returns a value
> telling you if it's less/equal/greater/unordered)?
> If not, needs a comment saying why you need to do it the hard way.

Phew - I really have (almost) no clue about fp. Those parts come from Uli. So I guess it's the easiest to just ask him :)

> 
>> +/* negative absolute of 32-bit float */
>> +uint32_t HELPER(lcebr)(uint32_t f1, uint32_t f2)
>> +{
>> +    env->fregs[f1].l.upper = float32_sub(float32_zero, env->fregs[f2].l.upper,
>> +                                         &env->fpu_status);
>> +    return set_cc_nz_f32(env->fregs[f1].l.upper);
>> +}
> 
> Google suggests this is wrong:
> http://publib.boulder.ibm.com/cgi-bin/bookmgr/BOOKS/DZ9AR006/19.4.10?SHELF=&DT=19990630131355&CASE=
> says for lcebr that:
> "The sign is inverted for any operand, including a QNaN or SNaN,
> without causing an arithmetic exception."
> 
> but float32_sub will raise exceptions for NaNs. You want
> float32_chs() (and similarly for the other types).
> 
>> +/* convert 64-bit float to 128-bit float */
>> +uint32_t HELPER(lcxbr)(uint32_t f1, uint32_t f2)
> 
> Wrong comment? Looks like another invert-sign op from
> the online POO.
> 
>> +/* 128-bit FP compare RR */
>> +uint32_t HELPER(cxbr)(uint32_t f1, uint32_t f2)
>> +{
>> +    CPU_QuadU v1;
>> +    v1.ll.upper = env->fregs[f1].ll;
>> +    v1.ll.lower = env->fregs[f1 + 2].ll;
>> +    CPU_QuadU v2;
>> +    v2.ll.upper = env->fregs[f2].ll;
>> +    v2.ll.lower = env->fregs[f2 + 2].ll;
>> +    if (float128_is_any_nan(v1.q) || float128_is_any_nan(v2.q)) {
>> +        return 3;
>> +    } else if (float128_eq(v1.q, v2.q, &env->fpu_status)) {
>> +        return 0;
>> +    } else if (float128_lt(v1.q, v2.q, &env->fpu_status)) {
>> +        return 1;
>> +    } else {
>> +        return 2;
>> +    }
>> +}
> 
> float128_compare_quiet() again?
> 
>> +/* convert 32-bit float to 64-bit int */
>> +uint32_t HELPER(cgebr)(uint32_t r1, uint32_t f2, uint32_t m3)
>> +{
>> +    float32 v2 = env->fregs[f2].l.upper;
>> +    set_round_mode(m3);
>> +    env->regs[r1] = float32_to_int64(v2, &env->fpu_status);
>> +    return set_cc_nz_f32(v2);
>> +}
> 
> Should this really be permanently setting the rounding mode
> for future instructions as well as for the op it does itself?
> 
>> +/* load 32-bit FP zero */
>> +void HELPER(lzer)(uint32_t f1)
>> +{
>> +    env->fregs[f1].l.upper = float32_zero;
>> +}
> 
> Surely this is trivial enough to inline rather than
> calling a helper function...
> 
>> +/* load 128-bit FP zero */
>> +void HELPER(lzxr)(uint32_t f1)
>> +{
>> +    CPU_QuadU x;
>> +    x.q = float64_to_float128(float64_zero, &env->fpu_status);
> 
> Yuck. Just define a float128_zero if we need one.
> 
>> +uint32_t HELPER(tceb)(uint32_t f1, uint64_t m2)
>> +{
>> +    float32 v1 = env->fregs[f1].l.upper;
>> +    int neg = float32_is_neg(v1);
>> +    uint32_t cc = 0;
>> +
>> +    HELPER_LOG("%s: v1 0x%lx m2 0x%lx neg %d\n", __FUNCTION__, (long)v1, m2, neg);
>> +    if ((float32_is_zero(v1) && (m2 & (1 << (11-neg)))) ||
>> +        (float32_is_infinity(v1) && (m2 & (1 << (5-neg)))) ||
>> +        (float32_is_any_nan(v1) && (m2 & (1 << (3-neg)))) ||
>> +        (float32_is_signaling_nan(v1) && (m2 & (1 << (1-neg))))) {
>> +        cc = 1;
>> +    } else if (m2 & (1 << (9-neg))) {
>> +        /* assume normalized number */
>> +        cc = 1;
>> +    }
>> +
>> +    /* FIXME: denormalized? */
>> +    return cc;
>> +}
> 
> There's a float32_is_zero_or_denormal(); if you need a
> float32_is_denormal() which is false for real zero we
> could add it, I guess.
> 
>> +static inline uint32_t cc_calc_nabs_32(CPUState *env, int32_t dst)
>> +{
>> +    return !!dst;
>> +}
> 
> Another candidate for inlining.
> 
> -- PMM
Alexander Graf - March 28, 2011, 5:23 p.m.
On 03/24/2011 06:29 PM, Peter Maydell wrote:
> On 24 March 2011 15:58, Alexander Graf<agraf@suse.de>  wrote:
>
> This is more random comments in passing than a thorough review; sorry.
>
>> +#if HOST_LONG_BITS == 64&&  defined(__GNUC__)
>> +        /* assuming 64-bit hosts have __uint128_t */
>> +        __uint128_t dividend = (((__uint128_t)env->regs[r1])<<  64) |
>> +                               (env->regs[r1+1]);
>> +        __uint128_t quotient = dividend / divisor;
>> +        env->regs[r1+1] = quotient;
>> +        __uint128_t remainder = dividend % divisor;
>> +        env->regs[r1] = remainder;
>> +#else
>> +        /* 32-bit hosts would need special wrapper functionality - just abort if
>> +           we encounter such a case; it's very unlikely anyways. */
>> +        cpu_abort(env, "128 ->  64/64 division not implemented\n");
>> +#endif
> ...I'm still using a 32 bit system :-)
>
>> +/* condition codes for binary FP ops */
>> +static uint32_t set_cc_f32(float32 v1, float32 v2)
>> +{
>> +    if (float32_is_any_nan(v1) || float32_is_any_nan(v2)) {
>> +        return 3;
>> +    } else if (float32_eq(v1, v2,&env->fpu_status)) {
>> +        return 0;
>> +    } else if (float32_lt(v1, v2,&env->fpu_status)) {
>> +        return 1;
>> +    } else {
>> +        return 2;
>> +    }
>> +}
> Can you not use float32_compare_quiet() (returns a value
> telling you if it's less/equal/greater/unordered)?
> If not, needs a comment saying why you need to do it the hard way.

I just checked the macros there and it looks like float32_compare_quiet 
returns eq when both numbers are NaN. We would still have to convert 
from the return value from that over to a CC value. I honestly don't see 
any benefit - the code doesn't become cleaner or smaller.

int float32_compare_quiet( float32 a, float32 b STATUS_PARAM )
{
     if (isless(a, b)) {
         return float_relation_less;
     } else if (a == b) {
         return float_relation_equal;
     } else if (isgreater(a, b)) {
         return float_relation_greater;
     } else {
         return float_relation_unordered;
     }
}


so

static uint32_t set_cc_f32(float32 v1, float32 v2)
{
     if (float32_is_any_nan(v1) || float32_is_any_nan(v2)) {
         return 3;
     } else if (float32_eq(v1, v2, &env->fpu_status)) {
         return 0;
     } else if (float32_lt(v1, v2, &env->fpu_status)) {
         return 1;
     } else {
         return 2;
     }
}

would become

static uint32_t set_cc_f32(float32 v1, float32 v2)
{
     int r;

     if (float32_is_any_nan(v1) || float32_is_any_nan(v2)) {
         return 3;
     }

     r = float32_compare_quiet(v1, v2, &env->fpu_status);
     switch (r) {
     case float_relation_equal:
         return 0;
     case float_relation_less:
         return 1;
     default:
         return 2;
     }
}


>> +/* negative absolute of 32-bit float */
>> +uint32_t HELPER(lcebr)(uint32_t f1, uint32_t f2)
>> +{
>> +    env->fregs[f1].l.upper = float32_sub(float32_zero, env->fregs[f2].l.upper,
>> +&env->fpu_status);
>> +    return set_cc_nz_f32(env->fregs[f1].l.upper);
>> +}
> Google suggests this is wrong:
> http://publib.boulder.ibm.com/cgi-bin/bookmgr/BOOKS/DZ9AR006/19.4.10?SHELF=&DT=19990630131355&CASE=
> says for lcebr that:
> "The sign is inverted for any operand, including a QNaN or SNaN,
> without causing an arithmetic exception."
>
> but float32_sub will raise exceptions for NaNs. You want
> float32_chs() (and similarly for the other types).

Ah, nice :). Thanks!

>> +/* convert 64-bit float to 128-bit float */
>> +uint32_t HELPER(lcxbr)(uint32_t f1, uint32_t f2)
> Wrong comment? Looks like another invert-sign op from
> the online POO.

Yup, wrong comment and the same as above for chs :).

>> +/* 128-bit FP compare RR */
>> +uint32_t HELPER(cxbr)(uint32_t f1, uint32_t f2)
>> +{
>> +    CPU_QuadU v1;
>> +    v1.ll.upper = env->fregs[f1].ll;
>> +    v1.ll.lower = env->fregs[f1 + 2].ll;
>> +    CPU_QuadU v2;
>> +    v2.ll.upper = env->fregs[f2].ll;
>> +    v2.ll.lower = env->fregs[f2 + 2].ll;
>> +    if (float128_is_any_nan(v1.q) || float128_is_any_nan(v2.q)) {
>> +        return 3;
>> +    } else if (float128_eq(v1.q, v2.q,&env->fpu_status)) {
>> +        return 0;
>> +    } else if (float128_lt(v1.q, v2.q,&env->fpu_status)) {
>> +        return 1;
>> +    } else {
>> +        return 2;
>> +    }
>> +}
> float128_compare_quiet() again?

See above :)

>> +/* convert 32-bit float to 64-bit int */
>> +uint32_t HELPER(cgebr)(uint32_t r1, uint32_t f2, uint32_t m3)
>> +{
>> +    float32 v2 = env->fregs[f2].l.upper;
>> +    set_round_mode(m3);
>> +    env->regs[r1] = float32_to_int64(v2,&env->fpu_status);
>> +    return set_cc_nz_f32(v2);
>> +}
> Should this really be permanently setting the rounding mode
> for future instructions as well as for the op it does itself?

IIUC every op that does rounding sets the rounding mode, no?

>> +/* load 32-bit FP zero */
>> +void HELPER(lzer)(uint32_t f1)
>> +{
>> +    env->fregs[f1].l.upper = float32_zero;
>> +}
> Surely this is trivial enough to inline rather than
> calling a helper function...

Lots of the FPU code could use inlining. The CC stuff does too. For now, 
I kept things the way Uli wrote them :).

>> +/* load 128-bit FP zero */
>> +void HELPER(lzxr)(uint32_t f1)
>> +{
>> +    CPU_QuadU x;
>> +    x.q = float64_to_float128(float64_zero,&env->fpu_status);
> Yuck. Just define a float128_zero if we need one.

Good point. Mind to do so? I find myself struggling with the code there.

>> +uint32_t HELPER(tceb)(uint32_t f1, uint64_t m2)
>> +{
>> +    float32 v1 = env->fregs[f1].l.upper;
>> +    int neg = float32_is_neg(v1);
>> +    uint32_t cc = 0;
>> +
>> +    HELPER_LOG("%s: v1 0x%lx m2 0x%lx neg %d\n", __FUNCTION__, (long)v1, m2, neg);
>> +    if ((float32_is_zero(v1)&&  (m2&  (1<<  (11-neg)))) ||
>> +        (float32_is_infinity(v1)&&  (m2&  (1<<  (5-neg)))) ||
>> +        (float32_is_any_nan(v1)&&  (m2&  (1<<  (3-neg)))) ||
>> +        (float32_is_signaling_nan(v1)&&  (m2&  (1<<  (1-neg))))) {
>> +        cc = 1;
>> +    } else if (m2&  (1<<  (9-neg))) {
>> +        /* assume normalized number */
>> +        cc = 1;
>> +    }
>> +
>> +    /* FIXME: denormalized? */
>> +    return cc;
>> +}
> There's a float32_is_zero_or_denormal(); if you need a
> float32_is_denormal() which is false for real zero we
> could add it, I guess.

Good point for a follow-up :)

>> +static inline uint32_t cc_calc_nabs_32(CPUState *env, int32_t dst)
>> +{
>> +    return !!dst;
>> +}
> Another candidate for inlining.

Same as above :)


Alex
Richard Henderson - March 28, 2011, 5:42 p.m.
On 03/28/2011 10:23 AM, Alexander Graf wrote:
> I just checked the macros there and it looks like
> float32_compare_quiet returns eq when both numbers are NaN.

No it doesn't -- a == b will never be true for either operand as NaN.

Have a look at the "real" softfloat version anyway, not the
softfloat-native version.  This is hiding behind macros; 
look for "float ## s ## _compare_internal".



r~
Peter Maydell - March 28, 2011, 5:55 p.m.
On 28 March 2011 18:23, Alexander Graf <agraf@suse.de> wrote:
> On 03/24/2011 06:29 PM, Peter Maydell wrote:
>>> +/* condition codes for binary FP ops */
>>> +static uint32_t set_cc_f32(float32 v1, float32 v2)
>>> +{
>>> +    if (float32_is_any_nan(v1) || float32_is_any_nan(v2)) {
>>> +        return 3;
>>> +    } else if (float32_eq(v1, v2,&env->fpu_status)) {
>>> +        return 0;
>>> +    } else if (float32_lt(v1, v2,&env->fpu_status)) {
>>> +        return 1;
>>> +    } else {
>>> +        return 2;
>>> +    }
>>> +}
>>
>> Can you not use float32_compare_quiet() (returns a value
>> telling you if it's less/equal/greater/unordered)?
>> If not, needs a comment saying why you need to do it the hard way.
>
> I just checked the macros there and it looks like float32_compare_quiet
> returns eq when both numbers are NaN.

Hmm?

    if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) &&                    \
         extractFloat ## s ## Frac( a ) ) ||                                 \
        ( ( extractFloat ## s ## Exp( b ) == nan_exp ) &&                    \
          extractFloat ## s ## Frac( b ) )) {                                \
        if (!is_quiet ||                                                     \
            float ## s ## _is_signaling_nan( a ) ||                          \
            float ## s ## _is_signaling_nan( b ) ) {                         \
            float_raise( float_flag_invalid STATUS_VAR);                     \
        }                                                                    \
        return float_relation_unordered;                                     \
    }                                                                        \

If A is a NaN (ie its exponent is nan_exp and the frac bits aren't zero)
or B is a NaN then we return float_relation_unordered.

> We would still have to convert from
> the return value from that over to a CC value. I honestly don't see any
> benefit - the code doesn't become cleaner or smaller.

So you get
static uint32_t set_cc_f32(float32 v1, float32 v2)
{
    switch (float32_compare_quiet(v1, v2, &env->fpu_status)) {
    case float_relation_unordered:
        return 3;
    case float_relation_equal:
        return 0;
    case float_relation_less:
        return 1;
    case float_relation_greater:
        return 2;
    case float_relation_unordered:
        return 3;
    }
}

(and you probably want to put the conversion switch into a function
since you'll be using it several times.)

Which I think is pretty straightforward, cleaner because we only
call one softfloat function rather than several, and should be
faster too (we get to avoid repeating a pile of tedious bit manipulation
in the eq and lt functions).

>>> +/* load 128-bit FP zero */
>>> +void HELPER(lzxr)(uint32_t f1)
>>> +{
>>> +    CPU_QuadU x;
>>> +    x.q = float64_to_float128(float64_zero,&env->fpu_status);
>>
>> Yuck. Just define a float128_zero if we need one.
>
> Good point. Mind to do so? I find myself struggling with the code there.

We could just follow the pattern of  float128_default_nan_{low,high}
in softfloat.h:

#define float128_zero_low LIT64(0)
#define float128_zero_high LIT64(0)

then your function has:
 x.q.high = float128_zero_high;
 x.q.low = float128_zero_low;

Or we could do something with an expression that returns a
struct type; that would be cleaner. I think the default nan
code is assuming it might have to be compiled with something
other than gcc. However I forget the C syntax and have to go
home now :-)

-- PMM
Alexander Graf - March 29, 2011, 9:04 a.m.
On 28.03.2011, at 19:55, Peter Maydell wrote:

> On 28 March 2011 18:23, Alexander Graf <agraf@suse.de> wrote:
>> On 03/24/2011 06:29 PM, Peter Maydell wrote:
>>>> +/* condition codes for binary FP ops */
>>>> +static uint32_t set_cc_f32(float32 v1, float32 v2)
>>>> +{
>>>> +    if (float32_is_any_nan(v1) || float32_is_any_nan(v2)) {
>>>> +        return 3;
>>>> +    } else if (float32_eq(v1, v2,&env->fpu_status)) {
>>>> +        return 0;
>>>> +    } else if (float32_lt(v1, v2,&env->fpu_status)) {
>>>> +        return 1;
>>>> +    } else {
>>>> +        return 2;
>>>> +    }
>>>> +}
>>> 
>>> Can you not use float32_compare_quiet() (returns a value
>>> telling you if it's less/equal/greater/unordered)?
>>> If not, needs a comment saying why you need to do it the hard way.
>> 
>> I just checked the macros there and it looks like float32_compare_quiet
>> returns eq when both numbers are NaN.
> 
> Hmm?
> 
>    if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) &&                    \
>         extractFloat ## s ## Frac( a ) ) ||                                 \
>        ( ( extractFloat ## s ## Exp( b ) == nan_exp ) &&                    \
>          extractFloat ## s ## Frac( b ) )) {                                \
>        if (!is_quiet ||                                                     \
>            float ## s ## _is_signaling_nan( a ) ||                          \
>            float ## s ## _is_signaling_nan( b ) ) {                         \
>            float_raise( float_flag_invalid STATUS_VAR);                     \
>        }                                                                    \
>        return float_relation_unordered;                                     \
>    }                                                                        \
> 
> If A is a NaN (ie its exponent is nan_exp and the frac bits aren't zero)
> or B is a NaN then we return float_relation_unordered.
> 
>> We would still have to convert from
>> the return value from that over to a CC value. I honestly don't see any
>> benefit - the code doesn't become cleaner or smaller.
> 
> So you get
> static uint32_t set_cc_f32(float32 v1, float32 v2)
> {
>    switch (float32_compare_quiet(v1, v2, &env->fpu_status)) {
>    case float_relation_unordered:
>        return 3;
>    case float_relation_equal:
>        return 0;
>    case float_relation_less:
>        return 1;
>    case float_relation_greater:
>        return 2;
>    case float_relation_unordered:
>        return 3;
>    }
> }
> 
> (and you probably want to put the conversion switch into a function
> since you'll be using it several times.)
> 
> Which I think is pretty straightforward, cleaner because we only
> call one softfloat function rather than several, and should be
> faster too (we get to avoid repeating a pile of tedious bit manipulation
> in the eq and lt functions).

Alrighty, changed the code :).

> 
>>>> +/* load 128-bit FP zero */
>>>> +void HELPER(lzxr)(uint32_t f1)
>>>> +{
>>>> +    CPU_QuadU x;
>>>> +    x.q = float64_to_float128(float64_zero,&env->fpu_status);
>>> 
>>> Yuck. Just define a float128_zero if we need one.
>> 
>> Good point. Mind to do so? I find myself struggling with the code there.
> 
> We could just follow the pattern of  float128_default_nan_{low,high}
> in softfloat.h:
> 
> #define float128_zero_low LIT64(0)
> #define float128_zero_high LIT64(0)
> 
> then your function has:
> x.q.high = float128_zero_high;
> x.q.low = float128_zero_low;
> 
> Or we could do something with an expression that returns a
> struct type; that would be cleaner. I think the default nan
> code is assuming it might have to be compiled with something
> other than gcc. However I forget the C syntax and have to go
> home now :-)

I'll just leave it to you for a follow-up patch :). The less I have to touch in softfpu code, the better :)


Alex

Patch

diff --git a/target-s390x/helpers.h b/target-s390x/helpers.h
new file mode 100644
index 0000000..6ca48eb
--- /dev/null
+++ b/target-s390x/helpers.h
@@ -0,0 +1,151 @@ 
+#include "def-helper.h"
+
+DEF_HELPER_1(exception, void, i32)
+DEF_HELPER_3(nc, i32, i32, i64, i64)
+DEF_HELPER_3(oc, i32, i32, i64, i64)
+DEF_HELPER_3(xc, i32, i32, i64, i64)
+DEF_HELPER_3(mvc, void, i32, i64, i64)
+DEF_HELPER_3(clc, i32, i32, i64, i64)
+DEF_HELPER_2(mvcl, i32, i32, i32)
+DEF_HELPER_FLAGS_1(set_cc_comp_s32, TCG_CALL_PURE|TCG_CALL_CONST, i32, s32)
+DEF_HELPER_FLAGS_1(set_cc_comp_s64, TCG_CALL_PURE|TCG_CALL_CONST, i32, s64)
+DEF_HELPER_FLAGS_2(set_cc_icm, TCG_CALL_PURE|TCG_CALL_CONST, i32, i32, i32)
+DEF_HELPER_3(clm, i32, i32, i32, i64)
+DEF_HELPER_3(stcm, void, i32, i32, i64)
+DEF_HELPER_2(mlg, void, i32, i64)
+DEF_HELPER_2(dlg, void, i32, i64)
+DEF_HELPER_FLAGS_3(set_cc_add64, TCG_CALL_PURE|TCG_CALL_CONST, i32, s64, s64, s64)
+DEF_HELPER_FLAGS_3(set_cc_addu64, TCG_CALL_PURE|TCG_CALL_CONST, i32, i64, i64, i64)
+DEF_HELPER_FLAGS_3(set_cc_add32, TCG_CALL_PURE|TCG_CALL_CONST, i32, s32, s32, s32)
+DEF_HELPER_FLAGS_3(set_cc_addu32, TCG_CALL_PURE|TCG_CALL_CONST, i32, i32, i32, i32)
+DEF_HELPER_FLAGS_3(set_cc_sub64, TCG_CALL_PURE|TCG_CALL_CONST, i32, s64, s64, s64)
+DEF_HELPER_FLAGS_3(set_cc_subu64, TCG_CALL_PURE|TCG_CALL_CONST, i32, i64, i64, i64)
+DEF_HELPER_FLAGS_3(set_cc_sub32, TCG_CALL_PURE|TCG_CALL_CONST, i32, s32, s32, s32)
+DEF_HELPER_FLAGS_3(set_cc_subu32, TCG_CALL_PURE|TCG_CALL_CONST, i32, i32, i32, i32)
+DEF_HELPER_3(srst, i32, i32, i32, i32)
+DEF_HELPER_3(clst, i32, i32, i32, i32)
+DEF_HELPER_3(mvpg, void, i64, i64, i64)
+DEF_HELPER_3(mvst, void, i32, i32, i32)
+DEF_HELPER_3(csg, i32, i32, i64, i32)
+DEF_HELPER_3(cdsg, i32, i32, i64, i32)
+DEF_HELPER_3(cs, i32, i32, i64, i32)
+DEF_HELPER_4(ex, i32, i32, i64, i64, i64)
+DEF_HELPER_FLAGS_1(abs_i32, TCG_CALL_PURE|TCG_CALL_CONST, i32, s32)
+DEF_HELPER_FLAGS_1(nabs_i32, TCG_CALL_PURE|TCG_CALL_CONST, s32, s32)
+DEF_HELPER_FLAGS_1(abs_i64, TCG_CALL_PURE|TCG_CALL_CONST, i64, s64)
+DEF_HELPER_FLAGS_1(nabs_i64, TCG_CALL_PURE|TCG_CALL_CONST, s64, s64)
+DEF_HELPER_3(stcmh, void, i32, i64, i32)
+DEF_HELPER_3(icmh, i32, i32, i64, i32)
+DEF_HELPER_2(ipm, void, i32, i32)
+DEF_HELPER_FLAGS_3(addc_u32, TCG_CALL_PURE|TCG_CALL_CONST, i32, i32, i32, i32)
+DEF_HELPER_FLAGS_3(set_cc_addc_u64, TCG_CALL_PURE|TCG_CALL_CONST, i32, i64, i64, i64)
+DEF_HELPER_3(stam, void, i32, i64, i32)
+DEF_HELPER_3(lam, void, i32, i64, i32)
+DEF_HELPER_3(mvcle, i32, i32, i64, i32)
+DEF_HELPER_3(clcle, i32, i32, i64, i32)
+DEF_HELPER_3(slb, i32, i32, i32, i32)
+DEF_HELPER_4(slbg, i32, i32, i32, i64, i64)
+DEF_HELPER_2(cefbr, void, i32, s32)
+DEF_HELPER_2(cdfbr, void, i32, s32)
+DEF_HELPER_2(cxfbr, void, i32, s32)
+DEF_HELPER_2(cegbr, void, i32, s64)
+DEF_HELPER_2(cdgbr, void, i32, s64)
+DEF_HELPER_2(cxgbr, void, i32, s64)
+DEF_HELPER_2(adbr, i32, i32, i32)
+DEF_HELPER_2(aebr, i32, i32, i32)
+DEF_HELPER_2(sebr, i32, i32, i32)
+DEF_HELPER_2(sdbr, i32, i32, i32)
+DEF_HELPER_2(debr, void, i32, i32)
+DEF_HELPER_2(dxbr, void, i32, i32)
+DEF_HELPER_2(mdbr, void, i32, i32)
+DEF_HELPER_2(mxbr, void, i32, i32)
+DEF_HELPER_2(ldebr, void, i32, i32)
+DEF_HELPER_2(ldxbr, void, i32, i32)
+DEF_HELPER_2(lxdbr, void, i32, i32)
+DEF_HELPER_2(ledbr, void, i32, i32)
+DEF_HELPER_2(lexbr, void, i32, i32)
+DEF_HELPER_2(lpebr, i32, i32, i32)
+DEF_HELPER_2(lpdbr, i32, i32, i32)
+DEF_HELPER_2(lpxbr, i32, i32, i32)
+DEF_HELPER_2(ltebr, i32, i32, i32)
+DEF_HELPER_2(ltdbr, i32, i32, i32)
+DEF_HELPER_2(ltxbr, i32, i32, i32)
+DEF_HELPER_2(lcebr, i32, i32, i32)
+DEF_HELPER_2(lcdbr, i32, i32, i32)
+DEF_HELPER_2(lcxbr, i32, i32, i32)
+DEF_HELPER_2(aeb, void, i32, i32)
+DEF_HELPER_2(deb, void, i32, i32)
+DEF_HELPER_2(meeb, void, i32, i32)
+DEF_HELPER_2(cdb, i32, i32, i64)
+DEF_HELPER_2(adb, i32, i32, i64)
+DEF_HELPER_2(seb, void, i32, i32)
+DEF_HELPER_2(sdb, i32, i32, i64)
+DEF_HELPER_2(mdb, void, i32, i64)
+DEF_HELPER_2(ddb, void, i32, i64)
+DEF_HELPER_FLAGS_2(cebr, TCG_CALL_PURE, i32, i32, i32)
+DEF_HELPER_FLAGS_2(cdbr, TCG_CALL_PURE, i32, i32, i32)
+DEF_HELPER_FLAGS_2(cxbr, TCG_CALL_PURE, i32, i32, i32)
+DEF_HELPER_3(cgebr, i32, i32, i32, i32)
+DEF_HELPER_3(cgdbr, i32, i32, i32, i32)
+DEF_HELPER_3(cgxbr, i32, i32, i32, i32)
+DEF_HELPER_1(lzer, void, i32)
+DEF_HELPER_1(lzdr, void, i32)
+DEF_HELPER_1(lzxr, void, i32)
+DEF_HELPER_3(cfebr, i32, i32, i32, i32)
+DEF_HELPER_3(cfdbr, i32, i32, i32, i32)
+DEF_HELPER_3(cfxbr, i32, i32, i32, i32)
+DEF_HELPER_2(axbr, i32, i32, i32)
+DEF_HELPER_2(sxbr, i32, i32, i32)
+DEF_HELPER_2(meebr, void, i32, i32)
+DEF_HELPER_2(ddbr, void, i32, i32)
+DEF_HELPER_3(madb, void, i32, i64, i32)
+DEF_HELPER_3(maebr, void, i32, i32, i32)
+DEF_HELPER_3(madbr, void, i32, i32, i32)
+DEF_HELPER_3(msdbr, void, i32, i32, i32)
+DEF_HELPER_2(lxdb, void, i32, i64)
+DEF_HELPER_FLAGS_2(tceb, TCG_CALL_PURE, i32, i32, i64)
+DEF_HELPER_FLAGS_2(tcdb, TCG_CALL_PURE, i32, i32, i64)
+DEF_HELPER_FLAGS_2(tcxb, TCG_CALL_PURE, i32, i32, i64)
+DEF_HELPER_2(flogr, i32, i32, i64)
+DEF_HELPER_2(sqdbr, void, i32, i32)
+DEF_HELPER_FLAGS_1(cvd, TCG_CALL_PURE|TCG_CALL_CONST, i64, s32)
+DEF_HELPER_3(unpk, void, i32, i64, i64)
+DEF_HELPER_3(tr, void, i32, i64, i64)
+
+DEF_HELPER_2(servc, i32, i32, i64)
+DEF_HELPER_3(diag, i64, i32, i64, i64)
+DEF_HELPER_2(load_psw, void, i64, i64)
+DEF_HELPER_1(program_interrupt, void, i32)
+DEF_HELPER_FLAGS_1(stidp, TCG_CALL_CONST, void, i64)
+DEF_HELPER_FLAGS_1(spx, TCG_CALL_CONST, void, i64)
+DEF_HELPER_FLAGS_1(sck, TCG_CALL_CONST, i32, i64)
+DEF_HELPER_1(stck, i32, i64)
+DEF_HELPER_1(stcke, i32, i64)
+DEF_HELPER_FLAGS_1(sckc, TCG_CALL_CONST, void, i64)
+DEF_HELPER_FLAGS_1(stckc, TCG_CALL_CONST, void, i64)
+DEF_HELPER_FLAGS_1(spt, TCG_CALL_CONST, void, i64)
+DEF_HELPER_FLAGS_1(stpt, TCG_CALL_CONST, void, i64)
+DEF_HELPER_3(stsi, i32, i64, i32, i32)
+DEF_HELPER_3(lctl, void, i32, i64, i32)
+DEF_HELPER_3(lctlg, void, i32, i64, i32)
+DEF_HELPER_3(stctl, void, i32, i64, i32)
+DEF_HELPER_3(stctg, void, i32, i64, i32)
+DEF_HELPER_FLAGS_2(tprot, TCG_CALL_CONST, i32, i64, i64)
+DEF_HELPER_FLAGS_1(iske, TCG_CALL_PURE|TCG_CALL_CONST, i64, i64)
+DEF_HELPER_FLAGS_2(sske, TCG_CALL_CONST, void, i32, i64)
+DEF_HELPER_FLAGS_2(rrbe, TCG_CALL_CONST, i32, i32, i64)
+DEF_HELPER_2(csp, i32, i32, i32)
+DEF_HELPER_3(mvcs, i32, i64, i64, i64)
+DEF_HELPER_3(mvcp, i32, i64, i64, i64)
+DEF_HELPER_3(sigp, i32, i64, i32, i64)
+DEF_HELPER_1(sacf, void, i64)
+DEF_HELPER_FLAGS_2(ipte, TCG_CALL_CONST, void, i64, i64)
+DEF_HELPER_FLAGS_0(ptlb, TCG_CALL_CONST, void)
+DEF_HELPER_2(lra, i32, i64, i32)
+DEF_HELPER_2(stura, void, i64, i32)
+DEF_HELPER_2(cksm, void, i32, i32)
+
+DEF_HELPER_FLAGS_4(calc_cc, TCG_CALL_PURE|TCG_CALL_CONST,
+                   i32, i32, i64, i64, i64)
+
+#include "def-helper.h"
diff --git a/target-s390x/op_helper.c b/target-s390x/op_helper.c
index 402df2d..ca59d7d 100644
--- a/target-s390x/op_helper.c
+++ b/target-s390x/op_helper.c
@@ -1,6 +1,7 @@ 
 /*
  *  S/390 helper routines
  *
+ *  Copyright (c) 2009 Ulrich Hecht
  *  Copyright (c) 2009 Alexander Graf
  *
  * This library is free software; you can redistribute it and/or
@@ -18,6 +19,11 @@ 
  */
 
 #include "exec.h"
+#include "host-utils.h"
+#include "helpers.h"
+#include <string.h>
+#include "kvm.h"
+#include "qemu-timer.h"
 
 /*****************************************************************************/
 /* Softmmu support */
@@ -64,10 +70,2891 @@  void tlb_fill (target_ulong addr, int is_write, int mmu_idx, void *retaddr)
                 cpu_restore_state(tb, env, pc, NULL);
             }
         }
-        /* XXX */
-        /* helper_raise_exception_err(env->exception_index, env->error_code); */
+        cpu_loop_exit();
     }
     env = saved_env;
 }
 
 #endif
+/* #define DEBUG_HELPER */
+#ifdef DEBUG_HELPER
+#define HELPER_LOG(x...) qemu_log(x)
+#else
+#define HELPER_LOG(x...)
+#endif
+
+/* raise an exception */
+void HELPER(exception)(uint32_t excp)
+{
+    HELPER_LOG("%s: exception %d\n", __FUNCTION__, excp);
+    env->exception_index = excp;
+    cpu_loop_exit();
+}
+
+#ifndef CONFIG_USER_ONLY
+static void mvc_fast_memset(CPUState *env, uint32_t l, uint64_t dest,
+                            uint8_t byte)
+{
+    target_phys_addr_t dest_phys;
+    target_phys_addr_t len = l;
+    void *dest_p;
+    uint64_t asc = env->psw.mask & PSW_MASK_ASC;
+    int flags;
+
+    if (mmu_translate(env, dest, 1, asc, &dest_phys, &flags)) {
+        stb(dest, byte);
+        cpu_abort(env, "should never reach here");
+    }
+    dest_phys |= dest & ~TARGET_PAGE_MASK;
+
+    dest_p = cpu_physical_memory_map(dest_phys, &len, 1);
+
+    memset(dest_p, byte, len);
+
+    cpu_physical_memory_unmap(dest_p, 1, len, len);
+}
+
+static void mvc_fast_memmove(CPUState *env, uint32_t l, uint64_t dest,
+                             uint64_t src)
+{
+    target_phys_addr_t dest_phys;
+    target_phys_addr_t src_phys;
+    target_phys_addr_t len = l;
+    void *dest_p;
+    void *src_p;
+    uint64_t asc = env->psw.mask & PSW_MASK_ASC;
+    int flags;
+
+    if (mmu_translate(env, dest, 1, asc, &dest_phys, &flags)) {
+        stb(dest, 0);
+        cpu_abort(env, "should never reach here");
+    }
+    dest_phys |= dest & ~TARGET_PAGE_MASK;
+
+    if (mmu_translate(env, src, 0, asc, &src_phys, &flags)) {
+        ldub(src);
+        cpu_abort(env, "should never reach here");
+    }
+    src_phys |= src & ~TARGET_PAGE_MASK;
+
+    dest_p = cpu_physical_memory_map(dest_phys, &len, 1);
+    src_p = cpu_physical_memory_map(src_phys, &len, 0);
+
+    memmove(dest_p, src_p, len);
+
+    cpu_physical_memory_unmap(dest_p, 1, len, len);
+    cpu_physical_memory_unmap(src_p, 0, len, len);
+}
+#endif
+
+/* and on array */
+uint32_t HELPER(nc)(uint32_t l, uint64_t dest, uint64_t src)
+{
+    int i;
+    unsigned char x;
+    uint32_t cc = 0;
+
+    HELPER_LOG("%s l %d dest %" PRIx64 " src %" PRIx64 "\n",
+               __FUNCTION__, l, dest, src);
+    for (i = 0; i <= l; i++) {
+        x = ldub(dest + i) & ldub(src + i);
+        if (x) {
+            cc = 1;
+        }
+        stb(dest + i, x);
+    }
+    return cc;
+}
+
+/* xor on array */
+uint32_t HELPER(xc)(uint32_t l, uint64_t dest, uint64_t src)
+{
+    int i;
+    unsigned char x;
+    uint32_t cc = 0;
+
+    HELPER_LOG("%s l %d dest %" PRIx64 " src %" PRIx64 "\n",
+               __FUNCTION__, l, dest, src);
+
+#ifndef CONFIG_USER_ONLY
+    /* xor with itself is the same as memset(0) */
+    if ((l > 32) && (src == dest) &&
+        (src & TARGET_PAGE_MASK) == ((src + l) & TARGET_PAGE_MASK)) {
+        mvc_fast_memset(env, l + 1, dest, 0);
+        return 0;
+    }
+#else
+    if (src == dest) {
+        memset((void*)dest, 0, l + 1);
+        return 0;
+    }
+#endif
+
+    for (i = 0; i <= l; i++) {
+        x = ldub(dest + i) ^ ldub(src + i);
+        if (x) {
+            cc = 1;
+        }
+        stb(dest + i, x);
+    }
+    return cc;
+}
+
+/* or on array */
+uint32_t HELPER(oc)(uint32_t l, uint64_t dest, uint64_t src)
+{
+    int i;
+    unsigned char x;
+    uint32_t cc = 0;
+
+    HELPER_LOG("%s l %d dest %" PRIx64 " src %" PRIx64 "\n",
+               __FUNCTION__, l, dest, src);
+    for (i = 0; i <= l; i++) {
+        x = ldub(dest + i) | ldub(src + i);
+        if (x) {
+            cc = 1;
+        }
+        stb(dest + i, x);
+    }
+    return cc;
+}
+
+/* memmove */
+void HELPER(mvc)(uint32_t l, uint64_t dest, uint64_t src)
+{
+    int i = 0;
+    int x = 0;
+    uint32_t l_64 = (l + 1) / 8;
+
+    HELPER_LOG("%s l %d dest %" PRIx64 " src %" PRIx64 "\n",
+               __FUNCTION__, l, dest, src);
+
+#ifndef CONFIG_USER_ONLY
+    if ((l > 32) &&
+        (src & TARGET_PAGE_MASK) == ((src + l) & TARGET_PAGE_MASK) &&
+        (dest & TARGET_PAGE_MASK) == ((dest + l) & TARGET_PAGE_MASK)) {
+        if (dest == (src + 1)) {
+            mvc_fast_memset(env, l + 1, dest, ldub(src));
+            return;
+        } else if ((src & TARGET_PAGE_MASK) != (dest & TARGET_PAGE_MASK)) {
+            mvc_fast_memmove(env, l + 1, dest, src);
+            return;
+        }
+    }
+#else
+    if (dest == (src + 1)) {
+        memset((void*)dest, ldub(src), l + 1);
+        return;
+    } else {
+        memmove((void*)dest, (void*)src, l + 1);
+        return;
+    }
+#endif
+
+    /* handle the parts that fit into 8-byte loads/stores */
+    if (dest != (src + 1)) {
+        for (i = 0; i < l_64; i++) {
+            stq(dest + x, ldq(src + x));
+            x += 8;
+        }
+    }
+
+    /* slow version crossing pages with byte accesses */
+    for (i = x; i <= l; i++) {
+        stb(dest + i, ldub(src + i));
+    }
+}
+
+/* compare unsigned byte arrays */
+uint32_t HELPER(clc)(uint32_t l, uint64_t s1, uint64_t s2)
+{
+    int i;
+    unsigned char x,y;
+    uint32_t cc;
+    HELPER_LOG("%s l %d s1 %" PRIx64 " s2 %" PRIx64 "\n",
+               __FUNCTION__, l, s1, s2);
+    for (i = 0; i <= l; i++) {
+        x = ldub(s1 + i);
+        y = ldub(s2 + i);
+        HELPER_LOG("%02x (%c)/%02x (%c) ", x, x, y, y);
+        if (x < y) {
+            cc = 1;
+            goto done;
+        } else if (x > y) {
+            cc = 2;
+            goto done;
+        }
+    }
+    cc = 0;
+done:
+    HELPER_LOG("\n");
+    return cc;
+}
+
+/* compare logical under mask */
+uint32_t HELPER(clm)(uint32_t r1, uint32_t mask, uint64_t addr)
+{
+    uint8_t r,d;
+    uint32_t cc;
+    HELPER_LOG("%s: r1 0x%x mask 0x%x addr 0x%" PRIx64 "\n", __FUNCTION__, r1,
+               mask, addr);
+    cc = 0;
+    while (mask) {
+        if (mask & 8) {
+            d = ldub(addr);
+            r = (r1 & 0xff000000UL) >> 24;
+            HELPER_LOG("mask 0x%x %02x/%02x (0x%" PRIx64 ") ", mask, r, d,
+                        addr);
+            if (r < d) {
+                cc = 1;
+                break;
+            } else if (r > d) {
+                cc = 2;
+                break;
+            }
+            addr++;
+        }
+        mask = (mask << 1) & 0xf;
+        r1 <<= 8;
+    }
+    HELPER_LOG("\n");
+    return cc;
+}
+
+/* store character under mask */
+void HELPER(stcm)(uint32_t r1, uint32_t mask, uint64_t addr)
+{
+    uint8_t r;
+    HELPER_LOG("%s: r1 0x%x mask 0x%x addr 0x%lx\n", __FUNCTION__, r1, mask,
+               addr);
+    while (mask) {
+        if (mask & 8) {
+            r = (r1 & 0xff000000UL) >> 24;
+            stb(addr, r);
+            HELPER_LOG("mask 0x%x %02x (0x%lx) ", mask, r, addr);
+            addr++;
+        }
+        mask = (mask << 1) & 0xf;
+        r1 <<= 8;
+    }
+    HELPER_LOG("\n");
+}
+
+/* 64/64 -> 128 unsigned multiplication */
+void HELPER(mlg)(uint32_t r1, uint64_t v2)
+{
+#if HOST_LONG_BITS == 64 && defined(__GNUC__)
+    /* assuming 64-bit hosts have __uint128_t */
+    __uint128_t res = (__uint128_t)env->regs[r1 + 1];
+    res *= (__uint128_t)v2;
+    env->regs[r1] = (uint64_t)(res >> 64);
+    env->regs[r1 + 1] = (uint64_t)res;
+#else
+    mulu64(&env->regs[r1 + 1], &env->regs[r1], env->regs[r1 + 1], v2);
+#endif
+}
+
+/* 128 -> 64/64 unsigned division */
+void HELPER(dlg)(uint32_t r1, uint64_t v2)
+{
+    uint64_t divisor = v2;
+
+    if (!env->regs[r1]) {
+        /* 64 -> 64/64 case */
+        env->regs[r1] = env->regs[r1+1] % divisor;
+        env->regs[r1+1] = env->regs[r1+1] / divisor;
+        return;
+    } else {
+
+#if HOST_LONG_BITS == 64 && defined(__GNUC__)
+        /* assuming 64-bit hosts have __uint128_t */
+        __uint128_t dividend = (((__uint128_t)env->regs[r1]) << 64) |
+                               (env->regs[r1+1]);
+        __uint128_t quotient = dividend / divisor;
+        env->regs[r1+1] = quotient;
+        __uint128_t remainder = dividend % divisor;
+        env->regs[r1] = remainder;
+#else
+        /* 32-bit hosts would need special wrapper functionality - just abort if
+           we encounter such a case; it's very unlikely anyways. */
+        cpu_abort(env, "128 -> 64/64 division not implemented\n");
+#endif
+    }
+}
+
+static inline uint64_t get_address(int x2, int b2, int d2)
+{
+    uint64_t r = d2;
+
+    if (x2) {
+        r += env->regs[x2];
+    }
+
+    if (b2) {
+        r += env->regs[b2];
+    }
+
+    /* 31-Bit mode */
+    if (!(env->psw.mask & PSW_MASK_64)) {
+        r &= 0x7fffffff;
+    }
+
+    return r;
+}
+
+static inline uint64_t get_address_31fix(int reg)
+{
+    uint64_t r = env->regs[reg];
+
+    /* 31-Bit mode */
+    if (!(env->psw.mask & PSW_MASK_64)) {
+        r &= 0x7fffffff;
+    }
+
+    return r;
+}
+
+/* search string (c is byte to search, r2 is string, r1 end of string) */
+uint32_t HELPER(srst)(uint32_t c, uint32_t r1, uint32_t r2)
+{
+    uint64_t i;
+    uint32_t cc = 2;
+    uint64_t str = get_address_31fix(r2);
+    uint64_t end = get_address_31fix(r1);
+
+    HELPER_LOG("%s: c %d *r1 0x%" PRIx64 " *r2 0x%" PRIx64 "\n", __FUNCTION__,
+               c, env->regs[r1], env->regs[r2]);
+
+    for (i = str; i != end; i++) {
+        if (ldub(i) == c) {
+            env->regs[r1] = i;
+            cc = 1;
+            break;
+        }
+    }
+
+    return cc;
+}
+
+/* unsigned string compare (c is string terminator) */
+uint32_t HELPER(clst)(uint32_t c, uint32_t r1, uint32_t r2)
+{
+    uint64_t s1 = get_address_31fix(r1);//env->regs[r1];
+    uint64_t s2 = get_address_31fix(r2);//env->regs[r2];
+    uint8_t v1, v2;
+    uint32_t cc;
+    c = c & 0xff;
+#ifdef CONFIG_USER_ONLY
+    if (!c) {
+        HELPER_LOG("%s: comparing '%s' and '%s'\n",
+                   __FUNCTION__, (char*)s1, (char*)s2);
+    }
+#endif
+    for (;;) {
+        v1 = ldub(s1);
+        v2 = ldub(s2);
+        if ((v1 == c || v2 == c) || (v1 != v2)) {
+            break;
+        }
+        s1++;
+        s2++;
+    }
+
+    if (v1 == v2) {
+        cc = 0;
+    } else {
+        cc = (v1 < v2) ? 1 : 2;
+        /* FIXME: 31-bit mode! */
+        env->regs[r1] = s1;
+        env->regs[r2] = s2;
+    }
+    return cc;
+}
+
+/* move page */
+void HELPER(mvpg)(uint64_t r0, uint64_t r1, uint64_t r2)
+{
+    /* XXX missing r0 handling */
+#ifdef CONFIG_USER_ONLY
+    int i;
+
+    for (i = 0; i < TARGET_PAGE_SIZE; i++) {
+        stb(r1 + i, ldub(r2 + i));
+    }
+#else
+    mvc_fast_memmove(env, TARGET_PAGE_SIZE, r1, r2);
+#endif
+}
+
+/* string copy (c is string terminator) */
+void HELPER(mvst)(uint32_t c, uint32_t r1, uint32_t r2)
+{
+    uint64_t dest = get_address_31fix(r1);//env->regs[r1];
+    uint64_t src = get_address_31fix(r2);//env->regs[r2];
+    uint8_t v;
+    c = c & 0xff;
+#ifdef CONFIG_USER_ONLY
+    if (!c) {
+        HELPER_LOG("%s: copy '%s' to 0x%lx\n", __FUNCTION__, (char*)src, dest);
+    }
+#endif
+    for (;;) {
+        v = ldub(src);
+        stb(dest, v);
+        if (v == c) {
+            break;
+        }
+        src++;
+        dest++;
+    }
+    env->regs[r1] = dest; /* FIXME: 31-bit mode! */
+}
+
+/* compare and swap 64-bit */
+uint32_t HELPER(csg)(uint32_t r1, uint64_t a2, uint32_t r3)
+{
+    /* FIXME: locking? */
+    uint32_t cc;
+    uint64_t v2 = ldq(a2);
+    if (env->regs[r1] == v2) {
+        cc = 0;
+        stq(a2, env->regs[r3]);
+    } else {
+        cc = 1;
+        env->regs[r1] = v2;
+    }
+    return cc;
+}
+
+/* compare double and swap 64-bit */
+uint32_t HELPER(cdsg)(uint32_t r1, uint64_t a2, uint32_t r3)
+{
+    /* FIXME: locking? */
+    uint32_t cc;
+    uint64_t v2_hi = ldq(a2);
+    uint64_t v2_lo = ldq(a2 + 8);
+    uint64_t v1_hi = env->regs[r1];
+    uint64_t v1_lo = env->regs[r1 + 1];
+
+    if ((v1_hi == v2_hi) && (v1_lo == v2_lo)) {
+        cc = 0;
+        stq(a2, env->regs[r3]);
+        stq(a2 + 8, env->regs[r3 + 1]);
+    } else {
+        cc = 1;
+        env->regs[r1] = v2_hi;
+        env->regs[r1 + 1] = v2_lo;
+    }
+
+    return cc;
+}
+
+/* compare and swap 32-bit */
+uint32_t HELPER(cs)(uint32_t r1, uint64_t a2, uint32_t r3)
+{
+    /* FIXME: locking? */
+    uint32_t cc;
+    HELPER_LOG("%s: r1 %d a2 0x%lx r3 %d\n", __FUNCTION__, r1, a2, r3);
+    uint32_t v2 = ldl(a2);
+    if (((uint32_t)env->regs[r1]) == v2) {
+        cc = 0;
+        stl(a2, (uint32_t)env->regs[r3]);
+    } else {
+        cc = 1;
+        env->regs[r1] = (env->regs[r1] & 0xffffffff00000000ULL) | v2;
+    }
+    return cc;
+}
+
+static uint32_t helper_icm(uint32_t r1, uint64_t address, uint32_t mask)
+{
+    int pos = 24; /* top of the lower half of r1 */
+    uint64_t rmask = 0xff000000ULL;
+    uint8_t val = 0;
+    int ccd = 0;
+    uint32_t cc = 0;
+
+    while (mask) {
+        if (mask & 8) {
+            env->regs[r1] &= ~rmask;
+            val = ldub(address);
+            if ((val & 0x80) && !ccd) {
+                cc = 1;
+            }
+            ccd = 1;
+            if (val && cc == 0) {
+                cc = 2;
+            }
+            env->regs[r1] |= (uint64_t)val << pos;
+            address++;
+        }
+        mask = (mask << 1) & 0xf;
+        pos -= 8;
+        rmask >>= 8;
+    }
+
+    return cc;
+}
+
+/* execute instruction
+   this instruction executes an insn modified with the contents of r1
+   it does not change the executed instruction in memory
+   it does not change the program counter
+   in other words: tricky...
+   currently implemented by interpreting the cases it is most commonly used in
+ */
+uint32_t HELPER(ex)(uint32_t cc, uint64_t v1, uint64_t addr, uint64_t ret)
+{
+    uint16_t insn = lduw_code(addr);
+    HELPER_LOG("%s: v1 0x%lx addr 0x%lx insn 0x%x\n", __FUNCTION__, v1, addr,
+             insn);
+    if ((insn & 0xf0ff) == 0xd000) {
+        uint32_t l, insn2, b1, b2, d1, d2;
+        l = v1 & 0xff;
+        insn2 = ldl_code(addr + 2);
+        b1 = (insn2 >> 28) & 0xf;
+        b2 = (insn2 >> 12) & 0xf;
+        d1 = (insn2 >> 16) & 0xfff;
+        d2 = insn2 & 0xfff;
+        switch (insn & 0xf00) {
+        case 0x200:
+            helper_mvc(l, get_address(0, b1, d1), get_address(0, b2, d2));
+            break;
+        case 0x500:
+            cc = helper_clc(l, get_address(0, b1, d1), get_address(0, b2, d2));
+            break;
+        case 0x700:
+            cc = helper_xc(l, get_address(0, b1, d1), get_address(0, b2, d2));
+            break;
+        default:
+            goto abort;
+            break;
+        }
+    } else if ((insn & 0xff00) == 0x0a00) {
+        /* supervisor call */
+        HELPER_LOG("%s: svc %ld via execute\n", __FUNCTION__, (insn|v1) & 0xff);
+        env->psw.addr = ret - 4;
+        env->int_svc_code = (insn|v1) & 0xff;
+        env->int_svc_ilc = 4;
+#ifdef CONFIG_USER_ONLY
+        helper_exception(EXCP_EXECUTE_SVC + ((insn | v1) & 0xff));
+#else
+        helper_exception(EXCP_SVC);
+#endif
+    } else if ((insn & 0xff00) == 0xbf00) {
+        uint32_t insn2, r1, r3, b2, d2;
+        insn2 = ldl_code(addr + 2);
+        r1 = (insn2 >> 20) & 0xf;
+        r3 = (insn2 >> 16) & 0xf;
+        b2 = (insn2 >> 12) & 0xf;
+        d2 = insn2 & 0xfff;
+        cc = helper_icm(r1, get_address(0, b2, d2), r3);
+    } else {
+abort:
+        cpu_abort(env, "EXECUTE on instruction prefix 0x%x not implemented\n",
+                  insn);
+    }
+    return cc;
+}
+
+/* absolute value 32-bit */
+uint32_t HELPER(abs_i32)(int32_t val)
+{
+    if (val < 0) {
+        return -val;
+    } else {
+        return val;
+    }
+}
+
+/* negative absolute value 32-bit */
+int32_t HELPER(nabs_i32)(int32_t val)
+{
+    if (val < 0) {
+        return val;
+    } else {
+        return -val;
+    }
+}
+
+/* absolute value 64-bit */
+uint64_t HELPER(abs_i64)(int64_t val)
+{
+    HELPER_LOG("%s: val 0x%" PRIx64 "\n", __FUNCTION__, val);
+
+    if (val < 0) {
+        return -val;
+    } else {
+        return val;
+    }
+}
+
+/* negative absolute value 64-bit */
+int64_t HELPER(nabs_i64)(int64_t val)
+{
+    if (val < 0) {
+        return val;
+    } else {
+        return -val;
+    }
+}
+
+/* add with carry 32-bit unsigned */
+uint32_t HELPER(addc_u32)(uint32_t cc, uint32_t v1, uint32_t v2)
+{
+    uint32_t res;
+
+    res = v1 + v2;
+    if (cc & 2) {
+        res++;
+    }
+
+    return res;
+}
+
+/* store character under mask high operates on the upper half of r1 */
+void HELPER(stcmh)(uint32_t r1, uint64_t address, uint32_t mask)
+{
+    int pos = 56; /* top of the upper half of r1 */
+
+    while (mask) {
+        if (mask & 8) {
+            stb(address, (env->regs[r1] >> pos) & 0xff);
+            address++;
+        }
+        mask = (mask << 1) & 0xf;
+        pos -= 8;
+    }
+}
+
+/* insert character under mask high; same as icm, but operates on the
+   upper half of r1 */
+uint32_t HELPER(icmh)(uint32_t r1, uint64_t address, uint32_t mask)
+{
+    int pos = 56; /* top of the upper half of r1 */
+    uint64_t rmask = 0xff00000000000000ULL;
+    uint8_t val = 0;
+    int ccd = 0;
+    uint32_t cc = 0;
+
+    while (mask) {
+        if (mask & 8) {
+            env->regs[r1] &= ~rmask;
+            val = ldub(address);
+            if ((val & 0x80) && !ccd) {
+                cc = 1;
+            }
+            ccd = 1;
+            if (val && cc == 0) {
+                cc = 2;
+            }
+            env->regs[r1] |= (uint64_t)val << pos;
+            address++;
+        }
+        mask = (mask << 1) & 0xf;
+        pos -= 8;
+        rmask >>= 8;
+    }
+
+    return cc;
+}
+
+/* insert psw mask and condition code into r1 */
+void HELPER(ipm)(uint32_t cc, uint32_t r1)
+{
+    uint64_t r = env->regs[r1];
+
+    r &= 0xffffffff00ffffffULL;
+    r |= (cc << 28) | ( (env->psw.mask >> 40) & 0xf );
+    env->regs[r1] = r;
+    HELPER_LOG("%s: cc %d psw.mask 0x%lx r1 0x%lx\n", __FUNCTION__,
+               cc, env->psw.mask, r);
+}
+
+/* load access registers r1 to r3 from memory at a2 */
+void HELPER(lam)(uint32_t r1, uint64_t a2, uint32_t r3)
+{
+    int i;
+
+    for (i = r1;; i = (i + 1) % 16) {
+        env->aregs[i] = ldl(a2);
+        a2 += 4;
+
+        if (i == r3) {
+            break;
+        }
+    }
+}
+
+/* store access registers r1 to r3 in memory at a2 */
+void HELPER(stam)(uint32_t r1, uint64_t a2, uint32_t r3)
+{
+    int i;
+
+    for (i = r1;; i = (i + 1) % 16) {
+        stl(a2, env->aregs[i]);
+        a2 += 4;
+
+        if (i == r3) {
+            break;
+        }
+    }
+}
+
+/* move long */
+uint32_t HELPER(mvcl)(uint32_t r1, uint32_t r2)
+{
+    uint64_t destlen = env->regs[r1 + 1] & 0xffffff;
+    uint64_t dest = get_address_31fix(r1);
+    uint64_t srclen = env->regs[r2 + 1] & 0xffffff;
+    uint64_t src = get_address_31fix(r2);
+    uint8_t pad = src >> 24;
+    uint8_t v;
+    uint32_t cc;
+
+    if (destlen == srclen) {
+        cc = 0;
+    } else if (destlen < srclen) {
+        cc = 1;
+    } else {
+        cc = 2;
+    }
+
+    if (srclen > destlen) {
+        srclen = destlen;
+    }
+
+    for (; destlen && srclen; src++, dest++, destlen--, srclen--) {
+        v = ldub(src);
+        stb(dest, v);
+    }
+
+    for (; destlen; dest++, destlen--) {
+        stb(dest, pad);
+    }
+
+    env->regs[r1 + 1] = destlen;
+    /* can't use srclen here, we trunc'ed it */
+    env->regs[r2 + 1] -= src - env->regs[r2];
+    env->regs[r1] = dest;
+    env->regs[r2] = src;
+
+    return cc;
+}
+
+/* move long extended another memcopy insn with more bells and whistles */
+uint32_t HELPER(mvcle)(uint32_t r1, uint64_t a2, uint32_t r3)
+{
+    uint64_t destlen = env->regs[r1 + 1];
+    uint64_t dest = env->regs[r1];
+    uint64_t srclen = env->regs[r3 + 1];
+    uint64_t src = env->regs[r3];
+    uint8_t pad = a2 & 0xff;
+    uint8_t v;
+    uint32_t cc;
+
+    if (!(env->psw.mask & PSW_MASK_64)) {
+        destlen = (uint32_t)destlen;
+        srclen = (uint32_t)srclen;
+        dest &= 0x7fffffff;
+        src &= 0x7fffffff;
+    }
+
+    if (destlen == srclen) {
+        cc = 0;
+    } else if (destlen < srclen) {
+        cc = 1;
+    } else {
+        cc = 2;
+    }
+
+    if (srclen > destlen) {
+        srclen = destlen;
+    }
+
+    for (; destlen && srclen; src++, dest++, destlen--, srclen--) {
+        v = ldub(src);
+        stb(dest, v);
+    }
+
+    for (; destlen; dest++, destlen--) {
+        stb(dest, pad);
+    }
+
+    env->regs[r1 + 1] = destlen;
+    /* can't use srclen here, we trunc'ed it */
+    /* FIXME: 31-bit mode! */
+    env->regs[r3 + 1] -= src - env->regs[r3];
+    env->regs[r1] = dest;
+    env->regs[r3] = src;
+
+    return cc;
+}
+
+/* compare logical long extended memcompare insn with padding */
+uint32_t HELPER(clcle)(uint32_t r1, uint64_t a2, uint32_t r3)
+{
+    uint64_t destlen = env->regs[r1 + 1];
+    uint64_t dest = get_address_31fix(r1);
+    uint64_t srclen = env->regs[r3 + 1];
+    uint64_t src = get_address_31fix(r3);
+    uint8_t pad = a2 & 0xff;
+    uint8_t v1 = 0,v2 = 0;
+    uint32_t cc = 0;
+
+    if (!(destlen || srclen)) {
+        return cc;
+    }
+
+    if (srclen > destlen) {
+        srclen = destlen;
+    }
+
+    for (; destlen || srclen; src++, dest++, destlen--, srclen--) {
+        v1 = srclen ? ldub(src) : pad;
+        v2 = destlen ? ldub(dest) : pad;
+        if (v1 != v2) {
+            cc = (v1 < v2) ? 1 : 2;
+            break;
+        }
+    }
+
+    env->regs[r1 + 1] = destlen;
+    /* can't use srclen here, we trunc'ed it */
+    env->regs[r3 + 1] -= src - env->regs[r3];
+    env->regs[r1] = dest;
+    env->regs[r3] = src;
+
+    return cc;
+}
+
+/* subtract unsigned v2 from v1 with borrow */
+uint32_t HELPER(slb)(uint32_t cc, uint32_t r1, uint32_t v2)
+{
+    uint32_t v1 = env->regs[r1];
+    uint32_t res = v1 + (~v2) + (cc >> 1);
+
+    env->regs[r1] = (env->regs[r1] & 0xffffffff00000000ULL) | res;
+    if (cc & 2) {
+        /* borrow */
+        return v1 ? 1 : 0;
+    } else {
+        return v1 ? 3 : 2;
+    }
+}
+
+/* subtract unsigned v2 from v1 with borrow */
+uint32_t HELPER(slbg)(uint32_t cc, uint32_t r1, uint64_t v1, uint64_t v2)
+{
+    uint64_t res = v1 + (~v2) + (cc >> 1);
+
+    env->regs[r1] = res;
+    if (cc & 2) {
+        /* borrow */
+        return v1 ? 1 : 0;
+    } else {
+        return v1 ? 3 : 2;
+    }
+}
+
+/* condition codes for binary FP ops */
+static uint32_t set_cc_f32(float32 v1, float32 v2)
+{
+    if (float32_is_any_nan(v1) || float32_is_any_nan(v2)) {
+        return 3;
+    } else if (float32_eq(v1, v2, &env->fpu_status)) {
+        return 0;
+    } else if (float32_lt(v1, v2, &env->fpu_status)) {
+        return 1;
+    } else {
+        return 2;
+    }
+}
+
+static uint32_t set_cc_f64(float64 v1, float64 v2)
+{
+    if (float64_is_any_nan(v1) || float64_is_any_nan(v2)) {
+        return 3;
+    } else if (float64_eq(v1, v2, &env->fpu_status)) {
+        return 0;
+    } else if (float64_lt(v1, v2, &env->fpu_status)) {
+        return 1;
+    } else {
+        return 2;
+    }
+}
+
+/* condition codes for unary FP ops */
+static uint32_t set_cc_nz_f32(float32 v)
+{
+    if (float32_is_any_nan(v)) {
+        return 3;
+    } else if (float32_is_zero(v)) {
+        return 0;
+    } else if (float32_is_neg(v)) {
+        return 1;
+    } else {
+        return 2;
+    }
+}
+
+static uint32_t set_cc_nz_f64(float64 v)
+{
+    if (float64_is_any_nan(v)) {
+        return 3;
+    } else if (float64_is_zero(v)) {
+        return 0;
+    } else if (float64_is_neg(v)) {
+        return 1;
+    } else {
+        return 2;
+    }
+}
+
+static uint32_t set_cc_nz_f128(float128 v)
+{
+    if (float128_is_any_nan(v)) {
+        return 3;
+    } else if (float128_is_zero(v)) {
+        return 0;
+    } else if (float128_is_neg(v)) {
+        return 1;
+    } else {
+        return 2;
+    }
+}
+
+/* convert 32-bit int to 64-bit float */
+void HELPER(cdfbr)(uint32_t f1, int32_t v2)
+{
+    HELPER_LOG("%s: converting %d to f%d\n", __FUNCTION__, v2, f1);
+    env->fregs[f1].d = int32_to_float64(v2, &env->fpu_status);
+}
+
+/* convert 32-bit int to 128-bit float */
+void HELPER(cxfbr)(uint32_t f1, int32_t v2)
+{
+    CPU_QuadU v1;
+    v1.q = int32_to_float128(v2, &env->fpu_status);
+    env->fregs[f1].ll = v1.ll.upper;
+    env->fregs[f1 + 2].ll = v1.ll.lower;
+}
+
+/* convert 64-bit int to 32-bit float */
+void HELPER(cegbr)(uint32_t f1, int64_t v2)
+{
+    HELPER_LOG("%s: converting %ld to f%d\n", __FUNCTION__, v2, f1);
+    env->fregs[f1].l.upper = int64_to_float32(v2, &env->fpu_status);
+}
+
+/* convert 64-bit int to 64-bit float */
+void HELPER(cdgbr)(uint32_t f1, int64_t v2)
+{
+    HELPER_LOG("%s: converting %ld to f%d\n", __FUNCTION__, v2, f1);
+    env->fregs[f1].d = int64_to_float64(v2, &env->fpu_status);
+}
+
+/* convert 64-bit int to 128-bit float */
+void HELPER(cxgbr)(uint32_t f1, int64_t v2)
+{
+    CPU_QuadU x1;
+    x1.q = int64_to_float128(v2, &env->fpu_status);
+    HELPER_LOG("%s: converted %ld to 0x%lx and 0x%lx\n", __FUNCTION__, v2,
+               x1.ll.upper, x1.ll.lower);
+    env->fregs[f1].ll = x1.ll.upper;
+    env->fregs[f1 + 2].ll = x1.ll.lower;
+}
+
+/* convert 32-bit int to 32-bit float */
+void HELPER(cefbr)(uint32_t f1, int32_t v2)
+{
+    env->fregs[f1].l.upper = int32_to_float32(v2, &env->fpu_status);
+    HELPER_LOG("%s: converting %d to 0x%d in f%d\n", __FUNCTION__, v2,
+               env->fregs[f1].l.upper, f1);
+}
+
+/* 32-bit FP addition RR */
+uint32_t HELPER(aebr)(uint32_t f1, uint32_t f2)
+{
+    env->fregs[f1].l.upper = float32_add(env->fregs[f1].l.upper,
+                                         env->fregs[f2].l.upper,
+                                         &env->fpu_status);
+    HELPER_LOG("%s: adding 0x%d resulting in 0x%d in f%d\n", __FUNCTION__,
+               env->fregs[f2].l.upper, env->fregs[f1].l.upper, f1);
+
+    return set_cc_nz_f32(env->fregs[f1].l.upper);
+}
+
+/* 64-bit FP addition RR */
+uint32_t HELPER(adbr)(uint32_t f1, uint32_t f2)
+{
+    env->fregs[f1].d = float64_add(env->fregs[f1].d, env->fregs[f2].d,
+                                   &env->fpu_status);
+    HELPER_LOG("%s: adding 0x%ld resulting in 0x%ld in f%d\n", __FUNCTION__,
+               env->fregs[f2].d, env->fregs[f1].d, f1);
+
+    return set_cc_nz_f64(env->fregs[f1].d);
+}
+
+/* 32-bit FP subtraction RR */
+uint32_t HELPER(sebr)(uint32_t f1, uint32_t f2)
+{
+    env->fregs[f1].l.upper = float32_sub(env->fregs[f1].l.upper,
+                                         env->fregs[f2].l.upper,
+                                         &env->fpu_status);
+    HELPER_LOG("%s: adding 0x%d resulting in 0x%d in f%d\n", __FUNCTION__,
+               env->fregs[f2].l.upper, env->fregs[f1].l.upper, f1);
+
+    return set_cc_nz_f32(env->fregs[f1].l.upper);
+}
+
+/* 64-bit FP subtraction RR */
+uint32_t HELPER(sdbr)(uint32_t f1, uint32_t f2)
+{
+    env->fregs[f1].d = float64_sub(env->fregs[f1].d, env->fregs[f2].d,
+                                   &env->fpu_status);
+    HELPER_LOG("%s: subtracting 0x%ld resulting in 0x%ld in f%d\n",
+               __FUNCTION__, env->fregs[f2].d, env->fregs[f1].d, f1);
+
+    return set_cc_nz_f64(env->fregs[f1].d);
+}
+
+/* 32-bit FP division RR */
+void HELPER(debr)(uint32_t f1, uint32_t f2)
+{
+    env->fregs[f1].l.upper = float32_div(env->fregs[f1].l.upper,
+                                         env->fregs[f2].l.upper,
+                                         &env->fpu_status);
+}
+
+/* 128-bit FP division RR */
+void HELPER(dxbr)(uint32_t f1, uint32_t f2)
+{
+    CPU_QuadU v1;
+    v1.ll.upper = env->fregs[f1].ll;
+    v1.ll.lower = env->fregs[f1 + 2].ll;
+    CPU_QuadU v2;
+    v2.ll.upper = env->fregs[f2].ll;
+    v2.ll.lower = env->fregs[f2 + 2].ll;
+    CPU_QuadU res;
+    res.q = float128_div(v1.q, v2.q, &env->fpu_status);
+    env->fregs[f1].ll = res.ll.upper;
+    env->fregs[f1 + 2].ll = res.ll.lower;
+}
+
+/* 64-bit FP multiplication RR */
+void HELPER(mdbr)(uint32_t f1, uint32_t f2)
+{
+    env->fregs[f1].d = float64_mul(env->fregs[f1].d, env->fregs[f2].d,
+                                   &env->fpu_status);
+}
+
+/* 128-bit FP multiplication RR */
+void HELPER(mxbr)(uint32_t f1, uint32_t f2)
+{
+    CPU_QuadU v1;
+    v1.ll.upper = env->fregs[f1].ll;
+    v1.ll.lower = env->fregs[f1 + 2].ll;
+    CPU_QuadU v2;
+    v2.ll.upper = env->fregs[f2].ll;
+    v2.ll.lower = env->fregs[f2 + 2].ll;
+    CPU_QuadU res;
+    res.q = float128_mul(v1.q, v2.q, &env->fpu_status);
+    env->fregs[f1].ll = res.ll.upper;
+    env->fregs[f1 + 2].ll = res.ll.lower;
+}
+
+/* convert 32-bit float to 64-bit float */
+void HELPER(ldebr)(uint32_t r1, uint32_t r2)
+{
+    env->fregs[r1].d = float32_to_float64(env->fregs[r2].l.upper,
+                                          &env->fpu_status);
+}
+
+/* convert 128-bit float to 64-bit float */
+void HELPER(ldxbr)(uint32_t f1, uint32_t f2)
+{
+    CPU_QuadU x2;
+    x2.ll.upper = env->fregs[f2].ll;
+    x2.ll.lower = env->fregs[f2 + 2].ll;
+    env->fregs[f1].d = float128_to_float64(x2.q, &env->fpu_status);
+    HELPER_LOG("%s: to 0x%ld\n", __FUNCTION__, env->fregs[f1].d);
+}
+
+/* convert 64-bit float to 128-bit float */
+void HELPER(lxdbr)(uint32_t f1, uint32_t f2)
+{
+    CPU_QuadU res;
+    res.q = float64_to_float128(env->fregs[f2].d, &env->fpu_status);
+    env->fregs[f1].ll = res.ll.upper;
+    env->fregs[f1 + 2].ll = res.ll.lower;
+}
+
+/* convert 64-bit float to 32-bit float */
+void HELPER(ledbr)(uint32_t f1, uint32_t f2)
+{
+    float64 d2 = env->fregs[f2].d;
+    env->fregs[f1].l.upper = float64_to_float32(d2, &env->fpu_status);
+}
+
+/* convert 128-bit float to 32-bit float */
+void HELPER(lexbr)(uint32_t f1, uint32_t f2)
+{
+    CPU_QuadU x2;
+    x2.ll.upper = env->fregs[f2].ll;
+    x2.ll.lower = env->fregs[f2 + 2].ll;
+    env->fregs[f1].l.upper = float128_to_float32(x2.q, &env->fpu_status);
+    HELPER_LOG("%s: to 0x%d\n", __FUNCTION__, env->fregs[f1].l.upper);
+}
+
+/* absolute value of 32-bit float */
+uint32_t HELPER(lpebr)(uint32_t f1, uint32_t f2)
+{
+    float32 v1;
+    float32 v2 = env->fregs[f2].d;
+    v1 = float32_abs(v2);
+    env->fregs[f1].d = v1;
+    return set_cc_nz_f32(v1);
+}
+
+/* absolute value of 64-bit float */
+uint32_t HELPER(lpdbr)(uint32_t f1, uint32_t f2)
+{
+    float64 v1;
+    float64 v2 = env->fregs[f2].d;
+    v1 = float64_abs(v2);
+    env->fregs[f1].d = v1;
+    return set_cc_nz_f64(v1);
+}
+
+/* absolute value of 128-bit float */
+uint32_t HELPER(lpxbr)(uint32_t f1, uint32_t f2)
+{
+    CPU_QuadU v1;
+    CPU_QuadU v2;
+    v2.ll.upper = env->fregs[f2].ll;
+    v2.ll.lower = env->fregs[f2 + 2].ll;
+    v1.q = float128_abs(v2.q);
+    env->fregs[f1].ll = v1.ll.upper;
+    env->fregs[f1 + 2].ll = v1.ll.lower;
+    return set_cc_nz_f128(v1.q);
+}
+
+/* load and test 64-bit float */
+uint32_t HELPER(ltdbr)(uint32_t f1, uint32_t f2)
+{
+    env->fregs[f1].d = env->fregs[f2].d;
+    return set_cc_nz_f64(env->fregs[f1].d);
+}
+
+/* load and test 32-bit float */
+uint32_t HELPER(ltebr)(uint32_t f1, uint32_t f2)
+{
+    env->fregs[f1].l.upper = env->fregs[f2].l.upper;
+    return set_cc_nz_f32(env->fregs[f1].l.upper);
+}
+
+/* load and test 128-bit float */
+uint32_t HELPER(ltxbr)(uint32_t f1, uint32_t f2)
+{
+    CPU_QuadU x;
+    x.ll.upper = env->fregs[f2].ll;
+    x.ll.lower = env->fregs[f2 + 2].ll;
+    env->fregs[f1].ll = x.ll.upper;
+    env->fregs[f1 + 2].ll = x.ll.lower;
+    return set_cc_nz_f128(x.q);
+}
+
+/* negative absolute of 32-bit float */
+uint32_t HELPER(lcebr)(uint32_t f1, uint32_t f2)
+{
+    env->fregs[f1].l.upper = float32_sub(float32_zero, env->fregs[f2].l.upper,
+                                         &env->fpu_status);
+    return set_cc_nz_f32(env->fregs[f1].l.upper);
+}
+
+/* negative absolute of 64-bit float */
+uint32_t HELPER(lcdbr)(uint32_t f1, uint32_t f2)
+{
+    env->fregs[f1].d = float64_sub(float64_zero, env->fregs[f2].d,
+                                   &env->fpu_status);
+    return set_cc_nz_f64(env->fregs[f1].d);
+}
+
+/* convert 64-bit float to 128-bit float */
+uint32_t HELPER(lcxbr)(uint32_t f1, uint32_t f2)
+{
+    CPU_QuadU x1, x2;
+    x2.ll.upper = env->fregs[f2].ll;
+    x2.ll.lower = env->fregs[f2 + 2].ll;
+    x1.q = float128_sub(float64_to_float128(float64_zero, &env->fpu_status),
+                        x2.q, &env->fpu_status);
+    env->fregs[f1].ll = x1.ll.upper;
+    env->fregs[f1 + 2].ll = x1.ll.lower;
+    return set_cc_nz_f128(x1.q);
+}
+
+/* 32-bit FP addition RM */
+void HELPER(aeb)(uint32_t f1, uint32_t val)
+{
+    float32 v1 = env->fregs[f1].l.upper;
+    CPU_FloatU v2;
+    v2.l = val;
+    HELPER_LOG("%s: adding 0x%d from f%d and 0x%d\n", __FUNCTION__,
+               v1, f1, v2.f);
+    env->fregs[f1].l.upper = float32_add(v1, v2.f, &env->fpu_status);
+}
+
+/* 32-bit FP division RM */
+void HELPER(deb)(uint32_t f1, uint32_t val)
+{
+    float32 v1 = env->fregs[f1].l.upper;
+    CPU_FloatU v2;
+    v2.l = val;
+    HELPER_LOG("%s: dividing 0x%d from f%d by 0x%d\n", __FUNCTION__,
+               v1, f1, v2.f);
+    env->fregs[f1].l.upper = float32_div(v1, v2.f, &env->fpu_status);
+}
+
+/* 32-bit FP multiplication RM */
+void HELPER(meeb)(uint32_t f1, uint32_t val)
+{
+    float32 v1 = env->fregs[f1].l.upper;
+    CPU_FloatU v2;
+    v2.l = val;
+    HELPER_LOG("%s: multiplying 0x%d from f%d and 0x%d\n", __FUNCTION__,
+               v1, f1, v2.f);
+    env->fregs[f1].l.upper = float32_mul(v1, v2.f, &env->fpu_status);
+}
+
+/* 32-bit FP compare RR */
+uint32_t HELPER(cebr)(uint32_t f1, uint32_t f2)
+{
+    float32 v1 = env->fregs[f1].l.upper;
+    float32 v2 = env->fregs[f2].l.upper;;
+    HELPER_LOG("%s: comparing 0x%d from f%d and 0x%d\n", __FUNCTION__,
+               v1, f1, v2);
+    return set_cc_f32(v1, v2);
+}
+
+/* 64-bit FP compare RR */
+uint32_t HELPER(cdbr)(uint32_t f1, uint32_t f2)
+{
+    float64 v1 = env->fregs[f1].d;
+    float64 v2 = env->fregs[f2].d;;
+    HELPER_LOG("%s: comparing 0x%ld from f%d and 0x%ld\n", __FUNCTION__,
+               v1, f1, v2);
+    return set_cc_f64(v1, v2);
+}
+
+/* 128-bit FP compare RR */
+uint32_t HELPER(cxbr)(uint32_t f1, uint32_t f2)
+{
+    CPU_QuadU v1;
+    v1.ll.upper = env->fregs[f1].ll;
+    v1.ll.lower = env->fregs[f1 + 2].ll;
+    CPU_QuadU v2;
+    v2.ll.upper = env->fregs[f2].ll;
+    v2.ll.lower = env->fregs[f2 + 2].ll;
+    if (float128_is_any_nan(v1.q) || float128_is_any_nan(v2.q)) {
+        return 3;
+    } else if (float128_eq(v1.q, v2.q, &env->fpu_status)) {
+        return 0;
+    } else if (float128_lt(v1.q, v2.q, &env->fpu_status)) {
+        return 1;
+    } else {
+        return 2;
+    }
+}
+
+/* 64-bit FP compare RM */
+uint32_t HELPER(cdb)(uint32_t f1, uint64_t a2)
+{
+    float64 v1 = env->fregs[f1].d;
+    CPU_DoubleU v2;
+    v2.ll = ldq(a2);
+    HELPER_LOG("%s: comparing 0x%ld from f%d and 0x%lx\n", __FUNCTION__, v1,
+               f1, v2.d);
+    return set_cc_f64(v1, v2.d);
+}
+
+/* 64-bit FP addition RM */
+uint32_t HELPER(adb)(uint32_t f1, uint64_t a2)
+{
+    float64 v1 = env->fregs[f1].d;
+    CPU_DoubleU v2;
+    v2.ll = ldq(a2);
+    HELPER_LOG("%s: adding 0x%lx from f%d and 0x%lx\n", __FUNCTION__,
+               v1, f1, v2.d);
+    env->fregs[f1].d = v1 = float64_add(v1, v2.d, &env->fpu_status);
+    return set_cc_nz_f64(v1);
+}
+
+/* 32-bit FP subtraction RM */
+void HELPER(seb)(uint32_t f1, uint32_t val)
+{
+    float32 v1 = env->fregs[f1].l.upper;
+    CPU_FloatU v2;
+    v2.l = val;
+    env->fregs[f1].l.upper = float32_sub(v1, v2.f, &env->fpu_status);
+}
+
+/* 64-bit FP subtraction RM */
+uint32_t HELPER(sdb)(uint32_t f1, uint64_t a2)
+{
+    float64 v1 = env->fregs[f1].d;
+    CPU_DoubleU v2;
+    v2.ll = ldq(a2);
+    env->fregs[f1].d = v1 = float64_sub(v1, v2.d, &env->fpu_status);
+    return set_cc_nz_f64(v1);
+}
+
+/* 64-bit FP multiplication RM */
+void HELPER(mdb)(uint32_t f1, uint64_t a2)
+{
+    float64 v1 = env->fregs[f1].d;
+    CPU_DoubleU v2;
+    v2.ll = ldq(a2);
+    HELPER_LOG("%s: multiplying 0x%lx from f%d and 0x%ld\n", __FUNCTION__,
+               v1, f1, v2.d);
+    env->fregs[f1].d = float64_mul(v1, v2.d, &env->fpu_status);
+}
+
+/* 64-bit FP division RM */
+void HELPER(ddb)(uint32_t f1, uint64_t a2)
+{
+    float64 v1 = env->fregs[f1].d;
+    CPU_DoubleU v2;
+    v2.ll = ldq(a2);
+    HELPER_LOG("%s: dividing 0x%lx from f%d by 0x%ld\n", __FUNCTION__,
+               v1, f1, v2.d);
+    env->fregs[f1].d = float64_div(v1, v2.d, &env->fpu_status);
+}
+
+static void set_round_mode(int m3)
+{
+    switch (m3) {
+    case 0:
+        /* current mode */
+        break;
+    case 1:
+        /* biased round no nearest */
+    case 4:
+        /* round to nearest */
+        set_float_rounding_mode(float_round_nearest_even, &env->fpu_status);
+        break;
+    case 5:
+        /* round to zero */
+        set_float_rounding_mode(float_round_to_zero, &env->fpu_status);
+        break;
+    case 6:
+        /* round to +inf */
+        set_float_rounding_mode(float_round_up, &env->fpu_status);
+        break;
+    case 7:
+        /* round to -inf */
+        set_float_rounding_mode(float_round_down, &env->fpu_status);
+        break;
+    }
+}
+
+/* convert 32-bit float to 64-bit int */
+uint32_t HELPER(cgebr)(uint32_t r1, uint32_t f2, uint32_t m3)
+{
+    float32 v2 = env->fregs[f2].l.upper;
+    set_round_mode(m3);
+    env->regs[r1] = float32_to_int64(v2, &env->fpu_status);
+    return set_cc_nz_f32(v2);
+}
+
+/* convert 64-bit float to 64-bit int */
+uint32_t HELPER(cgdbr)(uint32_t r1, uint32_t f2, uint32_t m3)
+{
+    float64 v2 = env->fregs[f2].d;
+    set_round_mode(m3);
+    env->regs[r1] = float64_to_int64(v2, &env->fpu_status);
+    return set_cc_nz_f64(v2);
+}
+
+/* convert 128-bit float to 64-bit int */
+uint32_t HELPER(cgxbr)(uint32_t r1, uint32_t f2, uint32_t m3)
+{
+    CPU_QuadU v2;
+    v2.ll.upper = env->fregs[f2].ll;
+    v2.ll.lower = env->fregs[f2 + 2].ll;
+set_round_mode(m3);
+    env->regs[r1] = float128_to_int64(v2.q, &env->fpu_status);
+    if (float128_is_any_nan(v2.q)) {
+        return 3;
+    } else if (float128_is_zero(v2.q)) {
+        return 0;
+    } else if (float128_is_neg(v2.q)) {
+        return 1;
+    } else {
+        return 2;
+    }
+}
+
+/* convert 32-bit float to 32-bit int */
+uint32_t HELPER(cfebr)(uint32_t r1, uint32_t f2, uint32_t m3)
+{
+    float32 v2 = env->fregs[f2].l.upper;
+    set_round_mode(m3);
+    env->regs[r1] = (env->regs[r1] & 0xffffffff00000000ULL) |
+                     float32_to_int32(v2, &env->fpu_status);
+    return set_cc_nz_f32(v2);
+}
+
+/* convert 64-bit float to 32-bit int */
+uint32_t HELPER(cfdbr)(uint32_t r1, uint32_t f2, uint32_t m3)
+{
+    float64 v2 = env->fregs[f2].d;
+    set_round_mode(m3);
+    env->regs[r1] = (env->regs[r1] & 0xffffffff00000000ULL) |
+                     float64_to_int32(v2, &env->fpu_status);
+    return set_cc_nz_f64(v2);
+}
+
+/* convert 128-bit float to 32-bit int */
+uint32_t HELPER(cfxbr)(uint32_t r1, uint32_t f2, uint32_t m3)
+{
+    CPU_QuadU v2;
+    v2.ll.upper = env->fregs[f2].ll;
+    v2.ll.lower = env->fregs[f2 + 2].ll;
+    env->regs[r1] = (env->regs[r1] & 0xffffffff00000000ULL) |
+                     float128_to_int32(v2.q, &env->fpu_status);
+    return set_cc_nz_f128(v2.q);
+}
+
+/* load 32-bit FP zero */
+void HELPER(lzer)(uint32_t f1)
+{
+    env->fregs[f1].l.upper = float32_zero;
+}
+
+/* load 64-bit FP zero */
+void HELPER(lzdr)(uint32_t f1)
+{
+    env->fregs[f1].d = float64_zero;
+}
+
+/* load 128-bit FP zero */
+void HELPER(lzxr)(uint32_t f1)
+{
+    CPU_QuadU x;
+    x.q = float64_to_float128(float64_zero, &env->fpu_status);
+    env->fregs[f1].ll = x.ll.upper;
+    env->fregs[f1 + 1].ll = x.ll.lower;
+}
+
+/* 128-bit FP subtraction RR */
+uint32_t HELPER(sxbr)(uint32_t f1, uint32_t f2)
+{
+    CPU_QuadU v1;
+    v1.ll.upper = env->fregs[f1].ll;
+    v1.ll.lower = env->fregs[f1 + 2].ll;
+    CPU_QuadU v2;
+    v2.ll.upper = env->fregs[f2].ll;
+    v2.ll.lower = env->fregs[f2 + 2].ll;
+    CPU_QuadU res;
+    res.q = float128_sub(v1.q, v2.q, &env->fpu_status);
+    env->fregs[f1].ll = res.ll.upper;
+    env->fregs[f1 + 2].ll = res.ll.lower;
+    return set_cc_nz_f128(res.q);
+}
+
+/* 128-bit FP addition RR */
+uint32_t HELPER(axbr)(uint32_t f1, uint32_t f2)
+{
+    CPU_QuadU v1;
+    v1.ll.upper = env->fregs[f1].ll;
+    v1.ll.lower = env->fregs[f1 + 2].ll;
+    CPU_QuadU v2;
+    v2.ll.upper = env->fregs[f2].ll;
+    v2.ll.lower = env->fregs[f2 + 2].ll;
+    CPU_QuadU res;
+    res.q = float128_add(v1.q, v2.q, &env->fpu_status);
+    env->fregs[f1].ll = res.ll.upper;
+    env->fregs[f1 + 2].ll = res.ll.lower;
+    return set_cc_nz_f128(res.q);
+}
+
+/* 32-bit FP multiplication RR */
+void HELPER(meebr)(uint32_t f1, uint32_t f2)
+{
+    env->fregs[f1].l.upper = float32_mul(env->fregs[f1].l.upper,
+                                         env->fregs[f2].l.upper,
+                                         &env->fpu_status);
+}
+
+/* 64-bit FP division RR */
+void HELPER(ddbr)(uint32_t f1, uint32_t f2)
+{
+    env->fregs[f1].d = float64_div(env->fregs[f1].d, env->fregs[f2].d,
+                                   &env->fpu_status);
+}
+
+/* 64-bit FP multiply and add RM */
+void HELPER(madb)(uint32_t f1, uint64_t a2, uint32_t f3)
+{
+    HELPER_LOG("%s: f1 %d a2 0x%lx f3 %d\n", __FUNCTION__, f1, a2, f3);
+    CPU_DoubleU v2;
+    v2.ll = ldq(a2);
+    env->fregs[f1].d = float64_add(env->fregs[f1].d,
+                                   float64_mul(v2.d, env->fregs[f3].d,
+                                               &env->fpu_status),
+                                   &env->fpu_status);
+}
+
+/* 64-bit FP multiply and add RR */
+void HELPER(madbr)(uint32_t f1, uint32_t f3, uint32_t f2)
+{
+    HELPER_LOG("%s: f1 %d f2 %d f3 %d\n", __FUNCTION__, f1, f2, f3);
+    env->fregs[f1].d = float64_add(float64_mul(env->fregs[f2].d,
+                                               env->fregs[f3].d,
+                                               &env->fpu_status),
+                                   env->fregs[f1].d, &env->fpu_status);
+}
+
+/* 64-bit FP multiply and subtract RR */
+void HELPER(msdbr)(uint32_t f1, uint32_t f3, uint32_t f2)
+{
+    HELPER_LOG("%s: f1 %d f2 %d f3 %d\n", __FUNCTION__, f1, f2, f3);
+    env->fregs[f1].d = float64_sub(float64_mul(env->fregs[f2].d,
+                                               env->fregs[f3].d,
+                                               &env->fpu_status),
+                                   env->fregs[f1].d, &env->fpu_status);
+}
+
+/* 32-bit FP multiply and add RR */
+void HELPER(maebr)(uint32_t f1, uint32_t f3, uint32_t f2)
+{
+    env->fregs[f1].l.upper = float32_add(env->fregs[f1].l.upper,
+                                         float32_mul(env->fregs[f2].l.upper,
+                                                     env->fregs[f3].l.upper,
+                                                     &env->fpu_status),
+                                         &env->fpu_status);
+}
+
+/* convert 64-bit float to 128-bit float */
+void HELPER(lxdb)(uint32_t f1, uint64_t a2)
+{
+    CPU_DoubleU v2;
+    v2.ll = ldq(a2);
+    CPU_QuadU v1;
+    v1.q = float64_to_float128(v2.d, &env->fpu_status);
+    env->fregs[f1].ll = v1.ll.upper;
+    env->fregs[f1 + 2].ll = v1.ll.lower;
+}
+
+/* test data class 32-bit */
+uint32_t HELPER(tceb)(uint32_t f1, uint64_t m2)
+{
+    float32 v1 = env->fregs[f1].l.upper;
+    int neg = float32_is_neg(v1);
+    uint32_t cc = 0;
+
+    HELPER_LOG("%s: v1 0x%lx m2 0x%lx neg %d\n", __FUNCTION__, (long)v1, m2, neg);
+    if ((float32_is_zero(v1) && (m2 & (1 << (11-neg)))) ||
+        (float32_is_infinity(v1) && (m2 & (1 << (5-neg)))) ||
+        (float32_is_any_nan(v1) && (m2 & (1 << (3-neg)))) ||
+        (float32_is_signaling_nan(v1) && (m2 & (1 << (1-neg))))) {
+        cc = 1;
+    } else if (m2 & (1 << (9-neg))) {
+        /* assume normalized number */
+        cc = 1;
+    }
+
+    /* FIXME: denormalized? */
+    return cc;
+}
+
+/* test data class 64-bit */
+uint32_t HELPER(tcdb)(uint32_t f1, uint64_t m2)
+{
+    float64 v1 = env->fregs[f1].d;
+    int neg = float64_is_neg(v1);
+    uint32_t cc = 0;
+
+    HELPER_LOG("%s: v1 0x%lx m2 0x%lx neg %d\n", __FUNCTION__, v1, m2, neg);
+    if ((float64_is_zero(v1) && (m2 & (1 << (11-neg)))) ||
+        (float64_is_infinity(v1) && (m2 & (1 << (5-neg)))) ||
+        (float64_is_any_nan(v1) && (m2 & (1 << (3-neg)))) ||
+        (float64_is_signaling_nan(v1) && (m2 & (1 << (1-neg))))) {
+        cc = 1;
+    } else if (m2 & (1 << (9-neg))) {
+        /* assume normalized number */
+        cc = 1;
+    }
+    /* FIXME: denormalized? */
+    return cc;
+}
+
+/* test data class 128-bit */
+uint32_t HELPER(tcxb)(uint32_t f1, uint64_t m2)
+{
+    CPU_QuadU v1;
+    uint32_t cc = 0;
+    v1.ll.upper = env->fregs[f1].ll;
+    v1.ll.lower = env->fregs[f1 + 2].ll;
+
+    int neg = float128_is_neg(v1.q);
+    if ((float128_is_zero(v1.q) && (m2 & (1 << (11-neg)))) ||
+        (float128_is_infinity(v1.q) && (m2 & (1 << (5-neg)))) ||
+        (float128_is_any_nan(v1.q) && (m2 & (1 << (3-neg)))) ||
+        (float128_is_signaling_nan(v1.q) && (m2 & (1 << (1-neg))))) {
+        cc = 1;
+    } else if (m2 & (1 << (9-neg))) {
+        /* assume normalized number */
+        cc = 1;
+    }
+    /* FIXME: denormalized? */
+    return cc;
+}
+
+/* find leftmost one */
+uint32_t HELPER(flogr)(uint32_t r1, uint64_t v2)
+{
+    uint64_t res = 0;
+    uint64_t ov2 = v2;
+
+    while (!(v2 & 0x8000000000000000ULL) && v2) {
+        v2 <<= 1;
+        res++;
+    }
+
+    if (!v2) {
+        env->regs[r1] = 64;
+        env->regs[r1 + 1] = 0;
+        return 0;
+    } else {
+        env->regs[r1] = res;
+        env->regs[r1 + 1] = ov2 & ~(0x8000000000000000ULL >> res);
+        return 2;
+    }
+}
+
+/* square root 64-bit RR */
+void HELPER(sqdbr)(uint32_t f1, uint32_t f2)
+{
+    env->fregs[f1].d = float64_sqrt(env->fregs[f2].d, &env->fpu_status);
+}
+
+static inline uint64_t cksm_overflow(uint64_t cksm)
+{
+    if (cksm > 0xffffffffULL) {
+        cksm &= 0xffffffffULL;
+        cksm++;
+    }
+    return cksm;
+}
+
+/* checksum */
+void HELPER(cksm)(uint32_t r1, uint32_t r2)
+{
+    uint64_t src = get_address_31fix(r2);
+    uint64_t src_len = env->regs[(r2 + 1) & 15];
+    uint64_t cksm = 0;
+
+    while (src_len >= 4) {
+        cksm += ldl(src);
+        cksm = cksm_overflow(cksm);
+
+        /* move to next word */
+        src_len -= 4;
+        src += 4;
+    }
+
+    switch (src_len) {
+    case 0:
+        break;
+    case 1:
+        cksm += ldub(src);
+        cksm = cksm_overflow(cksm);
+        break;
+    case 2:
+        cksm += lduw(src);
+        cksm = cksm_overflow(cksm);
+        break;
+    case 3:
+        /* XXX check if this really is correct */
+        cksm += lduw(src) << 8;
+        cksm += ldub(src + 2);
+        cksm = cksm_overflow(cksm);
+        break;
+    }
+
+    /* indicate we've processed everything */
+    env->regs[(r2 + 1) & 15] = 0;
+
+    /* store result */
+    env->regs[r1] = (env->regs[r1] & 0xffffffff00000000ULL) | (uint32_t)cksm;
+}
+
+static inline uint32_t cc_calc_ltgt_32(CPUState *env, int32_t src,
+                                       int32_t dst)
+{
+    if (src == dst) {
+        return 0;
+    } else if (src < dst) {
+        return 1;
+    } else {
+        return 2;
+    }
+}
+
+static inline uint32_t cc_calc_ltgt0_32(CPUState *env, int32_t dst)
+{
+    return cc_calc_ltgt_32(env, dst, 0);
+}
+
+static inline uint32_t cc_calc_ltgt_64(CPUState *env, int64_t src,
+                                       int64_t dst)
+{
+    if (src == dst) {
+        return 0;
+    } else if (src < dst) {
+        return 1;
+    } else {
+        return 2;
+    }
+}
+
+static inline uint32_t cc_calc_ltgt0_64(CPUState *env, int64_t dst)
+{
+    return cc_calc_ltgt_64(env, dst, 0);
+}
+
+static inline uint32_t cc_calc_ltugtu_32(CPUState *env, uint32_t src,
+                                         uint32_t dst)
+{
+    if (src == dst) {
+        return 0;
+    } else if (src < dst) {
+        return 1;
+    } else {
+        return 2;
+    }
+}
+
+static inline uint32_t cc_calc_ltugtu_64(CPUState *env, uint64_t src,
+                                         uint64_t dst)
+{
+    if (src == dst) {
+        return 0;
+    } else if (src < dst) {
+        return 1;
+    } else {
+        return 2;
+    }
+}
+
+static inline uint32_t cc_calc_tm_32(CPUState *env, uint32_t val, uint32_t mask)
+{
+    HELPER_LOG("%s: val 0x%x mask 0x%x\n", __FUNCTION__, val, mask);
+    uint16_t r = val & mask;
+    if (r == 0 || mask == 0) {
+        return 0;
+    } else if (r == mask) {
+        return 3;
+    } else {
+        return 1;
+    }
+}
+
+/* set condition code for test under mask */
+static inline uint32_t cc_calc_tm_64(CPUState *env, uint64_t val, uint32_t mask)
+{
+    uint16_t r = val & mask;
+    HELPER_LOG("%s: val 0x%lx mask 0x%x r 0x%x\n", __FUNCTION__, val, mask, r);
+    if (r == 0 || mask == 0) {
+        return 0;
+    } else if (r == mask) {
+        return 3;
+    } else {
+        while (!(mask & 0x8000)) {
+            mask <<= 1;
+            val <<= 1;
+        }
+        if (val & 0x8000) {
+            return 2;
+        } else {
+            return 1;
+        }
+    }
+}
+
+static inline uint32_t cc_calc_nz(CPUState *env, uint64_t dst)
+{
+    return !!dst;
+}
+
+static inline uint32_t cc_calc_add_64(CPUState *env, int64_t a1, int64_t a2,
+                                      int64_t ar)
+{
+    if ((a1 > 0 && a2 > 0 && ar < 0) || (a1 < 0 && a2 < 0 && ar > 0)) {
+        return 3; /* overflow */
+    } else {
+        if (ar < 0) {
+            return 1;
+        } else if (ar > 0) {
+            return 2;
+        } else {
+            return 0;
+        }
+    }
+}
+
+static inline uint32_t cc_calc_addu_64(CPUState *env, uint64_t a1, uint64_t a2,
+                                       uint64_t ar)
+{
+    if (ar == 0) {
+        if (a1) {
+            return 2;
+        } else {
+            return 0;
+        }
+    } else {
+        if (ar < a1 || ar < a2) {
+          return 3;
+        } else {
+          return 1;
+        }
+    }
+}
+
+static inline uint32_t cc_calc_sub_64(CPUState *env, int64_t a1, int64_t a2,
+                                      int64_t ar)
+{
+    if ((a1 > 0 && a2 < 0 && ar < 0) || (a1 < 0 && a2 > 0 && ar > 0)) {
+        return 3; /* overflow */
+    } else {
+        if (ar < 0) {
+            return 1;
+        } else if (ar > 0) {
+            return 2;
+        } else {
+            return 0;
+        }
+    }
+}
+
+static inline uint32_t cc_calc_subu_64(CPUState *env, uint64_t a1, uint64_t a2,
+                                       uint64_t ar)
+{
+    if (ar == 0) {
+        return 2;
+    } else {
+        if (a2 > a1) {
+            return 1;
+        } else {
+            return 3;
+        }
+    }
+}
+
+static inline uint32_t cc_calc_abs_64(CPUState *env, int64_t dst)
+{
+    if ((uint64_t)dst == 0x8000000000000000ULL) {
+        return 3;
+    } else if (dst) {
+        return 1;
+    } else {
+        return 0;
+    }
+}
+
+static inline uint32_t cc_calc_nabs_64(CPUState *env, int64_t dst)
+{
+    return !!dst;
+}
+
+static inline uint32_t cc_calc_comp_64(CPUState *env, int64_t dst)
+{
+    if ((uint64_t)dst == 0x8000000000000000ULL) {
+        return 3;
+    } else if (dst < 0) {
+        return 1;
+    } else if (dst > 0) {
+        return 2;
+    } else {
+        return 0;
+    }
+}
+
+
+static inline uint32_t cc_calc_add_32(CPUState *env, int32_t a1, int32_t a2,
+                                      int32_t ar)
+{
+    if ((a1 > 0 && a2 > 0 && ar < 0) || (a1 < 0 && a2 < 0 && ar > 0)) {
+        return 3; /* overflow */
+    } else {
+        if (ar < 0) {
+            return 1;
+        } else if (ar > 0) {
+            return 2;
+        } else {
+            return 0;
+        }
+    }
+}
+
+static inline uint32_t cc_calc_addu_32(CPUState *env, uint32_t a1, uint32_t a2,
+                                       uint32_t ar)
+{
+    if (ar == 0) {
+        if (a1) {
+          return 2;
+        } else {
+          return 0;
+        }
+    } else {
+        if (ar < a1 || ar < a2) {
+          return 3;
+        } else {
+          return 1;
+        }
+    }
+}
+
+static inline uint32_t cc_calc_sub_32(CPUState *env, int32_t a1, int32_t a2,
+                                      int32_t ar)
+{
+    if ((a1 > 0 && a2 < 0 && ar < 0) || (a1 < 0 && a2 > 0 && ar > 0)) {
+        return 3; /* overflow */
+    } else {
+        if (ar < 0) {
+            return 1;
+        } else if (ar > 0) {
+            return 2;
+        } else {
+            return 0;
+        }
+    }
+}
+
+static inline uint32_t cc_calc_subu_32(CPUState *env, uint32_t a1, uint32_t a2,
+                                       uint32_t ar)
+{
+    if (ar == 0) {
+        return 2;
+    } else {
+        if (a2 > a1) {
+            return 1;
+        } else {
+            return 3;
+        }
+    }
+}
+
+static inline uint32_t cc_calc_abs_32(CPUState *env, int32_t dst)
+{
+    if ((uint32_t)dst == 0x80000000UL) {
+        return 3;
+    } else if (dst) {
+        return 1;
+    } else {
+        return 0;
+    }
+}
+
+static inline uint32_t cc_calc_nabs_32(CPUState *env, int32_t dst)
+{
+    return !!dst;
+}
+
+static inline uint32_t cc_calc_comp_32(CPUState *env, int32_t dst)
+{
+    if ((uint32_t)dst == 0x80000000UL) {
+        return 3;
+    } else if (dst < 0) {
+        return 1;
+    } else if (dst > 0) {
+        return 2;
+    } else {
+        return 0;
+    }
+}
+
+/* calculate condition code for insert character under mask insn */
+static inline uint32_t cc_calc_icm_32(CPUState *env, uint32_t mask, uint32_t val)
+{
+    HELPER_LOG("%s: mask 0x%x val %d\n", __FUNCTION__, mask, val);
+    uint32_t cc;
+    if (!val || !mask) {
+        cc = 0;
+    } else {
+        while (mask != 1) {
+            mask >>= 1;
+            val >>= 8;
+        }
+        if (val & 0x80) {
+            cc = 1;
+        } else {
+            cc = 2;
+        }
+    }
+    return cc;
+}
+
+static inline uint32_t do_calc_cc(CPUState *env, uint32_t cc_op, uint64_t src,
+                                  uint64_t dst, uint64_t vr)
+{
+    uint32_t r = 0;
+
+    switch (cc_op) {
+    case CC_OP_CONST0:
+    case CC_OP_CONST1:
+    case CC_OP_CONST2:
+    case CC_OP_CONST3:
+        /* cc_op value _is_ cc */
+        r = cc_op;
+        break;
+    case CC_OP_LTGT0_32:
+        r = cc_calc_ltgt0_32(env, dst);
+        break;
+    case CC_OP_LTGT0_64:
+        r =  cc_calc_ltgt0_64(env, dst);
+        break;
+    case CC_OP_LTGT_32:
+        r =  cc_calc_ltgt_32(env, src, dst);
+        break;
+    case CC_OP_LTGT_64:
+        r =  cc_calc_ltgt_64(env, src, dst);
+        break;
+    case CC_OP_LTUGTU_32:
+        r =  cc_calc_ltugtu_32(env, src, dst);
+        break;
+    case CC_OP_LTUGTU_64:
+        r =  cc_calc_ltugtu_64(env, src, dst);
+        break;
+    case CC_OP_TM_32:
+        r =  cc_calc_tm_32(env, src, dst);
+        break;
+    case CC_OP_TM_64:
+        r =  cc_calc_tm_64(env, src, dst);
+        break;
+    case CC_OP_NZ:
+        r =  cc_calc_nz(env, dst);
+        break;
+    case CC_OP_ADD_64:
+        r =  cc_calc_add_64(env, src, dst, vr);
+        break;
+    case CC_OP_ADDU_64:
+        r =  cc_calc_addu_64(env, src, dst, vr);
+        break;
+    case CC_OP_SUB_64:
+        r =  cc_calc_sub_64(env, src, dst, vr);
+        break;
+    case CC_OP_SUBU_64:
+        r =  cc_calc_subu_64(env, src, dst, vr);
+        break;
+    case CC_OP_ABS_64:
+        r =  cc_calc_abs_64(env, dst);
+        break;
+    case CC_OP_NABS_64:
+        r =  cc_calc_nabs_64(env, dst);
+        break;
+    case CC_OP_COMP_64:
+        r =  cc_calc_comp_64(env, dst);
+        break;
+
+    case CC_OP_ADD_32:
+        r =  cc_calc_add_32(env, src, dst, vr);
+        break;
+    case CC_OP_ADDU_32:
+        r =  cc_calc_addu_32(env, src, dst, vr);
+        break;
+    case CC_OP_SUB_32:
+        r =  cc_calc_sub_32(env, src, dst, vr);
+        break;
+    case CC_OP_SUBU_32:
+        r =  cc_calc_subu_32(env, src, dst, vr);
+        break;
+    case CC_OP_ABS_32:
+        r =  cc_calc_abs_64(env, dst);
+        break;
+    case CC_OP_NABS_32:
+        r =  cc_calc_nabs_64(env, dst);
+        break;
+    case CC_OP_COMP_32:
+        r =  cc_calc_comp_32(env, dst);
+        break;
+
+    case CC_OP_ICM:
+        r =  cc_calc_icm_32(env, src, dst);
+        break;
+
+    case CC_OP_LTGT_F32:
+        r = set_cc_f32(src, dst);
+        break;
+    case CC_OP_LTGT_F64:
+        r = set_cc_f64(src, dst);
+        break;
+    case CC_OP_NZ_F32:
+        r = set_cc_nz_f32(dst);
+        break;
+    case CC_OP_NZ_F64:
+        r = set_cc_nz_f64(dst);
+        break;
+
+    default:
+        cpu_abort(env, "Unknown CC operation: %s\n", cc_name(cc_op));
+    }
+
+    HELPER_LOG("%s: %15s 0x%016lx 0x%016lx 0x%016lx = %d\n", __FUNCTION__,
+               cc_name(cc_op), src, dst, vr, r);
+    return r;
+}
+
+uint32_t calc_cc(CPUState *env, uint32_t cc_op, uint64_t src, uint64_t dst,
+                 uint64_t vr)
+{
+    return do_calc_cc(env, cc_op, src, dst, vr);
+}
+
+uint32_t HELPER(calc_cc)(uint32_t cc_op, uint64_t src, uint64_t dst,
+                         uint64_t vr)
+{
+    return do_calc_cc(env, cc_op, src, dst, vr);
+}
+
+uint64_t HELPER(cvd)(int32_t bin)
+{
+    /* positive 0 */
+    uint64_t dec = 0x0c;
+    int shift = 4;
+
+    if (bin < 0) {
+        bin = -bin;
+        dec = 0x0d;
+    }
+
+    for (shift = 4; (shift < 64) && bin; shift += 4) {
+        int current_number = bin % 10;
+
+        dec |= (current_number) << shift;
+        bin /= 10;
+    }
+
+    return dec;
+}
+
+void HELPER(unpk)(uint32_t len, uint64_t dest, uint64_t src)
+{
+    int len_dest = len >> 4;
+    int len_src = len & 0xf;
+    uint8_t b;
+    int second_nibble = 0;
+
+    dest += len_dest;
+    src += len_src;
+
+    /* last byte is special, it only flips the nibbles */
+    b = ldub(src);
+    stb(dest, (b << 4) | (b >> 4));
+    src--;
+    len_src--;
+
+    /* now pad every nibble with 0xf0 */
+
+    while (len_dest > 0) {
+        uint8_t cur_byte = 0;
+
+        if (len_src > 0) {
+            cur_byte = ldub(src);
+        }
+
+        len_dest--;
+        dest--;
+
+        /* only advance one nibble at a time */
+        if (second_nibble) {
+            cur_byte >>= 4;
+            len_src--;
+            src--;
+        }
+        second_nibble = !second_nibble;
+
+        /* digit */
+        cur_byte = (cur_byte & 0xf);
+        /* zone bits */
+        cur_byte |= 0xf0;
+
+        stb(dest, cur_byte);
+    }
+}
+
+void HELPER(tr)(uint32_t len, uint64_t array, uint64_t trans)
+{
+    int i;
+
+    for (i = 0; i <= len; i++) {
+        uint8_t byte = ldub(array + i);
+        uint8_t new_byte = ldub(trans + byte);
+        stb(array + i, new_byte);
+    }
+}
+
+#ifndef CONFIG_USER_ONLY
+
+void HELPER(load_psw)(uint64_t mask, uint64_t addr)
+{
+    load_psw(env, mask, addr);
+    cpu_loop_exit();
+}
+
+static void program_interrupt(CPUState *env, uint32_t code, int ilc)
+{
+    qemu_log("program interrupt at %#" PRIx64 "\n", env->psw.addr);
+
+    if (kvm_enabled()) {
+        kvm_s390_interrupt(env, KVM_S390_PROGRAM_INT, code);
+    } else {
+        env->int_pgm_code = code;
+        env->int_pgm_ilc = ilc;
+        env->exception_index = EXCP_PGM;
+        cpu_loop_exit();
+    }
+}
+
+static void ext_interrupt(CPUState *env, int type, uint32_t param,
+                          uint64_t param64)
+{
+    cpu_inject_ext(env, type, param, param64);
+}
+
+int sclp_service_call(CPUState *env, uint32_t sccb, uint64_t code)
+{
+    int r = 0;
+
+#ifdef DEBUG_HELPER
+    printf("sclp(0x%x, 0x%" PRIx64 ")\n", sccb, code);
+#endif
+
+    if (sccb & ~0x7ffffff8ul) {
+        fprintf(stderr, "KVM: invalid sccb address 0x%x\n", sccb);
+        r = -1;
+        goto out;
+    }
+
+    switch(code) {
+        case SCLP_CMDW_READ_SCP_INFO:
+        case SCLP_CMDW_READ_SCP_INFO_FORCED:
+            stw_phys(sccb + SCP_MEM_CODE, ram_size >> 20);
+            stb_phys(sccb + SCP_INCREMENT, 1);
+            stw_phys(sccb + SCP_RESPONSE_CODE, 0x10);
+
+            if (kvm_enabled()) {
+#ifdef CONFIG_KVM
+                kvm_s390_interrupt_internal(env, KVM_S390_INT_SERVICE,
+                                            sccb & ~3, 0, 1);
+#endif
+            } else {
+                env->psw.addr += 4;
+                ext_interrupt(env, EXT_SERVICE, sccb & ~3, 0);
+            }
+            break;
+        default:
+#ifdef DEBUG_HELPER
+            printf("KVM: invalid sclp call 0x%x / 0x%" PRIx64 "x\n", sccb, code);
+#endif
+            r = -1;
+            break;
+    }
+
+out:
+    return r;
+}
+
+/* SCLP service call */
+uint32_t HELPER(servc)(uint32_t r1, uint64_t r2)
+{
+    if (sclp_service_call(env, r1, r2)) {
+        return 3;
+    }
+
+    return 0;
+}
+
+/* DIAG */
+uint64_t HELPER(diag)(uint32_t num, uint64_t mem, uint64_t code)
+{
+    uint64_t r;
+
+    switch (num) {
+    case 0x500:
+        /* KVM hypercall */
+        r = s390_virtio_hypercall(env, mem, code);
+        break;
+    case 0x44:
+        /* yield */
+        r = 0;
+        break;
+    case 0x308:
+        /* ipl */
+        r = 0;
+        break;
+    default:
+        r = -1;
+        break;
+    }
+
+    if (r) {
+        program_interrupt(env, PGM_OPERATION, ILC_LATER_INC);
+    }
+
+    return r;
+}
+
+/* Store CPU ID */
+void HELPER(stidp)(uint64_t a1)
+{
+    stq(a1, env->cpu_num);
+}
+
+/* Set Prefix */
+void HELPER(spx)(uint64_t a1)
+{
+    uint32_t prefix;
+
+    prefix = ldl(a1);
+    env->psa = prefix & 0xfffff000;
+    qemu_log("prefix: %#x\n", prefix);
+    tlb_flush_page(env, 0);
+    tlb_flush_page(env, TARGET_PAGE_SIZE);
+}
+
+/* Set Clock */
+uint32_t HELPER(sck)(uint64_t a1)
+{
+    /* XXX not implemented - is it necessary? */
+
+    return 0;
+}
+
+static inline uint64_t clock_value(CPUState *env)
+{
+    uint64_t time;
+
+    time = env->tod_offset +
+           time2tod(qemu_get_clock(vm_clock) - env->tod_basetime);
+
+    return time;
+}
+
+/* Store Clock */
+uint32_t HELPER(stck)(uint64_t a1)
+{
+    stq(a1, clock_value(env));
+
+    return 0;
+}
+
+/* Store Clock Extended */
+uint32_t HELPER(stcke)(uint64_t a1)
+{
+    stb(a1, 0);
+    /* basically the same value as stck */
+    stq(a1 + 1, clock_value(env) | env->cpu_num);
+    /* more fine grained than stck */
+    stq(a1 + 9, 0);
+    /* XXX programmable fields */
+    stw(a1 + 17, 0);
+
+
+    return 0;
+}
+
+/* Set Clock Comparator */
+void HELPER(sckc)(uint64_t a1)
+{
+    uint64_t time = ldq(a1);
+
+    if (time == -1ULL) {
+        return;
+    }
+
+    /* difference between now and then */
+    time -= clock_value(env);
+    /* nanoseconds */
+    time = (time * 125) >> 9;
+
+    qemu_mod_timer(env->tod_timer, qemu_get_clock(vm_clock) + time);
+}
+
+/* Store Clock Comparator */
+void HELPER(stckc)(uint64_t a1)
+{
+    /* XXX implement */
+    stq(a1, 0);
+}
+
+/* Set CPU Timer */
+void HELPER(spt)(uint64_t a1)
+{
+    uint64_t time = ldq(a1);
+
+    if (time == -1ULL) {
+        return;
+    }
+
+    /* nanoseconds */
+    time = (time * 125) >> 9;
+
+    qemu_mod_timer(env->cpu_timer, qemu_get_clock(vm_clock) + time);
+}
+
+/* Store CPU Timer */
+void HELPER(stpt)(uint64_t a1)
+{
+    /* XXX implement */
+    stq(a1, 0);
+}
+
+/* Store System Information */
+uint32_t HELPER(stsi)(uint64_t a0, uint32_t r0, uint32_t r1)
+{
+    int cc = 0;
+    int sel1, sel2;
+
+    if ((r0 & STSI_LEVEL_MASK) <= STSI_LEVEL_3 &&
+        ((r0 & STSI_R0_RESERVED_MASK) || (r1 & STSI_R1_RESERVED_MASK))) {
+        /* valid function code, invalid reserved bits */
+        program_interrupt(env, PGM_SPECIFICATION, 2);
+    }
+
+    sel1 = r0 & STSI_R0_SEL1_MASK;
+    sel2 = r1 & STSI_R1_SEL2_MASK;
+
+    /* XXX: spec exception if sysib is not 4k-aligned */
+
+    switch (r0 & STSI_LEVEL_MASK) {
+    case STSI_LEVEL_1:
+        if ((sel1 == 1) && (sel2 == 1)) {
+            /* Basic Machine Configuration */
+            struct sysib_111 sysib;
+
+            memset(&sysib, 0, sizeof(sysib));
+            ebcdic_put(sysib.manuf, "QEMU            ", 16);
+            /* same as machine type number in STORE CPU ID */
+            ebcdic_put(sysib.type, "QEMU", 4);
+            /* same as model number in STORE CPU ID */
+            ebcdic_put(sysib.model, "QEMU            ", 16);
+            ebcdic_put(sysib.sequence, "QEMU            ", 16);
+            ebcdic_put(sysib.plant, "QEMU", 4);
+            cpu_physical_memory_rw(a0, (uint8_t*)&sysib, sizeof(sysib), 1);
+        } else if ((sel1 == 2) && (sel2 == 1)) {
+            /* Basic Machine CPU */
+            struct sysib_121 sysib;
+
+            memset(&sysib, 0, sizeof(sysib));
+            /* XXX make different for different CPUs? */
+            ebcdic_put(sysib.sequence, "QEMUQEMUQEMUQEMU", 16);
+            ebcdic_put(sysib.plant, "QEMU", 4);
+            stw_p(&sysib.cpu_addr, env->cpu_num);
+            cpu_physical_memory_rw(a0, (uint8_t*)&sysib, sizeof(sysib), 1);
+        } else if ((sel1 == 2) && (sel2 == 2)) {
+            /* Basic Machine CPUs */
+            struct sysib_122 sysib;
+
+            memset(&sysib, 0, sizeof(sysib));
+            stl_p(&sysib.capability, 0x443afc29);
+            /* XXX change when SMP comes */
+            stw_p(&sysib.total_cpus, 1);
+            stw_p(&sysib.active_cpus, 1);
+            stw_p(&sysib.standby_cpus, 0);
+            stw_p(&sysib.reserved_cpus, 0);
+            cpu_physical_memory_rw(a0, (uint8_t*)&sysib, sizeof(sysib), 1);
+        } else {
+            cc = 3;
+        }
+        break;
+    case STSI_LEVEL_2:
+    {
+        if ((sel1 == 2) && (sel2 == 1)) {
+            /* LPAR CPU */
+            struct sysib_221 sysib;
+
+            memset(&sysib, 0, sizeof(sysib));
+            /* XXX make different for different CPUs? */
+            ebcdic_put(sysib.sequence, "QEMUQEMUQEMUQEMU", 16);
+            ebcdic_put(sysib.plant, "QEMU", 4);
+            stw_p(&sysib.cpu_addr, env->cpu_num);
+            stw_p(&sysib.cpu_id, 0);
+            cpu_physical_memory_rw(a0, (uint8_t*)&sysib, sizeof(sysib), 1);
+        } else if ((sel1 == 2) && (sel2 == 2)) {
+            /* LPAR CPUs */
+            struct sysib_222 sysib;
+
+            memset(&sysib, 0, sizeof(sysib));
+            stw_p(&sysib.lpar_num, 0);
+            sysib.lcpuc = 0;
+            /* XXX change when SMP comes */
+            stw_p(&sysib.total_cpus, 1);
+            stw_p(&sysib.conf_cpus, 1);
+            stw_p(&sysib.standby_cpus, 0);
+            stw_p(&sysib.reserved_cpus, 0);
+            ebcdic_put(sysib.name, "QEMU    ", 8);
+            stl_p(&sysib.caf, 1000);
+            stw_p(&sysib.dedicated_cpus, 0);
+            stw_p(&sysib.shared_cpus, 0);
+            cpu_physical_memory_rw(a0, (uint8_t*)&sysib, sizeof(sysib), 1);
+        } else {
+            cc = 3;
+        }
+        break;
+    }
+    case STSI_LEVEL_3:
+    {
+        if ((sel1 == 2) && (sel2 == 2)) {
+            /* VM CPUs */
+            struct sysib_322 sysib;
+
+            memset(&sysib, 0, sizeof(sysib));
+            sysib.count = 1;
+            /* XXX change when SMP comes */
+            stw_p(&sysib.vm[0].total_cpus, 1);
+            stw_p(&sysib.vm[0].conf_cpus, 1);
+            stw_p(&sysib.vm[0].standby_cpus, 0);
+            stw_p(&sysib.vm[0].reserved_cpus, 0);
+            ebcdic_put(sysib.vm[0].name, "KVMguest", 8);
+            stl_p(&sysib.vm[0].caf, 1000);
+            ebcdic_put(sysib.vm[0].cpi, "KVM/Linux       ", 16);
+            cpu_physical_memory_rw(a0, (uint8_t*)&sysib, sizeof(sysib), 1);
+        } else {
+            cc = 3;
+        }
+        break;
+    }
+    case STSI_LEVEL_CURRENT:
+        env->regs[0] = STSI_LEVEL_3;
+        break;
+    default:
+        cc = 3;
+        break;
+    }
+
+    return cc;
+}
+
+void HELPER(lctlg)(uint32_t r1, uint64_t a2, uint32_t r3)
+{
+    int i;
+    uint64_t src = a2;
+
+    for (i = r1;; i = (i + 1) % 16) {
+        env->cregs[i] = ldq(src);
+        HELPER_LOG("load ctl %d from 0x%" PRIx64 " == 0x%" PRIx64 "\n",
+                   i, src, env->cregs[i]);
+        src += sizeof(uint64_t);
+
+        if (i == r3) {
+            break;
+        }
+    }
+
+    tlb_flush(env, 1);
+}
+
+void HELPER(lctl)(uint32_t r1, uint64_t a2, uint32_t r3)
+{
+    int i;
+    uint64_t src = a2;
+
+    for (i = r1;; i = (i + 1) % 16) {
+        env->cregs[i] = (env->cregs[i] & 0xFFFFFFFF00000000ULL) | ldl(src);
+        src += sizeof(uint32_t);
+
+        if (i == r3) {
+            break;
+        }
+    }
+
+    tlb_flush(env, 1);
+}
+
+void HELPER(stctg)(uint32_t r1, uint64_t a2, uint32_t r3)
+{
+    int i;
+    uint64_t dest = a2;
+
+    for (i = r1;; i = (i + 1) % 16) {
+        stq(dest, env->cregs[i]);
+        dest += sizeof(uint64_t);
+
+        if (i == r3) {
+            break;
+        }
+    }
+}
+
+void HELPER(stctl)(uint32_t r1, uint64_t a2, uint32_t r3)
+{
+    int i;
+    uint64_t dest = a2;
+
+    for (i = r1;; i = (i + 1) % 16) {
+        stl(dest, env->cregs[i]);
+        dest += sizeof(uint32_t);
+
+        if (i == r3) {
+            break;
+        }
+    }
+}
+
+uint32_t HELPER(tprot)(uint64_t a1, uint64_t a2)
+{
+    /* XXX implement */
+
+    return 0;
+}
+
+/* insert storage key extended */
+uint64_t HELPER(iske)(uint64_t r2)
+{
+    uint64_t addr = get_address(0, 0, r2);
+
+    if (addr > ram_size) {
+        return 0;
+    }
+
+    /* XXX maybe use qemu's internal keys? */
+    return env->storage_keys[addr / TARGET_PAGE_SIZE];
+}
+
+/* set storage key extended */
+void HELPER(sske)(uint32_t r1, uint64_t r2)
+{
+    uint64_t addr = get_address(0, 0, r2);
+
+    if (addr > ram_size) {
+        return;
+    }
+
+    env->storage_keys[addr / TARGET_PAGE_SIZE] = r1;
+}
+
+/* reset reference bit extended */
+uint32_t HELPER(rrbe)(uint32_t r1, uint64_t r2)
+{
+    if (r2 > ram_size) {
+        return 0;
+    }
+
+    /* XXX implement */
+#if 0
+    env->storage_keys[r2 / TARGET_PAGE_SIZE] &= ~SK_REFERENCED;
+#endif
+
+    /*
+     * cc
+     *
+     * 0  Reference bit zero; change bit zero
+     * 1  Reference bit zero; change bit one
+     * 2  Reference bit one; change bit zero
+     * 3  Reference bit one; change bit one
+     */
+    return 0;
+}
+
+/* compare and swap and purge */
+uint32_t HELPER(csp)(uint32_t r1, uint32_t r2)
+{
+    uint32_t cc;
+    uint32_t o1 = env->regs[r1];
+    uint64_t a2 = get_address_31fix(r2) & ~3ULL;
+    uint32_t o2 = ldl(a2);
+
+    if (o1 == o2) {
+        stl(a2, env->regs[(r1 + 1) & 15]);
+        if (env->regs[r2] & 0x3) {
+            /* flush TLB / ALB */
+            tlb_flush(env, 1);
+        }
+        cc = 0;
+    } else {
+        env->regs[r1] = (env->regs[r1] & 0xffffffff00000000ULL) | o2;
+        cc = 1;
+    }
+
+    return cc;
+}
+
+static uint32_t mvc_asc(int64_t l, uint64_t a1, uint64_t mode1, uint64_t a2,
+                        uint64_t mode2)
+{
+    target_ulong src, dest;
+    int flags, cc = 0, i;
+
+    if (!l) {
+        return 0;
+    } else if (l > 256) {
+        /* max 256 */
+        l = 256;
+        cc = 3;
+    }
+
+    if (mmu_translate(env, a1 & TARGET_PAGE_MASK, 1, mode1, &dest, &flags)) {
+        cpu_loop_exit();
+    }
+    dest |= a1 & ~TARGET_PAGE_MASK;
+
+    if (mmu_translate(env, a2 & TARGET_PAGE_MASK, 0, mode2, &src, &flags)) {
+        cpu_loop_exit();
+    }
+    src |= a2 & ~TARGET_PAGE_MASK;
+
+    /* XXX replace w/ memcpy */
+    for (i = 0; i < l; i++) {
+        /* XXX be more clever */
+        if ((((dest + i) & TARGET_PAGE_MASK) != (dest & TARGET_PAGE_MASK)) ||
+            (((src + i) & TARGET_PAGE_MASK) != (src & TARGET_PAGE_MASK))) {
+            mvc_asc(l - i, a1 + i, mode1, a2 + i, mode2);
+            break;
+        }
+        stb_phys(dest + i, ldub_phys(src + i));
+    }
+
+    return cc;
+}
+
+uint32_t HELPER(mvcs)(uint64_t l, uint64_t a1, uint64_t a2)
+{
+    HELPER_LOG("%s: %16" PRIx64 " %16" PRIx64 " %16" PRIx64 "\n",
+               __FUNCTION__, l, a1, a2);
+
+    return mvc_asc(l, a1, PSW_ASC_SECONDARY, a2, PSW_ASC_PRIMARY);
+}
+
+uint32_t HELPER(mvcp)(uint64_t l, uint64_t a1, uint64_t a2)
+{
+    HELPER_LOG("%s: %16" PRIx64 " %16" PRIx64 " %16" PRIx64 "\n",
+               __FUNCTION__, l, a1, a2);
+
+    return mvc_asc(l, a1, PSW_ASC_PRIMARY, a2, PSW_ASC_SECONDARY);
+}
+
+uint32_t HELPER(sigp)(uint64_t order_code, uint32_t r1, uint64_t cpu_addr)
+{
+    int cc = 0;
+
+    HELPER_LOG("%s: %016" PRIx64 " %08x %016" PRIx64 "\n",
+               __FUNCTION__, order_code, r1, cpu_addr);
+
+    /* Remember: Use "R1 or R1+1, whichever is the odd-numbered register"
+       as parameter (input). Status (output) is always R1. */
+
+    switch (order_code) {
+    case SIGP_SET_ARCH:
+        /* switch arch */
+        break;
+    case SIGP_SENSE:
+        /* enumerate CPU status */
+        if (cpu_addr) {
+            /* XXX implement when SMP comes */
+            return 3;
+        }
+        env->regs[r1] &= 0xffffffff00000000ULL;
+        cc = 1;
+        break;
+    default:
+        /* unknown sigp */
+        fprintf(stderr, "XXX unknown sigp: 0x%" PRIx64 "\n", order_code);
+        cc = 3;
+    }
+
+    return cc;
+}
+
+void HELPER(sacf)(uint64_t a1)
+{
+    HELPER_LOG("%s: %16" PRIx64 "\n", __FUNCTION__, a1);
+
+    switch (a1 & 0xf00) {
+    case 0x000:
+        env->psw.mask &= ~PSW_MASK_ASC;
+        env->psw.mask |= PSW_ASC_PRIMARY;
+        break;
+    case 0x100:
+        env->psw.mask &= ~PSW_MASK_ASC;
+        env->psw.mask |= PSW_ASC_SECONDARY;
+        break;
+    case 0x300:
+        env->psw.mask &= ~PSW_MASK_ASC;
+        env->psw.mask |= PSW_ASC_HOME;
+        break;
+    default:
+        qemu_log("unknown sacf mode: %" PRIx64 "\n", a1);
+        program_interrupt(env, PGM_SPECIFICATION, 2);
+        break;
+    }
+}
+
+/* invalidate pte */
+void HELPER(ipte)(uint64_t pte_addr, uint64_t vaddr)
+{
+    uint64_t page = vaddr & TARGET_PAGE_MASK;
+    uint64_t pte = 0;
+
+    /* XXX broadcast to other CPUs */
+
+    /* XXX Linux is nice enough to give us the exact pte address.
+           According to spec we'd have to find it out ourselves */
+    /* XXX Linux is fine with overwriting the pte, the spec requires
+           us to only set the invalid bit */
+    stq_phys(pte_addr, pte | _PAGE_INVALID);
+
+    /* XXX we exploit the fact that Linux passes the exact virtual
+           address here - it's not obliged to! */
+    tlb_flush_page(env, page);
+}
+
+/* flush local tlb */
+void HELPER(ptlb)(void)
+{
+    tlb_flush(env, 1);
+}
+
+/* store using real address */
+void HELPER(stura)(uint64_t addr, uint32_t v1)
+{
+    stw_phys(get_address(0, 0, addr), v1);
+}
+
+/* load real address */
+uint32_t HELPER(lra)(uint64_t addr, uint32_t r1)
+{
+    uint32_t cc = 0;
+    int old_exc = env->exception_index;
+    uint64_t asc = env->psw.mask & PSW_MASK_ASC;
+    uint64_t ret;
+    int flags;
+
+    /* XXX incomplete - has more corner cases */
+    if (!(env->psw.mask & PSW_MASK_64) && (addr >> 32)) {
+        program_interrupt(env, PGM_SPECIAL_OP, 2);
+    }
+
+    env->exception_index = old_exc;
+    if (mmu_translate(env, addr, 0, asc, &ret, &flags)) {
+        cc = 3;
+    }
+    if (env->exception_index == EXCP_PGM) {
+        ret = env->int_pgm_code | 0x80000000;
+    } else {
+        ret |= addr & ~TARGET_PAGE_MASK;
+    }
+    env->exception_index = old_exc;
+
+    if (!(env->psw.mask & PSW_MASK_64)) {
+        env->regs[r1] = (env->regs[r1] & 0xffffffff00000000ULL) | (ret & 0xffffffffULL);
+    } else {
+        env->regs[r1] = ret;
+    }
+
+    return cc;
+}
+
+#endif