Patchwork target-alpha: An approach to fp insn qualifiers

login
register
mail settings
Submitter Richard Henderson
Date Dec. 14, 2009, 6:02 p.m.
Message ID <4B267DBC.8050909@twiddle.net>
Download mbox | patch
Permalink /patch/41117/
State New
Headers show

Comments

Richard Henderson - Dec. 14, 2009, 6:02 p.m.
One of the other folks here (I'm sorry, I've forgotten who as I 
misplaced the mail) passed me a preliminary patch to tackle the missing 
fp rounding mode support.  That patch added helpers to change the 
rounding mode, and injected them before and after every fp insn that 
forced a rounding mode.

After some thought I decided not to pursue this method for the simple 
reason that TCG does not have native floating point code generation. 
Given that we have to call one helper routine to perform the operation, 
we might as well arrange for that same routine to handle the rounding 
mode manipulation.

Indeed, the patch below doesn't just pass the rounding mode to the 
helper, but all of the qualifier fields.  Thus we can avoid a large 
explosion of helper routines for addt{,/u,/su,/sui}{,/c,/m,/d}.

To complete the patch I should add some symbolic defines for /s, /u, etc 
to avoid magic constants cluttering the code.  I should figure out what 
I should pass to helper_excp for each arithmetic exception.  I should 
actually implement the arithmetic exceptions.

That said, these two patches are enough to pass the gcc testsuite with 
no regressions over native hardware.

Comments?


r~
commit 14cb36374f15413abfafb4fc030dcab25a2e0d9a
Author: Richard Henderson <rth@twiddle.net>
Date:   Fri Dec 11 08:40:22 2009 -0800

    Phase 1, adding a "Quals" argument to all of the fpu helpers.
commit a0cc18be5837339ad6da257a93597fc81a28c042
Author: Richard Henderson <rth@twiddle.net>
Date:   Mon Dec 14 09:31:22 2009 -0800

    target-alpha: Fix cvttq.
    
    The alpha fp-integer conversion instruction truncates
    instead of saturating like the generic IEEE version does.

diff --git a/target-alpha/op_helper.c b/target-alpha/op_helper.c
index b12c783..3bb0020 100644
--- a/target-alpha/op_helper.c
+++ b/target-alpha/op_helper.c
@@ -24,7 +24,7 @@
 
 /*****************************************************************************/
 /* Exceptions processing helpers */
-void helper_excp (int excp, int error)
+void QEMU_NORETURN helper_excp (int excp, int error)
 {
     env->exception_index = excp;
     env->error_code = error;
@@ -1166,15 +1166,112 @@ uint64_t helper_cvtqs (uint64_t a, uint32_t quals)
 
 uint64_t helper_cvttq (uint64_t a, uint32_t quals)
 {
-    float64 fa;
-    uint64_t ret;
-    uint32_t token;
-
-    fa = t_to_float64(a);
+    uint64_t ret, frac;
+    uint32_t token, exp, sign, exc = 0;
 
     token = begin_fp(quals);
-    float64_input(quals, fa);
-    ret = float64_to_int64(fa, &FP_STATUS);
+
+    /* Alpha integer conversion does not saturate, as the generic routine
+       does.  Instead it supplies a truncated result.  This fact is relied
+       upon by GCC in that without overflow enabled we can get unsigned
+       conversion for free with the same instruction.  */
+
+    sign = (a >> 63);
+    exp = (uint32_t)(a >> 52) & 0x7ff;
+    frac = a & 0xfffffffffffffull;
+
+    if (exp == 0) {
+        ret = 0;
+        if (frac != 0) {
+            /* ??? If DNZ set, map to zero without trapping.  */
+            /* ??? Figure out what kind of exception signal to send.  */
+            if (!(quals & 0x400))
+                helper_excp(EXCP_ARITH, 0);
+            goto do_underflow;
+        }
+    } else if (exp == 0x7ff) {
+        /* In keeping with the truncation result, both infinity and NaN
+           give result of zero.  See Table B-2 in the Alpha Architecture
+           Handbook.  */
+        ret = 0;
+        exc = float_flag_invalid;
+
+        /* Without /s qualifier, both Inf and NaN trap.  SNaN always traps. */
+        if (!(quals & 0x400) || (frac & 0x4000000000000ull))
+            helper_excp(EXCP_ARITH, 0);
+    } else {
+        int32_t shift;
+
+        /* Restore implicit bit.  */
+        frac |= 0x10000000000000ull;
+
+        shift = exp - 1023 - 52;
+        if (shift > 0) {
+            /* In this case the number is so large that we must shift
+               the fraction left.  There is no rounding to do, but we
+               must still set inexact for overflow.  */
+            if (shift < 63) {
+                ret = frac << shift;
+                if ((ret >> shift) != frac)
+                    exc = float_flag_inexact;
+            } else {
+                exc = float_flag_inexact;
+                ret = 0;
+            }
+        } else if (shift == 0) {
+            /* The exponent is exactly right for the 52-bit fraction.  */
+            ret = frac;
+        } else {
+            uint64_t round;
+
+            /* In this case the number is smaller than the fraction as
+               represented by the 52 bit number.  Here we must think 
+               about rounding the result.  Handle this by shifting the
+               fractional part of the number into the high bits of ROUND.
+               This will let us efficiently handle round-to-nearest.  */
+            shift = -shift;
+            if (shift < 63) {
+                ret = frac >> shift;
+                round = frac << (64 - shift);
+            } else {
+            do_underflow:
+                /* The exponent is so small we shift out everything.  */
+                ret = 0;
+                round = 1;
+            }
+
+            if (round) {
+                exc = float_flag_inexact;
+                switch (FP_STATUS.float_rounding_mode) {
+                case float_round_nearest_even:
+                    if (round == (1ull << 63)) {
+                        /* The remaining fraction is exactly 0.5;
+                           round to even.  */
+                        ret += (ret & 1);
+                    } else if (round > (1ull << 63)) {
+                        ret += 1;
+                    }
+                    break;
+                case float_round_to_zero:
+                    break;
+                case float_round_up:
+                    if (!sign)
+                        ret += 1;
+                    break;
+                case float_round_down:
+                    if (sign)
+                        ret += 1;
+                    break;
+                }
+            }
+        }
+
+        if (sign)
+            ret = -ret;
+    }
+
+    if (exc)
+        float_raise(exc, &FP_STATUS);
     end_fp(quals, token);
 
     return ret;
Laurent Desnogues - Dec. 14, 2009, 8:11 p.m.
On Mon, Dec 14, 2009 at 7:02 PM, Richard Henderson <rth@twiddle.net> wrote:
> One of the other folks here (I'm sorry, I've forgotten who as I misplaced
> the mail) passed me a preliminary patch to tackle the missing fp rounding
> mode support.  That patch added helpers to change the rounding mode, and
> injected them before and after every fp insn that forced a rounding mode.

That was me.

> After some thought I decided not to pursue this method for the simple reason
> that TCG does not have native floating point code generation.

I have started implementing TCG FP, but lack the time and desire to
complete it.  The speed up was pretty good when running on an ARM
host with a fast FPU.

> Given that we
> have to call one helper routine to perform the operation, we might as well
> arrange for that same routine to handle the rounding mode manipulation.
>
> Indeed, the patch below doesn't just pass the rounding mode to the helper,
> but all of the qualifier fields.  Thus we can avoid a large explosion of
> helper routines for addt{,/u,/su,/sui}{,/c,/m,/d}.

I don't really like passing parts of opcodes to helpers, but as you say
that prevents explosion of helpers.  OTOH you could do lazy calls to
helpers that set rounding modes with my approach of separating them
from computation.

I'll take a closer look at your patch tomorrow.

> To complete the patch I should add some symbolic defines for /s, /u, etc to
> avoid magic constants cluttering the code.  I should figure out what I
> should pass to helper_excp for each arithmetic exception.  I should actually
> implement the arithmetic exceptions.
>
> That said, these two patches are enough to pass the gcc testsuite with no
> regressions over native hardware.

Can you give SPECint 2K equake a try? The symptom was the presence
of many NaN's.  If you don't have access to SPEC2K I'll try it.


Laurent
Richard Henderson - Dec. 14, 2009, 10:21 p.m.
On 12/14/2009 12:11 PM, Laurent Desnogues wrote:
> I don't really like passing parts of opcodes to helpers, but as you say
> that prevents explosion of helpers.  OTOH you could do lazy calls to
> helpers that set rounding modes with my approach of separating them
> from computation.

I don't particularly like it either, I just thought it was a bit
more efficient, given the current state of TCG.

If it's not acceptable, I can try to do some macro magic in both
op_helper.c and helper.h to do function replication on the /u,/su,/sui
bits and then follow your lead on the lazy rounding mode setting.

> Can you give SPECint 2K equake a try? The symptom was the presence
> of many NaN's.  If you don't have access to SPEC2K I'll try it.

Yes, I can.  I'll see about setting this up tonight.


r~

Patch

diff --git a/target-alpha/helper.h b/target-alpha/helper.h
index bedd3c0..1521a84 100644
--- a/target-alpha/helper.h
+++ b/target-alpha/helper.h
@@ -41,33 +41,33 @@  DEF_HELPER_1(store_fpcr, void, i64)
 
 DEF_HELPER_1(f_to_memory, i32, i64)
 DEF_HELPER_1(memory_to_f, i64, i32)
-DEF_HELPER_2(addf, i64, i64, i64)
-DEF_HELPER_2(subf, i64, i64, i64)
-DEF_HELPER_2(mulf, i64, i64, i64)
-DEF_HELPER_2(divf, i64, i64, i64)
-DEF_HELPER_1(sqrtf, i64, i64)
+DEF_HELPER_3(addf, i64, i64, i64, i32)
+DEF_HELPER_3(subf, i64, i64, i64, i32)
+DEF_HELPER_3(mulf, i64, i64, i64, i32)
+DEF_HELPER_3(divf, i64, i64, i64, i32)
+DEF_HELPER_2(sqrtf, i64, i64, i32)
 
 DEF_HELPER_1(g_to_memory, i64, i64)
 DEF_HELPER_1(memory_to_g, i64, i64)
-DEF_HELPER_2(addg, i64, i64, i64)
-DEF_HELPER_2(subg, i64, i64, i64)
-DEF_HELPER_2(mulg, i64, i64, i64)
-DEF_HELPER_2(divg, i64, i64, i64)
-DEF_HELPER_1(sqrtg, i64, i64)
+DEF_HELPER_3(addg, i64, i64, i64, i32)
+DEF_HELPER_3(subg, i64, i64, i64, i32)
+DEF_HELPER_3(mulg, i64, i64, i64, i32)
+DEF_HELPER_3(divg, i64, i64, i64, i32)
+DEF_HELPER_2(sqrtg, i64, i64, i32)
 
 DEF_HELPER_1(s_to_memory, i32, i64)
 DEF_HELPER_1(memory_to_s, i64, i32)
-DEF_HELPER_2(adds, i64, i64, i64)
-DEF_HELPER_2(subs, i64, i64, i64)
-DEF_HELPER_2(muls, i64, i64, i64)
-DEF_HELPER_2(divs, i64, i64, i64)
-DEF_HELPER_1(sqrts, i64, i64)
-
-DEF_HELPER_2(addt, i64, i64, i64)
-DEF_HELPER_2(subt, i64, i64, i64)
-DEF_HELPER_2(mult, i64, i64, i64)
-DEF_HELPER_2(divt, i64, i64, i64)
-DEF_HELPER_1(sqrtt, i64, i64)
+DEF_HELPER_3(adds, i64, i64, i64, i32)
+DEF_HELPER_3(subs, i64, i64, i64, i32)
+DEF_HELPER_3(muls, i64, i64, i64, i32)
+DEF_HELPER_3(divs, i64, i64, i64, i32)
+DEF_HELPER_2(sqrts, i64, i64, i32)
+
+DEF_HELPER_3(addt, i64, i64, i64, i32)
+DEF_HELPER_3(subt, i64, i64, i64, i32)
+DEF_HELPER_3(mult, i64, i64, i64, i32)
+DEF_HELPER_3(divt, i64, i64, i64, i32)
+DEF_HELPER_2(sqrtt, i64, i64, i32)
 
 DEF_HELPER_2(cmptun, i64, i64, i64)
 DEF_HELPER_2(cmpteq, i64, i64, i64)
@@ -81,15 +81,15 @@  DEF_HELPER_2(cpys, i64, i64, i64)
 DEF_HELPER_2(cpysn, i64, i64, i64)
 DEF_HELPER_2(cpyse, i64, i64, i64)
 
-DEF_HELPER_1(cvtts, i64, i64)
-DEF_HELPER_1(cvtst, i64, i64)
-DEF_HELPER_1(cvttq, i64, i64)
-DEF_HELPER_1(cvtqs, i64, i64)
-DEF_HELPER_1(cvtqt, i64, i64)
-DEF_HELPER_1(cvtqf, i64, i64)
-DEF_HELPER_1(cvtgf, i64, i64)
-DEF_HELPER_1(cvtgq, i64, i64)
-DEF_HELPER_1(cvtqg, i64, i64)
+DEF_HELPER_2(cvtts, i64, i64, i32)
+DEF_HELPER_2(cvtst, i64, i64, i32)
+DEF_HELPER_2(cvttq, i64, i64, i32)
+DEF_HELPER_2(cvtqs, i64, i64, i32)
+DEF_HELPER_2(cvtqt, i64, i64, i32)
+DEF_HELPER_2(cvtqf, i64, i64, i32)
+DEF_HELPER_2(cvtgf, i64, i64, i32)
+DEF_HELPER_2(cvtgq, i64, i64, i32)
+DEF_HELPER_2(cvtqg, i64, i64, i32)
 DEF_HELPER_1(cvtlq, i64, i64)
 DEF_HELPER_1(cvtql, i64, i64)
 DEF_HELPER_1(cvtqlv, i64, i64)
diff --git a/target-alpha/op_helper.c b/target-alpha/op_helper.c
index b2abf6c..b12c783 100644
--- a/target-alpha/op_helper.c
+++ b/target-alpha/op_helper.c
@@ -370,6 +370,85 @@  uint64_t helper_unpkbw (uint64_t op1)
 
 /* Floating point helpers */
 
+/* If the floating-point qualifiers specified a rounding mode,
+   set that rounding mode and remember the original mode for
+   resetting at the end of the instruction.  */
+static inline uint32_t begin_quals_roundmode(uint32_t qual)
+{
+    uint32_t rm = FP_STATUS.float_rounding_mode, old_rm = rm;
+
+    switch (qual & 0xc0) {
+    case 0x80:
+        rm = float_round_nearest_even;
+        break;
+    case 0x00:
+        rm = float_round_to_zero;
+        break;
+    case 0x40:
+        rm = float_round_down;
+        break;
+    case 0xc0:
+        return old_rm;
+    }
+    set_float_rounding_mode(rm, &FP_STATUS);
+    return old_rm;
+}
+
+/* If the floating-point qualifiers specified extra exception processing
+   (i.e. /u or /su), zero the exception flags so that we can determine if
+   the current instruction raises any exceptions.  Save the old accrued
+   exception status so that we can restore them at the end of the insn.  */
+static inline uint32_t begin_quals_exception(uint32_t qual)
+{
+    uint32_t old_exc = 0;
+    if (qual & 0x500) {
+        old_exc = (uint32_t)FP_STATUS.float_exception_flags << 8;
+        set_float_exception_flags(0, &FP_STATUS);
+    }
+    return old_exc;
+}
+
+/* Begin processing an fp operation.  Return a token that should be passed
+   when completing the fp operation.  */
+static inline uint32_t begin_fp(uint32_t quals)
+{
+    uint32_t ret = 0;
+
+    ret |= begin_quals_roundmode(quals);
+    ret |= begin_quals_exception(quals);
+
+    return ret;
+}
+
+/* End processing an fp operation.  */
+static inline void end_fp(uint32_t quals, uint32_t orig)
+{
+    uint8_t exc = FP_STATUS.float_exception_flags;
+
+    set_float_exception_flags(exc | (orig >> 8), &FP_STATUS);
+    set_float_rounding_mode(orig & 0xff, &FP_STATUS);
+
+    /* TODO: check quals and exc and raise any exceptions needed.  */
+}
+
+/* Raise any exceptions needed for using F, given the insn qualifiers.  */
+static inline void float32_input(uint32_t quals, float32 f)
+{
+    /* If /s is used, no exceptions are raised immediately.  */
+    /* ??? This is for userspace only.  If we are emulating the real hw, then
+       we may well need to trap to the kernel for software emulation.  */
+    /* ??? Shouldn't we raise an exception for SNAN?  */
+    if (quals & 0x500)
+        return;
+    /* TODO: Check for inf, nan, denormal and trap.  */
+}
+
+static inline void float64_input(uint32_t quals, float64 f)
+{
+    /* TODO: Exactly like above, except for float64.  */
+}
+
+
 /* F floating (VAX) */
 static inline uint64_t float32_to_f(float32 fa)
 {
@@ -447,52 +526,86 @@  uint64_t helper_memory_to_f (uint32_t a)
     return r;
 }
 
-uint64_t helper_addf (uint64_t a, uint64_t b)
+uint64_t helper_addf (uint64_t a, uint64_t b, uint32_t quals)
 {
     float32 fa, fb, fr;
+    uint32_t token;
 
     fa = f_to_float32(a);
     fb = f_to_float32(b);
+
+    token = begin_fp(quals);
+    float32_input(quals, fa);
+    float32_input(quals, fb);
     fr = float32_add(fa, fb, &FP_STATUS);
+    end_fp(quals, token);
+
     return float32_to_f(fr);
 }
 
-uint64_t helper_subf (uint64_t a, uint64_t b)
+uint64_t helper_subf (uint64_t a, uint64_t b, uint32_t quals)
 {
     float32 fa, fb, fr;
+    uint32_t token;
 
     fa = f_to_float32(a);
     fb = f_to_float32(b);
+
+    token = begin_fp(quals);
+    float32_input(quals, fa);
+    float32_input(quals, fb);
     fr = float32_sub(fa, fb, &FP_STATUS);
+    end_fp(quals, token);
+
     return float32_to_f(fr);
 }
 
-uint64_t helper_mulf (uint64_t a, uint64_t b)
+uint64_t helper_mulf (uint64_t a, uint64_t b, uint32_t quals)
 {
     float32 fa, fb, fr;
+    uint32_t token;
 
     fa = f_to_float32(a);
     fb = f_to_float32(b);
+
+    token = begin_fp(quals);
+    float32_input(quals, fa);
+    float32_input(quals, fb);
     fr = float32_mul(fa, fb, &FP_STATUS);
+    end_fp(quals, token);
+
     return float32_to_f(fr);
 }
 
-uint64_t helper_divf (uint64_t a, uint64_t b)
+uint64_t helper_divf (uint64_t a, uint64_t b, uint32_t quals)
 {
     float32 fa, fb, fr;
+    uint32_t token;
 
     fa = f_to_float32(a);
     fb = f_to_float32(b);
+
+    token = begin_fp(quals);
+    float32_input(quals, fa);
+    float32_input(quals, fb);
     fr = float32_div(fa, fb, &FP_STATUS);
+    end_fp(quals, token);
+
     return float32_to_f(fr);
 }
 
-uint64_t helper_sqrtf (uint64_t t)
+uint64_t helper_sqrtf (uint64_t t, uint32_t quals)
 {
     float32 ft, fr;
+    uint32_t token;
 
     ft = f_to_float32(t);
+
+    token = begin_fp(quals);
+    float32_input(quals, ft);
     fr = float32_sqrt(ft, &FP_STATUS);
+    end_fp(quals, token);
+
     return float32_to_f(fr);
 }
 
@@ -574,52 +687,86 @@  uint64_t helper_memory_to_g (uint64_t a)
     return r;
 }
 
-uint64_t helper_addg (uint64_t a, uint64_t b)
+uint64_t helper_addg (uint64_t a, uint64_t b, uint32_t quals)
 {
     float64 fa, fb, fr;
+    uint32_t token;
 
     fa = g_to_float64(a);
     fb = g_to_float64(b);
+
+    token = begin_fp(quals);
+    float64_input(quals, fa);
+    float64_input(quals, fb);
     fr = float64_add(fa, fb, &FP_STATUS);
+    end_fp(quals, token);
+
     return float64_to_g(fr);
 }
 
-uint64_t helper_subg (uint64_t a, uint64_t b)
+uint64_t helper_subg (uint64_t a, uint64_t b, uint32_t quals)
 {
     float64 fa, fb, fr;
+    uint32_t token;
 
     fa = g_to_float64(a);
     fb = g_to_float64(b);
+
+    token = begin_fp(quals);
+    float64_input(quals, fa);
+    float64_input(quals, fb);
     fr = float64_sub(fa, fb, &FP_STATUS);
+    end_fp(quals, token);
+
     return float64_to_g(fr);
 }
 
-uint64_t helper_mulg (uint64_t a, uint64_t b)
+uint64_t helper_mulg (uint64_t a, uint64_t b, uint32_t quals)
 {
     float64 fa, fb, fr;
-
+    uint32_t token;
+    
     fa = g_to_float64(a);
     fb = g_to_float64(b);
+
+    token = begin_fp(quals);
+    float64_input(quals, fa);
+    float64_input(quals, fb);
     fr = float64_mul(fa, fb, &FP_STATUS);
+    end_fp(quals, token);
+
     return float64_to_g(fr);
 }
 
-uint64_t helper_divg (uint64_t a, uint64_t b)
+uint64_t helper_divg (uint64_t a, uint64_t b, uint32_t quals)
 {
     float64 fa, fb, fr;
+    uint32_t token;
 
     fa = g_to_float64(a);
     fb = g_to_float64(b);
+
+    token = begin_fp(quals);
+    float64_input(quals, fa);
+    float64_input(quals, fb);
     fr = float64_div(fa, fb, &FP_STATUS);
+    end_fp(quals, token);
+
     return float64_to_g(fr);
 }
 
-uint64_t helper_sqrtg (uint64_t a)
+uint64_t helper_sqrtg (uint64_t a, uint32_t quals)
 {
     float64 fa, fr;
+    uint32_t token;
 
     fa = g_to_float64(a);
+
+    token = begin_fp(quals);
+    float64_input(quals, fa);
     fr = float64_sqrt(fa, &FP_STATUS);
+    end_fp(quals, token);
+
     return float64_to_g(fr);
 }
 
@@ -678,52 +825,86 @@  uint64_t helper_memory_to_s (uint32_t a)
     return float32_to_s_int(a);
 }
 
-uint64_t helper_adds (uint64_t a, uint64_t b)
+uint64_t helper_adds (uint64_t a, uint64_t b, uint32_t quals)
 {
     float32 fa, fb, fr;
+    uint32_t token;
 
     fa = s_to_float32(a);
     fb = s_to_float32(b);
+
+    token = begin_fp(quals);
+    float32_input(quals, fa);
+    float32_input(quals, fb);
     fr = float32_add(fa, fb, &FP_STATUS);
+    end_fp(quals, token);
+
     return float32_to_s(fr);
 }
 
-uint64_t helper_subs (uint64_t a, uint64_t b)
+uint64_t helper_subs (uint64_t a, uint64_t b, uint32_t quals)
 {
     float32 fa, fb, fr;
+    uint32_t token;
 
     fa = s_to_float32(a);
     fb = s_to_float32(b);
+
+    token = begin_fp(quals);
+    float32_input(quals, fa);
+    float32_input(quals, fb);
     fr = float32_sub(fa, fb, &FP_STATUS);
+    end_fp(quals, token);
+
     return float32_to_s(fr);
 }
 
-uint64_t helper_muls (uint64_t a, uint64_t b)
+uint64_t helper_muls (uint64_t a, uint64_t b, uint32_t quals)
 {
     float32 fa, fb, fr;
+    uint32_t token;
 
     fa = s_to_float32(a);
     fb = s_to_float32(b);
+
+    token = begin_fp(quals);
+    float32_input(quals, fa);
+    float32_input(quals, fb);
     fr = float32_mul(fa, fb, &FP_STATUS);
+    end_fp(quals, token);
+
     return float32_to_s(fr);
 }
 
-uint64_t helper_divs (uint64_t a, uint64_t b)
+uint64_t helper_divs (uint64_t a, uint64_t b, uint32_t quals)
 {
     float32 fa, fb, fr;
+    uint32_t token;
 
     fa = s_to_float32(a);
     fb = s_to_float32(b);
+
+    token = begin_fp(quals);
+    float32_input(quals, fa);
+    float32_input(quals, fb);
     fr = float32_div(fa, fb, &FP_STATUS);
+    end_fp(quals, token);
+
     return float32_to_s(fr);
 }
 
-uint64_t helper_sqrts (uint64_t a)
+uint64_t helper_sqrts (uint64_t a, uint32_t quals)
 {
     float32 fa, fr;
+    uint32_t token;
 
     fa = s_to_float32(a);
+
+    token = begin_fp(quals);
+    float32_input(quals, fa);
     fr = float32_sqrt(fa, &FP_STATUS);
+    end_fp(quals, token);
+
     return float32_to_s(fr);
 }
 
@@ -745,52 +926,86 @@  static inline uint64_t float64_to_t(float64 fa)
     return r.ll;
 }
 
-uint64_t helper_addt (uint64_t a, uint64_t b)
+uint64_t helper_addt (uint64_t a, uint64_t b, uint32_t quals)
 {
     float64 fa, fb, fr;
+    uint32_t token;
 
     fa = t_to_float64(a);
     fb = t_to_float64(b);
+
+    token = begin_fp(quals);
+    float64_input(quals, fa);
+    float64_input(quals, fb);
     fr = float64_add(fa, fb, &FP_STATUS);
+    end_fp(quals, token);
+
     return float64_to_t(fr);
 }
 
-uint64_t helper_subt (uint64_t a, uint64_t b)
+uint64_t helper_subt (uint64_t a, uint64_t b, uint32_t quals)
 {
     float64 fa, fb, fr;
+    uint32_t token;
 
     fa = t_to_float64(a);
     fb = t_to_float64(b);
+
+    token = begin_fp(quals);
+    float64_input(quals, fa);
+    float64_input(quals, fb);
     fr = float64_sub(fa, fb, &FP_STATUS);
+    end_fp(quals, token);
+    
     return float64_to_t(fr);
 }
 
-uint64_t helper_mult (uint64_t a, uint64_t b)
+uint64_t helper_mult (uint64_t a, uint64_t b, uint32_t quals)
 {
     float64 fa, fb, fr;
+    uint32_t token;
 
     fa = t_to_float64(a);
     fb = t_to_float64(b);
+
+    token = begin_fp(quals);
+    float64_input(quals, fa);
+    float64_input(quals, fb);
     fr = float64_mul(fa, fb, &FP_STATUS);
+    end_fp(quals, token);
+
     return float64_to_t(fr);
 }
 
-uint64_t helper_divt (uint64_t a, uint64_t b)
+uint64_t helper_divt (uint64_t a, uint64_t b, uint32_t quals)
 {
     float64 fa, fb, fr;
+    uint32_t token;
 
     fa = t_to_float64(a);
     fb = t_to_float64(b);
+
+    token = begin_fp(quals);
+    float64_input(quals, fa);
+    float64_input(quals, fb);
     fr = float64_div(fa, fb, &FP_STATUS);
+    end_fp(quals, token);
+
     return float64_to_t(fr);
 }
 
-uint64_t helper_sqrtt (uint64_t a)
+uint64_t helper_sqrtt (uint64_t a, uint32_t quals)
 {
     float64 fa, fr;
+    uint32_t token;
 
     fa = t_to_float64(a);
+
+    token = begin_fp(quals);
+    float64_input(quals, fa);
     fr = float64_sqrt(fa, &FP_STATUS);
+    end_fp(quals, token);
+
     return float64_to_t(fr);
 }
 
@@ -905,70 +1120,131 @@  uint64_t helper_cmpglt(uint64_t a, uint64_t b)
 }
 
 /* Floating point format conversion */
-uint64_t helper_cvtts (uint64_t a)
+uint64_t helper_cvtts (uint64_t a, uint32_t quals)
 {
     float64 fa;
     float32 fr;
+    uint32_t token;
 
     fa = t_to_float64(a);
+
+    token = begin_fp(quals);
+    float64_input(quals, fa);
     fr = float64_to_float32(fa, &FP_STATUS);
+    end_fp(quals, token);
+
     return float32_to_s(fr);
 }
 
-uint64_t helper_cvtst (uint64_t a)
+uint64_t helper_cvtst (uint64_t a, uint32_t quals)
 {
     float32 fa;
     float64 fr;
+    uint32_t token;
 
     fa = s_to_float32(a);
+
+    token = begin_fp(quals);
+    float32_input(quals, fa);
     fr = float32_to_float64(fa, &FP_STATUS);
+    end_fp(quals, token);
+
     return float64_to_t(fr);
 }
 
-uint64_t helper_cvtqs (uint64_t a)
+uint64_t helper_cvtqs (uint64_t a, uint32_t quals)
 {
-    float32 fr = int64_to_float32(a, &FP_STATUS);
+    float32 fr;
+    uint32_t token;
+
+    token = begin_fp(quals);
+    fr = int64_to_float32(a, &FP_STATUS);
+    end_fp(quals, token);
+
     return float32_to_s(fr);
 }
 
-uint64_t helper_cvttq (uint64_t a)
+uint64_t helper_cvttq (uint64_t a, uint32_t quals)
 {
-    float64 fa = t_to_float64(a);
-    return float64_to_int64_round_to_zero(fa, &FP_STATUS);
+    float64 fa;
+    uint64_t ret;
+    uint32_t token;
+
+    fa = t_to_float64(a);
+
+    token = begin_fp(quals);
+    float64_input(quals, fa);
+    ret = float64_to_int64(fa, &FP_STATUS);
+    end_fp(quals, token);
+
+    return ret;
 }
 
-uint64_t helper_cvtqt (uint64_t a)
+uint64_t helper_cvtqt (uint64_t a, uint32_t quals)
 {
-    float64 fr = int64_to_float64(a, &FP_STATUS);
+    float64 fr;
+    uint32_t token;
+
+    token = begin_fp(quals);
+    fr = int64_to_float64(a, &FP_STATUS);
+    end_fp(quals, token);
+
     return float64_to_t(fr);
 }
 
-uint64_t helper_cvtqf (uint64_t a)
+uint64_t helper_cvtqf (uint64_t a, uint32_t quals)
 {
-    float32 fr = int64_to_float32(a, &FP_STATUS);
+    float32 fr;
+    uint32_t token;
+
+    token = begin_fp(quals);
+    fr = int64_to_float32(a, &FP_STATUS);
+    end_fp(quals, token);
+
     return float32_to_f(fr);
 }
 
-uint64_t helper_cvtgf (uint64_t a)
+uint64_t helper_cvtgf (uint64_t a, uint32_t quals)
 {
     float64 fa;
     float32 fr;
+    uint32_t token;
 
     fa = g_to_float64(a);
+
+    token = begin_fp(quals);
+    float64_input(quals, fa);
     fr = float64_to_float32(fa, &FP_STATUS);
+    end_fp(quals, token);
+
     return float32_to_f(fr);
 }
 
-uint64_t helper_cvtgq (uint64_t a)
+uint64_t helper_cvtgq (uint64_t a, uint32_t quals)
 {
-    float64 fa = g_to_float64(a);
-    return float64_to_int64_round_to_zero(fa, &FP_STATUS);
+    float64 fa;
+    uint64_t ret;
+    uint32_t token;
+
+    fa = g_to_float64(a);
+
+    token = begin_fp(quals);
+    float64_input(quals, fa);
+    ret = float64_to_int64(fa, &FP_STATUS);
+    end_fp(quals, token);
+
+    return ret;
 }
 
-uint64_t helper_cvtqg (uint64_t a)
+uint64_t helper_cvtqg (uint64_t a, uint32_t quals)
 {
     float64 fr;
+    uint32_t token;
+
+    token = begin_fp(quals);
     fr = int64_to_float64(a, &FP_STATUS);
+    end_fp(quals, token);
+
     return float64_to_g(fr);
 }
 
diff --git a/target-alpha/translate.c b/target-alpha/translate.c
index be7e7bf..7b6ff2a 100644
--- a/target-alpha/translate.c
+++ b/target-alpha/translate.c
@@ -442,81 +442,79 @@  static void gen_fcmov(TCGCond inv_cond, int ra, int rb, int rc)
     gen_set_label(l1);
 }
 
-#define FARITH2(name)                                       \
-static inline void glue(gen_f, name)(int rb, int rc)        \
-{                                                           \
-    if (unlikely(rc == 31))                                 \
-      return;                                               \
-                                                            \
-    if (rb != 31)                                           \
-        gen_helper_ ## name (cpu_fir[rc], cpu_fir[rb]);    \
-    else {                                                  \
-        TCGv tmp = tcg_const_i64(0);                        \
-        gen_helper_ ## name (cpu_fir[rc], tmp);            \
-        tcg_temp_free(tmp);                                 \
-    }                                                       \
+#define FARITH2(name)                                   \
+static inline void glue(gen_f, name)(int rb, int rc)    \
+{                                                       \
+    if (unlikely(rc == 31))                             \
+      return;                                           \
+                                                        \
+    if (rb != 31)                                       \
+        gen_helper_ ## name (cpu_fir[rc], cpu_fir[rb]); \
+    else {                                              \
+        TCGv tmp = tcg_const_i64(0);                    \
+        gen_helper_ ## name (cpu_fir[rc], tmp);         \
+        tcg_temp_free(tmp);                             \
+    }                                                   \
 }
-FARITH2(sqrts)
-FARITH2(sqrtf)
-FARITH2(sqrtg)
-FARITH2(sqrtt)
-FARITH2(cvtgf)
-FARITH2(cvtgq)
-FARITH2(cvtqf)
-FARITH2(cvtqg)
-FARITH2(cvtst)
-FARITH2(cvtts)
-FARITH2(cvttq)
-FARITH2(cvtqs)
-FARITH2(cvtqt)
 FARITH2(cvtlq)
 FARITH2(cvtql)
 FARITH2(cvtqlv)
 FARITH2(cvtqlsv)
 
-#define FARITH3(name)                                                     \
-static inline void glue(gen_f, name)(int ra, int rb, int rc)              \
-{                                                                         \
-    if (unlikely(rc == 31))                                               \
-        return;                                                           \
-                                                                          \
-    if (ra != 31) {                                                       \
-        if (rb != 31)                                                     \
-            gen_helper_ ## name (cpu_fir[rc], cpu_fir[ra], cpu_fir[rb]);  \
-        else {                                                            \
-            TCGv tmp = tcg_const_i64(0);                                  \
-            gen_helper_ ## name (cpu_fir[rc], cpu_fir[ra], tmp);          \
-            tcg_temp_free(tmp);                                           \
-        }                                                                 \
-    } else {                                                              \
-        TCGv tmp = tcg_const_i64(0);                                      \
-        if (rb != 31)                                                     \
-            gen_helper_ ## name (cpu_fir[rc], tmp, cpu_fir[rb]);          \
-        else                                                              \
-            gen_helper_ ## name (cpu_fir[rc], tmp, tmp);                   \
-        tcg_temp_free(tmp);                                               \
-    }                                                                     \
+#define QFARITH2(name)                                          \
+static inline void glue(gen_f, name)(int rb, int rc, int opc)   \
+{                                                               \
+    TCGv_i32 quals;                                             \
+    if (unlikely(rc == 31))                                     \
+      return;                                                   \
+    quals = tcg_const_i32(opc & ~0x3f);                         \
+    if (rb != 31)                                               \
+        gen_helper_ ## name (cpu_fir[rc], cpu_fir[rb], quals);  \
+    else {                                                      \
+        TCGv tmp = tcg_const_i64(0);                            \
+        gen_helper_ ## name (cpu_fir[rc], tmp, quals);          \
+        tcg_temp_free(tmp);                                     \
+    }                                                           \
+    tcg_temp_free_i32(quals);                                   \
+}
+QFARITH2(sqrts)
+QFARITH2(sqrtf)
+QFARITH2(sqrtg)
+QFARITH2(sqrtt)
+QFARITH2(cvtgf)
+QFARITH2(cvtgq)
+QFARITH2(cvtqf)
+QFARITH2(cvtqg)
+QFARITH2(cvtst)
+QFARITH2(cvtts)
+QFARITH2(cvttq)
+QFARITH2(cvtqs)
+QFARITH2(cvtqt)
+
+#define FARITH3(name)                                           \
+static inline void glue(gen_f, name)(int ra, int rb, int rc)    \
+{                                                               \
+    TCGv zero, ta, tb;                                          \
+    if (unlikely(rc == 31))                                     \
+        return;                                                 \
+    ta = cpu_fir[ra];                                           \
+    tb = cpu_fir[rb];                                           \
+    if (unlikely(ra == 31)) {                                   \
+        zero = tcg_const_i64(0);                                \
+        ta = zero;                                              \
+    }                                                           \
+    if (unlikely(rb == 31)) {                                   \
+        if (ra != 31)                                           \
+            zero = tcg_const_i64(0);                            \
+        tb = zero;                                              \
+    }                                                           \
+    gen_helper_ ## name (cpu_fir[rc], ta, tb);                  \
+    if (ra == 31 || rb == 31)                                   \
+        tcg_temp_free(zero);                                    \
 }
-
-FARITH3(addf)
-FARITH3(subf)
-FARITH3(mulf)
-FARITH3(divf)
-FARITH3(addg)
-FARITH3(subg)
-FARITH3(mulg)
-FARITH3(divg)
 FARITH3(cmpgeq)
 FARITH3(cmpglt)
 FARITH3(cmpgle)
-FARITH3(adds)
-FARITH3(subs)
-FARITH3(muls)
-FARITH3(divs)
-FARITH3(addt)
-FARITH3(subt)
-FARITH3(mult)
-FARITH3(divt)
 FARITH3(cmptun)
 FARITH3(cmpteq)
 FARITH3(cmptlt)
@@ -525,6 +523,47 @@  FARITH3(cpys)
 FARITH3(cpysn)
 FARITH3(cpyse)
 
+#define QFARITH3(name)                                                  \
+static inline void glue(gen_f, name)(int ra, int rb, int rc, int opc)   \
+{                                                                       \
+    TCGv zero, ta, tb;                                                  \
+    TCGv_i32 quals;                                                     \
+    if (unlikely(rc == 31))                                             \
+        return;                                                         \
+    ta = cpu_fir[ra];                                                   \
+    tb = cpu_fir[rb];                                                   \
+    if (unlikely(ra == 31)) {                                           \
+        zero = tcg_const_i64(0);                                        \
+        ta = zero;                                                      \
+    }                                                                   \
+    if (unlikely(rb == 31)) {                                           \
+        if (ra != 31)                                                   \
+            zero = tcg_const_i64(0);                                    \
+        tb = zero;                                                      \
+    }                                                                   \
+    quals = tcg_const_i32(opc & ~0x3f);                                 \
+    gen_helper_ ## name (cpu_fir[rc], ta, tb, quals);                   \
+    tcg_temp_free_i32(quals);                                           \
+    if (ra == 31 || rb == 31)                                           \
+        tcg_temp_free(zero);                                            \
+}
+QFARITH3(addf)
+QFARITH3(subf)
+QFARITH3(mulf)
+QFARITH3(divf)
+QFARITH3(addg)
+QFARITH3(subg)
+QFARITH3(mulg)
+QFARITH3(divg)
+QFARITH3(adds)
+QFARITH3(subs)
+QFARITH3(muls)
+QFARITH3(divs)
+QFARITH3(addt)
+QFARITH3(subt)
+QFARITH3(mult)
+QFARITH3(divt)
+
 static inline uint64_t zapnot_mask(uint8_t lit)
 {
     uint64_t mask = 0;
@@ -1607,7 +1646,7 @@  static inline int translate_one(DisasContext *ctx, uint32_t insn)
         }
         break;
     case 0x14:
-        switch (fpfn) { /* f11 & 0x3F */
+        switch (fpfn) { /* fn11 & 0x3F */
         case 0x04:
             /* ITOFS */
             if (!(ctx->amask & AMASK_FIX))
@@ -1626,13 +1665,13 @@  static inline int translate_one(DisasContext *ctx, uint32_t insn)
             /* SQRTF */
             if (!(ctx->amask & AMASK_FIX))
                 goto invalid_opc;
-            gen_fsqrtf(rb, rc);
+            gen_fsqrtf(rb, rc, fn11);
             break;
         case 0x0B:
             /* SQRTS */
             if (!(ctx->amask & AMASK_FIX))
                 goto invalid_opc;
-            gen_fsqrts(rb, rc);
+            gen_fsqrts(rb, rc, fn11);
             break;
         case 0x14:
             /* ITOFF */
@@ -1663,13 +1702,13 @@  static inline int translate_one(DisasContext *ctx, uint32_t insn)
             /* SQRTG */
             if (!(ctx->amask & AMASK_FIX))
                 goto invalid_opc;
-            gen_fsqrtg(rb, rc);
+            gen_fsqrtg(rb, rc, fn11);
             break;
         case 0x02B:
             /* SQRTT */
             if (!(ctx->amask & AMASK_FIX))
                 goto invalid_opc;
-            gen_fsqrtt(rb, rc);
+            gen_fsqrtt(rb, rc, fn11);
             break;
         default:
             goto invalid_opc;
@@ -1677,47 +1716,42 @@  static inline int translate_one(DisasContext *ctx, uint32_t insn)
         break;
     case 0x15:
         /* VAX floating point */
-        /* XXX: rounding mode and trap are ignored (!) */
-        switch (fpfn) { /* f11 & 0x3F */
+        switch (fpfn) { /* fn11 & 0x3F */
         case 0x00:
             /* ADDF */
-            gen_faddf(ra, rb, rc);
+            gen_faddf(ra, rb, rc, fn11);
             break;
         case 0x01:
             /* SUBF */
-            gen_fsubf(ra, rb, rc);
+            gen_fsubf(ra, rb, rc, fn11);
             break;
         case 0x02:
             /* MULF */
-            gen_fmulf(ra, rb, rc);
+            gen_fmulf(ra, rb, rc, fn11);
             break;
         case 0x03:
             /* DIVF */
-            gen_fdivf(ra, rb, rc);
+            gen_fdivf(ra, rb, rc, fn11);
             break;
         case 0x1E:
             /* CVTDG */
-#if 0 // TODO
-            gen_fcvtdg(rb, rc);
-#else
+            /* TODO */
             goto invalid_opc;
-#endif
-            break;
         case 0x20:
             /* ADDG */
-            gen_faddg(ra, rb, rc);
+            gen_faddg(ra, rb, rc, fn11);
             break;
         case 0x21:
             /* SUBG */
-            gen_fsubg(ra, rb, rc);
+            gen_fsubg(ra, rb, rc, fn11);
             break;
         case 0x22:
             /* MULG */
-            gen_fmulg(ra, rb, rc);
+            gen_fmulg(ra, rb, rc, fn11);
             break;
         case 0x23:
             /* DIVG */
-            gen_fdivg(ra, rb, rc);
+            gen_fdivg(ra, rb, rc, fn11);
             break;
         case 0x25:
             /* CMPGEQ */
@@ -1733,27 +1767,23 @@  static inline int translate_one(DisasContext *ctx, uint32_t insn)
             break;
         case 0x2C:
             /* CVTGF */
-            gen_fcvtgf(rb, rc);
+            gen_fcvtgf(rb, rc, fn11);
             break;
         case 0x2D:
             /* CVTGD */
-#if 0 // TODO
-            gen_fcvtgd(rb, rc);
-#else
+            /* TODO */
             goto invalid_opc;
-#endif
-            break;
         case 0x2F:
             /* CVTGQ */
-            gen_fcvtgq(rb, rc);
+            gen_fcvtgq(rb, rc, fn11);
             break;
         case 0x3C:
             /* CVTQF */
-            gen_fcvtqf(rb, rc);
+            gen_fcvtqf(rb, rc, fn11);
             break;
         case 0x3E:
             /* CVTQG */
-            gen_fcvtqg(rb, rc);
+            gen_fcvtqg(rb, rc, fn11);
             break;
         default:
             goto invalid_opc;
@@ -1761,39 +1791,38 @@  static inline int translate_one(DisasContext *ctx, uint32_t insn)
         break;
     case 0x16:
         /* IEEE floating-point */
-        /* XXX: rounding mode and traps are ignored (!) */
-        switch (fpfn) { /* f11 & 0x3F */
+        switch (fpfn) { /* fn11 & 0x3F */
         case 0x00:
             /* ADDS */
-            gen_fadds(ra, rb, rc);
+            gen_fadds(ra, rb, rc, fn11);
             break;
         case 0x01:
             /* SUBS */
-            gen_fsubs(ra, rb, rc);
+            gen_fsubs(ra, rb, rc, fn11);
             break;
         case 0x02:
             /* MULS */
-            gen_fmuls(ra, rb, rc);
+            gen_fmuls(ra, rb, rc, fn11);
             break;
         case 0x03:
             /* DIVS */
-            gen_fdivs(ra, rb, rc);
+            gen_fdivs(ra, rb, rc, fn11);
             break;
         case 0x20:
             /* ADDT */
-            gen_faddt(ra, rb, rc);
+            gen_faddt(ra, rb, rc, fn11);
             break;
         case 0x21:
             /* SUBT */
-            gen_fsubt(ra, rb, rc);
+            gen_fsubt(ra, rb, rc, fn11);
             break;
         case 0x22:
             /* MULT */
-            gen_fmult(ra, rb, rc);
+            gen_fmult(ra, rb, rc, fn11);
             break;
         case 0x23:
             /* DIVT */
-            gen_fdivt(ra, rb, rc);
+            gen_fdivt(ra, rb, rc, fn11);
             break;
         case 0x24:
             /* CMPTUN */
@@ -1812,26 +1841,25 @@  static inline int translate_one(DisasContext *ctx, uint32_t insn)
             gen_fcmptle(ra, rb, rc);
             break;
         case 0x2C:
-            /* XXX: incorrect */
             if (fn11 == 0x2AC || fn11 == 0x6AC) {
                 /* CVTST */
-                gen_fcvtst(rb, rc);
+                gen_fcvtst(rb, rc, fn11);
             } else {
                 /* CVTTS */
-                gen_fcvtts(rb, rc);
+                gen_fcvtts(rb, rc, fn11);
             }
             break;
         case 0x2F:
             /* CVTTQ */
-            gen_fcvttq(rb, rc);
+            gen_fcvttq(rb, rc, fn11);
             break;
         case 0x3C:
             /* CVTQS */
-            gen_fcvtqs(rb, rc);
+            gen_fcvtqs(rb, rc, fn11);
             break;
         case 0x3E:
             /* CVTQT */
-            gen_fcvtqt(rb, rc);
+            gen_fcvtqt(rb, rc, fn11);
             break;
         default:
             goto invalid_opc;