diff --git a/fpu/softfloat.c b/fpu/softfloat.c
index b29256a..9c6e4a3 100644
--- a/fpu/softfloat.c
+++ b/fpu/softfloat.c
@@ -5234,6 +5234,16 @@ int floatx80_unordered_quiet( floatx80 a, floatx80 b STATUS_PARAM )
 }
 
 /*----------------------------------------------------------------------------
+| Returns 1 if the extended double-precision floating-point value `a' is an
+| unsupported; otherwise returns 0.
+*----------------------------------------------------------------------------*/
+int floatx80_is_unsupported(floatx80 a)
+{
+    return (extractFloatx80Exp(a) &&
+            !(extractFloatx80Frac(a) & LIT64(0x8000000000000000)));
+}
+
+/*----------------------------------------------------------------------------
 | Returns the result of converting the quadruple-precision floating-point
 | value `a' to the 32-bit two's complement integer format.  The conversion
 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
@@ -6828,6 +6838,178 @@ floatx80 floatx80_scalbn( floatx80 a, int n STATUS_PARAM )
                                           aSign, aExp, aSig, 0 STATUS_VAR );
 }
 
+/*
+ * BEGIN from Bochs rev 11224 dated 2012-06-21 10:33:37 -0700
+ *
+ * Converted to use Qemu type aliases, use C features only, etc.
+ */
+
+/* executes single exponent reduction cycle */
+static uint64 remainder_kernel(uint64 aSig0, uint64 bSig, int expDiff, uint64 *zSig0, uint64 *zSig1)
+{
+    uint64 term0, term1;
+    uint64 aSig1 = 0;
+
+    shortShift128Left(aSig1, aSig0, expDiff, &aSig1, &aSig0);
+    uint64 q = estimateDiv128To64(aSig1, aSig0, bSig);
+    mul64To128(bSig, q, &term0, &term1);
+    sub128(aSig1, aSig0, term0, term1, zSig1, zSig0);
+    while ((int64)(*zSig1) < 0) {
+        --q;
+        add128(*zSig1, *zSig0, 0, bSig, zSig1, zSig0);
+    }
+    return q;
+}
+
+static int do_fprem(floatx80 a, floatx80 b, floatx80 *r, uint64 *q, int rounding_mode STATUS_PARAM )
+{
+    int32 aExp, bExp, zExp, expDiff;
+    uint64 aSig0, aSig1, bSig;
+    int aSign;
+    *q = 0;
+
+    // handle unsupported extended double-precision floating encodings
+    if (floatx80_is_unsupported(a) || floatx80_is_unsupported(b))
+    {
+        float_raise(float_flag_invalid, status);
+        *r = floatx80_default_nan;
+        return -1;
+    }
+
+    aSig0 = extractFloatx80Frac(a);
+    aExp = extractFloatx80Exp(a);
+    aSign = extractFloatx80Sign(a);
+    bSig = extractFloatx80Frac(b);
+    bExp = extractFloatx80Exp(b);
+
+    if (aExp == 0x7FFF) {
+        if ((uint64) (aSig0<<1) || ((bExp == 0x7FFF) && (uint64) (bSig<<1))) {
+            *r = propagateFloatx80NaN(a, b, status);
+            return -1;
+        }
+        float_raise(float_flag_invalid, status);
+        *r = floatx80_default_nan;
+        return -1;
+    }
+    if (bExp == 0x7FFF) {
+        if ((uint64) (bSig<<1)) {
+            *r = propagateFloatx80NaN(a, b, status);
+            return -1;
+        }
+        if (aExp == 0 && aSig0) {
+            float_raise(float_flag_input_denormal, status);
+            normalizeFloatx80Subnormal(aSig0, &aExp, &aSig0);
+            *r = (extractFloatx80Frac(a) & LIT64(0x8000000000000000)) ?
+                    packFloatx80(aSign, aExp, aSig0) : a;
+            return 0;
+        }
+        *r = a;
+        return 0;
+
+    }
+    if (bExp == 0) {
+        if (bSig == 0) {
+            float_raise(float_flag_invalid, status);
+            *r = floatx80_default_nan;
+            return -1;
+        }
+        float_raise(float_flag_input_denormal, status);
+        normalizeFloatx80Subnormal(bSig, &bExp, &bSig);
+    }
+    if (aExp == 0) {
+        if (aSig0 == 0) {
+            *r = a;
+            return 0;
+        }
+        float_raise(float_flag_input_denormal, status);
+        normalizeFloatx80Subnormal(aSig0, &aExp, &aSig0);
+    }
+    expDiff = aExp - bExp;
+    aSig1 = 0;
+
+    uint32 overflow = 0;
+
+    if (expDiff >= 64) {
+        int n = (expDiff & 0x1f) | 0x20;
+        remainder_kernel(aSig0, bSig, n, &aSig0, &aSig1);
+        zExp = aExp - n;
+        overflow = 1;
+    }
+    else {
+        zExp = bExp;
+
+        if (expDiff < 0) {
+            if (expDiff < -1) {
+               *r = (extractFloatx80Frac(a) & LIT64(0x8000000000000000)) ?
+                    packFloatx80(aSign, aExp, aSig0) : a;
+               return 0;
+            }
+            shift128Right(aSig0, 0, 1, &aSig0, &aSig1);
+            expDiff = 0;
+        }
+
+        if (expDiff > 0) {
+            *q = remainder_kernel(aSig0, bSig, expDiff, &aSig0, &aSig1);
+        }
+        else {
+            if (bSig <= aSig0) {
+               aSig0 -= bSig;
+               *q = 1;
+            }
+        }
+
+        if (rounding_mode == float_round_nearest_even)
+        {
+            uint64 term0, term1;
+            shift128Right(bSig, 0, 1, &term0, &term1);
+
+            if (! lt128(aSig0, aSig1, term0, term1))
+            {
+               int lt = lt128(term0, term1, aSig0, aSig1);
+               int eq = eq128(aSig0, aSig1, term0, term1);
+
+               if ((eq && (*q & 1)) || lt) {
+                  aSign = !aSign;
+                  ++*q;
+               }
+               if (lt) sub128(bSig, 0, aSig0, aSig1, &aSig0, &aSig1);
+            }
+        }
+    }
+
+    *r = normalizeRoundAndPackFloatx80(80, aSign, zExp, aSig0, aSig1, status);
+    return overflow;
+}
+
+/*----------------------------------------------------------------------------
+| Returns the remainder of the extended double-precision floating-point value
+| `a' with respect to the corresponding value `b'.  The operation is performed
+| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
+*----------------------------------------------------------------------------*/
+
+int floatx80_ieee754_remainder(floatx80 a, floatx80 b, floatx80 *r, uint64 *q STATUS_PARAM )
+{
+    return do_fprem(a, b, r, q, float_round_nearest_even STATUS_VAR );
+}
+
+/*----------------------------------------------------------------------------
+| Returns the remainder of the extended double-precision floating-point value
+| `a' with  respect to  the corresponding value `b'. Unlike previous function
+| the  function  does not compute  the remainder  specified  in  the IEC/IEEE
+| Standard  for Binary  Floating-Point  Arithmetic.  This  function  operates
+| differently  from the  previous  function in  the way  that it  rounds  the
+| quotient of 'a' divided by 'b' to an integer.
+*----------------------------------------------------------------------------*/
+
+int floatx80_remainder(floatx80 a, floatx80 b, floatx80 *r, uint64 *q STATUS_PARAM )
+{
+    return do_fprem(a, b, r, q, float_round_to_zero STATUS_VAR );
+}
+
+/*
+ * END from Bochs rev 11224 dated 2012-06-21 10:33:37 -0700
+ */
+
 float128 float128_scalbn( float128 a, int n STATUS_PARAM )
 {
     flag aSign;
diff --git a/fpu/softfloat.h b/fpu/softfloat.h
index feec3a1..53d7827 100644
--- a/fpu/softfloat.h
+++ b/fpu/softfloat.h
@@ -497,10 +497,13 @@ int floatx80_lt_quiet( floatx80, floatx80 STATUS_PARAM );
 int floatx80_unordered_quiet( floatx80, floatx80 STATUS_PARAM );
 int floatx80_compare( floatx80, floatx80 STATUS_PARAM );
 int floatx80_compare_quiet( floatx80, floatx80 STATUS_PARAM );
+int floatx80_is_unsupported( floatx80 );
 int floatx80_is_quiet_nan( floatx80 );
 int floatx80_is_signaling_nan( floatx80 );
 floatx80 floatx80_maybe_silence_nan( floatx80 );
 floatx80 floatx80_scalbn( floatx80, int STATUS_PARAM );
+int floatx80_ieee754_remainder( floatx80, floatx80, floatx80 *, uint64 * STATUS_PARAM );
+int floatx80_remainder( floatx80, floatx80, floatx80 *, uint64 * STATUS_PARAM );
 
 INLINE floatx80 floatx80_abs(floatx80 a)
 {
diff --git a/target-i386/op_helper.c b/target-i386/op_helper.c
index 2862ea4..ae0c877 100644
--- a/target-i386/op_helper.c
+++ b/target-i386/op_helper.c
@@ -3620,6 +3620,23 @@ static void fpu_set_exception(int mask)
         env->fpus |= FPUS_SE | FPUS_B;
 }
 
+static int fpu_exception_from_fp_status(void)
+{
+    int mask;
+
+    const int float_flag_denormals = float_flag_input_denormal |
+                                     float_flag_output_denormal;
+
+    // Most flags have the same value in the enum and 387 status word, except
+    // the denormal flags.
+    mask = env->fp_status.float_exception_flags & ~float_flag_denormals;
+    if (env->fp_status.float_exception_flags & float_flag_denormals) {
+        mask |= FPUS_DE;
+    }
+
+    return mask;
+}
+
 static inline floatx80 helper_fdiv(floatx80 a, floatx80 b)
 {
     if (floatx80_is_zero(b)) {
@@ -4210,119 +4227,67 @@ void helper_fxtract(void)
 
 void helper_fprem1(void)
 {
-    double st0, st1, dblq, fpsrcop, fptemp;
-    CPU_LDoubleU fpsrcop1, fptemp1;
-    int expdif;
-    signed long long int q;
-
-    st0 = floatx80_to_double(ST0);
-    st1 = floatx80_to_double(ST1);
-
-    if (isinf(st0) || isnan(st0) || isnan(st1) || (st1 == 0.0)) {
-        ST0 = double_to_floatx80(0.0 / 0.0); /* NaN */
-        env->fpus &= (~0x4700); /* (C3,C2,C1,C0) <-- 0000 */
-        return;
-    }
-
-    fpsrcop = st0;
-    fptemp = st1;
-    fpsrcop1.d = ST0;
-    fptemp1.d = ST1;
-    expdif = EXPD(fpsrcop1) - EXPD(fptemp1);
-
-    if (expdif < 0) {
-        /* optimisation? taken from the AMD docs */
+    floatx80 result;
+    uint64_t quotient;
+
+    /* TODO(catalinp):
+     *
+     * - What is the defined purpose of fp_status in Qemu? Seems many things
+     *   write into it, but few if any translate into env->fpus.
+     *
+     * - Bochs' FPU_pre_exception_handling resets a few more fields in fp_status
+     *   before doing the operation. What is the purpose of that and is this
+     *   necessary here?
+     */
+    env->fp_status.float_exception_flags = 0;
+    int flags = floatx80_ieee754_remainder(ST0, ST1, &result, &quotient,
+                                           &env->fp_status);
+
+    /* TODO(catalinp): Bochs also checks for unmasked exceptions before storing
+     * these flags. Should we also do this?
+     */
+    if (flags >= 0) {
         env->fpus &= (~0x4700); /* (C3,C2,C1,C0) <-- 0000 */
-        /* ST0 is unchanged */
-        return;
+        if (flags) {
+            env->fpus |= (1 << 10);
+        } else {            /* (C0,C3,C1) <-- (q2,q1,q0) */
+            if (quotient & 1) env->fpus |= (1 << 9);
+            if (quotient & 2) env->fpus |= (1 << 14);
+            if (quotient & 4) env->fpus |= (1 << 8);
+        }
     }
 
-    if (expdif < 53) {
-        dblq = fpsrcop / fptemp;
-        /* round dblq towards nearest integer */
-        dblq = rint(dblq);
-        st0 = fpsrcop - fptemp * dblq;
-
-        /* convert dblq to q by truncating towards zero */
-        if (dblq < 0.0)
-           q = (signed long long int)(-dblq);
-        else
-           q = (signed long long int)dblq;
+    ST0 = result;
 
-        env->fpus &= (~0x4700); /* (C3,C2,C1,C0) <-- 0000 */
-                                /* (C0,C3,C1) <-- (q2,q1,q0) */
-        env->fpus |= (q & 0x4) << (8 - 2);  /* (C0) <-- q2 */
-        env->fpus |= (q & 0x2) << (14 - 1); /* (C3) <-- q1 */
-        env->fpus |= (q & 0x1) << (9 - 0);  /* (C1) <-- q0 */
-    } else {
-        env->fpus |= 0x400;  /* C2 <-- 1 */
-        fptemp = pow(2.0, expdif - 50);
-        fpsrcop = (st0 / st1) / fptemp;
-        /* fpsrcop = integer obtained by chopping */
-        fpsrcop = (fpsrcop < 0.0) ?
-                  -(floor(fabs(fpsrcop))) : floor(fpsrcop);
-        st0 -= (st1 * fpsrcop * fptemp);
-    }
-    ST0 = double_to_floatx80(st0);
+    /* TODO(catalinp): Set only unmasked exceptions? */
+    fpu_set_exception(fpu_exception_from_fp_status());
 }
 
 void helper_fprem(void)
 {
-    double st0, st1, dblq, fpsrcop, fptemp;
-    CPU_LDoubleU fpsrcop1, fptemp1;
-    int expdif;
-    signed long long int q;
-
-    st0 = floatx80_to_double(ST0);
-    st1 = floatx80_to_double(ST1);
+    floatx80 result;
+    uint64_t quotient;
 
-    if (isinf(st0) || isnan(st0) || isnan(st1) || (st1 == 0.0)) {
-       ST0 = double_to_floatx80(0.0 / 0.0); /* NaN */
-       env->fpus &= (~0x4700); /* (C3,C2,C1,C0) <-- 0000 */
-       return;
-    }
-
-    fpsrcop = st0;
-    fptemp = st1;
-    fpsrcop1.d = ST0;
-    fptemp1.d = ST1;
-    expdif = EXPD(fpsrcop1) - EXPD(fptemp1);
+    /* TODO(catalinp): See comments in helper_fprem1() about lots of potential
+     * semantic ambiguities/differences between this and Bochs' implementation.
+     */
+    env->fp_status.float_exception_flags = 0;
+    int flags = floatx80_remainder(ST0, ST1, &result, &quotient, &env->fp_status);
 
-    if (expdif < 0) {
-        /* optimisation? taken from the AMD docs */
+    if (flags >= 0) {
         env->fpus &= (~0x4700); /* (C3,C2,C1,C0) <-- 0000 */
-        /* ST0 is unchanged */
-        return;
+        if (flags) {
+            env->fpus |= (1 << 10);
+        } else {            /* (C0,C3,C1) <-- (q2,q1,q0) */
+            if (quotient & 1) env->fpus |= (1 << 9);
+            if (quotient & 2) env->fpus |= (1 << 14);
+            if (quotient & 4) env->fpus |= (1 << 8);
+        }
     }
 
-    if ( expdif < 53 ) {
-        dblq = fpsrcop/*ST0*/ / fptemp/*ST1*/;
-        /* round dblq towards zero */
-        dblq = (dblq < 0.0) ? ceil(dblq) : floor(dblq);
-        st0 = fpsrcop/*ST0*/ - fptemp * dblq;
+    ST0 = result;
 
-        /* convert dblq to q by truncating towards zero */
-        if (dblq < 0.0)
-           q = (signed long long int)(-dblq);
-        else
-           q = (signed long long int)dblq;
-
-        env->fpus &= (~0x4700); /* (C3,C2,C1,C0) <-- 0000 */
-                                /* (C0,C3,C1) <-- (q2,q1,q0) */
-        env->fpus |= (q & 0x4) << (8 - 2);  /* (C0) <-- q2 */
-        env->fpus |= (q & 0x2) << (14 - 1); /* (C3) <-- q1 */
-        env->fpus |= (q & 0x1) << (9 - 0);  /* (C1) <-- q0 */
-    } else {
-        int N = 32 + (expdif % 32); /* as per AMD docs */
-        env->fpus |= 0x400;  /* C2 <-- 1 */
-        fptemp = pow(2.0, (double)(expdif - N));
-        fpsrcop = (st0 / st1) / fptemp;
-        /* fpsrcop = integer obtained by chopping */
-        fpsrcop = (fpsrcop < 0.0) ?
-                  -(floor(fabs(fpsrcop))) : floor(fpsrcop);
-        st0 -= (st1 * fpsrcop * fptemp);
-    }
-    ST0 = double_to_floatx80(st0);
+    fpu_set_exception(fpu_exception_from_fp_status());
 }
 
 void helper_fyl2xp1(void)
