
[6/8] softfloat: Implement float128_muladd

Message ID 20200924012453.659757-7-richard.henderson@linaro.org
State New
Series softfloat: Implement float128_muladd

Commit Message

Richard Henderson Sept. 24, 2020, 1:24 a.m. UTC
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/fpu/softfloat.h |   2 +
 fpu/softfloat.c         | 356 +++++++++++++++++++++++++++++++++++++++-
 tests/fp/fp-test.c      |   2 +-
 tests/fp/wrap.c.inc     |  12 ++
 4 files changed, 370 insertions(+), 2 deletions(-)

Comments

David Hildenbrand Sept. 24, 2020, 7:56 a.m. UTC | #1
[...]

>  /*----------------------------------------------------------------------------
>  | Packs the sign `zSign', the exponent `zExp', and the significand formed
>  | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
> @@ -7205,6 +7253,312 @@ float128 float128_mul(float128 a, float128 b, float_status *status)
>  
>  }
>  

I do wonder if a type for Int256 would make sense - instead of manually
passing these arrays.
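
Something along these lines, say (just a sketch, names invented, not in the
tree):

    #include <stdint.h>
    #include <stdbool.h>

    /* Hypothetical Int256; w[0] is the most significant word. */
    typedef struct {
        uint64_t w[4];
    } Int256;

    /* a += b -- same carry chain as add256() in the patch. */
    static void int256_add(Int256 *a, const Int256 *b)
    {
        bool carry = false;

        for (int i = 3; i >= 0; --i) {
            uint64_t t = a->w[i] + b->w[i] + carry;

            carry = carry ? t <= a->w[i] : t < a->w[i];
            a->w[i] = t;
        }
    }

so the helpers would take a single pointer instead of a uint64_t[4].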

> +static void shortShift256Left(uint64_t p[4], unsigned count)
> +{
> +    int negcount = -count & 63;

That's the same as "64 - count", right? (which I find easier to get)

> +
> +    if (count == 0) {
> +        return;
> +    }
> +    g_assert(count < 64);
> +    p[0] = (p[0] << count) | (p[1] >> negcount);
> +    p[1] = (p[1] << count) | (p[2] >> negcount);
> +    p[2] = (p[2] << count) | (p[3] >> negcount);
> +    p[3] = (p[3] << count);
> +}
> +
> +static void shift256RightJamming(uint64_t p[4], int count)
> +{
> +    uint64_t in = 0;
> +
> +    g_assert(count >= 0);
> +
> +    count = MIN(count, 256);
> +    for (; count >= 64; count -= 64) {
> +        in |= p[3];
> +        p[3] = p[2];
> +        p[2] = p[1];
> +        p[1] = p[0];
> +        p[0] = 0;
> +    }
> +
> +    if (count) {
> +        int negcount = -count & 63;

ditto

> +
> +        in |= p[3] << negcount;
> +        p[3] = (p[2] << negcount) | (p[3] >> count);
> +        p[2] = (p[1] << negcount) | (p[2] >> count);
> +        p[1] = (p[0] << negcount) | (p[1] >> count);
> +        p[0] = p[0] >> count;
> +    }
> +    p[3] |= (in != 0);

Took me a bit longer to understand, but now I know why the function name
has "Jamming" in it :)

[...]

> +
> +float128 float128_muladd(float128 a_f, float128 b_f, float128 c_f,
> +                         int flags, float_status *status)
> +{
> +    bool inf_zero, p_sign, sign_flip;
> +    uint64_t p_frac[4];
> +    FloatParts128 a, b, c;
> +    int p_exp, exp_diff, shift, ab_mask, abc_mask;
> +    FloatClass p_cls;
> +
> +    float128_unpack(&a, a_f, status);
> +    float128_unpack(&b, b_f, status);
> +    float128_unpack(&c, c_f, status);
> +
> +    ab_mask = float_cmask(a.cls) | float_cmask(b.cls);
> +    abc_mask = float_cmask(c.cls) | ab_mask;
> +    inf_zero = ab_mask == float_cmask_infzero;
> +
> +    /* If any input is a NaN, select the required result. */
> +    if (unlikely(abc_mask & float_cmask_anynan)) {
> +        if (unlikely(abc_mask & float_cmask_snan)) {
> +            float_raise(float_flag_invalid, status);
> +        }
> +
> +        int which = pickNaNMulAdd(a.cls, b.cls, c.cls, inf_zero, status);
> +        if (status->default_nan_mode) {
> +            which = 3;
> +        }
> +        switch (which) {
> +        case 0:
> +            break;
> +        case 1:
> +            a_f = b_f;
> +            a.cls = b.cls;
> +            break;
> +        case 2:
> +            a_f = c_f;
> +            a.cls = c.cls;
> +            break;
> +        case 3:
> +            return float128_default_nan(status);
> +        }
> +        if (is_snan(a.cls)) {
> +            return float128_silence_nan(a_f, status);
> +        }
> +        return a_f;
> +    }
> +
> +    /* After dealing with input NaNs, look for Inf * Zero. */
> +    if (unlikely(inf_zero)) {
> +        float_raise(float_flag_invalid, status);
> +        return float128_default_nan(status);
> +    }
> +
> +    p_sign = a.sign ^ b.sign;
> +
> +    if (flags & float_muladd_negate_c) {
> +        c.sign ^= 1;
> +    }
> +    if (flags & float_muladd_negate_product) {
> +        p_sign ^= 1;
> +    }
> +    sign_flip = (flags & float_muladd_negate_result);
> +
> +    if (ab_mask & float_cmask_inf) {
> +        p_cls = float_class_inf;
> +    } else if (ab_mask & float_cmask_zero) {
> +        p_cls = float_class_zero;
> +    } else {
> +        p_cls = float_class_normal;
> +    }
> +
> +    if (c.cls == float_class_inf) {
> +        if (p_cls == float_class_inf && p_sign != c.sign) {
> +            /* +Inf + -Inf = NaN */
> +            float_raise(float_flag_invalid, status);
> +            return float128_default_nan(status);
> +        }
> +        /* Inf + Inf = Inf of the proper sign; reuse the return below. */
> +        p_cls = float_class_inf;
> +        p_sign = c.sign;
> +    }
> +
> +    if (p_cls == float_class_inf) {
> +        return packFloat128(p_sign ^ sign_flip, 0x7fff, 0, 0);
> +    }
> +
> +    if (p_cls == float_class_zero) {
> +        if (c.cls == float_class_zero) {
> +            if (p_sign != c.sign) {
> +                p_sign = status->float_rounding_mode == float_round_down;
> +            }
> +            return packFloat128(p_sign ^ sign_flip, 0, 0, 0);
> +        }
> +
> +        if (flags & float_muladd_halve_result) {
> +            c.exp -= 1;
> +        }
> +        return roundAndPackFloat128(c.sign ^ sign_flip,
> +                                    c.exp + 0x3fff - 1,
> +                                    c.frac0, c.frac1, 0, status);
> +    }
> +
> +    /* a & b should be normals now... */
> +    assert(a.cls == float_class_normal && b.cls == float_class_normal);
> +
> +    /* Multiply of 2 113-bit numbers produces a 226-bit result.  */
> +    mul128To256(a.frac0, a.frac1, b.frac0, b.frac1,
> +                &p_frac[0], &p_frac[1], &p_frac[2], &p_frac[3]);
> +
> +    /* Realign the binary point at bit 48 of p_frac[0].  */
> +    shift = clz64(p_frac[0]) - 15;
> +    g_assert(shift == 15 || shift == 16);
> +    shortShift256Left(p_frac, shift);
> +    p_exp = a.exp + b.exp - (shift - 16);
> +    exp_diff = p_exp - c.exp;
> +
> +    uint64_t c_frac[4] = { c.frac0, c.frac1, 0, 0 };
> +
> +    /* Add or subtract C from the intermediate product. */
> +    if (c.cls == float_class_zero) {
> +        /* Fall through to rounding after addition (with zero). */
> +    } else if (p_sign != c.sign) {
> +        /* Subtraction */
> +        if (exp_diff < 0) {
> +            shift256RightJamming(p_frac, -exp_diff);
> +            sub256(p_frac, c_frac, p_frac);
> +            p_exp = c.exp;
> +            p_sign ^= 1;
> +        } else if (exp_diff > 0) {
> +            shift256RightJamming(c_frac, exp_diff);
> +            sub256(p_frac, p_frac, c_frac);
> +        } else {
> +            /* Low 128 bits of C are known to be zero. */
> +            sub128(p_frac[0], p_frac[1], c_frac[0], c_frac[1],
> +                   &p_frac[0], &p_frac[1]);
> +            /*
> +             * Since we have normalized to bit 48 of p_frac[0],
> +             * a negative result means C > P and we need to invert.
> +             */
> +            if ((int64_t)p_frac[0] < 0) {
> +                neg256(p_frac);
> +                p_sign ^= 1;
> +            }
> +        }
> +
> +        /*
> +         * Gross normalization of the 256-bit subtraction result.
> +         * Fine tuning below shared with addition.
> +         */
> +        if (p_frac[0] != 0) {
> +            /* nothing to do */
> +        } else if (p_frac[1] != 0) {
> +            p_exp -= 64;
> +            p_frac[0] = p_frac[1];
> +            p_frac[1] = p_frac[2];
> +            p_frac[2] = p_frac[3];
> +            p_frac[3] = 0;
> +        } else if (p_frac[2] != 0) {
> +            p_exp -= 128;
> +            p_frac[0] = p_frac[2];
> +            p_frac[1] = p_frac[3];
> +            p_frac[2] = 0;
> +            p_frac[3] = 0;
> +        } else if (p_frac[3] != 0) {
> +            p_exp -= 192;
> +            p_frac[0] = p_frac[3];
> +            p_frac[1] = 0;
> +            p_frac[2] = 0;
> +            p_frac[3] = 0;
> +        } else {
> +            /* Subtraction was exact: result is zero. */
> +            p_sign = status->float_rounding_mode == float_round_down;
> +            return packFloat128(p_sign ^ sign_flip, 0, 0, 0);
> +        }
> +    } else {
> +        /* Addition */
> +        if (exp_diff <= 0) {
> +            shift256RightJamming(p_frac, -exp_diff);
> +            /* Low 128 bits of C are known to be zero. */
> +            add128(p_frac[0], p_frac[1], c_frac[0], c_frac[1],
> +                   &p_frac[0], &p_frac[1]);
> +            p_exp = c.exp;
> +        } else {
> +            shift256RightJamming(c_frac, exp_diff);
> +            add256(p_frac, c_frac);
> +        }
> +    }
> +
> +    /* Fine normalization of the 256-bit result: p_frac[0] != 0. */
> +    shift = clz64(p_frac[0]) - 15;
> +    if (shift < 0) {
> +        shift256RightJamming(p_frac, -shift);
> +    } else if (shift > 0) {
> +        shortShift256Left(p_frac, shift);
> +    }
> +    p_exp -= shift;
> +
> +    if (flags & float_muladd_halve_result) {
> +        p_exp -= 1;
> +    }
> +    return roundAndPackFloat128(p_sign ^ sign_flip,
> +                                p_exp + 0x3fff - 1,
> +                                p_frac[0], p_frac[1],
> +                                p_frac[2] | (p_frac[3] != 0),
> +                                status);
> +}

Wow, that's a beast :)

Richard Henderson Sept. 24, 2020, 1:30 p.m. UTC | #2
On 9/24/20 12:56 AM, David Hildenbrand wrote:
> I do wonder if a type for Int256 would make sense - instead of
> manually passing these arrays.

I could do that.  It does name things better, I suppose, in passing.  So long as
you're happy having the guts of the type be public, and not wrapping everything
like we do for Int128...

Either is better than the softfloat style, which would pass 12 arguments to
these helpers... ;-)
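
(Roughly this, if anyone wants the nightmare spelled out -- a made-up
signature in the classic style, not something in the patch:)

    static void sub256(uint64_t a0, uint64_t a1, uint64_t a2, uint64_t a3,
                       uint64_t b0, uint64_t b1, uint64_t b2, uint64_t b3,
                       uint64_t *r0, uint64_t *r1,
                       uint64_t *r2, uint64_t *r3);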

>> +static void shortShift256Left(uint64_t p[4], unsigned count)
>> +{
>> +    int negcount = -count & 63;
> 
> That's the same as "64 - count", right? (which I find easier to get)

In this case, yes.

Of course, more hosts have a "neg" instruction than they do a
"subtract-from-immediate" instruction.  When the shift instruction only
examines the low N bits, the "& 63" is optimized away, so this can result in 1
fewer instruction in the end.

Which is why I almost always use this form, and it's already all over the code
inherited from upstream.
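
(A quick sanity check of the identity, for the range that matters here:)

    #include <assert.h>

    int main(void)
    {
        /* For 1 <= count <= 63: -count & 63 == 64 - count. */
        for (unsigned count = 1; count < 64; count++) {
            assert((-count & 63) == 64 - count);
        }
        return 0;
    }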

> Wow, that's a beast :)

But not much worse than muladd_floats(), I'm pleased to say.


r~

David Hildenbrand Sept. 25, 2020, 9:17 a.m. UTC | #3
On 24.09.20 15:30, Richard Henderson wrote:
> On 9/24/20 12:56 AM, David Hildenbrand wrote:
>> I do wonder if a type for Int256 would make sense - instead of
>> manually passing these arrays.
> 
> I could do that.  It does name things better, I suppose, in passing.  So long as
> you're happy having the guts of the type be public, and not wrapping everything
> like we do for Int128...

We can do that once we have hardware+compiler support for 256-bit ;)

> 
> Either is better than the softfloat style, which would pass 12 arguments to
> these helpers... ;-)

Indeed

> 
>>> +static void shortShift256Left(uint64_t p[4], unsigned count)
>>> +{
>>> +    int negcount = -count & 63;
>>
>> That's the same as "64 - count", right? (which I find easier to get)
> 
> In this case, yes.
> 
> Of course, more hosts have a "neg" instruction than they do a
> "subtract-from-immediate" instruction.  When the shift instruction only
> examines the low N bits, the "& 63" is optimized away, so this can result in 1
> fewer instruction in the end.
> 
> Which is why I almost always use this form, and it's already all over the code
> inherited from upstream.
> 
>> Wow, that's a beast :)
> 
> But not much worse than muladd_floats(), I'm pleased to say.

Definitely!

Patch

diff --git a/include/fpu/softfloat.h b/include/fpu/softfloat.h
index 78ad5ca738..a38433deb4 100644
--- a/include/fpu/softfloat.h
+++ b/include/fpu/softfloat.h
@@ -1196,6 +1196,8 @@  float128 float128_sub(float128, float128, float_status *status);
 float128 float128_mul(float128, float128, float_status *status);
 float128 float128_div(float128, float128, float_status *status);
 float128 float128_rem(float128, float128, float_status *status);
+float128 float128_muladd(float128, float128, float128, int,
+                         float_status *status);
 float128 float128_sqrt(float128, float_status *status);
 FloatRelation float128_compare(float128, float128, float_status *status);
 FloatRelation float128_compare_quiet(float128, float128, float_status *status);
diff --git a/fpu/softfloat.c b/fpu/softfloat.c
index e038434a07..5b714fbd82 100644
--- a/fpu/softfloat.c
+++ b/fpu/softfloat.c
@@ -512,11 +512,19 @@  static inline __attribute__((unused)) bool is_qnan(FloatClass c)
 
 typedef struct {
     uint64_t frac;
-    int32_t  exp;
+    int32_t exp;
     FloatClass cls;
     bool sign;
 } FloatParts;
 
+/* Similar for float128.  */
+typedef struct {
+    uint64_t frac0, frac1;
+    int32_t exp;
+    FloatClass cls;
+    bool sign;
+} FloatParts128;
+
 #define DECOMPOSED_BINARY_POINT    (64 - 2)
 #define DECOMPOSED_IMPLICIT_BIT    (1ull << DECOMPOSED_BINARY_POINT)
 #define DECOMPOSED_OVERFLOW_BIT    (DECOMPOSED_IMPLICIT_BIT << 1)
@@ -4574,6 +4582,46 @@  static void
 
 }
 
+/*----------------------------------------------------------------------------
+| Returns the parts of floating-point value `a'.
+*----------------------------------------------------------------------------*/
+
+static void float128_unpack(FloatParts128 *p, float128 a, float_status *status)
+{
+    p->sign = extractFloat128Sign(a);
+    p->exp = extractFloat128Exp(a);
+    p->frac0 = extractFloat128Frac0(a);
+    p->frac1 = extractFloat128Frac1(a);
+
+    if (p->exp == 0) {
+        if ((p->frac0 | p->frac1) == 0) {
+            p->cls = float_class_zero;
+        } else if (status->flush_inputs_to_zero) {
+            float_raise(float_flag_input_denormal, status);
+            p->cls = float_class_zero;
+            p->frac0 = p->frac1 = 0;
+        } else {
+            normalizeFloat128Subnormal(p->frac0, p->frac1, &p->exp,
+                                       &p->frac0, &p->frac1);
+            p->exp -= 0x3fff;
+            p->cls = float_class_normal;
+        }
+    } else if (p->exp == 0x7fff) {
+        if ((p->frac0 | p->frac1) == 0) {
+            p->cls = float_class_inf;
+        } else if (float128_is_signaling_nan(a, status)) {
+            p->cls = float_class_snan;
+        } else {
+            p->cls = float_class_qnan;
+        }
+    } else {
+        /* Add the implicit bit. */
+        p->frac0 |= UINT64_C(0x0001000000000000);
+        p->exp -= 0x3fff;
+        p->cls = float_class_normal;
+    }
+}
+
 /*----------------------------------------------------------------------------
 | Packs the sign `zSign', the exponent `zExp', and the significand formed
 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
@@ -7205,6 +7253,312 @@  float128 float128_mul(float128 a, float128 b, float_status *status)
 
 }
 
+static void shortShift256Left(uint64_t p[4], unsigned count)
+{
+    int negcount = -count & 63;
+
+    if (count == 0) {
+        return;
+    }
+    g_assert(count < 64);
+    p[0] = (p[0] << count) | (p[1] >> negcount);
+    p[1] = (p[1] << count) | (p[2] >> negcount);
+    p[2] = (p[2] << count) | (p[3] >> negcount);
+    p[3] = (p[3] << count);
+}
+
+static void shift256RightJamming(uint64_t p[4], int count)
+{
+    uint64_t in = 0;
+
+    g_assert(count >= 0);
+
+    count = MIN(count, 256);
+    for (; count >= 64; count -= 64) {
+        in |= p[3];
+        p[3] = p[2];
+        p[2] = p[1];
+        p[1] = p[0];
+        p[0] = 0;
+    }
+
+    if (count) {
+        int negcount = -count & 63;
+
+        in |= p[3] << negcount;
+        p[3] = (p[2] << negcount) | (p[3] >> count);
+        p[2] = (p[1] << negcount) | (p[2] >> count);
+        p[1] = (p[0] << negcount) | (p[1] >> count);
+        p[0] = p[0] >> count;
+    }
+    p[3] |= (in != 0);
+}
+
+/* R = A - B */
+static void sub256(uint64_t r[4], uint64_t a[4], uint64_t b[4])
+{
+    bool borrow = false;
+
+    for (int i = 3; i >= 0; --i) {
+        if (borrow) {
+            borrow = a[i] <= b[i];
+            r[i] = a[i] - b[i] - 1;
+        } else {
+            borrow = a[i] < b[i];
+            r[i] = a[i] - b[i];
+        }
+    }
+}
+
+/* A = -A */
+static void neg256(uint64_t a[4])
+{
+    a[3] = -a[3];
+    if (likely(a[3])) {
+        goto not2;
+    }
+    a[2] = -a[2];
+    if (likely(a[2])) {
+        goto not1;
+    }
+    a[1] = -a[1];
+    if (likely(a[1])) {
+        goto not0;
+    }
+    a[0] = -a[0];
+    return;
+ not2:
+    a[2] = ~a[2];
+ not1:
+    a[1] = ~a[1];
+ not0:
+    a[0] = ~a[0];
+}
+
+/* A += B */
+static void add256(uint64_t a[4], uint64_t b[4])
+{
+    bool carry = false;
+
+    for (int i = 3; i >= 0; --i) {
+        uint64_t t = a[i] + b[i];
+        if (carry) {
+            t += 1;
+            carry = t <= a[i];
+        } else {
+            carry = t < a[i];
+        }
+        a[i] = t;
+    }
+}
+
+float128 float128_muladd(float128 a_f, float128 b_f, float128 c_f,
+                         int flags, float_status *status)
+{
+    bool inf_zero, p_sign, sign_flip;
+    uint64_t p_frac[4];
+    FloatParts128 a, b, c;
+    int p_exp, exp_diff, shift, ab_mask, abc_mask;
+    FloatClass p_cls;
+
+    float128_unpack(&a, a_f, status);
+    float128_unpack(&b, b_f, status);
+    float128_unpack(&c, c_f, status);
+
+    ab_mask = float_cmask(a.cls) | float_cmask(b.cls);
+    abc_mask = float_cmask(c.cls) | ab_mask;
+    inf_zero = ab_mask == float_cmask_infzero;
+
+    /* If any input is a NaN, select the required result. */
+    if (unlikely(abc_mask & float_cmask_anynan)) {
+        if (unlikely(abc_mask & float_cmask_snan)) {
+            float_raise(float_flag_invalid, status);
+        }
+
+        int which = pickNaNMulAdd(a.cls, b.cls, c.cls, inf_zero, status);
+        if (status->default_nan_mode) {
+            which = 3;
+        }
+        switch (which) {
+        case 0:
+            break;
+        case 1:
+            a_f = b_f;
+            a.cls = b.cls;
+            break;
+        case 2:
+            a_f = c_f;
+            a.cls = c.cls;
+            break;
+        case 3:
+            return float128_default_nan(status);
+        }
+        if (is_snan(a.cls)) {
+            return float128_silence_nan(a_f, status);
+        }
+        return a_f;
+    }
+
+    /* After dealing with input NaNs, look for Inf * Zero. */
+    if (unlikely(inf_zero)) {
+        float_raise(float_flag_invalid, status);
+        return float128_default_nan(status);
+    }
+
+    p_sign = a.sign ^ b.sign;
+
+    if (flags & float_muladd_negate_c) {
+        c.sign ^= 1;
+    }
+    if (flags & float_muladd_negate_product) {
+        p_sign ^= 1;
+    }
+    sign_flip = (flags & float_muladd_negate_result);
+
+    if (ab_mask & float_cmask_inf) {
+        p_cls = float_class_inf;
+    } else if (ab_mask & float_cmask_zero) {
+        p_cls = float_class_zero;
+    } else {
+        p_cls = float_class_normal;
+    }
+
+    if (c.cls == float_class_inf) {
+        if (p_cls == float_class_inf && p_sign != c.sign) {
+            /* +Inf + -Inf = NaN */
+            float_raise(float_flag_invalid, status);
+            return float128_default_nan(status);
+        }
+        /* Inf + Inf = Inf of the proper sign; reuse the return below. */
+        p_cls = float_class_inf;
+        p_sign = c.sign;
+    }
+
+    if (p_cls == float_class_inf) {
+        return packFloat128(p_sign ^ sign_flip, 0x7fff, 0, 0);
+    }
+
+    if (p_cls == float_class_zero) {
+        if (c.cls == float_class_zero) {
+            if (p_sign != c.sign) {
+                p_sign = status->float_rounding_mode == float_round_down;
+            }
+            return packFloat128(p_sign ^ sign_flip, 0, 0, 0);
+        }
+
+        if (flags & float_muladd_halve_result) {
+            c.exp -= 1;
+        }
+        return roundAndPackFloat128(c.sign ^ sign_flip,
+                                    c.exp + 0x3fff - 1,
+                                    c.frac0, c.frac1, 0, status);
+    }
+
+    /* a & b should be normals now... */
+    assert(a.cls == float_class_normal && b.cls == float_class_normal);
+
+    /* Multiply of 2 113-bit numbers produces a 226-bit result.  */
+    mul128To256(a.frac0, a.frac1, b.frac0, b.frac1,
+                &p_frac[0], &p_frac[1], &p_frac[2], &p_frac[3]);
+
+    /* Realign the binary point at bit 48 of p_frac[0].  */
+    shift = clz64(p_frac[0]) - 15;
+    g_assert(shift == 15 || shift == 16);
+    shortShift256Left(p_frac, shift);
+    p_exp = a.exp + b.exp - (shift - 16);
+    exp_diff = p_exp - c.exp;
+
+    uint64_t c_frac[4] = { c.frac0, c.frac1, 0, 0 };
+
+    /* Add or subtract C from the intermediate product. */
+    if (c.cls == float_class_zero) {
+        /* Fall through to rounding after addition (with zero). */
+    } else if (p_sign != c.sign) {
+        /* Subtraction */
+        if (exp_diff < 0) {
+            shift256RightJamming(p_frac, -exp_diff);
+            sub256(p_frac, c_frac, p_frac);
+            p_exp = c.exp;
+            p_sign ^= 1;
+        } else if (exp_diff > 0) {
+            shift256RightJamming(c_frac, exp_diff);
+            sub256(p_frac, p_frac, c_frac);
+        } else {
+            /* Low 128 bits of C are known to be zero. */
+            sub128(p_frac[0], p_frac[1], c_frac[0], c_frac[1],
+                   &p_frac[0], &p_frac[1]);
+            /*
+             * Since we have normalized to bit 48 of p_frac[0],
+             * a negative result means C > P and we need to invert.
+             */
+            if ((int64_t)p_frac[0] < 0) {
+                neg256(p_frac);
+                p_sign ^= 1;
+            }
+        }
+
+        /*
+         * Gross normalization of the 256-bit subtraction result.
+         * Fine tuning below shared with addition.
+         */
+        if (p_frac[0] != 0) {
+            /* nothing to do */
+        } else if (p_frac[1] != 0) {
+            p_exp -= 64;
+            p_frac[0] = p_frac[1];
+            p_frac[1] = p_frac[2];
+            p_frac[2] = p_frac[3];
+            p_frac[3] = 0;
+        } else if (p_frac[2] != 0) {
+            p_exp -= 128;
+            p_frac[0] = p_frac[2];
+            p_frac[1] = p_frac[3];
+            p_frac[2] = 0;
+            p_frac[3] = 0;
+        } else if (p_frac[3] != 0) {
+            p_exp -= 192;
+            p_frac[0] = p_frac[3];
+            p_frac[1] = 0;
+            p_frac[2] = 0;
+            p_frac[3] = 0;
+        } else {
+            /* Subtraction was exact: result is zero. */
+            p_sign = status->float_rounding_mode == float_round_down;
+            return packFloat128(p_sign ^ sign_flip, 0, 0, 0);
+        }
+    } else {
+        /* Addition */
+        if (exp_diff <= 0) {
+            shift256RightJamming(p_frac, -exp_diff);
+            /* Low 128 bits of C are known to be zero. */
+            add128(p_frac[0], p_frac[1], c_frac[0], c_frac[1],
+                   &p_frac[0], &p_frac[1]);
+            p_exp = c.exp;
+        } else {
+            shift256RightJamming(c_frac, exp_diff);
+            add256(p_frac, c_frac);
+        }
+    }
+
+    /* Fine normalization of the 256-bit result: p_frac[0] != 0. */
+    shift = clz64(p_frac[0]) - 15;
+    if (shift < 0) {
+        shift256RightJamming(p_frac, -shift);
+    } else if (shift > 0) {
+        shortShift256Left(p_frac, shift);
+    }
+    p_exp -= shift;
+
+    if (flags & float_muladd_halve_result) {
+        p_exp -= 1;
+    }
+    return roundAndPackFloat128(p_sign ^ sign_flip,
+                                p_exp + 0x3fff - 1,
+                                p_frac[0], p_frac[1],
+                                p_frac[2] | (p_frac[3] != 0),
+                                status);
+}
+
 /*----------------------------------------------------------------------------
 | Returns the result of dividing the quadruple-precision floating-point value
 | `a' by the corresponding value `b'.  The operation is performed according to
diff --git a/tests/fp/fp-test.c b/tests/fp/fp-test.c
index 06ffebd6db..9bbb0dba67 100644
--- a/tests/fp/fp-test.c
+++ b/tests/fp/fp-test.c
@@ -717,7 +717,7 @@  static void do_testfloat(int op, int rmode, bool exact)
         test_abz_f128(true_abz_f128M, subj_abz_f128M);
         break;
     case F128_MULADD:
-        not_implemented();
+        test_abcz_f128(slow_f128M_mulAdd, qemu_f128_mulAdd);
         break;
     case F128_SQRT:
         test_az_f128(slow_f128M_sqrt, qemu_f128M_sqrt);
diff --git a/tests/fp/wrap.c.inc b/tests/fp/wrap.c.inc
index 0cbd20013e..65a713deae 100644
--- a/tests/fp/wrap.c.inc
+++ b/tests/fp/wrap.c.inc
@@ -574,6 +574,18 @@  WRAP_MULADD(qemu_f32_mulAdd, float32_muladd, float32)
 WRAP_MULADD(qemu_f64_mulAdd, float64_muladd, float64)
 #undef WRAP_MULADD
 
+static void qemu_f128_mulAdd(const float128_t *ap, const float128_t *bp,
+                             const float128_t *cp, float128_t *res)
+{
+    float128 a, b, c, ret;
+
+    a = soft_to_qemu128(*ap);
+    b = soft_to_qemu128(*bp);
+    c = soft_to_qemu128(*cp);
+    ret = float128_muladd(a, b, c, 0, &qsf);
+    *res = qemu_to_soft128(ret);
+}
+
 #define WRAP_CMP16(name, func, retcond)         \
     static bool name(float16_t a, float16_t b)  \
     {                                           \