Patch Detail

GET /api/patches/1475763/?format=api
HTTP 200 OK
Allow: GET, PUT, PATCH, HEAD, OPTIONS
Content-Type: application/json
Vary: Accept

{
    "id": 1475763,
    "url": "http://patchwork.ozlabs.org/api/patches/1475763/?format=api",
    "web_url": "http://patchwork.ozlabs.org/project/qemu-devel/patch/20210508014802.892561-54-richard.henderson@linaro.org/",
    "project": {
        "id": 14,
        "url": "http://patchwork.ozlabs.org/api/projects/14/?format=api",
        "name": "QEMU Development",
        "link_name": "qemu-devel",
        "list_id": "qemu-devel.nongnu.org",
        "list_email": "qemu-devel@nongnu.org",
        "web_url": "",
        "scm_url": "",
        "webscm_url": "",
        "list_archive_url": "",
        "list_archive_url_format": "",
        "commit_url_format": ""
    },
    "msgid": "<20210508014802.892561-54-richard.henderson@linaro.org>",
    "list_archive_url": null,
    "date": "2021-05-08T01:47:43",
    "name": "[53/72] softfloat: Move sqrt_float to softfloat-parts.c.inc",
    "commit_ref": null,
    "pull_url": null,
    "state": "new",
    "archived": false,
    "hash": "ae13cd9e1fddca77d67752f2b0a8da581e690eb4",
    "submitter": {
        "id": 72104,
        "url": "http://patchwork.ozlabs.org/api/people/72104/?format=api",
        "name": "Richard Henderson",
        "email": "richard.henderson@linaro.org"
    },
    "delegate": null,
    "mbox": "http://patchwork.ozlabs.org/project/qemu-devel/patch/20210508014802.892561-54-richard.henderson@linaro.org/mbox/",
    "series": [
        {
            "id": 242770,
            "url": "http://patchwork.ozlabs.org/api/series/242770/?format=api",
            "web_url": "http://patchwork.ozlabs.org/project/qemu-devel/list/?series=242770",
            "date": "2021-05-08T01:46:53",
            "name": "Convert floatx80 and float128 to FloatParts",
            "version": 1,
            "mbox": "http://patchwork.ozlabs.org/series/242770/mbox/"
        }
    ],
    "comments": "http://patchwork.ozlabs.org/api/patches/1475763/comments/",
    "check": "pending",
    "checks": "http://patchwork.ozlabs.org/api/patches/1475763/checks/",
    "tags": {},
    "related": [],
    "headers": {
        "Return-Path": "<qemu-devel-bounces+incoming=patchwork.ozlabs.org@nongnu.org>",
        "X-Original-To": "incoming@patchwork.ozlabs.org",
        "Delivered-To": "patchwork-incoming@bilbo.ozlabs.org",
        "Authentication-Results": [
            "ozlabs.org;\n spf=pass (sender SPF authorized) smtp.mailfrom=nongnu.org\n (client-ip=209.51.188.17; helo=lists.gnu.org;\n envelope-from=qemu-devel-bounces+incoming=patchwork.ozlabs.org@nongnu.org;\n receiver=<UNKNOWN>)",
            "ozlabs.org;\n\tdkim=fail reason=\"signature verification failed\" (2048-bit key;\n unprotected) header.d=linaro.org header.i=@linaro.org header.a=rsa-sha256\n header.s=google header.b=Ec42+a+n;\n\tdkim-atps=neutral"
        ],
        "Received": [
            "from lists.gnu.org (lists.gnu.org [209.51.188.17])\n\t(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))\n\t(No client certificate requested)\n\tby ozlabs.org (Postfix) with ESMTPS id 4FcWbg6JPfz9sTD\n\tfor <incoming@patchwork.ozlabs.org>; Sat,  8 May 2021 12:34:07 +1000 (AEST)",
            "from localhost ([::1]:53940 helo=lists1p.gnu.org)\n\tby lists.gnu.org with esmtp (Exim 4.90_1)\n\t(envelope-from <qemu-devel-bounces+incoming=patchwork.ozlabs.org@nongnu.org>)\n\tid 1lfCn7-0003Bd-RH\n\tfor incoming@patchwork.ozlabs.org; Fri, 07 May 2021 22:34:05 -0400",
            "from eggs.gnu.org ([2001:470:142:3::10]:41632)\n by lists.gnu.org with esmtps (TLS1.2:ECDHE_RSA_AES_256_GCM_SHA384:256)\n (Exim 4.90_1) (envelope-from <richard.henderson@linaro.org>)\n id 1lfC88-0003IR-JW\n for qemu-devel@nongnu.org; Fri, 07 May 2021 21:51:44 -0400",
            "from mail-pg1-x52d.google.com ([2607:f8b0:4864:20::52d]:42952)\n by eggs.gnu.org with esmtps (TLS1.2:ECDHE_RSA_AES_128_GCM_SHA256:128)\n (Exim 4.90_1) (envelope-from <richard.henderson@linaro.org>)\n id 1lfC84-0005US-1O\n for qemu-devel@nongnu.org; Fri, 07 May 2021 21:51:44 -0400",
            "by mail-pg1-x52d.google.com with SMTP id m12so8627274pgr.9\n for <qemu-devel@nongnu.org>; Fri, 07 May 2021 18:51:39 -0700 (PDT)",
            "from localhost.localdomain ([71.212.144.24])\n by smtp.gmail.com with ESMTPSA id 204sm5861396pfw.158.2021.05.07.18.51.38\n (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256);\n Fri, 07 May 2021 18:51:38 -0700 (PDT)"
        ],
        "DKIM-Signature": "v=1; a=rsa-sha256; c=relaxed/relaxed; d=linaro.org; s=google;\n h=from:to:cc:subject:date:message-id:in-reply-to:references\n :mime-version:content-transfer-encoding;\n bh=vX4LKvyYIDcCSlAmcHQYpRFHW+uxfOceE/Raw8onmTE=;\n b=Ec42+a+nqkzm6WmFPLp0gs1+KxvCtwhc/kEAtoTAWZJqfNf39pL4a0xDkJ/1F9bfm+\n rzzFYJAfOYeRoPfJLP21LXnzTQk71Trp3nQoxU/DvK3ZTs8Hj/5MkjyVGkv54gKZvOd5\n TMt0H+ciE8coUn1c2m/Ni352icSPHmI/bTkU/hW75q963of8m+57nJQbOG56b1odMK0J\n iRnIeicyad5ODIS3TOVus/fm7rFF67Fi/W/lrq9ivH7Trowt6yhK9byMv7+l5ogtEd7x\n 0db1sk66fl4lLILBvbw3RjS+I9It7WZoUQvYNeWYrbXFUzelZYrAEC726amBzmIiOePR\n /rMg==",
        "X-Google-DKIM-Signature": "v=1; a=rsa-sha256; c=relaxed/relaxed;\n d=1e100.net; s=20161025;\n h=x-gm-message-state:from:to:cc:subject:date:message-id:in-reply-to\n :references:mime-version:content-transfer-encoding;\n bh=vX4LKvyYIDcCSlAmcHQYpRFHW+uxfOceE/Raw8onmTE=;\n b=Ngy64TwtaINu/neWy7lalocSKkPUCn2GVkJqMRiaaD0z2m/ynwPSsmxrP/PjR841b/\n JnBDxY74upV0TG4Z023ilRo9yAfqFbeWn302IimyEX8WuNE8abNWQ7dd4/+LN7JMUYmy\n fmjHgOCBHLHS+ke08JhR3+YTynq0mcC5xa27DqmhOQxy/MlCGNZkjVJSfgDJA1v0fB3+\n RmsbLo6m+Q0CND8u6XiC3Tp7QVFAlUbfufDBTEyPA/BKU4xppAWHMkn9FB0gYrvomSWm\n bYqpXjQaFGvrdS1maAV7LVe6kRJPbom7ge4evpIv4xylzCdsQAQ02jKFTflSSYj7jcTA\n FdNA==",
        "X-Gm-Message-State": "AOAM533HP4ZIXZf362LzwwjlMjR8FJRxB9W7R6/A2FvBsMD3WfmynsgW\n 0P8USec8lguqBVBOcp2kKPFn8Z7vTiQqrg==",
        "X-Google-Smtp-Source": "\n ABdhPJzecPbpgZUVCo0GMdOyG0+pMZw/K++Z2BndpClFd/BhIiuY87ZQYqoVZZ5Zb4+CcGbCCwVPpw==",
        "X-Received": "by 2002:aa7:96ea:0:b029:28c:e131:f0f with SMTP id\n i10-20020aa796ea0000b029028ce1310f0fmr13509190pfq.11.1620438698453;\n Fri, 07 May 2021 18:51:38 -0700 (PDT)",
        "From": "Richard Henderson <richard.henderson@linaro.org>",
        "To": "qemu-devel@nongnu.org",
        "Subject": "[PATCH 53/72] softfloat: Move sqrt_float to softfloat-parts.c.inc",
        "Date": "Fri,  7 May 2021 18:47:43 -0700",
        "Message-Id": "<20210508014802.892561-54-richard.henderson@linaro.org>",
        "X-Mailer": "git-send-email 2.25.1",
        "In-Reply-To": "<20210508014802.892561-1-richard.henderson@linaro.org>",
        "References": "<20210508014802.892561-1-richard.henderson@linaro.org>",
        "MIME-Version": "1.0",
        "Content-Transfer-Encoding": "8bit",
        "Received-SPF": "pass client-ip=2607:f8b0:4864:20::52d;\n envelope-from=richard.henderson@linaro.org; helo=mail-pg1-x52d.google.com",
        "X-Spam_score_int": "-20",
        "X-Spam_score": "-2.1",
        "X-Spam_bar": "--",
        "X-Spam_report": "(-2.1 / 5.0 requ) BAYES_00=-1.9, DKIM_SIGNED=0.1,\n DKIM_VALID=-0.1, DKIM_VALID_AU=-0.1, DKIM_VALID_EF=-0.1,\n RCVD_IN_DNSWL_NONE=-0.0001, SPF_HELO_NONE=0.001,\n SPF_PASS=-0.001 autolearn=ham autolearn_force=no",
        "X-Spam_action": "no action",
        "X-BeenThere": "qemu-devel@nongnu.org",
        "X-Mailman-Version": "2.1.23",
        "Precedence": "list",
        "List-Id": "<qemu-devel.nongnu.org>",
        "List-Unsubscribe": "<https://lists.nongnu.org/mailman/options/qemu-devel>,\n <mailto:qemu-devel-request@nongnu.org?subject=unsubscribe>",
        "List-Archive": "<https://lists.nongnu.org/archive/html/qemu-devel>",
        "List-Post": "<mailto:qemu-devel@nongnu.org>",
        "List-Help": "<mailto:qemu-devel-request@nongnu.org?subject=help>",
        "List-Subscribe": "<https://lists.nongnu.org/mailman/listinfo/qemu-devel>,\n <mailto:qemu-devel-request@nongnu.org?subject=subscribe>",
        "Cc": "alex.bennee@linaro.org, david@redhat.com",
        "Errors-To": "qemu-devel-bounces+incoming=patchwork.ozlabs.org@nongnu.org",
        "Sender": "\"Qemu-devel\"\n <qemu-devel-bounces+incoming=patchwork.ozlabs.org@nongnu.org>"
    },
    "content": "Rename to parts$N_sqrt.\nReimplement float128_sqrt with FloatParts128.\n\nReimplement with the inverse sqrt newton-raphson algorithm from musl.\nThis is significantly faster than even the berkeley sqrt n-r algorithm,\nbecause it does not use division instructions, only multiplication.\n\nOrdinarily, changing algorithms at the same time as migrating code is\na bad idea, but this is the only way I found that didn't break one of\nthe routines at the same time.\n\nSigned-off-by: Richard Henderson <richard.henderson@linaro.org>\n---\n fpu/softfloat.c           | 207 ++++++++++----------------------------\n fpu/softfloat-parts.c.inc | 206 +++++++++++++++++++++++++++++++++++++\n 2 files changed, 261 insertions(+), 152 deletions(-)",
    "diff": "diff --git a/fpu/softfloat.c b/fpu/softfloat.c\nindex f83ca20000..75a3bb881d 100644\n--- a/fpu/softfloat.c\n+++ b/fpu/softfloat.c\n@@ -819,6 +819,12 @@ static FloatParts128 *parts128_div(FloatParts128 *a, FloatParts128 *b,\n #define parts_div(A, B, S) \\\n     PARTS_GENERIC_64_128(div, A)(A, B, S)\n \n+static void parts64_sqrt(FloatParts64 *a, float_status *s, const FloatFmt *f);\n+static void parts128_sqrt(FloatParts128 *a, float_status *s, const FloatFmt *f);\n+\n+#define parts_sqrt(A, S, F) \\\n+    PARTS_GENERIC_64_128(sqrt, A)(A, S, F)\n+\n static bool parts64_round_to_int_normal(FloatParts64 *a, FloatRoundMode rm,\n                                         int scale, int frac_size);\n static bool parts128_round_to_int_normal(FloatParts128 *a, FloatRoundMode r,\n@@ -1385,6 +1391,30 @@ static void frac128_widen(FloatParts256 *r, FloatParts128 *a)\n \n #define frac_widen(A, B)  FRAC_GENERIC_64_128(widen, B)(A, B)\n \n+/*\n+ * Reciprocal sqrt table.  1 bit of exponent, 6-bits of mantessa.\n+ * From https://git.musl-libc.org/cgit/musl/tree/src/math/sqrt_data.c\n+ * and thus MIT licenced.\n+ */\n+static const uint16_t rsqrt_tab[128] = {\n+    0xb451, 0xb2f0, 0xb196, 0xb044, 0xaef9, 0xadb6, 0xac79, 0xab43,\n+    0xaa14, 0xa8eb, 0xa7c8, 0xa6aa, 0xa592, 0xa480, 0xa373, 0xa26b,\n+    0xa168, 0xa06a, 0x9f70, 0x9e7b, 0x9d8a, 0x9c9d, 0x9bb5, 0x9ad1,\n+    0x99f0, 0x9913, 0x983a, 0x9765, 0x9693, 0x95c4, 0x94f8, 0x9430,\n+    0x936b, 0x92a9, 0x91ea, 0x912e, 0x9075, 0x8fbe, 0x8f0a, 0x8e59,\n+    0x8daa, 0x8cfe, 0x8c54, 0x8bac, 0x8b07, 0x8a64, 0x89c4, 0x8925,\n+    0x8889, 0x87ee, 0x8756, 0x86c0, 0x862b, 0x8599, 0x8508, 0x8479,\n+    0x83ec, 0x8361, 0x82d8, 0x8250, 0x81c9, 0x8145, 0x80c2, 0x8040,\n+    0xff02, 0xfd0e, 0xfb25, 0xf947, 0xf773, 0xf5aa, 0xf3ea, 0xf234,\n+    0xf087, 0xeee3, 0xed47, 0xebb3, 0xea27, 0xe8a3, 0xe727, 0xe5b2,\n+    0xe443, 0xe2dc, 0xe17a, 0xe020, 0xdecb, 0xdd7d, 0xdc34, 0xdaf1,\n+    0xd9b3, 0xd87b, 0xd748, 0xd61a, 0xd4f1, 0xd3cd, 0xd2ad, 0xd192,\n+    0xd07b, 0xcf69, 0xce5b, 0xcd51, 0xcc4a, 0xcb48, 0xca4a, 0xc94f,\n+    0xc858, 0xc764, 0xc674, 0xc587, 0xc49d, 0xc3b7, 0xc2d4, 0xc1f4,\n+    0xc116, 0xc03c, 0xbf65, 0xbe90, 0xbdbe, 0xbcef, 0xbc23, 0xbb59,\n+    0xba91, 0xb9cc, 0xb90a, 0xb84a, 0xb78c, 0xb6d0, 0xb617, 0xb560,\n+};\n+\n #define partsN(NAME)   glue(glue(glue(parts,N),_),NAME)\n #define FloatPartsN    glue(FloatParts,N)\n #define FloatPartsW    glue(FloatParts,W)\n@@ -3588,103 +3618,35 @@ float128 float128_scalbn(float128 a, int n, float_status *status)\n \n /*\n  * Square Root\n- *\n- * The old softfloat code did an approximation step before zeroing in\n- * on the final result. However for simpleness we just compute the\n- * square root by iterating down from the implicit bit to enough extra\n- * bits to ensure we get a correctly rounded result.\n- *\n- * This does mean however the calculation is slower than before,\n- * especially for 64 bit floats.\n  */\n \n-static FloatParts64 sqrt_float(FloatParts64 a, float_status *s, const FloatFmt *p)\n-{\n-    uint64_t a_frac, r_frac, s_frac;\n-    int bit, last_bit;\n-\n-    if (is_nan(a.cls)) {\n-        parts_return_nan(&a, s);\n-        return a;\n-    }\n-    if (a.cls == float_class_zero) {\n-        return a;  /* sqrt(+-0) = +-0 */\n-    }\n-    if (a.sign) {\n-        float_raise(float_flag_invalid, s);\n-        parts_default_nan(&a, s);\n-        return a;\n-    }\n-    if (a.cls == float_class_inf) {\n-        return a;  /* sqrt(+inf) = +inf */\n-    }\n-\n-    assert(a.cls == float_class_normal);\n-\n-    /* We need two overflow bits at the top. Adding room for that is a\n-     * right shift. If the exponent is odd, we can discard the low bit\n-     * by multiplying the fraction by 2; that's a left shift. Combine\n-     * those and we shift right by 1 if the exponent is odd, otherwise 2.\n-     */\n-    a_frac = a.frac >> (2 - (a.exp & 1));\n-    a.exp >>= 1;\n-\n-    /* Bit-by-bit computation of sqrt.  */\n-    r_frac = 0;\n-    s_frac = 0;\n-\n-    /* Iterate from implicit bit down to the 3 extra bits to compute a\n-     * properly rounded result. Remember we've inserted two more bits\n-     * at the top, so these positions are two less.\n-     */\n-    bit = DECOMPOSED_BINARY_POINT - 2;\n-    last_bit = MAX(p->frac_shift - 4, 0);\n-    do {\n-        uint64_t q = 1ULL << bit;\n-        uint64_t t_frac = s_frac + q;\n-        if (t_frac <= a_frac) {\n-            s_frac = t_frac + q;\n-            a_frac -= t_frac;\n-            r_frac += q;\n-        }\n-        a_frac <<= 1;\n-    } while (--bit >= last_bit);\n-\n-    /* Undo the right shift done above. If there is any remaining\n-     * fraction, the result is inexact. Set the sticky bit.\n-     */\n-    a.frac = (r_frac << 2) + (a_frac != 0);\n-\n-    return a;\n-}\n-\n float16 QEMU_FLATTEN float16_sqrt(float16 a, float_status *status)\n {\n-    FloatParts64 pa, pr;\n+    FloatParts64 p;\n \n-    float16_unpack_canonical(&pa, a, status);\n-    pr = sqrt_float(pa, status, &float16_params);\n-    return float16_round_pack_canonical(&pr, status);\n+    float16_unpack_canonical(&p, a, status);\n+    parts_sqrt(&p, status, &float16_params);\n+    return float16_round_pack_canonical(&p, status);\n }\n \n static float32 QEMU_SOFTFLOAT_ATTR\n soft_f32_sqrt(float32 a, float_status *status)\n {\n-    FloatParts64 pa, pr;\n+    FloatParts64 p;\n \n-    float32_unpack_canonical(&pa, a, status);\n-    pr = sqrt_float(pa, status, &float32_params);\n-    return float32_round_pack_canonical(&pr, status);\n+    float32_unpack_canonical(&p, a, status);\n+    parts_sqrt(&p, status, &float32_params);\n+    return float32_round_pack_canonical(&p, status);\n }\n \n static float64 QEMU_SOFTFLOAT_ATTR\n soft_f64_sqrt(float64 a, float_status *status)\n {\n-    FloatParts64 pa, pr;\n+    FloatParts64 p;\n \n-    float64_unpack_canonical(&pa, a, status);\n-    pr = sqrt_float(pa, status, &float64_params);\n-    return float64_round_pack_canonical(&pr, status);\n+    float64_unpack_canonical(&p, a, status);\n+    parts_sqrt(&p, status, &float64_params);\n+    return float64_round_pack_canonical(&p, status);\n }\n \n float32 QEMU_FLATTEN float32_sqrt(float32 xa, float_status *s)\n@@ -3743,11 +3705,20 @@ float64 QEMU_FLATTEN float64_sqrt(float64 xa, float_status *s)\n \n bfloat16 QEMU_FLATTEN bfloat16_sqrt(bfloat16 a, float_status *status)\n {\n-    FloatParts64 pa, pr;\n+    FloatParts64 p;\n \n-    bfloat16_unpack_canonical(&pa, a, status);\n-    pr = sqrt_float(pa, status, &bfloat16_params);\n-    return bfloat16_round_pack_canonical(&pr, status);\n+    bfloat16_unpack_canonical(&p, a, status);\n+    parts_sqrt(&p, status, &bfloat16_params);\n+    return bfloat16_round_pack_canonical(&p, status);\n+}\n+\n+float128 QEMU_FLATTEN float128_sqrt(float128 a, float_status *status)\n+{\n+    FloatParts128 p;\n+\n+    float128_unpack_canonical(&p, a, status);\n+    parts_sqrt(&p, status, &float128_params);\n+    return float128_round_pack_canonical(&p, status);\n }\n \n /*----------------------------------------------------------------------------\n@@ -6475,74 +6446,6 @@ float128 float128_rem(float128 a, float128 b, float_status *status)\n                                          status);\n }\n \n-/*----------------------------------------------------------------------------\n-| Returns the square root of the quadruple-precision floating-point value `a'.\n-| The operation is performed according to the IEC/IEEE Standard for Binary\n-| Floating-Point Arithmetic.\n-*----------------------------------------------------------------------------*/\n-\n-float128 float128_sqrt(float128 a, float_status *status)\n-{\n-    bool aSign;\n-    int32_t aExp, zExp;\n-    uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;\n-    uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;\n-\n-    aSig1 = extractFloat128Frac1( a );\n-    aSig0 = extractFloat128Frac0( a );\n-    aExp = extractFloat128Exp( a );\n-    aSign = extractFloat128Sign( a );\n-    if ( aExp == 0x7FFF ) {\n-        if (aSig0 | aSig1) {\n-            return propagateFloat128NaN(a, a, status);\n-        }\n-        if ( ! aSign ) return a;\n-        goto invalid;\n-    }\n-    if ( aSign ) {\n-        if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;\n- invalid:\n-        float_raise(float_flag_invalid, status);\n-        return float128_default_nan(status);\n-    }\n-    if ( aExp == 0 ) {\n-        if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );\n-        normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );\n-    }\n-    zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;\n-    aSig0 |= UINT64_C(0x0001000000000000);\n-    zSig0 = estimateSqrt32( aExp, aSig0>>17 );\n-    shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );\n-    zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );\n-    doubleZSig0 = zSig0<<1;\n-    mul64To128( zSig0, zSig0, &term0, &term1 );\n-    sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );\n-    while ( (int64_t) rem0 < 0 ) {\n-        --zSig0;\n-        doubleZSig0 -= 2;\n-        add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );\n-    }\n-    zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );\n-    if ( ( zSig1 & 0x1FFF ) <= 5 ) {\n-        if ( zSig1 == 0 ) zSig1 = 1;\n-        mul64To128( doubleZSig0, zSig1, &term1, &term2 );\n-        sub128( rem1, 0, term1, term2, &rem1, &rem2 );\n-        mul64To128( zSig1, zSig1, &term2, &term3 );\n-        sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );\n-        while ( (int64_t) rem1 < 0 ) {\n-            --zSig1;\n-            shortShift128Left( 0, zSig1, 1, &term2, &term3 );\n-            term3 |= 1;\n-            term2 |= doubleZSig0;\n-            add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );\n-        }\n-        zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );\n-    }\n-    shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );\n-    return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);\n-\n-}\n-\n static inline FloatRelation\n floatx80_compare_internal(floatx80 a, floatx80 b, bool is_quiet,\n                           float_status *status)\ndiff --git a/fpu/softfloat-parts.c.inc b/fpu/softfloat-parts.c.inc\nindex 9749c11a6b..293029711c 100644\n--- a/fpu/softfloat-parts.c.inc\n+++ b/fpu/softfloat-parts.c.inc\n@@ -595,6 +595,212 @@ static FloatPartsN *partsN(div)(FloatPartsN *a, FloatPartsN *b,\n     return a;\n }\n \n+/*\n+ * Square Root\n+ *\n+ * The base algorithm is lifted from\n+ * https://git.musl-libc.org/cgit/musl/tree/src/math/sqrtf.c\n+ * https://git.musl-libc.org/cgit/musl/tree/src/math/sqrt.c\n+ * https://git.musl-libc.org/cgit/musl/tree/src/math/sqrtl.c\n+ * and is thus MIT licenced.\n+ */\n+static void partsN(sqrt)(FloatPartsN *a, float_status *status,\n+                         const FloatFmt *fmt)\n+{\n+    const uint32_t three32 = 3u << 30;\n+    const uint64_t three64 = 3ull << 62;\n+    uint32_t d32, m32, r32, s32, u32;            /* 32-bit computation */\n+    uint64_t d64, m64, r64, s64, u64;            /* 64-bit computation */\n+    uint64_t dh, dl, rh, rl, sh, sl, uh, ul;     /* 128-bit computation */\n+    uint64_t d0h, d0l, d1h, d1l, d2h, d2l;\n+    uint64_t discard;\n+    bool exp_odd;\n+    size_t index;\n+\n+    if (unlikely(a->cls != float_class_normal)) {\n+        switch (a->cls) {\n+        case float_class_snan:\n+        case float_class_qnan:\n+            parts_return_nan(a, status);\n+            return;\n+        case float_class_zero:\n+            return;\n+        case float_class_inf:\n+            if (unlikely(a->sign)) {\n+                goto d_nan;\n+            }\n+            return;\n+        default:\n+            g_assert_not_reached();\n+        }\n+    }\n+\n+    if (unlikely(a->sign)) {\n+        goto d_nan;\n+    }\n+\n+    /*\n+     * Argument reduction.\n+     * x = 4^e frac; with integer e, and frac in [1, 4)\n+     * m = frac fixed point at bit 62, since we're in base 4.\n+     * If base-2 exponent is odd, exchange that for multiply by 2,\n+     * which results in no shift.\n+     */\n+    exp_odd = a->exp & 1;\n+    index = extract64(a->frac_hi, 57, 6) | (!exp_odd << 6);\n+    if (!exp_odd) {\n+        frac_shr(a, 1);\n+    }\n+\n+    /*\n+     * Approximate r ~= 1/sqrt(m) and s ~= sqrt(m) when m in [1, 4).\n+     *\n+     * Initial estimate:\n+     * 7-bit lookup table (1-bit exponent and 6-bit significand).\n+     *\n+     * The relative error (e = r0*sqrt(m)-1) of a linear estimate\n+     * (r0 = a*m + b) is |e| < 0.085955 ~ 0x1.6p-4 at best;\n+     * a table lookup is faster and needs one less iteration.\n+     * The 7-bit table gives |e| < 0x1.fdp-9.\n+     *\n+     * A Newton-Raphson iteration for r is\n+     *   s = m*r\n+     *   d = s*r\n+     *   u = 3 - d\n+     *   r = r*u/2\n+     *\n+     * Fixed point representations:\n+     *   m, s, d, u, three are all 2.30; r is 0.32\n+     */\n+    m64 = a->frac_hi;\n+    m32 = m64 >> 32;\n+\n+    r32 = rsqrt_tab[index] << 16;\n+    /* |r*sqrt(m) - 1| < 0x1.FDp-9 */\n+\n+    s32 = ((uint64_t)m32 * r32) >> 32;\n+    d32 = ((uint64_t)s32 * r32) >> 32;\n+    u32 = three32 - d32;\n+\n+    if (N == 64) {\n+        /* float64 or smaller */\n+\n+        r32 = ((uint64_t)r32 * u32) >> 31;\n+        /* |r*sqrt(m) - 1| < 0x1.7Bp-16 */\n+\n+        s32 = ((uint64_t)m32 * r32) >> 32;\n+        d32 = ((uint64_t)s32 * r32) >> 32;\n+        u32 = three32 - d32;\n+\n+        if (fmt->frac_size <= 23) {\n+            /* float32 or smaller */\n+\n+            s32 = ((uint64_t)s32 * u32) >> 32;  /* 3.29 */\n+            s32 = (s32 - 1) >> 6;               /* 9.23 */\n+            /* s < sqrt(m) < s + 0x1.08p-23 */\n+\n+            /* compute nearest rounded result to 2.23 bits */\n+            uint32_t d0 = (m32 << 16) - s32 * s32;\n+            uint32_t d1 = s32 - d0;\n+            uint32_t d2 = d1 + s32 + 1;\n+            s32 += d1 >> 31;\n+            a->frac_hi = (uint64_t)s32 << (64 - 25);\n+\n+            /* increment or decrement for inexact */\n+            if (d2 != 0) {\n+                a->frac_hi += ((int32_t)(d1 ^ d2) < 0 ? -1 : 1);\n+            }\n+            goto done;\n+        }\n+\n+        /* float64 */\n+\n+        r64 = (uint64_t)r32 * u32 * 2;\n+        /* |r*sqrt(m) - 1| < 0x1.37-p29; convert to 64-bit arithmetic */\n+        mul64To128(m64, r64, &s64, &discard);\n+        mul64To128(s64, r64, &d64, &discard);\n+        u64 = three64 - d64;\n+\n+        mul64To128(s64, u64, &s64, &discard);  /* 3.61 */\n+        s64 = (s64 - 2) >> 9;                  /* 12.52 */\n+\n+        /* Compute nearest rounded result */\n+        uint64_t d0 = (m64 << 42) - s64 * s64;\n+        uint64_t d1 = s64 - d0;\n+        uint64_t d2 = d1 + s64 + 1;\n+        s64 += d1 >> 63;\n+        a->frac_hi = s64 << (64 - 54);\n+\n+        /* increment or decrement for inexact */\n+        if (d2 != 0) {\n+            a->frac_hi += ((int64_t)(d1 ^ d2) < 0 ? -1 : 1);\n+        }\n+        goto done;\n+    }\n+\n+    r64 = (uint64_t)r32 * u32 * 2;\n+    /* |r*sqrt(m) - 1| < 0x1.7Bp-16; convert to 64-bit arithmetic */\n+\n+    mul64To128(m64, r64, &s64, &discard);\n+    mul64To128(s64, r64, &d64, &discard);\n+    u64 = three64 - d64;\n+    mul64To128(u64, r64, &r64, &discard);\n+    r64 <<= 1;\n+    /* |r*sqrt(m) - 1| < 0x1.a5p-31 */\n+\n+    mul64To128(m64, r64, &s64, &discard);\n+    mul64To128(s64, r64, &d64, &discard);\n+    u64 = three64 - d64;\n+    mul64To128(u64, r64, &rh, &rl);\n+    add128(rh, rl, rh, rl, &rh, &rl);\n+    /* |r*sqrt(m) - 1| < 0x1.c001p-59; change to 128-bit arithmetic */\n+\n+    mul128To256(a->frac_hi, a->frac_lo, rh, rl, &sh, &sl, &discard, &discard);\n+    mul128To256(sh, sl, rh, rl, &dh, &dl, &discard, &discard);\n+    sub128(three64, 0, dh, dl, &uh, &ul);\n+    mul128To256(uh, ul, sh, sl, &sh, &sl, &discard, &discard);  /* 3.125 */\n+    /* -0x1p-116 < s - sqrt(m) < 0x3.8001p-125 */\n+\n+    sub128(sh, sl, 0, 4, &sh, &sl);\n+    shift128Right(sh, sl, 13, &sh, &sl);  /* 16.112 */\n+    /* s < sqrt(m) < s + 1ulp */\n+\n+    /* Compute nearest rounded result */\n+    mul64To128(sl, sl, &d0h, &d0l);\n+    d0h += 2 * sh * sl;\n+    sub128(a->frac_lo << 34, 0, d0h, d0l, &d0h, &d0l);\n+    sub128(sh, sl, d0h, d0l, &d1h, &d1l);\n+    add128(sh, sl, 0, 1, &d2h, &d2l);\n+    add128(d2h, d2l, d1h, d1l, &d2h, &d2l);\n+    add128(sh, sl, 0, d1h >> 63, &sh, &sl);\n+    shift128Left(sh, sl, 128 - 114, &sh, &sl);\n+\n+    /* increment or decrement for inexact */\n+    if (d2h | d2l) {\n+        if ((int64_t)(d1h ^ d2h) < 0) {\n+            sub128(sh, sl, 0, 1, &sh, &sl);\n+        } else {\n+            add128(sh, sl, 0, 1, &sh, &sl);\n+        }\n+    }\n+    a->frac_lo = sl;\n+    a->frac_hi = sh;\n+\n+ done:\n+    /* Convert back from base 4 to base 2. */\n+    a->exp >>= 1;\n+    if (!(a->frac_hi & DECOMPOSED_IMPLICIT_BIT)) {\n+        frac_add(a, a, a);\n+    } else {\n+        a->exp += 1;\n+    }\n+    return;\n+\n+ d_nan:\n+    float_raise(float_flag_invalid, status);\n+    parts_default_nan(a, status);\n+}\n+\n /*\n  * Rounds the floating-point value `a' to an integer, and returns the\n  * result as a floating-point value. The operation is performed\n",
    "prefixes": [
        "53/72"
    ]
}