diff mbox series

[26/72] softfloat: Convert float128_silence_nan to parts

Message ID 20210508014802.892561-27-richard.henderson@linaro.org
State New
Headers show
Series Convert floatx80 and float128 to FloatParts | expand

Commit Message

Richard Henderson May 8, 2021, 1:47 a.m. UTC
This is the minimal change that also introduces float128_params,
float128_unpack_raw, and float128_pack_raw without tripping
unused-symbol warnings, which -Werror turns into build failures.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 fpu/softfloat.c                | 96 +++++++++++++++++++++++++++++-----
 fpu/softfloat-specialize.c.inc | 25 +++------
 2 files changed, 89 insertions(+), 32 deletions(-)

Comments

Alex Bennée May 13, 2021, 8:34 a.m. UTC | #1
Richard Henderson <richard.henderson@linaro.org> writes:

> This is the minimal change that also introduces float128_params,
> float128_unpack_raw, and float128_pack_raw without running into
> unused symbol Werrors.
>
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> ---
>  fpu/softfloat.c                | 96 +++++++++++++++++++++++++++++-----
>  fpu/softfloat-specialize.c.inc | 25 +++------
>  2 files changed, 89 insertions(+), 32 deletions(-)
>
> diff --git a/fpu/softfloat.c b/fpu/softfloat.c
> index 2d6f61ee7a..073b80d502 100644
> --- a/fpu/softfloat.c
> +++ b/fpu/softfloat.c
> @@ -500,14 +500,12 @@ static inline __attribute__((unused)) bool is_qnan(FloatClass c)
>  }
>  
>  /*
> - * Structure holding all of the decomposed parts of a float. The
> - * exponent is unbiased and the fraction is normalized. All
> - * calculations are done with a 64 bit fraction and then rounded as
> - * appropriate for the final format.
> + * Structure holding all of the decomposed parts of a float.
> + * The exponent is unbiased and the fraction is normalized.
>   *
> - * Thanks to the packed FloatClass a decent compiler should be able to
> - * fit the whole structure into registers and avoid using the stack
> - * for parameter passing.
> + * The fraction words are stored in big-endian word ordering,
> + * so that truncation from a larger format to a smaller format
> + * can be done simply by ignoring subsequent elements.
>   */
>  
>  typedef struct {
> @@ -526,6 +524,15 @@ typedef struct {
>      };
>  } FloatParts64;
>  
> +typedef struct {
> +    FloatClass cls;
> +    bool sign;
> +    int32_t exp;
> +    uint64_t frac_hi;
> +    uint64_t frac_lo;
> +} FloatParts128;
> +
> +/* These apply to the most significant word of each FloatPartsN. */
>  #define DECOMPOSED_BINARY_POINT    63
>  #define DECOMPOSED_IMPLICIT_BIT    (1ull << DECOMPOSED_BINARY_POINT)
>  
> @@ -561,11 +568,11 @@ typedef struct {
>      .exp_bias       = ((1 << E) - 1) >> 1,                           \
>      .exp_max        = (1 << E) - 1,                                  \
>      .frac_size      = F,                                             \
> -    .frac_shift     = DECOMPOSED_BINARY_POINT - F,                   \
> -    .frac_lsb       = 1ull << (DECOMPOSED_BINARY_POINT - F),         \
> -    .frac_lsbm1     = 1ull << ((DECOMPOSED_BINARY_POINT - F) - 1),   \
> -    .round_mask     = (1ull << (DECOMPOSED_BINARY_POINT - F)) - 1,   \
> -    .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - F)) - 1
> +    .frac_shift     = (-F - 1) & 63,                                 \
> +    .frac_lsb       = 1ull << ((-F - 1) & 63),                       \
> +    .frac_lsbm1     = 1ull << ((-F - 2) & 63),                       \
> +    .round_mask     = (1ull << ((-F - 1) & 63)) - 1,                 \
> +    .roundeven_mask = (2ull << ((-F - 1) & 63)) - 1
>

I have to admit I find the switch to (-F - 1) & 63 a bit of black
magic. Isn't the shift always going to end up being a function of the
number of exponent bits we need to move past and the natural size of the
original float?

Anyway, my personal brain-twisting aside, it obviously works and
everything else looks fine, so:

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Richard Henderson May 13, 2021, 12:25 p.m. UTC | #2
On 5/13/21 3:34 AM, Alex Bennée wrote:
> 
> Richard Henderson <richard.henderson@linaro.org> writes:
> 
>> This is the minimal change that also introduces float128_params,
>> float128_unpack_raw, and float128_pack_raw without running into
>> unused symbol Werrors.
>>
>> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
>> ---
>>   fpu/softfloat.c                | 96 +++++++++++++++++++++++++++++-----
>>   fpu/softfloat-specialize.c.inc | 25 +++------
>>   2 files changed, 89 insertions(+), 32 deletions(-)
>>
>> diff --git a/fpu/softfloat.c b/fpu/softfloat.c
>> index 2d6f61ee7a..073b80d502 100644
>> --- a/fpu/softfloat.c
>> +++ b/fpu/softfloat.c
>> @@ -500,14 +500,12 @@ static inline __attribute__((unused)) bool is_qnan(FloatClass c)
>>   }
>>   
>>   /*
>> - * Structure holding all of the decomposed parts of a float. The
>> - * exponent is unbiased and the fraction is normalized. All
>> - * calculations are done with a 64 bit fraction and then rounded as
>> - * appropriate for the final format.
>> + * Structure holding all of the decomposed parts of a float.
>> + * The exponent is unbiased and the fraction is normalized.
>>    *
>> - * Thanks to the packed FloatClass a decent compiler should be able to
>> - * fit the whole structure into registers and avoid using the stack
>> - * for parameter passing.
>> + * The fraction words are stored in big-endian word ordering,
>> + * so that truncation from a larger format to a smaller format
>> + * can be done simply by ignoring subsequent elements.
>>    */
>>   
>>   typedef struct {
>> @@ -526,6 +524,15 @@ typedef struct {
>>       };
>>   } FloatParts64;
>>   
>> +typedef struct {
>> +    FloatClass cls;
>> +    bool sign;
>> +    int32_t exp;
>> +    uint64_t frac_hi;
>> +    uint64_t frac_lo;
>> +} FloatParts128;
>> +
>> +/* These apply to the most significant word of each FloatPartsN. */
>>   #define DECOMPOSED_BINARY_POINT    63
>>   #define DECOMPOSED_IMPLICIT_BIT    (1ull << DECOMPOSED_BINARY_POINT)
>>   
>> @@ -561,11 +568,11 @@ typedef struct {
>>       .exp_bias       = ((1 << E) - 1) >> 1,                           \
>>       .exp_max        = (1 << E) - 1,                                  \
>>       .frac_size      = F,                                             \
>> -    .frac_shift     = DECOMPOSED_BINARY_POINT - F,                   \
>> -    .frac_lsb       = 1ull << (DECOMPOSED_BINARY_POINT - F),         \
>> -    .frac_lsbm1     = 1ull << ((DECOMPOSED_BINARY_POINT - F) - 1),   \
>> -    .round_mask     = (1ull << (DECOMPOSED_BINARY_POINT - F)) - 1,   \
>> -    .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - F)) - 1
>> +    .frac_shift     = (-F - 1) & 63,                                 \
>> +    .frac_lsb       = 1ull << ((-F - 1) & 63),                       \
>> +    .frac_lsbm1     = 1ull << ((-F - 2) & 63),                       \
>> +    .round_mask     = (1ull << ((-F - 1) & 63)) - 1,                 \
>> +    .roundeven_mask = (2ull << ((-F - 1) & 63)) - 1
>>
> 
> I have to admit I find the switch to (-F - 1) & 63 a little black
> magical. Isn't the shift always going to end up a factor of the number
> of exponent bits we need to move past and the natural size of the
> original float?

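Yep.  But now we're looking to compute the number relative to .frac_lo (the
least significant fraction word), rather than the entire logical fraction.
For example, float128 has F = 112, so (-F - 1) & 63 = 15, while for every
format with F < 64 the expression still equals the old 63 - F.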


r~

> 
> Anyway my personal brain twisting aside it obviously works and
> everything else looks fine so:
> 
> Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
>
diff mbox series

Patch

diff --git a/fpu/softfloat.c b/fpu/softfloat.c
index 2d6f61ee7a..073b80d502 100644
--- a/fpu/softfloat.c
+++ b/fpu/softfloat.c
@@ -500,14 +500,12 @@  static inline __attribute__((unused)) bool is_qnan(FloatClass c)
 }
 
 /*
- * Structure holding all of the decomposed parts of a float. The
- * exponent is unbiased and the fraction is normalized. All
- * calculations are done with a 64 bit fraction and then rounded as
- * appropriate for the final format.
+ * Structure holding all of the decomposed parts of a float.
+ * The exponent is unbiased and the fraction is normalized.
  *
- * Thanks to the packed FloatClass a decent compiler should be able to
- * fit the whole structure into registers and avoid using the stack
- * for parameter passing.
+ * The fraction words are stored in big-endian word ordering,
+ * so that truncation from a larger format to a smaller format
+ * can be done simply by ignoring subsequent elements.
  */
 
 typedef struct {
@@ -526,6 +524,15 @@  typedef struct {
     };
 } FloatParts64;
 
+typedef struct {    /* decomposed float with a two-word (128-bit) fraction */
+    FloatClass cls;
+    bool sign;
+    int32_t exp;
+    uint64_t frac_hi;   /* most significant fraction word */
+    uint64_t frac_lo;   /* least significant fraction word */
+} FloatParts128;
+
+/* These apply to the most significant word of each FloatPartsN. */
 #define DECOMPOSED_BINARY_POINT    63
 #define DECOMPOSED_IMPLICIT_BIT    (1ull << DECOMPOSED_BINARY_POINT)
 
@@ -561,11 +568,11 @@  typedef struct {
     .exp_bias       = ((1 << E) - 1) >> 1,                           \
     .exp_max        = (1 << E) - 1,                                  \
     .frac_size      = F,                                             \
-    .frac_shift     = DECOMPOSED_BINARY_POINT - F,                   \
-    .frac_lsb       = 1ull << (DECOMPOSED_BINARY_POINT - F),         \
-    .frac_lsbm1     = 1ull << ((DECOMPOSED_BINARY_POINT - F) - 1),   \
-    .round_mask     = (1ull << (DECOMPOSED_BINARY_POINT - F)) - 1,   \
-    .roundeven_mask = (2ull << (DECOMPOSED_BINARY_POINT - F)) - 1
+    .frac_shift     = (-F - 1) & 63,                                 \
+    .frac_lsb       = 1ull << ((-F - 1) & 63),                       \
+    .frac_lsbm1     = 1ull << ((-F - 2) & 63),                       \
+    .round_mask     = (1ull << ((-F - 1) & 63)) - 1,                 \
+    .roundeven_mask = (2ull << ((-F - 1) & 63)) - 1
 
 static const FloatFmt float16_params = {
     FLOAT_PARAMS(5, 10)
@@ -588,6 +595,10 @@  static const FloatFmt float64_params = {
     FLOAT_PARAMS(11, 52)
 };
 
+static const FloatFmt float128_params = {
+    FLOAT_PARAMS(15, 112)   /* E = 15, F = 112; frac_shift = 15 */
+};
+
 /* Unpack a float to parts, but do not canonicalize.  */
 static void unpack_raw64(FloatParts64 *r, const FloatFmt *fmt, uint64_t raw)
 {
@@ -622,6 +633,20 @@  static inline void float64_unpack_raw(FloatParts64 *p, float64 f)
     unpack_raw64(p, &float64_params, f);
 }
 
+static void float128_unpack_raw(FloatParts128 *p, float128 f)
+{   /* Split f.high / f.low into raw fields; nothing is canonicalized. */
+    const int f_size = float128_params.frac_size - 64;  /* 112 - 64 = 48 */
+    const int e_size = float128_params.exp_size;  /* exponent field width */
+
+    *p = (FloatParts128) {  /* extract64(v, start, len) assumed -- confirm */
+        .cls = float_class_unclassified,
+        .sign = extract64(f.high, f_size + e_size, 1),  /* 1 bit above exp */
+        .exp = extract64(f.high, f_size, e_size),   /* raw exponent field */
+        .frac_hi = extract64(f.high, 0, f_size),    /* low f_size bits */
+        .frac_lo = f.low,                           /* copied whole */
+    };
+}
+
 /* Pack a float from parts, but do not canonicalize.  */
 static uint64_t pack_raw64(const FloatParts64 *p, const FloatFmt *fmt)
 {
@@ -655,6 +680,18 @@  static inline float64 float64_pack_raw(const FloatParts64 *p)
     return make_float64(pack_raw64(p, &float64_params));
 }
 
+static float128 float128_pack_raw(const FloatParts128 *p)
+{   /* Reassemble raw bits from the fields; p->cls is not consulted. */
+    const int f_size = float128_params.frac_size - 64;  /* 112 - 64 = 48 */
+    const int e_size = float128_params.exp_size;  /* exponent field width */
+    uint64_t hi;  /* NOTE(review): deposit64(v, start, len, field) assumed */
+
+    hi = (uint64_t)p->sign << (f_size + e_size);    /* sign above exp */
+    hi = deposit64(hi, f_size, e_size, p->exp);     /* then exp field */
+    hi = deposit64(hi, 0, f_size, p->frac_hi);      /* then frac bits */
+    return make_float128(hi, p->frac_lo);   /* frac_lo passed unchanged */
+}
+
 /*----------------------------------------------------------------------------
 | Functions and definitions to determine:  (1) whether tininess for underflow
 | is detected before or after rounding by default, (2) what (if anything)
@@ -665,8 +702,30 @@  static inline float64 float64_pack_raw(const FloatParts64 *p)
 *----------------------------------------------------------------------------*/
 #include "softfloat-specialize.c.inc"
 
+#define PARTS_GENERIC_64_128(NAME, P) \
+    QEMU_GENERIC(P, (FloatParts128 *, parts128_##NAME), parts64_##NAME)
+
 #define parts_default_nan  parts64_default_nan
-#define parts_silence_nan  parts64_silence_nan
+#define parts_silence_nan(P, S)    PARTS_GENERIC_64_128(silence_nan, P)(P, S)
+
+
+/*
+ * Helper functions for softfloat-parts.c.inc, per-size operations.
+ */
+
+static void frac128_shl(FloatParts128 *a, int c)  /* c: shift count */
+{   /* shift128Left() is external; presumably hi:lo << c -- confirm */
+    shift128Left(a->frac_hi, a->frac_lo, c, &a->frac_hi, &a->frac_lo);
+}   /* outputs point back at a->frac_hi / a->frac_lo (in-place update) */
+
+#define frac_shl(A, C)             frac128_shl(A, C)    /* unsuffixed name */
+
+static void frac128_shr(FloatParts128 *a, int c)  /* c: shift count */
+{   /* shift128Right() is external; presumably hi:lo >> c -- confirm */
+    shift128Right(a->frac_hi, a->frac_lo, c, &a->frac_hi, &a->frac_lo);
+}   /* outputs point back at a->frac_hi / a->frac_lo (in-place update) */
+
+#define frac_shr(A, C)             frac128_shr(A, C)    /* unsuffixed name */
 
 /* Canonicalize EXP and FRAC, setting CLS.  */
 static FloatParts64 sf_canonicalize(FloatParts64 part, const FloatFmt *parm,
@@ -3850,6 +3909,17 @@  bfloat16 bfloat16_silence_nan(bfloat16 a, float_status *status)
     return bfloat16_pack_raw(&p);
 }
 
+float128 float128_silence_nan(float128 a, float_status *status)
+{   /* Unpack, shift up, silence via the parts helper, shift back, repack. */
+    FloatParts128 p;
+
+    float128_unpack_raw(&p, a);                 /* cls = unclassified */
+    frac_shl(&p, float128_params.frac_shift);   /* (-112 - 1) & 63 = 15 */
+    parts_silence_nan(&p, status);  /* presumably parts128_silence_nan */
+    frac_shr(&p, float128_params.frac_shift);   /* undo the shift above */
+    return float128_pack_raw(&p);               /* cls is not packed */
+}
+
 /*----------------------------------------------------------------------------
 | If `a' is denormal and we are in flush-to-zero mode then set the
 | input-denormal exception and return zero. Otherwise just return the value.
diff --git a/fpu/softfloat-specialize.c.inc b/fpu/softfloat-specialize.c.inc
index 2a1bc66633..d892016f0f 100644
--- a/fpu/softfloat-specialize.c.inc
+++ b/fpu/softfloat-specialize.c.inc
@@ -198,6 +198,12 @@  static void parts64_silence_nan(FloatParts64 *p, float_status *status)
     p->cls = float_class_qnan;
 }
 
+static void parts128_silence_nan(FloatParts128 *p, float_status *status)
+{   /* Of the two fraction words, only frac_hi is rewritten here. */
+    p->frac_hi = parts_silence_nan_frac(p->frac_hi, status);
+    p->cls = float_class_qnan;      /* result is classed as a quiet NaN */
+}
+
 /*----------------------------------------------------------------------------
 | The pattern for a default generated extended double-precision NaN.
 *----------------------------------------------------------------------------*/
@@ -1057,25 +1063,6 @@  bool float128_is_signaling_nan(float128 a, float_status *status)
     }
 }
 
-/*----------------------------------------------------------------------------
-| Returns a quiet NaN from a signalling NaN for the quadruple-precision
-| floating point value `a'.
-*----------------------------------------------------------------------------*/
-
-float128 float128_silence_nan(float128 a, float_status *status)
-{
-    if (no_signaling_nans(status)) {
-        g_assert_not_reached();
-    } else {
-        if (snan_bit_is_one(status)) {
-            return float128_default_nan(status);
-        } else {
-            a.high |= UINT64_C(0x0000800000000000);
-            return a;
-        }
-    }
-}
-
 /*----------------------------------------------------------------------------
 | Returns the result of converting the quadruple-precision floating-point NaN
 | `a' to the canonical NaN format.  If `a' is a signaling NaN, the invalid