diff mbox series

sh4: mac.w: implement saturation arithmetic logic

Message ID 20240405075404.2122-2-zack@buhman.org
State New
Headers show
Series sh4: mac.w: implement saturation arithmetic logic | expand

Commit Message

Zack Buhman April 5, 2024, 7:53 a.m. UTC
The saturation arithmetic logic in helper_macw is not correct.

I tested and verified this behavior on a SH7091, the general pattern
is a code sequence such as:

	sets

	mov.l _mach,r2
	lds r2,mach
	mov.l _macl,r2
	lds r2,macl

	mova _n,r0
	mov r0,r1
	mova _m,r0
	mac.w @r0+,@r1+

 _mach: .long 0xffffffff
 _macl: .long 0xfffffffe
 _m:    .word 0x0002
        .word 0
 _n:    .word 0x0003
        .word 0

test 0:
  (mach should not be modified if an overflow did not occur)

  given, prior to saturation mac.l:
    mach = 0xffffffff ; macl = 0xfffffffe
    @r0  = 0x0002     ; @r1  = 0x0003

  expected saturation mac.w result:
    mach = 0xffffffff (unchanged)
    macl = 0x00000004

  qemu saturation mac.w result (before this commit):
    mach = 0x00000001
    macl = 0x80000000

  In the context of the helper_macw implementation prior to this
  commit, initially this appears to be a surprising result. This is
  because (prior to unary negation) the C literal `0x80000000` (due to
  being outside the range of a `signed int`) is evaluated as an
  `unsigned int` whereas the literal `1` (due to being inside the
  range of `signed int`) is evaluated as `signed int`, as in:

    static_assert(1 < -0x80000000 == 1);
    static_assert(1 < -1 == 0);

  This is because the unary negation of an unsigned int is an
  unsigned int.

  In other words, if the `res < -0x80000000` comparison used
  infinite-precision literals, the saturation mac.w result would have
  been:

    mach = 0x00000000
    macl = 0x00000004

  Due to this (forgivable) misunderstanding of C literals, the
  following behavior also occurs:

test 1:
  (`2 * 3 + 0` is not an overflow)

  given, prior to saturation mac.l:
    mach = 0x00000000 ; macl = 0x00000000
    @r0  = 0x0002     ; @r1  = 0x0003

  expected saturation mac.w result:
    mach = 0x00000000 (unchanged)
    macl = 0x00000006

  qemu saturation mac.w result (before this commit):
    mach = 0x00000001
    macl = 0x80000000

test 2:
  (mach should not be accumulated in saturation mode)
  (16-bit operands are sign-extended)

  given, prior to saturation mac.l:
    mach = 0x12345678 ; macl = 0x7ffffffe
    @r0  = 0x0002     ; @r1  = 0xfffd

  expected saturation mac.w result:
    mach = 0x12345678 (unchanged)
    macl = 0x7ffffff8

  qemu saturation mac.w result (before this commit):
    mach = 0x00000001
    macl = 0x7fffffff

test 3:
  (macl should have the correct saturation value)

  given, prior to saturation mac.l:
    mach = 0xabcdef12 ; macl = 0x7ffffffa
    @r0  = 0x0002     ; @r1  = 0x0003

  expected saturation mac.w result:
    mach = 0x00000001 (overwritten)
    macl = 0x7fffffff

  qemu saturation mac.w result (before this commit):
    mach = 0x00000001
    macl = 0x80000000

All of the above also matches the description of MAC.W as documented
in cd00147165-sh-4-32-bit-cpu-core-architecture-stmicroelectronics.pdf

Signed-off-by: Zack Buhman <zack@buhman.org>
---
 target/sh4/op_helper.c | 41 +++++++++++++++++++++++++++++++----------
 1 file changed, 31 insertions(+), 10 deletions(-)

Comments

Peter Maydell April 5, 2024, 2:55 p.m. UTC | #1
On Fri, 5 Apr 2024 at 08:55, Zack Buhman <zack@buhman.org> wrote:
>
> The saturation arithmetic logic in helper_macw is not correct.
>
> I tested and verified this behavior on a SH7091, the general pattern
> is a code sequence such as:
>
>         sets
>
>         mov.l _mach,r2
>         lds r2,mach
>         mov.l _macl,r2
>         lds r2,macl
>
>         mova _n,r0
>         mov r0,r1
>         mova _m,r0
>         mac.w @r0+,@r1+
>
>  _mach: .long 0xffffffff
>  _macl: .long 0xfffffffe
>  _m:    .word 0x0002
>         .word 0
>  _n:    .word 0x0003
>         .word 0
>
> test 0:
>   (mach should not be modified if an overflow did not occur)
>
>   given, prior to saturation mac.l:
>     mach = 0xffffffff ; macl = 0xfffffffe
>     @r0  = 0x0002     ; @r1  = 0x0003
>
>   expected saturation mac.w result:
>     mach = 0xffffffff (unchanged)
>     macl = 0x00000004
>
>   qemu saturation mac.w result (before this commit):
>     mach = 0x00000001
>     macl = 0x80000000
>
>   In the context of the helper_macw implementation prior to this
>   commit, initially this appears to be a surprising result. This is
>   because (prior to unary negation) the C literal `0x80000000` (due to
>   being outside the range of a `signed int`) is evaluated as an
>   `unsigned int` whereas the literal `1` (due to being inside the
>   range of `signed int`) is evaluated as `signed int`, as in:
>
>     static_assert(1 < -0x80000000 == 1);
>     static_assert(1 < -1 == 0);
>
>   This is because the unary negation of an unsigned int is an
>   unsigned int.

So we could also fix this by getting the C literals right
so that they are correctly the signed 64 bit values that
the author intended, right?

>   In other words, if the `res < -0x80000000` comparison used
>   infinite-precision literals, the saturation mac.w result would have
>   been:
>
>     mach = 0x00000000
>     macl = 0x00000004
>
>   Due to this (forgivable) misunderstanding of C literals, the
>   following behavior also occurs:
>
> test 1:
>   (`2 * 3 + 0` is not an overflow)
>
>   given, prior to saturation mac.l:
>     mach = 0x00000000 ; macl = 0x00000000
>     @r0  = 0x0002     ; @r1  = 0x0003
>
>   expected saturation mac.w result:
>     mach = 0x00000000 (unchanged)
>     macl = 0x00000006
>
>   qemu saturation mac.w result (before this commit):
>     mach = 0x00000001
>     macl = 0x80000000
>
> test 2:
>   (mach should not be accumulated in saturation mode)
>   (16-bit operands are sign-extended)
>
>   given, prior to saturation mac.l:
>     mach = 0x12345678 ; macl = 0x7ffffffe
>     @r0  = 0x0002     ; @r1  = 0xfffd
>
>   expected saturation mac.w result:
>     mach = 0x12345678 (unchanged)
>     macl = 0x7ffffff8
>
>   qemu saturation mac.w result (before this commit):
>     mach = 0x00000001
>     macl = 0x7fffffff
>
> test 3:
>   (macl should have the correct saturation value)
>
>   given, prior to saturation mac.l:
>     mach = 0xabcdef12 ; macl = 0x7ffffffa
>     @r0  = 0x0002     ; @r1  = 0x0003
>
>   expected saturation mac.w result:
>     mach = 0x00000001 (overwritten)
>     macl = 0x7fffffff
>
>   qemu saturation mac.w result (before this commit):
>     mach = 0x00000001
>     macl = 0x80000000
>
> All of the above also matches the description of MAC.W as documented
> in cd00147165-sh-4-32-bit-cpu-core-architecture-stmicroelectronics.pdf
>
> Signed-off-by: Zack Buhman <zack@buhman.org>
> ---
>  target/sh4/op_helper.c | 41 +++++++++++++++++++++++++++++++----------
>  1 file changed, 31 insertions(+), 10 deletions(-)
>
> diff --git a/target/sh4/op_helper.c b/target/sh4/op_helper.c
> index ee16524083..b3c1e69f53 100644
> --- a/target/sh4/op_helper.c
> +++ b/target/sh4/op_helper.c
> @@ -187,20 +187,41 @@ void helper_macl(CPUSH4State *env, uint32_t arg0, uint32_t arg1)
>
>  void helper_macw(CPUSH4State *env, uint32_t arg0, uint32_t arg1)
>  {
> -    int64_t res;
> +    int16_t value0 = (int16_t)arg0;
> +    int16_t value1 = (int16_t)arg1;
> +    int32_t mul = ((int32_t)value0) * ((int32_t)value1);
>
> -    res = ((uint64_t) env->mach << 32) | env->macl;
> -    res += (int64_t) (int16_t) arg0 *(int64_t) (int16_t) arg1;
> -    env->mach = (res >> 32) & 0xffffffff;
> -    env->macl = res & 0xffffffff;
> +    /* Perform 32-bit saturation arithmetic if the S flag is set */
>      if (env->sr & (1u << SR_S)) {
> -        if (res < -0x80000000) {
> -            env->mach = 1;
> -            env->macl = 0x80000000;
> -        } else if (res > 0x000000007fffffff) {
> +        const int64_t upper_bound =  ((1ul << 31) - 1);
> +        const int64_t lower_bound = -((1ul << 31) - 0);

UL is usually the wrong suffix to use (and more generally,
in QEMU the "long" type is rarely the right type, because
it might be either 32 or 64 bits depending on the host).
Either we know the value fits in 32 bits and we want a 32-bit
type, in which case U is sufficient, or we want a 64-bit type,
in which case we need ULL.

> +
> +        /*
> +         * In saturation arithmetic mode, the accumulator is 32-bit
> +         * with carry. MACH is not considered during the addition
> +         * operation nor the 32-bit saturation logic.
> +         */
> +        int32_t mac = env->macl;
> +        int32_t result;
> +        bool overflow = sadd32_overflow(mac, mul, &result);
> +        if (overflow) {
> +            result = (mac < 0) ? lower_bound : upper_bound;
> +            /* MACH is set to 1 to denote overflow */
> +            env->macl = result;
>              env->mach = 1;
> -            env->macl = 0x7fffffff;
> +        } else {
> +            result = MIN(MAX(result, lower_bound), upper_bound);

Maybe I'm confused, but result is an int32_t, so when can it
be lower than lower_bound or higher than upper_bound ?

> +            /* If there was no overflow, MACH is unchanged */
> +            env->macl = result;
>          }
> +    } else {
> +        /* In non-saturation arithmetic mode, the accumulator is 64-bit */
> +        int64_t mac = (((uint64_t)env->mach) << 32) | env->macl;
> +
> +        /* The carry bit of the 64-bit addition is discarded */
> +        int64_t result = mac + (int64_t)mul;
> +        env->macl = result;
> +        env->mach = result >> 32;
>      }
>  }
>

thanks
-- PMM
Zack Buhman April 5, 2024, 11:19 p.m. UTC | #2
Peter Maydell <peter.maydell@linaro.org> writes:

> On Fri, 5 Apr 2024 at 08:55, Zack Buhman <zack@buhman.org> wrote:
>>
>> The saturation arithmetic logic in helper_macw is not correct.
>>
>> I tested and verified this behavior on a SH7091, the general pattern
>> is a code sequence such as:
>>
>>         sets
>>
>>         mov.l _mach,r2
>>         lds r2,mach
>>         mov.l _macl,r2
>>         lds r2,macl
>>
>>         mova _n,r0
>>         mov r0,r1
>>         mova _m,r0
>>         mac.w @r0+,@r1+
>>
>>  _mach: .long 0xffffffff
>>  _macl: .long 0xfffffffe
>>  _m:    .word 0x0002
>>         .word 0
>>  _n:    .word 0x0003
>>         .word 0
>>
>> test 0:
>>   (mach should not be modified if an overflow did not occur)
>>
>>   given, prior to saturation mac.l:
>>     mach = 0xffffffff ; macl = 0xfffffffe
>>     @r0  = 0x0002     ; @r1  = 0x0003
>>
>>   expected saturation mac.w result:
>>     mach = 0xffffffff (unchanged)
>>     macl = 0x00000004
>>
>>   qemu saturation mac.w result (before this commit):
>>     mach = 0x00000001
>>     macl = 0x80000000
>>
>>   In the context of the helper_macw implementation prior to this
>>   commit, initially this appears to be a surprising result. This is
>>   because (prior to unary negation) the C literal `0x80000000` (due to
>>   being outside the range of a `signed int`) is evaluated as an
>>   `unsigned int` whereas the literal `1` (due to being inside the
>>   range of `signed int`) is evaluated as `signed int`, as in:
>>
>>     static_assert(1 < -0x80000000 == 1);
>>     static_assert(1 < -1 == 0);
>>
>>   This is because the unary negation of an unsigned int is an
>>   unsigned int.
>
> So we could also fix this by getting the C literals right
> so that they are correctly the signed 64 bit values that
> the author intended, right?

Making -0x80000000 signed as intended, as a standalone change without
the other aspects of this patch, would not yield a completely correct
mac.w implementation. For example, in saturation mode, the following
logic does not exist prior to this patch:

- MACH must not be overwritten if an overflow did not occur

- MACH must not be included in the addition/accumulation operation (it
  is simply a 32-bit + 32-bit = 33-bit addition of MACL and the
  multiplication result)

>
>>   In other words, if the `res < -0x80000000` comparison used
>>   infinite-precision literals, the saturation mac.w result would have
>>   been:
>>
>>     mach = 0x00000000
>>     macl = 0x00000004
>>
>>   Due to this (forgivable) misunderstanding of C literals, the
>>   following behavior also occurs:
>>
>> test 1:
>>   (`2 * 3 + 0` is not an overflow)
>>
>>   given, prior to saturation mac.l:
>>     mach = 0x00000000 ; macl = 0x00000000
>>     @r0  = 0x0002     ; @r1  = 0x0003
>>
>>   expected saturation mac.w result:
>>     mach = 0x00000000 (unchanged)
>>     macl = 0x00000006
>>
>>   qemu saturation mac.w result (before this commit):
>>     mach = 0x00000001
>>     macl = 0x80000000
>>
>> test 2:
>>   (mach should not be accumulated in saturation mode)
>>   (16-bit operands are sign-extended)
>>
>>   given, prior to saturation mac.l:
>>     mach = 0x12345678 ; macl = 0x7ffffffe
>>     @r0  = 0x0002     ; @r1  = 0xfffd
>>
>>   expected saturation mac.w result:
>>     mach = 0x12345678 (unchanged)
>>     macl = 0x7ffffff8
>>
>>   qemu saturation mac.w result (before this commit):
>>     mach = 0x00000001
>>     macl = 0x7fffffff
>>
>> test 3:
>>   (macl should have the correct saturation value)
>>
>>   given, prior to saturation mac.l:
>>     mach = 0xabcdef12 ; macl = 0x7ffffffa
>>     @r0  = 0x0002     ; @r1  = 0x0003
>>
>>   expected saturation mac.w result:
>>     mach = 0x00000001 (overwritten)
>>     macl = 0x7fffffff
>>
>>   qemu saturation mac.w result (before this commit):
>>     mach = 0x00000001
>>     macl = 0x80000000
>>
>> All of the above also matches the description of MAC.W as documented
>> in cd00147165-sh-4-32-bit-cpu-core-architecture-stmicroelectronics.pdf
>>
>> Signed-off-by: Zack Buhman <zack@buhman.org>
>> ---
>>  target/sh4/op_helper.c | 41 +++++++++++++++++++++++++++++++----------
>>  1 file changed, 31 insertions(+), 10 deletions(-)
>>
>> diff --git a/target/sh4/op_helper.c b/target/sh4/op_helper.c
>> index ee16524083..b3c1e69f53 100644
>> --- a/target/sh4/op_helper.c
>> +++ b/target/sh4/op_helper.c
>> @@ -187,20 +187,41 @@ void helper_macl(CPUSH4State *env, uint32_t arg0, uint32_t arg1)
>>
>>  void helper_macw(CPUSH4State *env, uint32_t arg0, uint32_t arg1)
>>  {
>> -    int64_t res;
>> +    int16_t value0 = (int16_t)arg0;
>> +    int16_t value1 = (int16_t)arg1;
>> +    int32_t mul = ((int32_t)value0) * ((int32_t)value1);
>>
>> -    res = ((uint64_t) env->mach << 32) | env->macl;
>> -    res += (int64_t) (int16_t) arg0 *(int64_t) (int16_t) arg1;
>> -    env->mach = (res >> 32) & 0xffffffff;
>> -    env->macl = res & 0xffffffff;
>> +    /* Perform 32-bit saturation arithmetic if the S flag is set */
>>      if (env->sr & (1u << SR_S)) {
>> -        if (res < -0x80000000) {
>> -            env->mach = 1;
>> -            env->macl = 0x80000000;
>> -        } else if (res > 0x000000007fffffff) {
>> +        const int64_t upper_bound =  ((1ul << 31) - 1);
>> +        const int64_t lower_bound = -((1ul << 31) - 0);
>
> UL is usually the wrong suffix to use (and more generally,
> in QEMU the "long" type is rarely the right type, because
> it might be either 32 or 64 bits depending on the host).
> Either we know the value fits in 32 bits and we want a 32-bit
> type, in which case U is sufficient, or we want a 64-bit type,
> in which case we need ULL.
>
>> +
>> +        /*
>> +         * In saturation arithmetic mode, the accumulator is 32-bit
>> +         * with carry. MACH is not considered during the addition
>> +         * operation nor the 32-bit saturation logic.
>> +         */
>> +        int32_t mac = env->macl;
>> +        int32_t result;
>> +        bool overflow = sadd32_overflow(mac, mul, &result);
>> +        if (overflow) {
>> +            result = (mac < 0) ? lower_bound : upper_bound;
>> +            /* MACH is set to 1 to denote overflow */
>> +            env->macl = result;
>>              env->mach = 1;
>> -            env->macl = 0x7fffffff;
>> +        } else {
>> +            result = MIN(MAX(result, lower_bound), upper_bound);
>
> Maybe I'm confused, but result is an int32_t, so when can it
> be lower than lower_bound or higher than upper_bound ?

Correct, this MIN(MAX(...)) is a no-operation in this case. I will send
[PATCH v2] with this removed and your other suggestions included.

>
>> +            /* If there was no overflow, MACH is unchanged */
>> +            env->macl = result;
>>          }
>> +    } else {
>> +        /* In non-saturation arithmetic mode, the accumulator is 64-bit */
>> +        int64_t mac = (((uint64_t)env->mach) << 32) | env->macl;
>> +
>> +        /* The carry bit of the 64-bit addition is discarded */
>> +        int64_t result = mac + (int64_t)mul;
>> +        env->macl = result;
>> +        env->mach = result >> 32;
>>      }
>>  }
>>
>
> thanks
> -- PMM
diff mbox series

Patch

diff --git a/target/sh4/op_helper.c b/target/sh4/op_helper.c
index ee16524083..b3c1e69f53 100644
--- a/target/sh4/op_helper.c
+++ b/target/sh4/op_helper.c
@@ -187,20 +187,41 @@  void helper_macl(CPUSH4State *env, uint32_t arg0, uint32_t arg1)
 
 void helper_macw(CPUSH4State *env, uint32_t arg0, uint32_t arg1)
 {
-    int64_t res;
+    int16_t value0 = (int16_t)arg0;
+    int16_t value1 = (int16_t)arg1;
+    int32_t mul = ((int32_t)value0) * ((int32_t)value1);
 
-    res = ((uint64_t) env->mach << 32) | env->macl;
-    res += (int64_t) (int16_t) arg0 *(int64_t) (int16_t) arg1;
-    env->mach = (res >> 32) & 0xffffffff;
-    env->macl = res & 0xffffffff;
+    /* Perform 32-bit saturation arithmetic if the S flag is set */
     if (env->sr & (1u << SR_S)) {
-        if (res < -0x80000000) {
-            env->mach = 1;
-            env->macl = 0x80000000;
-        } else if (res > 0x000000007fffffff) {
+        const int64_t upper_bound =  ((1ul << 31) - 1);
+        const int64_t lower_bound = -((1ul << 31) - 0);
+
+        /*
+         * In saturation arithmetic mode, the accumulator is 32-bit
+         * with carry. MACH is not considered during the addition
+         * operation nor the 32-bit saturation logic.
+         */
+        int32_t mac = env->macl;
+        int32_t result;
+        bool overflow = sadd32_overflow(mac, mul, &result);
+        if (overflow) {
+            result = (mac < 0) ? lower_bound : upper_bound;
+            /* MACH is set to 1 to denote overflow */
+            env->macl = result;
             env->mach = 1;
-            env->macl = 0x7fffffff;
+        } else {
+            result = MIN(MAX(result, lower_bound), upper_bound);
+            /* If there was no overflow, MACH is unchanged */
+            env->macl = result;
         }
+    } else {
+        /* In non-saturation arithmetic mode, the accumulator is 64-bit */
+        int64_t mac = (((uint64_t)env->mach) << 32) | env->macl;
+
+        /* The carry bit of the 64-bit addition is discarded */
+        int64_t result = mac + (int64_t)mul;
+        env->macl = result;
+        env->mach = result >> 32;
     }
 }