diff mbox

[CFT] target-ppc: Fix narrow-mode add/sub carry output

Message ID 1364938954-1825-1-git-send-email-rth@twiddle.net
State New
Headers show

Commit Message

Richard Henderson April 2, 2013, 9:42 p.m. UTC
Broken in b5a73f8d8a57e940f9bbeb399a9e47897522ee9a, the carry itself was
fixed in 79482e5ab38a05ca8869040b0d8b8f451f16ff62.  But we still need to
produce the full 64-bit addition.

Cc: Aurelien Jarno <aurelien@aurel32.net>
Signed-off-by: Richard Henderson <rth@twiddle.net>
---
Aurelien, you reported this bug, but I'm unable to reproduce it at the
moment.  That test program you sent along is broken, attempting to run
insns in pages without the execute bit set.  I don't have time to fix
that today.

But per IRC, I think we know what the problem is.  And at least this
runs the linux-user-0.3 ppc binaries properly.  Hopefully it'll solve
the real problem with your guest openssl.


r~
---
 target-ppc/translate.c | 30 ++++++++++++++++++++----------
 1 file changed, 20 insertions(+), 10 deletions(-)

Comments

Aurelien Jarno April 3, 2013, 8 p.m. UTC | #1
Hi,

On Tue, Apr 02, 2013 at 02:42:34PM -0700, Richard Henderson wrote:
> Broken in b5a73f8d8a57e940f9bbeb399a9e47897522ee9a, the carry itself was
> fixed in 79482e5ab38a05ca8869040b0d8b8f451f16ff62.  But we still need to
> produce the full 64-bit addition.
> 
> Cc: Aurelien Jarno <aurelien@aurel32.net>
> Signed-off-by: Richard Henderson <rth@twiddle.net>
> ---
> Aurelien, you reported this bug, but I'm unable to reproduce it at the
> moment.  That test program you sent along is broken, attempting to run

For people not on IRC, this code has been posted to the mailing list
sometimes ago:
http://lists.gnu.org/archive/html/qemu-devel/2012-08/txtdGjJbUgkD2.txt

> insns in pages without the execute bit set.  I don't have time to fix
> that today.

Yes, I just discovered that. Actually it runs fine in qemu-ppc
(linux-user), as well as on G4 machines, both QEMU emulated or real, as
they do not seem to check for the execute bit. I confirm it fails on a
more recent machine with a 64-bit CPU, both QEMU emulated or real, which
means that QEMU is emulating that part correctly at least.

I'll try to fix this.

> But per IRC, I think we know what the problem is.  And at least this
> runs the linux-user-0.3 ppc binaries properly.  Hopefully it'll solve
> the real problem with your guest openssl.

Thanks for this patch. It almost fixes the issue, please see my
comments below.

> r~
> ---
>  target-ppc/translate.c | 30 ++++++++++++++++++++----------
>  1 file changed, 20 insertions(+), 10 deletions(-)
> 
> diff --git a/target-ppc/translate.c b/target-ppc/translate.c
> index 5e741d1..1feadca 100644
> --- a/target-ppc/translate.c
> +++ b/target-ppc/translate.c
> @@ -775,15 +775,19 @@ static inline void gen_op_arith_add(DisasContext *ctx, TCGv ret, TCGv arg1,
>  
>      if (compute_ca) {
>          if (NARROW_MODE(ctx)) {
> +            /* Caution: a non-obvious corner case of the spec is that we
> +               must produce the *entire* 64-bit addition, but produce the
> +               carry into bit 32.  */
>              TCGv t1 = tcg_temp_new();
> -            tcg_gen_ext32u_tl(t1, arg2);
> -            tcg_gen_ext32u_tl(t0, arg1);
> -            tcg_gen_add_tl(t0, t0, t1);
> -            tcg_temp_free(t1);
> +            tcg_gen_xor_tl(t1, arg1, arg2);        /* add without carry */
> +            tcg_gen_add_tl(t0, arg1, arg2);
>              if (add_ca) {
>                  tcg_gen_add_tl(t0, t0, cpu_ca);
>              }
> -            tcg_gen_shri_tl(cpu_ca, t0, 32);
> +            tcg_gen_xor_tl(cpu_ca, t0, t1);        /* bits changed w/ carry */
> +            tcg_temp_free(t1);
> +            tcg_gen_shri_tl(cpu_ca, cpu_ca, 32);   /* extract bit 32 */
> +            tcg_gen_andi_tl(cpu_ca, cpu_ca, 1);
>          } else {
>              TCGv zero = tcg_const_tl(0);
>              if (add_ca) {

This part works fine for me and fixes the problem I reported.

> @@ -1129,17 +1133,23 @@ static inline void gen_op_arith_subf(DisasContext *ctx, TCGv ret, TCGv arg1,
>      if (compute_ca) {
>          /* dest = ~arg1 + arg2 [+ ca].  */
>          if (NARROW_MODE(ctx)) {
> +            /* Caution: a non-obvious corner case of the spec is that we
> +               must produce the *entire* 64-bit addition, but produce the
> +               carry into bit 32.  */
>              TCGv inv1 = tcg_temp_new();
> +            TCGv t1 = tcg_temp_new();
>              tcg_gen_not_tl(inv1, arg1);
> -            tcg_gen_ext32u_tl(t0, arg2);
> -            tcg_gen_ext32u_tl(inv1, inv1);
>              if (add_ca) {
> -                tcg_gen_add_tl(t0, t0, cpu_ca);
> +                tcg_gen_add_tl(t0, arg2, cpu_ca);
>              } else {
> -                tcg_gen_addi_tl(t0, t0, 1);
> +                tcg_gen_addi_tl(t0, arg2, 1);
>              }
> +            tcg_gen_xor_tl(t1, arg2, inv1);         /* add without carry */
>              tcg_gen_add_tl(t0, t0, inv1);
> -            tcg_gen_shri_tl(cpu_ca, t0, 32);
> +            tcg_gen_xor_tl(cpu_ca, t0, t1);         /* bits changes w/ carry */
> +            tcg_temp_free(t1);
> +            tcg_gen_shri_tl(cpu_ca, cpu_ca, 32);    /* extract bit 32 */
> +            tcg_gen_andi_tl(cpu_ca, cpu_ca, 1);

This part actually breaks things. It destroy the value of t0, and then
use it again. The condition a bit upper should be changed:

-    if (compute_ov && (TCGV_EQUAL(ret, arg1) || TCGV_EQUAL(ret, arg2)))  {
+    if ((compute_ov || compute_ca)
+        && (TCGV_EQUAL(ret, arg1) || TCGV_EQUAL(ret, arg2)))  {
         t0 = tcg_temp_new();
     }

That said this code is error prone and doesn't seem terribly useful
since we have an optimization pass on the TCG code. I have tried to run
some code with it and with t0 always allocated. Almost all of the
generated code is unchanged. In the remaining parts a few mov
instructions are shuffled, but the code size doesn't increase.

>          } else if (add_ca) {
>              TCGv zero, inv1 = tcg_temp_new();
>              tcg_gen_not_tl(inv1, arg1);

Aurelien
diff mbox

Patch

diff --git a/target-ppc/translate.c b/target-ppc/translate.c
index 5e741d1..1feadca 100644
--- a/target-ppc/translate.c
+++ b/target-ppc/translate.c
@@ -775,15 +775,19 @@  static inline void gen_op_arith_add(DisasContext *ctx, TCGv ret, TCGv arg1,
 
     if (compute_ca) {
         if (NARROW_MODE(ctx)) {
+            /* Caution: a non-obvious corner case of the spec is that we
+               must produce the *entire* 64-bit addition, but produce the
+               carry into bit 32.  */
             TCGv t1 = tcg_temp_new();
-            tcg_gen_ext32u_tl(t1, arg2);
-            tcg_gen_ext32u_tl(t0, arg1);
-            tcg_gen_add_tl(t0, t0, t1);
-            tcg_temp_free(t1);
+            tcg_gen_xor_tl(t1, arg1, arg2);        /* add without carry */
+            tcg_gen_add_tl(t0, arg1, arg2);
             if (add_ca) {
                 tcg_gen_add_tl(t0, t0, cpu_ca);
             }
-            tcg_gen_shri_tl(cpu_ca, t0, 32);
+            tcg_gen_xor_tl(cpu_ca, t0, t1);        /* bits changed w/ carry */
+            tcg_temp_free(t1);
+            tcg_gen_shri_tl(cpu_ca, cpu_ca, 32);   /* extract bit 32 */
+            tcg_gen_andi_tl(cpu_ca, cpu_ca, 1);
         } else {
             TCGv zero = tcg_const_tl(0);
             if (add_ca) {
@@ -1129,17 +1133,23 @@  static inline void gen_op_arith_subf(DisasContext *ctx, TCGv ret, TCGv arg1,
     if (compute_ca) {
         /* dest = ~arg1 + arg2 [+ ca].  */
         if (NARROW_MODE(ctx)) {
+            /* Caution: a non-obvious corner case of the spec is that we
+               must produce the *entire* 64-bit addition, but produce the
+               carry into bit 32.  */
             TCGv inv1 = tcg_temp_new();
+            TCGv t1 = tcg_temp_new();
             tcg_gen_not_tl(inv1, arg1);
-            tcg_gen_ext32u_tl(t0, arg2);
-            tcg_gen_ext32u_tl(inv1, inv1);
             if (add_ca) {
-                tcg_gen_add_tl(t0, t0, cpu_ca);
+                tcg_gen_add_tl(t0, arg2, cpu_ca);
             } else {
-                tcg_gen_addi_tl(t0, t0, 1);
+                tcg_gen_addi_tl(t0, arg2, 1);
             }
+            tcg_gen_xor_tl(t1, arg2, inv1);         /* add without carry */
             tcg_gen_add_tl(t0, t0, inv1);
-            tcg_gen_shri_tl(cpu_ca, t0, 32);
+            tcg_gen_xor_tl(cpu_ca, t0, t1);         /* bits changes w/ carry */
+            tcg_temp_free(t1);
+            tcg_gen_shri_tl(cpu_ca, cpu_ca, 32);    /* extract bit 32 */
+            tcg_gen_andi_tl(cpu_ca, cpu_ca, 1);
         } else if (add_ca) {
             TCGv zero, inv1 = tcg_temp_new();
             tcg_gen_not_tl(inv1, arg1);