diff mbox series

[v2,06/14] tcg/{i386, s390x}: Add earlyclobber to the op_add2's first output

Message ID 20230719094620.363206-7-iii@linux.ibm.com
State New
Headers show
Series target/s390x: Miscellaneous TCG fixes, part 2 | expand

Commit Message

Ilya Leoshkevich July 19, 2023, 9:44 a.m. UTC
i386 and s390x implementations of op_add2 require an earlyclobber,
which is currently missing. This breaks VCKSM in s390x guests. E.g., on
x86_64 the following op:

    add2_i32 tmp2,tmp3,tmp2,tmp3,tmp3,tmp2   dead: 0 2 3 4 5  pref=none,0xffff

is translated to:

    addl     %ebx, %r12d
    adcl     %r12d, %ebx

Introduce a new C_N1_O1_I4 constraint, and make sure that earlyclobber
of aliased outputs is honored.

Cc: qemu-stable@nongnu.org
Fixes: 82790a870992 ("tcg: Add markup for output requires new register")
Signed-off-by: Ilya Leoshkevich <iii@linux.ibm.com>
---
 tcg/i386/tcg-target-con-set.h  | 2 +-
 tcg/i386/tcg-target.c.inc      | 2 +-
 tcg/s390x/tcg-target-con-set.h | 5 ++---
 tcg/s390x/tcg-target.c.inc     | 4 ++--
 tcg/tcg.c                      | 8 +++++++-
 5 files changed, 13 insertions(+), 8 deletions(-)

Comments

Philippe Mathieu-Daudé July 19, 2023, 12:08 p.m. UTC | #1
Hi Ilya,

On 19/7/23 11:44, Ilya Leoshkevich wrote:
> i386 and s390x implementations of op_add2 require an earlyclobber,
> which is currently missing. This breaks VCKSM in s390x guests. E.g., on
> x86_64 the following op:
> 
>      add2_i32 tmp2,tmp3,tmp2,tmp3,tmp3,tmp2   dead: 0 2 3 4 5  pref=none,0xffff
> 
> is translated to:
> 
>      addl     %ebx, %r12d
>      adcl     %r12d, %ebx
> 
> Introduce a new C_N1_O1_I4 constraint, and make sure that earlyclobber
> of aliased outputs is honored.
> 
> Cc: qemu-stable@nongnu.org
> Fixes: 82790a870992 ("tcg: Add markup for output requires new register")
> Signed-off-by: Ilya Leoshkevich <iii@linux.ibm.com>
> ---
>   tcg/i386/tcg-target-con-set.h  | 2 +-
>   tcg/i386/tcg-target.c.inc      | 2 +-
>   tcg/s390x/tcg-target-con-set.h | 5 ++---
>   tcg/s390x/tcg-target.c.inc     | 4 ++--
>   tcg/tcg.c                      | 8 +++++++-
>   5 files changed, 13 insertions(+), 8 deletions(-)


> diff --git a/tcg/tcg.c b/tcg/tcg.c
> index 652e8ea6b93..ddfe9a96cb7 100644
> --- a/tcg/tcg.c
> +++ b/tcg/tcg.c
> @@ -648,6 +648,7 @@ static void tcg_out_movext3(TCGContext *s, const TCGMovExtend *i1,
>   #define C_O2_I2(O1, O2, I1, I2)         C_PFX4(c_o2_i2_, O1, O2, I1, I2),
>   #define C_O2_I3(O1, O2, I1, I2, I3)     C_PFX5(c_o2_i3_, O1, O2, I1, I2, I3),
>   #define C_O2_I4(O1, O2, I1, I2, I3, I4) C_PFX6(c_o2_i4_, O1, O2, I1, I2, I3, I4),
> +#define C_N1_O1_I4(O1, O2, I1, I2, I3, I4) C_PFX6(c_n1_o1_i4_, O1, O2, I1, I2, I3, I4),

No need for O2. Also can you place it earlier just after C_N1_I2?

-- >8 --
@@ -643,6 +643,7 @@ static void tcg_out_movext3(TCGContext *s, const 
TCGMovExtend *i1,
  #define C_O1_I4(O1, I1, I2, I3, I4)     C_PFX5(c_o1_i4_, O1, I1, I2, 
I3, I4),

  #define C_N1_I2(O1, I1, I2)             C_PFX3(c_n1_i2_, O1, I1, I2),
+#define C_N1_O1_I4(O1, I1, I2, I3, I4)  C_PFX5(c_n1_o1_i4_, O1, I1, I2, 
I3, I4),

  #define C_O2_I1(O1, O2, I1)             C_PFX3(c_o2_i1_, O1, O2, I1),
  #define C_O2_I2(O1, O2, I1, I2)         C_PFX4(c_o2_i2_, O1, O2, I1, I2),
---

Thanks,

Phil.
Ilya Leoshkevich July 19, 2023, 12:30 p.m. UTC | #2
On Wed, 2023-07-19 at 14:08 +0200, Philippe Mathieu-Daudé wrote:
> Hi Ilya,
> 
> On 19/7/23 11:44, Ilya Leoshkevich wrote:
> > i386 and s390x implementations of op_add2 require an earlyclobber,
> > which is currently missing. This breaks VCKSM in s390x guests.
> > E.g., on
> > x86_64 the following op:
> > 
> >      add2_i32 tmp2,tmp3,tmp2,tmp3,tmp3,tmp2   dead: 0 2 3 4 5 
> > pref=none,0xffff
> > 
> > is translated to:
> > 
> >      addl     %ebx, %r12d
> >      adcl     %r12d, %ebx
> > 
> > Introduce a new C_N1_O1_I4 constraint, and make sure that
> > earlyclobber
> > of aliased outputs is honored.
> > 
> > Cc: qemu-stable@nongnu.org
> > Fixes: 82790a870992 ("tcg: Add markup for output requires new
> > register")
> > Signed-off-by: Ilya Leoshkevich <iii@linux.ibm.com>
> > ---
> >   tcg/i386/tcg-target-con-set.h  | 2 +-
> >   tcg/i386/tcg-target.c.inc      | 2 +-
> >   tcg/s390x/tcg-target-con-set.h | 5 ++---
> >   tcg/s390x/tcg-target.c.inc     | 4 ++--
> >   tcg/tcg.c                      | 8 +++++++-
> >   5 files changed, 13 insertions(+), 8 deletions(-)
> 
> 
> > diff --git a/tcg/tcg.c b/tcg/tcg.c
> > index 652e8ea6b93..ddfe9a96cb7 100644
> > --- a/tcg/tcg.c
> > +++ b/tcg/tcg.c
> > @@ -648,6 +648,7 @@ static void tcg_out_movext3(TCGContext *s,
> > const TCGMovExtend *i1,
> >   #define C_O2_I2(O1, O2, I1, I2)         C_PFX4(c_o2_i2_, O1, O2,
> > I1, I2),
> >   #define C_O2_I3(O1, O2, I1, I2, I3)     C_PFX5(c_o2_i3_, O1, O2,
> > I1, I2, I3),
> >   #define C_O2_I4(O1, O2, I1, I2, I3, I4) C_PFX6(c_o2_i4_, O1, O2,
> > I1, I2, I3, I4),
> > +#define C_N1_O1_I4(O1, O2, I1, I2, I3, I4) C_PFX6(c_n1_o1_i4_, O1,
> > O2, I1, I2, I3, I4),
> 
> No need for O2. Also can you place it earlier just after C_N1_I2?

Shouldn't it still be a 6-argument constraint?
INDEX_op_add2_i32 and friends take 6 arguments after all.

> -- >8 --
> @@ -643,6 +643,7 @@ static void tcg_out_movext3(TCGContext *s, const 
> TCGMovExtend *i1,
>   #define C_O1_I4(O1, I1, I2, I3, I4)     C_PFX5(c_o1_i4_, O1, I1,
> I2, 
> I3, I4),
> 
>   #define C_N1_I2(O1, I1, I2)             C_PFX3(c_n1_i2_, O1, I1,
> I2),
> +#define C_N1_O1_I4(O1, I1, I2, I3, I4)  C_PFX5(c_n1_o1_i4_, O1, I1,
> I2, 
> I3, I4),
> 
>   #define C_O2_I1(O1, O2, I1)             C_PFX3(c_o2_i1_, O1, O2,
> I1),
>   #define C_O2_I2(O1, O2, I1, I2)         C_PFX4(c_o2_i2_, O1, O2,
> I1, I2),
> ---
> 
> Thanks,
> 
> Phil.
Philippe Mathieu-Daudé July 19, 2023, 12:42 p.m. UTC | #3
On 19/7/23 14:30, Ilya Leoshkevich wrote:
> On Wed, 2023-07-19 at 14:08 +0200, Philippe Mathieu-Daudé wrote:
>> Hi Ilya,
>>
>> On 19/7/23 11:44, Ilya Leoshkevich wrote:
>>> i386 and s390x implementations of op_add2 require an earlyclobber,
>>> which is currently missing. This breaks VCKSM in s390x guests.
>>> E.g., on
>>> x86_64 the following op:
>>>
>>>       add2_i32 tmp2,tmp3,tmp2,tmp3,tmp3,tmp2   dead: 0 2 3 4 5
>>> pref=none,0xffff
>>>
>>> is translated to:
>>>
>>>       addl     %ebx, %r12d
>>>       adcl     %r12d, %ebx
>>>
>>> Introduce a new C_N1_O1_I4 constraint, and make sure that
>>> earlyclobber
>>> of aliased outputs is honored.
>>>
>>> Cc: qemu-stable@nongnu.org
>>> Fixes: 82790a870992 ("tcg: Add markup for output requires new
>>> register")
>>> Signed-off-by: Ilya Leoshkevich <iii@linux.ibm.com>
>>> ---
>>>    tcg/i386/tcg-target-con-set.h  | 2 +-
>>>    tcg/i386/tcg-target.c.inc      | 2 +-
>>>    tcg/s390x/tcg-target-con-set.h | 5 ++---
>>>    tcg/s390x/tcg-target.c.inc     | 4 ++--
>>>    tcg/tcg.c                      | 8 +++++++-
>>>    5 files changed, 13 insertions(+), 8 deletions(-)
>>
>>
>>> diff --git a/tcg/tcg.c b/tcg/tcg.c
>>> index 652e8ea6b93..ddfe9a96cb7 100644
>>> --- a/tcg/tcg.c
>>> +++ b/tcg/tcg.c
>>> @@ -648,6 +648,7 @@ static void tcg_out_movext3(TCGContext *s,
>>> const TCGMovExtend *i1,
>>>    #define C_O2_I2(O1, O2, I1, I2)         C_PFX4(c_o2_i2_, O1, O2,
>>> I1, I2),
>>>    #define C_O2_I3(O1, O2, I1, I2, I3)     C_PFX5(c_o2_i3_, O1, O2,
>>> I1, I2, I3),
>>>    #define C_O2_I4(O1, O2, I1, I2, I3, I4) C_PFX6(c_o2_i4_, O1, O2,
>>> I1, I2, I3, I4),
>>> +#define C_N1_O1_I4(O1, O2, I1, I2, I3, I4) C_PFX6(c_n1_o1_i4_, O1,
>>> O2, I1, I2, I3, I4),
>>
>> No need for O2. Also can you place it earlier just after C_N1_I2?
> 
> Shouldn't it still be a 6-argument constraint?
> INDEX_op_add2_i32 and friends take 6 arguments after all.

Oops sorry, you are right.
Philippe Mathieu-Daudé July 19, 2023, 2:04 p.m. UTC | #4
On 19/7/23 11:44, Ilya Leoshkevich wrote:
> i386 and s390x implementations of op_add2 require an earlyclobber,
> which is currently missing. This breaks VCKSM in s390x guests. E.g., on
> x86_64 the following op:
> 
>      add2_i32 tmp2,tmp3,tmp2,tmp3,tmp3,tmp2   dead: 0 2 3 4 5  pref=none,0xffff
> 
> is translated to:
> 
>      addl     %ebx, %r12d
>      adcl     %r12d, %ebx
> 
> Introduce a new C_N1_O1_I4 constraint, and make sure that earlyclobber
> of aliased outputs is honored.
> 
> Cc: qemu-stable@nongnu.org
> Fixes: 82790a870992 ("tcg: Add markup for output requires new register")
> Signed-off-by: Ilya Leoshkevich <iii@linux.ibm.com>
> ---
>   tcg/i386/tcg-target-con-set.h  | 2 +-
>   tcg/i386/tcg-target.c.inc      | 2 +-
>   tcg/s390x/tcg-target-con-set.h | 5 ++---
>   tcg/s390x/tcg-target.c.inc     | 4 ++--
>   tcg/tcg.c                      | 8 +++++++-
>   5 files changed, 13 insertions(+), 8 deletions(-)
> 
> diff --git a/tcg/i386/tcg-target-con-set.h b/tcg/i386/tcg-target-con-set.h
> index 91ceb0e1da2..cb4b25263e9 100644
> --- a/tcg/i386/tcg-target-con-set.h
> +++ b/tcg/i386/tcg-target-con-set.h
> @@ -53,4 +53,4 @@ C_O2_I1(r, r, L)
>   C_O2_I2(a, d, a, r)
>   C_O2_I2(r, r, L, L)
>   C_O2_I3(a, d, 0, 1, r)
> -C_O2_I4(r, r, 0, 1, re, re)
> +C_N1_O1_I4(r, r, 0, 1, re, re)

While here, please update the comment in header. Although the
description is not arch-specific; include/tcg/tcg.h or tcg/tcg.c
could hold it.

The patch LGTM but I don't feel confident enough to add a R-b tag.
If it still miss a R-b tag in a pair of days I'll have a deeper
look at it.

Regards,

Phil.
diff mbox series

Patch

diff --git a/tcg/i386/tcg-target-con-set.h b/tcg/i386/tcg-target-con-set.h
index 91ceb0e1da2..cb4b25263e9 100644
--- a/tcg/i386/tcg-target-con-set.h
+++ b/tcg/i386/tcg-target-con-set.h
@@ -53,4 +53,4 @@  C_O2_I1(r, r, L)
 C_O2_I2(a, d, a, r)
 C_O2_I2(r, r, L, L)
 C_O2_I3(a, d, 0, 1, r)
-C_O2_I4(r, r, 0, 1, re, re)
+C_N1_O1_I4(r, r, 0, 1, re, re)
diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc
index ab997b5fb39..77482da0709 100644
--- a/tcg/i386/tcg-target.c.inc
+++ b/tcg/i386/tcg-target.c.inc
@@ -3335,7 +3335,7 @@  static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
     case INDEX_op_add2_i64:
     case INDEX_op_sub2_i32:
     case INDEX_op_sub2_i64:
-        return C_O2_I4(r, r, 0, 1, re, re);
+        return C_N1_O1_I4(r, r, 0, 1, re, re);
 
     case INDEX_op_ctz_i32:
     case INDEX_op_ctz_i64:
diff --git a/tcg/s390x/tcg-target-con-set.h b/tcg/s390x/tcg-target-con-set.h
index cbad91b2b56..ce779e8b44a 100644
--- a/tcg/s390x/tcg-target-con-set.h
+++ b/tcg/s390x/tcg-target-con-set.h
@@ -41,6 +41,5 @@  C_O2_I1(o, m, r)
 C_O2_I2(o, m, 0, r)
 C_O2_I2(o, m, r, r)
 C_O2_I3(o, m, 0, 1, r)
-C_O2_I4(r, r, 0, 1, rA, r)
-C_O2_I4(r, r, 0, 1, ri, r)
-C_O2_I4(r, r, 0, 1, r, r)
+C_N1_O1_I4(r, r, 0, 1, ri, r)
+C_N1_O1_I4(r, r, 0, 1, rA, r)
diff --git a/tcg/s390x/tcg-target.c.inc b/tcg/s390x/tcg-target.c.inc
index a878acd8ca6..a94f7908d64 100644
--- a/tcg/s390x/tcg-target.c.inc
+++ b/tcg/s390x/tcg-target.c.inc
@@ -3229,11 +3229,11 @@  static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
 
     case INDEX_op_add2_i32:
     case INDEX_op_sub2_i32:
-        return C_O2_I4(r, r, 0, 1, ri, r);
+        return C_N1_O1_I4(r, r, 0, 1, ri, r);
 
     case INDEX_op_add2_i64:
     case INDEX_op_sub2_i64:
-        return C_O2_I4(r, r, 0, 1, rA, r);
+        return C_N1_O1_I4(r, r, 0, 1, rA, r);
 
     case INDEX_op_st_vec:
         return C_O0_I2(v, r);
diff --git a/tcg/tcg.c b/tcg/tcg.c
index 652e8ea6b93..ddfe9a96cb7 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -648,6 +648,7 @@  static void tcg_out_movext3(TCGContext *s, const TCGMovExtend *i1,
 #define C_O2_I2(O1, O2, I1, I2)         C_PFX4(c_o2_i2_, O1, O2, I1, I2),
 #define C_O2_I3(O1, O2, I1, I2, I3)     C_PFX5(c_o2_i3_, O1, O2, I1, I2, I3),
 #define C_O2_I4(O1, O2, I1, I2, I3, I4) C_PFX6(c_o2_i4_, O1, O2, I1, I2, I3, I4),
+#define C_N1_O1_I4(O1, O2, I1, I2, I3, I4) C_PFX6(c_n1_o1_i4_, O1, O2, I1, I2, I3, I4),
 
 typedef enum {
 #include "tcg-target-con-set.h"
@@ -668,6 +669,7 @@  static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode);
 #undef C_O2_I2
 #undef C_O2_I3
 #undef C_O2_I4
+#undef C_N1_O1_I4
 
 /* Put all of the constraint sets into an array, indexed by the enum. */
 
@@ -687,6 +689,7 @@  static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode);
 #define C_O2_I2(O1, O2, I1, I2)         { .args_ct_str = { #O1, #O2, #I1, #I2 } },
 #define C_O2_I3(O1, O2, I1, I2, I3)     { .args_ct_str = { #O1, #O2, #I1, #I2, #I3 } },
 #define C_O2_I4(O1, O2, I1, I2, I3, I4) { .args_ct_str = { #O1, #O2, #I1, #I2, #I3, #I4 } },
+#define C_N1_O1_I4(O1, O2, I1, I2, I3, I4) { .args_ct_str = { "&" #O1, #O2, #I1, #I2, #I3, #I4 } },
 
 static const TCGTargetOpDef constraint_sets[] = {
 #include "tcg-target-con-set.h"
@@ -706,6 +709,7 @@  static const TCGTargetOpDef constraint_sets[] = {
 #undef C_O2_I2
 #undef C_O2_I3
 #undef C_O2_I4
+#undef C_N1_O1_I4
 
 /* Expand the enumerator to be returned from tcg_target_op_def(). */
 
@@ -725,6 +729,7 @@  static const TCGTargetOpDef constraint_sets[] = {
 #define C_O2_I2(O1, O2, I1, I2)         C_PFX4(c_o2_i2_, O1, O2, I1, I2)
 #define C_O2_I3(O1, O2, I1, I2, I3)     C_PFX5(c_o2_i3_, O1, O2, I1, I2, I3)
 #define C_O2_I4(O1, O2, I1, I2, I3, I4) C_PFX6(c_o2_i4_, O1, O2, I1, I2, I3, I4)
+#define C_N1_O1_I4(O1, O2, I1, I2, I3, I4) C_PFX6(c_n1_o1_i4_, O1, O2, I1, I2, I3, I4)
 
 #include "tcg-target.c.inc"
 
@@ -4703,7 +4708,8 @@  static void tcg_reg_alloc_op(TCGContext *s, const TCGOp *op)
                  * dead after the instruction, we must allocate a new
                  * register and move it.
                  */
-                if (temp_readonly(ts) || !IS_DEAD_ARG(i)) {
+                if (temp_readonly(ts) || !IS_DEAD_ARG(i)
+                    || def->args_ct[arg_ct->alias_index].newreg) {
                     allocate_new_reg = true;
                 } else if (ts->val_type == TEMP_VAL_REG) {
                     /*