diff mbox series

[v3,06/13] tcg/s390x: Support MIE2 multiply single instructions

Message ID 20221202065200.224537-7-richard.henderson@linaro.org
State New
Headers show
Series tcg/s390x: misc patches | expand

Commit Message

Richard Henderson Dec. 2, 2022, 6:51 a.m. UTC
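The MIE2 facility (miscellaneous-instruction-extensions facility 2) adds
3-operand versions of the multiply single instructions, MSRKC and MSGRKC.
Use them for register-register multiplies when the output register
differs from the first input.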

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/s390x/tcg-target-con-set.h |  1 +
 tcg/s390x/tcg-target.h         |  1 +
 tcg/s390x/tcg-target.c.inc     | 34 ++++++++++++++++++++++++----------
 3 files changed, 26 insertions(+), 10 deletions(-)

Comments

Ilya Leoshkevich Dec. 6, 2022, 8:02 p.m. UTC | #1
On Thu, Dec 01, 2022 at 10:51:53PM -0800, Richard Henderson wrote:
> The MIE2 facility adds 3-operand versions of multiply.
> 
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> ---
>  tcg/s390x/tcg-target-con-set.h |  1 +
>  tcg/s390x/tcg-target.h         |  1 +
>  tcg/s390x/tcg-target.c.inc     | 34 ++++++++++++++++++++++++----------
>  3 files changed, 26 insertions(+), 10 deletions(-)

Reviewed-by: Ilya Leoshkevich <iii@linux.ibm.com>

I have one small suggestion, see below.

> diff --git a/tcg/s390x/tcg-target-con-set.h b/tcg/s390x/tcg-target-con-set.h
> index 00ba727b70..33a82e3286 100644
> --- a/tcg/s390x/tcg-target-con-set.h
> +++ b/tcg/s390x/tcg-target-con-set.h
> @@ -23,6 +23,7 @@ C_O1_I2(r, 0, ri)
>  C_O1_I2(r, 0, rI)
>  C_O1_I2(r, 0, rJ)
>  C_O1_I2(r, r, ri)
> +C_O1_I2(r, r, rJ)
>  C_O1_I2(r, rZ, r)
>  C_O1_I2(v, v, r)
>  C_O1_I2(v, v, v)
> diff --git a/tcg/s390x/tcg-target.h b/tcg/s390x/tcg-target.h
> index 645f522058..bfd623a639 100644
> --- a/tcg/s390x/tcg-target.h
> +++ b/tcg/s390x/tcg-target.h
> @@ -63,6 +63,7 @@ typedef enum TCGReg {
>  #define FACILITY_FAST_BCR_SER         FACILITY_LOAD_ON_COND
>  #define FACILITY_DISTINCT_OPS         FACILITY_LOAD_ON_COND
>  #define FACILITY_LOAD_ON_COND2        53
> +#define FACILITY_MISC_INSN_EXT2       58
>  #define FACILITY_VECTOR               129
>  #define FACILITY_VECTOR_ENH1          135
>  
> diff --git a/tcg/s390x/tcg-target.c.inc b/tcg/s390x/tcg-target.c.inc
> index d02b433271..cd39b2a208 100644
> --- a/tcg/s390x/tcg-target.c.inc
> +++ b/tcg/s390x/tcg-target.c.inc
> @@ -180,6 +180,8 @@ typedef enum S390Opcode {
>      RRE_SLBGR   = 0xb989,
>      RRE_XGR     = 0xb982,
>  
> +    RRFa_MSRKC  = 0xb9fd,
> +    RRFa_MSGRKC = 0xb9ed,
>      RRFa_NRK    = 0xb9f4,
>      RRFa_NGRK   = 0xb9e4,
>      RRFa_ORK    = 0xb9f6,
> @@ -2140,14 +2142,18 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
>          break;
>  
>      case INDEX_op_mul_i32:
> +        a0 = args[0], a1 = args[1], a2 = (int32_t)args[2];
>          if (const_args[2]) {
> -            if ((int32_t)args[2] == (int16_t)args[2]) {
> -                tcg_out_insn(s, RI, MHI, args[0], args[2]);
> +            tcg_out_mov(s, TCG_TYPE_I32, a0, a1);

Should we consider the a0 == a1 case here as well, in order to get rid
of this extra move when possible?

> +            if (a2 == (int16_t)a2) {
> +                tcg_out_insn(s, RI, MHI, a0, a2);
>              } else {
> -                tcg_out_insn(s, RIL, MSFI, args[0], args[2]);
> +                tcg_out_insn(s, RIL, MSFI, a0, a2);
>              }
> +        } else if (a0 == a1) {
> +            tcg_out_insn(s, RRE, MSR, a0, a2);
>          } else {
> -            tcg_out_insn(s, RRE, MSR, args[0], args[2]);
> +            tcg_out_insn(s, RRFa, MSRKC, a0, a1, a2);
>          }
>          break;
>  
> @@ -2405,14 +2411,18 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
>          break;
>  
>      case INDEX_op_mul_i64:
> +        a0 = args[0], a1 = args[1], a2 = args[2];
>          if (const_args[2]) {
> -            if (args[2] == (int16_t)args[2]) {
> -                tcg_out_insn(s, RI, MGHI, args[0], args[2]);
> +            tcg_out_mov(s, TCG_TYPE_I64, a0, a1);

Same here.

> +            if (a2 == (int16_t)a2) {
> +                tcg_out_insn(s, RI, MGHI, a0, a2);
>              } else {
> -                tcg_out_insn(s, RIL, MSGFI, args[0], args[2]);
> +                tcg_out_insn(s, RIL, MSGFI, a0, a2);
>              }
> +        } else if (a0 == a1) {
> +            tcg_out_insn(s, RRE, MSGR, a0, a2);
>          } else {
> -            tcg_out_insn(s, RRE, MSGR, args[0], args[2]);
> +            tcg_out_insn(s, RRFa, MSGRKC, a0, a1, a2);
>          }
>          break;
>  
> @@ -3072,12 +3082,16 @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
>             MULTIPLY SINGLE IMMEDIATE with a signed 32-bit, otherwise we
>             have only MULTIPLY HALFWORD IMMEDIATE, with a signed 16-bit.  */
>          return (HAVE_FACILITY(GEN_INST_EXT)
> -                ? C_O1_I2(r, 0, ri)
> +                ? (HAVE_FACILITY(MISC_INSN_EXT2)
> +                   ? C_O1_I2(r, r, ri)
> +                   : C_O1_I2(r, 0, ri))
>                  : C_O1_I2(r, 0, rI));
>  
>      case INDEX_op_mul_i64:
>          return (HAVE_FACILITY(GEN_INST_EXT)
> -                ? C_O1_I2(r, 0, rJ)
> +                ? (HAVE_FACILITY(MISC_INSN_EXT2)
> +                   ? C_O1_I2(r, r, rJ)
> +                   : C_O1_I2(r, 0, rJ))
>                  : C_O1_I2(r, 0, rI));
>  
>      case INDEX_op_shl_i32:
> -- 
> 2.34.1
> 
>
Richard Henderson Dec. 6, 2022, 8:20 p.m. UTC | #2
On Tue, 6 Dec 2022, 14:02 Ilya Leoshkevich, <iii@linux.ibm.com> wrote:

> On Thu, Dec 01, 2022 at 10:51:53PM -0800, Richard Henderson wrote:
> > The MIE2 facility adds 3-operand versions of multiply.
> >
> > Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> > ---
> >  tcg/s390x/tcg-target-con-set.h |  1 +
> >  tcg/s390x/tcg-target.h         |  1 +
> >  tcg/s390x/tcg-target.c.inc     | 34 ++++++++++++++++++++++++----------
> >  3 files changed, 26 insertions(+), 10 deletions(-)
>
> Reviewed-by: Ilya Leoshkevich <iii@linux.ibm.com>
>
> I have one small suggestion, see below.
>
> > diff --git a/tcg/s390x/tcg-target-con-set.h
> b/tcg/s390x/tcg-target-con-set.h
> > index 00ba727b70..33a82e3286 100644
> > --- a/tcg/s390x/tcg-target-con-set.h
> > +++ b/tcg/s390x/tcg-target-con-set.h
> > @@ -23,6 +23,7 @@ C_O1_I2(r, 0, ri)
> >  C_O1_I2(r, 0, rI)
> >  C_O1_I2(r, 0, rJ)
> >  C_O1_I2(r, r, ri)
> > +C_O1_I2(r, r, rJ)
> >  C_O1_I2(r, rZ, r)
> >  C_O1_I2(v, v, r)
> >  C_O1_I2(v, v, v)
> > diff --git a/tcg/s390x/tcg-target.h b/tcg/s390x/tcg-target.h
> > index 645f522058..bfd623a639 100644
> > --- a/tcg/s390x/tcg-target.h
> > +++ b/tcg/s390x/tcg-target.h
> > @@ -63,6 +63,7 @@ typedef enum TCGReg {
> >  #define FACILITY_FAST_BCR_SER         FACILITY_LOAD_ON_COND
> >  #define FACILITY_DISTINCT_OPS         FACILITY_LOAD_ON_COND
> >  #define FACILITY_LOAD_ON_COND2        53
> > +#define FACILITY_MISC_INSN_EXT2       58
> >  #define FACILITY_VECTOR               129
> >  #define FACILITY_VECTOR_ENH1          135
> >
> > diff --git a/tcg/s390x/tcg-target.c.inc b/tcg/s390x/tcg-target.c.inc
> > index d02b433271..cd39b2a208 100644
> > --- a/tcg/s390x/tcg-target.c.inc
> > +++ b/tcg/s390x/tcg-target.c.inc
> > @@ -180,6 +180,8 @@ typedef enum S390Opcode {
> >      RRE_SLBGR   = 0xb989,
> >      RRE_XGR     = 0xb982,
> >
> > +    RRFa_MSRKC  = 0xb9fd,
> > +    RRFa_MSGRKC = 0xb9ed,
> >      RRFa_NRK    = 0xb9f4,
> >      RRFa_NGRK   = 0xb9e4,
> >      RRFa_ORK    = 0xb9f6,
> > @@ -2140,14 +2142,18 @@ static inline void tcg_out_op(TCGContext *s,
> TCGOpcode opc,
> >          break;
> >
> >      case INDEX_op_mul_i32:
> > +        a0 = args[0], a1 = args[1], a2 = (int32_t)args[2];
> >          if (const_args[2]) {
> > -            if ((int32_t)args[2] == (int16_t)args[2]) {
> > -                tcg_out_insn(s, RI, MHI, args[0], args[2]);
> > +            tcg_out_mov(s, TCG_TYPE_I32, a0, a1);
>
> Should we consider a0 == a1 case here as well, in order to get rid of
> this extra move when possible?
>

tcg_out_mov already does that: it checks for a0 == a1 itself, so no
extra move is emitted in that case.


r~
diff mbox series

Patch

diff --git a/tcg/s390x/tcg-target-con-set.h b/tcg/s390x/tcg-target-con-set.h
index 00ba727b70..33a82e3286 100644
--- a/tcg/s390x/tcg-target-con-set.h
+++ b/tcg/s390x/tcg-target-con-set.h
@@ -23,6 +23,7 @@  C_O1_I2(r, 0, ri)
 C_O1_I2(r, 0, rI)
 C_O1_I2(r, 0, rJ)
 C_O1_I2(r, r, ri)
+C_O1_I2(r, r, rJ)
 C_O1_I2(r, rZ, r)
 C_O1_I2(v, v, r)
 C_O1_I2(v, v, v)
diff --git a/tcg/s390x/tcg-target.h b/tcg/s390x/tcg-target.h
index 645f522058..bfd623a639 100644
--- a/tcg/s390x/tcg-target.h
+++ b/tcg/s390x/tcg-target.h
@@ -63,6 +63,7 @@  typedef enum TCGReg {
 #define FACILITY_FAST_BCR_SER         FACILITY_LOAD_ON_COND
 #define FACILITY_DISTINCT_OPS         FACILITY_LOAD_ON_COND
 #define FACILITY_LOAD_ON_COND2        53
+#define FACILITY_MISC_INSN_EXT2       58
 #define FACILITY_VECTOR               129
 #define FACILITY_VECTOR_ENH1          135
 
diff --git a/tcg/s390x/tcg-target.c.inc b/tcg/s390x/tcg-target.c.inc
index d02b433271..cd39b2a208 100644
--- a/tcg/s390x/tcg-target.c.inc
+++ b/tcg/s390x/tcg-target.c.inc
@@ -180,6 +180,8 @@  typedef enum S390Opcode {
     RRE_SLBGR   = 0xb989,
     RRE_XGR     = 0xb982,
 
+    RRFa_MSRKC  = 0xb9fd,
+    RRFa_MSGRKC = 0xb9ed,
     RRFa_NRK    = 0xb9f4,
     RRFa_NGRK   = 0xb9e4,
     RRFa_ORK    = 0xb9f6,
@@ -2140,14 +2142,18 @@  static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
         break;
 
     case INDEX_op_mul_i32:
+        a0 = args[0], a1 = args[1], a2 = (int32_t)args[2];
         if (const_args[2]) {
-            if ((int32_t)args[2] == (int16_t)args[2]) {
-                tcg_out_insn(s, RI, MHI, args[0], args[2]);
+            tcg_out_mov(s, TCG_TYPE_I32, a0, a1);
+            if (a2 == (int16_t)a2) {
+                tcg_out_insn(s, RI, MHI, a0, a2);
             } else {
-                tcg_out_insn(s, RIL, MSFI, args[0], args[2]);
+                tcg_out_insn(s, RIL, MSFI, a0, a2);
             }
+        } else if (a0 == a1) {
+            tcg_out_insn(s, RRE, MSR, a0, a2);
         } else {
-            tcg_out_insn(s, RRE, MSR, args[0], args[2]);
+            tcg_out_insn(s, RRFa, MSRKC, a0, a1, a2);
         }
         break;
 
@@ -2405,14 +2411,18 @@  static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
         break;
 
     case INDEX_op_mul_i64:
+        a0 = args[0], a1 = args[1], a2 = args[2];
         if (const_args[2]) {
-            if (args[2] == (int16_t)args[2]) {
-                tcg_out_insn(s, RI, MGHI, args[0], args[2]);
+            tcg_out_mov(s, TCG_TYPE_I64, a0, a1);
+            if (a2 == (int16_t)a2) {
+                tcg_out_insn(s, RI, MGHI, a0, a2);
             } else {
-                tcg_out_insn(s, RIL, MSGFI, args[0], args[2]);
+                tcg_out_insn(s, RIL, MSGFI, a0, a2);
             }
+        } else if (a0 == a1) {
+            tcg_out_insn(s, RRE, MSGR, a0, a2);
         } else {
-            tcg_out_insn(s, RRE, MSGR, args[0], args[2]);
+            tcg_out_insn(s, RRFa, MSGRKC, a0, a1, a2);
         }
         break;
 
@@ -3072,12 +3082,16 @@  static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
            MULTIPLY SINGLE IMMEDIATE with a signed 32-bit, otherwise we
            have only MULTIPLY HALFWORD IMMEDIATE, with a signed 16-bit.  */
         return (HAVE_FACILITY(GEN_INST_EXT)
-                ? C_O1_I2(r, 0, ri)
+                ? (HAVE_FACILITY(MISC_INSN_EXT2)
+                   ? C_O1_I2(r, r, ri)
+                   : C_O1_I2(r, 0, ri))
                 : C_O1_I2(r, 0, rI));
 
     case INDEX_op_mul_i64:
         return (HAVE_FACILITY(GEN_INST_EXT)
-                ? C_O1_I2(r, 0, rJ)
+                ? (HAVE_FACILITY(MISC_INSN_EXT2)
+                   ? C_O1_I2(r, r, rJ)
+                   : C_O1_I2(r, 0, rJ))
                 : C_O1_I2(r, 0, rI));
 
     case INDEX_op_shl_i32: