Patchwork [086/126] target-s390: Convert CLST, MVST

login
register
mail settings
Submitter Richard Henderson
Date Sept. 9, 2012, 9:05 p.m.
Message ID <1347224784-19472-87-git-send-email-rth@twiddle.net>
Download mbox | patch
Permalink /patch/182656/
State New
Headers show

Comments

Richard Henderson - Sept. 9, 2012, 9:05 p.m.
Signed-off-by: Richard Henderson <rth@twiddle.net>
---
 target-s390x/helper.h      |   4 +-
 target-s390x/insn-data.def |   4 ++
 target-s390x/mem_helper.c  | 119 ++++++++++++++++++++++-----------------------
 target-s390x/translate.c   |  42 ++++++++--------
 4 files changed, 83 insertions(+), 86 deletions(-)
Blue Swirl - Sept. 11, 2012, 7:11 p.m.
On Sun, Sep 9, 2012 at 9:05 PM, Richard Henderson <rth@twiddle.net> wrote:
> Signed-off-by: Richard Henderson <rth@twiddle.net>
> ---
>  target-s390x/helper.h      |   4 +-
>  target-s390x/insn-data.def |   4 ++
>  target-s390x/mem_helper.c  | 119 ++++++++++++++++++++++-----------------------
>  target-s390x/translate.c   |  42 ++++++++--------
>  4 files changed, 83 insertions(+), 86 deletions(-)
>
> diff --git a/target-s390x/helper.h b/target-s390x/helper.h
> index 2bd4105..473e776 100644
> --- a/target-s390x/helper.h
> +++ b/target-s390x/helper.h
> @@ -25,9 +25,9 @@ DEF_HELPER_FLAGS_3(set_cc_subu64, TCG_CALL_PURE|TCG_CALL_CONST, i32, i64, i64, i
>  DEF_HELPER_FLAGS_3(set_cc_sub32, TCG_CALL_PURE|TCG_CALL_CONST, i32, s32, s32, s32)
>  DEF_HELPER_FLAGS_3(set_cc_subu32, TCG_CALL_PURE|TCG_CALL_CONST, i32, i32, i32, i32)
>  DEF_HELPER_4(srst, i32, env, i32, i32, i32)
> -DEF_HELPER_4(clst, i32, env, i32, i32, i32)
> +DEF_HELPER_4(clst, i64, env, i64, i64, i64)
>  DEF_HELPER_4(mvpg, void, env, i64, i64, i64)
> -DEF_HELPER_4(mvst, void, env, i32, i32, i32)
> +DEF_HELPER_4(mvst, i64, env, i64, i64, i64)
>  DEF_HELPER_4(csg, i64, env, i64, i64, i64)
>  DEF_HELPER_4(cdsg, i32, env, i32, i64, i32)
>  DEF_HELPER_4(cs, i64, env, i64, i64, i64)
> diff --git a/target-s390x/insn-data.def b/target-s390x/insn-data.def
> index 4b30dd5..501d02e 100644
> --- a/target-s390x/insn-data.def
> +++ b/target-s390x/insn-data.def
> @@ -153,6 +153,8 @@
>      C(0xbd00, CLM,     RS_b,  Z,   r1_o, a2, 0, 0, clm, 0)
>      C(0xeb21, CLMY,    RSY_b, LD,  r1_o, a2, 0, 0, clm, 0)
>      C(0xeb20, CLMH,    RSY_b, Z,   r1_sr32, a2, 0, 0, clm, 0)
> +/* COMPARE LOGICAL STRING */
> +    C(0xb25d, CLST,    RRE,   Z,   r1_o, r2_o, 0, 0, clst, 0)
>
>  /* COMPARE AND SWAP */
>      C(0xba00, CS,      RS_a,  Z,   r1_o, a2, new, r1_32, cs, 0)
> @@ -396,6 +398,8 @@
>      C(0xa800, MVCLE,   RS_a,  Z,   0, a2, 0, 0, mvcle, 0)
>  /* MOVE PAGE */
>      C(0xb254, MVPG,    RRE,   Z,   r1_o, r2_o, 0, 0, mvpg, 0)
> +/* MOVE STRING */
> +    C(0xb255, MVST,    RRE,   Z,   r1_o, r2_o, 0, 0, mvst, 0)
>
>  /* MULTIPLY */
>      C(0x1c00, MR,      RR_a,  Z,   r1p1_32s, r2_32s, new, r1_D32, mul, 0)
> diff --git a/target-s390x/mem_helper.c b/target-s390x/mem_helper.c
> index e9cacf9..8cc4625 100644
> --- a/target-s390x/mem_helper.c
> +++ b/target-s390x/mem_helper.c
> @@ -310,36 +310,30 @@ uint32_t HELPER(clm)(CPUS390XState *env, uint32_t r1, uint32_t mask,
>      return cc;
>  }
>
> +static inline uint64_t fix_address(CPUS390XState *env, uint64_t a)
> +{
> +    /* 31-Bit mode */
> +    if (!(env->psw.mask & PSW_MASK_64)) {

PSW_MASK_64 bit could be added to TB flags and that could be checked
during translation, then the mask needs to be applied only when the
mode is active. Whether that actually improves performance depends on
how often the bit is changed. Also all PSW writes need to be handled,
possibly causing a TB flush.

Masking could be done with TCG ops too.

> +        a &= 0x7fffffff;
> +    }
> +    return a;
> +}
> +
>  static inline uint64_t get_address(CPUS390XState *env, int x2, int b2, int d2)
>  {
>      uint64_t r = d2;
> -
>      if (x2) {
>          r += env->regs[x2];
>      }
> -
>      if (b2) {
>          r += env->regs[b2];
>      }
> -
> -    /* 31-Bit mode */
> -    if (!(env->psw.mask & PSW_MASK_64)) {
> -        r &= 0x7fffffff;
> -    }
> -
> -    return r;
> +    return fix_address(env, r);
>  }
>
>  static inline uint64_t get_address_31fix(CPUS390XState *env, int reg)
>  {
> -    uint64_t r = env->regs[reg];
> -
> -    /* 31-Bit mode */
> -    if (!(env->psw.mask & PSW_MASK_64)) {
> -        r &= 0x7fffffff;
> -    }
> -
> -    return r;
> +    return fix_address(env, env->regs[reg]);
>  }
>
>  /* search string (c is byte to search, r2 is string, r1 end of string) */
> @@ -365,39 +359,40 @@ uint32_t HELPER(srst)(CPUS390XState *env, uint32_t c, uint32_t r1, uint32_t r2)
>  }
>
>  /* unsigned string compare (c is string terminator) */
> -uint32_t HELPER(clst)(CPUS390XState *env, uint32_t c, uint32_t r1, uint32_t r2)
> +uint64_t HELPER(clst)(CPUS390XState *env, uint64_t c, uint64_t s1, uint64_t s2)
>  {
> -    uint64_t s1 = get_address_31fix(env, r1);
> -    uint64_t s2 = get_address_31fix(env, r2);
> -    uint8_t v1, v2;
> -    uint32_t cc;
> +    uint32_t len;
>
>      c = c & 0xff;
> -#ifdef CONFIG_USER_ONLY
> -    if (!c) {
> -        HELPER_LOG("%s: comparing '%s' and '%s'\n",
> -                   __func__, (char *)g2h(s1), (char *)g2h(s2));
> -    }
> -#endif
> -    for (;;) {
> -        v1 = cpu_ldub_data(env, s1);
> -        v2 = cpu_ldub_data(env, s2);
> -        if ((v1 == c || v2 == c) || (v1 != v2)) {
> -            break;
> +    s1 = fix_address(env, s1);
> +    s2 = fix_address(env, s2);
> +
> +    /* Lest we fail to service interrupts in a timely manner, limit the
> +       amount of work we're willing to do.  For now, lets cap at 8k.  */
> +    for (len = 0; len < 0x2000; ++len) {
> +        uint8_t v1 = cpu_ldub_data(env, s1 + len);
> +        uint8_t v2 = cpu_ldub_data(env, s2 + len);
> +        if (v1 == v2) {
> +            if (v1 == c) {
> +                /* Equal.  CC=0, and don't advance the registers.  */
> +                env->cc_op = 0;
> +                env->retxl = s2;
> +                return s1;
> +            }
> +        } else {
> +            /* Unequal.  CC={1,2}, and advance the registers.  Note that
> +               the terminator need not be zero, but the string that contains
> +               the terminator is by definition "low".  */
> +            env->cc_op = (v1 == c ? 1 : v2 == c ? 2 : v1 < v2 ? 1 : 2);
> +            env->retxl = s2 + len;
> +            return s1 + len;
>          }
> -        s1++;
> -        s2++;
>      }
>
> -    if (v1 == v2) {
> -        cc = 0;
> -    } else {
> -        cc = (v1 < v2) ? 1 : 2;
> -        /* FIXME: 31-bit mode! */
> -        env->regs[r1] = s1;
> -        env->regs[r2] = s2;
> -    }
> -    return cc;
> +    /* CPU-determined bytes equal; advance the registers.  */
> +    env->cc_op = 3;
> +    env->retxl = s2 + len;
> +    return s1 + len;
>  }
>
>  /* move page */
> @@ -413,29 +408,31 @@ void HELPER(mvpg)(CPUS390XState *env, uint64_t r0, uint64_t r1, uint64_t r2)
>  }
>
>  /* string copy (c is string terminator) */
> -void HELPER(mvst)(CPUS390XState *env, uint32_t c, uint32_t r1, uint32_t r2)
> +uint64_t HELPER(mvst)(CPUS390XState *env, uint64_t c, uint64_t d, uint64_t s)
>  {
> -    uint64_t dest = get_address_31fix(env, r1);
> -    uint64_t src = get_address_31fix(env, r2);
> -    uint8_t v;
> +    uint32_t len;
>
>      c = c & 0xff;
> -#ifdef CONFIG_USER_ONLY
> -    if (!c) {
> -        HELPER_LOG("%s: copy '%s' to 0x%lx\n", __func__, (char *)g2h(src),
> -                   dest);
> -    }
> -#endif
> -    for (;;) {
> -        v = cpu_ldub_data(env, src);
> -        cpu_stb_data(env, dest, v);
> +    d = fix_address(env, d);
> +    s = fix_address(env, s);
> +
> +    /* Lest we fail to service interrupts in a timely manner, limit the
> +       amount of work we're willing to do.  For now, lets cap at 8k.  */
> +    for (len = 0; len < 0x2000; ++len) {
> +        uint8_t v = cpu_ldub_data(env, s + len);
> +        cpu_stb_data(env, d + len, v);
>          if (v == c) {
> -            break;
> +            /* Complete.  Set CC=1 and advance R1.  */
> +            env->cc_op = 1;
> +            env->retxl = s;
> +            return d + len;
>          }
> -        src++;
> -        dest++;
>      }
> -    env->regs[r1] = dest; /* FIXME: 31-bit mode! */
> +
> +    /* Incomplete.  Set CC=3 and signal to advance R1 and R2.  */
> +    env->cc_op = 3;
> +    env->retxl = s + len;
> +    return d + len;
>  }
>
>  /* compare and swap 64-bit */
> diff --git a/target-s390x/translate.c b/target-s390x/translate.c
> index a49475b..49f419a 100644
> --- a/target-s390x/translate.c
> +++ b/target-s390x/translate.c
> @@ -425,7 +425,7 @@ static TCGv_i64 get_address(DisasContext *s, int x2, int b2, int d2)
>      return tmp;
>  }
>
> -static void gen_op_movi_cc(DisasContext *s, uint32_t val)
> +static inline void gen_op_movi_cc(DisasContext *s, uint32_t val)
>  {
>      s->cc_op = CC_OP_CONST0 + val;
>  }
> @@ -1031,28 +1031,6 @@ static void disas_b2(DisasContext *s, int op, uint32_t insn)
>      LOG_DISAS("disas_b2: op 0x%x r1 %d r2 %d\n", op, r1, r2);
>
>      switch (op) {
> -    case 0x55: /* MVST     R1,R2     [RRE] */
> -        tmp32_1 = load_reg32(0);
> -        tmp32_2 = tcg_const_i32(r1);
> -        tmp32_3 = tcg_const_i32(r2);
> -        potential_page_fault(s);
> -        gen_helper_mvst(cpu_env, tmp32_1, tmp32_2, tmp32_3);
> -        tcg_temp_free_i32(tmp32_1);
> -        tcg_temp_free_i32(tmp32_2);
> -        tcg_temp_free_i32(tmp32_3);
> -        gen_op_movi_cc(s, 1);
> -        break;
> -    case 0x5d: /* CLST     R1,R2     [RRE] */
> -        tmp32_1 = load_reg32(0);
> -        tmp32_2 = tcg_const_i32(r1);
> -        tmp32_3 = tcg_const_i32(r2);
> -        potential_page_fault(s);
> -        gen_helper_clst(cc_op, cpu_env, tmp32_1, tmp32_2, tmp32_3);
> -        set_cc_static(s);
> -        tcg_temp_free_i32(tmp32_1);
> -        tcg_temp_free_i32(tmp32_2);
> -        tcg_temp_free_i32(tmp32_3);
> -        break;
>      case 0x5e: /* SRST     R1,R2     [RRE] */
>          tmp32_1 = load_reg32(0);
>          tmp32_2 = tcg_const_i32(r1);
> @@ -2044,6 +2022,15 @@ static ExitStatus op_clm(DisasContext *s, DisasOps *o)
>      return NO_EXIT;
>  }
>
> +static ExitStatus op_clst(DisasContext *s, DisasOps *o)
> +{
> +    potential_page_fault(s);
> +    gen_helper_clst(o->in1, cpu_env, regs[0], o->in1, o->in2);
> +    set_cc_static(s);
> +    return_low128(o->in2);
> +    return NO_EXIT;
> +}
> +
>  static ExitStatus op_cs(DisasContext *s, DisasOps *o)
>  {
>      int r3 = get_field(s->fields, r3);
> @@ -2589,6 +2576,15 @@ static ExitStatus op_mvpg(DisasContext *s, DisasOps *o)
>      return NO_EXIT;
>  }
>
> +static ExitStatus op_mvst(DisasContext *s, DisasOps *o)
> +{
> +    potential_page_fault(s);
> +    gen_helper_mvst(o->in1, cpu_env, regs[0], o->in1, o->in2);
> +    set_cc_static(s);
> +    return_low128(o->in2);
> +    return NO_EXIT;
> +}
> +
>  static ExitStatus op_mul(DisasContext *s, DisasOps *o)
>  {
>      tcg_gen_mul_i64(o->out, o->in1, o->in2);
> --
> 1.7.11.4
>
>
Richard Henderson - Sept. 11, 2012, 9:09 p.m.
On 09/11/2012 12:11 PM, Blue Swirl wrote:
> PSW_MASK_64 bit could be added to TB flags and that could be checked
> during translation, then the mask needs to be applied only when the
> mode is active. Whether that actually improves performance depends on
> how often the bit is changed. Also all PSW writes need to be handled,
> possibly causing a TB flush.

Actually I'm not sure why we check this at all, given that we only
actually handle 64-bit mode -- at least as documented by the code
implementing the SET ADDRESS MODE instruction.

That said, we do encode the bit in TB flags, and we do perform this
masking for qemu loads performed within the TB.  No TB flushes are
required because we simply don't match TBs with different flags.

As for clst, mvst, srst, I thought about performing the masking in
the TB, but didn't figure it was worth it.  Do you have an opinion, Alex?


r~
Alexander Graf - Sept. 18, 2012, 9:04 p.m.
On 09/11/2012 11:09 PM, Richard Henderson wrote:
> On 09/11/2012 12:11 PM, Blue Swirl wrote:
>> PSW_MASK_64 bit could be added to TB flags and that could be checked
>> during translation, then the mask needs to be applied only when the
>> mode is active. Whether that actually improves performance depends on
>> how often the bit is changed. Also all PSW writes need to be handled,
>> possibly causing a TB flush.
> Actually I'm not sure why we check this at all, given that we only
> actually handle 64-bit mode -- at least as documented by the code
> implementing the SET ADDRESS MODE instruction.

For kernel code, we only support 64 bit mode, yes. But for user space 
code, we need to support 31-bit mode to enable Debian to work. They 
still run all user space in 31-bit mode.

> That said, we do encode the bit in TB flags, and we do perform this
> masking for qemu loads performed within the TB.  No TB flushes are
> required because we simply don't match TBs with different flags.
>
> As for clst, mvst, srst, I thought about performing the masking in
> the TB, but didn't figure it was worth it.  Do you have an opinion, Alex?

I don't know if it's worth it either. When calling anything with address 
parameters we already need to save off all registers to env, since we 
could take a page fault any time. So moving code into TCG context 
shouldn't improve anything performance wise.


Alex

Patch

diff --git a/target-s390x/helper.h b/target-s390x/helper.h
index 2bd4105..473e776 100644
--- a/target-s390x/helper.h
+++ b/target-s390x/helper.h
@@ -25,9 +25,9 @@  DEF_HELPER_FLAGS_3(set_cc_subu64, TCG_CALL_PURE|TCG_CALL_CONST, i32, i64, i64, i
 DEF_HELPER_FLAGS_3(set_cc_sub32, TCG_CALL_PURE|TCG_CALL_CONST, i32, s32, s32, s32)
 DEF_HELPER_FLAGS_3(set_cc_subu32, TCG_CALL_PURE|TCG_CALL_CONST, i32, i32, i32, i32)
 DEF_HELPER_4(srst, i32, env, i32, i32, i32)
-DEF_HELPER_4(clst, i32, env, i32, i32, i32)
+DEF_HELPER_4(clst, i64, env, i64, i64, i64)
 DEF_HELPER_4(mvpg, void, env, i64, i64, i64)
-DEF_HELPER_4(mvst, void, env, i32, i32, i32)
+DEF_HELPER_4(mvst, i64, env, i64, i64, i64)
 DEF_HELPER_4(csg, i64, env, i64, i64, i64)
 DEF_HELPER_4(cdsg, i32, env, i32, i64, i32)
 DEF_HELPER_4(cs, i64, env, i64, i64, i64)
diff --git a/target-s390x/insn-data.def b/target-s390x/insn-data.def
index 4b30dd5..501d02e 100644
--- a/target-s390x/insn-data.def
+++ b/target-s390x/insn-data.def
@@ -153,6 +153,8 @@ 
     C(0xbd00, CLM,     RS_b,  Z,   r1_o, a2, 0, 0, clm, 0)
     C(0xeb21, CLMY,    RSY_b, LD,  r1_o, a2, 0, 0, clm, 0)
     C(0xeb20, CLMH,    RSY_b, Z,   r1_sr32, a2, 0, 0, clm, 0)
+/* COMPARE LOGICAL STRING */
+    C(0xb25d, CLST,    RRE,   Z,   r1_o, r2_o, 0, 0, clst, 0)
 
 /* COMPARE AND SWAP */
     C(0xba00, CS,      RS_a,  Z,   r1_o, a2, new, r1_32, cs, 0)
@@ -396,6 +398,8 @@ 
     C(0xa800, MVCLE,   RS_a,  Z,   0, a2, 0, 0, mvcle, 0)
 /* MOVE PAGE */
     C(0xb254, MVPG,    RRE,   Z,   r1_o, r2_o, 0, 0, mvpg, 0)
+/* MOVE STRING */
+    C(0xb255, MVST,    RRE,   Z,   r1_o, r2_o, 0, 0, mvst, 0)
 
 /* MULTIPLY */
     C(0x1c00, MR,      RR_a,  Z,   r1p1_32s, r2_32s, new, r1_D32, mul, 0)
diff --git a/target-s390x/mem_helper.c b/target-s390x/mem_helper.c
index e9cacf9..8cc4625 100644
--- a/target-s390x/mem_helper.c
+++ b/target-s390x/mem_helper.c
@@ -310,36 +310,30 @@  uint32_t HELPER(clm)(CPUS390XState *env, uint32_t r1, uint32_t mask,
     return cc;
 }
 
+static inline uint64_t fix_address(CPUS390XState *env, uint64_t a)
+{
+    /* 31-Bit mode */
+    if (!(env->psw.mask & PSW_MASK_64)) {
+        a &= 0x7fffffff;
+    }
+    return a;
+}
+
 static inline uint64_t get_address(CPUS390XState *env, int x2, int b2, int d2)
 {
     uint64_t r = d2;
-
     if (x2) {
         r += env->regs[x2];
     }
-
     if (b2) {
         r += env->regs[b2];
     }
-
-    /* 31-Bit mode */
-    if (!(env->psw.mask & PSW_MASK_64)) {
-        r &= 0x7fffffff;
-    }
-
-    return r;
+    return fix_address(env, r);
 }
 
 static inline uint64_t get_address_31fix(CPUS390XState *env, int reg)
 {
-    uint64_t r = env->regs[reg];
-
-    /* 31-Bit mode */
-    if (!(env->psw.mask & PSW_MASK_64)) {
-        r &= 0x7fffffff;
-    }
-
-    return r;
+    return fix_address(env, env->regs[reg]);
 }
 
 /* search string (c is byte to search, r2 is string, r1 end of string) */
@@ -365,39 +359,40 @@  uint32_t HELPER(srst)(CPUS390XState *env, uint32_t c, uint32_t r1, uint32_t r2)
 }
 
 /* unsigned string compare (c is string terminator) */
-uint32_t HELPER(clst)(CPUS390XState *env, uint32_t c, uint32_t r1, uint32_t r2)
+uint64_t HELPER(clst)(CPUS390XState *env, uint64_t c, uint64_t s1, uint64_t s2)
 {
-    uint64_t s1 = get_address_31fix(env, r1);
-    uint64_t s2 = get_address_31fix(env, r2);
-    uint8_t v1, v2;
-    uint32_t cc;
+    uint32_t len;
 
     c = c & 0xff;
-#ifdef CONFIG_USER_ONLY
-    if (!c) {
-        HELPER_LOG("%s: comparing '%s' and '%s'\n",
-                   __func__, (char *)g2h(s1), (char *)g2h(s2));
-    }
-#endif
-    for (;;) {
-        v1 = cpu_ldub_data(env, s1);
-        v2 = cpu_ldub_data(env, s2);
-        if ((v1 == c || v2 == c) || (v1 != v2)) {
-            break;
+    s1 = fix_address(env, s1);
+    s2 = fix_address(env, s2);
+
+    /* Lest we fail to service interrupts in a timely manner, limit the
+       amount of work we're willing to do.  For now, lets cap at 8k.  */
+    for (len = 0; len < 0x2000; ++len) {
+        uint8_t v1 = cpu_ldub_data(env, s1 + len);
+        uint8_t v2 = cpu_ldub_data(env, s2 + len);
+        if (v1 == v2) {
+            if (v1 == c) {
+                /* Equal.  CC=0, and don't advance the registers.  */
+                env->cc_op = 0;
+                env->retxl = s2;
+                return s1;
+            }
+        } else {
+            /* Unequal.  CC={1,2}, and advance the registers.  Note that
+               the terminator need not be zero, but the string that contains
+               the terminator is by definition "low".  */
+            env->cc_op = (v1 == c ? 1 : v2 == c ? 2 : v1 < v2 ? 1 : 2);
+            env->retxl = s2 + len;
+            return s1 + len;
         }
-        s1++;
-        s2++;
     }
 
-    if (v1 == v2) {
-        cc = 0;
-    } else {
-        cc = (v1 < v2) ? 1 : 2;
-        /* FIXME: 31-bit mode! */
-        env->regs[r1] = s1;
-        env->regs[r2] = s2;
-    }
-    return cc;
+    /* CPU-determined bytes equal; advance the registers.  */
+    env->cc_op = 3;
+    env->retxl = s2 + len;
+    return s1 + len;
 }
 
 /* move page */
@@ -413,29 +408,31 @@  void HELPER(mvpg)(CPUS390XState *env, uint64_t r0, uint64_t r1, uint64_t r2)
 }
 
 /* string copy (c is string terminator) */
-void HELPER(mvst)(CPUS390XState *env, uint32_t c, uint32_t r1, uint32_t r2)
+uint64_t HELPER(mvst)(CPUS390XState *env, uint64_t c, uint64_t d, uint64_t s)
 {
-    uint64_t dest = get_address_31fix(env, r1);
-    uint64_t src = get_address_31fix(env, r2);
-    uint8_t v;
+    uint32_t len;
 
     c = c & 0xff;
-#ifdef CONFIG_USER_ONLY
-    if (!c) {
-        HELPER_LOG("%s: copy '%s' to 0x%lx\n", __func__, (char *)g2h(src),
-                   dest);
-    }
-#endif
-    for (;;) {
-        v = cpu_ldub_data(env, src);
-        cpu_stb_data(env, dest, v);
+    d = fix_address(env, d);
+    s = fix_address(env, s);
+
+    /* Lest we fail to service interrupts in a timely manner, limit the
+       amount of work we're willing to do.  For now, lets cap at 8k.  */
+    for (len = 0; len < 0x2000; ++len) {
+        uint8_t v = cpu_ldub_data(env, s + len);
+        cpu_stb_data(env, d + len, v);
         if (v == c) {
-            break;
+            /* Complete.  Set CC=1 and advance R1.  */
+            env->cc_op = 1;
+            env->retxl = s;
+            return d + len;
         }
-        src++;
-        dest++;
     }
-    env->regs[r1] = dest; /* FIXME: 31-bit mode! */
+
+    /* Incomplete.  Set CC=3 and signal to advance R1 and R2.  */
+    env->cc_op = 3;
+    env->retxl = s + len;
+    return d + len;
 }
 
 /* compare and swap 64-bit */
diff --git a/target-s390x/translate.c b/target-s390x/translate.c
index a49475b..49f419a 100644
--- a/target-s390x/translate.c
+++ b/target-s390x/translate.c
@@ -425,7 +425,7 @@  static TCGv_i64 get_address(DisasContext *s, int x2, int b2, int d2)
     return tmp;
 }
 
-static void gen_op_movi_cc(DisasContext *s, uint32_t val)
+static inline void gen_op_movi_cc(DisasContext *s, uint32_t val)
 {
     s->cc_op = CC_OP_CONST0 + val;
 }
@@ -1031,28 +1031,6 @@  static void disas_b2(DisasContext *s, int op, uint32_t insn)
     LOG_DISAS("disas_b2: op 0x%x r1 %d r2 %d\n", op, r1, r2);
 
     switch (op) {
-    case 0x55: /* MVST     R1,R2     [RRE] */
-        tmp32_1 = load_reg32(0);
-        tmp32_2 = tcg_const_i32(r1);
-        tmp32_3 = tcg_const_i32(r2);
-        potential_page_fault(s);
-        gen_helper_mvst(cpu_env, tmp32_1, tmp32_2, tmp32_3);
-        tcg_temp_free_i32(tmp32_1);
-        tcg_temp_free_i32(tmp32_2);
-        tcg_temp_free_i32(tmp32_3);
-        gen_op_movi_cc(s, 1);
-        break;
-    case 0x5d: /* CLST     R1,R2     [RRE] */
-        tmp32_1 = load_reg32(0);
-        tmp32_2 = tcg_const_i32(r1);
-        tmp32_3 = tcg_const_i32(r2);
-        potential_page_fault(s);
-        gen_helper_clst(cc_op, cpu_env, tmp32_1, tmp32_2, tmp32_3);
-        set_cc_static(s);
-        tcg_temp_free_i32(tmp32_1);
-        tcg_temp_free_i32(tmp32_2);
-        tcg_temp_free_i32(tmp32_3);
-        break;
     case 0x5e: /* SRST     R1,R2     [RRE] */
         tmp32_1 = load_reg32(0);
         tmp32_2 = tcg_const_i32(r1);
@@ -2044,6 +2022,15 @@  static ExitStatus op_clm(DisasContext *s, DisasOps *o)
     return NO_EXIT;
 }
 
+static ExitStatus op_clst(DisasContext *s, DisasOps *o)
+{
+    potential_page_fault(s);
+    gen_helper_clst(o->in1, cpu_env, regs[0], o->in1, o->in2);
+    set_cc_static(s);
+    return_low128(o->in2);
+    return NO_EXIT;
+}
+
 static ExitStatus op_cs(DisasContext *s, DisasOps *o)
 {
     int r3 = get_field(s->fields, r3);
@@ -2589,6 +2576,15 @@  static ExitStatus op_mvpg(DisasContext *s, DisasOps *o)
     return NO_EXIT;
 }
 
+static ExitStatus op_mvst(DisasContext *s, DisasOps *o)
+{
+    potential_page_fault(s);
+    gen_helper_mvst(o->in1, cpu_env, regs[0], o->in1, o->in2);
+    set_cc_static(s);
+    return_low128(o->in2);
+    return NO_EXIT;
+}
+
 static ExitStatus op_mul(DisasContext *s, DisasOps *o)
 {
     tcg_gen_mul_i64(o->out, o->in1, o->in2);