diff mbox

[v2,41/60] target-i386: Create gen_lea_v_seg

Message ID 1385694047-6116-42-git-send-email-rth@twiddle.net
State New
Headers show

Commit Message

Richard Henderson Nov. 29, 2013, 3 a.m. UTC
Add forgotten zero-extension in the TARGET_X86_64, !CODE64, ss32 case;
use this new function to implement gen_string_movl_A0_EDI,
gen_string_movl_A0_ESI, gen_add_A0_ds_seg.

Signed-off-by: Richard Henderson <rth@twiddle.net>
---
 target-i386/translate.c | 159 ++++++++++++++++--------------------------------
 1 file changed, 52 insertions(+), 107 deletions(-)

Comments

Peter Maydell Dec. 26, 2013, 6:38 p.m. UTC | #1
On 29 November 2013 03:00, Richard Henderson <rth@twiddle.net> wrote:
> Add forgotten zero-extension in the TARGET_X86_64, !CODE64, ss32 case;
> use this new function to implement gen_string_movl_A0_EDI,
> gen_string_movl_A0_ESI, gen_add_A0_ds_seg.

I'm afraid I can't figure out how this code matches up with the previous
implementation of these functions. For example:

> +/* Compute SEG:REG into A0.  SEG is selected from the override segment
> +   (OVR_SEG) and the default segment (DEF_SEG).  OVR_SEG may be -1 to
> +   indicate no override.  */
> +static void gen_lea_v_seg(DisasContext *s, TCGv a0, int def_seg, int ovr_seg)
>  {
> -    int override;
> +    TCGMemOp aflag = s->aflag;
>
> -    override = s->override;
> -    switch (s->aflag) {
> +    switch (aflag) {
>  #ifdef TARGET_X86_64
>      case MO_64:
> -        if (override >= 0) {
> -            gen_op_movq_A0_seg(override);
> -            gen_op_addq_A0_reg_sN(0, R_ESI);
> -        } else {
> -            gen_op_movq_A0_reg(R_ESI);
> +        if (ovr_seg < 0) {
> +            tcg_gen_mov_tl(cpu_A0, a0);
> +            return;
>          }
>          break;
>  #endif
>      case MO_32:
>          /* 32 bit address */
> -        if (s->addseg && override < 0)
> -            override = R_DS;
> -        if (override >= 0) {
> -            gen_op_movl_A0_seg(override);
> -            gen_op_addl_A0_reg_sN(0, R_ESI);
> -        } else {
> -            gen_op_movl_A0_reg(R_ESI);
> +        if (ovr_seg < 0) {
> +            if (s->addseg) {
> +                ovr_seg = def_seg;
> +            } else {
> +                tcg_gen_ext32u_tl(cpu_A0, a0);
> +                return;
> +            }
>          }
>          break;
>      case MO_16:
> -        /* 16 address, always override */
> -        if (override < 0)
> -            override = R_DS;
> -        tcg_gen_ext16u_tl(cpu_A0, cpu_regs[R_ESI]);
> -        gen_op_addl_A0_seg(s, override);
> +        /* 16 bit address */
> +        if (ovr_seg < 0) {
> +            ovr_seg = def_seg;
> +        }
> +        tcg_gen_ext16u_tl(cpu_A0, a0);
> +        if (!s->addseg) {
> +            return;
> +        }

The old MO_16 code for gen_string_movl* doesn't care
about s->addseg, and always performs an add of a segment.
This new code might stop without doing the addition.

> +        a0 = cpu_A0;

This is also pretty confusing -- we have a parameter "a0"
which isn't the same thing as cpu_A0, and in this case
statement sometimes cpu_A0 is the result we're calculating
based on a0, and sometimes we don't touch cpu_A0 at
all and rely on following code to set it up, and in this case
we use cpu_A0 as a random temporary and then assign
it to a0...

>          break;
>      default:
>          tcg_abort();
>      }
> -}

I also find the handling of "ovr_seg < 0" pretty hard to read
in the new code -- the code for "we don't need to add a
segment" and "we do need to add a segment" is all mixed
up together, half the cases in the switch statement return
early and half fall out to maybe go through the code below,
and the code below is only for the "adding a segment" part.

I suspect it would be clearer if it was written:

   if (ovr_seg < 0 &&
      (s->aflag == MO_16 || (s->aflag == MO_32 && s->addseg))) {
      ovr_seg = def_seg;
   }
   if (ovr_seg < 0) {
       /* generate code for no segment add */
   } else {
       /* generate code for segment add */
   }

thanks
-- PMM
Richard Henderson Dec. 26, 2013, 7:31 p.m. UTC | #2
On 12/26/2013 10:38 AM, Peter Maydell wrote:
> The old MO_16 code for gen_string_movl* doesn't care
> about s->addseg, and always performs an add of a segment.
> This new code might stop without doing the addition.

The only time s->addseg will be false in 16-bit mode is during translation of
LEA.  I do need the addseg check there for LEA cleanup, but this change should
not affect gen_string_movl.

>> +        a0 = cpu_A0;
> 
> This is also pretty confusing -- we have a parameter "a0"
> which isn't the same thing as cpu_A0, and in this case
> statement sometimes cpu_A0 is the result we're calculating
> based on a0, and sometimes we don't touch cpu_A0 at
> all and rely on following code to set it up, and in this case
> we use cpu_A0 as a random temporary and then assign
> it to a0...

While I can agree that a0 vs cpu_A0 might be a tad confusing, a0 is always the
current input address, and cpu_A0 is always the output address.

I disagree with the characterization "random temporary".  Using the output as a
temporary in computing the output is totally sensible, given that our most
popular host platform is 2-address.

> I also find the handling of "ovr_seg < 0" pretty hard to read
> in the new code -- the code for "we don't need to add a
> segment" and "we do need to add a segment" is all mixed
> up together, half the cases in the switch statement return
> early and half fall out to maybe go through the code below,
> and the code below is only for the "adding a segment" part.
> 
> I suspect it would be clearer if it was written:
> 
>    if (ovr_seg < 0 &&
>       (s->aflag == MO_16 || (s->aflag == MO_32 && s->addseg))) {
>       ovr_seg = def_seg;
>    }
>    if (ovr_seg < 0) {
>        /* generate code for no segment add */
>    } else {
>        /* generate code for segment add */
>    }

Really?  I honestly find that harder to read, because that first condition is
so complicated.  Better to have it split out in the switch statement where we're
already doing special things for MO_16, MO_32, and MO_64.


r~
Peter Maydell Dec. 26, 2013, 9:27 p.m. UTC | #3
On 26 December 2013 19:31, Richard Henderson <rth@twiddle.net> wrote:
> On 12/26/2013 10:38 AM, Peter Maydell wrote:
>> The old MO_16 code for gen_string_movl* doesn't care
>> about s->addseg, and always performs an add of a segment.
>> This new code might stop without doing the addition.
>
> The only time s->addseg will be false in 16-bit mode is during translation of
> LEA.  I do need the addseg check there for LEA cleanup, but this change should
> not affect gen_string_movl.

Oh, is that the bit that does:

        val = s->addseg;
        s->addseg = 0;
        gen_lea_modrm(env, s, modrm);
        s->addseg = val;

? I think we should get rid of that -- s->addseg should always
mean "we can do the segment-base-is-zero optimization",
it shouldn't be a secret hidden parameter to the gen_lea functions
saying "don't do this addition".

> I disagree with the characterization "random temporary".  Using the output as a
> temporary in computing the output is totally sensible, given that our most
> popular host platform is 2-address.

This is the kind of thing that in an ideal world the register allocator
would deal with. The tcg/README optimisation suggestions
don't say anything about preferring to use X,X,Y rather than X,Y,Z
ops where possible, and typically allocating and using a special
purpose temp is more readable code.

>> I also find the handling of "ovr_seg < 0" pretty hard to read
>> in the new code -- the code for "we don't need to add a
>> segment" and "we do need to add a segment" is all mixed
>> up together, half the cases in the switch statement return
>> early and half fall out to maybe go through the code below,
>> and the code below is only for the "adding a segment" part.
>>
>> I suspect it would be clearer if it was written:
>>
>>    if (ovr_seg < 0 &&
>>       (s->aflag == MO_16 || (s->aflag == MO_32 && s->addseg))) {
>>       ovr_seg = def_seg;
>>    }
>>    if (ovr_seg < 0) {
>>        /* generate code for no segment add */
>>    } else {
>>        /* generate code for segment add */
>>    }
>
> Really?  I honestly find that harder to read, because that first condition is
> so complicated.  Better to have it split out in the switch statement where we're
> already doing special things for MO_16, MO_32, and MO_64.

I'm trying to separate out "what are we doing?" ie "do we need to
do no segment add, segment add of specified segment, or segment
add of default segment?" from "how do we do it" ie "emit these
tcg instructions".

More generally, it's pretty unclear to me why we're handling
"use default segment register for instruction" (ie ovr_seg == -1)
differently for the three cases. The whole addseg
business appears to be a combination of poorly documented
optimization and dodgy backdoor for the lea case as above.
Why is it OK to skip the addition of the base address for ES
(in the movl_A0_EDI case) when the comment for addseg says
it only applies to CS/DS/ES? Why is it not OK to skip the addition
of the base address for CS/DS/ES if it was specified by an
override prefix rather than being the default for the insn?

It seems to me that we ought to try to get this code to a
point where it looks more like:
    if (ovr_seg < 0) {
        ovr_seg = def_seg;
    }
    emit code to get address;
    if (!segment_base_guaranteed_zero(s, ovr_seg)) {
        emit code to add base to address;
    }

where segment_base_guaranteed_zero() is a helper
function like:
bool segment_base_guaranteed_zero(s, seg) {
      /* Return true if we can guarantee at translate time that
       * the base address of the specified segment is zero
       * (and thus can skip emitting code to add it)
       */
      return (!s->addseg &&
          (seg == R_CS || seg == R_DS || seg == R_SS));
}

thanks
-- PMM
Peter Maydell Dec. 26, 2013, 9:31 p.m. UTC | #4
On 26 December 2013 21:27, Peter Maydell <peter.maydell@linaro.org> wrote:

> Why is it OK to skip the addition of the base address for ES
> (in the movl_A0_EDI case) when the comment for addseg says
> it only applies to CS/DS/ES?

Scratch that, misread of the comment. addseg applies to DS/ES/SS.

> Why is it not OK to skip the addition
> of the base address for CS/DS/ES if it was specified by an
> override prefix rather than being the default for the insn?

This still applies though.

> It seems to me that we ought to try to get this code to a
> point where it looks more like:
>     if (ovr_seg < 0) {
>         ovr_seg = def_seg;
>     }
>     emit code to get address;
>     if (!segment_base_guaranteed_zero(s, ovr_seg)) {
>         emit code to add base to address;
>     }
>
> where segment_base_guaranteed_zero() is a helper
> function like:
> bool segment_base_guaranteed_zero(s, seg) {
>       /* Return true if we can guarantee at translate time that
>        * the base address of the specified segment is zero
>        * (and thus can skip emitting code to add it)
>        */
>       return (!s->addseg &&
>           (seg == R_CS || seg == R_DS || seg == R_SS));

s/R_CS/R_ES/;

> }

thanks
-- PMM
Richard Henderson Dec. 27, 2013, 2:49 p.m. UTC | #5
On 12/26/2013 01:27 PM, Peter Maydell wrote:
>> The only time s->addseg will be false in 16-bit mode is during translation of
>> LEA.  I do need the addseg check there for LEA cleanup, but this change should
>> not affect gen_string_movl.
> 
> Oh, is that the bit that does:
> 
>         val = s->addseg;
>         s->addseg = 0;
>         gen_lea_modrm(env, s, modrm);
>         s->addseg = val;
> 
> ? I think we should get rid of that -- s->addseg should always
> mean "we can do the segment-base-is-zero optimization",
> it shouldn't be a secret hidden parameter to the gen_lea functions
> saying "don't do this addition".

Perhaps.  I'd rather do that as follow-up, since lea_v_seg isn't the top-level
function called for LEA.  And if we clean up addseg, we might as well clean up
popl_esp_hack as well.

How about a comment for now?

>> I disagree with the characterization "random temporary".  Using the output as a
>> temporary in computing the output is totally sensible, given that our most
>> popular host platform is 2-address.
> 
> This is the kind of thing that in an ideal world the register allocator
> would deal with. The tcg/README optimisation suggestions
> don't say anything about preferring to use X,X,Y rather than X,Y,Z
> ops where possible, and typically allocating and using a special
> purpose temp is more readable code.

I agree that a real register allocator would handle it.

I disagree that a special purpose temp would result in more readable code,
since then we'd need to add *more* code to deallocate it after the other
arithmetic.

> More generally, it's pretty unclear to me why we're handling
> "use default segment register for instruction" (ie ovr_seg == -1)
> differently for the three cases.

Because the handling of segments is fundamentally different in each of the
three cpu modes?

> Why is it OK to skip the addition of the base address for ES
> (in the movl_A0_EDI case) when the comment for addseg says
> it only applies to CS/DS/ES?

Err... when are we skipping the addition in gen_string_movl_A0_EDI?  We pass in
R_ES as the segment register to use...

> It seems to me that we ought to try to get this code to a
> point where it looks more like:
>     if (ovr_seg < 0) {
>         ovr_seg = def_seg;
>     }
>     emit code to get address;
>     if (!segment_base_guaranteed_zero(s, ovr_seg)) {
>         emit code to add base to address;
>     }
> 
> where segment_base_guaranteed_zero() is a helper
> function like:
> bool segment_base_guaranteed_zero(s, seg) {
>       /* Return true if we can guarantee at translate time that
>        * the base address of the specified segment is zero
>        * (and thus can skip emitting code to add it)
>        */
>       return (!s->addseg &&
>           (seg == R_CS || seg == R_DS || seg == R_SS));
> }

That does look much nicer, I agree.


r~
Peter Maydell Dec. 27, 2013, 4:06 p.m. UTC | #6
On 27 December 2013 14:49, Richard Henderson <rth@twiddle.net> wrote:
> On 12/26/2013 01:27 PM, Peter Maydell wrote:
>>> The only time s->addseg will be false in 16-bit mode is during translation of
>>> LEA.  I do need the addseg check there for LEA cleanup, but this change should
>>> not affect gen_string_movl.
>>
>> Oh, is that the bit that does:
>>
>>         val = s->addseg;
>>         s->addseg = 0;
>>         gen_lea_modrm(env, s, modrm);
>>         s->addseg = val;
>>
>> ? I think we should get rid of that -- s->addseg should always
>> mean "we can do the segment-base-is-zero optimization",
>> it shouldn't be a secret hidden parameter to the gen_lea functions
>> saying "don't do this addition".
>
> Perhaps.  I'd rather do that as follow-up, since lea_v_seg isn't the top-level
> function called for LEA.  And if we clean up addseg, we might as well clean up
> popl_esp_hack as well.
>
> How about a comment for now?

Yes, a comment is fine for now.

>> More generally, it's pretty unclear to me why we're handling
>> "use default segment register for instruction" (ie ovr_seg == -1)
>> differently for the three cases.
>
> Because the handling of segments is fundamentally different in each of the
> three cpu modes?

Well, yes, but I've been unable to match up what the
code is doing differently in the three cases with what the
Intel manual says about them. For instance the manual says
that segmentation is mostly disabled in 64 bit mode, but although
the function has a conditional on CODE64 it doesn't seem to
be suppressing the add for CS/DS/ES/SS base as I would
expect it to. Perhaps an explanatory comment with a reference
to the relevant sections of the intel docs would help?

Some of this seems to be because the code is tangling up a
bunch of things at once (like the lea backdooring of addseg
for 16 bit mode).

>> Why is it OK to skip the addition of the base address for ES
>> (in the movl_A0_EDI case) when the comment for addseg says
>> it only applies to CS/DS/ES?
>
> Err... when are we skipping the addition in gen_string_movl_A0_EDI?  We pass in
> R_ES as the segment register to use...

Yeah, that was a confusion on my part.

thanks
-- PMM
diff mbox

Patch

diff --git a/target-i386/translate.c b/target-i386/translate.c
index 52849e1..02b45ef 100644
--- a/target-i386/translate.c
+++ b/target-i386/translate.c
@@ -440,64 +440,18 @@  static inline void gen_op_add_reg_T0(TCGMemOp size, int reg)
     gen_op_mov_reg_v(size, reg, cpu_tmp0);
 }
 
-static inline void gen_op_addl_A0_reg_sN(int shift, int reg)
-{
-    tcg_gen_mov_tl(cpu_tmp0, cpu_regs[reg]);
-    if (shift != 0)
-        tcg_gen_shli_tl(cpu_tmp0, cpu_tmp0, shift);
-    tcg_gen_add_tl(cpu_A0, cpu_A0, cpu_tmp0);
-    /* For x86_64, this sets the higher half of register to zero.
-       For i386, this is equivalent to a nop. */
-    tcg_gen_ext32u_tl(cpu_A0, cpu_A0);
-}
-
-static inline void gen_op_movl_A0_seg(int reg)
-{
-    tcg_gen_ld32u_tl(cpu_A0, cpu_env, offsetof(CPUX86State, segs[reg].base) + REG_L_OFFSET);
-}
-
 static inline void gen_op_addl_A0_seg(DisasContext *s, int reg)
 {
     tcg_gen_ld_tl(cpu_tmp0, cpu_env, offsetof(CPUX86State, segs[reg].base));
-#ifdef TARGET_X86_64
     if (CODE64(s)) {
-        tcg_gen_andi_tl(cpu_A0, cpu_A0, 0xffffffff);
+        tcg_gen_ext32u_tl(cpu_A0, cpu_A0);
         tcg_gen_add_tl(cpu_A0, cpu_A0, cpu_tmp0);
     } else {
         tcg_gen_add_tl(cpu_A0, cpu_A0, cpu_tmp0);
-        tcg_gen_andi_tl(cpu_A0, cpu_A0, 0xffffffff);
+        tcg_gen_ext32u_tl(cpu_A0, cpu_A0);
     }
-#else
-    tcg_gen_add_tl(cpu_A0, cpu_A0, cpu_tmp0);
-#endif
 }
 
-#ifdef TARGET_X86_64
-static inline void gen_op_movq_A0_seg(int reg)
-{
-    tcg_gen_ld_tl(cpu_A0, cpu_env, offsetof(CPUX86State, segs[reg].base));
-}
-
-static inline void gen_op_addq_A0_seg(int reg)
-{
-    tcg_gen_ld_tl(cpu_tmp0, cpu_env, offsetof(CPUX86State, segs[reg].base));
-    tcg_gen_add_tl(cpu_A0, cpu_A0, cpu_tmp0);
-}
-
-static inline void gen_op_movq_A0_reg(int reg)
-{
-    tcg_gen_mov_tl(cpu_A0, cpu_regs[reg]);
-}
-
-static inline void gen_op_addq_A0_reg_sN(int shift, int reg)
-{
-    tcg_gen_mov_tl(cpu_tmp0, cpu_regs[reg]);
-    if (shift != 0)
-        tcg_gen_shli_tl(cpu_tmp0, cpu_tmp0, shift);
-    tcg_gen_add_tl(cpu_A0, cpu_A0, cpu_tmp0);
-}
-#endif
-
 static inline void gen_op_ld_v(DisasContext *s, int idx, TCGv t0, TCGv a0)
 {
     tcg_gen_qemu_ld_tl(t0, a0, s->mem_index, idx | MO_LE);
@@ -523,70 +477,77 @@  static inline void gen_jmp_im(target_ulong pc)
     tcg_gen_st_tl(cpu_tmp0, cpu_env, offsetof(CPUX86State, eip));
 }
 
-static inline void gen_string_movl_A0_ESI(DisasContext *s)
+/* Compute SEG:REG into A0.  SEG is selected from the override segment
+   (OVR_SEG) and the default segment (DEF_SEG).  OVR_SEG may be -1 to
+   indicate no override.  */
+static void gen_lea_v_seg(DisasContext *s, TCGv a0, int def_seg, int ovr_seg)
 {
-    int override;
+    TCGMemOp aflag = s->aflag;
 
-    override = s->override;
-    switch (s->aflag) {
+    switch (aflag) {
 #ifdef TARGET_X86_64
     case MO_64:
-        if (override >= 0) {
-            gen_op_movq_A0_seg(override);
-            gen_op_addq_A0_reg_sN(0, R_ESI);
-        } else {
-            gen_op_movq_A0_reg(R_ESI);
+        if (ovr_seg < 0) {
+            tcg_gen_mov_tl(cpu_A0, a0);
+            return;
         }
         break;
 #endif
     case MO_32:
         /* 32 bit address */
-        if (s->addseg && override < 0)
-            override = R_DS;
-        if (override >= 0) {
-            gen_op_movl_A0_seg(override);
-            gen_op_addl_A0_reg_sN(0, R_ESI);
-        } else {
-            gen_op_movl_A0_reg(R_ESI);
+        if (ovr_seg < 0) {
+            if (s->addseg) {
+                ovr_seg = def_seg;
+            } else {
+                tcg_gen_ext32u_tl(cpu_A0, a0);
+                return;
+            }
         }
         break;
     case MO_16:
-        /* 16 address, always override */
-        if (override < 0)
-            override = R_DS;
-        tcg_gen_ext16u_tl(cpu_A0, cpu_regs[R_ESI]);
-        gen_op_addl_A0_seg(s, override);
+        /* 16 bit address */
+        if (ovr_seg < 0) {
+            ovr_seg = def_seg;
+        }
+        tcg_gen_ext16u_tl(cpu_A0, a0);
+        if (!s->addseg) {
+            return;
+        }
+        a0 = cpu_A0;
         break;
     default:
         tcg_abort();
     }
-}
 
-static inline void gen_string_movl_A0_EDI(DisasContext *s)
-{
-    switch (s->aflag) {
-#ifdef TARGET_X86_64
-    case MO_64:
-        gen_op_movq_A0_reg(R_EDI);
-        break;
-#endif
-    case MO_32:
-        if (s->addseg) {
-            gen_op_movl_A0_seg(R_ES);
-            gen_op_addl_A0_reg_sN(0, R_EDI);
+    if (ovr_seg >= 0) {
+        TCGv seg = tcg_temp_new();
+
+        tcg_gen_ld_tl(seg, cpu_env, offsetof(CPUX86State, segs[ovr_seg].base));
+
+        if (aflag == MO_64) {
+            tcg_gen_add_tl(cpu_A0, a0, seg);
+        } else if (CODE64(s)) {
+            tcg_gen_ext32u_tl(cpu_A0, a0);
+            tcg_gen_add_tl(cpu_A0, cpu_A0, seg);
         } else {
-            gen_op_movl_A0_reg(R_EDI);
+            tcg_gen_add_tl(cpu_A0, a0, seg);
+            tcg_gen_ext32u_tl(cpu_A0, cpu_A0);
         }
-        break;
-    case MO_16:
-        tcg_gen_ext16u_tl(cpu_A0, cpu_regs[R_EDI]);
-        gen_op_addl_A0_seg(s, R_ES);
-        break;
-    default:
-        tcg_abort();
+
+        tcg_temp_free(seg);
     }
 }
 
+static inline void gen_string_movl_A0_ESI(DisasContext *s)
+{
+    gen_lea_v_seg(s, cpu_regs[R_ESI], R_DS, s->override);
+}
+
+static inline void gen_string_movl_A0_EDI(DisasContext *s)
+{
+    gen_lea_v_seg(s, cpu_regs[R_EDI], R_ES, -1);
+}
+
 static inline void gen_op_movl_T0_Dshift(TCGMemOp ot) 
 {
     tcg_gen_ld32s_tl(cpu_T[0], cpu_env, offsetof(CPUX86State, df));
@@ -2141,23 +2102,7 @@  static void gen_nop_modrm(CPUX86State *env, DisasContext *s, int modrm)
 /* used for LEA and MOV AX, mem */
 static void gen_add_A0_ds_seg(DisasContext *s)
 {
-    int override, must_add_seg;
-    must_add_seg = s->addseg;
-    override = R_DS;
-    if (s->override >= 0) {
-        override = s->override;
-        must_add_seg = 1;
-    }
-    if (must_add_seg) {
-#ifdef TARGET_X86_64
-        if (CODE64(s)) {
-            gen_op_addq_A0_seg(override);
-        } else
-#endif
-        {
-            gen_op_addl_A0_seg(s, override);
-        }
-    }
+    gen_lea_v_seg(s, cpu_A0, R_DS, s->override);
 }
 
 /* generate modrm memory load or store of 'reg'. TMP0 is used if reg ==