Patchwork [2/2] tcg-i386: Use segment registers to implement GUEST_BASE.

login
register
mail settings
Submitter Richard Henderson
Date June 4, 2010, 12:35 a.m.
Message ID <1275611718-32630-3-git-send-email-rth@twiddle.net>
Download mbox | patch
Permalink /patch/54529/
State New
Headers show

Comments

Richard Henderson - June 4, 2010, 12:35 a.m.
For 32-bit, using a segment override is smaller than the 4-byte
immediate offset.  For 64-bit, segments can hold the entire 64-bit
offset whereas the 4-byte immediate cannot.

Only implemented for linux, with fallback to the immediate offset
if the system call fails.

Signed-off-by: Richard Henderson <rth@twiddle.net>
---
 tcg/i386/tcg-target.c |  206 +++++++++++++++++++++++++++++++++++-------------
 1 files changed, 150 insertions(+), 56 deletions(-)
Alexander Graf - June 4, 2010, 6:35 a.m.
On 04.06.2010, at 02:35, Richard Henderson wrote:

> For 32-bit, using a segment override is smaller than the 4-byte
> immediate offset.  For 64-bit, segments can hold the entire 64-bit
> offset whereas the 4-byte immediate cannot.

Very nice idea indeed :). Have you found it to be faster? IIRC segment accesses are slower when seg_offs != 0. But then again the code is smaller, so that might make up for it.

> Only implemented for linux, with fallback to the immediate offset
> if the system call fails.
> 
> Signed-off-by: Richard Henderson <rth@twiddle.net>
> ---
> tcg/i386/tcg-target.c |  206 +++++++++++++++++++++++++++++++++++-------------
> 1 files changed, 150 insertions(+), 56 deletions(-)
> 
> diff --git a/tcg/i386/tcg-target.c b/tcg/i386/tcg-target.c
> index fab2a30..e34254f 100644
> --- a/tcg/i386/tcg-target.c
> +++ b/tcg/i386/tcg-target.c
> @@ -240,6 +240,8 @@ static inline int tcg_target_const_match(tcg_target_long val,
> # define P_REXB_R	0
> # define P_REXB_RM	0
> #endif
> +#define P_FS		0x4000
> +#define P_GS		0x8000
> 
> #define OPC_ARITH_EvIz	(0x81)
> #define OPC_ARITH_EvIb	(0x83)
> @@ -347,11 +349,29 @@ static const uint8_t tcg_cond_to_jcc[10] = {
>     [TCG_COND_GTU] = JCC_JA,
> };
> 
> +static inline void tcg_out_seg_prefix(TCGContext *s, int opc)
> +{
> +    switch (opc & (P_FS | P_GS)) {
> +    case 0:
> +        break;
> +    case P_FS:
> +        tcg_out8(s, 0x64);
> +        break;
> +    case P_GS:
> +        tcg_out8(s, 0x65);
> +        break;
> +    default:
> +        tcg_abort();
> +    }
> +}
> +
> #if TCG_TARGET_REG_BITS == 64
> static void tcg_out_opc(TCGContext *s, int opc, int r, int rm, int x)
> {
>     int rex;
> 
> +    tcg_out_seg_prefix(s, opc);
> +
>     if (opc & P_DATA16) {
>         /* We should never be asking for both 16 and 64-bit operation.  */
>         assert((opc & P_REXW) == 0);
> @@ -387,6 +407,8 @@ static void tcg_out_opc(TCGContext *s, int opc, int r, int rm, int x)
> #else
> static void tcg_out_opc(TCGContext *s, int opc)
> {
> +    tcg_out_seg_prefix(s, opc);
> +
>     if (opc & P_DATA16) {
>         tcg_out8(s, 0x66);
>     }
> @@ -956,6 +978,48 @@ static void tcg_out_jmp(TCGContext *s, tcg_target_long dest)
>     tcg_out_branch(s, 0, dest);
> }
> 
> +#ifndef GUEST_BASE
> +#define GUEST_BASE 0
> +#endif
> +
> +#if defined(__x86_64__) && defined(__linux__)
> +# include <sys/syscall.h>
> +# include <asm/prctl.h>
> +
> +static int guest_base_flags;
> +static inline void setup_guest_base_seg(void)
> +{
> +    if (syscall(__NR_arch_prctl, ARCH_SET_GS, GUEST_BASE) == 0) {
> +        guest_base_flags = P_GS;

I'd like to see a comment here stating that FS is used for TLS.

> +    }
> +}
> +#elif defined(__i386__) && defined(__linux__)
> +# include <sys/syscall.h>
> +# include <asm/ldt.h>
> +
> +static int guest_base_flags;
> +static inline void setup_guest_base_seg(void)
> +{
> +    struct user_desc d;
> +
> +    memset(&d, 0, sizeof(d));
> +    d.entry_number = -1;                /* let the kernel choose */
> +    d.base_addr = GUEST_BASE;
> +    d.limit = 0xfffff;                  /* 4GB segment */
> +    d.seg_32bit = 1;
> +    d.limit_in_pages = 1;
> +    d.useable = 1;
> +
> +    if (syscall(__NR_set_thread_area, &d) == 0) {
> +        asm volatile("movw %w0, %%fs" : : "r"(d.entry_number * 8 + 3));

Same here for %gs.

[snip]

> @@ -1945,6 +2031,14 @@ static void tcg_target_qemu_prologue(TCGContext *s)
>         tcg_out_pop(s, tcg_target_callee_save_regs[i]);
>     }
>     tcg_out_opc(s, OPC_RET, 0, 0, 0);
> +
> +    /* Try to set up %fs or %gs (whichever isn't already used for TLS)
> +       to point to GUEST_BASE.  The 1-byte segment override prefix is
> +       always smaller than the 4-byte offset we'd have to encode into
> +       the address, and is also able to handle the full 64-bit offset.  */

Ah, so that's where the comment hides. Uh. Better be safe than sorry and have it in both locations, no? :)

Alex
Richard Henderson - June 7, 2010, 7:45 p.m.
On 06/03/2010 11:35 PM, Alexander Graf wrote:
> 
> On 04.06.2010, at 02:35, Richard Henderson wrote:
> 
>> For 32-bit, using a segment override is smaller than the 4-byte 
>> immediate offset.  For 64-bit, segments can hold the entire 64-bit 
>> offset whereas the 4-byte immediate cannot.
> 
> Very nice idea indeed :). Have you found it to be faster? IIRC
> segment accesses are slower when seg_offs != 0. But then again the
> code is smaller, so that might make up for it.

I've not yet tried to do any proper benchmarking.

I had thought that segment accesses were slower only when limit != 4G,
i.e. when the segment boundaries need to be checked, and that 64-bit
segments don't have boundaries, so the slowdown would not apply there either.

I'll put it on my queue for the weekend.


r~

Patch

diff --git a/tcg/i386/tcg-target.c b/tcg/i386/tcg-target.c
index fab2a30..e34254f 100644
--- a/tcg/i386/tcg-target.c
+++ b/tcg/i386/tcg-target.c
@@ -240,6 +240,8 @@  static inline int tcg_target_const_match(tcg_target_long val,
 # define P_REXB_R	0
 # define P_REXB_RM	0
 #endif
+#define P_FS		0x4000
+#define P_GS		0x8000
 
 #define OPC_ARITH_EvIz	(0x81)
 #define OPC_ARITH_EvIb	(0x83)
@@ -347,11 +349,29 @@  static const uint8_t tcg_cond_to_jcc[10] = {
     [TCG_COND_GTU] = JCC_JA,
 };
 
+static inline void tcg_out_seg_prefix(TCGContext *s, int opc)
+{
+    switch (opc & (P_FS | P_GS)) {
+    case 0:
+        break;
+    case P_FS:
+        tcg_out8(s, 0x64);
+        break;
+    case P_GS:
+        tcg_out8(s, 0x65);
+        break;
+    default:
+        tcg_abort();
+    }
+}
+
 #if TCG_TARGET_REG_BITS == 64
 static void tcg_out_opc(TCGContext *s, int opc, int r, int rm, int x)
 {
     int rex;
 
+    tcg_out_seg_prefix(s, opc);
+
     if (opc & P_DATA16) {
         /* We should never be asking for both 16 and 64-bit operation.  */
         assert((opc & P_REXW) == 0);
@@ -387,6 +407,8 @@  static void tcg_out_opc(TCGContext *s, int opc, int r, int rm, int x)
 #else
 static void tcg_out_opc(TCGContext *s, int opc)
 {
+    tcg_out_seg_prefix(s, opc);
+
     if (opc & P_DATA16) {
         tcg_out8(s, 0x66);
     }
@@ -956,6 +978,48 @@  static void tcg_out_jmp(TCGContext *s, tcg_target_long dest)
     tcg_out_branch(s, 0, dest);
 }
 
+#ifndef GUEST_BASE
+#define GUEST_BASE 0
+#endif
+
+#if defined(__x86_64__) && defined(__linux__)
+# include <sys/syscall.h>
+# include <asm/prctl.h>
+
+static int guest_base_flags;
+static inline void setup_guest_base_seg(void)
+{
+    if (syscall(__NR_arch_prctl, ARCH_SET_GS, GUEST_BASE) == 0) {
+        guest_base_flags = P_GS;
+    }
+}
+#elif defined(__i386__) && defined(__linux__)
+# include <sys/syscall.h>
+# include <asm/ldt.h>
+
+static int guest_base_flags;
+static inline void setup_guest_base_seg(void)
+{
+    struct user_desc d;
+
+    memset(&d, 0, sizeof(d));
+    d.entry_number = -1;                /* let the kernel choose */
+    d.base_addr = GUEST_BASE;
+    d.limit = 0xfffff;                  /* 4GB segment */
+    d.seg_32bit = 1;
+    d.limit_in_pages = 1;
+    d.useable = 1;
+
+    if (syscall(__NR_set_thread_area, &d) == 0) {
+        asm volatile("movw %w0, %%fs" : : "r"(d.entry_number * 8 + 3));
+        guest_base_flags = P_FS;
+    }
+}
+#else
+# define guest_base_flags       0
+static inline void setup_guest_base_seg(void) { }
+#endif
+
 #if defined(CONFIG_SOFTMMU)
 
 #include "../../softmmu_defs.h"
@@ -1056,37 +1120,41 @@  static inline void tcg_out_tlb_load(TCGContext *s, int addrlo_idx,
 #endif
 
 static void tcg_out_qemu_ld_direct(TCGContext *s, int datalo, int datahi,
-                                   int base, tcg_target_long ofs, int sizeop)
+                                   int base, tcg_target_long ofs, int sizeop,
+                                   int prefix)
 {
 #ifdef TARGET_WORDS_BIGENDIAN
     const int bswap = 1;
 #else
     const int bswap = 0;
 #endif
+    int rexw = (TARGET_LONG_BITS == 64 ? P_REXW : 0);
+
     switch (sizeop) {
     case 0:
-        tcg_out_modrm_offset(s, OPC_MOVZBL, datalo, base, ofs);
+        tcg_out_modrm_offset(s, OPC_MOVZBL + prefix, datalo, base, ofs);
         break;
     case 0 | 4:
-        tcg_out_modrm_offset(s, OPC_MOVSBL + P_REXW, datalo, base, ofs);
+        tcg_out_modrm_offset(s, OPC_MOVSBL + prefix + rexw, datalo, base, ofs);
         break;
     case 1:
-        tcg_out_modrm_offset(s, OPC_MOVZWL, datalo, base, ofs);
+        tcg_out_modrm_offset(s, OPC_MOVZWL + prefix, datalo, base, ofs);
         if (bswap) {
             tcg_out_rolw_8(s, datalo);
         }
         break;
     case 1 | 4:
         if (bswap) {
-            tcg_out_modrm_offset(s, OPC_MOVZWL, datalo, base, ofs);
+            tcg_out_modrm_offset(s, OPC_MOVZWL + prefix, datalo, base, ofs);
             tcg_out_rolw_8(s, datalo);
-            tcg_out_modrm(s, OPC_MOVSWL + P_REXW, datalo, datalo);
+            tcg_out_modrm(s, OPC_MOVSWL + rexw, datalo, datalo);
         } else {
-            tcg_out_modrm_offset(s, OPC_MOVSWL + P_REXW, datalo, base, ofs);
+            tcg_out_modrm_offset(s, OPC_MOVSWL + prefix + rexw,
+                                 datalo, base, ofs);
         }
         break;
     case 2:
-        tcg_out_ld(s, TCG_TYPE_I32, datalo, base, ofs);
+        tcg_out_modrm_offset(s, OPC_MOVL_GvEv + prefix, datalo, base, ofs);
         if (bswap) {
             tcg_out_bswap32(s, datalo);
         }
@@ -1094,17 +1162,18 @@  static void tcg_out_qemu_ld_direct(TCGContext *s, int datalo, int datahi,
 #if TCG_TARGET_REG_BITS == 64
     case 2 | 4:
         if (bswap) {
-            tcg_out_ld(s, TCG_TYPE_I32, datalo, base, ofs);
+            tcg_out_modrm_offset(s, OPC_MOVL_GvEv + prefix, datalo, base, ofs);
             tcg_out_bswap32(s, datalo);
             tcg_out_ext32s(s, datalo, datalo);
         } else {
-            tcg_out_modrm_offset(s, OPC_MOVSLQ, datalo, base, ofs);
+            tcg_out_modrm_offset(s, OPC_MOVSLQ + prefix, datalo, base, ofs);
         }
         break;
 #endif
     case 3:
         if (TCG_TARGET_REG_BITS == 64) {
-            tcg_out_ld(s, TCG_TYPE_I64, datalo, base, ofs);
+            tcg_out_modrm_offset(s, OPC_MOVL_GvEv + prefix + P_REXW,
+                                 datalo, base, ofs);
             if (bswap) {
                 tcg_out_bswap64(s, datalo);
             }
@@ -1115,11 +1184,15 @@  static void tcg_out_qemu_ld_direct(TCGContext *s, int datalo, int datahi,
                 datahi = t;
             }
             if (base != datalo) {
-                tcg_out_ld(s, TCG_TYPE_I32, datalo, base, ofs);
-                tcg_out_ld(s, TCG_TYPE_I32, datahi, base, ofs + 4);
+                tcg_out_modrm_offset(s, OPC_MOVL_GvEv + prefix,
+                                     datalo, base, ofs);
+                tcg_out_modrm_offset(s, OPC_MOVL_GvEv + prefix,
+                                     datahi, base, ofs + 4);
             } else {
-                tcg_out_ld(s, TCG_TYPE_I32, datahi, base, ofs + 4);
-                tcg_out_ld(s, TCG_TYPE_I32, datalo, base, ofs);
+                tcg_out_modrm_offset(s, OPC_MOVL_GvEv + prefix,
+                                     datahi, base, ofs + 4);
+                tcg_out_modrm_offset(s, OPC_MOVL_GvEv + prefix,
+                                     datalo, base, ofs);
             }
             if (bswap) {
                 tcg_out_bswap32(s, datalo);
@@ -1135,8 +1208,7 @@  static void tcg_out_qemu_ld_direct(TCGContext *s, int datalo, int datahi,
 /* XXX: qemu_ld and qemu_st could be modified to clobber only EDX and
    EAX. It will be useful once fixed registers globals are less
    common. */
-static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args,
-                            int opc)
+static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, int opc)
 {
     int data_reg, data_reg2 = 0;
     int addrlo_idx;
@@ -1161,7 +1233,7 @@  static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args,
 
     /* TLB Hit.  */
     tcg_out_qemu_ld_direct(s, data_reg, data_reg2,
-                           tcg_target_call_iarg_regs[0], 0, opc);
+                           tcg_target_call_iarg_regs[0], 0, opc, 0);
 
     /* jmp label2 */
     tcg_out8(s, OPC_JMP_short);
@@ -1230,28 +1302,32 @@  static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args,
     {
         int32_t offset = GUEST_BASE;
         int base = args[addrlo_idx];
-
-        if (TCG_TARGET_REG_BITS == 64) {
-            /* ??? We assume all operations have left us with register
-               contents that are zero extended.  So far this appears to
-               be true.  If we want to enforce this, we can either do
-               an explicit zero-extension here, or (if GUEST_BASE == 0)
-               use the ADDR32 prefix.  For now, do nothing.  */
-
-            if (offset != GUEST_BASE) {
-                tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_RDI, GUEST_BASE);
-                tgen_arithr(s, ARITH_ADD + P_REXW, TCG_REG_RDI, base);
-                base = TCG_REG_RDI, offset = 0;
-            }
+        int prefix = 0;
+
+        /* ??? For 64-bit, we assume all operations have left us with register
+           contents that are zero extended.  So far this appears to be true.
+           If we want to enforce this, we can either do an explicit zero
+           extension here, or (if GUEST_BASE == 0 or guest_base_flags) use
+           the ADDR32 prefix.  For now, do nothing.  */
+
+        if (GUEST_BASE && guest_base_flags) {
+            prefix = guest_base_flags;
+            offset = 0;
+        } else if (TCG_TARGET_REG_BITS == 64 && offset != GUEST_BASE) {
+            tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_RDI, GUEST_BASE);
+            tgen_arithr(s, ARITH_ADD + P_REXW, TCG_REG_RDI, base);
+            base = TCG_REG_RDI, offset = 0;
         }
 
-        tcg_out_qemu_ld_direct(s, data_reg, data_reg2, base, offset, opc);
+        tcg_out_qemu_ld_direct(s, data_reg, data_reg2, base,
+                               offset, opc, prefix);
     }
 #endif
 }
 
 static void tcg_out_qemu_st_direct(TCGContext *s, int datalo, int datahi,
-                                   int base, tcg_target_long ofs, int sizeop)
+                                   int base, tcg_target_long ofs, int sizeop,
+                                   int prefix)
 {
 #ifdef TARGET_WORDS_BIGENDIAN
     const int bswap = 1;
@@ -1266,7 +1342,8 @@  static void tcg_out_qemu_st_direct(TCGContext *s, int datalo, int datahi,
 
     switch (sizeop) {
     case 0:
-        tcg_out_modrm_offset(s, OPC_MOVB_EvGv + P_REXB_R, datalo, base, ofs);
+        tcg_out_modrm_offset(s, OPC_MOVB_EvGv + P_REXB_R + prefix,
+                             datalo, base, ofs);
         break;
     case 1:
         if (bswap) {
@@ -1274,7 +1351,8 @@  static void tcg_out_qemu_st_direct(TCGContext *s, int datalo, int datahi,
             tcg_out_rolw_8(s, scratch);
             datalo = scratch;
         }
-        tcg_out_modrm_offset(s, OPC_MOVL_EvGv + P_DATA16, datalo, base, ofs);
+        tcg_out_modrm_offset(s, OPC_MOVL_EvGv + P_DATA16 + prefix,
+                             datalo, base, ofs);
         break;
     case 2:
         if (bswap) {
@@ -1282,7 +1360,7 @@  static void tcg_out_qemu_st_direct(TCGContext *s, int datalo, int datahi,
             tcg_out_bswap32(s, scratch);
             datalo = scratch;
         }
-        tcg_out_st(s, TCG_TYPE_I32, datalo, base, ofs);
+        tcg_out_modrm_offset(s, OPC_MOVL_EvGv + prefix, datalo, base, ofs);
         break;
     case 3:
         if (TCG_TARGET_REG_BITS == 64) {
@@ -1291,17 +1369,22 @@  static void tcg_out_qemu_st_direct(TCGContext *s, int datalo, int datahi,
                 tcg_out_bswap64(s, scratch);
                 datalo = scratch;
             }
-            tcg_out_st(s, TCG_TYPE_I64, datalo, base, ofs);
+            tcg_out_modrm_offset(s, OPC_MOVL_EvGv + P_REXW + prefix,
+                                 datalo, base, ofs);
         } else if (bswap) {
             tcg_out_mov(s, TCG_TYPE_I32, scratch, datahi);
             tcg_out_bswap32(s, scratch);
-            tcg_out_st(s, TCG_TYPE_I32, scratch, base, ofs);
+            tcg_out_modrm_offset(s, OPC_MOVL_EvGv + prefix,
+                                 scratch, base, ofs);
             tcg_out_mov(s, TCG_TYPE_I32, scratch, datalo);
             tcg_out_bswap32(s, scratch);
-            tcg_out_st(s, TCG_TYPE_I32, scratch, base, ofs + 4);
+            tcg_out_modrm_offset(s, OPC_MOVL_EvGv + prefix,
+                                 scratch, base, ofs + 4);
         } else {
-            tcg_out_st(s, TCG_TYPE_I32, datalo, base, ofs);
-            tcg_out_st(s, TCG_TYPE_I32, datahi, base, ofs + 4);
+            tcg_out_modrm_offset(s, OPC_MOVL_EvGv + prefix,
+                                 datalo, base, ofs);
+            tcg_out_modrm_offset(s, OPC_MOVL_EvGv + prefix,
+                                 datahi, base, ofs + 4);
         }
         break;
     default:
@@ -1336,7 +1419,7 @@  static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args,
 
     /* TLB Hit.  */
     tcg_out_qemu_st_direct(s, data_reg, data_reg2,
-                           tcg_target_call_iarg_regs[0], 0, opc);
+                           tcg_target_call_iarg_regs[0], 0, opc, 0);
 
     /* jmp label2 */
     tcg_out8(s, OPC_JMP_short);
@@ -1407,22 +1490,25 @@  static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args,
     {
         int32_t offset = GUEST_BASE;
         int base = args[addrlo_idx];
-
-        if (TCG_TARGET_REG_BITS == 64) {
-            /* ??? We assume all operations have left us with register
-               contents that are zero extended.  So far this appears to
-               be true.  If we want to enforce this, we can either do
-               an explicit zero-extension here, or (if GUEST_BASE == 0)
-               use the ADDR32 prefix.  For now, do nothing.  */
-
-            if (offset != GUEST_BASE) {
-                tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_RDI, GUEST_BASE);
-                tgen_arithr(s, ARITH_ADD + P_REXW, TCG_REG_RDI, base);
-                base = TCG_REG_RDI, offset = 0;
-            }
+        int prefix = 0;
+
+        /* ??? For 64-bit, we assume all operations have left us with register
+           contents that are zero extended.  So far this appears to be true.
+           If we want to enforce this, we can either do an explicit zero
+           extension here, or (if GUEST_BASE == 0 or guest_base_flags) use
+           the ADDR32 prefix.  For now, do nothing.  */
+
+        if (GUEST_BASE && guest_base_flags) {
+            prefix = guest_base_flags;
+            offset = 0;
+        } else if (TCG_TARGET_REG_BITS == 64 && offset != GUEST_BASE) {
+            tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_RDI, GUEST_BASE);
+            tgen_arithr(s, ARITH_ADD + P_REXW, TCG_REG_RDI, base);
+            base = TCG_REG_RDI, offset = 0;
         }
 
-        tcg_out_qemu_st_direct(s, data_reg, data_reg2, base, offset, opc);
+        tcg_out_qemu_st_direct(s, data_reg, data_reg2, base,
+                               offset, opc, prefix);
     }
 #endif
 }
@@ -1945,6 +2031,14 @@  static void tcg_target_qemu_prologue(TCGContext *s)
         tcg_out_pop(s, tcg_target_callee_save_regs[i]);
     }
     tcg_out_opc(s, OPC_RET, 0, 0, 0);
+
+    /* Try to set up %fs or %gs (whichever isn't already used for TLS)
+       to point to GUEST_BASE.  The 1-byte segment override prefix is
+       always smaller than the 4-byte offset we'd have to encode into
+       the address, and is also able to handle the full 64-bit offset.  */
+    if (GUEST_BASE) {
+        setup_guest_base_seg();
+    }
 }
 
 static void tcg_target_init(TCGContext *s)