[3/3] linux-user/i386: Emulate x86_64 vsyscalls
diff mbox series

Message ID 20200114210921.11216-4-richard.henderson@linaro.org
State New
Headers show
Series
  • linux-user: Implement x86_64 vsyscalls
Related show

Commit Message

Richard Henderson Jan. 14, 2020, 9:09 p.m. UTC
Notice the magic page during translate, much like we already
do for the arm32 commpage.  At runtime, raise an exception to
return cpu_loop for emulation.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/i386/cpu.h          |   1 +
 linux-user/i386/cpu_loop.c | 104 +++++++++++++++++++++++++++++++++++++
 target/i386/translate.c    |  16 +++++-
 3 files changed, 120 insertions(+), 1 deletion(-)

Comments

Alex Bennée Jan. 16, 2020, 10:51 a.m. UTC | #1
Richard Henderson <richard.henderson@linaro.org> writes:

> Notice the magic page during translate, much like we already
> do for the arm32 commpage.  At runtime, raise an exception to
> return cpu_loop for emulation.
>
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>

So I've been trying to test this but have run into a number of
(unrelated) hurdles on the way on my system. I think I've eliminated
these patches but it might be a regression.

Does running /sbin/ldconfig work ok for you?
Alex Bennée Jan. 16, 2020, 4:26 p.m. UTC | #2
Richard Henderson <richard.henderson@linaro.org> writes:

> Notice the magic page during translate, much like we already
> do for the arm32 commpage.  At runtime, raise an exception to
> return cpu_loop for emulation.
>
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> ---
>  target/i386/cpu.h          |   1 +
>  linux-user/i386/cpu_loop.c | 104 +++++++++++++++++++++++++++++++++++++
>  target/i386/translate.c    |  16 +++++-
>  3 files changed, 120 insertions(+), 1 deletion(-)
>
> diff --git a/target/i386/cpu.h b/target/i386/cpu.h
> index 164d038d1f..3fb2d2a986 100644
> --- a/target/i386/cpu.h
> +++ b/target/i386/cpu.h
> @@ -1000,6 +1000,7 @@ typedef uint64_t FeatureWordArray[FEATURE_WORDS];
>  
>  #define EXCP_VMEXIT     0x100 /* only for system emulation */
>  #define EXCP_SYSCALL    0x101 /* only for user emulation */
> +#define EXCP_VSYSCALL   0x102 /* only for user emulation */
>  
>  /* i386-specific interrupt pending bits.  */
>  #define CPU_INTERRUPT_POLL      CPU_INTERRUPT_TGT_EXT_1
> diff --git a/linux-user/i386/cpu_loop.c b/linux-user/i386/cpu_loop.c
> index e217cca5ee..8b7c9f7337 100644
> --- a/linux-user/i386/cpu_loop.c
> +++ b/linux-user/i386/cpu_loop.c
> @@ -92,6 +92,105 @@ static void gen_signal(CPUX86State *env, int sig, int code, abi_ptr addr)
>      queue_signal(env, info.si_signo, QEMU_SI_FAULT, &info);
>  }
>  
> +#ifdef TARGET_X86_64
> +static bool write_ok_or_segv(CPUX86State *env, abi_ptr addr, size_t len)
> +{
> +    /*
> +     * For all the vsyscalls, NULL means "don't write anything" not
> +     * "write it at address 0".
> +     */
> +    if (addr == 0 || access_ok(VERIFY_WRITE, addr, len)) {
> +        return true;
> +    }
> +
> +    gen_signal(env, TARGET_SIGSEGV, TARGET_SEGV_MAPERR, addr);
> +    return false;
> +}
> +
> +/*
> + * Since v3.1, the kernel traps and emulates the vsyscall page.
> + * Entry points other than the official generate SIGSEGV.
> + */
> +static void emulate_vsyscall(CPUX86State *env)
> +{
> +    int syscall;
> +    abi_ulong ret;
> +    uint64_t caller;
> +
> +    /*
> +     * Validate the entry point.  We have already validated the page
> +     * during translation, now verify the offset.
> +     */
> +    switch (env->eip & ~TARGET_PAGE_MASK) {
> +    case 0x000:
> +        syscall = TARGET_NR_gettimeofday;
> +        break;
> +    case 0x400:
> +        syscall = TARGET_NR_time;
> +        break;
> +    case 0x800:
> +        syscall = TARGET_NR_getcpu;
> +        break;
> +    default:
> +    sigsegv:
> +        /* Like force_sig(SIGSEGV).  */
> +        gen_signal(env, TARGET_SIGSEGV, TARGET_SI_KERNEL, 0);
> +        return;
> +    }
> +
> +    /*
> +     * Validate the return address.
> +     * Note that the kernel treats this the same as an invalid entry point.
> +     */
> +    if (get_user_u64(caller, env->regs[R_ESP])) {
> +        goto sigsegv;
> +    }
> +
> +    /*
> +     * Validate the the pointer arguments.
> +     */
> +    switch (syscall) {
> +    case TARGET_NR_gettimeofday:
> +        if (!write_ok_or_segv(env, env->regs[R_EDI],
> +                              sizeof(struct target_timeval)) ||
> +            !write_ok_or_segv(env, env->regs[R_ESI],
> +                              sizeof(struct target_timezone))) {
> +            return;
> +        }
> +        break;
> +    case TARGET_NR_time:
> +        if (!write_ok_or_segv(env, env->regs[R_EDI], sizeof(abi_long))) {
> +            return;
> +        }
> +        break;
> +    case TARGET_NR_getcpu:
> +        if (!write_ok_or_segv(env, env->regs[R_EDI], sizeof(uint32_t)) ||
> +            !write_ok_or_segv(env, env->regs[R_ESI], sizeof(uint32_t))) {
> +            return;
> +        }
> +        break;
> +    default:
> +        g_assert_not_reached();
> +    }
> +
> +    /*
> +     * Perform the syscall.  None of the vsyscalls should need restarting,
> +     * and all faults should have been caught above.
> +     */
> +    ret = do_syscall(env, syscall, env->regs[R_EDI], env->regs[R_ESI],
> +                     env->regs[R_EDX], env->regs[10], env->regs[8],
> +                     env->regs[9], 0, 0);

How come the register ABI to the syscall is different to the others. I
can see why syscall doesn't come from EAX but the others are a different
set to normal syscalls which might be why:

> +    g_assert(ret != -TARGET_ERESTARTSYS);
> +    g_assert(ret != -TARGET_QEMU_ESIGRETURN);
> +    g_assert(ret != -TARGET_EFAULT);

I'm seeing a EFAULT on the gettimeofday failure:

   #0  do_syscall (cpu_env=cpu_env@entry=0x5555577d2b10, num=num@entry=96, arg1=0, arg2=0, arg3=4211016, arg4=8, arg5=274888677184, arg6=274886295415, arg7=0, arg8=0) at /home/alex/lsrc/qemu.git/linux-user/syscall.c:12076                                               
   #1  0x0000555555609b6e in emulate_vsyscall (env=0x5555577d2b10) at /home/alex/lsrc/qemu.git/linux-user/x86_64/../i386/cpu_loop.c:180
   #2  cpu_loop (env=0x5555577d2b10) at /home/alex/lsrc/qemu.git/linux-user/x86_64/../i386/cpu_loop.c:246                              
   #3  0x000055555559640e in main (argc=<optimized out>, argv=<optimized
   #out>, envp=<optimized out>) at
   #/home/alex/lsrc/qemu.git/linux-user/main.c:865

arg1/arg2 don't seem right here.

> +    env->regs[R_EAX] = ret;
> +
> +    /* Emulate a ret instruction to leave the vsyscall page.  */
> +    env->eip = caller;
> +    env->regs[R_ESP] += 8;
> +}
> +#endif
> +
>  void cpu_loop(CPUX86State *env)
>  {
>      CPUState *cs = env_cpu(env);
> @@ -141,6 +240,11 @@ void cpu_loop(CPUX86State *env)
>                  env->regs[R_EAX] = ret;
>              }
>              break;
> +#endif
> +#ifdef TARGET_X86_64
> +        case EXCP_VSYSCALL:
> +            emulate_vsyscall(env);
> +            break;
>  #endif
>          case EXCP0B_NOSEG:
>          case EXCP0C_STACK:
> diff --git a/target/i386/translate.c b/target/i386/translate.c
> index 7c99ef1385..391b4ef149 100644
> --- a/target/i386/translate.c
> +++ b/target/i386/translate.c
> @@ -8555,7 +8555,21 @@ static bool i386_tr_breakpoint_check(DisasContextBase *dcbase, CPUState *cpu,
>  static void i386_tr_translate_insn(DisasContextBase *dcbase, CPUState *cpu)
>  {
>      DisasContext *dc = container_of(dcbase, DisasContext, base);
> -    target_ulong pc_next = disas_insn(dc, cpu);
> +    target_ulong pc_next;
> +
> +#if defined(TARGET_X86_64) && \
> +    defined(CONFIG_USER_ONLY) && \
> +    defined(CONFIG_LINUX)
> +    /*
> +     * Detect entry into the vsyscall page and invoke the syscall.
> +     */
> +    if ((dc->base.pc_next & TARGET_PAGE_MASK) == 0xffffffffff600000ull) {
> +        gen_exception(dc, EXCP_VSYSCALL, dc->base.pc_next);
> +        return;
> +    }
> +#endif
> +
> +    pc_next = disas_insn(dc, cpu);
>  
>      if (dc->tf || (dc->base.tb->flags & HF_INHIBIT_IRQ_MASK)) {
>          /* if single step mode, we generate only one instruction and
Richard Henderson Jan. 16, 2020, 6:19 p.m. UTC | #3
On 1/16/20 6:26 AM, Alex Bennée wrote:
>> +    /*
>> +     * Perform the syscall.  None of the vsyscalls should need restarting,
>> +     * and all faults should have been caught above.
>> +     */
>> +    ret = do_syscall(env, syscall, env->regs[R_EDI], env->regs[R_ESI],
>> +                     env->regs[R_EDX], env->regs[10], env->regs[8],
>> +                     env->regs[9], 0, 0);
> 
> How come the register ABI to the syscall is different to the others. I
> can see why syscall doesn't come from EAX but the others are a different
> set to normal syscalls which might be why:

Cut and paste error, I assume.

That said, the three syscalls have a maximum of 2 arguments,
so I could really just pass EDI and ESI and 0 for the rest...

> I'm seeing a EFAULT on the gettimeofday failure:

What getttimeofday failure?  Is this related to the mention of /sbin/ldconfig
in your previous message?

>    #0  do_syscall (cpu_env=cpu_env@entry=0x5555577d2b10, num=num@entry=96, arg1=0, arg2=0, arg3=4211016, arg4=8, arg5=274888677184, arg6=274886295415, arg7=0, arg8=0) at /home/alex/lsrc/qemu.git/linux-user/syscall.c:12076                                               
>    #1  0x0000555555609b6e in emulate_vsyscall (env=0x5555577d2b10) at /home/alex/lsrc/qemu.git/linux-user/x86_64/../i386/cpu_loop.c:180
>    #2  cpu_loop (env=0x5555577d2b10) at /home/alex/lsrc/qemu.git/linux-user/x86_64/../i386/cpu_loop.c:246                              
>    #3  0x000055555559640e in main (argc=<optimized out>, argv=<optimized
>    #out>, envp=<optimized out>) at
>    #/home/alex/lsrc/qemu.git/linux-user/main.c:865
> 
> arg1/arg2 don't seem right here.

Why?  NULL value for arg1 is legal, though semi-useless.

Ah, I see that our implementation of gettimeofday doesn't honor NULL.


r~
Richard Henderson Jan. 16, 2020, 6:22 p.m. UTC | #4
On 1/16/20 8:19 AM, Richard Henderson wrote:
> On 1/16/20 6:26 AM, Alex Bennée wrote:
>>> +    /*
>>> +     * Perform the syscall.  None of the vsyscalls should need restarting,
>>> +     * and all faults should have been caught above.
>>> +     */
>>> +    ret = do_syscall(env, syscall, env->regs[R_EDI], env->regs[R_ESI],
>>> +                     env->regs[R_EDX], env->regs[10], env->regs[8],
>>> +                     env->regs[9], 0, 0);
>>
>> How come the register ABI to the syscall is different to the others. I
>> can see why syscall doesn't come from EAX but the others are a different
>> set to normal syscalls which might be why:
> 
> Cut and paste error, I assume.

What register difference?


        case EXCP_SYSCALL:
            /* linux syscall from syscall instruction */
            ret = do_syscall(env,
                             env->regs[R_EAX],
                             env->regs[R_EDI],
                             env->regs[R_ESI],
                             env->regs[R_EDX],
                             env->regs[10],
                             env->regs[8],
                             env->regs[9],
                             0, 0);

Looks the same to me...


r~
Alex Bennée Jan. 16, 2020, 8:15 p.m. UTC | #5
Richard Henderson <richard.henderson@linaro.org> writes:

> On 1/16/20 6:26 AM, Alex Bennée wrote:
>>> +    /*
>>> +     * Perform the syscall.  None of the vsyscalls should need restarting,
>>> +     * and all faults should have been caught above.
>>> +     */
>>> +    ret = do_syscall(env, syscall, env->regs[R_EDI], env->regs[R_ESI],
>>> +                     env->regs[R_EDX], env->regs[10], env->regs[8],
>>> +                     env->regs[9], 0, 0);
>> 
>> How come the register ABI to the syscall is different to the others. I
>> can see why syscall doesn't come from EAX but the others are a different
>> set to normal syscalls which might be why:
>
> Cut and paste error, I assume.
>
> That said, the three syscalls have a maximum of 2 arguments,
> so I could really just pass EDI and ESI and 0 for the rest...
>
>> I'm seeing a EFAULT on the gettimeofday failure:
>
> What getttimeofday failure?  Is this related to the mention of /sbin/ldconfig
> in your previous message?

No - the buster x86064 ldconfig seg is unrelated to this series. It has
however spawned an additional bug in gdbstub while it was at it ;-)

>
>>    #0  do_syscall (cpu_env=cpu_env@entry=0x5555577d2b10, num=num@entry=96, arg1=0, arg2=0, arg3=4211016, arg4=8, arg5=274888677184, arg6=274886295415, arg7=0, arg8=0) at /home/alex/lsrc/qemu.git/linux-user/syscall.c:12076                                               
>>    #1  0x0000555555609b6e in emulate_vsyscall (env=0x5555577d2b10) at /home/alex/lsrc/qemu.git/linux-user/x86_64/../i386/cpu_loop.c:180
>>    #2  cpu_loop (env=0x5555577d2b10) at /home/alex/lsrc/qemu.git/linux-user/x86_64/../i386/cpu_loop.c:246                              
>>    #3  0x000055555559640e in main (argc=<optimized out>, argv=<optimized
>>    #out>, envp=<optimized out>) at
>>    #/home/alex/lsrc/qemu.git/linux-user/main.c:865
>> 
>> arg1/arg2 don't seem right here.
>
> Why?  NULL value for arg1 is legal, though semi-useless.
>
> Ah, I see that our implementation of gettimeofday doesn't honor NULL.
>
>
> r~

Patch
diff mbox series

diff --git a/target/i386/cpu.h b/target/i386/cpu.h
index 164d038d1f..3fb2d2a986 100644
--- a/target/i386/cpu.h
+++ b/target/i386/cpu.h
@@ -1000,6 +1000,7 @@  typedef uint64_t FeatureWordArray[FEATURE_WORDS];
 
 #define EXCP_VMEXIT     0x100 /* only for system emulation */
 #define EXCP_SYSCALL    0x101 /* only for user emulation */
+#define EXCP_VSYSCALL   0x102 /* only for user emulation */
 
 /* i386-specific interrupt pending bits.  */
 #define CPU_INTERRUPT_POLL      CPU_INTERRUPT_TGT_EXT_1
diff --git a/linux-user/i386/cpu_loop.c b/linux-user/i386/cpu_loop.c
index e217cca5ee..8b7c9f7337 100644
--- a/linux-user/i386/cpu_loop.c
+++ b/linux-user/i386/cpu_loop.c
@@ -92,6 +92,105 @@  static void gen_signal(CPUX86State *env, int sig, int code, abi_ptr addr)
     queue_signal(env, info.si_signo, QEMU_SI_FAULT, &info);
 }
 
+#ifdef TARGET_X86_64
+static bool write_ok_or_segv(CPUX86State *env, abi_ptr addr, size_t len)
+{
+    /*
+     * For all the vsyscalls, NULL means "don't write anything" not
+     * "write it at address 0".
+     */
+    if (addr == 0 || access_ok(VERIFY_WRITE, addr, len)) {
+        return true;
+    }
+
+    gen_signal(env, TARGET_SIGSEGV, TARGET_SEGV_MAPERR, addr);
+    return false;
+}
+
+/*
+ * Since v3.1, the kernel traps and emulates the vsyscall page.
+ * Entry points other than the official generate SIGSEGV.
+ */
+static void emulate_vsyscall(CPUX86State *env)
+{
+    int syscall;
+    abi_ulong ret;
+    uint64_t caller;
+
+    /*
+     * Validate the entry point.  We have already validated the page
+     * during translation, now verify the offset.
+     */
+    switch (env->eip & ~TARGET_PAGE_MASK) {
+    case 0x000:
+        syscall = TARGET_NR_gettimeofday;
+        break;
+    case 0x400:
+        syscall = TARGET_NR_time;
+        break;
+    case 0x800:
+        syscall = TARGET_NR_getcpu;
+        break;
+    default:
+    sigsegv:
+        /* Like force_sig(SIGSEGV).  */
+        gen_signal(env, TARGET_SIGSEGV, TARGET_SI_KERNEL, 0);
+        return;
+    }
+
+    /*
+     * Validate the return address.
+     * Note that the kernel treats this the same as an invalid entry point.
+     */
+    if (get_user_u64(caller, env->regs[R_ESP])) {
+        goto sigsegv;
+    }
+
+    /*
+     * Validate the the pointer arguments.
+     */
+    switch (syscall) {
+    case TARGET_NR_gettimeofday:
+        if (!write_ok_or_segv(env, env->regs[R_EDI],
+                              sizeof(struct target_timeval)) ||
+            !write_ok_or_segv(env, env->regs[R_ESI],
+                              sizeof(struct target_timezone))) {
+            return;
+        }
+        break;
+    case TARGET_NR_time:
+        if (!write_ok_or_segv(env, env->regs[R_EDI], sizeof(abi_long))) {
+            return;
+        }
+        break;
+    case TARGET_NR_getcpu:
+        if (!write_ok_or_segv(env, env->regs[R_EDI], sizeof(uint32_t)) ||
+            !write_ok_or_segv(env, env->regs[R_ESI], sizeof(uint32_t))) {
+            return;
+        }
+        break;
+    default:
+        g_assert_not_reached();
+    }
+
+    /*
+     * Perform the syscall.  None of the vsyscalls should need restarting,
+     * and all faults should have been caught above.
+     */
+    ret = do_syscall(env, syscall, env->regs[R_EDI], env->regs[R_ESI],
+                     env->regs[R_EDX], env->regs[10], env->regs[8],
+                     env->regs[9], 0, 0);
+    g_assert(ret != -TARGET_ERESTARTSYS);
+    g_assert(ret != -TARGET_QEMU_ESIGRETURN);
+    g_assert(ret != -TARGET_EFAULT);
+    env->regs[R_EAX] = ret;
+
+    /* Emulate a ret instruction to leave the vsyscall page.  */
+    env->eip = caller;
+    env->regs[R_ESP] += 8;
+}
+#endif
+
 void cpu_loop(CPUX86State *env)
 {
     CPUState *cs = env_cpu(env);
@@ -141,6 +240,11 @@  void cpu_loop(CPUX86State *env)
                 env->regs[R_EAX] = ret;
             }
             break;
+#endif
+#ifdef TARGET_X86_64
+        case EXCP_VSYSCALL:
+            emulate_vsyscall(env);
+            break;
 #endif
         case EXCP0B_NOSEG:
         case EXCP0C_STACK:
diff --git a/target/i386/translate.c b/target/i386/translate.c
index 7c99ef1385..391b4ef149 100644
--- a/target/i386/translate.c
+++ b/target/i386/translate.c
@@ -8555,7 +8555,21 @@  static bool i386_tr_breakpoint_check(DisasContextBase *dcbase, CPUState *cpu,
 static void i386_tr_translate_insn(DisasContextBase *dcbase, CPUState *cpu)
 {
     DisasContext *dc = container_of(dcbase, DisasContext, base);
-    target_ulong pc_next = disas_insn(dc, cpu);
+    target_ulong pc_next;
+
+#if defined(TARGET_X86_64) && \
+    defined(CONFIG_USER_ONLY) && \
+    defined(CONFIG_LINUX)
+    /*
+     * Detect entry into the vsyscall page and invoke the syscall.
+     */
+    if ((dc->base.pc_next & TARGET_PAGE_MASK) == 0xffffffffff600000ull) {
+        gen_exception(dc, EXCP_VSYSCALL, dc->base.pc_next);
+        return;
+    }
+#endif
+
+    pc_next = disas_insn(dc, cpu);
 
     if (dc->tf || (dc->base.tb->flags & HF_INHIBIT_IRQ_MASK)) {
         /* if single step mode, we generate only one instruction and