diff mbox series

[v2,3/5] linux-user/i386: Emulate x86_64 vsyscalls

Message ID 20200116194341.402-4-richard.henderson@linaro.org
State New
Headers show
Series linux-user: Implement x86_64 vsyscalls | expand

Commit Message

Richard Henderson Jan. 16, 2020, 7:43 p.m. UTC
Notice the magic page during translate, much like we already
do for the arm32 commpage.  At runtime, raise an exception to
return to cpu_loop for emulation.

Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 target/i386/cpu.h          |   1 +
 linux-user/i386/cpu_loop.c | 105 +++++++++++++++++++++++++++++++++++++
 target/i386/translate.c    |  16 +++++-
 3 files changed, 121 insertions(+), 1 deletion(-)

Comments

Alex Bennée Jan. 20, 2020, 11:48 a.m. UTC | #1
Richard Henderson <richard.henderson@linaro.org> writes:

> Notice the magic page during translate, much like we already
> do for the arm32 commpage.  At runtime, raise an exception to
> return to cpu_loop for emulation.
>
> Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> ---
>  target/i386/cpu.h          |   1 +
>  linux-user/i386/cpu_loop.c | 105 +++++++++++++++++++++++++++++++++++++
>  target/i386/translate.c    |  16 +++++-
>  3 files changed, 121 insertions(+), 1 deletion(-)
>
> diff --git a/target/i386/cpu.h b/target/i386/cpu.h
> index 164d038d1f..3fb2d2a986 100644
> --- a/target/i386/cpu.h
> +++ b/target/i386/cpu.h
> @@ -1000,6 +1000,7 @@ typedef uint64_t FeatureWordArray[FEATURE_WORDS];
>  
>  #define EXCP_VMEXIT     0x100 /* only for system emulation */
>  #define EXCP_SYSCALL    0x101 /* only for user emulation */
> +#define EXCP_VSYSCALL   0x102 /* only for user emulation */
>  
>  /* i386-specific interrupt pending bits.  */
>  #define CPU_INTERRUPT_POLL      CPU_INTERRUPT_TGT_EXT_1
> diff --git a/linux-user/i386/cpu_loop.c b/linux-user/i386/cpu_loop.c
> index e217cca5ee..f9bf6cec27 100644
> --- a/linux-user/i386/cpu_loop.c
> +++ b/linux-user/i386/cpu_loop.c
> @@ -92,6 +92,106 @@ static void gen_signal(CPUX86State *env, int sig, int code, abi_ptr addr)
>      queue_signal(env, info.si_signo, QEMU_SI_FAULT, &info);
>  }
>  
> +#ifdef TARGET_X86_64
> +static bool write_ok_or_segv(CPUX86State *env, abi_ptr addr, size_t len)
> +{
> +    /*
> +     * For all the vsyscalls, NULL means "don't write anything" not
> +     * "write it at address 0".
> +     */
> +    if (addr == 0 || access_ok(VERIFY_WRITE, addr, len)) {
> +        return true;
> +    }
> +
> +    env->error_code = PG_ERROR_W_MASK | PG_ERROR_U_MASK;
> +    gen_signal(env, TARGET_SIGSEGV, TARGET_SEGV_MAPERR, addr);
> +    return false;
> +}
> +
> +/*
> + * Since v3.1, the kernel traps and emulates the vsyscall page.
> + * Entry points other than the official generate SIGSEGV.
> + */
> +static void emulate_vsyscall(CPUX86State *env)
> +{
> +    int syscall;
> +    abi_ulong ret;
> +    uint64_t caller;
> +
> +    /*
> +     * Validate the entry point.  We have already validated the page
> +     * during translation, now verify the offset.
> +     */
> +    switch (env->eip & ~TARGET_PAGE_MASK) {
> +    case 0x000:
> +        syscall = TARGET_NR_gettimeofday;
> +        break;
> +    case 0x400:
> +        syscall = TARGET_NR_time;
> +        break;
> +    case 0x800:
> +        syscall = TARGET_NR_getcpu;
> +        break;
> +    default:
> +    sigsegv:

this label looks a little extraneous.

Otherwise:

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Richard Henderson Jan. 21, 2020, 3:38 a.m. UTC | #2
On 1/20/20 1:48 AM, Alex Bennée wrote:
>> +    default:
>> +    sigsegv:
> 
> this label looks a little extraneous.
> 
> Otherwise:
> 
> Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
> 

Look a little further down:

> +    default:
> +    sigsegv:
> +        /* Like force_sig(SIGSEGV).  */
> +        gen_signal(env, TARGET_SIGSEGV, TARGET_SI_KERNEL, 0);
> +        return;
> +    }
> +
> +    /*
> +     * Validate the return address.
> +     * Note that the kernel treats this the same as an invalid entry point.
> +     */
> +    if (get_user_u64(caller, env->regs[R_ESP])) {
> +        goto sigsegv;
> +    }


r~
Alex Bennée Jan. 21, 2020, 10:13 a.m. UTC | #3
Richard Henderson <richard.henderson@linaro.org> writes:

> On 1/20/20 1:48 AM, Alex Bennée wrote:
>>> +    default:
>>> +    sigsegv:
>> 
>> this label looks a little extraneous.
>> 
>> Otherwise:
>> 
>> Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
>> 
>
> Look a little further down:
>
>> +    default:
>> +    sigsegv:
>> +        /* Like force_sig(SIGSEGV).  */
>> +        gen_signal(env, TARGET_SIGSEGV, TARGET_SI_KERNEL, 0);
>> +        return;
>> +    }
>> +
>> +    /*
>> +     * Validate the return address.
>> +     * Note that the kernel treats this the same as an invalid entry point.
>> +     */
>> +    if (get_user_u64(caller, env->regs[R_ESP])) {
>> +        goto sigsegv;
>> +    }

Wouldn't this read better:

    /*
     * Validate the entry point.  We have already validated the page
     * during translation, now verify the offset.
     */
    switch (env->eip & ~TARGET_PAGE_MASK) {
    case 0x000:
        syscall = TARGET_NR_gettimeofday;
        break;
    case 0x400:
        syscall = TARGET_NR_time;
        break;
    case 0x800:
        syscall = TARGET_NR_getcpu;
        break;
    default:
        syscall = -1;
        break;
    }

    /*
     * If we have an invalid entry point or an invalid return address, we
     * generate a SIGSEGV.
     */
    if (syscall < 0 || get_user_u64(caller, env->regs[R_ESP])) {
        gen_signal(env, TARGET_SIGSEGV, TARGET_SI_KERNEL, 0);
        return;
    }
Richard Henderson Jan. 21, 2020, 3:51 p.m. UTC | #4
On 1/21/20 12:13 AM, Alex Bennée wrote:
> 
> Richard Henderson <richard.henderson@linaro.org> writes:
> 
>> On 1/20/20 1:48 AM, Alex Bennée wrote:
>>>> +    default:
>>>> +    sigsegv:
>>>
>>> this label looks a little extraneous.
>>>
>>> Otherwise:
>>>
>>> Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
>>>
>>
>> Look a little further down:
>>
>>> +    default:
>>> +    sigsegv:
>>> +        /* Like force_sig(SIGSEGV).  */
>>> +        gen_signal(env, TARGET_SIGSEGV, TARGET_SI_KERNEL, 0);
>>> +        return;
>>> +    }
>>> +
>>> +    /*
>>> +     * Validate the return address.
>>> +     * Note that the kernel treats this the same as an invalid entry point.
>>> +     */
>>> +    if (get_user_u64(caller, env->regs[R_ESP])) {
>>> +        goto sigsegv;
>>> +    }
> 
> Wouldn't this read better:
> 
>     /*
>      * Validate the entry point.  We have already validated the page
>      * during translation, now verify the offset.
>      */
>     switch (env->eip & ~TARGET_PAGE_MASK) {
>     case 0x000:
>         syscall = TARGET_NR_gettimeofday;
>         break;
>     case 0x400:
>         syscall = TARGET_NR_time;
>         break;
>     case 0x800:
>         syscall = TARGET_NR_getcpu;
>         break;
>     default:
>         syscall = -1;
>         break;
>     }
> 
>     /*
>      * If we have an invalid entry point or an invalid return address, we
>      * generate a SIGSEGV.
>      */
>     if (syscall < 0 || get_user_u64(caller, env->regs[R_ESP])) {
>         gen_signal(env, TARGET_SIGSEGV, TARGET_SI_KERNEL, 0);
>         return;
>     }
> 

Only if you have a violent goto allergy.


r~
Alex Bennée Jan. 21, 2020, 4:15 p.m. UTC | #5
Richard Henderson <richard.henderson@linaro.org> writes:

> On 1/21/20 12:13 AM, Alex Bennée wrote:
>> 
>> Richard Henderson <richard.henderson@linaro.org> writes:
>> 
>>> On 1/20/20 1:48 AM, Alex Bennée wrote:
>>>>> +    default:
>>>>> +    sigsegv:
>>>>
>>>> this label looks a little extraneous.
>>>>
>>>> Otherwise:
>>>>
>>>> Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
>>>>
>>>
>>> Look a little further down:
>>>
>>>> +    default:
>>>> +    sigsegv:
>>>> +        /* Like force_sig(SIGSEGV).  */
>>>> +        gen_signal(env, TARGET_SIGSEGV, TARGET_SI_KERNEL, 0);
>>>> +        return;
>>>> +    }
>>>> +
>>>> +    /*
>>>> +     * Validate the return address.
>>>> +     * Note that the kernel treats this the same as an invalid entry point.
>>>> +     */
>>>> +    if (get_user_u64(caller, env->regs[R_ESP])) {
>>>> +        goto sigsegv;
>>>> +    }
>> 
>> Wouldn't this read better:
>> 
>>     /*
>>      * Validate the entry point.  We have already validated the page
>>      * during translation, now verify the offset.
>>      */
>>     switch (env->eip & ~TARGET_PAGE_MASK) {
>>     case 0x000:
>>         syscall = TARGET_NR_gettimeofday;
>>         break;
>>     case 0x400:
>>         syscall = TARGET_NR_time;
>>         break;
>>     case 0x800:
>>         syscall = TARGET_NR_getcpu;
>>         break;
>>     default:
>>         syscall = -1;
>>         break;
>>     }
>> 
>>     /*
>>      * If we have an invalid entry point or an invalid return address, we
>>      * generate a SIGSEGV.
>>      */
>>     if (syscall < 0 || get_user_u64(caller, env->regs[R_ESP])) {
>>         gen_signal(env, TARGET_SIGSEGV, TARGET_SI_KERNEL, 0);
>>         return;
>>     }
>> 
>
> Only if you have a violent goto allergy.

gotos have their place, but jumping backwards is confusing to the eye. If the
compiler wants to rearrange the layout afterwards, it is free to do so.
Paolo Bonzini Jan. 21, 2020, 4:23 p.m. UTC | #6
On 21/01/20 17:15, Alex Bennée wrote:
> 
> Richard Henderson <richard.henderson@linaro.org> writes:
> 
>> On 1/21/20 12:13 AM, Alex Bennée wrote:
>>>
>>> Richard Henderson <richard.henderson@linaro.org> writes:
>>>
>>>> On 1/20/20 1:48 AM, Alex Bennée wrote:
>>>>>> +    default:
>>>>>> +    sigsegv:
>>>>>
>>>>> this label looks a little extraneous.
>>>>>
>>>>> Otherwise:
>>>>>
>>>>> Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
>>>>>
>>>>
>>>> Look a little further down:
>>>>
>>>>> +    default:
>>>>> +    sigsegv:
>>>>> +        /* Like force_sig(SIGSEGV).  */
>>>>> +        gen_signal(env, TARGET_SIGSEGV, TARGET_SI_KERNEL, 0);
>>>>> +        return;
>>>>> +    }
>>>>> +
>>>>> +    /*
>>>>> +     * Validate the return address.
>>>>> +     * Note that the kernel treats this the same as an invalid entry point.
>>>>> +     */
>>>>> +    if (get_user_u64(caller, env->regs[R_ESP])) {
>>>>> +        goto sigsegv;
>>>>> +    }
>>>
>>> Wouldn't this read better:
>>>
>>>     /*
>>>      * Validate the entry point.  We have already validated the page
>>>      * during translation, now verify the offset.
>>>      */
>>>     switch (env->eip & ~TARGET_PAGE_MASK) {
>>>     case 0x000:
>>>         syscall = TARGET_NR_gettimeofday;
>>>         break;
>>>     case 0x400:
>>>         syscall = TARGET_NR_time;
>>>         break;
>>>     case 0x800:
>>>         syscall = TARGET_NR_getcpu;
>>>         break;
>>>     default:
>>>         syscall = -1;
>>>         break;
>>>     }
>>>
>>>     /*
>>>      * If we have an invalid entry point or an invalid return address, we
>>>      * generate a SIGSEGV.
>>>      */
>>>     if (syscall < 0 || get_user_u64(caller, env->regs[R_ESP])) {
>>>         gen_signal(env, TARGET_SIGSEGV, TARGET_SI_KERNEL, 0);
>>>         return;
>>>     }
>>>
>>
>> Only if you have a violent goto allergy.
> 
> gotos have their place, but jumping backwards is confusing to the eye. If the
> compiler wants to rearrange the layout afterwards, it is free to do so.

I agree, if anything I'd place the sigsegv label at the end of the
function but Alex's version is just fine too.

Paolo
diff mbox series

Patch

diff --git a/target/i386/cpu.h b/target/i386/cpu.h
index 164d038d1f..3fb2d2a986 100644
--- a/target/i386/cpu.h
+++ b/target/i386/cpu.h
@@ -1000,6 +1000,7 @@  typedef uint64_t FeatureWordArray[FEATURE_WORDS];
 
 #define EXCP_VMEXIT     0x100 /* only for system emulation */
 #define EXCP_SYSCALL    0x101 /* only for user emulation */
+#define EXCP_VSYSCALL   0x102 /* only for user emulation */
 
 /* i386-specific interrupt pending bits.  */
 #define CPU_INTERRUPT_POLL      CPU_INTERRUPT_TGT_EXT_1
diff --git a/linux-user/i386/cpu_loop.c b/linux-user/i386/cpu_loop.c
index e217cca5ee..f9bf6cec27 100644
--- a/linux-user/i386/cpu_loop.c
+++ b/linux-user/i386/cpu_loop.c
@@ -92,6 +92,106 @@  static void gen_signal(CPUX86State *env, int sig, int code, abi_ptr addr)
     queue_signal(env, info.si_signo, QEMU_SI_FAULT, &info);
 }
 
+#ifdef TARGET_X86_64
+static bool write_ok_or_segv(CPUX86State *env, abi_ptr addr, size_t len)
+{
+    /*
+     * For all the vsyscalls, NULL means "don't write anything" not
+     * "write it at address 0".
+     */
+    if (addr == 0 || access_ok(VERIFY_WRITE, addr, len)) {
+        return true;
+    }
+
+    env->error_code = PG_ERROR_W_MASK | PG_ERROR_U_MASK;
+    gen_signal(env, TARGET_SIGSEGV, TARGET_SEGV_MAPERR, addr);
+    return false;
+}
+
+/*
+ * Since v3.1, the kernel traps and emulates the vsyscall page.
+ * Entry points other than the official generate SIGSEGV.
+ */
+static void emulate_vsyscall(CPUX86State *env)
+{
+    int syscall;
+    abi_ulong ret;
+    uint64_t caller;
+
+    /*
+     * Validate the entry point.  We have already validated the page
+     * during translation, now verify the offset.
+     */
+    switch (env->eip & ~TARGET_PAGE_MASK) {
+    case 0x000:
+        syscall = TARGET_NR_gettimeofday;
+        break;
+    case 0x400:
+        syscall = TARGET_NR_time;
+        break;
+    case 0x800:
+        syscall = TARGET_NR_getcpu;
+        break;
+    default:
+    sigsegv:
+        /* Like force_sig(SIGSEGV).  */
+        gen_signal(env, TARGET_SIGSEGV, TARGET_SI_KERNEL, 0);
+        return;
+    }
+
+    /*
+     * Validate the return address.
+     * Note that the kernel treats this the same as an invalid entry point.
+     */
+    if (get_user_u64(caller, env->regs[R_ESP])) {
+        goto sigsegv;
+    }
+
+    /*
+     * Validate the pointer arguments.
+     */
+    switch (syscall) {
+    case TARGET_NR_gettimeofday:
+        if (!write_ok_or_segv(env, env->regs[R_EDI],
+                              sizeof(struct target_timeval)) ||
+            !write_ok_or_segv(env, env->regs[R_ESI],
+                              sizeof(struct target_timezone))) {
+            return;
+        }
+        break;
+    case TARGET_NR_time:
+        if (!write_ok_or_segv(env, env->regs[R_EDI], sizeof(abi_long))) {
+            return;
+        }
+        break;
+    case TARGET_NR_getcpu:
+        if (!write_ok_or_segv(env, env->regs[R_EDI], sizeof(uint32_t)) ||
+            !write_ok_or_segv(env, env->regs[R_ESI], sizeof(uint32_t))) {
+            return;
+        }
+        break;
+    default:
+        g_assert_not_reached();
+    }
+
+    /*
+     * Perform the syscall.  None of the vsyscalls should need restarting,
+     * and all faults should have been caught above.
+     */
+    ret = do_syscall(env, syscall, env->regs[R_EDI], env->regs[R_ESI],
+                     env->regs[R_EDX], env->regs[10], env->regs[8],
+                     env->regs[9], 0, 0);
+    g_assert(ret != -TARGET_ERESTARTSYS);
+    g_assert(ret != -TARGET_QEMU_ESIGRETURN);
+    g_assert(ret != -TARGET_EFAULT);
+    env->regs[R_EAX] = ret;
+
+    /* Emulate a ret instruction to leave the vsyscall page.  */
+    env->eip = caller;
+    env->regs[R_ESP] += 8;
+}
+#endif
+
 void cpu_loop(CPUX86State *env)
 {
     CPUState *cs = env_cpu(env);
@@ -141,6 +241,11 @@  void cpu_loop(CPUX86State *env)
                 env->regs[R_EAX] = ret;
             }
             break;
+#endif
+#ifdef TARGET_X86_64
+        case EXCP_VSYSCALL:
+            emulate_vsyscall(env);
+            break;
 #endif
         case EXCP0B_NOSEG:
         case EXCP0C_STACK:
diff --git a/target/i386/translate.c b/target/i386/translate.c
index 7c99ef1385..391b4ef149 100644
--- a/target/i386/translate.c
+++ b/target/i386/translate.c
@@ -8555,7 +8555,21 @@  static bool i386_tr_breakpoint_check(DisasContextBase *dcbase, CPUState *cpu,
 static void i386_tr_translate_insn(DisasContextBase *dcbase, CPUState *cpu)
 {
     DisasContext *dc = container_of(dcbase, DisasContext, base);
-    target_ulong pc_next = disas_insn(dc, cpu);
+    target_ulong pc_next;
+
+#if defined(TARGET_X86_64) && \
+    defined(CONFIG_USER_ONLY) && \
+    defined(CONFIG_LINUX)
+    /*
+     * Detect entry into the vsyscall page and invoke the syscall.
+     */
+    if ((dc->base.pc_next & TARGET_PAGE_MASK) == 0xffffffffff600000ull) {
+        gen_exception(dc, EXCP_VSYSCALL, dc->base.pc_next);
+        return;
+    }
+#endif
+
+    pc_next = disas_insn(dc, cpu);
 
     if (dc->tf || (dc->base.tb->flags & HF_INHIBIT_IRQ_MASK)) {
         /* if single step mode, we generate only one instruction and