Message ID | 20200114210921.11216-4-richard.henderson@linaro.org |
---|---|
State | New |
Headers | show |
Series | linux-user: Implement x86_64 vsyscalls | expand |
Richard Henderson <richard.henderson@linaro.org> writes: > Notice the magic page during translate, much like we already > do for the arm32 commpage. At runtime, raise an exception to > return to cpu_loop for emulation. > > Signed-off-by: Richard Henderson <richard.henderson@linaro.org> So I've been trying to test this but have run into a number of (unrelated) hurdles on the way on my system. I think I've eliminated these patches as the cause, but it might be a regression. Does running /sbin/ldconfig work ok for you?
Richard Henderson <richard.henderson@linaro.org> writes: > Notice the magic page during translate, much like we already > do for the arm32 commpage. At runtime, raise an exception to > return cpu_loop for emulation. > > Signed-off-by: Richard Henderson <richard.henderson@linaro.org> > --- > target/i386/cpu.h | 1 + > linux-user/i386/cpu_loop.c | 104 +++++++++++++++++++++++++++++++++++++ > target/i386/translate.c | 16 +++++- > 3 files changed, 120 insertions(+), 1 deletion(-) > > diff --git a/target/i386/cpu.h b/target/i386/cpu.h > index 164d038d1f..3fb2d2a986 100644 > --- a/target/i386/cpu.h > +++ b/target/i386/cpu.h > @@ -1000,6 +1000,7 @@ typedef uint64_t FeatureWordArray[FEATURE_WORDS]; > > #define EXCP_VMEXIT 0x100 /* only for system emulation */ > #define EXCP_SYSCALL 0x101 /* only for user emulation */ > +#define EXCP_VSYSCALL 0x102 /* only for user emulation */ > > /* i386-specific interrupt pending bits. */ > #define CPU_INTERRUPT_POLL CPU_INTERRUPT_TGT_EXT_1 > diff --git a/linux-user/i386/cpu_loop.c b/linux-user/i386/cpu_loop.c > index e217cca5ee..8b7c9f7337 100644 > --- a/linux-user/i386/cpu_loop.c > +++ b/linux-user/i386/cpu_loop.c > @@ -92,6 +92,105 @@ static void gen_signal(CPUX86State *env, int sig, int code, abi_ptr addr) > queue_signal(env, info.si_signo, QEMU_SI_FAULT, &info); > } > > +#ifdef TARGET_X86_64 > +static bool write_ok_or_segv(CPUX86State *env, abi_ptr addr, size_t len) > +{ > + /* > + * For all the vsyscalls, NULL means "don't write anything" not > + * "write it at address 0". > + */ > + if (addr == 0 || access_ok(VERIFY_WRITE, addr, len)) { > + return true; > + } > + > + gen_signal(env, TARGET_SIGSEGV, TARGET_SEGV_MAPERR, addr); > + return false; > +} > + > +/* > + * Since v3.1, the kernel traps and emulates the vsyscall page. > + * Entry points other than the official generate SIGSEGV. 
> + */ > +static void emulate_vsyscall(CPUX86State *env) > +{ > + int syscall; > + abi_ulong ret; > + uint64_t caller; > + > + /* > + * Validate the entry point. We have already validated the page > + * during translation, now verify the offset. > + */ > + switch (env->eip & ~TARGET_PAGE_MASK) { > + case 0x000: > + syscall = TARGET_NR_gettimeofday; > + break; > + case 0x400: > + syscall = TARGET_NR_time; > + break; > + case 0x800: > + syscall = TARGET_NR_getcpu; > + break; > + default: > + sigsegv: > + /* Like force_sig(SIGSEGV). */ > + gen_signal(env, TARGET_SIGSEGV, TARGET_SI_KERNEL, 0); > + return; > + } > + > + /* > + * Validate the return address. > + * Note that the kernel treats this the same as an invalid entry point. > + */ > + if (get_user_u64(caller, env->regs[R_ESP])) { > + goto sigsegv; > + } > + > + /* > + * Validate the the pointer arguments. > + */ > + switch (syscall) { > + case TARGET_NR_gettimeofday: > + if (!write_ok_or_segv(env, env->regs[R_EDI], > + sizeof(struct target_timeval)) || > + !write_ok_or_segv(env, env->regs[R_ESI], > + sizeof(struct target_timezone))) { > + return; > + } > + break; > + case TARGET_NR_time: > + if (!write_ok_or_segv(env, env->regs[R_EDI], sizeof(abi_long))) { > + return; > + } > + break; > + case TARGET_NR_getcpu: > + if (!write_ok_or_segv(env, env->regs[R_EDI], sizeof(uint32_t)) || > + !write_ok_or_segv(env, env->regs[R_ESI], sizeof(uint32_t))) { > + return; > + } > + break; > + default: > + g_assert_not_reached(); > + } > + > + /* > + * Perform the syscall. None of the vsyscalls should need restarting, > + * and all faults should have been caught above. > + */ > + ret = do_syscall(env, syscall, env->regs[R_EDI], env->regs[R_ESI], > + env->regs[R_EDX], env->regs[10], env->regs[8], > + env->regs[9], 0, 0); How come the register ABI to the syscall is different to the others. 
I can see why syscall doesn't come from EAX but the others are a different set to normal syscalls which might be why: > + g_assert(ret != -TARGET_ERESTARTSYS); > + g_assert(ret != -TARGET_QEMU_ESIGRETURN); > + g_assert(ret != -TARGET_EFAULT); I'm seeing a EFAULT on the gettimeofday failure: #0 do_syscall (cpu_env=cpu_env@entry=0x5555577d2b10, num=num@entry=96, arg1=0, arg2=0, arg3=4211016, arg4=8, arg5=274888677184, arg6=274886295415, arg7=0, arg8=0) at /home/alex/lsrc/qemu.git/linux-user/syscall.c:12076 #1 0x0000555555609b6e in emulate_vsyscall (env=0x5555577d2b10) at /home/alex/lsrc/qemu.git/linux-user/x86_64/../i386/cpu_loop.c:180 #2 cpu_loop (env=0x5555577d2b10) at /home/alex/lsrc/qemu.git/linux-user/x86_64/../i386/cpu_loop.c:246 #3 0x000055555559640e in main (argc=<optimized out>, argv=<optimized #out>, envp=<optimized out>) at #/home/alex/lsrc/qemu.git/linux-user/main.c:865 arg1/arg2 don't seem right here. > + env->regs[R_EAX] = ret; > + > + /* Emulate a ret instruction to leave the vsyscall page. 
*/ > + env->eip = caller; > + env->regs[R_ESP] += 8; > +} > +#endif > + > void cpu_loop(CPUX86State *env) > { > CPUState *cs = env_cpu(env); > @@ -141,6 +240,11 @@ void cpu_loop(CPUX86State *env) > env->regs[R_EAX] = ret; > } > break; > +#endif > +#ifdef TARGET_X86_64 > + case EXCP_VSYSCALL: > + emulate_vsyscall(env); > + break; > #endif > case EXCP0B_NOSEG: > case EXCP0C_STACK: > diff --git a/target/i386/translate.c b/target/i386/translate.c > index 7c99ef1385..391b4ef149 100644 > --- a/target/i386/translate.c > +++ b/target/i386/translate.c > @@ -8555,7 +8555,21 @@ static bool i386_tr_breakpoint_check(DisasContextBase *dcbase, CPUState *cpu, > static void i386_tr_translate_insn(DisasContextBase *dcbase, CPUState *cpu) > { > DisasContext *dc = container_of(dcbase, DisasContext, base); > - target_ulong pc_next = disas_insn(dc, cpu); > + target_ulong pc_next; > + > +#if defined(TARGET_X86_64) && \ > + defined(CONFIG_USER_ONLY) && \ > + defined(CONFIG_LINUX) > + /* > + * Detect entry into the vsyscall page and invoke the syscall. > + */ > + if ((dc->base.pc_next & TARGET_PAGE_MASK) == 0xffffffffff600000ull) { > + gen_exception(dc, EXCP_VSYSCALL, dc->base.pc_next); > + return; > + } > +#endif > + > + pc_next = disas_insn(dc, cpu); > > if (dc->tf || (dc->base.tb->flags & HF_INHIBIT_IRQ_MASK)) { > /* if single step mode, we generate only one instruction and
On 1/16/20 6:26 AM, Alex Bennée wrote: >> + /* >> + * Perform the syscall. None of the vsyscalls should need restarting, >> + * and all faults should have been caught above. >> + */ >> + ret = do_syscall(env, syscall, env->regs[R_EDI], env->regs[R_ESI], >> + env->regs[R_EDX], env->regs[10], env->regs[8], >> + env->regs[9], 0, 0); > > How come the register ABI to the syscall is different to the others. I > can see why syscall doesn't come from EAX but the others are a different > set to normal syscalls which might be why: Cut and paste error, I assume. That said, the three syscalls have a maximum of 2 arguments, so I could really just pass EDI and ESI and 0 for the rest... > I'm seeing a EFAULT on the gettimeofday failure: What gettimeofday failure? Is this related to the mention of /sbin/ldconfig in your previous message? > #0 do_syscall (cpu_env=cpu_env@entry=0x5555577d2b10, num=num@entry=96, arg1=0, arg2=0, arg3=4211016, arg4=8, arg5=274888677184, arg6=274886295415, arg7=0, arg8=0) at /home/alex/lsrc/qemu.git/linux-user/syscall.c:12076 > #1 0x0000555555609b6e in emulate_vsyscall (env=0x5555577d2b10) at /home/alex/lsrc/qemu.git/linux-user/x86_64/../i386/cpu_loop.c:180 > #2 cpu_loop (env=0x5555577d2b10) at /home/alex/lsrc/qemu.git/linux-user/x86_64/../i386/cpu_loop.c:246 > #3 0x000055555559640e in main (argc=<optimized out>, argv=<optimized > #out>, envp=<optimized out>) at > #/home/alex/lsrc/qemu.git/linux-user/main.c:865 > > arg1/arg2 don't seem right here. Why? NULL value for arg1 is legal, though semi-useless. Ah, I see that our implementation of gettimeofday doesn't honor NULL. r~
On 1/16/20 8:19 AM, Richard Henderson wrote: > On 1/16/20 6:26 AM, Alex Bennée wrote: >>> + /* >>> + * Perform the syscall. None of the vsyscalls should need restarting, >>> + * and all faults should have been caught above. >>> + */ >>> + ret = do_syscall(env, syscall, env->regs[R_EDI], env->regs[R_ESI], >>> + env->regs[R_EDX], env->regs[10], env->regs[8], >>> + env->regs[9], 0, 0); >> >> How come the register ABI to the syscall is different to the others. I >> can see why syscall doesn't come from EAX but the others are a different >> set to normal syscalls which might be why: > > Cut and paste error, I assume. What register difference? case EXCP_SYSCALL: /* linux syscall from syscall instruction */ ret = do_syscall(env, env->regs[R_EAX], env->regs[R_EDI], env->regs[R_ESI], env->regs[R_EDX], env->regs[10], env->regs[8], env->regs[9], 0, 0); Looks the same to me... r~
Richard Henderson <richard.henderson@linaro.org> writes: > On 1/16/20 6:26 AM, Alex Bennée wrote: >>> + /* >>> + * Perform the syscall. None of the vsyscalls should need restarting, >>> + * and all faults should have been caught above. >>> + */ >>> + ret = do_syscall(env, syscall, env->regs[R_EDI], env->regs[R_ESI], >>> + env->regs[R_EDX], env->regs[10], env->regs[8], >>> + env->regs[9], 0, 0); >> >> How come the register ABI to the syscall is different to the others. I >> can see why syscall doesn't come from EAX but the others are a different >> set to normal syscalls which might be why: > > Cut and paste error, I assume. > > That said, the three syscalls have a maximum of 2 arguments, > so I could really just pass EDI and ESI and 0 for the rest... > >> I'm seeing a EFAULT on the gettimeofday failure: > > What gettimeofday failure? Is this related to the mention of /sbin/ldconfig > in your previous message? No - the buster x86_64 ldconfig segfault is unrelated to this series. It has however spawned an additional bug in gdbstub while it was at it ;-) > >> #0 do_syscall (cpu_env=cpu_env@entry=0x5555577d2b10, num=num@entry=96, arg1=0, arg2=0, arg3=4211016, arg4=8, arg5=274888677184, arg6=274886295415, arg7=0, arg8=0) at /home/alex/lsrc/qemu.git/linux-user/syscall.c:12076 >> #1 0x0000555555609b6e in emulate_vsyscall (env=0x5555577d2b10) at /home/alex/lsrc/qemu.git/linux-user/x86_64/../i386/cpu_loop.c:180 >> #2 cpu_loop (env=0x5555577d2b10) at /home/alex/lsrc/qemu.git/linux-user/x86_64/../i386/cpu_loop.c:246 >> #3 0x000055555559640e in main (argc=<optimized out>, argv=<optimized >> #out>, envp=<optimized out>) at >> #/home/alex/lsrc/qemu.git/linux-user/main.c:865 >> >> arg1/arg2 don't seem right here. > > Why? NULL value for arg1 is legal, though semi-useless. > > Ah, I see that our implementation of gettimeofday doesn't honor NULL. > > > r~
diff --git a/target/i386/cpu.h b/target/i386/cpu.h index 164d038d1f..3fb2d2a986 100644 --- a/target/i386/cpu.h +++ b/target/i386/cpu.h @@ -1000,6 +1000,7 @@ typedef uint64_t FeatureWordArray[FEATURE_WORDS]; #define EXCP_VMEXIT 0x100 /* only for system emulation */ #define EXCP_SYSCALL 0x101 /* only for user emulation */ +#define EXCP_VSYSCALL 0x102 /* only for user emulation */ /* i386-specific interrupt pending bits. */ #define CPU_INTERRUPT_POLL CPU_INTERRUPT_TGT_EXT_1 diff --git a/linux-user/i386/cpu_loop.c b/linux-user/i386/cpu_loop.c index e217cca5ee..8b7c9f7337 100644 --- a/linux-user/i386/cpu_loop.c +++ b/linux-user/i386/cpu_loop.c @@ -92,6 +92,105 @@ static void gen_signal(CPUX86State *env, int sig, int code, abi_ptr addr) queue_signal(env, info.si_signo, QEMU_SI_FAULT, &info); } +#ifdef TARGET_X86_64 +static bool write_ok_or_segv(CPUX86State *env, abi_ptr addr, size_t len) +{ + /* + * For all the vsyscalls, NULL means "don't write anything" not + * "write it at address 0". + */ + if (addr == 0 || access_ok(VERIFY_WRITE, addr, len)) { + return true; + } + + gen_signal(env, TARGET_SIGSEGV, TARGET_SEGV_MAPERR, addr); + return false; +} + +/* + * Since v3.1, the kernel traps and emulates the vsyscall page. + * Entry points other than the official generate SIGSEGV. + */ +static void emulate_vsyscall(CPUX86State *env) +{ + int syscall; + abi_ulong ret; + uint64_t caller; + + /* + * Validate the entry point. We have already validated the page + * during translation, now verify the offset. + */ + switch (env->eip & ~TARGET_PAGE_MASK) { + case 0x000: + syscall = TARGET_NR_gettimeofday; + break; + case 0x400: + syscall = TARGET_NR_time; + break; + case 0x800: + syscall = TARGET_NR_getcpu; + break; + default: + sigsegv: + /* Like force_sig(SIGSEGV). */ + gen_signal(env, TARGET_SIGSEGV, TARGET_SI_KERNEL, 0); + return; + } + + /* + * Validate the return address. + * Note that the kernel treats this the same as an invalid entry point. 
+ */ + if (get_user_u64(caller, env->regs[R_ESP])) { + goto sigsegv; + } + + /* + * Validate the the pointer arguments. + */ + switch (syscall) { + case TARGET_NR_gettimeofday: + if (!write_ok_or_segv(env, env->regs[R_EDI], + sizeof(struct target_timeval)) || + !write_ok_or_segv(env, env->regs[R_ESI], + sizeof(struct target_timezone))) { + return; + } + break; + case TARGET_NR_time: + if (!write_ok_or_segv(env, env->regs[R_EDI], sizeof(abi_long))) { + return; + } + break; + case TARGET_NR_getcpu: + if (!write_ok_or_segv(env, env->regs[R_EDI], sizeof(uint32_t)) || + !write_ok_or_segv(env, env->regs[R_ESI], sizeof(uint32_t))) { + return; + } + break; + default: + g_assert_not_reached(); + } + + /* + * Perform the syscall. None of the vsyscalls should need restarting, + * and all faults should have been caught above. + */ + ret = do_syscall(env, syscall, env->regs[R_EDI], env->regs[R_ESI], + env->regs[R_EDX], env->regs[10], env->regs[8], + env->regs[9], 0, 0); + g_assert(ret != -TARGET_ERESTARTSYS); + g_assert(ret != -TARGET_QEMU_ESIGRETURN); + g_assert(ret != -TARGET_EFAULT); + env->regs[R_EAX] = ret; + + /* Emulate a ret instruction to leave the vsyscall page. 
*/ + env->eip = caller; + env->regs[R_ESP] += 8; +} +#endif + void cpu_loop(CPUX86State *env) { CPUState *cs = env_cpu(env); @@ -141,6 +240,11 @@ void cpu_loop(CPUX86State *env) env->regs[R_EAX] = ret; } break; +#endif +#ifdef TARGET_X86_64 + case EXCP_VSYSCALL: + emulate_vsyscall(env); + break; #endif case EXCP0B_NOSEG: case EXCP0C_STACK: diff --git a/target/i386/translate.c b/target/i386/translate.c index 7c99ef1385..391b4ef149 100644 --- a/target/i386/translate.c +++ b/target/i386/translate.c @@ -8555,7 +8555,21 @@ static bool i386_tr_breakpoint_check(DisasContextBase *dcbase, CPUState *cpu, static void i386_tr_translate_insn(DisasContextBase *dcbase, CPUState *cpu) { DisasContext *dc = container_of(dcbase, DisasContext, base); - target_ulong pc_next = disas_insn(dc, cpu); + target_ulong pc_next; + +#if defined(TARGET_X86_64) && \ + defined(CONFIG_USER_ONLY) && \ + defined(CONFIG_LINUX) + /* + * Detect entry into the vsyscall page and invoke the syscall. + */ + if ((dc->base.pc_next & TARGET_PAGE_MASK) == 0xffffffffff600000ull) { + gen_exception(dc, EXCP_VSYSCALL, dc->base.pc_next); + return; + } +#endif + + pc_next = disas_insn(dc, cpu); if (dc->tf || (dc->base.tb->flags & HF_INHIBIT_IRQ_MASK)) { /* if single step mode, we generate only one instruction and
Notice the magic page during translate, much like we already do for the arm32 commpage. At runtime, raise an exception to return to cpu_loop for emulation. Signed-off-by: Richard Henderson <richard.henderson@linaro.org> --- target/i386/cpu.h | 1 + linux-user/i386/cpu_loop.c | 104 +++++++++++++++++++++++++++++++++++++ target/i386/translate.c | 16 +++++- 3 files changed, 120 insertions(+), 1 deletion(-)