Message ID | 4FE4F56D.1020201@web.de |
---|---|
State | New |
Headers | show |
Should have declared this [RFC] in the subject and CC'ed kvm... On 2012-06-23 00:45, Jan Kiszka wrote: > This sketches a possible path to get rid of the iothread lock on vmexits > in KVM mode. On x86, the the in-kernel irqchips has to be used because > we otherwise need to synchronize APIC and other per-cpu state accesses > that could be changed concurrently. Not yet fully analyzed is the NMI > injection path in the absence of an APIC. > > s390x should be fine without specific locking as their pre/post-run > callbacks are empty. Power requires locking for the pre-run callback. > > This patch is untested, but a similar version was successfully used in > a x86 setup with a network I/O path that needed no central iothread > locking anymore (required special MMIO exit handling). > --- > kvm-all.c | 18 ++++++++++++++++-- > target-i386/kvm.c | 7 +++++++ > target-ppc/kvm.c | 4 ++++ > 3 files changed, 27 insertions(+), 2 deletions(-) > > diff --git a/kvm-all.c b/kvm-all.c > index f8e4328..9c3e26f 100644 > --- a/kvm-all.c > +++ b/kvm-all.c > @@ -1460,6 +1460,8 @@ int kvm_cpu_exec(CPUArchState *env) > return EXCP_HLT; > } > > + qemu_mutex_unlock_iothread(); > + > do { > if (env->kvm_vcpu_dirty) { > kvm_arch_put_registers(env, KVM_PUT_RUNTIME_STATE); > @@ -1476,14 +1478,16 @@ int kvm_cpu_exec(CPUArchState *env) > */ > qemu_cpu_kick_self(); > } > - qemu_mutex_unlock_iothread(); > > run_ret = kvm_vcpu_ioctl(env, KVM_RUN, 0); > > - qemu_mutex_lock_iothread(); > kvm_arch_post_run(env, run); > > + /* TODO: push coalesced mmio flushing to the point where we access > + * devices that are using it (currently VGA and E1000). 
*/ > + qemu_mutex_lock_iothread(); > kvm_flush_coalesced_mmio_buffer(); > + qemu_mutex_unlock_iothread(); > > if (run_ret < 0) { > if (run_ret == -EINTR || run_ret == -EAGAIN) { > @@ -1499,19 +1503,23 @@ int kvm_cpu_exec(CPUArchState *env) > switch (run->exit_reason) { > case KVM_EXIT_IO: > DPRINTF("handle_io\n"); > + qemu_mutex_lock_iothread(); > kvm_handle_io(run->io.port, > (uint8_t *)run + run->io.data_offset, > run->io.direction, > run->io.size, > run->io.count); > + qemu_mutex_unlock_iothread(); > ret = 0; > break; > case KVM_EXIT_MMIO: > DPRINTF("handle_mmio\n"); > + qemu_mutex_lock_iothread(); > cpu_physical_memory_rw(run->mmio.phys_addr, > run->mmio.data, > run->mmio.len, > run->mmio.is_write); > + qemu_mutex_unlock_iothread(); > ret = 0; > break; > case KVM_EXIT_IRQ_WINDOW_OPEN: > @@ -1520,7 +1528,9 @@ int kvm_cpu_exec(CPUArchState *env) > break; > case KVM_EXIT_SHUTDOWN: > DPRINTF("shutdown\n"); > + qemu_mutex_lock_iothread(); > qemu_system_reset_request(); > + qemu_mutex_unlock_iothread(); > ret = EXCP_INTERRUPT; > break; > case KVM_EXIT_UNKNOWN: > @@ -1533,11 +1543,15 @@ int kvm_cpu_exec(CPUArchState *env) > break; > default: > DPRINTF("kvm_arch_handle_exit\n"); > + qemu_mutex_lock_iothread(); > ret = kvm_arch_handle_exit(env, run); > + qemu_mutex_unlock_iothread(); > break; > } > } while (ret == 0); > > + qemu_mutex_lock_iothread(); > + > if (ret < 0) { > cpu_dump_state(env, stderr, fprintf, CPU_DUMP_CODE); > vm_stop(RUN_STATE_INTERNAL_ERROR); > diff --git a/target-i386/kvm.c b/target-i386/kvm.c > index 0d0d8f6..0ad64d1 100644 > --- a/target-i386/kvm.c > +++ b/target-i386/kvm.c > @@ -1631,7 +1631,10 @@ void kvm_arch_pre_run(CPUX86State *env, struct kvm_run *run) > > /* Inject NMI */ > if (env->interrupt_request & CPU_INTERRUPT_NMI) { > + qemu_mutex_lock_iothread(); > env->interrupt_request &= ~CPU_INTERRUPT_NMI; > + qemu_mutex_unlock_iothread(); > + > DPRINTF("injected NMI\n"); > ret = kvm_vcpu_ioctl(env, KVM_NMI); > if (ret < 0) { > @@ -1641,6 
+1644,8 @@ void kvm_arch_pre_run(CPUX86State *env, struct kvm_run *run) > } > > if (!kvm_irqchip_in_kernel()) { > + qemu_mutex_lock_iothread(); > + > /* Force the VCPU out of its inner loop to process any INIT requests > * or pending TPR access reports. */ > if (env->interrupt_request & > @@ -1682,6 +1687,8 @@ void kvm_arch_pre_run(CPUX86State *env, struct kvm_run *run) > > DPRINTF("setting tpr\n"); > run->cr8 = cpu_get_apic_tpr(env->apic_state); > + > + qemu_mutex_unlock_iothread(); > } > } > > diff --git a/target-ppc/kvm.c b/target-ppc/kvm.c > index c09cc39..60d91a5 100644 > --- a/target-ppc/kvm.c > +++ b/target-ppc/kvm.c > @@ -471,6 +471,8 @@ void kvm_arch_pre_run(CPUPPCState *env, struct kvm_run *run) > int r; > unsigned irq; > > + qemu_mutex_lock_iothread(); > + > /* PowerPC QEMU tracks the various core input pins (interrupt, critical > * interrupt, reset, etc) in PPC-specific env->irq_input_state. */ > if (!cap_interrupt_level && > @@ -497,6 +499,8 @@ void kvm_arch_pre_run(CPUPPCState *env, struct kvm_run *run) > /* We don't know if there are more interrupts pending after this. However, > * the guest will return to userspace in the course of handling this one > * anyways, so we will get a chance to deliver the rest. */ > + > + qemu_mutex_unlock_iothread(); > } > > void kvm_arch_post_run(CPUPPCState *env, struct kvm_run *run) >
On 06/22/2012 05:45 PM, Jan Kiszka wrote: > This sketches a possible path to get rid of the iothread lock on vmexits > in KVM mode. On x86, the the in-kernel irqchips has to be used because > we otherwise need to synchronize APIC and other per-cpu state accesses > that could be changed concurrently. Not yet fully analyzed is the NMI > injection path in the absence of an APIC. > > s390x should be fine without specific locking as their pre/post-run > callbacks are empty. Power requires locking for the pre-run callback. > > This patch is untested, but a similar version was successfully used in > a x86 setup with a network I/O path that needed no central iothread > locking anymore (required special MMIO exit handling). > --- > kvm-all.c | 18 ++++++++++++++++-- > target-i386/kvm.c | 7 +++++++ > target-ppc/kvm.c | 4 ++++ > 3 files changed, 27 insertions(+), 2 deletions(-) > > diff --git a/kvm-all.c b/kvm-all.c > index f8e4328..9c3e26f 100644 > --- a/kvm-all.c > +++ b/kvm-all.c > @@ -1460,6 +1460,8 @@ int kvm_cpu_exec(CPUArchState *env) > return EXCP_HLT; > } > > + qemu_mutex_unlock_iothread(); > + > do { > if (env->kvm_vcpu_dirty) { > kvm_arch_put_registers(env, KVM_PUT_RUNTIME_STATE); > @@ -1476,14 +1478,16 @@ int kvm_cpu_exec(CPUArchState *env) > */ > qemu_cpu_kick_self(); > } > - qemu_mutex_unlock_iothread(); > > run_ret = kvm_vcpu_ioctl(env, KVM_RUN, 0); > > - qemu_mutex_lock_iothread(); > kvm_arch_post_run(env, run); > > + /* TODO: push coalesced mmio flushing to the point where we access > + * devices that are using it (currently VGA and E1000). 
*/ > + qemu_mutex_lock_iothread(); > kvm_flush_coalesced_mmio_buffer(); > + qemu_mutex_unlock_iothread(); > > if (run_ret< 0) { > if (run_ret == -EINTR || run_ret == -EAGAIN) { > @@ -1499,19 +1503,23 @@ int kvm_cpu_exec(CPUArchState *env) > switch (run->exit_reason) { > case KVM_EXIT_IO: > DPRINTF("handle_io\n"); > + qemu_mutex_lock_iothread(); > kvm_handle_io(run->io.port, > (uint8_t *)run + run->io.data_offset, > run->io.direction, > run->io.size, > run->io.count); > + qemu_mutex_unlock_iothread(); > ret = 0; > break; > case KVM_EXIT_MMIO: > DPRINTF("handle_mmio\n"); > + qemu_mutex_lock_iothread(); > cpu_physical_memory_rw(run->mmio.phys_addr, > run->mmio.data, > run->mmio.len, > run->mmio.is_write); > + qemu_mutex_unlock_iothread(); > ret = 0; > break; > case KVM_EXIT_IRQ_WINDOW_OPEN: > @@ -1520,7 +1528,9 @@ int kvm_cpu_exec(CPUArchState *env) > break; > case KVM_EXIT_SHUTDOWN: > DPRINTF("shutdown\n"); > + qemu_mutex_lock_iothread(); > qemu_system_reset_request(); > + qemu_mutex_unlock_iothread(); > ret = EXCP_INTERRUPT; > break; > case KVM_EXIT_UNKNOWN: > @@ -1533,11 +1543,15 @@ int kvm_cpu_exec(CPUArchState *env) > break; > default: > DPRINTF("kvm_arch_handle_exit\n"); > + qemu_mutex_lock_iothread(); > ret = kvm_arch_handle_exit(env, run); > + qemu_mutex_unlock_iothread(); > break; > } > } while (ret == 0); > > + qemu_mutex_lock_iothread(); > + > if (ret< 0) { > cpu_dump_state(env, stderr, fprintf, CPU_DUMP_CODE); > vm_stop(RUN_STATE_INTERNAL_ERROR); > diff --git a/target-i386/kvm.c b/target-i386/kvm.c > index 0d0d8f6..0ad64d1 100644 > --- a/target-i386/kvm.c > +++ b/target-i386/kvm.c > @@ -1631,7 +1631,10 @@ void kvm_arch_pre_run(CPUX86State *env, struct kvm_run *run) > > /* Inject NMI */ > if (env->interrupt_request& CPU_INTERRUPT_NMI) { Strictly speaking, wouldn't we need to use testbit() and setbit()? I would expect in the very least a barrier would be needed. Looks pretty nice overall. 
I'll need to apply and spend some time carefully walking through the code. Thanks for sharing! Regards, Anthony Liguori > + qemu_mutex_lock_iothread(); > env->interrupt_request&= ~CPU_INTERRUPT_NMI; > + qemu_mutex_unlock_iothread(); > + > DPRINTF("injected NMI\n"); > ret = kvm_vcpu_ioctl(env, KVM_NMI); > if (ret< 0) { > @@ -1641,6 +1644,8 @@ void kvm_arch_pre_run(CPUX86State *env, struct kvm_run *run) > } > > if (!kvm_irqchip_in_kernel()) { > + qemu_mutex_lock_iothread(); > + > /* Force the VCPU out of its inner loop to process any INIT requests > * or pending TPR access reports. */ > if (env->interrupt_request& > @@ -1682,6 +1687,8 @@ void kvm_arch_pre_run(CPUX86State *env, struct kvm_run *run) > > DPRINTF("setting tpr\n"); > run->cr8 = cpu_get_apic_tpr(env->apic_state); > + > + qemu_mutex_unlock_iothread(); > } > } > > diff --git a/target-ppc/kvm.c b/target-ppc/kvm.c > index c09cc39..60d91a5 100644 > --- a/target-ppc/kvm.c > +++ b/target-ppc/kvm.c > @@ -471,6 +471,8 @@ void kvm_arch_pre_run(CPUPPCState *env, struct kvm_run *run) > int r; > unsigned irq; > > + qemu_mutex_lock_iothread(); > + > /* PowerPC QEMU tracks the various core input pins (interrupt, critical > * interrupt, reset, etc) in PPC-specific env->irq_input_state. */ > if (!cap_interrupt_level&& > @@ -497,6 +499,8 @@ void kvm_arch_pre_run(CPUPPCState *env, struct kvm_run *run) > /* We don't know if there are more interrupts pending after this. However, > * the guest will return to userspace in the course of handling this one > * anyways, so we will get a chance to deliver the rest. */ > + > + qemu_mutex_unlock_iothread(); > } > > void kvm_arch_post_run(CPUPPCState *env, struct kvm_run *run)
On Sat, Jun 23, 2012 at 12:55:49AM +0200, Jan Kiszka wrote: > Should have declared this [RFC] in the subject and CC'ed kvm... > > On 2012-06-23 00:45, Jan Kiszka wrote: > > This sketches a possible path to get rid of the iothread lock on vmexits > > in KVM mode. On x86, the the in-kernel irqchips has to be used because > > we otherwise need to synchronize APIC and other per-cpu state accesses > > that could be changed concurrently. Not yet fully analyzed is the NMI > > injection path in the absence of an APIC. > > > > s390x should be fine without specific locking as their pre/post-run > > callbacks are empty. Power requires locking for the pre-run callback. > > > > This patch is untested, but a similar version was successfully used in > > a x86 setup with a network I/O path that needed no central iothread > > locking anymore (required special MMIO exit handling). > > --- > > kvm-all.c | 18 ++++++++++++++++-- > > target-i386/kvm.c | 7 +++++++ > > target-ppc/kvm.c | 4 ++++ > > 3 files changed, 27 insertions(+), 2 deletions(-) > > > > diff --git a/kvm-all.c b/kvm-all.c > > index f8e4328..9c3e26f 100644 > > --- a/kvm-all.c > > +++ b/kvm-all.c > > @@ -1460,6 +1460,8 @@ int kvm_cpu_exec(CPUArchState *env) > > return EXCP_HLT; > > } > > > > + qemu_mutex_unlock_iothread(); > > + > > do { > > if (env->kvm_vcpu_dirty) { > > kvm_arch_put_registers(env, KVM_PUT_RUNTIME_STATE); > > @@ -1476,14 +1478,16 @@ int kvm_cpu_exec(CPUArchState *env) > > */ > > qemu_cpu_kick_self(); > > } > > - qemu_mutex_unlock_iothread(); > > > > run_ret = kvm_vcpu_ioctl(env, KVM_RUN, 0); > > > > - qemu_mutex_lock_iothread(); > > kvm_arch_post_run(env, run); target-i386/kvm.c void kvm_arch_post_run(CPUX86State *env, struct kvm_run *run) { if (run->if_flag) { env->eflags |= IF_MASK; } else { env->eflags &= ~IF_MASK; } cpu_set_apic_tpr(env->apic_state, run->cr8); cpu_set_apic_base(env->apic_state, run->apic_base); } Clearly there is no structure to any of the writes around the writes in x86's 
kvm_arch_post_run, so it is unsafe. In kvm_flush_coalesced_mmio_buffer, however, the first and last pointers can be read safely without the global lock (so you could move the lock after reading run->exit_reason, in that case). > > + /* TODO: push coalesced mmio flushing to the point where we access > > + * devices that are using it (currently VGA and E1000). */ > > + qemu_mutex_lock_iothread(); > > kvm_flush_coalesced_mmio_buffer(); > > + qemu_mutex_unlock_iothread(); But you have to flush first to then figure out which device the coalesced mmio belongs to (don't get that comment). It would be good to move the global lock acquision to as late as possible, but acquiring/reacquiring is not very useful (can result in a slowdown, actually).
On Fri, Jun 22, 2012 at 09:22:59PM -0300, Marcelo Tosatti wrote: > On Sat, Jun 23, 2012 at 12:55:49AM +0200, Jan Kiszka wrote: > > Should have declared this [RFC] in the subject and CC'ed kvm... > > > > On 2012-06-23 00:45, Jan Kiszka wrote: > > > This sketches a possible path to get rid of the iothread lock on vmexits > > > in KVM mode. On x86, the the in-kernel irqchips has to be used because > > > we otherwise need to synchronize APIC and other per-cpu state accesses > > > that could be changed concurrently. Not yet fully analyzed is the NMI > > > injection path in the absence of an APIC. > > > > > > s390x should be fine without specific locking as their pre/post-run > > > callbacks are empty. Power requires locking for the pre-run callback. > > > > > > This patch is untested, but a similar version was successfully used in > > > a x86 setup with a network I/O path that needed no central iothread > > > locking anymore (required special MMIO exit handling). > > > --- > > > kvm-all.c | 18 ++++++++++++++++-- > > > target-i386/kvm.c | 7 +++++++ > > > target-ppc/kvm.c | 4 ++++ > > > 3 files changed, 27 insertions(+), 2 deletions(-) > > > > > > diff --git a/kvm-all.c b/kvm-all.c > > > index f8e4328..9c3e26f 100644 > > > --- a/kvm-all.c > > > +++ b/kvm-all.c > > > @@ -1460,6 +1460,8 @@ int kvm_cpu_exec(CPUArchState *env) > > > return EXCP_HLT; > > > } > > > > > > + qemu_mutex_unlock_iothread(); > > > + > > > do { > > > if (env->kvm_vcpu_dirty) { > > > kvm_arch_put_registers(env, KVM_PUT_RUNTIME_STATE); > > > @@ -1476,14 +1478,16 @@ int kvm_cpu_exec(CPUArchState *env) > > > */ > > > qemu_cpu_kick_self(); > > > } > > > - qemu_mutex_unlock_iothread(); > > > > > > run_ret = kvm_vcpu_ioctl(env, KVM_RUN, 0); > > > > > > - qemu_mutex_lock_iothread(); > > > kvm_arch_post_run(env, run); > > target-i386/kvm.c > > void kvm_arch_post_run(CPUX86State *env, struct kvm_run *run) > { > if (run->if_flag) { > env->eflags |= IF_MASK; > } else { > env->eflags &= ~IF_MASK; > } > 
cpu_set_apic_tpr(env->apic_state, run->cr8); > cpu_set_apic_base(env->apic_state, run->apic_base); > } > > Clearly there is no structure to any of the writes around the writes > in x86's kvm_arch_post_run, so it is unsafe. No access protocol to the CPUState and apic devices (who can write when, who can read when).
On 2012-06-23 00:59, Anthony Liguori wrote: > On 06/22/2012 05:45 PM, Jan Kiszka wrote: >> This sketches a possible path to get rid of the iothread lock on vmexits >> in KVM mode. On x86, the the in-kernel irqchips has to be used because >> we otherwise need to synchronize APIC and other per-cpu state accesses >> that could be changed concurrently. Not yet fully analyzed is the NMI >> injection path in the absence of an APIC. >> >> s390x should be fine without specific locking as their pre/post-run >> callbacks are empty. Power requires locking for the pre-run callback. >> >> This patch is untested, but a similar version was successfully used in >> a x86 setup with a network I/O path that needed no central iothread >> locking anymore (required special MMIO exit handling). >> --- >> kvm-all.c | 18 ++++++++++++++++-- >> target-i386/kvm.c | 7 +++++++ >> target-ppc/kvm.c | 4 ++++ >> 3 files changed, 27 insertions(+), 2 deletions(-) >> >> diff --git a/kvm-all.c b/kvm-all.c >> index f8e4328..9c3e26f 100644 >> --- a/kvm-all.c >> +++ b/kvm-all.c >> @@ -1460,6 +1460,8 @@ int kvm_cpu_exec(CPUArchState *env) >> return EXCP_HLT; >> } >> >> + qemu_mutex_unlock_iothread(); >> + >> do { >> if (env->kvm_vcpu_dirty) { >> kvm_arch_put_registers(env, KVM_PUT_RUNTIME_STATE); >> @@ -1476,14 +1478,16 @@ int kvm_cpu_exec(CPUArchState *env) >> */ >> qemu_cpu_kick_self(); >> } >> - qemu_mutex_unlock_iothread(); >> >> run_ret = kvm_vcpu_ioctl(env, KVM_RUN, 0); >> >> - qemu_mutex_lock_iothread(); >> kvm_arch_post_run(env, run); >> >> + /* TODO: push coalesced mmio flushing to the point where we >> access >> + * devices that are using it (currently VGA and E1000). 
*/ >> + qemu_mutex_lock_iothread(); >> kvm_flush_coalesced_mmio_buffer(); >> + qemu_mutex_unlock_iothread(); >> >> if (run_ret< 0) { >> if (run_ret == -EINTR || run_ret == -EAGAIN) { >> @@ -1499,19 +1503,23 @@ int kvm_cpu_exec(CPUArchState *env) >> switch (run->exit_reason) { >> case KVM_EXIT_IO: >> DPRINTF("handle_io\n"); >> + qemu_mutex_lock_iothread(); >> kvm_handle_io(run->io.port, >> (uint8_t *)run + run->io.data_offset, >> run->io.direction, >> run->io.size, >> run->io.count); >> + qemu_mutex_unlock_iothread(); >> ret = 0; >> break; >> case KVM_EXIT_MMIO: >> DPRINTF("handle_mmio\n"); >> + qemu_mutex_lock_iothread(); >> cpu_physical_memory_rw(run->mmio.phys_addr, >> run->mmio.data, >> run->mmio.len, >> run->mmio.is_write); >> + qemu_mutex_unlock_iothread(); >> ret = 0; >> break; >> case KVM_EXIT_IRQ_WINDOW_OPEN: >> @@ -1520,7 +1528,9 @@ int kvm_cpu_exec(CPUArchState *env) >> break; >> case KVM_EXIT_SHUTDOWN: >> DPRINTF("shutdown\n"); >> + qemu_mutex_lock_iothread(); >> qemu_system_reset_request(); >> + qemu_mutex_unlock_iothread(); >> ret = EXCP_INTERRUPT; >> break; >> case KVM_EXIT_UNKNOWN: >> @@ -1533,11 +1543,15 @@ int kvm_cpu_exec(CPUArchState *env) >> break; >> default: >> DPRINTF("kvm_arch_handle_exit\n"); >> + qemu_mutex_lock_iothread(); >> ret = kvm_arch_handle_exit(env, run); >> + qemu_mutex_unlock_iothread(); >> break; >> } >> } while (ret == 0); >> >> + qemu_mutex_lock_iothread(); >> + >> if (ret< 0) { >> cpu_dump_state(env, stderr, fprintf, CPU_DUMP_CODE); >> vm_stop(RUN_STATE_INTERNAL_ERROR); >> diff --git a/target-i386/kvm.c b/target-i386/kvm.c >> index 0d0d8f6..0ad64d1 100644 >> --- a/target-i386/kvm.c >> +++ b/target-i386/kvm.c >> @@ -1631,7 +1631,10 @@ void kvm_arch_pre_run(CPUX86State *env, struct >> kvm_run *run) >> >> /* Inject NMI */ >> if (env->interrupt_request& CPU_INTERRUPT_NMI) { > > Strictly speaking, wouldn't we need to use testbit() and setbit()? I > would expect in the very least a barrier would be needed. 
I need to think about this as well. We ignored it so far, just saw it when hacking up this patch. > > Looks pretty nice overall. I'll need to apply and spend some time > carefully walking through the code. Without getting the coalesced mmio flushing out of the way, it does not buy us that much yet. But I have some idea... Jan
On 2012-06-23 02:22, Marcelo Tosatti wrote: > On Sat, Jun 23, 2012 at 12:55:49AM +0200, Jan Kiszka wrote: >> Should have declared this [RFC] in the subject and CC'ed kvm... >> >> On 2012-06-23 00:45, Jan Kiszka wrote: >>> This sketches a possible path to get rid of the iothread lock on vmexits >>> in KVM mode. On x86, the the in-kernel irqchips has to be used because >>> we otherwise need to synchronize APIC and other per-cpu state accesses >>> that could be changed concurrently. Not yet fully analyzed is the NMI >>> injection path in the absence of an APIC. >>> >>> s390x should be fine without specific locking as their pre/post-run >>> callbacks are empty. Power requires locking for the pre-run callback. >>> >>> This patch is untested, but a similar version was successfully used in >>> a x86 setup with a network I/O path that needed no central iothread >>> locking anymore (required special MMIO exit handling). >>> --- >>> kvm-all.c | 18 ++++++++++++++++-- >>> target-i386/kvm.c | 7 +++++++ >>> target-ppc/kvm.c | 4 ++++ >>> 3 files changed, 27 insertions(+), 2 deletions(-) >>> >>> diff --git a/kvm-all.c b/kvm-all.c >>> index f8e4328..9c3e26f 100644 >>> --- a/kvm-all.c >>> +++ b/kvm-all.c >>> @@ -1460,6 +1460,8 @@ int kvm_cpu_exec(CPUArchState *env) >>> return EXCP_HLT; >>> } >>> >>> + qemu_mutex_unlock_iothread(); >>> + >>> do { >>> if (env->kvm_vcpu_dirty) { >>> kvm_arch_put_registers(env, KVM_PUT_RUNTIME_STATE); >>> @@ -1476,14 +1478,16 @@ int kvm_cpu_exec(CPUArchState *env) >>> */ >>> qemu_cpu_kick_self(); >>> } >>> - qemu_mutex_unlock_iothread(); >>> >>> run_ret = kvm_vcpu_ioctl(env, KVM_RUN, 0); >>> >>> - qemu_mutex_lock_iothread(); >>> kvm_arch_post_run(env, run); > > target-i386/kvm.c > > void kvm_arch_post_run(CPUX86State *env, struct kvm_run *run) > { > if (run->if_flag) { > env->eflags |= IF_MASK; > } else { > env->eflags &= ~IF_MASK; > } > cpu_set_apic_tpr(env->apic_state, run->cr8); > cpu_set_apic_base(env->apic_state, run->apic_base); > } > > Clearly 
there is no structure to any of the writes around the writes > in x86's kvm_arch_post_run, so it is unsafe. Can't parse this yet. None of the fields touched above should be modified outside of the vcpu thread context (as long as that thread is inside the inner loop). Therefore, it should be safe to run that functions without any lock. Am I missing something? > > In kvm_flush_coalesced_mmio_buffer, however, the first and last pointers > can be read safely without the global lock (so you could move the lock > after reading run->exit_reason, in that case). > >>> + /* TODO: push coalesced mmio flushing to the point where we access >>> + * devices that are using it (currently VGA and E1000). */ >>> + qemu_mutex_lock_iothread(); >>> kvm_flush_coalesced_mmio_buffer(); >>> + qemu_mutex_unlock_iothread(); > > But you have to flush first to then figure out which device the > coalesced mmio belongs to (don't get that comment). kvm_flush must not be called unconditionally on vmexit, that is my point. I'm playing with the idea to tag memory regions that require flushing (as they are coalescing themselves or logically depend on coalesced regions). Then we would flush in the memory layer once a read or write is about to be performed on such a region. BTW, two more users arrived in the meantime: the G364 framebuffer and the i82378 PCI-ISA bridge (not sure yet what requests that bridge coalesces, if it's only VGA, but it looks a bit fishy). Jan
On 2012-06-23 11:06, Marcelo Tosatti wrote: > On Fri, Jun 22, 2012 at 09:22:59PM -0300, Marcelo Tosatti wrote: >> On Sat, Jun 23, 2012 at 12:55:49AM +0200, Jan Kiszka wrote: >>> Should have declared this [RFC] in the subject and CC'ed kvm... >>> >>> On 2012-06-23 00:45, Jan Kiszka wrote: >>>> This sketches a possible path to get rid of the iothread lock on vmexits >>>> in KVM mode. On x86, the the in-kernel irqchips has to be used because >>>> we otherwise need to synchronize APIC and other per-cpu state accesses >>>> that could be changed concurrently. Not yet fully analyzed is the NMI >>>> injection path in the absence of an APIC. >>>> >>>> s390x should be fine without specific locking as their pre/post-run >>>> callbacks are empty. Power requires locking for the pre-run callback. >>>> >>>> This patch is untested, but a similar version was successfully used in >>>> a x86 setup with a network I/O path that needed no central iothread >>>> locking anymore (required special MMIO exit handling). 
>>>> --- >>>> kvm-all.c | 18 ++++++++++++++++-- >>>> target-i386/kvm.c | 7 +++++++ >>>> target-ppc/kvm.c | 4 ++++ >>>> 3 files changed, 27 insertions(+), 2 deletions(-) >>>> >>>> diff --git a/kvm-all.c b/kvm-all.c >>>> index f8e4328..9c3e26f 100644 >>>> --- a/kvm-all.c >>>> +++ b/kvm-all.c >>>> @@ -1460,6 +1460,8 @@ int kvm_cpu_exec(CPUArchState *env) >>>> return EXCP_HLT; >>>> } >>>> >>>> + qemu_mutex_unlock_iothread(); >>>> + >>>> do { >>>> if (env->kvm_vcpu_dirty) { >>>> kvm_arch_put_registers(env, KVM_PUT_RUNTIME_STATE); >>>> @@ -1476,14 +1478,16 @@ int kvm_cpu_exec(CPUArchState *env) >>>> */ >>>> qemu_cpu_kick_self(); >>>> } >>>> - qemu_mutex_unlock_iothread(); >>>> >>>> run_ret = kvm_vcpu_ioctl(env, KVM_RUN, 0); >>>> >>>> - qemu_mutex_lock_iothread(); >>>> kvm_arch_post_run(env, run); >> >> target-i386/kvm.c >> >> void kvm_arch_post_run(CPUX86State *env, struct kvm_run *run) >> { >> if (run->if_flag) { >> env->eflags |= IF_MASK; >> } else { >> env->eflags &= ~IF_MASK; >> } >> cpu_set_apic_tpr(env->apic_state, run->cr8); >> cpu_set_apic_base(env->apic_state, run->apic_base); >> } >> >> Clearly there is no structure to any of the writes around the writes >> in x86's kvm_arch_post_run, so it is unsafe. > > No access protocol to the CPUState and apic devices (who can write when, > who can read when). > Hmm, we may need the iothread lock around cpu_set_apic_tpr for !kvm_irqchip_in_kernel(). And as we are at it, apic_base manipulation can be but there as well. With in-kernel irqchip, there is no such need. Also, no one accesses eflags outside of the vcpu thread, independent of the irqchip mode. Jan
On 06/23/2012 02:45 PM, Jan Kiszka wrote: > > Hmm, we may need the iothread lock around cpu_set_apic_tpr for > !kvm_irqchip_in_kernel(). And as we are at it, apic_base manipulation > can be but there as well. > > With in-kernel irqchip, there is no such need. Also, no one accesses > eflags outside of the vcpu thread, independent of the irqchip mode. In fact !kvm_irqchip_in_kernel() is broken wrt the tpr. Interrupt injection needs to be done atomically, but currently we check the tpr from the injecting thread, which means the cpu thread can race with it. We need to move the check to the vcpu thread so that the guest vcpu is halted.
On Sat, Jun 23, 2012 at 7:45 PM, Jan Kiszka <jan.kiszka@web.de> wrote: > On 2012-06-23 11:06, Marcelo Tosatti wrote: >> On Fri, Jun 22, 2012 at 09:22:59PM -0300, Marcelo Tosatti wrote: >>> On Sat, Jun 23, 2012 at 12:55:49AM +0200, Jan Kiszka wrote: >>>> Should have declared this [RFC] in the subject and CC'ed kvm... >>>> >>>> On 2012-06-23 00:45, Jan Kiszka wrote: >>>>> This sketches a possible path to get rid of the iothread lock on vmexits >>>>> in KVM mode. On x86, the the in-kernel irqchips has to be used because >>>>> we otherwise need to synchronize APIC and other per-cpu state accesses >>>>> that could be changed concurrently. Not yet fully analyzed is the NMI >>>>> injection path in the absence of an APIC. >>>>> >>>>> s390x should be fine without specific locking as their pre/post-run >>>>> callbacks are empty. Power requires locking for the pre-run callback. >>>>> >>>>> This patch is untested, but a similar version was successfully used in >>>>> a x86 setup with a network I/O path that needed no central iothread >>>>> locking anymore (required special MMIO exit handling). 
>>>>> --- >>>>> kvm-all.c | 18 ++++++++++++++++-- >>>>> target-i386/kvm.c | 7 +++++++ >>>>> target-ppc/kvm.c | 4 ++++ >>>>> 3 files changed, 27 insertions(+), 2 deletions(-) >>>>> >>>>> diff --git a/kvm-all.c b/kvm-all.c >>>>> index f8e4328..9c3e26f 100644 >>>>> --- a/kvm-all.c >>>>> +++ b/kvm-all.c >>>>> @@ -1460,6 +1460,8 @@ int kvm_cpu_exec(CPUArchState *env) >>>>> return EXCP_HLT; >>>>> } >>>>> >>>>> + qemu_mutex_unlock_iothread(); >>>>> + >>>>> do { >>>>> if (env->kvm_vcpu_dirty) { >>>>> kvm_arch_put_registers(env, KVM_PUT_RUNTIME_STATE); >>>>> @@ -1476,14 +1478,16 @@ int kvm_cpu_exec(CPUArchState *env) >>>>> */ >>>>> qemu_cpu_kick_self(); >>>>> } >>>>> - qemu_mutex_unlock_iothread(); >>>>> >>>>> run_ret = kvm_vcpu_ioctl(env, KVM_RUN, 0); >>>>> >>>>> - qemu_mutex_lock_iothread(); >>>>> kvm_arch_post_run(env, run); >>> >>> target-i386/kvm.c >>> >>> void kvm_arch_post_run(CPUX86State *env, struct kvm_run *run) >>> { >>> if (run->if_flag) { >>> env->eflags |= IF_MASK; >>> } else { >>> env->eflags &= ~IF_MASK; >>> } >>> cpu_set_apic_tpr(env->apic_state, run->cr8); >>> cpu_set_apic_base(env->apic_state, run->apic_base); >>> } >>> >>> Clearly there is no structure to any of the writes around the writes >>> in x86's kvm_arch_post_run, so it is unsafe. >> >> No access protocol to the CPUState and apic devices (who can write when, >> who can read when). >> > > Hmm, we may need the iothread lock around cpu_set_apic_tpr for > !kvm_irqchip_in_kernel(). And as we are at it, apic_base manipulation > can be but there as well. > As to !kvm_irqchip_in_kernel(),I think there is quite a few fields in CPUState (interrupt_request,eflags,halted,exit_request,exception_injected), which we must care about. They are heavily depended on env->apic_state, so we consider them as a atomic context, that is say during the updating of env's these context, we do not allow apic_state changed. 
And that is why I introduce cpu_lock in the patch "[PATCH 2/2] kvm: use per-cpu lock to free vcpu thread out of the big lock" Regards, pingfan > With in-kernel irqchip, there is no such need. Also, no one accesses > eflags outside of the vcpu thread, independent of the irqchip mode. > > Jan >
On 2012-06-24 10:49, Avi Kivity wrote: > On 06/23/2012 02:45 PM, Jan Kiszka wrote: >> >> Hmm, we may need the iothread lock around cpu_set_apic_tpr for >> !kvm_irqchip_in_kernel(). And as we are at it, apic_base manipulation >> can be but there as well. >> >> With in-kernel irqchip, there is no such need. Also, no one accesses >> eflags outside of the vcpu thread, independent of the irqchip mode. > > In fact !kvm_irqchip_in_kernel() is broken wrt the tpr. Interrupt > injection needs to be done atomically, but currently we check the tpr > from the injecting thread, which means the cpu thread can race with it. > We need to move the check to the vcpu thread so that the guest vcpu is > halted. So apic_set_irq basically needs to be deferred to vcpu context, right? Will have a look. Jan
On 2012-06-24 15:34, liu ping fan wrote: > On Sat, Jun 23, 2012 at 7:45 PM, Jan Kiszka <jan.kiszka@web.de> wrote: >> On 2012-06-23 11:06, Marcelo Tosatti wrote: >>> On Fri, Jun 22, 2012 at 09:22:59PM -0300, Marcelo Tosatti wrote: >>>> On Sat, Jun 23, 2012 at 12:55:49AM +0200, Jan Kiszka wrote: >>>>> Should have declared this [RFC] in the subject and CC'ed kvm... >>>>> >>>>> On 2012-06-23 00:45, Jan Kiszka wrote: >>>>>> This sketches a possible path to get rid of the iothread lock on vmexits >>>>>> in KVM mode. On x86, the the in-kernel irqchips has to be used because >>>>>> we otherwise need to synchronize APIC and other per-cpu state accesses >>>>>> that could be changed concurrently. Not yet fully analyzed is the NMI >>>>>> injection path in the absence of an APIC. >>>>>> >>>>>> s390x should be fine without specific locking as their pre/post-run >>>>>> callbacks are empty. Power requires locking for the pre-run callback. >>>>>> >>>>>> This patch is untested, but a similar version was successfully used in >>>>>> a x86 setup with a network I/O path that needed no central iothread >>>>>> locking anymore (required special MMIO exit handling). 
>>>>>> --- >>>>>> kvm-all.c | 18 ++++++++++++++++-- >>>>>> target-i386/kvm.c | 7 +++++++ >>>>>> target-ppc/kvm.c | 4 ++++ >>>>>> 3 files changed, 27 insertions(+), 2 deletions(-) >>>>>> >>>>>> diff --git a/kvm-all.c b/kvm-all.c >>>>>> index f8e4328..9c3e26f 100644 >>>>>> --- a/kvm-all.c >>>>>> +++ b/kvm-all.c >>>>>> @@ -1460,6 +1460,8 @@ int kvm_cpu_exec(CPUArchState *env) >>>>>> return EXCP_HLT; >>>>>> } >>>>>> >>>>>> + qemu_mutex_unlock_iothread(); >>>>>> + >>>>>> do { >>>>>> if (env->kvm_vcpu_dirty) { >>>>>> kvm_arch_put_registers(env, KVM_PUT_RUNTIME_STATE); >>>>>> @@ -1476,14 +1478,16 @@ int kvm_cpu_exec(CPUArchState *env) >>>>>> */ >>>>>> qemu_cpu_kick_self(); >>>>>> } >>>>>> - qemu_mutex_unlock_iothread(); >>>>>> >>>>>> run_ret = kvm_vcpu_ioctl(env, KVM_RUN, 0); >>>>>> >>>>>> - qemu_mutex_lock_iothread(); >>>>>> kvm_arch_post_run(env, run); >>>> >>>> target-i386/kvm.c >>>> >>>> void kvm_arch_post_run(CPUX86State *env, struct kvm_run *run) >>>> { >>>> if (run->if_flag) { >>>> env->eflags |= IF_MASK; >>>> } else { >>>> env->eflags &= ~IF_MASK; >>>> } >>>> cpu_set_apic_tpr(env->apic_state, run->cr8); >>>> cpu_set_apic_base(env->apic_state, run->apic_base); >>>> } >>>> >>>> Clearly there is no structure to any of the writes around the writes >>>> in x86's kvm_arch_post_run, so it is unsafe. >>> >>> No access protocol to the CPUState and apic devices (who can write when, >>> who can read when). >>> >> >> Hmm, we may need the iothread lock around cpu_set_apic_tpr for >> !kvm_irqchip_in_kernel(). And as we are at it, apic_base manipulation >> can be but there as well. >> > As to !kvm_irqchip_in_kernel(),I think there is quite a few fields in > CPUState (interrupt_request,eflags,halted,exit_request,exception_injected), > which we must care about. They are heavily depended on > env->apic_state, so we consider them as a atomic context, that is say > during the updating of env's these context, we do not allow apic_state > changed. 
> And that is why I introduce cpu_lock in the patch "[PATCH 2/2] kvm: > use per-cpu lock to free vcpu thread out of the big lock" This per-cpu lock is optimizing the slow path (!kvm_irqchip_in_kernel) at the expense of the fast path. It's better to optimize for in-kernel irqchip first. One day, when the key problems are solved, we may look at userspace irqchip code path as well. Until then, BQL protection for those paths is perfectly fine. As a first step, I will post a series later that gets rid of kvm_flush_coalesced_mmio_buffer in the common vmexit path. Jan
On 06/24/2012 05:08 PM, Jan Kiszka wrote: > On 2012-06-24 10:49, Avi Kivity wrote: >> On 06/23/2012 02:45 PM, Jan Kiszka wrote: >>> >>> Hmm, we may need the iothread lock around cpu_set_apic_tpr for >>> !kvm_irqchip_in_kernel(). And as we are at it, apic_base manipulation >>> can be but there as well. >>> >>> With in-kernel irqchip, there is no such need. Also, no one accesses >>> eflags outside of the vcpu thread, independent of the irqchip mode. >> >> In fact !kvm_irqchip_in_kernel() is broken wrt the tpr. Interrupt >> injection needs to be done atomically, but currently we check the tpr >> from the injecting thread, which means the cpu thread can race with it. >> We need to move the check to the vcpu thread so that the guest vcpu is >> halted. > > So apic_set_irq basically needs to be deferred to vcpu context, right? > Will have a look. Correct. IIRC, the kernel's 0a5fff192388d2 made the problem much worse, but did not create it. It was either Vista or XP-64 which triggered the problem reliably. Copying Gleb in case he remembers more.
On 06/24/2012 05:08 PM, Jan Kiszka wrote: > As a first step, I will post a series later that gets rid of > kvm_flush_coalesced_mmio_buffer in the common vmexit path. If you defer this, I can think of two places that need to flush: - anything that accesses those memory areas (such as DMA to the framebuffer, or updating the display) - anything that modifies the memory map and possibly changes flushed addresses
On 2012-06-24 16:35, Avi Kivity wrote: > On 06/24/2012 05:08 PM, Jan Kiszka wrote: >> As a first step, I will post a series later that gets rid of >> kvm_flush_coalesced_mmio_buffer in the common vmexit path. > > If you defer this, I can think of two places that need to flush: > - anything that accesses those memory areas (such as DMA to the > framebuffer, or updating the display) - anything that accesses related areas (in case of VGA: PIO accesses to the control ports). I'm providing memory_region_set_flush_coalesced that allows to flush on non-coalesced region accesses as well. Some PIO accesses unfortunately still need open-coded qemu_flush_coalesced_mmio_buffer as they do not use memory regions yet. > - anything that modifies the memory map and possibly changes flushed > addresses Good point, need to address this. Jan
On 06/24/2012 05:40 PM, Jan Kiszka wrote: > On 2012-06-24 16:35, Avi Kivity wrote: >> On 06/24/2012 05:08 PM, Jan Kiszka wrote: >>> As a first step, I will post a series later that gets rid of >>> kvm_flush_coalesced_mmio_buffer in the common vmexit path. >> >> If you defer this, I can think of two places that need to flush: >> - anything that accesses those memory areas (such as DMA to the >> framebuffer, or updating the display) > > - anything that accesses related areas (in case of VGA: PIO accesses to > the control ports). I'm providing memory_region_set_flush_coalesced that > allows to flush on non-coalesced region accesses as well. Some PIO > accesses unfortunately still need open-coded > qemu_flush_coalesced_mmio_buffer as they do not use memory regions yet. Framebuffer access will bypass the MemoryRegionOps callbacks, did you intend to hook those? I'm not sure the problem is general enough to merit a check in our generic mmio dispatch code (granted, now it has a check in the vcpu exit path which is much worse).
On 2012-06-24 16:46, Avi Kivity wrote: > On 06/24/2012 05:40 PM, Jan Kiszka wrote: >> On 2012-06-24 16:35, Avi Kivity wrote: >>> On 06/24/2012 05:08 PM, Jan Kiszka wrote: >>>> As a first step, I will post a series later that gets rid of >>>> kvm_flush_coalesced_mmio_buffer in the common vmexit path. >>> >>> If you defer this, I can think of two places that need to flush: >>> - anything that accesses those memory areas (such as DMA to the >>> framebuffer, or updating the display) >> >> - anything that accesses related areas (in case of VGA: PIO accesses to >> the control ports). I'm providing memory_region_set_flush_coalesced that >> allows to flush on non-coalesced region accesses as well. Some PIO >> accesses unfortunately still need open-coded >> qemu_flush_coalesced_mmio_buffer as they do not use memory regions yet. > > Framebuffer access will bypass the MemoryRegionOps callbacks, did you > intend to hook those? Are there really cases where the framebuffer is accessible both via MMIO and RAM-like mappings at the same time? If so, the current flushing on vmexit would not help either, as the direct mappings would not trigger exits. Or what do you mean? > > I'm not sure the problem is general enough to merit a check in our > generic mmio dispatch code (granted, now it has a check in the vcpu exit > path which is much worse). The current situation is indeed much worse. Jan
On 06/24/2012 05:51 PM, Jan Kiszka wrote: > On 2012-06-24 16:46, Avi Kivity wrote: >> On 06/24/2012 05:40 PM, Jan Kiszka wrote: >>> On 2012-06-24 16:35, Avi Kivity wrote: >>>> On 06/24/2012 05:08 PM, Jan Kiszka wrote: >>>>> As a first step, I will post a series later that gets rid of >>>>> kvm_flush_coalesced_mmio_buffer in the common vmexit path. >>>> >>>> If you defer this, I can think of two places that need to flush: >>>> - anything that accesses those memory areas (such as DMA to the >>>> framebuffer, or updating the display) >>> >>> - anything that accesses related areas (in case of VGA: PIO accesses to >>> the control ports). I'm providing memory_region_set_flush_coalesced that >>> allows to flush on non-coalesced region accesses as well. Some PIO >>> accesses unfortunately still need open-coded >>> qemu_flush_coalesced_mmio_buffer as they do not use memory regions yet. >> >> Framebuffer access will bypass the MemoryRegionOps callbacks, did you >> intend to hook those? > > Are there really cases where the framebuffer is accessible both via MMIO > and RAM-like mappings at the same time? If so, the current flushing on > vmexit would help either as the direct mappings would not trigger exits. > Or what do you mean? I meant accesses by the display code to put the framebuffer in front of the user's eyes. Those just access the memory directly. We could wrap them with memory_region_get/put_ram_ptr() (as Xen requires anyway) and have those issue the flush.
On 2012-06-24 16:56, Avi Kivity wrote: > On 06/24/2012 05:51 PM, Jan Kiszka wrote: >> On 2012-06-24 16:46, Avi Kivity wrote: >>> On 06/24/2012 05:40 PM, Jan Kiszka wrote: >>>> On 2012-06-24 16:35, Avi Kivity wrote: >>>>> On 06/24/2012 05:08 PM, Jan Kiszka wrote: >>>>>> As a first step, I will post a series later that gets rid of >>>>>> kvm_flush_coalesced_mmio_buffer in the common vmexit path. >>>>> >>>>> If you defer this, I can think of two places that need to flush: >>>>> - anything that accesses those memory areas (such as DMA to the >>>>> framebuffer, or updating the display) >>>> >>>> - anything that accesses related areas (in case of VGA: PIO accesses to >>>> the control ports). I'm providing memory_region_set_flush_coalesced that >>>> allows to flush on non-coalesced region accesses as well. Some PIO >>>> accesses unfortunately still need open-coded >>>> qemu_flush_coalesced_mmio_buffer as they do not use memory regions yet. >>> >>> Framebuffer access will bypass the MemoryRegionOps callbacks, did you >>> intend to hook those? >> >> Are there really cases where the framebuffer is accessible both via MMIO >> and RAM-like mappings at the same time? If so, the current flushing on >> vmexit would help either as the direct mappings would not trigger exits. >> Or what do you mean? > > I meant accesses by the display code to put the framebuffer in front of > the user's eyes. Those just access the memory directly. We could wrap > them with memory_region_get/put_ram_ptr() (as Xen requires anyway) and > have those issue the flush. That's a non issue, we already have to flush on display updates. See e.g. vga_update_display. Jan
On 06/24/2012 05:58 PM, Jan Kiszka wrote: >>> Are there really cases where the framebuffer is accessible both via MMIO >>> and RAM-like mappings at the same time? If so, the current flushing on >>> vmexit would help either as the direct mappings would not trigger exits. >>> Or what do you mean? >> >> I meant accesses by the display code to put the framebuffer in front of >> the user's eyes. Those just access the memory directly. We could wrap >> them with memory_region_get/put_ram_ptr() (as Xen requires anyway) and >> have those issue the flush. > > That's a non issue, we already have to flush on display updates. See > e.g. vga_update_display. > Ah, of course.
On Sat, Jun 23, 2012 at 12:55:49AM +0200, Jan Kiszka wrote: > Should have declared this [RFC] in the subject and CC'ed kvm... > > On 2012-06-23 00:45, Jan Kiszka wrote: > > This sketches a possible path to get rid of the iothread lock on vmexits > > in KVM mode. On x86, the the in-kernel irqchips has to be used because > > we otherwise need to synchronize APIC and other per-cpu state accesses > > that could be changed concurrently. Not yet fully analyzed is the NMI > > injection path in the absence of an APIC. > > > > s390x should be fine without specific locking as their pre/post-run > > callbacks are empty. Power requires locking for the pre-run callback. > > > > This patch is untested, but a similar version was successfully used in > > a x86 setup with a network I/O path that needed no central iothread > > locking anymore (required special MMIO exit handling). > > --- > > kvm-all.c | 18 ++++++++++++++++-- > > target-i386/kvm.c | 7 +++++++ > > target-ppc/kvm.c | 4 ++++ > > 3 files changed, 27 insertions(+), 2 deletions(-) > > > > diff --git a/kvm-all.c b/kvm-all.c > > index f8e4328..9c3e26f 100644 > > --- a/kvm-all.c > > +++ b/kvm-all.c > > @@ -1460,6 +1460,8 @@ int kvm_cpu_exec(CPUArchState *env) > > return EXCP_HLT; > > } > > > > + qemu_mutex_unlock_iothread(); > > + > > do { > > if (env->kvm_vcpu_dirty) { > > kvm_arch_put_registers(env, KVM_PUT_RUNTIME_STATE); > > @@ -1476,14 +1478,16 @@ int kvm_cpu_exec(CPUArchState *env) > > */ > > qemu_cpu_kick_self(); > > } > > - qemu_mutex_unlock_iothread(); > > > > run_ret = kvm_vcpu_ioctl(env, KVM_RUN, 0); > > > > - qemu_mutex_lock_iothread(); > > kvm_arch_post_run(env, run); > > > > + /* TODO: push coalesced mmio flushing to the point where we access > > + * devices that are using it (currently VGA and E1000). 
*/ > > + qemu_mutex_lock_iothread(); > > kvm_flush_coalesced_mmio_buffer(); > > + qemu_mutex_unlock_iothread(); > > > > if (run_ret < 0) { > > if (run_ret == -EINTR || run_ret == -EAGAIN) { > > @@ -1499,19 +1503,23 @@ int kvm_cpu_exec(CPUArchState *env) > > switch (run->exit_reason) { > > case KVM_EXIT_IO: > > DPRINTF("handle_io\n"); > > + qemu_mutex_lock_iothread(); > > kvm_handle_io(run->io.port, > > (uint8_t *)run + run->io.data_offset, > > run->io.direction, > > run->io.size, > > run->io.count); > > + qemu_mutex_unlock_iothread(); > > ret = 0; > > break; > > case KVM_EXIT_MMIO: > > DPRINTF("handle_mmio\n"); > > + qemu_mutex_lock_iothread(); > > cpu_physical_memory_rw(run->mmio.phys_addr, > > run->mmio.data, > > run->mmio.len, > > run->mmio.is_write); > > + qemu_mutex_unlock_iothread(); > > ret = 0; > > break; > > case KVM_EXIT_IRQ_WINDOW_OPEN: > > @@ -1520,7 +1528,9 @@ int kvm_cpu_exec(CPUArchState *env) > > break; > > case KVM_EXIT_SHUTDOWN: > > DPRINTF("shutdown\n"); > > + qemu_mutex_lock_iothread(); > > qemu_system_reset_request(); > > + qemu_mutex_unlock_iothread(); > > ret = EXCP_INTERRUPT; > > break; > > case KVM_EXIT_UNKNOWN: > > @@ -1533,11 +1543,15 @@ int kvm_cpu_exec(CPUArchState *env) > > break; > > default: > > DPRINTF("kvm_arch_handle_exit\n"); > > + qemu_mutex_lock_iothread(); > > ret = kvm_arch_handle_exit(env, run); > > + qemu_mutex_unlock_iothread(); > > break; > > } > > } while (ret == 0); > > > > + qemu_mutex_lock_iothread(); > > + > > if (ret < 0) { > > cpu_dump_state(env, stderr, fprintf, CPU_DUMP_CODE); > > vm_stop(RUN_STATE_INTERNAL_ERROR); > > diff --git a/target-i386/kvm.c b/target-i386/kvm.c > > index 0d0d8f6..0ad64d1 100644 > > --- a/target-i386/kvm.c > > +++ b/target-i386/kvm.c > > @@ -1631,7 +1631,10 @@ void kvm_arch_pre_run(CPUX86State *env, struct kvm_run *run) > > > > /* Inject NMI */ > > if (env->interrupt_request & CPU_INTERRUPT_NMI) { > > + qemu_mutex_lock_iothread(); > > env->interrupt_request &= ~CPU_INTERRUPT_NMI; > > + 
qemu_mutex_unlock_iothread(); > > + > > DPRINTF("injected NMI\n"); > > ret = kvm_vcpu_ioctl(env, KVM_NMI); > > if (ret < 0) { > > @@ -1641,6 +1644,8 @@ void kvm_arch_pre_run(CPUX86State *env, struct kvm_run *run) > > } > > > > if (!kvm_irqchip_in_kernel()) { > > + qemu_mutex_lock_iothread(); > > + > > /* Force the VCPU out of its inner loop to process any INIT requests > > * or pending TPR access reports. */ > > if (env->interrupt_request & > > @@ -1682,6 +1687,8 @@ void kvm_arch_pre_run(CPUX86State *env, struct kvm_run *run) > > > > DPRINTF("setting tpr\n"); > > run->cr8 = cpu_get_apic_tpr(env->apic_state); > > + > > + qemu_mutex_unlock_iothread(); > > } > > } > > > > diff --git a/target-ppc/kvm.c b/target-ppc/kvm.c > > index c09cc39..60d91a5 100644 > > --- a/target-ppc/kvm.c > > +++ b/target-ppc/kvm.c > > @@ -471,6 +471,8 @@ void kvm_arch_pre_run(CPUPPCState *env, struct kvm_run *run) > > int r; > > unsigned irq; > > > > + qemu_mutex_lock_iothread(); > > + > > /* PowerPC QEMU tracks the various core input pins (interrupt, critical > > * interrupt, reset, etc) in PPC-specific env->irq_input_state. */ > > if (!cap_interrupt_level && > > @@ -497,6 +499,8 @@ void kvm_arch_pre_run(CPUPPCState *env, struct kvm_run *run) > > /* We don't know if there are more interrupts pending after this. However, > > * the guest will return to userspace in the course of handling this one > > * anyways, so we will get a chance to deliver the rest. */ > > + > > + qemu_mutex_unlock_iothread(); > > } > > > > void kvm_arch_post_run(CPUPPCState *env, struct kvm_run *run) > > The following plan would allow progressive convertion to parallel operation. Jan mentioned the MMIO handler->MMIO handler deadlock in a private message. Jan: if there is recursive MMIO accesses, you can detect that and skip such MMIO handlers in dev_can_use_lock() ? Or blacklist. 
What i mentioned earlier about "unsolved issues" are the possibilities in "iothread flow" (belief is that it would be better determined at with execution experience / tuning). OK, so, each document starts with xyz.txt. This is compatible with TCG, the critical part is not dropping/acquire locks for decent performance (that is, only drop memmap_lock from TCG VCPUS thread if IOTHREAD requests so). This came out from discussions with Avi. immediate-plan.txt ------------------ 1. read_lock(memmap_lock) 2. MemoryRegionSection mrs = lookup(addr) 3. qom_ref(mrs.mr->dev) 4. read_unlock(memmap_lock) 5. mutex_lock(dev->lock) 6. dispatch(&mrs, addr, data, size) 7. mutex_unlock(dev->lock) 8. qom_unref(mrs.mr->object) A) Add Device object to MemoryRegions. memory_region_add_subregion memory_region_add_eventfd memory_region_add_subregion_overlap B) qom_ref details Z) see device-hotplug.txt items lock-model.txt -------------- lock_device(dev) { if (dev_can_use_lock(dev)) mutex_lock(dev->lock) else mutex_lock(qemu_global_mutex) } dev_can_use_lock(dev) { if (all services used by dev mmio handler have) - qemu_global_mutex - trylock(dev->lock) and fail then yes else then no } That is, from the moment in which the device uses the order (dev->lock -> qemu_global_mutex), the qemu_global_mutex -> dev->lock is forbidden by the IOTHREAD or anyone else. convert-to-unlocked.txt ----------------------- 1. read_lock(memmap_lock) 2. MemoryRegionSection mrs = lookup(addr) 3. qom_ref(mrs.mr->dev) 4. read_unlock(memmap_lock) 5. mutex_lock(dev->lock) 6. dispatch(&mrs, addr, data, size) 7. mutex_unlock(dev->lock) 8. qom_unref(mrs.mr->object) THE ITEMS BELOW SHOULD BE DOCUMENTATION TO CONVERT DEVICE TO UNLOCKED OPERATION. 1) qom_ref guarantees that the Device object does not disappear. However, these operations are allowed in parallel: a) after 4. device memory and ioports might change or be unregistered. 
Concurrent accesses of a device that is being hotunplugged or whose mappings change are the responsibility of the OS, so incorrect emulation is not QEMU's problem. However, device emulation is now subject to: - cpu_physical_memory_map failing. - stl_phys* failing. - cpu_physical_memory_rw failing/reading from unassigned mem. b) what should happen with a pending cpu_physical_memory_map pointer, when deleting the corresponding memory region? net.txt -------- iothread flow ============= 1) Skip-work-if-device-locked select(tap fd ready) tap_send if (trylock(TAPState->NetClientState->dev)) proceed; else continue; (data remains in queue) tap_read_packet qemu_send_packet_async DRAWBACK: lock intensive. DRAWBACK: VCPU threads can starve IOTHREAD (can be fixed with a scheme such as trylock() marks a flag saying "iothread wants lock". Checking each subsystem for possibility: - tap_send: OK - user,slirp: if not possible, use device->lock. - qemu-char: OK - interrupts: lock with qemu_global_mutex. - qemutimer: - read time: qemu_get_clock_ns (see later). 2) Queue-work-to-vcpu-context select(tap fd ready) tap_send if (trylock(TAPState->NetClientState->dev)) { proceed; } else { AddToDeviceWorkQueue(work); } tap_read_packet qemu_send_packet_async DRAWBACK: lock intensive DRAWBACK: cache effects 3) VCPU-thread-notify-device-unlocked select(tap fd ready) tap_send if (trylock(TAPState->NetClientState->dev)) { proceed; } else { signal VCPU thread to notify IOTHREAD when unlocked. } tap_read_packet GENERAL OBJECTIVE: to avoid lock inversion. IOTHREAD should follow same order. PCI HOTPLUG EJECT FLOW ---------------------- 1) Monitor initiated. monitor -> qmp_device_del -> qdev_unplug -> pci_unplug_device -> piix4_device_hotplug -> SCI interrupt -> OS removes application users from device -> OS runs _EJ0 -> For hot removal, the device must be immediately ejected when OSPM calls the _EJ0 control method. The _EJ0 control method does not return until ejection is complete. 
After calling _EJ0, OSPM verifies the device no longer exists to determine if the eject succeeded. For _HID devices, OSPM evaluates the _STA method. For _ADR devices, OSPM checks with the bus driver for that device. Avi: A pci eject guarantees that all accesses started after the eject completes do not see the device. Can you do: - qobject_remove() and have the device dispatch running safely? NO, therefore pci_unplug_device must hold dev->lock.
On Tue, Jun 26, 2012 at 8:34 PM, Marcelo Tosatti <mtosatti@redhat.com> wrote: > On Sat, Jun 23, 2012 at 12:55:49AM +0200, Jan Kiszka wrote: > net.txt > -------- > > > iothread flow > ============= > > 1) Skip-work-if-device-locked > > select(tap fd ready) > tap_send > if (trylock(TAPState->NetClientState->dev)) > proceed; > else > continue; (data remains in queue) > tap_read_packet > qemu_send_packet_async > > DRAWBACK: lock intensive. > DRAWBACK: VCPU threads can starve IOTHREAD (can be fixed with > a scheme such as trylock() marks a flag saying "iothread wants lock". One alternative is to say the lock cannot be held across system calls or other long-running operations (similar to interrupt handler spinlocks in the kernel). But QEMU threads are still at the mercy of the scheduler or page faults, so perhaps this is not a good idea because waiting on the lock could tie up the iothread. > 2) Queue-work-to-vcpu-context > > select(tap fd ready) > tap_send > if (trylock(TAPState->NetClientState->dev)) { > proceed; > } else { > AddToDeviceWorkQueue(work); > } > tap_read_packet > qemu_send_packet_async > > DRAWBACK: lock intensive > DRAWBACK: cache effects This almost suggests we should invert packet reception for NICs. tap fd ready should add a work item and the guest device calls into net/tap.c to pull out any packets from the work function: tap_send dev_add_work(work); virtio_net_rx_work_fn while ((req = virtqueue_pop(rx_queue))) { if (!net_receive_packet(netclient, req->buf)) { break; /* no more packets available */ } /* ...mark req complete and push onto virtqueue... */ } virtqueue_notify(rx_queue); The idea is to avoid copies on the rx path and make the locking simple by always deferring to a work function (which can be invoked immediately if the device isn't locked). Stefan
On Wed, Jun 27, 2012 at 8:39 AM, Stefan Hajnoczi <stefanha@gmail.com> wrote: > On Tue, Jun 26, 2012 at 8:34 PM, Marcelo Tosatti <mtosatti@redhat.com> wrote: >> On Sat, Jun 23, 2012 at 12:55:49AM +0200, Jan Kiszka wrote: >> net.txt >> -------- >> >> >> iothread flow >> ============= >> >> 1) Skip-work-if-device-locked >> >> select(tap fd ready) >> tap_send >> if (trylock(TAPState->NetClientState->dev)) >> proceed; >> else >> continue; (data remains in queue) >> tap_read_packet >> qemu_send_packet_async >> >> DRAWBACK: lock intensive. >> DRAWBACK: VCPU threads can starve IOTHREAD (can be fixed with >> a scheme such as trylock() marks a flag saying "iothread wants lock". > > One alternative is to say the lock cannot be held across system calls > or other long-running operations (similar to interrupt handler > spinlocks in the kernel). But QEMU threads are still at the mercy of > the scheduler or page faults, so perhaps this is not a good idea > because waiting on the lock could tie up the iothread. > >> 2) Queue-work-to-vcpu-context >> >> select(tap fd ready) >> tap_send >> if (trylock(TAPState->NetClientState->dev)) { >> proceed; >> } else { >> AddToDeviceWorkQueue(work); >> } >> tap_read_packet >> qemu_send_packet_async >> >> DRAWBACK: lock intensive >> DRAWBACK: cache effects > > This almost suggests we should invert packet reception for NICs. tap > fd ready should add a work item and the guest device calls into > net/tap.c to pull out any packets from the work function: > > tap_send > dev_add_work(work); > > virtio_net_rx_work_fn > while ((req = virtqueue_pop(rx_queue))) { > if (!net_receive_packet(netclient, req->buf)) { > break; /* no more packets available */ > } > /* ...mark req complete and push onto virtqueue... */ > } > virtqueue_notify(rx_queue); > > The idea is to avoid copies on the rx path and make the locking simple > by always deferring to a work function (which can be invoked > immediately if the device isn't locked). 
I just realized this approach bypasses net/queue.c. I think it works really well for the peer model (no "VLANs"). Basically flow control is dictated by the peer (receiver) and we don't need to use NetQueue anymore. Whether this works elegantly for slirp and other backends, I'm not sure yet. Stefan
On 06/26/2012 10:34 PM, Marcelo Tosatti wrote: > > 1. read_lock(memmap_lock) > 2. MemoryRegionSection mrs = lookup(addr) > 3. qom_ref(mrs.mr->dev) > 4. read_unlock(memmap_lock) > > 5. mutex_lock(dev->lock) > 6. dispatch(&mrs, addr, data, size) > 7. mutex_unlock(dev->lock) > > 8. qom_unref(mrs.mr->object) The plan also includes eventually replacing read_lock() with rcu; so two devices can be accessed in parallel with no cacheline bouncing.
On Wed, Jun 27, 2012 at 08:41:49AM +0100, Stefan Hajnoczi wrote: > On Wed, Jun 27, 2012 at 8:39 AM, Stefan Hajnoczi <stefanha@gmail.com> wrote: > > On Tue, Jun 26, 2012 at 8:34 PM, Marcelo Tosatti <mtosatti@redhat.com> wrote: > >> On Sat, Jun 23, 2012 at 12:55:49AM +0200, Jan Kiszka wrote: > >> net.txt > >> -------- > >> > >> > >> iothread flow > >> ============= > >> > >> 1) Skip-work-if-device-locked > >> > >> select(tap fd ready) > >> tap_send > >> if (trylock(TAPState->NetClientState->dev)) > >> proceed; > >> else > >> continue; (data remains in queue) > >> tap_read_packet > >> qemu_send_packet_async > >> > >> DRAWBACK: lock intensive. > >> DRAWBACK: VCPU threads can starve IOTHREAD (can be fixed with > >> a scheme such as trylock() marks a flag saying "iothread wants lock". > > > > One alternative is to say the lock cannot be held across system calls > > or other long-running operations (similar to interrupt handler > > spinlocks in the kernel). But QEMU threads are still at the mercy of > > the scheduler or page faults, so perhaps this is not a good idea > > because waiting on the lock could tie up the iothread. > > > >> 2) Queue-work-to-vcpu-context > >> > >> select(tap fd ready) > >> tap_send > >> if (trylock(TAPState->NetClientState->dev)) { > >> proceed; > >> } else { > >> AddToDeviceWorkQueue(work); > >> } > >> tap_read_packet > >> qemu_send_packet_async > >> > >> DRAWBACK: lock intensive > >> DRAWBACK: cache effects > > > > This almost suggests we should invert packet reception for NICs. tap > > fd ready should add a work item and the guest device calls into > > net/tap.c to pull out any packets from the work function: > > > > tap_send > > dev_add_work(work); > > > > virtio_net_rx_work_fn > > while ((req = virtqueue_pop(rx_queue))) { > > if (!net_receive_packet(netclient, req->buf)) { > > break; /* no more packets available */ > > } > > /* ...mark req complete and push onto virtqueue... 
*/ > > } > > virtqueue_notify(rx_queue); > > > > The idea is to avoid copies on the rx path and make the locking simple > > by always deferring to a work function (which can be invoked > > immediately if the device isn't locked). > > I just realized this approach bypasses net/queue.c. I think it works > really well for the peer model (no "VLANs"). Basically flow control > is dictated by the peer (receiver) and we don't need to use NetQueue > anymore. Whether this works elegantly for slirp and other backends, > I'm not sure yet. Right. The advantage is only backends where performance matters need to be converted (and only net hw drivers that matter performance wise need to be converted).
On Wed, Jun 27, 2012 at 08:41:49AM +0100, Stefan Hajnoczi wrote: > On Wed, Jun 27, 2012 at 8:39 AM, Stefan Hajnoczi <stefanha@gmail.com> wrote: > > On Tue, Jun 26, 2012 at 8:34 PM, Marcelo Tosatti <mtosatti@redhat.com> wrote: > >> On Sat, Jun 23, 2012 at 12:55:49AM +0200, Jan Kiszka wrote: > >> net.txt > >> -------- > >> > >> > >> iothread flow > >> ============= > >> > >> 1) Skip-work-if-device-locked > >> > >> select(tap fd ready) > >> tap_send > >> if (trylock(TAPState->NetClientState->dev)) > >> proceed; > >> else > >> continue; (data remains in queue) > >> tap_read_packet > >> qemu_send_packet_async > >> > >> DRAWBACK: lock intensive. > >> DRAWBACK: VCPU threads can starve IOTHREAD (can be fixed with > >> a scheme such as trylock() marks a flag saying "iothread wants lock". > > > > One alternative is to say the lock cannot be held across system calls > > or other long-running operations (similar to interrupt handler > > spinlocks in the kernel). But QEMU threads are still at the mercy of > > the scheduler or page faults, so perhaps this is not a good idea > > because waiting on the lock could tie up the iothread. > > > >> 2) Queue-work-to-vcpu-context > >> > >> select(tap fd ready) > >> tap_send > >> if (trylock(TAPState->NetClientState->dev)) { > >> proceed; > >> } else { > >> AddToDeviceWorkQueue(work); > >> } > >> tap_read_packet > >> qemu_send_packet_async > >> > >> DRAWBACK: lock intensive > >> DRAWBACK: cache effects > > > > This almost suggests we should invert packet reception for NICs. tap > > fd ready should add a work item and the guest device calls into > > net/tap.c to pull out any packets from the work function: > > > > tap_send > > dev_add_work(work); > > > > virtio_net_rx_work_fn > > while ((req = virtqueue_pop(rx_queue))) { > > if (!net_receive_packet(netclient, req->buf)) { > > break; /* no more packets available */ > > } > > /* ...mark req complete and push onto virtqueue... 
*/ > > } > > virtqueue_notify(rx_queue); > > > > The idea is to avoid copies on the rx path and make the locking simple > > by always deferring to a work function (which can be invoked > > immediately if the device isn't locked). > > I just realized this approach bypasses net/queue.c. I think it works > really well for the peer model (no "VLANs"). Basically flow control > is dictated by the peer (receiver) and we don't need to use NetQueue > anymore. Whether this works elegantly for slirp and other backends, > I'm not sure yet. > > Stefan Right. The advantage is only backends where performance matters need to be converted (and only net hw drivers that matter performance wise need to be converted). Apparently you guys have very different ideas on the higher level model here. It would be good to agree on one structure (including modifications on the one above) to follow and share the work on that direction. Having competing implementations in this case is a waste of time.
On 2012-06-26 21:34, Marcelo Tosatti wrote: > The following plan would allow progressive convertion to parallel > operation. > > Jan mentioned the MMIO handler->MMIO handler deadlock in a private message. > > Jan: if there is recursive MMIO accesses, you can detect that and skip > such MMIO handlers in dev_can_use_lock() ? Or blacklist. The problem is harder as it may appear on first sight. I checked our code again, and it also still contains at least one unhandled lockup scenario. We could try to detect this but it's tricky, maybe even fragile in more complex scenarios (risk of false positives when using timeouts e.g.). Well, such kind of mutual device-to-device requests are likely all pathological, and I guess it would be ok to actually let the devices lock up. But then we need some way to recover them, at least via a virtual machine reset. That implies, of course, they must not lock up while holding the central lock... Need to look into details of your approach now. Jan
On Sat, Jun 23, 2012 at 11:22:07AM +0200, Jan Kiszka wrote: > On 2012-06-23 02:22, Marcelo Tosatti wrote: > > On Sat, Jun 23, 2012 at 12:55:49AM +0200, Jan Kiszka wrote: > >> Should have declared this [RFC] in the subject and CC'ed kvm... > >> > >> On 2012-06-23 00:45, Jan Kiszka wrote: > >>> This sketches a possible path to get rid of the iothread lock on vmexits > >>> in KVM mode. On x86, the the in-kernel irqchips has to be used because > >>> we otherwise need to synchronize APIC and other per-cpu state accesses > >>> that could be changed concurrently. Not yet fully analyzed is the NMI > >>> injection path in the absence of an APIC. > >>> > >>> s390x should be fine without specific locking as their pre/post-run > >>> callbacks are empty. Power requires locking for the pre-run callback. > >>> > >>> This patch is untested, but a similar version was successfully used in > >>> a x86 setup with a network I/O path that needed no central iothread > >>> locking anymore (required special MMIO exit handling). 
> >>> --- > >>> kvm-all.c | 18 ++++++++++++++++-- > >>> target-i386/kvm.c | 7 +++++++ > >>> target-ppc/kvm.c | 4 ++++ > >>> 3 files changed, 27 insertions(+), 2 deletions(-) > >>> > >>> diff --git a/kvm-all.c b/kvm-all.c > >>> index f8e4328..9c3e26f 100644 > >>> --- a/kvm-all.c > >>> +++ b/kvm-all.c > >>> @@ -1460,6 +1460,8 @@ int kvm_cpu_exec(CPUArchState *env) > >>> return EXCP_HLT; > >>> } > >>> > >>> + qemu_mutex_unlock_iothread(); > >>> + > >>> do { > >>> if (env->kvm_vcpu_dirty) { > >>> kvm_arch_put_registers(env, KVM_PUT_RUNTIME_STATE); > >>> @@ -1476,14 +1478,16 @@ int kvm_cpu_exec(CPUArchState *env) > >>> */ > >>> qemu_cpu_kick_self(); > >>> } > >>> - qemu_mutex_unlock_iothread(); > >>> > >>> run_ret = kvm_vcpu_ioctl(env, KVM_RUN, 0); > >>> > >>> - qemu_mutex_lock_iothread(); > >>> kvm_arch_post_run(env, run); > > > > target-i386/kvm.c > > > > void kvm_arch_post_run(CPUX86State *env, struct kvm_run *run) > > { > > if (run->if_flag) { > > env->eflags |= IF_MASK; > > } else { > > env->eflags &= ~IF_MASK; > > } > > cpu_set_apic_tpr(env->apic_state, run->cr8); > > cpu_set_apic_base(env->apic_state, run->apic_base); > > } > > > > Clearly there is no structure to any of the writes around the writes > > in x86's kvm_arch_post_run, so it is unsafe. > > Can't parse this yet. > > None of the fields touched above should be modified outside of the vcpu > thread context (as long as that thread is inside the inner loop). > Therefore, it should be safe to run that functions without any lock. Am > I missing something? Maybe no in practice, for env->eflags. But it should be formalized, eg BUG_ON(!vcpu) env->xxx, or some other form of making it not accessible outside vcpu context. However, as an example APIC_COMMON_GET_CLASS -> OBJECT_GET_CLASS -> .... static TypeImpl *type_table_lookup(const char *name) { return g_hash_table_lookup(type_table_get(), name); } Is that safe?
On Wed, Jun 27, 2012 at 12:19 PM, Marcelo Tosatti <mtosatti@redhat.com> wrote: > On Wed, Jun 27, 2012 at 08:41:49AM +0100, Stefan Hajnoczi wrote: >> On Wed, Jun 27, 2012 at 8:39 AM, Stefan Hajnoczi <stefanha@gmail.com> wrote: >> > On Tue, Jun 26, 2012 at 8:34 PM, Marcelo Tosatti <mtosatti@redhat.com> wrote: >> >> On Sat, Jun 23, 2012 at 12:55:49AM +0200, Jan Kiszka wrote: >> >> net.txt >> >> -------- >> >> >> >> >> >> iothread flow >> >> ============= >> >> >> >> 1) Skip-work-if-device-locked >> >> >> >> select(tap fd ready) >> >> tap_send >> >> if (trylock(TAPState->NetClientState->dev)) >> >> proceed; >> >> else >> >> continue; (data remains in queue) >> >> tap_read_packet >> >> qemu_send_packet_async >> >> >> >> DRAWBACK: lock intensive. >> >> DRAWBACK: VCPU threads can starve IOTHREAD (can be fixed with >> >> a scheme such as trylock() marks a flag saying "iothread wants lock". >> > >> > One alternative is to say the lock cannot be held across system calls >> > or other long-running operations (similar to interrupt handler >> > spinlocks in the kernel). But QEMU threads are still at the mercy of >> > the scheduler or page faults, so perhaps this is not a good idea >> > because waiting on the lock could tie up the iothread. >> > >> >> 2) Queue-work-to-vcpu-context >> >> >> >> select(tap fd ready) >> >> tap_send >> >> if (trylock(TAPState->NetClientState->dev)) { >> >> proceed; >> >> } else { >> >> AddToDeviceWorkQueue(work); >> >> } >> >> tap_read_packet >> >> qemu_send_packet_async >> >> >> >> DRAWBACK: lock intensive >> >> DRAWBACK: cache effects >> > >> > This almost suggests we should invert packet reception for NICs. 
tap >> > fd ready should add a work item and the guest device calls into >> > net/tap.c to pull out any packets from the work function: >> > >> > tap_send >> > dev_add_work(work); >> > >> > virtio_net_rx_work_fn >> > while ((req = virtqueue_pop(rx_queue))) { >> > if (!net_receive_packet(netclient, req->buf)) { >> > break; /* no more packets available */ >> > } >> > /* ...mark req complete and push onto virtqueue... */ >> > } >> > virtqueue_notify(rx_queue); >> > >> > The idea is to avoid copies on the rx path and make the locking simple >> > by always deferring to a work function (which can be invoked >> > immediately if the device isn't locked). >> >> I just realized this approach bypasses net/queue.c. I think it works >> really well for the peer model (no "VLANs"). Basically flow control >> is dictated by the peer (receiver) and we don't need to use NetQueue >> anymore. Whether this works elegantly for slirp and other backends, >> I'm not sure yet. >> >> Stefan > > Right. The advantage is only backends where performance matters need to > be converted (and only net hw drivers that matter performance wise need > to be converted). > > Apparently you guys have very different ideas on the higher level > model here. It would be good to agree on one structure (including > modifications on the one above) to follow and share the work on that > direction. "You guys" == Ping Fan + Jan + Anthony + Avi? I'm not working directly on QBL removal myself. Stefan
On 06/26/2012 02:34 PM, Marcelo Tosatti wrote: > On Sat, Jun 23, 2012 at 12:55:49AM +0200, Jan Kiszka wrote: >> Should have declared this [RFC] in the subject and CC'ed kvm... >> >> On 2012-06-23 00:45, Jan Kiszka wrote: >>> This sketches a possible path to get rid of the iothread lock on vmexits >>> in KVM mode. On x86, the the in-kernel irqchips has to be used because >>> we otherwise need to synchronize APIC and other per-cpu state accesses >>> that could be changed concurrently. Not yet fully analyzed is the NMI >>> injection path in the absence of an APIC. >>> >>> s390x should be fine without specific locking as their pre/post-run >>> callbacks are empty. Power requires locking for the pre-run callback. >>> >>> This patch is untested, but a similar version was successfully used in >>> a x86 setup with a network I/O path that needed no central iothread >>> locking anymore (required special MMIO exit handling). >>> --- >>> kvm-all.c | 18 ++++++++++++++++-- >>> target-i386/kvm.c | 7 +++++++ >>> target-ppc/kvm.c | 4 ++++ >>> 3 files changed, 27 insertions(+), 2 deletions(-) >>> >>> diff --git a/kvm-all.c b/kvm-all.c >>> index f8e4328..9c3e26f 100644 >>> --- a/kvm-all.c >>> +++ b/kvm-all.c >>> @@ -1460,6 +1460,8 @@ int kvm_cpu_exec(CPUArchState *env) >>> return EXCP_HLT; >>> } >>> >>> + qemu_mutex_unlock_iothread(); >>> + >>> do { >>> if (env->kvm_vcpu_dirty) { >>> kvm_arch_put_registers(env, KVM_PUT_RUNTIME_STATE); >>> @@ -1476,14 +1478,16 @@ int kvm_cpu_exec(CPUArchState *env) >>> */ >>> qemu_cpu_kick_self(); >>> } >>> - qemu_mutex_unlock_iothread(); >>> >>> run_ret = kvm_vcpu_ioctl(env, KVM_RUN, 0); >>> >>> - qemu_mutex_lock_iothread(); >>> kvm_arch_post_run(env, run); >>> >>> + /* TODO: push coalesced mmio flushing to the point where we access >>> + * devices that are using it (currently VGA and E1000). 
*/ >>> + qemu_mutex_lock_iothread(); >>> kvm_flush_coalesced_mmio_buffer(); >>> + qemu_mutex_unlock_iothread(); >>> >>> if (run_ret< 0) { >>> if (run_ret == -EINTR || run_ret == -EAGAIN) { >>> @@ -1499,19 +1503,23 @@ int kvm_cpu_exec(CPUArchState *env) >>> switch (run->exit_reason) { >>> case KVM_EXIT_IO: >>> DPRINTF("handle_io\n"); >>> + qemu_mutex_lock_iothread(); >>> kvm_handle_io(run->io.port, >>> (uint8_t *)run + run->io.data_offset, >>> run->io.direction, >>> run->io.size, >>> run->io.count); >>> + qemu_mutex_unlock_iothread(); >>> ret = 0; >>> break; >>> case KVM_EXIT_MMIO: >>> DPRINTF("handle_mmio\n"); >>> + qemu_mutex_lock_iothread(); >>> cpu_physical_memory_rw(run->mmio.phys_addr, >>> run->mmio.data, >>> run->mmio.len, >>> run->mmio.is_write); >>> + qemu_mutex_unlock_iothread(); >>> ret = 0; >>> break; >>> case KVM_EXIT_IRQ_WINDOW_OPEN: >>> @@ -1520,7 +1528,9 @@ int kvm_cpu_exec(CPUArchState *env) >>> break; >>> case KVM_EXIT_SHUTDOWN: >>> DPRINTF("shutdown\n"); >>> + qemu_mutex_lock_iothread(); >>> qemu_system_reset_request(); >>> + qemu_mutex_unlock_iothread(); >>> ret = EXCP_INTERRUPT; >>> break; >>> case KVM_EXIT_UNKNOWN: >>> @@ -1533,11 +1543,15 @@ int kvm_cpu_exec(CPUArchState *env) >>> break; >>> default: >>> DPRINTF("kvm_arch_handle_exit\n"); >>> + qemu_mutex_lock_iothread(); >>> ret = kvm_arch_handle_exit(env, run); >>> + qemu_mutex_unlock_iothread(); >>> break; >>> } >>> } while (ret == 0); >>> >>> + qemu_mutex_lock_iothread(); >>> + >>> if (ret< 0) { >>> cpu_dump_state(env, stderr, fprintf, CPU_DUMP_CODE); >>> vm_stop(RUN_STATE_INTERNAL_ERROR); >>> diff --git a/target-i386/kvm.c b/target-i386/kvm.c >>> index 0d0d8f6..0ad64d1 100644 >>> --- a/target-i386/kvm.c >>> +++ b/target-i386/kvm.c >>> @@ -1631,7 +1631,10 @@ void kvm_arch_pre_run(CPUX86State *env, struct kvm_run *run) >>> >>> /* Inject NMI */ >>> if (env->interrupt_request& CPU_INTERRUPT_NMI) { >>> + qemu_mutex_lock_iothread(); >>> env->interrupt_request&= ~CPU_INTERRUPT_NMI; >>> + 
qemu_mutex_unlock_iothread(); >>> + >>> DPRINTF("injected NMI\n"); >>> ret = kvm_vcpu_ioctl(env, KVM_NMI); >>> if (ret< 0) { >>> @@ -1641,6 +1644,8 @@ void kvm_arch_pre_run(CPUX86State *env, struct kvm_run *run) >>> } >>> >>> if (!kvm_irqchip_in_kernel()) { >>> + qemu_mutex_lock_iothread(); >>> + >>> /* Force the VCPU out of its inner loop to process any INIT requests >>> * or pending TPR access reports. */ >>> if (env->interrupt_request& >>> @@ -1682,6 +1687,8 @@ void kvm_arch_pre_run(CPUX86State *env, struct kvm_run *run) >>> >>> DPRINTF("setting tpr\n"); >>> run->cr8 = cpu_get_apic_tpr(env->apic_state); >>> + >>> + qemu_mutex_unlock_iothread(); >>> } >>> } >>> >>> diff --git a/target-ppc/kvm.c b/target-ppc/kvm.c >>> index c09cc39..60d91a5 100644 >>> --- a/target-ppc/kvm.c >>> +++ b/target-ppc/kvm.c >>> @@ -471,6 +471,8 @@ void kvm_arch_pre_run(CPUPPCState *env, struct kvm_run *run) >>> int r; >>> unsigned irq; >>> >>> + qemu_mutex_lock_iothread(); >>> + >>> /* PowerPC QEMU tracks the various core input pins (interrupt, critical >>> * interrupt, reset, etc) in PPC-specific env->irq_input_state. */ >>> if (!cap_interrupt_level&& >>> @@ -497,6 +499,8 @@ void kvm_arch_pre_run(CPUPPCState *env, struct kvm_run *run) >>> /* We don't know if there are more interrupts pending after this. However, >>> * the guest will return to userspace in the course of handling this one >>> * anyways, so we will get a chance to deliver the rest. */ >>> + >>> + qemu_mutex_unlock_iothread(); >>> } >>> >>> void kvm_arch_post_run(CPUPPCState *env, struct kvm_run *run) >>> > > The following plan would allow progressive convertion to parallel > operation. > > Jan mentioned the MMIO handler->MMIO handler deadlock in a private message. > > Jan: if there is recursive MMIO accesses, you can detect that and skip > such MMIO handlers in dev_can_use_lock() ? Or blacklist. 
> > What i mentioned earlier about "unsolved issues" are the possibilities > in "iothread flow" (belief is that it would be better determined at > with execution experience / tuning). > > OK, so, each document starts with xyz.txt. This is compatible with TCG, > the critical part is not dropping/acquire locks for decent performance > (that is, only drop memmap_lock from TCG VCPUS thread if IOTHREAD > requests so). > > This came out from discussions with Avi. > > immediate-plan.txt > ------------------ > > 1. read_lock(memmap_lock) > 2. MemoryRegionSection mrs = lookup(addr) > 3. qom_ref(mrs.mr->dev) > 4. read_unlock(memmap_lock) > > 5. mutex_lock(dev->lock) > 6. dispatch(&mrs, addr, data, size) > 7. mutex_unlock(dev->lock) Just a detail, I don't think we should acquire a device specific lock in global code. Rather, I think we should acquire the global lock before dispatch unless a MemoryRegion is marked as being unlocked. Regards, Anthony Liguori > > 8. qom_unref(mrs.mr->object) > > > A) Add Device object to MemoryRegions. > > memory_region_add_subregion > memory_region_add_eventfd > memory_region_add_subregion_overlap > > > B) qom_ref details > > Z) see device-hotplug.txt items > > > > lock-model.txt > -------------- > > > lock_device(dev) { > if (dev_can_use_lock(dev)) > mutex_lock(dev->lock) > else > mutex_lock(qemu_global_mutex) > } > > dev_can_use_lock(dev) > { > if (all services used by dev mmio handler have) > - qemu_global_mutex > - trylock(dev->lock) and fail > > then yes > else > then no > } > > > That is, from the moment in which the device uses the order > (dev->lock -> qemu_global_mutex), the qemu_global_mutex -> dev->lock > is forbidden by the IOTHREAD or anyone else. > > > convert-to-unlocked.txt > ----------------------- > > > 1. read_lock(memmap_lock) > 2. MemoryRegionSection mrs = lookup(addr) > 3. qom_ref(mrs.mr->dev) > 4. read_unlock(memmap_lock) > > 5. mutex_lock(dev->lock) > 6. dispatch(&mrs, addr, data, size) > 7. 
mutex_unlock(dev->lock) > > 8. qom_unref(mrs.mr->object) > > THE ITEMS BELOW SHOULD BE DOCUMENTATION TO CONVERT DEVICE > TO UNLOCKED OPERATION. > > 1) qom_ref guarantees that the Device object does not disappear. However, > these operations are allowed in parallel: > > a) after 4. device memory and ioports might change or be unregistered. > > Concurrent accesses of a device that is being hotunplugged or > whose mappings change are responsibility of the OS, so > incorrect emulation is not QEMU's problem. However, device > emulation is now subject to: > - cpu_physical_memory_map failing. > - stl_phys* failing. > - cpu_physical_memory_rw failing/reading from unassigned mem. > > b) what should happen with a pending cpu_physical_memory_map pointer, > when deleting the corresponding memory region? > > > net.txt > -------- > > > iothread flow > ============= > > 1) Skip-work-if-device-locked > > select(tap fd ready) > tap_send > if (trylock(TAPState->NetClientState->dev)) > proceed; > else > continue; (data remains in queue) > tap_read_packet > qemu_send_packet_async > > DRAWBACK: lock intensive. > DRAWBACK: VCPU threads can starve IOTHREAD (can be fixed with > a scheme such as trylock() marks a flag saying "iothread wants lock"). > > Checking each subsystem for possibility: > > - tap_send: OK > - user,slirp: if not possible, use device->lock. > - qemu-char: OK > - interrupts: lock with qemu_global_mutex. > - qemutimer: > - read time: qemu_get_clock_ns (see later). > > > 2) Queue-work-to-vcpu-context > > select(tap fd ready) > tap_send > if (trylock(TAPState->NetClientState->dev)) { > proceed; > } else { > AddToDeviceWorkQueue(work); > } > tap_read_packet > qemu_send_packet_async > > DRAWBACK: lock intensive > DRAWBACK: cache effects > > 3) VCPU-thread-notify-device-unlocked > > select(tap fd ready) > tap_send > if (trylock(TAPState->NetClientState->dev)) { > proceed; > } else { > signal VCPU thread to notify IOTHREAD when unlocked. 
> } > tap_read_packet > > > GENERAL OBJECTIVE: to avoid lock inversion. IOTHREAD should follow > same order. > > > PCI HOTPLUG EJECT FLOW > ---------------------- > > 1) Monitor initiated. > > monitor -> qmp_device_del -> qdev_unplug -> pci_unplug_device -> > piix4_device_hotplug -> SCI interrupt -> OS removes > application users from device -> OS runs _EJ0 -> > > For hot removal, the device must be immediately ejected when OSPM calls > the _EJ0 control method. The _EJ0 control method does not return until > ejection is complete. After calling _EJ0, OSPM verifies the device no > longer exists to determine if the eject succeeded. For _HID devices, > OSPM evaluates the _STA method. For _ADR devices, OSPM checks with the > bus driver for that device. > > > Avi: A pci eject guarantees that all accesses > started after the eject completes do not see the device. > > Can you do: > > - qobject_remove() > > and have the device dispatch running safely? NO, therefore pci_unplug_device > must hold dev->lock. > > > >
On 06/28/2012 05:10 PM, Anthony Liguori wrote: >> >> 1. read_lock(memmap_lock) >> 2. MemoryRegionSection mrs = lookup(addr) >> 3. qom_ref(mrs.mr->dev) >> 4. read_unlock(memmap_lock) >> >> 5. mutex_lock(dev->lock) >> 6. dispatch(&mrs, addr, data, size) >> 7. mutex_unlock(dev->lock) > > Just a detail, I don't think we should acquire a device specific lock in > global code. Rather, I think we should acquire the global lock before > dispatch unless a MemoryRegion is marked as being unlocked. I think I agree. The mutex acquisition is moved inside dispatch (in device-specific code). A side effect is that device code must not call memory_region_destroy() outside its constructor or destructor.
On Thu, Jun 28, 2012 at 09:10:39AM -0500, Anthony Liguori wrote: > On 06/26/2012 02:34 PM, Marcelo Tosatti wrote: > >On Sat, Jun 23, 2012 at 12:55:49AM +0200, Jan Kiszka wrote: > >>Should have declared this [RFC] in the subject and CC'ed kvm... > >> > >>On 2012-06-23 00:45, Jan Kiszka wrote: > >>>This sketches a possible path to get rid of the iothread lock on vmexits > >>>in KVM mode. On x86, the the in-kernel irqchips has to be used because > >>>we otherwise need to synchronize APIC and other per-cpu state accesses > >>>that could be changed concurrently. Not yet fully analyzed is the NMI > >>>injection path in the absence of an APIC. > >>> > >>>s390x should be fine without specific locking as their pre/post-run > >>>callbacks are empty. Power requires locking for the pre-run callback. > >>> > >>>This patch is untested, but a similar version was successfully used in > >>>a x86 setup with a network I/O path that needed no central iothread > >>>locking anymore (required special MMIO exit handling). > >>>--- > >>> kvm-all.c | 18 ++++++++++++++++-- > >>> target-i386/kvm.c | 7 +++++++ > >>> target-ppc/kvm.c | 4 ++++ > >>> 3 files changed, 27 insertions(+), 2 deletions(-) > >>> > >>>diff --git a/kvm-all.c b/kvm-all.c > >>>index f8e4328..9c3e26f 100644 > >>>--- a/kvm-all.c > >>>+++ b/kvm-all.c > >>>@@ -1460,6 +1460,8 @@ int kvm_cpu_exec(CPUArchState *env) > >>> return EXCP_HLT; > >>> } > >>> > >>>+ qemu_mutex_unlock_iothread(); > >>>+ > >>> do { > >>> if (env->kvm_vcpu_dirty) { > >>> kvm_arch_put_registers(env, KVM_PUT_RUNTIME_STATE); > >>>@@ -1476,14 +1478,16 @@ int kvm_cpu_exec(CPUArchState *env) > >>> */ > >>> qemu_cpu_kick_self(); > >>> } > >>>- qemu_mutex_unlock_iothread(); > >>> > >>> run_ret = kvm_vcpu_ioctl(env, KVM_RUN, 0); > >>> > >>>- qemu_mutex_lock_iothread(); > >>> kvm_arch_post_run(env, run); > >>> > >>>+ /* TODO: push coalesced mmio flushing to the point where we access > >>>+ * devices that are using it (currently VGA and E1000). 
*/ > >>>+ qemu_mutex_lock_iothread(); > >>> kvm_flush_coalesced_mmio_buffer(); > >>>+ qemu_mutex_unlock_iothread(); > >>> > >>> if (run_ret< 0) { > >>> if (run_ret == -EINTR || run_ret == -EAGAIN) { > >>>@@ -1499,19 +1503,23 @@ int kvm_cpu_exec(CPUArchState *env) > >>> switch (run->exit_reason) { > >>> case KVM_EXIT_IO: > >>> DPRINTF("handle_io\n"); > >>>+ qemu_mutex_lock_iothread(); > >>> kvm_handle_io(run->io.port, > >>> (uint8_t *)run + run->io.data_offset, > >>> run->io.direction, > >>> run->io.size, > >>> run->io.count); > >>>+ qemu_mutex_unlock_iothread(); > >>> ret = 0; > >>> break; > >>> case KVM_EXIT_MMIO: > >>> DPRINTF("handle_mmio\n"); > >>>+ qemu_mutex_lock_iothread(); > >>> cpu_physical_memory_rw(run->mmio.phys_addr, > >>> run->mmio.data, > >>> run->mmio.len, > >>> run->mmio.is_write); > >>>+ qemu_mutex_unlock_iothread(); > >>> ret = 0; > >>> break; > >>> case KVM_EXIT_IRQ_WINDOW_OPEN: > >>>@@ -1520,7 +1528,9 @@ int kvm_cpu_exec(CPUArchState *env) > >>> break; > >>> case KVM_EXIT_SHUTDOWN: > >>> DPRINTF("shutdown\n"); > >>>+ qemu_mutex_lock_iothread(); > >>> qemu_system_reset_request(); > >>>+ qemu_mutex_unlock_iothread(); > >>> ret = EXCP_INTERRUPT; > >>> break; > >>> case KVM_EXIT_UNKNOWN: > >>>@@ -1533,11 +1543,15 @@ int kvm_cpu_exec(CPUArchState *env) > >>> break; > >>> default: > >>> DPRINTF("kvm_arch_handle_exit\n"); > >>>+ qemu_mutex_lock_iothread(); > >>> ret = kvm_arch_handle_exit(env, run); > >>>+ qemu_mutex_unlock_iothread(); > >>> break; > >>> } > >>> } while (ret == 0); > >>> > >>>+ qemu_mutex_lock_iothread(); > >>>+ > >>> if (ret< 0) { > >>> cpu_dump_state(env, stderr, fprintf, CPU_DUMP_CODE); > >>> vm_stop(RUN_STATE_INTERNAL_ERROR); > >>>diff --git a/target-i386/kvm.c b/target-i386/kvm.c > >>>index 0d0d8f6..0ad64d1 100644 > >>>--- a/target-i386/kvm.c > >>>+++ b/target-i386/kvm.c > >>>@@ -1631,7 +1631,10 @@ void kvm_arch_pre_run(CPUX86State *env, struct kvm_run *run) > >>> > >>> /* Inject NMI */ > >>> if (env->interrupt_request& 
CPU_INTERRUPT_NMI) { > >>>+ qemu_mutex_lock_iothread(); > >>> env->interrupt_request&= ~CPU_INTERRUPT_NMI; > >>>+ qemu_mutex_unlock_iothread(); > >>>+ > >>> DPRINTF("injected NMI\n"); > >>> ret = kvm_vcpu_ioctl(env, KVM_NMI); > >>> if (ret< 0) { > >>>@@ -1641,6 +1644,8 @@ void kvm_arch_pre_run(CPUX86State *env, struct kvm_run *run) > >>> } > >>> > >>> if (!kvm_irqchip_in_kernel()) { > >>>+ qemu_mutex_lock_iothread(); > >>>+ > >>> /* Force the VCPU out of its inner loop to process any INIT requests > >>> * or pending TPR access reports. */ > >>> if (env->interrupt_request& > >>>@@ -1682,6 +1687,8 @@ void kvm_arch_pre_run(CPUX86State *env, struct kvm_run *run) > >>> > >>> DPRINTF("setting tpr\n"); > >>> run->cr8 = cpu_get_apic_tpr(env->apic_state); > >>>+ > >>>+ qemu_mutex_unlock_iothread(); > >>> } > >>> } > >>> > >>>diff --git a/target-ppc/kvm.c b/target-ppc/kvm.c > >>>index c09cc39..60d91a5 100644 > >>>--- a/target-ppc/kvm.c > >>>+++ b/target-ppc/kvm.c > >>>@@ -471,6 +471,8 @@ void kvm_arch_pre_run(CPUPPCState *env, struct kvm_run *run) > >>> int r; > >>> unsigned irq; > >>> > >>>+ qemu_mutex_lock_iothread(); > >>>+ > >>> /* PowerPC QEMU tracks the various core input pins (interrupt, critical > >>> * interrupt, reset, etc) in PPC-specific env->irq_input_state. */ > >>> if (!cap_interrupt_level&& > >>>@@ -497,6 +499,8 @@ void kvm_arch_pre_run(CPUPPCState *env, struct kvm_run *run) > >>> /* We don't know if there are more interrupts pending after this. However, > >>> * the guest will return to userspace in the course of handling this one > >>> * anyways, so we will get a chance to deliver the rest. */ > >>>+ > >>>+ qemu_mutex_unlock_iothread(); > >>> } > >>> > >>> void kvm_arch_post_run(CPUPPCState *env, struct kvm_run *run) > >>> > > > >The following plan would allow progressive convertion to parallel > >operation. > > > >Jan mentioned the MMIO handler->MMIO handler deadlock in a private message. 
> > > >Jan: if there is recursive MMIO accesses, you can detect that and skip > >such MMIO handlers in dev_can_use_lock() ? Or blacklist. > > > >What i mentioned earlier about "unsolved issues" are the possibilities > >in "iothread flow" (belief is that it would be better determined at > >with execution experience / tuning). > > > >OK, so, each document starts with xyz.txt. This is compatible with TCG, > >the critical part is not dropping/acquire locks for decent performance > >(that is, only drop memmap_lock from TCG VCPUS thread if IOTHREAD > >requests so). > > > >This came out from discussions with Avi. > > > >immediate-plan.txt > >------------------ > > > >1. read_lock(memmap_lock) > >2. MemoryRegionSection mrs = lookup(addr) > >3. qom_ref(mrs.mr->dev) > >4. read_unlock(memmap_lock) > > > >5. mutex_lock(dev->lock) > >6. dispatch(&mrs, addr, data, size) > >7. mutex_unlock(dev->lock) > > Just a detail, I don't think we should acquire a device specific > lock in global code. Why? The above is outdated, it should be: 5. lock_device(dev) 6. dispatch() 7. unlock_device(dev) See lock_device below. > Rather, I think we should acquire the global > lock before dispatch unless a MemoryRegion is marked as being > unlocked. See lock_device and dev_can_use_lock below. The per-device lock is used if all services used by the dispatch handler (such as timers, interrupt, network data transmission, etc), are prepared (converted) to skip entering device protected region, when executed from non-vcpu contexts, such as: main_loop: select() mutex_lock(qemu_global_mutex) iohandler() if (mutex_trylock(dev->lock) == SUCCESS) push_data_to(dev) else continue processing of
On Thu, Jun 28, 2012 at 09:10:39AM -0500, Anthony Liguori wrote: > > > >1. read_lock(memmap_lock) > >2. MemoryRegionSection mrs = lookup(addr) > >3. qom_ref(mrs.mr->dev) > >4. read_unlock(memmap_lock) > > > >5. mutex_lock(dev->lock) > >6. dispatch(&mrs, addr, data, size) > >7. mutex_unlock(dev->lock) > > Just a detail, I don't think we should acquire a device specific > lock in global code. Rather, I think we should acquire the global > lock before dispatch unless a MemoryRegion is marked as being > unlocked. "The basic plan is introduce granular locking starting at the KVM dispatch level until we can get to MemoryRegion dispatch. We'll then have some way to indicate that a MemoryRegion's callbacks should be invoked without holding the qemu global mutex." Before that is possible, the callback must not make use of data structures currently protected by qemu_global_mutex, such as timers, interrupts (that is, you would have to split locks for each individual service invoked from inside callbacks, which is a recipe for disaster). With lock_device() below you can have mutex_lock(dev->lock) device specific work mutex_lock(qemu_global_mutex) raise irq, send packet, etc mutex_unlock(qemu_global_mutex) mutex_unlock(dev->lock) and iothread doing the select() pseudocode in the previous email.
On 2012-06-24 16:08, Jan Kiszka wrote: > On 2012-06-24 10:49, Avi Kivity wrote: >> On 06/23/2012 02:45 PM, Jan Kiszka wrote: >>> >>> Hmm, we may need the iothread lock around cpu_set_apic_tpr for >>> !kvm_irqchip_in_kernel(). And as we are at it, apic_base manipulation >>> can be but there as well. >>> >>> With in-kernel irqchip, there is no such need. Also, no one accesses >>> eflags outside of the vcpu thread, independent of the irqchip mode. >> >> In fact !kvm_irqchip_in_kernel() is broken wrt the tpr. Interrupt >> injection needs to be done atomically, but currently we check the tpr >> from the injecting thread, which means the cpu thread can race with it. >> We need to move the check to the vcpu thread so that the guest vcpu is >> halted. > > So apic_set_irq basically needs to be deferred to vcpu context, right? > Will have a look. Tried to wrap my head around this, but only found different issues (patches under construction). First of all, a simple run_on_cpu doesn't work as it may drops the BQL at unexpected points inside device models. Then I thought about what could actually race here: The testing of the userspace TPR value under BQL vs. some modification by the CPU while in KVM mode. So we may either inject while the CPU is trying to prevent this - harmless as it happens on real hw as well - or not inject while the CPU is enabling this. But the latter is quickly resolved because all such TPR changes in userspace APIC mode are trapped and then processed under BQL. At that point we will also reevaluate the pending interrupts and inject what was deferred before (kvm_arch_post_run -> cpu_set_apic_tpr -> apic_set_tpr -> apic_update_irq, or via apic_mem_writel). So where is a race? Jan
On 2012-07-06 19:16, Jan Kiszka wrote: > On 2012-06-24 16:08, Jan Kiszka wrote: >> On 2012-06-24 10:49, Avi Kivity wrote: >>> On 06/23/2012 02:45 PM, Jan Kiszka wrote: >>>> >>>> Hmm, we may need the iothread lock around cpu_set_apic_tpr for >>>> !kvm_irqchip_in_kernel(). And as we are at it, apic_base manipulation >>>> can be but there as well. >>>> >>>> With in-kernel irqchip, there is no such need. Also, no one accesses >>>> eflags outside of the vcpu thread, independent of the irqchip mode. >>> >>> In fact !kvm_irqchip_in_kernel() is broken wrt the tpr. Interrupt >>> injection needs to be done atomically, but currently we check the tpr >>> from the injecting thread, which means the cpu thread can race with it. >>> We need to move the check to the vcpu thread so that the guest vcpu is >>> halted. >> >> So apic_set_irq basically needs to be deferred to vcpu context, right? >> Will have a look. > > Tried to wrap my head around this, but only found different issues > (patches under construction). > > First of all, a simple run_on_cpu doesn't work as it may drops the BQL > at unexpected points inside device models. > > Then I thought about what could actually race here: The testing of the > userspace TPR value under BQL vs. some modification by the CPU while in > KVM mode. So we may either inject while the CPU is trying to prevent > this - harmless as it happens on real hw as well - or not inject while Hmm, this could actually be a problem as the race window is extended beyond the instruction that raises TPR. So we may generate unexpected spurious interrupts for the guest. And that's because the userspace APIC fails to update the CPU interrupt pin properly when the TPR is modified. Here is the bug... Jan > the CPU is enabling this. But the latter is quickly resolved because all > such TPR changes in userspace APIC mode are trapped and then processed > under BQL. 
At that point we will also reevaluate the pending interrupts > and inject what was deferred before (kvm_arch_post_run -> > cpu_set_apic_tpr -> apic_set_tpr -> apic_update_irq, or via > apic_mem_writel). > > So where is a race? > > Jan >
On 07/06/2012 09:06 PM, Jan Kiszka wrote: > On 2012-07-06 19:16, Jan Kiszka wrote: >> On 2012-06-24 16:08, Jan Kiszka wrote: >>> On 2012-06-24 10:49, Avi Kivity wrote: >>>> On 06/23/2012 02:45 PM, Jan Kiszka wrote: >>>>> >>>>> Hmm, we may need the iothread lock around cpu_set_apic_tpr for >>>>> !kvm_irqchip_in_kernel(). And as we are at it, apic_base manipulation >>>>> can be but there as well. >>>>> >>>>> With in-kernel irqchip, there is no such need. Also, no one accesses >>>>> eflags outside of the vcpu thread, independent of the irqchip mode. >>>> >>>> In fact !kvm_irqchip_in_kernel() is broken wrt the tpr. Interrupt >>>> injection needs to be done atomically, but currently we check the tpr >>>> from the injecting thread, which means the cpu thread can race with it. >>>> We need to move the check to the vcpu thread so that the guest vcpu is >>>> halted. >>> >>> So apic_set_irq basically needs to be deferred to vcpu context, right? >>> Will have a look. >> >> Tried to wrap my head around this, but only found different issues >> (patches under construction). >> >> First of all, a simple run_on_cpu doesn't work as it may drops the BQL >> at unexpected points inside device models. >> >> Then I thought about what could actually race here: The testing of the >> userspace TPR value under BQL vs. some modification by the CPU while in >> KVM mode. So we may either inject while the CPU is trying to prevent >> this - harmless as it happens on real hw as well - or not inject while > > Hmm, this could actually be a problem as the race window is extended > beyond the instruction that raises TPR. So we may generate unexpected > spurious interrupts for the guest. And that's because the userspace APIC > fails to update the CPU interrupt pin properly when the TPR is modified. > Here is the bug... Exactly. To avoid dropping the lock we can do something similar to kvm - queue a request to reevaluate IRR, then signal that vcpu to exit. No need to drop the lock. 
Possibly a check in apic_update_irq() whether we're running in the vcpu thread; if not, signal and return. That function's effects are asynchronous if run outside the vcpu, so it's a safe change.
diff --git a/kvm-all.c b/kvm-all.c index f8e4328..9c3e26f 100644 --- a/kvm-all.c +++ b/kvm-all.c @@ -1460,6 +1460,8 @@ int kvm_cpu_exec(CPUArchState *env) return EXCP_HLT; } + qemu_mutex_unlock_iothread(); + do { if (env->kvm_vcpu_dirty) { kvm_arch_put_registers(env, KVM_PUT_RUNTIME_STATE); @@ -1476,14 +1478,16 @@ int kvm_cpu_exec(CPUArchState *env) */ qemu_cpu_kick_self(); } - qemu_mutex_unlock_iothread(); run_ret = kvm_vcpu_ioctl(env, KVM_RUN, 0); - qemu_mutex_lock_iothread(); kvm_arch_post_run(env, run); + /* TODO: push coalesced mmio flushing to the point where we access + * devices that are using it (currently VGA and E1000). */ + qemu_mutex_lock_iothread(); kvm_flush_coalesced_mmio_buffer(); + qemu_mutex_unlock_iothread(); if (run_ret < 0) { if (run_ret == -EINTR || run_ret == -EAGAIN) { @@ -1499,19 +1503,23 @@ int kvm_cpu_exec(CPUArchState *env) switch (run->exit_reason) { case KVM_EXIT_IO: DPRINTF("handle_io\n"); + qemu_mutex_lock_iothread(); kvm_handle_io(run->io.port, (uint8_t *)run + run->io.data_offset, run->io.direction, run->io.size, run->io.count); + qemu_mutex_unlock_iothread(); ret = 0; break; case KVM_EXIT_MMIO: DPRINTF("handle_mmio\n"); + qemu_mutex_lock_iothread(); cpu_physical_memory_rw(run->mmio.phys_addr, run->mmio.data, run->mmio.len, run->mmio.is_write); + qemu_mutex_unlock_iothread(); ret = 0; break; case KVM_EXIT_IRQ_WINDOW_OPEN: @@ -1520,7 +1528,9 @@ int kvm_cpu_exec(CPUArchState *env) break; case KVM_EXIT_SHUTDOWN: DPRINTF("shutdown\n"); + qemu_mutex_lock_iothread(); qemu_system_reset_request(); + qemu_mutex_unlock_iothread(); ret = EXCP_INTERRUPT; break; case KVM_EXIT_UNKNOWN: @@ -1533,11 +1543,15 @@ int kvm_cpu_exec(CPUArchState *env) break; default: DPRINTF("kvm_arch_handle_exit\n"); + qemu_mutex_lock_iothread(); ret = kvm_arch_handle_exit(env, run); + qemu_mutex_unlock_iothread(); break; } } while (ret == 0); + qemu_mutex_lock_iothread(); + if (ret < 0) { cpu_dump_state(env, stderr, fprintf, CPU_DUMP_CODE); 
vm_stop(RUN_STATE_INTERNAL_ERROR); diff --git a/target-i386/kvm.c b/target-i386/kvm.c index 0d0d8f6..0ad64d1 100644 --- a/target-i386/kvm.c +++ b/target-i386/kvm.c @@ -1631,7 +1631,10 @@ void kvm_arch_pre_run(CPUX86State *env, struct kvm_run *run) /* Inject NMI */ if (env->interrupt_request & CPU_INTERRUPT_NMI) { + qemu_mutex_lock_iothread(); env->interrupt_request &= ~CPU_INTERRUPT_NMI; + qemu_mutex_unlock_iothread(); + DPRINTF("injected NMI\n"); ret = kvm_vcpu_ioctl(env, KVM_NMI); if (ret < 0) { @@ -1641,6 +1644,8 @@ void kvm_arch_pre_run(CPUX86State *env, struct kvm_run *run) } if (!kvm_irqchip_in_kernel()) { + qemu_mutex_lock_iothread(); + /* Force the VCPU out of its inner loop to process any INIT requests * or pending TPR access reports. */ if (env->interrupt_request & @@ -1682,6 +1687,8 @@ void kvm_arch_pre_run(CPUX86State *env, struct kvm_run *run) DPRINTF("setting tpr\n"); run->cr8 = cpu_get_apic_tpr(env->apic_state); + + qemu_mutex_unlock_iothread(); } } diff --git a/target-ppc/kvm.c b/target-ppc/kvm.c index c09cc39..60d91a5 100644 --- a/target-ppc/kvm.c +++ b/target-ppc/kvm.c @@ -471,6 +471,8 @@ void kvm_arch_pre_run(CPUPPCState *env, struct kvm_run *run) int r; unsigned irq; + qemu_mutex_lock_iothread(); + /* PowerPC QEMU tracks the various core input pins (interrupt, critical * interrupt, reset, etc) in PPC-specific env->irq_input_state. */ if (!cap_interrupt_level && @@ -497,6 +499,8 @@ void kvm_arch_pre_run(CPUPPCState *env, struct kvm_run *run) /* We don't know if there are more interrupts pending after this. However, * the guest will return to userspace in the course of handling this one * anyways, so we will get a chance to deliver the rest. */ + + qemu_mutex_unlock_iothread(); } void kvm_arch_post_run(CPUPPCState *env, struct kvm_run *run)