Patchwork kvm: First step to push iothread lock out of inner run loop

login
register
mail settings
Submitter Jan Kiszka
Date June 22, 2012, 10:45 p.m.
Message ID <4FE4F56D.1020201@web.de>
Download mbox | patch
Permalink /patch/166680/
State New
Headers show

Comments

Jan Kiszka - June 22, 2012, 10:45 p.m.
This sketches a possible path to get rid of the iothread lock on vmexits
in KVM mode. On x86, the the in-kernel irqchips has to be used because
we otherwise need to synchronize APIC and other per-cpu state accesses
that could be changed concurrently. Not yet fully analyzed is the NMI
injection path in the absence of an APIC.

s390x should be fine without specific locking as their pre/post-run
callbacks are empty. Power requires locking for the pre-run callback.

This patch is untested, but a similar version was successfully used in
a x86 setup with a network I/O path that needed no central iothread
locking anymore (required special MMIO exit handling).
---
 kvm-all.c         |   18 ++++++++++++++++--
 target-i386/kvm.c |    7 +++++++
 target-ppc/kvm.c  |    4 ++++
 3 files changed, 27 insertions(+), 2 deletions(-)
Jan Kiszka - June 22, 2012, 10:55 p.m.
Should have declared this [RFC] in the subject and CC'ed kvm...

On 2012-06-23 00:45, Jan Kiszka wrote:
> This sketches a possible path to get rid of the iothread lock on vmexits
> in KVM mode. On x86, the the in-kernel irqchips has to be used because
> we otherwise need to synchronize APIC and other per-cpu state accesses
> that could be changed concurrently. Not yet fully analyzed is the NMI
> injection path in the absence of an APIC.
> 
> s390x should be fine without specific locking as their pre/post-run
> callbacks are empty. Power requires locking for the pre-run callback.
> 
> This patch is untested, but a similar version was successfully used in
> a x86 setup with a network I/O path that needed no central iothread
> locking anymore (required special MMIO exit handling).
> ---
>  kvm-all.c         |   18 ++++++++++++++++--
>  target-i386/kvm.c |    7 +++++++
>  target-ppc/kvm.c  |    4 ++++
>  3 files changed, 27 insertions(+), 2 deletions(-)
> 
> diff --git a/kvm-all.c b/kvm-all.c
> index f8e4328..9c3e26f 100644
> --- a/kvm-all.c
> +++ b/kvm-all.c
> @@ -1460,6 +1460,8 @@ int kvm_cpu_exec(CPUArchState *env)
>          return EXCP_HLT;
>      }
>  
> +    qemu_mutex_unlock_iothread();
> +
>      do {
>          if (env->kvm_vcpu_dirty) {
>              kvm_arch_put_registers(env, KVM_PUT_RUNTIME_STATE);
> @@ -1476,14 +1478,16 @@ int kvm_cpu_exec(CPUArchState *env)
>               */
>              qemu_cpu_kick_self();
>          }
> -        qemu_mutex_unlock_iothread();
>  
>          run_ret = kvm_vcpu_ioctl(env, KVM_RUN, 0);
>  
> -        qemu_mutex_lock_iothread();
>          kvm_arch_post_run(env, run);
>  
> +        /* TODO: push coalesced mmio flushing to the point where we access
> +         * devices that are using it (currently VGA and E1000). */
> +        qemu_mutex_lock_iothread();
>          kvm_flush_coalesced_mmio_buffer();
> +        qemu_mutex_unlock_iothread();
>  
>          if (run_ret < 0) {
>              if (run_ret == -EINTR || run_ret == -EAGAIN) {
> @@ -1499,19 +1503,23 @@ int kvm_cpu_exec(CPUArchState *env)
>          switch (run->exit_reason) {
>          case KVM_EXIT_IO:
>              DPRINTF("handle_io\n");
> +            qemu_mutex_lock_iothread();
>              kvm_handle_io(run->io.port,
>                            (uint8_t *)run + run->io.data_offset,
>                            run->io.direction,
>                            run->io.size,
>                            run->io.count);
> +            qemu_mutex_unlock_iothread();
>              ret = 0;
>              break;
>          case KVM_EXIT_MMIO:
>              DPRINTF("handle_mmio\n");
> +            qemu_mutex_lock_iothread();
>              cpu_physical_memory_rw(run->mmio.phys_addr,
>                                     run->mmio.data,
>                                     run->mmio.len,
>                                     run->mmio.is_write);
> +            qemu_mutex_unlock_iothread();
>              ret = 0;
>              break;
>          case KVM_EXIT_IRQ_WINDOW_OPEN:
> @@ -1520,7 +1528,9 @@ int kvm_cpu_exec(CPUArchState *env)
>              break;
>          case KVM_EXIT_SHUTDOWN:
>              DPRINTF("shutdown\n");
> +            qemu_mutex_lock_iothread();
>              qemu_system_reset_request();
> +            qemu_mutex_unlock_iothread();
>              ret = EXCP_INTERRUPT;
>              break;
>          case KVM_EXIT_UNKNOWN:
> @@ -1533,11 +1543,15 @@ int kvm_cpu_exec(CPUArchState *env)
>              break;
>          default:
>              DPRINTF("kvm_arch_handle_exit\n");
> +            qemu_mutex_lock_iothread();
>              ret = kvm_arch_handle_exit(env, run);
> +            qemu_mutex_unlock_iothread();
>              break;
>          }
>      } while (ret == 0);
>  
> +    qemu_mutex_lock_iothread();
> +
>      if (ret < 0) {
>          cpu_dump_state(env, stderr, fprintf, CPU_DUMP_CODE);
>          vm_stop(RUN_STATE_INTERNAL_ERROR);
> diff --git a/target-i386/kvm.c b/target-i386/kvm.c
> index 0d0d8f6..0ad64d1 100644
> --- a/target-i386/kvm.c
> +++ b/target-i386/kvm.c
> @@ -1631,7 +1631,10 @@ void kvm_arch_pre_run(CPUX86State *env, struct kvm_run *run)
>  
>      /* Inject NMI */
>      if (env->interrupt_request & CPU_INTERRUPT_NMI) {
> +        qemu_mutex_lock_iothread();
>          env->interrupt_request &= ~CPU_INTERRUPT_NMI;
> +        qemu_mutex_unlock_iothread();
> +
>          DPRINTF("injected NMI\n");
>          ret = kvm_vcpu_ioctl(env, KVM_NMI);
>          if (ret < 0) {
> @@ -1641,6 +1644,8 @@ void kvm_arch_pre_run(CPUX86State *env, struct kvm_run *run)
>      }
>  
>      if (!kvm_irqchip_in_kernel()) {
> +        qemu_mutex_lock_iothread();
> +
>          /* Force the VCPU out of its inner loop to process any INIT requests
>           * or pending TPR access reports. */
>          if (env->interrupt_request &
> @@ -1682,6 +1687,8 @@ void kvm_arch_pre_run(CPUX86State *env, struct kvm_run *run)
>  
>          DPRINTF("setting tpr\n");
>          run->cr8 = cpu_get_apic_tpr(env->apic_state);
> +
> +        qemu_mutex_unlock_iothread();
>      }
>  }
>  
> diff --git a/target-ppc/kvm.c b/target-ppc/kvm.c
> index c09cc39..60d91a5 100644
> --- a/target-ppc/kvm.c
> +++ b/target-ppc/kvm.c
> @@ -471,6 +471,8 @@ void kvm_arch_pre_run(CPUPPCState *env, struct kvm_run *run)
>      int r;
>      unsigned irq;
>  
> +    qemu_mutex_lock_iothread();
> +
>      /* PowerPC QEMU tracks the various core input pins (interrupt, critical
>       * interrupt, reset, etc) in PPC-specific env->irq_input_state. */
>      if (!cap_interrupt_level &&
> @@ -497,6 +499,8 @@ void kvm_arch_pre_run(CPUPPCState *env, struct kvm_run *run)
>      /* We don't know if there are more interrupts pending after this. However,
>       * the guest will return to userspace in the course of handling this one
>       * anyways, so we will get a chance to deliver the rest. */
> +
> +    qemu_mutex_unlock_iothread();
>  }
>  
>  void kvm_arch_post_run(CPUPPCState *env, struct kvm_run *run)
>
Anthony Liguori - June 22, 2012, 10:59 p.m.
On 06/22/2012 05:45 PM, Jan Kiszka wrote:
> This sketches a possible path to get rid of the iothread lock on vmexits
> in KVM mode. On x86, the the in-kernel irqchips has to be used because
> we otherwise need to synchronize APIC and other per-cpu state accesses
> that could be changed concurrently. Not yet fully analyzed is the NMI
> injection path in the absence of an APIC.
>
> s390x should be fine without specific locking as their pre/post-run
> callbacks are empty. Power requires locking for the pre-run callback.
>
> This patch is untested, but a similar version was successfully used in
> a x86 setup with a network I/O path that needed no central iothread
> locking anymore (required special MMIO exit handling).
> ---
>   kvm-all.c         |   18 ++++++++++++++++--
>   target-i386/kvm.c |    7 +++++++
>   target-ppc/kvm.c  |    4 ++++
>   3 files changed, 27 insertions(+), 2 deletions(-)
>
> diff --git a/kvm-all.c b/kvm-all.c
> index f8e4328..9c3e26f 100644
> --- a/kvm-all.c
> +++ b/kvm-all.c
> @@ -1460,6 +1460,8 @@ int kvm_cpu_exec(CPUArchState *env)
>           return EXCP_HLT;
>       }
>
> +    qemu_mutex_unlock_iothread();
> +
>       do {
>           if (env->kvm_vcpu_dirty) {
>               kvm_arch_put_registers(env, KVM_PUT_RUNTIME_STATE);
> @@ -1476,14 +1478,16 @@ int kvm_cpu_exec(CPUArchState *env)
>                */
>               qemu_cpu_kick_self();
>           }
> -        qemu_mutex_unlock_iothread();
>
>           run_ret = kvm_vcpu_ioctl(env, KVM_RUN, 0);
>
> -        qemu_mutex_lock_iothread();
>           kvm_arch_post_run(env, run);
>
> +        /* TODO: push coalesced mmio flushing to the point where we access
> +         * devices that are using it (currently VGA and E1000). */
> +        qemu_mutex_lock_iothread();
>           kvm_flush_coalesced_mmio_buffer();
> +        qemu_mutex_unlock_iothread();
>
>           if (run_ret<  0) {
>               if (run_ret == -EINTR || run_ret == -EAGAIN) {
> @@ -1499,19 +1503,23 @@ int kvm_cpu_exec(CPUArchState *env)
>           switch (run->exit_reason) {
>           case KVM_EXIT_IO:
>               DPRINTF("handle_io\n");
> +            qemu_mutex_lock_iothread();
>               kvm_handle_io(run->io.port,
>                             (uint8_t *)run + run->io.data_offset,
>                             run->io.direction,
>                             run->io.size,
>                             run->io.count);
> +            qemu_mutex_unlock_iothread();
>               ret = 0;
>               break;
>           case KVM_EXIT_MMIO:
>               DPRINTF("handle_mmio\n");
> +            qemu_mutex_lock_iothread();
>               cpu_physical_memory_rw(run->mmio.phys_addr,
>                                      run->mmio.data,
>                                      run->mmio.len,
>                                      run->mmio.is_write);
> +            qemu_mutex_unlock_iothread();
>               ret = 0;
>               break;
>           case KVM_EXIT_IRQ_WINDOW_OPEN:
> @@ -1520,7 +1528,9 @@ int kvm_cpu_exec(CPUArchState *env)
>               break;
>           case KVM_EXIT_SHUTDOWN:
>               DPRINTF("shutdown\n");
> +            qemu_mutex_lock_iothread();
>               qemu_system_reset_request();
> +            qemu_mutex_unlock_iothread();
>               ret = EXCP_INTERRUPT;
>               break;
>           case KVM_EXIT_UNKNOWN:
> @@ -1533,11 +1543,15 @@ int kvm_cpu_exec(CPUArchState *env)
>               break;
>           default:
>               DPRINTF("kvm_arch_handle_exit\n");
> +            qemu_mutex_lock_iothread();
>               ret = kvm_arch_handle_exit(env, run);
> +            qemu_mutex_unlock_iothread();
>               break;
>           }
>       } while (ret == 0);
>
> +    qemu_mutex_lock_iothread();
> +
>       if (ret<  0) {
>           cpu_dump_state(env, stderr, fprintf, CPU_DUMP_CODE);
>           vm_stop(RUN_STATE_INTERNAL_ERROR);
> diff --git a/target-i386/kvm.c b/target-i386/kvm.c
> index 0d0d8f6..0ad64d1 100644
> --- a/target-i386/kvm.c
> +++ b/target-i386/kvm.c
> @@ -1631,7 +1631,10 @@ void kvm_arch_pre_run(CPUX86State *env, struct kvm_run *run)
>
>       /* Inject NMI */
>       if (env->interrupt_request&  CPU_INTERRUPT_NMI) {

Strictly speaking, wouldn't we need to use testbit() and setbit()?  I would 
expect in the very least a barrier would be needed.

Looks pretty nice overall.  I'll need to apply and spend some time carefully 
walking through the code.

Thanks for sharing!

Regards,

Anthony Liguori

> +        qemu_mutex_lock_iothread();
>           env->interrupt_request&= ~CPU_INTERRUPT_NMI;
> +        qemu_mutex_unlock_iothread();
> +
>           DPRINTF("injected NMI\n");
>           ret = kvm_vcpu_ioctl(env, KVM_NMI);
>           if (ret<  0) {
> @@ -1641,6 +1644,8 @@ void kvm_arch_pre_run(CPUX86State *env, struct kvm_run *run)
>       }
>
>       if (!kvm_irqchip_in_kernel()) {
> +        qemu_mutex_lock_iothread();
> +
>           /* Force the VCPU out of its inner loop to process any INIT requests
>            * or pending TPR access reports. */
>           if (env->interrupt_request&
> @@ -1682,6 +1687,8 @@ void kvm_arch_pre_run(CPUX86State *env, struct kvm_run *run)
>
>           DPRINTF("setting tpr\n");
>           run->cr8 = cpu_get_apic_tpr(env->apic_state);
> +
> +        qemu_mutex_unlock_iothread();
>       }
>   }
>
> diff --git a/target-ppc/kvm.c b/target-ppc/kvm.c
> index c09cc39..60d91a5 100644
> --- a/target-ppc/kvm.c
> +++ b/target-ppc/kvm.c
> @@ -471,6 +471,8 @@ void kvm_arch_pre_run(CPUPPCState *env, struct kvm_run *run)
>       int r;
>       unsigned irq;
>
> +    qemu_mutex_lock_iothread();
> +
>       /* PowerPC QEMU tracks the various core input pins (interrupt, critical
>        * interrupt, reset, etc) in PPC-specific env->irq_input_state. */
>       if (!cap_interrupt_level&&
> @@ -497,6 +499,8 @@ void kvm_arch_pre_run(CPUPPCState *env, struct kvm_run *run)
>       /* We don't know if there are more interrupts pending after this. However,
>        * the guest will return to userspace in the course of handling this one
>        * anyways, so we will get a chance to deliver the rest. */
> +
> +    qemu_mutex_unlock_iothread();
>   }
>
>   void kvm_arch_post_run(CPUPPCState *env, struct kvm_run *run)
Marcelo Tosatti - June 23, 2012, 12:22 a.m.
On Sat, Jun 23, 2012 at 12:55:49AM +0200, Jan Kiszka wrote:
> Should have declared this [RFC] in the subject and CC'ed kvm...
> 
> On 2012-06-23 00:45, Jan Kiszka wrote:
> > This sketches a possible path to get rid of the iothread lock on vmexits
> > in KVM mode. On x86, the the in-kernel irqchips has to be used because
> > we otherwise need to synchronize APIC and other per-cpu state accesses
> > that could be changed concurrently. Not yet fully analyzed is the NMI
> > injection path in the absence of an APIC.
> > 
> > s390x should be fine without specific locking as their pre/post-run
> > callbacks are empty. Power requires locking for the pre-run callback.
> > 
> > This patch is untested, but a similar version was successfully used in
> > a x86 setup with a network I/O path that needed no central iothread
> > locking anymore (required special MMIO exit handling).
> > ---
> >  kvm-all.c         |   18 ++++++++++++++++--
> >  target-i386/kvm.c |    7 +++++++
> >  target-ppc/kvm.c  |    4 ++++
> >  3 files changed, 27 insertions(+), 2 deletions(-)
> > 
> > diff --git a/kvm-all.c b/kvm-all.c
> > index f8e4328..9c3e26f 100644
> > --- a/kvm-all.c
> > +++ b/kvm-all.c
> > @@ -1460,6 +1460,8 @@ int kvm_cpu_exec(CPUArchState *env)
> >          return EXCP_HLT;
> >      }
> >  
> > +    qemu_mutex_unlock_iothread();
> > +
> >      do {
> >          if (env->kvm_vcpu_dirty) {
> >              kvm_arch_put_registers(env, KVM_PUT_RUNTIME_STATE);
> > @@ -1476,14 +1478,16 @@ int kvm_cpu_exec(CPUArchState *env)
> >               */
> >              qemu_cpu_kick_self();
> >          }
> > -        qemu_mutex_unlock_iothread();
> >  
> >          run_ret = kvm_vcpu_ioctl(env, KVM_RUN, 0);
> >  
> > -        qemu_mutex_lock_iothread();
> >          kvm_arch_post_run(env, run);

target-i386/kvm.c

void kvm_arch_post_run(CPUX86State *env, struct kvm_run *run)
{       
    if (run->if_flag) {
        env->eflags |= IF_MASK;
    } else {
        env->eflags &= ~IF_MASK;
    }
    cpu_set_apic_tpr(env->apic_state, run->cr8);
    cpu_set_apic_base(env->apic_state, run->apic_base);
}

Clearly there is no structure to any of the writes around the writes
in x86's kvm_arch_post_run, so it is unsafe.

In kvm_flush_coalesced_mmio_buffer, however, the first and last pointers 
can be read safely without the global lock (so you could move the lock
after reading run->exit_reason, in that case).

> > +        /* TODO: push coalesced mmio flushing to the point where we access
> > +         * devices that are using it (currently VGA and E1000). */
> > +        qemu_mutex_lock_iothread();
> >          kvm_flush_coalesced_mmio_buffer();
> > +        qemu_mutex_unlock_iothread();

But you have to flush first to then figure out which device the
coalesced mmio belongs to (don't get that comment).

It would be good to move the global lock acquision to as late
as possible, but acquiring/reacquiring is not very useful 
(can result in a slowdown, actually).
Marcelo Tosatti - June 23, 2012, 9:06 a.m.
On Fri, Jun 22, 2012 at 09:22:59PM -0300, Marcelo Tosatti wrote:
> On Sat, Jun 23, 2012 at 12:55:49AM +0200, Jan Kiszka wrote:
> > Should have declared this [RFC] in the subject and CC'ed kvm...
> > 
> > On 2012-06-23 00:45, Jan Kiszka wrote:
> > > This sketches a possible path to get rid of the iothread lock on vmexits
> > > in KVM mode. On x86, the the in-kernel irqchips has to be used because
> > > we otherwise need to synchronize APIC and other per-cpu state accesses
> > > that could be changed concurrently. Not yet fully analyzed is the NMI
> > > injection path in the absence of an APIC.
> > > 
> > > s390x should be fine without specific locking as their pre/post-run
> > > callbacks are empty. Power requires locking for the pre-run callback.
> > > 
> > > This patch is untested, but a similar version was successfully used in
> > > a x86 setup with a network I/O path that needed no central iothread
> > > locking anymore (required special MMIO exit handling).
> > > ---
> > >  kvm-all.c         |   18 ++++++++++++++++--
> > >  target-i386/kvm.c |    7 +++++++
> > >  target-ppc/kvm.c  |    4 ++++
> > >  3 files changed, 27 insertions(+), 2 deletions(-)
> > > 
> > > diff --git a/kvm-all.c b/kvm-all.c
> > > index f8e4328..9c3e26f 100644
> > > --- a/kvm-all.c
> > > +++ b/kvm-all.c
> > > @@ -1460,6 +1460,8 @@ int kvm_cpu_exec(CPUArchState *env)
> > >          return EXCP_HLT;
> > >      }
> > >  
> > > +    qemu_mutex_unlock_iothread();
> > > +
> > >      do {
> > >          if (env->kvm_vcpu_dirty) {
> > >              kvm_arch_put_registers(env, KVM_PUT_RUNTIME_STATE);
> > > @@ -1476,14 +1478,16 @@ int kvm_cpu_exec(CPUArchState *env)
> > >               */
> > >              qemu_cpu_kick_self();
> > >          }
> > > -        qemu_mutex_unlock_iothread();
> > >  
> > >          run_ret = kvm_vcpu_ioctl(env, KVM_RUN, 0);
> > >  
> > > -        qemu_mutex_lock_iothread();
> > >          kvm_arch_post_run(env, run);
> 
> target-i386/kvm.c
> 
> void kvm_arch_post_run(CPUX86State *env, struct kvm_run *run)
> {       
>     if (run->if_flag) {
>         env->eflags |= IF_MASK;
>     } else {
>         env->eflags &= ~IF_MASK;
>     }
>     cpu_set_apic_tpr(env->apic_state, run->cr8);
>     cpu_set_apic_base(env->apic_state, run->apic_base);
> }
> 
> Clearly there is no structure to any of the writes around the writes
> in x86's kvm_arch_post_run, so it is unsafe.

No access protocol to the CPUState and apic devices (who can write when,
who can read when).
Jan Kiszka - June 23, 2012, 9:11 a.m.
On 2012-06-23 00:59, Anthony Liguori wrote:
> On 06/22/2012 05:45 PM, Jan Kiszka wrote:
>> This sketches a possible path to get rid of the iothread lock on vmexits
>> in KVM mode. On x86, the the in-kernel irqchips has to be used because
>> we otherwise need to synchronize APIC and other per-cpu state accesses
>> that could be changed concurrently. Not yet fully analyzed is the NMI
>> injection path in the absence of an APIC.
>>
>> s390x should be fine without specific locking as their pre/post-run
>> callbacks are empty. Power requires locking for the pre-run callback.
>>
>> This patch is untested, but a similar version was successfully used in
>> a x86 setup with a network I/O path that needed no central iothread
>> locking anymore (required special MMIO exit handling).
>> ---
>>   kvm-all.c         |   18 ++++++++++++++++--
>>   target-i386/kvm.c |    7 +++++++
>>   target-ppc/kvm.c  |    4 ++++
>>   3 files changed, 27 insertions(+), 2 deletions(-)
>>
>> diff --git a/kvm-all.c b/kvm-all.c
>> index f8e4328..9c3e26f 100644
>> --- a/kvm-all.c
>> +++ b/kvm-all.c
>> @@ -1460,6 +1460,8 @@ int kvm_cpu_exec(CPUArchState *env)
>>           return EXCP_HLT;
>>       }
>>
>> +    qemu_mutex_unlock_iothread();
>> +
>>       do {
>>           if (env->kvm_vcpu_dirty) {
>>               kvm_arch_put_registers(env, KVM_PUT_RUNTIME_STATE);
>> @@ -1476,14 +1478,16 @@ int kvm_cpu_exec(CPUArchState *env)
>>                */
>>               qemu_cpu_kick_self();
>>           }
>> -        qemu_mutex_unlock_iothread();
>>
>>           run_ret = kvm_vcpu_ioctl(env, KVM_RUN, 0);
>>
>> -        qemu_mutex_lock_iothread();
>>           kvm_arch_post_run(env, run);
>>
>> +        /* TODO: push coalesced mmio flushing to the point where we
>> access
>> +         * devices that are using it (currently VGA and E1000). */
>> +        qemu_mutex_lock_iothread();
>>           kvm_flush_coalesced_mmio_buffer();
>> +        qemu_mutex_unlock_iothread();
>>
>>           if (run_ret<  0) {
>>               if (run_ret == -EINTR || run_ret == -EAGAIN) {
>> @@ -1499,19 +1503,23 @@ int kvm_cpu_exec(CPUArchState *env)
>>           switch (run->exit_reason) {
>>           case KVM_EXIT_IO:
>>               DPRINTF("handle_io\n");
>> +            qemu_mutex_lock_iothread();
>>               kvm_handle_io(run->io.port,
>>                             (uint8_t *)run + run->io.data_offset,
>>                             run->io.direction,
>>                             run->io.size,
>>                             run->io.count);
>> +            qemu_mutex_unlock_iothread();
>>               ret = 0;
>>               break;
>>           case KVM_EXIT_MMIO:
>>               DPRINTF("handle_mmio\n");
>> +            qemu_mutex_lock_iothread();
>>               cpu_physical_memory_rw(run->mmio.phys_addr,
>>                                      run->mmio.data,
>>                                      run->mmio.len,
>>                                      run->mmio.is_write);
>> +            qemu_mutex_unlock_iothread();
>>               ret = 0;
>>               break;
>>           case KVM_EXIT_IRQ_WINDOW_OPEN:
>> @@ -1520,7 +1528,9 @@ int kvm_cpu_exec(CPUArchState *env)
>>               break;
>>           case KVM_EXIT_SHUTDOWN:
>>               DPRINTF("shutdown\n");
>> +            qemu_mutex_lock_iothread();
>>               qemu_system_reset_request();
>> +            qemu_mutex_unlock_iothread();
>>               ret = EXCP_INTERRUPT;
>>               break;
>>           case KVM_EXIT_UNKNOWN:
>> @@ -1533,11 +1543,15 @@ int kvm_cpu_exec(CPUArchState *env)
>>               break;
>>           default:
>>               DPRINTF("kvm_arch_handle_exit\n");
>> +            qemu_mutex_lock_iothread();
>>               ret = kvm_arch_handle_exit(env, run);
>> +            qemu_mutex_unlock_iothread();
>>               break;
>>           }
>>       } while (ret == 0);
>>
>> +    qemu_mutex_lock_iothread();
>> +
>>       if (ret<  0) {
>>           cpu_dump_state(env, stderr, fprintf, CPU_DUMP_CODE);
>>           vm_stop(RUN_STATE_INTERNAL_ERROR);
>> diff --git a/target-i386/kvm.c b/target-i386/kvm.c
>> index 0d0d8f6..0ad64d1 100644
>> --- a/target-i386/kvm.c
>> +++ b/target-i386/kvm.c
>> @@ -1631,7 +1631,10 @@ void kvm_arch_pre_run(CPUX86State *env, struct
>> kvm_run *run)
>>
>>       /* Inject NMI */
>>       if (env->interrupt_request&  CPU_INTERRUPT_NMI) {
> 
> Strictly speaking, wouldn't we need to use testbit() and setbit()?  I
> would expect in the very least a barrier would be needed.

I need to think about this as well. We ignored it so far, just saw it
when hacking up this patch.

> 
> Looks pretty nice overall.  I'll need to apply and spend some time
> carefully walking through the code.

Without getting the coalesced mmio flushing out of the way, it does not
buy us that much yet. But I have some idea...

Jan
Jan Kiszka - June 23, 2012, 9:22 a.m.
On 2012-06-23 02:22, Marcelo Tosatti wrote:
> On Sat, Jun 23, 2012 at 12:55:49AM +0200, Jan Kiszka wrote:
>> Should have declared this [RFC] in the subject and CC'ed kvm...
>>
>> On 2012-06-23 00:45, Jan Kiszka wrote:
>>> This sketches a possible path to get rid of the iothread lock on vmexits
>>> in KVM mode. On x86, the the in-kernel irqchips has to be used because
>>> we otherwise need to synchronize APIC and other per-cpu state accesses
>>> that could be changed concurrently. Not yet fully analyzed is the NMI
>>> injection path in the absence of an APIC.
>>>
>>> s390x should be fine without specific locking as their pre/post-run
>>> callbacks are empty. Power requires locking for the pre-run callback.
>>>
>>> This patch is untested, but a similar version was successfully used in
>>> a x86 setup with a network I/O path that needed no central iothread
>>> locking anymore (required special MMIO exit handling).
>>> ---
>>>  kvm-all.c         |   18 ++++++++++++++++--
>>>  target-i386/kvm.c |    7 +++++++
>>>  target-ppc/kvm.c  |    4 ++++
>>>  3 files changed, 27 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/kvm-all.c b/kvm-all.c
>>> index f8e4328..9c3e26f 100644
>>> --- a/kvm-all.c
>>> +++ b/kvm-all.c
>>> @@ -1460,6 +1460,8 @@ int kvm_cpu_exec(CPUArchState *env)
>>>          return EXCP_HLT;
>>>      }
>>>  
>>> +    qemu_mutex_unlock_iothread();
>>> +
>>>      do {
>>>          if (env->kvm_vcpu_dirty) {
>>>              kvm_arch_put_registers(env, KVM_PUT_RUNTIME_STATE);
>>> @@ -1476,14 +1478,16 @@ int kvm_cpu_exec(CPUArchState *env)
>>>               */
>>>              qemu_cpu_kick_self();
>>>          }
>>> -        qemu_mutex_unlock_iothread();
>>>  
>>>          run_ret = kvm_vcpu_ioctl(env, KVM_RUN, 0);
>>>  
>>> -        qemu_mutex_lock_iothread();
>>>          kvm_arch_post_run(env, run);
> 
> target-i386/kvm.c
> 
> void kvm_arch_post_run(CPUX86State *env, struct kvm_run *run)
> {       
>     if (run->if_flag) {
>         env->eflags |= IF_MASK;
>     } else {
>         env->eflags &= ~IF_MASK;
>     }
>     cpu_set_apic_tpr(env->apic_state, run->cr8);
>     cpu_set_apic_base(env->apic_state, run->apic_base);
> }
> 
> Clearly there is no structure to any of the writes around the writes
> in x86's kvm_arch_post_run, so it is unsafe.

Can't parse this yet.

None of the fields touched above should be modified outside of the vcpu
thread context (as long as that thread is inside the inner loop).
Therefore, it should be safe to run that functions without any lock. Am
I missing something?

> 
> In kvm_flush_coalesced_mmio_buffer, however, the first and last pointers 
> can be read safely without the global lock (so you could move the lock
> after reading run->exit_reason, in that case).
> 
>>> +        /* TODO: push coalesced mmio flushing to the point where we access
>>> +         * devices that are using it (currently VGA and E1000). */
>>> +        qemu_mutex_lock_iothread();
>>>          kvm_flush_coalesced_mmio_buffer();
>>> +        qemu_mutex_unlock_iothread();
> 
> But you have to flush first to then figure out which device the
> coalesced mmio belongs to (don't get that comment).

kvm_flush must not be called unconditionally on vmexit, that is my
point. I'm playing with the idea to tag memory regions that require
flushing (as they are coalescing themselves or logically depend on
coalesced regions). Then we would flush in the memory layer once a read
or write is about to be performed on such a region.

BTW, two more users arrived in the meantime: the G364 framebuffer and
the i82378 PCI-ISA bridge (not sure yet what requests that bridge
coalesces, if it's only VGA, but it looks a bit fishy).

Jan
Jan Kiszka - June 23, 2012, 11:45 a.m.
On 2012-06-23 11:06, Marcelo Tosatti wrote:
> On Fri, Jun 22, 2012 at 09:22:59PM -0300, Marcelo Tosatti wrote:
>> On Sat, Jun 23, 2012 at 12:55:49AM +0200, Jan Kiszka wrote:
>>> Should have declared this [RFC] in the subject and CC'ed kvm...
>>>
>>> On 2012-06-23 00:45, Jan Kiszka wrote:
>>>> This sketches a possible path to get rid of the iothread lock on vmexits
>>>> in KVM mode. On x86, the the in-kernel irqchips has to be used because
>>>> we otherwise need to synchronize APIC and other per-cpu state accesses
>>>> that could be changed concurrently. Not yet fully analyzed is the NMI
>>>> injection path in the absence of an APIC.
>>>>
>>>> s390x should be fine without specific locking as their pre/post-run
>>>> callbacks are empty. Power requires locking for the pre-run callback.
>>>>
>>>> This patch is untested, but a similar version was successfully used in
>>>> a x86 setup with a network I/O path that needed no central iothread
>>>> locking anymore (required special MMIO exit handling).
>>>> ---
>>>>  kvm-all.c         |   18 ++++++++++++++++--
>>>>  target-i386/kvm.c |    7 +++++++
>>>>  target-ppc/kvm.c  |    4 ++++
>>>>  3 files changed, 27 insertions(+), 2 deletions(-)
>>>>
>>>> diff --git a/kvm-all.c b/kvm-all.c
>>>> index f8e4328..9c3e26f 100644
>>>> --- a/kvm-all.c
>>>> +++ b/kvm-all.c
>>>> @@ -1460,6 +1460,8 @@ int kvm_cpu_exec(CPUArchState *env)
>>>>          return EXCP_HLT;
>>>>      }
>>>>  
>>>> +    qemu_mutex_unlock_iothread();
>>>> +
>>>>      do {
>>>>          if (env->kvm_vcpu_dirty) {
>>>>              kvm_arch_put_registers(env, KVM_PUT_RUNTIME_STATE);
>>>> @@ -1476,14 +1478,16 @@ int kvm_cpu_exec(CPUArchState *env)
>>>>               */
>>>>              qemu_cpu_kick_self();
>>>>          }
>>>> -        qemu_mutex_unlock_iothread();
>>>>  
>>>>          run_ret = kvm_vcpu_ioctl(env, KVM_RUN, 0);
>>>>  
>>>> -        qemu_mutex_lock_iothread();
>>>>          kvm_arch_post_run(env, run);
>>
>> target-i386/kvm.c
>>
>> void kvm_arch_post_run(CPUX86State *env, struct kvm_run *run)
>> {       
>>     if (run->if_flag) {
>>         env->eflags |= IF_MASK;
>>     } else {
>>         env->eflags &= ~IF_MASK;
>>     }
>>     cpu_set_apic_tpr(env->apic_state, run->cr8);
>>     cpu_set_apic_base(env->apic_state, run->apic_base);
>> }
>>
>> Clearly there is no structure to any of the writes around the writes
>> in x86's kvm_arch_post_run, so it is unsafe.
> 
> No access protocol to the CPUState and apic devices (who can write when,
> who can read when).
> 

Hmm, we may need the iothread lock around cpu_set_apic_tpr for
!kvm_irqchip_in_kernel(). And as we are at it, apic_base manipulation
can be but there as well.

With in-kernel irqchip, there is no such need. Also, no one accesses
eflags outside of the vcpu thread, independent of the irqchip mode.

Jan
Avi Kivity - June 24, 2012, 8:49 a.m.
On 06/23/2012 02:45 PM, Jan Kiszka wrote:
> 
> Hmm, we may need the iothread lock around cpu_set_apic_tpr for
> !kvm_irqchip_in_kernel(). And as we are at it, apic_base manipulation
> can be but there as well.
> 
> With in-kernel irqchip, there is no such need. Also, no one accesses
> eflags outside of the vcpu thread, independent of the irqchip mode.

In fact !kvm_irqchip_in_kernel() is broken wrt the tpr.  Interrupt
injection needs to be done atomically, but currently we check the tpr
from the injecting thread, which means the cpu thread can race with it.
 We need to move the check to the vcpu thread so that the guest vcpu is
halted.
pingfan liu - June 24, 2012, 1:34 p.m.
On Sat, Jun 23, 2012 at 7:45 PM, Jan Kiszka <jan.kiszka@web.de> wrote:
> On 2012-06-23 11:06, Marcelo Tosatti wrote:
>> On Fri, Jun 22, 2012 at 09:22:59PM -0300, Marcelo Tosatti wrote:
>>> On Sat, Jun 23, 2012 at 12:55:49AM +0200, Jan Kiszka wrote:
>>>> Should have declared this [RFC] in the subject and CC'ed kvm...
>>>>
>>>> On 2012-06-23 00:45, Jan Kiszka wrote:
>>>>> This sketches a possible path to get rid of the iothread lock on vmexits
>>>>> in KVM mode. On x86, the the in-kernel irqchips has to be used because
>>>>> we otherwise need to synchronize APIC and other per-cpu state accesses
>>>>> that could be changed concurrently. Not yet fully analyzed is the NMI
>>>>> injection path in the absence of an APIC.
>>>>>
>>>>> s390x should be fine without specific locking as their pre/post-run
>>>>> callbacks are empty. Power requires locking for the pre-run callback.
>>>>>
>>>>> This patch is untested, but a similar version was successfully used in
>>>>> a x86 setup with a network I/O path that needed no central iothread
>>>>> locking anymore (required special MMIO exit handling).
>>>>> ---
>>>>>  kvm-all.c         |   18 ++++++++++++++++--
>>>>>  target-i386/kvm.c |    7 +++++++
>>>>>  target-ppc/kvm.c  |    4 ++++
>>>>>  3 files changed, 27 insertions(+), 2 deletions(-)
>>>>>
>>>>> diff --git a/kvm-all.c b/kvm-all.c
>>>>> index f8e4328..9c3e26f 100644
>>>>> --- a/kvm-all.c
>>>>> +++ b/kvm-all.c
>>>>> @@ -1460,6 +1460,8 @@ int kvm_cpu_exec(CPUArchState *env)
>>>>>          return EXCP_HLT;
>>>>>      }
>>>>>
>>>>> +    qemu_mutex_unlock_iothread();
>>>>> +
>>>>>      do {
>>>>>          if (env->kvm_vcpu_dirty) {
>>>>>              kvm_arch_put_registers(env, KVM_PUT_RUNTIME_STATE);
>>>>> @@ -1476,14 +1478,16 @@ int kvm_cpu_exec(CPUArchState *env)
>>>>>               */
>>>>>              qemu_cpu_kick_self();
>>>>>          }
>>>>> -        qemu_mutex_unlock_iothread();
>>>>>
>>>>>          run_ret = kvm_vcpu_ioctl(env, KVM_RUN, 0);
>>>>>
>>>>> -        qemu_mutex_lock_iothread();
>>>>>          kvm_arch_post_run(env, run);
>>>
>>> target-i386/kvm.c
>>>
>>> void kvm_arch_post_run(CPUX86State *env, struct kvm_run *run)
>>> {
>>>     if (run->if_flag) {
>>>         env->eflags |= IF_MASK;
>>>     } else {
>>>         env->eflags &= ~IF_MASK;
>>>     }
>>>     cpu_set_apic_tpr(env->apic_state, run->cr8);
>>>     cpu_set_apic_base(env->apic_state, run->apic_base);
>>> }
>>>
>>> Clearly there is no structure to any of the writes around the writes
>>> in x86's kvm_arch_post_run, so it is unsafe.
>>
>> No access protocol to the CPUState and apic devices (who can write when,
>> who can read when).
>>
>
> Hmm, we may need the iothread lock around cpu_set_apic_tpr for
> !kvm_irqchip_in_kernel(). And as we are at it, apic_base manipulation
> can be but there as well.
>
As to !kvm_irqchip_in_kernel(),I think there is quite a few fields in
CPUState (interrupt_request,eflags,halted,exit_request,exception_injected),
which we must care about. They  are heavily depended on
env->apic_state, so we consider them as a atomic context, that is say
during the updating of env's these context, we do not allow apic_state
changed.
And that is why I introduce cpu_lock in the patch "[PATCH 2/2] kvm:
use per-cpu lock to free vcpu thread out of the big lock"

Regards,
pingfan


> With in-kernel irqchip, there is no such need. Also, no one accesses
> eflags outside of the vcpu thread, independent of the irqchip mode.
>
> Jan
>
Jan Kiszka - June 24, 2012, 2:08 p.m.
On 2012-06-24 10:49, Avi Kivity wrote:
> On 06/23/2012 02:45 PM, Jan Kiszka wrote:
>>
>> Hmm, we may need the iothread lock around cpu_set_apic_tpr for
>> !kvm_irqchip_in_kernel(). And as we are at it, apic_base manipulation
>> can be but there as well.
>>
>> With in-kernel irqchip, there is no such need. Also, no one accesses
>> eflags outside of the vcpu thread, independent of the irqchip mode.
> 
> In fact !kvm_irqchip_in_kernel() is broken wrt the tpr.  Interrupt
> injection needs to be done atomically, but currently we check the tpr
> from the injecting thread, which means the cpu thread can race with it.
>  We need to move the check to the vcpu thread so that the guest vcpu is
> halted.

So apic_set_irq basically needs to be deferred to vcpu context, right?
Will have a look.

Jan
Jan Kiszka - June 24, 2012, 2:08 p.m.
On 2012-06-24 15:34, liu ping fan wrote:
> On Sat, Jun 23, 2012 at 7:45 PM, Jan Kiszka <jan.kiszka@web.de> wrote:
>> On 2012-06-23 11:06, Marcelo Tosatti wrote:
>>> On Fri, Jun 22, 2012 at 09:22:59PM -0300, Marcelo Tosatti wrote:
>>>> On Sat, Jun 23, 2012 at 12:55:49AM +0200, Jan Kiszka wrote:
>>>>> Should have declared this [RFC] in the subject and CC'ed kvm...
>>>>>
>>>>> On 2012-06-23 00:45, Jan Kiszka wrote:
>>>>>> This sketches a possible path to get rid of the iothread lock on vmexits
>>>>>> in KVM mode. On x86, the the in-kernel irqchips has to be used because
>>>>>> we otherwise need to synchronize APIC and other per-cpu state accesses
>>>>>> that could be changed concurrently. Not yet fully analyzed is the NMI
>>>>>> injection path in the absence of an APIC.
>>>>>>
>>>>>> s390x should be fine without specific locking as their pre/post-run
>>>>>> callbacks are empty. Power requires locking for the pre-run callback.
>>>>>>
>>>>>> This patch is untested, but a similar version was successfully used in
>>>>>> a x86 setup with a network I/O path that needed no central iothread
>>>>>> locking anymore (required special MMIO exit handling).
>>>>>> ---
>>>>>>  kvm-all.c         |   18 ++++++++++++++++--
>>>>>>  target-i386/kvm.c |    7 +++++++
>>>>>>  target-ppc/kvm.c  |    4 ++++
>>>>>>  3 files changed, 27 insertions(+), 2 deletions(-)
>>>>>>
>>>>>> diff --git a/kvm-all.c b/kvm-all.c
>>>>>> index f8e4328..9c3e26f 100644
>>>>>> --- a/kvm-all.c
>>>>>> +++ b/kvm-all.c
>>>>>> @@ -1460,6 +1460,8 @@ int kvm_cpu_exec(CPUArchState *env)
>>>>>>          return EXCP_HLT;
>>>>>>      }
>>>>>>
>>>>>> +    qemu_mutex_unlock_iothread();
>>>>>> +
>>>>>>      do {
>>>>>>          if (env->kvm_vcpu_dirty) {
>>>>>>              kvm_arch_put_registers(env, KVM_PUT_RUNTIME_STATE);
>>>>>> @@ -1476,14 +1478,16 @@ int kvm_cpu_exec(CPUArchState *env)
>>>>>>               */
>>>>>>              qemu_cpu_kick_self();
>>>>>>          }
>>>>>> -        qemu_mutex_unlock_iothread();
>>>>>>
>>>>>>          run_ret = kvm_vcpu_ioctl(env, KVM_RUN, 0);
>>>>>>
>>>>>> -        qemu_mutex_lock_iothread();
>>>>>>          kvm_arch_post_run(env, run);
>>>>
>>>> target-i386/kvm.c
>>>>
>>>> void kvm_arch_post_run(CPUX86State *env, struct kvm_run *run)
>>>> {
>>>>     if (run->if_flag) {
>>>>         env->eflags |= IF_MASK;
>>>>     } else {
>>>>         env->eflags &= ~IF_MASK;
>>>>     }
>>>>     cpu_set_apic_tpr(env->apic_state, run->cr8);
>>>>     cpu_set_apic_base(env->apic_state, run->apic_base);
>>>> }
>>>>
>>>> Clearly there is no structure to any of the writes around the writes
>>>> in x86's kvm_arch_post_run, so it is unsafe.
>>>
>>> No access protocol to the CPUState and apic devices (who can write when,
>>> who can read when).
>>>
>>
>> Hmm, we may need the iothread lock around cpu_set_apic_tpr for
>> !kvm_irqchip_in_kernel(). And as we are at it, apic_base manipulation
>> can be but there as well.
>>
> As to !kvm_irqchip_in_kernel(),I think there is quite a few fields in
> CPUState (interrupt_request,eflags,halted,exit_request,exception_injected),
> which we must care about. They  are heavily depended on
> env->apic_state, so we consider them as a atomic context, that is say
> during the updating of env's these context, we do not allow apic_state
> changed.
> And that is why I introduce cpu_lock in the patch "[PATCH 2/2] kvm:
> use per-cpu lock to free vcpu thread out of the big lock"

This per-cpu lock is optimizing the slow path (!kvm_irqchip_in_kernel)
at the expense of the fast path. It's better to optimize for in-kernel
irqchip first. One day, when the key problems are solved, we may look at
userspace irqchip code path as well. Until then, BQL protection for
those paths is perfectly fine.

As a first step, I will post a series later that gets rid of
kvm_flush_coalesced_mmio_buffer in the common vmexit path.

Jan
Avi Kivity - June 24, 2012, 2:31 p.m.
On 06/24/2012 05:08 PM, Jan Kiszka wrote:
> On 2012-06-24 10:49, Avi Kivity wrote:
>> On 06/23/2012 02:45 PM, Jan Kiszka wrote:
>>>
>>> Hmm, we may need the iothread lock around cpu_set_apic_tpr for
>>> !kvm_irqchip_in_kernel(). And as we are at it, apic_base manipulation
>>> can be but there as well.
>>>
>>> With in-kernel irqchip, there is no such need. Also, no one accesses
>>> eflags outside of the vcpu thread, independent of the irqchip mode.
>> 
>> In fact !kvm_irqchip_in_kernel() is broken wrt the tpr.  Interrupt
>> injection needs to be done atomically, but currently we check the tpr
>> from the injecting thread, which means the cpu thread can race with it.
>>  We need to move the check to the vcpu thread so that the guest vcpu is
>> halted.
> 
> So apic_set_irq basically needs to be deferred to vcpu context, right?
> Will have a look.

Correct. IIRC, the kernel's 0a5fff192388d2 made the problem much worse,
but did not create it.  It was either Vista or XP-64 which triggered the
problem reliably.  Copying Gleb in case he remembers more.
Avi Kivity - June 24, 2012, 2:35 p.m.
On 06/24/2012 05:08 PM, Jan Kiszka wrote:
> As a first step, I will post a series later that gets rid of
> kvm_flush_coalesced_mmio_buffer in the common vmexit path.

If you defer this, I can think of two places that need to flush:
- anything that accesses those memory areas (such as DMA to the
framebuffer, or updating the display)
- anything that modifies the memory map and possibly changes flushed
addresses
Jan Kiszka - June 24, 2012, 2:40 p.m.
On 2012-06-24 16:35, Avi Kivity wrote:
> On 06/24/2012 05:08 PM, Jan Kiszka wrote:
>> As a first step, I will post a series later that gets rid of
>> kvm_flush_coalesced_mmio_buffer in the common vmexit path.
> 
> If you defer this, I can think of two places that need to flush:
> - anything that accesses those memory areas (such as DMA to the
> framebuffer, or updating the display)

- anything that accesses related areas (in case of VGA: PIO accesses to
the control ports). I'm providing memory_region_set_flush_coalesced that
allows to flush on non-coalesced region accesses as well. Some PIO
accesses unfortunately still need open-coded
qemu_flush_coalesced_mmio_buffer as they do not use memory regions yet.

> - anything that modifies the memory map and possibly changes flushed
> addresses

Good point, need to address this.

Jan
Avi Kivity - June 24, 2012, 2:46 p.m.
On 06/24/2012 05:40 PM, Jan Kiszka wrote:
> On 2012-06-24 16:35, Avi Kivity wrote:
>> On 06/24/2012 05:08 PM, Jan Kiszka wrote:
>>> As a first step, I will post a series later that gets rid of
>>> kvm_flush_coalesced_mmio_buffer in the common vmexit path.
>> 
>> If you defer this, I can think of two places that need to flush:
>> - anything that accesses those memory areas (such as DMA to the
>> framebuffer, or updating the display)
> 
> - anything that accesses related areas (in case of VGA: PIO accesses to
> the control ports). I'm providing memory_region_set_flush_coalesced that
> allows to flush on non-coalesced region accesses as well. Some PIO
> accesses unfortunately still need open-coded
> qemu_flush_coalesced_mmio_buffer as they do not use memory regions yet.

Framebuffer access will bypass the MemoryRegionOps callbacks, did you
intend to hook those?

I'm not sure the problem is general enough to merit a check in our
generic mmio dispatch code (granted, now it has a check in the vcpu exit
path which is much worse).
Jan Kiszka - June 24, 2012, 2:51 p.m.
On 2012-06-24 16:46, Avi Kivity wrote:
> On 06/24/2012 05:40 PM, Jan Kiszka wrote:
>> On 2012-06-24 16:35, Avi Kivity wrote:
>>> On 06/24/2012 05:08 PM, Jan Kiszka wrote:
>>>> As a first step, I will post a series later that gets rid of
>>>> kvm_flush_coalesced_mmio_buffer in the common vmexit path.
>>>
>>> If you defer this, I can think of two places that need to flush:
>>> - anything that accesses those memory areas (such as DMA to the
>>> framebuffer, or updating the display)
>>
>> - anything that accesses related areas (in case of VGA: PIO accesses to
>> the control ports). I'm providing memory_region_set_flush_coalesced that
>> allows to flush on non-coalesced region accesses as well. Some PIO
>> accesses unfortunately still need open-coded
>> qemu_flush_coalesced_mmio_buffer as they do not use memory regions yet.
> 
> Framebuffer access will bypass the MemoryRegionOps callbacks, did you
> intend to hook those?

Are there really cases where the framebuffer is accessible both via MMIO
and RAM-like mappings at the same time? If so, the current flushing on
vmexit would help either as the direct mappings would not trigger exits.
Or what do you mean?

> 
> I'm not sure the problem is general enough to merit a check in our
> generic mmio dispatch code (granted, now it has a check in the vcpu exit
> path which is much worse).

The current situation is indeed much worse.

Jan
Avi Kivity - June 24, 2012, 2:56 p.m.
On 06/24/2012 05:51 PM, Jan Kiszka wrote:
> On 2012-06-24 16:46, Avi Kivity wrote:
>> On 06/24/2012 05:40 PM, Jan Kiszka wrote:
>>> On 2012-06-24 16:35, Avi Kivity wrote:
>>>> On 06/24/2012 05:08 PM, Jan Kiszka wrote:
>>>>> As a first step, I will post a series later that gets rid of
>>>>> kvm_flush_coalesced_mmio_buffer in the common vmexit path.
>>>>
>>>> If you defer this, I can think of two places that need to flush:
>>>> - anything that accesses those memory areas (such as DMA to the
>>>> framebuffer, or updating the display)
>>>
>>> - anything that accesses related areas (in case of VGA: PIO accesses to
>>> the control ports). I'm providing memory_region_set_flush_coalesced that
>>> allows to flush on non-coalesced region accesses as well. Some PIO
>>> accesses unfortunately still need open-coded
>>> qemu_flush_coalesced_mmio_buffer as they do not use memory regions yet.
>> 
>> Framebuffer access will bypass the MemoryRegionOps callbacks, did you
>> intend to hook those?
> 
> Are there really cases where the framebuffer is accessible both via MMIO
> and RAM-like mappings at the same time? If so, the current flushing on
> vmexit would help either as the direct mappings would not trigger exits.
> Or what do you mean?

I meant accesses by the display code to put the framebuffer in front of
the user's eyes.  Those just access the memory directly.  We could wrap
them with memory_region_get/put_ram_ptr() (as Xen requires anyway) and
have those issue the flush.
Jan Kiszka - June 24, 2012, 2:58 p.m.
On 2012-06-24 16:56, Avi Kivity wrote:
> On 06/24/2012 05:51 PM, Jan Kiszka wrote:
>> On 2012-06-24 16:46, Avi Kivity wrote:
>>> On 06/24/2012 05:40 PM, Jan Kiszka wrote:
>>>> On 2012-06-24 16:35, Avi Kivity wrote:
>>>>> On 06/24/2012 05:08 PM, Jan Kiszka wrote:
>>>>>> As a first step, I will post a series later that gets rid of
>>>>>> kvm_flush_coalesced_mmio_buffer in the common vmexit path.
>>>>>
>>>>> If you defer this, I can think of two places that need to flush:
>>>>> - anything that accesses those memory areas (such as DMA to the
>>>>> framebuffer, or updating the display)
>>>>
>>>> - anything that accesses related areas (in case of VGA: PIO accesses to
>>>> the control ports). I'm providing memory_region_set_flush_coalesced that
>>>> allows to flush on non-coalesced region accesses as well. Some PIO
>>>> accesses unfortunately still need open-coded
>>>> qemu_flush_coalesced_mmio_buffer as they do not use memory regions yet.
>>>
>>> Framebuffer access will bypass the MemoryRegionOps callbacks, did you
>>> intend to hook those?
>>
>> Are there really cases where the framebuffer is accessible both via MMIO
>> and RAM-like mappings at the same time? If so, the current flushing on
>> vmexit would help either as the direct mappings would not trigger exits.
>> Or what do you mean?
> 
> I meant accesses by the display code to put the framebuffer in front of
> the user's eyes.  Those just access the memory directly.  We could wrap
> them with memory_region_get/put_ram_ptr() (as Xen requires anyway) and
> have those issue the flush.

That's a non issue, we already have to flush on display updates. See
e.g. vga_update_display.

Jan
Avi Kivity - June 24, 2012, 2:59 p.m.
On 06/24/2012 05:58 PM, Jan Kiszka wrote:

>>> Are there really cases where the framebuffer is accessible both via MMIO
>>> and RAM-like mappings at the same time? If so, the current flushing on
>>> vmexit would help either as the direct mappings would not trigger exits.
>>> Or what do you mean?
>> 
>> I meant accesses by the display code to put the framebuffer in front of
>> the user's eyes.  Those just access the memory directly.  We could wrap
>> them with memory_region_get/put_ram_ptr() (as Xen requires anyway) and
>> have those issue the flush.
> 
> That's a non issue, we already have to flush on display updates. See
> e.g. vga_update_display.
> 

Ah, of course.
Marcelo Tosatti - June 26, 2012, 7:34 p.m.
On Sat, Jun 23, 2012 at 12:55:49AM +0200, Jan Kiszka wrote:
> Should have declared this [RFC] in the subject and CC'ed kvm...
> 
> On 2012-06-23 00:45, Jan Kiszka wrote:
> > This sketches a possible path to get rid of the iothread lock on vmexits
> > in KVM mode. On x86, the the in-kernel irqchips has to be used because
> > we otherwise need to synchronize APIC and other per-cpu state accesses
> > that could be changed concurrently. Not yet fully analyzed is the NMI
> > injection path in the absence of an APIC.
> > 
> > s390x should be fine without specific locking as their pre/post-run
> > callbacks are empty. Power requires locking for the pre-run callback.
> > 
> > This patch is untested, but a similar version was successfully used in
> > a x86 setup with a network I/O path that needed no central iothread
> > locking anymore (required special MMIO exit handling).
> > ---
> >  kvm-all.c         |   18 ++++++++++++++++--
> >  target-i386/kvm.c |    7 +++++++
> >  target-ppc/kvm.c  |    4 ++++
> >  3 files changed, 27 insertions(+), 2 deletions(-)
> > 
> > diff --git a/kvm-all.c b/kvm-all.c
> > index f8e4328..9c3e26f 100644
> > --- a/kvm-all.c
> > +++ b/kvm-all.c
> > @@ -1460,6 +1460,8 @@ int kvm_cpu_exec(CPUArchState *env)
> >          return EXCP_HLT;
> >      }
> >  
> > +    qemu_mutex_unlock_iothread();
> > +
> >      do {
> >          if (env->kvm_vcpu_dirty) {
> >              kvm_arch_put_registers(env, KVM_PUT_RUNTIME_STATE);
> > @@ -1476,14 +1478,16 @@ int kvm_cpu_exec(CPUArchState *env)
> >               */
> >              qemu_cpu_kick_self();
> >          }
> > -        qemu_mutex_unlock_iothread();
> >  
> >          run_ret = kvm_vcpu_ioctl(env, KVM_RUN, 0);
> >  
> > -        qemu_mutex_lock_iothread();
> >          kvm_arch_post_run(env, run);
> >  
> > +        /* TODO: push coalesced mmio flushing to the point where we access
> > +         * devices that are using it (currently VGA and E1000). */
> > +        qemu_mutex_lock_iothread();
> >          kvm_flush_coalesced_mmio_buffer();
> > +        qemu_mutex_unlock_iothread();
> >  
> >          if (run_ret < 0) {
> >              if (run_ret == -EINTR || run_ret == -EAGAIN) {
> > @@ -1499,19 +1503,23 @@ int kvm_cpu_exec(CPUArchState *env)
> >          switch (run->exit_reason) {
> >          case KVM_EXIT_IO:
> >              DPRINTF("handle_io\n");
> > +            qemu_mutex_lock_iothread();
> >              kvm_handle_io(run->io.port,
> >                            (uint8_t *)run + run->io.data_offset,
> >                            run->io.direction,
> >                            run->io.size,
> >                            run->io.count);
> > +            qemu_mutex_unlock_iothread();
> >              ret = 0;
> >              break;
> >          case KVM_EXIT_MMIO:
> >              DPRINTF("handle_mmio\n");
> > +            qemu_mutex_lock_iothread();
> >              cpu_physical_memory_rw(run->mmio.phys_addr,
> >                                     run->mmio.data,
> >                                     run->mmio.len,
> >                                     run->mmio.is_write);
> > +            qemu_mutex_unlock_iothread();
> >              ret = 0;
> >              break;
> >          case KVM_EXIT_IRQ_WINDOW_OPEN:
> > @@ -1520,7 +1528,9 @@ int kvm_cpu_exec(CPUArchState *env)
> >              break;
> >          case KVM_EXIT_SHUTDOWN:
> >              DPRINTF("shutdown\n");
> > +            qemu_mutex_lock_iothread();
> >              qemu_system_reset_request();
> > +            qemu_mutex_unlock_iothread();
> >              ret = EXCP_INTERRUPT;
> >              break;
> >          case KVM_EXIT_UNKNOWN:
> > @@ -1533,11 +1543,15 @@ int kvm_cpu_exec(CPUArchState *env)
> >              break;
> >          default:
> >              DPRINTF("kvm_arch_handle_exit\n");
> > +            qemu_mutex_lock_iothread();
> >              ret = kvm_arch_handle_exit(env, run);
> > +            qemu_mutex_unlock_iothread();
> >              break;
> >          }
> >      } while (ret == 0);
> >  
> > +    qemu_mutex_lock_iothread();
> > +
> >      if (ret < 0) {
> >          cpu_dump_state(env, stderr, fprintf, CPU_DUMP_CODE);
> >          vm_stop(RUN_STATE_INTERNAL_ERROR);
> > diff --git a/target-i386/kvm.c b/target-i386/kvm.c
> > index 0d0d8f6..0ad64d1 100644
> > --- a/target-i386/kvm.c
> > +++ b/target-i386/kvm.c
> > @@ -1631,7 +1631,10 @@ void kvm_arch_pre_run(CPUX86State *env, struct kvm_run *run)
> >  
> >      /* Inject NMI */
> >      if (env->interrupt_request & CPU_INTERRUPT_NMI) {
> > +        qemu_mutex_lock_iothread();
> >          env->interrupt_request &= ~CPU_INTERRUPT_NMI;
> > +        qemu_mutex_unlock_iothread();
> > +
> >          DPRINTF("injected NMI\n");
> >          ret = kvm_vcpu_ioctl(env, KVM_NMI);
> >          if (ret < 0) {
> > @@ -1641,6 +1644,8 @@ void kvm_arch_pre_run(CPUX86State *env, struct kvm_run *run)
> >      }
> >  
> >      if (!kvm_irqchip_in_kernel()) {
> > +        qemu_mutex_lock_iothread();
> > +
> >          /* Force the VCPU out of its inner loop to process any INIT requests
> >           * or pending TPR access reports. */
> >          if (env->interrupt_request &
> > @@ -1682,6 +1687,8 @@ void kvm_arch_pre_run(CPUX86State *env, struct kvm_run *run)
> >  
> >          DPRINTF("setting tpr\n");
> >          run->cr8 = cpu_get_apic_tpr(env->apic_state);
> > +
> > +        qemu_mutex_unlock_iothread();
> >      }
> >  }
> >  
> > diff --git a/target-ppc/kvm.c b/target-ppc/kvm.c
> > index c09cc39..60d91a5 100644
> > --- a/target-ppc/kvm.c
> > +++ b/target-ppc/kvm.c
> > @@ -471,6 +471,8 @@ void kvm_arch_pre_run(CPUPPCState *env, struct kvm_run *run)
> >      int r;
> >      unsigned irq;
> >  
> > +    qemu_mutex_lock_iothread();
> > +
> >      /* PowerPC QEMU tracks the various core input pins (interrupt, critical
> >       * interrupt, reset, etc) in PPC-specific env->irq_input_state. */
> >      if (!cap_interrupt_level &&
> > @@ -497,6 +499,8 @@ void kvm_arch_pre_run(CPUPPCState *env, struct kvm_run *run)
> >      /* We don't know if there are more interrupts pending after this. However,
> >       * the guest will return to userspace in the course of handling this one
> >       * anyways, so we will get a chance to deliver the rest. */
> > +
> > +    qemu_mutex_unlock_iothread();
> >  }
> >  
> >  void kvm_arch_post_run(CPUPPCState *env, struct kvm_run *run)
> > 

The following plan would allow progressive convertion to parallel
operation.

Jan mentioned the MMIO handler->MMIO handler deadlock in a private message.

Jan: if there is recursive MMIO accesses, you can detect that and skip
such MMIO handlers in dev_can_use_lock() ? Or blacklist.

What i mentioned earlier about "unsolved issues" are the possibilities
in "iothread flow" (belief is that it would be better determined at
with execution experience / tuning).

OK, so, each document starts with xyz.txt. This is compatible with TCG, 
the critical part is not dropping/acquire locks for decent performance
(that is, only drop memmap_lock from TCG VCPUS thread if IOTHREAD 
requests so).

This came out from discussions with Avi.

immediate-plan.txt
------------------

1.  read_lock(memmap_lock)
2.  MemoryRegionSection mrs = lookup(addr)
3.  qom_ref(mrs.mr->dev)
4.  read_unlock(memmap_lock)

5.  mutex_lock(dev->lock)
6.  dispatch(&mrs, addr, data, size)
7.  mutex_unlock(dev->lock)

8.  qom_unref(mrs.mr->object)


A) Add Device object to MemoryRegions.

memory_region_add_subregion
memory_region_add_eventfd
memory_region_add_subregion_overlap


B) qom_ref details

Z) see device-hotplug.txt items



lock-model.txt
--------------


lock_device(dev) {
    if (dev_can_use_lock(dev))
        mutex_lock(dev->lock)
    else
        mutex_lock(qemu_global_mutex)
}   

dev_can_use_lock(dev)
{
    if (all services used by dev mmio handler have)
        - qemu_global_mutex
        - trylock(dev->lock) and fail
        
        then yes
    else
        then no
}


That is, from the moment in which the device uses the order
(dev->lock -> qemu_global_mutex), the qemu_global_mutex -> dev->lock 
is forbidden by the IOTHREAD or anyone else.


convert-to-unlocked.txt
-----------------------


1.  read_lock(memmap_lock)
2.  MemoryRegionSection mrs = lookup(addr)
3.  qom_ref(mrs.mr->dev)
4.  read_unlock(memmap_lock)

5.  mutex_lock(dev->lock)
6.  dispatch(&mrs, addr, data, size)
7.  mutex_unlock(dev->lock)

8.  qom_unref(mrs.mr->object)

THE ITEMS BELOW SHOULD BE DOCUMENTATION TO CONVERT DEVICE
TO UNLOCKED OPERATION.

1) qom_ref guarantees that the Device object does not disappear. However,
these operations are allowed in parallel:

    a) after 4. device memory and ioports might change or be unregistered.

       Concurrent accesses of a device that is being hotunplugged or 
       whose mappings change are responsability of the OS, so 
       incorrect emulation is not QEMU's problem. However, device
       emulation is now subject to:
            - cpu_physical_memory_map failing.
            - stl_phys* failing.
            - cpu_physical_memory_rw failing/reading from unassigned mem.

    b) what should happen with a pending cpu_physical_memory_map pointer,
       when deleting the corresponding memory region?


net.txt
--------


iothread flow
=============

1) Skip-work-if-device-locked

select(tap fd ready)
tap_send
    if (trylock(TAPState->NetClientState->dev))
        proceed;
    else
        continue; (data remains in queue)
    tap_read_packet
    qemu_send_packet_async

DRAWBACK: lock intensive.
DRAWBACK: VCPU threads can starve IOTHREAD (can be fixed with
a scheme such as trylock() marks a flag saying "iothread wants lock".

Checking each subsystem for possibility:

- tap_send: OK
    - user,slirp: if not possible, use device->lock.
- qemu-char: OK
- interrupts: lock with qemu_global_mutex.
- qemutimer: 
- read time: qemu_get_clock_ns (see later).


2) Queue-work-to-vcpu-context

select(tap fd ready)
tap_send
    if (trylock(TAPState->NetClientState->dev)) {
        proceed;
    } else {
        AddToDeviceWorkQueue(work);
    }
    tap_read_packet
    qemu_send_packet_async

DRAWBACK: lock intensive
DRAWBACK: cache effects

3) VCPU-thread-notify-device-unlocked

select(tap fd ready)
tap_send
    if (trylock(TAPState->NetClientState->dev)) {
        proceed;
    } else {
        signal VCPU thread to notify IOTHREAD when unlocked.
    }
    tap_read_packet

    
GENERAL OBJECTIVE: to avoid lock inversion. IOTHREAD should follow
same order.


PCI HOTPLUG EJECT FLOW
----------------------

1) Monitor initiated.

monitor -> qmp_device_del -> qdev_unplug -> pci_unplug_device ->
piix4_device_hotplug -> SCI interrupt -> OS removes
application users from device -> OS runs _EJ0 -> 

For hot removal, the device must be immediately ejected when OSPM calls
the _EJ0 control method. The _EJ0 control method does not return until
ejection is complete. After calling _EJ0, OSPM verifies the device no
longer exists to determine if the eject succeeded. For _HID devices,
OSPM evaluates the _STA method. For _ADR devices, OSPM checks with the
bus driver for that device.


Avi:  A pci eject guarantees that all accesses
started after the eject completes do not see the device. 

Can you do:

- qobject_remove() 

and have the device dispatch running safely? NO, therefore pci_unplug_device
must hold dev->lock.
Stefan Hajnoczi - June 27, 2012, 7:39 a.m.
On Tue, Jun 26, 2012 at 8:34 PM, Marcelo Tosatti <mtosatti@redhat.com> wrote:
> On Sat, Jun 23, 2012 at 12:55:49AM +0200, Jan Kiszka wrote:
> net.txt
> --------
>
>
> iothread flow
> =============
>
> 1) Skip-work-if-device-locked
>
> select(tap fd ready)
> tap_send
>    if (trylock(TAPState->NetClientState->dev))
>        proceed;
>    else
>        continue; (data remains in queue)
>    tap_read_packet
>    qemu_send_packet_async
>
> DRAWBACK: lock intensive.
> DRAWBACK: VCPU threads can starve IOTHREAD (can be fixed with
> a scheme such as trylock() marks a flag saying "iothread wants lock".

One alternative is to say the lock cannot be held across system calls
or other long-running operations (similar to interrupt handler
spinlocks in the kernel).  But QEMU threads are still at the mercy of
the scheduler or page faults, so perhaps this is not a good idea
because waiting on the lock could tie up the iothread.

> 2) Queue-work-to-vcpu-context
>
> select(tap fd ready)
> tap_send
>    if (trylock(TAPState->NetClientState->dev)) {
>        proceed;
>    } else {
>        AddToDeviceWorkQueue(work);
>    }
>    tap_read_packet
>    qemu_send_packet_async
>
> DRAWBACK: lock intensive
> DRAWBACK: cache effects

This almost suggests we should invert packet reception for NICs.  tap
fd ready should add a work item and the guest device calls into
net/tap.c to pull out any packets from the work function:

tap_send
    dev_add_work(work);

virtio_net_rx_work_fn
    while ((req = virtqueue_pop(rx_queue))) {
        if (!net_receive_packet(netclient, req->buf)) {
            break; /* no more packets available */
        }
        /* ...mark req complete and push onto virtqueue... */
    }
    virtqueue_notify(rx_queue);

The idea is to avoid copies on the rx path and make the locking simple
by always deferring to a work function (which can be invoked
immediately if the device isn't locked).

Stefan
Stefan Hajnoczi - June 27, 2012, 7:41 a.m.
On Wed, Jun 27, 2012 at 8:39 AM, Stefan Hajnoczi <stefanha@gmail.com> wrote:
> On Tue, Jun 26, 2012 at 8:34 PM, Marcelo Tosatti <mtosatti@redhat.com> wrote:
>> On Sat, Jun 23, 2012 at 12:55:49AM +0200, Jan Kiszka wrote:
>> net.txt
>> --------
>>
>>
>> iothread flow
>> =============
>>
>> 1) Skip-work-if-device-locked
>>
>> select(tap fd ready)
>> tap_send
>>    if (trylock(TAPState->NetClientState->dev))
>>        proceed;
>>    else
>>        continue; (data remains in queue)
>>    tap_read_packet
>>    qemu_send_packet_async
>>
>> DRAWBACK: lock intensive.
>> DRAWBACK: VCPU threads can starve IOTHREAD (can be fixed with
>> a scheme such as trylock() marks a flag saying "iothread wants lock".
>
> One alternative is to say the lock cannot be held across system calls
> or other long-running operations (similar to interrupt handler
> spinlocks in the kernel).  But QEMU threads are still at the mercy of
> the scheduler or page faults, so perhaps this is not a good idea
> because waiting on the lock could tie up the iothread.
>
>> 2) Queue-work-to-vcpu-context
>>
>> select(tap fd ready)
>> tap_send
>>    if (trylock(TAPState->NetClientState->dev)) {
>>        proceed;
>>    } else {
>>        AddToDeviceWorkQueue(work);
>>    }
>>    tap_read_packet
>>    qemu_send_packet_async
>>
>> DRAWBACK: lock intensive
>> DRAWBACK: cache effects
>
> This almost suggests we should invert packet reception for NICs.  tap
> fd ready should add a work item and the guest device calls into
> net/tap.c to pull out any packets from the work function:
>
> tap_send
>    dev_add_work(work);
>
> virtio_net_rx_work_fn
>    while ((req = virtqueue_pop(rx_queue))) {
>        if (!net_receive_packet(netclient, req->buf)) {
>            break; /* no more packets available */
>        }
>        /* ...mark req complete and push onto virtqueue... */
>    }
>    virtqueue_notify(rx_queue);
>
> The idea is to avoid copies on the rx path and make the locking simple
> by always deferring to a work function (which can be invoked
> immediately if the device isn't locked).

I just realized this approach bypasses net/queue.c.  I think it works
really well for the peer model (no "VLANs").  Basically flow control
is dictated by the peer (receiver) and we don't need to use NetQueue
anymore.  Whether this works elegantly for slirp and other backends,
I'm not sure yet.

Stefan
Avi Kivity - June 27, 2012, 7:54 a.m.
On 06/26/2012 10:34 PM, Marcelo Tosatti wrote:
> 
> 1.  read_lock(memmap_lock)
> 2.  MemoryRegionSection mrs = lookup(addr)
> 3.  qom_ref(mrs.mr->dev)
> 4.  read_unlock(memmap_lock)
> 
> 5.  mutex_lock(dev->lock)
> 6.  dispatch(&mrs, addr, data, size)
> 7.  mutex_unlock(dev->lock)
> 
> 8.  qom_unref(mrs.mr->object)

The plan also includes eventually replacing read_lock() with rcu; so two
devices can be accessed in parallel with no cacheline bouncing.
Marcelo Tosatti - June 27, 2012, 11:09 a.m.
On Wed, Jun 27, 2012 at 08:41:49AM +0100, Stefan Hajnoczi wrote:
> On Wed, Jun 27, 2012 at 8:39 AM, Stefan Hajnoczi <stefanha@gmail.com> wrote:
> > On Tue, Jun 26, 2012 at 8:34 PM, Marcelo Tosatti <mtosatti@redhat.com> wrote:
> >> On Sat, Jun 23, 2012 at 12:55:49AM +0200, Jan Kiszka wrote:
> >> net.txt
> >> --------
> >>
> >>
> >> iothread flow
> >> =============
> >>
> >> 1) Skip-work-if-device-locked
> >>
> >> select(tap fd ready)
> >> tap_send
> >>    if (trylock(TAPState->NetClientState->dev))
> >>        proceed;
> >>    else
> >>        continue; (data remains in queue)
> >>    tap_read_packet
> >>    qemu_send_packet_async
> >>
> >> DRAWBACK: lock intensive.
> >> DRAWBACK: VCPU threads can starve IOTHREAD (can be fixed with
> >> a scheme such as trylock() marks a flag saying "iothread wants lock".
> >
> > One alternative is to say the lock cannot be held across system calls
> > or other long-running operations (similar to interrupt handler
> > spinlocks in the kernel).  But QEMU threads are still at the mercy of
> > the scheduler or page faults, so perhaps this is not a good idea
> > because waiting on the lock could tie up the iothread.
> >
> >> 2) Queue-work-to-vcpu-context
> >>
> >> select(tap fd ready)
> >> tap_send
> >>    if (trylock(TAPState->NetClientState->dev)) {
> >>        proceed;
> >>    } else {
> >>        AddToDeviceWorkQueue(work);
> >>    }
> >>    tap_read_packet
> >>    qemu_send_packet_async
> >>
> >> DRAWBACK: lock intensive
> >> DRAWBACK: cache effects
> >
> > This almost suggests we should invert packet reception for NICs.  tap
> > fd ready should add a work item and the guest device calls into
> > net/tap.c to pull out any packets from the work function:
> >
> > tap_send
> >    dev_add_work(work);
> >
> > virtio_net_rx_work_fn
> >    while ((req = virtqueue_pop(rx_queue))) {
> >        if (!net_receive_packet(netclient, req->buf)) {
> >            break; /* no more packets available */
> >        }
> >        /* ...mark req complete and push onto virtqueue... */
> >    }
> >    virtqueue_notify(rx_queue);
> >
> > The idea is to avoid copies on the rx path and make the locking simple
> > by always deferring to a work function (which can be invoked
> > immediately if the device isn't locked).
>
> I just realized this approach bypasses net/queue.c.  I think it works
> really well for the peer model (no "VLANs").  Basically flow control
> is dictated by the peer (receiver) and we don't need to use NetQueue
> anymore.  Whether this works elegantly for slirp and other backends,
> I'm not sure yet.

Right. The advantage is only backends where performance matters 
need to be converted (and only net hw drivers that matter performance
wise need to be converted).
Marcelo Tosatti - June 27, 2012, 11:19 a.m.
On Wed, Jun 27, 2012 at 08:41:49AM +0100, Stefan Hajnoczi wrote:
> On Wed, Jun 27, 2012 at 8:39 AM, Stefan Hajnoczi <stefanha@gmail.com> wrote:
> > On Tue, Jun 26, 2012 at 8:34 PM, Marcelo Tosatti <mtosatti@redhat.com> wrote:
> >> On Sat, Jun 23, 2012 at 12:55:49AM +0200, Jan Kiszka wrote:
> >> net.txt
> >> --------
> >>
> >>
> >> iothread flow
> >> =============
> >>
> >> 1) Skip-work-if-device-locked
> >>
> >> select(tap fd ready)
> >> tap_send
> >>    if (trylock(TAPState->NetClientState->dev))
> >>        proceed;
> >>    else
> >>        continue; (data remains in queue)
> >>    tap_read_packet
> >>    qemu_send_packet_async
> >>
> >> DRAWBACK: lock intensive.
> >> DRAWBACK: VCPU threads can starve IOTHREAD (can be fixed with
> >> a scheme such as trylock() marks a flag saying "iothread wants lock".
> >
> > One alternative is to say the lock cannot be held across system calls
> > or other long-running operations (similar to interrupt handler
> > spinlocks in the kernel).  But QEMU threads are still at the mercy of
> > the scheduler or page faults, so perhaps this is not a good idea
> > because waiting on the lock could tie up the iothread.
> >
> >> 2) Queue-work-to-vcpu-context
> >>
> >> select(tap fd ready)
> >> tap_send
> >>    if (trylock(TAPState->NetClientState->dev)) {
> >>        proceed;
> >>    } else {
> >>        AddToDeviceWorkQueue(work);
> >>    }
> >>    tap_read_packet
> >>    qemu_send_packet_async
> >>
> >> DRAWBACK: lock intensive
> >> DRAWBACK: cache effects
> >
> > This almost suggests we should invert packet reception for NICs.  tap
> > fd ready should add a work item and the guest device calls into
> > net/tap.c to pull out any packets from the work function:
> >
> > tap_send
> >    dev_add_work(work);
> >
> > virtio_net_rx_work_fn
> >    while ((req = virtqueue_pop(rx_queue))) {
> >        if (!net_receive_packet(netclient, req->buf)) {
> >            break; /* no more packets available */
> >        }
> >        /* ...mark req complete and push onto virtqueue... */
> >    }
> >    virtqueue_notify(rx_queue);
> >
> > The idea is to avoid copies on the rx path and make the locking simple
> > by always deferring to a work function (which can be invoked
> > immediately if the device isn't locked).
> 
> I just realized this approach bypasses net/queue.c.  I think it works
> really well for the peer model (no "VLANs").  Basically flow control
> is dictated by the peer (receiver) and we don't need to use NetQueue
> anymore.  Whether this works elegantly for slirp and other backends,
> I'm not sure yet.
> 
> Stefan

Right. The advantage is only backends where performance matters need to
be converted (and only net hw drivers that matter performance wise need
to be converted).

Apparently you guys have very different ideas on the higher level
model here. It would be good to agree on one structure (including
modifications on the one above) to follow and share the work on that
direction.

Having competing implementations in this case is a waste of time.
Jan Kiszka - June 27, 2012, 2:36 p.m.
On 2012-06-26 21:34, Marcelo Tosatti wrote:
> The following plan would allow progressive convertion to parallel
> operation.
> 
> Jan mentioned the MMIO handler->MMIO handler deadlock in a private message.
> 
> Jan: if there is recursive MMIO accesses, you can detect that and skip
> such MMIO handlers in dev_can_use_lock() ? Or blacklist.

The problem is harder as it may appear on first sight. I checked our
code again, and it also still contains at least one unhandled lockup
scenario. We could try to detect this but it's tricky, maybe even
fragile in more complex scenarios (risk of false positives when using
timeouts e.g.).

Well, such kind of mutual device-to-device requests are likely all
pathological, and I guess it would be ok to actually let the devices
lock up. But then we need some way to recover them, at least via a
virtual machine reset. That implies, of course, they must not lock up
while holding the central lock...

Need to look into details of your approach now.

Jan
Marcelo Tosatti - June 28, 2012, 1:11 a.m.
On Sat, Jun 23, 2012 at 11:22:07AM +0200, Jan Kiszka wrote:
> On 2012-06-23 02:22, Marcelo Tosatti wrote:
> > On Sat, Jun 23, 2012 at 12:55:49AM +0200, Jan Kiszka wrote:
> >> Should have declared this [RFC] in the subject and CC'ed kvm...
> >>
> >> On 2012-06-23 00:45, Jan Kiszka wrote:
> >>> This sketches a possible path to get rid of the iothread lock on vmexits
> >>> in KVM mode. On x86, the the in-kernel irqchips has to be used because
> >>> we otherwise need to synchronize APIC and other per-cpu state accesses
> >>> that could be changed concurrently. Not yet fully analyzed is the NMI
> >>> injection path in the absence of an APIC.
> >>>
> >>> s390x should be fine without specific locking as their pre/post-run
> >>> callbacks are empty. Power requires locking for the pre-run callback.
> >>>
> >>> This patch is untested, but a similar version was successfully used in
> >>> a x86 setup with a network I/O path that needed no central iothread
> >>> locking anymore (required special MMIO exit handling).
> >>> ---
> >>>  kvm-all.c         |   18 ++++++++++++++++--
> >>>  target-i386/kvm.c |    7 +++++++
> >>>  target-ppc/kvm.c  |    4 ++++
> >>>  3 files changed, 27 insertions(+), 2 deletions(-)
> >>>
> >>> diff --git a/kvm-all.c b/kvm-all.c
> >>> index f8e4328..9c3e26f 100644
> >>> --- a/kvm-all.c
> >>> +++ b/kvm-all.c
> >>> @@ -1460,6 +1460,8 @@ int kvm_cpu_exec(CPUArchState *env)
> >>>          return EXCP_HLT;
> >>>      }
> >>>  
> >>> +    qemu_mutex_unlock_iothread();
> >>> +
> >>>      do {
> >>>          if (env->kvm_vcpu_dirty) {
> >>>              kvm_arch_put_registers(env, KVM_PUT_RUNTIME_STATE);
> >>> @@ -1476,14 +1478,16 @@ int kvm_cpu_exec(CPUArchState *env)
> >>>               */
> >>>              qemu_cpu_kick_self();
> >>>          }
> >>> -        qemu_mutex_unlock_iothread();
> >>>  
> >>>          run_ret = kvm_vcpu_ioctl(env, KVM_RUN, 0);
> >>>  
> >>> -        qemu_mutex_lock_iothread();
> >>>          kvm_arch_post_run(env, run);
> > 
> > target-i386/kvm.c
> > 
> > void kvm_arch_post_run(CPUX86State *env, struct kvm_run *run)
> > {       
> >     if (run->if_flag) {
> >         env->eflags |= IF_MASK;
> >     } else {
> >         env->eflags &= ~IF_MASK;
> >     }
> >     cpu_set_apic_tpr(env->apic_state, run->cr8);
> >     cpu_set_apic_base(env->apic_state, run->apic_base);
> > }
> > 
> > Clearly there is no structure to any of the writes around the writes
> > in x86's kvm_arch_post_run, so it is unsafe.
> 
> Can't parse this yet.
> 
> None of the fields touched above should be modified outside of the vcpu
> thread context (as long as that thread is inside the inner loop).
> Therefore, it should be safe to run that functions without any lock. Am
> I missing something?

Maybe no in practice, for env->eflags. But it should be formalized,
eg BUG_ON(!vcpu) env->xxx, or some other form of making it not
accessible outside vcpu context.

However, as an example

APIC_COMMON_GET_CLASS -> OBJECT_GET_CLASS -> ....

static TypeImpl *type_table_lookup(const char *name)
{
    return g_hash_table_lookup(type_table_get(), name);
}

Is that safe?
Stefan Hajnoczi - June 28, 2012, 8:45 a.m.
On Wed, Jun 27, 2012 at 12:19 PM, Marcelo Tosatti <mtosatti@redhat.com> wrote:
> On Wed, Jun 27, 2012 at 08:41:49AM +0100, Stefan Hajnoczi wrote:
>> On Wed, Jun 27, 2012 at 8:39 AM, Stefan Hajnoczi <stefanha@gmail.com> wrote:
>> > On Tue, Jun 26, 2012 at 8:34 PM, Marcelo Tosatti <mtosatti@redhat.com> wrote:
>> >> On Sat, Jun 23, 2012 at 12:55:49AM +0200, Jan Kiszka wrote:
>> >> net.txt
>> >> --------
>> >>
>> >>
>> >> iothread flow
>> >> =============
>> >>
>> >> 1) Skip-work-if-device-locked
>> >>
>> >> select(tap fd ready)
>> >> tap_send
>> >>    if (trylock(TAPState->NetClientState->dev))
>> >>        proceed;
>> >>    else
>> >>        continue; (data remains in queue)
>> >>    tap_read_packet
>> >>    qemu_send_packet_async
>> >>
>> >> DRAWBACK: lock intensive.
>> >> DRAWBACK: VCPU threads can starve IOTHREAD (can be fixed with
>> >> a scheme such as trylock() marks a flag saying "iothread wants lock".
>> >
>> > One alternative is to say the lock cannot be held across system calls
>> > or other long-running operations (similar to interrupt handler
>> > spinlocks in the kernel).  But QEMU threads are still at the mercy of
>> > the scheduler or page faults, so perhaps this is not a good idea
>> > because waiting on the lock could tie up the iothread.
>> >
>> >> 2) Queue-work-to-vcpu-context
>> >>
>> >> select(tap fd ready)
>> >> tap_send
>> >>    if (trylock(TAPState->NetClientState->dev)) {
>> >>        proceed;
>> >>    } else {
>> >>        AddToDeviceWorkQueue(work);
>> >>    }
>> >>    tap_read_packet
>> >>    qemu_send_packet_async
>> >>
>> >> DRAWBACK: lock intensive
>> >> DRAWBACK: cache effects
>> >
>> > This almost suggests we should invert packet reception for NICs.  tap
>> > fd ready should add a work item and the guest device calls into
>> > net/tap.c to pull out any packets from the work function:
>> >
>> > tap_send
>> >    dev_add_work(work);
>> >
>> > virtio_net_rx_work_fn
>> >    while ((req = virtqueue_pop(rx_queue))) {
>> >        if (!net_receive_packet(netclient, req->buf)) {
>> >            break; /* no more packets available */
>> >        }
>> >        /* ...mark req complete and push onto virtqueue... */
>> >    }
>> >    virtqueue_notify(rx_queue);
>> >
>> > The idea is to avoid copies on the rx path and make the locking simple
>> > by always deferring to a work function (which can be invoked
>> > immediately if the device isn't locked).
>>
>> I just realized this approach bypasses net/queue.c.  I think it works
>> really well for the peer model (no "VLANs").  Basically flow control
>> is dictated by the peer (receiver) and we don't need to use NetQueue
>> anymore.  Whether this works elegantly for slirp and other backends,
>> I'm not sure yet.
>>
>> Stefan
>
> Right. The advantage is only backends where performance matters need to
> be converted (and only net hw drivers that matter performance wise need
> to be converted).
>
> Apparently you guys have very different ideas on the higher level
> model here. It would be good to agree on one structure (including
> modifications on the one above) to follow and share the work on that
> direction.

"You guys" == Ping Fan + Jan + Anthony + Avi?  I'm not working
directly on QBL removal myself.

Stefan
Anthony Liguori - June 28, 2012, 2:10 p.m.
On 06/26/2012 02:34 PM, Marcelo Tosatti wrote:
> On Sat, Jun 23, 2012 at 12:55:49AM +0200, Jan Kiszka wrote:
>> Should have declared this [RFC] in the subject and CC'ed kvm...
>>
>> On 2012-06-23 00:45, Jan Kiszka wrote:
>>> This sketches a possible path to get rid of the iothread lock on vmexits
>>> in KVM mode. On x86, the the in-kernel irqchips has to be used because
>>> we otherwise need to synchronize APIC and other per-cpu state accesses
>>> that could be changed concurrently. Not yet fully analyzed is the NMI
>>> injection path in the absence of an APIC.
>>>
>>> s390x should be fine without specific locking as their pre/post-run
>>> callbacks are empty. Power requires locking for the pre-run callback.
>>>
>>> This patch is untested, but a similar version was successfully used in
>>> a x86 setup with a network I/O path that needed no central iothread
>>> locking anymore (required special MMIO exit handling).
>>> ---
>>>   kvm-all.c         |   18 ++++++++++++++++--
>>>   target-i386/kvm.c |    7 +++++++
>>>   target-ppc/kvm.c  |    4 ++++
>>>   3 files changed, 27 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/kvm-all.c b/kvm-all.c
>>> index f8e4328..9c3e26f 100644
>>> --- a/kvm-all.c
>>> +++ b/kvm-all.c
>>> @@ -1460,6 +1460,8 @@ int kvm_cpu_exec(CPUArchState *env)
>>>           return EXCP_HLT;
>>>       }
>>>
>>> +    qemu_mutex_unlock_iothread();
>>> +
>>>       do {
>>>           if (env->kvm_vcpu_dirty) {
>>>               kvm_arch_put_registers(env, KVM_PUT_RUNTIME_STATE);
>>> @@ -1476,14 +1478,16 @@ int kvm_cpu_exec(CPUArchState *env)
>>>                */
>>>               qemu_cpu_kick_self();
>>>           }
>>> -        qemu_mutex_unlock_iothread();
>>>
>>>           run_ret = kvm_vcpu_ioctl(env, KVM_RUN, 0);
>>>
>>> -        qemu_mutex_lock_iothread();
>>>           kvm_arch_post_run(env, run);
>>>
>>> +        /* TODO: push coalesced mmio flushing to the point where we access
>>> +         * devices that are using it (currently VGA and E1000). */
>>> +        qemu_mutex_lock_iothread();
>>>           kvm_flush_coalesced_mmio_buffer();
>>> +        qemu_mutex_unlock_iothread();
>>>
>>>           if (run_ret<  0) {
>>>               if (run_ret == -EINTR || run_ret == -EAGAIN) {
>>> @@ -1499,19 +1503,23 @@ int kvm_cpu_exec(CPUArchState *env)
>>>           switch (run->exit_reason) {
>>>           case KVM_EXIT_IO:
>>>               DPRINTF("handle_io\n");
>>> +            qemu_mutex_lock_iothread();
>>>               kvm_handle_io(run->io.port,
>>>                             (uint8_t *)run + run->io.data_offset,
>>>                             run->io.direction,
>>>                             run->io.size,
>>>                             run->io.count);
>>> +            qemu_mutex_unlock_iothread();
>>>               ret = 0;
>>>               break;
>>>           case KVM_EXIT_MMIO:
>>>               DPRINTF("handle_mmio\n");
>>> +            qemu_mutex_lock_iothread();
>>>               cpu_physical_memory_rw(run->mmio.phys_addr,
>>>                                      run->mmio.data,
>>>                                      run->mmio.len,
>>>                                      run->mmio.is_write);
>>> +            qemu_mutex_unlock_iothread();
>>>               ret = 0;
>>>               break;
>>>           case KVM_EXIT_IRQ_WINDOW_OPEN:
>>> @@ -1520,7 +1528,9 @@ int kvm_cpu_exec(CPUArchState *env)
>>>               break;
>>>           case KVM_EXIT_SHUTDOWN:
>>>               DPRINTF("shutdown\n");
>>> +            qemu_mutex_lock_iothread();
>>>               qemu_system_reset_request();
>>> +            qemu_mutex_unlock_iothread();
>>>               ret = EXCP_INTERRUPT;
>>>               break;
>>>           case KVM_EXIT_UNKNOWN:
>>> @@ -1533,11 +1543,15 @@ int kvm_cpu_exec(CPUArchState *env)
>>>               break;
>>>           default:
>>>               DPRINTF("kvm_arch_handle_exit\n");
>>> +            qemu_mutex_lock_iothread();
>>>               ret = kvm_arch_handle_exit(env, run);
>>> +            qemu_mutex_unlock_iothread();
>>>               break;
>>>           }
>>>       } while (ret == 0);
>>>
>>> +    qemu_mutex_lock_iothread();
>>> +
>>>       if (ret<  0) {
>>>           cpu_dump_state(env, stderr, fprintf, CPU_DUMP_CODE);
>>>           vm_stop(RUN_STATE_INTERNAL_ERROR);
>>> diff --git a/target-i386/kvm.c b/target-i386/kvm.c
>>> index 0d0d8f6..0ad64d1 100644
>>> --- a/target-i386/kvm.c
>>> +++ b/target-i386/kvm.c
>>> @@ -1631,7 +1631,10 @@ void kvm_arch_pre_run(CPUX86State *env, struct kvm_run *run)
>>>
>>>       /* Inject NMI */
>>>       if (env->interrupt_request&  CPU_INTERRUPT_NMI) {
>>> +        qemu_mutex_lock_iothread();
>>>           env->interrupt_request&= ~CPU_INTERRUPT_NMI;
>>> +        qemu_mutex_unlock_iothread();
>>> +
>>>           DPRINTF("injected NMI\n");
>>>           ret = kvm_vcpu_ioctl(env, KVM_NMI);
>>>           if (ret<  0) {
>>> @@ -1641,6 +1644,8 @@ void kvm_arch_pre_run(CPUX86State *env, struct kvm_run *run)
>>>       }
>>>
>>>       if (!kvm_irqchip_in_kernel()) {
>>> +        qemu_mutex_lock_iothread();
>>> +
>>>           /* Force the VCPU out of its inner loop to process any INIT requests
>>>            * or pending TPR access reports. */
>>>           if (env->interrupt_request&
>>> @@ -1682,6 +1687,8 @@ void kvm_arch_pre_run(CPUX86State *env, struct kvm_run *run)
>>>
>>>           DPRINTF("setting tpr\n");
>>>           run->cr8 = cpu_get_apic_tpr(env->apic_state);
>>> +
>>> +        qemu_mutex_unlock_iothread();
>>>       }
>>>   }
>>>
>>> diff --git a/target-ppc/kvm.c b/target-ppc/kvm.c
>>> index c09cc39..60d91a5 100644
>>> --- a/target-ppc/kvm.c
>>> +++ b/target-ppc/kvm.c
>>> @@ -471,6 +471,8 @@ void kvm_arch_pre_run(CPUPPCState *env, struct kvm_run *run)
>>>       int r;
>>>       unsigned irq;
>>>
>>> +    qemu_mutex_lock_iothread();
>>> +
>>>       /* PowerPC QEMU tracks the various core input pins (interrupt, critical
>>>        * interrupt, reset, etc) in PPC-specific env->irq_input_state. */
>>>       if (!cap_interrupt_level&&
>>> @@ -497,6 +499,8 @@ void kvm_arch_pre_run(CPUPPCState *env, struct kvm_run *run)
>>>       /* We don't know if there are more interrupts pending after this. However,
>>>        * the guest will return to userspace in the course of handling this one
>>>        * anyways, so we will get a chance to deliver the rest. */
>>> +
>>> +    qemu_mutex_unlock_iothread();
>>>   }
>>>
>>>   void kvm_arch_post_run(CPUPPCState *env, struct kvm_run *run)
>>>
>
> The following plan would allow progressive convertion to parallel
> operation.
>
> Jan mentioned the MMIO handler->MMIO handler deadlock in a private message.
>
> Jan: if there is recursive MMIO accesses, you can detect that and skip
> such MMIO handlers in dev_can_use_lock() ? Or blacklist.
>
> What i mentioned earlier about "unsolved issues" are the possibilities
> in "iothread flow" (belief is that it would be better determined at
> with execution experience / tuning).
>
> OK, so, each document starts with xyz.txt. This is compatible with TCG,
> the critical part is not dropping/acquire locks for decent performance
> (that is, only drop memmap_lock from TCG VCPUS thread if IOTHREAD
> requests so).
>
> This came out from discussions with Avi.
>
> immediate-plan.txt
> ------------------
>
> 1.  read_lock(memmap_lock)
> 2.  MemoryRegionSection mrs = lookup(addr)
> 3.  qom_ref(mrs.mr->dev)
> 4.  read_unlock(memmap_lock)
>
> 5.  mutex_lock(dev->lock)
> 6.  dispatch(&mrs, addr, data, size)
> 7.  mutex_unlock(dev->lock)

Just a detail, I don't think we should acquire a device specific lock in global 
code.  Rather, I think we should acquire the global lock before dispatch unless 
a MemoryRegion is marked as being unlocked.

Regards,

Anthony Liguori

>
> 8.  qom_unref(mrs.mr->object)
>
>
> A) Add Device object to MemoryRegions.
>
> memory_region_add_subregion
> memory_region_add_eventfd
> memory_region_add_subregion_overlap
>
>
> B) qom_ref details
>
> Z) see device-hotplug.txt items
>
>
>
> lock-model.txt
> --------------
>
>
> lock_device(dev) {
>      if (dev_can_use_lock(dev))
>          mutex_lock(dev->lock)
>      else
>          mutex_lock(qemu_global_mutex)
> }
>
> dev_can_use_lock(dev)
> {
>      if (all services used by dev mmio handler have)
>          - qemu_global_mutex
>          - trylock(dev->lock) and fail
>
>          then yes
>      else
>          then no
> }
>
>
> That is, from the moment in which the device uses the order
> (dev->lock ->  qemu_global_mutex), the qemu_global_mutex ->  dev->lock
> is forbidden by the IOTHREAD or anyone else.
>
>
> convert-to-unlocked.txt
> -----------------------
>
>
> 1.  read_lock(memmap_lock)
> 2.  MemoryRegionSection mrs = lookup(addr)
> 3.  qom_ref(mrs.mr->dev)
> 4.  read_unlock(memmap_lock)
>
> 5.  mutex_lock(dev->lock)
> 6.  dispatch(&mrs, addr, data, size)
> 7.  mutex_unlock(dev->lock)
>
> 8.  qom_unref(mrs.mr->object)
>
> THE ITEMS BELOW SHOULD BE DOCUMENTATION TO CONVERT DEVICE
> TO UNLOCKED OPERATION.
>
> 1) qom_ref guarantees that the Device object does not disappear. However,
> these operations are allowed in parallel:
>
>      a) after 4. device memory and ioports might change or be unregistered.
>
>         Concurrent accesses of a device that is being hotunplugged or
>         whose mappings change are responsability of the OS, so
>         incorrect emulation is not QEMU's problem. However, device
>         emulation is now subject to:
>              - cpu_physical_memory_map failing.
>              - stl_phys* failing.
>              - cpu_physical_memory_rw failing/reading from unassigned mem.
>
>      b) what should happen with a pending cpu_physical_memory_map pointer,
>         when deleting the corresponding memory region?
>
>
> net.txt
> --------
>
>
> iothread flow
> =============
>
> 1) Skip-work-if-device-locked
>
> select(tap fd ready)
> tap_send
>      if (trylock(TAPState->NetClientState->dev))
>          proceed;
>      else
>          continue; (data remains in queue)
>      tap_read_packet
>      qemu_send_packet_async
>
> DRAWBACK: lock intensive.
> DRAWBACK: VCPU threads can starve IOTHREAD (can be fixed with
> a scheme such as trylock() marks a flag saying "iothread wants lock".
>
> Checking each subsystem for possibility:
>
> - tap_send: OK
>      - user,slirp: if not possible, use device->lock.
> - qemu-char: OK
> - interrupts: lock with qemu_global_mutex.
> - qemutimer:
> - read time: qemu_get_clock_ns (see later).
>
>
> 2) Queue-work-to-vcpu-context
>
> select(tap fd ready)
> tap_send
>      if (trylock(TAPState->NetClientState->dev)) {
>          proceed;
>      } else {
>          AddToDeviceWorkQueue(work);
>      }
>      tap_read_packet
>      qemu_send_packet_async
>
> DRAWBACK: lock intensive
> DRAWBACK: cache effects
>
> 3) VCPU-thread-notify-device-unlocked
>
> select(tap fd ready)
> tap_send
>      if (trylock(TAPState->NetClientState->dev)) {
>          proceed;
>      } else {
>          signal VCPU thread to notify IOTHREAD when unlocked.
>      }
>      tap_read_packet
>
>
> GENERAL OBJECTIVE: to avoid lock inversion. IOTHREAD should follow
> same order.
>
>
> PCI HOTPLUG EJECT FLOW
> ----------------------
>
> 1) Monitor initiated.
>
> monitor ->  qmp_device_del ->  qdev_unplug ->  pci_unplug_device ->
> piix4_device_hotplug ->  SCI interrupt ->  OS removes
> application users from device ->  OS runs _EJ0 ->
>
> For hot removal, the device must be immediately ejected when OSPM calls
> the _EJ0 control method. The _EJ0 control method does not return until
> ejection is complete. After calling _EJ0, OSPM verifies the device no
> longer exists to determine if the eject succeeded. For _HID devices,
> OSPM evaluates the _STA method. For _ADR devices, OSPM checks with the
> bus driver for that device.
>
>
> Avi:  A pci eject guarantees that all accesses
> started after the eject completes do not see the device.
>
> Can you do:
>
> - qobject_remove()
>
> and have the device dispatch running safely? NO, therefore pci_unplug_device
> must hold dev->lock.
>
>
>
>
Avi Kivity - June 28, 2012, 3:12 p.m.
On 06/28/2012 05:10 PM, Anthony Liguori wrote:
>>
>> 1.  read_lock(memmap_lock)
>> 2.  MemoryRegionSection mrs = lookup(addr)
>> 3.  qom_ref(mrs.mr->dev)
>> 4.  read_unlock(memmap_lock)
>>
>> 5.  mutex_lock(dev->lock)
>> 6.  dispatch(&mrs, addr, data, size)
>> 7.  mutex_unlock(dev->lock)
> 
> Just a detail, I don't think we should acquire a device specific lock in
> global code.  Rather, I think we should acquire the global lock before
> dispatch unless a MemoryRegion is marked as being unlocked.

I think I agree.  The mutex acquisition is moved inside dispatch (in
device-specific code).  A side effect is that device code must not call
memory_region_destroy() outside its constructor or destructor.
Marcelo Tosatti - June 29, 2012, 1:29 a.m.
On Thu, Jun 28, 2012 at 09:10:39AM -0500, Anthony Liguori wrote:
> On 06/26/2012 02:34 PM, Marcelo Tosatti wrote:
> >On Sat, Jun 23, 2012 at 12:55:49AM +0200, Jan Kiszka wrote:
> >>Should have declared this [RFC] in the subject and CC'ed kvm...
> >>
> >>On 2012-06-23 00:45, Jan Kiszka wrote:
> >>>This sketches a possible path to get rid of the iothread lock on vmexits
> >>>in KVM mode. On x86, the the in-kernel irqchips has to be used because
> >>>we otherwise need to synchronize APIC and other per-cpu state accesses
> >>>that could be changed concurrently. Not yet fully analyzed is the NMI
> >>>injection path in the absence of an APIC.
> >>>
> >>>s390x should be fine without specific locking as their pre/post-run
> >>>callbacks are empty. Power requires locking for the pre-run callback.
> >>>
> >>>This patch is untested, but a similar version was successfully used in
> >>>a x86 setup with a network I/O path that needed no central iothread
> >>>locking anymore (required special MMIO exit handling).
> >>>---
> >>>  kvm-all.c         |   18 ++++++++++++++++--
> >>>  target-i386/kvm.c |    7 +++++++
> >>>  target-ppc/kvm.c  |    4 ++++
> >>>  3 files changed, 27 insertions(+), 2 deletions(-)
> >>>
> >>>diff --git a/kvm-all.c b/kvm-all.c
> >>>index f8e4328..9c3e26f 100644
> >>>--- a/kvm-all.c
> >>>+++ b/kvm-all.c
> >>>@@ -1460,6 +1460,8 @@ int kvm_cpu_exec(CPUArchState *env)
> >>>          return EXCP_HLT;
> >>>      }
> >>>
> >>>+    qemu_mutex_unlock_iothread();
> >>>+
> >>>      do {
> >>>          if (env->kvm_vcpu_dirty) {
> >>>              kvm_arch_put_registers(env, KVM_PUT_RUNTIME_STATE);
> >>>@@ -1476,14 +1478,16 @@ int kvm_cpu_exec(CPUArchState *env)
> >>>               */
> >>>              qemu_cpu_kick_self();
> >>>          }
> >>>-        qemu_mutex_unlock_iothread();
> >>>
> >>>          run_ret = kvm_vcpu_ioctl(env, KVM_RUN, 0);
> >>>
> >>>-        qemu_mutex_lock_iothread();
> >>>          kvm_arch_post_run(env, run);
> >>>
> >>>+        /* TODO: push coalesced mmio flushing to the point where we access
> >>>+         * devices that are using it (currently VGA and E1000). */
> >>>+        qemu_mutex_lock_iothread();
> >>>          kvm_flush_coalesced_mmio_buffer();
> >>>+        qemu_mutex_unlock_iothread();
> >>>
> >>>          if (run_ret<  0) {
> >>>              if (run_ret == -EINTR || run_ret == -EAGAIN) {
> >>>@@ -1499,19 +1503,23 @@ int kvm_cpu_exec(CPUArchState *env)
> >>>          switch (run->exit_reason) {
> >>>          case KVM_EXIT_IO:
> >>>              DPRINTF("handle_io\n");
> >>>+            qemu_mutex_lock_iothread();
> >>>              kvm_handle_io(run->io.port,
> >>>                            (uint8_t *)run + run->io.data_offset,
> >>>                            run->io.direction,
> >>>                            run->io.size,
> >>>                            run->io.count);
> >>>+            qemu_mutex_unlock_iothread();
> >>>              ret = 0;
> >>>              break;
> >>>          case KVM_EXIT_MMIO:
> >>>              DPRINTF("handle_mmio\n");
> >>>+            qemu_mutex_lock_iothread();
> >>>              cpu_physical_memory_rw(run->mmio.phys_addr,
> >>>                                     run->mmio.data,
> >>>                                     run->mmio.len,
> >>>                                     run->mmio.is_write);
> >>>+            qemu_mutex_unlock_iothread();
> >>>              ret = 0;
> >>>              break;
> >>>          case KVM_EXIT_IRQ_WINDOW_OPEN:
> >>>@@ -1520,7 +1528,9 @@ int kvm_cpu_exec(CPUArchState *env)
> >>>              break;
> >>>          case KVM_EXIT_SHUTDOWN:
> >>>              DPRINTF("shutdown\n");
> >>>+            qemu_mutex_lock_iothread();
> >>>              qemu_system_reset_request();
> >>>+            qemu_mutex_unlock_iothread();
> >>>              ret = EXCP_INTERRUPT;
> >>>              break;
> >>>          case KVM_EXIT_UNKNOWN:
> >>>@@ -1533,11 +1543,15 @@ int kvm_cpu_exec(CPUArchState *env)
> >>>              break;
> >>>          default:
> >>>              DPRINTF("kvm_arch_handle_exit\n");
> >>>+            qemu_mutex_lock_iothread();
> >>>              ret = kvm_arch_handle_exit(env, run);
> >>>+            qemu_mutex_unlock_iothread();
> >>>              break;
> >>>          }
> >>>      } while (ret == 0);
> >>>
> >>>+    qemu_mutex_lock_iothread();
> >>>+
> >>>      if (ret<  0) {
> >>>          cpu_dump_state(env, stderr, fprintf, CPU_DUMP_CODE);
> >>>          vm_stop(RUN_STATE_INTERNAL_ERROR);
> >>>diff --git a/target-i386/kvm.c b/target-i386/kvm.c
> >>>index 0d0d8f6..0ad64d1 100644
> >>>--- a/target-i386/kvm.c
> >>>+++ b/target-i386/kvm.c
> >>>@@ -1631,7 +1631,10 @@ void kvm_arch_pre_run(CPUX86State *env, struct kvm_run *run)
> >>>
> >>>      /* Inject NMI */
> >>>      if (env->interrupt_request&  CPU_INTERRUPT_NMI) {
> >>>+        qemu_mutex_lock_iothread();
> >>>          env->interrupt_request&= ~CPU_INTERRUPT_NMI;
> >>>+        qemu_mutex_unlock_iothread();
> >>>+
> >>>          DPRINTF("injected NMI\n");
> >>>          ret = kvm_vcpu_ioctl(env, KVM_NMI);
> >>>          if (ret<  0) {
> >>>@@ -1641,6 +1644,8 @@ void kvm_arch_pre_run(CPUX86State *env, struct kvm_run *run)
> >>>      }
> >>>
> >>>      if (!kvm_irqchip_in_kernel()) {
> >>>+        qemu_mutex_lock_iothread();
> >>>+
> >>>          /* Force the VCPU out of its inner loop to process any INIT requests
> >>>           * or pending TPR access reports. */
> >>>          if (env->interrupt_request&
> >>>@@ -1682,6 +1687,8 @@ void kvm_arch_pre_run(CPUX86State *env, struct kvm_run *run)
> >>>
> >>>          DPRINTF("setting tpr\n");
> >>>          run->cr8 = cpu_get_apic_tpr(env->apic_state);
> >>>+
> >>>+        qemu_mutex_unlock_iothread();
> >>>      }
> >>>  }
> >>>
> >>>diff --git a/target-ppc/kvm.c b/target-ppc/kvm.c
> >>>index c09cc39..60d91a5 100644
> >>>--- a/target-ppc/kvm.c
> >>>+++ b/target-ppc/kvm.c
> >>>@@ -471,6 +471,8 @@ void kvm_arch_pre_run(CPUPPCState *env, struct kvm_run *run)
> >>>      int r;
> >>>      unsigned irq;
> >>>
> >>>+    qemu_mutex_lock_iothread();
> >>>+
> >>>      /* PowerPC QEMU tracks the various core input pins (interrupt, critical
> >>>       * interrupt, reset, etc) in PPC-specific env->irq_input_state. */
> >>>      if (!cap_interrupt_level&&
> >>>@@ -497,6 +499,8 @@ void kvm_arch_pre_run(CPUPPCState *env, struct kvm_run *run)
> >>>      /* We don't know if there are more interrupts pending after this. However,
> >>>       * the guest will return to userspace in the course of handling this one
> >>>       * anyways, so we will get a chance to deliver the rest. */
> >>>+
> >>>+    qemu_mutex_unlock_iothread();
> >>>  }
> >>>
> >>>  void kvm_arch_post_run(CPUPPCState *env, struct kvm_run *run)
> >>>
> >
> >The following plan would allow progressive convertion to parallel
> >operation.
> >
> >Jan mentioned the MMIO handler->MMIO handler deadlock in a private message.
> >
> >Jan: if there is recursive MMIO accesses, you can detect that and skip
> >such MMIO handlers in dev_can_use_lock() ? Or blacklist.
> >
> >What i mentioned earlier about "unsolved issues" are the possibilities
> >in "iothread flow" (belief is that it would be better determined at
> >with execution experience / tuning).
> >
> >OK, so, each document starts with xyz.txt. This is compatible with TCG,
> >the critical part is not dropping/acquire locks for decent performance
> >(that is, only drop memmap_lock from TCG VCPUS thread if IOTHREAD
> >requests so).
> >
> >This came out from discussions with Avi.
> >
> >immediate-plan.txt
> >------------------
> >
> >1.  read_lock(memmap_lock)
> >2.  MemoryRegionSection mrs = lookup(addr)
> >3.  qom_ref(mrs.mr->dev)
> >4.  read_unlock(memmap_lock)
> >
> >5.  mutex_lock(dev->lock)
> >6.  dispatch(&mrs, addr, data, size)
> >7.  mutex_unlock(dev->lock)
> 
> Just a detail, I don't think we should acquire a device specific
> lock in global code.  

Why?

The above outdated, it should be:

5. lock_device(dev)
6. dispatch()
7. unlock_device(dev)

See lock_device below.

> Rather, I think we should acquire the global
> lock before dispatch unless a MemoryRegion is marked as being
> unlocked.

See lock_device and dev_can_use_lock below.

The per-device lock is used if all services used by the dispatch
handler (such as timers, interrupt, network data transmission, etc),
are prepared (converted) to skip entering device protected region, when
executed from non-vcpu contexes, such as:


main_loop:

select()
mutex_lock(qemu_global_mutex)
iohandler()
	if (mutex_trylock(dev->lock) == SUCCESS)
		push_data_to(dev)	   
	else
		continue processing of
Marcelo Tosatti - June 29, 2012, 1:45 a.m.
On Thu, Jun 28, 2012 at 09:10:39AM -0500, Anthony Liguori wrote:
> >
> >1.  read_lock(memmap_lock)
> >2.  MemoryRegionSection mrs = lookup(addr)
> >3.  qom_ref(mrs.mr->dev)
> >4.  read_unlock(memmap_lock)
> >
> >5.  mutex_lock(dev->lock)
> >6.  dispatch(&mrs, addr, data, size)
> >7.  mutex_unlock(dev->lock)
> 
> Just a detail, I don't think we should acquire a device specific
> lock in global code.  Rather, I think we should acquire the global
> lock before dispatch unless a MemoryRegion is marked as being
> unlocked.

"The basic plan is introduce granular locking starting at the KVM
dispatch level until we can get to MemoryRegion dispatch. We'll then
have some way to indicate that a MemoryRegion's callbacks should be
invoked without holding the qemu global mutex."

Before that is possible, the callback must not make use of data
structures currently protected by qemu_global_mutex, such as 
timers, interrupts (that is, you would have to split locks 
for each individual service invoked from inside callbacks,
which is a recipe for disaster).

With lock_device() below you can have

mutex_lock(dev->lock)
device specific work
mutex_lock(qemu_global_mutex)
raise irq, send packet, etc
mutex_unlock(qemu_global_mutex)
mutex_unlock(dev->lock)

and iothread doing the select() pseudocode in the previous email.
Jan Kiszka - July 6, 2012, 5:16 p.m.
On 2012-06-24 16:08, Jan Kiszka wrote:
> On 2012-06-24 10:49, Avi Kivity wrote:
>> On 06/23/2012 02:45 PM, Jan Kiszka wrote:
>>>
>>> Hmm, we may need the iothread lock around cpu_set_apic_tpr for
>>> !kvm_irqchip_in_kernel(). And as we are at it, apic_base manipulation
>>> can be but there as well.
>>>
>>> With in-kernel irqchip, there is no such need. Also, no one accesses
>>> eflags outside of the vcpu thread, independent of the irqchip mode.
>>
>> In fact !kvm_irqchip_in_kernel() is broken wrt the tpr.  Interrupt
>> injection needs to be done atomically, but currently we check the tpr
>> from the injecting thread, which means the cpu thread can race with it.
>>  We need to move the check to the vcpu thread so that the guest vcpu is
>> halted.
> 
> So apic_set_irq basically needs to be deferred to vcpu context, right?
> Will have a look.

Tried to wrap my head around this, but only found different issues
(patches under construction).

First of all, a simple run_on_cpu doesn't work as it may drops the BQL
at unexpected points inside device models.

Then I thought about what could actually race here: The testing of the
userspace TPR value under BQL vs. some modification by the CPU while in
KVM mode. So we may either inject while the CPU is trying to prevent
this - harmless as it happens on real hw as well - or not inject while
the CPU is enabling this. But the latter is quickly resolved because all
such TPR changes in userspace APIC mode are trapped and then processed
under BQL. At that point we will also reevaluate the pending interrupts
and inject what was deferred before (kvm_arch_post_run ->
cpu_set_apic_tpr -> apic_set_tpr -> apic_update_irq, or via
apic_mem_writel).

So where is a race?

Jan
Jan Kiszka - July 6, 2012, 6:06 p.m.
On 2012-07-06 19:16, Jan Kiszka wrote:
> On 2012-06-24 16:08, Jan Kiszka wrote:
>> On 2012-06-24 10:49, Avi Kivity wrote:
>>> On 06/23/2012 02:45 PM, Jan Kiszka wrote:
>>>>
>>>> Hmm, we may need the iothread lock around cpu_set_apic_tpr for
>>>> !kvm_irqchip_in_kernel(). And as we are at it, apic_base manipulation
>>>> can be but there as well.
>>>>
>>>> With in-kernel irqchip, there is no such need. Also, no one accesses
>>>> eflags outside of the vcpu thread, independent of the irqchip mode.
>>>
>>> In fact !kvm_irqchip_in_kernel() is broken wrt the tpr.  Interrupt
>>> injection needs to be done atomically, but currently we check the tpr
>>> from the injecting thread, which means the cpu thread can race with it.
>>>  We need to move the check to the vcpu thread so that the guest vcpu is
>>> halted.
>>
>> So apic_set_irq basically needs to be deferred to vcpu context, right?
>> Will have a look.
> 
> Tried to wrap my head around this, but only found different issues
> (patches under construction).
> 
> First of all, a simple run_on_cpu doesn't work as it may drops the BQL
> at unexpected points inside device models.
> 
> Then I thought about what could actually race here: The testing of the
> userspace TPR value under BQL vs. some modification by the CPU while in
> KVM mode. So we may either inject while the CPU is trying to prevent
> this - harmless as it happens on real hw as well - or not inject while

Hmm, this could actually be a problem as the race window is extended
beyond the instruction that raises TPR. So we may generate unexpected
spurious interrupts for the guest. And that's because the userspace APIC
fails to update the CPU interrupt pin properly when the TPR is modified.
Here is the bug...

Jan

> the CPU is enabling this. But the latter is quickly resolved because all
> such TPR changes in userspace APIC mode are trapped and then processed
> under BQL. At that point we will also reevaluate the pending interrupts
> and inject what was deferred before (kvm_arch_post_run ->
> cpu_set_apic_tpr -> apic_set_tpr -> apic_update_irq, or via
> apic_mem_writel).
> 
> So where is a race?
> 
> Jan
>
Avi Kivity - July 8, 2012, 7:49 a.m.
On 07/06/2012 09:06 PM, Jan Kiszka wrote:
> On 2012-07-06 19:16, Jan Kiszka wrote:
>> On 2012-06-24 16:08, Jan Kiszka wrote:
>>> On 2012-06-24 10:49, Avi Kivity wrote:
>>>> On 06/23/2012 02:45 PM, Jan Kiszka wrote:
>>>>>
>>>>> Hmm, we may need the iothread lock around cpu_set_apic_tpr for
>>>>> !kvm_irqchip_in_kernel(). And as we are at it, apic_base manipulation
>>>>> can be but there as well.
>>>>>
>>>>> With in-kernel irqchip, there is no such need. Also, no one accesses
>>>>> eflags outside of the vcpu thread, independent of the irqchip mode.
>>>>
>>>> In fact !kvm_irqchip_in_kernel() is broken wrt the tpr.  Interrupt
>>>> injection needs to be done atomically, but currently we check the tpr
>>>> from the injecting thread, which means the cpu thread can race with it.
>>>>  We need to move the check to the vcpu thread so that the guest vcpu is
>>>> halted.
>>>
>>> So apic_set_irq basically needs to be deferred to vcpu context, right?
>>> Will have a look.
>> 
>> Tried to wrap my head around this, but only found different issues
>> (patches under construction).
>> 
>> First of all, a simple run_on_cpu doesn't work as it may drops the BQL
>> at unexpected points inside device models.
>> 
>> Then I thought about what could actually race here: The testing of the
>> userspace TPR value under BQL vs. some modification by the CPU while in
>> KVM mode. So we may either inject while the CPU is trying to prevent
>> this - harmless as it happens on real hw as well - or not inject while
> 
> Hmm, this could actually be a problem as the race window is extended
> beyond the instruction that raises TPR. So we may generate unexpected
> spurious interrupts for the guest. And that's because the userspace APIC
> fails to update the CPU interrupt pin properly when the TPR is modified.
> Here is the bug...

Exactly.

To avoid dropping the lock we can do something similar to kvm - queue a
request to reevaluate IRR, then signal that vcpu to exit.  No need to
drop the lock.  Possibly a check in apic_update_irq() whether we're
running in the vcpu thread; if not, signal and return.  That function's
effects are asynchronous if run outside the vcpu, so it's a safe change.

Patch

diff --git a/kvm-all.c b/kvm-all.c
index f8e4328..9c3e26f 100644
--- a/kvm-all.c
+++ b/kvm-all.c
@@ -1460,6 +1460,8 @@  int kvm_cpu_exec(CPUArchState *env)
         return EXCP_HLT;
     }
 
+    qemu_mutex_unlock_iothread();
+
     do {
         if (env->kvm_vcpu_dirty) {
             kvm_arch_put_registers(env, KVM_PUT_RUNTIME_STATE);
@@ -1476,14 +1478,16 @@  int kvm_cpu_exec(CPUArchState *env)
              */
             qemu_cpu_kick_self();
         }
-        qemu_mutex_unlock_iothread();
 
         run_ret = kvm_vcpu_ioctl(env, KVM_RUN, 0);
 
-        qemu_mutex_lock_iothread();
         kvm_arch_post_run(env, run);
 
+        /* TODO: push coalesced mmio flushing to the point where we access
+         * devices that are using it (currently VGA and E1000). */
+        qemu_mutex_lock_iothread();
         kvm_flush_coalesced_mmio_buffer();
+        qemu_mutex_unlock_iothread();
 
         if (run_ret < 0) {
             if (run_ret == -EINTR || run_ret == -EAGAIN) {
@@ -1499,19 +1503,23 @@  int kvm_cpu_exec(CPUArchState *env)
         switch (run->exit_reason) {
         case KVM_EXIT_IO:
             DPRINTF("handle_io\n");
+            qemu_mutex_lock_iothread();
             kvm_handle_io(run->io.port,
                           (uint8_t *)run + run->io.data_offset,
                           run->io.direction,
                           run->io.size,
                           run->io.count);
+            qemu_mutex_unlock_iothread();
             ret = 0;
             break;
         case KVM_EXIT_MMIO:
             DPRINTF("handle_mmio\n");
+            qemu_mutex_lock_iothread();
             cpu_physical_memory_rw(run->mmio.phys_addr,
                                    run->mmio.data,
                                    run->mmio.len,
                                    run->mmio.is_write);
+            qemu_mutex_unlock_iothread();
             ret = 0;
             break;
         case KVM_EXIT_IRQ_WINDOW_OPEN:
@@ -1520,7 +1528,9 @@  int kvm_cpu_exec(CPUArchState *env)
             break;
         case KVM_EXIT_SHUTDOWN:
             DPRINTF("shutdown\n");
+            qemu_mutex_lock_iothread();
             qemu_system_reset_request();
+            qemu_mutex_unlock_iothread();
             ret = EXCP_INTERRUPT;
             break;
         case KVM_EXIT_UNKNOWN:
@@ -1533,11 +1543,15 @@  int kvm_cpu_exec(CPUArchState *env)
             break;
         default:
             DPRINTF("kvm_arch_handle_exit\n");
+            qemu_mutex_lock_iothread();
             ret = kvm_arch_handle_exit(env, run);
+            qemu_mutex_unlock_iothread();
             break;
         }
     } while (ret == 0);
 
+    qemu_mutex_lock_iothread();
+
     if (ret < 0) {
         cpu_dump_state(env, stderr, fprintf, CPU_DUMP_CODE);
         vm_stop(RUN_STATE_INTERNAL_ERROR);
diff --git a/target-i386/kvm.c b/target-i386/kvm.c
index 0d0d8f6..0ad64d1 100644
--- a/target-i386/kvm.c
+++ b/target-i386/kvm.c
@@ -1631,7 +1631,10 @@  void kvm_arch_pre_run(CPUX86State *env, struct kvm_run *run)
 
     /* Inject NMI */
     if (env->interrupt_request & CPU_INTERRUPT_NMI) {
+        qemu_mutex_lock_iothread();
         env->interrupt_request &= ~CPU_INTERRUPT_NMI;
+        qemu_mutex_unlock_iothread();
+
         DPRINTF("injected NMI\n");
         ret = kvm_vcpu_ioctl(env, KVM_NMI);
         if (ret < 0) {
@@ -1641,6 +1644,8 @@  void kvm_arch_pre_run(CPUX86State *env, struct kvm_run *run)
     }
 
     if (!kvm_irqchip_in_kernel()) {
+        qemu_mutex_lock_iothread();
+
         /* Force the VCPU out of its inner loop to process any INIT requests
          * or pending TPR access reports. */
         if (env->interrupt_request &
@@ -1682,6 +1687,8 @@  void kvm_arch_pre_run(CPUX86State *env, struct kvm_run *run)
 
         DPRINTF("setting tpr\n");
         run->cr8 = cpu_get_apic_tpr(env->apic_state);
+
+        qemu_mutex_unlock_iothread();
     }
 }
 
diff --git a/target-ppc/kvm.c b/target-ppc/kvm.c
index c09cc39..60d91a5 100644
--- a/target-ppc/kvm.c
+++ b/target-ppc/kvm.c
@@ -471,6 +471,8 @@  void kvm_arch_pre_run(CPUPPCState *env, struct kvm_run *run)
     int r;
     unsigned irq;
 
+    qemu_mutex_lock_iothread();
+
     /* PowerPC QEMU tracks the various core input pins (interrupt, critical
      * interrupt, reset, etc) in PPC-specific env->irq_input_state. */
     if (!cap_interrupt_level &&
@@ -497,6 +499,8 @@  void kvm_arch_pre_run(CPUPPCState *env, struct kvm_run *run)
     /* We don't know if there are more interrupts pending after this. However,
      * the guest will return to userspace in the course of handling this one
      * anyways, so we will get a chance to deliver the rest. */
+
+    qemu_mutex_unlock_iothread();
 }
 
 void kvm_arch_post_run(CPUPPCState *env, struct kvm_run *run)