[v2,0/3] AioContext: ctx->dispatching is dead, all hail ctx->notify_me

Message ID 55A8CB3E.5090404@redhat.com
State New

Commit Message

Paolo Bonzini July 17, 2015, 9:30 a.m. UTC
On 17/07/2015 06:44, Paolo Bonzini wrote:
> 
> 
> On 16/07/2015 21:05, Richard W.M. Jones wrote:
>>
>> Sorry to spoil things, but I'm still seeing this bug, although it is
>> now a lot less frequent with your patch.  I would estimate it happens
>> more often than 1 in 5 runs with qemu.git, and probably 1 in 200 runs
>> with qemu.git + the v2 patch series.
>>
>> It's the exact same hang in both cases.
>>
>> Is it possible that this patch doesn't completely close any race?
>>
>> Still, it is an improvement, so there is that.
> 
> Would seem at first glance like a different bug.
> 
> Interestingly, adding some "tracing" (qemu_clock_get_ns) makes the bug
> more likely: now it reproduces in about 10 tries.  Of course :) adding
> other kinds of tracing instead makes it go away again (>50 tries).
> 
> Perhaps this:
> 
>    i/o thread         vcpu thread                   worker thread
>    ---------------------------------------------------------------------
>    lock_iothread
>    notify_me = 1
>    ...
>    unlock_iothread
>                       lock_iothread
>                       notify_me = 3
>                       ppoll
>                       notify_me = 1
>                                                      bh->scheduled = 1
>                                                      event_notifier_set
>                       event_notifier_test_and_clear
>    ppoll
>     ^^ hang
> 	
> In the exact shape above, it doesn't seem too likely to happen, but
> perhaps there's another simpler case.  Still, the bug exists.
> 
> The above is not really related to notify_me.  Here the notification is
> not being optimized away!  So I wonder if this one has been there forever.
> 
> Fam suggested putting the event_notifier_test_and_clear before
> aio_bh_poll(), but it does not work.  I'll look more closely.
> 
> However, an unconditional event_notifier_test_and_clear is pretty
> expensive.  On one hand, obviously correctness comes first.  On the
> other hand, an expensive operation at the wrong place can mask the race
> very easily; I'll let the fix run for a while, but I'm not sure if a
> successful test really says anything useful.

So it may not be useful, but still, a successful test is a successful test. :)
The following patch, which also includes the delta between v2 and v3
of this series, survived 674 reboots before hitting a definitely
unrelated problem:

error: kvm run failed Function not implemented
PC=00000000bf671210  SP=00000000c00001f0
X00=000000000a003e70 X01=0000000000000000 X02=00000000bf680548 X03=0000000000000030
X04=00000000bbb5fc18 X05=00000000004b7770 X06=00000000bf721930 X07=000000000000009a
X08=00000000bf716858 X09=0000000000000090 X10=0000000000000000 X11=0000000000000046
X12=00000000a007e03a X13=0000000000000000 X14=0000000000000000 X15=0000000000000000
X16=00000000bf716df0 X17=0000000000000000 X18=0000000000000000 X19=00000000bf6f5f18
X20=0000000000000000 X21=0000000000000000 X22=0000000000000000 X23=0000000000000000
X24=0000000000000000 X25=0000000000000000 X26=0000000000000000 X27=0000000000000000
X28=0000000000000000 X29=0000000000000000 X30=0000000000000000 PSTATE=60000305 (flags -ZC-)

For the record, this is the kvm_run struct:

$6 = {request_interrupt_window = 0 '\000', padding1 = "\000\000\000\000\000\000", exit_reason = 0, 
  ready_for_interrupt_injection = 0 '\000', if_flag = 0 '\000', flags = 0, cr8 = 0, apic_base = 0, {hw = {
      hardware_exit_reason = 150994968}, fail_entry = {hardware_entry_failure_reason = 150994968}, ex = {
      exception = 150994968, error_code = 0}, io = {direction = 24 '\030', size = 0 '\000', port = 2304, 
      count = 0, data_offset = 144}, debug = {arch = {<No data fields>}}, mmio = {phys_addr = 150994968, 
      data = "\220\000\000\000\000\000\000", len = 4, is_write = 0 '\000'}, hypercall = {nr = 150994968, 
      args = {144, 4, 0, 0, 0, 0}, ret = 0, longmode = 0, pad = 0}, tpr_access = {rip = 150994968, 
      is_write = 144, pad = 0}, s390_sieic = {icptcode = 24 '\030', ipa = 2304, ipb = 0}, 
    s390_reset_flags = 150994968, s390_ucontrol = {trans_exc_code = 150994968, pgm_code = 144}, dcr = {
      dcrn = 150994968, data = 0, is_write = 144 '\220'}, internal = {suberror = 150994968, ndata = 0, 
      data = {144, 4, 0 <repeats 14 times>}}, osi = {gprs = {150994968, 144, 4, 0 <repeats 29 times>}}, 
    papr_hcall = {nr = 150994968, ret = 144, args = {4, 0, 0, 0, 0, 0, 0, 0, 0}}, s390_tsch = {
      subchannel_id = 24, subchannel_nr = 2304, io_int_parm = 0, io_int_word = 144, ipb = 0, 
      dequeued = 4 '\004'}, epr = {epr = 150994968}, system_event = {type = 150994968, flags = 144}, 
    s390_stsi = {addr = 150994968, ar = 144 '\220', reserved = 0 '\000', fc = 0 '\000', sel1 = 0 '\000', 
      sel2 = 0}, 
    padding = "\030\000\000\t\000\000\000\000\220\000\000\000\000\000\000\000\004", '\000' <repeats 238 times>}, kvm_valid_regs = 0, kvm_dirty_regs = 0, s = {regs = {<No data fields>}, 
    padding = '\000' <repeats 2047 times>}}

Marc, does it ring any bells?

Paolo
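
To make the ordering in the timeline above concrete, here is a minimal standalone
sketch (not QEMU code: it uses a raw Linux eventfd and invented names in place of
EventNotifier and QEMUBH) of why the notifier has to be drained before
bh->scheduled is checked.  With the two steps swapped, a bottom half scheduled in
between has its wakeup consumed and the next poll blocks forever, which is exactly
the hang sketched above.

/* race_sketch.c: illustrative only, not QEMU code.
 * "notifier" stands in for ctx->notifier, "bh_scheduled" for bh->scheduled.
 * Build with: gcc -pthread race_sketch.c
 */
#include <poll.h>
#include <pthread.h>
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/eventfd.h>
#include <unistd.h>

static int notifier;
static atomic_int bh_scheduled;

static void *worker(void *arg)
{
    uint64_t one = 1;

    (void)arg;
    atomic_store(&bh_scheduled, 1);              /* bh->scheduled = 1    */
    if (write(notifier, &one, sizeof(one)) < 0) { /* event_notifier_set() */
        perror("write");
    }
    return NULL;
}

int main(void)
{
    pthread_t tid;

    notifier = eventfd(0, 0);
    if (notifier < 0) {
        perror("eventfd");
        return 1;
    }
    pthread_create(&tid, NULL, worker, NULL);

    for (;;) {
        struct pollfd pfd = { .fd = notifier, .events = POLLIN };
        uint64_t cnt;

        poll(&pfd, 1, -1);                       /* the ppoll in aio_poll() */

        /* Safe order: drain the notifier first (the test_and_clear)... */
        if (read(notifier, &cnt, sizeof(cnt)) < 0) {
            perror("read");
            break;
        }

        /* ...and only then look at bh->scheduled.  With the two steps
         * swapped, a bottom half scheduled in between has its wakeup
         * eaten here, and the next poll() blocks forever even though
         * bh_scheduled == 1: the hang in the timeline above. */
        if (atomic_exchange(&bh_scheduled, 0)) {
            printf("bottom half dispatched\n");
            break;
        }
    }
    pthread_join(tid, NULL);
    return 0;
}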

Comments

Richard W.M. Jones July 17, 2015, 12:58 p.m. UTC | #1
On Fri, Jul 17, 2015 at 11:30:38AM +0200, Paolo Bonzini wrote:
> error: kvm run failed Function not implemented
> PC=00000000bf671210  SP=00000000c00001f0
> X00=000000000a003e70 X01=0000000000000000 X02=00000000bf680548 X03=0000000000000030
> X04=00000000bbb5fc18 X05=00000000004b7770 X06=00000000bf721930 X07=000000000000009a
> X08=00000000bf716858 X09=0000000000000090 X10=0000000000000000 X11=0000000000000046
> X12=00000000a007e03a X13=0000000000000000 X14=0000000000000000 X15=0000000000000000
> X16=00000000bf716df0 X17=0000000000000000 X18=0000000000000000 X19=00000000bf6f5f18
> X20=0000000000000000 X21=0000000000000000 X22=0000000000000000 X23=0000000000000000
> X24=0000000000000000 X25=0000000000000000 X26=0000000000000000 X27=0000000000000000
> X28=0000000000000000 X29=0000000000000000 X30=0000000000000000 PSTATE=60000305 (flags -ZC-)

Vaguely reminiscent of this bug:

https://bugzilla.redhat.com/show_bug.cgi?id=1194366

(See comment 7 in particular)

> diff --git a/aio-posix.c b/aio-posix.c
> index 268d14d..d2011d0 100644
> --- a/aio-posix.c
> +++ b/aio-posix.c
> @@ -273,6 +273,13 @@ bool aio_poll(AioContext *ctx, bool blocking)
>          aio_context_acquire(ctx);
>      }
>  
> +    /* This should be optimized... */
> +    event_notifier_test_and_clear(&ctx->notifier);
> +
> +    if (blocking) {
> +        atomic_sub(&ctx->notify_me, 2);
> +    }
> +
>      /* if we have any readable fds, dispatch event */
>      if (ret > 0) {
>          for (i = 0; i < npfd; i++) {
> @@ -283,10 +290,6 @@ bool aio_poll(AioContext *ctx, bool blocking)
>      npfd = 0;
>      ctx->walking_handlers--;
>  
> -    if (blocking) {
> -        atomic_sub(&ctx->notify_me, 2);
> -    }
> -
>      /* Run dispatch even if there were no readable fds to run timers */
>      if (aio_dispatch(ctx)) {
>          progress = true;
> diff --git a/aio-win32.c b/aio-win32.c
> index 2bfd5f8..33809fd 100644
> --- a/aio-win32.c
> +++ b/aio-win32.c
> @@ -326,6 +326,10 @@ bool aio_poll(AioContext *ctx, bool blocking)
>          if (timeout) {
>              aio_context_acquire(ctx);
>          }
> +
> +        /* This should be optimized... */
> +        event_notifier_test_and_clear(&ctx->notifier);
> +
>          if (blocking) {
>              assert(first);
>              atomic_sub(&ctx->notify_me, 2);
> diff --git a/async.c b/async.c
> index 9204907..120e183 100644
> --- a/async.c
> +++ b/async.c
> @@ -202,6 +202,9 @@ aio_ctx_check(GSource *source)
>      AioContext *ctx = (AioContext *) source;
>      QEMUBH *bh;
>  
> +    /* This should be optimized... */
> +    event_notifier_test_and_clear(&ctx->notifier);
> +
>      atomic_and(&ctx->notify_me, ~1);
>      for (bh = ctx->first_bh; bh; bh = bh->next) {
>          if (!bh->deleted && bh->scheduled) {
> @@ -280,6 +280,10 @@ static void aio_rfifolock_cb(void *opaque)
>      aio_notify(opaque);
>  }
>  
> +static void event_notifier_dummy_cb(EventNotifier *e)
> +{
> +}
> +
>  AioContext *aio_context_new(Error **errp)
>  {
>      int ret;
> @@ -292,7 +296,7 @@ AioContext *aio_context_new(Error **errp)
>          return NULL;
>      }
>      g_source_set_can_recurse(&ctx->source, true);
> -    aio_set_event_notifier(ctx, &ctx->notifier, NULL);
> +    aio_set_event_notifier(ctx, &ctx->notifier, event_notifier_dummy_cb);
>      ctx->thread_pool = NULL;
>      qemu_mutex_init(&ctx->bh_lock);
>      rfifolock_init(&ctx->lock, aio_rfifolock_cb, ctx);

With this patch, I've got to 500 iterations without seeing the error.

Still testing it however ...

Rich.
Paolo Bonzini July 17, 2015, 1:02 p.m. UTC | #2
On 17/07/2015 14:58, Richard W.M. Jones wrote:
> On Fri, Jul 17, 2015 at 11:30:38AM +0200, Paolo Bonzini wrote:
>> error: kvm run failed Function not implemented
>> PC=00000000bf671210  SP=00000000c00001f0
>> X00=000000000a003e70 X01=0000000000000000 X02=00000000bf680548 X03=0000000000000030
>> X04=00000000bbb5fc18 X05=00000000004b7770 X06=00000000bf721930 X07=000000000000009a
>> X08=00000000bf716858 X09=0000000000000090 X10=0000000000000000 X11=0000000000000046
>> X12=00000000a007e03a X13=0000000000000000 X14=0000000000000000 X15=0000000000000000
>> X16=00000000bf716df0 X17=0000000000000000 X18=0000000000000000 X19=00000000bf6f5f18
>> X20=0000000000000000 X21=0000000000000000 X22=0000000000000000 X23=0000000000000000
>> X24=0000000000000000 X25=0000000000000000 X26=0000000000000000 X27=0000000000000000
>> X28=0000000000000000 X29=0000000000000000 X30=0000000000000000 PSTATE=60000305 (flags -ZC-)
> 
> Vaguely reminiscent of this bug:
> 
> https://bugzilla.redhat.com/show_bug.cgi?id=1194366
> 
> (See comment 7 in particular)

Must be it, I was using an old kernel.

Thanks!

Paolo

>> diff --git a/aio-posix.c b/aio-posix.c
>> index 268d14d..d2011d0 100644
>> --- a/aio-posix.c
>> +++ b/aio-posix.c
>> @@ -273,6 +273,13 @@ bool aio_poll(AioContext *ctx, bool blocking)
>>          aio_context_acquire(ctx);
>>      }
>>  
>> +    /* This should be optimized... */
>> +    event_notifier_test_and_clear(&ctx->notifier);
>> +
>> +    if (blocking) {
>> +        atomic_sub(&ctx->notify_me, 2);
>> +    }
>> +
>>      /* if we have any readable fds, dispatch event */
>>      if (ret > 0) {
>>          for (i = 0; i < npfd; i++) {
>> @@ -283,10 +290,6 @@ bool aio_poll(AioContext *ctx, bool blocking)
>>      npfd = 0;
>>      ctx->walking_handlers--;
>>  
>> -    if (blocking) {
>> -        atomic_sub(&ctx->notify_me, 2);
>> -    }
>> -
>>      /* Run dispatch even if there were no readable fds to run timers */
>>      if (aio_dispatch(ctx)) {
>>          progress = true;
>> diff --git a/aio-win32.c b/aio-win32.c
>> index 2bfd5f8..33809fd 100644
>> --- a/aio-win32.c
>> +++ b/aio-win32.c
>> @@ -326,6 +326,10 @@ bool aio_poll(AioContext *ctx, bool blocking)
>>          if (timeout) {
>>              aio_context_acquire(ctx);
>>          }
>> +
>> +        /* This should be optimized... */
>> +        event_notifier_test_and_clear(&ctx->notifier);
>> +
>>          if (blocking) {
>>              assert(first);
>>              atomic_sub(&ctx->notify_me, 2);
>> diff --git a/async.c b/async.c
>> index 9204907..120e183 100644
>> --- a/async.c
>> +++ b/async.c
>> @@ -202,6 +202,9 @@ aio_ctx_check(GSource *source)
>>      AioContext *ctx = (AioContext *) source;
>>      QEMUBH *bh;
>>  
>> +    /* This should be optimized... */
>> +    event_notifier_test_and_clear(&ctx->notifier);
>> +
>>      atomic_and(&ctx->notify_me, ~1);
>>      for (bh = ctx->first_bh; bh; bh = bh->next) {
>>          if (!bh->deleted && bh->scheduled) {
>> @@ -280,6 +280,10 @@ static void aio_rfifolock_cb(void *opaque)
>>      aio_notify(opaque);
>>  }
>>  
>> +static void event_notifier_dummy_cb(EventNotifier *e)
>> +{
>> +}
>> +
>>  AioContext *aio_context_new(Error **errp)
>>  {
>>      int ret;
>> @@ -292,7 +296,7 @@ AioContext *aio_context_new(Error **errp)
>>          return NULL;
>>      }
>>      g_source_set_can_recurse(&ctx->source, true);
>> -    aio_set_event_notifier(ctx, &ctx->notifier, NULL);
>> +    aio_set_event_notifier(ctx, &ctx->notifier, event_notifier_dummy_cb);
>>      ctx->thread_pool = NULL;
>>      qemu_mutex_init(&ctx->bh_lock);
>>      rfifolock_init(&ctx->lock, aio_rfifolock_cb, ctx);
> 
> With this patch, I've got to 500 iterations without seeing the error.
> 
> Still testing it however ...
> 
> Rich.
>
Marc Zyngier July 17, 2015, 1:28 p.m. UTC | #3
On Fri, 17 Jul 2015 10:30:38 +0100
Paolo Bonzini <pbonzini@redhat.com> wrote:

> 
> 
> On 17/07/2015 06:44, Paolo Bonzini wrote:
> > 
> > 
> > On 16/07/2015 21:05, Richard W.M. Jones wrote:
> >>
> >> Sorry to spoil things, but I'm still seeing this bug, although it is
> >> now a lot less frequent with your patch.  I would estimate it happens
> >> more often than 1 in 5 runs with qemu.git, and probably 1 in 200 runs
> >> with qemu.git + the v2 patch series.
> >>
> >> It's the exact same hang in both cases.
> >>
> >> Is it possible that this patch doesn't completely close any race?
> >>
> >> Still, it is an improvement, so there is that.
> > 
> > Would seem at first glance like a different bug.
> > 
> > Interestingly, adding some "tracing" (qemu_clock_get_ns) makes the bug
> > more likely: now it reproduces in about 10 tries.  Of course :) adding
> > other kinds of tracing instead make it go away again (>50 tries).
> > 
> > Perhaps this:
> > 
> >    i/o thread         vcpu thread                   worker thread
> >    ---------------------------------------------------------------------
> >    lock_iothread
> >    notify_me = 1
> >    ...
> >    unlock_iothread
> >                       lock_iothread
> >                       notify_me = 3
> >                       ppoll
> >                       notify_me = 1
> >                                                      bh->scheduled = 1
> >                                                      event_notifier_set
> >                       event_notifier_test_and_clear
> >    ppoll
> >     ^^ hang
> > 	
> > In the exact shape above, it doesn't seem too likely to happen, but
> > perhaps there's another simpler case.  Still, the bug exists.
> > 
> > The above is not really related to notify_me.  Here the notification is
> > not being optimized away!  So I wonder if this one has been there forever.
> > 
> > Fam suggested putting the event_notifier_test_and_clear before
> > aio_bh_poll(), but it does not work.  I'll look more close
> > 
> > However, an unconditional event_notifier_test_and_clear is pretty
> > expensive.  On one hand, obviously correctness comes first.  On the
> > other hand, an expensive operation at the wrong place can mask the race
> > very easily; I'll let the fix run for a while, but I'm not sure if a
> > successful test really says anything useful.
> 
> So it may not be useful, but still successful test is successful. :)
> The following patch, which also includes the delta between v2 and v3
> of this series, survived 674 reboots before hitting a definitely
> unrelated problem:
> 
> error: kvm run failed Function not implemented
> PC=00000000bf671210  SP=00000000c00001f0
> X00=000000000a003e70 X01=0000000000000000 X02=00000000bf680548 X03=0000000000000030
> X04=00000000bbb5fc18 X05=00000000004b7770 X06=00000000bf721930 X07=000000000000009a
> X08=00000000bf716858 X09=0000000000000090 X10=0000000000000000 X11=0000000000000046
> X12=00000000a007e03a X13=0000000000000000 X14=0000000000000000 X15=0000000000000000
> X16=00000000bf716df0 X17=0000000000000000 X18=0000000000000000 X19=00000000bf6f5f18
> X20=0000000000000000 X21=0000000000000000 X22=0000000000000000 X23=0000000000000000
> X24=0000000000000000 X25=0000000000000000 X26=0000000000000000 X27=0000000000000000
> X28=0000000000000000 X29=0000000000000000 X30=0000000000000000 PSTATE=60000305 (flags -ZC-)
> 
> For the record, this is the kvm_run struct:
> 
> $6 = {request_interrupt_window = 0 '\000', padding1 = "\000\000\000\000\000\000", exit_reason = 0, 
>   ready_for_interrupt_injection = 0 '\000', if_flag = 0 '\000', flags = 0, cr8 = 0, apic_base = 0, {hw = {
>       hardware_exit_reason = 150994968}, fail_entry = {hardware_entry_failure_reason = 150994968}, ex = {
>       exception = 150994968, error_code = 0}, io = {direction = 24 '\030', size = 0 '\000', port = 2304, 
>       count = 0, data_offset = 144}, debug = {arch = {<No data fields>}}, mmio = {phys_addr = 150994968, 
>       data = "\220\000\000\000\000\000\000", len = 4, is_write = 0 '\000'}, hypercall = {nr = 150994968, 
>       args = {144, 4, 0, 0, 0, 0}, ret = 0, longmode = 0, pad = 0}, tpr_access = {rip = 150994968, 
>       is_write = 144, pad = 0}, s390_sieic = {icptcode = 24 '\030', ipa = 2304, ipb = 0}, 
>     s390_reset_flags = 150994968, s390_ucontrol = {trans_exc_code = 150994968, pgm_code = 144}, dcr = {
>       dcrn = 150994968, data = 0, is_write = 144 '\220'}, internal = {suberror = 150994968, ndata = 0, 
>       data = {144, 4, 0 <repeats 14 times>}}, osi = {gprs = {150994968, 144, 4, 0 <repeats 29 times>}}, 
>     papr_hcall = {nr = 150994968, ret = 144, args = {4, 0, 0, 0, 0, 0, 0, 0, 0}}, s390_tsch = {
>       subchannel_id = 24, subchannel_nr = 2304, io_int_parm = 0, io_int_word = 144, ipb = 0, 
>       dequeued = 4 '\004'}, epr = {epr = 150994968}, system_event = {type = 150994968, flags = 144}, 
>     s390_stsi = {addr = 150994968, ar = 144 '\220', reserved = 0 '\000', fc = 0 '\000', sel1 = 0 '\000', 
>       sel2 = 0}, 
>     padding = "\030\000\000\t\000\000\000\000\220\000\000\000\000\000\000\000\004", '\000' <repeats 238 times>}, kvm_valid_regs = 0, kvm_dirty_regs = 0, s = {regs = {<No data fields>}, 
>     padding = '\000' <repeats 2047 times>}}
> 
> Marc, does it ring any bell?

Well, this is an example of a guest accessing non-memory using an
instruction that we cannot safely emulate - not an IO accessor (load
multiple, for example).

In this case, we kill the guest (we could as well inject a fault).

This vcpu seems to be accessing 0x9000018 (the mmio structure points
there), but I can't immediately relate it to the content of the
registers.

What looks a bit weird is that all the registers are clamped to 32bit,
but the PSTATE indicates it is running in 64bit (EL1h, which makes the
PC utterly wrong).

What are you running there?

Thanks,

	M.

> Paolo
> 
> diff --git a/aio-posix.c b/aio-posix.c
> index 268d14d..d2011d0 100644
> --- a/aio-posix.c
> +++ b/aio-posix.c
> @@ -273,6 +273,13 @@ bool aio_poll(AioContext *ctx, bool blocking)
>          aio_context_acquire(ctx);
>      }
>  
> +    /* This should be optimized... */
> +    event_notifier_test_and_clear(&ctx->notifier);
> +
> +    if (blocking) {
> +        atomic_sub(&ctx->notify_me, 2);
> +    }
> +
>      /* if we have any readable fds, dispatch event */
>      if (ret > 0) {
>          for (i = 0; i < npfd; i++) {
> @@ -283,10 +290,6 @@ bool aio_poll(AioContext *ctx, bool blocking)
>      npfd = 0;
>      ctx->walking_handlers--;
>  
> -    if (blocking) {
> -        atomic_sub(&ctx->notify_me, 2);
> -    }
> -
>      /* Run dispatch even if there were no readable fds to run timers */
>      if (aio_dispatch(ctx)) {
>          progress = true;
> diff --git a/aio-win32.c b/aio-win32.c
> index 2bfd5f8..33809fd 100644
> --- a/aio-win32.c
> +++ b/aio-win32.c
> @@ -326,6 +326,10 @@ bool aio_poll(AioContext *ctx, bool blocking)
>          if (timeout) {
>              aio_context_acquire(ctx);
>          }
> +
> +        /* This should be optimized... */
> +        event_notifier_test_and_clear(&ctx->notifier);
> +
>          if (blocking) {
>              assert(first);
>              atomic_sub(&ctx->notify_me, 2);
> diff --git a/async.c b/async.c
> index 9204907..120e183 100644
> --- a/async.c
> +++ b/async.c
> @@ -202,6 +202,9 @@ aio_ctx_check(GSource *source)
>      AioContext *ctx = (AioContext *) source;
>      QEMUBH *bh;
>  
> +    /* This should be optimized... */
> +    event_notifier_test_and_clear(&ctx->notifier);
> +
>      atomic_and(&ctx->notify_me, ~1);
>      for (bh = ctx->first_bh; bh; bh = bh->next) {
>          if (!bh->deleted && bh->scheduled) {
> @@ -280,6 +280,10 @@ static void aio_rfifolock_cb(void *opaque)
>      aio_notify(opaque);
>  }
>  
> +static void event_notifier_dummy_cb(EventNotifier *e)
> +{
> +}
> +
>  AioContext *aio_context_new(Error **errp)
>  {
>      int ret;
> @@ -292,7 +296,7 @@ AioContext *aio_context_new(Error **errp)
>          return NULL;
>      }
>      g_source_set_can_recurse(&ctx->source, true);
> -    aio_set_event_notifier(ctx, &ctx->notifier, NULL);
> +    aio_set_event_notifier(ctx, &ctx->notifier, event_notifier_dummy_cb);
>      ctx->thread_pool = NULL;
>      qemu_mutex_init(&ctx->bh_lock);
>      rfifolock_init(&ctx->lock, aio_rfifolock_cb, ctx);
>
Laszlo Ersek July 17, 2015, 1:39 p.m. UTC | #4
On 07/17/15 15:28, Marc Zyngier wrote:
> On Fri, 17 Jul 2015 10:30:38 +0100
> Paolo Bonzini <pbonzini@redhat.com> wrote:
> 
>>
>>
>> On 17/07/2015 06:44, Paolo Bonzini wrote:
>>>
>>>
>>> On 16/07/2015 21:05, Richard W.M. Jones wrote:
>>>>
>>>> Sorry to spoil things, but I'm still seeing this bug, although it is
>>>> now a lot less frequent with your patch.  I would estimate it happens
>>>> more often than 1 in 5 runs with qemu.git, and probably 1 in 200 runs
>>>> with qemu.git + the v2 patch series.
>>>>
>>>> It's the exact same hang in both cases.
>>>>
>>>> Is it possible that this patch doesn't completely close any race?
>>>>
>>>> Still, it is an improvement, so there is that.
>>>
>>> Would seem at first glance like a different bug.
>>>
>>> Interestingly, adding some "tracing" (qemu_clock_get_ns) makes the bug
>>> more likely: now it reproduces in about 10 tries.  Of course :) adding
>>> other kinds of tracing instead make it go away again (>50 tries).
>>>
>>> Perhaps this:
>>>
>>>    i/o thread         vcpu thread                   worker thread
>>>    ---------------------------------------------------------------------
>>>    lock_iothread
>>>    notify_me = 1
>>>    ...
>>>    unlock_iothread
>>>                       lock_iothread
>>>                       notify_me = 3
>>>                       ppoll
>>>                       notify_me = 1
>>>                                                      bh->scheduled = 1
>>>                                                      event_notifier_set
>>>                       event_notifier_test_and_clear
>>>    ppoll
>>>     ^^ hang
>>> 	
>>> In the exact shape above, it doesn't seem too likely to happen, but
>>> perhaps there's another simpler case.  Still, the bug exists.
>>>
>>> The above is not really related to notify_me.  Here the notification is
>>> not being optimized away!  So I wonder if this one has been there forever.
>>>
>>> Fam suggested putting the event_notifier_test_and_clear before
>>> aio_bh_poll(), but it does not work.  I'll look more close
>>>
>>> However, an unconditional event_notifier_test_and_clear is pretty
>>> expensive.  On one hand, obviously correctness comes first.  On the
>>> other hand, an expensive operation at the wrong place can mask the race
>>> very easily; I'll let the fix run for a while, but I'm not sure if a
>>> successful test really says anything useful.
>>
>> So it may not be useful, but still successful test is successful. :)
>> The following patch, which also includes the delta between v2 and v3
>> of this series, survived 674 reboots before hitting a definitely
>> unrelated problem:
>>
>> error: kvm run failed Function not implemented
>> PC=00000000bf671210  SP=00000000c00001f0
>> X00=000000000a003e70 X01=0000000000000000 X02=00000000bf680548 X03=0000000000000030
>> X04=00000000bbb5fc18 X05=00000000004b7770 X06=00000000bf721930 X07=000000000000009a
>> X08=00000000bf716858 X09=0000000000000090 X10=0000000000000000 X11=0000000000000046
>> X12=00000000a007e03a X13=0000000000000000 X14=0000000000000000 X15=0000000000000000
>> X16=00000000bf716df0 X17=0000000000000000 X18=0000000000000000 X19=00000000bf6f5f18
>> X20=0000000000000000 X21=0000000000000000 X22=0000000000000000 X23=0000000000000000
>> X24=0000000000000000 X25=0000000000000000 X26=0000000000000000 X27=0000000000000000
>> X28=0000000000000000 X29=0000000000000000 X30=0000000000000000 PSTATE=60000305 (flags -ZC-)
>>
>> For the record, this is the kvm_run struct:
>>
>> $6 = {request_interrupt_window = 0 '\000', padding1 = "\000\000\000\000\000\000", exit_reason = 0, 
>>   ready_for_interrupt_injection = 0 '\000', if_flag = 0 '\000', flags = 0, cr8 = 0, apic_base = 0, {hw = {
>>       hardware_exit_reason = 150994968}, fail_entry = {hardware_entry_failure_reason = 150994968}, ex = {
>>       exception = 150994968, error_code = 0}, io = {direction = 24 '\030', size = 0 '\000', port = 2304, 
>>       count = 0, data_offset = 144}, debug = {arch = {<No data fields>}}, mmio = {phys_addr = 150994968, 
>>       data = "\220\000\000\000\000\000\000", len = 4, is_write = 0 '\000'}, hypercall = {nr = 150994968, 
>>       args = {144, 4, 0, 0, 0, 0}, ret = 0, longmode = 0, pad = 0}, tpr_access = {rip = 150994968, 
>>       is_write = 144, pad = 0}, s390_sieic = {icptcode = 24 '\030', ipa = 2304, ipb = 0}, 
>>     s390_reset_flags = 150994968, s390_ucontrol = {trans_exc_code = 150994968, pgm_code = 144}, dcr = {
>>       dcrn = 150994968, data = 0, is_write = 144 '\220'}, internal = {suberror = 150994968, ndata = 0, 
>>       data = {144, 4, 0 <repeats 14 times>}}, osi = {gprs = {150994968, 144, 4, 0 <repeats 29 times>}}, 
>>     papr_hcall = {nr = 150994968, ret = 144, args = {4, 0, 0, 0, 0, 0, 0, 0, 0}}, s390_tsch = {
>>       subchannel_id = 24, subchannel_nr = 2304, io_int_parm = 0, io_int_word = 144, ipb = 0, 
>>       dequeued = 4 '\004'}, epr = {epr = 150994968}, system_event = {type = 150994968, flags = 144}, 
>>     s390_stsi = {addr = 150994968, ar = 144 '\220', reserved = 0 '\000', fc = 0 '\000', sel1 = 0 '\000', 
>>       sel2 = 0}, 
>>     padding = "\030\000\000\t\000\000\000\000\220\000\000\000\000\000\000\000\004", '\000' <repeats 238 times>}, kvm_valid_regs = 0, kvm_dirty_regs = 0, s = {regs = {<No data fields>}, 
>>     padding = '\000' <repeats 2047 times>}}
>>
>> Marc, does it ring any bell?
> 
> Well, this is an example of a guest accessing non-memory using an
> instruction that we cannot safely emulate - not an IO accessor (load
> multiple, for example).
> 
> In this case, we kill the guest (we could as well inject a fault).
> 
> This vcpu seems to be accessing 0x9000018 (the mmio structure points
> there), but I can't immediately relate it to the content of the
> registers.

    [VIRT_UART] =               { 0x09000000, 0x00001000 },

Thanks
Laszlo

> 
> What looks a bit weird is that all the registers are clamped to 32bit,
> but the PSTATE indicates it is running in 64bit (EL1h, which makes the
> PC being utterly wrong).
> 
> What are you running there?
> 
> Thanks,
> 
> 	M.
> 
>> Paolo
>>
>> diff --git a/aio-posix.c b/aio-posix.c
>> index 268d14d..d2011d0 100644
>> --- a/aio-posix.c
>> +++ b/aio-posix.c
>> @@ -273,6 +273,13 @@ bool aio_poll(AioContext *ctx, bool blocking)
>>          aio_context_acquire(ctx);
>>      }
>>  
>> +    /* This should be optimized... */
>> +    event_notifier_test_and_clear(&ctx->notifier);
>> +
>> +    if (blocking) {
>> +        atomic_sub(&ctx->notify_me, 2);
>> +    }
>> +
>>      /* if we have any readable fds, dispatch event */
>>      if (ret > 0) {
>>          for (i = 0; i < npfd; i++) {
>> @@ -283,10 +290,6 @@ bool aio_poll(AioContext *ctx, bool blocking)
>>      npfd = 0;
>>      ctx->walking_handlers--;
>>  
>> -    if (blocking) {
>> -        atomic_sub(&ctx->notify_me, 2);
>> -    }
>> -
>>      /* Run dispatch even if there were no readable fds to run timers */
>>      if (aio_dispatch(ctx)) {
>>          progress = true;
>> diff --git a/aio-win32.c b/aio-win32.c
>> index 2bfd5f8..33809fd 100644
>> --- a/aio-win32.c
>> +++ b/aio-win32.c
>> @@ -326,6 +326,10 @@ bool aio_poll(AioContext *ctx, bool blocking)
>>          if (timeout) {
>>              aio_context_acquire(ctx);
>>          }
>> +
>> +        /* This should be optimized... */
>> +        event_notifier_test_and_clear(&ctx->notifier);
>> +
>>          if (blocking) {
>>              assert(first);
>>              atomic_sub(&ctx->notify_me, 2);
>> diff --git a/async.c b/async.c
>> index 9204907..120e183 100644
>> --- a/async.c
>> +++ b/async.c
>> @@ -202,6 +202,9 @@ aio_ctx_check(GSource *source)
>>      AioContext *ctx = (AioContext *) source;
>>      QEMUBH *bh;
>>  
>> +    /* This should be optimized... */
>> +    event_notifier_test_and_clear(&ctx->notifier);
>> +
>>      atomic_and(&ctx->notify_me, ~1);
>>      for (bh = ctx->first_bh; bh; bh = bh->next) {
>>          if (!bh->deleted && bh->scheduled) {
>> @@ -280,6 +280,10 @@ static void aio_rfifolock_cb(void *opaque)
>>      aio_notify(opaque);
>>  }
>>  
>> +static void event_notifier_dummy_cb(EventNotifier *e)
>> +{
>> +}
>> +
>>  AioContext *aio_context_new(Error **errp)
>>  {
>>      int ret;
>> @@ -292,7 +296,7 @@ AioContext *aio_context_new(Error **errp)
>>          return NULL;
>>      }
>>      g_source_set_can_recurse(&ctx->source, true);
>> -    aio_set_event_notifier(ctx, &ctx->notifier, NULL);
>> +    aio_set_event_notifier(ctx, &ctx->notifier, event_notifier_dummy_cb);
>>      ctx->thread_pool = NULL;
>>      qemu_mutex_init(&ctx->bh_lock);
>>      rfifolock_init(&ctx->lock, aio_rfifolock_cb, ctx);
>>
> 
> 
>
Marc Zyngier July 17, 2015, 1:48 p.m. UTC | #5
On Fri, 17 Jul 2015 14:39:55 +0100
Laszlo Ersek <lersek@redhat.com> wrote:

> On 07/17/15 15:28, Marc Zyngier wrote:
> > On Fri, 17 Jul 2015 10:30:38 +0100
> > Paolo Bonzini <pbonzini@redhat.com> wrote:
> > 
> >>
> >>
> >> On 17/07/2015 06:44, Paolo Bonzini wrote:
> >>>
> >>>
> >>> On 16/07/2015 21:05, Richard W.M. Jones wrote:
> >>>>
> >>>> Sorry to spoil things, but I'm still seeing this bug, although it is
> >>>> now a lot less frequent with your patch.  I would estimate it happens
> >>>> more often than 1 in 5 runs with qemu.git, and probably 1 in 200 runs
> >>>> with qemu.git + the v2 patch series.
> >>>>
> >>>> It's the exact same hang in both cases.
> >>>>
> >>>> Is it possible that this patch doesn't completely close any race?
> >>>>
> >>>> Still, it is an improvement, so there is that.
> >>>
> >>> Would seem at first glance like a different bug.
> >>>
> >>> Interestingly, adding some "tracing" (qemu_clock_get_ns) makes the bug
> >>> more likely: now it reproduces in about 10 tries.  Of course :) adding
> >>> other kinds of tracing instead make it go away again (>50 tries).
> >>>
> >>> Perhaps this:
> >>>
> >>>    i/o thread         vcpu thread                   worker thread
> >>>    ---------------------------------------------------------------------
> >>>    lock_iothread
> >>>    notify_me = 1
> >>>    ...
> >>>    unlock_iothread
> >>>                       lock_iothread
> >>>                       notify_me = 3
> >>>                       ppoll
> >>>                       notify_me = 1
> >>>                                                      bh->scheduled = 1
> >>>                                                      event_notifier_set
> >>>                       event_notifier_test_and_clear
> >>>    ppoll
> >>>     ^^ hang
> >>> 	
> >>> In the exact shape above, it doesn't seem too likely to happen, but
> >>> perhaps there's another simpler case.  Still, the bug exists.
> >>>
> >>> The above is not really related to notify_me.  Here the notification is
> >>> not being optimized away!  So I wonder if this one has been there forever.
> >>>
> >>> Fam suggested putting the event_notifier_test_and_clear before
> >>> aio_bh_poll(), but it does not work.  I'll look more close
> >>>
> >>> However, an unconditional event_notifier_test_and_clear is pretty
> >>> expensive.  On one hand, obviously correctness comes first.  On the
> >>> other hand, an expensive operation at the wrong place can mask the race
> >>> very easily; I'll let the fix run for a while, but I'm not sure if a
> >>> successful test really says anything useful.
> >>
> >> So it may not be useful, but still successful test is successful. :)
> >> The following patch, which also includes the delta between v2 and v3
> >> of this series, survived 674 reboots before hitting a definitely
> >> unrelated problem:
> >>
> >> error: kvm run failed Function not implemented
> >> PC=00000000bf671210  SP=00000000c00001f0
> >> X00=000000000a003e70 X01=0000000000000000 X02=00000000bf680548 X03=0000000000000030
> >> X04=00000000bbb5fc18 X05=00000000004b7770 X06=00000000bf721930 X07=000000000000009a
> >> X08=00000000bf716858 X09=0000000000000090 X10=0000000000000000 X11=0000000000000046
> >> X12=00000000a007e03a X13=0000000000000000 X14=0000000000000000 X15=0000000000000000
> >> X16=00000000bf716df0 X17=0000000000000000 X18=0000000000000000 X19=00000000bf6f5f18
> >> X20=0000000000000000 X21=0000000000000000 X22=0000000000000000 X23=0000000000000000
> >> X24=0000000000000000 X25=0000000000000000 X26=0000000000000000 X27=0000000000000000
> >> X28=0000000000000000 X29=0000000000000000 X30=0000000000000000 PSTATE=60000305 (flags -ZC-)
> >>
> >> For the record, this is the kvm_run struct:
> >>
> >> $6 = {request_interrupt_window = 0 '\000', padding1 = "\000\000\000\000\000\000", exit_reason = 0, 
> >>   ready_for_interrupt_injection = 0 '\000', if_flag = 0 '\000', flags = 0, cr8 = 0, apic_base = 0, {hw = {
> >>       hardware_exit_reason = 150994968}, fail_entry = {hardware_entry_failure_reason = 150994968}, ex = {
> >>       exception = 150994968, error_code = 0}, io = {direction = 24 '\030', size = 0 '\000', port = 2304, 
> >>       count = 0, data_offset = 144}, debug = {arch = {<No data fields>}}, mmio = {phys_addr = 150994968, 
> >>       data = "\220\000\000\000\000\000\000", len = 4, is_write = 0 '\000'}, hypercall = {nr = 150994968, 
> >>       args = {144, 4, 0, 0, 0, 0}, ret = 0, longmode = 0, pad = 0}, tpr_access = {rip = 150994968, 
> >>       is_write = 144, pad = 0}, s390_sieic = {icptcode = 24 '\030', ipa = 2304, ipb = 0}, 
> >>     s390_reset_flags = 150994968, s390_ucontrol = {trans_exc_code = 150994968, pgm_code = 144}, dcr = {
> >>       dcrn = 150994968, data = 0, is_write = 144 '\220'}, internal = {suberror = 150994968, ndata = 0, 
> >>       data = {144, 4, 0 <repeats 14 times>}}, osi = {gprs = {150994968, 144, 4, 0 <repeats 29 times>}}, 
> >>     papr_hcall = {nr = 150994968, ret = 144, args = {4, 0, 0, 0, 0, 0, 0, 0, 0}}, s390_tsch = {
> >>       subchannel_id = 24, subchannel_nr = 2304, io_int_parm = 0, io_int_word = 144, ipb = 0, 
> >>       dequeued = 4 '\004'}, epr = {epr = 150994968}, system_event = {type = 150994968, flags = 144}, 
> >>     s390_stsi = {addr = 150994968, ar = 144 '\220', reserved = 0 '\000', fc = 0 '\000', sel1 = 0 '\000', 
> >>       sel2 = 0}, 
> >>     padding = "\030\000\000\t\000\000\000\000\220\000\000\000\000\000\000\000\004", '\000' <repeats 238 times>}, kvm_valid_regs = 0, kvm_dirty_regs = 0, s = {regs = {<No data fields>}, 
> >>     padding = '\000' <repeats 2047 times>}}
> >>
> >> Marc, does it ring any bell?
> > 
> > Well, this is an example of a guest accessing non-memory using an
> > instruction that we cannot safely emulate - not an IO accessor (load
> > multiple, for example).
> > 
> > In this case, we kill the guest (we could as well inject a fault).
> > 
> > This vcpu seems to be accessing 0x9000018 (the mmio structure points
> > there), but I can't immediately relate it to the content of the
> > registers.
> 
>     [VIRT_UART] =               { 0x09000000, 0x00001000 },
> 

Still: there is nothing in the registers that remotely points to that
area. X0 is the closest, but it'd take a big negative offset to get
there.

Is that a Linux kernel? or something else?

	M.
Richard W.M. Jones July 17, 2015, 1:53 p.m. UTC | #6
On Fri, Jul 17, 2015 at 02:48:40PM +0100, Marc Zyngier wrote:
> Still: there is nothing in the registers that remotely points to that
> area. X0 is the closest, but it'd take a big negative offset to get
> there.
> 
> Is that a Linux kernel? or something else?

You're sure it's not this one?
https://bugzilla.redhat.com/show_bug.cgi?id=1194366
That was caused by ftrace screwing up guest memory, so it was
effectively running random code.  It is also fixed (by you in fact).

Rich.
Laszlo Ersek July 17, 2015, 1:57 p.m. UTC | #7
On 07/17/15 15:48, Marc Zyngier wrote:
> On Fri, 17 Jul 2015 14:39:55 +0100
> Laszlo Ersek <lersek@redhat.com> wrote:
> 
>> On 07/17/15 15:28, Marc Zyngier wrote:
>>> On Fri, 17 Jul 2015 10:30:38 +0100
>>> Paolo Bonzini <pbonzini@redhat.com> wrote:
>>>
>>>>
>>>>
>>>> On 17/07/2015 06:44, Paolo Bonzini wrote:
>>>>>
>>>>>
>>>>> On 16/07/2015 21:05, Richard W.M. Jones wrote:
>>>>>>
>>>>>> Sorry to spoil things, but I'm still seeing this bug, although it is
>>>>>> now a lot less frequent with your patch.  I would estimate it happens
>>>>>> more often than 1 in 5 runs with qemu.git, and probably 1 in 200 runs
>>>>>> with qemu.git + the v2 patch series.
>>>>>>
>>>>>> It's the exact same hang in both cases.
>>>>>>
>>>>>> Is it possible that this patch doesn't completely close any race?
>>>>>>
>>>>>> Still, it is an improvement, so there is that.
>>>>>
>>>>> Would seem at first glance like a different bug.
>>>>>
>>>>> Interestingly, adding some "tracing" (qemu_clock_get_ns) makes the bug
>>>>> more likely: now it reproduces in about 10 tries.  Of course :) adding
>>>>> other kinds of tracing instead make it go away again (>50 tries).
>>>>>
>>>>> Perhaps this:
>>>>>
>>>>>    i/o thread         vcpu thread                   worker thread
>>>>>    ---------------------------------------------------------------------
>>>>>    lock_iothread
>>>>>    notify_me = 1
>>>>>    ...
>>>>>    unlock_iothread
>>>>>                       lock_iothread
>>>>>                       notify_me = 3
>>>>>                       ppoll
>>>>>                       notify_me = 1
>>>>>                                                      bh->scheduled = 1
>>>>>                                                      event_notifier_set
>>>>>                       event_notifier_test_and_clear
>>>>>    ppoll
>>>>>     ^^ hang
>>>>> 	
>>>>> In the exact shape above, it doesn't seem too likely to happen, but
>>>>> perhaps there's another simpler case.  Still, the bug exists.
>>>>>
>>>>> The above is not really related to notify_me.  Here the notification is
>>>>> not being optimized away!  So I wonder if this one has been there forever.
>>>>>
>>>>> Fam suggested putting the event_notifier_test_and_clear before
>>>>> aio_bh_poll(), but it does not work.  I'll look more close
>>>>>
>>>>> However, an unconditional event_notifier_test_and_clear is pretty
>>>>> expensive.  On one hand, obviously correctness comes first.  On the
>>>>> other hand, an expensive operation at the wrong place can mask the race
>>>>> very easily; I'll let the fix run for a while, but I'm not sure if a
>>>>> successful test really says anything useful.
>>>>
>>>> So it may not be useful, but still successful test is successful. :)
>>>> The following patch, which also includes the delta between v2 and v3
>>>> of this series, survived 674 reboots before hitting a definitely
>>>> unrelated problem:
>>>>
>>>> error: kvm run failed Function not implemented
>>>> PC=00000000bf671210  SP=00000000c00001f0
>>>> X00=000000000a003e70 X01=0000000000000000 X02=00000000bf680548 X03=0000000000000030
>>>> X04=00000000bbb5fc18 X05=00000000004b7770 X06=00000000bf721930 X07=000000000000009a
>>>> X08=00000000bf716858 X09=0000000000000090 X10=0000000000000000 X11=0000000000000046
>>>> X12=00000000a007e03a X13=0000000000000000 X14=0000000000000000 X15=0000000000000000
>>>> X16=00000000bf716df0 X17=0000000000000000 X18=0000000000000000 X19=00000000bf6f5f18
>>>> X20=0000000000000000 X21=0000000000000000 X22=0000000000000000 X23=0000000000000000
>>>> X24=0000000000000000 X25=0000000000000000 X26=0000000000000000 X27=0000000000000000
>>>> X28=0000000000000000 X29=0000000000000000 X30=0000000000000000 PSTATE=60000305 (flags -ZC-)
>>>>
>>>> For the record, this is the kvm_run struct:
>>>>
>>>> $6 = {request_interrupt_window = 0 '\000', padding1 = "\000\000\000\000\000\000", exit_reason = 0, 
>>>>   ready_for_interrupt_injection = 0 '\000', if_flag = 0 '\000', flags = 0, cr8 = 0, apic_base = 0, {hw = {
>>>>       hardware_exit_reason = 150994968}, fail_entry = {hardware_entry_failure_reason = 150994968}, ex = {
>>>>       exception = 150994968, error_code = 0}, io = {direction = 24 '\030', size = 0 '\000', port = 2304, 
>>>>       count = 0, data_offset = 144}, debug = {arch = {<No data fields>}}, mmio = {phys_addr = 150994968, 
>>>>       data = "\220\000\000\000\000\000\000", len = 4, is_write = 0 '\000'}, hypercall = {nr = 150994968, 
>>>>       args = {144, 4, 0, 0, 0, 0}, ret = 0, longmode = 0, pad = 0}, tpr_access = {rip = 150994968, 
>>>>       is_write = 144, pad = 0}, s390_sieic = {icptcode = 24 '\030', ipa = 2304, ipb = 0}, 
>>>>     s390_reset_flags = 150994968, s390_ucontrol = {trans_exc_code = 150994968, pgm_code = 144}, dcr = {
>>>>       dcrn = 150994968, data = 0, is_write = 144 '\220'}, internal = {suberror = 150994968, ndata = 0, 
>>>>       data = {144, 4, 0 <repeats 14 times>}}, osi = {gprs = {150994968, 144, 4, 0 <repeats 29 times>}}, 
>>>>     papr_hcall = {nr = 150994968, ret = 144, args = {4, 0, 0, 0, 0, 0, 0, 0, 0}}, s390_tsch = {
>>>>       subchannel_id = 24, subchannel_nr = 2304, io_int_parm = 0, io_int_word = 144, ipb = 0, 
>>>>       dequeued = 4 '\004'}, epr = {epr = 150994968}, system_event = {type = 150994968, flags = 144}, 
>>>>     s390_stsi = {addr = 150994968, ar = 144 '\220', reserved = 0 '\000', fc = 0 '\000', sel1 = 0 '\000', 
>>>>       sel2 = 0}, 
>>>>     padding = "\030\000\000\t\000\000\000\000\220\000\000\000\000\000\000\000\004", '\000' <repeats 238 times>}, kvm_valid_regs = 0, kvm_dirty_regs = 0, s = {regs = {<No data fields>}, 
>>>>     padding = '\000' <repeats 2047 times>}}
>>>>
>>>> Marc, does it ring any bell?
>>>
>>> Well, this is an example of a guest accessing non-memory using an
>>> instruction that we cannot safely emulate - not an IO accessor (load
>>> multiple, for example).
>>>
>>> In this case, we kill the guest (we could as well inject a fault).
>>>
>>> This vcpu seems to be accessing 0x9000018 (the mmio structure points
>>> there), but I can't immediately relate it to the content of the
>>> registers.
>>
>>     [VIRT_UART] =               { 0x09000000, 0x00001000 },
>>
> 
> Still: there is nothing in the registers that remotely points to that
> area. X0 is the closest, but it'd take a big negative offset to get
> there.
> 
> Is that a Linux kernel? or something else?

Assuming I managed to decipher the nesting of the contexts, the register
dump was pasted by Paolo. IIRC Paolo has been using the guest firmware
(the ArmVirtPkg/ArmVirtQemu.dsc platform build in edk2, also known as
"AAVMF" sometimes) to reproduce the issue.

Thanks
Laszlo
Marc Zyngier July 17, 2015, 2:03 p.m. UTC | #8
On Fri, 17 Jul 2015 14:53:06 +0100
"Richard W.M. Jones" <rjones@redhat.com> wrote:

> On Fri, Jul 17, 2015 at 02:48:40PM +0100, Marc Zyngier wrote:
> > Still: there is nothing in the registers that remotely points to that
> > area. X0 is the closest, but it'd take a big negative offset to get
> > there.
> > 
> > Is that a Linux kernel? or something else?
> 
> You're sure it's not this one?
> https://bugzilla.redhat.com/show_bug.cgi?id=1194366
> That was caused by ftrace screwing up guest memory, so it was
> effectively running random code.  It is also fixed (by you in fact).

Don't think so. The bug you quote was the guest kernel being buggy, and
touching non-memory space.

This new issue seems different - this is not a Linux kernel, by the
look of it.

	M.
Paolo Bonzini July 17, 2015, 2:04 p.m. UTC | #9
On 17/07/2015 15:28, Marc Zyngier wrote:
> > 
> > Marc, does it ring any bell?
> Well, this is an example of a guest accessing non-memory using an
> instruction that we cannot safely emulate - not an IO accessor (load
> multiple, for example).
> 
> In this case, we kill the guest (we could as well inject a fault).

Yup, I later found this in the dmesg:

[1875219.903969] kvm [14843]: load/store instruction decoding not implemented

> This vcpu seems to be accessing 0x9000018 (the mmio structure points
> there), but I can't immediately relate it to the content of the
> registers.

0x9000018 is the pl011, which makes some sense.

Have you ever seen a corrupted register dump?  The guest RAM goes
from 0x40000000 to 0xbfffffff, so SP is pointing outside memory.

> > PC=00000000bf671210  SP=00000000c00001f0
> > X00=000000000a003e70 X01=0000000000000000 X02=00000000bf680548 X03=0000000000000030
> > X04=00000000bbb5fc18 X05=00000000004b7770 X06=00000000bf721930 X07=000000000000009a
> > X08=00000000bf716858 X09=0000000000000090 X10=0000000000000000 X11=0000000000000046
> > X12=00000000a007e03a X13=0000000000000000 X14=0000000000000000 X15=0000000000000000
> > X16=00000000bf716df0 X17=0000000000000000 X18=0000000000000000 X19=00000000bf6f5f18
> > X20=0000000000000000 X21=0000000000000000 X22=0000000000000000 X23=0000000000000000
> > X24=0000000000000000 X25=0000000000000000 X26=0000000000000000 X27=0000000000000000
> > X28=0000000000000000 X29=0000000000000000 X30=0000000000000000 PSTATE=60000305 (flags -ZC-)

If the register dump is not corrupted, execution went in the weeds...
I don't have the guest anymore but it's just firmware so the memory
contents are well reproducible:

0x00000000bf671200:  a97d6ffa      ldmdbge	sp!, {r1, r3, r4, r5, r6, r7, r8, r9, sl, fp, sp, lr}^
0x00000000bf671204:  a97e77fc      ldmdbge	lr!, {r2, r3, r4, r5, r6, r7, r8, r9, sl, ip, sp, lr}^
0x00000000bf671208:  f85f03fe      undefined instruction 0xf85f03fe
0x00000000bf67120c:  910803ff      strdls	r0, [r8, -pc]
0x00000000bf671210:  ad7007e0      ldclge	7, cr0, [r0, #-896]!
0x00000000bf671214:  ad710fe2      ldclge	15, cr0, [r1, #-904]!
0x00000000bf671218:  ad7217e4      ldclge	7, cr1, [r2, #-912]!
0x00000000bf67121c:  ad731fe6      ldclge	15, cr1, [r3, #-920]!
0x00000000bf671220:  ad7427e8      ldclge	7, cr2, [r4, #-928]!
0x00000000bf671224:  ad752fea      ldclge	15, cr2, [r5, #-936]!

> What looks a bit weird is that all the registers are clamped to 32bit,
> but the PSTATE indicates it is running in 64bit (EL1h, which makes the
> PC being utterly wrong).

The PC can be okay since UEFI code doesn't really use virtual addressing,
but the other registers are weird indeed.

> What are you running there?

Just firmware.  It's a UEFI reboot loop (as soon as you get to the
UEFI shell QEMU exits and the script starts a new one).

The kernel is an old 3.19 one, I'll upgrade and retest.

Paolo
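
As background for the "load/store instruction decoding not implemented" line:
KVM on ARM can only emulate a trapped MMIO access when the fault syndrome fully
describes it, which in practice means a single plain load or store; anything
compiled into a load/store pair or load multiple cannot be decoded, which is the
case Marc describes above.  The sketch below is illustrative only (not code from
this thread); the addresses follow the virt board layout Laszlo quoted, where
offset 0x18 into the PL011 is its flag register.

/* Illustrative only, not from this thread: the kind of MMIO accessor that
 * KVM/ARM can emulate.  A trapped access is only decodable when the ESR
 * syndrome describes it (ISV=1), i.e. a single plain load/store; a struct
 * copy or anything compiled to a load/store pair or load multiple produces
 * the "load/store instruction decoding not implemented" kill seen above.
 *
 * Addresses follow the virt board layout quoted earlier: PL011 UART at
 * 0x09000000, so 0x09000018 is its flag register (UARTFR).
 */
#include <stdint.h>

#define PL011_BASE  0x09000000UL
#define UARTDR      0x000           /* data register       */
#define UARTFR      0x018           /* flag register       */
#define UARTFR_TXFF (1u << 5)       /* transmit FIFO full  */

static inline uint32_t mmio_read32(uintptr_t addr)
{
    return *(volatile uint32_t *)addr;      /* single LDR: emulatable */
}

static inline void mmio_write32(uintptr_t addr, uint32_t val)
{
    *(volatile uint32_t *)addr = val;       /* single STR: emulatable */
}

/* Polled-mode character output, as guest firmware typically does it. */
void pl011_putc(char c)
{
    while (mmio_read32(PL011_BASE + UARTFR) & UARTFR_TXFF) {
        /* spin until there is room in the TX FIFO */
    }
    mmio_write32(PL011_BASE + UARTDR, (uint32_t)(uint8_t)c);
}
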
Marc Zyngier July 17, 2015, 2:18 p.m. UTC | #10
On Fri, 17 Jul 2015 15:04:27 +0100
Paolo Bonzini <pbonzini@redhat.com> wrote:

> 
> 
> On 17/07/2015 15:28, Marc Zyngier wrote:
> > > 
> > > Marc, does it ring any bell?
> > Well, this is an example of a guest accessing non-memory using an
> > instruction that we cannot safely emulate - not an IO accessor (load
> > multiple, for example).
> > 
> > In this case, we kill the guest (we could as well inject a fault).
> 
> Yup, I later found this in the dmesg:
> 
> [1875219.903969] kvm [14843]: load/store instruction decoding not implemented
> 
> > This vcpu seems to be accessing 0x9000018 (the mmio structure points
> > there), but I can't immediately relate it to the content of the
> > registers.
> 
> 0x9000018 is the pl011, which makes some sense.
> 
> Have you ever seen a corrupted register dump?  The guest RAM goes
> from 0x40000000 to 0xbfffffff, so SP is pointing outside memory.

I've never seen such a corruption - so far.

> > > PC=00000000bf671210  SP=00000000c00001f0
> > > X00=000000000a003e70 X01=0000000000000000 X02=00000000bf680548 X03=0000000000000030
> > > X04=00000000bbb5fc18 X05=00000000004b7770 X06=00000000bf721930 X07=000000000000009a
> > > X08=00000000bf716858 X09=0000000000000090 X10=0000000000000000 X11=0000000000000046
> > > X12=00000000a007e03a X13=0000000000000000 X14=0000000000000000 X15=0000000000000000
> > > X16=00000000bf716df0 X17=0000000000000000 X18=0000000000000000 X19=00000000bf6f5f18
> > > X20=0000000000000000 X21=0000000000000000 X22=0000000000000000 X23=0000000000000000
> > > X24=0000000000000000 X25=0000000000000000 X26=0000000000000000 X27=0000000000000000
> > > X28=0000000000000000 X29=0000000000000000 X30=0000000000000000 PSTATE=60000305 (flags -ZC-)
> 
> If the register dump is not corrupted, execution went in the weeds...
> I don't have the guest anymore but it's just firmware so the memory
> contents are well reproducible:
> 
> 0x00000000bf671200:  a97d6ffa      ldmdbge	sp!, {r1, r3, r4, r5, r6, r7, r8, r9, sl, fp, sp, lr}^
> 0x00000000bf671204:  a97e77fc      ldmdbge	lr!, {r2, r3, r4, r5, r6, r7, r8, r9, sl, ip, sp, lr}^
> 0x00000000bf671208:  f85f03fe      undefined instruction 0xf85f03fe
> 0x00000000bf67120c:  910803ff      strdls	r0, [r8, -pc]
> 0x00000000bf671210:  ad7007e0      ldclge	7, cr0, [r0, #-896]!
> 0x00000000bf671214:  ad710fe2      ldclge	15, cr0, [r1, #-904]!
> 0x00000000bf671218:  ad7217e4      ldclge	7, cr1, [r2, #-912]!
> 0x00000000bf67121c:  ad731fe6      ldclge	15, cr1, [r3, #-920]!
> 0x00000000bf671220:  ad7427e8      ldclge	7, cr2, [r4, #-928]!
> 0x00000000bf671224:  ad752fea      ldclge	15, cr2, [r5, #-936]!

But that's all 32bit code, and your guest was running in 64bit. What
does it look like as A64?

> > What looks a bit weird is that all the registers are clamped to 32bit,
> > but the PSTATE indicates it is running in 64bit (EL1h, which makes the
> > PC being utterly wrong).
> 
> The PC can be okay since UEFI code doesn't really use virtual addressing,
> but the other registers are weird indeed.

It definitely looks like something trampled on your register file. KVM
doesn't do that at all (we use the whole AArch64 register file
irrespective of the execution state).

Is your UEFI guest 32 or 64bit?

	M.
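
Marc's question (what do those words look like as A64?) can be answered
mechanically by re-disassembling the dumped words in AArch64 mode.  Below is a
small sketch using the Capstone disassembler; Capstone is an outside library,
not something used in this thread, and the expected output is not reproduced
here.

/* a64_redisasm.c: illustrative only.  Re-disassemble the instruction words
 * from Paolo's dump (0xbf671200..0xbf671224) as A64 instead of A32, using
 * the Capstone library.  Assumes a little-endian host so that casting the
 * word array to bytes gives the right byte order.
 * Build with: gcc a64_redisasm.c -lcapstone
 */
#include <capstone/capstone.h>
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    static const uint32_t words[] = {
        0xa97d6ffa, 0xa97e77fc, 0xf85f03fe, 0x910803ff,
        0xad7007e0, 0xad710fe2, 0xad7217e4, 0xad731fe6,
        0xad7427e8, 0xad752fea,
    };
    csh handle;
    cs_insn *insn;
    size_t n;

    if (cs_open(CS_ARCH_ARM64, CS_MODE_LITTLE_ENDIAN, &handle) != CS_ERR_OK) {
        return 1;
    }
    /* count = 0: disassemble everything (stops at the first word that does
     * not decode as A64). */
    n = cs_disasm(handle, (const uint8_t *)words, sizeof(words),
                  0xbf671200, 0, &insn);
    for (size_t i = 0; i < n; i++) {
        printf("0x%" PRIx64 ":\t%s\t%s\n",
               insn[i].address, insn[i].mnemonic, insn[i].op_str);
    }
    if (n > 0) {
        cs_free(insn, n);
    }
    cs_close(&handle);
    return 0;
}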

Patch

diff --git a/aio-posix.c b/aio-posix.c
index 268d14d..d2011d0 100644
--- a/aio-posix.c
+++ b/aio-posix.c
@@ -273,6 +273,13 @@  bool aio_poll(AioContext *ctx, bool blocking)
         aio_context_acquire(ctx);
     }
 
+    /* This should be optimized... */
+    event_notifier_test_and_clear(&ctx->notifier);
+
+    if (blocking) {
+        atomic_sub(&ctx->notify_me, 2);
+    }
+
     /* if we have any readable fds, dispatch event */
     if (ret > 0) {
         for (i = 0; i < npfd; i++) {
@@ -283,10 +290,6 @@  bool aio_poll(AioContext *ctx, bool blocking)
     npfd = 0;
     ctx->walking_handlers--;
 
-    if (blocking) {
-        atomic_sub(&ctx->notify_me, 2);
-    }
-
     /* Run dispatch even if there were no readable fds to run timers */
     if (aio_dispatch(ctx)) {
         progress = true;
diff --git a/aio-win32.c b/aio-win32.c
index 2bfd5f8..33809fd 100644
--- a/aio-win32.c
+++ b/aio-win32.c
@@ -326,6 +326,10 @@  bool aio_poll(AioContext *ctx, bool blocking)
         if (timeout) {
             aio_context_acquire(ctx);
         }
+
+        /* This should be optimized... */
+        event_notifier_test_and_clear(&ctx->notifier);
+
         if (blocking) {
             assert(first);
             atomic_sub(&ctx->notify_me, 2);
diff --git a/async.c b/async.c
index 9204907..120e183 100644
--- a/async.c
+++ b/async.c
@@ -202,6 +202,9 @@  aio_ctx_check(GSource *source)
     AioContext *ctx = (AioContext *) source;
     QEMUBH *bh;
 
+    /* This should be optimized... */
+    event_notifier_test_and_clear(&ctx->notifier);
+
     atomic_and(&ctx->notify_me, ~1);
     for (bh = ctx->first_bh; bh; bh = bh->next) {
         if (!bh->deleted && bh->scheduled) {
@@ -280,6 +280,10 @@  static void aio_rfifolock_cb(void *opaque)
     aio_notify(opaque);
 }
 
+static void event_notifier_dummy_cb(EventNotifier *e)
+{
+}
+
 AioContext *aio_context_new(Error **errp)
 {
     int ret;
@@ -292,7 +296,7 @@  AioContext *aio_context_new(Error **errp)
         return NULL;
     }
     g_source_set_can_recurse(&ctx->source, true);
-    aio_set_event_notifier(ctx, &ctx->notifier, NULL);
+    aio_set_event_notifier(ctx, &ctx->notifier, event_notifier_dummy_cb);
     ctx->thread_pool = NULL;
     qemu_mutex_init(&ctx->bh_lock);
     rfifolock_init(&ctx->lock, aio_rfifolock_cb, ctx);