diff mbox

[Qemu-block] RFC cdrom in own thread?

Message ID 558415C8.3060207@kamp.de
State New
Headers show

Commit Message

Peter Lieven June 19, 2015, 1:14 p.m. UTC
Am 18.06.2015 um 11:36 schrieb Stefan Hajnoczi:
> On Thu, Jun 18, 2015 at 10:29 AM, Peter Lieven <pl@kamp.de> wrote:
>> Am 18.06.2015 um 10:42 schrieb Kevin Wolf:
>>> Am 18.06.2015 um 10:30 hat Peter Lieven geschrieben:
>>>> Am 18.06.2015 um 09:45 schrieb Kevin Wolf:
>>>>> Am 18.06.2015 um 09:12 hat Peter Lieven geschrieben:
>>>>>> Thread 2 (Thread 0x7ffff5550700 (LWP 2636)):
>>>>>> #0  0x00007ffff5d87aa3 in ppoll () from /lib/x86_64-linux-gnu/libc.so.6
>>>>>> No symbol table info available.
>>>>>> #1  0x0000555555955d91 in qemu_poll_ns (fds=0x5555563889c0, nfds=3,
>>>>>>      timeout=4999424576) at qemu-timer.c:326
>>>>>>          ts = {tv_sec = 4, tv_nsec = 999424576}
>>>>>>          tvsec = 4
>>>>>> #2  0x0000555555956feb in aio_poll (ctx=0x5555563528e0, blocking=true)
>>>>>>      at aio-posix.c:231
>>>>>>          node = 0x0
>>>>>>          was_dispatching = false
>>>>>>          ret = 1
>>>>>>          progress = false
>>>>>> #3  0x000055555594aeed in bdrv_prwv_co (bs=0x55555637eae0,
>>>>>> offset=4292007936,
>>>>>>      qiov=0x7ffff554f760, is_write=false, flags=0) at block.c:2699
>>>>>>          aio_context = 0x5555563528e0
>>>>>>          co = 0x5555563888a0
>>>>>>          rwco = {bs = 0x55555637eae0, offset = 4292007936,
>>>>>>            qiov = 0x7ffff554f760, is_write = false, ret = 2147483647,
>>>>>> flags = 0}
>>>>>> #4  0x000055555594afa9 in bdrv_rw_co (bs=0x55555637eae0,
>>>>>> sector_num=8382828,
>>>>>>      buf=0x7ffff44cc800 "(", nb_sectors=4, is_write=false, flags=0)
>>>>>>      at block.c:2722
>>>>>>          qiov = {iov = 0x7ffff554f780, niov = 1, nalloc = -1, size =
>>>>>> 2048}
>>>>>>          iov = {iov_base = 0x7ffff44cc800, iov_len = 2048}
>>>>>> #5  0x000055555594b008 in bdrv_read (bs=0x55555637eae0,
>>>>>> sector_num=8382828,
>>>>>>      buf=0x7ffff44cc800 "(", nb_sectors=4) at block.c:2730
>>>>>> No locals.
>>>>>> #6  0x000055555599acef in blk_read (blk=0x555556376820,
>>>>>> sector_num=8382828,
>>>>>>      buf=0x7ffff44cc800 "(", nb_sectors=4) at block/block-backend.c:404
>>>>>> No locals.
>>>>>> #7  0x0000555555833ed2 in cd_read_sector (s=0x555556408f88,
>>>>>> lba=2095707,
>>>>>>      buf=0x7ffff44cc800 "(", sector_size=2048) at hw/ide/atapi.c:116
>>>>>>          ret = 32767
>>>>> Here is the problem: The ATAPI emulation uses synchronous blk_read()
>>>>> instead of the AIO or coroutine interfaces. This means that it keeps
>>>>> polling for request completion while it holds the BQL until the request
>>>>> is completed.
>>>> I will look at this.
>>
>> I need some further help. My way to "emulate" a hung NFS Server is to
>> block it in the Firewall. Currently I face the problem that I cannot mount
>> a CD Iso via libnfs (nfs://) without hanging Qemu (i previously tried with
>> a kernel NFS mount). It reads a few sectors and then stalls (maybe another
>> bug):
>>
>> (gdb) thread apply all bt full
>>
>> Thread 3 (Thread 0x7ffff0c21700 (LWP 29710)):
>> #0  qemu_cond_broadcast (cond=cond@entry=0x555556259940) at
>> util/qemu-thread-posix.c:120
>>         err = <optimized out>
>>         __func__ = "qemu_cond_broadcast"
>> #1  0x0000555555911164 in rfifolock_unlock (r=r@entry=0x555556259910) at
>> util/rfifolock.c:75
>>         __PRETTY_FUNCTION__ = "rfifolock_unlock"
>> #2  0x0000555555875921 in aio_context_release (ctx=ctx@entry=0x5555562598b0)
>> at async.c:329
>> No locals.
>> #3  0x000055555588434c in aio_poll (ctx=ctx@entry=0x5555562598b0,
>> blocking=blocking@entry=true) at aio-posix.c:272
>>         node = <optimized out>
>>         was_dispatching = false
>>         i = <optimized out>
>>         ret = <optimized out>
>>         progress = false
>>         timeout = 611734526
>>         __PRETTY_FUNCTION__ = "aio_poll"
>> #4  0x00005555558bc43d in bdrv_prwv_co (bs=bs@entry=0x55555627c0f0,
>> offset=offset@entry=7038976, qiov=qiov@entry=0x7ffff0c208f0,
>> is_write=is_write@entry=false, flags=flags@entry=(unknown: 0)) at
>> block/io.c:552
>>         aio_context = 0x5555562598b0
>>         co = <optimized out>
>>         rwco = {bs = 0x55555627c0f0, offset = 7038976, qiov =
>> 0x7ffff0c208f0, is_write = false, ret = 2147483647, flags = (unknown: 0)}
>> #5  0x00005555558bc533 in bdrv_rw_co (bs=0x55555627c0f0,
>> sector_num=sector_num@entry=13748, buf=buf@entry=0x555557874800 "(",
>> nb_sectors=nb_sectors@entry=4, is_write=is_write@entry=false,
>>     flags=flags@entry=(unknown: 0)) at block/io.c:575
>>         qiov = {iov = 0x7ffff0c208e0, niov = 1, nalloc = -1, size = 2048}
>>         iov = {iov_base = 0x555557874800, iov_len = 2048}
>> #6  0x00005555558bc593 in bdrv_read (bs=<optimized out>,
>> sector_num=sector_num@entry=13748, buf=buf@entry=0x555557874800 "(",
>> nb_sectors=nb_sectors@entry=4) at block/io.c:583
>> No locals.
>> #7  0x00005555558af75d in blk_read (blk=<optimized out>,
>> sector_num=sector_num@entry=13748, buf=buf@entry=0x555557874800 "(",
>> nb_sectors=nb_sectors@entry=4) at block/block-backend.c:493
>>         ret = <optimized out>
>> #8  0x00005555557abb88 in cd_read_sector (sector_size=<optimized out>,
>> buf=0x555557874800 "(", lba=3437, s=0x55555760db70) at hw/ide/atapi.c:116
>>         ret = <optimized out>
>> #9  ide_atapi_cmd_reply_end (s=0x55555760db70) at hw/ide/atapi.c:190
>>         byte_count_limit = <optimized out>
>>         size = <optimized out>
>>         ret = 2
> This is still the same scenario Kevin explained.
>
> The ATAPI CD-ROM emulation code is using synchronous blk_read().  This
> function holds the QEMU global mutex while waiting for the I/O request
> to complete.  This blocks other vcpu threads and the main loop thread.
>
> The solution is to convert the CD-ROM emulation code to use
> blk_aio_readv() instead of blk_read().

I tried a little, but i am stuck with my approach. I reads one sector
and then doesn't continue. Maybe someone with more knowledge
of ATAPI/IDE could help?


Thanks,
Peter

Comments

Stefan Hajnoczi June 22, 2015, 9:25 a.m. UTC | #1
On Fri, Jun 19, 2015 at 2:14 PM, Peter Lieven <pl@kamp.de> wrote:
> Am 18.06.2015 um 11:36 schrieb Stefan Hajnoczi:
>> On Thu, Jun 18, 2015 at 10:29 AM, Peter Lieven <pl@kamp.de> wrote:
>>> Am 18.06.2015 um 10:42 schrieb Kevin Wolf:
>>>> Am 18.06.2015 um 10:30 hat Peter Lieven geschrieben:
>>>>> Am 18.06.2015 um 09:45 schrieb Kevin Wolf:
>>>>>> Am 18.06.2015 um 09:12 hat Peter Lieven geschrieben:
>>>>>>> Thread 2 (Thread 0x7ffff5550700 (LWP 2636)):
>>>>>>> #0  0x00007ffff5d87aa3 in ppoll () from /lib/x86_64-linux-gnu/libc.so.6
>>>>>>> No symbol table info available.
>>>>>>> #1  0x0000555555955d91 in qemu_poll_ns (fds=0x5555563889c0, nfds=3,
>>>>>>>      timeout=4999424576) at qemu-timer.c:326
>>>>>>>          ts = {tv_sec = 4, tv_nsec = 999424576}
>>>>>>>          tvsec = 4
>>>>>>> #2  0x0000555555956feb in aio_poll (ctx=0x5555563528e0, blocking=true)
>>>>>>>      at aio-posix.c:231
>>>>>>>          node = 0x0
>>>>>>>          was_dispatching = false
>>>>>>>          ret = 1
>>>>>>>          progress = false
>>>>>>> #3  0x000055555594aeed in bdrv_prwv_co (bs=0x55555637eae0,
>>>>>>> offset=4292007936,
>>>>>>>      qiov=0x7ffff554f760, is_write=false, flags=0) at block.c:2699
>>>>>>>          aio_context = 0x5555563528e0
>>>>>>>          co = 0x5555563888a0
>>>>>>>          rwco = {bs = 0x55555637eae0, offset = 4292007936,
>>>>>>>            qiov = 0x7ffff554f760, is_write = false, ret = 2147483647,
>>>>>>> flags = 0}
>>>>>>> #4  0x000055555594afa9 in bdrv_rw_co (bs=0x55555637eae0,
>>>>>>> sector_num=8382828,
>>>>>>>      buf=0x7ffff44cc800 "(", nb_sectors=4, is_write=false, flags=0)
>>>>>>>      at block.c:2722
>>>>>>>          qiov = {iov = 0x7ffff554f780, niov = 1, nalloc = -1, size =
>>>>>>> 2048}
>>>>>>>          iov = {iov_base = 0x7ffff44cc800, iov_len = 2048}
>>>>>>> #5  0x000055555594b008 in bdrv_read (bs=0x55555637eae0,
>>>>>>> sector_num=8382828,
>>>>>>>      buf=0x7ffff44cc800 "(", nb_sectors=4) at block.c:2730
>>>>>>> No locals.
>>>>>>> #6  0x000055555599acef in blk_read (blk=0x555556376820,
>>>>>>> sector_num=8382828,
>>>>>>>      buf=0x7ffff44cc800 "(", nb_sectors=4) at block/block-backend.c:404
>>>>>>> No locals.
>>>>>>> #7  0x0000555555833ed2 in cd_read_sector (s=0x555556408f88,
>>>>>>> lba=2095707,
>>>>>>>      buf=0x7ffff44cc800 "(", sector_size=2048) at hw/ide/atapi.c:116
>>>>>>>          ret = 32767
>>>>>> Here is the problem: The ATAPI emulation uses synchronous blk_read()
>>>>>> instead of the AIO or coroutine interfaces. This means that it keeps
>>>>>> polling for request completion while it holds the BQL until the request
>>>>>> is completed.
>>>>> I will look at this.
>>>
>>> I need some further help. My way to "emulate" a hung NFS Server is to
>>> block it in the Firewall. Currently I face the problem that I cannot mount
>>> a CD Iso via libnfs (nfs://) without hanging Qemu (i previously tried with
>>> a kernel NFS mount). It reads a few sectors and then stalls (maybe another
>>> bug):
>>>
>>> (gdb) thread apply all bt full
>>>
>>> Thread 3 (Thread 0x7ffff0c21700 (LWP 29710)):
>>> #0  qemu_cond_broadcast (cond=cond@entry=0x555556259940) at
>>> util/qemu-thread-posix.c:120
>>>         err = <optimized out>
>>>         __func__ = "qemu_cond_broadcast"
>>> #1  0x0000555555911164 in rfifolock_unlock (r=r@entry=0x555556259910) at
>>> util/rfifolock.c:75
>>>         __PRETTY_FUNCTION__ = "rfifolock_unlock"
>>> #2  0x0000555555875921 in aio_context_release (ctx=ctx@entry=0x5555562598b0)
>>> at async.c:329
>>> No locals.
>>> #3  0x000055555588434c in aio_poll (ctx=ctx@entry=0x5555562598b0,
>>> blocking=blocking@entry=true) at aio-posix.c:272
>>>         node = <optimized out>
>>>         was_dispatching = false
>>>         i = <optimized out>
>>>         ret = <optimized out>
>>>         progress = false
>>>         timeout = 611734526
>>>         __PRETTY_FUNCTION__ = "aio_poll"
>>> #4  0x00005555558bc43d in bdrv_prwv_co (bs=bs@entry=0x55555627c0f0,
>>> offset=offset@entry=7038976, qiov=qiov@entry=0x7ffff0c208f0,
>>> is_write=is_write@entry=false, flags=flags@entry=(unknown: 0)) at
>>> block/io.c:552
>>>         aio_context = 0x5555562598b0
>>>         co = <optimized out>
>>>         rwco = {bs = 0x55555627c0f0, offset = 7038976, qiov =
>>> 0x7ffff0c208f0, is_write = false, ret = 2147483647, flags = (unknown: 0)}
>>> #5  0x00005555558bc533 in bdrv_rw_co (bs=0x55555627c0f0,
>>> sector_num=sector_num@entry=13748, buf=buf@entry=0x555557874800 "(",
>>> nb_sectors=nb_sectors@entry=4, is_write=is_write@entry=false,
>>>     flags=flags@entry=(unknown: 0)) at block/io.c:575
>>>         qiov = {iov = 0x7ffff0c208e0, niov = 1, nalloc = -1, size = 2048}
>>>         iov = {iov_base = 0x555557874800, iov_len = 2048}
>>> #6  0x00005555558bc593 in bdrv_read (bs=<optimized out>,
>>> sector_num=sector_num@entry=13748, buf=buf@entry=0x555557874800 "(",
>>> nb_sectors=nb_sectors@entry=4) at block/io.c:583
>>> No locals.
>>> #7  0x00005555558af75d in blk_read (blk=<optimized out>,
>>> sector_num=sector_num@entry=13748, buf=buf@entry=0x555557874800 "(",
>>> nb_sectors=nb_sectors@entry=4) at block/block-backend.c:493
>>>         ret = <optimized out>
>>> #8  0x00005555557abb88 in cd_read_sector (sector_size=<optimized out>,
>>> buf=0x555557874800 "(", lba=3437, s=0x55555760db70) at hw/ide/atapi.c:116
>>>         ret = <optimized out>
>>> #9  ide_atapi_cmd_reply_end (s=0x55555760db70) at hw/ide/atapi.c:190
>>>         byte_count_limit = <optimized out>
>>>         size = <optimized out>
>>>         ret = 2
>> This is still the same scenario Kevin explained.
>>
>> The ATAPI CD-ROM emulation code is using synchronous blk_read().  This
>> function holds the QEMU global mutex while waiting for the I/O request
>> to complete.  This blocks other vcpu threads and the main loop thread.
>>
>> The solution is to convert the CD-ROM emulation code to use
>> blk_aio_readv() instead of blk_read().
>
> I tried a little, but i am stuck with my approach. I reads one sector
> and then doesn't continue. Maybe someone with more knowledge
> of ATAPI/IDE could help?

Converting synchronous code to asynchronous requires an understanding
of the device's state transitions.  Asynchronous code has to put the
device registers into a busy state until the request completes.  It
also needs to handle hardware register accesses that occur while the
request is still pending.

I don't know ATAPI/IDE code well enough to suggest a fix.

> diff --git a/hw/ide/atapi.c b/hw/ide/atapi.c
> index 950e311..cdcbd49 100644
> --- a/hw/ide/atapi.c
> +++ b/hw/ide/atapi.c
> @@ -27,6 +27,8 @@
>  #include "hw/scsi/scsi.h"
>  #include "sysemu/block-backend.h"
>
> +#define DEBUG_IDE_ATAPI 1
> +
>  static void ide_atapi_cmd_read_dma_cb(void *opaque, int ret);
>
>  static void padstr8(uint8_t *buf, int buf_size, const char *src)
> @@ -105,31 +107,55 @@ static void cd_data_to_raw(uint8_t *buf, int lba)
>      memset(buf, 0, 288);
>  }
>
> -static int cd_read_sector(IDEState *s, int lba, uint8_t *buf, int sector_size)
> +static void cd_read_sector_cb(void *opaque, int ret) {
> +    IDEState *s = opaque;
> +
> +    block_acct_done(blk_get_stats(s->blk), &s->acct);
> +
> +    printf("cd_read_sector_cb lba %d ret = %d\n", s->lba, ret);
> +
> +    if (ret < 0) {
> +        ide_atapi_io_error(s, ret);
> +        return;
> +    }
> +
> +    if (s->cd_sector_size == 2352) {
> +        cd_data_to_raw(s->io_buffer, s->lba);
> +    }
> +
> +    s->lba++;
> +    s->io_buffer_index = 0;
> +
> +    ide_atapi_cmd_reply_end(s);
> +}
> +
> +static BlockAIOCB *cd_read_sector(IDEState *s, int lba, void *buf, int sector_size)
>  {
> -    int ret;
> +    BlockAIOCB *aiocb = NULL;
>
> -    switch(sector_size) {
> -    case 2048:
> -        block_acct_start(blk_get_stats(s->blk), &s->acct,
> -                         4 * BDRV_SECTOR_SIZE, BLOCK_ACCT_READ);
> -        ret = blk_read(s->blk, (int64_t)lba << 2, buf, 4);
> -        block_acct_done(blk_get_stats(s->blk), &s->acct);
> -        break;
> -    case 2352:
> +    if (sector_size != 2048 && sector_size != 2352) {
> +        return NULL;
> +    }
> +
> +    s->iov.iov_base = buf;
> +    if (sector_size == 2352) {
> +        buf += 4;
> +    }
> +
> +    s->iov.iov_len = 4 * BDRV_SECTOR_SIZE;
> +    qemu_iovec_init_external(&s->qiov, &s->iov, 1);
> +
> +    printf("cd_read_sector lba %d\n", lba);
> +
> +    aiocb = blk_aio_readv(s->blk, (int64_t)lba << 2, &s->qiov, 4,
> +                          cd_read_sector_cb, s);
> +
> +    if (aiocb != NULL) {
>          block_acct_start(blk_get_stats(s->blk), &s->acct,
>                           4 * BDRV_SECTOR_SIZE, BLOCK_ACCT_READ);
> -        ret = blk_read(s->blk, (int64_t)lba << 2, buf + 16, 4);
> -        block_acct_done(blk_get_stats(s->blk), &s->acct);
> -        if (ret < 0)
> -            return ret;
> -        cd_data_to_raw(buf, lba);
> -        break;
> -    default:
> -        ret = -EIO;
> -        break;
>      }
> -    return ret;
> +
> +    return aiocb;
>  }
>
>  void ide_atapi_cmd_ok(IDEState *s)
> @@ -170,7 +196,7 @@ void ide_atapi_io_error(IDEState *s, int ret)
>  /* The whole ATAPI transfer logic is handled in this function */
>  void ide_atapi_cmd_reply_end(IDEState *s)
>  {
> -    int byte_count_limit, size, ret;
> +    int byte_count_limit, size;
>  #ifdef DEBUG_IDE_ATAPI
>      printf("reply: tx_size=%d elem_tx_size=%d index=%d\n",
>             s->packet_transfer_size,
> @@ -187,13 +213,10 @@ void ide_atapi_cmd_reply_end(IDEState *s)
>      } else {
>          /* see if a new sector must be read */
>          if (s->lba != -1 && s->io_buffer_index >= s->cd_sector_size) {
> -            ret = cd_read_sector(s, s->lba, s->io_buffer, s->cd_sector_size);
> -            if (ret < 0) {
> -                ide_atapi_io_error(s, ret);
> -                return;
> +            if (cd_read_sector(s, s->lba, s->io_buffer, s->cd_sector_size) == NULL) {
> +                ide_atapi_io_error(s, -EIO);
>              }
> -            s->lba++;
> -            s->io_buffer_index = 0;
> +            return;
>          }
>          if (s->elementary_transfer_size > 0) {
>              /* there are some data left to transmit in this elementary
>
> Thanks,
> Peter
Peter Lieven June 22, 2015, 1:09 p.m. UTC | #2
Am 22.06.2015 um 11:25 schrieb Stefan Hajnoczi:
> On Fri, Jun 19, 2015 at 2:14 PM, Peter Lieven <pl@kamp.de> wrote:
>> Am 18.06.2015 um 11:36 schrieb Stefan Hajnoczi:
>>> On Thu, Jun 18, 2015 at 10:29 AM, Peter Lieven <pl@kamp.de> wrote:
>>>> Am 18.06.2015 um 10:42 schrieb Kevin Wolf:
>>>>> Am 18.06.2015 um 10:30 hat Peter Lieven geschrieben:
>>>>>> Am 18.06.2015 um 09:45 schrieb Kevin Wolf:
>>>>>>> Am 18.06.2015 um 09:12 hat Peter Lieven geschrieben:
>>>>>>>> Thread 2 (Thread 0x7ffff5550700 (LWP 2636)):
>>>>>>>> #0  0x00007ffff5d87aa3 in ppoll () from /lib/x86_64-linux-gnu/libc.so.6
>>>>>>>> No symbol table info available.
>>>>>>>> #1  0x0000555555955d91 in qemu_poll_ns (fds=0x5555563889c0, nfds=3,
>>>>>>>>       timeout=4999424576) at qemu-timer.c:326
>>>>>>>>           ts = {tv_sec = 4, tv_nsec = 999424576}
>>>>>>>>           tvsec = 4
>>>>>>>> #2  0x0000555555956feb in aio_poll (ctx=0x5555563528e0, blocking=true)
>>>>>>>>       at aio-posix.c:231
>>>>>>>>           node = 0x0
>>>>>>>>           was_dispatching = false
>>>>>>>>           ret = 1
>>>>>>>>           progress = false
>>>>>>>> #3  0x000055555594aeed in bdrv_prwv_co (bs=0x55555637eae0,
>>>>>>>> offset=4292007936,
>>>>>>>>       qiov=0x7ffff554f760, is_write=false, flags=0) at block.c:2699
>>>>>>>>           aio_context = 0x5555563528e0
>>>>>>>>           co = 0x5555563888a0
>>>>>>>>           rwco = {bs = 0x55555637eae0, offset = 4292007936,
>>>>>>>>             qiov = 0x7ffff554f760, is_write = false, ret = 2147483647,
>>>>>>>> flags = 0}
>>>>>>>> #4  0x000055555594afa9 in bdrv_rw_co (bs=0x55555637eae0,
>>>>>>>> sector_num=8382828,
>>>>>>>>       buf=0x7ffff44cc800 "(", nb_sectors=4, is_write=false, flags=0)
>>>>>>>>       at block.c:2722
>>>>>>>>           qiov = {iov = 0x7ffff554f780, niov = 1, nalloc = -1, size =
>>>>>>>> 2048}
>>>>>>>>           iov = {iov_base = 0x7ffff44cc800, iov_len = 2048}
>>>>>>>> #5  0x000055555594b008 in bdrv_read (bs=0x55555637eae0,
>>>>>>>> sector_num=8382828,
>>>>>>>>       buf=0x7ffff44cc800 "(", nb_sectors=4) at block.c:2730
>>>>>>>> No locals.
>>>>>>>> #6  0x000055555599acef in blk_read (blk=0x555556376820,
>>>>>>>> sector_num=8382828,
>>>>>>>>       buf=0x7ffff44cc800 "(", nb_sectors=4) at block/block-backend.c:404
>>>>>>>> No locals.
>>>>>>>> #7  0x0000555555833ed2 in cd_read_sector (s=0x555556408f88,
>>>>>>>> lba=2095707,
>>>>>>>>       buf=0x7ffff44cc800 "(", sector_size=2048) at hw/ide/atapi.c:116
>>>>>>>>           ret = 32767
>>>>>>> Here is the problem: The ATAPI emulation uses synchronous blk_read()
>>>>>>> instead of the AIO or coroutine interfaces. This means that it keeps
>>>>>>> polling for request completion while it holds the BQL until the request
>>>>>>> is completed.
>>>>>> I will look at this.
>>>> I need some further help. My way to "emulate" a hung NFS Server is to
>>>> block it in the Firewall. Currently I face the problem that I cannot mount
>>>> a CD Iso via libnfs (nfs://) without hanging Qemu (i previously tried with
>>>> a kernel NFS mount). It reads a few sectors and then stalls (maybe another
>>>> bug):
>>>>
>>>> (gdb) thread apply all bt full
>>>>
>>>> Thread 3 (Thread 0x7ffff0c21700 (LWP 29710)):
>>>> #0  qemu_cond_broadcast (cond=cond@entry=0x555556259940) at
>>>> util/qemu-thread-posix.c:120
>>>>          err = <optimized out>
>>>>          __func__ = "qemu_cond_broadcast"
>>>> #1  0x0000555555911164 in rfifolock_unlock (r=r@entry=0x555556259910) at
>>>> util/rfifolock.c:75
>>>>          __PRETTY_FUNCTION__ = "rfifolock_unlock"
>>>> #2  0x0000555555875921 in aio_context_release (ctx=ctx@entry=0x5555562598b0)
>>>> at async.c:329
>>>> No locals.
>>>> #3  0x000055555588434c in aio_poll (ctx=ctx@entry=0x5555562598b0,
>>>> blocking=blocking@entry=true) at aio-posix.c:272
>>>>          node = <optimized out>
>>>>          was_dispatching = false
>>>>          i = <optimized out>
>>>>          ret = <optimized out>
>>>>          progress = false
>>>>          timeout = 611734526
>>>>          __PRETTY_FUNCTION__ = "aio_poll"
>>>> #4  0x00005555558bc43d in bdrv_prwv_co (bs=bs@entry=0x55555627c0f0,
>>>> offset=offset@entry=7038976, qiov=qiov@entry=0x7ffff0c208f0,
>>>> is_write=is_write@entry=false, flags=flags@entry=(unknown: 0)) at
>>>> block/io.c:552
>>>>          aio_context = 0x5555562598b0
>>>>          co = <optimized out>
>>>>          rwco = {bs = 0x55555627c0f0, offset = 7038976, qiov =
>>>> 0x7ffff0c208f0, is_write = false, ret = 2147483647, flags = (unknown: 0)}
>>>> #5  0x00005555558bc533 in bdrv_rw_co (bs=0x55555627c0f0,
>>>> sector_num=sector_num@entry=13748, buf=buf@entry=0x555557874800 "(",
>>>> nb_sectors=nb_sectors@entry=4, is_write=is_write@entry=false,
>>>>      flags=flags@entry=(unknown: 0)) at block/io.c:575
>>>>          qiov = {iov = 0x7ffff0c208e0, niov = 1, nalloc = -1, size = 2048}
>>>>          iov = {iov_base = 0x555557874800, iov_len = 2048}
>>>> #6  0x00005555558bc593 in bdrv_read (bs=<optimized out>,
>>>> sector_num=sector_num@entry=13748, buf=buf@entry=0x555557874800 "(",
>>>> nb_sectors=nb_sectors@entry=4) at block/io.c:583
>>>> No locals.
>>>> #7  0x00005555558af75d in blk_read (blk=<optimized out>,
>>>> sector_num=sector_num@entry=13748, buf=buf@entry=0x555557874800 "(",
>>>> nb_sectors=nb_sectors@entry=4) at block/block-backend.c:493
>>>>          ret = <optimized out>
>>>> #8  0x00005555557abb88 in cd_read_sector (sector_size=<optimized out>,
>>>> buf=0x555557874800 "(", lba=3437, s=0x55555760db70) at hw/ide/atapi.c:116
>>>>          ret = <optimized out>
>>>> #9  ide_atapi_cmd_reply_end (s=0x55555760db70) at hw/ide/atapi.c:190
>>>>          byte_count_limit = <optimized out>
>>>>          size = <optimized out>
>>>>          ret = 2
>>> This is still the same scenario Kevin explained.
>>>
>>> The ATAPI CD-ROM emulation code is using synchronous blk_read().  This
>>> function holds the QEMU global mutex while waiting for the I/O request
>>> to complete.  This blocks other vcpu threads and the main loop thread.
>>>
>>> The solution is to convert the CD-ROM emulation code to use
>>> blk_aio_readv() instead of blk_read().
>> I tried a little, but i am stuck with my approach. I reads one sector
>> and then doesn't continue. Maybe someone with more knowledge
>> of ATAPI/IDE could help?
> Converting synchronous code to asynchronous requires an understanding
> of the device's state transitions.  Asynchronous code has to put the
> device registers into a busy state until the request completes.  It
> also needs to handle hardware register accesses that occur while the
> request is still pending.

That was my assumption as well. But I don't know how to proceed...

>
> I don't know ATAPI/IDE code well enough to suggest a fix.

Maybe @John can help?

Peter
John Snow June 22, 2015, 9:54 p.m. UTC | #3
On 06/22/2015 09:09 AM, Peter Lieven wrote:
> Am 22.06.2015 um 11:25 schrieb Stefan Hajnoczi:
>> On Fri, Jun 19, 2015 at 2:14 PM, Peter Lieven <pl@kamp.de> wrote:
>>> Am 18.06.2015 um 11:36 schrieb Stefan Hajnoczi:
>>>> On Thu, Jun 18, 2015 at 10:29 AM, Peter Lieven <pl@kamp.de> wrote:
>>>>> Am 18.06.2015 um 10:42 schrieb Kevin Wolf:
>>>>>> Am 18.06.2015 um 10:30 hat Peter Lieven geschrieben:
>>>>>>> Am 18.06.2015 um 09:45 schrieb Kevin Wolf:
>>>>>>>> Am 18.06.2015 um 09:12 hat Peter Lieven geschrieben:
>>>>>>>>> Thread 2 (Thread 0x7ffff5550700 (LWP 2636)):
>>>>>>>>> #0  0x00007ffff5d87aa3 in ppoll () from
>>>>>>>>> /lib/x86_64-linux-gnu/libc.so.6
>>>>>>>>> No symbol table info available.
>>>>>>>>> #1  0x0000555555955d91 in qemu_poll_ns (fds=0x5555563889c0,
>>>>>>>>> nfds=3,
>>>>>>>>>       timeout=4999424576) at qemu-timer.c:326
>>>>>>>>>           ts = {tv_sec = 4, tv_nsec = 999424576}
>>>>>>>>>           tvsec = 4
>>>>>>>>> #2  0x0000555555956feb in aio_poll (ctx=0x5555563528e0,
>>>>>>>>> blocking=true)
>>>>>>>>>       at aio-posix.c:231
>>>>>>>>>           node = 0x0
>>>>>>>>>           was_dispatching = false
>>>>>>>>>           ret = 1
>>>>>>>>>           progress = false
>>>>>>>>> #3  0x000055555594aeed in bdrv_prwv_co (bs=0x55555637eae0,
>>>>>>>>> offset=4292007936,
>>>>>>>>>       qiov=0x7ffff554f760, is_write=false, flags=0) at
>>>>>>>>> block.c:2699
>>>>>>>>>           aio_context = 0x5555563528e0
>>>>>>>>>           co = 0x5555563888a0
>>>>>>>>>           rwco = {bs = 0x55555637eae0, offset = 4292007936,
>>>>>>>>>             qiov = 0x7ffff554f760, is_write = false, ret =
>>>>>>>>> 2147483647,
>>>>>>>>> flags = 0}
>>>>>>>>> #4  0x000055555594afa9 in bdrv_rw_co (bs=0x55555637eae0,
>>>>>>>>> sector_num=8382828,
>>>>>>>>>       buf=0x7ffff44cc800 "(", nb_sectors=4, is_write=false,
>>>>>>>>> flags=0)
>>>>>>>>>       at block.c:2722
>>>>>>>>>           qiov = {iov = 0x7ffff554f780, niov = 1, nalloc = -1,
>>>>>>>>> size =
>>>>>>>>> 2048}
>>>>>>>>>           iov = {iov_base = 0x7ffff44cc800, iov_len = 2048}
>>>>>>>>> #5  0x000055555594b008 in bdrv_read (bs=0x55555637eae0,
>>>>>>>>> sector_num=8382828,
>>>>>>>>>       buf=0x7ffff44cc800 "(", nb_sectors=4) at block.c:2730
>>>>>>>>> No locals.
>>>>>>>>> #6  0x000055555599acef in blk_read (blk=0x555556376820,
>>>>>>>>> sector_num=8382828,
>>>>>>>>>       buf=0x7ffff44cc800 "(", nb_sectors=4) at
>>>>>>>>> block/block-backend.c:404
>>>>>>>>> No locals.
>>>>>>>>> #7  0x0000555555833ed2 in cd_read_sector (s=0x555556408f88,
>>>>>>>>> lba=2095707,
>>>>>>>>>       buf=0x7ffff44cc800 "(", sector_size=2048) at
>>>>>>>>> hw/ide/atapi.c:116
>>>>>>>>>           ret = 32767
>>>>>>>> Here is the problem: The ATAPI emulation uses synchronous
>>>>>>>> blk_read()
>>>>>>>> instead of the AIO or coroutine interfaces. This means that it
>>>>>>>> keeps
>>>>>>>> polling for request completion while it holds the BQL until the
>>>>>>>> request
>>>>>>>> is completed.
>>>>>>> I will look at this.
>>>>> I need some further help. My way to "emulate" a hung NFS Server is to
>>>>> block it in the Firewall. Currently I face the problem that I
>>>>> cannot mount
>>>>> a CD Iso via libnfs (nfs://) without hanging Qemu (i previously
>>>>> tried with
>>>>> a kernel NFS mount). It reads a few sectors and then stalls (maybe
>>>>> another
>>>>> bug):
>>>>>
>>>>> (gdb) thread apply all bt full
>>>>>
>>>>> Thread 3 (Thread 0x7ffff0c21700 (LWP 29710)):
>>>>> #0  qemu_cond_broadcast (cond=cond@entry=0x555556259940) at
>>>>> util/qemu-thread-posix.c:120
>>>>>          err = <optimized out>
>>>>>          __func__ = "qemu_cond_broadcast"
>>>>> #1  0x0000555555911164 in rfifolock_unlock
>>>>> (r=r@entry=0x555556259910) at
>>>>> util/rfifolock.c:75
>>>>>          __PRETTY_FUNCTION__ = "rfifolock_unlock"
>>>>> #2  0x0000555555875921 in aio_context_release
>>>>> (ctx=ctx@entry=0x5555562598b0)
>>>>> at async.c:329
>>>>> No locals.
>>>>> #3  0x000055555588434c in aio_poll (ctx=ctx@entry=0x5555562598b0,
>>>>> blocking=blocking@entry=true) at aio-posix.c:272
>>>>>          node = <optimized out>
>>>>>          was_dispatching = false
>>>>>          i = <optimized out>
>>>>>          ret = <optimized out>
>>>>>          progress = false
>>>>>          timeout = 611734526
>>>>>          __PRETTY_FUNCTION__ = "aio_poll"
>>>>> #4  0x00005555558bc43d in bdrv_prwv_co (bs=bs@entry=0x55555627c0f0,
>>>>> offset=offset@entry=7038976, qiov=qiov@entry=0x7ffff0c208f0,
>>>>> is_write=is_write@entry=false, flags=flags@entry=(unknown: 0)) at
>>>>> block/io.c:552
>>>>>          aio_context = 0x5555562598b0
>>>>>          co = <optimized out>
>>>>>          rwco = {bs = 0x55555627c0f0, offset = 7038976, qiov =
>>>>> 0x7ffff0c208f0, is_write = false, ret = 2147483647, flags =
>>>>> (unknown: 0)}
>>>>> #5  0x00005555558bc533 in bdrv_rw_co (bs=0x55555627c0f0,
>>>>> sector_num=sector_num@entry=13748, buf=buf@entry=0x555557874800 "(",
>>>>> nb_sectors=nb_sectors@entry=4, is_write=is_write@entry=false,
>>>>>      flags=flags@entry=(unknown: 0)) at block/io.c:575
>>>>>          qiov = {iov = 0x7ffff0c208e0, niov = 1, nalloc = -1, size
>>>>> = 2048}
>>>>>          iov = {iov_base = 0x555557874800, iov_len = 2048}
>>>>> #6  0x00005555558bc593 in bdrv_read (bs=<optimized out>,
>>>>> sector_num=sector_num@entry=13748, buf=buf@entry=0x555557874800 "(",
>>>>> nb_sectors=nb_sectors@entry=4) at block/io.c:583
>>>>> No locals.
>>>>> #7  0x00005555558af75d in blk_read (blk=<optimized out>,
>>>>> sector_num=sector_num@entry=13748, buf=buf@entry=0x555557874800 "(",
>>>>> nb_sectors=nb_sectors@entry=4) at block/block-backend.c:493
>>>>>          ret = <optimized out>
>>>>> #8  0x00005555557abb88 in cd_read_sector (sector_size=<optimized out>,
>>>>> buf=0x555557874800 "(", lba=3437, s=0x55555760db70) at
>>>>> hw/ide/atapi.c:116
>>>>>          ret = <optimized out>
>>>>> #9  ide_atapi_cmd_reply_end (s=0x55555760db70) at hw/ide/atapi.c:190
>>>>>          byte_count_limit = <optimized out>
>>>>>          size = <optimized out>
>>>>>          ret = 2
>>>> This is still the same scenario Kevin explained.
>>>>
>>>> The ATAPI CD-ROM emulation code is using synchronous blk_read().  This
>>>> function holds the QEMU global mutex while waiting for the I/O request
>>>> to complete.  This blocks other vcpu threads and the main loop thread.
>>>>
>>>> The solution is to convert the CD-ROM emulation code to use
>>>> blk_aio_readv() instead of blk_read().
>>> I tried a little, but i am stuck with my approach. I reads one sector
>>> and then doesn't continue. Maybe someone with more knowledge
>>> of ATAPI/IDE could help?
>> Converting synchronous code to asynchronous requires an understanding
>> of the device's state transitions.  Asynchronous code has to put the
>> device registers into a busy state until the request completes.  It
>> also needs to handle hardware register accesses that occur while the
>> request is still pending.
> 
> That was my assumption as well. But I don't know how to proceed...
> 
>>
>> I don't know ATAPI/IDE code well enough to suggest a fix.
> 
> Maybe @John can help?
> 
> Peter
> 

Sure thing. I will take a deep look as soon as I get my NCQ patches out
the door. I don't have high hopes for a proper comprehensive fix for
2.4, of course, but is there anything we should stick a band-aid on for
2.4? My current reading is "It's just as broken as it's always been, so
it's not necessarily dire."

Also: since ATAPI apparently is doing all of its reads in a synchronous
manner in the SCSI fakery layer it has, I think a lot of the work is
done by making sure the IDE device is set +BSY +DRQ which will prevent
any new commands being sent to it just like DMA_READ commands already do.

Looks like the ATAPI state machine is supposed to be something like this:

CMD_PACKET is received: BSY bit is set.
Device is ready to receive command packet: -BSY +DRQ
Based on the nIEN bit, we either wait for an interrupt, or:
Poll the status register until BSY and DRQ clear.

The IDE layer already prevents new commands from showing up while we
have BSY or DRQ set, and it looks like the flow is:

- ide_exec_cmd sets +BSY
- cmd_packet sets -BST
- ide_transfer_start sets +DRQ
  (Note: this is fully synchronous for e.g. AHCI, PCI/ISA will wait
   for PIO data)
- After the last byte is transferred, we'll invoke ide_atapi_cmd.
- ide_atapi_cmd invokes e.g. cmd_inquiry
- cmd_inquiry will fill its buffer and invoke ide_atapi_cmd_reply.
- ide_atapi_cmd_reply either does a DMA transfer (-BSY +DRQ)
  or a PIO reply (ide_atapi_cmd_reply_end) (-BSY -DRQ)
  (Note again: AHCI is still fully synchronous here, PCI/ISA will
   wait for data reads.)

(Hmm, it looks like there's an opening for new commands to show up here,
since we've got -BSY and -DRQ)

- ide_atapi_cmd_reply_end will call ide_atapi_cmd_ok, which will
  clear the error bits, definitely set -BSY -DRQ +RDY, and set the IRQ
  if nIEN is not set.


I think this won't be too bad, since the ide_exec_cmd layer itself is
already used to commands returning that aren't actually finished yet,
and the cmd_packet launcher itself also assumes the same.

The way the ATAPI commands seem to work is: Tell the core layer that
we're not finished (even if we possibly are already) and set the
appropriate status bits ourselves after we're done, synchronously or not.

A lot of the pathways are almost all protected by BSY/DRQ the whole way
and we already have a nearly asynchronous method for clearing them only
when the command is actually complete.

Maybe I'll start hacking away at this after hard freeze to see what I
can do. If you already started, want to link me to a git and I'll start
from there?

--js
Peter Lieven June 23, 2015, 6:36 a.m. UTC | #4
Am 22.06.2015 um 23:54 schrieb John Snow:
>
> On 06/22/2015 09:09 AM, Peter Lieven wrote:
>> Am 22.06.2015 um 11:25 schrieb Stefan Hajnoczi:
>>> On Fri, Jun 19, 2015 at 2:14 PM, Peter Lieven <pl@kamp.de> wrote:
>>>> Am 18.06.2015 um 11:36 schrieb Stefan Hajnoczi:
>>>>> On Thu, Jun 18, 2015 at 10:29 AM, Peter Lieven <pl@kamp.de> wrote:
>>>>>> Am 18.06.2015 um 10:42 schrieb Kevin Wolf:
>>>>>>> Am 18.06.2015 um 10:30 hat Peter Lieven geschrieben:
>>>>>>>> Am 18.06.2015 um 09:45 schrieb Kevin Wolf:
>>>>>>>>> Am 18.06.2015 um 09:12 hat Peter Lieven geschrieben:
>>>>>>>>>> Thread 2 (Thread 0x7ffff5550700 (LWP 2636)):
>>>>>>>>>> #0  0x00007ffff5d87aa3 in ppoll () from
>>>>>>>>>> /lib/x86_64-linux-gnu/libc.so.6
>>>>>>>>>> No symbol table info available.
>>>>>>>>>> #1  0x0000555555955d91 in qemu_poll_ns (fds=0x5555563889c0,
>>>>>>>>>> nfds=3,
>>>>>>>>>>        timeout=4999424576) at qemu-timer.c:326
>>>>>>>>>>            ts = {tv_sec = 4, tv_nsec = 999424576}
>>>>>>>>>>            tvsec = 4
>>>>>>>>>> #2  0x0000555555956feb in aio_poll (ctx=0x5555563528e0,
>>>>>>>>>> blocking=true)
>>>>>>>>>>        at aio-posix.c:231
>>>>>>>>>>            node = 0x0
>>>>>>>>>>            was_dispatching = false
>>>>>>>>>>            ret = 1
>>>>>>>>>>            progress = false
>>>>>>>>>> #3  0x000055555594aeed in bdrv_prwv_co (bs=0x55555637eae0,
>>>>>>>>>> offset=4292007936,
>>>>>>>>>>        qiov=0x7ffff554f760, is_write=false, flags=0) at
>>>>>>>>>> block.c:2699
>>>>>>>>>>            aio_context = 0x5555563528e0
>>>>>>>>>>            co = 0x5555563888a0
>>>>>>>>>>            rwco = {bs = 0x55555637eae0, offset = 4292007936,
>>>>>>>>>>              qiov = 0x7ffff554f760, is_write = false, ret =
>>>>>>>>>> 2147483647,
>>>>>>>>>> flags = 0}
>>>>>>>>>> #4  0x000055555594afa9 in bdrv_rw_co (bs=0x55555637eae0,
>>>>>>>>>> sector_num=8382828,
>>>>>>>>>>        buf=0x7ffff44cc800 "(", nb_sectors=4, is_write=false,
>>>>>>>>>> flags=0)
>>>>>>>>>>        at block.c:2722
>>>>>>>>>>            qiov = {iov = 0x7ffff554f780, niov = 1, nalloc = -1,
>>>>>>>>>> size =
>>>>>>>>>> 2048}
>>>>>>>>>>            iov = {iov_base = 0x7ffff44cc800, iov_len = 2048}
>>>>>>>>>> #5  0x000055555594b008 in bdrv_read (bs=0x55555637eae0,
>>>>>>>>>> sector_num=8382828,
>>>>>>>>>>        buf=0x7ffff44cc800 "(", nb_sectors=4) at block.c:2730
>>>>>>>>>> No locals.
>>>>>>>>>> #6  0x000055555599acef in blk_read (blk=0x555556376820,
>>>>>>>>>> sector_num=8382828,
>>>>>>>>>>        buf=0x7ffff44cc800 "(", nb_sectors=4) at
>>>>>>>>>> block/block-backend.c:404
>>>>>>>>>> No locals.
>>>>>>>>>> #7  0x0000555555833ed2 in cd_read_sector (s=0x555556408f88,
>>>>>>>>>> lba=2095707,
>>>>>>>>>>        buf=0x7ffff44cc800 "(", sector_size=2048) at
>>>>>>>>>> hw/ide/atapi.c:116
>>>>>>>>>>            ret = 32767
>>>>>>>>> Here is the problem: The ATAPI emulation uses synchronous
>>>>>>>>> blk_read()
>>>>>>>>> instead of the AIO or coroutine interfaces. This means that it
>>>>>>>>> keeps
>>>>>>>>> polling for request completion while it holds the BQL until the
>>>>>>>>> request
>>>>>>>>> is completed.
>>>>>>>> I will look at this.
>>>>>> I need some further help. My way to "emulate" a hung NFS Server is to
>>>>>> block it in the Firewall. Currently I face the problem that I
>>>>>> cannot mount
>>>>>> a CD Iso via libnfs (nfs://) without hanging Qemu (i previously
>>>>>> tried with
>>>>>> a kernel NFS mount). It reads a few sectors and then stalls (maybe
>>>>>> another
>>>>>> bug):
>>>>>>
>>>>>> (gdb) thread apply all bt full
>>>>>>
>>>>>> Thread 3 (Thread 0x7ffff0c21700 (LWP 29710)):
>>>>>> #0  qemu_cond_broadcast (cond=cond@entry=0x555556259940) at
>>>>>> util/qemu-thread-posix.c:120
>>>>>>           err = <optimized out>
>>>>>>           __func__ = "qemu_cond_broadcast"
>>>>>> #1  0x0000555555911164 in rfifolock_unlock
>>>>>> (r=r@entry=0x555556259910) at
>>>>>> util/rfifolock.c:75
>>>>>>           __PRETTY_FUNCTION__ = "rfifolock_unlock"
>>>>>> #2  0x0000555555875921 in aio_context_release
>>>>>> (ctx=ctx@entry=0x5555562598b0)
>>>>>> at async.c:329
>>>>>> No locals.
>>>>>> #3  0x000055555588434c in aio_poll (ctx=ctx@entry=0x5555562598b0,
>>>>>> blocking=blocking@entry=true) at aio-posix.c:272
>>>>>>           node = <optimized out>
>>>>>>           was_dispatching = false
>>>>>>           i = <optimized out>
>>>>>>           ret = <optimized out>
>>>>>>           progress = false
>>>>>>           timeout = 611734526
>>>>>>           __PRETTY_FUNCTION__ = "aio_poll"
>>>>>> #4  0x00005555558bc43d in bdrv_prwv_co (bs=bs@entry=0x55555627c0f0,
>>>>>> offset=offset@entry=7038976, qiov=qiov@entry=0x7ffff0c208f0,
>>>>>> is_write=is_write@entry=false, flags=flags@entry=(unknown: 0)) at
>>>>>> block/io.c:552
>>>>>>           aio_context = 0x5555562598b0
>>>>>>           co = <optimized out>
>>>>>>           rwco = {bs = 0x55555627c0f0, offset = 7038976, qiov =
>>>>>> 0x7ffff0c208f0, is_write = false, ret = 2147483647, flags =
>>>>>> (unknown: 0)}
>>>>>> #5  0x00005555558bc533 in bdrv_rw_co (bs=0x55555627c0f0,
>>>>>> sector_num=sector_num@entry=13748, buf=buf@entry=0x555557874800 "(",
>>>>>> nb_sectors=nb_sectors@entry=4, is_write=is_write@entry=false,
>>>>>>       flags=flags@entry=(unknown: 0)) at block/io.c:575
>>>>>>           qiov = {iov = 0x7ffff0c208e0, niov = 1, nalloc = -1, size
>>>>>> = 2048}
>>>>>>           iov = {iov_base = 0x555557874800, iov_len = 2048}
>>>>>> #6  0x00005555558bc593 in bdrv_read (bs=<optimized out>,
>>>>>> sector_num=sector_num@entry=13748, buf=buf@entry=0x555557874800 "(",
>>>>>> nb_sectors=nb_sectors@entry=4) at block/io.c:583
>>>>>> No locals.
>>>>>> #7  0x00005555558af75d in blk_read (blk=<optimized out>,
>>>>>> sector_num=sector_num@entry=13748, buf=buf@entry=0x555557874800 "(",
>>>>>> nb_sectors=nb_sectors@entry=4) at block/block-backend.c:493
>>>>>>           ret = <optimized out>
>>>>>> #8  0x00005555557abb88 in cd_read_sector (sector_size=<optimized out>,
>>>>>> buf=0x555557874800 "(", lba=3437, s=0x55555760db70) at
>>>>>> hw/ide/atapi.c:116
>>>>>>           ret = <optimized out>
>>>>>> #9  ide_atapi_cmd_reply_end (s=0x55555760db70) at hw/ide/atapi.c:190
>>>>>>           byte_count_limit = <optimized out>
>>>>>>           size = <optimized out>
>>>>>>           ret = 2
>>>>> This is still the same scenario Kevin explained.
>>>>>
>>>>> The ATAPI CD-ROM emulation code is using synchronous blk_read().  This
>>>>> function holds the QEMU global mutex while waiting for the I/O request
>>>>> to complete.  This blocks other vcpu threads and the main loop thread.
>>>>>
>>>>> The solution is to convert the CD-ROM emulation code to use
>>>>> blk_aio_readv() instead of blk_read().
>>>> I tried a little, but i am stuck with my approach. I reads one sector
>>>> and then doesn't continue. Maybe someone with more knowledge
>>>> of ATAPI/IDE could help?
>>> Converting synchronous code to asynchronous requires an understanding
>>> of the device's state transitions.  Asynchronous code has to put the
>>> device registers into a busy state until the request completes.  It
>>> also needs to handle hardware register accesses that occur while the
>>> request is still pending.
>> That was my assumption as well. But I don't know how to proceed...
>>
>>> I don't know ATAPI/IDE code well enough to suggest a fix.
>> Maybe @John can help?
>>
>> Peter
>>
> Sure thing. I will take a deep look as soon as I get my NCQ patches out
> the door. I don't have high hopes for a proper comprehensive fix for
> 2.4, of course, but is there anything we should stick a band-aid on for
> 2.4? My current reading is "It's just as broken as it's always been, so
> it's not necessarily dire."

It was broken all the time. So there is no particular need to fix
it for 2.4. But it should be fixed.

The second problem in the code is that a DMA cancel drains
all block devices. This is the second point I have seen a VM
hang when I forcibly should down the NFS server where my ISOs
are on. I don't know if thats something that can be fixed as well.

>
> Also: since ATAPI apparently is doing all of its reads in a synchronous
> manner in the SCSI fakery layer it has, I think a lot of the work is
> done by making sure the IDE device is set +BSY +DRQ which will prevent
> any new commands being sent to it just like DMA_READ commands already do.
>
> Looks like the ATAPI state machine is supposed to be something like this:
>
> CMD_PACKET is received: BSY bit is set.
> Device is ready to receive command packet: -BSY +DRQ
> Based on the nIEN bit, we either wait for an interrupt, or:
> Poll the status register until BSY and DRQ clear.
>
> The IDE layer already prevents new commands from showing up while we
> have BSY or DRQ set, and it looks like the flow is:
>
> - ide_exec_cmd sets +BSY
> - cmd_packet sets -BST
> - ide_transfer_start sets +DRQ
>    (Note: this is fully synchronous for e.g. AHCI, PCI/ISA will wait
>     for PIO data)
> - After the last byte is transferred, we'll invoke ide_atapi_cmd.
> - ide_atapi_cmd invokes e.g. cmd_inquiry
> - cmd_inquiry will fill its buffer and invoke ide_atapi_cmd_reply.
> - ide_atapi_cmd_reply either does a DMA transfer (-BSY +DRQ)
>    or a PIO reply (ide_atapi_cmd_reply_end) (-BSY -DRQ)
>    (Note again: AHCI is still fully synchronous here, PCI/ISA will
>     wait for data reads.)
>
> (Hmm, it looks like there's an opening for new commands to show up here,
> since we've got -BSY and -DRQ)
>
> - ide_atapi_cmd_reply_end will call ide_atapi_cmd_ok, which will
>    clear the error bits, definitely set -BSY -DRQ +RDY, and set the IRQ
>    if nIEN is not set.
>
>
> I think this won't be too bad, since the ide_exec_cmd layer itself is
> already used to commands returning that aren't actually finished yet,
> and the cmd_packet launcher itself also assumes the same.
>
> The way the ATAPI commands seem to work is: Tell the core layer that
> we're not finished (even if we possibly are already) and set the
> appropriate status bits ourselves after we're done, synchronously or not.
>
> A lot of the pathways are almost all protected by BSY/DRQ the whole way
> and we already have a nearly asynchronous method for clearing them only
> when the command is actually complete.

>
> Maybe I'll start hacking away at this after hard freeze to see what I
> can do. If you already started, want to link me to a git and I'll start
> from there?

Thanks for your comprehensive explainations. It makes the states
at least a bit clearer to me. I try to find some time to check why
my patch does not work. What I have is in this repo. Its not much
AND does not work, but maybe its just a small thing that needs
to be changed...

https://github.com/plieven/qemu/tree/atapi_async

Peter
Peter Lieven Aug. 14, 2015, 1:43 p.m. UTC | #5
Am 22.06.2015 um 23:54 schrieb John Snow:
>
> On 06/22/2015 09:09 AM, Peter Lieven wrote:
>> Am 22.06.2015 um 11:25 schrieb Stefan Hajnoczi:
>>> On Fri, Jun 19, 2015 at 2:14 PM, Peter Lieven <pl@kamp.de> wrote:
>>>> Am 18.06.2015 um 11:36 schrieb Stefan Hajnoczi:
>>>>> On Thu, Jun 18, 2015 at 10:29 AM, Peter Lieven <pl@kamp.de> wrote:
>>>>>> Am 18.06.2015 um 10:42 schrieb Kevin Wolf:
>>>>>>> Am 18.06.2015 um 10:30 hat Peter Lieven geschrieben:
>>>>>>>> Am 18.06.2015 um 09:45 schrieb Kevin Wolf:
>>>>>>>>> Am 18.06.2015 um 09:12 hat Peter Lieven geschrieben:
>>>>>>>>>> Thread 2 (Thread 0x7ffff5550700 (LWP 2636)):
>>>>>>>>>> #0  0x00007ffff5d87aa3 in ppoll () from
>>>>>>>>>> /lib/x86_64-linux-gnu/libc.so.6
>>>>>>>>>> No symbol table info available.
>>>>>>>>>> #1  0x0000555555955d91 in qemu_poll_ns (fds=0x5555563889c0,
>>>>>>>>>> nfds=3,
>>>>>>>>>>       timeout=4999424576) at qemu-timer.c:326
>>>>>>>>>>           ts = {tv_sec = 4, tv_nsec = 999424576}
>>>>>>>>>>           tvsec = 4
>>>>>>>>>> #2  0x0000555555956feb in aio_poll (ctx=0x5555563528e0,
>>>>>>>>>> blocking=true)
>>>>>>>>>>       at aio-posix.c:231
>>>>>>>>>>           node = 0x0
>>>>>>>>>>           was_dispatching = false
>>>>>>>>>>           ret = 1
>>>>>>>>>>           progress = false
>>>>>>>>>> #3  0x000055555594aeed in bdrv_prwv_co (bs=0x55555637eae0,
>>>>>>>>>> offset=4292007936,
>>>>>>>>>>       qiov=0x7ffff554f760, is_write=false, flags=0) at
>>>>>>>>>> block.c:2699
>>>>>>>>>>           aio_context = 0x5555563528e0
>>>>>>>>>>           co = 0x5555563888a0
>>>>>>>>>>           rwco = {bs = 0x55555637eae0, offset = 4292007936,
>>>>>>>>>>             qiov = 0x7ffff554f760, is_write = false, ret =
>>>>>>>>>> 2147483647,
>>>>>>>>>> flags = 0}
>>>>>>>>>> #4  0x000055555594afa9 in bdrv_rw_co (bs=0x55555637eae0,
>>>>>>>>>> sector_num=8382828,
>>>>>>>>>>       buf=0x7ffff44cc800 "(", nb_sectors=4, is_write=false,
>>>>>>>>>> flags=0)
>>>>>>>>>>       at block.c:2722
>>>>>>>>>>           qiov = {iov = 0x7ffff554f780, niov = 1, nalloc = -1,
>>>>>>>>>> size =
>>>>>>>>>> 2048}
>>>>>>>>>>           iov = {iov_base = 0x7ffff44cc800, iov_len = 2048}
>>>>>>>>>> #5  0x000055555594b008 in bdrv_read (bs=0x55555637eae0,
>>>>>>>>>> sector_num=8382828,
>>>>>>>>>>       buf=0x7ffff44cc800 "(", nb_sectors=4) at block.c:2730
>>>>>>>>>> No locals.
>>>>>>>>>> #6  0x000055555599acef in blk_read (blk=0x555556376820,
>>>>>>>>>> sector_num=8382828,
>>>>>>>>>>       buf=0x7ffff44cc800 "(", nb_sectors=4) at
>>>>>>>>>> block/block-backend.c:404
>>>>>>>>>> No locals.
>>>>>>>>>> #7  0x0000555555833ed2 in cd_read_sector (s=0x555556408f88,
>>>>>>>>>> lba=2095707,
>>>>>>>>>>       buf=0x7ffff44cc800 "(", sector_size=2048) at
>>>>>>>>>> hw/ide/atapi.c:116
>>>>>>>>>>           ret = 32767
>>>>>>>>> Here is the problem: The ATAPI emulation uses synchronous
>>>>>>>>> blk_read()
>>>>>>>>> instead of the AIO or coroutine interfaces. This means that it
>>>>>>>>> keeps
>>>>>>>>> polling for request completion while it holds the BQL until the
>>>>>>>>> request
>>>>>>>>> is completed.
>>>>>>>> I will look at this.
>>>>>> I need some further help. My way to "emulate" a hung NFS Server is to
>>>>>> block it in the Firewall. Currently I face the problem that I
>>>>>> cannot mount
>>>>>> a CD Iso via libnfs (nfs://) without hanging Qemu (i previously
>>>>>> tried with
>>>>>> a kernel NFS mount). It reads a few sectors and then stalls (maybe
>>>>>> another
>>>>>> bug):
>>>>>>
>>>>>> (gdb) thread apply all bt full
>>>>>>
>>>>>> Thread 3 (Thread 0x7ffff0c21700 (LWP 29710)):
>>>>>> #0  qemu_cond_broadcast (cond=cond@entry=0x555556259940) at
>>>>>> util/qemu-thread-posix.c:120
>>>>>>          err = <optimized out>
>>>>>>          __func__ = "qemu_cond_broadcast"
>>>>>> #1  0x0000555555911164 in rfifolock_unlock
>>>>>> (r=r@entry=0x555556259910) at
>>>>>> util/rfifolock.c:75
>>>>>>          __PRETTY_FUNCTION__ = "rfifolock_unlock"
>>>>>> #2  0x0000555555875921 in aio_context_release
>>>>>> (ctx=ctx@entry=0x5555562598b0)
>>>>>> at async.c:329
>>>>>> No locals.
>>>>>> #3  0x000055555588434c in aio_poll (ctx=ctx@entry=0x5555562598b0,
>>>>>> blocking=blocking@entry=true) at aio-posix.c:272
>>>>>>          node = <optimized out>
>>>>>>          was_dispatching = false
>>>>>>          i = <optimized out>
>>>>>>          ret = <optimized out>
>>>>>>          progress = false
>>>>>>          timeout = 611734526
>>>>>>          __PRETTY_FUNCTION__ = "aio_poll"
>>>>>> #4  0x00005555558bc43d in bdrv_prwv_co (bs=bs@entry=0x55555627c0f0,
>>>>>> offset=offset@entry=7038976, qiov=qiov@entry=0x7ffff0c208f0,
>>>>>> is_write=is_write@entry=false, flags=flags@entry=(unknown: 0)) at
>>>>>> block/io.c:552
>>>>>>          aio_context = 0x5555562598b0
>>>>>>          co = <optimized out>
>>>>>>          rwco = {bs = 0x55555627c0f0, offset = 7038976, qiov =
>>>>>> 0x7ffff0c208f0, is_write = false, ret = 2147483647, flags =
>>>>>> (unknown: 0)}
>>>>>> #5  0x00005555558bc533 in bdrv_rw_co (bs=0x55555627c0f0,
>>>>>> sector_num=sector_num@entry=13748, buf=buf@entry=0x555557874800 "(",
>>>>>> nb_sectors=nb_sectors@entry=4, is_write=is_write@entry=false,
>>>>>>      flags=flags@entry=(unknown: 0)) at block/io.c:575
>>>>>>          qiov = {iov = 0x7ffff0c208e0, niov = 1, nalloc = -1, size
>>>>>> = 2048}
>>>>>>          iov = {iov_base = 0x555557874800, iov_len = 2048}
>>>>>> #6  0x00005555558bc593 in bdrv_read (bs=<optimized out>,
>>>>>> sector_num=sector_num@entry=13748, buf=buf@entry=0x555557874800 "(",
>>>>>> nb_sectors=nb_sectors@entry=4) at block/io.c:583
>>>>>> No locals.
>>>>>> #7  0x00005555558af75d in blk_read (blk=<optimized out>,
>>>>>> sector_num=sector_num@entry=13748, buf=buf@entry=0x555557874800 "(",
>>>>>> nb_sectors=nb_sectors@entry=4) at block/block-backend.c:493
>>>>>>          ret = <optimized out>
>>>>>> #8  0x00005555557abb88 in cd_read_sector (sector_size=<optimized out>,
>>>>>> buf=0x555557874800 "(", lba=3437, s=0x55555760db70) at
>>>>>> hw/ide/atapi.c:116
>>>>>>          ret = <optimized out>
>>>>>> #9  ide_atapi_cmd_reply_end (s=0x55555760db70) at hw/ide/atapi.c:190
>>>>>>          byte_count_limit = <optimized out>
>>>>>>          size = <optimized out>
>>>>>>          ret = 2
>>>>> This is still the same scenario Kevin explained.
>>>>>
>>>>> The ATAPI CD-ROM emulation code is using synchronous blk_read().  This
>>>>> function holds the QEMU global mutex while waiting for the I/O request
>>>>> to complete.  This blocks other vcpu threads and the main loop thread.
>>>>>
>>>>> The solution is to convert the CD-ROM emulation code to use
>>>>> blk_aio_readv() instead of blk_read().
>>>> I tried a little, but i am stuck with my approach. I reads one sector
>>>> and then doesn't continue. Maybe someone with more knowledge
>>>> of ATAPI/IDE could help?
>>> Converting synchronous code to asynchronous requires an understanding
>>> of the device's state transitions.  Asynchronous code has to put the
>>> device registers into a busy state until the request completes.  It
>>> also needs to handle hardware register accesses that occur while the
>>> request is still pending.
>> That was my assumption as well. But I don't know how to proceed...
>>
>>> I don't know ATAPI/IDE code well enough to suggest a fix.
>> Maybe @John can help?
>>
>> Peter
>>
>

I looked into this again and it seems that the remaining problem (at least when the CDROM is
mounted via libnfs) is the blk_drain_all() in bmdma_cmd_writeb. At least I end there if I have
a proper OS booted and cut off the NFS server. The VM remains responsive until the guest OS
issues a DMA cancel.

I do not know what the proper solution is. I had the following ideas so far (not knowing if the
approaches would be correct or not).

a) Do not clear BM_STATUS_DMAING if we are not able to drain all requests. This works until
the connection is reestablished. The guest OS issues DMA cancel operations again and
again, but when the connectivity is back I end in the following assertion:

qemu-system-x86_64: ./hw/ide/pci.h:65: bmdma_active_if: Assertion `bmdma->bus->retry_unit != (uint8_t)-1' failed.

b) Call the aiocb with -ECANCELED and somehow (?) turn all the callbacks of the outstanding IOs into NOPs.

c) Follow the hint in the comment in bmdma_cmd_writeb (however this works out):
             * In the future we'll be able to safely cancel the I/O if the
             * whole DMA operation will be submitted to disk with a single
             * aio operation with preadv/pwritev.

Your help is appreciated.

Thanks,
Peter
Kevin Wolf Aug. 14, 2015, 2:08 p.m. UTC | #6
Am 14.08.2015 um 15:43 hat Peter Lieven geschrieben:
> Am 22.06.2015 um 23:54 schrieb John Snow:
> >
> > On 06/22/2015 09:09 AM, Peter Lieven wrote:
> >> Am 22.06.2015 um 11:25 schrieb Stefan Hajnoczi:
> >>> On Fri, Jun 19, 2015 at 2:14 PM, Peter Lieven <pl@kamp.de> wrote:
> >>>> Am 18.06.2015 um 11:36 schrieb Stefan Hajnoczi:
> >>>>> On Thu, Jun 18, 2015 at 10:29 AM, Peter Lieven <pl@kamp.de> wrote:
> >>>>>> Am 18.06.2015 um 10:42 schrieb Kevin Wolf:
> >>>>>>> Am 18.06.2015 um 10:30 hat Peter Lieven geschrieben:
> >>>>>>>> Am 18.06.2015 um 09:45 schrieb Kevin Wolf:
> >>>>>>>>> Am 18.06.2015 um 09:12 hat Peter Lieven geschrieben:
> >>>>>>>>>> Thread 2 (Thread 0x7ffff5550700 (LWP 2636)):
> >>>>>>>>>> #0  0x00007ffff5d87aa3 in ppoll () from
> >>>>>>>>>> /lib/x86_64-linux-gnu/libc.so.6
> >>>>>>>>>> No symbol table info available.
> >>>>>>>>>> #1  0x0000555555955d91 in qemu_poll_ns (fds=0x5555563889c0,
> >>>>>>>>>> nfds=3,
> >>>>>>>>>>       timeout=4999424576) at qemu-timer.c:326
> >>>>>>>>>>           ts = {tv_sec = 4, tv_nsec = 999424576}
> >>>>>>>>>>           tvsec = 4
> >>>>>>>>>> #2  0x0000555555956feb in aio_poll (ctx=0x5555563528e0,
> >>>>>>>>>> blocking=true)
> >>>>>>>>>>       at aio-posix.c:231
> >>>>>>>>>>           node = 0x0
> >>>>>>>>>>           was_dispatching = false
> >>>>>>>>>>           ret = 1
> >>>>>>>>>>           progress = false
> >>>>>>>>>> #3  0x000055555594aeed in bdrv_prwv_co (bs=0x55555637eae0,
> >>>>>>>>>> offset=4292007936,
> >>>>>>>>>>       qiov=0x7ffff554f760, is_write=false, flags=0) at
> >>>>>>>>>> block.c:2699
> >>>>>>>>>>           aio_context = 0x5555563528e0
> >>>>>>>>>>           co = 0x5555563888a0
> >>>>>>>>>>           rwco = {bs = 0x55555637eae0, offset = 4292007936,
> >>>>>>>>>>             qiov = 0x7ffff554f760, is_write = false, ret =
> >>>>>>>>>> 2147483647,
> >>>>>>>>>> flags = 0}
> >>>>>>>>>> #4  0x000055555594afa9 in bdrv_rw_co (bs=0x55555637eae0,
> >>>>>>>>>> sector_num=8382828,
> >>>>>>>>>>       buf=0x7ffff44cc800 "(", nb_sectors=4, is_write=false,
> >>>>>>>>>> flags=0)
> >>>>>>>>>>       at block.c:2722
> >>>>>>>>>>           qiov = {iov = 0x7ffff554f780, niov = 1, nalloc = -1,
> >>>>>>>>>> size =
> >>>>>>>>>> 2048}
> >>>>>>>>>>           iov = {iov_base = 0x7ffff44cc800, iov_len = 2048}
> >>>>>>>>>> #5  0x000055555594b008 in bdrv_read (bs=0x55555637eae0,
> >>>>>>>>>> sector_num=8382828,
> >>>>>>>>>>       buf=0x7ffff44cc800 "(", nb_sectors=4) at block.c:2730
> >>>>>>>>>> No locals.
> >>>>>>>>>> #6  0x000055555599acef in blk_read (blk=0x555556376820,
> >>>>>>>>>> sector_num=8382828,
> >>>>>>>>>>       buf=0x7ffff44cc800 "(", nb_sectors=4) at
> >>>>>>>>>> block/block-backend.c:404
> >>>>>>>>>> No locals.
> >>>>>>>>>> #7  0x0000555555833ed2 in cd_read_sector (s=0x555556408f88,
> >>>>>>>>>> lba=2095707,
> >>>>>>>>>>       buf=0x7ffff44cc800 "(", sector_size=2048) at
> >>>>>>>>>> hw/ide/atapi.c:116
> >>>>>>>>>>           ret = 32767
> >>>>>>>>> Here is the problem: The ATAPI emulation uses synchronous
> >>>>>>>>> blk_read()
> >>>>>>>>> instead of the AIO or coroutine interfaces. This means that it
> >>>>>>>>> keeps
> >>>>>>>>> polling for request completion while it holds the BQL until the
> >>>>>>>>> request
> >>>>>>>>> is completed.
> >>>>>>>> I will look at this.
> >>>>>> I need some further help. My way to "emulate" a hung NFS Server is to
> >>>>>> block it in the Firewall. Currently I face the problem that I
> >>>>>> cannot mount
> >>>>>> a CD Iso via libnfs (nfs://) without hanging Qemu (i previously
> >>>>>> tried with
> >>>>>> a kernel NFS mount). It reads a few sectors and then stalls (maybe
> >>>>>> another
> >>>>>> bug):
> >>>>>>
> >>>>>> (gdb) thread apply all bt full
> >>>>>>
> >>>>>> Thread 3 (Thread 0x7ffff0c21700 (LWP 29710)):
> >>>>>> #0  qemu_cond_broadcast (cond=cond@entry=0x555556259940) at
> >>>>>> util/qemu-thread-posix.c:120
> >>>>>>          err = <optimized out>
> >>>>>>          __func__ = "qemu_cond_broadcast"
> >>>>>> #1  0x0000555555911164 in rfifolock_unlock
> >>>>>> (r=r@entry=0x555556259910) at
> >>>>>> util/rfifolock.c:75
> >>>>>>          __PRETTY_FUNCTION__ = "rfifolock_unlock"
> >>>>>> #2  0x0000555555875921 in aio_context_release
> >>>>>> (ctx=ctx@entry=0x5555562598b0)
> >>>>>> at async.c:329
> >>>>>> No locals.
> >>>>>> #3  0x000055555588434c in aio_poll (ctx=ctx@entry=0x5555562598b0,
> >>>>>> blocking=blocking@entry=true) at aio-posix.c:272
> >>>>>>          node = <optimized out>
> >>>>>>          was_dispatching = false
> >>>>>>          i = <optimized out>
> >>>>>>          ret = <optimized out>
> >>>>>>          progress = false
> >>>>>>          timeout = 611734526
> >>>>>>          __PRETTY_FUNCTION__ = "aio_poll"
> >>>>>> #4  0x00005555558bc43d in bdrv_prwv_co (bs=bs@entry=0x55555627c0f0,
> >>>>>> offset=offset@entry=7038976, qiov=qiov@entry=0x7ffff0c208f0,
> >>>>>> is_write=is_write@entry=false, flags=flags@entry=(unknown: 0)) at
> >>>>>> block/io.c:552
> >>>>>>          aio_context = 0x5555562598b0
> >>>>>>          co = <optimized out>
> >>>>>>          rwco = {bs = 0x55555627c0f0, offset = 7038976, qiov =
> >>>>>> 0x7ffff0c208f0, is_write = false, ret = 2147483647, flags =
> >>>>>> (unknown: 0)}
> >>>>>> #5  0x00005555558bc533 in bdrv_rw_co (bs=0x55555627c0f0,
> >>>>>> sector_num=sector_num@entry=13748, buf=buf@entry=0x555557874800 "(",
> >>>>>> nb_sectors=nb_sectors@entry=4, is_write=is_write@entry=false,
> >>>>>>      flags=flags@entry=(unknown: 0)) at block/io.c:575
> >>>>>>          qiov = {iov = 0x7ffff0c208e0, niov = 1, nalloc = -1, size
> >>>>>> = 2048}
> >>>>>>          iov = {iov_base = 0x555557874800, iov_len = 2048}
> >>>>>> #6  0x00005555558bc593 in bdrv_read (bs=<optimized out>,
> >>>>>> sector_num=sector_num@entry=13748, buf=buf@entry=0x555557874800 "(",
> >>>>>> nb_sectors=nb_sectors@entry=4) at block/io.c:583
> >>>>>> No locals.
> >>>>>> #7  0x00005555558af75d in blk_read (blk=<optimized out>,
> >>>>>> sector_num=sector_num@entry=13748, buf=buf@entry=0x555557874800 "(",
> >>>>>> nb_sectors=nb_sectors@entry=4) at block/block-backend.c:493
> >>>>>>          ret = <optimized out>
> >>>>>> #8  0x00005555557abb88 in cd_read_sector (sector_size=<optimized out>,
> >>>>>> buf=0x555557874800 "(", lba=3437, s=0x55555760db70) at
> >>>>>> hw/ide/atapi.c:116
> >>>>>>          ret = <optimized out>
> >>>>>> #9  ide_atapi_cmd_reply_end (s=0x55555760db70) at hw/ide/atapi.c:190
> >>>>>>          byte_count_limit = <optimized out>
> >>>>>>          size = <optimized out>
> >>>>>>          ret = 2
> >>>>> This is still the same scenario Kevin explained.
> >>>>>
> >>>>> The ATAPI CD-ROM emulation code is using synchronous blk_read().  This
> >>>>> function holds the QEMU global mutex while waiting for the I/O request
> >>>>> to complete.  This blocks other vcpu threads and the main loop thread.
> >>>>>
> >>>>> The solution is to convert the CD-ROM emulation code to use
> >>>>> blk_aio_readv() instead of blk_read().
> >>>> I tried a little, but i am stuck with my approach. I reads one sector
> >>>> and then doesn't continue. Maybe someone with more knowledge
> >>>> of ATAPI/IDE could help?
> >>> Converting synchronous code to asynchronous requires an understanding
> >>> of the device's state transitions.  Asynchronous code has to put the
> >>> device registers into a busy state until the request completes.  It
> >>> also needs to handle hardware register accesses that occur while the
> >>> request is still pending.
> >> That was my assumption as well. But I don't know how to proceed...
> >>
> >>> I don't know ATAPI/IDE code well enough to suggest a fix.
> >> Maybe @John can help?
> >>
> >> Peter
> >>
> >
> 
> I looked into this again and it seems that the remaining problem (at least when the CDROM is
> mounted via libnfs) is the blk_drain_all() in bmdma_cmd_writeb. At least I end there if I have
> a proper OS booted and cut off the NFS server. The VM remains responsive until the guest OS
> issues a DMA cancel.
> 
> I do not know what the proper solution is. I had the following ideas so far (not knowing if the
> approaches would be correct or not).
> 
> a) Do not clear BM_STATUS_DMAING if we are not able to drain all requests. This works until
> the connection is reestablished. The guest OS issues DMA cancel operations again and
> again, but when the connectivity is back I end in the following assertion:
> 
> qemu-system-x86_64: ./hw/ide/pci.h:65: bmdma_active_if: Assertion `bmdma->bus->retry_unit != (uint8_t)-1' failed.

I would have to check the specs to see if this is allowed.

> b) Call the aiocb with -ECANCELED and somehow (?) turn all the callbacks of the outstanding IOs into NOPs.

This wouldn't be correct for write requests: We would tell the guest
that the request is cancelled when it's actually still in flight. At
some point it could still complete, however, and that's not expected by
the guest.

> c) Follow the hint in the comment in bmdma_cmd_writeb (however this works out):
>              * In the future we'll be able to safely cancel the I/O if the
>              * whole DMA operation will be submitted to disk with a single
>              * aio operation with preadv/pwritev.

Not sure how likely it is that cancelling that single AIOCB will
actually cancel the operation and not end up doing bdrv_drain_all()
internally instead because there is no good way of cancelling the
request.

Kevin
Peter Lieven Aug. 14, 2015, 2:21 p.m. UTC | #7
Am 14.08.2015 um 16:08 schrieb Kevin Wolf:
> Am 14.08.2015 um 15:43 hat Peter Lieven geschrieben:
>> Am 22.06.2015 um 23:54 schrieb John Snow:
>>> On 06/22/2015 09:09 AM, Peter Lieven wrote:
>>>> Am 22.06.2015 um 11:25 schrieb Stefan Hajnoczi:
>>>>> On Fri, Jun 19, 2015 at 2:14 PM, Peter Lieven <pl@kamp.de> wrote:
>>>>>> Am 18.06.2015 um 11:36 schrieb Stefan Hajnoczi:
>>>>>>> On Thu, Jun 18, 2015 at 10:29 AM, Peter Lieven <pl@kamp.de> wrote:
>>>>>>>> Am 18.06.2015 um 10:42 schrieb Kevin Wolf:
>>>>>>>>> Am 18.06.2015 um 10:30 hat Peter Lieven geschrieben:
>>>>>>>>>> Am 18.06.2015 um 09:45 schrieb Kevin Wolf:
>>>>>>>>>>> Am 18.06.2015 um 09:12 hat Peter Lieven geschrieben:
>>>>>>>>>>>> Thread 2 (Thread 0x7ffff5550700 (LWP 2636)):
>>>>>>>>>>>> #0  0x00007ffff5d87aa3 in ppoll () from
>>>>>>>>>>>> /lib/x86_64-linux-gnu/libc.so.6
>>>>>>>>>>>> No symbol table info available.
>>>>>>>>>>>> #1  0x0000555555955d91 in qemu_poll_ns (fds=0x5555563889c0,
>>>>>>>>>>>> nfds=3,
>>>>>>>>>>>>       timeout=4999424576) at qemu-timer.c:326
>>>>>>>>>>>>           ts = {tv_sec = 4, tv_nsec = 999424576}
>>>>>>>>>>>>           tvsec = 4
>>>>>>>>>>>> #2  0x0000555555956feb in aio_poll (ctx=0x5555563528e0,
>>>>>>>>>>>> blocking=true)
>>>>>>>>>>>>       at aio-posix.c:231
>>>>>>>>>>>>           node = 0x0
>>>>>>>>>>>>           was_dispatching = false
>>>>>>>>>>>>           ret = 1
>>>>>>>>>>>>           progress = false
>>>>>>>>>>>> #3  0x000055555594aeed in bdrv_prwv_co (bs=0x55555637eae0,
>>>>>>>>>>>> offset=4292007936,
>>>>>>>>>>>>       qiov=0x7ffff554f760, is_write=false, flags=0) at
>>>>>>>>>>>> block.c:2699
>>>>>>>>>>>>           aio_context = 0x5555563528e0
>>>>>>>>>>>>           co = 0x5555563888a0
>>>>>>>>>>>>           rwco = {bs = 0x55555637eae0, offset = 4292007936,
>>>>>>>>>>>>             qiov = 0x7ffff554f760, is_write = false, ret =
>>>>>>>>>>>> 2147483647,
>>>>>>>>>>>> flags = 0}
>>>>>>>>>>>> #4  0x000055555594afa9 in bdrv_rw_co (bs=0x55555637eae0,
>>>>>>>>>>>> sector_num=8382828,
>>>>>>>>>>>>       buf=0x7ffff44cc800 "(", nb_sectors=4, is_write=false,
>>>>>>>>>>>> flags=0)
>>>>>>>>>>>>       at block.c:2722
>>>>>>>>>>>>           qiov = {iov = 0x7ffff554f780, niov = 1, nalloc = -1,
>>>>>>>>>>>> size =
>>>>>>>>>>>> 2048}
>>>>>>>>>>>>           iov = {iov_base = 0x7ffff44cc800, iov_len = 2048}
>>>>>>>>>>>> #5  0x000055555594b008 in bdrv_read (bs=0x55555637eae0,
>>>>>>>>>>>> sector_num=8382828,
>>>>>>>>>>>>       buf=0x7ffff44cc800 "(", nb_sectors=4) at block.c:2730
>>>>>>>>>>>> No locals.
>>>>>>>>>>>> #6  0x000055555599acef in blk_read (blk=0x555556376820,
>>>>>>>>>>>> sector_num=8382828,
>>>>>>>>>>>>       buf=0x7ffff44cc800 "(", nb_sectors=4) at
>>>>>>>>>>>> block/block-backend.c:404
>>>>>>>>>>>> No locals.
>>>>>>>>>>>> #7  0x0000555555833ed2 in cd_read_sector (s=0x555556408f88,
>>>>>>>>>>>> lba=2095707,
>>>>>>>>>>>>       buf=0x7ffff44cc800 "(", sector_size=2048) at
>>>>>>>>>>>> hw/ide/atapi.c:116
>>>>>>>>>>>>           ret = 32767
>>>>>>>>>>> Here is the problem: The ATAPI emulation uses synchronous
>>>>>>>>>>> blk_read()
>>>>>>>>>>> instead of the AIO or coroutine interfaces. This means that it
>>>>>>>>>>> keeps
>>>>>>>>>>> polling for request completion while it holds the BQL until the
>>>>>>>>>>> request
>>>>>>>>>>> is completed.
>>>>>>>>>> I will look at this.
>>>>>>>> I need some further help. My way to "emulate" a hung NFS Server is to
>>>>>>>> block it in the Firewall. Currently I face the problem that I
>>>>>>>> cannot mount
>>>>>>>> a CD Iso via libnfs (nfs://) without hanging Qemu (i previously
>>>>>>>> tried with
>>>>>>>> a kernel NFS mount). It reads a few sectors and then stalls (maybe
>>>>>>>> another
>>>>>>>> bug):
>>>>>>>>
>>>>>>>> (gdb) thread apply all bt full
>>>>>>>>
>>>>>>>> Thread 3 (Thread 0x7ffff0c21700 (LWP 29710)):
>>>>>>>> #0  qemu_cond_broadcast (cond=cond@entry=0x555556259940) at
>>>>>>>> util/qemu-thread-posix.c:120
>>>>>>>>          err = <optimized out>
>>>>>>>>          __func__ = "qemu_cond_broadcast"
>>>>>>>> #1  0x0000555555911164 in rfifolock_unlock
>>>>>>>> (r=r@entry=0x555556259910) at
>>>>>>>> util/rfifolock.c:75
>>>>>>>>          __PRETTY_FUNCTION__ = "rfifolock_unlock"
>>>>>>>> #2  0x0000555555875921 in aio_context_release
>>>>>>>> (ctx=ctx@entry=0x5555562598b0)
>>>>>>>> at async.c:329
>>>>>>>> No locals.
>>>>>>>> #3  0x000055555588434c in aio_poll (ctx=ctx@entry=0x5555562598b0,
>>>>>>>> blocking=blocking@entry=true) at aio-posix.c:272
>>>>>>>>          node = <optimized out>
>>>>>>>>          was_dispatching = false
>>>>>>>>          i = <optimized out>
>>>>>>>>          ret = <optimized out>
>>>>>>>>          progress = false
>>>>>>>>          timeout = 611734526
>>>>>>>>          __PRETTY_FUNCTION__ = "aio_poll"
>>>>>>>> #4  0x00005555558bc43d in bdrv_prwv_co (bs=bs@entry=0x55555627c0f0,
>>>>>>>> offset=offset@entry=7038976, qiov=qiov@entry=0x7ffff0c208f0,
>>>>>>>> is_write=is_write@entry=false, flags=flags@entry=(unknown: 0)) at
>>>>>>>> block/io.c:552
>>>>>>>>          aio_context = 0x5555562598b0
>>>>>>>>          co = <optimized out>
>>>>>>>>          rwco = {bs = 0x55555627c0f0, offset = 7038976, qiov =
>>>>>>>> 0x7ffff0c208f0, is_write = false, ret = 2147483647, flags =
>>>>>>>> (unknown: 0)}
>>>>>>>> #5  0x00005555558bc533 in bdrv_rw_co (bs=0x55555627c0f0,
>>>>>>>> sector_num=sector_num@entry=13748, buf=buf@entry=0x555557874800 "(",
>>>>>>>> nb_sectors=nb_sectors@entry=4, is_write=is_write@entry=false,
>>>>>>>>      flags=flags@entry=(unknown: 0)) at block/io.c:575
>>>>>>>>          qiov = {iov = 0x7ffff0c208e0, niov = 1, nalloc = -1, size
>>>>>>>> = 2048}
>>>>>>>>          iov = {iov_base = 0x555557874800, iov_len = 2048}
>>>>>>>> #6  0x00005555558bc593 in bdrv_read (bs=<optimized out>,
>>>>>>>> sector_num=sector_num@entry=13748, buf=buf@entry=0x555557874800 "(",
>>>>>>>> nb_sectors=nb_sectors@entry=4) at block/io.c:583
>>>>>>>> No locals.
>>>>>>>> #7  0x00005555558af75d in blk_read (blk=<optimized out>,
>>>>>>>> sector_num=sector_num@entry=13748, buf=buf@entry=0x555557874800 "(",
>>>>>>>> nb_sectors=nb_sectors@entry=4) at block/block-backend.c:493
>>>>>>>>          ret = <optimized out>
>>>>>>>> #8  0x00005555557abb88 in cd_read_sector (sector_size=<optimized out>,
>>>>>>>> buf=0x555557874800 "(", lba=3437, s=0x55555760db70) at
>>>>>>>> hw/ide/atapi.c:116
>>>>>>>>          ret = <optimized out>
>>>>>>>> #9  ide_atapi_cmd_reply_end (s=0x55555760db70) at hw/ide/atapi.c:190
>>>>>>>>          byte_count_limit = <optimized out>
>>>>>>>>          size = <optimized out>
>>>>>>>>          ret = 2
>>>>>>> This is still the same scenario Kevin explained.
>>>>>>>
>>>>>>> The ATAPI CD-ROM emulation code is using synchronous blk_read().  This
>>>>>>> function holds the QEMU global mutex while waiting for the I/O request
>>>>>>> to complete.  This blocks other vcpu threads and the main loop thread.
>>>>>>>
>>>>>>> The solution is to convert the CD-ROM emulation code to use
>>>>>>> blk_aio_readv() instead of blk_read().
>>>>>> I tried a little, but i am stuck with my approach. I reads one sector
>>>>>> and then doesn't continue. Maybe someone with more knowledge
>>>>>> of ATAPI/IDE could help?
>>>>> Converting synchronous code to asynchronous requires an understanding
>>>>> of the device's state transitions.  Asynchronous code has to put the
>>>>> device registers into a busy state until the request completes.  It
>>>>> also needs to handle hardware register accesses that occur while the
>>>>> request is still pending.
>>>> That was my assumption as well. But I don't know how to proceed...
>>>>
>>>>> I don't know ATAPI/IDE code well enough to suggest a fix.
>>>> Maybe @John can help?
>>>>
>>>> Peter
>>>>
>> I looked into this again and it seems that the remaining problem (at least when the CDROM is
>> mounted via libnfs) is the blk_drain_all() in bmdma_cmd_writeb. At least I end there if I have
>> a proper OS booted and cut off the NFS server. The VM remains responsive until the guest OS
>> issues a DMA cancel.
>>
>> I do not know what the proper solution is. I had the following ideas so far (not knowing if the
>> approaches would be correct or not).
>>
>> a) Do not clear BM_STATUS_DMAING if we are not able to drain all requests. This works until
>> the connection is reestablished. The guest OS issues DMA cancel operations again and
>> again, but when the connectivity is back I end in the following assertion:
>>
>> qemu-system-x86_64: ./hw/ide/pci.h:65: bmdma_active_if: Assertion `bmdma->bus->retry_unit != (uint8_t)-1' failed.
> I would have to check the specs to see if this is allowed.

Maybe there is a better approach after all...

>
>> b) Call the aiocb with -ECANCELED and somehow (?) turn all the callbacks of the outstanding IOs into NOPs.
> This wouldn't be correct for write requests: We would tell the guest
> that the request is cancelled when it's actually still in flight. At
> some point it could still complete, however, and that's not expected by
> the guest.

In case of a CDROM we have a read only device. So this could work?

>
>> c) Follow the hint in the comment in bmdma_cmd_writeb (however this works out):
>>              * In the future we'll be able to safely cancel the I/O if the
>>              * whole DMA operation will be submitted to disk with a single
>>              * aio operation with preadv/pwritev.
> Not sure how likely it is that cancelling that single AIOCB will
> actually cancel the operation and not end up doing bdrv_drain_all()
> internally instead because there is no good way of cancelling the
> request.

You might be right.

It seems that the whole thing is not trivial. But it seems so wrong that a whole vServer goes
down just because someone forgot to eject a CDROM he used once and then the NFS
server has a hickup.

Peter
diff mbox

Patch

diff --git a/hw/ide/atapi.c b/hw/ide/atapi.c
index 950e311..cdcbd49 100644
--- a/hw/ide/atapi.c
+++ b/hw/ide/atapi.c
@@ -27,6 +27,8 @@ 
 #include "hw/scsi/scsi.h"
 #include "sysemu/block-backend.h"
 
+#define DEBUG_IDE_ATAPI 1
+
 static void ide_atapi_cmd_read_dma_cb(void *opaque, int ret);
 
 static void padstr8(uint8_t *buf, int buf_size, const char *src)
@@ -105,31 +107,55 @@  static void cd_data_to_raw(uint8_t *buf, int lba)
     memset(buf, 0, 288);
 }
 
-static int cd_read_sector(IDEState *s, int lba, uint8_t *buf, int sector_size)
+static void cd_read_sector_cb(void *opaque, int ret) {
+    IDEState *s = opaque;
+
+    block_acct_done(blk_get_stats(s->blk), &s->acct);
+
+    printf("cd_read_sector_cb lba %d ret = %d\n", s->lba, ret);
+
+    if (ret < 0) {
+        ide_atapi_io_error(s, ret);
+        return;
+    }
+
+    if (s->cd_sector_size == 2352) {
+        cd_data_to_raw(s->io_buffer, s->lba);
+    }
+
+    s->lba++;
+    s->io_buffer_index = 0;
+
+    ide_atapi_cmd_reply_end(s);
+}
+
+static BlockAIOCB *cd_read_sector(IDEState *s, int lba, void *buf, int sector_size)
 {
-    int ret;
+    BlockAIOCB *aiocb = NULL;
 
-    switch(sector_size) {
-    case 2048:
-        block_acct_start(blk_get_stats(s->blk), &s->acct,
-                         4 * BDRV_SECTOR_SIZE, BLOCK_ACCT_READ);
-        ret = blk_read(s->blk, (int64_t)lba << 2, buf, 4);
-        block_acct_done(blk_get_stats(s->blk), &s->acct);
-        break;
-    case 2352:
+    if (sector_size != 2048 && sector_size != 2352) {
+        return NULL;
+    }
+
+    s->iov.iov_base = buf;
+    if (sector_size == 2352) {
+        buf += 4;
+    }
+
+    s->iov.iov_len = 4 * BDRV_SECTOR_SIZE;
+    qemu_iovec_init_external(&s->qiov, &s->iov, 1);
+
+    printf("cd_read_sector lba %d\n", lba);
+
+    aiocb = blk_aio_readv(s->blk, (int64_t)lba << 2, &s->qiov, 4,
+                          cd_read_sector_cb, s);
+
+    if (aiocb != NULL) {
         block_acct_start(blk_get_stats(s->blk), &s->acct,
                          4 * BDRV_SECTOR_SIZE, BLOCK_ACCT_READ);
-        ret = blk_read(s->blk, (int64_t)lba << 2, buf + 16, 4);
-        block_acct_done(blk_get_stats(s->blk), &s->acct);
-        if (ret < 0)
-            return ret;
-        cd_data_to_raw(buf, lba);
-        break;
-    default:
-        ret = -EIO;
-        break;
     }
-    return ret;
+
+    return aiocb;
 }
 
 void ide_atapi_cmd_ok(IDEState *s)
@@ -170,7 +196,7 @@  void ide_atapi_io_error(IDEState *s, int ret)
 /* The whole ATAPI transfer logic is handled in this function */
 void ide_atapi_cmd_reply_end(IDEState *s)
 {
-    int byte_count_limit, size, ret;
+    int byte_count_limit, size;
 #ifdef DEBUG_IDE_ATAPI
     printf("reply: tx_size=%d elem_tx_size=%d index=%d\n",
            s->packet_transfer_size,
@@ -187,13 +213,10 @@  void ide_atapi_cmd_reply_end(IDEState *s)
     } else {
         /* see if a new sector must be read */
         if (s->lba != -1 && s->io_buffer_index >= s->cd_sector_size) {
-            ret = cd_read_sector(s, s->lba, s->io_buffer, s->cd_sector_size);
-            if (ret < 0) {
-                ide_atapi_io_error(s, ret);
-                return;
+            if (cd_read_sector(s, s->lba, s->io_buffer, s->cd_sector_size) == NULL) {
+                ide_atapi_io_error(s, -EIO);
             }
-            s->lba++;
-            s->io_buffer_index = 0;
+            return;
         }
         if (s->elementary_transfer_size > 0) {
             /* there are some data left to transmit in this elementary