diff mbox series

block: Fix qemu crash when using scsi-block

Message ID 1511364808-30171-1-git-send-email-deepa.srinivasan@oracle.com
State New
Headers show
Series block: Fix qemu crash when using scsi-block | expand

Commit Message

Deepa Srinivasan Nov. 22, 2017, 3:33 p.m. UTC
Starting qemu with the following arguments causes qemu to segfault:
... -device lsi,id=lsi0 -drive file=iscsi:<...>,format=raw,if=none,node-name=
iscsi1 -device scsi-block,bus=lsi0.0,id=<...>,drive=iscsi1

This patch fixes blk_aio_ioctl() so it does not pass stack addresses to
blk_aio_ioctl_entry() which may be invoked after blk_aio_ioctl() returns. More
details about the bug follow.

blk_aio_ioctl() invokes blk_aio_prwv() with blk_aio_ioctl_entry as the
coroutine parameter. blk_aio_prwv() ultimately calls aio_co_enter().

When blk_aio_ioctl() is executed from within a coroutine context (e.g.
iscsi_bh_cb()), aio_co_enter() adds the coroutine (blk_aio_ioctl_entry) to
the current coroutine's wakeup queue. blk_aio_ioctl() then returns.

When blk_aio_ioctl_entry() executes later, it accesses an invalid pointer:
....
    BlkRwCo *rwco = &acb->rwco;

    rwco->ret = blk_co_ioctl(rwco->blk, rwco->offset,
                             rwco->qiov->iov[0].iov_base);  <--- qiov is
                                                                 invalid here
...

In the case when blk_aio_ioctl() is called from a non-coroutine context,
blk_aio_ioctl_entry() executes immediately. But if bdrv_co_ioctl() calls
qemu_coroutine_yield(), blk_aio_ioctl() will return. When the coroutine
execution is complete, control returns to blk_aio_ioctl_entry() after the call
to blk_co_ioctl(). There is no invalid reference after this point, but
blk_aio_ioctl_entry() is still holding on to invalid pointers.

The fix is to allocate memory for the QEMUIOVector and struct iovec as part of
the request struct that contains the IO buffer. The memory for this struct is
guaranteed to be valid until the AIO is completed.

Signed-off-by: Deepa Srinivasan <deepa.srinivasan@oracle.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Reviewed-by: Mark Kanda <mark.kanda@oracle.com>
---
 block/block-backend.c          | 13 ++-----------
 hw/block/virtio-blk.c          |  9 ++++++++-
 hw/scsi/scsi-disk.c            | 10 +++++++++-
 hw/scsi/scsi-generic.c         |  9 ++++++++-
 include/sysemu/block-backend.h |  2 +-
 5 files changed, 28 insertions(+), 15 deletions(-)

Comments

Paolo Bonzini Nov. 22, 2017, 4:34 p.m. UTC | #1
On 22/11/2017 16:33, Deepa Srinivasan wrote:
> Starting qemu with the following arguments causes qemu to segfault:
> ... -device lsi,id=lsi0 -drive file=iscsi:<...>,format=raw,if=none,node-name=
> iscsi1 -device scsi-block,bus=lsi0.0,id=<...>,drive=iscsi1
> 
> This patch fixes blk_aio_ioctl() so it does not pass stack addresses to
> blk_aio_ioctl_entry() which may be invoked after blk_aio_ioctl() returns. More
> details about the bug follow.
> 
> blk_aio_ioctl() invokes blk_aio_prwv() with blk_aio_ioctl_entry as the
> coroutine parameter. blk_aio_prwv() ultimately calls aio_co_enter().
> 
> When blk_aio_ioctl() is executed from within a coroutine context (e.g.
> iscsi_bh_cb()), aio_co_enter() adds the coroutine (blk_aio_ioctl_entry) to
> the current coroutine's wakeup queue. blk_aio_ioctl() then returns.
> 
> When blk_aio_ioctl_entry() executes later, it accesses an invalid pointer:
> ....
>     BlkRwCo *rwco = &acb->rwco;
> 
>     rwco->ret = blk_co_ioctl(rwco->blk, rwco->offset,
>                              rwco->qiov->iov[0].iov_base);  <--- qiov is
>                                                                  invalid here
> ...
> 
> In the case when blk_aio_ioctl() is called from a non-coroutine context,
> blk_aio_ioctl_entry() executes immediately. But if bdrv_co_ioctl() calls
> qemu_coroutine_yield(), blk_aio_ioctl() will return. When the coroutine
> execution is complete, control returns to blk_aio_ioctl_entry() after the call
> to blk_co_ioctl(). There is no invalid reference after this point, but the
> function is still holding on to invalid pointers.
> 
> The fix is to allocate memory for the QEMUIOVector and struct iovec as part of
> the request struct which the IO buffer is part of. The memory for this struct is
> guaranteed to be valid till the AIO is completed.
> 
> Signed-off-by: Deepa Srinivasan <deepa.srinivasan@oracle.com>
> Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
> Reviewed-by: Mark Kanda <mark.kanda@oracle.com>
> ---
>  block/block-backend.c          | 13 ++-----------
>  hw/block/virtio-blk.c          |  9 ++++++++-
>  hw/scsi/scsi-disk.c            | 10 +++++++++-
>  hw/scsi/scsi-generic.c         |  9 ++++++++-
>  include/sysemu/block-backend.h |  2 +-
>  5 files changed, 28 insertions(+), 15 deletions(-)
> 
> diff --git a/block/block-backend.c b/block/block-backend.c
> index baef8e7..c275827 100644
> --- a/block/block-backend.c
> +++ b/block/block-backend.c
> @@ -1472,19 +1472,10 @@ static void blk_aio_ioctl_entry(void *opaque)
>      blk_aio_complete(acb);
>  }
>  
> -BlockAIOCB *blk_aio_ioctl(BlockBackend *blk, unsigned long int req, void *buf,
> +BlockAIOCB *blk_aio_ioctl(BlockBackend *blk, unsigned long int req, QEMUIOVector *qiov,
>                            BlockCompletionFunc *cb, void *opaque)

I think this is not the best way to fix the bug, because it adds extra
unnecessary code in the callers.

Perhaps you can change BlkRwCo's "qiov" field to "void *buf" and the
same for blk_aio_prwv's "qiov" argument?

Then the QEMUIOVector is not needed at all, and blk_co_ioctl can just
use rwco->buf.

Thanks,

Paolo

>  {
> -    QEMUIOVector qiov;
> -    struct iovec iov;
> -
> -    iov = (struct iovec) {
> -        .iov_base = buf,
> -        .iov_len = 0,
> -    };
> -    qemu_iovec_init_external(&qiov, &iov, 1);
> -
> -    return blk_aio_prwv(blk, req, 0, &qiov, blk_aio_ioctl_entry, 0, cb, opaque);
> +    return blk_aio_prwv(blk, req, 0, qiov, blk_aio_ioctl_entry, 0, cb, opaque);
>  }
>  
>  int blk_co_pdiscard(BlockBackend *blk, int64_t offset, int bytes)
> diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
> index 05d1440..ed9f774 100644
> --- a/hw/block/virtio-blk.c
> +++ b/hw/block/virtio-blk.c
> @@ -151,6 +151,8 @@ out:
>  typedef struct {
>      VirtIOBlockReq *req;
>      struct sg_io_hdr hdr;
> +    QEMUIOVector qiov;
> +    struct iovec iov;
>  } VirtIOBlockIoctlReq;
>  
>  static void virtio_blk_ioctl_complete(void *opaque, int status)
> @@ -298,7 +300,12 @@ static int virtio_blk_handle_scsi_req(VirtIOBlockReq *req)
>      ioctl_req->hdr.sbp = elem->in_sg[elem->in_num - 3].iov_base;
>      ioctl_req->hdr.mx_sb_len = elem->in_sg[elem->in_num - 3].iov_len;
>  
> -    acb = blk_aio_ioctl(blk->blk, SG_IO, &ioctl_req->hdr,
> +    ioctl_req->iov.iov_base = &ioctl_req->hdr;
> +    ioctl_req->iov.iov_len = 0;
> +
> +    qemu_iovec_init_external(&ioctl_req->qiov, &ioctl_req->iov, 1);
> +
> +    acb = blk_aio_ioctl(blk->blk, SG_IO, &ioctl_req->qiov,
>                          virtio_blk_ioctl_complete, ioctl_req);
>      if (!acb) {
>          g_free(ioctl_req);
> diff --git a/hw/scsi/scsi-disk.c b/hw/scsi/scsi-disk.c
> index 1243117..7cbe18d 100644
> --- a/hw/scsi/scsi-disk.c
> +++ b/hw/scsi/scsi-disk.c
> @@ -2636,6 +2636,9 @@ typedef struct SCSIBlockReq {
>      SCSIDiskReq req;
>      sg_io_hdr_t io_header;
>  
> +    QEMUIOVector qiov;
> +    struct iovec iov;
> +
>      /* Selected bytes of the original CDB, copied into our own CDB.  */
>      uint8_t cmd, cdb1, group_number;
>  
> @@ -2722,7 +2725,12 @@ static BlockAIOCB *scsi_block_do_sgio(SCSIBlockReq *req,
>      io_header->usr_ptr = r;
>      io_header->flags |= SG_FLAG_DIRECT_IO;
>  
> -    aiocb = blk_aio_ioctl(s->qdev.conf.blk, SG_IO, io_header, cb, opaque);
> +    req->iov.iov_base = io_header;
> +    req->iov.iov_len = 0;
> +
> +    qemu_iovec_init_external(&req->qiov, &req->iov, 1);
> +
> +    aiocb = blk_aio_ioctl(s->qdev.conf.blk, SG_IO, &req->qiov, cb, opaque);
>      assert(aiocb != NULL);
>      return aiocb;
>  }
> diff --git a/hw/scsi/scsi-generic.c b/hw/scsi/scsi-generic.c
> index bd0d9ff..856af7c 100644
> --- a/hw/scsi/scsi-generic.c
> +++ b/hw/scsi/scsi-generic.c
> @@ -46,6 +46,8 @@ typedef struct SCSIGenericReq {
>      int buflen;
>      int len;
>      sg_io_hdr_t io_header;
> +    QEMUIOVector qiov;
> +    struct iovec iov;
>  } SCSIGenericReq;
>  
>  static void scsi_generic_save_request(QEMUFile *f, SCSIRequest *req)
> @@ -135,7 +137,12 @@ static int execute_command(BlockBackend *blk,
>      r->io_header.usr_ptr = r;
>      r->io_header.flags |= SG_FLAG_DIRECT_IO;
>  
> -    r->req.aiocb = blk_aio_ioctl(blk, SG_IO, &r->io_header, complete, r);
> +    r->iov.iov_base = &r->io_header;
> +    r->iov.iov_len = 0;
> +
> +    qemu_iovec_init_external(&r->qiov, &r->iov, 1);
> +
> +    r->req.aiocb = blk_aio_ioctl(blk, SG_IO, &r->qiov, complete, r);
>      if (r->req.aiocb == NULL) {
>          return -EIO;
>      }
> diff --git a/include/sysemu/block-backend.h b/include/sysemu/block-backend.h
> index c4e52a5..32f4486 100644
> --- a/include/sysemu/block-backend.h
> +++ b/include/sysemu/block-backend.h
> @@ -151,7 +151,7 @@ void blk_aio_cancel(BlockAIOCB *acb);
>  void blk_aio_cancel_async(BlockAIOCB *acb);
>  int blk_co_ioctl(BlockBackend *blk, unsigned long int req, void *buf);
>  int blk_ioctl(BlockBackend *blk, unsigned long int req, void *buf);
> -BlockAIOCB *blk_aio_ioctl(BlockBackend *blk, unsigned long int req, void *buf,
> +BlockAIOCB *blk_aio_ioctl(BlockBackend *blk, unsigned long int req, QEMUIOVector *qiov,
>                            BlockCompletionFunc *cb, void *opaque);
>  int blk_co_pdiscard(BlockBackend *blk, int64_t offset, int bytes);
>  int blk_co_flush(BlockBackend *blk);
>
Stefan Hajnoczi Nov. 22, 2017, 5:06 p.m. UTC | #2
On Wed, Nov 22, 2017 at 07:33:28AM -0800, Deepa Srinivasan wrote:
> Starting qemu with the following arguments causes qemu to segfault:
> ... -device lsi,id=lsi0 -drive file=iscsi:<...>,format=raw,if=none,node-name=
> iscsi1 -device scsi-block,bus=lsi0.0,id=<...>,drive=iscsi1
> 
> This patch fixes blk_aio_ioctl() so it does not pass stack addresses to
> blk_aio_ioctl_entry() which may be invoked after blk_aio_ioctl() returns. More
> details about the bug follow.
> 
> blk_aio_ioctl() invokes blk_aio_prwv() with blk_aio_ioctl_entry as the
> coroutine parameter. blk_aio_prwv() ultimately calls aio_co_enter().
> 
> When blk_aio_ioctl() is executed from within a coroutine context (e.g.
> iscsi_bh_cb()), aio_co_enter() adds the coroutine (blk_aio_ioctl_entry) to
> the current coroutine's wakeup queue. blk_aio_ioctl() then returns.
> 
> When blk_aio_ioctl_entry() executes later, it accesses an invalid pointer:
> ....
>     BlkRwCo *rwco = &acb->rwco;
> 
>     rwco->ret = blk_co_ioctl(rwco->blk, rwco->offset,
>                              rwco->qiov->iov[0].iov_base);  <--- qiov is
>                                                                  invalid here
> ...
> 
> In the case when blk_aio_ioctl() is called from a non-coroutine context,
> blk_aio_ioctl_entry() executes immediately. But if bdrv_co_ioctl() calls
> qemu_coroutine_yield(), blk_aio_ioctl() will return. When the coroutine
> execution is complete, control returns to blk_aio_ioctl_entry() after the call
> to blk_co_ioctl(). There is no invalid reference after this point, but the
> function is still holding on to invalid pointers.
> 
> The fix is to allocate memory for the QEMUIOVector and struct iovec as part of
> the request struct which the IO buffer is part of. The memory for this struct is
> guaranteed to be valid till the AIO is completed.

Thanks for the patch!

AIO APIs currently don't require the caller to match qiov's lifetime to
the I/O request lifetime.  This patch changes that for blk_aio_ioctl()
only.  If we want to do this consistently then all aio callers need to
be audited and fixed.

The alternative is to make the API copy qiov when necessary.  That is
less efficient but avoids modifying all callers.

Either way, the lifetime of qiov must be consistent across all aio APIs,
not just blk_aio_ioctl().
Kevin Wolf Nov. 22, 2017, 6:04 p.m. UTC | #3
Am 22.11.2017 um 18:06 hat Stefan Hajnoczi geschrieben:
> On Wed, Nov 22, 2017 at 07:33:28AM -0800, Deepa Srinivasan wrote:
> > Starting qemu with the following arguments causes qemu to segfault:
> > ... -device lsi,id=lsi0 -drive file=iscsi:<...>,format=raw,if=none,node-name=
> > iscsi1 -device scsi-block,bus=lsi0.0,id=<...>,drive=iscsi1
> > 
> > This patch fixes blk_aio_ioctl() so it does not pass stack addresses to
> > blk_aio_ioctl_entry() which may be invoked after blk_aio_ioctl() returns. More
> > details about the bug follow.
> > 
> > blk_aio_ioctl() invokes blk_aio_prwv() with blk_aio_ioctl_entry as the
> > coroutine parameter. blk_aio_prwv() ultimately calls aio_co_enter().
> > 
> > When blk_aio_ioctl() is executed from within a coroutine context (e.g.
> > iscsi_bh_cb()), aio_co_enter() adds the coroutine (blk_aio_ioctl_entry) to
> > the current coroutine's wakeup queue. blk_aio_ioctl() then returns.
> > 
> > When blk_aio_ioctl_entry() executes later, it accesses an invalid pointer:
> > ....
> >     BlkRwCo *rwco = &acb->rwco;
> > 
> >     rwco->ret = blk_co_ioctl(rwco->blk, rwco->offset,
> >                              rwco->qiov->iov[0].iov_base);  <--- qiov is
> >                                                                  invalid here
> > ...
> > 
> > In the case when blk_aio_ioctl() is called from a non-coroutine context,
> > blk_aio_ioctl_entry() executes immediately. But if bdrv_co_ioctl() calls
> > qemu_coroutine_yield(), blk_aio_ioctl() will return. When the coroutine
> > execution is complete, control returns to blk_aio_ioctl_entry() after the call
> > to blk_co_ioctl(). There is no invalid reference after this point, but the
> > function is still holding on to invalid pointers.
> > 
> > The fix is to allocate memory for the QEMUIOVector and struct iovec as part of
> > the request struct which the IO buffer is part of. The memory for this struct is
> > guaranteed to be valid till the AIO is completed.
> 
> Thanks for the patch!
> 
> AIO APIs currently don't require the caller to match qiov's lifetime to
> the I/O request lifetime.  This patch changes that for blk_aio_ioctl()
> only.  If we want to do this consistently then all aio callers need to
> be audited and fixed.
> 
> The alternative is to make the API copy qiov when necessary.  That is
> less efficient but avoids modifying all callers.
> 
> Either way, the lifetime of qiov must be consistent across all aio APIs,
> not just blk_aio_ioctl().

Don't all blk_aio_*() APIs that take a qiov pointer require that it
remains valid until the request completes? I don't think they are copied
anywhere for blk_aio_preadv/pwritev() before being passed to the block
driver.

So this does look consistent with the existing functions to me.

Kevin
Kevin Wolf Nov. 22, 2017, 6:06 p.m. UTC | #4
Am 22.11.2017 um 17:34 hat Paolo Bonzini geschrieben:
> On 22/11/2017 16:33, Deepa Srinivasan wrote:
> > Starting qemu with the following arguments causes qemu to segfault:
> > ... -device lsi,id=lsi0 -drive file=iscsi:<...>,format=raw,if=none,node-name=
> > iscsi1 -device scsi-block,bus=lsi0.0,id=<...>,drive=iscsi1
> > 
> > This patch fixes blk_aio_ioctl() so it does not pass stack addresses to
> > blk_aio_ioctl_entry() which may be invoked after blk_aio_ioctl() returns. More
> > details about the bug follow.
> > 
> > blk_aio_ioctl() invokes blk_aio_prwv() with blk_aio_ioctl_entry as the
> > coroutine parameter. blk_aio_prwv() ultimately calls aio_co_enter().
> > 
> > When blk_aio_ioctl() is executed from within a coroutine context (e.g.
> > iscsi_bh_cb()), aio_co_enter() adds the coroutine (blk_aio_ioctl_entry) to
> > the current coroutine's wakeup queue. blk_aio_ioctl() then returns.
> > 
> > When blk_aio_ioctl_entry() executes later, it accesses an invalid pointer:
> > ....
> >     BlkRwCo *rwco = &acb->rwco;
> > 
> >     rwco->ret = blk_co_ioctl(rwco->blk, rwco->offset,
> >                              rwco->qiov->iov[0].iov_base);  <--- qiov is
> >                                                                  invalid here
> > ...
> > 
> > In the case when blk_aio_ioctl() is called from a non-coroutine context,
> > blk_aio_ioctl_entry() executes immediately. But if bdrv_co_ioctl() calls
> > qemu_coroutine_yield(), blk_aio_ioctl() will return. When the coroutine
> > execution is complete, control returns to blk_aio_ioctl_entry() after the call
> > to blk_co_ioctl(). There is no invalid reference after this point, but the
> > function is still holding on to invalid pointers.
> > 
> > The fix is to allocate memory for the QEMUIOVector and struct iovec as part of
> > the request struct which the IO buffer is part of. The memory for this struct is
> > guaranteed to be valid till the AIO is completed.
> > 
> > Signed-off-by: Deepa Srinivasan <deepa.srinivasan@oracle.com>
> > Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
> > Reviewed-by: Mark Kanda <mark.kanda@oracle.com>
> > ---
> >  block/block-backend.c          | 13 ++-----------
> >  hw/block/virtio-blk.c          |  9 ++++++++-
> >  hw/scsi/scsi-disk.c            | 10 +++++++++-
> >  hw/scsi/scsi-generic.c         |  9 ++++++++-
> >  include/sysemu/block-backend.h |  2 +-
> >  5 files changed, 28 insertions(+), 15 deletions(-)
> > 
> > diff --git a/block/block-backend.c b/block/block-backend.c
> > index baef8e7..c275827 100644
> > --- a/block/block-backend.c
> > +++ b/block/block-backend.c
> > @@ -1472,19 +1472,10 @@ static void blk_aio_ioctl_entry(void *opaque)
> >      blk_aio_complete(acb);
> >  }
> >  
> > -BlockAIOCB *blk_aio_ioctl(BlockBackend *blk, unsigned long int req, void *buf,
> > +BlockAIOCB *blk_aio_ioctl(BlockBackend *blk, unsigned long int req, QEMUIOVector *qiov,
> >                            BlockCompletionFunc *cb, void *opaque)
> 
> I think this is not the best way to fix the bug, because it adds extra
> unnecessary code in the callers.
> 
> Perhaps you can change BlkRwCo's "qiov" field to "void *buf" and the
> same for blk_aio_prwv's "qiov" argument?
> 
> Then the QEMUIOVector is not needed at all, and blk_co_ioctl can just
> use rwco->buf.

But the same struct is used for read and write requests that do use an
actual QEMUIOVector and not just a linear buffer.

Kevin
Paolo Bonzini Nov. 22, 2017, 7:24 p.m. UTC | #5
On 22/11/2017 19:06, Kevin Wolf wrote:
> Am 22.11.2017 um 17:34 hat Paolo Bonzini geschrieben:
>> On 22/11/2017 16:33, Deepa Srinivasan wrote:
>>> Starting qemu with the following arguments causes qemu to segfault:
>>> ... -device lsi,id=lsi0 -drive file=iscsi:<...>,format=raw,if=none,node-name=
>>> iscsi1 -device scsi-block,bus=lsi0.0,id=<...>,drive=iscsi1
>>>
>>> This patch fixes blk_aio_ioctl() so it does not pass stack addresses to
>>> blk_aio_ioctl_entry() which may be invoked after blk_aio_ioctl() returns. More
>>> details about the bug follow.
>>>
>>> blk_aio_ioctl() invokes blk_aio_prwv() with blk_aio_ioctl_entry as the
>>> coroutine parameter. blk_aio_prwv() ultimately calls aio_co_enter().
>>>
>>> When blk_aio_ioctl() is executed from within a coroutine context (e.g.
>>> iscsi_bh_cb()), aio_co_enter() adds the coroutine (blk_aio_ioctl_entry) to
>>> the current coroutine's wakeup queue. blk_aio_ioctl() then returns.
>>>
>>> When blk_aio_ioctl_entry() executes later, it accesses an invalid pointer:
>>> ....
>>>     BlkRwCo *rwco = &acb->rwco;
>>>
>>>     rwco->ret = blk_co_ioctl(rwco->blk, rwco->offset,
>>>                              rwco->qiov->iov[0].iov_base);  <--- qiov is
>>>                                                                  invalid here
>>> ...
>>>
>>> In the case when blk_aio_ioctl() is called from a non-coroutine context,
>>> blk_aio_ioctl_entry() executes immediately. But if bdrv_co_ioctl() calls
>>> qemu_coroutine_yield(), blk_aio_ioctl() will return. When the coroutine
>>> execution is complete, control returns to blk_aio_ioctl_entry() after the call
>>> to blk_co_ioctl(). There is no invalid reference after this point, but the
>>> function is still holding on to invalid pointers.
>>>
>>> The fix is to allocate memory for the QEMUIOVector and struct iovec as part of
>>> the request struct which the IO buffer is part of. The memory for this struct is
>>> guaranteed to be valid till the AIO is completed.
>>>
>>> Signed-off-by: Deepa Srinivasan <deepa.srinivasan@oracle.com>
>>> Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
>>> Reviewed-by: Mark Kanda <mark.kanda@oracle.com>
>>> ---
>>>  block/block-backend.c          | 13 ++-----------
>>>  hw/block/virtio-blk.c          |  9 ++++++++-
>>>  hw/scsi/scsi-disk.c            | 10 +++++++++-
>>>  hw/scsi/scsi-generic.c         |  9 ++++++++-
>>>  include/sysemu/block-backend.h |  2 +-
>>>  5 files changed, 28 insertions(+), 15 deletions(-)
>>>
>>> diff --git a/block/block-backend.c b/block/block-backend.c
>>> index baef8e7..c275827 100644
>>> --- a/block/block-backend.c
>>> +++ b/block/block-backend.c
>>> @@ -1472,19 +1472,10 @@ static void blk_aio_ioctl_entry(void *opaque)
>>>      blk_aio_complete(acb);
>>>  }
>>>  
>>> -BlockAIOCB *blk_aio_ioctl(BlockBackend *blk, unsigned long int req, void *buf,
>>> +BlockAIOCB *blk_aio_ioctl(BlockBackend *blk, unsigned long int req, QEMUIOVector *qiov,
>>>                            BlockCompletionFunc *cb, void *opaque)
>>
>> I think this is not the best way to fix the bug, because it adds extra
>> unnecessary code in the callers.
>>
>> Perhaps you can change BlkRwCo's "qiov" field to "void *buf" and the
>> same for blk_aio_prwv's "qiov" argument?
>>
>> Then the QEMUIOVector is not needed at all, and blk_co_ioctl can just
>> use rwco->buf.
> 
> But the same struct is used for read and write requests that do use an
> actual QEMUIOVector and not just a linear buffer.

Then let's call it "void *opaque", or make it a union (but I think
that's overkill).

The QEMUIOVector pointer is opaque as far as blk_aio_prwv is concerned,
and it is only created by blk_aio_ioctl for blk_aio_ioctl_entry to
extract buf:

    rwco->ret = blk_co_ioctl(rwco->blk, rwco->offset,
                             rwco->qiov->iov[0].iov_base);

Exposing the fake QEMUIOVector to the callers of blk_aio_ioctl is much
uglier than using a void* for what is effectively a multi-type pointer.

Paolo
Deepa Srinivasan Nov. 23, 2017, 2:55 a.m. UTC | #6
I agree that passing in a QEMUIOVector to blk_aio_ioctl() as a holder of the void* buffer used in blk_aio_ioctl_entry() is unnecessary. But, as Kevin noted, read and write were using the QEMUIOVector in BlkRwCo.

To avoid changes to the callers of blk_aio_ioctl(), I’ll change blk_aio_prwv() to take a void pointer instead of QEMUIOVector* and use a union to hold the buffer in BlkRwCo.

> On Nov 22, 2017, at 11:24 AM, Paolo Bonzini <pbonzini@redhat.com> wrote:
> 
> On 22/11/2017 19:06, Kevin Wolf wrote:
>> Am 22.11.2017 um 17:34 hat Paolo Bonzini geschrieben:
>>> On 22/11/2017 16:33, Deepa Srinivasan wrote:
>>>> Starting qemu with the following arguments causes qemu to segfault:
>>>> ... -device lsi,id=lsi0 -drive file=iscsi:<...>,format=raw,if=none,node-name=
>>>> iscsi1 -device scsi-block,bus=lsi0.0,id=<...>,drive=iscsi1
>>>> 
>>>> This patch fixes blk_aio_ioctl() so it does not pass stack addresses to
>>>> blk_aio_ioctl_entry() which may be invoked after blk_aio_ioctl() returns. More
>>>> details about the bug follow.
>>>> 
>>>> blk_aio_ioctl() invokes blk_aio_prwv() with blk_aio_ioctl_entry as the
>>>> coroutine parameter. blk_aio_prwv() ultimately calls aio_co_enter().
>>>> 
>>>> When blk_aio_ioctl() is executed from within a coroutine context (e.g.
>>>> iscsi_bh_cb()), aio_co_enter() adds the coroutine (blk_aio_ioctl_entry) to
>>>> the current coroutine's wakeup queue. blk_aio_ioctl() then returns.
>>>> 
>>>> When blk_aio_ioctl_entry() executes later, it accesses an invalid pointer:
>>>> ....
>>>>    BlkRwCo *rwco = &acb->rwco;
>>>> 
>>>>    rwco->ret = blk_co_ioctl(rwco->blk, rwco->offset,
>>>>                             rwco->qiov->iov[0].iov_base);  <--- qiov is
>>>>                                                                 invalid here
>>>> ...
>>>> 
>>>> In the case when blk_aio_ioctl() is called from a non-coroutine context,
>>>> blk_aio_ioctl_entry() executes immediately. But if bdrv_co_ioctl() calls
>>>> qemu_coroutine_yield(), blk_aio_ioctl() will return. When the coroutine
>>>> execution is complete, control returns to blk_aio_ioctl_entry() after the call
>>>> to blk_co_ioctl(). There is no invalid reference after this point, but the
>>>> function is still holding on to invalid pointers.
>>>> 
>>>> The fix is to allocate memory for the QEMUIOVector and struct iovec as part of
>>>> the request struct which the IO buffer is part of. The memory for this struct is
>>>> guaranteed to be valid till the AIO is completed.
>>>> 
>>>> Signed-off-by: Deepa Srinivasan <deepa.srinivasan@oracle.com>
>>>> Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
>>>> Reviewed-by: Mark Kanda <mark.kanda@oracle.com>
>>>> ---
>>>> block/block-backend.c          | 13 ++-----------
>>>> hw/block/virtio-blk.c          |  9 ++++++++-
>>>> hw/scsi/scsi-disk.c            | 10 +++++++++-
>>>> hw/scsi/scsi-generic.c         |  9 ++++++++-
>>>> include/sysemu/block-backend.h |  2 +-
>>>> 5 files changed, 28 insertions(+), 15 deletions(-)
>>>> 
>>>> diff --git a/block/block-backend.c b/block/block-backend.c
>>>> index baef8e7..c275827 100644
>>>> --- a/block/block-backend.c
>>>> +++ b/block/block-backend.c
>>>> @@ -1472,19 +1472,10 @@ static void blk_aio_ioctl_entry(void *opaque)
>>>>     blk_aio_complete(acb);
>>>> }
>>>> 
>>>> -BlockAIOCB *blk_aio_ioctl(BlockBackend *blk, unsigned long int req, void *buf,
>>>> +BlockAIOCB *blk_aio_ioctl(BlockBackend *blk, unsigned long int req, QEMUIOVector *qiov,
>>>>                           BlockCompletionFunc *cb, void *opaque)
>>> 
>>> I think this is not the best way to fix the bug, because it adds extra
>>> unnecessary code in the callers.
>>> 
>>> Perhaps you can change BlkRwCo's "qiov" field to "void *buf" and the
>>> same for blk_aio_prwv's "qiov" argument?
>>> 
>>> Then the QEMUIOVector is not needed at all, and blk_co_ioctl can just
>>> use rwco->buf.
>> 
>> But the same struct is used for read and write requests that do use an
>> actual QEMUIOVector and not just a linear buffer.
> 
> Then let's call it "void *opaque", or make it a union (but I think
> that's overkill).
> 
> The QEMUIOVector pointer is opaque as far as blk_aio_prwv is concerned,
> and it is only created by blk_aio_ioctl for blk_aio_ioctl_entry to
> extract buf:
> 
>    rwco->ret = blk_co_ioctl(rwco->blk, rwco->offset,
>                             rwco->qiov->iov[0].iov_base);
> 
> Exposing the fake QEMUIOVector to the callers of blk_aio_ioctl is much
> uglier than using a void* for what is effectively a multi-type pointer.
> 
> Paolo
Paolo Bonzini Nov. 23, 2017, 9:18 a.m. UTC | #7
On 23/11/2017 03:55, Deepa Srinivasan wrote:
> I agree that passing in QEMUIOVector to blk_aio_ioctl() as a holder of
> the void* buffer used in blk_aio_ioctl_entry() is unnecessary. But, as
> Kevin noted, read and write were using the QEMUIOVector in BlkRwCo.
> 
> To avoid changes to the callers of blk_aio_ioctl(), I’ll change
> blk_aio_prwv() to take a void pointer instead of QEMUIOVector* and use a
> union to hold the buffer in BlkRwCo.

The union is unnecessary.  A QEMUIOVector* can be stored in a void* just
fine.

Paolo
Stefan Hajnoczi Nov. 23, 2017, 10:23 a.m. UTC | #8
On Wed, Nov 22, 2017 at 07:04:26PM +0100, Kevin Wolf wrote:
> Am 22.11.2017 um 18:06 hat Stefan Hajnoczi geschrieben:
> > On Wed, Nov 22, 2017 at 07:33:28AM -0800, Deepa Srinivasan wrote:
> > > Starting qemu with the following arguments causes qemu to segfault:
> > > ... -device lsi,id=lsi0 -drive file=iscsi:<...>,format=raw,if=none,node-name=
> > > iscsi1 -device scsi-block,bus=lsi0.0,id=<...>,drive=iscsi1
> > > 
> > > This patch fixes blk_aio_ioctl() so it does not pass stack addresses to
> > > blk_aio_ioctl_entry() which may be invoked after blk_aio_ioctl() returns. More
> > > details about the bug follow.
> > > 
> > > blk_aio_ioctl() invokes blk_aio_prwv() with blk_aio_ioctl_entry as the
> > > coroutine parameter. blk_aio_prwv() ultimately calls aio_co_enter().
> > > 
> > > When blk_aio_ioctl() is executed from within a coroutine context (e.g.
> > > iscsi_bh_cb()), aio_co_enter() adds the coroutine (blk_aio_ioctl_entry) to
> > > the current coroutine's wakeup queue. blk_aio_ioctl() then returns.
> > > 
> > > When blk_aio_ioctl_entry() executes later, it accesses an invalid pointer:
> > > ....
> > >     BlkRwCo *rwco = &acb->rwco;
> > > 
> > >     rwco->ret = blk_co_ioctl(rwco->blk, rwco->offset,
> > >                              rwco->qiov->iov[0].iov_base);  <--- qiov is
> > >                                                                  invalid here
> > > ...
> > > 
> > > In the case when blk_aio_ioctl() is called from a non-coroutine context,
> > > blk_aio_ioctl_entry() executes immediately. But if bdrv_co_ioctl() calls
> > > qemu_coroutine_yield(), blk_aio_ioctl() will return. When the coroutine
> > > execution is complete, control returns to blk_aio_ioctl_entry() after the call
> > > to blk_co_ioctl(). There is no invalid reference after this point, but the
> > > function is still holding on to invalid pointers.
> > > 
> > > The fix is to allocate memory for the QEMUIOVector and struct iovec as part of
> > > the request struct which the IO buffer is part of. The memory for this struct is
> > > guaranteed to be valid till the AIO is completed.
> > 
> > Thanks for the patch!
> > 
> > AIO APIs currently don't require the caller to match qiov's lifetime to
> > the I/O request lifetime.  This patch changes that for blk_aio_ioctl()
> > only.  If we want to do this consistently then all aio callers need to
> > be audited and fixed.
> > 
> > The alternative is to make the API copy qiov when necessary.  That is
> > less efficient but avoids modifying all callers.
> > 
> > Either way, the lifetime of qiov must be consistent across all aio APIs,
> > not just blk_aio_ioctl().
> 
> Don't all blk_aio_*() APIs that take a qiov pointer require that it
> remains valid until the request completes? I don't think they are copied
> anywhere for blk_aio_preadv/pwritev() before being passed to the block
> driver.
> 
> So this does look consistent with the existing functions to me.

You are right.  I audited the blk_aio_preadv() callers and they all keep
qiov around until the request is complete.

Actually this makes sense because even in the simple non-coroutine case
with aio=threads the qiov hasn't necessarily been read yet when the
function returns.  The aio_worker() function executes later and only
then is qiov handed to the host kernel.

So this is a one-off bug in blk_aio_ioctl() callers.

Stefan
Paolo Bonzini Nov. 23, 2017, 10:42 a.m. UTC | #9
On 23/11/2017 11:23, Stefan Hajnoczi wrote:
> You are right.  I audited the blk_aio_preadv() callers and they all keep
> qiov around until the request is complete.
> 
> Actually this makes sense because even in the simple non-coroutine case
> with aio=threads the qiov hasn't necessarily been read yet when the
> function returns.  The aio_worker() function executes later and only
> then is qiov handed to the host kernel.
> 
> So this is a one-off bug in blk_aio_ioctl() callers.

Only in blk_aio_ioctl, not in the callers.

Paolo
diff mbox series

Patch

diff --git a/block/block-backend.c b/block/block-backend.c
index baef8e7..c275827 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -1472,19 +1472,10 @@  static void blk_aio_ioctl_entry(void *opaque)
     blk_aio_complete(acb);
 }
 
-BlockAIOCB *blk_aio_ioctl(BlockBackend *blk, unsigned long int req, void *buf,
+BlockAIOCB *blk_aio_ioctl(BlockBackend *blk, unsigned long int req, QEMUIOVector *qiov,
                           BlockCompletionFunc *cb, void *opaque)
 {
-    QEMUIOVector qiov;
-    struct iovec iov;
-
-    iov = (struct iovec) {
-        .iov_base = buf,
-        .iov_len = 0,
-    };
-    qemu_iovec_init_external(&qiov, &iov, 1);
-
-    return blk_aio_prwv(blk, req, 0, &qiov, blk_aio_ioctl_entry, 0, cb, opaque);
+    return blk_aio_prwv(blk, req, 0, qiov, blk_aio_ioctl_entry, 0, cb, opaque);
 }
 
 int blk_co_pdiscard(BlockBackend *blk, int64_t offset, int bytes)
diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
index 05d1440..ed9f774 100644
--- a/hw/block/virtio-blk.c
+++ b/hw/block/virtio-blk.c
@@ -151,6 +151,8 @@  out:
 typedef struct {
     VirtIOBlockReq *req;
     struct sg_io_hdr hdr;
+    QEMUIOVector qiov;
+    struct iovec iov;
 } VirtIOBlockIoctlReq;
 
 static void virtio_blk_ioctl_complete(void *opaque, int status)
@@ -298,7 +300,12 @@  static int virtio_blk_handle_scsi_req(VirtIOBlockReq *req)
     ioctl_req->hdr.sbp = elem->in_sg[elem->in_num - 3].iov_base;
     ioctl_req->hdr.mx_sb_len = elem->in_sg[elem->in_num - 3].iov_len;
 
-    acb = blk_aio_ioctl(blk->blk, SG_IO, &ioctl_req->hdr,
+    ioctl_req->iov.iov_base = &ioctl_req->hdr;
+    ioctl_req->iov.iov_len = 0;
+
+    qemu_iovec_init_external(&ioctl_req->qiov, &ioctl_req->iov, 1);
+
+    acb = blk_aio_ioctl(blk->blk, SG_IO, &ioctl_req->qiov,
                         virtio_blk_ioctl_complete, ioctl_req);
     if (!acb) {
         g_free(ioctl_req);
diff --git a/hw/scsi/scsi-disk.c b/hw/scsi/scsi-disk.c
index 1243117..7cbe18d 100644
--- a/hw/scsi/scsi-disk.c
+++ b/hw/scsi/scsi-disk.c
@@ -2636,6 +2636,9 @@  typedef struct SCSIBlockReq {
     SCSIDiskReq req;
     sg_io_hdr_t io_header;
 
+    QEMUIOVector qiov;
+    struct iovec iov;
+
     /* Selected bytes of the original CDB, copied into our own CDB.  */
     uint8_t cmd, cdb1, group_number;
 
@@ -2722,7 +2725,12 @@  static BlockAIOCB *scsi_block_do_sgio(SCSIBlockReq *req,
     io_header->usr_ptr = r;
     io_header->flags |= SG_FLAG_DIRECT_IO;
 
-    aiocb = blk_aio_ioctl(s->qdev.conf.blk, SG_IO, io_header, cb, opaque);
+    req->iov.iov_base = io_header;
+    req->iov.iov_len = 0;
+
+    qemu_iovec_init_external(&req->qiov, &req->iov, 1);
+
+    aiocb = blk_aio_ioctl(s->qdev.conf.blk, SG_IO, &req->qiov, cb, opaque);
     assert(aiocb != NULL);
     return aiocb;
 }
diff --git a/hw/scsi/scsi-generic.c b/hw/scsi/scsi-generic.c
index bd0d9ff..856af7c 100644
--- a/hw/scsi/scsi-generic.c
+++ b/hw/scsi/scsi-generic.c
@@ -46,6 +46,8 @@  typedef struct SCSIGenericReq {
     int buflen;
     int len;
     sg_io_hdr_t io_header;
+    QEMUIOVector qiov;
+    struct iovec iov;
 } SCSIGenericReq;
 
 static void scsi_generic_save_request(QEMUFile *f, SCSIRequest *req)
@@ -135,7 +137,12 @@  static int execute_command(BlockBackend *blk,
     r->io_header.usr_ptr = r;
     r->io_header.flags |= SG_FLAG_DIRECT_IO;
 
-    r->req.aiocb = blk_aio_ioctl(blk, SG_IO, &r->io_header, complete, r);
+    r->iov.iov_base = &r->io_header;
+    r->iov.iov_len = 0;
+
+    qemu_iovec_init_external(&r->qiov, &r->iov, 1);
+
+    r->req.aiocb = blk_aio_ioctl(blk, SG_IO, &r->qiov, complete, r);
     if (r->req.aiocb == NULL) {
         return -EIO;
     }
diff --git a/include/sysemu/block-backend.h b/include/sysemu/block-backend.h
index c4e52a5..32f4486 100644
--- a/include/sysemu/block-backend.h
+++ b/include/sysemu/block-backend.h
@@ -151,7 +151,7 @@  void blk_aio_cancel(BlockAIOCB *acb);
 void blk_aio_cancel_async(BlockAIOCB *acb);
 int blk_co_ioctl(BlockBackend *blk, unsigned long int req, void *buf);
 int blk_ioctl(BlockBackend *blk, unsigned long int req, void *buf);
-BlockAIOCB *blk_aio_ioctl(BlockBackend *blk, unsigned long int req, void *buf,
+BlockAIOCB *blk_aio_ioctl(BlockBackend *blk, unsigned long int req, QEMUIOVector *qiov,
                           BlockCompletionFunc *cb, void *opaque);
 int blk_co_pdiscard(BlockBackend *blk, int64_t offset, int bytes);
 int blk_co_flush(BlockBackend *blk);