diff mbox

[v6,01/13] block: Add op blocker type "device IO"

Message ID 1432190583-10518-2-git-send-email-famz@redhat.com
State New
Headers show

Commit Message

Fam Zheng May 21, 2015, 6:42 a.m. UTC
It blocks device IO.

All bdrv_op_block_all/blk_op_block_all callers are taken care of:

- virtio_blk_data_plane_create
- virtio_scsi_hotplug

  Device creation, unblock it.

- bdrv_set_backing_hd

  Backing hd is not used by device, so blocking is OK.

- backup_start

  Blocking target when backup is running, unblock it.

- mirror_complete

  Blocking s->to_replace until mirror_exit, OK.

- block_job_complete

  The block job may be long running. Unblock it.

- init_blk_migration

  The block migration may be long running, Unblock it.

Signed-off-by: Fam Zheng <famz@redhat.com>
---
 blockjob.c                      | 1 +
 hw/block/dataplane/virtio-blk.c | 1 +
 hw/scsi/virtio-scsi.c           | 1 +
 include/block/block.h           | 1 +
 migration/block.c               | 1 +
 5 files changed, 5 insertions(+)

Comments

Wen Congyang May 21, 2015, 7:06 a.m. UTC | #1
On 05/21/2015 02:42 PM, Fam Zheng wrote:
> It blocks device IO.
> 
> All bdrv_op_block_all/blk_op_block_all callers are taken care of:
> 
> - virtio_blk_data_plane_create
> - virtio_scsi_hotplug
> 
>   Device creation, unblock it.
> 
> - bdrv_set_backing_hd
> 
>   Backing hd is not used by device, so blocking is OK.
> 
> - backup_start
> 
>   Blocking target when backup is running, unblock it.

Do you forget it?

> 
> - mirror_complete
> 
>   Blocking s->to_replace until mirror_exit, OK.
> 
> - block_job_complete
> 
>   The block job may be long running. Unblock it.

It should be block_job_create()

Thanks
Wen Congyang

> 
> - init_blk_migration
> 
>   The block migration may be long running, Unblock it.
> 
> Signed-off-by: Fam Zheng <famz@redhat.com>
> ---
>  blockjob.c                      | 1 +
>  hw/block/dataplane/virtio-blk.c | 1 +
>  hw/scsi/virtio-scsi.c           | 1 +
>  include/block/block.h           | 1 +
>  migration/block.c               | 1 +
>  5 files changed, 5 insertions(+)
> 
> diff --git a/blockjob.c b/blockjob.c
> index 2755465..e39bdde 100644
> --- a/blockjob.c
> +++ b/blockjob.c
> @@ -51,6 +51,7 @@ void *block_job_create(const BlockJobDriver *driver, BlockDriverState *bs,
>                 BlockJobType_lookup[driver->job_type]);
>      bdrv_op_block_all(bs, job->blocker);
>      bdrv_op_unblock(bs, BLOCK_OP_TYPE_DATAPLANE, job->blocker);
> +    bdrv_op_unblock(bs, BLOCK_OP_TYPE_DEVICE_IO, job->blocker);
>  
>      job->driver        = driver;
>      job->bs            = bs;
> diff --git a/hw/block/dataplane/virtio-blk.c b/hw/block/dataplane/virtio-blk.c
> index 3db139b..3ecc8bd 100644
> --- a/hw/block/dataplane/virtio-blk.c
> +++ b/hw/block/dataplane/virtio-blk.c
> @@ -209,6 +209,7 @@ void virtio_blk_data_plane_create(VirtIODevice *vdev, VirtIOBlkConf *conf,
>      blk_op_unblock(conf->conf.blk, BLOCK_OP_TYPE_MIRROR, s->blocker);
>      blk_op_unblock(conf->conf.blk, BLOCK_OP_TYPE_STREAM, s->blocker);
>      blk_op_unblock(conf->conf.blk, BLOCK_OP_TYPE_REPLACE, s->blocker);
> +    blk_op_unblock(conf->conf.blk, BLOCK_OP_TYPE_DEVICE_IO, s->blocker);
>  
>      *dataplane = s;
>  }
> diff --git a/hw/scsi/virtio-scsi.c b/hw/scsi/virtio-scsi.c
> index e242fef..5e15fa6 100644
> --- a/hw/scsi/virtio-scsi.c
> +++ b/hw/scsi/virtio-scsi.c
> @@ -772,6 +772,7 @@ static void virtio_scsi_hotplug(HotplugHandler *hotplug_dev, DeviceState *dev,
>              return;
>          }
>          blk_op_block_all(sd->conf.blk, s->blocker);
> +        blk_op_unblock(sd->conf.blk, BLOCK_OP_TYPE_DEVICE_IO, s->blocker);
>          aio_context_acquire(s->ctx);
>          blk_set_aio_context(sd->conf.blk, s->ctx);
>          aio_context_release(s->ctx);
> diff --git a/include/block/block.h b/include/block/block.h
> index 7d1a717..906fb31 100644
> --- a/include/block/block.h
> +++ b/include/block/block.h
> @@ -159,6 +159,7 @@ typedef enum BlockOpType {
>      BLOCK_OP_TYPE_RESIZE,
>      BLOCK_OP_TYPE_STREAM,
>      BLOCK_OP_TYPE_REPLACE,
> +    BLOCK_OP_TYPE_DEVICE_IO,
>      BLOCK_OP_TYPE_MAX,
>  } BlockOpType;
>  
> diff --git a/migration/block.c b/migration/block.c
> index ddb59cc..b833bac 100644
> --- a/migration/block.c
> +++ b/migration/block.c
> @@ -379,6 +379,7 @@ static void init_blk_migration(QEMUFile *f)
>          alloc_aio_bitmap(bmds);
>          error_setg(&bmds->blocker, "block device is in use by migration");
>          bdrv_op_block_all(bs, bmds->blocker);
> +        bdrv_op_unblock(bs, BLOCK_OP_TYPE_DEVICE_IO, bmds->blocker);
>          bdrv_ref(bs);
>  
>          block_mig_state.total_sector_sum += sectors;
>
Fam Zheng May 21, 2015, 7:32 a.m. UTC | #2
On Thu, 05/21 15:06, Wen Congyang wrote:
> On 05/21/2015 02:42 PM, Fam Zheng wrote:
> > It blocks device IO.
> > 
> > All bdrv_op_block_all/blk_op_block_all callers are taken care of:
> > 
> > - virtio_blk_data_plane_create
> > - virtio_scsi_hotplug
> > 
> >   Device creation, unblock it.
> > 
> > - bdrv_set_backing_hd
> > 
> >   Backing hd is not used by device, so blocking is OK.
> > 
> > - backup_start
> > 
> >   Blocking target when backup is running, unblock it.
> 
> Do you forget it?

Oh I think the commit log is wrong: the target image is only written to by
block job, there cannot be a device on it, so it it's similar to
bdrv_set_backing_hd.

> 
> > 
> > - mirror_complete
> > 
> >   Blocking s->to_replace until mirror_exit, OK.
> > 
> > - block_job_complete
> > 
> >   The block job may be long running. Unblock it.
> 
> It should be block_job_create()

Yes.

Fam

> 
> Thanks
> Wen Congyang
> 
> > 
> > - init_blk_migration
> > 
> >   The block migration may be long running, Unblock it.
> > 
> > Signed-off-by: Fam Zheng <famz@redhat.com>
> > ---
> >  blockjob.c                      | 1 +
> >  hw/block/dataplane/virtio-blk.c | 1 +
> >  hw/scsi/virtio-scsi.c           | 1 +
> >  include/block/block.h           | 1 +
> >  migration/block.c               | 1 +
> >  5 files changed, 5 insertions(+)
> > 
> > diff --git a/blockjob.c b/blockjob.c
> > index 2755465..e39bdde 100644
> > --- a/blockjob.c
> > +++ b/blockjob.c
> > @@ -51,6 +51,7 @@ void *block_job_create(const BlockJobDriver *driver, BlockDriverState *bs,
> >                 BlockJobType_lookup[driver->job_type]);
> >      bdrv_op_block_all(bs, job->blocker);
> >      bdrv_op_unblock(bs, BLOCK_OP_TYPE_DATAPLANE, job->blocker);
> > +    bdrv_op_unblock(bs, BLOCK_OP_TYPE_DEVICE_IO, job->blocker);
> >  
> >      job->driver        = driver;
> >      job->bs            = bs;
> > diff --git a/hw/block/dataplane/virtio-blk.c b/hw/block/dataplane/virtio-blk.c
> > index 3db139b..3ecc8bd 100644
> > --- a/hw/block/dataplane/virtio-blk.c
> > +++ b/hw/block/dataplane/virtio-blk.c
> > @@ -209,6 +209,7 @@ void virtio_blk_data_plane_create(VirtIODevice *vdev, VirtIOBlkConf *conf,
> >      blk_op_unblock(conf->conf.blk, BLOCK_OP_TYPE_MIRROR, s->blocker);
> >      blk_op_unblock(conf->conf.blk, BLOCK_OP_TYPE_STREAM, s->blocker);
> >      blk_op_unblock(conf->conf.blk, BLOCK_OP_TYPE_REPLACE, s->blocker);
> > +    blk_op_unblock(conf->conf.blk, BLOCK_OP_TYPE_DEVICE_IO, s->blocker);
> >  
> >      *dataplane = s;
> >  }
> > diff --git a/hw/scsi/virtio-scsi.c b/hw/scsi/virtio-scsi.c
> > index e242fef..5e15fa6 100644
> > --- a/hw/scsi/virtio-scsi.c
> > +++ b/hw/scsi/virtio-scsi.c
> > @@ -772,6 +772,7 @@ static void virtio_scsi_hotplug(HotplugHandler *hotplug_dev, DeviceState *dev,
> >              return;
> >          }
> >          blk_op_block_all(sd->conf.blk, s->blocker);
> > +        blk_op_unblock(sd->conf.blk, BLOCK_OP_TYPE_DEVICE_IO, s->blocker);
> >          aio_context_acquire(s->ctx);
> >          blk_set_aio_context(sd->conf.blk, s->ctx);
> >          aio_context_release(s->ctx);
> > diff --git a/include/block/block.h b/include/block/block.h
> > index 7d1a717..906fb31 100644
> > --- a/include/block/block.h
> > +++ b/include/block/block.h
> > @@ -159,6 +159,7 @@ typedef enum BlockOpType {
> >      BLOCK_OP_TYPE_RESIZE,
> >      BLOCK_OP_TYPE_STREAM,
> >      BLOCK_OP_TYPE_REPLACE,
> > +    BLOCK_OP_TYPE_DEVICE_IO,
> >      BLOCK_OP_TYPE_MAX,
> >  } BlockOpType;
> >  
> > diff --git a/migration/block.c b/migration/block.c
> > index ddb59cc..b833bac 100644
> > --- a/migration/block.c
> > +++ b/migration/block.c
> > @@ -379,6 +379,7 @@ static void init_blk_migration(QEMUFile *f)
> >          alloc_aio_bitmap(bmds);
> >          error_setg(&bmds->blocker, "block device is in use by migration");
> >          bdrv_op_block_all(bs, bmds->blocker);
> > +        bdrv_op_unblock(bs, BLOCK_OP_TYPE_DEVICE_IO, bmds->blocker);
> >          bdrv_ref(bs);
> >  
> >          block_mig_state.total_sector_sum += sectors;
> > 
>
Wen Congyang May 21, 2015, 8 a.m. UTC | #3
On 05/21/2015 02:42 PM, Fam Zheng wrote:
> It blocks device IO.

Does tt only block virtio-blk/scsi? Not all block types?

Thanks
Wen Congyang

> 
> All bdrv_op_block_all/blk_op_block_all callers are taken care of:
> 
> - virtio_blk_data_plane_create
> - virtio_scsi_hotplug
> 
>   Device creation, unblock it.
> 
> - bdrv_set_backing_hd
> 
>   Backing hd is not used by device, so blocking is OK.
> 
> - backup_start
> 
>   Blocking target when backup is running, unblock it.
> 
> - mirror_complete
> 
>   Blocking s->to_replace until mirror_exit, OK.
> 
> - block_job_complete
> 
>   The block job may be long running. Unblock it.
> 
> - init_blk_migration
> 
>   The block migration may be long running, Unblock it.
> 
> Signed-off-by: Fam Zheng <famz@redhat.com>
> ---
>  blockjob.c                      | 1 +
>  hw/block/dataplane/virtio-blk.c | 1 +
>  hw/scsi/virtio-scsi.c           | 1 +
>  include/block/block.h           | 1 +
>  migration/block.c               | 1 +
>  5 files changed, 5 insertions(+)
> 
> diff --git a/blockjob.c b/blockjob.c
> index 2755465..e39bdde 100644
> --- a/blockjob.c
> +++ b/blockjob.c
> @@ -51,6 +51,7 @@ void *block_job_create(const BlockJobDriver *driver, BlockDriverState *bs,
>                 BlockJobType_lookup[driver->job_type]);
>      bdrv_op_block_all(bs, job->blocker);
>      bdrv_op_unblock(bs, BLOCK_OP_TYPE_DATAPLANE, job->blocker);
> +    bdrv_op_unblock(bs, BLOCK_OP_TYPE_DEVICE_IO, job->blocker);
>  
>      job->driver        = driver;
>      job->bs            = bs;
> diff --git a/hw/block/dataplane/virtio-blk.c b/hw/block/dataplane/virtio-blk.c
> index 3db139b..3ecc8bd 100644
> --- a/hw/block/dataplane/virtio-blk.c
> +++ b/hw/block/dataplane/virtio-blk.c
> @@ -209,6 +209,7 @@ void virtio_blk_data_plane_create(VirtIODevice *vdev, VirtIOBlkConf *conf,
>      blk_op_unblock(conf->conf.blk, BLOCK_OP_TYPE_MIRROR, s->blocker);
>      blk_op_unblock(conf->conf.blk, BLOCK_OP_TYPE_STREAM, s->blocker);
>      blk_op_unblock(conf->conf.blk, BLOCK_OP_TYPE_REPLACE, s->blocker);
> +    blk_op_unblock(conf->conf.blk, BLOCK_OP_TYPE_DEVICE_IO, s->blocker);
>  
>      *dataplane = s;
>  }
> diff --git a/hw/scsi/virtio-scsi.c b/hw/scsi/virtio-scsi.c
> index e242fef..5e15fa6 100644
> --- a/hw/scsi/virtio-scsi.c
> +++ b/hw/scsi/virtio-scsi.c
> @@ -772,6 +772,7 @@ static void virtio_scsi_hotplug(HotplugHandler *hotplug_dev, DeviceState *dev,
>              return;
>          }
>          blk_op_block_all(sd->conf.blk, s->blocker);
> +        blk_op_unblock(sd->conf.blk, BLOCK_OP_TYPE_DEVICE_IO, s->blocker);
>          aio_context_acquire(s->ctx);
>          blk_set_aio_context(sd->conf.blk, s->ctx);
>          aio_context_release(s->ctx);
> diff --git a/include/block/block.h b/include/block/block.h
> index 7d1a717..906fb31 100644
> --- a/include/block/block.h
> +++ b/include/block/block.h
> @@ -159,6 +159,7 @@ typedef enum BlockOpType {
>      BLOCK_OP_TYPE_RESIZE,
>      BLOCK_OP_TYPE_STREAM,
>      BLOCK_OP_TYPE_REPLACE,
> +    BLOCK_OP_TYPE_DEVICE_IO,
>      BLOCK_OP_TYPE_MAX,
>  } BlockOpType;
>  
> diff --git a/migration/block.c b/migration/block.c
> index ddb59cc..b833bac 100644
> --- a/migration/block.c
> +++ b/migration/block.c
> @@ -379,6 +379,7 @@ static void init_blk_migration(QEMUFile *f)
>          alloc_aio_bitmap(bmds);
>          error_setg(&bmds->blocker, "block device is in use by migration");
>          bdrv_op_block_all(bs, bmds->blocker);
> +        bdrv_op_unblock(bs, BLOCK_OP_TYPE_DEVICE_IO, bmds->blocker);
>          bdrv_ref(bs);
>  
>          block_mig_state.total_sector_sum += sectors;
>
Fam Zheng May 21, 2015, 12:44 p.m. UTC | #4
On Thu, 05/21 16:00, Wen Congyang wrote:
> On 05/21/2015 02:42 PM, Fam Zheng wrote:
> > It blocks device IO.
> 
> Does tt only block virtio-blk/scsi? Not all block types?

It's only necessary for dataplane enabled devices, which are virtio-blk and
virtio-scsi at the moment. Other types are OK.


Fam
Fam Zheng May 22, 2015, 4:54 a.m. UTC | #5
On Thu, 05/21 15:32, Fam Zheng wrote:
> On Thu, 05/21 15:06, Wen Congyang wrote:
> > On 05/21/2015 02:42 PM, Fam Zheng wrote:
> > > It blocks device IO.
> > > 
> > > All bdrv_op_block_all/blk_op_block_all callers are taken care of:
> > > 
> > > - virtio_blk_data_plane_create
> > > - virtio_scsi_hotplug
> > > 
> > >   Device creation, unblock it.
> > > 
> > > - bdrv_set_backing_hd
> > > 
> > >   Backing hd is not used by device, so blocking is OK.
> > > 
> > > - backup_start
> > > 
> > >   Blocking target when backup is running, unblock it.
> > 
> > Do you forget it?
> 
> Oh I think the commit log is wrong: the target image is only written to by
> block job, there cannot be a device on it, so it it's similar to
> bdrv_set_backing_hd.

Correction: if it's blockdev-backup, the target could have a device, in that
sense it should be unblocked like block_job_create(). I'll fix it.

Fam
Wen Congyang May 22, 2015, 6:18 a.m. UTC | #6
On 05/21/2015 08:44 PM, Fam Zheng wrote:
> On Thu, 05/21 16:00, Wen Congyang wrote:
>> On 05/21/2015 02:42 PM, Fam Zheng wrote:
>>> It blocks device IO.
>>
>> Does tt only block virtio-blk/scsi? Not all block types?
> 
> It's only necessary for dataplane enabled devices, which are virtio-blk and
> virtio-scsi at the moment. Other types are OK.

Hmm, it is OK for qmp transaction. But I think it is not OK for mirror job.

Thanks
Wen Congyang

> 
> 
> Fam
> .
>
Max Reitz May 23, 2015, 4:51 p.m. UTC | #7
On 22.05.2015 06:54, Fam Zheng wrote:
> On Thu, 05/21 15:32, Fam Zheng wrote:
>> On Thu, 05/21 15:06, Wen Congyang wrote:
>>> On 05/21/2015 02:42 PM, Fam Zheng wrote:
>>>> It blocks device IO.
>>>>
>>>> All bdrv_op_block_all/blk_op_block_all callers are taken care of:
>>>>
>>>> - virtio_blk_data_plane_create
>>>> - virtio_scsi_hotplug
>>>>
>>>>    Device creation, unblock it.
>>>>
>>>> - bdrv_set_backing_hd
>>>>
>>>>    Backing hd is not used by device, so blocking is OK.
>>>>
>>>> - backup_start
>>>>
>>>>    Blocking target when backup is running, unblock it.
>>> Do you forget it?
>> Oh I think the commit log is wrong: the target image is only written to by
>> block job, there cannot be a device on it, so it it's similar to
>> bdrv_set_backing_hd.
> Correction: if it's blockdev-backup, the target could have a device, in that
> sense it should be unblocked like block_job_create(). I'll fix it.

Really? I think it makes sense not to allow I/O on a backup target. At 
least I can't imagine a use case where you'd want to do that... But that 
doesn't necessarily mean anything, of course.

Max
Fam Zheng May 25, 2015, 2:15 a.m. UTC | #8
On Sat, 05/23 18:51, Max Reitz wrote:
> On 22.05.2015 06:54, Fam Zheng wrote:
> >On Thu, 05/21 15:32, Fam Zheng wrote:
> >>On Thu, 05/21 15:06, Wen Congyang wrote:
> >>>On 05/21/2015 02:42 PM, Fam Zheng wrote:
> >>>>It blocks device IO.
> >>>>
> >>>>All bdrv_op_block_all/blk_op_block_all callers are taken care of:
> >>>>
> >>>>- virtio_blk_data_plane_create
> >>>>- virtio_scsi_hotplug
> >>>>
> >>>>   Device creation, unblock it.
> >>>>
> >>>>- bdrv_set_backing_hd
> >>>>
> >>>>   Backing hd is not used by device, so blocking is OK.
> >>>>
> >>>>- backup_start
> >>>>
> >>>>   Blocking target when backup is running, unblock it.
> >>>Do you forget it?
> >>Oh I think the commit log is wrong: the target image is only written to by
> >>block job, there cannot be a device on it, so it it's similar to
> >>bdrv_set_backing_hd.
> >Correction: if it's blockdev-backup, the target could have a device, in that
> >sense it should be unblocked like block_job_create(). I'll fix it.
> 
> Really? I think it makes sense not to allow I/O on a backup target. At least
> I can't imagine a use case where you'd want to do that... But that doesn't
> necessarily mean anything, of course.

Sure that nobody other than backup job itself should write to backup target,
but it's valid to read it - image fleecing aims to export it through NBD
server. If you attach it back to guest, it is as valid a scenario, isn't it?

So at least for image fleecing, we need to either 1) split device IO blocker to
read and write and only block write on target. 2) don't add device IO blocker
at all, expect the disk's end user to take the responsibility. And I'm afraid
1) will be too complicated.

Fam
Kevin Wolf May 26, 2015, 2:22 p.m. UTC | #9
Am 21.05.2015 um 08:42 hat Fam Zheng geschrieben:
> It blocks device IO.

What I'm missing here is a description of the case that it protects
against. Blocking device I/O doesn't sound like a valid thing to want
per se, but it might be true that we need it as long as we don't have
the "real" op blockers that Jeff is working on. However, as long as you
don't say why you want that, I can't say whether it's true or not.

And of course, it doesn't block anything yet after this patch, it's just
set or cleared without any effect. In fact, it seems that even at the
end of the series, there is no bdrv_op_is_blocked() call that checks for
BLOCK_OP_TYPE_DEVICE_IO. Is this intentional?
> 
> All bdrv_op_block_all/blk_op_block_all callers are taken care of:
> 
> - virtio_blk_data_plane_create
> - virtio_scsi_hotplug
> 
>   Device creation, unblock it.
> 
> - bdrv_set_backing_hd
> 
>   Backing hd is not used by device, so blocking is OK.

If the backing file becomes the active layer after committing,
bdrv_swap() will keep the blockers of the overlay, so that the image
doesn't have the device I/O blocker any more. Correct?

> - backup_start
> 
>   Blocking target when backup is running, unblock it.
> 
> - mirror_complete
> 
>   Blocking s->to_replace until mirror_exit, OK.
> 
> - block_job_complete
> 
>   The block job may be long running. Unblock it.
> 
> - init_blk_migration
> 
>   The block migration may be long running, Unblock it.
> 
> Signed-off-by: Fam Zheng <famz@redhat.com>

Kevin
Max Reitz May 26, 2015, 2:24 p.m. UTC | #10
On 26.05.2015 16:22, Kevin Wolf wrote:
> Am 21.05.2015 um 08:42 hat Fam Zheng geschrieben:
>> It blocks device IO.
> What I'm missing here is a description of the case that it protects
> against. Blocking device I/O doesn't sound like a valid thing to want
> per se, but it might be true that we need it as long as we don't have
> the "real" op blockers that Jeff is working on. However, as long as you
> don't say why you want that, I can't say whether it's true or not.
>
> And of course, it doesn't block anything yet after this patch, it's just
> set or cleared without any effect. In fact, it seems that even at the
> end of the series, there is no bdrv_op_is_blocked() call that checks for
> BLOCK_OP_TYPE_DEVICE_IO. Is this intentional?

Probably yes, because patches 2 and 3 introduce a notifier for when op 
blockers are set up and removed. This is used by virtio-blk and 
virtio-scsi to pause and unpause device I/O, respectively.

Max

>> All bdrv_op_block_all/blk_op_block_all callers are taken care of:
>>
>> - virtio_blk_data_plane_create
>> - virtio_scsi_hotplug
>>
>>    Device creation, unblock it.
>>
>> - bdrv_set_backing_hd
>>
>>    Backing hd is not used by device, so blocking is OK.
> If the backing file becomes the active layer after committing,
> bdrv_swap() will keep the blockers of the overlay, so that the image
> doesn't have the device I/O blocker any more. Correct?
>
>> - backup_start
>>
>>    Blocking target when backup is running, unblock it.
>>
>> - mirror_complete
>>
>>    Blocking s->to_replace until mirror_exit, OK.
>>
>> - block_job_complete
>>
>>    The block job may be long running. Unblock it.
>>
>> - init_blk_migration
>>
>>    The block migration may be long running, Unblock it.
>>
>> Signed-off-by: Fam Zheng <famz@redhat.com>
> Kevin
Kevin Wolf May 27, 2015, 9:07 a.m. UTC | #11
Am 26.05.2015 um 16:24 hat Max Reitz geschrieben:
> On 26.05.2015 16:22, Kevin Wolf wrote:
> >Am 21.05.2015 um 08:42 hat Fam Zheng geschrieben:
> >>It blocks device IO.
> >What I'm missing here is a description of the case that it protects
> >against. Blocking device I/O doesn't sound like a valid thing to want
> >per se, but it might be true that we need it as long as we don't have
> >the "real" op blockers that Jeff is working on. However, as long as you
> >don't say why you want that, I can't say whether it's true or not.
> >
> >And of course, it doesn't block anything yet after this patch, it's just
> >set or cleared without any effect. In fact, it seems that even at the
> >end of the series, there is no bdrv_op_is_blocked() call that checks for
> >BLOCK_OP_TYPE_DEVICE_IO. Is this intentional?
> 
> Probably yes, because patches 2 and 3 introduce a notifier for when
> op blockers are set up and removed. This is used by virtio-blk and
> virtio-scsi to pause and unpause device I/O, respectively.

Indeed, I missed that.

Having thought a bit more about this, I think we need to carefully check
whether op blockers are really the right tool here; and if they are, we
need to document the reason in the commit message at least.

The op blocker model as used until now has a few characteristics:

* Old model: You have a period of time between blocking and unblocking
  during which starting the corresponding operation is prohibited. The
  operation checks whether the blocker is set when it starts, and it
  fails if the operation is blocked.

  New model that Jeff is working on: You still have block/unblock (for a
  category rather than a specific operation, though), but you also have
  a start/stop pair for using the functionality. If you start an
  operation that is blocked, it still errors out. However, blocking an
  operation that is already started fails as well now.

  This series: An already running operation is interrupted when the
  blocker is set, and it continues when it is removed. You had to
  introduce new infrastructure (notifiers) because this doesn't match
  the op blockers model.

* Op blockers used to be set to protect high-level, long-running
  background operations, that are usually initiated by the user.
  bdrv_drain() is low-level and a blocking foreground operation.

* Op blockers came into effect only after monitor interactions and used
  to be on a slow path. We're now in the I/O path. It's not a problem
  with the current code per se (as you introduced notifiers, so we don't
  have to iterate the list all the time), but it broadens the scope of
  op blockers significantly and we shouldn't be doing that carelessly.

* Op blockers have an error message. It's unused for the new case.

This is the first part of what's troubling me with this series, as it
makes me doubtful if op blockers are the right tool to implement what
the commit message says (block device I/O). This is "are we doing the
thing right?"

The second part should actually come first, though: "Are we doing the
right thing?" I'm also doubtful whether blocking device I/O is even what
we should do.

Why is device I/O special compared to block job I/O or NBD server I/O?
If the answer is "because block jobs are already paused while draining"
(and probably nobody thought much about the NBD server), then chances
are that it's not the right thing.  In fact, using two different
mechanisms for pausing block jobs and pausing device I/O seems
inconsistent and wrong.

I suspect that the real solution needs to be in the block layer, though
I'm not quite sure yet what it would be like. Perhaps a function pair
like blk_stop/resume_io() that is used by bdrv_drain() callers and works
on the BlockBackend level. (The BDS level is problematic because we
don't want to stop all I/O; many callers just want to drain I/O of
_other_ callers so they can do their own I/O without having to care
about concurrency).

Kevin
Paolo Bonzini May 27, 2015, 9:50 a.m. UTC | #12
On 27/05/2015 11:07, Kevin Wolf wrote:
> This is the first part of what's troubling me with this series, as it
> makes me doubtful if op blockers are the right tool to implement what
> the commit message says (block device I/O). This is "are we doing the
> thing right?"
> 
> The second part should actually come first, though: "Are we doing the
> right thing?" I'm also doubtful whether blocking device I/O is even what
> we should do.
> 
> Why is device I/O special compared to block job I/O or NBD server I/O?

Because block job I/O doesn't modify the source disk.  For the target
disk, block jobs should definitely treat themselves as device I/O and
register notifiers that pause themselves on bdrv_drain.

> If the answer is "because block jobs are already paused while draining"
> (and probably nobody thought much about the NBD server), then chances
> are that it's not the right thing.  In fact, using two different
> mechanisms for pausing block jobs and pausing device I/O seems
> inconsistent and wrong.
> 
> I suspect that the real solution needs to be in the block layer, though
> I'm not quite sure yet what it would be like. Perhaps a function pair
> like blk_stop/resume_io() that is used by bdrv_drain() callers and works
> on the BlockBackend level.

This is suspiciously similar to the first idea that I and Stefan had,
which was a blk_pause/blk_resume API, implemented through a notifier list.

Paolo
Kevin Wolf May 27, 2015, 10:10 a.m. UTC | #13
Am 27.05.2015 um 11:50 hat Paolo Bonzini geschrieben:
> 
> 
> On 27/05/2015 11:07, Kevin Wolf wrote:
> > This is the first part of what's troubling me with this series, as it
> > makes me doubtful if op blockers are the right tool to implement what
> > the commit message says (block device I/O). This is "are we doing the
> > thing right?"
> > 
> > The second part should actually come first, though: "Are we doing the
> > right thing?" I'm also doubtful whether blocking device I/O is even what
> > we should do.
> > 
> > Why is device I/O special compared to block job I/O or NBD server I/O?
> 
> Because block job I/O doesn't modify the source disk.  For the target
> disk, block jobs should definitely treat themselves as device I/O and
> register notifiers that pause themselves on bdrv_drain.

Block jobs don't generally have a source and a target; in fact, the
design didn't even consider that such jobs exist (otherwise, we wouldn't
have bs->job).

There are some jobs that use a BDS read-only (source for backup, mirror,
commit) just like there are guest devices that use the BDS read-only
(CD-ROMs). And others write to it (streaming, hard disks).

This doesn't seem to be the crucial difference between both.

> > If the answer is "because block jobs are already paused while draining"
> > (and probably nobody thought much about the NBD server), then chances
> > are that it's not the right thing.  In fact, using two different
> > mechanisms for pausing block jobs and pausing device I/O seems
> > inconsistent and wrong.
> > 
> > I suspect that the real solution needs to be in the block layer, though
> > I'm not quite sure yet what it would be like. Perhaps a function pair
> > like blk_stop/resume_io() that is used by bdrv_drain() callers and works
> > on the BlockBackend level.
> 
> This is suspiciously similar to the first idea that I and Stefan had,
> which was a blk_pause/blk_resume API, implemented through a notifier list.

Any problems with it because of which Fam decided against it?

Kevin
Paolo Bonzini May 27, 2015, 10:43 a.m. UTC | #14
On 27/05/2015 12:10, Kevin Wolf wrote:
> Am 27.05.2015 um 11:50 hat Paolo Bonzini geschrieben:
>>
>>
>> On 27/05/2015 11:07, Kevin Wolf wrote:
>>> This is the first part of what's troubling me with this series, as it
>>> makes me doubtful if op blockers are the right tool to implement what
>>> the commit message says (block device I/O). This is "are we doing the
>>> thing right?"
>>>
>>> The second part should actually come first, though: "Are we doing the
>>> right thing?" I'm also doubtful whether blocking device I/O is even what
>>> we should do.
>>>
>>> Why is device I/O special compared to block job I/O or NBD server I/O?
>>
>> Because block job I/O doesn't modify the source disk.  For the target
>> disk, block jobs should definitely treat themselves as device I/O and
>> register notifiers that pause themselves on bdrv_drain.
> 
> Block jobs don't generally have a source and a target; in fact, the
> design didn't even consider that such jobs exist (otherwise, we wouldn't
> have bs->job).

Mirror and backup have targets.

> There are some jobs that use a BDS read-only (source for backup, mirror,
> commit) just like there are guest devices that use the BDS read-only
> (CD-ROMs). And others write to it (streaming, hard disks).

Streaming doesn't write to the BDS.  It reads it while copy-on-read is
active.  It's subtle, but it means that streaming can proceed without
changing the contents of the disk.  As far as safety of atomic
transactions is concerned, streaming need not be paused.

However, you're right that there is no particular reason why a more
generic pause/resume wouldn't pause jobs as well.  It wouldn't be a
problem either way as far as the job is concerned, and a better-defined
API is a better API.

>> This is suspiciously similar to the first idea that I and Stefan had,
>> which was a blk_pause/blk_resume API, implemented through a notifier list.
> 
> Any problems with it because of which Fam decided against it?

I don't know, but I don't think so.

Paolo
Fam Zheng May 28, 2015, 2:49 a.m. UTC | #15
On Wed, 05/27 12:43, Paolo Bonzini wrote:
> 
> 
> On 27/05/2015 12:10, Kevin Wolf wrote:
> > Am 27.05.2015 um 11:50 hat Paolo Bonzini geschrieben:
> >>
> >>
> >> On 27/05/2015 11:07, Kevin Wolf wrote:
> >>> This is the first part of what's troubling me with this series, as it
> >>> makes me doubtful if op blockers are the right tool to implement what
> >>> the commit message says (block device I/O). This is "are we doing the
> >>> thing right?"
> >>>
> >>> The second part should actually come first, though: "Are we doing the
> >>> right thing?" I'm also doubtful whether blocking device I/O is even what
> >>> we should do.
> >>>
> >>> Why is device I/O special compared to block job I/O or NBD server I/O?
> >>
> >> Because block job I/O doesn't modify the source disk.  For the target
> >> disk, block jobs should definitely treat themselves as device I/O and
> >> register notifiers that pause themselves on bdrv_drain.
> > 
> > Block jobs don't generally have a source and a target; in fact, the
> > design didn't even consider that such jobs exist (otherwise, we wouldn't
> > have bs->job).
> 
> Mirror and backup have targets.
> 
> > There are some jobs that use a BDS read-only (source for backup, mirror,
> > commit) just like there are guest devices that use the BDS read-only
> > (CD-ROMs). And others write to it (streaming, hard disks).
> 
> Streaming doesn't write to the BDS.  It reads it while copy-on-read is
> active.  It's subtle, but it means that streaming can proceed without
> changing the contents of the disk.  As far as safety of atomic
> transactions is concerned, streaming need not be paused.
> 
> However, you're right that there is no particular reason why a more
> generic pause/resume wouldn't pause jobs as well.  It wouldn't be a
> problem either way as far as the job is concerned, and a better-defined
> API is a better API.
> 
> >> This is suspiciously similar to the first idea that I and Stefan had,
> >> which was a blk_pause/blk_resume API, implemented through a notifier list.
> > 
> > Any problems with it because of which Fam decided against it?

I am not against it.

Initially I started with writing blk_pause/resume, and soon I though it would
be useful if blk_aio_read and friends can check the pause state before issuing
IO:

https://lists.gnu.org/archive/html/qemu-devel/2015-05/msg02608.html

So I reused op blockers. But the idea was later abandoned, so it's now very
similar to the blk_pause/resume idea.

BTW, I remember we once proposed the new op blocker model should have a
category for I/O, no?

On a second thought, although in this series, we only need to modify
virtio-{scsi,blk}, Wen pointed out that mirror job also wants this during
completion (because bdrv_swap is in a BH after the job coroutine returns).  So,
in order for the solution to be general, and complete, this nofitier list
approach relies on the listening devices to do the right thing, which requires
modifying all of them, and is harder to maintain.

Fam
Paolo Bonzini May 28, 2015, 8:23 a.m. UTC | #16
On 28/05/2015 04:49, Fam Zheng wrote:
> 
> On a second thought, although in this series, we only need to modify
> virtio-{scsi,blk}, Wen pointed out that mirror job also wants this during
> completion (because bdrv_swap is in a BH after the job coroutine returns).

Mirror needs to pause/resume the source.  It doesn't need to handle
pause/resume of the target, does it?

> So, in order for the solution to be general, and complete, this nofitier list
> approach relies on the listening devices to do the right thing, which requires
> modifying all of them, and is harder to maintain.

Isn't it only devices that use aio_set_event_notifier for their ioeventfd?

Paolo
Kevin Wolf May 28, 2015, 9:40 a.m. UTC | #17
Am 28.05.2015 um 04:49 hat Fam Zheng geschrieben:
> On Wed, 05/27 12:43, Paolo Bonzini wrote:
> > 
> > 
> > On 27/05/2015 12:10, Kevin Wolf wrote:
> > > Am 27.05.2015 um 11:50 hat Paolo Bonzini geschrieben:
> > >>
> > >>
> > >> On 27/05/2015 11:07, Kevin Wolf wrote:
> > >>> This is the first part of what's troubling me with this series, as it
> > >>> makes me doubtful if op blockers are the right tool to implement what
> > >>> the commit message says (block device I/O). This is "are we doing the
> > >>> thing right?"
> > >>>
> > >>> The second part should actually come first, though: "Are we doing the
> > >>> right thing?" I'm also doubtful whether blocking device I/O is even what
> > >>> we should do.
> > >>>
> > >>> Why is device I/O special compared to block job I/O or NBD server I/O?
> > >>
> > >> Because block job I/O doesn't modify the source disk.  For the target
> > >> disk, block jobs should definitely treat themselves as device I/O and
> > >> register notifiers that pause themselves on bdrv_drain.
> > > 
> > > Block jobs don't generally have a source and a target; in fact, the
> > > design didn't even consider that such jobs exist (otherwise, we wouldn't
> > > have bs->job).
> > 
> > Mirror and backup have targets.
> > 
> > > There are some jobs that use a BDS read-only (source for backup, mirror,
> > > commit) just like there are guest devices that use the BDS read-only
> > > (CD-ROMs). And others write to it (streaming, hard disks).
> > 
> > Streaming doesn't write to the BDS.  It reads it while copy-on-read is
> > active.  It's subtle, but it means that streaming can proceed without
> > changing the contents of the disk.  As far as safety of atomic
> > transactions is concerned, streaming need not be paused.
> > 
> > However, you're right that there is no particular reason why a more
> > generic pause/resume wouldn't pause jobs as well.  It wouldn't be a
> > problem either way as far as the job is concerned, and a better-defined
> > API is a better API.
> > 
> > >> This is suspiciously similar to the first idea that I and Stefan had,
> > >> which was a blk_pause/blk_resume API, implemented through a notifier list.
> > > 
> > > Any problems with it because of which Fam decided against it?
> 
> I am not against it.
> 
> Initially I started with writing blk_pause/resume, and soon I though it would
> be useful if blk_aio_read and friends can check the pause state before issuing
> IO:
> 
> https://lists.gnu.org/archive/html/qemu-devel/2015-05/msg02608.html
> 
> So I reused op blockers. But the idea was later abandoned, so it's now very
> similar to the blk_pause/resume idea.
> 
> BTW, I remember we once proposed the new op blocker model should have a
> category for I/O, no?

Yes, but I was always thinking of this on a much higher level: As soon
as a device is attached to a backend, it announces that it uses the
category, and it only drops it again when it's detached.

The use case for this is starting background jobs that don't work e.g.
while the image is written to. In such cases you want to block if the
attached device _can_ submit write request, not just if a write request
is already in flight.

Mutual exclusion on a per-request basis is something different.

> On a second thought, although in this series, we only need to modify
> virtio-{scsi,blk}, Wen pointed out that mirror job also wants this during
> completion (because bdrv_swap is in a BH after the job coroutine returns).  So,
> in order for the solution to be general, and complete, this nofitier list
> approach relies on the listening devices to do the right thing, which requires
> modifying all of them, and is harder to maintain.

Indeed. blk_pause/resume would handle everything in one central place
in the block layer instead of spreading the logic across all the block
layer users.

Kevin
Fam Zheng May 28, 2015, 10:46 a.m. UTC | #18
On Thu, 05/28 10:23, Paolo Bonzini wrote:
> 
> 
> On 28/05/2015 04:49, Fam Zheng wrote:
> > 
> > On a second thought, although in this series, we only need to modify
> > virtio-{scsi,blk}, Wen pointed out that mirror job also wants this during
> > completion (because bdrv_swap is in a BH after the job coroutine returns).
> 
> Mirror needs to pause/resume the source.  It doesn't need to handle
> pause/resume of the target, does it?
> 
> > So, in order for the solution to be general, and complete, this nofitier list
> > approach relies on the listening devices to do the right thing, which requires
> > modifying all of them, and is harder to maintain.
> 
> Isn't it only devices that use aio_set_event_notifier for their ioeventfd?

I think it's only the case for qmp_transaction, mirror job needs all block
devices to implement pause: mirror_run guarantees source and target in sync
when it returns; but bdrv_swap is deferred to a main loop bottom half -  what
if there is a guest write to source in between?

Fam
Paolo Bonzini May 28, 2015, 10:52 a.m. UTC | #19
On 28/05/2015 12:46, Fam Zheng wrote:
>> > 
>> > Mirror needs to pause/resume the source.  It doesn't need to handle
>> > pause/resume of the target, does it?
>> > 
>>> > > So, in order for the solution to be general, and complete, this nofitier list
>>> > > approach relies on the listening devices to do the right thing, which requires
>>> > > modifying all of them, and is harder to maintain.
>> > 
>> > Isn't it only devices that use aio_set_event_notifier for their ioeventfd?
> I think it's only the case for qmp_transaction, mirror job needs all block
> devices to implement pause: mirror_run guarantees source and target in sync
> when it returns; but bdrv_swap is deferred to a main loop bottom half -  what
> if there is a guest write to source in between?

Whoever uses ioeventfd needs to implement pause/resume, yes---not just
dataplane, also "regular" virtio-blk/virtio-scsi.

However, everyone else should be okay, because the bottom half runs
immediately and the big QEMU lock is not released in the meanwhile.  So
the CPUs have no occasion to run.  This needs a comment!

Paolo
Fam Zheng May 28, 2015, 10:55 a.m. UTC | #20
On Thu, 05/28 11:40, Kevin Wolf wrote:
> Indeed. blk_pause/resume would handle everything in one central place
> in the block layer instead of spreading the logic across all the block
> layer users.

Sorry, I'm confused. Do you mean there is a way to implement blk_pause
completely in block layer, without the necessity of various notifier handlers
in device models?

Fam
Paolo Bonzini May 28, 2015, 11 a.m. UTC | #21
On 28/05/2015 12:55, Fam Zheng wrote:
> > Indeed. blk_pause/resume would handle everything in one central place
> > in the block layer instead of spreading the logic across all the block
> > layer users.
>
> Sorry, I'm confused. Do you mean there is a way to implement blk_pause
> completely in block layer, without the necessity of various notifier handlers
> in device models?

How would you do that?  Do you have to keep a queue of pending requests
in the BlockBackend?  Since bdrv_drain_all may never return (e.g. stuck
NFS connection with nfs=hard), the guest can force unbounded allocation
in the host, which is bad.

In addition, the BDS doesn't have a list of BlockBackends attached to
it.  So you need the BlockBackends to register themselves for
pause/resume in some way---for example with a notifier list.

Then it's irrelevant whether it's the device model or the BB that
attaches itself to the notifier list.  You can start with doing it in
the device models (those that use ioeventfd), and later it can be moved
to the BB.  The low-level implementation remains the same.

Paolo
Fam Zheng May 28, 2015, 11:11 a.m. UTC | #22
On Thu, 05/28 12:52, Paolo Bonzini wrote:
> 
> 
> On 28/05/2015 12:46, Fam Zheng wrote:
> >> > 
> >> > Mirror needs to pause/resume the source.  It doesn't need to handle
> >> > pause/resume of the target, does it?
> >> > 
> >>> > > So, in order for the solution to be general, and complete, this nofitier list
> >>> > > approach relies on the listening devices to do the right thing, which requires
> >>> > > modifying all of them, and is harder to maintain.
> >> > 
> >> > Isn't it only devices that use aio_set_event_notifier for their ioeventfd?
> > I think it's only the case for qmp_transaction, mirror job needs all block
> > devices to implement pause: mirror_run guarantees source and target in sync
> > when it returns; but bdrv_swap is deferred to a main loop bottom half -  what
> > if there is a guest write to source in between?
> 
> Whoever uses ioeventfd needs to implement pause/resume, yes---not just
> dataplane, also "regular" virtio-blk/virtio-scsi.
> 
> However, everyone else should be okay, because the bottom half runs
> immediately and the big QEMU lock is not released in the meanwhile.  So
> the CPUs have no occasion to run.  This needs a comment!
> 

I'm not sure. It seems timer callbacks also do I/O, for example
nvme_process_sq().

Fam
Paolo Bonzini May 28, 2015, 11:19 a.m. UTC | #23
On 28/05/2015 13:11, Fam Zheng wrote:
> > Whoever uses ioeventfd needs to implement pause/resume, yes---not just
> > dataplane, also "regular" virtio-blk/virtio-scsi.
> > 
> > However, everyone else should be okay, because the bottom half runs
> > immediately and the big QEMU lock is not released in the meanwhile.  So
> > the CPUs have no occasion to run.  This needs a comment!
> 
> I'm not sure. It seems timer callbacks also do I/O, for example
> nvme_process_sq().

Right, that's also true for USB devices. :(

Perhaps we can skip block_job_defer_to_main_loop if not necessary
(bs->aio_context == qemu_get_aio_context()).

Paolo
Kevin Wolf May 28, 2015, 11:24 a.m. UTC | #24
Am 28.05.2015 um 13:00 hat Paolo Bonzini geschrieben:
> On 28/05/2015 12:55, Fam Zheng wrote:
> > > Indeed. blk_pause/resume would handle everything in one central place
> > > in the block layer instead of spreading the logic across all the block
> > > layer users.
> >
> > Sorry, I'm confused. Do you mean there is a way to implement blk_pause
> > completely in block layer, without the necessity of various notifier handlers
> > in device models?
> 
> How would you do that?  Do you have to keep a queue of pending requests
> in the BlockBackend?  Since bdrv_drain_all may never return (e.g. stuck
> NFS connection with nfs=hard), the guest can force unbounded allocation
> in the host, which is bad.

We already queue requests for things like I/O throttling or
serialisation. Why would this be any worse?

> In addition, the BDS doesn't have a list of BlockBackends attached to
> it.  So you need the BlockBackends to register themselves for
> pause/resume in some way---for example with a notifier list.
> 
> Then it's irrelevant whether it's the device model or the BB that
> attaches itself to the notifier list.  You can start with doing it in
> the device models (those that use ioeventfd), and later it can be moved
> to the BB.  The low-level implementation remains the same.

The reason for doing it in the block layer is that it's in one place and
we can be sure that it's applied. We can still in addition modify
specific users to avoid even trying to send requests, but I think it's
good to have the single place that always ensures correct functionality
of the drain instead of making it dependent on the user.

Kevin
Paolo Bonzini May 28, 2015, 11:41 a.m. UTC | #25
On 28/05/2015 13:24, Kevin Wolf wrote:
> Am 28.05.2015 um 13:00 hat Paolo Bonzini geschrieben:
>> On 28/05/2015 12:55, Fam Zheng wrote:
>>>> Indeed. blk_pause/resume would handle everything in one central place
>>>> in the block layer instead of spreading the logic across all the block
>>>> layer users.
>>>
>>> Sorry, I'm confused. Do you mean there is a way to implement blk_pause
>>> completely in block layer, without the necessity of various notifier handlers
>>> in device models?
>>
>> How would you do that?  Do you have to keep a queue of pending requests
>> in the BlockBackend?  Since bdrv_drain_all may never return (e.g. stuck
>> NFS connection with nfs=hard), the guest can force unbounded allocation
>> in the host, which is bad.
> 
> We already queue requests for things like I/O throttling or
> serialisation. Why would this be any worse?

The fact that it's potentially unbounded makes me nervous.  But you're
right that we sort of expect the guest to not have too high a queue
depth.  And serialization is also potentially unbounded.

So it may indeed be the best way.  Just to double check, is it correct
that the API would still be BDS-based, i.e.
bdrv_pause_backends/bdrv_resume_backends, and the BlockBackends would
attach themselves to pause/resume notifier lists?

Paolo

>> In addition, the BDS doesn't have a list of BlockBackends attached to
>> it.  So you need the BlockBackends to register themselves for
>> pause/resume in some way---for example with a notifier list.
>>
>> Then it's irrelevant whether it's the device model or the BB that
>> attaches itself to the notifier list.  You can start with doing it in
>> the device models (those that use ioeventfd), and later it can be moved
>> to the BB.  The low-level implementation remains the same.
> 
> The reason for doing it in the block layer is that it's in one place and
> we can be sure that it's applied. We can still in addition modify
> specific users to avoid even trying to send requests, but I think it's
> good to have the single place that always ensures correct functionality
> of the drain instead of making it dependent on the user.
> 
> Kevin
>
Fam Zheng May 28, 2015, 11:44 a.m. UTC | #26
On Thu, 05/28 13:24, Kevin Wolf wrote:
> Am 28.05.2015 um 13:00 hat Paolo Bonzini geschrieben:
> > On 28/05/2015 12:55, Fam Zheng wrote:
> > > > Indeed. blk_pause/resume would handle everything in one central place
> > > > in the block layer instead of spreading the logic across all the block
> > > > layer users.
> > >
> > > Sorry, I'm confused. Do you mean there is a way to implement blk_pause
> > > completely in block layer, without the necessity of various notifier handlers
> > > in device models?
> > 
> > How would you do that?  Do you have to keep a queue of pending requests
> > in the BlockBackend?  Since bdrv_drain_all may never return (e.g. stuck
> > NFS connection with nfs=hard), the guest can force unbounded allocation
> > in the host, which is bad.
> 
> We already queue requests for things like I/O throttling or
> serialisation. Why would this be any worse?
> 
> > In addition, the BDS doesn't have a list of BlockBackends attached to
> > it.  So you need the BlockBackends to register themselves for
> > pause/resume in some way---for example with a notifier list.
> > 
> > Then it's irrelevant whether it's the device model or the BB that
> > attaches itself to the notifier list.  You can start with doing it in
> > the device models (those that use ioeventfd), and later it can be moved
> > to the BB.  The low-level implementation remains the same.
> 
> The reason for doing it in the block layer is that it's in one place and
> we can be sure that it's applied. We can still in addition modify
> specific users to avoid even trying to send requests, but I think it's
> good to have the single place that always ensures correct functionality
> of the drain instead of making it dependent on the user.
> 

How to do that for the synchronous blk_write callers that don't run in
a coroutine?

Fam
Paolo Bonzini May 28, 2015, 11:47 a.m. UTC | #27
On 28/05/2015 13:44, Fam Zheng wrote:
> > The reason for doing it in the block layer is that it's in one place and
> > we can be sure that it's applied. We can still in addition modify
> > specific users to avoid even trying to send requests, but I think it's
> > good to have the single place that always ensures correct functionality
> > of the drain instead of making it dependent on the user.
> 
> How to do that for the synchronous blk_write callers that don't run in
> a coroutine?

They would be completely oblivious to it.

Their call to blk_co_write would queue the request.  Then blk_write
calls aio_poll, which ultimately would result in blk_resume and unqueue
the request.

Paolo
Fam Zheng May 28, 2015, 12:04 p.m. UTC | #28
On Thu, 05/28 13:47, Paolo Bonzini wrote:
> 
> 
> On 28/05/2015 13:44, Fam Zheng wrote:
> > > The reason for doing it in the block layer is that it's in one place and
> > > we can be sure that it's applied. We can still in addition modify
> > > specific users to avoid even trying to send requests, but I think it's
> > > good to have the single place that always ensures correct functionality
> > > of the drain instead of making it dependent on the user.
> > 
> > How to do that for the synchronous blk_write callers that don't run in
> > a coroutine?
> 
> They would be completely oblivious to it.
> 
> Their call to blk_co_write would queue the request.  Then blk_write
> calls aio_poll, which ultimately would result in blk_resume and unqueue
> the request.

OK.. I thought that, then we have to make a rule that a blk_pause must be
resolved by an aio_poll() loop.  This is somehow tricky, for example I assume
blk_write() would only call aio_poll() of its own AioContext? But the pairing
blk_resume() will be in the main context.  To make it worse, what if there is
no BDS in the main context, should it be special cased in the block layer?

Fam
Fam Zheng May 28, 2015, 12:05 p.m. UTC | #29
On Thu, 05/28 13:19, Paolo Bonzini wrote:
> 
> 
> On 28/05/2015 13:11, Fam Zheng wrote:
> > > Whoever uses ioeventfd needs to implement pause/resume, yes---not just
> > > dataplane, also "regular" virtio-blk/virtio-scsi.
> > > 
> > > However, everyone else should be okay, because the bottom half runs
> > > immediately and the big QEMU lock is not released in the meanwhile.  So
> > > the CPUs have no occasion to run.  This needs a comment!
> > 
> > I'm not sure. It seems timer callbacks also do I/O, for example
> > nvme_process_sq().
> 
> Right, that's also true for USB devices. :(
> 
> Perhaps we can skip block_job_defer_to_main_loop if not necessary
> (bs->aio_context == qemu_get_aio_context()).

I think so. It will make dataplane even more specialized but that seems the
only way to fix the problem at the moment.

Fam
Andrey Korolyov May 29, 2015, 11:11 a.m. UTC | #30
On Thu, May 28, 2015 at 3:05 PM, Fam Zheng <famz@redhat.com> wrote:
> On Thu, 05/28 13:19, Paolo Bonzini wrote:
>>
>>
>> On 28/05/2015 13:11, Fam Zheng wrote:
>> > > Whoever uses ioeventfd needs to implement pause/resume, yes---not just
>> > > dataplane, also "regular" virtio-blk/virtio-scsi.
>> > >
>> > > However, everyone else should be okay, because the bottom half runs
>> > > immediately and the big QEMU lock is not released in the meanwhile.  So
>> > > the CPUs have no occasion to run.  This needs a comment!
>> >
>> > I'm not sure. It seems timer callbacks also do I/O, for example
>> > nvme_process_sq().
>>
>> Right, that's also true for USB devices. :(
>>
>> Perhaps we can skip block_job_defer_to_main_loop if not necessary
>> (bs->aio_context == qemu_get_aio_context()).
>
> I think so. It will make dataplane even more specialized but that seems the
> only way to fix the problem at the moment.
>
> Fam
>

Sorry for a potential thread hijack, but I`m curious about the reasons
to not making advertised queue depth for non-passthrough backends an
independent tunable, is there any concerns behind that?
Paolo Bonzini May 30, 2015, 1:21 p.m. UTC | #31
On 29/05/2015 13:11, Andrey Korolyov wrote:
> Sorry for a potential thread hijack, but I`m curious about the reasons
> to not making advertised queue depth for non-passthrough backends an
> independent tunable, is there any concerns behind that?

It certainly can be made tunable.  Usually it is bounded actually, for
example PATA has a queue depth of 1 (!) while AHCI has 32.  For virtio
(blk and scsi) it actually is already somewhat tunable, since the queue
depth is limited by the size of the virtqueue.

Paolo
diff mbox

Patch

diff --git a/blockjob.c b/blockjob.c
index 2755465..e39bdde 100644
--- a/blockjob.c
+++ b/blockjob.c
@@ -51,6 +51,7 @@  void *block_job_create(const BlockJobDriver *driver, BlockDriverState *bs,
                BlockJobType_lookup[driver->job_type]);
     bdrv_op_block_all(bs, job->blocker);
     bdrv_op_unblock(bs, BLOCK_OP_TYPE_DATAPLANE, job->blocker);
+    bdrv_op_unblock(bs, BLOCK_OP_TYPE_DEVICE_IO, job->blocker);
 
     job->driver        = driver;
     job->bs            = bs;
diff --git a/hw/block/dataplane/virtio-blk.c b/hw/block/dataplane/virtio-blk.c
index 3db139b..3ecc8bd 100644
--- a/hw/block/dataplane/virtio-blk.c
+++ b/hw/block/dataplane/virtio-blk.c
@@ -209,6 +209,7 @@  void virtio_blk_data_plane_create(VirtIODevice *vdev, VirtIOBlkConf *conf,
     blk_op_unblock(conf->conf.blk, BLOCK_OP_TYPE_MIRROR, s->blocker);
     blk_op_unblock(conf->conf.blk, BLOCK_OP_TYPE_STREAM, s->blocker);
     blk_op_unblock(conf->conf.blk, BLOCK_OP_TYPE_REPLACE, s->blocker);
+    blk_op_unblock(conf->conf.blk, BLOCK_OP_TYPE_DEVICE_IO, s->blocker);
 
     *dataplane = s;
 }
diff --git a/hw/scsi/virtio-scsi.c b/hw/scsi/virtio-scsi.c
index e242fef..5e15fa6 100644
--- a/hw/scsi/virtio-scsi.c
+++ b/hw/scsi/virtio-scsi.c
@@ -772,6 +772,7 @@  static void virtio_scsi_hotplug(HotplugHandler *hotplug_dev, DeviceState *dev,
             return;
         }
         blk_op_block_all(sd->conf.blk, s->blocker);
+        blk_op_unblock(sd->conf.blk, BLOCK_OP_TYPE_DEVICE_IO, s->blocker);
         aio_context_acquire(s->ctx);
         blk_set_aio_context(sd->conf.blk, s->ctx);
         aio_context_release(s->ctx);
diff --git a/include/block/block.h b/include/block/block.h
index 7d1a717..906fb31 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -159,6 +159,7 @@  typedef enum BlockOpType {
     BLOCK_OP_TYPE_RESIZE,
     BLOCK_OP_TYPE_STREAM,
     BLOCK_OP_TYPE_REPLACE,
+    BLOCK_OP_TYPE_DEVICE_IO,
     BLOCK_OP_TYPE_MAX,
 } BlockOpType;
 
diff --git a/migration/block.c b/migration/block.c
index ddb59cc..b833bac 100644
--- a/migration/block.c
+++ b/migration/block.c
@@ -379,6 +379,7 @@  static void init_blk_migration(QEMUFile *f)
         alloc_aio_bitmap(bmds);
         error_setg(&bmds->blocker, "block device is in use by migration");
         bdrv_op_block_all(bs, bmds->blocker);
+        bdrv_op_unblock(bs, BLOCK_OP_TYPE_DEVICE_IO, bmds->blocker);
         bdrv_ref(bs);
 
         block_mig_state.total_sector_sum += sectors;