Patchwork [4/5] virtio-blk: release reference to RAM's memoryRegion

login
register
mail settings
Submitter pingfan liu
Date April 1, 2013, 8:20 a.m.
Message ID <1364804434-7980-5-git-send-email-qemulist@gmail.com>
Download mbox | patch
Permalink /patch/232679/
State New
Headers show

Comments

pingfan liu - April 1, 2013, 8:20 a.m.
From: Liu Ping Fan <pingfank@linux.vnet.ibm.com>

virtio-blk will reference to RAM's memoryRegion when the req has been
done.  So we can avoid to call bdrv_drain_all() when RAM hot unplug.

Signed-off-by: Liu Ping Fan <pingfank@linux.vnet.ibm.com>
---
 hw/dataplane/virtio-blk.c |   52 ++++++++++++++++++++++++++++++++++----------
 1 files changed, 40 insertions(+), 12 deletions(-)
liguang - April 2, 2013, 5:58 a.m.
在 2013-04-01一的 16:20 +0800,Liu Ping Fan写道:
> From: Liu Ping Fan <pingfank@linux.vnet.ibm.com>
> 
> virtio-blk will reference to RAM's memoryRegion when the req has been
> done. 

  do you mean unreference after req completed?

> So we can avoid to call bdrv_drain_all() when RAM hot unplug.
> 
> Signed-off-by: Liu Ping Fan <pingfank@linux.vnet.ibm.com>
> ---
>  hw/dataplane/virtio-blk.c |   52 ++++++++++++++++++++++++++++++++++----------
>  1 files changed, 40 insertions(+), 12 deletions(-)
> 
> diff --git a/hw/dataplane/virtio-blk.c b/hw/dataplane/virtio-blk.c
> index 1242d61..437174d 100644
> --- a/hw/dataplane/virtio-blk.c
> +++ b/hw/dataplane/virtio-blk.c
> @@ -34,6 +34,8 @@ enum {
>  
>  typedef struct {
>      struct iocb iocb;               /* Linux AIO control block */
> +    MemoryRegion *mrs[VRING_MAX];
> +    int mrs_cnt;
>      QEMUIOVector *inhdr;            /* iovecs for virtio_blk_inhdr */
>      unsigned int head;              /* vring descriptor index */
>      struct iovec *bounce_iov;       /* used if guest buffers are unaligned */
> @@ -120,6 +122,9 @@ static void complete_request(struct iocb *iocb, ssize_t ret, void *opaque)
>       * transferred plus the status bytes.
>       */
>      vring_push(&s->vring, req->head, len + sizeof(hdr));
> +    while (--req->mrs_cnt >= 0) {
> +        memory_region_unref(req->mrs[req->mrs_cnt]);
> +    }
>  
>      s->num_reqs--;
>  }
> @@ -155,7 +160,8 @@ static void do_get_id_cmd(VirtIOBlockDataPlane *s,
>  static int do_rdwr_cmd(VirtIOBlockDataPlane *s, bool read,
>                         struct iovec *iov, unsigned int iov_cnt,
>                         long long offset, unsigned int head,
> -                       QEMUIOVector *inhdr)
> +                       QEMUIOVector *inhdr,
> +                       MemoryRegion **mrs, int cnt)
>  {
>      struct iocb *iocb;
>      QEMUIOVector qiov;
> @@ -187,6 +193,8 @@ static int do_rdwr_cmd(VirtIOBlockDataPlane *s, bool read,
>  
>      /* Fill in virtio block metadata needed for completion */
>      VirtIOBlockRequest *req = container_of(iocb, VirtIOBlockRequest, iocb);
> +    memcpy(req->mrs, mrs, cnt*sizeof(MemoryRegion *));
> +    req->mrs_cnt = cnt;
>      req->head = head;
>      req->inhdr = inhdr;
>      req->bounce_iov = bounce_iov;
> @@ -196,19 +204,22 @@ static int do_rdwr_cmd(VirtIOBlockDataPlane *s, bool read,
>  
>  static int process_request(IOQueue *ioq, struct iovec iov[],
>                             unsigned int out_num, unsigned int in_num,
> -                           unsigned int head)
> +                           unsigned int head, MemoryRegion **mrs)
>  {
>      VirtIOBlockDataPlane *s = container_of(ioq, VirtIOBlockDataPlane, ioqueue);
>      struct iovec *in_iov = &iov[out_num];
>      struct virtio_blk_outhdr outhdr;
>      QEMUIOVector *inhdr;
>      size_t in_size;
> +    unsigned int i, cnt = out_num+in_num;
> +    int ret;
>  
>      /* Copy in outhdr */
>      if (unlikely(iov_to_buf(iov, out_num, 0, &outhdr,
>                              sizeof(outhdr)) != sizeof(outhdr))) {
>          error_report("virtio-blk request outhdr too short");
> -        return -EFAULT;
> +        ret = -EFAULT;
> +        goto free_mrs;
>      }
>      iov_discard_front(&iov, &out_num, sizeof(outhdr));
>  
> @@ -216,7 +227,8 @@ static int process_request(IOQueue *ioq, struct iovec iov[],
>      in_size = iov_size(in_iov, in_num);
>      if (in_size < sizeof(struct virtio_blk_inhdr)) {
>          error_report("virtio_blk request inhdr too short");
> -        return -EFAULT;
> +        ret = -EFAULT;
> +        goto free_mrs;
>      }
>      inhdr = g_slice_new(QEMUIOVector);
>      qemu_iovec_init(inhdr, 1);
> @@ -229,18 +241,22 @@ static int process_request(IOQueue *ioq, struct iovec iov[],
>      outhdr.type &= ~VIRTIO_BLK_T_BARRIER;
>  
>      switch (outhdr.type) {
> +    /* For VIRTIO_BLK_T_IN/OUT, MemoryRegion will be release when cb */
>      case VIRTIO_BLK_T_IN:
> -        do_rdwr_cmd(s, true, in_iov, in_num, outhdr.sector * 512, head, inhdr);
> +        do_rdwr_cmd(s, true, in_iov, in_num, outhdr.sector * 512, head, inhdr,
> +            mrs, cnt);
>          return 0;
>  
>      case VIRTIO_BLK_T_OUT:
> -        do_rdwr_cmd(s, false, iov, out_num, outhdr.sector * 512, head, inhdr);
> +        do_rdwr_cmd(s, false, iov, out_num, outhdr.sector * 512, head, inhdr,
> +            mrs, cnt);
>          return 0;
>  
>      case VIRTIO_BLK_T_SCSI_CMD:
>          /* TODO support SCSI commands */
>          complete_request_early(s, head, inhdr, VIRTIO_BLK_S_UNSUPP);
> -        return 0;
> +        ret = 0;
> +        goto free_mrs;

IMHO, 's/goto free_mrs/break/g', so free_mrs label can be removed.

>  
>      case VIRTIO_BLK_T_FLUSH:
>          /* TODO fdsync not supported by Linux AIO, do it synchronously here! */
> @@ -249,18 +265,27 @@ static int process_request(IOQueue *ioq, struct iovec iov[],
>          } else {
>              complete_request_early(s, head, inhdr, VIRTIO_BLK_S_OK);
>          }
> -        return 0;
> +        ret = 0;
> +        goto free_mrs;
>  
>      case VIRTIO_BLK_T_GET_ID:
>          do_get_id_cmd(s, in_iov, in_num, head, inhdr);
> -        return 0;
> +        ret = 0;
> +        goto free_mrs;
>  
>      default:
>          error_report("virtio-blk unsupported request type %#x", outhdr.type);
>          qemu_iovec_destroy(inhdr);
>          g_slice_free(QEMUIOVector, inhdr);
> -        return -EFAULT;
> +        ret = -EFAULT;
> +        goto free_mrs;
> +    }
> +
> +free_mrs:
> +    for (i = 0; i < cnt; i++) {
> +        memory_region_unref(mrs[i]);
>      }
> +    return ret;
>  }
>  
>  static int flush_true(EventNotifier *e)
> @@ -286,6 +311,7 @@ static void handle_notify(EventNotifier *e)
>      struct iovec iovec[VRING_MAX];
>      struct iovec *end = &iovec[VRING_MAX];
>      struct iovec *iov = iovec;
> +    MemoryRegion *mrs[VRING_MAX];
>  
>      /* When a request is read from the vring, the index of the first descriptor
>       * (aka head) is returned so that the completed request can be pushed onto
> @@ -304,7 +330,8 @@ static void handle_notify(EventNotifier *e)
>          vring_disable_notification(s->vdev, &s->vring);
>  
>          for (;;) {
> -            head = vring_pop(s->vdev, &s->vring, iov, end, &out_num, &in_num);
> +            head = vring_pop(s->vdev, &s->vring, iov, end, &out_num, &in_num,
> +                            mrs);
>              if (head < 0) {
>                  break; /* no more requests */
>              }
> @@ -312,7 +339,8 @@ static void handle_notify(EventNotifier *e)
>              trace_virtio_blk_data_plane_process_request(s, out_num, in_num,
>                                                          head);
>  
> -            if (process_request(&s->ioqueue, iov, out_num, in_num, head) < 0) {
> +            if (process_request(&s->ioqueue, iov, out_num, in_num, head,
> +                    mrs) < 0) {
>                  vring_set_broken(&s->vring);
>                  break;
>              }
Stefan Hajnoczi - April 11, 2013, 10:20 a.m.
On Mon, Apr 01, 2013 at 04:20:33PM +0800, Liu Ping Fan wrote:
> From: Liu Ping Fan <pingfank@linux.vnet.ibm.com>
> 
> virtio-blk will reference to RAM's memoryRegion when the req has been
> done.  So we can avoid to call bdrv_drain_all() when RAM hot unplug.

How does the hot unplug operation work without bdrv_drain_all()?  In
other words, how do we safely remove a MemoryRegion and wait for it to
become unreferenced?

Stefan
pingfan liu - April 12, 2013, 4:44 a.m.
On Tue, Apr 2, 2013 at 1:58 PM, li guang <lig.fnst@cn.fujitsu.com> wrote:
> 在 2013-04-01一的 16:20 +0800,Liu Ping Fan写道:
>> From: Liu Ping Fan <pingfank@linux.vnet.ibm.com>
>>
>> virtio-blk will reference to RAM's memoryRegion when the req has been
>> done.
>
>   do you mean unreference after req completed?
>
Yes, should s/reference/unreference/

>> So we can avoid to call bdrv_drain_all() when RAM hot unplug.
>>
>> Signed-off-by: Liu Ping Fan <pingfank@linux.vnet.ibm.com>
>> ---
>>  hw/dataplane/virtio-blk.c |   52 ++++++++++++++++++++++++++++++++++----------
>>  1 files changed, 40 insertions(+), 12 deletions(-)
>>
>> diff --git a/hw/dataplane/virtio-blk.c b/hw/dataplane/virtio-blk.c
>> index 1242d61..437174d 100644
>> --- a/hw/dataplane/virtio-blk.c
>> +++ b/hw/dataplane/virtio-blk.c
>> @@ -34,6 +34,8 @@ enum {
>>
>>  typedef struct {
>>      struct iocb iocb;               /* Linux AIO control block */
>> +    MemoryRegion *mrs[VRING_MAX];
>> +    int mrs_cnt;
>>      QEMUIOVector *inhdr;            /* iovecs for virtio_blk_inhdr */
>>      unsigned int head;              /* vring descriptor index */
>>      struct iovec *bounce_iov;       /* used if guest buffers are unaligned */
>> @@ -120,6 +122,9 @@ static void complete_request(struct iocb *iocb, ssize_t ret, void *opaque)
>>       * transferred plus the status bytes.
>>       */
>>      vring_push(&s->vring, req->head, len + sizeof(hdr));
>> +    while (--req->mrs_cnt >= 0) {
>> +        memory_region_unref(req->mrs[req->mrs_cnt]);
>> +    }
>>
>>      s->num_reqs--;
>>  }
>> @@ -155,7 +160,8 @@ static void do_get_id_cmd(VirtIOBlockDataPlane *s,
>>  static int do_rdwr_cmd(VirtIOBlockDataPlane *s, bool read,
>>                         struct iovec *iov, unsigned int iov_cnt,
>>                         long long offset, unsigned int head,
>> -                       QEMUIOVector *inhdr)
>> +                       QEMUIOVector *inhdr,
>> +                       MemoryRegion **mrs, int cnt)
>>  {
>>      struct iocb *iocb;
>>      QEMUIOVector qiov;
>> @@ -187,6 +193,8 @@ static int do_rdwr_cmd(VirtIOBlockDataPlane *s, bool read,
>>
>>      /* Fill in virtio block metadata needed for completion */
>>      VirtIOBlockRequest *req = container_of(iocb, VirtIOBlockRequest, iocb);
>> +    memcpy(req->mrs, mrs, cnt*sizeof(MemoryRegion *));
>> +    req->mrs_cnt = cnt;
>>      req->head = head;
>>      req->inhdr = inhdr;
>>      req->bounce_iov = bounce_iov;
>> @@ -196,19 +204,22 @@ static int do_rdwr_cmd(VirtIOBlockDataPlane *s, bool read,
>>
>>  static int process_request(IOQueue *ioq, struct iovec iov[],
>>                             unsigned int out_num, unsigned int in_num,
>> -                           unsigned int head)
>> +                           unsigned int head, MemoryRegion **mrs)
>>  {
>>      VirtIOBlockDataPlane *s = container_of(ioq, VirtIOBlockDataPlane, ioqueue);
>>      struct iovec *in_iov = &iov[out_num];
>>      struct virtio_blk_outhdr outhdr;
>>      QEMUIOVector *inhdr;
>>      size_t in_size;
>> +    unsigned int i, cnt = out_num+in_num;
>> +    int ret;
>>
>>      /* Copy in outhdr */
>>      if (unlikely(iov_to_buf(iov, out_num, 0, &outhdr,
>>                              sizeof(outhdr)) != sizeof(outhdr))) {
>>          error_report("virtio-blk request outhdr too short");
>> -        return -EFAULT;
>> +        ret = -EFAULT;
>> +        goto free_mrs;
>>      }
>>      iov_discard_front(&iov, &out_num, sizeof(outhdr));
>>
>> @@ -216,7 +227,8 @@ static int process_request(IOQueue *ioq, struct iovec iov[],
>>      in_size = iov_size(in_iov, in_num);
>>      if (in_size < sizeof(struct virtio_blk_inhdr)) {
>>          error_report("virtio_blk request inhdr too short");
>> -        return -EFAULT;
>> +        ret = -EFAULT;
>> +        goto free_mrs;
>>      }
>>      inhdr = g_slice_new(QEMUIOVector);
>>      qemu_iovec_init(inhdr, 1);
>> @@ -229,18 +241,22 @@ static int process_request(IOQueue *ioq, struct iovec iov[],
>>      outhdr.type &= ~VIRTIO_BLK_T_BARRIER;
>>
>>      switch (outhdr.type) {
>> +    /* For VIRTIO_BLK_T_IN/OUT, MemoryRegion will be release when cb */
>>      case VIRTIO_BLK_T_IN:
>> -        do_rdwr_cmd(s, true, in_iov, in_num, outhdr.sector * 512, head, inhdr);
>> +        do_rdwr_cmd(s, true, in_iov, in_num, outhdr.sector * 512, head, inhdr,
>> +            mrs, cnt);
>>          return 0;
>>
>>      case VIRTIO_BLK_T_OUT:
>> -        do_rdwr_cmd(s, false, iov, out_num, outhdr.sector * 512, head, inhdr);
>> +        do_rdwr_cmd(s, false, iov, out_num, outhdr.sector * 512, head, inhdr,
>> +            mrs, cnt);
>>          return 0;
>>
>>      case VIRTIO_BLK_T_SCSI_CMD:
>>          /* TODO support SCSI commands */
>>          complete_request_early(s, head, inhdr, VIRTIO_BLK_S_UNSUPP);
>> -        return 0;
>> +        ret = 0;
>> +        goto free_mrs;
>
> IMHO, 's/goto free_mrs/break/g', so free_mrs label can be removed.
>
Adopt.  Thanks.
Pingfan
>>
>>      case VIRTIO_BLK_T_FLUSH:
>>          /* TODO fdsync not supported by Linux AIO, do it synchronously here! */
>> @@ -249,18 +265,27 @@ static int process_request(IOQueue *ioq, struct iovec iov[],
>>          } else {
>>              complete_request_early(s, head, inhdr, VIRTIO_BLK_S_OK);
>>          }
>> -        return 0;
>> +        ret = 0;
>> +        goto free_mrs;
>>
>>      case VIRTIO_BLK_T_GET_ID:
>>          do_get_id_cmd(s, in_iov, in_num, head, inhdr);
>> -        return 0;
>> +        ret = 0;
>> +        goto free_mrs;
>>
>>      default:
>>          error_report("virtio-blk unsupported request type %#x", outhdr.type);
>>          qemu_iovec_destroy(inhdr);
>>          g_slice_free(QEMUIOVector, inhdr);
>> -        return -EFAULT;
>> +        ret = -EFAULT;
>> +        goto free_mrs;
>> +    }
>> +
>> +free_mrs:
>> +    for (i = 0; i < cnt; i++) {
>> +        memory_region_unref(mrs[i]);
>>      }
>> +    return ret;
>>  }
>>
>>  static int flush_true(EventNotifier *e)
>> @@ -286,6 +311,7 @@ static void handle_notify(EventNotifier *e)
>>      struct iovec iovec[VRING_MAX];
>>      struct iovec *end = &iovec[VRING_MAX];
>>      struct iovec *iov = iovec;
>> +    MemoryRegion *mrs[VRING_MAX];
>>
>>      /* When a request is read from the vring, the index of the first descriptor
>>       * (aka head) is returned so that the completed request can be pushed onto
>> @@ -304,7 +330,8 @@ static void handle_notify(EventNotifier *e)
>>          vring_disable_notification(s->vdev, &s->vring);
>>
>>          for (;;) {
>> -            head = vring_pop(s->vdev, &s->vring, iov, end, &out_num, &in_num);
>> +            head = vring_pop(s->vdev, &s->vring, iov, end, &out_num, &in_num,
>> +                            mrs);
>>              if (head < 0) {
>>                  break; /* no more requests */
>>              }
>> @@ -312,7 +339,8 @@ static void handle_notify(EventNotifier *e)
>>              trace_virtio_blk_data_plane_process_request(s, out_num, in_num,
>>                                                          head);
>>
>> -            if (process_request(&s->ioqueue, iov, out_num, in_num, head) < 0) {
>> +            if (process_request(&s->ioqueue, iov, out_num, in_num, head,
>> +                    mrs) < 0) {
>>                  vring_set_broken(&s->vring);
>>                  break;
>>              }
>
>
pingfan liu - April 12, 2013, 4:48 a.m.
On Thu, Apr 11, 2013 at 6:20 PM, Stefan Hajnoczi <stefanha@gmail.com> wrote:
> On Mon, Apr 01, 2013 at 04:20:33PM +0800, Liu Ping Fan wrote:
>> From: Liu Ping Fan <pingfank@linux.vnet.ibm.com>
>>
>> virtio-blk will reference to RAM's memoryRegion when the req has been
>> done.  So we can avoid to call bdrv_drain_all() when RAM hot unplug.
>
> How does the hot unplug operation work without bdrv_drain_all()?  In
> other words, how do we safely remove a MemoryRegion and wait for it to
> become unreferenced?
>
bdrv_drain_all() forces the end of usage of memoryRegion. But we can
let the req done callback ( marks this req finish the end of usage of
mr) to release the refcnt of memoryRegion.

> Stefan
Stefan Hajnoczi - April 12, 2013, 8:45 a.m.
On Fri, Apr 12, 2013 at 12:48:12PM +0800, liu ping fan wrote:
> On Thu, Apr 11, 2013 at 6:20 PM, Stefan Hajnoczi <stefanha@gmail.com> wrote:
> > On Mon, Apr 01, 2013 at 04:20:33PM +0800, Liu Ping Fan wrote:
> >> From: Liu Ping Fan <pingfank@linux.vnet.ibm.com>
> >>
> >> virtio-blk will reference to RAM's memoryRegion when the req has been
> >> done.  So we can avoid to call bdrv_drain_all() when RAM hot unplug.
> >
> > How does the hot unplug operation work without bdrv_drain_all()?  In
> > other words, how do we safely remove a MemoryRegion and wait for it to
> > become unreferenced?
> >
> bdrv_drain_all() forces the end of usage of memoryRegion. But we can
> let the req done callback ( marks this req finish the end of usage of
> mr) to release the refcnt of memoryRegion.

Yes.  What I'm interested in is the wait mechanism for the QEMU thread
to wait until the memory region(s) become unreferenced.

This patch series is only one half of the memory unplug puzzle and I'd
like to understand how the other half - the unplug operation - will be
implemented.

Just a summary would be interesting - especially how a QEMU thread will
wait until memory regions have been released.  The reference counter
doesn't have any notification that would allow a blocking wait.

Then you have the RCU concept.  So maybe the unplug operation will not
block but instead be called several times from the event loop until all
references have been released?

Stefan
pingfan liu - April 12, 2013, 9:05 a.m.
On Fri, Apr 12, 2013 at 4:45 PM, Stefan Hajnoczi <stefanha@gmail.com> wrote:
> On Fri, Apr 12, 2013 at 12:48:12PM +0800, liu ping fan wrote:
>> On Thu, Apr 11, 2013 at 6:20 PM, Stefan Hajnoczi <stefanha@gmail.com> wrote:
>> > On Mon, Apr 01, 2013 at 04:20:33PM +0800, Liu Ping Fan wrote:
>> >> From: Liu Ping Fan <pingfank@linux.vnet.ibm.com>
>> >>
>> >> virtio-blk will reference to RAM's memoryRegion when the req has been
>> >> done.  So we can avoid to call bdrv_drain_all() when RAM hot unplug.
>> >
>> > How does the hot unplug operation work without bdrv_drain_all()?  In
>> > other words, how do we safely remove a MemoryRegion and wait for it to
>> > become unreferenced?
>> >
>> bdrv_drain_all() forces the end of usage of memoryRegion. But we can
>> let the req done callback ( marks this req finish the end of usage of
>> mr) to release the refcnt of memoryRegion.
>
> Yes.  What I'm interested in is the wait mechanism for the QEMU thread
> to wait until the memory region(s) become unreferenced.
>
> This patch series is only one half of the memory unplug puzzle and I'd
> like to understand how the other half - the unplug operation - will be
> implemented.
>
The unplug patch is still under developed, more detail please refer to
Vasilis Liaskovitis's patches:
   http://lists.gnu.org/archive/html/qemu-devel/2012-12/msg02693.html

> Just a summary would be interesting - especially how a QEMU thread will
> wait until memory regions have been released.  The reference counter
> doesn't have any notification that would allow a blocking wait.
>
Sorry, not understand "a blocking wait".  To summary, when
initializing, RamDevice's refcnt == 1, and unplug will release this
one. Meanwhile, all the MemoryListeners which are async with unplug,
will inc refcnt to against the unplug event.
> Then you have the RCU concept.  So maybe the unplug operation will not
> block but instead be called several times from the event loop until all
> references have been released?
>
As mentioned above, unplug will put its own refcnt. And unplug will not block.


> Stefan
Stefan Hajnoczi - April 16, 2013, 7:57 a.m.
On Fri, Apr 12, 2013 at 05:05:41PM +0800, liu ping fan wrote:
> On Fri, Apr 12, 2013 at 4:45 PM, Stefan Hajnoczi <stefanha@gmail.com> wrote:
> > On Fri, Apr 12, 2013 at 12:48:12PM +0800, liu ping fan wrote:
> >> On Thu, Apr 11, 2013 at 6:20 PM, Stefan Hajnoczi <stefanha@gmail.com> wrote:
> >> > On Mon, Apr 01, 2013 at 04:20:33PM +0800, Liu Ping Fan wrote:
> >> >> From: Liu Ping Fan <pingfank@linux.vnet.ibm.com>
> >> >>
> >> >> virtio-blk will reference to RAM's memoryRegion when the req has been
> >> >> done.  So we can avoid to call bdrv_drain_all() when RAM hot unplug.
> >> >
> >> > How does the hot unplug operation work without bdrv_drain_all()?  In
> >> > other words, how do we safely remove a MemoryRegion and wait for it to
> >> > become unreferenced?
> >> >
> >> bdrv_drain_all() forces the end of usage of memoryRegion. But we can
> >> let the req done callback ( marks this req finish the end of usage of
> >> mr) to release the refcnt of memoryRegion.
> >
> > Yes.  What I'm interested in is the wait mechanism for the QEMU thread
> > to wait until the memory region(s) become unreferenced.
> >
> > This patch series is only one half of the memory unplug puzzle and I'd
> > like to understand how the other half - the unplug operation - will be
> > implemented.
> >
> The unplug patch is still under developed, more detail please refer to
> Vasilis Liaskovitis's patches:
>    http://lists.gnu.org/archive/html/qemu-devel/2012-12/msg02693.html
> 
> > Just a summary would be interesting - especially how a QEMU thread will
> > wait until memory regions have been released.  The reference counter
> > doesn't have any notification that would allow a blocking wait.
> >
> Sorry, not understand "a blocking wait".  To summary, when
> initializing, RamDevice's refcnt == 1, and unplug will release this
> one. Meanwhile, all the MemoryListeners which are async with unplug,
> will inc refcnt to against the unplug event.

Okay, thanks for the summary.  I don't need to see patches, I just want
to understand how the changes implemented in this series will be used.

> > Then you have the RCU concept.  So maybe the unplug operation will not
> > block but instead be called several times from the event loop until all
> > references have been released?
> >
> As mentioned above, unplug will put its own refcnt. And unplug will not block.

So it sounds like unplug will not block and there is no guarantee the
memory is actually unplugged when the monitor command completes.  The
memory region is only released when the last reference count holder lets
go.

This means that pending I/O to a hung NFS mount can delay the actual
unplug for unbounded time (by default the kernel NFS client keeps
retrying and does not fail the I/O request).  The user will be able to
issue additional monitor commands and see that memory is not yet
unplugged?

Stefan
Paolo Bonzini - April 16, 2013, 8:12 a.m.
Il 16/04/2013 09:57, Stefan Hajnoczi ha scritto:
> So it sounds like unplug will not block and there is no guarantee the
> memory is actually unplugged when the monitor command completes.  The
> memory region is only released when the last reference count holder lets
> go.
> 
> This means that pending I/O to a hung NFS mount can delay the actual
> unplug for unbounded time (by default the kernel NFS client keeps
> retrying and does not fail the I/O request).  The user will be able to
> issue additional monitor commands and see that memory is not yet
> unplugged?

I think "info mtree" would provide information.  We can add an event
too, similar to the recently added DEVICE_DELETED.

Paolo

Patch

diff --git a/hw/dataplane/virtio-blk.c b/hw/dataplane/virtio-blk.c
index 1242d61..437174d 100644
--- a/hw/dataplane/virtio-blk.c
+++ b/hw/dataplane/virtio-blk.c
@@ -34,6 +34,8 @@  enum {
 
 typedef struct {
     struct iocb iocb;               /* Linux AIO control block */
+    MemoryRegion *mrs[VRING_MAX];
+    int mrs_cnt;
     QEMUIOVector *inhdr;            /* iovecs for virtio_blk_inhdr */
     unsigned int head;              /* vring descriptor index */
     struct iovec *bounce_iov;       /* used if guest buffers are unaligned */
@@ -120,6 +122,9 @@  static void complete_request(struct iocb *iocb, ssize_t ret, void *opaque)
      * transferred plus the status bytes.
      */
     vring_push(&s->vring, req->head, len + sizeof(hdr));
+    while (--req->mrs_cnt >= 0) {
+        memory_region_unref(req->mrs[req->mrs_cnt]);
+    }
 
     s->num_reqs--;
 }
@@ -155,7 +160,8 @@  static void do_get_id_cmd(VirtIOBlockDataPlane *s,
 static int do_rdwr_cmd(VirtIOBlockDataPlane *s, bool read,
                        struct iovec *iov, unsigned int iov_cnt,
                        long long offset, unsigned int head,
-                       QEMUIOVector *inhdr)
+                       QEMUIOVector *inhdr,
+                       MemoryRegion **mrs, int cnt)
 {
     struct iocb *iocb;
     QEMUIOVector qiov;
@@ -187,6 +193,8 @@  static int do_rdwr_cmd(VirtIOBlockDataPlane *s, bool read,
 
     /* Fill in virtio block metadata needed for completion */
     VirtIOBlockRequest *req = container_of(iocb, VirtIOBlockRequest, iocb);
+    memcpy(req->mrs, mrs, cnt*sizeof(MemoryRegion *));
+    req->mrs_cnt = cnt;
     req->head = head;
     req->inhdr = inhdr;
     req->bounce_iov = bounce_iov;
@@ -196,19 +204,22 @@  static int do_rdwr_cmd(VirtIOBlockDataPlane *s, bool read,
 
 static int process_request(IOQueue *ioq, struct iovec iov[],
                            unsigned int out_num, unsigned int in_num,
-                           unsigned int head)
+                           unsigned int head, MemoryRegion **mrs)
 {
     VirtIOBlockDataPlane *s = container_of(ioq, VirtIOBlockDataPlane, ioqueue);
     struct iovec *in_iov = &iov[out_num];
     struct virtio_blk_outhdr outhdr;
     QEMUIOVector *inhdr;
     size_t in_size;
+    unsigned int i, cnt = out_num+in_num;
+    int ret;
 
     /* Copy in outhdr */
     if (unlikely(iov_to_buf(iov, out_num, 0, &outhdr,
                             sizeof(outhdr)) != sizeof(outhdr))) {
         error_report("virtio-blk request outhdr too short");
-        return -EFAULT;
+        ret = -EFAULT;
+        goto free_mrs;
     }
     iov_discard_front(&iov, &out_num, sizeof(outhdr));
 
@@ -216,7 +227,8 @@  static int process_request(IOQueue *ioq, struct iovec iov[],
     in_size = iov_size(in_iov, in_num);
     if (in_size < sizeof(struct virtio_blk_inhdr)) {
         error_report("virtio_blk request inhdr too short");
-        return -EFAULT;
+        ret = -EFAULT;
+        goto free_mrs;
     }
     inhdr = g_slice_new(QEMUIOVector);
     qemu_iovec_init(inhdr, 1);
@@ -229,18 +241,22 @@  static int process_request(IOQueue *ioq, struct iovec iov[],
     outhdr.type &= ~VIRTIO_BLK_T_BARRIER;
 
     switch (outhdr.type) {
+    /* For VIRTIO_BLK_T_IN/OUT, MemoryRegion will be release when cb */
     case VIRTIO_BLK_T_IN:
-        do_rdwr_cmd(s, true, in_iov, in_num, outhdr.sector * 512, head, inhdr);
+        do_rdwr_cmd(s, true, in_iov, in_num, outhdr.sector * 512, head, inhdr,
+            mrs, cnt);
         return 0;
 
     case VIRTIO_BLK_T_OUT:
-        do_rdwr_cmd(s, false, iov, out_num, outhdr.sector * 512, head, inhdr);
+        do_rdwr_cmd(s, false, iov, out_num, outhdr.sector * 512, head, inhdr,
+            mrs, cnt);
         return 0;
 
     case VIRTIO_BLK_T_SCSI_CMD:
         /* TODO support SCSI commands */
         complete_request_early(s, head, inhdr, VIRTIO_BLK_S_UNSUPP);
-        return 0;
+        ret = 0;
+        goto free_mrs;
 
     case VIRTIO_BLK_T_FLUSH:
         /* TODO fdsync not supported by Linux AIO, do it synchronously here! */
@@ -249,18 +265,27 @@  static int process_request(IOQueue *ioq, struct iovec iov[],
         } else {
             complete_request_early(s, head, inhdr, VIRTIO_BLK_S_OK);
         }
-        return 0;
+        ret = 0;
+        goto free_mrs;
 
     case VIRTIO_BLK_T_GET_ID:
         do_get_id_cmd(s, in_iov, in_num, head, inhdr);
-        return 0;
+        ret = 0;
+        goto free_mrs;
 
     default:
         error_report("virtio-blk unsupported request type %#x", outhdr.type);
         qemu_iovec_destroy(inhdr);
         g_slice_free(QEMUIOVector, inhdr);
-        return -EFAULT;
+        ret = -EFAULT;
+        goto free_mrs;
+    }
+
+free_mrs:
+    for (i = 0; i < cnt; i++) {
+        memory_region_unref(mrs[i]);
     }
+    return ret;
 }
 
 static int flush_true(EventNotifier *e)
@@ -286,6 +311,7 @@  static void handle_notify(EventNotifier *e)
     struct iovec iovec[VRING_MAX];
     struct iovec *end = &iovec[VRING_MAX];
     struct iovec *iov = iovec;
+    MemoryRegion *mrs[VRING_MAX];
 
     /* When a request is read from the vring, the index of the first descriptor
      * (aka head) is returned so that the completed request can be pushed onto
@@ -304,7 +330,8 @@  static void handle_notify(EventNotifier *e)
         vring_disable_notification(s->vdev, &s->vring);
 
         for (;;) {
-            head = vring_pop(s->vdev, &s->vring, iov, end, &out_num, &in_num);
+            head = vring_pop(s->vdev, &s->vring, iov, end, &out_num, &in_num,
+                            mrs);
             if (head < 0) {
                 break; /* no more requests */
             }
@@ -312,7 +339,8 @@  static void handle_notify(EventNotifier *e)
             trace_virtio_blk_data_plane_process_request(s, out_num, in_num,
                                                         head);
 
-            if (process_request(&s->ioqueue, iov, out_num, in_num, head) < 0) {
+            if (process_request(&s->ioqueue, iov, out_num, in_num, head,
+                    mrs) < 0) {
                 vring_set_broken(&s->vring);
                 break;
             }