Patchwork [v4,3/8] block: add request tracking

login
register
mail settings
Submitter Stefan Hajnoczi
Date Nov. 23, 2011, 11:47 a.m.
Message ID <1322048878-26348-4-git-send-email-stefanha@linux.vnet.ibm.com>
Download mbox | patch
Permalink /patch/127300/
State New
Headers show

Comments

Stefan Hajnoczi - Nov. 23, 2011, 11:47 a.m.
The block layer does not know about pending requests.  This information
is necessary for copy-on-read since overlapping requests must be
serialized to prevent races that corrupt the image.

The BlockDriverState gets a new tracked_request list field which
contains all pending requests.  Each request is a BdrvTrackedRequest
record with sector_num, nb_sectors, and is_write fields.

Note that request tracking is always enabled but hopefully this extra
work is so small that it doesn't justify adding an enable/disable flag.

Signed-off-by: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
---
 block.c     |   48 +++++++++++++++++++++++++++++++++++++++++++++++-
 block_int.h |    4 ++++
 2 files changed, 51 insertions(+), 1 deletions(-)
Marcelo Tosatti - Dec. 5, 2011, 12:17 p.m.
On Wed, Nov 23, 2011 at 11:47:53AM +0000, Stefan Hajnoczi wrote:
> The block layer does not know about pending requests.  This information
> is necessary for copy-on-read since overlapping requests must be
> serialized to prevent races that corrupt the image.
> 
> The BlockDriverState gets a new tracked_request list field which
> contains all pending requests.  Each request is a BdrvTrackedRequest
> record with sector_num, nb_sectors, and is_write fields.
> 
> Note that request tracking is always enabled but hopefully this extra
> work is so small that it doesn't justify adding an enable/disable flag.
> 
> Signed-off-by: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
> ---
>  block.c     |   48 +++++++++++++++++++++++++++++++++++++++++++++++-
>  block_int.h |    4 ++++
>  2 files changed, 51 insertions(+), 1 deletions(-)
> 
> diff --git a/block.c b/block.c
> index 0df7eb9..27c4e84 100644
> --- a/block.c
> +++ b/block.c
> @@ -1071,6 +1071,42 @@ void bdrv_commit_all(void)
>      }
>  }
>  
> +struct BdrvTrackedRequest {
> +    BlockDriverState *bs;
> +    int64_t sector_num;
> +    int nb_sectors;
> +    bool is_write;
> +    QLIST_ENTRY(BdrvTrackedRequest) list;
> +};
> +
> +/**
> + * Remove an active request from the tracked requests list
> + *
> + * This function should be called when a tracked request is completing.
> + */
> +static void tracked_request_end(BdrvTrackedRequest *req)
> +{
> +    QLIST_REMOVE(req, list);
> +}
> +
> +/**
> + * Add an active request to the tracked requests list
> + */
> +static void tracked_request_begin(BdrvTrackedRequest *req,
> +                                  BlockDriverState *bs,
> +                                  int64_t sector_num,
> +                                  int nb_sectors, bool is_write)
> +{
> +    *req = (BdrvTrackedRequest){
> +        .bs = bs,
> +        .sector_num = sector_num,
> +        .nb_sectors = nb_sectors,
> +        .is_write = is_write,
> +    };
> +
> +    QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
> +}
> +
>  /*
>   * Return values:
>   * 0        - success
> @@ -1350,6 +1386,8 @@ static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
>      int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
>  {
>      BlockDriver *drv = bs->drv;
> +    BdrvTrackedRequest req;
> +    int ret;
>  
>      if (!drv) {
>          return -ENOMEDIUM;
> @@ -1363,7 +1401,10 @@ static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
>          bdrv_io_limits_intercept(bs, false, nb_sectors);
>      }
>  
> -    return drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
> +    tracked_request_begin(&req, bs, sector_num, nb_sectors, false);
> +    ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
> +    tracked_request_end(&req);
> +    return ret;
>  }
>  
>  int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
> @@ -1381,6 +1422,7 @@ static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
>      int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
>  {
>      BlockDriver *drv = bs->drv;
> +    BdrvTrackedRequest req;
>      int ret;
>  
>      if (!bs->drv) {
> @@ -1398,6 +1440,8 @@ static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
>          bdrv_io_limits_intercept(bs, true, nb_sectors);
>      }
>  
> +    tracked_request_begin(&req, bs, sector_num, nb_sectors, true);
> +
>      ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
>  
>      if (bs->dirty_bitmap) {
> @@ -1408,6 +1452,8 @@ static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
>          bs->wr_highest_sector = sector_num + nb_sectors - 1;
>      }
>  
> +    tracked_request_end(&req);
> +
>      return ret;
>  }

There is no need to worry about synchronous read/write requests
bypassing this interface, correct?
Paolo Bonzini - Dec. 5, 2011, 12:20 p.m.
On 12/05/2011 01:17 PM, Marcelo Tosatti wrote:
> There is no need to worry about synchronous read/write requests
> bypassing this interface, correct?
>

Synchronous read/write requests do not exist anymore (in the sense that 
they also go through coroutines).

Paolo
Marcelo Tosatti - Dec. 5, 2011, 4:09 p.m.
On Wed, Nov 23, 2011 at 11:47:53AM +0000, Stefan Hajnoczi wrote:
> The block layer does not know about pending requests.  This information
> is necessary for copy-on-read since overlapping requests must be
> serialized to prevent races that corrupt the image.
> 
> The BlockDriverState gets a new tracked_request list field which
> contains all pending requests.  Each request is a BdrvTrackedRequest
> record with sector_num, nb_sectors, and is_write fields.
> 
> Note that request tracking is always enabled but hopefully this extra
> work is so small that it doesn't justify adding an enable/disable flag.
> 
> Signed-off-by: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
> ---
>  block.c     |   48 +++++++++++++++++++++++++++++++++++++++++++++++-
>  block_int.h |    4 ++++
>  2 files changed, 51 insertions(+), 1 deletions(-)
> 
> diff --git a/block.c b/block.c
> index 0df7eb9..27c4e84 100644
> --- a/block.c
> +++ b/block.c
> @@ -1071,6 +1071,42 @@ void bdrv_commit_all(void)
>      }
>  }
>  
> +struct BdrvTrackedRequest {
> +    BlockDriverState *bs;
> +    int64_t sector_num;
> +    int nb_sectors;
> +    bool is_write;
> +    QLIST_ENTRY(BdrvTrackedRequest) list;
> +};
> +
> +/**
> + * Remove an active request from the tracked requests list
> + *
> + * This function should be called when a tracked request is completing.
> + */
> +static void tracked_request_end(BdrvTrackedRequest *req)
> +{
> +    QLIST_REMOVE(req, list);
> +}
> +
> +/**
> + * Add an active request to the tracked requests list
> + */
> +static void tracked_request_begin(BdrvTrackedRequest *req,
> +                                  BlockDriverState *bs,
> +                                  int64_t sector_num,
> +                                  int nb_sectors, bool is_write)
> +{
> +    *req = (BdrvTrackedRequest){
> +        .bs = bs,
> +        .sector_num = sector_num,
> +        .nb_sectors = nb_sectors,
> +        .is_write = is_write,
> +    };
> +
> +    QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
> +}
> +
>  /*
>   * Return values:
>   * 0        - success
> @@ -1350,6 +1386,8 @@ static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
>      int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
>  {
>      BlockDriver *drv = bs->drv;
> +    BdrvTrackedRequest req;
> +    int ret;
>  
>      if (!drv) {
>          return -ENOMEDIUM;
> @@ -1363,7 +1401,10 @@ static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
>          bdrv_io_limits_intercept(bs, false, nb_sectors);
>      }
>  
> -    return drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
> +    tracked_request_begin(&req, bs, sector_num, nb_sectors, false);
> +    ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
> +    tracked_request_end(&req);
> +    return ret;
>  }

Stefan,

On earlier discussion, you replied to me:

"
>>      req = tracked_request_add(bs, sector_num, nb_sectors, false);
>
> The tracked request should include cluster round info?

When checking A and B for overlap, only one of them needs to be
rounded in order for overlap detection to be correct.  Therefore we
can store the original request [start, length) in tracked_requests and
only round the new request.
"

The problem AFAICS is this:

- Store a non-cluster-aligned request in the tracked request list.
- Wait on that non-cluster-aligned request
  (wait_for_overlapping_requests).
- Submit cluster-aligned request for COR request.

So, the tracked request list does not properly reflect the in-flight 
COR requests. Which can result in:

1) guest reads sector 10.
2) <sector_num=10,nb_sectors=2> added to tracked request list.
3) COR code submits read for <sector_num=10,nb_sectors=2+cluster_align>
4) unrelated guest operation writes to sector 13, nb_sectors=1. That is
allowed to proceed without waiting because tracked request list does not
reflect what COR in-flight requests.

Am i missing something here?
Stefan Hajnoczi - Dec. 5, 2011, 4:20 p.m.
On Mon, Dec 5, 2011 at 4:09 PM, Marcelo Tosatti <mtosatti@redhat.com> wrote:
> On Wed, Nov 23, 2011 at 11:47:53AM +0000, Stefan Hajnoczi wrote:
> On earlier discussion, you replied to me:
>
> "
>>>      req = tracked_request_add(bs, sector_num, nb_sectors, false);
>>
>> The tracked request should include cluster round info?
>
> When checking A and B for overlap, only one of them needs to be
> rounded in order for overlap detection to be correct.  Therefore we
> can store the original request [start, length) in tracked_requests and
> only round the new request.
> "
>
> The problem AFAICS is this:
>
> - Store a non-cluster-aligned request in the tracked request list.
> - Wait on that non-cluster-aligned request
>  (wait_for_overlapping_requests).
> - Submit cluster-aligned request for COR request.
>
> So, the tracked request list does not properly reflect the in-flight
> COR requests. Which can result in:
>
> 1) guest reads sector 10.
> 2) <sector_num=10,nb_sectors=2> added to tracked request list.
> 3) COR code submits read for <sector_num=10,nb_sectors=2+cluster_align>

It will also round down to sector_num=0 when cluster size is 128
sectors (e.g. qcow2 and qed).

> 4) unrelated guest operation writes to sector 13, nb_sectors=1. That is
> allowed to proceed without waiting because tracked request list does not
> reflect what COR in-flight requests.

The tracked request list has <sector_num=10, nb_sectors=2> and the
candidate write request is <sector_num=13, nb_sectors=1>.

In wait_for_overlapping_requests() we round the candidate request to
<sector_num=0, nb_sectors=cluster_size>.  This rounded request does
overlap <sector_num=10, sectors=2> so it will need to wait.

Therefore CoR and write will not execute at the same time.

Does this make more sense?

Stefan
Marcelo Tosatti - Dec. 5, 2011, 4:31 p.m.
On Mon, Dec 05, 2011 at 04:20:55PM +0000, Stefan Hajnoczi wrote:
> On Mon, Dec 5, 2011 at 4:09 PM, Marcelo Tosatti <mtosatti@redhat.com> wrote:
> > On Wed, Nov 23, 2011 at 11:47:53AM +0000, Stefan Hajnoczi wrote:
> > On earlier discussion, you replied to me:
> >
> > "
> >>>      req = tracked_request_add(bs, sector_num, nb_sectors, false);
> >>
> >> The tracked request should include cluster round info?
> >
> > When checking A and B for overlap, only one of them needs to be
> > rounded in order for overlap detection to be correct.  Therefore we
> > can store the original request [start, length) in tracked_requests and
> > only round the new request.
> > "
> >
> > The problem AFAICS is this:
> >
> > - Store a non-cluster-aligned request in the tracked request list.
> > - Wait on that non-cluster-aligned request
> >  (wait_for_overlapping_requests).
> > - Submit cluster-aligned request for COR request.
> >
> > So, the tracked request list does not properly reflect the in-flight
> > COR requests. Which can result in:
> >
> > 1) guest reads sector 10.
> > 2) <sector_num=10,nb_sectors=2> added to tracked request list.
> > 3) COR code submits read for <sector_num=10,nb_sectors=2+cluster_align>
> 
> It will also round down to sector_num=0 when cluster size is 128
> sectors (e.g. qcow2 and qed).
> 
> > 4) unrelated guest operation writes to sector 13, nb_sectors=1. That is
> > allowed to proceed without waiting because tracked request list does not
> > reflect what COR in-flight requests.
> 
> The tracked request list has <sector_num=10, nb_sectors=2> and the
> candidate write request is <sector_num=13, nb_sectors=1>.
> 
> In wait_for_overlapping_requests() we round the candidate request to
> <sector_num=0, nb_sectors=cluster_size>.  This rounded request does
> overlap <sector_num=10, sectors=2> so it will need to wait.
> 
> Therefore CoR and write will not execute at the same time.
> 
> Does this make more sense?
> 
> Stefan

Ah yes same mistake on my part, again. Thanks.

Patch

diff --git a/block.c b/block.c
index 0df7eb9..27c4e84 100644
--- a/block.c
+++ b/block.c
@@ -1071,6 +1071,42 @@  void bdrv_commit_all(void)
     }
 }
 
+struct BdrvTrackedRequest {
+    BlockDriverState *bs;
+    int64_t sector_num;
+    int nb_sectors;
+    bool is_write;
+    QLIST_ENTRY(BdrvTrackedRequest) list;
+};
+
+/**
+ * Remove an active request from the tracked requests list
+ *
+ * This function should be called when a tracked request is completing.
+ */
+static void tracked_request_end(BdrvTrackedRequest *req)
+{
+    QLIST_REMOVE(req, list);
+}
+
+/**
+ * Add an active request to the tracked requests list
+ */
+static void tracked_request_begin(BdrvTrackedRequest *req,
+                                  BlockDriverState *bs,
+                                  int64_t sector_num,
+                                  int nb_sectors, bool is_write)
+{
+    *req = (BdrvTrackedRequest){
+        .bs = bs,
+        .sector_num = sector_num,
+        .nb_sectors = nb_sectors,
+        .is_write = is_write,
+    };
+
+    QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
+}
+
 /*
  * Return values:
  * 0        - success
@@ -1350,6 +1386,8 @@  static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
 {
     BlockDriver *drv = bs->drv;
+    BdrvTrackedRequest req;
+    int ret;
 
     if (!drv) {
         return -ENOMEDIUM;
@@ -1363,7 +1401,10 @@  static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
         bdrv_io_limits_intercept(bs, false, nb_sectors);
     }
 
-    return drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
+    tracked_request_begin(&req, bs, sector_num, nb_sectors, false);
+    ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
+    tracked_request_end(&req);
+    return ret;
 }
 
 int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
@@ -1381,6 +1422,7 @@  static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
 {
     BlockDriver *drv = bs->drv;
+    BdrvTrackedRequest req;
     int ret;
 
     if (!bs->drv) {
@@ -1398,6 +1440,8 @@  static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
         bdrv_io_limits_intercept(bs, true, nb_sectors);
     }
 
+    tracked_request_begin(&req, bs, sector_num, nb_sectors, true);
+
     ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
 
     if (bs->dirty_bitmap) {
@@ -1408,6 +1452,8 @@  static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
         bs->wr_highest_sector = sector_num + nb_sectors - 1;
     }
 
+    tracked_request_end(&req);
+
     return ret;
 }
 
diff --git a/block_int.h b/block_int.h
index f9e2c9a..788fde9 100644
--- a/block_int.h
+++ b/block_int.h
@@ -51,6 +51,8 @@ 
 #define BLOCK_OPT_PREALLOC      "preallocation"
 #define BLOCK_OPT_SUBFMT        "subformat"
 
+typedef struct BdrvTrackedRequest BdrvTrackedRequest;
+
 typedef struct AIOPool {
     void (*cancel)(BlockDriverAIOCB *acb);
     int aiocb_size;
@@ -250,6 +252,8 @@  struct BlockDriverState {
     int in_use; /* users other than guest access, eg. block migration */
     QTAILQ_ENTRY(BlockDriverState) list;
     void *private;
+
+    QLIST_HEAD(, BdrvTrackedRequest) tracked_requests;
 };
 
 struct BlockDriverAIOCB {