Message ID | 1322048878-26348-4-git-send-email-stefanha@linux.vnet.ibm.com |
---|---|
State | New |
Headers | show |
On Wed, Nov 23, 2011 at 11:47:53AM +0000, Stefan Hajnoczi wrote: > The block layer does not know about pending requests. This information > is necessary for copy-on-read since overlapping requests must be > serialized to prevent races that corrupt the image. > > The BlockDriverState gets a new tracked_request list field which > contains all pending requests. Each request is a BdrvTrackedRequest > record with sector_num, nb_sectors, and is_write fields. > > Note that request tracking is always enabled but hopefully this extra > work is so small that it doesn't justify adding an enable/disable flag. > > Signed-off-by: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com> > --- > block.c | 48 +++++++++++++++++++++++++++++++++++++++++++++++- > block_int.h | 4 ++++ > 2 files changed, 51 insertions(+), 1 deletions(-) > > diff --git a/block.c b/block.c > index 0df7eb9..27c4e84 100644 > --- a/block.c > +++ b/block.c > @@ -1071,6 +1071,42 @@ void bdrv_commit_all(void) > } > } > > +struct BdrvTrackedRequest { > + BlockDriverState *bs; > + int64_t sector_num; > + int nb_sectors; > + bool is_write; > + QLIST_ENTRY(BdrvTrackedRequest) list; > +}; > + > +/** > + * Remove an active request from the tracked requests list > + * > + * This function should be called when a tracked request is completing. > + */ > +static void tracked_request_end(BdrvTrackedRequest *req) > +{ > + QLIST_REMOVE(req, list); > +} > + > +/** > + * Add an active request to the tracked requests list > + */ > +static void tracked_request_begin(BdrvTrackedRequest *req, > + BlockDriverState *bs, > + int64_t sector_num, > + int nb_sectors, bool is_write) > +{ > + *req = (BdrvTrackedRequest){ > + .bs = bs, > + .sector_num = sector_num, > + .nb_sectors = nb_sectors, > + .is_write = is_write, > + }; > + > + QLIST_INSERT_HEAD(&bs->tracked_requests, req, list); > +} > + > /* > * Return values: > * 0 - success > @@ -1350,6 +1386,8 @@ static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs, > int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) > { > BlockDriver *drv = bs->drv; > + BdrvTrackedRequest req; > + int ret; > > if (!drv) { > return -ENOMEDIUM; > @@ -1363,7 +1401,10 @@ static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs, > bdrv_io_limits_intercept(bs, false, nb_sectors); > } > > - return drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov); > + tracked_request_begin(&req, bs, sector_num, nb_sectors, false); > + ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov); > + tracked_request_end(&req); > + return ret; > } > > int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num, > @@ -1381,6 +1422,7 @@ static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs, > int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) > { > BlockDriver *drv = bs->drv; > + BdrvTrackedRequest req; > int ret; > > if (!bs->drv) { > @@ -1398,6 +1440,8 @@ static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs, > bdrv_io_limits_intercept(bs, true, nb_sectors); > } > > + tracked_request_begin(&req, bs, sector_num, nb_sectors, true); > + > ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov); > > if (bs->dirty_bitmap) { > @@ -1408,6 +1452,8 @@ static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs, > bs->wr_highest_sector = sector_num + nb_sectors - 1; > } > > + tracked_request_end(&req); > + > return ret; > } There is no need to worry about synchronous read/write requests bypassing this interface, correct?
On 12/05/2011 01:17 PM, Marcelo Tosatti wrote: > There is no need to worry about synchronous read/write requests > bypassing this interface, correct? > Synchronous read/write requests do not exist anymore (in the sense that they also go through coroutines). Paolo
On Wed, Nov 23, 2011 at 11:47:53AM +0000, Stefan Hajnoczi wrote: > The block layer does not know about pending requests. This information > is necessary for copy-on-read since overlapping requests must be > serialized to prevent races that corrupt the image. > > The BlockDriverState gets a new tracked_request list field which > contains all pending requests. Each request is a BdrvTrackedRequest > record with sector_num, nb_sectors, and is_write fields. > > Note that request tracking is always enabled but hopefully this extra > work is so small that it doesn't justify adding an enable/disable flag. > > Signed-off-by: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com> > --- > block.c | 48 +++++++++++++++++++++++++++++++++++++++++++++++- > block_int.h | 4 ++++ > 2 files changed, 51 insertions(+), 1 deletions(-) > > diff --git a/block.c b/block.c > index 0df7eb9..27c4e84 100644 > --- a/block.c > +++ b/block.c > @@ -1071,6 +1071,42 @@ void bdrv_commit_all(void) > } > } > > +struct BdrvTrackedRequest { > + BlockDriverState *bs; > + int64_t sector_num; > + int nb_sectors; > + bool is_write; > + QLIST_ENTRY(BdrvTrackedRequest) list; > +}; > + > +/** > + * Remove an active request from the tracked requests list > + * > + * This function should be called when a tracked request is completing. > + */ > +static void tracked_request_end(BdrvTrackedRequest *req) > +{ > + QLIST_REMOVE(req, list); > +} > + > +/** > + * Add an active request to the tracked requests list > + */ > +static void tracked_request_begin(BdrvTrackedRequest *req, > + BlockDriverState *bs, > + int64_t sector_num, > + int nb_sectors, bool is_write) > +{ > + *req = (BdrvTrackedRequest){ > + .bs = bs, > + .sector_num = sector_num, > + .nb_sectors = nb_sectors, > + .is_write = is_write, > + }; > + > + QLIST_INSERT_HEAD(&bs->tracked_requests, req, list); > +} > + > /* > * Return values: > * 0 - success > @@ -1350,6 +1386,8 @@ static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs, > int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) > { > BlockDriver *drv = bs->drv; > + BdrvTrackedRequest req; > + int ret; > > if (!drv) { > return -ENOMEDIUM; > @@ -1363,7 +1401,10 @@ static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs, > bdrv_io_limits_intercept(bs, false, nb_sectors); > } > > - return drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov); > + tracked_request_begin(&req, bs, sector_num, nb_sectors, false); > + ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov); > + tracked_request_end(&req); > + return ret; > } Stefan, On earlier discussion, you replied to me: " >> req = tracked_request_add(bs, sector_num, nb_sectors, false); > > The tracked request should include cluster round info? When checking A and B for overlap, only one of them needs to be rounded in order for overlap detection to be correct. Therefore we can store the original request [start, length) in tracked_requests and only round the new request. " The problem AFAICS is this: - Store a non-cluster-aligned request in the tracked request list. - Wait on that non-cluster-aligned request (wait_for_overlapping_requests). - Submit cluster-aligned request for COR request. So, the tracked request list does not properly reflect the in-flight COR requests. Which can result in: 1) guest reads sector 10. 2) <sector_num=10,nb_sectors=2> added to tracked request list. 3) COR code submits read for <sector_num=10,nb_sectors=2+cluster_align> 4) unrelated guest operation writes to sector 13, nb_sectors=1. That is allowed to proceed without waiting because tracked request list does not reflect what COR in-flight requests. Am i missing something here?
On Mon, Dec 5, 2011 at 4:09 PM, Marcelo Tosatti <mtosatti@redhat.com> wrote: > On Wed, Nov 23, 2011 at 11:47:53AM +0000, Stefan Hajnoczi wrote: > On earlier discussion, you replied to me: > > " >>> req = tracked_request_add(bs, sector_num, nb_sectors, false); >> >> The tracked request should include cluster round info? > > When checking A and B for overlap, only one of them needs to be > rounded in order for overlap detection to be correct. Therefore we > can store the original request [start, length) in tracked_requests and > only round the new request. > " > > The problem AFAICS is this: > > - Store a non-cluster-aligned request in the tracked request list. > - Wait on that non-cluster-aligned request > (wait_for_overlapping_requests). > - Submit cluster-aligned request for COR request. > > So, the tracked request list does not properly reflect the in-flight > COR requests. Which can result in: > > 1) guest reads sector 10. > 2) <sector_num=10,nb_sectors=2> added to tracked request list. > 3) COR code submits read for <sector_num=10,nb_sectors=2+cluster_align> It will also round down to sector_num=0 when cluster size is 128 sectors (e.g. qcow2 and qed). > 4) unrelated guest operation writes to sector 13, nb_sectors=1. That is > allowed to proceed without waiting because tracked request list does not > reflect what COR in-flight requests. The tracked request list has <sector_num=10, nb_sectors=2> and the candidate write request is <sector_num=13, nb_sectors=1>. In wait_for_overlapping_requests() we round the candidate request to <sector_num=0, nb_sectors=cluster_size>. This rounded request does overlap <sector_num=10, sectors=2> so it will need to wait. Therefore CoR and write will not execute at the same time. Does this make more sense? Stefan
On Mon, Dec 05, 2011 at 04:20:55PM +0000, Stefan Hajnoczi wrote: > On Mon, Dec 5, 2011 at 4:09 PM, Marcelo Tosatti <mtosatti@redhat.com> wrote: > > On Wed, Nov 23, 2011 at 11:47:53AM +0000, Stefan Hajnoczi wrote: > > On earlier discussion, you replied to me: > > > > " > >>> req = tracked_request_add(bs, sector_num, nb_sectors, false); > >> > >> The tracked request should include cluster round info? > > > > When checking A and B for overlap, only one of them needs to be > > rounded in order for overlap detection to be correct. Therefore we > > can store the original request [start, length) in tracked_requests and > > only round the new request. > > " > > > > The problem AFAICS is this: > > > > - Store a non-cluster-aligned request in the tracked request list. > > - Wait on that non-cluster-aligned request > > (wait_for_overlapping_requests). > > - Submit cluster-aligned request for COR request. > > > > So, the tracked request list does not properly reflect the in-flight > > COR requests. Which can result in: > > > > 1) guest reads sector 10. > > 2) <sector_num=10,nb_sectors=2> added to tracked request list. > > 3) COR code submits read for <sector_num=10,nb_sectors=2+cluster_align> > > It will also round down to sector_num=0 when cluster size is 128 > sectors (e.g. qcow2 and qed). > > > 4) unrelated guest operation writes to sector 13, nb_sectors=1. That is > > allowed to proceed without waiting because tracked request list does not > > reflect what COR in-flight requests. > > The tracked request list has <sector_num=10, nb_sectors=2> and the > candidate write request is <sector_num=13, nb_sectors=1>. > > In wait_for_overlapping_requests() we round the candidate request to > <sector_num=0, nb_sectors=cluster_size>. This rounded request does > overlap <sector_num=10, sectors=2> so it will need to wait. > > Therefore CoR and write will not execute at the same time. > > Does this make more sense? > > Stefan Ah yes same mistake on my part, again. Thanks.
diff --git a/block.c b/block.c index 0df7eb9..27c4e84 100644 --- a/block.c +++ b/block.c @@ -1071,6 +1071,42 @@ void bdrv_commit_all(void) } } +struct BdrvTrackedRequest { + BlockDriverState *bs; + int64_t sector_num; + int nb_sectors; + bool is_write; + QLIST_ENTRY(BdrvTrackedRequest) list; +}; + +/** + * Remove an active request from the tracked requests list + * + * This function should be called when a tracked request is completing. + */ +static void tracked_request_end(BdrvTrackedRequest *req) +{ + QLIST_REMOVE(req, list); +} + +/** + * Add an active request to the tracked requests list + */ +static void tracked_request_begin(BdrvTrackedRequest *req, + BlockDriverState *bs, + int64_t sector_num, + int nb_sectors, bool is_write) +{ + *req = (BdrvTrackedRequest){ + .bs = bs, + .sector_num = sector_num, + .nb_sectors = nb_sectors, + .is_write = is_write, + }; + + QLIST_INSERT_HEAD(&bs->tracked_requests, req, list); +} + /* * Return values: * 0 - success @@ -1350,6 +1386,8 @@ static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) { BlockDriver *drv = bs->drv; + BdrvTrackedRequest req; + int ret; if (!drv) { return -ENOMEDIUM; @@ -1363,7 +1401,10 @@ static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs, bdrv_io_limits_intercept(bs, false, nb_sectors); } - return drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov); + tracked_request_begin(&req, bs, sector_num, nb_sectors, false); + ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov); + tracked_request_end(&req); + return ret; } int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num, @@ -1381,6 +1422,7 @@ static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) { BlockDriver *drv = bs->drv; + BdrvTrackedRequest req; int ret; if (!bs->drv) { @@ -1398,6 +1440,8 @@ static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs, bdrv_io_limits_intercept(bs, true, nb_sectors); } + tracked_request_begin(&req, bs, sector_num, nb_sectors, true); + ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov); if (bs->dirty_bitmap) { @@ -1408,6 +1452,8 @@ static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs, bs->wr_highest_sector = sector_num + nb_sectors - 1; } + tracked_request_end(&req); + return ret; } diff --git a/block_int.h b/block_int.h index f9e2c9a..788fde9 100644 --- a/block_int.h +++ b/block_int.h @@ -51,6 +51,8 @@ #define BLOCK_OPT_PREALLOC "preallocation" #define BLOCK_OPT_SUBFMT "subformat" +typedef struct BdrvTrackedRequest BdrvTrackedRequest; + typedef struct AIOPool { void (*cancel)(BlockDriverAIOCB *acb); int aiocb_size; @@ -250,6 +252,8 @@ struct BlockDriverState { int in_use; /* users other than guest access, eg. block migration */ QTAILQ_ENTRY(BlockDriverState) list; void *private; + + QLIST_HEAD(, BdrvTrackedRequest) tracked_requests; }; struct BlockDriverAIOCB {
The block layer does not know about pending requests. This information is necessary for copy-on-read since overlapping requests must be serialized to prevent races that corrupt the image. The BlockDriverState gets a new tracked_request list field which contains all pending requests. Each request is a BdrvTrackedRequest record with sector_num, nb_sectors, and is_write fields. Note that request tracking is always enabled but hopefully this extra work is so small that it doesn't justify adding an enable/disable flag. Signed-off-by: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com> --- block.c | 48 +++++++++++++++++++++++++++++++++++++++++++++++- block_int.h | 4 ++++ 2 files changed, 51 insertions(+), 1 deletions(-)