Patchwork sheepdog: implement SD_OP_FLUSH_VDI operation

login
register
mail settings
Submitter namei.unix@gmail.com
Date April 2, 2012, 5:35 p.m.
Message ID <1333388150-12491-1-git-send-email-namei.unix@gmail.com>
Download mbox | patch
Permalink /patch/150224/
State New
Headers show

Comments

namei.unix@gmail.com - April 2, 2012, 5:35 p.m.
From: Liu Yuan <tailai.ly@taobao.com>

Flush operation is supposed to flush the write-back cache of
sheepdog cluster.

By issuing flush operation, we can assure the Guest of data
reaching the sheepdog cluster storage.

Cc: Kevin Wolf <kwolf@redhat.com>
Reviewd-by: MORITA Kazutaka <morita.kazutaka@lab.ntt.co.jp>
Signed-off-by: Liu Yuan <tailai.ly@taobao.com>
---
 block/sheepdog.c |   50 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 files changed, 50 insertions(+), 0 deletions(-)
Michael Tokarev - April 2, 2012, 8:01 p.m.
On 02.04.2012 21:35, Liu Yuan wrote:
> From: Liu Yuan <tailai.ly@taobao.com>
> 
> Flush operation is supposed to flush the write-back cache of
> sheepdog cluster.
> 
> By issuing flush operation, we can assure the Guest of data
> reaching the sheepdog cluster storage.
> 
> Cc: Kevin Wolf <kwolf@redhat.com>
> Reviewd-by: MORITA Kazutaka <morita.kazutaka@lab.ntt.co.jp>
> Signed-off-by: Liu Yuan <tailai.ly@taobao.com>
> ---
>  block/sheepdog.c |   50 ++++++++++++++++++++++++++++++++++++++++++++++++++
>  1 files changed, 50 insertions(+), 0 deletions(-)
> 
> diff --git a/block/sheepdog.c b/block/sheepdog.c
> index 00276f6f..c08c69b 100644
> --- a/block/sheepdog.c
> +++ b/block/sheepdog.c
> @@ -32,9 +32,11 @@
>  #define SD_OP_RELEASE_VDI    0x13
>  #define SD_OP_GET_VDI_INFO   0x14
>  #define SD_OP_READ_VDIS      0x15
> +#define SD_OP_FLUSH_VDI      0x16
>  
>  #define SD_FLAG_CMD_WRITE    0x01
>  #define SD_FLAG_CMD_COW      0x02
> +#define SD_FLAG_CMD_CACHE    0x04
>  
>  #define SD_RES_SUCCESS       0x00 /* Success */
>  #define SD_RES_UNKNOWN       0x01 /* Unknown error */
> @@ -179,6 +181,8 @@ typedef struct SheepdogInode {
>      uint32_t data_vdi_id[MAX_DATA_OBJS];
>  } SheepdogInode;
>  
> +static int cache_enabled;

Why it is a global property instead of per-device
(in BDRVSheepdogState) property?

> @@ -1011,6 +1024,10 @@ static int sd_open(BlockDriverState *bs, const char *filename, int flags)
>      QLIST_INIT(&s->outstanding_aio_head);
>      s->fd = -1;
>  
> +    if (flags & BDRV_O_CACHE_WB) {
> +            cache_enabled = 1;
> +    }

You test per-device flag here, and set a global variable, why?

> +static int coroutine_fn sd_co_flush_to_disk(BlockDriverState *bs)
> +{
> +    BDRVSheepdogState *s = bs->opaque;
> +    SheepdogObjReq hdr = { 0 };
> +    SheepdogObjRsp *rsp = (SheepdogObjRsp *)&hdr;
> +    SheepdogInode *inode = &s->inode;
> +    int fd, ret;
> +    unsigned int wlen = 0, rlen = 0;
> +
> +    fd = connect_to_sdog(s->addr, s->port);
> +    if (fd < 0) {
> +        return -1;
> +    }

Do you really need _another_ connection here?
How about resolving server names, handling
connection timeouts (in case the server can't
accept new connection for some reason) and
other surrounding things?

Thanks,

/mjt
namei.unix@gmail.com - April 3, 2012, 5:14 a.m.
On 04/03/2012 04:01 AM, Michael Tokarev wrote:

> On 02.04.2012 21:35, Liu Yuan wrote:
>> From: Liu Yuan <tailai.ly@taobao.com>
>>
>> Flush operation is supposed to flush the write-back cache of
>> sheepdog cluster.
>>
>> By issuing flush operation, we can assure the Guest of data
>> reaching the sheepdog cluster storage.
>>
>> Cc: Kevin Wolf <kwolf@redhat.com>
>> Reviewd-by: MORITA Kazutaka <morita.kazutaka@lab.ntt.co.jp>
>> Signed-off-by: Liu Yuan <tailai.ly@taobao.com>
>> ---
>>  block/sheepdog.c |   50 ++++++++++++++++++++++++++++++++++++++++++++++++++
>>  1 files changed, 50 insertions(+), 0 deletions(-)
>>
>> diff --git a/block/sheepdog.c b/block/sheepdog.c
>> index 00276f6f..c08c69b 100644
>> --- a/block/sheepdog.c
>> +++ b/block/sheepdog.c
>> @@ -32,9 +32,11 @@
>>  #define SD_OP_RELEASE_VDI    0x13
>>  #define SD_OP_GET_VDI_INFO   0x14
>>  #define SD_OP_READ_VDIS      0x15
>> +#define SD_OP_FLUSH_VDI      0x16
>>  
>>  #define SD_FLAG_CMD_WRITE    0x01
>>  #define SD_FLAG_CMD_COW      0x02
>> +#define SD_FLAG_CMD_CACHE    0x04
>>  
>>  #define SD_RES_SUCCESS       0x00 /* Success */
>>  #define SD_RES_UNKNOWN       0x01 /* Unknown error */
>> @@ -179,6 +181,8 @@ typedef struct SheepdogInode {
>>      uint32_t data_vdi_id[MAX_DATA_OBJS];
>>  } SheepdogInode;
>>  
>> +static int cache_enabled;
> 
> Why it is a global property instead of per-device
> (in BDRVSheepdogState) property?
> 
>> @@ -1011,6 +1024,10 @@ static int sd_open(BlockDriverState *bs, const char *filename, int flags)
>>      QLIST_INIT(&s->outstanding_aio_head);
>>      s->fd = -1;
>>  
>> +    if (flags & BDRV_O_CACHE_WB) {
>> +            cache_enabled = 1;
>> +    }
> 
> You test per-device flag here, and set a global variable, why?
> 




Hi Michael,
	Yes, I'd better embed this variable into BDRVSheepdogState.

>> +static int coroutine_fn sd_co_flush_to_disk(BlockDriverState *bs)
>> +{
>> +    BDRVSheepdogState *s = bs->opaque;
>> +    SheepdogObjReq hdr = { 0 };
>> +    SheepdogObjRsp *rsp = (SheepdogObjRsp *)&hdr;
>> +    SheepdogInode *inode = &s->inode;
>> +    int fd, ret;
>> +    unsigned int wlen = 0, rlen = 0;
>> +
>> +    fd = connect_to_sdog(s->addr, s->port);
>> +    if (fd < 0) {
>> +        return -1;
>> +    }
> 
> Do you really need _another_ connection here?


I am unsure if I can use bs->fd or not, Kazum, would you please check
this usage? I tried use bs->fd in V2 and everything goes okay.

Thanks,
Yuan
Christoph Hellwig - April 20, 2012, 6:05 p.m.
On Tue, Apr 03, 2012 at 01:35:50AM +0800, Liu Yuan wrote:
> From: Liu Yuan <tailai.ly@taobao.com>
> 
> Flush operation is supposed to flush the write-back cache of
> sheepdog cluster.
> 
> By issuing flush operation, we can assure the Guest of data
> reaching the sheepdog cluster storage.

How does qemu know that the cache might need flushing in the backend?

If the backend expects a flush we need to make sure one is sent after
every write if the user selects cache=writethrough or cache=directsync.
Given how expensive that is you should also consider adding an equivalent
to the FUA flag for writes in that case, or simply allow the qemu driver
to tell the cluster that it needs to operate in writethrough mode for
this VDI.
MORITA Kazutaka - April 20, 2012, 7:15 p.m.
At Fri, 20 Apr 2012 20:05:48 +0200,
Christoph Hellwig wrote:
> 
> On Tue, Apr 03, 2012 at 01:35:50AM +0800, Liu Yuan wrote:
> > From: Liu Yuan <tailai.ly@taobao.com>
> > 
> > Flush operation is supposed to flush the write-back cache of
> > sheepdog cluster.
> > 
> > By issuing flush operation, we can assure the Guest of data
> > reaching the sheepdog cluster storage.
> 
> How does qemu know that the cache might need flushing in the backend?
> 
> If the backend expects a flush we need to make sure one is sent after
> every write if the user selects cache=writethrough or cache=directsync.
> Given how expensive that is you should also consider adding an equivalent
> to the FUA flag for writes in that case, or simply allow the qemu driver
> to tell the cluster that it needs to operate in writethrough mode for
> this VDI.

His patch sets the SD_FLAG_CMD_CACHE flag for writes only when the
user selects cache=writeback or cache=none.  If SD_FLAG_CMD_CACHE is
not set in the request, Sheepdog servers are forced to flush the cache
like FUA commands.

Thanks,

Kazutaka
Christoph Hellwig - April 23, 2012, 6:08 a.m.
On Fri, Apr 20, 2012 at 12:15:36PM -0700, MORITA Kazutaka wrote:
> His patch sets the SD_FLAG_CMD_CACHE flag for writes only when the
> user selects cache=writeback or cache=none.  If SD_FLAG_CMD_CACHE is
> not set in the request, Sheepdog servers are forced to flush the cache
> like FUA commands.

Ok, I missed that part and it thus seems ok.  What still sems missing
is error handling in case the sheepdog cluster doesn't actually support
the new flag.  What happens if cache=none is specified with a cluster
not actually supporting it?  Remember that cache=none is the default for
many management frontends to qemu.
MORITA Kazutaka - April 23, 2012, 4:26 p.m.
At Mon, 23 Apr 2012 08:08:46 +0200,
Christoph Hellwig wrote:
> 
> On Fri, Apr 20, 2012 at 12:15:36PM -0700, MORITA Kazutaka wrote:
> > His patch sets the SD_FLAG_CMD_CACHE flag for writes only when the
> > user selects cache=writeback or cache=none.  If SD_FLAG_CMD_CACHE is
> > not set in the request, Sheepdog servers are forced to flush the cache
> > like FUA commands.
> 
> Ok, I missed that part and it thus seems ok.  What still sems missing
> is error handling in case the sheepdog cluster doesn't actually support
> the new flag.  What happens if cache=none is specified with a cluster
> not actually supporting it?  Remember that cache=none is the default for
> many management frontends to qemu.

SD_FLAG_CMD_CACHE is ignored in the older version of Sheepdog, so,
even if we specify cache=writeback or cache=none, the data is written
with O_DSYNC always and cannot be cached in the server's page cache or
volatile disk cache either.  I think it is safe.

It seems that there is another version problem with this patch;
bdrv_co_flush_to_disk() results in error with the older Sheepdog which
doesn't support SD_OP_FLUSH_VDI.  The simple fix is to increment
SD_PROTO_VER and prevent the newer qemu from connecting to the older
Sheepdog cluster, I think.

Thanks,

Kazutaka
Christoph Hellwig - April 24, 2012, 6:58 a.m.
On Tue, Apr 24, 2012 at 01:26:43AM +0900, MORITA Kazutaka wrote:
> SD_FLAG_CMD_CACHE is ignored in the older version of Sheepdog, so,
> even if we specify cache=writeback or cache=none, the data is written
> with O_DSYNC always and cannot be cached in the server's page cache or
> volatile disk cache either.  I think it is safe.
> 
> It seems that there is another version problem with this patch;
> bdrv_co_flush_to_disk() results in error with the older Sheepdog which
> doesn't support SD_OP_FLUSH_VDI.  The simple fix is to increment
> SD_PROTO_VER and prevent the newer qemu from connecting to the older
> Sheepdog cluster, I think.

Incrementin the version seems like a must.  But qemu can still connect
to older protocol versions fine if connecting to the current one fails,
t just must not send SD_OP_FLUSH_VDI (and preferably not set SD_FLAG_CMD_CACHE)
in that case.

Patch

diff --git a/block/sheepdog.c b/block/sheepdog.c
index 00276f6f..c08c69b 100644
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -32,9 +32,11 @@ 
 #define SD_OP_RELEASE_VDI    0x13
 #define SD_OP_GET_VDI_INFO   0x14
 #define SD_OP_READ_VDIS      0x15
+#define SD_OP_FLUSH_VDI      0x16
 
 #define SD_FLAG_CMD_WRITE    0x01
 #define SD_FLAG_CMD_COW      0x02
+#define SD_FLAG_CMD_CACHE    0x04
 
 #define SD_RES_SUCCESS       0x00 /* Success */
 #define SD_RES_UNKNOWN       0x01 /* Unknown error */
@@ -179,6 +181,8 @@  typedef struct SheepdogInode {
     uint32_t data_vdi_id[MAX_DATA_OBJS];
 } SheepdogInode;
 
+static int cache_enabled;
+
 /*
  * 64 bit FNV-1a non-zero initial basis
  */
@@ -900,6 +904,10 @@  static int coroutine_fn add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req,
         hdr.flags = SD_FLAG_CMD_WRITE | flags;
     }
 
+    if (cache_enabled) {
+        hdr.flags |= SD_FLAG_CMD_CACHE;
+    }
+
     hdr.oid = oid;
     hdr.cow_oid = old_oid;
     hdr.copies = s->inode.nr_copies;
@@ -965,6 +973,11 @@  static int read_write_object(int fd, char *buf, uint64_t oid, int copies,
         rlen = datalen;
         hdr.opcode = SD_OP_READ_OBJ;
     }
+
+    if (cache_enabled) {
+        hdr.flags |= SD_FLAG_CMD_CACHE;
+    }
+
     hdr.oid = oid;
     hdr.data_length = datalen;
     hdr.offset = offset;
@@ -1011,6 +1024,10 @@  static int sd_open(BlockDriverState *bs, const char *filename, int flags)
     QLIST_INIT(&s->outstanding_aio_head);
     s->fd = -1;
 
+    if (flags & BDRV_O_CACHE_WB) {
+            cache_enabled = 1;
+    }
+
     memset(vdi, 0, sizeof(vdi));
     memset(tag, 0, sizeof(tag));
     if (parse_vdiname(s, filename, vdi, &snapid, tag) < 0) {
@@ -1575,6 +1592,38 @@  static coroutine_fn int sd_co_readv(BlockDriverState *bs, int64_t sector_num,
     return acb->ret;
 }
 
+static int coroutine_fn sd_co_flush_to_disk(BlockDriverState *bs)
+{
+    BDRVSheepdogState *s = bs->opaque;
+    SheepdogObjReq hdr = { 0 };
+    SheepdogObjRsp *rsp = (SheepdogObjRsp *)&hdr;
+    SheepdogInode *inode = &s->inode;
+    int fd, ret;
+    unsigned int wlen = 0, rlen = 0;
+
+    fd = connect_to_sdog(s->addr, s->port);
+    if (fd < 0) {
+        return -1;
+    }
+
+    hdr.opcode = SD_OP_FLUSH_VDI;
+    hdr.oid = vid_to_vdi_oid(inode->vdi_id);
+
+    ret = do_req(fd, (SheepdogReq *)&hdr, NULL, &wlen, &rlen);
+    closesocket(fd);
+    if (ret) {
+        error_report("failed to send a request to the sheep");
+        return -1;
+    }
+
+    if (rsp->result != SD_RES_SUCCESS) {
+        error_report("%s", sd_strerror(rsp->result));
+        return -1;
+    }
+
+    return 0;
+}
+
 static int sd_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info)
 {
     BDRVSheepdogState *s = bs->opaque;
@@ -1904,6 +1953,7 @@  BlockDriver bdrv_sheepdog = {
 
     .bdrv_co_readv  = sd_co_readv,
     .bdrv_co_writev = sd_co_writev,
+    .bdrv_co_flush_to_disk  = sd_co_flush_to_disk,
 
     .bdrv_snapshot_create   = sd_snapshot_create,
     .bdrv_snapshot_goto     = sd_snapshot_goto,