diff mbox

[v3] sheepdog: selectable object size support

Message ID 1422001440-11276-1-git-send-email-ishizaki.teruaki@lab.ntt.co.jp
State New
Headers show

Commit Message

Teruaki Ishizaki Jan. 23, 2015, 8:24 a.m. UTC
Previously, qemu block driver of sheepdog used hard-coded VDI object size.
This patch enables users to handle "block_size_shift" value for
calculating VDI object size.

When you start qemu, you don't need to specify additional command option.

But when you create the VDI which doesn't have default object size
with qemu-img command, you specify block_size_shift option.

If you want to create a VDI of 8MB(1 << 23) object size,
you need to specify following command option.

 # qemu-img create -o block_size_shift=23 sheepdog:test1 100M

In addition, when you don't specify qemu-img command option,
a default value of sheepdog cluster is used for creating VDI.

 # qemu-img create sheepdog:test2 100M

Signed-off-by: Teruaki Ishizaki <ishizaki.teruaki@lab.ntt.co.jp>
---
V3:
 - Delete the needless operation of buffer.
 - Delete the needless operations of request header
   for SD_OP_GET_CLUSTER_DEFAULT.
 - Fix coding style problems.

V2:
 - Fix coding style problem (white space).
 - Add members, store_policy and block_size_shift to struct SheepdogVdiReq
 - Initialize request header to use block_size_shift specified by user.
---
 block/sheepdog.c          |  140 ++++++++++++++++++++++++++++++++++++++-------
 include/block/block_int.h |    1 +
 2 files changed, 119 insertions(+), 22 deletions(-)

Comments

Hitoshi Mitake Jan. 23, 2015, 8:30 a.m. UTC | #1
At Fri, 23 Jan 2015 17:24:00 +0900,
Teruaki Ishizaki wrote:
> 
> Previously, qemu block driver of sheepdog used hard-coded VDI object size.
> This patch enables users to handle "block_size_shift" value for
> calculating VDI object size.
> 
> When you start qemu, you don't need to specify additional command option.
> 
> But when you create the VDI which doesn't have default object size
> with qemu-img command, you specify block_size_shift option.
> 
> If you want to create a VDI of 8MB(1 << 23) object size,
> you need to specify following command option.
> 
>  # qemu-img create -o block_size_shift=23 sheepdog:test1 100M
> 
> In addition, when you don't specify qemu-img command option,
> a default value of sheepdog cluster is used for creating VDI.
> 
>  # qemu-img create sheepdog:test2 100M
> 
> Signed-off-by: Teruaki Ishizaki <ishizaki.teruaki@lab.ntt.co.jp>
> ---
> V3:
>  - Delete the needless operation of buffer.
>  - Delete the needless operations of request header
>    for SD_OP_GET_CLUSTER_DEFAULT.
>  - Fix coding style problems.
> 
> V2:
>  - Fix coding style problem (white space).
>  - Add members, store_policy and block_size_shift to struct SheepdogVdiReq
>  - Initialize request header to use block_size_shift specified by user.
> ---
>  block/sheepdog.c          |  140 ++++++++++++++++++++++++++++++++++++++-------
>  include/block/block_int.h |    1 +
>  2 files changed, 119 insertions(+), 22 deletions(-)

Looks good to me.
Acked-by: Hitoshi Mitake <mitake.hitoshi@lab.ntt.co.jp>

Thanks,
Hitoshi
Kevin Wolf Jan. 23, 2015, 1:53 p.m. UTC | #2
Am 23.01.2015 um 09:24 hat Teruaki Ishizaki geschrieben:
> Previously, qemu block driver of sheepdog used hard-coded VDI object size.
> This patch enables users to handle "block_size_shift" value for
> calculating VDI object size.
> 
> When you start qemu, you don't need to specify additional command option.
> 
> But when you create the VDI which doesn't have default object size
> with qemu-img command, you specify block_size_shift option.
> 
> If you want to create a VDI of 8MB(1 << 23) object size,
> you need to specify following command option.
> 
>  # qemu-img create -o block_size_shift=23 sheepdog:test1 100M
> 
> In addition, when you don't specify qemu-img command option,
> a default value of sheepdog cluster is used for creating VDI.
> 
>  # qemu-img create sheepdog:test2 100M
> 
> Signed-off-by: Teruaki Ishizaki <ishizaki.teruaki@lab.ntt.co.jp>
> ---
> V3:
>  - Delete the needless operation of buffer.
>  - Delete the needless operations of request header
>    for SD_OP_GET_CLUSTER_DEFAULT.
>  - Fix coding style problems.
> 
> V2:
>  - Fix coding style problem (white space).
>  - Add members, store_policy and block_size_shift to struct SheepdogVdiReq
>  - Initialize request header to use block_size_shift specified by user.
> ---
>  block/sheepdog.c          |  140 ++++++++++++++++++++++++++++++++++++++-------
>  include/block/block_int.h |    1 +
>  2 files changed, 119 insertions(+), 22 deletions(-)
> 
> diff --git a/block/sheepdog.c b/block/sheepdog.c
> index be3176f..c9f06db 100644
> --- a/block/sheepdog.c
> +++ b/block/sheepdog.c
> @@ -37,6 +37,7 @@
>  #define SD_OP_READ_VDIS      0x15
>  #define SD_OP_FLUSH_VDI      0x16
>  #define SD_OP_DEL_VDI        0x17
> +#define SD_OP_GET_CLUSTER_DEFAULT   0x18
>  
>  #define SD_FLAG_CMD_WRITE    0x01
>  #define SD_FLAG_CMD_COW      0x02
> @@ -167,7 +168,8 @@ typedef struct SheepdogVdiReq {
>      uint32_t base_vdi_id;
>      uint8_t copies;
>      uint8_t copy_policy;
> -    uint8_t reserved[2];
> +    uint8_t store_policy;
> +    uint8_t block_size_shift;
>      uint32_t snapid;
>      uint32_t type;
>      uint32_t pad[2];
> @@ -186,6 +188,21 @@ typedef struct SheepdogVdiRsp {
>      uint32_t pad[5];
>  } SheepdogVdiRsp;
>  
> +typedef struct SheepdogClusterRsp {
> +    uint8_t proto_ver;
> +    uint8_t opcode;
> +    uint16_t flags;
> +    uint32_t epoch;
> +    uint32_t id;
> +    uint32_t data_length;
> +    uint32_t result;
> +    uint8_t nr_copies;
> +    uint8_t copy_policy;
> +    uint8_t block_size_shift;
> +    uint8_t __pad1;
> +    uint32_t __pad2[6];
> +} SheepdogClusterRsp;
> +
>  typedef struct SheepdogInode {
>      char name[SD_MAX_VDI_LEN];
>      char tag[SD_MAX_VDI_TAG_LEN];
> @@ -1544,6 +1561,7 @@ static int do_sd_create(BDRVSheepdogState *s, uint32_t *vdi_id, int snapshot,
>      hdr.vdi_size = s->inode.vdi_size;
>      hdr.copy_policy = s->inode.copy_policy;
>      hdr.copies = s->inode.nr_copies;
> +    hdr.block_size_shift = s->inode.block_size_shift;
>  
>      ret = do_req(fd, s->aio_context, (SheepdogReq *)&hdr, buf, &wlen, &rlen);
>  
> @@ -1569,9 +1587,11 @@ static int do_sd_create(BDRVSheepdogState *s, uint32_t *vdi_id, int snapshot,
>  static int sd_prealloc(const char *filename, Error **errp)
>  {
>      BlockDriverState *bs = NULL;
> +    BDRVSheepdogState *base = NULL;
>      uint32_t idx, max_idx;
> +    uint32_t object_size;
>      int64_t vdi_size;
> -    void *buf = g_malloc0(SD_DATA_OBJ_SIZE);
> +    void *buf = NULL;
>      int ret;
>  
>      ret = bdrv_open(&bs, filename, NULL, NULL, BDRV_O_RDWR | BDRV_O_PROTOCOL,
> @@ -1585,18 +1605,23 @@ static int sd_prealloc(const char *filename, Error **errp)
>          ret = vdi_size;
>          goto out;
>      }
> -    max_idx = DIV_ROUND_UP(vdi_size, SD_DATA_OBJ_SIZE);
> +
> +    base = bs->opaque;
> +    object_size = (UINT32_C(1) << base->inode.block_size_shift);
> +    buf = g_malloc0(object_size);

If I understand correctly, block_size_shift can be up to 31, i.e. this
is a 2 GB allocation. Do you really think this is a good idea?

At least use g_try_malloc0() here, so that a memory allocation failure
doesn't crash qemu. (Same goes for all potentially huge allocations that
you make in the whole codebase.)

> +    max_idx = DIV_ROUND_UP(vdi_size, object_size);
>  
>      for (idx = 0; idx < max_idx; idx++) {
>          /*
>           * The created image can be a cloned image, so we need to read
>           * a data from the source image.
>           */
> -        ret = bdrv_pread(bs, idx * SD_DATA_OBJ_SIZE, buf, SD_DATA_OBJ_SIZE);
> +        ret = bdrv_pread(bs, idx * object_size, buf, object_size);
>          if (ret < 0) {
>              goto out;
>          }
> -        ret = bdrv_pwrite(bs, idx * SD_DATA_OBJ_SIZE, buf, SD_DATA_OBJ_SIZE);
> +        ret = bdrv_pwrite(bs, idx * object_size, buf, object_size);
>          if (ret < 0) {
>              goto out;
>          }
> @@ -1610,7 +1635,9 @@ out_with_err_set:
>      if (bs) {
>          bdrv_unref(bs);
>      }
> -    g_free(buf);
> +    if (buf) {
> +        g_free(buf);
> +    }

This is unnecessary. g_free(NULL) is valid, it does nothing.

>      return ret;
>  }
> @@ -1669,6 +1696,17 @@ static int parse_redundancy(BDRVSheepdogState *s, const char *opt)
>      return 0;
>  }
>  
> +static int parse_block_size_shift(BDRVSheepdogState *s, const char *opt)
> +{
> +    struct SheepdogInode *inode = &s->inode;
> +    inode->block_size_shift = (uint8_t)atoi(opt);

atoi() is best avoided, it has poor error handling.

But I think you don't need this parse function at all, see below.

> +    if (inode->block_size_shift < 20 || inode->block_size_shift > 31) {
> +        return -EINVAL;
> +    }
> +
> +    return 0;
> +}
> +
>  static int sd_create(const char *filename, QemuOpts *opts,
>                       Error **errp)
>  {
> @@ -1679,6 +1717,7 @@ static int sd_create(const char *filename, QemuOpts *opts,
>      BDRVSheepdogState *s;
>      char tag[SD_MAX_VDI_TAG_LEN];
>      uint32_t snapid;
> +    uint64_t max_vdi_size;
>      bool prealloc = false;
>  
>      s = g_new0(BDRVSheepdogState, 1);
> @@ -1718,10 +1757,15 @@ static int sd_create(const char *filename, QemuOpts *opts,
>          }
>      }
>  
> -    if (s->inode.vdi_size > SD_MAX_VDI_SIZE) {
> -        error_setg(errp, "too big image size");
> -        ret = -EINVAL;
> -        goto out;
> +    g_free(buf);
> +    buf = qemu_opt_get_del(opts, BLOCK_OPT_BLOCK_SIZE_SHIFT);
> +    if (buf) {
> +        ret = parse_block_size_shift(s, buf);
> +        if (ret < 0) {
> +            error_setg(errp, "Invalid block_size_shift:"
> +                             " '%s'. please use 20-31", buf);
> +            goto out;
> +        }
>      }

You could use qemu_opt_get_number_del() here and get a number directly
from QemuOpts that you don't have to parse any more, if you defined the
option as QEMU_OPT_NUMBER instead of QEMU_OPT_STRING.

>      if (backing_file) {
> @@ -1757,6 +1801,45 @@ static int sd_create(const char *filename, QemuOpts *opts,
>      }
>  
>      s->aio_context = qemu_get_aio_context();
> +
> +    /* if block_size_shift is not specified, get cluster default value */
> +    if (s->inode.block_size_shift == 0) {
> +        SheepdogVdiReq hdr;
> +        SheepdogClusterRsp *rsp = (SheepdogClusterRsp *)&hdr;
> +        Error *local_err = NULL;
> +        int fd;
> +        unsigned int wlen = 0, rlen = 0;
> +
> +        fd = connect_to_sdog(s, &local_err);
> +        if (fd < 0) {
> +            error_report("%s", error_get_pretty(local_err));
> +            error_free(local_err);
> +            ret = -EIO;
> +            goto out;
> +        }
> +
> +        memset(&hdr, 0, sizeof(hdr));
> +        hdr.opcode = SD_OP_GET_CLUSTER_DEFAULT;
> +        hdr.proto_ver = SD_PROTO_VER;
> +
> +        ret = do_req(fd, s->aio_context, (SheepdogReq *)&hdr,
> +                     NULL, &wlen, &rlen);
> +        closesocket(fd);
> +        if (ret) {
> +            error_setg_errno(errp, -ret, "failed to get cluster default");
> +            goto out;
> +        }
> +        s->inode.block_size_shift = rsp->block_size_shift;
> +    }
> +
> +    max_vdi_size = (UINT64_C(1) << s->inode.block_size_shift) * MAX_DATA_OBJS;
> +
> +    if (s->inode.vdi_size > max_vdi_size) {
> +        error_setg(errp, "too big image size");

Please capitalise the first word in an error messages, i.e. "Too big
image size".

> +        ret = -EINVAL;
> +        goto out;
> +    }
> +
>      ret = do_sd_create(s, &vid, 0, errp);
>      if (ret) {
>          goto out;

Kevin
Markus Armbruster Jan. 26, 2015, 9:18 a.m. UTC | #3
Kevin Wolf <kwolf@redhat.com> writes:

> Am 23.01.2015 um 09:24 hat Teruaki Ishizaki geschrieben:
>> Previously, qemu block driver of sheepdog used hard-coded VDI object size.
>> This patch enables users to handle "block_size_shift" value for
>> calculating VDI object size.
>> 
>> When you start qemu, you don't need to specify additional command option.
>> 
>> But when you create the VDI which doesn't have default object size
>> with qemu-img command, you specify block_size_shift option.
>> 
>> If you want to create a VDI of 8MB(1 << 23) object size,
>> you need to specify following command option.
>> 
>>  # qemu-img create -o block_size_shift=23 sheepdog:test1 100M
>> 
>> In addition, when you don't specify qemu-img command option,
>> a default value of sheepdog cluster is used for creating VDI.
>> 
>>  # qemu-img create sheepdog:test2 100M
>> 
>> Signed-off-by: Teruaki Ishizaki <ishizaki.teruaki@lab.ntt.co.jp>
>> ---
>> V3:
>>  - Delete the needless operation of buffer.
>>  - Delete the needless operations of request header
>>    for SD_OP_GET_CLUSTER_DEFAULT.
>>  - Fix coding style problems.
>> 
>> V2:
>>  - Fix coding style problem (white space).
>>  - Add members, store_policy and block_size_shift to struct SheepdogVdiReq
>>  - Initialize request header to use block_size_shift specified by user.
>> ---
>>  block/sheepdog.c          |  140 ++++++++++++++++++++++++++++++++++++++-------
>>  include/block/block_int.h |    1 +
>>  2 files changed, 119 insertions(+), 22 deletions(-)
>> 
>> diff --git a/block/sheepdog.c b/block/sheepdog.c
>> index be3176f..c9f06db 100644
>> --- a/block/sheepdog.c
>> +++ b/block/sheepdog.c
>> @@ -37,6 +37,7 @@
>>  #define SD_OP_READ_VDIS      0x15
>>  #define SD_OP_FLUSH_VDI      0x16
>>  #define SD_OP_DEL_VDI        0x17
>> +#define SD_OP_GET_CLUSTER_DEFAULT   0x18
>>  
>>  #define SD_FLAG_CMD_WRITE    0x01
>>  #define SD_FLAG_CMD_COW      0x02
>> @@ -167,7 +168,8 @@ typedef struct SheepdogVdiReq {
>>      uint32_t base_vdi_id;
>>      uint8_t copies;
>>      uint8_t copy_policy;
>> -    uint8_t reserved[2];
>> +    uint8_t store_policy;
>> +    uint8_t block_size_shift;
>>      uint32_t snapid;
>>      uint32_t type;
>>      uint32_t pad[2];
>> @@ -186,6 +188,21 @@ typedef struct SheepdogVdiRsp {
>>      uint32_t pad[5];
>>  } SheepdogVdiRsp;
>>  
>> +typedef struct SheepdogClusterRsp {
>> +    uint8_t proto_ver;
>> +    uint8_t opcode;
>> +    uint16_t flags;
>> +    uint32_t epoch;
>> +    uint32_t id;
>> +    uint32_t data_length;
>> +    uint32_t result;
>> +    uint8_t nr_copies;
>> +    uint8_t copy_policy;
>> +    uint8_t block_size_shift;
>> +    uint8_t __pad1;
>> +    uint32_t __pad2[6];
>> +} SheepdogClusterRsp;
>> +
>>  typedef struct SheepdogInode {
>>      char name[SD_MAX_VDI_LEN];
>>      char tag[SD_MAX_VDI_TAG_LEN];
>> @@ -1544,6 +1561,7 @@ static int do_sd_create(BDRVSheepdogState *s, uint32_t *vdi_id, int snapshot,
>>      hdr.vdi_size = s->inode.vdi_size;
>>      hdr.copy_policy = s->inode.copy_policy;
>>      hdr.copies = s->inode.nr_copies;
>> +    hdr.block_size_shift = s->inode.block_size_shift;
>>  
>>      ret = do_req(fd, s->aio_context, (SheepdogReq *)&hdr, buf, &wlen, &rlen);
>>  
>> @@ -1569,9 +1587,11 @@ static int do_sd_create(BDRVSheepdogState *s, uint32_t *vdi_id, int snapshot,
>>  static int sd_prealloc(const char *filename, Error **errp)
>>  {
>>      BlockDriverState *bs = NULL;
>> +    BDRVSheepdogState *base = NULL;
>>      uint32_t idx, max_idx;
>> +    uint32_t object_size;
>>      int64_t vdi_size;
>> -    void *buf = g_malloc0(SD_DATA_OBJ_SIZE);
>> +    void *buf = NULL;
>>      int ret;
>>  
>>      ret = bdrv_open(&bs, filename, NULL, NULL, BDRV_O_RDWR | BDRV_O_PROTOCOL,
>> @@ -1585,18 +1605,23 @@ static int sd_prealloc(const char *filename, Error **errp)
>>          ret = vdi_size;
>>          goto out;
>>      }
>> -    max_idx = DIV_ROUND_UP(vdi_size, SD_DATA_OBJ_SIZE);
>> +
>> +    base = bs->opaque;
>> +    object_size = (UINT32_C(1) << base->inode.block_size_shift);
>> +    buf = g_malloc0(object_size);
>
> If I understand correctly, block_size_shift can be up to 31, i.e. this
> is a 2 GB allocation. Do you really think this is a good idea?
>
> At least use g_try_malloc0() here, so that a memory allocation failure
> doesn't crash qemu. (Same goes for all potentially huge allocations that
> you make in the whole codebase.)
>
>> +    max_idx = DIV_ROUND_UP(vdi_size, object_size);
>>  
>>      for (idx = 0; idx < max_idx; idx++) {
>>          /*
>>           * The created image can be a cloned image, so we need to read
>>           * a data from the source image.
>>           */
>> -        ret = bdrv_pread(bs, idx * SD_DATA_OBJ_SIZE, buf, SD_DATA_OBJ_SIZE);
>> +        ret = bdrv_pread(bs, idx * object_size, buf, object_size);
>>          if (ret < 0) {
>>              goto out;
>>          }
>> -        ret = bdrv_pwrite(bs, idx * SD_DATA_OBJ_SIZE, buf, SD_DATA_OBJ_SIZE);
>> +        ret = bdrv_pwrite(bs, idx * object_size, buf, object_size);
>>          if (ret < 0) {
>>              goto out;
>>          }
>> @@ -1610,7 +1635,9 @@ out_with_err_set:
>>      if (bs) {
>>          bdrv_unref(bs);
>>      }
>> -    g_free(buf);
>> +    if (buf) {
>> +        g_free(buf);
>> +    }
>
> This is unnecessary. g_free(NULL) is valid, it does nothing.
>
>>      return ret;
>>  }
>> @@ -1669,6 +1696,17 @@ static int parse_redundancy(BDRVSheepdogState *s, const char *opt)
>>      return 0;
>>  }
>>  
>> +static int parse_block_size_shift(BDRVSheepdogState *s, const char *opt)
>> +{
>> +    struct SheepdogInode *inode = &s->inode;
>> +    inode->block_size_shift = (uint8_t)atoi(opt);
>
> atoi() is best avoided, it has poor error handling.
>
> But I think you don't need this parse function at all, see below.
>
>> +    if (inode->block_size_shift < 20 || inode->block_size_shift > 31) {
>> +        return -EINVAL;
>> +    }
>> +
>> +    return 0;
>> +}
>> +
>>  static int sd_create(const char *filename, QemuOpts *opts,
>>                       Error **errp)
>>  {
>> @@ -1679,6 +1717,7 @@ static int sd_create(const char *filename, QemuOpts *opts,
>>      BDRVSheepdogState *s;
>>      char tag[SD_MAX_VDI_TAG_LEN];
>>      uint32_t snapid;
>> +    uint64_t max_vdi_size;
>>      bool prealloc = false;
>>  
>>      s = g_new0(BDRVSheepdogState, 1);
>> @@ -1718,10 +1757,15 @@ static int sd_create(const char *filename, QemuOpts *opts,
>>          }
>>      }
>>  
>> -    if (s->inode.vdi_size > SD_MAX_VDI_SIZE) {
>> -        error_setg(errp, "too big image size");
>> -        ret = -EINVAL;
>> -        goto out;
>> +    g_free(buf);
>> +    buf = qemu_opt_get_del(opts, BLOCK_OPT_BLOCK_SIZE_SHIFT);
>> +    if (buf) {
>> +        ret = parse_block_size_shift(s, buf);
>> +        if (ret < 0) {
>> +            error_setg(errp, "Invalid block_size_shift:"
>> +                             " '%s'. please use 20-31", buf);
>> +            goto out;
>> +        }
>>      }
>
> You could use qemu_opt_get_number_del() here and get a number directly
> from QemuOpts that you don't have to parse any more, if you defined the
> option as QEMU_OPT_NUMBER instead of QEMU_OPT_STRING.

s/could/should/

>>      if (backing_file) {
>> @@ -1757,6 +1801,45 @@ static int sd_create(const char *filename,
>> QemuOpts *opts,
>>      }
>>  
>>      s->aio_context = qemu_get_aio_context();
>> +
>> +    /* if block_size_shift is not specified, get cluster default value */
>> +    if (s->inode.block_size_shift == 0) {
>> +        SheepdogVdiReq hdr;
>> +        SheepdogClusterRsp *rsp = (SheepdogClusterRsp *)&hdr;
>> +        Error *local_err = NULL;
>> +        int fd;
>> +        unsigned int wlen = 0, rlen = 0;
>> +
>> +        fd = connect_to_sdog(s, &local_err);
>> +        if (fd < 0) {
>> +            error_report("%s", error_get_pretty(local_err));
>> +            error_free(local_err);
>> +            ret = -EIO;
>> +            goto out;
>> +        }
>> +
>> +        memset(&hdr, 0, sizeof(hdr));
>> +        hdr.opcode = SD_OP_GET_CLUSTER_DEFAULT;
>> +        hdr.proto_ver = SD_PROTO_VER;
>> +
>> +        ret = do_req(fd, s->aio_context, (SheepdogReq *)&hdr,
>> +                     NULL, &wlen, &rlen);
>> +        closesocket(fd);
>> +        if (ret) {
>> +            error_setg_errno(errp, -ret, "failed to get cluster default");
>> +            goto out;
>> +        }
>> +        s->inode.block_size_shift = rsp->block_size_shift;
>> +    }
>> +
>> + max_vdi_size = (UINT64_C(1) << s->inode.block_size_shift) *
>> MAX_DATA_OBJS;
>> +
>> +    if (s->inode.vdi_size > max_vdi_size) {
>> +        error_setg(errp, "too big image size");
>
> Please capitalise the first word in an error messages, i.e. "Too big
> image size".

Make it "Image too big", or even better include the actual limit in the
message.

>> +        ret = -EINVAL;
>> +        goto out;
>> +    }
>> +
>>      ret = do_sd_create(s, &vid, 0, errp);
>>      if (ret) {
>>          goto out;
>
> Kevin
Teruaki Ishizaki Jan. 26, 2015, 9:52 a.m. UTC | #4
Hi, Kevin

Thanks for your review!

(2015/01/23 22:53), Kevin Wolf wrote:
> Am 23.01.2015 um 09:24 hat Teruaki Ishizaki geschrieben:
>> Previously, qemu block driver of sheepdog used hard-coded VDI object size.
>> This patch enables users to handle "block_size_shift" value for
>> calculating VDI object size.
>>
>> When you start qemu, you don't need to specify additional command option.
>>
>> But when you create the VDI which doesn't have default object size
>> with qemu-img command, you specify block_size_shift option.
>>
>> If you want to create a VDI of 8MB(1 << 23) object size,
>> you need to specify following command option.
>>
>>   # qemu-img create -o block_size_shift=23 sheepdog:test1 100M
>>
>> In addition, when you don't specify qemu-img command option,
>> a default value of sheepdog cluster is used for creating VDI.
>>
>>   # qemu-img create sheepdog:test2 100M
>>
>> Signed-off-by: Teruaki Ishizaki <ishizaki.teruaki@lab.ntt.co.jp>
>> ---
>> V3:
>>   - Delete the needless operation of buffer.
>>   - Delete the needless operations of request header
>>     for SD_OP_GET_CLUSTER_DEFAULT.
>>   - Fix coding style problems.
>>
>> V2:
>>   - Fix coding style problem (white space).
>>   - Add members, store_policy and block_size_shift to struct SheepdogVdiReq
>>   - Initialize request header to use block_size_shift specified by user.
>> ---
>>   block/sheepdog.c          |  140 ++++++++++++++++++++++++++++++++++++++-------
>>   include/block/block_int.h |    1 +
>>   2 files changed, 119 insertions(+), 22 deletions(-)
>>
>> diff --git a/block/sheepdog.c b/block/sheepdog.c
>> index be3176f..c9f06db 100644
>> --- a/block/sheepdog.c
>> +++ b/block/sheepdog.c
>> @@ -37,6 +37,7 @@
>>   #define SD_OP_READ_VDIS      0x15
>>   #define SD_OP_FLUSH_VDI      0x16
>>   #define SD_OP_DEL_VDI        0x17
>> +#define SD_OP_GET_CLUSTER_DEFAULT   0x18
>>
>>   #define SD_FLAG_CMD_WRITE    0x01
>>   #define SD_FLAG_CMD_COW      0x02
>> @@ -167,7 +168,8 @@ typedef struct SheepdogVdiReq {
>>       uint32_t base_vdi_id;
>>       uint8_t copies;
>>       uint8_t copy_policy;
>> -    uint8_t reserved[2];
>> +    uint8_t store_policy;
>> +    uint8_t block_size_shift;
>>       uint32_t snapid;
>>       uint32_t type;
>>       uint32_t pad[2];
>> @@ -186,6 +188,21 @@ typedef struct SheepdogVdiRsp {
>>       uint32_t pad[5];
>>   } SheepdogVdiRsp;
>>
>> +typedef struct SheepdogClusterRsp {
>> +    uint8_t proto_ver;
>> +    uint8_t opcode;
>> +    uint16_t flags;
>> +    uint32_t epoch;
>> +    uint32_t id;
>> +    uint32_t data_length;
>> +    uint32_t result;
>> +    uint8_t nr_copies;
>> +    uint8_t copy_policy;
>> +    uint8_t block_size_shift;
>> +    uint8_t __pad1;
>> +    uint32_t __pad2[6];
>> +} SheepdogClusterRsp;
>> +
>>   typedef struct SheepdogInode {
>>       char name[SD_MAX_VDI_LEN];
>>       char tag[SD_MAX_VDI_TAG_LEN];
>> @@ -1544,6 +1561,7 @@ static int do_sd_create(BDRVSheepdogState *s, uint32_t *vdi_id, int snapshot,
>>       hdr.vdi_size = s->inode.vdi_size;
>>       hdr.copy_policy = s->inode.copy_policy;
>>       hdr.copies = s->inode.nr_copies;
>> +    hdr.block_size_shift = s->inode.block_size_shift;
>>
>>       ret = do_req(fd, s->aio_context, (SheepdogReq *)&hdr, buf, &wlen, &rlen);
>>
>> @@ -1569,9 +1587,11 @@ static int do_sd_create(BDRVSheepdogState *s, uint32_t *vdi_id, int snapshot,
>>   static int sd_prealloc(const char *filename, Error **errp)
>>   {
>>       BlockDriverState *bs = NULL;
>> +    BDRVSheepdogState *base = NULL;
>>       uint32_t idx, max_idx;
>> +    uint32_t object_size;
>>       int64_t vdi_size;
>> -    void *buf = g_malloc0(SD_DATA_OBJ_SIZE);
>> +    void *buf = NULL;
>>       int ret;
>>
>>       ret = bdrv_open(&bs, filename, NULL, NULL, BDRV_O_RDWR | BDRV_O_PROTOCOL,
>> @@ -1585,18 +1605,23 @@ static int sd_prealloc(const char *filename, Error **errp)
>>           ret = vdi_size;
>>           goto out;
>>       }
>> -    max_idx = DIV_ROUND_UP(vdi_size, SD_DATA_OBJ_SIZE);
>> +
>> +    base = bs->opaque;
>> +    object_size = (UINT32_C(1) << base->inode.block_size_shift);
>> +    buf = g_malloc0(object_size);
>
> If I understand correctly, block_size_shift can be up to 31, i.e. this
> is a 2 GB allocation. Do you really think this is a good idea?
I think that it is not best idea.
Sheepdog internal limit of block_size_shift is 31, so I set logical 
limit of block_size_shift for 31.

>
> At least use g_try_malloc0() here, so that a memory allocation failure
> doesn't crash qemu. (Same goes for all potentially huge allocations that
> you make in the whole codebase.)
As you pointed out, memory allocation was needed for that value.
I suppose that block_size_shift should be up to 26(64MB) or 27(128M)
actually.

Should it be checked actual limits in that source code?

>
>> +    max_idx = DIV_ROUND_UP(vdi_size, object_size);
>>
>>       for (idx = 0; idx < max_idx; idx++) {
>>           /*
>>            * The created image can be a cloned image, so we need to read
>>            * a data from the source image.
>>            */
>> -        ret = bdrv_pread(bs, idx * SD_DATA_OBJ_SIZE, buf, SD_DATA_OBJ_SIZE);
>> +        ret = bdrv_pread(bs, idx * object_size, buf, object_size);
>>           if (ret < 0) {
>>               goto out;
>>           }
>> -        ret = bdrv_pwrite(bs, idx * SD_DATA_OBJ_SIZE, buf, SD_DATA_OBJ_SIZE);
>> +        ret = bdrv_pwrite(bs, idx * object_size, buf, object_size);
>>           if (ret < 0) {
>>               goto out;
>>           }
>> @@ -1610,7 +1635,9 @@ out_with_err_set:
>>       if (bs) {
>>           bdrv_unref(bs);
>>       }
>> -    g_free(buf);
>> +    if (buf) {
>> +        g_free(buf);
>> +    }
>
> This is unnecessary. g_free(NULL) is valid, it does nothing.
OK, I'll delete those changes.

>
>>       return ret;
>>   }
>> @@ -1669,6 +1696,17 @@ static int parse_redundancy(BDRVSheepdogState *s, const char *opt)
>>       return 0;
>>   }
>>
>> +static int parse_block_size_shift(BDRVSheepdogState *s, const char *opt)
>> +{
>> +    struct SheepdogInode *inode = &s->inode;
>> +    inode->block_size_shift = (uint8_t)atoi(opt);
>
> atoi() is best avoided, it has poor error handling.
>
> But I think you don't need this parse function at all, see below.
>
>> +    if (inode->block_size_shift < 20 || inode->block_size_shift > 31) {
>> +        return -EINVAL;
>> +    }
>> +
>> +    return 0;
>> +}
>> +
>>   static int sd_create(const char *filename, QemuOpts *opts,
>>                        Error **errp)
>>   {
>> @@ -1679,6 +1717,7 @@ static int sd_create(const char *filename, QemuOpts *opts,
>>       BDRVSheepdogState *s;
>>       char tag[SD_MAX_VDI_TAG_LEN];
>>       uint32_t snapid;
>> +    uint64_t max_vdi_size;
>>       bool prealloc = false;
>>
>>       s = g_new0(BDRVSheepdogState, 1);
>> @@ -1718,10 +1757,15 @@ static int sd_create(const char *filename, QemuOpts *opts,
>>           }
>>       }
>>
>> -    if (s->inode.vdi_size > SD_MAX_VDI_SIZE) {
>> -        error_setg(errp, "too big image size");
>> -        ret = -EINVAL;
>> -        goto out;
>> +    g_free(buf);
>> +    buf = qemu_opt_get_del(opts, BLOCK_OPT_BLOCK_SIZE_SHIFT);
>> +    if (buf) {
>> +        ret = parse_block_size_shift(s, buf);
>> +        if (ret < 0) {
>> +            error_setg(errp, "Invalid block_size_shift:"
>> +                             " '%s'. please use 20-31", buf);
>> +            goto out;
>> +        }
>>       }
>
> You could use qemu_opt_get_number_del() here and get a number directly
> from QemuOpts that you don't have to parse any more, if you defined the
> option as QEMU_OPT_NUMBER instead of QEMU_OPT_STRING.
>
Thank you for your advice!
I'll change to use qemu_opt_get_number_del() and not to use atoi()
function.


>>       if (backing_file) {
>> @@ -1757,6 +1801,45 @@ static int sd_create(const char *filename, QemuOpts *opts,
>>       }
>>
>>       s->aio_context = qemu_get_aio_context();
>> +
>> +    /* if block_size_shift is not specified, get cluster default value */
>> +    if (s->inode.block_size_shift == 0) {
>> +        SheepdogVdiReq hdr;
>> +        SheepdogClusterRsp *rsp = (SheepdogClusterRsp *)&hdr;
>> +        Error *local_err = NULL;
>> +        int fd;
>> +        unsigned int wlen = 0, rlen = 0;
>> +
>> +        fd = connect_to_sdog(s, &local_err);
>> +        if (fd < 0) {
>> +            error_report("%s", error_get_pretty(local_err));
>> +            error_free(local_err);
>> +            ret = -EIO;
>> +            goto out;
>> +        }
>> +
>> +        memset(&hdr, 0, sizeof(hdr));
>> +        hdr.opcode = SD_OP_GET_CLUSTER_DEFAULT;
>> +        hdr.proto_ver = SD_PROTO_VER;
>> +
>> +        ret = do_req(fd, s->aio_context, (SheepdogReq *)&hdr,
>> +                     NULL, &wlen, &rlen);
>> +        closesocket(fd);
>> +        if (ret) {
>> +            error_setg_errno(errp, -ret, "failed to get cluster default");
>> +            goto out;
>> +        }
>> +        s->inode.block_size_shift = rsp->block_size_shift;
>> +    }
>> +
>> +    max_vdi_size = (UINT64_C(1) << s->inode.block_size_shift) * MAX_DATA_OBJS;
>> +
>> +    if (s->inode.vdi_size > max_vdi_size) {
>> +        error_setg(errp, "too big image size");
>
> Please capitalise the first word in an error messages, i.e. "Too big
> image size".
OK, I'll change an error message properly!

>
>> +        ret = -EINVAL;
>> +        goto out;
>> +    }
>> +
>>       ret = do_sd_create(s, &vid, 0, errp);
>>       if (ret) {
>>           goto out;
>

Thanks,
Teruaki Ishizaki
Kevin Wolf Jan. 26, 2015, 10:33 a.m. UTC | #5
Am 26.01.2015 um 10:52 hat Teruaki Ishizaki geschrieben:
> Hi, Kevin
> 
> Thanks for your review!
> 
> (2015/01/23 22:53), Kevin Wolf wrote:
> >Am 23.01.2015 um 09:24 hat Teruaki Ishizaki geschrieben:
> >>Previously, qemu block driver of sheepdog used hard-coded VDI object size.
> >>This patch enables users to handle "block_size_shift" value for
> >>calculating VDI object size.
> >>
> >>When you start qemu, you don't need to specify additional command option.
> >>
> >>But when you create the VDI which doesn't have default object size
> >>with qemu-img command, you specify block_size_shift option.
> >>
> >>If you want to create a VDI of 8MB(1 << 23) object size,
> >>you need to specify following command option.
> >>
> >>  # qemu-img create -o block_size_shift=23 sheepdog:test1 100M
> >>
> >>In addition, when you don't specify qemu-img command option,
> >>a default value of sheepdog cluster is used for creating VDI.
> >>
> >>  # qemu-img create sheepdog:test2 100M
> >>
> >>Signed-off-by: Teruaki Ishizaki <ishizaki.teruaki@lab.ntt.co.jp>
> >>---
> >>V3:
> >>  - Delete the needless operation of buffer.
> >>  - Delete the needless operations of request header
> >>    for SD_OP_GET_CLUSTER_DEFAULT.
> >>  - Fix coding style problems.
> >>
> >>V2:
> >>  - Fix coding style problem (white space).
> >>  - Add members, store_policy and block_size_shift to struct SheepdogVdiReq
> >>  - Initialize request header to use block_size_shift specified by user.
> >>---
> >>  block/sheepdog.c          |  140 ++++++++++++++++++++++++++++++++++++++-------
> >>  include/block/block_int.h |    1 +
> >>  2 files changed, 119 insertions(+), 22 deletions(-)
> >>
> >>diff --git a/block/sheepdog.c b/block/sheepdog.c
> >>index be3176f..c9f06db 100644
> >>--- a/block/sheepdog.c
> >>+++ b/block/sheepdog.c
> >>@@ -37,6 +37,7 @@
> >>  #define SD_OP_READ_VDIS      0x15
> >>  #define SD_OP_FLUSH_VDI      0x16
> >>  #define SD_OP_DEL_VDI        0x17
> >>+#define SD_OP_GET_CLUSTER_DEFAULT   0x18
> >>
> >>  #define SD_FLAG_CMD_WRITE    0x01
> >>  #define SD_FLAG_CMD_COW      0x02
> >>@@ -167,7 +168,8 @@ typedef struct SheepdogVdiReq {
> >>      uint32_t base_vdi_id;
> >>      uint8_t copies;
> >>      uint8_t copy_policy;
> >>-    uint8_t reserved[2];
> >>+    uint8_t store_policy;
> >>+    uint8_t block_size_shift;
> >>      uint32_t snapid;
> >>      uint32_t type;
> >>      uint32_t pad[2];
> >>@@ -186,6 +188,21 @@ typedef struct SheepdogVdiRsp {
> >>      uint32_t pad[5];
> >>  } SheepdogVdiRsp;
> >>
> >>+typedef struct SheepdogClusterRsp {
> >>+    uint8_t proto_ver;
> >>+    uint8_t opcode;
> >>+    uint16_t flags;
> >>+    uint32_t epoch;
> >>+    uint32_t id;
> >>+    uint32_t data_length;
> >>+    uint32_t result;
> >>+    uint8_t nr_copies;
> >>+    uint8_t copy_policy;
> >>+    uint8_t block_size_shift;
> >>+    uint8_t __pad1;
> >>+    uint32_t __pad2[6];
> >>+} SheepdogClusterRsp;
> >>+
> >>  typedef struct SheepdogInode {
> >>      char name[SD_MAX_VDI_LEN];
> >>      char tag[SD_MAX_VDI_TAG_LEN];
> >>@@ -1544,6 +1561,7 @@ static int do_sd_create(BDRVSheepdogState *s, uint32_t *vdi_id, int snapshot,
> >>      hdr.vdi_size = s->inode.vdi_size;
> >>      hdr.copy_policy = s->inode.copy_policy;
> >>      hdr.copies = s->inode.nr_copies;
> >>+    hdr.block_size_shift = s->inode.block_size_shift;
> >>
> >>      ret = do_req(fd, s->aio_context, (SheepdogReq *)&hdr, buf, &wlen, &rlen);
> >>
> >>@@ -1569,9 +1587,11 @@ static int do_sd_create(BDRVSheepdogState *s, uint32_t *vdi_id, int snapshot,
> >>  static int sd_prealloc(const char *filename, Error **errp)
> >>  {
> >>      BlockDriverState *bs = NULL;
> >>+    BDRVSheepdogState *base = NULL;
> >>      uint32_t idx, max_idx;
> >>+    uint32_t object_size;
> >>      int64_t vdi_size;
> >>-    void *buf = g_malloc0(SD_DATA_OBJ_SIZE);
> >>+    void *buf = NULL;
> >>      int ret;
> >>
> >>      ret = bdrv_open(&bs, filename, NULL, NULL, BDRV_O_RDWR | BDRV_O_PROTOCOL,
> >>@@ -1585,18 +1605,23 @@ static int sd_prealloc(const char *filename, Error **errp)
> >>          ret = vdi_size;
> >>          goto out;
> >>      }
> >>-    max_idx = DIV_ROUND_UP(vdi_size, SD_DATA_OBJ_SIZE);
> >>+
> >>+    base = bs->opaque;
> >>+    object_size = (UINT32_C(1) << base->inode.block_size_shift);
> >>+    buf = g_malloc0(object_size);
> >
> >If I understand correctly, block_size_shift can be up to 31, i.e. this
> >is a 2 GB allocation. Do you really think this is a good idea?
> I think that it is not best idea.
> Sheepdog internal limit of block_size_shift is 31, so I set logical
> limit of block_size_shift for 31.

I think there is no real requirement to use the object size here, this
is only the buffer size for some bdrv_read/write calls, which works fine
in smaller chunks.

So you could still allow block_size_shift up to 31, but use a different
number here. Perhaps always leaving it at 4 MB is good enough, otherwise
you could use something like MIN(24, base->inode.block_size_shift).

> >At least use g_try_malloc0() here, so that a memory allocation failure
> >doesn't crash qemu. (Same goes for all potentially huge allocations that
> >you make in the whole codebase.)
> As you pointed out, memory allocation was needed for that value.
> I suppose that block_size_shift should be up to 26(64MB) or 27(128M)
> actually.
> 
> Should it be checked actual limits in that source code?

I suggest limiting the maximum buffer size as above so that a failure
becomes unlikely, and using g_try_malloc() to handle any allocation
failure that occurs anyway gracefully.

Kevin
Teruaki Ishizaki Jan. 27, 2015, 3:57 a.m. UTC | #6
(2015/01/26 19:33), Kevin Wolf wrote:
> Am 26.01.2015 um 10:52 hat Teruaki Ishizaki geschrieben:
>> Hi, Kevin
>>
>> Thanks for your review!
>>
>> (2015/01/23 22:53), Kevin Wolf wrote:
>>> Am 23.01.2015 um 09:24 hat Teruaki Ishizaki geschrieben:
>>>> Previously, qemu block driver of sheepdog used hard-coded VDI object size.
>>>> This patch enables users to handle "block_size_shift" value for
>>>> calculating VDI object size.
>>>>
>>>> When you start qemu, you don't need to specify additional command option.
>>>>
>>>> But when you create the VDI which doesn't have default object size
>>>> with qemu-img command, you specify block_size_shift option.
>>>>
>>>> If you want to create a VDI of 8MB(1 << 23) object size,
>>>> you need to specify following command option.
>>>>
>>>>   # qemu-img create -o block_size_shift=23 sheepdog:test1 100M
>>>>
>>>> In addition, when you don't specify qemu-img command option,
>>>> a default value of sheepdog cluster is used for creating VDI.
>>>>
>>>>   # qemu-img create sheepdog:test2 100M
>>>>
>>>> Signed-off-by: Teruaki Ishizaki <ishizaki.teruaki@lab.ntt.co.jp>
>>>> ---
>>>> V3:
>>>>   - Delete the needless operation of buffer.
>>>>   - Delete the needless operations of request header
>>>>     for SD_OP_GET_CLUSTER_DEFAULT.
>>>>   - Fix coding style problems.
>>>>
>>>> V2:
>>>>   - Fix coding style problem (white space).
>>>>   - Add members, store_policy and block_size_shift to struct SheepdogVdiReq
>>>>   - Initialize request header to use block_size_shift specified by user.
>>>> ---
>>>>   block/sheepdog.c          |  140 ++++++++++++++++++++++++++++++++++++++-------
>>>>   include/block/block_int.h |    1 +
>>>>   2 files changed, 119 insertions(+), 22 deletions(-)
>>>>
>>>> diff --git a/block/sheepdog.c b/block/sheepdog.c
>>>> index be3176f..c9f06db 100644
>>>> --- a/block/sheepdog.c
>>>> +++ b/block/sheepdog.c
>>>> @@ -37,6 +37,7 @@
>>>>   #define SD_OP_READ_VDIS      0x15
>>>>   #define SD_OP_FLUSH_VDI      0x16
>>>>   #define SD_OP_DEL_VDI        0x17
>>>> +#define SD_OP_GET_CLUSTER_DEFAULT   0x18
>>>>
>>>>   #define SD_FLAG_CMD_WRITE    0x01
>>>>   #define SD_FLAG_CMD_COW      0x02
>>>> @@ -167,7 +168,8 @@ typedef struct SheepdogVdiReq {
>>>>       uint32_t base_vdi_id;
>>>>       uint8_t copies;
>>>>       uint8_t copy_policy;
>>>> -    uint8_t reserved[2];
>>>> +    uint8_t store_policy;
>>>> +    uint8_t block_size_shift;
>>>>       uint32_t snapid;
>>>>       uint32_t type;
>>>>       uint32_t pad[2];
>>>> @@ -186,6 +188,21 @@ typedef struct SheepdogVdiRsp {
>>>>       uint32_t pad[5];
>>>>   } SheepdogVdiRsp;
>>>>
>>>> +typedef struct SheepdogClusterRsp {
>>>> +    uint8_t proto_ver;
>>>> +    uint8_t opcode;
>>>> +    uint16_t flags;
>>>> +    uint32_t epoch;
>>>> +    uint32_t id;
>>>> +    uint32_t data_length;
>>>> +    uint32_t result;
>>>> +    uint8_t nr_copies;
>>>> +    uint8_t copy_policy;
>>>> +    uint8_t block_size_shift;
>>>> +    uint8_t __pad1;
>>>> +    uint32_t __pad2[6];
>>>> +} SheepdogClusterRsp;
>>>> +
>>>>   typedef struct SheepdogInode {
>>>>       char name[SD_MAX_VDI_LEN];
>>>>       char tag[SD_MAX_VDI_TAG_LEN];
>>>> @@ -1544,6 +1561,7 @@ static int do_sd_create(BDRVSheepdogState *s, uint32_t *vdi_id, int snapshot,
>>>>       hdr.vdi_size = s->inode.vdi_size;
>>>>       hdr.copy_policy = s->inode.copy_policy;
>>>>       hdr.copies = s->inode.nr_copies;
>>>> +    hdr.block_size_shift = s->inode.block_size_shift;
>>>>
>>>>       ret = do_req(fd, s->aio_context, (SheepdogReq *)&hdr, buf, &wlen, &rlen);
>>>>
>>>> @@ -1569,9 +1587,11 @@ static int do_sd_create(BDRVSheepdogState *s, uint32_t *vdi_id, int snapshot,
>>>>   static int sd_prealloc(const char *filename, Error **errp)
>>>>   {
>>>>       BlockDriverState *bs = NULL;
>>>> +    BDRVSheepdogState *base = NULL;
>>>>       uint32_t idx, max_idx;
>>>> +    uint32_t object_size;
>>>>       int64_t vdi_size;
>>>> -    void *buf = g_malloc0(SD_DATA_OBJ_SIZE);
>>>> +    void *buf = NULL;
>>>>       int ret;
>>>>
>>>>       ret = bdrv_open(&bs, filename, NULL, NULL, BDRV_O_RDWR | BDRV_O_PROTOCOL,
>>>> @@ -1585,18 +1605,23 @@ static int sd_prealloc(const char *filename, Error **errp)
>>>>           ret = vdi_size;
>>>>           goto out;
>>>>       }
>>>> -    max_idx = DIV_ROUND_UP(vdi_size, SD_DATA_OBJ_SIZE);
>>>> +
>>>> +    base = bs->opaque;
>>>> +    object_size = (UINT32_C(1) << base->inode.block_size_shift);
>>>> +    buf = g_malloc0(object_size);
>>>
>>> If I understand correctly, block_size_shift can be up to 31, i.e. this
>>> is a 2 GB allocation. Do you really think this is a good idea?
>> I think that it is not best idea.
>> Sheepdog internal limit of block_size_shift is 31, so I set logical
>> limit of block_size_shift for 31.
>
> I think there is no real requirement to use the object size here, this
> is only the buffer size for some bdrv_read/write calls, which works fine
> in smaller chunks.
>
> So you could still allow block_size_shift up to 31, but use a different
> number here. Perhaps always leaving it at 4 MB is good enough, otherwise
> you could use something like MIN(24, base->inode.block_size_shift).
>
>>> At least use g_try_malloc0() here, so that a memory allocation failure
>>> doesn't crash qemu. (Same goes for all potentially huge allocations that
>>> you make in the whole codebase.)
>> As you pointed out, memory allocation was needed for that value.
>> I suppose that block_size_shift should be up to 26(64MB) or 27(128M)
>> actually.
>>
>> Should it be checked actual limits in that source code?
>
> I suggest limiting the maximum buffer size as above so that a failure
> becomes unlikely, and using g_try_malloc() to handle any allocation
> failure that occurs anyway gracefully.
OK, I'll change bdrv_read/write buffer size up to 4MB.

Thanks,
Teruaki Ishizaki
diff mbox

Patch

diff --git a/block/sheepdog.c b/block/sheepdog.c
index be3176f..c9f06db 100644
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -37,6 +37,7 @@ 
 #define SD_OP_READ_VDIS      0x15
 #define SD_OP_FLUSH_VDI      0x16
 #define SD_OP_DEL_VDI        0x17
+#define SD_OP_GET_CLUSTER_DEFAULT   0x18
 
 #define SD_FLAG_CMD_WRITE    0x01
 #define SD_FLAG_CMD_COW      0x02
@@ -167,7 +168,8 @@  typedef struct SheepdogVdiReq {
     uint32_t base_vdi_id;
     uint8_t copies;
     uint8_t copy_policy;
-    uint8_t reserved[2];
+    uint8_t store_policy;
+    uint8_t block_size_shift;
     uint32_t snapid;
     uint32_t type;
     uint32_t pad[2];
@@ -186,6 +188,21 @@  typedef struct SheepdogVdiRsp {
     uint32_t pad[5];
 } SheepdogVdiRsp;
 
+typedef struct SheepdogClusterRsp {
+    uint8_t proto_ver;
+    uint8_t opcode;
+    uint16_t flags;
+    uint32_t epoch;
+    uint32_t id;
+    uint32_t data_length;
+    uint32_t result;
+    uint8_t nr_copies;
+    uint8_t copy_policy;
+    uint8_t block_size_shift;
+    uint8_t __pad1;
+    uint32_t __pad2[6];
+} SheepdogClusterRsp;
+
 typedef struct SheepdogInode {
     char name[SD_MAX_VDI_LEN];
     char tag[SD_MAX_VDI_TAG_LEN];
@@ -1544,6 +1561,7 @@  static int do_sd_create(BDRVSheepdogState *s, uint32_t *vdi_id, int snapshot,
     hdr.vdi_size = s->inode.vdi_size;
     hdr.copy_policy = s->inode.copy_policy;
     hdr.copies = s->inode.nr_copies;
+    hdr.block_size_shift = s->inode.block_size_shift;
 
     ret = do_req(fd, s->aio_context, (SheepdogReq *)&hdr, buf, &wlen, &rlen);
 
@@ -1569,9 +1587,11 @@  static int do_sd_create(BDRVSheepdogState *s, uint32_t *vdi_id, int snapshot,
 static int sd_prealloc(const char *filename, Error **errp)
 {
     BlockDriverState *bs = NULL;
+    BDRVSheepdogState *base = NULL;
     uint32_t idx, max_idx;
+    uint32_t object_size;
     int64_t vdi_size;
-    void *buf = g_malloc0(SD_DATA_OBJ_SIZE);
+    void *buf = NULL;
     int ret;
 
     ret = bdrv_open(&bs, filename, NULL, NULL, BDRV_O_RDWR | BDRV_O_PROTOCOL,
@@ -1585,18 +1605,23 @@  static int sd_prealloc(const char *filename, Error **errp)
         ret = vdi_size;
         goto out;
     }
-    max_idx = DIV_ROUND_UP(vdi_size, SD_DATA_OBJ_SIZE);
+
+    base = bs->opaque;
+    object_size = (UINT32_C(1) << base->inode.block_size_shift);
+    buf = g_malloc0(object_size);
+
+    max_idx = DIV_ROUND_UP(vdi_size, object_size);
 
     for (idx = 0; idx < max_idx; idx++) {
         /*
          * The created image can be a cloned image, so we need to read
          * a data from the source image.
          */
-        ret = bdrv_pread(bs, idx * SD_DATA_OBJ_SIZE, buf, SD_DATA_OBJ_SIZE);
+        ret = bdrv_pread(bs, idx * object_size, buf, object_size);
         if (ret < 0) {
             goto out;
         }
-        ret = bdrv_pwrite(bs, idx * SD_DATA_OBJ_SIZE, buf, SD_DATA_OBJ_SIZE);
+        ret = bdrv_pwrite(bs, idx * object_size, buf, object_size);
         if (ret < 0) {
             goto out;
         }
@@ -1610,7 +1635,9 @@  out_with_err_set:
     if (bs) {
         bdrv_unref(bs);
     }
-    g_free(buf);
+    if (buf) {
+        g_free(buf);
+    }
 
     return ret;
 }
@@ -1669,6 +1696,17 @@  static int parse_redundancy(BDRVSheepdogState *s, const char *opt)
     return 0;
 }
 
+static int parse_block_size_shift(BDRVSheepdogState *s, const char *opt)
+{
+    struct SheepdogInode *inode = &s->inode;
+    inode->block_size_shift = (uint8_t)atoi(opt);
+    if (inode->block_size_shift < 20 || inode->block_size_shift > 31) {
+        return -EINVAL;
+    }
+
+    return 0;
+}
+
 static int sd_create(const char *filename, QemuOpts *opts,
                      Error **errp)
 {
@@ -1679,6 +1717,7 @@  static int sd_create(const char *filename, QemuOpts *opts,
     BDRVSheepdogState *s;
     char tag[SD_MAX_VDI_TAG_LEN];
     uint32_t snapid;
+    uint64_t max_vdi_size;
     bool prealloc = false;
 
     s = g_new0(BDRVSheepdogState, 1);
@@ -1718,10 +1757,15 @@  static int sd_create(const char *filename, QemuOpts *opts,
         }
     }
 
-    if (s->inode.vdi_size > SD_MAX_VDI_SIZE) {
-        error_setg(errp, "too big image size");
-        ret = -EINVAL;
-        goto out;
+    g_free(buf);
+    buf = qemu_opt_get_del(opts, BLOCK_OPT_BLOCK_SIZE_SHIFT);
+    if (buf) {
+        ret = parse_block_size_shift(s, buf);
+        if (ret < 0) {
+            error_setg(errp, "Invalid block_size_shift:"
+                             " '%s'. please use 20-31", buf);
+            goto out;
+        }
     }
 
     if (backing_file) {
@@ -1757,6 +1801,45 @@  static int sd_create(const char *filename, QemuOpts *opts,
     }
 
     s->aio_context = qemu_get_aio_context();
+
+    /* if block_size_shift is not specified, get cluster default value */
+    if (s->inode.block_size_shift == 0) {
+        SheepdogVdiReq hdr;
+        SheepdogClusterRsp *rsp = (SheepdogClusterRsp *)&hdr;
+        Error *local_err = NULL;
+        int fd;
+        unsigned int wlen = 0, rlen = 0;
+
+        fd = connect_to_sdog(s, &local_err);
+        if (fd < 0) {
+            error_report("%s", error_get_pretty(local_err));
+            error_free(local_err);
+            ret = -EIO;
+            goto out;
+        }
+
+        memset(&hdr, 0, sizeof(hdr));
+        hdr.opcode = SD_OP_GET_CLUSTER_DEFAULT;
+        hdr.proto_ver = SD_PROTO_VER;
+
+        ret = do_req(fd, s->aio_context, (SheepdogReq *)&hdr,
+                     NULL, &wlen, &rlen);
+        closesocket(fd);
+        if (ret) {
+            error_setg_errno(errp, -ret, "failed to get cluster default");
+            goto out;
+        }
+        s->inode.block_size_shift = rsp->block_size_shift;
+    }
+
+    max_vdi_size = (UINT64_C(1) << s->inode.block_size_shift) * MAX_DATA_OBJS;
+
+    if (s->inode.vdi_size > max_vdi_size) {
+        error_setg(errp, "too big image size");
+        ret = -EINVAL;
+        goto out;
+    }
+
     ret = do_sd_create(s, &vid, 0, errp);
     if (ret) {
         goto out;
@@ -2013,9 +2096,10 @@  static int coroutine_fn sd_co_rw_vector(void *p)
     SheepdogAIOCB *acb = p;
     int ret = 0;
     unsigned long len, done = 0, total = acb->nb_sectors * BDRV_SECTOR_SIZE;
-    unsigned long idx = acb->sector_num * BDRV_SECTOR_SIZE / SD_DATA_OBJ_SIZE;
+    unsigned long idx;
+    uint32_t object_size;
     uint64_t oid;
-    uint64_t offset = (acb->sector_num * BDRV_SECTOR_SIZE) % SD_DATA_OBJ_SIZE;
+    uint64_t offset;
     BDRVSheepdogState *s = acb->common.bs->opaque;
     SheepdogInode *inode = &s->inode;
     AIOReq *aio_req;
@@ -2032,6 +2116,10 @@  static int coroutine_fn sd_co_rw_vector(void *p)
         }
     }
 
+    object_size = (UINT32_C(1) << inode->block_size_shift);
+    idx = acb->sector_num * BDRV_SECTOR_SIZE / object_size;
+    offset = (acb->sector_num * BDRV_SECTOR_SIZE) % object_size;
+
     /*
      * Make sure we don't free the aiocb before we are done with all requests.
      * This additional reference is dropped at the end of this function.
@@ -2045,7 +2133,7 @@  static int coroutine_fn sd_co_rw_vector(void *p)
 
         oid = vid_to_data_oid(inode->data_vdi_id[idx], idx);
 
-        len = MIN(total - done, SD_DATA_OBJ_SIZE - offset);
+        len = MIN(total - done, object_size - offset);
 
         switch (acb->aiocb_type) {
         case AIOCB_READ_UDATA:
@@ -2069,7 +2157,7 @@  static int coroutine_fn sd_co_rw_vector(void *p)
              * We discard the object only when the whole object is
              * 1) allocated 2) trimmed. Otherwise, simply skip it.
              */
-            if (len != SD_DATA_OBJ_SIZE || inode->data_vdi_id[idx] == 0) {
+            if (len != object_size || inode->data_vdi_id[idx] == 0) {
                 goto done;
             }
             break;
@@ -2426,6 +2514,7 @@  static int do_load_save_vmstate(BDRVSheepdogState *s, uint8_t *data,
     uint64_t offset;
     uint32_t vdi_index;
     uint32_t vdi_id = load ? s->inode.parent_vdi_id : s->inode.vdi_id;
+    uint32_t object_size = (UINT32_C(1) << s->inode.block_size_shift);
 
     fd = connect_to_sdog(s, &local_err);
     if (fd < 0) {
@@ -2435,10 +2524,10 @@  static int do_load_save_vmstate(BDRVSheepdogState *s, uint8_t *data,
     }
 
     while (remaining) {
-        vdi_index = pos / SD_DATA_OBJ_SIZE;
-        offset = pos % SD_DATA_OBJ_SIZE;
+        vdi_index = pos / object_size;
+        offset = pos % object_size;
 
-        data_len = MIN(remaining, SD_DATA_OBJ_SIZE - offset);
+        data_len = MIN(remaining, object_size - offset);
 
         vmstate_oid = vid_to_vmstate_oid(vdi_id, vdi_index);
 
@@ -2525,10 +2614,11 @@  sd_co_get_block_status(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
 {
     BDRVSheepdogState *s = bs->opaque;
     SheepdogInode *inode = &s->inode;
+    uint32_t object_size = (UINT32_C(1) << inode->block_size_shift);
     uint64_t offset = sector_num * BDRV_SECTOR_SIZE;
-    unsigned long start = offset / SD_DATA_OBJ_SIZE,
+    unsigned long start = offset / object_size,
                   end = DIV_ROUND_UP((sector_num + nb_sectors) *
-                                     BDRV_SECTOR_SIZE, SD_DATA_OBJ_SIZE);
+                                     BDRV_SECTOR_SIZE, object_size);
     unsigned long idx;
     int64_t ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | offset;
 
@@ -2547,7 +2637,7 @@  sd_co_get_block_status(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
         }
     }
 
-    *pnum = (idx - start) * SD_DATA_OBJ_SIZE / BDRV_SECTOR_SIZE;
+    *pnum = (idx - start) * object_size / BDRV_SECTOR_SIZE;
     if (*pnum > nb_sectors) {
         *pnum = nb_sectors;
     }
@@ -2558,14 +2648,15 @@  static int64_t sd_get_allocated_file_size(BlockDriverState *bs)
 {
     BDRVSheepdogState *s = bs->opaque;
     SheepdogInode *inode = &s->inode;
-    unsigned long i, last = DIV_ROUND_UP(inode->vdi_size, SD_DATA_OBJ_SIZE);
+    uint32_t object_size = (UINT32_C(1) << inode->block_size_shift);
+    unsigned long i, last = DIV_ROUND_UP(inode->vdi_size, object_size);
     uint64_t size = 0;
 
     for (i = 0; i < last; i++) {
         if (inode->data_vdi_id[i] == 0) {
             continue;
         }
-        size += SD_DATA_OBJ_SIZE;
+        size += object_size;
     }
     return size;
 }
@@ -2594,6 +2685,11 @@  static QemuOptsList sd_create_opts = {
             .type = QEMU_OPT_STRING,
             .help = "Redundancy of the image"
         },
+        {
+            .name = BLOCK_OPT_BLOCK_SIZE_SHIFT,
+            .type = QEMU_OPT_STRING,
+            .help = "Block Size Shift of the image"
+        },
         { /* end of list */ }
     }
 };
diff --git a/include/block/block_int.h b/include/block/block_int.h
index 06a21dd..023c6bb 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -56,6 +56,7 @@ 
 #define BLOCK_OPT_ADAPTER_TYPE      "adapter_type"
 #define BLOCK_OPT_REDUNDANCY        "redundancy"
 #define BLOCK_OPT_NOCOW             "nocow"
+#define BLOCK_OPT_BLOCK_SIZE_SHIFT  "block_size_shift"
 
 #define BLOCK_PROBE_BUF_SIZE        512