diff mbox

[1/1] block: change default memory alignment for block requests to 4096

Message ID 1422470996-20820-2-git-send-email-den@openvz.org
State New
Headers show

Commit Message

Denis V. Lunev Jan. 28, 2015, 6:49 p.m. UTC
The following sequence
    int fd = open(argv[1], O_RDWR | O_CREAT | O_DIRECT, 0644);
    for (i = 0; i < 100000; i++)
            write(fd, buf, 4096);
performs 10% better if buf is aligned to 4096 bytes rather than to
512 bytes on HDD with 512/4096 logical/physical sector size.

The difference is quite reliable.

Signed-off-by: Denis V. Lunev <den@openvz.org>
CC: Kevin Wolf <kwolf@redhat.com>
CC: Stefan Hajnoczi <stefanha@redhat.com>
---
 block.c           | 4 ++--
 block/raw-posix.c | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

Comments

Denis V. Lunev Jan. 28, 2015, 7:59 p.m. UTC | #1
On 28/01/15 21:49, Denis V. Lunev wrote:
> The following sequence
>      int fd = open(argv[1], O_RDWR | O_CREAT | O_DIRECT, 0644);
>      for (i = 0; i < 100000; i++)
>              write(fd, buf, 4096);
> performs 10% better if buf is aligned to 4096 bytes rather then to
> 512 bytes on HDD with 512/4096 logical/physical sector size.
>
> The difference is quite reliable.
>
> Signed-off-by: Denis V. Lunev <den@openvz.org>
> CC: Kevin Wolf <kwolf@redhat.com>
> CC: Stefan Hajnoczi <stefanha@redhat.com>
> ---
>   block.c           | 4 ++--
>   block/raw-posix.c | 4 ++--
>   2 files changed, 4 insertions(+), 4 deletions(-)
>
> diff --git a/block.c b/block.c
> index d45e4dd..bc5d1e7 100644
> --- a/block.c
> +++ b/block.c
> @@ -543,7 +543,7 @@ void bdrv_refresh_limits(BlockDriverState *bs, Error **errp)
>           bs->bl.max_transfer_length = bs->file->bl.max_transfer_length;
>           bs->bl.opt_mem_alignment = bs->file->bl.opt_mem_alignment;
>       } else {
> -        bs->bl.opt_mem_alignment = 512;
> +        bs->bl.opt_mem_alignment = 4096;
>       }
>   
>       if (bs->backing_hd) {
> @@ -966,7 +966,7 @@ static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file,
>   
>       bs->open_flags = flags;
>       bs->guest_block_size = 512;
> -    bs->request_alignment = 512;
> +    bs->request_alignment = 4096;
>       bs->zero_beyond_eof = true;
>       open_flags = bdrv_open_flags(bs, flags);
>       bs->read_only = !(open_flags & BDRV_O_RDWR);
> diff --git a/block/raw-posix.c b/block/raw-posix.c
> index ec38fee..d1b3388 100644
> --- a/block/raw-posix.c
> +++ b/block/raw-posix.c
> @@ -266,7 +266,7 @@ static void raw_probe_alignment(BlockDriverState *bs, int fd, Error **errp)
>       if (!s->buf_align) {
>           size_t align;
>           buf = qemu_memalign(MAX_BLOCKSIZE, 2 * MAX_BLOCKSIZE);
> -        for (align = 512; align <= MAX_BLOCKSIZE; align <<= 1) {
> +        for (align = 4096; align <= MAX_BLOCKSIZE; align <<= 1) {
>               if (pread(fd, buf + align, MAX_BLOCKSIZE, 0) >= 0) {
>                   s->buf_align = align;
>                   break;
> @@ -278,7 +278,7 @@ static void raw_probe_alignment(BlockDriverState *bs, int fd, Error **errp)
>       if (!bs->request_alignment) {
>           size_t align;
>           buf = qemu_memalign(s->buf_align, MAX_BLOCKSIZE);
> -        for (align = 512; align <= MAX_BLOCKSIZE; align <<= 1) {
> +        for (align = 4096; align <= MAX_BLOCKSIZE; align <<= 1) {
>               if (pread(fd, buf, align, 0) >= 0) {
>                   bs->request_alignment = align;
>                   break;
sorry, the patch is wrong. It breaks 'make check-block'.
I will redo it and perform more testing.

request-alignment related changes are wrong :(
I ran the tests without them, but then added them as
an obvious last-minute addition.
Paolo Bonzini Jan. 28, 2015, 8:07 p.m. UTC | #2
On 28/01/2015 19:49, Denis V. Lunev wrote:
> The following sequence
>     int fd = open(argv[1], O_RDWR | O_CREAT | O_DIRECT, 0644);
>     for (i = 0; i < 100000; i++)
>             write(fd, buf, 4096);
> performs 10% better if buf is aligned to 4096 bytes rather then to
> 512 bytes on HDD with 512/4096 logical/physical sector size.
> 
> The difference is quite reliable.

The 10% difference, however, is probably not enough to cover the cost of
providing a bounce buffer if a guest is (rightfully) using a 512-byte
aligned buffer: bs->bl.opt_mem_alignment is in fact badly named and it
should be bs->bl.min_mem_alignment instead.

Instead, you probably should patch bdrv_opt_mem_align to return at least
4096, and leave the detection logic intact.  This will let
qemu_blockalign return a properly aligned buffer to qemu-img and other
in-process allocations, without negatively affecting the guest.

Thanks,

Paolo

> Signed-off-by: Denis V. Lunev <den@openvz.org>
> CC: Kevin Wolf <kwolf@redhat.com>
> CC: Stefan Hajnoczi <stefanha@redhat.com>
> ---
>  block.c           | 4 ++--
>  block/raw-posix.c | 4 ++--
>  2 files changed, 4 insertions(+), 4 deletions(-)
> 
> diff --git a/block.c b/block.c
> index d45e4dd..bc5d1e7 100644
> --- a/block.c
> +++ b/block.c
> @@ -543,7 +543,7 @@ void bdrv_refresh_limits(BlockDriverState *bs, Error **errp)
>          bs->bl.max_transfer_length = bs->file->bl.max_transfer_length;
>          bs->bl.opt_mem_alignment = bs->file->bl.opt_mem_alignment;
>      } else {
> -        bs->bl.opt_mem_alignment = 512;
> +        bs->bl.opt_mem_alignment = 4096;
>      }
>  
>      if (bs->backing_hd) {
> @@ -966,7 +966,7 @@ static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file,
>  
>      bs->open_flags = flags;
>      bs->guest_block_size = 512;
> -    bs->request_alignment = 512;
> +    bs->request_alignment = 4096;
>      bs->zero_beyond_eof = true;
>      open_flags = bdrv_open_flags(bs, flags);
>      bs->read_only = !(open_flags & BDRV_O_RDWR);
> diff --git a/block/raw-posix.c b/block/raw-posix.c
> index ec38fee..d1b3388 100644
> --- a/block/raw-posix.c
> +++ b/block/raw-posix.c
> @@ -266,7 +266,7 @@ static void raw_probe_alignment(BlockDriverState *bs, int fd, Error **errp)
>      if (!s->buf_align) {
>          size_t align;
>          buf = qemu_memalign(MAX_BLOCKSIZE, 2 * MAX_BLOCKSIZE);
> -        for (align = 512; align <= MAX_BLOCKSIZE; align <<= 1) {
> +        for (align = 4096; align <= MAX_BLOCKSIZE; align <<= 1) {
>              if (pread(fd, buf + align, MAX_BLOCKSIZE, 0) >= 0) {
>                  s->buf_align = align;
>                  break;
> @@ -278,7 +278,7 @@ static void raw_probe_alignment(BlockDriverState *bs, int fd, Error **errp)
>      if (!bs->request_alignment) {
>          size_t align;
>          buf = qemu_memalign(s->buf_align, MAX_BLOCKSIZE);
> -        for (align = 512; align <= MAX_BLOCKSIZE; align <<= 1) {
> +        for (align = 4096; align <= MAX_BLOCKSIZE; align <<= 1) {
>              if (pread(fd, buf, align, 0) >= 0) {
>                  bs->request_alignment = align;
>                  break;
>
Denis V. Lunev Jan. 28, 2015, 8:13 p.m. UTC | #3
On 28/01/15 23:07, Paolo Bonzini wrote:
>
> On 28/01/2015 19:49, Denis V. Lunev wrote:
>> The following sequence
>>      int fd = open(argv[1], O_RDWR | O_CREAT | O_DIRECT, 0644);
>>      for (i = 0; i < 100000; i++)
>>              write(fd, buf, 4096);
>> performs 10% better if buf is aligned to 4096 bytes rather then to
>> 512 bytes on HDD with 512/4096 logical/physical sector size.
>>
>> The difference is quite reliable.
> The 10% difference, however, is probably not enough to cover the cost of
> providing a bounce buffer if a guest is (rightfully) using a 512-byte
> aligned buffer: bs->bl.opt_mem_alignment is in fact badly named and it
> should be bs->bl.min_mem_alignment instead.
>
> Instead, you probably should patch bdrv_opt_mem_align to return at least
> 4096, and leave the detection logic intact.  This will let
> qemu_blockalign return a properly aligned buffer to qemu-img and other
> in-process allocations, without negatively affecting the guest.
>
> Thanks,
>
> Paolo
ok, this looks good to me :)


>> Signed-off-by: Denis V. Lunev <den@openvz.org>
>> CC: Kevin Wolf <kwolf@redhat.com>
>> CC: Stefan Hajnoczi <stefanha@redhat.com>
>> ---
>>   block.c           | 4 ++--
>>   block/raw-posix.c | 4 ++--
>>   2 files changed, 4 insertions(+), 4 deletions(-)
>>
>> diff --git a/block.c b/block.c
>> index d45e4dd..bc5d1e7 100644
>> --- a/block.c
>> +++ b/block.c
>> @@ -543,7 +543,7 @@ void bdrv_refresh_limits(BlockDriverState *bs, Error **errp)
>>           bs->bl.max_transfer_length = bs->file->bl.max_transfer_length;
>>           bs->bl.opt_mem_alignment = bs->file->bl.opt_mem_alignment;
>>       } else {
>> -        bs->bl.opt_mem_alignment = 512;
>> +        bs->bl.opt_mem_alignment = 4096;
>>       }
>>   
>>       if (bs->backing_hd) {
>> @@ -966,7 +966,7 @@ static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file,
>>   
>>       bs->open_flags = flags;
>>       bs->guest_block_size = 512;
>> -    bs->request_alignment = 512;
>> +    bs->request_alignment = 4096;
>>       bs->zero_beyond_eof = true;
>>       open_flags = bdrv_open_flags(bs, flags);
>>       bs->read_only = !(open_flags & BDRV_O_RDWR);
>> diff --git a/block/raw-posix.c b/block/raw-posix.c
>> index ec38fee..d1b3388 100644
>> --- a/block/raw-posix.c
>> +++ b/block/raw-posix.c
>> @@ -266,7 +266,7 @@ static void raw_probe_alignment(BlockDriverState *bs, int fd, Error **errp)
>>       if (!s->buf_align) {
>>           size_t align;
>>           buf = qemu_memalign(MAX_BLOCKSIZE, 2 * MAX_BLOCKSIZE);
>> -        for (align = 512; align <= MAX_BLOCKSIZE; align <<= 1) {
>> +        for (align = 4096; align <= MAX_BLOCKSIZE; align <<= 1) {
>>               if (pread(fd, buf + align, MAX_BLOCKSIZE, 0) >= 0) {
>>                   s->buf_align = align;
>>                   break;
>> @@ -278,7 +278,7 @@ static void raw_probe_alignment(BlockDriverState *bs, int fd, Error **errp)
>>       if (!bs->request_alignment) {
>>           size_t align;
>>           buf = qemu_memalign(s->buf_align, MAX_BLOCKSIZE);
>> -        for (align = 512; align <= MAX_BLOCKSIZE; align <<= 1) {
>> +        for (align = 4096; align <= MAX_BLOCKSIZE; align <<= 1) {
>>               if (pread(fd, buf, align, 0) >= 0) {
>>                   bs->request_alignment = align;
>>                   break;
>>
diff mbox

Patch

diff --git a/block.c b/block.c
index d45e4dd..bc5d1e7 100644
--- a/block.c
+++ b/block.c
@@ -543,7 +543,7 @@  void bdrv_refresh_limits(BlockDriverState *bs, Error **errp)
         bs->bl.max_transfer_length = bs->file->bl.max_transfer_length;
         bs->bl.opt_mem_alignment = bs->file->bl.opt_mem_alignment;
     } else {
-        bs->bl.opt_mem_alignment = 512;
+        bs->bl.opt_mem_alignment = 4096;
     }
 
     if (bs->backing_hd) {
@@ -966,7 +966,7 @@  static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file,
 
     bs->open_flags = flags;
     bs->guest_block_size = 512;
-    bs->request_alignment = 512;
+    bs->request_alignment = 4096;
     bs->zero_beyond_eof = true;
     open_flags = bdrv_open_flags(bs, flags);
     bs->read_only = !(open_flags & BDRV_O_RDWR);
diff --git a/block/raw-posix.c b/block/raw-posix.c
index ec38fee..d1b3388 100644
--- a/block/raw-posix.c
+++ b/block/raw-posix.c
@@ -266,7 +266,7 @@  static void raw_probe_alignment(BlockDriverState *bs, int fd, Error **errp)
     if (!s->buf_align) {
         size_t align;
         buf = qemu_memalign(MAX_BLOCKSIZE, 2 * MAX_BLOCKSIZE);
-        for (align = 512; align <= MAX_BLOCKSIZE; align <<= 1) {
+        for (align = 4096; align <= MAX_BLOCKSIZE; align <<= 1) {
             if (pread(fd, buf + align, MAX_BLOCKSIZE, 0) >= 0) {
                 s->buf_align = align;
                 break;
@@ -278,7 +278,7 @@  static void raw_probe_alignment(BlockDriverState *bs, int fd, Error **errp)
     if (!bs->request_alignment) {
         size_t align;
         buf = qemu_memalign(s->buf_align, MAX_BLOCKSIZE);
-        for (align = 512; align <= MAX_BLOCKSIZE; align <<= 1) {
+        for (align = 4096; align <= MAX_BLOCKSIZE; align <<= 1) {
             if (pread(fd, buf, align, 0) >= 0) {
                 bs->request_alignment = align;
                 break;