Patchwork [RFC,03/19] block: Don't use guest sector size for qemu_blockalign()

login
register
mail settings
Submitter Kevin Wolf
Date Dec. 6, 2013, 5:22 p.m.
Message ID <1386350580-5666-4-git-send-email-kwolf@redhat.com>
Download mbox | patch
Permalink /patch/298155/
State New
Headers show

Comments

Kevin Wolf - Dec. 6, 2013, 5:22 p.m.
bs->buffer_alignment is set by the device emulation and contains the
logical block size of the guest device. This isn't something that the
block layer should know, and even less something to use for determining
the right alignment of buffers to be used for the host.

The new function bdrv_opt_mem_align() allows for hooks in a BlockDriver
so that it can tell the qemu block layer the optimal alignment to be
used so that no bounce buffer must be used in the driver.

This patch may change the buffer alignment from 4k to 512 for all
callers that used qemu_blockalign() with the top-level image format
BlockDriverState. The value was never propagated to other levels in the
tree, so in particular raw-posix never required anything else than 512.

While on disks with 4k sectors direct I/O requires a 4k alignment,
memory may still be okay when aligned to 512 byte boundaries. This is
what must have happened in practice, because otherwise this would
already have failed earlier. Therefore I don't expect regressions even
with this intermediate state. Later, raw-posix can implement the hook
and expose a different memory alignment requirement.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block.c                   | 32 +++++++++++++++++++++++++++++---
 include/block/block.h     |  1 +
 include/block/block_int.h |  4 ++++
 3 files changed, 34 insertions(+), 3 deletions(-)
Wayne Xia - Dec. 10, 2013, 3:18 a.m.
于 2013/12/7 1:22, Kevin Wolf 写道:
> bs->buffer_alignment is set by the device emulation and contains the
> logical block size of the guest device. This isn't something that the
> block layer should know, and even less something to use for determining
> the right alignment of buffers to be used for the host.
> 
> The new function bdrv_opt_mem_align() allows for hooks in a BlockDriver
> so that it can tell the qemu block layer the optimal alignment to be
> used so that no bounce buffer must be used in the driver.
> 
> This patch may change the buffer alignment from 4k to 512 for all
> callers that used qemu_blockalign() with the top-level image format
> BlockDriverState. The value was never propagated to other levels in the
> tree, so in particular raw-posix never required anything else than 512.
> 
> While on disks with 4k sectors direct I/O requires a 4k alignment,
> memory may still be okay when aligned to 512 byte boundaries. This is
> what must have happened in practice, because otherwise this would
> already have failed earlier. Therefore I don't expect regressions even
> with this intermediate state. Later, raw-posix can implement the hook
> and expose a different memory alignment requirement.
> 
> Signed-off-by: Kevin Wolf <kwolf@redhat.com>
> ---
>   block.c                   | 32 +++++++++++++++++++++++++++++---
>   include/block/block.h     |  1 +
>   include/block/block_int.h |  4 ++++
>   3 files changed, 34 insertions(+), 3 deletions(-)
> 
> diff --git a/block.c b/block.c
> index 613201b..669793b 100644
> --- a/block.c
> +++ b/block.c
> @@ -213,6 +213,31 @@ static void bdrv_io_limits_intercept(BlockDriverState *bs,
>       qemu_co_queue_next(&bs->throttled_reqs[is_write]);
>   }
> 
> +size_t bdrv_opt_mem_align(BlockDriverState *bs)
> +{
> +    size_t alignment;
> +
> +    if (!bs || !bs->drv) {
> +        /* 4k should be on the safe side */
> +        return 4096;
> +    }
> +
> +    if (bs->drv->bdrv_opt_mem_align) {
> +        return bs->drv->bdrv_opt_mem_align(bs);
> +    }
> +
> +    if (bs->file) {
> +        alignment = bdrv_opt_mem_align(bs->file);
> +    } else {
> +        alignment = 512;
> +    }
> +
> +    if (bs->backing_hd) {
> +        alignment = MAX(alignment, bdrv_opt_mem_align(bs->backing_hd));
> +    }

  Maybe I didn't understand the commit message correctly, does this code
intend to get MAX alignment value in a chain? For example:
base(4096)->mid(512)->top(1024) results: 4096
The condition to traver the backing files seems complex.

> +    return alignment;
> +}
> +
>   /* check if the path starts with "<protocol>:" */
>   static int path_has_protocol(const char *path)
>   {
> @@ -4335,7 +4360,7 @@ void bdrv_set_buffer_alignment(BlockDriverState *bs, int align)
> 
>   void *qemu_blockalign(BlockDriverState *bs, size_t size)
>   {
> -    return qemu_memalign((bs && bs->buffer_alignment) ? bs->buffer_alignment : 512, size);
> +    return qemu_memalign(bdrv_opt_mem_align(bs), size);
>   }
> 
>   /*
> @@ -4344,12 +4369,13 @@ void *qemu_blockalign(BlockDriverState *bs, size_t size)
>   bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
>   {
>       int i;
> +    size_t alignment = bdrv_opt_mem_align(bs);
> 
>       for (i = 0; i < qiov->niov; i++) {
> -        if ((uintptr_t) qiov->iov[i].iov_base % bs->buffer_alignment) {
> +        if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
>               return false;
>           }
> -        if (qiov->iov[i].iov_len % bs->buffer_alignment) {
> +        if (qiov->iov[i].iov_len % alignment) {
>               return false;
>           }
>       }
> diff --git a/include/block/block.h b/include/block/block.h
> index 3560deb..d262c0e 100644
> --- a/include/block/block.h
> +++ b/include/block/block.h
> @@ -383,6 +383,7 @@ void bdrv_img_create(const char *filename, const char *fmt,
>                        char *options, uint64_t img_size, int flags,
>                        Error **errp, bool quiet);
> 
> +size_t bdrv_opt_mem_align(BlockDriverState *bs);
>   void bdrv_set_buffer_alignment(BlockDriverState *bs, int align);
>   void *qemu_blockalign(BlockDriverState *bs, size_t size);
>   bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov);
> diff --git a/include/block/block_int.h b/include/block/block_int.h
> index 1666066..6a84f83 100644
> --- a/include/block/block_int.h
> +++ b/include/block/block_int.h
> @@ -119,6 +119,10 @@ struct BlockDriver {
>           int64_t sector_num, int nb_sectors,
>           BlockDriverCompletionFunc *cb, void *opaque);
> 
> +    /* Returns the alignment in bytes that is required so that no bounce buffer
> +     * is required throughout the stack */
> +    int (*bdrv_opt_mem_align)(BlockDriverState *bs);
> +
>       int coroutine_fn (*bdrv_co_readv)(BlockDriverState *bs,
>           int64_t sector_num, int nb_sectors, QEMUIOVector *qiov);
>       int coroutine_fn (*bdrv_co_writev)(BlockDriverState *bs,
>
Kevin Wolf - Dec. 10, 2013, 9:42 a.m.
Am 10.12.2013 um 04:18 hat Wenchao Xia geschrieben:
> 于 2013/12/7 1:22, Kevin Wolf 写道:
> > bs->buffer_alignment is set by the device emulation and contains the
> > logical block size of the guest device. This isn't something that the
> > block layer should know, and even less something to use for determining
> > the right alignment of buffers to be used for the host.
> > 
> > The new function bdrv_opt_mem_align() allows for hooks in a BlockDriver
> > so that it can tell the qemu block layer the optimal alignment to be
> > used so that no bounce buffer must be used in the driver.
> > 
> > This patch may change the buffer alignment from 4k to 512 for all
> > callers that used qemu_blockalign() with the top-level image format
> > BlockDriverState. The value was never propagated to other levels in the
> > tree, so in particular raw-posix never required anything else than 512.
> > 
> > While on disks with 4k sectors direct I/O requires a 4k alignment,
> > memory may still be okay when aligned to 512 byte boundaries. This is
> > what must have happened in practice, because otherwise this would
> > already have failed earlier. Therefore I don't expect regressions even
> > with this intermediate state. Later, raw-posix can implement the hook
> > and expose a different memory alignment requirement.
> > 
> > Signed-off-by: Kevin Wolf <kwolf@redhat.com>
> > ---
> >   block.c                   | 32 +++++++++++++++++++++++++++++---
> >   include/block/block.h     |  1 +
> >   include/block/block_int.h |  4 ++++
> >   3 files changed, 34 insertions(+), 3 deletions(-)
> > 
> > diff --git a/block.c b/block.c
> > index 613201b..669793b 100644
> > --- a/block.c
> > +++ b/block.c
> > @@ -213,6 +213,31 @@ static void bdrv_io_limits_intercept(BlockDriverState *bs,
> >       qemu_co_queue_next(&bs->throttled_reqs[is_write]);
> >   }
> > 
> > +size_t bdrv_opt_mem_align(BlockDriverState *bs)
> > +{
> > +    size_t alignment;
> > +
> > +    if (!bs || !bs->drv) {
> > +        /* 4k should be on the safe side */
> > +        return 4096;
> > +    }
> > +
> > +    if (bs->drv->bdrv_opt_mem_align) {
> > +        return bs->drv->bdrv_opt_mem_align(bs);
> > +    }
> > +
> > +    if (bs->file) {
> > +        alignment = bdrv_opt_mem_align(bs->file);
> > +    } else {
> > +        alignment = 512;
> > +    }
> > +
> > +    if (bs->backing_hd) {
> > +        alignment = MAX(alignment, bdrv_opt_mem_align(bs->backing_hd));
> > +    }
> 
>   Maybe I didn't understand the commit message correctly, does this code
> intend to get MAX alignment value in a chain? For example:
> base(4096)->mid(512)->top(1024) results: 4096
> The condition to traver the backing files seems complex.

Yes, you want to align any newly allocated buffer such that no matter
what direction it takes on its way through the block layer, it will
never be misaligned. This means that you need to use the highest
alignment restriction.

Kevin
Wayne Xia - Dec. 11, 2013, 2:43 a.m.
于 2013/12/10 17:42, Kevin Wolf 写道:
> Am 10.12.2013 um 04:18 hat Wenchao Xia geschrieben:
>> 于 2013/12/7 1:22, Kevin Wolf 写道:
>>> bs->buffer_alignment is set by the device emulation and contains the
>>> logical block size of the guest device. This isn't something that the
>>> block layer should know, and even less something to use for determining
>>> the right alignment of buffers to be used for the host.
>>>
>>> The new function bdrv_opt_mem_align() allows for hooks in a BlockDriver
>>> so that it can tell the qemu block layer the optimal alignment to be
>>> used so that no bounce buffer must be used in the driver.
>>>
>>> This patch may change the buffer alignment from 4k to 512 for all
>>> callers that used qemu_blockalign() with the top-level image format
>>> BlockDriverState. The value was never propagated to other levels in the
>>> tree, so in particular raw-posix never required anything else than 512.
>>>
>>> While on disks with 4k sectors direct I/O requires a 4k alignment,
>>> memory may still be okay when aligned to 512 byte boundaries. This is
>>> what must have happened in practice, because otherwise this would
>>> already have failed earlier. Therefore I don't expect regressions even
>>> with this intermediate state. Later, raw-posix can implement the hook
>>> and expose a different memory alignment requirement.
>>>
>>> Signed-off-by: Kevin Wolf <kwolf@redhat.com>
>>> ---
>>>    block.c                   | 32 +++++++++++++++++++++++++++++---
>>>    include/block/block.h     |  1 +
>>>    include/block/block_int.h |  4 ++++
>>>    3 files changed, 34 insertions(+), 3 deletions(-)
>>>
>>> diff --git a/block.c b/block.c
>>> index 613201b..669793b 100644
>>> --- a/block.c
>>> +++ b/block.c
>>> @@ -213,6 +213,31 @@ static void bdrv_io_limits_intercept(BlockDriverState *bs,
>>>        qemu_co_queue_next(&bs->throttled_reqs[is_write]);
>>>    }
>>>
>>> +size_t bdrv_opt_mem_align(BlockDriverState *bs)
>>> +{
>>> +    size_t alignment;
>>> +
>>> +    if (!bs || !bs->drv) {
>>> +        /* 4k should be on the safe side */
>>> +        return 4096;
>>> +    }
>>> +
>>> +    if (bs->drv->bdrv_opt_mem_align) {
>>> +        return bs->drv->bdrv_opt_mem_align(bs);
>>> +    }
>>> +
>>> +    if (bs->file) {
>>> +        alignment = bdrv_opt_mem_align(bs->file);
>>> +    } else {
>>> +        alignment = 512;
>>> +    }
>>> +
>>> +    if (bs->backing_hd) {
>>> +        alignment = MAX(alignment, bdrv_opt_mem_align(bs->backing_hd));
>>> +    }
>>
>>    Maybe I didn't understand the commit message correctly, does this code
>> intend to get MAX alignment value in a chain? For example:
>> base(4096)->mid(512)->top(1024) results: 4096
>> The condition to traver the backing files seems complex.
>
> Yes, you want to align any newly allocated buffer such that no matter
> what direction it takes on its way through the block layer, it will
> never be misaligned. This means that you need to use the highest
> alignment restriction.
>
> Kevin
>

   The condition to travel and compare, seems complex, since when
"return" is executed, then travers would stop. for example:
base(4096)->mid(512)->top(1024), when top's
bs->drv->bdrv_opt_mem_align != NULL, the result is 1024.
So I wonder whether this is on purpose, or the code should be:

size_t bdrv_opt_mem_align(BlockDriverState *bs)
{
     size_t alignment;

     if (!bs || !bs->drv) {
         /* 4k should be on the safe side */
         alignment = 4096;
         goto compare;
     }

     if (bs->drv->bdrv_opt_mem_align) {
         alignment = bs->drv->bdrv_opt_mem_align(bs);
         goto compare;
     }

     if (bs->file) {
         alignment = bdrv_opt_mem_align(bs->file);
     } else {
         alignment = 512;
     }

  compare:
     if (bs->backing_hd) {
         alignment = MAX(alignment, bdrv_opt_mem_align(bs->backing_hd));
     }
     return alignment;
}

Patch

diff --git a/block.c b/block.c
index 613201b..669793b 100644
--- a/block.c
+++ b/block.c
@@ -213,6 +213,31 @@  static void bdrv_io_limits_intercept(BlockDriverState *bs,
     qemu_co_queue_next(&bs->throttled_reqs[is_write]);
 }
 
+size_t bdrv_opt_mem_align(BlockDriverState *bs)
+{
+    size_t alignment;
+
+    if (!bs || !bs->drv) {
+        /* 4k should be on the safe side */
+        return 4096;
+    }
+
+    if (bs->drv->bdrv_opt_mem_align) {
+        return bs->drv->bdrv_opt_mem_align(bs);
+    }
+
+    if (bs->file) {
+        alignment = bdrv_opt_mem_align(bs->file);
+    } else {
+        alignment = 512;
+    }
+
+    if (bs->backing_hd) {
+        alignment = MAX(alignment, bdrv_opt_mem_align(bs->backing_hd));
+    }
+    return alignment;
+}
+
 /* check if the path starts with "<protocol>:" */
 static int path_has_protocol(const char *path)
 {
@@ -4335,7 +4360,7 @@  void bdrv_set_buffer_alignment(BlockDriverState *bs, int align)
 
 void *qemu_blockalign(BlockDriverState *bs, size_t size)
 {
-    return qemu_memalign((bs && bs->buffer_alignment) ? bs->buffer_alignment : 512, size);
+    return qemu_memalign(bdrv_opt_mem_align(bs), size);
 }
 
 /*
@@ -4344,12 +4369,13 @@  void *qemu_blockalign(BlockDriverState *bs, size_t size)
 bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
 {
     int i;
+    size_t alignment = bdrv_opt_mem_align(bs);
 
     for (i = 0; i < qiov->niov; i++) {
-        if ((uintptr_t) qiov->iov[i].iov_base % bs->buffer_alignment) {
+        if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
             return false;
         }
-        if (qiov->iov[i].iov_len % bs->buffer_alignment) {
+        if (qiov->iov[i].iov_len % alignment) {
             return false;
         }
     }
diff --git a/include/block/block.h b/include/block/block.h
index 3560deb..d262c0e 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -383,6 +383,7 @@  void bdrv_img_create(const char *filename, const char *fmt,
                      char *options, uint64_t img_size, int flags,
                      Error **errp, bool quiet);
 
+size_t bdrv_opt_mem_align(BlockDriverState *bs);
 void bdrv_set_buffer_alignment(BlockDriverState *bs, int align);
 void *qemu_blockalign(BlockDriverState *bs, size_t size);
 bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov);
diff --git a/include/block/block_int.h b/include/block/block_int.h
index 1666066..6a84f83 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -119,6 +119,10 @@  struct BlockDriver {
         int64_t sector_num, int nb_sectors,
         BlockDriverCompletionFunc *cb, void *opaque);
 
+    /* Returns the alignment in bytes that is required so that no bounce buffer
+     * is required throughout the stack */
+    int (*bdrv_opt_mem_align)(BlockDriverState *bs);
+
     int coroutine_fn (*bdrv_co_readv)(BlockDriverState *bs,
         int64_t sector_num, int nb_sectors, QEMUIOVector *qiov);
     int coroutine_fn (*bdrv_co_writev)(BlockDriverState *bs,