Patchwork [v2,15/18] block: align and serialize I/O when guest_block_size < host_block_size

login
register
mail settings
Submitter Paolo Bonzini
Date Jan. 26, 2012, 5:22 p.m.
Message ID <1327598569-5199-16-git-send-email-pbonzini@redhat.com>
Download mbox | patch
Permalink /patch/138014/
State New
Headers show

Comments

Paolo Bonzini - Jan. 26, 2012, 5:22 p.m.
When the guest sees a lower alignment than the host, and O_DIRECT
is in place, I/O might not span an integer number of host sectors.
If this is the case, add a few sectors at the beginning and the end.
When reading, copy interesting data from there to the guest buffer.

When writing, merge data from there with data from the guest buffer.
Since writes potentially become read-modify-write sequences, they have
to be serialized against other operations.  Reads are still atomic
(they only need some postprocessing), so they need not be
serialized.

The math is a bit tricky, but luckily all the primitives are already in
cutils.c and iov.c.

On top of this, the guest's buffers might be misaligned with respect
to the host block size.  However, that's caught by raw-posix.c and
posix-aio-compat.c, so we need not care about it.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 Makefile.objs |    4 +-
 block.c       |  174 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 trace-events  |    1 +
 3 files changed, 175 insertions(+), 4 deletions(-)

Patch

diff --git a/Makefile.objs b/Makefile.objs
index 06a147b..f37adb1 100644
--- a/Makefile.objs
+++ b/Makefile.objs
@@ -24,7 +24,7 @@  coroutine-obj-$(CONFIG_WIN32) += coroutine-win32.o
 #######################################################################
 # block-obj-y is code used by both qemu system emulation and qemu-img
 
-block-obj-y = cutils.o cache-utils.o qemu-option.o module.o async.o
+block-obj-y = cutils.o cache-utils.o qemu-option.o module.o async.o iov.o
 block-obj-y += nbd.o block.o aio.o aes.o qemu-config.o qemu-progress.o qemu-sockets.o
 block-obj-y += $(coroutine-obj-y) $(qobject-obj-y) $(version-obj-y)
 block-obj-$(CONFIG_POSIX) += posix-aio-compat.o
@@ -158,7 +158,7 @@  endif
 common-obj-y += $(addprefix ui/, $(ui-obj-y))
 common-obj-$(CONFIG_VNC) += $(addprefix ui/, $(vnc-obj-y))
 
-common-obj-y += iov.o acl.o
+common-obj-y += acl.o
 common-obj-$(CONFIG_POSIX) += compatfd.o
 common-obj-y += notify.o event_notifier.o
 common-obj-y += qemu-timer.o qemu-timer-common.o
diff --git a/block.c b/block.c
index 683d4a3..33ccc23 100644
--- a/block.c
+++ b/block.c
@@ -28,6 +28,7 @@ 
 #include "block_int.h"
 #include "module.h"
 #include "qjson.h"
+#include "iov.h"
 #include "qemu-coroutine.h"
 #include "qmp-commands.h"
 #include "qemu-timer.h"
@@ -1182,6 +1183,21 @@  static void round_sectors(int64_t alignment,
     }
 }
 
+static bool req_is_aligned(int64_t alignment,
+                           int64_t sector_num, int nb_sectors)
+{
+    if (alignment != BDRV_SECTOR_SIZE) {
+        int64_t c = alignment / BDRV_SECTOR_SIZE;
+        if ((sector_num & (c - 1)) != 0) {
+            return false;
+        }
+        if ((nb_sectors & (c - 1)) != 0) {
+            return false;
+        }
+    }
+    return true;
+}
+
 static int64_t get_cluster_size(BlockDriverState *bs)
 {
     BlockDriverInfo bdi;
@@ -1578,6 +1594,135 @@  err:
     return ret;
 }
 
+static int coroutine_fn bdrv_rw_co_align(BlockDriverState *bs,
+    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, bool is_write)
+{
+    BlockDriver *drv = bs->drv;
+    QEMUIOVector aligned_qiov;
+
+    int alignment = bs->host_block_size / BDRV_SECTOR_SIZE;
+    int64_t io_sector_num;
+    int io_nb_sectors;
+    int head_extra, tail_sectors;
+    int head_sectors = 0;
+    int tail_offset = 0;
+    int ret;
+    uint8_t *head = NULL;
+    uint8_t *tail = NULL;
+
+    trace_bdrv_rw_co_align(bs, sector_num, nb_sectors, is_write);
+    round_sectors(bs->host_block_size, sector_num, nb_sectors,
+                  &io_sector_num, &io_nb_sectors);
+
+    /*
+     * Here is a drawing of some involved quantities.  Double lines
+     * mark points that are aligned to a host block.
+     *
+     *                           io_sector_num + io_nb_sectors ---.
+     *     .-- io_sector_num                                       \
+     *    /                         sector_num + nb_sectors ---.    \
+     *   /______________ ________________  _____  ______________\____\
+     *  ||              |                ||     ||              |     ||
+     *  ||  head_extra  |  head_sectors  || ... || tail_sectors | ... ||
+     *  ||______________|________________||_____||______________|_____||
+     *                   \                \       \
+     *                    `--- sector_num  \       `--- sector_num + tail_offset
+     *                                      \
+     *                                       `--- io_sector_num + alignment
+     */
+    head_extra = sector_num & (alignment - 1);
+    tail_sectors = (sector_num + nb_sectors) & (alignment - 1);
+
+    qemu_iovec_init(&aligned_qiov, qiov->niov + 2);
+
+    /* If the initial sector is not aligned, we need a "head" block.  If a
+     * single block will satisfy the request, we put it in the "head" too.
+     * This way, the tail always starts on an aligned sector.
+     */
+    if (head_extra != 0 || io_nb_sectors == alignment) {
+        head_sectors = MIN(nb_sectors, alignment - head_extra);
+        head = qemu_blockalign(bs, bs->host_block_size);
+        qemu_iovec_add(&aligned_qiov, head, bs->host_block_size);
+    }
+
+    /* If the aligned request is just 1 block, we will read it in head,
+     * so nothing else to do.
+     */
+    if (io_nb_sectors > alignment) {
+        /* Otherwise, see where the last full block ends...  */
+        tail_offset = nb_sectors - tail_sectors;
+        assert(((sector_num + tail_offset) & (alignment - 1)) == 0);
+
+        /* ... add everything after the head and up to it.  */
+        qemu_iovec_copy(&aligned_qiov, qiov,
+                        head_sectors * BDRV_SECTOR_SIZE,
+                        (tail_offset - head_sectors) * BDRV_SECTOR_SIZE);
+
+        /* If the final sector is not aligned, add an extra block at
+         * the end.
+         */
+        if (tail_sectors != 0) {
+            tail = qemu_blockalign(bs, bs->host_block_size);
+            qemu_iovec_add(&aligned_qiov, tail, bs->host_block_size);
+        }
+    }
+
+    assert(head || tail);
+    if (is_write) {
+        QEMUIOVector rmw_qiov;
+        struct iovec rmw_iov;
+
+        /* Merge the user data with data read from the first block...  */
+        if (head != NULL) {
+            rmw_iov.iov_base = head;
+            rmw_iov.iov_len = bs->host_block_size;
+            qemu_iovec_init_external(&rmw_qiov, &rmw_iov, 1);
+            ret = drv->bdrv_co_readv(bs, io_sector_num, alignment, &rmw_qiov);
+            if (ret < 0) {
+                return ret;
+            }
+            iov_to_buf(qiov->iov, qiov->size,
+                       (head + head_extra * BDRV_SECTOR_SIZE), 0,
+                       head_sectors * BDRV_SECTOR_SIZE);
+        }
+
+        /* ... and from the last block.  The two reads could be merged if they
+         * access consecutive blocks, but it is rare and we do not care.  */
+        if (tail != NULL) {
+            rmw_iov.iov_base = tail;
+            rmw_iov.iov_len = bs->host_block_size;
+            qemu_iovec_init_external(&rmw_qiov, &rmw_iov, 1);
+            ret = drv->bdrv_co_readv(bs, sector_num + tail_offset, alignment, &rmw_qiov);
+            if (ret < 0) {
+                return ret;
+            }
+            iov_to_buf(qiov->iov, qiov->size,
+                       tail, tail_offset * BDRV_SECTOR_SIZE,
+                       tail_sectors * BDRV_SECTOR_SIZE);
+        }
+        ret = drv->bdrv_co_writev(bs, io_sector_num, io_nb_sectors, &aligned_qiov);
+    } else {
+        ret = drv->bdrv_co_readv(bs, io_sector_num, io_nb_sectors, &aligned_qiov);
+        /* Copy the misaligned parts that the user wants.  */
+        if (head != NULL) {
+            iov_from_buf(qiov->iov, qiov->size,
+                         (head + head_extra * BDRV_SECTOR_SIZE), 0,
+                         head_sectors * BDRV_SECTOR_SIZE);
+        }
+        if (tail != NULL) {
+            iov_from_buf(qiov->iov, qiov->size,
+                         tail, tail_offset * BDRV_SECTOR_SIZE,
+                         tail_sectors * BDRV_SECTOR_SIZE);
+        }
+    }
+
+    qemu_iovec_destroy(&aligned_qiov);
+    g_free(head);
+    g_free(tail);
+    return ret;
+}
+
+
 /*
  * Handle a read request in coroutine context
  */
@@ -1639,7 +1784,12 @@  static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
         }
     }
 
-    ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
+    if ((bs->open_flags & BDRV_O_NOCACHE) &&
+        !req_is_aligned(bs->host_block_size, sector_num, nb_sectors)) {
+        ret = bdrv_rw_co_align(bs, sector_num, nb_sectors, qiov, false);
+    } else {
+        ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
+    }
 
 out:
     tracked_request_end(&req);
@@ -1698,9 +1847,25 @@  static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
                                       get_cluster_size(bs), false);
     }
 
+    /* When the guest sees a lower alignment than the host, and O_DIRECT
+     * is in place, assume that the write will become a read-modify-write
+     * sequence.  These have to be serialized against each other, so that
+     * they look atomic to the guest.
+     */
+    if ((bs->open_flags & BDRV_O_NOCACHE) &&
+        bs->guest_block_size < bs->host_block_size) {
+        wait_for_overlapping_requests(bs, sector_num, nb_sectors,
+                                      bs->host_block_size, true);
+    }
+
     tracked_request_begin(&req, bs, sector_num, nb_sectors, true);
 
-    ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
+    if ((bs->open_flags & BDRV_O_NOCACHE) &&
+        !req_is_aligned(bs->host_block_size, sector_num, nb_sectors)) {
+        ret = bdrv_rw_co_align(bs, sector_num, nb_sectors, qiov, true);
+    } else {
+        ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
+    }
 
     if (bs->dirty_bitmap) {
         set_dirty_bitmap(bs, sector_num, nb_sectors, 1);
@@ -3639,6 +3803,12 @@  BlockDriverAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
 void bdrv_set_guest_block_size(BlockDriverState *bs, int align)
 {
     bs->guest_block_size = align;
+    if ((bs->open_flags & BDRV_O_NOCACHE) &&
+        bs->guest_block_size < bs->host_block_size) {
+        error_report("Host block size is %d, guest block size is %d.  Direct\n"
+                     "access to the host device may cause performance degradation.",
+                     bs->host_block_size, bs->guest_block_size);
+    }
     if ((bs->open_flags & BDRV_O_RDWR) &&
         bs->host_block_size < bs->guest_block_size) {
         error_report("Host block size is %d, guest block size is %d.  Due to partially\n"
diff --git a/trace-events b/trace-events
index 75f6e17..9328056 100644
--- a/trace-events
+++ b/trace-events
@@ -69,6 +69,7 @@  bdrv_co_copy_on_readv(void *bs, int64_t sector_num, int nb_sector) "bs %p sector
 bdrv_co_writev(void *bs, int64_t sector_num, int nb_sector) "bs %p sector_num %"PRId64" nb_sectors %d"
 bdrv_co_io_em(void *bs, int64_t sector_num, int nb_sectors, int is_write, void *acb) "bs %p sector_num %"PRId64" nb_sectors %d is_write %d acb %p"
 bdrv_co_do_copy_on_readv(void *bs, int64_t sector_num, int nb_sectors, int64_t cluster_sector_num, int cluster_nb_sectors) "bs %p sector_num %"PRId64" nb_sectors %d cluster_sector_num %"PRId64" cluster_nb_sectors %d"
+bdrv_rw_co_align(void *bs, int64_t sector_num, int nb_sectors, bool is_write) "bs %p sector_num %"PRId64" nb_sectors %d is_write %d"
 
 # block/stream.c
 stream_one_iteration(void *s, int64_t sector_num, int nb_sectors, int is_allocated) "s %p sector_num %"PRId64" nb_sectors %d is_allocated %d"