diff mbox

[RFC,05/19] raw: Probe required direct I/O alignment

Message ID 1386350580-5666-6-git-send-email-kwolf@redhat.com
State New
Headers show

Commit Message

Kevin Wolf Dec. 6, 2013, 5:22 p.m. UTC
From: Paolo Bonzini <pbonzini@redhat.com>

Add a bs->request_alignment field that contains the required
offset/length alignment for I/O requests and fill it in the raw block
drivers. Use ioctls if possible, else see what alignment it takes for
O_DIRECT to succeed.

While at it, also expose the memory alignment requirements, which may be
(and in practice are) different from the disk alignment requirements.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block.c                   |   4 ++
 block/raw-posix.c         | 102 ++++++++++++++++++++++++++++++++++++++--------
 block/raw-win32.c         |  41 +++++++++++++++++++
 include/block/block_int.h |   3 ++
 4 files changed, 132 insertions(+), 18 deletions(-)

Comments

Paolo Bonzini Dec. 6, 2013, 5:53 p.m. UTC | #1
Il 06/12/2013 18:22, Kevin Wolf ha scritto:
> @@ -1227,6 +1288,7 @@ static BlockDriver bdrv_file = {
>      .bdrv_aio_writev = raw_aio_writev,
>      .bdrv_aio_flush = raw_aio_flush,
>      .bdrv_aio_discard = raw_aio_discard,
> +    .bdrv_opt_mem_align = raw_opt_mem_align,
>  
>      .bdrv_truncate = raw_truncate,
>      .bdrv_getlength = raw_getlength,
> @@ -1582,6 +1644,7 @@ static BlockDriver bdrv_host_device = {
>      .bdrv_aio_writev	= raw_aio_writev,
>      .bdrv_aio_flush	= raw_aio_flush,
>      .bdrv_aio_discard   = hdev_aio_discard,
> +    .bdrv_opt_mem_align = raw_opt_mem_align,

Should this rather be a BlockLimits field?

Paolo
Kevin Wolf Dec. 9, 2013, 12:58 p.m. UTC | #2
Am 06.12.2013 um 18:53 hat Paolo Bonzini geschrieben:
> Il 06/12/2013 18:22, Kevin Wolf ha scritto:
> > @@ -1227,6 +1288,7 @@ static BlockDriver bdrv_file = {
> >      .bdrv_aio_writev = raw_aio_writev,
> >      .bdrv_aio_flush = raw_aio_flush,
> >      .bdrv_aio_discard = raw_aio_discard,
> > +    .bdrv_opt_mem_align = raw_opt_mem_align,
> >  
> >      .bdrv_truncate = raw_truncate,
> >      .bdrv_getlength = raw_getlength,
> > @@ -1582,6 +1644,7 @@ static BlockDriver bdrv_host_device = {
> >      .bdrv_aio_writev	= raw_aio_writev,
> >      .bdrv_aio_flush	= raw_aio_flush,
> >      .bdrv_aio_discard   = hdev_aio_discard,
> > +    .bdrv_opt_mem_align = raw_opt_mem_align,
> 
> Should this rather be a BlockLimits field?

How is BlockLimits supposed with respect to inheritance of values
through the BDS tree? I tried looking at the code, but for example
bl.opt_transfer_length is only forwarded in raw, so for any other format
(or if you ever put a filter there) it simply doesn't work.

I could initialise a new BlockLimits.opt_mem_align field in
bdrv_open_common() with the value of bs->file->bl.opt_mem_align, and in
bdrv_open_backing_file() change it to MAX(bs->bl.opt_mem_align,
bs->backing_hd->bl.opt_mem_align). The block driver could then in
bdrv_open() override the former, but never the latter.

What would happen on bdrv_reopen(), specifically toggling O_DIRECT? The
values would have to change then.

Kevin
Paolo Bonzini Dec. 9, 2013, 1:40 p.m. UTC | #3
Il 09/12/2013 13:58, Kevin Wolf ha scritto:
> How is BlockLimits supposed with respect to inheritance of values
> through the BDS tree?

I think right now the accessor wrappers for the various BlockLimits
field sort that out.  Of course this could be slow if the accessors end
up in the fast path (but that's not different from what your series does
already).

> I tried looking at the code, but for example
> bl.opt_transfer_length is only forwarded in raw, so for any other format
> (or if you ever put a filter there) it simply doesn't work.

That's correct.

For example, for qcow2 the optimal transfer length could be the cluster
size.  Without benchmarking, I didn't complain about Peter's choice of
leaving it zero.

> I could initialise a new BlockLimits.opt_mem_align field in
> bdrv_open_common() with the value of bs->file->bl.opt_mem_align, and in
> bdrv_open_backing_file() change it to MAX(bs->bl.opt_mem_align,
> bs->backing_hd->bl.opt_mem_align). The block driver could then in
> bdrv_open() override the former, but never the latter.
> 
> What would happen on bdrv_reopen(), specifically toggling O_DIRECT? The
> values would have to change then.

Yes.  This also goes in favor of making wrappers handle the stacking of
limits, at least for now.

Paolo
diff mbox

Patch

diff --git a/block.c b/block.c
index 34fedf0..b4b17fd 100644
--- a/block.c
+++ b/block.c
@@ -789,6 +789,7 @@  static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file,
 
     bs->open_flags = flags;
     bs->guest_block_size = 512;
+    bs->request_alignment = 512;
     bs->zero_beyond_eof = true;
     open_flags = bdrv_open_flags(bs, flags);
     bs->read_only = !(open_flags & BDRV_O_RDWR);
@@ -856,6 +857,9 @@  static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file,
         goto free_and_fail;
     }
 
+    assert(bdrv_opt_mem_align(bs) != 0);
+    assert(bs->request_alignment != 0);
+
 #ifndef _WIN32
     if (bs->is_temporary) {
         assert(bs->filename[0] != '\0');
diff --git a/block/raw-posix.c b/block/raw-posix.c
index f836c8e..732aff5 100644
--- a/block/raw-posix.c
+++ b/block/raw-posix.c
@@ -127,6 +127,8 @@  typedef struct BDRVRawState {
     int fd;
     int type;
     int open_flags;
+    size_t buf_align;
+
 #if defined(__linux__)
     /* linux floppy specific */
     int64_t fd_open_time;
@@ -211,6 +213,76 @@  static int raw_normalize_devicepath(const char **filename)
 }
 #endif
 
+static void raw_probe_alignment(BlockDriverState *bs)
+{
+    BDRVRawState *s = bs->opaque;
+    char *buf;
+    unsigned int sector_size;
+
+    /* For /dev/sg devices the alignment is not really used.
+       With buffered I/O, we don't have any restrictions. */
+    if (bs->sg || !(s->open_flags & O_DIRECT)) {
+        bs->request_alignment = 1;
+        s->buf_align = 1;
+        return;
+    }
+
+    /* Try a few ioctls to get the right size */
+    bs->request_alignment = 0;
+    s->buf_align = 0;
+
+#ifdef BLKSSZGET
+    if (ioctl(s->fd, BLKSSZGET, &sector_size) >= 0) {
+        bs->request_alignment = sector_size;
+    }
+#endif
+#ifdef DKIOCGETBLOCKSIZE
+    if (ioctl(s->fd, DKIOCGETBLOCKSIZE, &sector_size) >= 0) {
+        bs->request_alignment = sector_size;
+    }
+#endif
+#ifdef DIOCGSECTORSIZE
+    if (ioctl(s->fd, DIOCGSECTORSIZE, &sector_size) >= 0) {
+        bs->request_alignment = sector_size;
+    }
+#endif
+#ifdef CONFIG_XFS
+    if (s->is_xfs) {
+        struct dioattr da;
+        if (xfsctl(NULL, s->fd, XFS_IOC_DIOINFO, &da) >= 0) {
+            bs->request_alignment = da.d_miniosz;
+            /* The kernel returns wrong information for d_mem */
+            /* s->buf_align = da.d_mem; */
+        }
+    }
+#endif
+
+    /* If we could not get the sizes so far, we can only guess them */
+    if (!s->buf_align) {
+        size_t align;
+        buf = qemu_memalign(MAX_BLOCKSIZE, 2 * MAX_BLOCKSIZE);
+        for (align = 512; align <= MAX_BLOCKSIZE; align <<= 1) {
+            if (pread(s->fd, buf + align, MAX_BLOCKSIZE, 0) >= 0) {
+                s->buf_align = align;
+                break;
+            }
+        }
+        qemu_vfree(buf);
+    }
+
+    if (!bs->request_alignment) {
+        size_t align;
+        buf = qemu_memalign(s->buf_align, MAX_BLOCKSIZE);
+        for (align = 512; align <= MAX_BLOCKSIZE; align <<= 1) {
+            if (pread(s->fd, buf, align, 0) >= 0) {
+                bs->request_alignment = align;
+                break;
+            }
+        }
+        qemu_vfree(buf);
+    }
+}
+
 static void raw_parse_flags(int bdrv_flags, int *open_flags)
 {
     assert(open_flags != NULL);
@@ -330,6 +402,8 @@  static int raw_open_common(BlockDriverState *bs, QDict *options,
     }
 #endif
 
+    raw_probe_alignment(bs);
+
     ret = 0;
 fail:
     qemu_opts_del(opts);
@@ -432,7 +506,6 @@  static int raw_reopen_prepare(BDRVReopenState *state,
     return ret;
 }
 
-
 static void raw_reopen_commit(BDRVReopenState *state)
 {
     BDRVRawReopenState *raw_s = state->opaque;
@@ -468,23 +541,11 @@  static void raw_reopen_abort(BDRVReopenState *state)
     state->opaque = NULL;
 }
 
-
-/* XXX: use host sector size if necessary with:
-#ifdef DIOCGSECTORSIZE
-        {
-            unsigned int sectorsize = 512;
-            if (!ioctl(fd, DIOCGSECTORSIZE, &sectorsize) &&
-                sectorsize > bufsize)
-                bufsize = sectorsize;
-        }
-#endif
-#ifdef CONFIG_COCOA
-        uint32_t blockSize = 512;
-        if ( !ioctl( fd, DKIOCGETBLOCKSIZE, &blockSize ) && blockSize > bufsize) {
-            bufsize = blockSize;
-        }
-#endif
-*/
+static int raw_opt_mem_align(BlockDriverState *bs)
+{
+    BDRVRawState *s = bs->opaque;
+    return s->buf_align;
+}
 
 static ssize_t handle_aiocb_ioctl(RawPosixAIOData *aiocb)
 {
@@ -1227,6 +1288,7 @@  static BlockDriver bdrv_file = {
     .bdrv_aio_writev = raw_aio_writev,
     .bdrv_aio_flush = raw_aio_flush,
     .bdrv_aio_discard = raw_aio_discard,
+    .bdrv_opt_mem_align = raw_opt_mem_align,
 
     .bdrv_truncate = raw_truncate,
     .bdrv_getlength = raw_getlength,
@@ -1582,6 +1644,7 @@  static BlockDriver bdrv_host_device = {
     .bdrv_aio_writev	= raw_aio_writev,
     .bdrv_aio_flush	= raw_aio_flush,
     .bdrv_aio_discard   = hdev_aio_discard,
+    .bdrv_opt_mem_align = raw_opt_mem_align,
 
     .bdrv_truncate      = raw_truncate,
     .bdrv_getlength	= raw_getlength,
@@ -1712,6 +1775,7 @@  static BlockDriver bdrv_host_floppy = {
     .bdrv_aio_readv     = raw_aio_readv,
     .bdrv_aio_writev    = raw_aio_writev,
     .bdrv_aio_flush	= raw_aio_flush,
+    .bdrv_opt_mem_align = raw_opt_mem_align,
 
     .bdrv_truncate      = raw_truncate,
     .bdrv_getlength      = raw_getlength,
@@ -1822,6 +1886,7 @@  static BlockDriver bdrv_host_cdrom = {
     .bdrv_aio_readv     = raw_aio_readv,
     .bdrv_aio_writev    = raw_aio_writev,
     .bdrv_aio_flush	= raw_aio_flush,
+    .bdrv_opt_mem_align = raw_opt_mem_align,
 
     .bdrv_truncate      = raw_truncate,
     .bdrv_getlength      = raw_getlength,
@@ -1951,6 +2016,7 @@  static BlockDriver bdrv_host_cdrom = {
     .bdrv_aio_readv     = raw_aio_readv,
     .bdrv_aio_writev    = raw_aio_writev,
     .bdrv_aio_flush	= raw_aio_flush,
+    .bdrv_opt_mem_align = raw_opt_mem_align,
 
     .bdrv_truncate      = raw_truncate,
     .bdrv_getlength      = raw_getlength,
diff --git a/block/raw-win32.c b/block/raw-win32.c
index 2bad5a3..6fe64d2 100644
--- a/block/raw-win32.c
+++ b/block/raw-win32.c
@@ -202,6 +202,35 @@  static int set_sparse(int fd)
 				 NULL, 0, NULL, 0, &returned, NULL);
 }
 
+static void raw_probe_alignment(BlockDriverState *bs)
+{
+    BDRVRawState *s = bs->opaque;
+    DWORD sectorsPerCluster, freeClusters, totalClusters, count;
+    DISK_GEOMETRY_EX dg;
+    BOOL status;
+
+    if (s->type == FTYPE_CD) {
+        bs->request_alignment = 2048;
+        return;
+    }
+    if (s->type == FTYPE_HARDDISK) {
+        status = DeviceIoControl(s->hfile, IOCTL_DISK_GET_DRIVE_GEOMETRY_EX,
+                                 NULL, 0, &dg, sizeof(dg), &count, NULL);
+        if (status != 0) {
+            bs->request_alignment = dg.Geometry.BytesPerSector;
+            return;
+        }
+        /* try GetDiskFreeSpace too */
+    }
+
+    if (s->drive_path[0]) {
+        GetDiskFreeSpace(s->drive_path, &sectorsPerCluster,
+                         &dg.Geometry.BytesPerSector,
+                         &freeClusters, &totalClusters);
+        bs->request_alignment = dg.Geometry.BytesPerSector;
+    }
+}
+
 static void raw_parse_flags(int flags, int *access_flags, DWORD *overlapped)
 {
     assert(access_flags != NULL);
@@ -269,6 +298,17 @@  static int raw_open(BlockDriverState *bs, QDict *options, int flags,
         }
     }
 
+    if (filename[0] && filename[1] == ':') {
+        snprintf(s->drive_path, sizeof(s->drive_path), "%c:\\", filename[0]);
+    } else if (filename[0] == '\\' && filename[1] == '\\') {
+        s->drive_path[0] = 0;
+    } else {
+        /* Relative path.  */
+        char buf[MAX_PATH];
+        GetCurrentDirectory(MAX_PATH, buf);
+        snprintf(s->drive_path, sizeof(s->drive_path), "%c:\\", buf[0]);
+    }
+
     s->hfile = CreateFile(filename, access_flags,
                           FILE_SHARE_READ, NULL,
                           OPEN_EXISTING, overlapped, NULL);
@@ -293,6 +333,7 @@  static int raw_open(BlockDriverState *bs, QDict *options, int flags,
         s->aio = aio;
     }
 
+    raw_probe_alignment(bs);
     ret = 0;
 fail:
     qemu_opts_del(opts);
diff --git a/include/block/block_int.h b/include/block/block_int.h
index 760cfc5..31ce7d4 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -293,6 +293,9 @@  struct BlockDriverState {
     /* Whether produces zeros when read beyond eof */
     bool zero_beyond_eof;
 
+    /* Alignment requirement for offset/length of I/O requests */
+    unsigned int request_alignment;
+
     /* the block size for which the guest device expects atomicity */
     int guest_block_size;