Patchwork [25/25] raw-win32: implement native asynchronous I/O

login
register
mail settings
Submitter Paolo Bonzini
Date Oct. 26, 2012, 2:05 p.m.
Message ID <1351260355-19802-26-git-send-email-pbonzini@redhat.com>
Download mbox | patch
Permalink /patch/194519/
State New
Headers show

Comments

Paolo Bonzini - Oct. 26, 2012, 2:05 p.m.
With the new support for EventNotifiers in the AIO event loop, we
can hook a completion port to every opened file and use asynchronous
I/O on them.

Wine's support is extremely inefficient, also because it really does
the I/O synchronously on regular files. (!)  But it works, and it is
good to keep the Win32 and POSIX ports as similar as possible.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 block/Makefile.objs |   2 +-
 block/raw-aio.h     |  10 +++
 block/raw-win32.c   |  42 ++++++++--
 block/win32-aio.c   | 226 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 4 file modificati, 274 inserzioni(+), 6 rimozioni(-)
 create mode 100644 block/win32-aio.c

Patch

diff --git a/block/Makefile.objs b/block/Makefile.objs
index 771d341..30ef6ae 100644
--- a/block/Makefile.objs
+++ b/block/Makefile.objs
@@ -3,7 +3,7 @@  block-obj-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o qcow2-c
 block-obj-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o
 block-obj-y += qed-check.o
 block-obj-y += parallels.o blkdebug.o blkverify.o
-block-obj-$(CONFIG_WIN32) += raw-win32.o
+block-obj-$(CONFIG_WIN32) += raw-win32.o win32-aio.o
 block-obj-$(CONFIG_POSIX) += raw-posix.o
 block-obj-$(CONFIG_LINUX_AIO) += linux-aio.o
 
diff --git a/block/raw-aio.h b/block/raw-aio.h
index b3bb073..e77f361 100644
--- a/block/raw-aio.h
+++ b/block/raw-aio.h
@@ -35,4 +35,14 @@  BlockDriverAIOCB *laio_submit(BlockDriverState *bs, void *aio_ctx, int fd,
         BlockDriverCompletionFunc *cb, void *opaque, int type);
 #endif
 
+#ifdef _WIN32
+typedef struct QEMUWin32AIOState QEMUWin32AIOState;
+QEMUWin32AIOState *win32_aio_init(void);
+int win32_aio_attach(QEMUWin32AIOState *aio, HANDLE hfile);
+BlockDriverAIOCB *win32_aio_submit(BlockDriverState *bs,
+        QEMUWin32AIOState *aio, HANDLE hfile,
+        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+        BlockDriverCompletionFunc *cb, void *opaque, int type);
+#endif
+
 #endif /* QEMU_RAW_AIO_H */
diff --git a/block/raw-win32.c b/block/raw-win32.c
index ffd86e3..0c05c58 100644
--- a/block/raw-win32.c
+++ b/block/raw-win32.c
@@ -36,6 +36,8 @@ 
 #define FTYPE_CD     1
 #define FTYPE_HARDDISK 2
 
+static QEMUWin32AIOState *aio;
+
 typedef struct RawWin32AIOData {
     BlockDriverState *bs;
     HANDLE hfile;
@@ -50,6 +52,7 @@  typedef struct BDRVRawState {
     HANDLE hfile;
     int type;
     char drive_path[16]; /* format: "d:\" */
+    QEMUWin32AIOState *aio;
 } BDRVRawState;
 
 /*
@@ -208,6 +211,9 @@  static void raw_parse_flags(int flags, int *access_flags, DWORD *overlapped)
     }
 
     *overlapped = FILE_ATTRIBUTE_NORMAL;
+    if (flags & BDRV_O_NATIVE_AIO) {
+        *overlapped |= FILE_FLAG_OVERLAPPED;
+    }
     if (flags & BDRV_O_NOCACHE) {
         *overlapped |= FILE_FLAG_NO_BUFFERING;
     }
@@ -222,6 +228,13 @@  static int raw_open(BlockDriverState *bs, const char *filename, int flags)
     s->type = FTYPE_FILE;
 
     raw_parse_flags(flags, &access_flags, &overlapped);
+    
+    if ((flags & BDRV_O_NATIVE_AIO) && aio == NULL) {
+        aio = win32_aio_init();
+        if (aio == NULL) {
+            return -EINVAL;
+        }
+    }
 
     s->hfile = CreateFile(filename, access_flags,
                           FILE_SHARE_READ, NULL,
@@ -231,7 +244,16 @@  static int raw_open(BlockDriverState *bs, const char *filename, int flags)
 
         if (err == ERROR_ACCESS_DENIED)
             return -EACCES;
-        return -1;
+        return -EINVAL;
+    }
+
+    if (flags & BDRV_O_NATIVE_AIO) {
+        int ret = win32_aio_attach(aio, s->hfile);
+        if (ret < 0) {
+            CloseHandle(s->hfile);
+            return ret;
+        }
+        s->aio = aio;
     }
     return 0;
 }
@@ -241,8 +263,13 @@  static BlockDriverAIOCB *raw_aio_readv(BlockDriverState *bs,
                          BlockDriverCompletionFunc *cb, void *opaque)
 {
     BDRVRawState *s = bs->opaque;
-    return paio_submit(bs, s->hfile, sector_num, qiov, nb_sectors,
-                       cb, opaque, QEMU_AIO_READ);
+    if (s->aio) {
+        return win32_aio_submit(bs, s->aio, s->hfile, sector_num, qiov,
+                                nb_sectors, cb, opaque, QEMU_AIO_READ); 
+    } else {
+        return paio_submit(bs, s->hfile, sector_num, qiov, nb_sectors,
+                           cb, opaque, QEMU_AIO_READ);
+    }
 }
 
 static BlockDriverAIOCB *raw_aio_writev(BlockDriverState *bs,
@@ -250,8 +277,13 @@  static BlockDriverAIOCB *raw_aio_writev(BlockDriverState *bs,
                           BlockDriverCompletionFunc *cb, void *opaque)
 {
     BDRVRawState *s = bs->opaque;
-    return paio_submit(bs, s->hfile, sector_num, qiov, nb_sectors,
-                       cb, opaque, QEMU_AIO_WRITE);
+    if (s->aio) {
+        return win32_aio_submit(bs, s->aio, s->hfile, sector_num, qiov,
+                                nb_sectors, cb, opaque, QEMU_AIO_WRITE); 
+    } else {
+        return paio_submit(bs, s->hfile, sector_num, qiov, nb_sectors,
+                           cb, opaque, QEMU_AIO_WRITE);
+    }
 }
 
 static BlockDriverAIOCB *raw_aio_flush(BlockDriverState *bs,
diff --git a/block/win32-aio.c b/block/win32-aio.c
new file mode 100644
index 0000000..c34dc73
--- /dev/null
+++ b/block/win32-aio.c
@@ -0,0 +1,228 @@ 
+/*
+ * Native AIO support (win32)
+ *
+ * Copyright (c) 2012 Red Hat, Inc.
+ *
+ * Author: Paolo Bonzini <pbonzini@redhat.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include "qemu-common.h"
+#include "qemu-timer.h"
+#include "block_int.h"
+#include "module.h"
+#include "qemu-common.h"
+#include "qemu-aio.h"
+#include "raw-aio.h"
+#include "event_notifier.h"
+#include <windows.h>
+#include <winioctl.h>
+
+#define FTYPE_FILE 0
+#define FTYPE_CD     1
+#define FTYPE_HARDDISK 2
+
+struct QEMUWin32AIOState {
+    HANDLE hIOCP;
+    EventNotifier e;
+    int count;
+};
+
+typedef struct QEMUWin32AIOCB {
+    BlockDriverAIOCB common;
+    struct QEMUWin32AIOState *ctx;
+    int nbytes;
+    OVERLAPPED ov;
+    QEMUIOVector *qiov;
+    void *buf;
+    bool is_read;
+    bool is_linear;
+} QEMUWin32AIOCB;
+
+/*
+ * Completes an AIO request (calls the callback and frees the ACB).
+ */
+static void win32_aio_process_completion(QEMUWin32AIOState *s,
+    QEMUWin32AIOCB *waiocb, DWORD count)
+{
+    int ret;
+    s->count--;
+
+    if (waiocb->ov.Internal != 0) {
+        ret = -EIO;
+    } else {
+        ret = 0;
+        if (count < waiocb->nbytes) {
+            /* Short reads mean EOF, pad with zeros. */
+            if (waiocb->is_read) {
+                qemu_iovec_memset(waiocb->qiov, count, 0,
+                    waiocb->qiov->size - count);
+            } else {
+                ret = -EINVAL;
+            }
+       }
+    }
+
+    if (!waiocb->is_linear) {
+        if (ret == 0 && waiocb->is_read) {
+            QEMUIOVector *qiov = waiocb->qiov;
+            char *p = waiocb->buf;
+            int i;
+
+            for (i = 0; i < qiov->niov; ++i) {
+                memcpy(p, qiov->iov[i].iov_base, qiov->iov[i].iov_len);
+                p += qiov->iov[i].iov_len;
+            }
+            g_free(waiocb->buf);
+        }
+    }
+
+
+    waiocb->common.cb(waiocb->common.opaque, ret);
+    qemu_aio_release(waiocb);
+}
+
+static void win32_aio_completion_cb(EventNotifier *e)
+{
+    QEMUWin32AIOState *s = container_of(e, QEMUWin32AIOState, e);
+    DWORD count;
+    ULONG_PTR key;
+    OVERLAPPED *ov;
+
+    event_notifier_test_and_clear(&s->e);
+    while (GetQueuedCompletionStatus(s->hIOCP, &count, &key, &ov, 0)) {
+        QEMUWin32AIOCB *waiocb = container_of(ov, QEMUWin32AIOCB, ov);
+
+        win32_aio_process_completion(s, waiocb, count);
+    }
+}
+
+static int win32_aio_flush_cb(EventNotifier *e)
+{
+    QEMUWin32AIOState *s = container_of(e, QEMUWin32AIOState, e);
+
+    return (s->count > 0) ? 1 : 0;
+}
+
+static void win32_aio_cancel(BlockDriverAIOCB *blockacb)
+{
+    QEMUWin32AIOCB *waiocb = (QEMUWin32AIOCB *)blockacb;
+
+    /*
+     * CancelIoEx is only supported in Vista and newer.  For now, just
+     * wait for completion.
+     */
+    while (!HasOverlappedIoCompleted(&waiocb->ov)) {
+        qemu_aio_wait();
+    }
+}
+
+static AIOPool win32_aio_pool = {
+    .aiocb_size         = sizeof(QEMUWin32AIOCB),
+    .cancel             = win32_aio_cancel,
+};
+
+BlockDriverAIOCB *win32_aio_submit(BlockDriverState *bs,
+        QEMUWin32AIOState *aio, HANDLE hfile,
+        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+        BlockDriverCompletionFunc *cb, void *opaque, int type)
+{
+    struct QEMUWin32AIOCB *waiocb;
+    uint64_t offset = sector_num * 512;
+    DWORD rc;
+
+    waiocb = qemu_aio_get(&win32_aio_pool, bs, cb, opaque);
+    waiocb->nbytes = nb_sectors * 512;
+    waiocb->qiov = qiov;
+    waiocb->is_read = (type == QEMU_AIO_READ);
+
+    if (qiov->niov > 1) {
+        waiocb->buf = qemu_blockalign(bs, qiov->size);
+        if (type & QEMU_AIO_WRITE) {
+            char *p = waiocb->buf;
+            int i;
+
+            for (i = 0; i < qiov->niov; ++i) {
+                memcpy(p, qiov->iov[i].iov_base, qiov->iov[i].iov_len);
+                p += qiov->iov[i].iov_len;
+            }
+        }
+        waiocb->is_linear = false;
+    } else {
+        waiocb->buf = qiov->iov[0].iov_base;
+        waiocb->is_linear = true;
+    }
+
+    waiocb->ov = (OVERLAPPED) {
+        .Offset = (DWORD) offset,
+        .OffsetHigh = (DWORD) (offset >> 32),
+        .hEvent = event_notifier_get_handle(&aio->e)
+    };
+    aio->count++;
+
+    if (type & QEMU_AIO_READ) {
+        rc = ReadFile(hfile, waiocb->buf, waiocb->nbytes, NULL, &waiocb->ov);
+    } else {
+        rc = WriteFile(hfile, waiocb->buf, waiocb->nbytes, NULL, &waiocb->ov);
+    }
+    if(rc == 0 && GetLastError() != ERROR_IO_PENDING) {
+        goto out_dec_count;
+    }
+    return &waiocb->common;
+
+out_dec_count:
+    aio->count--;
+    qemu_aio_release(waiocb);
+    return NULL;
+}
+
+int win32_aio_attach(QEMUWin32AIOState *aio, HANDLE hfile)
+{
+    if (CreateIoCompletionPort(hfile, aio->hIOCP, (ULONG_PTR) 0, 0) == NULL) {
+        return -EINVAL;
+    } else {
+        return 0;
+    }
+}
+
+QEMUWin32AIOState *win32_aio_init(void)
+{
+    QEMUWin32AIOState *s;
+
+    s = g_malloc0(sizeof(*s));
+    if (event_notifier_init(&s->e, false) < 0) {
+        goto out_free_state;
+    }
+
+    s->hIOCP = CreateIoCompletionPort(INVALID_HANDLE_VALUE, NULL, 0, 0);
+    if (s->hIOCP == NULL) {
+        goto out_close_efd;
+    }
+
+    qemu_aio_set_event_notifier(&s->e, win32_aio_completion_cb,
+                                win32_aio_flush_cb);
+
+    return s;
+
+out_close_efd:
+    event_notifier_cleanup(&s->e);
+out_free_state:
+    g_free(s);
+    return NULL;
+}