Patchwork [2/2] raw-posix: add Linux native AIO support

login
register
mail settings
Submitter Christoph Hellwig
Date Aug. 20, 2009, 2:58 p.m.
Message ID <20090820145835.GB24183@lst.de>
Download mbox | patch
Permalink /patch/31740/
State Superseded
Headers show

Comments

Christoph Hellwig - Aug. 20, 2009, 2:58 p.m.
Now that we have a nicer interface to work against we can add Linux native
AIO support.  It's an extremely thin layer, just setting up an iocb for
the io_submit system call in the submission path, and registering an
eventfd with the qemu poll handler to complete the iocbs directly
from there.

This started out based on Anthony's earlier AIO patch, but after
an estimated 42,000 rewrites and just as many build system changes
there's not much left of it.

To enable native kernel aio use the aio=native sub-command on the
drive command line.  I have also added an option to qemu-io to
test the aio support without needing a guest.


Signed-off-by: Christoph Hellwig <hch@lst.de>
Avi Kivity - Aug. 21, 2009, 9:53 a.m.
On 08/20/2009 05:58 PM, Christoph Hellwig wrote:
> Now that do have a nicer interface to work against we can add Linux native
> AIO support.  It's an extremly thing layer just setting up an iocb for
> the io_submit system call in the submission path, and registering an
> eventfd with the qemu poll handler to do complete the iocbs directly
> from there.
>
> This started out based on Anthony's earlier AIO patch, but after
> estimated 42,000 rewrites and just as many build system changes
> there's not much left of it.
>
> To enable native kernel aio use the aio=native sub-command on the
> drive command line.  I have also added an option to qemu-io to
> test the aio support without needing a guest.
>
>
> Signed-off-by: Christoph Hellwig<hch@lst.de>
>
> Index: qemu/Makefile
> ===================================================================
> --- qemu.orig/Makefile	2009-08-19 22:49:08.789354196 -0300
> +++ qemu/Makefile	2009-08-19 22:51:25.293352541 -0300
> @@ -56,6 +56,7 @@ recurse-all: $(SUBDIR_RULES) $(ROMSUBDIR
>   block-obj-y = cutils.o cache-utils.o qemu-malloc.o qemu-option.o module.o
>   block-obj-y += nbd.o block.o aio.o aes.o
>   block-obj-$(CONFIG_POSIX) += posix-aio-compat.o
> +block-obj-$(CONFIG_LINUX_AIO) += linux-aio.o
>
>   block-nested-y += cow.o qcow.o vdi.o vmdk.o cloop.o dmg.o bochs.o vpc.o vvfat.o
>   block-nested-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o
> Index: qemu/block/raw-posix.c
> ===================================================================
> --- qemu.orig/block/raw-posix.c	2009-08-19 22:49:08.793352540 -0300
> +++ qemu/block/raw-posix.c	2009-08-19 23:00:21.157402768 -0300
> @@ -115,6 +115,7 @@ typedef struct BDRVRawState {
>       int fd_got_error;
>       int fd_media_changed;
>   #endif
> +    int use_aio;
>       uint8_t* aligned_buf;
>   } BDRVRawState;
>
> @@ -159,6 +160,7 @@ static int raw_open_common(BlockDriverSt
>       }
>       s->fd = fd;
>       s->aligned_buf = NULL;
> +
>       if ((bdrv_flags&  BDRV_O_NOCACHE)) {
>           s->aligned_buf = qemu_blockalign(bs, ALIGNED_BUFFER_SIZE);
>           if (s->aligned_buf == NULL) {
> @@ -166,9 +168,22 @@ static int raw_open_common(BlockDriverSt
>           }
>       }
>
> -    s->aio_ctx = paio_init();
> -    if (!s->aio_ctx) {
> -        goto out_free_buf;
> +#ifdef CONFIG_LINUX_AIO
> +    if ((bdrv_flags&  (BDRV_O_NOCACHE|BDRV_O_NATIVE_AIO)) ==
> +                      (BDRV_O_NOCACHE|BDRV_O_NATIVE_AIO)) {
> +        s->aio_ctx = laio_init();
> +        if (!s->aio_ctx) {
> +            goto out_free_buf;
> +        }
> +        s->use_aio = 1;
> +    } else
> +#endif
> +    {
> +        s->aio_ctx = paio_init();
> +        if (!s->aio_ctx) {
> +            goto out_free_buf;
> +        }
> +        s->use_aio = 0;
>       }
>
>       return 0;
> @@ -524,8 +539,13 @@ static BlockDriverAIOCB *raw_aio_submit(
>        * boundary.  Check if this is the case or telll the low-level
>        * driver that it needs to copy the buffer.
>        */
> -    if (s->aligned_buf&&  !qiov_is_aligned(qiov)) {
> -        type |= QEMU_AIO_MISALIGNED;
> +    if (s->aligned_buf) {
> +        if (!qiov_is_aligned(qiov)) {
> +            type |= QEMU_AIO_MISALIGNED;
> +        } else if (s->use_aio) {
> +            return laio_submit(bs, s->aio_ctx, s->fd, sector_num, qiov,
> +	                       nb_sectors, cb, opaque, type);
> +        }
>       }
>
>       return paio_submit(bs, s->aio_ctx, s->fd, sector_num, qiov, nb_sectors,
> Index: qemu/configure
> ===================================================================
> --- qemu.orig/configure	2009-08-19 22:49:08.801352719 -0300
> +++ qemu/configure	2009-08-19 22:51:25.305393736 -0300
> @@ -197,6 +197,7 @@ build_docs="yes"
>   uname_release=""
>   curses="yes"
>   curl="yes"
> +linux_aio="yes"
>   io_thread="no"
>   nptl="yes"
>   mixemu="no"
> @@ -499,6 +500,8 @@ for opt do
>     ;;
>     --enable-mixemu) mixemu="yes"
>     ;;
> +  --disable-linux-aio) linux_aio="no"
> +  ;;
>     --enable-io-thread) io_thread="yes"
>     ;;
>     --disable-blobs) blobs="no"
> @@ -636,6 +639,7 @@ echo "  --oss-lib                path to
>   echo "  --enable-uname-release=R Return R for uname -r in usermode emulation"
>   echo "  --sparc_cpu=V            Build qemu for Sparc architecture v7, v8, v8plus, v8plusa, v9"
>   echo "  --disable-vde            disable support for vde network"
> +echo "  --disable-linux-aio      disable Linux AIO support"
>   echo "  --enable-io-thread       enable IO thread"
>   echo "  --disable-blobs          disable installing provided firmware blobs"
>   echo "  --kerneldir=PATH         look for kernel includes in PATH"
> @@ -1197,6 +1201,23 @@ if test "$pthread" = no; then
>   fi
>
>   ##########################################
> +# linux-aio probe
> +AIOLIBS=""
> +
> +if test "$linux_aio" = "yes" ; then
> +    linux_aio=no
> +    cat>  $TMPC<<EOF
> +#include<libaio.h>
> +#include<sys/eventfd.h>
> +int main(void) { io_setup(0, NULL); io_set_eventfd(NULL, 0); eventfd(0, 0); return 0; }
> +EOF
> +    if compile_prog "" "-laio" ; then
> +        linux_aio=yes
> +        LIBS="$LIBS -laio"
> +    fi
> +fi
> +
> +##########################################
>   # iovec probe
>   cat>  $TMPC<<EOF
>   #include<sys/types.h>
> @@ -1527,6 +1548,7 @@ echo "NPTL support      $nptl"
>   echo "GUEST_BASE        $guest_base"
>   echo "vde support       $vde"
>   echo "IO thread         $io_thread"
> +echo "Linux AIO support $linux_aio"
>   echo "Install blobs     $blobs"
>   echo -e "KVM support       $kvm"
>   echo "fdt support       $fdt"
> @@ -1700,6 +1722,9 @@ fi
>   if test "$io_thread" = "yes" ; then
>     echo "CONFIG_IOTHREAD=y">>  $config_host_mak
>   fi
> +if test "$linux_aio" = "yes" ; then
> +  echo "CONFIG_LINUX_AIO=y">>  $config_host_mak
> +fi
>   if test "$blobs" = "yes" ; then
>     echo "INSTALL_BLOBS=yes">>  $config_host_mak
>   fi
> Index: qemu/linux-aio.c
> ===================================================================
> --- /dev/null	1970-01-01 00:00:00.000000000 +0000
> +++ qemu/linux-aio.c	2009-08-20 10:54:10.924375300 -0300
> @@ -0,0 +1,204 @@
> +/*
> + * Linux native AIO support.
> + *
> + * Copyright (C) 2009 IBM, Corp.
> + * Copyright (C) 2009 Red Hat, Inc.
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2 or later.
> + * See the COPYING file in the top-level directory.
> + */
> +#include "qemu-common.h"
> +#include "qemu-aio.h"
> +#include "block_int.h"
> +#include "block/raw-posix-aio.h"
> +
> +#include<sys/eventfd.h>
> +#include<libaio.h>
> +
> +/*
> + * Queue size (per-device).
> + *
> + * XXX: eventually we need to communicate this to the guest and/or make it
> + *      tunable by the guest.  If we get more outstanding requests at a time
> + *      than this we will get EAGAIN from io_submit which is communicated to
> + *      the guest as an I/O error.
> + */
> +#define MAX_EVENTS 128
>    

Or, we could queue any extra requests.

> +
> +
> +void *laio_init(void)
> +{
> +    struct qemu_laio_state *s;
> +
> +    s = qemu_mallocz(sizeof(*s));
> +    s->efd = eventfd(0, 0);
> +    if (s->efd == -1)
> +        goto out_free_state;
> +    fcntl(s->efd, F_SETFL, O_NONBLOCK);
> +
> +    if (io_setup(MAX_EVENTS,&s->ctx) != 0)
> +        goto out_close_efd;
> +
>    

One day we may want a global io context so we can dequeue many events 
with one syscall.  Or we may not, if we thread these things.
Christoph Hellwig - Aug. 21, 2009, 2:48 p.m.
On Fri, Aug 21, 2009 at 12:53:49PM +0300, Avi Kivity wrote:
> >+ * Queue size (per-device).
> >+ *
> >+ * XXX: eventually we need to communicate this to the guest and/or make it
> >+ *      tunable by the guest.  If we get more outstanding requests at a 
> >time
> >+ *      than this we will get EAGAIN from io_submit which is communicated 
> >to
> >+ *      the guest as an I/O error.
> >+ */
> >+#define MAX_EVENTS 128
> >   
> 
> Or, we could queue any extra requests.

That doesn't make much sense.  We'd just add another level of
queueing on top of the already optimized implementations in the
guest and host kernels.  This is really just an issue of communicating
the limits we have and dealing with them efficiently.  It should be a
relatively small add-on patch.

> >+    if (io_setup(MAX_EVENTS,&s->ctx) != 0)
> >+        goto out_close_efd;
> >+
> >   
> 
> One day we may want a global io context so we can dequeue many events 
> with one syscall.  Or we may not, if we thread these things.

We could do this easily; in fact that's what I did before I ran into
issues with the completion queue size when using multiple devices.

Syscall overhead in Linux is small enough that I would not bother until
it actually shows up as a problem.  That being said, threading the block
layer would probably be a benefit for large setups for various reasons.
Avi Kivity - Aug. 21, 2009, 3:35 p.m.
On 08/21/2009 05:48 PM, Christoph Hellwig wrote:
> On Fri, Aug 21, 2009 at 12:53:49PM +0300, Avi Kivity wrote:
>    
>>> + * Queue size (per-device).
>>> + *
>>> + * XXX: eventually we need to communicate this to the guest and/or make it
>>> + *      tunable by the guest.  If we get more outstanding requests at a
>>> time
>>> + *      than this we will get EAGAIN from io_submit which is communicated
>>> to
>>> + *      the guest as an I/O error.
>>> + */
>>> +#define MAX_EVENTS 128
>>>
>>>        
>> Or, we could queue any extra requests.
>>      
> That doesn't make much sense.    We'd just do an additional level of
> queueing in addition to those already optimized implementation in the
> guest and host kernels.  This is really just an issue of communicating
> the limits we have and deal with it efficiently.  It should be a
> relatively small add-on patch.
>    

You're right, virtio and scsi already know their queue sizes, should be 
easy to pass it down the stack.

Patch

Index: qemu/Makefile
===================================================================
--- qemu.orig/Makefile	2009-08-19 22:49:08.789354196 -0300
+++ qemu/Makefile	2009-08-19 22:51:25.293352541 -0300
@@ -56,6 +56,7 @@  recurse-all: $(SUBDIR_RULES) $(ROMSUBDIR
 block-obj-y = cutils.o cache-utils.o qemu-malloc.o qemu-option.o module.o
 block-obj-y += nbd.o block.o aio.o aes.o
 block-obj-$(CONFIG_POSIX) += posix-aio-compat.o
+block-obj-$(CONFIG_LINUX_AIO) += linux-aio.o
 
 block-nested-y += cow.o qcow.o vdi.o vmdk.o cloop.o dmg.o bochs.o vpc.o vvfat.o
 block-nested-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o
Index: qemu/block/raw-posix.c
===================================================================
--- qemu.orig/block/raw-posix.c	2009-08-19 22:49:08.793352540 -0300
+++ qemu/block/raw-posix.c	2009-08-19 23:00:21.157402768 -0300
@@ -115,6 +115,7 @@  typedef struct BDRVRawState {
     int fd_got_error;
     int fd_media_changed;
 #endif
+    int use_aio;
     uint8_t* aligned_buf;
 } BDRVRawState;
 
@@ -159,6 +160,7 @@  static int raw_open_common(BlockDriverSt
     }
     s->fd = fd;
     s->aligned_buf = NULL;
+
     if ((bdrv_flags & BDRV_O_NOCACHE)) {
         s->aligned_buf = qemu_blockalign(bs, ALIGNED_BUFFER_SIZE);
         if (s->aligned_buf == NULL) {
@@ -166,9 +168,22 @@  static int raw_open_common(BlockDriverSt
         }
     }
 
-    s->aio_ctx = paio_init();
-    if (!s->aio_ctx) {
-        goto out_free_buf;
+#ifdef CONFIG_LINUX_AIO
+    if ((bdrv_flags & (BDRV_O_NOCACHE|BDRV_O_NATIVE_AIO)) ==
+                      (BDRV_O_NOCACHE|BDRV_O_NATIVE_AIO)) {
+        s->aio_ctx = laio_init();
+        if (!s->aio_ctx) {
+            goto out_free_buf;
+        }
+        s->use_aio = 1;
+    } else
+#endif
+    {
+        s->aio_ctx = paio_init();
+        if (!s->aio_ctx) {
+            goto out_free_buf;
+        }
+        s->use_aio = 0;
     }
 
     return 0;
@@ -524,8 +539,13 @@  static BlockDriverAIOCB *raw_aio_submit(
      * boundary.  Check if this is the case or telll the low-level
      * driver that it needs to copy the buffer.
      */
-    if (s->aligned_buf && !qiov_is_aligned(qiov)) {
-        type |= QEMU_AIO_MISALIGNED;
+    if (s->aligned_buf) {
+        if (!qiov_is_aligned(qiov)) {
+            type |= QEMU_AIO_MISALIGNED;
+        } else if (s->use_aio) {
+            return laio_submit(bs, s->aio_ctx, s->fd, sector_num, qiov,
+	                       nb_sectors, cb, opaque, type);
+        }
     }
 
     return paio_submit(bs, s->aio_ctx, s->fd, sector_num, qiov, nb_sectors,
Index: qemu/configure
===================================================================
--- qemu.orig/configure	2009-08-19 22:49:08.801352719 -0300
+++ qemu/configure	2009-08-19 22:51:25.305393736 -0300
@@ -197,6 +197,7 @@  build_docs="yes"
 uname_release=""
 curses="yes"
 curl="yes"
+linux_aio="yes"
 io_thread="no"
 nptl="yes"
 mixemu="no"
@@ -499,6 +500,8 @@  for opt do
   ;;
   --enable-mixemu) mixemu="yes"
   ;;
+  --disable-linux-aio) linux_aio="no"
+  ;;
   --enable-io-thread) io_thread="yes"
   ;;
   --disable-blobs) blobs="no"
@@ -636,6 +639,7 @@  echo "  --oss-lib                path to
 echo "  --enable-uname-release=R Return R for uname -r in usermode emulation"
 echo "  --sparc_cpu=V            Build qemu for Sparc architecture v7, v8, v8plus, v8plusa, v9"
 echo "  --disable-vde            disable support for vde network"
+echo "  --disable-linux-aio      disable Linux AIO support"
 echo "  --enable-io-thread       enable IO thread"
 echo "  --disable-blobs          disable installing provided firmware blobs"
 echo "  --kerneldir=PATH         look for kernel includes in PATH"
@@ -1197,6 +1201,23 @@  if test "$pthread" = no; then
 fi
 
 ##########################################
+# linux-aio probe
+AIOLIBS=""
+
+if test "$linux_aio" = "yes" ; then
+    linux_aio=no
+    cat > $TMPC <<EOF
+#include <libaio.h>
+#include <sys/eventfd.h>
+int main(void) { io_setup(0, NULL); io_set_eventfd(NULL, 0); eventfd(0, 0); return 0; }
+EOF
+    if compile_prog "" "-laio" ; then
+        linux_aio=yes
+        LIBS="$LIBS -laio"
+    fi
+fi
+
+##########################################
 # iovec probe
 cat > $TMPC <<EOF
 #include <sys/types.h>
@@ -1527,6 +1548,7 @@  echo "NPTL support      $nptl"
 echo "GUEST_BASE        $guest_base"
 echo "vde support       $vde"
 echo "IO thread         $io_thread"
+echo "Linux AIO support $linux_aio"
 echo "Install blobs     $blobs"
 echo -e "KVM support       $kvm"
 echo "fdt support       $fdt"
@@ -1700,6 +1722,9 @@  fi
 if test "$io_thread" = "yes" ; then
   echo "CONFIG_IOTHREAD=y" >> $config_host_mak
 fi
+if test "$linux_aio" = "yes" ; then
+  echo "CONFIG_LINUX_AIO=y" >> $config_host_mak
+fi
 if test "$blobs" = "yes" ; then
   echo "INSTALL_BLOBS=yes" >> $config_host_mak
 fi
Index: qemu/linux-aio.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ qemu/linux-aio.c	2009-08-20 10:54:10.924375300 -0300
@@ -0,0 +1,204 @@ 
+/*
+ * Linux native AIO support.
+ *
+ * Copyright (C) 2009 IBM, Corp.
+ * Copyright (C) 2009 Red Hat, Inc.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+#include "qemu-common.h"
+#include "qemu-aio.h"
+#include "block_int.h"
+#include "block/raw-posix-aio.h"
+
+#include <sys/eventfd.h>
+#include <libaio.h>
+
+/*
+ * Queue size (per-device).
+ *
+ * XXX: eventually we need to communicate this to the guest and/or make it
+ *      tunable by the guest.  If we get more outstanding requests at a time
+ *      than this we will get EAGAIN from io_submit which is communicated to
+ *      the guest as an I/O error.
+ */
+#define MAX_EVENTS 128
+
+struct qemu_laiocb {    /* one in-flight request; allocated from laio_pool */
+    BlockDriverAIOCB common;
+    struct qemu_laio_state *ctx;    /* state the request was submitted on */
+    struct iocb iocb;               /* control block handed to io_submit() */
+    ssize_t ret;                    /* -EINPROGRESS until completed/cancelled */
+    size_t nbytes;                  /* expected transfer: nb_sectors * 512 */
+};
+
+struct qemu_laio_state {    /* created by laio_init() */
+    io_context_t ctx;       /* AIO context set up with io_setup() */
+    int efd;                /* eventfd attached to each iocb on submission */
+    int count;              /* ++ on submit, -- per completion event reaped */
+};
+
+static inline ssize_t io_event_ret(struct io_event *ev)
+{   /* One signed result: res2 shifted into the upper 32 bits, OR-ed with res. */
+    return (ssize_t)(((uint64_t)ev->res2 << 32) | ev->res);
+}
+
+static void qemu_laio_completion_cb(void *opaque)
+{
+    struct qemu_laio_state *s = opaque;
+    /* eventfd handler: reap finished iocbs and invoke their callbacks. */
+    while (1) {    /* loop until the non-blocking eventfd reads EAGAIN */
+        struct io_event events[MAX_EVENTS];
+        uint64_t val;
+        ssize_t ret;
+        struct timespec ts = { 0 };
+        int nevents, i;
+
+        do {    /* read() signals failure with -1; retry when interrupted */
+            ret = read(s->efd, &val, sizeof(val));
+        } while (ret == -1 && errno == EINTR);
+
+        if (ret == -1 && errno == EAGAIN)
+            break;
+
+        if (ret != 8)    /* anything but a full 8-byte counter is unusable */
+            break;
+
+        do {
+            nevents = io_getevents(s->ctx, val, MAX_EVENTS, events, &ts);
+        } while (nevents == -EINTR);
+
+        for (i = 0; i < nevents; i++) {
+            struct iocb *iocb = events[i].obj;
+            struct qemu_laiocb *laiocb =
+                    container_of(iocb, struct qemu_laiocb, iocb);
+
+            s->count--;
+
+            ret = laiocb->ret = io_event_ret(&events[i]);
+            if (ret != -ECANCELED) {    /* cancelled requests get no callback */
+                if (ret == laiocb->nbytes)
+                    ret = 0;
+                else if (ret >= 0)      /* short transfer is reported as error */
+                    ret = -EINVAL;
+
+                laiocb->common.cb(laiocb->common.opaque, ret);
+            }
+
+            qemu_aio_release(laiocb);
+        }
+    }
+}
+
+static int qemu_laio_flush_cb(void *opaque)
+{   /* Handed to qemu_aio_set_fd_handler() in laio_init(); 1 while count > 0. */
+    struct qemu_laio_state *s = opaque;
+
+    return (s->count > 0) ? 1 : 0;
+}
+
+static void laio_cancel(BlockDriverAIOCB *blockacb)
+{   /* .cancel hook of laio_pool: cancel the request or wait for it to end. */
+    struct qemu_laiocb *laiocb = (struct qemu_laiocb *)blockacb;
+    struct io_event event;
+    int ret;
+
+    if (laiocb->ret != -EINPROGRESS)    /* no longer in flight: nothing to do */
+        return;
+
+    /*
+     * Note that as of Linux 2.6.31 neither the block device code nor any
+     * filesystem implements cancellation of AIO request.
+     * Thus the polling loop below is the normal code path.
+     */
+    ret = io_cancel(laiocb->ctx->ctx, &laiocb->iocb, &event);
+    if (ret == 0) {
+        laiocb->ret = -ECANCELED;
+        return;
+    }
+
+    /*
+     * We have to wait for the iocb to finish.
+     *
+     * The only way to get the iocb status update is by polling the io context.
+     * We might be able to do this slightly more optimal by removing the
+     * O_NONBLOCK flag.
+     */
+    while (laiocb->ret == -EINPROGRESS)    /* completion cb overwrites ret */
+        qemu_laio_completion_cb(laiocb->ctx);
+}
+
+static AIOPool laio_pool = {    /* pool laio_submit() takes its aiocbs from */
+    .aiocb_size         = sizeof(struct qemu_laiocb),
+    .cancel             = laio_cancel,
+};
+
+BlockDriverAIOCB *laio_submit(BlockDriverState *bs, void *aio_ctx, int fd,
+        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+        BlockDriverCompletionFunc *cb, void *opaque, int type)
+{
+    struct qemu_laio_state *s = aio_ctx;
+    struct qemu_laiocb *laiocb;
+    struct iocb *iocbs;
+    off_t offset = sector_num * 512;
+    /* Submit one vectored read/write via io_submit; returns NULL on failure. */
+    laiocb = qemu_aio_get(&laio_pool, bs, cb, opaque);
+    if (!laiocb)
+        return NULL;
+    laiocb->nbytes = nb_sectors * 512;
+    laiocb->ctx = s;
+    laiocb->ret = -EINPROGRESS;
+
+    iocbs = &laiocb->iocb;
+
+    switch (type) {
+    case QEMU_AIO_WRITE:
+        io_prep_pwritev(iocbs, fd, qiov->iov, qiov->niov, offset);
+        break;
+    case QEMU_AIO_READ:
+        io_prep_preadv(iocbs, fd, qiov->iov, qiov->niov, offset);
+        break;
+    default:
+        fprintf(stderr, "%s: invalid AIO request type 0x%x.\n",
+                        __func__, type);
+        goto out_free_aiocb;    /* s->count has not been incremented yet */
+    }
+    io_set_eventfd(&laiocb->iocb, s->efd);
+    s->count++;
+
+    if (io_submit(s->ctx, 1, &iocbs) < 0)
+        goto out_dec_count;
+    return &laiocb->common;
+
+out_dec_count:      /* io_submit failed: undo the s->count++ above */
+    s->count--;
+out_free_aiocb:     /* every failure path returns the aiocb to the pool */
+    qemu_aio_release(laiocb);
+    return NULL;
+}
+
+void *laio_init(void)
+{   /* Create the eventfd and AIO context; returns the state or NULL on error. */
+    struct qemu_laio_state *s;
+
+    s = qemu_mallocz(sizeof(*s));
+    s->efd = eventfd(0, 0);
+    if (s->efd == -1)
+        goto out_free_state;
+    fcntl(s->efd, F_SETFL, O_NONBLOCK);    /* completion cb reads until EAGAIN */
+
+    if (io_setup(MAX_EVENTS, &s->ctx) != 0)
+        goto out_close_efd;
+
+    qemu_aio_set_fd_handler(s->efd, qemu_laio_completion_cb,
+                            NULL, qemu_laio_flush_cb, s);
+
+    return s;
+
+out_close_efd:
+    close(s->efd);
+out_free_state:
+    qemu_free(s);
+    return NULL;
+}
Index: qemu/block/raw-posix-aio.h
===================================================================
--- qemu.orig/block/raw-posix-aio.h	2009-08-19 22:49:08.797353398 -0300
+++ qemu/block/raw-posix-aio.h	2009-08-19 22:51:25.313401597 -0300
@@ -33,4 +33,10 @@  BlockDriverAIOCB *paio_ioctl(BlockDriver
         unsigned long int req, void *buf,
         BlockDriverCompletionFunc *cb, void *opaque);
 
+/* linux-aio.c - Linux native implementation */
+void *laio_init(void);
+BlockDriverAIOCB *laio_submit(BlockDriverState *bs, void *aio_ctx, int fd,
+        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+        BlockDriverCompletionFunc *cb, void *opaque, int type);
+
 #endif /* QEMU_RAW_POSIX_AIO_H */
Index: qemu/block.h
===================================================================
--- qemu.orig/block.h	2009-08-19 22:49:08.809352828 -0300
+++ qemu/block.h	2009-08-19 22:51:25.317384576 -0300
@@ -37,6 +37,7 @@  typedef struct QEMUSnapshotInfo {
                                      bdrv_file_open()) */
 #define BDRV_O_NOCACHE     0x0020 /* do not use the host page cache */
 #define BDRV_O_CACHE_WB    0x0040 /* use write-back caching */
+#define BDRV_O_NATIVE_AIO  0x0080 /* use native AIO instead of the thread pool */
 
 #define BDRV_O_CACHE_MASK  (BDRV_O_NOCACHE | BDRV_O_CACHE_WB)
 
Index: qemu/qemu-options.hx
===================================================================
--- qemu.orig/qemu-options.hx	2009-08-19 22:49:08.817352727 -0300
+++ qemu/qemu-options.hx	2009-08-19 22:51:25.321383686 -0300
@@ -95,7 +95,7 @@  DEF("drive", HAS_ARG, QEMU_OPTION_drive,
     "-drive [file=file][,if=type][,bus=n][,unit=m][,media=d][,index=i]\n"
     "       [,cyls=c,heads=h,secs=s[,trans=t]][,snapshot=on|off]\n"
     "       [,cache=writethrough|writeback|none][,format=f][,serial=s]\n"
-    "       [,addr=A][,id=name]\n"
+    "       [,addr=A][,id=name][,aio=threads|native]\n"
     "                use 'file' as a drive image\n")
 DEF("set", HAS_ARG, QEMU_OPTION_set,
     "-set group.id.arg=value\n"
@@ -128,6 +128,8 @@  These options have the same definition a
 @var{snapshot} is "on" or "off" and allows to enable snapshot for given drive (see @option{-snapshot}).
 @item cache=@var{cache}
 @var{cache} is "none", "writeback", or "writethrough" and controls how the host cache is used to access block data.
+@item aio=@var{aio}
+@var{aio} is "threads" or "native" and selects between pthread-based disk I/O and native Linux AIO.
 @item format=@var{format}
 Specify which disk @var{format} will be used rather than detecting
 the format.  Can be used to specifiy format=raw to avoid interpreting
Index: qemu/vl.c
===================================================================
--- qemu.orig/vl.c	2009-08-19 22:49:08.821354562 -0300
+++ qemu/vl.c	2009-08-19 22:51:25.325352976 -0300
@@ -1921,6 +1921,7 @@  DriveInfo *drive_init(QemuOpts *opts, vo
     int max_devs;
     int index;
     int cache;
+    int aio = 0;
     int bdrv_flags, onerror;
     const char *devaddr;
     DriveInfo *dinfo;
@@ -2054,6 +2055,19 @@  DriveInfo *drive_init(QemuOpts *opts, vo
         }
     }
 
+#ifdef CONFIG_LINUX_AIO
+    if ((buf = qemu_opt_get(opts, "aio")) != NULL) {
+        if (!strcmp(buf, "threads"))
+            aio = 0;
+        else if (!strcmp(buf, "native"))
+            aio = 1;
+        else {
+           fprintf(stderr, "qemu: invalid aio option\n");
+           return NULL;
+        }
+    }
+#endif
+
     if ((buf = qemu_opt_get(opts, "format")) != NULL) {
        if (strcmp(buf, "?") == 0) {
             fprintf(stderr, "qemu: Supported formats:");
@@ -2223,11 +2237,19 @@  DriveInfo *drive_init(QemuOpts *opts, vo
         bdrv_flags |= BDRV_O_NOCACHE;
     else if (cache == 2) /* write-back */
         bdrv_flags |= BDRV_O_CACHE_WB;
+
+    if (aio == 1) {
+        bdrv_flags |= BDRV_O_NATIVE_AIO;
+    } else {
+        bdrv_flags &= ~BDRV_O_NATIVE_AIO;
+    }
+
     if (bdrv_open2(dinfo->bdrv, file, bdrv_flags, drv) < 0) {
         fprintf(stderr, "qemu: could not open disk image %s\n",
                         file);
         return NULL;
     }
+
     if (bdrv_key_required(dinfo->bdrv))
         autostart = 0;
     *fatal_error = 0;
Index: qemu/qemu-config.c
===================================================================
--- qemu.orig/qemu-config.c	2009-08-19 22:49:08.825352416 -0300
+++ qemu/qemu-config.c	2009-08-19 22:51:25.333383955 -0300
@@ -53,6 +53,10 @@  QemuOptsList qemu_drive_opts = {
             .type = QEMU_OPT_STRING,
             .help = "host cache usage (none, writeback, writethrough)",
         },{
+            .name = "aio",
+            .type = QEMU_OPT_STRING,
+            .help = "host AIO implementation (threads, native)",
+        },{
             .name = "format",
             .type = QEMU_OPT_STRING,
             .help = "disk format (raw, qcow2, ...)",
Index: qemu/block.c
===================================================================
--- qemu.orig/block.c	2009-08-19 22:58:58.421381858 -0300
+++ qemu/block.c	2009-08-19 22:59:39.033439876 -0300
@@ -411,7 +411,8 @@  int bdrv_open2(BlockDriverState *bs, con
     /* Note: for compatibility, we open disk image files as RDWR, and
        RDONLY as fallback */
     if (!(flags & BDRV_O_FILE))
-        open_flags = BDRV_O_RDWR | (flags & BDRV_O_CACHE_MASK);
+        open_flags = BDRV_O_RDWR |
+		(flags & (BDRV_O_CACHE_MASK|BDRV_O_NATIVE_AIO));
     else
         open_flags = flags & ~(BDRV_O_FILE | BDRV_O_SNAPSHOT);
     ret = drv->bdrv_open(bs, filename, open_flags);
Index: qemu/qemu-io.c
===================================================================
--- qemu.orig/qemu-io.c	2009-08-20 10:41:09.047691604 -0300
+++ qemu/qemu-io.c	2009-08-20 10:57:29.753487097 -0300
@@ -1401,6 +1401,7 @@  static void usage(const char *name)
 "  -n, --nocache        disable host cache\n"
 "  -g, --growable       allow file to grow (only applies to protocols)\n"
 "  -m, --misalign       misalign allocations for O_DIRECT\n"
+"  -k, --native-aio     use kernel AIO implementation (on Linux only)\n"
 "  -h, --help           display this help and exit\n"
 "  -V, --version        output version information and exit\n"
 "\n",
@@ -1412,7 +1413,7 @@  int main(int argc, char **argv)
 {
 	int readonly = 0;
 	int growable = 0;
-	const char *sopt = "hVc:Crsnmg";
+	const char *sopt = "hVc:Crsnmgk";
 	struct option lopt[] = {
 		{ "help", 0, NULL, 'h' },
 		{ "version", 0, NULL, 'V' },
@@ -1424,6 +1425,7 @@  int main(int argc, char **argv)
 		{ "nocache", 0, NULL, 'n' },
 		{ "misalign", 0, NULL, 'm' },
 		{ "growable", 0, NULL, 'g' },
+		{ "native-aio", 0, NULL, 'k' },
 		{ NULL, 0, NULL, 0 }
 	};
 	int c;
@@ -1455,6 +1457,9 @@  int main(int argc, char **argv)
 		case 'g':
 			growable = 1;
 			break;
+		case 'k':
+			flags |= BDRV_O_NATIVE_AIO;
+			break;
 		case 'V':
 			printf("%s version %s\n", progname, VERSION);
 			exit(0);