
[v4,8/8] linux-aio: share one LinuxAioState within an AioContext

Message ID 1460046816-102846-9-git-send-email-pbonzini@redhat.com

Commit Message

Paolo Bonzini April 7, 2016, 4:33 p.m. UTC
This has better performance because it executes fewer system calls
and does not use a bottom half per disk.

Originally proposed by Ming Lei.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 async.c                            |  23 +++++++
 block/linux-aio.c                  |   3 +
 block/raw-posix.c                  | 119 +++++--------------------------------
 block/raw-win32.c                  |   2 +-
 include/block/aio.h                |  13 ++++
 {block => include/block}/raw-aio.h |   0
 6 files changed, 54 insertions(+), 106 deletions(-)
 rename {block => include/block}/raw-aio.h (100%)

Comments

Stefan Hajnoczi April 19, 2016, 9:09 a.m. UTC | #1
On Thu, Apr 07, 2016 at 06:33:36PM +0200, Paolo Bonzini wrote:
> This has better performance because it executes fewer system calls
> and does not use a bottom half per disk.

Each aio_context_t is initialized for 128 in-flight requests in
laio_init().
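
The 128 comes from the io_setup() call there; simplified from
block/linux-aio.c with error paths elided:

  #define MAX_EVENTS 128

  LinuxAioState *laio_init(void)
  {
      LinuxAioState *s = g_malloc0(sizeof(*s));

      /* one kernel AIO context sized for 128 in-flight requests */
      if (io_setup(MAX_EVENTS, &s->ctx) != 0) {
          ...
      }
      return s;
  }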

Will it be possible to hit the limit now that all drives share the same
aio_context_t?

Stefan
Paolo Bonzini May 9, 2016, 4:31 p.m. UTC | #2
On 19/04/2016 11:09, Stefan Hajnoczi wrote:
>> > This has better performance because it executes fewer system calls
>> > and does not use a bottom half per disk.
> Each aio_context_t is initialized for 128 in-flight requests in
> laio_init().
> 
> Will it be possible to hit the limit now that all drives share the same
> aio_context_t?

It was also possible before, because the virtqueue can be bigger than
128 items; that's why there is logic to submit I/O requests after an
io_getevents() call.  As usual when the answer seems trivial, am I
misunderstanding your question?
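
The logic I mean is at the end of qemu_laio_completion_bh(), roughly:

  /* after draining completions with io_getevents(), push out any
   * requests that could not be submitted earlier */
  if (!s->io_q.plugged && !QSIMPLEQ_EMPTY(&s->io_q.pending)) {
      ioq_submit(s);
  }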

Thanks,

Paolo
Stefan Hajnoczi May 10, 2016, 9:30 a.m. UTC | #3
On Mon, May 09, 2016 at 06:31:44PM +0200, Paolo Bonzini wrote:
> On 19/04/2016 11:09, Stefan Hajnoczi wrote:
> >> > This has better performance because it executes fewer system calls
> >> > and does not use a bottom half per disk.
> > Each aio_context_t is initialized for 128 in-flight requests in
> > laio_init().
> > 
> > Will it be possible to hit the limit now that all drives share the same
> > aio_context_t?
> 
> It was also possible before, because the virtqueue can be bigger than
> 128 items; that's why there is logic to submit I/O requests after an
> io_getevents() call.  As usual when the answer seems trivial, am I
> misunderstanding your question?

I'm concerned about a performance regression rather than correctness.

But looking at linux-aio.c there *is* a correctness problem:

  static void ioq_submit(struct qemu_laio_state *s)
  {
      int ret, len;
      struct qemu_laiocb *aiocb;
      struct iocb *iocbs[MAX_QUEUED_IO];
      QSIMPLEQ_HEAD(, qemu_laiocb) completed;

      do {
          len = 0;
          QSIMPLEQ_FOREACH(aiocb, &s->io_q.pending, next) {
              iocbs[len++] = &aiocb->iocb;
              if (len == MAX_QUEUED_IO) {
                  break;
              }
          }

          ret = io_submit(s->ctx, len, iocbs);
          if (ret == -EAGAIN) {
              break;
          }
          if (ret < 0) {
              abort();
          }

          s->io_q.n -= ret;
          aiocb = container_of(iocbs[ret - 1], struct qemu_laiocb, iocb);
          QSIMPLEQ_SPLIT_AFTER(&s->io_q.pending, aiocb, next, &completed);
      } while (ret == len && !QSIMPLEQ_EMPTY(&s->io_q.pending));
      s->io_q.blocked = (s->io_q.n > 0);
  }

io_submit() may have submitted some of the requests when -EAGAIN is
returned.  QEMU gets no indication of which requests were submitted.  It
may be possible to dig around in the s->ctx rings to find out, or we need
to keep track of the number of in-flight requests so we can prevent ever
hitting EAGAIN.
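
The latter could look something like this inside the submission loop
(hypothetical sketch; there is no io_q.in_flight field today):

  /* cap each batch so the kernel context never exceeds MAX_EVENTS */
  int free_slots = MAX_EVENTS - s->io_q.in_flight;

  len = MIN(len, free_slots);
  if (len == 0) {
      break;  /* wait for completions to free up slots */
  }
  ret = io_submit(s->ctx, len, iocbs);
  if (ret > 0) {
      s->io_q.in_flight += ret;  /* decremented on completion */
  }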

ioq_submit() pretends that no requests were submitted on -EAGAIN and
will submit them again next time.  This could result in double
completions.

Regarding performance, I'm thinking about a guest with 8 disks (queue
depth 32).  The worst case is when the guest submits 32 requests at once
but the Linux AIO event limit has already been reached.  Then the disk
is starved until other disks' requests complete.
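
(With a shared context that is 8 * 32 = 256 potential in-flight
requests against 128 events, so four busy disks can already fill the
whole context.)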

Stefan
Kevin Wolf May 10, 2016, 9:40 a.m. UTC | #4
Am 10.05.2016 um 11:30 hat Stefan Hajnoczi geschrieben:
> On Mon, May 09, 2016 at 06:31:44PM +0200, Paolo Bonzini wrote:
> > On 19/04/2016 11:09, Stefan Hajnoczi wrote:
> > >> > This has better performance because it executes fewer system calls
> > >> > and does not use a bottom half per disk.
> > > Each aio_context_t is initialized for 128 in-flight requests in
> > > laio_init().
> > > 
> > > Will it be possible to hit the limit now that all drives share the same
> > > aio_context_t?
> > 
> > It was also possible before, because the virtqueue can be bigger than
> > 128 items; that's why there is logic to submit I/O requests after an
> > io_getevents() call.  As usual when the answer seems trivial, am I
> > misunderstanding your question?
> 
> I'm concerned about a performance regression rather than correctness.
> 
> But looking at linux-aio.c there *is* a correctness problem:
> 
>   static void ioq_submit(struct qemu_laio_state *s)
>   {
>       int ret, len;
>       struct qemu_laiocb *aiocb;
>       struct iocb *iocbs[MAX_QUEUED_IO];
>       QSIMPLEQ_HEAD(, qemu_laiocb) completed;
> 
>       do {
>           len = 0;
>           QSIMPLEQ_FOREACH(aiocb, &s->io_q.pending, next) {
>               iocbs[len++] = &aiocb->iocb;
>               if (len == MAX_QUEUED_IO) {
>                   break;
>               }
>           }
> 
>           ret = io_submit(s->ctx, len, iocbs);
>           if (ret == -EAGAIN) {
>               break;
>           }
>           if (ret < 0) {
>               abort();
>           }
> 
>           s->io_q.n -= ret;
>           aiocb = container_of(iocbs[ret - 1], struct qemu_laiocb, iocb);
>           QSIMPLEQ_SPLIT_AFTER(&s->io_q.pending, aiocb, next, &completed);
>       } while (ret == len && !QSIMPLEQ_EMPTY(&s->io_q.pending));
>       s->io_q.blocked = (s->io_q.n > 0);
>   }
> 
> io_submit() may have submitted some of the requests when -EAGAIN is
> returned.  QEMU gets no indication of which requests were submitted.

My understanding (which is based on the manpage rather than the code) is
that -EAGAIN is only returned if no request could be submitted. In other
cases, the number of submitted requests is returned (similar to how
short reads work).
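
If that's right, a caller only has to handle the partial-return case,
something like (sketch):

  ret = io_submit(s->ctx, len, iocbs);
  if (ret >= 0) {
      /* the first ret iocbs were accepted; keep the rest queued */
  } else if (ret == -EAGAIN) {
      /* nothing was accepted; retry the whole batch later */
  } else {
      /* hard error on the very first request */
      abort();
  }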

> It may be possible to dig around in the s->ctx rings to find out, or we
> need to keep track of the number of in-flight requests so we can
> prevent ever hitting EAGAIN.
> 
> ioq_submit() pretends that no requests were submitted on -EAGAIN and
> will submit them again next time.  This could result in double
> completions.

Did you check in the code that this can happen?

> Regarding performance, I'm thinking about a guest with 8 disks (queue
> depth 32).  The worst case is when the guest submits 32 requests at once
> but the Linux AIO event limit has already been reached.  Then the disk
> is starved until other disks' requests complete.

Sounds like a valid concern.

Kevin
Paolo Bonzini May 10, 2016, 10:32 a.m. UTC | #5
On 10/05/2016 11:40, Kevin Wolf wrote:
> > Regarding performance, I'm thinking about a guest with 8 disks (queue
> > depth 32).  The worst case is when the guest submits 32 requests at once
> > but the Linux AIO event limit has already been reached.  Then the disk
> > is starved until other disks' requests complete.
>
> Sounds like a valid concern.

Oh, so you're concerned about the non-dataplane case.  My suspicion is
that, with that many outstanding I/O requests, you probably have bad
performance unless you use dataplane.

Also, aio=threads has had a 64 I/O limit for years and we've never heard
about problems with stuck I/O.  But I agree that it's one area to keep
an eye on.

Paolo
Stefan Hajnoczi May 11, 2016, 1:18 p.m. UTC | #6
On Tue, May 10, 2016 at 11:40:33AM +0200, Kevin Wolf wrote:
> Am 10.05.2016 um 11:30 hat Stefan Hajnoczi geschrieben:
> > On Mon, May 09, 2016 at 06:31:44PM +0200, Paolo Bonzini wrote:
> > > On 19/04/2016 11:09, Stefan Hajnoczi wrote:
> > > >> > This has better performance because it executes fewer system calls
> > > >> > and does not use a bottom half per disk.
> > > > Each aio_context_t is initialized for 128 in-flight requests in
> > > > laio_init().
> > > > 
> > > > Will it be possible to hit the limit now that all drives share the same
> > > > aio_context_t?
> > > 
> > > It was also possible before, because the virtqueue can be bigger than
> > > 128 items; that's why there is logic to submit I/O requests after an
> > > io_getevents() call.  As usual when the answer seems trivial, am I
> > > misunderstanding your question?
> > 
> > I'm concerned about a performance regression rather than correctness.
> > 
> > But looking at linux-aio.c there *is* a correctness problem:
> > 
> >   static void ioq_submit(struct qemu_laio_state *s)
> >   {
> >       int ret, len;
> >       struct qemu_laiocb *aiocb;
> >       struct iocb *iocbs[MAX_QUEUED_IO];
> >       QSIMPLEQ_HEAD(, qemu_laiocb) completed;
> > 
> >       do {
> >           len = 0;
> >           QSIMPLEQ_FOREACH(aiocb, &s->io_q.pending, next) {
> >               iocbs[len++] = &aiocb->iocb;
> >               if (len == MAX_QUEUED_IO) {
> >                   break;
> >               }
> >           }
> > 
> >           ret = io_submit(s->ctx, len, iocbs);
> >           if (ret == -EAGAIN) {
> >               break;
> >           }
> >           if (ret < 0) {
> >               abort();
> >           }
> > 
> >           s->io_q.n -= ret;
> >           aiocb = container_of(iocbs[ret - 1], struct qemu_laiocb, iocb);
> >           QSIMPLEQ_SPLIT_AFTER(&s->io_q.pending, aiocb, next, &completed);
> >       } while (ret == len && !QSIMPLEQ_EMPTY(&s->io_q.pending));
> >       s->io_q.blocked = (s->io_q.n > 0);
> >   }
> > 
> > io_submit() may have submitted some of the requests when -EAGAIN is
> > returned.  QEMU gets no indication of which requests were submitted.
> 
> My understanding (which is based on the manpage rather than the code) is
> that -EAGAIN is only returned if no request could be submitted. In other
> cases, the number of submitted requests is returned (similar to how
> short reads work).

I misread the code:

  /*
   * AKPM: should this return a partial result if some of the IOs were
   * successfully submitted?
   */
  for (i=0; i<nr; i++) {
          struct iocb __user *user_iocb;
          struct iocb tmp;

          if (unlikely(__get_user(user_iocb, iocbpp + i))) {
                  ret = -EFAULT;
                  break;
          }

          if (unlikely(copy_from_user(&tmp, user_iocb, sizeof(tmp)))) {
                  ret = -EFAULT;
                  break;
          }

          ret = io_submit_one(ctx, user_iocb, &tmp, compat);
          if (ret)
                  break;
  }
  blk_finish_plug(&plug);

  percpu_ref_put(&ctx->users);
  return i ? i : ret;

You are right that it will return the number of submitted requests (and
no errno) if a failure occurs partway through.
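
(So if, say, the fifth of eight iocbs hits the resource limit,
io_submit() returns 4, and -EAGAIN can only mean that not even the
first request went in.)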

So the "bug" I found does not exist.

Stefan
Stefan Hajnoczi May 11, 2016, 1:22 p.m. UTC | #7
On Tue, May 10, 2016 at 12:32:27PM +0200, Paolo Bonzini wrote:
> On 10/05/2016 11:40, Kevin Wolf wrote:
> > > Regarding performance, I'm thinking about a guest with 8 disks (queue
> > > depth 32).  The worst case is when the guest submits 32 requests at once
> > > but the Linux AIO event limit has already been reached.  Then the disk
> > > is starved until other disks' requests complete.
> >
> > Sounds like a valid concern.
> 
> Oh, so you're concerned about the non-dataplane case.  My suspicion is
> that, with that many outstanding I/O requests, you probably have bad
> performance unless you use dataplane.
> 
> Also, aio=threads has had a 64 I/O limit for years and we've never heard
> about problems with stuck I/O.  But I agree that it's one area to keep
> an eye on.

Good point.  In that case I don't feel so bad if we merge this.

Stefan
Stefan Hajnoczi May 11, 2016, 1:23 p.m. UTC | #8
On Thu, Apr 07, 2016 at 06:33:36PM +0200, Paolo Bonzini wrote:
> This has better performance because it executes fewer system calls
> and does not use a bottom half per disk.
> 
> Originally proposed by Ming Lei.
> 
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> ---
>  async.c                            |  23 +++++++
>  block/linux-aio.c                  |   3 +
>  block/raw-posix.c                  | 119 +++++--------------------------------
>  block/raw-win32.c                  |   2 +-
>  include/block/aio.h                |  13 ++++
>  {block => include/block}/raw-aio.h |   0
>  6 files changed, 54 insertions(+), 106 deletions(-)
>  rename {block => include/block}/raw-aio.h (100%)

Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>

Patch

diff --git a/async.c b/async.c
index b4bf205..6caa98c 100644
--- a/async.c
+++ b/async.c
@@ -29,6 +29,7 @@ 
 #include "block/thread-pool.h"
 #include "qemu/main-loop.h"
 #include "qemu/atomic.h"
+#include "block/raw-aio.h"
 
 /***********************************************************/
 /* bottom halves (can be seen as timers which expire ASAP) */
@@ -242,6 +243,14 @@  aio_ctx_finalize(GSource     *source)
     qemu_bh_delete(ctx->notify_dummy_bh);
     thread_pool_free(ctx->thread_pool);
 
+#ifdef CONFIG_LINUX_AIO
+    if (ctx->linux_aio) {
+        laio_detach_aio_context(ctx->linux_aio, ctx);
+        laio_cleanup(ctx->linux_aio);
+        ctx->linux_aio = NULL;
+    }
+#endif
+
     qemu_mutex_lock(&ctx->bh_lock);
     while (ctx->first_bh) {
         QEMUBH *next = ctx->first_bh->next;
@@ -282,6 +291,17 @@  ThreadPool *aio_get_thread_pool(AioContext *ctx)
     return ctx->thread_pool;
 }
 
+#ifdef CONFIG_LINUX_AIO
+LinuxAioState *aio_get_linux_aio(AioContext *ctx)
+{
+    if (!ctx->linux_aio) {
+        ctx->linux_aio = laio_init();
+        laio_attach_aio_context(ctx->linux_aio, ctx);
+    }
+    return ctx->linux_aio;
+}
+#endif
+
 void aio_notify(AioContext *ctx)
 {
     /* Write e.g. bh->scheduled before reading ctx->notify_me.  Pairs
@@ -345,6 +365,9 @@  AioContext *aio_context_new(Error **errp)
                            false,
                            (EventNotifierHandler *)
                            event_notifier_dummy_cb);
+#ifdef CONFIG_LINUX_AIO
+    ctx->linux_aio = NULL;
+#endif
     ctx->thread_pool = NULL;
     qemu_mutex_init(&ctx->bh_lock);
     rfifolock_init(&ctx->lock, aio_rfifolock_cb, ctx);
diff --git a/block/linux-aio.c b/block/linux-aio.c
index 90ec98e..b73ca14 100644
--- a/block/linux-aio.c
+++ b/block/linux-aio.c
@@ -47,6 +47,8 @@  typedef struct {
 } LaioQueue;
 
 struct LinuxAioState {
+    AioContext *aio_context;
+
     io_context_t ctx;
     EventNotifier e;
 
@@ -283,6 +285,7 @@  void laio_detach_aio_context(LinuxAioState *s, AioContext *old_context)
 
 void laio_attach_aio_context(LinuxAioState *s, AioContext *new_context)
 {
+    s->aio_context = new_context;
     s->completion_bh = aio_bh_new(new_context, qemu_laio_completion_bh, s);
     aio_set_event_notifier(new_context, &s->e, false,
                            qemu_laio_completion_cb);
diff --git a/block/raw-posix.c b/block/raw-posix.c
index 71ec463..3b052a9 100644
--- a/block/raw-posix.c
+++ b/block/raw-posix.c
@@ -32,7 +32,7 @@ 
 #include "trace.h"
 #include "block/thread-pool.h"
 #include "qemu/iov.h"
-#include "raw-aio.h"
+#include "block/raw-aio.h"
 #include "qapi/util.h"
 #include "qapi/qmp/qstring.h"
 
@@ -137,10 +137,6 @@  typedef struct BDRVRawState {
     int open_flags;
     size_t buf_align;
 
-#ifdef CONFIG_LINUX_AIO
-    int use_aio;
-    LinuxAioState *aio_ctx;
-#endif
 #ifdef CONFIG_XFS
     bool is_xfs:1;
 #endif
@@ -154,9 +150,6 @@  typedef struct BDRVRawState {
 typedef struct BDRVRawReopenState {
     int fd;
     int open_flags;
-#ifdef CONFIG_LINUX_AIO
-    int use_aio;
-#endif
 } BDRVRawReopenState;
 
 static int fd_open(BlockDriverState *bs);
@@ -374,58 +367,15 @@  static void raw_parse_flags(int bdrv_flags, int *open_flags)
     }
 }
 
-static void raw_detach_aio_context(BlockDriverState *bs)
-{
 #ifdef CONFIG_LINUX_AIO
-    BDRVRawState *s = bs->opaque;
-
-    if (s->use_aio) {
-        laio_detach_aio_context(s->aio_ctx, bdrv_get_aio_context(bs));
-    }
-#endif
-}
-
-static void raw_attach_aio_context(BlockDriverState *bs,
-                                   AioContext *new_context)
+static bool raw_use_aio(int bdrv_flags)
 {
-#ifdef CONFIG_LINUX_AIO
-    BDRVRawState *s = bs->opaque;
-
-    if (s->use_aio) {
-        laio_attach_aio_context(s->aio_ctx, new_context);
-    }
-#endif
-}
-
-#ifdef CONFIG_LINUX_AIO
-static int raw_set_aio(LinuxAioState **aio_ctx, int *use_aio, int bdrv_flags)
-{
-    int ret = -1;
-    assert(aio_ctx != NULL);
-    assert(use_aio != NULL);
     /*
      * Currently Linux do AIO only for files opened with O_DIRECT
      * specified so check NOCACHE flag too
      */
-    if ((bdrv_flags & (BDRV_O_NOCACHE|BDRV_O_NATIVE_AIO)) ==
-                      (BDRV_O_NOCACHE|BDRV_O_NATIVE_AIO)) {
-
-        /* if non-NULL, laio_init() has already been run */
-        if (*aio_ctx == NULL) {
-            *aio_ctx = laio_init();
-            if (!*aio_ctx) {
-                goto error;
-            }
-        }
-        *use_aio = 1;
-    } else {
-        *use_aio = 0;
-    }
-
-    ret = 0;
-
-error:
-    return ret;
+    return (bdrv_flags & (BDRV_O_NOCACHE|BDRV_O_NATIVE_AIO)) ==
+                         (BDRV_O_NOCACHE|BDRV_O_NATIVE_AIO);
 }
 #endif
 
@@ -494,13 +444,7 @@  static int raw_open_common(BlockDriverState *bs, QDict *options,
     s->fd = fd;
 
 #ifdef CONFIG_LINUX_AIO
-    if (raw_set_aio(&s->aio_ctx, &s->use_aio, bdrv_flags)) {
-        qemu_close(fd);
-        ret = -errno;
-        error_setg_errno(errp, -ret, "Could not set AIO state");
-        goto fail;
-    }
-    if (!s->use_aio && (bdrv_flags & BDRV_O_NATIVE_AIO)) {
+    if (!raw_use_aio(bdrv_flags) && (bdrv_flags & BDRV_O_NATIVE_AIO)) {
         error_setg(errp, "aio=native was specified, but it requires "
                          "cache.direct=on, which was not specified.");
         ret = -EINVAL;
@@ -566,8 +510,6 @@  static int raw_open_common(BlockDriverState *bs, QDict *options,
     }
 #endif
 
-    raw_attach_aio_context(bs, bdrv_get_aio_context(bs));
-
     ret = 0;
 fail:
     if (filename && (bdrv_flags & BDRV_O_TEMPORARY)) {
@@ -608,18 +550,6 @@  static int raw_reopen_prepare(BDRVReopenState *state,
     state->opaque = g_new0(BDRVRawReopenState, 1);
     raw_s = state->opaque;
 
-#ifdef CONFIG_LINUX_AIO
-    raw_s->use_aio = s->use_aio;
-
-    /* we can use s->aio_ctx instead of a copy, because the use_aio flag is
-     * valid in the 'false' condition even if aio_ctx is set, and raw_set_aio()
-     * won't override aio_ctx if aio_ctx is non-NULL */
-    if (raw_set_aio(&s->aio_ctx, &raw_s->use_aio, state->flags)) {
-        error_setg(errp, "Could not set AIO state");
-        return -1;
-    }
-#endif
-
     if (s->type == FTYPE_CD) {
         raw_s->open_flags |= O_NONBLOCK;
     }
@@ -702,9 +632,6 @@  static void raw_reopen_commit(BDRVReopenState *state)
 
     qemu_close(s->fd);
     s->fd = raw_s->fd;
-#ifdef CONFIG_LINUX_AIO
-    s->use_aio = raw_s->use_aio;
-#endif
 
     g_free(state->opaque);
     state->opaque = NULL;
@@ -1319,8 +1246,9 @@  static BlockAIOCB *raw_aio_submit(BlockDriverState *bs,
         if (!bdrv_qiov_is_aligned(bs, qiov)) {
             type |= QEMU_AIO_MISALIGNED;
 #ifdef CONFIG_LINUX_AIO
-        } else if (s->use_aio) {
-            return laio_submit(bs, s->aio_ctx, s->fd, sector_num, qiov,
+        } else if (bs->open_flags & BDRV_O_NATIVE_AIO) {
+            LinuxAioState *aio = aio_get_linux_aio(bdrv_get_aio_context(bs));
+            return laio_submit(bs, aio, s->fd, sector_num, qiov,
                                nb_sectors, cb, opaque, type);
 #endif
         }
@@ -1333,9 +1261,9 @@  static BlockAIOCB *raw_aio_submit(BlockDriverState *bs,
 static void raw_aio_plug(BlockDriverState *bs)
 {
 #ifdef CONFIG_LINUX_AIO
-    BDRVRawState *s = bs->opaque;
-    if (s->use_aio) {
-        laio_io_plug(bs, s->aio_ctx);
+    if (bs->open_flags & BDRV_O_NATIVE_AIO) {
+        LinuxAioState *aio = aio_get_linux_aio(bdrv_get_aio_context(bs));
+        laio_io_plug(bs, aio);
     }
 #endif
 }
@@ -1343,9 +1271,9 @@  static void raw_aio_plug(BlockDriverState *bs)
 static void raw_aio_unplug(BlockDriverState *bs)
 {
 #ifdef CONFIG_LINUX_AIO
-    BDRVRawState *s = bs->opaque;
-    if (s->use_aio) {
-        laio_io_unplug(bs, s->aio_ctx);
+    if (bs->open_flags & BDRV_O_NATIVE_AIO) {
+        LinuxAioState *aio = aio_get_linux_aio(bdrv_get_aio_context(bs));
+        laio_io_unplug(bs, aio);
     }
 #endif
 }
@@ -1381,13 +1309,6 @@  static void raw_close(BlockDriverState *bs)
 {
     BDRVRawState *s = bs->opaque;
 
-    raw_detach_aio_context(bs);
-
-#ifdef CONFIG_LINUX_AIO
-    if (s->use_aio) {
-        laio_cleanup(s->aio_ctx);
-    }
-#endif
     if (s->fd >= 0) {
         qemu_close(s->fd);
         s->fd = -1;
@@ -1946,9 +1867,6 @@  BlockDriver bdrv_file = {
     .bdrv_get_allocated_file_size
                         = raw_get_allocated_file_size,
 
-    .bdrv_detach_aio_context = raw_detach_aio_context,
-    .bdrv_attach_aio_context = raw_attach_aio_context,
-
     .create_opts = &raw_create_opts,
 };
 
@@ -2396,9 +2314,6 @@  static BlockDriver bdrv_host_device = {
     .bdrv_probe_blocksizes = hdev_probe_blocksizes,
     .bdrv_probe_geometry = hdev_probe_geometry,
 
-    .bdrv_detach_aio_context = raw_detach_aio_context,
-    .bdrv_attach_aio_context = raw_attach_aio_context,
-
     /* generic scsi device */
 #ifdef __linux__
     .bdrv_aio_ioctl     = hdev_aio_ioctl,
@@ -2523,9 +2438,6 @@  static BlockDriver bdrv_host_cdrom = {
     .bdrv_get_allocated_file_size
                         = raw_get_allocated_file_size,
 
-    .bdrv_detach_aio_context = raw_detach_aio_context,
-    .bdrv_attach_aio_context = raw_attach_aio_context,
-
     /* removable device support */
     .bdrv_is_inserted   = cdrom_is_inserted,
     .bdrv_eject         = cdrom_eject,
@@ -2658,9 +2570,6 @@  static BlockDriver bdrv_host_cdrom = {
     .bdrv_get_allocated_file_size
                         = raw_get_allocated_file_size,
 
-    .bdrv_detach_aio_context = raw_detach_aio_context,
-    .bdrv_attach_aio_context = raw_attach_aio_context,
-
     /* removable device support */
     .bdrv_is_inserted   = cdrom_is_inserted,
     .bdrv_eject         = cdrom_eject,
diff --git a/block/raw-win32.c b/block/raw-win32.c
index fd23891..ce77432 100644
--- a/block/raw-win32.c
+++ b/block/raw-win32.c
@@ -27,7 +27,7 @@ 
 #include "qemu/timer.h"
 #include "block/block_int.h"
 #include "qemu/module.h"
-#include "raw-aio.h"
+#include "block/raw-aio.h"
 #include "trace.h"
 #include "block/thread-pool.h"
 #include "qemu/iov.h"
diff --git a/include/block/aio.h b/include/block/aio.h
index 88a64ee..afd72a7 100644
--- a/include/block/aio.h
+++ b/include/block/aio.h
@@ -47,6 +47,9 @@  typedef struct AioHandler AioHandler;
 typedef void QEMUBHFunc(void *opaque);
 typedef void IOHandler(void *opaque);
 
+struct ThreadPool;
+struct LinuxAioState;
+
 struct AioContext {
     GSource source;
 
@@ -119,6 +122,13 @@  struct AioContext {
     /* Thread pool for performing work and receiving completion callbacks */
     struct ThreadPool *thread_pool;
 
+#ifdef CONFIG_LINUX_AIO
+    /* State for native Linux AIO.  Uses aio_context_acquire/release for
+     * locking.
+     */
+    struct LinuxAioState *linux_aio;
+#endif
+
     /* TimerLists for calling timers - one per clock type */
     QEMUTimerListGroup tlg;
 
@@ -335,6 +345,9 @@  GSource *aio_get_g_source(AioContext *ctx);
 /* Return the ThreadPool bound to this AioContext */
 struct ThreadPool *aio_get_thread_pool(AioContext *ctx);
 
+/* Return the LinuxAioState bound to this AioContext */
+struct LinuxAioState *aio_get_linux_aio(AioContext *ctx);
+
 /**
  * aio_timer_new:
  * @ctx: the aio context
diff --git a/block/raw-aio.h b/include/block/raw-aio.h
similarity index 100%
rename from block/raw-aio.h
rename to include/block/raw-aio.h