[RFC,4/4] aio-posix: Use epoll in aio_poll

Message ID 1435670385-625-5-git-send-email-famz@redhat.com
State New

Commit Message

Fam Zheng June 30, 2015, 1:19 p.m. UTC
This patch lets aio_poll use the epoll_wait(2) syscall instead of
qemu_poll_ns, if possible. It improves the scalability of iothreads
(for example, virtio-scsi-dataplane).

The epollfd is managed together with the GSource and ctx->aio_handlers,
by creating an epoll_event instance for each watched aio fd and adding it
to the epollfd with epoll_ctl.
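
In essence, each watched fd is registered roughly like this (excerpted from
the patch below; node is the AioHandler for the fd):

    struct epoll_event event;

    event.data.ptr = node;            /* epoll_wait() hands the node back */
    event.events = node->pfd.events;  /* relies on G_IO_* matching EPOLL* */
    epoll_ctl(ctx->epollfd, add ? EPOLL_CTL_ADD : EPOLL_CTL_MOD, fd, &event);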

The following table is a fio benchmark comparison on a single guest
block device, with different number of disks attached to the same scsi
bus (in MB/s):

Comments

Stefan Hajnoczi July 7, 2015, 3:08 p.m. UTC | #1
On Tue, Jun 30, 2015 at 09:19:45PM +0800, Fam Zheng wrote:
> =====================================================================
>   # of scsi-disks  |        master           |       epoll
>                    |   rd     wr    randrw   |   rd    wr    randrw
> ---------------------------------------------------------------------
>         1          |   103    96     49      |   105   99     49
>         4          |   92     96     48      |   103   98     49
>         8          |   96     94     46      |   101   97     50
>         16         |   91     91     45      |   101   95     48
>         32         |   84     83     40      |   95    95     48
>         64         |   75     73     35      |   91    90     44
>         128        |   54     53     26      |   79    80     39
>         256        |   41     39     19      |   63    62     30
> =====================================================================

Nice results!

> @@ -44,6 +47,12 @@ static AioHandler *find_aio_handler(AioContext *ctx, int fd)
>  
>  void aio_context_setup(AioContext *ctx, Error **errp)
>  {
> +#ifdef CONFIG_EPOLL
> +    ctx->epollfd = epoll_create1(EPOLL_CLOEXEC);
> +    if (ctx->epollfd < 0) {
> +        error_setg(errp, "Failed to create epoll fd: %s", strerror(errno));

Slightly more concise:
error_setg_errno(errp, errno, "Failed to create epoll fd")

> -/* These thread-local variables are used only in a small part of aio_poll
> +#ifdef CONFIG_EPOLL
> +QEMU_BUILD_BUG_ON((int)G_IO_IN != EPOLLIN);
> +QEMU_BUILD_BUG_ON((int)G_IO_OUT != EPOLLOUT);
> +QEMU_BUILD_BUG_ON((int)G_IO_PRI != EPOLLPRI);
> +QEMU_BUILD_BUG_ON((int)G_IO_ERR != EPOLLERR);
> +QEMU_BUILD_BUG_ON((int)G_IO_HUP != EPOLLHUP);

I guess this assumption is okay but maybe the compiler optimizes:

  event.events = (node->pfd.events & G_IO_IN ? EPOLLIN : 0) |
                 (node->pfd.events & G_IO_OUT ? EPOLLOUT : 0) |
                 (node->pfd.events & G_IO_PRI ? EPOLLPRI : 0) |
                 (node->pfd.events & G_IO_ERR ? EPOLLERR : 0) |
                 (node->pfd.events & G_IO_HUP ? EPOLLHUP : 0);

into:

  event.events = node->pfd.events & (EPOLLIN | EPOLLOUT | EPOLLPRI |
                                     EPOLLERR | EPOLLHUP);

which is just an AND instruction so it's effectively free and doesn't
assume that these constants have the same values.

> +
> +#define EPOLL_BATCH 128
> +static bool aio_poll_epoll(AioContext *ctx, bool blocking)
> +{
> +    AioHandler *node;
> +    bool was_dispatching;
> +    int i, ret;
> +    bool progress;
> +    int64_t timeout;
> +    struct epoll_event events[EPOLL_BATCH];
> +
> +    aio_context_acquire(ctx);
> +    was_dispatching = ctx->dispatching;
> +    progress = false;
> +
> +    /* aio_notify can avoid the expensive event_notifier_set if
> +     * everything (file descriptors, bottom halves, timers) will
> +     * be re-evaluated before the next blocking poll().  This is
> +     * already true when aio_poll is called with blocking == false;
> +     * if blocking == true, it is only true after poll() returns.
> +     *
> +     * If we're in a nested event loop, ctx->dispatching might be true.
> +     * In that case we can restore it just before returning, but we
> +     * have to clear it now.
> +     */
> +    aio_set_dispatching(ctx, !blocking);
> +
> +    ctx->walking_handlers++;
> +
> +    timeout = blocking ? aio_compute_timeout(ctx) : 0;
> +
> +    if (timeout > 0) {
> +        timeout = DIV_ROUND_UP(timeout, 1000000);
> +    }

I think you already posted the timerfd code in an earlier series.  Why
degrade to millisecond precision?  It needs to be fixed up anyway if the
main loop uses aio_poll() in the future.
Paolo Bonzini July 7, 2015, 3:27 p.m. UTC | #2
On 07/07/2015 17:08, Stefan Hajnoczi wrote:
>> > +
>> > +#define EPOLL_BATCH 128
>> > +static bool aio_poll_epoll(AioContext *ctx, bool blocking)
>> > +{
>> > +    AioHandler *node;
>> > +    bool was_dispatching;
>> > +    int i, ret;
>> > +    bool progress;
>> > +    int64_t timeout;
>> > +    struct epoll_event events[EPOLL_BATCH];
>> > +
>> > +    aio_context_acquire(ctx);
>> > +    was_dispatching = ctx->dispatching;
>> > +    progress = false;
>> > +
>> > +    /* aio_notify can avoid the expensive event_notifier_set if
>> > +     * everything (file descriptors, bottom halves, timers) will
>> > +     * be re-evaluated before the next blocking poll().  This is
>> > +     * already true when aio_poll is called with blocking == false;
>> > +     * if blocking == true, it is only true after poll() returns.
>> > +     *
>> > +     * If we're in a nested event loop, ctx->dispatching might be true.
>> > +     * In that case we can restore it just before returning, but we
>> > +     * have to clear it now.
>> > +     */
>> > +    aio_set_dispatching(ctx, !blocking);
>> > +
>> > +    ctx->walking_handlers++;
>> > +
>> > +    timeout = blocking ? aio_compute_timeout(ctx) : 0;
>> > +
>> > +    if (timeout > 0) {
>> > +        timeout = DIV_ROUND_UP(timeout, 1000000);
>> > +    }
> I think you already posted the timerfd code in an earlier series.  Why
> degrade to millisecond precision?  It needs to be fixed up anyway if the
> main loop uses aio_poll() in the future.

BTW, what about putting the code in a separate aio-epoll.c file?

Paolo
Fam Zheng July 8, 2015, 1:01 a.m. UTC | #3
On Tue, 07/07 16:08, Stefan Hajnoczi wrote:
> > @@ -44,6 +47,12 @@ static AioHandler *find_aio_handler(AioContext *ctx, int fd)
> >  
> >  void aio_context_setup(AioContext *ctx, Error **errp)
> >  {
> > +#ifdef CONFIG_EPOLL
> > +    ctx->epollfd = epoll_create1(EPOLL_CLOEXEC);
> > +    if (ctx->epollfd < 0) {
> > +        error_setg(errp, "Failed to create epoll fd: %s", strerror(errno));
> 
> Slightly more concise:
> error_setg_errno(errp, errno, "Failed to create epoll fd")

Okay.

> 
> > -/* These thread-local variables are used only in a small part of aio_poll
> > +#ifdef CONFIG_EPOLL
> > +QEMU_BUILD_BUG_ON((int)G_IO_IN != EPOLLIN);
> > +QEMU_BUILD_BUG_ON((int)G_IO_OUT != EPOLLOUT);
> > +QEMU_BUILD_BUG_ON((int)G_IO_PRI != EPOLLPRI);
> > +QEMU_BUILD_BUG_ON((int)G_IO_ERR != EPOLLERR);
> > +QEMU_BUILD_BUG_ON((int)G_IO_HUP != EPOLLHUP);
> 
> I guess this assumption is okay but maybe the compiler optimizes:
> 
>   event.events = (node->pfd.events & G_IO_IN ? EPOLLIN : 0) |
>                  (node->pfd.events & G_IO_OUT ? EPOLLOUT : 0) |
>                  (node->pfd.events & G_IO_PRI ? EPOLLPRI : 0) |
>                  (node->pfd.events & G_IO_ERR ? EPOLLERR : 0) |
>                  (node->pfd.events & G_IO_HUP ? EPOLLHUP : 0);
> 
> into:
> 
>   event.events = node->pfd.events & (EPOLLIN | EPOLLOUT | EPOLLPRI |
>                                      EPOLLERR | EPOLLHUP);
> 
> which is just an AND instruction so it's effectively free and doesn't
> assume that these constants have the same values.

Okay, it'll be a bit more typing (converting back and forth) but it's more
straightforward and self-documenting.
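
Something like this (hypothetical helper names, just a sketch):

    static inline uint32_t epoll_events_from_pfd(int pfd_events)
    {
        return (pfd_events & G_IO_IN  ? EPOLLIN  : 0) |
               (pfd_events & G_IO_OUT ? EPOLLOUT : 0) |
               (pfd_events & G_IO_PRI ? EPOLLPRI : 0) |
               (pfd_events & G_IO_ERR ? EPOLLERR : 0) |
               (pfd_events & G_IO_HUP ? EPOLLHUP : 0);
    }

    static inline int pfd_events_from_epoll(uint32_t epoll_events)
    {
        return (epoll_events & EPOLLIN  ? G_IO_IN  : 0) |
               (epoll_events & EPOLLOUT ? G_IO_OUT : 0) |
               (epoll_events & EPOLLPRI ? G_IO_PRI : 0) |
               (epoll_events & EPOLLERR ? G_IO_ERR : 0) |
               (epoll_events & EPOLLHUP ? G_IO_HUP : 0);
    }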

> 
> > +
> > +#define EPOLL_BATCH 128
> > +static bool aio_poll_epoll(AioContext *ctx, bool blocking)
> > +{
> > +    AioHandler *node;
> > +    bool was_dispatching;
> > +    int i, ret;
> > +    bool progress;
> > +    int64_t timeout;
> > +    struct epoll_event events[EPOLL_BATCH];
> > +
> > +    aio_context_acquire(ctx);
> > +    was_dispatching = ctx->dispatching;
> > +    progress = false;
> > +
> > +    /* aio_notify can avoid the expensive event_notifier_set if
> > +     * everything (file descriptors, bottom halves, timers) will
> > +     * be re-evaluated before the next blocking poll().  This is
> > +     * already true when aio_poll is called with blocking == false;
> > +     * if blocking == true, it is only true after poll() returns.
> > +     *
> > +     * If we're in a nested event loop, ctx->dispatching might be true.
> > +     * In that case we can restore it just before returning, but we
> > +     * have to clear it now.
> > +     */
> > +    aio_set_dispatching(ctx, !blocking);
> > +
> > +    ctx->walking_handlers++;
> > +
> > +    timeout = blocking ? aio_compute_timeout(ctx) : 0;
> > +
> > +    if (timeout > 0) {
> > +        timeout = DIV_ROUND_UP(timeout, 1000000);
> > +    }
> 
> I think you already posted the timerfd code in an earlier series.  Why
> degrade to millisecond precision?  It needs to be fixed up anyway if the
> main loop uses aio_poll() in the future.

Because of a little complication: timeout here is always -1 for an iothread, and
what is interesting is that -1 actually requires an explicit

    timerfd_settime(timerfd, flags, &(struct itimerspec){{0, 0}}, NULL)

to disable the timerfd for this aio_poll(), which costs something. Passing -1 to
epoll_wait() without this doesn't work because the timerfd is already added to
the epollfd and may have an unexpected timeout set from earlier.
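
For illustration, roughly what each aio_poll() would need, assuming a
hypothetical ctx->timerfd that is registered on the epollfd:

    if (timeout > 0) {
        /* arm the timerfd with nanosecond precision */
        struct itimerspec its = {
            .it_value.tv_sec  = timeout / 1000000000LL,
            .it_value.tv_nsec = timeout % 1000000000LL,
        };
        timerfd_settime(ctx->timerfd, 0, &its, NULL);
    } else if (timeout < 0) {
        /* blocking without timeout: disarm, or an earlier timeout fires */
        timerfd_settime(ctx->timerfd, 0, &(struct itimerspec){{0, 0}}, NULL);
    }
    ret = epoll_wait(ctx->epollfd, events, EPOLL_BATCH, timeout ? -1 : 0);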

Of course we can cache the state and optimize, but I haven't reasoned about
what happens if another thread calls aio_poll() while we're in epoll_wait(),
for example when the first aio_poll() has a positive timeout but the second
one has -1.
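
The straightforward caching (setting aside the cross-thread question above)
would look something like this:

    /* ctx->armed_timeout is a hypothetical field caching the last value
     * programmed into ctx->timerfd, in nanoseconds (-1 = disarmed). */
    if (timeout != ctx->armed_timeout) {
        struct itimerspec its = { .it_value = {
            .tv_sec  = timeout > 0 ? timeout / 1000000000LL : 0,
            .tv_nsec = timeout > 0 ? timeout % 1000000000LL : 0,
        } };  /* an all-zero it_value disarms the timer */
        timerfd_settime(ctx->timerfd, 0, &its, NULL);
        ctx->armed_timeout = timeout;
    }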

Fam
Stefan Hajnoczi July 8, 2015, 10:58 a.m. UTC | #4
On Wed, Jul 08, 2015 at 09:01:27AM +0800, Fam Zheng wrote:
> On Tue, 07/07 16:08, Stefan Hajnoczi wrote:
> > > +#define EPOLL_BATCH 128
> > > +static bool aio_poll_epoll(AioContext *ctx, bool blocking)
> > > +{
> > > +    AioHandler *node;
> > > +    bool was_dispatching;
> > > +    int i, ret;
> > > +    bool progress;
> > > +    int64_t timeout;
> > > +    struct epoll_event events[EPOLL_BATCH];
> > > +
> > > +    aio_context_acquire(ctx);
> > > +    was_dispatching = ctx->dispatching;
> > > +    progress = false;
> > > +
> > > +    /* aio_notify can avoid the expensive event_notifier_set if
> > > +     * everything (file descriptors, bottom halves, timers) will
> > > +     * be re-evaluated before the next blocking poll().  This is
> > > +     * already true when aio_poll is called with blocking == false;
> > > +     * if blocking == true, it is only true after poll() returns.
> > > +     *
> > > +     * If we're in a nested event loop, ctx->dispatching might be true.
> > > +     * In that case we can restore it just before returning, but we
> > > +     * have to clear it now.
> > > +     */
> > > +    aio_set_dispatching(ctx, !blocking);
> > > +
> > > +    ctx->walking_handlers++;
> > > +
> > > +    timeout = blocking ? aio_compute_timeout(ctx) : 0;
> > > +
> > > +    if (timeout > 0) {
> > > +        timeout = DIV_ROUND_UP(timeout, 1000000);
> > > +    }
> > 
> > I think you already posted the timerfd code in an earlier series.  Why
> > degrade to millisecond precision?  It needs to be fixed up anyway if the
> > main loop uses aio_poll() in the future.
> 
> Because of a little complication: timeout here is always -1 for an iothread, and
> what is interesting is that -1 actually requires an explicit
> 
>     timerfd_settime(timerfd, flags, &(struct itimerspec){{0, 0}}, NULL)
> 
> to disable the timerfd for this aio_poll(), which costs something. Passing -1 to
> epoll_wait() without this doesn't work because the timerfd is already added to
> the epollfd and may have an unexpected timeout set from earlier.
> 
> Of course we can cache the state and optimize, but I haven't reasoned about
> what happens if another thread calls aio_poll() while we're in epoll_wait(),
> for example when the first aio_poll() has a positive timeout but the second
> one has -1.

I'm not sure I understand the threads scenario since aio_poll_epoll()
has a big aio_context_acquire()/release() region that protects it, but I
guess the nested aio_poll() case is similar.  Care needs to be taken so
the extra timerfd state is consistent.

The optimization can be added later unless the timerfd_settime() syscall
is so expensive that it defeats the advantage of epoll().
Fam Zheng July 10, 2015, 12:46 a.m. UTC | #5
On Wed, 07/08 11:58, Stefan Hajnoczi wrote:
> On Wed, Jul 08, 2015 at 09:01:27AM +0800, Fam Zheng wrote:
> > On Tue, 07/07 16:08, Stefan Hajnoczi wrote:
> > > > +#define EPOLL_BATCH 128
> > > > +static bool aio_poll_epoll(AioContext *ctx, bool blocking)
> > > > +{
> > > > +    AioHandler *node;
> > > > +    bool was_dispatching;
> > > > +    int i, ret;
> > > > +    bool progress;
> > > > +    int64_t timeout;
> > > > +    struct epoll_event events[EPOLL_BATCH];
> > > > +
> > > > +    aio_context_acquire(ctx);
> > > > +    was_dispatching = ctx->dispatching;
> > > > +    progress = false;
> > > > +
> > > > +    /* aio_notify can avoid the expensive event_notifier_set if
> > > > +     * everything (file descriptors, bottom halves, timers) will
> > > > +     * be re-evaluated before the next blocking poll().  This is
> > > > +     * already true when aio_poll is called with blocking == false;
> > > > +     * if blocking == true, it is only true after poll() returns.
> > > > +     *
> > > > +     * If we're in a nested event loop, ctx->dispatching might be true.
> > > > +     * In that case we can restore it just before returning, but we
> > > > +     * have to clear it now.
> > > > +     */
> > > > +    aio_set_dispatching(ctx, !blocking);
> > > > +
> > > > +    ctx->walking_handlers++;
> > > > +
> > > > +    timeout = blocking ? aio_compute_timeout(ctx) : 0;
> > > > +
> > > > +    if (timeout > 0) {
> > > > +        timeout = DIV_ROUND_UP(timeout, 1000000);
> > > > +    }
> > > 
> > > I think you already posted the timerfd code in an earlier series.  Why
> > > degrade to millisecond precision?  It needs to be fixed up anyway if the
> > > main loop uses aio_poll() in the future.
> > 
> > Because of a little complication: timeout here is always -1 for an iothread, and
> > what is interesting is that -1 actually requires an explicit
> > 
> >     timerfd_settime(timerfd, flags, &(struct itimerspec){{0, 0}}, NULL)
> > 
> > to disable the timerfd for this aio_poll(), which costs something. Passing -1 to
> > epoll_wait() without this doesn't work because the timerfd is already added to
> > the epollfd and may have an unexpected timeout set from earlier.
> > 
> > Of course we can cache the state and optimize, but I haven't reasoned about
> > what happens if another thread calls aio_poll() while we're in epoll_wait(),
> > for example when the first aio_poll() has a positive timeout but the second
> > one has -1.
> 
> I'm not sure I understand the threads scenario since aio_poll_epoll()
> has a big aio_context_acquire()/release() region that protects it, but I
> guess the nested aio_poll() case is similar.  Care needs to be taken so
> the extra timerfd state is consistent.

Nested aio_poll() has no race on timerfd because the outer aio_poll()'s
epoll_wait() would have already returned by the time the inner aio_poll() runs.

Threads are a different story with Paolo's "release AioContext around blocking
aio_poll()" series.

> 
> The optimization can be added later unless the timerfd_settime() syscall
> is so expensive that it defeats the advantage of epoll().

That's the plan, and it must be done before this gets used by the main loop.

Fam
Stefan Hajnoczi July 13, 2015, 10:02 a.m. UTC | #6
On Fri, Jul 10, 2015 at 08:46:44AM +0800, Fam Zheng wrote:
> On Wed, 07/08 11:58, Stefan Hajnoczi wrote:
> > On Wed, Jul 08, 2015 at 09:01:27AM +0800, Fam Zheng wrote:
> > > On Tue, 07/07 16:08, Stefan Hajnoczi wrote:
> > > > > +#define EPOLL_BATCH 128
> > > > > +static bool aio_poll_epoll(AioContext *ctx, bool blocking)
> > > > > +{
> > > > > +    AioHandler *node;
> > > > > +    bool was_dispatching;
> > > > > +    int i, ret;
> > > > > +    bool progress;
> > > > > +    int64_t timeout;
> > > > > +    struct epoll_event events[EPOLL_BATCH];
> > > > > +
> > > > > +    aio_context_acquire(ctx);
> > > > > +    was_dispatching = ctx->dispatching;
> > > > > +    progress = false;
> > > > > +
> > > > > +    /* aio_notify can avoid the expensive event_notifier_set if
> > > > > +     * everything (file descriptors, bottom halves, timers) will
> > > > > +     * be re-evaluated before the next blocking poll().  This is
> > > > > +     * already true when aio_poll is called with blocking == false;
> > > > > +     * if blocking == true, it is only true after poll() returns.
> > > > > +     *
> > > > > +     * If we're in a nested event loop, ctx->dispatching might be true.
> > > > > +     * In that case we can restore it just before returning, but we
> > > > > +     * have to clear it now.
> > > > > +     */
> > > > > +    aio_set_dispatching(ctx, !blocking);
> > > > > +
> > > > > +    ctx->walking_handlers++;
> > > > > +
> > > > > +    timeout = blocking ? aio_compute_timeout(ctx) : 0;
> > > > > +
> > > > > +    if (timeout > 0) {
> > > > > +        timeout = DIV_ROUND_UP(timeout, 1000000);
> > > > > +    }
> > > > 
> > > > I think you already posted the timerfd code in an earlier series.  Why
> > > > degrade to millisecond precision?  It needs to be fixed up anyway if the
> > > > main loop uses aio_poll() in the future.
> > > 
> > > Because of a little complication: timeout here is always -1 for an iothread, and
> > > what is interesting is that -1 actually requires an explicit
> > > 
> > >     timerfd_settime(timerfd, flags, &(struct itimerspec){{0, 0}}, NULL)
> > > 
> > > to disable the timerfd for this aio_poll(), which costs something. Passing -1 to
> > > epoll_wait() without this doesn't work because the timerfd is already added to
> > > the epollfd and may have an unexpected timeout set from earlier.
> > > 
> > > Of course we can cache the state and optimize, but I haven't reasoned about
> > > what happens if another thread calls aio_poll() while we're in epoll_wait(),
> > > for example when the first aio_poll() has a positive timeout but the second
> > > one has -1.
> > 
> > I'm not sure I understand the threads scenario since aio_poll_epoll()
> > has a big aio_context_acquire()/release() region that protects it, but I
> > guess the nested aio_poll() case is similar.  Care needs to be taken so
> > the extra timerfd state is consistent.
> 
> Nested aio_poll() has no race on timerfd because the outer aio_poll()'s
> epoll_wait() would have already returned by the time the inner aio_poll() runs.
> 
> Threads are a different story with Paolo's "release AioContext around blocking
> aio_poll()" series.

Ah, I see!

> > 
> > The optimization can be added later unless the timerfd_settime() syscall
> > is so expensive that it defeats the advantage of epoll().
> 
> That's the plan, and it must be done before this gets used by the main loop.

I'd rather we merge correct code than fast code which violates the API.

Let's do nanosecond precision now, as advertised by the function names,
and optimize timerfd later.
Patch

=====================================================================
  # of scsi-disks  |        master           |       epoll
                   |   rd     wr    randrw   |   rd    wr    randrw
---------------------------------------------------------------------
        1          |   103    96     49      |   105   99     49
        4          |   92     96     48      |   103   98     49
        8          |   96     94     46      |   101   97     50
        16         |   91     91     45      |   101   95     48
        32         |   84     83     40      |   95    95     48
        64         |   75     73     35      |   91    90     44
        128        |   54     53     26      |   79    80     39
        256        |   41     39     19      |   63    62     30
=====================================================================

Signed-off-by: Fam Zheng <famz@redhat.com>
---
 aio-posix.c         | 118 ++++++++++++++++++++++++++++++++++++++++++++++++++--
 include/block/aio.h |   3 ++
 2 files changed, 117 insertions(+), 4 deletions(-)

diff --git a/aio-posix.c b/aio-posix.c
index 22406ce..111d7fb 100644
--- a/aio-posix.c
+++ b/aio-posix.c
@@ -17,6 +17,9 @@ 
 #include "block/block.h"
 #include "qemu/queue.h"
 #include "qemu/sockets.h"
+#ifdef CONFIG_EPOLL
+#include <sys/epoll.h>
+#endif
 
 struct AioHandler
 {
@@ -44,6 +47,12 @@  static AioHandler *find_aio_handler(AioContext *ctx, int fd)
 
 void aio_context_setup(AioContext *ctx, Error **errp)
 {
+#ifdef CONFIG_EPOLL
+    ctx->epollfd = epoll_create1(EPOLL_CLOEXEC);
+    if (ctx->epollfd < 0) {
+        error_setg(errp, "Failed to create epoll fd: %s", strerror(errno));
+    }
+#endif
 }
 
 void aio_set_fd_handler_pri(AioContext *ctx,
@@ -54,6 +63,11 @@  void aio_set_fd_handler_pri(AioContext *ctx,
                             void *opaque)
 {
     AioHandler *node;
+#ifdef CONFIG_EPOLL
+    struct epoll_event event;
+    int r;
+    bool add = false;
+#endif
 
     node = find_aio_handler(ctx, fd);
 
@@ -61,6 +75,10 @@  void aio_set_fd_handler_pri(AioContext *ctx,
     if (!io_read && !io_write && !io_read_pri) {
         if (node) {
             g_source_remove_poll(&ctx->source, &node->pfd);
+#ifdef CONFIG_EPOLL
+            r = epoll_ctl(ctx->epollfd, EPOLL_CTL_DEL, fd, &event);
+            assert(!r);
+#endif
 
             /* If the lock is held, just mark the node as deleted */
             if (ctx->walking_handlers) {
@@ -83,6 +101,9 @@  void aio_set_fd_handler_pri(AioContext *ctx,
             QLIST_INSERT_HEAD(&ctx->aio_handlers, node, node);
 
             g_source_add_poll(&ctx->source, &node->pfd);
+#ifdef CONFIG_EPOLL
+            add = true;
+#endif
         }
         /* Update handler with latest information */
         node->io_read = io_read;
@@ -93,6 +114,13 @@  void aio_set_fd_handler_pri(AioContext *ctx,
         node->pfd.events = (io_read ? G_IO_IN | G_IO_HUP | G_IO_ERR : 0);
         node->pfd.events |= (io_write ? G_IO_OUT | G_IO_ERR : 0);
         node->pfd.events |= (io_read_pri ? G_IO_PRI | G_IO_HUP | G_IO_ERR : 0);
+#ifdef CONFIG_EPOLL
+        event.data.ptr = node;
+        event.events = node->pfd.events;
+        r = epoll_ctl(ctx->epollfd, add ? EPOLL_CTL_ADD : EPOLL_CTL_MOD,
+                      fd, &event);
+        assert(!r);
+#endif
     }
 
     aio_notify(ctx);
@@ -198,7 +226,80 @@  bool aio_dispatch(AioContext *ctx)
     return progress;
 }
 
-/* These thread-local variables are used only in a small part of aio_poll
+#ifdef CONFIG_EPOLL
+QEMU_BUILD_BUG_ON((int)G_IO_IN != EPOLLIN);
+QEMU_BUILD_BUG_ON((int)G_IO_OUT != EPOLLOUT);
+QEMU_BUILD_BUG_ON((int)G_IO_PRI != EPOLLPRI);
+QEMU_BUILD_BUG_ON((int)G_IO_ERR != EPOLLERR);
+QEMU_BUILD_BUG_ON((int)G_IO_HUP != EPOLLHUP);
+
+#define EPOLL_BATCH 128
+static bool aio_poll_epoll(AioContext *ctx, bool blocking)
+{
+    AioHandler *node;
+    bool was_dispatching;
+    int i, ret;
+    bool progress;
+    int64_t timeout;
+    struct epoll_event events[EPOLL_BATCH];
+
+    aio_context_acquire(ctx);
+    was_dispatching = ctx->dispatching;
+    progress = false;
+
+    /* aio_notify can avoid the expensive event_notifier_set if
+     * everything (file descriptors, bottom halves, timers) will
+     * be re-evaluated before the next blocking poll().  This is
+     * already true when aio_poll is called with blocking == false;
+     * if blocking == true, it is only true after poll() returns.
+     *
+     * If we're in a nested event loop, ctx->dispatching might be true.
+     * In that case we can restore it just before returning, but we
+     * have to clear it now.
+     */
+    aio_set_dispatching(ctx, !blocking);
+
+    ctx->walking_handlers++;
+
+    timeout = blocking ? aio_compute_timeout(ctx) : 0;
+
+    if (timeout > 0) {
+        timeout = DIV_ROUND_UP(timeout, 1000000);
+    }
+
+    /* wait until next event */
+    if (timeout) {
+        aio_context_release(ctx);
+    }
+    ret = epoll_wait(ctx->epollfd, events, EPOLL_BATCH, timeout);
+    if (timeout) {
+        aio_context_acquire(ctx);
+    }
+
+    /* if we have any readable fds, dispatch event */
+    if (ret > 0) {
+        for (i = 0; i < ret; i++) {
+            node = events[i].data.ptr;
+            node->pfd.revents = events[i].events;
+        }
+    }
+
+    ctx->walking_handlers--;
+
+    /* Run dispatch even if there were no readable fds to run timers */
+    aio_set_dispatching(ctx, true);
+    if (aio_dispatch(ctx)) {
+        progress = true;
+    }
+
+    aio_set_dispatching(ctx, was_dispatching);
+    aio_context_release(ctx);
+
+    return progress;
+}
+#else
+
+/* These thread-local variables are used only in a small part of aio_poll_posix
  * around the call to the poll() system call.  In particular they are not
  * used while aio_poll is performing callbacks, which makes it much easier
  * to think about reentrancy!
@@ -212,7 +313,6 @@  bool aio_dispatch(AioContext *ctx)
 static __thread GPollFD *pollfds;
 static __thread AioHandler **nodes;
 static __thread unsigned npfd, nalloc;
-static __thread Notifier pollfds_cleanup_notifier;
 
 static void pollfds_cleanup(Notifier *n, void *unused)
 {
@@ -221,7 +321,7 @@  static void pollfds_cleanup(Notifier *n, void *unused)
     g_free(nodes);
     nalloc = 0;
 }
-
+static __thread Notifier pollfds_cleanup_notifier;
 static void add_pollfd(AioHandler *node)
 {
     if (npfd == nalloc) {
@@ -244,7 +344,7 @@  static void add_pollfd(AioHandler *node)
     npfd++;
 }
 
-bool aio_poll(AioContext *ctx, bool blocking)
+bool aio_poll_posix(AioContext *ctx, bool blocking)
 {
     AioHandler *node;
     bool was_dispatching;
@@ -311,3 +411,13 @@  bool aio_poll(AioContext *ctx, bool blocking)
 
     return progress;
 }
+#endif
+
+bool aio_poll(AioContext *ctx, bool blocking)
+{
+#ifdef CONFIG_EPOLL
+    return aio_poll_epoll(ctx, blocking);
+#else
+    return aio_poll_posix(ctx, blocking);
+#endif
+}
diff --git a/include/block/aio.h b/include/block/aio.h
index 5120583..9178ff2 100644
--- a/include/block/aio.h
+++ b/include/block/aio.h
@@ -87,6 +87,9 @@  struct AioContext {
 
     /* TimerLists for calling timers - one per clock type */
     QEMUTimerListGroup tlg;
+
+    /* epoll fd */
+    int epollfd;
 };
 
 /* Used internally to synchronize aio_poll against qemu_bh_schedule.  */