@@ -10,6 +10,7 @@ util-obj-y += qmp-introspect.o qapi-types.o qapi-visit.o qapi-event.o
block-obj-y = async.o thread-pool.o
block-obj-y += nbd.o block.o blockjob.o
block-obj-y += main-loop.o iohandler.o qemu-timer.o
+block-obj-$(CONFIG_EPOLL) += aio-epoll.o
block-obj-$(CONFIG_POSIX) += aio-posix.o
block-obj-$(CONFIG_WIN32) += aio-win32.o
block-obj-y += block/
new file mode 100644
@@ -0,0 +1,150 @@
+/*
+ * QEMU aio implementation
+ *
+ * Copyright Red Hat, Inc, 2015
+ *
+ * Authors:
+ * Fam Zheng <famz@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ * Contributions after 2012-01-13 are licensed under the terms of the
+ * GNU GPL, version 2 or (at your option) any later version.
+ */
+
+#include "qemu-common.h"
+#include "block/block.h"
+#include "qemu/queue.h"
+#include "block/aio-internal.h"
+#include <sys/epoll.h>
+
+/* The fd number threshold to switch to epoll */
+#define EPOLL_ENABLE_THRESHOLD 64
+
+/*
+ * Turn epoll off for @ctx: clear epoll_available and, if epoll was
+ * enabled, clear epoll_enabled and close the epoll file descriptor.
+ */
+static void aio_epoll_disable(AioContext *ctx)
+{
+    bool was_enabled = ctx->epoll_enabled;
+
+    ctx->epoll_available = false;
+    if (was_enabled) {
+        ctx->epoll_enabled = false;
+        close(ctx->epollfd);
+    }
+}
+
+/*
+ * Translate a GPollFD event mask (G_IO_IN/OUT/HUP/ERR) into the
+ * corresponding EPOLL* event mask.  Other bits are dropped.
+ */
+static inline int epoll_events_from_pfd(int pfd_events)
+{
+    int mask = 0;
+
+    if (pfd_events & G_IO_IN) {
+        mask |= EPOLLIN;
+    }
+    if (pfd_events & G_IO_OUT) {
+        mask |= EPOLLOUT;
+    }
+    if (pfd_events & G_IO_HUP) {
+        mask |= EPOLLHUP;
+    }
+    if (pfd_events & G_IO_ERR) {
+        mask |= EPOLLERR;
+    }
+    return mask;
+}
+
+/*
+ * Register every live handler of @ctx that has a non-empty event mask
+ * with the epoll fd, then mark epoll as enabled.
+ *
+ * Returns false without enabling epoll if epoll is not available or if
+ * any registration fails; handlers registered before the failure are
+ * not removed again here.
+ */
+static bool aio_epoll_try_enable(AioContext *ctx)
+{
+    AioHandler *handler;
+
+    if (!ctx->epoll_available) {
+        return false;
+    }
+
+    QLIST_FOREACH(handler, &ctx->aio_handlers, node) {
+        struct epoll_event ev;
+
+        if (!handler->deleted && handler->pfd.events) {
+            ev.events = epoll_events_from_pfd(handler->pfd.events);
+            ev.data.ptr = handler;
+            if (epoll_ctl(ctx->epollfd, EPOLL_CTL_ADD,
+                          handler->pfd.fd, &ev) != 0) {
+                return false;
+            }
+        }
+    }
+
+    ctx->epoll_enabled = true;
+    return true;
+}
+
+/*
+ * Propagate a change of @node's event mask to the epoll set of @ctx.
+ *
+ * Does nothing while epoll is not enabled.  An empty event mask removes
+ * the fd from the set; otherwise the fd is added when @is_new is true
+ * and modified in place when it is false.
+ *
+ * Any epoll_ctl() failure (add, modify or delete) disables epoll for
+ * @ctx through aio_epoll_disable() rather than aborting the process, so
+ * aio_epoll_check_poll() reports false from then on.
+ */
+void aio_epoll_update(AioContext *ctx, AioHandler *node, bool is_new)
+{
+    /* Zeroed so EPOLL_CTL_DEL never hands uninitialized data to the kernel */
+    struct epoll_event event = { 0 };
+    int ctl;
+
+    if (!ctx->epoll_enabled) {
+        return;
+    }
+
+    if (!node->pfd.events) {
+        ctl = EPOLL_CTL_DEL;
+    } else {
+        event.data.ptr = node;
+        event.events = epoll_events_from_pfd(node->pfd.events);
+        ctl = is_new ? EPOLL_CTL_ADD : EPOLL_CTL_MOD;
+    }
+
+    if (epoll_ctl(ctx->epollfd, ctl, node->pfd.fd, &event)) {
+        aio_epoll_disable(ctx);
+    }
+}
+
+/*
+ * Wait for events on the epoll set of @ctx and store the translated
+ * result in the pfd.revents field of each ready handler.
+ *
+ * @pfds must hold exactly one entry, the epoll fd itself (@npfd == 1).
+ * @timeout is in nanoseconds as taken by qemu_poll_ns(): a positive
+ * value first sleeps in qemu_poll_ns() on the epoll fd, zero polls
+ * without blocking, and a negative value blocks until an event arrives.
+ *
+ * Returns the number of ready handlers, 0 on timeout, or the negative
+ * result of the failing wait call.
+ */
+int aio_epoll(AioContext *ctx, GPollFD *pfds, unsigned npfd, int64_t timeout)
+{
+    struct epoll_event events[128];
+    int wait_ms;
+    int ret;
+    int i;
+
+    assert(npfd == 1);
+    assert(pfds[0].fd == ctx->epollfd);
+
+    if (timeout > 0) {
+        ret = qemu_poll_ns(pfds, npfd, timeout);
+        if (ret <= 0) {
+            return ret;
+        }
+        /*
+         * The wait already happened above.  epoll_wait() takes an int
+         * in milliseconds, so the nanosecond value must not be passed
+         * on; just collect what is ready.
+         */
+        wait_ms = 0;
+    } else {
+        wait_ms = timeout < 0 ? -1 : 0;
+    }
+
+    ret = epoll_wait(ctx->epollfd, events,
+                     sizeof(events) / sizeof(events[0]),
+                     wait_ms);
+    for (i = 0; i < ret; i++) {
+        int ev = events[i].events;
+        AioHandler *node = events[i].data.ptr;
+
+        node->pfd.revents = (ev & EPOLLIN ? G_IO_IN : 0) |
+                            (ev & EPOLLOUT ? G_IO_OUT : 0) |
+                            (ev & EPOLLHUP ? G_IO_HUP : 0) |
+                            (ev & EPOLLERR ? G_IO_ERR : 0);
+    }
+    return ret;
+}
+
+/*
+ * Decide whether the current poll iteration should go through epoll.
+ *
+ * Returns true if epoll is already enabled, or if @npfd has reached
+ * EPOLL_ENABLE_THRESHOLD and enabling epoll succeeds now.  Returns
+ * false otherwise.  @pfds and @timeout are not used.
+ *
+ * Staying below the threshold leaves epoll available so that it can
+ * still be enabled once more fds are added; epoll is only disabled
+ * when an attempt to enable it fails.
+ */
+bool aio_epoll_check_poll(AioContext *ctx, GPollFD *pfds,
+                          unsigned npfd, int64_t timeout)
+{
+    if (!ctx->epoll_available) {
+        return false;
+    }
+    if (ctx->epoll_enabled) {
+        return true;
+    }
+    if (npfd >= EPOLL_ENABLE_THRESHOLD) {
+        if (aio_epoll_try_enable(ctx)) {
+            return true;
+        }
+        /* Some fd cannot be handled by epoll: stop trying from now on */
+        aio_epoll_disable(ctx);
+    }
+    return false;
+}
+
+/*
+ * Create the epoll file descriptor for @ctx (close-on-exec) and record
+ * in epoll_available whether the creation succeeded.  @errp is not
+ * written to.
+ */
+void aio_context_setup_epoll(AioContext *ctx, Error **errp)
+{
+    int fd;
+
+    assert(!ctx->epollfd);
+    fd = epoll_create1(EPOLL_CLOEXEC);
+    ctx->epollfd = fd;
+    ctx->epoll_available = (fd != -1);
+}
@@ -39,6 +39,7 @@ void aio_set_fd_handler(AioContext *ctx,
void *opaque)
{
AioHandler *node;
+ bool is_new = false;
node = find_aio_handler(ctx, fd);
@@ -68,6 +69,7 @@ void aio_set_fd_handler(AioContext *ctx,
QLIST_INSERT_HEAD(&ctx->aio_handlers, node, node);
g_source_add_poll(&ctx->source, &node->pfd);
+ is_new = true;
}
/* Update handler with latest information */
node->io_read = io_read;
@@ -78,6 +80,7 @@ void aio_set_fd_handler(AioContext *ctx,
node->pfd.events |= (io_write ? G_IO_OUT | G_IO_ERR : 0);
}
+ aio_epoll_update(ctx, node, is_new);
aio_notify(ctx);
}
@@ -248,7 +251,7 @@ bool aio_poll(AioContext *ctx, bool blocking)
/* fill pollfds */
QLIST_FOREACH(node, &ctx->aio_handlers, node) {
- if (!node->deleted && node->pfd.events) {
+ if (!node->deleted && node->pfd.events && !ctx->epoll_enabled) {
add_pollfd(node);
}
}
@@ -259,7 +262,15 @@ bool aio_poll(AioContext *ctx, bool blocking)
if (timeout) {
aio_context_release(ctx);
}
- ret = qemu_poll_ns((GPollFD *)pollfds, npfd, timeout);
+ if (aio_epoll_check_poll(ctx, pollfds, npfd, timeout)) {
+ npfd = 0;
+ AioHandler epoll_handler = { 0 };
+ aio_epoll_fill(ctx, &epoll_handler);
+ add_pollfd(&epoll_handler);
+ ret = aio_epoll(ctx, pollfds, npfd, timeout);
+ } else {
+ ret = qemu_poll_ns(pollfds, npfd, timeout);
+ }
if (blocking) {
atomic_sub(&ctx->notify_me, 2);
}
@@ -291,4 +302,5 @@ bool aio_poll(AioContext *ctx, bool blocking)
void aio_context_setup(AioContext *ctx, Error **errp)
{
+ aio_context_setup_epoll(ctx, errp);
}
@@ -29,4 +29,19 @@ struct AioHandler {
void aio_context_setup(AioContext *ctx, Error **errp);
+void aio_epoll_update(AioContext *ctx, AioHandler *node, bool is_new);
+
+int aio_epoll(AioContext *ctx, GPollFD *pfds, unsigned npfd, int64_t timeout);
+
+bool aio_epoll_check_poll(AioContext *ctx, GPollFD *pfds,
+ unsigned npfd, int64_t timeout);
+
+void aio_context_setup_epoll(AioContext *ctx, Error **errp);
+
+/*
+ * Set up @node's pollfd to watch the epoll fd of @ctx for input,
+ * output, hang-up and error conditions.
+ */
+static inline void aio_epoll_fill(AioContext *ctx, AioHandler *node)
+{
+    const int watched = G_IO_IN | G_IO_OUT | G_IO_HUP | G_IO_ERR;
+
+    node->pfd.events = watched;
+    node->pfd.fd = ctx->epollfd;
+}
+
#endif
@@ -122,6 +122,11 @@ struct AioContext {
/* TimerLists for calling timers - one per clock type */
QEMUTimerListGroup tlg;
+
+ /* Fields below are used by aio-epoll. */
+ int epollfd;
+ bool epoll_enabled;
+ bool epoll_available;
};
/**
@@ -39,3 +39,4 @@ stub-obj-y += cpus.o
stub-obj-y += kvm.o
stub-obj-y += qmp_pc_dimm_device_list.o
stub-obj-y += target-monitor-defs.o
+stub-obj-y += aio-epoll.o
new file mode 100644
@@ -0,0 +1,37 @@
+/*
+ * QEMU aio epoll stub functions
+ *
+ * Copyright Red Hat, Inc, 2015
+ *
+ * Authors:
+ * Fam Zheng <famz@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ * Contributions after 2012-01-13 are licensed under the terms of the
+ * GNU GPL, version 2 or (at your option) any later version.
+ */
+
+#include "qemu-common.h"
+#include "block/aio-internal.h"
+
+/* Stub: there is no epoll set to update, so this does nothing. */
+void aio_epoll_update(AioContext *ctx, AioHandler *node, bool is_new)
+{
+    /* intentionally empty */
+}
+
+/*
+ * Stub: must never be called, because the stub aio_epoll_check_poll()
+ * always returns false.  Asserts in debug builds; returns -1 so that
+ * the function still has a defined return value when assertions are
+ * compiled out.
+ */
+int aio_epoll(AioContext *ctx, GPollFD *pfds, unsigned npfd, int64_t timeout)
+{
+    assert(false);
+    return -1;
+}
+
+/* Stub: epoll is never used, so always report false. */
+bool aio_epoll_check_poll(AioContext *ctx, GPollFD *pfds,
+                          unsigned npfd, int64_t timeout)
+{
+    const bool use_epoll = false;
+
+    return use_epoll;
+}
+
+/* Stub: nothing to set up; @ctx and @errp are left untouched. */
+void aio_context_setup_epoll(AioContext *ctx, Error **errp)
+{
+    /* intentionally empty */
+}
+
To minimize code duplication, epoll is hooked into aio-posix's aio_poll() instead of rolling its own. This approach also has the advantage of being able to switch between the two at both compile time and run time: 1) If the configure script didn't find epoll, the libqemustub.a nop functions will be used, which selects the usual ppoll. 2) When QEMU starts with a small number of fds in the event loop, ppoll is used. 3) When QEMU starts with a big number of fds, or when more devices are hot plugged after starting up, epoll automatically kicks in after the number of fds hits the threshold. 4) Some fds may not support epoll, such as tty based stdio. In this case, we can fall back to ppoll. Signed-off-by: Fam Zheng <famz@redhat.com> --- Makefile.objs | 1 + aio-epoll.c | 150 +++++++++++++++++++++++++++++++++++++++++++ aio-posix.c | 16 ++++- include/block/aio-internal.h | 15 +++++ include/block/aio.h | 5 ++ stubs/Makefile.objs | 1 + stubs/aio-epoll.c | 37 +++++++++++ 7 files changed, 223 insertions(+), 2 deletions(-) create mode 100644 aio-epoll.c create mode 100644 stubs/aio-epoll.c