From patchwork Tue Jun 30 13:19:45 2015
X-Patchwork-Submitter: Fam Zheng
X-Patchwork-Id: 489716
From: Fam Zheng <famz@redhat.com>
To: qemu-devel@nongnu.org
Cc: Kevin Wolf, pbonzini@redhat.com, qemu-block@nongnu.org, Stefan Hajnoczi
Date: Tue, 30 Jun 2015 21:19:45 +0800
Message-Id: <1435670385-625-5-git-send-email-famz@redhat.com>
In-Reply-To: <1435670385-625-1-git-send-email-famz@redhat.com>
References: <1435670385-625-1-git-send-email-famz@redhat.com>
Subject: [Qemu-devel] [PATCH RFC 4/4] aio-posix: Use epoll in aio_poll

This patch lets aio_poll use the epoll_wait(2) syscall instead of
qemu_poll_ns where possible. It improves the scalability of iothreads
(for example, virtio-scsi-dataplane).

The epollfd is managed together with the GSource and ctx->aio_handlers:
an epoll_event instance is created for each watched aio fd and added to
the epollfd with epoll_ctl.
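For readers less familiar with epoll(7), the self-contained sketch below
shows the registration lifecycle that the handler bookkeeping above maps
onto: EPOLL_CTL_ADD when an fd is first registered, EPOLL_CTL_MOD when its
handler is updated, and EPOLL_CTL_DEL when it is removed. This is an
illustration only, not code from this series; the fd choice and names are
arbitrary.

/* Illustrative sketch (not part of this patch): the epoll_ctl add/mod/del
 * cycle that aio_set_fd_handler_pri mirrors for each watched fd.
 */
#include <sys/epoll.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(void)
{
    int epollfd = epoll_create1(EPOLL_CLOEXEC);    /* cf. aio_context_setup() */
    if (epollfd < 0) {
        perror("epoll_create1");
        return EXIT_FAILURE;
    }

    struct epoll_event event = {
        .events = EPOLLIN,       /* interest mask, cf. node->pfd.events */
        .data.ptr = NULL,        /* would point at the AioHandler node  */
    };

    /* First registration of the fd: EPOLL_CTL_ADD */
    if (epoll_ctl(epollfd, EPOLL_CTL_ADD, STDIN_FILENO, &event) < 0) {
        perror("EPOLL_CTL_ADD");
        return EXIT_FAILURE;
    }

    /* Handler updated (e.g. now also interested in writes): EPOLL_CTL_MOD */
    event.events = EPOLLIN | EPOLLOUT;
    if (epoll_ctl(epollfd, EPOLL_CTL_MOD, STDIN_FILENO, &event) < 0) {
        perror("EPOLL_CTL_MOD");
        return EXIT_FAILURE;
    }

    /* Handler removed: EPOLL_CTL_DEL (the event argument is ignored) */
    if (epoll_ctl(epollfd, EPOLL_CTL_DEL, STDIN_FILENO, &event) < 0) {
        perror("EPOLL_CTL_DEL");
        return EXIT_FAILURE;
    }

    puts("add/mod/del cycle completed");
    close(epollfd);
    return EXIT_SUCCESS;
}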
The following table is a fio benchmark comparison on a single guest block
device, with different numbers of disks attached to the same SCSI bus
(in MB/s):

=====================================================================
 # of scsi-disks  |       master       |       epoll
                  |  rd   wr   randrw  |  rd   wr   randrw
---------------------------------------------------------------------
        1         | 103   96     49    | 105   99     49
        4         |  92   96     48    | 103   98     49
        8         |  96   94     46    | 101   97     50
       16         |  91   91     45    | 101   95     48
       32         |  84   83     40    |  95   95     48
       64         |  75   73     35    |  91   90     44
      128         |  54   53     26    |  79   80     39
      256         |  41   39     19    |  63   62     30
=====================================================================

Signed-off-by: Fam Zheng <famz@redhat.com>
---
 aio-posix.c         | 118 ++++++++++++++++++++++++++++++++++++++++++++++++++--
 include/block/aio.h |   3 ++
 2 files changed, 117 insertions(+), 4 deletions(-)

diff --git a/aio-posix.c b/aio-posix.c
index 22406ce..111d7fb 100644
--- a/aio-posix.c
+++ b/aio-posix.c
@@ -17,6 +17,9 @@
 #include "block/block.h"
 #include "qemu/queue.h"
 #include "qemu/sockets.h"
+#ifdef CONFIG_EPOLL
+#include <sys/epoll.h>
+#endif
 
 struct AioHandler {
@@ -44,6 +47,12 @@ static AioHandler *find_aio_handler(AioContext *ctx, int fd)
 
 void aio_context_setup(AioContext *ctx, Error **errp)
 {
+#ifdef CONFIG_EPOLL
+    ctx->epollfd = epoll_create1(EPOLL_CLOEXEC);
+    if (ctx->epollfd < 0) {
+        error_setg(errp, "Failed to create epoll fd: %s", strerror(errno));
+    }
+#endif
 }
 
 void aio_set_fd_handler_pri(AioContext *ctx,
@@ -54,6 +63,11 @@ void aio_set_fd_handler_pri(AioContext *ctx,
                             void *opaque)
 {
     AioHandler *node;
+#ifdef CONFIG_EPOLL
+    struct epoll_event event;
+    int r;
+    bool add = false;
+#endif
 
     node = find_aio_handler(ctx, fd);
 
@@ -61,6 +75,10 @@ void aio_set_fd_handler_pri(AioContext *ctx,
     if (!io_read && !io_write && !io_read_pri) {
         if (node) {
             g_source_remove_poll(&ctx->source, &node->pfd);
+#ifdef CONFIG_EPOLL
+            r = epoll_ctl(ctx->epollfd, EPOLL_CTL_DEL, fd, &event);
+            assert(!r);
+#endif
 
             /* If the lock is held, just mark the node as deleted */
             if (ctx->walking_handlers) {
@@ -83,6 +101,9 @@ void aio_set_fd_handler_pri(AioContext *ctx,
             QLIST_INSERT_HEAD(&ctx->aio_handlers, node, node);
 
             g_source_add_poll(&ctx->source, &node->pfd);
+#ifdef CONFIG_EPOLL
+            add = true;
+#endif
         }
         /* Update handler with latest information */
         node->io_read = io_read;
@@ -93,6 +114,13 @@ void aio_set_fd_handler_pri(AioContext *ctx,
         node->pfd.events = (io_read ? G_IO_IN | G_IO_HUP | G_IO_ERR : 0);
         node->pfd.events |= (io_write ? G_IO_OUT | G_IO_ERR : 0);
         node->pfd.events |= (io_read_pri ? G_IO_PRI | G_IO_HUP | G_IO_ERR : 0);
+#ifdef CONFIG_EPOLL
+        event.data.ptr = node;
+        event.events = node->pfd.events;
+        r = epoll_ctl(ctx->epollfd, add ? EPOLL_CTL_ADD : EPOLL_CTL_MOD,
+                      fd, &event);
+        assert(!r);
+#endif
     }
 
     aio_notify(ctx);
@@ -198,7 +226,80 @@ bool aio_dispatch(AioContext *ctx)
     return progress;
 }
 
-/* These thread-local variables are used only in a small part of aio_poll
+#ifdef CONFIG_EPOLL
+QEMU_BUILD_BUG_ON((int)G_IO_IN != EPOLLIN);
+QEMU_BUILD_BUG_ON((int)G_IO_OUT != EPOLLOUT);
+QEMU_BUILD_BUG_ON((int)G_IO_PRI != EPOLLPRI);
+QEMU_BUILD_BUG_ON((int)G_IO_ERR != EPOLLERR);
+QEMU_BUILD_BUG_ON((int)G_IO_HUP != EPOLLHUP);
+
+#define EPOLL_BATCH 128
+static bool aio_poll_epoll(AioContext *ctx, bool blocking)
+{
+    AioHandler *node;
+    bool was_dispatching;
+    int i, ret;
+    bool progress;
+    int64_t timeout;
+    struct epoll_event events[EPOLL_BATCH];
+
+    aio_context_acquire(ctx);
+    was_dispatching = ctx->dispatching;
+    progress = false;
+
+    /* aio_notify can avoid the expensive event_notifier_set if
+     * everything (file descriptors, bottom halves, timers) will
+     * be re-evaluated before the next blocking poll(). This is
+     * already true when aio_poll is called with blocking == false;
+     * if blocking == true, it is only true after poll() returns.
+     *
+     * If we're in a nested event loop, ctx->dispatching might be true.
+     * In that case we can restore it just before returning, but we
+     * have to clear it now.
+     */
+    aio_set_dispatching(ctx, !blocking);
+
+    ctx->walking_handlers++;
+
+    timeout = blocking ? aio_compute_timeout(ctx) : 0;
+
+    if (timeout > 0) {
+        timeout = DIV_ROUND_UP(timeout, 1000000);
+    }
+
+    /* wait until next event */
+    if (timeout) {
+        aio_context_release(ctx);
+    }
+    ret = epoll_wait(ctx->epollfd, events, EPOLL_BATCH, timeout);
+    if (timeout) {
+        aio_context_acquire(ctx);
+    }
+
+    /* if we have any readable fds, dispatch event */
+    if (ret > 0) {
+        for (i = 0; i < ret; i++) {
+            node = events[i].data.ptr;
+            node->pfd.revents = events[i].events;
+        }
+    }
+
+    ctx->walking_handlers--;
+
+    /* Run dispatch even if there were no readable fds to run timers */
+    aio_set_dispatching(ctx, true);
+    if (aio_dispatch(ctx)) {
+        progress = true;
+    }
+
+    aio_set_dispatching(ctx, was_dispatching);
+    aio_context_release(ctx);
+
+    return progress;
+}
+#else
+
+/* These thread-local variables are used only in a small part of aio_poll_posix
 * around the call to the poll() system call. In particular they are not
 * used while aio_poll is performing callbacks, which makes it much easier
 * to think about reentrancy!
@@ -212,7 +313,6 @@ bool aio_dispatch(AioContext *ctx)
 static __thread GPollFD *pollfds;
 static __thread AioHandler **nodes;
 static __thread unsigned npfd, nalloc;
-static __thread Notifier pollfds_cleanup_notifier;
 
 static void pollfds_cleanup(Notifier *n, void *unused)
 {
@@ -221,7 +321,7 @@ static void pollfds_cleanup(Notifier *n, void *unused)
     g_free(nodes);
     nalloc = 0;
 }
-
+static __thread Notifier pollfds_cleanup_notifier;
 static void add_pollfd(AioHandler *node)
 {
     if (npfd == nalloc) {
@@ -244,7 +344,7 @@ static void add_pollfd(AioHandler *node)
     npfd++;
 }
 
-bool aio_poll(AioContext *ctx, bool blocking)
+bool aio_poll_posix(AioContext *ctx, bool blocking)
 {
     AioHandler *node;
     bool was_dispatching;
@@ -311,3 +411,13 @@ bool aio_poll(AioContext *ctx, bool blocking)
 
     return progress;
 }
+#endif
+
+bool aio_poll(AioContext *ctx, bool blocking)
+{
+#ifdef CONFIG_EPOLL
+    return aio_poll_epoll(ctx, blocking);
+#else
+    return aio_poll_posix(ctx, blocking);
+#endif
+}
diff --git a/include/block/aio.h b/include/block/aio.h
index 5120583..9178ff2 100644
--- a/include/block/aio.h
+++ b/include/block/aio.h
@@ -87,6 +87,9 @@ struct AioContext {
 
     /* TimerLists for calling timers - one per clock type */
     QEMUTimerListGroup tlg;
+
+    /* epoll fd */
+    int epollfd;
 };
 
 /* Used internally to synchronize aio_poll against qemu_bh_schedule. */
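As a usage note for aio_poll_epoll above: epoll_wait(2) takes its timeout in
milliseconds, which is why the nanosecond value from aio_compute_timeout is
rounded up with DIV_ROUND_UP(timeout, 1000000), and each ready event carries
back the data.ptr registered with epoll_ctl, which is how the AioHandler is
recovered without scanning ctx->aio_handlers. The standalone sketch below
demonstrates both points; it is an illustration only, not QEMU code, and the
Handler type, on_stdin callback, and numbers are made up.

/* Hypothetical sketch of the epoll_wait dispatch pattern: a bounded batch of
 * events, data.ptr pointing back at the handler, and a nanosecond deadline
 * rounded up to epoll's millisecond timeout. Not QEMU code.
 */
#include <sys/epoll.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

#define BATCH 128                       /* cf. EPOLL_BATCH in the patch */

typedef struct Handler {
    int fd;
    void (*cb)(struct Handler *h);
} Handler;

static void on_stdin(Handler *h)
{
    char buf[256];
    ssize_t len = read(h->fd, buf, sizeof(buf));
    printf("read %zd bytes from fd %d\n", len, h->fd);
}

int main(void)
{
    int epollfd = epoll_create1(EPOLL_CLOEXEC);
    if (epollfd < 0) {
        perror("epoll_create1");
        return EXIT_FAILURE;
    }

    Handler h = { .fd = STDIN_FILENO, .cb = on_stdin };
    struct epoll_event ev = { .events = EPOLLIN, .data.ptr = &h };
    if (epoll_ctl(epollfd, EPOLL_CTL_ADD, h.fd, &ev) < 0) {
        perror("epoll_ctl");
        return EXIT_FAILURE;
    }

    /* A deadline computed in nanoseconds (like aio_compute_timeout) must be
     * rounded *up* to milliseconds so we never wake before the deadline. */
    int64_t timeout_ns = 2500000;                            /* 2.5 ms */
    int timeout_ms = (timeout_ns + 1000000 - 1) / 1000000;   /* -> 3 ms */

    struct epoll_event ready[BATCH];
    int n = epoll_wait(epollfd, ready, BATCH, timeout_ms);
    for (int i = 0; i < n; i++) {
        Handler *handler = ready[i].data.ptr;    /* registered above */
        if (ready[i].events & EPOLLIN) {
            handler->cb(handler);
        }
    }

    close(epollfd);
    return EXIT_SUCCESS;
}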