From patchwork Fri Jul 4 17:41:55 2014 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "Dr. David Alan Gilbert" X-Patchwork-Id: 367196 Return-Path: X-Original-To: incoming@patchwork.ozlabs.org Delivered-To: patchwork-incoming@bilbo.ozlabs.org Received: from lists.gnu.org (lists.gnu.org [IPv6:2001:4830:134:3::11]) (using TLSv1 with cipher AES256-SHA (256/256 bits)) (No client certificate requested) by ozlabs.org (Postfix) with ESMTPS id 03ABB140092 for ; Sat, 5 Jul 2014 04:48:46 +1000 (EST) Received: from localhost ([::1]:37418 helo=lists.gnu.org) by lists.gnu.org with esmtp (Exim 4.71) (envelope-from ) id 1X37cS-0007xs-29 for incoming@patchwork.ozlabs.org; Fri, 04 Jul 2014 13:49:56 -0400 Received: from eggs.gnu.org ([2001:4830:134:3::10]:33066) by lists.gnu.org with esmtp (Exim 4.71) (envelope-from ) id 1X37W6-0008Nw-Lp for qemu-devel@nongnu.org; Fri, 04 Jul 2014 13:43:28 -0400 Received: from Debian-exim by eggs.gnu.org with spam-scanned (Exim 4.71) (envelope-from ) id 1X37W0-0000j8-FN for qemu-devel@nongnu.org; Fri, 04 Jul 2014 13:43:22 -0400 Received: from mx1.redhat.com ([209.132.183.28]:20758) by eggs.gnu.org with esmtp (Exim 4.71) (envelope-from ) id 1X37W0-0000iy-7o for qemu-devel@nongnu.org; Fri, 04 Jul 2014 13:43:16 -0400 Received: from int-mx14.intmail.prod.int.phx2.redhat.com (int-mx14.intmail.prod.int.phx2.redhat.com [10.5.11.27]) by mx1.redhat.com (8.14.4/8.14.4) with ESMTP id s64HhDb0018028 (version=TLSv1/SSLv3 cipher=DHE-RSA-AES256-GCM-SHA384 bits=256 verify=OK); Fri, 4 Jul 2014 13:43:13 -0400 Received: from dgilbert-t530.home.treblig.org (vpn1-7-141.ams2.redhat.com [10.36.7.141]) by int-mx14.intmail.prod.int.phx2.redhat.com (8.14.4/8.14.4) with ESMTP id s64HfvUD030576; Fri, 4 Jul 2014 13:43:12 -0400 From: "Dr. David Alan Gilbert (git)" To: qemu-devel@nongnu.org Date: Fri, 4 Jul 2014 18:41:55 +0100 Message-Id: <1404495717-4239-45-git-send-email-dgilbert@redhat.com> In-Reply-To: <1404495717-4239-1-git-send-email-dgilbert@redhat.com> References: <1404495717-4239-1-git-send-email-dgilbert@redhat.com> X-Scanned-By: MIMEDefang 2.68 on 10.5.11.27 X-detected-operating-system: by eggs.gnu.org: GNU/Linux 3.x X-Received-From: 209.132.183.28 Cc: aarcange@redhat.com, yamahata@private.email.ne.jp, lilei@linux.vnet.ibm.com, quintela@redhat.com Subject: [Qemu-devel] [PATCH 44/46] postcopy: Use userfaultfd X-BeenThere: qemu-devel@nongnu.org X-Mailman-Version: 2.1.14 Precedence: list List-Id: List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Errors-To: qemu-devel-bounces+incoming=patchwork.ozlabs.org@nongnu.org Sender: qemu-devel-bounces+incoming=patchwork.ozlabs.org@nongnu.org From: "Dr. David Alan Gilbert" userfaultfd is a Linux syscall that gives an fd that receives a stream of notifications of accesses to pages marked as MADV_USERFAULT, and allows the program to acknowledge those stalls and tell the accessing thread to carry on. Signed-off-by: Dr. David Alan Gilbert --- postcopy-ram.c | 95 ++++++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 89 insertions(+), 6 deletions(-) diff --git a/postcopy-ram.c b/postcopy-ram.c index 8d0a225..466c42b 100644 --- a/postcopy-ram.c +++ b/postcopy-ram.c @@ -68,6 +68,14 @@ #define __NR_remap_anon_pages 317 #endif +#ifndef __NR_userfaultfd +#define __NR_userfaultfd 318 +#endif + +#ifndef USERFAULTFD_PROTOCOL +#define USERFAULTFD_PROTOCOL (uint64_t)0xaa +#endif + /* ---------------------------------------------------------------------- */ /* Postcopy pagemap-inbound (pmi) - data structures that record the */ /* state of each page used by the inbound postcopy */ @@ -192,6 +200,7 @@ int postcopy_ram_hosttest(void) */ void *testarea, *testarea2; long pagesize = getpagesize(); + int ufd; testarea = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); @@ -201,15 +210,24 @@ int postcopy_ram_hosttest(void) } g_assert(((size_t)testarea & (pagesize-1)) == 0); + ufd = syscall(__NR_userfaultfd, O_CLOEXEC); + if (ufd == -1) { + perror("postcopy_ram_hosttest: userfaultfd not available"); + munmap(testarea, pagesize); + return -1; + } + if (madvise(testarea, pagesize, MADV_USERFAULT)) { perror("postcopy_ram_hosttest: MADV_USERFAULT not available"); munmap(testarea, pagesize); + close(ufd); return -1; } if (madvise(testarea, pagesize, MADV_NOUSERFAULT)) { perror("postcopy_ram_hosttest: MADV_NOUSERFAULT not available"); munmap(testarea, pagesize); + close(ufd); return -1; } @@ -226,11 +244,13 @@ int postcopy_ram_hosttest(void) perror("postcopy_ram_hosttest: remap_anon_pages not available"); munmap(testarea, pagesize); munmap(testarea2, pagesize); + close(ufd); return -1; } munmap(testarea, pagesize); munmap(testarea2, pagesize); + close(ufd); return 0; } @@ -361,6 +381,39 @@ static int postcopy_ram_sensitise_area(const char *block_name, void *host_addr, } /* + * Tell the kernel that we've now got some memory it previously asked for. + * Note: We're not allowed to ack a page which wasn't requested. + */ +static int ack_userfault(MigrationIncomingState *mis, void *start, size_t len) +{ + uint64_t tmp[2]; + + /* Kernel wants the range that's now safe to access */ + tmp[0] = (uint64_t)start; + tmp[1] = (uint64_t)start + (uint64_t)(len-1); + + if (write(mis->userfault_fd, tmp, 16) != 16) { + int e = errno; + + if (e == ENOENT) { + /* Kernel said it wasn't waiting - one case where this can + * happen is where two threads triggered the userfault + * and we receive the page and ack it just after we received + * the 2nd request and that ends up deciding it should ack it + * We could optimise it out, but it's rare. + */ + /*fprintf(stderr, "ack_userfault: %p/%zx ENOENT\n", start, len); */ + return 0; + } + error_report("postcopy_ram: Failed to notify kernel for %p/%zx (%d)", + start, len, e); + return -errno; + } + + return 0; +} + +/* * Handle faults detected by the USERFAULT markings */ static void *postcopy_ram_fault_thread(void *opaque) @@ -420,10 +473,9 @@ static void *postcopy_ram_fault_thread(void *opaque) /* Already arrived - no state change, just kick the kernel */ DPRINTF("postcopy_ram_fault_thread: notify pre of %p", hostaddr); - /* TODO! Send ack if (ack_userfault(mis, hostaddr, hostpagesize)) { assert(0); - } */ + } break; case POSTCOPY_PMI_MISSING: @@ -464,8 +516,33 @@ static void *postcopy_ram_fault_thread(void *opaque) int postcopy_ram_enable_notify(MigrationIncomingState *mis) { - /* Create the fault handler thread and wait for it to be ready */ - mis->userfault_fd = -1; /* TODO */ + uint64_t tmp64; + + /* Open the fd for the kernel to give us userfaults */ + mis->userfault_fd = syscall(__NR_userfaultfd, O_CLOEXEC); + if (mis->userfault_fd == -1) { + perror("Failed to open userfault fd"); + return -1; + } + + /* + * Version handshake, we send it the version we want and expect to get the + * same back. + */ + tmp64 = USERFAULTFD_PROTOCOL; + if (write(mis->userfault_fd, &tmp64, sizeof(tmp64)) != sizeof(tmp64)) { + perror("Writing userfaultfd version"); + return -1; + } + if (read(mis->userfault_fd, &tmp64, sizeof(tmp64)) != sizeof(tmp64)) { + perror("Reading userfaultfd version"); + return -1; + } + if (tmp64 != USERFAULTFD_PROTOCOL) { + error_report("Mismatched userfaultfd version, expected %zx, got %zx", + (size_t)USERFAULTFD_PROTOCOL, (size_t)tmp64); + } + qemu_sem_init(&mis->fault_thread_sem, 0); qemu_thread_create(&mis->fault_thread, "postcopy/fault", postcopy_ram_fault_thread, mis, QEMU_THREAD_JOINABLE); @@ -476,6 +553,8 @@ int postcopy_ram_enable_notify(MigrationIncomingState *mis) return -1; } + DPRINTF("postcopy_ram_enable_notify: Sensitised"); + return 0; } @@ -509,11 +588,12 @@ int postcopy_place_page(MigrationIncomingState *mis, void *host, void *from, if (syscall(__NR_remap_anon_pages, host, from, getpagesize(), 0) != getpagesize()) { + int e = errno; perror("remap_anon_pages in postcopy_place_page"); fprintf(stderr, "host: %p from: %p pmi=%d\n", host, from, postcopy_pmi_get_state(mis, bitmap_offset)); - return -errno; + return -e; } tmp_state = postcopy_pmi_get_state(mis, bitmap_offset); @@ -526,7 +606,10 @@ int postcopy_place_page(MigrationIncomingState *mis, void *host, void *from, if (old_state == POSTCOPY_PMI_REQUESTED) { - /* TODO: Notify kernel */ + /* Send the kernel the host address that should now be accessible */ + DPRINTF("%s: Notifying kernel bitmap_offset=0x%lx host=%p", + __func__, bitmap_offset, host); + return ack_userfault(mis, host, getpagesize()); } /* TODO: hostpagesize!=targetpagesize case */