Patchwork [v3,23/35] postcopy: implement incoming part of postcopy live migration

login
register
mail settings
Submitter Isaku Yamahata
Date Oct. 30, 2012, 8:32 a.m.
Message ID <044a5ffe55fd33f37f5b24f81ce125cf23a5e33d.1351582535.git.yamahata@valinux.co.jp>
Download mbox | patch
Permalink /patch/195440/
State New
Headers show

Comments

Isaku Yamahata - Oct. 30, 2012, 8:32 a.m.
This patch implements postcopy live migration for incoming part

Signed-off-by: Isaku Yamahata <yamahata@valinux.co.jp>
---
Changes v2 -> v3:
- threading, not fork
- use blocking io instead of select + non-blocking io
- don't modify RAMBlock
- When device allocates its own RAM region, e.g. vshmem, it's handled by
  device save/load. So skip it such area which has RAM_PREALLOLC_MASK flags
  set.
- less memory overhead
- drop -postcopy option. It is automatically detected.
- various improvement and simplification
- error handling

Changes v1 -> v2:
- fork umemd early to address qemu devices touching guest ram via
  post/pre_load
- code clean up on initialization
- Makefile.target
  migration-postcopy.c is target dependent due to TARGET_PAGE_xxx
  So it can't be shared between target architecture.
- use qemu_fopen_fd
- introduce incoming_flags_use_umem_make_present flag
- use MADV_DONTNEED
- make incoming socket nonblocking
- several clean ups
- Dropped QEMUFilePipe
- Moved QEMUFileNonblock to buffered_file
- Split out into umem/incoming/outgoing
- make mig_read nonblocking when socket
- updates for umem device changes
---
 Makefile.target      |    2 +
 cpu-all.h            |    3 +
 exec.c               |    6 +
 migration-fd.c       |    4 +-
 migration-postcopy.c | 1249 ++++++++++++++++++++++++++++++++++++++++++++++++++
 migration-tcp.c      |   10 +-
 migration-unix.c     |   10 +-
 migration.h          |   10 +
 savevm.c             |   28 ++
 vl.c                 |    2 +
 10 files changed, 1315 insertions(+), 9 deletions(-)
 create mode 100644 migration-postcopy.c

Patch

diff --git a/Makefile.target b/Makefile.target
index 3822bc5..930c070 100644
--- a/Makefile.target
+++ b/Makefile.target
@@ -121,6 +121,8 @@  obj-$(CONFIG_NO_GET_MEMORY_MAPPING) += memory_mapping-stub.o
 obj-$(CONFIG_NO_CORE_DUMP) += dump-stub.o
 LIBS+=-lz
 
+obj-y += migration-postcopy.o umem.o
+
 QEMU_CFLAGS += $(VNC_TLS_CFLAGS)
 QEMU_CFLAGS += $(VNC_SASL_CFLAGS)
 QEMU_CFLAGS += $(VNC_JPEG_CFLAGS)
diff --git a/cpu-all.h b/cpu-all.h
index b5fefc8..79846fe 100644
--- a/cpu-all.h
+++ b/cpu-all.h
@@ -485,6 +485,9 @@  extern ram_addr_t ram_size;
 /* RAM is pre-allocated and passed into qemu_ram_alloc_from_ptr */
 #define RAM_PREALLOC_MASK   (1 << 0)
 
+/* RAM is allocated via umem for postcopy incoming mode */
+#define RAM_POSTCOPY_UMEM_MASK  (1 << 1)
+
 typedef struct RAMBlock {
     struct MemoryRegion *mr;
     uint8_t *host;
diff --git a/exec.c b/exec.c
index 2aa4d90..6da991a 100644
--- a/exec.c
+++ b/exec.c
@@ -36,6 +36,7 @@ 
 #include "arch_init.h"
 #include "memory.h"
 #include "exec-memory.h"
+#include "migration.h"
 #if defined(CONFIG_USER_ONLY)
 #include <qemu.h>
 #if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
@@ -2555,6 +2556,8 @@  ram_addr_t qemu_ram_alloc_from_ptr(ram_addr_t size, void *host,
         new_block->host = host;
         new_block->flags |= RAM_PREALLOC_MASK;
     } else {
+        ram_addr_t page_size = getpagesize();
+        size = (size + page_size - 1) & ~(page_size - 1);
         if (mem_path) {
 #if defined (__linux__) && !defined(TARGET_S390X)
             new_block->host = file_ram_alloc(new_block, size, mem_path);
@@ -2635,6 +2638,9 @@  void qemu_ram_free(ram_addr_t addr)
             ram_list.version++;
             if (block->flags & RAM_PREALLOC_MASK) {
                 ;
+            }
+            else if (block->flags & RAM_POSTCOPY_UMEM_MASK) {
+                postcopy_incoming_ram_free(block);
             } else if (mem_path) {
 #if defined (__linux__) && !defined(TARGET_S390X)
                 if (block->fd) {
diff --git a/migration-fd.c b/migration-fd.c
index b3c54e5..8384975 100644
--- a/migration-fd.c
+++ b/migration-fd.c
@@ -105,7 +105,9 @@  static void fd_accept_incoming_migration(void *opaque)
 
     process_incoming_migration(f);
     qemu_set_fd_handler2(qemu_file_fd(f), NULL, NULL, NULL, NULL);
-    qemu_fclose(f);
+    if (!incoming_postcopy) {
+        qemu_fclose(f);
+    }
 }
 
 int fd_start_incoming_migration(const char *infd)
diff --git a/migration-postcopy.c b/migration-postcopy.c
new file mode 100644
index 0000000..0809ffa
--- /dev/null
+++ b/migration-postcopy.c
@@ -0,0 +1,1249 @@ 
+/*
+ * migration-postcopy.c: postcopy livemigration
+ *
+ * Copyright (c) 2011
+ * National Institute of Advanced Industrial Science and Technology
+ *
+ * https://sites.google.com/site/grivonhome/quick-kvm-migration
+ * Author: Isaku Yamahata <yamahata at valinux co jp>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "config-host.h"
+
+#if defined(CONFIG_MADVISE) || defined(CONFIG_POSIX_MADVISE)
+#include <sys/mman.h>
+#endif
+
+#include "bitmap.h"
+#include "sysemu.h"
+#include "kvm.h"
+#include "hw/hw.h"
+#include "arch_init.h"
+#include "migration.h"
+#include "buffered_file.h"
+#include "qemu_socket.h"
+#include "qemu-thread.h"
+#include "umem.h"
+
+#include "memory.h"
+#include "cpu-common.h"
+
+//#define DEBUG_POSTCOPY
+#ifdef DEBUG_POSTCOPY
+#define DPRINTF(fmt, ...)                                               \
+    do {                                                                \
+        printf("%s:%d: " fmt, __func__, __LINE__, ## __VA_ARGS__);      \
+    } while (0)
+#else
+#define DPRINTF(fmt, ...)       do { } while (0)
+#endif
+
+static void fd_close(int *fd)
+{
+    if (*fd >= 0) {
+        close(*fd);
+        *fd = -1;
+    }
+}
+
+static void set_fd(int fd, fd_set *fds, int *nfds)
+{
+    FD_SET(fd, fds);
+    if (fd > *nfds) {
+        *nfds = fd;
+    }
+}
+
+/***************************************************************************
+ * umem daemon on destination <-> qemu on source protocol
+ */
+
+#define QEMU_UMEM_REQ_INIT      0x00
+#define QEMU_UMEM_REQ_EOC       0x01
+#define QEMU_UMEM_REQ_PAGE      0x02
+#define QEMU_UMEM_REQ_PAGE_CONT 0x03
+
+struct qemu_umem_req {
+    int8_t cmd;
+    uint8_t len;
+    char *idstr;        /* REQ_PAGE */
+    uint32_t nr;        /* REQ_PAGE, REQ_PAGE_CONT */
+
+    /* in target page size as qemu migration protocol */
+    uint64_t *pgoffs;   /* REQ_PAGE, REQ_PAGE_CONT */
+};
+
+static void postcopy_incoming_send_req_idstr(QEMUFile *f, const char* idstr)
+{
+    qemu_put_byte(f, strlen(idstr));
+    qemu_put_buffer(f, (uint8_t *)idstr, strlen(idstr));
+}
+
+static void postcopy_incoming_send_req_pgoffs(QEMUFile *f, uint32_t nr,
+                                              const uint64_t *pgoffs)
+{
+    uint32_t i;
+
+    qemu_put_be32(f, nr);
+    for (i = 0; i < nr; i++) {
+        qemu_put_be64(f, pgoffs[i]);
+    }
+}
+
+static void postcopy_incoming_send_req_one(QEMUFile *f,
+                                           const struct qemu_umem_req *req)
+{
+    DPRINTF("cmd %d\n", req->cmd);
+    qemu_put_byte(f, req->cmd);
+    switch (req->cmd) {
+    case QEMU_UMEM_REQ_INIT:
+    case QEMU_UMEM_REQ_EOC:
+        /* nothing */
+        break;
+    case QEMU_UMEM_REQ_PAGE:
+        postcopy_incoming_send_req_idstr(f, req->idstr);
+        postcopy_incoming_send_req_pgoffs(f, req->nr, req->pgoffs);
+        break;
+    case QEMU_UMEM_REQ_PAGE_CONT:
+        postcopy_incoming_send_req_pgoffs(f, req->nr, req->pgoffs);
+        break;
+    default:
+        abort();
+        break;
+    }
+}
+
+/* QEMUFile can buffer up to IO_BUF_SIZE = 32 * 1024.
+ * So one message size must be <= IO_BUF_SIZE
+ * cmd: 1
+ * id len: 1
+ * id: 256
+ * nr: 2
+ */
+#define MAX_PAGE_NR     ((32 * 1024 - 1 - 1 - 256 - 2) / sizeof(uint64_t))
+static void postcopy_incoming_send_req(QEMUFile *f,
+                                       const struct qemu_umem_req *req)
+{
+    uint32_t nr = req->nr;
+    struct qemu_umem_req tmp = *req;
+
+    switch (req->cmd) {
+    case QEMU_UMEM_REQ_INIT:
+    case QEMU_UMEM_REQ_EOC:
+        postcopy_incoming_send_req_one(f, &tmp);
+        break;
+    case QEMU_UMEM_REQ_PAGE:
+        tmp.nr = MIN(nr, MAX_PAGE_NR);
+        postcopy_incoming_send_req_one(f, &tmp);
+
+        nr -= tmp.nr;
+        tmp.pgoffs += tmp.nr;
+        tmp.cmd = QEMU_UMEM_REQ_PAGE_CONT;
+        /* fall through */
+    case QEMU_UMEM_REQ_PAGE_CONT:
+        while (nr > 0) {
+            tmp.nr = MIN(nr, MAX_PAGE_NR);
+            postcopy_incoming_send_req_one(f, &tmp);
+
+            nr -= tmp.nr;
+            tmp.pgoffs += tmp.nr;
+        }
+        break;
+    default:
+        abort();
+        break;
+    }
+}
+
+/***************************************************************************
+ * QEMU_VM_POSTCOPY section subtype
+ */
+#define QEMU_VM_POSTCOPY_INIT           0
+#define QEMU_VM_POSTCOPY_SECTION_FULL   1
+
+/***************************************************************************
+ * incoming part
+ */
+
+bool incoming_postcopy = false;
+
+
+#define PIS_STATE_QUIT_RECEIVED         0x01
+#define PIS_STATE_QUIT_QUEUED           0x02
+#define PIS_STATE_QUIT_SENT             0x04
+
+#define PIS_STATE_QUIT_MASK             (PIS_STATE_QUIT_RECEIVED | \
+                                         PIS_STATE_QUIT_QUEUED | \
+                                         PIS_STATE_QUIT_SENT)
+
+struct PostcopyIncomingState {
+    /* dest qemu state */
+    uint32_t    state;
+
+    int host_page_size;
+    int host_page_shift;
+
+    /* qemu side */
+    int to_umemd_fd;
+    QEMUFile *to_umemd;
+
+    int from_umemd_fd;
+    QEMUFile *from_umemd;
+    int version_id;     /* save/load format version id */
+};
+typedef struct PostcopyIncomingState PostcopyIncomingState;
+
+
+#define UMEM_STATE_EOS_RECEIVED         0x01    /* umem daemon <-> src qemu */
+#define UMEM_STATE_EOC_SEND_REQ         0x02    /* umem daemon <-> src qemu */
+#define UMEM_STATE_EOC_SENDING          0x04    /* umem daemon <-> src qemu */
+#define UMEM_STATE_EOC_SENT             0x08    /* umem daemon <-> src qemu */
+
+#define UMEM_STATE_QUIT_RECEIVED        0x10    /* umem daemon <-> dst qemu */
+#define UMEM_STATE_QUIT_HANDLED         0x20    /* umem daemon <-> dst qemu */
+#define UMEM_STATE_QUIT_QUEUED          0x40    /* umem daemon <-> dst qemu */
+#define UMEM_STATE_QUIT_SENDING         0x80    /* umem daemon <-> dst qemu */
+#define UMEM_STATE_QUIT_SENT            0x100   /* umem daemon <-> dst qemu */
+
+#define UMEM_STATE_ERROR_REQ            0x1000  /* umem daemon error */
+#define UMEM_STATE_ERROR_SENDING        0x2000  /* umem daemon error */
+#define UMEM_STATE_ERROR_SENT           0x3000  /* umem daemon error */
+
+#define UMEM_STATE_QUIT_MASK            (UMEM_STATE_QUIT_QUEUED |   \
+                                         UMEM_STATE_QUIT_SENDING |  \
+                                         UMEM_STATE_QUIT_SENT |     \
+                                         UMEM_STATE_QUIT_RECEIVED | \
+                                         UMEM_STATE_QUIT_HANDLED)
+#define UMEM_STATE_END_MASK             (UMEM_STATE_EOS_RECEIVED | \
+                                         UMEM_STATE_EOC_SEND_REQ | \
+                                         UMEM_STATE_EOC_SENDING |  \
+                                         UMEM_STATE_EOC_SENT |     \
+                                         UMEM_STATE_QUIT_MASK)
+
+struct UMemBlock {
+    UMem* umem;
+    char idstr[256];
+    ram_addr_t offset;
+    ram_addr_t length;
+    QLIST_ENTRY(UMemBlock) next;
+};
+typedef struct UMemBlock UMemBlock;
+
+struct PostcopyIncomingUMemDaemon {
+    /* umem daemon side */
+    QemuMutex mutex;
+    uint32_t state;     /* shared state. protected by mutex */
+
+    /* read only */
+    int host_page_size;
+    int host_page_shift;
+    int nr_host_pages_per_target_page;
+    int host_to_target_page_shift;
+    int nr_target_pages_per_host_page;
+    int target_to_host_page_shift;
+    int version_id;     /* save/load format version id */
+
+    QemuThread thread;
+    QLIST_HEAD(, UMemBlock) blocks;
+
+    /* thread to communicate with qemu main loop via pipe */
+    QemuThread pipe_thread;
+    int to_qemu_fd;
+    QEMUFile *to_qemu;
+    int from_qemu_fd;
+    QEMUFile *from_qemu;
+
+    /* = KVM_MAX_VCPUS * (ASYNC_PF_PER_VCPUS + 1) */
+#define MAX_REQUESTS    (512 * (64 + 1))
+
+    /* thread to read from outgoing qemu */
+    QemuThread mig_read_thread;
+    int mig_read_fd;
+    QEMUFile *mig_read;                 /* qemu on source -> umem daemon */
+    UMemBlock *last_block_read;         /* qemu on source -> umem daemon */
+    /* bitmap indexed by target page offset */
+    unsigned long *phys_received;
+    UMemPages *page_cached;
+
+    /* thread to write to outgoing qemu */
+    QemuThread mig_write_thread;
+    int mig_write_fd;
+    QEMUFile *mig_write;                /* umem daemon -> qemu on source */
+    UMemBlock *last_block_write;        /* umem daemon -> qemu on source */
+    /* bitmap indexed by target page offset */
+    unsigned long *phys_requested;
+    UMemPages *page_request;
+    uint64_t *target_pgoffs;
+};
+typedef struct PostcopyIncomingUMemDaemon PostcopyIncomingUMemDaemon;
+
+static PostcopyIncomingState state = {
+    .state = 0,
+    .to_umemd_fd = -1,
+    .to_umemd = NULL,
+    .from_umemd_fd = -1,
+    .from_umemd = NULL,
+};
+
+static PostcopyIncomingUMemDaemon umemd = {
+    .state = 0,
+    .to_qemu_fd = -1,
+    .to_qemu = NULL,
+    .from_qemu_fd = -1,
+    .from_qemu = NULL,
+    .blocks = QLIST_HEAD_INITIALIZER(&umemd.blocks),
+    .mig_read_fd = -1,
+    .mig_read = NULL,
+    .mig_write_fd = -1,
+    .mig_write = NULL,
+};
+
+static void *postcopy_incoming_umemd(void*);
+static void postcopy_incoming_qemu_handle_req(void *opaque);
+
+/* protected by qemu_mutex_lock_ramlist() */
+void postcopy_incoming_ram_free(RAMBlock *ram_block)
+{
+    UMemBlock *block;
+    QLIST_FOREACH(block, &umemd.blocks, next) {
+        if (!strncmp(ram_block->idstr, block->idstr, strlen(block->idstr))) {
+            break;
+        }
+    }
+    if (block != NULL) {
+        umem_unmap(block->umem);
+    } else {
+        munmap(ram_block->host, ram_block->length);
+    }
+}
+
+static int postcopy_incoming_ram_load_get64(QEMUFile *f,
+                                            ram_addr_t *addr, int *flags)
+{
+    *addr = qemu_get_be64(f);
+    *flags = *addr & ~TARGET_PAGE_MASK;
+    *addr &= TARGET_PAGE_MASK;
+    return qemu_file_get_error(f);
+}
+
+int postcopy_incoming_ram_load(QEMUFile *f, void *opaque, int version_id)
+{
+    ram_addr_t addr;
+    int flags;
+    int error;
+
+    DPRINTF("incoming ram load\n");
+    /*
+     * RAM_SAVE_FLAGS_EOS or
+     * RAM_SAVE_FLAGS_MEM_SIZE + mem size + RAM_SAVE_FLAGS_EOS
+     * see postcopy_outgoing_ram_save_live()
+     */
+
+    if (version_id != RAM_SAVE_VERSION_ID) {
+        DPRINTF("RAM_SAVE_VERSION_ID %d != %d\n",
+                version_id, RAM_SAVE_VERSION_ID);
+        return -EINVAL;
+    }
+    error = postcopy_incoming_ram_load_get64(f, &addr, &flags);
+    DPRINTF("addr 0x%lx flags 0x%x\n", addr, flags);
+    if (error) {
+        DPRINTF("error %d\n", error);
+        return error;
+    }
+    if (flags == RAM_SAVE_FLAG_EOS && addr == 0) {
+        DPRINTF("EOS\n");
+        return 0;
+    }
+
+    if (flags != RAM_SAVE_FLAG_MEM_SIZE) {
+        DPRINTF("-EINVAL flags 0x%x\n", flags);
+        return -EINVAL;
+    }
+    error = ram_load_mem_size(f, addr);
+    if (error) {
+        DPRINTF("addr 0x%lx error %d\n", addr, error);
+        return error;
+    }
+
+    error = postcopy_incoming_ram_load_get64(f, &addr, &flags);
+    if (error) {
+        DPRINTF("addr 0x%lx flags 0x%x error %d\n", addr, flags, error);
+        return error;
+    }
+    if (flags == RAM_SAVE_FLAG_EOS && addr == 0) {
+        DPRINTF("done\n");
+        return 0;
+    }
+    DPRINTF("-EINVAL\n");
+    return -EINVAL;
+}
+
+static void postcopy_incoming_umem_block_free(void)
+{
+    UMemBlock *block;
+    UMemBlock *tmp;
+
+    /* to protect againt postcopy_incoming_ram_free() */
+    qemu_mutex_lock_ramlist();
+    QLIST_FOREACH_SAFE(block, &umemd.blocks, next, tmp) {
+        UMem *umem = block->umem;
+        umem_unmap_shmem(umem);
+        umem_destroy(umem);
+        QLIST_REMOVE(block, next);
+        g_free(block);
+    }
+    qemu_mutex_unlock_ramlist();
+}
+
+static int postcopy_incoming_prepare(void)
+{
+    int error = 0;
+    RAMBlock *block;
+    int nbits;
+
+    state.state = 0;
+    state.host_page_size = getpagesize();
+    state.host_page_shift = ffs(state.host_page_size) - 1;
+    state.version_id = RAM_SAVE_VERSION_ID; /* = save version of
+                                               ram_save_live() */
+
+    qemu_mutex_init(&umemd.mutex);
+    umemd.host_page_size = state.host_page_size;
+    umemd.host_page_shift = state.host_page_shift;
+
+    umemd.nr_host_pages_per_target_page =
+        TARGET_PAGE_SIZE / umemd.host_page_size;
+    umemd.nr_target_pages_per_host_page =
+        umemd.host_page_size / TARGET_PAGE_SIZE;
+    umemd.target_to_host_page_shift =
+        ffs(umemd.nr_host_pages_per_target_page) - 1;
+    umemd.host_to_target_page_shift =
+        ffs(umemd.nr_target_pages_per_host_page) - 1;
+
+    QLIST_INIT(&umemd.blocks);
+    qemu_mutex_lock_ramlist();
+    QLIST_FOREACH(block, &ram_list.blocks, next) {
+        UMem *umem;
+        UMemBlock *umem_block;
+
+        if (block->flags & RAM_PREALLOC_MASK) {
+            continue;
+        }
+        error = umem_new(block->host, block->length, &umem);
+        if (error < 0) {
+            qemu_mutex_unlock_ramlist();
+            goto out;
+        }
+        umem_block = g_malloc0(sizeof(*umem_block));
+        umem_block->umem = umem;
+        umem_block->offset = block->offset;
+        umem_block->length = block->length;
+        pstrcpy(umem_block->idstr, sizeof(umem_block->idstr), block->idstr);
+
+        error = umem_map_shmem(umem_block->umem);
+        if (error) {
+            qemu_mutex_unlock_ramlist();
+            goto out;
+        }
+        umem_close_shmem(umem_block->umem);
+
+        block->flags |= RAM_POSTCOPY_UMEM_MASK;
+        QLIST_INSERT_HEAD(&umemd.blocks, umem_block, next);
+    }
+    qemu_mutex_unlock_ramlist();
+
+    umemd.page_request = g_malloc(umem_pages_size(MAX_REQUESTS));
+    umemd.page_cached = g_malloc(
+        umem_pages_size(MAX_REQUESTS *
+                        (TARGET_PAGE_SIZE >= umemd.host_page_size ?
+                         1: umemd.nr_host_pages_per_target_page)));
+    umemd.target_pgoffs =
+        g_new(uint64_t, MAX_REQUESTS *
+              MAX(umemd.nr_host_pages_per_target_page,
+                  umemd.nr_target_pages_per_host_page));
+
+    nbits = last_ram_offset() >> TARGET_PAGE_BITS;
+    umemd.phys_requested = bitmap_new(nbits);
+    umemd.phys_received = bitmap_new(nbits);
+    umemd.last_block_read = NULL;
+    umemd.last_block_write = NULL;
+    return 0;
+
+out:
+    postcopy_incoming_umem_block_free();
+    return error;
+}
+
+static int postcopy_incoming_loadvm_init(QEMUFile *f, uint32_t size)
+{
+    uint64_t options;
+    int flags;
+    int error;
+
+    if (size != sizeof(options)) {
+        fprintf(stderr, "unknown size %d\n", size);
+        return -EINVAL;
+    }
+    options = qemu_get_be64(f);
+    if (options) {
+        fprintf(stderr, "unknown options 0x%"PRIx64, options);
+        return -ENOSYS;
+    }
+    flags = fcntl(qemu_file_fd(f), F_GETFL);
+    if ((flags & O_ACCMODE) != O_RDWR) {
+        /* postcopy requires read/write file descriptor */
+        fprintf(stderr, "non-writable connection. "
+                "postcopy requires read/write connection \n");
+        return -EINVAL;
+    }
+    if (mem_path) {
+        fprintf(stderr, "mem_path is specified to %s. "
+                "postcopy doesn't work with it\n", mem_path);
+        return -ENOSYS;
+    }
+
+    DPRINTF("detected POSTCOPY\n");
+    error = postcopy_incoming_prepare();
+    if (error) {
+        return error;
+    }
+    savevm_ram_handlers.load_state = postcopy_incoming_ram_load;
+    incoming_postcopy = true;
+    return 0;
+}
+
+static int postcopy_incoming_create_umemd_thread(QEMUFile *mig_read)
+{
+    int error;
+    int fds[2];
+    int mig_read_fd;
+    int mig_write_fd;
+    assert((fcntl(qemu_file_fd(mig_read), F_GETFL) & O_ACCMODE) == O_RDWR);
+
+    if (qemu_pipe(fds) == -1) {
+        perror("qemu_pipe");
+        abort();
+    }
+    state.from_umemd_fd = fds[0];
+    umemd.to_qemu_fd = fds[1];
+
+    if (qemu_pipe(fds) == -1) {
+        perror("qemu_pipe");
+        abort();
+    }
+    umemd.from_qemu_fd = fds[0];
+    state.to_umemd_fd = fds[1];
+
+    mig_read_fd = qemu_file_fd(mig_read);
+    umemd.state = 0;
+    umemd.version_id = state.version_id;
+    umemd.mig_read_fd = mig_read_fd;
+    umemd.mig_read = mig_read;
+
+    mig_write_fd = dup(mig_read_fd);
+    if (mig_write_fd < 0) {
+        perror("could not dup for writable socket \n");
+        abort();
+    }
+    umemd.mig_write_fd = mig_write_fd;
+    umemd.mig_write = qemu_fopen_fd(mig_write_fd, "w");
+
+    qemu_thread_create(&umemd.thread, &postcopy_incoming_umemd,
+                       NULL, QEMU_THREAD_DETACHED);
+
+    error = umem_qemu_wait_for_daemon(state.from_umemd_fd);
+    if (error) {
+        return error;
+    }
+    /* now socket is disowned. So tell umem thread that it's safe to use it */
+    error = umem_qemu_ready(state.to_umemd_fd);
+    if (error) {
+        return error;
+    }
+
+    state.from_umemd = qemu_fopen_fd(state.from_umemd_fd, "r");
+    state.to_umemd = qemu_fopen_fd(state.to_umemd_fd, "w");
+    qemu_set_fd_handler(state.from_umemd_fd,
+                        postcopy_incoming_qemu_handle_req, NULL, NULL);
+    return 0;
+}
+
+static int postcopy_incoming_loadvm_section_full(QEMUFile *f, uint32_t size,
+                                                 QEMUFile **buf_file)
+{
+    int error;
+    uint8_t *buf;
+    int read_size;
+
+    /* as size comes from network, check if it's not unreasonably big
+     * At the moment, it is guessed as 16MB.
+     */
+    DPRINTF("size 0x%"PRIx32"\n", size);
+#define SAVE_VM_FULL_SIZE_MAX   (16 * 1024 * 1024)
+    if (size > SAVE_VM_FULL_SIZE_MAX) {
+        fprintf(stderr,
+                "QEMU_VM_POSTCOPY QEMU_VM_POSTCOPY_SECTION_FULL section seems "
+                "to have unreasonably big size 0x%x"PRIx32". aborting.\n"
+                "If its size is really correct, "
+                "please increase it in the code\n",
+                size);
+        return -EINVAL;
+    }
+
+    buf = g_malloc(size);
+    read_size = qemu_get_buffer(f, buf, size);
+    if (size != read_size) {
+        fprintf(stderr, "qemu: warning: error while postcopy size %d %d\n",
+                size, read_size);
+        g_free(buf);
+        return -EINVAL;
+    }
+    error = postcopy_incoming_create_umemd_thread(f);
+    if (error) {
+        return error;
+    }
+
+    /* VMStateDescription:pre/post_load and
+     * cpu_sychronize_all_post_init() may fault on guest RAM.
+     * (MSR_KVM_WALL_CLOCK, MSR_KVM_SYSTEM_TIME)
+     * postcopy daemon needs to be forked before the fault.
+     */
+    *buf_file = qemu_fopen_buf_read(buf, size);
+    return 0;
+}
+
+int postcopy_incoming_loadvm_state(QEMUFile *f, QEMUFile **buf_file)
+{
+    int ret = 0;
+    uint8_t subtype;
+    uint32_t size;
+
+    subtype = qemu_get_ubyte(f);
+    size = qemu_get_be32(f);
+    switch (subtype) {
+    case QEMU_VM_POSTCOPY_INIT:
+        ret = postcopy_incoming_loadvm_init(f, size);
+        break;
+    case QEMU_VM_POSTCOPY_SECTION_FULL:
+        ret = postcopy_incoming_loadvm_section_full(f, size, buf_file);
+        break;
+    default:
+        ret = -EINVAL;
+        break;
+    }
+    return ret;
+}
+
+static void postcopy_incoming_qemu_recv_quit(void)
+{
+    if (state.state & PIS_STATE_QUIT_RECEIVED) {
+        return;
+    }
+
+    DPRINTF("|= PIS_STATE_QUIT_RECEIVED\n");
+    state.state |= PIS_STATE_QUIT_RECEIVED;
+    qemu_set_fd_handler(state.from_umemd_fd, NULL, NULL, NULL);
+    qemu_fclose(state.from_umemd);
+    state.from_umemd = NULL;
+    fd_close(&state.from_umemd_fd);
+}
+
+static void postcopy_incoming_qemu_check_quite_queued(void)
+{
+    if (state.state & PIS_STATE_QUIT_QUEUED &&
+        !(state.state & PIS_STATE_QUIT_SENT)) {
+        DPRINTF("|= PIS_STATE_QUIT_SENT\n");
+        state.state |= PIS_STATE_QUIT_SENT;
+
+        qemu_fclose(state.to_umemd);
+        state.to_umemd = NULL;
+        fd_close(&state.to_umemd_fd);
+    }
+}
+
+static void postcopy_incoming_qemu_queue_quit(void)
+{
+    if (state.state & PIS_STATE_QUIT_QUEUED) {
+        return;
+    }
+
+    DPRINTF("|= PIS_STATE_QUIT_QUEUED\n");
+    umem_qemu_quit(state.to_umemd);
+    state.state |= PIS_STATE_QUIT_QUEUED;
+}
+
+static void postcopy_incoming_qemu_handle_req(void *opaque)
+{
+    uint8_t cmd;
+
+    cmd = qemu_get_ubyte(state.from_umemd);
+    DPRINTF("cmd %c\n", cmd);
+
+    switch (cmd) {
+    case UMEM_DAEMON_QUIT:
+        postcopy_incoming_qemu_recv_quit();
+        postcopy_incoming_qemu_queue_quit();
+        postcopy_incoming_qemu_cleanup();
+        break;
+    case UMEM_DAEMON_ERROR:
+        /* umem daemon hit troubles, so it warned us to stop vm execution */
+        vm_stop(RUN_STATE_IO_ERROR); /* or RUN_STATE_INTERNAL_ERROR */
+        break;
+    default:
+        DPRINTF("unknown command %d\n", cmd);
+        abort();
+        break;
+    }
+
+    postcopy_incoming_qemu_check_quite_queued();
+}
+
+void postcopy_incoming_qemu_cleanup(void)
+{
+    /* when qemu will quit before completing postcopy, tell umem daemon
+       to tear down umem device and exit. */
+    if (state.to_umemd_fd >= 0) {
+        postcopy_incoming_qemu_queue_quit();
+        postcopy_incoming_qemu_check_quite_queued();
+    }
+}
+
+/**************************************************************************
+ * incoming umem daemon
+ */
+
+static void postcopy_incoming_umem_error_req(void)
+{
+    qemu_mutex_lock(&umemd.mutex);
+    umemd.state |= UMEM_STATE_ERROR_REQ;
+    qemu_mutex_unlock(&umemd.mutex);
+}
+
+static void postcopy_incoming_umem_recv_quit(void)
+{
+    qemu_mutex_lock(&umemd.mutex);
+    if (umemd.state & UMEM_STATE_QUIT_RECEIVED) {
+        qemu_mutex_unlock(&umemd.mutex);
+        return;
+    }
+    DPRINTF("|= UMEM_STATE_QUIT_RECEIVED\n");
+    umemd.state |= UMEM_STATE_QUIT_RECEIVED;
+    qemu_mutex_unlock(&umemd.mutex);
+
+    qemu_fclose(umemd.from_qemu);
+    umemd.from_qemu = NULL;
+    fd_close(&umemd.from_qemu_fd);
+
+    qemu_mutex_lock(&umemd.mutex);
+    DPRINTF("|= UMEM_STATE_QUIT_HANDLED\n");
+    umemd.state |= UMEM_STATE_QUIT_HANDLED;
+    qemu_mutex_unlock(&umemd.mutex);
+}
+
+/* call with umemd.mutex held */
+static void postcopy_incoming_umem_queue_quit_locked(void)
+{
+    if (umemd.state & UMEM_STATE_QUIT_QUEUED) {
+        return;
+    }
+    DPRINTF("|= UMEM_STATE_QUIT_QUEUED\n");
+    umemd.state |= UMEM_STATE_QUIT_QUEUED;
+}
+
+static void postcopy_incoming_umem_check_eoc_req(void)
+{
+    struct qemu_umem_req req;
+
+    qemu_mutex_lock(&umemd.mutex);
+    if (!(umemd.state & UMEM_STATE_EOC_SEND_REQ) ||
+        umemd.state & (UMEM_STATE_EOC_SENDING | UMEM_STATE_EOC_SENT)) {
+        qemu_mutex_unlock(&umemd.mutex);
+        return;
+    }
+
+    DPRINTF("|= UMEM_STATE_EOC_SENDING\n");
+    umemd.state |= UMEM_STATE_EOC_SENDING;
+    qemu_mutex_unlock(&umemd.mutex);
+
+    req.cmd = QEMU_UMEM_REQ_EOC;
+    postcopy_incoming_send_req(umemd.mig_write, &req);
+    qemu_fclose(umemd.mig_write);
+    umemd.mig_write = NULL;
+    fd_close(&umemd.mig_write_fd);
+
+    qemu_mutex_lock(&umemd.mutex);
+    DPRINTF("|= UMEM_STATE_EOC_SENT\n");
+    umemd.state |= UMEM_STATE_EOC_SENT;
+    qemu_mutex_unlock(&umemd.mutex);
+}
+
+static void postcopy_incoming_umem_req_eoc(void)
+{
+    qemu_mutex_lock(&umemd.mutex);
+    DPRINTF("|= UMEM_STATE_EOC_SEND_REQ\n");
+    umemd.state |= UMEM_STATE_EOC_SEND_REQ;
+    qemu_mutex_unlock(&umemd.mutex);
+}
+
+static int postcopy_incoming_umem_send_page_req(UMemBlock *block)
+{
+    int error;
+    struct qemu_umem_req req;
+    unsigned long bit;
+    uint64_t target_pgoff;
+    int i;
+
+    umemd.page_request->nr = MAX_REQUESTS;
+    error = umem_get_page_request(block->umem, umemd.page_request);
+    if (error) {
+        return error;
+    }
+    DPRINTF("id %s nr %"PRId64" offs 0x%"PRIx64" 0x%"PRIx64"\n",
+            block->idstr, (uint64_t)umemd.page_request->nr,
+            (uint64_t)umemd.page_request->pgoffs[0],
+            (uint64_t)umemd.page_request->pgoffs[1]);
+
+    if (umemd.last_block_write != block) {
+        req.cmd = QEMU_UMEM_REQ_PAGE;
+        req.idstr = block->idstr;
+    } else {
+        req.cmd = QEMU_UMEM_REQ_PAGE_CONT;
+    }
+
+    req.nr = 0;
+    req.pgoffs = umemd.target_pgoffs;
+    if (TARGET_PAGE_SIZE >= umemd.host_page_size) {
+        for (i = 0; i < umemd.page_request->nr; i++) {
+            target_pgoff = umemd.page_request->pgoffs[i] >>
+                umemd.host_to_target_page_shift;
+            bit = (block->offset >> TARGET_PAGE_BITS) + target_pgoff;
+
+            if (!test_and_set_bit(bit, umemd.phys_requested)) {
+                req.pgoffs[req.nr] = target_pgoff;
+                req.nr++;
+            }
+        }
+    } else {
+        for (i = 0; i < umemd.page_request->nr; i++) {
+            int j;
+            target_pgoff = umemd.page_request->pgoffs[i] <<
+                umemd.host_to_target_page_shift;
+            bit = (block->offset >> TARGET_PAGE_BITS) + target_pgoff;
+
+            for (j = 0; j < umemd.nr_target_pages_per_host_page; j++) {
+                if (!test_and_set_bit(bit + j, umemd.phys_requested)) {
+                    req.pgoffs[req.nr] = target_pgoff + j;
+                    req.nr++;
+                }
+            }
+        }
+    }
+
+    DPRINTF("id %s nr %d offs 0x%"PRIx64" 0x%"PRIx64"\n",
+            block->idstr, req.nr, req.pgoffs[0], req.pgoffs[1]);
+    if (req.nr > 0 && umemd.mig_write != NULL) {
+        postcopy_incoming_send_req(umemd.mig_write, &req);
+        umemd.last_block_write = block;
+    }
+    return 0;
+}
+
+static void postcopy_incoming_umem_page_fault(UMemBlock *block,
+                                              const UMemPages *pages)
+{
+    uint64_t i;
+
+    for (i = 0; i < pages->nr; i++) {
+        size_t offset = pages->pgoffs[i] << umemd.host_page_shift;
+        RAMBlock *ram_block;
+
+        /* make pages present by forcibly triggering page fault. */
+        qemu_mutex_lock_ramlist();
+        ram_block = ram_find_block(block->idstr, strlen(block->idstr));
+        if (ram_block && offset < ram_block->length) {
+            volatile uint8_t *ram =
+                memory_region_get_ram_ptr(ram_block->mr) + offset;
+            uint8_t dummy_read = ram[0];
+            (void)dummy_read;   /* suppress unused variable warning */
+        }
+        qemu_mutex_unlock_ramlist();
+
+        umem_remove_shmem(block->umem, offset, umemd.host_page_size);
+    }
+}
+
+static bool postcopy_incoming_umem_check_umem_done(void)
+{
+    bool all_done = true;
+    UMemBlock *block;
+
+    QLIST_FOREACH(block, &umemd.blocks, next) {
+        if (umem_shmem_finished(block->umem)) {
+            umem_unmap_shmem(block->umem);
+        } else {
+            all_done = false;
+            break;
+        }
+    }
+
+    return all_done;
+}
+
+static void postcopy_incoming_umem_done(void)
+{
+    postcopy_incoming_umem_req_eoc();
+    qemu_mutex_lock(&umemd.mutex);
+    postcopy_incoming_umem_queue_quit_locked();
+    qemu_mutex_unlock(&umemd.mutex);
+}
+
+static UMemBlock *postcopy_incoming_umem_block_from_stream(
+    QEMUFile *f, int flags)
+{
+    uint8_t len;
+    char id[256];
+    UMemBlock *block;
+
+    if (flags & RAM_SAVE_FLAG_CONTINUE) {
+        return umemd.last_block_read;
+    }
+
+    len = qemu_get_byte(f);
+    qemu_get_buffer(f, (uint8_t*)id, len);
+    id[len] = 0;
+
+    DPRINTF("idstr: %s len %d\n", id, len);
+    QLIST_FOREACH(block, &umemd.blocks, next) {
+        if (!strncmp(id, block->idstr, len)) {
+            umemd.last_block_read = block;
+            return block;
+        }
+    }
+    DPRINTF("error\n");
+    return NULL;
+}
+
+static int postcopy_incoming_umem_ram_load(void)
+{
+    ram_addr_t offset;
+    int flags;
+    UMemBlock *block;
+
+    void *shmem;
+    int error;
+    int i;
+    int bit;
+
+    if (umemd.version_id != RAM_SAVE_VERSION_ID) {
+        return -EINVAL;
+    }
+
+    error = postcopy_incoming_ram_load_get64(umemd.mig_read, &offset, &flags);
+    /* DPRINTF("offset 0x%lx flags 0x%x\n", offset, flags); */
+    if (error) {
+        DPRINTF("error %d\n", error);
+        return error;
+    }
+    assert(!(flags & RAM_SAVE_FLAG_MEM_SIZE));
+
+    if (flags & RAM_SAVE_FLAG_EOS) {
+        DPRINTF("RAM_SAVE_FLAG_EOS\n");
+        postcopy_incoming_umem_req_eoc();
+
+        qemu_fclose(umemd.mig_read);
+        umemd.mig_read = NULL;
+        fd_close(&umemd.mig_read_fd);
+
+        qemu_mutex_lock(&umemd.mutex);
+        umemd.state |= UMEM_STATE_EOS_RECEIVED;
+        postcopy_incoming_umem_queue_quit_locked();
+        qemu_mutex_unlock(&umemd.mutex);
+        DPRINTF("|= UMEM_STATE_EOS_RECEIVED\n");
+        return 0;
+    }
+
+    block = postcopy_incoming_umem_block_from_stream(umemd.mig_read, flags);
+    if (block == NULL) {
+        return -EINVAL;
+    }
+    assert(!umem_shmem_finished(block->umem));
+    shmem = block->umem->shmem + offset;
+    error = ram_load_page(umemd.mig_read, shmem, flags);
+    if (error) {
+        DPRINTF("error %d\n", error);
+        return error;
+    }
+    error = qemu_file_get_error(umemd.mig_read);
+    if (error) {
+        DPRINTF("error %d\n", error);
+        return error;
+    }
+
+    umemd.page_cached->nr = 0;
+    bit = (block->offset + offset) >> TARGET_PAGE_BITS;
+    if (!test_and_set_bit(bit, umemd.phys_received)) {
+        if (TARGET_PAGE_SIZE >= umemd.host_page_size) {
+            uint64_t pgoff = offset >> umemd.host_page_shift;
+            for (i = 0; i < umemd.nr_host_pages_per_target_page; i++) {
+                umemd.page_cached->pgoffs[umemd.page_cached->nr] = pgoff + i;
+                umemd.page_cached->nr++;
+            }
+        } else {
+            bool mark_cache = true;
+            for (i = 0; i < umemd.nr_target_pages_per_host_page; i++) {
+                if (!test_bit(bit + i, umemd.phys_received)) {
+                    mark_cache = false;
+                    break;
+                }
+            }
+            if (mark_cache) {
+                umemd.page_cached->pgoffs[0] =
+                    offset >> umemd.host_page_shift;
+                umemd.page_cached->nr = 1;
+            }
+        }
+    }
+
+    if (umemd.page_cached->nr > 0) {
+        error = umem_mark_page_cached(block->umem, umemd.page_cached);
+        if (error) {
+            return error;
+        }
+        postcopy_incoming_umem_page_fault(block, umemd.page_cached);
+        if (postcopy_incoming_umem_check_umem_done()) {
+            postcopy_incoming_umem_done();
+        }
+    }
+
+    return 0;
+}
+
+static int postcopy_incoming_umemd_mig_read_loop(void)
+{
+    int error;
+    /* read thread doesn't need to check periodically UMEM_STATE_EOC_SEND_REQ
+     * because RAM_SAVE_FLAG_EOS is always sent by the outgoing part. */
+    if (umemd.mig_read_fd < 0) {
+        return -EINVAL;
+    }
+    error = postcopy_incoming_umem_ram_load();
+    if (error) {
+        postcopy_incoming_umem_error_req();
+    }
+    return error;
+}
+
+static int postcopy_incoming_umemd_mig_write_loop(void)
+{
+    int ret;
+    UMemBlock *block;
+    /* to check UMEM_STATE_EOC_SEND_REQ periodically */
+    struct timeval timeout = {.tv_sec = 1, .tv_usec = 0};
+    int nfds = -1;
+    fd_set readfds;
+    FD_ZERO(&readfds);
+
+    QLIST_FOREACH(block, &umemd.blocks, next) {
+        set_fd(block->umem->fd, &readfds, &nfds);
+    }
+    ret = select(nfds + 1, &readfds, NULL, NULL, &timeout);
+    if (ret == -1) {
+        if (errno == EINTR) {
+            return 0;
+        }
+        return ret;
+    }
+    QLIST_FOREACH(block, &umemd.blocks, next) {
+        if (FD_ISSET(block->umem->fd, &readfds)) {
+            ret = postcopy_incoming_umem_send_page_req(block);
+            if (ret) {
+                postcopy_incoming_umem_error_req();
+                return ret;
+            }
+        }
+    }
+    if (umemd.mig_write != NULL) {
+        qemu_fflush(umemd.mig_write);
+    }
+    postcopy_incoming_umem_check_eoc_req();
+
+    return 0;
+}
+
+static int postcopy_incoming_umemd_pipe_init(void)
+{
+    int error;
+    error = umem_daemon_ready(umemd.to_qemu_fd);
+    if (error) {
+        goto out;
+    }
+    umemd.to_qemu = qemu_fopen_fd(umemd.to_qemu_fd, "w");
+
+    /* wait for qemu to disown migration_fd */
+    error = umem_daemon_wait_for_qemu(umemd.from_qemu_fd);
+    if (error) {
+        goto out;
+    }
+    umemd.from_qemu = qemu_fopen_fd(umemd.from_qemu_fd, "r");
+    return 0;
+
+out:
+    /* Here there is no way to tell error to main thread
+       in order to teardown. */
+    perror("initialization error");
+    abort();
+    return error;
+}
+
+static int postcopy_incoming_umemd_pipe_loop(void)
+{
+    int ret;
+    /* to check UMEM_STATE_QUIT_QUEUED periodically */
+    struct timeval timeout = {.tv_sec = 1, .tv_usec = 0};
+    fd_set readfds;
+    int nfds = -1;
+
+    FD_ZERO(&readfds);
+    if (umemd.from_qemu_fd >= 0) {
+        set_fd(umemd.from_qemu_fd, &readfds, &nfds);
+    }
+    ret = select(nfds + 1, &readfds, NULL, NULL, &timeout);
+    if (ret == -1) {
+        if (errno == EINTR) {
+            return 0;
+        }
+        return ret;
+    }
+    if (umemd.from_qemu_fd >= 0 && FD_ISSET(umemd.from_qemu_fd, &readfds)) {
+        uint8_t cmd;
+        cmd = qemu_get_ubyte(umemd.from_qemu);
+        DPRINTF("cmd %c\n", cmd);
+        switch (cmd) {
+        case UMEM_QEMU_QUIT:
+            postcopy_incoming_umem_recv_quit();
+            postcopy_incoming_umem_done();
+            break;
+        default:
+            abort();
+            break;
+        }
+        if (umemd.to_qemu != NULL) {
+            qemu_fflush(umemd.to_qemu);
+        }
+    }
+
+    if (umemd.to_qemu != NULL) {
+        qemu_mutex_lock(&umemd.mutex);
+        if (umemd.state & UMEM_STATE_ERROR_REQ &&
+            !(umemd.state & UMEM_STATE_ERROR_SENDING)) {
+            umemd.state |= UMEM_STATE_ERROR_SENDING;
+            qemu_mutex_unlock(&umemd.mutex);
+            umem_daemon_error(umemd.to_qemu);
+            qemu_mutex_lock(&umemd.mutex);
+            umemd.state |= UMEM_STATE_ERROR_SENT;
+        }
+        if (umemd.state & UMEM_STATE_QUIT_QUEUED &&
+            !(umemd.state & (UMEM_STATE_QUIT_SENDING |
+                             UMEM_STATE_QUIT_SENT))) {
+            DPRINTF("|= UMEM_STATE_QUIT_SENDING\n");
+            umemd.state |= UMEM_STATE_QUIT_SENDING;
+            qemu_mutex_unlock(&umemd.mutex);
+
+            umem_daemon_quit(umemd.to_qemu);
+            qemu_fclose(umemd.to_qemu);
+            umemd.to_qemu = NULL;
+            fd_close(&umemd.to_qemu_fd);
+
+            qemu_mutex_lock(&umemd.mutex);
+            DPRINTF("|= UMEM_STATE_QUIT_SENT\n");
+            umemd.state |= UMEM_STATE_QUIT_SENT;
+        }
+        qemu_mutex_unlock(&umemd.mutex);
+    }
+
+    return 0;
+}
+
+struct IncomingThread {
+    int (*init_func)(void);
+    int (*loop_func)(void);
+};
+typedef struct IncomingThread IncomingThread;
+
+static void *postcopy_incoming_umemd_thread(void* arg)
+{
+    IncomingThread *im  = arg;
+    int error;
+
+    DPRINTF("loop %d %p %p\n", getpid(), im->init_func, im->loop_func);
+    if (im->init_func) {
+        error = im->init_func();
+        if (error) {
+            postcopy_incoming_umem_error_req();
+            return NULL;
+        }
+    }
+    for (;;) {
+        qemu_mutex_lock(&umemd.mutex);
+        if ((umemd.state & UMEM_STATE_END_MASK) == UMEM_STATE_END_MASK) {
+            qemu_mutex_unlock(&umemd.mutex);
+            DPRINTF("loop out %p\n", im->loop_func);
+            break;
+        }
+        qemu_mutex_unlock(&umemd.mutex);
+
+        error = im->loop_func();
+        if (error) {
+            DPRINTF("func %p error = %d\n", im->loop_func, error);
+            break;
+        }
+    }
+    return NULL;
+}
+
+static void *postcopy_incoming_umemd(void* unused)
+{
+    DPRINTF("umemd\n");
+    qemu_thread_create(&umemd.mig_read_thread,
+                       &postcopy_incoming_umemd_thread,
+                       &(IncomingThread) {
+                           NULL, &postcopy_incoming_umemd_mig_read_loop,},
+                       QEMU_THREAD_JOINABLE);
+    qemu_thread_create(&umemd.mig_write_thread,
+                       &postcopy_incoming_umemd_thread,
+                       &(IncomingThread) {
+                           NULL, &postcopy_incoming_umemd_mig_write_loop,},
+                       QEMU_THREAD_JOINABLE);
+    qemu_thread_create(&umemd.pipe_thread, &postcopy_incoming_umemd_thread,
+                       &(IncomingThread) {
+                           &postcopy_incoming_umemd_pipe_init,
+                           &postcopy_incoming_umemd_pipe_loop,},
+                       QEMU_THREAD_JOINABLE);
+
+    qemu_thread_join(&umemd.mig_read_thread);
+    qemu_thread_join(&umemd.mig_write_thread);
+    qemu_thread_join(&umemd.pipe_thread);
+
+    g_free(umemd.page_request);
+    g_free(umemd.page_cached);
+    g_free(umemd.target_pgoffs);
+    g_free(umemd.phys_requested);
+    g_free(umemd.phys_received);
+
+    postcopy_incoming_umem_block_free();
+
+    DPRINTF("umemd done\n");
+    return NULL;
+}
diff --git a/migration-tcp.c b/migration-tcp.c
index a15c2b8..69c655d 100644
--- a/migration-tcp.c
+++ b/migration-tcp.c
@@ -107,13 +107,15 @@  static void tcp_accept_incoming_migration(void *opaque)
     f = qemu_fopen_socket(c);
     if (f == NULL) {
         fprintf(stderr, "could not qemu_fopen socket\n");
-        goto out;
+        close(c);
+        goto out2;
     }
 
     process_incoming_migration(f);
-    qemu_fclose(f);
-out:
-    close(c);
+    if (!incoming_postcopy) {
+        qemu_fclose(f);
+        close(c);
+    }
 out2:
     qemu_set_fd_handler2(s, NULL, NULL, NULL, NULL);
     close(s);
diff --git a/migration-unix.c b/migration-unix.c
index 169de88..d4e2431 100644
--- a/migration-unix.c
+++ b/migration-unix.c
@@ -140,13 +140,15 @@  static void unix_accept_incoming_migration(void *opaque)
     f = qemu_fopen_socket(c);
     if (f == NULL) {
         fprintf(stderr, "could not qemu_fopen socket\n");
-        goto out;
+        close(c);
+        goto out2;
     }
 
     process_incoming_migration(f);
-    qemu_fclose(f);
-out:
-    close(c);
+    if (!incoming_postcopy) {
+        qemu_fclose(f);
+        close(c);
+    }
 out2:
     qemu_set_fd_handler2(s, NULL, NULL, NULL, NULL);
     close(s);
diff --git a/migration.h b/migration.h
index 2d27738..0766691 100644
--- a/migration.h
+++ b/migration.h
@@ -134,4 +134,14 @@  int64_t migrate_xbzrle_cache_size(void);
 
 int64_t xbzrle_cache_resize(int64_t new_size);
 
+/* For incoming postcopy */
+extern bool incoming_postcopy;
+
+int postcopy_incoming_loadvm_state(QEMUFile *f, QEMUFile **buf_file);
+int postcopy_incoming_ram_load(QEMUFile *f, void *opaque, int version_id);
+void postcopy_incoming_qemu_cleanup(void);
+#if defined(NEED_CPU_H) && !defined(CONFIG_USER_ONLY)
+void postcopy_incoming_ram_free(RAMBlock *ram_block);
+#endif
+
 #endif
diff --git a/savevm.c b/savevm.c
index c93b6eb..d1488d2 100644
--- a/savevm.c
+++ b/savevm.c
@@ -1981,6 +1981,10 @@  int qemu_loadvm_state(QEMUFile *f)
     uint8_t section_type;
     unsigned int v;
     int ret;
+    QEMUFile *orig_f = NULL;
+
+    /* postcopy may change this. restore later */
+    LoadStateHandler *old_ram_load = savevm_ram_handlers.load_state;
 
     if (qemu_savevm_state_blocked(NULL)) {
         return -EINVAL;
@@ -2048,6 +2052,7 @@  int qemu_loadvm_state(QEMUFile *f)
             break;
         case QEMU_VM_SECTION_PART:
         case QEMU_VM_SECTION_END:
+            assert(orig_f == NULL);
             section_id = qemu_get_be32(f);
 
             QLIST_FOREACH(le, &loadvm_handlers, entry) {
@@ -2068,6 +2073,23 @@  int qemu_loadvm_state(QEMUFile *f)
                 goto out;
             }
             break;
+        case QEMU_VM_POSTCOPY: {
+            QEMUFile *buf_file = NULL;
+            ret = postcopy_incoming_loadvm_state(f, &buf_file);
+            if (ret) {
+                goto out;
+            }
+            if (buf_file != NULL) {
+                /* VMStateDescription:pre/post_load and
+                 * cpu_sychronize_all_post_init() may fault on guest RAM.
+                 * (MSR_KVM_WALL_CLOCK, MSR_KVM_SYSTEM_TIME)
+                 * postcopy threads needs to be created before the fault.
+                 */
+                orig_f = f;
+                f = buf_file;
+            }
+            break;
+        }
         default:
             fprintf(stderr, "Unknown savevm section type %d\n", section_type);
             ret = -EINVAL;
@@ -2080,6 +2102,12 @@  int qemu_loadvm_state(QEMUFile *f)
     ret = 0;
 
 out:
+    if (orig_f != NULL) {
+        assert(incoming_postcopy);
+        qemu_fclose(f);
+        f = orig_f;
+    }
+    savevm_ram_handlers.load_state = old_ram_load;
     QLIST_FOREACH_SAFE(le, &loadvm_handlers, entry, new_le) {
         QLIST_REMOVE(le, entry);
         g_free(le);
diff --git a/vl.c b/vl.c
index 723fc59..3221f50 100644
--- a/vl.c
+++ b/vl.c
@@ -3789,6 +3789,8 @@  int main(int argc, char **argv, char **envp)
     bdrv_close_all();
     pause_all_vcpus();
     net_cleanup();
+    postcopy_incoming_qemu_cleanup();
+
     res_free();
 
     return 0;