diff mbox

[RFC,v3,17/24] cpus: Reclaim vCPU objects

Message ID 1429858066-12088-18-git-send-email-bharata@linux.vnet.ibm.com
State New
Headers show

Commit Message

Bharata B Rao April 24, 2015, 6:47 a.m. UTC
From: Gu Zheng <guz.fnst@cn.fujitsu.com>

KVM vcpus cannot be safely removed without additional protection, so we do not
close the KVM vcpu fd on unplug. Instead we record it in a list of parked
(stopped) vcpus, so that it can be reused by a subsequent cpu hot-add request
if possible. This is also the approach that the kvm developers suggested:
https://www.mail-archive.com/kvm@vger.kernel.org/msg102839.html

This patch also adds a QOM API object_has_no_children(Object *obj)
that checks whether a given object has any child objects. This API
is needed to release CPU core and socket objects when a vCPU is destroyed.

Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
Signed-off-by: Gu Zheng <guz.fnst@cn.fujitsu.com>
Signed-off-by: Zhu Guihua <zhugh.fnst@cn.fujitsu.com>
Signed-off-by: Bharata B Rao <bharata@linux.vnet.ibm.com>
               [Added core and socket removal bits]
---
 cpus.c               | 67 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 include/qom/cpu.h    | 11 +++++++++
 include/sysemu/kvm.h |  1 +
 kvm-all.c            | 57 +++++++++++++++++++++++++++++++++++++++++++-
 kvm-stub.c           |  5 ++++
 5 files changed, 140 insertions(+), 1 deletion(-)

Comments

David Gibson May 5, 2015, 7:20 a.m. UTC | #1
On Fri, Apr 24, 2015 at 12:17:39PM +0530, Bharata B Rao wrote:
> From: Gu Zheng <guz.fnst@cn.fujitsu.com>
> 
> In order to deal well with the kvm vcpus (which can not be removed without any
> protection), we do not close KVM vcpu fd, just record and mark it as stopped
> into a list, so that we can reuse it for the appending cpu hot-add request if
> possible. It is also the approach that kvm guys suggested:
> https://www.mail-archive.com/kvm@vger.kernel.org/msg102839.html
> 
> This patch also adds a QOM API object_has_no_children(Object *obj)
> that checks whether a given object has any child objects. This API
> is needed to release CPU core and socket objects when a vCPU is destroyed.

I'm guessing this commit message needs updating, since you seem to
have split this out into the previous patch.

> Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
> Signed-off-by: Gu Zheng <guz.fnst@cn.fujitsu.com>
> Signed-off-by: Zhu Guihua <zhugh.fnst@cn.fujitsu.com>
> Signed-off-by: Bharata B Rao <bharata@linux.vnet.ibm.com>
>                [Added core and socket removal bits]
> ---
>  cpus.c               | 67 ++++++++++++++++++++++++++++++++++++++++++++++++++++
>  include/qom/cpu.h    | 11 +++++++++
>  include/sysemu/kvm.h |  1 +
>  kvm-all.c            | 57 +++++++++++++++++++++++++++++++++++++++++++-
>  kvm-stub.c           |  5 ++++
>  5 files changed, 140 insertions(+), 1 deletion(-)
> 
> diff --git a/cpus.c b/cpus.c
> index 0fac143..325f8a6 100644
> --- a/cpus.c
> +++ b/cpus.c
> @@ -858,6 +858,47 @@ void async_run_on_cpu(CPUState *cpu, void (*func)(void *data), void *data)
>      qemu_cpu_kick(cpu);
>  }
>  
> +static void qemu_destroy_cpu_core(Object *core)
> +{
> +    Object *socket = core->parent;
> +
> +    object_unparent(core);
> +    if (socket && object_has_no_children(socket)) {
> +        object_unparent(socket);
> +    }

This seems a bit odd to me.  I thought the general idea of the new
approaches to cpu hotplug meant that the hotplug sequence started from
the top (the socket or core) and worked down to the threads, rather
than starting at the thread and working up to the core and socket
level.

> +}
> +
> +static void qemu_kvm_destroy_vcpu(CPUState *cpu)
> +{
> +    Object *thread = OBJECT(cpu);
> +    Object *core = thread->parent;
> +
> +    CPU_REMOVE(cpu);
> +
> +    if (kvm_destroy_vcpu(cpu) < 0) {
> +        error_report("kvm_destroy_vcpu failed.\n");
> +        exit(EXIT_FAILURE);
> +    }
> +
> +    object_unparent(thread);
> +    if (core && object_has_no_children(core)) {
> +        qemu_destroy_cpu_core(core);
> +    }
> +}
> +
> +static void qemu_tcg_destroy_vcpu(CPUState *cpu)
> +{
> +    Object *thread = OBJECT(cpu);
> +    Object *core = thread->parent;
> +
> +    CPU_REMOVE(cpu);
> +    object_unparent(OBJECT(cpu));
> +
> +    if (core && object_has_no_children(core)) {
> +        qemu_destroy_cpu_core(core);
> +    }
> +}
> +
>  static void flush_queued_work(CPUState *cpu)
>  {
>      struct qemu_work_item *wi;
> @@ -950,6 +991,11 @@ static void *qemu_kvm_cpu_thread_fn(void *arg)
>              }
>          }
>          qemu_kvm_wait_io_event(cpu);
> +        if (cpu->exit && !cpu_can_run(cpu)) {
> +            qemu_kvm_destroy_vcpu(cpu);
> +            qemu_mutex_unlock(&qemu_global_mutex);
> +            return NULL;
> +        }
>      }
>  
>      return NULL;
> @@ -1003,6 +1049,7 @@ static void tcg_exec_all(void);
>  static void *qemu_tcg_cpu_thread_fn(void *arg)
>  {
>      CPUState *cpu = arg;
> +    CPUState *remove_cpu = NULL;
>  
>      qemu_tcg_init_cpu_signals();
>      qemu_thread_get_self(cpu->thread);
> @@ -1039,6 +1086,16 @@ static void *qemu_tcg_cpu_thread_fn(void *arg)
>              }
>          }
>          qemu_tcg_wait_io_event();
> +        CPU_FOREACH(cpu) {
> +            if (cpu->exit && !cpu_can_run(cpu)) {
> +                remove_cpu = cpu;
> +                break;
> +            }
> +        }
> +        if (remove_cpu) {
> +            qemu_tcg_destroy_vcpu(remove_cpu);
> +            remove_cpu = NULL;
> +        }
>      }
>  
>      return NULL;
> @@ -1196,6 +1253,13 @@ void resume_all_vcpus(void)
>      }
>  }
>  
> +void cpu_remove(CPUState *cpu)
> +{
> +    cpu->stop = true;
> +    cpu->exit = true;
> +    qemu_cpu_kick(cpu);
> +}
> +
>  /* For temporary buffers for forming a name */
>  #define VCPU_THREAD_NAME_SIZE 16
>  
> @@ -1390,6 +1454,9 @@ static void tcg_exec_all(void)
>                  break;
>              }
>          } else if (cpu->stop || cpu->stopped) {
> +            if (cpu->exit) {
> +                next_cpu = CPU_NEXT(cpu);
> +            }
>              break;
>          }
>      }
> diff --git a/include/qom/cpu.h b/include/qom/cpu.h
> index 5241cf4..1bfc3d4 100644
> --- a/include/qom/cpu.h
> +++ b/include/qom/cpu.h
> @@ -206,6 +206,7 @@ struct kvm_run;
>   * @halted: Nonzero if the CPU is in suspended state.
>   * @stop: Indicates a pending stop request.
>   * @stopped: Indicates the CPU has been artificially stopped.
> + * @exit: Indicates the CPU has exited due to an unplug operation.
>   * @tcg_exit_req: Set to force TCG to stop executing linked TBs for this
>   *           CPU and return to its top level loop.
>   * @singlestep_enabled: Flags for single-stepping.
> @@ -249,6 +250,7 @@ struct CPUState {
>      bool created;
>      bool stop;
>      bool stopped;
> +    bool exit;
>      volatile sig_atomic_t exit_request;
>      uint32_t interrupt_request;
>      int singlestep_enabled;
> @@ -306,6 +308,7 @@ struct CPUState {
>  QTAILQ_HEAD(CPUTailQ, CPUState);
>  extern struct CPUTailQ cpus;
>  #define CPU_NEXT(cpu) QTAILQ_NEXT(cpu, node)
> +#define CPU_REMOVE(cpu) QTAILQ_REMOVE(&cpus, cpu, node)
>  #define CPU_FOREACH(cpu) QTAILQ_FOREACH(cpu, &cpus, node)
>  #define CPU_FOREACH_SAFE(cpu, next_cpu) \
>      QTAILQ_FOREACH_SAFE(cpu, &cpus, node, next_cpu)
> @@ -610,6 +613,14 @@ void cpu_exit(CPUState *cpu);
>   */
>  void cpu_resume(CPUState *cpu);
>  
> + /**
> + * cpu_remove:
> + * @cpu: The CPU to remove.
> + *
> + * Requests the CPU to be removed.
> + */
> +void cpu_remove(CPUState *cpu);
> +
>  /**
>   * qemu_init_vcpu:
>   * @cpu: The vCPU to initialize.
> diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h
> index 30cb84d..560caef 100644
> --- a/include/sysemu/kvm.h
> +++ b/include/sysemu/kvm.h
> @@ -188,6 +188,7 @@ int kvm_has_intx_set_mask(void);
>  
>  int kvm_init_vcpu(CPUState *cpu);
>  int kvm_cpu_exec(CPUState *cpu);
> +int kvm_destroy_vcpu(CPUState *cpu);
>  
>  #ifdef NEED_CPU_H
>  
> diff --git a/kvm-all.c b/kvm-all.c
> index 05a79c2..46e7853 100644
> --- a/kvm-all.c
> +++ b/kvm-all.c
> @@ -71,6 +71,12 @@ typedef struct KVMSlot
>  
>  typedef struct kvm_dirty_log KVMDirtyLog;
>  
> +struct KVMParkedVcpu {
> +    unsigned long vcpu_id;
> +    int kvm_fd;
> +    QLIST_ENTRY(KVMParkedVcpu) node;
> +};
> +
>  struct KVMState
>  {
>      AccelState parent_obj;
> @@ -107,6 +113,7 @@ struct KVMState
>      QTAILQ_HEAD(msi_hashtab, KVMMSIRoute) msi_hashtab[KVM_MSI_HASHTAB_SIZE];
>      bool direct_msi;
>  #endif
> +    QLIST_HEAD(, KVMParkedVcpu) kvm_parked_vcpus;
>  };
>  
>  #define TYPE_KVM_ACCEL ACCEL_CLASS_NAME("kvm")
> @@ -247,6 +254,53 @@ static int kvm_set_user_memory_region(KVMState *s, KVMSlot *slot)
>      return kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, &mem);
>  }
>  
> +int kvm_destroy_vcpu(CPUState *cpu)
> +{
> +    KVMState *s = kvm_state;
> +    long mmap_size;
> +    struct KVMParkedVcpu *vcpu = NULL;
> +    int ret = 0;
> +
> +    DPRINTF("kvm_destroy_vcpu\n");
> +
> +    mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0);
> +    if (mmap_size < 0) {
> +        ret = mmap_size;
> +        DPRINTF("KVM_GET_VCPU_MMAP_SIZE failed\n");
> +        goto err;
> +    }
> +
> +    ret = munmap(cpu->kvm_run, mmap_size);
> +    if (ret < 0) {
> +        goto err;
> +    }
> +
> +    vcpu = g_malloc0(sizeof(*vcpu));
> +    vcpu->vcpu_id = kvm_arch_vcpu_id(cpu);
> +    vcpu->kvm_fd = cpu->kvm_fd;
> +    QLIST_INSERT_HEAD(&kvm_state->kvm_parked_vcpus, vcpu, node);
> +err:
> +    return ret;
> +}
> +
> +static int kvm_get_vcpu(KVMState *s, unsigned long vcpu_id)
> +{
> +    struct KVMParkedVcpu *cpu;
> +
> +    QLIST_FOREACH(cpu, &s->kvm_parked_vcpus, node) {
> +        if (cpu->vcpu_id == vcpu_id) {
> +            int kvm_fd;
> +
> +            QLIST_REMOVE(cpu, node);
> +            kvm_fd = cpu->kvm_fd;
> +            g_free(cpu);
> +            return kvm_fd;
> +        }
> +    }
> +
> +    return kvm_vm_ioctl(s, KVM_CREATE_VCPU, (void *)vcpu_id);
> +}
> +
>  int kvm_init_vcpu(CPUState *cpu)
>  {
>      KVMState *s = kvm_state;
> @@ -255,7 +309,7 @@ int kvm_init_vcpu(CPUState *cpu)
>  
>      DPRINTF("kvm_init_vcpu\n");
>  
> -    ret = kvm_vm_ioctl(s, KVM_CREATE_VCPU, (void *)kvm_arch_vcpu_id(cpu));
> +    ret = kvm_get_vcpu(s, kvm_arch_vcpu_id(cpu));
>      if (ret < 0) {
>          DPRINTF("kvm_create_vcpu failed\n");
>          goto err;
> @@ -1448,6 +1502,7 @@ static int kvm_init(MachineState *ms)
>  #ifdef KVM_CAP_SET_GUEST_DEBUG
>      QTAILQ_INIT(&s->kvm_sw_breakpoints);
>  #endif
> +    QLIST_INIT(&s->kvm_parked_vcpus);
>      s->vmfd = -1;
>      s->fd = qemu_open("/dev/kvm", O_RDWR);
>      if (s->fd == -1) {
> diff --git a/kvm-stub.c b/kvm-stub.c
> index 7ba90c5..79ac626 100644
> --- a/kvm-stub.c
> +++ b/kvm-stub.c
> @@ -30,6 +30,11 @@ bool kvm_gsi_direct_mapping;
>  bool kvm_allowed;
>  bool kvm_readonly_mem_allowed;
>  
> +int kvm_destroy_vcpu(CPUState *cpu)
> +{
> +    return -ENOSYS;
> +}
> +
>  int kvm_init_vcpu(CPUState *cpu)
>  {
>      return -ENOSYS;
Bharata B Rao May 6, 2015, 6:37 a.m. UTC | #2
On Tue, May 05, 2015 at 05:20:04PM +1000, David Gibson wrote:
> On Fri, Apr 24, 2015 at 12:17:39PM +0530, Bharata B Rao wrote:
> > From: Gu Zheng <guz.fnst@cn.fujitsu.com>
> > 
> > In order to deal well with the kvm vcpus (which can not be removed without any
> > protection), we do not close KVM vcpu fd, just record and mark it as stopped
> > into a list, so that we can reuse it for the appending cpu hot-add request if
> > possible. It is also the approach that kvm guys suggested:
> > https://www.mail-archive.com/kvm@vger.kernel.org/msg102839.html
> > 
> > This patch also adds a QOM API object_has_no_children(Object *obj)
> > that checks whether a given object has any child objects. This API
> > is needed to release CPU core and socket objects when a vCPU is destroyed.
> 
> I'm guessing this commit message needs updating, since you seem to
> have split this out into the previous patch.
> 
> > Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
> > Signed-off-by: Gu Zheng <guz.fnst@cn.fujitsu.com>
> > Signed-off-by: Zhu Guihua <zhugh.fnst@cn.fujitsu.com>
> > Signed-off-by: Bharata B Rao <bharata@linux.vnet.ibm.com>
> >                [Added core and socket removal bits]
> > ---
> >  cpus.c               | 67 ++++++++++++++++++++++++++++++++++++++++++++++++++++
> >  include/qom/cpu.h    | 11 +++++++++
> >  include/sysemu/kvm.h |  1 +
> >  kvm-all.c            | 57 +++++++++++++++++++++++++++++++++++++++++++-
> >  kvm-stub.c           |  5 ++++
> >  5 files changed, 140 insertions(+), 1 deletion(-)
> > 
> > diff --git a/cpus.c b/cpus.c
> > index 0fac143..325f8a6 100644
> > --- a/cpus.c
> > +++ b/cpus.c
> > @@ -858,6 +858,47 @@ void async_run_on_cpu(CPUState *cpu, void (*func)(void *data), void *data)
> >      qemu_cpu_kick(cpu);
> >  }
> >  
> > +static void qemu_destroy_cpu_core(Object *core)
> > +{
> > +    Object *socket = core->parent;
> > +
> > +    object_unparent(core);
> > +    if (socket && object_has_no_children(socket)) {
> > +        object_unparent(socket);
> > +    }
> 
> This seems a bit odd to me.  I thought the general idea of the new
> approaches to cpu hotplug meant that the hotplug sequence started from
> the top (the socket or core) and worked down to the threads.  Rather
> than starting at the thread, and working up to the core and socket
> level.

Yes that's true for hotplug as well as hot unplug currently. Plug or
unplug starts at socket, moves down to cores and threads.

However when the unplug request comes down to the thread, we have to
destroy the vCPU and that's when we end up in this part of the code. Here
the thread (vCPU) unparents itself from the core. The core can't unparent
until all its threads have unparented themselves. When all threads of a
core are done unparenting, core goes ahead and unparents itself from
its parent socket. Similarly socket can unparent when all cores under
it have unparented themselves from the socket.

This is the code that ensures that the socket device object finally
gets cleared and the id associated with the hot removed socket device
is available for reuse with next hotplug.

> 
> > +}
> > +
> > +static void qemu_kvm_destroy_vcpu(CPUState *cpu)
> > +{
> > +    Object *thread = OBJECT(cpu);
> > +    Object *core = thread->parent;
> > +
> > +    CPU_REMOVE(cpu);
> > +
> > +    if (kvm_destroy_vcpu(cpu) < 0) {
> > +        error_report("kvm_destroy_vcpu failed.\n");
> > +        exit(EXIT_FAILURE);
> > +    }
> > +
> > +    object_unparent(thread);
> > +    if (core && object_has_no_children(core)) {
> > +        qemu_destroy_cpu_core(core);
> > +    }
> > +}
> > +

Regards,
Bharata.
David Gibson May 7, 2015, 1:06 a.m. UTC | #3
On Wed, May 06, 2015 at 12:07:57PM +0530, Bharata B Rao wrote:
> On Tue, May 05, 2015 at 05:20:04PM +1000, David Gibson wrote:
> > On Fri, Apr 24, 2015 at 12:17:39PM +0530, Bharata B Rao wrote:
> > > From: Gu Zheng <guz.fnst@cn.fujitsu.com>
> > > 
> > > In order to deal well with the kvm vcpus (which can not be removed without any
> > > protection), we do not close KVM vcpu fd, just record and mark it as stopped
> > > into a list, so that we can reuse it for the appending cpu hot-add request if
> > > possible. It is also the approach that kvm guys suggested:
> > > https://www.mail-archive.com/kvm@vger.kernel.org/msg102839.html
> > > 
> > > This patch also adds a QOM API object_has_no_children(Object *obj)
> > > that checks whether a given object has any child objects. This API
> > > is needed to release CPU core and socket objects when a vCPU is destroyed.
> > 
> > I'm guessing this commit message needs updating, since you seem to
> > have split this out into the previous patch.
> > 
> > > Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
> > > Signed-off-by: Gu Zheng <guz.fnst@cn.fujitsu.com>
> > > Signed-off-by: Zhu Guihua <zhugh.fnst@cn.fujitsu.com>
> > > Signed-off-by: Bharata B Rao <bharata@linux.vnet.ibm.com>
> > >                [Added core and socket removal bits]
> > > ---
> > >  cpus.c               | 67 ++++++++++++++++++++++++++++++++++++++++++++++++++++
> > >  include/qom/cpu.h    | 11 +++++++++
> > >  include/sysemu/kvm.h |  1 +
> > >  kvm-all.c            | 57 +++++++++++++++++++++++++++++++++++++++++++-
> > >  kvm-stub.c           |  5 ++++
> > >  5 files changed, 140 insertions(+), 1 deletion(-)
> > > 
> > > diff --git a/cpus.c b/cpus.c
> > > index 0fac143..325f8a6 100644
> > > --- a/cpus.c
> > > +++ b/cpus.c
> > > @@ -858,6 +858,47 @@ void async_run_on_cpu(CPUState *cpu, void (*func)(void *data), void *data)
> > >      qemu_cpu_kick(cpu);
> > >  }
> > >  
> > > +static void qemu_destroy_cpu_core(Object *core)
> > > +{
> > > +    Object *socket = core->parent;
> > > +
> > > +    object_unparent(core);
> > > +    if (socket && object_has_no_children(socket)) {
> > > +        object_unparent(socket);
> > > +    }
> > 
> > This seems a bit odd to me.  I thought the general idea of the new
> > approaches to cpu hotplug meant that the hotplug sequence started from
> > the top (the socket or core) and worked down to the threads.  Rather
> > than starting at the thread, and working up to the core and socket
> > level.
> 
> Yes that's true for hotplug as well as hot unplug currently. Plug or
> unplug starts at socket, moves down to cores and threads.
> 
> However when the unplug request comes down to the thread, we have to
> destroy the vCPU and that's when we end up in this part of the code. Here
> the thread (vCPU) unparents itself from the core. The core can't unparent
> until all its threads have unparented themselves. When all threads of a
> core are done unparenting, core goes ahead and unparents itself from
> its parent socket. Similarly socket can unparent when all cores under
> it have unparented themselves from the socket.

Why can't the core unplug routine propagate the unplug down to the
threads, let that complete, then do the per-core unplug stuff and
remove itself?

Is there an asynchronous callback in here somewhere?

> This is the code that ensures that the socket device object finally
> gets cleared and the id associated with the hot removed socket device
> is available for reuse with next hotplug.
> 
> > 
> > > +}
> > > +
> > > +static void qemu_kvm_destroy_vcpu(CPUState *cpu)
> > > +{
> > > +    Object *thread = OBJECT(cpu);
> > > +    Object *core = thread->parent;
> > > +
> > > +    CPU_REMOVE(cpu);
> > > +
> > > +    if (kvm_destroy_vcpu(cpu) < 0) {
> > > +        error_report("kvm_destroy_vcpu failed.\n");
> > > +        exit(EXIT_FAILURE);
> > > +    }
> > > +
> > > +    object_unparent(thread);
> > > +    if (core && object_has_no_children(core)) {
> > > +        qemu_destroy_cpu_core(core);
> > > +    }
> > > +}
> > > +
> 
> Regards,
> Bharata.
>
diff mbox

Patch

diff --git a/cpus.c b/cpus.c
index 0fac143..325f8a6 100644
--- a/cpus.c
+++ b/cpus.c
@@ -858,6 +858,47 @@  void async_run_on_cpu(CPUState *cpu, void (*func)(void *data), void *data)
     qemu_cpu_kick(cpu);
 }
 
+static void qemu_destroy_cpu_core(Object *core)
+{
+    Object *socket = core->parent;
+
+    object_unparent(core);
+    if (socket && object_has_no_children(socket)) {
+        object_unparent(socket);
+    }
+}
+
+static void qemu_kvm_destroy_vcpu(CPUState *cpu)
+{
+    Object *thread = OBJECT(cpu);
+    Object *core = thread->parent;
+
+    CPU_REMOVE(cpu);
+
+    if (kvm_destroy_vcpu(cpu) < 0) {
+        error_report("kvm_destroy_vcpu failed.\n");
+        exit(EXIT_FAILURE);
+    }
+
+    object_unparent(thread);
+    if (core && object_has_no_children(core)) {
+        qemu_destroy_cpu_core(core);
+    }
+}
+
+static void qemu_tcg_destroy_vcpu(CPUState *cpu)
+{
+    Object *thread = OBJECT(cpu);
+    Object *core = thread->parent;
+
+    CPU_REMOVE(cpu);
+    object_unparent(OBJECT(cpu));
+
+    if (core && object_has_no_children(core)) {
+        qemu_destroy_cpu_core(core);
+    }
+}
+
 static void flush_queued_work(CPUState *cpu)
 {
     struct qemu_work_item *wi;
@@ -950,6 +991,11 @@  static void *qemu_kvm_cpu_thread_fn(void *arg)
             }
         }
         qemu_kvm_wait_io_event(cpu);
+        if (cpu->exit && !cpu_can_run(cpu)) {
+            qemu_kvm_destroy_vcpu(cpu);
+            qemu_mutex_unlock(&qemu_global_mutex);
+            return NULL;
+        }
     }
 
     return NULL;
@@ -1003,6 +1049,7 @@  static void tcg_exec_all(void);
 static void *qemu_tcg_cpu_thread_fn(void *arg)
 {
     CPUState *cpu = arg;
+    CPUState *remove_cpu = NULL;
 
     qemu_tcg_init_cpu_signals();
     qemu_thread_get_self(cpu->thread);
@@ -1039,6 +1086,16 @@  static void *qemu_tcg_cpu_thread_fn(void *arg)
             }
         }
         qemu_tcg_wait_io_event();
+        CPU_FOREACH(cpu) {
+            if (cpu->exit && !cpu_can_run(cpu)) {
+                remove_cpu = cpu;
+                break;
+            }
+        }
+        if (remove_cpu) {
+            qemu_tcg_destroy_vcpu(remove_cpu);
+            remove_cpu = NULL;
+        }
     }
 
     return NULL;
@@ -1196,6 +1253,13 @@  void resume_all_vcpus(void)
     }
 }
 
+void cpu_remove(CPUState *cpu)
+{
+    cpu->stop = true;
+    cpu->exit = true;
+    qemu_cpu_kick(cpu);
+}
+
 /* For temporary buffers for forming a name */
 #define VCPU_THREAD_NAME_SIZE 16
 
@@ -1390,6 +1454,9 @@  static void tcg_exec_all(void)
                 break;
             }
         } else if (cpu->stop || cpu->stopped) {
+            if (cpu->exit) {
+                next_cpu = CPU_NEXT(cpu);
+            }
             break;
         }
     }
diff --git a/include/qom/cpu.h b/include/qom/cpu.h
index 5241cf4..1bfc3d4 100644
--- a/include/qom/cpu.h
+++ b/include/qom/cpu.h
@@ -206,6 +206,7 @@  struct kvm_run;
  * @halted: Nonzero if the CPU is in suspended state.
  * @stop: Indicates a pending stop request.
  * @stopped: Indicates the CPU has been artificially stopped.
+ * @exit: Indicates the CPU has exited due to an unplug operation.
  * @tcg_exit_req: Set to force TCG to stop executing linked TBs for this
  *           CPU and return to its top level loop.
  * @singlestep_enabled: Flags for single-stepping.
@@ -249,6 +250,7 @@  struct CPUState {
     bool created;
     bool stop;
     bool stopped;
+    bool exit;
     volatile sig_atomic_t exit_request;
     uint32_t interrupt_request;
     int singlestep_enabled;
@@ -306,6 +308,7 @@  struct CPUState {
 QTAILQ_HEAD(CPUTailQ, CPUState);
 extern struct CPUTailQ cpus;
 #define CPU_NEXT(cpu) QTAILQ_NEXT(cpu, node)
+#define CPU_REMOVE(cpu) QTAILQ_REMOVE(&cpus, cpu, node)
 #define CPU_FOREACH(cpu) QTAILQ_FOREACH(cpu, &cpus, node)
 #define CPU_FOREACH_SAFE(cpu, next_cpu) \
     QTAILQ_FOREACH_SAFE(cpu, &cpus, node, next_cpu)
@@ -610,6 +613,14 @@  void cpu_exit(CPUState *cpu);
  */
 void cpu_resume(CPUState *cpu);
 
+ /**
+ * cpu_remove:
+ * @cpu: The CPU to remove.
+ *
+ * Requests the CPU to be removed.
+ */
+void cpu_remove(CPUState *cpu);
+
 /**
  * qemu_init_vcpu:
  * @cpu: The vCPU to initialize.
diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h
index 30cb84d..560caef 100644
--- a/include/sysemu/kvm.h
+++ b/include/sysemu/kvm.h
@@ -188,6 +188,7 @@  int kvm_has_intx_set_mask(void);
 
 int kvm_init_vcpu(CPUState *cpu);
 int kvm_cpu_exec(CPUState *cpu);
+int kvm_destroy_vcpu(CPUState *cpu);
 
 #ifdef NEED_CPU_H
 
diff --git a/kvm-all.c b/kvm-all.c
index 05a79c2..46e7853 100644
--- a/kvm-all.c
+++ b/kvm-all.c
@@ -71,6 +71,12 @@  typedef struct KVMSlot
 
 typedef struct kvm_dirty_log KVMDirtyLog;
 
+struct KVMParkedVcpu {
+    unsigned long vcpu_id;
+    int kvm_fd;
+    QLIST_ENTRY(KVMParkedVcpu) node;
+};
+
 struct KVMState
 {
     AccelState parent_obj;
@@ -107,6 +113,7 @@  struct KVMState
     QTAILQ_HEAD(msi_hashtab, KVMMSIRoute) msi_hashtab[KVM_MSI_HASHTAB_SIZE];
     bool direct_msi;
 #endif
+    QLIST_HEAD(, KVMParkedVcpu) kvm_parked_vcpus;
 };
 
 #define TYPE_KVM_ACCEL ACCEL_CLASS_NAME("kvm")
@@ -247,6 +254,53 @@  static int kvm_set_user_memory_region(KVMState *s, KVMSlot *slot)
     return kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, &mem);
 }
 
+int kvm_destroy_vcpu(CPUState *cpu)
+{
+    KVMState *s = kvm_state;
+    long mmap_size;
+    struct KVMParkedVcpu *vcpu = NULL;
+    int ret = 0;
+
+    DPRINTF("kvm_destroy_vcpu\n");
+
+    mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0);
+    if (mmap_size < 0) {
+        ret = mmap_size;
+        DPRINTF("KVM_GET_VCPU_MMAP_SIZE failed\n");
+        goto err;
+    }
+
+    ret = munmap(cpu->kvm_run, mmap_size);
+    if (ret < 0) {
+        goto err;
+    }
+
+    vcpu = g_malloc0(sizeof(*vcpu));
+    vcpu->vcpu_id = kvm_arch_vcpu_id(cpu);
+    vcpu->kvm_fd = cpu->kvm_fd;
+    QLIST_INSERT_HEAD(&kvm_state->kvm_parked_vcpus, vcpu, node);
+err:
+    return ret;
+}
+
+static int kvm_get_vcpu(KVMState *s, unsigned long vcpu_id)
+{
+    struct KVMParkedVcpu *cpu;
+
+    QLIST_FOREACH(cpu, &s->kvm_parked_vcpus, node) {
+        if (cpu->vcpu_id == vcpu_id) {
+            int kvm_fd;
+
+            QLIST_REMOVE(cpu, node);
+            kvm_fd = cpu->kvm_fd;
+            g_free(cpu);
+            return kvm_fd;
+        }
+    }
+
+    return kvm_vm_ioctl(s, KVM_CREATE_VCPU, (void *)vcpu_id);
+}
+
 int kvm_init_vcpu(CPUState *cpu)
 {
     KVMState *s = kvm_state;
@@ -255,7 +309,7 @@  int kvm_init_vcpu(CPUState *cpu)
 
     DPRINTF("kvm_init_vcpu\n");
 
-    ret = kvm_vm_ioctl(s, KVM_CREATE_VCPU, (void *)kvm_arch_vcpu_id(cpu));
+    ret = kvm_get_vcpu(s, kvm_arch_vcpu_id(cpu));
     if (ret < 0) {
         DPRINTF("kvm_create_vcpu failed\n");
         goto err;
@@ -1448,6 +1502,7 @@  static int kvm_init(MachineState *ms)
 #ifdef KVM_CAP_SET_GUEST_DEBUG
     QTAILQ_INIT(&s->kvm_sw_breakpoints);
 #endif
+    QLIST_INIT(&s->kvm_parked_vcpus);
     s->vmfd = -1;
     s->fd = qemu_open("/dev/kvm", O_RDWR);
     if (s->fd == -1) {
diff --git a/kvm-stub.c b/kvm-stub.c
index 7ba90c5..79ac626 100644
--- a/kvm-stub.c
+++ b/kvm-stub.c
@@ -30,6 +30,11 @@  bool kvm_gsi_direct_mapping;
 bool kvm_allowed;
 bool kvm_readonly_mem_allowed;
 
+int kvm_destroy_vcpu(CPUState *cpu)
+{
+    return -ENOSYS;
+}
+
 int kvm_init_vcpu(CPUState *cpu)
 {
     return -ENOSYS;