diff mbox

[v5,04/10] cpu: Reclaim vCPU objects

Message ID 1448024079-20808-5-git-send-email-bharata@linux.vnet.ibm.com
State New
Headers show

Commit Message

Bharata B Rao Nov. 20, 2015, 12:54 p.m. UTC
From: Gu Zheng <guz.fnst@cn.fujitsu.com>

In order to deal well with the kvm vcpus (which can not be removed without any
protection), we do not close KVM vcpu fd, just record and mark it as stopped
into a list, so that we can reuse it for the appending cpu hot-add request if
possible. It is also the approach that kvm guys suggested:
https://www.mail-archive.com/kvm@vger.kernel.org/msg102839.html

Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
Signed-off-by: Gu Zheng <guz.fnst@cn.fujitsu.com>
Signed-off-by: Zhu Guihua <zhugh.fnst@cn.fujitsu.com>
Signed-off-by: Bharata B Rao <bharata@linux.vnet.ibm.com>
               [- Explicit CPU_REMOVE() from qemu_kvm/tcg_destroy_vcpu()
                  isn't needed as it is done from cpu_exec_exit()]
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
---
 cpus.c               | 41 +++++++++++++++++++++++++++++++++++++
 include/qom/cpu.h    | 10 +++++++++
 include/sysemu/kvm.h |  1 +
 kvm-all.c            | 57 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 kvm-stub.c           |  5 +++++
 5 files changed, 113 insertions(+), 1 deletion(-)

Comments

Alexey Kardashevskiy Nov. 30, 2015, 7:30 a.m. UTC | #1
On 11/20/2015 11:54 PM, Bharata B Rao wrote:
> From: Gu Zheng <guz.fnst@cn.fujitsu.com>
>
> In order to deal well with the kvm vcpus (which can not be removed without any
> protection), we do not close KVM vcpu fd, just record and mark it as stopped
> into a list, so that we can reuse it for the appending cpu hot-add request if
> possible. It is also the approach that kvm guys suggested:
> https://www.mail-archive.com/kvm@vger.kernel.org/msg102839.html
>
> Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
> Signed-off-by: Gu Zheng <guz.fnst@cn.fujitsu.com>
> Signed-off-by: Zhu Guihua <zhugh.fnst@cn.fujitsu.com>
> Signed-off-by: Bharata B Rao <bharata@linux.vnet.ibm.com>
>                 [- Explicit CPU_REMOVE() from qemu_kvm/tcg_destroy_vcpu()
>                    isn't needed as it is done from cpu_exec_exit()]
> Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
> ---
>   cpus.c               | 41 +++++++++++++++++++++++++++++++++++++
>   include/qom/cpu.h    | 10 +++++++++
>   include/sysemu/kvm.h |  1 +
>   kvm-all.c            | 57 +++++++++++++++++++++++++++++++++++++++++++++++++++-
>   kvm-stub.c           |  5 +++++
>   5 files changed, 113 insertions(+), 1 deletion(-)
>
> diff --git a/cpus.c b/cpus.c
> index 877bd70..af2b274 100644
> --- a/cpus.c
> +++ b/cpus.c
> @@ -953,6 +953,21 @@ void async_run_on_cpu(CPUState *cpu, void (*func)(void *data), void *data)
>       qemu_cpu_kick(cpu);
>   }
>
> +static void qemu_kvm_destroy_vcpu(CPUState *cpu)
> +{
> +    if (kvm_destroy_vcpu(cpu) < 0) {
> +        error_report("kvm_destroy_vcpu failed.\n");
> +        exit(EXIT_FAILURE);
> +    }
> +
> +    object_unparent(OBJECT(cpu));
> +}
> +
> +static void qemu_tcg_destroy_vcpu(CPUState *cpu)
> +{
> +    object_unparent(OBJECT(cpu));
> +}
> +
>   static void flush_queued_work(CPUState *cpu)
>   {
>       struct qemu_work_item *wi;
> @@ -1053,6 +1068,11 @@ static void *qemu_kvm_cpu_thread_fn(void *arg)
>               }
>           }
>           qemu_kvm_wait_io_event(cpu);
> +        if (cpu->exit && !cpu_can_run(cpu)) {
> +            qemu_kvm_destroy_vcpu(cpu);
> +            qemu_mutex_unlock(&qemu_global_mutex);


Nit: qemu_mutex_unlock_iothread() may be? Or it is important for 
iothread_locked to remain "true"? It does not seem to be used much though.
David Gibson Dec. 1, 2015, 12:55 a.m. UTC | #2
On Fri, Nov 20, 2015 at 06:24:33PM +0530, Bharata B Rao wrote:
> From: Gu Zheng <guz.fnst@cn.fujitsu.com>
> 
> In order to deal well with the kvm vcpus (which can not be removed without any
> protection), we do not close KVM vcpu fd, just record and mark it as stopped
> into a list, so that we can reuse it for the appending cpu hot-add request if
> possible. It is also the approach that kvm guys suggested:
> https://www.mail-archive.com/kvm@vger.kernel.org/msg102839.html
> 
> Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
> Signed-off-by: Gu Zheng <guz.fnst@cn.fujitsu.com>
> Signed-off-by: Zhu Guihua <zhugh.fnst@cn.fujitsu.com>
> Signed-off-by: Bharata B Rao <bharata@linux.vnet.ibm.com>
>                [- Explicit CPU_REMOVE() from qemu_kvm/tcg_destroy_vcpu()
>                   isn't needed as it is done from cpu_exec_exit()]
> ---
>  cpus.c               | 41 +++++++++++++++++++++++++++++++++++++
>  include/qom/cpu.h    | 10 +++++++++
>  include/sysemu/kvm.h |  1 +
>  kvm-all.c            | 57 +++++++++++++++++++++++++++++++++++++++++++++++++++-
>  kvm-stub.c           |  5 +++++
>  5 files changed, 113 insertions(+), 1 deletion(-)
> 
> diff --git a/cpus.c b/cpus.c
> index 877bd70..af2b274 100644
> --- a/cpus.c
> +++ b/cpus.c
> @@ -953,6 +953,21 @@ void async_run_on_cpu(CPUState *cpu, void (*func)(void *data), void *data)
>      qemu_cpu_kick(cpu);
>  }
>  
> +static void qemu_kvm_destroy_vcpu(CPUState *cpu)
> +{
> +    if (kvm_destroy_vcpu(cpu) < 0) {
> +        error_report("kvm_destroy_vcpu failed.\n");
> +        exit(EXIT_FAILURE);
> +    }
> +
> +    object_unparent(OBJECT(cpu));
> +}
> +
> +static void qemu_tcg_destroy_vcpu(CPUState *cpu)
> +{
> +    object_unparent(OBJECT(cpu));
> +}
> +
>  static void flush_queued_work(CPUState *cpu)
>  {
>      struct qemu_work_item *wi;
> @@ -1053,6 +1068,11 @@ static void *qemu_kvm_cpu_thread_fn(void *arg)
>              }
>          }
>          qemu_kvm_wait_io_event(cpu);
> +        if (cpu->exit && !cpu_can_run(cpu)) {
> +            qemu_kvm_destroy_vcpu(cpu);
> +            qemu_mutex_unlock(&qemu_global_mutex);

This looks like a change to locking semantics, and I can't see the
connection to the described purpose of the patch.

> +            return NULL;
> +        }
>      }
>  
>      return NULL;
> @@ -1108,6 +1128,7 @@ static void tcg_exec_all(void);
>  static void *qemu_tcg_cpu_thread_fn(void *arg)
>  {
>      CPUState *cpu = arg;
> +    CPUState *remove_cpu = NULL;
>  
>      rcu_register_thread();
>  
> @@ -1145,6 +1166,16 @@ static void *qemu_tcg_cpu_thread_fn(void *arg)
>              }
>          }
>          qemu_tcg_wait_io_event(QTAILQ_FIRST(&cpus));
> +        CPU_FOREACH(cpu) {
> +            if (cpu->exit && !cpu_can_run(cpu)) {
> +                remove_cpu = cpu;
> +                break;
> +            }
> +        }
> +        if (remove_cpu) {
> +            qemu_tcg_destroy_vcpu(remove_cpu);
> +            remove_cpu = NULL;
> +        }

Any particular reason to only cleanup one cpu per iteration?

Also, any particular reason this isn't folded into tcg_exec_all with
the other cpu->exit logic?

>      }
>  
>      return NULL;
> @@ -1301,6 +1332,13 @@ void resume_all_vcpus(void)
>      }
>  }
>  
> +void cpu_remove(CPUState *cpu)
> +{
> +    cpu->stop = true;
> +    cpu->exit = true;
> +    qemu_cpu_kick(cpu);
> +}
> +
>  /* For temporary buffers for forming a name */
>  #define VCPU_THREAD_NAME_SIZE 16
>  
> @@ -1506,6 +1544,9 @@ static void tcg_exec_all(void)
>                  break;
>              }
>          } else if (cpu->stop || cpu->stopped) {
> +            if (cpu->exit) {
> +                next_cpu = CPU_NEXT(cpu);
> +            }
>              break;
>          }
>      }
> diff --git a/include/qom/cpu.h b/include/qom/cpu.h
> index 51a1323..67e05b0 100644
> --- a/include/qom/cpu.h
> +++ b/include/qom/cpu.h
> @@ -223,6 +223,7 @@ struct kvm_run;
>   * @halted: Nonzero if the CPU is in suspended state.
>   * @stop: Indicates a pending stop request.
>   * @stopped: Indicates the CPU has been artificially stopped.
> + * @exit: Indicates the CPU has exited due to an unplug operation.
>   * @crash_occurred: Indicates the OS reported a crash (panic) for this CPU
>   * @tcg_exit_req: Set to force TCG to stop executing linked TBs for this
>   *           CPU and return to its top level loop.
> @@ -274,6 +275,7 @@ struct CPUState {
>      bool created;
>      bool stop;
>      bool stopped;
> +    bool exit;
>      bool crash_occurred;
>      bool exit_request;
>      uint32_t interrupt_request;
> @@ -696,6 +698,14 @@ void cpu_exit(CPUState *cpu);
>  void cpu_resume(CPUState *cpu);
>  
>  /**
> + * cpu_remove:
> + * @cpu: The CPU to remove.
> + *
> + * Requests the CPU to be removed.
> + */
> +void cpu_remove(CPUState *cpu);
> +
> +/**
>   * qemu_init_vcpu:
>   * @cpu: The vCPU to initialize.
>   *
> diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h
> index b31f325..dd1b783 100644
> --- a/include/sysemu/kvm.h
> +++ b/include/sysemu/kvm.h
> @@ -206,6 +206,7 @@ int kvm_has_intx_set_mask(void);
>  
>  int kvm_init_vcpu(CPUState *cpu);
>  int kvm_cpu_exec(CPUState *cpu);
> +int kvm_destroy_vcpu(CPUState *cpu);
>  
>  #ifdef NEED_CPU_H
>  
> diff --git a/kvm-all.c b/kvm-all.c
> index c648b81..3befc59 100644
> --- a/kvm-all.c
> +++ b/kvm-all.c
> @@ -60,6 +60,12 @@
>  
>  #define KVM_MSI_HASHTAB_SIZE    256
>  
> +struct KVMParkedVcpu {
> +    unsigned long vcpu_id;
> +    int kvm_fd;
> +    QLIST_ENTRY(KVMParkedVcpu) node;
> +};
> +
>  struct KVMState
>  {
>      AccelState parent_obj;
> @@ -93,6 +99,7 @@ struct KVMState
>      QTAILQ_HEAD(msi_hashtab, KVMMSIRoute) msi_hashtab[KVM_MSI_HASHTAB_SIZE];
>  #endif
>      KVMMemoryListener memory_listener;
> +    QLIST_HEAD(, KVMParkedVcpu) kvm_parked_vcpus;
>  };
>  
>  KVMState *kvm_state;
> @@ -235,6 +242,53 @@ static int kvm_set_user_memory_region(KVMMemoryListener *kml, KVMSlot *slot)
>      return kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, &mem);
>  }
>  
> +int kvm_destroy_vcpu(CPUState *cpu)
> +{
> +    KVMState *s = kvm_state;
> +    long mmap_size;
> +    struct KVMParkedVcpu *vcpu = NULL;
> +    int ret = 0;
> +
> +    DPRINTF("kvm_destroy_vcpu\n");
> +
> +    mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0);
> +    if (mmap_size < 0) {
> +        ret = mmap_size;
> +        DPRINTF("KVM_GET_VCPU_MMAP_SIZE failed\n");
> +        goto err;
> +    }
> +
> +    ret = munmap(cpu->kvm_run, mmap_size);
> +    if (ret < 0) {
> +        goto err;
> +    }
> +
> +    vcpu = g_malloc0(sizeof(*vcpu));
> +    vcpu->vcpu_id = kvm_arch_vcpu_id(cpu);
> +    vcpu->kvm_fd = cpu->kvm_fd;
> +    QLIST_INSERT_HEAD(&kvm_state->kvm_parked_vcpus, vcpu, node);
> +err:
> +    return ret;
> +}
> +
> +static int kvm_get_vcpu(KVMState *s, unsigned long vcpu_id)
> +{
> +    struct KVMParkedVcpu *cpu;
> +
> +    QLIST_FOREACH(cpu, &s->kvm_parked_vcpus, node) {
> +        if (cpu->vcpu_id == vcpu_id) {
> +            int kvm_fd;
> +
> +            QLIST_REMOVE(cpu, node);
> +            kvm_fd = cpu->kvm_fd;
> +            g_free(cpu);
> +            return kvm_fd;
> +        }
> +    }

Hmm.. use of a simple list here does mean that unplugging, then
replugging all (except 1) vcpus would be an O(n^2) operation.  That's
probably still alright, I guess.

> +
> +    return kvm_vm_ioctl(s, KVM_CREATE_VCPU, (void *)vcpu_id);
> +}
> +
>  int kvm_init_vcpu(CPUState *cpu)
>  {
>      KVMState *s = kvm_state;
> @@ -243,7 +297,7 @@ int kvm_init_vcpu(CPUState *cpu)
>  
>      DPRINTF("kvm_init_vcpu\n");
>  
> -    ret = kvm_vm_ioctl(s, KVM_CREATE_VCPU, (void *)kvm_arch_vcpu_id(cpu));
> +    ret = kvm_get_vcpu(s, kvm_arch_vcpu_id(cpu));
>      if (ret < 0) {
>          DPRINTF("kvm_create_vcpu failed\n");
>          goto err;
> @@ -1468,6 +1522,7 @@ static int kvm_init(MachineState *ms)
>  #ifdef KVM_CAP_SET_GUEST_DEBUG
>      QTAILQ_INIT(&s->kvm_sw_breakpoints);
>  #endif
> +    QLIST_INIT(&s->kvm_parked_vcpus);
>      s->vmfd = -1;
>      s->fd = qemu_open("/dev/kvm", O_RDWR);
>      if (s->fd == -1) {
> diff --git a/kvm-stub.c b/kvm-stub.c
> index dc97a5e..0b39456 100644
> --- a/kvm-stub.c
> +++ b/kvm-stub.c
> @@ -32,6 +32,11 @@ bool kvm_allowed;
>  bool kvm_readonly_mem_allowed;
>  bool kvm_ioeventfd_any_length_allowed;
>  
> +int kvm_destroy_vcpu(CPUState *cpu)
> +{
> +    return -ENOSYS;
> +}
> +
>  int kvm_init_vcpu(CPUState *cpu)
>  {
>      return -ENOSYS;
Bharata B Rao Dec. 2, 2015, 3:50 a.m. UTC | #3
On Mon, Nov 30, 2015 at 06:30:52PM +1100, Alexey Kardashevskiy wrote:
> On 11/20/2015 11:54 PM, Bharata B Rao wrote:
> >From: Gu Zheng <guz.fnst@cn.fujitsu.com>
> >
> >In order to deal well with the kvm vcpus (which can not be removed without any
> >protection), we do not close KVM vcpu fd, just record and mark it as stopped
> >into a list, so that we can reuse it for the appending cpu hot-add request if
> >possible. It is also the approach that kvm guys suggested:
> >https://www.mail-archive.com/kvm@vger.kernel.org/msg102839.html
> >
> >Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
> >Signed-off-by: Gu Zheng <guz.fnst@cn.fujitsu.com>
> >Signed-off-by: Zhu Guihua <zhugh.fnst@cn.fujitsu.com>
> >Signed-off-by: Bharata B Rao <bharata@linux.vnet.ibm.com>
> >                [- Explicit CPU_REMOVE() from qemu_kvm/tcg_destroy_vcpu()
> >                   isn't needed as it is done from cpu_exec_exit()]
> >Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
> >---
> >  cpus.c               | 41 +++++++++++++++++++++++++++++++++++++
> >  include/qom/cpu.h    | 10 +++++++++
> >  include/sysemu/kvm.h |  1 +
> >  kvm-all.c            | 57 +++++++++++++++++++++++++++++++++++++++++++++++++++-
> >  kvm-stub.c           |  5 +++++
> >  5 files changed, 113 insertions(+), 1 deletion(-)
> >
> >diff --git a/cpus.c b/cpus.c
> >index 877bd70..af2b274 100644
> >--- a/cpus.c
> >+++ b/cpus.c
> >@@ -953,6 +953,21 @@ void async_run_on_cpu(CPUState *cpu, void (*func)(void *data), void *data)
> >      qemu_cpu_kick(cpu);
> >  }
> >
> >+static void qemu_kvm_destroy_vcpu(CPUState *cpu)
> >+{
> >+    if (kvm_destroy_vcpu(cpu) < 0) {
> >+        error_report("kvm_destroy_vcpu failed.\n");
> >+        exit(EXIT_FAILURE);
> >+    }
> >+
> >+    object_unparent(OBJECT(cpu));
> >+}
> >+
> >+static void qemu_tcg_destroy_vcpu(CPUState *cpu)
> >+{
> >+    object_unparent(OBJECT(cpu));
> >+}
> >+
> >  static void flush_queued_work(CPUState *cpu)
> >  {
> >      struct qemu_work_item *wi;
> >@@ -1053,6 +1068,11 @@ static void *qemu_kvm_cpu_thread_fn(void *arg)
> >              }
> >          }
> >          qemu_kvm_wait_io_event(cpu);
> >+        if (cpu->exit && !cpu_can_run(cpu)) {
> >+            qemu_kvm_destroy_vcpu(cpu);
> >+            qemu_mutex_unlock(&qemu_global_mutex);
> 
> 
> Nit: qemu_mutex_unlock_iothread() may be? Or it is important for
> iothread_locked to remain "true"? It does not seem to be used much though.

This patch is quite old and qemu_global_mutex got changed to
qemu_mutex_unlock_iothread() some time ago in this part of the code.

Thanks for noticing this, will fix this in next version.

Regards,
Bharata.
Bharata B Rao Dec. 2, 2015, 5:28 a.m. UTC | #4
On Tue, Dec 01, 2015 at 11:55:58AM +1100, David Gibson wrote:
> On Fri, Nov 20, 2015 at 06:24:33PM +0530, Bharata B Rao wrote:
> > From: Gu Zheng <guz.fnst@cn.fujitsu.com>
> > 
> > In order to deal well with the kvm vcpus (which can not be removed without any
> > protection), we do not close KVM vcpu fd, just record and mark it as stopped
> > into a list, so that we can reuse it for the appending cpu hot-add request if
> > possible. It is also the approach that kvm guys suggested:
> > https://www.mail-archive.com/kvm@vger.kernel.org/msg102839.html
> > 
> > Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
> > Signed-off-by: Gu Zheng <guz.fnst@cn.fujitsu.com>
> > Signed-off-by: Zhu Guihua <zhugh.fnst@cn.fujitsu.com>
> > Signed-off-by: Bharata B Rao <bharata@linux.vnet.ibm.com>
> >                [- Explicit CPU_REMOVE() from qemu_kvm/tcg_destroy_vcpu()
> >                   isn't needed as it is done from cpu_exec_exit()]
> > ---
> >  cpus.c               | 41 +++++++++++++++++++++++++++++++++++++
> >  include/qom/cpu.h    | 10 +++++++++
> >  include/sysemu/kvm.h |  1 +
> >  kvm-all.c            | 57 +++++++++++++++++++++++++++++++++++++++++++++++++++-
> >  kvm-stub.c           |  5 +++++
> >  5 files changed, 113 insertions(+), 1 deletion(-)
> > 
> > diff --git a/cpus.c b/cpus.c
> > index 877bd70..af2b274 100644
> > --- a/cpus.c
> > +++ b/cpus.c
> > @@ -953,6 +953,21 @@ void async_run_on_cpu(CPUState *cpu, void (*func)(void *data), void *data)
> >      qemu_cpu_kick(cpu);
> >  }
> >  
> > +static void qemu_kvm_destroy_vcpu(CPUState *cpu)
> > +{
> > +    if (kvm_destroy_vcpu(cpu) < 0) {
> > +        error_report("kvm_destroy_vcpu failed.\n");
> > +        exit(EXIT_FAILURE);
> > +    }
> > +
> > +    object_unparent(OBJECT(cpu));
> > +}
> > +
> > +static void qemu_tcg_destroy_vcpu(CPUState *cpu)
> > +{
> > +    object_unparent(OBJECT(cpu));
> > +}
> > +
> >  static void flush_queued_work(CPUState *cpu)
> >  {
> >      struct qemu_work_item *wi;
> > @@ -1053,6 +1068,11 @@ static void *qemu_kvm_cpu_thread_fn(void *arg)
> >              }
> >          }
> >          qemu_kvm_wait_io_event(cpu);
> > +        if (cpu->exit && !cpu_can_run(cpu)) {
> > +            qemu_kvm_destroy_vcpu(cpu);
> > +            qemu_mutex_unlock(&qemu_global_mutex);
> 
> This looks like a change to locking semantics, and I can't see the
> connection to the described purpose of the patch.

As I replied in another thread to Alexey, this needs fixing.

> 
> > +            return NULL;
> > +        }
> >      }
> >  
> >      return NULL;
> > @@ -1108,6 +1128,7 @@ static void tcg_exec_all(void);
> >  static void *qemu_tcg_cpu_thread_fn(void *arg)
> >  {
> >      CPUState *cpu = arg;
> > +    CPUState *remove_cpu = NULL;
> >  
> >      rcu_register_thread();
> >  
> > @@ -1145,6 +1166,16 @@ static void *qemu_tcg_cpu_thread_fn(void *arg)
> >              }
> >          }
> >          qemu_tcg_wait_io_event(QTAILQ_FIRST(&cpus));
> > +        CPU_FOREACH(cpu) {
> > +            if (cpu->exit && !cpu_can_run(cpu)) {
> > +                remove_cpu = cpu;
> > +                break;
> > +            }
> > +        }
> > +        if (remove_cpu) {
> > +            qemu_tcg_destroy_vcpu(remove_cpu);
> > +            remove_cpu = NULL;
> > +        }
> 
> Any particular reason to only cleanup one cpu per iteration?

Not sure, this is borrowed from x86 CPU hotplug patchset.
Zhu - do you know why ?

> 
> Also, any particular reason this isn't folded into tcg_exec_all with
> the other cpu->exit logic?

Looks like it can be done. Will give it a try in the next iteration.

Regards,
Bharata.
diff mbox

Patch

diff --git a/cpus.c b/cpus.c
index 877bd70..af2b274 100644
--- a/cpus.c
+++ b/cpus.c
@@ -953,6 +953,21 @@  void async_run_on_cpu(CPUState *cpu, void (*func)(void *data), void *data)
     qemu_cpu_kick(cpu);
 }
 
+static void qemu_kvm_destroy_vcpu(CPUState *cpu)
+{
+    if (kvm_destroy_vcpu(cpu) < 0) {
+        error_report("kvm_destroy_vcpu failed.\n");
+        exit(EXIT_FAILURE);
+    }
+
+    object_unparent(OBJECT(cpu));
+}
+
+static void qemu_tcg_destroy_vcpu(CPUState *cpu)
+{
+    object_unparent(OBJECT(cpu));
+}
+
 static void flush_queued_work(CPUState *cpu)
 {
     struct qemu_work_item *wi;
@@ -1053,6 +1068,11 @@  static void *qemu_kvm_cpu_thread_fn(void *arg)
             }
         }
         qemu_kvm_wait_io_event(cpu);
+        if (cpu->exit && !cpu_can_run(cpu)) {
+            qemu_kvm_destroy_vcpu(cpu);
+            qemu_mutex_unlock(&qemu_global_mutex);
+            return NULL;
+        }
     }
 
     return NULL;
@@ -1108,6 +1128,7 @@  static void tcg_exec_all(void);
 static void *qemu_tcg_cpu_thread_fn(void *arg)
 {
     CPUState *cpu = arg;
+    CPUState *remove_cpu = NULL;
 
     rcu_register_thread();
 
@@ -1145,6 +1166,16 @@  static void *qemu_tcg_cpu_thread_fn(void *arg)
             }
         }
         qemu_tcg_wait_io_event(QTAILQ_FIRST(&cpus));
+        CPU_FOREACH(cpu) {
+            if (cpu->exit && !cpu_can_run(cpu)) {
+                remove_cpu = cpu;
+                break;
+            }
+        }
+        if (remove_cpu) {
+            qemu_tcg_destroy_vcpu(remove_cpu);
+            remove_cpu = NULL;
+        }
     }
 
     return NULL;
@@ -1301,6 +1332,13 @@  void resume_all_vcpus(void)
     }
 }
 
+void cpu_remove(CPUState *cpu)
+{
+    cpu->stop = true;
+    cpu->exit = true;
+    qemu_cpu_kick(cpu);
+}
+
 /* For temporary buffers for forming a name */
 #define VCPU_THREAD_NAME_SIZE 16
 
@@ -1506,6 +1544,9 @@  static void tcg_exec_all(void)
                 break;
             }
         } else if (cpu->stop || cpu->stopped) {
+            if (cpu->exit) {
+                next_cpu = CPU_NEXT(cpu);
+            }
             break;
         }
     }
diff --git a/include/qom/cpu.h b/include/qom/cpu.h
index 51a1323..67e05b0 100644
--- a/include/qom/cpu.h
+++ b/include/qom/cpu.h
@@ -223,6 +223,7 @@  struct kvm_run;
  * @halted: Nonzero if the CPU is in suspended state.
  * @stop: Indicates a pending stop request.
  * @stopped: Indicates the CPU has been artificially stopped.
+ * @exit: Indicates the CPU has exited due to an unplug operation.
  * @crash_occurred: Indicates the OS reported a crash (panic) for this CPU
  * @tcg_exit_req: Set to force TCG to stop executing linked TBs for this
  *           CPU and return to its top level loop.
@@ -274,6 +275,7 @@  struct CPUState {
     bool created;
     bool stop;
     bool stopped;
+    bool exit;
     bool crash_occurred;
     bool exit_request;
     uint32_t interrupt_request;
@@ -696,6 +698,14 @@  void cpu_exit(CPUState *cpu);
 void cpu_resume(CPUState *cpu);
 
 /**
+ * cpu_remove:
+ * @cpu: The CPU to remove.
+ *
+ * Requests the CPU to be removed.
+ */
+void cpu_remove(CPUState *cpu);
+
+/**
  * qemu_init_vcpu:
  * @cpu: The vCPU to initialize.
  *
diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h
index b31f325..dd1b783 100644
--- a/include/sysemu/kvm.h
+++ b/include/sysemu/kvm.h
@@ -206,6 +206,7 @@  int kvm_has_intx_set_mask(void);
 
 int kvm_init_vcpu(CPUState *cpu);
 int kvm_cpu_exec(CPUState *cpu);
+int kvm_destroy_vcpu(CPUState *cpu);
 
 #ifdef NEED_CPU_H
 
diff --git a/kvm-all.c b/kvm-all.c
index c648b81..3befc59 100644
--- a/kvm-all.c
+++ b/kvm-all.c
@@ -60,6 +60,12 @@ 
 
 #define KVM_MSI_HASHTAB_SIZE    256
 
+struct KVMParkedVcpu {
+    unsigned long vcpu_id;
+    int kvm_fd;
+    QLIST_ENTRY(KVMParkedVcpu) node;
+};
+
 struct KVMState
 {
     AccelState parent_obj;
@@ -93,6 +99,7 @@  struct KVMState
     QTAILQ_HEAD(msi_hashtab, KVMMSIRoute) msi_hashtab[KVM_MSI_HASHTAB_SIZE];
 #endif
     KVMMemoryListener memory_listener;
+    QLIST_HEAD(, KVMParkedVcpu) kvm_parked_vcpus;
 };
 
 KVMState *kvm_state;
@@ -235,6 +242,53 @@  static int kvm_set_user_memory_region(KVMMemoryListener *kml, KVMSlot *slot)
     return kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, &mem);
 }
 
+int kvm_destroy_vcpu(CPUState *cpu)
+{
+    KVMState *s = kvm_state;
+    long mmap_size;
+    struct KVMParkedVcpu *vcpu = NULL;
+    int ret = 0;
+
+    DPRINTF("kvm_destroy_vcpu\n");
+
+    mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0);
+    if (mmap_size < 0) {
+        ret = mmap_size;
+        DPRINTF("KVM_GET_VCPU_MMAP_SIZE failed\n");
+        goto err;
+    }
+
+    ret = munmap(cpu->kvm_run, mmap_size);
+    if (ret < 0) {
+        goto err;
+    }
+
+    vcpu = g_malloc0(sizeof(*vcpu));
+    vcpu->vcpu_id = kvm_arch_vcpu_id(cpu);
+    vcpu->kvm_fd = cpu->kvm_fd;
+    QLIST_INSERT_HEAD(&kvm_state->kvm_parked_vcpus, vcpu, node);
+err:
+    return ret;
+}
+
+static int kvm_get_vcpu(KVMState *s, unsigned long vcpu_id)
+{
+    struct KVMParkedVcpu *cpu;
+
+    QLIST_FOREACH(cpu, &s->kvm_parked_vcpus, node) {
+        if (cpu->vcpu_id == vcpu_id) {
+            int kvm_fd;
+
+            QLIST_REMOVE(cpu, node);
+            kvm_fd = cpu->kvm_fd;
+            g_free(cpu);
+            return kvm_fd;
+        }
+    }
+
+    return kvm_vm_ioctl(s, KVM_CREATE_VCPU, (void *)vcpu_id);
+}
+
 int kvm_init_vcpu(CPUState *cpu)
 {
     KVMState *s = kvm_state;
@@ -243,7 +297,7 @@  int kvm_init_vcpu(CPUState *cpu)
 
     DPRINTF("kvm_init_vcpu\n");
 
-    ret = kvm_vm_ioctl(s, KVM_CREATE_VCPU, (void *)kvm_arch_vcpu_id(cpu));
+    ret = kvm_get_vcpu(s, kvm_arch_vcpu_id(cpu));
     if (ret < 0) {
         DPRINTF("kvm_create_vcpu failed\n");
         goto err;
@@ -1468,6 +1522,7 @@  static int kvm_init(MachineState *ms)
 #ifdef KVM_CAP_SET_GUEST_DEBUG
     QTAILQ_INIT(&s->kvm_sw_breakpoints);
 #endif
+    QLIST_INIT(&s->kvm_parked_vcpus);
     s->vmfd = -1;
     s->fd = qemu_open("/dev/kvm", O_RDWR);
     if (s->fd == -1) {
diff --git a/kvm-stub.c b/kvm-stub.c
index dc97a5e..0b39456 100644
--- a/kvm-stub.c
+++ b/kvm-stub.c
@@ -32,6 +32,11 @@  bool kvm_allowed;
 bool kvm_readonly_mem_allowed;
 bool kvm_ioeventfd_any_length_allowed;
 
+int kvm_destroy_vcpu(CPUState *cpu)
+{
+    return -ENOSYS;
+}
+
 int kvm_init_vcpu(CPUState *cpu)
 {
     return -ENOSYS;