diff mbox

[RFC,V2,10/10] cpus: reclaim allocated vCPU objects

Message ID 1409197002-9498-11-git-send-email-guz.fnst@cn.fujitsu.com
State New
Headers show

Commit Message

Gu Zheng Aug. 28, 2014, 3:36 a.m. UTC
After ACPI gets a signal to eject a vCPU, the vCPU must be
removed from the CPU list before it is really removed, and then
all the related vCPU objects are released.
But we do not close KVM vcpu fd, just record it into a list, in
order to reuse it.

Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
Signed-off-by: Gu Zheng <guz.fnst@cn.fujitsu.com>
---
 cpus.c               |   37 ++++++++++++++++++++++++++++++++
 include/sysemu/kvm.h |    1 +
 kvm-all.c            |   57 +++++++++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 94 insertions(+), 1 deletions(-)

Comments

Igor Mammedov Sept. 9, 2014, 2:40 p.m. UTC | #1
On Thu, 28 Aug 2014 11:36:42 +0800
Gu Zheng <guz.fnst@cn.fujitsu.com> wrote:

> After ACPI get a signal to eject a vCPU, the vCPU must be
> removed from CPU list,before the vCPU really removed,  then
> release the all related vCPU objects.
> But we do not close KVM vcpu fd, just record it into a list, in
> order to reuse it.
> 
> Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
> Signed-off-by: Gu Zheng <guz.fnst@cn.fujitsu.com>
> ---
>  cpus.c               |   37 ++++++++++++++++++++++++++++++++
>  include/sysemu/kvm.h |    1 +
>  kvm-all.c            |   57 +++++++++++++++++++++++++++++++++++++++++++++++++-
>  3 files changed, 94 insertions(+), 1 deletions(-)
> 
> diff --git a/cpus.c b/cpus.c
> index eee693b..0608b41 100644
> --- a/cpus.c
> +++ b/cpus.c
> @@ -851,6 +851,24 @@ void async_run_on_cpu(CPUState *cpu, void (*func)(void *data), void *data)
>      qemu_cpu_kick(cpu);
>  }
>  
> +static void qemu_kvm_destroy_vcpu(CPUState *cpu)
> +{
> +    CPU_REMOVE(cpu);
> +
> +    if (kvm_destroy_vcpu(cpu) < 0) {
> +        fprintf(stderr, "kvm_destroy_vcpu failed.\n");
> +        exit(1);
> +    }
> +
> +    object_unparent(OBJECT(cpu));
> +}
> +
> +static void qemu_tcg_destroy_vcpu(CPUState *cpu)
> +{
> +    CPU_REMOVE(cpu);
> +    object_unparent(OBJECT(cpu));
> +}
> +
>  static void flush_queued_work(CPUState *cpu)
>  {
>      struct qemu_work_item *wi;
> @@ -942,6 +960,11 @@ static void *qemu_kvm_cpu_thread_fn(void *arg)
>              }
>          }
>          qemu_kvm_wait_io_event(cpu);
> +        if (cpu->exit && !cpu_can_run(cpu)) {
> +            qemu_kvm_destroy_vcpu(cpu);
> +            qemu_mutex_unlock(&qemu_global_mutex);
> +            return NULL;
> +        }
>      }
>  
>      return NULL;
> @@ -994,6 +1017,7 @@ static void tcg_exec_all(void);
>  static void *qemu_tcg_cpu_thread_fn(void *arg)
>  {
>      CPUState *cpu = arg;
> +    CPUState *remove_cpu = NULL;
>  
>      qemu_tcg_init_cpu_signals();
>      qemu_thread_get_self(cpu->thread);
> @@ -1026,6 +1050,16 @@ static void *qemu_tcg_cpu_thread_fn(void *arg)
>              }
>          }
>          qemu_tcg_wait_io_event();
> +        CPU_FOREACH(cpu) {
> +            if (cpu->exit && !cpu_can_run(cpu)) {
> +                remove_cpu = cpu;
> +                break;
> +            }
> +        }
> +        if (remove_cpu) {
> +            qemu_tcg_destroy_vcpu(remove_cpu);
> +            remove_cpu = NULL;
> +        }
>      }
>  
>      return NULL;
> @@ -1383,6 +1417,9 @@ static void tcg_exec_all(void)
>                  break;
>              }
>          } else if (cpu->stop || cpu->stopped) {
> +            if (cpu->exit) {
> +                next_cpu = CPU_NEXT(cpu);
> +            }
>              break;
>          }
>      }
> diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h
> index 174ea36..88e2403 100644
> --- a/include/sysemu/kvm.h
> +++ b/include/sysemu/kvm.h
> @@ -178,6 +178,7 @@ int kvm_has_intx_set_mask(void);
>  
>  int kvm_init_vcpu(CPUState *cpu);
>  int kvm_cpu_exec(CPUState *cpu);
> +int kvm_destroy_vcpu(CPUState *cpu);
>  
>  #ifdef NEED_CPU_H
>  
> diff --git a/kvm-all.c b/kvm-all.c
> index 1402f4f..d0caeff 100644
> --- a/kvm-all.c
> +++ b/kvm-all.c
> @@ -74,6 +74,12 @@ typedef struct KVMSlot
>  
>  typedef struct kvm_dirty_log KVMDirtyLog;
>  
> +struct KVMParkedVcpu {
> +    unsigned long vcpu_id;
> +    int kvm_fd;
> +    QLIST_ENTRY(KVMParkedVcpu) node;
> +};
> +
>  struct KVMState
>  {
>      KVMSlot *slots;
> @@ -108,6 +114,7 @@ struct KVMState
>      QTAILQ_HEAD(msi_hashtab, KVMMSIRoute) msi_hashtab[KVM_MSI_HASHTAB_SIZE];
>      bool direct_msi;
>  #endif
> +    QLIST_HEAD(, KVMParkedVcpu) kvm_parked_vcpus;
>  };
>  
>  KVMState *kvm_state;
> @@ -226,6 +233,53 @@ static int kvm_set_user_memory_region(KVMState *s, KVMSlot *slot)
>      return kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, &mem);
>  }
>  
> +int kvm_destroy_vcpu(CPUState *cpu)
> +{
> +    KVMState *s = kvm_state;
> +    long mmap_size;
> +    struct KVMParkedVcpu *vcpu = NULL;
> +    int ret = 0;
> +
> +    DPRINTF("kvm_destroy_vcpu\n");
maybe it would be good to put cpu in RESET state before parking it?

Do we need a kernel patch to park VCPU so it wouldn't be possible
to wake it up by sending INIT/SIPI to it?

> +
> +    mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0);
> +    if (mmap_size < 0) {
> +        ret = mmap_size;
> +        DPRINTF("KVM_GET_VCPU_MMAP_SIZE failed\n");
> +        goto err;
> +    }
> +
> +    ret = munmap(cpu->kvm_run, mmap_size);
> +    if (ret < 0) {
> +        goto err;
> +    }
> +
> +    vcpu = g_malloc0(sizeof(*vcpu));
> +    vcpu->vcpu_id = kvm_arch_vcpu_id(cpu);
> +    vcpu->kvm_fd = cpu->kvm_fd;
> +    QLIST_INSERT_HEAD(&kvm_state->kvm_parked_vcpus, vcpu, node);
> +err:
> +    return ret;
> +}
> +
> +static int kvm_get_vcpu(KVMState *s, unsigned long vcpu_id)
> +{
> +    struct KVMParkedVcpu *cpu;
> +
> +    QLIST_FOREACH(cpu, &s->kvm_parked_vcpus, node) {
> +        if (cpu->vcpu_id == vcpu_id) {
> +            int kvm_fd;
> +
> +            QLIST_REMOVE(cpu, node);
> +            kvm_fd = cpu->kvm_fd;
> +            g_free(cpu);
> +            return kvm_fd;
> +        }
> +    }
> +
> +    return kvm_vm_ioctl(s, KVM_CREATE_VCPU, (void *)vcpu_id);
> +}
> +
>  int kvm_init_vcpu(CPUState *cpu)
>  {
>      KVMState *s = kvm_state;
> @@ -234,7 +288,7 @@ int kvm_init_vcpu(CPUState *cpu)
>  
>      DPRINTF("kvm_init_vcpu\n");
>  
> -    ret = kvm_vm_ioctl(s, KVM_CREATE_VCPU, (void *)kvm_arch_vcpu_id(cpu));
> +    ret = kvm_get_vcpu(s, kvm_arch_vcpu_id(cpu));
>      if (ret < 0) {
>          DPRINTF("kvm_create_vcpu failed\n");
>          goto err;
> @@ -1404,6 +1458,7 @@ int kvm_init(MachineClass *mc)
>  #ifdef KVM_CAP_SET_GUEST_DEBUG
>      QTAILQ_INIT(&s->kvm_sw_breakpoints);
>  #endif
> +    QLIST_INIT(&s->kvm_parked_vcpus);
>      s->vmfd = -1;
>      s->fd = qemu_open("/dev/kvm", O_RDWR);
>      if (s->fd == -1) {
Gu Zheng Sept. 10, 2014, 3:54 a.m. UTC | #2
Hi Igor,

On 09/09/2014 10:40 PM, Igor Mammedov wrote:

> On Thu, 28 Aug 2014 11:36:42 +0800
> Gu Zheng <guz.fnst@cn.fujitsu.com> wrote:
> 
>> After ACPI get a signal to eject a vCPU, the vCPU must be
>> removed from CPU list,before the vCPU really removed,  then
>> release the all related vCPU objects.
>> But we do not close KVM vcpu fd, just record it into a list, in
>> order to reuse it.
>>
>> Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
>> Signed-off-by: Gu Zheng <guz.fnst@cn.fujitsu.com>
>> ---
>>  cpus.c               |   37 ++++++++++++++++++++++++++++++++
>>  include/sysemu/kvm.h |    1 +
>>  kvm-all.c            |   57 +++++++++++++++++++++++++++++++++++++++++++++++++-
>>  3 files changed, 94 insertions(+), 1 deletions(-)
>>
>> diff --git a/cpus.c b/cpus.c
>> index eee693b..0608b41 100644
>> --- a/cpus.c
>> +++ b/cpus.c
>> @@ -851,6 +851,24 @@ void async_run_on_cpu(CPUState *cpu, void (*func)(void *data), void *data)
>>      qemu_cpu_kick(cpu);
>>  }
>>  
>> +static void qemu_kvm_destroy_vcpu(CPUState *cpu)
>> +{
>> +    CPU_REMOVE(cpu);
>> +
>> +    if (kvm_destroy_vcpu(cpu) < 0) {
>> +        fprintf(stderr, "kvm_destroy_vcpu failed.\n");
>> +        exit(1);
>> +    }
>> +
>> +    object_unparent(OBJECT(cpu));
>> +}
>> +
>> +static void qemu_tcg_destroy_vcpu(CPUState *cpu)
>> +{
>> +    CPU_REMOVE(cpu);
>> +    object_unparent(OBJECT(cpu));
>> +}
>> +
>>  static void flush_queued_work(CPUState *cpu)
>>  {
>>      struct qemu_work_item *wi;
>> @@ -942,6 +960,11 @@ static void *qemu_kvm_cpu_thread_fn(void *arg)
>>              }
>>          }
>>          qemu_kvm_wait_io_event(cpu);
>> +        if (cpu->exit && !cpu_can_run(cpu)) {
>> +            qemu_kvm_destroy_vcpu(cpu);
>> +            qemu_mutex_unlock(&qemu_global_mutex);
>> +            return NULL;
>> +        }
>>      }
>>  
>>      return NULL;
>> @@ -994,6 +1017,7 @@ static void tcg_exec_all(void);
>>  static void *qemu_tcg_cpu_thread_fn(void *arg)
>>  {
>>      CPUState *cpu = arg;
>> +    CPUState *remove_cpu = NULL;
>>  
>>      qemu_tcg_init_cpu_signals();
>>      qemu_thread_get_self(cpu->thread);
>> @@ -1026,6 +1050,16 @@ static void *qemu_tcg_cpu_thread_fn(void *arg)
>>              }
>>          }
>>          qemu_tcg_wait_io_event();
>> +        CPU_FOREACH(cpu) {
>> +            if (cpu->exit && !cpu_can_run(cpu)) {
>> +                remove_cpu = cpu;
>> +                break;
>> +            }
>> +        }
>> +        if (remove_cpu) {
>> +            qemu_tcg_destroy_vcpu(remove_cpu);
>> +            remove_cpu = NULL;
>> +        }
>>      }
>>  
>>      return NULL;
>> @@ -1383,6 +1417,9 @@ static void tcg_exec_all(void)
>>                  break;
>>              }
>>          } else if (cpu->stop || cpu->stopped) {
>> +            if (cpu->exit) {
>> +                next_cpu = CPU_NEXT(cpu);
>> +            }
>>              break;
>>          }
>>      }
>> diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h
>> index 174ea36..88e2403 100644
>> --- a/include/sysemu/kvm.h
>> +++ b/include/sysemu/kvm.h
>> @@ -178,6 +178,7 @@ int kvm_has_intx_set_mask(void);
>>  
>>  int kvm_init_vcpu(CPUState *cpu);
>>  int kvm_cpu_exec(CPUState *cpu);
>> +int kvm_destroy_vcpu(CPUState *cpu);
>>  
>>  #ifdef NEED_CPU_H
>>  
>> diff --git a/kvm-all.c b/kvm-all.c
>> index 1402f4f..d0caeff 100644
>> --- a/kvm-all.c
>> +++ b/kvm-all.c
>> @@ -74,6 +74,12 @@ typedef struct KVMSlot
>>  
>>  typedef struct kvm_dirty_log KVMDirtyLog;
>>  
>> +struct KVMParkedVcpu {
>> +    unsigned long vcpu_id;
>> +    int kvm_fd;
>> +    QLIST_ENTRY(KVMParkedVcpu) node;
>> +};
>> +
>>  struct KVMState
>>  {
>>      KVMSlot *slots;
>> @@ -108,6 +114,7 @@ struct KVMState
>>      QTAILQ_HEAD(msi_hashtab, KVMMSIRoute) msi_hashtab[KVM_MSI_HASHTAB_SIZE];
>>      bool direct_msi;
>>  #endif
>> +    QLIST_HEAD(, KVMParkedVcpu) kvm_parked_vcpus;
>>  };
>>  
>>  KVMState *kvm_state;
>> @@ -226,6 +233,53 @@ static int kvm_set_user_memory_region(KVMState *s, KVMSlot *slot)
>>      return kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, &mem);
>>  }
>>  
>> +int kvm_destroy_vcpu(CPUState *cpu)
>> +{
>> +    KVMState *s = kvm_state;
>> +    long mmap_size;
>> +    struct KVMParkedVcpu *vcpu = NULL;
>> +    int ret = 0;
>> +
>> +    DPRINTF("kvm_destroy_vcpu\n");
> maybe it would be good to put cpu in RESET state before parking it?

It would be better.

> 
> Do we need a kernel patch to park VCPU so it wouldn't be possible
> to wake it up by sending INIT/SIPI to it?

We sent a kernel patch before, but things are harder than we thought, and
it would lead to a performance regression. So the suggestion is to just park
the vcpu on the QEMU side.

Thanks,
Gu

> 
>> +
>> +    mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0);
>> +    if (mmap_size < 0) {
>> +        ret = mmap_size;
>> +        DPRINTF("KVM_GET_VCPU_MMAP_SIZE failed\n");
>> +        goto err;
>> +    }
>> +
>> +    ret = munmap(cpu->kvm_run, mmap_size);
>> +    if (ret < 0) {
>> +        goto err;
>> +    }
>> +
>> +    vcpu = g_malloc0(sizeof(*vcpu));
>> +    vcpu->vcpu_id = kvm_arch_vcpu_id(cpu);
>> +    vcpu->kvm_fd = cpu->kvm_fd;
>> +    QLIST_INSERT_HEAD(&kvm_state->kvm_parked_vcpus, vcpu, node);
>> +err:
>> +    return ret;
>> +}
>> +
>> +static int kvm_get_vcpu(KVMState *s, unsigned long vcpu_id)
>> +{
>> +    struct KVMParkedVcpu *cpu;
>> +
>> +    QLIST_FOREACH(cpu, &s->kvm_parked_vcpus, node) {
>> +        if (cpu->vcpu_id == vcpu_id) {
>> +            int kvm_fd;
>> +
>> +            QLIST_REMOVE(cpu, node);
>> +            kvm_fd = cpu->kvm_fd;
>> +            g_free(cpu);
>> +            return kvm_fd;
>> +        }
>> +    }
>> +
>> +    return kvm_vm_ioctl(s, KVM_CREATE_VCPU, (void *)vcpu_id);
>> +}
>> +
>>  int kvm_init_vcpu(CPUState *cpu)
>>  {
>>      KVMState *s = kvm_state;
>> @@ -234,7 +288,7 @@ int kvm_init_vcpu(CPUState *cpu)
>>  
>>      DPRINTF("kvm_init_vcpu\n");
>>  
>> -    ret = kvm_vm_ioctl(s, KVM_CREATE_VCPU, (void *)kvm_arch_vcpu_id(cpu));
>> +    ret = kvm_get_vcpu(s, kvm_arch_vcpu_id(cpu));
>>      if (ret < 0) {
>>          DPRINTF("kvm_create_vcpu failed\n");
>>          goto err;
>> @@ -1404,6 +1458,7 @@ int kvm_init(MachineClass *mc)
>>  #ifdef KVM_CAP_SET_GUEST_DEBUG
>>      QTAILQ_INIT(&s->kvm_sw_breakpoints);
>>  #endif
>> +    QLIST_INIT(&s->kvm_parked_vcpus);
>>      s->vmfd = -1;
>>      s->fd = qemu_open("/dev/kvm", O_RDWR);
>>      if (s->fd == -1) {
> 
> .
>
Bharata B Rao Sept. 11, 2014, 9:35 a.m. UTC | #3
On Thu, Aug 28, 2014 at 9:06 AM, Gu Zheng <guz.fnst@cn.fujitsu.com> wrote:
> After ACPI get a signal to eject a vCPU, the vCPU must be
> removed from CPU list,before the vCPU really removed,  then
> release the all related vCPU objects.
> But we do not close KVM vcpu fd, just record it into a list, in
> order to reuse it.

After I add and delete a CPU, "info cpus" from monitor still lists the
removed CPU. Is this expected ?

Regards,
Bharata.
Gu Zheng Sept. 11, 2014, 9:49 a.m. UTC | #4
On 09/11/2014 05:35 PM, Bharata B Rao wrote:

> On Thu, Aug 28, 2014 at 9:06 AM, Gu Zheng <guz.fnst@cn.fujitsu.com> wrote:
>> After ACPI get a signal to eject a vCPU, the vCPU must be
>> removed from CPU list,before the vCPU really removed,  then
>> release the all related vCPU objects.
>> But we do not close KVM vcpu fd, just record it into a list, in
>> order to reuse it.
> 
> After I add and delete a CPU, "info cpus" from monitor still lists the
> removed CPU. Is this expected ?

No, you can not see the removed cpu from either QEMU (e.g. the monitor) or
the guest side, but from the kernel side it is still there, just no one uses
it any more.

Thanks,
Gu

> 
> Regards,
> Bharata.
> .
>
Gu Zheng Sept. 11, 2014, 9:53 a.m. UTC | #5
On 09/11/2014 05:35 PM, Bharata B Rao wrote:

> On Thu, Aug 28, 2014 at 9:06 AM, Gu Zheng <guz.fnst@cn.fujitsu.com> wrote:
>> After ACPI get a signal to eject a vCPU, the vCPU must be
>> removed from CPU list,before the vCPU really removed,  then
>> release the all related vCPU objects.
>> But we do not close KVM vcpu fd, just record it into a list, in
>> order to reuse it.
> 
> After I add and delete a CPU, "info cpus" from monitor still lists the
> removed CPU. Is this expected ?

No, you can not see the removed cpu from QEMU (e.g. the monitor) or the guest
side, but from the kernel side it is still there, just no one uses it any more.

Thanks,
Gu

> 
> Regards,
> Bharata.
> .
>
Anshul Makkar Sept. 11, 2014, 10:03 a.m. UTC | #6
Bharata, this is not expected. "info cpus" should report the proper number
of cpus after deletion.

Anshul Makkar

On Thu, Sep 11, 2014 at 11:35 AM, Bharata B Rao <bharata.rao@gmail.com>
wrote:

> from
Bharata B Rao Sept. 11, 2014, 12:37 p.m. UTC | #7
On Thu, Sep 11, 2014 at 3:23 PM, Gu Zheng <guz.fnst@cn.fujitsu.com> wrote:
> On 09/11/2014 05:35 PM, Bharata B Rao wrote:
>
>> On Thu, Aug 28, 2014 at 9:06 AM, Gu Zheng <guz.fnst@cn.fujitsu.com> wrote:
>>> After ACPI get a signal to eject a vCPU, the vCPU must be
>>> removed from CPU list,before the vCPU really removed,  then
>>> release the all related vCPU objects.
>>> But we do not close KVM vcpu fd, just record it into a list, in
>>> order to reuse it.
>>
>> After I add and delete a CPU, "info cpus" from monitor still lists the
>> removed CPU. Is this expected ?
>
> No, you can not see the removed cpu from QEMU (e.g. monitor) or guest side,
> but from the kernel side it is still there, just no one uses it any more.

I am trying your patches, but still see the removed CPU after deletion.

(qemu) info cpus
* CPU #0: pc=0xffffffff8100b82c (halted) thread_id=7812
  CPU #1: pc=0xffffffff8100b82c (halted) thread_id=7813
  CPU #2: pc=0xffffffff8100b82c (halted) thread_id=7814
  CPU #3: pc=0xffffffff8100b82c (halted) thread_id=7815
  CPU #4: pc=0xffffffff8100b82c (halted) thread_id=7816
  CPU #5: pc=0xffffffff8100b82c (halted) thread_id=7817
  CPU #6: pc=0xffffffff8100b82c (halted) thread_id=7818
  CPU #7: pc=0xffffffff8100b82c (halted) thread_id=7819
(qemu) device_add qemu64-x86_64-cpu,id=cpu8
(qemu) info cpus
* CPU #0: pc=0xffffffff8100b82c (halted) thread_id=7812
  CPU #1: pc=0xffffffff8100b82c (halted) thread_id=7813
  CPU #2: pc=0xffffffff8100b82c (halted) thread_id=7814
  CPU #3: pc=0xffffffff8100b82c (halted) thread_id=7815
  CPU #4: pc=0xffffffff8100b82c (halted) thread_id=7816
  CPU #5: pc=0xffffffff8100b82c (halted) thread_id=7817
  CPU #6: pc=0xffffffff8100b82c (halted) thread_id=7818
  CPU #7: pc=0xffffffff8100b82c (halted) thread_id=7819
  CPU #8: pc=0xffffffff8100b82c (halted) thread_id=8041
(qemu) device_del cpu8
(qemu) info cpus
* CPU #0: pc=0xffffffff8100b82c (halted) thread_id=7812
  CPU #1: pc=0xffffffff8100b82c (halted) thread_id=7813
  CPU #2: pc=0xffffffff8100b82c (halted) thread_id=7814
  CPU #3: pc=0xffffffff8100b82c (halted) thread_id=7815
  CPU #4: pc=0xffffffff8100b82c (halted) thread_id=7816
  CPU #5: pc=0xffffffff8100b82c (halted) thread_id=7817
  CPU #6: pc=0xffffffff8100b82c (halted) thread_id=7818
  CPU #7: pc=0xffffffff8100b82c (halted) thread_id=7819
  CPU #8: pc=0xffffffff81031722 (halted) thread_id=8041

I applied your patchset on commit 69f87f713069f1f70, may be I should
try with latest QEMU ?

Regards,
Bharata.
Gu Zheng Sept. 12, 2014, 1:24 a.m. UTC | #8
Hi Bharata,

On 09/11/2014 08:37 PM, Bharata B Rao wrote:

> On Thu, Sep 11, 2014 at 3:23 PM, Gu Zheng <guz.fnst@cn.fujitsu.com> wrote:
>> On 09/11/2014 05:35 PM, Bharata B Rao wrote:
>>
>>> On Thu, Aug 28, 2014 at 9:06 AM, Gu Zheng <guz.fnst@cn.fujitsu.com> wrote:
>>>> After ACPI get a signal to eject a vCPU, the vCPU must be
>>>> removed from CPU list,before the vCPU really removed,  then
>>>> release the all related vCPU objects.
>>>> But we do not close KVM vcpu fd, just record it into a list, in
>>>> order to reuse it.
>>>
>>> After I add and delete a CPU, "info cpus" from monitor still lists the
>>> removed CPU. Is this expected ?
>>
>> No, you can not see the removed cpu from QEMU (e.g. monitor) or guest side,
>> but from the kernel side it is still there, just no one uses it any more.
> 
> I am trying your patches, but still see the removed CPU after deletion.
> 
> (qemu) info cpus
> * CPU #0: pc=0xffffffff8100b82c (halted) thread_id=7812
>   CPU #1: pc=0xffffffff8100b82c (halted) thread_id=7813
>   CPU #2: pc=0xffffffff8100b82c (halted) thread_id=7814
>   CPU #3: pc=0xffffffff8100b82c (halted) thread_id=7815
>   CPU #4: pc=0xffffffff8100b82c (halted) thread_id=7816
>   CPU #5: pc=0xffffffff8100b82c (halted) thread_id=7817
>   CPU #6: pc=0xffffffff8100b82c (halted) thread_id=7818
>   CPU #7: pc=0xffffffff8100b82c (halted) thread_id=7819
> (qemu) device_add qemu64-x86_64-cpu,id=cpu8
> (qemu) info cpus
> * CPU #0: pc=0xffffffff8100b82c (halted) thread_id=7812
>   CPU #1: pc=0xffffffff8100b82c (halted) thread_id=7813
>   CPU #2: pc=0xffffffff8100b82c (halted) thread_id=7814
>   CPU #3: pc=0xffffffff8100b82c (halted) thread_id=7815
>   CPU #4: pc=0xffffffff8100b82c (halted) thread_id=7816
>   CPU #5: pc=0xffffffff8100b82c (halted) thread_id=7817
>   CPU #6: pc=0xffffffff8100b82c (halted) thread_id=7818
>   CPU #7: pc=0xffffffff8100b82c (halted) thread_id=7819
>   CPU #8: pc=0xffffffff8100b82c (halted) thread_id=8041
> (qemu) device_del cpu8
> (qemu) info cpus
> * CPU #0: pc=0xffffffff8100b82c (halted) thread_id=7812
>   CPU #1: pc=0xffffffff8100b82c (halted) thread_id=7813
>   CPU #2: pc=0xffffffff8100b82c (halted) thread_id=7814
>   CPU #3: pc=0xffffffff8100b82c (halted) thread_id=7815
>   CPU #4: pc=0xffffffff8100b82c (halted) thread_id=7816
>   CPU #5: pc=0xffffffff8100b82c (halted) thread_id=7817
>   CPU #6: pc=0xffffffff8100b82c (halted) thread_id=7818
>   CPU #7: pc=0xffffffff8100b82c (halted) thread_id=7819
>   CPU #8: pc=0xffffffff81031722 (halted) thread_id=8041
> 
> I applied your patchset on commit 69f87f713069f1f70, may be I should
> try with latest QEMU ?

Does the guest OS have ACPI CPU hotplug enabled? What's the guest's cpu info?
Please try the latest QEMU; any feedback is welcome.

Thanks,
Gu

> 
> Regards,
> Bharata.
> .
>
Bharata B Rao Sept. 12, 2014, 8:09 a.m. UTC | #9
On Fri, Sep 12, 2014 at 6:54 AM, Gu Zheng <guz.fnst@cn.fujitsu.com> wrote:
> Is guest os enabled acpi cpu hotplug? What's the guest's cpu info?
> Please try latest QEMU, and any feedback is welcome.
>

Tried with latest QEMU git + your patchset and Fedora 20 guest, but
QEMU monitor still shows the removed CPU.

Guest kernel messages during hotplug:

[root@localhost cpu]# echo 1 > cpu8/online
[   72.936069] smpboot: Booting Node 0 Processor 8 APIC 0x8
[    0.003000] kvm-clock: cpu 8, msr 0:7ffc9201, secondary cpu clock
[   72.950003] TSC synchronization [CPU#0 -> CPU#8]:
[   72.950003] Measured 199886723309 cycles TSC warp between CPUs,
turning off TSC clock.
[   72.950003] tsc: Marking TSC unstable due to check_tsc_sync_source failed
[   72.972976] KVM setup async PF for cpu 8
[   72.973648] kvm-stealtime: cpu 8, msr 7d30df00
[   72.974415] Will online and init hotplugged CPU: 8
[   72.975307] microcode: CPU8 sig=0x663, pf=0x1, revision=0x1

Guest kernel messages during hotunplug:

[root@localhost cpu]# [   95.482172] Unregister pv shared memory for cpu 8
[   95.487169] smpboot: CPU 8 is now offline
[   95.488667] ACPI: Device does not support D3cold


Guest cpuinfo (showing for the last CPU only after adding and removing CPU 8)

processor    : 7
vendor_id    : GenuineIntel
cpu family    : 6
model        : 6
model name    : QEMU Virtual CPU version 2.1.50
stepping    : 3
microcode    : 0x1
cpu MHz        : 2899.998
cache size    : 4096 KB
fpu        : yes
fpu_exception    : yes
cpuid level    : 4
wp        : yes
flags        : fpu de pse tsc msr pae mce cx8 apic sep mtrr pge mca
cmov pse36 clflush mmx fxsr sse sse2 syscall nx lm rep_good nopl pni
cx16 x2apic popcnt hypervisor lahf_lm
bogomips    : 5799.99
clflush size    : 64
cache_alignment    : 64
address sizes    : 40 bits physical, 48 bits virtual
power management:

[root@localhost boot]# grep -ir hot config-3.11.10-301.fc20.x86_64
CONFIG_TICK_ONESHOT=y
# CONFIG_MEMORY_HOTPLUG is not set
CONFIG_HOTPLUG_CPU=y
# CONFIG_BOOTPARAM_HOTPLUG_CPU0 is not set
# CONFIG_DEBUG_HOTPLUG_CPU0 is not set
CONFIG_ARCH_ENABLE_MEMORY_HOTPLUG=y
CONFIG_ACPI_HOTPLUG_CPU=y
CONFIG_HOTPLUG_PCI_PCIE=y
CONFIG_HOTPLUG_PCI=y
CONFIG_HOTPLUG_PCI_ACPI=y
CONFIG_HOTPLUG_PCI_ACPI_IBM=m
# CONFIG_HOTPLUG_PCI_CPCI is not set
CONFIG_HOTPLUG_PCI_SHPC=m
Gu Zheng Sept. 12, 2014, 9:53 a.m. UTC | #10
Hi Bharata,
On 09/12/2014 04:09 PM, Bharata B Rao wrote:

> On Fri, Sep 12, 2014 at 6:54 AM, Gu Zheng <guz.fnst@cn.fujitsu.com> wrote:
>> Is guest os enabled acpi cpu hotplug? What's the guest's cpu info?
>> Please try latest QEMU, and any feedback is welcome.
>>
> 
> Tried with latest QEMU git + your patchset and Fedora 20 guest, but
> QEMU monitor still shows the removed CPU.
> 
> Guest kernel messages during hotplug:
> 
> [root@localhost cpu]# echo 1 > cpu8/online
> [   72.936069] smpboot: Booting Node 0 Processor 8 APIC 0x8
> [    0.003000] kvm-clock: cpu 8, msr 0:7ffc9201, secondary cpu clock
> [   72.950003] TSC synchronization [CPU#0 -> CPU#8]:
> [   72.950003] Measured 199886723309 cycles TSC warp between CPUs,
> turning off TSC clock.
> [   72.950003] tsc: Marking TSC unstable due to check_tsc_sync_source failed
> [   72.972976] KVM setup async PF for cpu 8
> [   72.973648] kvm-stealtime: cpu 8, msr 7d30df00
> [   72.974415] Will online and init hotplugged CPU: 8
> [   72.975307] microcode: CPU8 sig=0x663, pf=0x1, revision=0x1
> 
> Guest kernel messages during hotunplug:
> 
> [root@localhost cpu]# [   95.482172] Unregister pv shared memory for cpu 8
> [   95.487169] smpboot: CPU 8 is now offline
> [   95.488667] ACPI: Device does not support D3cold
> 
> 
> Guest cpuinfo (showing for the last CPU only after adding and removing CPU 8)
> 
> processor    : 7
> vendor_id    : GenuineIntel
> cpu family    : 6
> model        : 6
> model name    : QEMU Virtual CPU version 2.1.50
> stepping    : 3
> microcode    : 0x1
> cpu MHz        : 2899.998
> cache size    : 4096 KB
> fpu        : yes
> fpu_exception    : yes
> cpuid level    : 4
> wp        : yes
> flags        : fpu de pse tsc msr pae mce cx8 apic sep mtrr pge mca
> cmov pse36 clflush mmx fxsr sse sse2 syscall nx lm rep_good nopl pni
> cx16 x2apic popcnt hypervisor lahf_lm
> bogomips    : 5799.99
> clflush size    : 64
> cache_alignment    : 64
> address sizes    : 40 bits physical, 48 bits virtual
> power management:

Guest ejected CPU 8 successfully.
I confirmed it with the same environment as yours, it works well.
Could you please offer your QEMU config and the guest start cmd?
It may help me to investigate the issue.

Thanks,
Gu

> 
> [root@localhost boot]# grep -ir hot config-3.11.10-301.fc20.x86_64
> CONFIG_TICK_ONESHOT=y
> # CONFIG_MEMORY_HOTPLUG is not set
> CONFIG_HOTPLUG_CPU=y
> # CONFIG_BOOTPARAM_HOTPLUG_CPU0 is not set
> # CONFIG_DEBUG_HOTPLUG_CPU0 is not set
> CONFIG_ARCH_ENABLE_MEMORY_HOTPLUG=y
> CONFIG_ACPI_HOTPLUG_CPU=y
> CONFIG_HOTPLUG_PCI_PCIE=y
> CONFIG_HOTPLUG_PCI=y
> CONFIG_HOTPLUG_PCI_ACPI=y
> CONFIG_HOTPLUG_PCI_ACPI_IBM=m
> # CONFIG_HOTPLUG_PCI_CPCI is not set
> CONFIG_HOTPLUG_PCI_SHPC=m
> .
>
Bharata B Rao Sept. 12, 2014, 10:30 a.m. UTC | #11
On Fri, Sep 12, 2014 at 3:23 PM, Gu Zheng <guz.fnst@cn.fujitsu.com> wrote:
> Hi Bharata,
> On 09/12/2014 04:09 PM, Bharata B Rao wrote:
>
>> On Fri, Sep 12, 2014 at 6:54 AM, Gu Zheng <guz.fnst@cn.fujitsu.com> wrote:
>>> Is guest os enabled acpi cpu hotplug? What's the guest's cpu info?
>>> Please try latest QEMU, and any feedback is welcome.
>>>
>>
>> Tried with latest QEMU git + your patchset and Fedora 20 guest, but
>> QEMU monitor still shows the removed CPU.
>>
>> Guest kernel messages during hotplug:
>>
>> [root@localhost cpu]# echo 1 > cpu8/online
>> [   72.936069] smpboot: Booting Node 0 Processor 8 APIC 0x8
>> [    0.003000] kvm-clock: cpu 8, msr 0:7ffc9201, secondary cpu clock
>> [   72.950003] TSC synchronization [CPU#0 -> CPU#8]:
>> [   72.950003] Measured 199886723309 cycles TSC warp between CPUs,
>> turning off TSC clock.
>> [   72.950003] tsc: Marking TSC unstable due to check_tsc_sync_source failed
>> [   72.972976] KVM setup async PF for cpu 8
>> [   72.973648] kvm-stealtime: cpu 8, msr 7d30df00
>> [   72.974415] Will online and init hotplugged CPU: 8
>> [   72.975307] microcode: CPU8 sig=0x663, pf=0x1, revision=0x1
>>
>> Guest kernel messages during hotunplug:
>>
>> [root@localhost cpu]# [   95.482172] Unregister pv shared memory for cpu 8
>> [   95.487169] smpboot: CPU 8 is now offline
>> [   95.488667] ACPI: Device does not support D3cold
>>
>>
>> Guest cpuinfo (showing for the last CPU only after adding and removing CPU 8)
>>
>> processor    : 7
>> vendor_id    : GenuineIntel
>> cpu family    : 6
>> model        : 6
>> model name    : QEMU Virtual CPU version 2.1.50
>> stepping    : 3
>> microcode    : 0x1
>> cpu MHz        : 2899.998
>> cache size    : 4096 KB
>> fpu        : yes
>> fpu_exception    : yes
>> cpuid level    : 4
>> wp        : yes
>> flags        : fpu de pse tsc msr pae mce cx8 apic sep mtrr pge mca
>> cmov pse36 clflush mmx fxsr sse sse2 syscall nx lm rep_good nopl pni
>> cx16 x2apic popcnt hypervisor lahf_lm
>> bogomips    : 5799.99
>> clflush size    : 64
>> cache_alignment    : 64
>> address sizes    : 40 bits physical, 48 bits virtual
>> power management:
>
> Guest ejected CPU 8 successfully.
> I confirmed it with the same environment as yours, it works well.
> Could you please offer your QEMU config and the guest start cmd?
> It may help me to investigate the issue.

Let me also debug this a bit in my environment, but here are the
details you asked for.

# ./x86_64-softmmu/qemu-system-x86_64 --nographic --enable-kvm -m 2G
-smp 8,maxcpus=32 -device virtio-blk-pci,drive=rootdisk -drive
file=/home/bharata/F20.img,if=none,cache=none,format=qcow2,id=rootdisk
-monitor telnet:localhost:1234,server,nowait

# cat config-host.h
/* Automatically generated by create_config - do not modify */
#define CONFIG_QEMU_CONFDIR "/usr/local/etc/qemu"
#define CONFIG_QEMU_DATADIR "/usr/local/share/qemu"
#define CONFIG_QEMU_DOCDIR "/usr/local/share/doc/qemu"
#define CONFIG_QEMU_MODDIR "/usr/local/lib/qemu"
#define CONFIG_QEMU_LOCALSTATEDIR "/usr/local/var"
#define CONFIG_QEMU_HELPERDIR "/usr/local/libexec"
#define CONFIG_QEMU_LOCALEDIR "/usr/local/share/locale"
#define HOST_X86_64 1
#define CONFIG_DEBUG_TCG 1
#define CONFIG_POSIX 1
#define CONFIG_LINUX 1
#define CONFIG_SLIRP 1
#define CONFIG_SMBD_COMMAND "/usr/sbin/smbd"
#define CONFIG_L2TPV3 1
#define CONFIG_AUDIO_DRIVERS \
    &oss_audio_driver,\

#define CONFIG_OSS 1
#define CONFIG_BDRV_RW_WHITELIST\
    NULL
#define CONFIG_BDRV_RO_WHITELIST\
    NULL
#define CONFIG_VNC 1
#define CONFIG_FNMATCH 1
#define QEMU_VERSION "2.1.50"
#define QEMU_PKGVERSION ""
#define CONFIG_UTIMENSAT 1
#define CONFIG_PIPE2 1
#define CONFIG_ACCEPT4 1
#define CONFIG_SPLICE 1
#define CONFIG_EVENTFD 1
#define CONFIG_FALLOCATE 1
#define CONFIG_FALLOCATE_PUNCH_HOLE 1
#define CONFIG_SYNC_FILE_RANGE 1
#define CONFIG_FIEMAP 1
#define CONFIG_DUP3 1
#define CONFIG_PPOLL 1
#define CONFIG_PRCTL_PR_SET_TIMERSLACK 1
#define CONFIG_EPOLL 1
#define CONFIG_EPOLL_CREATE1 1
#define CONFIG_EPOLL_PWAIT 1
#define CONFIG_SENDFILE 1
#define CONFIG_TIMERFD 1
#define CONFIG_INOTIFY 1
#define CONFIG_INOTIFY1 1
#define CONFIG_BYTESWAP_H 1
#define CONFIG_ATTR 1
#define CONFIG_VHOST_SCSI 1
#define CONFIG_VHOST_NET_USED 1
#define CONFIG_IOVEC 1
#define CONFIG_PREADV 1
#define CONFIG_SIGNALFD 1
#define CONFIG_FDATASYNC 1
#define CONFIG_MADVISE 1
#define CONFIG_POSIX_MADVISE 1
#define CONFIG_SIGEV_THREAD_ID 1
#define CONFIG_QOM_CAST_DEBUG 1
#define CONFIG_COROUTINE_BACKEND ucontext
#define CONFIG_COROUTINE_POOL 1
#define CONFIG_LINUX_MAGIC_H 1
#define CONFIG_HAS_ENVIRON 1
#define CONFIG_CPUID_H 1
#define CONFIG_INT128 1
#define CONFIG_TPM $(CONFIG_SOFTMMU)
#define CONFIG_TPM_PASSTHROUGH 1
#define CONFIG_TRACE_NOP 1
#define CONFIG_TRACE_FILE trace
#define CONFIG_THREAD_SETNAME_BYTHREAD 1
#define CONFIG_PTHREAD_SETNAME_NP 1
#define HOST_DSOSUF ".so"

And I am on commit ID 0dfa7e30126364c434a4

Regards,
Bharata.
Anshul Makkar Sept. 12, 2014, 10:53 a.m. UTC | #12
During plugging we can see this event: echo 1 > cpu8/online.

But during unplugging , we can't see the event echo 0 > cpu8/online.

Just as an additional check, in my code I have added the following udev rule:
echo 0 > cpu[0-9]*/online. Maybe this is of some help.

Thanks
Anshul Makkar

On Fri, Sep 12, 2014 at 11:53 AM, Gu Zheng <guz.fnst@cn.fujitsu.com> wrote:

> Hi Bharata,
> On 09/12/2014 04:09 PM, Bharata B Rao wrote:
>
> > On Fri, Sep 12, 2014 at 6:54 AM, Gu Zheng <guz.fnst@cn.fujitsu.com>
> wrote:
> >> Is guest os enabled acpi cpu hotplug? What's the guest's cpu info?
> >> Please try latest QEMU, and any feedback is welcome.
> >>
> >
> > Tried with latest QEMU git + your patchset and Fedora 20 guest, but
> > QEMU monitor still shows the removed CPU.
> >
> > Guest kernel messages during hotplug:
> >
> > [root@localhost cpu]# echo 1 > cpu8/online
> > [   72.936069] smpboot: Booting Node 0 Processor 8 APIC 0x8
> > [    0.003000] kvm-clock: cpu 8, msr 0:7ffc9201, secondary cpu clock
> > [   72.950003] TSC synchronization [CPU#0 -> CPU#8]:
> > [   72.950003] Measured 199886723309 cycles TSC warp between CPUs,
> > turning off TSC clock.
> > [   72.950003] tsc: Marking TSC unstable due to check_tsc_sync_source
> failed
> > [   72.972976] KVM setup async PF for cpu 8
> > [   72.973648] kvm-stealtime: cpu 8, msr 7d30df00
> > [   72.974415] Will online and init hotplugged CPU: 8
> > [   72.975307] microcode: CPU8 sig=0x663, pf=0x1, revision=0x1
> >
> > Guest kernel messages during hotunplug:
> >
> > [root@localhost cpu]# [   95.482172] Unregister pv shared memory for
> cpu 8
> > [   95.487169] smpboot: CPU 8 is now offline
> > [   95.488667] ACPI: Device does not support D3cold
> >
> >
> > Guest cpuinfo (showing for the last CPU only after adding and removing
> CPU 8)
> >
> > processor    : 7
> > vendor_id    : GenuineIntel
> > cpu family    : 6
> > model        : 6
> > model name    : QEMU Virtual CPU version 2.1.50
> > stepping    : 3
> > microcode    : 0x1
> > cpu MHz        : 2899.998
> > cache size    : 4096 KB
> > fpu        : yes
> > fpu_exception    : yes
> > cpuid level    : 4
> > wp        : yes
> > flags        : fpu de pse tsc msr pae mce cx8 apic sep mtrr pge mca
> > cmov pse36 clflush mmx fxsr sse sse2 syscall nx lm rep_good nopl pni
> > cx16 x2apic popcnt hypervisor lahf_lm
> > bogomips    : 5799.99
> > clflush size    : 64
> > cache_alignment    : 64
> > address sizes    : 40 bits physical, 48 bits virtual
> > power management:
>
> Guest ejected CPU 8 successfully.
> I confirmed it with the same environment as yours, it works well.
> Could you please offer your QEMU config and the guest start cmd?
> It may help me to investigate the issue.
>
> Thanks,
> Gu
>
> >
> > [root@localhost boot]# grep -ir hot config-3.11.10-301.fc20.x86_64
> > CONFIG_TICK_ONESHOT=y
> > # CONFIG_MEMORY_HOTPLUG is not set
> > CONFIG_HOTPLUG_CPU=y
> > # CONFIG_BOOTPARAM_HOTPLUG_CPU0 is not set
> > # CONFIG_DEBUG_HOTPLUG_CPU0 is not set
> > CONFIG_ARCH_ENABLE_MEMORY_HOTPLUG=y
> > CONFIG_ACPI_HOTPLUG_CPU=y
> > CONFIG_HOTPLUG_PCI_PCIE=y
> > CONFIG_HOTPLUG_PCI=y
> > CONFIG_HOTPLUG_PCI_ACPI=y
> > CONFIG_HOTPLUG_PCI_ACPI_IBM=m
> > # CONFIG_HOTPLUG_PCI_CPCI is not set
> > CONFIG_HOTPLUG_PCI_SHPC=m
> > .
> >
>
>
>
Bharata B Rao Sept. 12, 2014, 1:52 p.m. UTC | #13
On Fri, Sep 12, 2014 at 4:23 PM, Anshul Makkar
<anshul.makkar@profitbricks.com> wrote:
> During plugging we can see this event: echo 1 > cpu8/online.
>
> But during unplugging , we can't see the event echo 0 > cpu8/online.

That's because I didn't do that explicitly, was always trying to
remove an online cpu from the monitor w/o explicitly offlining it from
inside the guest. Either ways I still see the removed CPU being listed
in QEMU monitor.

I don't ever hit any of the below code paths during CPU removal:

cpus.c: qemu_kvm_destroy_vcpu()
cpus.c: x86_cpu_finalizefn()

I see CPU_REMOVE() being called from above two routines.

And neither does hw/acpi/cpu_hotplug.c:cpu_status_write() gets called
here. Does the message "ACPI: Device does not support D3cold" guest
kernel throws during hot removal is causing this behaviour here ?
Guest kernel is 3.11.10, should I be on latest kernel ?

Regards,
Bharata.
Igor Mammedov Sept. 12, 2014, 2:15 p.m. UTC | #14
On Thu, 28 Aug 2014 11:36:42 +0800
Gu Zheng <guz.fnst@cn.fujitsu.com> wrote:

> After ACPI get a signal to eject a vCPU, the vCPU must be
> removed from CPU list,before the vCPU really removed,  then
> release the all related vCPU objects.
> But we do not close KVM vcpu fd, just record it into a list, in
> order to reuse it.
An additional question,
  Have you checked if migration works after CPU unplug?

> 
> Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
> Signed-off-by: Gu Zheng <guz.fnst@cn.fujitsu.com>
> ---
>  cpus.c               |   37 ++++++++++++++++++++++++++++++++
>  include/sysemu/kvm.h |    1 +
>  kvm-all.c            |   57 +++++++++++++++++++++++++++++++++++++++++++++++++-
>  3 files changed, 94 insertions(+), 1 deletions(-)
> 
> diff --git a/cpus.c b/cpus.c
> index eee693b..0608b41 100644
> --- a/cpus.c
> +++ b/cpus.c
> @@ -851,6 +851,24 @@ void async_run_on_cpu(CPUState *cpu, void (*func)(void *data), void *data)
>      qemu_cpu_kick(cpu);
>  }
>  
> +static void qemu_kvm_destroy_vcpu(CPUState *cpu)
> +{
> +    CPU_REMOVE(cpu);
> +
> +    if (kvm_destroy_vcpu(cpu) < 0) {
> +        fprintf(stderr, "kvm_destroy_vcpu failed.\n");
> +        exit(1);
> +    }
> +
> +    object_unparent(OBJECT(cpu));
> +}
> +
> +static void qemu_tcg_destroy_vcpu(CPUState *cpu)
> +{
> +    CPU_REMOVE(cpu);
> +    object_unparent(OBJECT(cpu));
> +}
> +
>  static void flush_queued_work(CPUState *cpu)
>  {
>      struct qemu_work_item *wi;
> @@ -942,6 +960,11 @@ static void *qemu_kvm_cpu_thread_fn(void *arg)
>              }
>          }
>          qemu_kvm_wait_io_event(cpu);
> +        if (cpu->exit && !cpu_can_run(cpu)) {
> +            qemu_kvm_destroy_vcpu(cpu);
> +            qemu_mutex_unlock(&qemu_global_mutex);
> +            return NULL;
> +        }
>      }
>  
>      return NULL;
> @@ -994,6 +1017,7 @@ static void tcg_exec_all(void);
>  static void *qemu_tcg_cpu_thread_fn(void *arg)
>  {
>      CPUState *cpu = arg;
> +    CPUState *remove_cpu = NULL;
>  
>      qemu_tcg_init_cpu_signals();
>      qemu_thread_get_self(cpu->thread);
> @@ -1026,6 +1050,16 @@ static void *qemu_tcg_cpu_thread_fn(void *arg)
>              }
>          }
>          qemu_tcg_wait_io_event();
> +        CPU_FOREACH(cpu) {
> +            if (cpu->exit && !cpu_can_run(cpu)) {
> +                remove_cpu = cpu;
> +                break;
> +            }
> +        }
> +        if (remove_cpu) {
> +            qemu_tcg_destroy_vcpu(remove_cpu);
> +            remove_cpu = NULL;
> +        }
>      }
>  
>      return NULL;
> @@ -1383,6 +1417,9 @@ static void tcg_exec_all(void)
>                  break;
>              }
>          } else if (cpu->stop || cpu->stopped) {
> +            if (cpu->exit) {
> +                next_cpu = CPU_NEXT(cpu);
> +            }
>              break;
>          }
>      }
> diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h
> index 174ea36..88e2403 100644
> --- a/include/sysemu/kvm.h
> +++ b/include/sysemu/kvm.h
> @@ -178,6 +178,7 @@ int kvm_has_intx_set_mask(void);
>  
>  int kvm_init_vcpu(CPUState *cpu);
>  int kvm_cpu_exec(CPUState *cpu);
> +int kvm_destroy_vcpu(CPUState *cpu);
>  
>  #ifdef NEED_CPU_H
>  
> diff --git a/kvm-all.c b/kvm-all.c
> index 1402f4f..d0caeff 100644
> --- a/kvm-all.c
> +++ b/kvm-all.c
> @@ -74,6 +74,12 @@ typedef struct KVMSlot
>  
>  typedef struct kvm_dirty_log KVMDirtyLog;
>  
> +struct KVMParkedVcpu {
> +    unsigned long vcpu_id;
> +    int kvm_fd;
> +    QLIST_ENTRY(KVMParkedVcpu) node;
> +};
> +
>  struct KVMState
>  {
>      KVMSlot *slots;
> @@ -108,6 +114,7 @@ struct KVMState
>      QTAILQ_HEAD(msi_hashtab, KVMMSIRoute) msi_hashtab[KVM_MSI_HASHTAB_SIZE];
>      bool direct_msi;
>  #endif
> +    QLIST_HEAD(, KVMParkedVcpu) kvm_parked_vcpus;
>  };
>  
>  KVMState *kvm_state;
> @@ -226,6 +233,53 @@ static int kvm_set_user_memory_region(KVMState *s, KVMSlot *slot)
>      return kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, &mem);
>  }
>  
> +int kvm_destroy_vcpu(CPUState *cpu)
> +{
> +    KVMState *s = kvm_state;
> +    long mmap_size;
> +    struct KVMParkedVcpu *vcpu = NULL;
> +    int ret = 0;
> +
> +    DPRINTF("kvm_destroy_vcpu\n");
> +
> +    mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0);
> +    if (mmap_size < 0) {
> +        ret = mmap_size;
> +        DPRINTF("KVM_GET_VCPU_MMAP_SIZE failed\n");
> +        goto err;
> +    }
> +
> +    ret = munmap(cpu->kvm_run, mmap_size);
> +    if (ret < 0) {
> +        goto err;
> +    }
> +
> +    vcpu = g_malloc0(sizeof(*vcpu));
> +    vcpu->vcpu_id = kvm_arch_vcpu_id(cpu);
> +    vcpu->kvm_fd = cpu->kvm_fd;
> +    QLIST_INSERT_HEAD(&kvm_state->kvm_parked_vcpus, vcpu, node);
> +err:
> +    return ret;
> +}
> +
> +static int kvm_get_vcpu(KVMState *s, unsigned long vcpu_id)
> +{
> +    struct KVMParkedVcpu *cpu;
> +
> +    QLIST_FOREACH(cpu, &s->kvm_parked_vcpus, node) {
> +        if (cpu->vcpu_id == vcpu_id) {
> +            int kvm_fd;
> +
> +            QLIST_REMOVE(cpu, node);
> +            kvm_fd = cpu->kvm_fd;
> +            g_free(cpu);
> +            return kvm_fd;
> +        }
> +    }
> +
> +    return kvm_vm_ioctl(s, KVM_CREATE_VCPU, (void *)vcpu_id);
> +}
> +
>  int kvm_init_vcpu(CPUState *cpu)
>  {
>      KVMState *s = kvm_state;
> @@ -234,7 +288,7 @@ int kvm_init_vcpu(CPUState *cpu)
>  
>      DPRINTF("kvm_init_vcpu\n");
>  
> -    ret = kvm_vm_ioctl(s, KVM_CREATE_VCPU, (void *)kvm_arch_vcpu_id(cpu));
> +    ret = kvm_get_vcpu(s, kvm_arch_vcpu_id(cpu));
>      if (ret < 0) {
>          DPRINTF("kvm_create_vcpu failed\n");
>          goto err;
> @@ -1404,6 +1458,7 @@ int kvm_init(MachineClass *mc)
>  #ifdef KVM_CAP_SET_GUEST_DEBUG
>      QTAILQ_INIT(&s->kvm_sw_breakpoints);
>  #endif
> +    QLIST_INIT(&s->kvm_parked_vcpus);
>      s->vmfd = -1;
>      s->fd = qemu_open("/dev/kvm", O_RDWR);
>      if (s->fd == -1) {
Anshul Makkar Sept. 12, 2014, 3:34 p.m. UTC | #15
I have tested with the 3.11 kernel; the kernel should be fine. But it wouldn't
harm to test with the latest kernel — maybe it can provide some extra hints.

Anshul Makkar

On Fri, Sep 12, 2014 at 3:52 PM, Bharata B Rao <bharata.rao@gmail.com>
wrote:

> On Fri, Sep 12, 2014 at 4:23 PM, Anshul Makkar
> <anshul.makkar@profitbricks.com> wrote:
> > During plugging we can see this event: echo 1 > cpu8/online.
> >
> > But during unplugging , we can't see the event echo 0 > cpu8/online.
>
> That's because I didn't do that explicitly, was always trying to
> remove an online cpu from the monitor w/o explicitly offlining it from
> inside the guest. Either ways I still see the removed CPU being listed
> in QEMU monitor.
>
> I don't ever hit any of the below code paths during CPU removal:
>
> cpus.c: qemu_kvm_destroy_vcpu()
> cpus.c: x86_cpu_finalizefn()
>
> I see CPU_REMOVE() being called from above two routines.
>
> And neither does hw/acpi/cpu_hotplug.c:cpu_status_write() gets called
> here. Does the message "ACPI: Device does not support D3cold" guest
> kernel throws during hot removal is causing this behaviour here ?
> Guest kernel is 3.11.10, should I be on latest kernel ?
>
> Regards,
> Bharata.
>
Gu Zheng Sept. 15, 2014, 5:03 a.m. UTC | #16
Hi Igor,
On 09/12/2014 10:15 PM, Igor Mammedov wrote:

> On Thu, 28 Aug 2014 11:36:42 +0800
> Gu Zheng <guz.fnst@cn.fujitsu.com> wrote:
> 
>> After ACPI get a signal to eject a vCPU, the vCPU must be
>> removed from CPU list,before the vCPU really removed,  then
>> release the all related vCPU objects.
>> But we do not close KVM vcpu fd, just record it into a list, in
>> order to reuse it.
> An additional question,
>   Have you checked if migration works after CPU unplug?

Yes, migration works fine after CPU unplug.

Thanks,
Gu

> 
>>
>> Signed-off-by: Chen Fan <chen.fan.fnst@cn.fujitsu.com>
>> Signed-off-by: Gu Zheng <guz.fnst@cn.fujitsu.com>
>> ---
>>  cpus.c               |   37 ++++++++++++++++++++++++++++++++
>>  include/sysemu/kvm.h |    1 +
>>  kvm-all.c            |   57 +++++++++++++++++++++++++++++++++++++++++++++++++-
>>  3 files changed, 94 insertions(+), 1 deletions(-)
>>
>> diff --git a/cpus.c b/cpus.c
>> index eee693b..0608b41 100644
>> --- a/cpus.c
>> +++ b/cpus.c
>> @@ -851,6 +851,24 @@ void async_run_on_cpu(CPUState *cpu, void (*func)(void *data), void *data)
>>      qemu_cpu_kick(cpu);
>>  }
>>  
>> +static void qemu_kvm_destroy_vcpu(CPUState *cpu)
>> +{
>> +    CPU_REMOVE(cpu);
>> +
>> +    if (kvm_destroy_vcpu(cpu) < 0) {
>> +        fprintf(stderr, "kvm_destroy_vcpu failed.\n");
>> +        exit(1);
>> +    }
>> +
>> +    object_unparent(OBJECT(cpu));
>> +}
>> +
>> +static void qemu_tcg_destroy_vcpu(CPUState *cpu)
>> +{
>> +    CPU_REMOVE(cpu);
>> +    object_unparent(OBJECT(cpu));
>> +}
>> +
>>  static void flush_queued_work(CPUState *cpu)
>>  {
>>      struct qemu_work_item *wi;
>> @@ -942,6 +960,11 @@ static void *qemu_kvm_cpu_thread_fn(void *arg)
>>              }
>>          }
>>          qemu_kvm_wait_io_event(cpu);
>> +        if (cpu->exit && !cpu_can_run(cpu)) {
>> +            qemu_kvm_destroy_vcpu(cpu);
>> +            qemu_mutex_unlock(&qemu_global_mutex);
>> +            return NULL;
>> +        }
>>      }
>>  
>>      return NULL;
>> @@ -994,6 +1017,7 @@ static void tcg_exec_all(void);
>>  static void *qemu_tcg_cpu_thread_fn(void *arg)
>>  {
>>      CPUState *cpu = arg;
>> +    CPUState *remove_cpu = NULL;
>>  
>>      qemu_tcg_init_cpu_signals();
>>      qemu_thread_get_self(cpu->thread);
>> @@ -1026,6 +1050,16 @@ static void *qemu_tcg_cpu_thread_fn(void *arg)
>>              }
>>          }
>>          qemu_tcg_wait_io_event();
>> +        CPU_FOREACH(cpu) {
>> +            if (cpu->exit && !cpu_can_run(cpu)) {
>> +                remove_cpu = cpu;
>> +                break;
>> +            }
>> +        }
>> +        if (remove_cpu) {
>> +            qemu_tcg_destroy_vcpu(remove_cpu);
>> +            remove_cpu = NULL;
>> +        }
>>      }
>>  
>>      return NULL;
>> @@ -1383,6 +1417,9 @@ static void tcg_exec_all(void)
>>                  break;
>>              }
>>          } else if (cpu->stop || cpu->stopped) {
>> +            if (cpu->exit) {
>> +                next_cpu = CPU_NEXT(cpu);
>> +            }
>>              break;
>>          }
>>      }
>> diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h
>> index 174ea36..88e2403 100644
>> --- a/include/sysemu/kvm.h
>> +++ b/include/sysemu/kvm.h
>> @@ -178,6 +178,7 @@ int kvm_has_intx_set_mask(void);
>>  
>>  int kvm_init_vcpu(CPUState *cpu);
>>  int kvm_cpu_exec(CPUState *cpu);
>> +int kvm_destroy_vcpu(CPUState *cpu);
>>  
>>  #ifdef NEED_CPU_H
>>  
>> diff --git a/kvm-all.c b/kvm-all.c
>> index 1402f4f..d0caeff 100644
>> --- a/kvm-all.c
>> +++ b/kvm-all.c
>> @@ -74,6 +74,12 @@ typedef struct KVMSlot
>>  
>>  typedef struct kvm_dirty_log KVMDirtyLog;
>>  
>> +struct KVMParkedVcpu {
>> +    unsigned long vcpu_id;
>> +    int kvm_fd;
>> +    QLIST_ENTRY(KVMParkedVcpu) node;
>> +};
>> +
>>  struct KVMState
>>  {
>>      KVMSlot *slots;
>> @@ -108,6 +114,7 @@ struct KVMState
>>      QTAILQ_HEAD(msi_hashtab, KVMMSIRoute) msi_hashtab[KVM_MSI_HASHTAB_SIZE];
>>      bool direct_msi;
>>  #endif
>> +    QLIST_HEAD(, KVMParkedVcpu) kvm_parked_vcpus;
>>  };
>>  
>>  KVMState *kvm_state;
>> @@ -226,6 +233,53 @@ static int kvm_set_user_memory_region(KVMState *s, KVMSlot *slot)
>>      return kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, &mem);
>>  }
>>  
>> +int kvm_destroy_vcpu(CPUState *cpu)
>> +{
>> +    KVMState *s = kvm_state;
>> +    long mmap_size;
>> +    struct KVMParkedVcpu *vcpu = NULL;
>> +    int ret = 0;
>> +
>> +    DPRINTF("kvm_destroy_vcpu\n");
>> +
>> +    mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0);
>> +    if (mmap_size < 0) {
>> +        ret = mmap_size;
>> +        DPRINTF("KVM_GET_VCPU_MMAP_SIZE failed\n");
>> +        goto err;
>> +    }
>> +
>> +    ret = munmap(cpu->kvm_run, mmap_size);
>> +    if (ret < 0) {
>> +        goto err;
>> +    }
>> +
>> +    vcpu = g_malloc0(sizeof(*vcpu));
>> +    vcpu->vcpu_id = kvm_arch_vcpu_id(cpu);
>> +    vcpu->kvm_fd = cpu->kvm_fd;
>> +    QLIST_INSERT_HEAD(&kvm_state->kvm_parked_vcpus, vcpu, node);
>> +err:
>> +    return ret;
>> +}
>> +
>> +static int kvm_get_vcpu(KVMState *s, unsigned long vcpu_id)
>> +{
>> +    struct KVMParkedVcpu *cpu;
>> +
>> +    QLIST_FOREACH(cpu, &s->kvm_parked_vcpus, node) {
>> +        if (cpu->vcpu_id == vcpu_id) {
>> +            int kvm_fd;
>> +
>> +            QLIST_REMOVE(cpu, node);
>> +            kvm_fd = cpu->kvm_fd;
>> +            g_free(cpu);
>> +            return kvm_fd;
>> +        }
>> +    }
>> +
>> +    return kvm_vm_ioctl(s, KVM_CREATE_VCPU, (void *)vcpu_id);
>> +}
>> +
>>  int kvm_init_vcpu(CPUState *cpu)
>>  {
>>      KVMState *s = kvm_state;
>> @@ -234,7 +288,7 @@ int kvm_init_vcpu(CPUState *cpu)
>>  
>>      DPRINTF("kvm_init_vcpu\n");
>>  
>> -    ret = kvm_vm_ioctl(s, KVM_CREATE_VCPU, (void *)kvm_arch_vcpu_id(cpu));
>> +    ret = kvm_get_vcpu(s, kvm_arch_vcpu_id(cpu));
>>      if (ret < 0) {
>>          DPRINTF("kvm_create_vcpu failed\n");
>>          goto err;
>> @@ -1404,6 +1458,7 @@ int kvm_init(MachineClass *mc)
>>  #ifdef KVM_CAP_SET_GUEST_DEBUG
>>      QTAILQ_INIT(&s->kvm_sw_breakpoints);
>>  #endif
>> +    QLIST_INIT(&s->kvm_parked_vcpus);
>>      s->vmfd = -1;
>>      s->fd = qemu_open("/dev/kvm", O_RDWR);
>>      if (s->fd == -1) {
> 
> .
>
Gu Zheng Sept. 15, 2014, 6:39 a.m. UTC | #17
Hi Bharata,
On 09/12/2014 09:52 PM, Bharata B Rao wrote:

> On Fri, Sep 12, 2014 at 4:23 PM, Anshul Makkar
> <anshul.makkar@profitbricks.com> wrote:
>> During plugging we can see this event: echo 1 > cpu8/online.
>>
>> But during unplugging , we can't see the event echo 0 > cpu8/online.
> 
> That's because I didn't do that explicitly, was always trying to
> remove an online cpu from the monitor w/o explicitly offlining it from
> inside the guest. Either ways I still see the removed CPU being listed
> in QEMU monitor.
> 
> I don't ever hit any of the below code paths during CPU removal:

It seems that the guest OS did not call the "_EJ0" method.
Could you please dump the guest os' acpi dsdt, and check whether the "_EJ0"
method exists? 

> 
> cpus.c: qemu_kvm_destroy_vcpu()
> cpus.c: x86_cpu_finalizefn()
> 
> I see CPU_REMOVE() being called from above two routines.
> 
> And neither does hw/acpi/cpu_hotplug.c:cpu_status_write() gets called
> here. Does the message "ACPI: Device does not support D3cold" guest
> kernel throws during hot removal is causing this behaviour here ?
> Guest kernel is 3.11.10, should I be on latest kernel ?

I have tested the fedora 20 guest with the default kernel, it works well,
so the kernel should be fine.
One more question, is the guest kernel config the same as fedora 20's default one?

Thanks,
Gu

> 
> Regards,
> Bharata.
> .
>
Bharata B Rao Sept. 15, 2014, 10:09 a.m. UTC | #18
On Mon, Sep 15, 2014 at 12:09 PM, Gu Zheng <guz.fnst@cn.fujitsu.com> wrote:
> Hi Bharata,
> On 09/12/2014 09:52 PM, Bharata B Rao wrote:
>
>> On Fri, Sep 12, 2014 at 4:23 PM, Anshul Makkar
>> <anshul.makkar@profitbricks.com> wrote:
>>> During plugging we can see this event: echo 1 > cpu8/online.
>>>
>>> But during unplugging , we can't see the event echo 0 > cpu8/online.
>>
>> That's because I didn't do that explicitly, was always trying to
>> remove an online cpu from the monitor w/o explicitly offlining it from
>> inside the guest. Either ways I still see the removed CPU being listed
>> in QEMU monitor.
>>
>> I don't ever hit any of the below code paths during CPU removal:
>
> It seems that the guest OS did not call the "_EJ0" method.
> Could you please dump the guest os' acpi dsdt, and check whether the "_EJ0"
> method exists?

_EJ0 doesn't exist in my DSDT.

>
>>
>> cpus.c: qemu_kvm_destroy_vcpu()
>> cpus.c: x86_cpu_finalizefn()
>>
>> I see CPU_REMOVE() being called from above two routines.
>>
>> And neither does hw/acpi/cpu_hotplug.c:cpu_status_write() gets called
>> here. Does the message "ACPI: Device does not support D3cold" guest
>> kernel throws during hot removal is causing this behaviour here ?
>> Guest kernel is 3.11.10, should I be on latest kernel ?
>
> I have tested the fedora 20 guest with the default kernel, it works well,
> so the kernel should be fine.
> One more question, is the guest kernel config the same as fedora 20's default one?

Yes, I am running the distro provided kernel.

Regards,
Bharata.
Anshul Makkar Sept. 15, 2014, 10:33 a.m. UTC | #19
That explains the cause.

Please verify you have the iasl compiler installed and are not using the
old .hex (compiled .dsl) files. (We faced this issue in our build setup using
sbuild.)

I hope you have verified that your .dsl file has the changes as mentioned
in the patch.

I have also verified with fedora 20 (unmodified kernel) and cpu-plug/unplug
is working fine.

Thanks
Anshul Makkar

On Mon, Sep 15, 2014 at 12:09 PM, Bharata B Rao <bharata.rao@gmail.com>
wrote:

> _EJ0 doesn't exist in my DSDT.
Bharata B Rao Sept. 15, 2014, 1:53 p.m. UTC | #20
On Mon, Sep 15, 2014 at 4:03 PM, Anshul Makkar
<anshul.makkar@profitbricks.com> wrote:
> That explains the cause.
>
> Please verify you have the iasl compiler installed and are not using the
> hold .hex (compile .dsl ) files. (Faced this issue in our build setup using
> sbuil.).
>
> I hope you have verified that your .dsl file has the changes as mentioned in
> the patch.

Oh, it was not obvious to me that I have to install iasl, use
--iasl=XXX during configure stage to get the new _EJ0 method to be
included in the ACPI DSDT!

So finally I now see the CPU getting removed from QEMU. Thanks for all
the inputs.

Regards,
Bharata.
Anshul Makkar Sept. 15, 2014, 2:29 p.m. UTC | #21
Great !!

Anshul Makkar

On Mon, Sep 15, 2014 at 3:53 PM, Bharata B Rao <bharata.rao@gmail.com>
wrote:

> On Mon, Sep 15, 2014 at 4:03 PM, Anshul Makkar
> <anshul.makkar@profitbricks.com> wrote:
> > That explains the cause.
> >
> > Please verify you have the iasl compiler installed and are not using the
> > hold .hex (compile .dsl ) files. (Faced this issue in our build setup
> using
> > sbuil.).
> >
> > I hope you have verified that your .dsl file has the changes as
> mentioned in
> > the patch.
>
> Oh, it was not obvious to me that I have to install iasl, use
> --iasl=XXX during configure stage to get the new _EJ0 method to be
> included in the ACPI DSDT!
>
> So finally I now see the CPU getting removed from QEMU. Thanks for all
> the inputs.
>
> Regards,
> Bharata.
>
Bharata B Rao Dec. 8, 2014, 9:16 a.m. UTC | #22
On Thu, Aug 28, 2014 at 9:06 AM, Gu Zheng <guz.fnst@cn.fujitsu.com> wrote:
> After ACPI get a signal to eject a vCPU, the vCPU must be
> removed from CPU list,before the vCPU really removed,  then
> release the all related vCPU objects.
> But we do not close KVM vcpu fd, just record it into a list, in
> order to reuse it.

Hi Gu Zheng,

So this approach of parking the vcpu fd in QEMU and reusing it next
time during CPU hotplug will only work if all architectures supported
by KVM driver are ok with reuse of vcpu object in the kernel. I am
using relevant bits of this patch of yours with PowerPC architecture
to support cpu unplug and I realize that there is at least one case
where reuse of vcpu object in the kernel causes problems. Before I
attempt to fix that in ppc KVM, I wanted to check if there is
consensus on this approach (parking and reusing kvm vcpu fd) in QEMU.

Also is there a latest version of this patchset that I can try ?

Regards,
Bharata.
Peter Maydell Dec. 8, 2014, 9:26 a.m. UTC | #23
On 8 December 2014 at 09:16, Bharata B Rao <bharata.rao@gmail.com> wrote:
> So this approach of parking the vcpu fd in QEMU and reusing it next
> time during CPU hotplug will only work if all architectures supported
> by KVM driver are ok with reuse of vcpu object in the kernel. I am
> using relevant bits of this patch of yours with PowerPC architecture
> to support cpu unplug and I realize that there is at least one case
> where reuse of vcpu object in the kernel causes problems. Before I
> attempt to fix that in ppc KVM, I wanted to check if there is
> consensus on this approach (parking and reusing kvm vcpu fd) in QEMU.

For ARM we've just clarified the kernel-to-userspace ABI, and
among other things it says you can't reuse the VCPU unless it's
for exactly the same CPU type and feature configuration. So
I'm not sure this park-and-reuse approach will work...

-- PMM
Gu Zheng Dec. 8, 2014, 10:12 a.m. UTC | #24
Hi Bharata,
On 12/08/2014 05:16 PM, Bharata B Rao wrote:

> On Thu, Aug 28, 2014 at 9:06 AM, Gu Zheng <guz.fnst@cn.fujitsu.com> wrote:
>> After ACPI get a signal to eject a vCPU, the vCPU must be
>> removed from CPU list,before the vCPU really removed,  then
>> release the all related vCPU objects.
>> But we do not close KVM vcpu fd, just record it into a list, in
>> order to reuse it.
> 
> Hi Gu Zheng,
> 
> So this approach of parking the vcpu fd in QEMU and reusing it next
> time during CPU hotplug will only work if all architectures supported
> by KVM driver are ok with reuse of vcpu object in the kernel. I am
> using relevant bits of this patch of yours with PowerPC architecture
> to support cpu unplug and I realize that there is at least one case
> where reuse of vcpu object in the kernel causes problems. Before I
> attempt to fix that in ppc KVM, I wanted to check if there is
> consensus on this approach (parking and reusing kvm vcpu fd) in QEMU.

Not yet, also no objection.
But it's the best way that I can find to handle the vcpu object without
touching the CPU array in kernel.

> 
> Also is there a latest version of this patchset that I can try ?

A new version is being respun, and will be sent out once 2.2 is released.

Thanks,
Gu

> 
> Regards,
> Bharata.
> .
>
Gu Zheng Dec. 8, 2014, 10:28 a.m. UTC | #25
Hi Peter,
On 12/08/2014 05:26 PM, Peter Maydell wrote:

> On 8 December 2014 at 09:16, Bharata B Rao <bharata.rao@gmail.com> wrote:
>> So this approach of parking the vcpu fd in QEMU and reusing it next
>> time during CPU hotplug will only work if all architectures supported
>> by KVM driver are ok with reuse of vcpu object in the kernel. I am
>> using relevant bits of this patch of yours with PowerPC architecture
>> to support cpu unplug and I realize that there is at least one case
>> where reuse of vcpu object in the kernel causes problems. Before I
>> attempt to fix that in ppc KVM, I wanted to check if there is
>> consensus on this approach (parking and reusing kvm vcpu fd) in QEMU.
> 
> For ARM we've just clarified the kernel-to-userspace ABI, and
> among other things it says you can't reuse the VCPU unless it's
> for exactly the same CPU type and feature configuration. So

Yes, it's a limitation now, but it is not the final implementation.
As to the QEMU side, the vcpu object in kernel is stateless, and a
reset operation will be done when we hot add a vcpu, so I think we
can kill the limitation, just choose a parked vcpu and reconfigure
it before we use it.

Thanks,
Gu 

> I'm not sure this park-and-reuse approach will work...

> 

> -- PMM
> .
>
Peter Maydell Dec. 8, 2014, 10:50 a.m. UTC | #26
On 8 December 2014 at 10:28, Gu Zheng <guz.fnst@cn.fujitsu.com> wrote:
> Yes, it's a limitation now, but it is not the final implementation.
> As to the QEMU side, the vcpu object in kernel is stateless, and a
> reset operation will be done when we hot add a vcpu, so I think we
> can kill the limitation, just choose a parked vcpu and reconfigure
> it before we use it.

Why can't the kernel handle our just destroying the vcpu and
later recreating it if necessary? That seems the more logical
approach than trying to keep fds hanging around in userspace
for reuse.

-- PMM
Igor Mammedov Dec. 8, 2014, 3:38 p.m. UTC | #27
On Mon, 8 Dec 2014 10:50:21 +0000
Peter Maydell <peter.maydell@linaro.org> wrote:

> On 8 December 2014 at 10:28, Gu Zheng <guz.fnst@cn.fujitsu.com> wrote:
> > Yes, it's a limitation now, but it is not the final implementation.
> > As to the QEMU side, the vcpu object in kernel is stateless, and a
> > reset operation will be done when we hot add a vcpu, so I think we
> > can kill the limitation, just choose a parked vcpu and reconfigure
> > it before we use it.
> 
> Why can't the kernel handle our just destroying the vcpu and
> later recreating it if necessary? That seems the more logical
> approach than trying to keep fds hanging around in userspace
> for reuse.

It's somewhat complex approach and it was suggested on KVM list to go
parking route. for more details see thread
 https://www.mail-archive.com/kvm@vger.kernel.org/msg102839.html

> 
> -- PMM
>
Peter Maydell Dec. 8, 2014, 4:38 p.m. UTC | #28
On 8 December 2014 at 15:38, Igor Mammedov <imammedo@redhat.com> wrote:
> On Mon, 8 Dec 2014 10:50:21 +0000
> Peter Maydell <peter.maydell@linaro.org> wrote:
>> Why can't the kernel handle our just destroying the vcpu and
>> later recreating it if necessary? That seems the more logical
>> approach than trying to keep fds hanging around in userspace
>> for reuse.
>
> It's somewhat complex approach and it was suggested on KVM list to go
> parking route. for more details see thread
>  https://www.mail-archive.com/kvm@vger.kernel.org/msg102839.html

If the kernel can't cope with userspace creating and destroying
vCPUs dynamically then that seems like a kernel bug to me.
It seems better to me to fix that directly rather than make
non-x86 architectures change things around to help with working
around that bug...

thanks
-- PMM
Gu Zheng Dec. 9, 2014, 12:58 a.m. UTC | #29
+cc Gleb, KVM guys,

On 12/09/2014 12:38 AM, Peter Maydell wrote:

> On 8 December 2014 at 15:38, Igor Mammedov <imammedo@redhat.com> wrote:
>> On Mon, 8 Dec 2014 10:50:21 +0000
>> Peter Maydell <peter.maydell@linaro.org> wrote:
>>> Why can't the kernel handle our just destroying the vcpu and
>>> later recreating it if necessary? That seems the more logical
>>> approach than trying to keep fds hanging around in userspace
>>> for reuse.
>>
>> It's somewhat complex approach and it was suggested on KVM list to go
>> parking route. for more details see thread
>>  https://www.mail-archive.com/kvm@vger.kernel.org/msg102839.html
> 
> If the kernel can't cope with userspace creating and destroying
> vCPUs dynamically then that seems like a kernel bug to me.

Yes, it's a flaw.

> It seems better to me to fix that directly rather than make
> non-x86 architectures change things around to help with working
> around that bug...

Agree.
But as we discussed before:
"CPU array is accessed locklessly in a lot of places, so it will have to be 
RCUified. There was an attempt to do so 2 years or so ago, but it didn't go anywhere. 
Adding locks is too big a price to pay for the ability to free a little bit
of memory by destroying a vcpu."
We worry about the regression if we add lock in a lot of places.
I'm not very familiar with non-x86 architectures. So I'm not sure how long we
need to go to help "vcpu hot-unplug" working with parking route.

Gleb,
Is any guys still working on the RCUing CPUarray access?
Is there any plan for this issue, or just leave it as it is?

Thanks,
Gu

> 
> thanks
> -- PMM
> .
>
diff mbox

Patch

diff --git a/cpus.c b/cpus.c
index eee693b..0608b41 100644
--- a/cpus.c
+++ b/cpus.c
@@ -851,6 +851,24 @@  void async_run_on_cpu(CPUState *cpu, void (*func)(void *data), void *data)
     qemu_cpu_kick(cpu);
 }
 
+static void qemu_kvm_destroy_vcpu(CPUState *cpu)
+{
+    CPU_REMOVE(cpu);
+
+    if (kvm_destroy_vcpu(cpu) < 0) {
+        fprintf(stderr, "kvm_destroy_vcpu failed.\n");
+        exit(1);
+    }
+
+    object_unparent(OBJECT(cpu));
+}
+
+static void qemu_tcg_destroy_vcpu(CPUState *cpu)
+{
+    CPU_REMOVE(cpu);
+    object_unparent(OBJECT(cpu));
+}
+
 static void flush_queued_work(CPUState *cpu)
 {
     struct qemu_work_item *wi;
@@ -942,6 +960,11 @@  static void *qemu_kvm_cpu_thread_fn(void *arg)
             }
         }
         qemu_kvm_wait_io_event(cpu);
+        if (cpu->exit && !cpu_can_run(cpu)) {
+            qemu_kvm_destroy_vcpu(cpu);
+            qemu_mutex_unlock(&qemu_global_mutex);
+            return NULL;
+        }
     }
 
     return NULL;
@@ -994,6 +1017,7 @@  static void tcg_exec_all(void);
 static void *qemu_tcg_cpu_thread_fn(void *arg)
 {
     CPUState *cpu = arg;
+    CPUState *remove_cpu = NULL;
 
     qemu_tcg_init_cpu_signals();
     qemu_thread_get_self(cpu->thread);
@@ -1026,6 +1050,16 @@  static void *qemu_tcg_cpu_thread_fn(void *arg)
             }
         }
         qemu_tcg_wait_io_event();
+        CPU_FOREACH(cpu) {
+            if (cpu->exit && !cpu_can_run(cpu)) {
+                remove_cpu = cpu;
+                break;
+            }
+        }
+        if (remove_cpu) {
+            qemu_tcg_destroy_vcpu(remove_cpu);
+            remove_cpu = NULL;
+        }
     }
 
     return NULL;
@@ -1383,6 +1417,9 @@  static void tcg_exec_all(void)
                 break;
             }
         } else if (cpu->stop || cpu->stopped) {
+            if (cpu->exit) {
+                next_cpu = CPU_NEXT(cpu);
+            }
             break;
         }
     }
diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h
index 174ea36..88e2403 100644
--- a/include/sysemu/kvm.h
+++ b/include/sysemu/kvm.h
@@ -178,6 +178,7 @@  int kvm_has_intx_set_mask(void);
 
 int kvm_init_vcpu(CPUState *cpu);
 int kvm_cpu_exec(CPUState *cpu);
+int kvm_destroy_vcpu(CPUState *cpu);
 
 #ifdef NEED_CPU_H
 
diff --git a/kvm-all.c b/kvm-all.c
index 1402f4f..d0caeff 100644
--- a/kvm-all.c
+++ b/kvm-all.c
@@ -74,6 +74,12 @@  typedef struct KVMSlot
 
 typedef struct kvm_dirty_log KVMDirtyLog;
 
+struct KVMParkedVcpu {
+    unsigned long vcpu_id;
+    int kvm_fd;
+    QLIST_ENTRY(KVMParkedVcpu) node;
+};
+
 struct KVMState
 {
     KVMSlot *slots;
@@ -108,6 +114,7 @@  struct KVMState
     QTAILQ_HEAD(msi_hashtab, KVMMSIRoute) msi_hashtab[KVM_MSI_HASHTAB_SIZE];
     bool direct_msi;
 #endif
+    QLIST_HEAD(, KVMParkedVcpu) kvm_parked_vcpus;
 };
 
 KVMState *kvm_state;
@@ -226,6 +233,53 @@  static int kvm_set_user_memory_region(KVMState *s, KVMSlot *slot)
     return kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, &mem);
 }
 
+int kvm_destroy_vcpu(CPUState *cpu)
+{
+    KVMState *s = kvm_state;
+    long mmap_size;
+    struct KVMParkedVcpu *vcpu = NULL;
+    int ret = 0;
+
+    DPRINTF("kvm_destroy_vcpu\n");
+
+    mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0);
+    if (mmap_size < 0) {
+        ret = mmap_size;
+        DPRINTF("KVM_GET_VCPU_MMAP_SIZE failed\n");
+        goto err;
+    }
+
+    ret = munmap(cpu->kvm_run, mmap_size);
+    if (ret < 0) {
+        goto err;
+    }
+
+    vcpu = g_malloc0(sizeof(*vcpu));
+    vcpu->vcpu_id = kvm_arch_vcpu_id(cpu);
+    vcpu->kvm_fd = cpu->kvm_fd;
+    QLIST_INSERT_HEAD(&kvm_state->kvm_parked_vcpus, vcpu, node);
+err:
+    return ret;
+}
+
+static int kvm_get_vcpu(KVMState *s, unsigned long vcpu_id)
+{
+    struct KVMParkedVcpu *cpu;
+
+    QLIST_FOREACH(cpu, &s->kvm_parked_vcpus, node) {
+        if (cpu->vcpu_id == vcpu_id) {
+            int kvm_fd;
+
+            QLIST_REMOVE(cpu, node);
+            kvm_fd = cpu->kvm_fd;
+            g_free(cpu);
+            return kvm_fd;
+        }
+    }
+
+    return kvm_vm_ioctl(s, KVM_CREATE_VCPU, (void *)vcpu_id);
+}
+
 int kvm_init_vcpu(CPUState *cpu)
 {
     KVMState *s = kvm_state;
@@ -234,7 +288,7 @@  int kvm_init_vcpu(CPUState *cpu)
 
     DPRINTF("kvm_init_vcpu\n");
 
-    ret = kvm_vm_ioctl(s, KVM_CREATE_VCPU, (void *)kvm_arch_vcpu_id(cpu));
+    ret = kvm_get_vcpu(s, kvm_arch_vcpu_id(cpu));
     if (ret < 0) {
         DPRINTF("kvm_create_vcpu failed\n");
         goto err;
@@ -1404,6 +1458,7 @@  int kvm_init(MachineClass *mc)
 #ifdef KVM_CAP_SET_GUEST_DEBUG
     QTAILQ_INIT(&s->kvm_sw_breakpoints);
 #endif
+    QLIST_INIT(&s->kvm_parked_vcpus);
     s->vmfd = -1;
     s->fd = qemu_open("/dev/kvm", O_RDWR);
     if (s->fd == -1) {