diff mbox series

[2/3] i386: kvm: disable KVM_CAP_PMU_CAPABILITY if "pmu" is disabled

Message ID 20221119122901.2469-3-dongli.zhang@oracle.com
State Handled Elsewhere
Headers show
Series kvm: fix two svm pmu virtualization bugs | expand

Commit Message

Dongli Zhang Nov. 19, 2022, 12:29 p.m. UTC
The "perf stat" at the VM side still works even we set "-cpu host,-pmu" in
the QEMU command line. That is, neither "-cpu host,-pmu" nor "-cpu EPYC"
could disable the pmu virtualization in an AMD environment.

We still see below at VM kernel side ...

[    0.510611] Performance Events: Fam17h+ core perfctr, AMD PMU driver.

... although we expect something like below.

[    0.596381] Performance Events: PMU not available due to virtualization, using software events only.
[    0.600972] NMI watchdog: Perf NMI watchdog permanently disabled

This is because the AMD pmu (v1) does not rely on cpuid to decide if the
pmu virtualization is supported.

We disable KVM_CAP_PMU_CAPABILITY if the 'pmu' is disabled in the vcpu
properties.

Cc: Joe Jin <joe.jin@oracle.com>
Signed-off-by: Dongli Zhang <dongli.zhang@oracle.com>
---
 target/i386/kvm/kvm.c | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

Comments

Greg Kurz Nov. 21, 2022, 11:03 a.m. UTC | #1
On Sat, 19 Nov 2022 04:29:00 -0800
Dongli Zhang <dongli.zhang@oracle.com> wrote:

> The "perf stat" at the VM side still works even we set "-cpu host,-pmu" in
> the QEMU command line. That is, neither "-cpu host,-pmu" nor "-cpu EPYC"
> could disable the pmu virtualization in an AMD environment.
> 
> We still see below at VM kernel side ...
> 
> [    0.510611] Performance Events: Fam17h+ core perfctr, AMD PMU driver.
> 
> ... although we expect something like below.
> 
> [    0.596381] Performance Events: PMU not available due to virtualization, using software events only.
> [    0.600972] NMI watchdog: Perf NMI watchdog permanently disabled
> 
> This is because the AMD pmu (v1) does not rely on cpuid to decide if the
> pmu virtualization is supported.
> 
> We disable KVM_CAP_PMU_CAPABILITY if the 'pmu' is disabled in the vcpu
> properties.
> 
> Cc: Joe Jin <joe.jin@oracle.com>
> Signed-off-by: Dongli Zhang <dongli.zhang@oracle.com>
> ---
>  target/i386/kvm/kvm.c | 17 +++++++++++++++++
>  1 file changed, 17 insertions(+)
> 
> diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c
> index 8fec0bc5b5..0b1226ff7f 100644
> --- a/target/i386/kvm/kvm.c
> +++ b/target/i386/kvm/kvm.c
> @@ -137,6 +137,8 @@ static int has_triple_fault_event;
>  
>  static bool has_msr_mcg_ext_ctl;
>  
> +static int has_pmu_cap;
> +
>  static struct kvm_cpuid2 *cpuid_cache;
>  static struct kvm_cpuid2 *hv_cpuid_cache;
>  static struct kvm_msr_list *kvm_feature_msrs;
> @@ -1725,6 +1727,19 @@ static void kvm_init_nested_state(CPUX86State *env)
>  
>  void kvm_arch_pre_create_vcpu(CPUState *cs)
>  {
> +    X86CPU *cpu = X86_CPU(cs);
> +    int ret;
> +
> +    if (has_pmu_cap && !cpu->enable_pmu) {
> +        ret = kvm_vm_enable_cap(kvm_state, KVM_CAP_PMU_CAPABILITY, 0,
> +                                KVM_PMU_CAP_DISABLE);

It doesn't seem conceptually correct to configure VM level stuff out of
a vCPU property, which could theoretically be different for each vCPU,
even if this isn't the case with the current code base.

Maybe consider controlling PMU with a machine property and this
could be done in kvm_arch_init() like other VM level stuff ?

> +        if (ret < 0) {
> +            error_report("kvm: Failed to disable pmu cap: %s",
> +                         strerror(-ret));
> +        }
> +
> +        has_pmu_cap = 0;
> +    }
>  }
>  
>  int kvm_arch_init_vcpu(CPUState *cs)
> @@ -2517,6 +2532,8 @@ int kvm_arch_init(MachineState *ms, KVMState *s)
>          }
>      }
>  
> +    has_pmu_cap = kvm_check_extension(s, KVM_CAP_PMU_CAPABILITY);
> +
>      ret = kvm_get_supported_msrs(s);
>      if (ret < 0) {
>          return ret;
Liang Yan Nov. 21, 2022, 2:23 p.m. UTC | #2
On 11/21/22 06:03, Greg Kurz wrote:
> On Sat, 19 Nov 2022 04:29:00 -0800
> Dongli Zhang <dongli.zhang@oracle.com> wrote:
>
>> The "perf stat" at the VM side still works even we set "-cpu host,-pmu" in
>> the QEMU command line. That is, neither "-cpu host,-pmu" nor "-cpu EPYC"
>> could disable the pmu virtualization in an AMD environment.
>>
>> We still see below at VM kernel side ...
>>
>> [    0.510611] Performance Events: Fam17h+ core perfctr, AMD PMU driver.
>>
>> ... although we expect something like below.
>>
>> [    0.596381] Performance Events: PMU not available due to virtualization, using software events only.
>> [    0.600972] NMI watchdog: Perf NMI watchdog permanently disabled
>>
>> This is because the AMD pmu (v1) does not rely on cpuid to decide if the
>> pmu virtualization is supported.
>>
>> We disable KVM_CAP_PMU_CAPABILITY if the 'pmu' is disabled in the vcpu
>> properties.
>>
>> Cc: Joe Jin <joe.jin@oracle.com>
>> Signed-off-by: Dongli Zhang <dongli.zhang@oracle.com>
>> ---
>>   target/i386/kvm/kvm.c | 17 +++++++++++++++++
>>   1 file changed, 17 insertions(+)
>>
>> diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c
>> index 8fec0bc5b5..0b1226ff7f 100644
>> --- a/target/i386/kvm/kvm.c
>> +++ b/target/i386/kvm/kvm.c
>> @@ -137,6 +137,8 @@ static int has_triple_fault_event;
>>   
>>   static bool has_msr_mcg_ext_ctl;
>>   
>> +static int has_pmu_cap;
>> +
>>   static struct kvm_cpuid2 *cpuid_cache;
>>   static struct kvm_cpuid2 *hv_cpuid_cache;
>>   static struct kvm_msr_list *kvm_feature_msrs;
>> @@ -1725,6 +1727,19 @@ static void kvm_init_nested_state(CPUX86State *env)
>>   
>>   void kvm_arch_pre_create_vcpu(CPUState *cs)
>>   {
>> +    X86CPU *cpu = X86_CPU(cs);
>> +    int ret;
>> +
>> +    if (has_pmu_cap && !cpu->enable_pmu) {
>> +        ret = kvm_vm_enable_cap(kvm_state, KVM_CAP_PMU_CAPABILITY, 0,
>> +                                KVM_PMU_CAP_DISABLE);
> It doesn't seem conceptually correct to configure VM level stuff out of
> a vCPU property, which could theoretically be different for each vCPU,
> even if this isn't the case with the current code base.
>
> Maybe consider controlling PMU with a machine property and this
> could be done in kvm_arch_init() like other VM level stuff ?
>

There is already a 'pmu' property for x86_cpu with variable 'enable_pmu' 
as we see the above code. It is mainly used by Intel CPU and set to off 
by default since qemu 1.5.

And, this property is spread to AMD CPU too.

I think you may need setup a machine property to disable it from current 
machine model. Otherwise, it will break the Live Migration scenario.


>> +        if (ret < 0) {
>> +            error_report("kvm: Failed to disable pmu cap: %s",
>> +                         strerror(-ret));
>> +        }
>> +
>> +        has_pmu_cap = 0;
>> +    }
>>   }
>>   
>>   int kvm_arch_init_vcpu(CPUState *cs)
>> @@ -2517,6 +2532,8 @@ int kvm_arch_init(MachineState *ms, KVMState *s)
>>           }
>>       }
>>   
>> +    has_pmu_cap = kvm_check_extension(s, KVM_CAP_PMU_CAPABILITY);
>> +
>>       ret = kvm_get_supported_msrs(s);
>>       if (ret < 0) {
>>           return ret;
>
Dongli Zhang Nov. 21, 2022, 9:11 p.m. UTC | #3
Hi Greg and Liang,

On 11/21/22 6:23 AM, Liang Yan wrote:
> 
> On 11/21/22 06:03, Greg Kurz wrote:
>> On Sat, 19 Nov 2022 04:29:00 -0800
>> Dongli Zhang <dongli.zhang@oracle.com> wrote:
>>
>>> The "perf stat" at the VM side still works even we set "-cpu host,-pmu" in
>>> the QEMU command line. That is, neither "-cpu host,-pmu" nor "-cpu EPYC"
>>> could disable the pmu virtualization in an AMD environment.
>>>
>>> We still see below at VM kernel side ...
>>>
>>> [    0.510611] Performance Events: Fam17h+ core perfctr, AMD PMU driver.
>>>
>>> ... although we expect something like below.
>>>
>>> [    0.596381] Performance Events: PMU not available due to virtualization,
>>> using software events only.
>>> [    0.600972] NMI watchdog: Perf NMI watchdog permanently disabled
>>>
>>> This is because the AMD pmu (v1) does not rely on cpuid to decide if the
>>> pmu virtualization is supported.
>>>
>>> We disable KVM_CAP_PMU_CAPABILITY if the 'pmu' is disabled in the vcpu
>>> properties.
>>>
>>> Cc: Joe Jin <joe.jin@oracle.com>
>>> Signed-off-by: Dongli Zhang <dongli.zhang@oracle.com>
>>> ---
>>>   target/i386/kvm/kvm.c | 17 +++++++++++++++++
>>>   1 file changed, 17 insertions(+)
>>>
>>> diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c
>>> index 8fec0bc5b5..0b1226ff7f 100644
>>> --- a/target/i386/kvm/kvm.c
>>> +++ b/target/i386/kvm/kvm.c
>>> @@ -137,6 +137,8 @@ static int has_triple_fault_event;
>>>     static bool has_msr_mcg_ext_ctl;
>>>   +static int has_pmu_cap;
>>> +
>>>   static struct kvm_cpuid2 *cpuid_cache;
>>>   static struct kvm_cpuid2 *hv_cpuid_cache;
>>>   static struct kvm_msr_list *kvm_feature_msrs;
>>> @@ -1725,6 +1727,19 @@ static void kvm_init_nested_state(CPUX86State *env)
>>>     void kvm_arch_pre_create_vcpu(CPUState *cs)
>>>   {
>>> +    X86CPU *cpu = X86_CPU(cs);
>>> +    int ret;
>>> +
>>> +    if (has_pmu_cap && !cpu->enable_pmu) {
>>> +        ret = kvm_vm_enable_cap(kvm_state, KVM_CAP_PMU_CAPABILITY, 0,
>>> +                                KVM_PMU_CAP_DISABLE);
>> It doesn't seem conceptually correct to configure VM level stuff out of
>> a vCPU property, which could theoretically be different for each vCPU,
>> even if this isn't the case with the current code base.
>>
>> Maybe consider controlling PMU with a machine property and this
>> could be done in kvm_arch_init() like other VM level stuff ?
>>
> 
> There is already a 'pmu' property for x86_cpu with variable 'enable_pmu' as we
> see the above code. It is mainly used by Intel CPU and set to off by default
> since qemu 1.5.
> 
> And, this property is spread to AMD CPU too.
> 
> I think you may need setup a machine property to disable it from current machine
> model. Otherwise, it will break the Live Migration scenario.

Initially, I was thinking about to replace the CPU level property 'pmu' to a VM
level property but with a concern that reviewers/maintainers might not be happy
with it. That may break the way other Apps use the QEMU.

I will add an extra VM level property to the kvm_accel_class_init() for
"-machine accel=kvm" in addition to the existing per CPU property. The VM (kvm)
level
'pmu' will be more privileged than the CPU level 'pmu'.

Thank you very much!

Dongli Zhang

> 
> 
>>> +        if (ret < 0) {
>>> +            error_report("kvm: Failed to disable pmu cap: %s",
>>> +                         strerror(-ret));
>>> +        }
>>> +
>>> +        has_pmu_cap = 0;
>>> +    }
>>>   }
>>>     int kvm_arch_init_vcpu(CPUState *cs)
>>> @@ -2517,6 +2532,8 @@ int kvm_arch_init(MachineState *ms, KVMState *s)
>>>           }
>>>       }
>>>   +    has_pmu_cap = kvm_check_extension(s, KVM_CAP_PMU_CAPABILITY);
>>> +
>>>       ret = kvm_get_supported_msrs(s);
>>>       if (ret < 0) {
>>>           return ret;
>>
Denis V. Lunev Nov. 13, 2023, 4:39 p.m. UTC | #4
On 11/19/22 13:29, Dongli Zhang wrote:
> The "perf stat" at the VM side still works even we set "-cpu host,-pmu" in
> the QEMU command line. That is, neither "-cpu host,-pmu" nor "-cpu EPYC"
> could disable the pmu virtualization in an AMD environment.
>
> We still see below at VM kernel side ...
>
> [    0.510611] Performance Events: Fam17h+ core perfctr, AMD PMU driver.
>
> ... although we expect something like below.
>
> [    0.596381] Performance Events: PMU not available due to virtualization, using software events only.
> [    0.600972] NMI watchdog: Perf NMI watchdog permanently disabled
>
> This is because the AMD pmu (v1) does not rely on cpuid to decide if the
> pmu virtualization is supported.
>
> We disable KVM_CAP_PMU_CAPABILITY if the 'pmu' is disabled in the vcpu
> properties.
>
> Cc: Joe Jin <joe.jin@oracle.com>
> Signed-off-by: Dongli Zhang <dongli.zhang@oracle.com>
> ---
>   target/i386/kvm/kvm.c | 17 +++++++++++++++++
>   1 file changed, 17 insertions(+)
>
> diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c
> index 8fec0bc5b5..0b1226ff7f 100644
> --- a/target/i386/kvm/kvm.c
> +++ b/target/i386/kvm/kvm.c
> @@ -137,6 +137,8 @@ static int has_triple_fault_event;
>   
>   static bool has_msr_mcg_ext_ctl;
>   
> +static int has_pmu_cap;
> +
>   static struct kvm_cpuid2 *cpuid_cache;
>   static struct kvm_cpuid2 *hv_cpuid_cache;
>   static struct kvm_msr_list *kvm_feature_msrs;
> @@ -1725,6 +1727,19 @@ static void kvm_init_nested_state(CPUX86State *env)
>   
>   void kvm_arch_pre_create_vcpu(CPUState *cs)
>   {
> +    X86CPU *cpu = X86_CPU(cs);
> +    int ret;
> +
> +    if (has_pmu_cap && !cpu->enable_pmu) {
> +        ret = kvm_vm_enable_cap(kvm_state, KVM_CAP_PMU_CAPABILITY, 0,
> +                                KVM_PMU_CAP_DISABLE);
> +        if (ret < 0) {
> +            error_report("kvm: Failed to disable pmu cap: %s",
> +                         strerror(-ret));
> +        }
> +
> +        has_pmu_cap = 0;
> +    }
>   }
>   
>   int kvm_arch_init_vcpu(CPUState *cs)
> @@ -2517,6 +2532,8 @@ int kvm_arch_init(MachineState *ms, KVMState *s)
>           }
>       }
>   
> +    has_pmu_cap = kvm_check_extension(s, KVM_CAP_PMU_CAPABILITY);
> +
>       ret = kvm_get_supported_msrs(s);
>       if (ret < 0) {
>           return ret;
This patch is very important in particular.
It boosts performance of any single VMexit
is 13% for AMD. Intel is being measured.

At my opinion v1 of the patch is better that
version 2. We should not introduce any
new capability but disable PMU if we can
while it is disabled according to the configuration.

The discussion about performance improvement
is here
https://lore.kernel.org/lkml/ZU2D3f6kc0MDzNR5@google.com/T/

Den
diff mbox series

Patch

diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c
index 8fec0bc5b5..0b1226ff7f 100644
--- a/target/i386/kvm/kvm.c
+++ b/target/i386/kvm/kvm.c
@@ -137,6 +137,8 @@  static int has_triple_fault_event;
 
 static bool has_msr_mcg_ext_ctl;
 
+static int has_pmu_cap;
+
 static struct kvm_cpuid2 *cpuid_cache;
 static struct kvm_cpuid2 *hv_cpuid_cache;
 static struct kvm_msr_list *kvm_feature_msrs;
@@ -1725,6 +1727,19 @@  static void kvm_init_nested_state(CPUX86State *env)
 
 void kvm_arch_pre_create_vcpu(CPUState *cs)
 {
+    X86CPU *cpu = X86_CPU(cs);
+    int ret;
+
+    if (has_pmu_cap && !cpu->enable_pmu) {
+        ret = kvm_vm_enable_cap(kvm_state, KVM_CAP_PMU_CAPABILITY, 0,
+                                KVM_PMU_CAP_DISABLE);
+        if (ret < 0) {
+            error_report("kvm: Failed to disable pmu cap: %s",
+                         strerror(-ret));
+        }
+
+        has_pmu_cap = 0;
+    }
 }
 
 int kvm_arch_init_vcpu(CPUState *cs)
@@ -2517,6 +2532,8 @@  int kvm_arch_init(MachineState *ms, KVMState *s)
         }
     }
 
+    has_pmu_cap = kvm_check_extension(s, KVM_CAP_PMU_CAPABILITY);
+
     ret = kvm_get_supported_msrs(s);
     if (ret < 0) {
         return ret;