[4/8] i386: hvf: Implement CPU kick

Message ID 20200624225850.16982-5-r.bolshakov@yadro.com
State New
Series Improve synchronization between QEMU and HVF

Commit Message

Roman Bolshakov June 24, 2020, 10:58 p.m. UTC
HVF doesn't have a CPU kick, and without one it's not possible to perform
an action on a vCPU thread until a VMEXIT happens. The kick is also needed
for timely interrupt delivery.

The existing implementation of the CPU kick sends SIG_IPI (aka SIGUSR1) to
the vCPU thread, which is different from what hv_vcpu_interrupt does. The
latter results in an invocation of mp_cpus_kick() in the XNU kernel [1].

While at it, correct the type of hvf_fd to match hv_vcpuid_t to avoid
compilation warnings.

1. https://opensource.apple.com/source/xnu/xnu-6153.81.5/osfmk/i386/mp.c

Cc: Cameron Esfahani <dirty@apple.com>
Signed-off-by: Roman Bolshakov <r.bolshakov@yadro.com>
---
 cpus.c                | 13 +++++++++----
 include/hw/core/cpu.h |  2 +-
 include/sysemu/hvf.h  |  1 +
 target/i386/hvf/hvf.c | 11 +++++++++++
 4 files changed, 22 insertions(+), 5 deletions(-)

Comments

Claudio Fontana June 25, 2020, 7:07 a.m. UTC | #1
Hi Roman,

On 6/25/20 12:58 AM, Roman Bolshakov wrote:
> HVF doesn't have a CPU kick and without it it's not possible to perform
> an action on CPU thread until a VMEXIT happens. The kick is also needed
> for timely interrupt delivery.
> 
> Existing implementation of CPU kick sends SIG_IPI (aka SIGUSR1) to vCPU
> thread, but it's different from what hv_vcpu_interrupt does. The latter
> one results in invocation of mp_cpus_kick() in XNU kernel [1].
> 
> While at it, correct type of hvf_fd to the type of hv_vcpuid_t to avoid
> compilation warnings.
> 
> 1. https://opensource.apple.com/source/xnu/xnu-6153.81.5/osfmk/i386/mp.c
> 
> Cc: Cameron Esfahani <dirty@apple.com>
> Signed-off-by: Roman Bolshakov <r.bolshakov@yadro.com>
> ---
>  cpus.c                | 13 +++++++++----
>  include/hw/core/cpu.h |  2 +-
>  include/sysemu/hvf.h  |  1 +
>  target/i386/hvf/hvf.c | 11 +++++++++++
>  4 files changed, 22 insertions(+), 5 deletions(-)
> 
> diff --git a/cpus.c b/cpus.c
> index 26709677d3..36f38ce5c8 100644
> --- a/cpus.c
> +++ b/cpus.c
> @@ -1783,10 +1783,15 @@ static void qemu_cpu_kick_thread(CPUState *cpu)
>          return;
>      }
>      cpu->thread_kicked = true;
> -    err = pthread_kill(cpu->thread->thread, SIG_IPI);
> -    if (err && err != ESRCH) {
> -        fprintf(stderr, "qemu:%s: %s", __func__, strerror(err));
> -        exit(1);
> +
> +    if (hvf_enabled()) {
> +        hvf_vcpu_kick(cpu);

could this be moved to qemu_cpu_kick, where we have already the ifs for accelerator types tcg and hax?

Not terribly important if then my cpus-refactoring goes forward, but on its own that should be the proper place for if (hvf_enabled()) I think.



> +    } else {
> +        err = pthread_kill(cpu->thread->thread, SIG_IPI);
> +        if (err && err != ESRCH) {
> +            fprintf(stderr, "qemu:%s: %s", __func__, strerror(err));
> +            exit(1);
> +        }
>      }
>  #else /* _WIN32 */
>      if (!qemu_cpu_is_self(cpu)) {
> diff --git a/include/hw/core/cpu.h b/include/hw/core/cpu.h
> index b3f4b79318..288a2bd57e 100644
> --- a/include/hw/core/cpu.h
> +++ b/include/hw/core/cpu.h
> @@ -438,7 +438,7 @@ struct CPUState {
>  
>      struct hax_vcpu_state *hax_vcpu;
>  
> -    int hvf_fd;
> +    unsigned hvf_fd;
>  
>      /* track IOMMUs whose translations we've cached in the TCG TLB */
>      GArray *iommu_notifiers;
> diff --git a/include/sysemu/hvf.h b/include/sysemu/hvf.h
> index 1d40a8ec01..aaa00cbf05 100644
> --- a/include/sysemu/hvf.h
> +++ b/include/sysemu/hvf.h
> @@ -25,6 +25,7 @@ extern bool hvf_allowed;
>  
>  int hvf_init_vcpu(CPUState *);
>  int hvf_vcpu_exec(CPUState *);
> +void hvf_vcpu_kick(CPUState *);
>  void hvf_cpu_synchronize_state(CPUState *);
>  void hvf_cpu_synchronize_post_reset(CPUState *);
>  void hvf_cpu_synchronize_post_init(CPUState *);
> diff --git a/target/i386/hvf/hvf.c b/target/i386/hvf/hvf.c
> index efe9802962..4d254a477a 100644
> --- a/target/i386/hvf/hvf.c
> +++ b/target/i386/hvf/hvf.c
> @@ -966,6 +966,17 @@ int hvf_vcpu_exec(CPUState *cpu)
>      return ret;
>  }
>  
> +void hvf_vcpu_kick(CPUState *cpu)
> +{
> +    hv_return_t err;
> +
> +    err = hv_vcpu_interrupt(&cpu->hvf_fd, 1);
> +    if (err) {
> +        fprintf(stderr, "qemu:%s error %#x\n", __func__, err);
> +        exit(1);
> +    }
> +}
> +
>  bool hvf_allowed;
>  
>  static int hvf_accel_init(MachineState *ms)
>
Paolo Bonzini June 25, 2020, 10:28 a.m. UTC | #2
On 25/06/20 00:58, Roman Bolshakov wrote:
> HVF doesn't have a CPU kick and without it it's not possible to perform
> an action on CPU thread until a VMEXIT happens. The kick is also needed
> for timely interrupt delivery.
> 
> Existing implementation of CPU kick sends SIG_IPI (aka SIGUSR1) to vCPU
> thread, but it's different from what hv_vcpu_interrupt does. The latter
> one results in invocation of mp_cpus_kick() in XNU kernel [1].
> 
> While at it, correct type of hvf_fd to the type of hv_vcpuid_t to avoid
> compilation warnings.
> 
> 1. https://opensource.apple.com/source/xnu/xnu-6153.81.5/osfmk/i386/mp.c
> 
> Cc: Cameron Esfahani <dirty@apple.com>
> Signed-off-by: Roman Bolshakov <r.bolshakov@yadro.com>
> ---
>  cpus.c                | 13 +++++++++----
>  include/hw/core/cpu.h |  2 +-
>  include/sysemu/hvf.h  |  1 +
>  target/i386/hvf/hvf.c | 11 +++++++++++
>  4 files changed, 22 insertions(+), 5 deletions(-)
> 
> diff --git a/cpus.c b/cpus.c
> index 26709677d3..36f38ce5c8 100644
> --- a/cpus.c
> +++ b/cpus.c
> @@ -1783,10 +1783,15 @@ static void qemu_cpu_kick_thread(CPUState *cpu)
>          return;
>      }
>      cpu->thread_kicked = true;
> -    err = pthread_kill(cpu->thread->thread, SIG_IPI);
> -    if (err && err != ESRCH) {
> -        fprintf(stderr, "qemu:%s: %s", __func__, strerror(err));
> -        exit(1);
> +
> +    if (hvf_enabled()) {
> +        hvf_vcpu_kick(cpu);
> +    } else {
> +        err = pthread_kill(cpu->thread->thread, SIG_IPI);
> +        if (err && err != ESRCH) {
> +            fprintf(stderr, "qemu:%s: %s", __func__, strerror(err));
> +            exit(1);
> +        }
>      }
>  #else /* _WIN32 */
>      if (!qemu_cpu_is_self(cpu)) {
> diff --git a/include/hw/core/cpu.h b/include/hw/core/cpu.h
> index b3f4b79318..288a2bd57e 100644
> --- a/include/hw/core/cpu.h
> +++ b/include/hw/core/cpu.h
> @@ -438,7 +438,7 @@ struct CPUState {
>  
>      struct hax_vcpu_state *hax_vcpu;
>  
> -    int hvf_fd;
> +    unsigned hvf_fd;
>  
>      /* track IOMMUs whose translations we've cached in the TCG TLB */
>      GArray *iommu_notifiers;
> diff --git a/include/sysemu/hvf.h b/include/sysemu/hvf.h
> index 1d40a8ec01..aaa00cbf05 100644
> --- a/include/sysemu/hvf.h
> +++ b/include/sysemu/hvf.h
> @@ -25,6 +25,7 @@ extern bool hvf_allowed;
>  
>  int hvf_init_vcpu(CPUState *);
>  int hvf_vcpu_exec(CPUState *);
> +void hvf_vcpu_kick(CPUState *);
>  void hvf_cpu_synchronize_state(CPUState *);
>  void hvf_cpu_synchronize_post_reset(CPUState *);
>  void hvf_cpu_synchronize_post_init(CPUState *);
> diff --git a/target/i386/hvf/hvf.c b/target/i386/hvf/hvf.c
> index efe9802962..4d254a477a 100644
> --- a/target/i386/hvf/hvf.c
> +++ b/target/i386/hvf/hvf.c
> @@ -966,6 +966,17 @@ int hvf_vcpu_exec(CPUState *cpu)
>      return ret;
>  }
>  
> +void hvf_vcpu_kick(CPUState *cpu)
> +{
> +    hv_return_t err;
> +
> +    err = hv_vcpu_interrupt(&cpu->hvf_fd, 1);
> +    if (err) {
> +        fprintf(stderr, "qemu:%s error %#x\n", __func__, err);
> +        exit(1);
> +    }
> +}
> +
>  bool hvf_allowed;
>  
>  static int hvf_accel_init(MachineState *ms)
> 

The documentation isn't clear on whether hv_vcpu_interrupt is able to
interrupt a *subsequent* hv_vcpu_run, similar to WHPX
WHvCancelRunVirtualProcessor (is it possible to decompile
hv_vcpu_interrupt and see what it does?).  If not, you can reduce a bit
the race window by setting a variable in cpu, like

	atomic_set(&cpu->deadline, 0);
	hv_vcpu_interrupt(...)

and in the vCPU thread

	hv_vcpu_run_until(..., atomic_read(&cpu->deadline));
	atomic_set(&cpu->deadline, HV_DEADLINE_FOREVER);

Paolo
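
For illustration, a self-contained sketch of the deadline pattern outlined
above; the cpu->deadline field, HV_DEADLINE_FOREVER and the helper names are
hypothetical, taken from the pseudocode rather than from the QEMU tree:

/* Assumes "qemu/atomic.h" and <Hypervisor/hv.h>; cpu->deadline is an
 * assumed uint64_t field on CPUState. */

/* Kicker side */
static void hvf_kick_with_deadline(CPUState *cpu)
{
    /* Make the current or the next run expire immediately... */
    atomic_set(&cpu->deadline, 0);
    /* ...and interrupt the vCPU if it is in guest mode right now. */
    hv_vcpu_interrupt(&cpu->hvf_fd, 1);
}

/* vCPU thread side, inside the exec loop */
static hv_return_t hvf_run_once(CPUState *cpu)
{
    /* A kick received before this point left deadline == 0, so the run
     * returns at once instead of the kick being lost. */
    hv_return_t r = hv_vcpu_run_until(cpu->hvf_fd,
                                      atomic_read(&cpu->deadline));
    atomic_set(&cpu->deadline, HV_DEADLINE_FOREVER);
    return r;
}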
Roman Bolshakov June 25, 2020, 10:51 a.m. UTC | #3
On Thu, Jun 25, 2020 at 09:07:04AM +0200, Claudio Fontana wrote:
> Hi Roman,
> 
> On 6/25/20 12:58 AM, Roman Bolshakov wrote:
> > HVF doesn't have a CPU kick and without it it's not possible to perform
> > an action on CPU thread until a VMEXIT happens. The kick is also needed
> > for timely interrupt delivery.
> > 
> > Existing implementation of CPU kick sends SIG_IPI (aka SIGUSR1) to vCPU
> > thread, but it's different from what hv_vcpu_interrupt does. The latter
> > one results in invocation of mp_cpus_kick() in XNU kernel [1].
> > 
> > While at it, correct type of hvf_fd to the type of hv_vcpuid_t to avoid
> > compilation warnings.
> > 
> > 1. https://opensource.apple.com/source/xnu/xnu-6153.81.5/osfmk/i386/mp.c
> > 
> > Cc: Cameron Esfahani <dirty@apple.com>
> > Signed-off-by: Roman Bolshakov <r.bolshakov@yadro.com>
> > ---
> >  cpus.c                | 13 +++++++++----
> >  include/hw/core/cpu.h |  2 +-
> >  include/sysemu/hvf.h  |  1 +
> >  target/i386/hvf/hvf.c | 11 +++++++++++
> >  4 files changed, 22 insertions(+), 5 deletions(-)
> > 
> > diff --git a/cpus.c b/cpus.c
> > index 26709677d3..36f38ce5c8 100644
> > --- a/cpus.c
> > +++ b/cpus.c
> > @@ -1783,10 +1783,15 @@ static void qemu_cpu_kick_thread(CPUState *cpu)
> >          return;
> >      }
> >      cpu->thread_kicked = true;
> > -    err = pthread_kill(cpu->thread->thread, SIG_IPI);
> > -    if (err && err != ESRCH) {
> > -        fprintf(stderr, "qemu:%s: %s", __func__, strerror(err));
> > -        exit(1);
> > +
> > +    if (hvf_enabled()) {
> > +        hvf_vcpu_kick(cpu);
> 
> could this be moved to qemu_cpu_kick, where we have already the ifs for accelerator types tcg and hax?
> 

Hi Claudio,

I did this because of cpu->thread_kicked, which is not set or tested in
qemu_cpu_kick(). It's not used for tcg and mttcg, but hax does seem to
use qemu_cpu_kick_thread() and additionally sets cpu->exit_request in
qemu_cpu_kick(). There's a difference between hax/kvm and hvf: they use
different ways of signalling the kick. hax/kvm use POSIX signals, while
HVF sends an IPI from the host LAPIC to deliver the kick. The patch
highlights the difference.

As far as I understand, if thread_kicked is set, multiple CPU kicks are
coalesced until thread_kicked is cleared. So, the answer to your
question: it could be moved to qemu_cpu_kick, but then the kick
debouncing would have to be duplicated inside hvf_vcpu_kick().
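
(For reference, a sketch of what that alternative would imply; this is
hypothetical code, not part of the patch: hvf_vcpu_kick() itself would have
to replicate the thread_kicked debounce that qemu_cpu_kick_thread()
currently provides.)

static void hvf_vcpu_kick_debounced(CPUState *cpu)
{
    hv_return_t err;

    /* Coalesce kicks until the vCPU thread clears thread_kicked */
    if (cpu->thread_kicked) {
        return;
    }
    cpu->thread_kicked = true;

    err = hv_vcpu_interrupt(&cpu->hvf_fd, 1);
    if (err) {
        fprintf(stderr, "qemu:%s error %#x\n", __func__, err);
        exit(1);
    }
}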

Regards,
Roman

> Not terribly important if then my cpus-refactoring goes forward, but on its own that should be the proper place for if (hvf_enabled()) I think.
> 
> 
> 
> > +    } else {
> > +        err = pthread_kill(cpu->thread->thread, SIG_IPI);
> > +        if (err && err != ESRCH) {
> > +            fprintf(stderr, "qemu:%s: %s", __func__, strerror(err));
> > +            exit(1);
> > +        }
> >      }
> >  #else /* _WIN32 */
> >      if (!qemu_cpu_is_self(cpu)) {
Roman Bolshakov June 25, 2020, 3:57 p.m. UTC | #4
On Thu, Jun 25, 2020 at 12:28:26PM +0200, Paolo Bonzini wrote:
> On 25/06/20 00:58, Roman Bolshakov wrote:
> > HVF doesn't have a CPU kick and without it it's not possible to perform
> > an action on CPU thread until a VMEXIT happens. The kick is also needed
> > for timely interrupt delivery.
> > 
> > Existing implementation of CPU kick sends SIG_IPI (aka SIGUSR1) to vCPU
> > thread, but it's different from what hv_vcpu_interrupt does. The latter
> > one results in invocation of mp_cpus_kick() in XNU kernel [1].
> > 
> > While at it, correct type of hvf_fd to the type of hv_vcpuid_t to avoid
> > compilation warnings.
> > 
> > 1. https://opensource.apple.com/source/xnu/xnu-6153.81.5/osfmk/i386/mp.c
> > 
> > Cc: Cameron Esfahani <dirty@apple.com>
> > Signed-off-by: Roman Bolshakov <r.bolshakov@yadro.com>
> > ---
> >  cpus.c                | 13 +++++++++----
> >  include/hw/core/cpu.h |  2 +-
> >  include/sysemu/hvf.h  |  1 +
> >  target/i386/hvf/hvf.c | 11 +++++++++++
> >  4 files changed, 22 insertions(+), 5 deletions(-)
> > 
> > diff --git a/cpus.c b/cpus.c
> > index 26709677d3..36f38ce5c8 100644
> > --- a/cpus.c
> > +++ b/cpus.c
> > @@ -1783,10 +1783,15 @@ static void qemu_cpu_kick_thread(CPUState *cpu)
> >          return;
> >      }
> >      cpu->thread_kicked = true;
> > -    err = pthread_kill(cpu->thread->thread, SIG_IPI);
> > -    if (err && err != ESRCH) {
> > -        fprintf(stderr, "qemu:%s: %s", __func__, strerror(err));
> > -        exit(1);
> > +
> > +    if (hvf_enabled()) {
> > +        hvf_vcpu_kick(cpu);
> > +    } else {
> > +        err = pthread_kill(cpu->thread->thread, SIG_IPI);
> > +        if (err && err != ESRCH) {
> > +            fprintf(stderr, "qemu:%s: %s", __func__, strerror(err));
> > +            exit(1);
> > +        }
> >      }
> >  #else /* _WIN32 */
> >      if (!qemu_cpu_is_self(cpu)) {
> > diff --git a/include/hw/core/cpu.h b/include/hw/core/cpu.h
> > index b3f4b79318..288a2bd57e 100644
> > --- a/include/hw/core/cpu.h
> > +++ b/include/hw/core/cpu.h
> > @@ -438,7 +438,7 @@ struct CPUState {
> >  
> >      struct hax_vcpu_state *hax_vcpu;
> >  
> > -    int hvf_fd;
> > +    unsigned hvf_fd;
> >  
> >      /* track IOMMUs whose translations we've cached in the TCG TLB */
> >      GArray *iommu_notifiers;
> > diff --git a/include/sysemu/hvf.h b/include/sysemu/hvf.h
> > index 1d40a8ec01..aaa00cbf05 100644
> > --- a/include/sysemu/hvf.h
> > +++ b/include/sysemu/hvf.h
> > @@ -25,6 +25,7 @@ extern bool hvf_allowed;
> >  
> >  int hvf_init_vcpu(CPUState *);
> >  int hvf_vcpu_exec(CPUState *);
> > +void hvf_vcpu_kick(CPUState *);
> >  void hvf_cpu_synchronize_state(CPUState *);
> >  void hvf_cpu_synchronize_post_reset(CPUState *);
> >  void hvf_cpu_synchronize_post_init(CPUState *);
> > diff --git a/target/i386/hvf/hvf.c b/target/i386/hvf/hvf.c
> > index efe9802962..4d254a477a 100644
> > --- a/target/i386/hvf/hvf.c
> > +++ b/target/i386/hvf/hvf.c
> > @@ -966,6 +966,17 @@ int hvf_vcpu_exec(CPUState *cpu)
> >      return ret;
> >  }
> >  
> > +void hvf_vcpu_kick(CPUState *cpu)
> > +{
> > +    hv_return_t err;
> > +
> > +    err = hv_vcpu_interrupt(&cpu->hvf_fd, 1);
> > +    if (err) {
> > +        fprintf(stderr, "qemu:%s error %#x\n", __func__, err);
> > +        exit(1);
> > +    }
> > +}
> > +
> >  bool hvf_allowed;
> >  
> >  static int hvf_accel_init(MachineState *ms)
> > 
> 
> The documentation isn't clear on whether hv_vcpu_interrupt is able to
> interrupt a *subsequent* hv_vcpu_run, similar to WHPX
> WHvCancelRunVirtualProcessor (is it possible to decompile
> hv_vcpu_interrupt and see what it does?).

hv_vcpu_interrupt sends a KICK IPI using mp_cpus_kick() only if the
destination vCPU thread is running, as far as I understand
mp_cpus_kick():

void
mp_cpus_kick(cpumask_t cpus)
{
	cpu_t           cpu;
	boolean_t       intrs_enabled = FALSE;

	intrs_enabled = ml_set_interrupts_enabled(FALSE);
	mp_safe_spin_lock(&x86_topo_lock);

	for (cpu = 0; cpu < (cpu_t) real_ncpus; cpu++) {
		if ((cpu == (cpu_t) cpu_number())
		    || ((cpu_to_cpumask(cpu) & cpus) == 0)
		    || !cpu_is_running(cpu)) {
			continue;
		}

		lapic_send_ipi(cpu, LAPIC_VECTOR(KICK));
	}

	simple_unlock(&x86_topo_lock);
	ml_set_interrupts_enabled(intrs_enabled);
}

So, the kick is not delivered to self, nor when the destination cpu is
not running. I think it can't interrupt a subsequent hv_vcpu_run.

> If not, you can reduce a bit the race window by setting a variable in
> cpu, like
> 
> 	atomic_set(&cpu->deadline, 0);
> 	hv_vcpu_interrupt(...)
> 
> and in the vCPU thread
> 
> 	hv_vcpu_run_until(..., atomic_read(&cpu->deadline));
> 	atomic_set(&cpu->deadline, HV_DEADLINE_FOREVER);
> 

Sure, could you please explain who'll be racing? There's a race if a
kick is sent after a VMEXIT, right? So essentially we need a way to
"requeue" a kick that was received outside of hv_vcpu_run, to avoid
losing it?

hv_vcpu_run_until is only available on macOS 10.15+ and we can't use it
yet because of the three-release support rule.
(https://developer.apple.com/documentation/hypervisor/3181548-hv_vcpu_run_until?language=objc)

BTW, I'm totally okay to send v2 if kicks are lost and/or the patch
needs improvements. (and I can address EFER to VMCS Entry Controls
synchronization as well)

Paolo, do you know any particular test in kvm-unit-tests that can
exhibit the issue?

Thanks,
Roman
Paolo Bonzini June 25, 2020, 6:34 p.m. UTC | #5
On 25/06/20 17:57, Roman Bolshakov wrote:
> So, the kick is not delivered to self and in case if destination cpu is
> not running. I think it can't interrupt subsequent hv_vcpu_run.

Yes.

>> If not, you can reduce a bit the race window by setting a variable in
>> cpu, like
>>
>> 	atomic_set(&cpu->deadline, 0);
>> 	hv_vcpu_interrupt(...)
>>
>> and in the vCPU thread
>>
>> 	hv_vcpu_run_until(..., atomic_read(&cpu->deadline));
>> 	atomic_set(&cpu->deadline, HV_DEADLINE_FOREVER);
>>
> 
> Sure, could you please explain who'll be racing? There's a race if a
> kick was sent after VMEXIT, right? So essentially we need a way to
> "requeue" a kick that was received outside of hv_vcpu_run to avoid loss
> of it?

Yes.  Note that this is not a new bug, it's pre-existing and it's common
to all hypervisors except KVM/WHPX.  I mean not the QEMU code, it's the
kernel APIs that are broken. :)

One way to do so is to keep the signal, and have the signal handler
enable the preemption timer (with a deadline of 0) in the pin-based
VM-execution controls.  Hopefully macOS allows that, especially on 10.15+
where hv_vcpu_run_until probably uses the preemption timer.

> hv_vcpu_run_until is only available on macOS 10.15+ and we can't use yet
> because of three release support rule.
> (https://developer.apple.com/documentation/hypervisor/3181548-hv_vcpu_run_until?language=objc)
> 
> BTW, I'm totally okay to send v2 if kicks are lost and/or the patch
> needs improvements. (and I can address EFER to VMCS Entry Controls
> synchronization as well)
> 
> Paolo, do you know any particular test in kvm-unit-tests that can
> exhibit the issue?

No, it's a race and it's extremely rare, but I point it out because it's
a kernel issue that Apple might want to fix anyway.  It might also be
(depending on how the kernel side is written) that the next scheduler
tick will end up unblocking the vCPU and papering over it.

Paolo
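
A minimal sketch of the signal-handler variant described above, assuming a
per-vCPU hvf_deadline field and the VMCS_PREEMPTION_TIMER_VALUE plumbing
that appears in the v2 delta at the end of this thread (none of this is in
the tree at this point):

/* SIG_IPI handler running on the vCPU thread: arm an immediate deadline. */
static void hvf_sigipi_handler(int signum)
{
    X86CPU *x86_cpu = X86_CPU(current_cpu);

    /* Plain store, async-signal-safe. The vCPU loop copies this value into
     * VMCS_PREEMPTION_TIMER_VALUE right before hv_vcpu_run(), so a kick
     * delivered while the vCPU is outside hv_vcpu_run() still forces the
     * very next guest entry to exit immediately (the preemption timer bit
     * in the pin-based controls has to be enabled as well). */
    atomic_set(&x86_cpu->env.hvf_deadline, 0);
}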
Roman Bolshakov June 29, 2020, 11:31 a.m. UTC | #6
On Thu, Jun 25, 2020 at 08:34:14PM +0200, Paolo Bonzini wrote:
> On 25/06/20 17:57, Roman Bolshakov wrote:
> > So, the kick is not delivered to self and in case if destination cpu is
> > not running. I think it can't interrupt subsequent hv_vcpu_run.
> 
> Yes.
> 
> >> If not, you can reduce a bit the race window by setting a variable in
> >> cpu, like
> >>
> >> 	atomic_set(&cpu->deadline, 0);
> >> 	hv_vcpu_interrupt(...)
> >>
> >> and in the vCPU thread
> >>
> >> 	hv_vcpu_run_until(..., atomic_read(&cpu->deadline));
> >> 	atomic_set(&cpu->deadline, HV_DEADLINE_FOREVER);
> >>
> > 
> > Sure, could you please explain who'll be racing? There's a race if a
> > kick was sent after VMEXIT, right? So essentially we need a way to
> > "requeue" a kick that was received outside of hv_vcpu_run to avoid loss
> > of it?
> 
> Yes.  Note that this is not a new bug, it's pre-existing and it's common
> to all hypervisors except KVM/WHPX.  I mean not the QEMU code, it's the
> kernel APIs that are broken. :)
> 
> One way to do so is to keep the signal, and have the signal handler
> enable the preemption timer (with a deadline of 0) in the pin-based
> interrupt controls.  Hopefully macOS allows that, especially on 10.15+
> where hv_vcpu_run_until probably uses the preemption timer.
> 
> > hv_vcpu_run_until is only available on macOS 10.15+ and we can't use yet
> > because of three release support rule.
> > (https://developer.apple.com/documentation/hypervisor/3181548-hv_vcpu_run_until?language=objc)
> > 
> > BTW, I'm totally okay to send v2 if kicks are lost and/or the patch
> > needs improvements. (and I can address EFER to VMCS Entry Controls
> > synchronization as well)
> > 
> > Paolo, do you know any particular test in kvm-unit-tests that can
> > exhibit the issue?
> 
> No, it's a race and it's extremely rare, but I point it out because it's
> a kernel issue that Apple might want to fix anyway.  It might also be
> (depending on how the kernel side is written) that the next scheduler
> tick will end up unblocking the vCPU and papering over it.
> 

Hi Paolo,

I implemented what you proposed using the VMX-preemption timer in the
Pin-based controls and regular hv_vcpu_run(). It works fine without
noticeable regressions; I'll send that in v2.

hv_vcpu_run_until() was also evaluated on macOS 10.15.5, but it degrades
VM performance significantly compared to explicitly setting the
VMX-preemption timer value and calling hv_vcpu_run(). The performance
issue was observed on a Broadwell-based MacBook Air and an Ivy
Bridge-based MacBook Pro.

macOS 11.0 Beta deprecated hv_vcpu_run() and introduced a special
constant for hv_vcpu_run_until() that's not available on 10.15 -
HV_DEADLINE_FOREVER (UINT64_MAX, which is bigger than the maximum value
of the VMX-preemption timer). Perhaps the performance issue is addressed
there.

Regards,
Roman
Paolo Bonzini June 29, 2020, 1:03 p.m. UTC | #7
On 29/06/20 13:31, Roman Bolshakov wrote:
> I implemented what you proposed using VMX-preemption timer in Pin-based
> controls and regular hv_vcpu_run(). It works fine without noticable
> regressions, I'll send that in v2.
> 
> hv_vcpu_run_until() was also evaluated on macOS 10.15.5 but it degrades
> VM performance significantly compared to explicit setting of
> VMX-preepmtion timer value and hv_vcpu_run(). The performance issue was
> observed on Broadwell-based MacBook Air and Ivy Bridge-based MacBook
> Pro.
> 
> macOS 11.0 Beta deprecated hv_vcpu_run() and introduced a special
> declaration for hv_vcpu_run_until(), that's not available 10.15 -
> HV_DEADLINE_FOREVER (UINT64_MAX, which is bigger than maximum value of
> VMX-preeemption counter). Perhaps the performance issue is addressed
> there.

Possibly.  I'm worried that the preemption-timer trick will fail to run
there, but we'll see.

Paolo
Roman Bolshakov June 29, 2020, 1:29 p.m. UTC | #8
On Mon, Jun 29, 2020 at 03:03:20PM +0200, Paolo Bonzini wrote:
> On 29/06/20 13:31, Roman Bolshakov wrote:
> > I implemented what you proposed using VMX-preemption timer in Pin-based
> > controls and regular hv_vcpu_run(). It works fine without noticable
> > regressions, I'll send that in v2.
> > 
> > hv_vcpu_run_until() was also evaluated on macOS 10.15.5 but it degrades
> > VM performance significantly compared to explicit setting of
> > VMX-preepmtion timer value and hv_vcpu_run(). The performance issue was
> > observed on Broadwell-based MacBook Air and Ivy Bridge-based MacBook
> > Pro.
> > 
> > macOS 11.0 Beta deprecated hv_vcpu_run() and introduced a special
> > declaration for hv_vcpu_run_until(), that's not available 10.15 -
> > HV_DEADLINE_FOREVER (UINT64_MAX, which is bigger than maximum value of
> > VMX-preeemption counter). Perhaps the performance issue is addressed
> > there.
> 
> Possibly.  I'm worried that the preemption-timer trick will fail to run
> there, but we'll see.
> 

Well, I've got new VM-exits (caused by a zero preemption timer) on both
of my laptops.

-Roman
Paolo Bonzini June 29, 2020, 1:35 p.m. UTC | #9
On 29/06/20 15:29, Roman Bolshakov wrote:
>>> macOS 11.0 Beta deprecated hv_vcpu_run() and introduced a special
>>> declaration for hv_vcpu_run_until(), that's not available 10.15 -
>>> HV_DEADLINE_FOREVER (UINT64_MAX, which is bigger than maximum value of
>>> VMX-preeemption counter). Perhaps the performance issue is addressed
>>> there.
>> Possibly.  I'm worried that the preemption-timer trick will fail to run
>> there, but we'll see.
>>
> Well, I've got new VM-exits (caused by zero preemption timer) on either
> of my laptops.

If you have already tried 11.0 that's great.  I was worried that it
would forcibly clear the preemption timer bit when passed
HV_DEADLINE_FOREVER.

Paolo
Roman Bolshakov June 29, 2020, 2:04 p.m. UTC | #10
On Mon, Jun 29, 2020 at 03:35:16PM +0200, Paolo Bonzini wrote:
> On 29/06/20 15:29, Roman Bolshakov wrote:
> >>> macOS 11.0 Beta deprecated hv_vcpu_run() and introduced a special
> >>> declaration for hv_vcpu_run_until(), that's not available 10.15 -
> >>> HV_DEADLINE_FOREVER (UINT64_MAX, which is bigger than maximum value of
> >>> VMX-preeemption counter). Perhaps the performance issue is addressed
> >>> there.
> >> Possibly.  I'm worried that the preemption-timer trick will fail to run
> >> there, but we'll see.
> >>
> > Well, I've got new VM-exits (caused by zero preemption timer) on either
> > of my laptops.
> 
> If you have already tried 11.0 that's great.  I was worried that it
> would forcibly clear the preemption timer bit when passed
> HV_DEADLINE_FOREVER.
> 

I did not, but I can check how it works on it then. Note that I'm not
using hv_vcpu_run_until() because it doesn't have HV_DEADLINE_FOREVER
and it performs poorly on macOS 10.15. My approach is based on
hv_vcpu_run() and should hopefully work almost anywhere
Hypervisor.framework is available, because the Hypervisor framework has
exposed the timer value
(https://developer.apple.com/documentation/hypervisor/vmcs_guest_vmx_timer_value)
since macOS 10.10.3.

I can also test how hv_vcpu_run_until() performs with HV_DEADLINE_FOREVER
on the Beta, and whether the performance issues with the VMX-preemption
timer and hv_vcpu_run_until() are fixed there.

-Roman
Paolo Bonzini June 29, 2020, 2:18 p.m. UTC | #11
On 29/06/20 16:04, Roman Bolshakov wrote:
> My approach is based
> hv_vcpu_run() and should hopefully work almost anywhere where
> Hypervisor.framework is available because Hypervisor framework exposes
> timer value
> (https://developer.apple.com/documentation/hypervisor/vmcs_guest_vmx_timer_value)
> since macOS 10.10.3+.

There are a few other constants for which it would be unwise to write
from userspace, so that's not a big consolation. :)

> I can also test how hv_vcpu_run_until() performs with HV_DEADLINE_FOREVER
> on the Beta. And if the performance issues with VMX-preemption timer and
> hv_vcpu_run_until() are fixed there.

Thanks!  The main thing to test on Big Sur would be: 1) whether the
preemption timer bit in the pin controls "sticks" to 0 after setting it
2) whether the bit reads back as zero after
hv_vcpu_run_until(HV_DEADLINE_FOREVER).

Thanks,

Paolo
Roman Bolshakov June 30, 2020, 10:12 a.m. UTC | #12
On Mon, Jun 29, 2020 at 04:18:46PM +0200, Paolo Bonzini wrote:
> On 29/06/20 16:04, Roman Bolshakov wrote:
> > My approach is based
> > hv_vcpu_run() and should hopefully work almost anywhere where
> > Hypervisor.framework is available because Hypervisor framework exposes
> > timer value
> > (https://developer.apple.com/documentation/hypervisor/vmcs_guest_vmx_timer_value)
> > since macOS 10.10.3+.
> 
> There are a few other constants for which it would be unwise to write
> from userspace, so that's not a big consolation. :)
> 

Hi Paolo,

So, I've tried the Big Sur Beta and it has exactly the same performance
issue with hv_vcpu_run_until(), while hv_vcpu_run() works as well as it
did on 10.15.5. I've submitted FB7827341 to Apple wrt the issue.

> > I can also test how hv_vcpu_run_until() performs with HV_DEADLINE_FOREVER
> > on the Beta. And if the performance issues with VMX-preemption timer and
> > hv_vcpu_run_until() are fixed there.
> 
> Thanks!  The main thing to test on Big Sur would be: 1) whether the
> preemption timer bit in the pin controls "sticks" to 0 after setting it

It does not. If it's set, it stays there.

> 2) whether the bit reads back as zero after
> hv_vcpu_run_until(HV_DEADLINE_FOREVER).
> 

Likewise, it's not cleared if set.

As far as I understand, hv_vcpu_run_until(HV_DEADLINE_FOREVER) works
like hv_vcpu_run() without the VMX-preemption timer. Otherwise,
hv_vcpu_run_until() implicitly sets the VMX-preemption timer Pin-based
control and sets the timer value.

Thanks,
Roman

Here's a patch on top of v2 that adds support for hv_vcpu_run_until() on Big Sur:
diff --git a/target/i386/hvf/hvf.c b/target/i386/hvf/hvf.c
index 317304aa1d..ad202f7358 100644
--- a/target/i386/hvf/hvf.c
+++ b/target/i386/hvf/hvf.c
@@ -72,8 +72,12 @@
 #include "sysemu/accel.h"
 #include "target/i386/cpu.h"
 
+#if defined(__MAC_10_16) && __MAC_OS_X_VERSION_MIN_REQUIRED >= __MAC_10_16
+#define HVF_MAX_DEADLINE HV_DEADLINE_FOREVER
+#else
 /* Maximum value of VMX-preemption timer */
 #define HVF_MAX_DEADLINE UINT32_MAX
+#endif
 
 HVFState *hvf_state;
 
@@ -693,6 +697,7 @@ int hvf_vcpu_exec(CPUState *cpu)
     CPUX86State *env = &x86_cpu->env;
     int ret = 0;
     uint64_t rip = 0;
+    hv_return_t r;
 
     if (hvf_process_events(cpu)) {
         return EXCP_HLT;
@@ -718,10 +723,22 @@ int hvf_vcpu_exec(CPUState *cpu)
         /* Use VMX-preemption timer trick only if available */
         if (rvmcs(cpu->hvf_fd, VMCS_PIN_BASED_CTLS) &
             VMCS_PIN_BASED_CTLS_VMX_PREEMPT_TIMER) {
+#if defined(__MAC_10_16) && __MAC_OS_X_VERSION_MIN_REQUIRED >= __MAC_10_16
+            r = hv_vcpu_run_until(cpu->hvf_fd,
+                                  atomic_read(&env->hvf_deadline));
+        } else {
+            /*
+             * Equivalent to behaviour of hv_vcpu_run() with VMX-preemption
+             * timer disabled, prone to kick loss.
+             */
+            r = hv_vcpu_run_until(cpu->hvf_fd, HVF_MAX_DEADLINE);
+        }
+#else
             wvmcs(cpu->hvf_fd, VMCS_PREEMPTION_TIMER_VALUE,
                   atomic_read(&env->hvf_deadline));
         }
-        hv_return_t r  = hv_vcpu_run(cpu->hvf_fd);
+        r = hv_vcpu_run(cpu->hvf_fd);
+#endif
         atomic_set(&env->hvf_deadline, HVF_MAX_DEADLINE);
         assert_hvf_ok(r);
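
The delta above only covers the run side; the kick side of v2 presumably
looks something like the following (a sketch based on the v1
hvf_vcpu_kick() and the deadline discussion, not copied from the actual v2
patch):

void hvf_vcpu_kick(CPUState *cpu)
{
    X86CPU *x86_cpu = X86_CPU(cpu);
    hv_return_t err;

    /* Make the next guest entry exit immediately even if the kick lands
     * while the vCPU is outside hv_vcpu_run(). */
    atomic_set(&x86_cpu->env.hvf_deadline, 0);

    /* And interrupt the vCPU if it is currently in guest mode. */
    err = hv_vcpu_interrupt(&cpu->hvf_fd, 1);
    if (err) {
        fprintf(stderr, "qemu:%s error %#x\n", __func__, err);
        exit(1);
    }
}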
Paolo Bonzini June 30, 2020, 10:43 a.m. UTC | #13
Ok, I'll review the patch to see what you've implemented.  Thanks!

Paolo

On 30/06/20 12:12, Roman Bolshakov wrote:
> On Mon, Jun 29, 2020 at 04:18:46PM +0200, Paolo Bonzini wrote:
>> On 29/06/20 16:04, Roman Bolshakov wrote:
>>> My approach is based
>>> hv_vcpu_run() and should hopefully work almost anywhere where
>>> Hypervisor.framework is available because Hypervisor framework exposes
>>> timer value
>>> (https://developer.apple.com/documentation/hypervisor/vmcs_guest_vmx_timer_value)
>>> since macOS 10.10.3+.
>>
>> There are a few other constants for which it would be unwise to write
>> from userspace, so that's not a big consolation. :)
>>
> 
> Hi Paolo,
> 
> So, I've tried Big Sur Beta and it has exactly the same performance
> issue with hv_vcpu_run_until() while hv_vcpu_run() works as good as it
> worked on 10.15.5. I've submitted FB7827341 to Apple wrt the issue.
> 
>>> I can also test how hv_vcpu_run_until() performs with HV_DEADLINE_FOREVER
>>> on the Beta. And if the performance issues with VMX-preemption timer and
>>> hv_vcpu_run_until() are fixed there.
>>
>> Thanks!  The main thing to test on Big Sur would be: 1) whether the
>> preemption timer bit in the pin controls "sticks" to 0 after setting it
> 
> It does not. If it's set, it stays there.
> 
>> 2) whether the bit reads back as zero after
>> hv_vcpu_run_until(HV_DEADLINE_FOREVER).
>>
> 
> Likewise, it's not cleared if set.
> 
> As far as I understand, hv_vcpu_run_until(HV_DEADLINE_FOREVER) works
> like hv_vcpu_run() without VMX-preemption timer. Otherwise
> hv_vcpu_run_until() implicitly sets VMX-preemption timer Pin-based
> control and sets the timer value.
> 
> Thanks,
> Roman
> 
> Here's the patch over v2 that adds support of hv_vcpu_run_until() on Big Sur:
> diff --git a/target/i386/hvf/hvf.c b/target/i386/hvf/hvf.c
> index 317304aa1d..ad202f7358 100644
> --- a/target/i386/hvf/hvf.c
> +++ b/target/i386/hvf/hvf.c
> @@ -72,8 +72,12 @@
>  #include "sysemu/accel.h"
>  #include "target/i386/cpu.h"
>  
> +#if defined(__MAC_10_16) && __MAC_OS_X_VERSION_MIN_REQUIRED >= __MAC_10_16
> +#define HVF_MAX_DEADLINE HV_DEADLINE_FOREVER
> +#else
>  /* Maximum value of VMX-preemption timer */
>  #define HVF_MAX_DEADLINE UINT32_MAX
> +#endif
>  
>  HVFState *hvf_state;
>  
> @@ -693,6 +697,7 @@ int hvf_vcpu_exec(CPUState *cpu)
>      CPUX86State *env = &x86_cpu->env;
>      int ret = 0;
>      uint64_t rip = 0;
> +    hv_return_t r;
>  
>      if (hvf_process_events(cpu)) {
>          return EXCP_HLT;
> @@ -718,10 +723,22 @@ int hvf_vcpu_exec(CPUState *cpu)
>          /* Use VMX-preemption timer trick only if available */
>          if (rvmcs(cpu->hvf_fd, VMCS_PIN_BASED_CTLS) &
>              VMCS_PIN_BASED_CTLS_VMX_PREEMPT_TIMER) {
> +#if defined(__MAC_10_16) && __MAC_OS_X_VERSION_MIN_REQUIRED >= __MAC_10_16
> +            r = hv_vcpu_run_until(cpu->hvf_fd,
> +                                  atomic_read(&env->hvf_deadline));
> +        } else {
> +            /*
> +             * Equivalent to behaviour of hv_vcpu_run() with VMX-preemption
> +             * timer disabled, prone to kick loss.
> +             */
> +            r = hv_vcpu_run_until(cpu->hvf_fd, HVF_MAX_DEADLINE);
> +        }
> +#else
>              wvmcs(cpu->hvf_fd, VMCS_PREEMPTION_TIMER_VALUE,
>                    atomic_read(&env->hvf_deadline));
>          }
> -        hv_return_t r  = hv_vcpu_run(cpu->hvf_fd);
> +        r = hv_vcpu_run(cpu->hvf_fd);
> +#endif
>          atomic_set(&env->hvf_deadline, HVF_MAX_DEADLINE);
>          assert_hvf_ok(r);
>  
>

Patch

diff --git a/cpus.c b/cpus.c
index 26709677d3..36f38ce5c8 100644
--- a/cpus.c
+++ b/cpus.c
@@ -1783,10 +1783,15 @@  static void qemu_cpu_kick_thread(CPUState *cpu)
         return;
     }
     cpu->thread_kicked = true;
-    err = pthread_kill(cpu->thread->thread, SIG_IPI);
-    if (err && err != ESRCH) {
-        fprintf(stderr, "qemu:%s: %s", __func__, strerror(err));
-        exit(1);
+
+    if (hvf_enabled()) {
+        hvf_vcpu_kick(cpu);
+    } else {
+        err = pthread_kill(cpu->thread->thread, SIG_IPI);
+        if (err && err != ESRCH) {
+            fprintf(stderr, "qemu:%s: %s", __func__, strerror(err));
+            exit(1);
+        }
     }
 #else /* _WIN32 */
     if (!qemu_cpu_is_self(cpu)) {
diff --git a/include/hw/core/cpu.h b/include/hw/core/cpu.h
index b3f4b79318..288a2bd57e 100644
--- a/include/hw/core/cpu.h
+++ b/include/hw/core/cpu.h
@@ -438,7 +438,7 @@  struct CPUState {
 
     struct hax_vcpu_state *hax_vcpu;
 
-    int hvf_fd;
+    unsigned hvf_fd;
 
     /* track IOMMUs whose translations we've cached in the TCG TLB */
     GArray *iommu_notifiers;
diff --git a/include/sysemu/hvf.h b/include/sysemu/hvf.h
index 1d40a8ec01..aaa00cbf05 100644
--- a/include/sysemu/hvf.h
+++ b/include/sysemu/hvf.h
@@ -25,6 +25,7 @@  extern bool hvf_allowed;
 
 int hvf_init_vcpu(CPUState *);
 int hvf_vcpu_exec(CPUState *);
+void hvf_vcpu_kick(CPUState *);
 void hvf_cpu_synchronize_state(CPUState *);
 void hvf_cpu_synchronize_post_reset(CPUState *);
 void hvf_cpu_synchronize_post_init(CPUState *);
diff --git a/target/i386/hvf/hvf.c b/target/i386/hvf/hvf.c
index efe9802962..4d254a477a 100644
--- a/target/i386/hvf/hvf.c
+++ b/target/i386/hvf/hvf.c
@@ -966,6 +966,17 @@  int hvf_vcpu_exec(CPUState *cpu)
     return ret;
 }
 
+void hvf_vcpu_kick(CPUState *cpu)
+{
+    hv_return_t err;
+
+    err = hv_vcpu_interrupt(&cpu->hvf_fd, 1);
+    if (err) {
+        fprintf(stderr, "qemu:%s error %#x\n", __func__, err);
+        exit(1);
+    }
+}
+
 bool hvf_allowed;
 
 static int hvf_accel_init(MachineState *ms)