
[v7,06/11] x86, paravirt: Add interface to support kvm/xen vcpu preempted check

Message ID 20161115154706.GF11311@worktop.programming.kicks-ass.net (mailing list archive)
State Not Applicable

Commit Message

Peter Zijlstra Nov. 15, 2016, 3:47 p.m. UTC
On Wed, Nov 02, 2016 at 05:08:33AM -0400, Pan Xinhui wrote:
> diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
> index 0f400c0..38c3bb7 100644
> --- a/arch/x86/include/asm/paravirt_types.h
> +++ b/arch/x86/include/asm/paravirt_types.h
> @@ -310,6 +310,8 @@ struct pv_lock_ops {
>  
>  	void (*wait)(u8 *ptr, u8 val);
>  	void (*kick)(int cpu);
> +
> +	bool (*vcpu_is_preempted)(int cpu);
>  };

So that ends up with a full function call in the native case. I did
something like the below on top, completely untested, not been near a
compiler, etc.

It doesn't get rid of the branch, but at least it avoids the function
call, and hardware should have no trouble predicting a constant
condition.

Also, it looks like you end up not setting vcpu_is_preempted when KVM
doesn't support steal clock, which would end up in an instant NULL
deref. Fixed that too.
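
For reference, a minimal standalone sketch of the two native-case shapes
being compared; the names are illustrative stand-ins, not the actual
kernel symbols:

#include <stdbool.h>

/*
 * Shape 1: the op as a plain function pointer, as in v7.  Even on bare
 * metal every caller loads the pointer and makes an indirect call, only
 * to get "false" back.
 */
struct lock_ops {
	bool (*vcpu_is_preempted)(int cpu);
};

static bool native_stub(int cpu)
{
	(void)cpu;
	return false;
}

static struct lock_ops ops = {
	.vcpu_is_preempted = native_stub,
};

static inline bool preempted_indirect(int cpu)
{
	return ops.vcpu_is_preempted(cpu);	/* load + indirect call */
}

/*
 * Shape 2: with the callee-save op plus the patch sites below, the
 * native call site is rewritten to "movl $0, %eax", so it behaves as:
 */
static inline bool preempted_patched(int cpu)
{
	(void)cpu;
	return false;				/* patched-in constant */
}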

---

Comments

xinhui Nov. 16, 2016, 4:19 a.m. UTC | #1
On 2016/11/15 23:47, Peter Zijlstra wrote:
> On Wed, Nov 02, 2016 at 05:08:33AM -0400, Pan Xinhui wrote:
>> diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
>> index 0f400c0..38c3bb7 100644
>> --- a/arch/x86/include/asm/paravirt_types.h
>> +++ b/arch/x86/include/asm/paravirt_types.h
>> @@ -310,6 +310,8 @@ struct pv_lock_ops {
>>
>>  	void (*wait)(u8 *ptr, u8 val);
>>  	void (*kick)(int cpu);
>> +
>> +	bool (*vcpu_is_preempted)(int cpu);
>>  };
>
> So that ends up with a full function call in the native case. I did
> something like the below on top, completely untested, not been near a
> compiler, etc.
>
Hi, Peter.
	I think we can avoid a function call in a simpler way. How about below

static inline bool vcpu_is_preempted(int cpu)
{
	/* only set in pv case*/
	if (pv_lock_ops.vcpu_is_preempted)
		return pv_lock_ops.vcpu_is_preempted(cpu);
	return false;
}


> It doesn't get rid of the branch, but at least it avoids the function
> call, and hardware should have no trouble predicting a constant
> condition.
>
> Also, it looks like you end up not setting vcpu_is_preempted when KVM
> doesn't support steal clock, which would end up in an instant NULL
> deref. Fixed that too.
>
Maybe not true. There is .vcpu_is_preempted = native_vcpu_is_preempted in the definition of pv_lock_ops.

Your patch is a good example for anyone who wants to add a native/pv function. :)

thanks
xinhui

> ---
> --- a/arch/x86/include/asm/paravirt.h
> +++ b/arch/x86/include/asm/paravirt.h
> @@ -673,6 +673,11 @@ static __always_inline void pv_kick(int
>  	PVOP_VCALL1(pv_lock_ops.kick, cpu);
>  }
>
> +static __always_inline void pv_vcpu_is_prempted(int cpu)
> +{
> +	PVOP_VCALLEE1(pv_lock_ops.vcpu_is_preempted, cpu);
> +}
> +
>  #endif /* SMP && PARAVIRT_SPINLOCKS */
>
>  #ifdef CONFIG_X86_32
> --- a/arch/x86/include/asm/paravirt_types.h
> +++ b/arch/x86/include/asm/paravirt_types.h
> @@ -309,7 +309,7 @@ struct pv_lock_ops {
>  	void (*wait)(u8 *ptr, u8 val);
>  	void (*kick)(int cpu);
>
> -	bool (*vcpu_is_preempted)(int cpu);
> +	struct paravirt_callee_save vcpu_is_preempted;
>  };
>
>  /* This contains all the paravirt structures: we get a convenient
> --- a/arch/x86/include/asm/qspinlock.h
> +++ b/arch/x86/include/asm/qspinlock.h
> @@ -32,6 +32,12 @@ static inline void queued_spin_unlock(st
>  {
>  	pv_queued_spin_unlock(lock);
>  }
> +
> +#define vcpu_is_preempted vcpu_is_preempted
> +static inline bool vcpu_is_preempted(int cpu)
> +{
> +	return pv_vcpu_is_preempted(cpu);
> +}
>  #else
>  static inline void queued_spin_unlock(struct qspinlock *lock)
>  {
> --- a/arch/x86/include/asm/spinlock.h
> +++ b/arch/x86/include/asm/spinlock.h
> @@ -26,14 +26,6 @@
>  extern struct static_key paravirt_ticketlocks_enabled;
>  static __always_inline bool static_key_false(struct static_key *key);
>
> -#ifdef CONFIG_PARAVIRT_SPINLOCKS
> -#define vcpu_is_preempted vcpu_is_preempted
> -static inline bool vcpu_is_preempted(int cpu)
> -{
> -	return pv_lock_ops.vcpu_is_preempted(cpu);
> -}
> -#endif
> -
>  #include <asm/qspinlock.h>
>
>  /*
> --- a/arch/x86/kernel/kvm.c
> +++ b/arch/x86/kernel/kvm.c
> @@ -415,15 +415,6 @@ void kvm_disable_steal_time(void)
>  	wrmsr(MSR_KVM_STEAL_TIME, 0, 0);
>  }
>
> -static bool kvm_vcpu_is_preempted(int cpu)
> -{
> -	struct kvm_steal_time *src;
> -
> -	src = &per_cpu(steal_time, cpu);
> -
> -	return !!src->preempted;
> -}
> -
>  #ifdef CONFIG_SMP
>  static void __init kvm_smp_prepare_boot_cpu(void)
>  {
> @@ -480,9 +471,6 @@ void __init kvm_guest_init(void)
>  	if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) {
>  		has_steal_clock = 1;
>  		pv_time_ops.steal_clock = kvm_steal_clock;
> -#ifdef CONFIG_PARAVIRT_SPINLOCKS
> -		pv_lock_ops.vcpu_is_preempted = kvm_vcpu_is_preempted;
> -#endif
>  	}
>
>  	if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
> @@ -604,6 +592,14 @@ static void kvm_wait(u8 *ptr, u8 val)
>  	local_irq_restore(flags);
>  }
>
> +static bool __kvm_vcpu_is_preempted(int cpu)
> +{
> +	struct kvm_steal_time *src = &per_cpu(steal_time, cpu);
> +
> +	return !!src->preempted;
> +}
> +PV_CALLEE_SAVE_REGS_THUNK(__kvm_vcpu_is_preempted);
> +
>  /*
>   * Setup pv_lock_ops to exploit KVM_FEATURE_PV_UNHALT if present.
>   */
> @@ -620,6 +616,12 @@ void __init kvm_spinlock_init(void)
>  	pv_lock_ops.queued_spin_unlock = PV_CALLEE_SAVE(__pv_queued_spin_unlock);
>  	pv_lock_ops.wait = kvm_wait;
>  	pv_lock_ops.kick = kvm_kick_cpu;
> +	pv_lock_ops.vcpu_is_preempted = PV_CALLEE_SAVE(__native_vcpu_is_preempted);
> +
> +	if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) {
> +		pv_lock_ops.vcpu_is_preempted =
> +			PV_CALLEE_SAVE(__kvm_vcpu_is_preempted);
> +	}
>  }
>
>  static __init int kvm_spinlock_init_jump(void)
> --- a/arch/x86/kernel/paravirt-spinlocks.c
> +++ b/arch/x86/kernel/paravirt-spinlocks.c
> @@ -12,7 +12,6 @@ __visible void __native_queued_spin_unlo
>  {
>  	native_queued_spin_unlock(lock);
>  }
> -
>  PV_CALLEE_SAVE_REGS_THUNK(__native_queued_spin_unlock);
>
>  bool pv_is_native_spin_unlock(void)
> @@ -21,9 +20,16 @@ bool pv_is_native_spin_unlock(void)
>  		__raw_callee_save___native_queued_spin_unlock;
>  }
>
> -static bool native_vcpu_is_preempted(int cpu)
> +__visible bool __native_vcpu_is_preempted(int cpu)
>  {
> -	return 0;
> +	return false;
> +}
> +PV_CALLEE_SAVE_REGS_THUNK(__native_vcpu_is_preempted);
> +
> +bool pv_is_native_vcpu_is_preempted(void)
> +{
> +	return pv_lock_ops.queued_spin_unlock.func ==
> +		__raw_callee_save__native_vcpu_is_preempted;
>  }
>
copy-paste issue...

>  struct pv_lock_ops pv_lock_ops = {
> @@ -32,7 +38,7 @@ struct pv_lock_ops pv_lock_ops = {
>  	.queued_spin_unlock = PV_CALLEE_SAVE(__native_queued_spin_unlock),
>  	.wait = paravirt_nop,
>  	.kick = paravirt_nop,
> -	.vcpu_is_preempted = native_vcpu_is_preempted,
> +	.vcpu_is_preempted = PV_CALLEE_SAVE(__native_vcpu_is_preempted),
>  #endif /* SMP */
>  };
>  EXPORT_SYMBOL(pv_lock_ops);
> --- a/arch/x86/kernel/paravirt_patch_32.c
> +++ b/arch/x86/kernel/paravirt_patch_32.c
> @@ -11,6 +11,7 @@ DEF_NATIVE(pv_mmu_ops, read_cr3, "mov %c
>
>  #if defined(CONFIG_PARAVIRT_SPINLOCKS)
>  DEF_NATIVE(pv_lock_ops, queued_spin_unlock, "movb $0, (%eax)");
> +DEF_NATIVE(pv_lock_ops, vcpu_is_preempted, "movl $0, %eax");
>  #endif
>
>  unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len)
> @@ -26,6 +27,7 @@ unsigned paravirt_patch_ident_64(void *i
>  }
>
>  extern bool pv_is_native_spin_unlock(void);
> +extern bool pv_is_native_vcpu_is_preempted(void);
>
>  unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
>  		      unsigned long addr, unsigned len)
> @@ -54,6 +56,12 @@ unsigned native_patch(u8 type, u16 clobb
>  				end   = end_pv_lock_ops_queued_spin_unlock;
>  				goto patch_site;
>  			}
> +		case PARAVIRT_PATCH(pv_lock_ops.vcpu_is_preempted):
> +			if (pv_is_native_vcpu_is_preempted()) {
> +				start = start_pv_lock_ops_vcpu_is_preempted;
> +				end   = end_pv_lock_ops_vcpu_is_preempted;
> +				goto patch_site;
> +			}
>  #endif
>
>  	default:
> --- a/arch/x86/kernel/paravirt_patch_64.c
> +++ b/arch/x86/kernel/paravirt_patch_64.c
> @@ -20,6 +20,7 @@ DEF_NATIVE(, mov64, "mov %rdi, %rax");
>
>  #if defined(CONFIG_PARAVIRT_SPINLOCKS)
>  DEF_NATIVE(pv_lock_ops, queued_spin_unlock, "movb $0, (%rdi)");
> +DEF_NATIVE(pv_lock_ops, vcpu_is_preempted, "movl $0, rax");
>  #endif
>
>  unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len)
> @@ -35,6 +36,7 @@ unsigned paravirt_patch_ident_64(void *i
>  }
>
>  extern bool pv_is_native_spin_unlock(void);
> +extern bool pv_is_native_vcpu_is_preempted(void);
>
>  unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
>  		      unsigned long addr, unsigned len)
> @@ -66,6 +68,12 @@ unsigned native_patch(u8 type, u16 clobb
>  				end   = end_pv_lock_ops_queued_spin_unlock;
>  				goto patch_site;
>  			}
> +		case PARAVIRT_PATCH(pv_lock_ops.vcpu_is_preempted):
> +			if (pv_is_native_vcpu_is_preempted()) {
> +				start = start_pv_lock_ops_vcpu_is_preempted;
> +				end   = end_pv_lock_ops_vcpu_is_preempted;
> +				goto patch_site;
> +			}
>  #endif
>
>  	default:
> --- a/arch/x86/xen/spinlock.c
> +++ b/arch/x86/xen/spinlock.c
> @@ -114,6 +114,8 @@ void xen_uninit_lock_cpu(int cpu)
>  	per_cpu(irq_name, cpu) = NULL;
>  }
>
> +PV_CALLEE_SAVE_REGS_THUNK(xen_vcpu_stolen);
> +
>  /*
>   * Our init of PV spinlocks is split in two init functions due to us
>   * using paravirt patching and jump labels patching and having to do
> @@ -136,8 +138,7 @@ void __init xen_init_spinlocks(void)
>  	pv_lock_ops.queued_spin_unlock = PV_CALLEE_SAVE(__pv_queued_spin_unlock);
>  	pv_lock_ops.wait = xen_qlock_wait;
>  	pv_lock_ops.kick = xen_qlock_kick;
> -
> -	pv_lock_ops.vcpu_is_preempted = xen_vcpu_stolen;
> +	pv_lock_ops.vcpu_is_preempted = PV_CALLEE_SAVE(xen_vcpu_stolen);
>  }
>
>  /*
>
Peter Zijlstra Nov. 16, 2016, 10:23 a.m. UTC | #2
On Wed, Nov 16, 2016 at 12:19:09PM +0800, Pan Xinhui wrote:
> Hi, Peter.
> 	I think we can avoid a function call in a simpler way. How about below
> 
> static inline bool vcpu_is_preempted(int cpu)
> {
> 	/* only set in pv case*/
> 	if (pv_lock_ops.vcpu_is_preempted)
> 		return pv_lock_ops.vcpu_is_preempted(cpu);
> 	return false;
> }

That is still more expensive. It needs to do an actual load and makes it
hard to predict the branch, you'd have to actually wait for the load to
complete etc.

Also, it generates more code.

Paravirt muck should strive to be as cheap as possible when run on
native hardware.
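
Roughly, with illustrative names (not the actual kernel code), the
native case then looks like:

#include <stdbool.h>

struct lock_ops {
	bool (*vcpu_is_preempted)(int cpu);	/* left NULL on bare metal */
};
extern struct lock_ops ops;

static inline bool preempted_nullcheck(int cpu)
{
	if (ops.vcpu_is_preempted)		/* load + test + branch ... */
		return ops.vcpu_is_preempted(cpu);
	return false;				/* ... plus a move, at every call site */
}

whereas the patched callee-save op leaves a single "movl $0, %eax"
behind at a native call site.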
Christian Borntraeger Nov. 16, 2016, 11:29 a.m. UTC | #3
On 11/16/2016 11:23 AM, Peter Zijlstra wrote:
> On Wed, Nov 16, 2016 at 12:19:09PM +0800, Pan Xinhui wrote:
>> Hi, Peter.
>> 	I think we can avoid a function call in a simpler way. How about below
>>
>> static inline bool vcpu_is_preempted(int cpu)
>> {
>> 	/* only set in pv case*/
>> 	if (pv_lock_ops.vcpu_is_preempted)
>> 		return pv_lock_ops.vcpu_is_preempted(cpu);
>> 	return false;
>> }
> 
> That is still more expensive. It needs to do an actual load and makes it
> hard to predict the branch, you'd have to actually wait for the load to
> complete etc.

Out of curiosity, why is that hard to predict?
On s390 the branch prediction runs asynchronously ahead of the downstream
pipeline (e.g. search for "IBM z Systems Processor Optimization Primer" page 11).
Given enough capacity, I would assume that modern x86 processors would do the same
and be able to predict this as soon as it becomes hot (and otherwise you would
not notice the branch miss anyway). Is x86 behaving differently here?

> Also, it generates more code.
> 
> Paravirt muck should strive to be as cheap as possible when run on
> native hardware.

As I am interested in this series from the s390 point of view, is this
the only thing that blocks this series?

Is there a chance to add a static key around the paravirt ops somehow?
Peter Zijlstra Nov. 16, 2016, 11:43 a.m. UTC | #4
On Wed, Nov 16, 2016 at 12:29:44PM +0100, Christian Borntraeger wrote:
> On 11/16/2016 11:23 AM, Peter Zijlstra wrote:
> > On Wed, Nov 16, 2016 at 12:19:09PM +0800, Pan Xinhui wrote:
> >> Hi, Peter.
> >> 	I think we can avoid a function call in a simpler way. How about below
> >>
> >> static inline bool vcpu_is_preempted(int cpu)
> >> {
> >> 	/* only set in pv case*/
> >> 	if (pv_lock_ops.vcpu_is_preempted)
> >> 		return pv_lock_ops.vcpu_is_preempted(cpu);
> >> 	return false;
> >> }
> > 
> > That is still more expensive. It needs to do an actual load and makes it
> > hard to predict the branch, you'd have to actually wait for the load to
> > complete etc.
> 
> Out of curiosity, why is that hard to predict?
> On s390 the branch prediction runs asynchronously ahead of the downstream
> pipeline (e.g. search for "IBM z Systems Processor Optimization Primer" page 11).
> Given enough capacity, I would assume that modern x86 processors would do the same
> and be able to predict this as soon as it becomes hot (and otherwise you would
> not notice the branch miss anyway). Is x86 behaving differently here?

Not sure how exactly it works, but it seems to me that an immediate
assignment to the value you're going to compare would leave very little
doubt.

Then again, maybe cores aren't that smart and only look at the
hysterical btb for prediction.

> > Also, it generates more code.
> > 
> > Paravirt muck should strive to be as cheap as possible when run on
> > native hardware.
> 
> As I am interested in this series from the s390 point of view, is this
> the only thing that blocks this series?

Ingo was rewriting the changelog; other than that, no, I can do this on
top. Just spotted this because Ingo and I talked it over.

> Is there a chance to add a static key around the paravirt ops somehow?

More code generation still; replacing the call with an immediate
assignment to the return register is the shortest possible option, I
think.
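
For reference, a rough sketch of the static_key variant being asked
about (not proposed code; the names are illustrative, and it assumes
the original v7 function-pointer op and the generic jump-label API):

#include <linux/jump_label.h>

static DEFINE_STATIC_KEY_FALSE(vcpu_preempt_enabled);
/* a pv guest would enable it once during init:
 *	static_branch_enable(&vcpu_preempt_enabled);
 */

static inline bool vcpu_is_preempted(int cpu)
{
	/*
	 * Native: the branch is patched to a NOP and we fall through to
	 * "return false", but every call site still carries the
	 * out-of-line pv path -- more code than the single patched
	 * "movl $0, %eax".
	 */
	if (static_branch_unlikely(&vcpu_preempt_enabled))
		return pv_lock_ops.vcpu_is_preempted(cpu);
	return false;
}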
xinhui Nov. 17, 2016, 5:16 a.m. UTC | #5
On 2016/11/16 18:23, Peter Zijlstra wrote:
> On Wed, Nov 16, 2016 at 12:19:09PM +0800, Pan Xinhui wrote:
>> Hi, Peter.
>> 	I think we can avoid a function call in a simpler way. How about below
>>
>> static inline bool vcpu_is_preempted(int cpu)
>> {
>> 	/* only set in pv case*/
>> 	if (pv_lock_ops.vcpu_is_preempted)
>> 		return pv_lock_ops.vcpu_is_preempted(cpu);
>> 	return false;
>> }
>
> That is still more expensive. It needs to do an actual load and makes it
> hard to predict the branch, you'd have to actually wait for the load to
> complete etc.
>
Yes, one more load in the native case. I think this is acceptable, as vcpu_is_preempted is not a critical function.

However, if we use PV_CALLEE_SAVE_REGS_THUNK, more registers than necessary might be saved/restored in the pv case.
That will introduce a little overhead.

But I think I am okay with your idea. I can make another patch based on this patchset with your Suggested-by.

thanks
xinhui

> Also, it generates more code.
>
> Paravirt muck should strive to be as cheap as possible when run on
> native hardware.
>

Patch

--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -673,6 +673,11 @@  static __always_inline void pv_kick(int
 	PVOP_VCALL1(pv_lock_ops.kick, cpu);
 }
 
+static __always_inline void pv_vcpu_is_prempted(int cpu)
+{
+	PVOP_VCALLEE1(pv_lock_ops.vcpu_is_preempted, cpu);
+}
+
 #endif /* SMP && PARAVIRT_SPINLOCKS */
 
 #ifdef CONFIG_X86_32
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -309,7 +309,7 @@  struct pv_lock_ops {
 	void (*wait)(u8 *ptr, u8 val);
 	void (*kick)(int cpu);
 
-	bool (*vcpu_is_preempted)(int cpu);
+	struct paravirt_callee_save vcpu_is_preempted;
 };
 
 /* This contains all the paravirt structures: we get a convenient
--- a/arch/x86/include/asm/qspinlock.h
+++ b/arch/x86/include/asm/qspinlock.h
@@ -32,6 +32,12 @@  static inline void queued_spin_unlock(st
 {
 	pv_queued_spin_unlock(lock);
 }
+
+#define vcpu_is_preempted vcpu_is_preempted
+static inline bool vcpu_is_preempted(int cpu)
+{
+	return pv_vcpu_is_preempted(cpu);
+}
 #else
 static inline void queued_spin_unlock(struct qspinlock *lock)
 {
--- a/arch/x86/include/asm/spinlock.h
+++ b/arch/x86/include/asm/spinlock.h
@@ -26,14 +26,6 @@ 
 extern struct static_key paravirt_ticketlocks_enabled;
 static __always_inline bool static_key_false(struct static_key *key);
 
-#ifdef CONFIG_PARAVIRT_SPINLOCKS
-#define vcpu_is_preempted vcpu_is_preempted
-static inline bool vcpu_is_preempted(int cpu)
-{
-	return pv_lock_ops.vcpu_is_preempted(cpu);
-}
-#endif
-
 #include <asm/qspinlock.h>
 
 /*
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -415,15 +415,6 @@  void kvm_disable_steal_time(void)
 	wrmsr(MSR_KVM_STEAL_TIME, 0, 0);
 }
 
-static bool kvm_vcpu_is_preempted(int cpu)
-{
-	struct kvm_steal_time *src;
-
-	src = &per_cpu(steal_time, cpu);
-
-	return !!src->preempted;
-}
-
 #ifdef CONFIG_SMP
 static void __init kvm_smp_prepare_boot_cpu(void)
 {
@@ -480,9 +471,6 @@  void __init kvm_guest_init(void)
 	if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) {
 		has_steal_clock = 1;
 		pv_time_ops.steal_clock = kvm_steal_clock;
-#ifdef CONFIG_PARAVIRT_SPINLOCKS
-		pv_lock_ops.vcpu_is_preempted = kvm_vcpu_is_preempted;
-#endif
 	}
 
 	if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
@@ -604,6 +592,14 @@  static void kvm_wait(u8 *ptr, u8 val)
 	local_irq_restore(flags);
 }
 
+static bool __kvm_vcpu_is_preempted(int cpu)
+{
+	struct kvm_steal_time *src = &per_cpu(steal_time, cpu);
+
+	return !!src->preempted;
+}
+PV_CALLEE_SAVE_REGS_THUNK(__kvm_vcpu_is_preempted);
+
 /*
  * Setup pv_lock_ops to exploit KVM_FEATURE_PV_UNHALT if present.
  */
@@ -620,6 +616,12 @@  void __init kvm_spinlock_init(void)
 	pv_lock_ops.queued_spin_unlock = PV_CALLEE_SAVE(__pv_queued_spin_unlock);
 	pv_lock_ops.wait = kvm_wait;
 	pv_lock_ops.kick = kvm_kick_cpu;
+	pv_lock_ops.vcpu_is_preempted = PV_CALLEE_SAVE(__native_vcpu_is_preempted);
+
+	if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) {
+		pv_lock_ops.vcpu_is_preempted =
+			PV_CALLEE_SAVE(__kvm_vcpu_is_preempted);
+	}
 }
 
 static __init int kvm_spinlock_init_jump(void)
--- a/arch/x86/kernel/paravirt-spinlocks.c
+++ b/arch/x86/kernel/paravirt-spinlocks.c
@@ -12,7 +12,6 @@  __visible void __native_queued_spin_unlo
 {
 	native_queued_spin_unlock(lock);
 }
-
 PV_CALLEE_SAVE_REGS_THUNK(__native_queued_spin_unlock);
 
 bool pv_is_native_spin_unlock(void)
@@ -21,9 +20,16 @@  bool pv_is_native_spin_unlock(void)
 		__raw_callee_save___native_queued_spin_unlock;
 }
 
-static bool native_vcpu_is_preempted(int cpu)
+__visible bool __native_vcpu_is_preempted(int cpu)
 {
-	return 0;
+	return false;
+}
+PV_CALLEE_SAVE_REGS_THUNK(__native_vcpu_is_preempted);
+
+bool pv_is_native_vcpu_is_preempted(void)
+{
+	return pv_lock_ops.queued_spin_unlock.func ==
+		__raw_callee_save__native_vcpu_is_preempted;
 }
 
 struct pv_lock_ops pv_lock_ops = {
@@ -32,7 +38,7 @@  struct pv_lock_ops pv_lock_ops = {
 	.queued_spin_unlock = PV_CALLEE_SAVE(__native_queued_spin_unlock),
 	.wait = paravirt_nop,
 	.kick = paravirt_nop,
-	.vcpu_is_preempted = native_vcpu_is_preempted,
+	.vcpu_is_preempted = PV_CALLEE_SAVE(__native_vcpu_is_preempted),
 #endif /* SMP */
 };
 EXPORT_SYMBOL(pv_lock_ops);
--- a/arch/x86/kernel/paravirt_patch_32.c
+++ b/arch/x86/kernel/paravirt_patch_32.c
@@ -11,6 +11,7 @@  DEF_NATIVE(pv_mmu_ops, read_cr3, "mov %c
 
 #if defined(CONFIG_PARAVIRT_SPINLOCKS)
 DEF_NATIVE(pv_lock_ops, queued_spin_unlock, "movb $0, (%eax)");
+DEF_NATIVE(pv_lock_ops, vcpu_is_preempted, "movl $0, %eax");
 #endif
 
 unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len)
@@ -26,6 +27,7 @@  unsigned paravirt_patch_ident_64(void *i
 }
 
 extern bool pv_is_native_spin_unlock(void);
+extern bool pv_is_native_vcpu_is_preempted(void);
 
 unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
 		      unsigned long addr, unsigned len)
@@ -54,6 +56,12 @@  unsigned native_patch(u8 type, u16 clobb
 				end   = end_pv_lock_ops_queued_spin_unlock;
 				goto patch_site;
 			}
+		case PARAVIRT_PATCH(pv_lock_ops.vcpu_is_preempted):
+			if (pv_is_native_vcpu_is_preempted()) {
+				start = start_pv_lock_ops_vcpu_is_preempted;
+				end   = end_pv_lock_ops_vcpu_is_preempted;
+				goto patch_site;
+			}
 #endif
 
 	default:
--- a/arch/x86/kernel/paravirt_patch_64.c
+++ b/arch/x86/kernel/paravirt_patch_64.c
@@ -20,6 +20,7 @@  DEF_NATIVE(, mov64, "mov %rdi, %rax");
 
 #if defined(CONFIG_PARAVIRT_SPINLOCKS)
 DEF_NATIVE(pv_lock_ops, queued_spin_unlock, "movb $0, (%rdi)");
+DEF_NATIVE(pv_lock_ops, vcpu_is_preempted, "movl $0, rax");
 #endif
 
 unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len)
@@ -35,6 +36,7 @@  unsigned paravirt_patch_ident_64(void *i
 }
 
 extern bool pv_is_native_spin_unlock(void);
+extern bool pv_is_native_vcpu_is_preempted(void);
 
 unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
 		      unsigned long addr, unsigned len)
@@ -66,6 +68,12 @@  unsigned native_patch(u8 type, u16 clobb
 				end   = end_pv_lock_ops_queued_spin_unlock;
 				goto patch_site;
 			}
+		case PARAVIRT_PATCH(pv_lock_ops.vcpu_is_preempted):
+			if (pv_is_native_vcpu_is_preempted()) {
+				start = start_pv_lock_ops_vcpu_is_preempted;
+				end   = end_pv_lock_ops_vcpu_is_preempted;
+				goto patch_site;
+			}
 #endif
 
 	default:
--- a/arch/x86/xen/spinlock.c
+++ b/arch/x86/xen/spinlock.c
@@ -114,6 +114,8 @@  void xen_uninit_lock_cpu(int cpu)
 	per_cpu(irq_name, cpu) = NULL;
 }
 
+PV_CALLEE_SAVE_REGS_THUNK(xen_vcpu_stolen);
+
 /*
  * Our init of PV spinlocks is split in two init functions due to us
  * using paravirt patching and jump labels patching and having to do
@@ -136,8 +138,7 @@  void __init xen_init_spinlocks(void)
 	pv_lock_ops.queued_spin_unlock = PV_CALLEE_SAVE(__pv_queued_spin_unlock);
 	pv_lock_ops.wait = xen_qlock_wait;
 	pv_lock_ops.kick = xen_qlock_kick;
-
-	pv_lock_ops.vcpu_is_preempted = xen_vcpu_stolen;
+	pv_lock_ops.vcpu_is_preempted = PV_CALLEE_SAVE(xen_vcpu_stolen);
 }
 
 /*