diff mbox series

[v1,11/55] powerpc/time: add API for KVM to re-arm the host timer/decrementer

Message ID 20210726035036.739609-12-npiggin@gmail.com
State New
Headers show
Series [v1,01/55] KVM: PPC: Book3S HV: Remove TM emulation from POWER7/8 path | expand

Commit Message

Nicholas Piggin July 26, 2021, 3:49 a.m. UTC
Rather than have KVM look up the host timer and fiddle with the
irq-work internal details, have the powerpc/time.c code provide a
function for KVM to re-arm the Linux timer code when exiting a
guest.

This is implementation has an improvement over existing code of
marking a decrementer interrupt as soft-pending if a timer has
expired, rather than setting DEC to a -ve value, which tended to
cause host timers to take two interrupts (first hdec to exit the
guest, then the immediate dec).

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/powerpc/include/asm/time.h | 16 +++-------
 arch/powerpc/kernel/time.c      | 52 +++++++++++++++++++++++++++------
 arch/powerpc/kvm/book3s_hv.c    |  7 ++---
 3 files changed, 49 insertions(+), 26 deletions(-)

Comments

Christophe Leroy Aug. 5, 2021, 7:22 a.m. UTC | #1
Le 26/07/2021 à 05:49, Nicholas Piggin a écrit :
> Rather than have KVM look up the host timer and fiddle with the
> irq-work internal details, have the powerpc/time.c code provide a
> function for KVM to re-arm the Linux timer code when exiting a
> guest.
> 
> This is implementation has an improvement over existing code of
> marking a decrementer interrupt as soft-pending if a timer has
> expired, rather than setting DEC to a -ve value, which tended to
> cause host timers to take two interrupts (first hdec to exit the
> guest, then the immediate dec).
> 
> Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
> ---
>   arch/powerpc/include/asm/time.h | 16 +++-------
>   arch/powerpc/kernel/time.c      | 52 +++++++++++++++++++++++++++------
>   arch/powerpc/kvm/book3s_hv.c    |  7 ++---
>   3 files changed, 49 insertions(+), 26 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h
> index 69b6be617772..924b2157882f 100644
> --- a/arch/powerpc/include/asm/time.h
> +++ b/arch/powerpc/include/asm/time.h
> @@ -99,18 +99,6 @@ extern void div128_by_32(u64 dividend_high, u64 dividend_low,
>   extern void secondary_cpu_time_init(void);
>   extern void __init time_init(void);
>   
> -#ifdef CONFIG_PPC64
> -static inline unsigned long test_irq_work_pending(void)
> -{
> -	unsigned long x;
> -
> -	asm volatile("lbz %0,%1(13)"
> -		: "=r" (x)
> -		: "i" (offsetof(struct paca_struct, irq_work_pending)));
> -	return x;
> -}
> -#endif
> -
>   DECLARE_PER_CPU(u64, decrementers_next_tb);
>   
>   static inline u64 timer_get_next_tb(void)
> @@ -118,6 +106,10 @@ static inline u64 timer_get_next_tb(void)
>   	return __this_cpu_read(decrementers_next_tb);
>   }
>   
> +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
> +void timer_rearm_host_dec(u64 now);
> +#endif
> +
>   /* Convert timebase ticks to nanoseconds */
>   unsigned long long tb_to_ns(unsigned long long tb_ticks);
>   
> diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
> index 72d872b49167..016828b7401b 100644
> --- a/arch/powerpc/kernel/time.c
> +++ b/arch/powerpc/kernel/time.c
> @@ -499,6 +499,16 @@ EXPORT_SYMBOL(profile_pc);
>    * 64-bit uses a byte in the PACA, 32-bit uses a per-cpu variable...
>    */
>   #ifdef CONFIG_PPC64
> +static inline unsigned long test_irq_work_pending(void)
> +{
> +	unsigned long x;
> +
> +	asm volatile("lbz %0,%1(13)"
> +		: "=r" (x)
> +		: "i" (offsetof(struct paca_struct, irq_work_pending)));

Can we just use READ_ONCE() instead of hard coding the read ?


> +	return x;
> +}
> +
>   static inline void set_irq_work_pending_flag(void)
>   {
>   	asm volatile("stb %0,%1(13)" : :
> @@ -542,13 +552,44 @@ void arch_irq_work_raise(void)
>   	preempt_enable();
>   }
>   
> +static void set_dec_or_work(u64 val)
> +{
> +	set_dec(val);
> +	/* We may have raced with new irq work */
> +	if (unlikely(test_irq_work_pending()))
> +		set_dec(1);
> +}
> +
>   #else  /* CONFIG_IRQ_WORK */
>   
>   #define test_irq_work_pending()	0
>   #define clear_irq_work_pending()
>   
> +static void set_dec_or_work(u64 val)
> +{
> +	set_dec(val);
> +}
>   #endif /* CONFIG_IRQ_WORK */
>   
> +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
> +void timer_rearm_host_dec(u64 now)
> +{
> +	u64 *next_tb = this_cpu_ptr(&decrementers_next_tb);
> +
> +	WARN_ON_ONCE(!arch_irqs_disabled());
> +	WARN_ON_ONCE(mfmsr() & MSR_EE);
> +
> +	if (now >= *next_tb) {
> +		local_paca->irq_happened |= PACA_IRQ_DEC;
> +	} else {
> +		now = *next_tb - now;
> +		if (now <= decrementer_max)
> +			set_dec_or_work(now);
> +	}
> +}
> +EXPORT_SYMBOL_GPL(timer_rearm_host_dec);
> +#endif
> +
>   /*
>    * timer_interrupt - gets called when the decrementer overflows,
>    * with interrupts disabled.
> @@ -609,10 +650,7 @@ DEFINE_INTERRUPT_HANDLER_ASYNC(timer_interrupt)
>   	} else {
>   		now = *next_tb - now;
>   		if (now <= decrementer_max)
> -			set_dec(now);
> -		/* We may have raced with new irq work */
> -		if (test_irq_work_pending())
> -			set_dec(1);
> +			set_dec_or_work(now);
>   		__this_cpu_inc(irq_stat.timer_irqs_others);
>   	}
>   
> @@ -854,11 +892,7 @@ static int decrementer_set_next_event(unsigned long evt,
>   				      struct clock_event_device *dev)
>   {
>   	__this_cpu_write(decrementers_next_tb, get_tb() + evt);
> -	set_dec(evt);
> -
> -	/* We may have raced with new irq work */
> -	if (test_irq_work_pending())
> -		set_dec(1);
> +	set_dec_or_work(evt);
>   
>   	return 0;
>   }
> diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
> index 6e6cfb10e9bb..0cef578930f9 100644
> --- a/arch/powerpc/kvm/book3s_hv.c
> +++ b/arch/powerpc/kvm/book3s_hv.c
> @@ -4018,11 +4018,8 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit,
>   	vc->entry_exit_map = 0x101;
>   	vc->in_guest = 0;
>   
> -	next_timer = timer_get_next_tb();
> -	set_dec(next_timer - tb);
> -	/* We may have raced with new irq work */
> -	if (test_irq_work_pending())
> -		set_dec(1);
> +	timer_rearm_host_dec(tb);
> +
>   	mtspr(SPRN_SPRG_VDSO_WRITE, local_paca->sprg_vdso);
>   
>   	kvmhv_load_host_pmu();
>
Nicholas Piggin Aug. 6, 2021, 10:30 a.m. UTC | #2
Excerpts from Christophe Leroy's message of August 5, 2021 5:22 pm:
> 
> 
> Le 26/07/2021 à 05:49, Nicholas Piggin a écrit :
>> Rather than have KVM look up the host timer and fiddle with the
>> irq-work internal details, have the powerpc/time.c code provide a
>> function for KVM to re-arm the Linux timer code when exiting a
>> guest.
>> 
>> This is implementation has an improvement over existing code of
>> marking a decrementer interrupt as soft-pending if a timer has
>> expired, rather than setting DEC to a -ve value, which tended to
>> cause host timers to take two interrupts (first hdec to exit the
>> guest, then the immediate dec).
>> 
>> Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
>> ---
>>   arch/powerpc/include/asm/time.h | 16 +++-------
>>   arch/powerpc/kernel/time.c      | 52 +++++++++++++++++++++++++++------
>>   arch/powerpc/kvm/book3s_hv.c    |  7 ++---
>>   3 files changed, 49 insertions(+), 26 deletions(-)
>> 
>> diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h
>> index 69b6be617772..924b2157882f 100644
>> --- a/arch/powerpc/include/asm/time.h
>> +++ b/arch/powerpc/include/asm/time.h
>> @@ -99,18 +99,6 @@ extern void div128_by_32(u64 dividend_high, u64 dividend_low,
>>   extern void secondary_cpu_time_init(void);
>>   extern void __init time_init(void);
>>   
>> -#ifdef CONFIG_PPC64
>> -static inline unsigned long test_irq_work_pending(void)
>> -{
>> -	unsigned long x;
>> -
>> -	asm volatile("lbz %0,%1(13)"
>> -		: "=r" (x)
>> -		: "i" (offsetof(struct paca_struct, irq_work_pending)));
>> -	return x;
>> -}
>> -#endif
>> -
>>   DECLARE_PER_CPU(u64, decrementers_next_tb);
>>   
>>   static inline u64 timer_get_next_tb(void)
>> @@ -118,6 +106,10 @@ static inline u64 timer_get_next_tb(void)
>>   	return __this_cpu_read(decrementers_next_tb);
>>   }
>>   
>> +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
>> +void timer_rearm_host_dec(u64 now);
>> +#endif
>> +
>>   /* Convert timebase ticks to nanoseconds */
>>   unsigned long long tb_to_ns(unsigned long long tb_ticks);
>>   
>> diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
>> index 72d872b49167..016828b7401b 100644
>> --- a/arch/powerpc/kernel/time.c
>> +++ b/arch/powerpc/kernel/time.c
>> @@ -499,6 +499,16 @@ EXPORT_SYMBOL(profile_pc);
>>    * 64-bit uses a byte in the PACA, 32-bit uses a per-cpu variable...
>>    */
>>   #ifdef CONFIG_PPC64
>> +static inline unsigned long test_irq_work_pending(void)
>> +{
>> +	unsigned long x;
>> +
>> +	asm volatile("lbz %0,%1(13)"
>> +		: "=r" (x)
>> +		: "i" (offsetof(struct paca_struct, irq_work_pending)));
> 
> Can we just use READ_ONCE() instead of hard coding the read ?

Good question, probably yes. Probably calls for its own patch series, 
e.g., hw_irq.h has all similar paca accesses.

Thanks,
Nick
diff mbox series

Patch

diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h
index 69b6be617772..924b2157882f 100644
--- a/arch/powerpc/include/asm/time.h
+++ b/arch/powerpc/include/asm/time.h
@@ -99,18 +99,6 @@  extern void div128_by_32(u64 dividend_high, u64 dividend_low,
 extern void secondary_cpu_time_init(void);
 extern void __init time_init(void);
 
-#ifdef CONFIG_PPC64
-static inline unsigned long test_irq_work_pending(void)
-{
-	unsigned long x;
-
-	asm volatile("lbz %0,%1(13)"
-		: "=r" (x)
-		: "i" (offsetof(struct paca_struct, irq_work_pending)));
-	return x;
-}
-#endif
-
 DECLARE_PER_CPU(u64, decrementers_next_tb);
 
 static inline u64 timer_get_next_tb(void)
@@ -118,6 +106,10 @@  static inline u64 timer_get_next_tb(void)
 	return __this_cpu_read(decrementers_next_tb);
 }
 
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+void timer_rearm_host_dec(u64 now);
+#endif
+
 /* Convert timebase ticks to nanoseconds */
 unsigned long long tb_to_ns(unsigned long long tb_ticks);
 
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index 72d872b49167..016828b7401b 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -499,6 +499,16 @@  EXPORT_SYMBOL(profile_pc);
  * 64-bit uses a byte in the PACA, 32-bit uses a per-cpu variable...
  */
 #ifdef CONFIG_PPC64
+static inline unsigned long test_irq_work_pending(void)
+{
+	unsigned long x;
+
+	asm volatile("lbz %0,%1(13)"
+		: "=r" (x)
+		: "i" (offsetof(struct paca_struct, irq_work_pending)));
+	return x;
+}
+
 static inline void set_irq_work_pending_flag(void)
 {
 	asm volatile("stb %0,%1(13)" : :
@@ -542,13 +552,44 @@  void arch_irq_work_raise(void)
 	preempt_enable();
 }
 
+static void set_dec_or_work(u64 val)
+{
+	set_dec(val);
+	/* We may have raced with new irq work */
+	if (unlikely(test_irq_work_pending()))
+		set_dec(1);
+}
+
 #else  /* CONFIG_IRQ_WORK */
 
 #define test_irq_work_pending()	0
 #define clear_irq_work_pending()
 
+static void set_dec_or_work(u64 val)
+{
+	set_dec(val);
+}
 #endif /* CONFIG_IRQ_WORK */
 
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+void timer_rearm_host_dec(u64 now)
+{
+	u64 *next_tb = this_cpu_ptr(&decrementers_next_tb);
+
+	WARN_ON_ONCE(!arch_irqs_disabled());
+	WARN_ON_ONCE(mfmsr() & MSR_EE);
+
+	if (now >= *next_tb) {
+		local_paca->irq_happened |= PACA_IRQ_DEC;
+	} else {
+		now = *next_tb - now;
+		if (now <= decrementer_max)
+			set_dec_or_work(now);
+	}
+}
+EXPORT_SYMBOL_GPL(timer_rearm_host_dec);
+#endif
+
 /*
  * timer_interrupt - gets called when the decrementer overflows,
  * with interrupts disabled.
@@ -609,10 +650,7 @@  DEFINE_INTERRUPT_HANDLER_ASYNC(timer_interrupt)
 	} else {
 		now = *next_tb - now;
 		if (now <= decrementer_max)
-			set_dec(now);
-		/* We may have raced with new irq work */
-		if (test_irq_work_pending())
-			set_dec(1);
+			set_dec_or_work(now);
 		__this_cpu_inc(irq_stat.timer_irqs_others);
 	}
 
@@ -854,11 +892,7 @@  static int decrementer_set_next_event(unsigned long evt,
 				      struct clock_event_device *dev)
 {
 	__this_cpu_write(decrementers_next_tb, get_tb() + evt);
-	set_dec(evt);
-
-	/* We may have raced with new irq work */
-	if (test_irq_work_pending())
-		set_dec(1);
+	set_dec_or_work(evt);
 
 	return 0;
 }
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 6e6cfb10e9bb..0cef578930f9 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -4018,11 +4018,8 @@  static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit,
 	vc->entry_exit_map = 0x101;
 	vc->in_guest = 0;
 
-	next_timer = timer_get_next_tb();
-	set_dec(next_timer - tb);
-	/* We may have raced with new irq work */
-	if (test_irq_work_pending())
-		set_dec(1);
+	timer_rearm_host_dec(tb);
+
 	mtspr(SPRN_SPRG_VDSO_WRITE, local_paca->sprg_vdso);
 
 	kvmhv_load_host_pmu();