diff mbox series

[v2,1/1] powerpc/kvm: Save Timebase Offset to fix sched_clock() while running guest code.

Message ID 20210205060643.233481-1-leobras.c@gmail.com
State New
Headers show
Series [v2,1/1] powerpc/kvm: Save Timebase Offset to fix sched_clock() while running guest code. | expand

Commit Message

Leonardo Brás Feb. 5, 2021, 6:06 a.m. UTC
Before guest entry, TBU40 register is changed to reflect guest timebase.
After exitting guest, the register is reverted to it's original value.

If one tries to get the timestamp from host between those changes, it
will present an incorrect value.

An example would be trying to add a tracepoint in
kvmppc_guest_entry_inject_int(), which depending on last tracepoint
acquired could actually cause the host to crash.

Save the Timebase Offset to PACA and use it on sched_clock() to always
get the correct timestamp.

Signed-off-by: Leonardo Bras <leobras.c@gmail.com>
Suggested-by: Paul Mackerras <paulus@ozlabs.org>
---
Changes since v1:
- Subtracts offset only when CONFIG_KVM_BOOK3S_HANDLER and
  CONFIG_PPC_BOOK3S_64 are defined.
---
 arch/powerpc/include/asm/kvm_book3s_asm.h | 1 +
 arch/powerpc/kernel/asm-offsets.c         | 1 +
 arch/powerpc/kernel/time.c                | 8 +++++++-
 arch/powerpc/kvm/book3s_hv.c              | 2 ++
 arch/powerpc/kvm/book3s_hv_rmhandlers.S   | 2 ++
 5 files changed, 13 insertions(+), 1 deletion(-)

Comments

Nicholas Piggin Feb. 5, 2021, 6:28 a.m. UTC | #1
Excerpts from Leonardo Bras's message of February 5, 2021 4:06 pm:
> Before guest entry, TBU40 register is changed to reflect guest timebase.
> After exitting guest, the register is reverted to it's original value.
> 
> If one tries to get the timestamp from host between those changes, it
> will present an incorrect value.
> 
> An example would be trying to add a tracepoint in
> kvmppc_guest_entry_inject_int(), which depending on last tracepoint
> acquired could actually cause the host to crash.
> 
> Save the Timebase Offset to PACA and use it on sched_clock() to always
> get the correct timestamp.

Ouch. Not sure how reasonable it is to half switch into guest registers 
and expect to call into the wider kernel, fixing things up as we go. 
What if mftb is used in other places?

Especially as it doesn't seem like there is a reason that function _has_
to be called after the timebase is switched to guest, that's just how 
the code is structured.

As a local hack to work out a bug okay. If you really need it upstream 
could you put it under a debug config option?

Thanks,
Nick

> Signed-off-by: Leonardo Bras <leobras.c@gmail.com>
> Suggested-by: Paul Mackerras <paulus@ozlabs.org>
> ---
> Changes since v1:
> - Subtracts offset only when CONFIG_KVM_BOOK3S_HANDLER and
>   CONFIG_PPC_BOOK3S_64 are defined.
> ---
>  arch/powerpc/include/asm/kvm_book3s_asm.h | 1 +
>  arch/powerpc/kernel/asm-offsets.c         | 1 +
>  arch/powerpc/kernel/time.c                | 8 +++++++-
>  arch/powerpc/kvm/book3s_hv.c              | 2 ++
>  arch/powerpc/kvm/book3s_hv_rmhandlers.S   | 2 ++
>  5 files changed, 13 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/powerpc/include/asm/kvm_book3s_asm.h b/arch/powerpc/include/asm/kvm_book3s_asm.h
> index 078f4648ea27..e2c12a10eed2 100644
> --- a/arch/powerpc/include/asm/kvm_book3s_asm.h
> +++ b/arch/powerpc/include/asm/kvm_book3s_asm.h
> @@ -131,6 +131,7 @@ struct kvmppc_host_state {
>  	u64 cfar;
>  	u64 ppr;
>  	u64 host_fscr;
> +	u64 tb_offset;		/* Timebase offset: keeps correct timebase while on guest */
>  #endif
>  };
>  
> diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
> index b12d7c049bfe..0beb8fdc6352 100644
> --- a/arch/powerpc/kernel/asm-offsets.c
> +++ b/arch/powerpc/kernel/asm-offsets.c
> @@ -706,6 +706,7 @@ int main(void)
>  	HSTATE_FIELD(HSTATE_CFAR, cfar);
>  	HSTATE_FIELD(HSTATE_PPR, ppr);
>  	HSTATE_FIELD(HSTATE_HOST_FSCR, host_fscr);
> +	HSTATE_FIELD(HSTATE_TB_OFFSET, tb_offset);
>  #endif /* CONFIG_PPC_BOOK3S_64 */
>  
>  #else /* CONFIG_PPC_BOOK3S */
> diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
> index 67feb3524460..f27f0163792b 100644
> --- a/arch/powerpc/kernel/time.c
> +++ b/arch/powerpc/kernel/time.c
> @@ -699,7 +699,13 @@ EXPORT_SYMBOL_GPL(tb_to_ns);
>   */
>  notrace unsigned long long sched_clock(void)
>  {
> -	return mulhdu(get_tb() - boot_tb, tb_to_ns_scale) << tb_to_ns_shift;
> +	u64 tb = get_tb() - boot_tb;
> +
> +#if defined(CONFIG_PPC_BOOK3S_64) && defined(CONFIG_KVM_BOOK3S_HANDLER)
> +	tb -= local_paca->kvm_hstate.tb_offset;
> +#endif
> +
> +	return mulhdu(tb, tb_to_ns_scale) << tb_to_ns_shift;
>  }
>  
>  
> diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
> index b3731572295e..c08593c63353 100644
> --- a/arch/powerpc/kvm/book3s_hv.c
> +++ b/arch/powerpc/kvm/book3s_hv.c
> @@ -3491,6 +3491,7 @@ static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu *vcpu, u64 time_limit,
>  		if ((tb & 0xffffff) < (new_tb & 0xffffff))
>  			mtspr(SPRN_TBU40, new_tb + 0x1000000);
>  		vc->tb_offset_applied = vc->tb_offset;
> +		local_paca->kvm_hstate.tb_offset = vc->tb_offset;
>  	}
>  
>  	if (vc->pcr)
> @@ -3594,6 +3595,7 @@ static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu *vcpu, u64 time_limit,
>  		if ((tb & 0xffffff) < (new_tb & 0xffffff))
>  			mtspr(SPRN_TBU40, new_tb + 0x1000000);
>  		vc->tb_offset_applied = 0;
> +		local_paca->kvm_hstate.tb_offset = 0;
>  	}
>  
>  	mtspr(SPRN_HDEC, 0x7fffffff);
> diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> index b73140607875..8f7a9f7f4ee6 100644
> --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> @@ -632,6 +632,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
>  	cmpdi	r8,0
>  	beq	37f
>  	std	r8, VCORE_TB_OFFSET_APPL(r5)
> +	std	r8, HSTATE_TB_OFFSET(r13)
>  	mftb	r6		/* current host timebase */
>  	add	r8,r8,r6
>  	mtspr	SPRN_TBU40,r8	/* update upper 40 bits */
> @@ -1907,6 +1908,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
>  	beq	17f
>  	li	r0, 0
>  	std	r0, VCORE_TB_OFFSET_APPL(r5)
> +	std	r0, HSTATE_TB_OFFSET(r13)
>  	mftb	r6			/* current guest timebase */
>  	subf	r8,r8,r6
>  	mtspr	SPRN_TBU40,r8		/* update upper 40 bits */
> -- 
> 2.29.2
> 
>
Leonardo Brás Feb. 5, 2021, 7:01 a.m. UTC | #2
Hey Nick, thanks for reviewing :)

On Fri, 2021-02-05 at 16:28 +1000, Nicholas Piggin wrote:
> Excerpts from Leonardo Bras's message of February 5, 2021 4:06 pm:
> > Before guest entry, TBU40 register is changed to reflect guest timebase.
> > After exitting guest, the register is reverted to it's original value.
> > 
> > If one tries to get the timestamp from host between those changes, it
> > will present an incorrect value.
> > 
> > An example would be trying to add a tracepoint in
> > kvmppc_guest_entry_inject_int(), which depending on last tracepoint
> > acquired could actually cause the host to crash.
> > 
> > Save the Timebase Offset to PACA and use it on sched_clock() to always
> > get the correct timestamp.
> 
> Ouch. Not sure how reasonable it is to half switch into guest registers 
> and expect to call into the wider kernel, fixing things up as we go. 
> What if mftb is used in other places?

IIUC, the CPU is not supposed to call anything as host between guest
entry and guest exit, except guest-related cases, like
kvmppc_guest_entry_inject_int(), but anyway, if something calls mftb it
will still get the same value as before.

This is only supposed to change stuff that depends on sched_clock, like
Tracepoints, that can happen in those exceptions.


> Especially as it doesn't seem like there is a reason that function _has_
> to be called after the timebase is switched to guest, that's just how 
> the code is structured.

Correct, but if called, like in rb routines, used by tracepoints, the
difference between last tb and current (lower) tb may cause the CPU to
trap PROGRAM exception, crashing host. 

> As a local hack to work out a bug okay. If you really need it upstream 
> could you put it under a debug config option?

You mean something that is automatically selected whenever those
configs are enabled? 

CONFIG_TRACEPOINT && CONFIG_KVM_BOOK3S_HANDLER && CONFIG_PPC_BOOK3S_64

Or something the user need to select himself in menuconfig?

> 
> Thanks,
> Nick
> 

Thank you!
Leonardo Bras

> > Signed-off-by: Leonardo Bras <leobras.c@gmail.com>
> > Suggested-by: Paul Mackerras <paulus@ozlabs.org>
> > ---
> > Changes since v1:
> > - Subtracts offset only when CONFIG_KVM_BOOK3S_HANDLER and
> >   CONFIG_PPC_BOOK3S_64 are defined.
> > ---
> >  arch/powerpc/include/asm/kvm_book3s_asm.h | 1 +
> >  arch/powerpc/kernel/asm-offsets.c         | 1 +
> >  arch/powerpc/kernel/time.c                | 8 +++++++-
> >  arch/powerpc/kvm/book3s_hv.c              | 2 ++
> >  arch/powerpc/kvm/book3s_hv_rmhandlers.S   | 2 ++
> >  5 files changed, 13 insertions(+), 1 deletion(-)
> > 
> > diff --git a/arch/powerpc/include/asm/kvm_book3s_asm.h b/arch/powerpc/include/asm/kvm_book3s_asm.h
> > index 078f4648ea27..e2c12a10eed2 100644
> > --- a/arch/powerpc/include/asm/kvm_book3s_asm.h
> > +++ b/arch/powerpc/include/asm/kvm_book3s_asm.h
> > @@ -131,6 +131,7 @@ struct kvmppc_host_state {
> >  	u64 cfar;
> >  	u64 ppr;
> >  	u64 host_fscr;
> > +	u64 tb_offset;		/* Timebase offset: keeps correct timebase while on guest */
> >  #endif
> >  };
> >  
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
> > index b12d7c049bfe..0beb8fdc6352 100644
> > --- a/arch/powerpc/kernel/asm-offsets.c
> > +++ b/arch/powerpc/kernel/asm-offsets.c
> > @@ -706,6 +706,7 @@ int main(void)
> >  	HSTATE_FIELD(HSTATE_CFAR, cfar);
> >  	HSTATE_FIELD(HSTATE_PPR, ppr);
> >  	HSTATE_FIELD(HSTATE_HOST_FSCR, host_fscr);
> > +	HSTATE_FIELD(HSTATE_TB_OFFSET, tb_offset);
> >  #endif /* CONFIG_PPC_BOOK3S_64 */
> >  
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> >  #else /* CONFIG_PPC_BOOK3S */
> > diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
> > index 67feb3524460..f27f0163792b 100644
> > --- a/arch/powerpc/kernel/time.c
> > +++ b/arch/powerpc/kernel/time.c
> > @@ -699,7 +699,13 @@ EXPORT_SYMBOL_GPL(tb_to_ns);
> >   */
> >  notrace unsigned long long sched_clock(void)
> >  {
> > -	return mulhdu(get_tb() - boot_tb, tb_to_ns_scale) << tb_to_ns_shift;
> > +	u64 tb = get_tb() - boot_tb;
> > +
> > +#if defined(CONFIG_PPC_BOOK3S_64) && defined(CONFIG_KVM_BOOK3S_HANDLER)
> > +	tb -= local_paca->kvm_hstate.tb_offset;
> > +#endif
> > +
> > +	return mulhdu(tb, tb_to_ns_scale) << tb_to_ns_shift;
> >  }
> >  
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> >  
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
> > index b3731572295e..c08593c63353 100644
> > --- a/arch/powerpc/kvm/book3s_hv.c
> > +++ b/arch/powerpc/kvm/book3s_hv.c
> > @@ -3491,6 +3491,7 @@ static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu *vcpu, u64 time_limit,
> >  		if ((tb & 0xffffff) < (new_tb & 0xffffff))
> >  			mtspr(SPRN_TBU40, new_tb + 0x1000000);
> >  		vc->tb_offset_applied = vc->tb_offset;
> > +		local_paca->kvm_hstate.tb_offset = vc->tb_offset;
> >  	}
> >  
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> >  	if (vc->pcr)
> > @@ -3594,6 +3595,7 @@ static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu *vcpu, u64 time_limit,
> >  		if ((tb & 0xffffff) < (new_tb & 0xffffff))
> >  			mtspr(SPRN_TBU40, new_tb + 0x1000000);
> >  		vc->tb_offset_applied = 0;
> > +		local_paca->kvm_hstate.tb_offset = 0;
> >  	}
> >  
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> >  	mtspr(SPRN_HDEC, 0x7fffffff);
> > diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> > index b73140607875..8f7a9f7f4ee6 100644
> > --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> > +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> > @@ -632,6 +632,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
> >  	cmpdi	r8,0
> >  	beq	37f
> >  	std	r8, VCORE_TB_OFFSET_APPL(r5)
> > +	std	r8, HSTATE_TB_OFFSET(r13)
> >  	mftb	r6		/* current host timebase */
> >  	add	r8,r8,r6
> >  	mtspr	SPRN_TBU40,r8	/* update upper 40 bits */
> > @@ -1907,6 +1908,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
> >  	beq	17f
> >  	li	r0, 0
> >  	std	r0, VCORE_TB_OFFSET_APPL(r5)
> > +	std	r0, HSTATE_TB_OFFSET(r13)
> >  	mftb	r6			/* current guest timebase */
> >  	subf	r8,r8,r6
> >  	mtspr	SPRN_TBU40,r8		/* update upper 40 bits */
> > -- 
> > 2.29.2
> > 
> >
Fabiano Rosas Feb. 5, 2021, 1:09 p.m. UTC | #3
Leonardo Bras <leobras.c@gmail.com> writes:

> Before guest entry, TBU40 register is changed to reflect guest timebase.
> After exitting guest, the register is reverted to it's original value.
>
> If one tries to get the timestamp from host between those changes, it
> will present an incorrect value.
>
> An example would be trying to add a tracepoint in
> kvmppc_guest_entry_inject_int(), which depending on last tracepoint
> acquired could actually cause the host to crash.
>
> Save the Timebase Offset to PACA and use it on sched_clock() to always
> get the correct timestamp.
>
> Signed-off-by: Leonardo Bras <leobras.c@gmail.com>
> Suggested-by: Paul Mackerras <paulus@ozlabs.org>
> ---
> Changes since v1:
> - Subtracts offset only when CONFIG_KVM_BOOK3S_HANDLER and
>   CONFIG_PPC_BOOK3S_64 are defined.
> ---
>  arch/powerpc/include/asm/kvm_book3s_asm.h | 1 +
>  arch/powerpc/kernel/asm-offsets.c         | 1 +
>  arch/powerpc/kernel/time.c                | 8 +++++++-
>  arch/powerpc/kvm/book3s_hv.c              | 2 ++
>  arch/powerpc/kvm/book3s_hv_rmhandlers.S   | 2 ++
>  5 files changed, 13 insertions(+), 1 deletion(-)
>
> diff --git a/arch/powerpc/include/asm/kvm_book3s_asm.h b/arch/powerpc/include/asm/kvm_book3s_asm.h
> index 078f4648ea27..e2c12a10eed2 100644
> --- a/arch/powerpc/include/asm/kvm_book3s_asm.h
> +++ b/arch/powerpc/include/asm/kvm_book3s_asm.h
> @@ -131,6 +131,7 @@ struct kvmppc_host_state {
>  	u64 cfar;
>  	u64 ppr;
>  	u64 host_fscr;
> +	u64 tb_offset;		/* Timebase offset: keeps correct
> timebase while on guest */

Couldn't you use the vc->tb_offset_applied for this? We have a reference
for the vcore in the hstate already.

>  #endif
>  };
>
> diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
> index b12d7c049bfe..0beb8fdc6352 100644
> --- a/arch/powerpc/kernel/asm-offsets.c
> +++ b/arch/powerpc/kernel/asm-offsets.c
> @@ -706,6 +706,7 @@ int main(void)
>  	HSTATE_FIELD(HSTATE_CFAR, cfar);
>  	HSTATE_FIELD(HSTATE_PPR, ppr);
>  	HSTATE_FIELD(HSTATE_HOST_FSCR, host_fscr);
> +	HSTATE_FIELD(HSTATE_TB_OFFSET, tb_offset);
>  #endif /* CONFIG_PPC_BOOK3S_64 */
>
>  #else /* CONFIG_PPC_BOOK3S */
> diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
> index 67feb3524460..f27f0163792b 100644
> --- a/arch/powerpc/kernel/time.c
> +++ b/arch/powerpc/kernel/time.c
> @@ -699,7 +699,13 @@ EXPORT_SYMBOL_GPL(tb_to_ns);
>   */
>  notrace unsigned long long sched_clock(void)
>  {
> -	return mulhdu(get_tb() - boot_tb, tb_to_ns_scale) << tb_to_ns_shift;
> +	u64 tb = get_tb() - boot_tb;
> +
> +#if defined(CONFIG_PPC_BOOK3S_64) && defined(CONFIG_KVM_BOOK3S_HANDLER)
> +	tb -= local_paca->kvm_hstate.tb_offset;
> +#endif
> +
> +	return mulhdu(tb, tb_to_ns_scale) << tb_to_ns_shift;
>  }
>
>
> diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
> index b3731572295e..c08593c63353 100644
> --- a/arch/powerpc/kvm/book3s_hv.c
> +++ b/arch/powerpc/kvm/book3s_hv.c
> @@ -3491,6 +3491,7 @@ static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu *vcpu, u64 time_limit,
>  		if ((tb & 0xffffff) < (new_tb & 0xffffff))
>  			mtspr(SPRN_TBU40, new_tb + 0x1000000);
>  		vc->tb_offset_applied = vc->tb_offset;
> +		local_paca->kvm_hstate.tb_offset = vc->tb_offset;
>  	}
>
>  	if (vc->pcr)
> @@ -3594,6 +3595,7 @@ static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu *vcpu, u64 time_limit,
>  		if ((tb & 0xffffff) < (new_tb & 0xffffff))
>  			mtspr(SPRN_TBU40, new_tb + 0x1000000);
>  		vc->tb_offset_applied = 0;
> +		local_paca->kvm_hstate.tb_offset = 0;
>  	}
>
>  	mtspr(SPRN_HDEC, 0x7fffffff);
> diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> index b73140607875..8f7a9f7f4ee6 100644
> --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> @@ -632,6 +632,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
>  	cmpdi	r8,0
>  	beq	37f
>  	std	r8, VCORE_TB_OFFSET_APPL(r5)
> +	std	r8, HSTATE_TB_OFFSET(r13)
>  	mftb	r6		/* current host timebase */
>  	add	r8,r8,r6
>  	mtspr	SPRN_TBU40,r8	/* update upper 40 bits */
> @@ -1907,6 +1908,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
>  	beq	17f
>  	li	r0, 0
>  	std	r0, VCORE_TB_OFFSET_APPL(r5)
> +	std	r0, HSTATE_TB_OFFSET(r13)
>  	mftb	r6			/* current guest timebase */
>  	subf	r8,r8,r6
>  	mtspr	SPRN_TBU40,r8		/* update upper 40 bits */
Leonardo Brás Feb. 5, 2021, 8:35 p.m. UTC | #4
Hello Fabiano, 
Thanks for reviewing! 
(answers inline)

On Fri, 2021-02-05 at 10:09 -0300, Fabiano Rosas wrote:
> Leonardo Bras <leobras.c@gmail.com> writes:
> 
> > Before guest entry, TBU40 register is changed to reflect guest timebase.
> > After exitting guest, the register is reverted to it's original value.
> > 
> > If one tries to get the timestamp from host between those changes, it
> > will present an incorrect value.
> > 
> > An example would be trying to add a tracepoint in
> > kvmppc_guest_entry_inject_int(), which depending on last tracepoint
> > acquired could actually cause the host to crash.
> > 
> > Save the Timebase Offset to PACA and use it on sched_clock() to always
> > get the correct timestamp.
> > 
> > Signed-off-by: Leonardo Bras <leobras.c@gmail.com>
> > Suggested-by: Paul Mackerras <paulus@ozlabs.org>
> > ---
> > Changes since v1:
> > - Subtracts offset only when CONFIG_KVM_BOOK3S_HANDLER and
> >   CONFIG_PPC_BOOK3S_64 are defined.
> > ---
> >  arch/powerpc/include/asm/kvm_book3s_asm.h | 1 +
> >  arch/powerpc/kernel/asm-offsets.c         | 1 +
> >  arch/powerpc/kernel/time.c                | 8 +++++++-
> >  arch/powerpc/kvm/book3s_hv.c              | 2 ++
> >  arch/powerpc/kvm/book3s_hv_rmhandlers.S   | 2 ++
> >  5 files changed, 13 insertions(+), 1 deletion(-)
> > 
> > diff --git a/arch/powerpc/include/asm/kvm_book3s_asm.h b/arch/powerpc/include/asm/kvm_book3s_asm.h
> > index 078f4648ea27..e2c12a10eed2 100644
> > --- a/arch/powerpc/include/asm/kvm_book3s_asm.h
> > +++ b/arch/powerpc/include/asm/kvm_book3s_asm.h
> > @@ -131,6 +131,7 @@ struct kvmppc_host_state {
> >  	u64 cfar;
> >  	u64 ppr;
> >  	u64 host_fscr;
> > +	u64 tb_offset;		/* Timebase offset: keeps correct
> > timebase while on guest */
> 
> Couldn't you use the vc->tb_offset_applied for this? We have a reference
> for the vcore in the hstate already.

But it's a pointer, which means we would have to keep checking for NULL
every time we need sched_clock(). 
Potentially it would cost a cache miss for PACA memory region that
contain vc, another for getting the part of *vc that contains the
tb_offset_applied, instead of only one for PACA struct region that
contains tb_offset.

On the other hand, it got me thinking: If the offset is applied per
cpu, why don't we get this info only in PACA, instead of in vc?
It could be a general way to get an offset applied for any purpose and
still get the sched_clock() right. 
(Not that I have any idea of any other purpose we could use it) 

Best regards!
Leonardo Bras

> 
> >  #endif
> >  };
> > 
> > diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
> > index b12d7c049bfe..0beb8fdc6352 100644
> > --- a/arch/powerpc/kernel/asm-offsets.c
> > +++ b/arch/powerpc/kernel/asm-offsets.c
> > @@ -706,6 +706,7 @@ int main(void)
> >  	HSTATE_FIELD(HSTATE_CFAR, cfar);
> >  	HSTATE_FIELD(HSTATE_PPR, ppr);
> >  	HSTATE_FIELD(HSTATE_HOST_FSCR, host_fscr);
> > +	HSTATE_FIELD(HSTATE_TB_OFFSET, tb_offset);
> >  #endif /* CONFIG_PPC_BOOK3S_64 */
> > 
> >  #else /* CONFIG_PPC_BOOK3S */
> > diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
> > index 67feb3524460..f27f0163792b 100644
> > --- a/arch/powerpc/kernel/time.c
> > +++ b/arch/powerpc/kernel/time.c
> > @@ -699,7 +699,13 @@ EXPORT_SYMBOL_GPL(tb_to_ns);
> >   */
> >  notrace unsigned long long sched_clock(void)
> >  {
> > -	return mulhdu(get_tb() - boot_tb, tb_to_ns_scale) << tb_to_ns_shift;
> > +	u64 tb = get_tb() - boot_tb;
> > +
> > +#if defined(CONFIG_PPC_BOOK3S_64) && defined(CONFIG_KVM_BOOK3S_HANDLER)
> > +	tb -= local_paca->kvm_hstate.tb_offset;
> > +#endif
> > +
> > +	return mulhdu(tb, tb_to_ns_scale) << tb_to_ns_shift;
> >  }
> > 
> > 
> > diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
> > index b3731572295e..c08593c63353 100644
> > --- a/arch/powerpc/kvm/book3s_hv.c
> > +++ b/arch/powerpc/kvm/book3s_hv.c
> > @@ -3491,6 +3491,7 @@ static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu *vcpu, u64 time_limit,
> >  		if ((tb & 0xffffff) < (new_tb & 0xffffff))
> >  			mtspr(SPRN_TBU40, new_tb + 0x1000000);
> >  		vc->tb_offset_applied = vc->tb_offset;
> > +		local_paca->kvm_hstate.tb_offset = vc->tb_offset;
> >  	}
> > 
> >  	if (vc->pcr)
> > @@ -3594,6 +3595,7 @@ static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu *vcpu, u64 time_limit,
> >  		if ((tb & 0xffffff) < (new_tb & 0xffffff))
> >  			mtspr(SPRN_TBU40, new_tb + 0x1000000);
> >  		vc->tb_offset_applied = 0;
> > +		local_paca->kvm_hstate.tb_offset = 0;
> >  	}
> > 
> >  	mtspr(SPRN_HDEC, 0x7fffffff);
> > diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> > index b73140607875..8f7a9f7f4ee6 100644
> > --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> > +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> > @@ -632,6 +632,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
> >  	cmpdi	r8,0
> >  	beq	37f
> >  	std	r8, VCORE_TB_OFFSET_APPL(r5)
> > +	std	r8, HSTATE_TB_OFFSET(r13)
> >  	mftb	r6		/* current host timebase */
> >  	add	r8,r8,r6
> >  	mtspr	SPRN_TBU40,r8	/* update upper 40 bits */
> > @@ -1907,6 +1908,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
> >  	beq	17f
> >  	li	r0, 0
> >  	std	r0, VCORE_TB_OFFSET_APPL(r5)
> > +	std	r0, HSTATE_TB_OFFSET(r13)
> >  	mftb	r6			/* current guest timebase */
> >  	subf	r8,r8,r6
> >  	mtspr	SPRN_TBU40,r8		/* update upper 40 bits */
Michael Ellerman Feb. 5, 2021, 11:32 p.m. UTC | #5
Nicholas Piggin <npiggin@gmail.com> writes:
> Excerpts from Leonardo Bras's message of February 5, 2021 4:06 pm:
>> Before guest entry, TBU40 register is changed to reflect guest timebase.
>> After exitting guest, the register is reverted to it's original value.
>> 
>> If one tries to get the timestamp from host between those changes, it
>> will present an incorrect value.
>> 
>> An example would be trying to add a tracepoint in
>> kvmppc_guest_entry_inject_int(), which depending on last tracepoint
>> acquired could actually cause the host to crash.
>> 
>> Save the Timebase Offset to PACA and use it on sched_clock() to always
>> get the correct timestamp.
>
> Ouch. Not sure how reasonable it is to half switch into guest registers 
> and expect to call into the wider kernel, fixing things up as we go. 

Yeah it's not.

We need to disable tracing on those routines that are called in that
half-exited state.

cheers
Nicholas Piggin Feb. 6, 2021, 3:03 a.m. UTC | #6
Excerpts from Leonardo Bras's message of February 5, 2021 5:01 pm:
> Hey Nick, thanks for reviewing :)
> 
> On Fri, 2021-02-05 at 16:28 +1000, Nicholas Piggin wrote:
>> Excerpts from Leonardo Bras's message of February 5, 2021 4:06 pm:
>> > Before guest entry, TBU40 register is changed to reflect guest timebase.
>> > After exitting guest, the register is reverted to it's original value.
>> > 
>> > If one tries to get the timestamp from host between those changes, it
>> > will present an incorrect value.
>> > 
>> > An example would be trying to add a tracepoint in
>> > kvmppc_guest_entry_inject_int(), which depending on last tracepoint
>> > acquired could actually cause the host to crash.
>> > 
>> > Save the Timebase Offset to PACA and use it on sched_clock() to always
>> > get the correct timestamp.
>> 
>> Ouch. Not sure how reasonable it is to half switch into guest registers 
>> and expect to call into the wider kernel, fixing things up as we go. 
>> What if mftb is used in other places?
> 
> IIUC, the CPU is not supposed to call anything as host between guest
> entry and guest exit, except guest-related cases, like

When I say "call", I'm including tracing in that. If a function is not 
marked as no trace, then it will call into the tracing subsystem.

> kvmppc_guest_entry_inject_int(), but anyway, if something calls mftb it
> will still get the same value as before.

Right, so it'll be out of whack again.

> This is only supposed to change stuff that depends on sched_clock, like
> Tracepoints, that can happen in those exceptions.

If they depend on sched_clock that's one thing. Do they definitely have 
no dependencies on mftb from other calls?

>> Especially as it doesn't seem like there is a reason that function _has_
>> to be called after the timebase is switched to guest, that's just how 
>> the code is structured.
> 
> Correct, but if called, like in rb routines, used by tracepoints, the
> difference between last tb and current (lower) tb may cause the CPU to
> trap PROGRAM exception, crashing host. 

Yes, so I agree with Michael any function that is involved when we begin 
to switch into guest context (or have not completed switching back to 
host going the other way) should be marked as no trace (noinstr even, 
perhaps).

>> As a local hack to work out a bug okay. If you really need it upstream 
>> could you put it under a debug config option?
> 
> You mean something that is automatically selected whenever those
> configs are enabled? 
> 
> CONFIG_TRACEPOINT && CONFIG_KVM_BOOK3S_HANDLER && CONFIG_PPC_BOOK3S_64
> 
> Or something the user need to select himself in menuconfig?

Yeah I meant a default n thing under powerpc kernel debugging somewhere.

Thanks,
Nick
Leonardo Brás Feb. 8, 2021, 4:37 p.m. UTC | #7
Hello Nick,

On Sat, 2021-02-06 at 13:03 +1000, Nicholas Piggin wrote:
> Excerpts from Leonardo Bras's message of February 5, 2021 5:01 pm:
> > Hey Nick, thanks for reviewing :)
> > 
> > On Fri, 2021-02-05 at 16:28 +1000, Nicholas Piggin wrote:
> > > Excerpts from Leonardo Bras's message of February 5, 2021 4:06 pm:
> > > > Before guest entry, TBU40 register is changed to reflect guest timebase.
> > > > After exitting guest, the register is reverted to it's original value.
> > > > 
> > > > If one tries to get the timestamp from host between those changes, it
> > > > will present an incorrect value.
> > > > 
> > > > An example would be trying to add a tracepoint in
> > > > kvmppc_guest_entry_inject_int(), which depending on last tracepoint
> > > > acquired could actually cause the host to crash.
> > > > 
> > > > Save the Timebase Offset to PACA and use it on sched_clock() to always
> > > > get the correct timestamp.
> > > 
> > > Ouch. Not sure how reasonable it is to half switch into guest registers 
> > > and expect to call into the wider kernel, fixing things up as we go. 
> > > What if mftb is used in other places?
> > 
> > IIUC, the CPU is not supposed to call anything as host between guest
> > entry and guest exit, except guest-related cases, like
> 
> When I say "call", I'm including tracing in that. If a function is not 
> marked as no trace, then it will call into the tracing subsystem.
> 
> > kvmppc_guest_entry_inject_int(), but anyway, if something calls mftb it
> > will still get the same value as before.
> 
> Right, so it'll be out of whack again.
> 
> > This is only supposed to change stuff that depends on sched_clock, like
> > Tracepoints, that can happen in those exceptions.
> 
> If they depend on sched_clock that's one thing. Do they definitely have 
> no dependencies on mftb from other calls?

We could change that on get_tb() or mftb() @ timebase.h, which would
have a broader reach, but would not reach any mftb from asm code.

> > > Especially as it doesn't seem like there is a reason that function _has_
> > > to be called after the timebase is switched to guest, that's just how 
> > > the code is structured.
> > 
> > Correct, but if called, like in rb routines, used by tracepoints, the
> > difference between last tb and current (lower) tb may cause the CPU to
> > trap PROGRAM exception, crashing host. 
> 
> Yes, so I agree with Michael any function that is involved when we begin 
> to switch into guest context (or have not completed switching back to 
> host going the other way) should be marked as no trace (noinstr even, 
> perhaps).

Sure, that would avoid having to get paca->tb_offset for every mftb()
called, and avoid inconsistencies when different ways to get time are
used in code.

On the other hand, it would make it very hard to debug functions like
kvmppc_guest_entry_inject_int() as I am doing right now.

> 
> > > As a local hack to work out a bug okay. If you really need it upstream 
> > > could you put it under a debug config option?
> > 
> > You mean something that is automatically selected whenever those
> > configs are enabled? 
> > 
> > CONFIG_TRACEPOINT && CONFIG_KVM_BOOK3S_HANDLER && CONFIG_PPC_BOOK3S_64
> > 
> > Or something the user need to select himself in menuconfig?
> 
> Yeah I meant a default n thing under powerpc kernel debugging somewhere.

So, IIUC all we can do is split this in 2 changes:
1 - Adding notrace to those functions
2 - Introducing a kernel debug config that reverts (1) and 'fixes' mftb

If that's correct, I have some ideas we can use. 

For debug option, should we add the offset on get_tb() or mftb()?

Another option would be to adding this tb_offset only in the routines
used by tracing. But this could probably mean having to add a function
in arch-generic code, but still an option.

What do you think?

> 
> Thanks,
> Nick

Thank you!
Leonardo Bras
diff mbox series

Patch

diff --git a/arch/powerpc/include/asm/kvm_book3s_asm.h b/arch/powerpc/include/asm/kvm_book3s_asm.h
index 078f4648ea27..e2c12a10eed2 100644
--- a/arch/powerpc/include/asm/kvm_book3s_asm.h
+++ b/arch/powerpc/include/asm/kvm_book3s_asm.h
@@ -131,6 +131,7 @@  struct kvmppc_host_state {
 	u64 cfar;
 	u64 ppr;
 	u64 host_fscr;
+	u64 tb_offset;		/* Timebase offset: keeps correct timebase while on guest */
 #endif
 };
 
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index b12d7c049bfe..0beb8fdc6352 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -706,6 +706,7 @@  int main(void)
 	HSTATE_FIELD(HSTATE_CFAR, cfar);
 	HSTATE_FIELD(HSTATE_PPR, ppr);
 	HSTATE_FIELD(HSTATE_HOST_FSCR, host_fscr);
+	HSTATE_FIELD(HSTATE_TB_OFFSET, tb_offset);
 #endif /* CONFIG_PPC_BOOK3S_64 */
 
 #else /* CONFIG_PPC_BOOK3S */
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index 67feb3524460..f27f0163792b 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -699,7 +699,13 @@  EXPORT_SYMBOL_GPL(tb_to_ns);
  */
 notrace unsigned long long sched_clock(void)
 {
-	return mulhdu(get_tb() - boot_tb, tb_to_ns_scale) << tb_to_ns_shift;
+	u64 tb = get_tb() - boot_tb;
+
+#if defined(CONFIG_PPC_BOOK3S_64) && defined(CONFIG_KVM_BOOK3S_HANDLER)
+	tb -= local_paca->kvm_hstate.tb_offset;
+#endif
+
+	return mulhdu(tb, tb_to_ns_scale) << tb_to_ns_shift;
 }
 
 
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index b3731572295e..c08593c63353 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -3491,6 +3491,7 @@  static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu *vcpu, u64 time_limit,
 		if ((tb & 0xffffff) < (new_tb & 0xffffff))
 			mtspr(SPRN_TBU40, new_tb + 0x1000000);
 		vc->tb_offset_applied = vc->tb_offset;
+		local_paca->kvm_hstate.tb_offset = vc->tb_offset;
 	}
 
 	if (vc->pcr)
@@ -3594,6 +3595,7 @@  static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu *vcpu, u64 time_limit,
 		if ((tb & 0xffffff) < (new_tb & 0xffffff))
 			mtspr(SPRN_TBU40, new_tb + 0x1000000);
 		vc->tb_offset_applied = 0;
+		local_paca->kvm_hstate.tb_offset = 0;
 	}
 
 	mtspr(SPRN_HDEC, 0x7fffffff);
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index b73140607875..8f7a9f7f4ee6 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -632,6 +632,7 @@  END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
 	cmpdi	r8,0
 	beq	37f
 	std	r8, VCORE_TB_OFFSET_APPL(r5)
+	std	r8, HSTATE_TB_OFFSET(r13)
 	mftb	r6		/* current host timebase */
 	add	r8,r8,r6
 	mtspr	SPRN_TBU40,r8	/* update upper 40 bits */
@@ -1907,6 +1908,7 @@  END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 	beq	17f
 	li	r0, 0
 	std	r0, VCORE_TB_OFFSET_APPL(r5)
+	std	r0, HSTATE_TB_OFFSET(r13)
 	mftb	r6			/* current guest timebase */
 	subf	r8,r8,r6
 	mtspr	SPRN_TBU40,r8		/* update upper 40 bits */