diff mbox series

[v3,19/41] KVM: PPC: Book3S HV P9: Stop handling hcalls in real-mode in the P9 path

Message ID 20210305150638.2675513-20-npiggin@gmail.com
State Superseded
Headers show
Series KVM: PPC: Book3S: C-ify the P9 entry/exit code | expand

Commit Message

Nicholas Piggin March 5, 2021, 3:06 p.m. UTC
In the interest of minimising the amount of code that is run in
"real-mode", don't handle hcalls in real mode in the P9 path.

POWER8 and earlier are much more expensive to exit from HV real mode
and switch to host mode, because on those processors HV interrupts get
to the hypervisor with the MMU off, and the other threads in the core
need to be pulled out of the guest, and SLBs all need to be saved,
ERATs invalidated, and host SLB reloaded before the MMU is re-enabled
in host mode. Hash guests also require a lot of hcalls to run. The
XICS interrupt controller requires hcalls to run.

By contrast, POWER9 has independent thread switching, and in radix mode
the hypervisor is already in a host virtual memory mode when the HV
interrupt is taken. Radix + xive guests don't need hcalls to handle
interrupts or manage translations.

So it's much less important to handle hcalls in real mode in P9.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/powerpc/include/asm/kvm_ppc.h      |  5 +++
 arch/powerpc/kvm/book3s_hv.c            | 46 +++++++++++++++----
 arch/powerpc/kvm/book3s_hv_rmhandlers.S |  5 +++
 arch/powerpc/kvm/book3s_xive.c          | 60 +++++++++++++++++++++++++
 4 files changed, 108 insertions(+), 8 deletions(-)

Comments

Fabiano Rosas March 17, 2021, 4:22 p.m. UTC | #1
Nicholas Piggin <npiggin@gmail.com> writes:

> In the interest of minimising the amount of code that is run in
> "real-mode", don't handle hcalls in real mode in the P9 path.
>
> POWER8 and earlier are much more expensive to exit from HV real mode
> and switch to host mode, because on those processors HV interrupts get
> to the hypervisor with the MMU off, and the other threads in the core
> need to be pulled out of the guest, and SLBs all need to be saved,
> ERATs invalidated, and host SLB reloaded before the MMU is re-enabled
> in host mode. Hash guests also require a lot of hcalls to run. The
> XICS interrupt controller requires hcalls to run.
>
> By contrast, POWER9 has independent thread switching, and in radix mode
> the hypervisor is already in a host virtual memory mode when the HV
> interrupt is taken. Radix + xive guests don't need hcalls to handle
> interrupts or manage translations.
>
> So it's much less important to handle hcalls in real mode in P9.
>
> Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
> ---

<snip>

> diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
> index 497f216ad724..1f2ba8955c6a 100644
> --- a/arch/powerpc/kvm/book3s_hv.c
> +++ b/arch/powerpc/kvm/book3s_hv.c
> @@ -1147,7 +1147,7 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
>   * This has to be done early, not in kvmppc_pseries_do_hcall(), so
>   * that the cede logic in kvmppc_run_single_vcpu() works properly.
>   */
> -static void kvmppc_nested_cede(struct kvm_vcpu *vcpu)
> +static void kvmppc_cede(struct kvm_vcpu *vcpu)

The comment above needs to be updated I think.

>  {
>  	vcpu->arch.shregs.msr |= MSR_EE;
>  	vcpu->arch.ceded = 1;
> @@ -1403,9 +1403,15 @@ static int kvmppc_handle_exit_hv(struct kvm_vcpu *vcpu,
>  		/* hcall - punt to userspace */
>  		int i;
>
> -		/* hypercall with MSR_PR has already been handled in rmode,
> -		 * and never reaches here.
> -		 */
> +		if (unlikely(vcpu->arch.shregs.msr & MSR_PR)) {
> +			/*
> +			 * Guest userspace executed sc 1, reflect it back as a
> +			 * privileged program check interrupt.
> +			 */
> +			kvmppc_core_queue_program(vcpu, SRR1_PROGPRIV);
> +			r = RESUME_GUEST;
> +			break;
> +		}
>
>  		run->papr_hcall.nr = kvmppc_get_gpr(vcpu, 3);
>  		for (i = 0; i < 9; ++i)
> @@ -3740,15 +3746,36 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit,
>  		/* H_CEDE has to be handled now, not later */
>  		if (trap == BOOK3S_INTERRUPT_SYSCALL && !vcpu->arch.nested &&
>  		    kvmppc_get_gpr(vcpu, 3) == H_CEDE) {
> -			kvmppc_nested_cede(vcpu);
> +			kvmppc_cede(vcpu);
>  			kvmppc_set_gpr(vcpu, 3, 0);
>  			trap = 0;
>  		}
>  	} else {
>  		kvmppc_xive_push_vcpu(vcpu);
>  		trap = kvmhv_load_hv_regs_and_go(vcpu, time_limit, lpcr);
> -		kvmppc_xive_pull_vcpu(vcpu);
> +		/* H_CEDE has to be handled now, not later */
> +		/* XICS hcalls must be handled before xive is pulled */
> +		if (trap == BOOK3S_INTERRUPT_SYSCALL &&
> +		    !(vcpu->arch.shregs.msr & MSR_PR)) {
> +			unsigned long req = kvmppc_get_gpr(vcpu, 3);
>
> +			if (req == H_CEDE) {
> +				kvmppc_cede(vcpu);
> +				kvmppc_xive_cede_vcpu(vcpu); /* may un-cede */
> +				kvmppc_set_gpr(vcpu, 3, 0);
> +				trap = 0;
> +			}
> +			if (req == H_EOI || req == H_CPPR ||
> +			    req == H_IPI || req == H_IPOLL ||
> +			    req == H_XIRR || req == H_XIRR_X) {
> +				unsigned long ret;
> +
> +				ret = kvmppc_xive_xics_hcall(vcpu, req);
> +				kvmppc_set_gpr(vcpu, 3, ret);
> +				trap = 0;
> +			}
> +		}

I tried running L2 with xive=off and this code slows down the boot
considerably. I think we're missing a !vcpu->arch.nested in the
conditional.

This may also be missing these checks from kvmppc_pseries_do_hcall:

		if (kvmppc_xics_enabled(vcpu)) {
			if (xics_on_xive()) {
				ret = H_NOT_AVAILABLE;
				return RESUME_GUEST;
			}
			ret = kvmppc_xics_hcall(vcpu, req);
                        (...)

For H_CEDE there might be a similar situation since we're shadowing the
code above that runs after H_ENTER_NESTED by setting trap to 0 here.

> +		kvmppc_xive_pull_vcpu(vcpu);
>  	}
>
>  	vcpu->arch.slb_max = 0;
> @@ -4408,8 +4435,11 @@ static int kvmppc_vcpu_run_hv(struct kvm_vcpu *vcpu)
>  		else
>  			r = kvmppc_run_vcpu(vcpu);
>
> -		if (run->exit_reason == KVM_EXIT_PAPR_HCALL &&
> -		    !(vcpu->arch.shregs.msr & MSR_PR)) {
> +		if (run->exit_reason == KVM_EXIT_PAPR_HCALL) {
> +			if (WARN_ON_ONCE(vcpu->arch.shregs.msr & MSR_PR)) {
> +				r = RESUME_GUEST;
> +				continue;
> +			}
>  			trace_kvm_hcall_enter(vcpu);
>  			r = kvmppc_pseries_do_hcall(vcpu);
>  			trace_kvm_hcall_exit(vcpu, r);
> diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> index c11597f815e4..2d0d14ed1d92 100644
> --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> @@ -1397,9 +1397,14 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
>  	mr	r4,r9
>  	bge	fast_guest_return
>  2:
> +	/* If we came in through the P9 short path, no real mode hcalls */
> +	lwz	r0, STACK_SLOT_SHORT_PATH(r1)
> +	cmpwi	r0, 0
> +	bne	no_try_real
>  	/* See if this is an hcall we can handle in real mode */
>  	cmpwi	r12,BOOK3S_INTERRUPT_SYSCALL
>  	beq	hcall_try_real_mode
> +no_try_real:
>
>  	/* Hypervisor doorbell - exit only if host IPI flag set */
>  	cmpwi	r12, BOOK3S_INTERRUPT_H_DOORBELL
> diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c
> index 52cdb9e2660a..1e4871bbcad4 100644
> --- a/arch/powerpc/kvm/book3s_xive.c
> +++ b/arch/powerpc/kvm/book3s_xive.c
> @@ -158,6 +158,40 @@ void kvmppc_xive_pull_vcpu(struct kvm_vcpu *vcpu)
>  }
>  EXPORT_SYMBOL_GPL(kvmppc_xive_pull_vcpu);
>
> +void kvmppc_xive_cede_vcpu(struct kvm_vcpu *vcpu)
> +{
> +	void __iomem *esc_vaddr = (void __iomem *)vcpu->arch.xive_esc_vaddr;
> +
> +	if (!esc_vaddr)
> +		return;
> +
> +	/* we are using XIVE with single escalation */
> +
> +	if (vcpu->arch.xive_esc_on) {
> +		/*
> +		 * If we still have a pending escalation, abort the cede,
> +		 * and we must set PQ to 10 rather than 00 so that we don't
> +		 * potentially end up with two entries for the escalation
> +		 * interrupt in the XIVE interrupt queue.  In that case
> +		 * we also don't want to set xive_esc_on to 1 here in
> +		 * case we race with xive_esc_irq().
> +		 */
> +		vcpu->arch.ceded = 0;
> +		/*
> +		 * The escalation interrupts are special as we don't EOI them.
> +		 * There is no need to use the load-after-store ordering offset
> +		 * to set PQ to 10 as we won't use StoreEOI.
> +		 */
> +		__raw_readq(esc_vaddr + XIVE_ESB_SET_PQ_10);
> +	} else {
> +		vcpu->arch.xive_esc_on = true;
> +		mb();
> +		__raw_readq(esc_vaddr + XIVE_ESB_SET_PQ_00);
> +	}
> +	mb();
> +}
> +EXPORT_SYMBOL_GPL(kvmppc_xive_cede_vcpu);
> +
>  /*
>   * This is a simple trigger for a generic XIVE IRQ. This must
>   * only be called for interrupts that support a trigger page
> @@ -2106,6 +2140,32 @@ static int kvmppc_xive_create(struct kvm_device *dev, u32 type)
>  	return 0;
>  }
>
> +int kvmppc_xive_xics_hcall(struct kvm_vcpu *vcpu, u32 req)
> +{
> +	struct kvmppc_vcore *vc = vcpu->arch.vcore;
> +
> +	switch (req) {
> +	case H_XIRR:
> +		return xive_vm_h_xirr(vcpu);
> +	case H_CPPR:
> +		return xive_vm_h_cppr(vcpu, kvmppc_get_gpr(vcpu, 4));
> +	case H_EOI:
> +		return xive_vm_h_eoi(vcpu, kvmppc_get_gpr(vcpu, 4));
> +	case H_IPI:
> +		return xive_vm_h_ipi(vcpu, kvmppc_get_gpr(vcpu, 4),
> +					  kvmppc_get_gpr(vcpu, 5));
> +	case H_IPOLL:
> +		return xive_vm_h_ipoll(vcpu, kvmppc_get_gpr(vcpu, 4));
> +	case H_XIRR_X:
> +		xive_vm_h_xirr(vcpu);
> +		kvmppc_set_gpr(vcpu, 5, get_tb() + vc->tb_offset);
> +		return H_SUCCESS;
> +	}
> +
> +	return H_UNSUPPORTED;
> +}
> +EXPORT_SYMBOL_GPL(kvmppc_xive_xics_hcall);
> +
>  int kvmppc_xive_debug_show_queues(struct seq_file *m, struct kvm_vcpu *vcpu)
>  {
>  	struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
Nicholas Piggin March 17, 2021, 10:41 p.m. UTC | #2
Excerpts from Fabiano Rosas's message of March 18, 2021 2:22 am:
> Nicholas Piggin <npiggin@gmail.com> writes:
> 
>> In the interest of minimising the amount of code that is run in
>> "real-mode", don't handle hcalls in real mode in the P9 path.
>>
>> POWER8 and earlier are much more expensive to exit from HV real mode
>> and switch to host mode, because on those processors HV interrupts get
>> to the hypervisor with the MMU off, and the other threads in the core
>> need to be pulled out of the guest, and SLBs all need to be saved,
>> ERATs invalidated, and host SLB reloaded before the MMU is re-enabled
>> in host mode. Hash guests also require a lot of hcalls to run. The
>> XICS interrupt controller requires hcalls to run.
>>
>> By contrast, POWER9 has independent thread switching, and in radix mode
>> the hypervisor is already in a host virtual memory mode when the HV
>> interrupt is taken. Radix + xive guests don't need hcalls to handle
>> interrupts or manage translations.
>>
>> So it's much less important to handle hcalls in real mode in P9.
>>
>> Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
>> ---
> 
> <snip>
> 
>> diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
>> index 497f216ad724..1f2ba8955c6a 100644
>> --- a/arch/powerpc/kvm/book3s_hv.c
>> +++ b/arch/powerpc/kvm/book3s_hv.c
>> @@ -1147,7 +1147,7 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
>>   * This has to be done early, not in kvmppc_pseries_do_hcall(), so
>>   * that the cede logic in kvmppc_run_single_vcpu() works properly.
>>   */
>> -static void kvmppc_nested_cede(struct kvm_vcpu *vcpu)
>> +static void kvmppc_cede(struct kvm_vcpu *vcpu)
> 
> The comment above needs to be updated I think.
> 
>>  {
>>  	vcpu->arch.shregs.msr |= MSR_EE;
>>  	vcpu->arch.ceded = 1;
>> @@ -1403,9 +1403,15 @@ static int kvmppc_handle_exit_hv(struct kvm_vcpu *vcpu,
>>  		/* hcall - punt to userspace */
>>  		int i;
>>
>> -		/* hypercall with MSR_PR has already been handled in rmode,
>> -		 * and never reaches here.
>> -		 */
>> +		if (unlikely(vcpu->arch.shregs.msr & MSR_PR)) {
>> +			/*
>> +			 * Guest userspace executed sc 1, reflect it back as a
>> +			 * privileged program check interrupt.
>> +			 */
>> +			kvmppc_core_queue_program(vcpu, SRR1_PROGPRIV);
>> +			r = RESUME_GUEST;
>> +			break;
>> +		}
>>
>>  		run->papr_hcall.nr = kvmppc_get_gpr(vcpu, 3);
>>  		for (i = 0; i < 9; ++i)
>> @@ -3740,15 +3746,36 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit,
>>  		/* H_CEDE has to be handled now, not later */
>>  		if (trap == BOOK3S_INTERRUPT_SYSCALL && !vcpu->arch.nested &&
>>  		    kvmppc_get_gpr(vcpu, 3) == H_CEDE) {
>> -			kvmppc_nested_cede(vcpu);
>> +			kvmppc_cede(vcpu);
>>  			kvmppc_set_gpr(vcpu, 3, 0);
>>  			trap = 0;
>>  		}
>>  	} else {
>>  		kvmppc_xive_push_vcpu(vcpu);
>>  		trap = kvmhv_load_hv_regs_and_go(vcpu, time_limit, lpcr);
>> -		kvmppc_xive_pull_vcpu(vcpu);
>> +		/* H_CEDE has to be handled now, not later */
>> +		/* XICS hcalls must be handled before xive is pulled */
>> +		if (trap == BOOK3S_INTERRUPT_SYSCALL &&
>> +		    !(vcpu->arch.shregs.msr & MSR_PR)) {
>> +			unsigned long req = kvmppc_get_gpr(vcpu, 3);
>>
>> +			if (req == H_CEDE) {
>> +				kvmppc_cede(vcpu);
>> +				kvmppc_xive_cede_vcpu(vcpu); /* may un-cede */
>> +				kvmppc_set_gpr(vcpu, 3, 0);
>> +				trap = 0;
>> +			}
>> +			if (req == H_EOI || req == H_CPPR ||
>> +			    req == H_IPI || req == H_IPOLL ||
>> +			    req == H_XIRR || req == H_XIRR_X) {
>> +				unsigned long ret;
>> +
>> +				ret = kvmppc_xive_xics_hcall(vcpu, req);
>> +				kvmppc_set_gpr(vcpu, 3, ret);
>> +				trap = 0;
>> +			}
>> +		}
> 
> I tried running L2 with xive=off and this code slows down the boot
> considerably. I think we're missing a !vcpu->arch.nested in the
> conditional.

You might be right, the real mode handlers never run if nested is set
so none of these should run I think.

> 
> This may also be missing these checks from kvmppc_pseries_do_hcall:
> 
> 		if (kvmppc_xics_enabled(vcpu)) {
> 			if (xics_on_xive()) {
> 				ret = H_NOT_AVAILABLE;
> 				return RESUME_GUEST;
> 			}
> 			ret = kvmppc_xics_hcall(vcpu, req);
>                         (...)

Well this is the formerly real-mode part of the hcall, whereas 
pseries_do_hcall is the virt-mode handler so it expects the real mode 
has already run.

Hmm, probably it shouldn't be setting trap = 0 if it did not handle the
hcall. I don't know if that's the problem you have or if it's the nested
test but probably should test for this anyway.

> For H_CEDE there might be a similar situation since we're shadowing the
> code above that runs after H_ENTER_NESTED by setting trap to 0 here.

Yes.

Thanks,
Nick
Alexey Kardashevskiy March 22, 2021, 7:30 a.m. UTC | #3
On 06/03/2021 02:06, Nicholas Piggin wrote:
> In the interest of minimising the amount of code that is run in
> "real-mode", don't handle hcalls in real mode in the P9 path.
> 
> POWER8 and earlier are much more expensive to exit from HV real mode
> and switch to host mode, because on those processors HV interrupts get
> to the hypervisor with the MMU off, and the other threads in the core
> need to be pulled out of the guest, and SLBs all need to be saved,
> ERATs invalidated, and host SLB reloaded before the MMU is re-enabled
> in host mode. Hash guests also require a lot of hcalls to run. The
> XICS interrupt controller requires hcalls to run.
> 
> By contrast, POWER9 has independent thread switching, and in radix mode
> the hypervisor is already in a host virtual memory mode when the HV
> interrupt is taken. Radix + xive guests don't need hcalls to handle
> interrupts or manage translations.
> 
> So it's much less important to handle hcalls in real mode in P9.

So acde25726bc6034b (which added if(kvm_is_radix(vcpu->kvm))return 
H_TOO_HARD) can be reverted, pretty much?



> 
> Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
> ---
>   arch/powerpc/include/asm/kvm_ppc.h      |  5 +++
>   arch/powerpc/kvm/book3s_hv.c            | 46 +++++++++++++++----
>   arch/powerpc/kvm/book3s_hv_rmhandlers.S |  5 +++
>   arch/powerpc/kvm/book3s_xive.c          | 60 +++++++++++++++++++++++++
>   4 files changed, 108 insertions(+), 8 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
> index 73b1ca5a6471..db6646c2ade2 100644
> --- a/arch/powerpc/include/asm/kvm_ppc.h
> +++ b/arch/powerpc/include/asm/kvm_ppc.h
> @@ -607,6 +607,7 @@ extern void kvmppc_free_pimap(struct kvm *kvm);
>   extern int kvmppc_xics_rm_complete(struct kvm_vcpu *vcpu, u32 hcall);
>   extern void kvmppc_xics_free_icp(struct kvm_vcpu *vcpu);
>   extern int kvmppc_xics_hcall(struct kvm_vcpu *vcpu, u32 cmd);
> +extern int kvmppc_xive_xics_hcall(struct kvm_vcpu *vcpu, u32 req);
>   extern u64 kvmppc_xics_get_icp(struct kvm_vcpu *vcpu);
>   extern int kvmppc_xics_set_icp(struct kvm_vcpu *vcpu, u64 icpval);
>   extern int kvmppc_xics_connect_vcpu(struct kvm_device *dev,
> @@ -639,6 +640,8 @@ static inline int kvmppc_xics_enabled(struct kvm_vcpu *vcpu)
>   static inline void kvmppc_xics_free_icp(struct kvm_vcpu *vcpu) { }
>   static inline int kvmppc_xics_hcall(struct kvm_vcpu *vcpu, u32 cmd)
>   	{ return 0; }
> +static inline int kvmppc_xive_xics_hcall(struct kvm_vcpu *vcpu, u32 req)
> +	{ return 0; }
>   #endif
>   
>   #ifdef CONFIG_KVM_XIVE
> @@ -673,6 +676,7 @@ extern int kvmppc_xive_set_irq(struct kvm *kvm, int irq_source_id, u32 irq,
>   			       int level, bool line_status);
>   extern void kvmppc_xive_push_vcpu(struct kvm_vcpu *vcpu);
>   extern void kvmppc_xive_pull_vcpu(struct kvm_vcpu *vcpu);
> +extern void kvmppc_xive_cede_vcpu(struct kvm_vcpu *vcpu);
>   
>   static inline int kvmppc_xive_enabled(struct kvm_vcpu *vcpu)
>   {
> @@ -714,6 +718,7 @@ static inline int kvmppc_xive_set_irq(struct kvm *kvm, int irq_source_id, u32 ir
>   				      int level, bool line_status) { return -ENODEV; }
>   static inline void kvmppc_xive_push_vcpu(struct kvm_vcpu *vcpu) { }
>   static inline void kvmppc_xive_pull_vcpu(struct kvm_vcpu *vcpu) { }
> +static inline void kvmppc_xive_cede_vcpu(struct kvm_vcpu *vcpu) { }
>   
>   static inline int kvmppc_xive_enabled(struct kvm_vcpu *vcpu)
>   	{ return 0; }
> diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
> index 497f216ad724..1f2ba8955c6a 100644
> --- a/arch/powerpc/kvm/book3s_hv.c
> +++ b/arch/powerpc/kvm/book3s_hv.c
> @@ -1147,7 +1147,7 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
>    * This has to be done early, not in kvmppc_pseries_do_hcall(), so
>    * that the cede logic in kvmppc_run_single_vcpu() works properly.
>    */
> -static void kvmppc_nested_cede(struct kvm_vcpu *vcpu)
> +static void kvmppc_cede(struct kvm_vcpu *vcpu)
>   {
>   	vcpu->arch.shregs.msr |= MSR_EE;
>   	vcpu->arch.ceded = 1;
> @@ -1403,9 +1403,15 @@ static int kvmppc_handle_exit_hv(struct kvm_vcpu *vcpu,
>   		/* hcall - punt to userspace */
>   		int i;
>   
> -		/* hypercall with MSR_PR has already been handled in rmode,
> -		 * and never reaches here.
> -		 */
> +		if (unlikely(vcpu->arch.shregs.msr & MSR_PR)) {
> +			/*
> +			 * Guest userspace executed sc 1, reflect it back as a
> +			 * privileged program check interrupt.
> +			 */
> +			kvmppc_core_queue_program(vcpu, SRR1_PROGPRIV);
> +			r = RESUME_GUEST;
> +			break;
> +		}
>   
>   		run->papr_hcall.nr = kvmppc_get_gpr(vcpu, 3);
>   		for (i = 0; i < 9; ++i)
> @@ -3740,15 +3746,36 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit,
>   		/* H_CEDE has to be handled now, not later */
>   		if (trap == BOOK3S_INTERRUPT_SYSCALL && !vcpu->arch.nested &&
>   		    kvmppc_get_gpr(vcpu, 3) == H_CEDE) {
> -			kvmppc_nested_cede(vcpu);
> +			kvmppc_cede(vcpu);
>   			kvmppc_set_gpr(vcpu, 3, 0);
>   			trap = 0;
>   		}
>   	} else {
>   		kvmppc_xive_push_vcpu(vcpu);
>   		trap = kvmhv_load_hv_regs_and_go(vcpu, time_limit, lpcr);
> -		kvmppc_xive_pull_vcpu(vcpu);
> +		/* H_CEDE has to be handled now, not later */
> +		/* XICS hcalls must be handled before xive is pulled */
> +		if (trap == BOOK3S_INTERRUPT_SYSCALL &&
> +		    !(vcpu->arch.shregs.msr & MSR_PR)) {
> +			unsigned long req = kvmppc_get_gpr(vcpu, 3);
>   
> +			if (req == H_CEDE) {
> +				kvmppc_cede(vcpu);
> +				kvmppc_xive_cede_vcpu(vcpu); /* may un-cede */
> +				kvmppc_set_gpr(vcpu, 3, 0);
> +				trap = 0;
> +			}
> +			if (req == H_EOI || req == H_CPPR ||

else if (req == H_EOI ... ?

> +			    req == H_IPI || req == H_IPOLL ||
> +			    req == H_XIRR || req == H_XIRR_X) {
> +				unsigned long ret;
> +
> +				ret = kvmppc_xive_xics_hcall(vcpu, req);
> +				kvmppc_set_gpr(vcpu, 3, ret);
> +				trap = 0;
> +			}
> +		}
> +		kvmppc_xive_pull_vcpu(vcpu);
>   	}
>   
>   	vcpu->arch.slb_max = 0;
> @@ -4408,8 +4435,11 @@ static int kvmppc_vcpu_run_hv(struct kvm_vcpu *vcpu)
>   		else
>   			r = kvmppc_run_vcpu(vcpu);
>   
> -		if (run->exit_reason == KVM_EXIT_PAPR_HCALL &&
> -		    !(vcpu->arch.shregs.msr & MSR_PR)) {
> +		if (run->exit_reason == KVM_EXIT_PAPR_HCALL) {
> +			if (WARN_ON_ONCE(vcpu->arch.shregs.msr & MSR_PR)) {
> +				r = RESUME_GUEST;
> +				continue;
> +			}
>   			trace_kvm_hcall_enter(vcpu);
>   			r = kvmppc_pseries_do_hcall(vcpu);
>   			trace_kvm_hcall_exit(vcpu, r);
> diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> index c11597f815e4..2d0d14ed1d92 100644
> --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> @@ -1397,9 +1397,14 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
>   	mr	r4,r9
>   	bge	fast_guest_return
>   2:
> +	/* If we came in through the P9 short path, no real mode hcalls */
> +	lwz	r0, STACK_SLOT_SHORT_PATH(r1)
> +	cmpwi	r0, 0
> +	bne	no_try_real
>   	/* See if this is an hcall we can handle in real mode */
>   	cmpwi	r12,BOOK3S_INTERRUPT_SYSCALL
>   	beq	hcall_try_real_mode
> +no_try_real:
>   
>   	/* Hypervisor doorbell - exit only if host IPI flag set */
>   	cmpwi	r12, BOOK3S_INTERRUPT_H_DOORBELL
> diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c
> index 52cdb9e2660a..1e4871bbcad4 100644
> --- a/arch/powerpc/kvm/book3s_xive.c
> +++ b/arch/powerpc/kvm/book3s_xive.c
> @@ -158,6 +158,40 @@ void kvmppc_xive_pull_vcpu(struct kvm_vcpu *vcpu)
>   }
>   EXPORT_SYMBOL_GPL(kvmppc_xive_pull_vcpu);
>   
> +void kvmppc_xive_cede_vcpu(struct kvm_vcpu *vcpu)
> +{
> +	void __iomem *esc_vaddr = (void __iomem *)vcpu->arch.xive_esc_vaddr;
> +
> +	if (!esc_vaddr)
> +		return;
> +
> +	/* we are using XIVE with single escalation */
> +
> +	if (vcpu->arch.xive_esc_on) {
> +		/*
> +		 * If we still have a pending escalation, abort the cede,
> +		 * and we must set PQ to 10 rather than 00 so that we don't
> +		 * potentially end up with two entries for the escalation
> +		 * interrupt in the XIVE interrupt queue.  In that case
> +		 * we also don't want to set xive_esc_on to 1 here in
> +		 * case we race with xive_esc_irq().
> +		 */
> +		vcpu->arch.ceded = 0;
> +		/*
> +		 * The escalation interrupts are special as we don't EOI them.
> +		 * There is no need to use the load-after-store ordering offset
> +		 * to set PQ to 10 as we won't use StoreEOI.
> +		 */
> +		__raw_readq(esc_vaddr + XIVE_ESB_SET_PQ_10);
> +	} else {
> +		vcpu->arch.xive_esc_on = true;
> +		mb();
> +		__raw_readq(esc_vaddr + XIVE_ESB_SET_PQ_00);
> +	}
> +	mb();


Uff. Thanks for cut-n-pasting the comments, helped a lot to match this c 
to that asm!


> +}
> +EXPORT_SYMBOL_GPL(kvmppc_xive_cede_vcpu);
> +
>   /*
>    * This is a simple trigger for a generic XIVE IRQ. This must
>    * only be called for interrupts that support a trigger page
> @@ -2106,6 +2140,32 @@ static int kvmppc_xive_create(struct kvm_device *dev, u32 type)
>   	return 0;
>   }
>   
> +int kvmppc_xive_xics_hcall(struct kvm_vcpu *vcpu, u32 req)
> +{
> +	struct kvmppc_vcore *vc = vcpu->arch.vcore;


Can a XIVE enabled guest issue these hcalls? Don't we want if 
(!kvmppc_xics_enabled(vcpu)) and
  if (xics_on_xive()) here, as kvmppc_rm_h_xirr() have? Some of these 
hcalls do write to XIVE registers but some seem to change 
kvmppc_xive_vcpu. Thanks,




> +
> +	switch (req) {
> +	case H_XIRR:
> +		return xive_vm_h_xirr(vcpu);
> +	case H_CPPR:
> +		return xive_vm_h_cppr(vcpu, kvmppc_get_gpr(vcpu, 4));
> +	case H_EOI:
> +		return xive_vm_h_eoi(vcpu, kvmppc_get_gpr(vcpu, 4));
> +	case H_IPI:
> +		return xive_vm_h_ipi(vcpu, kvmppc_get_gpr(vcpu, 4),
> +					  kvmppc_get_gpr(vcpu, 5));
> +	case H_IPOLL:
> +		return xive_vm_h_ipoll(vcpu, kvmppc_get_gpr(vcpu, 4));
> +	case H_XIRR_X:
> +		xive_vm_h_xirr(vcpu);
> +		kvmppc_set_gpr(vcpu, 5, get_tb() + vc->tb_offset);
> +		return H_SUCCESS;
> +	}
> +
> +	return H_UNSUPPORTED;
> +}
> +EXPORT_SYMBOL_GPL(kvmppc_xive_xics_hcall);
> +
>   int kvmppc_xive_debug_show_queues(struct seq_file *m, struct kvm_vcpu *vcpu)
>   {
>   	struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
>
Nicholas Piggin March 22, 2021, 1:15 p.m. UTC | #4
Excerpts from Alexey Kardashevskiy's message of March 22, 2021 5:30 pm:
> 
> 
> On 06/03/2021 02:06, Nicholas Piggin wrote:
>> In the interest of minimising the amount of code that is run in
>> "real-mode", don't handle hcalls in real mode in the P9 path.
>> 
>> POWER8 and earlier are much more expensive to exit from HV real mode
>> and switch to host mode, because on those processors HV interrupts get
>> to the hypervisor with the MMU off, and the other threads in the core
>> need to be pulled out of the guest, and SLBs all need to be saved,
>> ERATs invalidated, and host SLB reloaded before the MMU is re-enabled
>> in host mode. Hash guests also require a lot of hcalls to run. The
>> XICS interrupt controller requires hcalls to run.
>> 
>> By contrast, POWER9 has independent thread switching, and in radix mode
>> the hypervisor is already in a host virtual memory mode when the HV
>> interrupt is taken. Radix + xive guests don't need hcalls to handle
>> interrupts or manage translations.
>> 
>> So it's much less important to handle hcalls in real mode in P9.
> 
> So acde25726bc6034b (which added if(kvm_is_radix(vcpu->kvm))return 
> H_TOO_HARD) can be reverted, pretty much?

Yes. Although that calls attention to the fact I missed doing
a P9 h_random handler in this patch. I'll fix that, then I think
acde2572 could be reverted entirely.

[...]

>>   	} else {
>>   		kvmppc_xive_push_vcpu(vcpu);
>>   		trap = kvmhv_load_hv_regs_and_go(vcpu, time_limit, lpcr);
>> -		kvmppc_xive_pull_vcpu(vcpu);
>> +		/* H_CEDE has to be handled now, not later */
>> +		/* XICS hcalls must be handled before xive is pulled */
>> +		if (trap == BOOK3S_INTERRUPT_SYSCALL &&
>> +		    !(vcpu->arch.shregs.msr & MSR_PR)) {
>> +			unsigned long req = kvmppc_get_gpr(vcpu, 3);
>>   
>> +			if (req == H_CEDE) {
>> +				kvmppc_cede(vcpu);
>> +				kvmppc_xive_cede_vcpu(vcpu); /* may un-cede */
>> +				kvmppc_set_gpr(vcpu, 3, 0);
>> +				trap = 0;
>> +			}
>> +			if (req == H_EOI || req == H_CPPR ||
> 
> else if (req == H_EOI ... ?

Hummm, sure.

[...]

>> +void kvmppc_xive_cede_vcpu(struct kvm_vcpu *vcpu)
>> +{
>> +	void __iomem *esc_vaddr = (void __iomem *)vcpu->arch.xive_esc_vaddr;
>> +
>> +	if (!esc_vaddr)
>> +		return;
>> +
>> +	/* we are using XIVE with single escalation */
>> +
>> +	if (vcpu->arch.xive_esc_on) {
>> +		/*
>> +		 * If we still have a pending escalation, abort the cede,
>> +		 * and we must set PQ to 10 rather than 00 so that we don't
>> +		 * potentially end up with two entries for the escalation
>> +		 * interrupt in the XIVE interrupt queue.  In that case
>> +		 * we also don't want to set xive_esc_on to 1 here in
>> +		 * case we race with xive_esc_irq().
>> +		 */
>> +		vcpu->arch.ceded = 0;
>> +		/*
>> +		 * The escalation interrupts are special as we don't EOI them.
>> +		 * There is no need to use the load-after-store ordering offset
>> +		 * to set PQ to 10 as we won't use StoreEOI.
>> +		 */
>> +		__raw_readq(esc_vaddr + XIVE_ESB_SET_PQ_10);
>> +	} else {
>> +		vcpu->arch.xive_esc_on = true;
>> +		mb();
>> +		__raw_readq(esc_vaddr + XIVE_ESB_SET_PQ_00);
>> +	}
>> +	mb();
> 
> 
> Uff. Thanks for cut-n-pasting the comments, helped a lot to match this c 
> to that asm!

Glad it helped.

>> +}
>> +EXPORT_SYMBOL_GPL(kvmppc_xive_cede_vcpu);
>> +
>>   /*
>>    * This is a simple trigger for a generic XIVE IRQ. This must
>>    * only be called for interrupts that support a trigger page
>> @@ -2106,6 +2140,32 @@ static int kvmppc_xive_create(struct kvm_device *dev, u32 type)
>>   	return 0;
>>   }
>>   
>> +int kvmppc_xive_xics_hcall(struct kvm_vcpu *vcpu, u32 req)
>> +{
>> +	struct kvmppc_vcore *vc = vcpu->arch.vcore;
> 
> 
> Can a XIVE enabled guest issue these hcalls? Don't we want if 
> (!kvmppc_xics_enabled(vcpu)) and
>   if (xics_on_xive()) here, as kvmppc_rm_h_xirr() have? Some of these 
> hcalls do write to XIVE registers but some seem to change 
> kvmppc_xive_vcpu. Thanks,

Yes I think you're right, good catch. I'm not completely sure about all 
the xive and xics modes but a guest certainly can make any kind of hcall 
it likes and we have to sanity check it.

We want to take the hcall here (in replacement of the real mode hcalls)
with the same condition. So it would be:

        if (!kvmppc_xics_enabled(vcpu))
                return H_TOO_HARD;
        if (!xics_on_xive())
		return H_TOO_HARD;
	
	[ ... process xive_vm_h_xirr / cppr / eoi / etc ]

Right?

Thanks,
Nick

> 
> 
> 
> 
>> +
>> +	switch (req) {
>> +	case H_XIRR:
>> +		return xive_vm_h_xirr(vcpu);
>> +	case H_CPPR:
>> +		return xive_vm_h_cppr(vcpu, kvmppc_get_gpr(vcpu, 4));
>> +	case H_EOI:
>> +		return xive_vm_h_eoi(vcpu, kvmppc_get_gpr(vcpu, 4));
>> +	case H_IPI:
>> +		return xive_vm_h_ipi(vcpu, kvmppc_get_gpr(vcpu, 4),
>> +					  kvmppc_get_gpr(vcpu, 5));
>> +	case H_IPOLL:
>> +		return xive_vm_h_ipoll(vcpu, kvmppc_get_gpr(vcpu, 4));
>> +	case H_XIRR_X:
>> +		xive_vm_h_xirr(vcpu);
>> +		kvmppc_set_gpr(vcpu, 5, get_tb() + vc->tb_offset);
>> +		return H_SUCCESS;
>> +	}
>> +
>> +	return H_UNSUPPORTED;
>> +}
>> +EXPORT_SYMBOL_GPL(kvmppc_xive_xics_hcall);
>> +
>>   int kvmppc_xive_debug_show_queues(struct seq_file *m, struct kvm_vcpu *vcpu)
>>   {
>>   	struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
>> 
> 
> -- 
> Alexey
>
Cédric Le Goater March 22, 2021, 4:01 p.m. UTC | #5
On 3/22/21 2:15 PM, Nicholas Piggin wrote:
> Excerpts from Alexey Kardashevskiy's message of March 22, 2021 5:30 pm:
>>
>>
>> On 06/03/2021 02:06, Nicholas Piggin wrote:
>>> In the interest of minimising the amount of code that is run in>>> "real-mode", don't handle hcalls in real mode in the P9 path.
>>>
>>> POWER8 and earlier are much more expensive to exit from HV real mode
>>> and switch to host mode, because on those processors HV interrupts get
>>> to the hypervisor with the MMU off, and the other threads in the core
>>> need to be pulled out of the guest, and SLBs all need to be saved,
>>> ERATs invalidated, and host SLB reloaded before the MMU is re-enabled
>>> in host mode. Hash guests also require a lot of hcalls to run. The
>>> XICS interrupt controller requires hcalls to run.
>>>
>>> By contrast, POWER9 has independent thread switching, and in radix mode
>>> the hypervisor is already in a host virtual memory mode when the HV
>>> interrupt is taken. Radix + xive guests don't need hcalls to handle
>>> interrupts or manage translations.

Do we need to handle the host-is-a-P9-without-xive case ?

>>> So it's much less important to handle hcalls in real mode in P9.
>>
>> So acde25726bc6034b (which added if(kvm_is_radix(vcpu->kvm))return 
>> H_TOO_HARD) can be reverted, pretty much?
> 
> Yes. Although that calls attention to the fact I missed doing
> a P9 h_random handler in this patch. I'll fix that, then I think
> acde2572 could be reverted entirely.
> 
> [...]
> 
>>>   	} else {
>>>   		kvmppc_xive_push_vcpu(vcpu);
>>>   		trap = kvmhv_load_hv_regs_and_go(vcpu, time_limit, lpcr);
>>> -		kvmppc_xive_pull_vcpu(vcpu);
>>> +		/* H_CEDE has to be handled now, not later */
>>> +		/* XICS hcalls must be handled before xive is pulled */
>>> +		if (trap == BOOK3S_INTERRUPT_SYSCALL &&
>>> +		    !(vcpu->arch.shregs.msr & MSR_PR)) {
>>> +			unsigned long req = kvmppc_get_gpr(vcpu, 3);
>>>   
>>> +			if (req == H_CEDE) {
>>> +				kvmppc_cede(vcpu);
>>> +				kvmppc_xive_cede_vcpu(vcpu); /* may un-cede */
>>> +				kvmppc_set_gpr(vcpu, 3, 0);
>>> +				trap = 0;
>>> +			}
>>> +			if (req == H_EOI || req == H_CPPR ||
>>
>> else if (req == H_EOI ... ?
> 
> Hummm, sure.

you could integrate the H_CEDE in the switch statement below.

> 
> [...]
> 
>>> +void kvmppc_xive_cede_vcpu(struct kvm_vcpu *vcpu)
>>> +{
>>> +	void __iomem *esc_vaddr = (void __iomem *)vcpu->arch.xive_esc_vaddr;
>>> +
>>> +	if (!esc_vaddr)
>>> +		return;
>>> +
>>> +	/* we are using XIVE with single escalation */
>>> +
>>> +	if (vcpu->arch.xive_esc_on) {
>>> +		/*
>>> +		 * If we still have a pending escalation, abort the cede,
>>> +		 * and we must set PQ to 10 rather than 00 so that we don't
>>> +		 * potentially end up with two entries for the escalation
>>> +		 * interrupt in the XIVE interrupt queue.  In that case
>>> +		 * we also don't want to set xive_esc_on to 1 here in
>>> +		 * case we race with xive_esc_irq().
>>> +		 */
>>> +		vcpu->arch.ceded = 0;
>>> +		/*
>>> +		 * The escalation interrupts are special as we don't EOI them.
>>> +		 * There is no need to use the load-after-store ordering offset
>>> +		 * to set PQ to 10 as we won't use StoreEOI.
>>> +		 */
>>> +		__raw_readq(esc_vaddr + XIVE_ESB_SET_PQ_10);
>>> +	} else {
>>> +		vcpu->arch.xive_esc_on = true;
>>> +		mb();
>>> +		__raw_readq(esc_vaddr + XIVE_ESB_SET_PQ_00);
>>> +	}
>>> +	mb();
>>
>>
>> Uff. Thanks for cut-n-pasting the comments, helped a lot to match this c 
>> to that asm!
> 
> Glad it helped.
>>> +}

I had to do the PowerNV models in QEMU to start understanding that stuff ... 

>>> +EXPORT_SYMBOL_GPL(kvmppc_xive_cede_vcpu);
>>> +
>>>   /*
>>>    * This is a simple trigger for a generic XIVE IRQ. This must
>>>    * only be called for interrupts that support a trigger page
>>> @@ -2106,6 +2140,32 @@ static int kvmppc_xive_create(struct kvm_device *dev, u32 type)
>>>   	return 0;
>>>   }
>>>   
>>> +int kvmppc_xive_xics_hcall(struct kvm_vcpu *vcpu, u32 req)
>>> +{
>>> +	struct kvmppc_vcore *vc = vcpu->arch.vcore;
>>
>>
>> Can a XIVE enabled guest issue these hcalls? Don't we want if 
>> (!kvmppc_xics_enabled(vcpu)) and
>>   if (xics_on_xive()) here, as kvmppc_rm_h_xirr() have? Some of these 
>> hcalls do write to XIVE registers but some seem to change 
>> kvmppc_xive_vcpu. Thanks,
> 
> Yes I think you're right, good catch. I'm not completely sure about all 
> the xive and xics modes but a guest certainly can make any kind of hcall 
> it likes and we have to sanity check it.

Yes. 

> We want to take the hcall here (in replacement of the real mode hcalls)
> with the same condition. So it would be:
> 
>         if (!kvmppc_xics_enabled(vcpu))
>                 return H_TOO_HARD;

Yes.

This test covers the case in which a vCPU does XICS hcalls without QEMU 
having connected the vCPU to a XICS ICP. The ICP is the KVM XICS device 
on P8 or XICS-on-XIVE on P9. It catches QEMU errors when the interrupt 
mode is negotiated, we don't want the OS to do XICS hcalls after having 
negotiated the XIVE interrupt mode. 

It's different for the XIVE hcalls (when running under XICS) because they 
are all handled in QEMU. 

>         if (!xics_on_xive())
> 		return H_TOO_HARD;

I understand that this code is only called on P9 and with translation on.

On P9, we could have xics_on_xive() == 0 if XIVE is disabled at compile 
time or with "xive=off" at boot time. But guests should be supported. 
I don't see a reason to restrict the support even if these scenarios 
are rather unusual if not very rare.

on P10, it's the same but since we don't have the XICS emulation layer 
in OPAL, the host will be pretty useless. We don't care.

Since we are trying to handle hcalls, this is L0 and it can not be called 
for nested guests, which would be another case of xics_on_xive() == 0. 
We don't care either.



C.



> 	[ ... process xive_vm_h_xirr / cppr / eoi / etc ]
> 
> Right?
> 
> Thanks,
> Nick
> 
>>
>>
>>
>>
>>> +
>>> +	switch (req) {
>>> +	case H_XIRR:
>>> +		return xive_vm_h_xirr(vcpu);
>>> +	case H_CPPR:
>>> +		return xive_vm_h_cppr(vcpu, kvmppc_get_gpr(vcpu, 4));
>>> +	case H_EOI:
>>> +		return xive_vm_h_eoi(vcpu, kvmppc_get_gpr(vcpu, 4));
>>> +	case H_IPI:
>>> +		return xive_vm_h_ipi(vcpu, kvmppc_get_gpr(vcpu, 4),
>>> +					  kvmppc_get_gpr(vcpu, 5));
>>> +	case H_IPOLL:
>>> +		return xive_vm_h_ipoll(vcpu, kvmppc_get_gpr(vcpu, 4));
>>> +	case H_XIRR_X:
>>> +		xive_vm_h_xirr(vcpu);
>>> +		kvmppc_set_gpr(vcpu, 5, get_tb() + vc->tb_offset);
>>> +		return H_SUCCESS;
>>> +	}
>>> +
>>> +	return H_UNSUPPORTED;
>>> +}
>>> +EXPORT_SYMBOL_GPL(kvmppc_xive_xics_hcall);
>>> +
>>>   int kvmppc_xive_debug_show_queues(struct seq_file *m, struct kvm_vcpu *vcpu)
>>>   {
>>>   	struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
>>>
>>
>> -- 
>> Alexey
>>
Cédric Le Goater March 22, 2021, 4:12 p.m. UTC | #6
On 3/17/21 5:22 PM, Fabiano Rosas wrote:
> Nicholas Piggin <npiggin@gmail.com> writes:
> 
>> In the interest of minimising the amount of code that is run in
>> "real-mode", don't handle hcalls in real mode in the P9 path.
>>
>> POWER8 and earlier are much more expensive to exit from HV real mode
>> and switch to host mode, because on those processors HV interrupts get
>> to the hypervisor with the MMU off, and the other threads in the core
>> need to be pulled out of the guest, and SLBs all need to be saved,
>> ERATs invalidated, and host SLB reloaded before the MMU is re-enabled
>> in host mode. Hash guests also require a lot of hcalls to run. The
>> XICS interrupt controller requires hcalls to run.
>>
>> By contrast, POWER9 has independent thread switching, and in radix mode
>> the hypervisor is already in a host virtual memory mode when the HV
>> interrupt is taken. Radix + xive guests don't need hcalls to handle
>> interrupts or manage translations.
>>
>> So it's much less important to handle hcalls in real mode in P9.
>>
>> Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
>> ---
> 
> <snip>
> 
>> diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
>> index 497f216ad724..1f2ba8955c6a 100644
>> --- a/arch/powerpc/kvm/book3s_hv.c
>> +++ b/arch/powerpc/kvm/book3s_hv.c
>> @@ -1147,7 +1147,7 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
>>   * This has to be done early, not in kvmppc_pseries_do_hcall(), so
>>   * that the cede logic in kvmppc_run_single_vcpu() works properly.
>>   */
>> -static void kvmppc_nested_cede(struct kvm_vcpu *vcpu)
>> +static void kvmppc_cede(struct kvm_vcpu *vcpu)
> 
> The comment above needs to be updated I think.
> 
>>  {
>>  	vcpu->arch.shregs.msr |= MSR_EE;
>>  	vcpu->arch.ceded = 1;
>> @@ -1403,9 +1403,15 @@ static int kvmppc_handle_exit_hv(struct kvm_vcpu *vcpu,
>>  		/* hcall - punt to userspace */
>>  		int i;
>>
>> -		/* hypercall with MSR_PR has already been handled in rmode,
>> -		 * and never reaches here.
>> -		 */
>> +		if (unlikely(vcpu->arch.shregs.msr & MSR_PR)) {
>> +			/*
>> +			 * Guest userspace executed sc 1, reflect it back as a
>> +			 * privileged program check interrupt.
>> +			 */
>> +			kvmppc_core_queue_program(vcpu, SRR1_PROGPRIV);
>> +			r = RESUME_GUEST;
>> +			break;
>> +		}
>>
>>  		run->papr_hcall.nr = kvmppc_get_gpr(vcpu, 3);
>>  		for (i = 0; i < 9; ++i)
>> @@ -3740,15 +3746,36 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit,
>>  		/* H_CEDE has to be handled now, not later */
>>  		if (trap == BOOK3S_INTERRUPT_SYSCALL && !vcpu->arch.nested &&
>>  		    kvmppc_get_gpr(vcpu, 3) == H_CEDE) {
>> -			kvmppc_nested_cede(vcpu);
>> +			kvmppc_cede(vcpu);
>>  			kvmppc_set_gpr(vcpu, 3, 0);
>>  			trap = 0;
>>  		}
>>  	} else {
>>  		kvmppc_xive_push_vcpu(vcpu);
>>  		trap = kvmhv_load_hv_regs_and_go(vcpu, time_limit, lpcr);
>> -		kvmppc_xive_pull_vcpu(vcpu);
>> +		/* H_CEDE has to be handled now, not later */
>> +		/* XICS hcalls must be handled before xive is pulled */
>> +		if (trap == BOOK3S_INTERRUPT_SYSCALL &&
>> +		    !(vcpu->arch.shregs.msr & MSR_PR)) {
>> +			unsigned long req = kvmppc_get_gpr(vcpu, 3);
>>
>> +			if (req == H_CEDE) {
>> +				kvmppc_cede(vcpu);
>> +				kvmppc_xive_cede_vcpu(vcpu); /* may un-cede */
>> +				kvmppc_set_gpr(vcpu, 3, 0);
>> +				trap = 0;
>> +			}
>> +			if (req == H_EOI || req == H_CPPR ||
>> +			    req == H_IPI || req == H_IPOLL ||
>> +			    req == H_XIRR || req == H_XIRR_X) {
>> +				unsigned long ret;
>> +
>> +				ret = kvmppc_xive_xics_hcall(vcpu, req);
>> +				kvmppc_set_gpr(vcpu, 3, ret);
>> +				trap = 0;
>> +			}
>> +		}
> 
> I tried running L2 with xive=off and this code slows down the boot
> considerably. I think we're missing a !vcpu->arch.nested in the
> conditional.

L2 by default will always use the XIVE emulation in QEMU. If you deactivate 
XIVE support in the L2, with "xive=off" in the OS, or "ic-mode=xics" in the 
L1 QEMU, it will use the legacy XICS mode, emulated in the L1 KVM-on-pseries. 

And yes, the QEMU XIVE emulation tends to be faster. I don't exactly know
why. Probably because of less exit/entries ? 

C.


> This may also be missing these checks from kvmppc_pseries_do_hcall:
> 
> 		if (kvmppc_xics_enabled(vcpu)) {
> 			if (xics_on_xive()) {
> 				ret = H_NOT_AVAILABLE;
> 				return RESUME_GUEST;
> 			}
> 			ret = kvmppc_xics_hcall(vcpu, req);
>                         (...)
> 
> For H_CEDE there might be a similar situation since we're shadowing the
> code above that runs after H_ENTER_NESTED by setting trap to 0 here.
> 
>> +		kvmppc_xive_pull_vcpu(vcpu);
>>  	}
>>
>>  	vcpu->arch.slb_max = 0;
>> @@ -4408,8 +4435,11 @@ static int kvmppc_vcpu_run_hv(struct kvm_vcpu *vcpu)
>>  		else
>>  			r = kvmppc_run_vcpu(vcpu);
>>
>> -		if (run->exit_reason == KVM_EXIT_PAPR_HCALL &&
>> -		    !(vcpu->arch.shregs.msr & MSR_PR)) {
>> +		if (run->exit_reason == KVM_EXIT_PAPR_HCALL) {
>> +			if (WARN_ON_ONCE(vcpu->arch.shregs.msr & MSR_PR)) {
>> +				r = RESUME_GUEST;
>> +				continue;
>> +			}
>>  			trace_kvm_hcall_enter(vcpu);
>>  			r = kvmppc_pseries_do_hcall(vcpu);
>>  			trace_kvm_hcall_exit(vcpu, r);
>> diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
>> index c11597f815e4..2d0d14ed1d92 100644
>> --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
>> +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
>> @@ -1397,9 +1397,14 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
>>  	mr	r4,r9
>>  	bge	fast_guest_return
>>  2:
>> +	/* If we came in through the P9 short path, no real mode hcalls */
>> +	lwz	r0, STACK_SLOT_SHORT_PATH(r1)
>> +	cmpwi	r0, 0
>> +	bne	no_try_real
>>  	/* See if this is an hcall we can handle in real mode */
>>  	cmpwi	r12,BOOK3S_INTERRUPT_SYSCALL
>>  	beq	hcall_try_real_mode
>> +no_try_real:
>>
>>  	/* Hypervisor doorbell - exit only if host IPI flag set */
>>  	cmpwi	r12, BOOK3S_INTERRUPT_H_DOORBELL
>> diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c
>> index 52cdb9e2660a..1e4871bbcad4 100644
>> --- a/arch/powerpc/kvm/book3s_xive.c
>> +++ b/arch/powerpc/kvm/book3s_xive.c
>> @@ -158,6 +158,40 @@ void kvmppc_xive_pull_vcpu(struct kvm_vcpu *vcpu)
>>  }
>>  EXPORT_SYMBOL_GPL(kvmppc_xive_pull_vcpu);
>>
>> +void kvmppc_xive_cede_vcpu(struct kvm_vcpu *vcpu)
>> +{
>> +	void __iomem *esc_vaddr = (void __iomem *)vcpu->arch.xive_esc_vaddr;
>> +
>> +	if (!esc_vaddr)
>> +		return;
>> +
>> +	/* we are using XIVE with single escalation */
>> +
>> +	if (vcpu->arch.xive_esc_on) {
>> +		/*
>> +		 * If we still have a pending escalation, abort the cede,
>> +		 * and we must set PQ to 10 rather than 00 so that we don't
>> +		 * potentially end up with two entries for the escalation
>> +		 * interrupt in the XIVE interrupt queue.  In that case
>> +		 * we also don't want to set xive_esc_on to 1 here in
>> +		 * case we race with xive_esc_irq().
>> +		 */
>> +		vcpu->arch.ceded = 0;
>> +		/*
>> +		 * The escalation interrupts are special as we don't EOI them.
>> +		 * There is no need to use the load-after-store ordering offset
>> +		 * to set PQ to 10 as we won't use StoreEOI.
>> +		 */
>> +		__raw_readq(esc_vaddr + XIVE_ESB_SET_PQ_10);
>> +	} else {
>> +		vcpu->arch.xive_esc_on = true;
>> +		mb();
>> +		__raw_readq(esc_vaddr + XIVE_ESB_SET_PQ_00);
>> +	}
>> +	mb();
>> +}
>> +EXPORT_SYMBOL_GPL(kvmppc_xive_cede_vcpu);
>> +
>>  /*
>>   * This is a simple trigger for a generic XIVE IRQ. This must
>>   * only be called for interrupts that support a trigger page
>> @@ -2106,6 +2140,32 @@ static int kvmppc_xive_create(struct kvm_device *dev, u32 type)
>>  	return 0;
>>  }
>>
>> +int kvmppc_xive_xics_hcall(struct kvm_vcpu *vcpu, u32 req)
>> +{
>> +	struct kvmppc_vcore *vc = vcpu->arch.vcore;
>> +
>> +	switch (req) {
>> +	case H_XIRR:
>> +		return xive_vm_h_xirr(vcpu);
>> +	case H_CPPR:
>> +		return xive_vm_h_cppr(vcpu, kvmppc_get_gpr(vcpu, 4));
>> +	case H_EOI:
>> +		return xive_vm_h_eoi(vcpu, kvmppc_get_gpr(vcpu, 4));
>> +	case H_IPI:
>> +		return xive_vm_h_ipi(vcpu, kvmppc_get_gpr(vcpu, 4),
>> +					  kvmppc_get_gpr(vcpu, 5));
>> +	case H_IPOLL:
>> +		return xive_vm_h_ipoll(vcpu, kvmppc_get_gpr(vcpu, 4));
>> +	case H_XIRR_X:
>> +		xive_vm_h_xirr(vcpu);
>> +		kvmppc_set_gpr(vcpu, 5, get_tb() + vc->tb_offset);
>> +		return H_SUCCESS;
>> +	}
>> +
>> +	return H_UNSUPPORTED;
>> +}
>> +EXPORT_SYMBOL_GPL(kvmppc_xive_xics_hcall);
>> +
>>  int kvmppc_xive_debug_show_queues(struct seq_file *m, struct kvm_vcpu *vcpu)
>>  {
>>  	struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
Nicholas Piggin March 22, 2021, 6:22 p.m. UTC | #7
Excerpts from Cédric Le Goater's message of March 23, 2021 2:01 am:
> On 3/22/21 2:15 PM, Nicholas Piggin wrote:
>> Excerpts from Alexey Kardashevskiy's message of March 22, 2021 5:30 pm:
>>>
>>>
>>> On 06/03/2021 02:06, Nicholas Piggin wrote:
>>>> In the interest of minimising the amount of code that is run in>>> "real-mode", don't handle hcalls in real mode in the P9 path.
>>>>
>>>> POWER8 and earlier are much more expensive to exit from HV real mode
>>>> and switch to host mode, because on those processors HV interrupts get
>>>> to the hypervisor with the MMU off, and the other threads in the core
>>>> need to be pulled out of the guest, and SLBs all need to be saved,
>>>> ERATs invalidated, and host SLB reloaded before the MMU is re-enabled
>>>> in host mode. Hash guests also require a lot of hcalls to run. The
>>>> XICS interrupt controller requires hcalls to run.
>>>>
>>>> By contrast, POWER9 has independent thread switching, and in radix mode
>>>> the hypervisor is already in a host virtual memory mode when the HV
>>>> interrupt is taken. Radix + xive guests don't need hcalls to handle
>>>> interrupts or manage translations.
> 
> Do we need to handle the host-is-a-P9-without-xive case ?

I'm not sure really. Is there an intention for OPAL to be able to 
provide a fallback layer in the worst case? Maybe microwatt grows
HV capability before XIVE?

> 
>>>> So it's much less important to handle hcalls in real mode in P9.
>>>
>>> So acde25726bc6034b (which added if(kvm_is_radix(vcpu->kvm))return 
>>> H_TOO_HARD) can be reverted, pretty much?
>> 
>> Yes. Although that calls attention to the fact I missed doing
>> a P9 h_random handler in this patch. I'll fix that, then I think
>> acde2572 could be reverted entirely.
>> 
>> [...]
>> 
>>>>   	} else {
>>>>   		kvmppc_xive_push_vcpu(vcpu);
>>>>   		trap = kvmhv_load_hv_regs_and_go(vcpu, time_limit, lpcr);
>>>> -		kvmppc_xive_pull_vcpu(vcpu);
>>>> +		/* H_CEDE has to be handled now, not later */
>>>> +		/* XICS hcalls must be handled before xive is pulled */
>>>> +		if (trap == BOOK3S_INTERRUPT_SYSCALL &&
>>>> +		    !(vcpu->arch.shregs.msr & MSR_PR)) {
>>>> +			unsigned long req = kvmppc_get_gpr(vcpu, 3);
>>>>   
>>>> +			if (req == H_CEDE) {
>>>> +				kvmppc_cede(vcpu);
>>>> +				kvmppc_xive_cede_vcpu(vcpu); /* may un-cede */
>>>> +				kvmppc_set_gpr(vcpu, 3, 0);
>>>> +				trap = 0;
>>>> +			}
>>>> +			if (req == H_EOI || req == H_CPPR ||
>>>
>>> else if (req == H_EOI ... ?
>> 
>> Hummm, sure.
> 
> you could integrate the H_CEDE in the switch statement below.

Below is in a different file just for the emulation calls.

>> 
>> [...]
>> 
>>>> +void kvmppc_xive_cede_vcpu(struct kvm_vcpu *vcpu)
>>>> +{
>>>> +	void __iomem *esc_vaddr = (void __iomem *)vcpu->arch.xive_esc_vaddr;
>>>> +
>>>> +	if (!esc_vaddr)
>>>> +		return;
>>>> +
>>>> +	/* we are using XIVE with single escalation */
>>>> +
>>>> +	if (vcpu->arch.xive_esc_on) {
>>>> +		/*
>>>> +		 * If we still have a pending escalation, abort the cede,
>>>> +		 * and we must set PQ to 10 rather than 00 so that we don't
>>>> +		 * potentially end up with two entries for the escalation
>>>> +		 * interrupt in the XIVE interrupt queue.  In that case
>>>> +		 * we also don't want to set xive_esc_on to 1 here in
>>>> +		 * case we race with xive_esc_irq().
>>>> +		 */
>>>> +		vcpu->arch.ceded = 0;
>>>> +		/*
>>>> +		 * The escalation interrupts are special as we don't EOI them.
>>>> +		 * There is no need to use the load-after-store ordering offset
>>>> +		 * to set PQ to 10 as we won't use StoreEOI.
>>>> +		 */
>>>> +		__raw_readq(esc_vaddr + XIVE_ESB_SET_PQ_10);
>>>> +	} else {
>>>> +		vcpu->arch.xive_esc_on = true;
>>>> +		mb();
>>>> +		__raw_readq(esc_vaddr + XIVE_ESB_SET_PQ_00);
>>>> +	}
>>>> +	mb();
>>>
>>>
>>> Uff. Thanks for cut-n-pasting the comments, helped a lot to match this c 
>>> to that asm!
>> 
>> Glad it helped.
>>>> +}
> 
> I had to do the PowerNV models in QEMU to start understanding that stuff ... 
> 
>>>> +EXPORT_SYMBOL_GPL(kvmppc_xive_cede_vcpu);
>>>> +
>>>>   /*
>>>>    * This is a simple trigger for a generic XIVE IRQ. This must
>>>>    * only be called for interrupts that support a trigger page
>>>> @@ -2106,6 +2140,32 @@ static int kvmppc_xive_create(struct kvm_device *dev, u32 type)
>>>>   	return 0;
>>>>   }
>>>>   
>>>> +int kvmppc_xive_xics_hcall(struct kvm_vcpu *vcpu, u32 req)
>>>> +{
>>>> +	struct kvmppc_vcore *vc = vcpu->arch.vcore;
>>>
>>>
>>> Can a XIVE enabled guest issue these hcalls? Don't we want if 
>>> (!kvmppc_xics_enabled(vcpu)) and
>>>   if (xics_on_xive()) here, as kvmppc_rm_h_xirr() have? Some of these 
>>> hcalls do write to XIVE registers but some seem to change 
>>> kvmppc_xive_vcpu. Thanks,
>> 
>> Yes I think you're right, good catch. I'm not completely sure about all 
>> the xive and xics modes but a guest certainly can make any kind of hcall 
>> it likes and we have to sanity check it.
> 
> Yes. 
> 
>> We want to take the hcall here (in replacement of the real mode hcalls)
>> with the same condition. So it would be:
>> 
>>         if (!kvmppc_xics_enabled(vcpu))
>>                 return H_TOO_HARD;
> 
> Yes.
> 
> This test covers the case in which a vCPU does XICS hcalls without QEMU 
> having connected the vCPU to a XICS ICP. The ICP is the KVM XICS device 
> on P8 or XICS-on-XIVE on P9. It catches QEMU errors when the interrupt 
> mode is negotiated, we don't want the OS to do XICS hcalls after having 
> negotiated the XIVE interrupt mode. 

Okay.

> It's different for the XIVE hcalls (when running under XICS) because they 
> are all handled in QEMU. 

XIVE guest hcalls running on XICS host?

>>         if (!xics_on_xive())
>> 		return H_TOO_HARD;
> 
> I understand that this code is only called on P9 and with translation on.

Yes.

> On P9, we could have xics_on_xive() == 0 if XIVE is disabled at compile 
> time or with "xive=off" at boot time. But guests should be supported. 
> I don't see a reason to restrict the support even if these scenarios 
> are rather unusual if not very rare.
> 
> on P10, it's the same but since we don't have the XICS emulation layer 
> in OPAL, the host will be pretty useless. We don't care.
> 
> Since we are trying to handle hcalls, this is L0 and it can not be called 
> for nested guests, which would be another case of xics_on_xive() == 0. 
> We don't care either.

Okay so no xics_on_xive() test. I'll change that.

Thanks,
Nick
Cédric Le Goater March 23, 2021, 7:26 a.m. UTC | #8
On 3/22/21 7:22 PM, Nicholas Piggin wrote:
> Excerpts from Cédric Le Goater's message of March 23, 2021 2:01 am:
>> On 3/22/21 2:15 PM, Nicholas Piggin wrote:
>>> Excerpts from Alexey Kardashevskiy's message of March 22, 2021 5:30 pm:
>>>>
>>>>
>>>> On 06/03/2021 02:06, Nicholas Piggin wrote:
>>>>> In the interest of minimising the amount of code that is run in>>> "real-mode", don't handle hcalls in real mode in the P9 path.
>>>>>
>>>>> POWER8 and earlier are much more expensive to exit from HV real mode
>>>>> and switch to host mode, because on those processors HV interrupts get
>>>>> to the hypervisor with the MMU off, and the other threads in the core
>>>>> need to be pulled out of the guest, and SLBs all need to be saved,
>>>>> ERATs invalidated, and host SLB reloaded before the MMU is re-enabled
>>>>> in host mode. Hash guests also require a lot of hcalls to run. The
>>>>> XICS interrupt controller requires hcalls to run.
>>>>>
>>>>> By contrast, POWER9 has independent thread switching, and in radix mode
>>>>> the hypervisor is already in a host virtual memory mode when the HV
>>>>> interrupt is taken. Radix + xive guests don't need hcalls to handle
>>>>> interrupts or manage translations.
>>
>> Do we need to handle the host-is-a-P9-without-xive case ?
> 
> I'm not sure really. Is there an intention for OPAL to be able to 
> provide a fallback layer in the worst case?

yes. OPAL has a XICS-on-XIVE emulation for P9, implemented for bringup,
and it still boots, XICS guest can run. P10 doesn't have it though.

> Maybe microwatt grows HV capability before XIVE?

I don't know if we should develop the same XIVE logic for microwatt. 
It's awfully complex and we have the XICS interface which works already. 

>>>>> So it's much less important to handle hcalls in real mode in P9.
>>>>
>>>> So acde25726bc6034b (which added if(kvm_is_radix(vcpu->kvm))return 
>>>> H_TOO_HARD) can be reverted, pretty much?
>>>
>>> Yes. Although that calls attention to the fact I missed doing
>>> a P9 h_random handler in this patch. I'll fix that, then I think
>>> acde2572 could be reverted entirely.
>>>
>>> [...]
>>>
>>>>>   	} else {
>>>>>   		kvmppc_xive_push_vcpu(vcpu);
>>>>>   		trap = kvmhv_load_hv_regs_and_go(vcpu, time_limit, lpcr);
>>>>> -		kvmppc_xive_pull_vcpu(vcpu);
>>>>> +		/* H_CEDE has to be handled now, not later */
>>>>> +		/* XICS hcalls must be handled before xive is pulled */
>>>>> +		if (trap == BOOK3S_INTERRUPT_SYSCALL &&
>>>>> +		    !(vcpu->arch.shregs.msr & MSR_PR)) {
>>>>> +			unsigned long req = kvmppc_get_gpr(vcpu, 3);
>>>>>   
>>>>> +			if (req == H_CEDE) {
>>>>> +				kvmppc_cede(vcpu);
>>>>> +				kvmppc_xive_cede_vcpu(vcpu); /* may un-cede */
>>>>> +				kvmppc_set_gpr(vcpu, 3, 0);
>>>>> +				trap = 0;
>>>>> +			}
>>>>> +			if (req == H_EOI || req == H_CPPR ||
>>>>
>>>> else if (req == H_EOI ... ?
>>>
>>> Hummm, sure.
>>
>> you could integrate the H_CEDE in the switch statement below.
> 
> Below is in a different file just for the emulation calls.
> 
>>>
>>> [...]
>>>
>>>>> +void kvmppc_xive_cede_vcpu(struct kvm_vcpu *vcpu)
>>>>> +{
>>>>> +	void __iomem *esc_vaddr = (void __iomem *)vcpu->arch.xive_esc_vaddr;
>>>>> +
>>>>> +	if (!esc_vaddr)
>>>>> +		return;
>>>>> +
>>>>> +	/* we are using XIVE with single escalation */
>>>>> +
>>>>> +	if (vcpu->arch.xive_esc_on) {
>>>>> +		/*
>>>>> +		 * If we still have a pending escalation, abort the cede,
>>>>> +		 * and we must set PQ to 10 rather than 00 so that we don't
>>>>> +		 * potentially end up with two entries for the escalation
>>>>> +		 * interrupt in the XIVE interrupt queue.  In that case
>>>>> +		 * we also don't want to set xive_esc_on to 1 here in
>>>>> +		 * case we race with xive_esc_irq().
>>>>> +		 */
>>>>> +		vcpu->arch.ceded = 0;
>>>>> +		/*
>>>>> +		 * The escalation interrupts are special as we don't EOI them.
>>>>> +		 * There is no need to use the load-after-store ordering offset
>>>>> +		 * to set PQ to 10 as we won't use StoreEOI.
>>>>> +		 */
>>>>> +		__raw_readq(esc_vaddr + XIVE_ESB_SET_PQ_10);
>>>>> +	} else {
>>>>> +		vcpu->arch.xive_esc_on = true;
>>>>> +		mb();
>>>>> +		__raw_readq(esc_vaddr + XIVE_ESB_SET_PQ_00);
>>>>> +	}
>>>>> +	mb();
>>>>
>>>>
>>>> Uff. Thanks for cut-n-pasting the comments, helped a lot to match this c 
>>>> to that asm!
>>>
>>> Glad it helped.
>>>>> +}
>>
>> I had to do the PowerNV models in QEMU to start understanding that stuff ... 
>>
>>>>> +EXPORT_SYMBOL_GPL(kvmppc_xive_cede_vcpu);
>>>>> +
>>>>>   /*
>>>>>    * This is a simple trigger for a generic XIVE IRQ. This must
>>>>>    * only be called for interrupts that support a trigger page
>>>>> @@ -2106,6 +2140,32 @@ static int kvmppc_xive_create(struct kvm_device *dev, u32 type)
>>>>>   	return 0;
>>>>>   }
>>>>>   
>>>>> +int kvmppc_xive_xics_hcall(struct kvm_vcpu *vcpu, u32 req)
>>>>> +{
>>>>> +	struct kvmppc_vcore *vc = vcpu->arch.vcore;
>>>>
>>>>
>>>> Can a XIVE enabled guest issue these hcalls? Don't we want if 
>>>> (!kvmppc_xics_enabled(vcpu)) and
>>>>   if (xics_on_xive()) here, as kvmppc_rm_h_xirr() have? Some of these 
>>>> hcalls do write to XIVE registers but some seem to change 
>>>> kvmppc_xive_vcpu. Thanks,
>>>
>>> Yes I think you're right, good catch. I'm not completely sure about all 
>>> the xive and xics modes but a guest certainly can make any kind of hcall 
>>> it likes and we have to sanity check it.
>>
>> Yes. 
>>
>>> We want to take the hcall here (in replacement of the real mode hcalls)
>>> with the same condition. So it would be:
>>>
>>>         if (!kvmppc_xics_enabled(vcpu))
>>>                 return H_TOO_HARD;
>>
>> Yes.
>>
>> This test covers the case in which a vCPU does XICS hcalls without QEMU 
>> having connected the vCPU to a XICS ICP. The ICP is the KVM XICS device 
>> on P8 or XICS-on-XIVE on P9. It catches QEMU errors when the interrupt 
>> mode is negotiated, we don't want the OS to do XICS hcalls after having 
>> negotiated the XIVE interrupt mode. 
> 
> Okay.
> 
>> It's different for the XIVE hcalls (when running under XICS) because they 
>> are all handled in QEMU. 
> 
> XIVE guest hcalls running on XICS host?

What I meant is that in anycase, XICS or XIVE host, the XIVE hcalls are 
trapped in KVM but always handled in QEMU. So We don't need to check 
anything in KVM, as QEMU will take care of it. 

( It also make the implementation cleaner since the hcall frontend is in
one place. )

C.

>>>         if (!xics_on_xive())
>>> 		return H_TOO_HARD;
>>
>> I understand that this code is only called on P9 and with translation on.
> 
> Yes.
> 
>> On P9, we could have xics_on_xive() == 0 if XIVE is disabled at compile 
>> time or with "xive=off" at boot time. But guests should be supported. 
>> I don't see a reason to restrict the support even if these scenarios 
>> are rather unusual if not very rare.
>>
>> on P10, it's the same but since we don't have the XICS emulation layer 
>> in OPAL, the host will be pretty useless. We don't care.
>>
>> Since we are trying to handle hcalls, this is L0 and it can not be called 
>> for nested guests, which would be another case of xics_on_xive() == 0. 
>> We don't care either.
> 
> Okay so no xics_on_xive() test. I'll change that.
>
> 
> Thanks,
> Nick
>
diff mbox series

Patch

diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index 73b1ca5a6471..db6646c2ade2 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -607,6 +607,7 @@  extern void kvmppc_free_pimap(struct kvm *kvm);
 extern int kvmppc_xics_rm_complete(struct kvm_vcpu *vcpu, u32 hcall);
 extern void kvmppc_xics_free_icp(struct kvm_vcpu *vcpu);
 extern int kvmppc_xics_hcall(struct kvm_vcpu *vcpu, u32 cmd);
+extern int kvmppc_xive_xics_hcall(struct kvm_vcpu *vcpu, u32 req);
 extern u64 kvmppc_xics_get_icp(struct kvm_vcpu *vcpu);
 extern int kvmppc_xics_set_icp(struct kvm_vcpu *vcpu, u64 icpval);
 extern int kvmppc_xics_connect_vcpu(struct kvm_device *dev,
@@ -639,6 +640,8 @@  static inline int kvmppc_xics_enabled(struct kvm_vcpu *vcpu)
 static inline void kvmppc_xics_free_icp(struct kvm_vcpu *vcpu) { }
 static inline int kvmppc_xics_hcall(struct kvm_vcpu *vcpu, u32 cmd)
 	{ return 0; }
+static inline int kvmppc_xive_xics_hcall(struct kvm_vcpu *vcpu, u32 req)
+	{ return 0; }
 #endif
 
 #ifdef CONFIG_KVM_XIVE
@@ -673,6 +676,7 @@  extern int kvmppc_xive_set_irq(struct kvm *kvm, int irq_source_id, u32 irq,
 			       int level, bool line_status);
 extern void kvmppc_xive_push_vcpu(struct kvm_vcpu *vcpu);
 extern void kvmppc_xive_pull_vcpu(struct kvm_vcpu *vcpu);
+extern void kvmppc_xive_cede_vcpu(struct kvm_vcpu *vcpu);
 
 static inline int kvmppc_xive_enabled(struct kvm_vcpu *vcpu)
 {
@@ -714,6 +718,7 @@  static inline int kvmppc_xive_set_irq(struct kvm *kvm, int irq_source_id, u32 ir
 				      int level, bool line_status) { return -ENODEV; }
 static inline void kvmppc_xive_push_vcpu(struct kvm_vcpu *vcpu) { }
 static inline void kvmppc_xive_pull_vcpu(struct kvm_vcpu *vcpu) { }
+static inline void kvmppc_xive_cede_vcpu(struct kvm_vcpu *vcpu) { }
 
 static inline int kvmppc_xive_enabled(struct kvm_vcpu *vcpu)
 	{ return 0; }
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 497f216ad724..1f2ba8955c6a 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -1147,7 +1147,7 @@  int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
  * This has to be done early, not in kvmppc_pseries_do_hcall(), so
  * that the cede logic in kvmppc_run_single_vcpu() works properly.
  */
-static void kvmppc_nested_cede(struct kvm_vcpu *vcpu)
+static void kvmppc_cede(struct kvm_vcpu *vcpu)
 {
 	vcpu->arch.shregs.msr |= MSR_EE;
 	vcpu->arch.ceded = 1;
@@ -1403,9 +1403,15 @@  static int kvmppc_handle_exit_hv(struct kvm_vcpu *vcpu,
 		/* hcall - punt to userspace */
 		int i;
 
-		/* hypercall with MSR_PR has already been handled in rmode,
-		 * and never reaches here.
-		 */
+		if (unlikely(vcpu->arch.shregs.msr & MSR_PR)) {
+			/*
+			 * Guest userspace executed sc 1, reflect it back as a
+			 * privileged program check interrupt.
+			 */
+			kvmppc_core_queue_program(vcpu, SRR1_PROGPRIV);
+			r = RESUME_GUEST;
+			break;
+		}
 
 		run->papr_hcall.nr = kvmppc_get_gpr(vcpu, 3);
 		for (i = 0; i < 9; ++i)
@@ -3740,15 +3746,36 @@  static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit,
 		/* H_CEDE has to be handled now, not later */
 		if (trap == BOOK3S_INTERRUPT_SYSCALL && !vcpu->arch.nested &&
 		    kvmppc_get_gpr(vcpu, 3) == H_CEDE) {
-			kvmppc_nested_cede(vcpu);
+			kvmppc_cede(vcpu);
 			kvmppc_set_gpr(vcpu, 3, 0);
 			trap = 0;
 		}
 	} else {
 		kvmppc_xive_push_vcpu(vcpu);
 		trap = kvmhv_load_hv_regs_and_go(vcpu, time_limit, lpcr);
-		kvmppc_xive_pull_vcpu(vcpu);
+		/* H_CEDE has to be handled now, not later */
+		/* XICS hcalls must be handled before xive is pulled */
+		if (trap == BOOK3S_INTERRUPT_SYSCALL &&
+		    !(vcpu->arch.shregs.msr & MSR_PR)) {
+			unsigned long req = kvmppc_get_gpr(vcpu, 3);
 
+			if (req == H_CEDE) {
+				kvmppc_cede(vcpu);
+				kvmppc_xive_cede_vcpu(vcpu); /* may un-cede */
+				kvmppc_set_gpr(vcpu, 3, 0);
+				trap = 0;
+			}
+			if (req == H_EOI || req == H_CPPR ||
+			    req == H_IPI || req == H_IPOLL ||
+			    req == H_XIRR || req == H_XIRR_X) {
+				unsigned long ret;
+
+				ret = kvmppc_xive_xics_hcall(vcpu, req);
+				kvmppc_set_gpr(vcpu, 3, ret);
+				trap = 0;
+			}
+		}
+		kvmppc_xive_pull_vcpu(vcpu);
 	}
 
 	vcpu->arch.slb_max = 0;
@@ -4408,8 +4435,11 @@  static int kvmppc_vcpu_run_hv(struct kvm_vcpu *vcpu)
 		else
 			r = kvmppc_run_vcpu(vcpu);
 
-		if (run->exit_reason == KVM_EXIT_PAPR_HCALL &&
-		    !(vcpu->arch.shregs.msr & MSR_PR)) {
+		if (run->exit_reason == KVM_EXIT_PAPR_HCALL) {
+			if (WARN_ON_ONCE(vcpu->arch.shregs.msr & MSR_PR)) {
+				r = RESUME_GUEST;
+				continue;
+			}
 			trace_kvm_hcall_enter(vcpu);
 			r = kvmppc_pseries_do_hcall(vcpu);
 			trace_kvm_hcall_exit(vcpu, r);
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index c11597f815e4..2d0d14ed1d92 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -1397,9 +1397,14 @@  END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
 	mr	r4,r9
 	bge	fast_guest_return
 2:
+	/* If we came in through the P9 short path, no real mode hcalls */
+	lwz	r0, STACK_SLOT_SHORT_PATH(r1)
+	cmpwi	r0, 0
+	bne	no_try_real
 	/* See if this is an hcall we can handle in real mode */
 	cmpwi	r12,BOOK3S_INTERRUPT_SYSCALL
 	beq	hcall_try_real_mode
+no_try_real:
 
 	/* Hypervisor doorbell - exit only if host IPI flag set */
 	cmpwi	r12, BOOK3S_INTERRUPT_H_DOORBELL
diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c
index 52cdb9e2660a..1e4871bbcad4 100644
--- a/arch/powerpc/kvm/book3s_xive.c
+++ b/arch/powerpc/kvm/book3s_xive.c
@@ -158,6 +158,40 @@  void kvmppc_xive_pull_vcpu(struct kvm_vcpu *vcpu)
 }
 EXPORT_SYMBOL_GPL(kvmppc_xive_pull_vcpu);
 
+void kvmppc_xive_cede_vcpu(struct kvm_vcpu *vcpu)
+{
+	void __iomem *esc_vaddr = (void __iomem *)vcpu->arch.xive_esc_vaddr;
+
+	if (!esc_vaddr)
+		return;
+
+	/* we are using XIVE with single escalation */
+
+	if (vcpu->arch.xive_esc_on) {
+		/*
+		 * If we still have a pending escalation, abort the cede,
+		 * and we must set PQ to 10 rather than 00 so that we don't
+		 * potentially end up with two entries for the escalation
+		 * interrupt in the XIVE interrupt queue.  In that case
+		 * we also don't want to set xive_esc_on to 1 here in
+		 * case we race with xive_esc_irq().
+		 */
+		vcpu->arch.ceded = 0;
+		/*
+		 * The escalation interrupts are special as we don't EOI them.
+		 * There is no need to use the load-after-store ordering offset
+		 * to set PQ to 10 as we won't use StoreEOI.
+		 */
+		__raw_readq(esc_vaddr + XIVE_ESB_SET_PQ_10);
+	} else {
+		vcpu->arch.xive_esc_on = true;
+		mb();
+		__raw_readq(esc_vaddr + XIVE_ESB_SET_PQ_00);
+	}
+	mb();
+}
+EXPORT_SYMBOL_GPL(kvmppc_xive_cede_vcpu);
+
 /*
  * This is a simple trigger for a generic XIVE IRQ. This must
  * only be called for interrupts that support a trigger page
@@ -2106,6 +2140,32 @@  static int kvmppc_xive_create(struct kvm_device *dev, u32 type)
 	return 0;
 }
 
+int kvmppc_xive_xics_hcall(struct kvm_vcpu *vcpu, u32 req)
+{
+	struct kvmppc_vcore *vc = vcpu->arch.vcore;
+
+	switch (req) {
+	case H_XIRR:
+		return xive_vm_h_xirr(vcpu);
+	case H_CPPR:
+		return xive_vm_h_cppr(vcpu, kvmppc_get_gpr(vcpu, 4));
+	case H_EOI:
+		return xive_vm_h_eoi(vcpu, kvmppc_get_gpr(vcpu, 4));
+	case H_IPI:
+		return xive_vm_h_ipi(vcpu, kvmppc_get_gpr(vcpu, 4),
+					  kvmppc_get_gpr(vcpu, 5));
+	case H_IPOLL:
+		return xive_vm_h_ipoll(vcpu, kvmppc_get_gpr(vcpu, 4));
+	case H_XIRR_X:
+		xive_vm_h_xirr(vcpu);
+		kvmppc_set_gpr(vcpu, 5, get_tb() + vc->tb_offset);
+		return H_SUCCESS;
+	}
+
+	return H_UNSUPPORTED;
+}
+EXPORT_SYMBOL_GPL(kvmppc_xive_xics_hcall);
+
 int kvmppc_xive_debug_show_queues(struct seq_file *m, struct kvm_vcpu *vcpu)
 {
 	struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;