[RFC,20/32] KVM: PPC: Book3S HV: Nested guest entry via hypercall

Message ID 1537524123-9578-21-git-send-email-paulus@ozlabs.org
State Superseded
Series KVM: PPC: Book3S HV: Nested HV virtualization

Commit Message

Paul Mackerras Sept. 21, 2018, 10:01 a.m. UTC
This adds a new hypercall, H_ENTER_NESTED, which is used by a nested
hypervisor to enter one of its nested guests.  The hypercall supplies
register values in two structs.  Those values are copied by the level 0
(L0) hypervisor (the one which is running in hypervisor mode) into the
vcpu struct of the L1 guest, and then the guest is run until an
interrupt or error occurs which needs to be reported to L1 via the
hypercall return value.

Currently this assumes that the L0 and L1 hypervisors are the same
endianness, and the structs passed as arguments are in native
endianness.
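
For illustration, the call from the L1 side (mirroring the code this
patch adds to kvmhv_p9_guest_entry()) looks roughly like the sketch
below; l2_lpid, l2_vcpu_id and time_limit are illustrative names rather
than anything defined by this patch:

	struct hv_guest_state hvregs;
	struct pt_regs l2_regs;

	/* hvregs carries the HV-privileged and late-restored OS registers;
	 * l2_regs carries the GPRs/NIP/MSR of the L2 vcpu to be run. */
	hvregs.version = HV_GUEST_STATE_VERSION;
	hvregs.lpid = l2_lpid;			/* L1's lpid for the L2 guest */
	hvregs.vcpu_token = l2_vcpu_id;		/* identifies the L2 vcpu */
	hvregs.hdec_expiry = time_limit;	/* timebase value to exit by */
	/* ... remaining fields filled in from the L2 vcpu state ... */

	trap = plpar_hcall_norets(H_ENTER_NESTED, __pa(&hvregs),
				  __pa(&l2_regs));
	/* trap is the interrupt vector to reflect to L1, or a hcall
	 * status such as H_PARAMETER or H_INTERRUPT */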

Nested hypervisors do not support indep_threads_mode=N, so this adds
code to print a warning message if the administrator has set
indep_threads_mode=N, and treat it as Y.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 arch/powerpc/include/asm/hvcall.h       |  36 +++++
 arch/powerpc/include/asm/kvm_book3s.h   |   7 +
 arch/powerpc/include/asm/kvm_host.h     |   5 +
 arch/powerpc/kernel/asm-offsets.c       |   1 +
 arch/powerpc/kvm/book3s_hv.c            | 194 +++++++++++++++++++++++----
 arch/powerpc/kvm/book3s_hv_nested.c     | 230 ++++++++++++++++++++++++++++++++
 arch/powerpc/kvm/book3s_hv_rmhandlers.S |   4 +
 7 files changed, 452 insertions(+), 25 deletions(-)

Comments

David Gibson Sept. 26, 2018, 5:41 a.m. UTC | #1
On Fri, Sep 21, 2018 at 08:01:51PM +1000, Paul Mackerras wrote:
> This adds a new hypercall, H_ENTER_NESTED, which is used by a nested
> hypervisor to enter one of its nested guests.  The hypercall supplies
> register values in two structs.  Those values are copied by the level 0
> (L0) hypervisor (the one which is running in hypervisor mode) into the
> vcpu struct of the L1 guest, and then the guest is run until an
> interrupt or error occurs which needs to be reported to L1 via the
> hypercall return value.
> 
> Currently this assumes that the L0 and L1 hypervisors are the same
> endianness, and the structs passed as arguments are in native
> endianness.

That's nasty.  It'd be good to at least detect this and bail.

> 
> Nested hypervisors do not support indep_threads_mode=N, so this adds
> code to print a warning message if the administrator has set
> indep_threads_mode=N, and treat it as Y.
> 
> Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
> ---
>  arch/powerpc/include/asm/hvcall.h       |  36 +++++
>  arch/powerpc/include/asm/kvm_book3s.h   |   7 +
>  arch/powerpc/include/asm/kvm_host.h     |   5 +
>  arch/powerpc/kernel/asm-offsets.c       |   1 +
>  arch/powerpc/kvm/book3s_hv.c            | 194 +++++++++++++++++++++++----
>  arch/powerpc/kvm/book3s_hv_nested.c     | 230 ++++++++++++++++++++++++++++++++
>  arch/powerpc/kvm/book3s_hv_rmhandlers.S |   4 +
>  7 files changed, 452 insertions(+), 25 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/hvcall.h b/arch/powerpc/include/asm/hvcall.h
> index 9afaa82..dfcf43d 100644
> --- a/arch/powerpc/include/asm/hvcall.h
> +++ b/arch/powerpc/include/asm/hvcall.h
> @@ -487,6 +487,42 @@ struct h_cpu_char_result {
>  	u64 behaviour;
>  };
>  
> +/* Register state for entering a nested guest with H_ENTER_NESTED */
> +struct hv_guest_state {
> +	u64 version;		/* version of this structure layout */
> +	u32 lpid;
> +	u32 vcpu_token;
> +	/* These registers are hypervisor privileged (at least for writing) */
> +	u64 lpcr;
> +	u64 pcr;
> +	u64 amor;
> +	u64 dpdes;
> +	u64 hfscr;
> +	s64 tb_offset;
> +	u64 dawr0;
> +	u64 dawrx0;
> +	u64 ciabr;
> +	u64 hdec_expiry;
> +	u64 purr;
> +	u64 spurr;
> +	u64 ic;
> +	u64 vtb;
> +	u64 hdar;
> +	u64 hdsisr;
> +	u64 heir;
> +	u64 asdr;
> +	/* These are OS privileged but need to be set late in guest entry */
> +	u64 srr0;
> +	u64 srr1;
> +	u64 sprg[4];
> +	u64 pidr;
> +	u64 cfar;
> +	u64 ppr;
> +};
> +
> +/* Latest version of hv_guest_state structure */
> +#define HV_GUEST_STATE_VERSION	1
> +
>  #endif /* __ASSEMBLY__ */
>  #endif /* __KERNEL__ */
>  #endif /* _ASM_POWERPC_HVCALL_H */
> diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
> index 7719ca5..125bc5b 100644
> --- a/arch/powerpc/include/asm/kvm_book3s.h
> +++ b/arch/powerpc/include/asm/kvm_book3s.h
> @@ -280,6 +280,13 @@ void kvmhv_vm_nested_init(struct kvm *kvm);
>  long kvmhv_set_partition_table(struct kvm_vcpu *vcpu);
>  void kvmhv_set_ptbl_entry(unsigned int lpid, u64 dw0, u64 dw1);
>  void kvmhv_release_all_nested(struct kvm *kvm);
> +long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu);
> +int kvmhv_run_single_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu,
> +			  u64 time_limit);
> +void kvmhv_save_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr);
> +void kvmhv_restore_hv_return_state(struct kvm_vcpu *vcpu,
> +				   struct hv_guest_state *hr);
> +long int kvmhv_nested_page_fault(struct kvm_vcpu *vcpu);
>  
>  void kvmppc_giveup_fac(struct kvm_vcpu *vcpu, ulong fac);
>  
> diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
> index c35d4f2..ceb9f20 100644
> --- a/arch/powerpc/include/asm/kvm_host.h
> +++ b/arch/powerpc/include/asm/kvm_host.h
> @@ -95,6 +95,7 @@ struct dtl_entry;
>  
>  struct kvmppc_vcpu_book3s;
>  struct kvmppc_book3s_shadow_vcpu;
> +struct kvm_nested_guest;
>  
>  struct kvm_vm_stat {
>  	ulong remote_tlb_flush;
> @@ -786,6 +787,10 @@ struct kvm_vcpu_arch {
>  	u32 emul_inst;
>  
>  	u32 online;
> +
> +	/* For support of nested guests */
> +	struct kvm_nested_guest *nested;
> +	u32 nested_vcpu_id;
>  #endif
>  
>  #ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
> diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
> index 7c3738d..d0abcbb 100644
> --- a/arch/powerpc/kernel/asm-offsets.c
> +++ b/arch/powerpc/kernel/asm-offsets.c
> @@ -503,6 +503,7 @@ int main(void)
>  	OFFSET(VCPU_VPA, kvm_vcpu, arch.vpa.pinned_addr);
>  	OFFSET(VCPU_VPA_DIRTY, kvm_vcpu, arch.vpa.dirty);
>  	OFFSET(VCPU_HEIR, kvm_vcpu, arch.emul_inst);
> +	OFFSET(VCPU_NESTED, kvm_vcpu, arch.nested);
>  	OFFSET(VCPU_CPU, kvm_vcpu, cpu);
>  	OFFSET(VCPU_THREAD_CPU, kvm_vcpu, arch.thread_cpu);
>  #endif
> diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
> index 82c9a1e..da380cb 100644
> --- a/arch/powerpc/kvm/book3s_hv.c
> +++ b/arch/powerpc/kvm/book3s_hv.c
> @@ -942,6 +942,13 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
>  		break;
>  	case H_ENTER_NESTED:
>  		ret = H_FUNCTION;
> +		if (!vcpu->kvm->arch.nested_enable)
> +			break;

Wouldn't H_AUTHORITY make more sense than H_FUNCTION for the
no-nested-allowed case?

> +		ret = kvmhv_enter_nested_guest(vcpu);
> +		if (ret == H_INTERRUPT) {
> +			kvmppc_set_gpr(vcpu, 3, 0);
> +			return RESUME_HOST;
> +		}
>  		break;
>  
>  	default:
> @@ -1269,6 +1276,104 @@ static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu,
>  	return r;
>  }
>  
> +static int kvmppc_handle_nested_exit(struct kvm_vcpu *vcpu)

Might be nice to rename to make it clear if this is the L0 or L1
handling for a nested exit.

> +{
> +	int r;
> +	int srcu_idx;
> +
> +	vcpu->stat.sum_exits++;
> +
> +	/*
> +	 * This can happen if an interrupt occurs in the last stages
> +	 * of guest entry or the first stages of guest exit (i.e. after
> +	 * setting paca->kvm_hstate.in_guest to KVM_GUEST_MODE_GUEST_HV
> +	 * and before setting it to KVM_GUEST_MODE_HOST_HV).
> +	 * That can happen due to a bug, or due to a machine check
> +	 * occurring at just the wrong time.
> +	 */
> +	if (vcpu->arch.shregs.msr & MSR_HV) {
> +		printk(KERN_EMERG "KVM trap in HV mode while nested!\n");
> +		printk(KERN_EMERG "trap=0x%x | pc=0x%lx | msr=0x%llx\n",
> +			vcpu->arch.trap, kvmppc_get_pc(vcpu),
> +			vcpu->arch.shregs.msr);
> +		kvmppc_dump_regs(vcpu);
> +		BUG();
> +	}
> +	switch (vcpu->arch.trap) {
> +	/* We're good on these - the host merely wanted to get our attention */
> +	case BOOK3S_INTERRUPT_HV_DECREMENTER:
> +		vcpu->stat.dec_exits++;
> +		r = RESUME_GUEST;
> +		break;
> +	case BOOK3S_INTERRUPT_EXTERNAL:
> +		vcpu->stat.ext_intr_exits++;
> +		r = RESUME_HOST;
> +		break;
> +	case BOOK3S_INTERRUPT_H_DOORBELL:
> +	case BOOK3S_INTERRUPT_H_VIRT:
> +		vcpu->stat.ext_intr_exits++;
> +		r = RESUME_GUEST;
> +		break;
> +	/* SR/HMI/PMI are HV interrupts that host has handled. Resume guest.*/
> +	case BOOK3S_INTERRUPT_HMI:
> +	case BOOK3S_INTERRUPT_PERFMON:
> +	case BOOK3S_INTERRUPT_SYSTEM_RESET:
> +		r = RESUME_GUEST;
> +		break;
> +	case BOOK3S_INTERRUPT_MACHINE_CHECK:
> +		/* Pass the machine check to the L1 guest */
> +		r = RESUME_HOST;
> +		/* Print the MCE event to host console. */
> +		machine_check_print_event_info(&vcpu->arch.mce_evt, false);
> +		break;
> +	/*
> +	 * We get these next two if the guest accesses a page which it thinks
> +	 * it has mapped but which is not actually present, either because
> +	 * it is for an emulated I/O device or because the corresponding
> +	 * host page has been paged out.
> +	 */
> +	case BOOK3S_INTERRUPT_H_DATA_STORAGE:
> +		srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
> +		r = kvmhv_nested_page_fault(vcpu);
> +		srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx);
> +		break;
> +	case BOOK3S_INTERRUPT_H_INST_STORAGE:
> +		vcpu->arch.fault_dar = kvmppc_get_pc(vcpu);
> +		vcpu->arch.fault_dsisr = kvmppc_get_msr(vcpu) &
> +					 DSISR_SRR1_MATCH_64S;
> +		if (vcpu->arch.shregs.msr & HSRR1_HISI_WRITE)
> +			vcpu->arch.fault_dsisr |= DSISR_ISSTORE;
> +		srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
> +		r = kvmhv_nested_page_fault(vcpu);
> +		srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx);
> +		break;
> +
> +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
> +	case BOOK3S_INTERRUPT_HV_SOFTPATCH:
> +		/*
> +		 * This occurs for various TM-related instructions that
> +		 * we need to emulate on POWER9 DD2.2.  We have already
> +		 * handled the cases where the guest was in real-suspend
> +		 * mode and was transitioning to transactional state.
> +		 */
> +		r = kvmhv_p9_tm_emulation(vcpu);
> +		break;
> +#endif
> +
> +	case BOOK3S_INTERRUPT_HV_RM_HARD:
> +		vcpu->arch.trap = 0;
> +		r = RESUME_GUEST;
> +		if (!xive_enabled())
> +			kvmppc_xics_rm_complete(vcpu, 0);
> +		break;
> +	default:
> +		r = RESUME_HOST;
> +		break;
> +	}
> +
> +	return r;
> +}
> +
>  static int kvm_arch_vcpu_ioctl_get_sregs_hv(struct kvm_vcpu *vcpu,
>  					    struct kvm_sregs *sregs)
>  {
> @@ -3095,7 +3200,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
>  /*
>   * Load up hypervisor-mode registers on P9.
>   */
> -static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu *vcpu)
> +static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu *vcpu, u64 time_limit)
>  {
>  	struct kvmppc_vcore *vc = vcpu->arch.vcore;
>  	s64 hdec;
> @@ -3108,7 +3213,7 @@ static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu *vcpu)
>  	unsigned long host_psscr = mfspr(SPRN_PSSCR);
>  	unsigned long host_pidr = mfspr(SPRN_PID);
>  
> -	hdec = local_paca->kvm_hstate.dec_expires - mftb();
> +	hdec = time_limit - mftb();

Looks like this change might better belong in the earlier patch
creating kvmhv_load_hv_regs_and_go().

>  	if (hdec < 0)
>  		return BOOK3S_INTERRUPT_HV_DECREMENTER;
>  	mtspr(SPRN_HDEC, hdec);
> @@ -3222,7 +3327,7 @@ static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu *vcpu)
>   * Virtual-mode guest entry for POWER9 and later when the host and
>   * guest are both using the radix MMU.  The LPIDR has already been set.
>   */
> -int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu)
> +int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit)
>  {
>  	struct kvmppc_vcore *vc = vcpu->arch.vcore;
>  	unsigned long host_dscr = mfspr(SPRN_DSCR);
> @@ -3237,6 +3342,8 @@ int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu)
>  	if (dec < 512)
>  		return BOOK3S_INTERRUPT_HV_DECREMENTER;
>  	local_paca->kvm_hstate.dec_expires = dec + tb;
> +	if (local_paca->kvm_hstate.dec_expires < time_limit)
> +		time_limit = local_paca->kvm_hstate.dec_expires;
>  
>  	vcpu->arch.ceded = 0;
>  
> @@ -3290,7 +3397,28 @@ int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu)
>  		vcpu->arch.doorbell_request = 0;
>  	}
>  
> -	trap = kvmhv_load_hv_regs_and_go(vcpu);
> +	if (!cpu_has_feature(CPU_FTR_HVMODE)) {
> +		/* call our hypervisor to load up HV regs and go */
> +		struct hv_guest_state hvregs;
> +
> +		kvmhv_save_hv_regs(vcpu, &hvregs);
> +		vcpu->arch.regs.msr = vcpu->arch.shregs.msr;
> +		hvregs.version = HV_GUEST_STATE_VERSION;
> +		hvregs.lpid = vcpu->kvm->arch.lpid;
> +		hvregs.vcpu_token = vcpu->vcpu_id;
> +		hvregs.hdec_expiry = time_limit;
> +		if (test_bit(BOOK3S_IRQPRIO_EXTERNAL,
> +			     &vcpu->arch.pending_exceptions))
> +			hvregs.lpcr |= LPCR_MER;
> +		trap = plpar_hcall_norets(H_ENTER_NESTED, __pa(&hvregs),
> +					  __pa(&vcpu->arch.regs));
> +		kvmhv_restore_hv_return_state(vcpu, &hvregs);
> +		vcpu->arch.shregs.msr = vcpu->arch.regs.msr;
> +		vcpu->arch.shregs.dar = mfspr(SPRN_DAR);
> +		vcpu->arch.shregs.dsisr = mfspr(SPRN_DSISR);
> +	} else {
> +		trap = kvmhv_load_hv_regs_and_go(vcpu, time_limit);
> +	}
>  
>  	vcpu->arch.slb_max = 0;
>  	dec = mfspr(SPRN_DEC);
> @@ -3530,6 +3658,10 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
>  	trace_kvmppc_vcore_wakeup(do_sleep, block_ns);
>  }
>  
> +/*
> + * This is assumed not to be able to fail for a radix guest in
> + * kvmhv_run_single_vcpu().
> + */

Might be clearer to split that change out with a rationale as to why
it's correct.

>  static int kvmhv_setup_mmu(struct kvm_vcpu *vcpu)
>  {
>  	int r = 0;
> @@ -3679,12 +3811,14 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
>  	return vcpu->arch.ret;
>  }
>  
> -static int kvmppc_run_single_vcpu(struct kvm_run *kvm_run,
> -				  struct kvm_vcpu *vcpu)
> +int kvmhv_run_single_vcpu(struct kvm_run *kvm_run,
> +			  struct kvm_vcpu *vcpu, u64 time_limit)
>  {
>  	int trap, r, pcpu, pcpu0;
>  	int srcu_idx;
>  	struct kvmppc_vcore *vc;
> +	struct kvm_nested_guest *nested = vcpu->arch.nested;
> +	unsigned long lpid;
>  
>  	trace_kvmppc_run_vcpu_enter(vcpu);
>  
> @@ -3705,16 +3839,8 @@ static int kvmppc_run_single_vcpu(struct kvm_run *kvm_run,
>  	vc->runner = vcpu;
>  
>  	/* See if the MMU is ready to go */
> -	if (!vcpu->kvm->arch.mmu_ready) {
> -		r = kvmhv_setup_mmu(vcpu);
> -		if (r) {
> -			kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
> -			kvm_run->fail_entry.
> -				hardware_entry_failure_reason = 0;
> -			vcpu->arch.ret = r;
> -			goto out;
> -		}
> -	}
> +	if (!vcpu->kvm->arch.mmu_ready)
> +		kvmhv_setup_mmu(vcpu);
>  
>  	if (need_resched())
>  		cond_resched();
> @@ -3736,7 +3862,12 @@ static int kvmppc_run_single_vcpu(struct kvm_run *kvm_run,
>  	if (lazy_irq_pending() || need_resched() || !vcpu->kvm->arch.mmu_ready)
>  		goto out;
>  
> -	kvmppc_core_prepare_to_enter(vcpu);
> +	if (!nested) {
> +		kvmppc_core_prepare_to_enter(vcpu);
> +	} else if (vcpu->arch.pending_exceptions) {
> +		vcpu->arch.ret = RESUME_HOST;
> +		goto out;
> +	}
>  
>  	kvmppc_clear_host_core(pcpu);
>  
> @@ -3750,7 +3881,10 @@ static int kvmppc_run_single_vcpu(struct kvm_run *kvm_run,
>  	vc->vcore_state = VCORE_RUNNING;
>  	trace_kvmppc_run_core(vc, 0);
>  
> -	mtspr(SPRN_LPID, vc->kvm->arch.lpid);
> +	lpid = vc->kvm->arch.lpid;
> +	if (nested)
> +		lpid = nested->shadow_lpid;
> +	mtspr(SPRN_LPID, lpid);
>  	isync();
>  
>  	/* See comment above in kvmppc_run_core() about this */
> @@ -3759,7 +3893,7 @@ static int kvmppc_run_single_vcpu(struct kvm_run *kvm_run,
>  		pcpu0 &= ~0x3UL;
>  
>  	if (cpumask_test_cpu(pcpu0, &vc->kvm->arch.need_tlb_flush)) {
> -		radix__local_flush_tlb_lpid_guest(vc->kvm->arch.lpid);
> +		radix__local_flush_tlb_lpid_guest(lpid);
>  		/* Clear the bit after the TLB flush */
>  		cpumask_clear_cpu(pcpu0, &vc->kvm->arch.need_tlb_flush);
>  	}
> @@ -3771,7 +3905,7 @@ static int kvmppc_run_single_vcpu(struct kvm_run *kvm_run,
>  
>  	this_cpu_disable_ftrace();
>  
> -	trap = kvmhv_p9_guest_entry(vcpu);
> +	trap = kvmhv_p9_guest_entry(vcpu, time_limit);
>  	vcpu->arch.trap = trap;
>  
>  	this_cpu_enable_ftrace();
> @@ -3796,8 +3930,12 @@ static int kvmppc_run_single_vcpu(struct kvm_run *kvm_run,
>  
>  	trace_kvm_guest_exit(vcpu);
>  	r = RESUME_GUEST;
> -	if (trap)
> -		r = kvmppc_handle_exit_hv(kvm_run, vcpu, current);
> +	if (trap) {
> +		if (!nested)
> +			r = kvmppc_handle_exit_hv(kvm_run, vcpu, current);
> +		else
> +			r = kvmppc_handle_nested_exit(vcpu);
> +	}
>  	vcpu->arch.ret = r;
>  
>  	if (is_kvmppc_resume_guest(r) && vcpu->arch.ceded &&
> @@ -3912,7 +4050,7 @@ static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu)
>  
>  	do {
>  		if (kvm->arch.threads_indep && kvm_is_radix(kvm))
> -			r = kvmppc_run_single_vcpu(run, vcpu);
> +			r = kvmhv_run_single_vcpu(run, vcpu, ~(u64)0);
>  		else
>  			r = kvmppc_run_vcpu(run, vcpu);
>  
> @@ -4462,8 +4600,14 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm)
>  	 * On POWER9, we only need to do this if the "indep_threads_mode"
>  	 * module parameter has been set to N.
>  	 */
> -	if (cpu_has_feature(CPU_FTR_ARCH_300))
> -		kvm->arch.threads_indep = indep_threads_mode;
> +	if (cpu_has_feature(CPU_FTR_ARCH_300)) {
> +		if (!indep_threads_mode && !cpu_has_feature(CPU_FTR_HVMODE)) {
> +			pr_warn("KVM: Ignoring indep_threads_mode=N in nested hypervisor\n");
> +			kvm->arch.threads_indep = true;

Wouldn't it be cleaner to enforce this at the point indep_threads_mode
is set, rather than altering the value here?

> +		} else {
> +			kvm->arch.threads_indep = indep_threads_mode;
> +		}
> +	}
>  	if (!kvm->arch.threads_indep)
>  		kvm_hv_vm_activated();
>  
> diff --git a/arch/powerpc/kvm/book3s_hv_nested.c b/arch/powerpc/kvm/book3s_hv_nested.c
> index 5fe3ea4..a7f3da9 100644
> --- a/arch/powerpc/kvm/book3s_hv_nested.c
> +++ b/arch/powerpc/kvm/book3s_hv_nested.c
> @@ -20,6 +20,231 @@ static struct patb_entry *pseries_partition_tb;
>  
>  static void kvmhv_update_ptbl_cache(struct kvm_nested_guest *gp);
>  
> +void kvmhv_save_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr)
> +{
> +	struct kvmppc_vcore *vc = vcpu->arch.vcore;
> +
> +	hr->lpcr = vc->lpcr;
> +	hr->pcr = vc->pcr;
> +	hr->dpdes = vc->dpdes;
> +	hr->hfscr = vcpu->arch.hfscr;
> +	hr->tb_offset = vc->tb_offset;
> +	hr->dawr0 = vcpu->arch.dawr;
> +	hr->dawrx0 = vcpu->arch.dawrx;
> +	hr->ciabr = vcpu->arch.ciabr;
> +	hr->purr = vcpu->arch.purr;
> +	hr->spurr = vcpu->arch.spurr;
> +	hr->ic = vcpu->arch.ic;
> +	hr->vtb = vc->vtb;
> +	hr->srr0 = vcpu->arch.shregs.srr0;
> +	hr->srr1 = vcpu->arch.shregs.srr1;
> +	hr->sprg[0] = vcpu->arch.shregs.sprg0;
> +	hr->sprg[1] = vcpu->arch.shregs.sprg1;
> +	hr->sprg[2] = vcpu->arch.shregs.sprg2;
> +	hr->sprg[3] = vcpu->arch.shregs.sprg3;
> +	hr->pidr = vcpu->arch.pid;
> +	hr->cfar = vcpu->arch.cfar;
> +	hr->ppr = vcpu->arch.ppr;
> +}
> +
> +static void save_hv_return_state(struct kvm_vcpu *vcpu, int trap,
> +				 struct hv_guest_state *hr)
> +{
> +	struct kvmppc_vcore *vc = vcpu->arch.vcore;
> +
> +	hr->dpdes = vc->dpdes;
> +	hr->hfscr = vcpu->arch.hfscr;
> +	hr->purr = vcpu->arch.purr;
> +	hr->spurr = vcpu->arch.spurr;
> +	hr->ic = vcpu->arch.ic;
> +	hr->vtb = vc->vtb;
> +	hr->srr0 = vcpu->arch.shregs.srr0;
> +	hr->srr1 = vcpu->arch.shregs.srr1;
> +	hr->sprg[0] = vcpu->arch.shregs.sprg0;
> +	hr->sprg[1] = vcpu->arch.shregs.sprg1;
> +	hr->sprg[2] = vcpu->arch.shregs.sprg2;
> +	hr->sprg[3] = vcpu->arch.shregs.sprg3;
> +	hr->pidr = vcpu->arch.pid;
> +	hr->cfar = vcpu->arch.cfar;
> +	hr->ppr = vcpu->arch.ppr;
> +	switch (trap) {
> +	case BOOK3S_INTERRUPT_H_DATA_STORAGE:
> +		hr->hdar = vcpu->arch.fault_dar;
> +		hr->hdsisr = vcpu->arch.fault_dsisr;
> +		hr->asdr = vcpu->arch.fault_gpa;
> +		break;
> +	case BOOK3S_INTERRUPT_H_INST_STORAGE:
> +		hr->asdr = vcpu->arch.fault_gpa;
> +		break;
> +	case BOOK3S_INTERRUPT_H_EMUL_ASSIST:
> +		hr->heir = vcpu->arch.emul_inst;
> +		break;
> +	}
> +}
> +
> +static void restore_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr)
> +{
> +	struct kvmppc_vcore *vc = vcpu->arch.vcore;
> +
> +	vc->pcr = hr->pcr;
> +	vc->dpdes = hr->dpdes;
> +	vcpu->arch.hfscr = hr->hfscr;
> +	vcpu->arch.dawr = hr->dawr0;
> +	vcpu->arch.dawrx = hr->dawrx0;
> +	vcpu->arch.ciabr = hr->ciabr;
> +	vcpu->arch.purr = hr->purr;
> +	vcpu->arch.spurr = hr->spurr;
> +	vcpu->arch.ic = hr->ic;
> +	vc->vtb = hr->vtb;
> +	vcpu->arch.shregs.srr0 = hr->srr0;
> +	vcpu->arch.shregs.srr1 = hr->srr1;
> +	vcpu->arch.shregs.sprg0 = hr->sprg[0];
> +	vcpu->arch.shregs.sprg1 = hr->sprg[1];
> +	vcpu->arch.shregs.sprg2 = hr->sprg[2];
> +	vcpu->arch.shregs.sprg3 = hr->sprg[3];
> +	vcpu->arch.pid = hr->pidr;
> +	vcpu->arch.cfar = hr->cfar;
> +	vcpu->arch.ppr = hr->ppr;
> +}
> +
> +void kvmhv_restore_hv_return_state(struct kvm_vcpu *vcpu,
> +				   struct hv_guest_state *hr)
> +{
> +	struct kvmppc_vcore *vc = vcpu->arch.vcore;
> +
> +	vc->dpdes = hr->dpdes;
> +	vcpu->arch.hfscr = hr->hfscr;
> +	vcpu->arch.purr = hr->purr;
> +	vcpu->arch.spurr = hr->spurr;
> +	vcpu->arch.ic = hr->ic;
> +	vc->vtb = hr->vtb;
> +	vcpu->arch.fault_dar = hr->hdar;
> +	vcpu->arch.fault_dsisr = hr->hdsisr;
> +	vcpu->arch.fault_gpa = hr->asdr;
> +	vcpu->arch.emul_inst = hr->heir;
> +	vcpu->arch.shregs.srr0 = hr->srr0;
> +	vcpu->arch.shregs.srr1 = hr->srr1;
> +	vcpu->arch.shregs.sprg0 = hr->sprg[0];
> +	vcpu->arch.shregs.sprg1 = hr->sprg[1];
> +	vcpu->arch.shregs.sprg2 = hr->sprg[2];
> +	vcpu->arch.shregs.sprg3 = hr->sprg[3];
> +	vcpu->arch.pid = hr->pidr;
> +	vcpu->arch.cfar = hr->cfar;
> +	vcpu->arch.ppr = hr->ppr;
> +}
> +
> +long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu)
> +{
> +	long int err, r;
> +	struct kvm_nested_guest *l2;
> +	struct pt_regs l2_regs, saved_l1_regs;
> +	struct hv_guest_state l2_hv, saved_l1_hv;
> +	struct kvmppc_vcore *vc = vcpu->arch.vcore;
> +	u64 hv_ptr, regs_ptr;
> +	u64 hdec_exp;
> +	s64 delta_purr, delta_spurr, delta_ic, delta_vtb;
> +	u64 mask;
> +
> +	if (!kvm_is_radix(vcpu->kvm))
> +		return H_FUNCTION;

Would it be safer / cleaner to have this instead check that the L1 has
completed an H_SET_PARTITION_TABLE?  Which wouldn't be allowed for an
HPT guest.

> +
> +	/* copy parameters in */
> +	hv_ptr = kvmppc_get_gpr(vcpu, 4);
> +	err = kvm_vcpu_read_guest(vcpu, hv_ptr, &l2_hv,
> +				  sizeof(struct hv_guest_state));
> +	if (err)
> +		return H_PARAMETER;
> +	if (l2_hv.version != HV_GUEST_STATE_VERSION)
> +		return H_P2;
> +
> +	regs_ptr = kvmppc_get_gpr(vcpu, 5);
> +	err = kvm_vcpu_read_guest(vcpu, regs_ptr, &l2_regs,
> +				  sizeof(struct pt_regs));
> +	if (err)
> +		return H_PARAMETER;
> +
> +	/* translate lpid */
> +	l2 = kvmhv_get_nested(vcpu->kvm, l2_hv.lpid, true);
> +	if (!l2)
> +		return H_PARAMETER;
> +	if (!l2->l1_gr_to_hr) {
> +		mutex_lock(&l2->tlb_lock);
> +		kvmhv_update_ptbl_cache(l2);
> +		mutex_unlock(&l2->tlb_lock);
> +	}
> +
> +	/* save l1 values of things */
> +	vcpu->arch.regs.msr = vcpu->arch.shregs.msr;
> +	saved_l1_regs = vcpu->arch.regs;
> +	kvmhv_save_hv_regs(vcpu, &saved_l1_hv);
> +
> +	/* convert TB values/offsets to host (L0) values */
> +	hdec_exp = l2_hv.hdec_expiry - vc->tb_offset;
> +	vc->tb_offset += l2_hv.tb_offset;
> +
> +	/* set L1 state to L2 state */
> +	vcpu->arch.nested = l2;
> +	vcpu->arch.nested_vcpu_id = l2_hv.vcpu_token;
> +	vcpu->arch.regs = l2_regs;
> +	vcpu->arch.shregs.msr = vcpu->arch.regs.msr;
> +	mask = LPCR_DPFD | LPCR_ILE | LPCR_TC | LPCR_AIL | LPCR_LD |
> +		LPCR_LPES | LPCR_MER;
> +	vc->lpcr = (vc->lpcr & ~mask) | (l2_hv.lpcr & mask);
> +	restore_hv_regs(vcpu, &l2_hv);
> +
> +	vcpu->arch.ret = RESUME_GUEST;
> +	vcpu->arch.trap = 0;
> +	do {
> +		if (mftb() >= hdec_exp) {
> +			vcpu->arch.trap = BOOK3S_INTERRUPT_HV_DECREMENTER;
> +			r = RESUME_HOST;
> +			break;
> +		}
> +		r = kvmhv_run_single_vcpu(vcpu->arch.kvm_run, vcpu, hdec_exp);
> +	} while (is_kvmppc_resume_guest(r));
> +
> +	/* save L2 state for return */
> +	l2_regs = vcpu->arch.regs;
> +	l2_regs.msr = vcpu->arch.shregs.msr;
> +	delta_purr = vcpu->arch.purr - l2_hv.purr;
> +	delta_spurr = vcpu->arch.spurr - l2_hv.spurr;
> +	delta_ic = vcpu->arch.ic - l2_hv.ic;
> +	delta_vtb = vc->vtb - l2_hv.vtb;
> +	save_hv_return_state(vcpu, vcpu->arch.trap, &l2_hv);
> +
> +	/* restore L1 state */
> +	vcpu->arch.nested = NULL;
> +	vcpu->arch.regs = saved_l1_regs;
> +	vcpu->arch.shregs.msr = saved_l1_regs.msr & ~MSR_TS_MASK;
> +	/* set L1 MSR TS field according to L2 transaction state */
> +	if (l2_regs.msr & MSR_TS_MASK)
> +		vcpu->arch.shregs.msr |= MSR_TS_S;
> +	vc->lpcr = saved_l1_hv.lpcr;
> +	vc->tb_offset = saved_l1_hv.tb_offset;
> +	restore_hv_regs(vcpu, &saved_l1_hv);
> +	vcpu->arch.purr += delta_purr;
> +	vcpu->arch.spurr += delta_spurr;
> +	vcpu->arch.ic += delta_ic;
> +	vc->vtb += delta_vtb;
> +
> +	kvmhv_put_nested(l2);
> +
> +	/* copy l2_hv_state and regs back to guest */
> +	err = kvm_vcpu_write_guest(vcpu, hv_ptr, &l2_hv,
> +				   sizeof(struct hv_guest_state));
> +	if (err)
> +		return H_AUTHORITY;
> +	err = kvm_vcpu_write_guest(vcpu, regs_ptr, &l2_regs,
> +				   sizeof(struct pt_regs));
> +	if (err)
> +		return H_AUTHORITY;
> +
> +	if (r == -EINTR)
> +		return H_INTERRUPT;
> +
> +	return vcpu->arch.trap;
> +}
> +
>  /* Only called when we're not in hypervisor mode */
>  bool kvmhv_nested_init(void)
>  {
> @@ -284,3 +509,8 @@ struct kvm_nested_guest *kvmhv_find_nested(struct kvm *kvm, int lpid)
>  		return NULL;
>  	return kvm->arch.nested_guests[lpid];
>  }
> +
> +long kvmhv_nested_page_fault(struct kvm_vcpu *vcpu)
> +{
> +	return RESUME_HOST;
> +}
> diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> index 83efc13..04fcaa4 100644
> --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> @@ -2199,6 +2199,10 @@ hcall_try_real_mode:
>  	andi.	r0,r11,MSR_PR
>  	/* sc 1 from userspace - reflect to guest syscall */
>  	bne	sc_1_fast_return
> +	/* sc 1 from nested guest - give it to L1 to handle */
> +	ld	r0, VCPU_NESTED(r9)
> +	cmpdi	r0, 0
> +	bne	guest_exit_cont
>  	clrrdi	r3,r3,2
>  	cmpldi	r3,hcall_real_table_end - hcall_real_table
>  	bge	guest_exit_cont
Paul Mackerras Sept. 26, 2018, 10:59 a.m. UTC | #2
On Wed, Sep 26, 2018 at 03:41:07PM +1000, David Gibson wrote:
> On Fri, Sep 21, 2018 at 08:01:51PM +1000, Paul Mackerras wrote:
> > This adds a new hypercall, H_ENTER_NESTED, which is used by a nested
> > hypervisor to enter one of its nested guests.  The hypercall supplies
> > register values in two structs.  Those values are copied by the level 0
> > (L0) hypervisor (the one which is running in hypervisor mode) into the
> > vcpu struct of the L1 guest, and then the guest is run until an
> > interrupt or error occurs which needs to be reported to L1 via the
> > hypercall return value.
> > 
> > Currently this assumes that the L0 and L1 hypervisors are the same
> > endianness, and the structs passed as arguments are in native
> > endianness.
> 
> That's nasty.  It'd be good to at least detect this and bail.

OK, so I need to reword the commit message, because we do detect this
via the version number check, and reject the hcall in the cross-endian
case.
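
Concretely: with HV_GUEST_STATE_VERSION == 1, a version field written by
an L1 of the opposite endianness reads back as 0x0100000000000000, so the
existing check in kvmhv_enter_nested_guest(),

	if (l2_hv.version != HV_GUEST_STATE_VERSION)
		return H_P2;

already fails the hcall in the cross-endian case.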

> >  	case H_ENTER_NESTED:
> >  		ret = H_FUNCTION;
> > +		if (!vcpu->kvm->arch.nested_enable)
> > +			break;
> 
> Wouldn't H_AUTHORITY make more sense than H_FUNCTION for the
> no-nested-allowed case?

So the guest can distinguish "I don't know about nested virt" from "I
know about it but you're not allowed to use it", you mean?  I'm not
sure it makes much difference in practice.

> > +		ret = kvmhv_enter_nested_guest(vcpu);
> > +		if (ret == H_INTERRUPT) {
> > +			kvmppc_set_gpr(vcpu, 3, 0);
> > +			return RESUME_HOST;
> > +		}
> >  		break;
> >  
> >  	default:
> > @@ -1269,6 +1276,104 @@ static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu,
> >  	return r;
> >  }
> >  
> > +static int kvmppc_handle_nested_exit(struct kvm_vcpu *vcpu)
> 
> Might be nice to rename to make it clear if this is the L0 or L1
> handling for a nested exit.

Well, it's Ln handling the exit of Ln+2.  Got a suggestion for a
better name?

> > @@ -4462,8 +4600,14 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm)
> >  	 * On POWER9, we only need to do this if the "indep_threads_mode"
> >  	 * module parameter has been set to N.
> >  	 */
> > -	if (cpu_has_feature(CPU_FTR_ARCH_300))
> > -		kvm->arch.threads_indep = indep_threads_mode;
> > +	if (cpu_has_feature(CPU_FTR_ARCH_300)) {
> > +		if (!indep_threads_mode && !cpu_has_feature(CPU_FTR_HVMODE)) {
> > +			pr_warn("KVM: Ignoring indep_threads_mode=N in nested hypervisor\n");
> > +			kvm->arch.threads_indep = true;
> 
> Wouldn't it be cleaner to enforce this at the point indep_threads_mode
> is set, rather than altering the value here?

I'm not sure if we get notified when the administrator changes the
value via sysfs.  In any case, doing it this way causes a warning
every time you start a VM, which is more likely to get noticed than a
single warning in the middle of all the boot-time messages.
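
For completeness, enforcing it at the point the parameter is set would
mean converting indep_threads_mode to module_param_cb() with a set hook
roughly along these lines (an untested sketch, not part of this patch;
the function name is made up):

	static int set_indep_threads_mode(const char *val,
					  const struct kernel_param *kp)
	{
		bool v;
		int err = kstrtobool(val, &v);

		if (err)
			return err;
		if (!v && !cpu_has_feature(CPU_FTR_HVMODE)) {
			pr_warn("KVM: indep_threads_mode=N not supported in a nested hypervisor\n");
			v = true;
		}
		/* store the possibly-overridden value */
		*(bool *)kp->arg = v;
		return 0;
	}

but, as above, a warning at each VM start is more likely to be noticed.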

> > +long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu)
> > +{
> > +	long int err, r;
> > +	struct kvm_nested_guest *l2;
> > +	struct pt_regs l2_regs, saved_l1_regs;
> > +	struct hv_guest_state l2_hv, saved_l1_hv;
> > +	struct kvmppc_vcore *vc = vcpu->arch.vcore;
> > +	u64 hv_ptr, regs_ptr;
> > +	u64 hdec_exp;
> > +	s64 delta_purr, delta_spurr, delta_ic, delta_vtb;
> > +	u64 mask;
> > +
> > +	if (!kvm_is_radix(vcpu->kvm))
> > +		return H_FUNCTION;
> 
> Would it be safer / cleaner to have this instead check that the L1 has
> completed an H_SET_PARTITION_TABLE?  Which wouldn't be allowed for an
> HPT guest.

There is no valid bit in the PTCR value, and the PTCR could have been
set via the one-reg interface without there having been a
H_SET_PARTITION_TABLE in the lifetime of this KVM instance (for
example when the L1 guest has been migrated in from another machine),
so I don't see a foolproof way to do what you suggest.

Paul.
David Gibson Sept. 27, 2018, 12:57 a.m. UTC | #3
On Wed, Sep 26, 2018 at 08:59:22PM +1000, Paul Mackerras wrote:
> On Wed, Sep 26, 2018 at 03:41:07PM +1000, David Gibson wrote:
> > On Fri, Sep 21, 2018 at 08:01:51PM +1000, Paul Mackerras wrote:
> > > This adds a new hypercall, H_ENTER_NESTED, which is used by a nested
> > > hypervisor to enter one of its nested guests.  The hypercall supplies
> > > register values in two structs.  Those values are copied by the level 0
> > > (L0) hypervisor (the one which is running in hypervisor mode) into the
> > > vcpu struct of the L1 guest, and then the guest is run until an
> > > interrupt or error occurs which needs to be reported to L1 via the
> > > hypercall return value.
> > > 
> > > Currently this assumes that the L0 and L1 hypervisors are the same
> > > endianness, and the structs passed as arguments are in native
> > > endianness.
> > 
> > That's nasty.  It'd be good to at least detect this and bail.
> 
> OK, so I need to reword the commit message, because we do detect this
> via the version number check, and reject the hcall in the cross-endian
> case.

Ah, ok.

> > >  	case H_ENTER_NESTED:
> > >  		ret = H_FUNCTION;
> > > +		if (!vcpu->kvm->arch.nested_enable)
> > > +			break;
> > 
> > Wouldn't H_AUTHORITY make more sense than H_FUNCTION for the
> > no-nested-allowed case?
> 
> So the guest can distinguish "I don't know about nested virt" from "I
> know about it but you're not allowed to use it", you mean?  I'm not
> sure it makes much difference in practice.

I guess not.  I think I'm just used to avoiding H_FUNCTION in qemu for
anything except "we know nothing about this hypercall whatsoever".

> > > +		ret = kvmhv_enter_nested_guest(vcpu);
> > > +		if (ret == H_INTERRUPT) {
> > > +			kvmppc_set_gpr(vcpu, 3, 0);
> > > +			return RESUME_HOST;
> > > +		}
> > >  		break;
> > >  
> > >  	default:
> > > @@ -1269,6 +1276,104 @@ static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu,
> > >  	return r;
> > >  }
> > >  
> > > +static int kvmppc_handle_nested_exit(struct kvm_vcpu *vcpu)
> > 
> > Might be nice to rename to make it clear if this is the L0 or L1
> > handling for a nested exit.
> 
> Well, it's Ln handling the exit of Ln+2.  Got a suggestion for a
> better name?

Yeah, as I said elsewhere, it feels like we need better terms for
absolute vs. relative hypervisor levels.  What about
kvmppc_handle_guestguest_exit() or kvmppc_handle_metaguest_exit()?
That would work best if "guestguest" or "metaguest" were used to refer
to Ln+2 throughout, of course.

> > > @@ -4462,8 +4600,14 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm)
> > >  	 * On POWER9, we only need to do this if the "indep_threads_mode"
> > >  	 * module parameter has been set to N.
> > >  	 */
> > > -	if (cpu_has_feature(CPU_FTR_ARCH_300))
> > > -		kvm->arch.threads_indep = indep_threads_mode;
> > > +	if (cpu_has_feature(CPU_FTR_ARCH_300)) {
> > > +		if (!indep_threads_mode && !cpu_has_feature(CPU_FTR_HVMODE)) {
> > > +			pr_warn("KVM: Ignoring indep_threads_mode=N in nested hypervisor\n");
> > > +			kvm->arch.threads_indep = true;
> > 
> > Wouldn't it be cleaner to enforce this at the point indep_threads_mode
> > is set, rather than altering the value here?
> 
> I'm not sure if we get notified when the administrator changes the
> value via sysfs.  In any case, doing it this way causes a warning
> every time you start a VM, which is more likely to get noticed than a
> single warning in the middle of all the boot-time messages.

Hm, good point.

> > > +long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu)
> > > +{
> > > +	long int err, r;
> > > +	struct kvm_nested_guest *l2;
> > > +	struct pt_regs l2_regs, saved_l1_regs;
> > > +	struct hv_guest_state l2_hv, saved_l1_hv;
> > > +	struct kvmppc_vcore *vc = vcpu->arch.vcore;
> > > +	u64 hv_ptr, regs_ptr;
> > > +	u64 hdec_exp;
> > > +	s64 delta_purr, delta_spurr, delta_ic, delta_vtb;
> > > +	u64 mask;
> > > +
> > > +	if (!kvm_is_radix(vcpu->kvm))
> > > +		return H_FUNCTION;
> > 
> > Would it be safer / cleaner to have this instead check that the L1 has
> > completed an H_SET_PARTITION_TABLE?  Which wouldn't be allowed for an
> > HPT guest.
> 
> There is no valid bit in the PTCR value, and the PTCR could have been
> set via the one-reg interface without there having been a
> H_SET_PARTITION_TABLE in the lifetime of this KVM instance (for
> example when the L1 guest has been migrated in from another machine),
> so I don't see a foolproof way to do what you suggest.

Ah, right.

Would checking if the vPTCR is != its initial value (0?) do the job?
(AFAICT PTCR==0 would be unusable, even if technically allowed).
Paul Mackerras Sept. 27, 2018, 1:45 a.m. UTC | #4
On Thu, Sep 27, 2018 at 10:57:17AM +1000, David Gibson wrote:
> On Wed, Sep 26, 2018 at 08:59:22PM +1000, Paul Mackerras wrote:
> > On Wed, Sep 26, 2018 at 03:41:07PM +1000, David Gibson wrote:
> > > On Fri, Sep 21, 2018 at 08:01:51PM +1000, Paul Mackerras wrote:
> > > > +	if (!kvm_is_radix(vcpu->kvm))
> > > > +		return H_FUNCTION;
> > > 
> > > Would it be safer / cleaner to have this instead check that the L1 has
> > > completed an H_SET_PARTITION_TABLE?  Which wouldn't be allowed for an
> > > HPT guest.
> > 
> > There is no valid bit in the PTCR value, and the PTCR could have been
> > set via the one-reg interface without there having been a
> > H_SET_PARTITION_TABLE in the lifetime of this KVM instance (for
> > example when the L1 guest has been migrated in from another machine),
> > so I don't see a foolproof way to do what you suggest.
> 
> Ah, right.
> 
> Would checking if the vPTCR is != its initial value (0?) do the job?
> (AFAICT PTCR==0 would be unusable, even if technically allowed).

If I did check PTCR, what error code would you suggest?  I'm sure
you'd tell me not to use H_FUNCTION. :)

Paul.
David Gibson Sept. 27, 2018, 3:28 a.m. UTC | #5
On Thu, Sep 27, 2018 at 11:45:21AM +1000, Paul Mackerras wrote:
> On Thu, Sep 27, 2018 at 10:57:17AM +1000, David Gibson wrote:
> > On Wed, Sep 26, 2018 at 08:59:22PM +1000, Paul Mackerras wrote:
> > > On Wed, Sep 26, 2018 at 03:41:07PM +1000, David Gibson wrote:
> > > > On Fri, Sep 21, 2018 at 08:01:51PM +1000, Paul Mackerras wrote:
> > > > > +	if (!kvm_is_radix(vcpu->kvm))
> > > > > +		return H_FUNCTION;
> > > > 
> > > > Would it be safer / cleaner to have this instead check that the L1 has
> > > > completed an H_SET_PARTITION_TABLE?  Which wouldn't be allowed for an
> > > > HPT guest.
> > > 
> > > There is no valid bit in the PTCR value, and the PTCR could have been
> > > set via the one-reg interface without there having been a
> > > H_SET_PARTITION_TABLE in the lifetime of this KVM instance (for
> > > example when the L1 guest has been migrated in from another machine),
> > > so I don't see a foolproof way to do what you suggest.
> > 
> > Ah, right.
> > 
> > Would checking if the vPTCR is != its initial value (0?) do the job?
> > (AFAICT PTCR==0 would be unusable, even if technically allowed).
> 
> If I did check PTCR, what error code would you suggest?  I'm sure
> you'd tell me not to use H_FUNCTION. :)

Probably H_NOT_AVAILABLE?  That's what I use if you attempt an HPT
resize from an RPT guest.
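
So the check being discussed would end up looking something like the
following (assuming the earlier patches cache the L1's PTCR value in a
field such as kvm->arch.l1_ptcr -- the field name here is a guess):

	if (!vcpu->kvm->arch.l1_ptcr)
		return H_NOT_AVAILABLE;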