diff mbox series

KVM: PPC: Book3S HV: Do not expose HFSCR sanitisation to nested hypervisor

Message ID 20210305231055.2913892-1-farosas@linux.ibm.com
State New
Headers show
Series KVM: PPC: Book3S HV: Do not expose HFSCR sanitisation to nested hypervisor | expand

Commit Message

Fabiano Rosas March 5, 2021, 11:10 p.m. UTC
As one of the arguments of the H_ENTER_NESTED hypercall, the nested
hypervisor (L1) prepares a structure containing the values of various
hypervisor-privileged registers with which it wants the nested guest
(L2) to run. Since the nested HV runs in supervisor mode it needs the
host to write to these registers.

To stop a nested HV manipulating this mechanism and using a nested
guest as a proxy to access a facility that has been made unavailable
to it, we have a routine that sanitises the values of the HV registers
before copying them into the nested guest's vcpu struct.

However, when coming out of the guest the values are copied back into
L1 memory as they are, which means that any sanitisation we did
during guest entry will be exposed to L1 after H_ENTER_NESTED returns.

This is not a problem by itself, but in the case of the Hypervisor
Facility Status and Control Register (HFSCR), we use the intersection
between L2 hfscr bits and L1 hfscr bits. That means that L1 could use
this to indirectly read the (hv-privileged) value from its vcpu
struct.

This patch fixes this by making sure that L1 only gets back the bits
that are necessary for regular functioning.

Signed-off-by: Fabiano Rosas <farosas@linux.ibm.com>
---
 arch/powerpc/kvm/book3s_hv_nested.c | 22 +++++++++++++++++-----
 1 file changed, 17 insertions(+), 5 deletions(-)

Comments

Nicholas Piggin March 8, 2021, 8:18 a.m. UTC | #1
Excerpts from Fabiano Rosas's message of March 6, 2021 9:10 am:
> As one of the arguments of the H_ENTER_NESTED hypercall, the nested
> hypervisor (L1) prepares a structure containing the values of various
> hypervisor-privileged registers with which it wants the nested guest
> (L2) to run. Since the nested HV runs in supervisor mode it needs the
> host to write to these registers.
> 
> To stop a nested HV manipulating this mechanism and using a nested
> guest as a proxy to access a facility that has been made unavailable
> to it, we have a routine that sanitises the values of the HV registers
> before copying them into the nested guest's vcpu struct.
> 
> However, when coming out of the guest the values are copied as they
> were back into L1 memory, which means that any sanitisation we did
> during guest entry will be exposed to L1 after H_ENTER_NESTED returns.
> 
> This is not a problem by itself, but in the case of the Hypervisor
> Facility Status and Control Register (HFSCR), we use the intersection
> between L2 hfscr bits and L1 hfscr bits. That means that L1 could use
> this to indirectly read the (hv-privileged) value from its vcpu
> struct.
> 
> This patch fixes this by making sure that L1 only gets back the bits
> that are necessary for regular functioning.

The general idea of restricting exposure of HV privileged bits is sound, but
for the case of HFSCR a guest can probe the HFSCR anyway by testing which
facilities are available (and presumably an HV may need some way to know
what features are available for it to advertise to its own guests), so
is this necessary? Perhaps a comment would be sufficient.

Thanks,
Nick

> 
> Signed-off-by: Fabiano Rosas <farosas@linux.ibm.com>
> ---
>  arch/powerpc/kvm/book3s_hv_nested.c | 22 +++++++++++++++++-----
>  1 file changed, 17 insertions(+), 5 deletions(-)
> 
> diff --git a/arch/powerpc/kvm/book3s_hv_nested.c b/arch/powerpc/kvm/book3s_hv_nested.c
> index 0cd0e7aad588..860004f46e08 100644
> --- a/arch/powerpc/kvm/book3s_hv_nested.c
> +++ b/arch/powerpc/kvm/book3s_hv_nested.c
> @@ -98,12 +98,20 @@ static void byteswap_hv_regs(struct hv_guest_state *hr)
>  }
>  
>  static void save_hv_return_state(struct kvm_vcpu *vcpu, int trap,
> -				 struct hv_guest_state *hr)
> +				 struct hv_guest_state *hr, u64 saved_hfscr)
>  {
>  	struct kvmppc_vcore *vc = vcpu->arch.vcore;
>  
> +	/*
> +	 * During sanitise_hv_regs() we used HFSCR bits from L1 state
> +	 * to restrict what the L2 state is allowed to be. Since L1 is
> +	 * not allowed to read this SPR, do not include these
> +	 * modifications in the return state.
> +	 */
> +	hr->hfscr = ((~HFSCR_INTR_CAUSE & saved_hfscr) |
> +		     (HFSCR_INTR_CAUSE & vcpu->arch.hfscr));
> +
>  	hr->dpdes = vc->dpdes;
> -	hr->hfscr = vcpu->arch.hfscr;
>  	hr->purr = vcpu->arch.purr;
>  	hr->spurr = vcpu->arch.spurr;
>  	hr->ic = vcpu->arch.ic;
> @@ -132,12 +140,14 @@ static void save_hv_return_state(struct kvm_vcpu *vcpu, int trap,
>  	}
>  }
>  
> -static void sanitise_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr)
> +static void sanitise_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr,
> +			     u64 *saved_hfscr)
>  {
>  	/*
>  	 * Don't let L1 enable features for L2 which we've disabled for L1,
>  	 * but preserve the interrupt cause field.
>  	 */
> +	*saved_hfscr = hr->hfscr;
>  	hr->hfscr &= (HFSCR_INTR_CAUSE | vcpu->arch.hfscr);
>  
>  	/* Don't let data address watchpoint match in hypervisor state */
> @@ -272,6 +282,7 @@ long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu)
>  	u64 hdec_exp;
>  	s64 delta_purr, delta_spurr, delta_ic, delta_vtb;
>  	u64 mask;
> +	u64 hfscr;
>  	unsigned long lpcr;
>  
>  	if (vcpu->kvm->arch.l1_ptcr == 0)
> @@ -324,7 +335,8 @@ long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu)
>  	mask = LPCR_DPFD | LPCR_ILE | LPCR_TC | LPCR_AIL | LPCR_LD |
>  		LPCR_LPES | LPCR_MER;
>  	lpcr = (vc->lpcr & ~mask) | (l2_hv.lpcr & mask);
> -	sanitise_hv_regs(vcpu, &l2_hv);
> +
> +	sanitise_hv_regs(vcpu, &l2_hv, &hfscr);
>  	restore_hv_regs(vcpu, &l2_hv);
>  
>  	vcpu->arch.ret = RESUME_GUEST;
> @@ -345,7 +357,7 @@ long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu)
>  	delta_spurr = vcpu->arch.spurr - l2_hv.spurr;
>  	delta_ic = vcpu->arch.ic - l2_hv.ic;
>  	delta_vtb = vc->vtb - l2_hv.vtb;
> -	save_hv_return_state(vcpu, vcpu->arch.trap, &l2_hv);
> +	save_hv_return_state(vcpu, vcpu->arch.trap, &l2_hv, hfscr);
>  
>  	/* restore L1 state */
>  	vcpu->arch.nested = NULL;
> -- 
> 2.29.2
> 
>
Fabiano Rosas March 8, 2021, 3:04 p.m. UTC | #2
Nicholas Piggin <npiggin@gmail.com> writes:

> Excerpts from Fabiano Rosas's message of March 6, 2021 9:10 am:
>> As one of the arguments of the H_ENTER_NESTED hypercall, the nested
>> hypervisor (L1) prepares a structure containing the values of various
>> hypervisor-privileged registers with which it wants the nested guest
>> (L2) to run. Since the nested HV runs in supervisor mode it needs the
>> host to write to these registers.
>> 
>> To stop a nested HV manipulating this mechanism and using a nested
>> guest as a proxy to access a facility that has been made unavailable
>> to it, we have a routine that sanitises the values of the HV registers
>> before copying them into the nested guest's vcpu struct.
>> 
>> However, when coming out of the guest the values are copied as they
>> were back into L1 memory, which means that any sanitisation we did
>> during guest entry will be exposed to L1 after H_ENTER_NESTED returns.
>> 
>> This is not a problem by itself, but in the case of the Hypervisor
>> Facility Status and Control Register (HFSCR), we use the intersection
>> between L2 hfscr bits and L1 hfscr bits. That means that L1 could use
>> this to indirectly read the (hv-privileged) value from its vcpu
>> struct.
>> 
>> This patch fixes this by making sure that L1 only gets back the bits
>> that are necessary for regular functioning.
>
> The general idea of restricting exposure of HV privileged bits, but
> for the case of HFSCR a guest can probe the HFCR anyway by testing which 
> facilities are available (and presumably an HV may need some way to know
> what features are available for it to advertise to its own guests), so
> is this necessary? Perhaps a comment would be sufficient.
>

Well, I'd be happy to force them through the arduous path then =); and
there are features that are emulated by the HV which L1 would not be
able to probe.

I think we should implement a mechanism that stops all leaks now, rather
than having to ponder about this every time we touch an hv_reg in that
structure. I'm not too worried about HFSCR specifically.

Let me think about this some more and see if I can make it more generic,
I realise that sticking the saved_hfscr on the side is not the most
elegant approach.

> Thanks,
> Nick
>
>> 
>> Signed-off-by: Fabiano Rosas <farosas@linux.ibm.com>
>> ---
>>  arch/powerpc/kvm/book3s_hv_nested.c | 22 +++++++++++++++++-----
>>  1 file changed, 17 insertions(+), 5 deletions(-)
>> 
>> diff --git a/arch/powerpc/kvm/book3s_hv_nested.c b/arch/powerpc/kvm/book3s_hv_nested.c
>> index 0cd0e7aad588..860004f46e08 100644
>> --- a/arch/powerpc/kvm/book3s_hv_nested.c
>> +++ b/arch/powerpc/kvm/book3s_hv_nested.c
>> @@ -98,12 +98,20 @@ static void byteswap_hv_regs(struct hv_guest_state *hr)
>>  }
>>  
>>  static void save_hv_return_state(struct kvm_vcpu *vcpu, int trap,
>> -				 struct hv_guest_state *hr)
>> +				 struct hv_guest_state *hr, u64 saved_hfscr)
>>  {
>>  	struct kvmppc_vcore *vc = vcpu->arch.vcore;
>>  
>> +	/*
>> +	 * During sanitise_hv_regs() we used HFSCR bits from L1 state
>> +	 * to restrict what the L2 state is allowed to be. Since L1 is
>> +	 * not allowed to read this SPR, do not include these
>> +	 * modifications in the return state.
>> +	 */
>> +	hr->hfscr = ((~HFSCR_INTR_CAUSE & saved_hfscr) |
>> +		     (HFSCR_INTR_CAUSE & vcpu->arch.hfscr));
>> +
>>  	hr->dpdes = vc->dpdes;
>> -	hr->hfscr = vcpu->arch.hfscr;
>>  	hr->purr = vcpu->arch.purr;
>>  	hr->spurr = vcpu->arch.spurr;
>>  	hr->ic = vcpu->arch.ic;
>> @@ -132,12 +140,14 @@ static void save_hv_return_state(struct kvm_vcpu *vcpu, int trap,
>>  	}
>>  }
>>  
>> -static void sanitise_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr)
>> +static void sanitise_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr,
>> +			     u64 *saved_hfscr)
>>  {
>>  	/*
>>  	 * Don't let L1 enable features for L2 which we've disabled for L1,
>>  	 * but preserve the interrupt cause field.
>>  	 */
>> +	*saved_hfscr = hr->hfscr;
>>  	hr->hfscr &= (HFSCR_INTR_CAUSE | vcpu->arch.hfscr);
>>  
>>  	/* Don't let data address watchpoint match in hypervisor state */
>> @@ -272,6 +282,7 @@ long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu)
>>  	u64 hdec_exp;
>>  	s64 delta_purr, delta_spurr, delta_ic, delta_vtb;
>>  	u64 mask;
>> +	u64 hfscr;
>>  	unsigned long lpcr;
>>  
>>  	if (vcpu->kvm->arch.l1_ptcr == 0)
>> @@ -324,7 +335,8 @@ long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu)
>>  	mask = LPCR_DPFD | LPCR_ILE | LPCR_TC | LPCR_AIL | LPCR_LD |
>>  		LPCR_LPES | LPCR_MER;
>>  	lpcr = (vc->lpcr & ~mask) | (l2_hv.lpcr & mask);
>> -	sanitise_hv_regs(vcpu, &l2_hv);
>> +
>> +	sanitise_hv_regs(vcpu, &l2_hv, &hfscr);
>>  	restore_hv_regs(vcpu, &l2_hv);
>>  
>>  	vcpu->arch.ret = RESUME_GUEST;
>> @@ -345,7 +357,7 @@ long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu)
>>  	delta_spurr = vcpu->arch.spurr - l2_hv.spurr;
>>  	delta_ic = vcpu->arch.ic - l2_hv.ic;
>>  	delta_vtb = vc->vtb - l2_hv.vtb;
>> -	save_hv_return_state(vcpu, vcpu->arch.trap, &l2_hv);
>> +	save_hv_return_state(vcpu, vcpu->arch.trap, &l2_hv, hfscr);
>>  
>>  	/* restore L1 state */
>>  	vcpu->arch.nested = NULL;
>> -- 
>> 2.29.2
>> 
>>
Nicholas Piggin March 9, 2021, 1:07 a.m. UTC | #3
Excerpts from Fabiano Rosas's message of March 9, 2021 1:04 am:
> Nicholas Piggin <npiggin@gmail.com> writes:
> 
>> Excerpts from Fabiano Rosas's message of March 6, 2021 9:10 am:
>>> As one of the arguments of the H_ENTER_NESTED hypercall, the nested
>>> hypervisor (L1) prepares a structure containing the values of various
>>> hypervisor-privileged registers with which it wants the nested guest
>>> (L2) to run. Since the nested HV runs in supervisor mode it needs the
>>> host to write to these registers.
>>> 
>>> To stop a nested HV manipulating this mechanism and using a nested
>>> guest as a proxy to access a facility that has been made unavailable
>>> to it, we have a routine that sanitises the values of the HV registers
>>> before copying them into the nested guest's vcpu struct.
>>> 
>>> However, when coming out of the guest the values are copied as they
>>> were back into L1 memory, which means that any sanitisation we did
>>> during guest entry will be exposed to L1 after H_ENTER_NESTED returns.
>>> 
>>> This is not a problem by itself, but in the case of the Hypervisor
>>> Facility Status and Control Register (HFSCR), we use the intersection
>>> between L2 hfscr bits and L1 hfscr bits. That means that L1 could use
>>> this to indirectly read the (hv-privileged) value from its vcpu
>>> struct.
>>> 
>>> This patch fixes this by making sure that L1 only gets back the bits
>>> that are necessary for regular functioning.
>>
>> The general idea of restricting exposure of HV privileged bits, but
>> for the case of HFSCR a guest can probe the HFCR anyway by testing which 
>> facilities are available (and presumably an HV may need some way to know
>> what features are available for it to advertise to its own guests), so
>> is this necessary? Perhaps a comment would be sufficient.
>>
> 
> Well, I'd be happy to force them through the arduous path then =);

That's not a very satisfying justification.

> and
> there are features that are emulated by the HV which L1 would not be
> able to probe.

It should be able to trivially by measuring timing.

> 
> I think we should implement a mechanism that stops all leaks now, rather
> than having to ponder about this every time we touch an hv_reg in that
> structure.

This does not follow. There is already a "leak" via a timing or faulting 
side channel, so by definition we can't stop all leaks just by filtering 
the register value.

So what we need to do first I think is define what the threat is. What 
is the problem with the L1 knowing what the HFSCR is? If we can identify
a threat then we would appear to have much bigger problems. If not, then
this change cannot be justified on the basis of security AFAIKS.

> I'm not too worried about HFSCR specifically.

HFSCR is pretty special because its behaviour makes it quite trivial to
extrapolate. It also has the fault cause bits in it that aren't being
sanitised either so that would have to be thought about.

> Let me think about this some more and see if I can make it more generic,
> I realise that sticking the saved_hfscr on the side is not the most
> elegant approach.

I would say returning an error from the hcall if the caller tries to 
enable an HFSCR bit that it's not allowed to would be the easiest
approach. At least then a well meaning but optimistic guest won't try
to enable and advertise missing features to its nested guests and have
them crash strangely, rather it would just stop up front.

I don't think trying to obscure HFSCR in general will ever be possible 
though.

Thanks,
Nick
Michael Ellerman March 9, 2021, 1:52 a.m. UTC | #4
Fabiano Rosas <farosas@linux.ibm.com> writes:
> Nicholas Piggin <npiggin@gmail.com> writes:
>
>> Excerpts from Fabiano Rosas's message of March 6, 2021 9:10 am:
>>> As one of the arguments of the H_ENTER_NESTED hypercall, the nested
>>> hypervisor (L1) prepares a structure containing the values of various
>>> hypervisor-privileged registers with which it wants the nested guest
>>> (L2) to run. Since the nested HV runs in supervisor mode it needs the
>>> host to write to these registers.
>>> 
>>> To stop a nested HV manipulating this mechanism and using a nested
>>> guest as a proxy to access a facility that has been made unavailable
>>> to it, we have a routine that sanitises the values of the HV registers
>>> before copying them into the nested guest's vcpu struct.
>>> 
>>> However, when coming out of the guest the values are copied as they
>>> were back into L1 memory, which means that any sanitisation we did
>>> during guest entry will be exposed to L1 after H_ENTER_NESTED returns.
>>> 
>>> This is not a problem by itself, but in the case of the Hypervisor
>>> Facility Status and Control Register (HFSCR), we use the intersection
>>> between L2 hfscr bits and L1 hfscr bits. That means that L1 could use
>>> this to indirectly read the (hv-privileged) value from its vcpu
>>> struct.
>>> 
>>> This patch fixes this by making sure that L1 only gets back the bits
>>> that are necessary for regular functioning.
>>
>> The general idea of restricting exposure of HV privileged bits, but
>> for the case of HFSCR a guest can probe the HFCR anyway by testing which 
>> facilities are available (and presumably an HV may need some way to know
>> what features are available for it to advertise to its own guests), so
>> is this necessary? Perhaps a comment would be sufficient.
>
> Well, I'd be happy to force them through the arduous path then =); and
> there are features that are emulated by the HV which L1 would not be
> able to probe.
>
> I think we should implement a mechanism that stops all leaks now, rather
> than having to ponder about this every time we touch an hv_reg in that
> structure. I'm not too worried about HFSCR specifically.
>
> Let me think about this some more and see if I can make it more generic,
> I realise that sticking the saved_hfscr on the side is not the most
> elegant approach.

Yeah that would be good.

I don't really like the patch as it is, ie. having to pass *saved_hfscr
and so on.

But in general I agree that we should avoid leaking details across
boundaries, even if we don't think they are particularly sensitive.

cheers
Paul Mackerras March 10, 2021, 9:23 a.m. UTC | #5
On Mon, Mar 08, 2021 at 06:18:47PM +1000, Nicholas Piggin wrote:
> Excerpts from Fabiano Rosas's message of March 6, 2021 9:10 am:
> > As one of the arguments of the H_ENTER_NESTED hypercall, the nested
> > hypervisor (L1) prepares a structure containing the values of various
> > hypervisor-privileged registers with which it wants the nested guest
> > (L2) to run. Since the nested HV runs in supervisor mode it needs the
> > host to write to these registers.
> > 
> > To stop a nested HV manipulating this mechanism and using a nested
> > guest as a proxy to access a facility that has been made unavailable
> > to it, we have a routine that sanitises the values of the HV registers
> > before copying them into the nested guest's vcpu struct.
> > 
> > However, when coming out of the guest the values are copied as they
> > were back into L1 memory, which means that any sanitisation we did
> > during guest entry will be exposed to L1 after H_ENTER_NESTED returns.
> > 
> > This is not a problem by itself, but in the case of the Hypervisor
> > Facility Status and Control Register (HFSCR), we use the intersection
> > between L2 hfscr bits and L1 hfscr bits. That means that L1 could use
> > this to indirectly read the (hv-privileged) value from its vcpu
> > struct.
> > 
> > This patch fixes this by making sure that L1 only gets back the bits
> > that are necessary for regular functioning.
> 
> The general idea of restricting exposure of HV privileged bits, but
> for the case of HFSCR a guest can probe the HFCR anyway by testing which 
> facilities are available (and presumably an HV may need some way to know
> what features are available for it to advertise to its own guests), so
> is this necessary? Perhaps a comment would be sufficient.

I would see it a bit differently.  From L1's point of view, L0 is the
hardware.  The situation we have now is akin to writing a value to the
real HFSCR, then reading HFSCR and finding that some of the facility
enable bits have magically got set to zero.  That's not the way real
hardware works, so L0 shouldn't behave that way either, or at least
not without some strong justification.

Paul.
Nicholas Piggin March 12, 2021, 1:13 a.m. UTC | #6
Excerpts from Paul Mackerras's message of March 10, 2021 7:23 pm:
> On Mon, Mar 08, 2021 at 06:18:47PM +1000, Nicholas Piggin wrote:
>> Excerpts from Fabiano Rosas's message of March 6, 2021 9:10 am:
>> > As one of the arguments of the H_ENTER_NESTED hypercall, the nested
>> > hypervisor (L1) prepares a structure containing the values of various
>> > hypervisor-privileged registers with which it wants the nested guest
>> > (L2) to run. Since the nested HV runs in supervisor mode it needs the
>> > host to write to these registers.
>> > 
>> > To stop a nested HV manipulating this mechanism and using a nested
>> > guest as a proxy to access a facility that has been made unavailable
>> > to it, we have a routine that sanitises the values of the HV registers
>> > before copying them into the nested guest's vcpu struct.
>> > 
>> > However, when coming out of the guest the values are copied as they
>> > were back into L1 memory, which means that any sanitisation we did
>> > during guest entry will be exposed to L1 after H_ENTER_NESTED returns.
>> > 
>> > This is not a problem by itself, but in the case of the Hypervisor
>> > Facility Status and Control Register (HFSCR), we use the intersection
>> > between L2 hfscr bits and L1 hfscr bits. That means that L1 could use
>> > this to indirectly read the (hv-privileged) value from its vcpu
>> > struct.
>> > 
>> > This patch fixes this by making sure that L1 only gets back the bits
>> > that are necessary for regular functioning.
>> 
>> The general idea of restricting exposure of HV privileged bits, but
>> for the case of HFSCR a guest can probe the HFCR anyway by testing which 
>> facilities are available (and presumably an HV may need some way to know
>> what features are available for it to advertise to its own guests), so
>> is this necessary? Perhaps a comment would be sufficient.
> 
> I would see it a bit differently.  From L1's point of view, L0 is the
> hardware.  The situation we have now is akin to writing a value to the
> real HFSCR, then reading HFSCR and finding that some of the facility
> enable bits have magically got set to zero.  That's not the way real
> hardware works, so L0 shouldn't behave that way either, or at least
> not without some strong justification.

But the features disallowed by the L0 have to be viewed as unimplemented 
by the hardware so the bits would be reserved, so according to 
architecture they actually are allowed to return zero.

That's not my concern though, and I do agree it is a bit odd. I don't 
have a problem with leaving the FC field value unchanged.

I think at least printing a warning for unimplemented bits would be good 
though.

Thanks,
Nick
diff mbox series

Patch

diff --git a/arch/powerpc/kvm/book3s_hv_nested.c b/arch/powerpc/kvm/book3s_hv_nested.c
index 0cd0e7aad588..860004f46e08 100644
--- a/arch/powerpc/kvm/book3s_hv_nested.c
+++ b/arch/powerpc/kvm/book3s_hv_nested.c
@@ -98,12 +98,20 @@  static void byteswap_hv_regs(struct hv_guest_state *hr)
 }
 
 static void save_hv_return_state(struct kvm_vcpu *vcpu, int trap,
-				 struct hv_guest_state *hr)
+				 struct hv_guest_state *hr, u64 saved_hfscr)
 {
 	struct kvmppc_vcore *vc = vcpu->arch.vcore;
 
+	/*
+	 * During sanitise_hv_regs() we used HFSCR bits from L1 state
+	 * to restrict what the L2 state is allowed to be. Since L1 is
+	 * not allowed to read this SPR, do not include these
+	 * modifications in the return state.
+	 */
+	hr->hfscr = ((~HFSCR_INTR_CAUSE & saved_hfscr) |
+		     (HFSCR_INTR_CAUSE & vcpu->arch.hfscr));
+
 	hr->dpdes = vc->dpdes;
-	hr->hfscr = vcpu->arch.hfscr;
 	hr->purr = vcpu->arch.purr;
 	hr->spurr = vcpu->arch.spurr;
 	hr->ic = vcpu->arch.ic;
@@ -132,12 +140,14 @@  static void save_hv_return_state(struct kvm_vcpu *vcpu, int trap,
 	}
 }
 
-static void sanitise_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr)
+static void sanitise_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr,
+			     u64 *saved_hfscr)
 {
 	/*
 	 * Don't let L1 enable features for L2 which we've disabled for L1,
 	 * but preserve the interrupt cause field.
 	 */
+	*saved_hfscr = hr->hfscr;
 	hr->hfscr &= (HFSCR_INTR_CAUSE | vcpu->arch.hfscr);
 
 	/* Don't let data address watchpoint match in hypervisor state */
@@ -272,6 +282,7 @@  long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu)
 	u64 hdec_exp;
 	s64 delta_purr, delta_spurr, delta_ic, delta_vtb;
 	u64 mask;
+	u64 hfscr;
 	unsigned long lpcr;
 
 	if (vcpu->kvm->arch.l1_ptcr == 0)
@@ -324,7 +335,8 @@  long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu)
 	mask = LPCR_DPFD | LPCR_ILE | LPCR_TC | LPCR_AIL | LPCR_LD |
 		LPCR_LPES | LPCR_MER;
 	lpcr = (vc->lpcr & ~mask) | (l2_hv.lpcr & mask);
-	sanitise_hv_regs(vcpu, &l2_hv);
+
+	sanitise_hv_regs(vcpu, &l2_hv, &hfscr);
 	restore_hv_regs(vcpu, &l2_hv);
 
 	vcpu->arch.ret = RESUME_GUEST;
@@ -345,7 +357,7 @@  long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu)
 	delta_spurr = vcpu->arch.spurr - l2_hv.spurr;
 	delta_ic = vcpu->arch.ic - l2_hv.ic;
 	delta_vtb = vc->vtb - l2_hv.vtb;
-	save_hv_return_state(vcpu, vcpu->arch.trap, &l2_hv);
+	save_hv_return_state(vcpu, vcpu->arch.trap, &l2_hv, hfscr);
 
 	/* restore L1 state */
 	vcpu->arch.nested = NULL;