diff mbox series

[RFC] powerpc/pseries/svm: capture instruction faulting on MMIO access, in sprg0 register

Message ID 1594888333-9370-1-git-send-email-linuxram@us.ibm.com (mailing list archive)
State Rejected
Headers show
Series [RFC] powerpc/pseries/svm: capture instruction faulting on MMIO access, in sprg0 register | expand

Commit Message

Ram Pai July 16, 2020, 8:32 a.m. UTC
An instruction accessing a mmio address, generates a HDSI fault.  This fault is
appropriately handled by the Hypervisor.  However in the case of secureVMs, the
fault is delivered to the ultravisor.

Unfortunately the Ultravisor has no correct-way to fetch the faulting
instruction. The PEF architecture does not allow Ultravisor to enable MMU
translation. Walking the two level page table to read the instruction can race
with other vcpus modifying the SVM's process scoped page table.

This problem can be correctly solved with some help from the kernel.

Capture the faulting instruction in SPRG0 register, before executing the
faulting instruction. This enables the ultravisor to easily procure the
faulting instruction and emulate it.

Signed-off-by: Ram Pai <linuxram@us.ibm.com>
---
 arch/powerpc/include/asm/io.h | 85 ++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 75 insertions(+), 10 deletions(-)

Comments

Laurent Dufour July 20, 2020, 9:39 a.m. UTC | #1
Le 16/07/2020 à 10:32, Ram Pai a écrit :
> An instruction accessing a mmio address, generates a HDSI fault.  This fault is
> appropriately handled by the Hypervisor.  However in the case of secureVMs, the
> fault is delivered to the ultravisor.
> 
> Unfortunately the Ultravisor has no correct-way to fetch the faulting
> instruction. The PEF architecture does not allow Ultravisor to enable MMU
> translation. Walking the two level page table to read the instruction can race
> with other vcpus modifying the SVM's process scoped page table.
> 
> This problem can be correctly solved with some help from the kernel.
> 
> Capture the faulting instruction in SPRG0 register, before executing the
> faulting instruction. This enables the ultravisor to easily procure the
> faulting instruction and emulate it.
> 
> Signed-off-by: Ram Pai <linuxram@us.ibm.com>
> ---
>   arch/powerpc/include/asm/io.h | 85 ++++++++++++++++++++++++++++++++++++++-----
>   1 file changed, 75 insertions(+), 10 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/io.h b/arch/powerpc/include/asm/io.h
> index 635969b..7ef663d 100644
> --- a/arch/powerpc/include/asm/io.h
> +++ b/arch/powerpc/include/asm/io.h
> @@ -35,6 +35,7 @@
>   #include <asm/mmu.h>
>   #include <asm/ppc_asm.h>
>   #include <asm/pgtable.h>
> +#include <asm/svm.h>
>   
>   #define SIO_CONFIG_RA	0x398
>   #define SIO_CONFIG_RD	0x399
> @@ -105,34 +106,98 @@
>   static inline u##size name(const volatile u##size __iomem *addr)	\
>   {									\
>   	u##size ret;							\
> -	__asm__ __volatile__("sync;"#insn" %0,%y1;twi 0,%0,0;isync"	\
> -		: "=r" (ret) : "Z" (*addr) : "memory");			\
> +	if (is_secure_guest()) {					\
> +		__asm__ __volatile__("mfsprg0 %3;"			\
> +				"lnia %2;"				\
> +				"ld %2,12(%2);"				\
> +				"mtsprg0 %2;"				\
> +				"sync;"					\
> +				#insn" %0,%y1;"				\
> +				"twi 0,%0,0;"				\
> +				"isync;"				\
> +				"mtsprg0 %3"				\
> +			: "=r" (ret)					\
> +			: "Z" (*addr), "r" (0), "r" (0)			\

I'm wondering if SPRG0 is restored to its original value.
You're using the same register (r0) for parameters 2 and 3, so when doing lnia 
%2, you're overwriting the SPRG0 value you saved in r0 just earlier.

It may be clearer to use explicit registers for %2 and %3 and to mark them as 
modified for the compiler.

This applies to the other macros.

Cheers,
Laurent.

> +			: "memory");					\
> +	} else {							\
> +		__asm__ __volatile__("sync;"				\
> +				#insn" %0,%y1;"				\
> +				"twi 0,%0,0;"				\
> +				"isync"					\
> +			: "=r" (ret) : "Z" (*addr) : "memory");		\
> +	}								\
>   	return ret;							\
>   }
>   
>   #define DEF_MMIO_OUT_X(name, size, insn)				\
>   static inline void name(volatile u##size __iomem *addr, u##size val)	\
>   {									\
> -	__asm__ __volatile__("sync;"#insn" %1,%y0"			\
> -		: "=Z" (*addr) : "r" (val) : "memory");			\
> -	mmiowb_set_pending();						\
> +	if (is_secure_guest()) {					\
> +		__asm__ __volatile__("mfsprg0 %3;"			\
> +				"lnia %2;"				\
> +				"ld %2,12(%2);"				\
> +				"mtsprg0 %2;"				\
> +				"sync;"					\
> +				#insn" %1,%y0;"				\
> +				"mtsprg0 %3"				\
> +			: "=Z" (*addr)					\
> +			: "r" (val), "r" (0), "r" (0)			\
> +			: "memory");					\
> +	} else {							\
> +		__asm__ __volatile__("sync;"				\
> +				#insn" %1,%y0"				\
> +			: "=Z" (*addr) : "r" (val) : "memory");         \
> +		mmiowb_set_pending();					\
> +	}								\
>   }
>   
>   #define DEF_MMIO_IN_D(name, size, insn)				\
>   static inline u##size name(const volatile u##size __iomem *addr)	\
>   {									\
>   	u##size ret;							\
> -	__asm__ __volatile__("sync;"#insn"%U1%X1 %0,%1;twi 0,%0,0;isync"\
> -		: "=r" (ret) : "m" (*addr) : "memory");			\
> +	if (is_secure_guest()) {					\
> +		__asm__ __volatile__("mfsprg0 %3;"			\
> +				"lnia %2;"				\
> +				"ld %2,12(%2);"				\
> +				"mtsprg0 %2;"				\
> +				"sync;"					\
> +				#insn"%U1%X1 %0,%1;"			\
> +				"twi 0,%0,0;"				\
> +				"isync;"				\
> +				"mtsprg0 %3"				\
> +			: "=r" (ret)					\
> +			: "m" (*addr), "r" (0), "r" (0)			\
> +			: "memory");					\
> +	} else {							\
> +		__asm__ __volatile__("sync;"				\
> +				#insn"%U1%X1 %0,%1;"			\
> +				"twi 0,%0,0;"				\
> +				"isync"					\
> +			: "=r" (ret) : "m" (*addr) : "memory");         \
> +	}								\
>   	return ret;							\
>   }
>   
>   #define DEF_MMIO_OUT_D(name, size, insn)				\
>   static inline void name(volatile u##size __iomem *addr, u##size val)	\
>   {									\
> -	__asm__ __volatile__("sync;"#insn"%U0%X0 %1,%0"			\
> -		: "=m" (*addr) : "r" (val) : "memory");			\
> -	mmiowb_set_pending();						\
> +	if (is_secure_guest()) {					\
> +		__asm__ __volatile__("mfsprg0 %3;"			\
> +				"lnia %2;"				\
> +				"ld %2,12(%2);"				\
> +				"mtsprg0 %2;"				\
> +				"sync;"					\
> +				#insn"%U0%X0 %1,%0;"			\
> +				"mtsprg0 %3"				\
> +			: "=m" (*addr)					\
> +			: "r" (val), "r" (0), "r" (0)			\
> +			: "memory");					\
> +	} else {							\
> +		__asm__ __volatile__("sync;"				\
> +				#insn"%U0%X0 %1,%0"			\
> +			: "=m" (*addr) : "r" (val) : "memory");		\
> +		mmiowb_set_pending();					\
> +	}								\
>   }
>   
>   DEF_MMIO_IN_D(in_8,     8, lbz);
>
Segher Boessenkool July 20, 2020, 8:10 p.m. UTC | #2
On Mon, Jul 20, 2020 at 11:39:56AM +0200, Laurent Dufour wrote:
> Le 16/07/2020 à 10:32, Ram Pai a écrit :
> >+	if (is_secure_guest()) {					\
> >+		__asm__ __volatile__("mfsprg0 %3;"			\
> >+				"lnia %2;"				\
> >+				"ld %2,12(%2);"				\
> >+				"mtsprg0 %2;"				\
> >+				"sync;"					\
> >+				#insn" %0,%y1;"				\
> >+				"twi 0,%0,0;"				\
> >+				"isync;"				\
> >+				"mtsprg0 %3"				\
> >+			: "=r" (ret)					\
> >+			: "Z" (*addr), "r" (0), "r" (0)			\
> 
> I'm wondering if SPRG0 is restored to its original value.
> You're using the same register (r0) for parameters 2 and 3, so when doing 
> lnia %2, you're overwriting the SPRG0 value you saved in r0 just earlier.

It is putting the value 0 in the registers the compiler chooses for
operands 2 and 3.  But operand 3 is written, while the asm says it is an
input.  It needs an earlyclobber as well.

> It may be clearer to use explicit registers for %2 and %3 and to mark them 
> as modified for the compiler.

That is not a good idea, imnsho.


Segher
Segher Boessenkool July 20, 2020, 8:24 p.m. UTC | #3
On Mon, Jul 20, 2020 at 03:10:41PM -0500, Segher Boessenkool wrote:
> On Mon, Jul 20, 2020 at 11:39:56AM +0200, Laurent Dufour wrote:
> > Le 16/07/2020 à 10:32, Ram Pai a écrit :
> > >+	if (is_secure_guest()) {					\
> > >+		__asm__ __volatile__("mfsprg0 %3;"			\
> > >+				"lnia %2;"				\
> > >+				"ld %2,12(%2);"				\
> > >+				"mtsprg0 %2;"				\
> > >+				"sync;"					\
> > >+				#insn" %0,%y1;"				\
> > >+				"twi 0,%0,0;"				\
> > >+				"isync;"				\
> > >+				"mtsprg0 %3"				\
> > >+			: "=r" (ret)					\
> > >+			: "Z" (*addr), "r" (0), "r" (0)			\
> > 
> > I'm wondering if SPRG0 is restored to its original value.
> > You're using the same register (r0) for parameters 2 and 3, so when doing 
> > lnia %2, you're overwriting the SPRG0 value you saved in r0 just earlier.
> 
> It is putting the value 0 in the registers the compiler chooses for
> operands 2 and 3.  But operand 3 is written, while the asm says it is an
> input.  It needs an earlyclobber as well.
> 
> > It may be clearer to use explicit registers for %2 and %3 and to mark them 
> > as modified for the compiler.
> 
> That is not a good idea, imnsho.

(The explicit register number part, I mean; operand 2 should be an
output as well, yes.)


Segher
Laurent Dufour July 21, 2020, 7:22 a.m. UTC | #4
Le 20/07/2020 à 22:24, Segher Boessenkool a écrit :
> On Mon, Jul 20, 2020 at 03:10:41PM -0500, Segher Boessenkool wrote:
>> On Mon, Jul 20, 2020 at 11:39:56AM +0200, Laurent Dufour wrote:
>>> Le 16/07/2020 à 10:32, Ram Pai a écrit :
>>>> +	if (is_secure_guest()) {					\
>>>> +		__asm__ __volatile__("mfsprg0 %3;"			\
>>>> +				"lnia %2;"				\
>>>> +				"ld %2,12(%2);"				\
>>>> +				"mtsprg0 %2;"				\
>>>> +				"sync;"					\
>>>> +				#insn" %0,%y1;"				\
>>>> +				"twi 0,%0,0;"				\
>>>> +				"isync;"				\
>>>> +				"mtsprg0 %3"				\
>>>> +			: "=r" (ret)					\
>>>> +			: "Z" (*addr), "r" (0), "r" (0)			\
>>>
>>> I'm wondering if SPRG0 is restored to its original value.
>>> You're using the same register (r0) for parameters 2 and 3, so when doing
>>> lnia %2, you're overwriting the SPRG0 value you saved in r0 just earlier.
>>
>> It is putting the value 0 in the registers the compiler chooses for
>> operands 2 and 3.  But operand 3 is written, while the asm says it is an
>> input.  It needs an earlyclobber as well.

Oh nice, I was not aware that compiler may choose registers this way.
Good to know, thanks for the explanation.

>>> It may be clearer to use explicit registers for %2 and %3 and to mark them
>>> as modified for the compiler.
>>
>> That is not a good idea, imnsho.
> 
> (The explicit register number part, I mean; operand 2 should be an
> output as well, yes.)

Sure if the compiler can choose the registers that's far better.

Cheers,
Laurent.
Nicholas Piggin July 21, 2020, 3 p.m. UTC | #5
Excerpts from Ram Pai's message of July 16, 2020 6:32 pm:
> An instruction accessing a mmio address, generates a HDSI fault.  This fault is
> appropriately handled by the Hypervisor.  However in the case of secureVMs, the
> fault is delivered to the ultravisor.

Why not a ucall if you're paraultravizing it anyway?

> 
> Unfortunately the Ultravisor has no correct-way to fetch the faulting
> instruction. The PEF architecture does not allow Ultravisor to enable MMU
> translation. Walking the two level page table to read the instruction can race
> with other vcpus modifying the SVM's process scoped page table.
> 
> This problem can be correctly solved with some help from the kernel.
> 
> Capture the faulting instruction in SPRG0 register, before executing the
> faulting instruction. This enables the ultravisor to easily procure the
> faulting instruction and emulate it.
> 
> Signed-off-by: Ram Pai <linuxram@us.ibm.com>
> ---
>  arch/powerpc/include/asm/io.h | 85 ++++++++++++++++++++++++++++++++++++++-----
>  1 file changed, 75 insertions(+), 10 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/io.h b/arch/powerpc/include/asm/io.h
> index 635969b..7ef663d 100644
> --- a/arch/powerpc/include/asm/io.h
> +++ b/arch/powerpc/include/asm/io.h
> @@ -35,6 +35,7 @@
>  #include <asm/mmu.h>
>  #include <asm/ppc_asm.h>
>  #include <asm/pgtable.h>
> +#include <asm/svm.h>
>  
>  #define SIO_CONFIG_RA	0x398
>  #define SIO_CONFIG_RD	0x399
> @@ -105,34 +106,98 @@
>  static inline u##size name(const volatile u##size __iomem *addr)	\
>  {									\
>  	u##size ret;							\
> -	__asm__ __volatile__("sync;"#insn" %0,%y1;twi 0,%0,0;isync"	\
> -		: "=r" (ret) : "Z" (*addr) : "memory");			\
> +	if (is_secure_guest()) {					\
> +		__asm__ __volatile__("mfsprg0 %3;"			\
> +				"lnia %2;"				\
> +				"ld %2,12(%2);"				\
> +				"mtsprg0 %2;"				\
> +				"sync;"					\
> +				#insn" %0,%y1;"				\
> +				"twi 0,%0,0;"				\
> +				"isync;"				\
> +				"mtsprg0 %3"				\

We prefer to use mtspr in new code, and the nia offset should be 
calculated with a label I think "(1f - .)(%2)" should work.

SPRG usage is documented in arch/powerpc/include/asm/reg.h if this 
goes past RFC stage. Looks like SPRG0 probably could be used for this.

Thanks,
Nick
Michael Ellerman July 22, 2020, 2:06 a.m. UTC | #6
Ram Pai <linuxram@us.ibm.com> writes:
> An instruction accessing a mmio address, generates a HDSI fault.  This fault is
> appropriately handled by the Hypervisor.  However in the case of secureVMs, the
> fault is delivered to the ultravisor.
>
> Unfortunately the Ultravisor has no correct-way to fetch the faulting
> instruction. The PEF architecture does not allow Ultravisor to enable MMU
> translation. Walking the two level page table to read the instruction can race
> with other vcpus modifying the SVM's process scoped page table.

You're trying to read the guest's kernel text IIUC, that mapping should
be stable. Possibly permissions on it could change over time, but the
virtual -> real mapping should not.

> This problem can be correctly solved with some help from the kernel.
>
> Capture the faulting instruction in SPRG0 register, before executing the
> faulting instruction. This enables the ultravisor to easily procure the
> faulting instruction and emulate it.

This is not something I'm going to merge. Sorry.

cheers
Benjamin Herrenschmidt July 22, 2020, 2:23 a.m. UTC | #7
On Wed, 2020-07-22 at 12:06 +1000, Michael Ellerman wrote:
> Ram Pai <linuxram@us.ibm.com> writes:
> > An instruction accessing a mmio address, generates a HDSI fault.  This fault is
> > appropriately handled by the Hypervisor.  However in the case of secureVMs, the
> > fault is delivered to the ultravisor.
> > 
> > Unfortunately the Ultravisor has no correct-way to fetch the faulting
> > instruction. The PEF architecture does not allow Ultravisor to enable MMU
> > translation. Walking the two level page table to read the instruction can race
> > with other vcpus modifying the SVM's process scoped page table.
> 
> You're trying to read the guest's kernel text IIUC, that mapping should
> be stable. Possibly permissions on it could change over time, but the
> virtual -> real mapping should not.

This will of course no longer work if userspace tries to access MMIO
but as long as you are ok limiting MMIO usage to the kernel, that's one
way.

iirc MMIO emulation in KVM on powerpc already has that race because of
HW TLB inval broadcast and it hasn't hurt anybody ... so far.

> > This problem can be correctly solved with some help from the kernel.
> > 
> > Capture the faulting instruction in SPRG0 register, before executing the
> > faulting instruction. This enables the ultravisor to easily procure the
> > faulting instruction and emulate it.
> 
> This is not something I'm going to merge. Sorry.

Another approach is to have the guest explicitly switch to using UV
calls for MMIO.

Cheers,
Ben.
Paul Mackerras July 22, 2020, 5:02 a.m. UTC | #8
On Thu, Jul 16, 2020 at 01:32:13AM -0700, Ram Pai wrote:
> An instruction accessing a mmio address, generates a HDSI fault.  This fault is
> appropriately handled by the Hypervisor.  However in the case of secureVMs, the
> fault is delivered to the ultravisor.
> 
> Unfortunately the Ultravisor has no correct-way to fetch the faulting
> instruction. The PEF architecture does not allow Ultravisor to enable MMU
> translation. Walking the two level page table to read the instruction can race
> with other vcpus modifying the SVM's process scoped page table.
> 
> This problem can be correctly solved with some help from the kernel.
> 
> Capture the faulting instruction in SPRG0 register, before executing the
> faulting instruction. This enables the ultravisor to easily procure the
> faulting instruction and emulate it.

Just a comment on the approach of putting the instruction in SPRG0:
these I/O accessors can be used in interrupt routines, which means
that if these accessors are ever used with interrupts enabled, there
is the possibility of an external interrupt occurring between the
instruction that sets SPRG0 and the load/store instruction that
faults.  If the handler for that interrupt itself does an I/O access,
it will overwrite SPRG0, corrupting the value set by the interrupted
code.

The choices to fix that would seem to be (a) disable interrupts around
all I/O accesses, (b) have the accessor save and restore SPRG0, or (c)
solve the problem another way, such as by doing a H_LOGICAL_CI_LOAD
or H_LOGICAL_CI_STORE hypercall.

Paul.
Ram Pai July 22, 2020, 7:42 a.m. UTC | #9
On Wed, Jul 22, 2020 at 03:02:32PM +1000, Paul Mackerras wrote:
> On Thu, Jul 16, 2020 at 01:32:13AM -0700, Ram Pai wrote:
> > An instruction accessing a mmio address, generates a HDSI fault.  This fault is
> > appropriately handled by the Hypervisor.  However in the case of secureVMs, the
> > fault is delivered to the ultravisor.
> > 
> > Unfortunately the Ultravisor has no correct-way to fetch the faulting
> > instruction. The PEF architecture does not allow Ultravisor to enable MMU
> > translation. Walking the two level page table to read the instruction can race
> > with other vcpus modifying the SVM's process scoped page table.
> > 
> > This problem can be correctly solved with some help from the kernel.
> > 
> > Capture the faulting instruction in SPRG0 register, before executing the
> > faulting instruction. This enables the ultravisor to easily procure the
> > faulting instruction and emulate it.
> 
> Just a comment on the approach of putting the instruction in SPRG0:
> these I/O accessors can be used in interrupt routines, which means
> that if these accessors are ever used with interrupts enabled, there
> is the possibility of an external interrupt occurring between the
> instruction that sets SPRG0 and the load/store instruction that
> faults.  If the handler for that interrupt itself does an I/O access,
> it will overwrite SPRG0, corrupting the value set by the interrupted
> code.

Acutally my proposed code restores the value of SPRG0 before returning back to
the interrupted instruction. So here is the sequence. I think it works.

 (1) store sprg0 in register Rx (lets say srpg0 had 0xc. Rx now contains 0xc)

 (2) save faulting instruction address in sprg0 (lets say the value is 0xa.
		 			sprg0 will contain 0xa).

 (3) 		<----- interrupt arrives

 (4) store sprg0 in register Ry ( Ry now contains 0xa )

 (5) save faulting instruction address in sprg0 (lets say the value is 0xb).

 (6) 0xb: execute faulting instruction

 (7) restore Ry into sprg0 ( sprg0 now contains 0xa )

 (8) 		<-- return from interrupt

 (9)  0xa: execute faulting instruction

 (10) restore Rx into sprg0 (sprg0 now contains 0xc)

> 
> The choices to fix that would seem to be (a) disable interrupts around
> all I/O accesses, (b) have the accessor save and restore SPRG0, or (c)
> solve the problem another way, such as by doing a H_LOGICAL_CI_LOAD
> or H_LOGICAL_CI_STORE hypercall.

Ok. Will explore (c).

Thanks,
RP
Ram Pai July 22, 2020, 7:45 a.m. UTC | #10
On Wed, Jul 22, 2020 at 12:42:05AM -0700, Ram Pai wrote:
> On Wed, Jul 22, 2020 at 03:02:32PM +1000, Paul Mackerras wrote:
> > On Thu, Jul 16, 2020 at 01:32:13AM -0700, Ram Pai wrote:
> > > An instruction accessing a mmio address, generates a HDSI fault.  This fault is
> > > appropriately handled by the Hypervisor.  However in the case of secureVMs, the
> > > fault is delivered to the ultravisor.
> > > 
> > > Unfortunately the Ultravisor has no correct-way to fetch the faulting
> > > instruction. The PEF architecture does not allow Ultravisor to enable MMU
> > > translation. Walking the two level page table to read the instruction can race
> > > with other vcpus modifying the SVM's process scoped page table.
> > > 
> > > This problem can be correctly solved with some help from the kernel.
> > > 
> > > Capture the faulting instruction in SPRG0 register, before executing the
> > > faulting instruction. This enables the ultravisor to easily procure the
> > > faulting instruction and emulate it.
> > 
> > Just a comment on the approach of putting the instruction in SPRG0:
> > these I/O accessors can be used in interrupt routines, which means
> > that if these accessors are ever used with interrupts enabled, there
> > is the possibility of an external interrupt occurring between the
> > instruction that sets SPRG0 and the load/store instruction that
> > faults.  If the handler for that interrupt itself does an I/O access,
> > it will overwrite SPRG0, corrupting the value set by the interrupted
> > code.
> 
> Acutally my proposed code restores the value of SPRG0 before returning back to
> the interrupted instruction. So here is the sequence. I think it works.
> 
>  (1) store sprg0 in register Rx (lets say srpg0 had 0xc. Rx now contains 0xc)
> 
>  (2) save faulting instruction address in sprg0 (lets say the value is 0xa.
> 		 			sprg0 will contain 0xa).

Small correction. sprg0 does not store the address of the faulting
instruction. It stores the isntruction itself. Regardless, the code
should work, I think.

RP
Ram Pai July 22, 2020, 7:49 a.m. UTC | #11
On Wed, Jul 22, 2020 at 12:06:06PM +1000, Michael Ellerman wrote:
> Ram Pai <linuxram@us.ibm.com> writes:
> > An instruction accessing a mmio address, generates a HDSI fault.  This fault is
> > appropriately handled by the Hypervisor.  However in the case of secureVMs, the
> > fault is delivered to the ultravisor.
> >
> > Unfortunately the Ultravisor has no correct-way to fetch the faulting
> > instruction. The PEF architecture does not allow Ultravisor to enable MMU
> > translation. Walking the two level page table to read the instruction can race
> > with other vcpus modifying the SVM's process scoped page table.
> 
> You're trying to read the guest's kernel text IIUC, that mapping should
> be stable. Possibly permissions on it could change over time, but the
> virtual -> real mapping should not.

Actually the code does not capture the address of the instruction in the
sprg0 register. It captures the instruction itself. So should the mapping
matter?

> 
> > This problem can be correctly solved with some help from the kernel.
> >
> > Capture the faulting instruction in SPRG0 register, before executing the
> > faulting instruction. This enables the ultravisor to easily procure the
> > faulting instruction and emulate it.
> 
> This is not something I'm going to merge. Sorry.

Ok. Will consider other approaches.

RP
Michael Ellerman July 22, 2020, 12:45 p.m. UTC | #12
Ram Pai <linuxram@us.ibm.com> writes:
> On Wed, Jul 22, 2020 at 12:06:06PM +1000, Michael Ellerman wrote:
>> Ram Pai <linuxram@us.ibm.com> writes:
>> > An instruction accessing a mmio address, generates a HDSI fault.  This fault is
>> > appropriately handled by the Hypervisor.  However in the case of secureVMs, the
>> > fault is delivered to the ultravisor.
>> >
>> > Unfortunately the Ultravisor has no correct-way to fetch the faulting
>> > instruction. The PEF architecture does not allow Ultravisor to enable MMU
>> > translation. Walking the two level page table to read the instruction can race
>> > with other vcpus modifying the SVM's process scoped page table.
>> 
>> You're trying to read the guest's kernel text IIUC, that mapping should
>> be stable. Possibly permissions on it could change over time, but the
>> virtual -> real mapping should not.
>
> Actually the code does not capture the address of the instruction in the
> sprg0 register. It captures the instruction itself. So should the mapping
> matter?

Sorry that was talking about reading the instruction by doing the page
walk, not with this patch applied.

cheers
Michael Ellerman July 24, 2020, 11:49 a.m. UTC | #13
Ram Pai <linuxram@us.ibm.com> writes:
> On Wed, Jul 22, 2020 at 12:06:06PM +1000, Michael Ellerman wrote:
>> Ram Pai <linuxram@us.ibm.com> writes:
>> > An instruction accessing a mmio address, generates a HDSI fault.  This fault is
>> > appropriately handled by the Hypervisor.  However in the case of secureVMs, the
>> > fault is delivered to the ultravisor.
>> >
>> > Unfortunately the Ultravisor has no correct-way to fetch the faulting
>> > instruction. The PEF architecture does not allow Ultravisor to enable MMU
>> > translation. Walking the two level page table to read the instruction can race
>> > with other vcpus modifying the SVM's process scoped page table.
>> 
>> You're trying to read the guest's kernel text IIUC, that mapping should
>> be stable. Possibly permissions on it could change over time, but the
>> virtual -> real mapping should not.
>
> Actually the code does not capture the address of the instruction in the
> sprg0 register. It captures the instruction itself. So should the mapping
> matter?
>> 
>> > This problem can be correctly solved with some help from the kernel.
>> >
>> > Capture the faulting instruction in SPRG0 register, before executing the
>> > faulting instruction. This enables the ultravisor to easily procure the
>> > faulting instruction and emulate it.
>> 
>> This is not something I'm going to merge. Sorry.
>
> Ok. Will consider other approaches.

To elaborate ...

You've basically invented a custom ucall ABI. But a really strange one
which takes an instruction as its first parameter in SPRG0, and then
subsequent parameters in any GPR depending on what the instruction was.

The UV should either emulate the instruction, which means the guest
should not be expected to do anything other than execute the
instruction. Or it should be done with a proper ucall that the guest
explicitly makes with a well defined ABI.

cheers
diff mbox series

Patch

diff --git a/arch/powerpc/include/asm/io.h b/arch/powerpc/include/asm/io.h
index 635969b..7ef663d 100644
--- a/arch/powerpc/include/asm/io.h
+++ b/arch/powerpc/include/asm/io.h
@@ -35,6 +35,7 @@ 
 #include <asm/mmu.h>
 #include <asm/ppc_asm.h>
 #include <asm/pgtable.h>
+#include <asm/svm.h>
 
 #define SIO_CONFIG_RA	0x398
 #define SIO_CONFIG_RD	0x399
@@ -105,34 +106,98 @@ 
 static inline u##size name(const volatile u##size __iomem *addr)	\
 {									\
 	u##size ret;							\
-	__asm__ __volatile__("sync;"#insn" %0,%y1;twi 0,%0,0;isync"	\
-		: "=r" (ret) : "Z" (*addr) : "memory");			\
+	if (is_secure_guest()) {					\
+		__asm__ __volatile__("mfsprg0 %3;"			\
+				"lnia %2;"				\
+				"ld %2,12(%2);"				\
+				"mtsprg0 %2;"				\
+				"sync;"					\
+				#insn" %0,%y1;"				\
+				"twi 0,%0,0;"				\
+				"isync;"				\
+				"mtsprg0 %3"				\
+			: "=r" (ret)					\
+			: "Z" (*addr), "r" (0), "r" (0)			\
+			: "memory");					\
+	} else {							\
+		__asm__ __volatile__("sync;"				\
+				#insn" %0,%y1;"				\
+				"twi 0,%0,0;"				\
+				"isync"					\
+			: "=r" (ret) : "Z" (*addr) : "memory");		\
+	}								\
 	return ret;							\
 }
 
 #define DEF_MMIO_OUT_X(name, size, insn)				\
 static inline void name(volatile u##size __iomem *addr, u##size val)	\
 {									\
-	__asm__ __volatile__("sync;"#insn" %1,%y0"			\
-		: "=Z" (*addr) : "r" (val) : "memory");			\
-	mmiowb_set_pending();						\
+	if (is_secure_guest()) {					\
+		__asm__ __volatile__("mfsprg0 %3;"			\
+				"lnia %2;"				\
+				"ld %2,12(%2);"				\
+				"mtsprg0 %2;"				\
+				"sync;"					\
+				#insn" %1,%y0;"				\
+				"mtsprg0 %3"				\
+			: "=Z" (*addr)					\
+			: "r" (val), "r" (0), "r" (0)			\
+			: "memory");					\
+	} else {							\
+		__asm__ __volatile__("sync;"				\
+				#insn" %1,%y0"				\
+			: "=Z" (*addr) : "r" (val) : "memory");         \
+		mmiowb_set_pending();					\
+	}								\
 }
 
 #define DEF_MMIO_IN_D(name, size, insn)				\
 static inline u##size name(const volatile u##size __iomem *addr)	\
 {									\
 	u##size ret;							\
-	__asm__ __volatile__("sync;"#insn"%U1%X1 %0,%1;twi 0,%0,0;isync"\
-		: "=r" (ret) : "m" (*addr) : "memory");			\
+	if (is_secure_guest()) {					\
+		__asm__ __volatile__("mfsprg0 %3;"			\
+				"lnia %2;"				\
+				"ld %2,12(%2);"				\
+				"mtsprg0 %2;"				\
+				"sync;"					\
+				#insn"%U1%X1 %0,%1;"			\
+				"twi 0,%0,0;"				\
+				"isync;"				\
+				"mtsprg0 %3"				\
+			: "=r" (ret)					\
+			: "m" (*addr), "r" (0), "r" (0)			\
+			: "memory");					\
+	} else {							\
+		__asm__ __volatile__("sync;"				\
+				#insn"%U1%X1 %0,%1;"			\
+				"twi 0,%0,0;"				\
+				"isync"					\
+			: "=r" (ret) : "m" (*addr) : "memory");         \
+	}								\
 	return ret;							\
 }
 
 #define DEF_MMIO_OUT_D(name, size, insn)				\
 static inline void name(volatile u##size __iomem *addr, u##size val)	\
 {									\
-	__asm__ __volatile__("sync;"#insn"%U0%X0 %1,%0"			\
-		: "=m" (*addr) : "r" (val) : "memory");			\
-	mmiowb_set_pending();						\
+	if (is_secure_guest()) {					\
+		__asm__ __volatile__("mfsprg0 %3;"			\
+				"lnia %2;"				\
+				"ld %2,12(%2);"				\
+				"mtsprg0 %2;"				\
+				"sync;"					\
+				#insn"%U0%X0 %1,%0;"			\
+				"mtsprg0 %3"				\
+			: "=m" (*addr)					\
+			: "r" (val), "r" (0), "r" (0)			\
+			: "memory");					\
+	} else {							\
+		__asm__ __volatile__("sync;"				\
+				#insn"%U0%X0 %1,%0"			\
+			: "=m" (*addr) : "r" (val) : "memory");		\
+		mmiowb_set_pending();					\
+	}								\
 }
 
 DEF_MMIO_IN_D(in_8,     8, lbz);