diff mbox series

[RFC,v1] powerpc/prom_init: disable XIVE in Secure VM.

Message ID 1582962844-26333-1-git-send-email-linuxram@us.ibm.com
State Rejected
Headers show
Series [RFC,v1] powerpc/prom_init: disable XIVE in Secure VM. | expand

Commit Message

Ram Pai Feb. 29, 2020, 7:54 a.m. UTC
XIVE is not correctly enabled for Secure VM in the KVM Hypervisor yet.

Hence Secure VM, must always default to XICS interrupt controller.

If XIVE is requested through kernel command line option "xive=on",
override and turn it off.

If XIVE is the only supported platform interrupt controller; specified
through qemu option "ic-mode=xive", simply abort. Otherwise default to
XICS.

Cc: kvm-ppc@vger.kernel.org
Cc: linuxppc-dev@lists.ozlabs.org
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Thiago Jung Bauermann <bauerman@linux.ibm.com>
Cc: Michael Anderson <andmike@linux.ibm.com>
Cc: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Cc: Alexey Kardashevskiy <aik@ozlabs.ru>
Cc: Paul Mackerras <paulus@ozlabs.org>
Cc: Greg Kurz <groug@kaod.org>
Cc: Cedric Le Goater <clg@fr.ibm.com>
Cc: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Ram Pai <linuxram@us.ibm.com>
---
 arch/powerpc/kernel/prom_init.c | 43 ++++++++++++++++++++++++++++-------------
 1 file changed, 30 insertions(+), 13 deletions(-)

Comments

Cédric Le Goater Feb. 29, 2020, 8:27 a.m. UTC | #1
On 2/29/20 8:54 AM, Ram Pai wrote:
> XIVE is not correctly enabled for Secure VM in the KVM Hypervisor yet.
> 
> Hence Secure VM, must always default to XICS interrupt controller.

have you tried XIVE emulation 'kernel-irqchip=off' ? 

> If XIVE is requested through kernel command line option "xive=on",
> override and turn it off.

This is incorrect. It is negotiated through CAS depending on the FW
capabilities and the KVM capabilities.

> If XIVE is the only supported platform interrupt controller; specified
> through qemu option "ic-mode=xive", simply abort. Otherwise default to
> XICS.


I don't think it is a good approach to downgrade the guest kernel 
capabilities this way. 

PAPR has specified the CAS negotiation process for this purpose. It 
comes in two parts under KVM. First the KVM hypervisor advertises or 
not a capability to QEMU. The second is the CAS negotiation process 
between QEMU and the guest OS.

The SVM specifications might not be complete yet and if some features 
are incompatible, I think we should modify the capabilities advertised 
by the hypervisor : no XIVE in case of SVM. QEMU will automatically 
use the fallback path and emulate the XIVE device, same as setting 
'kernel-irqchip=off'. 

This is how KVM operates on Boston systems today which do not have 
the right level of FW to support migration. XIVE is emulated. 

It will give SVM a working default without any changes in QEMU or the
guest. Now, if one needs more performance, accelerated xics should be
activated on the command line with 'xive=off'.


I understand that SVM requires FW support. Do we have a SVM capability  
returned to QEMU ? That might have been addressed in other patches.

Thanks,

C.

> 
> Cc: kvm-ppc@vger.kernel.org
> Cc: linuxppc-dev@lists.ozlabs.org
> Cc: Michael Ellerman <mpe@ellerman.id.au>
> Cc: Thiago Jung Bauermann <bauerman@linux.ibm.com>
> Cc: Michael Anderson <andmike@linux.ibm.com>
> Cc: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
> Cc: Alexey Kardashevskiy <aik@ozlabs.ru>
> Cc: Paul Mackerras <paulus@ozlabs.org>
> Cc: Greg Kurz <groug@kaod.org>
> Cc: Cedric Le Goater <clg@fr.ibm.com>
> Cc: David Gibson <david@gibson.dropbear.id.au>
> Signed-off-by: Ram Pai <linuxram@us.ibm.com>
> ---
>  arch/powerpc/kernel/prom_init.c | 43 ++++++++++++++++++++++++++++-------------
>  1 file changed, 30 insertions(+), 13 deletions(-)
> 
> diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c
> index 5773453..dd96c82 100644
> --- a/arch/powerpc/kernel/prom_init.c
> +++ b/arch/powerpc/kernel/prom_init.c
> @@ -805,6 +805,18 @@ static void __init early_cmdline_parse(void)
>  #endif
>  	}
>  
> +#ifdef CONFIG_PPC_SVM
> +	opt = prom_strstr(prom_cmd_line, "svm=");
> +	if (opt) {
> +		bool val;
> +
> +		opt += sizeof("svm=") - 1;
> +		if (!prom_strtobool(opt, &val))
> +			prom_svm_enable = val;
> +		prom_printf("svm =%d\n", prom_svm_enable);
> +	}
> +#endif /* CONFIG_PPC_SVM */
> +
>  #ifdef CONFIG_PPC_PSERIES
>  	prom_radix_disable = !IS_ENABLED(CONFIG_PPC_RADIX_MMU_DEFAULT);
>  	opt = prom_strstr(prom_cmd_line, "disable_radix");
> @@ -823,23 +835,22 @@ static void __init early_cmdline_parse(void)
>  	if (prom_radix_disable)
>  		prom_debug("Radix disabled from cmdline\n");
>  
> -	opt = prom_strstr(prom_cmd_line, "xive=off");
> -	if (opt) {
> +#ifdef CONFIG_PPC_SVM
> +	if (prom_svm_enable) {
>  		prom_xive_disable = true;
> -		prom_debug("XIVE disabled from cmdline\n");
> +		prom_debug("XIVE disabled in Secure VM\n");
>  	}
> -#endif /* CONFIG_PPC_PSERIES */
> -
> -#ifdef CONFIG_PPC_SVM
> -	opt = prom_strstr(prom_cmd_line, "svm=");
> -	if (opt) {
> -		bool val;
> +#endif /* CONFIG_PPC_SVM */
>  
> -		opt += sizeof("svm=") - 1;
> -		if (!prom_strtobool(opt, &val))
> -			prom_svm_enable = val;
> +	if (!prom_xive_disable) {
> +		opt = prom_strstr(prom_cmd_line, "xive=off");
> +		if (opt) {
> +			prom_xive_disable = true;
> +			prom_debug("XIVE disabled from cmdline\n");
> +		}
>  	}
> -#endif /* CONFIG_PPC_SVM */
> +
> +#endif /* CONFIG_PPC_PSERIES */
>  }
>  
>  #ifdef CONFIG_PPC_PSERIES
> @@ -1251,6 +1262,12 @@ static void __init prom_parse_xive_model(u8 val,
>  		break;
>  	case OV5_FEAT(OV5_XIVE_EXPLOIT): /* Only Exploitation mode */
>  		prom_debug("XIVE - exploitation mode supported\n");
> +
> +#ifdef CONFIG_PPC_SVM
> +		if (prom_svm_enable)
> +			prom_panic("WARNING: xive unsupported in Secure VM\n");
> +#endif /* CONFIG_PPC_SVM */
> +
>  		if (prom_xive_disable) {
>  			/*
>  			 * If we __have__ to do XIVE, we're better off ignoring
>
Ram Pai Feb. 29, 2020, 10:51 p.m. UTC | #2
On Sat, Feb 29, 2020 at 09:27:54AM +0100, Cédric Le Goater wrote:
> On 2/29/20 8:54 AM, Ram Pai wrote:
> > XIVE is not correctly enabled for Secure VM in the KVM Hypervisor yet.
> > 
> > Hence Secure VM, must always default to XICS interrupt controller.
> 
> have you tried XIVE emulation 'kernel-irqchip=off' ? 

yes and it hangs. I think that option, continues to enable some variant
of XIVE in the VM.  There are some known deficiencies between KVM
and the ultravisor negotiation, resulting in a hang in the SVM.

> 
> > If XIVE is requested through kernel command line option "xive=on",
> > override and turn it off.
> 
> This is incorrect. It is negotiated through CAS depending on the FW
> capabilities and the KVM capabilities.

Yes I understand, qemu/KVM have predetermined a set of capabilties that
it can offer to the VM.  The kernel within the VM has a list of
capabilties it needs to operate correctly.  So both negotiate and
determine something mutually ammicable.

Here I am talking about the list of capabilities that the kernel is
trying to determine, it needs to operate correctly.  "xive=on" is one of
those capabilities the kernel is told by the VM-adminstrator, to enable.
Unfortunately if the VM-administrtor blindly requests to enable it, the
kernel must override it, if it knows that will be switching the VM into
a SVM soon. No point negotiating a capability with Qemu; through CAS,
if it knows it cannot handle that capability.

> 
> > If XIVE is the only supported platform interrupt controller; specified
> > through qemu option "ic-mode=xive", simply abort. Otherwise default to
> > XICS.
> 
> 
> I don't think it is a good approach to downgrade the guest kernel 
> capabilities this way. 
> 
> PAPR has specified the CAS negotiation process for this purpose. It 
> comes in two parts under KVM. First the KVM hypervisor advertises or 
> not a capability to QEMU. The second is the CAS negotiation process 
> between QEMU and the guest OS.

Unfortunately, this is not viable.  At the time the hypervisor
advertises its capabilities to qemu, the hypervisor has no idea whether
that VM will switch into a SVM or not.  The decision to switch into a
SVM is taken by the kernel running in the VM. This happens much later,
after the hypervisor has already conveyed its capabilties to the qemu, and
qemu has than instantiated the VM.

As a result, CAS in prom_init is the only place where this negotiation
can take place.

> 
> The SVM specifications might not be complete yet and if some features 
> are incompatible, I think we should modify the capabilities advertised 
> by the hypervisor : no XIVE in case of SVM. QEMU will automatically 
> use the fallback path and emulate the XIVE device, same as setting 
> 'kernel-irqchip=off'. 

As mentioned above, this would be an excellent approach, if the
Hypervisor was aware of the VM's intent to switch into a SVM.  Neither
the hypervisor knows, nor the qemu.  Only the kernel running within the
VM knows about it.


Do you still think, my approach is wrong?
RP
Cédric Le Goater March 2, 2020, 7:34 a.m. UTC | #3
On 2/29/20 11:51 PM, Ram Pai wrote:
> On Sat, Feb 29, 2020 at 09:27:54AM +0100, Cédric Le Goater wrote:
>> On 2/29/20 8:54 AM, Ram Pai wrote:
>>> XIVE is not correctly enabled for Secure VM in the KVM Hypervisor yet.
>>>
>>> Hence Secure VM, must always default to XICS interrupt controller.
>>
>> have you tried XIVE emulation 'kernel-irqchip=off' ? 
> 
> yes and it hangs. I think that option, continues to enable some variant
> of XIVE in the VM. 

HW is not involved, KVM is not involved anymore and all is emulated at 
the QEMU level in user space. What is the issue ? 

> There are some known deficiencies between KVM
> and the ultravisor negotiation, resulting in a hang in the SVM.

That is something else to investigate. feature/capability negotiation
is the core of the hypervisor stack : 

    OPAL <-> PowerNV <-> KVM <-> QEMU <-> guest OS

>>> If XIVE is requested through kernel command line option "xive=on",
>>> override and turn it off.
>>
>> This is incorrect. It is negotiated through CAS depending on the FW
>> capabilities and the KVM capabilities.
> 
> Yes I understand, qemu/KVM have predetermined a set of capabilties that
> it can offer to the VM.  The kernel within the VM has a list of
> capabilties it needs to operate correctly.  So both negotiate and
> determine something mutually ammicable.
> 
> Here I am talking about the list of capabilities that the kernel is
> trying to determine, it needs to operate correctly.  "xive=on" is one of
> those capabilities the kernel is told by the VM-adminstrator, to enable.

XIVE is not a kernel capability. It's platform support and the default
for P9 is the native exploitation mode which makes full use of the P9
interrupt controller. For non XIVE aware kernels, the hypervisor emulates
the legacy interface on top of XIVE. 

"xive=off" was introduced for distro testing. It skips the negotiation 
process of the XIVE native exploitation mode on the guest. But it's not
a negotiation setting. It's a chicken switch.

> Unfortunately if the VM-administrtor blindly requests to enable it, the
> kernel must override it, if it knows that will be switching the VM into
> a SVM soon. No point negotiating a capability with Qemu; through CAS,
> if it knows it cannot handle that capability.

I don't understand. Are you talking about SVM or XIVE ? 

>>> If XIVE is the only supported platform interrupt controller; specified
>>> through qemu option "ic-mode=xive", simply abort. Otherwise default to
>>> XICS.
>>
>>
>> I don't think it is a good approach to downgrade the guest kernel 
>> capabilities this way. 
>>
>> PAPR has specified the CAS negotiation process for this purpose. It 
>> comes in two parts under KVM. First the KVM hypervisor advertises or 
>> not a capability to QEMU. The second is the CAS negotiation process 
>> between QEMU and the guest OS.
> 
> Unfortunately, this is not viable.  At the time the hypervisor
> advertises its capabilities to qemu, the hypervisor has no idea whether
> that VM will switch into a SVM or not. 

OK, but the hypervisor knows if it can handle 'SVM' guests or not and,
if not, there is no point in advertising a 'SVM' capability to the guest. 

> The decision to switch into a> SVM is taken by the kernel running in the VM. This happens much later,
> after the hypervisor has already conveyed its capabilties to the qemu, and
> qemu has than instantiated the VM.

So you don't have negotiation with the hypervisor ? How does the guest
knows the hypervisor platform can handle SVMs ? try and see if it fails ?
If so, it seems quite broken to me.
 
> As a result, CAS in prom_init is the only place where this negotiation
> can take place.

Euh. I don't follow. This is indeed where CAS is performed and so it's 
*the* place to check that the hypervisor has 'SVM' support ? 

>> The SVM specifications might not be complete yet and if some features 
>> are incompatible, I think we should modify the capabilities advertised 
>> by the hypervisor : no XIVE in case of SVM. QEMU will automatically 
>> use the fallback path and emulate the XIVE device, same as setting 
>> 'kernel-irqchip=off'. 
> 
> As mentioned above, this would be an excellent approach, if the
> Hypervisor was aware of the VM's intent to switch into a SVM. Neither
> the hypervisor knows, nor the qemu.  Only the kernel running within the
> VM knows about it.


The hypervisor (KVM/QEMU) never knows what are the guest OS capabilities
or its intents. That is why there is a negotiation process. 

I would do :

 * OPAL FW advertises 'SVM' support to the Linux PowerNV (through DT) 
 * KVM advertises 'SVM' support to QEMU (extend KVM ioctls)
 * QEMU advertises 'SVM' support to guest OS (through CAS or DT) 
 * Guest OS should not try to use SVM it is not supported. 

If the passthrough of HW pages is not supported by Ultravisor, KVM 
should not advertised XIVE to QEMU which would then use fallback mode.

If emulated XIVE or XICS is not supported by SVM guests, then we have
a problem and we need to understand why ! :) 

And if XIVE is still a problem, then the guest could change the CAS 
request and remove XIVE when SVM is being set. I suppose that we have 
all this information before CAS. Do we ? 

It should be a runtime choice taking into account the full software 
stack rather than a compile choice at the bottom which would impact
all other options. This is not acceptable IMHO.

Cheers,

C.
Greg Kurz March 2, 2020, 8:54 p.m. UTC | #4
On Fri, 28 Feb 2020 23:54:04 -0800
Ram Pai <linuxram@us.ibm.com> wrote:

> XIVE is not correctly enabled for Secure VM in the KVM Hypervisor yet.
> 

What exactly is "not correctly enabled" ?

> Hence Secure VM, must always default to XICS interrupt controller.
> 

So this is a temporary workaround until whatever isn't working with
XIVE and the Secure VM gets fixed. Maybe worth mentioning this in
some comment.

> If XIVE is requested through kernel command line option "xive=on",
> override and turn it off.
> 

There's no such thing as requesting XIVE with "xive=on". XIVE is
on by default if the platform and CPU support it BUT it can be
disabled with "xive=off" in which case the guest wont request
XIVE except if it's the only available mode.

> If XIVE is the only supported platform interrupt controller; specified
> through qemu option "ic-mode=xive", simply abort. Otherwise default to
> XICS.
> 

If XIVE is the only option and the guest requests XICS anyway, QEMU is
supposed to print an error message and terminate:

        if (!spapr->irq->xics) {
            error_report(
"Guest requested unavailable interrupt mode (XICS), either don't set the ic-mode machine property or try ic-mode=xics or ic-mode=dual");
            exit(EXIT_FAILURE);
        }

I think it would be better to end up there rather than aborting.

> Cc: kvm-ppc@vger.kernel.org
> Cc: linuxppc-dev@lists.ozlabs.org
> Cc: Michael Ellerman <mpe@ellerman.id.au>
> Cc: Thiago Jung Bauermann <bauerman@linux.ibm.com>
> Cc: Michael Anderson <andmike@linux.ibm.com>
> Cc: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
> Cc: Alexey Kardashevskiy <aik@ozlabs.ru>
> Cc: Paul Mackerras <paulus@ozlabs.org>
> Cc: Greg Kurz <groug@kaod.org>
> Cc: Cedric Le Goater <clg@fr.ibm.com>
> Cc: David Gibson <david@gibson.dropbear.id.au>
> Signed-off-by: Ram Pai <linuxram@us.ibm.com>
> ---
>  arch/powerpc/kernel/prom_init.c | 43 ++++++++++++++++++++++++++++-------------
>  1 file changed, 30 insertions(+), 13 deletions(-)
> 
> diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c
> index 5773453..dd96c82 100644
> --- a/arch/powerpc/kernel/prom_init.c
> +++ b/arch/powerpc/kernel/prom_init.c
> @@ -805,6 +805,18 @@ static void __init early_cmdline_parse(void)
>  #endif
>  	}
>  
> +#ifdef CONFIG_PPC_SVM
> +	opt = prom_strstr(prom_cmd_line, "svm=");
> +	if (opt) {
> +		bool val;
> +
> +		opt += sizeof("svm=") - 1;
> +		if (!prom_strtobool(opt, &val))
> +			prom_svm_enable = val;
> +		prom_printf("svm =%d\n", prom_svm_enable);
> +	}
> +#endif /* CONFIG_PPC_SVM */
> +
>  #ifdef CONFIG_PPC_PSERIES
>  	prom_radix_disable = !IS_ENABLED(CONFIG_PPC_RADIX_MMU_DEFAULT);
>  	opt = prom_strstr(prom_cmd_line, "disable_radix");
> @@ -823,23 +835,22 @@ static void __init early_cmdline_parse(void)
>  	if (prom_radix_disable)
>  		prom_debug("Radix disabled from cmdline\n");
>  
> -	opt = prom_strstr(prom_cmd_line, "xive=off");
> -	if (opt) {

A comment to explain why we currently need to limit ourselves to using
XICS would be appreciated.

> +#ifdef CONFIG_PPC_SVM
> +	if (prom_svm_enable) {
>  		prom_xive_disable = true;
> -		prom_debug("XIVE disabled from cmdline\n");
> +		prom_debug("XIVE disabled in Secure VM\n");
>  	}
> -#endif /* CONFIG_PPC_PSERIES */
> -
> -#ifdef CONFIG_PPC_SVM
> -	opt = prom_strstr(prom_cmd_line, "svm=");
> -	if (opt) {
> -		bool val;
> +#endif /* CONFIG_PPC_SVM */
>  
> -		opt += sizeof("svm=") - 1;
> -		if (!prom_strtobool(opt, &val))
> -			prom_svm_enable = val;
> +	if (!prom_xive_disable) {
> +		opt = prom_strstr(prom_cmd_line, "xive=off");
> +		if (opt) {
> +			prom_xive_disable = true;
> +			prom_debug("XIVE disabled from cmdline\n");
> +		}
>  	}
> -#endif /* CONFIG_PPC_SVM */
> +
> +#endif /* CONFIG_PPC_PSERIES */
>  }
>  
>  #ifdef CONFIG_PPC_PSERIES
> @@ -1251,6 +1262,12 @@ static void __init prom_parse_xive_model(u8 val,
>  		break;
>  	case OV5_FEAT(OV5_XIVE_EXPLOIT): /* Only Exploitation mode */
>  		prom_debug("XIVE - exploitation mode supported\n");
> +
> +#ifdef CONFIG_PPC_SVM
> +		if (prom_svm_enable)
> +			prom_panic("WARNING: xive unsupported in Secure VM\n");

Change the prom_panic() line into a break. The guest will ask XICS and QEMU
will terminate nicely. Maybe still print out a warning since QEMU won't mention
the Secure VM aspect of things.

> +#endif /* CONFIG_PPC_SVM */
> +
>  		if (prom_xive_disable) {
>  			/*
>  			 * If we __have__ to do XIVE, we're better off ignoring
David Gibson March 2, 2020, 11:32 p.m. UTC | #5
On Fri, Feb 28, 2020 at 11:54:04PM -0800, Ram Pai wrote:
> XIVE is not correctly enabled for Secure VM in the KVM Hypervisor yet.
> 
> Hence Secure VM, must always default to XICS interrupt controller.
> 
> If XIVE is requested through kernel command line option "xive=on",
> override and turn it off.
> 
> If XIVE is the only supported platform interrupt controller; specified
> through qemu option "ic-mode=xive", simply abort. Otherwise default to
> XICS.

Uh... the discussion thread here seems to have gotten oddly off
track.  So, to try to clean up some misunderstandings on both sides:

  1) The guest is the main thing that knows that it will be in secure
     mode, so it's reasonable for it to conditionally use XIVE based
     on that.

  2) The mechanism by which we do it here isn't quite right.  Here the
     guest is checking itself that the host only allows XIVE, but we
     can't do XIVE and is panic()ing.  Instead, in the SVM case we
     should force support->xive to false, and send that in the CAS
     request to the host.  We expect the host to just terminate
     us because of the mismatch, but this will interact better with
     host side options setting policy for panic states and the like.
     Essentially an SVM kernel should behave like an old kernel with
     no XIVE support at all, at least w.r.t. the CAS irq mode flags.

  3) Although there are means by which the hypervisor can kind of know
     a guest is in secure mode, there's not really an "svm=on" option
     on the host side.  For the most part secure mode is based on
     discussion directly between the guest and the ultravisor with
     almost no hypervisor intervention.

  4) I'm guessing the problem with XIVE in SVM mode is that XIVE needs
     to write to event queues in guest memory, which would have to be
     explicitly shared for secure mode.  That's true whether it's KVM
     or qemu accessing the guest memory, so kernel_irqchip=on/off is
     entirely irrelevant.

  5) All the above said, having to use XICS is pretty crappy.  You
     should really get working on XIVE support for secure VMs.
Cédric Le Goater March 3, 2020, 6:50 a.m. UTC | #6
On 3/3/20 12:32 AM, David Gibson wrote:
> On Fri, Feb 28, 2020 at 11:54:04PM -0800, Ram Pai wrote:
>> XIVE is not correctly enabled for Secure VM in the KVM Hypervisor yet.
>>
>> Hence Secure VM, must always default to XICS interrupt controller.
>>
>> If XIVE is requested through kernel command line option "xive=on",
>> override and turn it off.
>>
>> If XIVE is the only supported platform interrupt controller; specified
>> through qemu option "ic-mode=xive", simply abort. Otherwise default to
>> XICS.
> 
> Uh... the discussion thread here seems to have gotten oddly off
> track.  

There seem to be multiple issues. It is difficult to have a clear status.

> So, to try to clean up some misunderstandings on both sides:
> 
>   1) The guest is the main thing that knows that it will be in secure
>      mode, so it's reasonable for it to conditionally use XIVE based
>      on that

FW support is required AFAIUI.

>   2) The mechanism by which we do it here isn't quite right.  Here the
>      guest is checking itself that the host only allows XIVE, but we
>      can't do XIVE and is panic()ing.  Instead, in the SVM case we
>      should force support->xive to false, and send that in the CAS
>      request to the host.  We expect the host to just terminate
>      us because of the mismatch, but this will interact better with
>      host side options setting policy for panic states and the like.
>      Essentially an SVM kernel should behave like an old kernel with
>      no XIVE support at all, at least w.r.t. the CAS irq mode flags.

Yes. XIVE shouldn't be requested by the guest. This is the last option 
I proposed but I thought there was some negotiation with the hypervisor
which is not the case. 

>   3) Although there are means by which the hypervisor can kind of know
>      a guest is in secure mode, there's not really an "svm=on" option
>      on the host side.  For the most part secure mode is based on
>      discussion directly between the guest and the ultravisor with
>      almost no hypervisor intervention.

Is there a negotiation with the ultravisor ? 

>   4) I'm guessing the problem with XIVE in SVM mode is that XIVE needs
>      to write to event queues in guest memory, which would have to be
>      explicitly shared for secure mode.  That's true whether it's KVM
>      or qemu accessing the guest memory, so kernel_irqchip=on/off is
>      entirely irrelevant.

This problem should be already fixed. The XIVE event queues are shared 
and the remaining problem with XIVE is the KVM page fault handler 
populating the TIMA and ESB pages. Ultravisor doesn't seem to support
this feature and this breaks interrupt management in the guest. 

But, kernel_irqchip=off should work out of the box. It seems it doesn't. 
Something to investigate.

> 
>   5) All the above said, having to use XICS is pretty crappy.  You
>      should really get working on XIVE support for secure VMs.

Yes. 

Thanks,

C.
Ram Pai March 3, 2020, 5:02 p.m. UTC | #7
On Tue, Mar 03, 2020 at 07:50:08AM +0100, Cédric Le Goater wrote:
> On 3/3/20 12:32 AM, David Gibson wrote:
> > On Fri, Feb 28, 2020 at 11:54:04PM -0800, Ram Pai wrote:
> >> XIVE is not correctly enabled for Secure VM in the KVM Hypervisor yet.
> >>
> >> Hence Secure VM, must always default to XICS interrupt controller.
> >>
> >> If XIVE is requested through kernel command line option "xive=on",
> >> override and turn it off.
> >>
> >> If XIVE is the only supported platform interrupt controller; specified
> >> through qemu option "ic-mode=xive", simply abort. Otherwise default to
> >> XICS.
> > 
> > Uh... the discussion thread here seems to have gotten oddly off
> > track.  
> 
> There seem to be multiple issues. It is difficult to have a clear status.
> 
> > So, to try to clean up some misunderstandings on both sides:
> > 
> >   1) The guest is the main thing that knows that it will be in secure
> >      mode, so it's reasonable for it to conditionally use XIVE based
> >      on that
> 
> FW support is required AFAIUI.
> >   2) The mechanism by which we do it here isn't quite right.  Here the
> >      guest is checking itself that the host only allows XIVE, but we
> >      can't do XIVE and is panic()ing.  Instead, in the SVM case we
> >      should force support->xive to false, and send that in the CAS
> >      request to the host.  We expect the host to just terminate
> >      us because of the mismatch, but this will interact better with
> >      host side options setting policy for panic states and the like.
> >      Essentially an SVM kernel should behave like an old kernel with
> >      no XIVE support at all, at least w.r.t. the CAS irq mode flags.
> 
> Yes. XIVE shouldn't be requested by the guest.

	Ok.

> This is the last option 
> I proposed but I thought there was some negotiation with the hypervisor
> which is not the case. 
> 
> >   3) Although there are means by which the hypervisor can kind of know
> >      a guest is in secure mode, there's not really an "svm=on" option
> >      on the host side.  For the most part secure mode is based on
> >      discussion directly between the guest and the ultravisor with
> >      almost no hypervisor intervention.
> 
> Is there a negotiation with the ultravisor ? 

	The VM has no negotiation with the ultravisor w.r.t CAS.

> 
> >   4) I'm guessing the problem with XIVE in SVM mode is that XIVE needs
> >      to write to event queues in guest memory, which would have to be
> >      explicitly shared for secure mode.  That's true whether it's KVM
> >      or qemu accessing the guest memory, so kernel_irqchip=on/off is
> >      entirely irrelevant.
> 
> This problem should be already fixed.
> The XIVE event queues are shared 
 	
Yes i have a patch for the guest kernel that shares the event 
queue page with the hypervisor. This is done using the
UV_SHARE_PAGE ultracall. This patch is not sent out to any any mailing
lists yet. However the patch by itself does not solve the xive problem
for secure VM.

> and the remaining problem with XIVE is the KVM page fault handler 
> populating the TIMA and ESB pages. Ultravisor doesn't seem to support
> this feature and this breaks interrupt management in the guest. 

Yes. This is the bigger issue that needs to be fixed. When the secure guest
accesses the page associated with the xive memslot, a page fault is
generated, which the ultravisor reflects to the hypervisor. Hypervisor
seems to be mapping Hardware-page to that GPA. Unforatunately it is not
informing the ultravisor of that map.  I am trying to understand the
root cause. But since I am not sure what more issues I might run into
after chasing down that issue, I figured its better to disable xive
support in SVM in the interim.

**** BTW: I figured, I dont need this intermin patch to disable xive for
secure VM.  Just doing "svm=on xive=off" on the kernel command line is
sufficient for now. *****


> 
> But, kernel_irqchip=off should work out of the box. It seems it doesn't. 
> Something to investigate.

Dont know why. 

Does this option, disable the chip from interrupting the
guest directly; instead mediates the interrupt through the hypervisor?

> 
> > 
> >   5) All the above said, having to use XICS is pretty crappy.  You
> >      should really get working on XIVE support for secure VMs.
> 
> Yes. 

and yes too.


Summary:  I am dropping this patch for now.

> 
> Thanks,
> 
> C.
Greg Kurz March 3, 2020, 5:45 p.m. UTC | #8
On Tue, 3 Mar 2020 09:02:05 -0800
Ram Pai <linuxram@us.ibm.com> wrote:

> On Tue, Mar 03, 2020 at 07:50:08AM +0100, Cédric Le Goater wrote:
> > On 3/3/20 12:32 AM, David Gibson wrote:
> > > On Fri, Feb 28, 2020 at 11:54:04PM -0800, Ram Pai wrote:
> > >> XIVE is not correctly enabled for Secure VM in the KVM Hypervisor yet.
> > >>
> > >> Hence Secure VM, must always default to XICS interrupt controller.
> > >>
> > >> If XIVE is requested through kernel command line option "xive=on",
> > >> override and turn it off.
> > >>
> > >> If XIVE is the only supported platform interrupt controller; specified
> > >> through qemu option "ic-mode=xive", simply abort. Otherwise default to
> > >> XICS.
> > > 
> > > Uh... the discussion thread here seems to have gotten oddly off
> > > track.  
> > 
> > There seem to be multiple issues. It is difficult to have a clear status.
> > 
> > > So, to try to clean up some misunderstandings on both sides:
> > > 
> > >   1) The guest is the main thing that knows that it will be in secure
> > >      mode, so it's reasonable for it to conditionally use XIVE based
> > >      on that
> > 
> > FW support is required AFAIUI.
> > >   2) The mechanism by which we do it here isn't quite right.  Here the
> > >      guest is checking itself that the host only allows XIVE, but we
> > >      can't do XIVE and is panic()ing.  Instead, in the SVM case we
> > >      should force support->xive to false, and send that in the CAS
> > >      request to the host.  We expect the host to just terminate
> > >      us because of the mismatch, but this will interact better with
> > >      host side options setting policy for panic states and the like.
> > >      Essentially an SVM kernel should behave like an old kernel with
> > >      no XIVE support at all, at least w.r.t. the CAS irq mode flags.
> > 
> > Yes. XIVE shouldn't be requested by the guest.
> 
> 	Ok.
> 
> > This is the last option 
> > I proposed but I thought there was some negotiation with the hypervisor
> > which is not the case. 
> > 
> > >   3) Although there are means by which the hypervisor can kind of know
> > >      a guest is in secure mode, there's not really an "svm=on" option
> > >      on the host side.  For the most part secure mode is based on
> > >      discussion directly between the guest and the ultravisor with
> > >      almost no hypervisor intervention.
> > 
> > Is there a negotiation with the ultravisor ? 
> 
> 	The VM has no negotiation with the ultravisor w.r.t CAS.
> 
> > 
> > >   4) I'm guessing the problem with XIVE in SVM mode is that XIVE needs
> > >      to write to event queues in guest memory, which would have to be
> > >      explicitly shared for secure mode.  That's true whether it's KVM
> > >      or qemu accessing the guest memory, so kernel_irqchip=on/off is
> > >      entirely irrelevant.
> > 
> > This problem should be already fixed.
> > The XIVE event queues are shared 
>  	
> Yes i have a patch for the guest kernel that shares the event 
> queue page with the hypervisor. This is done using the
> UV_SHARE_PAGE ultracall. This patch is not sent out to any any mailing
> lists yet.

Why ?

> However the patch by itself does not solve the xive problem
> for secure VM.
> 

This patch would allow at least to answer Cedric's question about
kernel_irqchip=off, since this looks like the only thing needed
to make it work.

> > and the remaining problem with XIVE is the KVM page fault handler 
> > populating the TIMA and ESB pages. Ultravisor doesn't seem to support
> > this feature and this breaks interrupt management in the guest. 
> 
> Yes. This is the bigger issue that needs to be fixed. When the secure guest
> accesses the page associated with the xive memslot, a page fault is
> generated, which the ultravisor reflects to the hypervisor. Hypervisor
> seems to be mapping Hardware-page to that GPA. Unforatunately it is not
> informing the ultravisor of that map.  I am trying to understand the
> root cause. But since I am not sure what more issues I might run into
> after chasing down that issue, I figured its better to disable xive
> support in SVM in the interim.
> 
> **** BTW: I figured, I dont need this intermin patch to disable xive for
> secure VM.  Just doing "svm=on xive=off" on the kernel command line is
> sufficient for now. *****
> 

No it is not. If the hypervisor doesn't propose XIVE (ie. ic-mode=xive
on the QEMU command line), the kernel simply ignores "xive=off".

> 
> > 
> > But, kernel_irqchip=off should work out of the box. It seems it doesn't. 
> > Something to investigate.
> 
> Dont know why. 
> 
> Does this option, disable the chip from interrupting the
> guest directly; instead mediates the interrupt through the hypervisor?
> 
> > 
> > > 
> > >   5) All the above said, having to use XICS is pretty crappy.  You
> > >      should really get working on XIVE support for secure VMs.
> > 
> > Yes. 
> 
> and yes too.
> 
> 
> Summary:  I am dropping this patch for now.
> 
> > 
> > Thanks,
> > 
> > C.
>
Ram Pai March 3, 2020, 6:56 p.m. UTC | #9
On Tue, Mar 03, 2020 at 06:45:20PM +0100, Greg Kurz wrote:
> On Tue, 3 Mar 2020 09:02:05 -0800
> Ram Pai <linuxram@us.ibm.com> wrote:
> 
> > On Tue, Mar 03, 2020 at 07:50:08AM +0100, Cédric Le Goater wrote:
> > > On 3/3/20 12:32 AM, David Gibson wrote:
> > > > On Fri, Feb 28, 2020 at 11:54:04PM -0800, Ram Pai wrote:
> > > >> XIVE is not correctly enabled for Secure VM in the KVM Hypervisor yet.
> > > >>
> > > >> Hence Secure VM, must always default to XICS interrupt controller.
> > > >>
> > > >> If XIVE is requested through kernel command line option "xive=on",
> > > >> override and turn it off.
> > > >>
> > > >> If XIVE is the only supported platform interrupt controller; specified
> > > >> through qemu option "ic-mode=xive", simply abort. Otherwise default to
> > > >> XICS.
> > > > 
> > > > Uh... the discussion thread here seems to have gotten oddly off
> > > > track.  
> > > 
> > > There seem to be multiple issues. It is difficult to have a clear status.
> > > 
> > > > So, to try to clean up some misunderstandings on both sides:
> > > > 
> > > >   1) The guest is the main thing that knows that it will be in secure
> > > >      mode, so it's reasonable for it to conditionally use XIVE based
> > > >      on that
> > > 
> > > FW support is required AFAIUI.
> > > >   2) The mechanism by which we do it here isn't quite right.  Here the
> > > >      guest is checking itself that the host only allows XIVE, but we
> > > >      can't do XIVE and is panic()ing.  Instead, in the SVM case we
> > > >      should force support->xive to false, and send that in the CAS
> > > >      request to the host.  We expect the host to just terminate
> > > >      us because of the mismatch, but this will interact better with
> > > >      host side options setting policy for panic states and the like.
> > > >      Essentially an SVM kernel should behave like an old kernel with
> > > >      no XIVE support at all, at least w.r.t. the CAS irq mode flags.
> > > 
> > > Yes. XIVE shouldn't be requested by the guest.
> > 
> > 	Ok.
> > 
> > > This is the last option 
> > > I proposed but I thought there was some negotiation with the hypervisor
> > > which is not the case. 
> > > 
> > > >   3) Although there are means by which the hypervisor can kind of know
> > > >      a guest is in secure mode, there's not really an "svm=on" option
> > > >      on the host side.  For the most part secure mode is based on
> > > >      discussion directly between the guest and the ultravisor with
> > > >      almost no hypervisor intervention.
> > > 
> > > Is there a negotiation with the ultravisor ? 
> > 
> > 	The VM has no negotiation with the ultravisor w.r.t CAS.
> > 
> > > 
> > > >   4) I'm guessing the problem with XIVE in SVM mode is that XIVE needs
> > > >      to write to event queues in guest memory, which would have to be
> > > >      explicitly shared for secure mode.  That's true whether it's KVM
> > > >      or qemu accessing the guest memory, so kernel_irqchip=on/off is
> > > >      entirely irrelevant.
> > > 
> > > This problem should be already fixed.
> > > The XIVE event queues are shared 
> >  	
> > Yes i have a patch for the guest kernel that shares the event 
> > queue page with the hypervisor. This is done using the
> > UV_SHARE_PAGE ultracall. This patch is not sent out to any any mailing
> > lists yet.
> 
> Why ?

At this point I am not sure if this is the only change, I need to the
guest kernel.  I also need changes to KVM and to the ultravisor. Its bit
premature to send the patch without having figured out everything
to get xive working on a Secure VM.

> 
> > However the patch by itself does not solve the xive problem
> > for secure VM.
> > 
> 
> This patch would allow at least to answer Cedric's question about
> kernel_irqchip=off, since this looks like the only thing needed
> to make it work.

hmm.. I am not sure. Are you saying
(a) patch the guest kernel to share the event queue page
(b) run the qemu with "kernel_irqchip=off"
(c) and the guest kernel with "svm=on"

and it should all work?

RP
Cédric Le Goater March 3, 2020, 7:08 p.m. UTC | #10
>>>   4) I'm guessing the problem with XIVE in SVM mode is that XIVE needs
>>>      to write to event queues in guest memory, which would have to be
>>>      explicitly shared for secure mode.  That's true whether it's KVM
>>>      or qemu accessing the guest memory, so kernel_irqchip=on/off is
>>>      entirely irrelevant.
>>
>> This problem should be already fixed.
>> The XIVE event queues are shared 
>  	
> Yes i have a patch for the guest kernel that shares the event 
> queue page with the hypervisor. This is done using the
> UV_SHARE_PAGE ultracall. This patch is not sent out to any any mailing
> lists yet. However the patch by itself does not solve the xive problem
> for secure VM.

yes because you also need to share the XIVE TIMA and ESB pages mapped 
in xive_native_esb_fault() and xive_native_tima_fault(). 

>> and the remaining problem with XIVE is the KVM page fault handler 
>> populating the TIMA and ESB pages. Ultravisor doesn't seem to support
>> this feature and this breaks interrupt management in the guest. 
> 
> Yes. This is the bigger issue that needs to be fixed. When the secure guest
> accesses the page associated with the xive memslot, a page fault is
> generated, which the ultravisor reflects to the hypervisor. Hypervisor
> seems to be mapping Hardware-page to that GPA. Unforatunately it is not
> informing the ultravisor of that map.  I am trying to understand the
> root cause. But since I am not sure what more issues I might run into
> after chasing down that issue, I figured its better to disable xive
> support in SVM in the interim.

Is it possible to call uv_share_page() from the hypervisor ? 

> **** BTW: I figured, I dont need this intermin patch to disable xive for
> secure VM.  Just doing "svm=on xive=off" on the kernel command line is
> sufficient for now. *****

Yes. 

>> But, kernel_irqchip=off should work out of the box. It seems it doesn't. 
>> Something to investigate.
> 
> Dont know why. 

We need to understand why. 

You still need the patch to share the event queue page allocated by the 
guest OS because QEMU will enqueue events. But you should not need anything
else.

> Does this option, disable the chip from interrupting the
> guest directly; instead mediates the interrupt through the hypervisor?

Yes. The KVM backend is unused, the XIVE interrupt controller is deactivated
for the guest and QEMU notifies the vCPUs directly.  

The TIMA and ESB pages belong the QEMU process and the guest OS will do 
some load and store operations onto them for interrupt management. Is that 
OK from a UV perspective ?  

Thanks,

C.
Cédric Le Goater March 3, 2020, 7:18 p.m. UTC | #11
>> **** BTW: I figured, I dont need this intermin patch to disable xive for
>> secure VM.  Just doing "svm=on xive=off" on the kernel command line is
>> sufficient for now. *****
>>
> 
> No it is not. If the hypervisor doesn't propose XIVE (ie. ic-mode=xive
> on the QEMU command line), the kernel simply ignores "xive=off".

If I am correct, with the option ic-mode=xive, the hypervisor will 
propose only 'xive' in OV5 and not both 'xive' and 'xics'. But the
result is the same because xive can not be turned off and "xive=off" 
is ignored.

Anyway, it's not the most common case of usage of the QEMU command
like. I think it's OK to use "xive=off" on the kernel command line 
for now.

C.
Ram Pai March 3, 2020, 8:29 p.m. UTC | #12
On Tue, Mar 03, 2020 at 08:08:51PM +0100, Cédric Le Goater wrote:
> >>>   4) I'm guessing the problem with XIVE in SVM mode is that XIVE needs
> >>>      to write to event queues in guest memory, which would have to be
> >>>      explicitly shared for secure mode.  That's true whether it's KVM
> >>>      or qemu accessing the guest memory, so kernel_irqchip=on/off is
> >>>      entirely irrelevant.
> >>
> >> This problem should be already fixed.
> >> The XIVE event queues are shared 
> >  	
> > Yes i have a patch for the guest kernel that shares the event 
> > queue page with the hypervisor. This is done using the
> > UV_SHARE_PAGE ultracall. This patch is not sent out to any any mailing
> > lists yet. However the patch by itself does not solve the xive problem
> > for secure VM.
> 
> yes because you also need to share the XIVE TIMA and ESB pages mapped 
> in xive_native_esb_fault() and xive_native_tima_fault(). 

These pages belong to the xive memory slot right? If that is the case,
they are implicitly shared. The Ultravisor will set them up to be
shared. The guest kernel should not be doing anything.

We still need some fixes in KVM and Ultravisor to correctly map the
hardware pages to GPA ranges of the xive memory slot. Work is in progress...



> 
> >> and the remaining problem with XIVE is the KVM page fault handler 
> >> populating the TIMA and ESB pages. Ultravisor doesn't seem to support
> >> this feature and this breaks interrupt management in the guest. 
> > 
> > Yes. This is the bigger issue that needs to be fixed. When the secure guest
> > accesses the page associated with the xive memslot, a page fault is
> > generated, which the ultravisor reflects to the hypervisor. Hypervisor
> > seems to be mapping Hardware-page to that GPA. Unforatunately it is not
> > informing the ultravisor of that map.  I am trying to understand the
> > root cause. But since I am not sure what more issues I might run into
> > after chasing down that issue, I figured its better to disable xive
> > support in SVM in the interim.
> 
> Is it possible to call uv_share_page() from the hypervisor ? 

No. Not allowed. If allowed hypervisor can easily attack the SVM.

> 
> > **** BTW: I figured, I dont need this intermin patch to disable xive for
> > secure VM.  Just doing "svm=on xive=off" on the kernel command line is
> > sufficient for now. *****
> 
> Yes. 
> 
> >> But, kernel_irqchip=off should work out of the box. It seems it doesn't. 
> >> Something to investigate.
> > 
> > Dont know why. 
> 
> We need to understand why. 
> 
> You still need the patch to share the event queue page allocated by the 
> guest OS because QEMU will enqueue events. But you should not need anything
> else.

ok. that is assuring.

> 
> > Does this option, disable the chip from interrupting the
> > guest directly; instead mediates the interrupt through the hypervisor?
> 
> Yes. The KVM backend is unused, the XIVE interrupt controller is deactivated
> for the guest and QEMU notifies the vCPUs directly.  
> 
> The TIMA and ESB pages belong the QEMU process and the guest OS will do 
> some load and store operations onto them for interrupt management. Is that 
> OK from a UV perspective ?  

Yes. These GPA ranges are needed; by design, to be read/writable from qemu/KVM and
the SVM. Just that the implementation in its current form, needs some
fixing.

RP
Greg Kurz March 4, 2020, 8:34 a.m. UTC | #13
On Tue, 3 Mar 2020 20:18:18 +0100
Cédric Le Goater <clg@kaod.org> wrote:

> >> **** BTW: I figured, I dont need this intermin patch to disable xive for
> >> secure VM.  Just doing "svm=on xive=off" on the kernel command line is
> >> sufficient for now. *****
> >>
> > 
> > No it is not. If the hypervisor doesn't propose XIVE (ie. ic-mode=xive
> > on the QEMU command line), the kernel simply ignores "xive=off".
> 

Ah... sorry for the typo... "doesn't propose XICS" of course :)

> If I am correct, with the option ic-mode=xive, the hypervisor will 
> propose only 'xive' in OV5 and not both 'xive' and 'xics'. But the
> result is the same because xive can not be turned off and "xive=off" 
> is ignored.
> 
> Anyway, it's not the most common case of usage of the QEMU command
> like. I think it's OK to use "xive=off" on the kernel command line 
> for now.
> 

Sure, I just wanted to make things clear. Like you said it's a chicken
switch introduced for distro testing. I think it should not be used
to do anything else. If "svm=1" needs to enforce supported.xive == false
as a temporary workaround, it should do it explicitly.

> C.
Greg Kurz March 4, 2020, 10:59 a.m. UTC | #14
On Tue, 3 Mar 2020 10:56:45 -0800
Ram Pai <linuxram@us.ibm.com> wrote:

> On Tue, Mar 03, 2020 at 06:45:20PM +0100, Greg Kurz wrote:
> > On Tue, 3 Mar 2020 09:02:05 -0800
> > Ram Pai <linuxram@us.ibm.com> wrote:
> > 
> > > On Tue, Mar 03, 2020 at 07:50:08AM +0100, Cédric Le Goater wrote:
> > > > On 3/3/20 12:32 AM, David Gibson wrote:
> > > > > On Fri, Feb 28, 2020 at 11:54:04PM -0800, Ram Pai wrote:
> > > > >> XIVE is not correctly enabled for Secure VM in the KVM Hypervisor yet.
> > > > >>
> > > > >> Hence Secure VM, must always default to XICS interrupt controller.
> > > > >>
> > > > >> If XIVE is requested through kernel command line option "xive=on",
> > > > >> override and turn it off.
> > > > >>
> > > > >> If XIVE is the only supported platform interrupt controller; specified
> > > > >> through qemu option "ic-mode=xive", simply abort. Otherwise default to
> > > > >> XICS.
> > > > > 
> > > > > Uh... the discussion thread here seems to have gotten oddly off
> > > > > track.  
> > > > 
> > > > There seem to be multiple issues. It is difficult to have a clear status.
> > > > 
> > > > > So, to try to clean up some misunderstandings on both sides:
> > > > > 
> > > > >   1) The guest is the main thing that knows that it will be in secure
> > > > >      mode, so it's reasonable for it to conditionally use XIVE based
> > > > >      on that
> > > > 
> > > > FW support is required AFAIUI.
> > > > >   2) The mechanism by which we do it here isn't quite right.  Here the
> > > > >      guest is checking itself that the host only allows XIVE, but we
> > > > >      can't do XIVE and is panic()ing.  Instead, in the SVM case we
> > > > >      should force support->xive to false, and send that in the CAS
> > > > >      request to the host.  We expect the host to just terminate
> > > > >      us because of the mismatch, but this will interact better with
> > > > >      host side options setting policy for panic states and the like.
> > > > >      Essentially an SVM kernel should behave like an old kernel with
> > > > >      no XIVE support at all, at least w.r.t. the CAS irq mode flags.
> > > > 
> > > > Yes. XIVE shouldn't be requested by the guest.
> > > 
> > > 	Ok.
> > > 
> > > > This is the last option 
> > > > I proposed but I thought there was some negotiation with the hypervisor
> > > > which is not the case. 
> > > > 
> > > > >   3) Although there are means by which the hypervisor can kind of know
> > > > >      a guest is in secure mode, there's not really an "svm=on" option
> > > > >      on the host side.  For the most part secure mode is based on
> > > > >      discussion directly between the guest and the ultravisor with
> > > > >      almost no hypervisor intervention.
> > > > 
> > > > Is there a negotiation with the ultravisor ? 
> > > 
> > > 	The VM has no negotiation with the ultravisor w.r.t CAS.
> > > 
> > > > 
> > > > >   4) I'm guessing the problem with XIVE in SVM mode is that XIVE needs
> > > > >      to write to event queues in guest memory, which would have to be
> > > > >      explicitly shared for secure mode.  That's true whether it's KVM
> > > > >      or qemu accessing the guest memory, so kernel_irqchip=on/off is
> > > > >      entirely irrelevant.
> > > > 
> > > > This problem should be already fixed.
> > > > The XIVE event queues are shared 
> > >  	
> > > Yes i have a patch for the guest kernel that shares the event 
> > > queue page with the hypervisor. This is done using the
> > > UV_SHARE_PAGE ultracall. This patch is not sent out to any any mailing
> > > lists yet.
> > 
> > Why ?
> 
> At this point I am not sure if this is the only change, I need to the
> guest kernel.

Maybe but we're already sure that this change is needed. I don't really see
the point in holding this any longer.

>  I also need changes to KVM and to the ultravisor. Its bit
> premature to send the patch without having figured out everything
> to get xive working on a Secure VM.
> 

I'm a bit confused... why did you send this workaround patch in
the first place then ? I mean, this raises a concern and we're
just trying to move forward.

> > 
> > > However the patch by itself does not solve the xive problem
> > > for secure VM.
> > > 
> > 
> > This patch would allow at least to answer Cedric's question about
> > kernel_irqchip=off, since this looks like the only thing needed
> > to make it work.
> 
> hmm.. I am not sure. Are you saying
> (a) patch the guest kernel to share the event queue page
> (b) run the qemu with "kernel_irqchip=off"
> (c) and the guest kernel with "svm=on"
> 
> and it should all work?
> 

Yes.

> RP
>
Ram Pai March 4, 2020, 3:13 p.m. UTC | #15
On Wed, Mar 04, 2020 at 11:59:48AM +0100, Greg Kurz wrote:
> On Tue, 3 Mar 2020 10:56:45 -0800
> Ram Pai <linuxram@us.ibm.com> wrote:
> 
> > On Tue, Mar 03, 2020 at 06:45:20PM +0100, Greg Kurz wrote:
> > > On Tue, 3 Mar 2020 09:02:05 -0800
> > > Ram Pai <linuxram@us.ibm.com> wrote:
> > > 
> > > > On Tue, Mar 03, 2020 at 07:50:08AM +0100, Cédric Le Goater wrote:
> > > > > On 3/3/20 12:32 AM, David Gibson wrote:
> > > > > > On Fri, Feb 28, 2020 at 11:54:04PM -0800, Ram Pai wrote:
> > > > > >> XIVE is not correctly enabled for Secure VM in the KVM Hypervisor yet.
> > > > > >>
> > > > > >> Hence Secure VM, must always default to XICS interrupt controller.
> > > > > >>
> > > > > >> If XIVE is requested through kernel command line option "xive=on",
> > > > > >> override and turn it off.
> > > > > >>
> > > > > >> If XIVE is the only supported platform interrupt controller; specified
> > > > > >> through qemu option "ic-mode=xive", simply abort. Otherwise default to
> > > > > >> XICS.
> > > > > > 
> > > > > > Uh... the discussion thread here seems to have gotten oddly off
> > > > > > track.  
> > > > > 
> > > > > There seem to be multiple issues. It is difficult to have a clear status.
> > > > > 
> > > > > > So, to try to clean up some misunderstandings on both sides:
> > > > > > 
> > > > > >   1) The guest is the main thing that knows that it will be in secure
> > > > > >      mode, so it's reasonable for it to conditionally use XIVE based
> > > > > >      on that
> > > > > 
> > > > > FW support is required AFAIUI.
> > > > > >   2) The mechanism by which we do it here isn't quite right.  Here the
> > > > > >      guest is checking itself that the host only allows XIVE, but we
> > > > > >      can't do XIVE and is panic()ing.  Instead, in the SVM case we
> > > > > >      should force support->xive to false, and send that in the CAS
> > > > > >      request to the host.  We expect the host to just terminate
> > > > > >      us because of the mismatch, but this will interact better with
> > > > > >      host side options setting policy for panic states and the like.
> > > > > >      Essentially an SVM kernel should behave like an old kernel with
> > > > > >      no XIVE support at all, at least w.r.t. the CAS irq mode flags.
> > > > > 
> > > > > Yes. XIVE shouldn't be requested by the guest.
> > > > 
> > > > 	Ok.
> > > > 
> > > > > This is the last option 
> > > > > I proposed but I thought there was some negotiation with the hypervisor
> > > > > which is not the case. 
> > > > > 
> > > > > >   3) Although there are means by which the hypervisor can kind of know
> > > > > >      a guest is in secure mode, there's not really an "svm=on" option
> > > > > >      on the host side.  For the most part secure mode is based on
> > > > > >      discussion directly between the guest and the ultravisor with
> > > > > >      almost no hypervisor intervention.
> > > > > 
> > > > > Is there a negotiation with the ultravisor ? 
> > > > 
> > > > 	The VM has no negotiation with the ultravisor w.r.t CAS.
> > > > 
> > > > > 
> > > > > >   4) I'm guessing the problem with XIVE in SVM mode is that XIVE needs
> > > > > >      to write to event queues in guest memory, which would have to be
> > > > > >      explicitly shared for secure mode.  That's true whether it's KVM
> > > > > >      or qemu accessing the guest memory, so kernel_irqchip=on/off is
> > > > > >      entirely irrelevant.
> > > > > 
> > > > > This problem should be already fixed.
> > > > > The XIVE event queues are shared 
> > > >  	
> > > > Yes i have a patch for the guest kernel that shares the event 
> > > > queue page with the hypervisor. This is done using the
> > > > UV_SHARE_PAGE ultracall. This patch is not sent out to any any mailing
> > > > lists yet.
> > > 
> > > Why ?
> > 
> > At this point I am not sure if this is the only change, I need to the
> > guest kernel.
> 
> Maybe but we're already sure that this change is needed. I don't really see
> the point in holding this any longer.
> 
> >  I also need changes to KVM and to the ultravisor. Its bit
> > premature to send the patch without having figured out everything
> > to get xive working on a Secure VM.
> > 
> 
> I'm a bit confused... why did you send this workaround patch in
> the first place then ? I mean, this raises a concern and we're
> just trying to move forward.

The upstream kernel in its current form, will hang as of today if qemu
has 'ic-mode=xive'. The kernel hangs without giving any indication as to
'why?'. This is a bad state to be in.

This patch was a temporary solution. It is there to inform the user, not
to use 'xive' in secureVM. The user is atleast informed/guided, instead
of lost/confused.

The permanent solution, is to fix the problem in KVM and ultravisor,
along with sharing the EQ-page in the SVM, and get a holistic solution
in place. But that will take time, and may not happen by the time 5.6
releases.

RP
Ram Pai March 4, 2020, 3:37 p.m. UTC | #16
On Wed, Mar 04, 2020 at 11:59:48AM +0100, Greg Kurz wrote:
> On Tue, 3 Mar 2020 10:56:45 -0800
> Ram Pai <linuxram@us.ibm.com> wrote:
> 
> > On Tue, Mar 03, 2020 at 06:45:20PM +0100, Greg Kurz wrote:
....snip.
> > > 
> > > This patch would allow at least to answer Cedric's question about
> > > kernel_irqchip=off, since this looks like the only thing needed
> > > to make it work.
> > 
> > hmm.. I am not sure. Are you saying
> > (a) patch the guest kernel to share the event queue page
> > (b) run the qemu with "kernel_irqchip=off"
> > (c) and the guest kernel with "svm=on"
> > 
> > and it should all work?
> > 
> 
> Yes.

Ok. 

(1) applied the patch which shares the EQ-page with the hypervisor.
(2) set "kernel_irqchip=off"
(3) set "ic-mode=xive"
(4) set "svm=on" on the kernel command line.
(5) no changes to the hypervisor and ultravisor.

And Boom it works!.   So you were right.


I am sending out the patch for (1) above ASAP.

Thanks,
RP
Cédric Le Goater March 4, 2020, 3:56 p.m. UTC | #17
[ ... ]

> (1) applied the patch which shares the EQ-page with the hypervisor.
> (2) set "kernel_irqchip=off"
> (3) set "ic-mode=xive"

you don't have to set the interrupt mode. xive should be negotiated
by default.

> (4) set "svm=on" on the kernel command line.
> (5) no changes to the hypervisor and ultravisor.
> 
> And Boom it works!.   So you were right.

Excellent.
 
> I am sending out the patch for (1) above ASAP.

Next step, could you please try to do the same with the TIMA and ESB pfn ?
and use KVM. 

Thanks,

C.
David Gibson March 4, 2020, 11:55 p.m. UTC | #18
On Wed, Mar 04, 2020 at 04:56:09PM +0100, Cédric Le Goater wrote:
> [ ... ]
> 
> > (1) applied the patch which shares the EQ-page with the hypervisor.
> > (2) set "kernel_irqchip=off"
> > (3) set "ic-mode=xive"
> 
> you don't have to set the interrupt mode. xive should be negotiated
> by default.
> 
> > (4) set "svm=on" on the kernel command line.
> > (5) no changes to the hypervisor and ultravisor.
> > 
> > And Boom it works!.   So you were right.
> 
> Excellent.
>  
> > I am sending out the patch for (1) above ASAP.
> 
> Next step, could you please try to do the same with the TIMA and ESB pfn ?
> and use KVM.

I'm a bit confused by this.  Aren't the TIMA and ESB pages essentially
IO pages, rather than memory pages from the guest's point of view?  I
assume only memory pages are protected with PEF - I can't even really
see what protecting an IO page would even mean.
Cédric Le Goater March 5, 2020, 7:15 a.m. UTC | #19
On 3/5/20 12:55 AM, David Gibson wrote:
> On Wed, Mar 04, 2020 at 04:56:09PM +0100, Cédric Le Goater wrote:
>> [ ... ]
>>
>>> (1) applied the patch which shares the EQ-page with the hypervisor.
>>> (2) set "kernel_irqchip=off"
>>> (3) set "ic-mode=xive"
>>
>> you don't have to set the interrupt mode. xive should be negotiated
>> by default.
>>
>>> (4) set "svm=on" on the kernel command line.
>>> (5) no changes to the hypervisor and ultravisor.
>>>
>>> And Boom it works!.   So you were right.
>>
>> Excellent.
>>  
>>> I am sending out the patch for (1) above ASAP.
>>
>> Next step, could you please try to do the same with the TIMA and ESB pfn ?
>> and use KVM.
> 
> I'm a bit confused by this.  Aren't the TIMA and ESB pages essentially
> IO pages, rather than memory pages from the guest's point of view?

yes. 

> I assume only memory pages are protected with PEF - I can't even really
> see what protecting an IO page would even mean.

AFAIUI, the ultravisor needs to be aware of these IO page frames. We have 
the information in KVM but we need to inform the ultravisor in some ways.
It could be done earlier than in the page fault handler.

C.
Cédric Le Goater March 5, 2020, 11:41 a.m. UTC | #20
[ ... ] 

>> yes because you also need to share the XIVE TIMA and ESB pages mapped 
>> in xive_native_esb_fault() and xive_native_tima_fault(). 
> 
> These pages belong to the xive memory slot right? If that is the case,
> they are implicitly shared. The Ultravisor will set them up to be
> shared. The guest kernel should not be doing anything.
> 
> We still need some fixes in KVM and Ultravisor to correctly map the
> hardware pages to GPA ranges of the xive memory slot. Work is in progress...

ok. Since this is already done for KVM, I suppose it's not too hard. 
the VMA has VM_IO | VM_PFNMAP flags.

Otherwise you could still pick up the XIVE ESB and TIMA HW MMIO ranges 
in OPAL and brutally declare the whole as shared, if that's possible.

C.
Ram Pai March 5, 2020, 3:15 p.m. UTC | #21
On Thu, Mar 05, 2020 at 10:55:45AM +1100, David Gibson wrote:
> On Wed, Mar 04, 2020 at 04:56:09PM +0100, Cédric Le Goater wrote:
> > [ ... ]
> > 
> > > (1) applied the patch which shares the EQ-page with the hypervisor.
> > > (2) set "kernel_irqchip=off"
> > > (3) set "ic-mode=xive"
> > 
> > you don't have to set the interrupt mode. xive should be negotiated
> > by default.
> > 
> > > (4) set "svm=on" on the kernel command line.
> > > (5) no changes to the hypervisor and ultravisor.
> > > 
> > > And Boom it works!.   So you were right.
> > 
> > Excellent.
> >  
> > > I am sending out the patch for (1) above ASAP.
> > 
> > Next step, could you please try to do the same with the TIMA and ESB pfn ?
> > and use KVM.
> 
> I'm a bit confused by this.  Aren't the TIMA and ESB pages essentially
> IO pages, rather than memory pages from the guest's point of view?  I
> assume only memory pages are protected with PEF - I can't even really
> see what protecting an IO page would even mean.

It means, that the hypervisor and qemu cannot access the addresses used
to access the I/O pages. It can only be accessed by Ultravisor and the
SVM.

As it stands today, those pages are accessible from the hypervisor
and not from the SVM or the ultravisor.

To make it work, we need to enable acccess to those pages from the SVM
and from the ultravisor.  One thing I am not clear is should we block
access to those pages from the hypervisor.  If yes, than there is no
good way to do that, without hardware help.  If no, than those GPA pages
can be shared, so that hypervisor/ultravisor/qemu/SVM can all access
those pages.

RP
Cédric Le Goater March 5, 2020, 3:36 p.m. UTC | #22
On 3/5/20 4:15 PM, Ram Pai wrote:
> On Thu, Mar 05, 2020 at 10:55:45AM +1100, David Gibson wrote:
>> On Wed, Mar 04, 2020 at 04:56:09PM +0100, Cédric Le Goater wrote:
>>> [ ... ]
>>>
>>>> (1) applied the patch which shares the EQ-page with the hypervisor.
>>>> (2) set "kernel_irqchip=off"
>>>> (3) set "ic-mode=xive"
>>>
>>> you don't have to set the interrupt mode. xive should be negotiated
>>> by default.
>>>
>>>> (4) set "svm=on" on the kernel command line.
>>>> (5) no changes to the hypervisor and ultravisor.
>>>>
>>>> And Boom it works!.   So you were right.
>>>
>>> Excellent.
>>>  
>>>> I am sending out the patch for (1) above ASAP.
>>>
>>> Next step, could you please try to do the same with the TIMA and ESB pfn ?
>>> and use KVM.
>>
>> I'm a bit confused by this.  Aren't the TIMA and ESB pages essentially
>> IO pages, rather than memory pages from the guest's point of view?  I
>> assume only memory pages are protected with PEF - I can't even really
>> see what protecting an IO page would even mean.
> 
> It means, that the hypervisor and qemu cannot access the addresses used
> to access the I/O pages. It can only be accessed by Ultravisor and the
> SVM.
> 
> As it stands today, those pages are accessible from the hypervisor
> and not from the SVM or the ultravisor.
> 
> To make it work, we need to enable acccess to those pages from the SVM
> and from the ultravisor.  One thing I am not clear is should we block
> access to those pages from the hypervisor. If yes, than there is no> good way to do that, without hardware help.  If no, than those GPA pages
> can be shared, so that hypervisor/ultravisor/qemu/SVM can all access
> those pages.

They are shared.

KVM will also access them, at interrupt creation, device reset and 
passthrough. QEMU will use them to mask on/off the interrupts in
case of guest migration or machine stop/continue. 

C.
diff mbox series

Patch

diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c
index 5773453..dd96c82 100644
--- a/arch/powerpc/kernel/prom_init.c
+++ b/arch/powerpc/kernel/prom_init.c
@@ -805,6 +805,18 @@  static void __init early_cmdline_parse(void)
 #endif
 	}
 
+#ifdef CONFIG_PPC_SVM
+	opt = prom_strstr(prom_cmd_line, "svm=");
+	if (opt) {
+		bool val;
+
+		opt += sizeof("svm=") - 1;
+		if (!prom_strtobool(opt, &val))
+			prom_svm_enable = val;
+		prom_printf("svm =%d\n", prom_svm_enable);
+	}
+#endif /* CONFIG_PPC_SVM */
+
 #ifdef CONFIG_PPC_PSERIES
 	prom_radix_disable = !IS_ENABLED(CONFIG_PPC_RADIX_MMU_DEFAULT);
 	opt = prom_strstr(prom_cmd_line, "disable_radix");
@@ -823,23 +835,22 @@  static void __init early_cmdline_parse(void)
 	if (prom_radix_disable)
 		prom_debug("Radix disabled from cmdline\n");
 
-	opt = prom_strstr(prom_cmd_line, "xive=off");
-	if (opt) {
+#ifdef CONFIG_PPC_SVM
+	if (prom_svm_enable) {
 		prom_xive_disable = true;
-		prom_debug("XIVE disabled from cmdline\n");
+		prom_debug("XIVE disabled in Secure VM\n");
 	}
-#endif /* CONFIG_PPC_PSERIES */
-
-#ifdef CONFIG_PPC_SVM
-	opt = prom_strstr(prom_cmd_line, "svm=");
-	if (opt) {
-		bool val;
+#endif /* CONFIG_PPC_SVM */
 
-		opt += sizeof("svm=") - 1;
-		if (!prom_strtobool(opt, &val))
-			prom_svm_enable = val;
+	if (!prom_xive_disable) {
+		opt = prom_strstr(prom_cmd_line, "xive=off");
+		if (opt) {
+			prom_xive_disable = true;
+			prom_debug("XIVE disabled from cmdline\n");
+		}
 	}
-#endif /* CONFIG_PPC_SVM */
+
+#endif /* CONFIG_PPC_PSERIES */
 }
 
 #ifdef CONFIG_PPC_PSERIES
@@ -1251,6 +1262,12 @@  static void __init prom_parse_xive_model(u8 val,
 		break;
 	case OV5_FEAT(OV5_XIVE_EXPLOIT): /* Only Exploitation mode */
 		prom_debug("XIVE - exploitation mode supported\n");
+
+#ifdef CONFIG_PPC_SVM
+		if (prom_svm_enable)
+			prom_panic("WARNING: xive unsupported in Secure VM\n");
+#endif /* CONFIG_PPC_SVM */
+
 		if (prom_xive_disable) {
 			/*
 			 * If we __have__ to do XIVE, we're better off ignoring