diff mbox

[3/3] KVM: PPC: Book3S: Add support for hwrng found on some powernv systems

Message ID 1380177066-3835-3-git-send-email-michael@ellerman.id.au (mailing list archive)
State Not Applicable
Headers show

Commit Message

Michael Ellerman Sept. 26, 2013, 6:31 a.m. UTC
Some powernv systems include a hwrng. Guests can access it via the
H_RANDOM hcall.

We add a real mode implementation of H_RANDOM when a hwrng is found.
Userspace can detect the presence of the hwrng by querying the
KVM_CAP_PPC_HWRNG capability.
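
For example, userspace could query it roughly like this (a minimal sketch,
not part of this patch):

	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <linux/kvm.h>

	/* Returns non-zero if the host KVM can back H_RANDOM with the hwrng. */
	static int host_has_hwrng(void)
	{
		int ret, kvm_fd = open("/dev/kvm", O_RDWR);

		if (kvm_fd < 0)
			return 0;

		/* KVM_CHECK_EXTENSION reports whether a capability is supported. */
		ret = ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_PPC_HWRNG);
		close(kvm_fd);

		return ret > 0;
	}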

Signed-off-by: Michael Ellerman <michael@ellerman.id.au>
---
 arch/powerpc/include/asm/archrandom.h   |  11 ++-
 arch/powerpc/include/asm/kvm_ppc.h      |   2 +
 arch/powerpc/kvm/book3s_hv_builtin.c    |  15 ++++
 arch/powerpc/kvm/book3s_hv_rmhandlers.S | 119 ++++++++++++++++++++++++++++++++
 arch/powerpc/kvm/powerpc.c              |   3 +
 arch/powerpc/platforms/powernv/rng.c    |  25 +++++++
 include/uapi/linux/kvm.h                |   1 +
 7 files changed, 174 insertions(+), 2 deletions(-)

Comments

Paolo Bonzini Sept. 26, 2013, 9:06 a.m. UTC | #1
Il 26/09/2013 08:31, Michael Ellerman ha scritto:
> Some powernv systems include a hwrng. Guests can access it via the
> H_RANDOM hcall.
> 
> We add a real mode implementation of H_RANDOM when a hwrng is found.
> Userspace can detect the presence of the hwrng by querying the
> KVM_CAP_PPC_HWRNG capability.
> 
> Signed-off-by: Michael Ellerman <michael@ellerman.id.au>
> ---
>  arch/powerpc/include/asm/archrandom.h   |  11 ++-
>  arch/powerpc/include/asm/kvm_ppc.h      |   2 +
>  arch/powerpc/kvm/book3s_hv_builtin.c    |  15 ++++
>  arch/powerpc/kvm/book3s_hv_rmhandlers.S | 119 ++++++++++++++++++++++++++++++++
>  arch/powerpc/kvm/powerpc.c              |   3 +
>  arch/powerpc/platforms/powernv/rng.c    |  25 +++++++
>  include/uapi/linux/kvm.h                |   1 +
>  7 files changed, 174 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/archrandom.h b/arch/powerpc/include/asm/archrandom.h
> index d853d16..86296d5 100644
> --- a/arch/powerpc/include/asm/archrandom.h
> +++ b/arch/powerpc/include/asm/archrandom.h
> @@ -25,8 +25,15 @@ static inline int arch_get_random_int(unsigned int *v)
>  	return rc;
>  }
>  
> -int powernv_get_random_long(unsigned long *v);
> -
>  #endif /* CONFIG_ARCH_RANDOM */
>  
> +#ifdef CONFIG_PPC_POWERNV
> +int powernv_hwrng_present(void);
> +int powernv_get_random_long(unsigned long *v);
> +int powernv_get_random_real_mode(unsigned long *v);
> +#else
> +static inline int powernv_hwrng_present(void) { return 0; }
> +static inline int powernv_get_random_real_mode(unsigned long *v) { return 0; }
> +#endif
> +
>  #endif /* _ASM_POWERPC_ARCHRANDOM_H */
> diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
> index b15554a..51966fd 100644
> --- a/arch/powerpc/include/asm/kvm_ppc.h
> +++ b/arch/powerpc/include/asm/kvm_ppc.h
> @@ -177,6 +177,8 @@ extern int kvmppc_xics_get_xive(struct kvm *kvm, u32 irq, u32 *server,
>  extern int kvmppc_xics_int_on(struct kvm *kvm, u32 irq);
>  extern int kvmppc_xics_int_off(struct kvm *kvm, u32 irq);
>  
> +extern int kvmppc_hwrng_present(void);
> +
>  /*
>   * Cuts out inst bits with ordering according to spec.
>   * That means the leftmost bit is zero. All given bits are included.
> diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c
> index 8cd0dae..de12592 100644
> --- a/arch/powerpc/kvm/book3s_hv_builtin.c
> +++ b/arch/powerpc/kvm/book3s_hv_builtin.c
> @@ -19,6 +19,7 @@
>  #include <asm/cputable.h>
>  #include <asm/kvm_ppc.h>
>  #include <asm/kvm_book3s.h>
> +#include <asm/archrandom.h>
>  
>  #include "book3s_hv_cma.h"
>  /*
> @@ -181,3 +182,17 @@ void __init kvm_cma_reserve(void)
>  		kvm_cma_declare_contiguous(selected_size, align_size);
>  	}
>  }
> +
> +int kvmppc_hwrng_present(void)
> +{
> +	return powernv_hwrng_present();
> +}
> +EXPORT_SYMBOL_GPL(kvmppc_hwrng_present);
> +
> +long kvmppc_h_random(struct kvm_vcpu *vcpu)
> +{
> +	if (powernv_get_random_real_mode(&vcpu->arch.gpr[4]))
> +		return H_SUCCESS;
> +
> +	return H_HARDWARE;
> +}
> diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> index 294b7af..35ce59e 100644
> --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> @@ -1502,6 +1502,125 @@ hcall_real_table:
>  	.long	0		/* 0x11c */
>  	.long	0		/* 0x120 */
>  	.long	.kvmppc_h_bulk_remove - hcall_real_table
> +	.long	0		/* 0x128 */
> +	.long	0		/* 0x12c */
> +	.long	0		/* 0x130 */
> +	.long	0		/* 0x134 */
> +	.long	0		/* 0x138 */
> +	.long	0		/* 0x13c */
> +	.long	0		/* 0x140 */
> +	.long	0		/* 0x144 */
> +	.long	0		/* 0x148 */
> +	.long	0		/* 0x14c */
> +	.long	0		/* 0x150 */
> +	.long	0		/* 0x154 */
> +	.long	0		/* 0x158 */
> +	.long	0		/* 0x15c */
> +	.long	0		/* 0x160 */
> +	.long	0		/* 0x164 */
> +	.long	0		/* 0x168 */
> +	.long	0		/* 0x16c */
> +	.long	0		/* 0x170 */
> +	.long	0		/* 0x174 */
> +	.long	0		/* 0x178 */
> +	.long	0		/* 0x17c */
> +	.long	0		/* 0x180 */
> +	.long	0		/* 0x184 */
> +	.long	0		/* 0x188 */
> +	.long	0		/* 0x18c */
> +	.long	0		/* 0x190 */
> +	.long	0		/* 0x194 */
> +	.long	0		/* 0x198 */
> +	.long	0		/* 0x19c */
> +	.long	0		/* 0x1a0 */
> +	.long	0		/* 0x1a4 */
> +	.long	0		/* 0x1a8 */
> +	.long	0		/* 0x1ac */
> +	.long	0		/* 0x1b0 */
> +	.long	0		/* 0x1b4 */
> +	.long	0		/* 0x1b8 */
> +	.long	0		/* 0x1bc */
> +	.long	0		/* 0x1c0 */
> +	.long	0		/* 0x1c4 */
> +	.long	0		/* 0x1c8 */
> +	.long	0		/* 0x1cc */
> +	.long	0		/* 0x1d0 */
> +	.long	0		/* 0x1d4 */
> +	.long	0		/* 0x1d8 */
> +	.long	0		/* 0x1dc */
> +	.long	0		/* 0x1e0 */
> +	.long	0		/* 0x1e4 */
> +	.long	0		/* 0x1e8 */
> +	.long	0		/* 0x1ec */
> +	.long	0		/* 0x1f0 */
> +	.long	0		/* 0x1f4 */
> +	.long	0		/* 0x1f8 */
> +	.long	0		/* 0x1fc */
> +	.long	0		/* 0x200 */
> +	.long	0		/* 0x204 */
> +	.long	0		/* 0x208 */
> +	.long	0		/* 0x20c */
> +	.long	0		/* 0x210 */
> +	.long	0		/* 0x214 */
> +	.long	0		/* 0x218 */
> +	.long	0		/* 0x21c */
> +	.long	0		/* 0x220 */
> +	.long	0		/* 0x224 */
> +	.long	0		/* 0x228 */
> +	.long	0		/* 0x22c */
> +	.long	0		/* 0x230 */
> +	.long	0		/* 0x234 */
> +	.long	0		/* 0x238 */
> +	.long	0		/* 0x23c */
> +	.long	0		/* 0x240 */
> +	.long	0		/* 0x244 */
> +	.long	0		/* 0x248 */
> +	.long	0		/* 0x24c */
> +	.long	0		/* 0x250 */
> +	.long	0		/* 0x254 */
> +	.long	0		/* 0x258 */
> +	.long	0		/* 0x25c */
> +	.long	0		/* 0x260 */
> +	.long	0		/* 0x264 */
> +	.long	0		/* 0x268 */
> +	.long	0		/* 0x26c */
> +	.long	0		/* 0x270 */
> +	.long	0		/* 0x274 */
> +	.long	0		/* 0x278 */
> +	.long	0		/* 0x27c */
> +	.long	0		/* 0x280 */
> +	.long	0		/* 0x284 */
> +	.long	0		/* 0x288 */
> +	.long	0		/* 0x28c */
> +	.long	0		/* 0x290 */
> +	.long	0		/* 0x294 */
> +	.long	0		/* 0x298 */
> +	.long	0		/* 0x29c */
> +	.long	0		/* 0x2a0 */
> +	.long	0		/* 0x2a4 */
> +	.long	0		/* 0x2a8 */
> +	.long	0		/* 0x2ac */
> +	.long	0		/* 0x2b0 */
> +	.long	0		/* 0x2b4 */
> +	.long	0		/* 0x2b8 */
> +	.long	0		/* 0x2bc */
> +	.long	0		/* 0x2c0 */
> +	.long	0		/* 0x2c4 */
> +	.long	0		/* 0x2c8 */
> +	.long	0		/* 0x2cc */
> +	.long	0		/* 0x2d0 */
> +	.long	0		/* 0x2d4 */
> +	.long	0		/* 0x2d8 */
> +	.long	0		/* 0x2dc */
> +	.long	0		/* 0x2e0 */
> +	.long	0		/* 0x2e4 */
> +	.long	0		/* 0x2e8 */
> +	.long	0		/* 0x2ec */
> +	.long	0		/* 0x2f0 */
> +	.long	0		/* 0x2f4 */
> +	.long	0		/* 0x2f8 */
> +	.long	0		/* 0x2fc */
> +	.long	.kvmppc_h_random - hcall_real_table
>  hcall_real_table_end:
>  
>  ignore_hdec:
> diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
> index 07c0106..0d7208e 100644
> --- a/arch/powerpc/kvm/powerpc.c
> +++ b/arch/powerpc/kvm/powerpc.c
> @@ -356,6 +356,9 @@ int kvm_dev_ioctl_check_extension(long ext)
>  		if (cpu_has_feature(CPU_FTR_ARCH_201))
>  			r = 2;
>  		break;
> +	case KVM_CAP_PPC_HWRNG:
> +		r = kvmppc_hwrng_present();
> +		break;
>  #endif
>  	case KVM_CAP_SYNC_MMU:
>  #ifdef CONFIG_KVM_BOOK3S_64_HV
> diff --git a/arch/powerpc/platforms/powernv/rng.c b/arch/powerpc/platforms/powernv/rng.c
> index 51d2e6a..ea7e5cd 100644
> --- a/arch/powerpc/platforms/powernv/rng.c
> +++ b/arch/powerpc/platforms/powernv/rng.c
> @@ -20,12 +20,18 @@
>  
>  struct powernv_rng {
>  	void __iomem *regs;
> +	void __iomem *regs_real;
>  	unsigned long mask;
>  };
>  
>  static DEFINE_PER_CPU(struct powernv_rng *, powernv_rng);
>  
>  
> +int powernv_hwrng_present(void)
> +{
> +	return __raw_get_cpu_var(powernv_rng) != NULL;
> +}
> +
>  static unsigned long rng_whiten(struct powernv_rng *rng, unsigned long val)
>  {
>  	unsigned long parity;
> @@ -42,6 +48,17 @@ static unsigned long rng_whiten(struct powernv_rng *rng, unsigned long val)
>  	return val;
>  }
>  
> +int powernv_get_random_real_mode(unsigned long *v)
> +{
> +	struct powernv_rng *rng;
> +
> +	rng = __raw_get_cpu_var(powernv_rng);
> +
> +	*v = rng_whiten(rng, in_rm64(rng->regs_real));
> +
> +	return 1;
> +}
> +
>  int powernv_get_random_long(unsigned long *v)
>  {
>  	struct powernv_rng *rng;
> @@ -76,12 +93,20 @@ static __init void rng_init_per_cpu(struct powernv_rng *rng,
>  static __init int rng_create(struct device_node *dn)
>  {
>  	struct powernv_rng *rng;
> +	struct resource res;
>  	unsigned long val;
>  
>  	rng = kzalloc(sizeof(*rng), GFP_KERNEL);
>  	if (!rng)
>  		return -ENOMEM;
>  
> +	if (of_address_to_resource(dn, 0, &res)) {
> +		kfree(rng);
> +		return -ENXIO;
> +	}
> +
> +	rng->regs_real = (void __iomem *)res.start;
> +
>  	rng->regs = of_iomap(dn, 0);
>  	if (!rng->regs) {
>  		kfree(rng);
> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> index 99c2533..493a409 100644
> --- a/include/uapi/linux/kvm.h
> +++ b/include/uapi/linux/kvm.h
> @@ -668,6 +668,7 @@ struct kvm_ppc_smmu_info {
>  #define KVM_CAP_IRQ_XICS 92
>  #define KVM_CAP_ARM_EL1_32BIT 93
>  #define KVM_CAP_SPAPR_MULTITCE 94
> +#define KVM_CAP_PPC_HWRNG 95
>  
>  #ifdef KVM_CAP_IRQ_ROUTING
>  
> 

Is there any reason to do this in the kernel?  It does not have to be a
particularly fast path; on x86, we are simply forwarding /dev/hwrng or
/dev/random data to the guest.  You can simply use virtio-rng.

If you really want to have the hypercall, implementing it in QEMU means
that you can support it on all systems, in fact even when running
without KVM.  The QEMU command line would be something like "-object
rng-random,filename=/dev/random,id=rng0 -device spapr-rng,rng=rng0".

Paolo
Anshuman Khandual Sept. 27, 2013, 2:15 p.m. UTC | #2
On 09/26/2013 12:01 PM, Michael Ellerman wrote:
> +int powernv_hwrng_present(void)
> +{
> +	return __raw_get_cpu_var(powernv_rng) != NULL;
> +}
> +
>  static unsigned long rng_whiten(struct powernv_rng *rng, unsigned long val)
>  {
>  	unsigned long parity;
> @@ -42,6 +48,17 @@ static unsigned long rng_whiten(struct powernv_rng *rng, unsigned long val)
>  	return val;
>  }
> 
> +int powernv_get_random_real_mode(unsigned long *v)
> +{
> +	struct powernv_rng *rng;
> +
> +	rng = __raw_get_cpu_var(powernv_rng);
> +
> +	*v = rng_whiten(rng, in_rm64(rng->regs_real));
> +

Should it be in_be64() instead of in_rm64()? It's failing the build here. Apart from
this, all the individual patches build correctly.

Regards
Anshuman
Michael Ellerman Oct. 1, 2013, 8:34 a.m. UTC | #3
On Thu, Sep 26, 2013 at 11:06:59AM +0200, Paolo Bonzini wrote:
> Il 26/09/2013 08:31, Michael Ellerman ha scritto:
> > Some powernv systems include a hwrng. Guests can access it via the
> > H_RANDOM hcall.
> 
> Is there any reason to do this in the kernel?  

It's less code, and it's faster :)

> It does not have to be a particularly fast path;

Sure, but do we have to make it slow on purpose?

> on x86, we are simply forwarding /dev/hwrng or
> /dev/random data to the guest.  You can simply use virtio-rng.

Not all guests support virtio-rng.

> If you really want to have the hypercall, implementing it in QEMU means
> that you can support it on all systems, in fact even when running
> without KVM.  

Sure, I can add a fallback to /dev/hwrng for full emulation.

> The QEMU command line would be something like "-object
> rng-random,filename=/dev/random,id=rng0 -device spapr-rng,rng=rng0".

We can't use /dev/random like that. The PAPR specification, which is
what we're implementing, implies that H_RANDOM provides data from a
hardware source.

cheers
Michael Ellerman Oct. 1, 2013, 8:36 a.m. UTC | #4
On Fri, Sep 27, 2013 at 07:45:45PM +0530, Anshuman Khandual wrote:
> On 09/26/2013 12:01 PM, Michael Ellerman wrote:
> > +int powernv_hwrng_present(void)
> > +{
> > +	return __raw_get_cpu_var(powernv_rng) != NULL;
> > +}
> > +
> >  static unsigned long rng_whiten(struct powernv_rng *rng, unsigned long val)
> >  {
> >  	unsigned long parity;
> > @@ -42,6 +48,17 @@ static unsigned long rng_whiten(struct powernv_rng *rng, unsigned long val)
> >  	return val;
> >  }
> > 
> > +int powernv_get_random_real_mode(unsigned long *v)
> > +{
> > +	struct powernv_rng *rng;
> > +
> > +	rng = __raw_get_cpu_var(powernv_rng);
> > +
> > +	*v = rng_whiten(rng, in_rm64(rng->regs_real));
> > +
> 
> Should it be in_be64() instead of in_rm64()? It's failing the build here. Apart from
> this, all the individual patches build correctly.

No it's definitely not in_be64() - that will checkstop your machine :)

I added in_rm64() in a previous patch, "Add real mode cache inhibited
IO accessors" - I just didn't want to spam the KVM guys with those
patches as well.
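
For reference, it is a cache-inhibited load done with the MMU off; a rough
sketch, assuming it is built on the hypervisor-privileged ldcix instruction
(the real accessor lives in that earlier patch):

	/* Sketch: real-mode (MMU off) cache-inhibited 8-byte load from a real address. */
	static inline u64 in_rm64(const volatile void __iomem *paddr)
	{
		u64 ret;

		__asm__ __volatile__("ldcix %0, 0, %1" : "=r" (ret) : "r" (paddr));
		return ret;
	}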

Thanks for the review & testing.

cheers
Gleb Natapov Oct. 1, 2013, 8:39 a.m. UTC | #5
On Tue, Oct 01, 2013 at 06:34:26PM +1000, Michael Ellerman wrote:
> On Thu, Sep 26, 2013 at 11:06:59AM +0200, Paolo Bonzini wrote:
> > Il 26/09/2013 08:31, Michael Ellerman ha scritto:
> > > Some powernv systems include a hwrng. Guests can access it via the
> > > H_RANDOM hcall.
> > 
> > Is there any reason to do this in the kernel?  
> 
> It's less code, and it's faster :)
> 
> > It does not have to be a particularly fast path;
> 
> Sure, but do we have to make it slow on purpose?
> 
We do not put non performance critical devices into the kernel.

--
			Gleb.
Paul Mackerras Oct. 1, 2013, 9:23 a.m. UTC | #6
On Tue, Oct 01, 2013 at 11:39:08AM +0300, Gleb Natapov wrote:
> On Tue, Oct 01, 2013 at 06:34:26PM +1000, Michael Ellerman wrote:
> > On Thu, Sep 26, 2013 at 11:06:59AM +0200, Paolo Bonzini wrote:
> > > Il 26/09/2013 08:31, Michael Ellerman ha scritto:
> > > > Some powernv systems include a hwrng. Guests can access it via the
> > > > H_RANDOM hcall.
> > > 
> > > Is there any reason to do this in the kernel?  
> > 
> > It's less code, and it's faster :)
> > 
> > > It does not have to be a particularly fast path;
> > 
> > Sure, but do we have to make it slow on purpose?
> > 
> We do not put non performance critical devices into the kernel.

It's not a device, it's a single hypercall, specified by PAPR, which
is the moral equivalent of x86's RDRAND.

Paul.
Benjamin Herrenschmidt Oct. 1, 2013, 9:38 a.m. UTC | #7
On Tue, 2013-10-01 at 11:39 +0300, Gleb Natapov wrote:
> On Tue, Oct 01, 2013 at 06:34:26PM +1000, Michael Ellerman wrote:
> > On Thu, Sep 26, 2013 at 11:06:59AM +0200, Paolo Bonzini wrote:
> > > Il 26/09/2013 08:31, Michael Ellerman ha scritto:
> > > > Some powernv systems include a hwrng. Guests can access it via the
> > > > H_RANDOM hcall.
> > > 
> > > Is there any reason to do this in the kernel?  
> > 
> > It's less code, and it's faster :)
> > 
> > > It does not have to be a particularly fast path;
> > 
> > Sure, but do we have to make it slow on purpose?
> > 
> We do not put non performance critical devices into the kernel.

So for the sake of that dogma you are going to make us do something that
is about 100 times slower ? (and possibly involves more lines of code)

It's not just speed ... H_RANDOM is going to be called by the guest
kernel. A round trip to qemu is going to introduce a kernel jitter
(complete stop of operations of the kernel on that virtual processor) of
a full exit + round trip to qemu + back to the kernel to get to some
source of random number ...  this is going to be in the dozens of ns at
least.

This makes no sense.

Ben.
Gleb Natapov Oct. 1, 2013, 9:57 a.m. UTC | #8
On Tue, Oct 01, 2013 at 07:23:20PM +1000, Paul Mackerras wrote:
> On Tue, Oct 01, 2013 at 11:39:08AM +0300, Gleb Natapov wrote:
> > On Tue, Oct 01, 2013 at 06:34:26PM +1000, Michael Ellerman wrote:
> > > On Thu, Sep 26, 2013 at 11:06:59AM +0200, Paolo Bonzini wrote:
> > > > Il 26/09/2013 08:31, Michael Ellerman ha scritto:
> > > > > Some powernv systems include a hwrng. Guests can access it via the
> > > > > H_RANDOM hcall.
> > > > 
> > > > Is there any reason to do this in the kernel?  
> > > 
> > > It's less code, and it's faster :)
> > > 
> > > > It does not have to be a particularly fast path;
> > > 
> > > Sure, but do we have to make it slow on purpose?
> > > 
> > We do not put non performance critical devices into the kernel.
> 
> It's not a device, it's a single hypercall, specified by PAPR, which
> is the moral equivalent of x86's RDRAND.
> 
OK. A couple of general questions. How does the guest know when this hypercall
is available? Does QEMU enable it when KVM_CAP_PPC_HWRNG is available (I
haven't seen a userspace patch that uses KVM_CAP_PPC_HWRNG)? And what about
QEMU TCG: does it implement this hypercall, or does it emulate the hwrng directly?

--
			Gleb.
Paolo Bonzini Oct. 1, 2013, 9:58 a.m. UTC | #9
Il 01/10/2013 10:34, Michael Ellerman ha scritto:
>> If you really want to have the hypercall, implementing it in QEMU means
>> that you can support it on all systems, in fact even when running
>> without KVM.  
> 
> Sure, I can add a fallback to /dev/hwrng for full emulation.
> 
>> The QEMU command line would be something like "-object
>> rng-random,filename=/dev/random,id=rng0 -device spapr-rng,rng=rng0".
> 
> We can't use /dev/random like that. The PAPR specification, which is
> what we're implementing, implies that H_RANDOM provides data from a
> hardware source.

Then use /dev/hwrng.

I don't have POWER machines, but I still want to be able to test as much
as possible using emulation.

Paolo
Alexander Graf Oct. 1, 2013, 10 a.m. UTC | #10
On 10/01/2013 11:23 AM, Paul Mackerras wrote:
> On Tue, Oct 01, 2013 at 11:39:08AM +0300, Gleb Natapov wrote:
>> On Tue, Oct 01, 2013 at 06:34:26PM +1000, Michael Ellerman wrote:
>>> On Thu, Sep 26, 2013 at 11:06:59AM +0200, Paolo Bonzini wrote:
>>>> Il 26/09/2013 08:31, Michael Ellerman ha scritto:
>>>>> Some powernv systems include a hwrng. Guests can access it via the
>>>>> H_RANDOM hcall.
>>>> Is there any reason to do this in the kernel?
>>> It's less code, and it's faster :)
>>>
>>>> It does not have to be a particularly fast path;
>>> Sure, but do we have to make it slow on purpose?
>>>
>> We do not put non performance critical devices into the kernel.
> It's not a device, it's a single hypercall, specified by PAPR, which
> is the moral equivalent of x86's RDRAND.

Yes, and hypercalls should be handled in user space unless impossible 
otherwise (like MMU hypercalls which modify state that user space has no 
privilege to access).

I think the most reasonable way forward would be to implement the path 
that jumps through hoops and goes through user space, then add a new 
device in kvm that registers on this hcall inside of kvm.

That way we ensure consistency (user space knows what to put into device 
tree, can disable it if it wants to, can run with TCG, etc) and you can 
prove that your user space interface works along the way.


Alex
Paolo Bonzini Oct. 1, 2013, 11:19 a.m. UTC | #11
Il 01/10/2013 11:38, Benjamin Herrenschmidt ha scritto:
> So for the sake of that dogma you are going to make us do something that
> is about 100 times slower ? (and possibly involves more lines of code)

If it's 100 times slower there is something else that's wrong.  It's
most likely not 100 times slower, and this makes me wonder if you or
Michael actually timed the code at all.

> It's not just speed ... H_RANDOM is going to be called by the guest
> kernel. A round trip to qemu is going to introduce a kernel jitter
> (complete stop of operations of the kernel on that virtual processor) of
> a full exit + round trip to qemu + back to the kernel to get to some
> source of random number ...  this is going to be in the dozens of ns at
> least.

I guess you mean dozens of *micro*seconds, which is somewhat exaggerated
but not too much.  On x86 some reasonable timings are:

  100 cycles            bare metal rdrand
  2000 cycles           guest->hypervisor->guest
  15000 cycles          guest->userspace->guest

(100 cycles = 40 ns = 200 MB/sec; 2000 cycles = ~1 microseconds; 15000
cycles = ~7.5 microseconds).  Even on 5 year old hardware, a userspace
roundtrip is around a dozen microseconds.


Anyhow, I would like to know more about this hwrng and hypercall.

Does the hwrng return random numbers (like rdrand) or real entropy (like
rdseed that Intel will add in Broadwell)?  What about the hypercall?
For example virtio-rng is specified to return actual entropy, it doesn't
matter if it is from hardware or software.

In either case, the patches have problems.

1) If the hwrng returns random numbers, the whitening you're doing is
totally insufficient and patch 2 is forging entropy that doesn't exist.

2) If the hwrng returns entropy, a read from the hwrng is going to even
more expensive than an x86 rdrand (perhaps ~2000 cycles).  Hence, doing
the emulation in the kernel is even less necessary.  Also, if the hwrng
returns entropy patch 1 is unnecessary: you do not need to waste
precious entropy bits by passing them to arch_get_random_long; just run
rngd in the host as that will put the entropy to much better use.

3) If the hypercall returns random numbers, then it is a pretty
braindead interface since returning 8 bytes at a time limits the
throughput to a handful of MB/s (compare to 200 MB/sec for x86 rdrand).
 But more important: in this case drivers/char/hw_random/pseries-rng.c
is completely broken and insecure, just like patch 2 in case (1) above.

4) If the hypercall returns entropy (same as virtio-rng), the same
considerations on speed apply.  If you can only produce entropy at say 1
MB/s (so reading 8 bytes take 8 microseconds---which is actually very
fast), it doesn't matter that much to spend 7 microseconds on a
userspace roundtrip.  It's going to be only half the speed of bare
metal, not 100 times slower.


Also, you will need _anyway_ extra code that is not present here to
either disable the rng based on userspace command-line, or to emulate
the rng from userspace.  It is absolutely _not_ acceptable to have a
hypercall disappear across migration.  You're repeatedly ignoring these
issues, but rest assured that they will come back and bite you
spectacularly.

Based on all this, I would simply ignore the part of the spec where they
say "the hypercall should return numbers from a hardware source".  All
that matters in virtualization is to have a good source of _entropy_.
Then you can run rngd without randomness checks, which will more than
recover the cost of userspace roundtrips.

In any case, deciding where to get that entropy from is definitely
outside the scope of KVM, and in fact QEMU already has a configurable
mechanism for that.

Paolo
Benjamin Herrenschmidt Oct. 1, 2013, 9:44 p.m. UTC | #12
On Tue, 2013-10-01 at 13:19 +0200, Paolo Bonzini wrote:
> Il 01/10/2013 11:38, Benjamin Herrenschmidt ha scritto:
> > So for the sake of that dogma you are going to make us do something that
> > is about 100 times slower ? (and possibly involves more lines of code)
> 
> If it's 100 times slower there is something else that's wrong.  It's
> most likely not 100 times slower, and this makes me wonder if you or
> Michael actually timed the code at all.

We haven't but it's pretty obvious:

 - The KVM real mode implementation: guest issues the hcall, we remain
in real mode, within the MMU context of the guest, all secondary threads
on the core are still running in the guest, and we do an MMIO & return.

 - The qemu variant: guest issues the hcall we need to exit the guest,
which means bring *all* threads on the core out of KVM, switch the full
MMU context back to host (which among others involves flushing the ERAT,
aka level 1 TLB), while sending the secondary threads into idle loops.
Then we return to qemu user context, which will then use /dev/random ->
back into the kernel and out, at which point we can return to the guest,
so back into the kernel, back into run which means IPI the secondary
threads on the core, switch the MMU context again until we can finally
go back to executing guest instructions.

So no we haven't measured. But it is going to be VERY VERY VERY much
slower. Our exit latencies are bad with our current MMU *and* any exit
is going to cause all secondary threads on the core to have to exit as
well (remember P7 is 4 threads, P8 is 8)

> > It's not just speed ... H_RANDOM is going to be called by the guest
> > kernel. A round trip to qemu is going to introduce a kernel jitter
> > (complete stop of operations of the kernel on that virtual processor) of
> > a full exit + round trip to qemu + back to the kernel to get to some
> > source of random number ...  this is going to be in the dozens of ns at
> > least.
> 
> I guess you mean dozens of *micro*seconds, which is somewhat exaggerated
> but not too much.  On x86 some reasonable timings are:

Yes.

>   100 cycles            bare metal rdrand
>   2000 cycles           guest->hypervisor->guest
>   15000 cycles          guest->userspace->guest
> 
> (100 cycles = 40 ns = 200 MB/sec; 2000 cycles = ~1 microseconds; 15000
> cycles = ~7.5 microseconds).  Even on 5 year old hardware, a userspace
> roundtrip is around a dozen microseconds.

So in your case going to qemu to "emulate" rdrand would indeed be 150
times slower, I don't see in what universe that would be considered a
good idea.

> Anyhow, I would like to know more about this hwrng and hypercall.
> 
> Does the hwrng return random numbers (like rdrand) or real entropy (like
> rdseed that Intel will add in Broadwell)?

It's a random number obtained from sampling a set of oscillators. It's
slightly biased but we have very simple code (I believe shared with the
host kernel implementation) for whitening it as is required by PAPR.
 
>   What about the hypercall?
> For example virtio-rng is specified to return actual entropy, it doesn't
> matter if it is from hardware or software.
> 
> In either case, the patches have problems.
> 
> 1) If the hwrng returns random numbers, the whitening you're doing is
> totally insufficient and patch 2 is forging entropy that doesn't exist.

I will let Paul to comment on the whitening, it passes all the tests
we've been running it through.

> 2) If the hwrng returns entropy, a read from the hwrng is going to even
> more expensive than an x86 rdrand (perhaps ~2000 cycles).

Depends how often you read; the HW, I think, is sampling asynchronously, so
you only block on the MMIO if you already consumed the previous sample,
but I'll let Paulus provide more details here.

>   Hence, doing
> the emulation in the kernel is even less necessary.  Also, if the hwrng
> returns entropy patch 1 is unnecessary: you do not need to waste
> precious entropy bits by passing them to arch_get_random_long; just run
> rngd in the host as that will put the entropy to much better use.
>
> 3) If the hypercall returns random numbers, then it is a pretty
> braindead interface since returning 8 bytes at a time limits the
> throughput to a handful of MB/s (compare to 200 MB/sec for x86 rdrand).
>  But more important: in this case drivers/char/hw_random/pseries-rng.c
> is completely broken and insecure, just like patch 2 in case (1) above.

How so ?

> 4) If the hypercall returns entropy (same as virtio-rng), the same
> considerations on speed apply.  If you can only produce entropy at say 1
> MB/s (so reading 8 bytes take 8 microseconds---which is actually very
> fast), it doesn't matter that much to spend 7 microseconds on a
> userspace roundtrip.  It's going to be only half the speed of bare
> metal, not 100 times slower.
> 
> 
> Also, you will need _anyway_ extra code that is not present here to
> either disable the rng based on userspace command-line, or to emulate
> the rng from userspace.  It is absolutely _not_ acceptable to have a
> hypercall disappear across migration.  You're repeatedly ignoring these
> issues, but rest assured that they will come back and bite you
> spectacularly.
> 
> Based on all this, I would simply ignore the part of the spec where they
> say "the hypercall should return numbers from a hardware source".  All
> that matters in virtualization is to have a good source of _entropy_.
> Then you can run rngd without randomness checks, which will more than
> recover the cost of userspace roundtrips.
> 
> In any case, deciding where to get that entropy from is definitely
> outside the scope of KVM, and in fact QEMU already has a configurable
> mechanism for that.
>
> Paolo
> --
> To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
Paul Mackerras Oct. 2, 2013, 5:09 a.m. UTC | #13
On Tue, Oct 01, 2013 at 01:19:06PM +0200, Paolo Bonzini wrote:

> Anyhow, I would like to know more about this hwrng and hypercall.
> 
> Does the hwrng return random numbers (like rdrand) or real entropy (like
> rdseed that Intel will add in Broadwell)?  What about the hypercall?

Well, firstly, your terminology is inaccurate.  Real entropy will give
you random numbers.  I think when you say "random numbers" you
actually mean "pseudo-random numbers".

Secondly, the RNG produces real entropy.  The way it works is that
there are 64 ring oscillators running at different frequencies (above
1GHz).  They get sampled at (typically) 1MHz and the samples get put
in a 64-entry FIFO, which is read by MMIO.  There is practically no
correlation between bits or between adjacent samples.  The only
deficiency is that the distribution of each bit is not always
precisely 50% zero / 50% one (it is somewhere between 40/60 and
60/40).

The whitening addresses this bias.  Looking at the stream of values
for a given bit, we XOR that stream with another stream that is
uncorrelated and has a 50/50 distribution (or very very close to
that), which gives a stream whose distribution is closer to 50/50 than
either input stream.  The second stream is effectively derived by
XORing together all 64 bits of some previous sample.  XORing together
many uncorrelated streams that are each close to 50/50 distribution
gives a stream that is much closer to a 50/50 distribution (by the
"piling up lemma").  The result passes all the dieharder tests.

> For example virtio-rng is specified to return actual entropy, it doesn't
> matter if it is from hardware or software.
> 
> In either case, the patches have problems.
> 
> 1) If the hwrng returns random numbers, the whitening you're doing is
> totally insufficient and patch 2 is forging entropy that doesn't exist.
> 
> 2) If the hwrng returns entropy, a read from the hwrng is going to even
> more expensive than an x86 rdrand (perhaps ~2000 cycles).  Hence, doing

The MMIO itself is reasonably quick if the FIFO is not empty, but the
long-term overall rate is limited by the sampling rate.

> the emulation in the kernel is even less necessary.  Also, if the hwrng
> returns entropy patch 1 is unnecessary: you do not need to waste
> precious entropy bits by passing them to arch_get_random_long; just run
> rngd in the host as that will put the entropy to much better use.

Not sure why they are particularly "precious"; we get 64 bits per
microsecond whether we use them or not.  What are you suggesting
arch_get_random_long() should do instead?

> 3) If the hypercall returns random numbers, then it is a pretty
> braindead interface since returning 8 bytes at a time limits the
> throughput to a handful of MB/s (compare to 200 MB/sec for x86 rdrand).
>  But more important: in this case drivers/char/hw_random/pseries-rng.c
> is completely broken and insecure, just like patch 2 in case (1) above.

Assuming that by "random numbers" you actually mean "pseudo-random
numbers", then this doesn't apply.

> 4) If the hypercall returns entropy (same as virtio-rng), the same
> considerations on speed apply.  If you can only produce entropy at say 1
> MB/s (so reading 8 bytes take 8 microseconds---which is actually very
> fast), it doesn't matter that much to spend 7 microseconds on a
> userspace roundtrip.  It's going to be only half the speed of bare
> metal, not 100 times slower.

8 bytes takes at most 1 microsecond, so the round-trip to userspace is
definitely noticeable.

Paul.
Paolo Bonzini Oct. 2, 2013, 8:38 a.m. UTC | #14
Il 01/10/2013 23:44, Benjamin Herrenschmidt ha scritto:
> On Tue, 2013-10-01 at 13:19 +0200, Paolo Bonzini wrote:
>> Il 01/10/2013 11:38, Benjamin Herrenschmidt ha scritto:
>>> So for the sake of that dogma you are going to make us do something that
>>> is about 100 times slower ? (and possibly involves more lines of code)
>>
>> If it's 100 times slower there is something else that's wrong.  It's
>> most likely not 100 times slower, and this makes me wonder if you or
>> Michael actually timed the code at all.
> 
> So no we haven't measured. But it is going to be VERY VERY VERY much
> slower. Our exit latencies are bad with our current MMU *and* any exit
> is going to cause all secondary threads on the core to have to exit as
> well (remember P7 is 4 threads, P8 is 8)

Ok, this is indeed the main difference between Power and x86.

>>   100 cycles            bare metal rdrand
>>   2000 cycles           guest->hypervisor->guest
>>   15000 cycles          guest->userspace->guest
>>
>> (100 cycles = 40 ns = 200 MB/sec; 2000 cycles = ~1 microseconds; 15000
>> cycles = ~7.5 microseconds).  Even on 5 year old hardware, a userspace
>> roundtrip is around a dozen microseconds.
> 
> So in your case going to qemu to "emulate" rdrand would indeed be 150
> times slower, I don't see in what universe that would be considered a
> good idea.

rdrand is not privileged on x86, guests can use it.  But my point is
that going to the kernel is already 20 times slower.  Getting entropy
(not just a pseudo-random number seeded by the HWRNG) with rdrand is
~1000 times slower according to Intel's recommendations, so the
roundtrip to userspace is entirely invisible in that case.

The numbers for PPC seem to be a bit different though (it's faster to
read entropy, and slower to do a userspace exit).

> It's a random number obtained from sampling a set of oscillators. It's
> slightly biased but we have very simple code (I believe shared with the
> host kernel implementation) for whitening it as is required by PAPR.

Good.  Actually, passing the dieharder tests does not mean much (an
AES-encrypted counter should also pass them with flying colors), but
if it's specified by the architecture gods it's likely to have received
some scrutiny.

>> 2) If the hwrng returns entropy, a read from the hwrng is going to even
>> more expensive than an x86 rdrand (perhaps ~2000 cycles).
> 
> Depends how often you read, the HW I think is sampling asynchronously so
> you only block on the MMIO if you already consumed the previous sample
> but I'll let Paulus provide more details here.

Given Paul's description, there's indeed very little extra cost compared
to a "nop" hypercall.  That's nice.

Still, considering that QEMU code has to be there anyway for
compatibility, kernel emulation is not particularly necessary IMHO.  I
would of course like to see actual performance numbers, but besides that
are you ever going to ever see this in the profile except if you run "dd
if=/dev/hwrng of=/dev/null"?

Can you instrument pHyp to find out how many times per second is this
hypercall called by a "normal" Linux or AIX guest?

>> 3) If the hypercall returns random numbers, then it is a pretty
>> braindead interface since returning 8 bytes at a time limits the
>> throughput to a handful of MB/s (compare to 200 MB/sec for x86 rdrand).
>>  But more important: in this case drivers/char/hw_random/pseries-rng.c
>> is completely broken and insecure, just like patch 2 in case (1) above.
> 
> How so ?

Paul confirmed that it returns real entropy so this is moot.

Paolo
Paolo Bonzini Oct. 2, 2013, 8:46 a.m. UTC | #15
Il 02/10/2013 07:09, Paul Mackerras ha scritto:
> On Tue, Oct 01, 2013 at 01:19:06PM +0200, Paolo Bonzini wrote:
> 
>> Anyhow, I would like to know more about this hwrng and hypercall.
>>
>> Does the hwrng return random numbers (like rdrand) or real entropy (like
>> rdseed that Intel will add in Broadwell)?  What about the hypercall?
> 
> Well, firstly, your terminology is inaccurate.  Real entropy will give
> you random numbers.  I think when you say "random numbers" you
> actually mean "pseudo-random numbers".

Yes---I meant pseudo-random numbers where the generator is periodically
seeded by a random number.

> Secondly, the RNG produces real entropy.

Good to know, thanks.

> Not sure why they are particularly "precious"; we get 64 bits per
> microsecond whether we use them or not.  What are you suggesting
> arch_get_random_long() should do instead?

If you are running rngd, there is no need to have arch_get_random_long()
at all.

>> 3) If the hypercall returns random numbers, then it is a pretty
>> braindead interface since returning 8 bytes at a time limits the
>> throughput to a handful of MB/s (compare to 200 MB/sec for x86 rdrand).
>>  But more important: in this case drivers/char/hw_random/pseries-rng.c
>> is completely broken and insecure, just like patch 2 in case (1) above.
> 
> Assuming that by "random numbers" you actually mean "pseudo-random
> numbers", then this doesn't apply.

Indeed.

>> 4) If the hypercall returns entropy (same as virtio-rng), the same
>> considerations on speed apply.  If you can only produce entropy at say 1
>> MB/s (so reading 8 bytes take 8 microseconds---which is actually very
>> fast), it doesn't matter that much to spend 7 microseconds on a
>> userspace roundtrip.  It's going to be only half the speed of bare
>> metal, not 100 times slower.
> 
> 8 bytes takes at most 1 microsecond, so the round-trip to userspace is
> definitely noticeable.

Thanks.  Any chance you can give some numbers of a kernel hypercall and
a userspace hypercall on Power, so we have actual data?  For example a
hypercall that returns H_PARAMETER as soon as possible.

Paolo
Benjamin Herrenschmidt Oct. 2, 2013, 9:06 a.m. UTC | #16
On Wed, 2013-10-02 at 10:46 +0200, Paolo Bonzini wrote:

> 
> Thanks.  Any chance you can give some numbers of a kernel hypercall and
> a userspace hypercall on Power, so we have actual data?  For example a
> hypercall that returns H_PARAMETER as soon as possible.

I don't have (yet) numbers at hand but we have basically 3 places where
we can handle hypercalls:

 - Kernel real mode. This is where most of our MMU stuff goes for
example unless it needs to trigger a page fault in Linux. This is
executed with translation disabled and the MMU still in guest context.
This is the fastest path since we don't take out the other threads nor
perform any expensive context change. This is where we put the
"accelerated" H_RANDOM as well.

 - Kernel virtual mode. That's a full exit, so all threads are out and
MMU switched back to host Linux. Things like vhost MMIO emulation goes
there, page faults, etc...

 - Qemu. This adds the round trip to userspace on top of the above.

Cheers,
Ben.
Alexander Graf Oct. 2, 2013, 9:11 a.m. UTC | #17
On 02.10.2013, at 11:06, Benjamin Herrenschmidt wrote:

> On Wed, 2013-10-02 at 10:46 +0200, Paolo Bonzini wrote:
> 
>> 
>> Thanks.  Any chance you can give some numbers of a kernel hypercall and
>> a userspace hypercall on Power, so we have actual data?  For example a
>> hypercall that returns H_PARAMETER as soon as possible.
> 
> I don't have (yet) numbers at hand but we have basically 3 places where
> we can handle hypercalls:
> 
> - Kernel real mode. This is where most of our MMU stuff goes for
> example unless it needs to trigger a page fault in Linux. This is
> executed with translation disabled and the MMU still in guest context.
> This is the fastest path since we don't take out the other threads nor
> perform any expensive context change. This is where we put the
> "accelerated" H_RANDOM as well.
> 
> - Kernel virtual mode. That's a full exit, so all threads are out and
> MMU switched back to host Linux. Things like vhost MMIO emulation goes
> there, page faults, etc...
> 
> - Qemu. This adds the round trip to userspace on top of the above.

Right, and the difference for the patch in question is really whether we handle it in kernel virtual mode or in QEMU, so the bulk of the overhead (kicking threads out of guest context, switching MMU context, etc) happens either way.

So the additional overhead when handling it in QEMU here really boils down to the user space roundtrip (plus another random number read roundtrip).


Alex
Alexander Graf Oct. 2, 2013, 9:50 a.m. UTC | #18
On 02.10.2013, at 11:11, Alexander Graf wrote:

> 
> On 02.10.2013, at 11:06, Benjamin Herrenschmidt wrote:
> 
>> On Wed, 2013-10-02 at 10:46 +0200, Paolo Bonzini wrote:
>> 
>>> 
>>> Thanks.  Any chance you can give some numbers of a kernel hypercall and
>>> a userspace hypercall on Power, so we have actual data?  For example a
>>> hypercall that returns H_PARAMETER as soon as possible.
>> 
>> I don't have (yet) numbers at hand but we have basically 3 places where
>> we can handle hypercalls:
>> 
>> - Kernel real mode. This is where most of our MMU stuff goes for
>> example unless it needs to trigger a page fault in Linux. This is
>> executed with translation disabled and the MMU still in guest context.
>> This is the fastest path since we don't take out the other threads nor
>> perform any expensive context change. This is where we put the
>> "accelerated" H_RANDOM as well.
>> 
>> - Kernel virtual mode. That's a full exit, so all threads are out and
>> MMU switched back to host Linux. Things like vhost MMIO emulation goes
>> there, page faults, etc...
>> 
>> - Qemu. This adds the round trip to userspace on top of the above.
> 
> Right, and the difference for the patch in question is really whether we handle in in kernel virtual mode or in QEMU, so the bulk of the overhead (kicking threads out of  guest context, switching MMU context, etc) happens either way.
> 
> So the additional overhead when handling it in QEMU here really boils down to the user space roundtrip (plus another random number read roundtrip).

Ah, sorry, I misread the patch. You're running the handler in real mode of course :).

So how do you solve live migration between a kernel that has this patch and one that doesn't?


Alex
Gleb Natapov Oct. 2, 2013, 10:02 a.m. UTC | #19
On Wed, Oct 02, 2013 at 11:50:50AM +0200, Alexander Graf wrote:
> 
> On 02.10.2013, at 11:11, Alexander Graf wrote:
> 
> > 
> > On 02.10.2013, at 11:06, Benjamin Herrenschmidt wrote:
> > 
> >> On Wed, 2013-10-02 at 10:46 +0200, Paolo Bonzini wrote:
> >> 
> >>> 
> >>> Thanks.  Any chance you can give some numbers of a kernel hypercall and
> >>> a userspace hypercall on Power, so we have actual data?  For example a
> >>> hypercall that returns H_PARAMETER as soon as possible.
> >> 
> >> I don't have (yet) numbers at hand but we have basically 3 places where
> >> we can handle hypercalls:
> >> 
> >> - Kernel real mode. This is where most of our MMU stuff goes for
> >> example unless it needs to trigger a page fault in Linux. This is
> >> executed with translation disabled and the MMU still in guest context.
> >> This is the fastest path since we don't take out the other threads nor
> >> perform any expensive context change. This is where we put the
> >> "accelerated" H_RANDOM as well.
> >> 
> >> - Kernel virtual mode. That's a full exit, so all threads are out and
> >> MMU switched back to host Linux. Things like vhost MMIO emulation goes
> >> there, page faults, etc...
> >> 
> >> - Qemu. This adds the round trip to userspace on top of the above.
> > 
> > Right, and the difference for the patch in question is really whether we handle in in kernel virtual mode or in QEMU, so the bulk of the overhead (kicking threads out of  guest context, switching MMU context, etc) happens either way.
> > 
> > So the additional overhead when handling it in QEMU here really boils down to the user space roundtrip (plus another random number read roundtrip).
> 
> Ah, sorry, I misread the patch. You're running the handler in real mode of course :).
> 
> So how do you solve live migration between a kernel that has this patch and one that doesn't?
> 
Yes, I alluded to it in my email to Paul and Paolo asked as well. How is this
interface disabled? Also, the hwrng is MMIO in the host, so why does the guest
need to use a hypercall instead of emulating the device (in kernel or somewhere
else?). Another thing is that on the host the hwrng is protected from
direct userspace access by virtue of being a device, but guest code (even
kernel mode) is userspace as far as the host's security model goes, so by
implementing this hypercall in a way that directly accesses the hwrng you
expose the hwrng to userspace unconditionally. Why is this a good idea?

--
			Gleb.
Michael Ellerman Oct. 2, 2013, 1:57 p.m. UTC | #20
On Wed, 2013-10-02 at 13:02 +0300, Gleb Natapov wrote:
> On Wed, Oct 02, 2013 at 11:50:50AM +0200, Alexander Graf wrote:
> > 
> > On 02.10.2013, at 11:11, Alexander Graf wrote:
> > 
> > So how do you solve live migration between a kernel that has this patch and one that doesn't?
> > 
> Yes, I alluded to it in my email to Paul and Paolo asked also. How this
> interface is disabled? 

Yes that is a valid point.

We can't disable the interface at runtime, the guest detects its
presence at boot.

What will happen is the hcall will come through to QEMU, which will
reject it with H_FUNCTION (~= ENOSYS).

The current pseries-rng driver does not handle that case well, which is
exactly why I sent patches to fix it recently.

The only other option would be to feed it with /dev/random.

> Also hwrnd is MMIO in a host why guest needs to
> use hypercall instead of emulating the device (in kernel or somewhere
> else?). 

Because PAPR is a platform specification and it specifies that the
interface is a hypervisor call. We can't just decide we want to do it
differently.

> Another things is that on a host hwrnd is protected from
> direct userspace access by virtue of been a device, but guest code (event
> kernel mode) is userspace as far as hosts security model goes, so by
> implementing this hypercall in a way that directly access hwrnd you
> expose hwrnd to a userspace unconditionally. Why is this a good idea? 

I'm not sure I follow you.

The hwrng is accessible by host userspace via /dev/mem.

cheers
Alexander Graf Oct. 2, 2013, 2:08 p.m. UTC | #21
On 02.10.2013, at 15:57, Michael Ellerman wrote:

> On Wed, 2013-10-02 at 13:02 +0300, Gleb Natapov wrote:
>> On Wed, Oct 02, 2013 at 11:50:50AM +0200, Alexander Graf wrote:
>>> 
>>> On 02.10.2013, at 11:11, Alexander Graf wrote:
>>> 
>>> So how do you solve live migration between a kernel that has this patch and one that doesn't?
>>> 
>> Yes, I alluded to it in my email to Paul and Paolo asked also. How this
>> interface is disabled? 
> 
> Yes that is a valid point.
> 
> We can't disable the interface at runtime, the guest detects its
> presence at boot.
> 
> What will happen is the hcall will come through to QEMU, which will
> reject it with H_FUNCTION (~= ENOSYS).
> 
> The current pseries-rng driver does not handle that case well, which is
> exactly why I sent patches to fix it recently.
> 
> The only other option would be to feed it with /dev/random.
> 
>> Also hwrnd is MMIO in a host why guest needs to
>> use hypercall instead of emulating the device (in kernel or somewhere
>> else?). 
> 
> Because PAPR is a platform specification and it specifies that the
> interface is a hypervisor call. We can't just decide we want to do it
> differently.
> 
>> Another things is that on a host hwrnd is protected from
>> direct userspace access by virtue of been a device, but guest code (event
>> kernel mode) is userspace as far as hosts security model goes, so by
>> implementing this hypercall in a way that directly access hwrnd you
>> expose hwrnd to a userspace unconditionally. Why is this a good idea? 
> 
> I'm not sure I follow you.
> 
> The hwrng is accessible by host userspace via /dev/mem.

A guest should live on the same permission level as a user space application. If you run QEMU as UID 1000 without access to /dev/mem, why should the guest suddenly be able to directly access a memory location (MMIO) it couldn't access directly through a normal user space interface.

It's basically a layering violation.


Alex
Gleb Natapov Oct. 2, 2013, 2:10 p.m. UTC | #22
On Wed, Oct 02, 2013 at 11:57:55PM +1000, Michael Ellerman wrote:
> On Wed, 2013-10-02 at 13:02 +0300, Gleb Natapov wrote:
> > On Wed, Oct 02, 2013 at 11:50:50AM +0200, Alexander Graf wrote:
> > > 
> > > On 02.10.2013, at 11:11, Alexander Graf wrote:
> > > 
> > > So how do you solve live migration between a kernel that has this patch and one that doesn't?
> > > 
> > Yes, I alluded to it in my email to Paul and Paolo asked also. How this
> > interface is disabled? 
> 
> Yes that is a valid point.
> 
> We can't disable the interface at runtime, the guest detects its
> presence at boot.
> 
> What will happen is the hcall will come through to QEMU, which will
> reject it with H_FUNCTION (~= ENOSYS).
> 
> The current pseries-rng driver does not handle that case well, which is
> exactly why I sent patches to fix it recently.
> 
> The only other option would be to feed it with /dev/random.
> 
What about the other way, if a guest migrates from a kvm that does not have
this hypercall to one that does? We try not to change HW under a guest during
migration.

> > Also hwrnd is MMIO in a host why guest needs to
> > use hypercall instead of emulating the device (in kernel or somewhere
> > else?). 
> 
> Because PAPR is a platform specification and it specifies that the
> interface is a hypervisor call. We can't just decide we want to do it
> differently.
Any insight into why it was specified this way? What is special about the
hwrng device that a hypercall is needed to access it? I get that you didn't
just decide to implement it that way :) Also, what will happen if a guest
finds an emulated hwrng device, will it use it?

> 
> > Another things is that on a host hwrnd is protected from
> > direct userspace access by virtue of been a device, but guest code (event
> > kernel mode) is userspace as far as hosts security model goes, so by
> > implementing this hypercall in a way that directly access hwrnd you
> > expose hwrnd to a userspace unconditionally. Why is this a good idea? 
> 
> I'm not sure I follow you.
> 
> The hwrng is accessible by host userspace via /dev/mem.
> 
A regular user has no access to /dev/mem, but they can start a kvm guest and
gain access to the device.

--
			Gleb.
Paolo Bonzini Oct. 2, 2013, 2:33 p.m. UTC | #23
Il 02/10/2013 16:08, Alexander Graf ha scritto:
> > The hwrng is accessible by host userspace via /dev/mem.
> 
> A guest should live on the same permission level as a user space
> application. If you run QEMU as UID 1000 without access to /dev/mem, why
> should the guest suddenly be able to directly access a memory location
> (MMIO) it couldn't access directly through a normal user space interface.
> 
> It's basically a layering violation.

With Michael's earlier patch in this series, the hwrng is accessible by
host userspace via /dev/hwrng, no?

Paolo
Alexander Graf Oct. 2, 2013, 2:36 p.m. UTC | #24
On 02.10.2013, at 16:33, Paolo Bonzini wrote:

> Il 02/10/2013 16:08, Alexander Graf ha scritto:
>>> The hwrng is accessible by host userspace via /dev/mem.
>> 
>> A guest should live on the same permission level as a user space
>> application. If you run QEMU as UID 1000 without access to /dev/mem, why
>> should the guest suddenly be able to directly access a memory location
>> (MMIO) it couldn't access directly through a normal user space interface.
>> 
>> It's basically a layering violation.
> 
> With Michael's earlier patch in this series, the hwrng is accessible by
> host userspace via /dev/hwrng, no?

Yes, but there's no token from user space that gets passed into the kernel to check whether access is ok or not. So while QEMU may not have permission to open /dev/hwrng, it could spawn a guest that opens it, drains all entropy out of it and thus stalls other processes which try to fetch entropy, no?

Maybe I haven't fully grasped the interface yet though :).


Alex
Gleb Natapov Oct. 2, 2013, 2:37 p.m. UTC | #25
On Wed, Oct 02, 2013 at 04:33:18PM +0200, Paolo Bonzini wrote:
> Il 02/10/2013 16:08, Alexander Graf ha scritto:
> > > The hwrng is accessible by host userspace via /dev/mem.
> > 
> > A guest should live on the same permission level as a user space
> > application. If you run QEMU as UID 1000 without access to /dev/mem, why
> > should the guest suddenly be able to directly access a memory location
> > (MMIO) it couldn't access directly through a normal user space interface.
> > 
> > It's basically a layering violation.
> 
> With Michael's earlier patch in this series, the hwrng is accessible by
> host userspace via /dev/hwrng, no?
> 
Access to which can be controlled by its permissions. The permissions of
/dev/kvm may be different. If we route the hypercall via userspace and
configure qemu to get entropy from /dev/hwrng, everything will fall
together nicely (except performance).

--
			Gleb.
Paolo Bonzini Oct. 2, 2013, 2:38 p.m. UTC | #26
Il 02/10/2013 16:36, Alexander Graf ha scritto:
>> > 
>> > With Michael's earlier patch in this series, the hwrng is accessible by
>> > host userspace via /dev/hwrng, no?
> Yes, but there's not token from user space that gets passed into the
> kernel to check whether access is ok or not. So while QEMU may not have
> permission to open /dev/hwrng it could spawn a guest that opens it,
> drains all entropy out of it and thus stall other processes which try to
> fetch entropy, no?
> 
> Maybe I haven't fully grasped the interface yet though :).

Yes, that's right.  I don't think it's a huge problem, but it's another
point in favor of just doing the hypercall in userspace.

Paolo
Benjamin Herrenschmidt Oct. 2, 2013, 9:58 p.m. UTC | #27
On Wed, 2013-10-02 at 11:11 +0200, Alexander Graf wrote:

> Right, and the difference for the patch in question is really whether
> we handle in in kernel virtual mode or in QEMU, so the bulk of the
> overhead (kicking threads out of  guest context, switching MMU
> context, etc) happens either way.
> 
> So the additional overhead when handling it in QEMU here really boils
> down to the user space roundtrip (plus another random number read
> roundtrip).

Hrm, Michael had a real mode version ... not sure where it went then.

Cheers,
Ben.
Benjamin Herrenschmidt Oct. 2, 2013, 10:02 p.m. UTC | #28
On Wed, 2013-10-02 at 13:02 +0300, Gleb Natapov wrote:

> Yes, I alluded to it in my email to Paul and Paolo asked also. How this
> interface is disabled? Also hwrnd is MMIO in a host why guest needs to
> use hypercall instead of emulating the device (in kernel or somewhere
> else?).

Migration will have to be dealt with one way or another, I suppose we
will indeed need a qemu fallback.

As for why hypercall instead of MMIO, well, you'd have to ask the folks
who wrote the PAPR spec :-) It's specified as a hypercall and
implemented as such in pHyp (PowerVM). The existing guests expect it
that way.

It might have to do with the required whitening done by the hypervisor
(H_RANDOM output is supposed to be clean). It also abstracts us from the
underlying HW implementation which could in theory change.
 
>  Another things is that on a host hwrnd is protected from
> direct userspace access by virtue of been a device, but guest code (event
> kernel mode) is userspace as far as hosts security model goes, so by
> implementing this hypercall in a way that directly access hwrnd you
> expose hwrnd to a userspace unconditionally. Why is this a good idea? 

Why would this be a bad idea ?

Ben.

> --
> 			Gleb.
> --
> To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
Benjamin Herrenschmidt Oct. 2, 2013, 10:07 p.m. UTC | #29
On Wed, 2013-10-02 at 13:02 +0300, Gleb Natapov wrote:

> Yes, I alluded to it in my email to Paul and Paolo asked also. How is
> this interface disabled? Also the hwrng is MMIO on the host, so why does
> the guest need to use a hypercall instead of emulating the device (in the
> kernel or somewhere else)? Another thing is that on the host the hwrng is
> protected from direct userspace access by virtue of being a device, but
> guest code (even kernel mode) is userspace as far as the host's security
> model goes, so by implementing this hypercall in a way that directly
> accesses the hwrng you expose the hwrng to userspace unconditionally. Why
> is this a good idea?

BTW. Is this always going to be like this ?

Every *single* architectural or design decision we make for our
architecture has to be justified 30 times over, every piece of code
bike-shedded to oblivion for months, etc... ?

Do we always have to finally get to some kind of agreement on design, go
to the 6 month bike-shedding phase, just to have somebody else come up
and start re-questioning the whole original design (without any
understanding of our specific constraints of course) ?

You guys are the most horrendous community I have ever got to work with.
It's simply impossible to get anything done in any reasonable time
frame.

At this stage, it would have taken us an order of magnitude less time to
simply rewrite an entire hypervisor from scratch.

This is sad.

Ben.
Benjamin Herrenschmidt Oct. 2, 2013, 10:13 p.m. UTC | #30
On Wed, 2013-10-02 at 16:08 +0200, Alexander Graf wrote:
> A guest should live on the same permission level as a user space
> application. If you run QEMU as UID 1000 without access to /dev/mem,
> why should the guest suddenly be able to directly access a memory
> location (MMIO) it couldn't access directly through a normal user
> space interface.
> 
> It's basically a layering violation.

Guys, please stop with the academic non-sense !

Ben.
Benjamin Herrenschmidt Oct. 2, 2013, 10:15 p.m. UTC | #31
On Wed, 2013-10-02 at 17:10 +0300, Gleb Natapov wrote:
> > The hwrng is accessible by host userspace via /dev/mem.
> > 
> A regular user has no access to /dev/mem, but he can start a kvm guest
> and gain access to the device.

Seriously. You guys are really trying hard to make our life hell or
what ? That discussion about access permissions makes no sense
whatsoever. Please stop.

Ben.
Benjamin Herrenschmidt Oct. 2, 2013, 10:21 p.m. UTC | #32
On Wed, 2013-10-02 at 17:37 +0300, Gleb Natapov wrote:
> On Wed, Oct 02, 2013 at 04:33:18PM +0200, Paolo Bonzini wrote:
> > Il 02/10/2013 16:08, Alexander Graf ha scritto:
> > > > The hwrng is accessible by host userspace via /dev/mem.
> > > 
> > > A guest should live on the same permission level as a user space
> > > application. If you run QEMU as UID 1000 without access to /dev/mem, why
> > > should the guest suddenly be able to directly access a memory location
> > > (MMIO) it couldn't access directly through a normal user space interface.
> > > 
> > > It's basically a layering violation.
> > 
> > With Michael's earlier patch in this series, the hwrng is accessible by
> > host userspace via /dev/hwrng, no?
> > 
> Access to which can be controlled by its permission. Permission of
> /dev/kvm may be different. If we route hypercall via userspace and
> configure qemu to get entropy from /dev/hwrng everything will fall
> nicely together (except performance).

Yes, except abysmal performance and a lot more code for something
completely and utterly pointless .... nice.

Ben.
Paul Mackerras Oct. 2, 2013, 10:45 p.m. UTC | #33
On Wed, Oct 02, 2013 at 04:36:05PM +0200, Alexander Graf wrote:
> 
> On 02.10.2013, at 16:33, Paolo Bonzini wrote:
> 
> > Il 02/10/2013 16:08, Alexander Graf ha scritto:
> >>> The hwrng is accessible by host userspace via /dev/mem.
> >> 
> >> A guest should live on the same permission level as a user space
> >> application. If you run QEMU as UID 1000 without access to /dev/mem, why
> >> should the guest suddenly be able to directly access a memory location
> >> (MMIO) it couldn't access directly through a normal user space interface.
> >> 
> >> It's basically a layering violation.
> > 
> > With Michael's earlier patch in this series, the hwrng is accessible by
> > host userspace via /dev/hwrng, no?
> 
> Yes, but there's no token from user space that gets passed into the kernel to check whether access is ok or not. So while QEMU may not have permission to open /dev/hwrng it could spawn a guest that opens it, drains all entropy out of it and thus stalls other processes which try to fetch entropy, no?

Even if you drain all entropy out of it, wait 64 microseconds and it
will be full again. :)  Basically it produces 64 bits every
microsecond and puts that in a 64 entry x 64-bit FIFO buffer, which is
what is read by the MMIO.  So there is no danger of stalling other
processes for any significant amount of time.

Paul.
Gleb Natapov Oct. 3, 2013, 5:43 a.m. UTC | #34
On Thu, Oct 03, 2013 at 08:02:20AM +1000, Benjamin Herrenschmidt wrote:
> On Wed, 2013-10-02 at 13:02 +0300, Gleb Natapov wrote:
> 
> > Yes, I alluded to it in my email to Paul and Paolo asked also. How is
> > this interface disabled? Also the hwrng is MMIO on the host, so why does
> > the guest need to use a hypercall instead of emulating the device (in
> > the kernel or somewhere else)?
> 
> Migration will have to be dealt with one way or another, I suppose we
> will indeed need a qemu fallback.
> 
So at least from an amount-of-code perspective the userspace and kernel-space
solutions are on par, since the latter will require the former anyway.

What about the other direction? Migration from a KVM that does not have the
hypercall to one that does? We try not to change HW under a guest.

> As for why hypercall instead of MMIO, well, you'd have to ask the folks
> who wrote the PAPR spec :-) It's specified as a hypercall and
> implemented as such in pHyp (PowerVM). The existing guests expect it
> that way.
OK.

> 
> It might have to do with the required whitening done by the hypervisor
> (H_RANDOM output is supposed to be clean). It also abstracts us from the
> underlying HW implementation which could in theory change.
>  
But I guess a bare metal OS already has to know how to do whitening and
deal with HW changes. Anyway this is not the place to discuss PAPR
decisions. Thanks for the insights.

> >  Another thing is that on the host the hwrng is protected from
> > direct userspace access by virtue of being a device, but guest code (even
> > kernel mode) is userspace as far as the host's security model goes, so by
> > implementing this hypercall in a way that directly accesses the hwrng you
> > expose the hwrng to userspace unconditionally. Why is this a good idea?
> 
> Why would this be a bad idea ?
> 
When you elevate a privilege you need to explain why it is not harmful
and why the privilege was restricted in the first place. The /dev/hwrng
that the first patch created gives access only to root; this patch extends
that to whoever can create a guest.

Why can it be a bad idea? A user can drain the hwrng continuously, making
other users of it much slower, or even worse, making them fall back to
another, much less reliable, source of entropy.

--
			Gleb.
Gleb Natapov Oct. 3, 2013, 5:48 a.m. UTC | #35
On Thu, Oct 03, 2013 at 08:45:42AM +1000, Paul Mackerras wrote:
> On Wed, Oct 02, 2013 at 04:36:05PM +0200, Alexander Graf wrote:
> > 
> > On 02.10.2013, at 16:33, Paolo Bonzini wrote:
> > 
> > > Il 02/10/2013 16:08, Alexander Graf ha scritto:
> > >>> The hwrng is accessible by host userspace via /dev/mem.
> > >> 
> > >> A guest should live on the same permission level as a user space
> > >> application. If you run QEMU as UID 1000 without access to /dev/mem, why
> > >> should the guest suddenly be able to directly access a memory location
> > >> (MMIO) it couldn't access directly through a normal user space interface.
> > >> 
> > >> It's basically a layering violation.
> > > 
> > > With Michael's earlier patch in this series, the hwrng is accessible by
> > > host userspace via /dev/hwrng, no?
> > 
> > Yes, but there's no token from user space that gets passed into the kernel to check whether access is ok or not. So while QEMU may not have permission to open /dev/hwrng it could spawn a guest that opens it, drains all entropy out of it and thus stalls other processes which try to fetch entropy, no?
> 
> Even if you drain all entropy out of it, wait 64 microseconds and it
> will be full again. :)  Basically it produces 64 bits every
> microsecond and puts that in a 64 entry x 64-bit FIFO buffer, which is
> what is read by the MMIO.  So there is no danger of stalling other
> processes for any significant amount of time.
> 
Even if a user creates 100s of guests, each of which reads the hwrng in a loop?

--
			Gleb.
Gleb Natapov Oct. 3, 2013, 6:08 a.m. UTC | #36
On Thu, Oct 03, 2013 at 08:21:20AM +1000, Benjamin Herrenschmidt wrote:
> On Wed, 2013-10-02 at 17:37 +0300, Gleb Natapov wrote:
> > On Wed, Oct 02, 2013 at 04:33:18PM +0200, Paolo Bonzini wrote:
> > > Il 02/10/2013 16:08, Alexander Graf ha scritto:
> > > > > The hwrng is accessible by host userspace via /dev/mem.
> > > > 
> > > > A guest should live on the same permission level as a user space
> > > > application. If you run QEMU as UID 1000 without access to /dev/mem, why
> > > > should the guest suddenly be able to directly access a memory location
> > > > (MMIO) it couldn't access directly through a normal user space interface.
> > > > 
> > > > It's basically a layering violation.
> > > 
> > > With Michael's earlier patch in this series, the hwrng is accessible by
> > > host userspace via /dev/hwrng, no?
> > > 
> > Access to which can be controlled by its permission. Permission of
> > /dev/kvm may be different. If we route hypercall via userspace and
> > configure qemu to get entropy from /dev/hwrng everything will fall
> > nicely together (except performance).
> 
> Yes, except abysmal performance and a lot more code for something
> completely and utterly pointless .... nice.
> 
Pointless? You yourself said that a fallback to userspace will be required
for migration, so the code has to be there regardless. As for the abysmal
performance, that is what you have repeatedly refused to prove. All you
said is that an exit to userspace is expensive; we all know that, it is
slow for all arches and all devices implemented in userspace, but we do
not move all of them to the kernel. We do move some, the most performance
critical, so all you need to show is that for a typical guest workload
having the device in the kernel speeds things up measurably. Why not do
that instead of writing rude emails?

--
			Gleb.
Gleb Natapov Oct. 3, 2013, 6:28 a.m. UTC | #37
On Thu, Oct 03, 2013 at 08:07:22AM +1000, Benjamin Herrenschmidt wrote:
> On Wed, 2013-10-02 at 13:02 +0300, Gleb Natapov wrote:
> 
> > Yes, I alluded to it in my email to Paul and Paolo asked also. How is
> > this interface disabled? Also the hwrng is MMIO on the host, so why does
> > the guest need to use a hypercall instead of emulating the device (in the
> > kernel or somewhere else)? Another thing is that on the host the hwrng is
> > protected from direct userspace access by virtue of being a device, but
> > guest code (even kernel mode) is userspace as far as the host's security
> > model goes, so by implementing this hypercall in a way that directly
> > accesses the hwrng you expose the hwrng to userspace unconditionally. Why
> > is this a good idea?
> 
> BTW. Is this always going to be like this ?
> 
If something questionable is noticed, an explanation will be required.
It is like that for all arches and all parts of the kernel.

> Every *single* architectural or design decision we make for our
> architecture has to be justified 30 times over, every piece of code
> bike-shedded to oblivion for months, etc... ?
> 
This is simply not true: most powerpc patches go in without any comments.

> Do we always have to finally get to some kind of agreement on design, go
> to the 6 month bike-shedding phase, just to have somebody else come up
> and start re-questioning the whole original design (without any
> understanding of our specific constraints of course) ?
> 
Do you really think that nobody here understands that exit to userspace
is slow?

> You guys are the most horrendous community I have ever got to work with.
> It's simply impossible to get anything done in any reasonable time
> frame .
> 
> At this stage, it would have taken us an order of magnitude less time to
> simply rewrite an entire hypervisor from scratch.
Of course, it is always much easier to ignore other people's input and do
everything your way. Why listen to people who have dealt with migration
issues for many years if you can commit the patch and forget about it until
migration fails? But who cares, you got there in an order of magnitude
less time and that is what counts.

> 
> This is sad.
> 
Agreed.

--
			Gleb.
Benjamin Herrenschmidt Oct. 3, 2013, 7:22 a.m. UTC | #38
On Thu, 2013-10-03 at 08:43 +0300, Gleb Natapov wrote:
> Why can it be a bad idea? A user can drain the hwrng continuously, making
> other users of it much slower, or even worse, making them fall back to
> another, much less reliable, source of entropy.

Not in a very significant way, we generate entropy at 1MHz after all, which
is pretty good.

Cheers,
Ben.
Paul Mackerras Oct. 3, 2013, 10:06 a.m. UTC | #39
On Thu, Oct 03, 2013 at 08:48:03AM +0300, Gleb Natapov wrote:
> On Thu, Oct 03, 2013 at 08:45:42AM +1000, Paul Mackerras wrote:
> > On Wed, Oct 02, 2013 at 04:36:05PM +0200, Alexander Graf wrote:
> > > 
> > > On 02.10.2013, at 16:33, Paolo Bonzini wrote:
> > > 
> > > > Il 02/10/2013 16:08, Alexander Graf ha scritto:
> > > >>> The hwrng is accessible by host userspace via /dev/mem.
> > > >> 
> > > >> A guest should live on the same permission level as a user space
> > > >> application. If you run QEMU as UID 1000 without access to /dev/mem, why
> > > >> should the guest suddenly be able to directly access a memory location
> > > >> (MMIO) it couldn't access directly through a normal user space interface.
> > > >> 
> > > >> It's basically a layering violation.
> > > > 
> > > > With Michael's earlier patch in this series, the hwrng is accessible by
> > > > host userspace via /dev/hwrng, no?
> > > 
> > > Yes, but there's no token from user space that gets passed into the kernel to check whether access is ok or not. So while QEMU may not have permission to open /dev/hwrng it could spawn a guest that opens it, drains all entropy out of it and thus stalls other processes which try to fetch entropy, no?
> > 
> > Even if you drain all entropy out of it, wait 64 microseconds and it
> > will be full again. :)  Basically it produces 64 bits every
> > microsecond and puts that in a 64 entry x 64-bit FIFO buffer, which is
> > what is read by the MMIO.  So there is no danger of stalling other
> > processes for any significant amount of time.
> > 
> Even if a user creates 100s of guests, each of which reads the hwrng in a loop?

Well, you can't actually have more guests running than there are cores
in a system.  POWER7+ has one RNG per chip and 8 cores per chip, each
of which can run 4 threads (which have to be in the same guest).

Michael's code uses the RNG on the same chip.  Worst case therefore is
32 threads accessing the same RNG, so a given thread might have to
wait up to 32 microseconds for its data.

Paul.
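
Putting Paul's numbers together (one 64-bit FIFO entry generated per
microsecond, a 64-entry FIFO, and at most 32 hardware threads sharing the
RNG on a chip), the arithmetic below is purely illustrative and is not code
from the series:

#include <stdio.h>

int main(void)
{
	const unsigned int entries_per_usec = 1;	/* one 64-bit entry per microsecond */
	const unsigned int fifo_entries     = 64;	/* 64 entry x 64-bit FIFO */
	const unsigned int threads_per_rng  = 32;	/* 8 cores x 4 threads per chip */

	/* Time to refill a completely drained FIFO. */
	printf("full refill: %u us\n", fifo_entries / entries_per_usec);

	/* Worst case a reader waits for one entry behind every other thread. */
	printf("worst-case wait: %u us\n", threads_per_rng / entries_per_usec);

	return 0;
}
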
Gleb Natapov Oct. 3, 2013, 12:08 p.m. UTC | #40
On Thu, Oct 03, 2013 at 08:06:30PM +1000, Paul Mackerras wrote:
> On Thu, Oct 03, 2013 at 08:48:03AM +0300, Gleb Natapov wrote:
> > On Thu, Oct 03, 2013 at 08:45:42AM +1000, Paul Mackerras wrote:
> > > On Wed, Oct 02, 2013 at 04:36:05PM +0200, Alexander Graf wrote:
> > > > 
> > > > On 02.10.2013, at 16:33, Paolo Bonzini wrote:
> > > > 
> > > > > Il 02/10/2013 16:08, Alexander Graf ha scritto:
> > > > >>> The hwrng is accessible by host userspace via /dev/mem.
> > > > >> 
> > > > >> A guest should live on the same permission level as a user space
> > > > >> application. If you run QEMU as UID 1000 without access to /dev/mem, why
> > > > >> should the guest suddenly be able to directly access a memory location
> > > > >> (MMIO) it couldn't access directly through a normal user space interface.
> > > > >> 
> > > > >> It's basically a layering violation.
> > > > > 
> > > > > With Michael's earlier patch in this series, the hwrng is accessible by
> > > > > host userspace via /dev/hwrng, no?
> > > > 
> > > > Yes, but there's no token from user space that gets passed into the kernel to check whether access is ok or not. So while QEMU may not have permission to open /dev/hwrng it could spawn a guest that opens it, drains all entropy out of it and thus stalls other processes which try to fetch entropy, no?
> > > 
> > > Even if you drain all entropy out of it, wait 64 microseconds and it
> > > will be full again. :)  Basically it produces 64 bits every
> > > microsecond and puts that in a 64 entry x 64-bit FIFO buffer, which is
> > > what is read by the MMIO.  So there is no danger of stalling other
> > > processes for any significant amount of time.
> > > 
> > Even if a user creates 100s of guests, each of which reads the hwrng in a loop?
> 
> Well, you can't actually have more guests running than there are cores
> in a system.  POWER7+ has one RNG per chip and 8 cores per chip, each
> of which can run 4 threads (which have to be in the same guest).
> 
> Michael's code uses the RNG on the same chip.  Worst case therefore is
> 32 threads accessing the same RNG, so a given thread might have to
> wait up to 32 microseconds for its data.
> 
OK, thanks. Even if it becomes an issue for some reason it is always possible
to rate-limit it.

--
			Gleb.

Patch

diff --git a/arch/powerpc/include/asm/archrandom.h b/arch/powerpc/include/asm/archrandom.h
index d853d16..86296d5 100644
--- a/arch/powerpc/include/asm/archrandom.h
+++ b/arch/powerpc/include/asm/archrandom.h
@@ -25,8 +25,15 @@  static inline int arch_get_random_int(unsigned int *v)
 	return rc;
 }
 
-int powernv_get_random_long(unsigned long *v);
-
 #endif /* CONFIG_ARCH_RANDOM */
 
+#ifdef CONFIG_PPC_POWERNV
+int powernv_hwrng_present(void);
+int powernv_get_random_long(unsigned long *v);
+int powernv_get_random_real_mode(unsigned long *v);
+#else
+static inline int powernv_hwrng_present(void) { return 0; }
+static inline int powernv_get_random_real_mode(unsigned long *v) { return 0; }
+#endif
+
 #endif /* _ASM_POWERPC_ARCHRANDOM_H */
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index b15554a..51966fd 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -177,6 +177,8 @@  extern int kvmppc_xics_get_xive(struct kvm *kvm, u32 irq, u32 *server,
 extern int kvmppc_xics_int_on(struct kvm *kvm, u32 irq);
 extern int kvmppc_xics_int_off(struct kvm *kvm, u32 irq);
 
+extern int kvmppc_hwrng_present(void);
+
 /*
  * Cuts out inst bits with ordering according to spec.
  * That means the leftmost bit is zero. All given bits are included.
diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c
index 8cd0dae..de12592 100644
--- a/arch/powerpc/kvm/book3s_hv_builtin.c
+++ b/arch/powerpc/kvm/book3s_hv_builtin.c
@@ -19,6 +19,7 @@ 
 #include <asm/cputable.h>
 #include <asm/kvm_ppc.h>
 #include <asm/kvm_book3s.h>
+#include <asm/archrandom.h>
 
 #include "book3s_hv_cma.h"
 /*
@@ -181,3 +182,17 @@  void __init kvm_cma_reserve(void)
 		kvm_cma_declare_contiguous(selected_size, align_size);
 	}
 }
+
+int kvmppc_hwrng_present(void)
+{
+	return powernv_hwrng_present();
+}
+EXPORT_SYMBOL_GPL(kvmppc_hwrng_present);
+
+long kvmppc_h_random(struct kvm_vcpu *vcpu)
+{
+	if (powernv_get_random_real_mode(&vcpu->arch.gpr[4]))
+		return H_SUCCESS;
+
+	return H_HARDWARE;
+}
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index 294b7af..35ce59e 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -1502,6 +1502,125 @@  hcall_real_table:
 	.long	0		/* 0x11c */
 	.long	0		/* 0x120 */
 	.long	.kvmppc_h_bulk_remove - hcall_real_table
+	.long	0		/* 0x128 */
+	.long	0		/* 0x12c */
+	.long	0		/* 0x130 */
+	.long	0		/* 0x134 */
+	.long	0		/* 0x138 */
+	.long	0		/* 0x13c */
+	.long	0		/* 0x140 */
+	.long	0		/* 0x144 */
+	.long	0		/* 0x148 */
+	.long	0		/* 0x14c */
+	.long	0		/* 0x150 */
+	.long	0		/* 0x154 */
+	.long	0		/* 0x158 */
+	.long	0		/* 0x15c */
+	.long	0		/* 0x160 */
+	.long	0		/* 0x164 */
+	.long	0		/* 0x168 */
+	.long	0		/* 0x16c */
+	.long	0		/* 0x170 */
+	.long	0		/* 0x174 */
+	.long	0		/* 0x178 */
+	.long	0		/* 0x17c */
+	.long	0		/* 0x180 */
+	.long	0		/* 0x184 */
+	.long	0		/* 0x188 */
+	.long	0		/* 0x18c */
+	.long	0		/* 0x190 */
+	.long	0		/* 0x194 */
+	.long	0		/* 0x198 */
+	.long	0		/* 0x19c */
+	.long	0		/* 0x1a0 */
+	.long	0		/* 0x1a4 */
+	.long	0		/* 0x1a8 */
+	.long	0		/* 0x1ac */
+	.long	0		/* 0x1b0 */
+	.long	0		/* 0x1b4 */
+	.long	0		/* 0x1b8 */
+	.long	0		/* 0x1bc */
+	.long	0		/* 0x1c0 */
+	.long	0		/* 0x1c4 */
+	.long	0		/* 0x1c8 */
+	.long	0		/* 0x1cc */
+	.long	0		/* 0x1d0 */
+	.long	0		/* 0x1d4 */
+	.long	0		/* 0x1d8 */
+	.long	0		/* 0x1dc */
+	.long	0		/* 0x1e0 */
+	.long	0		/* 0x1e4 */
+	.long	0		/* 0x1e8 */
+	.long	0		/* 0x1ec */
+	.long	0		/* 0x1f0 */
+	.long	0		/* 0x1f4 */
+	.long	0		/* 0x1f8 */
+	.long	0		/* 0x1fc */
+	.long	0		/* 0x200 */
+	.long	0		/* 0x204 */
+	.long	0		/* 0x208 */
+	.long	0		/* 0x20c */
+	.long	0		/* 0x210 */
+	.long	0		/* 0x214 */
+	.long	0		/* 0x218 */
+	.long	0		/* 0x21c */
+	.long	0		/* 0x220 */
+	.long	0		/* 0x224 */
+	.long	0		/* 0x228 */
+	.long	0		/* 0x22c */
+	.long	0		/* 0x230 */
+	.long	0		/* 0x234 */
+	.long	0		/* 0x238 */
+	.long	0		/* 0x23c */
+	.long	0		/* 0x240 */
+	.long	0		/* 0x244 */
+	.long	0		/* 0x248 */
+	.long	0		/* 0x24c */
+	.long	0		/* 0x250 */
+	.long	0		/* 0x254 */
+	.long	0		/* 0x258 */
+	.long	0		/* 0x25c */
+	.long	0		/* 0x260 */
+	.long	0		/* 0x264 */
+	.long	0		/* 0x268 */
+	.long	0		/* 0x26c */
+	.long	0		/* 0x270 */
+	.long	0		/* 0x274 */
+	.long	0		/* 0x278 */
+	.long	0		/* 0x27c */
+	.long	0		/* 0x280 */
+	.long	0		/* 0x284 */
+	.long	0		/* 0x288 */
+	.long	0		/* 0x28c */
+	.long	0		/* 0x290 */
+	.long	0		/* 0x294 */
+	.long	0		/* 0x298 */
+	.long	0		/* 0x29c */
+	.long	0		/* 0x2a0 */
+	.long	0		/* 0x2a4 */
+	.long	0		/* 0x2a8 */
+	.long	0		/* 0x2ac */
+	.long	0		/* 0x2b0 */
+	.long	0		/* 0x2b4 */
+	.long	0		/* 0x2b8 */
+	.long	0		/* 0x2bc */
+	.long	0		/* 0x2c0 */
+	.long	0		/* 0x2c4 */
+	.long	0		/* 0x2c8 */
+	.long	0		/* 0x2cc */
+	.long	0		/* 0x2d0 */
+	.long	0		/* 0x2d4 */
+	.long	0		/* 0x2d8 */
+	.long	0		/* 0x2dc */
+	.long	0		/* 0x2e0 */
+	.long	0		/* 0x2e4 */
+	.long	0		/* 0x2e8 */
+	.long	0		/* 0x2ec */
+	.long	0		/* 0x2f0 */
+	.long	0		/* 0x2f4 */
+	.long	0		/* 0x2f8 */
+	.long	0		/* 0x2fc */
+	.long	.kvmppc_h_random - hcall_real_table
 hcall_real_table_end:
 
 ignore_hdec:
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 07c0106..0d7208e 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -356,6 +356,9 @@  int kvm_dev_ioctl_check_extension(long ext)
 		if (cpu_has_feature(CPU_FTR_ARCH_201))
 			r = 2;
 		break;
+	case KVM_CAP_PPC_HWRNG:
+		r = kvmppc_hwrng_present();
+		break;
 #endif
 	case KVM_CAP_SYNC_MMU:
 #ifdef CONFIG_KVM_BOOK3S_64_HV
diff --git a/arch/powerpc/platforms/powernv/rng.c b/arch/powerpc/platforms/powernv/rng.c
index 51d2e6a..ea7e5cd 100644
--- a/arch/powerpc/platforms/powernv/rng.c
+++ b/arch/powerpc/platforms/powernv/rng.c
@@ -20,12 +20,18 @@ 
 
 struct powernv_rng {
 	void __iomem *regs;
+	void __iomem *regs_real;
 	unsigned long mask;
 };
 
 static DEFINE_PER_CPU(struct powernv_rng *, powernv_rng);
 
 
+int powernv_hwrng_present(void)
+{
+	return __raw_get_cpu_var(powernv_rng) != NULL;
+}
+
 static unsigned long rng_whiten(struct powernv_rng *rng, unsigned long val)
 {
 	unsigned long parity;
@@ -42,6 +48,17 @@  static unsigned long rng_whiten(struct powernv_rng *rng, unsigned long val)
 	return val;
 }
 
+int powernv_get_random_real_mode(unsigned long *v)
+{
+	struct powernv_rng *rng;
+
+	rng = __raw_get_cpu_var(powernv_rng);
+
+	*v = rng_whiten(rng, in_rm64(rng->regs_real));
+
+	return 1;
+}
+
 int powernv_get_random_long(unsigned long *v)
 {
 	struct powernv_rng *rng;
@@ -76,12 +93,20 @@  static __init void rng_init_per_cpu(struct powernv_rng *rng,
 static __init int rng_create(struct device_node *dn)
 {
 	struct powernv_rng *rng;
+	struct resource res;
 	unsigned long val;
 
 	rng = kzalloc(sizeof(*rng), GFP_KERNEL);
 	if (!rng)
 		return -ENOMEM;
 
+	if (of_address_to_resource(dn, 0, &res)) {
+		kfree(rng);
+		return -ENXIO;
+	}
+
+	rng->regs_real = (void __iomem *)res.start;
+
 	rng->regs = of_iomap(dn, 0);
 	if (!rng->regs) {
 		kfree(rng);
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 99c2533..493a409 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -668,6 +668,7 @@  struct kvm_ppc_smmu_info {
 #define KVM_CAP_IRQ_XICS 92
 #define KVM_CAP_ARM_EL1_32BIT 93
 #define KVM_CAP_SPAPR_MULTITCE 94
+#define KVM_CAP_PPC_HWRNG 95
 
 #ifdef KVM_CAP_IRQ_ROUTING
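
As a usage note on the capability the patch adds (assuming a kernel with
this series applied, so that KVM_CAP_PPC_HWRNG exists in <linux/kvm.h>),
userspace would probe it with the standard KVM_CHECK_EXTENSION ioctl; a
minimal sketch:

#include <stdio.h>
#include <fcntl.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

int main(void)
{
	int kvm = open("/dev/kvm", O_RDWR);
	if (kvm < 0)
		return 1;

	/* Non-zero means the host found a hwrng and handles H_RANDOM in real mode. */
	int has_hwrng = ioctl(kvm, KVM_CHECK_EXTENSION, KVM_CAP_PPC_HWRNG);
	printf("KVM_CAP_PPC_HWRNG: %s\n", has_hwrng > 0 ? "present" : "absent");

	return 0;
}
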