diff mbox

[1/4] KVM: PPC: Add support for multiple-TCE hcalls

Message ID 1370412673-1345-2-git-send-email-aik@ozlabs.ru
State New, archived
Headers show

Commit Message

Alexey Kardashevskiy June 5, 2013, 6:11 a.m. UTC
This adds real mode handlers for the H_PUT_TCE_INDIRECT and
H_STUFF_TCE hypercalls for QEMU emulated devices such as IBMVIO
devices or emulated PCI.  These calls allow adding multiple entries
(up to 512) into the TCE table in one call which saves time on
transition to/from real mode.

This adds a tce_tmp cache to kvm_vcpu_arch to save valid TCEs
(copied from user and verified) before writing the whole list into
the TCE table. This cache will be utilized more in the upcoming
VFIO/IOMMU support to continue TCE list processing in virtual
mode in case the real mode handler fails for some reason.

This adds a guest physical to host real address converter
and calls the existing H_PUT_TCE handler. The converting function
is going to be fully utilized by upcoming VFIO supporting patches.

This also implements the KVM_CAP_SPAPR_MULTITCE capability,
so in order to support the functionality of this patch, QEMU
needs to query for this capability and set the "hcall-multi-tce"
hypertas property only if the capability is present, otherwise
there will be serious performance degradation.

Cc: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Signed-off-by: Paul Mackerras <paulus@samba.org>

---
Changelog:
2013/06/05:
* fixed typo about IBMVIO in the commit message
* updated doc and moved it to another section
* changed capability number

2013/05/21:
* added kvm_vcpu_arch::tce_tmp
* removed cleanup if put_indirect failed; instead we do not even start
writing to the TCE table if we cannot get TCEs from the user or they are
invalid
* kvmppc_emulated_h_put_tce is split to kvmppc_emulated_put_tce
and kvmppc_emulated_validate_tce (for the previous item)
* fixed bug with fallthrough for H_IPI
* removed all get_user() from real mode handlers
* kvmppc_lookup_pte() added (instead of making lookup_linux_pte public)
---
 Documentation/virtual/kvm/api.txt       |   17 ++
 arch/powerpc/include/asm/kvm_host.h     |    2 +
 arch/powerpc/include/asm/kvm_ppc.h      |   16 +-
 arch/powerpc/kvm/book3s_64_vio.c        |  118 ++++++++++++++
 arch/powerpc/kvm/book3s_64_vio_hv.c     |  266 +++++++++++++++++++++++++++----
 arch/powerpc/kvm/book3s_hv.c            |   39 +++++
 arch/powerpc/kvm/book3s_hv_rmhandlers.S |    6 +
 arch/powerpc/kvm/book3s_pr_papr.c       |   37 ++++-
 arch/powerpc/kvm/powerpc.c              |    3 +
 include/uapi/linux/kvm.h                |    1 +
 10 files changed, 473 insertions(+), 32 deletions(-)

Comments

Benjamin Herrenschmidt June 16, 2013, 4:20 a.m. UTC | #1
On Wed, 2013-06-05 at 16:11 +1000, Alexey Kardashevskiy wrote:
> This adds real mode handlers for the H_PUT_TCE_INDIRECT and
> H_STUFF_TCE hypercalls for QEMU emulated devices such as IBMVIO
> devices or emulated PCI.  These calls allow adding multiple entries
> (up to 512) into the TCE table in one call which saves time on
> transition to/from real mode.
> 
> This adds a tce_tmp cache to kvm_vcpu_arch to save valid TCEs
> (copied from user and verified) before writing the whole list into
> the TCE table. This cache will be utilized more in the upcoming
> VFIO/IOMMU support to continue TCE list processing in the virtual
> mode in the case if the real mode handler failed for some reason.
> 
> This adds a guest physical to host real address converter
> and calls the existing H_PUT_TCE handler. The converting function
> is going to be fully utilized by upcoming VFIO supporting patches.
> 
> This also implements the KVM_CAP_PPC_MULTITCE capability,
> so in order to support the functionality of this patch, QEMU
> needs to query for this capability and set the "hcall-multi-tce"
> hypertas property only if the capability is present, otherwise
> there will be serious performance degradation.
> 
> Cc: David Gibson <david@gibson.dropbear.id.au>
> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> Signed-off-by: Paul Mackerras <paulus@samba.org>
> 
> ---
> Changelog:
> 2013/06/05:
> * fixed mistype about IBMVIO in the commit message
> * updated doc and moved it to another section
> * changed capability number
> 
> 2013/05/21:
> * added kvm_vcpu_arch::tce_tmp
> * removed cleanup if put_indirect failed, instead we do not even start
> writing to TCE table if we cannot get TCEs from the user and they are
> invalid
> * kvmppc_emulated_h_put_tce is split to kvmppc_emulated_put_tce
> and kvmppc_emulated_validate_tce (for the previous item)
> * fixed bug with failthrough for H_IPI
> * removed all get_user() from real mode handlers
> * kvmppc_lookup_pte() added (instead of making lookup_linux_pte public)
> ---
>  Documentation/virtual/kvm/api.txt       |   17 ++
>  arch/powerpc/include/asm/kvm_host.h     |    2 +
>  arch/powerpc/include/asm/kvm_ppc.h      |   16 +-
>  arch/powerpc/kvm/book3s_64_vio.c        |  118 ++++++++++++++
>  arch/powerpc/kvm/book3s_64_vio_hv.c     |  266 +++++++++++++++++++++++++++----
>  arch/powerpc/kvm/book3s_hv.c            |   39 +++++
>  arch/powerpc/kvm/book3s_hv_rmhandlers.S |    6 +
>  arch/powerpc/kvm/book3s_pr_papr.c       |   37 ++++-
>  arch/powerpc/kvm/powerpc.c              |    3 +
>  include/uapi/linux/kvm.h                |    1 +
>  10 files changed, 473 insertions(+), 32 deletions(-)
> 
> diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
> index 5f91eda..6c082ff 100644
> --- a/Documentation/virtual/kvm/api.txt
> +++ b/Documentation/virtual/kvm/api.txt
> @@ -2362,6 +2362,23 @@ calls by the guest for that service will be passed to userspace to be
>  handled.
>  
> 
> +4.83 KVM_CAP_PPC_MULTITCE
> +
> +Capability: KVM_CAP_PPC_MULTITCE
> +Architectures: ppc
> +Type: vm
> +
> +This capability tells the guest that multiple TCE entry add/remove hypercalls
> +handling is supported by the kernel. This significanly accelerates DMA
> +operations for PPC KVM guests.
> +
> +Unlike other capabilities in this section, this one does not have an ioctl.
> +Instead, when the capability is present, the H_PUT_TCE_INDIRECT and
> +H_STUFF_TCE hypercalls are to be handled in the host kernel and not passed to
> +the guest. Othwerwise it might be better for the guest to continue using H_PUT_TCE
> +hypercall (if KVM_CAP_SPAPR_TCE or KVM_CAP_SPAPR_TCE_IOMMU are present).
> +
> +
>  5. The kvm_run structure
>  ------------------------
>  
> diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
> index af326cd..85d8f26 100644
> --- a/arch/powerpc/include/asm/kvm_host.h
> +++ b/arch/powerpc/include/asm/kvm_host.h
> @@ -609,6 +609,8 @@ struct kvm_vcpu_arch {
>  	spinlock_t tbacct_lock;
>  	u64 busy_stolen;
>  	u64 busy_preempt;
> +
> +	unsigned long *tce_tmp;    /* TCE cache for TCE_PUT_INDIRECT hall */
>  #endif
>  };
>  
> diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
> index a5287fe..e852921b 100644
> --- a/arch/powerpc/include/asm/kvm_ppc.h
> +++ b/arch/powerpc/include/asm/kvm_ppc.h
> @@ -133,8 +133,20 @@ extern int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu);
>  
>  extern long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
>  				struct kvm_create_spapr_tce *args);
> -extern long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
> -			     unsigned long ioba, unsigned long tce);
> +extern struct kvmppc_spapr_tce_table *kvmppc_find_tce_table(
> +		struct kvm_vcpu *vcpu, unsigned long liobn);
> +extern long kvmppc_emulated_validate_tce(unsigned long tce);
> +extern void kvmppc_emulated_put_tce(struct kvmppc_spapr_tce_table *tt,
> +		unsigned long ioba, unsigned long tce);
> +extern long kvmppc_virtmode_h_put_tce(struct kvm_vcpu *vcpu,
> +		unsigned long liobn, unsigned long ioba,
> +		unsigned long tce);
> +extern long kvmppc_virtmode_h_put_tce_indirect(struct kvm_vcpu *vcpu,
> +		unsigned long liobn, unsigned long ioba,
> +		unsigned long tce_list, unsigned long npages);
> +extern long kvmppc_virtmode_h_stuff_tce(struct kvm_vcpu *vcpu,
> +		unsigned long liobn, unsigned long ioba,
> +		unsigned long tce_value, unsigned long npages);
>  extern long kvm_vm_ioctl_allocate_rma(struct kvm *kvm,
>  				struct kvm_allocate_rma *rma);
>  extern struct kvmppc_linear_info *kvm_alloc_rma(void);
> diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c
> index b2d3f3b..06b7b20 100644
> --- a/arch/powerpc/kvm/book3s_64_vio.c
> +++ b/arch/powerpc/kvm/book3s_64_vio.c
> @@ -14,6 +14,7 @@
>   *
>   * Copyright 2010 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
>   * Copyright 2011 David Gibson, IBM Corporation <dwg@au1.ibm.com>
> + * Copyright 2013 Alexey Kardashevskiy, IBM Corporation <aik@au1.ibm.com>
>   */
>  
>  #include <linux/types.h>
> @@ -36,8 +37,11 @@
>  #include <asm/ppc-opcode.h>
>  #include <asm/kvm_host.h>
>  #include <asm/udbg.h>
> +#include <asm/iommu.h>
> +#include <asm/tce.h>
>  
>  #define TCES_PER_PAGE	(PAGE_SIZE / sizeof(u64))
> +#define ERROR_ADDR      ((void *)~(unsigned long)0x0)
>  
>  static long kvmppc_stt_npages(unsigned long window_size)
>  {
> @@ -148,3 +152,117 @@ fail:
>  	}
>  	return ret;
>  }
> +
> +/* Converts guest physical address into host virtual */
> +static void __user *kvmppc_virtmode_gpa_to_hva(struct kvm_vcpu *vcpu,
> +		unsigned long gpa)
> +{
> +	unsigned long hva, gfn = gpa >> PAGE_SHIFT;
> +	struct kvm_memory_slot *memslot;
> +
> +	memslot = search_memslots(kvm_memslots(vcpu->kvm), gfn);
> +	if (!memslot)
> +		return ERROR_ADDR;
> +
> +	hva = __gfn_to_hva_memslot(memslot, gfn) + (gpa & ~PAGE_MASK);
> +	return (void *) hva;
> +}
> +
> +long kvmppc_virtmode_h_put_tce(struct kvm_vcpu *vcpu,
> +		unsigned long liobn, unsigned long ioba,
> +		unsigned long tce)
> +{
> +	long ret;
> +	struct kvmppc_spapr_tce_table *tt;
> +
> +	tt = kvmppc_find_tce_table(vcpu, liobn);
> +	/* Didn't find the liobn, put it to userspace */
> +	if (!tt)
> +		return H_TOO_HARD;
> +
> +	/* Emulated IO */
> +	if (ioba >= tt->window_size)
> +		return H_PARAMETER;
> +
> +	ret = kvmppc_emulated_validate_tce(tce);
> +	if (ret)
> +		return ret;
> +
> +	kvmppc_emulated_put_tce(tt, ioba, tce);
> +
> +	return H_SUCCESS;
> +}
> +
> +long kvmppc_virtmode_h_put_tce_indirect(struct kvm_vcpu *vcpu,
> +		unsigned long liobn, unsigned long ioba,
> +		unsigned long tce_list, unsigned long npages)
> +{
> +	struct kvmppc_spapr_tce_table *tt;
> +	long i, ret;
> +	unsigned long __user *tces;
> +
> +	tt = kvmppc_find_tce_table(vcpu, liobn);
> +	/* Didn't find the liobn, put it to userspace */
> +	if (!tt)
> +		return H_TOO_HARD;
> +
> +	/*
> +	 * The spec says that the maximum size of the list is 512 TCEs so
> +	 * so the whole table addressed resides in 4K page
> +	 */
> +	if (npages > 512)
> +		return H_PARAMETER;
> +
> +	if (tce_list & ~IOMMU_PAGE_MASK)
> +		return H_PARAMETER;
> +
> +	tces = kvmppc_virtmode_gpa_to_hva(vcpu, tce_list);
> +	if (tces == ERROR_ADDR)
> +		return H_TOO_HARD;
> +
> +	/* Emulated IO */
> +	if ((ioba + (npages << IOMMU_PAGE_SHIFT)) > tt->window_size)
> +		return H_PARAMETER;
> +
> +	for (i = 0; i < npages; ++i) {
> +		if (get_user(vcpu->arch.tce_tmp[i], tces + i))
> +			return H_PARAMETER;
> +
> +		ret = kvmppc_emulated_validate_tce(vcpu->arch.tce_tmp[i]);
> +		if (ret)
> +			return ret;
> +	}
> +
> +	for (i = 0; i < npages; ++i)
> +		kvmppc_emulated_put_tce(tt,
> +				ioba + (i << IOMMU_PAGE_SHIFT),
> +				vcpu->arch.tce_tmp[i]);
> +
> +	return H_SUCCESS;
> +}
> +
> +long kvmppc_virtmode_h_stuff_tce(struct kvm_vcpu *vcpu,
> +		unsigned long liobn, unsigned long ioba,
> +		unsigned long tce_value, unsigned long npages)
> +{
> +	struct kvmppc_spapr_tce_table *tt;
> +	long i, ret;
> +
> +	tt = kvmppc_find_tce_table(vcpu, liobn);
> +	/* Didn't find the liobn, put it to userspace */
> +	if (!tt)
> +		return H_TOO_HARD;
> +
> +	/* Emulated IO */
> +	if ((ioba + (npages << IOMMU_PAGE_SHIFT)) > tt->window_size)
> +		return H_PARAMETER;
> +
> +	ret = kvmppc_emulated_validate_tce(tce_value);
> +	if (ret || (tce_value & (TCE_PCI_WRITE | TCE_PCI_READ)))
> +		return H_PARAMETER;
> +
> +	for (i = 0; i < npages; ++i, ioba += IOMMU_PAGE_SIZE)
> +		kvmppc_emulated_put_tce(tt, ioba, tce_value);
> +
> +	return H_SUCCESS;
> +}
> diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c
> index 30c2f3b..c68d538 100644
> --- a/arch/powerpc/kvm/book3s_64_vio_hv.c
> +++ b/arch/powerpc/kvm/book3s_64_vio_hv.c
> @@ -14,6 +14,7 @@
>   *
>   * Copyright 2010 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
>   * Copyright 2011 David Gibson, IBM Corporation <dwg@au1.ibm.com>
> + * Copyright 2013 Alexey Kardashevskiy, IBM Corporation <aik@au1.ibm.com>
>   */
>  
>  #include <linux/types.h>
> @@ -35,42 +36,249 @@
>  #include <asm/ppc-opcode.h>
>  #include <asm/kvm_host.h>
>  #include <asm/udbg.h>
> +#include <asm/iommu.h>
> +#include <asm/tce.h>
>  
>  #define TCES_PER_PAGE	(PAGE_SIZE / sizeof(u64))
> +#define ERROR_ADDR      (~(unsigned long)0x0)
>  
> -/* WARNING: This will be called in real-mode on HV KVM and virtual
> - *          mode on PR KVM
> +/* Finds a TCE table descriptor by LIOBN */
> +struct kvmppc_spapr_tce_table *kvmppc_find_tce_table(struct kvm_vcpu *vcpu,
> +		unsigned long liobn)
> +{
> +	struct kvmppc_spapr_tce_table *tt;
> +
> +	list_for_each_entry(tt, &vcpu->kvm->arch.spapr_tce_tables, list) {
> +		if (tt->liobn == liobn)
> +			return tt;
> +	}
> +
> +	return NULL;
> +}
> +EXPORT_SYMBOL_GPL(kvmppc_find_tce_table);
> +
> +/*
> + * Validate TCE address.
> + * At the moment only flags are validated
> + * as other check will significantly slow down
> + * or can make it even impossible to handle TCE requests
> + * in real mode.
> + */
> +long kvmppc_emulated_validate_tce(unsigned long tce)
> +{
> +	if (tce & ~(IOMMU_PAGE_MASK | TCE_PCI_WRITE | TCE_PCI_READ))
> +		return H_PARAMETER;
> +
> +	return H_SUCCESS;
> +}
> +EXPORT_SYMBOL_GPL(kvmppc_emulated_validate_tce);
> +
> +/*
> + * kvmppc_emulated_put_tce() handles TCE requests for devices emulated
> + * by QEMU. It puts guest TCE values into the table and expects
> + * the QEMU to convert them later in the QEMU device implementation.
> + * Wiorks in both real and virtual modes.
> + * It cannot fail so kvmppc_emulated_validate_tce must be called before it.
>   */
> +void kvmppc_emulated_put_tce(struct kvmppc_spapr_tce_table *tt,
> +		unsigned long ioba, unsigned long tce)
> +{
> +	unsigned long idx = ioba >> SPAPR_TCE_SHIFT;
> +	struct page *page;
> +	u64 *tbl;
> +
> +	/*
> +	 * Note on the use of page_address() in real mode,
> +	 *
> +	 * It is safe to use page_address() in real mode on ppc64 because
> +	 * page_address() is always defined as lowmem_page_address()
> +	 * which returns __va(PFN_PHYS(page_to_pfn(page))) which is arithmetial
> +	 * operation and does not access page struct.
> +	 *
> +	 * Theoretically page_address() could be defined different
> +	 * but either WANT_PAGE_VIRTUAL or HASHED_PAGE_VIRTUAL
> +	 * should be enabled.
> +	 * WANT_PAGE_VIRTUAL is never enabled on ppc32/ppc64,
> +	 * HASHED_PAGE_VIRTUAL could be enabled for ppc32 only and only
> +	 * if CONFIG_HIGHMEM is defined. As CONFIG_SPARSEMEM_VMEMMAP
> +	 * is not expected to be enabled on ppc32, page_address()
> +	 * is safe for ppc32 as well.
> +	 */
> +#if defined(HASHED_PAGE_VIRTUAL) || defined(WANT_PAGE_VIRTUAL)
> +#error TODO: fix to avoid page_address() here
> +#endif
> +	page = tt->pages[idx / TCES_PER_PAGE];
> +	tbl = (u64 *)page_address(page);
> +
> +	/* udbg_printf("tce @ %p\n", &tbl[idx % TCES_PER_PAGE]); */
> +	tbl[idx % TCES_PER_PAGE] = tce;
> +}
> +EXPORT_SYMBOL_GPL(kvmppc_emulated_put_tce);
> +
> +#ifdef CONFIG_KVM_BOOK3S_64_HV
> +
> +static pte_t kvmppc_lookup_pte(pgd_t *pgdir, unsigned long hva, bool writing,
> +			unsigned long *pte_sizep)
> +{
> +	pte_t *ptep;
> +	unsigned int shift = 0;
> +	pte_t pte, tmp, ret;
> +
> +	ptep = find_linux_pte_or_hugepte(pgdir, hva, &shift);
> +	if (!ptep)
> +		return __pte(0);
> +	if (shift)
> +		*pte_sizep = 1ul << shift;
> +	else
> +		*pte_sizep = PAGE_SIZE;
> +
> +	if (!pte_present(*ptep))
> +		return __pte(0);
> +
> +	/* wait until _PAGE_BUSY is clear then set it atomically */
> +	__asm__ __volatile__ (
> +		"1:	ldarx	%0,0,%3\n"
> +		"	andi.	%1,%0,%4\n"
> +		"	bne-	1b\n"
> +		"	ori	%1,%0,%4\n"
> +		"	stdcx.	%1,0,%3\n"
> +		"	bne-	1b"
> +		: "=&r" (pte), "=&r" (tmp), "=m" (*ptep)
> +		: "r" (ptep), "i" (_PAGE_BUSY)
> +		: "cc");
> +
> +	ret = pte;
> +
> +	return ret;
> +}

The test for pte_present() needs to be done again after you lock the PTE
since potentially you could have raced with an invalidation.

More worrisome: You set _PAGE_BUSY above (lock the PTE). But when do you
clear it?

It looks like you rely on _PAGE_BUSY being set to protect yourself
against any concurrent invalidation (or other change to the PTE) while
you access the underlying page. This is *somewhat* ok, though frowned
upon since you end up locking the PTE a lot longer (in real mode) than
we normally do, but in any case, you need to ensure that you release
that lock.

Also you must *not* continue using the resulting physical address after
releasing the lock since the page might be invalidated, freed, swapped out,
etc. at any point once you clear _PAGE_BUSY.

It might be better to use the MMU notifiers here to catch concurrent
invalidations rather than locking the PTE for a long time. If a
concurrent invalidation happens, just return TOO_HARD.

> +
> +/*
> + * Converts guest physical address into host physical address.
> + * Also returns pte and page size if the page is present in page table.
> + */
> +static unsigned long kvmppc_realmode_gpa_to_hpa(struct kvm_vcpu *vcpu,
> +		unsigned long gpa)
> +{
> +	struct kvm_memory_slot *memslot;
> +	pte_t pte;
> +	unsigned long hva, hpa, pg_size = 0, offset;
> +	unsigned long gfn = gpa >> PAGE_SHIFT;
> +	bool writing = gpa & TCE_PCI_WRITE;
> +
> +	/* Find a KVM memslot */
> +	memslot = search_memslots(kvm_memslots(vcpu->kvm), gfn);
> +	if (!memslot)
> +		return ERROR_ADDR;
> +
> +	/* Convert guest physical address to host virtual */
> +	hva = __gfn_to_hva_memslot(memslot, gfn);
> +
> +	/* Find a PTE and determine the size */
> +	pte = kvmppc_lookup_pte(vcpu->arch.pgdir, hva,
> +			writing, &pg_size);
> +	if (!pte)
> +		return ERROR_ADDR;
> +
> +	/* Calculate host phys address keeping flags and offset in the page */
> +	offset = gpa & (pg_size - 1);
> +
> +	/* pte_pfn(pte) should return an address aligned to pg_size */
> +	hpa = (pte_pfn(pte) << PAGE_SHIFT) + offset;
> +
> +	return hpa;
> +}

Do you ever test whether the page protection on the PTE allows for
access? At the moment you only use that to read from TCEs, so the chances
that this is wrong are slim (you wouldn't have PROT_NONE on TCE tables),
but it's still a worry to have code like that.

Also you do not set _PAGE_ACCESSED either, which means the VM doesn't
know the page is being accessed. Not necessarily a huge deal in this
specific case, but still.

>  long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
>  		      unsigned long ioba, unsigned long tce)
>  {
> -	struct kvm *kvm = vcpu->kvm;
> -	struct kvmppc_spapr_tce_table *stt;
> -
> -	/* udbg_printf("H_PUT_TCE(): liobn=0x%lx ioba=0x%lx, tce=0x%lx\n", */
> -	/* 	    liobn, ioba, tce); */
> -
> -	list_for_each_entry(stt, &kvm->arch.spapr_tce_tables, list) {
> -		if (stt->liobn == liobn) {
> -			unsigned long idx = ioba >> SPAPR_TCE_SHIFT;
> -			struct page *page;
> -			u64 *tbl;
> -
> -			/* udbg_printf("H_PUT_TCE: liobn 0x%lx => stt=%p  window_size=0x%x\n", */
> -			/* 	    liobn, stt, stt->window_size); */
> -			if (ioba >= stt->window_size)
> -				return H_PARAMETER;
> -
> -			page = stt->pages[idx / TCES_PER_PAGE];
> -			tbl = (u64 *)page_address(page);
> -
> -			/* FIXME: Need to validate the TCE itself */
> -			/* udbg_printf("tce @ %p\n", &tbl[idx % TCES_PER_PAGE]); */
> -			tbl[idx % TCES_PER_PAGE] = tce;
> -			return H_SUCCESS;
> -		}
> +	long ret;
> +	struct kvmppc_spapr_tce_table *tt;
> +
> +	tt = kvmppc_find_tce_table(vcpu, liobn);
> +	/* Didn't find the liobn, put it to virtual space */
> +	if (!tt)
> +		return H_TOO_HARD;
> +
> +	/* Emulated IO */
> +	if (ioba >= tt->window_size)
> +		return H_PARAMETER;
> +
> +	ret = kvmppc_emulated_validate_tce(tce);
> +	if (ret)
> +		return ret;
> +
> +	kvmppc_emulated_put_tce(tt, ioba, tce);
> +
> +	return H_SUCCESS;
> +}
> +
> +long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu,
> +		unsigned long liobn, unsigned long ioba,
> +		unsigned long tce_list,	unsigned long npages)
> +{
> +	struct kvmppc_spapr_tce_table *tt;
> +	long i, ret;
> +	unsigned long *tces;
> +
> +	tt = kvmppc_find_tce_table(vcpu, liobn);
> +	/* Didn't find the liobn, put it to virtual space */
> +	if (!tt)
> +		return H_TOO_HARD;
> +
> +	/*
> +	 * The spec says that the maximum size of the list is 512 TCEs so
> +	 * so the whole table addressed resides in 4K page
> +	 */
> +	if (npages > 512)
> +		return H_PARAMETER;
> +
> +	if (tce_list & ~IOMMU_PAGE_MASK)
> +		return H_PARAMETER;
> +
> +	tces = (unsigned long *) kvmppc_realmode_gpa_to_hpa(vcpu, tce_list);
> +	if ((unsigned long)tces == ERROR_ADDR)
> +		return H_TOO_HARD;
> +
> +	/* Emulated IO */
> +	if ((ioba + (npages << IOMMU_PAGE_SHIFT)) > tt->window_size)
> +		return H_PARAMETER;
> +
> +	for (i = 0; i < npages; ++i) {
> +		ret = kvmppc_emulated_validate_tce(tces[i]);
> +		if (ret)
> +			return ret;
>  	}
>  
> -	/* Didn't find the liobn, punt it to userspace */
> -	return H_TOO_HARD;
> +	for (i = 0; i < npages; ++i)
> +		kvmppc_emulated_put_tce(tt, ioba + (i << IOMMU_PAGE_SHIFT),
> +				tces[i]);
> +
> +	return H_SUCCESS;
> +}
> +
> +long kvmppc_h_stuff_tce(struct kvm_vcpu *vcpu,
> +		unsigned long liobn, unsigned long ioba,
> +		unsigned long tce_value, unsigned long npages)
> +{
> +	struct kvmppc_spapr_tce_table *tt;
> +	long i, ret;
> +
> +	tt = kvmppc_find_tce_table(vcpu, liobn);
> +	/* Didn't find the liobn, put it to virtual space */
> +	if (!tt)
> +		return H_TOO_HARD;
> +
> +	/* Emulated IO */
> +	if ((ioba + (npages << IOMMU_PAGE_SHIFT)) > tt->window_size)
> +		return H_PARAMETER;
> +
> +	ret = kvmppc_emulated_validate_tce(tce_value);
> +	if (ret || (tce_value & (TCE_PCI_WRITE | TCE_PCI_READ)))
> +		return H_PARAMETER;
> +
> +	for (i = 0; i < npages; ++i, ioba += IOMMU_PAGE_SIZE)
> +		kvmppc_emulated_put_tce(tt, ioba, tce_value);
> +
> +	return H_SUCCESS;
>  }
> +#endif /* CONFIG_KVM_BOOK3S_64_HV */
> diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
> index 550f592..a39039a 100644
> --- a/arch/powerpc/kvm/book3s_hv.c
> +++ b/arch/powerpc/kvm/book3s_hv.c
> @@ -568,6 +568,30 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
>  			ret = kvmppc_xics_hcall(vcpu, req);
>  			break;
>  		} /* fallthrough */
> +		return RESUME_HOST;
> +	case H_PUT_TCE:
> +		ret = kvmppc_virtmode_h_put_tce(vcpu, kvmppc_get_gpr(vcpu, 4),
> +						kvmppc_get_gpr(vcpu, 5),
> +						kvmppc_get_gpr(vcpu, 6));
> +		if (ret == H_TOO_HARD)
> +			return RESUME_HOST;
> +		break;
> +	case H_PUT_TCE_INDIRECT:
> +		ret = kvmppc_virtmode_h_put_tce_indirect(vcpu, kvmppc_get_gpr(vcpu, 4),
> +						kvmppc_get_gpr(vcpu, 5),
> +						kvmppc_get_gpr(vcpu, 6),
> +						kvmppc_get_gpr(vcpu, 7));
> +		if (ret == H_TOO_HARD)
> +			return RESUME_HOST;
> +		break;
> +	case H_STUFF_TCE:
> +		ret = kvmppc_virtmode_h_stuff_tce(vcpu, kvmppc_get_gpr(vcpu, 4),
> +						kvmppc_get_gpr(vcpu, 5),
> +						kvmppc_get_gpr(vcpu, 6),
> +						kvmppc_get_gpr(vcpu, 7));
> +		if (ret == H_TOO_HARD)
> +			return RESUME_HOST;
> +		break;
>  	default:
>  		return RESUME_HOST;
>  	}
> @@ -958,6 +982,20 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
>  	vcpu->arch.cpu_type = KVM_CPU_3S_64;
>  	kvmppc_sanity_check(vcpu);
>  
> +	/*
> +	 * As we want to minimize the chance of having H_PUT_TCE_INDIRECT
> +	 * half executed, we first read TCEs from the user, check them and
> +	 * return error if something went wrong and only then put TCEs into
> +	 * the TCE table.
> +	 *
> +	 * tce_tmp is a cache for TCEs to avoid stack allocation or
> +	 * kmalloc as the whole TCE list can take up to 512 items 8 bytes
> +	 * each (4096 bytes).
> +	 */
> +	vcpu->arch.tce_tmp = kmalloc(4096, GFP_KERNEL);
> +	if (!vcpu->arch.tce_tmp)
> +		goto free_vcpu;
> +
>  	return vcpu;
>  
>  free_vcpu:
> @@ -980,6 +1018,7 @@ void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu)
>  	unpin_vpa(vcpu->kvm, &vcpu->arch.slb_shadow);
>  	unpin_vpa(vcpu->kvm, &vcpu->arch.vpa);
>  	spin_unlock(&vcpu->arch.vpa_update_lock);
> +	kfree(vcpu->arch.tce_tmp);
>  	kvm_vcpu_uninit(vcpu);
>  	kmem_cache_free(kvm_vcpu_cache, vcpu);
>  }
> diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> index b02f91e..d35554e 100644
> --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> @@ -1490,6 +1490,12 @@ hcall_real_table:
>  	.long	0		/* 0x11c */
>  	.long	0		/* 0x120 */
>  	.long	.kvmppc_h_bulk_remove - hcall_real_table
> +	.long	0		/* 0x128 */
> +	.long	0		/* 0x12c */
> +	.long	0		/* 0x130 */
> +	.long	0		/* 0x134 */
> +	.long	.kvmppc_h_stuff_tce - hcall_real_table
> +	.long	.kvmppc_h_put_tce_indirect - hcall_real_table
>  hcall_real_table_end:
>  
>  ignore_hdec:
> diff --git a/arch/powerpc/kvm/book3s_pr_papr.c b/arch/powerpc/kvm/book3s_pr_papr.c
> index da0e0bc..91d4b45 100644
> --- a/arch/powerpc/kvm/book3s_pr_papr.c
> +++ b/arch/powerpc/kvm/book3s_pr_papr.c
> @@ -220,7 +220,38 @@ static int kvmppc_h_pr_put_tce(struct kvm_vcpu *vcpu)
>  	unsigned long tce = kvmppc_get_gpr(vcpu, 6);
>  	long rc;
>  
> -	rc = kvmppc_h_put_tce(vcpu, liobn, ioba, tce);
> +	rc = kvmppc_virtmode_h_put_tce(vcpu, liobn, ioba, tce);
> +	if (rc == H_TOO_HARD)
> +		return EMULATE_FAIL;
> +	kvmppc_set_gpr(vcpu, 3, rc);
> +	return EMULATE_DONE;
> +}
> +
> +static int kvmppc_h_pr_put_tce_indirect(struct kvm_vcpu *vcpu)
> +{
> +	unsigned long liobn = kvmppc_get_gpr(vcpu, 4);
> +	unsigned long ioba = kvmppc_get_gpr(vcpu, 5);
> +	unsigned long tce = kvmppc_get_gpr(vcpu, 6);
> +	unsigned long npages = kvmppc_get_gpr(vcpu, 7);
> +	long rc;
> +
> +	rc = kvmppc_virtmode_h_put_tce_indirect(vcpu, liobn, ioba,
> +			tce, npages);
> +	if (rc == H_TOO_HARD)
> +		return EMULATE_FAIL;
> +	kvmppc_set_gpr(vcpu, 3, rc);
> +	return EMULATE_DONE;
> +}
> +
> +static int kvmppc_h_pr_stuff_tce(struct kvm_vcpu *vcpu)
> +{
> +	unsigned long liobn = kvmppc_get_gpr(vcpu, 4);
> +	unsigned long ioba = kvmppc_get_gpr(vcpu, 5);
> +	unsigned long tce_value = kvmppc_get_gpr(vcpu, 6);
> +	unsigned long npages = kvmppc_get_gpr(vcpu, 7);
> +	long rc;
> +
> +	rc = kvmppc_virtmode_h_stuff_tce(vcpu, liobn, ioba, tce_value, npages);
>  	if (rc == H_TOO_HARD)
>  		return EMULATE_FAIL;
>  	kvmppc_set_gpr(vcpu, 3, rc);
> @@ -247,6 +278,10 @@ int kvmppc_h_pr(struct kvm_vcpu *vcpu, unsigned long cmd)
>  		return kvmppc_h_pr_bulk_remove(vcpu);
>  	case H_PUT_TCE:
>  		return kvmppc_h_pr_put_tce(vcpu);
> +	case H_PUT_TCE_INDIRECT:
> +		return kvmppc_h_pr_put_tce_indirect(vcpu);
> +	case H_STUFF_TCE:
> +		return kvmppc_h_pr_stuff_tce(vcpu);
>  	case H_CEDE:
>  		vcpu->arch.shared->msr |= MSR_EE;
>  		kvm_vcpu_block(vcpu);
> diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
> index 6316ee3..8465c2a 100644
> --- a/arch/powerpc/kvm/powerpc.c
> +++ b/arch/powerpc/kvm/powerpc.c
> @@ -395,6 +395,9 @@ int kvm_dev_ioctl_check_extension(long ext)
>  		r = 1;
>  		break;
>  #endif
> +	case KVM_CAP_SPAPR_MULTITCE:
> +		r = 1;
> +		break;
>  	default:
>  		r = 0;
>  		break;
> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> index a5c86fc..fc0d6b9 100644
> --- a/include/uapi/linux/kvm.h
> +++ b/include/uapi/linux/kvm.h
> @@ -666,6 +666,7 @@ struct kvm_ppc_smmu_info {
>  #define KVM_CAP_IRQ_MPIC 90
>  #define KVM_CAP_PPC_RTAS 91
>  #define KVM_CAP_IRQ_XICS 92
> +#define KVM_CAP_SPAPR_MULTITCE 93
>  
>  #ifdef KVM_CAP_IRQ_ROUTING
>  


--
To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alexander Graf June 16, 2013, 10:06 p.m. UTC | #2
On 05.06.2013, at 08:11, Alexey Kardashevskiy wrote:

> This adds real mode handlers for the H_PUT_TCE_INDIRECT and
> H_STUFF_TCE hypercalls for QEMU emulated devices such as IBMVIO
> devices or emulated PCI.  These calls allow adding multiple entries
> (up to 512) into the TCE table in one call which saves time on
> transition to/from real mode.
> 
> This adds a tce_tmp cache to kvm_vcpu_arch to save valid TCEs
> (copied from user and verified) before writing the whole list into
> the TCE table. This cache will be utilized more in the upcoming
> VFIO/IOMMU support to continue TCE list processing in the virtual
> mode in the case if the real mode handler failed for some reason.
> 
> This adds a guest physical to host real address converter
> and calls the existing H_PUT_TCE handler. The converting function
> is going to be fully utilized by upcoming VFIO supporting patches.
> 
> This also implements the KVM_CAP_PPC_MULTITCE capability,
> so in order to support the functionality of this patch, QEMU
> needs to query for this capability and set the "hcall-multi-tce"
> hypertas property only if the capability is present, otherwise
> there will be serious performance degradation.
> 
> Cc: David Gibson <david@gibson.dropbear.id.au>
> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> Signed-off-by: Paul Mackerras <paulus@samba.org>

Only a few minor nits. Ben already commented on implementation details.

> 
> ---
> Changelog:
> 2013/06/05:
> * fixed mistype about IBMVIO in the commit message
> * updated doc and moved it to another section
> * changed capability number
> 
> 2013/05/21:
> * added kvm_vcpu_arch::tce_tmp
> * removed cleanup if put_indirect failed, instead we do not even start
> writing to TCE table if we cannot get TCEs from the user and they are
> invalid
> * kvmppc_emulated_h_put_tce is split to kvmppc_emulated_put_tce
> and kvmppc_emulated_validate_tce (for the previous item)
> * fixed bug with failthrough for H_IPI
> * removed all get_user() from real mode handlers
> * kvmppc_lookup_pte() added (instead of making lookup_linux_pte public)
> ---
> Documentation/virtual/kvm/api.txt       |   17 ++
> arch/powerpc/include/asm/kvm_host.h     |    2 +
> arch/powerpc/include/asm/kvm_ppc.h      |   16 +-
> arch/powerpc/kvm/book3s_64_vio.c        |  118 ++++++++++++++
> arch/powerpc/kvm/book3s_64_vio_hv.c     |  266 +++++++++++++++++++++++++++----
> arch/powerpc/kvm/book3s_hv.c            |   39 +++++
> arch/powerpc/kvm/book3s_hv_rmhandlers.S |    6 +
> arch/powerpc/kvm/book3s_pr_papr.c       |   37 ++++-
> arch/powerpc/kvm/powerpc.c              |    3 +
> include/uapi/linux/kvm.h                |    1 +
> 10 files changed, 473 insertions(+), 32 deletions(-)
> 
> diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
> index 5f91eda..6c082ff 100644
> --- a/Documentation/virtual/kvm/api.txt
> +++ b/Documentation/virtual/kvm/api.txt
> @@ -2362,6 +2362,23 @@ calls by the guest for that service will be passed to userspace to be
> handled.
> 
> 
> +4.83 KVM_CAP_PPC_MULTITCE
> +
> +Capability: KVM_CAP_PPC_MULTITCE
> +Architectures: ppc
> +Type: vm
> +
> +This capability tells the guest that multiple TCE entry add/remove hypercalls
> +handling is supported by the kernel. This significanly accelerates DMA
> +operations for PPC KVM guests.
> +
> +Unlike other capabilities in this section, this one does not have an ioctl.
> +Instead, when the capability is present, the H_PUT_TCE_INDIRECT and
> +H_STUFF_TCE hypercalls are to be handled in the host kernel and not passed to
> +the guest. Othwerwise it might be better for the guest to continue using H_PUT_TCE
> +hypercall (if KVM_CAP_SPAPR_TCE or KVM_CAP_SPAPR_TCE_IOMMU are present).

While this describes perfectly well what the consequences of the patches are, it does not properly describe what the CAP actually expresses. The CAP only says "this kernel is able to handle H_PUT_TCE_INDIRECT and H_STUFF_TCE hypercalls directly". All other consequences are nice to document, but the semantics of the CAP are missing.

We also usually try to keep KVM behavior unchanged with regards to older versions until a CAP is enabled. In this case I don't think it matters all that much, so I'm fine with declaring it as enabled by default. Please document that this is a change in behavior versus older KVM versions though.

> +
> +
> 5. The kvm_run structure
> ------------------------
> 
> diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
> index af326cd..85d8f26 100644
> --- a/arch/powerpc/include/asm/kvm_host.h
> +++ b/arch/powerpc/include/asm/kvm_host.h
> @@ -609,6 +609,8 @@ struct kvm_vcpu_arch {
> 	spinlock_t tbacct_lock;
> 	u64 busy_stolen;
> 	u64 busy_preempt;
> +
> +	unsigned long *tce_tmp;    /* TCE cache for TCE_PUT_INDIRECT hall */
> #endif
> };

[...]
> 
> 

[...]

> diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
> index 550f592..a39039a 100644
> --- a/arch/powerpc/kvm/book3s_hv.c
> +++ b/arch/powerpc/kvm/book3s_hv.c
> @@ -568,6 +568,30 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
> 			ret = kvmppc_xics_hcall(vcpu, req);
> 			break;
> 		} /* fallthrough */

The fallthrough comment isn't accurate anymore.

> +		return RESUME_HOST;
> +	case H_PUT_TCE:
> +		ret = kvmppc_virtmode_h_put_tce(vcpu, kvmppc_get_gpr(vcpu, 4),
> +						kvmppc_get_gpr(vcpu, 5),
> +						kvmppc_get_gpr(vcpu, 6));
> +		if (ret == H_TOO_HARD)
> +			return RESUME_HOST;
> +		break;
> +	case H_PUT_TCE_INDIRECT:
> +		ret = kvmppc_virtmode_h_put_tce_indirect(vcpu, kvmppc_get_gpr(vcpu, 4),
> +						kvmppc_get_gpr(vcpu, 5),
> +						kvmppc_get_gpr(vcpu, 6),
> +						kvmppc_get_gpr(vcpu, 7));
> +		if (ret == H_TOO_HARD)
> +			return RESUME_HOST;
> +		break;
> +	case H_STUFF_TCE:
> +		ret = kvmppc_virtmode_h_stuff_tce(vcpu, kvmppc_get_gpr(vcpu, 4),
> +						kvmppc_get_gpr(vcpu, 5),
> +						kvmppc_get_gpr(vcpu, 6),
> +						kvmppc_get_gpr(vcpu, 7));
> +		if (ret == H_TOO_HARD)
> +			return RESUME_HOST;
> +		break;
> 	default:
> 		return RESUME_HOST;
> 	}
> @@ -958,6 +982,20 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
> 	vcpu->arch.cpu_type = KVM_CPU_3S_64;
> 	kvmppc_sanity_check(vcpu);
> 
> +	/*
> +	 * As we want to minimize the chance of having H_PUT_TCE_INDIRECT
> +	 * half executed, we first read TCEs from the user, check them and
> +	 * return error if something went wrong and only then put TCEs into
> +	 * the TCE table.
> +	 *
> +	 * tce_tmp is a cache for TCEs to avoid stack allocation or
> +	 * kmalloc as the whole TCE list can take up to 512 items 8 bytes
> +	 * each (4096 bytes).
> +	 */
> +	vcpu->arch.tce_tmp = kmalloc(4096, GFP_KERNEL);
> +	if (!vcpu->arch.tce_tmp)
> +		goto free_vcpu;
> +
> 	return vcpu;
> 
> free_vcpu:
> @@ -980,6 +1018,7 @@ void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu)
> 	unpin_vpa(vcpu->kvm, &vcpu->arch.slb_shadow);
> 	unpin_vpa(vcpu->kvm, &vcpu->arch.vpa);
> 	spin_unlock(&vcpu->arch.vpa_update_lock);
> +	kfree(vcpu->arch.tce_tmp);
> 	kvm_vcpu_uninit(vcpu);
> 	kmem_cache_free(kvm_vcpu_cache, vcpu);
> }
> diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> index b02f91e..d35554e 100644
> --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> @@ -1490,6 +1490,12 @@ hcall_real_table:
> 	.long	0		/* 0x11c */
> 	.long	0		/* 0x120 */
> 	.long	.kvmppc_h_bulk_remove - hcall_real_table
> +	.long	0		/* 0x128 */
> +	.long	0		/* 0x12c */
> +	.long	0		/* 0x130 */
> +	.long	0		/* 0x134 */
> +	.long	.kvmppc_h_stuff_tce - hcall_real_table
> +	.long	.kvmppc_h_put_tce_indirect - hcall_real_table
> hcall_real_table_end:
> 
> ignore_hdec:
> diff --git a/arch/powerpc/kvm/book3s_pr_papr.c b/arch/powerpc/kvm/book3s_pr_papr.c
> index da0e0bc..91d4b45 100644
> --- a/arch/powerpc/kvm/book3s_pr_papr.c
> +++ b/arch/powerpc/kvm/book3s_pr_papr.c
> @@ -220,7 +220,38 @@ static int kvmppc_h_pr_put_tce(struct kvm_vcpu *vcpu)
> 	unsigned long tce = kvmppc_get_gpr(vcpu, 6);
> 	long rc;
> 
> -	rc = kvmppc_h_put_tce(vcpu, liobn, ioba, tce);
> +	rc = kvmppc_virtmode_h_put_tce(vcpu, liobn, ioba, tce);
> +	if (rc == H_TOO_HARD)
> +		return EMULATE_FAIL;
> +	kvmppc_set_gpr(vcpu, 3, rc);
> +	return EMULATE_DONE;
> +}
> +
> +static int kvmppc_h_pr_put_tce_indirect(struct kvm_vcpu *vcpu)
> +{
> +	unsigned long liobn = kvmppc_get_gpr(vcpu, 4);
> +	unsigned long ioba = kvmppc_get_gpr(vcpu, 5);
> +	unsigned long tce = kvmppc_get_gpr(vcpu, 6);
> +	unsigned long npages = kvmppc_get_gpr(vcpu, 7);
> +	long rc;
> +
> +	rc = kvmppc_virtmode_h_put_tce_indirect(vcpu, liobn, ioba,
> +			tce, npages);
> +	if (rc == H_TOO_HARD)
> +		return EMULATE_FAIL;
> +	kvmppc_set_gpr(vcpu, 3, rc);
> +	return EMULATE_DONE;
> +}
> +
> +static int kvmppc_h_pr_stuff_tce(struct kvm_vcpu *vcpu)
> +{
> +	unsigned long liobn = kvmppc_get_gpr(vcpu, 4);
> +	unsigned long ioba = kvmppc_get_gpr(vcpu, 5);
> +	unsigned long tce_value = kvmppc_get_gpr(vcpu, 6);
> +	unsigned long npages = kvmppc_get_gpr(vcpu, 7);
> +	long rc;
> +
> +	rc = kvmppc_virtmode_h_stuff_tce(vcpu, liobn, ioba, tce_value, npages);
> 	if (rc == H_TOO_HARD)
> 		return EMULATE_FAIL;
> 	kvmppc_set_gpr(vcpu, 3, rc);
> @@ -247,6 +278,10 @@ int kvmppc_h_pr(struct kvm_vcpu *vcpu, unsigned long cmd)
> 		return kvmppc_h_pr_bulk_remove(vcpu);
> 	case H_PUT_TCE:
> 		return kvmppc_h_pr_put_tce(vcpu);
> +	case H_PUT_TCE_INDIRECT:
> +		return kvmppc_h_pr_put_tce_indirect(vcpu);
> +	case H_STUFF_TCE:
> +		return kvmppc_h_pr_stuff_tce(vcpu);
> 	case H_CEDE:
> 		vcpu->arch.shared->msr |= MSR_EE;
> 		kvm_vcpu_block(vcpu);
> diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
> index 6316ee3..8465c2a 100644
> --- a/arch/powerpc/kvm/powerpc.c
> +++ b/arch/powerpc/kvm/powerpc.c
> @@ -395,6 +395,9 @@ int kvm_dev_ioctl_check_extension(long ext)
> 		r = 1;
> 		break;
> #endif
> +	case KVM_CAP_SPAPR_MULTITCE:
> +		r = 1;

This should only be true for book3s.



Alex

--
To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alexey Kardashevskiy June 17, 2013, 7:55 a.m. UTC | #3
On 06/17/2013 08:06 AM, Alexander Graf wrote:
> 
> On 05.06.2013, at 08:11, Alexey Kardashevskiy wrote:
> 
>> This adds real mode handlers for the H_PUT_TCE_INDIRECT and
>> H_STUFF_TCE hypercalls for QEMU emulated devices such as IBMVIO
>> devices or emulated PCI.  These calls allow adding multiple entries
>> (up to 512) into the TCE table in one call which saves time on
>> transition to/from real mode.
>>
>> This adds a tce_tmp cache to kvm_vcpu_arch to save valid TCEs
>> (copied from user and verified) before writing the whole list into
>> the TCE table. This cache will be utilized more in the upcoming
>> VFIO/IOMMU support to continue TCE list processing in the virtual
>> mode in the case if the real mode handler failed for some reason.
>>
>> This adds a guest physical to host real address converter
>> and calls the existing H_PUT_TCE handler. The converting function
>> is going to be fully utilized by upcoming VFIO supporting patches.
>>
>> This also implements the KVM_CAP_PPC_MULTITCE capability,
>> so in order to support the functionality of this patch, QEMU
>> needs to query for this capability and set the "hcall-multi-tce"
>> hypertas property only if the capability is present, otherwise
>> there will be serious performance degradation.
>>
>> Cc: David Gibson <david@gibson.dropbear.id.au>
>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>> Signed-off-by: Paul Mackerras <paulus@samba.org>
> 
> Only a few minor nits. Ben already commented on implementation details.
> 
>>
>> ---
>> Changelog:
>> 2013/06/05:
>> * fixed mistype about IBMVIO in the commit message
>> * updated doc and moved it to another section
>> * changed capability number
>>
>> 2013/05/21:
>> * added kvm_vcpu_arch::tce_tmp
>> * removed cleanup if put_indirect failed, instead we do not even start
>> writing to TCE table if we cannot get TCEs from the user and they are
>> invalid
>> * kvmppc_emulated_h_put_tce is split to kvmppc_emulated_put_tce
>> and kvmppc_emulated_validate_tce (for the previous item)
>> * fixed bug with failthrough for H_IPI
>> * removed all get_user() from real mode handlers
>> * kvmppc_lookup_pte() added (instead of making lookup_linux_pte public)
>> ---
>> Documentation/virtual/kvm/api.txt       |   17 ++
>> arch/powerpc/include/asm/kvm_host.h     |    2 +
>> arch/powerpc/include/asm/kvm_ppc.h      |   16 +-
>> arch/powerpc/kvm/book3s_64_vio.c        |  118 ++++++++++++++
>> arch/powerpc/kvm/book3s_64_vio_hv.c     |  266 +++++++++++++++++++++++++++----
>> arch/powerpc/kvm/book3s_hv.c            |   39 +++++
>> arch/powerpc/kvm/book3s_hv_rmhandlers.S |    6 +
>> arch/powerpc/kvm/book3s_pr_papr.c       |   37 ++++-
>> arch/powerpc/kvm/powerpc.c              |    3 +
>> include/uapi/linux/kvm.h                |    1 +
>> 10 files changed, 473 insertions(+), 32 deletions(-)
>>
>> diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
>> index 5f91eda..6c082ff 100644
>> --- a/Documentation/virtual/kvm/api.txt
>> +++ b/Documentation/virtual/kvm/api.txt
>> @@ -2362,6 +2362,23 @@ calls by the guest for that service will be passed to userspace to be
>> handled.
>>
>>
>> +4.83 KVM_CAP_PPC_MULTITCE
>> +
>> +Capability: KVM_CAP_PPC_MULTITCE
>> +Architectures: ppc
>> +Type: vm
>> +
>> +This capability tells the guest that multiple TCE entry add/remove hypercalls
>> +handling is supported by the kernel. This significanly accelerates DMA
>> +operations for PPC KVM guests.
>> +
>> +Unlike other capabilities in this section, this one does not have an ioctl.
>> +Instead, when the capability is present, the H_PUT_TCE_INDIRECT and
>> +H_STUFF_TCE hypercalls are to be handled in the host kernel and not passed to
>> +the guest. Othwerwise it might be better for the guest to continue using H_PUT_TCE
>> +hypercall (if KVM_CAP_SPAPR_TCE or KVM_CAP_SPAPR_TCE_IOMMU are present).
> 

> While this describes perfectly well what the consequences are of the
> patches, it does not describe properly what the CAP actually expresses.
> The CAP only says "this kernel is able to handle H_PUT_TCE_INDIRECT and
> H_STUFF_TCE hypercalls directly". All other consequences are nice to
> document, but the semantics of the CAP are missing.


? It expresses the ability to handle two hcalls. What is missing?


> We also usually try to keep KVM behavior unchanged with regards to older
> versions until a CAP is enabled. In this case I don't think it matters
> all that much, so I'm fine with declaring it as enabled by default.
> Please document that this is a change in behavior versus older KVM
> versions though.


Ok!


>> +
>> +
>> 5. The kvm_run structure
>> ------------------------
>>
>> diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
>> index af326cd..85d8f26 100644
>> --- a/arch/powerpc/include/asm/kvm_host.h
>> +++ b/arch/powerpc/include/asm/kvm_host.h
>> @@ -609,6 +609,8 @@ struct kvm_vcpu_arch {
>> 	spinlock_t tbacct_lock;
>> 	u64 busy_stolen;
>> 	u64 busy_preempt;
>> +
>> +	unsigned long *tce_tmp;    /* TCE cache for TCE_PUT_INDIRECT hall */
>> #endif
>> };
> 
> [...]
>>
>>
> 
> [...]
> 
>> diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
>> index 550f592..a39039a 100644
>> --- a/arch/powerpc/kvm/book3s_hv.c
>> +++ b/arch/powerpc/kvm/book3s_hv.c
>> @@ -568,6 +568,30 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
>> 			ret = kvmppc_xics_hcall(vcpu, req);
>> 			break;
>> 		} /* fallthrough */
> 
> The fallthrough comment isn't accurate anymore.
> 
>> +		return RESUME_HOST;
>> +	case H_PUT_TCE:
>> +		ret = kvmppc_virtmode_h_put_tce(vcpu, kvmppc_get_gpr(vcpu, 4),
>> +						kvmppc_get_gpr(vcpu, 5),
>> +						kvmppc_get_gpr(vcpu, 6));
>> +		if (ret == H_TOO_HARD)
>> +			return RESUME_HOST;
>> +		break;
>> +	case H_PUT_TCE_INDIRECT:
>> +		ret = kvmppc_virtmode_h_put_tce_indirect(vcpu, kvmppc_get_gpr(vcpu, 4),
>> +						kvmppc_get_gpr(vcpu, 5),
>> +						kvmppc_get_gpr(vcpu, 6),
>> +						kvmppc_get_gpr(vcpu, 7));
>> +		if (ret == H_TOO_HARD)
>> +			return RESUME_HOST;
>> +		break;
>> +	case H_STUFF_TCE:
>> +		ret = kvmppc_virtmode_h_stuff_tce(vcpu, kvmppc_get_gpr(vcpu, 4),
>> +						kvmppc_get_gpr(vcpu, 5),
>> +						kvmppc_get_gpr(vcpu, 6),
>> +						kvmppc_get_gpr(vcpu, 7));
>> +		if (ret == H_TOO_HARD)
>> +			return RESUME_HOST;
>> +		break;
>> 	default:
>> 		return RESUME_HOST;
>> 	}
>> @@ -958,6 +982,20 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
>> 	vcpu->arch.cpu_type = KVM_CPU_3S_64;
>> 	kvmppc_sanity_check(vcpu);
>>
>> +	/*
>> +	 * As we want to minimize the chance of having H_PUT_TCE_INDIRECT
>> +	 * half executed, we first read TCEs from the user, check them and
>> +	 * return error if something went wrong and only then put TCEs into
>> +	 * the TCE table.
>> +	 *
>> +	 * tce_tmp is a cache for TCEs to avoid stack allocation or
>> +	 * kmalloc as the whole TCE list can take up to 512 items 8 bytes
>> +	 * each (4096 bytes).
>> +	 */
>> +	vcpu->arch.tce_tmp = kmalloc(4096, GFP_KERNEL);
>> +	if (!vcpu->arch.tce_tmp)
>> +		goto free_vcpu;
>> +
>> 	return vcpu;
>>
>> free_vcpu:
>> @@ -980,6 +1018,7 @@ void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu)
>> 	unpin_vpa(vcpu->kvm, &vcpu->arch.slb_shadow);
>> 	unpin_vpa(vcpu->kvm, &vcpu->arch.vpa);
>> 	spin_unlock(&vcpu->arch.vpa_update_lock);
>> +	kfree(vcpu->arch.tce_tmp);
>> 	kvm_vcpu_uninit(vcpu);
>> 	kmem_cache_free(kvm_vcpu_cache, vcpu);
>> }
>> diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
>> index b02f91e..d35554e 100644
>> --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
>> +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
>> @@ -1490,6 +1490,12 @@ hcall_real_table:
>> 	.long	0		/* 0x11c */
>> 	.long	0		/* 0x120 */
>> 	.long	.kvmppc_h_bulk_remove - hcall_real_table
>> +	.long	0		/* 0x128 */
>> +	.long	0		/* 0x12c */
>> +	.long	0		/* 0x130 */
>> +	.long	0		/* 0x134 */
>> +	.long	.kvmppc_h_stuff_tce - hcall_real_table
>> +	.long	.kvmppc_h_put_tce_indirect - hcall_real_table
>> hcall_real_table_end:
>>
>> ignore_hdec:
>> diff --git a/arch/powerpc/kvm/book3s_pr_papr.c b/arch/powerpc/kvm/book3s_pr_papr.c
>> index da0e0bc..91d4b45 100644
>> --- a/arch/powerpc/kvm/book3s_pr_papr.c
>> +++ b/arch/powerpc/kvm/book3s_pr_papr.c
>> @@ -220,7 +220,38 @@ static int kvmppc_h_pr_put_tce(struct kvm_vcpu *vcpu)
>> 	unsigned long tce = kvmppc_get_gpr(vcpu, 6);
>> 	long rc;
>>
>> -	rc = kvmppc_h_put_tce(vcpu, liobn, ioba, tce);
>> +	rc = kvmppc_virtmode_h_put_tce(vcpu, liobn, ioba, tce);
>> +	if (rc == H_TOO_HARD)
>> +		return EMULATE_FAIL;
>> +	kvmppc_set_gpr(vcpu, 3, rc);
>> +	return EMULATE_DONE;
>> +}
>> +
>> +static int kvmppc_h_pr_put_tce_indirect(struct kvm_vcpu *vcpu)
>> +{
>> +	unsigned long liobn = kvmppc_get_gpr(vcpu, 4);
>> +	unsigned long ioba = kvmppc_get_gpr(vcpu, 5);
>> +	unsigned long tce = kvmppc_get_gpr(vcpu, 6);
>> +	unsigned long npages = kvmppc_get_gpr(vcpu, 7);
>> +	long rc;
>> +
>> +	rc = kvmppc_virtmode_h_put_tce_indirect(vcpu, liobn, ioba,
>> +			tce, npages);
>> +	if (rc == H_TOO_HARD)
>> +		return EMULATE_FAIL;
>> +	kvmppc_set_gpr(vcpu, 3, rc);
>> +	return EMULATE_DONE;
>> +}
>> +
>> +static int kvmppc_h_pr_stuff_tce(struct kvm_vcpu *vcpu)
>> +{
>> +	unsigned long liobn = kvmppc_get_gpr(vcpu, 4);
>> +	unsigned long ioba = kvmppc_get_gpr(vcpu, 5);
>> +	unsigned long tce_value = kvmppc_get_gpr(vcpu, 6);
>> +	unsigned long npages = kvmppc_get_gpr(vcpu, 7);
>> +	long rc;
>> +
>> +	rc = kvmppc_virtmode_h_stuff_tce(vcpu, liobn, ioba, tce_value, npages);
>> 	if (rc == H_TOO_HARD)
>> 		return EMULATE_FAIL;
>> 	kvmppc_set_gpr(vcpu, 3, rc);
>> @@ -247,6 +278,10 @@ int kvmppc_h_pr(struct kvm_vcpu *vcpu, unsigned long cmd)
>> 		return kvmppc_h_pr_bulk_remove(vcpu);
>> 	case H_PUT_TCE:
>> 		return kvmppc_h_pr_put_tce(vcpu);
>> +	case H_PUT_TCE_INDIRECT:
>> +		return kvmppc_h_pr_put_tce_indirect(vcpu);
>> +	case H_STUFF_TCE:
>> +		return kvmppc_h_pr_stuff_tce(vcpu);
>> 	case H_CEDE:
>> 		vcpu->arch.shared->msr |= MSR_EE;
>> 		kvm_vcpu_block(vcpu);
>> diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
>> index 6316ee3..8465c2a 100644
>> --- a/arch/powerpc/kvm/powerpc.c
>> +++ b/arch/powerpc/kvm/powerpc.c
>> @@ -395,6 +395,9 @@ int kvm_dev_ioctl_check_extension(long ext)
>> 		r = 1;
>> 		break;
>> #endif
>> +	case KVM_CAP_SPAPR_MULTITCE:
>> +		r = 1;
> 
> This should only be true for book3s.


We had this discussion with v2.

David:
===
So, in the case of MULTITCE, that's not quite right.  PR KVM can
emulate a PAPR system on a BookE machine, and there's no reason not to
allow TCE acceleration as well.  We can't make it dependent on PAPR
mode being selected, because that's enabled per-vcpu, whereas these
capabilities are queried on the VM before the vcpus are created.
===

Wrong?
Alexander Graf June 17, 2013, 8:02 a.m. UTC | #4
On 17.06.2013, at 09:55, Alexey Kardashevskiy wrote:

> On 06/17/2013 08:06 AM, Alexander Graf wrote:
>> 
>> On 05.06.2013, at 08:11, Alexey Kardashevskiy wrote:
>> 
>>> This adds real mode handlers for the H_PUT_TCE_INDIRECT and
>>> H_STUFF_TCE hypercalls for QEMU emulated devices such as IBMVIO
>>> devices or emulated PCI.  These calls allow adding multiple entries
>>> (up to 512) into the TCE table in one call which saves time on
>>> transition to/from real mode.
>>> 
>>> This adds a tce_tmp cache to kvm_vcpu_arch to save valid TCEs
>>> (copied from user and verified) before writing the whole list into
>>> the TCE table. This cache will be utilized more in the upcoming
>>> VFIO/IOMMU support to continue TCE list processing in the virtual
>>> mode in the case if the real mode handler failed for some reason.
>>> 
>>> This adds a guest physical to host real address converter
>>> and calls the existing H_PUT_TCE handler. The converting function
>>> is going to be fully utilized by upcoming VFIO supporting patches.
>>> 
>>> This also implements the KVM_CAP_PPC_MULTITCE capability,
>>> so in order to support the functionality of this patch, QEMU
>>> needs to query for this capability and set the "hcall-multi-tce"
>>> hypertas property only if the capability is present, otherwise
>>> there will be serious performance degradation.
>>> 
>>> Cc: David Gibson <david@gibson.dropbear.id.au>
>>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>>> Signed-off-by: Paul Mackerras <paulus@samba.org>
>> 
>> Only a few minor nits. Ben already commented on implementation details.
>> 
>>> 
>>> ---
>>> Changelog:
>>> 2013/06/05:
>>> * fixed mistype about IBMVIO in the commit message
>>> * updated doc and moved it to another section
>>> * changed capability number
>>> 
>>> 2013/05/21:
>>> * added kvm_vcpu_arch::tce_tmp
>>> * removed cleanup if put_indirect failed, instead we do not even start
>>> writing to TCE table if we cannot get TCEs from the user and they are
>>> invalid
>>> * kvmppc_emulated_h_put_tce is split to kvmppc_emulated_put_tce
>>> and kvmppc_emulated_validate_tce (for the previous item)
>>> * fixed bug with failthrough for H_IPI
>>> * removed all get_user() from real mode handlers
>>> * kvmppc_lookup_pte() added (instead of making lookup_linux_pte public)
>>> ---
>>> Documentation/virtual/kvm/api.txt       |   17 ++
>>> arch/powerpc/include/asm/kvm_host.h     |    2 +
>>> arch/powerpc/include/asm/kvm_ppc.h      |   16 +-
>>> arch/powerpc/kvm/book3s_64_vio.c        |  118 ++++++++++++++
>>> arch/powerpc/kvm/book3s_64_vio_hv.c     |  266 +++++++++++++++++++++++++++----
>>> arch/powerpc/kvm/book3s_hv.c            |   39 +++++
>>> arch/powerpc/kvm/book3s_hv_rmhandlers.S |    6 +
>>> arch/powerpc/kvm/book3s_pr_papr.c       |   37 ++++-
>>> arch/powerpc/kvm/powerpc.c              |    3 +
>>> include/uapi/linux/kvm.h                |    1 +
>>> 10 files changed, 473 insertions(+), 32 deletions(-)
>>> 
>>> diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
>>> index 5f91eda..6c082ff 100644
>>> --- a/Documentation/virtual/kvm/api.txt
>>> +++ b/Documentation/virtual/kvm/api.txt
>>> @@ -2362,6 +2362,23 @@ calls by the guest for that service will be passed to userspace to be
>>> handled.
>>> 
>>> 
>>> +4.83 KVM_CAP_PPC_MULTITCE
>>> +
>>> +Capability: KVM_CAP_PPC_MULTITCE
>>> +Architectures: ppc
>>> +Type: vm
>>> +
>>> +This capability tells the guest that multiple TCE entry add/remove hypercalls
>>> +handling is supported by the kernel. This significanly accelerates DMA
>>> +operations for PPC KVM guests.
>>> +
>>> +Unlike other capabilities in this section, this one does not have an ioctl.
>>> +Instead, when the capability is present, the H_PUT_TCE_INDIRECT and
>>> +H_STUFF_TCE hypercalls are to be handled in the host kernel and not passed to
>>> +the guest. Othwerwise it might be better for the guest to continue using H_PUT_TCE
>>> +hypercall (if KVM_CAP_SPAPR_TCE or KVM_CAP_SPAPR_TCE_IOMMU are present).
>> 
> 
>> While this describes perfectly well what the consequences are of the
>> patches, it does not describe properly what the CAP actually expresses.
>> The CAP only says "this kernel is able to handle H_PUT_TCE_INDIRECT and
>> H_STUFF_TCE hypercalls directly". All other consequences are nice to
>> document, but the semantics of the CAP are missing.
> 
> 
> ? It expresses ability to handle 2 hcalls. What is missing?

You don't describe the kvm <-> qemu interface. You describe some decisions qemu can take from this cap.

> 
> 
>> We also usually try to keep KVM behavior unchanged with regards to older
>> versions until a CAP is enabled. In this case I don't think it matters
>> all that much, so I'm fine with declaring it as enabled by default.
>> Please document that this is a change in behavior versus older KVM
>> versions though.
> 
> 
> Ok!
> 
> 
>>> +
>>> +
>>> 5. The kvm_run structure
>>> ------------------------
>>> 
>>> diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
>>> index af326cd..85d8f26 100644
>>> --- a/arch/powerpc/include/asm/kvm_host.h
>>> +++ b/arch/powerpc/include/asm/kvm_host.h
>>> @@ -609,6 +609,8 @@ struct kvm_vcpu_arch {
>>> 	spinlock_t tbacct_lock;
>>> 	u64 busy_stolen;
>>> 	u64 busy_preempt;
>>> +
>>> +	unsigned long *tce_tmp;    /* TCE cache for TCE_PUT_INDIRECT hall */
>>> #endif
>>> };
>> 
>> [...]
>>> 
>>> 
>> 
>> [...]
>> 
>>> diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
>>> index 550f592..a39039a 100644
>>> --- a/arch/powerpc/kvm/book3s_hv.c
>>> +++ b/arch/powerpc/kvm/book3s_hv.c
>>> @@ -568,6 +568,30 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
>>> 			ret = kvmppc_xics_hcall(vcpu, req);
>>> 			break;
>>> 		} /* fallthrough */
>> 
>> The fallthrough comment isn't accurate anymore.
>> 
>>> +		return RESUME_HOST;
>>> +	case H_PUT_TCE:
>>> +		ret = kvmppc_virtmode_h_put_tce(vcpu, kvmppc_get_gpr(vcpu, 4),
>>> +						kvmppc_get_gpr(vcpu, 5),
>>> +						kvmppc_get_gpr(vcpu, 6));
>>> +		if (ret == H_TOO_HARD)
>>> +			return RESUME_HOST;
>>> +		break;
>>> +	case H_PUT_TCE_INDIRECT:
>>> +		ret = kvmppc_virtmode_h_put_tce_indirect(vcpu, kvmppc_get_gpr(vcpu, 4),
>>> +						kvmppc_get_gpr(vcpu, 5),
>>> +						kvmppc_get_gpr(vcpu, 6),
>>> +						kvmppc_get_gpr(vcpu, 7));
>>> +		if (ret == H_TOO_HARD)
>>> +			return RESUME_HOST;
>>> +		break;
>>> +	case H_STUFF_TCE:
>>> +		ret = kvmppc_virtmode_h_stuff_tce(vcpu, kvmppc_get_gpr(vcpu, 4),
>>> +						kvmppc_get_gpr(vcpu, 5),
>>> +						kvmppc_get_gpr(vcpu, 6),
>>> +						kvmppc_get_gpr(vcpu, 7));
>>> +		if (ret == H_TOO_HARD)
>>> +			return RESUME_HOST;
>>> +		break;
>>> 	default:
>>> 		return RESUME_HOST;
>>> 	}
>>> @@ -958,6 +982,20 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
>>> 	vcpu->arch.cpu_type = KVM_CPU_3S_64;
>>> 	kvmppc_sanity_check(vcpu);
>>> 
>>> +	/*
>>> +	 * As we want to minimize the chance of having H_PUT_TCE_INDIRECT
>>> +	 * half executed, we first read TCEs from the user, check them and
>>> +	 * return error if something went wrong and only then put TCEs into
>>> +	 * the TCE table.
>>> +	 *
>>> +	 * tce_tmp is a cache for TCEs to avoid stack allocation or
>>> +	 * kmalloc as the whole TCE list can take up to 512 items 8 bytes
>>> +	 * each (4096 bytes).
>>> +	 */
>>> +	vcpu->arch.tce_tmp = kmalloc(4096, GFP_KERNEL);
>>> +	if (!vcpu->arch.tce_tmp)
>>> +		goto free_vcpu;
>>> +
>>> 	return vcpu;
>>> 
>>> free_vcpu:
>>> @@ -980,6 +1018,7 @@ void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu)
>>> 	unpin_vpa(vcpu->kvm, &vcpu->arch.slb_shadow);
>>> 	unpin_vpa(vcpu->kvm, &vcpu->arch.vpa);
>>> 	spin_unlock(&vcpu->arch.vpa_update_lock);
>>> +	kfree(vcpu->arch.tce_tmp);
>>> 	kvm_vcpu_uninit(vcpu);
>>> 	kmem_cache_free(kvm_vcpu_cache, vcpu);
>>> }
>>> diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
>>> index b02f91e..d35554e 100644
>>> --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
>>> +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
>>> @@ -1490,6 +1490,12 @@ hcall_real_table:
>>> 	.long	0		/* 0x11c */
>>> 	.long	0		/* 0x120 */
>>> 	.long	.kvmppc_h_bulk_remove - hcall_real_table
>>> +	.long	0		/* 0x128 */
>>> +	.long	0		/* 0x12c */
>>> +	.long	0		/* 0x130 */
>>> +	.long	0		/* 0x134 */
>>> +	.long	.kvmppc_h_stuff_tce - hcall_real_table
>>> +	.long	.kvmppc_h_put_tce_indirect - hcall_real_table
>>> hcall_real_table_end:
>>> 
>>> ignore_hdec:
>>> diff --git a/arch/powerpc/kvm/book3s_pr_papr.c b/arch/powerpc/kvm/book3s_pr_papr.c
>>> index da0e0bc..91d4b45 100644
>>> --- a/arch/powerpc/kvm/book3s_pr_papr.c
>>> +++ b/arch/powerpc/kvm/book3s_pr_papr.c
>>> @@ -220,7 +220,38 @@ static int kvmppc_h_pr_put_tce(struct kvm_vcpu *vcpu)
>>> 	unsigned long tce = kvmppc_get_gpr(vcpu, 6);
>>> 	long rc;
>>> 
>>> -	rc = kvmppc_h_put_tce(vcpu, liobn, ioba, tce);
>>> +	rc = kvmppc_virtmode_h_put_tce(vcpu, liobn, ioba, tce);
>>> +	if (rc == H_TOO_HARD)
>>> +		return EMULATE_FAIL;
>>> +	kvmppc_set_gpr(vcpu, 3, rc);
>>> +	return EMULATE_DONE;
>>> +}
>>> +
>>> +static int kvmppc_h_pr_put_tce_indirect(struct kvm_vcpu *vcpu)
>>> +{
>>> +	unsigned long liobn = kvmppc_get_gpr(vcpu, 4);
>>> +	unsigned long ioba = kvmppc_get_gpr(vcpu, 5);
>>> +	unsigned long tce = kvmppc_get_gpr(vcpu, 6);
>>> +	unsigned long npages = kvmppc_get_gpr(vcpu, 7);
>>> +	long rc;
>>> +
>>> +	rc = kvmppc_virtmode_h_put_tce_indirect(vcpu, liobn, ioba,
>>> +			tce, npages);
>>> +	if (rc == H_TOO_HARD)
>>> +		return EMULATE_FAIL;
>>> +	kvmppc_set_gpr(vcpu, 3, rc);
>>> +	return EMULATE_DONE;
>>> +}
>>> +
>>> +static int kvmppc_h_pr_stuff_tce(struct kvm_vcpu *vcpu)
>>> +{
>>> +	unsigned long liobn = kvmppc_get_gpr(vcpu, 4);
>>> +	unsigned long ioba = kvmppc_get_gpr(vcpu, 5);
>>> +	unsigned long tce_value = kvmppc_get_gpr(vcpu, 6);
>>> +	unsigned long npages = kvmppc_get_gpr(vcpu, 7);
>>> +	long rc;
>>> +
>>> +	rc = kvmppc_virtmode_h_stuff_tce(vcpu, liobn, ioba, tce_value, npages);
>>> 	if (rc == H_TOO_HARD)
>>> 		return EMULATE_FAIL;
>>> 	kvmppc_set_gpr(vcpu, 3, rc);
>>> @@ -247,6 +278,10 @@ int kvmppc_h_pr(struct kvm_vcpu *vcpu, unsigned long cmd)
>>> 		return kvmppc_h_pr_bulk_remove(vcpu);
>>> 	case H_PUT_TCE:
>>> 		return kvmppc_h_pr_put_tce(vcpu);
>>> +	case H_PUT_TCE_INDIRECT:
>>> +		return kvmppc_h_pr_put_tce_indirect(vcpu);
>>> +	case H_STUFF_TCE:
>>> +		return kvmppc_h_pr_stuff_tce(vcpu);
>>> 	case H_CEDE:
>>> 		vcpu->arch.shared->msr |= MSR_EE;
>>> 		kvm_vcpu_block(vcpu);
>>> diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
>>> index 6316ee3..8465c2a 100644
>>> --- a/arch/powerpc/kvm/powerpc.c
>>> +++ b/arch/powerpc/kvm/powerpc.c
>>> @@ -395,6 +395,9 @@ int kvm_dev_ioctl_check_extension(long ext)
>>> 		r = 1;
>>> 		break;
>>> #endif
>>> +	case KVM_CAP_SPAPR_MULTITCE:
>>> +		r = 1;
>> 
>> This should only be true for book3s.
> 
> 
> We had this discussion with v2.
> 
> David:
> ===
> So, in the case of MULTITCE, that's not quite right.  PR KVM can
> emulate a PAPR system on a BookE machine, and there's no reason not to
> allow TCE acceleration as well.  We can't make it dependent on PAPR
> mode being selected, because that's enabled per-vcpu, whereas these
> capabilities are queried on the VM before the vcpus are created.
> ===
> 
> Wrong?

Partially. BookE cannot emulate a PAPR system as it stands today.

The code should of course be generic and be available generically. But your code only patches the hypercalls' availability into the book3s hypercall handlers, so this specific kernel version can only handle these hypercalls on book3s.

Whether a later version of the kernel will be able to handle them is a different question.


Alex

--
To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alexey Kardashevskiy June 17, 2013, 8:34 a.m. UTC | #5
On 06/17/2013 06:02 PM, Alexander Graf wrote:
> 
> On 17.06.2013, at 09:55, Alexey Kardashevskiy wrote:
> 
>> On 06/17/2013 08:06 AM, Alexander Graf wrote:
>>>
>>> On 05.06.2013, at 08:11, Alexey Kardashevskiy wrote:
>>>
>>>> This adds real mode handlers for the H_PUT_TCE_INDIRECT and
>>>> H_STUFF_TCE hypercalls for QEMU emulated devices such as IBMVIO
>>>> devices or emulated PCI.  These calls allow adding multiple entries
>>>> (up to 512) into the TCE table in one call which saves time on
>>>> transition to/from real mode.
>>>>
>>>> This adds a tce_tmp cache to kvm_vcpu_arch to save valid TCEs
>>>> (copied from user and verified) before writing the whole list into
>>>> the TCE table. This cache will be utilized more in the upcoming
>>>> VFIO/IOMMU support to continue TCE list processing in the virtual
>>>> mode in the case if the real mode handler failed for some reason.
>>>>
>>>> This adds a guest physical to host real address converter
>>>> and calls the existing H_PUT_TCE handler. The converting function
>>>> is going to be fully utilized by upcoming VFIO supporting patches.
>>>>
>>>> This also implements the KVM_CAP_PPC_MULTITCE capability,
>>>> so in order to support the functionality of this patch, QEMU
>>>> needs to query for this capability and set the "hcall-multi-tce"
>>>> hypertas property only if the capability is present, otherwise
>>>> there will be serious performance degradation.
>>>>
>>>> Cc: David Gibson <david@gibson.dropbear.id.au>
>>>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>>>> Signed-off-by: Paul Mackerras <paulus@samba.org>
>>>
>>> Only a few minor nits. Ben already commented on implementation details.
>>>
>>>>
>>>> ---
>>>> Changelog:
>>>> 2013/06/05:
>>>> * fixed mistype about IBMVIO in the commit message
>>>> * updated doc and moved it to another section
>>>> * changed capability number
>>>>
>>>> 2013/05/21:
>>>> * added kvm_vcpu_arch::tce_tmp
>>>> * removed cleanup if put_indirect failed, instead we do not even start
>>>> writing to TCE table if we cannot get TCEs from the user and they are
>>>> invalid
>>>> * kvmppc_emulated_h_put_tce is split to kvmppc_emulated_put_tce
>>>> and kvmppc_emulated_validate_tce (for the previous item)
>>>> * fixed bug with failthrough for H_IPI
>>>> * removed all get_user() from real mode handlers
>>>> * kvmppc_lookup_pte() added (instead of making lookup_linux_pte public)
>>>> ---
>>>> Documentation/virtual/kvm/api.txt       |   17 ++
>>>> arch/powerpc/include/asm/kvm_host.h     |    2 +
>>>> arch/powerpc/include/asm/kvm_ppc.h      |   16 +-
>>>> arch/powerpc/kvm/book3s_64_vio.c        |  118 ++++++++++++++
>>>> arch/powerpc/kvm/book3s_64_vio_hv.c     |  266 +++++++++++++++++++++++++++----
>>>> arch/powerpc/kvm/book3s_hv.c            |   39 +++++
>>>> arch/powerpc/kvm/book3s_hv_rmhandlers.S |    6 +
>>>> arch/powerpc/kvm/book3s_pr_papr.c       |   37 ++++-
>>>> arch/powerpc/kvm/powerpc.c              |    3 +
>>>> include/uapi/linux/kvm.h                |    1 +
>>>> 10 files changed, 473 insertions(+), 32 deletions(-)
>>>>
>>>> diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
>>>> index 5f91eda..6c082ff 100644
>>>> --- a/Documentation/virtual/kvm/api.txt
>>>> +++ b/Documentation/virtual/kvm/api.txt
>>>> @@ -2362,6 +2362,23 @@ calls by the guest for that service will be passed to userspace to be
>>>> handled.
>>>>
>>>>
>>>> +4.83 KVM_CAP_PPC_MULTITCE
>>>> +
>>>> +Capability: KVM_CAP_PPC_MULTITCE
>>>> +Architectures: ppc
>>>> +Type: vm
>>>> +
>>>> +This capability tells the guest that multiple TCE entry add/remove hypercalls
>>>> +handling is supported by the kernel. This significantly accelerates DMA
>>>> +operations for PPC KVM guests.
>>>> +
>>>> +Unlike other capabilities in this section, this one does not have an ioctl.
>>>> +Instead, when the capability is present, the H_PUT_TCE_INDIRECT and
>>>> +H_STUFF_TCE hypercalls are to be handled in the host kernel and not passed to
>>>> +the guest. Otherwise it might be better for the guest to continue using H_PUT_TCE
>>>> +hypercall (if KVM_CAP_SPAPR_TCE or KVM_CAP_SPAPR_TCE_IOMMU are present).
>>>
>>
>>> While this describes perfectly well what the consequences are of the
>>> patches, it does not describe properly what the CAP actually expresses.
>>> The CAP only says "this kernel is able to handle H_PUT_TCE_INDIRECT and
>>> H_STUFF_TCE hypercalls directly". All other consequences are nice to
>>> document, but the semantics of the CAP are missing.
>>
>>
>> ? It expresses ability to handle 2 hcalls. What is missing?
> 
> You don't describe the kvm <-> qemu interface. You describe some decisions qemu can take from this cap.


This file does not mention qemu at all. And the interface is - qemu (or
kvmtool could do that) just adds "hcall-multi-tce" to
"ibm,hypertas-functions" but this is for pseries linux and AIX could always
do it (no idea about it). Does it really have to be in this file?



>>> We also usually try to keep KVM behavior unchanged with regards to older
>>> versions until a CAP is enabled. In this case I don't think it matters
>>> all that much, so I'm fine with declaring it as enabled by default.
>>> Please document that this is a change in behavior versus older KVM
>>> versions though.
>>
>>
>> Ok!
>>
>>
>>>> +
>>>> +
>>>> 5. The kvm_run structure
>>>> ------------------------
>>>>
>>>> diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
>>>> index af326cd..85d8f26 100644
>>>> --- a/arch/powerpc/include/asm/kvm_host.h
>>>> +++ b/arch/powerpc/include/asm/kvm_host.h
>>>> @@ -609,6 +609,8 @@ struct kvm_vcpu_arch {
>>>> 	spinlock_t tbacct_lock;
>>>> 	u64 busy_stolen;
>>>> 	u64 busy_preempt;
>>>> +
>>>> +	unsigned long *tce_tmp;    /* TCE cache for TCE_PUT_INDIRECT hcall */
>>>> #endif
>>>> };
>>>
>>> [...]
>>>>
>>>>
>>>
>>> [...]
>>>
>>>> diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
>>>> index 550f592..a39039a 100644
>>>> --- a/arch/powerpc/kvm/book3s_hv.c
>>>> +++ b/arch/powerpc/kvm/book3s_hv.c
>>>> @@ -568,6 +568,30 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
>>>> 			ret = kvmppc_xics_hcall(vcpu, req);
>>>> 			break;
>>>> 		} /* fallthrough */
>>>
>>> The fallthrough comment isn't accurate anymore.
>>>
>>>> +		return RESUME_HOST;
>>>> +	case H_PUT_TCE:
>>>> +		ret = kvmppc_virtmode_h_put_tce(vcpu, kvmppc_get_gpr(vcpu, 4),
>>>> +						kvmppc_get_gpr(vcpu, 5),
>>>> +						kvmppc_get_gpr(vcpu, 6));
>>>> +		if (ret == H_TOO_HARD)
>>>> +			return RESUME_HOST;
>>>> +		break;
>>>> +	case H_PUT_TCE_INDIRECT:
>>>> +		ret = kvmppc_virtmode_h_put_tce_indirect(vcpu, kvmppc_get_gpr(vcpu, 4),
>>>> +						kvmppc_get_gpr(vcpu, 5),
>>>> +						kvmppc_get_gpr(vcpu, 6),
>>>> +						kvmppc_get_gpr(vcpu, 7));
>>>> +		if (ret == H_TOO_HARD)
>>>> +			return RESUME_HOST;
>>>> +		break;
>>>> +	case H_STUFF_TCE:
>>>> +		ret = kvmppc_virtmode_h_stuff_tce(vcpu, kvmppc_get_gpr(vcpu, 4),
>>>> +						kvmppc_get_gpr(vcpu, 5),
>>>> +						kvmppc_get_gpr(vcpu, 6),
>>>> +						kvmppc_get_gpr(vcpu, 7));
>>>> +		if (ret == H_TOO_HARD)
>>>> +			return RESUME_HOST;
>>>> +		break;
>>>> 	default:
>>>> 		return RESUME_HOST;
>>>> 	}
>>>> @@ -958,6 +982,20 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
>>>> 	vcpu->arch.cpu_type = KVM_CPU_3S_64;
>>>> 	kvmppc_sanity_check(vcpu);
>>>>
>>>> +	/*
>>>> +	 * As we want to minimize the chance of having H_PUT_TCE_INDIRECT
>>>> +	 * half executed, we first read TCEs from the user, check them and
>>>> +	 * return error if something went wrong and only then put TCEs into
>>>> +	 * the TCE table.
>>>> +	 *
>>>> +	 * tce_tmp is a cache for TCEs to avoid stack allocation or
>>>> +	 * kmalloc as the whole TCE list can take up to 512 items 8 bytes
>>>> +	 * each (4096 bytes).
>>>> +	 */
>>>> +	vcpu->arch.tce_tmp = kmalloc(4096, GFP_KERNEL);
>>>> +	if (!vcpu->arch.tce_tmp)
>>>> +		goto free_vcpu;
>>>> +
>>>> 	return vcpu;
>>>>
>>>> free_vcpu:
>>>> @@ -980,6 +1018,7 @@ void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu)
>>>> 	unpin_vpa(vcpu->kvm, &vcpu->arch.slb_shadow);
>>>> 	unpin_vpa(vcpu->kvm, &vcpu->arch.vpa);
>>>> 	spin_unlock(&vcpu->arch.vpa_update_lock);
>>>> +	kfree(vcpu->arch.tce_tmp);
>>>> 	kvm_vcpu_uninit(vcpu);
>>>> 	kmem_cache_free(kvm_vcpu_cache, vcpu);
>>>> }
>>>> diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
>>>> index b02f91e..d35554e 100644
>>>> --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
>>>> +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
>>>> @@ -1490,6 +1490,12 @@ hcall_real_table:
>>>> 	.long	0		/* 0x11c */
>>>> 	.long	0		/* 0x120 */
>>>> 	.long	.kvmppc_h_bulk_remove - hcall_real_table
>>>> +	.long	0		/* 0x128 */
>>>> +	.long	0		/* 0x12c */
>>>> +	.long	0		/* 0x130 */
>>>> +	.long	0		/* 0x134 */
>>>> +	.long	.kvmppc_h_stuff_tce - hcall_real_table
>>>> +	.long	.kvmppc_h_put_tce_indirect - hcall_real_table
>>>> hcall_real_table_end:
>>>>
>>>> ignore_hdec:
>>>> diff --git a/arch/powerpc/kvm/book3s_pr_papr.c b/arch/powerpc/kvm/book3s_pr_papr.c
>>>> index da0e0bc..91d4b45 100644
>>>> --- a/arch/powerpc/kvm/book3s_pr_papr.c
>>>> +++ b/arch/powerpc/kvm/book3s_pr_papr.c
>>>> @@ -220,7 +220,38 @@ static int kvmppc_h_pr_put_tce(struct kvm_vcpu *vcpu)
>>>> 	unsigned long tce = kvmppc_get_gpr(vcpu, 6);
>>>> 	long rc;
>>>>
>>>> -	rc = kvmppc_h_put_tce(vcpu, liobn, ioba, tce);
>>>> +	rc = kvmppc_virtmode_h_put_tce(vcpu, liobn, ioba, tce);
>>>> +	if (rc == H_TOO_HARD)
>>>> +		return EMULATE_FAIL;
>>>> +	kvmppc_set_gpr(vcpu, 3, rc);
>>>> +	return EMULATE_DONE;
>>>> +}
>>>> +
>>>> +static int kvmppc_h_pr_put_tce_indirect(struct kvm_vcpu *vcpu)
>>>> +{
>>>> +	unsigned long liobn = kvmppc_get_gpr(vcpu, 4);
>>>> +	unsigned long ioba = kvmppc_get_gpr(vcpu, 5);
>>>> +	unsigned long tce = kvmppc_get_gpr(vcpu, 6);
>>>> +	unsigned long npages = kvmppc_get_gpr(vcpu, 7);
>>>> +	long rc;
>>>> +
>>>> +	rc = kvmppc_virtmode_h_put_tce_indirect(vcpu, liobn, ioba,
>>>> +			tce, npages);
>>>> +	if (rc == H_TOO_HARD)
>>>> +		return EMULATE_FAIL;
>>>> +	kvmppc_set_gpr(vcpu, 3, rc);
>>>> +	return EMULATE_DONE;
>>>> +}
>>>> +
>>>> +static int kvmppc_h_pr_stuff_tce(struct kvm_vcpu *vcpu)
>>>> +{
>>>> +	unsigned long liobn = kvmppc_get_gpr(vcpu, 4);
>>>> +	unsigned long ioba = kvmppc_get_gpr(vcpu, 5);
>>>> +	unsigned long tce_value = kvmppc_get_gpr(vcpu, 6);
>>>> +	unsigned long npages = kvmppc_get_gpr(vcpu, 7);
>>>> +	long rc;
>>>> +
>>>> +	rc = kvmppc_virtmode_h_stuff_tce(vcpu, liobn, ioba, tce_value, npages);
>>>> 	if (rc == H_TOO_HARD)
>>>> 		return EMULATE_FAIL;
>>>> 	kvmppc_set_gpr(vcpu, 3, rc);
>>>> @@ -247,6 +278,10 @@ int kvmppc_h_pr(struct kvm_vcpu *vcpu, unsigned long cmd)
>>>> 		return kvmppc_h_pr_bulk_remove(vcpu);
>>>> 	case H_PUT_TCE:
>>>> 		return kvmppc_h_pr_put_tce(vcpu);
>>>> +	case H_PUT_TCE_INDIRECT:
>>>> +		return kvmppc_h_pr_put_tce_indirect(vcpu);
>>>> +	case H_STUFF_TCE:
>>>> +		return kvmppc_h_pr_stuff_tce(vcpu);
>>>> 	case H_CEDE:
>>>> 		vcpu->arch.shared->msr |= MSR_EE;
>>>> 		kvm_vcpu_block(vcpu);
>>>> diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
>>>> index 6316ee3..8465c2a 100644
>>>> --- a/arch/powerpc/kvm/powerpc.c
>>>> +++ b/arch/powerpc/kvm/powerpc.c
>>>> @@ -395,6 +395,9 @@ int kvm_dev_ioctl_check_extension(long ext)
>>>> 		r = 1;
>>>> 		break;
>>>> #endif
>>>> +	case KVM_CAP_SPAPR_MULTITCE:
>>>> +		r = 1;
>>>
>>> This should only be true for book3s.
>>
>>
>> We had this discussion with v2.
>>
>> David:
>> ===
>> So, in the case of MULTITCE, that's not quite right.  PR KVM can
>> emulate a PAPR system on a BookE machine, and there's no reason not to
>> allow TCE acceleration as well.  We can't make it dependent on PAPR
>> mode being selected, because that's enabled per-vcpu, whereas these
>> capabilities are queried on the VM before the vcpus are created.
>> ===
>>
>> Wrong?

> Partially. BookE can not emulate a PAPR system as it stands today.

Oh.
Ok.
So - #ifdef CONFIG_PPC_BOOK3S_64 ? Or run-time check for book3s (how...)?
Benjamin Herrenschmidt June 17, 2013, 8:37 a.m. UTC | #6
On Mon, 2013-06-17 at 17:55 +1000, Alexey Kardashevskiy wrote:
> David:
> ===
> So, in the case of MULTITCE, that's not quite right.  PR KVM can
> emulate a PAPR system on a BookE machine, and there's no reason not to
> allow TCE acceleration as well.  We can't make it dependent on PAPR
> mode being selected, because that's enabled per-vcpu, whereas these
> capabilities are queried on the VM before the vcpus are created.
> ===
> 
> Wrong?

The capability just tells qemu the kernel supports it, it doesn't have
to depend on PAPR mode, qemu can sort things out no ?

Cheers,
Ben.


--
To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alexander Graf June 17, 2013, 8:40 a.m. UTC | #7
On 17.06.2013, at 10:34, Alexey Kardashevskiy wrote:

> On 06/17/2013 06:02 PM, Alexander Graf wrote:
>> 
>> On 17.06.2013, at 09:55, Alexey Kardashevskiy wrote:
>> 
>>> On 06/17/2013 08:06 AM, Alexander Graf wrote:
>>>> 
>>>> On 05.06.2013, at 08:11, Alexey Kardashevskiy wrote:
>>>> 
>>>>> This adds real mode handlers for the H_PUT_TCE_INDIRECT and
>>>>> H_STUFF_TCE hypercalls for QEMU emulated devices such as IBMVIO
>>>>> devices or emulated PCI.  These calls allow adding multiple entries
>>>>> (up to 512) into the TCE table in one call which saves time on
>>>>> transition to/from real mode.
>>>>> 
>>>>> This adds a tce_tmp cache to kvm_vcpu_arch to save valid TCEs
>>>>> (copied from user and verified) before writing the whole list into
>>>>> the TCE table. This cache will be utilized more in the upcoming
>>>>> VFIO/IOMMU support to continue TCE list processing in the virtual
>>>>> mode in the case if the real mode handler failed for some reason.
>>>>> 
>>>>> This adds a guest physical to host real address converter
>>>>> and calls the existing H_PUT_TCE handler. The converting function
>>>>> is going to be fully utilized by upcoming VFIO supporting patches.
>>>>> 
>>>>> This also implements the KVM_CAP_PPC_MULTITCE capability,
>>>>> so in order to support the functionality of this patch, QEMU
>>>>> needs to query for this capability and set the "hcall-multi-tce"
>>>>> hypertas property only if the capability is present, otherwise
>>>>> there will be serious performance degradation.
>>>>> 
>>>>> Cc: David Gibson <david@gibson.dropbear.id.au>
>>>>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>>>>> Signed-off-by: Paul Mackerras <paulus@samba.org>
>>>> 
>>>> Only a few minor nits. Ben already commented on implementation details.
>>>> 
>>>>> 
>>>>> ---
>>>>> Changelog:
>>>>> 2013/06/05:
>>>>> * fixed mistype about IBMVIO in the commit message
>>>>> * updated doc and moved it to another section
>>>>> * changed capability number
>>>>> 
>>>>> 2013/05/21:
>>>>> * added kvm_vcpu_arch::tce_tmp
>>>>> * removed cleanup if put_indirect failed, instead we do not even start
>>>>> writing to TCE table if we cannot get TCEs from the user and they are
>>>>> invalid
>>>>> * kvmppc_emulated_h_put_tce is split to kvmppc_emulated_put_tce
>>>>> and kvmppc_emulated_validate_tce (for the previous item)
>>>>> * fixed bug with fallthrough for H_IPI
>>>>> * removed all get_user() from real mode handlers
>>>>> * kvmppc_lookup_pte() added (instead of making lookup_linux_pte public)
>>>>> ---
>>>>> Documentation/virtual/kvm/api.txt       |   17 ++
>>>>> arch/powerpc/include/asm/kvm_host.h     |    2 +
>>>>> arch/powerpc/include/asm/kvm_ppc.h      |   16 +-
>>>>> arch/powerpc/kvm/book3s_64_vio.c        |  118 ++++++++++++++
>>>>> arch/powerpc/kvm/book3s_64_vio_hv.c     |  266 +++++++++++++++++++++++++++----
>>>>> arch/powerpc/kvm/book3s_hv.c            |   39 +++++
>>>>> arch/powerpc/kvm/book3s_hv_rmhandlers.S |    6 +
>>>>> arch/powerpc/kvm/book3s_pr_papr.c       |   37 ++++-
>>>>> arch/powerpc/kvm/powerpc.c              |    3 +
>>>>> include/uapi/linux/kvm.h                |    1 +
>>>>> 10 files changed, 473 insertions(+), 32 deletions(-)
>>>>> 
>>>>> diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
>>>>> index 5f91eda..6c082ff 100644
>>>>> --- a/Documentation/virtual/kvm/api.txt
>>>>> +++ b/Documentation/virtual/kvm/api.txt
>>>>> @@ -2362,6 +2362,23 @@ calls by the guest for that service will be passed to userspace to be
>>>>> handled.
>>>>> 
>>>>> 
>>>>> +4.83 KVM_CAP_PPC_MULTITCE
>>>>> +
>>>>> +Capability: KVM_CAP_PPC_MULTITCE
>>>>> +Architectures: ppc
>>>>> +Type: vm
>>>>> +
>>>>> +This capability tells the guest that multiple TCE entry add/remove hypercalls
>>>>> +handling is supported by the kernel. This significantly accelerates DMA
>>>>> +operations for PPC KVM guests.
>>>>> +
>>>>> +Unlike other capabilities in this section, this one does not have an ioctl.
>>>>> +Instead, when the capability is present, the H_PUT_TCE_INDIRECT and
>>>>> +H_STUFF_TCE hypercalls are to be handled in the host kernel and not passed to
>>>>> +the guest. Otherwise it might be better for the guest to continue using H_PUT_TCE
>>>>> +hypercall (if KVM_CAP_SPAPR_TCE or KVM_CAP_SPAPR_TCE_IOMMU are present).
>>>> 
>>> 
>>>> While this describes perfectly well what the consequences are of the
>>>> patches, it does not describe properly what the CAP actually expresses.
>>>> The CAP only says "this kernel is able to handle H_PUT_TCE_INDIRECT and
>>>> H_STUFF_TCE hypercalls directly". All other consequences are nice to
>>>> document, but the semantics of the CAP are missing.
>>> 
>>> 
>>> ? It expresses ability to handle 2 hcalls. What is missing?
>> 
>> You don't describe the kvm <-> qemu interface. You describe some decisions qemu can take from this cap.
> 
> 
> This file does not mention qemu at all. And the interface is - qemu (or
> kvmtool could do that) just adds "hcall-multi-tce" to
> "ibm,hypertas-functions" but this is for pseries linux and AIX could always
> do it (no idea about it). Does it really have to be in this file?

Ok, let's go back a step. What does this CAP describe? Don't look at the description you wrote above. Just write a new one. What exactly can user space expect when it finds this CAP?

> 
> 
> 
>>>> We also usually try to keep KVM behavior unchanged with regards to older
>>>> versions until a CAP is enabled. In this case I don't think it matters
>>>> all that much, so I'm fine with declaring it as enabled by default.
>>>> Please document that this is a change in behavior versus older KVM
>>>> versions though.
>>> 
>>> 
>>> Ok!
>>> 
>>> 
>>>>> +
>>>>> +
>>>>> 5. The kvm_run structure
>>>>> ------------------------
>>>>> 
>>>>> diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
>>>>> index af326cd..85d8f26 100644
>>>>> --- a/arch/powerpc/include/asm/kvm_host.h
>>>>> +++ b/arch/powerpc/include/asm/kvm_host.h
>>>>> @@ -609,6 +609,8 @@ struct kvm_vcpu_arch {
>>>>> 	spinlock_t tbacct_lock;
>>>>> 	u64 busy_stolen;
>>>>> 	u64 busy_preempt;
>>>>> +
>>>>> +	unsigned long *tce_tmp;    /* TCE cache for TCE_PUT_INDIRECT hcall */
>>>>> #endif
>>>>> };
>>>> 
>>>> [...]
>>>>> 
>>>>> 
>>>> 
>>>> [...]
>>>> 
>>>>> diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
>>>>> index 550f592..a39039a 100644
>>>>> --- a/arch/powerpc/kvm/book3s_hv.c
>>>>> +++ b/arch/powerpc/kvm/book3s_hv.c
>>>>> @@ -568,6 +568,30 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
>>>>> 			ret = kvmppc_xics_hcall(vcpu, req);
>>>>> 			break;
>>>>> 		} /* fallthrough */
>>>> 
>>>> The fallthrough comment isn't accurate anymore.
>>>> 
>>>>> +		return RESUME_HOST;
>>>>> +	case H_PUT_TCE:
>>>>> +		ret = kvmppc_virtmode_h_put_tce(vcpu, kvmppc_get_gpr(vcpu, 4),
>>>>> +						kvmppc_get_gpr(vcpu, 5),
>>>>> +						kvmppc_get_gpr(vcpu, 6));
>>>>> +		if (ret == H_TOO_HARD)
>>>>> +			return RESUME_HOST;
>>>>> +		break;
>>>>> +	case H_PUT_TCE_INDIRECT:
>>>>> +		ret = kvmppc_virtmode_h_put_tce_indirect(vcpu, kvmppc_get_gpr(vcpu, 4),
>>>>> +						kvmppc_get_gpr(vcpu, 5),
>>>>> +						kvmppc_get_gpr(vcpu, 6),
>>>>> +						kvmppc_get_gpr(vcpu, 7));
>>>>> +		if (ret == H_TOO_HARD)
>>>>> +			return RESUME_HOST;
>>>>> +		break;
>>>>> +	case H_STUFF_TCE:
>>>>> +		ret = kvmppc_virtmode_h_stuff_tce(vcpu, kvmppc_get_gpr(vcpu, 4),
>>>>> +						kvmppc_get_gpr(vcpu, 5),
>>>>> +						kvmppc_get_gpr(vcpu, 6),
>>>>> +						kvmppc_get_gpr(vcpu, 7));
>>>>> +		if (ret == H_TOO_HARD)
>>>>> +			return RESUME_HOST;
>>>>> +		break;
>>>>> 	default:
>>>>> 		return RESUME_HOST;
>>>>> 	}
>>>>> @@ -958,6 +982,20 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
>>>>> 	vcpu->arch.cpu_type = KVM_CPU_3S_64;
>>>>> 	kvmppc_sanity_check(vcpu);
>>>>> 
>>>>> +	/*
>>>>> +	 * As we want to minimize the chance of having H_PUT_TCE_INDIRECT
>>>>> +	 * half executed, we first read TCEs from the user, check them and
>>>>> +	 * return error if something went wrong and only then put TCEs into
>>>>> +	 * the TCE table.
>>>>> +	 *
>>>>> +	 * tce_tmp is a cache for TCEs to avoid stack allocation or
>>>>> +	 * kmalloc as the whole TCE list can take up to 512 items 8 bytes
>>>>> +	 * each (4096 bytes).
>>>>> +	 */
>>>>> +	vcpu->arch.tce_tmp = kmalloc(4096, GFP_KERNEL);
>>>>> +	if (!vcpu->arch.tce_tmp)
>>>>> +		goto free_vcpu;
>>>>> +
>>>>> 	return vcpu;
>>>>> 
>>>>> free_vcpu:
>>>>> @@ -980,6 +1018,7 @@ void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu)
>>>>> 	unpin_vpa(vcpu->kvm, &vcpu->arch.slb_shadow);
>>>>> 	unpin_vpa(vcpu->kvm, &vcpu->arch.vpa);
>>>>> 	spin_unlock(&vcpu->arch.vpa_update_lock);
>>>>> +	kfree(vcpu->arch.tce_tmp);
>>>>> 	kvm_vcpu_uninit(vcpu);
>>>>> 	kmem_cache_free(kvm_vcpu_cache, vcpu);
>>>>> }
>>>>> diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
>>>>> index b02f91e..d35554e 100644
>>>>> --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
>>>>> +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
>>>>> @@ -1490,6 +1490,12 @@ hcall_real_table:
>>>>> 	.long	0		/* 0x11c */
>>>>> 	.long	0		/* 0x120 */
>>>>> 	.long	.kvmppc_h_bulk_remove - hcall_real_table
>>>>> +	.long	0		/* 0x128 */
>>>>> +	.long	0		/* 0x12c */
>>>>> +	.long	0		/* 0x130 */
>>>>> +	.long	0		/* 0x134 */
>>>>> +	.long	.kvmppc_h_stuff_tce - hcall_real_table
>>>>> +	.long	.kvmppc_h_put_tce_indirect - hcall_real_table
>>>>> hcall_real_table_end:
>>>>> 
>>>>> ignore_hdec:
>>>>> diff --git a/arch/powerpc/kvm/book3s_pr_papr.c b/arch/powerpc/kvm/book3s_pr_papr.c
>>>>> index da0e0bc..91d4b45 100644
>>>>> --- a/arch/powerpc/kvm/book3s_pr_papr.c
>>>>> +++ b/arch/powerpc/kvm/book3s_pr_papr.c
>>>>> @@ -220,7 +220,38 @@ static int kvmppc_h_pr_put_tce(struct kvm_vcpu *vcpu)
>>>>> 	unsigned long tce = kvmppc_get_gpr(vcpu, 6);
>>>>> 	long rc;
>>>>> 
>>>>> -	rc = kvmppc_h_put_tce(vcpu, liobn, ioba, tce);
>>>>> +	rc = kvmppc_virtmode_h_put_tce(vcpu, liobn, ioba, tce);
>>>>> +	if (rc == H_TOO_HARD)
>>>>> +		return EMULATE_FAIL;
>>>>> +	kvmppc_set_gpr(vcpu, 3, rc);
>>>>> +	return EMULATE_DONE;
>>>>> +}
>>>>> +
>>>>> +static int kvmppc_h_pr_put_tce_indirect(struct kvm_vcpu *vcpu)
>>>>> +{
>>>>> +	unsigned long liobn = kvmppc_get_gpr(vcpu, 4);
>>>>> +	unsigned long ioba = kvmppc_get_gpr(vcpu, 5);
>>>>> +	unsigned long tce = kvmppc_get_gpr(vcpu, 6);
>>>>> +	unsigned long npages = kvmppc_get_gpr(vcpu, 7);
>>>>> +	long rc;
>>>>> +
>>>>> +	rc = kvmppc_virtmode_h_put_tce_indirect(vcpu, liobn, ioba,
>>>>> +			tce, npages);
>>>>> +	if (rc == H_TOO_HARD)
>>>>> +		return EMULATE_FAIL;
>>>>> +	kvmppc_set_gpr(vcpu, 3, rc);
>>>>> +	return EMULATE_DONE;
>>>>> +}
>>>>> +
>>>>> +static int kvmppc_h_pr_stuff_tce(struct kvm_vcpu *vcpu)
>>>>> +{
>>>>> +	unsigned long liobn = kvmppc_get_gpr(vcpu, 4);
>>>>> +	unsigned long ioba = kvmppc_get_gpr(vcpu, 5);
>>>>> +	unsigned long tce_value = kvmppc_get_gpr(vcpu, 6);
>>>>> +	unsigned long npages = kvmppc_get_gpr(vcpu, 7);
>>>>> +	long rc;
>>>>> +
>>>>> +	rc = kvmppc_virtmode_h_stuff_tce(vcpu, liobn, ioba, tce_value, npages);
>>>>> 	if (rc == H_TOO_HARD)
>>>>> 		return EMULATE_FAIL;
>>>>> 	kvmppc_set_gpr(vcpu, 3, rc);
>>>>> @@ -247,6 +278,10 @@ int kvmppc_h_pr(struct kvm_vcpu *vcpu, unsigned long cmd)
>>>>> 		return kvmppc_h_pr_bulk_remove(vcpu);
>>>>> 	case H_PUT_TCE:
>>>>> 		return kvmppc_h_pr_put_tce(vcpu);
>>>>> +	case H_PUT_TCE_INDIRECT:
>>>>> +		return kvmppc_h_pr_put_tce_indirect(vcpu);
>>>>> +	case H_STUFF_TCE:
>>>>> +		return kvmppc_h_pr_stuff_tce(vcpu);
>>>>> 	case H_CEDE:
>>>>> 		vcpu->arch.shared->msr |= MSR_EE;
>>>>> 		kvm_vcpu_block(vcpu);
>>>>> diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
>>>>> index 6316ee3..8465c2a 100644
>>>>> --- a/arch/powerpc/kvm/powerpc.c
>>>>> +++ b/arch/powerpc/kvm/powerpc.c
>>>>> @@ -395,6 +395,9 @@ int kvm_dev_ioctl_check_extension(long ext)
>>>>> 		r = 1;
>>>>> 		break;
>>>>> #endif
>>>>> +	case KVM_CAP_SPAPR_MULTITCE:
>>>>> +		r = 1;
>>>> 
>>>> This should only be true for book3s.
>>> 
>>> 
>>> We had this discussion with v2.
>>> 
>>> David:
>>> ===
>>> So, in the case of MULTITCE, that's not quite right.  PR KVM can
>>> emulate a PAPR system on a BookE machine, and there's no reason not to
>>> allow TCE acceleration as well.  We can't make it dependent on PAPR
>>> mode being selected, because that's enabled per-vcpu, whereas these
>>> capabilities are queried on the VM before the vcpus are created.
>>> ===
>>> 
>>> Wrong?
> 
>> Partially. BookE can not emulate a PAPR system as it stands today.
> 
> Oh.
> Ok.
> So - #ifdef CONFIG_PPC_BOOK3S_64 ? Or run-time check for book3s (how...)?

The former.


Alex

--
To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alexander Graf June 17, 2013, 8:42 a.m. UTC | #8
On 17.06.2013, at 10:37, Benjamin Herrenschmidt wrote:

> On Mon, 2013-06-17 at 17:55 +1000, Alexey Kardashevskiy wrote:
>> David:
>> ===
>> So, in the case of MULTITCE, that's not quite right.  PR KVM can
>> emulate a PAPR system on a BookE machine, and there's no reason not to
>> allow TCE acceleration as well.  We can't make it dependent on PAPR
>> mode being selected, because that's enabled per-vcpu, whereas these
>> capabilities are queried on the VM before the vcpus are created.
>> ===
>> 
>> Wrong?
> 
> The capability just tells qemu the kernel supports it, it doesn't have
> to depend on PAPR mode, qemu can sort things out no ?

Yes, this goes hand-in-hand with the documentation bit I'm trying to get through to Alexey atm. The CAP merely says that if in PAPR mode the kernel can handle hypercalls X and Y itself.

This is true for all book3s implementations as the patches stand. It is not true for BookE as the patches stand. Hence the CAP should be limited to book3s, regardless of its mode :).


Alex

--
To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alexey Kardashevskiy June 17, 2013, 8:51 a.m. UTC | #9
On 06/17/2013 06:40 PM, Alexander Graf wrote:
> 
> On 17.06.2013, at 10:34, Alexey Kardashevskiy wrote:
> 
>> On 06/17/2013 06:02 PM, Alexander Graf wrote:
>>> 
>>> On 17.06.2013, at 09:55, Alexey Kardashevskiy wrote:
>>> 
>>>> On 06/17/2013 08:06 AM, Alexander Graf wrote:
>>>>> 
>>>>> On 05.06.2013, at 08:11, Alexey Kardashevskiy wrote:
>>>>> 
>>>>>> This adds real mode handlers for the H_PUT_TCE_INDIRECT and 
>>>>>> H_STUFF_TCE hypercalls for QEMU emulated devices such as
>>>>>> IBMVIO devices or emulated PCI.  These calls allow adding
>>>>>> multiple entries (up to 512) into the TCE table in one call
>>>>>> which saves time on transition to/from real mode.
>>>>>> 
>>>>>> This adds a tce_tmp cache to kvm_vcpu_arch to save valid TCEs 
>>>>>> (copied from user and verified) before writing the whole list
>>>>>> into the TCE table. This cache will be utilized more in the
>>>>>> upcoming VFIO/IOMMU support to continue TCE list processing in
>>>>>> the virtual mode in the case if the real mode handler failed
>>>>>> for some reason.
>>>>>> 
>>>>>> This adds a guest physical to host real address converter and
>>>>>> calls the existing H_PUT_TCE handler. The converting function 
>>>>>> is going to be fully utilized by upcoming VFIO supporting
>>>>>> patches.
>>>>>> 
>>>>>> This also implements the KVM_CAP_PPC_MULTITCE capability, so
>>>>>> in order to support the functionality of this patch, QEMU 
>>>>>> needs to query for this capability and set the
>>>>>> "hcall-multi-tce" hypertas property only if the capability is
>>>>>> present, otherwise there will be serious performance
>>>>>> degradation.
>>>>>> 
>>>>>> Cc: David Gibson <david@gibson.dropbear.id.au> Signed-off-by:
>>>>>> Alexey Kardashevskiy <aik@ozlabs.ru> Signed-off-by: Paul
>>>>>> Mackerras <paulus@samba.org>
>>>>> 
>>>>> Only a few minor nits. Ben already commented on implementation
>>>>> details.
>>>>> 
>>>>>> 
>>>>>> --- Changelog: 2013/06/05: * fixed mistype about IBMVIO in the
>>>>>> commit message * updated doc and moved it to another section *
>>>>>> changed capability number
>>>>>> 
>>>>>> 2013/05/21: * added kvm_vcpu_arch::tce_tmp * removed cleanup
>>>>>> if put_indirect failed, instead we do not even start writing
>>>>>> to TCE table if we cannot get TCEs from the user and they are 
>>>>>> invalid * kvmppc_emulated_h_put_tce is split to
>>>>>> kvmppc_emulated_put_tce and kvmppc_emulated_validate_tce (for
>>>>>> the previous item) * fixed bug with fallthrough for H_IPI *
>>>>>> removed all get_user() from real mode handlers *
>>>>>> kvmppc_lookup_pte() added (instead of making lookup_linux_pte
>>>>>> public) --- Documentation/virtual/kvm/api.txt       |   17 ++ 
>>>>>> arch/powerpc/include/asm/kvm_host.h     |    2 + 
>>>>>> arch/powerpc/include/asm/kvm_ppc.h      |   16 +- 
>>>>>> arch/powerpc/kvm/book3s_64_vio.c        |  118 ++++++++++++++ 
>>>>>> arch/powerpc/kvm/book3s_64_vio_hv.c     |  266
>>>>>> +++++++++++++++++++++++++++---- arch/powerpc/kvm/book3s_hv.c
>>>>>> |   39 +++++ arch/powerpc/kvm/book3s_hv_rmhandlers.S |    6 + 
>>>>>> arch/powerpc/kvm/book3s_pr_papr.c       |   37 ++++- 
>>>>>> arch/powerpc/kvm/powerpc.c              |    3 + 
>>>>>> include/uapi/linux/kvm.h                |    1 + 10 files
>>>>>> changed, 473 insertions(+), 32 deletions(-)
>>>>>> 
>>>>>> diff --git a/Documentation/virtual/kvm/api.txt
>>>>>> b/Documentation/virtual/kvm/api.txt index 5f91eda..6c082ff
>>>>>> 100644 --- a/Documentation/virtual/kvm/api.txt +++
>>>>>> b/Documentation/virtual/kvm/api.txt @@ -2362,6 +2362,23 @@
>>>>>> calls by the guest for that service will be passed to
>>>>>> userspace to be handled.
>>>>>> 
>>>>>> 
>>>>>> +4.83 KVM_CAP_PPC_MULTITCE + +Capability:
>>>>>> KVM_CAP_PPC_MULTITCE +Architectures: ppc +Type: vm + +This
>>>>>> capability tells the guest that multiple TCE entry add/remove
>>>>>> hypercalls +handling is supported by the kernel. This
>>>>>> significantly accelerates DMA +operations for PPC KVM guests. 
>>>>>> + +Unlike other capabilities in this section, this one does
>>>>>> not have an ioctl. +Instead, when the capability is present,
>>>>>> the H_PUT_TCE_INDIRECT and +H_STUFF_TCE hypercalls are to be
>>>>>> handled in the host kernel and not passed to +the guest.
>>>>>> Otherwise it might be better for the guest to continue using
>>>>>> H_PUT_TCE +hypercall (if KVM_CAP_SPAPR_TCE or
>>>>>> KVM_CAP_SPAPR_TCE_IOMMU are present).
>>>>> 
>>>> 
>>>>> While this describes perfectly well what the consequences are of
>>>>> the patches, it does not describe properly what the CAP actually
>>>>> expresses. The CAP only says "this kernel is able to handle
>>>>> H_PUT_TCE_INDIRECT and H_STUFF_TCE hypercalls directly". All
>>>>> other consequences are nice to document, but the semantics of
>>>>> the CAP are missing.
>>>> 
>>>> 
>>>> ? It expresses ability to handle 2 hcalls. What is missing?
>>> 
>>> You don't describe the kvm <-> qemu interface. You describe some
>>> decisions qemu can take from this cap.
>> 
>> 
>> This file does not mention qemu at all. And the interface is - qemu
>> (or kvmtool could do that) just adds "hcall-multi-tce" to 
>> "ibm,hypertas-functions" but this is for pseries linux and AIX could
>> always do it (no idea about it). Does it really have to be in this
>> file?
> 

> Ok, let's go back a step. What does this CAP describe? Don't look at the
> description you wrote above. Just write a new one.

The CAP means the kernel is capable of handling hcalls A and B without
passing those into the user space. That accelerates DMA.


> What exactly can user space expect when it finds this CAP?

The user space can expect that its handlers for A and B are not going to be
called if it configures the guest appropriately.

Any better? :)
Alexander Graf June 17, 2013, 10:46 a.m. UTC | #10
On 06/17/2013 10:51 AM, Alexey Kardashevskiy wrote:
> On 06/17/2013 06:40 PM, Alexander Graf wrote:
>> On 17.06.2013, at 10:34, Alexey Kardashevskiy wrote:
>>
>>> On 06/17/2013 06:02 PM, Alexander Graf wrote:
>>>> On 17.06.2013, at 09:55, Alexey Kardashevskiy wrote:
>>>>
>>>>> On 06/17/2013 08:06 AM, Alexander Graf wrote:
>>>>>> On 05.06.2013, at 08:11, Alexey Kardashevskiy wrote:
>>>>>>
>>>>>>> This adds real mode handlers for the H_PUT_TCE_INDIRECT and
>>>>>>> H_STUFF_TCE hypercalls for QEMU emulated devices such as
>>>>>>> IBMVIO devices or emulated PCI.  These calls allow adding
>>>>>>> multiple entries (up to 512) into the TCE table in one call
>>>>>>> which saves time on transition to/from real mode.
>>>>>>>
>>>>>>> This adds a tce_tmp cache to kvm_vcpu_arch to save valid TCEs
>>>>>>> (copied from user and verified) before writing the whole list
>>>>>>> into the TCE table. This cache will be utilized more in the
>>>>>>> upcoming VFIO/IOMMU support to continue TCE list processing in
>>>>>>> the virtual mode in the case if the real mode handler failed
>>>>>>> for some reason.
>>>>>>>
>>>>>>> This adds a guest physical to host real address converter and
>>>>>>> calls the existing H_PUT_TCE handler. The converting function
>>>>>>> is going to be fully utilized by upcoming VFIO supporting
>>>>>>> patches.
>>>>>>>
>>>>>>> This also implements the KVM_CAP_PPC_MULTITCE capability, so
>>>>>>> in order to support the functionality of this patch, QEMU
>>>>>>> needs to query for this capability and set the
>>>>>>> "hcall-multi-tce" hypertas property only if the capability is
>>>>>>> present, otherwise there will be serious performance
>>>>>>> degradation.
>>>>>>>
>>>>>>> Cc: David Gibson<david@gibson.dropbear.id.au>  Signed-off-by:
>>>>>>> Alexey Kardashevskiy<aik@ozlabs.ru>  Signed-off-by: Paul
>>>>>>> Mackerras<paulus@samba.org>
>>>>>> Only a few minor nits. Ben already commented on implementation
>>>>>> details.
>>>>>>
>>>>>>> --- Changelog: 2013/06/05: * fixed mistype about IBMVIO in the
>>>>>>> commit message * updated doc and moved it to another section *
>>>>>>> changed capability number
>>>>>>>
>>>>>>> 2013/05/21: * added kvm_vcpu_arch::tce_tmp * removed cleanup
>>>>>>> if put_indirect failed, instead we do not even start writing
>>>>>>> to TCE table if we cannot get TCEs from the user and they are
>>>>>>> invalid * kvmppc_emulated_h_put_tce is split to
>>>>>>> kvmppc_emulated_put_tce and kvmppc_emulated_validate_tce (for
>>>>>>> the previous item) * fixed bug with fallthrough for H_IPI *
>>>>>>> removed all get_user() from real mode handlers *
>>>>>>> kvmppc_lookup_pte() added (instead of making lookup_linux_pte
>>>>>>> public) --- Documentation/virtual/kvm/api.txt       |   17 ++
>>>>>>> arch/powerpc/include/asm/kvm_host.h     |    2 +
>>>>>>> arch/powerpc/include/asm/kvm_ppc.h      |   16 +-
>>>>>>> arch/powerpc/kvm/book3s_64_vio.c        |  118 ++++++++++++++
>>>>>>> arch/powerpc/kvm/book3s_64_vio_hv.c     |  266
>>>>>>> +++++++++++++++++++++++++++---- arch/powerpc/kvm/book3s_hv.c
>>>>>>> |   39 +++++ arch/powerpc/kvm/book3s_hv_rmhandlers.S |    6 +
>>>>>>> arch/powerpc/kvm/book3s_pr_papr.c       |   37 ++++-
>>>>>>> arch/powerpc/kvm/powerpc.c              |    3 +
>>>>>>> include/uapi/linux/kvm.h                |    1 + 10 files
>>>>>>> changed, 473 insertions(+), 32 deletions(-)
>>>>>>>
>>>>>>> diff --git a/Documentation/virtual/kvm/api.txt
>>>>>>> b/Documentation/virtual/kvm/api.txt index 5f91eda..6c082ff
>>>>>>> 100644 --- a/Documentation/virtual/kvm/api.txt +++
>>>>>>> b/Documentation/virtual/kvm/api.txt @@ -2362,6 +2362,23 @@
>>>>>>> calls by the guest for that service will be passed to
>>>>>>> userspace to be handled.
>>>>>>>
>>>>>>>
>>>>>>> +4.83 KVM_CAP_PPC_MULTITCE + +Capability:
>>>>>>> KVM_CAP_PPC_MULTITCE +Architectures: ppc +Type: vm + +This
>>>>>>> capability tells the guest that multiple TCE entry add/remove
>>>>>>> hypercalls +handling is supported by the kernel. This
>>>>>>> significantly accelerates DMA +operations for PPC KVM guests.
>>>>>>> + +Unlike other capabilities in this section, this one does
>>>>>>> not have an ioctl. +Instead, when the capability is present,
>>>>>>> the H_PUT_TCE_INDIRECT and +H_STUFF_TCE hypercalls are to be
>>>>>>> handled in the host kernel and not passed to +the guest.
>>>>>>> Otherwise it might be better for the guest to continue using
>>>>>>> H_PUT_TCE +hypercall (if KVM_CAP_SPAPR_TCE or
>>>>>>> KVM_CAP_SPAPR_TCE_IOMMU are present).
>>>>>> While this describes perfectly well what the consequences are of
>>>>>> the patches, it does not describe properly what the CAP actually
>>>>>> expresses. The CAP only says "this kernel is able to handle
>>>>>> H_PUT_TCE_INDIRECT and H_STUFF_TCE hypercalls directly". All
>>>>>> other consequences are nice to document, but the semantics of
>>>>>> the CAP are missing.
>>>>>
>>>>> ? It expresses ability to handle 2 hcalls. What is missing?
>>>> You don't describe the kvm<->  qemu interface. You describe some
>>>> decisions qemu can take from this cap.
>>>
>>> This file does not mention qemu at all. And the interface is - qemu
>>> (or kvmtool could do that) just adds "hcall-multi-tce" to
>>> "ibm,hypertas-functions" but this is for pseries linux and AIX could
>>> always do it (no idea about it). Does it really have to be in this
>>> file?
>> Ok, let's go back a step. What does this CAP describe? Don't look at the
>> description you wrote above. Just write a new one.
> The CAP means the kernel is capable of handling hcalls A and B without
> passing those into the user space. That accelerates DMA.
>
>
>> What exactly can user space expect when it finds this CAP?
> The user space can expect that its handlers for A and B are not going to be
> called if it configures the guest appropriately.
>
> Any better? :)

A lot, yes. This is what the CAP actually means.

It's nice to give some guidance in the documentation of implications 
(should expose "ibm,hypertas-functions" to enable the guest to actually 
use these for example) but the first paragraph should only indicate what 
the CAP changes.


Alex

--
To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alexander Graf June 17, 2013, 10:48 a.m. UTC | #11
On 06/17/2013 12:46 PM, Alexander Graf wrote:
> On 06/17/2013 10:51 AM, Alexey Kardashevskiy wrote:
>> On 06/17/2013 06:40 PM, Alexander Graf wrote:
>>> On 17.06.2013, at 10:34, Alexey Kardashevskiy wrote:
>>>
>>>> On 06/17/2013 06:02 PM, Alexander Graf wrote:
>>>>> On 17.06.2013, at 09:55, Alexey Kardashevskiy wrote:
>>>>>
>>>>>> On 06/17/2013 08:06 AM, Alexander Graf wrote:
>>>>>>> On 05.06.2013, at 08:11, Alexey Kardashevskiy wrote:
>>>>>>>
>>>>>>>> This adds real mode handlers for the H_PUT_TCE_INDIRECT and
>>>>>>>> H_STUFF_TCE hypercalls for QEMU emulated devices such as
>>>>>>>> IBMVIO devices or emulated PCI.  These calls allow adding
>>>>>>>> multiple entries (up to 512) into the TCE table in one call
>>>>>>>> which saves time on transition to/from real mode.
>>>>>>>>
>>>>>>>> This adds a tce_tmp cache to kvm_vcpu_arch to save valid TCEs
>>>>>>>> (copied from user and verified) before writing the whole list
>>>>>>>> into the TCE table. This cache will be utilized more in the
>>>>>>>> upcoming VFIO/IOMMU support to continue TCE list processing in
>>>>>>>> the virtual mode in the case if the real mode handler failed
>>>>>>>> for some reason.
>>>>>>>>
>>>>>>>> This adds a guest physical to host real address converter and
>>>>>>>> calls the existing H_PUT_TCE handler. The converting function
>>>>>>>> is going to be fully utilized by upcoming VFIO supporting
>>>>>>>> patches.
>>>>>>>>
>>>>>>>> This also implements the KVM_CAP_PPC_MULTITCE capability, so
>>>>>>>> in order to support the functionality of this patch, QEMU
>>>>>>>> needs to query for this capability and set the
>>>>>>>> "hcall-multi-tce" hypertas property only if the capability is
>>>>>>>> present, otherwise there will be serious performance
>>>>>>>> degradation.
>>>>>>>>
>>>>>>>> Cc: David Gibson<david@gibson.dropbear.id.au>  Signed-off-by:
>>>>>>>> Alexey Kardashevskiy<aik@ozlabs.ru>  Signed-off-by: Paul
>>>>>>>> Mackerras<paulus@samba.org>
>>>>>>> Only a few minor nits. Ben already commented on implementation
>>>>>>> details.
>>>>>>>
>>>>>>>> --- Changelog: 2013/06/05: * fixed mistype about IBMVIO in the
>>>>>>>> commit message * updated doc and moved it to another section *
>>>>>>>> changed capability number
>>>>>>>>
>>>>>>>> 2013/05/21: * added kvm_vcpu_arch::tce_tmp * removed cleanup
>>>>>>>> if put_indirect failed, instead we do not even start writing
>>>>>>>> to TCE table if we cannot get TCEs from the user and they are
>>>>>>>> invalid * kvmppc_emulated_h_put_tce is split to
>>>>>>>> kvmppc_emulated_put_tce and kvmppc_emulated_validate_tce (for
>>>>>>>> the previous item) * fixed bug with failthrough for H_IPI *
>>>>>>>> removed all get_user() from real mode handlers *
>>>>>>>> kvmppc_lookup_pte() added (instead of making lookup_linux_pte
>>>>>>>> public) --- Documentation/virtual/kvm/api.txt       |   17 ++
>>>>>>>> arch/powerpc/include/asm/kvm_host.h     |    2 +
>>>>>>>> arch/powerpc/include/asm/kvm_ppc.h      |   16 +-
>>>>>>>> arch/powerpc/kvm/book3s_64_vio.c        |  118 ++++++++++++++
>>>>>>>> arch/powerpc/kvm/book3s_64_vio_hv.c     |  266
>>>>>>>> +++++++++++++++++++++++++++---- arch/powerpc/kvm/book3s_hv.c
>>>>>>>> |   39 +++++ arch/powerpc/kvm/book3s_hv_rmhandlers.S |    6 +
>>>>>>>> arch/powerpc/kvm/book3s_pr_papr.c       |   37 ++++-
>>>>>>>> arch/powerpc/kvm/powerpc.c              |    3 +
>>>>>>>> include/uapi/linux/kvm.h                |    1 + 10 files
>>>>>>>> changed, 473 insertions(+), 32 deletions(-)
>>>>>>>>
>>>>>>>> diff --git a/Documentation/virtual/kvm/api.txt
>>>>>>>> b/Documentation/virtual/kvm/api.txt index 5f91eda..6c082ff
>>>>>>>> 100644 --- a/Documentation/virtual/kvm/api.txt +++
>>>>>>>> b/Documentation/virtual/kvm/api.txt @@ -2362,6 +2362,23 @@
>>>>>>>> calls by the guest for that service will be passed to
>>>>>>>> userspace to be handled.
>>>>>>>>
>>>>>>>>
>>>>>>>> +4.83 KVM_CAP_PPC_MULTITCE + +Capability:
>>>>>>>> KVM_CAP_PPC_MULTITCE +Architectures: ppc +Type: vm + +This
>>>>>>>> capability tells the guest that multiple TCE entry add/remove
>>>>>>>> hypercalls +handling is supported by the kernel. This
>>>>>>>> significanly accelerates DMA +operations for PPC KVM guests.
>>>>>>>> + +Unlike other capabilities in this section, this one does
>>>>>>>> not have an ioctl. +Instead, when the capability is present,
>>>>>>>> the H_PUT_TCE_INDIRECT and +H_STUFF_TCE hypercalls are to be
>>>>>>>> handled in the host kernel and not passed to +the guest.
>>>>>>>> Othwerwise it might be better for the guest to continue using
>>>>>>>> H_PUT_TCE +hypercall (if KVM_CAP_SPAPR_TCE or
>>>>>>>> KVM_CAP_SPAPR_TCE_IOMMU are present).
>>>>>>> While this describes perfectly well what the consequences are of
>>>>>>> the patches, it does not describe properly what the CAP actually
>>>>>>> expresses. The CAP only says "this kernel is able to handle
>>>>>>> H_PUT_TCE_INDIRECT and H_STUFF_TCE hypercalls directly". All
>>>>>>> other consequences are nice to document, but the semantics of
>>>>>>> the CAP are missing.
>>>>>>
>>>>>> ? It expresses ability to handle 2 hcalls. What is missing?
>>>>> You don't describe the kvm<->  qemu interface. You describe some
>>>>> decisions qemu can take from this cap.
>>>>
>>>> This file does not mention qemu at all. And the interface is - qemu
>>>> (or kvmtool could do that) just adds "hcall-multi-tce" to
>>>> "ibm,hypertas-functions" but this is for pseries linux and AIX could
>>>> always do it (no idea about it). Does it really have to be in this
>>>> file?
>>> Ok, let's go back a step. What does this CAP describe? Don't look at 
>>> the
>>> description you wrote above. Just write a new one.
>> The CAP means the kernel is capable of handling hcalls A and B without
>> passing those into the user space. That accelerates DMA.
>>
>>
>>> What exactly can user space expect when it finds this CAP?
>> The user space can expect that its handlers for A and B are not going 
>> to be
>> called if it configures the guest appropriately.

Actually a nitpick here too. User space can expect that hcalls A and B are 
already going to be handled by KVM, regardless of how user space 
configures the guest.


Alex

--
To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index 5f91eda..6c082ff 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -2362,6 +2362,23 @@  calls by the guest for that service will be passed to userspace to be
 handled.
 
 
+4.83 KVM_CAP_SPAPR_MULTITCE
+
+Capability: KVM_CAP_SPAPR_MULTITCE
+Architectures: ppc
+Type: vm
+
+This capability means the kernel is capable of handling the H_PUT_TCE_INDIRECT
+and H_STUFF_TCE hypercalls without passing them to user space. This
+significantly accelerates DMA operations for PPC KVM guests.
+
+Unlike other capabilities in this section, this one does not have an ioctl.
+Instead, when the capability is present, the H_PUT_TCE_INDIRECT and
+H_STUFF_TCE hypercalls are to be handled in the host kernel and not passed to
+user space. Otherwise it might be better for the guest to continue using H_PUT_TCE
+hypercall (if KVM_CAP_SPAPR_TCE or KVM_CAP_SPAPR_TCE_IOMMU are present).
+
+
 5. The kvm_run structure
 ------------------------
 
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index af326cd..85d8f26 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -609,6 +609,8 @@  struct kvm_vcpu_arch {
 	spinlock_t tbacct_lock;
 	u64 busy_stolen;
 	u64 busy_preempt;
+
+	unsigned long *tce_tmp;    /* TCE cache for H_PUT_TCE_INDIRECT hcall */
 #endif
 };
 
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index a5287fe..e852921b 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -133,8 +133,20 @@  extern int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu);
 
 extern long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
 				struct kvm_create_spapr_tce *args);
-extern long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
-			     unsigned long ioba, unsigned long tce);
+extern struct kvmppc_spapr_tce_table *kvmppc_find_tce_table(
+		struct kvm_vcpu *vcpu, unsigned long liobn);
+extern long kvmppc_emulated_validate_tce(unsigned long tce);
+extern void kvmppc_emulated_put_tce(struct kvmppc_spapr_tce_table *tt,
+		unsigned long ioba, unsigned long tce);
+extern long kvmppc_virtmode_h_put_tce(struct kvm_vcpu *vcpu,
+		unsigned long liobn, unsigned long ioba,
+		unsigned long tce);
+extern long kvmppc_virtmode_h_put_tce_indirect(struct kvm_vcpu *vcpu,
+		unsigned long liobn, unsigned long ioba,
+		unsigned long tce_list, unsigned long npages);
+extern long kvmppc_virtmode_h_stuff_tce(struct kvm_vcpu *vcpu,
+		unsigned long liobn, unsigned long ioba,
+		unsigned long tce_value, unsigned long npages);
 extern long kvm_vm_ioctl_allocate_rma(struct kvm *kvm,
 				struct kvm_allocate_rma *rma);
 extern struct kvmppc_linear_info *kvm_alloc_rma(void);
diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c
index b2d3f3b..06b7b20 100644
--- a/arch/powerpc/kvm/book3s_64_vio.c
+++ b/arch/powerpc/kvm/book3s_64_vio.c
@@ -14,6 +14,7 @@ 
  *
  * Copyright 2010 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
  * Copyright 2011 David Gibson, IBM Corporation <dwg@au1.ibm.com>
+ * Copyright 2013 Alexey Kardashevskiy, IBM Corporation <aik@au1.ibm.com>
  */
 
 #include <linux/types.h>
@@ -36,8 +37,11 @@ 
 #include <asm/ppc-opcode.h>
 #include <asm/kvm_host.h>
 #include <asm/udbg.h>
+#include <asm/iommu.h>
+#include <asm/tce.h>
 
 #define TCES_PER_PAGE	(PAGE_SIZE / sizeof(u64))
+#define ERROR_ADDR      ((void *)~(unsigned long)0x0)
 
 static long kvmppc_stt_npages(unsigned long window_size)
 {
@@ -148,3 +152,117 @@  fail:
 	}
 	return ret;
 }
+
+/* Converts guest physical address into host virtual */
+static void __user *kvmppc_virtmode_gpa_to_hva(struct kvm_vcpu *vcpu,
+		unsigned long gpa)
+{
+	unsigned long hva, gfn = gpa >> PAGE_SHIFT;
+	struct kvm_memory_slot *memslot;
+
+	memslot = search_memslots(kvm_memslots(vcpu->kvm), gfn);
+	if (!memslot)
+		return ERROR_ADDR;
+
+	hva = __gfn_to_hva_memslot(memslot, gfn) + (gpa & ~PAGE_MASK);
+	return (void *) hva;
+}
+
+long kvmppc_virtmode_h_put_tce(struct kvm_vcpu *vcpu,
+		unsigned long liobn, unsigned long ioba,
+		unsigned long tce)
+{
+	long ret;
+	struct kvmppc_spapr_tce_table *tt;
+
+	tt = kvmppc_find_tce_table(vcpu, liobn);
+	/* Didn't find the liobn, punt it to userspace */
+	if (!tt)
+		return H_TOO_HARD;
+
+	/* Emulated IO */
+	if (ioba >= tt->window_size)
+		return H_PARAMETER;
+
+	ret = kvmppc_emulated_validate_tce(tce);
+	if (ret)
+		return ret;
+
+	kvmppc_emulated_put_tce(tt, ioba, tce);
+
+	return H_SUCCESS;
+}
+
+long kvmppc_virtmode_h_put_tce_indirect(struct kvm_vcpu *vcpu,
+		unsigned long liobn, unsigned long ioba,
+		unsigned long tce_list, unsigned long npages)
+{
+	struct kvmppc_spapr_tce_table *tt;
+	long i, ret;
+	unsigned long __user *tces;
+
+	tt = kvmppc_find_tce_table(vcpu, liobn);
+	/* Didn't find the liobn, punt it to userspace */
+	if (!tt)
+		return H_TOO_HARD;
+
+	/*
+	 * The spec says that the maximum size of the list is 512 TCEs,
+	 * so the whole table addressed resides in a 4K page
+	 */
+	if (npages > 512)
+		return H_PARAMETER;
+
+	if (tce_list & ~IOMMU_PAGE_MASK)
+		return H_PARAMETER;
+
+	tces = kvmppc_virtmode_gpa_to_hva(vcpu, tce_list);
+	if (tces == ERROR_ADDR)
+		return H_TOO_HARD;
+
+	/* Emulated IO */
+	if ((ioba + (npages << IOMMU_PAGE_SHIFT)) > tt->window_size)
+		return H_PARAMETER;
+
+	for (i = 0; i < npages; ++i) {
+		if (get_user(vcpu->arch.tce_tmp[i], tces + i))
+			return H_PARAMETER;
+
+		ret = kvmppc_emulated_validate_tce(vcpu->arch.tce_tmp[i]);
+		if (ret)
+			return ret;
+	}
+
+	for (i = 0; i < npages; ++i)
+		kvmppc_emulated_put_tce(tt,
+				ioba + (i << IOMMU_PAGE_SHIFT),
+				vcpu->arch.tce_tmp[i]);
+
+	return H_SUCCESS;
+}
+
+long kvmppc_virtmode_h_stuff_tce(struct kvm_vcpu *vcpu,
+		unsigned long liobn, unsigned long ioba,
+		unsigned long tce_value, unsigned long npages)
+{
+	struct kvmppc_spapr_tce_table *tt;
+	long i, ret;
+
+	tt = kvmppc_find_tce_table(vcpu, liobn);
+	/* Didn't find the liobn, punt it to userspace */
+	if (!tt)
+		return H_TOO_HARD;
+
+	/* Emulated IO */
+	if ((ioba + (npages << IOMMU_PAGE_SHIFT)) > tt->window_size)
+		return H_PARAMETER;
+
+	ret = kvmppc_emulated_validate_tce(tce_value);
+	if (ret || (tce_value & (TCE_PCI_WRITE | TCE_PCI_READ)))
+		return H_PARAMETER;
+
+	for (i = 0; i < npages; ++i, ioba += IOMMU_PAGE_SIZE)
+		kvmppc_emulated_put_tce(tt, ioba, tce_value);
+
+	return H_SUCCESS;
+}
diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c
index 30c2f3b..c68d538 100644
--- a/arch/powerpc/kvm/book3s_64_vio_hv.c
+++ b/arch/powerpc/kvm/book3s_64_vio_hv.c
@@ -14,6 +14,7 @@ 
  *
  * Copyright 2010 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
  * Copyright 2011 David Gibson, IBM Corporation <dwg@au1.ibm.com>
+ * Copyright 2013 Alexey Kardashevskiy, IBM Corporation <aik@au1.ibm.com>
  */
 
 #include <linux/types.h>
@@ -35,42 +36,249 @@ 
 #include <asm/ppc-opcode.h>
 #include <asm/kvm_host.h>
 #include <asm/udbg.h>
+#include <asm/iommu.h>
+#include <asm/tce.h>
 
 #define TCES_PER_PAGE	(PAGE_SIZE / sizeof(u64))
+#define ERROR_ADDR      (~(unsigned long)0x0)
 
-/* WARNING: This will be called in real-mode on HV KVM and virtual
- *          mode on PR KVM
+/* Finds a TCE table descriptor by LIOBN */
+struct kvmppc_spapr_tce_table *kvmppc_find_tce_table(struct kvm_vcpu *vcpu,
+		unsigned long liobn)
+{
+	struct kvmppc_spapr_tce_table *tt;
+
+	list_for_each_entry(tt, &vcpu->kvm->arch.spapr_tce_tables, list) {
+		if (tt->liobn == liobn)
+			return tt;
+	}
+
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(kvmppc_find_tce_table);
+
+/*
+ * Validate TCE address.
+ * At the moment only flags are validated
+ * as other checks would significantly slow down
+ * or can make it even impossible to handle TCE requests
+ * in real mode.
+ */
+long kvmppc_emulated_validate_tce(unsigned long tce)
+{
+	if (tce & ~(IOMMU_PAGE_MASK | TCE_PCI_WRITE | TCE_PCI_READ))
+		return H_PARAMETER;
+
+	return H_SUCCESS;
+}
+EXPORT_SYMBOL_GPL(kvmppc_emulated_validate_tce);
+
+/*
+ * kvmppc_emulated_put_tce() handles TCE requests for devices emulated
+ * by QEMU. It puts guest TCE values into the table and expects
+ * the QEMU to convert them later in the QEMU device implementation.
+ * Works in both real and virtual modes.
+ * It cannot fail so kvmppc_emulated_validate_tce must be called before it.
  */
+void kvmppc_emulated_put_tce(struct kvmppc_spapr_tce_table *tt,
+		unsigned long ioba, unsigned long tce)
+{
+	unsigned long idx = ioba >> SPAPR_TCE_SHIFT;
+	struct page *page;
+	u64 *tbl;
+
+	/*
+	 * Note on the use of page_address() in real mode,
+	 *
+	 * It is safe to use page_address() in real mode on ppc64 because
+	 * page_address() is always defined as lowmem_page_address()
+	 * which returns __va(PFN_PHYS(page_to_pfn(page))) which is an arithmetic
+	 * operation and does not access the page struct.
+	 *
+	 * Theoretically page_address() could be defined different
+	 * but either WANT_PAGE_VIRTUAL or HASHED_PAGE_VIRTUAL
+	 * should be enabled.
+	 * WANT_PAGE_VIRTUAL is never enabled on ppc32/ppc64,
+	 * HASHED_PAGE_VIRTUAL could be enabled for ppc32 only and only
+	 * if CONFIG_HIGHMEM is defined. As CONFIG_SPARSEMEM_VMEMMAP
+	 * is not expected to be enabled on ppc32, page_address()
+	 * is safe for ppc32 as well.
+	 */
+#if defined(HASHED_PAGE_VIRTUAL) || defined(WANT_PAGE_VIRTUAL)
+#error TODO: fix to avoid page_address() here
+#endif
+	page = tt->pages[idx / TCES_PER_PAGE];
+	tbl = (u64 *)page_address(page);
+
+	/* udbg_printf("tce @ %p\n", &tbl[idx % TCES_PER_PAGE]); */
+	tbl[idx % TCES_PER_PAGE] = tce;
+}
+EXPORT_SYMBOL_GPL(kvmppc_emulated_put_tce);
+
+#ifdef CONFIG_KVM_BOOK3S_64_HV
+
+static pte_t kvmppc_lookup_pte(pgd_t *pgdir, unsigned long hva, bool writing,
+			unsigned long *pte_sizep)
+{
+	pte_t *ptep;
+	unsigned int shift = 0;
+	pte_t pte, tmp, ret;
+
+	ptep = find_linux_pte_or_hugepte(pgdir, hva, &shift);
+	if (!ptep)
+		return __pte(0);
+	if (shift)
+		*pte_sizep = 1ul << shift;
+	else
+		*pte_sizep = PAGE_SIZE;
+
+	if (!pte_present(*ptep))
+		return __pte(0);
+
+	/* wait until _PAGE_BUSY is clear then set it atomically */
+	__asm__ __volatile__ (
+		"1:	ldarx	%0,0,%3\n"
+		"	andi.	%1,%0,%4\n"
+		"	bne-	1b\n"
+		"	ori	%1,%0,%4\n"
+		"	stdcx.	%1,0,%3\n"
+		"	bne-	1b"
+		: "=&r" (pte), "=&r" (tmp), "=m" (*ptep)
+		: "r" (ptep), "i" (_PAGE_BUSY)
+		: "cc");
+
+	ret = pte;
+
+	return ret;
+}
+
+/*
+ * Converts guest physical address into host physical address.
+ * Returns ERROR_ADDR if no memslot or no present PTE is found for the address.
+ */
+static unsigned long kvmppc_realmode_gpa_to_hpa(struct kvm_vcpu *vcpu,
+		unsigned long gpa)
+{
+	struct kvm_memory_slot *memslot;
+	pte_t pte;
+	unsigned long hva, hpa, pg_size = 0, offset;
+	unsigned long gfn = gpa >> PAGE_SHIFT;
+	bool writing = gpa & TCE_PCI_WRITE;
+
+	/* Find a KVM memslot */
+	memslot = search_memslots(kvm_memslots(vcpu->kvm), gfn);
+	if (!memslot)
+		return ERROR_ADDR;
+
+	/* Convert guest physical address to host virtual */
+	hva = __gfn_to_hva_memslot(memslot, gfn);
+
+	/* Find a PTE and determine the size */
+	pte = kvmppc_lookup_pte(vcpu->arch.pgdir, hva,
+			writing, &pg_size);
+	if (!pte)
+		return ERROR_ADDR;
+
+	/* Calculate host phys address keeping flags and offset in the page */
+	offset = gpa & (pg_size - 1);
+
+	/* pte_pfn(pte) should return an address aligned to pg_size */
+	hpa = (pte_pfn(pte) << PAGE_SHIFT) + offset;
+
+	return hpa;
+}
+
 long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
 		      unsigned long ioba, unsigned long tce)
 {
-	struct kvm *kvm = vcpu->kvm;
-	struct kvmppc_spapr_tce_table *stt;
-
-	/* udbg_printf("H_PUT_TCE(): liobn=0x%lx ioba=0x%lx, tce=0x%lx\n", */
-	/* 	    liobn, ioba, tce); */
-
-	list_for_each_entry(stt, &kvm->arch.spapr_tce_tables, list) {
-		if (stt->liobn == liobn) {
-			unsigned long idx = ioba >> SPAPR_TCE_SHIFT;
-			struct page *page;
-			u64 *tbl;
-
-			/* udbg_printf("H_PUT_TCE: liobn 0x%lx => stt=%p  window_size=0x%x\n", */
-			/* 	    liobn, stt, stt->window_size); */
-			if (ioba >= stt->window_size)
-				return H_PARAMETER;
-
-			page = stt->pages[idx / TCES_PER_PAGE];
-			tbl = (u64 *)page_address(page);
-
-			/* FIXME: Need to validate the TCE itself */
-			/* udbg_printf("tce @ %p\n", &tbl[idx % TCES_PER_PAGE]); */
-			tbl[idx % TCES_PER_PAGE] = tce;
-			return H_SUCCESS;
-		}
+	long ret;
+	struct kvmppc_spapr_tce_table *tt;
+
+	tt = kvmppc_find_tce_table(vcpu, liobn);
+	/* Didn't find the liobn, punt it to virtual mode */
+	if (!tt)
+		return H_TOO_HARD;
+
+	/* Emulated IO */
+	if (ioba >= tt->window_size)
+		return H_PARAMETER;
+
+	ret = kvmppc_emulated_validate_tce(tce);
+	if (ret)
+		return ret;
+
+	kvmppc_emulated_put_tce(tt, ioba, tce);
+
+	return H_SUCCESS;
+}
+
+long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu,
+		unsigned long liobn, unsigned long ioba,
+		unsigned long tce_list,	unsigned long npages)
+{
+	struct kvmppc_spapr_tce_table *tt;
+	long i, ret;
+	unsigned long *tces;
+
+	tt = kvmppc_find_tce_table(vcpu, liobn);
+	/* Didn't find the liobn, punt it to virtual mode */
+	if (!tt)
+		return H_TOO_HARD;
+
+	/*
+	 * The spec says that the maximum size of the list is 512 TCEs,
+	 * so the whole table addressed resides in a 4K page
+	 */
+	if (npages > 512)
+		return H_PARAMETER;
+
+	if (tce_list & ~IOMMU_PAGE_MASK)
+		return H_PARAMETER;
+
+	tces = (unsigned long *) kvmppc_realmode_gpa_to_hpa(vcpu, tce_list);
+	if ((unsigned long)tces == ERROR_ADDR)
+		return H_TOO_HARD;
+
+	/* Emulated IO */
+	if ((ioba + (npages << IOMMU_PAGE_SHIFT)) > tt->window_size)
+		return H_PARAMETER;
+
+	for (i = 0; i < npages; ++i) {
+		ret = kvmppc_emulated_validate_tce(tces[i]);
+		if (ret)
+			return ret;
 	}
 
-	/* Didn't find the liobn, punt it to userspace */
-	return H_TOO_HARD;
+	for (i = 0; i < npages; ++i)
+		kvmppc_emulated_put_tce(tt, ioba + (i << IOMMU_PAGE_SHIFT),
+				tces[i]);
+
+	return H_SUCCESS;
+}
+
+long kvmppc_h_stuff_tce(struct kvm_vcpu *vcpu,
+		unsigned long liobn, unsigned long ioba,
+		unsigned long tce_value, unsigned long npages)
+{
+	struct kvmppc_spapr_tce_table *tt;
+	long i, ret;
+
+	tt = kvmppc_find_tce_table(vcpu, liobn);
+	/* Didn't find the liobn, punt it to virtual mode */
+	if (!tt)
+		return H_TOO_HARD;
+
+	/* Emulated IO */
+	if ((ioba + (npages << IOMMU_PAGE_SHIFT)) > tt->window_size)
+		return H_PARAMETER;
+
+	ret = kvmppc_emulated_validate_tce(tce_value);
+	if (ret || (tce_value & (TCE_PCI_WRITE | TCE_PCI_READ)))
+		return H_PARAMETER;
+
+	for (i = 0; i < npages; ++i, ioba += IOMMU_PAGE_SIZE)
+		kvmppc_emulated_put_tce(tt, ioba, tce_value);
+
+	return H_SUCCESS;
 }
+#endif /* CONFIG_KVM_BOOK3S_64_HV */
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 550f592..a39039a 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -568,6 +568,30 @@  int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
 			ret = kvmppc_xics_hcall(vcpu, req);
 			break;
 		} /* fallthrough */
+		return RESUME_HOST;
+	case H_PUT_TCE:
+		ret = kvmppc_virtmode_h_put_tce(vcpu, kvmppc_get_gpr(vcpu, 4),
+						kvmppc_get_gpr(vcpu, 5),
+						kvmppc_get_gpr(vcpu, 6));
+		if (ret == H_TOO_HARD)
+			return RESUME_HOST;
+		break;
+	case H_PUT_TCE_INDIRECT:
+		ret = kvmppc_virtmode_h_put_tce_indirect(vcpu, kvmppc_get_gpr(vcpu, 4),
+						kvmppc_get_gpr(vcpu, 5),
+						kvmppc_get_gpr(vcpu, 6),
+						kvmppc_get_gpr(vcpu, 7));
+		if (ret == H_TOO_HARD)
+			return RESUME_HOST;
+		break;
+	case H_STUFF_TCE:
+		ret = kvmppc_virtmode_h_stuff_tce(vcpu, kvmppc_get_gpr(vcpu, 4),
+						kvmppc_get_gpr(vcpu, 5),
+						kvmppc_get_gpr(vcpu, 6),
+						kvmppc_get_gpr(vcpu, 7));
+		if (ret == H_TOO_HARD)
+			return RESUME_HOST;
+		break;
 	default:
 		return RESUME_HOST;
 	}
@@ -958,6 +982,20 @@  struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
 	vcpu->arch.cpu_type = KVM_CPU_3S_64;
 	kvmppc_sanity_check(vcpu);
 
+	/*
+	 * As we want to minimize the chance of having H_PUT_TCE_INDIRECT
+	 * half executed, we first read TCEs from the user, check them and
+	 * return error if something went wrong and only then put TCEs into
+	 * the TCE table.
+	 *
+	 * tce_tmp is a cache for TCEs to avoid stack allocation or
+	 * kmalloc as the whole TCE list can take up to 512 items 8 bytes
+	 * each (4096 bytes).
+	 */
+	vcpu->arch.tce_tmp = kmalloc(4096, GFP_KERNEL);
+	if (!vcpu->arch.tce_tmp)
+		goto free_vcpu;
+
 	return vcpu;
 
 free_vcpu:
@@ -980,6 +1018,7 @@  void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu)
 	unpin_vpa(vcpu->kvm, &vcpu->arch.slb_shadow);
 	unpin_vpa(vcpu->kvm, &vcpu->arch.vpa);
 	spin_unlock(&vcpu->arch.vpa_update_lock);
+	kfree(vcpu->arch.tce_tmp);
 	kvm_vcpu_uninit(vcpu);
 	kmem_cache_free(kvm_vcpu_cache, vcpu);
 }
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index b02f91e..d35554e 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -1490,6 +1490,12 @@  hcall_real_table:
 	.long	0		/* 0x11c */
 	.long	0		/* 0x120 */
 	.long	.kvmppc_h_bulk_remove - hcall_real_table
+	.long	0		/* 0x128 */
+	.long	0		/* 0x12c */
+	.long	0		/* 0x130 */
+	.long	0		/* 0x134 */
+	.long	.kvmppc_h_stuff_tce - hcall_real_table
+	.long	.kvmppc_h_put_tce_indirect - hcall_real_table
 hcall_real_table_end:
 
 ignore_hdec:
diff --git a/arch/powerpc/kvm/book3s_pr_papr.c b/arch/powerpc/kvm/book3s_pr_papr.c
index da0e0bc..91d4b45 100644
--- a/arch/powerpc/kvm/book3s_pr_papr.c
+++ b/arch/powerpc/kvm/book3s_pr_papr.c
@@ -220,7 +220,38 @@  static int kvmppc_h_pr_put_tce(struct kvm_vcpu *vcpu)
 	unsigned long tce = kvmppc_get_gpr(vcpu, 6);
 	long rc;
 
-	rc = kvmppc_h_put_tce(vcpu, liobn, ioba, tce);
+	rc = kvmppc_virtmode_h_put_tce(vcpu, liobn, ioba, tce);
+	if (rc == H_TOO_HARD)
+		return EMULATE_FAIL;
+	kvmppc_set_gpr(vcpu, 3, rc);
+	return EMULATE_DONE;
+}
+
+static int kvmppc_h_pr_put_tce_indirect(struct kvm_vcpu *vcpu)
+{
+	unsigned long liobn = kvmppc_get_gpr(vcpu, 4);
+	unsigned long ioba = kvmppc_get_gpr(vcpu, 5);
+	unsigned long tce = kvmppc_get_gpr(vcpu, 6);
+	unsigned long npages = kvmppc_get_gpr(vcpu, 7);
+	long rc;
+
+	rc = kvmppc_virtmode_h_put_tce_indirect(vcpu, liobn, ioba,
+			tce, npages);
+	if (rc == H_TOO_HARD)
+		return EMULATE_FAIL;
+	kvmppc_set_gpr(vcpu, 3, rc);
+	return EMULATE_DONE;
+}
+
+static int kvmppc_h_pr_stuff_tce(struct kvm_vcpu *vcpu)
+{
+	unsigned long liobn = kvmppc_get_gpr(vcpu, 4);
+	unsigned long ioba = kvmppc_get_gpr(vcpu, 5);
+	unsigned long tce_value = kvmppc_get_gpr(vcpu, 6);
+	unsigned long npages = kvmppc_get_gpr(vcpu, 7);
+	long rc;
+
+	rc = kvmppc_virtmode_h_stuff_tce(vcpu, liobn, ioba, tce_value, npages);
 	if (rc == H_TOO_HARD)
 		return EMULATE_FAIL;
 	kvmppc_set_gpr(vcpu, 3, rc);
@@ -247,6 +278,10 @@  int kvmppc_h_pr(struct kvm_vcpu *vcpu, unsigned long cmd)
 		return kvmppc_h_pr_bulk_remove(vcpu);
 	case H_PUT_TCE:
 		return kvmppc_h_pr_put_tce(vcpu);
+	case H_PUT_TCE_INDIRECT:
+		return kvmppc_h_pr_put_tce_indirect(vcpu);
+	case H_STUFF_TCE:
+		return kvmppc_h_pr_stuff_tce(vcpu);
 	case H_CEDE:
 		vcpu->arch.shared->msr |= MSR_EE;
 		kvm_vcpu_block(vcpu);
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 6316ee3..8465c2a 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -395,6 +395,9 @@  int kvm_dev_ioctl_check_extension(long ext)
 		r = 1;
 		break;
 #endif
+	case KVM_CAP_SPAPR_MULTITCE:
+		r = 1;
+		break;
 	default:
 		r = 0;
 		break;
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index a5c86fc..fc0d6b9 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -666,6 +666,7 @@  struct kvm_ppc_smmu_info {
 #define KVM_CAP_IRQ_MPIC 90
 #define KVM_CAP_PPC_RTAS 91
 #define KVM_CAP_IRQ_XICS 92
+#define KVM_CAP_SPAPR_MULTITCE 93
 
 #ifdef KVM_CAP_IRQ_ROUTING