
[kernel,v3] KVM: PPC: Allocate guest TCEs on demand too

Message ID 20190301043436.90014-1-aik@ozlabs.ru
State Superseded
Series [kernel,v3] KVM: PPC: Allocate guest TCEs on demand too

Commit Message

Alexey Kardashevskiy March 1, 2019, 4:34 a.m. UTC
We already allocate hardware TCE tables in multiple levels and skip
intermediate levels when we can; now it is the turn of the KVM TCE tables.
Thankfully these are already allocated in 2 levels.

This moves the table's last-level allocation from the table creation helper
to kvmppc_tce_put() and kvm_spapr_tce_fault().

This adds kvmppc_rm_ioba_validate() to do an additional check of whether
the subsequent kvmppc_tce_put() needs a page which has not been allocated
yet; if it does, we bail out to the virtual mode handlers.
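
A rough illustration of that check follows (a standalone user-space sketch
only, not the kernel code: pages[] and can_put_in_real_mode() are made-up
stand-ins for stt->pages[] and kvmppc_rm_ioba_validate(), whose real
implementation is in the diff below):

/* Sketch: does a TCE update touch a backing page that is still missing? */
#include <stdbool.h>
#include <stdio.h>

#define TCES_PER_PAGE	512	/* one 4K page holds 512 64-bit TCEs */

static void *pages[16];		/* stand-in for stt->pages[]; NULL = not allocated */

/*
 * idx is the first TCE index relative to the window, i.e.
 * (ioba >> page_shift) - offset in the kernel code.
 */
static bool can_put_in_real_mode(unsigned long idx, unsigned long npages)
{
	unsigned long first = idx / TCES_PER_PAGE;
	/* same rounding as _ALIGN_UP(idx % TCES_PER_PAGE + npages, TCES_PER_PAGE) */
	unsigned long count = (idx % TCES_PER_PAGE + npages + TCES_PER_PAGE - 1) /
			TCES_PER_PAGE;
	unsigned long i;

	for (i = first; i < first + count; i++)
		if (!pages[i])
			return false;	/* would need alloc_page() => H_TOO_HARD */
	return true;
}

int main(void)
{
	pages[0] = &pages;	/* pretend the first backing page is allocated */
	/* 4 TCEs starting at index 510 span backing pages 0 and 1 */
	printf("%s\n", can_put_in_real_mode(510, 4) ? "H_SUCCESS" : "H_TOO_HARD");
	return 0;
}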

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---
Changes:
v3:
* fixed alignments in kvmppc_rm_ioba_validate

v2:
* added kvm mutex around alloc_page to prevent races; in both places we
test the pointer and, if it is NULL, take the lock and check again, so on
the fast path we do not take the lock at all (sketched below)
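
A minimal user-space analogue of that check/lock/re-check pattern
(illustration only: a pthread mutex stands in for kvm->lock, calloc() for
alloc_page(), and get_page_lazy() is a made-up name):

#include <pthread.h>
#include <stdlib.h>

#define FAKE_PAGE_SIZE	4096

static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;
static void *pages[16];			/* NULL until first use */

/*
 * Return the backing "page", allocating it on first use; the fast path
 * (already allocated) takes no lock at all.
 */
static void *get_page_lazy(unsigned long i)
{
	void *page = pages[i];

	if (!page) {
		pthread_mutex_lock(&table_lock);
		page = pages[i];		/* re-check under the lock */
		if (!page) {
			page = calloc(1, FAKE_PAGE_SIZE);
			pages[i] = page;	/* stays NULL on allocation failure */
		}
		pthread_mutex_unlock(&table_lock);
	}
	return page;				/* NULL means out of memory */
}

int main(void)
{
	return get_page_lazy(3) ? 0 : 1;
}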


---
For NVLink2 passthrough guests with 128TiB DMA windows and very fragmented
system RAM, the difference is gigabytes of RAM.
---
 arch/powerpc/kvm/book3s_64_vio.c    | 29 ++++++------
 arch/powerpc/kvm/book3s_64_vio_hv.c | 71 ++++++++++++++++++++++++++---
 2 files changed, 81 insertions(+), 19 deletions(-)

Comments

David Gibson March 4, 2019, 12:54 a.m. UTC | #1
On Fri, Mar 01, 2019 at 03:34:36PM +1100, Alexey Kardashevskiy wrote:
> We already allocate hardware TCE tables in multiple levels and skip
> intermediate levels when we can; now it is the turn of the KVM TCE tables.
> Thankfully these are already allocated in 2 levels.
> 
> This moves the table's last-level allocation from the table creation helper
> to kvmppc_tce_put() and kvm_spapr_tce_fault().
> 
> This adds kvmppc_rm_ioba_validate() to do an additional check of whether
> the subsequent kvmppc_tce_put() needs a page which has not been allocated
> yet; if it does, we bail out to the virtual mode handlers.
> 
> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>

Reviewed-by: David Gibson <david@gibson.dropbear.id.au>

> ---
> Changes:
> v3:
> * fixed alignments in kvmppc_rm_ioba_validate
> 
> v2:
> * added kvm mutex around alloc_page to prevent races; in both places we
> test the pointer and, if it is NULL, take the lock and check again, so on
> the fast path we do not take the lock at all
> 
> 
> ---
> For NVLink2 passthrough guests with 128TiB DMA windows and very fragmented
> system RAM, the difference is gigabytes of RAM.
> ---
>  arch/powerpc/kvm/book3s_64_vio.c    | 29 ++++++------
>  arch/powerpc/kvm/book3s_64_vio_hv.c | 71 ++++++++++++++++++++++++++---
>  2 files changed, 81 insertions(+), 19 deletions(-)
> 
> diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c
> index f02b04973710..7eed8c90ea3d 100644
> --- a/arch/powerpc/kvm/book3s_64_vio.c
> +++ b/arch/powerpc/kvm/book3s_64_vio.c
> @@ -228,7 +228,8 @@ static void release_spapr_tce_table(struct rcu_head *head)
>  	unsigned long i, npages = kvmppc_tce_pages(stt->size);
>  
>  	for (i = 0; i < npages; i++)
> -		__free_page(stt->pages[i]);
> +		if (stt->pages[i])
> +			__free_page(stt->pages[i]);
>  
>  	kfree(stt);
>  }
> @@ -242,6 +243,20 @@ static vm_fault_t kvm_spapr_tce_fault(struct vm_fault *vmf)
>  		return VM_FAULT_SIGBUS;
>  
>  	page = stt->pages[vmf->pgoff];
> +	if (!page) {
> +		mutex_lock(&stt->kvm->lock);
> +		page = stt->pages[vmf->pgoff];
> +		if (!page) {
> +			page  = alloc_page(GFP_KERNEL | __GFP_ZERO);
> +			if (!page) {
> +				mutex_unlock(&stt->kvm->lock);
> +				return VM_FAULT_OOM;
> +			}
> +			stt->pages[vmf->pgoff] = page;
> +		}
> +		mutex_unlock(&stt->kvm->lock);
> +	}
> +
>  	get_page(page);
>  	vmf->page = page;
>  	return 0;
> @@ -296,7 +311,6 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
>  	struct kvmppc_spapr_tce_table *siter;
>  	unsigned long npages, size = args->size;
>  	int ret = -ENOMEM;
> -	int i;
>  
>  	if (!args->size || args->page_shift < 12 || args->page_shift > 34 ||
>  		(args->offset + args->size > (ULLONG_MAX >> args->page_shift)))
> @@ -320,12 +334,6 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
>  	stt->kvm = kvm;
>  	INIT_LIST_HEAD_RCU(&stt->iommu_tables);
>  
> -	for (i = 0; i < npages; i++) {
> -		stt->pages[i] = alloc_page(GFP_KERNEL | __GFP_ZERO);
> -		if (!stt->pages[i])
> -			goto fail;
> -	}
> -
>  	mutex_lock(&kvm->lock);
>  
>  	/* Check this LIOBN hasn't been previously allocated */
> @@ -352,11 +360,6 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
>  	if (ret >= 0)
>  		return ret;
>  
> - fail:
> -	for (i = 0; i < npages; i++)
> -		if (stt->pages[i])
> -			__free_page(stt->pages[i]);
> -
>  	kfree(stt);
>   fail_acct:
>  	kvmppc_account_memlimit(kvmppc_stt_pages(npages), false);
> diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c
> index 2206bc729b9a..1cd9373f8bdc 100644
> --- a/arch/powerpc/kvm/book3s_64_vio_hv.c
> +++ b/arch/powerpc/kvm/book3s_64_vio_hv.c
> @@ -158,23 +158,78 @@ static u64 *kvmppc_page_address(struct page *page)
>  	return (u64 *) page_address(page);
>  }
>  
> +/*
> + * TCE pages are allocated in kvmppc_tce_put() which won't be able to do so
> + * in real mode.
> + * Check if kvmppc_tce_put() can succeed in real mode, i.e. a TCE page is
> + * allocated or not required (when clearing a TCE entry).
> + */
> +static long kvmppc_rm_ioba_validate(struct kvmppc_spapr_tce_table *stt,
> +		unsigned long ioba, unsigned long npages, bool clearing)
> +{
> +	unsigned long i, idx, sttpage, sttpages;
> +	unsigned long ret = kvmppc_ioba_validate(stt, ioba, npages);
> +
> +	if (ret)
> +		return ret;
> +	/*
> +	 * clearing==true says kvmppc_tce_put won't be allocating pages
> +	 * for empty tces.
> +	 */
> +	if (clearing)
> +		return H_SUCCESS;
> +
> +	idx = (ioba >> stt->page_shift) - stt->offset;
> +	sttpage = idx / TCES_PER_PAGE;
> +	sttpages = _ALIGN_UP(idx % TCES_PER_PAGE + npages, TCES_PER_PAGE) /
> +			TCES_PER_PAGE;
> +	for (i = sttpage; i < sttpage + sttpages; ++i)
> +		if (!stt->pages[i])
> +			return H_TOO_HARD;
> +
> +	return H_SUCCESS;
> +}
> +
>  /*
>   * Handles TCE requests for emulated devices.
>   * Puts guest TCE values to the table and expects user space to convert them.
>   * Called in both real and virtual modes.
>   * Cannot fail so kvmppc_tce_validate must be called before it.
>   *
> - * WARNING: This will be called in real-mode on HV KVM and virtual
> - *          mode on PR KVM
> + * WARNING: This will be called in real-mode on HV HPT KVM and virtual
> + *          mode on PR KVM or HV radix KVM
>   */
>  void kvmppc_tce_put(struct kvmppc_spapr_tce_table *stt,
>  		unsigned long idx, unsigned long tce)
>  {
>  	struct page *page;
>  	u64 *tbl;
> +	unsigned long sttpage;
>  
>  	idx -= stt->offset;
> -	page = stt->pages[idx / TCES_PER_PAGE];
> +	sttpage = idx / TCES_PER_PAGE;
> +	page = stt->pages[sttpage];
> +
> +	if (!page) {
> +		/* We allow any TCE, not just with read|write permissions */
> +		if (!tce)
> +			return;
> +		/*
> +		 * We must not end up here in real mode,
> +		 * kvmppc_rm_ioba_validate() takes care of this.
> +		 */
> +		mutex_lock(&stt->kvm->lock);
> +		page = stt->pages[sttpage];
> +		if (!page) {
> +			page = alloc_page(GFP_KERNEL | __GFP_ZERO);
> +			if (WARN_ON_ONCE(!page)) {
> +				mutex_unlock(&stt->kvm->lock);
> +				return;
> +			}
> +			stt->pages[sttpage] = page;
> +		}
> +		mutex_unlock(&stt->kvm->lock);
> +	}
>  	tbl = kvmppc_page_address(page);
>  
>  	tbl[idx % TCES_PER_PAGE] = tce;
> @@ -381,7 +436,7 @@ long kvmppc_rm_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
>  	if (!stt)
>  		return H_TOO_HARD;
>  
> -	ret = kvmppc_ioba_validate(stt, ioba, 1);
> +	ret = kvmppc_rm_ioba_validate(stt, ioba, 1, tce == 0);
>  	if (ret != H_SUCCESS)
>  		return ret;
>  
> @@ -480,7 +535,7 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu,
>  	if (tce_list & (SZ_4K - 1))
>  		return H_PARAMETER;
>  
> -	ret = kvmppc_ioba_validate(stt, ioba, npages);
> +	ret = kvmppc_rm_ioba_validate(stt, ioba, npages, false);
>  	if (ret != H_SUCCESS)
>  		return ret;
>  
> @@ -583,7 +638,7 @@ long kvmppc_rm_h_stuff_tce(struct kvm_vcpu *vcpu,
>  	if (!stt)
>  		return H_TOO_HARD;
>  
> -	ret = kvmppc_ioba_validate(stt, ioba, npages);
> +	ret = kvmppc_rm_ioba_validate(stt, ioba, npages, tce_value == 0);
>  	if (ret != H_SUCCESS)
>  		return ret;
>  
> @@ -635,6 +690,10 @@ long kvmppc_h_get_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
>  
>  	idx = (ioba >> stt->page_shift) - stt->offset;
>  	page = stt->pages[idx / TCES_PER_PAGE];
> +	if (!page) {
> +		vcpu->arch.regs.gpr[4] = 0;
> +		return H_SUCCESS;
> +	}
>  	tbl = (u64 *)page_address(page);
>  
>  	vcpu->arch.regs.gpr[4] = tbl[idx % TCES_PER_PAGE];

Patch

diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c
index f02b04973710..7eed8c90ea3d 100644
--- a/arch/powerpc/kvm/book3s_64_vio.c
+++ b/arch/powerpc/kvm/book3s_64_vio.c
@@ -228,7 +228,8 @@  static void release_spapr_tce_table(struct rcu_head *head)
 	unsigned long i, npages = kvmppc_tce_pages(stt->size);
 
 	for (i = 0; i < npages; i++)
-		__free_page(stt->pages[i]);
+		if (stt->pages[i])
+			__free_page(stt->pages[i]);
 
 	kfree(stt);
 }
@@ -242,6 +243,20 @@  static vm_fault_t kvm_spapr_tce_fault(struct vm_fault *vmf)
 		return VM_FAULT_SIGBUS;
 
 	page = stt->pages[vmf->pgoff];
+	if (!page) {
+		mutex_lock(&stt->kvm->lock);
+		page = stt->pages[vmf->pgoff];
+		if (!page) {
+			page  = alloc_page(GFP_KERNEL | __GFP_ZERO);
+			if (!page) {
+				mutex_unlock(&stt->kvm->lock);
+				return VM_FAULT_OOM;
+			}
+			stt->pages[vmf->pgoff] = page;
+		}
+		mutex_unlock(&stt->kvm->lock);
+	}
+
 	get_page(page);
 	vmf->page = page;
 	return 0;
@@ -296,7 +311,6 @@  long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
 	struct kvmppc_spapr_tce_table *siter;
 	unsigned long npages, size = args->size;
 	int ret = -ENOMEM;
-	int i;
 
 	if (!args->size || args->page_shift < 12 || args->page_shift > 34 ||
 		(args->offset + args->size > (ULLONG_MAX >> args->page_shift)))
@@ -320,12 +334,6 @@  long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
 	stt->kvm = kvm;
 	INIT_LIST_HEAD_RCU(&stt->iommu_tables);
 
-	for (i = 0; i < npages; i++) {
-		stt->pages[i] = alloc_page(GFP_KERNEL | __GFP_ZERO);
-		if (!stt->pages[i])
-			goto fail;
-	}
-
 	mutex_lock(&kvm->lock);
 
 	/* Check this LIOBN hasn't been previously allocated */
@@ -352,11 +360,6 @@  long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
 	if (ret >= 0)
 		return ret;
 
- fail:
-	for (i = 0; i < npages; i++)
-		if (stt->pages[i])
-			__free_page(stt->pages[i]);
-
 	kfree(stt);
  fail_acct:
 	kvmppc_account_memlimit(kvmppc_stt_pages(npages), false);
diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c
index 2206bc729b9a..1cd9373f8bdc 100644
--- a/arch/powerpc/kvm/book3s_64_vio_hv.c
+++ b/arch/powerpc/kvm/book3s_64_vio_hv.c
@@ -158,23 +158,78 @@  static u64 *kvmppc_page_address(struct page *page)
 	return (u64 *) page_address(page);
 }
 
+/*
+ * TCE pages are allocated in kvmppc_tce_put() which won't be able to do so
+ * in real mode.
+ * Check if kvmppc_tce_put() can succeed in real mode, i.e. a TCE page is
+ * allocated or not required (when clearing a TCE entry).
+ */
+static long kvmppc_rm_ioba_validate(struct kvmppc_spapr_tce_table *stt,
+		unsigned long ioba, unsigned long npages, bool clearing)
+{
+	unsigned long i, idx, sttpage, sttpages;
+	unsigned long ret = kvmppc_ioba_validate(stt, ioba, npages);
+
+	if (ret)
+		return ret;
+	/*
+	 * clearing==true says kvmppc_tce_put won't be allocating pages
+	 * for empty tces.
+	 */
+	if (clearing)
+		return H_SUCCESS;
+
+	idx = (ioba >> stt->page_shift) - stt->offset;
+	sttpage = idx / TCES_PER_PAGE;
+	sttpages = _ALIGN_UP(idx % TCES_PER_PAGE + npages, TCES_PER_PAGE) /
+			TCES_PER_PAGE;
+	for (i = sttpage; i < sttpage + sttpages; ++i)
+		if (!stt->pages[i])
+			return H_TOO_HARD;
+
+	return H_SUCCESS;
+}
+
 /*
  * Handles TCE requests for emulated devices.
  * Puts guest TCE values to the table and expects user space to convert them.
  * Called in both real and virtual modes.
  * Cannot fail so kvmppc_tce_validate must be called before it.
  *
- * WARNING: This will be called in real-mode on HV KVM and virtual
- *          mode on PR KVM
+ * WARNING: This will be called in real-mode on HV HPT KVM and virtual
+ *          mode on PR KVM or HV radix KVM
  */
 void kvmppc_tce_put(struct kvmppc_spapr_tce_table *stt,
 		unsigned long idx, unsigned long tce)
 {
 	struct page *page;
 	u64 *tbl;
+	unsigned long sttpage;
 
 	idx -= stt->offset;
-	page = stt->pages[idx / TCES_PER_PAGE];
+	sttpage = idx / TCES_PER_PAGE;
+	page = stt->pages[sttpage];
+
+	if (!page) {
+		/* We allow any TCE, not just with read|write permissions */
+		if (!tce)
+			return;
+		/*
+		 * We must not end up here in real mode,
+		 * kvmppc_rm_ioba_validate() takes care of this.
+		 */
+		mutex_lock(&stt->kvm->lock);
+		page = stt->pages[sttpage];
+		if (!page) {
+			page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+			if (WARN_ON_ONCE(!page)) {
+				mutex_unlock(&stt->kvm->lock);
+				return;
+			}
+			stt->pages[sttpage] = page;
+		}
+		mutex_unlock(&stt->kvm->lock);
+	}
 	tbl = kvmppc_page_address(page);
 
 	tbl[idx % TCES_PER_PAGE] = tce;
@@ -381,7 +436,7 @@  long kvmppc_rm_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
 	if (!stt)
 		return H_TOO_HARD;
 
-	ret = kvmppc_ioba_validate(stt, ioba, 1);
+	ret = kvmppc_rm_ioba_validate(stt, ioba, 1, tce == 0);
 	if (ret != H_SUCCESS)
 		return ret;
 
@@ -480,7 +535,7 @@  long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu,
 	if (tce_list & (SZ_4K - 1))
 		return H_PARAMETER;
 
-	ret = kvmppc_ioba_validate(stt, ioba, npages);
+	ret = kvmppc_rm_ioba_validate(stt, ioba, npages, false);
 	if (ret != H_SUCCESS)
 		return ret;
 
@@ -583,7 +638,7 @@  long kvmppc_rm_h_stuff_tce(struct kvm_vcpu *vcpu,
 	if (!stt)
 		return H_TOO_HARD;
 
-	ret = kvmppc_ioba_validate(stt, ioba, npages);
+	ret = kvmppc_rm_ioba_validate(stt, ioba, npages, tce_value == 0);
 	if (ret != H_SUCCESS)
 		return ret;
 
@@ -635,6 +690,10 @@  long kvmppc_h_get_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
 
 	idx = (ioba >> stt->page_shift) - stt->offset;
 	page = stt->pages[idx / TCES_PER_PAGE];
+	if (!page) {
+		vcpu->arch.regs.gpr[4] = 0;
+		return H_SUCCESS;
+	}
 	tbl = (u64 *)page_address(page);
 
 	vcpu->arch.regs.gpr[4] = tbl[idx % TCES_PER_PAGE];