diff mbox series

[v2,2/3] powerpc/powernv/npu: Use size-based ATSD invalidates

Message ID 1538592694-18739-3-git-send-email-mhairgrove@nvidia.com (mailing list archive)
State Accepted
Commit 3689c37d23fceb786e74b1a71501f3583223ab39
Headers show
Series powerpc/powernv/npu: Improve ATSD invalidation overhead | expand

Checks

Context Check Description
snowpatch_ozlabs/apply_patch success next/apply_patch Successfully applied
snowpatch_ozlabs/checkpatch warning Test checkpatch on branch next

Commit Message

Mark Hairgrove Oct. 3, 2018, 6:51 p.m. UTC
Prior to this change only two types of ATSDs were issued to the NPU:
invalidates targeting a single page and invalidates targeting the whole
address space. The crossover point happened at the configurable
atsd_threshold which defaulted to 2M. For invalidates of that size or
smaller, a separate per-page invalidate was issued for each page in the range.

The NPU supports more invalidation sizes however: 64K, 2M, 1G, and all.
These invalidates target addresses aligned to their size. 2M is a common
invalidation size for GPU-enabled applications because that is a GPU
page size, so reducing the number of invalidates by 32x in that case is a
clear improvement.

ATSD latency is high in general so now we always issue a single invalidate
rather than multiple. This will over-invalidate in some cases, but for any
invalidation size over 2M it matches or improves on the prior behavior.
There's also an improvement for single-page invalidates since the prior
version issued two invalidates for that case instead of one.

With this change all issued ATSDs now perform a flush, so the flush
parameter has been removed from all the helpers.

To show the benefit here are some performance numbers from a
microbenchmark which creates a 1G allocation then uses mprotect with
PROT_NONE to trigger invalidates in strides across the allocation.

One NPU (1 GPU):

         mprotect rate (GB/s)
Stride   Before      After      Speedup
64K         5.3        5.6           5%
1M         39.3       57.4          46%
2M         49.7       82.6          66%
4M        286.6      285.7           0%

Two NPUs (6 GPUs):

         mprotect rate (GB/s)
Stride   Before      After      Speedup
64K         6.5        7.4          13%
1M         33.4       67.9         103%
2M         38.7       93.1         141%
4M        356.7      354.6          -1%

Anything over 2M is roughly the same as before since both cases issue a
single ATSD.

Signed-off-by: Mark Hairgrove <mhairgrove@nvidia.com>
---
 arch/powerpc/platforms/powernv/npu-dma.c |  103 ++++++++++++++++--------------
 1 files changed, 55 insertions(+), 48 deletions(-)

Comments

Alistair Popple Oct. 4, 2018, 5:20 a.m. UTC | #1
Reviewed-By: Alistair Popple <alistair@popple.id.au>

On Wednesday, 3 October 2018 11:51:33 AM AEST Mark Hairgrove wrote:
> Prior to this change only two types of ATSDs were issued to the NPU:
> invalidates targeting a single page and invalidates targeting the whole
> address space. The crossover point happened at the configurable
> atsd_threshold which defaulted to 2M. Invalidates that size or smaller
> would issue per-page invalidates for the whole range.
> 
> The NPU supports more invalidation sizes however: 64K, 2M, 1G, and all.
> These invalidates target addresses aligned to their size. 2M is a common
> invalidation size for GPU-enabled applications because that is a GPU
> page size, so reducing the number of invalidates by 32x in that case is a
> clear improvement.
> 
> ATSD latency is high in general so now we always issue a single invalidate
> rather than multiple. This will over-invalidate in some cases, but for any
> invalidation size over 2M it matches or improves the prior behavior.
> There's also an improvement for single-page invalidates since the prior
> version issued two invalidates for that case instead of one.
> 
> With this change all issued ATSDs now perform a flush, so the flush
> parameter has been removed from all the helpers.
> 
> To show the benefit here are some performance numbers from a
> microbenchmark which creates a 1G allocation then uses mprotect with
> PROT_NONE to trigger invalidates in strides across the allocation.
> 
> One NPU (1 GPU):
> 
>          mprotect rate (GB/s)
> Stride   Before      After      Speedup
> 64K         5.3        5.6           5%
> 1M         39.3       57.4          46%
> 2M         49.7       82.6          66%
> 4M        286.6      285.7           0%
> 
> Two NPUs (6 GPUs):
> 
>          mprotect rate (GB/s)
> Stride   Before      After      Speedup
> 64K         6.5        7.4          13%
> 1M         33.4       67.9         103%
> 2M         38.7       93.1         141%
> 4M        356.7      354.6          -1%
> 
> Anything over 2M is roughly the same as before since both cases issue a
> single ATSD.
> 
> Signed-off-by: Mark Hairgrove <mhairgrove@nvidia.com>
> ---
>  arch/powerpc/platforms/powernv/npu-dma.c |  103 ++++++++++++++++--------------
>  1 files changed, 55 insertions(+), 48 deletions(-)
> 
> diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c
> index c8f438a..e4c0fab 100644
> --- a/arch/powerpc/platforms/powernv/npu-dma.c
> +++ b/arch/powerpc/platforms/powernv/npu-dma.c
> @@ -18,6 +18,7 @@
>  #include <linux/memblock.h>
>  #include <linux/iommu.h>
>  #include <linux/debugfs.h>
> +#include <linux/sizes.h>
>  
>  #include <asm/debugfs.h>
>  #include <asm/tlb.h>
> @@ -458,8 +459,7 @@ static void put_mmio_atsd_reg(struct npu *npu, int reg)
>  #define XTS_ATSD_AVA    1
>  #define XTS_ATSD_STAT   2
>  
> -static unsigned long get_atsd_launch_val(unsigned long pid, unsigned long psize,
> -					bool flush)
> +static unsigned long get_atsd_launch_val(unsigned long pid, unsigned long psize)
>  {
>  	unsigned long launch = 0;
>  
> @@ -477,8 +477,7 @@ static unsigned long get_atsd_launch_val(unsigned long pid, unsigned long psize,
>  	/* PID */
>  	launch |= pid << PPC_BITLSHIFT(38);
>  
> -	/* No flush */
> -	launch |= !flush << PPC_BITLSHIFT(39);
> +	/* Leave "No flush" (bit 39) 0 so every ATSD performs a flush */
>  
>  	return launch;
>  }
> @@ -501,23 +500,22 @@ static void mmio_atsd_regs_write(struct mmio_atsd_reg
>  }
>  
>  static void mmio_invalidate_pid(struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS],
> -				unsigned long pid, bool flush)
> +				unsigned long pid)
>  {
> -	unsigned long launch = get_atsd_launch_val(pid, MMU_PAGE_COUNT, flush);
> +	unsigned long launch = get_atsd_launch_val(pid, MMU_PAGE_COUNT);
>  
>  	/* Invalidating the entire process doesn't use a va */
>  	mmio_atsd_regs_write(mmio_atsd_reg, XTS_ATSD_LAUNCH, launch);
>  }
>  
> -static void mmio_invalidate_va(struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS],
> -			unsigned long va, unsigned long pid, bool flush)
> +static void mmio_invalidate_range(struct mmio_atsd_reg
> +			mmio_atsd_reg[NV_MAX_NPUS], unsigned long pid,
> +			unsigned long start, unsigned long psize)
>  {
> -	unsigned long launch;
> -
> -	launch = get_atsd_launch_val(pid, mmu_virtual_psize, flush);
> +	unsigned long launch = get_atsd_launch_val(pid, psize);
>  
>  	/* Write all VAs first */
> -	mmio_atsd_regs_write(mmio_atsd_reg, XTS_ATSD_AVA, va);
> +	mmio_atsd_regs_write(mmio_atsd_reg, XTS_ATSD_AVA, start);
>  
>  	/* Issue one barrier for all address writes */
>  	eieio();
> @@ -609,14 +607,36 @@ static void release_atsd_reg(struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS])
>  }
>  
>  /*
> - * Invalidate either a single address or an entire PID depending on
> - * the value of va.
> + * Invalidate a virtual address range
>   */
> -static void mmio_invalidate(struct npu_context *npu_context, int va,
> -			unsigned long address, bool flush)
> +static void mmio_invalidate(struct npu_context *npu_context,
> +			unsigned long start, unsigned long size)
>  {
>  	struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS];
>  	unsigned long pid = npu_context->mm->context.id;
> +	unsigned long atsd_start = 0;
> +	unsigned long end = start + size - 1;
> +	int atsd_psize = MMU_PAGE_COUNT;
> +
> +	/*
> +	 * Convert the input range into one of the supported sizes. If the range
> +	 * doesn't fit, use the next larger supported size. Invalidation latency
> +	 * is high, so over-invalidation is preferred to issuing multiple
> +	 * invalidates.
> +	 *
> +	 * A 4K page size isn't supported by NPU/GPU ATS, so that case is
> +	 * ignored.
> +	 */
> +	if (size == SZ_64K) {
> +		atsd_start = start;
> +		atsd_psize = MMU_PAGE_64K;
> +	} else if (ALIGN_DOWN(start, SZ_2M) == ALIGN_DOWN(end, SZ_2M)) {
> +		atsd_start = ALIGN_DOWN(start, SZ_2M);
> +		atsd_psize = MMU_PAGE_2M;
> +	} else if (ALIGN_DOWN(start, SZ_1G) == ALIGN_DOWN(end, SZ_1G)) {
> +		atsd_start = ALIGN_DOWN(start, SZ_1G);
> +		atsd_psize = MMU_PAGE_1G;
> +	}
>  
>  	if (npu_context->nmmu_flush)
>  		/*
> @@ -631,23 +651,25 @@ static void mmio_invalidate(struct npu_context *npu_context, int va,
>  	 * an invalidate.
>  	 */
>  	acquire_atsd_reg(npu_context, mmio_atsd_reg);
> -	if (va)
> -		mmio_invalidate_va(mmio_atsd_reg, address, pid, flush);
> +
> +	if (atsd_psize == MMU_PAGE_COUNT)
> +		mmio_invalidate_pid(mmio_atsd_reg, pid);
>  	else
> -		mmio_invalidate_pid(mmio_atsd_reg, pid, flush);
> +		mmio_invalidate_range(mmio_atsd_reg, pid, atsd_start,
> +					atsd_psize);
>  
>  	mmio_invalidate_wait(mmio_atsd_reg);
> -	if (flush) {
> -		/*
> -		 * The GPU requires two flush ATSDs to ensure all entries have
> -		 * been flushed. We use PID 0 as it will never be used for a
> -		 * process on the GPU.
> -		 */
> -		mmio_invalidate_pid(mmio_atsd_reg, 0, true);
> -		mmio_invalidate_wait(mmio_atsd_reg);
> -		mmio_invalidate_pid(mmio_atsd_reg, 0, true);
> -		mmio_invalidate_wait(mmio_atsd_reg);
> -	}
> +
> +	/*
> +	 * The GPU requires two flush ATSDs to ensure all entries have been
> +	 * flushed. We use PID 0 as it will never be used for a process on the
> +	 * GPU.
> +	 */
> +	mmio_invalidate_pid(mmio_atsd_reg, 0);
> +	mmio_invalidate_wait(mmio_atsd_reg);
> +	mmio_invalidate_pid(mmio_atsd_reg, 0);
> +	mmio_invalidate_wait(mmio_atsd_reg);
> +
>  	release_atsd_reg(mmio_atsd_reg);
>  }
>  
> @@ -664,7 +686,7 @@ static void pnv_npu2_mn_release(struct mmu_notifier *mn,
>  	 * There should be no more translation requests for this PID, but we
>  	 * need to ensure any entries for it are removed from the TLB.
>  	 */
> -	mmio_invalidate(npu_context, 0, 0, true);
> +	mmio_invalidate(npu_context, 0, ~0UL);
>  }
>  
>  static void pnv_npu2_mn_change_pte(struct mmu_notifier *mn,
> @@ -673,8 +695,7 @@ static void pnv_npu2_mn_change_pte(struct mmu_notifier *mn,
>  				pte_t pte)
>  {
>  	struct npu_context *npu_context = mn_to_npu_context(mn);
> -
> -	mmio_invalidate(npu_context, 1, address, true);
> +	mmio_invalidate(npu_context, address, PAGE_SIZE);
>  }
>  
>  static void pnv_npu2_mn_invalidate_range(struct mmu_notifier *mn,
> @@ -682,21 +703,7 @@ static void pnv_npu2_mn_invalidate_range(struct mmu_notifier *mn,
>  					unsigned long start, unsigned long end)
>  {
>  	struct npu_context *npu_context = mn_to_npu_context(mn);
> -	unsigned long address;
> -
> -	if (end - start > atsd_threshold) {
> -		/*
> -		 * Just invalidate the entire PID if the address range is too
> -		 * large.
> -		 */
> -		mmio_invalidate(npu_context, 0, 0, true);
> -	} else {
> -		for (address = start; address < end; address += PAGE_SIZE)
> -			mmio_invalidate(npu_context, 1, address, false);
> -
> -		/* Do the flush only on the final addess == end */
> -		mmio_invalidate(npu_context, 1, address, true);
> -	}
> +	mmio_invalidate(npu_context, start, end - start);
>  }
>  
>  static const struct mmu_notifier_ops nv_nmmu_notifier_ops = {
>
diff mbox series

Patch

diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c
index c8f438a..e4c0fab 100644
--- a/arch/powerpc/platforms/powernv/npu-dma.c
+++ b/arch/powerpc/platforms/powernv/npu-dma.c
@@ -18,6 +18,7 @@ 
 #include <linux/memblock.h>
 #include <linux/iommu.h>
 #include <linux/debugfs.h>
+#include <linux/sizes.h>
 
 #include <asm/debugfs.h>
 #include <asm/tlb.h>
@@ -458,8 +459,7 @@  static void put_mmio_atsd_reg(struct npu *npu, int reg)
 #define XTS_ATSD_AVA    1
 #define XTS_ATSD_STAT   2
 
-static unsigned long get_atsd_launch_val(unsigned long pid, unsigned long psize,
-					bool flush)
+static unsigned long get_atsd_launch_val(unsigned long pid, unsigned long psize)
 {
 	unsigned long launch = 0;
 
@@ -477,8 +477,7 @@  static unsigned long get_atsd_launch_val(unsigned long pid, unsigned long psize,
 	/* PID */
 	launch |= pid << PPC_BITLSHIFT(38);
 
-	/* No flush */
-	launch |= !flush << PPC_BITLSHIFT(39);
+	/* Leave "No flush" (bit 39) 0 so every ATSD performs a flush */
 
 	return launch;
 }
@@ -501,23 +500,22 @@  static void mmio_atsd_regs_write(struct mmio_atsd_reg
 }
 
 static void mmio_invalidate_pid(struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS],
-				unsigned long pid, bool flush)
+				unsigned long pid)
 {
-	unsigned long launch = get_atsd_launch_val(pid, MMU_PAGE_COUNT, flush);
+	unsigned long launch = get_atsd_launch_val(pid, MMU_PAGE_COUNT);
 
 	/* Invalidating the entire process doesn't use a va */
 	mmio_atsd_regs_write(mmio_atsd_reg, XTS_ATSD_LAUNCH, launch);
 }
 
-static void mmio_invalidate_va(struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS],
-			unsigned long va, unsigned long pid, bool flush)
+static void mmio_invalidate_range(struct mmio_atsd_reg
+			mmio_atsd_reg[NV_MAX_NPUS], unsigned long pid,
+			unsigned long start, unsigned long psize)
 {
-	unsigned long launch;
-
-	launch = get_atsd_launch_val(pid, mmu_virtual_psize, flush);
+	unsigned long launch = get_atsd_launch_val(pid, psize);
 
 	/* Write all VAs first */
-	mmio_atsd_regs_write(mmio_atsd_reg, XTS_ATSD_AVA, va);
+	mmio_atsd_regs_write(mmio_atsd_reg, XTS_ATSD_AVA, start);
 
 	/* Issue one barrier for all address writes */
 	eieio();
@@ -609,14 +607,36 @@  static void release_atsd_reg(struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS])
 }
 
 /*
- * Invalidate either a single address or an entire PID depending on
- * the value of va.
+ * Invalidate a virtual address range
  */
-static void mmio_invalidate(struct npu_context *npu_context, int va,
-			unsigned long address, bool flush)
+static void mmio_invalidate(struct npu_context *npu_context,
+			unsigned long start, unsigned long size)
 {
 	struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS];
 	unsigned long pid = npu_context->mm->context.id;
+	unsigned long atsd_start = 0;
+	unsigned long end = start + size - 1;
+	int atsd_psize = MMU_PAGE_COUNT;
+
+	/*
+	 * Convert the input range into one of the supported sizes. If the range
+	 * doesn't fit, use the next larger supported size. Invalidation latency
+	 * is high, so over-invalidation is preferred to issuing multiple
+	 * invalidates.
+	 *
+	 * A 4K page size isn't supported by NPU/GPU ATS, so that case is
+	 * ignored.
+	 */
+	if (size == SZ_64K) {
+		atsd_start = start;
+		atsd_psize = MMU_PAGE_64K;
+	} else if (ALIGN_DOWN(start, SZ_2M) == ALIGN_DOWN(end, SZ_2M)) {
+		atsd_start = ALIGN_DOWN(start, SZ_2M);
+		atsd_psize = MMU_PAGE_2M;
+	} else if (ALIGN_DOWN(start, SZ_1G) == ALIGN_DOWN(end, SZ_1G)) {
+		atsd_start = ALIGN_DOWN(start, SZ_1G);
+		atsd_psize = MMU_PAGE_1G;
+	}
 
 	if (npu_context->nmmu_flush)
 		/*
@@ -631,23 +651,25 @@  static void mmio_invalidate(struct npu_context *npu_context, int va,
 	 * an invalidate.
 	 */
 	acquire_atsd_reg(npu_context, mmio_atsd_reg);
-	if (va)
-		mmio_invalidate_va(mmio_atsd_reg, address, pid, flush);
+
+	if (atsd_psize == MMU_PAGE_COUNT)
+		mmio_invalidate_pid(mmio_atsd_reg, pid);
 	else
-		mmio_invalidate_pid(mmio_atsd_reg, pid, flush);
+		mmio_invalidate_range(mmio_atsd_reg, pid, atsd_start,
+					atsd_psize);
 
 	mmio_invalidate_wait(mmio_atsd_reg);
-	if (flush) {
-		/*
-		 * The GPU requires two flush ATSDs to ensure all entries have
-		 * been flushed. We use PID 0 as it will never be used for a
-		 * process on the GPU.
-		 */
-		mmio_invalidate_pid(mmio_atsd_reg, 0, true);
-		mmio_invalidate_wait(mmio_atsd_reg);
-		mmio_invalidate_pid(mmio_atsd_reg, 0, true);
-		mmio_invalidate_wait(mmio_atsd_reg);
-	}
+
+	/*
+	 * The GPU requires two flush ATSDs to ensure all entries have been
+	 * flushed. We use PID 0 as it will never be used for a process on the
+	 * GPU.
+	 */
+	mmio_invalidate_pid(mmio_atsd_reg, 0);
+	mmio_invalidate_wait(mmio_atsd_reg);
+	mmio_invalidate_pid(mmio_atsd_reg, 0);
+	mmio_invalidate_wait(mmio_atsd_reg);
+
 	release_atsd_reg(mmio_atsd_reg);
 }
 
@@ -664,7 +686,7 @@  static void pnv_npu2_mn_release(struct mmu_notifier *mn,
 	 * There should be no more translation requests for this PID, but we
 	 * need to ensure any entries for it are removed from the TLB.
 	 */
-	mmio_invalidate(npu_context, 0, 0, true);
+	mmio_invalidate(npu_context, 0, ~0UL);
 }
 
 static void pnv_npu2_mn_change_pte(struct mmu_notifier *mn,
@@ -673,8 +695,7 @@  static void pnv_npu2_mn_change_pte(struct mmu_notifier *mn,
 				pte_t pte)
 {
 	struct npu_context *npu_context = mn_to_npu_context(mn);
-
-	mmio_invalidate(npu_context, 1, address, true);
+	mmio_invalidate(npu_context, address, PAGE_SIZE);
 }
 
 static void pnv_npu2_mn_invalidate_range(struct mmu_notifier *mn,
@@ -682,21 +703,7 @@  static void pnv_npu2_mn_invalidate_range(struct mmu_notifier *mn,
 					unsigned long start, unsigned long end)
 {
 	struct npu_context *npu_context = mn_to_npu_context(mn);
-	unsigned long address;
-
-	if (end - start > atsd_threshold) {
-		/*
-		 * Just invalidate the entire PID if the address range is too
-		 * large.
-		 */
-		mmio_invalidate(npu_context, 0, 0, true);
-	} else {
-		for (address = start; address < end; address += PAGE_SIZE)
-			mmio_invalidate(npu_context, 1, address, false);
-
-		/* Do the flush only on the final addess == end */
-		mmio_invalidate(npu_context, 1, address, true);
-	}
+	mmio_invalidate(npu_context, start, end - start);
 }
 
 static const struct mmu_notifier_ops nv_nmmu_notifier_ops = {