diff mbox series

[v3,09/16] gpu: host1x: Optimize CDMA push buffer memory usage

Message ID 20190201132837.12327-10-thierry.reding@gmail.com
State Deferred
Headers show
Series drm/tegra: Fix IOVA space on Tegra186 and later | expand

Commit Message

Thierry Reding Feb. 1, 2019, 1:28 p.m. UTC
From: Thierry Reding <treding@nvidia.com>

The host1x CDMA push buffer is terminated by a special opcode (RESTART)
that tells the CDMA to wrap around to the beginning of the push buffer.
To accommodate the RESTART opcode, an extra 4 bytes are allocated on top
of the 512 * 8 = 4096 bytes needed for the 512 slots (1 slot = 2 words)
that are used for other commands passed to CDMA. This requires that two
memory pages are allocated, but most of the second page (4092 bytes) is
never used.

Decrease the number of slots to 511 so that the RESTART opcode fits
within the page. Adjust the push buffer wraparound code to take into
account push buffer sizes that are not a power of two.

Signed-off-by: Thierry Reding <treding@nvidia.com>
---
 drivers/gpu/host1x/cdma.c | 29 +++++++++++++++++++++++++----
 1 file changed, 25 insertions(+), 4 deletions(-)

Comments

Dmitry Osipenko Feb. 1, 2019, 2:48 p.m. UTC | #1
01.02.2019 16:28, Thierry Reding пишет:
> From: Thierry Reding <treding@nvidia.com>
> 
> The host1x CDMA push buffer is terminated by a special opcode (RESTART)
> that tells the CDMA to wrap around to the beginning of the push buffer.
> To accommodate the RESTART opcode, an extra 4 bytes are allocated on top
> of the 512 * 8 = 4096 bytes needed for the 512 slots (1 slot = 2 words)
> that are used for other commands passed to CDMA. This requires that two
> memory pages are allocated, but most of the second page (4092 bytes) is
> never used.
> 
> Decrease the number of slots to 511 so that the RESTART opcode fits
> within the page. Adjust the push buffer wraparound code to take into
> account push buffer sizes that are not a power of two.
> 
> Signed-off-by: Thierry Reding <treding@nvidia.com>
> ---
>  drivers/gpu/host1x/cdma.c | 29 +++++++++++++++++++++++++----
>  1 file changed, 25 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/gpu/host1x/cdma.c b/drivers/gpu/host1x/cdma.c
> index a96c4dd1e449..50c1370b56c7 100644
> --- a/drivers/gpu/host1x/cdma.c
> +++ b/drivers/gpu/host1x/cdma.c
> @@ -42,7 +42,17 @@
>   * means that the push buffer is full, not empty.
>   */
>  
> -#define HOST1X_PUSHBUFFER_SLOTS	512
> +/*
> + * Typically the commands written into the push buffer are a pair of words. We
> + * use slots to represent each of these pairs and to simplify things. Note the
> + * strange number of slots allocated here. 512 slots will fit exactly within a
> + * single memory page. We also need one additional word at the end of the push
> + * buffer for the RESTART opcode that will instruct the CDMA to jump back to
> + * the beginning of the push buffer. With 512 slots, this means that we'll use
> + * 2 memory pages and waste 4092 bytes of the second page that will never be
> + * used.
> + */
> +#define HOST1X_PUSHBUFFER_SLOTS	511
>  
>  /*
>   * Clean up push buffer resources
> @@ -148,7 +158,10 @@ static void host1x_pushbuffer_push(struct push_buffer *pb, u32 op1, u32 op2)
>  	WARN_ON(pb->pos == pb->fence);
>  	*(p++) = op1;
>  	*(p++) = op2;
> -	pb->pos = (pb->pos + 8) & (pb->size - 1);
> +	pb->pos += 8;
> +
> +	if (pb->pos >= pb->size)
> +		pb->pos -= pb->size;
>  }
>  
>  /*
> @@ -158,7 +171,10 @@ static void host1x_pushbuffer_push(struct push_buffer *pb, u32 op1, u32 op2)
>  static void host1x_pushbuffer_pop(struct push_buffer *pb, unsigned int slots)
>  {
>  	/* Advance the next write position */
> -	pb->fence = (pb->fence + slots * 8) & (pb->size - 1);
> +	pb->fence += slots * 8;
> +
> +	if (pb->fence >= pb->size)
> +		pb->fence -= pb->size;
>  }
>  
>  /*
> @@ -166,7 +182,12 @@ static void host1x_pushbuffer_pop(struct push_buffer *pb, unsigned int slots)
>   */
>  static u32 host1x_pushbuffer_space(struct push_buffer *pb)
>  {
> -	return ((pb->fence - pb->pos) & (pb->size - 1)) / 8;
> +	unsigned int fence = pb->fence;
> +
> +	if (pb->fence < pb->pos)
> +		fence += pb->size;
> +
> +	return (fence - pb->pos) / 8;
>  }
>  
>  /*
> 

Reviewed-by: Dmitry Osipenko <digetx@gmail.com>
Tested-by: Dmitry Osipenko <digetx@gmail.com>
diff mbox series

Patch

diff --git a/drivers/gpu/host1x/cdma.c b/drivers/gpu/host1x/cdma.c
index a96c4dd1e449..50c1370b56c7 100644
--- a/drivers/gpu/host1x/cdma.c
+++ b/drivers/gpu/host1x/cdma.c
@@ -42,7 +42,17 @@ 
  * means that the push buffer is full, not empty.
  */
 
-#define HOST1X_PUSHBUFFER_SLOTS	512
+/*
+ * Typically the commands written into the push buffer are a pair of words. We
+ * use slots to represent each of these pairs and to simplify things. Note the
+ * strange number of slots allocated here. 512 slots will fit exactly within a
+ * single memory page. We also need one additional word at the end of the push
+ * buffer for the RESTART opcode that will instruct the CDMA to jump back to
+ * the beginning of the push buffer. With 512 slots, that would require two
+ * memory pages, wasting 4092 bytes of the second page. Using 511 slots lets
+ * the RESTART opcode fit within the single page.
+ */
+#define HOST1X_PUSHBUFFER_SLOTS	511
 
 /*
  * Clean up push buffer resources
@@ -148,7 +158,10 @@  static void host1x_pushbuffer_push(struct push_buffer *pb, u32 op1, u32 op2)
 	WARN_ON(pb->pos == pb->fence);
 	*(p++) = op1;
 	*(p++) = op2;
-	pb->pos = (pb->pos + 8) & (pb->size - 1);
+	pb->pos += 8;
+
+	if (pb->pos >= pb->size)
+		pb->pos -= pb->size;
 }
 
 /*
@@ -158,7 +171,10 @@  static void host1x_pushbuffer_push(struct push_buffer *pb, u32 op1, u32 op2)
 static void host1x_pushbuffer_pop(struct push_buffer *pb, unsigned int slots)
 {
 	/* Advance the next write position */
-	pb->fence = (pb->fence + slots * 8) & (pb->size - 1);
+	pb->fence += slots * 8;
+
+	if (pb->fence >= pb->size)
+		pb->fence -= pb->size;
 }
 
 /*
@@ -166,7 +182,12 @@  static void host1x_pushbuffer_pop(struct push_buffer *pb, unsigned int slots)
  */
 static u32 host1x_pushbuffer_space(struct push_buffer *pb)
 {
-	return ((pb->fence - pb->pos) & (pb->size - 1)) / 8;
+	unsigned int fence = pb->fence;
+
+	if (pb->fence < pb->pos)
+		fence += pb->size;
+
+	return (fence - pb->pos) / 8;
 }
 
 /*