diff mbox series

[v2,09/11] socfpga: arria10: Improve bitstream loading speed

Message ID 20220526143715.277342-10-pan@semihalf.com
State Superseded
Delegated to: Simon Goldschmidt
Headers show
Series Add Chameleon v3 support | expand

Commit Message

Paweł Anikiel May 26, 2022, 2:37 p.m. UTC
Apply some optimizations to speed up bitstream loading
(both for full and split periph/core bitstreams):

 * Shorten the first filesystem read so that every subsequent read
   starts at a file offset aligned to a specific value (called
   MAX_FIRST_LOAD_SIZE). This value was chosen so that in subsequent
   reads the FAT filesystem driver doesn't have to allocate a temporary
   buffer in get_contents (assuming 8 KiB clusters).

 * Change the buffer size to a larger value (DDR_BUFFER_SIZE, 1 MiB)
   when reading to DDR (but not too large, because large transfers
   cause a stack overflow in the dwmmc driver).

Signed-off-by: Paweł Anikiel <pan@semihalf.com>
---
 drivers/fpga/socfpga_arria10.c | 20 ++++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

Comments

Simon Glass May 27, 2022, 3:54 p.m. UTC | #1
On Thu, 26 May 2022 at 07:38, Paweł Anikiel <pan@semihalf.com> wrote:
>
> Apply some optimizations to speed up bitstream loading
> (both for full and split periph/core bitstreams):
>
>  * Change the size of the first fs read, so that all the subsequent
>    reads are aligned to a specific value (called MAX_FIRST_LOAD_SIZE).
>    This value was chosen so that in subsequent reads the fat fs driver
>    doesn't have to allocate a temporary buffer in get_contents
>    (assuming 8KiB clusters).
>
>  * Change the buffer size to a larger value when reading to ddr
>    (but not too large, because large transfers cause a stack overflow
>    in the dwmmc driver).

When the size is too large, where exactly does that stack overflow happen?

>
> Signed-off-by: Paweł Anikiel <pan@semihalf.com>
> ---
>  drivers/fpga/socfpga_arria10.c | 20 ++++++++++++++++++--
>  1 file changed, 18 insertions(+), 2 deletions(-)
>

Reviewed-by: Simon Glass <sjg@chromium.org>
Paweł Anikiel May 30, 2022, 7:56 a.m. UTC | #2
On Fri, May 27, 2022 at 5:55 PM Simon Glass <sjg@chromium.org> wrote:
>
> On Thu, 26 May 2022 at 07:38, Paweł Anikiel <pan@semihalf.com> wrote:
> >
> > Apply some optimizations to speed up bitstream loading
> > (both for full and split periph/core bitstreams):
> >
> >  * Change the size of the first fs read, so that all the subsequent
> >    reads are aligned to a specific value (called MAX_FIRST_LOAD_SIZE).
> >    This value was chosen so that in subsequent reads the fat fs driver
> >    doesn't have to allocate a temporary buffer in get_contents
> >    (assuming 8KiB clusters).
> >
> >  * Change the buffer size to a larger value when reading to ddr
> >    (but not too large, because large transfers cause a stack overflow
> >    in the dwmmc driver).
>
> When the size is too large, where exactly does that stack overflow happen?

In dwmci_send_cmd (at drivers/mmc/dw_mmc.c:243), which stack-allocates a
buffer of size sizeof(struct dwmci_idmac) * (data->blocks / 8). Since
loading the bitstream is done from SPL (which is still in SRAM), we
only have about 100K of stack, which is not enough to load an 11 MB
file in one go.
diff mbox series

Patch

diff --git a/drivers/fpga/socfpga_arria10.c b/drivers/fpga/socfpga_arria10.c
index 798e3a3f90..07bfe3060e 100644
--- a/drivers/fpga/socfpga_arria10.c
+++ b/drivers/fpga/socfpga_arria10.c
@@ -30,6 +30,14 @@ 
 #define FPGA_TIMEOUT_MSEC	1000  /* timeout in ms */
 #define FPGA_TIMEOUT_CNT	0x1000000
 #define DEFAULT_DDR_LOAD_ADDRESS	0x400
+#define DDR_BUFFER_SIZE		0x100000
+
+/* When reading a bitstream from a filesystem, the size of the first read is
+ * changed so that the subsequent reads are aligned to this value. This value
+ * was chosen so that in subsequent reads the FAT fs driver doesn't have to
+ * allocate a temporary buffer in get_contents (assuming 8 KiB clusters).
+ */
+#define MAX_FIRST_LOAD_SIZE	0x2000
 
 DECLARE_GLOBAL_DATA_PTR;
 
@@ -526,7 +534,8 @@  static void get_rbf_image_info(struct rbf_info *rbf, u16 *buffer)
 #ifdef CONFIG_FS_LOADER
 static int first_loading_rbf_to_buffer(struct udevice *dev,
 				struct fpga_loadfs_info *fpga_loadfs,
-				u32 *buffer, size_t *buffer_bsize)
+				u32 *buffer, size_t *buffer_bsize,
+				size_t *buffer_bsize_ori)
 {
 	u32 *buffer_p = (u32 *)*buffer;
 	u32 *loadable = buffer_p;
@@ -674,6 +683,7 @@  static int first_loading_rbf_to_buffer(struct udevice *dev,
 		}
 
 		buffer_size = rbf_size;
+		*buffer_bsize_ori = DDR_BUFFER_SIZE;
 	}
 
 	debug("FPGA: External data: offset = 0x%x, size = 0x%x.\n",
@@ -686,11 +696,16 @@  static int first_loading_rbf_to_buffer(struct udevice *dev,
 	 * chunk by chunk transfer is required due to smaller buffer size
 	 * compare to bitstream
 	 */
+
+	if (buffer_size > MAX_FIRST_LOAD_SIZE)
+		buffer_size = MAX_FIRST_LOAD_SIZE;
+
 	if (rbf_size <= buffer_size) {
 		/* Loading whole bitstream into buffer */
 		buffer_size = rbf_size;
 		fpga_loadfs->remaining = 0;
 	} else {
+		buffer_size -= rbf_offset % buffer_size;
 		fpga_loadfs->remaining -= buffer_size;
 	}
 
@@ -806,7 +821,8 @@  int socfpga_loadfs(fpga_fs_info *fpga_fsinfo, const void *buf, size_t bsize,
 	 * function below.
 	 */
 	ret = first_loading_rbf_to_buffer(dev, &fpga_loadfs, &buffer,
-					   &buffer_sizebytes);
+					   &buffer_sizebytes,
+					   &buffer_sizebytes_ori);
 	if (ret == 1) {
 		printf("FPGA: Skipping configuration ...\n");
 		return 0;