diff mbox series

mtd: spi-nor: Fix Cadence QSPI page fault kernel panic

Message ID 1521485124-24882-1-git-send-email-thor.thayer@linux.intel.com
State Superseded
Headers show
Series mtd: spi-nor: Fix Cadence QSPI page fault kernel panic | expand

Commit Message

Thor Thayer March 19, 2018, 6:45 p.m. UTC
From: Thor Thayer <thor.thayer@linux.intel.com>

The current Cadence QSPI driver caused a kernel panic when loading
a Root Filesystem from QSPI. The problem was caused by reading more
bytes than needed because the QSPI operated on 4 bytes at a time.
<snip>
[    7.947754] spi_nor_read[1048]:from 0x037cad74, len 1 [bfe07fff]
[    7.956247] cqspi_read[910]:offset 0x58502516, buffer=bfe07fff
[    7.956247]
[    7.966046] Unable to handle kernel paging request at virtual
address bfe08002
[    7.973239] pgd = eebfc000
[    7.975931] [bfe08002] *pgd=2fffb811, *pte=00000000, *ppte=00000000
</snip>
Notice above how only 1 byte needed to be read but by reading 4 bytes
into the end of a mapped page, a unrecoverable page fault occurred.

This patch uses a temporary buffer to hold the 4 bytes read and then
copies only the bytes required into the buffer. A min() function is
used to limit the length to prevent buffer overflows.

Similar changes were made for the write routine.

Signed-off-by: Thor Thayer <thor.thayer@linux.intel.com>
---
 drivers/mtd/spi-nor/cadence-quadspi.c | 39 ++++++++++++++++++++++++++++-------
 1 file changed, 31 insertions(+), 8 deletions(-)

Comments

Marek Vasut March 19, 2018, 10:33 p.m. UTC | #1
On 03/19/2018 07:45 PM, thor.thayer@linux.intel.com wrote:
> From: Thor Thayer <thor.thayer@linux.intel.com>
> 
> The current Cadence QSPI driver caused a kernel panic when loading
> a Root Filesystem from QSPI. The problem was caused by reading more
> bytes than needed because the QSPI operated on 4 bytes at a time.
> <snip>
> [    7.947754] spi_nor_read[1048]:from 0x037cad74, len 1 [bfe07fff]
> [    7.956247] cqspi_read[910]:offset 0x58502516, buffer=bfe07fff
> [    7.956247]
> [    7.966046] Unable to handle kernel paging request at virtual
> address bfe08002
> [    7.973239] pgd = eebfc000
> [    7.975931] [bfe08002] *pgd=2fffb811, *pte=00000000, *ppte=00000000
> </snip>
> Notice above how only 1 byte needed to be read but by reading 4 bytes
> into the end of a mapped page, a unrecoverable page fault occurred.
> 
> This patch uses a temporary buffer to hold the 4 bytes read and then
> copies only the bytes required into the buffer. A min() function is
> used to limit the length to prevent buffer overflows.
> 
> Similar changes were made for the write routine.
> 
> Signed-off-by: Thor Thayer <thor.thayer@linux.intel.com>
> ---
>  drivers/mtd/spi-nor/cadence-quadspi.c | 39 ++++++++++++++++++++++++++++-------
>  1 file changed, 31 insertions(+), 8 deletions(-)
> 
> diff --git a/drivers/mtd/spi-nor/cadence-quadspi.c b/drivers/mtd/spi-nor/cadence-quadspi.c
> index 4b8e9183489a..b22ed982f896 100644
> --- a/drivers/mtd/spi-nor/cadence-quadspi.c
> +++ b/drivers/mtd/spi-nor/cadence-quadspi.c
> @@ -501,7 +501,8 @@ static int cqspi_indirect_read_execute(struct spi_nor *nor, u8 *rxbuf,
>  	void __iomem *reg_base = cqspi->iobase;
>  	void __iomem *ahb_base = cqspi->ahb_base;
>  	unsigned int remaining = n_rx;
> -	unsigned int bytes_to_read = 0;
> +	unsigned int mod_bytes, words_to_read, bytes_to_read = 0;
> +	u8 *rxbuf_end = rxbuf + n_rx;
>  	int ret = 0;
>  
>  	writel(from_addr, reg_base + CQSPI_REG_INDIRECTRDSTARTADDR);
> @@ -533,9 +534,21 @@ static int cqspi_indirect_read_execute(struct spi_nor *nor, u8 *rxbuf,
>  			bytes_to_read *= cqspi->fifo_width;
>  			bytes_to_read = bytes_to_read > remaining ?
>  					remaining : bytes_to_read;
> -			ioread32_rep(ahb_base, rxbuf,
> -				     DIV_ROUND_UP(bytes_to_read, 4));
> -			rxbuf += bytes_to_read;
> +			/* Read 4 byte chunks before using single byte mode */
> +			words_to_read = bytes_to_read / 4;
> +			mod_bytes = bytes_to_read % 4;
> +			if (words_to_read) {
> +				ioread32_rep(ahb_base, rxbuf, words_to_read);
> +				rxbuf += (words_to_read * 4);
> +			}
> +			if (mod_bytes) {
> +				unsigned int temp = ioread32(ahb_base);
> +
> +				memcpy(rxbuf, &temp, min((unsigned int)
> +							 (rxbuf_end - rxbuf),
> +							 mod_bytes));
> +				rxbuf += mod_bytes;
> +			}

Wouldn't it make more sense to read in 4 byte increments all the time
except for the one last read instead ? This code above where you always
check for trailing bytes can make the next read cycle do ioreaad32 into
unaligned memory address and thus cause slowdown. (consider the example
where the controller first reports it has 5 bytes ready, then reports it
has 7 bytes ready. If you read 4 bytes in the first cycle, wait a bit
and then check how much data the controller has in the next cycle, it
will be 8 bytes, all nicely aligned).

Does it make sense ?
kernel test robot March 20, 2018, 8:15 a.m. UTC | #2
Hi Thor,

I love your patch! Perhaps something to improve:

[auto build test WARNING on v4.16-rc4]
[also build test WARNING on next-20180319]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url:    https://github.com/0day-ci/linux/commits/thor-thayer-linux-intel-com/mtd-spi-nor-Fix-Cadence-QSPI-page-fault-kernel-panic/20180320-133857
config: alpha-allmodconfig (attached as .config)
compiler: alpha-linux-gnu-gcc (Debian 7.2.0-11) 7.2.0
reproduce:
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # save the attached .config to linux build tree
        make.cross ARCH=alpha 

All warnings (new ones prefixed by >>):

   In file included from include/linux/clk.h:16:0,
                    from drivers/mtd/spi-nor/cadence-quadspi.c:18:
   drivers/mtd/spi-nor/cadence-quadspi.c: In function 'cqspi_indirect_write_execute':
   include/linux/kernel.h:793:16: warning: comparison of distinct pointer types lacks a cast
     (void) (&min1 == &min2);   \
                   ^
   include/linux/kernel.h:802:2: note: in expansion of macro '__min'
     __min(typeof(x), typeof(y),   \
     ^~~~~
>> drivers/mtd/spi-nor/cadence-quadspi.c:651:25: note: in expansion of macro 'min'
       memcpy(&temp, txbuf, min(sizeof(temp), mod_bytes));
                            ^~~

vim +/min +651 drivers/mtd/spi-nor/cadence-quadspi.c

   606	
   607	static int cqspi_indirect_write_execute(struct spi_nor *nor, loff_t to_addr,
   608						const u8 *txbuf, const size_t n_tx)
   609	{
   610		const unsigned int page_size = nor->page_size;
   611		struct cqspi_flash_pdata *f_pdata = nor->priv;
   612		struct cqspi_st *cqspi = f_pdata->cqspi;
   613		void __iomem *reg_base = cqspi->iobase;
   614		unsigned int remaining = n_tx;
   615		unsigned int mod_bytes, write_bytes, write_words;
   616		int ret;
   617	
   618		writel(to_addr, reg_base + CQSPI_REG_INDIRECTWRSTARTADDR);
   619		writel(remaining, reg_base + CQSPI_REG_INDIRECTWRBYTES);
   620	
   621		/* Clear all interrupts. */
   622		writel(CQSPI_IRQ_STATUS_MASK, reg_base + CQSPI_REG_IRQSTATUS);
   623	
   624		writel(CQSPI_IRQ_MASK_WR, reg_base + CQSPI_REG_IRQMASK);
   625	
   626		reinit_completion(&cqspi->transfer_complete);
   627		writel(CQSPI_REG_INDIRECTWR_START_MASK,
   628		       reg_base + CQSPI_REG_INDIRECTWR);
   629		/*
   630		 * As per 66AK2G02 TRM SPRUHY8F section 11.15.5.3 Indirect Access
   631		 * Controller programming sequence, couple of cycles of
   632		 * QSPI_REF_CLK delay is required for the above bit to
   633		 * be internally synchronized by the QSPI module. Provide 5
   634		 * cycles of delay.
   635		 */
   636		if (cqspi->wr_delay)
   637			ndelay(cqspi->wr_delay);
   638	
   639		while (remaining > 0) {
   640			write_bytes = remaining > page_size ? page_size : remaining;
   641			write_words = write_bytes / 4;
   642			mod_bytes = write_bytes % 4;
   643			/* Write 4 bytes at a time then single bytes. */
   644			if (write_words) {
   645				iowrite32_rep(cqspi->ahb_base, txbuf, write_words);
   646				txbuf += (write_words * 4);
   647			}
   648			if (mod_bytes) {
   649				unsigned int temp = 0xFFFFFFFF;
   650	
 > 651				memcpy(&temp, txbuf, min(sizeof(temp), mod_bytes));
   652				iowrite32(temp, cqspi->ahb_base);
   653				txbuf += mod_bytes;
   654			}
   655			ret = wait_for_completion_timeout(&cqspi->transfer_complete,
   656							  msecs_to_jiffies
   657							  (CQSPI_TIMEOUT_MS));
   658			if (!ret) {
   659				dev_err(nor->dev, "Indirect write timeout\n");
   660				ret = -ETIMEDOUT;
   661				goto failwr;
   662			}
   663	
   664			remaining -= write_bytes;
   665	
   666			if (remaining > 0)
   667				reinit_completion(&cqspi->transfer_complete);
   668		}
   669	
   670		/* Check indirect done status */
   671		ret = cqspi_wait_for_bit(reg_base + CQSPI_REG_INDIRECTWR,
   672					 CQSPI_REG_INDIRECTWR_DONE_MASK, 0);
   673		if (ret) {
   674			dev_err(nor->dev,
   675				"Indirect write completion error (%i)\n", ret);
   676			goto failwr;
   677		}
   678	
   679		/* Disable interrupt. */
   680		writel(0, reg_base + CQSPI_REG_IRQMASK);
   681	
   682		/* Clear indirect completion status */
   683		writel(CQSPI_REG_INDIRECTWR_DONE_MASK, reg_base + CQSPI_REG_INDIRECTWR);
   684	
   685		cqspi_wait_idle(cqspi);
   686	
   687		return 0;
   688	
   689	failwr:
   690		/* Disable interrupt. */
   691		writel(0, reg_base + CQSPI_REG_IRQMASK);
   692	
   693		/* Cancel the indirect write */
   694		writel(CQSPI_REG_INDIRECTWR_CANCEL_MASK,
   695		       reg_base + CQSPI_REG_INDIRECTWR);
   696		return ret;
   697	}
   698	

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation
kernel test robot March 20, 2018, 10:31 a.m. UTC | #3
Hi Thor,

I love your patch! Perhaps something to improve:

[auto build test WARNING on v4.16-rc4]
[also build test WARNING on next-20180319]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url:    https://github.com/0day-ci/linux/commits/thor-thayer-linux-intel-com/mtd-spi-nor-Fix-Cadence-QSPI-page-fault-kernel-panic/20180320-133857
reproduce:
        # apt-get install sparse
        make ARCH=x86_64 allmodconfig
        make C=1 CF=-D__CHECK_ENDIAN__


sparse warnings: (new ones prefixed by >>)

>> drivers/mtd/spi-nor/cadence-quadspi.c:651:25: sparse: incompatible types in comparison expression (different type sizes)
   In file included from include/linux/clk.h:16:0,
                    from drivers/mtd/spi-nor/cadence-quadspi.c:18:
   drivers/mtd/spi-nor/cadence-quadspi.c: In function 'cqspi_indirect_write_execute':
   include/linux/kernel.h:793:16: warning: comparison of distinct pointer types lacks a cast
     (void) (&min1 == &min2);   7-                ^
   include/linux/kernel.h:802:2: note: in expansion of macro '__min'
     __min(typeof(x), typeof(y),   10-  ^~~~~
   drivers/mtd/spi-nor/cadence-quadspi.c:651:25: note: in expansion of macro 'min'
       memcpy(&temp, txbuf, min(sizeof(temp), mod_bytes));
                            ^~~

vim +651 drivers/mtd/spi-nor/cadence-quadspi.c

   606	
   607	static int cqspi_indirect_write_execute(struct spi_nor *nor, loff_t to_addr,
   608						const u8 *txbuf, const size_t n_tx)
   609	{
   610		const unsigned int page_size = nor->page_size;
   611		struct cqspi_flash_pdata *f_pdata = nor->priv;
   612		struct cqspi_st *cqspi = f_pdata->cqspi;
   613		void __iomem *reg_base = cqspi->iobase;
   614		unsigned int remaining = n_tx;
   615		unsigned int mod_bytes, write_bytes, write_words;
   616		int ret;
   617	
   618		writel(to_addr, reg_base + CQSPI_REG_INDIRECTWRSTARTADDR);
   619		writel(remaining, reg_base + CQSPI_REG_INDIRECTWRBYTES);
   620	
   621		/* Clear all interrupts. */
   622		writel(CQSPI_IRQ_STATUS_MASK, reg_base + CQSPI_REG_IRQSTATUS);
   623	
   624		writel(CQSPI_IRQ_MASK_WR, reg_base + CQSPI_REG_IRQMASK);
   625	
   626		reinit_completion(&cqspi->transfer_complete);
   627		writel(CQSPI_REG_INDIRECTWR_START_MASK,
   628		       reg_base + CQSPI_REG_INDIRECTWR);
   629		/*
   630		 * As per 66AK2G02 TRM SPRUHY8F section 11.15.5.3 Indirect Access
   631		 * Controller programming sequence, couple of cycles of
   632		 * QSPI_REF_CLK delay is required for the above bit to
   633		 * be internally synchronized by the QSPI module. Provide 5
   634		 * cycles of delay.
   635		 */
   636		if (cqspi->wr_delay)
   637			ndelay(cqspi->wr_delay);
   638	
   639		while (remaining > 0) {
   640			write_bytes = remaining > page_size ? page_size : remaining;
   641			write_words = write_bytes / 4;
   642			mod_bytes = write_bytes % 4;
   643			/* Write 4 bytes at a time then single bytes. */
   644			if (write_words) {
   645				iowrite32_rep(cqspi->ahb_base, txbuf, write_words);
   646				txbuf += (write_words * 4);
   647			}
   648			if (mod_bytes) {
   649				unsigned int temp = 0xFFFFFFFF;
   650	
 > 651				memcpy(&temp, txbuf, min(sizeof(temp), mod_bytes));
   652				iowrite32(temp, cqspi->ahb_base);
   653				txbuf += mod_bytes;
   654			}
   655			ret = wait_for_completion_timeout(&cqspi->transfer_complete,
   656							  msecs_to_jiffies
   657							  (CQSPI_TIMEOUT_MS));
   658			if (!ret) {
   659				dev_err(nor->dev, "Indirect write timeout\n");
   660				ret = -ETIMEDOUT;
   661				goto failwr;
   662			}
   663	
   664			remaining -= write_bytes;
   665	
   666			if (remaining > 0)
   667				reinit_completion(&cqspi->transfer_complete);
   668		}
   669	
   670		/* Check indirect done status */
   671		ret = cqspi_wait_for_bit(reg_base + CQSPI_REG_INDIRECTWR,
   672					 CQSPI_REG_INDIRECTWR_DONE_MASK, 0);
   673		if (ret) {
   674			dev_err(nor->dev,
   675				"Indirect write completion error (%i)\n", ret);
   676			goto failwr;
   677		}
   678	
   679		/* Disable interrupt. */
   680		writel(0, reg_base + CQSPI_REG_IRQMASK);
   681	
   682		/* Clear indirect completion status */
   683		writel(CQSPI_REG_INDIRECTWR_DONE_MASK, reg_base + CQSPI_REG_INDIRECTWR);
   684	
   685		cqspi_wait_idle(cqspi);
   686	
   687		return 0;
   688	
   689	failwr:
   690		/* Disable interrupt. */
   691		writel(0, reg_base + CQSPI_REG_IRQMASK);
   692	
   693		/* Cancel the indirect write */
   694		writel(CQSPI_REG_INDIRECTWR_CANCEL_MASK,
   695		       reg_base + CQSPI_REG_INDIRECTWR);
   696		return ret;
   697	}
   698	

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation
Thor Thayer March 21, 2018, 3:15 p.m. UTC | #4
Hi Marek,

On 03/19/2018 05:33 PM, Marek Vasut wrote:
> On 03/19/2018 07:45 PM, thor.thayer@linux.intel.com wrote:
>> From: Thor Thayer <thor.thayer@linux.intel.com>
>>
>> The current Cadence QSPI driver caused a kernel panic when loading
>> a Root Filesystem from QSPI. The problem was caused by reading more
>> bytes than needed because the QSPI operated on 4 bytes at a time.
>> <snip>
>> [    7.947754] spi_nor_read[1048]:from 0x037cad74, len 1 [bfe07fff]
>> [    7.956247] cqspi_read[910]:offset 0x58502516, buffer=bfe07fff
>> [    7.956247]
>> [    7.966046] Unable to handle kernel paging request at virtual
>> address bfe08002
>> [    7.973239] pgd = eebfc000
>> [    7.975931] [bfe08002] *pgd=2fffb811, *pte=00000000, *ppte=00000000
>> </snip>
>> Notice above how only 1 byte needed to be read but by reading 4 bytes
>> into the end of a mapped page, a unrecoverable page fault occurred.
>>
>> This patch uses a temporary buffer to hold the 4 bytes read and then
>> copies only the bytes required into the buffer. A min() function is
>> used to limit the length to prevent buffer overflows.
>>
>> Similar changes were made for the write routine.
>>
>> Signed-off-by: Thor Thayer <thor.thayer@linux.intel.com>
>> ---
>>   drivers/mtd/spi-nor/cadence-quadspi.c | 39 ++++++++++++++++++++++++++++-------
>>   1 file changed, 31 insertions(+), 8 deletions(-)
>>
>> diff --git a/drivers/mtd/spi-nor/cadence-quadspi.c b/drivers/mtd/spi-nor/cadence-quadspi.c
>> index 4b8e9183489a..b22ed982f896 100644
>> --- a/drivers/mtd/spi-nor/cadence-quadspi.c
>> +++ b/drivers/mtd/spi-nor/cadence-quadspi.c
>> @@ -501,7 +501,8 @@ static int cqspi_indirect_read_execute(struct spi_nor *nor, u8 *rxbuf,
>>   	void __iomem *reg_base = cqspi->iobase;
>>   	void __iomem *ahb_base = cqspi->ahb_base;
>>   	unsigned int remaining = n_rx;
>> -	unsigned int bytes_to_read = 0;
>> +	unsigned int mod_bytes, words_to_read, bytes_to_read = 0;
>> +	u8 *rxbuf_end = rxbuf + n_rx;
>>   	int ret = 0;
>>   
>>   	writel(from_addr, reg_base + CQSPI_REG_INDIRECTRDSTARTADDR);
>> @@ -533,9 +534,21 @@ static int cqspi_indirect_read_execute(struct spi_nor *nor, u8 *rxbuf,
>>   			bytes_to_read *= cqspi->fifo_width;
>>   			bytes_to_read = bytes_to_read > remaining ?
>>   					remaining : bytes_to_read;
>> -			ioread32_rep(ahb_base, rxbuf,
>> -				     DIV_ROUND_UP(bytes_to_read, 4));
>> -			rxbuf += bytes_to_read;
>> +			/* Read 4 byte chunks before using single byte mode */
>> +			words_to_read = bytes_to_read / 4;
>> +			mod_bytes = bytes_to_read % 4;
>> +			if (words_to_read) {
>> +				ioread32_rep(ahb_base, rxbuf, words_to_read);
>> +				rxbuf += (words_to_read * 4);
>> +			}
>> +			if (mod_bytes) {
>> +				unsigned int temp = ioread32(ahb_base);
>> +
>> +				memcpy(rxbuf, &temp, min((unsigned int)
>> +							 (rxbuf_end - rxbuf),
>> +							 mod_bytes));
>> +				rxbuf += mod_bytes;
>> +			}
> 
> Wouldn't it make more sense to read in 4 byte increments all the time
> except for the one last read instead ? This code above where you always
> check for trailing bytes can make the next read cycle do ioreaad32 into
> unaligned memory address and thus cause slowdown. (consider the example
> where the controller first reports it has 5 bytes ready, then reports it
> has 7 bytes ready. If you read 4 bytes in the first cycle, wait a bit
> and then check how much data the controller has in the next cycle, it
> will be 8 bytes, all nicely aligned).
> 
> Does it make sense ?
> 
Ahh yes, Thanks for the example - I see your point. I'll look into that 
change. Thanks.
diff mbox series

Patch

diff --git a/drivers/mtd/spi-nor/cadence-quadspi.c b/drivers/mtd/spi-nor/cadence-quadspi.c
index 4b8e9183489a..b22ed982f896 100644
--- a/drivers/mtd/spi-nor/cadence-quadspi.c
+++ b/drivers/mtd/spi-nor/cadence-quadspi.c
@@ -501,7 +501,8 @@  static int cqspi_indirect_read_execute(struct spi_nor *nor, u8 *rxbuf,
 	void __iomem *reg_base = cqspi->iobase;
 	void __iomem *ahb_base = cqspi->ahb_base;
 	unsigned int remaining = n_rx;
-	unsigned int bytes_to_read = 0;
+	unsigned int mod_bytes, words_to_read, bytes_to_read = 0;
+	u8 *rxbuf_end = rxbuf + n_rx;
 	int ret = 0;
 
 	writel(from_addr, reg_base + CQSPI_REG_INDIRECTRDSTARTADDR);
@@ -533,9 +534,21 @@  static int cqspi_indirect_read_execute(struct spi_nor *nor, u8 *rxbuf,
 			bytes_to_read *= cqspi->fifo_width;
 			bytes_to_read = bytes_to_read > remaining ?
 					remaining : bytes_to_read;
-			ioread32_rep(ahb_base, rxbuf,
-				     DIV_ROUND_UP(bytes_to_read, 4));
-			rxbuf += bytes_to_read;
+			/* Read 4 byte chunks before using single byte mode */
+			words_to_read = bytes_to_read / 4;
+			mod_bytes = bytes_to_read % 4;
+			if (words_to_read) {
+				ioread32_rep(ahb_base, rxbuf, words_to_read);
+				rxbuf += (words_to_read * 4);
+			}
+			if (mod_bytes) {
+				unsigned int temp = ioread32(ahb_base);
+
+				memcpy(rxbuf, &temp, min((unsigned int)
+							 (rxbuf_end - rxbuf),
+							 mod_bytes));
+				rxbuf += mod_bytes;
+			}
 			remaining -= bytes_to_read;
 			bytes_to_read = cqspi_get_rd_sram_level(cqspi);
 		}
@@ -599,7 +612,7 @@  static int cqspi_indirect_write_execute(struct spi_nor *nor, loff_t to_addr,
 	struct cqspi_st *cqspi = f_pdata->cqspi;
 	void __iomem *reg_base = cqspi->iobase;
 	unsigned int remaining = n_tx;
-	unsigned int write_bytes;
+	unsigned int mod_bytes, write_bytes, write_words;
 	int ret;
 
 	writel(to_addr, reg_base + CQSPI_REG_INDIRECTWRSTARTADDR);
@@ -625,9 +638,20 @@  static int cqspi_indirect_write_execute(struct spi_nor *nor, loff_t to_addr,
 
 	while (remaining > 0) {
 		write_bytes = remaining > page_size ? page_size : remaining;
-		iowrite32_rep(cqspi->ahb_base, txbuf,
-			      DIV_ROUND_UP(write_bytes, 4));
+		write_words = write_bytes / 4;
+		mod_bytes = write_bytes % 4;
+		/* Write 4 bytes at a time then single bytes. */
+		if (write_words) {
+			iowrite32_rep(cqspi->ahb_base, txbuf, write_words);
+			txbuf += (write_words * 4);
+		}
+		if (mod_bytes) {
+			unsigned int temp = 0xFFFFFFFF;
 
+			memcpy(&temp, txbuf, min(sizeof(temp), mod_bytes));
+			iowrite32(temp, cqspi->ahb_base);
+			txbuf += mod_bytes;
+		}
 		ret = wait_for_completion_timeout(&cqspi->transfer_complete,
 						  msecs_to_jiffies
 						  (CQSPI_TIMEOUT_MS));
@@ -637,7 +661,6 @@  static int cqspi_indirect_write_execute(struct spi_nor *nor, loff_t to_addr,
 			goto failwr;
 		}
 
-		txbuf += write_bytes;
 		remaining -= write_bytes;
 
 		if (remaining > 0)