diff mbox series

[U-Boot,RESEND,3/3] sunxi: H6: use writel_relaxed for DRAM timing register accesses

Message ID 20190210161726.5454-4-andre.przywara@arm.com
State Changes Requested
Delegated to: Jagannadha Sutradharudu Teki
Headers show
Series arm: Introduce writel/readl_relaxed accessors | expand

Commit Message

Andre Przywara Feb. 10, 2019, 4:17 p.m. UTC
The timing registers in the DRAM controller can be programmed in any
order, as they will only take effect once the controller is eventually
"activated".

Switch the MMIO writes in mctl_set_timing_lpddr3() and
mctl_bit_delay_set() over to writel_relaxed(), since we don't need the
stronger ordering guarantee of the normal writel(). We satisfy the
overall ordering requirement by ending each function with an explicit
DMB barrier.

In this case we are not interested in the performance benefit this
usually gives, but in the saved instructions, which add up over the
many writes we have in the timing setup.
Due to alignment effects this shrinks our chronically tight H6 SPL by a
whopping 2KB, which brings it into the same region as the other
AArch64 Allwinner SPL builds.

Signed-off-by: Andre Przywara <andre.przywara@arm.com>
---
 arch/arm/mach-sunxi/dram_sun50i_h6.c | 79 +++++++++++++++++++-----------------
 1 file changed, 42 insertions(+), 37 deletions(-)

Comments

Alexander Graf Feb. 20, 2019, 12:25 p.m. UTC | #1
On 02/10/2019 05:17 PM, Andre Przywara wrote:
> The timing registers in the DRAM controller can be programmed in any
> order, as they will only take effect once the controller is eventually
> "activated".
>
> Switch the MMIO writes in mctl_set_timing_lpddr3() over to use
> writel_relaxed(), since we don't need the stronger guarantee of the
> normal writel(). We satisfy the overall ordering requirement by ending
> the function with an explicit DMB barrier.
>
> In this case we are not interested in the performance benefit this
> usually gives, but in the saved instructions, which sum up for the many
> writes we have in the timing setup.
> Due to alignment effects this shrinks our chronically tight H6 SPL by a
> whopping 2KB, which brings it in the same region as for the other
> AArch64 Allwinner SPL builds.
>
> Signed-off-by: Andre Przywara <andre.przywara@arm.com>

If you say it still works, it sounds like a pretty nifty optimization :).

Reviewed-by: Alexander Graf <agraf@suse.de>


Alex
diff mbox series

Patch

diff --git a/arch/arm/mach-sunxi/dram_sun50i_h6.c b/arch/arm/mach-sunxi/dram_sun50i_h6.c
index 5da90a2835..84a33a63d6 100644
--- a/arch/arm/mach-sunxi/dram_sun50i_h6.c
+++ b/arch/arm/mach-sunxi/dram_sun50i_h6.c
@@ -241,51 +241,55 @@  static void mctl_set_timing_lpddr3(struct dram_para *para)
 	memcpy(mctl_phy->mr, mr_lpddr3, sizeof(mr_lpddr3));
 
 	/* set DRAM timing */
-	writel((twtp << 24) | (tfaw << 16) | (trasmax << 8) | tras,
-	       &mctl_ctl->dramtmg[0]);
-	writel((txp << 16) | (trtp << 8) | trc, &mctl_ctl->dramtmg[1]);
-	writel((tcwl << 24) | (tcl << 16) | (trd2wr << 8) | twr2rd,
-	       &mctl_ctl->dramtmg[2]);
-	writel((tmrw << 20) | (tmrd << 12) | tmod, &mctl_ctl->dramtmg[3]);
-	writel((trcd << 24) | (tccd << 16) | (trrd << 8) | trp,
-	       &mctl_ctl->dramtmg[4]);
-	writel((tcksrx << 24) | (tcksre << 16) | (tckesr << 8) | tcke,
-	       &mctl_ctl->dramtmg[5]);
+	writel_relaxed((twtp << 24) | (tfaw << 16) | (trasmax << 8) | tras,
+		       &mctl_ctl->dramtmg[0]);
+	writel_relaxed((txp << 16) | (trtp << 8) | trc, &mctl_ctl->dramtmg[1]);
+	writel_relaxed((tcwl << 24) | (tcl << 16) | (trd2wr << 8) | twr2rd,
+		       &mctl_ctl->dramtmg[2]);
+	writel_relaxed((tmrw << 20) | (tmrd << 12) | tmod,
+		       &mctl_ctl->dramtmg[3]);
+	writel_relaxed((trcd << 24) | (tccd << 16) | (trrd << 8) | trp,
+		       &mctl_ctl->dramtmg[4]);
+	writel_relaxed((tcksrx << 24) | (tcksre << 16) | (tckesr << 8) | tcke,
+		       &mctl_ctl->dramtmg[5]);
 	/* Value suggested by ZynqMP manual and used by libdram */
-	writel((txp + 2) | 0x02020000, &mctl_ctl->dramtmg[6]);
-	writel((txsfast << 24) | (txsabort << 16) | (txsdll << 8) | txs,
-	       &mctl_ctl->dramtmg[8]);
-	writel(txsr, &mctl_ctl->dramtmg[14]);
+	writel_relaxed((txp + 2) | 0x02020000, &mctl_ctl->dramtmg[6]);
+	writel_relaxed((txsfast << 24) | (txsabort << 16) | (txsdll << 8) | txs,
+		       &mctl_ctl->dramtmg[8]);
+	writel_relaxed(txsr, &mctl_ctl->dramtmg[14]);
 
 	clrsetbits_le32(&mctl_ctl->init[0], (3 << 30), (1 << 30));
-	writel(0, &mctl_ctl->dfimisc);
+	writel_relaxed(0, &mctl_ctl->dfimisc);
 	clrsetbits_le32(&mctl_ctl->rankctl, 0xff0, 0x660);
 
 	/*
 	 * Set timing registers of the PHY.
 	 * Note: the PHY is clocked 2x from the DRAM frequency.
 	 */
-	writel((trrd << 25) | (tras << 17) | (trp << 9) | (trtp << 1),
+	writel_relaxed((trrd << 25) | (tras << 17) | (trp << 9) | (trtp << 1),
 	       &mctl_phy->dtpr[0]);
-	writel((tfaw << 17) | 0x28000400 | (tmrd << 1), &mctl_phy->dtpr[1]);
-	writel(((txs << 6) - 1) | (tcke << 17), &mctl_phy->dtpr[2]);
-	writel(((txsdll << 22) - (0x1 << 16)) | twtr_sa | (tcksrea << 8),
-	       &mctl_phy->dtpr[3]);
-	writel((txp << 1) | (trfc << 17) | 0x800, &mctl_phy->dtpr[4]);
-	writel((trc << 17) | (trcd << 9) | (twtr << 1), &mctl_phy->dtpr[5]);
-	writel(0x0505, &mctl_phy->dtpr[6]);
+	writel_relaxed((tfaw << 17) | 0x28000400 | (tmrd << 1),
+		       &mctl_phy->dtpr[1]);
+	writel_relaxed(((txs << 6) - 1) | (tcke << 17), &mctl_phy->dtpr[2]);
+	writel_relaxed(((txsdll << 22) - (0x1 << 16)) | twtr_sa |
+		       (tcksrea << 8), &mctl_phy->dtpr[3]);
+	writel_relaxed((txp << 1) | (trfc << 17) | 0x800, &mctl_phy->dtpr[4]);
+	writel_relaxed((trc << 17) | (trcd << 9) | (twtr << 1),
+		       &mctl_phy->dtpr[5]);
+	writel_relaxed(0x0505, &mctl_phy->dtpr[6]);
 
 	/* Configure DFI timing */
-	writel(tcl | 0x2000200 | (t_rdata_en << 16) | 0x808000,
-	       &mctl_ctl->dfitmg0);
-	writel(0x040201, &mctl_ctl->dfitmg1);
+	writel_relaxed(tcl | 0x2000200 | (t_rdata_en << 16) | 0x808000,
+		       &mctl_ctl->dfitmg0);
+	writel_relaxed(0x040201, &mctl_ctl->dfitmg1);
 
 	/* Configure PHY timing */
-	writel(tdinit0 | (tdinit1 << 20), &mctl_phy->ptr[3]);
-	writel(tdinit2 | (tdinit3 << 18), &mctl_phy->ptr[4]);
+	writel_relaxed(tdinit0 | (tdinit1 << 20), &mctl_phy->ptr[3]);
+	writel_relaxed(tdinit2 | (tdinit3 << 18), &mctl_phy->ptr[4]);
 
 	/* set refresh timing */
-	writel((trefi << 16) | trfc, &mctl_ctl->rfshtmg);
+	writel_relaxed((trefi << 16) | trfc, &mctl_ctl->rfshtmg);
+	DMB;
 }
 
 static void mctl_sys_init(struct dram_para *para)
@@ -476,17 +480,17 @@  static void mctl_bit_delay_set(struct dram_para *para)
 		val = readl(&mctl_phy->dx[i].bdlr0);
 		for (j = 0; j < 4; j++)
 			val += para->dx_write_delays[i][j] << (j * 8);
-		writel(val, &mctl_phy->dx[i].bdlr0);
+		writel_relaxed(val, &mctl_phy->dx[i].bdlr0);
 
 		val = readl(&mctl_phy->dx[i].bdlr1);
 		for (j = 0; j < 4; j++)
 			val += para->dx_write_delays[i][j + 4] << (j * 8);
-		writel(val, &mctl_phy->dx[i].bdlr1);
+		writel_relaxed(val, &mctl_phy->dx[i].bdlr1);
 
 		val = readl(&mctl_phy->dx[i].bdlr2);
 		for (j = 0; j < 4; j++)
 			val += para->dx_write_delays[i][j + 8] << (j * 8);
-		writel(val, &mctl_phy->dx[i].bdlr2);
+		writel_relaxed(val, &mctl_phy->dx[i].bdlr2);
 	}
 	clrbits_le32(&mctl_phy->pgcr[0], BIT(26));
 
@@ -494,22 +498,22 @@  static void mctl_bit_delay_set(struct dram_para *para)
 		val = readl(&mctl_phy->dx[i].bdlr3);
 		for (j = 0; j < 4; j++)
 			val += para->dx_read_delays[i][j] << (j * 8);
-		writel(val, &mctl_phy->dx[i].bdlr3);
+		writel_relaxed(val, &mctl_phy->dx[i].bdlr3);
 
 		val = readl(&mctl_phy->dx[i].bdlr4);
 		for (j = 0; j < 4; j++)
 			val += para->dx_read_delays[i][j + 4] << (j * 8);
-		writel(val, &mctl_phy->dx[i].bdlr4);
+		writel_relaxed(val, &mctl_phy->dx[i].bdlr4);
 
 		val = readl(&mctl_phy->dx[i].bdlr5);
 		for (j = 0; j < 4; j++)
 			val += para->dx_read_delays[i][j + 8] << (j * 8);
-		writel(val, &mctl_phy->dx[i].bdlr5);
+		writel_relaxed(val, &mctl_phy->dx[i].bdlr5);
 
 		val = readl(&mctl_phy->dx[i].bdlr6);
 		val += (para->dx_read_delays[i][12] << 8) |
 		       (para->dx_read_delays[i][13] << 16);
-		writel(val, &mctl_phy->dx[i].bdlr6);
+		writel_relaxed(val, &mctl_phy->dx[i].bdlr6);
 	}
 	setbits_le32(&mctl_phy->pgcr[0], BIT(26));
 	udelay(1);
@@ -517,8 +521,9 @@  static void mctl_bit_delay_set(struct dram_para *para)
 	for (i = 1; i < 14; i++) {
 		val = readl(&mctl_phy->acbdlr[i]);
 		val += 0x0a0a0a0a;
-		writel(val, &mctl_phy->acbdlr[i]);
+		writel_relaxed(val, &mctl_phy->acbdlr[i]);
 	}
+	DMB;
 }
 
 static void mctl_channel_init(struct dram_para *para)