diff mbox series

[v2,2/2] armv8: generic_timer: Use event stream for udelay

Message ID 20240501081633.120913-3-peter.hoyes@arm.com
State Accepted
Commit ebc84d7b60c1ed3398e9f600fe3dc8406500bd35
Delegated to: Tom Rini
Headers show
Series Optimize udelay for aarch64 boards | expand

Commit Message

Peter Hoyes May 1, 2024, 8:16 a.m. UTC
From: Peter Hoyes <Peter.Hoyes@arm.com>

Polling cntpct_el0 in a tight loop for delays is inefficient.
This is particularly apparent on Arm FVPs, which do not simulate
real time, meaning that a 1s sleep can take a couple of orders
of magnitude longer to execute in wall time.

If running at EL2 or above (where CNTHCTL_EL2 is available), enable
the cntpct_el0 event stream temporarily and use wfe to implement
the delay more efficiently. The event period is chosen as a
trade-off between efficiency and the fact that Arm FVPs do not
typically simulate real time.

This is only implemented for Armv8 boards, where an architectural
timer exists, and only enabled by default for the ARCH_VEXPRESS64
board family.

Signed-off-by: Peter Hoyes <Peter.Hoyes@arm.com>
---
 arch/arm/cpu/armv8/Kconfig         |  8 ++++++++
 arch/arm/cpu/armv8/generic_timer.c | 27 +++++++++++++++++++++++++++
 arch/arm/include/asm/system.h      |  6 ++++--
 3 files changed, 39 insertions(+), 2 deletions(-)

Comments

Andre Przywara May 7, 2024, 10:26 a.m. UTC | #1
On Wed,  1 May 2024 09:16:33 +0100
Peter Hoyes <peter.hoyes@arm.com> wrote:

Hi Peter,

thanks for the change!

> From: Peter Hoyes <Peter.Hoyes@arm.com>
> 
> Polling cntpct_el0 in a tight loop for delays is inefficient.
> This is particularly apparent on Arm FVPs, which do not simulate
> real time, meaning that a 1s sleep can take a couple of orders
> of magnitude longer to execute in wall time.
> 
> If running at EL2 or above (where CNTHCTL_EL2 is available), enable
> the cntpct_el0 event stream temporarily and use wfe to implement
> the delay more efficiently. The event period is chosen as a
> trade-off between efficiency and the fact that Arm FVPs do not
> typically simulate real time.
> 
> This is only implemented for Armv8 boards, where an architectural
> timer exists, and only enabled by default for the ARCH_VEXPRESS64
> board family.

I think total_compute would benefit from this change as well, but this
needs some testing and can easily be a separate patch.

> Signed-off-by: Peter Hoyes <Peter.Hoyes@arm.com>

Reviewed-by: Andre Przywara <andre.przywara@arm.com>

Cheers,
Andre

> ---
>  arch/arm/cpu/armv8/Kconfig         |  8 ++++++++
>  arch/arm/cpu/armv8/generic_timer.c | 27 +++++++++++++++++++++++++++
>  arch/arm/include/asm/system.h      |  6 ++++--
>  3 files changed, 39 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/arm/cpu/armv8/Kconfig b/arch/arm/cpu/armv8/Kconfig
> index 9f0fb369f7..199335cd60 100644
> --- a/arch/arm/cpu/armv8/Kconfig
> +++ b/arch/arm/cpu/armv8/Kconfig
> @@ -191,6 +191,14 @@ config ARMV8_EA_EL3_FIRST
>  	  Exception handling at all exception levels for External Abort
> and SError interrupt exception are taken in EL3.
>  
> +config ARMV8_UDELAY_EVENT_STREAM
> +	bool "Use the event stream for udelay"
> +	default y if ARCH_VEXPRESS64
> +	help
> +	  Use the event stream provided by the AArch64 architectural
> timer for
> +	  delays. This is more efficient than the default polling
> +	  implementation.
> +
>  menuconfig ARMV8_CRYPTO
>  	bool "ARM64 Accelerated Cryptographic Algorithms"
>  
> diff --git a/arch/arm/cpu/armv8/generic_timer.c
> b/arch/arm/cpu/armv8/generic_timer.c index 8f83372cbc..e18b5c8187 100644
> --- a/arch/arm/cpu/armv8/generic_timer.c
> +++ b/arch/arm/cpu/armv8/generic_timer.c
> @@ -115,3 +115,30 @@ ulong timer_get_boot_us(void)
>  
>  	return val / get_tbclk();
>  }
> +
> +#if CONFIG_IS_ENABLED(ARMV8_UDELAY_EVENT_STREAM)
> +void __udelay(unsigned long usec)
> +{
> +	u64 target = get_ticks() + usec_to_tick(usec);
> +
> +	/* At EL2 or above, use the event stream to avoid polling
> CNTPCT_EL0 so often */
> +	if (current_el() >= 2) {
> +		u32 cnthctl_val;
> +		const u8 event_period = 0x7;
> +
> +		asm volatile("mrs %0, cnthctl_el2" : "=r"
> (cnthctl_val));
> +		asm volatile("msr cnthctl_el2, %0" : : "r"
> +			(cnthctl_val | CNTHCTL_EL2_EVNT_EN |
> CNTHCTL_EL2_EVNT_I(event_period))); +
> +		while (get_ticks() + (1ULL << event_period) <= target)
> +			wfe();
> +
> +		/* Reset the event stream */
> +		asm volatile("msr cnthctl_el2, %0" : : "r"
> (cnthctl_val));
> +	}
> +
> +	/* Fall back to polling CNTPCT_EL0 */
> +	while (get_ticks() <= target)
> +		;
> +}
> +#endif
> diff --git a/arch/arm/include/asm/system.h
> b/arch/arm/include/asm/system.h index 51123c2968..7e30cac32a 100644
> --- a/arch/arm/include/asm/system.h
> +++ b/arch/arm/include/asm/system.h
> @@ -69,8 +69,10 @@
>  /*
>   * CNTHCTL_EL2 bits definitions
>   */
> -#define CNTHCTL_EL2_EL1PCEN_EN	(1 << 1)  /* Physical timer regs
> accessible   */ -#define CNTHCTL_EL2_EL1PCTEN_EN	(1 << 0)  /*
> Physical counter accessible      */ +#define CNTHCTL_EL2_EVNT_EN
> BIT(2)	     /* Enable the event stream       */ +#define
> CNTHCTL_EL2_EVNT_I(val)	((val) << 4) /* Event stream trigger bits
>     */ +#define CNTHCTL_EL2_EL1PCEN_EN	(1 << 1)     /* Physical
> timer regs accessible */ +#define CNTHCTL_EL2_EL1PCTEN_EN	(1 <<
> 0)     /* Physical counter accessible   */ /*
>   * HCR_EL2 bits definitions
Tom Rini May 14, 2024, 4:15 p.m. UTC | #2
On Wed, May 01, 2024 at 09:16:33AM +0100, Peter Hoyes wrote:

> From: Peter Hoyes <Peter.Hoyes@arm.com>
> 
> Polling cntpct_el0 in a tight loop for delays is inefficient.
> This is particularly apparent on Arm FVPs, which do not simulate
> real time, meaning that a 1s sleep can take a couple of orders
> of magnitude longer to execute in wall time.
> 
> If running at EL2 or above (where CNTHCTL_EL2 is available), enable
> the cntpct_el0 event stream temporarily and use wfe to implement
> the delay more efficiently. The event period is chosen as a
> trade-off between efficiency and the fact that Arm FVPs do not
> typically simulate real time.
> 
> This is only implemented for Armv8 boards, where an architectural
> timer exists, and only enabled by default for the ARCH_VEXPRESS64
> board family.
> 
> Signed-off-by: Peter Hoyes <Peter.Hoyes@arm.com>
> Reviewed-by: Andre Przywara <andre.przywara@arm.com>

Applied to u-boot/next, thanks!
diff mbox series

Patch

diff --git a/arch/arm/cpu/armv8/Kconfig b/arch/arm/cpu/armv8/Kconfig
index 9f0fb369f7..199335cd60 100644
--- a/arch/arm/cpu/armv8/Kconfig
+++ b/arch/arm/cpu/armv8/Kconfig
@@ -191,6 +191,14 @@  config ARMV8_EA_EL3_FIRST
 	  Exception handling at all exception levels for External Abort and
 	  SError interrupt exception are taken in EL3.
 
+config ARMV8_UDELAY_EVENT_STREAM
+	bool "Use the event stream for udelay"
+	default y if ARCH_VEXPRESS64
+	help
+	  Use the event stream provided by the AArch64 architectural timer for
+	  delays. This is more efficient than the default polling
+	  implementation.
+
 menuconfig ARMV8_CRYPTO
 	bool "ARM64 Accelerated Cryptographic Algorithms"
 
diff --git a/arch/arm/cpu/armv8/generic_timer.c b/arch/arm/cpu/armv8/generic_timer.c
index 8f83372cbc..e18b5c8187 100644
--- a/arch/arm/cpu/armv8/generic_timer.c
+++ b/arch/arm/cpu/armv8/generic_timer.c
@@ -115,3 +115,30 @@  ulong timer_get_boot_us(void)
 
 	return val / get_tbclk();
 }
+
+#if CONFIG_IS_ENABLED(ARMV8_UDELAY_EVENT_STREAM)
+void __udelay(unsigned long usec)
+{
+	u64 target = get_ticks() + usec_to_tick(usec);
+
+	/* At EL2 or above, use the event stream to avoid polling CNTPCT_EL0 so often */
+	if (current_el() >= 2) {
+		u32 cnthctl_val;
+		const u8 event_period = 0x7;
+
+		asm volatile("mrs %0, cnthctl_el2" : "=r" (cnthctl_val));
+		asm volatile("msr cnthctl_el2, %0" : : "r"
+			(cnthctl_val | CNTHCTL_EL2_EVNT_EN | CNTHCTL_EL2_EVNT_I(event_period)));
+
+		while (get_ticks() + (1ULL << event_period) <= target)
+			wfe();
+
+		/* Reset the event stream */
+		asm volatile("msr cnthctl_el2, %0" : : "r" (cnthctl_val));
+	}
+
+	/* Fall back to polling CNTPCT_EL0 */
+	while (get_ticks() <= target)
+		;
+}
+#endif
diff --git a/arch/arm/include/asm/system.h b/arch/arm/include/asm/system.h
index 51123c2968..7e30cac32a 100644
--- a/arch/arm/include/asm/system.h
+++ b/arch/arm/include/asm/system.h
@@ -69,8 +69,10 @@ 
 /*
  * CNTHCTL_EL2 bits definitions
  */
-#define CNTHCTL_EL2_EL1PCEN_EN	(1 << 1)  /* Physical timer regs accessible   */
-#define CNTHCTL_EL2_EL1PCTEN_EN	(1 << 0)  /* Physical counter accessible      */
+#define CNTHCTL_EL2_EVNT_EN	BIT(2)	     /* Enable the event stream       */
+#define CNTHCTL_EL2_EVNT_I(val)	((val) << 4) /* Event stream trigger bits     */
+#define CNTHCTL_EL2_EL1PCEN_EN	(1 << 1)     /* Physical timer regs accessible */
+#define CNTHCTL_EL2_EL1PCTEN_EN	(1 << 0)     /* Physical counter accessible   */
 
 /*
  * HCR_EL2 bits definitions