diff mbox

[v1,2/6] sparc64: access tick function from variable

Message ID 1496428854-936411-3-git-send-email-pasha.tatashin@oracle.com
State Changes Requested
Delegated to: David Miller
Headers show

Commit Message

Pavel Tatashin June 2, 2017, 6:40 p.m. UTC
In time_64.c, tick functions are accessed via a pointer (tick_ops); every time
the clock is read, there is one extra load to get to the function.

This patch optimizes it by accessing the function pointer from a variable
(a struct value) instead of through an extra pointer dereference.

Current sched_clock():
sethi  %hi(0xb9b400), %g1
ldx  [ %g1 + 0x250 ], %g1  ! <tick_ops>
ldx  [ %g1 ], %g1
call  %g1
nop
sethi  %hi(0xb9b400), %g1
ldx  [ %g1 + 0x300 ], %g1  ! <timer_ticks_per_nsec_quotient>
mulx  %o0, %g1, %g1
rett  %i7 + 8
srlx  %g1, 0xa, %o0

New sched_clock():
sethi  %hi(0xb9b400), %g1
ldx  [ %g1 + 0x340 ], %g1
call  %g1
nop
sethi  %hi(0xb9b400), %g1
ldx  [ %g1 + 0x378 ], %g1
mulx  %o0, %g1, %g1
rett  %i7 + 8
srlx  %g1, 0xa, %o0

Before three loads, now two loads.

Signed-off-by: Pavel Tatashin <pasha.tatashin@oracle.com>
Reviewed-by: Shannon Nelson <shannon.nelson@oracle.com>
Reviewed-by: Steven Sistare <steven.sistare@oracle.com>
---
 arch/sparc/kernel/time_64.c |   30 ++++++++++++++++--------------
 1 files changed, 16 insertions(+), 14 deletions(-)

Comments

David Miller June 5, 2017, 2:27 a.m. UTC | #1
From: Pavel Tatashin <pasha.tatashin@oracle.com>
Date: Fri,  2 Jun 2017 14:40:50 -0400

> In timer_64.c tick functions are access via pointer (tick_ops), every time
> clock is read, there is one extra load to get to the function.
> 
> This patch optimizes it, by accessing functions pointer from value.
> 
> Current sched_clock():
> sethi  %hi(0xb9b400), %g1
> ldx  [ %g1 + 0x250 ], %g1  ! <tick_ops>
> ldx  [ %g1 ], %g1
> call  %g1
> nop
> sethi  %hi(0xb9b400), %g1
> ldx  [ %g1 + 0x300 ], %g1  ! <timer_ticks_per_nsec_quotient>
> mulx  %o0, %g1, %g1
> rett  %i7 + 8
> srlx  %g1, 0xa, %o0
> 
> New sched_clock():
> sethi  %hi(0xb9b400), %g1
> ldx  [ %g1 + 0x340 ], %g1
> call  %g1
> nop
> sethi  %hi(0xb9b400), %g1
> ldx  [ %g1 + 0x378 ], %g1
> mulx  %o0, %g1, %g1
> rett  %i7 + 8
> srlx  %g1, 0xa, %o0
> 
> Before three loads, now two loads.
> 
> Signed-off-by: Pavel Tatashin <pasha.tatashin@oracle.com>
> Reviewed-by: Shannon Nelson <shannon.nelson@oracle.com>
> Reviewed-by: Steven Sistare <steven.sistare@oracle.com>

For the tick read itself it's probably time for code patching, and
thus taking the number of loads down to one.

It's not that hard, the largest code sequence is for hummingbird which
is about 13 or 14 instructions.

So just make a tick_get_tick() assembler function that's a software
trap instruction and then 14 or 15 nops.  Patch in the correct
assembler into this function as early as possible, and then just call
it directly instead of using the ops.

We do this for so many things (TLB flushes etc.) so there are many
examples to cull the implementation from.

In the end we'll get your early boot timestamps for free, load count
wise.

Thanks.
--
To unsubscribe from this list: send the line "unsubscribe sparclinux" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Pavel Tatashin June 5, 2017, 4 a.m. UTC | #2
> For the tick read itself it's probably time for code patching, and
> thus taking the number of loads down to one.

Hi Dave,

True, we could save one more load, by patching tick_get_tick() but that 
would save us only 3-5 instructions because with the changes done in 
this patchset the extra load comes from the same cacheline as the other 
two variables: offset and quotient. So overall, we still have 3 loads as 
before, but they are much faster compared to what we have now, where 
every load is from a different cacheline.

Pasha

--
To unsubscribe from this list: send the line "unsubscribe sparclinux" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Miller June 5, 2017, 4:09 a.m. UTC | #3
From: Pasha Tatashin <pasha.tatashin@oracle.com>
Date: Mon, 5 Jun 2017 00:00:07 -0400

> True, we could save one more load, by patching tick_get_tick() but
> that would save us only 3-5 instructions because with the changes done
> in this patchset the extra load comes from the same cacheline as the
> other two variables: offset and quotient. So overall, we still have 3
> loads as before, but they are much faster compared to what we have
> now, where every load is from a different cacheline.

But if you take things a step further, you can hide the other load
costs in the time it takes the %stick register read to complete.

So, for example, if we subsequently patch also sched_clock() in
assembler it becomes:

	load	timer_ticks_per_nsec_quotient, reg1
	load	timer_ticks_offset, reg2
	rd	%stick, reg3
	...

All 3 values will be in the cpu by the time the %stick read completes.

That is the fastest possible implementation of sched_clock().
--
To unsubscribe from this list: send the line "unsubscribe sparclinux" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/arch/sparc/kernel/time_64.c b/arch/sparc/kernel/time_64.c
index 98d05de..6724bcb 100644
--- a/arch/sparc/kernel/time_64.c
+++ b/arch/sparc/kernel/time_64.c
@@ -585,6 +585,7 @@  static int __init clock_init(void)
 /* This is gets the master TICK_INT timer going. */
 static unsigned long sparc64_init_timers(void)
 {
+	struct sparc64_tick_ops *ops = NULL;
 	struct device_node *dp;
 	unsigned long freq;
 
@@ -598,16 +599,17 @@  static unsigned long sparc64_init_timers(void)
 		impl = ((ver >> 32) & 0xffff);
 		if (manuf == 0x17 && impl == 0x13) {
 			/* Hummingbird, aka Ultra-IIe */
-			tick_ops = &hbtick_operations;
+			ops = &hbtick_operations;
 			freq = of_getintprop_default(dp, "stick-frequency", 0);
 		} else {
-			tick_ops = &tick_operations;
 			freq = local_cpu_data().clock_tick;
 		}
 	} else {
-		tick_ops = &stick_operations;
+		ops = &stick_operations;
 		freq = of_getintprop_default(dp, "stick-frequency", 0);
 	}
+	if (ops)
+		memcpy(&tick_operations, ops, sizeof(struct sparc64_tick_ops));
 
 	return freq;
 }
@@ -671,12 +673,12 @@  static int __init register_sparc64_cpufreq_notifier(void)
 static int sparc64_next_event(unsigned long delta,
 			      struct clock_event_device *evt)
 {
-	return tick_ops->add_compare(delta) ? -ETIME : 0;
+	return tick_operations.add_compare(delta) ? -ETIME : 0;
 }
 
 static int sparc64_timer_shutdown(struct clock_event_device *evt)
 {
-	tick_ops->disable_irq();
+	tick_operations.disable_irq();
 	return 0;
 }
 
@@ -693,7 +695,7 @@  static int sparc64_timer_shutdown(struct clock_event_device *evt)
 void __irq_entry timer_interrupt(int irq, struct pt_regs *regs)
 {
 	struct pt_regs *old_regs = set_irq_regs(regs);
-	unsigned long tick_mask = tick_ops->softint_mask;
+	unsigned long tick_mask = tick_operations.softint_mask;
 	int cpu = smp_processor_id();
 	struct clock_event_device *evt = &per_cpu(sparc64_events, cpu);
 
@@ -728,7 +730,7 @@  void setup_sparc64_timer(void)
 			     : "=r" (pstate)
 			     : "i" (PSTATE_IE));
 
-	tick_ops->init_tick();
+	tick_operations.init_tick();
 
 	/* Restore PSTATE_IE. */
 	__asm__ __volatile__("wrpr	%0, 0x0, %%pstate"
@@ -757,9 +759,9 @@  void __delay(unsigned long loops)
 {
 	unsigned long bclock, now;
 
-	bclock = tick_ops->get_tick();
+	bclock = tick_operations.get_tick();
 	do {
-		now = tick_ops->get_tick();
+		now = tick_operations.get_tick();
 	} while ((now-bclock) < loops);
 }
 EXPORT_SYMBOL(__delay);
@@ -772,7 +774,7 @@  void udelay(unsigned long usecs)
 
 static u64 clocksource_tick_read(struct clocksource *cs)
 {
-	return tick_ops->get_tick();
+	return tick_operations.get_tick();
 }
 
 void __init time_init(void)
@@ -784,14 +786,14 @@  void __init time_init(void)
 	timer_ticks_per_nsec_quotient =
 		clocksource_hz2mult(freq, SPARC64_NSEC_PER_CYC_SHIFT);
 
-	clocksource_tick.name = tick_ops->name;
+	clocksource_tick.name = tick_operations.name;
 	clocksource_tick.read = clocksource_tick_read;
 
 	clocksource_register_hz(&clocksource_tick, freq);
 	printk("clocksource: mult[%x] shift[%d]\n",
 	       clocksource_tick.mult, clocksource_tick.shift);
 
-	sparc64_clockevent.name = tick_ops->name;
+	sparc64_clockevent.name = tick_operations.name;
 	clockevents_calc_mult_shift(&sparc64_clockevent, freq, 4);
 
 	sparc64_clockevent.max_delta_ns =
@@ -809,7 +811,7 @@  void __init time_init(void)
 
 unsigned long long sched_clock(void)
 {
-	unsigned long ticks = tick_ops->get_tick();
+	unsigned long ticks = tick_operations.get_tick();
 
 	return (ticks * timer_ticks_per_nsec_quotient)
 		>> SPARC64_NSEC_PER_CYC_SHIFT;
@@ -817,6 +819,6 @@  unsigned long long sched_clock(void)
 
 int read_current_timer(unsigned long *timer_val)
 {
-	*timer_val = tick_ops->get_tick();
+	*timer_val = tick_operations.get_tick();
 	return 0;
 }