diff mbox

powernv: Add OPAL tracepoints

Message ID 20140703172050.495b3f72@kryten (mailing list archive)
State Accepted
Headers show

Commit Message

Anton Blanchard July 3, 2014, 7:20 a.m. UTC
Knowing how long we spend in firmware calls is an important part of
minimising OS jitter.

This patch adds tracepoints to each OPAL call. If tracepoints are
enabled we branch out to a common routine that calls an entry and exit
tracepoint.

This allows us to write tools that monitor the frequency and duration
of OPAL calls, eg:

name                  count  total(ms)  min(ms)  max(ms)  avg(ms)  period(ms)
OPAL_HANDLE_INTERRUPT     5      0.199    0.037    0.042    0.040   12547.545
OPAL_POLL_EVENTS        204      2.590    0.012    0.036    0.013    2264.899
OPAL_PCI_MSI_EOI       2830      3.066    0.001    0.005    0.001      81.166

We use jump labels if configured, which means we only add a single
nop instruction to every OPAL call when the tracepoints are disabled.

Signed-off-by: Anton Blanchard <anton@samba.org>
---

Comments

Paul E. McKenney July 9, 2014, 7:42 p.m. UTC | #1
On Thu, Jul 03, 2014 at 05:20:50PM +1000, Anton Blanchard wrote:
> Knowing how long we spend in firmware calls is an important part of
> minimising OS jitter.
> 
> This patch adds tracepoints to each OPAL call. If tracepoints are
> enabled we branch out to a common routine that calls an entry and exit
> tracepoint.
> 
> This allows us to write tools that monitor the frequency and duration
> of OPAL calls, eg:
> 
> name                  count  total(ms)  min(ms)  max(ms)  avg(ms)  period(ms)
> OPAL_HANDLE_INTERRUPT     5      0.199    0.037    0.042    0.040   12547.545
> OPAL_POLL_EVENTS        204      2.590    0.012    0.036    0.013    2264.899
> OPAL_PCI_MSI_EOI       2830      3.066    0.001    0.005    0.001      81.166
> 
> We use jump labels if configured, which means we only add a single
> nop instruction to every OPAL call when the tracepoints are disabled.
> 
> Signed-off-by: Anton Blanchard <anton@samba.org>

That is what I call invoking tracepoints the hard way -- from assembly!
Just one question -- can these tracepoints be invoked from the idle
loop?  If so, you need to use the _rcuidle suffix, for example, as
in trace_opal_entry_rcuidle().  If not:

Acked-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>

> ---
> 
> Index: b/arch/powerpc/include/asm/trace.h
> ===================================================================
> --- a/arch/powerpc/include/asm/trace.h
> +++ b/arch/powerpc/include/asm/trace.h
> @@ -99,6 +99,51 @@ TRACE_EVENT_FN(hcall_exit,
>  );
>  #endif
> 
> +#ifdef CONFIG_PPC_POWERNV
> +extern void opal_tracepoint_regfunc(void);
> +extern void opal_tracepoint_unregfunc(void);
> +
> +TRACE_EVENT_FN(opal_entry,
> +
> +	TP_PROTO(unsigned long opcode, unsigned long *args),
> +
> +	TP_ARGS(opcode, args),
> +
> +	TP_STRUCT__entry(
> +		__field(unsigned long, opcode)
> +	),
> +
> +	TP_fast_assign(
> +		__entry->opcode = opcode;
> +	),
> +
> +	TP_printk("opcode=%lu", __entry->opcode),
> +
> +	opal_tracepoint_regfunc, opal_tracepoint_unregfunc
> +);
> +
> +TRACE_EVENT_FN(opal_exit,
> +
> +	TP_PROTO(unsigned long opcode, unsigned long retval),
> +
> +	TP_ARGS(opcode, retval),
> +
> +	TP_STRUCT__entry(
> +		__field(unsigned long, opcode)
> +		__field(unsigned long, retval)
> +	),
> +
> +	TP_fast_assign(
> +		__entry->opcode = opcode;
> +		__entry->retval = retval;
> +	),
> +
> +	TP_printk("opcode=%lu retval=%lu", __entry->opcode, __entry->retval),
> +
> +	opal_tracepoint_regfunc, opal_tracepoint_unregfunc
> +);
> +#endif
> +
>  #endif /* _TRACE_POWERPC_H */
> 
>  #undef TRACE_INCLUDE_PATH
> Index: b/arch/powerpc/platforms/powernv/Makefile
> ===================================================================
> --- a/arch/powerpc/platforms/powernv/Makefile
> +++ b/arch/powerpc/platforms/powernv/Makefile
> @@ -8,3 +8,4 @@ obj-$(CONFIG_PCI)	+= pci.o pci-p5ioc2.o
>  obj-$(CONFIG_EEH)	+= eeh-ioda.o eeh-powernv.o
>  obj-$(CONFIG_PPC_SCOM)	+= opal-xscom.o
>  obj-$(CONFIG_MEMORY_FAILURE)	+= opal-memory-errors.o
> +obj-$(CONFIG_TRACEPOINTS)	+= opal-tracepoints.o
> Index: b/arch/powerpc/platforms/powernv/opal-wrappers.S
> ===================================================================
> --- a/arch/powerpc/platforms/powernv/opal-wrappers.S
> +++ b/arch/powerpc/platforms/powernv/opal-wrappers.S
> @@ -13,30 +13,69 @@
>  #include <asm/hvcall.h>
>  #include <asm/asm-offsets.h>
>  #include <asm/opal.h>
> +#include <asm/jump_label.h>
> +
> +	.section	".text"
> +
> +#ifdef CONFIG_TRACEPOINTS
> +#ifdef CONFIG_JUMP_LABEL
> +#define OPAL_BRANCH(LABEL)					\
> +	ARCH_STATIC_BRANCH(LABEL, opal_tracepoint_key)
> +#else
> +
> +	.section	".toc","aw"
> +
> +	.globl opal_tracepoint_refcount
> +opal_tracepoint_refcount:
> +	.llong	0
> +
> +	.section	".text"
> +
> +/*
> + * We branch around this in early init by using an unconditional cpu
> + * feature.
> + */
> +#define OPAL_BRANCH(LABEL)					\
> +BEGIN_FTR_SECTION;						\
> +	b	1f;						\
> +END_FTR_SECTION(0, 1);						\
> +	ld	r12,opal_tracepoint_refcount@toc(r2);		\
> +	std	r12,32(r1);					\
> +	cmpdi	r12,0;						\
> +	bne-	LABEL;						\
> +1:
> +
> +#endif
> +
> +#else
> +#define OPAL_BRANCH(LABEL)
> +#endif
> 
>  /* TODO:
>   *
>   * - Trace irqs in/off (needs saving/restoring all args, argh...)
>   * - Get r11 feed up by Dave so I can have better register usage
>   */
> +
>  #define OPAL_CALL(name, token)		\
>   _GLOBAL(name);				\
>  	mflr	r0;			\
> -	mfcr	r12;			\
>  	std	r0,16(r1);		\
> +	li	r0,token;		\
> +	OPAL_BRANCH(opal_tracepoint_entry) \
> +	mfcr	r12;			\
>  	stw	r12,8(r1);		\
>  	std	r1,PACAR1(r13);		\
> -	li	r0,0;			\
> +	li	r11,0;			\
>  	mfmsr	r12;			\
> -	ori	r0,r0,MSR_EE;		\
> +	ori	r11,r11,MSR_EE;		\
>  	std	r12,PACASAVEDMSR(r13);	\
> -	andc	r12,r12,r0;		\
> +	andc	r12,r12,r11;		\
>  	mtmsrd	r12,1;			\
> -	LOAD_REG_ADDR(r0,opal_return);	\
> -	mtlr	r0;			\
> -	li	r0,MSR_DR|MSR_IR|MSR_LE;\
> -	andc	r12,r12,r0;		\
> -	li	r0,token;		\
> +	LOAD_REG_ADDR(r11,opal_return);	\
> +	mtlr	r11;			\
> +	li	r11,MSR_DR|MSR_IR|MSR_LE;\
> +	andc	r12,r12,r11;		\
>  	mtspr	SPRN_HSRR1,r12;		\
>  	LOAD_REG_ADDR(r11,opal);	\
>  	ld	r12,8(r11);		\
> @@ -61,6 +100,64 @@ opal_return:
>  	mtcr	r4;
>  	rfid
> 
> +#ifdef CONFIG_TRACEPOINTS
> +opal_tracepoint_entry:
> +	stdu	r1,-STACKFRAMESIZE(r1)
> +	std	r0,STK_REG(R23)(r1)
> +	std	r3,STK_REG(R24)(r1)
> +	std	r4,STK_REG(R25)(r1)
> +	std	r5,STK_REG(R26)(r1)
> +	std	r6,STK_REG(R27)(r1)
> +	std	r7,STK_REG(R28)(r1)
> +	std	r8,STK_REG(R29)(r1)
> +	std	r9,STK_REG(R30)(r1)
> +	std	r10,STK_REG(R31)(r1)
> +	mr	r3,r0
> +	addi	r4,r1,STK_REG(R24)
> +	bl	__trace_opal_entry
> +	ld	r0,STK_REG(R23)(r1)
> +	ld	r3,STK_REG(R24)(r1)
> +	ld	r4,STK_REG(R25)(r1)
> +	ld	r5,STK_REG(R26)(r1)
> +	ld	r6,STK_REG(R27)(r1)
> +	ld	r7,STK_REG(R28)(r1)
> +	ld	r8,STK_REG(R29)(r1)
> +	ld	r9,STK_REG(R30)(r1)
> +	ld	r10,STK_REG(R31)(r1)
> +	LOAD_REG_ADDR(r11,opal_tracepoint_return)
> +	mfcr	r12
> +	std	r11,16(r1)
> +	stw	r12,8(r1)
> +	std	r1,PACAR1(r13)
> +	li	r11,0
> +	mfmsr	r12
> +	ori	r11,r11,MSR_EE
> +	std	r12,PACASAVEDMSR(r13)
> +	andc	r12,r12,r11
> +	mtmsrd	r12,1
> +	LOAD_REG_ADDR(r11,opal_return)
> +	mtlr	r11
> +	li	r11,MSR_DR|MSR_IR|MSR_LE
> +	andc	r12,r12,r11
> +	mtspr	SPRN_HSRR1,r12
> +	LOAD_REG_ADDR(r11,opal)
> +	ld	r12,8(r11)
> +	ld	r2,0(r11)
> +	mtspr	SPRN_HSRR0,r12
> +	hrfid
> +
> +opal_tracepoint_return:
> +	std	r3,STK_REG(R31)(r1)
> +	mr	r4,r3
> +	ld	r0,STK_REG(R23)(r1)
> +	bl	__trace_opal_exit
> +	ld	r3,STK_REG(R31)(r1)
> +	addi	r1,r1,STACKFRAMESIZE
> +	ld	r0,16(r1)
> +	mtlr	r0
> +	blr
> +#endif
> +
>  OPAL_CALL(opal_invalid_call,			OPAL_INVALID_CALL);
>  OPAL_CALL(opal_console_write,			OPAL_CONSOLE_WRITE);
>  OPAL_CALL(opal_console_read,			OPAL_CONSOLE_READ);
> Index: b/arch/powerpc/platforms/powernv/opal-tracepoints.c
> ===================================================================
> --- /dev/null
> +++ b/arch/powerpc/platforms/powernv/opal-tracepoints.c
> @@ -0,0 +1,84 @@
> +#include <linux/percpu.h>
> +#include <linux/jump_label.h>
> +#include <asm/trace.h>
> +
> +#ifdef CONFIG_JUMP_LABEL
> +struct static_key opal_tracepoint_key = STATIC_KEY_INIT;
> +
> +void opal_tracepoint_regfunc(void)
> +{
> +	static_key_slow_inc(&opal_tracepoint_key);
> +}
> +
> +void opal_tracepoint_unregfunc(void)
> +{
> +	static_key_slow_dec(&opal_tracepoint_key);
> +}
> +#else
> +/*
> + * We optimise OPAL calls by placing opal_tracepoint_refcount
> + * directly in the TOC so we can check if the opal tracepoints are
> + * enabled via a single load.
> + */
> +
> +/* NB: reg/unreg are called while guarded with the tracepoints_mutex */
> +extern long opal_tracepoint_refcount;
> +
> +void opal_tracepoint_regfunc(void)
> +{
> +	opal_tracepoint_refcount++;
> +}
> +
> +void opal_tracepoint_unregfunc(void)
> +{
> +	opal_tracepoint_refcount--;
> +}
> +#endif
> +
> +/*
> + * Since the tracing code might execute OPAL calls we need to guard against
> + * recursion.
> + */
> +static DEFINE_PER_CPU(unsigned int, opal_trace_depth);
> +
> +void __trace_opal_entry(unsigned long opcode, unsigned long *args)
> +{
> +	unsigned long flags;
> +	unsigned int *depth;
> +
> +	local_irq_save(flags);
> +
> +	depth = &__get_cpu_var(opal_trace_depth);
> +
> +	if (*depth)
> +		goto out;
> +
> +	(*depth)++;
> +	preempt_disable();
> +	trace_opal_entry(opcode, args);
> +	(*depth)--;
> +
> +out:
> +	local_irq_restore(flags);
> +}
> +
> +void __trace_opal_exit(long opcode, unsigned long retval)
> +{
> +	unsigned long flags;
> +	unsigned int *depth;
> +
> +	local_irq_save(flags);
> +
> +	depth = &__get_cpu_var(opal_trace_depth);
> +
> +	if (*depth)
> +		goto out;
> +
> +	(*depth)++;
> +	trace_opal_exit(opcode, retval);
> +	preempt_enable();
> +	(*depth)--;
> +
> +out:
> +	local_irq_restore(flags);
> +}
>
diff mbox

Patch

Index: b/arch/powerpc/include/asm/trace.h
===================================================================
--- a/arch/powerpc/include/asm/trace.h
+++ b/arch/powerpc/include/asm/trace.h
@@ -99,6 +99,51 @@  TRACE_EVENT_FN(hcall_exit,
 );
 #endif
 
+#ifdef CONFIG_PPC_POWERNV
+extern void opal_tracepoint_regfunc(void);
+extern void opal_tracepoint_unregfunc(void);
+
+TRACE_EVENT_FN(opal_entry,
+
+	TP_PROTO(unsigned long opcode, unsigned long *args),
+
+	TP_ARGS(opcode, args),
+
+	TP_STRUCT__entry(
+		__field(unsigned long, opcode)
+	),
+
+	TP_fast_assign(
+		__entry->opcode = opcode;
+	),
+
+	TP_printk("opcode=%lu", __entry->opcode),
+
+	opal_tracepoint_regfunc, opal_tracepoint_unregfunc
+);
+
+TRACE_EVENT_FN(opal_exit,
+
+	TP_PROTO(unsigned long opcode, unsigned long retval),
+
+	TP_ARGS(opcode, retval),
+
+	TP_STRUCT__entry(
+		__field(unsigned long, opcode)
+		__field(unsigned long, retval)
+	),
+
+	TP_fast_assign(
+		__entry->opcode = opcode;
+		__entry->retval = retval;
+	),
+
+	TP_printk("opcode=%lu retval=%lu", __entry->opcode, __entry->retval),
+
+	opal_tracepoint_regfunc, opal_tracepoint_unregfunc
+);
+#endif
+
 #endif /* _TRACE_POWERPC_H */
 
 #undef TRACE_INCLUDE_PATH
Index: b/arch/powerpc/platforms/powernv/Makefile
===================================================================
--- a/arch/powerpc/platforms/powernv/Makefile
+++ b/arch/powerpc/platforms/powernv/Makefile
@@ -8,3 +8,4 @@  obj-$(CONFIG_PCI)	+= pci.o pci-p5ioc2.o
 obj-$(CONFIG_EEH)	+= eeh-ioda.o eeh-powernv.o
 obj-$(CONFIG_PPC_SCOM)	+= opal-xscom.o
 obj-$(CONFIG_MEMORY_FAILURE)	+= opal-memory-errors.o
+obj-$(CONFIG_TRACEPOINTS)	+= opal-tracepoints.o
Index: b/arch/powerpc/platforms/powernv/opal-wrappers.S
===================================================================
--- a/arch/powerpc/platforms/powernv/opal-wrappers.S
+++ b/arch/powerpc/platforms/powernv/opal-wrappers.S
@@ -13,30 +13,69 @@ 
 #include <asm/hvcall.h>
 #include <asm/asm-offsets.h>
 #include <asm/opal.h>
+#include <asm/jump_label.h>
+
+	.section	".text"
+
+#ifdef CONFIG_TRACEPOINTS
+#ifdef CONFIG_JUMP_LABEL
+#define OPAL_BRANCH(LABEL)					\
+	ARCH_STATIC_BRANCH(LABEL, opal_tracepoint_key)
+#else
+
+	.section	".toc","aw"
+
+	.globl opal_tracepoint_refcount
+opal_tracepoint_refcount:
+	.llong	0
+
+	.section	".text"
+
+/*
+ * We branch around this in early init by using an unconditional cpu
+ * feature.
+ */
+#define OPAL_BRANCH(LABEL)					\
+BEGIN_FTR_SECTION;						\
+	b	1f;						\
+END_FTR_SECTION(0, 1);						\
+	ld	r12,opal_tracepoint_refcount@toc(r2);		\
+	std	r12,32(r1);					\
+	cmpdi	r12,0;						\
+	bne-	LABEL;						\
+1:
+
+#endif
+
+#else
+#define OPAL_BRANCH(LABEL)
+#endif
 
 /* TODO:
  *
  * - Trace irqs in/off (needs saving/restoring all args, argh...)
  * - Get r11 feed up by Dave so I can have better register usage
  */
+
 #define OPAL_CALL(name, token)		\
  _GLOBAL(name);				\
 	mflr	r0;			\
-	mfcr	r12;			\
 	std	r0,16(r1);		\
+	li	r0,token;		\
+	OPAL_BRANCH(opal_tracepoint_entry) \
+	mfcr	r12;			\
 	stw	r12,8(r1);		\
 	std	r1,PACAR1(r13);		\
-	li	r0,0;			\
+	li	r11,0;			\
 	mfmsr	r12;			\
-	ori	r0,r0,MSR_EE;		\
+	ori	r11,r11,MSR_EE;		\
 	std	r12,PACASAVEDMSR(r13);	\
-	andc	r12,r12,r0;		\
+	andc	r12,r12,r11;		\
 	mtmsrd	r12,1;			\
-	LOAD_REG_ADDR(r0,opal_return);	\
-	mtlr	r0;			\
-	li	r0,MSR_DR|MSR_IR|MSR_LE;\
-	andc	r12,r12,r0;		\
-	li	r0,token;		\
+	LOAD_REG_ADDR(r11,opal_return);	\
+	mtlr	r11;			\
+	li	r11,MSR_DR|MSR_IR|MSR_LE;\
+	andc	r12,r12,r11;		\
 	mtspr	SPRN_HSRR1,r12;		\
 	LOAD_REG_ADDR(r11,opal);	\
 	ld	r12,8(r11);		\
@@ -61,6 +100,64 @@  opal_return:
 	mtcr	r4;
 	rfid
 
+#ifdef CONFIG_TRACEPOINTS
+opal_tracepoint_entry:
+	stdu	r1,-STACKFRAMESIZE(r1)
+	std	r0,STK_REG(R23)(r1)
+	std	r3,STK_REG(R24)(r1)
+	std	r4,STK_REG(R25)(r1)
+	std	r5,STK_REG(R26)(r1)
+	std	r6,STK_REG(R27)(r1)
+	std	r7,STK_REG(R28)(r1)
+	std	r8,STK_REG(R29)(r1)
+	std	r9,STK_REG(R30)(r1)
+	std	r10,STK_REG(R31)(r1)
+	mr	r3,r0
+	addi	r4,r1,STK_REG(R24)
+	bl	__trace_opal_entry
+	ld	r0,STK_REG(R23)(r1)
+	ld	r3,STK_REG(R24)(r1)
+	ld	r4,STK_REG(R25)(r1)
+	ld	r5,STK_REG(R26)(r1)
+	ld	r6,STK_REG(R27)(r1)
+	ld	r7,STK_REG(R28)(r1)
+	ld	r8,STK_REG(R29)(r1)
+	ld	r9,STK_REG(R30)(r1)
+	ld	r10,STK_REG(R31)(r1)
+	LOAD_REG_ADDR(r11,opal_tracepoint_return)
+	mfcr	r12
+	std	r11,16(r1)
+	stw	r12,8(r1)
+	std	r1,PACAR1(r13)
+	li	r11,0
+	mfmsr	r12
+	ori	r11,r11,MSR_EE
+	std	r12,PACASAVEDMSR(r13)
+	andc	r12,r12,r11
+	mtmsrd	r12,1
+	LOAD_REG_ADDR(r11,opal_return)
+	mtlr	r11
+	li	r11,MSR_DR|MSR_IR|MSR_LE
+	andc	r12,r12,r11
+	mtspr	SPRN_HSRR1,r12
+	LOAD_REG_ADDR(r11,opal)
+	ld	r12,8(r11)
+	ld	r2,0(r11)
+	mtspr	SPRN_HSRR0,r12
+	hrfid
+
+opal_tracepoint_return:
+	std	r3,STK_REG(R31)(r1)
+	mr	r4,r3
+	ld	r0,STK_REG(R23)(r1)
+	bl	__trace_opal_exit
+	ld	r3,STK_REG(R31)(r1)
+	addi	r1,r1,STACKFRAMESIZE
+	ld	r0,16(r1)
+	mtlr	r0
+	blr
+#endif
+
 OPAL_CALL(opal_invalid_call,			OPAL_INVALID_CALL);
 OPAL_CALL(opal_console_write,			OPAL_CONSOLE_WRITE);
 OPAL_CALL(opal_console_read,			OPAL_CONSOLE_READ);
Index: b/arch/powerpc/platforms/powernv/opal-tracepoints.c
===================================================================
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/opal-tracepoints.c
@@ -0,0 +1,84 @@ 
+#include <linux/percpu.h>
+#include <linux/jump_label.h>
+#include <asm/trace.h>
+
+#ifdef CONFIG_JUMP_LABEL
+struct static_key opal_tracepoint_key = STATIC_KEY_INIT;
+
+void opal_tracepoint_regfunc(void)
+{
+	static_key_slow_inc(&opal_tracepoint_key);
+}
+
+void opal_tracepoint_unregfunc(void)
+{
+	static_key_slow_dec(&opal_tracepoint_key);
+}
+#else
+/*
+ * We optimise OPAL calls by placing opal_tracepoint_refcount
+ * directly in the TOC so we can check if the opal tracepoints are
+ * enabled via a single load.
+ */
+
+/* NB: reg/unreg are called while guarded with the tracepoints_mutex */
+extern long opal_tracepoint_refcount;
+
+void opal_tracepoint_regfunc(void)
+{
+	opal_tracepoint_refcount++;
+}
+
+void opal_tracepoint_unregfunc(void)
+{
+	opal_tracepoint_refcount--;
+}
+#endif
+
+/*
+ * Since the tracing code might execute OPAL calls we need to guard against
+ * recursion.
+ */
+static DEFINE_PER_CPU(unsigned int, opal_trace_depth);
+
+void __trace_opal_entry(unsigned long opcode, unsigned long *args)
+{
+	unsigned long flags;
+	unsigned int *depth;
+
+	local_irq_save(flags);
+
+	depth = &__get_cpu_var(opal_trace_depth);
+
+	if (*depth)
+		goto out;
+
+	(*depth)++;
+	preempt_disable();
+	trace_opal_entry(opcode, args);
+	(*depth)--;
+
+out:
+	local_irq_restore(flags);
+}
+
+void __trace_opal_exit(long opcode, unsigned long retval)
+{
+	unsigned long flags;
+	unsigned int *depth;
+
+	local_irq_save(flags);
+
+	depth = &__get_cpu_var(opal_trace_depth);
+
+	if (*depth)
+		goto out;
+
+	(*depth)++;
+	trace_opal_exit(opcode, retval);
+	preempt_enable();
+	(*depth)--;
+
+out:
+	local_irq_restore(flags);
+}