
[RFC] powerpc/64/ftrace: mprofile-kernel patch out mflr

Message ID 20190413015940.31170-1-npiggin@gmail.com (mailing list archive)
State Superseded
Series [RFC] powerpc/64/ftrace: mprofile-kernel patch out mflr

Checks

Context Check Description
snowpatch_ozlabs/apply_patch success Successfully applied on branch next (8c2ffd9174779014c3fe1f96d9dc3641d9175f00)
snowpatch_ozlabs/build-ppc64le success Build succeeded
snowpatch_ozlabs/build-ppc64be success Build succeeded
snowpatch_ozlabs/build-ppc64e success Build succeeded
snowpatch_ozlabs/build-pmac32 success Build succeeded
snowpatch_ozlabs/checkpatch fail total: 1 errors, 0 warnings, 0 checks, 127 lines checked

Commit Message

Nicholas Piggin April 13, 2019, 1:59 a.m. UTC
The new mprofile-kernel mcount sequence is

  mflr	r0
  bl	_mcount

Dynamic ftrace patches the branch instruction with a nop, but leaves
the mflr in place. mflr is executed by the branch unit, which can only
execute one instruction per cycle on POWER9 and is shared with
branches, so it would be nice to avoid it where possible.

This patch is a hacky proof of concept to nop out the mflr. Can we do
this or are there races or other issues with it?
---
 arch/powerpc/kernel/trace/ftrace.c | 77 +++++++++++++++++++++++++++++-
 1 file changed, 75 insertions(+), 2 deletions(-)

Comments

Naveen N. Rao April 15, 2019, 9:42 a.m. UTC | #1
Hi Nick,

Nicholas Piggin wrote:
> The new mprofile-kernel mcount sequence is
> 
>   mflr	r0
>   bl	_mcount
> 
> Dynamic ftrace patches the branch instruction with a noop, but leaves
> the mflr. mflr is executed by the branch unit that can only execute one
> per cycle on POWER9 and shared with branches, so it would be nice to
> avoid it where possible.
> 
> This patch is a hacky proof of concept to nop out the mflr. Can we do
> this or are there races or other issues with it?

Thanks for implementing this. I don't see a problem with this.

- Naveen
Michael Ellerman May 13, 2019, 3:25 a.m. UTC | #2
Nicholas Piggin <npiggin@gmail.com> writes:
> The new mprofile-kernel mcount sequence is
>
>   mflr	r0
>   bl	_mcount
>
> Dynamic ftrace patches the branch instruction with a noop, but leaves
> the mflr. mflr is executed by the branch unit that can only execute one
> per cycle on POWER9 and shared with branches, so it would be nice to
> avoid it where possible.
>
> This patch is a hacky proof of concept to nop out the mflr. Can we do
> this or are there races or other issues with it?

There's a race, isn't there?

We have a function foo which currently has tracing disabled, so the mflr
and bl are nop'ed out.

  CPU 0			CPU 1
  ==================================
  bl foo
  nop (ie. not mflr)
  -> interrupt
  something else	enable tracing for foo
  ...			patch mflr and branch
  <- rfi
  bl _mcount

So we end up in _mcount() but with r0 not populated.

Or else what am I missing?

cheers


> diff --git a/arch/powerpc/kernel/trace/ftrace.c b/arch/powerpc/kernel/trace/ftrace.c
> index 52ee24fd353f..ecc75baef23e 100644
> --- a/arch/powerpc/kernel/trace/ftrace.c
> +++ b/arch/powerpc/kernel/trace/ftrace.c
> @@ -172,6 +172,19 @@ __ftrace_make_nop(struct module *mod,
>  		pr_err("Unexpected instruction %08x around bl _mcount\n", op);
>  		return -EINVAL;
>  	}
> +
> +	if (patch_instruction((unsigned int *)ip, pop)) {
> +		pr_err("Patching NOP failed.\n");
> +		return -EPERM;
> +	}
> +
> +	if (op == PPC_INST_MFLR) {
> +		if (patch_instruction((unsigned int *)(ip - 4), pop)) {
> +			pr_err("Patching NOP failed.\n");
> +			return -EPERM;
> +		}
> +	}
> +
>  #else
>  	/*
>  	 * Our original call site looks like:
> @@ -202,13 +215,14 @@ __ftrace_make_nop(struct module *mod,
>  		pr_err("Expected %08x found %08x\n", PPC_INST_LD_TOC, op);
>  		return -EINVAL;
>  	}
> -#endif /* CONFIG_MPROFILE_KERNEL */
>  
>  	if (patch_instruction((unsigned int *)ip, pop)) {
>  		pr_err("Patching NOP failed.\n");
>  		return -EPERM;
>  	}
>  
> +#endif /* CONFIG_MPROFILE_KERNEL */
> +
>  	return 0;
>  }
>  
> @@ -421,6 +435,20 @@ static int __ftrace_make_nop_kernel(struct dyn_ftrace *rec, unsigned long addr)
>  		return -EPERM;
>  	}
>  
> +#ifdef CONFIG_MPROFILE_KERNEL
> +	if (probe_kernel_read(&op, (void *)(ip - 4), 4)) {
> +		pr_err("Fetching instruction at %lx failed.\n", ip - 4);
> +		return -EFAULT;
> +	}
> +
> +	if (op == PPC_INST_MFLR) {
> +		if (patch_instruction((unsigned int *)(ip - 4), PPC_INST_NOP)) {
> +			pr_err("Patching NOP failed.\n");
> +			return -EPERM;
> +		}
> +	}
> +#endif
> +
>  	return 0;
>  }
>  
> @@ -437,9 +465,20 @@ int ftrace_make_nop(struct module *mod,
>  	 */
>  	if (test_24bit_addr(ip, addr)) {
>  		/* within range */
> +		int rc;
> +
>  		old = ftrace_call_replace(ip, addr, 1);
>  		new = PPC_INST_NOP;
> -		return ftrace_modify_code(ip, old, new);
> +		rc = ftrace_modify_code(ip, old, new);
> +		if (rc)
> +			return rc;
> +#ifdef CONFIG_MPROFILE_KERNEL
> +		old = PPC_INST_MFLR;
> +		new = PPC_INST_NOP;
> +		ftrace_modify_code(ip - 4, old, new);
> +		/* old mprofile kernel will error because no mflr */
> +#endif
> +		return rc;
>  	} else if (core_kernel_text(ip))
>  		return __ftrace_make_nop_kernel(rec, addr);
>  
> @@ -562,6 +601,20 @@ __ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
>  		return -EINVAL;
>  	}
>  
> +#ifdef CONFIG_MPROFILE_KERNEL
> +	if (probe_kernel_read(op, (ip - 4), 4)) {
> +		pr_err("Fetching instruction at %lx failed.\n", (unsigned long)(ip - 4));
> +		return -EFAULT;
> +	}
> +
> +	if (op[0] == PPC_INST_NOP) {
> +		if (patch_instruction((ip - 4), PPC_INST_MFLR)) {
> +			pr_err("Patching mflr failed.\n");
> +			return -EINVAL;
> +		}
> +	}
> +#endif
> +
>  	if (patch_branch(ip, tramp, BRANCH_SET_LINK)) {
>  		pr_err("REL24 out of range!\n");
>  		return -EINVAL;
> @@ -650,6 +703,20 @@ static int __ftrace_make_call_kernel(struct dyn_ftrace *rec, unsigned long addr)
>  		return -EINVAL;
>  	}
>  
> +#ifdef CONFIG_MPROFILE_KERNEL
> +	if (probe_kernel_read(&op, (ip - 4), 4)) {
> +		pr_err("Fetching instruction at %lx failed.\n", (unsigned long)(ip - 4));
> +		return -EFAULT;
> +	}
> +
> +	if (op == PPC_INST_NOP) {
> +		if (patch_instruction((ip - 4), PPC_INST_MFLR)) {
> +			pr_err("Patching mflr failed.\n");
> +			return -EINVAL;
> +		}
> +	}
> +#endif
> +
>  	if (patch_branch(ip, tramp, BRANCH_SET_LINK)) {
>  		pr_err("Error patching branch to ftrace tramp!\n");
>  		return -EINVAL;
> @@ -670,6 +737,12 @@ int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
>  	 */
>  	if (test_24bit_addr(ip, addr)) {
>  		/* within range */
> +#ifdef CONFIG_MPROFILE_KERNEL
> +		old = PPC_INST_NOP;
> +		new = PPC_INST_MFLR;
> +		ftrace_modify_code(ip - 4, old, new);
> +		/* old mprofile-kernel sequence will error because no nop */
> +#endif
>  		old = PPC_INST_NOP;
>  		new = ftrace_call_replace(ip, addr, 1);
>  		return ftrace_modify_code(ip, old, new);
> -- 
> 2.20.1
Naveen N. Rao May 13, 2019, 6:47 a.m. UTC | #3
Michael Ellerman wrote:
> Nicholas Piggin <npiggin@gmail.com> writes:
>> The new mprofile-kernel mcount sequence is
>>
>>   mflr	r0
>>   bl	_mcount
>>
>> Dynamic ftrace patches the branch instruction with a noop, but leaves
>> the mflr. mflr is executed by the branch unit that can only execute one
>> per cycle on POWER9 and shared with branches, so it would be nice to
>> avoid it where possible.
>>
>> This patch is a hacky proof of concept to nop out the mflr. Can we do
>> this or are there races or other issues with it?
> 
> There's a race, isn't there?
> 
> We have a function foo which currently has tracing disabled, so the mflr
> and bl are nop'ed out.
> 
>   CPU 0			CPU 1
>   ==================================
>   bl foo
>   nop (ie. not mflr)
>   -> interrupt
>   something else	enable tracing for foo
>   ...			patch mflr and branch
>   <- rfi
>   bl _mcount
> 
> So we end up in _mcount() but with r0 not populated.

Good catch! Looks like we need to patch the mflr with a "b +8" similar 
to what we do in __ftrace_make_nop().


- Naveen
Michael Ellerman May 13, 2019, 11:34 a.m. UTC | #4
"Naveen N. Rao" <naveen.n.rao@linux.ibm.com> writes:
> Michael Ellerman wrote:
>> Nicholas Piggin <npiggin@gmail.com> writes:
>>> The new mprofile-kernel mcount sequence is
>>>
>>>   mflr	r0
>>>   bl	_mcount
>>>
>>> Dynamic ftrace patches the branch instruction with a noop, but leaves
>>> the mflr. mflr is executed by the branch unit that can only execute one
>>> per cycle on POWER9 and shared with branches, so it would be nice to
>>> avoid it where possible.
>>>
>>> This patch is a hacky proof of concept to nop out the mflr. Can we do
>>> this or are there races or other issues with it?
>> 
>> There's a race, isn't there?
>> 
>> We have a function foo which currently has tracing disabled, so the mflr
>> and bl are nop'ed out.
>> 
>>   CPU 0			CPU 1
>>   ==================================
>>   bl foo
>>   nop (ie. not mflr)
>>   -> interrupt
>>   something else	enable tracing for foo
>>   ...			patch mflr and branch
>>   <- rfi
>>   bl _mcount
>> 
>> So we end up in _mcount() but with r0 not populated.
>
> Good catch! Looks like we need to patch the mflr with a "b +8" similar 
> to what we do in __ftrace_make_nop().

Would that actually make it any faster though? Nick?

cheers
Naveen N. Rao May 14, 2019, 8:32 a.m. UTC | #5
Michael Ellerman wrote:
> "Naveen N. Rao" <naveen.n.rao@linux.ibm.com> writes:
>> Michael Ellerman wrote:
>>> Nicholas Piggin <npiggin@gmail.com> writes:
>>>> The new mprofile-kernel mcount sequence is
>>>>
>>>>   mflr	r0
>>>>   bl	_mcount
>>>>
>>>> Dynamic ftrace patches the branch instruction with a noop, but leaves
>>>> the mflr. mflr is executed by the branch unit that can only execute one
>>>> per cycle on POWER9 and shared with branches, so it would be nice to
>>>> avoid it where possible.
>>>>
>>>> This patch is a hacky proof of concept to nop out the mflr. Can we do
>>>> this or are there races or other issues with it?
>>> 
>>> There's a race, isn't there?
>>> 
>>> We have a function foo which currently has tracing disabled, so the mflr
>>> and bl are nop'ed out.
>>> 
>>>   CPU 0			CPU 1
>>>   ==================================
>>>   bl foo
>>>   nop (ie. not mflr)
>>>   -> interrupt
>>>   something else	enable tracing for foo
>>>   ...			patch mflr and branch
>>>   <- rfi
>>>   bl _mcount
>>> 
>>> So we end up in _mcount() but with r0 not populated.
>>
>> Good catch! Looks like we need to patch the mflr with a "b +8" similar 
>> to what we do in __ftrace_make_nop().
> 
> Would that actually make it any faster though? Nick?

Ok, how about doing this as a 2-step process?
1. patch 'mflr r0' with a 'b +8'
   synchronize_rcu_tasks()
2. convert 'b +8' to a 'nop'
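
Roughly something like the below (untested; 'ip' being the unsigned long
rec->ip of the 'bl _mcount' site as elsewhere in ftrace.c, and reusing
the existing PPC_INST_BRANCH idiom for the 'b +8'):

	/* 1. replace the 'mflr r0' with a 'b +8' over the next slot */
	if (patch_instruction((unsigned int *)(ip - 4), PPC_INST_BRANCH | 8))
		return -EPERM;

	/* wait until no task can still be mid-way through the old sequence */
	synchronize_rcu_tasks();

	/* 2. relax the 'b +8' into a plain nop */
	if (patch_instruction((unsigned int *)(ip - 4), PPC_INST_NOP))
		return -EPERM;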

- Naveen
Michael Ellerman May 15, 2019, 12:20 p.m. UTC | #6
"Naveen N. Rao" <naveen.n.rao@linux.ibm.com> writes:
> Michael Ellerman wrote:
>> "Naveen N. Rao" <naveen.n.rao@linux.ibm.com> writes:
>>> Michael Ellerman wrote:
>>>> Nicholas Piggin <npiggin@gmail.com> writes:
>>>>> The new mprofile-kernel mcount sequence is
>>>>>
>>>>>   mflr	r0
>>>>>   bl	_mcount
>>>>>
>>>>> Dynamic ftrace patches the branch instruction with a noop, but leaves
>>>>> the mflr. mflr is executed by the branch unit that can only execute one
>>>>> per cycle on POWER9 and shared with branches, so it would be nice to
>>>>> avoid it where possible.
>>>>>
>>>>> This patch is a hacky proof of concept to nop out the mflr. Can we do
>>>>> this or are there races or other issues with it?
>>>> 
>>>> There's a race, isn't there?
>>>> 
>>>> We have a function foo which currently has tracing disabled, so the mflr
>>>> and bl are nop'ed out.
>>>> 
>>>>   CPU 0			CPU 1
>>>>   ==================================
>>>>   bl foo
>>>>   nop (ie. not mflr)
>>>>   -> interrupt
>>>>   something else	enable tracing for foo
>>>>   ...			patch mflr and branch
>>>>   <- rfi
>>>>   bl _mcount
>>>> 
>>>> So we end up in _mcount() but with r0 not populated.
>>>
>>> Good catch! Looks like we need to patch the mflr with a "b +8" similar 
>>> to what we do in __ftrace_make_nop().
>> 
>> Would that actually make it any faster though? Nick?
>
> Ok, how about doing this as a 2-step process?
> 1. patch 'mflr r0' with a 'b +8'
>    synchronize_rcu_tasks()
> 2. convert 'b +8' to a 'nop'

I think that would work, if I understand synchronize_rcu_tasks().

I worry that it will make the enable/disable expensive. But could be
worth trying.

cheers
Nicholas Piggin May 16, 2019, 5:49 a.m. UTC | #7
Naveen N. Rao's on May 14, 2019 6:32 pm:
> Michael Ellerman wrote:
>> "Naveen N. Rao" <naveen.n.rao@linux.ibm.com> writes:
>>> Michael Ellerman wrote:
>>>> Nicholas Piggin <npiggin@gmail.com> writes:
>>>>> The new mprofile-kernel mcount sequence is
>>>>>
>>>>>   mflr	r0
>>>>>   bl	_mcount
>>>>>
>>>>> Dynamic ftrace patches the branch instruction with a noop, but leaves
>>>>> the mflr. mflr is executed by the branch unit that can only execute one
>>>>> per cycle on POWER9 and shared with branches, so it would be nice to
>>>>> avoid it where possible.
>>>>>
>>>>> This patch is a hacky proof of concept to nop out the mflr. Can we do
>>>>> this or are there races or other issues with it?
>>>> 
>>>> There's a race, isn't there?
>>>> 
>>>> We have a function foo which currently has tracing disabled, so the mflr
>>>> and bl are nop'ed out.
>>>> 
>>>>   CPU 0			CPU 1
>>>>   ==================================
>>>>   bl foo
>>>>   nop (ie. not mflr)
>>>>   -> interrupt
>>>>   something else	enable tracing for foo
>>>>   ...			patch mflr and branch
>>>>   <- rfi
>>>>   bl _mcount
>>>> 
>>>> So we end up in _mcount() but with r0 not populated.
>>>
>>> Good catch! Looks like we need to patch the mflr with a "b +8" similar 
>>> to what we do in __ftrace_make_nop().
>> 
>> Would that actually make it any faster though? Nick?
> 
> Ok, how about doing this as a 2-step process?
> 1. patch 'mflr r0' with a 'b +8'
>    synchronize_rcu_tasks()
> 2. convert 'b +8' to a 'nop'

Good idea. Well the mflr r0 is harmless, so you can leave that in.
You just need to ensure it's not removed before the bl is. So nop
the bl _mcount, then synchronize_rcu_tasks(), then nop the mflr?
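
Something like this on the disable side, roughly (untested; 'ip' being
the unsigned long rec->ip of the 'bl _mcount' site):

	/* remove the call first; a leftover 'mflr r0' on its own is harmless */
	if (patch_instruction((unsigned int *)ip, PPC_INST_NOP))
		return -EPERM;

	/* make sure no task is still about to execute the old 'bl _mcount' */
	synchronize_rcu_tasks();

	/* the mflr is now only ever followed by a nop, so it can go too */
	if (patch_instruction((unsigned int *)(ip - 4), PPC_INST_NOP))
		return -EPERM;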

Thanks,
Nick
Naveen N. Rao May 16, 2019, 6:22 p.m. UTC | #8
Nicholas Piggin wrote:
> Naveen N. Rao's on May 14, 2019 6:32 pm:
>> Michael Ellerman wrote:
>>> "Naveen N. Rao" <naveen.n.rao@linux.ibm.com> writes:
>>>> Michael Ellerman wrote:
>>>>> Nicholas Piggin <npiggin@gmail.com> writes:
>>>>>> The new mprofile-kernel mcount sequence is
>>>>>>
>>>>>>   mflr	r0
>>>>>>   bl	_mcount
>>>>>>
>>>>>> Dynamic ftrace patches the branch instruction with a noop, but leaves
>>>>>> the mflr. mflr is executed by the branch unit that can only execute one
>>>>>> per cycle on POWER9 and shared with branches, so it would be nice to
>>>>>> avoid it where possible.
>>>>>>
>>>>>> This patch is a hacky proof of concept to nop out the mflr. Can we do
>>>>>> this or are there races or other issues with it?
>>>>> 
>>>>> There's a race, isn't there?
>>>>> 
>>>>> We have a function foo which currently has tracing disabled, so the mflr
>>>>> and bl are nop'ed out.
>>>>> 
>>>>>   CPU 0			CPU 1
>>>>>   ==================================
>>>>>   bl foo
>>>>>   nop (ie. not mflr)
>>>>>   -> interrupt
>>>>>   something else	enable tracing for foo
>>>>>   ...			patch mflr and branch
>>>>>   <- rfi
>>>>>   bl _mcount
>>>>> 
>>>>> So we end up in _mcount() but with r0 not populated.
>>>>
>>>> Good catch! Looks like we need to patch the mflr with a "b +8" similar 
>>>> to what we do in __ftrace_make_nop().
>>> 
>>> Would that actually make it any faster though? Nick?
>> 
>> Ok, how about doing this as a 2-step process?
>> 1. patch 'mflr r0' with a 'b +8'
>>    synchronize_rcu_tasks()
>> 2. convert 'b +8' to a 'nop'
> 
> Good idea. Well the mflr r0 is harmless, so you can leave that in.
> You just need to ensure it's not removed before the bl is. So nop
> the bl _mcount, then synchronize_rcu_tasks(), then nop the mflr?

The problem actually seems to be when we try to patch in the branch to 
_mcount(), rather than when we are patching in the nop instructions 
(i.e., the race is when we try to enable the function tracer, rather 
than while disabling it).

When we disable ftrace, we only need to ensure we patch out the branch 
to _mcount() before patching out the preceding 'mflr r0'. I don't think 
we need a synchronize_rcu_tasks() in that case.

While enabling ftrace, we will first need to patch the preceding 'mflr 
r0' (which would now be a 'nop') with 'b +8', then use 
synchronize_rcu_tasks() and finally patch in 'bl _mcount()' followed by 
'mflr r0'.
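
That is, roughly (untested, in __ftrace_make_call() terms, with 'tramp'
being the branch target it has already resolved):

	/* the mflr slot is currently a nop; make it jump over the bl slot */
	if (patch_instruction(ip - 4, PPC_INST_BRANCH | 8))
		return -EPERM;

	synchronize_rcu_tasks();

	/* patch in the call, then replace the 'b +8' with the real mflr */
	if (patch_branch(ip, tramp, BRANCH_SET_LINK))
		return -EINVAL;
	if (patch_instruction(ip - 4, PPC_INST_MFLR))
		return -EPERM;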

I think that's what you meant, just that my reference to patching 'mflr 
r0' with a 'b +8' should have called out that the mflr would have been 
nop'ed out.

- Naveen
Nicholas Piggin May 17, 2019, 9:12 a.m. UTC | #9
Naveen N. Rao's on May 17, 2019 4:22 am:
> Nicholas Piggin wrote:
>> Naveen N. Rao's on May 14, 2019 6:32 pm:
>>> Michael Ellerman wrote:
>>>> "Naveen N. Rao" <naveen.n.rao@linux.ibm.com> writes:
>>>>> Michael Ellerman wrote:
>>>>>> Nicholas Piggin <npiggin@gmail.com> writes:
>>>>>>> The new mprofile-kernel mcount sequence is
>>>>>>>
>>>>>>>   mflr	r0
>>>>>>>   bl	_mcount
>>>>>>>
>>>>>>> Dynamic ftrace patches the branch instruction with a noop, but leaves
>>>>>>> the mflr. mflr is executed by the branch unit that can only execute one
>>>>>>> per cycle on POWER9 and shared with branches, so it would be nice to
>>>>>>> avoid it where possible.
>>>>>>>
>>>>>>> This patch is a hacky proof of concept to nop out the mflr. Can we do
>>>>>>> this or are there races or other issues with it?
>>>>>> 
>>>>>> There's a race, isn't there?
>>>>>> 
>>>>>> We have a function foo which currently has tracing disabled, so the mflr
>>>>>> and bl are nop'ed out.
>>>>>> 
>>>>>>   CPU 0			CPU 1
>>>>>>   ==================================
>>>>>>   bl foo
>>>>>>   nop (ie. not mflr)
>>>>>>   -> interrupt
>>>>>>   something else	enable tracing for foo
>>>>>>   ...			patch mflr and branch
>>>>>>   <- rfi
>>>>>>   bl _mcount
>>>>>> 
>>>>>> So we end up in _mcount() but with r0 not populated.
>>>>>
>>>>> Good catch! Looks like we need to patch the mflr with a "b +8" similar 
>>>>> to what we do in __ftrace_make_nop().
>>>> 
>>>> Would that actually make it any faster though? Nick?
>>> 
>>> Ok, how about doing this as a 2-step process?
>>> 1. patch 'mflr r0' with a 'b +8'
>>>    synchronize_rcu_tasks()
>>> 2. convert 'b +8' to a 'nop'
>> 
>> Good idea. Well the mflr r0 is harmless, so you can leave that in.
>> You just need to ensure it's not removed before the bl is. So nop
>> the bl _mcount, then synchronize_rcu_tasks(), then nop the mflr?
> 
> The problem actually seems to be when we try to patch in the branch to 
> _mcount(), rather than when we are patching in the nop instructions 
> (i.e., the race is when we try to enable the function tracer, rather 
> than while disabling it).
> 
> When we disable ftrace, we only need to ensure we patch out the branch 
> to _mcount() before patching out the preceding 'mflr r0'. I don't think 
> we need a synchronize_rcu_tasks() in that case.

That's probably right.

> While enabling ftrace, we will first need to patch the preceding 'mflr 
> r0' (which would now be a 'nop') with 'b +8', then use 
> synchronize_rcu_tasks() and finally patch in 'bl _mcount()' followed by 
> 'mflr r0'.
> 
> I think that's what you meant, just that my reference to patching 'mflr 
> r0' with a 'b +8' should have called out that the mflr would have been 
> nop'ed out.

I meant that we don't need the b +8 anywhere, because the mflr r0
is harmless. Enabling ftrace just needs to patch in 'mflr r0', and
then synchronize_rcu_tasks(), and then patch in 'bl _mcount'. I think?
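
In __ftrace_make_call() terms that would be, roughly (untested, 'tramp'
being the branch target it has already resolved):

	/* put the 'mflr r0' back first */
	if (patch_instruction(ip - 4, PPC_INST_MFLR))
		return -EPERM;

	/* no task can now reach the call site without having seen the mflr */
	synchronize_rcu_tasks();

	/* only then make the call live */
	if (patch_branch(ip, tramp, BRANCH_SET_LINK))
		return -EINVAL;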

Thanks,
Nick
Naveen N. Rao May 17, 2019, 5:25 p.m. UTC | #10
Nicholas Piggin wrote:
> Naveen N. Rao's on May 17, 2019 4:22 am:
> 
>> While enabling ftrace, we will first need to patch the preceding 'mflr 
>> r0' (which would now be a 'nop') with 'b +8', then use 
>> synchronize_rcu_tasks() and finally patch in 'bl _mcount()' followed by 
>> 'mflr r0'.
>> 
>> I think that's what you meant, just that my reference to patching 'mflr 
>> r0' with a 'b +8' should have called out that the mflr would have been 
>> nop'ed out.
> 
> I meant that we don't need the b +8 anywhere, because the mflr r0
> is harmless. Enabling ftrace just needs to patch in 'mflr r0', and
> then synchronize_rcu_tasks(), and then patch in 'bl _mcount'. I think?

Ah, that's a good point! That should be enough and it simplifies this 
further.


Thanks,
Naveen

Patch

diff --git a/arch/powerpc/kernel/trace/ftrace.c b/arch/powerpc/kernel/trace/ftrace.c
index 52ee24fd353f..ecc75baef23e 100644
--- a/arch/powerpc/kernel/trace/ftrace.c
+++ b/arch/powerpc/kernel/trace/ftrace.c
@@ -172,6 +172,19 @@  __ftrace_make_nop(struct module *mod,
 		pr_err("Unexpected instruction %08x around bl _mcount\n", op);
 		return -EINVAL;
 	}
+
+	if (patch_instruction((unsigned int *)ip, pop)) {
+		pr_err("Patching NOP failed.\n");
+		return -EPERM;
+	}
+
+	if (op == PPC_INST_MFLR) {
+		if (patch_instruction((unsigned int *)(ip - 4), pop)) {
+			pr_err("Patching NOP failed.\n");
+			return -EPERM;
+		}
+	}
+
 #else
 	/*
 	 * Our original call site looks like:
@@ -202,13 +215,14 @@  __ftrace_make_nop(struct module *mod,
 		pr_err("Expected %08x found %08x\n", PPC_INST_LD_TOC, op);
 		return -EINVAL;
 	}
-#endif /* CONFIG_MPROFILE_KERNEL */
 
 	if (patch_instruction((unsigned int *)ip, pop)) {
 		pr_err("Patching NOP failed.\n");
 		return -EPERM;
 	}
 
+#endif /* CONFIG_MPROFILE_KERNEL */
+
 	return 0;
 }
 
@@ -421,6 +435,20 @@  static int __ftrace_make_nop_kernel(struct dyn_ftrace *rec, unsigned long addr)
 		return -EPERM;
 	}
 
+#ifdef CONFIG_MPROFILE_KERNEL
+	if (probe_kernel_read(&op, (void *)(ip - 4), 4)) {
+		pr_err("Fetching instruction at %lx failed.\n", ip - 4);
+		return -EFAULT;
+	}
+
+	if (op == PPC_INST_MFLR) {
+		if (patch_instruction((unsigned int *)(ip - 4), PPC_INST_NOP)) {
+			pr_err("Patching NOP failed.\n");
+			return -EPERM;
+		}
+	}
+#endif
+
 	return 0;
 }
 
@@ -437,9 +465,20 @@  int ftrace_make_nop(struct module *mod,
 	 */
 	if (test_24bit_addr(ip, addr)) {
 		/* within range */
+		int rc;
+
 		old = ftrace_call_replace(ip, addr, 1);
 		new = PPC_INST_NOP;
-		return ftrace_modify_code(ip, old, new);
+		rc = ftrace_modify_code(ip, old, new);
+		if (rc)
+			return rc;
+#ifdef CONFIG_MPROFILE_KERNEL
+		old = PPC_INST_MFLR;
+		new = PPC_INST_NOP;
+		ftrace_modify_code(ip - 4, old, new);
+		/* old mprofile kernel will error because no mflr */
+#endif
+		return rc;
 	} else if (core_kernel_text(ip))
 		return __ftrace_make_nop_kernel(rec, addr);
 
@@ -562,6 +601,20 @@  __ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
 		return -EINVAL;
 	}
 
+#ifdef CONFIG_MPROFILE_KERNEL
+	if (probe_kernel_read(op, (ip - 4), 4)) {
+		pr_err("Fetching instruction at %lx failed.\n", (unsigned long)(ip - 4));
+		return -EFAULT;
+	}
+
+	if (op[0] == PPC_INST_NOP) {
+		if (patch_instruction((ip - 4), PPC_INST_MFLR)) {
+			pr_err("Patching mflr failed.\n");
+			return -EINVAL;
+		}
+	}
+#endif
+
 	if (patch_branch(ip, tramp, BRANCH_SET_LINK)) {
 		pr_err("REL24 out of range!\n");
 		return -EINVAL;
@@ -650,6 +703,20 @@  static int __ftrace_make_call_kernel(struct dyn_ftrace *rec, unsigned long addr)
 		return -EINVAL;
 	}
 
+#ifdef CONFIG_MPROFILE_KERNEL
+	if (probe_kernel_read(&op, (ip - 4), 4)) {
+		pr_err("Fetching instruction at %lx failed.\n", (unsigned long)(ip - 4));
+		return -EFAULT;
+	}
+
+	if (op == PPC_INST_NOP) {
+		if (patch_instruction((ip - 4), PPC_INST_MFLR)) {
+			pr_err("Patching mflr failed.\n");
+			return -EINVAL;
+		}
+	}
+#endif
+
 	if (patch_branch(ip, tramp, BRANCH_SET_LINK)) {
 		pr_err("Error patching branch to ftrace tramp!\n");
 		return -EINVAL;
@@ -670,6 +737,12 @@  int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
 	 */
 	if (test_24bit_addr(ip, addr)) {
 		/* within range */
+#ifdef CONFIG_MPROFILE_KERNEL
+		old = PPC_INST_NOP;
+		new = PPC_INST_MFLR;
+		ftrace_modify_code(ip - 4, old, new);
+		/* old mprofile-kernel sequence will error because no nop */
+#endif
 		old = PPC_INST_NOP;
 		new = ftrace_call_replace(ip, addr, 1);
 		return ftrace_modify_code(ip, old, new);