diff mbox series

[RFC,3/3] powerpc/lib: Use a temporary mm for code patching

Message ID 20200323045205.20314-4-cmr@informatik.wtf
State New
Headers show
Series Use per-CPU temporary mappings for patching | expand

Checks

Context Check Description
snowpatch_ozlabs/needsstable success Patch has no Fixes tags
snowpatch_ozlabs/checkpatch success total: 0 errors, 0 warnings, 0 checks, 195 lines checked
snowpatch_ozlabs/build-pmac32 success Build succeeded
snowpatch_ozlabs/build-ppc64e fail build failed!
snowpatch_ozlabs/build-ppc64be success Build succeeded
snowpatch_ozlabs/build-ppc64le success Build succeeded
snowpatch_ozlabs/apply_patch success Successfully applied on branch powerpc/merge (a87b93bdf800a4d7a42d95683624a4516e516b4f)

Commit Message

Christopher M. Riedl March 23, 2020, 4:52 a.m. UTC
Currently, code patching with STRICT_KERNEL_RWX exposes the temporary
mappings to other CPUs. These mappings should be kept local to the CPU
doing the patching. Use the pre-initialized temporary mm and patching
address for this purpose. Also add a check after patching to ensure the
patch succeeded.

Based on x86 implementation:

commit b3fd8e83ada0
("x86/alternatives: Use temporary mm for text poking")

Signed-off-by: Christopher M. Riedl <cmr@informatik.wtf>
---
 arch/powerpc/lib/code-patching.c | 128 ++++++++++++++-----------------
 1 file changed, 57 insertions(+), 71 deletions(-)

Comments

Christophe Leroy March 24, 2020, 4:25 p.m. UTC | #1
Le 23/03/2020 à 05:52, Christopher M. Riedl a écrit :
> Currently, code patching a STRICT_KERNEL_RWX exposes the temporary
> mappings to other CPUs. These mappings should be kept local to the CPU
> doing the patching. Use the pre-initialized temporary mm and patching
> address for this purpose. Also add a check after patching to ensure the
> patch succeeded.
> 
> Based on x86 implementation:
> 
> commit b3fd8e83ada0
> ("x86/alternatives: Use temporary mm for text poking")
> 
> Signed-off-by: Christopher M. Riedl <cmr@informatik.wtf>
> ---
>   arch/powerpc/lib/code-patching.c | 128 ++++++++++++++-----------------
>   1 file changed, 57 insertions(+), 71 deletions(-)
> 
> diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c
> index 18b88ecfc5a8..f156132e8975 100644
> --- a/arch/powerpc/lib/code-patching.c
> +++ b/arch/powerpc/lib/code-patching.c
> @@ -19,6 +19,7 @@
>   #include <asm/page.h>
>   #include <asm/code-patching.h>
>   #include <asm/setup.h>
> +#include <asm/mmu_context.h>
>   
>   static int __patch_instruction(unsigned int *exec_addr, unsigned int instr,
>   			       unsigned int *patch_addr)
> @@ -65,99 +66,79 @@ void __init poking_init(void)
>   	pte_unmap_unlock(ptep, ptl);
>   }
>   
> -static DEFINE_PER_CPU(struct vm_struct *, text_poke_area);
> -
> -static int text_area_cpu_up(unsigned int cpu)
> -{
> -	struct vm_struct *area;
> -
> -	area = get_vm_area(PAGE_SIZE, VM_ALLOC);
> -	if (!area) {
> -		WARN_ONCE(1, "Failed to create text area for cpu %d\n",
> -			cpu);
> -		return -1;
> -	}
> -	this_cpu_write(text_poke_area, area);
> -
> -	return 0;
> -}
> -
> -static int text_area_cpu_down(unsigned int cpu)
> -{
> -	free_vm_area(this_cpu_read(text_poke_area));
> -	return 0;
> -}
> -
> -/*
> - * Run as a late init call. This allows all the boot time patching to be done
> - * simply by patching the code, and then we're called here prior to
> - * mark_rodata_ro(), which happens after all init calls are run. Although
> - * BUG_ON() is rude, in this case it should only happen if ENOMEM, and we judge
> - * it as being preferable to a kernel that will crash later when someone tries
> - * to use patch_instruction().
> - */
> -static int __init setup_text_poke_area(void)
> -{
> -	BUG_ON(!cpuhp_setup_state(CPUHP_AP_ONLINE_DYN,
> -		"powerpc/text_poke:online", text_area_cpu_up,
> -		text_area_cpu_down));
> -
> -	return 0;
> -}
> -late_initcall(setup_text_poke_area);
> +struct patch_mapping {
> +	spinlock_t *ptl; /* for protecting pte table */
> +	struct temp_mm temp_mm;
> +};
>   
>   /*
>    * This can be called for kernel text or a module.
>    */
> -static int map_patch_area(void *addr, unsigned long text_poke_addr)
> +static int map_patch(const void *addr, struct patch_mapping *patch_mapping)

Why change the name ?

>   {
> -	unsigned long pfn;
> -	int err;
> +	struct page *page;
> +	pte_t pte, *ptep;
> +	pgprot_t pgprot;
>   
>   	if (is_vmalloc_addr(addr))
> -		pfn = vmalloc_to_pfn(addr);
> +		page = vmalloc_to_page(addr);
>   	else
> -		pfn = __pa_symbol(addr) >> PAGE_SHIFT;
> +		page = virt_to_page(addr);
>   
> -	err = map_kernel_page(text_poke_addr, (pfn << PAGE_SHIFT), PAGE_KERNEL);
> +	if (radix_enabled())
> +		pgprot = __pgprot(pgprot_val(PAGE_KERNEL));
> +	else
> +		pgprot = PAGE_SHARED;

Can you explain the difference between radix and non radix ?

Why PAGE_KERNEL for a page that is mapped in userspace ?

Why do you need to do __pgprot(pgprot_val(PAGE_KERNEL)) instead of just 
using PAGE_KERNEL ?

>   
> -	pr_devel("Mapped addr %lx with pfn %lx:%d\n", text_poke_addr, pfn, err);
> -	if (err)
> +	ptep = get_locked_pte(patching_mm, patching_addr, &patch_mapping->ptl);
> +	if (unlikely(!ptep)) {
> +		pr_warn("map patch: failed to allocate pte for patching\n");
>   		return -1;
> +	}
> +
> +	pte = mk_pte(page, pgprot);
> +	set_pte_at(patching_mm, patching_addr, ptep, pte);
> +
> +	init_temp_mm(&patch_mapping->temp_mm, patching_mm);
> +	use_temporary_mm(&patch_mapping->temp_mm);
>   
>   	return 0;
>   }
>   
> -static inline int unmap_patch_area(unsigned long addr)
> +static int unmap_patch(struct patch_mapping *patch_mapping)
>   {
>   	pte_t *ptep;
>   	pmd_t *pmdp;
>   	pud_t *pudp;
>   	pgd_t *pgdp;
>   
> -	pgdp = pgd_offset_k(addr);
> +	pgdp = pgd_offset(patching_mm, patching_addr);
>   	if (unlikely(!pgdp))
>   		return -EINVAL;
>   
> -	pudp = pud_offset(pgdp, addr);
> +	pudp = pud_offset(pgdp, patching_addr);
>   	if (unlikely(!pudp))
>   		return -EINVAL;
>   
> -	pmdp = pmd_offset(pudp, addr);
> +	pmdp = pmd_offset(pudp, patching_addr);
>   	if (unlikely(!pmdp))
>   		return -EINVAL;
>   
> -	ptep = pte_offset_kernel(pmdp, addr);
> +	ptep = pte_offset_kernel(pmdp, patching_addr);

ptep should be stored in the patch_mapping struct instead of walking 
the page tables again.

>   	if (unlikely(!ptep))
>   		return -EINVAL;
>   
> -	pr_devel("clearing mm %p, pte %p, addr %lx\n", &init_mm, ptep, addr);
> +	/*
> +	 * In hash, pte_clear flushes the tlb
> +	 */
> +	pte_clear(patching_mm, patching_addr, ptep);
> +	unuse_temporary_mm(&patch_mapping->temp_mm);
>   
>   	/*
> -	 * In hash, pte_clear flushes the tlb, in radix, we have to
> +	 * In radix, we have to explicitly flush the tlb (no-op in hash)
>   	 */
> -	pte_clear(&init_mm, addr, ptep);
> -	flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
> +	local_flush_tlb_mm(patching_mm);
> +	pte_unmap_unlock(ptep, patch_mapping->ptl);
>   
>   	return 0;
>   }
> @@ -167,33 +148,38 @@ static int do_patch_instruction(unsigned int *addr, unsigned int instr)
>   	int err;
>   	unsigned int *patch_addr = NULL;
>   	unsigned long flags;
> -	unsigned long text_poke_addr;
> -	unsigned long kaddr = (unsigned long)addr;
> +	struct patch_mapping patch_mapping;
>   
>   	/*
> -	 * During early early boot patch_instruction is called
> -	 * when text_poke_area is not ready, but we still need
> -	 * to allow patching. We just do the plain old patching
> +	 * The patching_mm is initialized before calling mark_rodata_ro. Prior
> +	 * to this, patch_instruction is called when we don't have (and don't
> +	 * need) the patching_mm so just do plain old patching.
>   	 */
> -	if (!this_cpu_read(text_poke_area))
> +	if (!patching_mm)
>   		return raw_patch_instruction(addr, instr);
>   
>   	local_irq_save(flags);
>   
> -	text_poke_addr = (unsigned long)__this_cpu_read(text_poke_area)->addr;
> -	if (map_patch_area(addr, text_poke_addr)) {
> -		err = -1;
> +	err = map_patch(addr, &patch_mapping);
> +	if (err)
>   		goto out;
> -	}
>   
> -	patch_addr = (unsigned int *)(text_poke_addr) +
> -			((kaddr & ~PAGE_MASK) / sizeof(unsigned int));
> +	patch_addr = (unsigned int *)(patching_addr) +
> +			(offset_in_page((unsigned long)addr) /
> +				sizeof(unsigned int));
>   
>   	__patch_instruction(addr, instr, patch_addr);

The error returned by __patch_instruction() should be handled.

>   
> -	err = unmap_patch_area(text_poke_addr);
> +	err = unmap_patch(&patch_mapping);
>   	if (err)
> -		pr_warn("failed to unmap %lx\n", text_poke_addr);
> +		pr_warn("unmap patch: failed to unmap patch\n");
> +
> +	/*
> +	 * Something is wrong if what we just wrote doesn't match what we
> +	 * think we just wrote.
> +	 * XXX: BUG_ON() instead?

No, not a BUG_ON(). If patching fails, that's not a vital fault; we can 
fail gracefully. You should return an error instead.

> +	 */
> +	WARN_ON(memcmp(addr, &instr, sizeof(instr)));

Come on. addr is an int *, instr is an int. By doing a memcmp() on 
&instr, you force the compiler to write instr onto the stack, whereas local 
vars are mainly kept in registers on RISC processors like powerpc. The 
following should do it:

	WARN_ON(*addr != instr);

>   
>   out:
>   	local_irq_restore(flags);
> 

Christophe
Christopher M. Riedl April 15, 2020, 5:11 a.m. UTC | #2
> On March 24, 2020 11:25 AM Christophe Leroy <christophe.leroy@c-s.fr> wrote:
> 
>  
> Le 23/03/2020 à 05:52, Christopher M. Riedl a écrit :
> > Currently, code patching a STRICT_KERNEL_RWX exposes the temporary
> > mappings to other CPUs. These mappings should be kept local to the CPU
> > doing the patching. Use the pre-initialized temporary mm and patching
> > address for this purpose. Also add a check after patching to ensure the
> > patch succeeded.
> > 
> > Based on x86 implementation:
> > 
> > commit b3fd8e83ada0
> > ("x86/alternatives: Use temporary mm for text poking")
> > 
> > Signed-off-by: Christopher M. Riedl <cmr@informatik.wtf>
> > ---
> >   arch/powerpc/lib/code-patching.c | 128 ++++++++++++++-----------------
> >   1 file changed, 57 insertions(+), 71 deletions(-)
> > 
> > diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c
> > index 18b88ecfc5a8..f156132e8975 100644
> > --- a/arch/powerpc/lib/code-patching.c
> > +++ b/arch/powerpc/lib/code-patching.c
> > @@ -19,6 +19,7 @@
> >   #include <asm/page.h>
> >   #include <asm/code-patching.h>
> >   #include <asm/setup.h>
> > +#include <asm/mmu_context.h>
> >   
> >   static int __patch_instruction(unsigned int *exec_addr, unsigned int instr,
> >   			       unsigned int *patch_addr)
> > @@ -65,99 +66,79 @@ void __init poking_init(void)
> >   	pte_unmap_unlock(ptep, ptl);
> >   }
> >   
> > -static DEFINE_PER_CPU(struct vm_struct *, text_poke_area);
> > -
> > -static int text_area_cpu_up(unsigned int cpu)
> > -{
> > -	struct vm_struct *area;
> > -
> > -	area = get_vm_area(PAGE_SIZE, VM_ALLOC);
> > -	if (!area) {
> > -		WARN_ONCE(1, "Failed to create text area for cpu %d\n",
> > -			cpu);
> > -		return -1;
> > -	}
> > -	this_cpu_write(text_poke_area, area);
> > -
> > -	return 0;
> > -}
> > -
> > -static int text_area_cpu_down(unsigned int cpu)
> > -{
> > -	free_vm_area(this_cpu_read(text_poke_area));
> > -	return 0;
> > -}
> > -
> > -/*
> > - * Run as a late init call. This allows all the boot time patching to be done
> > - * simply by patching the code, and then we're called here prior to
> > - * mark_rodata_ro(), which happens after all init calls are run. Although
> > - * BUG_ON() is rude, in this case it should only happen if ENOMEM, and we judge
> > - * it as being preferable to a kernel that will crash later when someone tries
> > - * to use patch_instruction().
> > - */
> > -static int __init setup_text_poke_area(void)
> > -{
> > -	BUG_ON(!cpuhp_setup_state(CPUHP_AP_ONLINE_DYN,
> > -		"powerpc/text_poke:online", text_area_cpu_up,
> > -		text_area_cpu_down));
> > -
> > -	return 0;
> > -}
> > -late_initcall(setup_text_poke_area);
> > +struct patch_mapping {
> > +	spinlock_t *ptl; /* for protecting pte table */
> > +	struct temp_mm temp_mm;
> > +};
> >   
> >   /*
> >    * This can be called for kernel text or a module.
> >    */
> > -static int map_patch_area(void *addr, unsigned long text_poke_addr)
> > +static int map_patch(const void *addr, struct patch_mapping *patch_mapping)
> 
> Why change the name ?
> 

It's not really an "area" anymore.

> >   {
> > -	unsigned long pfn;
> > -	int err;
> > +	struct page *page;
> > +	pte_t pte, *ptep;
> > +	pgprot_t pgprot;
> >   
> >   	if (is_vmalloc_addr(addr))
> > -		pfn = vmalloc_to_pfn(addr);
> > +		page = vmalloc_to_page(addr);
> >   	else
> > -		pfn = __pa_symbol(addr) >> PAGE_SHIFT;
> > +		page = virt_to_page(addr);
> >   
> > -	err = map_kernel_page(text_poke_addr, (pfn << PAGE_SHIFT), PAGE_KERNEL);
> > +	if (radix_enabled())
> > +		pgprot = __pgprot(pgprot_val(PAGE_KERNEL));
> > +	else
> > +		pgprot = PAGE_SHARED;
> 
> Can you explain the difference between radix and non radix ?
> 
> Why PAGE_KERNEL for a page that is mapped in userspace ?
> 
> Why do you need to do __pgprot(pgprot_val(PAGE_KERNEL)) instead of just 
> using PAGE_KERNEL ?
> 

On hash there is a manual check which prevents setting _PAGE_PRIVILEGED for
kernel to userspace access in __hash_page - hence we cannot access the mapping
if the page is mapped PAGE_KERNEL on hash. However, I would like to use
PAGE_KERNEL here as well and am working on understanding why this check is
done in hash and if this can change. On radix this works just fine.

The page is mapped PAGE_KERNEL because the address is technically a userspace
address - but only to keep the mapping local to this CPU doing the patching.
PAGE_KERNEL makes it clear both in intent and protection that this is a kernel
mapping.

I think the correct way is pgprot_val(PAGE_KERNEL) since PAGE_KERNEL is defined
as:

#define PAGE_KERNEL	__pgprot(_PAGE_BASE | _PAGE_KERNEL_RW)

and __pgprot() is defined as:

typedef struct { unsigned long pgprot; } pgprot_t;
#define pgprot_val(x)   ((x).pgprot)
#define __pgprot(x)     ((pgprot_t) { (x) })

> >   
> > -	pr_devel("Mapped addr %lx with pfn %lx:%d\n", text_poke_addr, pfn, err);
> > -	if (err)
> > +	ptep = get_locked_pte(patching_mm, patching_addr, &patch_mapping->ptl);
> > +	if (unlikely(!ptep)) {
> > +		pr_warn("map patch: failed to allocate pte for patching\n");
> >   		return -1;
> > +	}
> > +
> > +	pte = mk_pte(page, pgprot);
> > +	set_pte_at(patching_mm, patching_addr, ptep, pte);
> > +
> > +	init_temp_mm(&patch_mapping->temp_mm, patching_mm);
> > +	use_temporary_mm(&patch_mapping->temp_mm);
> >   
> >   	return 0;
> >   }
> >   
> > -static inline int unmap_patch_area(unsigned long addr)
> > +static int unmap_patch(struct patch_mapping *patch_mapping)
> >   {
> >   	pte_t *ptep;
> >   	pmd_t *pmdp;
> >   	pud_t *pudp;
> >   	pgd_t *pgdp;
> >   
> > -	pgdp = pgd_offset_k(addr);
> > +	pgdp = pgd_offset(patching_mm, patching_addr);
> >   	if (unlikely(!pgdp))
> >   		return -EINVAL;
> >   
> > -	pudp = pud_offset(pgdp, addr);
> > +	pudp = pud_offset(pgdp, patching_addr);
> >   	if (unlikely(!pudp))
> >   		return -EINVAL;
> >   
> > -	pmdp = pmd_offset(pudp, addr);
> > +	pmdp = pmd_offset(pudp, patching_addr);
> >   	if (unlikely(!pmdp))
> >   		return -EINVAL;
> >   
> > -	ptep = pte_offset_kernel(pmdp, addr);
> > +	ptep = pte_offset_kernel(pmdp, patching_addr);
> 
> ptep should be stored in the patch_mapping struct instead of walking 
> again the page tables.
> 

Oh yes - this will be in the next version.

> >   	if (unlikely(!ptep))
> >   		return -EINVAL;
> >   
> > -	pr_devel("clearing mm %p, pte %p, addr %lx\n", &init_mm, ptep, addr);
> > +	/*
> > +	 * In hash, pte_clear flushes the tlb
> > +	 */
> > +	pte_clear(patching_mm, patching_addr, ptep);
> > +	unuse_temporary_mm(&patch_mapping->temp_mm);
> >   
> >   	/*
> > -	 * In hash, pte_clear flushes the tlb, in radix, we have to
> > +	 * In radix, we have to explicitly flush the tlb (no-op in hash)
> >   	 */
> > -	pte_clear(&init_mm, addr, ptep);
> > -	flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
> > +	local_flush_tlb_mm(patching_mm);
> > +	pte_unmap_unlock(ptep, patch_mapping->ptl);
> >   
> >   	return 0;
> >   }
> > @@ -167,33 +148,38 @@ static int do_patch_instruction(unsigned int *addr, unsigned int instr)
> >   	int err;
> >   	unsigned int *patch_addr = NULL;
> >   	unsigned long flags;
> > -	unsigned long text_poke_addr;
> > -	unsigned long kaddr = (unsigned long)addr;
> > +	struct patch_mapping patch_mapping;
> >   
> >   	/*
> > -	 * During early early boot patch_instruction is called
> > -	 * when text_poke_area is not ready, but we still need
> > -	 * to allow patching. We just do the plain old patching
> > +	 * The patching_mm is initialized before calling mark_rodata_ro. Prior
> > +	 * to this, patch_instruction is called when we don't have (and don't
> > +	 * need) the patching_mm so just do plain old patching.
> >   	 */
> > -	if (!this_cpu_read(text_poke_area))
> > +	if (!patching_mm)
> >   		return raw_patch_instruction(addr, instr);
> >   
> >   	local_irq_save(flags);
> >   
> > -	text_poke_addr = (unsigned long)__this_cpu_read(text_poke_area)->addr;
> > -	if (map_patch_area(addr, text_poke_addr)) {
> > -		err = -1;
> > +	err = map_patch(addr, &patch_mapping);
> > +	if (err)
> >   		goto out;
> > -	}
> >   
> > -	patch_addr = (unsigned int *)(text_poke_addr) +
> > -			((kaddr & ~PAGE_MASK) / sizeof(unsigned int));
> > +	patch_addr = (unsigned int *)(patching_addr) +
> > +			(offset_in_page((unsigned long)addr) /
> > +				sizeof(unsigned int));
> >   
> >   	__patch_instruction(addr, instr, patch_addr);
> 
> The error returned by __patch_instruction() should be managed.
> 

Agreed, will do something in the next spin.

> >   
> > -	err = unmap_patch_area(text_poke_addr);
> > +	err = unmap_patch(&patch_mapping);
> >   	if (err)
> > -		pr_warn("failed to unmap %lx\n", text_poke_addr);
> > +		pr_warn("unmap patch: failed to unmap patch\n");
> > +
> > +	/*
> > +	 * Something is wrong if what we just wrote doesn't match what we
> > +	 * think we just wrote.
> > +	 * XXX: BUG_ON() instead?
> 
> No, not a BUG_ON(). If patching fails, that's no a vital fault, we can 
> fail gracefully. You should return a fault instead.
> 

Yup - will make these changes in the next version.

> > +	 */
> > +	WARN_ON(memcmp(addr, &instr, sizeof(instr)));
> 
> Come on. addr is an *int, instr is an int. By doing a memcmp() on 
> &instr, you for the compiler to write instr into the stack whereas local 
> vars are mainly in registers on RISC processors like powerpc. Following 
> should do it:
> 
> 	WARN_ON(*addr != instr);
> 

Oh man - I agree, that's just embarrassing.
Appreciate your feedback on this RFC series, thanks!

> >   
> >   out:
> >   	local_irq_restore(flags);
> > 
> 
> Christophe
Christophe Leroy April 15, 2020, 8:45 a.m. UTC | #3
Le 15/04/2020 à 07:11, Christopher M Riedl a écrit :
>> On March 24, 2020 11:25 AM Christophe Leroy <christophe.leroy@c-s.fr> wrote:
>>
>>   
>> Le 23/03/2020 à 05:52, Christopher M. Riedl a écrit :
>>> Currently, code patching a STRICT_KERNEL_RWX exposes the temporary
>>> mappings to other CPUs. These mappings should be kept local to the CPU
>>> doing the patching. Use the pre-initialized temporary mm and patching
>>> address for this purpose. Also add a check after patching to ensure the
>>> patch succeeded.
>>>
>>> Based on x86 implementation:
>>>
>>> commit b3fd8e83ada0
>>> ("x86/alternatives: Use temporary mm for text poking")
>>>
>>> Signed-off-by: Christopher M. Riedl <cmr@informatik.wtf>
>>> ---
>>>    arch/powerpc/lib/code-patching.c | 128 ++++++++++++++-----------------
>>>    1 file changed, 57 insertions(+), 71 deletions(-)
>>>
>>> diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c
>>> index 18b88ecfc5a8..f156132e8975 100644
>>> --- a/arch/powerpc/lib/code-patching.c
>>> +++ b/arch/powerpc/lib/code-patching.c
>>> @@ -19,6 +19,7 @@
>>>    #include <asm/page.h>
>>>    #include <asm/code-patching.h>
>>>    #include <asm/setup.h>
>>> +#include <asm/mmu_context.h>
>>>    
>>>    static int __patch_instruction(unsigned int *exec_addr, unsigned int instr,
>>>    			       unsigned int *patch_addr)
>>> @@ -65,99 +66,79 @@ void __init poking_init(void)
>>>    	pte_unmap_unlock(ptep, ptl);
>>>    }
>>>    
>>> -static DEFINE_PER_CPU(struct vm_struct *, text_poke_area);
>>> -
>>> -static int text_area_cpu_up(unsigned int cpu)
>>> -{
>>> -	struct vm_struct *area;
>>> -
>>> -	area = get_vm_area(PAGE_SIZE, VM_ALLOC);
>>> -	if (!area) {
>>> -		WARN_ONCE(1, "Failed to create text area for cpu %d\n",
>>> -			cpu);
>>> -		return -1;
>>> -	}
>>> -	this_cpu_write(text_poke_area, area);
>>> -
>>> -	return 0;
>>> -}
>>> -
>>> -static int text_area_cpu_down(unsigned int cpu)
>>> -{
>>> -	free_vm_area(this_cpu_read(text_poke_area));
>>> -	return 0;
>>> -}
>>> -
>>> -/*
>>> - * Run as a late init call. This allows all the boot time patching to be done
>>> - * simply by patching the code, and then we're called here prior to
>>> - * mark_rodata_ro(), which happens after all init calls are run. Although
>>> - * BUG_ON() is rude, in this case it should only happen if ENOMEM, and we judge
>>> - * it as being preferable to a kernel that will crash later when someone tries
>>> - * to use patch_instruction().
>>> - */
>>> -static int __init setup_text_poke_area(void)
>>> -{
>>> -	BUG_ON(!cpuhp_setup_state(CPUHP_AP_ONLINE_DYN,
>>> -		"powerpc/text_poke:online", text_area_cpu_up,
>>> -		text_area_cpu_down));
>>> -
>>> -	return 0;
>>> -}
>>> -late_initcall(setup_text_poke_area);
>>> +struct patch_mapping {
>>> +	spinlock_t *ptl; /* for protecting pte table */
>>> +	struct temp_mm temp_mm;
>>> +};
>>>    
>>>    /*
>>>     * This can be called for kernel text or a module.
>>>     */
>>> -static int map_patch_area(void *addr, unsigned long text_poke_addr)
>>> +static int map_patch(const void *addr, struct patch_mapping *patch_mapping)
>>
>> Why change the name ?
>>
> 
> It's not really an "area" anymore.
> 
>>>    {
>>> -	unsigned long pfn;
>>> -	int err;
>>> +	struct page *page;
>>> +	pte_t pte, *ptep;
>>> +	pgprot_t pgprot;
>>>    
>>>    	if (is_vmalloc_addr(addr))
>>> -		pfn = vmalloc_to_pfn(addr);
>>> +		page = vmalloc_to_page(addr);
>>>    	else
>>> -		pfn = __pa_symbol(addr) >> PAGE_SHIFT;
>>> +		page = virt_to_page(addr);
>>>    
>>> -	err = map_kernel_page(text_poke_addr, (pfn << PAGE_SHIFT), PAGE_KERNEL);
>>> +	if (radix_enabled())
>>> +		pgprot = __pgprot(pgprot_val(PAGE_KERNEL));
>>> +	else
>>> +		pgprot = PAGE_SHARED;
>>
>> Can you explain the difference between radix and non radix ?
>>
>> Why PAGE_KERNEL for a page that is mapped in userspace ?
>>
>> Why do you need to do __pgprot(pgprot_val(PAGE_KERNEL)) instead of just
>> using PAGE_KERNEL ?
>>
> 
> On hash there is a manual check which prevents setting _PAGE_PRIVILEGED for
> kernel to userspace access in __hash_page - hence we cannot access the mapping
> if the page is mapped PAGE_KERNEL on hash. However, I would like to use
> PAGE_KERNEL here as well and am working on understanding why this check is
> done in hash and if this can change. On radix this works just fine.
> 
> The page is mapped PAGE_KERNEL because the address is technically a userspace
> address - but only to keep the mapping local to this CPU doing the patching.
> PAGE_KERNEL makes it clear both in intent and protection that this is a kernel
> mapping.
> 
> I think the correct way is pgprot_val(PAGE_KERNEL) since PAGE_KERNEL is defined
> as:
> 
> #define PAGE_KERNEL	__pgprot(_PAGE_BASE | _PAGE_KERNEL_RW)
> 
> and __pgprot() is defined as:
> 
> typedef struct { unsigned long pgprot; } pgprot_t;
> #define pgprot_val(x)   ((x).pgprot)
> #define __pgprot(x)     ((pgprot_t) { (x) })


Yes, so:
	pgprot_val(__pgprot(x)) == x


You do:

	pgprot = __pgprot(pgprot_val(PAGE_KERNEL));

Which is:

	pgprot = __pgprot(pgprot_val(__pgprot(_PAGE_BASE | _PAGE_KERNEL_RW)));

Which is equivalent to:

	pgprot = __pgprot(_PAGE_BASE | _PAGE_KERNEL_RW);

So at the end it should simply be:

	pgprot = PAGE_KERNEL;




Christophe
Christopher M. Riedl April 15, 2020, 4:24 p.m. UTC | #4
> On April 15, 2020 3:45 AM Christophe Leroy <christophe.leroy@c-s.fr> wrote:
> 
>  
> Le 15/04/2020 à 07:11, Christopher M Riedl a écrit :
> >> On March 24, 2020 11:25 AM Christophe Leroy <christophe.leroy@c-s.fr> wrote:
> >>
> >>   
> >> Le 23/03/2020 à 05:52, Christopher M. Riedl a écrit :
> >>> Currently, code patching a STRICT_KERNEL_RWX exposes the temporary
> >>> mappings to other CPUs. These mappings should be kept local to the CPU
> >>> doing the patching. Use the pre-initialized temporary mm and patching
> >>> address for this purpose. Also add a check after patching to ensure the
> >>> patch succeeded.
> >>>
> >>> Based on x86 implementation:
> >>>
> >>> commit b3fd8e83ada0
> >>> ("x86/alternatives: Use temporary mm for text poking")
> >>>
> >>> Signed-off-by: Christopher M. Riedl <cmr@informatik.wtf>
> >>> ---
> >>>    arch/powerpc/lib/code-patching.c | 128 ++++++++++++++-----------------
> >>>    1 file changed, 57 insertions(+), 71 deletions(-)
> >>>
> >>> diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c
> >>> index 18b88ecfc5a8..f156132e8975 100644
> >>> --- a/arch/powerpc/lib/code-patching.c
> >>> +++ b/arch/powerpc/lib/code-patching.c
> >>> @@ -19,6 +19,7 @@
> >>>    #include <asm/page.h>
> >>>    #include <asm/code-patching.h>
> >>>    #include <asm/setup.h>
> >>> +#include <asm/mmu_context.h>
> >>>    
> >>>    static int __patch_instruction(unsigned int *exec_addr, unsigned int instr,
> >>>    			       unsigned int *patch_addr)
> >>> @@ -65,99 +66,79 @@ void __init poking_init(void)
> >>>    	pte_unmap_unlock(ptep, ptl);
> >>>    }
> >>>    
> >>> -static DEFINE_PER_CPU(struct vm_struct *, text_poke_area);
> >>> -
> >>> -static int text_area_cpu_up(unsigned int cpu)
> >>> -{
> >>> -	struct vm_struct *area;
> >>> -
> >>> -	area = get_vm_area(PAGE_SIZE, VM_ALLOC);
> >>> -	if (!area) {
> >>> -		WARN_ONCE(1, "Failed to create text area for cpu %d\n",
> >>> -			cpu);
> >>> -		return -1;
> >>> -	}
> >>> -	this_cpu_write(text_poke_area, area);
> >>> -
> >>> -	return 0;
> >>> -}
> >>> -
> >>> -static int text_area_cpu_down(unsigned int cpu)
> >>> -{
> >>> -	free_vm_area(this_cpu_read(text_poke_area));
> >>> -	return 0;
> >>> -}
> >>> -
> >>> -/*
> >>> - * Run as a late init call. This allows all the boot time patching to be done
> >>> - * simply by patching the code, and then we're called here prior to
> >>> - * mark_rodata_ro(), which happens after all init calls are run. Although
> >>> - * BUG_ON() is rude, in this case it should only happen if ENOMEM, and we judge
> >>> - * it as being preferable to a kernel that will crash later when someone tries
> >>> - * to use patch_instruction().
> >>> - */
> >>> -static int __init setup_text_poke_area(void)
> >>> -{
> >>> -	BUG_ON(!cpuhp_setup_state(CPUHP_AP_ONLINE_DYN,
> >>> -		"powerpc/text_poke:online", text_area_cpu_up,
> >>> -		text_area_cpu_down));
> >>> -
> >>> -	return 0;
> >>> -}
> >>> -late_initcall(setup_text_poke_area);
> >>> +struct patch_mapping {
> >>> +	spinlock_t *ptl; /* for protecting pte table */
> >>> +	struct temp_mm temp_mm;
> >>> +};
> >>>    
> >>>    /*
> >>>     * This can be called for kernel text or a module.
> >>>     */
> >>> -static int map_patch_area(void *addr, unsigned long text_poke_addr)
> >>> +static int map_patch(const void *addr, struct patch_mapping *patch_mapping)
> >>
> >> Why change the name ?
> >>
> > 
> > It's not really an "area" anymore.
> > 
> >>>    {
> >>> -	unsigned long pfn;
> >>> -	int err;
> >>> +	struct page *page;
> >>> +	pte_t pte, *ptep;
> >>> +	pgprot_t pgprot;
> >>>    
> >>>    	if (is_vmalloc_addr(addr))
> >>> -		pfn = vmalloc_to_pfn(addr);
> >>> +		page = vmalloc_to_page(addr);
> >>>    	else
> >>> -		pfn = __pa_symbol(addr) >> PAGE_SHIFT;
> >>> +		page = virt_to_page(addr);
> >>>    
> >>> -	err = map_kernel_page(text_poke_addr, (pfn << PAGE_SHIFT), PAGE_KERNEL);
> >>> +	if (radix_enabled())
> >>> +		pgprot = __pgprot(pgprot_val(PAGE_KERNEL));
> >>> +	else
> >>> +		pgprot = PAGE_SHARED;
> >>
> >> Can you explain the difference between radix and non radix ?
> >>
> >> Why PAGE_KERNEL for a page that is mapped in userspace ?
> >>
> >> Why do you need to do __pgprot(pgprot_val(PAGE_KERNEL)) instead of just
> >> using PAGE_KERNEL ?
> >>
> > 
> > On hash there is a manual check which prevents setting _PAGE_PRIVILEGED for
> > kernel to userspace access in __hash_page - hence we cannot access the mapping
> > if the page is mapped PAGE_KERNEL on hash. However, I would like to use
> > PAGE_KERNEL here as well and am working on understanding why this check is
> > done in hash and if this can change. On radix this works just fine.
> > 
> > The page is mapped PAGE_KERNEL because the address is technically a userspace
> > address - but only to keep the mapping local to this CPU doing the patching.
> > PAGE_KERNEL makes it clear both in intent and protection that this is a kernel
> > mapping.
> > 
> > I think the correct way is pgprot_val(PAGE_KERNEL) since PAGE_KERNEL is defined
> > as:
> > 
> > #define PAGE_KERNEL	__pgprot(_PAGE_BASE | _PAGE_KERNEL_RW)
> > 
> > and __pgprot() is defined as:
> > 
> > typedef struct { unsigned long pgprot; } pgprot_t;
> > #define pgprot_val(x)   ((x).pgprot)
> > #define __pgprot(x)     ((pgprot_t) { (x) })
> 
> 
> Yes, so:
> 	pgprot_val(__pgprot(x)) == x
> 
> 
> You do:
> 
> 	pgprot = __pgprot(pgprot_val(PAGE_KERNEL));
> 
> Which is:
> 
> 	pgprot = __pgprot(pgprot_val(__pgprot(_PAGE_BASE | _PAGE_KERNEL_RW)));
> 
> Which is equivalent to:
> 
> 	pgprot = __pgprot(_PAGE_BASE | _PAGE_KERNEL_RW);
> 
> So at the end it should simply be:
> 
> 	pgprot = PAGE_KERNEL;
> 

Yes you're correct. Picking this up in the next spin.

> 
> 
> 
> Christophe
diff mbox series

Patch

diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c
index 18b88ecfc5a8..f156132e8975 100644
--- a/arch/powerpc/lib/code-patching.c
+++ b/arch/powerpc/lib/code-patching.c
@@ -19,6 +19,7 @@ 
 #include <asm/page.h>
 #include <asm/code-patching.h>
 #include <asm/setup.h>
+#include <asm/mmu_context.h>
 
 static int __patch_instruction(unsigned int *exec_addr, unsigned int instr,
 			       unsigned int *patch_addr)
@@ -65,99 +66,79 @@  void __init poking_init(void)
 	pte_unmap_unlock(ptep, ptl);
 }
 
-static DEFINE_PER_CPU(struct vm_struct *, text_poke_area);
-
-static int text_area_cpu_up(unsigned int cpu)
-{
-	struct vm_struct *area;
-
-	area = get_vm_area(PAGE_SIZE, VM_ALLOC);
-	if (!area) {
-		WARN_ONCE(1, "Failed to create text area for cpu %d\n",
-			cpu);
-		return -1;
-	}
-	this_cpu_write(text_poke_area, area);
-
-	return 0;
-}
-
-static int text_area_cpu_down(unsigned int cpu)
-{
-	free_vm_area(this_cpu_read(text_poke_area));
-	return 0;
-}
-
-/*
- * Run as a late init call. This allows all the boot time patching to be done
- * simply by patching the code, and then we're called here prior to
- * mark_rodata_ro(), which happens after all init calls are run. Although
- * BUG_ON() is rude, in this case it should only happen if ENOMEM, and we judge
- * it as being preferable to a kernel that will crash later when someone tries
- * to use patch_instruction().
- */
-static int __init setup_text_poke_area(void)
-{
-	BUG_ON(!cpuhp_setup_state(CPUHP_AP_ONLINE_DYN,
-		"powerpc/text_poke:online", text_area_cpu_up,
-		text_area_cpu_down));
-
-	return 0;
-}
-late_initcall(setup_text_poke_area);
+struct patch_mapping {
+	spinlock_t *ptl; /* for protecting pte table */
+	struct temp_mm temp_mm;
+};
 
 /*
  * This can be called for kernel text or a module.
  */
-static int map_patch_area(void *addr, unsigned long text_poke_addr)
+static int map_patch(const void *addr, struct patch_mapping *patch_mapping)
 {
-	unsigned long pfn;
-	int err;
+	struct page *page;
+	pte_t pte, *ptep;
+	pgprot_t pgprot;
 
 	if (is_vmalloc_addr(addr))
-		pfn = vmalloc_to_pfn(addr);
+		page = vmalloc_to_page(addr);
 	else
-		pfn = __pa_symbol(addr) >> PAGE_SHIFT;
+		page = virt_to_page(addr);
 
-	err = map_kernel_page(text_poke_addr, (pfn << PAGE_SHIFT), PAGE_KERNEL);
+	if (radix_enabled())
+		pgprot = __pgprot(pgprot_val(PAGE_KERNEL));
+	else
+		pgprot = PAGE_SHARED;
 
-	pr_devel("Mapped addr %lx with pfn %lx:%d\n", text_poke_addr, pfn, err);
-	if (err)
+	ptep = get_locked_pte(patching_mm, patching_addr, &patch_mapping->ptl);
+	if (unlikely(!ptep)) {
+		pr_warn("map patch: failed to allocate pte for patching\n");
 		return -1;
+	}
+
+	pte = mk_pte(page, pgprot);
+	set_pte_at(patching_mm, patching_addr, ptep, pte);
+
+	init_temp_mm(&patch_mapping->temp_mm, patching_mm);
+	use_temporary_mm(&patch_mapping->temp_mm);
 
 	return 0;
 }
 
-static inline int unmap_patch_area(unsigned long addr)
+static int unmap_patch(struct patch_mapping *patch_mapping)
 {
 	pte_t *ptep;
 	pmd_t *pmdp;
 	pud_t *pudp;
 	pgd_t *pgdp;
 
-	pgdp = pgd_offset_k(addr);
+	pgdp = pgd_offset(patching_mm, patching_addr);
 	if (unlikely(!pgdp))
 		return -EINVAL;
 
-	pudp = pud_offset(pgdp, addr);
+	pudp = pud_offset(pgdp, patching_addr);
 	if (unlikely(!pudp))
 		return -EINVAL;
 
-	pmdp = pmd_offset(pudp, addr);
+	pmdp = pmd_offset(pudp, patching_addr);
 	if (unlikely(!pmdp))
 		return -EINVAL;
 
-	ptep = pte_offset_kernel(pmdp, addr);
+	ptep = pte_offset_kernel(pmdp, patching_addr);
 	if (unlikely(!ptep))
 		return -EINVAL;
 
-	pr_devel("clearing mm %p, pte %p, addr %lx\n", &init_mm, ptep, addr);
+	/*
+	 * In hash, pte_clear flushes the tlb
+	 */
+	pte_clear(patching_mm, patching_addr, ptep);
+	unuse_temporary_mm(&patch_mapping->temp_mm);
 
 	/*
-	 * In hash, pte_clear flushes the tlb, in radix, we have to
+	 * In radix, we have to explicitly flush the tlb (no-op in hash)
 	 */
-	pte_clear(&init_mm, addr, ptep);
-	flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
+	local_flush_tlb_mm(patching_mm);
+	pte_unmap_unlock(ptep, patch_mapping->ptl);
 
 	return 0;
 }
@@ -167,33 +148,38 @@  static int do_patch_instruction(unsigned int *addr, unsigned int instr)
 	int err;
 	unsigned int *patch_addr = NULL;
 	unsigned long flags;
-	unsigned long text_poke_addr;
-	unsigned long kaddr = (unsigned long)addr;
+	struct patch_mapping patch_mapping;
 
 	/*
-	 * During early early boot patch_instruction is called
-	 * when text_poke_area is not ready, but we still need
-	 * to allow patching. We just do the plain old patching
+	 * The patching_mm is initialized before calling mark_rodata_ro. Prior
+	 * to this, patch_instruction is called when we don't have (and don't
+	 * need) the patching_mm so just do plain old patching.
 	 */
-	if (!this_cpu_read(text_poke_area))
+	if (!patching_mm)
 		return raw_patch_instruction(addr, instr);
 
 	local_irq_save(flags);
 
-	text_poke_addr = (unsigned long)__this_cpu_read(text_poke_area)->addr;
-	if (map_patch_area(addr, text_poke_addr)) {
-		err = -1;
+	err = map_patch(addr, &patch_mapping);
+	if (err)
 		goto out;
-	}
 
-	patch_addr = (unsigned int *)(text_poke_addr) +
-			((kaddr & ~PAGE_MASK) / sizeof(unsigned int));
+	patch_addr = (unsigned int *)(patching_addr) +
+			(offset_in_page((unsigned long)addr) /
+				sizeof(unsigned int));
 
 	__patch_instruction(addr, instr, patch_addr);
 
-	err = unmap_patch_area(text_poke_addr);
+	err = unmap_patch(&patch_mapping);
 	if (err)
-		pr_warn("failed to unmap %lx\n", text_poke_addr);
+		pr_warn("unmap patch: failed to unmap patch\n");
+
+	/*
+	 * Something is wrong if what we just wrote doesn't match what we
+	 * think we just wrote.
+	 * XXX: BUG_ON() instead?
+	 */
+	WARN_ON(memcmp(addr, &instr, sizeof(instr)));
 
 out:
 	local_irq_restore(flags);