
[RESEND,v4,05/11] powerpc/64s: Add ability to skip SLB preload

Message ID 20210506043452.9674-6-cmr@linux.ibm.com (mailing list archive)
State Superseded
Series Use per-CPU temporary mappings for patching

Checks

Context                      | Check   | Description
snowpatch_ozlabs/apply_patch | success | Successfully applied on branch powerpc/merge (7619d98e5041d5c25aba5428704dba6121237a9a)
snowpatch_ozlabs/checkpatch  | success | total: 0 errors, 0 warnings, 0 checks, 107 lines checked
snowpatch_ozlabs/needsstable | success | Patch has no Fixes tags

Commit Message

Christopher M. Riedl May 6, 2021, 4:34 a.m. UTC
Switching to a different mm with Hash translation causes SLB entries to
be preloaded from the current thread_info. This reduces SLB faults, for
example when threads share a common mm but operate on different address
ranges.

Preloading entries from the thread_info struct may not always be
appropriate - such as when switching to a temporary mm. Introduce a new
boolean in mm_context_t to skip the SLB preload entirely. Also move the
SLB preload code into a separate function since switch_slb() is already
quite long. The default behavior (preloading SLB entries from the
current thread_info struct) remains unchanged.

Signed-off-by: Christopher M. Riedl <cmr@linux.ibm.com>

---

v4:  * New to series.
---
 arch/powerpc/include/asm/book3s/64/mmu.h |  3 ++
 arch/powerpc/include/asm/mmu_context.h   | 13 ++++++
 arch/powerpc/mm/book3s64/mmu_context.c   |  2 +
 arch/powerpc/mm/book3s64/slb.c           | 56 ++++++++++++++----------
 4 files changed, 50 insertions(+), 24 deletions(-)
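
As context for readers, here is a minimal caller sketch (not part of this patch) showing how a temporary-mm user would opt out of preloading with the new helper. The function name is hypothetical and copy_init_mm() is only assumed to be available in this tree; only skip_slb_preload_mm() comes from the patch itself.

/*
 * Hypothetical caller: create a temporary mm whose SLB entries must
 * never be preloaded from an unrelated thread_info cache.
 */
static struct mm_struct *alloc_temp_patching_mm(void)
{
	struct mm_struct *mm = copy_init_mm();	/* assumed helper */

	if (!mm)
		return NULL;

	/* Skip thread_info-based SLB preloads in switch_slb(). */
	skip_slb_preload_mm(mm);

	return mm;
}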

Comments

Daniel Axtens June 21, 2021, 3:13 a.m. UTC | #1
"Christopher M. Riedl" <cmr@linux.ibm.com> writes:

> Switching to a different mm with Hash translation causes SLB entries to
> be preloaded from the current thread_info. This reduces SLB faults, for
> example when threads share a common mm but operate on different address
> ranges.
>
> Preloading entries from the thread_info struct may not always be
> appropriate - such as when switching to a temporary mm. Introduce a new
> boolean in mm_context_t to skip the SLB preload entirely. Also move the
> SLB preload code into a separate function since switch_slb() is already
> quite long. The default behavior (preloading SLB entries from the
> current thread_info struct) remains unchanged.
>
> Signed-off-by: Christopher M. Riedl <cmr@linux.ibm.com>
>
> ---
>
> v4:  * New to series.
> ---
>  arch/powerpc/include/asm/book3s/64/mmu.h |  3 ++
>  arch/powerpc/include/asm/mmu_context.h   | 13 ++++++
>  arch/powerpc/mm/book3s64/mmu_context.c   |  2 +
>  arch/powerpc/mm/book3s64/slb.c           | 56 ++++++++++++++----------
>  4 files changed, 50 insertions(+), 24 deletions(-)
>
> diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h b/arch/powerpc/include/asm/book3s/64/mmu.h
> index eace8c3f7b0a1..b23a9dcdee5af 100644
> --- a/arch/powerpc/include/asm/book3s/64/mmu.h
> +++ b/arch/powerpc/include/asm/book3s/64/mmu.h
> @@ -130,6 +130,9 @@ typedef struct {
>  	u32 pkey_allocation_map;
>  	s16 execute_only_pkey; /* key holding execute-only protection */
>  #endif
> +
> +	/* Do not preload SLB entries from thread_info during switch_slb() */
> +	bool skip_slb_preload;
>  } mm_context_t;
>  
>  static inline u16 mm_ctx_user_psize(mm_context_t *ctx)
> diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h
> index 4bc45d3ed8b0e..264787e90b1a1 100644
> --- a/arch/powerpc/include/asm/mmu_context.h
> +++ b/arch/powerpc/include/asm/mmu_context.h
> @@ -298,6 +298,19 @@ static inline int arch_dup_mmap(struct mm_struct *oldmm,
>  	return 0;
>  }
>  
> +#ifdef CONFIG_PPC_BOOK3S_64
> +
> +static inline void skip_slb_preload_mm(struct mm_struct *mm)
> +{
> +	mm->context.skip_slb_preload = true;
> +}
> +
> +#else
> +
> +static inline void skip_slb_preload_mm(struct mm_struct *mm) {}
> +
> +#endif /* CONFIG_PPC_BOOK3S_64 */
> +
>  #include <asm-generic/mmu_context.h>
>  
>  #endif /* __KERNEL__ */
> diff --git a/arch/powerpc/mm/book3s64/mmu_context.c b/arch/powerpc/mm/book3s64/mmu_context.c
> index c10fc8a72fb37..3479910264c59 100644
> --- a/arch/powerpc/mm/book3s64/mmu_context.c
> +++ b/arch/powerpc/mm/book3s64/mmu_context.c
> @@ -202,6 +202,8 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
>  	atomic_set(&mm->context.active_cpus, 0);
>  	atomic_set(&mm->context.copros, 0);
>  
> +	mm->context.skip_slb_preload = false;
> +
>  	return 0;
>  }
>  
> diff --git a/arch/powerpc/mm/book3s64/slb.c b/arch/powerpc/mm/book3s64/slb.c
> index c91bd85eb90e3..da0836cb855af 100644
> --- a/arch/powerpc/mm/book3s64/slb.c
> +++ b/arch/powerpc/mm/book3s64/slb.c
> @@ -441,10 +441,39 @@ static void slb_cache_slbie_user(unsigned int index)
>  	asm volatile("slbie %0" : : "r" (slbie_data));
>  }
>  
> +static void preload_slb_entries(struct task_struct *tsk, struct mm_struct *mm)
Should this be explicitly inline or even __always_inline? I'm thinking
switch_slb is probably a fairly hot path on hash?

> +{
> +	struct thread_info *ti = task_thread_info(tsk);
> +	unsigned char i;
> +
> +	/*
> +	 * We gradually age out SLBs after a number of context switches to
> +	 * reduce reload overhead of unused entries (like we do with FP/VEC
> +	 * reload). Each time we wrap 256 switches, take an entry out of the
> +	 * SLB preload cache.
> +	 */
> +	tsk->thread.load_slb++;
> +	if (!tsk->thread.load_slb) {
> +		unsigned long pc = KSTK_EIP(tsk);
> +
> +		preload_age(ti);
> +		preload_add(ti, pc);
> +	}
> +
> +	for (i = 0; i < ti->slb_preload_nr; i++) {
> +		unsigned char idx;
> +		unsigned long ea;
> +
> +		idx = (ti->slb_preload_tail + i) % SLB_PRELOAD_NR;
> +		ea = (unsigned long)ti->slb_preload_esid[idx] << SID_SHIFT;
> +
> +		slb_allocate_user(mm, ea);
> +	}
> +}
> +
>  /* Flush all user entries from the segment table of the current processor. */
>  void switch_slb(struct task_struct *tsk, struct mm_struct *mm)
>  {
> -	struct thread_info *ti = task_thread_info(tsk);
>  	unsigned char i;
>  
>  	/*
> @@ -502,29 +531,8 @@ void switch_slb(struct task_struct *tsk, struct mm_struct *mm)
>  
>  	copy_mm_to_paca(mm);
>  
> -	/*
> -	 * We gradually age out SLBs after a number of context switches to
> -	 * reduce reload overhead of unused entries (like we do with FP/VEC
> -	 * reload). Each time we wrap 256 switches, take an entry out of the
> -	 * SLB preload cache.
> -	 */
> -	tsk->thread.load_slb++;
> -	if (!tsk->thread.load_slb) {
> -		unsigned long pc = KSTK_EIP(tsk);
> -
> -		preload_age(ti);
> -		preload_add(ti, pc);
> -	}
> -
> -	for (i = 0; i < ti->slb_preload_nr; i++) {
> -		unsigned char idx;
> -		unsigned long ea;
> -
> -		idx = (ti->slb_preload_tail + i) % SLB_PRELOAD_NR;
> -		ea = (unsigned long)ti->slb_preload_esid[idx] << SID_SHIFT;
> -
> -		slb_allocate_user(mm, ea);
> -	}
> +	if (!mm->context.skip_slb_preload)
> +		preload_slb_entries(tsk, mm);

Should this be wrapped in likely()?

>  
>  	/*
>  	 * Synchronize slbmte preloads with possible subsequent user memory

Right below this comment is the isync. It seems to be specifically
concerned with synchronising preloaded slbs. Do you need it if you are
skipping SLB preloads?

It's probably not a big deal to have an extra isync in the fairly rare
path when we're skipping preloads, but I thought I'd check.

Kind regards,
Daniel

> -- 
> 2.26.1
Christopher M. Riedl July 1, 2021, 3:48 a.m. UTC | #2
On Sun Jun 20, 2021 at 10:13 PM CDT, Daniel Axtens wrote:
> "Christopher M. Riedl" <cmr@linux.ibm.com> writes:
>
> > Switching to a different mm with Hash translation causes SLB entries to
> > be preloaded from the current thread_info. This reduces SLB faults, for
> > example when threads share a common mm but operate on different address
> > ranges.
> >
> > Preloading entries from the thread_info struct may not always be
> > appropriate - such as when switching to a temporary mm. Introduce a new
> > boolean in mm_context_t to skip the SLB preload entirely. Also move the
> > SLB preload code into a separate function since switch_slb() is already
> > quite long. The default behavior (preloading SLB entries from the
> > current thread_info struct) remains unchanged.
> >
> > Signed-off-by: Christopher M. Riedl <cmr@linux.ibm.com>
> >
> > ---
> >
> > v4:  * New to series.
> > ---
> >  arch/powerpc/include/asm/book3s/64/mmu.h |  3 ++
> >  arch/powerpc/include/asm/mmu_context.h   | 13 ++++++
> >  arch/powerpc/mm/book3s64/mmu_context.c   |  2 +
> >  arch/powerpc/mm/book3s64/slb.c           | 56 ++++++++++++++----------
> >  4 files changed, 50 insertions(+), 24 deletions(-)
> >
> > diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h b/arch/powerpc/include/asm/book3s/64/mmu.h
> > index eace8c3f7b0a1..b23a9dcdee5af 100644
> > --- a/arch/powerpc/include/asm/book3s/64/mmu.h
> > +++ b/arch/powerpc/include/asm/book3s/64/mmu.h
> > @@ -130,6 +130,9 @@ typedef struct {
> >  	u32 pkey_allocation_map;
> >  	s16 execute_only_pkey; /* key holding execute-only protection */
> >  #endif
> > +
> > +	/* Do not preload SLB entries from thread_info during switch_slb() */
> > +	bool skip_slb_preload;
> >  } mm_context_t;
> >  
> >  static inline u16 mm_ctx_user_psize(mm_context_t *ctx)
> > diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h
> > index 4bc45d3ed8b0e..264787e90b1a1 100644
> > --- a/arch/powerpc/include/asm/mmu_context.h
> > +++ b/arch/powerpc/include/asm/mmu_context.h
> > @@ -298,6 +298,19 @@ static inline int arch_dup_mmap(struct mm_struct *oldmm,
> >  	return 0;
> >  }
> >  
> > +#ifdef CONFIG_PPC_BOOK3S_64
> > +
> > +static inline void skip_slb_preload_mm(struct mm_struct *mm)
> > +{
> > +	mm->context.skip_slb_preload = true;
> > +}
> > +
> > +#else
> > +
> > +static inline void skip_slb_preload_mm(struct mm_struct *mm) {}
> > +
> > +#endif /* CONFIG_PPC_BOOK3S_64 */
> > +
> >  #include <asm-generic/mmu_context.h>
> >  
> >  #endif /* __KERNEL__ */
> > diff --git a/arch/powerpc/mm/book3s64/mmu_context.c b/arch/powerpc/mm/book3s64/mmu_context.c
> > index c10fc8a72fb37..3479910264c59 100644
> > --- a/arch/powerpc/mm/book3s64/mmu_context.c
> > +++ b/arch/powerpc/mm/book3s64/mmu_context.c
> > @@ -202,6 +202,8 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
> >  	atomic_set(&mm->context.active_cpus, 0);
> >  	atomic_set(&mm->context.copros, 0);
> >  
> > +	mm->context.skip_slb_preload = false;
> > +
> >  	return 0;
> >  }
> >  
> > diff --git a/arch/powerpc/mm/book3s64/slb.c b/arch/powerpc/mm/book3s64/slb.c
> > index c91bd85eb90e3..da0836cb855af 100644
> > --- a/arch/powerpc/mm/book3s64/slb.c
> > +++ b/arch/powerpc/mm/book3s64/slb.c
> > @@ -441,10 +441,39 @@ static void slb_cache_slbie_user(unsigned int index)
> >  	asm volatile("slbie %0" : : "r" (slbie_data));
> >  }
> >  
> > +static void preload_slb_entries(struct task_struct *tsk, struct mm_struct *mm)
> Should this be explicitly inline or even __always_inline? I'm thinking
> switch_slb is probably a fairly hot path on hash?

Yes absolutely. I'll make this change in v5.
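
A sketch of the intended v5 change (only the storage class changes; the
body stays exactly as in the hunk quoted above):

static __always_inline void preload_slb_entries(struct task_struct *tsk,
						struct mm_struct *mm)
{
	/* ... body unchanged from the quoted hunk ... */
}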

>
> > +{
> > +	struct thread_info *ti = task_thread_info(tsk);
> > +	unsigned char i;
> > +
> > +	/*
> > +	 * We gradually age out SLBs after a number of context switches to
> > +	 * reduce reload overhead of unused entries (like we do with FP/VEC
> > +	 * reload). Each time we wrap 256 switches, take an entry out of the
> > +	 * SLB preload cache.
> > +	 */
> > +	tsk->thread.load_slb++;
> > +	if (!tsk->thread.load_slb) {
> > +		unsigned long pc = KSTK_EIP(tsk);
> > +
> > +		preload_age(ti);
> > +		preload_add(ti, pc);
> > +	}
> > +
> > +	for (i = 0; i < ti->slb_preload_nr; i++) {
> > +		unsigned char idx;
> > +		unsigned long ea;
> > +
> > +		idx = (ti->slb_preload_tail + i) % SLB_PRELOAD_NR;
> > +		ea = (unsigned long)ti->slb_preload_esid[idx] << SID_SHIFT;
> > +
> > +		slb_allocate_user(mm, ea);
> > +	}
> > +}
> > +
> >  /* Flush all user entries from the segment table of the current processor. */
> >  void switch_slb(struct task_struct *tsk, struct mm_struct *mm)
> >  {
> > -	struct thread_info *ti = task_thread_info(tsk);
> >  	unsigned char i;
> >  
> >  	/*
> > @@ -502,29 +531,8 @@ void switch_slb(struct task_struct *tsk, struct mm_struct *mm)
> >  
> >  	copy_mm_to_paca(mm);
> >  
> > -	/*
> > -	 * We gradually age out SLBs after a number of context switches to
> > -	 * reduce reload overhead of unused entries (like we do with FP/VEC
> > -	 * reload). Each time we wrap 256 switches, take an entry out of the
> > -	 * SLB preload cache.
> > -	 */
> > -	tsk->thread.load_slb++;
> > -	if (!tsk->thread.load_slb) {
> > -		unsigned long pc = KSTK_EIP(tsk);
> > -
> > -		preload_age(ti);
> > -		preload_add(ti, pc);
> > -	}
> > -
> > -	for (i = 0; i < ti->slb_preload_nr; i++) {
> > -		unsigned char idx;
> > -		unsigned long ea;
> > -
> > -		idx = (ti->slb_preload_tail + i) % SLB_PRELOAD_NR;
> > -		ea = (unsigned long)ti->slb_preload_esid[idx] << SID_SHIFT;
> > -
> > -		slb_allocate_user(mm, ea);
> > -	}
> > +	if (!mm->context.skip_slb_preload)
> > +		preload_slb_entries(tsk, mm);
>
> Should this be wrapped in likely()?

Seems like a good idea - yes.
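
A sketch of the wrapped call for v5:

	if (likely(!mm->context.skip_slb_preload))
		preload_slb_entries(tsk, mm);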

>
> >  
> >  	/*
> >  	 * Synchronize slbmte preloads with possible subsequent user memory
>
> Right below this comment is the isync. It seems to be specifically
> concerned with synchronising preloaded slbs. Do you need it if you are
> skipping SLB preloads?
>
> It's probably not a big deal to have an extra isync in the fairly rare
> path when we're skipping preloads, but I thought I'd check.

I don't _think_ we need the `isync` if we are skipping the SLB preloads,
but then again it was always in the code-path before. If someone can
make a compelling argument to drop it when not preloading SLBs I will,
otherwise (considering some of the other non-obvious things I stepped
into with the Hash code) I will keep it here for now.

Thanks for the comments!

>
> Kind regards,
> Daniel
>
> > -- 
> > 2.26.1
Nicholas Piggin July 1, 2021, 4:15 a.m. UTC | #3
Excerpts from Christopher M. Riedl's message of July 1, 2021 1:48 pm:
> On Sun Jun 20, 2021 at 10:13 PM CDT, Daniel Axtens wrote:
>> "Christopher M. Riedl" <cmr@linux.ibm.com> writes:
>>
>> > Switching to a different mm with Hash translation causes SLB entries to
>> > be preloaded from the current thread_info. This reduces SLB faults, for
>> > example when threads share a common mm but operate on different address
>> > ranges.
>> >
>> > Preloading entries from the thread_info struct may not always be
>> > appropriate - such as when switching to a temporary mm. Introduce a new
>> > boolean in mm_context_t to skip the SLB preload entirely. Also move the
>> > SLB preload code into a separate function since switch_slb() is already
>> > quite long. The default behavior (preloading SLB entries from the
>> > current thread_info struct) remains unchanged.
>> >
>> > Signed-off-by: Christopher M. Riedl <cmr@linux.ibm.com>
>> >
>> > ---
>> >
>> > v4:  * New to series.
>> > ---
>> >  arch/powerpc/include/asm/book3s/64/mmu.h |  3 ++
>> >  arch/powerpc/include/asm/mmu_context.h   | 13 ++++++
>> >  arch/powerpc/mm/book3s64/mmu_context.c   |  2 +
>> >  arch/powerpc/mm/book3s64/slb.c           | 56 ++++++++++++++----------
>> >  4 files changed, 50 insertions(+), 24 deletions(-)
>> >
>> > diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h b/arch/powerpc/include/asm/book3s/64/mmu.h
>> > index eace8c3f7b0a1..b23a9dcdee5af 100644
>> > --- a/arch/powerpc/include/asm/book3s/64/mmu.h
>> > +++ b/arch/powerpc/include/asm/book3s/64/mmu.h
>> > @@ -130,6 +130,9 @@ typedef struct {
>> >  	u32 pkey_allocation_map;
>> >  	s16 execute_only_pkey; /* key holding execute-only protection */
>> >  #endif
>> > +
>> > +	/* Do not preload SLB entries from thread_info during switch_slb() */
>> > +	bool skip_slb_preload;
>> >  } mm_context_t;
>> >  
>> >  static inline u16 mm_ctx_user_psize(mm_context_t *ctx)
>> > diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h
>> > index 4bc45d3ed8b0e..264787e90b1a1 100644
>> > --- a/arch/powerpc/include/asm/mmu_context.h
>> > +++ b/arch/powerpc/include/asm/mmu_context.h
>> > @@ -298,6 +298,19 @@ static inline int arch_dup_mmap(struct mm_struct *oldmm,
>> >  	return 0;
>> >  }
>> >  
>> > +#ifdef CONFIG_PPC_BOOK3S_64
>> > +
>> > +static inline void skip_slb_preload_mm(struct mm_struct *mm)
>> > +{
>> > +	mm->context.skip_slb_preload = true;
>> > +}
>> > +
>> > +#else
>> > +
>> > +static inline void skip_slb_preload_mm(struct mm_struct *mm) {}
>> > +
>> > +#endif /* CONFIG_PPC_BOOK3S_64 */
>> > +
>> >  #include <asm-generic/mmu_context.h>
>> >  
>> >  #endif /* __KERNEL__ */
>> > diff --git a/arch/powerpc/mm/book3s64/mmu_context.c b/arch/powerpc/mm/book3s64/mmu_context.c
>> > index c10fc8a72fb37..3479910264c59 100644
>> > --- a/arch/powerpc/mm/book3s64/mmu_context.c
>> > +++ b/arch/powerpc/mm/book3s64/mmu_context.c
>> > @@ -202,6 +202,8 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
>> >  	atomic_set(&mm->context.active_cpus, 0);
>> >  	atomic_set(&mm->context.copros, 0);
>> >  
>> > +	mm->context.skip_slb_preload = false;
>> > +
>> >  	return 0;
>> >  }
>> >  
>> > diff --git a/arch/powerpc/mm/book3s64/slb.c b/arch/powerpc/mm/book3s64/slb.c
>> > index c91bd85eb90e3..da0836cb855af 100644
>> > --- a/arch/powerpc/mm/book3s64/slb.c
>> > +++ b/arch/powerpc/mm/book3s64/slb.c
>> > @@ -441,10 +441,39 @@ static void slb_cache_slbie_user(unsigned int index)
>> >  	asm volatile("slbie %0" : : "r" (slbie_data));
>> >  }
>> >  
>> > +static void preload_slb_entries(struct task_struct *tsk, struct mm_struct *mm)
>> Should this be explicitly inline or even __always_inline? I'm thinking
>> switch_slb is probably a fairly hot path on hash?
> 
> Yes absolutely. I'll make this change in v5.
> 
>>
>> > +{
>> > +	struct thread_info *ti = task_thread_info(tsk);
>> > +	unsigned char i;
>> > +
>> > +	/*
>> > +	 * We gradually age out SLBs after a number of context switches to
>> > +	 * reduce reload overhead of unused entries (like we do with FP/VEC
>> > +	 * reload). Each time we wrap 256 switches, take an entry out of the
>> > +	 * SLB preload cache.
>> > +	 */
>> > +	tsk->thread.load_slb++;
>> > +	if (!tsk->thread.load_slb) {
>> > +		unsigned long pc = KSTK_EIP(tsk);
>> > +
>> > +		preload_age(ti);
>> > +		preload_add(ti, pc);
>> > +	}
>> > +
>> > +	for (i = 0; i < ti->slb_preload_nr; i++) {
>> > +		unsigned char idx;
>> > +		unsigned long ea;
>> > +
>> > +		idx = (ti->slb_preload_tail + i) % SLB_PRELOAD_NR;
>> > +		ea = (unsigned long)ti->slb_preload_esid[idx] << SID_SHIFT;
>> > +
>> > +		slb_allocate_user(mm, ea);
>> > +	}
>> > +}
>> > +
>> >  /* Flush all user entries from the segment table of the current processor. */
>> >  void switch_slb(struct task_struct *tsk, struct mm_struct *mm)
>> >  {
>> > -	struct thread_info *ti = task_thread_info(tsk);
>> >  	unsigned char i;
>> >  
>> >  	/*
>> > @@ -502,29 +531,8 @@ void switch_slb(struct task_struct *tsk, struct mm_struct *mm)
>> >  
>> >  	copy_mm_to_paca(mm);
>> >  
>> > -	/*
>> > -	 * We gradually age out SLBs after a number of context switches to
>> > -	 * reduce reload overhead of unused entries (like we do with FP/VEC
>> > -	 * reload). Each time we wrap 256 switches, take an entry out of the
>> > -	 * SLB preload cache.
>> > -	 */
>> > -	tsk->thread.load_slb++;
>> > -	if (!tsk->thread.load_slb) {
>> > -		unsigned long pc = KSTK_EIP(tsk);
>> > -
>> > -		preload_age(ti);
>> > -		preload_add(ti, pc);
>> > -	}
>> > -
>> > -	for (i = 0; i < ti->slb_preload_nr; i++) {
>> > -		unsigned char idx;
>> > -		unsigned long ea;
>> > -
>> > -		idx = (ti->slb_preload_tail + i) % SLB_PRELOAD_NR;
>> > -		ea = (unsigned long)ti->slb_preload_esid[idx] << SID_SHIFT;
>> > -
>> > -		slb_allocate_user(mm, ea);
>> > -	}
>> > +	if (!mm->context.skip_slb_preload)
>> > +		preload_slb_entries(tsk, mm);
>>
>> Should this be wrapped in likely()?
> 
> Seems like a good idea - yes.
> 
>>
>> >  
>> >  	/*
>> >  	 * Synchronize slbmte preloads with possible subsequent user memory
>>
>> Right below this comment is the isync. It seems to be specifically
>> concerned with synchronising preloaded slbs. Do you need it if you are
>> skipping SLB preloads?
>>
>> It's probably not a big deal to have an extra isync in the fairly rare
>> path when we're skipping preloads, but I thought I'd check.
> 
> I don't _think_ we need the `isync` if we are skipping the SLB preloads,
> but then again it was always in the code-path before. If someone can
> make a compelling argument to drop it when not preloading SLBs I will,
> otherwise (considering some of the other non-obvious things I stepped
> into with the Hash code) I will keep it here for now.

The ISA says slbia wants an isync afterward, so we probably should keep 
it. The comment is a bit misleading in that case.
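
If the isync stays on both paths, one possible rewording of that comment
(a sketch only; the asm statement is the existing one at that spot in
switch_slb()):

	/*
	 * The slbia above needs a context synchronizing instruction before
	 * dependent accesses, and the slbmte preloads must be synchronized
	 * with possible subsequent user memory accesses by the kernel, so
	 * the isync is required whether or not preloading was skipped.
	 */
	asm volatile("isync" : : : "memory");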

Why isn't preloading appropriate for a temporary mm? 

Thanks,
Nick
Christopher M. Riedl July 1, 2021, 5:28 a.m. UTC | #4
On Wed Jun 30, 2021 at 11:15 PM CDT, Nicholas Piggin wrote:
> Excerpts from Christopher M. Riedl's message of July 1, 2021 1:48 pm:
> > On Sun Jun 20, 2021 at 10:13 PM CDT, Daniel Axtens wrote:
> >> "Christopher M. Riedl" <cmr@linux.ibm.com> writes:
> >>
> >> > Switching to a different mm with Hash translation causes SLB entries to
> >> > be preloaded from the current thread_info. This reduces SLB faults, for
> >> > example when threads share a common mm but operate on different address
> >> > ranges.
> >> >
> >> > Preloading entries from the thread_info struct may not always be
> >> > appropriate - such as when switching to a temporary mm. Introduce a new
> >> > boolean in mm_context_t to skip the SLB preload entirely. Also move the
> >> > SLB preload code into a separate function since switch_slb() is already
> >> > quite long. The default behavior (preloading SLB entries from the
> >> > current thread_info struct) remains unchanged.
> >> >
> >> > Signed-off-by: Christopher M. Riedl <cmr@linux.ibm.com>
> >> >
> >> > ---
> >> >
> >> > v4:  * New to series.
> >> > ---
> >> >  arch/powerpc/include/asm/book3s/64/mmu.h |  3 ++
> >> >  arch/powerpc/include/asm/mmu_context.h   | 13 ++++++
> >> >  arch/powerpc/mm/book3s64/mmu_context.c   |  2 +
> >> >  arch/powerpc/mm/book3s64/slb.c           | 56 ++++++++++++++----------
> >> >  4 files changed, 50 insertions(+), 24 deletions(-)
> >> >
> >> > diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h b/arch/powerpc/include/asm/book3s/64/mmu.h
> >> > index eace8c3f7b0a1..b23a9dcdee5af 100644
> >> > --- a/arch/powerpc/include/asm/book3s/64/mmu.h
> >> > +++ b/arch/powerpc/include/asm/book3s/64/mmu.h
> >> > @@ -130,6 +130,9 @@ typedef struct {
> >> >  	u32 pkey_allocation_map;
> >> >  	s16 execute_only_pkey; /* key holding execute-only protection */
> >> >  #endif
> >> > +
> >> > +	/* Do not preload SLB entries from thread_info during switch_slb() */
> >> > +	bool skip_slb_preload;
> >> >  } mm_context_t;
> >> >  
> >> >  static inline u16 mm_ctx_user_psize(mm_context_t *ctx)
> >> > diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h
> >> > index 4bc45d3ed8b0e..264787e90b1a1 100644
> >> > --- a/arch/powerpc/include/asm/mmu_context.h
> >> > +++ b/arch/powerpc/include/asm/mmu_context.h
> >> > @@ -298,6 +298,19 @@ static inline int arch_dup_mmap(struct mm_struct *oldmm,
> >> >  	return 0;
> >> >  }
> >> >  
> >> > +#ifdef CONFIG_PPC_BOOK3S_64
> >> > +
> >> > +static inline void skip_slb_preload_mm(struct mm_struct *mm)
> >> > +{
> >> > +	mm->context.skip_slb_preload = true;
> >> > +}
> >> > +
> >> > +#else
> >> > +
> >> > +static inline void skip_slb_preload_mm(struct mm_struct *mm) {}
> >> > +
> >> > +#endif /* CONFIG_PPC_BOOK3S_64 */
> >> > +
> >> >  #include <asm-generic/mmu_context.h>
> >> >  
> >> >  #endif /* __KERNEL__ */
> >> > diff --git a/arch/powerpc/mm/book3s64/mmu_context.c b/arch/powerpc/mm/book3s64/mmu_context.c
> >> > index c10fc8a72fb37..3479910264c59 100644
> >> > --- a/arch/powerpc/mm/book3s64/mmu_context.c
> >> > +++ b/arch/powerpc/mm/book3s64/mmu_context.c
> >> > @@ -202,6 +202,8 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
> >> >  	atomic_set(&mm->context.active_cpus, 0);
> >> >  	atomic_set(&mm->context.copros, 0);
> >> >  
> >> > +	mm->context.skip_slb_preload = false;
> >> > +
> >> >  	return 0;
> >> >  }
> >> >  
> >> > diff --git a/arch/powerpc/mm/book3s64/slb.c b/arch/powerpc/mm/book3s64/slb.c
> >> > index c91bd85eb90e3..da0836cb855af 100644
> >> > --- a/arch/powerpc/mm/book3s64/slb.c
> >> > +++ b/arch/powerpc/mm/book3s64/slb.c
> >> > @@ -441,10 +441,39 @@ static void slb_cache_slbie_user(unsigned int index)
> >> >  	asm volatile("slbie %0" : : "r" (slbie_data));
> >> >  }
> >> >  
> >> > +static void preload_slb_entries(struct task_struct *tsk, struct mm_struct *mm)
> >> Should this be explicitly inline or even __always_inline? I'm thinking
> >> switch_slb is probably a fairly hot path on hash?
> > 
> > Yes absolutely. I'll make this change in v5.
> > 
> >>
> >> > +{
> >> > +	struct thread_info *ti = task_thread_info(tsk);
> >> > +	unsigned char i;
> >> > +
> >> > +	/*
> >> > +	 * We gradually age out SLBs after a number of context switches to
> >> > +	 * reduce reload overhead of unused entries (like we do with FP/VEC
> >> > +	 * reload). Each time we wrap 256 switches, take an entry out of the
> >> > +	 * SLB preload cache.
> >> > +	 */
> >> > +	tsk->thread.load_slb++;
> >> > +	if (!tsk->thread.load_slb) {
> >> > +		unsigned long pc = KSTK_EIP(tsk);
> >> > +
> >> > +		preload_age(ti);
> >> > +		preload_add(ti, pc);
> >> > +	}
> >> > +
> >> > +	for (i = 0; i < ti->slb_preload_nr; i++) {
> >> > +		unsigned char idx;
> >> > +		unsigned long ea;
> >> > +
> >> > +		idx = (ti->slb_preload_tail + i) % SLB_PRELOAD_NR;
> >> > +		ea = (unsigned long)ti->slb_preload_esid[idx] << SID_SHIFT;
> >> > +
> >> > +		slb_allocate_user(mm, ea);
> >> > +	}
> >> > +}
> >> > +
> >> >  /* Flush all user entries from the segment table of the current processor. */
> >> >  void switch_slb(struct task_struct *tsk, struct mm_struct *mm)
> >> >  {
> >> > -	struct thread_info *ti = task_thread_info(tsk);
> >> >  	unsigned char i;
> >> >  
> >> >  	/*
> >> > @@ -502,29 +531,8 @@ void switch_slb(struct task_struct *tsk, struct mm_struct *mm)
> >> >  
> >> >  	copy_mm_to_paca(mm);
> >> >  
> >> > -	/*
> >> > -	 * We gradually age out SLBs after a number of context switches to
> >> > -	 * reduce reload overhead of unused entries (like we do with FP/VEC
> >> > -	 * reload). Each time we wrap 256 switches, take an entry out of the
> >> > -	 * SLB preload cache.
> >> > -	 */
> >> > -	tsk->thread.load_slb++;
> >> > -	if (!tsk->thread.load_slb) {
> >> > -		unsigned long pc = KSTK_EIP(tsk);
> >> > -
> >> > -		preload_age(ti);
> >> > -		preload_add(ti, pc);
> >> > -	}
> >> > -
> >> > -	for (i = 0; i < ti->slb_preload_nr; i++) {
> >> > -		unsigned char idx;
> >> > -		unsigned long ea;
> >> > -
> >> > -		idx = (ti->slb_preload_tail + i) % SLB_PRELOAD_NR;
> >> > -		ea = (unsigned long)ti->slb_preload_esid[idx] << SID_SHIFT;
> >> > -
> >> > -		slb_allocate_user(mm, ea);
> >> > -	}
> >> > +	if (!mm->context.skip_slb_preload)
> >> > +		preload_slb_entries(tsk, mm);
> >>
> >> Should this be wrapped in likely()?
> > 
> > Seems like a good idea - yes.
> > 
> >>
> >> >  
> >> >  	/*
> >> >  	 * Synchronize slbmte preloads with possible subsequent user memory
> >>
> >> Right below this comment is the isync. It seems to be specifically
> >> concerned with synchronising preloaded slbs. Do you need it if you are
> >> skipping SLB preloads?
> >>
> >> It's probably not a big deal to have an extra isync in the fairly rare
> >> path when we're skipping preloads, but I thought I'd check.
> > 
> > I don't _think_ we need the `isync` if we are skipping the SLB preloads,
> > but then again it was always in the code-path before. If someone can
> > make a compelling argument to drop it when not preloading SLBs I will,
> > otherwise (considering some of the other non-obvious things I stepped
> > into with the Hash code) I will keep it here for now.
>
> The ISA says slbia wants an isync afterward, so we probably should keep
> it. The comment is a bit misleading in that case.
>
> Why isn't preloading appropriate for a temporary mm?

The preloaded entries come from the thread_info struct which isn't
necessarily related to the temporary mm at all. I saw SLB multihits
while testing this series with my LKDTM test where the "patching
address" (userspace address for the temporary mapping w/
write-permissions) ends up in a thread's preload list and then we
explicitly insert it again in map_patch() when trying to patch. At that
point the SLB multihit triggers.

>
> Thanks,
> Nick
Nicholas Piggin July 1, 2021, 6:04 a.m. UTC | #5
Excerpts from Christopher M. Riedl's message of July 1, 2021 3:28 pm:
> On Wed Jun 30, 2021 at 11:15 PM CDT, Nicholas Piggin wrote:
>> Excerpts from Christopher M. Riedl's message of July 1, 2021 1:48 pm:
>> > On Sun Jun 20, 2021 at 10:13 PM CDT, Daniel Axtens wrote:
>> >> "Christopher M. Riedl" <cmr@linux.ibm.com> writes:
>> >>
>> >> > Switching to a different mm with Hash translation causes SLB entries to
>> >> > be preloaded from the current thread_info. This reduces SLB faults, for
>> >> > example when threads share a common mm but operate on different address
>> >> > ranges.
>> >> >
>> >> > Preloading entries from the thread_info struct may not always be
>> >> > appropriate - such as when switching to a temporary mm. Introduce a new
>> >> > boolean in mm_context_t to skip the SLB preload entirely. Also move the
>> >> > SLB preload code into a separate function since switch_slb() is already
>> >> > quite long. The default behavior (preloading SLB entries from the
>> >> > current thread_info struct) remains unchanged.
>> >> >
>> >> > Signed-off-by: Christopher M. Riedl <cmr@linux.ibm.com>
>> >> >
>> >> > ---
>> >> >
>> >> > v4:  * New to series.
>> >> > ---
>> >> >  arch/powerpc/include/asm/book3s/64/mmu.h |  3 ++
>> >> >  arch/powerpc/include/asm/mmu_context.h   | 13 ++++++
>> >> >  arch/powerpc/mm/book3s64/mmu_context.c   |  2 +
>> >> >  arch/powerpc/mm/book3s64/slb.c           | 56 ++++++++++++++----------
>> >> >  4 files changed, 50 insertions(+), 24 deletions(-)
>> >> >
>> >> > diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h b/arch/powerpc/include/asm/book3s/64/mmu.h
>> >> > index eace8c3f7b0a1..b23a9dcdee5af 100644
>> >> > --- a/arch/powerpc/include/asm/book3s/64/mmu.h
>> >> > +++ b/arch/powerpc/include/asm/book3s/64/mmu.h
>> >> > @@ -130,6 +130,9 @@ typedef struct {
>> >> >  	u32 pkey_allocation_map;
>> >> >  	s16 execute_only_pkey; /* key holding execute-only protection */
>> >> >  #endif
>> >> > +
>> >> > +	/* Do not preload SLB entries from thread_info during switch_slb() */
>> >> > +	bool skip_slb_preload;
>> >> >  } mm_context_t;
>> >> >  
>> >> >  static inline u16 mm_ctx_user_psize(mm_context_t *ctx)
>> >> > diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h
>> >> > index 4bc45d3ed8b0e..264787e90b1a1 100644
>> >> > --- a/arch/powerpc/include/asm/mmu_context.h
>> >> > +++ b/arch/powerpc/include/asm/mmu_context.h
>> >> > @@ -298,6 +298,19 @@ static inline int arch_dup_mmap(struct mm_struct *oldmm,
>> >> >  	return 0;
>> >> >  }
>> >> >  
>> >> > +#ifdef CONFIG_PPC_BOOK3S_64
>> >> > +
>> >> > +static inline void skip_slb_preload_mm(struct mm_struct *mm)
>> >> > +{
>> >> > +	mm->context.skip_slb_preload = true;
>> >> > +}
>> >> > +
>> >> > +#else
>> >> > +
>> >> > +static inline void skip_slb_preload_mm(struct mm_struct *mm) {}
>> >> > +
>> >> > +#endif /* CONFIG_PPC_BOOK3S_64 */
>> >> > +
>> >> >  #include <asm-generic/mmu_context.h>
>> >> >  
>> >> >  #endif /* __KERNEL__ */
>> >> > diff --git a/arch/powerpc/mm/book3s64/mmu_context.c b/arch/powerpc/mm/book3s64/mmu_context.c
>> >> > index c10fc8a72fb37..3479910264c59 100644
>> >> > --- a/arch/powerpc/mm/book3s64/mmu_context.c
>> >> > +++ b/arch/powerpc/mm/book3s64/mmu_context.c
>> >> > @@ -202,6 +202,8 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
>> >> >  	atomic_set(&mm->context.active_cpus, 0);
>> >> >  	atomic_set(&mm->context.copros, 0);
>> >> >  
>> >> > +	mm->context.skip_slb_preload = false;
>> >> > +
>> >> >  	return 0;
>> >> >  }
>> >> >  
>> >> > diff --git a/arch/powerpc/mm/book3s64/slb.c b/arch/powerpc/mm/book3s64/slb.c
>> >> > index c91bd85eb90e3..da0836cb855af 100644
>> >> > --- a/arch/powerpc/mm/book3s64/slb.c
>> >> > +++ b/arch/powerpc/mm/book3s64/slb.c
>> >> > @@ -441,10 +441,39 @@ static void slb_cache_slbie_user(unsigned int index)
>> >> >  	asm volatile("slbie %0" : : "r" (slbie_data));
>> >> >  }
>> >> >  
>> >> > +static void preload_slb_entries(struct task_struct *tsk, struct mm_struct *mm)
>> >> Should this be explicitly inline or even __always_inline? I'm thinking
>> >> switch_slb is probably a fairly hot path on hash?
>> > 
>> > Yes absolutely. I'll make this change in v5.
>> > 
>> >>
>> >> > +{
>> >> > +	struct thread_info *ti = task_thread_info(tsk);
>> >> > +	unsigned char i;
>> >> > +
>> >> > +	/*
>> >> > +	 * We gradually age out SLBs after a number of context switches to
>> >> > +	 * reduce reload overhead of unused entries (like we do with FP/VEC
>> >> > +	 * reload). Each time we wrap 256 switches, take an entry out of the
>> >> > +	 * SLB preload cache.
>> >> > +	 */
>> >> > +	tsk->thread.load_slb++;
>> >> > +	if (!tsk->thread.load_slb) {
>> >> > +		unsigned long pc = KSTK_EIP(tsk);
>> >> > +
>> >> > +		preload_age(ti);
>> >> > +		preload_add(ti, pc);
>> >> > +	}
>> >> > +
>> >> > +	for (i = 0; i < ti->slb_preload_nr; i++) {
>> >> > +		unsigned char idx;
>> >> > +		unsigned long ea;
>> >> > +
>> >> > +		idx = (ti->slb_preload_tail + i) % SLB_PRELOAD_NR;
>> >> > +		ea = (unsigned long)ti->slb_preload_esid[idx] << SID_SHIFT;
>> >> > +
>> >> > +		slb_allocate_user(mm, ea);
>> >> > +	}
>> >> > +}
>> >> > +
>> >> >  /* Flush all user entries from the segment table of the current processor. */
>> >> >  void switch_slb(struct task_struct *tsk, struct mm_struct *mm)
>> >> >  {
>> >> > -	struct thread_info *ti = task_thread_info(tsk);
>> >> >  	unsigned char i;
>> >> >  
>> >> >  	/*
>> >> > @@ -502,29 +531,8 @@ void switch_slb(struct task_struct *tsk, struct mm_struct *mm)
>> >> >  
>> >> >  	copy_mm_to_paca(mm);
>> >> >  
>> >> > -	/*
>> >> > -	 * We gradually age out SLBs after a number of context switches to
>> >> > -	 * reduce reload overhead of unused entries (like we do with FP/VEC
>> >> > -	 * reload). Each time we wrap 256 switches, take an entry out of the
>> >> > -	 * SLB preload cache.
>> >> > -	 */
>> >> > -	tsk->thread.load_slb++;
>> >> > -	if (!tsk->thread.load_slb) {
>> >> > -		unsigned long pc = KSTK_EIP(tsk);
>> >> > -
>> >> > -		preload_age(ti);
>> >> > -		preload_add(ti, pc);
>> >> > -	}
>> >> > -
>> >> > -	for (i = 0; i < ti->slb_preload_nr; i++) {
>> >> > -		unsigned char idx;
>> >> > -		unsigned long ea;
>> >> > -
>> >> > -		idx = (ti->slb_preload_tail + i) % SLB_PRELOAD_NR;
>> >> > -		ea = (unsigned long)ti->slb_preload_esid[idx] << SID_SHIFT;
>> >> > -
>> >> > -		slb_allocate_user(mm, ea);
>> >> > -	}
>> >> > +	if (!mm->context.skip_slb_preload)
>> >> > +		preload_slb_entries(tsk, mm);
>> >>
>> >> Should this be wrapped in likely()?
>> > 
>> > Seems like a good idea - yes.
>> > 
>> >>
>> >> >  
>> >> >  	/*
>> >> >  	 * Synchronize slbmte preloads with possible subsequent user memory
>> >>
>> >> Right below this comment is the isync. It seems to be specifically
>> >> concerned with synchronising preloaded slbs. Do you need it if you are
>> >> skipping SLB preloads?
>> >>
>> >> It's probably not a big deal to have an extra isync in the fairly rare
>> >> path when we're skipping preloads, but I thought I'd check.
>> > 
>> > I don't _think_ we need the `isync` if we are skipping the SLB preloads,
>> > but then again it was always in the code-path before. If someone can
>> > make a compelling argument to drop it when not preloading SLBs I will,
>> > otherwise (considering some of the other non-obvious things I stepped
>> > into with the Hash code) I will keep it here for now.
>>
>> The ISA says slbia wants an isync afterward, so we probably should keep
>> it. The comment is a bit misleading in that case.
>>
>> Why isn't preloading appropriate for a temporary mm?
> 
> The preloaded entries come from the thread_info struct which isn't
> necessarily related to the temporary mm at all. I saw SLB multihits
> while testing this series with my LKDTM test where the "patching
> address" (userspace address for the temporary mapping w/
> write-permissions) ends up in a thread's preload list and then we
> explicitly insert it again in map_patch() when trying to patch. At that
> point the SLB multihit triggers.

Hmm, so what if we use a mm, take some SLB faults then unuse it and
use a different one? I wonder if kthread_use_mm has existing problems
with this incorrect SLB preloading. Quite possibly. We should clear
the preload whenever mm changes I think. That should cover this as
well.

Thanks,
Nick
Christopher M. Riedl July 1, 2021, 6:53 a.m. UTC | #6
On Thu Jul 1, 2021 at 1:04 AM CDT, Nicholas Piggin wrote:
> Excerpts from Christopher M. Riedl's message of July 1, 2021 3:28 pm:
> > On Wed Jun 30, 2021 at 11:15 PM CDT, Nicholas Piggin wrote:
> >> Excerpts from Christopher M. Riedl's message of July 1, 2021 1:48 pm:
> >> > On Sun Jun 20, 2021 at 10:13 PM CDT, Daniel Axtens wrote:
> >> >> "Christopher M. Riedl" <cmr@linux.ibm.com> writes:
> >> >>
> >> >> > Switching to a different mm with Hash translation causes SLB entries to
> >> >> > be preloaded from the current thread_info. This reduces SLB faults, for
> >> >> > example when threads share a common mm but operate on different address
> >> >> > ranges.
> >> >> >
> >> >> > Preloading entries from the thread_info struct may not always be
> >> >> > appropriate - such as when switching to a temporary mm. Introduce a new
> >> >> > boolean in mm_context_t to skip the SLB preload entirely. Also move the
> >> >> > SLB preload code into a separate function since switch_slb() is already
> >> >> > quite long. The default behavior (preloading SLB entries from the
> >> >> > current thread_info struct) remains unchanged.
> >> >> >
> >> >> > Signed-off-by: Christopher M. Riedl <cmr@linux.ibm.com>
> >> >> >
> >> >> > ---
> >> >> >
> >> >> > v4:  * New to series.
> >> >> > ---
> >> >> >  arch/powerpc/include/asm/book3s/64/mmu.h |  3 ++
> >> >> >  arch/powerpc/include/asm/mmu_context.h   | 13 ++++++
> >> >> >  arch/powerpc/mm/book3s64/mmu_context.c   |  2 +
> >> >> >  arch/powerpc/mm/book3s64/slb.c           | 56 ++++++++++++++----------
> >> >> >  4 files changed, 50 insertions(+), 24 deletions(-)
> >> >> >
> >> >> > diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h b/arch/powerpc/include/asm/book3s/64/mmu.h
> >> >> > index eace8c3f7b0a1..b23a9dcdee5af 100644
> >> >> > --- a/arch/powerpc/include/asm/book3s/64/mmu.h
> >> >> > +++ b/arch/powerpc/include/asm/book3s/64/mmu.h
> >> >> > @@ -130,6 +130,9 @@ typedef struct {
> >> >> >  	u32 pkey_allocation_map;
> >> >> >  	s16 execute_only_pkey; /* key holding execute-only protection */
> >> >> >  #endif
> >> >> > +
> >> >> > +	/* Do not preload SLB entries from thread_info during switch_slb() */
> >> >> > +	bool skip_slb_preload;
> >> >> >  } mm_context_t;
> >> >> >  
> >> >> >  static inline u16 mm_ctx_user_psize(mm_context_t *ctx)
> >> >> > diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h
> >> >> > index 4bc45d3ed8b0e..264787e90b1a1 100644
> >> >> > --- a/arch/powerpc/include/asm/mmu_context.h
> >> >> > +++ b/arch/powerpc/include/asm/mmu_context.h
> >> >> > @@ -298,6 +298,19 @@ static inline int arch_dup_mmap(struct mm_struct *oldmm,
> >> >> >  	return 0;
> >> >> >  }
> >> >> >  
> >> >> > +#ifdef CONFIG_PPC_BOOK3S_64
> >> >> > +
> >> >> > +static inline void skip_slb_preload_mm(struct mm_struct *mm)
> >> >> > +{
> >> >> > +	mm->context.skip_slb_preload = true;
> >> >> > +}
> >> >> > +
> >> >> > +#else
> >> >> > +
> >> >> > +static inline void skip_slb_preload_mm(struct mm_struct *mm) {}
> >> >> > +
> >> >> > +#endif /* CONFIG_PPC_BOOK3S_64 */
> >> >> > +
> >> >> >  #include <asm-generic/mmu_context.h>
> >> >> >  
> >> >> >  #endif /* __KERNEL__ */
> >> >> > diff --git a/arch/powerpc/mm/book3s64/mmu_context.c b/arch/powerpc/mm/book3s64/mmu_context.c
> >> >> > index c10fc8a72fb37..3479910264c59 100644
> >> >> > --- a/arch/powerpc/mm/book3s64/mmu_context.c
> >> >> > +++ b/arch/powerpc/mm/book3s64/mmu_context.c
> >> >> > @@ -202,6 +202,8 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
> >> >> >  	atomic_set(&mm->context.active_cpus, 0);
> >> >> >  	atomic_set(&mm->context.copros, 0);
> >> >> >  
> >> >> > +	mm->context.skip_slb_preload = false;
> >> >> > +
> >> >> >  	return 0;
> >> >> >  }
> >> >> >  
> >> >> > diff --git a/arch/powerpc/mm/book3s64/slb.c b/arch/powerpc/mm/book3s64/slb.c
> >> >> > index c91bd85eb90e3..da0836cb855af 100644
> >> >> > --- a/arch/powerpc/mm/book3s64/slb.c
> >> >> > +++ b/arch/powerpc/mm/book3s64/slb.c
> >> >> > @@ -441,10 +441,39 @@ static void slb_cache_slbie_user(unsigned int index)
> >> >> >  	asm volatile("slbie %0" : : "r" (slbie_data));
> >> >> >  }
> >> >> >  
> >> >> > +static void preload_slb_entries(struct task_struct *tsk, struct mm_struct *mm)
> >> >> Should this be explicitly inline or even __always_inline? I'm thinking
> >> >> switch_slb is probably a fairly hot path on hash?
> >> > 
> >> > Yes absolutely. I'll make this change in v5.
> >> > 
> >> >>
> >> >> > +{
> >> >> > +	struct thread_info *ti = task_thread_info(tsk);
> >> >> > +	unsigned char i;
> >> >> > +
> >> >> > +	/*
> >> >> > +	 * We gradually age out SLBs after a number of context switches to
> >> >> > +	 * reduce reload overhead of unused entries (like we do with FP/VEC
> >> >> > +	 * reload). Each time we wrap 256 switches, take an entry out of the
> >> >> > +	 * SLB preload cache.
> >> >> > +	 */
> >> >> > +	tsk->thread.load_slb++;
> >> >> > +	if (!tsk->thread.load_slb) {
> >> >> > +		unsigned long pc = KSTK_EIP(tsk);
> >> >> > +
> >> >> > +		preload_age(ti);
> >> >> > +		preload_add(ti, pc);
> >> >> > +	}
> >> >> > +
> >> >> > +	for (i = 0; i < ti->slb_preload_nr; i++) {
> >> >> > +		unsigned char idx;
> >> >> > +		unsigned long ea;
> >> >> > +
> >> >> > +		idx = (ti->slb_preload_tail + i) % SLB_PRELOAD_NR;
> >> >> > +		ea = (unsigned long)ti->slb_preload_esid[idx] << SID_SHIFT;
> >> >> > +
> >> >> > +		slb_allocate_user(mm, ea);
> >> >> > +	}
> >> >> > +}
> >> >> > +
> >> >> >  /* Flush all user entries from the segment table of the current processor. */
> >> >> >  void switch_slb(struct task_struct *tsk, struct mm_struct *mm)
> >> >> >  {
> >> >> > -	struct thread_info *ti = task_thread_info(tsk);
> >> >> >  	unsigned char i;
> >> >> >  
> >> >> >  	/*
> >> >> > @@ -502,29 +531,8 @@ void switch_slb(struct task_struct *tsk, struct mm_struct *mm)
> >> >> >  
> >> >> >  	copy_mm_to_paca(mm);
> >> >> >  
> >> >> > -	/*
> >> >> > -	 * We gradually age out SLBs after a number of context switches to
> >> >> > -	 * reduce reload overhead of unused entries (like we do with FP/VEC
> >> >> > -	 * reload). Each time we wrap 256 switches, take an entry out of the
> >> >> > -	 * SLB preload cache.
> >> >> > -	 */
> >> >> > -	tsk->thread.load_slb++;
> >> >> > -	if (!tsk->thread.load_slb) {
> >> >> > -		unsigned long pc = KSTK_EIP(tsk);
> >> >> > -
> >> >> > -		preload_age(ti);
> >> >> > -		preload_add(ti, pc);
> >> >> > -	}
> >> >> > -
> >> >> > -	for (i = 0; i < ti->slb_preload_nr; i++) {
> >> >> > -		unsigned char idx;
> >> >> > -		unsigned long ea;
> >> >> > -
> >> >> > -		idx = (ti->slb_preload_tail + i) % SLB_PRELOAD_NR;
> >> >> > -		ea = (unsigned long)ti->slb_preload_esid[idx] << SID_SHIFT;
> >> >> > -
> >> >> > -		slb_allocate_user(mm, ea);
> >> >> > -	}
> >> >> > +	if (!mm->context.skip_slb_preload)
> >> >> > +		preload_slb_entries(tsk, mm);
> >> >>
> >> >> Should this be wrapped in likely()?
> >> > 
> >> > Seems like a good idea - yes.
> >> > 
> >> >>
> >> >> >  
> >> >> >  	/*
> >> >> >  	 * Synchronize slbmte preloads with possible subsequent user memory
> >> >>
> >> >> Right below this comment is the isync. It seems to be specifically
> >> >> concerned with synchronising preloaded slbs. Do you need it if you are
> >> >> skipping SLB preloads?
> >> >>
> >> >> It's probably not a big deal to have an extra isync in the fairly rare
> >> >> path when we're skipping preloads, but I thought I'd check.
> >> > 
> >> > I don't _think_ we need the `isync` if we are skipping the SLB preloads,
> >> > but then again it was always in the code-path before. If someone can
> >> > make a compelling argument to drop it when not preloading SLBs I will,
> >> > otherwise (considering some of the other non-obvious things I stepped
> >> > into with the Hash code) I will keep it here for now.
> >>
> >> The ISA says slbia wants an isync afterward, so we probably should keep
> >> it. The comment is a bit misleading in that case.
> >>
> >> Why isn't preloading appropriate for a temporary mm?
> > 
> > The preloaded entries come from the thread_info struct which isn't
> > necessarily related to the temporary mm at all. I saw SLB multihits
> > while testing this series with my LKDTM test where the "patching
> > address" (userspace address for the temporary mapping w/
> > write-permissions) ends up in a thread's preload list and then we
> > explicitly insert it again in map_patch() when trying to patch. At that
> > point the SLB multihit triggers.
>
> Hmm, so what if we use a mm, take some SLB faults then unuse it and
> use a different one? I wonder if kthread_use_mm has existing problems
> with this incorrect SLB preloading. Quite possibly. We should clear
> the preload whenever mm changes I think. That should cover this as
> well.

I actually did this initially but thought it was a bit too intrusive to
include as part of this series, and that it hurt performance. I agree that
preloading the SLB from the thread may be a problem in general when
switching in/out an mm.

kthread_use_mm may not be affected unless we explicitly insert some SLB
entries which could collide with an existing preload (which I don't
think we do anywhere until this series).
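
For reference, a minimal sketch of the "clear the preload whenever the mm
changes" alternative, using the thread_info fields from the quoted hunk;
this is not part of the posted series and the helper name is made up:

/*
 * Hypothetical alternative: forget stale preloads (e.g. from
 * switch_slb() when the incoming mm is not the one the preload
 * list was built against) instead of adding a skip flag.
 */
static inline void slb_preload_reset(struct thread_info *ti)
{
	ti->slb_preload_nr = 0;
	ti->slb_preload_tail = 0;
}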

>
> Thanks,
> Nick
Nicholas Piggin July 1, 2021, 7:37 a.m. UTC | #7
Excerpts from Christopher M. Riedl's message of July 1, 2021 4:53 pm:
> On Thu Jul 1, 2021 at 1:04 AM CDT, Nicholas Piggin wrote:
>> Excerpts from Christopher M. Riedl's message of July 1, 2021 3:28 pm:
>> > On Wed Jun 30, 2021 at 11:15 PM CDT, Nicholas Piggin wrote:
>> >> Excerpts from Christopher M. Riedl's message of July 1, 2021 1:48 pm:
>> >> > On Sun Jun 20, 2021 at 10:13 PM CDT, Daniel Axtens wrote:
>> >> >> "Christopher M. Riedl" <cmr@linux.ibm.com> writes:
>> >> >>
>> >> >> > Switching to a different mm with Hash translation causes SLB entries to
>> >> >> > be preloaded from the current thread_info. This reduces SLB faults, for
>> >> >> > example when threads share a common mm but operate on different address
>> >> >> > ranges.
>> >> >> >
>> >> >> > Preloading entries from the thread_info struct may not always be
>> >> >> > appropriate - such as when switching to a temporary mm. Introduce a new
>> >> >> > boolean in mm_context_t to skip the SLB preload entirely. Also move the
>> >> >> > SLB preload code into a separate function since switch_slb() is already
>> >> >> > quite long. The default behavior (preloading SLB entries from the
>> >> >> > current thread_info struct) remains unchanged.
>> >> >> >
>> >> >> > Signed-off-by: Christopher M. Riedl <cmr@linux.ibm.com>
>> >> >> >
>> >> >> > ---
>> >> >> >
>> >> >> > v4:  * New to series.
>> >> >> > ---
>> >> >> >  arch/powerpc/include/asm/book3s/64/mmu.h |  3 ++
>> >> >> >  arch/powerpc/include/asm/mmu_context.h   | 13 ++++++
>> >> >> >  arch/powerpc/mm/book3s64/mmu_context.c   |  2 +
>> >> >> >  arch/powerpc/mm/book3s64/slb.c           | 56 ++++++++++++++----------
>> >> >> >  4 files changed, 50 insertions(+), 24 deletions(-)
>> >> >> >
>> >> >> > diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h b/arch/powerpc/include/asm/book3s/64/mmu.h
>> >> >> > index eace8c3f7b0a1..b23a9dcdee5af 100644
>> >> >> > --- a/arch/powerpc/include/asm/book3s/64/mmu.h
>> >> >> > +++ b/arch/powerpc/include/asm/book3s/64/mmu.h
>> >> >> > @@ -130,6 +130,9 @@ typedef struct {
>> >> >> >  	u32 pkey_allocation_map;
>> >> >> >  	s16 execute_only_pkey; /* key holding execute-only protection */
>> >> >> >  #endif
>> >> >> > +
>> >> >> > +	/* Do not preload SLB entries from thread_info during switch_slb() */
>> >> >> > +	bool skip_slb_preload;
>> >> >> >  } mm_context_t;
>> >> >> >  
>> >> >> >  static inline u16 mm_ctx_user_psize(mm_context_t *ctx)
>> >> >> > diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h
>> >> >> > index 4bc45d3ed8b0e..264787e90b1a1 100644
>> >> >> > --- a/arch/powerpc/include/asm/mmu_context.h
>> >> >> > +++ b/arch/powerpc/include/asm/mmu_context.h
>> >> >> > @@ -298,6 +298,19 @@ static inline int arch_dup_mmap(struct mm_struct *oldmm,
>> >> >> >  	return 0;
>> >> >> >  }
>> >> >> >  
>> >> >> > +#ifdef CONFIG_PPC_BOOK3S_64
>> >> >> > +
>> >> >> > +static inline void skip_slb_preload_mm(struct mm_struct *mm)
>> >> >> > +{
>> >> >> > +	mm->context.skip_slb_preload = true;
>> >> >> > +}
>> >> >> > +
>> >> >> > +#else
>> >> >> > +
>> >> >> > +static inline void skip_slb_preload_mm(struct mm_struct *mm) {}
>> >> >> > +
>> >> >> > +#endif /* CONFIG_PPC_BOOK3S_64 */
>> >> >> > +
>> >> >> >  #include <asm-generic/mmu_context.h>
>> >> >> >  
>> >> >> >  #endif /* __KERNEL__ */
>> >> >> > diff --git a/arch/powerpc/mm/book3s64/mmu_context.c b/arch/powerpc/mm/book3s64/mmu_context.c
>> >> >> > index c10fc8a72fb37..3479910264c59 100644
>> >> >> > --- a/arch/powerpc/mm/book3s64/mmu_context.c
>> >> >> > +++ b/arch/powerpc/mm/book3s64/mmu_context.c
>> >> >> > @@ -202,6 +202,8 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
>> >> >> >  	atomic_set(&mm->context.active_cpus, 0);
>> >> >> >  	atomic_set(&mm->context.copros, 0);
>> >> >> >  
>> >> >> > +	mm->context.skip_slb_preload = false;
>> >> >> > +
>> >> >> >  	return 0;
>> >> >> >  }
>> >> >> >  
>> >> >> > diff --git a/arch/powerpc/mm/book3s64/slb.c b/arch/powerpc/mm/book3s64/slb.c
>> >> >> > index c91bd85eb90e3..da0836cb855af 100644
>> >> >> > --- a/arch/powerpc/mm/book3s64/slb.c
>> >> >> > +++ b/arch/powerpc/mm/book3s64/slb.c
>> >> >> > @@ -441,10 +441,39 @@ static void slb_cache_slbie_user(unsigned int index)
>> >> >> >  	asm volatile("slbie %0" : : "r" (slbie_data));
>> >> >> >  }
>> >> >> >  
>> >> >> > +static void preload_slb_entries(struct task_struct *tsk, struct mm_struct *mm)
>> >> >> Should this be explicitly inline or even __always_inline? I'm thinking
>> >> >> switch_slb is probably a fairly hot path on hash?
>> >> > 
>> >> > Yes absolutely. I'll make this change in v5.
>> >> > 
>> >> >>
>> >> >> > +{
>> >> >> > +	struct thread_info *ti = task_thread_info(tsk);
>> >> >> > +	unsigned char i;
>> >> >> > +
>> >> >> > +	/*
>> >> >> > +	 * We gradually age out SLBs after a number of context switches to
>> >> >> > +	 * reduce reload overhead of unused entries (like we do with FP/VEC
>> >> >> > +	 * reload). Each time we wrap 256 switches, take an entry out of the
>> >> >> > +	 * SLB preload cache.
>> >> >> > +	 */
>> >> >> > +	tsk->thread.load_slb++;
>> >> >> > +	if (!tsk->thread.load_slb) {
>> >> >> > +		unsigned long pc = KSTK_EIP(tsk);
>> >> >> > +
>> >> >> > +		preload_age(ti);
>> >> >> > +		preload_add(ti, pc);
>> >> >> > +	}
>> >> >> > +
>> >> >> > +	for (i = 0; i < ti->slb_preload_nr; i++) {
>> >> >> > +		unsigned char idx;
>> >> >> > +		unsigned long ea;
>> >> >> > +
>> >> >> > +		idx = (ti->slb_preload_tail + i) % SLB_PRELOAD_NR;
>> >> >> > +		ea = (unsigned long)ti->slb_preload_esid[idx] << SID_SHIFT;
>> >> >> > +
>> >> >> > +		slb_allocate_user(mm, ea);
>> >> >> > +	}
>> >> >> > +}
>> >> >> > +
>> >> >> >  /* Flush all user entries from the segment table of the current processor. */
>> >> >> >  void switch_slb(struct task_struct *tsk, struct mm_struct *mm)
>> >> >> >  {
>> >> >> > -	struct thread_info *ti = task_thread_info(tsk);
>> >> >> >  	unsigned char i;
>> >> >> >  
>> >> >> >  	/*
>> >> >> > @@ -502,29 +531,8 @@ void switch_slb(struct task_struct *tsk, struct mm_struct *mm)
>> >> >> >  
>> >> >> >  	copy_mm_to_paca(mm);
>> >> >> >  
>> >> >> > -	/*
>> >> >> > -	 * We gradually age out SLBs after a number of context switches to
>> >> >> > -	 * reduce reload overhead of unused entries (like we do with FP/VEC
>> >> >> > -	 * reload). Each time we wrap 256 switches, take an entry out of the
>> >> >> > -	 * SLB preload cache.
>> >> >> > -	 */
>> >> >> > -	tsk->thread.load_slb++;
>> >> >> > -	if (!tsk->thread.load_slb) {
>> >> >> > -		unsigned long pc = KSTK_EIP(tsk);
>> >> >> > -
>> >> >> > -		preload_age(ti);
>> >> >> > -		preload_add(ti, pc);
>> >> >> > -	}
>> >> >> > -
>> >> >> > -	for (i = 0; i < ti->slb_preload_nr; i++) {
>> >> >> > -		unsigned char idx;
>> >> >> > -		unsigned long ea;
>> >> >> > -
>> >> >> > -		idx = (ti->slb_preload_tail + i) % SLB_PRELOAD_NR;
>> >> >> > -		ea = (unsigned long)ti->slb_preload_esid[idx] << SID_SHIFT;
>> >> >> > -
>> >> >> > -		slb_allocate_user(mm, ea);
>> >> >> > -	}
>> >> >> > +	if (!mm->context.skip_slb_preload)
>> >> >> > +		preload_slb_entries(tsk, mm);
>> >> >>
>> >> >> Should this be wrapped in likely()?
>> >> > 
>> >> > Seems like a good idea - yes.
>> >> > 
>> >> >>
>> >> >> >  
>> >> >> >  	/*
>> >> >> >  	 * Synchronize slbmte preloads with possible subsequent user memory
>> >> >>
>> >> >> Right below this comment is the isync. It seems to be specifically
>> >> >> concerned with synchronising preloaded slbs. Do you need it if you are
>> >> >> skipping SLB preloads?
>> >> >>
>> >> >> It's probably not a big deal to have an extra isync in the fairly rare
>> >> >> path when we're skipping preloads, but I thought I'd check.
>> >> > 
>> >> > I don't _think_ we need the `isync` if we are skipping the SLB preloads,
>> >> > but then again it was always in the code-path before. If someone can
>> >> > make a compelling argument to drop it when not preloading SLBs I will,
>> >> > otherwise (considering some of the other non-obvious things I stepped
>> >> > into with the Hash code) I will keep it here for now.
>> >>
>> >> The ISA says slbia wants an isync afterward, so we probably should keep
>> >> it. The comment is a bit misleading in that case.
>> >>
>> >> Why isn't preloading appropriate for a temporary mm?
>> > 
>> > The preloaded entries come from the thread_info struct which isn't
>> > necessarily related to the temporary mm at all. I saw SLB multihits
>> > while testing this series with my LKDTM test where the "patching
>> > address" (userspace address for the temporary mapping w/
>> > write-permissions) ends up in a thread's preload list and then we
>> > explicitly insert it again in map_patch() when trying to patch. At that
>> > point the SLB multihit triggers.
>>
>> Hmm, so what if we use a mm, take some SLB faults then unuse it and
>> use a different one? I wonder if kthread_use_mm has existing problems
>> with this incorrect SLB preloading. Quite possibly. We should clear
>> the preload whenever mm changes I think. That should cover this as
>> well.
> 
> I actually did this initially but thought it was a bit too intrusive to
> include as part of this series and hurt performance. I agree that
> preloading the SLB from the thread may be a problem in general when
> switching in/out an mm.
> 
> kthread_use_mm may not be affected unless we explicitly insert some SLB
> entries which could collide with an existing preload (which I don't
> think we do anywhere until this series).

kthread_use_mm(mm1);
*ea = blah; /* slb preload[n++][ea] = va */
kthread_unuse_mm(mm1);

kthread_use_mm(mm2);
  switch_slb();
schedule();
  /* preload ea=va? */
x = *ea;
kthread_unuse_mm(mm2);

? I'm sure we'd have a bug in existing code if you're hitting a bug 
there.

Thanks,
Nick
Nicholas Piggin July 1, 2021, 11:30 a.m. UTC | #8
Excerpts from Nicholas Piggin's message of July 1, 2021 5:37 pm:
> Excerpts from Christopher M. Riedl's message of July 1, 2021 4:53 pm:
>> On Thu Jul 1, 2021 at 1:04 AM CDT, Nicholas Piggin wrote:
>>> Excerpts from Christopher M. Riedl's message of July 1, 2021 3:28 pm:
>>> > On Wed Jun 30, 2021 at 11:15 PM CDT, Nicholas Piggin wrote:
>>> >> Excerpts from Christopher M. Riedl's message of July 1, 2021 1:48 pm:
>>> >> > On Sun Jun 20, 2021 at 10:13 PM CDT, Daniel Axtens wrote:
>>> >> >> "Christopher M. Riedl" <cmr@linux.ibm.com> writes:
>>> >> >>
>>> >> >> > Switching to a different mm with Hash translation causes SLB entries to
>>> >> >> > be preloaded from the current thread_info. This reduces SLB faults, for
>>> >> >> > example when threads share a common mm but operate on different address
>>> >> >> > ranges.
>>> >> >> >
>>> >> >> > Preloading entries from the thread_info struct may not always be
>>> >> >> > appropriate - such as when switching to a temporary mm. Introduce a new
>>> >> >> > boolean in mm_context_t to skip the SLB preload entirely. Also move the
>>> >> >> > SLB preload code into a separate function since switch_slb() is already
>>> >> >> > quite long. The default behavior (preloading SLB entries from the
>>> >> >> > current thread_info struct) remains unchanged.
>>> >> >> >
>>> >> >> > Signed-off-by: Christopher M. Riedl <cmr@linux.ibm.com>
>>> >> >> >
>>> >> >> > ---
>>> >> >> >
>>> >> >> > v4:  * New to series.
>>> >> >> > ---
>>> >> >> >  arch/powerpc/include/asm/book3s/64/mmu.h |  3 ++
>>> >> >> >  arch/powerpc/include/asm/mmu_context.h   | 13 ++++++
>>> >> >> >  arch/powerpc/mm/book3s64/mmu_context.c   |  2 +
>>> >> >> >  arch/powerpc/mm/book3s64/slb.c           | 56 ++++++++++++++----------
>>> >> >> >  4 files changed, 50 insertions(+), 24 deletions(-)
>>> >> >> >
>>> >> >> > diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h b/arch/powerpc/include/asm/book3s/64/mmu.h
>>> >> >> > index eace8c3f7b0a1..b23a9dcdee5af 100644
>>> >> >> > --- a/arch/powerpc/include/asm/book3s/64/mmu.h
>>> >> >> > +++ b/arch/powerpc/include/asm/book3s/64/mmu.h
>>> >> >> > @@ -130,6 +130,9 @@ typedef struct {
>>> >> >> >  	u32 pkey_allocation_map;
>>> >> >> >  	s16 execute_only_pkey; /* key holding execute-only protection */
>>> >> >> >  #endif
>>> >> >> > +
>>> >> >> > +	/* Do not preload SLB entries from thread_info during switch_slb() */
>>> >> >> > +	bool skip_slb_preload;
>>> >> >> >  } mm_context_t;
>>> >> >> >  
>>> >> >> >  static inline u16 mm_ctx_user_psize(mm_context_t *ctx)
>>> >> >> > diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h
>>> >> >> > index 4bc45d3ed8b0e..264787e90b1a1 100644
>>> >> >> > --- a/arch/powerpc/include/asm/mmu_context.h
>>> >> >> > +++ b/arch/powerpc/include/asm/mmu_context.h
>>> >> >> > @@ -298,6 +298,19 @@ static inline int arch_dup_mmap(struct mm_struct *oldmm,
>>> >> >> >  	return 0;
>>> >> >> >  }
>>> >> >> >  
>>> >> >> > +#ifdef CONFIG_PPC_BOOK3S_64
>>> >> >> > +
>>> >> >> > +static inline void skip_slb_preload_mm(struct mm_struct *mm)
>>> >> >> > +{
>>> >> >> > +	mm->context.skip_slb_preload = true;
>>> >> >> > +}
>>> >> >> > +
>>> >> >> > +#else
>>> >> >> > +
>>> >> >> > +static inline void skip_slb_preload_mm(struct mm_struct *mm) {}
>>> >> >> > +
>>> >> >> > +#endif /* CONFIG_PPC_BOOK3S_64 */
>>> >> >> > +
>>> >> >> >  #include <asm-generic/mmu_context.h>
>>> >> >> >  
>>> >> >> >  #endif /* __KERNEL__ */
>>> >> >> > diff --git a/arch/powerpc/mm/book3s64/mmu_context.c b/arch/powerpc/mm/book3s64/mmu_context.c
>>> >> >> > index c10fc8a72fb37..3479910264c59 100644
>>> >> >> > --- a/arch/powerpc/mm/book3s64/mmu_context.c
>>> >> >> > +++ b/arch/powerpc/mm/book3s64/mmu_context.c
>>> >> >> > @@ -202,6 +202,8 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
>>> >> >> >  	atomic_set(&mm->context.active_cpus, 0);
>>> >> >> >  	atomic_set(&mm->context.copros, 0);
>>> >> >> >  
>>> >> >> > +	mm->context.skip_slb_preload = false;
>>> >> >> > +
>>> >> >> >  	return 0;
>>> >> >> >  }
>>> >> >> >  
>>> >> >> > diff --git a/arch/powerpc/mm/book3s64/slb.c b/arch/powerpc/mm/book3s64/slb.c
>>> >> >> > index c91bd85eb90e3..da0836cb855af 100644
>>> >> >> > --- a/arch/powerpc/mm/book3s64/slb.c
>>> >> >> > +++ b/arch/powerpc/mm/book3s64/slb.c
>>> >> >> > @@ -441,10 +441,39 @@ static void slb_cache_slbie_user(unsigned int index)
>>> >> >> >  	asm volatile("slbie %0" : : "r" (slbie_data));
>>> >> >> >  }
>>> >> >> >  
>>> >> >> > +static void preload_slb_entries(struct task_struct *tsk, struct mm_struct *mm)
>>> >> >> Should this be explicitly inline or even __always_inline? I'm thinking
>>> >> >> switch_slb is probably a fairly hot path on hash?
>>> >> > 
>>> >> > Yes absolutely. I'll make this change in v5.
>>> >> > 
>>> >> >>
>>> >> >> > +{
>>> >> >> > +	struct thread_info *ti = task_thread_info(tsk);
>>> >> >> > +	unsigned char i;
>>> >> >> > +
>>> >> >> > +	/*
>>> >> >> > +	 * We gradually age out SLBs after a number of context switches to
>>> >> >> > +	 * reduce reload overhead of unused entries (like we do with FP/VEC
>>> >> >> > +	 * reload). Each time we wrap 256 switches, take an entry out of the
>>> >> >> > +	 * SLB preload cache.
>>> >> >> > +	 */
>>> >> >> > +	tsk->thread.load_slb++;
>>> >> >> > +	if (!tsk->thread.load_slb) {
>>> >> >> > +		unsigned long pc = KSTK_EIP(tsk);
>>> >> >> > +
>>> >> >> > +		preload_age(ti);
>>> >> >> > +		preload_add(ti, pc);
>>> >> >> > +	}
>>> >> >> > +
>>> >> >> > +	for (i = 0; i < ti->slb_preload_nr; i++) {
>>> >> >> > +		unsigned char idx;
>>> >> >> > +		unsigned long ea;
>>> >> >> > +
>>> >> >> > +		idx = (ti->slb_preload_tail + i) % SLB_PRELOAD_NR;
>>> >> >> > +		ea = (unsigned long)ti->slb_preload_esid[idx] << SID_SHIFT;
>>> >> >> > +
>>> >> >> > +		slb_allocate_user(mm, ea);
>>> >> >> > +	}
>>> >> >> > +}
>>> >> >> > +
>>> >> >> >  /* Flush all user entries from the segment table of the current processor. */
>>> >> >> >  void switch_slb(struct task_struct *tsk, struct mm_struct *mm)
>>> >> >> >  {
>>> >> >> > -	struct thread_info *ti = task_thread_info(tsk);
>>> >> >> >  	unsigned char i;
>>> >> >> >  
>>> >> >> >  	/*
>>> >> >> > @@ -502,29 +531,8 @@ void switch_slb(struct task_struct *tsk, struct mm_struct *mm)
>>> >> >> >  
>>> >> >> >  	copy_mm_to_paca(mm);
>>> >> >> >  
>>> >> >> > -	/*
>>> >> >> > -	 * We gradually age out SLBs after a number of context switches to
>>> >> >> > -	 * reduce reload overhead of unused entries (like we do with FP/VEC
>>> >> >> > -	 * reload). Each time we wrap 256 switches, take an entry out of the
>>> >> >> > -	 * SLB preload cache.
>>> >> >> > -	 */
>>> >> >> > -	tsk->thread.load_slb++;
>>> >> >> > -	if (!tsk->thread.load_slb) {
>>> >> >> > -		unsigned long pc = KSTK_EIP(tsk);
>>> >> >> > -
>>> >> >> > -		preload_age(ti);
>>> >> >> > -		preload_add(ti, pc);
>>> >> >> > -	}
>>> >> >> > -
>>> >> >> > -	for (i = 0; i < ti->slb_preload_nr; i++) {
>>> >> >> > -		unsigned char idx;
>>> >> >> > -		unsigned long ea;
>>> >> >> > -
>>> >> >> > -		idx = (ti->slb_preload_tail + i) % SLB_PRELOAD_NR;
>>> >> >> > -		ea = (unsigned long)ti->slb_preload_esid[idx] << SID_SHIFT;
>>> >> >> > -
>>> >> >> > -		slb_allocate_user(mm, ea);
>>> >> >> > -	}
>>> >> >> > +	if (!mm->context.skip_slb_preload)
>>> >> >> > +		preload_slb_entries(tsk, mm);
>>> >> >>
>>> >> >> Should this be wrapped in likely()?
>>> >> > 
>>> >> > Seems like a good idea - yes.
>>> >> > 
>>> >> >>
>>> >> >> >  
>>> >> >> >  	/*
>>> >> >> >  	 * Synchronize slbmte preloads with possible subsequent user memory
>>> >> >>
>>> >> >> Right below this comment is the isync. It seems to be specifically
>>> >> >> concerned with synchronising preloaded slbs. Do you need it if you are
>>> >> >> skipping SLB preloads?
>>> >> >>
>>> >> >> It's probably not a big deal to have an extra isync in the fairly rare
>>> >> >> path when we're skipping preloads, but I thought I'd check.
>>> >> > 
>>> >> > I don't _think_ we need the `isync` if we are skipping the SLB preloads,
>>> >> > but then again it was always in the code-path before. If someone can
>>> >> > make a compelling argument to drop it when not preloading SLBs I will,
>>> >> > otherwise (considering some of the other non-obvious things I stepped
>>> >> > into with the Hash code) I will keep it here for now.
>>> >>
>>> >> The ISA says slbia wants an isync afterward, so we probably should keep
>>> >> it. The comment is a bit misleading in that case.
>>> >>
>>> >> Why isn't preloading appropriate for a temporary mm?
>>> > 
>>> > The preloaded entries come from the thread_info struct which isn't
>>> > necessarily related to the temporary mm at all. I saw SLB multihits
>>> > while testing this series with my LKDTM test where the "patching
>>> > address" (userspace address for the temporary mapping w/
>>> > write-permissions) ends up in a thread's preload list and then we
>>> > explicitly insert it again in map_patch() when trying to patch. At that
>>> > point the SLB multihit triggers.
>>>
>>> Hmm, so what if we use a mm, take some SLB faults then unuse it and
>>> use a different one? I wonder if kthread_use_mm has existing problems
>>> with this incorrect SLB preloading. Quite possibly. We should clear
>>> the preload whenever mm changes I think. That should cover this as
>>> well.
>> 
>> I actually did this initially but thought it was a bit too intrusive to
>> include as part of this series and hurt performance. I agree that
>> preloading the SLB from the thread may be a problem in general when
>> switching in/out an mm.
>> 
>> kthread_use_mm may not be affected unless we explicitly insert some SLB
>> entries which could collide with an existing preload (which I don't
>> think we do anywhere until this series).
> 
> kthread_use_mm(mm1);
> *ea = blah; /* slb preload[n++][ea] = va */
> kthread_unuse_mm(mm1);
> 
> kthread_use_mm(mm2);
>   switch_slb();
> schedule();
>   /* preload ea=va? */
> x = *ea;
> kthread_unuse_mm(mm2);
> 
> ? I'm sure we'd have a bug in existing code if you're hitting a bug 
> there.

Something like this should prevent it, I think. I thought there was a
better arch hook for it, but there doesn't seem to be one. I have an
unexplained SLB crash bug somewhere too, so I'd better check whether it
matches...

Thanks,
Nick

diff --git a/arch/powerpc/mm/book3s64/slb.c b/arch/powerpc/mm/book3s64/slb.c
index c91bd85eb90e..cb8c8a5d861e 100644
--- a/arch/powerpc/mm/book3s64/slb.c
+++ b/arch/powerpc/mm/book3s64/slb.c
@@ -502,6 +502,9 @@ void switch_slb(struct task_struct *tsk, struct mm_struct *mm)
 
        copy_mm_to_paca(mm);
 
+       if (unlikely(tsk->flags & PF_KTHREAD))
+               goto no_preload;
+
        /*
         * We gradually age out SLBs after a number of context switches to
         * reduce reload overhead of unused entries (like we do with FP/VEC
@@ -526,10 +529,11 @@ void switch_slb(struct task_struct *tsk, struct mm_struct *mm)
                slb_allocate_user(mm, ea);
        }
 
+no_preload:
        /*
-        * Synchronize slbmte preloads with possible subsequent user memory
-        * address accesses by the kernel (user mode won't happen until
-        * rfid, which is safe).
+        * Synchronize slbias and slbmte preloads with possible subsequent user
+        * memory address accesses by the kernel (user mode won't happen until
+        * rfid, which is synchronizing).
         */
        isync();
 }
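
The other idea from earlier in the thread, clearing the preload whenever
the mm changes, could look roughly like the sketch below. preload_clear()
is a hypothetical helper (it does not exist in slb.c today), included only
to show the shape of that alternative:

/*
 * Hypothetical: drop all preload hints, e.g. when a thread starts running
 * on an mm other than the one the hints were collected for.
 */
static void preload_clear(struct thread_info *ti)
{
	ti->slb_preload_nr = 0;
	ti->slb_preload_tail = 0;
}

It would sit next to preload_add()/preload_age() and be called from the
mm-switching path, at the cost of losing the preload benefit across such
switches.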
Christopher M. Riedl July 9, 2021, 4:55 a.m. UTC | #9
On Thu Jul 1, 2021 at 2:37 AM CDT, Nicholas Piggin wrote:
> Excerpts from Christopher M. Riedl's message of July 1, 2021 4:53 pm:
> > On Thu Jul 1, 2021 at 1:04 AM CDT, Nicholas Piggin wrote:
> >> Excerpts from Christopher M. Riedl's message of July 1, 2021 3:28 pm:
> >> > On Wed Jun 30, 2021 at 11:15 PM CDT, Nicholas Piggin wrote:
> >> >> Excerpts from Christopher M. Riedl's message of July 1, 2021 1:48 pm:
> >> >> > On Sun Jun 20, 2021 at 10:13 PM CDT, Daniel Axtens wrote:
> >> >> >> "Christopher M. Riedl" <cmr@linux.ibm.com> writes:
> >> >> >>
> >> >> >> > Switching to a different mm with Hash translation causes SLB entries to
> >> >> >> > be preloaded from the current thread_info. This reduces SLB faults, for
> >> >> >> > example when threads share a common mm but operate on different address
> >> >> >> > ranges.
> >> >> >> >
> >> >> >> > Preloading entries from the thread_info struct may not always be
> >> >> >> > appropriate - such as when switching to a temporary mm. Introduce a new
> >> >> >> > boolean in mm_context_t to skip the SLB preload entirely. Also move the
> >> >> >> > SLB preload code into a separate function since switch_slb() is already
> >> >> >> > quite long. The default behavior (preloading SLB entries from the
> >> >> >> > current thread_info struct) remains unchanged.
> >> >> >> >
> >> >> >> > Signed-off-by: Christopher M. Riedl <cmr@linux.ibm.com>
> >> >> >> >
> >> >> >> > ---
> >> >> >> >
> >> >> >> > v4:  * New to series.
> >> >> >> > ---
> >> >> >> >  arch/powerpc/include/asm/book3s/64/mmu.h |  3 ++
> >> >> >> >  arch/powerpc/include/asm/mmu_context.h   | 13 ++++++
> >> >> >> >  arch/powerpc/mm/book3s64/mmu_context.c   |  2 +
> >> >> >> >  arch/powerpc/mm/book3s64/slb.c           | 56 ++++++++++++++----------
> >> >> >> >  4 files changed, 50 insertions(+), 24 deletions(-)
> >> >> >> >
> >> >> >> > diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h b/arch/powerpc/include/asm/book3s/64/mmu.h
> >> >> >> > index eace8c3f7b0a1..b23a9dcdee5af 100644
> >> >> >> > --- a/arch/powerpc/include/asm/book3s/64/mmu.h
> >> >> >> > +++ b/arch/powerpc/include/asm/book3s/64/mmu.h
> >> >> >> > @@ -130,6 +130,9 @@ typedef struct {
> >> >> >> >  	u32 pkey_allocation_map;
> >> >> >> >  	s16 execute_only_pkey; /* key holding execute-only protection */
> >> >> >> >  #endif
> >> >> >> > +
> >> >> >> > +	/* Do not preload SLB entries from thread_info during switch_slb() */
> >> >> >> > +	bool skip_slb_preload;
> >> >> >> >  } mm_context_t;
> >> >> >> >  
> >> >> >> >  static inline u16 mm_ctx_user_psize(mm_context_t *ctx)
> >> >> >> > diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h
> >> >> >> > index 4bc45d3ed8b0e..264787e90b1a1 100644
> >> >> >> > --- a/arch/powerpc/include/asm/mmu_context.h
> >> >> >> > +++ b/arch/powerpc/include/asm/mmu_context.h
> >> >> >> > @@ -298,6 +298,19 @@ static inline int arch_dup_mmap(struct mm_struct *oldmm,
> >> >> >> >  	return 0;
> >> >> >> >  }
> >> >> >> >  
> >> >> >> > +#ifdef CONFIG_PPC_BOOK3S_64
> >> >> >> > +
> >> >> >> > +static inline void skip_slb_preload_mm(struct mm_struct *mm)
> >> >> >> > +{
> >> >> >> > +	mm->context.skip_slb_preload = true;
> >> >> >> > +}
> >> >> >> > +
> >> >> >> > +#else
> >> >> >> > +
> >> >> >> > +static inline void skip_slb_preload_mm(struct mm_struct *mm) {}
> >> >> >> > +
> >> >> >> > +#endif /* CONFIG_PPC_BOOK3S_64 */
> >> >> >> > +
> >> >> >> >  #include <asm-generic/mmu_context.h>
> >> >> >> >  
> >> >> >> >  #endif /* __KERNEL__ */
> >> >> >> > diff --git a/arch/powerpc/mm/book3s64/mmu_context.c b/arch/powerpc/mm/book3s64/mmu_context.c
> >> >> >> > index c10fc8a72fb37..3479910264c59 100644
> >> >> >> > --- a/arch/powerpc/mm/book3s64/mmu_context.c
> >> >> >> > +++ b/arch/powerpc/mm/book3s64/mmu_context.c
> >> >> >> > @@ -202,6 +202,8 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
> >> >> >> >  	atomic_set(&mm->context.active_cpus, 0);
> >> >> >> >  	atomic_set(&mm->context.copros, 0);
> >> >> >> >  
> >> >> >> > +	mm->context.skip_slb_preload = false;
> >> >> >> > +
> >> >> >> >  	return 0;
> >> >> >> >  }
> >> >> >> >  
> >> >> >> > diff --git a/arch/powerpc/mm/book3s64/slb.c b/arch/powerpc/mm/book3s64/slb.c
> >> >> >> > index c91bd85eb90e3..da0836cb855af 100644
> >> >> >> > --- a/arch/powerpc/mm/book3s64/slb.c
> >> >> >> > +++ b/arch/powerpc/mm/book3s64/slb.c
> >> >> >> > @@ -441,10 +441,39 @@ static void slb_cache_slbie_user(unsigned int index)
> >> >> >> >  	asm volatile("slbie %0" : : "r" (slbie_data));
> >> >> >> >  }
> >> >> >> >  
> >> >> >> > +static void preload_slb_entries(struct task_struct *tsk, struct mm_struct *mm)
> >> >> >> Should this be explicitly inline or even __always_inline? I'm thinking
> >> >> >> switch_slb is probably a fairly hot path on hash?
> >> >> > 
> >> >> > Yes absolutely. I'll make this change in v5.
> >> >> > 
> >> >> >>
> >> >> >> > +{
> >> >> >> > +	struct thread_info *ti = task_thread_info(tsk);
> >> >> >> > +	unsigned char i;
> >> >> >> > +
> >> >> >> > +	/*
> >> >> >> > +	 * We gradually age out SLBs after a number of context switches to
> >> >> >> > +	 * reduce reload overhead of unused entries (like we do with FP/VEC
> >> >> >> > +	 * reload). Each time we wrap 256 switches, take an entry out of the
> >> >> >> > +	 * SLB preload cache.
> >> >> >> > +	 */
> >> >> >> > +	tsk->thread.load_slb++;
> >> >> >> > +	if (!tsk->thread.load_slb) {
> >> >> >> > +		unsigned long pc = KSTK_EIP(tsk);
> >> >> >> > +
> >> >> >> > +		preload_age(ti);
> >> >> >> > +		preload_add(ti, pc);
> >> >> >> > +	}
> >> >> >> > +
> >> >> >> > +	for (i = 0; i < ti->slb_preload_nr; i++) {
> >> >> >> > +		unsigned char idx;
> >> >> >> > +		unsigned long ea;
> >> >> >> > +
> >> >> >> > +		idx = (ti->slb_preload_tail + i) % SLB_PRELOAD_NR;
> >> >> >> > +		ea = (unsigned long)ti->slb_preload_esid[idx] << SID_SHIFT;
> >> >> >> > +
> >> >> >> > +		slb_allocate_user(mm, ea);
> >> >> >> > +	}
> >> >> >> > +}
> >> >> >> > +
> >> >> >> >  /* Flush all user entries from the segment table of the current processor. */
> >> >> >> >  void switch_slb(struct task_struct *tsk, struct mm_struct *mm)
> >> >> >> >  {
> >> >> >> > -	struct thread_info *ti = task_thread_info(tsk);
> >> >> >> >  	unsigned char i;
> >> >> >> >  
> >> >> >> >  	/*
> >> >> >> > @@ -502,29 +531,8 @@ void switch_slb(struct task_struct *tsk, struct mm_struct *mm)
> >> >> >> >  
> >> >> >> >  	copy_mm_to_paca(mm);
> >> >> >> >  
> >> >> >> > -	/*
> >> >> >> > -	 * We gradually age out SLBs after a number of context switches to
> >> >> >> > -	 * reduce reload overhead of unused entries (like we do with FP/VEC
> >> >> >> > -	 * reload). Each time we wrap 256 switches, take an entry out of the
> >> >> >> > -	 * SLB preload cache.
> >> >> >> > -	 */
> >> >> >> > -	tsk->thread.load_slb++;
> >> >> >> > -	if (!tsk->thread.load_slb) {
> >> >> >> > -		unsigned long pc = KSTK_EIP(tsk);
> >> >> >> > -
> >> >> >> > -		preload_age(ti);
> >> >> >> > -		preload_add(ti, pc);
> >> >> >> > -	}
> >> >> >> > -
> >> >> >> > -	for (i = 0; i < ti->slb_preload_nr; i++) {
> >> >> >> > -		unsigned char idx;
> >> >> >> > -		unsigned long ea;
> >> >> >> > -
> >> >> >> > -		idx = (ti->slb_preload_tail + i) % SLB_PRELOAD_NR;
> >> >> >> > -		ea = (unsigned long)ti->slb_preload_esid[idx] << SID_SHIFT;
> >> >> >> > -
> >> >> >> > -		slb_allocate_user(mm, ea);
> >> >> >> > -	}
> >> >> >> > +	if (!mm->context.skip_slb_preload)
> >> >> >> > +		preload_slb_entries(tsk, mm);
> >> >> >>
> >> >> >> Should this be wrapped in likely()?
> >> >> > 
> >> >> > Seems like a good idea - yes.
> >> >> > 
> >> >> >>
> >> >> >> >  
> >> >> >> >  	/*
> >> >> >> >  	 * Synchronize slbmte preloads with possible subsequent user memory
> >> >> >>
> >> >> >> Right below this comment is the isync. It seems to be specifically
> >> >> >> concerned with synchronising preloaded slbs. Do you need it if you are
> >> >> >> skipping SLB preloads?
> >> >> >>
> >> >> >> It's probably not a big deal to have an extra isync in the fairly rare
> >> >> >> path when we're skipping preloads, but I thought I'd check.
> >> >> > 
> >> >> > I don't _think_ we need the `isync` if we are skipping the SLB preloads,
> >> >> > but then again it was always in the code-path before. If someone can
> >> >> > make a compelling argument to drop it when not preloading SLBs I will,
> >> >> > otherwise (considering some of the other non-obvious things I stepped
> >> >> > into with the Hash code) I will keep it here for now.
> >> >>
> >> >> The ISA says slbia wants an isync afterward, so we probably should keep
> >> >> it. The comment is a bit misleading in that case.
> >> >>
> >> >> Why isn't preloading appropriate for a temporary mm?
> >> > 
> >> > The preloaded entries come from the thread_info struct which isn't
> >> > necessarily related to the temporary mm at all. I saw SLB multihits
> >> > while testing this series with my LKDTM test where the "patching
> >> > address" (userspace address for the temporary mapping w/
> >> > write-permissions) ends up in a thread's preload list and then we
> >> > explicitly insert it again in map_patch() when trying to patch. At that
> >> > point the SLB multihit triggers.
> >>
> >> Hmm, so what if we use a mm, take some SLB faults then unuse it and
> >> use a different one? I wonder if kthread_use_mm has existing problems
> >> with this incorrect SLB preloading. Quite possibly. We should clear
> >> the preload whenever mm changes I think. That should cover this as
> >> well.
> > 
> > I actually did this initially but thought it was a bit too intrusive to
> > include as part of this series and hurt performance. I agree that
> > preloading the SLB from the thread may be a problem in general when
> > switching in/out an mm.
> > 
> > kthread_use_mm may not be affected unless we explicitly insert some SLB
> > entries which could collide with an existing preload (which I don't
> > think we do anywhere until this series).
>
> kthread_use_mm(mm1);
> *ea = blah; /* slb preload[n++][ea] = va */
> kthread_unuse_mm(mm1);
>
> kthread_use_mm(mm2);
> switch_slb();
> schedule();
> /* preload ea=va? */
> x = *ea;
> kthread_unuse_mm(mm2);
>
> ? I'm sure we'd have a bug in existing code if you're hitting a bug
> there.

Not exactly: the SLB multihit happens because of the new code in this
series, specifically the slb_allocate_user() call during patching:

put_user(..., ea); /* insert ea into thread's preload list */
...
patch_instruction(..., ea);
  map_patch()
    switch_slb(); /* preload slb entry for ea from thread_info */
    ...
    slb_allocate_user(..., ea); /* insert slb entry for ea */
    __put_kernel_nofault(..., ea); /* i.e. a 'stw' to patch */
    >>> SLB Multihit since we have an SLBE from the preload and the
        explicit slb_allocate_user()

Based on your other comments on this series I am dropping the Hash
support for the per-CPU temporary mm altogether for now, so this point is
moot. But I still think it doesn't make much sense to preload SLB entries
from a thread_info struct when switching to a completely different mm.
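
For illustration, here is a rough sketch of how a temporary patching mm
could opt out of the preload using the helper added by this patch. The
setup function name and the use of mm_alloc() are placeholders for this
sketch, not code from the series:

#include <linux/sched/mm.h>	/* mm_alloc() */
#include <asm/mmu_context.h>	/* skip_slb_preload_mm() */

/* Hypothetical setup for a temporary patching mm (sketch only). */
static struct mm_struct *create_temp_patching_mm(void)
{
	struct mm_struct *mm = mm_alloc();

	if (!mm)
		return NULL;

	/*
	 * The owning thread's SLB preload list has nothing to do with this
	 * mm, so tell switch_slb() not to replay it when we switch here.
	 */
	skip_slb_preload_mm(mm);

	return mm;
}

The caller would still need to set up the patching mapping itself before
switching to the mm; that part is out of scope for this sketch.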

>
> Thanks,
> Nick
diff mbox series

Patch

diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h b/arch/powerpc/include/asm/book3s/64/mmu.h
index eace8c3f7b0a1..b23a9dcdee5af 100644
--- a/arch/powerpc/include/asm/book3s/64/mmu.h
+++ b/arch/powerpc/include/asm/book3s/64/mmu.h
@@ -130,6 +130,9 @@  typedef struct {
 	u32 pkey_allocation_map;
 	s16 execute_only_pkey; /* key holding execute-only protection */
 #endif
+
+	/* Do not preload SLB entries from thread_info during switch_slb() */
+	bool skip_slb_preload;
 } mm_context_t;
 
 static inline u16 mm_ctx_user_psize(mm_context_t *ctx)
diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h
index 4bc45d3ed8b0e..264787e90b1a1 100644
--- a/arch/powerpc/include/asm/mmu_context.h
+++ b/arch/powerpc/include/asm/mmu_context.h
@@ -298,6 +298,19 @@  static inline int arch_dup_mmap(struct mm_struct *oldmm,
 	return 0;
 }
 
+#ifdef CONFIG_PPC_BOOK3S_64
+
+static inline void skip_slb_preload_mm(struct mm_struct *mm)
+{
+	mm->context.skip_slb_preload = true;
+}
+
+#else
+
+static inline void skip_slb_preload_mm(struct mm_struct *mm) {}
+
+#endif /* CONFIG_PPC_BOOK3S_64 */
+
 #include <asm-generic/mmu_context.h>
 
 #endif /* __KERNEL__ */
diff --git a/arch/powerpc/mm/book3s64/mmu_context.c b/arch/powerpc/mm/book3s64/mmu_context.c
index c10fc8a72fb37..3479910264c59 100644
--- a/arch/powerpc/mm/book3s64/mmu_context.c
+++ b/arch/powerpc/mm/book3s64/mmu_context.c
@@ -202,6 +202,8 @@  int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
 	atomic_set(&mm->context.active_cpus, 0);
 	atomic_set(&mm->context.copros, 0);
 
+	mm->context.skip_slb_preload = false;
+
 	return 0;
 }
 
diff --git a/arch/powerpc/mm/book3s64/slb.c b/arch/powerpc/mm/book3s64/slb.c
index c91bd85eb90e3..da0836cb855af 100644
--- a/arch/powerpc/mm/book3s64/slb.c
+++ b/arch/powerpc/mm/book3s64/slb.c
@@ -441,10 +441,39 @@  static void slb_cache_slbie_user(unsigned int index)
 	asm volatile("slbie %0" : : "r" (slbie_data));
 }
 
+static void preload_slb_entries(struct task_struct *tsk, struct mm_struct *mm)
+{
+	struct thread_info *ti = task_thread_info(tsk);
+	unsigned char i;
+
+	/*
+	 * We gradually age out SLBs after a number of context switches to
+	 * reduce reload overhead of unused entries (like we do with FP/VEC
+	 * reload). Each time we wrap 256 switches, take an entry out of the
+	 * SLB preload cache.
+	 */
+	tsk->thread.load_slb++;
+	if (!tsk->thread.load_slb) {
+		unsigned long pc = KSTK_EIP(tsk);
+
+		preload_age(ti);
+		preload_add(ti, pc);
+	}
+
+	for (i = 0; i < ti->slb_preload_nr; i++) {
+		unsigned char idx;
+		unsigned long ea;
+
+		idx = (ti->slb_preload_tail + i) % SLB_PRELOAD_NR;
+		ea = (unsigned long)ti->slb_preload_esid[idx] << SID_SHIFT;
+
+		slb_allocate_user(mm, ea);
+	}
+}
+
 /* Flush all user entries from the segment table of the current processor. */
 void switch_slb(struct task_struct *tsk, struct mm_struct *mm)
 {
-	struct thread_info *ti = task_thread_info(tsk);
 	unsigned char i;
 
 	/*
@@ -502,29 +531,8 @@  void switch_slb(struct task_struct *tsk, struct mm_struct *mm)
 
 	copy_mm_to_paca(mm);
 
-	/*
-	 * We gradually age out SLBs after a number of context switches to
-	 * reduce reload overhead of unused entries (like we do with FP/VEC
-	 * reload). Each time we wrap 256 switches, take an entry out of the
-	 * SLB preload cache.
-	 */
-	tsk->thread.load_slb++;
-	if (!tsk->thread.load_slb) {
-		unsigned long pc = KSTK_EIP(tsk);
-
-		preload_age(ti);
-		preload_add(ti, pc);
-	}
-
-	for (i = 0; i < ti->slb_preload_nr; i++) {
-		unsigned char idx;
-		unsigned long ea;
-
-		idx = (ti->slb_preload_tail + i) % SLB_PRELOAD_NR;
-		ea = (unsigned long)ti->slb_preload_esid[idx] << SID_SHIFT;
-
-		slb_allocate_user(mm, ea);
-	}
+	if (!mm->context.skip_slb_preload)
+		preload_slb_entries(tsk, mm);
 
 	/*
 	 * Synchronize slbmte preloads with possible subsequent user memory