[2/2] ppc: lazy flush_tlb_mm for nohash architectures

Message ID 1285351297-9999-3-git-send-email-shaggy@linux.vnet.ibm.com (mailing list archive)
State Not Applicable
Delegated to: Josh Boyer

Commit Message

Dave Kleikamp Sept. 24, 2010, 6:01 p.m. UTC
On PPC_MMU_NOHASH processors that support a large number of contexts,
implement a lazy flush_tlb_mm() that switches to a free context, marking
the old one stale.  The tlb is only flushed when no free contexts are
available.

The lazy tlb flushing is controlled by the global variable tlb_lazy_flush
which is set during init, dependent upon MMU_FTR_TYPE_47x.

Signed-off-by: Dave Kleikamp <shaggy@linux.vnet.ibm.com>
---
 arch/powerpc/mm/mmu_context_nohash.c |  154 +++++++++++++++++++++++++++++++---
 arch/powerpc/mm/mmu_decl.h           |    8 ++
 arch/powerpc/mm/tlb_nohash.c         |   28 +++++-
 3 files changed, 174 insertions(+), 16 deletions(-)

Comments

Benjamin Herrenschmidt Oct. 14, 2010, 12:52 a.m. UTC | #1
On Fri, 2010-09-24 at 13:01 -0500, Dave Kleikamp wrote:
> On PPC_MMU_NOHASH processors that support a large number of contexts,
> implement a lazy flush_tlb_mm() that switches to a free context, marking
> the old one stale.  The tlb is only flushed when no free contexts are
> available.
> 
> The lazy tlb flushing is controlled by the global variable tlb_lazy_flush
> which is set during init, dependent upon MMU_FTR_TYPE_47x.

Unless I'm mistaken, there are some issues with that patch... sorry for
the late review, I've been away for a couple of weeks.

> +int tlb_lazy_flush;
> +static int tlb_needs_flush[NR_CPUS];
> +static unsigned long *context_available_map;
> +static unsigned int nr_stale_contexts;

Now I understand what you're doing here, but wouldn't it have been
possible to re-use the existing stale map concept or do you reckon it
would have been too messy ?

At the very least, the "old style" stale map code and "new style" stale
TLB code should be more in sync, you may end up flushing the TLB
twice...

>  #define CTX_MAP_SIZE	\
>  	(sizeof(unsigned long) * (last_context / BITS_PER_LONG + 1))
>  
> +/*
> + * if another cpu recycled the stale contexts, we need to flush
> + * the local TLB, so that we may re-use those contexts
> + */
> +void flush_recycled_contexts(int cpu)
> +{
> +	int i;
> +
> +	if (tlb_needs_flush[cpu]) {
> +		pr_hard("[%d] flushing tlb\n", cpu);
> +		_tlbil_all();
> +		for (i = cpu_first_thread_in_core(cpu);
> +		     i <= cpu_last_thread_in_core(cpu); i++) {
> +			tlb_needs_flush[i] = 0;
> +		}
> +	}
> +}

So far so good :-)

>  /* Steal a context from a task that has one at the moment.
>   *
> @@ -147,7 +167,7 @@ static unsigned int steal_context_up(unsigned int id)
>  	pr_hardcont(" | steal %d from 0x%p", id, mm);
>  
>  	/* Flush the TLB for that context */
> -	local_flush_tlb_mm(mm);
> +	__local_flush_tlb_mm(mm);
>  
>  	/* Mark this mm has having no context anymore */
>  	mm->context.id = MMU_NO_CONTEXT;

Ok.

> @@ -161,13 +181,19 @@ static unsigned int steal_context_up(unsigned int id)
>  #ifdef DEBUG_MAP_CONSISTENCY
>  static void context_check_map(void)
>  {
> -	unsigned int id, nrf, nact;
> +	unsigned int id, nrf, nact, nstale;
>  
> -	nrf = nact = 0;
> +	nrf = nact = nstale = 0;
>  	for (id = first_context; id <= last_context; id++) {
>  		int used = test_bit(id, context_map);
> -		if (!used)
> -			nrf++;
> +		int allocated = tlb_lazy_flush &&
> +				test_bit(id, context_available_map);
> +		if (!used) {
> +			if (allocated)
> +				nstale++;
> +			else
> +				nrf++;
> +		}
>  		if (used != (context_mm[id] != NULL))
>  			pr_err("MMU: Context %d is %s and MM is %p !\n",
>  			       id, used ? "used" : "free", context_mm[id]);
> @@ -179,6 +205,11 @@ static void context_check_map(void)
>  		       nr_free_contexts, nrf);
>  		nr_free_contexts = nrf;
>  	}
> +	if (nstale != nr_stale_contexts) {
> +		pr_err("MMU: Stale context count out of sync ! (%d vs %d)\n",
> +		       nr_stale_contexts, nstale);
> +		nr_stale_contexts = nstale;
> +	}
>  	if (nact > num_online_cpus())
>  		pr_err("MMU: More active contexts than CPUs ! (%d vs %d)\n",
>  		       nact, num_online_cpus());

Cursory glance on the above looks ok.

> @@ -189,6 +220,38 @@ static void context_check_map(void)
>  static void context_check_map(void) { }
>  #endif
>  
> +/*
> + * On architectures that support a large number of contexts, the tlb
> + * can be flushed lazily by picking a new context and making the stale
> + * context unusable until a lazy tlb flush has been issued.
> + *
> + * context_available_map keeps track of both active and stale contexts,
> + * while context_map continues to track only active contexts.  When the
> + * lazy tlb flush is triggered, context_map is copied to
> + * context_available_map, making the once-stale contexts available again
> + */
> +static void recycle_stale_contexts(void)
> +{
> +	if (nr_free_contexts == 0 && nr_stale_contexts > 0) {

Do an early return and avoid the indentation instead ?
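For reference, that is just the same body with the test inverted (an untested sketch of the early-return form):

static void recycle_stale_contexts(void)
{
	unsigned int cpu = smp_processor_id();
	unsigned int i;

	/* Nothing to recycle unless we are out of free contexts and
	 * have stale ones to reclaim */
	if (nr_free_contexts != 0 || nr_stale_contexts == 0)
		return;

	pr_hard("[%d] recycling stale contexts\n", cpu);
	/* Time to flush the TLBs */
	memcpy(context_available_map, context_map, CTX_MAP_SIZE);
	nr_free_contexts = nr_stale_contexts;
	nr_stale_contexts = 0;
	for_each_online_cpu(i) {
		/* Threads on this core are covered by the _tlbil_all()
		 * below; every other CPU must flush before re-using a
		 * recycled context */
		if ((i < cpu_first_thread_in_core(cpu)) ||
		    (i > cpu_last_thread_in_core(cpu)))
			tlb_needs_flush[i] = 1;
		else
			tlb_needs_flush[i] = 0;	/* This core */
	}
	_tlbil_all();
}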

> +		unsigned int cpu = smp_processor_id();
> +		unsigned int i;
> +
> +		pr_hard("[%d] recycling stale contexts\n", cpu);
> +		/* Time to flush the TLB's */
> +		memcpy(context_available_map, context_map, CTX_MAP_SIZE);
> +		nr_free_contexts = nr_stale_contexts;
> +		nr_stale_contexts = 0;
> +		for_each_online_cpu(i) {
> +			if ((i < cpu_first_thread_in_core(cpu)) ||
> +			    (i > cpu_last_thread_in_core(cpu)))
> +				tlb_needs_flush[i] = 1;
> +			else
> +				tlb_needs_flush[i] = 0;	/* This core */
> +		}
> +		_tlbil_all();
> +	}
> +}
> +
>  void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next)
>  {
>  	unsigned int i, id, cpu = smp_processor_id();
> @@ -197,6 +260,8 @@ void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next)
>  	/* No lockless fast path .. yet */
>  	raw_spin_lock(&context_lock);
>  
> +	flush_recycled_contexts(cpu);
> +

Ok so here's the nasty one I think. You need to make sure that whenever
you pick something off the context_available_map, you've done the above
first within the same context_lock section right ? At least before you
actually -use- said context.

So far so good ... but steal_context can drop the lock iirc. So you may
need to re-flush there. Not sure that can happen in practice but better
safe than sorry. I would have preferred seeing that flush near the end
of the function to avoid such problem.
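For illustration, one (untested) way to do that with the patch's own flush_recycled_contexts() would be to flush once the final id is settled, for instance at the "stolen:" label, after any steal_context_*() call may have dropped and re-taken the lock:

	/* id has been chosen, either from the map or by stealing; the
	 * steal_context_*() paths may have dropped context_lock */
 stolen:
	/*
	 * Pick up any recycle done by another CPU while the lock was
	 * dropped, before the new context is actually used.
	 */
	flush_recycled_contexts(cpu);
	next_context = id + 1;
	context_mm[id] = next;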

Then, you can end up in cases where you flush the TLB, but your context
is marked stale due to stealing, and flush again. That's one of the
reason I wonder if we can consolidate a bit the two orthogonal
"staleness" concepts we have now.

Granted, stealing on 47x is unlikely, but I have reasons to think that
this lazy flushing will benefit 440 too.

>  	pr_hard("[%d] activating context for mm @%p, active=%d, id=%d",
>  		cpu, next, next->context.active, next->context.id);
>  
> @@ -227,7 +292,12 @@ void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next)
>  	id = next_context;
>  	if (id > last_context)
>  		id = first_context;
> -	map = context_map;
> +
> +	if (tlb_lazy_flush) {
> +		recycle_stale_contexts();
> +		map = context_available_map;
> +	} else
> +		map = context_map;
>  
>  	/* No more free contexts, let's try to steal one */
>  	if (nr_free_contexts == 0) {
> @@ -250,6 +320,13 @@ void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next)
>  		if (id > last_context)
>  			id = first_context;
>  	}
> +	if (tlb_lazy_flush)
> +		/*
> +		 * In the while loop above, we set the bit in
> +		 * context_available_map, it also needs to be set in
> +		 * context_map
> +		 */
> +		__set_bit(id, context_map);
>   stolen:
>  	next_context = id + 1;
>  	context_mm[id] = next;
> @@ -267,7 +344,7 @@ void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next)
>  			    id, cpu_first_thread_in_core(cpu),
>  			    cpu_last_thread_in_core(cpu));
>  
> -		local_flush_tlb_mm(next);
> +		__local_flush_tlb_mm(next);
>  
>  		/* XXX This clear should ultimately be part of local_flush_tlb_mm */
>  		for (i = cpu_first_thread_in_core(cpu);
> @@ -317,11 +394,61 @@ void destroy_context(struct mm_struct *mm)
>  		mm->context.active = 0;
>  #endif
>  		context_mm[id] = NULL;
> -		nr_free_contexts++;
> +
> +		if (tlb_lazy_flush)
> +			nr_stale_contexts++;
> +		else
> +			nr_free_contexts++;
>  	}
>  	raw_spin_unlock_irqrestore(&context_lock, flags);
>  }

Now...

> +/*
> + * This is called from flush_tlb_mm().  Mark the current context as stale
> + * and grab an available one.  The tlb will be flushed when no more
> + * contexts are available
> + */
> +void lazy_flush_context(struct mm_struct *mm)
> +{
> +	unsigned int id;
> +	unsigned long flags;
> +	unsigned long *map;
> +
> +	raw_spin_lock_irqsave(&context_lock, flags);
> +
> +	id = mm->context.id;
> +	if (unlikely(id == MMU_NO_CONTEXT))
> +		goto no_context;

First thing is ... you reproduce quite a bit of logic from
switch_mmu_context() here. Shouldn't it be abstracted in a separate
function ?
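Purely as an illustration, a hypothetical helper (the name and signature are made up here) could carry the common allocation tail for both paths; the caller would hold context_lock, have flushed any recycled contexts, and have made sure nr_free_contexts != 0:

/* Pick a free context id starting the search at 'start' and install
 * 'mm' into it.  Caller holds context_lock. */
static unsigned int install_new_context(struct mm_struct *mm,
					unsigned long *map,
					unsigned int start)
{
	unsigned int id = start;

	nr_free_contexts--;
	while (__test_and_set_bit(id, map)) {
		id = find_next_zero_bit(map, last_context + 1, id);
		if (id > last_context)
			id = first_context;
	}
	if (tlb_lazy_flush)
		/* the search was done on context_available_map; mirror
		 * the bit into context_map as well */
		__set_bit(id, context_map);
	next_context = id + 1;
	context_mm[id] = mm;
	mm->context.id = id;
	return id;
}

switch_mmu_context() would pass context_map or context_available_map and start at next_context, while lazy_flush_context() would pass context_available_map and start at last_context, as in the patch.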

The other thing here is that another CPU might have done a
recycle_stale_contexts() before you get here. IE. Your TLB may be stale.
Shouldn't you do a flush here ? Since you are picking up a new PID from
the context_available_map, it can potentially be stale if your tlb needs
flushing due to another CPU having just done a recycle.
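Concretely, that could look something like this (untested sketch on top of the patch), right after the lock is taken in lazy_flush_context():

	raw_spin_lock_irqsave(&context_lock, flags);

	/*
	 * Another CPU may have recycled the stale contexts since this
	 * CPU last flushed; pick that flush up before grabbing a PID
	 * from context_available_map, otherwise we could keep running
	 * on stale TLB entries under the new context.
	 */
	flush_recycled_contexts(smp_processor_id());

	id = mm->context.id;
	if (unlikely(id == MMU_NO_CONTEXT))
		goto no_context;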

> +	/*
> +	 * Make the existing context stale.  It remains in
> +	 * context_available_map as long as nr_free_contexts remains non-zero
> +	 */
> +	 __clear_bit(id, context_map);
> +	 context_mm[id] = NULL;
> +	 nr_stale_contexts++;
> +
> +	recycle_stale_contexts();
> +	BUG_ON(nr_free_contexts == 0);
> +
> +	nr_free_contexts--;
> +	id = last_context;
> +	map = context_available_map;
> +	while (__test_and_set_bit(id, map)) {
> +		id = find_next_zero_bit(map, last_context+1, id);
> +		if (id > last_context)
> +			id = first_context;
> +	}
> +	set_bit(id, context_map);
> +	next_context = id + 1;
> +	context_mm[id] = mm;
> +	mm->context.id = id;
> +	if (current->active_mm == mm)
> +		set_context(id, mm->pgd);
> +no_context:
> +	raw_spin_unlock_irqrestore(&context_lock, flags);
> +}
> +
>  #ifdef CONFIG_SMP
>  
>  static int __cpuinit mmu_context_cpu_notify(struct notifier_block *self,
> @@ -407,6 +534,7 @@ void __init mmu_context_init(void)
>  	} else if (mmu_has_feature(MMU_FTR_TYPE_47x)) {
>  		first_context = 1;
>  		last_context = 65535;
> +		tlb_lazy_flush = 1;
>  	} else {
>  		first_context = 1;
>  		last_context = 255;

Somebody should measure on 440, might actually improve perfs. Something
like a kernel compile sounds like a good test here.

> @@ -419,6 +547,8 @@ void __init mmu_context_init(void)
>  	 * Allocate the maps used by context management
>  	 */
>  	context_map = alloc_bootmem(CTX_MAP_SIZE);
> +	if (tlb_lazy_flush)
> +		context_available_map = alloc_bootmem(CTX_MAP_SIZE);
>  	context_mm = alloc_bootmem(sizeof(void *) * (last_context + 1));
>  	stale_map[0] = alloc_bootmem(CTX_MAP_SIZE);
>  
> diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h
> index 63b84a0..64240f1 100644
> --- a/arch/powerpc/mm/mmu_decl.h
> +++ b/arch/powerpc/mm/mmu_decl.h
> @@ -25,6 +25,14 @@
>  #ifdef CONFIG_PPC_MMU_NOHASH

Cheers,
Ben.
Dave Kleikamp Oct. 18, 2010, 9:57 p.m. UTC | #2
On Thu, 2010-10-14 at 11:52 +1100, Benjamin Herrenschmidt wrote:
> On Fri, 2010-09-24 at 13:01 -0500, Dave Kleikamp wrote:
> > On PPC_MMU_NOHASH processors that support a large number of contexts,
> > implement a lazy flush_tlb_mm() that switches to a free context, marking
> > the old one stale.  The tlb is only flushed when no free contexts are
> > available.
> > 
> > The lazy tlb flushing is controlled by the global variable tlb_lazy_flush
> > which is set during init, dependent upon MMU_FTR_TYPE_47x.
> 
> Unless I'm mistaken, there are some issues with that patch... sorry for
> the late review, I've been away for a couple of weeks.
> 
> > +int tlb_lazy_flush;
> > +static int tlb_needs_flush[NR_CPUS];
> > +static unsigned long *context_available_map;
> > +static unsigned int nr_stale_contexts;
> 
> Now I understand what you're doing here, but wouldn't it have been
> possible to re-use the existing stale map concept or do you reckon it
> would have been too messy ?

I didn't like the implementation of a per-core stale map.  The existing
implementation flushes the core's tlb, but only clears a specific entry
from the stale map.  It's dealing with the stale contexts one at a time,
whereas the new function is accumulating many stale contexts, with the
intent to do a single tlb flush per core.

Since I originally intended the new function only to be enabled on the
47x, I left the context-stealing code as untouched as possible thinking
it wouldn't be exercised in 47x-land.  This was probably narrow-minded,
and I should look at either 1) aligning the context-stealing code closer
to the new lazy flush code, or 2) dropping this code on the floor and
picking back up the new design that we worked on last year.


> At the very least, the "old style" stale map code and "new style" stale
> TLB code should be more in sync, you may end up flushing the TLB
> twice...

yeah.  if we enable this for 440, it is more likely to be an issue than
on 476.

> > @@ -189,6 +220,38 @@ static void context_check_map(void)
> >  static void context_check_map(void) { }
> >  #endif
> >  
> > +/*
> > + * On architectures that support a large number of contexts, the tlb
> > + * can be flushed lazily by picking a new context and making the stale
> > + * context unusable until a lazy tlb flush has been issued.
> > + *
> > + * context_available_map keeps track of both active and stale contexts,
> > + * while context_map continues to track only active contexts.  When the
> > + * lazy tlb flush is triggered, context_map is copied to
> > + * context_available_map, making the once-stale contexts available again
> > + */
> > +static void recycle_stale_contexts(void)
> > +{
> > +	if (nr_free_contexts == 0 && nr_stale_contexts > 0) {
> 
> Do an early return and avoid the indentation instead ?

Yeah, that makes sense.

> > +		unsigned int cpu = smp_processor_id();
> > +		unsigned int i;
> > +
> > +		pr_hard("[%d] recycling stale contexts\n", cpu);
> > +		/* Time to flush the TLB's */
> > +		memcpy(context_available_map, context_map, CTX_MAP_SIZE);
> > +		nr_free_contexts = nr_stale_contexts;
> > +		nr_stale_contexts = 0;
> > +		for_each_online_cpu(i) {
> > +			if ((i < cpu_first_thread_in_core(cpu)) ||
> > +			    (i > cpu_last_thread_in_core(cpu)))
> > +				tlb_needs_flush[i] = 1;
> > +			else
> > +				tlb_needs_flush[i] = 0;	/* This core */
> > +		}
> > +		_tlbil_all();
> > +	}
> > +}
> > +
> >  void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next)
> >  {
> >  	unsigned int i, id, cpu = smp_processor_id();
> > @@ -197,6 +260,8 @@ void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next)
> >  	/* No lockless fast path .. yet */
> >  	raw_spin_lock(&context_lock);
> >  
> > +	flush_recycled_contexts(cpu);
> > +
> 
> Ok so here's the nasty one I think. You need to make sure that whenever
> you pick something off the context_available_map, you've done the above
> first within the same context_lock section right ? At least before you
> actually -use- said context.

right.

> So far so good ... but steal_context can drop the lock iirc. So you may
> need to re-flush there. Not sure that can happen in practice but better
> safe than sorry. I would have preferred seeing that flush near the end
> of the function to avoid such problem.

I can fix this.  For 476, even if steal_context() could be called, I
don't think it would drop the lock.  But then again, if we
enable this for other architectures, it may be a possibility.

> Then, you can end up in cases where you flush the TLB, but your context
> is marked stale due to stealing, and flush again. That's one of the
> reason I wonder if we can consolidate a bit the two orthogonal
> "staleness" concepts we have now.
> 
> Granted, stealing on 47x is unlikely, but I have reasons to think that
> this lazy flushing will benefit 440 too.
> 
> >  	pr_hard("[%d] activating context for mm @%p, active=%d, id=%d",
> >  		cpu, next, next->context.active, next->context.id);
> >  
> > @@ -227,7 +292,12 @@ void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next)
> >  	id = next_context;
> >  	if (id > last_context)
> >  		id = first_context;
> > -	map = context_map;
> > +
> > +	if (tlb_lazy_flush) {
> > +		recycle_stale_contexts();
> > +		map = context_available_map;
> > +	} else
> > +		map = context_map;
> >  
> >  	/* No more free contexts, let's try to steal one */
> >  	if (nr_free_contexts == 0) {
> > @@ -250,6 +320,13 @@ void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next)
> >  		if (id > last_context)
> >  			id = first_context;
> >  	}
> > +	if (tlb_lazy_flush)
> > +		/*
> > +		 * In the while loop above, we set the bit in
> > +		 * context_available_map, it also needs to be set in
> > +		 * context_map
> > +		 */
> > +		__set_bit(id, context_map);
> >   stolen:
> >  	next_context = id + 1;
> >  	context_mm[id] = next;
> > @@ -267,7 +344,7 @@ void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next)
> >  			    id, cpu_first_thread_in_core(cpu),
> >  			    cpu_last_thread_in_core(cpu));
> >  
> > -		local_flush_tlb_mm(next);
> > +		__local_flush_tlb_mm(next);
> >  
> >  		/* XXX This clear should ultimately be part of local_flush_tlb_mm */
> >  		for (i = cpu_first_thread_in_core(cpu);
> > @@ -317,11 +394,61 @@ void destroy_context(struct mm_struct *mm)
> >  		mm->context.active = 0;
> >  #endif
> >  		context_mm[id] = NULL;
> > -		nr_free_contexts++;
> > +
> > +		if (tlb_lazy_flush)
> > +			nr_stale_contexts++;
> > +		else
> > +			nr_free_contexts++;
> >  	}
> >  	raw_spin_unlock_irqrestore(&context_lock, flags);
> >  }
> 
> Now...
> 
> > +/*
> > + * This is called from flush_tlb_mm().  Mark the current context as stale
> > + * and grab an available one.  The tlb will be flushed when no more
> > + * contexts are available
> > + */
> > +void lazy_flush_context(struct mm_struct *mm)
> > +{
> > +	unsigned int id;
> > +	unsigned long flags;
> > +	unsigned long *map;
> > +
> > +	raw_spin_lock_irqsave(&context_lock, flags);
> > +
> > +	id = mm->context.id;
> > +	if (unlikely(id == MMU_NO_CONTEXT))
> > +		goto no_context;
> 
> First thing is ... you reproduce quite a bit of logic from
> switch_mmu_context() here. Shouldn't it be abstracted in a separate
> function ?

I'm sure there's something I can do there.

> The other thing here is that another CPU might have done a
> recycle_stale_contexts() before you get here. IE. Your TLB may be stale.
> Shouldn't you do a flush here ? Since you are picking up a new PID from
> the context_available_map, it can potentially be stale if your tlb needs
> flushing due to another CPU having just done a recycle.

It looks like I missed that.  It does seem that there should be a flush
in here.

> > +	/*
> > +	 * Make the existing context stale.  It remains in
> > +	 * context_available_map as long as nr_free_contexts remains non-zero
> > +	 */
> > +	 __clear_bit(id, context_map);
> > +	 context_mm[id] = NULL;
> > +	 nr_stale_contexts++;
> > +
> > +	recycle_stale_contexts();
> > +	BUG_ON(nr_free_contexts == 0);
> > +
> > +	nr_free_contexts--;
> > +	id = last_context;
> > +	map = context_available_map;
> > +	while (__test_and_set_bit(id, map)) {
> > +		id = find_next_zero_bit(map, last_context+1, id);
> > +		if (id > last_context)
> > +			id = first_context;
> > +	}
> > +	set_bit(id, context_map);
> > +	next_context = id + 1;
> > +	context_mm[id] = mm;
> > +	mm->context.id = id;
> > +	if (current->active_mm == mm)
> > +		set_context(id, mm->pgd);
> > +no_context:
> > +	raw_spin_unlock_irqrestore(&context_lock, flags);
> > +}
> > +
> >  #ifdef CONFIG_SMP
> >  
> >  static int __cpuinit mmu_context_cpu_notify(struct notifier_block *self,
> > @@ -407,6 +534,7 @@ void __init mmu_context_init(void)
> >  	} else if (mmu_has_feature(MMU_FTR_TYPE_47x)) {
> >  		first_context = 1;
> >  		last_context = 65535;
> > +		tlb_lazy_flush = 1;
> >  	} else {
> >  		first_context = 1;
> >  		last_context = 255;
> 
> Somebody should measure on 440, might actually improve perfs. Something
> like a kernel compile sounds like a good test here.

I think I'm going to dust off the newer implementation based on your and
Paul's design.  I can probably get that in good working order without
too much more work, and it's something we need to look at eventually
anyway.  If I find anything that really gets in my way, I might fix up
this patch in the mean time.

Thanks,
Shaggy
Benjamin Herrenschmidt Oct. 18, 2010, 11:34 p.m. UTC | #3
On Mon, 2010-10-18 at 16:57 -0500, Dave Kleikamp wrote:
> 
> I didn't like the implementation of a per-core stale map.  The existing
> implementation flushes the core's tlb, but only clears a specific entry
> from the stale map.  It's dealing with the stale contexts one at a time,
> where the new function is accumulating many stale contexts, with the
> intent to do a single tlb flush per core.

Right, because I wrote it with A2 in mind which has a TLB invalidate by
PID instruction :-) So I don't flush the whole TLB there... but then
this instruction can take hundreds (or more) of cycles so it might not
necessarily be that great anyways...

> Since I originally intended the new function only to be enabled on the
> 47x, I left the context-stealing code as untouched as possible thinking
> it wouldn't be exercised in 47x-land.  This was probably narrow-minded,
> and I should look at either 1) aligning the context-stealing code closer
> to the new lazy flush code, or 2) dropping this code on the floor and
> picking back up the new design that we worked on last year.

In any case, we can probably merge your current stuff upstream (with
appropriate bug fixes if necessary) for now and move from there.

> > At the very least, the "old style" stale map code and "new style" stale
> > TLB code should be more in sync, you may end up flushing the TLB
> > twice...
> 
> yeah.  if we enable this for 440, it is more likely to be an issue than
> on 476.

Right.

> > >  void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next)
> > >  {
> > >     unsigned int i, id, cpu = smp_processor_id();
> > > @@ -197,6 +260,8 @@ void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next)
> > >     /* No lockless fast path .. yet */
> > >     raw_spin_lock(&context_lock);
> > >  
> > > +   flush_recycled_contexts(cpu);
> > > +
> > 
> > Ok so here's the nasty one I think. You need to make sure that whenever
> > you pick something off the context_available_map, you've done the above
> > first within the same context_lock section right ? At least before you
> > actually -use- said context.
> 
> right.
> 
> > So far so good ... but steal_context can drop the lock iirc. So you may
> > need to re-flush there. Not sure that can happen in practice but better
> > safe than sorry. I would have preferred seeing that flush near the end
> > of the function to avoid such problem.
> 
> I can fix this.  For 476, even if steal_context() could be called, I
> don't think it would drop the lock.  But then again, if we
> enable this for other architectures, it may be a possibility.

Yeah, it's a minor issue but I'd rather get the code right to avoid
surprises later.

 .../...

> > Now...
> > 
> > > +/*
> > > + * This is called from flush_tlb_mm().  Mark the current context as stale
> > > + * and grab an available one.  The tlb will be flushed when no more
> > > + * contexts are available
> > > + */
> > > +void lazy_flush_context(struct mm_struct *mm)
> > > +{
> > > +   unsigned int id;
> > > +   unsigned long flags;
> > > +   unsigned long *map;
> > > +
> > > +   raw_spin_lock_irqsave(&context_lock, flags);
> > > +
> > > +   id = mm->context.id;
> > > +   if (unlikely(id == MMU_NO_CONTEXT))
> > > +           goto no_context;
> > 
> > First thing is ... you reproduce quite a bit of logic from
> > switch_mmu_context() here. Shouldn't it be abstracted in a separate
> > function ?
> 
> I'm sure there's something I can do there.
> 
> > The other thing here is that another CPU might have done a
> > recycle_stale_contexts() before you get here. IE. Your TLB may be stale.
> > Shouldn't you do a flush here ? Since you are picking up a new PID from
> > the context_available_map, it can potentially be stale if your tlb needs
> > flushing due to another CPU having just done a recycle.
> 
> It looks like I missed that.  It does seem that there should be a flush
> in here.

Ok, so it wasn't just shit in my eyes :-)

> I think I'm going to dust off the newer implementation based on your and
> Paul's design.  I can probably get that in good working order without
> too much more work, and it's something we need to look at eventually
> anyway.  If I find anything that really gets in my way, I might fix up
> this patch in the mean time.

As you like. As I said earlier, I'm happy to merge a fixed version of
this one first if you think it's going to take a while to get the other
one right. However, I believe this is too late for the next merge window
anyways so that gives you some time ahead to play and make a decision.

Cheers,
Ben.

Patch

diff --git a/arch/powerpc/mm/mmu_context_nohash.c b/arch/powerpc/mm/mmu_context_nohash.c
index ddfd7ad..87c7dc2 100644
--- a/arch/powerpc/mm/mmu_context_nohash.c
+++ b/arch/powerpc/mm/mmu_context_nohash.c
@@ -17,10 +17,6 @@ 
  * TODO:
  *
  *   - The global context lock will not scale very well
- *   - The maps should be dynamically allocated to allow for processors
- *     that support more PID bits at runtime
- *   - Implement flush_tlb_mm() by making the context stale and picking
- *     a new one
  *   - More aggressively clear stale map bits and maybe find some way to
  *     also clear mm->cpu_vm_mask bits when processes are migrated
  */
@@ -52,6 +48,8 @@ 
 #include <asm/mmu_context.h>
 #include <asm/tlbflush.h>
 
+#include "mmu_decl.h"
+
 static unsigned int first_context, last_context;
 static unsigned int next_context, nr_free_contexts;
 static unsigned long *context_map;
@@ -59,9 +57,31 @@  static unsigned long *stale_map[NR_CPUS];
 static struct mm_struct **context_mm;
 static DEFINE_RAW_SPINLOCK(context_lock);
 
+int tlb_lazy_flush;
+static int tlb_needs_flush[NR_CPUS];
+static unsigned long *context_available_map;
+static unsigned int nr_stale_contexts;
+
 #define CTX_MAP_SIZE	\
 	(sizeof(unsigned long) * (last_context / BITS_PER_LONG + 1))
 
+/*
+ * if another cpu recycled the stale contexts, we need to flush
+ * the local TLB, so that we may re-use those contexts
+ */
+void flush_recycled_contexts(int cpu)
+{
+	int i;
+
+	if (tlb_needs_flush[cpu]) {
+		pr_hard("[%d] flushing tlb\n", cpu);
+		_tlbil_all();
+		for (i = cpu_first_thread_in_core(cpu);
+		     i <= cpu_last_thread_in_core(cpu); i++) {
+			tlb_needs_flush[i] = 0;
+		}
+	}
+}
 
 /* Steal a context from a task that has one at the moment.
  *
@@ -147,7 +167,7 @@  static unsigned int steal_context_up(unsigned int id)
 	pr_hardcont(" | steal %d from 0x%p", id, mm);
 
 	/* Flush the TLB for that context */
-	local_flush_tlb_mm(mm);
+	__local_flush_tlb_mm(mm);
 
 	/* Mark this mm has having no context anymore */
 	mm->context.id = MMU_NO_CONTEXT;
@@ -161,13 +181,19 @@  static unsigned int steal_context_up(unsigned int id)
 #ifdef DEBUG_MAP_CONSISTENCY
 static void context_check_map(void)
 {
-	unsigned int id, nrf, nact;
+	unsigned int id, nrf, nact, nstale;
 
-	nrf = nact = 0;
+	nrf = nact = nstale = 0;
 	for (id = first_context; id <= last_context; id++) {
 		int used = test_bit(id, context_map);
-		if (!used)
-			nrf++;
+		int allocated = tlb_lazy_flush &&
+				test_bit(id, context_available_map);
+		if (!used) {
+			if (allocated)
+				nstale++;
+			else
+				nrf++;
+		}
 		if (used != (context_mm[id] != NULL))
 			pr_err("MMU: Context %d is %s and MM is %p !\n",
 			       id, used ? "used" : "free", context_mm[id]);
@@ -179,6 +205,11 @@  static void context_check_map(void)
 		       nr_free_contexts, nrf);
 		nr_free_contexts = nrf;
 	}
+	if (nstale != nr_stale_contexts) {
+		pr_err("MMU: Stale context count out of sync ! (%d vs %d)\n",
+		       nr_stale_contexts, nstale);
+		nr_stale_contexts = nstale;
+	}
 	if (nact > num_online_cpus())
 		pr_err("MMU: More active contexts than CPUs ! (%d vs %d)\n",
 		       nact, num_online_cpus());
@@ -189,6 +220,38 @@  static void context_check_map(void)
 static void context_check_map(void) { }
 #endif
 
+/*
+ * On architectures that support a large number of contexts, the tlb
+ * can be flushed lazily by picking a new context and making the stale
+ * context unusable until a lazy tlb flush has been issued.
+ *
+ * context_available_map keeps track of both active and stale contexts,
+ * while context_map continues to track only active contexts.  When the
+ * lazy tlb flush is triggered, context_map is copied to
+ * context_available_map, making the once-stale contexts available again
+ */
+static void recycle_stale_contexts(void)
+{
+	if (nr_free_contexts == 0 && nr_stale_contexts > 0) {
+		unsigned int cpu = smp_processor_id();
+		unsigned int i;
+
+		pr_hard("[%d] recycling stale contexts\n", cpu);
+		/* Time to flush the TLB's */
+		memcpy(context_available_map, context_map, CTX_MAP_SIZE);
+		nr_free_contexts = nr_stale_contexts;
+		nr_stale_contexts = 0;
+		for_each_online_cpu(i) {
+			if ((i < cpu_first_thread_in_core(cpu)) ||
+			    (i > cpu_last_thread_in_core(cpu)))
+				tlb_needs_flush[i] = 1;
+			else
+				tlb_needs_flush[i] = 0;	/* This core */
+		}
+		_tlbil_all();
+	}
+}
+
 void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next)
 {
 	unsigned int i, id, cpu = smp_processor_id();
@@ -197,6 +260,8 @@  void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next)
 	/* No lockless fast path .. yet */
 	raw_spin_lock(&context_lock);
 
+	flush_recycled_contexts(cpu);
+
 	pr_hard("[%d] activating context for mm @%p, active=%d, id=%d",
 		cpu, next, next->context.active, next->context.id);
 
@@ -227,7 +292,12 @@  void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next)
 	id = next_context;
 	if (id > last_context)
 		id = first_context;
-	map = context_map;
+
+	if (tlb_lazy_flush) {
+		recycle_stale_contexts();
+		map = context_available_map;
+	} else
+		map = context_map;
 
 	/* No more free contexts, let's try to steal one */
 	if (nr_free_contexts == 0) {
@@ -250,6 +320,13 @@  void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next)
 		if (id > last_context)
 			id = first_context;
 	}
+	if (tlb_lazy_flush)
+		/*
+		 * In the while loop above, we set the bit in
+		 * context_available_map, it also needs to be set in
+		 * context_map
+		 */
+		__set_bit(id, context_map);
  stolen:
 	next_context = id + 1;
 	context_mm[id] = next;
@@ -267,7 +344,7 @@  void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next)
 			    id, cpu_first_thread_in_core(cpu),
 			    cpu_last_thread_in_core(cpu));
 
-		local_flush_tlb_mm(next);
+		__local_flush_tlb_mm(next);
 
 		/* XXX This clear should ultimately be part of local_flush_tlb_mm */
 		for (i = cpu_first_thread_in_core(cpu);
@@ -317,11 +394,61 @@  void destroy_context(struct mm_struct *mm)
 		mm->context.active = 0;
 #endif
 		context_mm[id] = NULL;
-		nr_free_contexts++;
+
+		if (tlb_lazy_flush)
+			nr_stale_contexts++;
+		else
+			nr_free_contexts++;
 	}
 	raw_spin_unlock_irqrestore(&context_lock, flags);
 }
 
+/*
+ * This is called from flush_tlb_mm().  Mark the current context as stale
+ * and grab an available one.  The tlb will be flushed when no more
+ * contexts are available
+ */
+void lazy_flush_context(struct mm_struct *mm)
+{
+	unsigned int id;
+	unsigned long flags;
+	unsigned long *map;
+
+	raw_spin_lock_irqsave(&context_lock, flags);
+
+	id = mm->context.id;
+	if (unlikely(id == MMU_NO_CONTEXT))
+		goto no_context;
+
+	/*
+	 * Make the existing context stale.  It remains in
+	 * context_available_map as long as nr_free_contexts remains non-zero
+	 */
+	 __clear_bit(id, context_map);
+	 context_mm[id] = NULL;
+	 nr_stale_contexts++;
+
+	recycle_stale_contexts();
+	BUG_ON(nr_free_contexts == 0);
+
+	nr_free_contexts--;
+	id = last_context;
+	map = context_available_map;
+	while (__test_and_set_bit(id, map)) {
+		id = find_next_zero_bit(map, last_context+1, id);
+		if (id > last_context)
+			id = first_context;
+	}
+	set_bit(id, context_map);
+	next_context = id + 1;
+	context_mm[id] = mm;
+	mm->context.id = id;
+	if (current->active_mm == mm)
+		set_context(id, mm->pgd);
+no_context:
+	raw_spin_unlock_irqrestore(&context_lock, flags);
+}
+
 #ifdef CONFIG_SMP
 
 static int __cpuinit mmu_context_cpu_notify(struct notifier_block *self,
@@ -407,6 +534,7 @@  void __init mmu_context_init(void)
 	} else if (mmu_has_feature(MMU_FTR_TYPE_47x)) {
 		first_context = 1;
 		last_context = 65535;
+		tlb_lazy_flush = 1;
 	} else {
 		first_context = 1;
 		last_context = 255;
@@ -419,6 +547,8 @@  void __init mmu_context_init(void)
 	 * Allocate the maps used by context management
 	 */
 	context_map = alloc_bootmem(CTX_MAP_SIZE);
+	if (tlb_lazy_flush)
+		context_available_map = alloc_bootmem(CTX_MAP_SIZE);
 	context_mm = alloc_bootmem(sizeof(void *) * (last_context + 1));
 	stale_map[0] = alloc_bootmem(CTX_MAP_SIZE);
 
diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h
index 63b84a0..64240f1 100644
--- a/arch/powerpc/mm/mmu_decl.h
+++ b/arch/powerpc/mm/mmu_decl.h
@@ -25,6 +25,14 @@ 
 #ifdef CONFIG_PPC_MMU_NOHASH
 
 /*
+ * Lazy tlb flush
+ */
+extern int tlb_lazy_flush;
+extern void flush_recycled_contexts(int);
+void lazy_flush_context(struct mm_struct *mm);
+void __local_flush_tlb_mm(struct mm_struct *mm);
+
+/*
  * On 40x and 8xx, we directly inline tlbia and tlbivax
  */
 #if defined(CONFIG_40x) || defined(CONFIG_8xx)
diff --git a/arch/powerpc/mm/tlb_nohash.c b/arch/powerpc/mm/tlb_nohash.c
index fe391e9..264d0ea 100644
--- a/arch/powerpc/mm/tlb_nohash.c
+++ b/arch/powerpc/mm/tlb_nohash.c
@@ -36,6 +36,7 @@ 
 #include <linux/spinlock.h>
 #include <linux/memblock.h>
 
+#include <asm/mmu_context.h>
 #include <asm/tlbflush.h>
 #include <asm/tlb.h>
 #include <asm/code-patching.h>
@@ -117,7 +118,7 @@  unsigned long linear_map_top;	/* Top of linear mapping */
 /*
  * These are the base non-SMP variants of page and mm flushing
  */
-void local_flush_tlb_mm(struct mm_struct *mm)
+void __local_flush_tlb_mm(struct mm_struct *mm)
 {
 	unsigned int pid;
 
@@ -127,6 +128,14 @@  void local_flush_tlb_mm(struct mm_struct *mm)
 		_tlbil_pid(pid);
 	preempt_enable();
 }
+
+void local_flush_tlb_mm(struct mm_struct *mm)
+{
+	if (tlb_lazy_flush)
+		lazy_flush_context(mm);
+	else
+		__local_flush_tlb_mm(mm);
+}
 EXPORT_SYMBOL(local_flush_tlb_mm);
 
 void __local_flush_tlb_page(struct mm_struct *mm, unsigned long vmaddr,
@@ -166,13 +175,19 @@  struct tlb_flush_param {
 	unsigned int pid;
 	unsigned int tsize;
 	unsigned int ind;
+	struct mm_struct *mm;
 };
 
 static void do_flush_tlb_mm_ipi(void *param)
 {
 	struct tlb_flush_param *p = param;
 
-	_tlbil_pid(p ? p->pid : 0);
+	if (tlb_lazy_flush && p) {
+		flush_recycled_contexts(smp_processor_id());
+		if (current->active_mm == p->mm)
+			set_context(p->pid, p->mm->pgd);
+	} else
+		_tlbil_pid(p ? p->pid : 0);
 }
 
 static void do_flush_tlb_page_ipi(void *param)
@@ -207,13 +222,18 @@  void flush_tlb_mm(struct mm_struct *mm)
 	pid = mm->context.id;
 	if (unlikely(pid == MMU_NO_CONTEXT))
 		goto no_context;
+	if (tlb_lazy_flush) {
+		lazy_flush_context(mm);
+		pid = mm->context.id;
+	}
 	if (!mm_is_core_local(mm)) {
-		struct tlb_flush_param p = { .pid = pid };
+		struct tlb_flush_param p = { .pid = pid, .mm = mm };
 		/* Ignores smp_processor_id() even if set. */
 		smp_call_function_many(mm_cpumask(mm),
 				       do_flush_tlb_mm_ipi, &p, 1);
 	}
-	_tlbil_pid(pid);
+	if (!tlb_lazy_flush)
+		_tlbil_pid(pid);
  no_context:
 	preempt_enable();
 }