mm, page_alloc: re-enable softirq use of per-cpu page allocator

Message ID 20170410150821.vcjlz7ntabtfsumm@techsingularity.net
State Not Applicable, archived
Delegated to: David Miller

Commit Message

Mel Gorman April 10, 2017, 3:08 p.m. UTC
From: Jesper Dangaard Brouer <brouer@redhat.com>

IRQ context was excluded from using the Per-Cpu-Pages (PCP) lists caching
of order-0 pages in commit 374ad05ab64d ("mm, page_alloc: only use per-cpu
allocator for irq-safe requests").

This unfortunately also excluded SoftIRQ context, which hurt performance
for the use-case of refilling DMA RX rings in softirq context.

This patch re-allows softirq context, which should be safe because BH/softirq
is disabled while accessing the lists.  PCP-list access from both hard-IRQ
and NMI context must still not be allowed.  Peter Zijlstra says in_nmi() code
never accesses the page allocator, thus it should be sufficient to only test
for !in_irq().

One concern with this change is adding a BH (enable) scheduling point at
both PCP alloc and free. If further concerns are highlighted by this patch,
the result will be to revert 374ad05ab64d and try again at a later date
to offset the irq enable/disable overhead.

Fixes: 374ad05ab64d ("mm, page_alloc: only use per-cpu allocator for irq-safe requests")
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
---
 mm/page_alloc.c | 26 +++++++++++++++++---------
 1 file changed, 17 insertions(+), 9 deletions(-)
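
For illustration, the context rule the patch applies at both the PCP alloc
and free call-sites can be sketched as follows. This is an illustration
only, not code from the patch; pcp_take_page() and buddy_alloc_page() are
invented helper names standing in for the pcplist and buddy paths.

/*
 * Sketch of the context check: softirq callers may use the PCP lists
 * because BH is disabled around list access; hard-IRQ callers, and
 * callers running with IRQs disabled, fall back to the buddy path.
 */
#include <linux/gfp.h>
#include <linux/interrupt.h>
#include <linux/preempt.h>

static bool pcp_allowed_context(void)
{
	/*
	 * in_irq(): running a hard interrupt handler.
	 * irqs_disabled(): local_bh_enable() must not be called with
	 * hard IRQs disabled, so such callers stay off the pcplists too.
	 */
	return !(in_irq() || irqs_disabled());
}

static struct page *order0_alloc_sketch(gfp_t gfp_mask)
{
	struct page *page;

	if (pcp_allowed_context()) {
		local_bh_disable();	/* excludes softirq on this CPU */
		page = pcp_take_page(gfp_mask);	/* hypothetical pcplist helper */
		local_bh_enable();	/* the BH scheduling point mentioned above */
	} else {
		page = buddy_alloc_page(gfp_mask); /* hypothetical buddy-path helper */
	}
	return page;
}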

Comments

Jesper Dangaard Brouer April 10, 2017, 8:53 p.m. UTC | #1
I would appreciate review of this patch. My micro-benchmarking shows we
basically return to the same page alloc+free cost as before 374ad05ab64d
("mm, page_alloc: only use per-cpu allocator for irq-safe requests"),
which sort of invalidates this attempt at optimizing the page allocator.
But Mel's micro-benchmarks still show an improvement.
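
(For reference, the kind of measurement loop behind these numbers looks
roughly like the sketch below. It is an illustration only, not the actual
benchmark module used for the figures above.)

/*
 * Rough sketch of an in-kernel page alloc+free micro-benchmark: time a
 * tight loop of order-0 allocations and frees and report the per-iteration
 * cost when the module is loaded.
 */
#include <linux/gfp.h>
#include <linux/ktime.h>
#include <linux/module.h>

static int __init page_bench_sketch_init(void)
{
	const int loops = 1000000;
	ktime_t start, stop;
	struct page *page;
	int i;

	start = ktime_get();
	for (i = 0; i < loops; i++) {
		page = alloc_pages(GFP_KERNEL, 0);	/* order-0 fast path */
		if (!page)
			break;
		__free_pages(page, 0);
	}
	stop = ktime_get();

	pr_info("page alloc+free cost: %lld ns per iteration\n",
		ktime_to_ns(ktime_sub(stop, start)) / loops);
	return 0;
}
module_init(page_bench_sketch_init);
MODULE_LICENSE("GPL");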

Notice the slowdown comes from me checking irqs_disabled() ... if
someone can spot a way to get rid of that check, then this patch will be
a win.

I'm traveling out of Montreal today, and will rerun my benchmarks when
I get home.  Tariq will also do some more testing with a 100G NIC (he
also participated in the Montreal conf, so he is likely in transit too).


On Mon, 10 Apr 2017 16:08:21 +0100
Mel Gorman <mgorman@techsingularity.net> wrote:

> From: Jesper Dangaard Brouer <brouer@redhat.com>
> 
> IRQ context was excluded from using the Per-Cpu-Pages (PCP) lists caching
> of order-0 pages in commit 374ad05ab64d ("mm, page_alloc: only use per-cpu
> allocator for irq-safe requests").
> 
> This unfortunately also excluded SoftIRQ context, which hurt performance
> for the use-case of refilling DMA RX rings in softirq context.
> 
> This patch re-allows softirq context, which should be safe because BH/softirq
> is disabled while accessing the lists.  PCP-list access from both hard-IRQ
> and NMI context must still not be allowed.  Peter Zijlstra says in_nmi() code
> never accesses the page allocator, thus it should be sufficient to only test
> for !in_irq().
> 
> One concern with this change is adding a BH (enable) scheduling point at
> both PCP alloc and free. If further concerns are highlighted by this patch,
> the result will be to revert 374ad05ab64d and try again at a later date
> to offset the irq enable/disable overhead.
> 
> Fixes: 374ad05ab64d ("mm, page_alloc: only use per-cpu allocator for irq-safe requests")
> Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
> Signed-off-by: Mel Gorman <mgorman@techsingularity.net>

Missing:

Reported-by: Tariq [...]

> ---
>  mm/page_alloc.c | 26 +++++++++++++++++---------
>  1 file changed, 17 insertions(+), 9 deletions(-)
> 
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 6cbde310abed..d7e986967910 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -2351,9 +2351,9 @@ static void drain_local_pages_wq(struct work_struct *work)
>  	 * cpu which is allright but we also have to make sure to not move to
>  	 * a different one.
>  	 */
> -	preempt_disable();
> +	local_bh_disable();
>  	drain_local_pages(NULL);
> -	preempt_enable();
> +	local_bh_enable();
>  }
>  
>  /*
> @@ -2481,7 +2481,11 @@ void free_hot_cold_page(struct page *page, bool cold)
>  	unsigned long pfn = page_to_pfn(page);
>  	int migratetype;
>  
> -	if (in_interrupt()) {
> +	/*
> +	 * Exclude (hard) IRQ and NMI context from using the pcplists.
> +	 * But allow softirq context, via disabling BH.
> +	 */
> +	if (in_irq() || irqs_disabled()) {
>  		__free_pages_ok(page, 0);
>  		return;
>  	}
> @@ -2491,7 +2495,7 @@ void free_hot_cold_page(struct page *page, bool cold)
>  
>  	migratetype = get_pfnblock_migratetype(page, pfn);
>  	set_pcppage_migratetype(page, migratetype);
> -	preempt_disable();
> +	local_bh_disable();
>  
>  	/*
>  	 * We only track unmovable, reclaimable and movable on pcp lists.
> @@ -2522,7 +2526,7 @@ void free_hot_cold_page(struct page *page, bool cold)
>  	}
>  
>  out:
> -	preempt_enable();
> +	local_bh_enable();
>  }
>  
>  /*
> @@ -2647,7 +2651,7 @@ static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype,
>  {
>  	struct page *page;
>  
> -	VM_BUG_ON(in_interrupt());
> +	VM_BUG_ON(in_irq() || irqs_disabled());
>  
>  	do {
>  		if (list_empty(list)) {
> @@ -2680,7 +2684,7 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone,
>  	bool cold = ((gfp_flags & __GFP_COLD) != 0);
>  	struct page *page;
>  
> -	preempt_disable();
> +	local_bh_disable();
>  	pcp = &this_cpu_ptr(zone->pageset)->pcp;
>  	list = &pcp->lists[migratetype];
>  	page = __rmqueue_pcplist(zone,  migratetype, cold, pcp, list);
> @@ -2688,7 +2692,7 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone,
>  		__count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
>  		zone_statistics(preferred_zone, zone);
>  	}
> -	preempt_enable();
> +	local_bh_enable();
>  	return page;
>  }
>  
> @@ -2704,7 +2708,11 @@ struct page *rmqueue(struct zone *preferred_zone,
>  	unsigned long flags;
>  	struct page *page;
>  
> -	if (likely(order == 0) && !in_interrupt()) {
> +	/*
> +	 * Exclude (hard) IRQ and NMI context from using the pcplists.
> +	 * But allow softirq context, via disabling BH.
> +	 */
> +	if (likely(order == 0) && !(in_irq() || irqs_disabled()) ) {
>  		page = rmqueue_pcplist(preferred_zone, zone, order,
>  				gfp_flags, migratetype);
>  		goto out;
Andrew Morton April 10, 2017, 9:26 p.m. UTC | #2
On Mon, 10 Apr 2017 16:08:21 +0100 Mel Gorman <mgorman@techsingularity.net> wrote:

> IRQ context was excluded from using the Per-Cpu-Pages (PCP) lists caching
> of order-0 pages in commit 374ad05ab64d ("mm, page_alloc: only use per-cpu
> allocator for irq-safe requests").
> 
> This unfortunately also excluded SoftIRQ context, which hurt performance
> for the use-case of refilling DMA RX rings in softirq context.

Out of curiosity: by how much did it "hurt"?

<ruffles through the archives>

Tariq found:

: I disabled the page-cache (recycle) mechanism to stress the page
: allocator, and see a drastic degradation in BW, from 47.5 G in v4.10 to
: 31.4 G in v4.11-rc1 (34% drop).

then with this patch he found

: It looks very good!  I get line-rate (94Gbits/sec) with 8 streams, in
: comparison to less than 55Gbits/sec before.

Can I take this to mean that the page allocator's per-cpu-pages feature
ended up doubling the performance of this driver?  Better than the
driver's private page recycling?  I'd like to believe that, but am
having trouble doing so ;)

> This patch re-allows softirq context, which should be safe because BH/softirq
> is disabled while accessing the lists.  PCP-list access from both hard-IRQ
> and NMI context must still not be allowed.  Peter Zijlstra says in_nmi() code
> never accesses the page allocator, thus it should be sufficient to only test
> for !in_irq().
> 
> One concern with this change is adding a BH (enable) scheduling point at
> both PCP alloc and free. If further concerns are highlighted by this patch,
> the result will be to revert 374ad05ab64d and try again at a later date
> to offset the irq enable/disable overhead.
Mel Gorman April 11, 2017, 8:23 a.m. UTC | #3
On Mon, Apr 10, 2017 at 10:53:02PM +0200, Jesper Dangaard Brouer wrote:
> 
> I would appreciate review of this patch.

I had reviewed it but didn't have much to say other than that the
in_interrupt() check is inconvenient rather than wrong.

> My micro-benchmarking shows we
> basically return to the same page alloc+free cost as before 374ad05ab64d
> ("mm, page_alloc: only use per-cpu allocator for irq-safe requests"),
> which sort of invalidates this attempt at optimizing the page allocator.
> But Mel's micro-benchmarks still show an improvement.
> 

Which could be down to differences in CPUs.

> Notice the slowdown comes from me checking irqs_disabled() ... if
> someone can spot a way to get rid of that check, then this patch will be
> a win.
> 

I didn't spot an easy way of doing it. One approach which would be lighter,
if somewhat surprising, is to put a lock into the per-cpu structures that
is *not* IRQ-safe and trylock it for the per-cpu allocator. If it's a !irq
allocation, it'll be uncontended. If it's an irq allocation and contended,
it means that CPU has re-entered the allocator and must use the irq-safe
buddy lists. That would mean that for the uncontended case, both irq-safe
and irq-unsafe allocations could use the lists and, in the contended case,
irq allocations would still succeed. However, it would need careful
development, testing and review, and is not appropriate to wedge in as a fix
late in the rc cycle.
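
Roughly, that idea looks like the sketch below. It is an illustration only,
not something ready for review; the struct and the buddy/list helper names
are invented.

/*
 * Trylock sketch: a non-IRQ-safe spinlock embedded in the per-cpu pageset.
 * Non-IRQ callers take it uncontended; if an IRQ re-enters the allocator on
 * the same CPU while it is held, the trylock fails and the caller falls
 * back to the IRQ-safe buddy lists.
 */
#include <linux/list.h>
#include <linux/spinlock.h>

struct pcp_sketch {
	spinlock_t lock;		/* taken without disabling IRQs */
	struct list_head lists[3];	/* unmovable, reclaimable, movable */
};

static struct page *pcp_alloc_sketch(struct pcp_sketch *pcp, gfp_t gfp_mask)
{
	struct page *page;

	if (!spin_trylock(&pcp->lock))
		/* Contended: this CPU re-entered the allocator from IRQ context. */
		return buddy_alloc_irqsafe(gfp_mask);	/* hypothetical fallback */

	page = pcp_list_take(pcp);			/* hypothetical list helper */
	spin_unlock(&pcp->lock);
	return page;
}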

> I'm traveling out of Montreal today, and will rerun my benchmarks when
> I get home.  Tariq will also do some more testing with a 100G NIC (he
> also participated in the Montreal conf, so he is likely in transit too).
> 

Rerun them please. If there is a problem or any doubt then I'll post the
revert and try this again outside of an rc cycle. That would be preferable
to releasing 4.11 with a known regression.
Jesper Dangaard Brouer April 14, 2017, 10:10 a.m. UTC | #4
On Mon, 10 Apr 2017 14:26:16 -0700
Andrew Morton <akpm@linux-foundation.org> wrote:

> On Mon, 10 Apr 2017 16:08:21 +0100 Mel Gorman <mgorman@techsingularity.net> wrote:
> 
> > IRQ context was excluded from using the Per-Cpu-Pages (PCP) lists caching
> > of order-0 pages in commit 374ad05ab64d ("mm, page_alloc: only use per-cpu
> > allocator for irq-safe requests").
> > 
> > This unfortunately also excluded SoftIRQ context, which hurt performance
> > for the use-case of refilling DMA RX rings in softirq context.
> 
> Out of curiosity: by how much did it "hurt"?
>
> <ruffles through the archives>
> 
> Tariq found:
> 
> : I disabled the page-cache (recycle) mechanism to stress the page
> : allocator, and see a drastic degradation in BW, from 47.5 G in v4.10 to
> : 31.4 G in v4.11-rc1 (34% drop).

I've tried to reproduce this in my home testlab, using a ConnectX-4 dual
100Gbit/s NIC. Hardware limits mean that I cannot reach 100Gbit/s once a
memory copy is performed.  (Word of warning: you need PCIe Gen3 x16
(which I do have) to handle 100Gbit/s, and the memory bandwidth of the
system also needs to be something like 2x 12500MBytes/s, which is where
my system failed.)

The mlx5 driver has a driver-local page recycler, which I can see fail
between 29% and 38% of the time with 8 parallel netperf TCP_STREAMs.  I
speculate adding more streams will make it fail more.  To factor out
the driver recycler, I simply disabled it (as I believe Tariq also did).

With disabled-mlx5-recycler, 8 parallel netperf TCP_STREAMs:

Baseline v4.10.0  : 60316 Mbit/s
Current 4.11.0-rc6: 47491 Mbit/s
This patch        : 60662 Mbit/s

While this patch does "fix" the performance regression, it does not
bring any noticeable improvement (as my micro-bench also indicated),
thus I feel our previous optimization is almost nullified. (p.s. It
does feel wrong to argue against my own patch ;-)).

The reason for the current 4.11.0-rc6 regression is lock contention on
the (per-NUMA-node) page allocator lock; perf report shows we spend 34.92%
in queued_spin_lock_slowpath (compared to the #2 entry, a copy cost of
13.81% in copy_user_enhanced_fast_string).
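
In other words, with the pcplists bypassed, every order-0 alloc/free from
softirq goes through something like the path sketched below (a simplified
illustration, not the real mm/page_alloc.c code; the buddy helper name is
invented), serialising all CPUs of the node on the shared zone lock:

/*
 * Why bypassing the PCP lists shows up as queued_spin_lock_slowpath:
 * every free from softirq context falls back to the buddy path, and all
 * CPUs of the NUMA node serialise on the single zone->lock.
 */
#include <linux/mmzone.h>
#include <linux/spinlock.h>

static void free_order0_bypassing_pcp(struct zone *zone, struct page *page)
{
	unsigned long flags;

	spin_lock_irqsave(&zone->lock, flags);	/* shared by every CPU on the node */
	buddy_merge_and_free(zone, page);	/* hypothetical stand-in for __free_one_page() */
	spin_unlock_irqrestore(&zone->lock, flags);
}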


> then with this patch he found
> 
> : It looks very good!  I get line-rate (94Gbits/sec) with 8 streams, in
> : comparison to less than 55Gbits/sec before.
> 
> Can I take this to mean that the page allocator's per-cpu-pages feature
> ended up doubling the performance of this driver?  Better than the
> driver's private page recycling?  I'd like to believe that, but am
> having trouble doing so ;)

I would not conclude that. I'm also very suspicious about such big
performance "jumps".  Tariq should also benchmark with v4.10 and a
disabled mlx5-recycler, as I believe the results should be the same as
after this patch.

That said, it is possible to see a regression this large when all the
CPUs are contending on the page allocator lock. AFAIK Tariq also
mentioned seeing 60% spent on the lock, which would confirm this theory.

 
> > This patch re-allows softirq context, which should be safe because BH/softirq
> > is disabled while accessing the lists.  PCP-list access from both hard-IRQ
> > and NMI context must still not be allowed.  Peter Zijlstra says in_nmi() code
> > never accesses the page allocator, thus it should be sufficient to only test
> > for !in_irq().
> > 
> > One concern with this change is adding a BH (enable) scheduling point at
> > both PCP alloc and free. If further concerns are highlighted by this patch,
> > the result will be to revert 374ad05ab64d and try again at a later date
> > to offset the irq enable/disable overhead.
Mel Gorman April 15, 2017, 3:03 p.m. UTC | #5
On Fri, Apr 14, 2017 at 12:10:27PM +0200, Jesper Dangaard Brouer wrote:
> On Mon, 10 Apr 2017 14:26:16 -0700
> Andrew Morton <akpm@linux-foundation.org> wrote:
> 
> > On Mon, 10 Apr 2017 16:08:21 +0100 Mel Gorman <mgorman@techsingularity.net> wrote:
> > 
> > > IRQ context was excluded from using the Per-Cpu-Pages (PCP) lists caching
> > > of order-0 pages in commit 374ad05ab64d ("mm, page_alloc: only use per-cpu
> > > allocator for irq-safe requests").
> > > 
> > > This unfortunately also excluded SoftIRQ context, which hurt performance
> > > for the use-case of refilling DMA RX rings in softirq context.
> > 
> > Out of curiosity: by how much did it "hurt"?
> >
> > <ruffles through the archives>
> > 
> > Tariq found:
> > 
> > : I disabled the page-cache (recycle) mechanism to stress the page
> > : allocator, and see a drastic degradation in BW, from 47.5 G in v4.10 to
> > : 31.4 G in v4.11-rc1 (34% drop).
> 
> I've tried to reproduce this in my home testlab, using a ConnectX-4 dual
> 100Gbit/s NIC. Hardware limits mean that I cannot reach 100Gbit/s once a
> memory copy is performed.  (Word of warning: you need PCIe Gen3 x16
> (which I do have) to handle 100Gbit/s, and the memory bandwidth of the
> system also needs to be something like 2x 12500MBytes/s, which is where
> my system failed.)
> 
> The mlx5 driver has a driver-local page recycler, which I can see fail
> between 29% and 38% of the time with 8 parallel netperf TCP_STREAMs.  I
> speculate adding more streams will make it fail more.  To factor out
> the driver recycler, I simply disabled it (as I believe Tariq also did).
> 
> With disabled-mlx5-recycler, 8 parallel netperf TCP_STREAMs:
> 
> Baseline v4.10.0  : 60316 Mbit/s
> Current 4.11.0-rc6: 47491 Mbit/s
> This patch        : 60662 Mbit/s
> 
> While this patch does "fix" the performance regression, it does not
> bring any noticeable improvement (as my micro-bench also indicated),
> thus I feel our previous optimization is almost nullified. (p.s. It
> does feel wrong to argue against my own patch ;-)).
> 
> The reason for the current 4.11.0-rc6 regression is lock contention on
> the (per-NUMA-node) page allocator lock; perf report shows we spend 34.92%
> in queued_spin_lock_slowpath (compared to the #2 entry, a copy cost of
> 13.81% in copy_user_enhanced_fast_string).
> 

The lock contention is likely due to the per-cpu allocator being bypassed.

> 
> > then with this patch he found
> > 
> > : It looks very good!  I get line-rate (94Gbits/sec) with 8 streams, in
> > : comparison to less than 55Gbits/sec before.
> > 
> > Can I take this to mean that the page allocator's per-cpu-pages feature
> > ended up doubling the performance of this driver?  Better than the
> > driver's private page recycling?  I'd like to believe that, but am
> > having trouble doing so ;)
> 
> I would not conclude that. I'm also very suspicious about such big
> performance "jumps".  Tariq should also benchmark with v4.10 and a
> disabled mlx5-recycler, as I believe the results should be the same as
> after this patch.
> 
> That said, it is possible to see a regression this large when all the
> CPUs are contending on the page allocator lock. AFAIK Tariq also
> mentioned seeing 60% spent on the lock, which would confirm this theory.
> 

On that basis, I've posted a revert of the original patch which should
either go into 4.11 or 4.11-stable. Andrew, the revert should also
remove the "re-enable softirq use of per-cpu page" patch from mmotm.

Thanks.

Patch

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 6cbde310abed..d7e986967910 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2351,9 +2351,9 @@  static void drain_local_pages_wq(struct work_struct *work)
 	 * cpu which is allright but we also have to make sure to not move to
 	 * a different one.
 	 */
-	preempt_disable();
+	local_bh_disable();
 	drain_local_pages(NULL);
-	preempt_enable();
+	local_bh_enable();
 }
 
 /*
@@ -2481,7 +2481,11 @@  void free_hot_cold_page(struct page *page, bool cold)
 	unsigned long pfn = page_to_pfn(page);
 	int migratetype;
 
-	if (in_interrupt()) {
+	/*
+	 * Exclude (hard) IRQ and NMI context from using the pcplists.
+	 * But allow softirq context, via disabling BH.
+	 */
+	if (in_irq() || irqs_disabled()) {
 		__free_pages_ok(page, 0);
 		return;
 	}
@@ -2491,7 +2495,7 @@  void free_hot_cold_page(struct page *page, bool cold)
 
 	migratetype = get_pfnblock_migratetype(page, pfn);
 	set_pcppage_migratetype(page, migratetype);
-	preempt_disable();
+	local_bh_disable();
 
 	/*
 	 * We only track unmovable, reclaimable and movable on pcp lists.
@@ -2522,7 +2526,7 @@  void free_hot_cold_page(struct page *page, bool cold)
 	}
 
 out:
-	preempt_enable();
+	local_bh_enable();
 }
 
 /*
@@ -2647,7 +2651,7 @@  static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype,
 {
 	struct page *page;
 
-	VM_BUG_ON(in_interrupt());
+	VM_BUG_ON(in_irq() || irqs_disabled());
 
 	do {
 		if (list_empty(list)) {
@@ -2680,7 +2684,7 @@  static struct page *rmqueue_pcplist(struct zone *preferred_zone,
 	bool cold = ((gfp_flags & __GFP_COLD) != 0);
 	struct page *page;
 
-	preempt_disable();
+	local_bh_disable();
 	pcp = &this_cpu_ptr(zone->pageset)->pcp;
 	list = &pcp->lists[migratetype];
 	page = __rmqueue_pcplist(zone,  migratetype, cold, pcp, list);
@@ -2688,7 +2692,7 @@  static struct page *rmqueue_pcplist(struct zone *preferred_zone,
 		__count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
 		zone_statistics(preferred_zone, zone);
 	}
-	preempt_enable();
+	local_bh_enable();
 	return page;
 }
 
@@ -2704,7 +2708,11 @@  struct page *rmqueue(struct zone *preferred_zone,
 	unsigned long flags;
 	struct page *page;
 
-	if (likely(order == 0) && !in_interrupt()) {
+	/*
+	 * Exclude (hard) IRQ and NMI context from using the pcplists.
+	 * But allow softirq context, via disabling BH.
+	 */
+	if (likely(order == 0) && !(in_irq() || irqs_disabled()) ) {
 		page = rmqueue_pcplist(preferred_zone, zone, order,
 				gfp_flags, migratetype);
 		goto out;