
Page allocator order-0 optimizations merged

Message ID 20170327143947.4c237e54@redhat.com
State RFC, archived
Delegated to: David Miller

Commit Message

Jesper Dangaard Brouer March 27, 2017, 12:39 p.m. UTC
On Mon, 27 Mar 2017 10:55:14 +0200
Jesper Dangaard Brouer <brouer@redhat.com> wrote:

> A possible solution would be to use the local_bh_{disable,enable} calls
> instead of {preempt_disable,enable}.  But it is slower; using numbers
> from [1] (19 vs 11 cycles), the expected cycles saving is thus 38-19=19.
> 
> The problematic part of using local_bh_enable is that this adds a
> softirq/bottom-halves rescheduling point (as it checks for pending
> BHs).  Thus, this might affect real workloads.

I implemented this solution in the patch below... and tested it on mlx5 at
50G with driver page-recycling manually disabled.  It works for me.

To Mel, what do you prefer... a partial revert or something like this?


[PATCH] mm, page_alloc: re-enable softirq use of per-cpu page allocator

From: Jesper Dangaard Brouer <brouer@redhat.com>

IRQ context was excluded from using the Per-Cpu-Pages (PCP) lists for
caching of order-0 pages in commit 374ad05ab64d ("mm, page_alloc: only
use per-cpu allocator for irq-safe requests").

Unfortunately this also excluded SoftIRQ context.  This hurts
performance for the use-case of refilling DMA RX rings in softirq
context.

This patch re-allows softirq context, which should be safe because
BH/softirq is disabled while accessing the lists.  It also makes sure to avoid
PCP-lists access from both hard-IRQ and NMI context.
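
For reference, a paraphrased sketch of the two context checks involved
(mask values follow the asm comments and the in_task()/in_nmi()
definitions quoted elsewhere in this thread; this is not a copy of the
kernel headers):

#define NMI_MASK	0x00100000u
#define HARDIRQ_MASK	0x000f0000u
#define SOFTIRQ_OFFSET	0x00000100u	/* "serving softirq" bit */

/* What commit 374ad05ab64d tested: also true while serving a softirq,
 * which is what pushed softirq (e.g. NAPI RX-ring refill) off the
 * pcplists.
 */
static int sketch_in_interrupt(unsigned int pc)
{
	return pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET);
}

/* What this patch tests instead: false in softirq context, so softirq
 * may use the pcplists again, protected by local_bh_disable().
 */
static int sketch_in_irq_or_nmi(unsigned int pc)
{
	return pc & (NMI_MASK | HARDIRQ_MASK);
}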

One concern with this change is that it adds a BH (enable) scheduling
point at both PCP alloc and free.

Fixes: 374ad05ab64d ("mm, page_alloc: only use per-cpu allocator for irq-safe requests")
---
 include/trace/events/kmem.h |    2 ++
 mm/page_alloc.c             |   41 ++++++++++++++++++++++++++++++++++-------
 2 files changed, 36 insertions(+), 7 deletions(-)
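
The BH scheduling point mentioned above comes from local_bh_enable()
itself; a simplified sketch (paraphrasing kernel/softirq.c, not the
exact implementation) of why it can run work in the caller's context:

/* If this call drops the last BH-disable level and softirqs are
 * pending, they are executed right here.  With this patch, both
 * free_hot_cold_page() and rmqueue_pcplist() take this path.
 */
static inline void local_bh_enable_sketch(void)
{
	if (softirq_count() == SOFTIRQ_DISABLE_OFFSET &&
	    local_softirq_pending())
		do_softirq();				/* pending BH work runs now */
	preempt_count_sub(SOFTIRQ_DISABLE_OFFSET);	/* drop the BH-disable level */
}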

Comments

Mel Gorman March 27, 2017, 1:32 p.m. UTC | #1
On Mon, Mar 27, 2017 at 02:39:47PM +0200, Jesper Dangaard Brouer wrote:
> On Mon, 27 Mar 2017 10:55:14 +0200
> Jesper Dangaard Brouer <brouer@redhat.com> wrote:
> 
> > A possible solution would be to use the local_bh_{disable,enable} calls
> > instead of {preempt_disable,enable}.  But it is slower; using numbers
> > from [1] (19 vs 11 cycles), the expected cycles saving is thus 38-19=19.
> > 
> > The problematic part of using local_bh_enable is that this adds a
> > softirq/bottom-halves rescheduling point (as it checks for pending
> > BHs).  Thus, this might affect real workloads.
> 
> I implemented this solution in the patch below... and tested it on mlx5 at
> 50G with driver page-recycling manually disabled.  It works for me.
> 
> To Mel, what do you prefer... a partial revert or something like this?
> 

If Tariq confirms it works for him as well, this looks like a far safer
patch than having a dedicated IRQ-safe queue. Your concern about the BH
scheduling point is valid but if it's proven to be a problem, there is
still the option of a partial revert.
Matthew Wilcox March 27, 2017, 2:15 p.m. UTC | #2
On Mon, Mar 27, 2017 at 02:39:47PM +0200, Jesper Dangaard Brouer wrote:
>  
> +static __always_inline int in_irq_or_nmi(void)
> +{
> +	return in_irq() || in_nmi();
> +// XXX: hoping compiler will optimize this (todo verify) into:
> +// #define in_irq_or_nmi()	(preempt_count() & (HARDIRQ_MASK | NMI_MASK))
> +
> +	/* compiler was smart enough to only read __preempt_count once
> +	 * but added two branches
> +asm code:
> + │       mov    __preempt_count,%eax
> + │       test   $0xf0000,%eax    // HARDIRQ_MASK: 0x000f0000
> + │    ┌──jne    2a
> + │    │  test   $0x100000,%eax   // NMI_MASK:     0x00100000
> + │    │↓ je     3f
> + │ 2a:└─→mov    %rbx,%rdi
> +
> +	 */
> +}

To be fair, you told the compiler to do that with your use of fancy-pants ||
instead of optimisable |.  Try this instead:

static __always_inline int in_irq_or_nmi(void)
{
	return in_irq() | in_nmi();
}

0000000000001770 <test_fn>:
    1770:       65 8b 05 00 00 00 00    mov    %gs:0x0(%rip),%eax        # 1777 <test_fn+0x7>
                        1773: R_X86_64_PC32     __preempt_count-0x4
#define in_nmi()                (preempt_count() & NMI_MASK)
#define in_task()               (!(preempt_count() & \
                                   (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET)))
static __always_inline int in_irq_or_nmi(void)
{
        return in_irq() | in_nmi();
    1777:       25 00 00 1f 00          and    $0x1f0000,%eax
}
    177c:       c3                      retq   
    177d:       0f 1f 00                nopl   (%rax)
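
For completeness, the same effect can be reproduced outside the kernel
tree with a stand-in counter (mask values taken from the asm comments
above, not from the kernel headers): short-circuiting || forces separate
tests and branches, while bitwise | lets the compiler fold both tests
into the single AND seen in the disassembly.

#define HARDIRQ_MASK	0x000f0000u
#define NMI_MASK	0x00100000u

unsigned int fake_preempt_count;	/* stand-in for __preempt_count */

int check_logical_or(void)	/* tends to compile to two tests + branches */
{
	return (fake_preempt_count & HARDIRQ_MASK) ||
	       (fake_preempt_count & NMI_MASK);
}

int check_bitwise_or(void)	/* folds into: and $0x1f0000,%eax */
{
	return (fake_preempt_count & HARDIRQ_MASK) |
	       (fake_preempt_count & NMI_MASK);
}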
Jesper Dangaard Brouer March 27, 2017, 3:15 p.m. UTC | #3
On Mon, 27 Mar 2017 07:15:18 -0700
Matthew Wilcox <willy@infradead.org> wrote:

> On Mon, Mar 27, 2017 at 02:39:47PM +0200, Jesper Dangaard Brouer wrote:
> >  
> > +static __always_inline int in_irq_or_nmi(void)
> > +{
> > +	return in_irq() || in_nmi();
> > +// XXX: hoping compiler will optimize this (todo verify) into:
> > +// #define in_irq_or_nmi()	(preempt_count() & (HARDIRQ_MASK | NMI_MASK))
> > +
> > +	/* compiler was smart enough to only read __preempt_count once
> > +	 * but added two branches
> > +asm code:
> > + │       mov    __preempt_count,%eax
> > + │       test   $0xf0000,%eax    // HARDIRQ_MASK: 0x000f0000
> > + │    ┌──jne    2a
> > + │    │  test   $0x100000,%eax   // NMI_MASK:     0x00100000
> > + │    │↓ je     3f
> > + │ 2a:└─→mov    %rbx,%rdi
> > +
> > +	 */
> > +}  
> 
> To be fair, you told the compiler to do that with your use of fancy-pants ||
> instead of optimisable |.  Try this instead:

Thank you! -- good point! :-)

> static __always_inline int in_irq_or_nmi(void)
> {
> 	return in_irq() | in_nmi();
> }
> 
> 0000000000001770 <test_fn>:
>     1770:       65 8b 05 00 00 00 00    mov    %gs:0x0(%rip),%eax        # 1777 <test_fn+0x7>
>                         1773: R_X86_64_PC32     __preempt_count-0x4
> #define in_nmi()                (preempt_count() & NMI_MASK)
> #define in_task()               (!(preempt_count() & \
>                                    (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET)))
> static __always_inline int in_irq_or_nmi(void)
> {
>         return in_irq() | in_nmi();
>     1777:       25 00 00 1f 00          and    $0x1f0000,%eax
> }
>     177c:       c3                      retq   
>     177d:       0f 1f 00                nopl   (%rax)

And I also verified it worked:

  0.63 │       mov    __preempt_count,%eax
       │     free_hot_cold_page():
  1.25 │       test   $0x1f0000,%eax
       │     ↓ jne    1e4

And this simplification also made the compiler turn this into an
unlikely branch, which is a micro-optimization (that I will leave up to
the compiler).
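
If one wanted to make that hint explicit instead of leaving it to the
compiler, the call sites could be annotated like this (just a sketch;
the patch below keeps the plain check):

	if (unlikely(in_irq_or_nmi())) {	/* hypothetical explicit hint */
		__free_pages_ok(page, 0);
		return;
	}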
Tariq Toukan March 28, 2017, 7:32 a.m. UTC | #4
On 27/03/2017 4:32 PM, Mel Gorman wrote:
> On Mon, Mar 27, 2017 at 02:39:47PM +0200, Jesper Dangaard Brouer wrote:
>> On Mon, 27 Mar 2017 10:55:14 +0200
>> Jesper Dangaard Brouer <brouer@redhat.com> wrote:
>>
>>> A possible solution would be to use the local_bh_{disable,enable} calls
>>> instead of {preempt_disable,enable}.  But it is slower; using numbers
>>> from [1] (19 vs 11 cycles), the expected cycles saving is thus 38-19=19.
>>>
>>> The problematic part of using local_bh_enable is that this adds a
>>> softirq/bottom-halves rescheduling point (as it checks for pending
>>> BHs).  Thus, this might affect real workloads.
>>
>> I implemented this solution in the patch below... and tested it on mlx5 at
>> 50G with driver page-recycling manually disabled.  It works for me.
>>
>> To Mel, what do you prefer... a partial revert or something like this?
>>
>
> If Tariq confirms it works for him as well, this looks like a far safer patch

Great.
I will test Jesper's patch today in the afternoon.

> than having a dedicated IRQ-safe queue. Your concern about the BH
> scheduling point is valid but if it's proven to be a problem, there is
> still the option of a partial revert.
>
Pankaj Gupta March 28, 2017, 8:28 a.m. UTC | #5
> 
> On Mon, Mar 27, 2017 at 02:39:47PM +0200, Jesper Dangaard Brouer wrote:
> > On Mon, 27 Mar 2017 10:55:14 +0200
> > Jesper Dangaard Brouer <brouer@redhat.com> wrote:
> > 
> > > A possible solution would be to use the local_bh_{disable,enable} calls
> > > instead of {preempt_disable,enable}.  But it is slower; using numbers
> > > from [1] (19 vs 11 cycles), the expected cycles saving is thus 38-19=19.
> > > 
> > > The problematic part of using local_bh_enable is that this adds a
> > > softirq/bottom-halves rescheduling point (as it checks for pending
> > > BHs).  Thus, this might affect real workloads.
> > 
> > I implemented this solution in the patch below... and tested it on mlx5 at
> > 50G with driver page-recycling manually disabled.  It works for me.
> > 
> > To Mel, what do you prefer... a partial revert or something like this?
> > 
> 
> If Tariq confirms it works for him as well, this looks like a far safer patch
> than having a dedicated IRQ-safe queue. Your concern about the BH
> scheduling point is valid but if it's proven to be a problem, there is
> still the option of a partial revert.

I also feel the same.

Thanks,
Pankaj

> 
> --
> Mel Gorman
> SUSE Labs
>
Jesper Dangaard Brouer March 28, 2017, 8:29 a.m. UTC | #6
On Tue, 28 Mar 2017 10:32:19 +0300
Tariq Toukan <ttoukan.linux@gmail.com> wrote:

> On 27/03/2017 4:32 PM, Mel Gorman wrote:
> > On Mon, Mar 27, 2017 at 02:39:47PM +0200, Jesper Dangaard Brouer wrote:  
> >> On Mon, 27 Mar 2017 10:55:14 +0200
> >> Jesper Dangaard Brouer <brouer@redhat.com> wrote:
> >>  
> >>> A possible solution would be to use the local_bh_{disable,enable} calls
> >>> instead of {preempt_disable,enable}.  But it is slower; using numbers
> >>> from [1] (19 vs 11 cycles), the expected cycles saving is thus 38-19=19.
> >>>
> >>> The problematic part of using local_bh_enable is that this adds a
> >>> softirq/bottom-halves rescheduling point (as it checks for pending
> >>> BHs).  Thus, this might affect real workloads.
> >>
> >> I implemented this solution in the patch below... and tested it on mlx5 at
> >> 50G with driver page-recycling manually disabled.  It works for me.
> >>
> >> To Mel, what do you prefer... a partial revert or something like this?
> >>  
> >
> > If Tariq confirms it works for him as well, this looks like a far safer patch
> 
> Great.
> I will test Jesper's patch today in the afternoon.

Good to hear :-)

> > than having a dedicated IRQ-safe queue. Your concern about the BH
> > scheduling point is valid but if it's proven to be a problem, there is
> > still the option of a partial revert.

I wanted to evaluate my own BH scheduling point concern, but I could
not, because I ran into a softirq accounting regression (which I bisected,
see [1]).  AFAIK this should not affect Tariq's multi-TCP-stream test
(netperf TCP stream testing works fine on my testlab).

[1] http://lkml.kernel.org/r/20170328101403.34a82fbf@redhat.com
Tariq Toukan March 28, 2017, 4:05 p.m. UTC | #7
On 28/03/2017 10:32 AM, Tariq Toukan wrote:
>
>
> On 27/03/2017 4:32 PM, Mel Gorman wrote:
>> On Mon, Mar 27, 2017 at 02:39:47PM +0200, Jesper Dangaard Brouer wrote:
>>> On Mon, 27 Mar 2017 10:55:14 +0200
>>> Jesper Dangaard Brouer <brouer@redhat.com> wrote:
>>>
>>>> A possible solution would be to use the local_bh_{disable,enable} calls
>>>> instead of {preempt_disable,enable}.  But it is slower; using numbers
>>>> from [1] (19 vs 11 cycles), the expected cycles saving is thus 38-19=19.
>>>>
>>>> The problematic part of using local_bh_enable is that this adds a
>>>> softirq/bottom-halves rescheduling point (as it checks for pending
>>>> BHs).  Thus, this might affect real workloads.
>>>
>>> I implemented this solution in the patch below... and tested it on mlx5 at
>>> 50G with driver page-recycling manually disabled.  It works for me.
>>>
>>> To Mel, what do you prefer... a partial revert or something like this?
>>>
>>
>> If Tariq confirms it works for him as well, this looks like a far safer patch
>
> Great.
> I will test Jesper's patch today in the afternoon.
>

It looks very good!
I get line-rate (94Gbits/sec) with 8 streams, in comparison to less than 
55Gbits/sec before.

Many thanks guys.

>> than having a dedicated IRQ-safe queue. Your concern about the BH
>> scheduling point is valid but if it's proven to be a problem, there is
>> still the option of a partial revert.
>>
Jesper Dangaard Brouer March 28, 2017, 6:24 p.m. UTC | #8
On Tue, 28 Mar 2017 19:05:12 +0300
Tariq Toukan <ttoukan.linux@gmail.com> wrote:

> On 28/03/2017 10:32 AM, Tariq Toukan wrote:
> >
> >
> > On 27/03/2017 4:32 PM, Mel Gorman wrote:  
> >> On Mon, Mar 27, 2017 at 02:39:47PM +0200, Jesper Dangaard Brouer wrote:  
> >>> On Mon, 27 Mar 2017 10:55:14 +0200
> >>> Jesper Dangaard Brouer <brouer@redhat.com> wrote:
> >>>  
> >>>> A possible solution would be to use the local_bh_{disable,enable} calls
> >>>> instead of {preempt_disable,enable}.  But it is slower; using numbers
> >>>> from [1] (19 vs 11 cycles), the expected cycles saving is thus 38-19=19.
> >>>>
> >>>> The problematic part of using local_bh_enable is that this adds a
> >>>> softirq/bottom-halves rescheduling point (as it checks for pending
> >>>> BHs).  Thus, this might affect real workloads.
> >>>
> >>> I implemented this solution in the patch below... and tested it on mlx5 at
> >>> 50G with driver page-recycling manually disabled.  It works for me.
> >>>
> >>> To Mel, what do you prefer... a partial revert or something like this?
> >>>  
> >>
> >> If Tariq confirms it works for him as well, this looks like a far safer patch
> >
> > Great.
> > I will test Jesper's patch today in the afternoon.
> >  
> 
> It looks very good!
> I get line-rate (94Gbits/sec) with 8 streams, in comparison to less than 
> 55Gbits/sec before.

Just confirming, this is when you have disabled mlx5 driver
page-recycling, right?


> >> than having a dedicated IRQ-safe queue. Your concern about the BH
> >> scheduling point is valid but if it's proven to be a problem, there is
> >> still the option of a partial revert.
Tariq Toukan March 29, 2017, 7:13 a.m. UTC | #9
On 28/03/2017 9:24 PM, Jesper Dangaard Brouer wrote:
> On Tue, 28 Mar 2017 19:05:12 +0300
> Tariq Toukan <ttoukan.linux@gmail.com> wrote:
>
>> On 28/03/2017 10:32 AM, Tariq Toukan wrote:
>>>
>>>
>>> On 27/03/2017 4:32 PM, Mel Gorman wrote:
>>>> On Mon, Mar 27, 2017 at 02:39:47PM +0200, Jesper Dangaard Brouer wrote:
>>>>> On Mon, 27 Mar 2017 10:55:14 +0200
>>>>> Jesper Dangaard Brouer <brouer@redhat.com> wrote:
>>>>>
>>>>>> A possible solution would be to use the local_bh_{disable,enable} calls
>>>>>> instead of {preempt_disable,enable}.  But it is slower; using numbers
>>>>>> from [1] (19 vs 11 cycles), the expected cycles saving is thus 38-19=19.
>>>>>>
>>>>>> The problematic part of using local_bh_enable is that this adds a
>>>>>> softirq/bottom-halves rescheduling point (as it checks for pending
>>>>>> BHs).  Thus, this might affect real workloads.
>>>>>
>>>>> I implemented this solution in the patch below... and tested it on mlx5 at
>>>>> 50G with driver page-recycling manually disabled.  It works for me.
>>>>>
>>>>> To Mel, what do you prefer... a partial revert or something like this?
>>>>>
>>>>
>>>> If Tariq confirms it works for him as well, this looks like a far safer patch
>>>
>>> Great.
>>> I will test Jesper's patch today in the afternoon.
>>>
>>
>> It looks very good!
>> I get line-rate (94Gbits/sec) with 8 streams, in comparison to less than
>> 55Gbits/sec before.
>
> Just confirming, this is when you have disabled mlx5 driver
> page-recycling, right?
>
>
Right.
This is a great result!

>>>> than having a dedicated IRQ-safe queue. Your concern about the BH
>>>> scheduling point is valid but if it's proven to be a problem, there is
>>>> still the option of a partial revert.
>

Patch

diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h
index 6b2e154fd23a..ad412ad1b092 100644
--- a/include/trace/events/kmem.h
+++ b/include/trace/events/kmem.h
@@ -244,6 +244,8 @@  DECLARE_EVENT_CLASS(mm_page,
 		__entry->order,
 		__entry->migratetype,
 		__entry->order == 0)
+// WARNING: percpu_refill check not 100% correct after commit
+// 374ad05ab64d ("mm, page_alloc: only use per-cpu allocator for irq-safe requests")
 );
 
 DEFINE_EVENT(mm_page, mm_page_alloc_zone_locked,
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 6cbde310abed..db9ffc8ac538 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2470,6 +2470,25 @@  void mark_free_pages(struct zone *zone)
 }
 #endif /* CONFIG_PM */
 
+static __always_inline int in_irq_or_nmi(void)
+{
+	return in_irq() || in_nmi();
+// XXX: hoping compiler will optimize this (todo verify) into:
+// #define in_irq_or_nmi()	(preempt_count() & (HARDIRQ_MASK | NMI_MASK))
+
+	/* compiler was smart enough to only read __preempt_count once
+	 * but added two branches
+asm code:
+ │       mov    __preempt_count,%eax
+ │       test   $0xf0000,%eax    // HARDIRQ_MASK: 0x000f0000
+ │    ┌──jne    2a
+ │    │  test   $0x100000,%eax   // NMI_MASK:     0x00100000
+ │    │↓ je     3f
+ │ 2a:└─→mov    %rbx,%rdi
+
+	 */
+}
+
 /*
  * Free a 0-order page
  * cold == true ? free a cold page : free a hot page
@@ -2481,7 +2500,11 @@  void free_hot_cold_page(struct page *page, bool cold)
 	unsigned long pfn = page_to_pfn(page);
 	int migratetype;
 
-	if (in_interrupt()) {
+	/*
+	 * Exclude (hard) IRQ and NMI context from using the pcplists.
+	 * But allow softirq context, via disabling BH.
+	 */
+	if (in_irq_or_nmi()) {
 		__free_pages_ok(page, 0);
 		return;
 	}
@@ -2491,7 +2514,7 @@  void free_hot_cold_page(struct page *page, bool cold)
 
 	migratetype = get_pfnblock_migratetype(page, pfn);
 	set_pcppage_migratetype(page, migratetype);
-	preempt_disable();
+	local_bh_disable();
 
 	/*
 	 * We only track unmovable, reclaimable and movable on pcp lists.
@@ -2522,7 +2545,7 @@  void free_hot_cold_page(struct page *page, bool cold)
 	}
 
 out:
-	preempt_enable();
+	local_bh_enable();
 }
 
 /*
@@ -2647,7 +2670,7 @@  static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype,
 {
 	struct page *page;
 
-	VM_BUG_ON(in_interrupt());
+	VM_BUG_ON(in_irq());
 
 	do {
 		if (list_empty(list)) {
@@ -2680,7 +2703,7 @@  static struct page *rmqueue_pcplist(struct zone *preferred_zone,
 	bool cold = ((gfp_flags & __GFP_COLD) != 0);
 	struct page *page;
 
-	preempt_disable();
+	local_bh_disable();
 	pcp = &this_cpu_ptr(zone->pageset)->pcp;
 	list = &pcp->lists[migratetype];
 	page = __rmqueue_pcplist(zone,  migratetype, cold, pcp, list);
@@ -2688,7 +2711,7 @@  static struct page *rmqueue_pcplist(struct zone *preferred_zone,
 		__count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
 		zone_statistics(preferred_zone, zone);
 	}
-	preempt_enable();
+	local_bh_enable();
 	return page;
 }
 
@@ -2704,7 +2727,11 @@  struct page *rmqueue(struct zone *preferred_zone,
 	unsigned long flags;
 	struct page *page;
 
-	if (likely(order == 0) && !in_interrupt()) {
+	/*
+	 * Exclude (hard) IRQ and NMI context from using the pcplists.
+	 * But allow softirq context, via disabling BH.
+	 */
+	if (likely(order == 0) && !in_irq_or_nmi() ) {
 		page = rmqueue_pcplist(preferred_zone, zone, order,
 				gfp_flags, migratetype);
 		goto out;