diff mbox

[4/5] page allocator: Pre-emptively wake kswapd when high-order watermarks are hit

Message ID 1256221356-26049-5-git-send-email-mel@csn.ul.ie
State Not Applicable, archived
Delegated to: David Miller
Headers show

Commit Message

Mel Gorman Oct. 22, 2009, 2:22 p.m. UTC
When a high-order allocation fails, kswapd is kicked so that it reclaims
at a higher order to avoid direct reclaimers stalling and to help GFP_ATOMIC
allocations. Something has changed in recent kernels that affects the timing
where high-order GFP_ATOMIC allocations are now failing with more frequency,
particularly under pressure.

This patch pre-emptively checks if watermarks have been hit after a
high-order allocation completes successfully. If the watermarks have been
reached, kswapd is woken in the hope it fixes the watermarks before the
next GFP_ATOMIC allocation fails.

Warning, this patch is somewhat of a band-aid. If this makes a difference,
it still implies that something has changed that is either causing more
GFP_ATOMIC allocations to occur (such as the case with iwlagn wireless
driver) or make them more likely to fail.

Signed-off-by: Mel Gorman <mel@csn.ul.ie>
---
 mm/page_alloc.c |   33 ++++++++++++++++++++++-----------
 1 files changed, 22 insertions(+), 11 deletions(-)

Comments

David Rientjes Oct. 22, 2009, 7:41 p.m. UTC | #1
On Thu, 22 Oct 2009, Mel Gorman wrote:

> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 7f2aa3e..851df40 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -1596,6 +1596,17 @@ try_next_zone:
>  	return page;
>  }
>  
> +static inline
> +void wake_all_kswapd(unsigned int order, struct zonelist *zonelist,
> +						enum zone_type high_zoneidx)
> +{
> +	struct zoneref *z;
> +	struct zone *zone;
> +
> +	for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
> +		wakeup_kswapd(zone, order);
> +}
> +
>  static inline int
>  should_alloc_retry(gfp_t gfp_mask, unsigned int order,
>  				unsigned long pages_reclaimed)
> @@ -1730,18 +1741,18 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
>  			congestion_wait(BLK_RW_ASYNC, HZ/50);
>  	} while (!page && (gfp_mask & __GFP_NOFAIL));
>  
> -	return page;
> -}
> -
> -static inline
> -void wake_all_kswapd(unsigned int order, struct zonelist *zonelist,
> -						enum zone_type high_zoneidx)
> -{
> -	struct zoneref *z;
> -	struct zone *zone;
> +	/*
> +	 * If after a high-order allocation we are now below watermarks,
> +	 * pre-emptively kick kswapd rather than having the next allocation
> +	 * fail and have to wake up kswapd, potentially failing GFP_ATOMIC
> +	 * allocations or entering direct reclaim
> +	 */
> +	if (unlikely(order) && page && !zone_watermark_ok(preferred_zone, order,
> +				preferred_zone->watermark[ALLOC_WMARK_LOW],
> +				zone_idx(preferred_zone), ALLOC_WMARK_LOW))
> +		wake_all_kswapd(order, zonelist, high_zoneidx);
>  
> -	for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
> -		wakeup_kswapd(zone, order);
> +	return page;
>  }
>  
>  static inline int

Hmm, is this really supposed to be added to __alloc_pages_high_priority()?  
By the patch description I was expecting kswapd to be woken up 
preemptively whenever the preferred zone is below ALLOC_WMARK_LOW and 
we're known to have just allocated at a higher order, not just when 
current was oom killed (when we should already be freeing a _lot_ of 
memory soon) or is doing a higher order allocation during direct reclaim.

For the best coverage, the branch would have to be added to the fastpath.  
That seems fine for a debugging aid and to see if progress is being made 
on the GFP_ATOMIC allocation issues, but doesn't seem like it should make 
its way to mainline, the subsequent GFP_ATOMIC allocation could already be 
happening and in the page allocator's slowpath at this point that this 
wakeup becomes unnecessary.

If this is moved to the fastpath, why is this wake_all_kswapd() and not
wakeup_kswapd(preferred_zone, order)?  Do we need to kick kswapd in all 
zones even though they may be free just because preferred_zone is now 
below the watermark?

Wouldn't it be better to do this on page_zone(page) instead of 
preferred_zone anyway?
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Mel Gorman Oct. 23, 2009, 9:13 a.m. UTC | #2
On Thu, Oct 22, 2009 at 12:41:42PM -0700, David Rientjes wrote:
> On Thu, 22 Oct 2009, Mel Gorman wrote:
> 
> > diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> > index 7f2aa3e..851df40 100644
> > --- a/mm/page_alloc.c
> > +++ b/mm/page_alloc.c
> > @@ -1596,6 +1596,17 @@ try_next_zone:
> >  	return page;
> >  }
> >  
> > +static inline
> > +void wake_all_kswapd(unsigned int order, struct zonelist *zonelist,
> > +						enum zone_type high_zoneidx)
> > +{
> > +	struct zoneref *z;
> > +	struct zone *zone;
> > +
> > +	for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
> > +		wakeup_kswapd(zone, order);
> > +}
> > +
> >  static inline int
> >  should_alloc_retry(gfp_t gfp_mask, unsigned int order,
> >  				unsigned long pages_reclaimed)
> > @@ -1730,18 +1741,18 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
> >  			congestion_wait(BLK_RW_ASYNC, HZ/50);
> >  	} while (!page && (gfp_mask & __GFP_NOFAIL));
> >  
> > -	return page;
> > -}
> > -
> > -static inline
> > -void wake_all_kswapd(unsigned int order, struct zonelist *zonelist,
> > -						enum zone_type high_zoneidx)
> > -{
> > -	struct zoneref *z;
> > -	struct zone *zone;
> > +	/*
> > +	 * If after a high-order allocation we are now below watermarks,
> > +	 * pre-emptively kick kswapd rather than having the next allocation
> > +	 * fail and have to wake up kswapd, potentially failing GFP_ATOMIC
> > +	 * allocations or entering direct reclaim
> > +	 */
> > +	if (unlikely(order) && page && !zone_watermark_ok(preferred_zone, order,
> > +				preferred_zone->watermark[ALLOC_WMARK_LOW],
> > +				zone_idx(preferred_zone), ALLOC_WMARK_LOW))
> > +		wake_all_kswapd(order, zonelist, high_zoneidx);
> >  
> > -	for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
> > -		wakeup_kswapd(zone, order);
> > +	return page;
> >  }
> >  
> >  static inline int
> 
> Hmm, is this really supposed to be added to __alloc_pages_high_priority()?  
> By the patch description I was expecting kswapd to be woken up 
> preemptively whenever the preferred zone is below ALLOC_WMARK_LOW and 
> we're known to have just allocated at a higher order, not just when 
> current was oom killed (when we should already be freeing a _lot_ of 
> memory soon) or is doing a higher order allocation during direct reclaim.
> 

It was a somewhat arbitrary choice to have it trigger in the event high
priority allocations were happening frequently.

> For the best coverage, it would have to be add the branch to the fastpath.  

Agreed - specifically at the end of __alloc_pages_nodemask()

> That seems fine for a debugging aid and to see if progress is being made 
> on the GFP_ATOMIC allocation issues, but doesn't seem like it should make 
> its way to mainline, the subsequent GFP_ATOMIC allocation could already be 
> happening and in the page allocator's slowpath at this point that this 
> wakeup becomes unnecessary.
> 
> If this is moved to the fastpath, why is this wake_all_kswapd() and not
> wakeup_kswapd(preferred_zone, order)?  Do we need to kick kswapd in all 
> zones even though they may be free just because preferred_zone is now 
> below the watermark?
> 

It probably makes no difference as zones are checked for their watermarks
before any real work happens. However, even if this patch makes a difference,
I don't want to see it merged.  At best, it is an extremely heavy-handed
hack which is why I asked for it to be tested in isolation. It shouldn't
be necessary at all because this sort of pre-emptive waking of kswapd was never
necessary before.

> Wouldn't it be better to do this on page_zone(page) instead of 
> preferred_zone anyway?
> 

No. The preferred_zone is the zone we should be allocating from. If we
failed to allocate from it, it implies the watermarks are not being met
so we want to wake it.
David Rientjes Oct. 23, 2009, 9:36 a.m. UTC | #3
On Fri, 23 Oct 2009, Mel Gorman wrote:

> > Hmm, is this really supposed to be added to __alloc_pages_high_priority()?  
> > By the patch description I was expecting kswapd to be woken up 
> > preemptively whenever the preferred zone is below ALLOC_WMARK_LOW and 
> > we're known to have just allocated at a higher order, not just when 
> > current was oom killed (when we should already be freeing a _lot_ of 
> > memory soon) or is doing a higher order allocation during direct reclaim.
> > 
> 
> It was a somewhat arbitrary choice to have it trigger in the event high
> priority allocations were happening frequently.
> 

I don't quite understand, users of PF_MEMALLOC shouldn't be doing these 
higher order allocations and if ALLOC_NO_WATERMARKS is by way of the oom 
killer, we should be freeing a substantial amount of memory imminently 
when it exits that waking up kswapd would be irrelevant.

> > If this is moved to the fastpath, why is this wake_all_kswapd() and not
> > wakeup_kswapd(preferred_zone, order)?  Do we need to kick kswapd in all 
> > zones even though they may be free just because preferred_zone is now 
> > below the watermark?
> > 
> 
> It probably makes no difference as zones are checked for their watermarks
> before any real work happens. However, even if this patch makes a difference,
> I don't want to see it merged.  At best, it is an extremely heavy-handed
> hack which is why I asked for it to be tested in isolation. It shouldn't
> be necessary at all because sort of pre-emptive waking of kswapd was never
> necessary before.
> 

Ahh, that makes a ton more sense: this particular patch is a debugging 
effort while the first two are candidates for 2.6.32 and -stable.  Gotcha.

> > Wouldn't it be better to do this on page_zone(page) instead of 
> > preferred_zone anyway?
> > 
> 
> No. The preferred_zone is the zone we should be allocating from. If we
> failed to allocate from it, it implies the watermarks are not being met
> so we want to wake it.
> 

Oops, I'm even more confused now :)  I thought the existing 
wake_all_kswapd() in the slowpath was doing that and that this patch was 
waking them prematurely because it speculates that a subsequent high 
order allocation will fail unless memory is reclaimed.  I thought we'd  
want to reclaim from the zone we just did a high order allocation from so 
that the fastpath could find the memory next time with ALLOC_WMARK_LOW.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Mel Gorman Oct. 23, 2009, 11:25 a.m. UTC | #4
On Fri, Oct 23, 2009 at 02:36:53AM -0700, David Rientjes wrote:
> On Fri, 23 Oct 2009, Mel Gorman wrote:
> 
> > > Hmm, is this really supposed to be added to __alloc_pages_high_priority()?  
> > > By the patch description I was expecting kswapd to be woken up 
> > > preemptively whenever the preferred zone is below ALLOC_WMARK_LOW and 
> > > we're known to have just allocated at a higher order, not just when 
> > > current was oom killed (when we should already be freeing a _lot_ of 
> > > memory soon) or is doing a higher order allocation during direct reclaim.
> > > 
> > 
> > It was a somewhat arbitrary choice to have it trigger in the event high
> > priority allocations were happening frequently.
> > 
> 
> I don't quite understand, users of PF_MEMALLOC shouldn't be doing these 
> higher order allocations and if ALLOC_NO_WATERMARKS is by way of the oom 
> killer, we should be freeing a substantial amount of memory imminently 
> when it exits that waking up kswapd would be irrelevant.
> 

I agree. I think it's highly unlikely this patch will make any
difference but I wanted to eliminate it as a possibility. Patch 3 and 4
were previously one patch that were tested together.

> > > If this is moved to the fastpath, why is this wake_all_kswapd() and not
> > > wakeup_kswapd(preferred_zone, order)?  Do we need to kick kswapd in all 
> > > zones even though they may be free just because preferred_zone is now 
> > > below the watermark?
> > > 
> > 
> > It probably makes no difference as zones are checked for their watermarks
> > before any real work happens. However, even if this patch makes a difference,
> > I don't want to see it merged.  At best, it is an extremely heavy-handed
> > hack which is why I asked for it to be tested in isolation. It shouldn't
> > be necessary at all because sort of pre-emptive waking of kswapd was never
> > necessary before.
> > 
> 
> Ahh, that makes a ton more sense: this particular patch is a debugging 
> effort while the first two are candidates for 2.6.32 and -stable.  Gotcha.
> 

Yep.

> > > Wouldn't it be better to do this on page_zone(page) instead of 
> > > preferred_zone anyway?
> > > 
> > 
> > No. The preferred_zone is the zone we should be allocating from. If we
> > failed to allocate from it, it implies the watermarks are not being met
> > so we want to wake it.
> > 
> 
> Oops, I'm even more confused now :)  I thought the existing 
> wake_all_kswapd() in the slowpath was doing that and that this patch was 
> waking them prematurely because it speculates that a subsequent high 
> order allocation will fail unless memory is reclaimed. 


It should be doing that. This patch should be junk but because it was tested
previously, I needed to be sure it was actually junk.

> I thought we'd  
> want to reclaim from the zone we just did a high order allocation from so 
> that the fastpath could find the memory next time with ALLOC_WMARK_LOW.

The fastpath should be getting the pages it needs from the
preferred_zone. If it's not, we still want to get pages back in that
zone and the zone we actually ended up getting pages from.

It's probably best to ignore this patch except in the unlikely event Tobias
says it makes a difference to his testing. I'm hoping he's covered by patches
1+2 and maybe 3 and that patches 4 and 5 of this set get consigned to
the bit bucket.
Tobias Oetiker Oct. 23, 2009, 11:31 a.m. UTC | #5
Mel,

Today Mel Gorman wrote:

> On Fri, Oct 23, 2009 at 02:36:53AM -0700, David Rientjes wrote:
> > On Fri, 23 Oct 2009, Mel Gorman wrote:
> >
> > > > Hmm, is this really supposed to be added to __alloc_pages_high_priority()?
> > > > By the patch description I was expecting kswapd to be woken up
> > > > preemptively whenever the preferred zone is below ALLOC_WMARK_LOW and
> > > > we're known to have just allocated at a higher order, not just when
> > > > current was oom killed (when we should already be freeing a _lot_ of
> > > > memory soon) or is doing a higher order allocation during direct reclaim.
> > > >
> > >
> > > It was a somewhat arbitrary choice to have it trigger in the event high
> > > priority allocations were happening frequently.
> > >
> >
> > I don't quite understand, users of PF_MEMALLOC shouldn't be doing these
> > higher order allocations and if ALLOC_NO_WATERMARKS is by way of the oom
> > killer, we should be freeing a substantial amount of memory imminently
> > when it exits that waking up kswapd would be irrelevant.
> >
>
> I agree. I think it's highly unlikely this patch will make any
> difference but I wanted to eliminate it as a possibility. Patch 3 and 4
> were previously one patch that were tested together.

hi hi ... I have tested '3 only' this morning, and the allocation
problems started again ... so for me 3 alone does not work while
3+4 does.

cheers
tobi
Mel Gorman Oct. 23, 2009, 1:39 p.m. UTC | #6
On Fri, Oct 23, 2009 at 01:31:10PM +0200, Tobias Oetiker wrote:
> Mel,
> 
> Today Mel Gorman wrote:
> 
> > On Fri, Oct 23, 2009 at 02:36:53AM -0700, David Rientjes wrote:
> > > On Fri, 23 Oct 2009, Mel Gorman wrote:
> > >
> > > > > Hmm, is this really supposed to be added to __alloc_pages_high_priority()?
> > > > > By the patch description I was expecting kswapd to be woken up
> > > > > preemptively whenever the preferred zone is below ALLOC_WMARK_LOW and
> > > > > we're known to have just allocated at a higher order, not just when
> > > > > current was oom killed (when we should already be freeing a _lot_ of
> > > > > memory soon) or is doing a higher order allocation during direct reclaim.
> > > > >
> > > >
> > > > It was a somewhat arbitrary choice to have it trigger in the event high
> > > > priority allocations were happening frequently.
> > > >
> > >
> > > I don't quite understand, users of PF_MEMALLOC shouldn't be doing these
> > > higher order allocations and if ALLOC_NO_WATERMARKS is by way of the oom
> > > killer, we should be freeing a substantial amount of memory imminently
> > > when it exits that waking up kswapd would be irrelevant.
> > >
> >
> > I agree. I think it's highly unlikely this patch will make any
> > difference but I wanted to eliminate it as a possibility. Patch 3 and 4
> > were previously one patch that were tested together.
> 
> hi hi ... I have tested '3 only' this morning, and the allocation
> problems started again ... so for me 3 alone does not work while
> 3+4 does.
> 

Hi,

What was the outcome of 1+2?
KOSAKI Motohiro Oct. 27, 2009, 2:42 a.m. UTC | #7
> When a high-order allocation fails, kswapd is kicked so that it reclaims
> at a higher-order to avoid direct reclaimers stall and to help GFP_ATOMIC
> allocations. Something has changed in recent kernels that affect the timing
> where high-order GFP_ATOMIC allocations are now failing with more frequency,
> particularly under pressure.
> 
> This patch pre-emptively checks if watermarks have been hit after a
> high-order allocation completes successfully. If the watermarks have been
> reached, kswapd is woken in the hope it fixes the watermarks before the
> next GFP_ATOMIC allocation fails.
> 
> Warning, this patch is somewhat of a band-aid. If this makes a difference,
> it still implies that something has changed that is either causing more
> GFP_ATOMIC allocations to occur (such as the case with iwlagn wireless
> driver) or make them more likely to fail.

hmm, I'm confused. This description addresses generic high-order allocations,
but, 

> 
> Signed-off-by: Mel Gorman <mel@csn.ul.ie>
> ---
>  mm/page_alloc.c |   33 ++++++++++++++++++++++-----------
>  1 files changed, 22 insertions(+), 11 deletions(-)
> 
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 7f2aa3e..851df40 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -1596,6 +1596,17 @@ try_next_zone:
>  	return page;
>  }
>  
> +static inline
> +void wake_all_kswapd(unsigned int order, struct zonelist *zonelist,
> +						enum zone_type high_zoneidx)
> +{
> +	struct zoneref *z;
> +	struct zone *zone;
> +
> +	for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
> +		wakeup_kswapd(zone, order);
> +}
> +
>  static inline int
>  should_alloc_retry(gfp_t gfp_mask, unsigned int order,
>  				unsigned long pages_reclaimed)
> @@ -1730,18 +1741,18 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,

__alloc_pages_high_priority() is only called if ALLOC_NO_WATERMARKS is set.
ALLOC_NO_WATERMARKS means PF_MEMALLOC or TIF_MEMDIE, and GFP_ATOMIC doesn't make
nested alloc_pages() calls (= doesn't hit the PF_MEMALLOC case).
So I don't understand why this patch improves the iwlagn GFP_ATOMIC case.

hmm, maybe I missed something. I'll look at the code again tomorrow.


>  			congestion_wait(BLK_RW_ASYNC, HZ/50);
>  	} while (!page && (gfp_mask & __GFP_NOFAIL));
>  
> -	return page;
> -}
> -
> -static inline
> -void wake_all_kswapd(unsigned int order, struct zonelist *zonelist,
> -						enum zone_type high_zoneidx)
> -{
> -	struct zoneref *z;
> -	struct zone *zone;
> +	/*
> +	 * If after a high-order allocation we are now below watermarks,
> +	 * pre-emptively kick kswapd rather than having the next allocation
> +	 * fail and have to wake up kswapd, potentially failing GFP_ATOMIC
> +	 * allocations or entering direct reclaim
> +	 */
> +	if (unlikely(order) && page && !zone_watermark_ok(preferred_zone, order,
> +				preferred_zone->watermark[ALLOC_WMARK_LOW],
> +				zone_idx(preferred_zone), ALLOC_WMARK_LOW))
> +		wake_all_kswapd(order, zonelist, high_zoneidx);
>  
> -	for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
> -		wakeup_kswapd(zone, order);
> +	return page;
>  }
>  
>  static inline int
> -- 
> 1.6.3.3
> 
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majordomo@kvack.org.  For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>



--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Mel Gorman Oct. 27, 2009, 3:26 p.m. UTC | #8
On Tue, Oct 27, 2009 at 11:42:58AM +0900, KOSAKI Motohiro wrote:
> > When a high-order allocation fails, kswapd is kicked so that it reclaims
> > at a higher-order to avoid direct reclaimers stall and to help GFP_ATOMIC
> > allocations. Something has changed in recent kernels that affect the timing
> > where high-order GFP_ATOMIC allocations are now failing with more frequency,
> > particularly under pressure.
> > 
> > This patch pre-emptively checks if watermarks have been hit after a
> > high-order allocation completes successfully. If the watermarks have been
> > reached, kswapd is woken in the hope it fixes the watermarks before the
> > next GFP_ATOMIC allocation fails.
> > 
> > Warning, this patch is somewhat of a band-aid. If this makes a difference,
> > it still implies that something has changed that is either causing more
> > GFP_ATOMIC allocations to occur (such as the case with iwlagn wireless
> > driver) or make them more likely to fail.
> 
> hmm, I'm confused. this description addressed generic high order allocation.
> but, 
> 
> > 
> > Signed-off-by: Mel Gorman <mel@csn.ul.ie>
> > ---
> >  mm/page_alloc.c |   33 ++++++++++++++++++++++-----------
> >  1 files changed, 22 insertions(+), 11 deletions(-)
> > 
> > diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> > index 7f2aa3e..851df40 100644
> > --- a/mm/page_alloc.c
> > +++ b/mm/page_alloc.c
> > @@ -1596,6 +1596,17 @@ try_next_zone:
> >  	return page;
> >  }
> >  
> > +static inline
> > +void wake_all_kswapd(unsigned int order, struct zonelist *zonelist,
> > +						enum zone_type high_zoneidx)
> > +{
> > +	struct zoneref *z;
> > +	struct zone *zone;
> > +
> > +	for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
> > +		wakeup_kswapd(zone, order);
> > +}
> > +
> >  static inline int
> >  should_alloc_retry(gfp_t gfp_mask, unsigned int order,
> >  				unsigned long pages_reclaimed)
> > @@ -1730,18 +1741,18 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
> 
> __alloc_pages_high_priority() is only called if ALLOC_NO_WATERMARKS.
> ALLOC_NO_WATERMARKS mean PF_MEMALLOC or TIF_MEMDIE and GFP_ATOMIC don't make
> nested alloc_pages() (= don't make PF_MEMALLOC case). 
> Then, I haven't understand why this patch improve iwlagn GFP_ATOMIC case.
> 
> hmm, maybe I missed something. I see the code again tommorow.
> 

The description is misleading, but in the patch's current form, it makes
a difference to Tobias's testing. I still haven't figured out why.

> 
> >  			congestion_wait(BLK_RW_ASYNC, HZ/50);
> >  	} while (!page && (gfp_mask & __GFP_NOFAIL));
> >  
> > -	return page;
> > -}
> > -
> > -static inline
> > -void wake_all_kswapd(unsigned int order, struct zonelist *zonelist,
> > -						enum zone_type high_zoneidx)
> > -{
> > -	struct zoneref *z;
> > -	struct zone *zone;
> > +	/*
> > +	 * If after a high-order allocation we are now below watermarks,
> > +	 * pre-emptively kick kswapd rather than having the next allocation
> > +	 * fail and have to wake up kswapd, potentially failing GFP_ATOMIC
> > +	 * allocations or entering direct reclaim
> > +	 */
> > +	if (unlikely(order) && page && !zone_watermark_ok(preferred_zone, order,
> > +				preferred_zone->watermark[ALLOC_WMARK_LOW],
> > +				zone_idx(preferred_zone), ALLOC_WMARK_LOW))
> > +		wake_all_kswapd(order, zonelist, high_zoneidx);
> >  
> > -	for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
> > -		wakeup_kswapd(zone, order);
> > +	return page;
> >  }
> >  
> >  static inline int
> > -- 
> > 1.6.3.3
> > 
> > --
> > To unsubscribe, send a message with 'unsubscribe linux-mm' in
> > the body to majordomo@kvack.org.  For more info on Linux MM,
> > see: http://www.linux-mm.org/ .
> > Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
> 
> 
>
diff mbox

Patch

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 7f2aa3e..851df40 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1596,6 +1596,17 @@  try_next_zone:
 	return page;
 }
 
+static inline
+void wake_all_kswapd(unsigned int order, struct zonelist *zonelist,
+						enum zone_type high_zoneidx)
+{
+	struct zoneref *z;
+	struct zone *zone;
+
+	for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
+		wakeup_kswapd(zone, order);
+}
+
 static inline int
 should_alloc_retry(gfp_t gfp_mask, unsigned int order,
 				unsigned long pages_reclaimed)
@@ -1730,18 +1741,18 @@  __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
 			congestion_wait(BLK_RW_ASYNC, HZ/50);
 	} while (!page && (gfp_mask & __GFP_NOFAIL));
 
-	return page;
-}
-
-static inline
-void wake_all_kswapd(unsigned int order, struct zonelist *zonelist,
-						enum zone_type high_zoneidx)
-{
-	struct zoneref *z;
-	struct zone *zone;
+	/*
+	 * If after a high-order allocation we are now below watermarks,
+	 * pre-emptively kick kswapd rather than having the next allocation
+	 * fail and have to wake up kswapd, potentially failing GFP_ATOMIC
+	 * allocations or entering direct reclaim
+	 */
+	if (unlikely(order) && page && !zone_watermark_ok(preferred_zone, order,
+				preferred_zone->watermark[ALLOC_WMARK_LOW],
+				zone_idx(preferred_zone), ALLOC_WMARK_LOW))
+		wake_all_kswapd(order, zonelist, high_zoneidx);
 
-	for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
-		wakeup_kswapd(zone, order);
+	return page;
 }
 
 static inline int