diff mbox series

[V3,1/2] mm: Add get_user_pages_cma_migrate

Message ID 20180918115839.22154-2-aneesh.kumar@linux.ibm.com (mailing list archive)
State Superseded
Headers show
Series mm/kvm/vfio/ppc64: Migrate compound pages out of CMA region | expand

Checks

Context Check Description
snowpatch_ozlabs/apply_patch success next/apply_patch Successfully applied
snowpatch_ozlabs/checkpatch fail Test checkpatch on branch next

Commit Message

Aneesh Kumar K V Sept. 18, 2018, 11:58 a.m. UTC
This helper does a get_user_pages_fast and if it find pages in the CMA area
it will try to migrate them before taking page reference. This makes sure that
we don't keep non-movable pages (due to page reference count) in the CMA area.
Not able to move pages out of CMA area result in CMA allocation failures.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
 include/linux/hugetlb.h |   2 +
 include/linux/migrate.h |   3 +
 mm/hugetlb.c            |   4 +-
 mm/migrate.c            | 132 ++++++++++++++++++++++++++++++++++++++++
 4 files changed, 139 insertions(+), 2 deletions(-)

Comments

Alexey Kardashevskiy Oct. 16, 2018, 4:57 a.m. UTC | #1
On 18/09/2018 21:58, Aneesh Kumar K.V wrote:
> This helper does a get_user_pages_fast and if it find pages in the CMA area
> it will try to migrate them before taking page reference. This makes sure that
> we don't keep non-movable pages (due to page reference count) in the CMA area.
> Not able to move pages out of CMA area result in CMA allocation failures.
> 
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
> ---
>  include/linux/hugetlb.h |   2 +
>  include/linux/migrate.h |   3 +
>  mm/hugetlb.c            |   4 +-
>  mm/migrate.c            | 132 ++++++++++++++++++++++++++++++++++++++++
>  4 files changed, 139 insertions(+), 2 deletions(-)
> 
> diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> index 6b68e345f0ca..1abccb1a1ecc 100644
> --- a/include/linux/hugetlb.h
> +++ b/include/linux/hugetlb.h
> @@ -357,6 +357,8 @@ struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid,
>  				nodemask_t *nmask);
>  struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma,
>  				unsigned long address);
> +struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask,
> +				     int nid, nodemask_t *nmask);
>  int huge_add_to_page_cache(struct page *page, struct address_space *mapping,
>  			pgoff_t idx);
>  
> diff --git a/include/linux/migrate.h b/include/linux/migrate.h
> index f2b4abbca55e..d82b35afd2eb 100644
> --- a/include/linux/migrate.h
> +++ b/include/linux/migrate.h
> @@ -286,6 +286,9 @@ static inline int migrate_vma(const struct migrate_vma_ops *ops,
>  }
>  #endif /* IS_ENABLED(CONFIG_MIGRATE_VMA_HELPER) */
>  
> +extern int get_user_pages_cma_migrate(unsigned long start, int nr_pages, int write,
> +				      struct page **pages);
> +
>  #endif /* CONFIG_MIGRATION */
>  
>  #endif /* _LINUX_MIGRATE_H */
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index 3c21775f196b..1abbfcb84f66 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -1585,8 +1585,8 @@ static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask,
>  	return page;
>  }
>  
> -static struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask,
> -		int nid, nodemask_t *nmask)
> +struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask,
> +				     int nid, nodemask_t *nmask)
>  {
>  	struct page *page;
>  
> diff --git a/mm/migrate.c b/mm/migrate.c
> index d6a2e89b086a..2f92534ea7a1 100644
> --- a/mm/migrate.c
> +++ b/mm/migrate.c
> @@ -3006,3 +3006,135 @@ int migrate_vma(const struct migrate_vma_ops *ops,
>  }
>  EXPORT_SYMBOL(migrate_vma);
>  #endif /* defined(MIGRATE_VMA_HELPER) */
> +
> +static struct page *new_non_cma_page(struct page *page, unsigned long private)
> +{
> +	/*
> +	 * We want to make sure we allocate the new page from the same node
> +	 * as the source page.
> +	 */
> +	int nid = page_to_nid(page);
> +	gfp_t gfp_mask = GFP_USER | __GFP_THISNODE;
> +
> +	if (PageHighMem(page))
> +		gfp_mask |= __GFP_HIGHMEM;
> +
> +	if (PageTransHuge(page)) {
> +		struct page *thp;
> +		gfp_t thp_gfpmask = GFP_TRANSHUGE | __GFP_THISNODE;
> +
> +		/*
> +		 * Remove the movable mask so that we don't allocate from
> +		 * CMA area again.
> +		 */
> +		thp_gfpmask &= ~__GFP_MOVABLE;
> +		thp = __alloc_pages_node(nid, thp_gfpmask, HPAGE_PMD_ORDER);


HPAGE_PMD_ORDER is 2MB or 1GB? THP are always that PMD order?


> +		if (!thp)
> +			return NULL;
> +		prep_transhuge_page(thp);
> +		return thp;
> +
> +#ifdef  CONFIG_HUGETLB_PAGE
> +	} else if (PageHuge(page)) {
> +
> +		struct hstate *h = page_hstate(page);
> +		/*
> +		 * We don't want to dequeue from the pool because pool pages will
> +		 * mostly be from the CMA region.
> +		 */
> +		return alloc_migrate_huge_page(h, gfp_mask, nid, NULL);
> +#endif
> +	}
> +
> +	return __alloc_pages_node(nid, gfp_mask, 0);
> +}
> +
> +/**
> + * get_user_pages_cma_migrate() - pin user pages in memory by migrating pages in CMA region
> + * @start:	starting user address
> + * @nr_pages:	number of pages from start to pin
> + * @write:	whether pages will be written to
> + * @pages:	array that receives pointers to the pages pinned.
> + *		Should be at least nr_pages long.
> + *
> + * Attempt to pin user pages in memory without taking mm->mmap_sem.
> + * If not successful, it will fall back to taking the lock and
> + * calling get_user_pages().


I do not see any locking or get_user_pages(), hidden somewhere?

> + *
> + * If the pinned pages are backed by CMA region, we migrate those pages out,
> + * allocating new pages from non-CMA region. This helps in avoiding keeping
> + * pages pinned in the CMA region for a long time thereby resulting in
> + * CMA allocation failures.
> + *
> + * Returns number of pages pinned. This may be fewer than the number
> + * requested. If nr_pages is 0 or negative, returns 0. If no pages
> + * were pinned, returns -errno.
> + */
> +
> +int get_user_pages_cma_migrate(unsigned long start, int nr_pages, int write,
> +			       struct page **pages)
> +{
> +	int i, ret;
> +	bool drain_allow = true;
> +	bool migrate_allow = true;
> +	LIST_HEAD(cma_page_list);
> +
> +get_user_again:
> +	ret = get_user_pages_fast(start, nr_pages, write, pages);
> +	if (ret <= 0)
> +		return ret;
> +
> +	for (i = 0; i < ret; ++i) {
> +		/*
> +		 * If we get a page from the CMA zone, since we are going to
> +		 * be pinning these entries, we might as well move them out
> +		 * of the CMA zone if possible.
> +		 */
> +		if (is_migrate_cma_page(pages[i]) && migrate_allow) {
> +			if (PageHuge(pages[i]))
> +				isolate_huge_page(pages[i], &cma_page_list);
> +			else {
> +				struct page *head = compound_head(pages[i]);
> +
> +				if (!PageLRU(head) && drain_allow) {
> +					lru_add_drain_all();
> +					drain_allow = false;
> +				}
> +
> +				if (!isolate_lru_page(head)) {
> +					list_add_tail(&head->lru, &cma_page_list);
> +					mod_node_page_state(page_pgdat(head),
> +							    NR_ISOLATED_ANON +
> +							    page_is_file_cache(head),
> +							    hpage_nr_pages(head));


Above 10 lines I cannot really comment due to my massive ignorance in
this area, especially about what lru_add_drain_all() and
mod_node_page_state() :(


> +				}
> +			}
> +		}
> +	}
> +	if (!list_empty(&cma_page_list)) {
> +		/*
> +		 * drop the above get_user_pages reference.
> +		 */
> +		for (i = 0; i < ret; ++i)
> +			put_page(pages[i]);
> +
> +		if (migrate_pages(&cma_page_list, new_non_cma_page,
> +				  NULL, 0, MIGRATE_SYNC, MR_CONTIG_RANGE)) {
> +			/*
> +			 * some of the pages failed migration. Do get_user_pages
> +			 * without migration.
> +			 */
> +			migrate_allow = false;


migrate_allow seems useless, simply calling get_user_pages_fast() should
make the code easier to read imho. And the comment says
get_user_pages(), where does this guy hide?

> +
> +			if (!list_empty(&cma_page_list))
> +				putback_movable_pages(&cma_page_list);
> +		}
> +		/*
> +		 * We did migrate all the pages, Try to get the page references again
> +		 * migrating any new CMA pages which we failed to isolate earlier.
> +		 */
> +		drain_allow = true;

Move this "drain_allow = true" right after "get_user_again:"?


> +		goto get_user_again;
> +	}
> +	return ret;
> +}
>
Aneesh Kumar K V Oct. 16, 2018, 7:16 a.m. UTC | #2
Alexey Kardashevskiy <aik@ozlabs.ru> writes:

> On 18/09/2018 21:58, Aneesh Kumar K.V wrote:
>> This helper does a get_user_pages_fast and if it find pages in the CMA area
>> it will try to migrate them before taking page reference. This makes sure that
>> we don't keep non-movable pages (due to page reference count) in the CMA area.
>> Not able to move pages out of CMA area result in CMA allocation failures.
>> 
>> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
>> ---
>>  include/linux/hugetlb.h |   2 +
>>  include/linux/migrate.h |   3 +
>>  mm/hugetlb.c            |   4 +-
>>  mm/migrate.c            | 132 ++++++++++++++++++++++++++++++++++++++++
>>  4 files changed, 139 insertions(+), 2 deletions(-)
>> 
>> diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
>> index 6b68e345f0ca..1abccb1a1ecc 100644
>> --- a/include/linux/hugetlb.h
>> +++ b/include/linux/hugetlb.h
>> @@ -357,6 +357,8 @@ struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid,
>>  				nodemask_t *nmask);
>>  struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma,
>>  				unsigned long address);
>> +struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask,
>> +				     int nid, nodemask_t *nmask);
>>  int huge_add_to_page_cache(struct page *page, struct address_space *mapping,
>>  			pgoff_t idx);
>>  
>> diff --git a/include/linux/migrate.h b/include/linux/migrate.h
>> index f2b4abbca55e..d82b35afd2eb 100644
>> --- a/include/linux/migrate.h
>> +++ b/include/linux/migrate.h
>> @@ -286,6 +286,9 @@ static inline int migrate_vma(const struct migrate_vma_ops *ops,
>>  }
>>  #endif /* IS_ENABLED(CONFIG_MIGRATE_VMA_HELPER) */
>>  
>> +extern int get_user_pages_cma_migrate(unsigned long start, int nr_pages, int write,
>> +				      struct page **pages);
>> +
>>  #endif /* CONFIG_MIGRATION */
>>  
>>  #endif /* _LINUX_MIGRATE_H */
>> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
>> index 3c21775f196b..1abbfcb84f66 100644
>> --- a/mm/hugetlb.c
>> +++ b/mm/hugetlb.c
>> @@ -1585,8 +1585,8 @@ static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask,
>>  	return page;
>>  }
>>  
>> -static struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask,
>> -		int nid, nodemask_t *nmask)
>> +struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask,
>> +				     int nid, nodemask_t *nmask)
>>  {
>>  	struct page *page;
>>  
>> diff --git a/mm/migrate.c b/mm/migrate.c
>> index d6a2e89b086a..2f92534ea7a1 100644
>> --- a/mm/migrate.c
>> +++ b/mm/migrate.c
>> @@ -3006,3 +3006,135 @@ int migrate_vma(const struct migrate_vma_ops *ops,
>>  }
>>  EXPORT_SYMBOL(migrate_vma);
>>  #endif /* defined(MIGRATE_VMA_HELPER) */
>> +
>> +static struct page *new_non_cma_page(struct page *page, unsigned long private)
>> +{
>> +	/*
>> +	 * We want to make sure we allocate the new page from the same node
>> +	 * as the source page.
>> +	 */
>> +	int nid = page_to_nid(page);
>> +	gfp_t gfp_mask = GFP_USER | __GFP_THISNODE;
>> +
>> +	if (PageHighMem(page))
>> +		gfp_mask |= __GFP_HIGHMEM;
>> +
>> +	if (PageTransHuge(page)) {
>> +		struct page *thp;
>> +		gfp_t thp_gfpmask = GFP_TRANSHUGE | __GFP_THISNODE;
>> +
>> +		/*
>> +		 * Remove the movable mask so that we don't allocate from
>> +		 * CMA area again.
>> +		 */
>> +		thp_gfpmask &= ~__GFP_MOVABLE;
>> +		thp = __alloc_pages_node(nid, thp_gfpmask, HPAGE_PMD_ORDER);
>
>
> HPAGE_PMD_ORDER is 2MB or 1GB? THP are always that PMD order?

2M or 16M. THP is at PMD level.

>
>
>> +		if (!thp)
>> +			return NULL;
>> +		prep_transhuge_page(thp);
>> +		return thp;
>> +
>> +#ifdef  CONFIG_HUGETLB_PAGE
>> +	} else if (PageHuge(page)) {
>> +
>> +		struct hstate *h = page_hstate(page);
>> +		/*
>> +		 * We don't want to dequeue from the pool because pool pages will
>> +		 * mostly be from the CMA region.
>> +		 */
>> +		return alloc_migrate_huge_page(h, gfp_mask, nid, NULL);
>> +#endif
>> +	}
>> +
>> +	return __alloc_pages_node(nid, gfp_mask, 0);
>> +}
>> +
>> +/**
>> + * get_user_pages_cma_migrate() - pin user pages in memory by migrating pages in CMA region
>> + * @start:	starting user address
>> + * @nr_pages:	number of pages from start to pin
>> + * @write:	whether pages will be written to
>> + * @pages:	array that receives pointers to the pages pinned.
>> + *		Should be at least nr_pages long.
>> + *
>> + * Attempt to pin user pages in memory without taking mm->mmap_sem.
>> + * If not successful, it will fall back to taking the lock and
>> + * calling get_user_pages().
>
>
> I do not see any locking or get_user_pages(), hidden somewhere?
>

The rules are same as get_user_pages_fast, which does that pin attempt
without taking mm->mmap_sem. If it fail get_user_pages_fast will take
the mmap_sem and try to pin the pages. The details are in
get_user_pages_fast. You can look at get_user_pages_unlocked


>> + *
>> + * If the pinned pages are backed by CMA region, we migrate those pages out,
>> + * allocating new pages from non-CMA region. This helps in avoiding keeping
>> + * pages pinned in the CMA region for a long time thereby resulting in
>> + * CMA allocation failures.
>> + *
>> + * Returns number of pages pinned. This may be fewer than the number
>> + * requested. If nr_pages is 0 or negative, returns 0. If no pages
>> + * were pinned, returns -errno.
>> + */
>> +
>> +int get_user_pages_cma_migrate(unsigned long start, int nr_pages, int write,
>> +			       struct page **pages)
>> +{
>> +	int i, ret;
>> +	bool drain_allow = true;
>> +	bool migrate_allow = true;
>> +	LIST_HEAD(cma_page_list);
>> +
>> +get_user_again:
>> +	ret = get_user_pages_fast(start, nr_pages, write, pages);
>> +	if (ret <= 0)
>> +		return ret;
>> +
>> +	for (i = 0; i < ret; ++i) {
>> +		/*
>> +		 * If we get a page from the CMA zone, since we are going to
>> +		 * be pinning these entries, we might as well move them out
>> +		 * of the CMA zone if possible.
>> +		 */
>> +		if (is_migrate_cma_page(pages[i]) && migrate_allow) {
>> +			if (PageHuge(pages[i]))
>> +				isolate_huge_page(pages[i], &cma_page_list);
>> +			else {
>> +				struct page *head = compound_head(pages[i]);
>> +
>> +				if (!PageLRU(head) && drain_allow) {
>> +					lru_add_drain_all();
>> +					drain_allow = false;
>> +				}
>> +
>> +				if (!isolate_lru_page(head)) {
>> +					list_add_tail(&head->lru, &cma_page_list);
>> +					mod_node_page_state(page_pgdat(head),
>> +							    NR_ISOLATED_ANON +
>> +							    page_is_file_cache(head),
>> +							    hpage_nr_pages(head));
>
>
> Above 10 lines I cannot really comment due to my massive ignorance in
> this area, especially about what lru_add_drain_all() and
> mod_node_page_state() :(

That makes sure we move the pages from per cpu lru vec and add them to
the right lru list so that we can isolate the pages correctly.

>
>
>> +				}
>> +			}
>> +		}
>> +	}
>> +	if (!list_empty(&cma_page_list)) {
>> +		/*
>> +		 * drop the above get_user_pages reference.
>> +		 */
>> +		for (i = 0; i < ret; ++i)
>> +			put_page(pages[i]);
>> +
>> +		if (migrate_pages(&cma_page_list, new_non_cma_page,
>> +				  NULL, 0, MIGRATE_SYNC, MR_CONTIG_RANGE)) {
>> +			/*
>> +			 * some of the pages failed migration. Do get_user_pages
>> +			 * without migration.
>> +			 */
>> +			migrate_allow = false;
>
>
> migrate_allow seems useless, simply calling get_user_pages_fast() should
> make the code easier to read imho. And the comment says
> get_user_pages(), where does this guy hide?

I didn't get that suggestion. What we want to do here is try to migrate pages as
long as we find CMA pages in the result of get_user_pages_fast. If we
failed any migration attempt, don't try to migrate again.

>
>> +
>> +			if (!list_empty(&cma_page_list))
>> +				putback_movable_pages(&cma_page_list);
>> +		}
>> +		/*
>> +		 * We did migrate all the pages, Try to get the page references again
>> +		 * migrating any new CMA pages which we failed to isolate earlier.
>> +		 */
>> +		drain_allow = true;
>
> Move this "drain_allow = true" right after "get_user_again:"? 1
 
>
>
>> +		goto get_user_again;
>> +	}
>> +	return ret;
>> +}
>> 
>
> -- 
> Alexey

-aneesh
Alexey Kardashevskiy Oct. 16, 2018, 7:52 a.m. UTC | #3
On 16/10/2018 18:16, Aneesh Kumar K.V wrote:
> Alexey Kardashevskiy <aik@ozlabs.ru> writes:
> 
>> On 18/09/2018 21:58, Aneesh Kumar K.V wrote:
>>> This helper does a get_user_pages_fast and if it find pages in the CMA area
>>> it will try to migrate them before taking page reference. This makes sure that
>>> we don't keep non-movable pages (due to page reference count) in the CMA area.
>>> Not able to move pages out of CMA area result in CMA allocation failures.
>>>
>>> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
>>> ---
>>>  include/linux/hugetlb.h |   2 +
>>>  include/linux/migrate.h |   3 +
>>>  mm/hugetlb.c            |   4 +-
>>>  mm/migrate.c            | 132 ++++++++++++++++++++++++++++++++++++++++
>>>  4 files changed, 139 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
>>> index 6b68e345f0ca..1abccb1a1ecc 100644
>>> --- a/include/linux/hugetlb.h
>>> +++ b/include/linux/hugetlb.h
>>> @@ -357,6 +357,8 @@ struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid,
>>>  				nodemask_t *nmask);
>>>  struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma,
>>>  				unsigned long address);
>>> +struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask,
>>> +				     int nid, nodemask_t *nmask);
>>>  int huge_add_to_page_cache(struct page *page, struct address_space *mapping,
>>>  			pgoff_t idx);
>>>  
>>> diff --git a/include/linux/migrate.h b/include/linux/migrate.h
>>> index f2b4abbca55e..d82b35afd2eb 100644
>>> --- a/include/linux/migrate.h
>>> +++ b/include/linux/migrate.h
>>> @@ -286,6 +286,9 @@ static inline int migrate_vma(const struct migrate_vma_ops *ops,
>>>  }
>>>  #endif /* IS_ENABLED(CONFIG_MIGRATE_VMA_HELPER) */
>>>  
>>> +extern int get_user_pages_cma_migrate(unsigned long start, int nr_pages, int write,
>>> +				      struct page **pages);
>>> +
>>>  #endif /* CONFIG_MIGRATION */
>>>  
>>>  #endif /* _LINUX_MIGRATE_H */
>>> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
>>> index 3c21775f196b..1abbfcb84f66 100644
>>> --- a/mm/hugetlb.c
>>> +++ b/mm/hugetlb.c
>>> @@ -1585,8 +1585,8 @@ static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask,
>>>  	return page;
>>>  }
>>>  
>>> -static struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask,
>>> -		int nid, nodemask_t *nmask)
>>> +struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask,
>>> +				     int nid, nodemask_t *nmask)
>>>  {
>>>  	struct page *page;
>>>  
>>> diff --git a/mm/migrate.c b/mm/migrate.c
>>> index d6a2e89b086a..2f92534ea7a1 100644
>>> --- a/mm/migrate.c
>>> +++ b/mm/migrate.c
>>> @@ -3006,3 +3006,135 @@ int migrate_vma(const struct migrate_vma_ops *ops,
>>>  }
>>>  EXPORT_SYMBOL(migrate_vma);
>>>  #endif /* defined(MIGRATE_VMA_HELPER) */
>>> +
>>> +static struct page *new_non_cma_page(struct page *page, unsigned long private)
>>> +{
>>> +	/*
>>> +	 * We want to make sure we allocate the new page from the same node
>>> +	 * as the source page.
>>> +	 */
>>> +	int nid = page_to_nid(page);
>>> +	gfp_t gfp_mask = GFP_USER | __GFP_THISNODE;
>>> +
>>> +	if (PageHighMem(page))
>>> +		gfp_mask |= __GFP_HIGHMEM;
>>> +
>>> +	if (PageTransHuge(page)) {
>>> +		struct page *thp;
>>> +		gfp_t thp_gfpmask = GFP_TRANSHUGE | __GFP_THISNODE;
>>> +
>>> +		/*
>>> +		 * Remove the movable mask so that we don't allocate from
>>> +		 * CMA area again.
>>> +		 */
>>> +		thp_gfpmask &= ~__GFP_MOVABLE;
>>> +		thp = __alloc_pages_node(nid, thp_gfpmask, HPAGE_PMD_ORDER);
>>
>>
>> HPAGE_PMD_ORDER is 2MB or 1GB? THP are always that PMD order?
> 
> 2M or 16M. THP is at PMD level.
> 
>>
>>
>>> +		if (!thp)
>>> +			return NULL;
>>> +		prep_transhuge_page(thp);
>>> +		return thp;
>>> +
>>> +#ifdef  CONFIG_HUGETLB_PAGE
>>> +	} else if (PageHuge(page)) {
>>> +
>>> +		struct hstate *h = page_hstate(page);
>>> +		/*
>>> +		 * We don't want to dequeue from the pool because pool pages will
>>> +		 * mostly be from the CMA region.
>>> +		 */
>>> +		return alloc_migrate_huge_page(h, gfp_mask, nid, NULL);
>>> +#endif
>>> +	}
>>> +
>>> +	return __alloc_pages_node(nid, gfp_mask, 0);
>>> +}
>>> +
>>> +/**
>>> + * get_user_pages_cma_migrate() - pin user pages in memory by migrating pages in CMA region
>>> + * @start:	starting user address
>>> + * @nr_pages:	number of pages from start to pin
>>> + * @write:	whether pages will be written to
>>> + * @pages:	array that receives pointers to the pages pinned.
>>> + *		Should be at least nr_pages long.
>>> + *
>>> + * Attempt to pin user pages in memory without taking mm->mmap_sem.
>>> + * If not successful, it will fall back to taking the lock and
>>> + * calling get_user_pages().
>>
>>
>> I do not see any locking or get_user_pages(), hidden somewhere?
>>
> 
> The rules are same as get_user_pages_fast, which does that pin attempt
> without taking mm->mmap_sem. If it fail get_user_pages_fast will take
> the mmap_sem and try to pin the pages. The details are in
> get_user_pages_fast. You can look at get_user_pages_unlocked

Ah, right.


>>> + *
>>> + * If the pinned pages are backed by CMA region, we migrate those pages out,
>>> + * allocating new pages from non-CMA region. This helps in avoiding keeping
>>> + * pages pinned in the CMA region for a long time thereby resulting in
>>> + * CMA allocation failures.
>>> + *
>>> + * Returns number of pages pinned. This may be fewer than the number
>>> + * requested. If nr_pages is 0 or negative, returns 0. If no pages
>>> + * were pinned, returns -errno.
>>> + */
>>> +
>>> +int get_user_pages_cma_migrate(unsigned long start, int nr_pages, int write,
>>> +			       struct page **pages)
>>> +{
>>> +	int i, ret;
>>> +	bool drain_allow = true;
>>> +	bool migrate_allow = true;
>>> +	LIST_HEAD(cma_page_list);
>>> +
>>> +get_user_again:
>>> +	ret = get_user_pages_fast(start, nr_pages, write, pages);
>>> +	if (ret <= 0)
>>> +		return ret;
>>> +
>>> +	for (i = 0; i < ret; ++i) {
>>> +		/*
>>> +		 * If we get a page from the CMA zone, since we are going to
>>> +		 * be pinning these entries, we might as well move them out
>>> +		 * of the CMA zone if possible.
>>> +		 */
>>> +		if (is_migrate_cma_page(pages[i]) && migrate_allow) {
>>> +			if (PageHuge(pages[i]))
>>> +				isolate_huge_page(pages[i], &cma_page_list);
>>> +			else {
>>> +				struct page *head = compound_head(pages[i]);
>>> +
>>> +				if (!PageLRU(head) && drain_allow) {
>>> +					lru_add_drain_all();
>>> +					drain_allow = false;
>>> +				}
>>> +
>>> +				if (!isolate_lru_page(head)) {
>>> +					list_add_tail(&head->lru, &cma_page_list);
>>> +					mod_node_page_state(page_pgdat(head),
>>> +							    NR_ISOLATED_ANON +
>>> +							    page_is_file_cache(head),
>>> +							    hpage_nr_pages(head));
>>
>>
>> Above 10 lines I cannot really comment due to my massive ignorance in
>> this area, especially about what lru_add_drain_all() and
>> mod_node_page_state() :(
> 
> That makes sure we move the pages from per cpu lru vec and add them to
> the right lru list so that we can isolate the pages correctly.

I understand the idea but cannot confirm the correctness :-/


> 
>>
>>
>>> +				}
>>> +			}
>>> +		}
>>> +	}
>>> +	if (!list_empty(&cma_page_list)) {
>>> +		/*
>>> +		 * drop the above get_user_pages reference.
>>> +		 */


btw, can these pages be used by somebody else in this short window
before we migrated and pinned them?


>>> +		for (i = 0; i < ret; ++i)
>>> +			put_page(pages[i]);
>>> +
>>> +		if (migrate_pages(&cma_page_list, new_non_cma_page,
>>> +				  NULL, 0, MIGRATE_SYNC, MR_CONTIG_RANGE)) {
>>> +			/*
>>> +			 * some of the pages failed migration. Do get_user_pages
>>> +			 * without migration.
>>> +			 */
>>> +			migrate_allow = false;
>>
>>
>> migrate_allow seems useless, simply calling get_user_pages_fast() should
>> make the code easier to read imho. And the comment says
>> get_user_pages(), where does this guy hide?
> 
> I didn't get that suggestion. What we want to do here is try to migrate pages as
> long as we find CMA pages in the result of get_user_pages_fast. If we
> failed any migration attempt, don't try to migrate again.


Setting migrate_allow to false here means you jump up, call
get_user_pages_fast() and then run the loop which will do nothing just
because if(...migrate_allow) is false. Instead of jumping up you could
just call get_user_pages_fast().

btw what is migrate_pages() leaves something in cma_page_list (I cannot
see it removing pages)? Won't it loop indefinitely?



> 
>>
>>> +
>>> +			if (!list_empty(&cma_page_list))
>>> +				putback_movable_pages(&cma_page_list);
>>> +		}
>>> +		/*
>>> +		 * We did migrate all the pages, Try to get the page references again
>>> +		 * migrating any new CMA pages which we failed to isolate earlier.
>>> +		 */
>>> +		drain_allow = true;
>>
>> Move this "drain_allow = true" right after "get_user_again:"? 1
>  
>>
>>
>>> +		goto get_user_again;
>>> +	}
>>> +	return ret;
>>> +}
>>>
>>
>> -- 
>> Alexey
> 
> -aneesh
>
Aneesh Kumar K V Oct. 16, 2018, 8:35 a.m. UTC | #4
Alexey Kardashevskiy <aik@ozlabs.ru> writes:

> On 16/10/2018 18:16, Aneesh Kumar K.V wrote:
>> Alexey Kardashevskiy <aik@ozlabs.ru> writes:
>> 
>>> +			}
>>>> +		}
>>>> +	}
>>>> +	if (!list_empty(&cma_page_list)) {
>>>> +		/*
>>>> +		 * drop the above get_user_pages reference.
>>>> +		 */
>
>
> btw, can these pages be used by somebody else in this short window
> before we migrated and pinned them?

isolate lru page make sure that we remove them from lru list. So lru
walkers won't find the page. If somebody happen to increment the page
reference count in that window, the migrate_pages will fail. That is
handled via migrate_page_move_mapping returning EAGAIN

>
>
>>>> +		for (i = 0; i < ret; ++i)
>>>> +			put_page(pages[i]);
>>>> +
>>>> +		if (migrate_pages(&cma_page_list, new_non_cma_page,
>>>> +				  NULL, 0, MIGRATE_SYNC, MR_CONTIG_RANGE)) {
>>>> +			/*
>>>> +			 * some of the pages failed migration. Do get_user_pages
>>>> +			 * without migration.
>>>> +			 */
>>>> +			migrate_allow = false;
>>>
>>>
>>> migrate_allow seems useless, simply calling get_user_pages_fast() should
>>> make the code easier to read imho. And the comment says
>>> get_user_pages(), where does this guy hide?
>> 
>> I didn't get that suggestion. What we want to do here is try to migrate pages as
>> long as we find CMA pages in the result of get_user_pages_fast. If we
>> failed any migration attempt, don't try to migrate again.
>
>
> Setting migrate_allow to false here means you jump up, call
> get_user_pages_fast() and then run the loop which will do nothing just
> because if(...migrate_allow) is false. Instead of jumping up you could
> just call get_user_pages_fast().

ok, that is coding preference I guess, I prefer to avoid multiple
get_user_pages_fast there. Since we droped the page reference, we need
to _go back_ and get the page reference without attempting to migrate. That
is the way I was looking at this.

>
> btw what is migrate_pages() leaves something in cma_page_list (I cannot
> see it removing pages)? Won't it loop indefinitely?
>

putback_movable_pages take care of that. The below hunk.

			if (!list_empty(&cma_page_list))
				putback_movable_pages(&cma_page_list);

-aneesh
diff mbox series

Patch

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 6b68e345f0ca..1abccb1a1ecc 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -357,6 +357,8 @@  struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid,
 				nodemask_t *nmask);
 struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma,
 				unsigned long address);
+struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask,
+				     int nid, nodemask_t *nmask);
 int huge_add_to_page_cache(struct page *page, struct address_space *mapping,
 			pgoff_t idx);
 
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index f2b4abbca55e..d82b35afd2eb 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -286,6 +286,9 @@  static inline int migrate_vma(const struct migrate_vma_ops *ops,
 }
 #endif /* IS_ENABLED(CONFIG_MIGRATE_VMA_HELPER) */
 
+extern int get_user_pages_cma_migrate(unsigned long start, int nr_pages, int write,
+				      struct page **pages);
+
 #endif /* CONFIG_MIGRATION */
 
 #endif /* _LINUX_MIGRATE_H */
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 3c21775f196b..1abbfcb84f66 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1585,8 +1585,8 @@  static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask,
 	return page;
 }
 
-static struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask,
-		int nid, nodemask_t *nmask)
+struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask,
+				     int nid, nodemask_t *nmask)
 {
 	struct page *page;
 
diff --git a/mm/migrate.c b/mm/migrate.c
index d6a2e89b086a..2f92534ea7a1 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -3006,3 +3006,135 @@  int migrate_vma(const struct migrate_vma_ops *ops,
 }
 EXPORT_SYMBOL(migrate_vma);
 #endif /* defined(MIGRATE_VMA_HELPER) */
+
+static struct page *new_non_cma_page(struct page *page, unsigned long private)
+{
+	/*
+	 * We want to make sure we allocate the new page from the same node
+	 * as the source page.
+	 */
+	int nid = page_to_nid(page);
+	gfp_t gfp_mask = GFP_USER | __GFP_THISNODE;
+
+	if (PageHighMem(page))
+		gfp_mask |= __GFP_HIGHMEM;
+
+	if (PageTransHuge(page)) {
+		struct page *thp;
+		gfp_t thp_gfpmask = GFP_TRANSHUGE | __GFP_THISNODE;
+
+		/*
+		 * Remove the movable mask so that we don't allocate from
+		 * CMA area again.
+		 */
+		thp_gfpmask &= ~__GFP_MOVABLE;
+		thp = __alloc_pages_node(nid, thp_gfpmask, HPAGE_PMD_ORDER);
+		if (!thp)
+			return NULL;
+		prep_transhuge_page(thp);
+		return thp;
+
+#ifdef  CONFIG_HUGETLB_PAGE
+	} else if (PageHuge(page)) {
+
+		struct hstate *h = page_hstate(page);
+		/*
+		 * We don't want to dequeue from the pool because pool pages will
+		 * mostly be from the CMA region.
+		 */
+		return alloc_migrate_huge_page(h, gfp_mask, nid, NULL);
+#endif
+	}
+
+	return __alloc_pages_node(nid, gfp_mask, 0);
+}
+
+/**
+ * get_user_pages_cma_migrate() - pin user pages in memory by migrating pages in CMA region
+ * @start:	starting user address
+ * @nr_pages:	number of pages from start to pin
+ * @write:	whether pages will be written to
+ * @pages:	array that receives pointers to the pages pinned.
+ *		Should be at least nr_pages long.
+ *
+ * Attempt to pin user pages in memory without taking mm->mmap_sem.
+ * If not successful, it will fall back to taking the lock and
+ * calling get_user_pages().
+ *
+ * If the pinned pages are backed by CMA region, we migrate those pages out,
+ * allocating new pages from non-CMA region. This helps in avoiding keeping
+ * pages pinned in the CMA region for a long time thereby resulting in
+ * CMA allocation failures.
+ *
+ * Returns number of pages pinned. This may be fewer than the number
+ * requested. If nr_pages is 0 or negative, returns 0. If no pages
+ * were pinned, returns -errno.
+ */
+
+int get_user_pages_cma_migrate(unsigned long start, int nr_pages, int write,
+			       struct page **pages)
+{
+	int i, ret;
+	bool drain_allow = true;
+	bool migrate_allow = true;
+	LIST_HEAD(cma_page_list);
+
+get_user_again:
+	ret = get_user_pages_fast(start, nr_pages, write, pages);
+	if (ret <= 0)
+		return ret;
+
+	for (i = 0; i < ret; ++i) {
+		/*
+		 * If we get a page from the CMA zone, since we are going to
+		 * be pinning these entries, we might as well move them out
+		 * of the CMA zone if possible.
+		 */
+		if (is_migrate_cma_page(pages[i]) && migrate_allow) {
+			if (PageHuge(pages[i]))
+				isolate_huge_page(pages[i], &cma_page_list);
+			else {
+				struct page *head = compound_head(pages[i]);
+
+				if (!PageLRU(head) && drain_allow) {
+					lru_add_drain_all();
+					drain_allow = false;
+				}
+
+				if (!isolate_lru_page(head)) {
+					list_add_tail(&head->lru, &cma_page_list);
+					mod_node_page_state(page_pgdat(head),
+							    NR_ISOLATED_ANON +
+							    page_is_file_cache(head),
+							    hpage_nr_pages(head));
+				}
+			}
+		}
+	}
+	if (!list_empty(&cma_page_list)) {
+		/*
+		 * drop the above get_user_pages reference.
+		 */
+		for (i = 0; i < ret; ++i)
+			put_page(pages[i]);
+
+		if (migrate_pages(&cma_page_list, new_non_cma_page,
+				  NULL, 0, MIGRATE_SYNC, MR_CONTIG_RANGE)) {
+			/*
+			 * some of the pages failed migration. Do get_user_pages
+			 * without migration.
+			 */
+			migrate_allow = false;
+
+			if (!list_empty(&cma_page_list))
+				putback_movable_pages(&cma_page_list);
+		}
+		/*
+		 * We did migrate all the pages, Try to get the page references again
+		 * migrating any new CMA pages which we failed to isolate earlier.
+		 */
+		drain_allow = true;
+		goto get_user_again;
+	}
+	return ret;
+}