Patchwork [v5,08/14] memory-hotplug: Common APIs to support page tables hot-remove

login
register
mail settings
Submitter Tang Chen
Date Dec. 24, 2012, 12:09 p.m.
Message ID <1356350964-13437-9-git-send-email-tangchen@cn.fujitsu.com>
Download mbox | patch
Permalink /patch/208063/
State Not Applicable
Delegated to: David Miller
Headers show

Comments

Tang Chen - Dec. 24, 2012, 12:09 p.m.
From: Wen Congyang <wency@cn.fujitsu.com>

When memory is removed, the corresponding pagetables should also be removed.
This patch introduces some common APIs to support vmemmap pagetable and x86_64
architecture pagetable removing.

All pages of virtual mapping in removed memory cannot be freed if some pages
used as PGD/PUD include not only removed memory but also other memory. So the
patch uses the following way to check whether a page can be freed or not.

 1. When removing memory, the page structs of the removed memory are filled
    with 0xFD.
 2. When all page structs on a PT/PMD are filled with 0xFD, the PT/PMD can be
    cleared. In this case, the page used as PT/PMD can be freed.

Signed-off-by: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Signed-off-by: Jianguo Wu <wujianguo@huawei.com>
Signed-off-by: Wen Congyang <wency@cn.fujitsu.com>
Signed-off-by: Tang Chen <tangchen@cn.fujitsu.com>
---
 arch/x86/include/asm/pgtable_types.h |    1 +
 arch/x86/mm/init_64.c                |  297 ++++++++++++++++++++++++++++++++++
 arch/x86/mm/pageattr.c               |   47 +++---
 include/linux/bootmem.h              |    1 +
 4 files changed, 324 insertions(+), 22 deletions(-)
Jianguo Wu - Dec. 25, 2012, 8:17 a.m.
On 2012/12/24 20:09, Tang Chen wrote:

> From: Wen Congyang <wency@cn.fujitsu.com>
> 
> When memory is removed, the corresponding pagetables should also be removed.
> This patch introduces some common APIs to support vmemmap pagetable and x86_64
> architecture pagetable removing.
> 
> All pages of virtual mapping in removed memory cannot be freed if some pages
> used as PGD/PUD include not only removed memory but also other memory. So the
> patch uses the following way to check whether a page can be freed or not.
> 
>  1. When removing memory, the page structs of the removed memory are filled
>     with 0xFD.
>  2. When all page structs on a PT/PMD are filled with 0xFD, the PT/PMD can be
>     cleared. In this case, the page used as PT/PMD can be freed.
> 
> Signed-off-by: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
> Signed-off-by: Jianguo Wu <wujianguo@huawei.com>
> Signed-off-by: Wen Congyang <wency@cn.fujitsu.com>
> Signed-off-by: Tang Chen <tangchen@cn.fujitsu.com>
> ---
>  arch/x86/include/asm/pgtable_types.h |    1 +
>  arch/x86/mm/init_64.c                |  297 ++++++++++++++++++++++++++++++++++
>  arch/x86/mm/pageattr.c               |   47 +++---
>  include/linux/bootmem.h              |    1 +
>  4 files changed, 324 insertions(+), 22 deletions(-)
> 
> diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
> index 3c32db8..4b6fd2a 100644
> --- a/arch/x86/include/asm/pgtable_types.h
> +++ b/arch/x86/include/asm/pgtable_types.h
> @@ -352,6 +352,7 @@ static inline void update_page_count(int level, unsigned long pages) { }
>   * as a pte too.
>   */
>  extern pte_t *lookup_address(unsigned long address, unsigned int *level);
> +extern int __split_large_page(pte_t *kpte, unsigned long address, pte_t *pbase);
>  
>  #endif	/* !__ASSEMBLY__ */
>  
> diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
> index aeaa27e..b30df3c 100644
> --- a/arch/x86/mm/init_64.c
> +++ b/arch/x86/mm/init_64.c
> @@ -682,6 +682,303 @@ int arch_add_memory(int nid, u64 start, u64 size)
>  }
>  EXPORT_SYMBOL_GPL(arch_add_memory);
>  
> +#define PAGE_INUSE 0xFD
> +
> +static void __meminit free_pagetable(struct page *page, int order)
> +{
> +	struct zone *zone;
> +	bool bootmem = false;
> +	unsigned long magic;
> +
> +	/* bootmem page has reserved flag */
> +	if (PageReserved(page)) {
> +		__ClearPageReserved(page);
> +		bootmem = true;
> +
> +		magic = (unsigned long)page->lru.next;
> +		if (magic == SECTION_INFO || magic == MIX_SECTION_INFO)
> +			put_page_bootmem(page);

Hi Tang,

For removing memmap of sparse-vmemmap, in cpu_has_pse case, if magic == SECTION_INFO,
the order will be get_order(PMD_SIZE), so we need a loop here to put all the 512 pages.

Thanks,
Jianguo Wu

> +		else
> +			__free_pages_bootmem(page, order);
> +	} else
> +		free_pages((unsigned long)page_address(page), order);
> +
> +	/*
> +	 * SECTION_INFO pages and MIX_SECTION_INFO pages
> +	 * are all allocated by bootmem.
> +	 */
> +	if (bootmem) {
> +		zone = page_zone(page);
> +		zone_span_writelock(zone);
> +		zone->present_pages++;
> +		zone_span_writeunlock(zone);
> +		totalram_pages++;
> +	}
> +}
> +
> +static void __meminit free_pte_table(pte_t *pte_start, pmd_t *pmd)
> +{
> +	pte_t *pte;
> +	int i;
> +
> +	for (i = 0; i < PTRS_PER_PTE; i++) {
> +		pte = pte_start + i;
> +		if (pte_val(*pte))
> +			return;
> +	}
> +
> +	/* free a pte talbe */
> +	free_pagetable(pmd_page(*pmd), 0);
> +	spin_lock(&init_mm.page_table_lock);
> +	pmd_clear(pmd);
> +	spin_unlock(&init_mm.page_table_lock);
> +}
> +
> +static void __meminit free_pmd_table(pmd_t *pmd_start, pud_t *pud)
> +{
> +	pmd_t *pmd;
> +	int i;
> +
> +	for (i = 0; i < PTRS_PER_PMD; i++) {
> +		pmd = pmd_start + i;
> +		if (pmd_val(*pmd))
> +			return;
> +	}
> +
> +	/* free a pmd talbe */
> +	free_pagetable(pud_page(*pud), 0);
> +	spin_lock(&init_mm.page_table_lock);
> +	pud_clear(pud);
> +	spin_unlock(&init_mm.page_table_lock);
> +}
> +
> +/* Return true if pgd is changed, otherwise return false. */
> +static bool __meminit free_pud_table(pud_t *pud_start, pgd_t *pgd)
> +{
> +	pud_t *pud;
> +	int i;
> +
> +	for (i = 0; i < PTRS_PER_PUD; i++) {
> +		pud = pud_start + i;
> +		if (pud_val(*pud))
> +			return false;
> +	}
> +
> +	/* free a pud table */
> +	free_pagetable(pgd_page(*pgd), 0);
> +	spin_lock(&init_mm.page_table_lock);
> +	pgd_clear(pgd);
> +	spin_unlock(&init_mm.page_table_lock);
> +
> +	return true;
> +}
> +
> +static void __meminit
> +remove_pte_table(pte_t *pte_start, unsigned long addr, unsigned long end,
> +		 bool direct)
> +{
> +	unsigned long next, pages = 0;
> +	pte_t *pte;
> +	void *page_addr;
> +	phys_addr_t phys_addr;
> +
> +	pte = pte_start + pte_index(addr);
> +	for (; addr < end; addr = next, pte++) {
> +		next = (addr + PAGE_SIZE) & PAGE_MASK;
> +		if (next > end)
> +			next = end;
> +
> +		if (!pte_present(*pte))
> +			continue;
> +
> +		/*
> +		 * We mapped [0,1G) memory as identity mapping when
> +		 * initializing, in arch/x86/kernel/head_64.S. These
> +		 * pagetables cannot be removed.
> +		 */
> +		phys_addr = pte_val(*pte) + (addr & PAGE_MASK);
> +		if (phys_addr < (phys_addr_t)0x40000000)
> +			return;
> +
> +		if (IS_ALIGNED(addr, PAGE_SIZE) &&
> +		    IS_ALIGNED(next, PAGE_SIZE)) {
> +			if (!direct) {
> +				free_pagetable(pte_page(*pte), 0);
> +				pages++;
> +			}
> +
> +			spin_lock(&init_mm.page_table_lock);
> +			pte_clear(&init_mm, addr, pte);
> +			spin_unlock(&init_mm.page_table_lock);
> +		} else {
> +			/*
> +			 * If we are not removing the whole page, it means
> +			 * other ptes in this page are being used and we canot
> +			 * remove them. So fill the unused ptes with 0xFD, and
> +			 * remove the page when it is wholly filled with 0xFD.
> +			 */
> +			memset((void *)addr, PAGE_INUSE, next - addr);
> +			page_addr = page_address(pte_page(*pte));
> +
> +			if (!memchr_inv(page_addr, PAGE_INUSE, PAGE_SIZE)) {
> +				free_pagetable(pte_page(*pte), 0);
> +				pages++;
> +
> +				spin_lock(&init_mm.page_table_lock);
> +				pte_clear(&init_mm, addr, pte);
> +				spin_unlock(&init_mm.page_table_lock);
> +			}
> +		}
> +	}
> +
> +	/* Call free_pte_table() in remove_pmd_table(). */
> +	flush_tlb_all();
> +	if (direct)
> +		update_page_count(PG_LEVEL_4K, -pages);
> +}
> +
> +static void __meminit
> +remove_pmd_table(pmd_t *pmd_start, unsigned long addr, unsigned long end,
> +		 bool direct)
> +{
> +	unsigned long pte_phys, next, pages = 0;
> +	pte_t *pte_base;
> +	pmd_t *pmd;
> +
> +	pmd = pmd_start + pmd_index(addr);
> +	for (; addr < end; addr = next, pmd++) {
> +		next = pmd_addr_end(addr, end);
> +
> +		if (!pmd_present(*pmd))
> +			continue;
> +
> +		if (pmd_large(*pmd)) {
> +			if (IS_ALIGNED(addr, PMD_SIZE) &&
> +			    IS_ALIGNED(next, PMD_SIZE)) {
> +				if (!direct) {
> +					free_pagetable(pmd_page(*pmd),
> +						       get_order(PMD_SIZE));
> +					pages++;
> +				}
> +
> +				spin_lock(&init_mm.page_table_lock);
> +				pmd_clear(pmd);
> +				spin_unlock(&init_mm.page_table_lock);
> +				continue;
> +			}
> +
> +			/*
> +			 * We use 2M page, but we need to remove part of them,
> +			 * so split 2M page to 4K page.
> +			 */
> +			pte_base = (pte_t *)alloc_low_page(&pte_phys);
> +			BUG_ON(!pte_base);
> +			__split_large_page((pte_t *)pmd, addr,
> +					   (pte_t *)pte_base);
> +
> +			spin_lock(&init_mm.page_table_lock);
> +			pmd_populate_kernel(&init_mm, pmd, __va(pte_phys));
> +			spin_unlock(&init_mm.page_table_lock);
> +
> +			flush_tlb_all();
> +		}
> +
> +		pte_base = (pte_t *)map_low_page((pte_t *)pmd_page_vaddr(*pmd));
> +		remove_pte_table(pte_base, addr, next, direct);
> +		free_pte_table(pte_base, pmd);
> +		unmap_low_page(pte_base);
> +	}
> +
> +	/* Call free_pmd_table() in remove_pud_table(). */
> +	if (direct)
> +		update_page_count(PG_LEVEL_2M, -pages);
> +}
> +
> +static void __meminit
> +remove_pud_table(pud_t *pud_start, unsigned long addr, unsigned long end,
> +		 bool direct)
> +{
> +	unsigned long pmd_phys, next, pages = 0;
> +	pmd_t *pmd_base;
> +	pud_t *pud;
> +
> +	pud = pud_start + pud_index(addr);
> +	for (; addr < end; addr = next, pud++) {
> +		next = pud_addr_end(addr, end);
> +
> +		if (!pud_present(*pud))
> +			continue;
> +
> +		if (pud_large(*pud)) {
> +			if (IS_ALIGNED(addr, PUD_SIZE) &&
> +			    IS_ALIGNED(next, PUD_SIZE)) {
> +				if (!direct) {
> +					free_pagetable(pud_page(*pud),
> +						       get_order(PUD_SIZE));
> +					pages++;
> +				}
> +
> +				spin_lock(&init_mm.page_table_lock);
> +				pud_clear(pud);
> +				spin_unlock(&init_mm.page_table_lock);
> +				continue;
> +			}
> +
> +			/*
> +			 * We use 1G page, but we need to remove part of them,
> +			 * so split 1G page to 2M page.
> +			 */
> +			pmd_base = (pmd_t *)alloc_low_page(&pmd_phys);
> +			BUG_ON(!pmd_base);
> +			__split_large_page((pte_t *)pud, addr,
> +					   (pte_t *)pmd_base);
> +
> +			spin_lock(&init_mm.page_table_lock);
> +			pud_populate(&init_mm, pud, __va(pmd_phys));
> +			spin_unlock(&init_mm.page_table_lock);
> +
> +			flush_tlb_all();
> +		}
> +
> +		pmd_base = (pmd_t *)map_low_page((pmd_t *)pud_page_vaddr(*pud));
> +		remove_pmd_table(pmd_base, addr, next, direct);
> +		free_pmd_table(pmd_base, pud);
> +		unmap_low_page(pmd_base);
> +	}
> +
> +	if (direct)
> +		update_page_count(PG_LEVEL_1G, -pages);
> +}
> +
> +/* start and end are both virtual address. */
> +static void __meminit
> +remove_pagetable(unsigned long start, unsigned long end, bool direct)
> +{
> +	unsigned long next;
> +	pgd_t *pgd;
> +	pud_t *pud;
> +	bool pgd_changed = false;
> +
> +	for (; start < end; start = next) {
> +		pgd = pgd_offset_k(start);
> +		if (!pgd_present(*pgd))
> +			continue;
> +
> +		next = pgd_addr_end(start, end);
> +
> +		pud = (pud_t *)map_low_page((pud_t *)pgd_page_vaddr(*pgd));
> +		remove_pud_table(pud, start, next, direct);
> +		if (free_pud_table(pud, pgd))
> +			pgd_changed = true;
> +		unmap_low_page(pud);
> +	}
> +
> +	if (pgd_changed)
> +		sync_global_pgds(start, end - 1);
> +
> +	flush_tlb_all();
> +}
> +
>  #ifdef CONFIG_MEMORY_HOTREMOVE
>  int __ref arch_remove_memory(u64 start, u64 size)
>  {
> diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
> index a718e0d..7dcb6f9 100644
> --- a/arch/x86/mm/pageattr.c
> +++ b/arch/x86/mm/pageattr.c
> @@ -501,21 +501,13 @@ out_unlock:
>  	return do_split;
>  }
>  
> -static int split_large_page(pte_t *kpte, unsigned long address)
> +int __split_large_page(pte_t *kpte, unsigned long address, pte_t *pbase)
>  {
>  	unsigned long pfn, pfninc = 1;
>  	unsigned int i, level;
> -	pte_t *pbase, *tmp;
> +	pte_t *tmp;
>  	pgprot_t ref_prot;
> -	struct page *base;
> -
> -	if (!debug_pagealloc)
> -		spin_unlock(&cpa_lock);
> -	base = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0);
> -	if (!debug_pagealloc)
> -		spin_lock(&cpa_lock);
> -	if (!base)
> -		return -ENOMEM;
> +	struct page *base = virt_to_page(pbase);
>  
>  	spin_lock(&pgd_lock);
>  	/*
> @@ -523,10 +515,11 @@ static int split_large_page(pte_t *kpte, unsigned long address)
>  	 * up for us already:
>  	 */
>  	tmp = lookup_address(address, &level);
> -	if (tmp != kpte)
> -		goto out_unlock;
> +	if (tmp != kpte) {
> +		spin_unlock(&pgd_lock);
> +		return 1;
> +	}
>  
> -	pbase = (pte_t *)page_address(base);
>  	paravirt_alloc_pte(&init_mm, page_to_pfn(base));
>  	ref_prot = pte_pgprot(pte_clrhuge(*kpte));
>  	/*
> @@ -579,17 +572,27 @@ static int split_large_page(pte_t *kpte, unsigned long address)
>  	 * going on.
>  	 */
>  	__flush_tlb_all();
> +	spin_unlock(&pgd_lock);
>  
> -	base = NULL;
> +	return 0;
> +}
>  
> -out_unlock:
> -	/*
> -	 * If we dropped out via the lookup_address check under
> -	 * pgd_lock then stick the page back into the pool:
> -	 */
> -	if (base)
> +static int split_large_page(pte_t *kpte, unsigned long address)
> +{
> +	pte_t *pbase;
> +	struct page *base;
> +
> +	if (!debug_pagealloc)
> +		spin_unlock(&cpa_lock);
> +	base = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0);
> +	if (!debug_pagealloc)
> +		spin_lock(&cpa_lock);
> +	if (!base)
> +		return -ENOMEM;
> +
> +	pbase = (pte_t *)page_address(base);
> +	if (__split_large_page(kpte, address, pbase))
>  		__free_page(base);
> -	spin_unlock(&pgd_lock);
>  
>  	return 0;
>  }
> diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h
> index 3f778c2..190ff06 100644
> --- a/include/linux/bootmem.h
> +++ b/include/linux/bootmem.h
> @@ -53,6 +53,7 @@ extern void free_bootmem_node(pg_data_t *pgdat,
>  			      unsigned long size);
>  extern void free_bootmem(unsigned long physaddr, unsigned long size);
>  extern void free_bootmem_late(unsigned long physaddr, unsigned long size);
> +extern void __free_pages_bootmem(struct page *page, unsigned int order);
>  
>  /*
>   * Flags for reserve_bootmem (also if CONFIG_HAVE_ARCH_BOOTMEM_NODE,



--
To unsubscribe from this list: send the line "unsubscribe sparclinux" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Tang Chen - Dec. 26, 2012, 2:49 a.m.
On 12/25/2012 04:17 PM, Jianguo Wu wrote:
>> +
>> +static void __meminit free_pagetable(struct page *page, int order)
>> +{
>> +	struct zone *zone;
>> +	bool bootmem = false;
>> +	unsigned long magic;
>> +
>> +	/* bootmem page has reserved flag */
>> +	if (PageReserved(page)) {
>> +		__ClearPageReserved(page);
>> +		bootmem = true;
>> +
>> +		magic = (unsigned long)page->lru.next;
>> +		if (magic == SECTION_INFO || magic == MIX_SECTION_INFO)
>> +			put_page_bootmem(page);
>
> Hi Tang,
>
> For removing memmap of sparse-vmemmap, in cpu_has_pse case, if magic == SECTION_INFO,
> the order will be get_order(PMD_SIZE), so we need a loop here to put all the 512 pages.
>
Hi Wu,

Thanks for reminding me of that. I truly missed it.

And since in register_page_bootmem_info_section(), a whole memory
section will be set as SECTION_INFO, I think we don't need to check
the page magic one by one, just the first one is enough. :)

I will fix it, thanks. :)

--
To unsubscribe from this list: send the line "unsubscribe sparclinux" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Tang Chen - Dec. 26, 2012, 3:11 a.m.
On 12/26/2012 10:49 AM, Tang Chen wrote:
> On 12/25/2012 04:17 PM, Jianguo Wu wrote:
>>> +
>>> +static void __meminit free_pagetable(struct page *page, int order)
>>> +{
>>> + struct zone *zone;
>>> + bool bootmem = false;
>>> + unsigned long magic;
>>> +
>>> + /* bootmem page has reserved flag */
>>> + if (PageReserved(page)) {
>>> + __ClearPageReserved(page);
>>> + bootmem = true;
>>> +
>>> + magic = (unsigned long)page->lru.next;
>>> + if (magic == SECTION_INFO || magic == MIX_SECTION_INFO)

And also, I think we don't need to check MIX_SECTION_INFO since it is
for the pageblock_flags, not the memmap in the section.

Thanks. :)

>>> + put_page_bootmem(page);
>>
>> Hi Tang,
>>
>> For removing memmap of sparse-vmemmap, in cpu_has_pse case, if magic
>> == SECTION_INFO,
>> the order will be get_order(PMD_SIZE), so we need a loop here to put
>> all the 512 pages.
>>
> Hi Wu,
>
> Thanks for reminding me of that. I truly missed it.
>
> And since in register_page_bootmem_info_section(), a whole memory
> section will be set as SECTION_INFO, I think we don't need to check
> the page magic one by one, just the first one is enough. :)
>
> I will fix it, thanks. :)
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-acpi" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
>

--
To unsubscribe from this list: send the line "unsubscribe sparclinux" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Tang Chen - Dec. 26, 2012, 3:19 a.m.
On 12/26/2012 11:11 AM, Tang Chen wrote:
> On 12/26/2012 10:49 AM, Tang Chen wrote:
>> On 12/25/2012 04:17 PM, Jianguo Wu wrote:
>>>> +
>>>> +static void __meminit free_pagetable(struct page *page, int order)
>>>> +{
>>>> + struct zone *zone;
>>>> + bool bootmem = false;
>>>> + unsigned long magic;
>>>> +
>>>> + /* bootmem page has reserved flag */
>>>> + if (PageReserved(page)) {
>>>> + __ClearPageReserved(page);
>>>> + bootmem = true;
>>>> +
>>>> + magic = (unsigned long)page->lru.next;
>>>> + if (magic == SECTION_INFO || magic == MIX_SECTION_INFO)
>
> And also, I think we don't need to check MIX_SECTION_INFO since it is
> for the pageblock_flags, not the memmap in the section.

Oh, no :)

We also need to check MIX_SECTION_INFO because we set pgd, pud, pmd
pages as MIX_SECTION_INFO in register_page_bootmem_memmap() in patch6.

Thanks. :)

>
> Thanks. :)
>
>>>> + put_page_bootmem(page);
>>>
>>> Hi Tang,
>>>
>>> For removing memmap of sparse-vmemmap, in cpu_has_pse case, if magic
>>> == SECTION_INFO,
>>> the order will be get_order(PMD_SIZE), so we need a loop here to put
>>> all the 512 pages.
>>>
>> Hi Wu,
>>
>> Thanks for reminding me of that. I truly missed it.
>>
>> And since in register_page_bootmem_info_section(), a whole memory
>> section will be set as SECTION_INFO, I think we don't need to check
>> the page magic one by one, just the first one is enough. :)
>>
>> I will fix it, thanks. :)
>>
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-acpi" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at http://vger.kernel.org/majordomo-info.html
>>
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-acpi" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
>

--
To unsubscribe from this list: send the line "unsubscribe sparclinux" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Patch

diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 3c32db8..4b6fd2a 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -352,6 +352,7 @@  static inline void update_page_count(int level, unsigned long pages) { }
  * as a pte too.
  */
 extern pte_t *lookup_address(unsigned long address, unsigned int *level);
+extern int __split_large_page(pte_t *kpte, unsigned long address, pte_t *pbase);
 
 #endif	/* !__ASSEMBLY__ */
 
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index aeaa27e..b30df3c 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -682,6 +682,303 @@  int arch_add_memory(int nid, u64 start, u64 size)
 }
 EXPORT_SYMBOL_GPL(arch_add_memory);
 
+#define PAGE_INUSE 0xFD
+
+static void __meminit free_pagetable(struct page *page, int order)
+{
+	struct zone *zone;
+	bool bootmem = false;
+	unsigned long magic;
+
+	/* bootmem page has reserved flag */
+	if (PageReserved(page)) {
+		__ClearPageReserved(page);
+		bootmem = true;
+
+		magic = (unsigned long)page->lru.next;
+		if (magic == SECTION_INFO || magic == MIX_SECTION_INFO)
+			put_page_bootmem(page);
+		else
+			__free_pages_bootmem(page, order);
+	} else
+		free_pages((unsigned long)page_address(page), order);
+
+	/*
+	 * SECTION_INFO pages and MIX_SECTION_INFO pages
+	 * are all allocated by bootmem.
+	 */
+	if (bootmem) {
+		zone = page_zone(page);
+		zone_span_writelock(zone);
+		zone->present_pages++;
+		zone_span_writeunlock(zone);
+		totalram_pages++;
+	}
+}
+
+static void __meminit free_pte_table(pte_t *pte_start, pmd_t *pmd)
+{
+	pte_t *pte;
+	int i;
+
+	for (i = 0; i < PTRS_PER_PTE; i++) {
+		pte = pte_start + i;
+		if (pte_val(*pte))
+			return;
+	}
+
+	/* free a pte table */
+	free_pagetable(pmd_page(*pmd), 0);
+	spin_lock(&init_mm.page_table_lock);
+	pmd_clear(pmd);
+	spin_unlock(&init_mm.page_table_lock);
+}
+
+static void __meminit free_pmd_table(pmd_t *pmd_start, pud_t *pud)
+{
+	pmd_t *pmd;
+	int i;
+
+	for (i = 0; i < PTRS_PER_PMD; i++) {
+		pmd = pmd_start + i;
+		if (pmd_val(*pmd))
+			return;
+	}
+
+	/* free a pmd table */
+	free_pagetable(pud_page(*pud), 0);
+	spin_lock(&init_mm.page_table_lock);
+	pud_clear(pud);
+	spin_unlock(&init_mm.page_table_lock);
+}
+
+/* Return true if pgd is changed, otherwise return false. */
+static bool __meminit free_pud_table(pud_t *pud_start, pgd_t *pgd)
+{
+	pud_t *pud;
+	int i;
+
+	for (i = 0; i < PTRS_PER_PUD; i++) {
+		pud = pud_start + i;
+		if (pud_val(*pud))
+			return false;
+	}
+
+	/* free a pud table */
+	free_pagetable(pgd_page(*pgd), 0);
+	spin_lock(&init_mm.page_table_lock);
+	pgd_clear(pgd);
+	spin_unlock(&init_mm.page_table_lock);
+
+	return true;
+}
+
+static void __meminit
+remove_pte_table(pte_t *pte_start, unsigned long addr, unsigned long end,
+		 bool direct)
+{
+	unsigned long next, pages = 0;
+	pte_t *pte;
+	void *page_addr;
+	phys_addr_t phys_addr;
+
+	pte = pte_start + pte_index(addr);
+	for (; addr < end; addr = next, pte++) {
+		next = (addr + PAGE_SIZE) & PAGE_MASK;
+		if (next > end)
+			next = end;
+
+		if (!pte_present(*pte))
+			continue;
+
+		/*
+		 * We mapped [0,1G) memory as identity mapping when
+		 * initializing, in arch/x86/kernel/head_64.S. These
+		 * pagetables cannot be removed.
+		 */
+		phys_addr = pte_val(*pte) + (addr & PAGE_MASK);
+		if (phys_addr < (phys_addr_t)0x40000000)
+			return;
+
+		if (IS_ALIGNED(addr, PAGE_SIZE) &&
+		    IS_ALIGNED(next, PAGE_SIZE)) {
+			if (!direct) {
+				free_pagetable(pte_page(*pte), 0);
+				pages++;
+			}
+
+			spin_lock(&init_mm.page_table_lock);
+			pte_clear(&init_mm, addr, pte);
+			spin_unlock(&init_mm.page_table_lock);
+		} else {
+			/*
+			 * If we are not removing the whole page, it means
+			 * other ptes in this page are being used and we cannot
+			 * remove them. So fill the unused ptes with 0xFD, and
+			 * remove the page when it is wholly filled with 0xFD.
+			 */
+			memset((void *)addr, PAGE_INUSE, next - addr);
+			page_addr = page_address(pte_page(*pte));
+
+			if (!memchr_inv(page_addr, PAGE_INUSE, PAGE_SIZE)) {
+				free_pagetable(pte_page(*pte), 0);
+				pages++;
+
+				spin_lock(&init_mm.page_table_lock);
+				pte_clear(&init_mm, addr, pte);
+				spin_unlock(&init_mm.page_table_lock);
+			}
+		}
+	}
+
+	/* Call free_pte_table() in remove_pmd_table(). */
+	flush_tlb_all();
+	if (direct)
+		update_page_count(PG_LEVEL_4K, -pages);
+}
+
+static void __meminit
+remove_pmd_table(pmd_t *pmd_start, unsigned long addr, unsigned long end,
+		 bool direct)
+{
+	unsigned long pte_phys, next, pages = 0;
+	pte_t *pte_base;
+	pmd_t *pmd;
+
+	pmd = pmd_start + pmd_index(addr);
+	for (; addr < end; addr = next, pmd++) {
+		next = pmd_addr_end(addr, end);
+
+		if (!pmd_present(*pmd))
+			continue;
+
+		if (pmd_large(*pmd)) {
+			if (IS_ALIGNED(addr, PMD_SIZE) &&
+			    IS_ALIGNED(next, PMD_SIZE)) {
+				if (!direct) {
+					free_pagetable(pmd_page(*pmd),
+						       get_order(PMD_SIZE));
+					pages++;
+				}
+
+				spin_lock(&init_mm.page_table_lock);
+				pmd_clear(pmd);
+				spin_unlock(&init_mm.page_table_lock);
+				continue;
+			}
+
+			/*
+			 * We use 2M page, but we need to remove part of them,
+			 * so split 2M page to 4K page.
+			 */
+			pte_base = (pte_t *)alloc_low_page(&pte_phys);
+			BUG_ON(!pte_base);
+			__split_large_page((pte_t *)pmd, addr,
+					   (pte_t *)pte_base);
+
+			spin_lock(&init_mm.page_table_lock);
+			pmd_populate_kernel(&init_mm, pmd, __va(pte_phys));
+			spin_unlock(&init_mm.page_table_lock);
+
+			flush_tlb_all();
+		}
+
+		pte_base = (pte_t *)map_low_page((pte_t *)pmd_page_vaddr(*pmd));
+		remove_pte_table(pte_base, addr, next, direct);
+		free_pte_table(pte_base, pmd);
+		unmap_low_page(pte_base);
+	}
+
+	/* Call free_pmd_table() in remove_pud_table(). */
+	if (direct)
+		update_page_count(PG_LEVEL_2M, -pages);
+}
+
+static void __meminit
+remove_pud_table(pud_t *pud_start, unsigned long addr, unsigned long end,
+		 bool direct)
+{
+	unsigned long pmd_phys, next, pages = 0;
+	pmd_t *pmd_base;
+	pud_t *pud;
+
+	pud = pud_start + pud_index(addr);
+	for (; addr < end; addr = next, pud++) {
+		next = pud_addr_end(addr, end);
+
+		if (!pud_present(*pud))
+			continue;
+
+		if (pud_large(*pud)) {
+			if (IS_ALIGNED(addr, PUD_SIZE) &&
+			    IS_ALIGNED(next, PUD_SIZE)) {
+				if (!direct) {
+					free_pagetable(pud_page(*pud),
+						       get_order(PUD_SIZE));
+					pages++;
+				}
+
+				spin_lock(&init_mm.page_table_lock);
+				pud_clear(pud);
+				spin_unlock(&init_mm.page_table_lock);
+				continue;
+			}
+
+			/*
+			 * We use 1G page, but we need to remove part of them,
+			 * so split 1G page to 2M page.
+			 */
+			pmd_base = (pmd_t *)alloc_low_page(&pmd_phys);
+			BUG_ON(!pmd_base);
+			__split_large_page((pte_t *)pud, addr,
+					   (pte_t *)pmd_base);
+
+			spin_lock(&init_mm.page_table_lock);
+			pud_populate(&init_mm, pud, __va(pmd_phys));
+			spin_unlock(&init_mm.page_table_lock);
+
+			flush_tlb_all();
+		}
+
+		pmd_base = (pmd_t *)map_low_page((pmd_t *)pud_page_vaddr(*pud));
+		remove_pmd_table(pmd_base, addr, next, direct);
+		free_pmd_table(pmd_base, pud);
+		unmap_low_page(pmd_base);
+	}
+
+	if (direct)
+		update_page_count(PG_LEVEL_1G, -pages);
+}
+
+/* start and end are both virtual address. */
+static void __meminit
+remove_pagetable(unsigned long start, unsigned long end, bool direct)
+{
+	unsigned long next;
+	pgd_t *pgd;
+	pud_t *pud;
+	bool pgd_changed = false;
+
+	for (; start < end; start = next) {
+		pgd = pgd_offset_k(start);
+		if (!pgd_present(*pgd))
+			continue;
+
+		next = pgd_addr_end(start, end);
+
+		pud = (pud_t *)map_low_page((pud_t *)pgd_page_vaddr(*pgd));
+		remove_pud_table(pud, start, next, direct);
+		if (free_pud_table(pud, pgd))
+			pgd_changed = true;
+		unmap_low_page(pud);
+	}
+
+	if (pgd_changed)
+		sync_global_pgds(start, end - 1);
+
+	flush_tlb_all();
+}
+
 #ifdef CONFIG_MEMORY_HOTREMOVE
 int __ref arch_remove_memory(u64 start, u64 size)
 {
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index a718e0d..7dcb6f9 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -501,21 +501,13 @@  out_unlock:
 	return do_split;
 }
 
-static int split_large_page(pte_t *kpte, unsigned long address)
+int __split_large_page(pte_t *kpte, unsigned long address, pte_t *pbase)
 {
 	unsigned long pfn, pfninc = 1;
 	unsigned int i, level;
-	pte_t *pbase, *tmp;
+	pte_t *tmp;
 	pgprot_t ref_prot;
-	struct page *base;
-
-	if (!debug_pagealloc)
-		spin_unlock(&cpa_lock);
-	base = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0);
-	if (!debug_pagealloc)
-		spin_lock(&cpa_lock);
-	if (!base)
-		return -ENOMEM;
+	struct page *base = virt_to_page(pbase);
 
 	spin_lock(&pgd_lock);
 	/*
@@ -523,10 +515,11 @@  static int split_large_page(pte_t *kpte, unsigned long address)
 	 * up for us already:
 	 */
 	tmp = lookup_address(address, &level);
-	if (tmp != kpte)
-		goto out_unlock;
+	if (tmp != kpte) {
+		spin_unlock(&pgd_lock);
+		return 1;
+	}
 
-	pbase = (pte_t *)page_address(base);
 	paravirt_alloc_pte(&init_mm, page_to_pfn(base));
 	ref_prot = pte_pgprot(pte_clrhuge(*kpte));
 	/*
@@ -579,17 +572,27 @@  static int split_large_page(pte_t *kpte, unsigned long address)
 	 * going on.
 	 */
 	__flush_tlb_all();
+	spin_unlock(&pgd_lock);
 
-	base = NULL;
+	return 0;
+}
 
-out_unlock:
-	/*
-	 * If we dropped out via the lookup_address check under
-	 * pgd_lock then stick the page back into the pool:
-	 */
-	if (base)
+static int split_large_page(pte_t *kpte, unsigned long address)
+{
+	pte_t *pbase;
+	struct page *base;
+
+	if (!debug_pagealloc)
+		spin_unlock(&cpa_lock);
+	base = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0);
+	if (!debug_pagealloc)
+		spin_lock(&cpa_lock);
+	if (!base)
+		return -ENOMEM;
+
+	pbase = (pte_t *)page_address(base);
+	if (__split_large_page(kpte, address, pbase))
 		__free_page(base);
-	spin_unlock(&pgd_lock);
 
 	return 0;
 }
diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h
index 3f778c2..190ff06 100644
--- a/include/linux/bootmem.h
+++ b/include/linux/bootmem.h
@@ -53,6 +53,7 @@  extern void free_bootmem_node(pg_data_t *pgdat,
 			      unsigned long size);
 extern void free_bootmem(unsigned long physaddr, unsigned long size);
 extern void free_bootmem_late(unsigned long physaddr, unsigned long size);
+extern void __free_pages_bootmem(struct page *page, unsigned int order);
 
 /*
  * Flags for reserve_bootmem (also if CONFIG_HAVE_ARCH_BOOTMEM_NODE,