
[RFC,v2,14/32] x86: mm: Provide support to use memblock when splitting large pages

Message ID c3b12452-9533-87c8-4543-8f12f5c90fe3@redhat.com
State Not Applicable

Commit Message

Paolo Bonzini March 16, 2017, 1:15 p.m. UTC
On 10/03/2017 23:41, Brijesh Singh wrote:
>> Maybe there's a reason this fires:
>>
>> WARNING: modpost: Found 2 section mismatch(es).
>> To see full details build your kernel with:
>> 'make CONFIG_DEBUG_SECTION_MISMATCH=y'
>>
>> WARNING: vmlinux.o(.text+0x48edc): Section mismatch in reference from
>> the function __change_page_attr() to the function
>> .init.text:memblock_alloc()
>> The function __change_page_attr() references
>> the function __init memblock_alloc().
>> This is often because __change_page_attr lacks a __init
>> annotation or the annotation of memblock_alloc is wrong.
>>
>> WARNING: vmlinux.o(.text+0x491d1): Section mismatch in reference from
>> the function __change_page_attr() to the function
>> .meminit.text:memblock_free()
>> The function __change_page_attr() references
>> the function __meminit memblock_free().
>> This is often because __change_page_attr lacks a __meminit
>> annotation or the annotation of memblock_free is wrong.
>> 
>> But maybe Paolo might have an even better idea...
> 
> I am sure he will have better idea :)
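
For reference, the pattern modpost is warning about above is simply a plain
.text function keeping a reference to .init.text code, which is discarded
once boot finishes.  A minimal illustration (not taken from this series):

#include <linux/init.h>

/* Placed in .init.text and freed after boot. */
static int __init early_only_helper(void)
{
	return 0;
}

/*
 * Stays in .text.  Calling the __init function above is exactly what
 * produces "Section mismatch in reference from ...": after
 * free_initmem() this call would land in freed memory.
 */
int regular_path(void)
{
	return early_only_helper();
}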

Not sure if it's better or worse, but an alternative idea is to turn
__change_page_attr and __change_page_attr_set_clr inside out, so that:
1) the alloc_pages()/__free_page() calls happen in __change_page_attr_set_clr;
2) __change_page_attr_set_clr overall does not become more complex.

Then you can introduce __early_change_page_attr_set_clr and/or
early_kernel_map_pages_in_pgd, for use in your next patches.  They use
the memblock allocator instead of alloc_pages()/__free_page().
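
As a rough sketch only (the helper name below is an assumption for
illustration, not part of the attached patch, and it assumes the current
memblock API where memblock_alloc() returns a physical address), such an
early path could take the page for the split from memblock roughly like
this:

static int __init early_split_large_page(struct cpa_data *cpa, pte_t *kpte,
					 unsigned long address)
{
	phys_addr_t pa;
	struct page *base;

	/*
	 * memblock_alloc() is __init, so keeping this caller __init is
	 * what avoids the modpost section mismatch quoted above.
	 */
	pa = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
	if (!pa)
		return -ENOMEM;

	base = pfn_to_page(PHYS_PFN(pa));
	if (__split_large_page(cpa, kpte, address, base))
		memblock_free(pa, PAGE_SIZE);

	return 0;
}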

The attached patch is compile-tested only and almost certainly has some
thinko in it.  But it even shaves a few lines off the code, so the idea
might have some merit.

Paolo

Patch

diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 28d42130243c..953c8e697562 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -490,11 +490,12 @@  static void __set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte)
 }
 
 static int
-try_preserve_large_page(pte_t *kpte, unsigned long address,
+try_preserve_large_page(pte_t **p_kpte, unsigned long address,
 			struct cpa_data *cpa)
 {
 	unsigned long nextpage_addr, numpages, pmask, psize, addr, pfn, old_pfn;
-	pte_t new_pte, old_pte, *tmp;
+	pte_t *kpte = *p_kpte;
+	pte_t new_pte, old_pte;
 	pgprot_t old_prot, new_prot, req_prot;
 	int i, do_split = 1;
 	enum pg_level level;
@@ -507,8 +508,8 @@  try_preserve_large_page(pte_t *kpte, unsigned long address,
 	 * Check for races, another CPU might have split this page
 	 * up already:
 	 */
-	tmp = _lookup_address_cpa(cpa, address, &level);
-	if (tmp != kpte)
+	*p_kpte = _lookup_address_cpa(cpa, address, &level);
+	if (*p_kpte != kpte)
 		goto out_unlock;
 
 	switch (level) {
@@ -634,17 +635,18 @@  __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
 	unsigned int i, level;
 	pte_t *tmp;
 	pgprot_t ref_prot;
+	int retry = 1;
 
+	if (!debug_pagealloc_enabled())
+		spin_lock(&cpa_lock);
 	spin_lock(&pgd_lock);
 	/*
 	 * Check for races, another CPU might have split this page
 	 * up for us already:
 	 */
 	tmp = _lookup_address_cpa(cpa, address, &level);
-	if (tmp != kpte) {
-		spin_unlock(&pgd_lock);
-		return 1;
-	}
+	if (tmp != kpte)
+		goto out;
 
 	paravirt_alloc_pte(&init_mm, page_to_pfn(base));
 
@@ -671,10 +673,11 @@  __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
 		break;
 
 	default:
-		spin_unlock(&pgd_lock);
-		return 1;
+		goto out;
 	}
 
+	retry = 0;
+
 	/*
 	 * Set the GLOBAL flags only if the PRESENT flag is set
 	 * otherwise pmd/pte_present will return true even on a non
@@ -718,28 +721,34 @@  __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
 	 * going on.
 	 */
 	__flush_tlb_all();
-	spin_unlock(&pgd_lock);
 
-	return 0;
-}
-
-static int split_large_page(struct cpa_data *cpa, pte_t *kpte,
-			    unsigned long address)
-{
-	struct page *base;
+out:
+	spin_unlock(&pgd_lock);
 
+	/*
+	 * Do a global flush tlb after splitting the large page
+ 	 * and before we do the actual change page attribute in the PTE.
+ 	 *
+ 	 * With out this, we violate the TLB application note, that says
+ 	 * "The TLBs may contain both ordinary and large-page
+	 *  translations for a 4-KByte range of linear addresses. This
+	 *  may occur if software modifies the paging structures so that
+	 *  the page size used for the address range changes. If the two
+	 *  translations differ with respect to page frame or attributes
+	 *  (e.g., permissions), processor behavior is undefined and may
+	 *  be implementation-specific."
+ 	 *
+ 	 * We do this global tlb flush inside the cpa_lock, so that we
+	 * don't allow any other cpu, with stale tlb entries change the
+	 * page attribute in parallel, that also falls into the
+	 * just split large page entry.
+ 	 */
+	if (!retry)
+		flush_tlb_all();
 	if (!debug_pagealloc_enabled())
 		spin_unlock(&cpa_lock);
-	base = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0);
-	if (!debug_pagealloc_enabled())
-		spin_lock(&cpa_lock);
-	if (!base)
-		return -ENOMEM;
-
-	if (__split_large_page(cpa, kpte, address, base))
-		__free_page(base);
 
-	return 0;
+	return retry;
 }
 
 static bool try_to_free_pte_page(pte_t *pte)
@@ -1166,30 +1175,26 @@  static int __cpa_process_fault(struct cpa_data *cpa, unsigned long vaddr,
 	}
 }
 
-static int __change_page_attr(struct cpa_data *cpa, int primary)
+static int __change_page_attr(struct cpa_data *cpa, pte_t **p_kpte, unsigned long address,
+			      int primary)
 {
-	unsigned long address;
-	int do_split, err;
 	unsigned int level;
 	pte_t *kpte, old_pte;
+	int err = 0;
 
-	if (cpa->flags & CPA_PAGES_ARRAY) {
-		struct page *page = cpa->pages[cpa->curpage];
-		if (unlikely(PageHighMem(page)))
-			return 0;
-		address = (unsigned long)page_address(page);
-	} else if (cpa->flags & CPA_ARRAY)
-		address = cpa->vaddr[cpa->curpage];
-	else
-		address = *cpa->vaddr;
-repeat:
-	kpte = _lookup_address_cpa(cpa, address, &level);
-	if (!kpte)
-		return __cpa_process_fault(cpa, address, primary);
+	if (!debug_pagealloc_enabled())
+		spin_lock(&cpa_lock);
+	*p_kpte = kpte = _lookup_address_cpa(cpa, address, &level);
+	if (!kpte) {
+		err = __cpa_process_fault(cpa, address, primary);
+		goto out;
+	}
 
 	old_pte = *kpte;
-	if (pte_none(old_pte))
-		return __cpa_process_fault(cpa, address, primary);
+	if (pte_none(old_pte)) {
+		err = __cpa_process_fault(cpa, address, primary);
+		goto out;
+	}
 
 	if (level == PG_LEVEL_4K) {
 		pte_t new_pte;
@@ -1228,59 +1233,27 @@  static int __change_page_attr(struct cpa_data *cpa, int primary)
 			cpa->flags |= CPA_FLUSHTLB;
 		}
 		cpa->numpages = 1;
-		return 0;
+		goto out;
 	}
 
 	/*
 	 * Check, whether we can keep the large page intact
 	 * and just change the pte:
 	 */
-	do_split = try_preserve_large_page(kpte, address, cpa);
-	/*
-	 * When the range fits into the existing large page,
-	 * return. cp->numpages and cpa->tlbflush have been updated in
-	 * try_large_page:
-	 */
-	if (do_split <= 0)
-		return do_split;
-
-	/*
-	 * We have to split the large page:
-	 */
-	err = split_large_page(cpa, kpte, address);
-	if (!err) {
-		/*
-	 	 * Do a global flush tlb after splitting the large page
-	 	 * and before we do the actual change page attribute in the PTE.
-	 	 *
-	 	 * With out this, we violate the TLB application note, that says
-	 	 * "The TLBs may contain both ordinary and large-page
-		 *  translations for a 4-KByte range of linear addresses. This
-		 *  may occur if software modifies the paging structures so that
-		 *  the page size used for the address range changes. If the two
-		 *  translations differ with respect to page frame or attributes
-		 *  (e.g., permissions), processor behavior is undefined and may
-		 *  be implementation-specific."
-	 	 *
-	 	 * We do this global tlb flush inside the cpa_lock, so that we
-		 * don't allow any other cpu, with stale tlb entries change the
-		 * page attribute in parallel, that also falls into the
-		 * just split large page entry.
-	 	 */
-		flush_tlb_all();
-		goto repeat;
-	}
+	err = try_preserve_large_page(p_kpte, address, cpa);
 
+out:
+	if (!debug_pagealloc_enabled())
+		spin_unlock(&cpa_lock);
 	return err;
 }
 
 static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias);
 
-static int cpa_process_alias(struct cpa_data *cpa)
+static int cpa_process_alias(struct cpa_data *cpa, unsigned long vaddr)
 {
 	struct cpa_data alias_cpa;
 	unsigned long laddr = (unsigned long)__va(cpa->pfn << PAGE_SHIFT);
-	unsigned long vaddr;
 	int ret;
 
 	if (!pfn_range_is_mapped(cpa->pfn, cpa->pfn + 1))
@@ -1290,16 +1263,6 @@  static int cpa_process_alias(struct cpa_data *cpa)
 	 * No need to redo, when the primary call touched the direct
 	 * mapping already:
 	 */
-	if (cpa->flags & CPA_PAGES_ARRAY) {
-		struct page *page = cpa->pages[cpa->curpage];
-		if (unlikely(PageHighMem(page)))
-			return 0;
-		vaddr = (unsigned long)page_address(page);
-	} else if (cpa->flags & CPA_ARRAY)
-		vaddr = cpa->vaddr[cpa->curpage];
-	else
-		vaddr = *cpa->vaddr;
-
 	if (!(within(vaddr, PAGE_OFFSET,
 		    PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT)))) {
 
@@ -1338,33 +1301,64 @@  static int cpa_process_alias(struct cpa_data *cpa)
 	return 0;
 }
 
+static unsigned long cpa_address(struct cpa_data *cpa, unsigned long numpages)
+{
+	/*
+	 * Store the remaining nr of pages for the large page
+	 * preservation check.
+	 */
+	/* for array changes, we can't use large page */
+	cpa->numpages = 1;
+	if (cpa->flags & CPA_PAGES_ARRAY) {
+		struct page *page = cpa->pages[cpa->curpage];
+		if (unlikely(PageHighMem(page)))
+			return -EINVAL;
+		return (unsigned long)page_address(page);
+	} else if (cpa->flags & CPA_ARRAY) {
+		return cpa->vaddr[cpa->curpage];
+	} else {
+		cpa->numpages = numpages;
+		return *cpa->vaddr;
+	}
+}
+
+static void cpa_advance(struct cpa_data *cpa)
+{
+	if (cpa->flags & (CPA_PAGES_ARRAY | CPA_ARRAY))
+		cpa->curpage++;
+	else
+		*cpa->vaddr += cpa->numpages * PAGE_SIZE;
+}
+
 static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias)
 {
 	unsigned long numpages = cpa->numpages;
+	unsigned long vaddr;
+	struct page *base;
+	pte_t *kpte;
 	int ret;
 
 	while (numpages) {
-		/*
-		 * Store the remaining nr of pages for the large page
-		 * preservation check.
-		 */
-		cpa->numpages = numpages;
-		/* for array changes, we can't use large page */
-		if (cpa->flags & (CPA_ARRAY | CPA_PAGES_ARRAY))
-			cpa->numpages = 1;
-
-		if (!debug_pagealloc_enabled())
-			spin_lock(&cpa_lock);
-		ret = __change_page_attr(cpa, checkalias);
-		if (!debug_pagealloc_enabled())
-			spin_unlock(&cpa_lock);
-		if (ret)
-			return ret;
-
-		if (checkalias) {
-			ret = cpa_process_alias(cpa);
-			if (ret)
+		vaddr = cpa_address(cpa, numpages);
+		if (!IS_ERR_VALUE(vaddr)) {
+repeat:
+			ret = __change_page_attr(cpa, &kpte, vaddr, checkalias);
+			if (ret < 0)
 				return ret;
+			if (ret) {
+				base = alloc_page(GFP_KERNEL|__GFP_NOTRACK);
+				if (!base)
+					return -ENOMEM;
+				if (__split_large_page(cpa, kpte, vaddr, base))
+					__free_page(base);
+				goto repeat;
+			}
+
+			if (checkalias) {
+				ret = cpa_process_alias(cpa, vaddr);
+				if (ret < 0)
+					return ret;
+			}
 		}
 
 		/*
@@ -1374,11 +1368,7 @@  static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias)
 		 */
 		BUG_ON(cpa->numpages > numpages || !cpa->numpages);
 		numpages -= cpa->numpages;
-		if (cpa->flags & (CPA_PAGES_ARRAY | CPA_ARRAY))
-			cpa->curpage++;
-		else
-			*cpa->vaddr += cpa->numpages * PAGE_SIZE;
-
+		cpa_advance(cpa);
 	}
 	return 0;
 }