[v2] KVM: PPC: Book3S HV: Fix handling of large pages in radix page fault handler

Message ID 20180302044903.yqacab2v2ygjxfth@oak.ozlabs.ibm.com
State Accepted
Headers show
Series
  • [v2] KVM: PPC: Book3S HV: Fix handling of large pages in radix page fault handler
Related show

Commit Message

Paul Mackerras March 2, 2018, 4:49 a.m.
This fixes several bugs in the radix page fault handler relating to
the way large pages in the memory backing the guest were handled.
First, the check for large pages only checked for explicit huge pages
and missed transparent huge pages.  Then the check that the addresses
(host virtual vs. guest physical) had appropriate alignment was
wrong, meaning that the code never put a large page in the partition
scoped radix tree; it was always demoted to a small page.

Fixing this exposed bugs in kvmppc_create_pte().  We were never
invalidating a 2MB PTE, which meant that if a page was initially
faulted in without write permission and the guest then attempted
to store to it, we would never update the PTE to have write permission.
If we find a valid 2MB PTE in the PMD, we need to clear it and
do a TLB invalidation before installing either the new 2MB PTE or
a pointer to a page table page.

This also corrects an assumption that get_user_pages_fast would set
the _PAGE_DIRTY bit if we are writing, which is not true.  Instead we
mark the page dirty explicitly with set_page_dirty_lock().  This
also means we don't need the dirty bit set on the host PTE when
providing write access on a read fault.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
v2: Incorporate suggestion from Aneesh Kumar K.V

 arch/powerpc/kvm/book3s_64_mmu_radix.c | 69 +++++++++++++++++++++-------------
 1 file changed, 43 insertions(+), 26 deletions(-)

Patch

diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c
index 0c854816e653..5cb4e4687107 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_radix.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c
@@ -195,6 +195,12 @@  static void kvmppc_pte_free(pte_t *ptep)
 	kmem_cache_free(kvm_pte_cache, ptep);
 }
 
+/* Like pmd_huge() and pmd_large(), but works regardless of config options */
+static inline int pmd_is_leaf(pmd_t pmd)
+{
+	return !!(pmd_val(pmd) & _PAGE_PTE);
+}
+
 static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa,
 			     unsigned int level, unsigned long mmu_seq)
 {
@@ -219,7 +225,7 @@  static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa,
 	else
 		new_pmd = pmd_alloc_one(kvm->mm, gpa);
 
-	if (level == 0 && !(pmd && pmd_present(*pmd)))
+	if (level == 0 && !(pmd && pmd_present(*pmd) && !pmd_is_leaf(*pmd)))
 		new_ptep = kvmppc_pte_alloc();
 
 	/* Check if we might have been invalidated; let the guest retry if so */
@@ -244,12 +250,30 @@  static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa,
 		new_pmd = NULL;
 	}
 	pmd = pmd_offset(pud, gpa);
-	if (pmd_large(*pmd)) {
-		/* Someone else has instantiated a large page here; retry */
-		ret = -EAGAIN;
-		goto out_unlock;
-	}
-	if (level == 1 && !pmd_none(*pmd)) {
+	if (pmd_is_leaf(*pmd)) {
+		unsigned long lgpa = gpa & PMD_MASK;
+
+		/*
+		 * If we raced with another CPU which has just put
+		 * a 2MB pte in after we saw a pte page, try again.
+		 */
+		if (level == 0 && !new_ptep) {
+			ret = -EAGAIN;
+			goto out_unlock;
+		}
+		/* Valid 2MB page here already, remove it */
+		old = kvmppc_radix_update_pte(kvm, pmdp_ptep(pmd),
+					      ~0UL, 0, lgpa, PMD_SHIFT);
+		kvmppc_radix_tlbie_page(kvm, lgpa, PMD_SHIFT);
+		if (old & _PAGE_DIRTY) {
+			unsigned long gfn = lgpa >> PAGE_SHIFT;
+			struct kvm_memory_slot *memslot;
+			memslot = gfn_to_memslot(kvm, gfn);
+			if (memslot && memslot->dirty_bitmap)
+				kvmppc_update_dirty_map(memslot,
+							gfn, PMD_SIZE);
+		}
+	} else if (level == 1 && !pmd_none(*pmd)) {
 		/*
 		 * There's a page table page here, but we wanted
 		 * to install a large page.  Tell the caller and let
@@ -412,28 +436,24 @@  int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
 	} else {
 		page = pages[0];
 		pfn = page_to_pfn(page);
-		if (PageHuge(page)) {
-			page = compound_head(page);
-			pte_size <<= compound_order(page);
+		if (PageCompound(page)) {
+			pte_size <<= compound_order(compound_head(page));
 			/* See if we can insert a 2MB large-page PTE here */
 			if (pte_size >= PMD_SIZE &&
-			    (gpa & PMD_MASK & PAGE_MASK) ==
-			    (hva & PMD_MASK & PAGE_MASK)) {
+			    (gpa & (PMD_SIZE - PAGE_SIZE)) ==
+			    (hva & (PMD_SIZE - PAGE_SIZE))) {
 				level = 1;
 				pfn &= ~((PMD_SIZE >> PAGE_SHIFT) - 1);
 			}
 		}
 		/* See if we can provide write access */
 		if (writing) {
-			/*
-			 * We assume gup_fast has set dirty on the host PTE.
-			 */
 			pgflags |= _PAGE_WRITE;
 		} else {
 			local_irq_save(flags);
 			ptep = find_current_mm_pte(current->mm->pgd,
 						   hva, NULL, NULL);
-			if (ptep && pte_write(*ptep) && pte_dirty(*ptep))
+			if (ptep && pte_write(*ptep))
 				pgflags |= _PAGE_WRITE;
 			local_irq_restore(flags);
 		}
@@ -459,18 +479,15 @@  int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
 		pte = pfn_pte(pfn, __pgprot(pgflags));
 		ret = kvmppc_create_pte(kvm, pte, gpa, level, mmu_seq);
 	}
-	if (ret == 0 || ret == -EAGAIN)
-		ret = RESUME_GUEST;
 
 	if (page) {
-		/*
-		 * We drop pages[0] here, not page because page might
-		 * have been set to the head page of a compound, but
-		 * we have to drop the reference on the correct tail
-		 * page to match the get inside gup()
-		 */
-		put_page(pages[0]);
+		if (!ret && (pgflags & _PAGE_WRITE))
+			set_page_dirty_lock(page);
+		put_page(page);
 	}
+
+	if (ret == 0 || ret == -EAGAIN)
+		ret = RESUME_GUEST;
 	return ret;
 }
 
@@ -644,7 +661,7 @@  void kvmppc_free_radix(struct kvm *kvm)
 				continue;
 			pmd = pmd_offset(pud, 0);
 			for (im = 0; im < PTRS_PER_PMD; ++im, ++pmd) {
-				if (pmd_huge(*pmd)) {
+				if (pmd_is_leaf(*pmd)) {
 					pmd_clear(pmd);
 					continue;
 				}