[5/5] KVM: PPC: Book3S HV: Use __gfn_to_pfn_memslot() in page fault handler

Message ID 1520236499-29499-6-git-send-email-paulus@ozlabs.org
State Superseded
Headers show
Series
  • KVM: PPC: Improve page fault handling for radix guests
Related show

Commit Message

Paul Mackerras March 5, 2018, 7:54 a.m.
This changes the hypervisor page fault handler for radix guests to use
the generic KVM __gfn_to_pfn_memslot() function instead of using
get_user_pages_fast() and then handling the case of VM_PFNMAP vmas
specially.  The old code missed the case of VM_IO vmas; with this
change, VM_IO vmas will now be handled correctly by code within
__gfn_to_pfn_memslot.

Currently, __gfn_to_pfn_memslot calls hva_to_pfn, which only uses
__get_user_pages_fast for the initial lookup in the cases where
either atomic or async is set.  Since we are not setting either
atomic or async, we do our own __get_user_pages_fast first, for now.

This also adds code to check for the KVM_MEM_READONLY flag on the
memslot.  If it is set and this is a write access, we synthesize a
data storage interrupt for the guest.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 arch/powerpc/kvm/book3s_64_mmu_radix.c | 117 +++++++++++++++++----------------
 1 file changed, 59 insertions(+), 58 deletions(-)

Comments

Alexey Kardashevskiy March 15, 2018, 1:56 a.m. | #1
On 5/3/18 6:54 pm, Paul Mackerras wrote:
> This changes the hypervisor page fault handler for radix guests to use
> the generic KVM __gfn_to_pfn_memslot() function instead of using
> get_user_pages_fast() and then handling the case of VM_PFNMAP vmas
> specially.  The old code missed the case of VM_IO vmas; with this
> change, VM_IO vmas will now be handled correctly by code within
> __gfn_to_pfn_memslot.
> 
> Currently, __gfn_to_pfn_memslot calls hva_to_pfn, which only uses
> __get_user_pages_fast for the initial lookup in the cases where
> either atomic or async is set.  Since we are not setting either
> atomic or async, we do our own __get_user_pages_fast first, for now.
> 
> This also adds code to check for the KVM_MEM_READONLY flag on the
> memslot.  If it is set and this is a write access, we synthesize a
> data storage interrupt for the guest.
> 
> Signed-off-by: Paul Mackerras <paulus@ozlabs.org>


This one produces multiple errors like this:

Severe Machine check interrupt [Not recovered]
  NIP [c008000005b34810]: 0xc008000005b34810
  Initiator: CPU
  Error type: Real address [Load (bad)]
    Effective address: c00c000080100284


0xc008000005b34810 is a guest module, 0xc00c000080100284 is an MMIO address
(bar0 of a nvlink bridge).



> ---
>  arch/powerpc/kvm/book3s_64_mmu_radix.c | 117 +++++++++++++++++----------------
>  1 file changed, 59 insertions(+), 58 deletions(-)
> 
> diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c
> index 05acc67..6c71aa2 100644
> --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c
> +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c
> @@ -392,11 +392,11 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
>  	unsigned long mmu_seq, pte_size;
>  	unsigned long gpa, gfn, hva, pfn;
>  	struct kvm_memory_slot *memslot;
> -	struct page *page = NULL, *pages[1];
> -	long ret, npages;
> -	unsigned int writing;
> -	struct vm_area_struct *vma;
> -	unsigned long flags;
> +	struct page *page = NULL;
> +	long ret;
> +	bool writing;
> +	bool upgrade_write = false;
> +	bool *upgrade_p = &upgrade_write;
>  	pte_t pte, *ptep;
>  	unsigned long pgflags;
>  	unsigned int shift, level;
> @@ -436,12 +436,17 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
>  					      dsisr & DSISR_ISSTORE);
>  	}
>  
> -	/* used to check for invalidations in progress */
> -	mmu_seq = kvm->mmu_notifier_seq;
> -	smp_rmb();
> -
>  	writing = (dsisr & DSISR_ISSTORE) != 0;
> -	hva = gfn_to_hva_memslot(memslot, gfn);
> +	if (memslot->flags & KVM_MEM_READONLY) {
> +		if (writing) {
> +			/* give the guest a DSI */
> +			dsisr = DSISR_ISSTORE | DSISR_PROTFAULT;
> +			kvmppc_core_queue_data_storage(vcpu, ea, dsisr);
> +			return RESUME_GUEST;
> +		}
> +		upgrade_p = NULL;
> +	}
> +
>  	if (dsisr & DSISR_SET_RC) {
>  		/*
>  		 * Need to set an R or C bit in the 2nd-level tables;
> @@ -470,62 +475,58 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
>  			return RESUME_GUEST;
>  	}
>  
> -	ret = -EFAULT;
> -	pfn = 0;
> -	pte_size = PAGE_SIZE;
> -	pgflags = _PAGE_READ | _PAGE_EXEC;
> -	level = 0;
> -	npages = get_user_pages_fast(hva, 1, writing, pages);
> -	if (npages < 1) {
> -		/* Check if it's an I/O mapping */
> -		down_read(&current->mm->mmap_sem);
> -		vma = find_vma(current->mm, hva);
> -		if (vma && vma->vm_start <= hva && hva < vma->vm_end &&
> -		    (vma->vm_flags & VM_PFNMAP)) {
> -			pfn = vma->vm_pgoff +
> -				((hva - vma->vm_start) >> PAGE_SHIFT);
> -			pgflags = pgprot_val(vma->vm_page_prot);
> -		}
> -		up_read(&current->mm->mmap_sem);
> -		if (!pfn)
> -			return -EFAULT;
> -	} else {
> -		page = pages[0];
> +	/* used to check for invalidations in progress */
> +	mmu_seq = kvm->mmu_notifier_seq;
> +	smp_rmb();
> +
> +	/*
> +	 * Do a fast check first, since __gfn_to_pfn_memslot doesn't
> +	 * do it with !atomic && !async, which is how we call it.
> +	 * We always ask for write permission since the common case
> +	 * is that the page is writable.
> +	 */
> +	hva = gfn_to_hva_memslot(memslot, gfn);
> +	if (upgrade_p && __get_user_pages_fast(hva, 1, 1, &page) == 1) {
>  		pfn = page_to_pfn(page);
> -		if (PageCompound(page)) {
> -			pte_size <<= compound_order(compound_head(page));
> -			/* See if we can insert a 1GB or 2MB large PTE here */
> -			if (pte_size >= PUD_SIZE &&
> -			    (gpa & (PUD_SIZE - PAGE_SIZE)) ==
> -			    (hva & (PUD_SIZE - PAGE_SIZE))) {
> -				level = 2;
> -				pfn &= ~((PUD_SIZE >> PAGE_SHIFT) - 1);
> -			} else if (pte_size >= PMD_SIZE &&
> -			    (gpa & (PMD_SIZE - PAGE_SIZE)) ==
> -			    (hva & (PMD_SIZE - PAGE_SIZE))) {
> -				level = 1;
> -				pfn &= ~((PMD_SIZE >> PAGE_SHIFT) - 1);
> -			}
> +		upgrade_write = true;
> +	} else {
> +		/* Call KVM generic code to do the slow-path check */
> +		pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL,
> +					   writing, upgrade_p);
> +		if (is_error_noslot_pfn(pfn))
> +			return -EFAULT;
> +		page = NULL;
> +		if (pfn_valid(pfn)) {
> +			page = pfn_to_page(pfn);
> +			if (PageReserved(page))
> +				page = NULL;
>  		}
> -		/* See if we can provide write access */
> -		if (writing) {
> -			pgflags |= _PAGE_WRITE;
> -		} else {
> -			local_irq_save(flags);
> -			ptep = find_current_mm_pte(current->mm->pgd,
> -						   hva, NULL, NULL);
> -			if (ptep && pte_write(*ptep))
> -				pgflags |= _PAGE_WRITE;
> -			local_irq_restore(flags);
> +	}
> +
> +	/* See if we can insert a 1GB or 2MB large PTE here */
> +	level = 0;
> +	if (page && PageCompound(page)) {
> +		pte_size = PAGE_SIZE << compound_order(compound_head(page));
> +		if (pte_size >= PUD_SIZE &&
> +		    (gpa & (PUD_SIZE - PAGE_SIZE)) ==
> +		    (hva & (PUD_SIZE - PAGE_SIZE))) {
> +			level = 2;
> +			pfn &= ~((PUD_SIZE >> PAGE_SHIFT) - 1);
> +		} else if (pte_size >= PMD_SIZE &&
> +			   (gpa & (PMD_SIZE - PAGE_SIZE)) ==
> +			   (hva & (PMD_SIZE - PAGE_SIZE))) {
> +			level = 1;
> +			pfn &= ~((PMD_SIZE >> PAGE_SHIFT) - 1);
>  		}
>  	}
>  
>  	/*
>  	 * Compute the PTE value that we need to insert.
>  	 */
> -	pgflags |= _PAGE_PRESENT | _PAGE_PTE | _PAGE_ACCESSED;
> -	if (pgflags & _PAGE_WRITE)
> -		pgflags |= _PAGE_DIRTY;
> +	pgflags = _PAGE_READ | _PAGE_EXEC | _PAGE_PRESENT | _PAGE_PTE |
> +		_PAGE_ACCESSED;
> +	if (writing || upgrade_write)
> +		pgflags |= _PAGE_WRITE | _PAGE_DIRTY;
>  	pte = pfn_pte(pfn, __pgprot(pgflags));
>  
>  	/* Allocate space in the tree and write the PTE */
>
C├ędric Le Goater March 23, 2018, 8:04 a.m. | #2
On 03/15/2018 02:56 AM, Alexey Kardashevskiy wrote:
> On 5/3/18 6:54 pm, Paul Mackerras wrote:
>> This changes the hypervisor page fault handler for radix guests to use
>> the generic KVM __gfn_to_pfn_memslot() function instead of using
>> get_user_pages_fast() and then handling the case of VM_PFNMAP vmas
>> specially.  The old code missed the case of VM_IO vmas; with this
>> change, VM_IO vmas will now be handled correctly by code within
>> __gfn_to_pfn_memslot.
>>
>> Currently, __gfn_to_pfn_memslot calls hva_to_pfn, which only uses
>> __get_user_pages_fast for the initial lookup in the cases where
>> either atomic or async is set.  Since we are not setting either
>> atomic or async, we do our own __get_user_pages_fast first, for now.
>>
>> This also adds code to check for the KVM_MEM_READONLY flag on the
>> memslot.  If it is set and this is a write access, we synthesize a
>> data storage interrupt for the guest.
>>
>> Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
> 
> 
> This one produces multiple errors like this:
> 
> Severe Machine check interrupt [Not recovered]
>   NIP [c008000005b34810]: 0xc008000005b34810
>   Initiator: CPU
>   Error type: Real address [Load (bad)]
>     Effective address: c00c000080100284
> 

This is the same symptom for the XIVE ESBs. 

I have kept the workaround with hva_to_pfn_remapped() in 
kvmppc_book3s_radix_page_fault() for 4.16-rc6
 
Thanks,

C.

> 0xc008000005b34810 is a guest module, 0xc00c000080100284 is an MMIO address
> (bar0 of a nvlink bridge).


--
To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Patch

diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c
index 05acc67..6c71aa2 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_radix.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c
@@ -392,11 +392,11 @@  int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
 	unsigned long mmu_seq, pte_size;
 	unsigned long gpa, gfn, hva, pfn;
 	struct kvm_memory_slot *memslot;
-	struct page *page = NULL, *pages[1];
-	long ret, npages;
-	unsigned int writing;
-	struct vm_area_struct *vma;
-	unsigned long flags;
+	struct page *page = NULL;
+	long ret;
+	bool writing;
+	bool upgrade_write = false;
+	bool *upgrade_p = &upgrade_write;
 	pte_t pte, *ptep;
 	unsigned long pgflags;
 	unsigned int shift, level;
@@ -436,12 +436,17 @@  int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
 					      dsisr & DSISR_ISSTORE);
 	}
 
-	/* used to check for invalidations in progress */
-	mmu_seq = kvm->mmu_notifier_seq;
-	smp_rmb();
-
 	writing = (dsisr & DSISR_ISSTORE) != 0;
-	hva = gfn_to_hva_memslot(memslot, gfn);
+	if (memslot->flags & KVM_MEM_READONLY) {
+		if (writing) {
+			/* give the guest a DSI */
+			dsisr = DSISR_ISSTORE | DSISR_PROTFAULT;
+			kvmppc_core_queue_data_storage(vcpu, ea, dsisr);
+			return RESUME_GUEST;
+		}
+		upgrade_p = NULL;
+	}
+
 	if (dsisr & DSISR_SET_RC) {
 		/*
 		 * Need to set an R or C bit in the 2nd-level tables;
@@ -470,62 +475,58 @@  int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
 			return RESUME_GUEST;
 	}
 
-	ret = -EFAULT;
-	pfn = 0;
-	pte_size = PAGE_SIZE;
-	pgflags = _PAGE_READ | _PAGE_EXEC;
-	level = 0;
-	npages = get_user_pages_fast(hva, 1, writing, pages);
-	if (npages < 1) {
-		/* Check if it's an I/O mapping */
-		down_read(&current->mm->mmap_sem);
-		vma = find_vma(current->mm, hva);
-		if (vma && vma->vm_start <= hva && hva < vma->vm_end &&
-		    (vma->vm_flags & VM_PFNMAP)) {
-			pfn = vma->vm_pgoff +
-				((hva - vma->vm_start) >> PAGE_SHIFT);
-			pgflags = pgprot_val(vma->vm_page_prot);
-		}
-		up_read(&current->mm->mmap_sem);
-		if (!pfn)
-			return -EFAULT;
-	} else {
-		page = pages[0];
+	/* used to check for invalidations in progress */
+	mmu_seq = kvm->mmu_notifier_seq;
+	smp_rmb();
+
+	/*
+	 * Do a fast check first, since __gfn_to_pfn_memslot doesn't
+	 * do it with !atomic && !async, which is how we call it.
+	 * We always ask for write permission since the common case
+	 * is that the page is writable.
+	 */
+	hva = gfn_to_hva_memslot(memslot, gfn);
+	if (upgrade_p && __get_user_pages_fast(hva, 1, 1, &page) == 1) {
 		pfn = page_to_pfn(page);
-		if (PageCompound(page)) {
-			pte_size <<= compound_order(compound_head(page));
-			/* See if we can insert a 1GB or 2MB large PTE here */
-			if (pte_size >= PUD_SIZE &&
-			    (gpa & (PUD_SIZE - PAGE_SIZE)) ==
-			    (hva & (PUD_SIZE - PAGE_SIZE))) {
-				level = 2;
-				pfn &= ~((PUD_SIZE >> PAGE_SHIFT) - 1);
-			} else if (pte_size >= PMD_SIZE &&
-			    (gpa & (PMD_SIZE - PAGE_SIZE)) ==
-			    (hva & (PMD_SIZE - PAGE_SIZE))) {
-				level = 1;
-				pfn &= ~((PMD_SIZE >> PAGE_SHIFT) - 1);
-			}
+		upgrade_write = true;
+	} else {
+		/* Call KVM generic code to do the slow-path check */
+		pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL,
+					   writing, upgrade_p);
+		if (is_error_noslot_pfn(pfn))
+			return -EFAULT;
+		page = NULL;
+		if (pfn_valid(pfn)) {
+			page = pfn_to_page(pfn);
+			if (PageReserved(page))
+				page = NULL;
 		}
-		/* See if we can provide write access */
-		if (writing) {
-			pgflags |= _PAGE_WRITE;
-		} else {
-			local_irq_save(flags);
-			ptep = find_current_mm_pte(current->mm->pgd,
-						   hva, NULL, NULL);
-			if (ptep && pte_write(*ptep))
-				pgflags |= _PAGE_WRITE;
-			local_irq_restore(flags);
+	}
+
+	/* See if we can insert a 1GB or 2MB large PTE here */
+	level = 0;
+	if (page && PageCompound(page)) {
+		pte_size = PAGE_SIZE << compound_order(compound_head(page));
+		if (pte_size >= PUD_SIZE &&
+		    (gpa & (PUD_SIZE - PAGE_SIZE)) ==
+		    (hva & (PUD_SIZE - PAGE_SIZE))) {
+			level = 2;
+			pfn &= ~((PUD_SIZE >> PAGE_SHIFT) - 1);
+		} else if (pte_size >= PMD_SIZE &&
+			   (gpa & (PMD_SIZE - PAGE_SIZE)) ==
+			   (hva & (PMD_SIZE - PAGE_SIZE))) {
+			level = 1;
+			pfn &= ~((PMD_SIZE >> PAGE_SHIFT) - 1);
 		}
 	}
 
 	/*
 	 * Compute the PTE value that we need to insert.
 	 */
-	pgflags |= _PAGE_PRESENT | _PAGE_PTE | _PAGE_ACCESSED;
-	if (pgflags & _PAGE_WRITE)
-		pgflags |= _PAGE_DIRTY;
+	pgflags = _PAGE_READ | _PAGE_EXEC | _PAGE_PRESENT | _PAGE_PTE |
+		_PAGE_ACCESSED;
+	if (writing || upgrade_write)
+		pgflags |= _PAGE_WRITE | _PAGE_DIRTY;
 	pte = pfn_pte(pfn, __pgprot(pgflags));
 
 	/* Allocate space in the tree and write the PTE */