Message ID | 1520236499-29499-6-git-send-email-paulus@ozlabs.org |
---|---|
State | Superseded |
Headers | show |
Series | KVM: PPC: Improve page fault handling for radix guests | expand |
On 5/3/18 6:54 pm, Paul Mackerras wrote: > This changes the hypervisor page fault handler for radix guests to use > the generic KVM __gfn_to_pfn_memslot() function instead of using > get_user_pages_fast() and then handling the case of VM_PFNMAP vmas > specially. The old code missed the case of VM_IO vmas; with this > change, VM_IO vmas will now be handled correctly by code within > __gfn_to_pfn_memslot. > > Currently, __gfn_to_pfn_memslot calls hva_to_pfn, which only uses > __get_user_pages_fast for the initial lookup in the cases where > either atomic or async is set. Since we are not setting either > atomic or async, we do our own __get_user_pages_fast first, for now. > > This also adds code to check for the KVM_MEM_READONLY flag on the > memslot. If it is set and this is a write access, we synthesize a > data storage interrupt for the guest. > > Signed-off-by: Paul Mackerras <paulus@ozlabs.org> This one produces multiple errors like this: Severe Machine check interrupt [Not recovered] NIP [c008000005b34810]: 0xc008000005b34810 Initiator: CPU Error type: Real address [Load (bad)] Effective address: c00c000080100284 0xc008000005b34810 is a guest module, 0xc00c000080100284 is an MMIO address (bar0 of a nvlink bridge). 
> --- > arch/powerpc/kvm/book3s_64_mmu_radix.c | 117 +++++++++++++++++---------------- > 1 file changed, 59 insertions(+), 58 deletions(-) > > diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c > index 05acc67..6c71aa2 100644 > --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c > +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c > @@ -392,11 +392,11 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, > unsigned long mmu_seq, pte_size; > unsigned long gpa, gfn, hva, pfn; > struct kvm_memory_slot *memslot; > - struct page *page = NULL, *pages[1]; > - long ret, npages; > - unsigned int writing; > - struct vm_area_struct *vma; > - unsigned long flags; > + struct page *page = NULL; > + long ret; > + bool writing; > + bool upgrade_write = false; > + bool *upgrade_p = &upgrade_write; > pte_t pte, *ptep; > unsigned long pgflags; > unsigned int shift, level; > @@ -436,12 +436,17 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, > dsisr & DSISR_ISSTORE); > } > > - /* used to check for invalidations in progress */ > - mmu_seq = kvm->mmu_notifier_seq; > - smp_rmb(); > - > writing = (dsisr & DSISR_ISSTORE) != 0; > - hva = gfn_to_hva_memslot(memslot, gfn); > + if (memslot->flags & KVM_MEM_READONLY) { > + if (writing) { > + /* give the guest a DSI */ > + dsisr = DSISR_ISSTORE | DSISR_PROTFAULT; > + kvmppc_core_queue_data_storage(vcpu, ea, dsisr); > + return RESUME_GUEST; > + } > + upgrade_p = NULL; > + } > + > if (dsisr & DSISR_SET_RC) { > /* > * Need to set an R or C bit in the 2nd-level tables; > @@ -470,62 +475,58 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, > return RESUME_GUEST; > } > > - ret = -EFAULT; > - pfn = 0; > - pte_size = PAGE_SIZE; > - pgflags = _PAGE_READ | _PAGE_EXEC; > - level = 0; > - npages = get_user_pages_fast(hva, 1, writing, pages); > - if (npages < 1) { > - /* Check if it's an I/O mapping */ > - down_read(&current->mm->mmap_sem); > - 
vma = find_vma(current->mm, hva); > - if (vma && vma->vm_start <= hva && hva < vma->vm_end && > - (vma->vm_flags & VM_PFNMAP)) { > - pfn = vma->vm_pgoff + > - ((hva - vma->vm_start) >> PAGE_SHIFT); > - pgflags = pgprot_val(vma->vm_page_prot); > - } > - up_read(&current->mm->mmap_sem); > - if (!pfn) > - return -EFAULT; > - } else { > - page = pages[0]; > + /* used to check for invalidations in progress */ > + mmu_seq = kvm->mmu_notifier_seq; > + smp_rmb(); > + > + /* > + * Do a fast check first, since __gfn_to_pfn_memslot doesn't > + * do it with !atomic && !async, which is how we call it. > + * We always ask for write permission since the common case > + * is that the page is writable. > + */ > + hva = gfn_to_hva_memslot(memslot, gfn); > + if (upgrade_p && __get_user_pages_fast(hva, 1, 1, &page) == 1) { > pfn = page_to_pfn(page); > - if (PageCompound(page)) { > - pte_size <<= compound_order(compound_head(page)); > - /* See if we can insert a 1GB or 2MB large PTE here */ > - if (pte_size >= PUD_SIZE && > - (gpa & (PUD_SIZE - PAGE_SIZE)) == > - (hva & (PUD_SIZE - PAGE_SIZE))) { > - level = 2; > - pfn &= ~((PUD_SIZE >> PAGE_SHIFT) - 1); > - } else if (pte_size >= PMD_SIZE && > - (gpa & (PMD_SIZE - PAGE_SIZE)) == > - (hva & (PMD_SIZE - PAGE_SIZE))) { > - level = 1; > - pfn &= ~((PMD_SIZE >> PAGE_SHIFT) - 1); > - } > + upgrade_write = true; > + } else { > + /* Call KVM generic code to do the slow-path check */ > + pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL, > + writing, upgrade_p); > + if (is_error_noslot_pfn(pfn)) > + return -EFAULT; > + page = NULL; > + if (pfn_valid(pfn)) { > + page = pfn_to_page(pfn); > + if (PageReserved(page)) > + page = NULL; > } > - /* See if we can provide write access */ > - if (writing) { > - pgflags |= _PAGE_WRITE; > - } else { > - local_irq_save(flags); > - ptep = find_current_mm_pte(current->mm->pgd, > - hva, NULL, NULL); > - if (ptep && pte_write(*ptep)) > - pgflags |= _PAGE_WRITE; > - local_irq_restore(flags); > + } > + > + /* See if 
we can insert a 1GB or 2MB large PTE here */ > + level = 0; > + if (page && PageCompound(page)) { > + pte_size = PAGE_SIZE << compound_order(compound_head(page)); > + if (pte_size >= PUD_SIZE && > + (gpa & (PUD_SIZE - PAGE_SIZE)) == > + (hva & (PUD_SIZE - PAGE_SIZE))) { > + level = 2; > + pfn &= ~((PUD_SIZE >> PAGE_SHIFT) - 1); > + } else if (pte_size >= PMD_SIZE && > + (gpa & (PMD_SIZE - PAGE_SIZE)) == > + (hva & (PMD_SIZE - PAGE_SIZE))) { > + level = 1; > + pfn &= ~((PMD_SIZE >> PAGE_SHIFT) - 1); > } > } > > /* > * Compute the PTE value that we need to insert. > */ > - pgflags |= _PAGE_PRESENT | _PAGE_PTE | _PAGE_ACCESSED; > - if (pgflags & _PAGE_WRITE) > - pgflags |= _PAGE_DIRTY; > + pgflags = _PAGE_READ | _PAGE_EXEC | _PAGE_PRESENT | _PAGE_PTE | > + _PAGE_ACCESSED; > + if (writing || upgrade_write) > + pgflags |= _PAGE_WRITE | _PAGE_DIRTY; > pte = pfn_pte(pfn, __pgprot(pgflags)); > > /* Allocate space in the tree and write the PTE */ >
On 03/15/2018 02:56 AM, Alexey Kardashevskiy wrote: > On 5/3/18 6:54 pm, Paul Mackerras wrote: >> This changes the hypervisor page fault handler for radix guests to use >> the generic KVM __gfn_to_pfn_memslot() function instead of using >> get_user_pages_fast() and then handling the case of VM_PFNMAP vmas >> specially. The old code missed the case of VM_IO vmas; with this >> change, VM_IO vmas will now be handled correctly by code within >> __gfn_to_pfn_memslot. >> >> Currently, __gfn_to_pfn_memslot calls hva_to_pfn, which only uses >> __get_user_pages_fast for the initial lookup in the cases where >> either atomic or async is set. Since we are not setting either >> atomic or async, we do our own __get_user_pages_fast first, for now. >> >> This also adds code to check for the KVM_MEM_READONLY flag on the >> memslot. If it is set and this is a write access, we synthesize a >> data storage interrupt for the guest. >> >> Signed-off-by: Paul Mackerras <paulus@ozlabs.org> > > > This one produces multiple errors like this: > > Severe Machine check interrupt [Not recovered] > NIP [c008000005b34810]: 0xc008000005b34810 > Initiator: CPU > Error type: Real address [Load (bad)] > Effective address: c00c000080100284 > This is the same symptom for the XIVE ESBs. I have kept the workaround with hva_to_pfn_remapped() in kvmppc_book3s_radix_page_fault() for 4.16-rc6 Thanks, C. > 0xc008000005b34810 is a guest module, 0xc00c000080100284 is an MMIO address > (bar0 of a nvlink bridge). -- To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c index 05acc67..6c71aa2 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c @@ -392,11 +392,11 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, unsigned long mmu_seq, pte_size; unsigned long gpa, gfn, hva, pfn; struct kvm_memory_slot *memslot; - struct page *page = NULL, *pages[1]; - long ret, npages; - unsigned int writing; - struct vm_area_struct *vma; - unsigned long flags; + struct page *page = NULL; + long ret; + bool writing; + bool upgrade_write = false; + bool *upgrade_p = &upgrade_write; pte_t pte, *ptep; unsigned long pgflags; unsigned int shift, level; @@ -436,12 +436,17 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, dsisr & DSISR_ISSTORE); } - /* used to check for invalidations in progress */ - mmu_seq = kvm->mmu_notifier_seq; - smp_rmb(); - writing = (dsisr & DSISR_ISSTORE) != 0; - hva = gfn_to_hva_memslot(memslot, gfn); + if (memslot->flags & KVM_MEM_READONLY) { + if (writing) { + /* give the guest a DSI */ + dsisr = DSISR_ISSTORE | DSISR_PROTFAULT; + kvmppc_core_queue_data_storage(vcpu, ea, dsisr); + return RESUME_GUEST; + } + upgrade_p = NULL; + } + if (dsisr & DSISR_SET_RC) { /* * Need to set an R or C bit in the 2nd-level tables; @@ -470,62 +475,58 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, return RESUME_GUEST; } - ret = -EFAULT; - pfn = 0; - pte_size = PAGE_SIZE; - pgflags = _PAGE_READ | _PAGE_EXEC; - level = 0; - npages = get_user_pages_fast(hva, 1, writing, pages); - if (npages < 1) { - /* Check if it's an I/O mapping */ - down_read(&current->mm->mmap_sem); - vma = find_vma(current->mm, hva); - if (vma && vma->vm_start <= hva && hva < vma->vm_end && - (vma->vm_flags & VM_PFNMAP)) { - pfn = vma->vm_pgoff + - ((hva - vma->vm_start) >> PAGE_SHIFT); - pgflags = pgprot_val(vma->vm_page_prot); - } - 
up_read(&current->mm->mmap_sem); - if (!pfn) - return -EFAULT; - } else { - page = pages[0]; + /* used to check for invalidations in progress */ + mmu_seq = kvm->mmu_notifier_seq; + smp_rmb(); + + /* + * Do a fast check first, since __gfn_to_pfn_memslot doesn't + * do it with !atomic && !async, which is how we call it. + * We always ask for write permission since the common case + * is that the page is writable. + */ + hva = gfn_to_hva_memslot(memslot, gfn); + if (upgrade_p && __get_user_pages_fast(hva, 1, 1, &page) == 1) { pfn = page_to_pfn(page); - if (PageCompound(page)) { - pte_size <<= compound_order(compound_head(page)); - /* See if we can insert a 1GB or 2MB large PTE here */ - if (pte_size >= PUD_SIZE && - (gpa & (PUD_SIZE - PAGE_SIZE)) == - (hva & (PUD_SIZE - PAGE_SIZE))) { - level = 2; - pfn &= ~((PUD_SIZE >> PAGE_SHIFT) - 1); - } else if (pte_size >= PMD_SIZE && - (gpa & (PMD_SIZE - PAGE_SIZE)) == - (hva & (PMD_SIZE - PAGE_SIZE))) { - level = 1; - pfn &= ~((PMD_SIZE >> PAGE_SHIFT) - 1); - } + upgrade_write = true; + } else { + /* Call KVM generic code to do the slow-path check */ + pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL, + writing, upgrade_p); + if (is_error_noslot_pfn(pfn)) + return -EFAULT; + page = NULL; + if (pfn_valid(pfn)) { + page = pfn_to_page(pfn); + if (PageReserved(page)) + page = NULL; } - /* See if we can provide write access */ - if (writing) { - pgflags |= _PAGE_WRITE; - } else { - local_irq_save(flags); - ptep = find_current_mm_pte(current->mm->pgd, - hva, NULL, NULL); - if (ptep && pte_write(*ptep)) - pgflags |= _PAGE_WRITE; - local_irq_restore(flags); + } + + /* See if we can insert a 1GB or 2MB large PTE here */ + level = 0; + if (page && PageCompound(page)) { + pte_size = PAGE_SIZE << compound_order(compound_head(page)); + if (pte_size >= PUD_SIZE && + (gpa & (PUD_SIZE - PAGE_SIZE)) == + (hva & (PUD_SIZE - PAGE_SIZE))) { + level = 2; + pfn &= ~((PUD_SIZE >> PAGE_SHIFT) - 1); + } else if (pte_size >= PMD_SIZE && + (gpa & 
(PMD_SIZE - PAGE_SIZE)) == + (hva & (PMD_SIZE - PAGE_SIZE))) { + level = 1; + pfn &= ~((PMD_SIZE >> PAGE_SHIFT) - 1); } } /* * Compute the PTE value that we need to insert. */ - pgflags |= _PAGE_PRESENT | _PAGE_PTE | _PAGE_ACCESSED; - if (pgflags & _PAGE_WRITE) - pgflags |= _PAGE_DIRTY; + pgflags = _PAGE_READ | _PAGE_EXEC | _PAGE_PRESENT | _PAGE_PTE | + _PAGE_ACCESSED; + if (writing || upgrade_write) + pgflags |= _PAGE_WRITE | _PAGE_DIRTY; pte = pfn_pte(pfn, __pgprot(pgflags)); /* Allocate space in the tree and write the PTE */
This changes the hypervisor page fault handler for radix guests to use the generic KVM __gfn_to_pfn_memslot() function instead of using get_user_pages_fast() and then handling the case of VM_PFNMAP vmas specially. The old code missed the case of VM_IO vmas; with this change, VM_IO vmas will now be handled correctly by code within __gfn_to_pfn_memslot. Currently, __gfn_to_pfn_memslot calls hva_to_pfn, which only uses __get_user_pages_fast for the initial lookup in the cases where either atomic or async is set. Since we are not setting either atomic or async, we do our own __get_user_pages_fast first, for now. This also adds code to check for the KVM_MEM_READONLY flag on the memslot. If it is set and this is a write access, we synthesize a data storage interrupt for the guest. Signed-off-by: Paul Mackerras <paulus@ozlabs.org> --- arch/powerpc/kvm/book3s_64_mmu_radix.c | 117 +++++++++++++++++---------------- 1 file changed, 59 insertions(+), 58 deletions(-)