[RFC] KVM: PPC: Book3S HV: add support for page faults in VM_IO|VM_PFNMAP vmas

Message ID 20180208160303.19016-1-clg@kaod.org
State New
Headers show
Series
  • [RFC] KVM: PPC: Book3S HV: add support for page faults in VM_IO|VM_PFNMAP vmas
Related show

Commit Message

Cédric Le Goater Feb. 8, 2018, 4:03 p.m.
On the POWER9 processor, the XIVE interrupt controller can control
interrupt sources using MMIO to trigger events, to EOI or to turn off
the sources. Priority management and interrupt acknowledgment is also
controlled by MMIO in the presenter subengine.

These MMIO regions are exposed to guests in QEMU with a set of memory
mappings, similarly to VFIO, and it would be good to populate
dynamically the VMAs with the appropriate pages using a fault handler.

Largely inspired by Paolo's commit add6a0cd1c5b ("KVM: MMU: try to fix
up page faults before giving up"), this adds support for page faults
under KVM/PPC for memory mappings of host MMIO regions exposed in
guests.

If this is the right approach, can we externalize the
hva_to_pfn_remapped() routine to use it under kvm/ppc in the Radix
tree and HPT MMU modes ?

Signed-off-by: Cédric Le Goater <clg@kaod.org>
Cc: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/powerpc/kvm/book3s_64_mmu_radix.c | 59 ++++++++++++++++++++++++++++++++--
 1 file changed, 57 insertions(+), 2 deletions(-)

Comments

Benjamin Herrenschmidt Feb. 8, 2018, 10:19 p.m. | #1
On Thu, 2018-02-08 at 17:03 +0100, Cédric Le Goater wrote:
>  
> +/*
> + * Stolen from virt/kvm/kvm_main.c
> + */

Just export it. It's annoying that we can't just use hva_to_pfn() ...

> +static int hva_to_pfn_remapped(struct vm_area_struct *vma,
> +                              unsigned long addr, bool write_fault,
> +                              unsigned long *p_pfn)
> +{
> +       unsigned long pfn;
> +       int r;
> +
> +       r = follow_pfn(vma, addr, &pfn);
> +       if (r) {
> +               /*
> +                * get_user_pages fails for VM_IO and VM_PFNMAP vmas and does
> +                * not call the fault handler, so do it here.
> +                */
> +               bool unlocked = false;
> +
> +               r = fixup_user_fault(current, current->mm, addr,
> +                                    (write_fault ? FAULT_FLAG_WRITE : 0),
> +                                    &unlocked);
> +               if (unlocked)
> +                       return -EAGAIN;
> +               if (r)
> +                       return r;
> +
> +               r = follow_pfn(vma, addr, &pfn);
> +               if (r)
> +                       return r;
> +       }
> +
> +       /*
> +        * Get a reference here because callers of *hva_to_pfn* and
> +        * *gfn_to_pfn* ultimately call kvm_release_pfn_clean on the
> +        * returned pfn.  This is only needed if the VMA has VM_MIXEDMAP
> +        * set, but the kvm_get_pfn/kvm_release_pfn_clean pair will
> +        * simply do nothing for reserved pfns.
> +        *
> +        * Whoever called remap_pfn_range is also going to call e.g.
> +        * unmap_mapping_range before the underlying pages are freed,
> +        * causing a call to our MMU notifier.
> +        */
> +       kvm_get_pfn(pfn);
> +
> +       *p_pfn = pfn;
> +       return 0;
> +}
> +
>  int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
>                                    unsigned long ea, unsigned long dsisr)
>  {
> @@ -402,8 +450,15 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
>                 vma = find_vma(current->mm, hva);
>                 if (vma && vma->vm_start <= hva && hva < vma->vm_end &&
>                     (vma->vm_flags & VM_PFNMAP)) {
> -                       pfn = vma->vm_pgoff +
> -                               ((hva - vma->vm_start) >> PAGE_SHIFT);
> +                       if (vma->vm_flags & (VM_IO | VM_PFNMAP)) {
> +                               ret = hva_to_pfn_remapped(vma, hva, writing,
> +                                                         &pfn);
> +                               if (ret == -EAGAIN)
> +                                       return RESUME_GUEST;
> +                       } else {
> +                               pfn = vma->vm_pgoff +
> +                                       ((hva - vma->vm_start) >> PAGE_SHIFT);
> +                       }

I don't think the else case is useful. If fact you are checking
VM_PFNMAP twice...

>                         pgflags = pgprot_val(vma->vm_page_prot);
>                 }
>                 up_read(&current->mm->mmap_sem);
--
To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Cédric Le Goater Feb. 9, 2018, 7:14 a.m. | #2
On 02/08/2018 11:19 PM, Benjamin Herrenschmidt wrote:
> On Thu, 2018-02-08 at 17:03 +0100, Cédric Le Goater wrote:
>>  
>> +/*
>> + * Stolen from virt/kvm/kvm_main.c
>> + */
> 
> Just export it. It's annoying that we can't just use hva_to_pfn() ...

yes. I might add a kvm_ prefix. I will check to see what is the
best practice.

[ ... ]

>> @@ -402,8 +450,15 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
>>                 vma = find_vma(current->mm, hva);
>>                 if (vma && vma->vm_start <= hva && hva < vma->vm_end &&
>>                     (vma->vm_flags & VM_PFNMAP)) {
>> -                       pfn = vma->vm_pgoff +
>> -                               ((hva - vma->vm_start) >> PAGE_SHIFT);
>> +                       if (vma->vm_flags & (VM_IO | VM_PFNMAP)) {
>> +                               ret = hva_to_pfn_remapped(vma, hva, writing,
>> +                                                         &pfn);
>> +                               if (ret == -EAGAIN)
>> +                                       return RESUME_GUEST;
>> +                       } else {
>> +                               pfn = vma->vm_pgoff +
>> +                                       ((hva - vma->vm_start) >> PAGE_SHIFT);
>> +                       }
> 
> I don't think the else case is useful. If fact you are checking
> VM_PFNMAP twice...

yes. I am not sure what the VM_PFNMAP case does. The code comes 
from kvmppc_book3s_hv_page_fault() I suppose.

Thanks,

C. 
--
To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Paolo Bonzini Feb. 9, 2018, 3:01 p.m. | #3
On 09/02/2018 08:14, Cédric Le Goater wrote:
>>> @@ -402,8 +450,15 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
>>>                 vma = find_vma(current->mm, hva);
>>>                 if (vma && vma->vm_start <= hva && hva < vma->vm_end &&
>>>                     (vma->vm_flags & VM_PFNMAP)) {
>>> -                       pfn = vma->vm_pgoff +
>>> -                               ((hva - vma->vm_start) >> PAGE_SHIFT);
>>> +                       if (vma->vm_flags & (VM_IO | VM_PFNMAP)) {
>>> +                               ret = hva_to_pfn_remapped(vma, hva, writing,
>>> +                                                         &pfn);
>>> +                               if (ret == -EAGAIN)
>>> +                                       return RESUME_GUEST;
>>> +                       } else {
>>> +                               pfn = vma->vm_pgoff +
>>> +                                       ((hva - vma->vm_start) >> PAGE_SHIFT);
>>> +                       }
>> I don't think the else case is useful. If fact you are checking
>> VM_PFNMAP twice...
> yes. I am not sure what the VM_PFNMAP case does. The code comes 
> from kvmppc_book3s_hv_page_fault() I suppose.

Maybe the outer check should be for VM_IO|VM_PFNMAP?

(Also, why is this code reinventing most of hva_to_pfn?  Not asking you
to fix that of course, but perhaps there's something to improve in the
generic code too).

Thanks,

Paolo
--
To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Benjamin Herrenschmidt Feb. 9, 2018, 9:44 p.m. | #4
On Fri, 2018-02-09 at 16:01 +0100, Paolo Bonzini wrote:
> > > I don't think the else case is useful. If fact you are checking
> > > VM_PFNMAP twice...
> > 
> > yes. I am not sure what the VM_PFNMAP case does. The code comes 
> > from kvmppc_book3s_hv_page_fault() I suppose.
> 
> Maybe the outer check should be for VM_IO|VM_PFNMAP?
> 
> (Also, why is this code reinventing most of hva_to_pfn?  Not asking you
> to fix that of course, but perhaps there's something to improve in the
> generic code too).

I've been wondering the same thing, which led me to try to understand
hva_to_pfn, which resulted in skull marks in the wall ;-)

So hva_to_pfn does a complicated dance around 3 or 4 different variants
of gup and it's really not obvious why, the "intent" doesn't appear to
be documented clearly. I think I somewhat figued out it relates to the
async page fault stuff but even then I don't actually know what those
are and what is their purpose :-)

But yeah, Paul's code more/less does its own thing with a simple
gup_fast, and is missing the hva_to_pfn_remapped case for VM_IO |
VM_PFNMAP. The short fix is to make hva_to_pfn_remapped exported and
use it via a fixed variant of Cedric patch.

The longer term fix might be to understand if/how we can use hva_to_pfn

Cheers,
Ben.
--
To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Benjamin Herrenschmidt Feb. 9, 2018, 9:54 p.m. | #5
On Fri, 2018-02-09 at 08:14 +0100, Cédric Le Goater wrote:
> > > @@ -402,8 +450,15 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
> > >                  vma = find_vma(current->mm, hva);
> > >                  if (vma && vma->vm_start <= hva && hva < vma->vm_end &&
> > >                      (vma->vm_flags & VM_PFNMAP)) {
> > > -                       pfn = vma->vm_pgoff +
> > > -                               ((hva - vma->vm_start) >> PAGE_SHIFT);
> > > +                       if (vma->vm_flags & (VM_IO | VM_PFNMAP)) {
> > > +                               ret = hva_to_pfn_remapped(vma, hva, writing,
> > > +                                                         &pfn);
> > > +                               if (ret == -EAGAIN)
> > > +                                       return RESUME_GUEST;
> > > +                       } else {
> > > +                               pfn = vma->vm_pgoff +
> > > +                                       ((hva - vma->vm_start) >> PAGE_SHIFT);
> > > +                       }
> > 
> > I don't think the else case is useful. If fact you are checking
> > VM_PFNMAP twice...
> 
> yes. I am not sure what the VM_PFNMAP case does. The code comes 
> from kvmppc_book3s_hv_page_fault() I suppose.

You just need to reproduce the bottom of hva_to_pfn. I don't think
that PFNMAP case in Paul's code is quite right anyway, the generic
code will just go read the PTE which is the right thing to do.

Cheers,
Ben.
--
To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Paolo Bonzini Feb. 12, 2018, 9:57 a.m. | #6
On 09/02/2018 22:44, Benjamin Herrenschmidt wrote:
>>
>> (Also, why is this code reinventing most of hva_to_pfn?  Not asking you
>> to fix that of course, but perhaps there's something to improve in the
>> generic code too).
> I've been wondering the same thing, which led me to try to understand
> hva_to_pfn, which resulted in skull marks in the wall ;-)
> 
> So hva_to_pfn does a complicated dance around 3 or 4 different variants
> of gup and it's really not obvious why, the "intent" doesn't appear to
> be documented clearly. I think I somewhat figured out it relates to the
> async page fault stuff but even then I don't actually know what those
> are and what is their purpose :-)

Yeah, hva_to_pfn has all those arguments to tweak its behavior, however 
you can treat it as essentially get_user_pages_unlocked:

- if async==false && atomic==false you can ignore hva_to_pfn_fast.  
(hva_to_pfn_fast uses __get_user_pages_fast)

- hva_to_pfn_slow then is essentially get_user_pages_unlocked.  
Optionally it's followed by __get_user_pages_fast to get a writable 
mapping if the caller would like it but does not require it, but you can 
ignore this if you pass writable==NULL.

PPC uses get_user_pages_fast; x86 cannot use it directly because we need 
FOLL_HWPOISON and get_user_pages_fast does not support it.

However, get_user_pages_fast is itself a combination of 
__get_user_pages_fast and get_user_pages_unlocked.  So if it is useful 
for PPC to use get_user_pages_fast, perhaps the generic code should go 
down hva_to_pfn_fast unconditionally, even if async and atomic are both 
false.

The other big difference is that you follow up with the PageHuge check 
to get compound_head/compound_order.  This is a bit like 
kvm_host_page_size, but different.  It would be fine for me to add a 
struct page ** argument to hva_to_pfn if that's enough to remove the 
duplicate code in arch/powerpc/kvm/.

See the untested/uncompiled patch below the signature for some ideas.

Thanks,

Paolo

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index b4414842b023..a2c8824ae608 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1339,15 +1339,12 @@ static inline int check_user_page_hwpoison(unsigned long addr)
  * The atomic path to get the writable pfn which will be stored in @pfn,
  * true indicates success, otherwise false is returned.
  */
-static bool hva_to_pfn_fast(unsigned long addr, bool atomic, bool *async,
-			    bool write_fault, bool *writable, kvm_pfn_t *pfn)
+static struct page *__hva_to_page_fast(unsigned long addr, bool write_fault,
+				       bool *writable)
 {
 	struct page *page[1];
 	int npages;
 
-	if (!(async || atomic))
-		return false;
-
 	/*
 	 * Fast pin a writable pfn only if it is a write fault request
 	 * or the caller allows to map a writable pfn for a read fault
@@ -1357,23 +1354,21 @@ static bool hva_to_pfn_fast(unsigned long addr, bool atomic, bool *async,
 		return false;
 
 	npages = __get_user_pages_fast(addr, 1, 1, page);
-	if (npages == 1) {
-		*pfn = page_to_pfn(page[0]);
+	if (npages < 0)
+		return ERR_PTR(npages);
 
-		if (writable)
-			*writable = true;
-		return true;
-	}
+	if (writable)
+		*writable = true;
 
-	return false;
+	return page[0];
 }
 
 /*
  * The slow path to get the pfn of the specified host virtual address,
  * 1 indicates success, -errno is returned if error is detected.
  */
-static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
-			   bool *writable, kvm_pfn_t *pfn)
+static struct page *hva_to_page_unlocked(unsigned long addr, bool *async,
+					 bool write_fault, bool *writable)
 {
 	struct page *page[1];
 	int npages = 0;
@@ -1395,24 +1390,20 @@ static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
 
 		npages = get_user_pages_unlocked(addr, 1, page, flags);
 	}
-	if (npages != 1)
-		return npages;
+	if (npages < 0)
+		return ERR_PTR(npages);
 
 	/* map read fault as writable if possible */
 	if (unlikely(!write_fault) && writable) {
-		struct page *wpage[1];
-
-		npages = __get_user_pages_fast(addr, 1, 1, wpage);
-		if (npages == 1) {
-			*writable = true;
+		struct page *wpage;
+		wpage = __hva_to_page_fast(addr, write_fault, writable);
+		if (!IS_ERR(wpage)) {
 			put_page(page[0]);
-			page[0] = wpage[0];
+			return wpage;
 		}
-
-		npages = 1;
 	}
-	*pfn = page_to_pfn(page[0]);
-	return npages;
+
+	return page[0];
 }
 
 static bool vma_is_valid(struct vm_area_struct *vma, bool write_fault)
@@ -1487,33 +1478,37 @@ static int hva_to_pfn_remapped(struct vm_area_struct *vma,
  *     whether the mapping is writable.
  */
 static kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async,
-			bool write_fault, bool *writable)
+			    bool write_fault, bool *writable, struct page **p_page)
 {
 	struct vm_area_struct *vma;
 	kvm_pfn_t pfn = 0;
-	int npages, r;
+	struct page *page;
+	int r;
 
 	/* we can do it either atomically or asynchronously, not both */
 	BUG_ON(atomic && async);
 
-	if (hva_to_pfn_fast(addr, atomic, async, write_fault, writable, &pfn))
-		return pfn;
+	page = __hva_to_page_fast(addr, write_fault, writable);
+	if (!IS_ERR(page))
+		goto got_page;
 
 	if (atomic)
 		return KVM_PFN_ERR_FAULT;
 
-	npages = hva_to_pfn_slow(addr, async, write_fault, writable, &pfn);
-	if (npages == 1)
-		return pfn;
+	page = hva_to_page_unlocked(addr, async, write_fault, writable);
+	if (!IS_ERR(page))
+		goto got_page;
 
 	down_read(&current->mm->mmap_sem);
-	if (npages == -EHWPOISON ||
+	/* FIXME, is check_user_page_hwpoison still needed? */
+	if (PTR_ERR(page) == -EHWPOISON ||
 	      (!async && check_user_page_hwpoison(addr))) {
-		pfn = KVM_PFN_ERR_HWPOISON;
-		goto exit;
+		up_read(&current->mm->mmap_sem);
+		return KVM_PFN_ERR_HWPOISON;
 	}
 
 retry:
+	*p_page = NULL;
 	vma = find_vma_intersection(current->mm, addr, addr + 1);
 
 	if (vma == NULL)
@@ -1529,9 +1523,13 @@ static kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async,
 			*async = true;
 		pfn = KVM_PFN_ERR_FAULT;
 	}
-exit:
 	up_read(&current->mm->mmap_sem);
 	return pfn;
+
+got_page:
+	if (p_page)
+		*p_page = page;
+	return page_to_pfn(page);
 }
 
 kvm_pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn,
@@ -1559,7 +1557,7 @@ kvm_pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn,
 	}
 
 	return hva_to_pfn(addr, atomic, async, write_fault,
-			  writable);
+			  writable, NULL);
 }
 EXPORT_SYMBOL_GPL(__gfn_to_pfn_memslot);
 
--
To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Benjamin Herrenschmidt Feb. 12, 2018, 10:33 a.m. | #7
On Mon, 2018-02-12 at 10:57 +0100, Paolo Bonzini wrote:
> On 09/02/2018 22:44, Benjamin Herrenschmidt wrote:
> > > 
> > > (Also, why is this code reinventing most of hva_to_pfn?  Not asking you
> > > to fix that of course, but perhaps there's something to improve in the
> > > generic code too).
> > 
> > I've been wondering the same thing, which led me to try to understand
> > hva_to_pfn, which resulted in skull marks in the wall ;-)
> > 
> > So hva_to_pfn does a complicated dance around 3 or 4 different variants
> > of gup and it's really not obvious why, the "intent" doesn't appear to
> > be documented clearly. I think I somewhat figured out it relates to the
> > async page fault stuff but even then I don't actually know what those
> > are and what is their purpose :-)
> 
> Yeah, hva_to_pfn has all those arguments to tweak its behavior, however 
> you can treat it as essentially get_user_pages_unlocked:
> 
> - if async==false && atomic==false you can ignore hva_to_pfn_fast.  
> (hva_to_pfn_fast uses __get_user_pages_fast)

Do you mean async == NULL && atomic == false ? IE. async is a
pointer...

> - hva_to_pfn_slow then is essentially get_user_pages_unlocked.  
> Optionally it's followed by __get_user_pages_fast to get a writable 
> mapping if the caller would like it but does not require it, but you can 
> ignore this if you pass writable==NULL.
>
> PPC uses get_user_pages_fast; x86 cannot use it directly because we need 
> FOLL_HWPOISON and get_user_pages_fast does not support it.

Remind me what that does ? I've lost track ... we do support some
amount of HWPOISON stuff on powerpc so we should probably do that
too...

I also at some point would like to understand how we sync the dirty
bit in a race free way ;-) For the accessed bit it looks like the
generic mm always calls the notifiers that gets into our "age" callback
, but dirty seems to be treated differently. My limited understanding
right now is that we set it via gup on a write fault in the struct
page, and it can only be cleared via page_mkclean which takes the
mapping out. But I haven't checked that this is 100% tight (meaning we 
don't actually rely on the 2nd level page table dirty bit except for
dirty_map tracking).
 
> However, get_user_pages_fast is itself a combination of 
> __get_user_pages_fast and get_user_pages_unlocked.  So if it is useful 
> for PPC to use get_user_pages_fast, perhaps the generic code should go 
> down hva_to_pfn_fast unconditionally, even if async and atomic are both 
> false.
> 
> The other big difference is that you follow up with the PageHuge check 
> to get compound_head/compound_order.  This is a bit like 
> kvm_host_page_size, but different.  It would be fine for me to add a 
> struct page ** argument to hva_to_pfn if that's enough to remove the 
> duplicate code in arch/powerpc/kvm/.

Yes, that would be nice.

> See the untested/uncompiled patch below the signature for some ideas.

I can give that a spin tomorrow with a bit of luck (30 other things on
my plate), or maybe Paul can.

BTW. Is there some doc/explanation about the whole Async page fault
business ? What is it about ? I wonder if it's something we
could/should use as well.

Cheers,
Ben.

> Thanks,
> 
> Paolo
> 
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index b4414842b023..a2c8824ae608 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -1339,15 +1339,12 @@ static inline int check_user_page_hwpoison(unsigned long addr)
>   * The atomic path to get the writable pfn which will be stored in @pfn,
>   * true indicates success, otherwise false is returned.
>   */
> -static bool hva_to_pfn_fast(unsigned long addr, bool atomic, bool *async,
> -			    bool write_fault, bool *writable, kvm_pfn_t *pfn)
> +static struct page *__hva_to_page_fast(unsigned long addr, bool write_fault,
> +				       bool *writable)
>  {
>  	struct page *page[1];
>  	int npages;
>  
> -	if (!(async || atomic))
> -		return false;
> -
>  	/*
>  	 * Fast pin a writable pfn only if it is a write fault request
>  	 * or the caller allows to map a writable pfn for a read fault
> @@ -1357,23 +1354,21 @@ static bool hva_to_pfn_fast(unsigned long addr, bool atomic, bool *async,
>  		return false;
>  
>  	npages = __get_user_pages_fast(addr, 1, 1, page);
> -	if (npages == 1) {
> -		*pfn = page_to_pfn(page[0]);
> +	if (npages < 0)
> +		return ERR_PTR(npages);
>  
> -		if (writable)
> -			*writable = true;
> -		return true;
> -	}
> +	if (writable)
> +		*writable = true;
>  
> -	return false;
> +	return page[0];
>  }
>  
>  /*
>   * The slow path to get the pfn of the specified host virtual address,
>   * 1 indicates success, -errno is returned if error is detected.
>   */
> -static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
> -			   bool *writable, kvm_pfn_t *pfn)
> +static struct page *hva_to_page_unlocked(unsigned long addr, bool *async,
> +					 bool write_fault, bool *writable)
>  {
>  	struct page *page[1];
>  	int npages = 0;
> @@ -1395,24 +1390,20 @@ static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
>  
>  		npages = get_user_pages_unlocked(addr, 1, page, flags);
>  	}
> -	if (npages != 1)
> -		return npages;
> +	if (npages < 0)
> +		return ERR_PTR(npages);
>  
>  	/* map read fault as writable if possible */
>  	if (unlikely(!write_fault) && writable) {
> -		struct page *wpage[1];
> -
> -		npages = __get_user_pages_fast(addr, 1, 1, wpage);
> -		if (npages == 1) {
> -			*writable = true;
> +		struct page *wpage;
> +		wpage = __hva_to_page_fast(addr, write_fault, writable);
> +		if (!IS_ERR(wpage)) {
>  			put_page(page[0]);
> -			page[0] = wpage[0];
> +			return wpage;
>  		}
> -
> -		npages = 1;
>  	}
> -	*pfn = page_to_pfn(page[0]);
> -	return npages;
> +
> +	return page[0];
>  }
>  
>  static bool vma_is_valid(struct vm_area_struct *vma, bool write_fault)
> @@ -1487,33 +1478,37 @@ static int hva_to_pfn_remapped(struct vm_area_struct *vma,
>   *     whether the mapping is writable.
>   */
>  static kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async,
> -			bool write_fault, bool *writable)
> +			    bool write_fault, bool *writable, struct page **p_page)
>  {
>  	struct vm_area_struct *vma;
>  	kvm_pfn_t pfn = 0;
> -	int npages, r;
> +	struct page *page;
> +	int r;
>  
>  	/* we can do it either atomically or asynchronously, not both */
>  	BUG_ON(atomic && async);
>  
> -	if (hva_to_pfn_fast(addr, atomic, async, write_fault, writable, &pfn))
> -		return pfn;
> +	page = __hva_to_page_fast(addr, write_fault, writable);
> +	if (!IS_ERR(page))
> +		goto got_page;
>  
>  	if (atomic)
>  		return KVM_PFN_ERR_FAULT;
>  
> -	npages = hva_to_pfn_slow(addr, async, write_fault, writable, &pfn);
> -	if (npages == 1)
> -		return pfn;
> +	page = hva_to_page_unlocked(addr, async, write_fault, writable);
> +	if (!IS_ERR(page))
> +		goto got_page;
>  
>  	down_read(&current->mm->mmap_sem);
> -	if (npages == -EHWPOISON ||
> +	/* FIXME, is check_user_page_hwpoison still needed? */
> +	if (PTR_ERR(page) == -EHWPOISON ||
>  	      (!async && check_user_page_hwpoison(addr))) {
> -		pfn = KVM_PFN_ERR_HWPOISON;
> -		goto exit;
> +		up_read(&current->mm->mmap_sem);
> +		return KVM_PFN_ERR_HWPOISON;
>  	}
>  
>  retry:
> +	*p_page = NULL;
>  	vma = find_vma_intersection(current->mm, addr, addr + 1);
>  
>  	if (vma == NULL)
> @@ -1529,9 +1523,13 @@ static kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async,
>  			*async = true;
>  		pfn = KVM_PFN_ERR_FAULT;
>  	}
> -exit:
>  	up_read(&current->mm->mmap_sem);
>  	return pfn;
> +
> +got_page:
> +	if (p_page)
> +		*p_page = page;
> +	return page_to_pfn(page);
>  }
>  
>  kvm_pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn,
> @@ -1559,7 +1557,7 @@ kvm_pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn,
>  	}
>  
>  	return hva_to_pfn(addr, atomic, async, write_fault,
> -			  writable);
> +			  writable, NULL);
>  }
>  EXPORT_SYMBOL_GPL(__gfn_to_pfn_memslot);
>  
--
To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Paolo Bonzini Feb. 12, 2018, 10:41 a.m. | #8
On 12/02/2018 11:33, Benjamin Herrenschmidt wrote:
> My limited understanding
> right now is that we set it via gup on a write fault in the struct
> page, and it can only be cleared via page_mkclean which takes the
> mapping out. But I haven't checked that this is 100% tight (meaning we 
> don't actually rely on the 2nd level page table dirty bit except for
> dirty_map tracking).

Yeah, 2nd-level page tables only use dirty bits internally for KVM's
dirty page logging (if it's active) or don't use them at all.

In fact I don't know about PPC but on x86 we actually do
write-protection except on the newest processors that have a dirty page
_log_ in addition to the bits.  With the bits only, scanning the page
tables and clearing the bits atomically one page at a time is incredibly
slow and hits the cache really badly.  With write protection, each fault
is slower but KVM_GET_DIRTY_LOG is hundreds of times faster.  And
slowing down accesses to dirty pages might actually _help_ migration
converge---it's not a bug, it's a feature! :)

>> See the untested/uncompiled patch below the signature for some ideas.
> I can give that a spin tomorrow with a bit of luck (30 other things on
> my plate), or maybe Paul can.
> 
> BTW. Is there some doc/explanation about the whole Async page fault
> business ? What is it about ? I wonder if it's something we
> could/should use as well.

Quite likely since it was stolen from pHyp.

The idea is that you tell the guest about _host_ page faults and the
guest starts running some other process.  Then when the host page fault
is resolved, you tell the guest about that as well.  This way host page
faults are resolved without blocking the whole guest.

Paolo
--
To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Benjamin Herrenschmidt Feb. 12, 2018, 10:54 a.m. | #9
On Mon, 2018-02-12 at 11:41 +0100, Paolo Bonzini wrote:
> On 12/02/2018 11:33, Benjamin Herrenschmidt wrote:
> > My limited understanding
> > right now is that we set it via gup on a write fault in the struct
> > page, and it can only be cleared via page_mkclean which takes the
> > mapping out. But I haven't checked that this is 100% tight (meaning we 
> > don't actually rely on the 2nd level page table dirty bit except for
> > dirty_map tracking).
> 
> Yeah, 2nd-level page tables only use dirty bits internally for KVM's
> dirty page logging (if it's active) or don't use them at all.

Ok.

> In fact I don't know about PPC but on x86 we actually do
> write-protection except on the newest processors that have a dirty page
> _log_ in addition to the bits.

Can you explain ? I'm not sure I got what you mean here.

>   With the bits only, scanning the page
> tables and clearing the bits atomically one page at a time is incredibly
> slow and hits the cache really badly.  With write protection, each fault
> is slower but KVM_GET_DIRTY_LOG is hundreds of times faster.  And
> slowing down accesses to dirty pages might actually _help_ migration
> converge---it's not a bug, it's a feature! :)
> 
> > > See the untested/uncompiled patch below the signature for some ideas.
> > 
> > I can give that a spin tomorrow with a bit of luck (30 other things on
> > my plate), or maybe Paul can.
> > 
> > BTW. Is there some doc/explanation about the whole Async page fault
> > business ? What is it about ? I wonder if it's something we
> > could/should use as well.
> 
> Quite likely since it was stolen from pHyp.
> 
> The idea is that you tell the guest about _host_ page faults and the
> guest starts running some other process.  Then when the host page fault
> is resolved, you tell the guest about that as well.  This way host page
> faults are resolved without blocking the whole guest.

Not pHyp then, I don't think we have any such thing on PAPR, you are
probably thinking about s390 ?

How do you tell the guest on x86 ? You synthesize some kind of
artificial interrupt ? There's a paravirt stuff for the guest to tell
you it knows about it ? Or some other mechanism ?

For powerpc, we could do something similar fairly easily since we are
paravirtualized but I'd like to see how that's plumbed into the existing
archs who use it so we don't diverge too much.

Cheers,
Ben.

--
To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Paolo Bonzini Feb. 12, 2018, 11:01 a.m. | #10
On 12/02/2018 11:54, Benjamin Herrenschmidt wrote:
> On Mon, 2018-02-12 at 11:41 +0100, Paolo Bonzini wrote:
>> In fact I don't know about PPC but on x86 we actually do
>> write-protection except on the newest processors that have a dirty page
>> _log_ in addition to the bits.
> 
> Can you explain ? I'm not sure I got what you mean here.

On recent processors you can get an exit to the hypervisor every time N
dirty pages are marked dirty, and the processor also keeps a log of
_which_ pages are marked dirty.  Without the log, you'd have to walk the
entire radix tree to find the dirty pages, which is so expensive that
without the log we don't use dirty bits at all.

>> Quite likely since it was stolen from pHyp.
>>
>> The idea is that you tell the guest about _host_ page faults and the
>> guest starts running some other process.  Then when the host page fault
>> is resolved, you tell the guest about that as well.  This way host page
>> faults are resolved without blocking the whole guest.
> 
> Not pHyp then, I don't think we have any such thing on PAPR, you are
> probably thinking about s390 ?

Yeah, s390 too.  Gleb said AIX on
https://www.linux-kvm.org/images/a/ac/2010-forum-Async-page-faults.pdf
but maybe he was wrong.

> How do you tell the guest on x86 ? You synthetize some kind of
> artificial interrupt ? There's a paravirt stuff for the guest to tell
> you it knows about it ? Or some other mechanism ?

Yes, the guest sets aside a communication page, tells us the address
that it knows about it, and then we start sending the guest "weird" page
faults that are slightly different from the real thing.  Search for
MSR_KVM_ASYNC_PF_EN in Documentation/virtual/kvm/msr.txt.

Thanks,

Paolo

> For powerpc, we could do something similar fairly easily since we are
> paravirtualized but I'd like to see how that's plumbed into the exist
> archs who use it so we don't diverge too much.
> 
> Cheers,
> Ben.
> 

--
To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Benjamin Herrenschmidt Feb. 12, 2018, 2:38 p.m. | #11
On Mon, 2018-02-12 at 12:01 +0100, Paolo Bonzini wrote:
> On 12/02/2018 11:54, Benjamin Herrenschmidt wrote:
> > On Mon, 2018-02-12 at 11:41 +0100, Paolo Bonzini wrote:
> > > In fact I don't know about PPC but on x86 we actually do
> > > write-protection except on the newest processors that have a dirty page
> > > _log_ in addition to the bits.
> > 
> > Can you explain ? I'm not sure I got what you mean here.
> 
> On recent processors you can get an exit to the hypervisor every time N
> dirty pages are marked dirty, and the processor also keeps a log of
> _which_ pages are marked dirty.  Without the log, you'd have to walk the
> entire radix tree to find the dirty pages, which is so expensive that
> without the log we don't use dirty bits at all.

Oh interesting. How is that log "kept" ? Some kind of HW fifo ? There's
a feature on hash ppc where the CPU can keep an array of dirty bit but
it's still indexed by hash idx so you still have to walk it. What you
describe sounds more like a log which would be more interesting... 

> > > Quite likely since it was stolen from pHyp.
> > > 
> > > The idea is that you tell the guest about _host_ page faults and the
> > > guest starts running some other process.  Then when the host page fault
> > > is resolved, you tell the guest about that as well.  This way host page
> > > faults are resolved without blocking the whole guest.
> > 
> > Not pHyp then, I don't think we have any such thing on PAPR, you are
> > probably thinking about s390 ?
> 
> Yeah, s390 too.  Gleb said AIX on
> https://www.linux-kvm.org/images/a/ac/2010-forum-Async-page-faults.pdf
> but maybe he was wrong.

Maybe you aren't wrong, but if it exists, we (Linux) never used it :-) I'll
ask around.

> > How do you tell the guest on x86 ? You synthetize some kind of
> > artificial interrupt ? There's a paravirt stuff for the guest to tell
> > you it knows about it ? Or some other mechanism ?
> 
> Yes, the guest sets aside a communication page, tells us the address
> that it knows about it, and then we start sending the guest "weird" page
> faults that are slightly different from the real thing.  Search for
> MSR_KVM_ASYNC_PF_EN in Documentation/virtual/kvm/msr.txt.

Thanks !

Cheers,
Ben.

> Thanks,
> 
> Paolo
> 
> > For powerpc, we could do something similar fairly easily since we are
> > paravirtualized but I'd like to see how that's plumbed into the exist
> > archs who use it so we don't diverge too much.
> > 
> > Cheers,
> > Ben.
> > 
--
To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Paolo Bonzini Feb. 12, 2018, 2:41 p.m. | #12
On 12/02/2018 15:38, Benjamin Herrenschmidt wrote:
> On Mon, 2018-02-12 at 12:01 +0100, Paolo Bonzini wrote:
>> On 12/02/2018 11:54, Benjamin Herrenschmidt wrote:
>>> On Mon, 2018-02-12 at 11:41 +0100, Paolo Bonzini wrote:
>>>> In fact I don't know about PPC but on x86 we actually do
>>>> write-protection except on the newest processors that have a dirty page
>>>> _log_ in addition to the bits.
>>>
>>> Can you explain ? I'm not sure I got what you mean here.
>>
>> On recent processors you can get an exit to the hypervisor every time N
>> dirty pages are marked dirty, and the processor also keeps a log of
>> _which_ pages are marked dirty.  Without the log, you'd have to walk the
>> entire radix tree to find the dirty pages, which is so expensive that
>> without the log we don't use dirty bits at all.
> 
> Oh interesting. How is that log "kept" ? Some kind of HW fifo ?

Yeah, forced vmexit after 512 dirty pages, plus a pointer to the last
written entry (kept in the hypervisor->processor communication block).
Pretty clever.

Paolo
--
To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Patch

diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c
index 58618f644c56..74e889575bf0 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_radix.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c
@@ -291,6 +291,54 @@  static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa,
 	return ret;
 }
 
+/*
+ * Stolen from virt/kvm/kvm_main.c
+ */
+static int hva_to_pfn_remapped(struct vm_area_struct *vma,
+			       unsigned long addr, bool write_fault,
+			       unsigned long *p_pfn)
+{
+	unsigned long pfn;
+	int r;
+
+	r = follow_pfn(vma, addr, &pfn);
+	if (r) {
+		/*
+		 * get_user_pages fails for VM_IO and VM_PFNMAP vmas and does
+		 * not call the fault handler, so do it here.
+		 */
+		bool unlocked = false;
+
+		r = fixup_user_fault(current, current->mm, addr,
+				     (write_fault ? FAULT_FLAG_WRITE : 0),
+				     &unlocked);
+		if (unlocked)
+			return -EAGAIN;
+		if (r)
+			return r;
+
+		r = follow_pfn(vma, addr, &pfn);
+		if (r)
+			return r;
+	}
+
+	/*
+	 * Get a reference here because callers of *hva_to_pfn* and
+	 * *gfn_to_pfn* ultimately call kvm_release_pfn_clean on the
+	 * returned pfn.  This is only needed if the VMA has VM_MIXEDMAP
+	 * set, but the kvm_get_pfn/kvm_release_pfn_clean pair will
+	 * simply do nothing for reserved pfns.
+	 *
+	 * Whoever called remap_pfn_range is also going to call e.g.
+	 * unmap_mapping_range before the underlying pages are freed,
+	 * causing a call to our MMU notifier.
+	 */
+	kvm_get_pfn(pfn);
+
+	*p_pfn = pfn;
+	return 0;
+}
+
 int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
 				   unsigned long ea, unsigned long dsisr)
 {
@@ -402,8 +450,15 @@  int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
 		vma = find_vma(current->mm, hva);
 		if (vma && vma->vm_start <= hva && hva < vma->vm_end &&
 		    (vma->vm_flags & VM_PFNMAP)) {
-			pfn = vma->vm_pgoff +
-				((hva - vma->vm_start) >> PAGE_SHIFT);
+			if (vma->vm_flags & (VM_IO | VM_PFNMAP)) {
+				ret = hva_to_pfn_remapped(vma, hva, writing,
+							  &pfn);
+				if (ret == -EAGAIN)
+					return RESUME_GUEST;
+			} else {
+				pfn = vma->vm_pgoff +
+					((hva - vma->vm_start) >> PAGE_SHIFT);
+			}
 			pgflags = pgprot_val(vma->vm_page_prot);
 		}
 		up_read(&current->mm->mmap_sem);