[v7,3/8] KVM: Make __kvm_follow_pfn not imply FOLL_GET

Message ID: 20230704075054.3344915-4-stevensd@google.com
State: Handled Elsewhere
Series: KVM: allow mapping non-refcounted pages

Commit Message

David Stevens July 4, 2023, 7:50 a.m. UTC
From: David Stevens <stevensd@chromium.org>

Make it so that __kvm_follow_pfn does not imply FOLL_GET. This allows
callers to resolve a gfn when the associated pfn has a valid struct page
that isn't being actively refcounted (e.g. tail pages of non-compound
higher order pages). For a caller to safely omit FOLL_GET, all usages of
the returned pfn must be guarded by a mmu notifier.

This also adds a is_refcounted_page out parameter to kvm_follow_pfn that
is set when the returned pfn has an associated struct page with a valid
refcount. Callers that don't pass FOLL_GET should remember this value
and use it to avoid places like kvm_is_ad_tracked_page that assume a
non-zero refcount.

Signed-off-by: David Stevens <stevensd@chromium.org>
---
 include/linux/kvm_host.h | 10 ++++++
 virt/kvm/kvm_main.c      | 67 +++++++++++++++++++++-------------------
 virt/kvm/pfncache.c      |  2 +-
 3 files changed, 47 insertions(+), 32 deletions(-)
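
As context for reviewers, here is a minimal, untested sketch of how a caller that is protected by the mmu notifier could use the new API without FOLL_GET. The kvm_follow_pfn fields and kvm_set_page_accessed() are introduced by this series; mmu_invalidate_seq/mmu_invalidate_retry() are KVM's existing invalidation helpers and the locking mirrors the x86 fault path; example_map_gfn() and example_install_pfn() are purely hypothetical placeholders.

/*
 * Illustrative sketch only, not part of this patch: resolve a pfn without
 * FOLL_GET and rely on mmu notifier protection instead of a page reference.
 */
static int example_map_gfn(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot,
			   gfn_t gfn)
{
	struct kvm_follow_pfn foll = {
		.slot = slot,
		.gfn = gfn,
		.flags = FOLL_WRITE,		/* no FOLL_GET: no reference is taken */
		.guarded_by_mmu_notifier = true,
	};
	unsigned long mmu_seq = vcpu->kvm->mmu_invalidate_seq;
	kvm_pfn_t pfn;

	/* Order the seq read before resolving the hva -> pfn translation. */
	smp_rmb();

	pfn = __kvm_follow_pfn(&foll);
	if (is_error_noslot_pfn(pfn))
		return -EFAULT;

	write_lock(&vcpu->kvm->mmu_lock);
	if (mmu_invalidate_retry(vcpu->kvm, mmu_seq)) {
		/* Raced with an invalidation; the pfn must not be used. */
		write_unlock(&vcpu->kvm->mmu_lock);
		return -EAGAIN;
	}

	example_install_pfn(vcpu, gfn, pfn);	/* hypothetical helper */

	/* Only touch struct page metadata when the page is refcounted. */
	if (foll.is_refcounted_page)
		kvm_set_page_accessed(pfn_to_page(pfn));

	write_unlock(&vcpu->kvm->mmu_lock);
	return 0;
}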

Comments

Yu Zhang July 5, 2023, 7:23 a.m. UTC | #1
>  
> +void kvm_set_page_accessed(struct page *page);
> +void kvm_set_page_dirty(struct page *page);
> +
No one calls these 2 routines in this patch. How about move this change to
[PATCH v7 5/8] KVM: x86/mmu: Don't pass FOLL_GET to __kvm_follow_pfn ?

...

> @@ -2930,17 +2933,19 @@ static bool kvm_is_ad_tracked_page(struct page *page)
>  	return !PageReserved(page);
>  }
>  
> -static void kvm_set_page_dirty(struct page *page)
> +void kvm_set_page_dirty(struct page *page)
>  {
>  	if (kvm_is_ad_tracked_page(page))
>  		SetPageDirty(page);
>  }
> +EXPORT_SYMBOL_GPL(kvm_set_page_dirty);
>  
> -static void kvm_set_page_accessed(struct page *page)
> +void kvm_set_page_accessed(struct page *page)
>  {
>  	if (kvm_is_ad_tracked_page(page))
>  		mark_page_accessed(page);
>  }
> +EXPORT_SYMBOL_GPL(kvm_set_page_accessed);

Same here.

B.R.
Yu
Yu Zhang July 5, 2023, 11:56 a.m. UTC | #2
On Tue, Jul 04, 2023 at 04:50:48PM +0900, David Stevens wrote:
> From: David Stevens <stevensd@chromium.org>
> 
> Make it so that __kvm_follow_pfn does not imply FOLL_GET. This allows
> callers to resolve a gfn when the associated pfn has a valid struct page
> that isn't being actively refcounted (e.g. tail pages of non-compound
> higher order pages). For a caller to safely omit FOLL_GET, all usages of
> the returned pfn must be guarded by a mmu notifier.
> 
> This also adds a is_refcounted_page out parameter to kvm_follow_pfn that
> is set when the returned pfn has an associated struct page with a valid
> refcount. Callers that don't pass FOLL_GET should remember this value
> and use it to avoid places like kvm_is_ad_tracked_page that assume a
> non-zero refcount.
> 
> Signed-off-by: David Stevens <stevensd@chromium.org>
> ---
>  include/linux/kvm_host.h | 10 ++++++
>  virt/kvm/kvm_main.c      | 67 +++++++++++++++++++++-------------------
>  virt/kvm/pfncache.c      |  2 +-
>  3 files changed, 47 insertions(+), 32 deletions(-)
> 
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index ef2763c2b12e..a45308c7d2d9 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -1157,6 +1157,9 @@ unsigned long gfn_to_hva_memslot_prot(struct kvm_memory_slot *slot, gfn_t gfn,
>  void kvm_release_page_clean(struct page *page);
>  void kvm_release_page_dirty(struct page *page);
>  
> +void kvm_set_page_accessed(struct page *page);
> +void kvm_set_page_dirty(struct page *page);
> +
>  struct kvm_follow_pfn {
>  	const struct kvm_memory_slot *slot;
>  	gfn_t gfn;
> @@ -1164,10 +1167,17 @@ struct kvm_follow_pfn {
>  	bool atomic;
>  	/* Allow a read fault to create a writeable mapping. */
>  	bool allow_write_mapping;
> +	/*
> +	 * Usage of the returned pfn will be guared by a mmu notifier. Must
> +	 * be true if FOLL_GET is not set.
> +	 */
> +	bool guarded_by_mmu_notifier;

And how? Any place to check the invalidate seq?

B.R.
Yu
Zhi Wang July 5, 2023, 1:19 p.m. UTC | #3
On Tue,  4 Jul 2023 16:50:48 +0900
David Stevens <stevensd@chromium.org> wrote:

> From: David Stevens <stevensd@chromium.org>
> 
> Make it so that __kvm_follow_pfn does not imply FOLL_GET. This allows
> callers to resolve a gfn when the associated pfn has a valid struct page
> that isn't being actively refcounted (e.g. tail pages of non-compound
> higher order pages). For a caller to safely omit FOLL_GET, all usages of
> the returned pfn must be guarded by a mmu notifier.
> 
> This also adds a is_refcounted_page out parameter to kvm_follow_pfn that
> is set when the returned pfn has an associated struct page with a valid
> refcount. Callers that don't pass FOLL_GET should remember this value
> and use it to avoid places like kvm_is_ad_tracked_page that assume a
> non-zero refcount.
> 
> Signed-off-by: David Stevens <stevensd@chromium.org>
> ---
>  include/linux/kvm_host.h | 10 ++++++
>  virt/kvm/kvm_main.c      | 67 +++++++++++++++++++++-------------------
>  virt/kvm/pfncache.c      |  2 +-
>  3 files changed, 47 insertions(+), 32 deletions(-)
> 
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index ef2763c2b12e..a45308c7d2d9 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -1157,6 +1157,9 @@ unsigned long gfn_to_hva_memslot_prot(struct kvm_memory_slot *slot, gfn_t gfn,
>  void kvm_release_page_clean(struct page *page);
>  void kvm_release_page_dirty(struct page *page);
>  
> +void kvm_set_page_accessed(struct page *page);
> +void kvm_set_page_dirty(struct page *page);
> +
>  struct kvm_follow_pfn {
>  	const struct kvm_memory_slot *slot;
>  	gfn_t gfn;
> @@ -1164,10 +1167,17 @@ struct kvm_follow_pfn {
>  	bool atomic;
>  	/* Allow a read fault to create a writeable mapping. */
>  	bool allow_write_mapping;
> +	/*
> +	 * Usage of the returned pfn will be guared by a mmu notifier. Must
                                              ^guarded
> +	 * be true if FOLL_GET is not set.
> +	 */
> +	bool guarded_by_mmu_notifier;
>
It seems no one sets the guarded_by_mmu_notifier in this patch. Is
guarded_by_mmu_notifier always equal to !foll->FOLL_GET and set by the
caller of __kvm_follow_pfn()?

If yes, do we have to use FOLL_GET to resolve GFN associated with a tail page?
It seems gup can tolerate gup_flags without FOLL_GET, but it is more like a
temporary solution. I don't think it is a good idea to play tricks with
a temporary solution, more like we are abusing the toleration.

Is a flag like guarded_by_mmu_notifier (perhaps a better name) enough to
indicate a tail page?
 
>  	/* Outputs of __kvm_follow_pfn */
>  	hva_t hva;
>  	bool writable;
> +	/* True if the returned pfn is for a page with a valid refcount. */
> +	bool is_refcounted_page;
>  };
>  
>  kvm_pfn_t __kvm_follow_pfn(struct kvm_follow_pfn *foll);
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index b13f22861d2f..0f7b41f220b6 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -2502,6 +2502,9 @@ static bool hva_to_pfn_fast(struct kvm_follow_pfn *foll, kvm_pfn_t *pfn)
>  	if (get_user_page_fast_only(foll->hva, FOLL_WRITE, page)) {
>  		*pfn = page_to_pfn(page[0]);
>  		foll->writable = foll->allow_write_mapping;
> +		foll->is_refcounted_page = true;
> +		if (!(foll->flags & FOLL_GET))
> +			put_page(page[0]);
>  		return true;
>  	}
>  
> @@ -2525,6 +2528,7 @@ static int hva_to_pfn_slow(struct kvm_follow_pfn *foll, kvm_pfn_t *pfn)
>  		return npages;
>  
>  	foll->writable = (foll->flags & FOLL_WRITE) && foll->allow_write_mapping;
> +	foll->is_refcounted_page = true;
>  
>  	/* map read fault as writable if possible */
>  	if (unlikely(!foll->writable) && foll->allow_write_mapping) {
> @@ -2537,6 +2541,8 @@ static int hva_to_pfn_slow(struct kvm_follow_pfn *foll, kvm_pfn_t *pfn)
>  		}
>  	}
>  	*pfn = page_to_pfn(page);
> +	if (!(foll->flags & FOLL_GET))
> +		put_page(page);
>  	return npages;
>  }
>  
> @@ -2551,16 +2557,6 @@ static bool vma_is_valid(struct vm_area_struct *vma, bool write_fault)
>  	return true;
>  }
>  
> -static int kvm_try_get_pfn(kvm_pfn_t pfn)
> -{
> -	struct page *page = kvm_pfn_to_refcounted_page(pfn);
> -
> -	if (!page)
> -		return 1;
> -
> -	return get_page_unless_zero(page);
> -}
> -
>  static int hva_to_pfn_remapped(struct vm_area_struct *vma, struct kvm_follow_pfn *foll,
>  			       kvm_pfn_t *p_pfn)
>  {
> @@ -2568,6 +2564,7 @@ static int hva_to_pfn_remapped(struct vm_area_struct *vma, struct kvm_follow_pfn
>  	pte_t *ptep;
>  	spinlock_t *ptl;
>  	bool write_fault = foll->flags & FOLL_WRITE;
> +	struct page *page;
>  	int r;
>  
>  	r = follow_pte(vma->vm_mm, foll->hva, &ptep, &ptl);
> @@ -2599,24 +2596,27 @@ static int hva_to_pfn_remapped(struct vm_area_struct *vma, struct kvm_follow_pfn
>  	pfn = pte_pfn(*ptep);
>  
>  	/*
> -	 * Get a reference here because callers of *hva_to_pfn* and
> -	 * *gfn_to_pfn* ultimately call kvm_release_pfn_clean on the
> -	 * returned pfn.  This is only needed if the VMA has VM_MIXEDMAP
> -	 * set, but the kvm_try_get_pfn/kvm_release_pfn_clean pair will
> -	 * simply do nothing for reserved pfns.
> -	 *
> -	 * Whoever called remap_pfn_range is also going to call e.g.
> -	 * unmap_mapping_range before the underlying pages are freed,
> -	 * causing a call to our MMU notifier.
> +	 * Now deal with reference counting. If kvm_pfn_to_refcounted_page
> +	 * returns NULL, then there's no refcount to worry about.
>  	 *
> -	 * Certain IO or PFNMAP mappings can be backed with valid
> -	 * struct pages, but be allocated without refcounting e.g.,
> -	 * tail pages of non-compound higher order allocations, which
> -	 * would then underflow the refcount when the caller does the
> -	 * required put_page. Don't allow those pages here.
> +	 * Otherwise, certain IO or PFNMAP mappings can be backed with valid
> +	 * struct pages but be allocated without refcounting e.g., tail pages of
> +	 * non-compound higher order allocations. If FOLL_GET is set and we
> +	 * increment such a refcount, then when that pfn is eventually passed to
> +	 * kvm_release_pfn_clean, its refcount would hit zero and be incorrectly
> +	 * freed. Therefore don't allow those pages here when FOLL_GET is set.
>  	 */ 
> -	if (!kvm_try_get_pfn(pfn))
> +	page = kvm_pfn_to_refcounted_page(pfn);
> +	if (!page)
> +		goto out;
> +
> +	if (get_page_unless_zero(page)) {
> +		foll->is_refcounted_page = true;
> +		if (!(foll->flags & FOLL_GET))
> +			put_page(page);
> +	} else if (foll->flags & FOLL_GET) {
>  		r = -EFAULT;
> +	}
>  
>  out:
>  	pte_unmap_unlock(ptep, ptl);
> @@ -2693,6 +2693,9 @@ kvm_pfn_t hva_to_pfn(struct kvm_follow_pfn *foll)
>  
>  kvm_pfn_t __kvm_follow_pfn(struct kvm_follow_pfn *foll)
>  {
> +	if (WARN_ON_ONCE(!(foll->flags & FOLL_GET) && !foll->guarded_by_mmu_notifier))
> +		return KVM_PFN_ERR_FAULT;
> +
>  	foll->hva = __gfn_to_hva_many(foll->slot, foll->gfn, NULL,
>  				      foll->flags & FOLL_WRITE);
>  
> @@ -2717,7 +2720,7 @@ kvm_pfn_t __gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn,
>  	struct kvm_follow_pfn foll = {
>  		.slot = slot,
>  		.gfn = gfn,
> -		.flags = 0,
> +		.flags = FOLL_GET,
>  		.atomic = atomic,
>  		.allow_write_mapping = !!writable,
>  	};
> @@ -2749,7 +2752,7 @@ kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
>  	struct kvm_follow_pfn foll = {
>  		.slot = gfn_to_memslot(kvm, gfn),
>  		.gfn = gfn,
> -		.flags = write_fault ? FOLL_WRITE : 0,
> +		.flags = FOLL_GET | (write_fault ? FOLL_WRITE : 0),
>  		.allow_write_mapping = !!writable,
>  	};
>  	pfn = __kvm_follow_pfn(&foll);
> @@ -2764,7 +2767,7 @@ kvm_pfn_t gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn)
>  	struct kvm_follow_pfn foll = {
>  		.slot = slot,
>  		.gfn = gfn,
> -		.flags = FOLL_WRITE,
> +		.flags = FOLL_GET | FOLL_WRITE,
>  	};
>  	return __kvm_follow_pfn(&foll);
>  }
> @@ -2775,7 +2778,7 @@ kvm_pfn_t gfn_to_pfn_memslot_atomic(const struct kvm_memory_slot *slot, gfn_t gf
>  	struct kvm_follow_pfn foll = {
>  		.slot = slot,
>  		.gfn = gfn,
> -		.flags = FOLL_WRITE,
> +		.flags = FOLL_GET | FOLL_WRITE,
>  		.atomic = true,
>  	};
>  	return __kvm_follow_pfn(&foll);
> @@ -2930,17 +2933,19 @@ static bool kvm_is_ad_tracked_page(struct page *page)
>  	return !PageReserved(page);
>  }
>  
> -static void kvm_set_page_dirty(struct page *page)
> +void kvm_set_page_dirty(struct page *page)
>  {
>  	if (kvm_is_ad_tracked_page(page))
>  		SetPageDirty(page);
>  }
> +EXPORT_SYMBOL_GPL(kvm_set_page_dirty);
>  
> -static void kvm_set_page_accessed(struct page *page)
> +void kvm_set_page_accessed(struct page *page)
>  {
>  	if (kvm_is_ad_tracked_page(page))
>  		mark_page_accessed(page);
>  }
> +EXPORT_SYMBOL_GPL(kvm_set_page_accessed);
>  
>  void kvm_release_page_clean(struct page *page)
>  {
> diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c
> index e3fefa753a51..87caafce3dd0 100644
> --- a/virt/kvm/pfncache.c
> +++ b/virt/kvm/pfncache.c
> @@ -147,7 +147,7 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc)
>  	struct kvm_follow_pfn foll = {
>  		.slot = gpc->memslot,
>  		.gfn = gpa_to_gfn(gpc->gpa),
> -		.flags = FOLL_WRITE,
> +		.flags = FOLL_WRITE | FOLL_GET,
>  		.hva = gpc->uhva,
>  	};
>
David Stevens July 6, 2023, 6:09 a.m. UTC | #4
On Wed, Jul 5, 2023 at 8:56 PM Yu Zhang <yu.c.zhang@linux.intel.com> wrote:
>
> On Tue, Jul 04, 2023 at 04:50:48PM +0900, David Stevens wrote:
> > From: David Stevens <stevensd@chromium.org>
> >
> > Make it so that __kvm_follow_pfn does not imply FOLL_GET. This allows
> > callers to resolve a gfn when the associated pfn has a valid struct page
> > that isn't being actively refcounted (e.g. tail pages of non-compound
> > higher order pages). For a caller to safely omit FOLL_GET, all usages of
> > the returned pfn must be guarded by a mmu notifier.
> >
> > This also adds a is_refcounted_page out parameter to kvm_follow_pfn that
> > is set when the returned pfn has an associated struct page with a valid
> > refcount. Callers that don't pass FOLL_GET should remember this value
> > and use it to avoid places like kvm_is_ad_tracked_page that assume a
> > non-zero refcount.
> >
> > Signed-off-by: David Stevens <stevensd@chromium.org>
> > ---
> >  include/linux/kvm_host.h | 10 ++++++
> >  virt/kvm/kvm_main.c      | 67 +++++++++++++++++++++-------------------
> >  virt/kvm/pfncache.c      |  2 +-
> >  3 files changed, 47 insertions(+), 32 deletions(-)
> >
> > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> > index ef2763c2b12e..a45308c7d2d9 100644
> > --- a/include/linux/kvm_host.h
> > +++ b/include/linux/kvm_host.h
> > @@ -1157,6 +1157,9 @@ unsigned long gfn_to_hva_memslot_prot(struct kvm_memory_slot *slot, gfn_t gfn,
> >  void kvm_release_page_clean(struct page *page);
> >  void kvm_release_page_dirty(struct page *page);
> >
> > +void kvm_set_page_accessed(struct page *page);
> > +void kvm_set_page_dirty(struct page *page);
> > +
> >  struct kvm_follow_pfn {
> >       const struct kvm_memory_slot *slot;
> >       gfn_t gfn;
> > @@ -1164,10 +1167,17 @@ struct kvm_follow_pfn {
> >       bool atomic;
> >       /* Allow a read fault to create a writeable mapping. */
> >       bool allow_write_mapping;
> > +     /*
> > +      * Usage of the returned pfn will be guared by a mmu notifier. Must
> > +      * be true if FOLL_GET is not set.
> > +      */
> > +     bool guarded_by_mmu_notifier;
>
> And how? Any place to check the invalidate seq?

kvm_follow_pfn can't meaningfully validate the seq number, since the
mmu notifier locking is handled by the caller. This is more of a
sanity check that the API is being used properly, as proposed here
[1]. I did deviate from the proposal with a bool instead of some type
of integer, since the exact value of mmu_seq wouldn't be useful.

[1] https://lore.kernel.org/all/ZGvUsf7lMkrNDHuE@google.com/#t

-David
David Stevens July 6, 2023, 6:49 a.m. UTC | #5
On Wed, Jul 5, 2023 at 10:19 PM Zhi Wang <zhi.wang.linux@gmail.com> wrote:
>
> On Tue,  4 Jul 2023 16:50:48 +0900
> David Stevens <stevensd@chromium.org> wrote:
>
> > From: David Stevens <stevensd@chromium.org>
> >
> > Make it so that __kvm_follow_pfn does not imply FOLL_GET. This allows
> > callers to resolve a gfn when the associated pfn has a valid struct page
> > that isn't being actively refcounted (e.g. tail pages of non-compound
> > higher order pages). For a caller to safely omit FOLL_GET, all usages of
> > the returned pfn must be guarded by a mmu notifier.
> >
> > This also adds a is_refcounted_page out parameter to kvm_follow_pfn that
> > is set when the returned pfn has an associated struct page with a valid
> > refcount. Callers that don't pass FOLL_GET should remember this value
> > and use it to avoid places like kvm_is_ad_tracked_page that assume a
> > non-zero refcount.
> >
> > Signed-off-by: David Stevens <stevensd@chromium.org>
> > ---
> >  include/linux/kvm_host.h | 10 ++++++
> >  virt/kvm/kvm_main.c      | 67 +++++++++++++++++++++-------------------
> >  virt/kvm/pfncache.c      |  2 +-
> >  3 files changed, 47 insertions(+), 32 deletions(-)
> >
> > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> > index ef2763c2b12e..a45308c7d2d9 100644
> > --- a/include/linux/kvm_host.h
> > +++ b/include/linux/kvm_host.h
> > @@ -1157,6 +1157,9 @@ unsigned long gfn_to_hva_memslot_prot(struct kvm_memory_slot *slot, gfn_t gfn,
> >  void kvm_release_page_clean(struct page *page);
> >  void kvm_release_page_dirty(struct page *page);
> >
> > +void kvm_set_page_accessed(struct page *page);
> > +void kvm_set_page_dirty(struct page *page);
> > +
> >  struct kvm_follow_pfn {
> >       const struct kvm_memory_slot *slot;
> >       gfn_t gfn;
> > @@ -1164,10 +1167,17 @@ struct kvm_follow_pfn {
> >       bool atomic;
> >       /* Allow a read fault to create a writeable mapping. */
> >       bool allow_write_mapping;
> > +     /*
> > +      * Usage of the returned pfn will be guared by a mmu notifier. Must
>                                               ^guarded
> > +      * be true if FOLL_GET is not set.
> > +      */
> > +     bool guarded_by_mmu_notifier;
> >
> > It seems no one sets the guarded_by_mmu_notifier in this patch. Is
> guarded_by_mmu_notifier always equal to !foll->FOLL_GET and set by the
> caller of __kvm_follow_pfn()?

Yes, this is the case.

> If yes, do we have to use FOLL_GET to resolve GFN associated with a tail page?
> It seems gup can tolerate gup_flags without FOLL_GET, but it is more like a
> temporary solution. I don't think it is a good idea to play tricks with
> a temporary solution, more like we are abusing the toleration.

I'm not sure I understand what you're getting at. This series never
calls gup without FOLL_GET.

This series aims to provide kvm_follow_pfn as a unified API on top of
gup+follow_pte. Since one of the major clients of this API uses an mmu
notifier, it makes sense to support returning a pfn without taking a
reference. And we indeed need to do that for certain types of memory.

> Is a flag like guarded_by_mmu_notifier (perhaps a better name) enough to
> indicate a tail page?

What do you mean by to indicate a tail page? Do you mean to indicate
that the returned pfn refers to non-refcounted page? That's specified
by is_refcounted_page.

-David
Zhi Wang July 11, 2023, 5:33 p.m. UTC | #6
On Thu, 6 Jul 2023 15:49:39 +0900
David Stevens <stevensd@chromium.org> wrote:

> On Wed, Jul 5, 2023 at 10:19 PM Zhi Wang <zhi.wang.linux@gmail.com> wrote:
> >
> > On Tue,  4 Jul 2023 16:50:48 +0900
> > David Stevens <stevensd@chromium.org> wrote:
> >  
> > > From: David Stevens <stevensd@chromium.org>
> > >
> > > Make it so that __kvm_follow_pfn does not imply FOLL_GET. This allows
> > > callers to resolve a gfn when the associated pfn has a valid struct page
> > > that isn't being actively refcounted (e.g. tail pages of non-compound
> > > higher order pages). For a caller to safely omit FOLL_GET, all usages of
> > > the returned pfn must be guarded by a mmu notifier.
> > >
> > > This also adds a is_refcounted_page out parameter to kvm_follow_pfn that
> > > is set when the returned pfn has an associated struct page with a valid
> > > refcount. Callers that don't pass FOLL_GET should remember this value
> > > and use it to avoid places like kvm_is_ad_tracked_page that assume a
> > > non-zero refcount.
> > >
> > > Signed-off-by: David Stevens <stevensd@chromium.org>
> > > ---
> > >  include/linux/kvm_host.h | 10 ++++++
> > >  virt/kvm/kvm_main.c      | 67 +++++++++++++++++++++-------------------
> > >  virt/kvm/pfncache.c      |  2 +-
> > >  3 files changed, 47 insertions(+), 32 deletions(-)
> > >
> > > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> > > index ef2763c2b12e..a45308c7d2d9 100644
> > > --- a/include/linux/kvm_host.h
> > > +++ b/include/linux/kvm_host.h
> > > @@ -1157,6 +1157,9 @@ unsigned long gfn_to_hva_memslot_prot(struct kvm_memory_slot *slot, gfn_t gfn,
> > >  void kvm_release_page_clean(struct page *page);
> > >  void kvm_release_page_dirty(struct page *page);
> > >
> > > +void kvm_set_page_accessed(struct page *page);
> > > +void kvm_set_page_dirty(struct page *page);
> > > +
> > >  struct kvm_follow_pfn {
> > >       const struct kvm_memory_slot *slot;
> > >       gfn_t gfn;
> > > @@ -1164,10 +1167,17 @@ struct kvm_follow_pfn {
> > >       bool atomic;
> > >       /* Allow a read fault to create a writeable mapping. */
> > >       bool allow_write_mapping;
> > > +     /*
> > > +      * Usage of the returned pfn will be guared by a mmu notifier. Must  
> >                                               ^guarded  
> > > +      * be true if FOLL_GET is not set.
> > > +      */
> > > +     bool guarded_by_mmu_notifier;
> > >  
> > It seems no one sets the guarded_by_mmu_notifier in this patch. Is
> > guarded_by_mmu_notifier always equal to !foll->FOLL_GET and set by the
> > caller of __kvm_follow_pfn()?  
> 
> Yes, this is the case.
> 
> > If yes, do we have to use FOLL_GET to resolve GFN associated with a tail page?
> > It seems gup can tolerate gup_flags without FOLL_GET, but it is more like a
> > temporary solution. I don't think it is a good idea to play tricks with
> > a temporary solution, more like we are abusing the toleration.  
> 
> I'm not sure I understand what you're getting at. This series never
> calls gup without FOLL_GET.
> 
> This series aims to provide kvm_follow_pfn as a unified API on top of
> gup+follow_pte. Since one of the major clients of this API uses an mmu
> notifier, it makes sense to support returning a pfn without taking a
> reference. And we indeed need to do that for certain types of memory.
> 

I am not having prob with taking a pfn without taking a ref. I am
questioning if using !FOLL_GET in struct kvm_follow_pfn to indicate taking
a pfn without a ref is a good idea or not, while there is another flag
actually showing it.

I can understand that using FOLL_XXX in kvm_follow_pfn saves some
translation between struct kvm_follow_pfn.{write, async, xxxx} and GUP
flags. However FOLL_XXX is for GUP. Using FOLL_XXX for reflecting the
requirements of GUP in the code path that going to call GUP is reasonable.

But using FOLL_XXX with purposes that are not related to GUP call really
feels off. Those flags can be changed in future because of GUP requirements.
Then people have to figure out what actually is happening with FOLL_GET here
as it is not actually tied to GUP calls.


> > Is a flag like guarded_by_mmu_notifier (perhaps a better name) enough to
> > indicate a tail page?  
> 
> What do you mean by to indicate a tail page? Do you mean to indicate
> that the returned pfn refers to non-refcounted page? That's specified
> by is_refcounted_page.
>

I figured out the reason why I got confused.

+	 * Otherwise, certain IO or PFNMAP mappings can be backed with valid
+	 * struct pages but be allocated without refcounting e.g., tail pages of
+	 * non-compound higher order allocations. If FOLL_GET is set and we
+	 * increment such a refcount, then when that pfn is eventually passed to
+	 * kvm_release_pfn_clean, its refcount would hit zero and be incorrectly
+	 * freed. Therefore don't allow those pages here when FOLL_GET is set.
 	 */ 

The above statements only explain the wrong behavior, but don't explain the
expected behavior. It would be better to explain that for an mmu-notifier-guarded
pfn (!FOLL_GET), we put back the reference taken by GUP. The FOLL_GET handling
really confused me a lot.

-	if (!kvm_try_get_pfn(pfn))
+	page = kvm_pfn_to_refcounted_page(pfn);
+	if (!page)
+		goto out;
+
+	if (get_page_unless_zero(page)) {
+		foll->is_refcounted_page = true;
+		if (!(foll->flags & FOLL_GET))
+			put_page(page);
+	} else if (foll->flags & FOLL_GET) {
 		r = -EFAULT;
+	}

> -David
Sean Christopherson July 11, 2023, 9:59 p.m. UTC | #7
On Tue, Jul 11, 2023, Zhi Wang wrote:
> On Thu, 6 Jul 2023 15:49:39 +0900
> David Stevens <stevensd@chromium.org> wrote:
> 
> > On Wed, Jul 5, 2023 at 10:19 PM Zhi Wang <zhi.wang.linux@gmail.com> wrote:
> > >
> > > On Tue,  4 Jul 2023 16:50:48 +0900
> > > David Stevens <stevensd@chromium.org> wrote:
> > > If yes, do we have to use FOLL_GET to resolve GFN associated with a tail page?
> > > It seems gup can tolerate gup_flags without FOLL_GET, but it is more like a
> > > temporary solution. I don't think it is a good idea to play tricks with
> > > a temporary solution, more like we are abusing the toleration.  
> > 
> > I'm not sure I understand what you're getting at. This series never
> > calls gup without FOLL_GET.
> > 
> > This series aims to provide kvm_follow_pfn as a unified API on top of
> > gup+follow_pte. Since one of the major clients of this API uses an mmu
> > notifier, it makes sense to support returning a pfn without taking a
> > reference. And we indeed need to do that for certain types of memory.
> > 
> 
> I am not having prob with taking a pfn without taking a ref. I am
> questioning if using !FOLL_GET in struct kvm_follow_pfn to indicate taking
> a pfn without a ref is a good idea or not, while there is another flag
> actually showing it.
> 
> I can understand that using FOLL_XXX in kvm_follow_pfn saves some
> translation between struct kvm_follow_pfn.{write, async, xxxx} and GUP
> flags. However FOLL_XXX is for GUP. Using FOLL_XXX for reflecting the
> requirements of GUP in the code path that going to call GUP is reasonable.
> 
> But using FOLL_XXX with purposes that are not related to GUP call really
> feels off.

I agree, assuming you're talking specifically about the logic in hva_to_pfn_remapped()
that handles non-refcounted pages, i.e. this

	if (get_page_unless_zero(page)) {
		foll->is_refcounted_page = true;
		if (!(foll->flags & FOLL_GET))
			put_page(page);
	} else if (foll->flags & FOLL_GET) {
		r = -EFAULT;
	}

should be

	if (get_page_unless_zero(page)) {
		foll->is_refcounted_page = true;
		if (!(foll->flags & FOLL_GET))
			put_page(page);
	else if (!foll->guarded_by_mmu_notifier)
		r = -EFAULT;

because it's not the desire to grab a reference that makes getting non-refcounted
pfns "safe", it's whether or not the caller is plugged into the MMU notifiers.

Though that highlights that checking guarded_by_mmu_notifier should be done for
*all* non-refcounted pfns, not just non-refcounted struct page memory.

As for the other usage of FOLL_GET in this series (using it to conditionally do
put_page()), IMO that's very much related to the GUP call.  Invoking put_page()
is a hack to workaround the fact that GUP doesn't provide a way to get the pfn
without grabbing a reference to the page.  In an ideal world, KVM would NOT pass
FOLL_GET to the various GUP helpers, i.e. FOLL_GET would be passed as-is and KVM
wouldn't "need" to kinda sorta overload FOLL_GET to manually drop the reference.

I do think it's worth providing a helper to consolidate and document that hacky
code, e.g. add a kvm_follow_refcounted_pfn() helper.

All in all, I think the below (completely untested) is what we want?

David (and others), I am planning on doing a full review of this series "soon",
but it will likely be a few weeks until that happens.  I jumped in on this
specific thread because this caught my eye and I really don't want to throw out
*all* of the FOLL_GET usage.

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 5b5afd70f239..90d424990e0a 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2481,6 +2481,25 @@ static inline int check_user_page_hwpoison(unsigned long addr)
        return rc == -EHWPOISON;
 }
 
+static kvm_pfn_t kvm_follow_refcounted_pfn(struct kvm_follow_pfn *foll,
+                                          struct page *page)
+{
+       kvm_pfn_t pfn = page_to_pfn(page);
+
+       foll->is_refcounted_page = true;
+
+       /*
+        * FIXME: Ideally, KVM wouldn't pass FOLL_GET to gup() when the caller
+        * doesn't want to grab a reference, but gup() doesn't support getting
+        * just the pfn, i.e. FOLL_GET is effectively mandatory.  If that ever
+        * changes, drop this and simply don't pass FOLL_GET to gup().
+        */
+       if (!(foll->flags & FOLL_GET))
+               put_page(page);
+
+       return pfn;
+}
+
 /*
  * The fast path to get the writable pfn which will be stored in @pfn,
  * true indicates success, otherwise false is returned.  It's also the
@@ -2500,11 +2519,9 @@ static bool hva_to_pfn_fast(struct kvm_follow_pfn *foll, kvm_pfn_t *pfn)
                return false;
 
        if (get_user_page_fast_only(foll->hva, FOLL_WRITE, page)) {
-               *pfn = page_to_pfn(page[0]);
                foll->writable = foll->allow_write_mapping;
-               foll->is_refcounted_page = true;
-               if (!(foll->flags & FOLL_GET))
-                       put_page(page[0]);
+
+               *pfn = kvm_follow_refcounted_pfn(foll, page[0]);
                return true;
        }
 
@@ -2528,7 +2545,6 @@ static int hva_to_pfn_slow(struct kvm_follow_pfn *foll, kvm_pfn_t *pfn)
                return npages;
 
        foll->writable = (foll->flags & FOLL_WRITE) && foll->allow_write_mapping;
-       foll->is_refcounted_page = true;
 
        /* map read fault as writable if possible */
        if (unlikely(!foll->writable) && foll->allow_write_mapping) {
@@ -2540,9 +2556,8 @@ static int hva_to_pfn_slow(struct kvm_follow_pfn *foll, kvm_pfn_t *pfn)
                        page = wpage;
                }
        }
-       *pfn = page_to_pfn(page);
-       if (!(foll->flags & FOLL_GET))
-               put_page(page);
+
+       *pfn = kvm_follow_refcounted_pfn(foll, page);
        return npages;
 }
 
@@ -2610,17 +2625,16 @@ static int hva_to_pfn_remapped(struct vm_area_struct *vma, struct kvm_follow_pfn
        if (!page)
                goto out;
 
-       if (get_page_unless_zero(page)) {
-               foll->is_refcounted_page = true;
-               if (!(foll->flags & FOLL_GET))
-                       put_page(page);
-       } else if (foll->flags & FOLL_GET) {
-               r = -EFAULT;
-       }
-
+       if (get_page_unless_zero(page))
+               WARN_ON_ONCE(kvm_follow_refcounted_pfn(foll, page) != pfn);
 out:
        pte_unmap_unlock(ptep, ptl);
-       *p_pfn = pfn;
+
+       if (!foll->is_refcounted_page && !foll->guarded_by_mmu_notifier &&
+           !allow_unsafe_mappings)
+               r = -EFAULT;
+       else
+               *p_pfn = pfn;
 
        return r;
 }
David Stevens Sept. 5, 2023, 8:26 a.m. UTC | #8
On Wed, Jul 12, 2023 at 7:00 AM Sean Christopherson <seanjc@google.com> wrote:
>
> On Tue, Jul 11, 2023, Zhi Wang wrote:
> > On Thu, 6 Jul 2023 15:49:39 +0900
> > David Stevens <stevensd@chromium.org> wrote:
> >
> > > On Wed, Jul 5, 2023 at 10:19 PM Zhi Wang <zhi.wang.linux@gmail.com> wrote:
> > > >
> > > > On Tue,  4 Jul 2023 16:50:48 +0900
> > > > David Stevens <stevensd@chromium.org> wrote:
> > > > If yes, do we have to use FOLL_GET to resolve GFN associated with a tail page?
> > > > It seems gup can tolerate gup_flags without FOLL_GET, but it is more like a
> > > > temporary solution. I don't think it is a good idea to play tricks with
> > > > a temporary solution, more like we are abusing the toleration.
> > >
> > > I'm not sure I understand what you're getting at. This series never
> > > calls gup without FOLL_GET.
> > >
> > > This series aims to provide kvm_follow_pfn as a unified API on top of
> > > gup+follow_pte. Since one of the major clients of this API uses an mmu
> > > notifier, it makes sense to support returning a pfn without taking a
> > > reference. And we indeed need to do that for certain types of memory.
> > >
> >
> > I am not having prob with taking a pfn without taking a ref. I am
> > questioning if using !FOLL_GET in struct kvm_follow_pfn to indicate taking
> > a pfn without a ref is a good idea or not, while there is another flag
> > actually showing it.
> >
> > I can understand that using FOLL_XXX in kvm_follow_pfn saves some
> > translation between struct kvm_follow_pfn.{write, async, xxxx} and GUP
> > flags. However FOLL_XXX is for GUP. Using FOLL_XXX for reflecting the
> > requirements of GUP in the code path that going to call GUP is reasonable.
> >
> > But using FOLL_XXX with purposes that are not related to GUP call really
> > feels off.
>
> I agree, assuming you're talking specifically about the logic in hva_to_pfn_remapped()
> that handles non-refcounted pages, i.e. this
>
>         if (get_page_unless_zero(page)) {
>                 foll->is_refcounted_page = true;
>                 if (!(foll->flags & FOLL_GET))
>                         put_page(page);
>         } else if (foll->flags & FOLL_GET) {
>                 r = -EFAULT;
>         }
>
> should be
>
>         if (get_page_unless_zero(page)) {
>                 foll->is_refcounted_page = true;
>                 if (!(foll->flags & FOLL_GET))
>                         put_page(page);
>         else if (!foll->guarded_by_mmu_notifier)
>                 r = -EFAULT;
>
> because it's not the desire to grab a reference that makes getting non-refcounted
> pfns "safe", it's whether or not the caller is plugged into the MMU notifiers.
>
> Though that highlights that checking guarded_by_mmu_notifier should be done for
> *all* non-refcounted pfns, not just non-refcounted struct page memory.

I think things are getting confused here because there are multiple
things which "safe" refers to. There are three different definitions
that I think are relevant here:

1) "safe" in the sense that KVM doesn't corrupt page reference counts
2) "safe" in the sense that KVM doesn't access pfns after they have been freed
3) "safe" in the sense that KVM doesn't use stale hva -> pfn translations

For property 1, FOLL_GET is important. If the caller passes FOLL_GET,
then they expect to be able to pass the returned pfn to
kvm_release_pfn. This means that when FOLL_GET is set, if
kvm_pfn_to_refcounted_page returns a page, then hva_to_pfn_remapped
must take a reference count to avoid eventually corrupting the page
ref count. I guess replacing the FOLL_GET check with
!guarded_by_mmu_notifier is logically equivalent because
__kvm_follow_pfn requires that at least one of guarded_by_mmu_notifier
and FOLL_GET is set. But since we're concerned about a property of the
refcount, I think that checking FOLL_GET is clearer.

For property 2, FOLL_GET is also important. If guarded_by_mmu_notifier
is set, then we're all good here. If guarded_by_mmu_notifier is not
set, then the check in __kvm_follow_pfn guarantees that FOLL_GET is
set. For struct page memory, we're safe because KVM will hold a
reference as long as it's still using the page. For non struct page
memory, we're not safe - this is where the breaking change of
allow_unsafe_mappings would go. Note that for non-refcounted struct
page, we can't use the allow_unsafe_mappings escape hatch. Since
FOLL_GET was requested, if we returned such a page, then the caller
would eventually corrupt the page refcount via kvm_release_pfn.

Property 3 would be nice, but we've already concluded that guarding
all translations with mmu notifiers is infeasible. So maintaining
property 2 is the best we can hope for.

> As for the other usage of FOLL_GET in this series (using it to conditionally do
> put_page()), IMO that's very much related to the GUP call.  Invoking put_page()
> is a hack to workaround the fact that GUP doesn't provide a way to get the pfn
> without grabbing a reference to the page.  In an ideal world, KVM would NOT pass
> FOLL_GET to the various GUP helpers, i.e. FOLL_GET would be passed as-is and KVM
> wouldn't "need" to kinda sorta overload FOLL_GET to manually drop the reference.
>
> I do think it's worth providing a helper to consolidate and document that hacky
> code, e.g. add a kvm_follow_refcounted_pfn() helper.
>
> All in all, I think the below (completely untested) is what we want?
>
> David (and others), I am planning on doing a full review of this series "soon",
> but it will likely be a few weeks until that happens.  I jumped in on this
> specific thread because this caught my eye and I really don't want to throw out
> *all* of the FOLL_GET usage.
>
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 5b5afd70f239..90d424990e0a 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -2481,6 +2481,25 @@ static inline int check_user_page_hwpoison(unsigned long addr)
>         return rc == -EHWPOISON;
>  }
>
> +static kvm_pfn_t kvm_follow_refcounted_pfn(struct kvm_follow_pfn *foll,
> +                                          struct page *page)
> +{
> +       kvm_pfn_t pfn = page_to_pfn(page);
> +
> +       foll->is_refcounted_page = true;
> +
> +       /*
> +        * FIXME: Ideally, KVM wouldn't pass FOLL_GET to gup() when the caller
> +        * doesn't want to grab a reference, but gup() doesn't support getting
> +        * just the pfn, i.e. FOLL_GET is effectively mandatory.  If that ever
> +        * changes, drop this and simply don't pass FOLL_GET to gup().
> +        */
> +       if (!(foll->flags & FOLL_GET))
> +               put_page(page);
> +
> +       return pfn;
> +}
> +
>  /*
>   * The fast path to get the writable pfn which will be stored in @pfn,
>   * true indicates success, otherwise false is returned.  It's also the
> @@ -2500,11 +2519,9 @@ static bool hva_to_pfn_fast(struct kvm_follow_pfn *foll, kvm_pfn_t *pfn)
>                 return false;
>
>         if (get_user_page_fast_only(foll->hva, FOLL_WRITE, page)) {
> -               *pfn = page_to_pfn(page[0]);
>                 foll->writable = foll->allow_write_mapping;
> -               foll->is_refcounted_page = true;
> -               if (!(foll->flags & FOLL_GET))
> -                       put_page(page[0]);
> +
> +               *pfn = kvm_follow_refcounted_pfn(foll, page[0]);
>                 return true;
>         }
>
> @@ -2528,7 +2545,6 @@ static int hva_to_pfn_slow(struct kvm_follow_pfn *foll, kvm_pfn_t *pfn)
>                 return npages;
>
>         foll->writable = (foll->flags & FOLL_WRITE) && foll->allow_write_mapping;
> -       foll->is_refcounted_page = true;
>
>         /* map read fault as writable if possible */
>         if (unlikely(!foll->writable) && foll->allow_write_mapping) {
> @@ -2540,9 +2556,8 @@ static int hva_to_pfn_slow(struct kvm_follow_pfn *foll, kvm_pfn_t *pfn)
>                         page = wpage;
>                 }
>         }
> -       *pfn = page_to_pfn(page);
> -       if (!(foll->flags & FOLL_GET))
> -               put_page(page);
> +
> +       *pfn = kvm_follow_refcounted_pfn(foll, page);
>         return npages;
>  }
>
> @@ -2610,17 +2625,16 @@ static int hva_to_pfn_remapped(struct vm_area_struct *vma, struct kvm_follow_pfn
>         if (!page)
>                 goto out;
>
> -       if (get_page_unless_zero(page)) {
> -               foll->is_refcounted_page = true;
> -               if (!(foll->flags & FOLL_GET))
> -                       put_page(page);
> -       } else if (foll->flags & FOLL_GET) {
> -               r = -EFAULT;
> -       }
> -
> +       if (get_page_unless_zero(page))
> +               WARN_ON_ONCE(kvm_follow_refcounted_pfn(foll, page) != pfn);
>  out:
>         pte_unmap_unlock(ptep, ptl);
> -       *p_pfn = pfn;
> +
> +       if (!foll->is_refcounted_page && !foll->guarded_by_mmu_notifier &&
> +           !allow_unsafe_mappings)
> +               r = -EFAULT;
> +       else
> +               *p_pfn = pfn;
>
>         return r;
>  }
>

As I pointed out above, this suggestion is broken because a FOLL_GET
&& !guarded_by_mmu_notifier request (e.g. kvm_vcpu_map) for a
non-refcounted page will result in the refcount eventually being
corrupted.

What do you think of this implementation? If it makes sense, I can
send out an updated patch series.

/*
 * If FOLL_GET is set, then the caller wants us to take a reference to
 * keep the pfn alive. If FOLL_GET isn't set, then __kvm_follow_pfn
 * guarantees that guarded_by_mmu_notifier is set, so there aren't any
 * use-after-free concerns.
 */
page = kvm_pfn_to_refcounted_page(pfn);
if (page) {
        if (get_page_unless_zero(page)) {
                WARN_ON_ONCE(kvm_follow_refcounted_pfn(foll, page) != pfn);
        } else if (foll->flags & FOLL_GET) {
                /*
                 * Certain IO or PFNMAP mappings can be backed with
                 * valid struct pages but be allocated without
                 * refcounting e.g., tail pages of non-compound higher
                 * order allocations. The caller asked for a ref, but
                 * we can't take one, since releasing such a ref would
                 * free the page.
                 */
                r = -EFAULT;
        }
} else if (foll->flags & FOLL_GET) {
        /*
         * When there's no struct page to refcount and no MMU notifier,
         * then KVM can't be guaranteed to avoid use-after-free. However,
         * there are valid reasons to set up such mappings. If userspace
         * is trusted and willing to forego kernel safety guarantees,
         * allow this check to be bypassed.
         */
        if (foll->guarded_by_mmu_notifier && !allow_unsafe_mappings)
                r = -EFAULT;
}

-David
Sean Christopherson Sept. 6, 2023, 12:45 a.m. UTC | #9
On Tue, Sep 05, 2023, David Stevens wrote:
> On Wed, Jul 12, 2023 at 7:00 AM Sean Christopherson <seanjc@google.com> wrote:
> >
> > On Tue, Jul 11, 2023, Zhi Wang wrote:
> > > On Thu, 6 Jul 2023 15:49:39 +0900
> > > David Stevens <stevensd@chromium.org> wrote:
> > >
> > > > On Wed, Jul 5, 2023 at 10:19 PM Zhi Wang <zhi.wang.linux@gmail.com> wrote:
> > > > >
> > > > > On Tue,  4 Jul 2023 16:50:48 +0900
> > > > > David Stevens <stevensd@chromium.org> wrote:
> > > > > If yes, do we have to use FOLL_GET to resolve GFN associated with a tail page?
> > > > > It seems gup can tolerate gup_flags without FOLL_GET, but it is more like a
> > > > > temporary solution. I don't think it is a good idea to play tricks with
> > > > > a temporary solution, more like we are abusing the toleration.
> > > >
> > > > I'm not sure I understand what you're getting at. This series never
> > > > calls gup without FOLL_GET.
> > > >
> > > > This series aims to provide kvm_follow_pfn as a unified API on top of
> > > > gup+follow_pte. Since one of the major clients of this API uses an mmu
> > > > notifier, it makes sense to support returning a pfn without taking a
> > > > reference. And we indeed need to do that for certain types of memory.
> > > >
> > >
> > > I am not having prob with taking a pfn without taking a ref. I am
> > > questioning if using !FOLL_GET in struct kvm_follow_pfn to indicate taking
> > > a pfn without a ref is a good idea or not, while there is another flag
> > > actually showing it.
> > >
> > > I can understand that using FOLL_XXX in kvm_follow_pfn saves some
> > > translation between struct kvm_follow_pfn.{write, async, xxxx} and GUP
> > > flags. However FOLL_XXX is for GUP. Using FOLL_XXX for reflecting the
> > > requirements of GUP in the code path that going to call GUP is reasonable.
> > >
> > > But using FOLL_XXX with purposes that are not related to GUP call really
> > > feels off.
> >
> > I agree, assuming you're talking specifically about the logic in hva_to_pfn_remapped()
> > that handles non-refcounted pages, i.e. this
> >
> >         if (get_page_unless_zero(page)) {
> >                 foll->is_refcounted_page = true;
> >                 if (!(foll->flags & FOLL_GET))
> >                         put_page(page);
> >         } else if (foll->flags & FOLL_GET) {
> >                 r = -EFAULT;
> >         }
> >
> > should be
> >
> >         if (get_page_unless_zero(page)) {
> >                 foll->is_refcounted_page = true;
> >                 if (!(foll->flags & FOLL_GET))
> >                         put_page(page);
> >         else if (!foll->guarded_by_mmu_notifier)
> >                 r = -EFAULT;
> >
> > because it's not the desire to grab a reference that makes getting non-refcounted
> > pfns "safe", it's whether or not the caller is plugged into the MMU notifiers.
> >
> > Though that highlights that checking guarded_by_mmu_notifier should be done for
> > *all* non-refcounted pfns, not just non-refcounted struct page memory.
> 
> I think things are getting confused here because there are multiple
> things which "safe" refers to. There are three different definitions
> that I think are relevant here:
> 
> 1) "safe" in the sense that KVM doesn't corrupt page reference counts
> 2) "safe" in the sense that KVM doesn't access pfns after they have been freed
> 3) "safe" in the sense that KVM doesn't use stale hva -> pfn translations
>
> For property 1, FOLL_GET is important. If the caller passes FOLL_GET,
> then they expect to be able to pass the returned pfn to
> kvm_release_pfn. This means that when FOLL_GET is set, if
> kvm_pfn_to_refcounted_page returns a page, then hva_to_pfn_remapped
> must take a reference count to avoid eventually corrupting the page
> ref count. I guess replacing the FOLL_GET check with
> !guarded_by_mmu_notifier is logically equivalent because
> __kvm_follow_pfn requires that at least one of guarded_by_mmu_notifier
> and FOLL_GET is set. But since we're concerned about a property of the
> refcount, I think that checking FOLL_GET is clearer.
> 
> For property 2, FOLL_GET is also important. If guarded_by_mmu_notifier
> is set, then we're all good here. If guarded_by_mmu_notifier is not
> set, then the check in __kvm_follow_pfn guarantees that FOLL_GET is
> set. For struct page memory, we're safe because KVM will hold a
> reference as long as it's still using the page. For non struct page
> memory, we're not safe - this is where the breaking change of
> allow_unsafe_mappings would go. Note that for non-refcounted struct
> page, we can't use the allow_unsafe_mappings escape hatch. Since
> FOLL_GET was requested, if we returned such a page, then the caller
> would eventually corrupt the page refcount via kvm_release_pfn.

Yes we can.  The caller simply needs to be made aware of is_refcounted_page.   I
didn't include that in the snippet below because I didn't want to write the entire
patch.  The whole point of adding is_refcounted_page is so that callers can
identify exactly what type of page was at the end of the trail that was followed.

> Property 3 would be nice, but we've already concluded that guarding
> all translations with mmu notifiers is infeasible. So maintaining
> property 2 is the best we can hope for.

No, #3 is just a variant of #2.  Unless you're talking about not making guarantees
about guest accesses being ordered with respect to VMA/memslot updates, but I
don't think that's the case.

> > As for the other usage of FOLL_GET in this series (using it to conditionally do
> > put_page()), IMO that's very much related to the GUP call.  Invoking put_page()
> > is a hack to workaround the fact that GUP doesn't provide a way to get the pfn
> > without grabbing a reference to the page.  In an ideal world, KVM would NOT pass
> > FOLL_GET to the various GUP helpers, i.e. FOLL_GET would be passed as-is and KVM
> > wouldn't "need" to kinda sorta overload FOLL_GET to manually drop the reference.
> >
> > I do think it's worth providing a helper to consolidate and document that hacky
> > code, e.g. add a kvm_follow_refcounted_pfn() helper.
> >
> > All in all, I think the below (completely untested) is what we want?
> >
> > David (and others), I am planning on doing a full review of this series "soon",
> > but it will likely be a few weeks until that happens.  I jumped in on this
> > specific thread because this caught my eye and I really don't want to throw out
> > *all* of the FOLL_GET usage.
> >
> > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> > index 5b5afd70f239..90d424990e0a 100644
> > --- a/virt/kvm/kvm_main.c
> > +++ b/virt/kvm/kvm_main.c
> > @@ -2481,6 +2481,25 @@ static inline int check_user_page_hwpoison(unsigned long addr)
> >         return rc == -EHWPOISON;
> >  }
> >
> > +static kvm_pfn_t kvm_follow_refcounted_pfn(struct kvm_follow_pfn *foll,
> > +                                          struct page *page)
> > +{
> > +       kvm_pfn_t pfn = page_to_pfn(page);
> > +
> > +       foll->is_refcounted_page = true;
> > +
> > +       /*
> > +        * FIXME: Ideally, KVM wouldn't pass FOLL_GET to gup() when the caller
> > +        * doesn't want to grab a reference, but gup() doesn't support getting
> > +        * just the pfn, i.e. FOLL_GET is effectively mandatory.  If that ever
> > +        * changes, drop this and simply don't pass FOLL_GET to gup().
> > +        */
> > +       if (!(foll->flags & FOLL_GET))
> > +               put_page(page);
> > +
> > +       return pfn;
> > +}
> > +
> >  /*
> >   * The fast path to get the writable pfn which will be stored in @pfn,
> >   * true indicates success, otherwise false is returned.  It's also the
> > @@ -2500,11 +2519,9 @@ static bool hva_to_pfn_fast(struct kvm_follow_pfn *foll, kvm_pfn_t *pfn)
> >                 return false;
> >
> >         if (get_user_page_fast_only(foll->hva, FOLL_WRITE, page)) {
> > -               *pfn = page_to_pfn(page[0]);
> >                 foll->writable = foll->allow_write_mapping;
> > -               foll->is_refcounted_page = true;
> > -               if (!(foll->flags & FOLL_GET))
> > -                       put_page(page[0]);
> > +
> > +               *pfn = kvm_follow_refcounted_pfn(foll, page[0]);
> >                 return true;
> >         }
> >
> > @@ -2528,7 +2545,6 @@ static int hva_to_pfn_slow(struct kvm_follow_pfn *foll, kvm_pfn_t *pfn)
> >                 return npages;
> >
> >         foll->writable = (foll->flags & FOLL_WRITE) && foll->allow_write_mapping;
> > -       foll->is_refcounted_page = true;
> >
> >         /* map read fault as writable if possible */
> >         if (unlikely(!foll->writable) && foll->allow_write_mapping) {
> > @@ -2540,9 +2556,8 @@ static int hva_to_pfn_slow(struct kvm_follow_pfn *foll, kvm_pfn_t *pfn)
> >                         page = wpage;
> >                 }
> >         }
> > -       *pfn = page_to_pfn(page);
> > -       if (!(foll->flags & FOLL_GET))
> > -               put_page(page);
> > +
> > +       *pfn = kvm_follow_refcounted_pfn(foll, page);
> >         return npages;
> >  }
> >
> > @@ -2610,17 +2625,16 @@ static int hva_to_pfn_remapped(struct vm_area_struct *vma, struct kvm_follow_pfn
> >         if (!page)
> >                 goto out;
> >
> > -       if (get_page_unless_zero(page)) {
> > -               foll->is_refcounted_page = true;
> > -               if (!(foll->flags & FOLL_GET))
> > -                       put_page(page);
> > -       } else if (foll->flags & FOLL_GET) {
> > -               r = -EFAULT;
> > -       }
> > -
> > +       if (get_page_unless_zero(page))
> > +               WARN_ON_ONCE(kvm_follow_refcounted_pfn(foll, page) != pfn);
> >  out:
> >         pte_unmap_unlock(ptep, ptl);
> > -       *p_pfn = pfn;
> > +
> > +       if (!foll->is_refcounted_page && !foll->guarded_by_mmu_notifier &&
> > +           !allow_unsafe_mappings)
> > +               r = -EFAULT;
> > +       else
> > +               *p_pfn = pfn;
> >
> >         return r;
> >  }
> >
> 
> As I pointed out above, this suggestion is broken because a FOLL_GET
> && !guarded_by_mmu_notifier request (e.g. kvm_vcpu_map) for a
> non-refcounted page will result in the refcount eventually being
> corrupted.

I don't think so, unless I'm misunderstanding the concern.  It just wasn't a
complete patch, and wasn't intended to be.

> What do you think of this implementation? If it makes sense, I can
> send out an updated patch series.
>
> /*
>  * If FOLL_GET is set, then the caller wants us to take a reference to
>  * keep the pfn alive. If FOLL_GET isn't set, then __kvm_follow_pfn
>  * guarantees that guarded_by_mmu_notifier is set, so there aren't any
>  * use-after-free concerns.
>  */
> page = kvm_pfn_to_refcounted_page(pfn);
> if (page) {
>         if (get_page_unless_zero(page)) {
>                 WARN_ON_ONCE(kvm_follow_refcounted_pfn(foll, page) != pfn);
>         } else if (foll->flags & FOLL_GET) {
>                 /*
>                  * Certain IO or PFNMAP mappings can be backed with
>                  * valid struct pages but be allocated without
>                  * refcounting e.g., tail pages of non-compound higher
>                  * order allocations. The caller asked for a ref, but
>                  * we can't take one, since releasing such a ref would
>                  * free the page.
>                  */
>                 r = -EFAULT;
>         }
> } else if (foll->flags & FOLL_GET) {
>         /*
>          * When there's no struct page to refcount and no MMU notifier,
>          * then KVM can't be guaranteed to avoid use-after-free. However,
>          * there are valid reasons to set up such mappings. If userspace
>          * is trusted and willing to forego kernel safety guarantees,
>          * allow this check to be bypassed.
>          */
>         if (foll->guarded_by_mmu_notifier && !allow_unsafe_mappings)

I assume you mean:

	if (!foll->guarded_by_mmu_notifier && !allow_unsafe_mappings)

>                 r = -EFAULT;
> }

Please no.  I don't want to overload FOLL_GET or have dependencies between
FOLL_GET and guarded_by_mmu_notifier.  The guest page fault path should be able
to omit FOLL_GET, and it obviously should set guarded_by_mmu_notifier.  And
kvm_vcpu_map() should be able to set FOLL_GET, omit guarded_by_mmu_notifier, _and_
play nice with "unsafe", non-refcounted memory when allow_unsafe_mappings is true.

The above fits your use case because nothing in KVM needs to do kvm_vcpu_map()
on the GPU's buffer, but as a general solution the semantics are very odd.  E.g.
in long form, they are:

  Get a reference if there's a refcounted page, fail with -EFAULT if there's
  a struct page but it's not refcounted, and fail with -EFAULT if there's no struct
  page unless the caller is protected by mmu_notifiers or unsafe mappings are allowed.

That's rather bonkers.  What I instead want is FOLL_GET to be:

  Get a reference if there's a refcounted page.

And then allow_unsafe_mappings is simply:

  Allow mapping non-refcounted memory into the guest even if the mapping isn't
  guarded by mmu_notifier events.
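
Put together with the kvm_follow_refcounted_pfn() helper from the earlier (untested) diff, the tail of hva_to_pfn_remapped() would then look roughly like the following sketch:

	/* Take a reference only if the page is actually refcounted. */
	page = kvm_pfn_to_refcounted_page(pfn);
	if (page && get_page_unless_zero(page))
		WARN_ON_ONCE(kvm_follow_refcounted_pfn(foll, page) != pfn);

	/*
	 * Hand back a non-refcounted pfn only if the caller is plugged into
	 * the mmu notifiers, or if userspace explicitly opted in to unsafe
	 * mappings.
	 */
	if (!foll->is_refcounted_page && !foll->guarded_by_mmu_notifier &&
	    !allow_unsafe_mappings)
		r = -EFAULT;
	else
		*p_pfn = pfn;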
David Stevens Sept. 6, 2023, 3:24 a.m. UTC | #10
On Wed, Sep 6, 2023 at 9:45 AM Sean Christopherson <seanjc@google.com> wrote:
>
> On Tue, Sep 05, 2023, David Stevens wrote:
> > On Wed, Jul 12, 2023 at 7:00 AM Sean Christopherson <seanjc@google.com> wrote:
> > >
> > > On Tue, Jul 11, 2023, Zhi Wang wrote:
> > > > On Thu, 6 Jul 2023 15:49:39 +0900
> > > > David Stevens <stevensd@chromium.org> wrote:
> > > >
> > > > > On Wed, Jul 5, 2023 at 10:19 PM Zhi Wang <zhi.wang.linux@gmail.com> wrote:
> > > > > >
> > > > > > On Tue,  4 Jul 2023 16:50:48 +0900
> > > > > > David Stevens <stevensd@chromium.org> wrote:
> > > > > > If yes, do we have to use FOLL_GET to resolve GFN associated with a tail page?
> > > > > > It seems gup can tolerate gup_flags without FOLL_GET, but it is more like a
> > > > > > temporary solution. I don't think it is a good idea to play tricks with
> > > > > > a temporary solution, more like we are abusing the toleration.
> > > > >
> > > > > I'm not sure I understand what you're getting at. This series never
> > > > > calls gup without FOLL_GET.
> > > > >
> > > > > This series aims to provide kvm_follow_pfn as a unified API on top of
> > > > > gup+follow_pte. Since one of the major clients of this API uses an mmu
> > > > > notifier, it makes sense to support returning a pfn without taking a
> > > > > reference. And we indeed need to do that for certain types of memory.
> > > > >
> > > >
> > > > I don't have a problem with taking a pfn without taking a ref. I am
> > > > questioning if using !FOLL_GET in struct kvm_follow_pfn to indicate taking
> > > > a pfn without a ref is a good idea or not, while there is another flag
> > > > actually showing it.
> > > >
> > > > I can understand that using FOLL_XXX in kvm_follow_pfn saves some
> > > > translation between struct kvm_follow_pfn.{write, async, xxxx} and GUP
> > > > flags. However FOLL_XXX is for GUP. Using FOLL_XXX for reflecting the
> > > > requirements of GUP in the code path that is going to call GUP is reasonable.
> > > >
> > > > But using FOLL_XXX with purposes that are not related to GUP call really
> > > > feels off.
> > >
> > > I agree, assuming you're talking specifically about the logic in hva_to_pfn_remapped()
> > > that handles non-refcounted pages, i.e. this
> > >
> > >         if (get_page_unless_zero(page)) {
> > >                 foll->is_refcounted_page = true;
> > >                 if (!(foll->flags & FOLL_GET))
> > >                         put_page(page);
> > >         } else if (foll->flags & FOLL_GET) {
> > >                 r = -EFAULT;
> > >         }
> > >
> > > should be
> > >
> > >         if (get_page_unless_zero(page)) {
> > >                 foll->is_refcounted_page = true;
> > >                 if (!(foll->flags & FOLL_GET))
> > >                         put_page(page);
> > >         } else if (!foll->guarded_by_mmu_notifier)
> > >                 r = -EFAULT;
> > >
> > > because it's not the desire to grab a reference that makes getting non-refcounted
> > > pfns "safe", it's whether or not the caller is plugged into the MMU notifiers.
> > >
> > > Though that highlights that checking guarded_by_mmu_notifier should be done for
> > > *all* non-refcounted pfns, not just non-refcounted struct page memory.
> >
> > I think things are getting confused here because there are multiple
> > things which "safe" refers to. There are three different definitions
> > that I think are relevant here:
> >
> > 1) "safe" in the sense that KVM doesn't corrupt page reference counts
> > 2) "safe" in the sense that KVM doesn't access pfns after they have been freed
> > 3) "safe" in the sense that KVM doesn't use stale hva -> pfn translations
> >
> > For property 1, FOLL_GET is important. If the caller passes FOLL_GET,
> > then they expect to be able to pass the returned pfn to
> > kvm_release_pfn. This means that when FOLL_GET is set, if
> > kvm_pfn_to_refcounted_page returns a page, then hva_to_pfn_remapped
> > must take a reference count to avoid eventually corrupting the page
> > ref count. I guess replacing the FOLL_GET check with
> > !guarded_by_mmu_notifier is logically equivalent because
> > __kvm_follow_pfn requires that at least one of guarded_by_mmu_notifier
> > and FOLL_GET is set. But since we're concerned about a property of the
> > refcount, I think that checking FOLL_GET is clearer.
> >
> > For property 2, FOLL_GET is also important. If guarded_by_mmu_notifier
> > is set, then we're all good here. If guarded_by_mmu_notifier is not
> > set, then the check in __kvm_follow_pfn guarantees that FOLL_GET is
> > set. For struct page memory, we're safe because KVM will hold a
> > reference as long as it's still using the page. For non struct page
> > memory, we're not safe - this is where the breaking change of
> > allow_unsafe_mappings would go. Note that for non-refcounted struct
> > page, we can't use the allow_unsafe_mappings escape hatch. Since
> > FOLL_GET was requested, if we returned such a page, then the caller
> > would eventually corrupt the page refcount via kvm_release_pfn.
>
> Yes we can.  The caller simply needs to be made aware of is_refcounted_page.   I
> didn't include that in the snippet below because I didn't want to write the entire
> patch.  The whole point of adding is_refcounted_page is so that callers can
> identify exactly what type of page was at the end of the trail that was followed.

Are you asking me to completely migrate every caller of any gfn_to_pfn
variant to __kvm_follow_pfn, so that they can respect
is_refcounted_page? That's the only way to make it safe for
allow_unsafe_mappings to apply to non-refcounted pages. That is
decidedly not simple. Or is kvm_vcpu_map the specific call site you
care about? At best, I can try to migrate x86, and then just add some
sort of compatibility shim for other architectures that rejects
non-refcounted pages.

> > Property 3 would be nice, but we've already concluded that guarding
> > all translations with mmu notifiers is infeasible. So maintaining
> > property 2 is the best we can hope for.
>
> No, #3 is just a variant of #2.  Unless you're talking about not making guarantees
> about guest accesses being ordered with respect to VMA/memslot updates, but I
> don't think that's the case.

I'm talking about the fact that kvm_vcpu_map is busted with respect to
VMA updates. It won't corrupt host memory because the
mapping keeps a reference to the page, but it will continue to use
stale translations. From [1], it sounds like you've granted that
fixing that is not feasible, so I just wanted to make sure that this
isn't the "unsafe" referred to by allow_unsafe_mappings.

[1] https://lore.kernel.org/all/ZBEEQtmtNPaEqU1i@google.com/

> > > As for the other usage of FOLL_GET in this series (using it to conditionally do
> > > put_page()), IMO that's very much related to the GUP call.  Invoking put_page()
> > > is a hack to work around the fact that GUP doesn't provide a way to get the pfn
> > > without grabbing a reference to the page.  In an ideal world, KVM would NOT pass
> > > FOLL_GET to the various GUP helpers, i.e. FOLL_GET would be passed as-is and KVM
> > > wouldn't "need" to kinda sorta overload FOLL_GET to manually drop the reference.
> > >
> > > I do think it's worth providing a helper to consolidate and document that hacky
> > > code, e.g. add a kvm_follow_refcounted_pfn() helper.
> > >
> > > All in all, I think the below (completely untested) is what we want?
> > >
> > > David (and others), I am planning on doing a full review of this series "soon",
> > > but it will likely be a few weeks until that happens.  I jumped in on this
> > > specific thread because this caught my eye and I really don't want to throw out
> > > *all* of the FOLL_GET usage.
> > >
> > > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> > > index 5b5afd70f239..90d424990e0a 100644
> > > --- a/virt/kvm/kvm_main.c
> > > +++ b/virt/kvm/kvm_main.c
> > > @@ -2481,6 +2481,25 @@ static inline int check_user_page_hwpoison(unsigned long addr)
> > >         return rc == -EHWPOISON;
> > >  }
> > >
> > > +static kvm_pfn_t kvm_follow_refcounted_pfn(struct kvm_follow_pfn *foll,
> > > +                                          struct page *page)
> > > +{
> > > +       kvm_pfn_t pfn = page_to_pfn(page);
> > > +
> > > +       foll->is_refcounted_page = true;
> > > +
> > > +       /*
> > > +        * FIXME: Ideally, KVM wouldn't pass FOLL_GET to gup() when the caller
> > > +        * doesn't want to grab a reference, but gup() doesn't support getting
> > > +        * just the pfn, i.e. FOLL_GET is effectively mandatory.  If that ever
> > > +        * changes, drop this and simply don't pass FOLL_GET to gup().
> > > +        */
> > > +       if (!(foll->flags & FOLL_GET))
> > > +               put_page(page);
> > > +
> > > +       return pfn;
> > > +}
> > > +
> > >  /*
> > >   * The fast path to get the writable pfn which will be stored in @pfn,
> > >   * true indicates success, otherwise false is returned.  It's also the
> > > @@ -2500,11 +2519,9 @@ static bool hva_to_pfn_fast(struct kvm_follow_pfn *foll, kvm_pfn_t *pfn)
> > >                 return false;
> > >
> > >         if (get_user_page_fast_only(foll->hva, FOLL_WRITE, page)) {
> > > -               *pfn = page_to_pfn(page[0]);
> > >                 foll->writable = foll->allow_write_mapping;
> > > -               foll->is_refcounted_page = true;
> > > -               if (!(foll->flags & FOLL_GET))
> > > -                       put_page(page[0]);
> > > +
> > > +               *pfn = kvm_follow_refcounted_pfn(foll, page[0]);
> > >                 return true;
> > >         }
> > >
> > > @@ -2528,7 +2545,6 @@ static int hva_to_pfn_slow(struct kvm_follow_pfn *foll, kvm_pfn_t *pfn)
> > >                 return npages;
> > >
> > >         foll->writable = (foll->flags & FOLL_WRITE) && foll->allow_write_mapping;
> > > -       foll->is_refcounted_page = true;
> > >
> > >         /* map read fault as writable if possible */
> > >         if (unlikely(!foll->writable) && foll->allow_write_mapping) {
> > > @@ -2540,9 +2556,8 @@ static int hva_to_pfn_slow(struct kvm_follow_pfn *foll, kvm_pfn_t *pfn)
> > >                         page = wpage;
> > >                 }
> > >         }
> > > -       *pfn = page_to_pfn(page);
> > > -       if (!(foll->flags & FOLL_GET))
> > > -               put_page(page);
> > > +
> > > +       *pfn = kvm_follow_refcounted_pfn(foll, page);
> > >         return npages;
> > >  }
> > >
> > > @@ -2610,17 +2625,16 @@ static int hva_to_pfn_remapped(struct vm_area_struct *vma, struct kvm_follow_pfn
> > >         if (!page)
> > >                 goto out;
> > >
> > > -       if (get_page_unless_zero(page)) {
> > > -               foll->is_refcounted_page = true;
> > > -               if (!(foll->flags & FOLL_GET))
> > > -                       put_page(page);
> > > -       } else if (foll->flags & FOLL_GET) {
> > > -               r = -EFAULT;
> > > -       }
> > > -
> > > +       if (get_page_unless_zero(page))
> > > +               WARN_ON_ONCE(kvm_follow_refcounted_pfn(foll, page) != pfn);
> > >  out:
> > >         pte_unmap_unlock(ptep, ptl);
> > > -       *p_pfn = pfn;
> > > +
> > > +       if (!foll->is_refcounted_page && !foll->guarded_by_mmu_notifier &&
> > > +           !allow_unsafe_mappings)
> > > +               r = -EFAULT;
> > > +       else
> > > +               *p_pfn = pfn;
> > >
> > >         return r;
> > >  }
> > >
> >
> > As I pointed out above, this suggestion is broken because a FOLL_GET
> > && !guarded_by_mmu_notifier request (e.g. kvm_vcpu_map) for a
> > non-refcounted page will result in the refcount eventually being
> > corrupted.
>
> I don't think so, unless I'm misunderstanding the concern.  It just wasn't a
> complete patch, and wasn't intended to be.

I guess I misunderstood the scale of the changes you're suggesting.

-David
Sean Christopherson Sept. 6, 2023, 10:03 p.m. UTC | #11
On Wed, Sep 06, 2023, David Stevens wrote:
> On Wed, Sep 6, 2023 at 9:45 AM Sean Christopherson <seanjc@google.com> wrote:
> >
> > On Tue, Sep 05, 2023, David Stevens wrote:
> > > For property 2, FOLL_GET is also important. If guarded_by_mmu_notifier
> > > is set, then we're all good here. If guarded_by_mmu_notifier is not
> > > set, then the check in __kvm_follow_pfn guarantees that FOLL_GET is
> > > set. For struct page memory, we're safe because KVM will hold a
> > > reference as long as it's still using the page. For non struct page
> > > memory, we're not safe - this is where the breaking change of
> > > allow_unsafe_mappings would go. Note that for non-refcounted struct
> > > page, we can't use the allow_unsafe_mappings escape hatch. Since
> > > FOLL_GET was requested, if we returned such a page, then the caller
> > > would eventually corrupt the page refcount via kvm_release_pfn.
> >
> > Yes we can.  The caller simply needs to be made aware of is_refcounted_page.   I
> > didn't include that in the snippet below because I didn't want to write the entire
> > patch.  The whole point of adding is_refcounted_page is so that callers can
> > identify exactly what type of page was at the end of the trail that was followed.
> 
> Are you asking me to completely migrate every caller of any gfn_to_pfn
> variant to __kvm_follow_pfn, so that they can respect
> is_refcounted_page? That's the only way to make it safe for
> allow_unsafe_mappings to apply to non-refcounted pages. That is
> decidedly not simple. Or is kvm_vcpu_map the specific call site you
> care about? At best, I can try to migrate x86, and then just add some
> sort of compatibility shim for other architectures that rejects
> non-refcounted pages.

Ah, I see your conundrum.  No, I don't think it's reasonable to require you to
convert all users in every architecture.  I'll still ask, just in case you're
feeling generous, but it's not a requirement :-)

The easiest way forward I can think of is to add yet another flag to kvm_follow_pfn,
e.g. allow_non_refcounted_struct_page, to communicate whether or not the caller
has been enlightened to play nice with non-refcounted struct page memory.  We'll
need that flag no matter what, otherwise we'd have to convert all users in a single
patch (LOL).  Then your series can simply stop at a reasonable point, e.g. convert
all x86 usage (including kvm_vcpu_map()), and leave converting everything else to
future work.

E.g. I think this would be the outro of hva_to_pfn_remapped():

        if (!page)
                goto out;

        if (get_page_unless_zero(page))
                WARN_ON_ONCE(kvm_follow_refcounted_pfn(foll, page) != pfn);
 out:
        pte_unmap_unlock(ptep, ptl);

	/*
	 * TODO: Drop allow_non_refcounted_struct_page once all callers have
	 * been taught to play nice with non-refcounted tail pages.
	 */
	if (page && !foll->is_refcounted_page &&
	    !foll->allow_non_refcounted_struct_page)
		r = -EFAULT;
	else if (!foll->is_refcounted_page && !foll->guarded_by_mmu_notifier &&
		 !allow_unsafe_mappings)
		r = -EFAULT;
	else
		*p_pfn = pfn;

        return r;
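
For the input side, a rough sketch of where such a knob might live in struct
kvm_follow_pfn (flag name as above; purely illustrative):

	/*
	 * Opt-in for callers that consume is_refcounted_page and can cope
	 * with non-refcounted struct page memory.
	 */
	bool allow_non_refcounted_struct_page;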

> > > Property 3 would be nice, but we've already concluded that guarding
> > > all translations with mmu notifiers is infeasible. So maintaining
> > > property 2 is the best we can hope for.
> >
> > No, #3 is just a variant of #2.  Unless you're talking about not making guarantees
> > about guest accesses being ordered with respect to VMA/memslot updates, but I
> > don't think that's the case.
> 
> I'm talking about the fact that kvm_vcpu_map is busted with respect to
> VMA updates. It won't corrupt host memory because the
> mapping keeps a reference to the page, but it will continue to use
> stale translations.

True.  But barring some crazy paravirt use case, userspace modifying a mapping
that is in active use is inherently broken; the guest will have no idea that memory
just got yanked away.

Hmm, though I suppose userspace could theoretically mprotect() a mapping to be
read-only, which would "work" for mmu_notifier paths but not kvm_vcpu_map().  But
KVM doesn't provide enough information on -EFAULT for userspace to do anything in
response to a write to read-only memory, so in practice that's likely inherently
broken too.

> From [1], it sounds like you've granted that fixing that is not feasible, so
> I just wanted to make sure that this isn't the "unsafe" referred to by
> allow_unsafe_mappings.

Right, this is not the "unsafe" I'm referring to.

> [1] https://lore.kernel.org/all/ZBEEQtmtNPaEqU1i@google.com/

Patch

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index ef2763c2b12e..a45308c7d2d9 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1157,6 +1157,9 @@  unsigned long gfn_to_hva_memslot_prot(struct kvm_memory_slot *slot, gfn_t gfn,
 void kvm_release_page_clean(struct page *page);
 void kvm_release_page_dirty(struct page *page);
 
+void kvm_set_page_accessed(struct page *page);
+void kvm_set_page_dirty(struct page *page);
+
 struct kvm_follow_pfn {
 	const struct kvm_memory_slot *slot;
 	gfn_t gfn;
@@ -1164,10 +1167,17 @@  struct kvm_follow_pfn {
 	bool atomic;
 	/* Allow a read fault to create a writeable mapping. */
 	bool allow_write_mapping;
+	/*
+	 * Usage of the returned pfn will be guarded by a mmu notifier. Must
+	 * be true if FOLL_GET is not set.
+	 */
+	bool guarded_by_mmu_notifier;
 
 	/* Outputs of __kvm_follow_pfn */
 	hva_t hva;
 	bool writable;
+	/* True if the returned pfn is for a page with a valid refcount. */
+	bool is_refcounted_page;
 };
 
 kvm_pfn_t __kvm_follow_pfn(struct kvm_follow_pfn *foll);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index b13f22861d2f..0f7b41f220b6 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2502,6 +2502,9 @@  static bool hva_to_pfn_fast(struct kvm_follow_pfn *foll, kvm_pfn_t *pfn)
 	if (get_user_page_fast_only(foll->hva, FOLL_WRITE, page)) {
 		*pfn = page_to_pfn(page[0]);
 		foll->writable = foll->allow_write_mapping;
+		foll->is_refcounted_page = true;
+		if (!(foll->flags & FOLL_GET))
+			put_page(page[0]);
 		return true;
 	}
 
@@ -2525,6 +2528,7 @@  static int hva_to_pfn_slow(struct kvm_follow_pfn *foll, kvm_pfn_t *pfn)
 		return npages;
 
 	foll->writable = (foll->flags & FOLL_WRITE) && foll->allow_write_mapping;
+	foll->is_refcounted_page = true;
 
 	/* map read fault as writable if possible */
 	if (unlikely(!foll->writable) && foll->allow_write_mapping) {
@@ -2537,6 +2541,8 @@  static int hva_to_pfn_slow(struct kvm_follow_pfn *foll, kvm_pfn_t *pfn)
 		}
 	}
 	*pfn = page_to_pfn(page);
+	if (!(foll->flags & FOLL_GET))
+		put_page(page);
 	return npages;
 }
 
@@ -2551,16 +2557,6 @@  static bool vma_is_valid(struct vm_area_struct *vma, bool write_fault)
 	return true;
 }
 
-static int kvm_try_get_pfn(kvm_pfn_t pfn)
-{
-	struct page *page = kvm_pfn_to_refcounted_page(pfn);
-
-	if (!page)
-		return 1;
-
-	return get_page_unless_zero(page);
-}
-
 static int hva_to_pfn_remapped(struct vm_area_struct *vma, struct kvm_follow_pfn *foll,
 			       kvm_pfn_t *p_pfn)
 {
@@ -2568,6 +2564,7 @@  static int hva_to_pfn_remapped(struct vm_area_struct *vma, struct kvm_follow_pfn
 	pte_t *ptep;
 	spinlock_t *ptl;
 	bool write_fault = foll->flags & FOLL_WRITE;
+	struct page *page;
 	int r;
 
 	r = follow_pte(vma->vm_mm, foll->hva, &ptep, &ptl);
@@ -2599,24 +2596,27 @@  static int hva_to_pfn_remapped(struct vm_area_struct *vma, struct kvm_follow_pfn
 	pfn = pte_pfn(*ptep);
 
 	/*
-	 * Get a reference here because callers of *hva_to_pfn* and
-	 * *gfn_to_pfn* ultimately call kvm_release_pfn_clean on the
-	 * returned pfn.  This is only needed if the VMA has VM_MIXEDMAP
-	 * set, but the kvm_try_get_pfn/kvm_release_pfn_clean pair will
-	 * simply do nothing for reserved pfns.
-	 *
-	 * Whoever called remap_pfn_range is also going to call e.g.
-	 * unmap_mapping_range before the underlying pages are freed,
-	 * causing a call to our MMU notifier.
+	 * Now deal with reference counting. If kvm_pfn_to_refcounted_page
+	 * returns NULL, then there's no refcount to worry about.
 	 *
-	 * Certain IO or PFNMAP mappings can be backed with valid
-	 * struct pages, but be allocated without refcounting e.g.,
-	 * tail pages of non-compound higher order allocations, which
-	 * would then underflow the refcount when the caller does the
-	 * required put_page. Don't allow those pages here.
+	 * Otherwise, certain IO or PFNMAP mappings can be backed with valid
+	 * struct pages but be allocated without refcounting e.g., tail pages of
+	 * non-compound higher order allocations. If FOLL_GET is set and we
+	 * increment such a refcount, then when that pfn is eventually passed to
+	 * kvm_release_pfn_clean, its refcount would hit zero and be incorrectly
+	 * freed. Therefore don't allow those pages here when FOLL_GET is set.
 	 */ 
-	if (!kvm_try_get_pfn(pfn))
+	page = kvm_pfn_to_refcounted_page(pfn);
+	if (!page)
+		goto out;
+
+	if (get_page_unless_zero(page)) {
+		foll->is_refcounted_page = true;
+		if (!(foll->flags & FOLL_GET))
+			put_page(page);
+	} else if (foll->flags & FOLL_GET) {
 		r = -EFAULT;
+	}
 
 out:
 	pte_unmap_unlock(ptep, ptl);
@@ -2693,6 +2693,9 @@  kvm_pfn_t hva_to_pfn(struct kvm_follow_pfn *foll)
 
 kvm_pfn_t __kvm_follow_pfn(struct kvm_follow_pfn *foll)
 {
+	if (WARN_ON_ONCE(!(foll->flags & FOLL_GET) && !foll->guarded_by_mmu_notifier))
+		return KVM_PFN_ERR_FAULT;
+
 	foll->hva = __gfn_to_hva_many(foll->slot, foll->gfn, NULL,
 				      foll->flags & FOLL_WRITE);
 
@@ -2717,7 +2720,7 @@  kvm_pfn_t __gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn,
 	struct kvm_follow_pfn foll = {
 		.slot = slot,
 		.gfn = gfn,
-		.flags = 0,
+		.flags = FOLL_GET,
 		.atomic = atomic,
 		.allow_write_mapping = !!writable,
 	};
@@ -2749,7 +2752,7 @@  kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
 	struct kvm_follow_pfn foll = {
 		.slot = gfn_to_memslot(kvm, gfn),
 		.gfn = gfn,
-		.flags = write_fault ? FOLL_WRITE : 0,
+		.flags = FOLL_GET | (write_fault ? FOLL_WRITE : 0),
 		.allow_write_mapping = !!writable,
 	};
 	pfn = __kvm_follow_pfn(&foll);
@@ -2764,7 +2767,7 @@  kvm_pfn_t gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn)
 	struct kvm_follow_pfn foll = {
 		.slot = slot,
 		.gfn = gfn,
-		.flags = FOLL_WRITE,
+		.flags = FOLL_GET | FOLL_WRITE,
 	};
 	return __kvm_follow_pfn(&foll);
 }
@@ -2775,7 +2778,7 @@  kvm_pfn_t gfn_to_pfn_memslot_atomic(const struct kvm_memory_slot *slot, gfn_t gf
 	struct kvm_follow_pfn foll = {
 		.slot = slot,
 		.gfn = gfn,
-		.flags = FOLL_WRITE,
+		.flags = FOLL_GET | FOLL_WRITE,
 		.atomic = true,
 	};
 	return __kvm_follow_pfn(&foll);
@@ -2930,17 +2933,19 @@  static bool kvm_is_ad_tracked_page(struct page *page)
 	return !PageReserved(page);
 }
 
-static void kvm_set_page_dirty(struct page *page)
+void kvm_set_page_dirty(struct page *page)
 {
 	if (kvm_is_ad_tracked_page(page))
 		SetPageDirty(page);
 }
+EXPORT_SYMBOL_GPL(kvm_set_page_dirty);
 
-static void kvm_set_page_accessed(struct page *page)
+void kvm_set_page_accessed(struct page *page)
 {
 	if (kvm_is_ad_tracked_page(page))
 		mark_page_accessed(page);
 }
+EXPORT_SYMBOL_GPL(kvm_set_page_accessed);
 
 void kvm_release_page_clean(struct page *page)
 {
diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c
index e3fefa753a51..87caafce3dd0 100644
--- a/virt/kvm/pfncache.c
+++ b/virt/kvm/pfncache.c
@@ -147,7 +147,7 @@  static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc)
 	struct kvm_follow_pfn foll = {
 		.slot = gpc->memslot,
 		.gfn = gpa_to_gfn(gpc->gpa),
-		.flags = FOLL_WRITE,
+		.flags = FOLL_WRITE | FOLL_GET,
 		.hva = gpc->uhva,
 	};