diff mbox series

[v3,kvm/queue,14/16] KVM: Handle page fault for private memory

Message ID 20211223123011.41044-15-chao.p.peng@linux.intel.com
State New
Headers show
Series KVM: mm: fd-based approach for supporting KVM guest private memory | expand

Commit Message

Chao Peng Dec. 23, 2021, 12:30 p.m. UTC
When a page fault from the secondary page table happens, while the guest
is running, in a memslot with KVM_MEM_PRIVATE, we need to take
different paths for private access and shared access.

  - For private access, KVM checks if the page is already allocated in
    the memory backend, if yes KVM establishes the mapping, otherwise
    exits to userspace to convert a shared page to private one.

  - For shared access, KVM also checks if the page is already allocated
    in the memory backend, if yes then exit to userspace to convert a
    private page to shared one, otherwise it's treated as a traditional
    hva-based shared memory, KVM lets existing code to obtain a pfn with
    get_user_pages() and establish the mapping.

The above code assumes private memory is persistent and pre-allocated in
the memory backend, so KVM can use this information as an indicator of
whether a page is private or shared. The above check is performed by
calling kvm_memfd_get_pfn(), which is currently implemented as a
pagecache search but in theory could be implemented differently
(e.g. when the page is not even mapped into the host pagecache, a
different implementation would be needed).

Signed-off-by: Yu Zhang <yu.c.zhang@linux.intel.com>
Signed-off-by: Chao Peng <chao.p.peng@linux.intel.com>
---
 arch/x86/kvm/mmu/mmu.c         | 73 ++++++++++++++++++++++++++++++++--
 arch/x86/kvm/mmu/paging_tmpl.h | 11 +++--
 2 files changed, 77 insertions(+), 7 deletions(-)

Comments

Yan Zhao Jan. 4, 2022, 1:46 a.m. UTC | #1
On Thu, Dec 23, 2021 at 08:30:09PM +0800, Chao Peng wrote:
> When a page fault from the secondary page table while the guest is
> running happens in a memslot with KVM_MEM_PRIVATE, we need go
> different paths for private access and shared access.
> 
>   - For private access, KVM checks if the page is already allocated in
>     the memory backend, if yes KVM establishes the mapping, otherwise
>     exits to userspace to convert a shared page to private one.
>
Will this conversion be atomic or not?
For example, after punching a hole in a private memory slot, will KVM
see two notifications: one for invalidation of the whole private memory
slot, and one for fallocate of the rest ranges besides the hole?
Or, KVM only sees one invalidation notification for the hole?
Could you please show QEMU code about this conversion?


>   - For shared access, KVM also checks if the page is already allocated
>     in the memory backend, if yes then exit to userspace to convert a
>     private page to shared one, otherwise it's treated as a traditional
>     hva-based shared memory, KVM lets existing code to obtain a pfn with
>     get_user_pages() and establish the mapping.
> 
> The above code assume private memory is persistent and pre-allocated in
> the memory backend so KVM can use this information as an indicator for
> a page is private or shared. The above check is then performed by
> calling kvm_memfd_get_pfn() which currently is implemented as a
> pagecache search but in theory that can be implemented differently
> (i.e. when the page is even not mapped into host pagecache there should
> be some different implementation).
> 
> Signed-off-by: Yu Zhang <yu.c.zhang@linux.intel.com>
> Signed-off-by: Chao Peng <chao.p.peng@linux.intel.com>
> ---
>  arch/x86/kvm/mmu/mmu.c         | 73 ++++++++++++++++++++++++++++++++--
>  arch/x86/kvm/mmu/paging_tmpl.h | 11 +++--
>  2 files changed, 77 insertions(+), 7 deletions(-)
> 
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 2856eb662a21..fbcdf62f8281 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -2920,6 +2920,9 @@ int kvm_mmu_max_mapping_level(struct kvm *kvm,
>  	if (max_level == PG_LEVEL_4K)
>  		return PG_LEVEL_4K;
>  
> +	if (kvm_slot_is_private(slot))
> +		return max_level;
> +
>  	host_level = host_pfn_mapping_level(kvm, gfn, pfn, slot);
>  	return min(host_level, max_level);
>  }
> @@ -3950,7 +3953,59 @@ static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
>  				  kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch);
>  }
>  
> -static bool kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, int *r)
> +static bool kvm_vcpu_is_private_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
> +{
> +	/*
> +	 * At this time private gfn has not been supported yet. Other patch
> +	 * that enables it should change this.
> +	 */
> +	return false;
> +}
> +
> +static bool kvm_faultin_pfn_private(struct kvm_vcpu *vcpu,
> +				    struct kvm_page_fault *fault,
> +				    bool *is_private_pfn, int *r)
> +{
> +	int order;
> +	int mem_convert_type;
> +	struct kvm_memory_slot *slot = fault->slot;
> +	long pfn = kvm_memfd_get_pfn(slot, fault->gfn, &order);
For private memory slots, it's possible to have pfns backed by
backends other than memfd, e.g. a device fd. So is it possible to let those
private memslots stay private and use the traditional hva-based way?
Reasons below:
1. Only memfd is supported in this patch set.
2. QEMU/host reads/writes to those private memslots backed by a device fd may
not cause a machine check.

Thanks
Yan


> +
> +	if (kvm_vcpu_is_private_gfn(vcpu, fault->addr >> PAGE_SHIFT)) {
> +		if (pfn < 0)
> +			mem_convert_type = KVM_EXIT_MEM_MAP_PRIVATE;
> +		else {
> +			fault->pfn = pfn;
> +			if (slot->flags & KVM_MEM_READONLY)
> +				fault->map_writable = false;
> +			else
> +				fault->map_writable = true;
> +
> +			if (order == 0)
> +				fault->max_level = PG_LEVEL_4K;
> +			*is_private_pfn = true;
> +			*r = RET_PF_FIXED;
> +			return true;
> +		}
> +	} else {
> +		if (pfn < 0)
> +			return false;
> +
> +		kvm_memfd_put_pfn(pfn);
> +		mem_convert_type = KVM_EXIT_MEM_MAP_SHARED;
> +	}
> +
> +	vcpu->run->exit_reason = KVM_EXIT_MEMORY_ERROR;
> +	vcpu->run->mem.type = mem_convert_type;
> +	vcpu->run->mem.u.map.gpa = fault->gfn << PAGE_SHIFT;
> +	vcpu->run->mem.u.map.size = PAGE_SIZE;
> +	fault->pfn = -1;
> +	*r = -1;
> +	return true;
> +}
> +
> +static bool kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
> +			    bool *is_private_pfn, int *r)
>  {
>  	struct kvm_memory_slot *slot = fault->slot;
>  	bool async;
> @@ -3984,6 +4039,10 @@ static bool kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
>  		}
>  	}
>  
> +	if (kvm_slot_is_private(slot) &&
> +	    kvm_faultin_pfn_private(vcpu, fault, is_private_pfn, r))
> +		return *r == RET_PF_FIXED ? false : true;
> +
>  	async = false;
>  	fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, &async,
>  					  fault->write, &fault->map_writable,
> @@ -4044,6 +4103,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
>  	bool is_tdp_mmu_fault = is_tdp_mmu(vcpu->arch.mmu);
>  
>  	unsigned long mmu_seq;
> +	bool is_private_pfn = false;
>  	int r;
>  
>  	fault->gfn = fault->addr >> PAGE_SHIFT;
> @@ -4063,7 +4123,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
>  	mmu_seq = vcpu->kvm->mmu_notifier_seq;
>  	smp_rmb();
>  
> -	if (kvm_faultin_pfn(vcpu, fault, &r))
> +	if (kvm_faultin_pfn(vcpu, fault, &is_private_pfn, &r))
>  		return r;
>  
>  	if (handle_abnormal_pfn(vcpu, fault, ACC_ALL, &r))
> @@ -4076,7 +4136,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
>  	else
>  		write_lock(&vcpu->kvm->mmu_lock);
>  
> -	if (is_page_fault_stale(vcpu, fault, mmu_seq))
> +	if (!is_private_pfn && is_page_fault_stale(vcpu, fault, mmu_seq))
>  		goto out_unlock;
>  
>  	r = make_mmu_pages_available(vcpu);
> @@ -4093,7 +4153,12 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
>  		read_unlock(&vcpu->kvm->mmu_lock);
>  	else
>  		write_unlock(&vcpu->kvm->mmu_lock);
> -	kvm_release_pfn_clean(fault->pfn);
> +
> +	if (is_private_pfn)
> +		kvm_memfd_put_pfn(fault->pfn);
> +	else
> +		kvm_release_pfn_clean(fault->pfn);
> +
>  	return r;
>  }
>  
> diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
> index 5b5bdac97c7b..640fd1e2fe4c 100644
> --- a/arch/x86/kvm/mmu/paging_tmpl.h
> +++ b/arch/x86/kvm/mmu/paging_tmpl.h
> @@ -825,6 +825,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
>  	int r;
>  	unsigned long mmu_seq;
>  	bool is_self_change_mapping;
> +	bool is_private_pfn = false;
> +
>  
>  	pgprintk("%s: addr %lx err %x\n", __func__, fault->addr, fault->error_code);
>  	WARN_ON_ONCE(fault->is_tdp);
> @@ -873,7 +875,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
>  	mmu_seq = vcpu->kvm->mmu_notifier_seq;
>  	smp_rmb();
>  
> -	if (kvm_faultin_pfn(vcpu, fault, &r))
> +	if (kvm_faultin_pfn(vcpu, fault, &is_private_pfn, &r))
>  		return r;
>  
>  	if (handle_abnormal_pfn(vcpu, fault, walker.pte_access, &r))
> @@ -901,7 +903,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
>  	r = RET_PF_RETRY;
>  	write_lock(&vcpu->kvm->mmu_lock);
>  
> -	if (is_page_fault_stale(vcpu, fault, mmu_seq))
> +	if (!is_private_pfn && is_page_fault_stale(vcpu, fault, mmu_seq))
>  		goto out_unlock;
>  
>  	kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT);
> @@ -913,7 +915,10 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
>  
>  out_unlock:
>  	write_unlock(&vcpu->kvm->mmu_lock);
> -	kvm_release_pfn_clean(fault->pfn);
> +	if (is_private_pfn)
> +		kvm_memfd_put_pfn(fault->pfn);
> +	else
> +		kvm_release_pfn_clean(fault->pfn);
>  	return r;
>  }
>  
> -- 
> 2.17.1
> 
>
Chao Peng Jan. 4, 2022, 9:10 a.m. UTC | #2
On Tue, Jan 04, 2022 at 09:46:35AM +0800, Yan Zhao wrote:
> On Thu, Dec 23, 2021 at 08:30:09PM +0800, Chao Peng wrote:
> > When a page fault from the secondary page table while the guest is
> > running happens in a memslot with KVM_MEM_PRIVATE, we need go
> > different paths for private access and shared access.
> > 
> >   - For private access, KVM checks if the page is already allocated in
> >     the memory backend, if yes KVM establishes the mapping, otherwise
> >     exits to userspace to convert a shared page to private one.
> >
> will this conversion be atomical or not?
> For example, after punching a hole in a private memory slot, will KVM
> see two notifications: one for invalidation of the whole private memory
> slot, and one for fallocate of the rest ranges besides the hole?
> Or, KVM only sees one invalidation notification for the hole?

Punching a hole doesn't need to invalidate the whole memory slot. It only
sends one invalidation notification to KVM, for the 'hole' part.

Taking shared-to-private conversion as an example, it only invalidates the
'hole' part (usually only a portion of the whole memory) on the
shared fd, and then fallocates the private memory in the private fd at
the 'hole'. The KVM invalidation notification happens when the shared
hole gets invalidated. The establishment of the private mapping happens
in subsequent KVM page fault handlers.

> Could you please show QEMU code about this conversion?

See below for the QEMU-side conversion code. The invalidation and
fallocate described above are two separate steps in this conversion. If
an error happens in the middle, it will be propagated to
kvm_run to take the proper action (e.g. may kill the guest?).

/*
 * Convert the range [start, start + length) of @rb between its shared and
 * private backing fds.
 *
 * @rb:                RAM block to operate on; must have a private fd.
 * @start:             offset of the range within the block; must be aligned
 *                     to rb->page_size.
 * @length:            size of the range; must be aligned to rb->page_size.
 * @shared_to_private: true to convert shared -> private (discard from
 *                     rb->fd, allocate in rb->private_fd), false for the
 *                     opposite direction.
 *
 * The range is first discarded from the source fd with
 * ram_block_discard_range_fd() and then, if the destination fd is usable,
 * allocated there with fallocate(). The two steps are not atomic: a failure
 * of the second step leaves the range discarded from the source fd.
 *
 * Returns 0 on success, -1 on invalid arguments, the non-zero result of
 * ram_block_discard_range_fd() if the discard fails, or the result of
 * fallocate() for the allocation step.
 */
int ram_block_convert_range(RAMBlock *rb, uint64_t start, size_t length,
                            bool shared_to_private)
{
    int ret;
    int fd_from, fd_to;

    /* NOTE(review): fd 0 is treated as "no private fd" here -- confirm. */
    if (!rb || rb->private_fd <= 0) {
        return -1;
    }

    if (!QEMU_PTR_IS_ALIGNED(start, rb->page_size) ||
        !QEMU_PTR_IS_ALIGNED(length, rb->page_size)) {
        return -1;
    }

    /*
     * Bound the end of the range, not just its length: a valid length with
     * a start near (or past) the end of the block must be rejected too.
     * Written as a subtraction so that start + length cannot overflow.
     */
    if (start > rb->max_length || length > rb->max_length - start) {
        return -1;
    }

    if (shared_to_private) {
        fd_from = rb->fd;
        fd_to = rb->private_fd;
    } else {
        fd_from = rb->private_fd;
        fd_to = rb->fd;
    }

    /* Step 1: drop the range from the fd we are converting away from. */
    ret = ram_block_discard_range_fd(rb, start, length, fd_from);
    if (ret) {
        return ret;
    }

    /* Step 2: back the range in the destination fd, if there is one. */
    if (fd_to > 0) {
        return fallocate(fd_to, 0, start, length);
    }

    return 0;
}

> 
> 
> >   - For shared access, KVM also checks if the page is already allocated
> >     in the memory backend, if yes then exit to userspace to convert a
> >     private page to shared one, otherwise it's treated as a traditional
> >     hva-based shared memory, KVM lets existing code to obtain a pfn with
> >     get_user_pages() and establish the mapping.
> > 
> > The above code assume private memory is persistent and pre-allocated in
> > the memory backend so KVM can use this information as an indicator for
> > a page is private or shared. The above check is then performed by
> > calling kvm_memfd_get_pfn() which currently is implemented as a
> > pagecache search but in theory that can be implemented differently
> > (i.e. when the page is even not mapped into host pagecache there should
> > be some different implementation).
> > 
> > Signed-off-by: Yu Zhang <yu.c.zhang@linux.intel.com>
> > Signed-off-by: Chao Peng <chao.p.peng@linux.intel.com>
> > ---
> >  arch/x86/kvm/mmu/mmu.c         | 73 ++++++++++++++++++++++++++++++++--
> >  arch/x86/kvm/mmu/paging_tmpl.h | 11 +++--
> >  2 files changed, 77 insertions(+), 7 deletions(-)
> > 
> > diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> > index 2856eb662a21..fbcdf62f8281 100644
> > --- a/arch/x86/kvm/mmu/mmu.c
> > +++ b/arch/x86/kvm/mmu/mmu.c
> > @@ -2920,6 +2920,9 @@ int kvm_mmu_max_mapping_level(struct kvm *kvm,
> >  	if (max_level == PG_LEVEL_4K)
> >  		return PG_LEVEL_4K;
> >  
> > +	if (kvm_slot_is_private(slot))
> > +		return max_level;
> > +
> >  	host_level = host_pfn_mapping_level(kvm, gfn, pfn, slot);
> >  	return min(host_level, max_level);
> >  }
> > @@ -3950,7 +3953,59 @@ static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
> >  				  kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch);
> >  }
> >  
> > -static bool kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, int *r)
> > +static bool kvm_vcpu_is_private_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
> > +{
> > +	/*
> > +	 * At this time private gfn has not been supported yet. Other patch
> > +	 * that enables it should change this.
> > +	 */
> > +	return false;
> > +}
> > +
> > +static bool kvm_faultin_pfn_private(struct kvm_vcpu *vcpu,
> > +				    struct kvm_page_fault *fault,
> > +				    bool *is_private_pfn, int *r)
> > +{
> > +	int order;
> > +	int mem_convert_type;
> > +	struct kvm_memory_slot *slot = fault->slot;
> > +	long pfn = kvm_memfd_get_pfn(slot, fault->gfn, &order);
> For private memory slots, it's possible to have pfns backed by
> backends other than memfd, e.g. devicefd.

Surely yes, although this patch only supports memfd, but it's designed
to be extensible to support other memory backing stores than memfd. There
is one assumption in this design however: one private memslot can be
backed by only one type of such memory backing store, e.g. if the
devicefd you mentioned can independently provide memory for a memslot
then that's no issue.

>So is it possible to let those
> private memslots keep private and use traditional hva-based way?

Typically this fd-based private memory uses the 'offset' as the
userspace address to get a pfn from the backing store fd. But I believe
the current code does not prevent you from using the hva as the
userspace address, as long as your memory backing store understands that
address and can provide the pfn based on it. But since you already have
the hva, you have probably already mmap-ed the fd to userspace, which is
not something this private memory patch can protect. Perhaps I didn't quite
understand the 'keep private' you mentioned here.

Thanks,
Chao
> Reasons below:
> 1. only memfd is supported in this patch set.
> 2. qemu/host read/write to those private memslots backing up by devicefd may
> not cause machine check.
> 
> Thanks
> Yan
>
Yan Zhao Jan. 4, 2022, 10:06 a.m. UTC | #3
On Tue, Jan 04, 2022 at 05:10:08PM +0800, Chao Peng wrote:
> On Tue, Jan 04, 2022 at 09:46:35AM +0800, Yan Zhao wrote:
> > On Thu, Dec 23, 2021 at 08:30:09PM +0800, Chao Peng wrote:
> > > When a page fault from the secondary page table while the guest is
> > > running happens in a memslot with KVM_MEM_PRIVATE, we need go
> > > different paths for private access and shared access.
> > > 
> > >   - For private access, KVM checks if the page is already allocated in
> > >     the memory backend, if yes KVM establishes the mapping, otherwise
> > >     exits to userspace to convert a shared page to private one.
> > >
> > will this conversion be atomical or not?
> > For example, after punching a hole in a private memory slot, will KVM
> > see two notifications: one for invalidation of the whole private memory
> > slot, and one for fallocate of the rest ranges besides the hole?
> > Or, KVM only sees one invalidation notification for the hole?
> 
> Punching hole doesn't need to invalidate the whole memory slot. It only
> send one invalidation notification to KVM for the 'hole' part.
good :)

> 
> Taking shared-to-private conversion as example it only invalidates the
> 'hole' part (that usually only the portion of the whole memory) on the
> shared fd,, and then fallocate the private memory in the private fd at
> the 'hole'. The KVM invalidation notification happens when the shared
> hole gets invalidated. The establishment of the private mapping happens
> at subsequent KVM page fault handlers.
> 
> > Could you please show QEMU code about this conversion?
> 
> See below for the QEMU side conversion code. The above described
> invalidation and fallocation will be two steps in this conversion. If
> error happens in the middle then this error will be propagated to
> kvm_run to do the proper action (e.g. may kill the guest?).
> 
> int ram_block_convert_range(RAMBlock *rb, uint64_t start, size_t length,
>                             bool shared_to_private)
> {
>     int ret; 
>     int fd_from, fd_to;
> 
>     if (!rb || rb->private_fd <= 0) { 
>         return -1;
>     }    
> 
>     if (!QEMU_PTR_IS_ALIGNED(start, rb->page_size) ||
>         !QEMU_PTR_IS_ALIGNED(length, rb->page_size)) {
>         return -1;
>     }    
> 
>     if (length > rb->max_length) {
>         return -1;
>     }    
> 
>     if (shared_to_private) {
>         fd_from = rb->fd;
>         fd_to = rb->private_fd;
>     } else {
>         fd_from = rb->private_fd;
>         fd_to = rb->fd;
>     }    
> 
>     ret = ram_block_discard_range_fd(rb, start, length, fd_from);
>     if (ret) {
>         return ret; 
>     }    
> 
>     if (fd_to > 0) { 
>         return fallocate(fd_to, 0, start, length);
>     }    
> 
>     return 0;
> }
> 
Thanks. So QEMU will re-generate memslots and set KVM_MEM_PRIVATE
accordingly? Will it involve slot deletion and create?

> > 
> > 
> > >   - For shared access, KVM also checks if the page is already allocated
> > >     in the memory backend, if yes then exit to userspace to convert a
> > >     private page to shared one, otherwise it's treated as a traditional
> > >     hva-based shared memory, KVM lets existing code to obtain a pfn with
> > >     get_user_pages() and establish the mapping.
> > > 
> > > The above code assume private memory is persistent and pre-allocated in
> > > the memory backend so KVM can use this information as an indicator for
> > > a page is private or shared. The above check is then performed by
> > > calling kvm_memfd_get_pfn() which currently is implemented as a
> > > pagecache search but in theory that can be implemented differently
> > > (i.e. when the page is even not mapped into host pagecache there should
> > > be some different implementation).
> > > 
> > > Signed-off-by: Yu Zhang <yu.c.zhang@linux.intel.com>
> > > Signed-off-by: Chao Peng <chao.p.peng@linux.intel.com>
> > > ---
> > >  arch/x86/kvm/mmu/mmu.c         | 73 ++++++++++++++++++++++++++++++++--
> > >  arch/x86/kvm/mmu/paging_tmpl.h | 11 +++--
> > >  2 files changed, 77 insertions(+), 7 deletions(-)
> > > 
> > > diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> > > index 2856eb662a21..fbcdf62f8281 100644
> > > --- a/arch/x86/kvm/mmu/mmu.c
> > > +++ b/arch/x86/kvm/mmu/mmu.c
> > > @@ -2920,6 +2920,9 @@ int kvm_mmu_max_mapping_level(struct kvm *kvm,
> > >  	if (max_level == PG_LEVEL_4K)
> > >  		return PG_LEVEL_4K;
> > >  
> > > +	if (kvm_slot_is_private(slot))
> > > +		return max_level;
> > > +
> > >  	host_level = host_pfn_mapping_level(kvm, gfn, pfn, slot);
> > >  	return min(host_level, max_level);
> > >  }
> > > @@ -3950,7 +3953,59 @@ static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
> > >  				  kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch);
> > >  }
> > >  
> > > -static bool kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, int *r)
> > > +static bool kvm_vcpu_is_private_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
> > > +{
> > > +	/*
> > > +	 * At this time private gfn has not been supported yet. Other patch
> > > +	 * that enables it should change this.
> > > +	 */
> > > +	return false;
> > > +}
> > > +
> > > +static bool kvm_faultin_pfn_private(struct kvm_vcpu *vcpu,
> > > +				    struct kvm_page_fault *fault,
> > > +				    bool *is_private_pfn, int *r)
> > > +{
> > > +	int order;
> > > +	int mem_convert_type;
> > > +	struct kvm_memory_slot *slot = fault->slot;
> > > +	long pfn = kvm_memfd_get_pfn(slot, fault->gfn, &order);
> > For private memory slots, it's possible to have pfns backed by
> > backends other than memfd, e.g. devicefd.
> 
> Surely yes, although this patch only supports memfd, but it's designed
> to be extensible to support other memory backing stores than memfd. There
> is one assumption in this design however: one private memslot can be
> backed by only one type of such memory backing store, e.g. if the
> devicefd you mentioned can independently provide memory for a memslot
> then that's no issue.
> 
> >So is it possible to let those
> > private memslots keep private and use traditional hva-based way?
> 
> Typically this fd-based private memory uses the 'offset' as the
> userspace address to get a pfn from the backing store fd. But I believe
> the current code does not prevent you from using the hva as the
By hva-based way, I mean mmap is required for this fd.

> userspace address, as long as your memory backing store understand that
> address and can provide the pfn basing on it. But since you already have
> the hva, you probably already mmap-ed the fd to userspace, that seems
> not this private memory patch can protect you. Probably I didn't quite
Yes, for this fd, though mapped in private memslot, there's no need to
prevent QEMU/host from accessing it as it will not cause the severe machine
check.

> understand 'keep private' you mentioned here.
'keep private' means allow this kind of private memslot which does not
require protection from this private memory patch :)


Thanks
Yan
> > Reasons below:
> > 1. only memfd is supported in this patch set.
> > 2. qemu/host read/write to those private memslots backing up by devicefd may
> > not cause machine check.
> >
Chao Peng Jan. 5, 2022, 6:28 a.m. UTC | #4
On Tue, Jan 04, 2022 at 06:06:12PM +0800, Yan Zhao wrote:
> On Tue, Jan 04, 2022 at 05:10:08PM +0800, Chao Peng wrote:
> > On Tue, Jan 04, 2022 at 09:46:35AM +0800, Yan Zhao wrote:
> > > On Thu, Dec 23, 2021 at 08:30:09PM +0800, Chao Peng wrote:
> > > > When a page fault from the secondary page table while the guest is
> > > > running happens in a memslot with KVM_MEM_PRIVATE, we need go
> > > > different paths for private access and shared access.
> > > > 
> > > >   - For private access, KVM checks if the page is already allocated in
> > > >     the memory backend, if yes KVM establishes the mapping, otherwise
> > > >     exits to userspace to convert a shared page to private one.
> > > >
> > > will this conversion be atomical or not?
> > > For example, after punching a hole in a private memory slot, will KVM
> > > see two notifications: one for invalidation of the whole private memory
> > > slot, and one for fallocate of the rest ranges besides the hole?
> > > Or, KVM only sees one invalidation notification for the hole?
> > 
> > Punching hole doesn't need to invalidate the whole memory slot. It only
> > send one invalidation notification to KVM for the 'hole' part.
> good :)
> 
> > 
> > Taking shared-to-private conversion as example it only invalidates the
> > 'hole' part (that usually only the portion of the whole memory) on the
> > shared fd,, and then fallocate the private memory in the private fd at
> > the 'hole'. The KVM invalidation notification happens when the shared
> > hole gets invalidated. The establishment of the private mapping happens
> > at subsequent KVM page fault handlers.
> > 
> > > Could you please show QEMU code about this conversion?
> > 
> > See below for the QEMU side conversion code. The above described
> > invalidation and fallocation will be two steps in this conversion. If
> > error happens in the middle then this error will be propagated to
> > kvm_run to do the proper action (e.g. may kill the guest?).
> > 
> > int ram_block_convert_range(RAMBlock *rb, uint64_t start, size_t length,
> >                             bool shared_to_private)
> > {
> >     int ret; 
> >     int fd_from, fd_to;
> > 
> >     if (!rb || rb->private_fd <= 0) { 
> >         return -1;
> >     }    
> > 
> >     if (!QEMU_PTR_IS_ALIGNED(start, rb->page_size) ||
> >         !QEMU_PTR_IS_ALIGNED(length, rb->page_size)) {
> >         return -1;
> >     }    
> > 
> >     if (length > rb->max_length) {
> >         return -1;
> >     }    
> > 
> >     if (shared_to_private) {
> >         fd_from = rb->fd;
> >         fd_to = rb->private_fd;
> >     } else {
> >         fd_from = rb->private_fd;
> >         fd_to = rb->fd;
> >     }    
> > 
> >     ret = ram_block_discard_range_fd(rb, start, length, fd_from);
> >     if (ret) {
> >         return ret; 
> >     }    
> > 
> >     if (fd_to > 0) { 
> >         return fallocate(fd_to, 0, start, length);
> >     }    
> > 
> >     return 0;
> > }
> > 
> Thanks. So QEMU will re-generate memslots and set KVM_MEM_PRIVATE
> accordingly? Will it involve slot deletion and create?

KVM will not re-generate memslots when doing the conversion; instead, it
unmaps/maps a range on the same memslot. A memslot with the
KVM_MEM_PRIVATE flag always has two mappings (private/shared), but only
one is effective at a time. What the conversion does is turn off the
existing mapping and turn on the other mapping for the specified range in
that slot.

> 
> > > 
> > > 
> > > >   - For shared access, KVM also checks if the page is already allocated
> > > >     in the memory backend, if yes then exit to userspace to convert a
> > > >     private page to shared one, otherwise it's treated as a traditional
> > > >     hva-based shared memory, KVM lets existing code to obtain a pfn with
> > > >     get_user_pages() and establish the mapping.
> > > > 
> > > > The above code assume private memory is persistent and pre-allocated in
> > > > the memory backend so KVM can use this information as an indicator for
> > > > a page is private or shared. The above check is then performed by
> > > > calling kvm_memfd_get_pfn() which currently is implemented as a
> > > > pagecache search but in theory that can be implemented differently
> > > > (i.e. when the page is even not mapped into host pagecache there should
> > > > be some different implementation).
> > > > 
> > > > Signed-off-by: Yu Zhang <yu.c.zhang@linux.intel.com>
> > > > Signed-off-by: Chao Peng <chao.p.peng@linux.intel.com>
> > > > ---
> > > >  arch/x86/kvm/mmu/mmu.c         | 73 ++++++++++++++++++++++++++++++++--
> > > >  arch/x86/kvm/mmu/paging_tmpl.h | 11 +++--
> > > >  2 files changed, 77 insertions(+), 7 deletions(-)
> > > > 
> > > > diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> > > > index 2856eb662a21..fbcdf62f8281 100644
> > > > --- a/arch/x86/kvm/mmu/mmu.c
> > > > +++ b/arch/x86/kvm/mmu/mmu.c
> > > > @@ -2920,6 +2920,9 @@ int kvm_mmu_max_mapping_level(struct kvm *kvm,
> > > >  	if (max_level == PG_LEVEL_4K)
> > > >  		return PG_LEVEL_4K;
> > > >  
> > > > +	if (kvm_slot_is_private(slot))
> > > > +		return max_level;
> > > > +
> > > >  	host_level = host_pfn_mapping_level(kvm, gfn, pfn, slot);
> > > >  	return min(host_level, max_level);
> > > >  }
> > > > @@ -3950,7 +3953,59 @@ static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
> > > >  				  kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch);
> > > >  }
> > > >  
> > > > -static bool kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, int *r)
> > > > +static bool kvm_vcpu_is_private_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
> > > > +{
> > > > +	/*
> > > > +	 * At this time private gfn has not been supported yet. Other patch
> > > > +	 * that enables it should change this.
> > > > +	 */
> > > > +	return false;
> > > > +}
> > > > +
> > > > +static bool kvm_faultin_pfn_private(struct kvm_vcpu *vcpu,
> > > > +				    struct kvm_page_fault *fault,
> > > > +				    bool *is_private_pfn, int *r)
> > > > +{
> > > > +	int order;
> > > > +	int mem_convert_type;
> > > > +	struct kvm_memory_slot *slot = fault->slot;
> > > > +	long pfn = kvm_memfd_get_pfn(slot, fault->gfn, &order);
> > > For private memory slots, it's possible to have pfns backed by
> > > backends other than memfd, e.g. devicefd.
> > 
> > Surely yes, although this patch only supports memfd, but it's designed
> > to be extensible to support other memory backing stores than memfd. There
> > is one assumption in this design however: one private memslot can be
> > backed by only one type of such memory backing store, e.g. if the
> > devicefd you mentioned can independently provide memory for a memslot
> > then that's no issue.
> > 
> > >So is it possible to let those
> > > private memslots keep private and use traditional hva-based way?
> > 
> > Typically this fd-based private memory uses the 'offset' as the
> > userspace address to get a pfn from the backing store fd. But I believe
> > the current code does not prevent you from using the hva as the
> By hva-based way, I mean mmap is required for this fd.
> 
> > userspace address, as long as your memory backing store understand that
> > address and can provide the pfn basing on it. But since you already have
> > the hva, you probably already mmap-ed the fd to userspace, that seems
> > not this private memory patch can protect you. Probably I didn't quite
> Yes, for this fd, though mapped in private memslot, there's no need to
> prevent QEMU/host from accessing it as it will not cause the severe machine
> check.
> 
> > understand 'keep private' you mentioned here.
> 'keep private' means allow this kind of private memslot which does not
> require protection from this private memory patch :)

Then I think such memory can be the shared part of memory of the
KVM_MEM_PRIVATE memslot. As said above, this is initially supported :)

Chao
> 
> 
> Thanks
> Yan
> > > Reasons below:
> > > 1. only memfd is supported in this patch set.
> > > 2. qemu/host read/write to those private memslots backing up by devicefd may
> > > not cause machine check.
> > >
Yan Zhao Jan. 5, 2022, 7:53 a.m. UTC | #5
On Wed, Jan 05, 2022 at 02:28:10PM +0800, Chao Peng wrote:
> On Tue, Jan 04, 2022 at 06:06:12PM +0800, Yan Zhao wrote:
> > On Tue, Jan 04, 2022 at 05:10:08PM +0800, Chao Peng wrote:
<...> 
> > Thanks. So QEMU will re-generate memslots and set KVM_MEM_PRIVATE
> > accordingly? Will it involve slot deletion and create?
> 
> KVM will not re-generate memslots when do the conversion, instead, it
> does unmap/map a range on the same memslot. For memslot with tag
> KVM_MEM_PRIVATE, it always have two mappings (private/shared) but at a
> time only one is effective. What conversion does is to turn off the
> existing mapping and turn on the other mapping for specified range in
> that slot.
>
got it. thanks!

<...>
> > > > > +static bool kvm_faultin_pfn_private(struct kvm_vcpu *vcpu,
> > > > > +				    struct kvm_page_fault *fault,
> > > > > +				    bool *is_private_pfn, int *r)
> > > > > +{
> > > > > +	int order;
> > > > > +	int mem_convert_type;
> > > > > +	struct kvm_memory_slot *slot = fault->slot;
> > > > > +	long pfn = kvm_memfd_get_pfn(slot, fault->gfn, &order);
> > > > For private memory slots, it's possible to have pfns backed by
> > > > backends other than memfd, e.g. devicefd.
> > > 
> > > Surely yes, although this patch only supports memfd, but it's designed
> > > to be extensible to support other memory backing stores than memfd. There
> > > is one assumption in this design however: one private memslot can be
> > > backed by only one type of such memory backing store, e.g. if the
> > > devicefd you mentioned can independently provide memory for a memslot
> > > then that's no issue.
> > > 
> > > >So is it possible to let those
> > > > private memslots keep private and use traditional hva-based way?
> > > 
> > > Typically this fd-based private memory uses the 'offset' as the
> > > userspace address to get a pfn from the backing store fd. But I believe
> > > the current code does not prevent you from using the hva as the
> > By hva-based way, I mean mmap is required for this fd.
> > 
> > > userspace address, as long as your memory backing store understand that
> > > address and can provide the pfn basing on it. But since you already have
> > > the hva, you probably already mmap-ed the fd to userspace, that seems
> > > not this private memory patch can protect you. Probably I didn't quite
> > Yes, for this fd, though mapped in private memslot, there's no need to
> > prevent QEMU/host from accessing it as it will not cause the severe machine
> > check.
> > 
> > > understand 'keep private' you mentioned here.
> > 'keep private' means allow this kind of private memslot which does not
> > require protection from this private memory patch :)
> 
> Then I think such memory can be the shared part of memory of the
> KVM_MEM_PRIVATE memslot. As said above, this is initially supported :)
>
Sorry, maybe I didn't express it clearly.

As in the kvm_faultin_pfn_private(), 
static bool kvm_faultin_pfn_private(struct kvm_vcpu *vcpu,
				    struct kvm_page_fault *fault,
				    bool *is_private_pfn, int *r)
{
	int order;
	int mem_convert_type;
	struct kvm_memory_slot *slot = fault->slot;
	long pfn = kvm_memfd_get_pfn(slot, fault->gfn, &order);
	...
}
Currently, kvm_memfd_get_pfn() is called unconditionally.
However, if the backend of a private memslot is not memfd, and is device
fd for example, a different xxx_get_pfn() is required here.

Further, though mapped to a private gfn, it might be ok for QEMU to
access the device fd in the hva-based way (or call it the MMU access way,
e.g. read/write/mmap). In that case it's desirable that KVM could use the
traditional way to get the pfn without converting the range to a shared one.
pfn = __gfn_to_pfn_memslot(slot, fault->gfn, ...)
	|->addr = __gfn_to_hva_many (slot, gfn,...)
	|  pfn = hva_to_pfn (addr,...)


So, is it possible to recognize this kind of backend in KVM, and to get
the pfn in the traditional way without converting the range to shared?
e.g.
- specify KVM_MEM_PRIVATE_NONPROTECT for memory regions with this kind
of backend, or
- detect the fd type and check whether get_pfn is provided. If not, go the
  traditional way.

Thanks
Yan

> > > > Reasons below:
> > > > 1. only memfd is supported in this patch set.
> > > > 2. qemu/host read/write to those private memslots backing up by devicefd may
> > > > not cause machine check.
Sean Christopherson Jan. 5, 2022, 8:52 p.m. UTC | #6
On Wed, Jan 05, 2022, Yan Zhao wrote:
> Sorry, maybe I didn't express it clearly.
> 
> As in the kvm_faultin_pfn_private(), 
> static bool kvm_faultin_pfn_private(struct kvm_vcpu *vcpu,
> 				    struct kvm_page_fault *fault,
> 				    bool *is_private_pfn, int *r)
> {
> 	int order;
> 	int mem_convert_type;
> 	struct kvm_memory_slot *slot = fault->slot;
> 	long pfn = kvm_memfd_get_pfn(slot, fault->gfn, &order);
> 	...
> }
> Currently, kvm_memfd_get_pfn() is called unconditionally.
> However, if the backend of a private memslot is not memfd, and is device
> fd for example, a different xxx_get_pfn() is required here.

Ya, I've complained about this in a different thread[*].  This should really be
something like kvm_private_fd_get_pfn(), where the underlying ops struct can point
at any compatible backing store.

https://lore.kernel.org/all/YcuMUemyBXFYyxCC@google.com/

> Further, though mapped to a private gfn, it might be ok for QEMU to
> access the device fd in hva-based way (or call it MMU access way, e.g.
> read/write/mmap), it's desired that it could use the traditional to get
> pfn without convert the range to a shared one.

No, this is expressly forbidden.  The backing store for a private gfn must not
be accessible by userspace.  It's possible a backing store could support both, but
not concurrently, and any conversion must be done without KVM being involved.
In other words, resolving a private gfn must either succeed or fail (exit to
userspace), KVM cannot initiate any conversions.

> pfn = __gfn_to_pfn_memslot(slot, fault->gfn, ...)
> 	|->addr = __gfn_to_hva_many (slot, gfn,...)
> 	|  pfn = hva_to_pfn (addr,...)
> 
> 
> So, is it possible to recognize such kind of backends in KVM, and to get
> the pfn in traditional way without converting them to shared?
> e.g.
> - specify KVM_MEM_PRIVATE_NONPROTECT to memory regions with such kind
> of backends, or
> - detect the fd type and check if get_pfn is provided. if no, go the
>   traditional way.

No, because the whole point of this is to make guest private memory inaccessible
to host userspace.  Or did I misinterpret your questions?
Yan Zhao Jan. 14, 2022, 5:53 a.m. UTC | #7
hi Sean,
Sorry for the late reply. I just saw this mail in my mailbox.

On Wed, Jan 05, 2022 at 08:52:39PM +0000, Sean Christopherson wrote:
> On Wed, Jan 05, 2022, Yan Zhao wrote:
> > Sorry, maybe I didn't express it clearly.
> > 
> > As in the kvm_faultin_pfn_private(), 
> > static bool kvm_faultin_pfn_private(struct kvm_vcpu *vcpu,
> > 				    struct kvm_page_fault *fault,
> > 				    bool *is_private_pfn, int *r)
> > {
> > 	int order;
> > 	int mem_convert_type;
> > 	struct kvm_memory_slot *slot = fault->slot;
> > 	long pfn = kvm_memfd_get_pfn(slot, fault->gfn, &order);
> > 	...
> > }
> > Currently, kvm_memfd_get_pfn() is called unconditionally.
> > However, if the backend of a private memslot is not memfd, and is device
> > fd for example, a different xxx_get_pfn() is required here.
> 
> Ya, I've complained about this in a different thread[*].  This should really be
> something like kvm_private_fd_get_pfn(), where the underlying ops struct can point
> at any compatible backing store.
> 
> https://lore.kernel.org/all/YcuMUemyBXFYyxCC@google.com/
>
ok. 

> > Further, though mapped to a private gfn, it might be ok for QEMU to
> > access the device fd in hva-based way (or call it MMU access way, e.g.
> > read/write/mmap), it's desired that it could use the traditional to get
> > pfn without convert the range to a shared one.
> 
> No, this is expressly forbidden.  The backing store for a private gfn must not
> be accessible by userspace.  It's possible a backing store could support both, but
> not concurrently, and any conversion must be done without KVM being involved.
> In other words, resolving a private gfn must either succeed or fail (exit to
> userspace), KVM cannot initiate any conversions.
>
When it comes to a device passthrough via VFIO, there might be more work
related to the device fd as a backend.

First, unlike memfd which can allocate one private fd for a set of PFNs,
and one shared fd for another set of PFNs, for device fd, it needs to open
the same physical device twice, one for shared fd, and one for private fd.

Then, for the private device fd, its ramblock now has to use qemu_ram_alloc_from_fd()
instead of the current qemu_ram_alloc_from_ptr().
And as in VFIO, this private fd is shared by several ramblocks (each located at
a different base offset), so the base offsets also need to be kept somewhere
in order to call get_pfn successfully. (Previously this info was kept in the
vma through mmap(), so without mmap(), a new interface might
be required.)

Also, for the shared device fd, mmap() is required in order to allocate the
ramblock with qemu_ram_alloc_from_ptr(), and more importantly to make
the future gfn_to_hva and hva_to_pfn possible.
But as the shared and private fds are based on the same physical device,
the vfio driver needs to record which vma ranges are allowed for the actual
mmap_fault, and which vma ranges are not.

The above changes only prevent host user space from accessing
the device mapped to private GFNs.
For memory backends, host kernel space access is prevented via MKTME.
For a device, the device itself needs to do the work to disallow host kernel
space access.
However, unlike the memory side, the device side would not cause any MCE.
Therefore, host user space access to the device would not cause MCEs either.

So, I'm not sure if the above work is worthwhile for the device fd.


> > pfn = __gfn_to_pfn_memslot(slot, fault->gfn, ...)
> > 	|->addr = __gfn_to_hva_many (slot, gfn,...)
> > 	|  pfn = hva_to_pfn (addr,...)
> > 
> > 
> > So, is it possible to recognize such kind of backends in KVM, and to get
> > the pfn in traditional way without converting them to shared?
> > e.g.
> > - specify KVM_MEM_PRIVATE_NONPROTECT to memory regions with such kind
> > of backends, or
> > - detect the fd type and check if get_pfn is provided. if no, go the
> >   traditional way.
> 
> No, because the whole point of this is to make guest private memory inaccessible
> to host userspace.  Or did I misinterpret your questions?
I think the host unmap series is based on the assumption that host user
space access to the memory backing private guest GFNs would cause fatal
MCEs.
So, I hope that backends which will not cause this fatal error can keep
using the traditional way to get the pfn and be mapped to private GFNs at the
same time.

Thanks
Yan
diff mbox series

Patch

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 2856eb662a21..fbcdf62f8281 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -2920,6 +2920,9 @@  int kvm_mmu_max_mapping_level(struct kvm *kvm,
 	if (max_level == PG_LEVEL_4K)
 		return PG_LEVEL_4K;
 
+	if (kvm_slot_is_private(slot))
+		return max_level;
+
 	host_level = host_pfn_mapping_level(kvm, gfn, pfn, slot);
 	return min(host_level, max_level);
 }
@@ -3950,7 +3953,59 @@  static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
 				  kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch);
 }
 
-static bool kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, int *r)
+static bool kvm_vcpu_is_private_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
+{
+	/*
+	 * Private gfns are not supported yet. A later patch that enables
+	 * them should change this.
+	 */
+	return false;
+}
+
+static bool kvm_faultin_pfn_private(struct kvm_vcpu *vcpu,
+				    struct kvm_page_fault *fault,
+				    bool *is_private_pfn, int *r)
+{
+	int order;
+	int mem_convert_type;
+	struct kvm_memory_slot *slot = fault->slot;
+	long pfn = kvm_memfd_get_pfn(slot, fault->gfn, &order);
+
+	if (kvm_vcpu_is_private_gfn(vcpu, fault->addr >> PAGE_SHIFT)) {
+		if (pfn < 0)
+			mem_convert_type = KVM_EXIT_MEM_MAP_PRIVATE;
+		else {
+			fault->pfn = pfn;
+			if (slot->flags & KVM_MEM_READONLY)
+				fault->map_writable = false;
+			else
+				fault->map_writable = true;
+
+			if (order == 0)
+				fault->max_level = PG_LEVEL_4K;
+			*is_private_pfn = true;
+			*r = RET_PF_FIXED;
+			return true;
+		}
+	} else {
+		if (pfn < 0)
+			return false;
+
+		kvm_memfd_put_pfn(pfn);
+		mem_convert_type = KVM_EXIT_MEM_MAP_SHARED;
+	}
+
+	vcpu->run->exit_reason = KVM_EXIT_MEMORY_ERROR;
+	vcpu->run->mem.type = mem_convert_type;
+	vcpu->run->mem.u.map.gpa = fault->gfn << PAGE_SHIFT;
+	vcpu->run->mem.u.map.size = PAGE_SIZE;
+	fault->pfn = -1;
+	*r = -1;
+	return true;
+}
+
+static bool kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
+			    bool *is_private_pfn, int *r)
 {
 	struct kvm_memory_slot *slot = fault->slot;
 	bool async;
@@ -3984,6 +4039,10 @@  static bool kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
 		}
 	}
 
+	if (kvm_slot_is_private(slot) &&
+	    kvm_faultin_pfn_private(vcpu, fault, is_private_pfn, r))
+		return *r == RET_PF_FIXED ? false : true;
+
 	async = false;
 	fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, &async,
 					  fault->write, &fault->map_writable,
@@ -4044,6 +4103,7 @@  static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
 	bool is_tdp_mmu_fault = is_tdp_mmu(vcpu->arch.mmu);
 
 	unsigned long mmu_seq;
+	bool is_private_pfn = false;
 	int r;
 
 	fault->gfn = fault->addr >> PAGE_SHIFT;
@@ -4063,7 +4123,7 @@  static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
 	mmu_seq = vcpu->kvm->mmu_notifier_seq;
 	smp_rmb();
 
-	if (kvm_faultin_pfn(vcpu, fault, &r))
+	if (kvm_faultin_pfn(vcpu, fault, &is_private_pfn, &r))
 		return r;
 
 	if (handle_abnormal_pfn(vcpu, fault, ACC_ALL, &r))
@@ -4076,7 +4136,7 @@  static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
 	else
 		write_lock(&vcpu->kvm->mmu_lock);
 
-	if (is_page_fault_stale(vcpu, fault, mmu_seq))
+	if (!is_private_pfn && is_page_fault_stale(vcpu, fault, mmu_seq))
 		goto out_unlock;
 
 	r = make_mmu_pages_available(vcpu);
@@ -4093,7 +4153,12 @@  static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
 		read_unlock(&vcpu->kvm->mmu_lock);
 	else
 		write_unlock(&vcpu->kvm->mmu_lock);
-	kvm_release_pfn_clean(fault->pfn);
+
+	if (is_private_pfn)
+		kvm_memfd_put_pfn(fault->pfn);
+	else
+		kvm_release_pfn_clean(fault->pfn);
+
 	return r;
 }
 
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index 5b5bdac97c7b..640fd1e2fe4c 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -825,6 +825,8 @@  static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
 	int r;
 	unsigned long mmu_seq;
 	bool is_self_change_mapping;
+	bool is_private_pfn = false;
+
 
 	pgprintk("%s: addr %lx err %x\n", __func__, fault->addr, fault->error_code);
 	WARN_ON_ONCE(fault->is_tdp);
@@ -873,7 +875,7 @@  static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
 	mmu_seq = vcpu->kvm->mmu_notifier_seq;
 	smp_rmb();
 
-	if (kvm_faultin_pfn(vcpu, fault, &r))
+	if (kvm_faultin_pfn(vcpu, fault, &is_private_pfn, &r))
 		return r;
 
 	if (handle_abnormal_pfn(vcpu, fault, walker.pte_access, &r))
@@ -901,7 +903,7 @@  static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
 	r = RET_PF_RETRY;
 	write_lock(&vcpu->kvm->mmu_lock);
 
-	if (is_page_fault_stale(vcpu, fault, mmu_seq))
+	if (!is_private_pfn && is_page_fault_stale(vcpu, fault, mmu_seq))
 		goto out_unlock;
 
 	kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT);
@@ -913,7 +915,10 @@  static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
 
 out_unlock:
 	write_unlock(&vcpu->kvm->mmu_lock);
-	kvm_release_pfn_clean(fault->pfn);
+	if (is_private_pfn)
+		kvm_memfd_put_pfn(fault->pfn);
+	else
+		kvm_release_pfn_clean(fault->pfn);
 	return r;
 }