Message ID | 20230704075054.3344915-6-stevensd@google.com (mailing list archive)
---|---
State | Handled Elsewhere, archived
Series | KVM: allow mapping non-refcounted pages
On Tue, Jul 04, 2023 at 04:50:50PM +0900, David Stevens wrote: > From: David Stevens <stevensd@chromium.org> > > Stop passing FOLL_GET to __kvm_follow_pfn. This allows the host to map > memory into the guest that is backed by un-refcounted struct pages - for > example, higher order non-compound pages allocated by the amdgpu driver > via ttm_pool_alloc_page. I guess you mean the tail pages of the higher order non-compound pages? And as to the head page, it is said to be set to one coincidentally[*], and shall not be considered as refcounted. IIUC, refcount of this head page will be increased and decreased soon in hva_to_pfn_remapped(), so this may not be a problem(?). But treating this head page differently, as a refcounted one(e.g., to set the A/D flags), is weired. Or maybe I missed some context, e.g., can the head page be allocted to guest at all? > > The bulk of this change is tracking the is_refcounted_page flag so that > non-refcounted pages don't trigger page_count() == 0 warnings. This is > done by storing the flag in an unused bit in the sptes. Also, maybe we should mention this only works on x86-64. > > Signed-off-by: David Stevens <stevensd@chromium.org> > --- > arch/x86/kvm/mmu/mmu.c | 44 +++++++++++++++++++++------------ > arch/x86/kvm/mmu/mmu_internal.h | 1 + > arch/x86/kvm/mmu/paging_tmpl.h | 9 ++++--- > arch/x86/kvm/mmu/spte.c | 4 ++- > arch/x86/kvm/mmu/spte.h | 12 ++++++++- > arch/x86/kvm/mmu/tdp_mmu.c | 22 ++++++++++------- > 6 files changed, 62 insertions(+), 30 deletions(-) > > diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c > index e44ab512c3a1..b1607e314497 100644 > --- a/arch/x86/kvm/mmu/mmu.c > +++ b/arch/x86/kvm/mmu/mmu.c ... > @@ -2937,6 +2943,7 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot, > bool host_writable = !fault || fault->map_writable; > bool prefetch = !fault || fault->prefetch; > bool write_fault = fault && fault->write; > + bool is_refcounted = !fault || fault->is_refcounted_page; Just wonder, what if a non-refcounted page is prefetched? Or is it possible in practice? ... > > @@ -883,7 +884,7 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, > */ > static int FNAME(sync_spte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, int i) > { > - bool host_writable; > + bool host_writable, is_refcounted; > gpa_t first_pte_gpa; > u64 *sptep, spte; > struct kvm_memory_slot *slot; > @@ -940,10 +941,12 @@ static int FNAME(sync_spte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, int > sptep = &sp->spt[i]; > spte = *sptep; > host_writable = spte & shadow_host_writable_mask; > + // TODO: is this correct? > + is_refcounted = spte & SPTE_MMU_PAGE_REFCOUNTED; > slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); > make_spte(vcpu, sp, slot, pte_access, gfn, > spte_to_pfn(spte), spte, true, false, > - host_writable, &spte); > + host_writable, is_refcounted, &spte); Could we restrict that a non-refcounted page shall not be used as shadow page? [*] https://lore.kernel.org/all/8caf3008-dcf3-985a-631e-e019b277c6f0@amd.com/ B.R. Yu
On Tue, Jul 04, 2023 at 04:50:50PM +0900, David Stevens wrote: > From: David Stevens <stevensd@chromium.org> > > Stop passing FOLL_GET to __kvm_follow_pfn. This allows the host to map > memory into the guest that is backed by un-refcounted struct pages - for > example, higher order non-compound pages allocated by the amdgpu driver > via ttm_pool_alloc_page. > > The bulk of this change is tracking the is_refcounted_page flag so that > non-refcounted pages don't trigger page_count() == 0 warnings. This is > done by storing the flag in an unused bit in the sptes. > > Signed-off-by: David Stevens <stevensd@chromium.org> > --- > arch/x86/kvm/mmu/mmu.c | 44 +++++++++++++++++++++------------ > arch/x86/kvm/mmu/mmu_internal.h | 1 + > arch/x86/kvm/mmu/paging_tmpl.h | 9 ++++--- > arch/x86/kvm/mmu/spte.c | 4 ++- > arch/x86/kvm/mmu/spte.h | 12 ++++++++- > arch/x86/kvm/mmu/tdp_mmu.c | 22 ++++++++++------- > 6 files changed, 62 insertions(+), 30 deletions(-) > > diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c > index e44ab512c3a1..b1607e314497 100644 > --- a/arch/x86/kvm/mmu/mmu.c > +++ b/arch/x86/kvm/mmu/mmu.c > @@ -553,12 +553,14 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte) > > if (is_accessed_spte(old_spte) && !is_accessed_spte(new_spte)) { > flush = true; > - kvm_set_pfn_accessed(spte_to_pfn(old_spte)); > + if (is_refcounted_page_pte(old_spte)) > + kvm_set_page_accessed(pfn_to_page(spte_to_pfn(old_spte))); > } > > if (is_dirty_spte(old_spte) && !is_dirty_spte(new_spte)) { > flush = true; > - kvm_set_pfn_dirty(spte_to_pfn(old_spte)); > + if (is_refcounted_page_pte(old_spte)) > + kvm_set_page_dirty(pfn_to_page(spte_to_pfn(old_spte))); > } > > return flush; > @@ -596,14 +598,18 @@ static u64 mmu_spte_clear_track_bits(struct kvm *kvm, u64 *sptep) > * before they are reclaimed. Sanity check that, if the pfn is backed > * by a refcounted page, the refcount is elevated. > */ > - page = kvm_pfn_to_refcounted_page(pfn); > - WARN_ON(page && !page_count(page)); > + if (is_refcounted_page_pte(old_spte)) { > + page = kvm_pfn_to_refcounted_page(pfn); > + WARN_ON(!page || !page_count(page)); > + } > > - if (is_accessed_spte(old_spte)) > - kvm_set_pfn_accessed(pfn); > + if (is_refcounted_page_pte(old_spte)) { > + if (is_accessed_spte(old_spte)) > + kvm_set_page_accessed(pfn_to_page(pfn)); > > - if (is_dirty_spte(old_spte)) > - kvm_set_pfn_dirty(pfn); > + if (is_dirty_spte(old_spte)) > + kvm_set_page_dirty(pfn_to_page(pfn)); > + } > > return old_spte; > } > @@ -639,8 +645,8 @@ static bool mmu_spte_age(u64 *sptep) > * Capture the dirty status of the page, so that it doesn't get > * lost when the SPTE is marked for access tracking. 
> */ > - if (is_writable_pte(spte)) > - kvm_set_pfn_dirty(spte_to_pfn(spte)); > + if (is_writable_pte(spte) && is_refcounted_page_pte(spte)) > + kvm_set_page_dirty(pfn_to_page(spte_to_pfn(spte))); > > spte = mark_spte_for_access_track(spte); > mmu_spte_update_no_track(sptep, spte); > @@ -1278,8 +1284,8 @@ static bool spte_wrprot_for_clear_dirty(u64 *sptep) > { > bool was_writable = test_and_clear_bit(PT_WRITABLE_SHIFT, > (unsigned long *)sptep); > - if (was_writable && !spte_ad_enabled(*sptep)) > - kvm_set_pfn_dirty(spte_to_pfn(*sptep)); > + if (was_writable && !spte_ad_enabled(*sptep) && is_refcounted_page_pte(*sptep)) > + kvm_set_page_dirty(pfn_to_page(spte_to_pfn(*sptep))); > > return was_writable; > } > @@ -2937,6 +2943,7 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot, > bool host_writable = !fault || fault->map_writable; > bool prefetch = !fault || fault->prefetch; > bool write_fault = fault && fault->write; > + bool is_refcounted = !fault || fault->is_refcounted_page; > > pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__, > *sptep, write_fault, gfn); > @@ -2969,7 +2976,7 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot, > } > > wrprot = make_spte(vcpu, sp, slot, pte_access, gfn, pfn, *sptep, prefetch, > - true, host_writable, &spte); > + true, host_writable, is_refcounted, &spte); > > if (*sptep == spte) { > ret = RET_PF_SPURIOUS; > @@ -4299,8 +4306,9 @@ static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault > struct kvm_follow_pfn foll = { > .slot = slot, > .gfn = fault->gfn, > - .flags = FOLL_GET | (fault->write ? FOLL_WRITE : 0), > + .flags = fault->write ? FOLL_WRITE : 0, > .allow_write_mapping = true, > + .guarded_by_mmu_notifier = true, > }; > > /* > @@ -4317,6 +4325,7 @@ static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault > fault->slot = NULL; > fault->pfn = KVM_PFN_NOSLOT; > fault->map_writable = false; > + fault->is_refcounted_page = false; > return RET_PF_CONTINUE; > } > /* > @@ -4366,6 +4375,7 @@ static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault > success: > fault->hva = foll.hva; > fault->map_writable = foll.writable; > + fault->is_refcounted_page = foll.is_refcounted_page; > return RET_PF_CONTINUE; > } > > @@ -4451,7 +4461,8 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault > > out_unlock: > write_unlock(&vcpu->kvm->mmu_lock); > - kvm_release_pfn_clean(fault->pfn); > + if (fault->is_refcounted_page) > + kvm_set_page_accessed(pfn_to_page(fault->pfn)); > return r; > } > > @@ -4529,7 +4540,8 @@ static int kvm_tdp_mmu_page_fault(struct kvm_vcpu *vcpu, > > out_unlock: > read_unlock(&vcpu->kvm->mmu_lock); > - kvm_release_pfn_clean(fault->pfn); Yet kvm_release_pfn() can still be triggered for the kvm_vcpu_maped gfns. What if guest uses a non-referenced page(e.g., as a vmcs12)? Although I believe this is not gonna happen in real world... B.R. Yu
> > @@ -883,7 +884,7 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, > > */ > > static int FNAME(sync_spte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, int i) > > { > > - bool host_writable; > > + bool host_writable, is_refcounted; > > gpa_t first_pte_gpa; > > u64 *sptep, spte; > > struct kvm_memory_slot *slot; > > @@ -940,10 +941,12 @@ static int FNAME(sync_spte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, int > > sptep = &sp->spt[i]; > > spte = *sptep; > > host_writable = spte & shadow_host_writable_mask; > > + // TODO: is this correct? > > + is_refcounted = spte & SPTE_MMU_PAGE_REFCOUNTED; > > slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); > > make_spte(vcpu, sp, slot, pte_access, gfn, > > spte_to_pfn(spte), spte, true, false, > > - host_writable, &spte); > > + host_writable, is_refcounted, &spte); > > Could we restrict that a non-refcounted page shall not be used as shadow page? Oh, sorry. It's not about shadow page. It's about guest page being mapped as not refcounted. Silly me... B.R. Yu
On Tue, Jul 04, 2023 at 04:50:50PM +0900, David Stevens <stevensd@chromium.org> wrote: > diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c > index cf2c6426a6fc..46c681dc45e6 100644 > --- a/arch/x86/kvm/mmu/spte.c > +++ b/arch/x86/kvm/mmu/spte.c > @@ -138,7 +138,7 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, > const struct kvm_memory_slot *slot, > unsigned int pte_access, gfn_t gfn, kvm_pfn_t pfn, > u64 old_spte, bool prefetch, bool can_unsync, > - bool host_writable, u64 *new_spte) > + bool host_writable, bool is_refcounted, u64 *new_spte) > { > int level = sp->role.level; > u64 spte = SPTE_MMU_PRESENT_MASK; > @@ -188,6 +188,8 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, > > if (level > PG_LEVEL_4K) > spte |= PT_PAGE_SIZE_MASK; > + else if (is_refcounted) > + spte |= SPTE_MMU_PAGE_REFCOUNTED; Is REFCOUNTED for 4K page only? What guarantees that large page doesn't have FOLL_GET? or can we set the bit for large page? > > if (shadow_memtype_mask) > spte |= static_call(kvm_x86_get_mt_mask)(vcpu, gfn,
On Wed, Jul 5, 2023 at 7:17 PM Yu Zhang <yu.c.zhang@linux.intel.com> wrote: > > On Tue, Jul 04, 2023 at 04:50:50PM +0900, David Stevens wrote: > > From: David Stevens <stevensd@chromium.org> > > > > Stop passing FOLL_GET to __kvm_follow_pfn. This allows the host to map > > memory into the guest that is backed by un-refcounted struct pages - for > > example, higher order non-compound pages allocated by the amdgpu driver > > via ttm_pool_alloc_page. > > I guess you mean the tail pages of the higher order non-compound pages? > And as to the head page, it is said to be set to one coincidentally[*], > and shall not be considered as refcounted. IIUC, refcount of this head > page will be increased and decreased soon in hva_to_pfn_remapped(), so > this may not be a problem(?). But treating this head page differently, > as a refcounted one(e.g., to set the A/D flags), is weired. > > Or maybe I missed some context, e.g., can the head page be allocted to > guest at all? Yes, this is to allow mapping the tail pages of higher order non-compound pages - I should have been more precise in my wording. The head pages can already be mapped into the guest. Treating the head and tail pages would require changing how KVM behaves in a situation it supports today (rather than just adding support for an unsupported situation). Currently, without this series, KVM can map VM_PFNMAP|VM_IO memory backed by refcounted pages into the guest. When that happens, KVM sets the A/D flags. I'm not sure whether that's actually valid behavior, nor do I know whether anyone actually cares about it. But it's what KVM does today, and I would shy away from modifying that behavior without good reason. > > > > The bulk of this change is tracking the is_refcounted_page flag so that > > non-refcounted pages don't trigger page_count() == 0 warnings. This is > > done by storing the flag in an unused bit in the sptes. > > Also, maybe we should mention this only works on x86-64. > > > > > Signed-off-by: David Stevens <stevensd@chromium.org> > > --- > > arch/x86/kvm/mmu/mmu.c | 44 +++++++++++++++++++++------------ > > arch/x86/kvm/mmu/mmu_internal.h | 1 + > > arch/x86/kvm/mmu/paging_tmpl.h | 9 ++++--- > > arch/x86/kvm/mmu/spte.c | 4 ++- > > arch/x86/kvm/mmu/spte.h | 12 ++++++++- > > arch/x86/kvm/mmu/tdp_mmu.c | 22 ++++++++++------- > > 6 files changed, 62 insertions(+), 30 deletions(-) > > > > diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c > > index e44ab512c3a1..b1607e314497 100644 > > --- a/arch/x86/kvm/mmu/mmu.c > > +++ b/arch/x86/kvm/mmu/mmu.c > > ... > > > @@ -2937,6 +2943,7 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot, > > bool host_writable = !fault || fault->map_writable; > > bool prefetch = !fault || fault->prefetch; > > bool write_fault = fault && fault->write; > > + bool is_refcounted = !fault || fault->is_refcounted_page; > > Just wonder, what if a non-refcounted page is prefetched? Or is it possible in > practice? Prefetching is still done via gfn_to_page_many_atomic, which sets FOLL_GET. That's fixable, but it's not something this series currently does. > ... 
> > > > @@ -883,7 +884,7 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, > > */ > > static int FNAME(sync_spte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, int i) > > { > > - bool host_writable; > > + bool host_writable, is_refcounted; > > gpa_t first_pte_gpa; > > u64 *sptep, spte; > > struct kvm_memory_slot *slot; > > @@ -940,10 +941,12 @@ static int FNAME(sync_spte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, int > > sptep = &sp->spt[i]; > > spte = *sptep; > > host_writable = spte & shadow_host_writable_mask; > > + // TODO: is this correct? > > + is_refcounted = spte & SPTE_MMU_PAGE_REFCOUNTED; > > slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); > > make_spte(vcpu, sp, slot, pte_access, gfn, > > spte_to_pfn(spte), spte, true, false, > > - host_writable, &spte); > > + host_writable, is_refcounted, &spte); > > Could we restrict that a non-refcounted page shall not be used as shadow page? I'm not very familiar with the shadow mmu, so my response might not make sense. But do you mean not allowing non-refcoutned pages as the guest page tables shadowed by a kvm_mmu_page? It would probably be possible to do that, and I doubt anyone would care about the restriction. But as far as I can tell, the guest page table is only accessed via kvm_vcpu_read_guest_atomic, which handles non-refcounted pages just fine. -David
On Thu, Jul 6, 2023 at 11:10 AM Isaku Yamahata <isaku.yamahata@gmail.com> wrote: > > On Tue, Jul 04, 2023 at 04:50:50PM +0900, > David Stevens <stevensd@chromium.org> wrote: > > > diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c > > index cf2c6426a6fc..46c681dc45e6 100644 > > --- a/arch/x86/kvm/mmu/spte.c > > +++ b/arch/x86/kvm/mmu/spte.c > > @@ -138,7 +138,7 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, > > const struct kvm_memory_slot *slot, > > unsigned int pte_access, gfn_t gfn, kvm_pfn_t pfn, > > u64 old_spte, bool prefetch, bool can_unsync, > > - bool host_writable, u64 *new_spte) > > + bool host_writable, bool is_refcounted, u64 *new_spte) > > { > > int level = sp->role.level; > > u64 spte = SPTE_MMU_PRESENT_MASK; > > @@ -188,6 +188,8 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, > > > > if (level > PG_LEVEL_4K) > > spte |= PT_PAGE_SIZE_MASK; > > + else if (is_refcounted) > > + spte |= SPTE_MMU_PAGE_REFCOUNTED; > > Is REFCOUNTED for 4K page only? What guarantees that large page doesn't have > FOLL_GET? or can we set the bit for large page? Oh, you're right, it should apply to >4K pages as well. This was based on stale thinking from earlier versions of this series. -David
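For concreteness, one way that fix could look in make_spte() - this is a sketch of an assumed follow-up, not code from the posted series, and it presumes the bit chosen for SPTE_MMU_PAGE_REFCOUNTED remains software-available in huge-page SPTEs:

	if (level > PG_LEVEL_4K)
		spte |= PT_PAGE_SIZE_MASK;
	/* Track refcountedness for all leaf levels, not just 4K. */
	if (is_refcounted)
		spte |= SPTE_MMU_PAGE_REFCOUNTED;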
On Thu, Jul 06, 2023 at 01:52:08PM +0900, David Stevens wrote: > On Wed, Jul 5, 2023 at 7:17 PM Yu Zhang <yu.c.zhang@linux.intel.com> wrote: > > > > On Tue, Jul 04, 2023 at 04:50:50PM +0900, David Stevens wrote: > > > From: David Stevens <stevensd@chromium.org> > > > > > > Stop passing FOLL_GET to __kvm_follow_pfn. This allows the host to map > > > memory into the guest that is backed by un-refcounted struct pages - for > > > example, higher order non-compound pages allocated by the amdgpu driver > > > via ttm_pool_alloc_page. > > > > I guess you mean the tail pages of the higher order non-compound pages? > > And as to the head page, it is said to be set to one coincidentally[*], > > and shall not be considered as refcounted. IIUC, refcount of this head > > page will be increased and decreased soon in hva_to_pfn_remapped(), so > > this may not be a problem(?). But treating this head page differently, > > as a refcounted one(e.g., to set the A/D flags), is weired. > > > > Or maybe I missed some context, e.g., can the head page be allocted to > > guest at all? > > Yes, this is to allow mapping the tail pages of higher order > non-compound pages - I should have been more precise in my wording. > The head pages can already be mapped into the guest. > > Treating the head and tail pages would require changing how KVM > behaves in a situation it supports today (rather than just adding > support for an unsupported situation). Currently, without this series, > KVM can map VM_PFNMAP|VM_IO memory backed by refcounted pages into the > guest. When that happens, KVM sets the A/D flags. I'm not sure whether > that's actually valid behavior, nor do I know whether anyone actually > cares about it. But it's what KVM does today, and I would shy away > from modifying that behavior without good reason. I know the A/D status of the refcounted, VM_PFNMAP|VM_IO backed pages will be recorded. And I have no idea if this is a necessary requirement either. But it feels awkward to see the head and the tail ones of non-compound pages be treated inconsistently. After all, the head page just happens to have its refcount being 1, it is not a real refcounted page. So I would suggest to mention such different behehavior in the commit message at least. :) > > > > > > @@ -883,7 +884,7 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, > > > */ > > > static int FNAME(sync_spte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, int i) > > > { > > > - bool host_writable; > > > + bool host_writable, is_refcounted; > > > gpa_t first_pte_gpa; > > > u64 *sptep, spte; > > > struct kvm_memory_slot *slot; > > > @@ -940,10 +941,12 @@ static int FNAME(sync_spte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, int > > > sptep = &sp->spt[i]; > > > spte = *sptep; > > > host_writable = spte & shadow_host_writable_mask; > > > + // TODO: is this correct? > > > + is_refcounted = spte & SPTE_MMU_PAGE_REFCOUNTED; > > > slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); > > > make_spte(vcpu, sp, slot, pte_access, gfn, > > > spte_to_pfn(spte), spte, true, false, > > > - host_writable, &spte); > > > + host_writable, is_refcounted, &spte); > > > > Could we restrict that a non-refcounted page shall not be used as shadow page? > > I'm not very familiar with the shadow mmu, so my response might not > make sense. But do you mean not allowing non-refcoutned pages as the > guest page tables shadowed by a kvm_mmu_page? It would probably be > possible to do that, and I doubt anyone would care about the > restriction. 
But as far as I can tell, the guest page table is only > accessed via kvm_vcpu_read_guest_atomic, which handles non-refcounted > pages just fine. Sorry, my brain just got baked... Pls just ignore this question :) B.R. Yu
On Thu, Jul 06, 2023 at 01:52:08PM +0900, David Stevens <stevensd@chromium.org> wrote: > On Wed, Jul 5, 2023 at 7:17 PM Yu Zhang <yu.c.zhang@linux.intel.com> wrote: > > > > On Tue, Jul 04, 2023 at 04:50:50PM +0900, David Stevens wrote: > > > From: David Stevens <stevensd@chromium.org> > > > > > > Stop passing FOLL_GET to __kvm_follow_pfn. This allows the host to map > > > memory into the guest that is backed by un-refcounted struct pages - for > > > example, higher order non-compound pages allocated by the amdgpu driver > > > via ttm_pool_alloc_page. > > > > I guess you mean the tail pages of the higher order non-compound pages? > > And as to the head page, it is said to be set to one coincidentally[*], > > and shall not be considered as refcounted. IIUC, refcount of this head > > page will be increased and decreased soon in hva_to_pfn_remapped(), so > > this may not be a problem(?). But treating this head page differently, > > as a refcounted one(e.g., to set the A/D flags), is weired. > > > > Or maybe I missed some context, e.g., can the head page be allocted to > > guest at all? > > Yes, this is to allow mapping the tail pages of higher order > non-compound pages - I should have been more precise in my wording. > The head pages can already be mapped into the guest. > > Treating the head and tail pages would require changing how KVM > behaves in a situation it supports today (rather than just adding > support for an unsupported situation). Currently, without this series, > KVM can map VM_PFNMAP|VM_IO memory backed by refcounted pages into the > guest. When that happens, KVM sets the A/D flags. I'm not sure whether > that's actually valid behavior, nor do I know whether anyone actually > cares about it. But it's what KVM does today, and I would shy away > from modifying that behavior without good reason. > > > > > > > The bulk of this change is tracking the is_refcounted_page flag so that > > > non-refcounted pages don't trigger page_count() == 0 warnings. This is > > > done by storing the flag in an unused bit in the sptes. > > > > Also, maybe we should mention this only works on x86-64. > > > > > > > > Signed-off-by: David Stevens <stevensd@chromium.org> > > > --- > > > arch/x86/kvm/mmu/mmu.c | 44 +++++++++++++++++++++------------ > > > arch/x86/kvm/mmu/mmu_internal.h | 1 + > > > arch/x86/kvm/mmu/paging_tmpl.h | 9 ++++--- > > > arch/x86/kvm/mmu/spte.c | 4 ++- > > > arch/x86/kvm/mmu/spte.h | 12 ++++++++- > > > arch/x86/kvm/mmu/tdp_mmu.c | 22 ++++++++++------- > > > 6 files changed, 62 insertions(+), 30 deletions(-) > > > > > > diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c > > > index e44ab512c3a1..b1607e314497 100644 > > > --- a/arch/x86/kvm/mmu/mmu.c > > > +++ b/arch/x86/kvm/mmu/mmu.c > > > > ... > > > > > @@ -2937,6 +2943,7 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot, > > > bool host_writable = !fault || fault->map_writable; > > > bool prefetch = !fault || fault->prefetch; > > > bool write_fault = fault && fault->write; > > > + bool is_refcounted = !fault || fault->is_refcounted_page; > > > > Just wonder, what if a non-refcounted page is prefetched? Or is it possible in > > practice? > > Prefetching is still done via gfn_to_page_many_atomic, which sets > FOLL_GET. That's fixable, but it's not something this series currently > does. So if we prefetch a page, REFCOUNTED bit is cleared unconditionally with this hunk. kvm_set_page_{dirty, accessed} won't be called as expected for prefetched spte. 
If I read the patch correctly, the REFCOUNTED bit in the SPTE should represent whether the corresponding page is refcountable or not, right? Because direct_pte_prefetch_many() is for the legacy KVM MMU and FNAME(prefetch_pte) is for shadow paging, we would need to test with the legacy KVM MMU or shadow paging to hit the issue, though. Thanks,
On Fri, Jul 7, 2023 at 12:58 AM Isaku Yamahata <isaku.yamahata@gmail.com> wrote: > > On Thu, Jul 06, 2023 at 01:52:08PM +0900, > David Stevens <stevensd@chromium.org> wrote: > > > On Wed, Jul 5, 2023 at 7:17 PM Yu Zhang <yu.c.zhang@linux.intel.com> wrote: > > > > > > On Tue, Jul 04, 2023 at 04:50:50PM +0900, David Stevens wrote: > > > > From: David Stevens <stevensd@chromium.org> > > > > > > > > Stop passing FOLL_GET to __kvm_follow_pfn. This allows the host to map > > > > memory into the guest that is backed by un-refcounted struct pages - for > > > > example, higher order non-compound pages allocated by the amdgpu driver > > > > via ttm_pool_alloc_page. > > > > > > I guess you mean the tail pages of the higher order non-compound pages? > > > And as to the head page, it is said to be set to one coincidentally[*], > > > and shall not be considered as refcounted. IIUC, refcount of this head > > > page will be increased and decreased soon in hva_to_pfn_remapped(), so > > > this may not be a problem(?). But treating this head page differently, > > > as a refcounted one(e.g., to set the A/D flags), is weired. > > > > > > Or maybe I missed some context, e.g., can the head page be allocted to > > > guest at all? > > > > Yes, this is to allow mapping the tail pages of higher order > > non-compound pages - I should have been more precise in my wording. > > The head pages can already be mapped into the guest. > > > > Treating the head and tail pages would require changing how KVM > > behaves in a situation it supports today (rather than just adding > > support for an unsupported situation). Currently, without this series, > > KVM can map VM_PFNMAP|VM_IO memory backed by refcounted pages into the > > guest. When that happens, KVM sets the A/D flags. I'm not sure whether > > that's actually valid behavior, nor do I know whether anyone actually > > cares about it. But it's what KVM does today, and I would shy away > > from modifying that behavior without good reason. > > > > > > > > > > The bulk of this change is tracking the is_refcounted_page flag so that > > > > non-refcounted pages don't trigger page_count() == 0 warnings. This is > > > > done by storing the flag in an unused bit in the sptes. > > > > > > Also, maybe we should mention this only works on x86-64. > > > > > > > > > > > Signed-off-by: David Stevens <stevensd@chromium.org> > > > > --- > > > > arch/x86/kvm/mmu/mmu.c | 44 +++++++++++++++++++++------------ > > > > arch/x86/kvm/mmu/mmu_internal.h | 1 + > > > > arch/x86/kvm/mmu/paging_tmpl.h | 9 ++++--- > > > > arch/x86/kvm/mmu/spte.c | 4 ++- > > > > arch/x86/kvm/mmu/spte.h | 12 ++++++++- > > > > arch/x86/kvm/mmu/tdp_mmu.c | 22 ++++++++++------- > > > > 6 files changed, 62 insertions(+), 30 deletions(-) > > > > > > > > diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c > > > > index e44ab512c3a1..b1607e314497 100644 > > > > --- a/arch/x86/kvm/mmu/mmu.c > > > > +++ b/arch/x86/kvm/mmu/mmu.c > > > > > > ... > > > > > > > @@ -2937,6 +2943,7 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot, > > > > bool host_writable = !fault || fault->map_writable; > > > > bool prefetch = !fault || fault->prefetch; > > > > bool write_fault = fault && fault->write; > > > > + bool is_refcounted = !fault || fault->is_refcounted_page; > > > > > > Just wonder, what if a non-refcounted page is prefetched? Or is it possible in > > > practice? > > > > Prefetching is still done via gfn_to_page_many_atomic, which sets > > FOLL_GET. 
That's fixable, but it's not something this series currently > > does. > > So if we prefetch a page, REFCOUNTED bit is cleared unconditionally with this > hunk. kvm_set_page_{dirty, accessed} won't be called as expected for prefetched > spte. If I read the patch correctly, REFCOUNTED bit in SPTE should represent > whether the corresponding page is ref-countable or not, right? > > Because direct_pte_prefetch_many() is for legacy KVM MMU and FNAME(prefetch_pte) > is shadow paging, we need to test it with legacy KVM MMU or shadow paging to hit > the issue, though. > direct_pte_prefetch_many and prefetch_gpte both pass NULL for the fault parameter, so is_refcounted will evaluate to true. So the spte's refcounted bit will get set in that case. -David
On Fri, Jul 07, 2023 at 10:35:02AM +0900, David Stevens <stevensd@chromium.org> wrote: > > > > > diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c > > > > > index e44ab512c3a1..b1607e314497 100644 > > > > > --- a/arch/x86/kvm/mmu/mmu.c > > > > > +++ b/arch/x86/kvm/mmu/mmu.c > > > > > > > > ... > > > > > > > > > @@ -2937,6 +2943,7 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot, > > > > > bool host_writable = !fault || fault->map_writable; > > > > > bool prefetch = !fault || fault->prefetch; > > > > > bool write_fault = fault && fault->write; > > > > > + bool is_refcounted = !fault || fault->is_refcounted_page; > > > > > > > > Just wonder, what if a non-refcounted page is prefetched? Or is it possible in > > > > practice? > > > > > > Prefetching is still done via gfn_to_page_many_atomic, which sets > > > FOLL_GET. That's fixable, but it's not something this series currently > > > does. > > > > So if we prefetch a page, REFCOUNTED bit is cleared unconditionally with this > > hunk. kvm_set_page_{dirty, accessed} won't be called as expected for prefetched > > spte. If I read the patch correctly, REFCOUNTED bit in SPTE should represent > > whether the corresponding page is ref-countable or not, right? > > > > Because direct_pte_prefetch_many() is for legacy KVM MMU and FNAME(prefetch_pte) > > is shadow paging, we need to test it with legacy KVM MMU or shadow paging to hit > > the issue, though. > > > > direct_pte_prefetch_many and prefetch_gpte both pass NULL for the > fault parameter, so is_refcounted will evaluate to true. So the spte's > refcounted bit will get set in that case. Oops, my bad. My point is "unconditionally". Is the bit always set for non-refcountable pages? Or are non-refcountable pages never prefetched?
On Tue, Jul 11, 2023 at 1:34 AM Isaku Yamahata <isaku.yamahata@gmail.com> wrote: > > On Fri, Jul 07, 2023 at 10:35:02AM +0900, > David Stevens <stevensd@chromium.org> wrote: > > > > > > > diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c > > > > > > index e44ab512c3a1..b1607e314497 100644 > > > > > > --- a/arch/x86/kvm/mmu/mmu.c > > > > > > +++ b/arch/x86/kvm/mmu/mmu.c > > > > > > > > > > ... > > > > > > > > > > > @@ -2937,6 +2943,7 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot, > > > > > > bool host_writable = !fault || fault->map_writable; > > > > > > bool prefetch = !fault || fault->prefetch; > > > > > > bool write_fault = fault && fault->write; > > > > > > + bool is_refcounted = !fault || fault->is_refcounted_page; > > > > > > > > > > Just wonder, what if a non-refcounted page is prefetched? Or is it possible in > > > > > practice? > > > > > > > > Prefetching is still done via gfn_to_page_many_atomic, which sets > > > > FOLL_GET. That's fixable, but it's not something this series currently > > > > does. > > > > > > So if we prefetch a page, REFCOUNTED bit is cleared unconditionally with this > > > hunk. kvm_set_page_{dirty, accessed} won't be called as expected for prefetched > > > spte. If I read the patch correctly, REFCOUNTED bit in SPTE should represent > > > whether the corresponding page is ref-countable or not, right? > > > > > > Because direct_pte_prefetch_many() is for legacy KVM MMU and FNAME(prefetch_pte) > > > is shadow paging, we need to test it with legacy KVM MMU or shadow paging to hit > > > the issue, though. > > > > > > > direct_pte_prefetch_many and prefetch_gpte both pass NULL for the > > fault parameter, so is_refcounted will evaluate to true. So the spte's > > refcounted bit will get set in that case. > > Oops, my bad. My point is "unconditionally". Is the bit always set for > non-refcountable pages? Or non-refcountable pages are not prefeched? The bit is never set for non-refcounted pages, and is always set for refcounted pages. The current series never prefetches non-refcounted pages, since it continues to use the gfn_to_page_many_atomic API. -David
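To spell out why the prefetch paths end up with the bit set, here is the relevant line from mmu_set_spte() in the patch, annotated with the reasoning from this exchange (the comment is annotation, not part of the series):

	/*
	 * The prefetch paths (direct_pte_prefetch_many() and the FNAME()
	 * prefetch helper discussed above) call mmu_set_spte() with
	 * fault == NULL, so is_refcounted evaluates to true for them.  That
	 * matches reality because those paths still take a reference via
	 * gfn_to_page_many_atomic(), i.e. with FOLL_GET, which only returns
	 * refcounted pages.
	 */
	bool is_refcounted = !fault || fault->is_refcounted_page;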
On Tue, Jul 04, 2023 at 04:50:50PM +0900, David Stevens wrote: > @@ -4451,7 +4461,8 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault > > out_unlock: > write_unlock(&vcpu->kvm->mmu_lock); > - kvm_release_pfn_clean(fault->pfn); > + if (fault->is_refcounted_page) > + kvm_set_page_accessed(pfn_to_page(fault->pfn)); For a refcounted page, as now KVM puts its ref early in kvm_faultin_pfn(), should this kvm_set_page_accessed() be placed before unlocking mmu_lock? Otherwise, if the user unmaps a region (which triggers kvm_unmap_gfn_range() with mmu_lock holding for write), and release the page, and if the two steps happen after checking page_count() in kvm_set_page_accessed() and before mark_page_accessed(), the latter function may mark accessed to a page that is released or does not belong to current process. Is it true? > return r; > } > > @@ -4529,7 +4540,8 @@ static int kvm_tdp_mmu_page_fault(struct kvm_vcpu *vcpu, > > out_unlock: > read_unlock(&vcpu->kvm->mmu_lock); > - kvm_release_pfn_clean(fault->pfn); > + if (fault->is_refcounted_page) > + kvm_set_page_accessed(pfn_to_page(fault->pfn)); > return r; > } Ditto.
On Wed, Jul 19, 2023 at 3:35 PM Yan Zhao <yan.y.zhao@intel.com> wrote: > > On Tue, Jul 04, 2023 at 04:50:50PM +0900, David Stevens wrote: > > @@ -4451,7 +4461,8 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault > > > > out_unlock: > > write_unlock(&vcpu->kvm->mmu_lock); > > - kvm_release_pfn_clean(fault->pfn); > > + if (fault->is_refcounted_page) > > + kvm_set_page_accessed(pfn_to_page(fault->pfn)); > For a refcounted page, as now KVM puts its ref early in kvm_faultin_pfn(), > should this kvm_set_page_accessed() be placed before unlocking mmu_lock? > > Otherwise, if the user unmaps a region (which triggers kvm_unmap_gfn_range() > with mmu_lock holding for write), and release the page, and if the two > steps happen after checking page_count() in kvm_set_page_accessed() and > before mark_page_accessed(), the latter function may mark accessed to a page > that is released or does not belong to current process. > > Is it true? Yes, good catch. During some testing last week, I actually found this bug thanks to the WARN_ON the first patch in this series added to kvm_is_ad_tracked_page. I'll fix it in the next revision, after Sean gets a chance to comment on the series. Thanks, David
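A minimal sketch of the fix being agreed on here - moving the accessed update under mmu_lock so it cannot race with kvm_unmap_gfn_range() releasing the page; this anticipates the next revision rather than quoting it:

 out_unlock:
	if (fault->is_refcounted_page)
		kvm_set_page_accessed(pfn_to_page(fault->pfn));
	write_unlock(&vcpu->kvm->mmu_lock);
	return r;

The same reordering would apply to the read_unlock() path in kvm_tdp_mmu_page_fault() and to FNAME(page_fault).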
On Thu, Jul 06, 2023, David Stevens wrote: > On Wed, Jul 5, 2023 at 7:17 PM Yu Zhang <yu.c.zhang@linux.intel.com> wrote: > > > > On Tue, Jul 04, 2023 at 04:50:50PM +0900, David Stevens wrote: > > > From: David Stevens <stevensd@chromium.org> > > > > > > Stop passing FOLL_GET to __kvm_follow_pfn. This allows the host to map > > > memory into the guest that is backed by un-refcounted struct pages - for > > > example, higher order non-compound pages allocated by the amdgpu driver > > > via ttm_pool_alloc_page. > > > > I guess you mean the tail pages of the higher order non-compound pages? > > And as to the head page, it is said to be set to one coincidentally[*], > > and shall not be considered as refcounted. IIUC, refcount of this head > > page will be increased and decreased soon in hva_to_pfn_remapped(), so > > this may not be a problem(?). But treating this head page differently, > > as a refcounted one(e.g., to set the A/D flags), is weired. > > > > Or maybe I missed some context, e.g., can the head page be allocted to > > guest at all? > > Yes, this is to allow mapping the tail pages of higher order > non-compound pages - I should have been more precise in my wording. > The head pages can already be mapped into the guest. Recording for posterity (or to make an incorrect statement and get corrected), because I recently had a conversation about the head page not actually being refcounted. (I can't remember with whom I had the conversation, but I'm pretty sure it wasn't an imaginary friend). Even though whatever allocates the page doesn't explicitly refcount the head page, __free_pages() will still do the right thing and keep the head page around until its last reference is put. And my understanding is that even though it's a "head" page, it's not a PG_head page, i.e. not a compound page and so is treated as an order-0 page when KVM invokes put_page(). void __free_pages(struct page *page, unsigned int order) { /* get PageHead before we drop reference */ int head = PageHead(page); if (put_page_testzero(page)) <=== will evaluate false if KVM holds a ref free_the_page(page, order); else if (!head) <=== will be false for non-compound pages while (order-- > 0) free_the_page(page + (1 << order), order); } EXPORT_SYMBOL(__free_pages);
On Wed, Jul 5, 2023 at 7:25 PM Yu Zhang <yu.c.zhang@linux.intel.com> wrote: > > On Tue, Jul 04, 2023 at 04:50:50PM +0900, David Stevens wrote: > > From: David Stevens <stevensd@chromium.org> > > > > Stop passing FOLL_GET to __kvm_follow_pfn. This allows the host to map > > memory into the guest that is backed by un-refcounted struct pages - for > > example, higher order non-compound pages allocated by the amdgpu driver > > via ttm_pool_alloc_page. > > > > The bulk of this change is tracking the is_refcounted_page flag so that > > non-refcounted pages don't trigger page_count() == 0 warnings. This is > > done by storing the flag in an unused bit in the sptes. > > > > Signed-off-by: David Stevens <stevensd@chromium.org> > > --- > > arch/x86/kvm/mmu/mmu.c | 44 +++++++++++++++++++++------------ > > arch/x86/kvm/mmu/mmu_internal.h | 1 + > > arch/x86/kvm/mmu/paging_tmpl.h | 9 ++++--- > > arch/x86/kvm/mmu/spte.c | 4 ++- > > arch/x86/kvm/mmu/spte.h | 12 ++++++++- > > arch/x86/kvm/mmu/tdp_mmu.c | 22 ++++++++++------- > > 6 files changed, 62 insertions(+), 30 deletions(-) > > > > diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c > > index e44ab512c3a1..b1607e314497 100644 > > --- a/arch/x86/kvm/mmu/mmu.c > > +++ b/arch/x86/kvm/mmu/mmu.c > > @@ -553,12 +553,14 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte) > > > > if (is_accessed_spte(old_spte) && !is_accessed_spte(new_spte)) { > > flush = true; > > - kvm_set_pfn_accessed(spte_to_pfn(old_spte)); > > + if (is_refcounted_page_pte(old_spte)) > > + kvm_set_page_accessed(pfn_to_page(spte_to_pfn(old_spte))); > > } > > > > if (is_dirty_spte(old_spte) && !is_dirty_spte(new_spte)) { > > flush = true; > > - kvm_set_pfn_dirty(spte_to_pfn(old_spte)); > > + if (is_refcounted_page_pte(old_spte)) > > + kvm_set_page_dirty(pfn_to_page(spte_to_pfn(old_spte))); > > } > > > > return flush; > > @@ -596,14 +598,18 @@ static u64 mmu_spte_clear_track_bits(struct kvm *kvm, u64 *sptep) > > * before they are reclaimed. Sanity check that, if the pfn is backed > > * by a refcounted page, the refcount is elevated. > > */ > > - page = kvm_pfn_to_refcounted_page(pfn); > > - WARN_ON(page && !page_count(page)); > > + if (is_refcounted_page_pte(old_spte)) { > > + page = kvm_pfn_to_refcounted_page(pfn); > > + WARN_ON(!page || !page_count(page)); > > + } > > > > - if (is_accessed_spte(old_spte)) > > - kvm_set_pfn_accessed(pfn); > > + if (is_refcounted_page_pte(old_spte)) { > > + if (is_accessed_spte(old_spte)) > > + kvm_set_page_accessed(pfn_to_page(pfn)); > > > > - if (is_dirty_spte(old_spte)) > > - kvm_set_pfn_dirty(pfn); > > + if (is_dirty_spte(old_spte)) > > + kvm_set_page_dirty(pfn_to_page(pfn)); > > + } > > > > return old_spte; > > } > > @@ -639,8 +645,8 @@ static bool mmu_spte_age(u64 *sptep) > > * Capture the dirty status of the page, so that it doesn't get > > * lost when the SPTE is marked for access tracking. 
> > */ > > - if (is_writable_pte(spte)) > > - kvm_set_pfn_dirty(spte_to_pfn(spte)); > > + if (is_writable_pte(spte) && is_refcounted_page_pte(spte)) > > + kvm_set_page_dirty(pfn_to_page(spte_to_pfn(spte))); > > > > spte = mark_spte_for_access_track(spte); > > mmu_spte_update_no_track(sptep, spte); > > @@ -1278,8 +1284,8 @@ static bool spte_wrprot_for_clear_dirty(u64 *sptep) > > { > > bool was_writable = test_and_clear_bit(PT_WRITABLE_SHIFT, > > (unsigned long *)sptep); > > - if (was_writable && !spte_ad_enabled(*sptep)) > > - kvm_set_pfn_dirty(spte_to_pfn(*sptep)); > > + if (was_writable && !spte_ad_enabled(*sptep) && is_refcounted_page_pte(*sptep)) > > + kvm_set_page_dirty(pfn_to_page(spte_to_pfn(*sptep))); > > > > return was_writable; > > } > > @@ -2937,6 +2943,7 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot, > > bool host_writable = !fault || fault->map_writable; > > bool prefetch = !fault || fault->prefetch; > > bool write_fault = fault && fault->write; > > + bool is_refcounted = !fault || fault->is_refcounted_page; > > > > pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__, > > *sptep, write_fault, gfn); > > @@ -2969,7 +2976,7 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot, > > } > > > > wrprot = make_spte(vcpu, sp, slot, pte_access, gfn, pfn, *sptep, prefetch, > > - true, host_writable, &spte); > > + true, host_writable, is_refcounted, &spte); > > > > if (*sptep == spte) { > > ret = RET_PF_SPURIOUS; > > @@ -4299,8 +4306,9 @@ static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault > > struct kvm_follow_pfn foll = { > > .slot = slot, > > .gfn = fault->gfn, > > - .flags = FOLL_GET | (fault->write ? FOLL_WRITE : 0), > > + .flags = fault->write ? FOLL_WRITE : 0, > > .allow_write_mapping = true, > > + .guarded_by_mmu_notifier = true, > > }; > > > > /* > > @@ -4317,6 +4325,7 @@ static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault > > fault->slot = NULL; > > fault->pfn = KVM_PFN_NOSLOT; > > fault->map_writable = false; > > + fault->is_refcounted_page = false; > > return RET_PF_CONTINUE; > > } > > /* > > @@ -4366,6 +4375,7 @@ static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault > > success: > > fault->hva = foll.hva; > > fault->map_writable = foll.writable; > > + fault->is_refcounted_page = foll.is_refcounted_page; > > return RET_PF_CONTINUE; > > } > > > > @@ -4451,7 +4461,8 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault > > > > out_unlock: > > write_unlock(&vcpu->kvm->mmu_lock); > > - kvm_release_pfn_clean(fault->pfn); > > + if (fault->is_refcounted_page) > > + kvm_set_page_accessed(pfn_to_page(fault->pfn)); > > return r; > > } > > > > @@ -4529,7 +4540,8 @@ static int kvm_tdp_mmu_page_fault(struct kvm_vcpu *vcpu, > > > > out_unlock: > > read_unlock(&vcpu->kvm->mmu_lock); > > - kvm_release_pfn_clean(fault->pfn); > > Yet kvm_release_pfn() can still be triggered for the kvm_vcpu_maped gfns. > What if guest uses a non-referenced page(e.g., as a vmcs12)? Although I > believe this is not gonna happen in real world... kvm_vcpu_map still uses gfn_to_pfn, which eventually passes FOLL_GET to __kvm_follow_pfn. So if a guest tries to use a non-refcounted page like that, then kvm_vcpu_map will fail and the guest will probably crash. It won't trigger any bugs in the host, though. It is unfortunate that the guest will be able to use certain types of memory for some purposes but not for others. 
However, while it is theoretically fixable, it's an unreasonable amount of work for something that, as you say, nobody really cares about in practice [1]. [1] https://lore.kernel.org/all/ZBEEQtmtNPaEqU1i@google.com/ -David
On Thu, Aug 24, 2023, David Stevens wrote: > On Wed, Jul 5, 2023 at 7:25 PM Yu Zhang <yu.c.zhang@linux.intel.com> wrote: > > > > On Tue, Jul 04, 2023 at 04:50:50PM +0900, David Stevens wrote: > > > @@ -4529,7 +4540,8 @@ static int kvm_tdp_mmu_page_fault(struct kvm_vcpu *vcpu, > > > > > > out_unlock: > > > read_unlock(&vcpu->kvm->mmu_lock); > > > - kvm_release_pfn_clean(fault->pfn); > > > > Yet kvm_release_pfn() can still be triggered for the kvm_vcpu_maped gfns. > > What if guest uses a non-referenced page(e.g., as a vmcs12)? Although I > > believe this is not gonna happen in real world... > > kvm_vcpu_map still uses gfn_to_pfn, which eventually passes FOLL_GET > to __kvm_follow_pfn. So if a guest tries to use a non-refcounted page > like that, then kvm_vcpu_map will fail and the guest will probably > crash. It won't trigger any bugs in the host, though. > > It is unfortunate that the guest will be able to use certain types of > memory for some purposes but not for others. However, while it is > theoretically fixable, it's an unreasonable amount of work for > something that, as you say, nobody really cares about in practice [1]. > > [1] https://lore.kernel.org/all/ZBEEQtmtNPaEqU1i@google.com/ There are use cases that care, which is why I suggested allow_unsafe_kmap. Specifically, AWS manages their pool of guest memory in userspace and maps it all via /dev/mem. Without that module param to let userspace opt-in, this series will break such setups. It still arguably is a breaking change since it requires userspace to opt-in, but allowing such behavior by default is simply not a viable option, and I don't have much sympathy since so much of this mess has its origins in commit e45adf665a53 ("KVM: Introduce a new guest mapping API"). The use cases that no one cares about (AFAIK) is allowing _untrusted_ userspace to back guest RAM with arbitrary memory. In other words, I want KVM to allow (by default) mapping device memory into the guest for things like vGPUs, without having to do the massive and invasive overhaul needed to safely allow backing guest RAM with completely arbitrary memory.
On Fri, Aug 25, 2023 at 12:15 AM Sean Christopherson <seanjc@google.com> wrote: > > On Thu, Aug 24, 2023, David Stevens wrote: > > On Wed, Jul 5, 2023 at 7:25 PM Yu Zhang <yu.c.zhang@linux.intel.com> wrote: > > > > > > On Tue, Jul 04, 2023 at 04:50:50PM +0900, David Stevens wrote: > > > > @@ -4529,7 +4540,8 @@ static int kvm_tdp_mmu_page_fault(struct kvm_vcpu *vcpu, > > > > > > > > out_unlock: > > > > read_unlock(&vcpu->kvm->mmu_lock); > > > > - kvm_release_pfn_clean(fault->pfn); > > > > > > Yet kvm_release_pfn() can still be triggered for the kvm_vcpu_maped gfns. > > > What if guest uses a non-referenced page(e.g., as a vmcs12)? Although I > > > believe this is not gonna happen in real world... > > > > kvm_vcpu_map still uses gfn_to_pfn, which eventually passes FOLL_GET > > to __kvm_follow_pfn. So if a guest tries to use a non-refcounted page > > like that, then kvm_vcpu_map will fail and the guest will probably > > crash. It won't trigger any bugs in the host, though. > > > > It is unfortunate that the guest will be able to use certain types of > > memory for some purposes but not for others. However, while it is > > theoretically fixable, it's an unreasonable amount of work for > > something that, as you say, nobody really cares about in practice [1]. > > > > [1] https://lore.kernel.org/all/ZBEEQtmtNPaEqU1i@google.com/ > > There are use cases that care, which is why I suggested allow_unsafe_kmap. > Specifically, AWS manages their pool of guest memory in userspace and maps it all > via /dev/mem. Without that module param to let userspace opt-in, this series will > break such setups. It still arguably is a breaking change since it requires > userspace to opt-in, but allowing such behavior by default is simply not a viable > option, and I don't have much sympathy since so much of this mess has its origins > in commit e45adf665a53 ("KVM: Introduce a new guest mapping API"). > > The use cases that no one cares about (AFAIK) is allowing _untrusted_ userspace > to back guest RAM with arbitrary memory. In other words, I want KVM to allow > (by default) mapping device memory into the guest for things like vGPUs, without > having to do the massive and invasive overhaul needed to safely allow backing guest > RAM with completely arbitrary memory. Do you specifically want the allow_unsafe_kmap breaking change? v7 of this series should have supported everything that is currently supported by KVM, but you're right that the v8 version of hva_to_pfn_remapped doesn't support mapping !kvm_pfn_to_refcounted_page() pages. That could be supported explicitly with allow_unsafe_kmap as you suggested, or it could be supported implicitly with this implementation: @@ -2774,8 +2771,14 @@ static int hva_to_pfn_remapped(struct vm_area_struct *vma, * would then underflow the refcount when the caller does the * required put_page. Don't allow those pages here. */ - if (!kvm_try_get_pfn(pfn)) - r = -EFAULT; + page = kvm_pfn_to_refcounted_page(pfn); + if (page) { + if (get_page_unless_zero(page)) + WARN_ON_ONCE(kvm_follow_refcounted_pfn(foll, page) != pfn); + + if (!foll->is_refcounted_page && !foll->guarded_by_mmu_notifier) + r = -EFAULT; + } -David
On Fri, Aug 25, 2023, David Stevens wrote: > On Fri, Aug 25, 2023 at 12:15 AM Sean Christopherson <seanjc@google.com> wrote: > > > > On Thu, Aug 24, 2023, David Stevens wrote: > > > On Wed, Jul 5, 2023 at 7:25 PM Yu Zhang <yu.c.zhang@linux.intel.com> wrote: > > > > > > > > On Tue, Jul 04, 2023 at 04:50:50PM +0900, David Stevens wrote: > > > > > @@ -4529,7 +4540,8 @@ static int kvm_tdp_mmu_page_fault(struct kvm_vcpu *vcpu, > > > > > > > > > > out_unlock: > > > > > read_unlock(&vcpu->kvm->mmu_lock); > > > > > - kvm_release_pfn_clean(fault->pfn); > > > > > > > > Yet kvm_release_pfn() can still be triggered for the kvm_vcpu_maped gfns. > > > > What if guest uses a non-referenced page(e.g., as a vmcs12)? Although I > > > > believe this is not gonna happen in real world... > > > > > > kvm_vcpu_map still uses gfn_to_pfn, which eventually passes FOLL_GET > > > to __kvm_follow_pfn. So if a guest tries to use a non-refcounted page > > > like that, then kvm_vcpu_map will fail and the guest will probably > > > crash. It won't trigger any bugs in the host, though. > > > > > > It is unfortunate that the guest will be able to use certain types of > > > memory for some purposes but not for others. However, while it is > > > theoretically fixable, it's an unreasonable amount of work for > > > something that, as you say, nobody really cares about in practice [1]. > > > > > > [1] https://lore.kernel.org/all/ZBEEQtmtNPaEqU1i@google.com/ > > > > There are use cases that care, which is why I suggested allow_unsafe_kmap. > > Specifically, AWS manages their pool of guest memory in userspace and maps it all > > via /dev/mem. Without that module param to let userspace opt-in, this series will > > break such setups. It still arguably is a breaking change since it requires > > userspace to opt-in, but allowing such behavior by default is simply not a viable > > option, and I don't have much sympathy since so much of this mess has its origins > > in commit e45adf665a53 ("KVM: Introduce a new guest mapping API"). > > > > The use cases that no one cares about (AFAIK) is allowing _untrusted_ userspace > > to back guest RAM with arbitrary memory. In other words, I want KVM to allow > > (by default) mapping device memory into the guest for things like vGPUs, without > > having to do the massive and invasive overhaul needed to safely allow backing guest > > RAM with completely arbitrary memory. > > Do you specifically want the allow_unsafe_kmap breaking change? v7 of > this series should have supported everything that is currently > supported by KVM, but you're right that the v8 version of > hva_to_pfn_remapped doesn't support mapping > !kvm_pfn_to_refcounted_page() pages. That could be supported > explicitly with allow_unsafe_kmap as you suggested, I think it needs to be explicit, i.e. needs the admin to opt-in to the unsafe behavior.
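For reference, a sketch of what that explicit opt-in could look like - the allow_unsafe_kmap name comes from Sean's suggestion, but the parameter wiring and the placement of the check are assumptions, not code from this series:

	/*
	 * Let the admin opt in to mapping memory that is not backed by
	 * refcounted struct pages (e.g. guest RAM managed via /dev/mem) for
	 * FOLL_GET users such as kvm_vcpu_map().
	 */
	static bool allow_unsafe_kmap;
	module_param(allow_unsafe_kmap, bool, 0444);

	/* In hva_to_pfn_remapped(), for a pfn with no refcounted page: */
	if (!page && !foll->guarded_by_mmu_notifier && !allow_unsafe_kmap)
		r = -EFAULT;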
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index e44ab512c3a1..b1607e314497 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -553,12 +553,14 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte) if (is_accessed_spte(old_spte) && !is_accessed_spte(new_spte)) { flush = true; - kvm_set_pfn_accessed(spte_to_pfn(old_spte)); + if (is_refcounted_page_pte(old_spte)) + kvm_set_page_accessed(pfn_to_page(spte_to_pfn(old_spte))); } if (is_dirty_spte(old_spte) && !is_dirty_spte(new_spte)) { flush = true; - kvm_set_pfn_dirty(spte_to_pfn(old_spte)); + if (is_refcounted_page_pte(old_spte)) + kvm_set_page_dirty(pfn_to_page(spte_to_pfn(old_spte))); } return flush; @@ -596,14 +598,18 @@ static u64 mmu_spte_clear_track_bits(struct kvm *kvm, u64 *sptep) * before they are reclaimed. Sanity check that, if the pfn is backed * by a refcounted page, the refcount is elevated. */ - page = kvm_pfn_to_refcounted_page(pfn); - WARN_ON(page && !page_count(page)); + if (is_refcounted_page_pte(old_spte)) { + page = kvm_pfn_to_refcounted_page(pfn); + WARN_ON(!page || !page_count(page)); + } - if (is_accessed_spte(old_spte)) - kvm_set_pfn_accessed(pfn); + if (is_refcounted_page_pte(old_spte)) { + if (is_accessed_spte(old_spte)) + kvm_set_page_accessed(pfn_to_page(pfn)); - if (is_dirty_spte(old_spte)) - kvm_set_pfn_dirty(pfn); + if (is_dirty_spte(old_spte)) + kvm_set_page_dirty(pfn_to_page(pfn)); + } return old_spte; } @@ -639,8 +645,8 @@ static bool mmu_spte_age(u64 *sptep) * Capture the dirty status of the page, so that it doesn't get * lost when the SPTE is marked for access tracking. */ - if (is_writable_pte(spte)) - kvm_set_pfn_dirty(spte_to_pfn(spte)); + if (is_writable_pte(spte) && is_refcounted_page_pte(spte)) + kvm_set_page_dirty(pfn_to_page(spte_to_pfn(spte))); spte = mark_spte_for_access_track(spte); mmu_spte_update_no_track(sptep, spte); @@ -1278,8 +1284,8 @@ static bool spte_wrprot_for_clear_dirty(u64 *sptep) { bool was_writable = test_and_clear_bit(PT_WRITABLE_SHIFT, (unsigned long *)sptep); - if (was_writable && !spte_ad_enabled(*sptep)) - kvm_set_pfn_dirty(spte_to_pfn(*sptep)); + if (was_writable && !spte_ad_enabled(*sptep) && is_refcounted_page_pte(*sptep)) + kvm_set_page_dirty(pfn_to_page(spte_to_pfn(*sptep))); return was_writable; } @@ -2937,6 +2943,7 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot, bool host_writable = !fault || fault->map_writable; bool prefetch = !fault || fault->prefetch; bool write_fault = fault && fault->write; + bool is_refcounted = !fault || fault->is_refcounted_page; pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__, *sptep, write_fault, gfn); @@ -2969,7 +2976,7 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot, } wrprot = make_spte(vcpu, sp, slot, pte_access, gfn, pfn, *sptep, prefetch, - true, host_writable, &spte); + true, host_writable, is_refcounted, &spte); if (*sptep == spte) { ret = RET_PF_SPURIOUS; @@ -4299,8 +4306,9 @@ static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault struct kvm_follow_pfn foll = { .slot = slot, .gfn = fault->gfn, - .flags = FOLL_GET | (fault->write ? FOLL_WRITE : 0), + .flags = fault->write ? 
FOLL_WRITE : 0, .allow_write_mapping = true, + .guarded_by_mmu_notifier = true, }; /* @@ -4317,6 +4325,7 @@ static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault fault->slot = NULL; fault->pfn = KVM_PFN_NOSLOT; fault->map_writable = false; + fault->is_refcounted_page = false; return RET_PF_CONTINUE; } /* @@ -4366,6 +4375,7 @@ static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault success: fault->hva = foll.hva; fault->map_writable = foll.writable; + fault->is_refcounted_page = foll.is_refcounted_page; return RET_PF_CONTINUE; } @@ -4451,7 +4461,8 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault out_unlock: write_unlock(&vcpu->kvm->mmu_lock); - kvm_release_pfn_clean(fault->pfn); + if (fault->is_refcounted_page) + kvm_set_page_accessed(pfn_to_page(fault->pfn)); return r; } @@ -4529,7 +4540,8 @@ static int kvm_tdp_mmu_page_fault(struct kvm_vcpu *vcpu, out_unlock: read_unlock(&vcpu->kvm->mmu_lock); - kvm_release_pfn_clean(fault->pfn); + if (fault->is_refcounted_page) + kvm_set_page_accessed(pfn_to_page(fault->pfn)); return r; } #endif diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index d39af5639ce9..55790085884f 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -240,6 +240,7 @@ struct kvm_page_fault { kvm_pfn_t pfn; hva_t hva; bool map_writable; + bool is_refcounted_page; /* * Indicates the guest is trying to write a gfn that contains one or diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 0662e0278e70..3284e7bd9619 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -829,7 +829,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault out_unlock: write_unlock(&vcpu->kvm->mmu_lock); - kvm_release_pfn_clean(fault->pfn); + if (fault->is_refcounted_page) + kvm_set_page_accessed(pfn_to_page(fault->pfn)); return r; } @@ -883,7 +884,7 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, */ static int FNAME(sync_spte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, int i) { - bool host_writable; + bool host_writable, is_refcounted; gpa_t first_pte_gpa; u64 *sptep, spte; struct kvm_memory_slot *slot; @@ -940,10 +941,12 @@ static int FNAME(sync_spte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, int sptep = &sp->spt[i]; spte = *sptep; host_writable = spte & shadow_host_writable_mask; + // TODO: is this correct? 
+ is_refcounted = spte & SPTE_MMU_PAGE_REFCOUNTED; slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); make_spte(vcpu, sp, slot, pte_access, gfn, spte_to_pfn(spte), spte, true, false, - host_writable, &spte); + host_writable, is_refcounted, &spte); return mmu_spte_update(sptep, spte); } diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c index cf2c6426a6fc..46c681dc45e6 100644 --- a/arch/x86/kvm/mmu/spte.c +++ b/arch/x86/kvm/mmu/spte.c @@ -138,7 +138,7 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, const struct kvm_memory_slot *slot, unsigned int pte_access, gfn_t gfn, kvm_pfn_t pfn, u64 old_spte, bool prefetch, bool can_unsync, - bool host_writable, u64 *new_spte) + bool host_writable, bool is_refcounted, u64 *new_spte) { int level = sp->role.level; u64 spte = SPTE_MMU_PRESENT_MASK; @@ -188,6 +188,8 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, if (level > PG_LEVEL_4K) spte |= PT_PAGE_SIZE_MASK; + else if (is_refcounted) + spte |= SPTE_MMU_PAGE_REFCOUNTED; if (shadow_memtype_mask) spte |= static_call(kvm_x86_get_mt_mask)(vcpu, gfn, diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h index 1279db2eab44..be93dd061ae3 100644 --- a/arch/x86/kvm/mmu/spte.h +++ b/arch/x86/kvm/mmu/spte.h @@ -95,6 +95,11 @@ static_assert(!(EPT_SPTE_MMU_WRITABLE & SHADOW_ACC_TRACK_SAVED_MASK)); /* Defined only to keep the above static asserts readable. */ #undef SHADOW_ACC_TRACK_SAVED_MASK +/* + * Indicates that the SPTE refers to a page with a valid refcount. + */ +#define SPTE_MMU_PAGE_REFCOUNTED BIT_ULL(59) + /* * Due to limited space in PTEs, the MMIO generation is a 19 bit subset of * the memslots generation and is derived as follows: @@ -332,6 +337,11 @@ static inline bool is_dirty_spte(u64 spte) return dirty_mask ? spte & dirty_mask : spte & PT_WRITABLE_MASK; } +static inline bool is_refcounted_page_pte(u64 spte) +{ + return spte & SPTE_MMU_PAGE_REFCOUNTED; +} + static inline u64 get_rsvd_bits(struct rsvd_bits_validate *rsvd_check, u64 pte, int level) { @@ -462,7 +472,7 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, const struct kvm_memory_slot *slot, unsigned int pte_access, gfn_t gfn, kvm_pfn_t pfn, u64 old_spte, bool prefetch, bool can_unsync, - bool host_writable, u64 *new_spte); + bool host_writable, bool is_refcounted, u64 *new_spte); u64 make_huge_page_split_spte(struct kvm *kvm, u64 huge_spte, union kvm_mmu_page_role role, int index); u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled); diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 512163d52194..a9b1b14d2e26 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -474,6 +474,7 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, bool was_leaf = was_present && is_last_spte(old_spte, level); bool is_leaf = is_present && is_last_spte(new_spte, level); bool pfn_changed = spte_to_pfn(old_spte) != spte_to_pfn(new_spte); + bool is_refcounted = is_refcounted_page_pte(old_spte); WARN_ON(level > PT64_ROOT_MAX_LEVEL); WARN_ON(level < PG_LEVEL_4K); @@ -538,9 +539,9 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, if (is_leaf != was_leaf) kvm_update_page_stats(kvm, level, is_leaf ? 
1 : -1); - if (was_leaf && is_dirty_spte(old_spte) && + if (was_leaf && is_dirty_spte(old_spte) && is_refcounted && (!is_present || !is_dirty_spte(new_spte) || pfn_changed)) - kvm_set_pfn_dirty(spte_to_pfn(old_spte)); + kvm_set_page_dirty(pfn_to_page(spte_to_pfn(old_spte))); /* * Recursively handle child PTs if the change removed a subtree from @@ -552,9 +553,9 @@ static void handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, (is_leaf || !is_present || WARN_ON_ONCE(pfn_changed))) handle_removed_pt(kvm, spte_to_child_pt(old_spte, level), shared); - if (was_leaf && is_accessed_spte(old_spte) && + if (was_leaf && is_accessed_spte(old_spte) && is_refcounted && (!is_present || !is_accessed_spte(new_spte) || pfn_changed)) - kvm_set_pfn_accessed(spte_to_pfn(old_spte)); + kvm_set_page_accessed(pfn_to_page(spte_to_pfn(old_spte))); } /* @@ -988,8 +989,9 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL); else wrprot = make_spte(vcpu, sp, fault->slot, ACC_ALL, iter->gfn, - fault->pfn, iter->old_spte, fault->prefetch, true, - fault->map_writable, &new_spte); + fault->pfn, iter->old_spte, fault->prefetch, true, + fault->map_writable, fault->is_refcounted_page, + &new_spte); if (new_spte == iter->old_spte) ret = RET_PF_SPURIOUS; @@ -1205,8 +1207,9 @@ static bool age_gfn_range(struct kvm *kvm, struct tdp_iter *iter, * Capture the dirty status of the page, so that it doesn't get * lost when the SPTE is marked for access tracking. */ - if (is_writable_pte(iter->old_spte)) - kvm_set_pfn_dirty(spte_to_pfn(iter->old_spte)); + if (is_writable_pte(iter->old_spte) && + is_refcounted_page_pte(iter->old_spte)) + kvm_set_page_dirty(pfn_to_page(spte_to_pfn(iter->old_spte))); new_spte = mark_spte_for_access_track(iter->old_spte); iter->old_spte = kvm_tdp_mmu_write_spte(iter->sptep, @@ -1626,7 +1629,8 @@ static void clear_dirty_pt_masked(struct kvm *kvm, struct kvm_mmu_page *root, trace_kvm_tdp_mmu_spte_changed(iter.as_id, iter.gfn, iter.level, iter.old_spte, iter.old_spte & ~dbit); - kvm_set_pfn_dirty(spte_to_pfn(iter.old_spte)); + if (is_refcounted_page_pte(iter.old_spte)) + kvm_set_page_dirty(pfn_to_page(spte_to_pfn(iter.old_spte))); } rcu_read_unlock();