
[mm-unstable,v1,2/5] kvm/x86: add kvm_arch_test_clear_young()

Message ID 20230217041230.2417228-3-yuzhao@google.com (mailing list archive)
State Handled Elsewhere
Series mm/kvm: lockless accessed bit harvest

Commit Message

Yu Zhao Feb. 17, 2023, 4:12 a.m. UTC
This patch adds kvm_arch_test_clear_young() for the vast majority of
VMs that are not nested and run on hardware that sets the accessed bit
in TDP MMU page tables.

It relies on two techniques, RCU and cmpxchg, to safely test and clear
the accessed bit without taking the MMU lock. The former protects KVM
page tables from being freed while the latter clears the accessed bit
atomically against both the hardware and other software page table
walkers.

Signed-off-by: Yu Zhao <yuzhao@google.com>
---
 arch/x86/include/asm/kvm_host.h | 27 ++++++++++++++++++++++
 arch/x86/kvm/mmu/spte.h         | 12 ----------
 arch/x86/kvm/mmu/tdp_mmu.c      | 41 +++++++++++++++++++++++++++++++++
 3 files changed, 68 insertions(+), 12 deletions(-)
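
In essence, the lockless flow described above reduces to the following
pattern (a sketch using names from the patch at the bottom of this page,
not the literal diff):

	rcu_read_lock();	/* keeps the page table page alive */
	old_spte = READ_ONCE(*sptep);
	new_spte = old_spte & ~shadow_accessed_mask;
	if (new_spte != old_spte)
		/* atomic vs. hardware and other software walkers;
		 * fails harmlessly if the SPTE changed under us */
		cmpxchg64(sptep, old_spte, new_spte);
	rcu_read_unlock();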

Comments

Yu Zhao Feb. 17, 2023, 4:19 a.m. UTC | #1
On Thu, Feb 16, 2023 at 9:12 PM Yu Zhao <yuzhao@google.com> wrote:
>
> This patch adds kvm_arch_test_clear_young() for the vast majority of
> VMs that are not nested and run on hardware that sets the accessed bit
> in TDP MMU page tables.
>
> It relies on two techniques, RCU and cmpxchg, to safely test and clear
> the accessed bit without taking the MMU lock. The former protects KVM
> page tables from being freed while the latter clears the accessed bit
> atomically against both the hardware and other software page table
> walkers.
>
> Signed-off-by: Yu Zhao <yuzhao@google.com>
> ---
>  arch/x86/include/asm/kvm_host.h | 27 ++++++++++++++++++++++
>  arch/x86/kvm/mmu/spte.h         | 12 ----------
>  arch/x86/kvm/mmu/tdp_mmu.c      | 41 +++++++++++++++++++++++++++++++++
>  3 files changed, 68 insertions(+), 12 deletions(-)

Adding Sean and David.

Can you please add other interested parties that I've missed?

Thanks.
Sean Christopherson Feb. 17, 2023, 4:27 p.m. UTC | #2
On Thu, Feb 16, 2023, Yu Zhao wrote:
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 6aaae18f1854..d2995c9e8f07 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -1367,6 +1367,12 @@ struct kvm_arch {
>  	 *	the MMU lock in read mode + the tdp_mmu_pages_lock or
>  	 *	the MMU lock in write mode
>  	 *
> +	 * kvm_arch_test_clear_young() is a special case. It relies on two

No, it's not.  The TDP MMU already employs RCU and CMPXCHG.  Just drop the
entire comment.

> +	 * techniques, RCU and cmpxchg, to safely test and clear the accessed
> +	 * bit without taking the MMU lock. The former protects KVM page tables
> +	 * from being freed while the latter clears the accessed bit atomically
> +	 * against both the hardware and other software page table walkers.
> +	 *
>  	 * Roots will remain in the list until their tdp_mmu_root_count
>  	 * drops to zero, at which point the thread that decremented the
>  	 * count to zero should removed the root from the list and clean
> @@ -2171,4 +2177,25 @@ int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages);
>  	 KVM_X86_QUIRK_FIX_HYPERCALL_INSN |	\
>  	 KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS)
>  
> +extern u64 __read_mostly shadow_accessed_mask;
> +
> +/*
> + * Returns true if A/D bits are supported in hardware and are enabled by KVM.
> + * When enabled, KVM uses A/D bits for all non-nested MMUs.  Because L1 can
> + * disable A/D bits in EPTP12, SP and SPTE variants are needed to handle the
> + * scenario where KVM is using A/D bits for L1, but not L2.
> + */
> +static inline bool kvm_ad_enabled(void)
> +{
> +	return shadow_accessed_mask;
> +}

Absolutely not.  This information is not getting directly exposed outside of KVM.

> +
> +/* see the comments on the generic kvm_arch_has_test_clear_young() */
> +#define kvm_arch_has_test_clear_young kvm_arch_has_test_clear_young
> +static inline bool kvm_arch_has_test_clear_young(void)
> +{
> +	return IS_ENABLED(CONFIG_KVM) && IS_ENABLED(CONFIG_X86_64) &&
> +	       (!IS_REACHABLE(CONFIG_KVM) || (kvm_ad_enabled() && tdp_enabled));
> +}

Pending the justification for why this is KVM-only, I would strongly prefer we
find a way to have the mmu_notifier framework track whether or not any listeners
have a test_clear_young().  E.g. have KVM nullify its hook during module load.
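
One possible shape of that suggestion, as a hypothetical sketch (none of
these names are in the posted series):

	static struct mmu_notifier_ops kvm_mmu_notifier_ops = {
		.test_clear_young = kvm_mmu_notifier_test_clear_young,
		/* ... other hooks ... */
	};

	/* At module load: drop the hook when the lockless path can't be
	 * used, so the mmu_notifier core can skip this listener. */
	if (!(kvm_ad_enabled() && tdp_enabled))
		kvm_mmu_notifier_ops.test_clear_young = NULL;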

> +
>  #endif /* _ASM_X86_KVM_HOST_H */
> diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h
> index 6f54dc9409c9..0dc7fed1f3fd 100644
> --- a/arch/x86/kvm/mmu/spte.h
> +++ b/arch/x86/kvm/mmu/spte.h
> @@ -153,7 +153,6 @@ extern u64 __read_mostly shadow_mmu_writable_mask;
>  extern u64 __read_mostly shadow_nx_mask;
>  extern u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */
>  extern u64 __read_mostly shadow_user_mask;
> -extern u64 __read_mostly shadow_accessed_mask;
>  extern u64 __read_mostly shadow_dirty_mask;
>  extern u64 __read_mostly shadow_mmio_value;
>  extern u64 __read_mostly shadow_mmio_mask;
> @@ -247,17 +246,6 @@ static inline bool is_shadow_present_pte(u64 pte)
>  	return !!(pte & SPTE_MMU_PRESENT_MASK);
>  }
>  
> -/*
> - * Returns true if A/D bits are supported in hardware and are enabled by KVM.
> - * When enabled, KVM uses A/D bits for all non-nested MMUs.  Because L1 can
> - * disable A/D bits in EPTP12, SP and SPTE variants are needed to handle the
> - * scenario where KVM is using A/D bits for L1, but not L2.
> - */
> -static inline bool kvm_ad_enabled(void)
> -{
> -	return !!shadow_accessed_mask;
> -}

As Oliver said in the ARM patch, _if_ this is justified, please do code movement
in a separate patch.

> -
>  static inline bool sp_ad_disabled(struct kvm_mmu_page *sp)
>  {
>  	return sp->role.ad_disabled;
> diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> index d6df38d371a0..9028e09f1aab 100644
> --- a/arch/x86/kvm/mmu/tdp_mmu.c
> +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> @@ -1309,6 +1309,47 @@ bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
>  	return kvm_tdp_mmu_handle_gfn(kvm, range, age_gfn_range);
>  }
>  
> +bool kvm_arch_test_clear_young(struct kvm *kvm, struct kvm_gfn_range *range,
> +			       gfn_t lsb_gfn, unsigned long *bitmap)
> +{
> +	struct kvm_mmu_page *root;
> +
> +	if (WARN_ON_ONCE(!kvm_arch_has_test_clear_young()))
> +		return false;
> +
> +	if (kvm_memslots_have_rmaps(kvm))

This completely disables the API on VMs that have _ever_ run a nested VM.  I doubt
that's the intended behavior.

> +		return false;
> +
> +	/* see the comments on kvm_arch->tdp_mmu_roots */
> +	rcu_read_lock();
> +
> +	list_for_each_entry_rcu(root, &kvm->arch.tdp_mmu_roots, link) {
> +		struct tdp_iter iter;
> +
> +		if (kvm_mmu_page_as_id(root) != range->slot->as_id)
> +			continue;

for_each_tdp_mmu_root() does this for you.

> +
> +		tdp_root_for_each_leaf_pte(iter, root, range->start, range->end) {
> +			u64 *sptep = rcu_dereference(iter.sptep);

kvm_tdp_mmu_read_spte(), though it's not clear to me why this doesn't test+clear
the SPTE's accessed bit and then toggle the bitmap.

> +			u64 new_spte = iter.old_spte & ~shadow_accessed_mask;
> +
> +			VM_WARN_ON_ONCE(!page_count(virt_to_page(sptep)));

This doesn't do what I assume it's intended to do.  The sptep points at a KVM,
a.k.a. kernel, allocated page, not at guest memory.  Assuming the intent is to
assert that the memory being aged has an elevated refcount, this would need to
extract the pfn out of the SPTE and get the struct page for that.  But that's
completely unsafe because KVM supports mapping VM_PFNMAP and VM_IO memory into
the guest.  Maybe the proposed caller only operates on struct page memory, but
I am not willing to make that assumption in KVM.

TL;DR: drop this.

> +			VM_WARN_ON_ONCE(iter.gfn < range->start || iter.gfn >= range->end);

This adds no value, KVM is completely hosed if tdp_root_for_each_leaf_pte() botches
the ranges.

> +
> +			if (new_spte == iter.old_spte)
> +				continue;
> +
> +			/* see the comments on the generic kvm_arch_has_test_clear_young() */

No, "see xyz" for unintuitive logic is not acceptable.  Add a helper and document
the logic there, don't splatter "see XYZ" comments everywhere.

> +			if (__test_and_change_bit(lsb_gfn - iter.gfn, bitmap))
> +				cmpxchg64(sptep, iter.old_spte, new_spte);

Clearing a single bit doesn't need a CMPXCHG.  Please weigh in on a relevant series
that is modifying the aging flows[*], I want to have exactly one helper for aging
TDP MMU SPTEs.

[*] https://lore.kernel.org/all/20230211014626.3659152-5-vipinsh@google.com
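
For reference, a single-bit clear along those lines might be (a sketch;
it assumes shadow_accessed_mask is a single bit, which holds for x86-64
TDP -- see the linked series for the real helper):

	clear_bit(ffs(shadow_accessed_mask) - 1, (unsigned long *)sptep);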
Yu Zhao Feb. 23, 2023, 5:58 a.m. UTC | #3
On Fri, Feb 17, 2023 at 9:27 AM Sean Christopherson <seanjc@google.com> wrote:
>
> On Thu, Feb 16, 2023, Yu Zhao wrote:
> > diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> > index 6aaae18f1854..d2995c9e8f07 100644
> > --- a/arch/x86/include/asm/kvm_host.h
> > +++ b/arch/x86/include/asm/kvm_host.h
> > @@ -1367,6 +1367,12 @@ struct kvm_arch {
> >        *      the MMU lock in read mode + the tdp_mmu_pages_lock or
> >        *      the MMU lock in write mode
> >        *
> > +      * kvm_arch_test_clear_young() is a special case. It relies on two
>
> No, it's not.  The TDP MMU already employs RCU and CMPXCHG.

It is -- you read it out of context :)

         * For reads, this list is protected by:
         *      the MMU lock in read mode + RCU or
         *      the MMU lock in write mode
         *
         * For writes, this list is protected by:
         *      the MMU lock in read mode + the tdp_mmu_pages_lock or
         *      the MMU lock in write mode
         *
         * kvm_arch_test_clear_young() is a special case.
         ...

        struct list_head tdp_mmu_roots;

> Just drop the
> entire comment.

Let me move it into kvm_arch_test_clear_young().

Also I want to be clear:
1. We can't just focus on here and now; we need to consider the distant future.
2. From my POV, "see the comments on ..." is like the index of a book.

> > +      * techniques, RCU and cmpxchg, to safely test and clear the accessed
> > +      * bit without taking the MMU lock. The former protects KVM page tables
> > +      * from being freed while the latter clears the accessed bit atomically
> > +      * against both the hardware and other software page table walkers.
> > +      *
> >        * Roots will remain in the list until their tdp_mmu_root_count
> >        * drops to zero, at which point the thread that decremented the
> >        * count to zero should removed the root from the list and clean
> > @@ -2171,4 +2177,25 @@ int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages);
> >        KVM_X86_QUIRK_FIX_HYPERCALL_INSN |     \
> >        KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS)
> >
> > +extern u64 __read_mostly shadow_accessed_mask;
> > +
> > +/*
> > + * Returns true if A/D bits are supported in hardware and are enabled by KVM.
> > + * When enabled, KVM uses A/D bits for all non-nested MMUs.  Because L1 can
> > + * disable A/D bits in EPTP12, SP and SPTE variants are needed to handle the
> > + * scenario where KVM is using A/D bits for L1, but not L2.
> > + */
> > +static inline bool kvm_ad_enabled(void)
> > +{
> > +     return shadow_accessed_mask;
> > +}
>
> Absolutely not.  This information is not getting directly exposed outside of KVM.

Will do.

> > +
> > +/* see the comments on the generic kvm_arch_has_test_clear_young() */
> > +#define kvm_arch_has_test_clear_young kvm_arch_has_test_clear_young
> > +static inline bool kvm_arch_has_test_clear_young(void)
> > +{
> > +     return IS_ENABLED(CONFIG_KVM) && IS_ENABLED(CONFIG_X86_64) &&
> > +            (!IS_REACHABLE(CONFIG_KVM) || (kvm_ad_enabled() && tdp_enabled));
> > +}
>
> Pending the justification for why this is KVM-only

Nothing else has *_young()... IOW, KVM is the only user of *_young().

> I would strongly prefer we
> find a way to have the mmu_notifier framework track whether or not any listeners
> have a test_clear_young().  E.g. have KVM nullify its hook during module load.

It's already done that way. This function is just for the caller to
avoid unnecessary trips into the MMU notifier on archs that don't
support this capability. (The caller would find it unsupported.)

> > +
> >  #endif /* _ASM_X86_KVM_HOST_H */
> > diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h
> > index 6f54dc9409c9..0dc7fed1f3fd 100644
> > --- a/arch/x86/kvm/mmu/spte.h
> > +++ b/arch/x86/kvm/mmu/spte.h
> > @@ -153,7 +153,6 @@ extern u64 __read_mostly shadow_mmu_writable_mask;
> >  extern u64 __read_mostly shadow_nx_mask;
> >  extern u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */
> >  extern u64 __read_mostly shadow_user_mask;
> > -extern u64 __read_mostly shadow_accessed_mask;
> >  extern u64 __read_mostly shadow_dirty_mask;
> >  extern u64 __read_mostly shadow_mmio_value;
> >  extern u64 __read_mostly shadow_mmio_mask;
> > @@ -247,17 +246,6 @@ static inline bool is_shadow_present_pte(u64 pte)
> >       return !!(pte & SPTE_MMU_PRESENT_MASK);
> >  }
> >
> > -/*
> > - * Returns true if A/D bits are supported in hardware and are enabled by KVM.
> > - * When enabled, KVM uses A/D bits for all non-nested MMUs.  Because L1 can
> > - * disable A/D bits in EPTP12, SP and SPTE variants are needed to handle the
> > - * scenario where KVM is using A/D bits for L1, but not L2.
> > - */
> > -static inline bool kvm_ad_enabled(void)
> > -{
> > -     return !!shadow_accessed_mask;
> > -}
>
> As Oliver said in the ARM patch, _if_ this is justified, please do code movement
> in a separate patch.

I'll just drop this.

> >  static inline bool sp_ad_disabled(struct kvm_mmu_page *sp)
> >  {
> >       return sp->role.ad_disabled;
> > diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> > index d6df38d371a0..9028e09f1aab 100644
> > --- a/arch/x86/kvm/mmu/tdp_mmu.c
> > +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> > @@ -1309,6 +1309,47 @@ bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
> >       return kvm_tdp_mmu_handle_gfn(kvm, range, age_gfn_range);
> >  }
> >
> > +bool kvm_arch_test_clear_young(struct kvm *kvm, struct kvm_gfn_range *range,
> > +                            gfn_t lsb_gfn, unsigned long *bitmap)
> > +{
> > +     struct kvm_mmu_page *root;
> > +
> > +     if (WARN_ON_ONCE(!kvm_arch_has_test_clear_young()))
> > +             return false;
> > +
> > +     if (kvm_memslots_have_rmaps(kvm))
>
> This completely disables the API on VMs that have _ever_ run a nested VM.

Ok, we definitely don't want this.

> I doubt
> that's the intended behavior.

Good catch, thanks.

> > +             return false;
> > +
> > +     /* see the comments on kvm_arch->tdp_mmu_roots */
> > +     rcu_read_lock();
> > +
> > +     list_for_each_entry_rcu(root, &kvm->arch.tdp_mmu_roots, link) {
> > +             struct tdp_iter iter;
> > +
> > +             if (kvm_mmu_page_as_id(root) != range->slot->as_id)
> > +                     continue;
>
> for_each_tdp_mmu_root() does this for you.
>
> > +
> > +             tdp_root_for_each_leaf_pte(iter, root, range->start, range->end) {
> > +                     u64 *sptep = rcu_dereference(iter.sptep);
>
> kvm_tdp_mmu_read_spte(), though it's not clear to me why this doesn't test+clear
> the SPTE's accessed bit and then toggle the bitmap.
>
> > +                     u64 new_spte = iter.old_spte & ~shadow_accessed_mask;
> > +
> > +                     VM_WARN_ON_ONCE(!page_count(virt_to_page(sptep)));
>
> This doesn't do what I assume it's intended to do.

This asserts the page table page is not freed, i.e., the memory we are
modifying still belongs to someone.

If we forget to do RCU free, i.e., we free immediately, this can
catch it, assuming no reuse.

> The sptep points at a KVM,
> a.k.a. kernel, allocated page, not at guest memory.  Assuming the intent is to
> assert that the memory being aged has an elevated refcount, this would need to
> extract the pfn out of the SPTE and get the struct page for that.  But that's
> completely unsafe because KVM supports mapping VM_PFNMAP and VM_IO memory into
> the guest.  Maybe the proposed caller only operates on struct page memory, but
> I am not willing to make that assumption in KVM.
>
> TL;DR: drop this.
>
> > +                     VM_WARN_ON_ONCE(iter.gfn < range->start || iter.gfn >= range->end);
>
> This adds no value, KVM is completely hosed if tdp_root_for_each_leaf_pte() botches
> the ranges.

Ok.

> > +
> > +                     if (new_spte == iter.old_spte)
> > +                             continue;
> > +
> > +                     /* see the comments on the generic kvm_arch_has_test_clear_young() */
>
> No, "see xyz" for unintuitive logic is not acceptable.  Add a helper and document
> the logic there, don't splatter "see XYZ" comments everywhere.
>
> > +                     if (__test_and_change_bit(lsb_gfn - iter.gfn, bitmap))
> > +                             cmpxchg64(sptep, iter.old_spte, new_spte);
>
> Clearing a single bit doesn't need a CMPXCHG.  Please weigh in on a relevant series
> that is modifying the aging flows[*], I want to have exactly one helper for aging
> TDP MMU SPTEs.
>
> [*] https://lore.kernel.org/all/20230211014626.3659152-5-vipinsh@google.com

I'll take a look at that series. clear_bit() probably won't cause any
practical damage but is technically wrong because, for example, it can
end up clearing the A-bit in a non-leaf PMD. (cmpxchg will just fail
in this case, obviously.)
Sean Christopherson Feb. 23, 2023, 5:09 p.m. UTC | #4
On Wed, Feb 22, 2023, Yu Zhao wrote:
> On Fri, Feb 17, 2023 at 9:27 AM Sean Christopherson <seanjc@google.com> wrote:
> >
> > On Thu, Feb 16, 2023, Yu Zhao wrote:
> > > diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> > > index 6aaae18f1854..d2995c9e8f07 100644
> > > --- a/arch/x86/include/asm/kvm_host.h
> > > +++ b/arch/x86/include/asm/kvm_host.h
> > > @@ -1367,6 +1367,12 @@ struct kvm_arch {
> > >        *      the MMU lock in read mode + the tdp_mmu_pages_lock or
> > >        *      the MMU lock in write mode
> > >        *
> > > +      * kvm_arch_test_clear_young() is a special case. It relies on two
> >
> > No, it's not.  The TDP MMU already employs RCU and CMPXCHG.
> 
> It is -- you read it out of context :)

Ah, the special case is that it's fully lockless.  That's still not all that
special, e.g. see kvm_tdp_mmu_walk_lockless_{begin,end}().

>          * For reads, this list is protected by:
>          *      the MMU lock in read mode + RCU or
>          *      the MMU lock in write mode
>          *
>          * For writes, this list is protected by:
>          *      the MMU lock in read mode + the tdp_mmu_pages_lock or
>          *      the MMU lock in write mode
>          *
>          * kvm_arch_test_clear_young() is a special case.
>          ...
> 
>         struct list_head tdp_mmu_roots;
> 
> > Just drop the
> > entire comment.
> 
> Let me move it into kvm_arch_test_clear_young().

No, I do not want kvm_arch_test_clear_young(), or any other one-off function, to
be "special".  I love the idea of a lockless walk, but I want it to be a formal,
documented way to walk TDP MMU roots.  I.e. add a macro to go with for_each_tdp_mmu_root()
and the yield-safe variants.

/* blah blah blah */
#define for_each_tdp_mmu_root_lockless(_kvm, _root, _as_id)		\
	list_for_each_entry_rcu(_root, &(_kvm)->arch.tdp_mmu_roots, link)	\
		if (!refcount_read(&(_root)->tdp_mmu_root_count) ||	\
		    kvm_mmu_page_as_id(_root) != _as_id) {		\
		} else
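
A usage sketch for such a macro (hypothetical; assumes the final version
skips invalid and mismatched roots as above):

	for_each_tdp_mmu_root_lockless(kvm, root, range->slot->as_id) {
		struct tdp_iter iter;

		tdp_root_for_each_leaf_pte(iter, root, range->start, range->end)
			/* ... lockless aging of iter.old_spte ... */;
	}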

> Also I want to be clear:
> 1. We can't just focus on here and now; we need to consider the distant future.

I 100% agree, but those words need to be backed up by actions.  This series is
littered with code that is not maintainable long term, e.g. open coding stuff
that belongs in helpers and/or for which KVM already provides helpers, copy-pasting
__kvm_handle_hva_range() instead of extending it to have a lockless option, poking
directly into KVM from mm/ code, etc.

I apologize for being so blunt.  My intent isn't to be rude/snarky, it's to set
very clear expectations for getting any of these changes merged.  I absolutely do
want to land improvements to KVM's test+clear young flows, but it needs to be done
in a way that is maintainable and doesn't saddle KVM with more tech debt.

> 2. From my POV, "see the comments on ..." is like the index of a book.

And my _very_ strong preference is to provide the "index" via code, not comments.

> > Clearing a single bit doesn't need a CMPXCHG.  Please weigh in on a relevant series
> > that is modifying the aging flows[*], I want to have exactly one helper for aging
> > TDP MMU SPTEs.
> >
> > [*] https://lore.kernel.org/all/20230211014626.3659152-5-vipinsh@google.com
> 
> I'll take a look at that series. clear_bit() probably won't cause any
> practical damage but is technically wrong because, for example, it can
> end up clearing the A-bit in a non-leaf PMD. (cmpxchg will just fail
> in this case, obviously.)

Eh, not really.  By that argument, clearing an A-bit in a huge PTE is also technically
wrong because the target gfn may or may not have been accessed.  The only way for
KVM to clear an A-bit in a non-leaf entry is if the entry _was_ a huge PTE, but was
replaced between the "is leaf" and the clear_bit().
Yu Zhao Feb. 23, 2023, 5:27 p.m. UTC | #5
On Thu, Feb 23, 2023 at 10:09 AM Sean Christopherson <seanjc@google.com> wrote:
>
> On Wed, Feb 22, 2023, Yu Zhao wrote:
> > On Fri, Feb 17, 2023 at 9:27 AM Sean Christopherson <seanjc@google.com> wrote:
> > >
> > > On Thu, Feb 16, 2023, Yu Zhao wrote:
> > > > diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> > > > index 6aaae18f1854..d2995c9e8f07 100644
> > > > --- a/arch/x86/include/asm/kvm_host.h
> > > > +++ b/arch/x86/include/asm/kvm_host.h
> > > > @@ -1367,6 +1367,12 @@ struct kvm_arch {
> > > >        *      the MMU lock in read mode + the tdp_mmu_pages_lock or
> > > >        *      the MMU lock in write mode
> > > >        *
> > > > +      * kvm_arch_test_clear_young() is a special case. It relies on two
> > >
> > > No, it's not.  The TDP MMU already employs RCU and CMPXCHG.
> >
> > It is -- you read it out of context :)
>
> Ah, the special case is that it's fully lockless.  That's still not all that
> special, e.g. see kvm_tdp_mmu_walk_lockless_{begin,end}().
>
> >          * For reads, this list is protected by:
> >          *      the MMU lock in read mode + RCU or
> >          *      the MMU lock in write mode
> >          *
> >          * For writes, this list is protected by:
> >          *      the MMU lock in read mode + the tdp_mmu_pages_lock or
> >          *      the MMU lock in write mode
> >          *
> >          * kvm_arch_test_clear_young() is a special case.
> >          ...
> >
> >         struct list_head tdp_mmu_roots;
> >
> > > Just drop the
> > > entire comment.
> >
> > Let me move it into kvm_arch_test_clear_young().
>
> No, I do not want kvm_arch_test_clear_young(), or any other one-off function, to
> be "special".  I love the idea of a lockless walk, but I want it to be a formal,
> documented way to walk TDP MMU roots.  I.e. add a macro to go with for_each_tdp_mmu_root()
> and the yield-safe variants.

I see what you mean now. will do.

> /* blah blah blah */
> #define for_each_tdp_mmu_root_lockless(_kvm, _root, _as_id)             \
>         list_for_each_entry_rcu(_root, &(_kvm)->arch.tdp_mmu_roots, link)  \
>                 if (!refcount_read(&(_root)->tdp_mmu_root_count) ||     \
>                     kvm_mmu_page_as_id(_root) != _as_id) {              \
>                 } else
>
> > Also I want to be clear:
> > 1. We can't just focus on here and now; we need to consider the distant future.
>
> I 100% agree, but those words need to be backed up by actions.  This series is
> littered with code that is not maintainable long term, e.g. open coding stuff
> that belongs in helpers and/or for which KVM already provides helpers, copy-pasting
> __kvm_handle_hva_range() instead of extending it to have a lockless option, poking
> directly into KVM from mm/ code, etc.
>
> I apologize for being so blunt.  My intent isn't to be rude/snarky, it's to set
> very clear expectations for getting any of these changes merged.

No worries at all. I appreciate you directly telling me how you prefer
it to be done, and that makes the job easier for both of us. Please do
bear with me though, because I'm very unfamiliar with the KVM side of
expectations.

> I absolutely do
> want to land improvements to KVM's test+clear young flows, but it needs to be done
> in a way that is maintainable and doesn't saddle KVM with more tech debt.

Agreed.

> > 2. From my POV, "see the comments on ..." is like the index of a book.
>
> And my _very_ strong preference is to provide the "index" via code, not comments.

Will do.

> > > Clearing a single bit doesn't need a CMPXCHG.  Please weigh in on a relevant series
> > > that is modifying the aging flows[*], I want to have exactly one helper for aging
> > > TDP MMU SPTEs.
> > >
> > > [*] https://lore.kernel.org/all/20230211014626.3659152-5-vipinsh@google.com
> >
> > I'll take a look at that series. clear_bit() probably won't cause any
> > practical damage but is technically wrong because, for example, it can
> > end up clearing the A-bit in a non-leaf PMD. (cmpxchg will just fail
> > in this case, obviously.)
>
> Eh, not really.  By that argument, clearing an A-bit in a huge PTE is also technically
> wrong because the target gfn may or may not have been accessed.

Sorry, I don't understand. You mean clear_bit() on a huge PTE is
technically wrong? Yes, that's what I mean. (cmpxchg() on a huge PTE
is not.)

> The only way for
> KVM to clear an A-bit in a non-leaf entry is if the entry _was_ a huge PTE, but was
> replaced between the "is leaf" and the clear_bit().

I think there is a misunderstanding here. Let me be more specific:
1. Clearing the A-bit in a non-leaf entry is technically wrong because
that's not our intention.
2. When we try to clear_bit() on a leaf PMD, it can at the same time
become a non-leaf PMD, which causes 1) above, and therefore is
technically wrong.
3. I don't think 2) could do any real harm, so practically no problem.
4. cmpxchg() can avoid 2).

Does this make sense?

Thanks.
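
To illustrate point 2 above, a hypothetical interleaving (not code from
the series):

	old_spte = READ_ONCE(*sptep);	/* a leaf (huge) PMD */
	/* another CPU splits the huge page: *sptep is now non-leaf */
	clear_bit(ffs(shadow_accessed_mask) - 1, (unsigned long *)sptep);
	/* the A-bit is cleared in the non-leaf entry, whereas
	 * cmpxchg64(sptep, old_spte, new_spte) would simply fail */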
Sean Christopherson Feb. 23, 2023, 6:23 p.m. UTC | #6
On Thu, Feb 23, 2023, Yu Zhao wrote:
> On Thu, Feb 23, 2023 at 10:09 AM Sean Christopherson <seanjc@google.com> wrote:
> > > I'll take a look at that series. clear_bit() probably won't cause any
> > > practical damage but is technically wrong because, for example, it can
> > > end up clearing the A-bit in a non-leaf PMD. (cmpxchg will just fail
> > > in this case, obviously.)
> >
> > Eh, not really.  By that argument, clearing an A-bit in a huge PTE is also technically
> > wrong because the target gfn may or may not have been accessed.
> 
> Sorry, I don't understand. You mean clear_bit() on a huge PTE is
> technically wrong? Yes, that's what I mean. (cmpxchg() on a huge PTE
> is not.)
> 
> > The only way for
> > KVM to clear an A-bit in a non-leaf entry is if the entry _was_ a huge PTE, but was
> > replaced between the "is leaf" and the clear_bit().
> 
> I think there is a misunderstanding here. Let me be more specific:
> 1. Clearing the A-bit in a non-leaf entry is technically wrong because
> that's not our intention.
> 2. When we try to clear_bit() on a leaf PMD, it can at the same time
> become a non-leaf PMD, which causes 1) above, and therefore is
> technically wrong.
> 3. I don't think 2) could do any real harm, so practically no problem.
> 4. cmpxchg() can avoid 2).
> 
> Does this make sense?

I understand what you're saying, but clearing an A-bit on a non-leaf PMD that
_just_ got converted from a leaf PMD is "wrong" if and only if the intended
behavior is nonsensical.

Without an explicit granularity from the caller, the intent is to either (a) reap
A-bit on leaf PTEs, or (b) reap A-bit at the highest possible granularity.  (a) is
nonsensical because it provides zero guarantees to the caller as to the
granularity of the information.  Leaf vs. non-leaf matters for the life cycle of
page tables and guest accesses, e.g. KVM needs to zap _only_ leaf SPTEs when
handling an mmu_notifier invalidation, but when it comes to the granularity of the
A-bit, leaf vs. non-leaf has no meaning.  On KVM x86, a PMD covers 2MiB of GPAs
regardless of whether it's a leaf or non-leaf PMD.

If the intent is (b), then clearing the A-bit on a PMD a few cycles after the PMD
was converted from leaf to non-leaf is a pointless distinction, because it yields
the same end result as clearing the A-bit just a few cycles earlier, when the PMD
was a leaf.

Actually, if I'm reading patch 5 correctly, this is all much ado about nothing,
because the MGLRU code kicks in only for non-huge PTEs, and KVM cannot have
larger mappings than the primary MMU, i.e. should not encounter huge PTEs.

On that topic, if the assumption is that the bitmap is used only for non-huge PTEs,
then x86's kvm_arch_test_clear_young() needs to be hardened to process only 4KiB
PTEs, and probably to WARN if a huge PTE is encountered.  That assumption should
also be documented.

If that assumption is incorrect, then kvm_arch_test_clear_young() is broken and/or
the expected behavior of the bitmap isn't fully defined.
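
A sketch of that hardening (hypothetical, not from the series; PG_LEVEL_4K
is KVM's 4KiB page-table level):

	tdp_root_for_each_leaf_pte(iter, root, range->start, range->end) {
		if (WARN_ON_ONCE(iter.level > PG_LEVEL_4K))
			continue;
		/* ... test and clear the A-bit as before ... */
	}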
Yu Zhao Feb. 23, 2023, 6:34 p.m. UTC | #7
On Thu, Feb 23, 2023 at 11:24 AM Sean Christopherson <seanjc@google.com> wrote:
>
> On Thu, Feb 23, 2023, Yu Zhao wrote:
> > On Thu, Feb 23, 2023 at 10:09 AM Sean Christopherson <seanjc@google.com> wrote:
> > > > I'll take a look at that series. clear_bit() probably won't cause any
> > > > practical damage but is technically wrong because, for example, it can
> > > > end up clearing the A-bit in a non-leaf PMD. (cmpxchg will just fail
> > > > in this case, obviously.)
> > >
> > > Eh, not really.  By that argument, clearing an A-bit in a huge PTE is also technically
> > > wrong because the target gfn may or may not have been accessed.
> >
> > Sorry, I don't understand. You mean clear_bit() on a huge PTE is
> > technically wrong? Yes, that's what I mean. (cmpxchg() on a huge PTE
> > is not.)
> >
> > > The only way for
> > > KVM to clear an A-bit in a non-leaf entry is if the entry _was_ a huge PTE, but was
> > > replaced between the "is leaf" and the clear_bit().
> >
> > I think there is a misunderstanding here. Let me be more specific:
> > 1. Clearing the A-bit in a non-leaf entry is technically wrong because
> > that's not our intention.
> > 2. When we try to clear_bit() on a leaf PMD, it can at the same time
> > become a non-leaf PMD, which causes 1) above, and therefore is
> > technically wrong.
> > 3. I don't think 2) could do any real harm, so practically no problem.
> > 4. cmpxchg() can avoid 2).
> >
> > Does this make sense?
>
> I understand what you're saying, but clearing an A-bit on a non-leaf PMD that
> _just_ got converted from a leaf PMD is "wrong" if and only if the intended
> behavior is nonsensical.

Sorry, let me rephrase:
1. Clearing the A-bit in a non-leaf entry is technically wrong because
we didn't make sure there is the A-bit there --  the bit we are
clearing can be something else. (Yes, we know it's not, but we didn't
define this behavior, e.g., a macro to designate that bit for non-leaf
entries. Also I didn't check the spec -- does EPT actually support the
A-bit in non-leaf entries? My guess is that NPT does.)
Sean Christopherson Feb. 23, 2023, 6:47 p.m. UTC | #8
On Thu, Feb 23, 2023, Yu Zhao wrote:
> On Thu, Feb 23, 2023 at 11:24 AM Sean Christopherson <seanjc@google.com> wrote:
> >
> > On Thu, Feb 23, 2023, Yu Zhao wrote:
> > > On Thu, Feb 23, 2023 at 10:09 AM Sean Christopherson <seanjc@google.com> wrote:
> > > > > I'll take a look at that series. clear_bit() probably won't cause any
> > > > > practical damage but is technically wrong because, for example, it can
> > > > > end up clearing the A-bit in a non-leaf PMD. (cmpxchg will just fail
> > > > > in this case, obviously.)
> > > >
> > > > Eh, not really.  By that argument, clearing an A-bit in a huge PTE is also technically
> > > > wrong because the target gfn may or may not have been accessed.
> > >
> > > Sorry, I don't understand. You mean clear_bit() on a huge PTE is
> > > technically wrong? Yes, that's what I mean. (cmpxchg() on a huge PTE
> > > is not.)
> > >
> > > > The only way for
> > > > KVM to clear an A-bit in a non-leaf entry is if the entry _was_ a huge PTE, but was
> > > > replaced between the "is leaf" and the clear_bit().
> > >
> > > I think there is a misunderstanding here. Let me be more specific:
> > > 1. Clearing the A-bit in a non-leaf entry is technically wrong because
> > > that's not our intention.
> > > 2. When we try to clear_bit() on a leaf PMD, it can at the same time
> > > become a non-leaf PMD, which causes 1) above, and therefore is
> > > technically wrong.
> > > 3. I don't think 2) could do any real harm, so practically no problem.
> > > 4. cmpxchg() can avoid 2).
> > >
> > > Does this make sense?
> >
> > I understand what you're saying, but clearing an A-bit on a non-leaf PMD that
> > _just_ got converted from a leaf PMD is "wrong" if and only if the intended
> > behavior is nonsensical.
> 
> Sorry, let me rephrase:
> 1. Clearing the A-bit in a non-leaf entry is technically wrong because
> we didn't make sure there is the A-bit there --  the bit we are
> clearing can be something else. (Yes, we know it's not, but we didn't
> define this behavior, e.g., a macro to designate that bit for non-leaf
> entries.

Heh, by that definition, anything and everything is "technically wrong".  An Intel
CPU might support SVM, even though we know no such CPUs exist, so requiring AMD or
Hygon to enable SVM is technically wrong.

> Also I didn't check the spec -- does EPT actually support the
> A-bit in non-leaf entries? My guess is that NPT does.)

If A/D bits are enabled, both EPT and 64-bit NPT support the Accessed bit at all
levels irrespective of whether or not the entry maps a huge page.

PAE NPT is a different story, but the TDP MMU is limited to 64-bit kernels, i.e.
requires 64-bit NPT.
Yu Zhao Feb. 23, 2023, 7:02 p.m. UTC | #9
On Thu, Feb 23, 2023 at 11:47 AM Sean Christopherson <seanjc@google.com> wrote:
>
> On Thu, Feb 23, 2023, Yu Zhao wrote:
> > On Thu, Feb 23, 2023 at 11:24 AM Sean Christopherson <seanjc@google.com> wrote:
> > >
> > > On Thu, Feb 23, 2023, Yu Zhao wrote:
> > > > On Thu, Feb 23, 2023 at 10:09 AM Sean Christopherson <seanjc@google.com> wrote:
> > > > > > I'll take a look at that series. clear_bit() probably won't cause any
> > > > > > practical damage but is technically wrong because, for example, it can
> > > > > > end up clearing the A-bit in a non-leaf PMD. (cmpxchg will just fail
> > > > > > in this case, obviously.)
> > > > >
> > > > > Eh, not really.  By that argument, clearing an A-bit in a huge PTE is also technically
> > > > > wrong because the target gfn may or may not have been accessed.
> > > >
> > > > Sorry, I don't understand. You mean clear_bit() on a huge PTE is
> > > > technically wrong? Yes, that's what I mean. (cmpxchg() on a huge PTE
> > > > is not.)
> > > >
> > > > > The only way for
> > > > > KVM to clear an A-bit in a non-leaf entry is if the entry _was_ a huge PTE, but was
> > > > > replaced between the "is leaf" and the clear_bit().
> > > >
> > > > I think there is a misunderstanding here. Let me be more specific:
> > > > 1. Clearing the A-bit in a non-leaf entry is technically wrong because
> > > > that's not our intention.
> > > > 2. When we try to clear_bit() on a leaf PMD, it can at the same time
> > > > become a non-leaf PMD, which causes 1) above, and therefore is
> > > > technically wrong.
> > > > 3. I don't think 2) could do any real harm, so practically no problem.
> > > > 4. cmpxchg() can avoid 2).
> > > >
> > > > Does this make sense?
> > >
> > > I understand what you're saying, but clearing an A-bit on a non-leaf PMD that
> > > _just_ got converted from a leaf PMD is "wrong" if and only if the intended
> > > behavior is nonsensical.
> >
> > Sorry, let me rephrase:
> > 1. Clearing the A-bit in a non-leaf entry is technically wrong because
> > we didn't make sure there is the A-bit there --  the bit we are
> > clearing can be something else. (Yes, we know it's not, but we didn't
> > define this behavior, e.g., a macro to designate that bit for non-leaf
> > entries.
>
> Heh, by that definition, anything and everything is "technically wrong".

I really don't see how what I said, in our context,

  "Clearing the A-bit in a non-leaf entry is technically wrong because
we didn't make sure there is the A-bit there"

can imply

  "anything and everything is "technically wrong"."

And how what I said can be an analogy to

  "An Intel CPU might support SVM, even though we know no such CPUs
exist, so requiring AMD or Hygon to enable SVM is technically wrong."

BTW, here is a bug caused by clearing the A-bit in non-leaf entries in
a different scenario:
https://lore.kernel.org/linux-mm/20221123064510.16225-1-jgross@suse.com/

Let's just agree to disagree.
Sean Christopherson Feb. 23, 2023, 7:21 p.m. UTC | #10
On Thu, Feb 23, 2023, Yu Zhao wrote:
> On Thu, Feb 23, 2023 at 11:47 AM Sean Christopherson <seanjc@google.com> wrote:
> >
> > On Thu, Feb 23, 2023, Yu Zhao wrote:
> > > On Thu, Feb 23, 2023 at 11:24 AM Sean Christopherson <seanjc@google.com> wrote:
> > > >
> > > > On Thu, Feb 23, 2023, Yu Zhao wrote:
> > > > > On Thu, Feb 23, 2023 at 10:09 AM Sean Christopherson <seanjc@google.com> wrote:
> > > > > > > I'll take a look at that series. clear_bit() probably won't cause any
> > > > > > > practical damage but is technically wrong because, for example, it can
> > > > > > > end up clearing the A-bit in a non-leaf PMD. (cmpxchg will just fail
> > > > > > > in this case, obviously.)
> > > > > >
> > > > > > Eh, not really.  By that argument, clearing an A-bit in a huge PTE is also technically
> > > > > > wrong because the target gfn may or may not have been accessed.
> > > > >
> > > > > Sorry, I don't understand. You mean clear_bit() on a huge PTE is
> > > > > technically wrong? Yes, that's what I mean. (cmpxchg() on a huge PTE
> > > > > is not.)
> > > > >
> > > > > > The only way for
> > > > > > KVM to clear an A-bit in a non-leaf entry is if the entry _was_ a huge PTE, but was
> > > > > > replaced between the "is leaf" and the clear_bit().
> > > > >
> > > > > I think there is a misunderstanding here. Let me be more specific:
> > > > > 1. Clearing the A-bit in a non-leaf entry is technically wrong because
> > > > > that's not our intention.
> > > > > 2. When we try to clear_bit() on a leaf PMD, it can at the same time
> > > > > become a non-leaf PMD, which causes 1) above, and therefore is
> > > > > technically wrong.
> > > > > 3. I don't think 2) could do any real harm, so practically no problem.
> > > > > 4. cmpxchg() can avoid 2).
> > > > >
> > > > > Does this make sense?
> > > >
> > > > I understand what you're saying, but clearing an A-bit on a non-leaf PMD that
> > > > _just_ got converted from a leaf PMD is "wrong" if and only if the intended
> > > > behavior is nonsensical.
> > >
> > > Sorry, let me rephrase:
> > > 1. Clearing the A-bit in a non-leaf entry is technically wrong because
> > > we didn't make sure there is the A-bit there --  the bit we are
> > > clearing can be something else. (Yes, we know it's not, but we didn't
> > > define this behavior, e.g., a macro to designate that bit for non-leaf
> > > entries.
> >
> > Heh, by that definition, anything and everything is "technically wrong".
> 
> I really don't see how what I said, in our context,
> 
>   "Clearing the A-bit in a non-leaf entry is technically wrong because
> we didn't make sure there is the A-bit there"
> 
> can imply
> 
>   "anything and everything is "technically wrong"."
> 
> And how what I said can be an analogy to
> 
>   "An Intel CPU might support SVM, even though we know no such CPUs
> exist, so requiring AMD or Hygon to enable SVM is technically wrong."
> 
> BTW, here is a bug caused by clearing the A-bit in non-leaf entries in
> a different scenario:
> https://lore.kernel.org/linux-mm/20221123064510.16225-1-jgross@suse.com/
> 
> Let's just agree to disagree.

No, because I don't want anyone to leave with the impression that relying on the
Accessed bit to uniformly exist (or not) at all levels in the TDP MMU is somehow
technically wrong.  The link you posted is about running as a Xen guest, and is
in arch-agnostic code.  That is wildly different than what we are talking about
here, where the targets are strictly limited to x86-64 TDP, and the existence of
the Accessed bit is architecturally defined.

In this code, there are exactly two flavors of paging that can be in use, and
using clear_bit() to clear shadow_accessed_mask is safe for both, full stop.
Yu Zhao Feb. 23, 2023, 7:25 p.m. UTC | #11
On Thu, Feb 23, 2023 at 12:21 PM Sean Christopherson <seanjc@google.com> wrote:
>
> On Thu, Feb 23, 2023, Yu Zhao wrote:
> > On Thu, Feb 23, 2023 at 11:47 AM Sean Christopherson <seanjc@google.com> wrote:
> > >
> > > On Thu, Feb 23, 2023, Yu Zhao wrote:
> > > > On Thu, Feb 23, 2023 at 11:24 AM Sean Christopherson <seanjc@google.com> wrote:
> > > > >
> > > > > On Thu, Feb 23, 2023, Yu Zhao wrote:
> > > > > > On Thu, Feb 23, 2023 at 10:09 AM Sean Christopherson <seanjc@google.com> wrote:
> > > > > > > > I'll take a look at that series. clear_bit() probably won't cause any
> > > > > > > > practical damage but is technically wrong because, for example, it can
> > > > > > > > end up clearing the A-bit in a non-leaf PMD. (cmpxchg will just fail
> > > > > > > > in this case, obviously.)
> > > > > > >
> > > > > > > Eh, not really.  By that argument, clearing an A-bit in a huge PTE is also technically
> > > > > > > wrong because the target gfn may or may not have been accessed.
> > > > > >
> > > > > > Sorry, I don't understand. You mean clear_bit() on a huge PTE is
> > > > > > technically wrong? Yes, that's what I mean. (cmpxchg() on a huge PTE
> > > > > > is not.)
> > > > > >
> > > > > > > The only way for
> > > > > > > KVM to clear an A-bit in a non-leaf entry is if the entry _was_ a huge PTE, but was
> > > > > > > replaced between the "is leaf" and the clear_bit().
> > > > > >
> > > > > > I think there is a misunderstanding here. Let me be more specific:
> > > > > > 1. Clearing the A-bit in a non-leaf entry is technically wrong because
> > > > > > that's not our intention.
> > > > > > 2. When we try to clear_bit() on a leaf PMD, it can at the same time
> > > > > > become a non-leaf PMD, which causes 1) above, and therefore is
> > > > > > technically wrong.
> > > > > > 3. I don't think 2) could do any real harm, so practically no problem.
> > > > > > 4. cmpxchg() can avoid 2).
> > > > > >
> > > > > > Does this make sense?
> > > > >
> > > > > I understand what you're saying, but clearing an A-bit on a non-leaf PMD that
> > > > > _just_ got converted from a leaf PMD is "wrong" if and only if the intended
> > > > > behavior is nonsensical.
> > > >
> > > > Sorry, let me rephrase:
> > > > 1. Clearing the A-bit in a non-leaf entry is technically wrong because
> > > > we didn't make sure there is the A-bit there --  the bit we are
> > > > clearing can be something else. (Yes, we know it's not, but we didn't
> > > > define this behavior, e.g., a macro to designate that bit for non-leaf
> > > > entries.
> > >
> > > Heh, by that definition, anything and everything is "technically wrong".
> >
> > I really don't see how what I said, in our context,
> >
> >   "Clearing the A-bit in a non-leaf entry is technically wrong because
> > we didn't make sure there is the A-bit there"
> >
> > can infer
> >
> >   "anything and everything is "technically wrong"."
> >
> > And how what I said can be an analogy to
> >
> >   "An Intel CPU might support SVM, even though we know no such CPUs
> > exist, so requiring AMD or Hygon to enable SVM is technically wrong."
> >
> > BTW, here is a bug caused by clearing the A-bit in non-leaf entries in
> > a different scenario:
> > https://lore.kernel.org/linux-mm/20221123064510.16225-1-jgross@suse.com/
> >
> > Let's just agree to disagree.
>
> No, because I don't want anyone to leave with the impression that relying on the
> Accessed bit to uniformly exist (or not) at all levels in the TDP MMU is somehow
> technically wrong.  The link you posted is about running as a Xen guest, and is
> in arch-agnostic code.  That is wildly different than what we are talking about
> here, where the targets are strictly limited to x86-64 TDP, and the existence of
> the Accessed bit is architecturally defined.

Yes, I see now.

Sorry to say this, but this is all I needed to hear: "the existence of
the Accessed bit is architecturally defined". (I didn't know this, and
didn't see that this is what you meant.)

> In this code, there are exactly two flavors of paging that can be in use, and
> using clear_bit() to clear shadow_accessed_mask is safe for both, full stop.

Patch

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 6aaae18f1854..d2995c9e8f07 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1367,6 +1367,12 @@ struct kvm_arch {
 	 *	the MMU lock in read mode + the tdp_mmu_pages_lock or
 	 *	the MMU lock in write mode
 	 *
+	 * kvm_arch_test_clear_young() is a special case. It relies on two
+	 * techniques, RCU and cmpxchg, to safely test and clear the accessed
+	 * bit without taking the MMU lock. The former protects KVM page tables
+	 * from being freed while the latter clears the accessed bit atomically
+	 * against both the hardware and other software page table walkers.
+	 *
 	 * Roots will remain in the list until their tdp_mmu_root_count
 	 * drops to zero, at which point the thread that decremented the
 	 * count to zero should removed the root from the list and clean
@@ -2171,4 +2177,25 @@ int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages);
 	 KVM_X86_QUIRK_FIX_HYPERCALL_INSN |	\
 	 KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS)
 
+extern u64 __read_mostly shadow_accessed_mask;
+
+/*
+ * Returns true if A/D bits are supported in hardware and are enabled by KVM.
+ * When enabled, KVM uses A/D bits for all non-nested MMUs.  Because L1 can
+ * disable A/D bits in EPTP12, SP and SPTE variants are needed to handle the
+ * scenario where KVM is using A/D bits for L1, but not L2.
+ */
+static inline bool kvm_ad_enabled(void)
+{
+	return shadow_accessed_mask;
+}
+
+/* see the comments on the generic kvm_arch_has_test_clear_young() */
+#define kvm_arch_has_test_clear_young kvm_arch_has_test_clear_young
+static inline bool kvm_arch_has_test_clear_young(void)
+{
+	return IS_ENABLED(CONFIG_KVM) && IS_ENABLED(CONFIG_X86_64) &&
+	       (!IS_REACHABLE(CONFIG_KVM) || (kvm_ad_enabled() && tdp_enabled));
+}
+
 #endif /* _ASM_X86_KVM_HOST_H */
diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h
index 6f54dc9409c9..0dc7fed1f3fd 100644
--- a/arch/x86/kvm/mmu/spte.h
+++ b/arch/x86/kvm/mmu/spte.h
@@ -153,7 +153,6 @@ extern u64 __read_mostly shadow_mmu_writable_mask;
 extern u64 __read_mostly shadow_nx_mask;
 extern u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */
 extern u64 __read_mostly shadow_user_mask;
-extern u64 __read_mostly shadow_accessed_mask;
 extern u64 __read_mostly shadow_dirty_mask;
 extern u64 __read_mostly shadow_mmio_value;
 extern u64 __read_mostly shadow_mmio_mask;
@@ -247,17 +246,6 @@ static inline bool is_shadow_present_pte(u64 pte)
 	return !!(pte & SPTE_MMU_PRESENT_MASK);
 }
 
-/*
- * Returns true if A/D bits are supported in hardware and are enabled by KVM.
- * When enabled, KVM uses A/D bits for all non-nested MMUs.  Because L1 can
- * disable A/D bits in EPTP12, SP and SPTE variants are needed to handle the
- * scenario where KVM is using A/D bits for L1, but not L2.
- */
-static inline bool kvm_ad_enabled(void)
-{
-	return !!shadow_accessed_mask;
-}
-
 static inline bool sp_ad_disabled(struct kvm_mmu_page *sp)
 {
 	return sp->role.ad_disabled;
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index d6df38d371a0..9028e09f1aab 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -1309,6 +1309,47 @@ bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
 	return kvm_tdp_mmu_handle_gfn(kvm, range, age_gfn_range);
 }
 
+bool kvm_arch_test_clear_young(struct kvm *kvm, struct kvm_gfn_range *range,
+			       gfn_t lsb_gfn, unsigned long *bitmap)
+{
+	struct kvm_mmu_page *root;
+
+	if (WARN_ON_ONCE(!kvm_arch_has_test_clear_young()))
+		return false;
+
+	if (kvm_memslots_have_rmaps(kvm))
+		return false;
+
+	/* see the comments on kvm_arch->tdp_mmu_roots */
+	rcu_read_lock();
+
+	list_for_each_entry_rcu(root, &kvm->arch.tdp_mmu_roots, link) {
+		struct tdp_iter iter;
+
+		if (kvm_mmu_page_as_id(root) != range->slot->as_id)
+			continue;
+
+		tdp_root_for_each_leaf_pte(iter, root, range->start, range->end) {
+			u64 *sptep = rcu_dereference(iter.sptep);
+			u64 new_spte = iter.old_spte & ~shadow_accessed_mask;
+
+			VM_WARN_ON_ONCE(!page_count(virt_to_page(sptep)));
+			VM_WARN_ON_ONCE(iter.gfn < range->start || iter.gfn >= range->end);
+
+			if (new_spte == iter.old_spte)
+				continue;
+
+			/* see the comments on the generic kvm_arch_has_test_clear_young() */
+			if (__test_and_change_bit(lsb_gfn - iter.gfn, bitmap))
+				cmpxchg64(sptep, iter.old_spte, new_spte);
+		}
+	}
+
+	rcu_read_unlock();
+
+	return true;
+}
+
 static bool test_age_gfn(struct kvm *kvm, struct tdp_iter *iter,
 			 struct kvm_gfn_range *range)
 {