[mm-unstable,v2,09/10] kvm/x86: add kvm_arch_test_clear_young()

Message ID: 20230526234435.662652-10-yuzhao@google.com
State: Handled Elsewhere
Series: mm/kvm: locklessly clear the accessed bit

Commit Message

Yu Zhao May 26, 2023, 11:44 p.m. UTC
Implement kvm_arch_test_clear_young() to support the fast path in
mmu_notifier_ops->test_clear_young().

It focuses on the simple case, i.e., the TDP MMU sets the accessed bit in
KVM PTEs and VMs are not nested, where it can rely on RCU and
clear_bit() to safely clear the accessed bit without taking
kvm->mmu_lock. Complex cases fall back to the existing slow path,
which does take kvm->mmu_lock.

Signed-off-by: Yu Zhao <yuzhao@google.com>
---
 arch/x86/include/asm/kvm_host.h |  7 +++++++
 arch/x86/kvm/mmu/tdp_mmu.c      | 34 +++++++++++++++++++++++++++++++++
 2 files changed, 41 insertions(+)
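
The safety argument in the message above is terse, so here is a hedged
illustration of the pattern it relies on (an illustration only; the actual
walk is in the patch at the bottom of this page): RCU keeps the TDP
page-table pages alive while they are walked, and clear_bit() is an atomic
read-modify-write, so clearing only the accessed bit cannot corrupt
concurrent hardware or software updates to the other bits of the SPTE.

	/*
	 * Illustrative sketch, not the patch itself.  The bit offset assumes
	 * the usual case where shadow_accessed_mask has a single bit set
	 * (e.g. bit 8 with EPT A/D bits enabled).
	 */
	rcu_read_lock();				/* pins the TDP page-table pages */
	if (READ_ONCE(*sptep) & shadow_accessed_mask)
		clear_bit(ffs(shadow_accessed_mask) - 1,
			  (unsigned long *)sptep);	/* atomic, touches only the A bit */
	rcu_read_unlock();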

Comments

Paolo Bonzini June 9, 2023, 9:06 a.m. UTC | #1
On 5/27/23 01:44, Yu Zhao wrote:
> +#define kvm_arch_has_test_clear_young kvm_arch_has_test_clear_young
> +static inline bool kvm_arch_has_test_clear_young(void)
> +{
> +	return IS_ENABLED(CONFIG_X86_64) &&
> +	       (!IS_REACHABLE(CONFIG_KVM) || (tdp_mmu_enabled && shadow_accessed_mask));
> +}

I don't think you need IS_REACHABLE(CONFIG_KVM) here; it would be a bug
if this were called from outside KVM code.

Maybe make it a BUILD_BUG_ON?

Paolo
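
One possible reading of Paolo's suggestion, sketched under the assumption
that the IS_REACHABLE() term is dropped from the return value and enforced
at build time instead (this is not the code that was ultimately merged):

	#define kvm_arch_has_test_clear_young kvm_arch_has_test_clear_young
	static inline bool kvm_arch_has_test_clear_young(void)
	{
		/* Sketch: calling this from outside KVM becomes a build error. */
		BUILD_BUG_ON(!IS_REACHABLE(CONFIG_KVM));

		return IS_ENABLED(CONFIG_X86_64) && tdp_mmu_enabled && shadow_accessed_mask;
	}
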
Sean Christopherson June 15, 2023, 6:26 p.m. UTC | #2
On Fri, May 26, 2023, Yu Zhao wrote:
> diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> index 08340219c35a..6875a819e007 100644
> --- a/arch/x86/kvm/mmu/tdp_mmu.c
> +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> @@ -1232,6 +1232,40 @@ bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
>  	return kvm_tdp_mmu_handle_gfn(kvm, range, test_age_gfn);
>  }
>  
> +bool kvm_arch_test_clear_young(struct kvm *kvm, struct kvm_gfn_range *range)
> +{
> +	struct kvm_mmu_page *root;
> +	int offset = ffs(shadow_accessed_mask) - 1;
> +
> +	if (kvm_shadow_root_allocated(kvm))

This needs a comment.

> +		return true;
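
A sketch of the kind of comment being requested, assuming (from the commit
message, not from anything stated in this thread) that the check exists
because shadow roots imply nested or shadow-paging configurations the
lockless TDP-only walk cannot see:

	/*
	 * Sketch only: with shadow roots allocated, accessed information may
	 * live in shadow MMU pages that this lockless TDP walk never visits,
	 * so ask the caller to take the slow path under kvm->mmu_lock.
	 */
	if (kvm_shadow_root_allocated(kvm))
		return true;
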
> +
> +	rcu_read_lock();
> +
> +	list_for_each_entry_rcu(root, &kvm->arch.tdp_mmu_roots, link) {

As requested in v1[1], please add a macro for a lockless walk.

[1] https://lkml.kernel.org/r/Y%2Fed0XYAPx%2B7pukA%40google.com
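
The macro is not shown anywhere in this thread; a hypothetical sketch of the
shape it might take (name and form are assumptions), folding the RCU list
walk and the address-space filter into one iterator:

	/* Hypothetical helper for illustration -- not an existing KVM macro. */
	#define for_each_tdp_mmu_root_rcu(_kvm, _root, _as_id)				\
		list_for_each_entry_rcu(_root, &(_kvm)->arch.tdp_mmu_roots, link)	\
			if (kvm_mmu_page_as_id(_root) != (_as_id)) {			\
			} else

The loop quoted below would then start with
for_each_tdp_mmu_root_rcu(kvm, root, range->slot->as_id) instead of the
open-coded list_for_each_entry_rcu() plus continue.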

> +		struct tdp_iter iter;
> +
> +		if (kvm_mmu_page_as_id(root) != range->slot->as_id)
> +			continue;
> +
> +		tdp_root_for_each_leaf_pte(iter, root, range->start, range->end) {
> +			u64 *sptep = rcu_dereference(iter.sptep);
> +
> +			VM_WARN_ON_ONCE(!page_count(virt_to_page(sptep)));

Hrm, I don't like adding this in KVM.  The primary MMU might guarantee that this
callback is invoked if and only if the SPTE is backed by struct page memory, but
there's no reason to assume that's true in KVM.  If we want the sanity check, then
this needs to use kvm_pfn_to_refcounted_page().

And it should use KVM's MMU_WARN_ON(), which is a mess and effectively dead code,
but I'm working on changing that[2], i.e. by the time this gets to Linus' tree,
the sanity check should have a much cleaner implementation.

[2] https://lore.kernel.org/all/20230511235917.639770-8-seanjc@google.com
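
A hedged sketch of what this might look like, assuming the check is kept at
all: spte_to_pfn() is the existing SPTE-to-pfn helper, and the exact form of
the warning depends on how the MMU_WARN_ON() rework in [2] lands.

	/*
	 * Sketch only: sanity-check the pfn the SPTE maps rather than the page
	 * holding the SPTE, behind KVM's (normally compiled-out) MMU_WARN_ON().
	 */
	MMU_WARN_ON(!kvm_pfn_to_refcounted_page(spte_to_pfn(iter.old_spte)));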

> +
> +			if (!(iter.old_spte & shadow_accessed_mask))
> +				continue;
> +
> +			if (kvm_should_clear_young(range, iter.gfn))
> +				clear_bit(offset, (unsigned long *)sptep);

If/when you rebase on https://github.com/kvm-x86/linux/tree/next, can you pull
out the atomic bits of tdp_mmu_clear_spte_bits() and use that new helper? E.g.

diff --git a/arch/x86/kvm/mmu/tdp_iter.h b/arch/x86/kvm/mmu/tdp_iter.h
index fae559559a80..914c34518829 100644
--- a/arch/x86/kvm/mmu/tdp_iter.h
+++ b/arch/x86/kvm/mmu/tdp_iter.h
@@ -58,15 +58,18 @@ static inline u64 kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 old_spte,
        return old_spte;
 }
 
+static inline u64 tdp_mmu_clear_spte_bits_atomic(tdp_ptep_t sptep, u64 mask)
+{
+       atomic64_t *sptep_atomic = (atomic64_t *)rcu_dereference(sptep);
+
+       return (u64)atomic64_fetch_and(~mask, sptep_atomic);
+}
+
 static inline u64 tdp_mmu_clear_spte_bits(tdp_ptep_t sptep, u64 old_spte,
                                          u64 mask, int level)
 {
-       atomic64_t *sptep_atomic;
-
-       if (kvm_tdp_mmu_spte_need_atomic_write(old_spte, level)) {
-               sptep_atomic = (atomic64_t *)rcu_dereference(sptep);
-               return (u64)atomic64_fetch_and(~mask, sptep_atomic);
-       }
+       if (kvm_tdp_mmu_spte_need_atomic_write(old_spte, level))
+               return tdp_mmu_clear_spte_bits_atomic(sptep, mask);
 
        __kvm_tdp_mmu_write_spte(sptep, old_spte & ~mask);
        return old_spte;
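
For context, a sketch of how the loop body in kvm_arch_test_clear_young()
might then look after such a rebase (an assumption, not code from this
thread):

	if (!(iter.old_spte & shadow_accessed_mask))
		continue;

	if (kvm_should_clear_young(range, iter.gfn))
		/* atomic AND-NOT on the live SPTE, replacing the raw clear_bit() */
		tdp_mmu_clear_spte_bits_atomic(iter.sptep, shadow_accessed_mask);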

Patch

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 753c67072c47..d6dfdebe3d94 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -2223,4 +2223,11 @@ int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages);
  */
 #define KVM_EXIT_HYPERCALL_MBZ		GENMASK_ULL(31, 1)
 
+#define kvm_arch_has_test_clear_young kvm_arch_has_test_clear_young
+static inline bool kvm_arch_has_test_clear_young(void)
+{
+	return IS_ENABLED(CONFIG_X86_64) &&
+	       (!IS_REACHABLE(CONFIG_KVM) || (tdp_mmu_enabled && shadow_accessed_mask));
+}
+
 #endif /* _ASM_X86_KVM_HOST_H */
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 08340219c35a..6875a819e007 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -1232,6 +1232,40 @@ bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
 	return kvm_tdp_mmu_handle_gfn(kvm, range, test_age_gfn);
 }
 
+bool kvm_arch_test_clear_young(struct kvm *kvm, struct kvm_gfn_range *range)
+{
+	struct kvm_mmu_page *root;
+	int offset = ffs(shadow_accessed_mask) - 1;
+
+	if (kvm_shadow_root_allocated(kvm))
+		return true;
+
+	rcu_read_lock();
+
+	list_for_each_entry_rcu(root, &kvm->arch.tdp_mmu_roots, link) {
+		struct tdp_iter iter;
+
+		if (kvm_mmu_page_as_id(root) != range->slot->as_id)
+			continue;
+
+		tdp_root_for_each_leaf_pte(iter, root, range->start, range->end) {
+			u64 *sptep = rcu_dereference(iter.sptep);
+
+			VM_WARN_ON_ONCE(!page_count(virt_to_page(sptep)));
+
+			if (!(iter.old_spte & shadow_accessed_mask))
+				continue;
+
+			if (kvm_should_clear_young(range, iter.gfn))
+				clear_bit(offset, (unsigned long *)sptep);
+		}
+	}
+
+	rcu_read_unlock();
+
+	return false;
+}
+
 static bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter,
 			 struct kvm_gfn_range *range)
 {