| Message ID | 202509301520545960_jMIljfZY7bMHBkBbaHR@zte.com.cn |
|---|---|
| State | New |
| Headers | show |
| Series | [v2] RISC-V: KVM: Transparent huge page support | expand |
On Tue, Sep 30, 2025 at 12:51 PM <liu.xuemei1@zte.com.cn> wrote: > > From: Jessica Liu <liu.xuemei1@zte.com.cn> > > Use block mapping if backed by a THP, as implemented in architectures > like ARM and x86_64. > > Signed-off-by: Jessica Liu <liu.xuemei1@zte.com.cn> > --- > Changes in v2: > - Fixed the typo of writing PAGE_SHIFT as PAGE_SIZE. > > arch/riscv/include/asm/kvm_gstage.h | 3 + > arch/riscv/kvm/gstage.c | 100 ++++++++++++++++++++++++++++ > arch/riscv/kvm/mmu.c | 12 +++- > 3 files changed, 114 insertions(+), 1 deletion(-) > > diff --git a/arch/riscv/include/asm/kvm_gstage.h b/arch/riscv/include/asm/kvm_gstage.h > index 595e2183173e..cc67fb2d2d42 100644 > --- a/arch/riscv/include/asm/kvm_gstage.h > +++ b/arch/riscv/include/asm/kvm_gstage.h > @@ -69,4 +69,7 @@ void kvm_riscv_gstage_wp_range(struct kvm_gstage *gstage, gpa_t start, gpa_t end > > void kvm_riscv_gstage_mode_detect(void); > > +long kvm_riscv_gstage_thp_adjust(struct kvm *kvm, struct kvm_memory_slot *memslot, > + unsigned long hva, kvm_pfn_t *pfnp, gpa_t *gpa); > + > #endif > diff --git a/arch/riscv/kvm/gstage.c b/arch/riscv/kvm/gstage.c > index 24c270d6d0e2..129dee62c570 100644 > --- a/arch/riscv/kvm/gstage.c > +++ b/arch/riscv/kvm/gstage.c > @@ -77,6 +77,106 @@ static int gstage_level_to_page_size(u32 level, unsigned long *out_pgsize) > return 0; > } > > +static int gstage_get_user_mapping_size(struct kvm *kvm, u64 addr) > +{ > + pte_t *ptepp; > + u32 ptep_level; > + unsigned long out_pgsize; > + struct kvm_gstage gstage = { > + .pgd = kvm->mm->pgd > + }; > + > + if (!kvm_riscv_gstage_get_leaf(&gstage, addr, &ptepp, &ptep_level)) > + return -EFAULT; > + > + if (gstage_level_to_page_size(ptep_level, &out_pgsize)) > + return -EFAULT; > + > + return out_pgsize; > +} > + > +static bool gstage_supports_huge_mapping(struct kvm_memory_slot *memslot, unsigned long hva) > +{ > + gpa_t gpa_start; > + hva_t uaddr_start, uaddr_end; > + size_t size; > + > + size = memslot->npages * PAGE_SIZE; > + uaddr_start = 
memslot->userspace_addr; > + uaddr_end = uaddr_start + size; > + > + gpa_start = memslot->base_gfn << PAGE_SHIFT; > + > + /* > + * Pages belonging to memslots that don't have the same alignment > + * within a PMD for userspace and GPA cannot be mapped with g-stage > + * PMD entries, because we'll end up mapping the wrong pages. > + * > + * Consider a layout like the following: > + * > + * memslot->userspace_addr: > + * +-----+--------------------+--------------------+---+ > + * |abcde|fgh vs-stage block | vs-stage block tv|xyz| > + * +-----+--------------------+--------------------+---+ > + * > + * memslot->base_gfn << PAGE_SHIFT: > + * +---+--------------------+--------------------+-----+ > + * |abc|def g-stage block | g-stage block |tvxyz| > + * +---+--------------------+--------------------+-----+ > + * > + * If we create those g-stage blocks, we'll end up with this incorrect > + * mapping: > + * d -> f > + * e -> g > + * f -> h > + */ > + if ((gpa_start & (PMD_SIZE - 1)) != (uaddr_start & (PMD_SIZE - 1))) > + return false; > + > + /* > + * Next, let's make sure we're not trying to map anything not covered > + * by the memslot. This means we have to prohibit block size mappings > + * for the beginning and end of a non-block aligned and non-block sized > + * memory slot (illustrated by the head and tail parts of the > + * userspace view above containing pages 'abcde' and 'xyz', > + * respectively). > + * > + * Note that it doesn't matter if we do the check using the > + * userspace_addr or the base_gfn, as both are equally aligned (per > + * the check above) and equally sized. > + */ > + return (hva >= ALIGN(uaddr_start, PMD_SIZE)) && (hva < ALIGN_DOWN(uaddr_end, PMD_SIZE)); > +} > + > +long kvm_riscv_gstage_thp_adjust(struct kvm *kvm, struct kvm_memory_slot *memslot, > + unsigned long hva, kvm_pfn_t *hfnp, gpa_t *gpa) > +{ > + kvm_pfn_t hfn = *hfnp; > + > + /* > + * Make sure the adjustment is done only for THP pages. 
Also make > + * sure that the HVA and GPA are sufficiently aligned and that the > + * block map is contained within the memslot. > + */ > + if (gstage_supports_huge_mapping(memslot, hva)) { > + int sz = gstage_get_user_mapping_size(kvm, hva); > + > + if (sz < 0) > + return sz; > + > + if (sz < PMD_SIZE) > + return PAGE_SIZE; > + > + *gpa &= PMD_MASK; > + hfn &= ~(PTRS_PER_PMD - 1); > + *hfnp = hfn; > + > + return PMD_SIZE; > + } > + > + return PAGE_SIZE; > +} > + gstage.c is for common page table management, which will be shared by nested virtualization and pKVM, whereas mmu.c is for host/hypervisor mappings. All of the above functions except gstage_get_user_mapping_size() must be moved to mmu.c. Also, change the prototype of gstage_get_user_mapping_size() to: int kvm_riscv_gstage_get_mapping_size(struct kvm_gstage *gstage, gpa_t addr); > bool kvm_riscv_gstage_get_leaf(struct kvm_gstage *gstage, gpa_t addr, > pte_t **ptepp, u32 *ptep_level) > { > diff --git a/arch/riscv/kvm/mmu.c b/arch/riscv/kvm/mmu.c > index 525fb5a330c0..f70cf721ebb8 100644 > --- a/arch/riscv/kvm/mmu.c > +++ b/arch/riscv/kvm/mmu.c > @@ -337,7 +337,8 @@ int kvm_riscv_mmu_map(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot, > struct kvm_mmu_memory_cache *pcache = &vcpu->arch.mmu_page_cache; > bool logging = (memslot->dirty_bitmap && > !(memslot->flags & KVM_MEM_READONLY)) ? 
true : false; > - unsigned long vma_pagesize, mmu_seq; > + unsigned long mmu_seq; > + long vma_pagesize; > struct kvm_gstage gstage; > struct page *page; > > @@ -416,6 +417,15 @@ int kvm_riscv_mmu_map(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot, > if (mmu_invalidate_retry(kvm, mmu_seq)) > goto out_unlock; > > + /* check if we are backed by a THP and thus use block mapping if possible */ > + if (vma_pagesize == PAGE_SIZE) { > + vma_pagesize = kvm_riscv_gstage_thp_adjust(kvm, memslot, hva, &hfn, &gpa); > + if (vma_pagesize < 0) { > + ret = vma_pagesize; > + goto out_unlock; > + } > + } > + > if (writable) { > mark_page_dirty_in_slot(kvm, memslot, gfn); > ret = kvm_riscv_gstage_map_page(&gstage, pcache, gpa, hfn << PAGE_SHIFT, > -- > 2.27.0 > Regards, Anup
diff --git a/arch/riscv/include/asm/kvm_gstage.h b/arch/riscv/include/asm/kvm_gstage.h index 595e2183173e..cc67fb2d2d42 100644 --- a/arch/riscv/include/asm/kvm_gstage.h +++ b/arch/riscv/include/asm/kvm_gstage.h @@ -69,4 +69,7 @@ void kvm_riscv_gstage_wp_range(struct kvm_gstage *gstage, gpa_t start, gpa_t end void kvm_riscv_gstage_mode_detect(void); +long kvm_riscv_gstage_thp_adjust(struct kvm *kvm, struct kvm_memory_slot *memslot, + unsigned long hva, kvm_pfn_t *pfnp, gpa_t *gpa); + #endif diff --git a/arch/riscv/kvm/gstage.c b/arch/riscv/kvm/gstage.c index 24c270d6d0e2..129dee62c570 100644 --- a/arch/riscv/kvm/gstage.c +++ b/arch/riscv/kvm/gstage.c @@ -77,6 +77,106 @@ static int gstage_level_to_page_size(u32 level, unsigned long *out_pgsize) return 0; } +static int gstage_get_user_mapping_size(struct kvm *kvm, u64 addr) +{ + pte_t *ptepp; + u32 ptep_level; + unsigned long out_pgsize; + struct kvm_gstage gstage = { + .pgd = kvm->mm->pgd + }; + + if (!kvm_riscv_gstage_get_leaf(&gstage, addr, &ptepp, &ptep_level)) + return -EFAULT; + + if (gstage_level_to_page_size(ptep_level, &out_pgsize)) + return -EFAULT; + + return out_pgsize; +} + +static bool gstage_supports_huge_mapping(struct kvm_memory_slot *memslot, unsigned long hva) +{ + gpa_t gpa_start; + hva_t uaddr_start, uaddr_end; + size_t size; + + size = memslot->npages * PAGE_SIZE; + uaddr_start = memslot->userspace_addr; + uaddr_end = uaddr_start + size; + + gpa_start = memslot->base_gfn << PAGE_SHIFT; + + /* + * Pages belonging to memslots that don't have the same alignment + * within a PMD for userspace and GPA cannot be mapped with g-stage + * PMD entries, because we'll end up mapping the wrong pages. 
+ * + * Consider a layout like the following: + * + * memslot->userspace_addr: + * +-----+--------------------+--------------------+---+ + * |abcde|fgh vs-stage block | vs-stage block tv|xyz| + * +-----+--------------------+--------------------+---+ + * + * memslot->base_gfn << PAGE_SHIFT: + * +---+--------------------+--------------------+-----+ + * |abc|def g-stage block | g-stage block |tvxyz| + * +---+--------------------+--------------------+-----+ + * + * If we create those g-stage blocks, we'll end up with this incorrect + * mapping: + * d -> f + * e -> g + * f -> h + */ + if ((gpa_start & (PMD_SIZE - 1)) != (uaddr_start & (PMD_SIZE - 1))) + return false; + + /* + * Next, let's make sure we're not trying to map anything not covered + * by the memslot. This means we have to prohibit block size mappings + * for the beginning and end of a non-block aligned and non-block sized + * memory slot (illustrated by the head and tail parts of the + * userspace view above containing pages 'abcde' and 'xyz', + * respectively). + * + * Note that it doesn't matter if we do the check using the + * userspace_addr or the base_gfn, as both are equally aligned (per + * the check above) and equally sized. + */ + return (hva >= ALIGN(uaddr_start, PMD_SIZE)) && (hva < ALIGN_DOWN(uaddr_end, PMD_SIZE)); +} + +long kvm_riscv_gstage_thp_adjust(struct kvm *kvm, struct kvm_memory_slot *memslot, + unsigned long hva, kvm_pfn_t *hfnp, gpa_t *gpa) +{ + kvm_pfn_t hfn = *hfnp; + + /* + * Make sure the adjustment is done only for THP pages. Also make + * sure that the HVA and GPA are sufficiently aligned and that the + * block map is contained within the memslot. 
+ */ + if (gstage_supports_huge_mapping(memslot, hva)) { + int sz = gstage_get_user_mapping_size(kvm, hva); + + if (sz < 0) + return sz; + + if (sz < PMD_SIZE) + return PAGE_SIZE; + + *gpa &= PMD_MASK; + hfn &= ~(PTRS_PER_PMD - 1); + *hfnp = hfn; + + return PMD_SIZE; + } + + return PAGE_SIZE; +} + bool kvm_riscv_gstage_get_leaf(struct kvm_gstage *gstage, gpa_t addr, pte_t **ptepp, u32 *ptep_level) { diff --git a/arch/riscv/kvm/mmu.c b/arch/riscv/kvm/mmu.c index 525fb5a330c0..f70cf721ebb8 100644 --- a/arch/riscv/kvm/mmu.c +++ b/arch/riscv/kvm/mmu.c @@ -337,7 +337,8 @@ int kvm_riscv_mmu_map(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot, struct kvm_mmu_memory_cache *pcache = &vcpu->arch.mmu_page_cache; bool logging = (memslot->dirty_bitmap && !(memslot->flags & KVM_MEM_READONLY)) ? true : false; - unsigned long vma_pagesize, mmu_seq; + unsigned long mmu_seq; + long vma_pagesize; struct kvm_gstage gstage; struct page *page; @@ -416,6 +417,15 @@ int kvm_riscv_mmu_map(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot, if (mmu_invalidate_retry(kvm, mmu_seq)) goto out_unlock; + /* check if we are backed by a THP and thus use block mapping if possible */ + if (vma_pagesize == PAGE_SIZE) { + vma_pagesize = kvm_riscv_gstage_thp_adjust(kvm, memslot, hva, &hfn, &gpa); + if (vma_pagesize < 0) { + ret = vma_pagesize; + goto out_unlock; + } + } + if (writable) { mark_page_dirty_in_slot(kvm, memslot, gfn); ret = kvm_riscv_gstage_map_page(&gstage, pcache, gpa, hfn << PAGE_SHIFT,