| Message ID | 20251111124932618qn9qbBbeaZrOZ3UDg7jed@zte.com.cn |
|---|---|
| State | New |
| Series | [v3] RISC-V: KVM: Transparent huge page support |
On Tue, Nov 11, 2025 at 10:19 AM <liu.xuemei1@zte.com.cn> wrote:
>
> From: Jessica Liu <liu.xuemei1@zte.com.cn>
>
> Use block mapping if backed by a THP, as implemented in architectures
> like ARM and x86_64.
>
> Signed-off-by: Jessica Liu <liu.xuemei1@zte.com.cn>
> ---
> Changes in v3:
> - Changed prototype of gstage_get_user_mapping_size to
>   kvm_riscv_gstage_get_mapping_size.
> - Relocated the remaining functions from gstage.c in v2 to mmu.c and
>   renamed them.
>
>  arch/riscv/include/asm/kvm_gstage.h |  2 +
>  arch/riscv/kvm/gstage.c             | 15 +++++
>  arch/riscv/kvm/mmu.c                | 97 ++++++++++++++++++++++++++++-
>  3 files changed, 113 insertions(+), 1 deletion(-)
>
> diff --git a/arch/riscv/include/asm/kvm_gstage.h b/arch/riscv/include/asm/kvm_gstage.h
> index 595e2183173e..006bbdb90df8 100644
> --- a/arch/riscv/include/asm/kvm_gstage.h
> +++ b/arch/riscv/include/asm/kvm_gstage.h
> @@ -69,4 +69,6 @@ void kvm_riscv_gstage_wp_range(struct kvm_gstage *gstage, gpa_t start, gpa_t end
>
>  void kvm_riscv_gstage_mode_detect(void);
>
> +int kvm_riscv_gstage_get_mapping_size(struct kvm_gstage *gstage, gpa_t addr);
> +
>  #endif
> diff --git a/arch/riscv/kvm/gstage.c b/arch/riscv/kvm/gstage.c
> index b67d60d722c2..a63089206869 100644
> --- a/arch/riscv/kvm/gstage.c
> +++ b/arch/riscv/kvm/gstage.c
> @@ -357,3 +357,18 @@ void __init kvm_riscv_gstage_mode_detect(void)
>         csr_write(CSR_HGATP, 0);
>         kvm_riscv_local_hfence_gvma_all();
>  }
> +
> +int kvm_riscv_gstage_get_mapping_size(struct kvm_gstage *gstage, gpa_t addr)
> +{
> +       pte_t *ptepp;
> +       u32 ptep_level;
> +       unsigned long out_pgsize;
> +
> +       if (!kvm_riscv_gstage_get_leaf(gstage, addr, &ptepp, &ptep_level))
> +               return -EFAULT;
> +
> +       if (gstage_level_to_page_size(ptep_level, &out_pgsize))
> +               return -EFAULT;
> +
> +       return out_pgsize;
> +}
> diff --git a/arch/riscv/kvm/mmu.c b/arch/riscv/kvm/mmu.c
> index 525fb5a330c0..1457bc958505 100644
> --- a/arch/riscv/kvm/mmu.c
> +++ b/arch/riscv/kvm/mmu.c
> @@ -323,6 +323,91 @@ bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
>         return pte_young(ptep_get(ptep));
>  }
>
> +static bool fault_supports_gstage_huge_mapping(struct kvm_memory_slot *memslot, unsigned long hva)
> +{
> +       gpa_t gpa_start;
> +       hva_t uaddr_start, uaddr_end;
> +       size_t size;

Declare local variables in inverted pyramid fashion when possible.

> +
> +       size = memslot->npages * PAGE_SIZE;
> +       uaddr_start = memslot->userspace_addr;
> +       uaddr_end = uaddr_start + size;
> +
> +       gpa_start = memslot->base_gfn << PAGE_SHIFT;
> +
> +       /*
> +        * Pages belonging to memslots that don't have the same alignment
> +        * within a PMD for userspace and GPA cannot be mapped with g-stage
> +        * PMD entries, because we'll end up mapping the wrong pages.
> +        *
> +        * Consider a layout like the following:
> +        *
> +        *    memslot->userspace_addr:
> +        *    +-----+--------------------+--------------------+---+
> +        *    |abcde|fgh  vs-stage block |  vs-stage block  tv|xyz|
> +        *    +-----+--------------------+--------------------+---+
> +        *
> +        *    memslot->base_gfn << PAGE_SHIFT:
> +        *      +---+--------------------+--------------------+-----+
> +        *      |abc|def  g-stage block  |   g-stage block    |tvxyz|
> +        *      +---+--------------------+--------------------+-----+
> +        *
> +        * If we create those g-stage blocks, we'll end up with this incorrect
> +        * mapping:
> +        *   d -> f
> +        *   e -> g
> +        *   f -> h
> +        */
> +       if ((gpa_start & (PMD_SIZE - 1)) != (uaddr_start & (PMD_SIZE - 1)))
> +               return false;
> +
> +       /*
> +        * Next, let's make sure we're not trying to map anything not covered
> +        * by the memslot. This means we have to prohibit block size mappings
> +        * for the beginning and end of a non-block aligned and non-block sized
> +        * memory slot (illustrated by the head and tail parts of the
> +        * userspace view above containing pages 'abcde' and 'xyz',
> +        * respectively).
> +        *
> +        * Note that it doesn't matter if we do the check using the
> +        * userspace_addr or the base_gfn, as both are equally aligned (per
> +        * the check above) and equally sized.
> +        */
> +       return (hva >= ALIGN(uaddr_start, PMD_SIZE)) && (hva < ALIGN_DOWN(uaddr_end, PMD_SIZE));
> +}
> +
> +static long transparent_hugepage_adjust(struct kvm *kvm, struct kvm_memory_slot *memslot,
> +                                       unsigned long hva, kvm_pfn_t *hfnp, gpa_t *gpa)
> +{
> +       kvm_pfn_t hfn = *hfnp;
> +
> +       /*
> +        * Make sure the adjustment is done only for THP pages. Also make
> +        * sure that the HVA and GPA are sufficiently aligned and that the
> +        * block map is contained within the memslot.
> +        */
> +       if (fault_supports_gstage_huge_mapping(memslot, hva)) {
> +               struct kvm_gstage gstage;

Declare sz here.

> +
> +               gstage.pgd = kvm->mm->pgd;
> +               int sz = kvm_riscv_gstage_get_mapping_size(&gstage, hva);

This is broken because you are passing hva as gpa to kvm_riscv_gstage_get_mapping_size().

> +
> +               if (sz < 0)
> +                       return sz;
> +
> +               if (sz < PMD_SIZE)
> +                       return PAGE_SIZE;
> +
> +               *gpa &= PMD_MASK;
> +               hfn &= ~(PTRS_PER_PMD - 1);
> +               *hfnp = hfn;
> +
> +               return PMD_SIZE;
> +       }
> +
> +       return PAGE_SIZE;
> +}
> +
>  int kvm_riscv_mmu_map(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot,
>                        gpa_t gpa, unsigned long hva, bool is_write,
>                        struct kvm_gstage_mapping *out_map)
> @@ -337,7 +422,8 @@ int kvm_riscv_mmu_map(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot,
>         struct kvm_mmu_memory_cache *pcache = &vcpu->arch.mmu_page_cache;
>         bool logging = (memslot->dirty_bitmap &&
>                         !(memslot->flags & KVM_MEM_READONLY)) ? true : false;
> -       unsigned long vma_pagesize, mmu_seq;
> +       unsigned long mmu_seq;
> +       long vma_pagesize;
>         struct kvm_gstage gstage;
>         struct page *page;
>
> @@ -416,6 +502,15 @@ int kvm_riscv_mmu_map(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot,
>         if (mmu_invalidate_retry(kvm, mmu_seq))
>                 goto out_unlock;
>
> +       /* check if we are backed by a THP and thus use block mapping if possible */
> +       if (vma_pagesize == PAGE_SIZE) {
> +               vma_pagesize = transparent_hugepage_adjust(kvm, memslot, hva, &hfn, &gpa);
> +               if (vma_pagesize < 0) {
> +                       ret = vma_pagesize;
> +                       goto out_unlock;
> +               }
> +       }
> +
>         if (writable) {
>                 mark_page_dirty_in_slot(kvm, memslot, gfn);
>                 ret = kvm_riscv_gstage_map_page(&gstage, pcache, gpa, hfn << PAGE_SHIFT,
> --
> 2.27.0

Regards,
Anup
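For reference, the "inverted pyramid" convention requested in the first comment orders local declarations from the longest line down to the shortest. A minimal sketch of the declarations in fault_supports_gstage_huge_mapping() reordered that way, using only the variables already present in the patch, with the function body unchanged:

static bool fault_supports_gstage_huge_mapping(struct kvm_memory_slot *memslot, unsigned long hva)
{
        /* Longest declaration first, shortest last ("inverted pyramid"). */
        hva_t uaddr_start, uaddr_end;
        gpa_t gpa_start;
        size_t size;

        /* ... rest of the function body unchanged ... */
}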
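On the last two comments: kvm_riscv_gstage_get_mapping_size() walks a g-stage table keyed by guest physical addresses, so passing hva (with gstage.pgd pointed at the host mm's pgd) hands a host virtual address to a walker that expects a GPA. What the THP adjustment actually needs is the size of the host userspace mapping backing hva, similar in spirit to how the arm64 THP adjustment checks the userspace mapping. The sketch below is purely illustrative and is not the fix mandated by the review: the helper name host_mapping_size() is invented here, the walk uses the generic pgd/p4d/pud/pmd accessors, and locking as well as races against THP split/collapse are deliberately ignored. It also shows sz declared at the top of the block, as the second comment asks.

/* Hypothetical helper (name invented for illustration): size of the host mapping backing hva. */
static long host_mapping_size(struct kvm *kvm, unsigned long hva)
{
        pgd_t *pgd = pgd_offset(kvm->mm, hva);
        p4d_t *p4d;
        pud_t *pud;
        pmd_t *pmd;

        if (pgd_none(*pgd) || pgd_bad(*pgd))
                return -EFAULT;

        p4d = p4d_offset(pgd, hva);
        if (p4d_none(*p4d))
                return -EFAULT;

        pud = pud_offset(p4d, hva);
        if (pud_none(*pud))
                return -EFAULT;
        if (pud_leaf(*pud))
                return PUD_SIZE;

        pmd = pmd_offset(pud, hva);
        if (pmd_none(*pmd))
                return -EFAULT;
        if (pmd_leaf(*pmd))
                return PMD_SIZE;

        return PAGE_SIZE;
}

static long transparent_hugepage_adjust(struct kvm *kvm, struct kvm_memory_slot *memslot,
                                        unsigned long hva, kvm_pfn_t *hfnp, gpa_t *gpa)
{
        if (fault_supports_gstage_huge_mapping(memslot, hva)) {
                long sz;        /* declared at the top of the block, per the review */

                sz = host_mapping_size(kvm, hva);       /* size of the host mapping backing hva */
                if (sz < 0)
                        return sz;
                if (sz < PMD_SIZE)
                        return PAGE_SIZE;

                /* Align both the GPA and the host PFN down to a PMD boundary. */
                *gpa &= PMD_MASK;
                *hfnp &= ~(PTRS_PER_PMD - 1);
                return PMD_SIZE;
        }

        return PAGE_SIZE;
}

How such a walk would synchronize with the mmu_invalidate_retry()/mmu_lock dance in kvm_riscv_mmu_map(), and with concurrent THP split or collapse, is something the actual series would still need to settle.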