| Message ID | 202603301612587174XZ6QMCrymBqv30S6BN50@zte.com.cn |
|---|---|
| State | Accepted |
| Headers | show |
| Series | RISC-V: KVM: Fix hugepage mapping handling during dirty logging | expand |
```
On Mon, Mar 30, 2026 at 1:43 PM <wang.yechao255@zte.com.cn> wrote:
>
> From: Wang Yechao <wang.yechao255@zte.com.cn>
>
> During dirty logging, all huge pages are write-protected. When the guest
> writes to a write-protected huge page, a page fault is triggered. Before
> recovering the write permission, the huge page must be split into smaller
> pages (e.g., 4K). After splitting, the normal mapping process proceeds,
> allowing write permission to be restored at the smaller page granularity.
>
> If dirty logging is disabled because migration failed or was cancelled,
> only recover the write permission at the 4K level, and skip recovering the
> huge page mapping at this time to avoid the overhead of freeing page tables.
> The huge page mapping can be recovered in the ioctl context, similar to x86,
> in a later patch.
>
> Signed-off-by: Wang Yechao <wang.yechao255@zte.com.cn>

LGTM.

Reviewed-by: Anup Patel <anup@brainfault.org>

Thanks,
Anup

> ---
>  arch/riscv/include/asm/kvm_gstage.h |   4 +
>  arch/riscv/kvm/gstage.c             | 126 ++++++++++++++++++++++++++++
>  2 files changed, 130 insertions(+)
>
> diff --git a/arch/riscv/include/asm/kvm_gstage.h b/arch/riscv/include/asm/kvm_gstage.h
> index 595e2183173e..373748c6745e 100644
> --- a/arch/riscv/include/asm/kvm_gstage.h
> +++ b/arch/riscv/include/asm/kvm_gstage.h
> @@ -53,6 +53,10 @@ int kvm_riscv_gstage_map_page(struct kvm_gstage *gstage,
>  			      bool page_rdonly, bool page_exec,
>  			      struct kvm_gstage_mapping *out_map);
>
> +int kvm_riscv_gstage_split_huge(struct kvm_gstage *gstage,
> +				struct kvm_mmu_memory_cache *pcache,
> +				gpa_t addr, u32 target_level, bool flush);
> +
>  enum kvm_riscv_gstage_op {
>  	GSTAGE_OP_NOP = 0,	/* Nothing */
>  	GSTAGE_OP_CLEAR,	/* Clear/Unmap */
> diff --git a/arch/riscv/kvm/gstage.c b/arch/riscv/kvm/gstage.c
> index d2001d508046..ffec3e5ddcaf 100644
> --- a/arch/riscv/kvm/gstage.c
> +++ b/arch/riscv/kvm/gstage.c
> @@ -163,13 +163,32 @@ int kvm_riscv_gstage_set_pte(struct kvm_gstage *gstage,
>  	return 0;
>  }
>
> +static void kvm_riscv_gstage_update_pte_prot(struct kvm_gstage *gstage, u32 level,
> +					     gpa_t addr, pte_t *ptep, pgprot_t prot)
> +{
> +	pte_t new_pte;
> +
> +	if (pgprot_val(pte_pgprot(ptep_get(ptep))) == pgprot_val(prot))
> +		return;
> +
> +	new_pte = pfn_pte(pte_pfn(ptep_get(ptep)), prot);
> +	new_pte = pte_mkdirty(new_pte);
> +
> +	set_pte(ptep, new_pte);
> +
> +	gstage_tlb_flush(gstage, level, addr);
> +}
> +
>  int kvm_riscv_gstage_map_page(struct kvm_gstage *gstage,
>  			      struct kvm_mmu_memory_cache *pcache,
>  			      gpa_t gpa, phys_addr_t hpa, unsigned long page_size,
>  			      bool page_rdonly, bool page_exec,
>  			      struct kvm_gstage_mapping *out_map)
>  {
> +	bool found_leaf;
> +	u32 ptep_level;
>  	pgprot_t prot;
> +	pte_t *ptep;
>  	int ret;
>
>  	out_map->addr = gpa;
> @@ -203,12 +222,119 @@ int kvm_riscv_gstage_map_page(struct kvm_gstage *gstage,
>  		else
>  			prot = PAGE_WRITE;
>  	}
> +
> +	found_leaf = kvm_riscv_gstage_get_leaf(gstage, gpa, &ptep, &ptep_level);
> +	if (found_leaf) {
> +		/*
> +		 * ptep_level is the current gstage mapping level of addr, out_map->level
> +		 * is the required mapping level during fault handling.
> +		 *
> +		 * 1) ptep_level > out_map->level
> +		 *    This happens when dirty logging is enabled and huge pages are used.
> +		 *    KVM must track the pages at 4K level, and split the huge mapping
> +		 *    into 4K mappings.
> +		 *
> +		 * 2) ptep_level < out_map->level
> +		 *    This happens when dirty logging is disabled and huge pages are used.
> +		 *    The gstage is split into 4K mappings, but the out_map level is now
> +		 *    back to the huge page level. Ignore the out_map level this time, and
> +		 *    just update the pte prot here. Otherwise, we would fall back to mapping
> +		 *    the gstage at huge page level in `kvm_riscv_gstage_set_pte`, with the
> +		 *    overhead of freeing the page tables(not support now), which would slow
> +		 *    down the vCPUs' performance.
> +		 *
> +		 *    It is better to recover the huge page mapping in the ioctl context when
> +		 *    disabling dirty logging.
> +		 *
> +		 * 3) ptep_level == out_map->level
> +		 *    We already have the ptep, just update the pte prot if the pfn not change.
> +		 *    There is no need to invoke `kvm_riscv_gstage_set_pte` again.
> +		 */
> +		if (ptep_level > out_map->level) {
> +			kvm_riscv_gstage_split_huge(gstage, pcache, gpa,
> +						    out_map->level, true);
> +		} else if (ALIGN_DOWN(PFN_PHYS(pte_pfn(ptep_get(ptep))), page_size) == hpa) {
> +			kvm_riscv_gstage_update_pte_prot(gstage, ptep_level, gpa, ptep, prot);
> +			return 0;
> +		}
> +	}
> +
>  	out_map->pte = pfn_pte(PFN_DOWN(hpa), prot);
>  	out_map->pte = pte_mkdirty(out_map->pte);
>
>  	return kvm_riscv_gstage_set_pte(gstage, pcache, out_map);
>  }
>
> +static inline unsigned long make_child_pte(unsigned long huge_pte, int index,
> +					   unsigned long child_page_size)
> +{
> +	unsigned long child_pte = huge_pte;
> +	unsigned long child_pfn_offset;
> +
> +	/*
> +	 * The child_pte already has the base address of the huge page being
> +	 * split. So we just have to OR in the offset to the page at the next
> +	 * lower level for the given index.
> +	 */
> +	child_pfn_offset = index * (child_page_size / PAGE_SIZE);
> +	child_pte |= pte_val(pfn_pte(child_pfn_offset, __pgprot(0)));
> +
> +	return child_pte;
> +}
> +
> +int kvm_riscv_gstage_split_huge(struct kvm_gstage *gstage,
> +				struct kvm_mmu_memory_cache *pcache,
> +				gpa_t addr, u32 target_level, bool flush)
> +{
> +	u32 current_level = kvm_riscv_gstage_pgd_levels - 1;
> +	pte_t *next_ptep = (pte_t *)gstage->pgd;
> +	unsigned long huge_pte, child_pte;
> +	unsigned long child_page_size;
> +	pte_t *ptep;
> +	int i, ret;
> +
> +	if (!pcache)
> +		return -ENOMEM;
> +
> +	while(current_level > target_level) {
> +		ptep = (pte_t *)&next_ptep[gstage_pte_index(addr, current_level)];
> +
> +		if (!pte_val(ptep_get(ptep)))
> +			break;
> +
> +		if (!gstage_pte_leaf(ptep)) {
> +			next_ptep = (pte_t *)gstage_pte_page_vaddr(ptep_get(ptep));
> +			current_level--;
> +			continue;
> +		}
> +
> +		huge_pte = pte_val(ptep_get(ptep));
> +
> +		ret = gstage_level_to_page_size(current_level - 1, &child_page_size);
> +		if (ret)
> +			return ret;
> +
> +		next_ptep = kvm_mmu_memory_cache_alloc(pcache);
> +		if (!next_ptep)
> +			return -ENOMEM;
> +
> +		for (i = 0; i < PTRS_PER_PTE; i++) {
> +			child_pte = make_child_pte(huge_pte, i, child_page_size);
> +			set_pte((pte_t *)&next_ptep[i], __pte(child_pte));
> +		}
> +
> +		set_pte(ptep, pfn_pte(PFN_DOWN(__pa(next_ptep)),
> +				      __pgprot(_PAGE_TABLE)));
> +
> +		if (flush)
> +			gstage_tlb_flush(gstage, current_level, addr);
> +
> +		current_level--;
> +	}
> +
> +	return 0;
> +}
> +
>  void kvm_riscv_gstage_op_pte(struct kvm_gstage *gstage, gpa_t addr,
>  			     pte_t *ptep, u32 ptep_level, enum kvm_riscv_gstage_op op)
>  {
> --
> 2.47.3
>
> --
> kvm-riscv mailing list
> kvm-riscv@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/kvm-riscv
```
```diff
diff --git a/arch/riscv/include/asm/kvm_gstage.h b/arch/riscv/include/asm/kvm_gstage.h
index 595e2183173e..373748c6745e 100644
--- a/arch/riscv/include/asm/kvm_gstage.h
+++ b/arch/riscv/include/asm/kvm_gstage.h
@@ -53,6 +53,10 @@ int kvm_riscv_gstage_map_page(struct kvm_gstage *gstage,
 			      bool page_rdonly, bool page_exec,
 			      struct kvm_gstage_mapping *out_map);
 
+int kvm_riscv_gstage_split_huge(struct kvm_gstage *gstage,
+				struct kvm_mmu_memory_cache *pcache,
+				gpa_t addr, u32 target_level, bool flush);
+
 enum kvm_riscv_gstage_op {
 	GSTAGE_OP_NOP = 0,	/* Nothing */
 	GSTAGE_OP_CLEAR,	/* Clear/Unmap */
diff --git a/arch/riscv/kvm/gstage.c b/arch/riscv/kvm/gstage.c
index d2001d508046..ffec3e5ddcaf 100644
--- a/arch/riscv/kvm/gstage.c
+++ b/arch/riscv/kvm/gstage.c
@@ -163,13 +163,32 @@ int kvm_riscv_gstage_set_pte(struct kvm_gstage *gstage,
 	return 0;
 }
 
+static void kvm_riscv_gstage_update_pte_prot(struct kvm_gstage *gstage, u32 level,
+					     gpa_t addr, pte_t *ptep, pgprot_t prot)
+{
+	pte_t new_pte;
+
+	if (pgprot_val(pte_pgprot(ptep_get(ptep))) == pgprot_val(prot))
+		return;
+
+	new_pte = pfn_pte(pte_pfn(ptep_get(ptep)), prot);
+	new_pte = pte_mkdirty(new_pte);
+
+	set_pte(ptep, new_pte);
+
+	gstage_tlb_flush(gstage, level, addr);
+}
+
 int kvm_riscv_gstage_map_page(struct kvm_gstage *gstage,
 			      struct kvm_mmu_memory_cache *pcache,
 			      gpa_t gpa, phys_addr_t hpa, unsigned long page_size,
 			      bool page_rdonly, bool page_exec,
 			      struct kvm_gstage_mapping *out_map)
 {
+	bool found_leaf;
+	u32 ptep_level;
 	pgprot_t prot;
+	pte_t *ptep;
 	int ret;
 
 	out_map->addr = gpa;
@@ -203,12 +222,119 @@ int kvm_riscv_gstage_map_page(struct kvm_gstage *gstage,
 		else
 			prot = PAGE_WRITE;
 	}
+
+	found_leaf = kvm_riscv_gstage_get_leaf(gstage, gpa, &ptep, &ptep_level);
+	if (found_leaf) {
+		/*
+		 * ptep_level is the current gstage mapping level of addr, out_map->level
+		 * is the required mapping level during fault handling.
+		 *
+		 * 1) ptep_level > out_map->level
+		 *    This happens when dirty logging is enabled and huge pages are used.
+		 *    KVM must track the pages at 4K level, and split the huge mapping
+		 *    into 4K mappings.
+		 *
+		 * 2) ptep_level < out_map->level
+		 *    This happens when dirty logging is disabled and huge pages are used.
+		 *    The gstage is split into 4K mappings, but the out_map level is now
+		 *    back to the huge page level. Ignore the out_map level this time, and
+		 *    just update the pte prot here. Otherwise, we would fall back to mapping
+		 *    the gstage at huge page level in `kvm_riscv_gstage_set_pte`, with the
+		 *    overhead of freeing the page tables(not support now), which would slow
+		 *    down the vCPUs' performance.
+		 *
+		 *    It is better to recover the huge page mapping in the ioctl context when
+		 *    disabling dirty logging.
+		 *
+		 * 3) ptep_level == out_map->level
+		 *    We already have the ptep, just update the pte prot if the pfn not change.
+		 *    There is no need to invoke `kvm_riscv_gstage_set_pte` again.
+		 */
+		if (ptep_level > out_map->level) {
+			kvm_riscv_gstage_split_huge(gstage, pcache, gpa,
+						    out_map->level, true);
+		} else if (ALIGN_DOWN(PFN_PHYS(pte_pfn(ptep_get(ptep))), page_size) == hpa) {
+			kvm_riscv_gstage_update_pte_prot(gstage, ptep_level, gpa, ptep, prot);
+			return 0;
+		}
+	}
+
 	out_map->pte = pfn_pte(PFN_DOWN(hpa), prot);
 	out_map->pte = pte_mkdirty(out_map->pte);
 
 	return kvm_riscv_gstage_set_pte(gstage, pcache, out_map);
 }
 
+static inline unsigned long make_child_pte(unsigned long huge_pte, int index,
+					   unsigned long child_page_size)
+{
+	unsigned long child_pte = huge_pte;
+	unsigned long child_pfn_offset;
+
+	/*
+	 * The child_pte already has the base address of the huge page being
+	 * split. So we just have to OR in the offset to the page at the next
+	 * lower level for the given index.
+	 */
+	child_pfn_offset = index * (child_page_size / PAGE_SIZE);
+	child_pte |= pte_val(pfn_pte(child_pfn_offset, __pgprot(0)));
+
+	return child_pte;
+}
+
+int kvm_riscv_gstage_split_huge(struct kvm_gstage *gstage,
+				struct kvm_mmu_memory_cache *pcache,
+				gpa_t addr, u32 target_level, bool flush)
+{
+	u32 current_level = kvm_riscv_gstage_pgd_levels - 1;
+	pte_t *next_ptep = (pte_t *)gstage->pgd;
+	unsigned long huge_pte, child_pte;
+	unsigned long child_page_size;
+	pte_t *ptep;
+	int i, ret;
+
+	if (!pcache)
+		return -ENOMEM;
+
+	while(current_level > target_level) {
+		ptep = (pte_t *)&next_ptep[gstage_pte_index(addr, current_level)];
+
+		if (!pte_val(ptep_get(ptep)))
+			break;
+
+		if (!gstage_pte_leaf(ptep)) {
+			next_ptep = (pte_t *)gstage_pte_page_vaddr(ptep_get(ptep));
+			current_level--;
+			continue;
+		}
+
+		huge_pte = pte_val(ptep_get(ptep));
+
+		ret = gstage_level_to_page_size(current_level - 1, &child_page_size);
+		if (ret)
+			return ret;
+
+		next_ptep = kvm_mmu_memory_cache_alloc(pcache);
+		if (!next_ptep)
+			return -ENOMEM;
+
+		for (i = 0; i < PTRS_PER_PTE; i++) {
+			child_pte = make_child_pte(huge_pte, i, child_page_size);
+			set_pte((pte_t *)&next_ptep[i], __pte(child_pte));
+		}
+
+		set_pte(ptep, pfn_pte(PFN_DOWN(__pa(next_ptep)),
+				      __pgprot(_PAGE_TABLE)));
+
+		if (flush)
+			gstage_tlb_flush(gstage, current_level, addr);
+
+		current_level--;
+	}
+
+	return 0;
+}
+
 void kvm_riscv_gstage_op_pte(struct kvm_gstage *gstage, gpa_t addr,
 			     pte_t *ptep, u32 ptep_level, enum kvm_riscv_gstage_op op)
 {
```