
[14/16] powerpc/book3s64/vmemmap: Switch radix to use a different vmemmap handling function

Message ID 20230606045608.55127-15-aneesh.kumar@linux.ibm.com (mailing list archive)
State Superseded
Series Add support for DAX vmemmap optimization for ppc64

Commit Message

Aneesh Kumar K V June 6, 2023, 4:56 a.m. UTC
This is in preparation for updating radix to implement vmemmap optimization for
devdax. Below are the rules w.r.t. radix vmemmap mapping:

1. First try to map things using a PMD (2M) mapping.
2. With an altmap, if the altmap cross-boundary check returns true, fall back to PAGE_SIZE.
3. If we can't allocate PMD_SIZE backing memory for the vmemmap, fall back to PAGE_SIZE.

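In rough pseudo-C, the populate side of these rules looks like the following
(a simplified sketch of the intended flow, not the exact patch code; the real
loop is in radix__vmemmap_populate() below):

	for (addr = start; addr < end; addr = next) {
		next = pmd_addr_end(addr, end);

		/* Rule 2: don't let an altmap-backed PMD spill past the altmap. */
		if (altmap && altmap_cross_boundary(altmap, addr, PMD_SIZE))
			goto base_mapping;

		/* Rule 1: try a 2M PMD mapping first. */
		p = vmemmap_alloc_block_buf(PMD_SIZE, node, altmap);
		if (p) {
			vmemmap_set_pmd(pmd, p, node, addr, next);
			continue;
		}
base_mapping:
		/* Rule 3: fall back to PAGE_SIZE mappings. */
		radix__vmemmap_pte_populate(pmd, addr, node, altmap, NULL);
		next = addr + PAGE_SIZE;
	}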
On removing vmemmap mapping, check if every subsection that is using the vmemmap
area is invalid. If found to be invalid, that implies we can safely free the
vmemmap area. We don't use the PAGE_UNUSED pattern used by x86 because with 64K
page size, we need to do the above check even at the PAGE_SIZE granularity.
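For the removal side, the per-PMD check reduces to roughly the following
(again a sketch; the real logic is in remove_pmd_table()/remove_pte_table()
in the diff below):

	if (IS_ALIGNED(addr, PMD_SIZE) && IS_ALIGNED(next, PMD_SIZE)) {
		/* The whole 2M area is being removed. */
		free_vmemmap_pages(pmd_page(*pmd), altmap, get_order(PMD_SIZE));
		pte_clear(&init_mm, addr, (pte_t *)pmd);
	} else if (!vmemmap_populated(ALIGN_DOWN(addr, PMD_SIZE), PMD_SIZE)) {
		/* Partial removal, but no valid subsection uses this area
		 * any more, so the backing pages can be freed as well. */
		free_vmemmap_pages(pmd_page(*pmd), altmap, get_order(PMD_SIZE));
		pte_clear(&init_mm, addr, (pte_t *)pmd);
	}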

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
 arch/powerpc/include/asm/book3s/64/radix.h |   2 +
 arch/powerpc/include/asm/pgtable.h         |   3 +
 arch/powerpc/mm/book3s64/radix_pgtable.c   | 293 +++++++++++++++++++--
 arch/powerpc/mm/init_64.c                  |  26 +-
 4 files changed, 293 insertions(+), 31 deletions(-)

Comments

Sachin Sant June 14, 2023, 10:50 a.m. UTC | #1
> 1. First try to map things using a PMD (2M) mapping.
> 2. With an altmap, if the altmap cross-boundary check returns true, fall back to PAGE_SIZE.
> 3. If we can't allocate PMD_SIZE backing memory for the vmemmap, fall back to PAGE_SIZE.
> 
> On removing vmemmap mapping, check if every subsection that is using the vmemmap
> area is invalid. If found to be invalid, that implies we can safely free the
> vmemmap area. We don't use the PAGE_UNUSED pattern used by x86 because with 64K
> page size, we need to do the above check even at the PAGE_SIZE granularity.
> 
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
> ---

With this patch series applied, I see the following warning:

[  OK  ] Started Monitoring of LVM2 mirrors,…sing dmeventd or progress polling.
[    3.283884] papr_scm ibm,persistent-memory:ibm,pmemory@44104001: nvdimm pmu didn't register rc=-2
[    3.284212] papr_scm ibm,persistent-memory:ibm,pmemory@44104002: nvdimm pmu didn't register rc=-2
[    3.563890] radix-mmu: Mapped 0x0000040010000000-0x0000040c90000000 with 64.0 KiB pages
[    3.703227] ------------[ cut here ]------------
[    3.703236] failed to free all reserved pages
[    3.703244] WARNING: CPU: 41 PID: 923 at mm/memremap.c:152 memunmap_pages+0x37c/0x3a0
[    3.703252] Modules linked in: device_dax(+) nd_pmem nd_btt dax_pmem papr_scm libnvdimm pseries_rng vmx_crypto aes_gcm_p10_crypto ext4 mbcache jbd2 sd_mod t10_pi crc64_rocksoft crc64 sg ibmvscsi scsi_transport_srp ibmveth fuse
[    3.703272] CPU: 41 PID: 923 Comm: systemd-udevd Not tainted 6.4.0-rc6-00037-gb6dad5178cea-dirty #1
[    3.703276] Hardware name: IBM,9080-HEX POWER10 (raw) 0x800200 0xf000006 of:IBM,FW1030.20 (NH1030_058) hv:phyp pSeries
[    3.703280] NIP:  c00000000057a18c LR: c00000000057a188 CTR: 00000000005ca81c
[    3.703283] REGS: c000000032a170d0 TRAP: 0700   Not tainted  (6.4.0-rc6-00037-gb6dad5178cea-dirty)
[    3.703286] MSR:  800000000282b033 <SF,VEC,VSX,EE,FP,ME,IR,DR,RI,LE>  CR: 48248824  XER: 00000002
[    3.703296] CFAR: c00000000015f0c0 IRQMASK: 0
[    3.703296] GPR00: c00000000057a188 c000000032a17370 c000000001421500 0000000000000021
[    3.703296] GPR04: 00000000ffff7fff c000000032a17140 c000000032a17138 0000000000000027
[    3.703296] GPR08: c0000015c91a7c10 0000000000000001 0000000000000027 c000000002a18a20
[    3.703296] GPR12: 0000000048248824 c0000015cb9f4300 c000000032a17d68 c000000001262b20
[    3.703296] GPR16: c008000001310000 000000000000ff20 000000000000fff2 c0080000012d7418
[    3.703296] GPR20: c000000032a17c30 0000000000000004 ffffffffffffc005 0000000001000200
[    3.703296] GPR24: c000000002f11570 c00000000e376870 0000000000000001 0000000000000001
[    3.703296] GPR28: c00000000e376840 c00000000e3768c8 0000000000000000 c00000000e376840
[    3.703333] NIP [c00000000057a18c] memunmap_pages+0x37c/0x3a0
[    3.703338] LR [c00000000057a188] memunmap_pages+0x378/0x3a0
[    3.703342] Call Trace:
[    3.703344] [c000000032a17370] [c00000000057a188] memunmap_pages+0x378/0x3a0 (unreliable)
[    3.703349] [c000000032a17420] [c00000000057a928] memremap_pages+0x4a8/0x890
[    3.703355] [c000000032a17500] [c00000000057ad4c] devm_memremap_pages+0x3c/0xd0
[    3.703359] [c000000032a17540] [c0080000011c084c] dev_dax_probe+0x134/0x3a0 [device_dax]
[    3.703366] [c000000032a175e0] [c0000000009f7e8c] dax_bus_probe+0xac/0x140
[    3.703371] [c000000032a17610] [c0000000009b5828] really_probe+0x108/0x530
[    3.703375] [c000000032a176a0] [c0000000009b5d04] __driver_probe_device+0xb4/0x200
[    3.703379] [c000000032a17720] [c0000000009b5ea8] driver_probe_device+0x58/0x120
[    3.703383] [c000000032a17760] [c0000000009b6298] __driver_attach+0x148/0x250
[    3.703387] [c000000032a177e0] [c0000000009b1a58] bus_for_each_dev+0xa8/0x130
[    3.703392] [c000000032a17840] [c0000000009b4b34] driver_attach+0x34/0x50
[    3.703396] [c000000032a17860] [c0000000009b3b98] bus_add_driver+0x258/0x300
[    3.703400] [c000000032a178f0] [c0000000009b78d4] driver_register+0xa4/0x1b0
[    3.703404] [c000000032a17960] [c0000000009f9530] __dax_driver_register+0x50/0x70
[    3.703409] [c000000032a17980] [c0080000011c1374] dax_init+0x3c/0x58 [device_dax]
[    3.703414] [c000000032a179a0] [c000000000013260] do_one_initcall+0x60/0x2f0
[    3.703418] [c000000032a17a70] [c000000000248af8] do_init_module+0x78/0x310
[    3.703424] [c000000032a17af0] [c00000000024bcac] load_module+0x2a7c/0x2f30
[    3.703429] [c000000032a17d00] [c00000000024c4f0] __do_sys_finit_module+0xe0/0x180
[    3.703434] [c000000032a17e10] [c0000000000374c0] system_call_exception+0x140/0x350
[    3.703439] [c000000032a17e50] [c00000000000d6a0] system_call_common+0x160/0x2e4
[    3.703444] --- interrupt: c00 at 0x7fff9af2fb34
[    3.703447] NIP:  00007fff9af2fb34 LR: 00007fff9b6dea9c CTR: 0000000000000000
[    3.703450] REGS: c000000032a17e80 TRAP: 0c00   Not tainted  (6.4.0-rc6-00037-gb6dad5178cea-dirty)
[    3.703453] MSR:  800000000280f033 <SF,VEC,VSX,EE,PR,FP,ME,IR,DR,RI,LE>  CR: 28222204  XER: 00000000
[    3.703462] IRQMASK: 0
[    3.703462] GPR00: 0000000000000161 00007fffed351350 00007fff9b007300 000000000000000f
[    3.703462] GPR04: 00007fff9b6ead30 0000000000000000 000000000000000f 0000000000000000
[    3.703462] GPR08: 0000000000000000 0000000000000000 0000000000000000 0000000000000000
[    3.703462] GPR12: 0000000000000000 00007fff9b7c6610 0000000000020000 000000011057db18
[    3.703462] GPR16: 00000001105c0108 0000000110585f48 0000000000000000 0000000000000000
[    3.703462] GPR20: 0000000000000000 0000000110585f80 0000000147985200 00007fffed351570
[    3.703462] GPR24: 00000001105c0128 0000000000020000 0000000000000000 0000000147981010
[    3.703462] GPR28: 00007fff9b6ead30 0000000000020000 0000000000000000 0000000147985200
[    3.703497] NIP [00007fff9af2fb34] 0x7fff9af2fb34
[    3.703499] LR [00007fff9b6dea9c] 0x7fff9b6dea9c
[    3.703502] --- interrupt: c00
[    3.703504] Code: 60000000 3d220170 8929b2b7 2f890000 409eff28 3c62ffe7 39200001 3d420170 3863c518 992ab2b7 4bbe4e55 60000000 <0fe00000> fac10060 fae10068 fb010070
[    3.703516] ---[ end trace 0000000000000000 ]---
[    3.703520] device_dax: probe of dax0.0 failed with error -12
[  OK  ] Created slice system-daxdev\x2dreconfigure.slice.
[  OK  ] Started udev Wait for Complete Device Initialization.
[  OK  ] Reached target Local File Systems (Pre).
[  OK  ] Reached target Local File Systems.

The warning appears after applying this patch. 

- Sachin
Aneesh Kumar K V June 15, 2023, 2:23 a.m. UTC | #2
Sachin Sant <sachinp@linux.ibm.com> writes:

>> 1. First try to map things using a PMD (2M) mapping.
>> 2. With an altmap, if the altmap cross-boundary check returns true, fall back to PAGE_SIZE.
>> 3. If we can't allocate PMD_SIZE backing memory for the vmemmap, fall back to PAGE_SIZE.
>> 
>> On removing vmemmap mapping, check if every subsection that is using the vmemmap
>> area is invalid. If found to be invalid, that implies we can safely free the
>> vmemmap area. We don't use the PAGE_UNUSED pattern used by x86 because with 64K
>> page size, we need to do the above check even at the PAGE_SIZE granularity.
>> 
>> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
>> ---
>
> With this patch series applied, I see the following warning:
>
> [    3.703227] ------------[ cut here ]------------
> [    3.703236] failed to free all reserved pages
> [    3.703244] WARNING: CPU: 41 PID: 923 at mm/memremap.c:152 memunmap_pages+0x37c/0x3a0
> [...]
> [    3.703520] device_dax: probe of dax0.0 failed with error -12
>

The change below fixes the warning on the test machine you shared:

diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c
index 1c49af91fd9c..d884c1b39128 100644
--- a/arch/powerpc/mm/book3s64/radix_pgtable.c
+++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
@@ -994,6 +994,7 @@ void __meminit vmemmap_set_pmd(pmd_t *pmdp, void *p, int node,
 	pte_t entry;
 	pte_t *ptep = pmdp_ptep(pmdp);
 
+	VM_BUG_ON(!IS_ALIGNED((unsigned long)(addr), PMD_SIZE));
 	entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
 	set_pte_at(&init_mm, addr, ptep, entry);
 	asm volatile("ptesync": : :"memory");
@@ -1012,6 +1013,10 @@ static pte_t * __meminit radix__vmemmap_pte_populate(pmd_t *pmd, unsigned long a
 		void *p;
 
 		if (!reuse) {
+
+			if (altmap && altmap_cross_boundary(altmap, addr, PAGE_SIZE))
+				altmap = NULL;
+
 			p = vmemmap_alloc_block_buf(PAGE_SIZE, node, altmap);
 			if (!p)
 				return NULL;
@@ -1028,6 +1033,8 @@ static pte_t * __meminit radix__vmemmap_pte_populate(pmd_t *pmd, unsigned long a
 			get_page(reuse);
 			p = page_to_virt(reuse);
 		}
+
+		VM_BUG_ON(!PAGE_ALIGNED(addr));
 		entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
 		set_pte_at(&init_mm, addr, pte, entry);
 		asm volatile("ptesync": : :"memory");
@@ -1108,10 +1115,14 @@ int __meminit radix__vmemmap_populate(unsigned long start, unsigned long end, in
 		pmd = vmemmap_pmd_alloc(pud, node, addr);
 		if (!pmd)
 			return -ENOMEM;
+
 		if (pmd_none(READ_ONCE(*pmd))) {
 			void *p;
 
-			if (altmap && altmap_cross_boundary(altmap, start, PMD_SIZE)) {
+			if (!IS_ALIGNED(addr, PMD_SIZE))
+				goto base_mapping;
+
+			if (altmap && altmap_cross_boundary(altmap, addr, PMD_SIZE)) {
 				/* make sure we don't create altmap mappings covering things outside. */
 				goto base_mapping;
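
The likely mechanism behind the warning: with a vmemmap range whose start is
not PMD_SIZE aligned, the populate path could take the PMD branch anyway, and
the cross-boundary check was done against start rather than each addr. Altmap
blocks could then back areas inconsistently with what the teardown path later
frees, leaving reserved altmap pages behind, which is exactly what
memunmap_pages() warns about. For reference, altmap_cross_boundary() (already
in arch/powerpc/mm/init_64.c, made non-static by this patch) checks whether
mapping page_size bytes of vmemmap starting at start would step outside the
altmap's pfn range, roughly:

	bool altmap_cross_boundary(struct vmem_altmap *altmap, unsigned long start,
				   unsigned long page_size)
	{
		/* number of pfns whose struct pages fit in page_size bytes */
		unsigned long nr_pfn = page_size / sizeof(struct page);
		unsigned long start_pfn = page_to_pfn((struct page *)start);

		if ((start_pfn + nr_pfn - 1) > altmap->end_pfn)
			return true;

		if (start_pfn < altmap->base_pfn)
			return true;

		return false;
	}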

Patch

diff --git a/arch/powerpc/include/asm/book3s/64/radix.h b/arch/powerpc/include/asm/book3s/64/radix.h
index 8cdff5a05011..87d4c1e62491 100644
--- a/arch/powerpc/include/asm/book3s/64/radix.h
+++ b/arch/powerpc/include/asm/book3s/64/radix.h
@@ -332,6 +332,8 @@  extern int __meminit radix__vmemmap_create_mapping(unsigned long start,
 					     unsigned long phys);
 int __meminit radix__vmemmap_populate(unsigned long start, unsigned long end,
 				      int node, struct vmem_altmap *altmap);
+void __ref radix__vmemmap_free(unsigned long start, unsigned long end,
+			       struct vmem_altmap *altmap);
 extern void radix__vmemmap_remove_mapping(unsigned long start,
 				    unsigned long page_size);
 
diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h
index 9972626ddaf6..6d4cd2ebae6e 100644
--- a/arch/powerpc/include/asm/pgtable.h
+++ b/arch/powerpc/include/asm/pgtable.h
@@ -168,6 +168,9 @@  static inline bool is_ioremap_addr(const void *x)
 
 struct seq_file;
 void arch_report_meminfo(struct seq_file *m);
+int __meminit vmemmap_populated(unsigned long vmemmap_addr, int vmemmap_map_size);
+bool altmap_cross_boundary(struct vmem_altmap *altmap, unsigned long start,
+			   unsigned long page_size);
 #endif /* CONFIG_PPC64 */
 
 #endif /* __ASSEMBLY__ */
diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c
index d7e2dd3d4add..65de8630abcb 100644
--- a/arch/powerpc/mm/book3s64/radix_pgtable.c
+++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
@@ -742,8 +742,57 @@  static void free_pud_table(pud_t *pud_start, p4d_t *p4d)
 	p4d_clear(p4d);
 }
 
+static bool __meminit vmemmap_pmd_is_unused(unsigned long addr, unsigned long end)
+{
+	unsigned long start = ALIGN_DOWN(addr, PMD_SIZE);
+
+	return !vmemmap_populated(start, PMD_SIZE);
+}
+
+static bool __meminit vmemmap_page_is_unused(unsigned long addr, unsigned long end)
+{
+	unsigned long start = ALIGN_DOWN(addr, PAGE_SIZE);
+
+	return !vmemmap_populated(start, PAGE_SIZE);
+
+}
+
+static void __meminit free_vmemmap_pages(struct page *page,
+					 struct vmem_altmap *altmap,
+					 int order)
+{
+	unsigned int nr_pages = 1 << order;
+
+	if (altmap) {
+		unsigned long alt_start, alt_end;
+		unsigned long base_pfn = page_to_pfn(page);
+
+		/*
+		 * With 1G vmemmap mapping we can have things set up
+		 * such that even though an altmap is specified we never
+		 * use the altmap.
+		 */
+		alt_start = altmap->base_pfn;
+		alt_end = altmap->base_pfn + altmap->reserve +
+			altmap->free + altmap->alloc + altmap->align;
+
+		if (base_pfn >= alt_start && base_pfn < alt_end) {
+			vmem_altmap_free(altmap, nr_pages);
+			return;
+		}
+	}
+
+	if (PageReserved(page)) {
+		/* allocated from memblock */
+		while (nr_pages--)
+			free_reserved_page(page++);
+	} else
+		free_pages((unsigned long)page_address(page), order);
+}
+
 static void remove_pte_table(pte_t *pte_start, unsigned long addr,
-			     unsigned long end, bool direct)
+			     unsigned long end, bool direct,
+			     struct vmem_altmap *altmap)
 {
 	unsigned long next, pages = 0;
 	pte_t *pte;
@@ -757,24 +806,23 @@  static void remove_pte_table(pte_t *pte_start, unsigned long addr,
 		if (!pte_present(*pte))
 			continue;
 
-		if (!PAGE_ALIGNED(addr) || !PAGE_ALIGNED(next)) {
-			/*
-			 * The vmemmap_free() and remove_section_mapping()
-			 * codepaths call us with aligned addresses.
-			 */
-			WARN_ONCE(1, "%s: unaligned range\n", __func__);
-			continue;
+		if (PAGE_ALIGNED(addr) && PAGE_ALIGNED(next)) {
+			if (!direct)
+				free_vmemmap_pages(pte_page(*pte), altmap, 0);
+			pte_clear(&init_mm, addr, pte);
+			pages++;
+		} else if (!direct && vmemmap_page_is_unused(addr, next)) {
+			free_vmemmap_pages(pte_page(*pte), altmap, 0);
+			pte_clear(&init_mm, addr, pte);
 		}
-
-		pte_clear(&init_mm, addr, pte);
-		pages++;
 	}
 	if (direct)
 		update_page_count(mmu_virtual_psize, -pages);
 }
 
 static void __meminit remove_pmd_table(pmd_t *pmd_start, unsigned long addr,
-				       unsigned long end, bool direct)
+				       unsigned long end, bool direct,
+				       struct vmem_altmap *altmap)
 {
 	unsigned long next, pages = 0;
 	pte_t *pte_base;
@@ -788,18 +836,21 @@  static void __meminit remove_pmd_table(pmd_t *pmd_start, unsigned long addr,
 			continue;
 
 		if (pmd_is_leaf(*pmd)) {
-			if (!IS_ALIGNED(addr, PMD_SIZE) ||
-			    !IS_ALIGNED(next, PMD_SIZE)) {
-				WARN_ONCE(1, "%s: unaligned range\n", __func__);
-				continue;
+			if (IS_ALIGNED(addr, PMD_SIZE) &&
+			    IS_ALIGNED(next, PMD_SIZE)) {
+				if (!direct)
+					free_vmemmap_pages(pmd_page(*pmd), altmap, get_order(PMD_SIZE));
+				pte_clear(&init_mm, addr, (pte_t *)pmd);
+				pages++;
+			} else if (vmemmap_pmd_is_unused(addr, next)) {
+				free_vmemmap_pages(pmd_page(*pmd), altmap, get_order(PMD_SIZE));
+				pte_clear(&init_mm, addr, (pte_t *)pmd);
 			}
-			pte_clear(&init_mm, addr, (pte_t *)pmd);
-			pages++;
 			continue;
 		}
 
 		pte_base = (pte_t *)pmd_page_vaddr(*pmd);
-		remove_pte_table(pte_base, addr, next, direct);
+		remove_pte_table(pte_base, addr, next, direct, altmap);
 		free_pte_table(pte_base, pmd);
 	}
 	if (direct)
@@ -807,7 +858,8 @@  static void __meminit remove_pmd_table(pmd_t *pmd_start, unsigned long addr,
 }
 
 static void __meminit remove_pud_table(pud_t *pud_start, unsigned long addr,
-				       unsigned long end, bool direct)
+				       unsigned long end, bool direct,
+				       struct vmem_altmap *altmap)
 {
 	unsigned long next, pages = 0;
 	pmd_t *pmd_base;
@@ -832,15 +884,16 @@  static void __meminit remove_pud_table(pud_t *pud_start, unsigned long addr,
 		}
 
 		pmd_base = pud_pgtable(*pud);
-		remove_pmd_table(pmd_base, addr, next, direct);
+		remove_pmd_table(pmd_base, addr, next, direct, altmap);
 		free_pmd_table(pmd_base, pud);
 	}
 	if (direct)
 		update_page_count(MMU_PAGE_1G, -pages);
 }
 
-static void __meminit remove_pagetable(unsigned long start, unsigned long end,
-				       bool direct)
+static void __meminit
+remove_pagetable(unsigned long start, unsigned long end, bool direct,
+		 struct vmem_altmap *altmap)
 {
 	unsigned long addr, next;
 	pud_t *pud_base;
@@ -869,7 +922,7 @@  static void __meminit remove_pagetable(unsigned long start, unsigned long end,
 		}
 
 		pud_base = p4d_pgtable(*p4d);
-		remove_pud_table(pud_base, addr, next, direct);
+		remove_pud_table(pud_base, addr, next, direct, altmap);
 		free_pud_table(pud_base, p4d);
 	}
 
@@ -892,7 +945,7 @@  int __meminit radix__create_section_mapping(unsigned long start,
 
 int __meminit radix__remove_section_mapping(unsigned long start, unsigned long end)
 {
-	remove_pagetable(start, end, true);
+	remove_pagetable(start, end, true, NULL);
 	return 0;
 }
 #endif /* CONFIG_MEMORY_HOTPLUG */
@@ -924,10 +977,198 @@  int __meminit radix__vmemmap_create_mapping(unsigned long start,
 	return 0;
 }
 
+int __meminit vmemmap_check_pmd(pmd_t *pmd, int node,
+				unsigned long addr, unsigned long next)
+{
+	int large = pmd_large(*pmd);
+
+	if (large)
+		vmemmap_verify((pte_t *)pmd, node, addr, next);
+
+	return large;
+}
+
+void __meminit vmemmap_set_pmd(pmd_t *pmdp, void *p, int node,
+			       unsigned long addr, unsigned long next)
+{
+	pte_t entry;
+	pte_t *ptep = pmdp_ptep(pmdp);
+
+	entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
+	set_pte_at(&init_mm, addr, ptep, entry);
+	asm volatile("ptesync": : :"memory");
+
+	vmemmap_verify(ptep, node, addr, next);
+}
+
+static pte_t * __meminit radix__vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node,
+						     struct vmem_altmap *altmap,
+						     struct page *reuse)
+{
+	pte_t *pte = pte_offset_kernel(pmd, addr);
+
+	if (pte_none(*pte)) {
+		pte_t entry;
+		void *p;
+
+		if (!reuse) {
+			p = vmemmap_alloc_block_buf(PAGE_SIZE, node, altmap);
+			if (!p)
+				return NULL;
+		} else {
+			/*
+			 * When a PTE/PMD entry is freed from the init_mm
+			 * there's a free_pages() call to this page allocated
+			 * above. Thus this get_page() is paired with the
+			 * put_page_testzero() on the freeing path.
+			 * This can only be called by certain ZONE_DEVICE paths,
+			 * and through vmemmap_populate_compound_pages() when
+			 * slab is available.
+			 */
+			get_page(reuse);
+			p = page_to_virt(reuse);
+		}
+		entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
+		set_pte_at(&init_mm, addr, pte, entry);
+		asm volatile("ptesync": : :"memory");
+	}
+	return pte;
+}
+
+static inline pud_t *vmemmap_pud_alloc(p4d_t *p4d, int node,
+				       unsigned long address)
+{
+	pud_t *pud;
+
+	/* To keep it simple, do all early vmemmap mappings at PAGE_SIZE */
+	if (unlikely(p4d_none(*p4d))) {
+		if (unlikely(!slab_is_available())) {
+			pud = early_alloc_pgtable(PAGE_SIZE, node, 0, 0);
+			p4d_populate(&init_mm, p4d, pud);
+			/* go to the pud_offset */
+		} else
+			return pud_alloc(&init_mm, p4d, address);
+	}
+	return pud_offset(p4d, address);
+}
+
+static inline pmd_t *vmemmap_pmd_alloc(pud_t *pud, int node,
+				       unsigned long address)
+{
+	pmd_t *pmd;
+
+	/* To keep it simple, do all early vmemmap mappings at PAGE_SIZE */
+	if (unlikely(pud_none(*pud))) {
+		if (unlikely(!slab_is_available())) {
+			pmd = early_alloc_pgtable(PAGE_SIZE, node, 0, 0);
+			pud_populate(&init_mm, pud, pmd);
+		} else
+			return pmd_alloc(&init_mm, pud, address);
+	}
+	return pmd_offset(pud, address);
+}
+
+static inline pte_t *vmemmap_pte_alloc(pmd_t *pmd, int node,
+				       unsigned long address)
+{
+	pte_t *pte;
+
+	/* To keep it simple, do all early vmemmap mappings at PAGE_SIZE */
+	if (unlikely(pmd_none(*pmd))) {
+		if (unlikely(!slab_is_available())) {
+			pte = early_alloc_pgtable(PAGE_SIZE, node, 0, 0);
+			pmd_populate(&init_mm, pmd, pte);
+		} else
+			return pte_alloc_kernel(pmd, address);
+	}
+	return pte_offset_kernel(pmd, address);
+}
+
+
+
+int __meminit radix__vmemmap_populate(unsigned long start, unsigned long end, int node,
+				      struct vmem_altmap *altmap)
+{
+	unsigned long addr;
+	unsigned long next;
+	pgd_t *pgd;
+	p4d_t *p4d;
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *pte;
+
+	for (addr = start; addr < end; addr = next) {
+		next = pmd_addr_end(addr, end);
+
+		pgd = pgd_offset_k(addr);
+		p4d = p4d_offset(pgd, addr);
+		pud = vmemmap_pud_alloc(p4d, node, addr);
+		if (!pud)
+			return -ENOMEM;
+		pmd = vmemmap_pmd_alloc(pud, node, addr);
+		if (!pmd)
+			return -ENOMEM;
+		if (pmd_none(READ_ONCE(*pmd))) {
+			void *p;
+
+			if (altmap && altmap_cross_boundary(altmap, start, PMD_SIZE)) {
+				/* make sure we don't create altmap mappings covering things outside. */
+				goto base_mapping;
+
+			}
+
+			p = vmemmap_alloc_block_buf(PMD_SIZE, node, altmap);
+			if (p) {
+				vmemmap_set_pmd(pmd, p, node, addr, next);
+				continue;
+			} else if (altmap) {
+				/*
+				 * No fallback: In any case we care about, the
+				 * altmap should be reasonably sized and aligned
+				 * such that vmemmap_alloc_block_buf() will always
+				 * succeed. For consistency with the PTE case,
+				 * return an error here as failure could indicate
+				 * a configuration issue with the size of the altmap.
+				 */
+				return -ENOMEM;
+			}
+		} else if (vmemmap_check_pmd(pmd, node, addr, next)) {
+			/*
+			 * If a huge mapping exists due to an early call to
+			 * vmemmap_populate(), let's try to use that.
+			 */
+			continue;
+		}
+base_mapping:
+		/*
+		 * Not able to allocate higher-order memory to back the
+		 * memmap, or we found a pointer to a pte page. Allocate
+		 * base-page-size vmemmap.
+		 */
+		pte = vmemmap_pte_alloc(pmd, node, addr);
+		if (!pte)
+			return -ENOMEM;
+
+		pte = radix__vmemmap_pte_populate(pmd, addr, node, altmap, NULL);
+		if (!pte)
+			return -ENOMEM;
+
+		vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
+		next = addr + PAGE_SIZE;
+	}
+	return 0;
+}
+
 #ifdef CONFIG_MEMORY_HOTPLUG
 void __meminit radix__vmemmap_remove_mapping(unsigned long start, unsigned long page_size)
 {
-	remove_pagetable(start, start + page_size, false);
+	remove_pagetable(start, start + page_size, true, NULL);
+}
+
+void __ref radix__vmemmap_free(unsigned long start, unsigned long end,
+			       struct vmem_altmap *altmap)
+{
+	remove_pagetable(start, end, false, altmap);
 }
 #endif
 #endif
diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index fe1b83020e0d..5701faca39ef 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -92,7 +92,7 @@  static struct page * __meminit vmemmap_subsection_start(unsigned long vmemmap_ad
  * a page table lookup here because with the hash translation we don't keep
  * vmemmap details in linux page table.
  */
-static int __meminit vmemmap_populated(unsigned long vmemmap_addr, int vmemmap_map_size)
+int __meminit vmemmap_populated(unsigned long vmemmap_addr, int vmemmap_map_size)
 {
 	struct page *start;
 	unsigned long vmemmap_end = vmemmap_addr + vmemmap_map_size;
@@ -183,8 +183,8 @@  static __meminit int vmemmap_list_populate(unsigned long phys,
 	return 0;
 }
 
-static bool altmap_cross_boundary(struct vmem_altmap *altmap, unsigned long start,
-				unsigned long page_size)
+bool altmap_cross_boundary(struct vmem_altmap *altmap, unsigned long start,
+			   unsigned long page_size)
 {
 	unsigned long nr_pfn = page_size / sizeof(struct page);
 	unsigned long start_pfn = page_to_pfn((struct page *)start);
@@ -204,6 +204,11 @@  int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
 	bool altmap_alloc;
 	unsigned long page_size = 1 << mmu_psize_defs[mmu_vmemmap_psize].shift;
 
+#ifdef CONFIG_PPC_BOOK3S_64
+	if (radix_enabled())
+		return radix__vmemmap_populate(start, end, node, altmap);
+#endif
+
 	/* Align to the page size of the linear mapping. */
 	start = ALIGN_DOWN(start, page_size);
 
@@ -303,8 +308,8 @@  static unsigned long vmemmap_list_free(unsigned long start)
 	return vmem_back->phys;
 }
 
-void __ref vmemmap_free(unsigned long start, unsigned long end,
-		struct vmem_altmap *altmap)
+void __ref __vmemmap_free(unsigned long start, unsigned long end,
+			  struct vmem_altmap *altmap)
 {
 	unsigned long page_size = 1 << mmu_psize_defs[mmu_vmemmap_psize].shift;
 	unsigned long page_order = get_order(page_size);
@@ -362,6 +367,17 @@  void __ref vmemmap_free(unsigned long start, unsigned long end,
 		vmemmap_remove_mapping(start, page_size);
 	}
 }
+
+void __ref vmemmap_free(unsigned long start, unsigned long end,
+			struct vmem_altmap *altmap)
+{
+#ifdef CONFIG_PPC_BOOK3S_64
+	if (radix_enabled())
+		return radix__vmemmap_free(start, end, altmap);
+#endif
+	return __vmemmap_free(start, end, altmap);
+}
+
 #endif
 void register_page_bootmem_memmap(unsigned long section_nr,
 				  struct page *start_page, unsigned long size)