diff mbox series

[3/4] arm64: separate code and data virtual memory allocation

Message ID 20240220203256.31153-4-mbland@motorola.com (mailing list archive)
State Handled Elsewhere
Headers show
Series arm64: mm: support dynamic vmalloc/pmd configuration | expand

Commit Message

Maxwell Bland Feb. 20, 2024, 8:32 p.m. UTC
Current BPF and kprobe instruction allocation interfaces do not match
the base kernel and intermingle code and data pages within the same
sections. In the case of BPF, this appears to be a result of code
duplication between the kernel's JIT compiler and arm64's JIT.  However,
this is no longer necessary given the possibility of overriding vmalloc
wrapper functions.

arm64's vmalloc_node routines now include a layer of indirection which
splits the vmalloc region into two segments surrounding the middle
module_alloc region determined by ASLR. To support this,
code_region_start and code_region_end are defined to match the 2GB
boundary chosen by the kernel module ASLR initialization routine.

The result is a large benefit to overall kernel security, as code pages
now remain protected by this ASLR routine and protections can be defined
linearly for code regions rather than through PTE-level tracking.

Signed-off-by: Maxwell Bland <mbland@motorola.com>
---
 arch/arm64/include/asm/vmalloc.h   |  3 ++
 arch/arm64/kernel/module.c         |  7 ++++
 arch/arm64/kernel/probes/kprobes.c |  2 +-
 arch/arm64/mm/Makefile             |  3 +-
 arch/arm64/mm/vmalloc.c            | 57 ++++++++++++++++++++++++++++++
 arch/arm64/net/bpf_jit_comp.c      |  5 +--
 6 files changed, 73 insertions(+), 4 deletions(-)
 create mode 100644 arch/arm64/mm/vmalloc.c

Comments

Christophe Leroy Feb. 21, 2024, 7:20 a.m. UTC | #1
Le 20/02/2024 à 21:32, Maxwell Bland a écrit :
> [Vous ne recevez pas souvent de courriers de mbland@motorola.com. Découvrez pourquoi ceci est important à https://aka.ms/LearnAboutSenderIdentification ]
> 
> Current BPF and kprobe instruction allocation interfaces do not match
> the base kernel and intermingle code and data pages within the same
> sections. In the case of BPF, this appears to be a result of code
> duplication between the kernel's JIT compiler and arm64's JIT.  However,
> This is no longer necessary given the possibility of overriding vmalloc
> wrapper functions.

Why do you need to override vmalloc wrapper functions for that ?

See powerpc, for kprobes, alloc_insn_page() uses module_alloc().
On powerpc, the approach is that vmalloc() provides non-exec memory 
while module_alloc() provides executable memory.

Christophe
diff mbox series

Patch

diff --git a/arch/arm64/include/asm/vmalloc.h b/arch/arm64/include/asm/vmalloc.h
index 38fafffe699f..dbcf8ad20265 100644
--- a/arch/arm64/include/asm/vmalloc.h
+++ b/arch/arm64/include/asm/vmalloc.h
@@ -31,4 +31,7 @@  static inline pgprot_t arch_vmap_pgprot_tagged(pgprot_t prot)
 	return pgprot_tagged(prot);
 }
 
+extern unsigned long code_region_start __ro_after_init;
+extern unsigned long code_region_end __ro_after_init;
+
 #endif /* _ASM_ARM64_VMALLOC_H */
diff --git a/arch/arm64/kernel/module.c b/arch/arm64/kernel/module.c
index dd851297596e..c4fe753a71a9 100644
--- a/arch/arm64/kernel/module.c
+++ b/arch/arm64/kernel/module.c
@@ -29,6 +29,10 @@ 
 static u64 module_direct_base __ro_after_init = 0;
 static u64 module_plt_base __ro_after_init = 0;
 
+/* For pre-init vmalloc, assume the worst-case code range */
+unsigned long code_region_start __ro_after_init = (u64) (_end - SZ_2G);
+unsigned long code_region_end __ro_after_init = (u64) (_text + SZ_2G);
+
 /*
  * Choose a random page-aligned base address for a window of 'size' bytes which
  * entirely contains the interval [start, end - 1].
@@ -101,6 +105,9 @@  static int __init module_init_limits(void)
 		module_plt_base = random_bounding_box(SZ_2G, min, max);
 	}
 
+	code_region_start = module_plt_base;
+	code_region_end = module_plt_base + SZ_2G;
+
 	pr_info("%llu pages in range for non-PLT usage",
 		module_direct_base ? (SZ_128M - kernel_size) / PAGE_SIZE : 0);
 	pr_info("%llu pages in range for PLT usage",
diff --git a/arch/arm64/kernel/probes/kprobes.c b/arch/arm64/kernel/probes/kprobes.c
index 70b91a8c6bb3..c9e109d6c8bc 100644
--- a/arch/arm64/kernel/probes/kprobes.c
+++ b/arch/arm64/kernel/probes/kprobes.c
@@ -131,7 +131,7 @@  int __kprobes arch_prepare_kprobe(struct kprobe *p)
 
 void *alloc_insn_page(void)
 {
-	return __vmalloc_node_range(PAGE_SIZE, 1, VMALLOC_START, VMALLOC_END,
+	return __vmalloc_node_range(PAGE_SIZE, 1, code_region_start, code_region_end,
 			GFP_KERNEL, PAGE_KERNEL_ROX, VM_FLUSH_RESET_PERMS,
 			NUMA_NO_NODE, __builtin_return_address(0));
 }
diff --git a/arch/arm64/mm/Makefile b/arch/arm64/mm/Makefile
index dbd1bc95967d..730b805d8388 100644
--- a/arch/arm64/mm/Makefile
+++ b/arch/arm64/mm/Makefile
@@ -2,7 +2,8 @@ 
 obj-y				:= dma-mapping.o extable.o fault.o init.o \
 				   cache.o copypage.o flush.o \
 				   ioremap.o mmap.o pgd.o mmu.o \
-				   context.o proc.o pageattr.o fixmap.o
+				   context.o proc.o pageattr.o fixmap.o \
+				   vmalloc.o
 obj-$(CONFIG_HUGETLB_PAGE)	+= hugetlbpage.o
 obj-$(CONFIG_PTDUMP_CORE)	+= ptdump.o
 obj-$(CONFIG_PTDUMP_DEBUGFS)	+= ptdump_debugfs.o
diff --git a/arch/arm64/mm/vmalloc.c b/arch/arm64/mm/vmalloc.c
new file mode 100644
index 000000000000..b6d2fa841f90
--- /dev/null
+++ b/arch/arm64/mm/vmalloc.c
@@ -0,0 +1,57 @@ 
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/vmalloc.h>
+#include <linux/mm.h>
+
+static void *__vmalloc_node_range_split(unsigned long size, unsigned long align,
+			unsigned long start, unsigned long end,
+			unsigned long exclusion_start, unsigned long exclusion_end, gfp_t gfp_mask,
+			pgprot_t prot, unsigned long vm_flags, int node,
+			const void *caller)
+{
+	void *res = NULL;
+
+	res = __vmalloc_node_range(size, align, start, exclusion_start,
+				gfp_mask, prot, vm_flags, node, caller);
+	if (!res)
+		res = __vmalloc_node_range(size, align, exclusion_end, end,
+				gfp_mask, prot, vm_flags, node, caller);
+
+	return res;
+}
+
+void *__vmalloc_node(unsigned long size, unsigned long align,
+			    gfp_t gfp_mask, unsigned long vm_flags, int node,
+			    const void *caller)
+{
+	return __vmalloc_node_range_split(size, align, VMALLOC_START,
+				VMALLOC_END, code_region_start, code_region_end,
+				gfp_mask, PAGE_KERNEL, vm_flags, node, caller);
+}
+
+void *vmalloc_huge(unsigned long size, gfp_t gfp_mask)
+{
+	return __vmalloc_node_range_split(size, 1, VMALLOC_START, VMALLOC_END,
+				code_region_start, code_region_end,
+				gfp_mask, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP,
+				NUMA_NO_NODE, __builtin_return_address(0));
+}
+
+void *vmalloc_user(unsigned long size)
+{
+	return __vmalloc_node_range_split(size, SHMLBA,  VMALLOC_START, VMALLOC_END,
+				code_region_start, code_region_end,
+				GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL,
+				VM_USERMAP, NUMA_NO_NODE,
+				__builtin_return_address(0));
+}
+
+void *vmalloc_32_user(unsigned long size)
+{
+	return __vmalloc_node_range_split(size, SHMLBA,  VMALLOC_START, VMALLOC_END,
+				code_region_start, code_region_end,
+				GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL,
+				VM_USERMAP, NUMA_NO_NODE,
+				__builtin_return_address(0));
+}
+
diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c
index 8955da5c47cf..40426f3a9bdf 100644
--- a/arch/arm64/net/bpf_jit_comp.c
+++ b/arch/arm64/net/bpf_jit_comp.c
@@ -13,6 +13,7 @@ 
 #include <linux/memory.h>
 #include <linux/printk.h>
 #include <linux/slab.h>
+#include <linux/moduleloader.h>
 
 #include <asm/asm-extable.h>
 #include <asm/byteorder.h>
@@ -1690,12 +1691,12 @@  u64 bpf_jit_alloc_exec_limit(void)
 void *bpf_jit_alloc_exec(unsigned long size)
 {
 	/* Memory is intended to be executable, reset the pointer tag. */
-	return kasan_reset_tag(vmalloc(size));
+	return kasan_reset_tag(module_alloc(size));
 }
 
 void bpf_jit_free_exec(void *addr)
 {
-	return vfree(addr);
+	return module_memfree(addr);
 }
 
 /* Indicate the JIT backend supports mixing bpf2bpf and tailcalls. */