diff mbox series

[v11,7/9] arm64/kasan: add and use kasan_map_populate()

Message ID 20171009221931.1481-8-pasha.tatashin@oracle.com
State Not Applicable
Delegated to: David Miller
Headers show
Series complete deferred page initialization | expand

Commit Message

Pavel Tatashin Oct. 9, 2017, 10:19 p.m. UTC
During early boot, kasan uses vmemmap_populate() to establish its shadow
memory. But, that interface is intended for struct pages use.

Because of the current project, vmemmap won't be zeroed during allocation,
but kasan expects that memory to be zeroed. We are adding a new
kasan_map_populate() function to resolve this difference.

Therefore, we must use a new interface to allocate and map kasan shadow
memory that also zeroes memory for us.

Signed-off-by: Pavel Tatashin <pasha.tatashin@oracle.com>
---
 arch/arm64/mm/kasan_init.c | 72 ++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 66 insertions(+), 6 deletions(-)

Comments

Will Deacon Oct. 10, 2017, 3:56 p.m. UTC | #1
Hi Pavel,

On Mon, Oct 09, 2017 at 06:19:29PM -0400, Pavel Tatashin wrote:
> During early boot, kasan uses vmemmap_populate() to establish its shadow
> memory. But, that interface is intended for struct pages use.
> 
> Because of the current project, vmemmap won't be zeroed during allocation,
> but kasan expects that memory to be zeroed. We are adding a new
> kasan_map_populate() function to resolve this difference.
> 
> Therefore, we must use a new interface to allocate and map kasan shadow
> memory that also zeroes memory for us.
> 
> Signed-off-by: Pavel Tatashin <pasha.tatashin@oracle.com>
> ---
>  arch/arm64/mm/kasan_init.c | 72 ++++++++++++++++++++++++++++++++++++++++++----
>  1 file changed, 66 insertions(+), 6 deletions(-)

Thanks for doing this, although I still think we can do better and avoid the
additional walking code altogether, as well as removing the dependence on
vmemmap. Rather than keep messing you about here (sorry about that), I've
written an arm64 patch for you to take on top of this series. Please take
a look below.

Cheers,

Will

--->8

From 36c6c7c06273d08348b47c1a182116b0a1df8363 Mon Sep 17 00:00:00 2001
From: Will Deacon <will.deacon@arm.com>
Date: Tue, 10 Oct 2017 15:49:43 +0100
Subject: [PATCH] arm64: kasan: Avoid using vmemmap_populate to initialise
 shadow

The kasan shadow is currently mapped using vmemmap_populate since that
provides a semi-convenient way to map pages into swapper. However, since
that no longer zeroes the mapped pages, it is not suitable for kasan,
which requires that the shadow is zeroed in order to avoid false
positives.

This patch removes our reliance on vmemmap_populate and reuses the
existing kasan page table code, which is already required for creating
the early shadow.

Signed-off-by: Will Deacon <will.deacon@arm.com>
---
 arch/arm64/Kconfig         |   2 +-
 arch/arm64/mm/kasan_init.c | 176 +++++++++++++++++++--------------------------
 2 files changed, 74 insertions(+), 104 deletions(-)

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 0df64a6a56d4..888580b9036e 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -68,7 +68,7 @@ config ARM64
 	select HAVE_ARCH_BITREVERSE
 	select HAVE_ARCH_HUGE_VMAP
 	select HAVE_ARCH_JUMP_LABEL
-	select HAVE_ARCH_KASAN if SPARSEMEM_VMEMMAP && !(ARM64_16K_PAGES && ARM64_VA_BITS_48)
+	select HAVE_ARCH_KASAN if !(ARM64_16K_PAGES && ARM64_VA_BITS_48)
 	select HAVE_ARCH_KGDB
 	select HAVE_ARCH_MMAP_RND_BITS
 	select HAVE_ARCH_MMAP_RND_COMPAT_BITS if COMPAT
diff --git a/arch/arm64/mm/kasan_init.c b/arch/arm64/mm/kasan_init.c
index cb4af2951c90..b922826d9908 100644
--- a/arch/arm64/mm/kasan_init.c
+++ b/arch/arm64/mm/kasan_init.c
@@ -11,6 +11,7 @@
  */
 
 #define pr_fmt(fmt) "kasan: " fmt
+#include <linux/bootmem.h>
 #include <linux/kasan.h>
 #include <linux/kernel.h>
 #include <linux/sched/task.h>
@@ -28,66 +29,6 @@
 
 static pgd_t tmp_pg_dir[PTRS_PER_PGD] __initdata __aligned(PGD_SIZE);
 
-/* Creates mappings for kasan during early boot. The mapped memory is zeroed */
-static int __meminit kasan_map_populate(unsigned long start, unsigned long end,
-					int node)
-{
-	unsigned long addr, pfn, next;
-	unsigned long long size;
-	pgd_t *pgd;
-	pud_t *pud;
-	pmd_t *pmd;
-	pte_t *pte;
-	int ret;
-
-	ret = vmemmap_populate(start, end, node);
-	/*
-	 * We might have partially populated memory, so check for no entries,
-	 * and zero only those that actually exist.
-	 */
-	for (addr = start; addr < end; addr = next) {
-		pgd = pgd_offset_k(addr);
-		if (pgd_none(*pgd)) {
-			next = pgd_addr_end(addr, end);
-			continue;
-		}
-
-		pud = pud_offset(pgd, addr);
-		if (pud_none(*pud)) {
-			next = pud_addr_end(addr, end);
-			continue;
-		}
-		if (pud_sect(*pud)) {
-			/* This is PUD size page */
-			next = pud_addr_end(addr, end);
-			size = PUD_SIZE;
-			pfn = pud_pfn(*pud);
-		} else {
-			pmd = pmd_offset(pud, addr);
-			if (pmd_none(*pmd)) {
-				next = pmd_addr_end(addr, end);
-				continue;
-			}
-			if (pmd_sect(*pmd)) {
-				/* This is PMD size page */
-				next = pmd_addr_end(addr, end);
-				size = PMD_SIZE;
-				pfn = pmd_pfn(*pmd);
-			} else {
-				pte = pte_offset_kernel(pmd, addr);
-				next = addr + PAGE_SIZE;
-				if (pte_none(*pte))
-					continue;
-				/* This is base size page */
-				size = PAGE_SIZE;
-				pfn = pte_pfn(*pte);
-			}
-		}
-		memset(phys_to_virt(PFN_PHYS(pfn)), 0, size);
-	}
-	return ret;
-}
-
 /*
  * The p*d_populate functions call virt_to_phys implicitly so they can't be used
  * directly on kernel symbols (bm_p*d). All the early functions are called too
@@ -95,77 +36,117 @@ static int __meminit kasan_map_populate(unsigned long start, unsigned long end,
  * with the physical address from __pa_symbol.
  */
 
-static void __init kasan_early_pte_populate(pmd_t *pmd, unsigned long addr,
-					unsigned long end)
+static phys_addr_t __init kasan_alloc_zeroed_page(int node)
 {
-	pte_t *pte;
-	unsigned long next;
+	void *p = memblock_virt_alloc_try_nid(PAGE_SIZE, PAGE_SIZE,
+					      __pa(MAX_DMA_ADDRESS),
+					      MEMBLOCK_ALLOC_ACCESSIBLE, node);
+	return __pa(p);
+}
 
-	if (pmd_none(*pmd))
-		__pmd_populate(pmd, __pa_symbol(kasan_zero_pte), PMD_TYPE_TABLE);
+static pte_t *__init kasan_pte_offset(pmd_t *pmd, unsigned long addr, int node,
+				      bool early)
+{
+	if (pmd_none(*pmd)) {
+		phys_addr_t pte_phys = early ? __pa_symbol(kasan_zero_pte)
+					     : kasan_alloc_zeroed_page(node);
+		__pmd_populate(pmd, pte_phys, PMD_TYPE_TABLE);
+	}
+
+	return early ? pte_offset_kimg(pmd, addr)
+		     : pte_offset_kernel(pmd, addr);
+}
+
+static pmd_t *__init kasan_pmd_offset(pud_t *pud, unsigned long addr, int node,
+				      bool early)
+{
+	if (pud_none(*pud)) {
+		phys_addr_t pmd_phys = early ? __pa_symbol(kasan_zero_pmd)
+					     : kasan_alloc_zeroed_page(node);
+		__pud_populate(pud, pmd_phys, PMD_TYPE_TABLE);
+	}
+
+	return early ? pmd_offset_kimg(pud, addr) : pmd_offset(pud, addr);
+}
+
+static pud_t *__init kasan_pud_offset(pgd_t *pgd, unsigned long addr, int node,
+				      bool early)
+{
+	if (pgd_none(*pgd)) {
+		phys_addr_t pud_phys = early ? __pa_symbol(kasan_zero_pud)
+					     : kasan_alloc_zeroed_page(node);
+		__pgd_populate(pgd, pud_phys, PMD_TYPE_TABLE);
+	}
+
+	return early ? pud_offset_kimg(pgd, addr) : pud_offset(pgd, addr);
+}
+
+static void __init kasan_pte_populate(pmd_t *pmd, unsigned long addr,
+				      unsigned long end, int node, bool early)
+{
+	unsigned long next;
+	pte_t *pte = kasan_pte_offset(pmd, addr, node, early);
 
-	pte = pte_offset_kimg(pmd, addr);
 	do {
+		phys_addr_t page_phys = early ? __pa_symbol(kasan_zero_page)
+					      : kasan_alloc_zeroed_page(node);
 		next = addr + PAGE_SIZE;
-		set_pte(pte, pfn_pte(sym_to_pfn(kasan_zero_page),
-					PAGE_KERNEL));
+		set_pte(pte, pfn_pte(__phys_to_pfn(page_phys), PAGE_KERNEL));
 	} while (pte++, addr = next, addr != end && pte_none(*pte));
 }
 
-static void __init kasan_early_pmd_populate(pud_t *pud,
-					unsigned long addr,
-					unsigned long end)
+static void __init kasan_pmd_populate(pud_t *pud, unsigned long addr,
+				      unsigned long end, int node, bool early)
 {
-	pmd_t *pmd;
 	unsigned long next;
+	pmd_t *pmd = kasan_pmd_offset(pud, addr, node, early);
 
-	if (pud_none(*pud))
-		__pud_populate(pud, __pa_symbol(kasan_zero_pmd), PMD_TYPE_TABLE);
-
-	pmd = pmd_offset_kimg(pud, addr);
 	do {
 		next = pmd_addr_end(addr, end);
-		kasan_early_pte_populate(pmd, addr, next);
+		kasan_pte_populate(pmd, addr, next, node, early);
 	} while (pmd++, addr = next, addr != end && pmd_none(*pmd));
 }
 
-static void __init kasan_early_pud_populate(pgd_t *pgd,
-					unsigned long addr,
-					unsigned long end)
+static void __init kasan_pud_populate(pgd_t *pgd, unsigned long addr,
+				      unsigned long end, int node, bool early)
 {
-	pud_t *pud;
 	unsigned long next;
+	pud_t *pud = kasan_pud_offset(pgd, addr, node, early);
 
-	if (pgd_none(*pgd))
-		__pgd_populate(pgd, __pa_symbol(kasan_zero_pud), PUD_TYPE_TABLE);
-
-	pud = pud_offset_kimg(pgd, addr);
 	do {
 		next = pud_addr_end(addr, end);
-		kasan_early_pmd_populate(pud, addr, next);
+		kasan_pmd_populate(pud, addr, next, node, early);
 	} while (pud++, addr = next, addr != end && pud_none(*pud));
 }
 
-static void __init kasan_map_early_shadow(void)
+static void __init kasan_pgd_populate(unsigned long addr, unsigned long end,
+				      int node, bool early)
 {
-	unsigned long addr = KASAN_SHADOW_START;
-	unsigned long end = KASAN_SHADOW_END;
 	unsigned long next;
 	pgd_t *pgd;
 
 	pgd = pgd_offset_k(addr);
 	do {
 		next = pgd_addr_end(addr, end);
-		kasan_early_pud_populate(pgd, addr, next);
+		kasan_pud_populate(pgd, addr, next, node, early);
 	} while (pgd++, addr = next, addr != end);
 }
 
+/* The early shadow maps everything to a single page of zeroes */
 asmlinkage void __init kasan_early_init(void)
 {
 	BUILD_BUG_ON(KASAN_SHADOW_OFFSET != KASAN_SHADOW_END - (1UL << 61));
 	BUILD_BUG_ON(!IS_ALIGNED(KASAN_SHADOW_START, PGDIR_SIZE));
 	BUILD_BUG_ON(!IS_ALIGNED(KASAN_SHADOW_END, PGDIR_SIZE));
-	kasan_map_early_shadow();
+	kasan_pgd_populate(KASAN_SHADOW_START, KASAN_SHADOW_END, NUMA_NO_NODE,
+			   true);
+}
+
+/* Set up full kasan mappings, ensuring that the mapped pages are zeroed */
+static void __init kasan_map_populate(unsigned long start, unsigned long end,
+				      int node)
+{
+	kasan_pgd_populate(start & PAGE_MASK, PAGE_ALIGN(end), node, false);
 }
 
 /*
@@ -224,17 +205,6 @@ void __init kasan_init(void)
 	kasan_map_populate(kimg_shadow_start, kimg_shadow_end,
 			   pfn_to_nid(virt_to_pfn(lm_alias(_text))));
 
-	/*
-	 * kasan_map_populate() has populated the shadow region that covers the
-	 * kernel image with SWAPPER_BLOCK_SIZE mappings, so we have to round
-	 * the start and end addresses to SWAPPER_BLOCK_SIZE as well, to prevent
-	 * kasan_populate_zero_shadow() from replacing the page table entries
-	 * (PMD or PTE) at the edges of the shadow region for the kernel
-	 * image.
-	 */
-	kimg_shadow_start = round_down(kimg_shadow_start, SWAPPER_BLOCK_SIZE);
-	kimg_shadow_end = round_up(kimg_shadow_end, SWAPPER_BLOCK_SIZE);
-
 	kasan_populate_zero_shadow((void *)KASAN_SHADOW_START,
 				   (void *)mod_shadow_start);
 	kasan_populate_zero_shadow((void *)kimg_shadow_end,
Pavel Tatashin Oct. 10, 2017, 5:07 p.m. UTC | #2
Hi Will,

Thank you for doing this work. How would you like to proceed?

- If you are OK with my series being accepted as-is, so your patch can be
added later on top, I think, I need an ack from you for the kasan changes.
- Otherwise, I can replace: 4267aaf1d279 arm64/kasan: add and use
kasan_map_populate() in my series with code from your patch.

Thank you,
Pavel
--
To unsubscribe from this list: send the line "unsubscribe sparclinux" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Will Deacon Oct. 10, 2017, 5:10 p.m. UTC | #3
Hi Pavel,

On Tue, Oct 10, 2017 at 01:07:35PM -0400, Pavel Tatashin wrote:
> Thank you for doing this work. How would you like to proceed?
> 
> - If you are OK with my series being accepted as-is, so your patch can be
> added later on top, I think, I need an ack from you for the kasan changes.
> - Otherwise, I can replace: 4267aaf1d279 arm64/kasan: add and use
> kasan_map_populate() in my series with code from your patch.

I was thinking that you could just add my patch to the end of your series
and have the whole lot go up like that. If you want to merge it with your
patch, I'm fine with that too.

Will
--
To unsubscribe from this list: send the line "unsubscribe sparclinux" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Pavel Tatashin Oct. 10, 2017, 5:41 p.m. UTC | #4
Hi Will,

Ok, I will add your patch at the end of my series.

Thank you,
Pavel

>
> I was thinking that you could just add my patch to the end of your series
> and have the whole lot go up like that. If you want to merge it with your
> patch, I'm fine with that too.
>
> Will
>
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majordomo@kvack.org.  For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
--
To unsubscribe from this list: send the line "unsubscribe sparclinux" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Pavel Tatashin Oct. 13, 2017, 2:10 p.m. UTC | #5
Hi Will,

I have a couple concerns about your patch:

One of the reasons (and actually, the main reason) why I preferred to
keep vmemmap_populate() instead of implementing kasan's own variant,
which btw can be done in common code similarly to
vmemmap_populate_basepages() is that vmemmap_populate() uses large
pages when available. I think it is a considerable downgrade to go
back to base pages, when we already have large page support available
to us.

The kasan shadow tree is large, it is up-to 1/8th of system memory, so
even on moderate size servers, shadow tree is going to be multiple
gigabytes.

The second concern is that there is an existing bug associated with
your patch that I am not sure how to solve:

Try building your patch with CONFIG_DEBUG_VM. This config makes
memblock_virt_alloc_try_nid_raw() do memset(0xff) on all allocated
memory.

I am getting the following panic during boot:

[    0.012637] pid_max: default: 32768 minimum: 301
[    0.016037] Security Framework initialized
[    0.018389] Dentry cache hash table entries: 16384 (order: 5, 131072 bytes)
[    0.019559] Inode-cache hash table entries: 8192 (order: 4, 65536 bytes)
[    0.020409] Mount-cache hash table entries: 512 (order: 0, 4096 bytes)
[    0.020721] Mountpoint-cache hash table entries: 512 (order: 0, 4096 bytes)
[    0.055337] Unable to handle kernel paging request at virtual
address ffff0400010065af
[    0.055422] Mem abort info:
[    0.055518]   Exception class = DABT (current EL), IL = 32 bits
[    0.055579]   SET = 0, FnV = 0
[    0.055640]   EA = 0, S1PTW = 0
[    0.055699] Data abort info:
[    0.055762]   ISV = 0, ISS = 0x00000007
[    0.055822]   CM = 0, WnR = 0
[    0.055966] swapper pgtable: 4k pages, 48-bit VAs, pgd = ffff20000a8f4000
[    0.056047] [ffff0400010065af] *pgd=0000000046fe7003,
*pud=0000000046fe6003, *pmd=0000000046fe5003, *pte=0000000000000000
[    0.056436] Internal error: Oops: 96000007 [#1] PREEMPT SMP
[    0.056701] Modules linked in:
[    0.056939] CPU: 0 PID: 0 Comm: swapper/0 Not tainted
4.14.0-rc4_pt_memset12-00096-gfca5985f860e-dirty #16
[    0.057001] Hardware name: linux,dummy-virt (DT)
[    0.057084] task: ffff2000099d9000 task.stack: ffff2000099c0000
[    0.057275] PC is at __asan_load8+0x34/0xb0
[    0.057375] LR is at __d_rehash+0xf0/0x240
[    0.057460] pc : [<ffff200008317d7c>] lr : [<ffff20000837e168>]
pstate: 60000045
[    0.057522] sp : ffff2000099c6a60
[    0.057590] x29: ffff2000099c6a60 x28: ffff2000099d9010
[    0.057733] x27: 0000000000000004 x26: ffff200008031000
[    0.057846] x25: ffff2000099d9000 x24: ffff800003c06410
---Type <return> to continue, or q <return> to quit---
[    0.057954] x23: 00000000000003af x22: ffff800003c06400
[    0.058065] x21: 1fffe40001338d5a x20: ffff200008032d78
[    0.058175] x19: ffff800003c06408 x18: 0000000000000000
[    0.058311] x17: 0000000000000009 x16: 0000000000007fff
[    0.058417] x15: 000000000000002a x14: ffff2000080ef374
[    0.058528] x13: ffff200008126648 x12: ffff200008411a7c
[    0.058638] x11: ffff200008392358 x10: ffff200008392184
[    0.058770] x9 : ffff20000835aad8 x8 : ffff200009850e90
[    0.058883] x7 : ffff20000904b23c x6 : 00000000f2f2f200
[    0.058990] x5 : 0000000000000000 x4 : ffff200008032d78
[    0.059097] x3 : 0000000000000000 x2 : dfff200000000000
[    0.059206] x1 : 0000000000000007 x0 : 1fffe400010065af
[    0.059372] Process swapper/0 (pid: 0, stack limit = 0xffff2000099c0000)
[    0.059442] Call trace:
[    0.059603] Exception stack(0xffff2000099c6920 to 0xffff2000099c6a60)
[    0.059771] 6920: 1fffe400010065af 0000000000000007
dfff200000000000 0000000000000000
[    0.059877] 6940: ffff200008032d78 0000000000000000
00000000f2f2f200 ffff20000904b23c
[    0.059973] 6960: ffff200009850e90 ffff20000835aad8
ffff200008392184 ffff200008392358
[    0.060066] 6980: ffff200008411a7c ffff200008126648
ffff2000080ef374 000000000000002a
[    0.060154] 69a0: 0000000000007fff 0000000000000009
0000000000000000 ffff800003c06408
[    0.060246] 69c0: ffff200008032d78 1fffe40001338d5a
ffff800003c06400 00000000000003af
[    0.060338] 69e0: ffff800003c06410 ffff2000099d9000
ffff200008031000 0000000000000004
[    0.060432] 6a00: ffff2000099d9010 ffff2000099c6a60
ffff20000837e168 ffff2000099c6a60
[    0.060525] 6a20: ffff200008317d7c 0000000060000045
ffff200008392358 ffff200008411a7c
[    0.060620] 6a40: ffffffffffffffff ffff2000080ef374
ffff2000099c6a60 ffff200008317d7c
[    0.060762] [<ffff200008317d7c>] __asan_load8+0x34/0xb0
[    0.060856] [<ffff20000837e168>] __d_rehash+0xf0/0x240
[    0.060944] [<ffff20000837fb80>] d_add+0x288/0x3f0
[    0.061041] [<ffff200008420db8>] proc_setup_self+0x110/0x198
[    0.061139] [<ffff200008411594>] proc_fill_super+0x13c/0x198
[    0.061234] [<ffff200008359648>] mount_ns+0x98/0x148
[    0.061328] [<ffff2000084116ac>] proc_mount+0x5c/0x70
[    0.061422] [<ffff20000835aad8>] mount_fs+0x50/0x1a8
[    0.061515] [<ffff200008392184>] vfs_kern_mount.part.7+0x9c/0x218
[    0.061602] [<ffff200008392358>] kern_mount_data+0x38/0x70
[    0.061699] [<ffff200008411a7c>] pid_ns_prepare_proc+0x24/0x50
[    0.061796] [<ffff200008126648>] alloc_pid+0x6e8/0x730
[    0.061891] [<ffff2000080ef374>] copy_process.isra.6.part.7+0x11cc/0x2cb8
[    0.061978] [<ffff2000080f1104>] _do_fork+0x14c/0x4c0
[    0.062065] [<ffff2000080f14c0>] kernel_thread+0x30/0x38
[    0.062156] [<ffff20000904b23c>] rest_init+0x34/0x108
[    0.062260] [<ffff200009850e90>] start_kernel+0x45c/0x48c
[    0.062458] Code: 540001e1 d343fc00 d2c40002 f2fbffe2 (38e26800)
[    0.063559] ---[ end trace 390c5d4fc6641888 ]---
[    0.064164] Kernel panic - not syncing: Attempted to kill the idle task!
[    0.064438] ---[ end Kernel panic - not syncing: Attempted to kill
the idle task!


So, I've been trying to root cause it, and here is what I've got:

First, I went back to my version of kasan_map_populate() and replaced
vmemmap_populate() with vmemmap_populate_basepages(), which
behavior-vise made it very similar to your patch. After doing this I
got the same panic. So, I figured there must be something to do with
the differences that regular vmemmap allocated with granularity of
SWAPPER_BLOCK_SIZE while kasan with granularity of PAGE_SIZE.

So, I made the following modification to your patch:

static void __init kasan_map_populate(unsigned long start, unsigned long end,
                                      int node)
{
+        start = round_down(start, SWAPPER_BLOCK_SIZE);
+       end = round_up(end, SWAPPER_BLOCK_SIZE);
        kasan_pgd_populate(start & PAGE_MASK, PAGE_ALIGN(end), node, false);
}

This basically makes the shadow tree ranges SWAPPER_BLOCK_SIZE
aligned. After this modification everything is working.  However, I
am not sure if this is a proper fix.

I feel, this patch requires more work, and I am troubled with using
base pages instead of large pages.

Thank you,
Pavel

On Tue, Oct 10, 2017 at 1:41 PM, Pavel Tatashin
<pasha.tatashin@oracle.com> wrote:
> Hi Will,
>
> Ok, I will add your patch at the end of my series.
>
> Thank you,
> Pavel
>
>>
>> I was thinking that you could just add my patch to the end of your series
>> and have the whole lot go up like that. If you want to merge it with your
>> patch, I'm fine with that too.
>>
>> Will
>>
>> --
>> To unsubscribe, send a message with 'unsubscribe linux-mm' in
>> the body to majordomo@kvack.org.  For more info on Linux MM,
>> see: http://www.linux-mm.org/ .
>> Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
--
To unsubscribe from this list: send the line "unsubscribe sparclinux" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Will Deacon Oct. 13, 2017, 2:43 p.m. UTC | #6
Hi Pavel,

On Fri, Oct 13, 2017 at 10:10:09AM -0400, Pavel Tatashin wrote:
> I have a couple concerns about your patch:
> 
> One of the reasons (and actually, the main reason) why I preferred to
> keep vmemmap_populate() instead of implementing kasan's own variant,
> which btw can be done in common code similarly to
> vmemmap_populate_basepages() is that vmemmap_populate() uses large
> pages when available. I think it is a considerable downgrade to go
> back to base pages, when we already have large page support available
> to us.

It shouldn't be difficult to use section mappings with my patch, I just
don't really see the need to try to optimise TLB pressure when you're
running with KASAN enabled which already has something like a 3x slowdown
afaik. If it ends up being a big deal, we can always do that later, but
my main aim here is to divorce kasan from vmemmap because they should be
completely unrelated.

> The kasan shadow tree is large, it is up-to 1/8th of system memory, so
> even on moderate size servers, shadow tree is going to be multiple
> gigabytes.
> 
> The second concern is that there is an existing bug associated with
> your patch that I am not sure how to solve:
> 
> Try building your patch with CONFIG_DEBUG_VM. This config makes
> memblock_virt_alloc_try_nid_raw() do memset(0xff) on all allocated
> memory.
> 
> I am getting the following panic during boot:
> 
> [    0.012637] pid_max: default: 32768 minimum: 301
> [    0.016037] Security Framework initialized
> [    0.018389] Dentry cache hash table entries: 16384 (order: 5, 131072 bytes)
> [    0.019559] Inode-cache hash table entries: 8192 (order: 4, 65536 bytes)
> [    0.020409] Mount-cache hash table entries: 512 (order: 0, 4096 bytes)
> [    0.020721] Mountpoint-cache hash table entries: 512 (order: 0, 4096 bytes)
> [    0.055337] Unable to handle kernel paging request at virtual
> address ffff0400010065af
> [    0.055422] Mem abort info:
> [    0.055518]   Exception class = DABT (current EL), IL = 32 bits
> [    0.055579]   SET = 0, FnV = 0
> [    0.055640]   EA = 0, S1PTW = 0
> [    0.055699] Data abort info:
> [    0.055762]   ISV = 0, ISS = 0x00000007
> [    0.055822]   CM = 0, WnR = 0
> [    0.055966] swapper pgtable: 4k pages, 48-bit VAs, pgd = ffff20000a8f4000
> [    0.056047] [ffff0400010065af] *pgd=0000000046fe7003,
> *pud=0000000046fe6003, *pmd=0000000046fe5003, *pte=0000000000000000
> [    0.056436] Internal error: Oops: 96000007 [#1] PREEMPT SMP
> [    0.056701] Modules linked in:
> [    0.056939] CPU: 0 PID: 0 Comm: swapper/0 Not tainted
> 4.14.0-rc4_pt_memset12-00096-gfca5985f860e-dirty #16
> [    0.057001] Hardware name: linux,dummy-virt (DT)
> [    0.057084] task: ffff2000099d9000 task.stack: ffff2000099c0000
> [    0.057275] PC is at __asan_load8+0x34/0xb0
> [    0.057375] LR is at __d_rehash+0xf0/0x240

[...]

> So, I've been trying to root cause it, and here is what I've got:
> 
> First, I went back to my version of kasan_map_populate() and replaced
> vmemmap_populate() with vmemmap_populate_basepages(), which
> behavior-vise made it very similar to your patch. After doing this I
> got the same panic. So, I figured there must be something to do with
> the differences that regular vmemmap allocated with granularity of
> SWAPPER_BLOCK_SIZE while kasan with granularity of PAGE_SIZE.
> 
> So, I made the following modification to your patch:
> 
> static void __init kasan_map_populate(unsigned long start, unsigned long end,
>                                       int node)
> {
> +        start = round_down(start, SWAPPER_BLOCK_SIZE);
> +       end = round_up(end, SWAPPER_BLOCK_SIZE);
>         kasan_pgd_populate(start & PAGE_MASK, PAGE_ALIGN(end), node, false);
> }
> 
> This basically makes the shadow tree ranges SWAPPER_BLOCK_SIZE
> aligned. After this modification everything is working.  However, I
> am not sure if this is a proper fix.

This certainly doesn't sound right; mapping the shadow with pages shouldn't
lead to problems. I also can't seem to reproduce this myself -- could you
share your full .config and a pointer to the git tree that you're using,
please?

> I feel, this patch requires more work, and I am troubled with using
> base pages instead of large pages.

I'm happy to try fixing this, because I think splitting up kasan and vmemmap
is the right thing to do here.

Will
--
To unsubscribe from this list: send the line "unsubscribe sparclinux" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Mark Rutland Oct. 13, 2017, 2:56 p.m. UTC | #7
Hi,

On Fri, Oct 13, 2017 at 03:43:19PM +0100, Will Deacon wrote:
> On Fri, Oct 13, 2017 at 10:10:09AM -0400, Pavel Tatashin wrote:
> > I am getting the following panic during boot:
> > 
> > [    0.012637] pid_max: default: 32768 minimum: 301
> > [    0.016037] Security Framework initialized
> > [    0.018389] Dentry cache hash table entries: 16384 (order: 5, 131072 bytes)
> > [    0.019559] Inode-cache hash table entries: 8192 (order: 4, 65536 bytes)
> > [    0.020409] Mount-cache hash table entries: 512 (order: 0, 4096 bytes)
> > [    0.020721] Mountpoint-cache hash table entries: 512 (order: 0, 4096 bytes)
> > [    0.055337] Unable to handle kernel paging request at virtual
> > address ffff0400010065af
> > [    0.055422] Mem abort info:
> > [    0.055518]   Exception class = DABT (current EL), IL = 32 bits
> > [    0.055579]   SET = 0, FnV = 0
> > [    0.055640]   EA = 0, S1PTW = 0
> > [    0.055699] Data abort info:
> > [    0.055762]   ISV = 0, ISS = 0x00000007
> > [    0.055822]   CM = 0, WnR = 0
> > [    0.055966] swapper pgtable: 4k pages, 48-bit VAs, pgd = ffff20000a8f4000
> > [    0.056047] [ffff0400010065af] *pgd=0000000046fe7003,
> > *pud=0000000046fe6003, *pmd=0000000046fe5003, *pte=0000000000000000
> > [    0.056436] Internal error: Oops: 96000007 [#1] PREEMPT SMP
> > [    0.056701] Modules linked in:
> > [    0.056939] CPU: 0 PID: 0 Comm: swapper/0 Not tainted
> > 4.14.0-rc4_pt_memset12-00096-gfca5985f860e-dirty #16
> > [    0.057001] Hardware name: linux,dummy-virt (DT)
> > [    0.057084] task: ffff2000099d9000 task.stack: ffff2000099c0000
> > [    0.057275] PC is at __asan_load8+0x34/0xb0
> > [    0.057375] LR is at __d_rehash+0xf0/0x240

Do you know what your physical memory layout looks like? 

Knowing that would tell us where shadow memory *should* be.

Can you share the command line you're using the launch the VM?

Thanks,
Mark.
--
To unsubscribe from this list: send the line "unsubscribe sparclinux" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Pavel Tatashin Oct. 13, 2017, 3:02 p.m. UTC | #8
> Do you know what your physical memory layout looks like?

[    0.000000] Memory: 34960K/131072K available (16316K kernel code,
6716K rwdata, 7996K rodata, 1472K init, 8837K bss, 79728K reserved,
16384K cma-reserved)
[    0.000000] Virtual kernel memory layout:
[    0.000000]     kasan   : 0xffff000000000000 - 0xffff200000000000
( 32768 GB)
[    0.000000]     modules : 0xffff200000000000 - 0xffff200008000000
(   128 MB)
[    0.000000]     vmalloc : 0xffff200008000000 - 0xffff7dffbfff0000
( 96254 GB)
[    0.000000]       .text : 0xffff200008080000 - 0xffff200009070000
( 16320 KB)
[    0.000000]     .rodata : 0xffff200009070000 - 0xffff200009850000
(  8064 KB)
[    0.000000]       .init : 0xffff200009850000 - 0xffff2000099c0000
(  1472 KB)
[    0.000000]       .data : 0xffff2000099c0000 - 0xffff20000a04f200
(  6717 KB)
[    0.000000]        .bss : 0xffff20000a04f200 - 0xffff20000a8f09e0
(  8838 KB)
[    0.000000]     fixed   : 0xffff7dfffe7fd000 - 0xffff7dfffec00000
(  4108 KB)
[    0.000000]     PCI I/O : 0xffff7dfffee00000 - 0xffff7dffffe00000
(    16 MB)
[    0.000000]     vmemmap : 0xffff7e0000000000 - 0xffff800000000000
(  2048 GB maximum)
[    0.000000]               0xffff7e0000000000 - 0xffff7e0000200000
(     2 MB actual)
[    0.000000]     memory  : 0xffff800000000000 - 0xffff800008000000
(   128 MB)

>
> Knowing that would tell us where shadow memory *should* be.
>
> Can you share the command line you're using the launch the VM?
>

virtme-run --kdir . --arch aarch64 --qemu-opts -s -S

and get messages from connected gdb session via lx-dmesg command.

The actual qemu arguments are these:

qemu-system-aarch64 -fsdev
local,id=virtfs1,path=/,security_model=none,readonly -device
virtio-9p-device,fsdev=virtfs1,mount_tag=/dev/root -fsdev
local,id=virtfs5,path=/usr/share/virtme-guest-0,security_model=none,readonly
-device virtio-9p-device,fsdev=virtfs5,mount_tag=virtme.guesttools -M
virt -cpu cortex-a57 -parallel none -net none -echr 1 -serial none
-chardev stdio,id=console,signal=off,mux=on -serial chardev:console
-mon chardev=console -vga none -display none -kernel
./arch/arm64/boot/Image -append 'earlyprintk=serial,ttyAMA0,115200
console=ttyAMA0 psmouse.proto=exps "virtme_stty_con=rows 57 cols 105
iutf8" TERM=screen-256color-bce rootfstype=9p
rootflags=version=9p2000.L,trans=virtio,access=any raid=noautodetect
ro init=/bin/sh -- -c "mount -t tmpfs run /run;mkdir -p
/run/virtme/guesttools;/bin/mount -n -t 9p -o
ro,version=9p2000.L,trans=virtio,access=any virtme.guesttools
/run/virtme/guesttools;exec /run/virtme/guesttools/virtme-init"' -s -S
--
To unsubscribe from this list: send the line "unsubscribe sparclinux" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Pavel Tatashin Oct. 13, 2017, 3:09 p.m. UTC | #9
> It shouldn't be difficult to use section mappings with my patch, I just
> don't really see the need to try to optimise TLB pressure when you're
> running with KASAN enabled which already has something like a 3x slowdown
> afaik. If it ends up being a big deal, we can always do that later, but
> my main aim here is to divorce kasan from vmemmap because they should be
> completely unrelated.

Yes, I understand that kasan makes the system slow, but my point is why
make it even slower? However, I am OK adding your patch to the series,
BTW, symmetric changes will be needed for x86 as well sometime later.

>
> This certainly doesn't sound right; mapping the shadow with pages shouldn't
> lead to problems. I also can't seem to reproduce this myself -- could you
> share your full .config and a pointer to the git tree that you're using,
> please?

Config is attached. I am using my patch series + your patch + today's
clone from https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

Also, in a separate e-mail i sent out the qemu arguments.

>
>> I feel, this patch requires more work, and I am troubled with using
>> base pages instead of large pages.
>
> I'm happy to try fixing this, because I think splitting up kasan and vmemmap
> is the right thing to do here.

Thank you very much.

Pavel
Pavel Tatashin Oct. 13, 2017, 3:34 p.m. UTC | #10
Here is simplified qemu command:

qemu-system-aarch64 \
      -display none \
      -kernel ./arch/arm64/boot/Image  \
      -M virt -cpu cortex-a57 -s -S

In a separate terminal start arm64 cross debugger:

$ aarch64-unknown-linux-gnu-gdb ./vmlinux
...
Reading symbols from ./vmlinux...done.
(gdb) target remote :1234
Remote debugging using :1234
0x0000000040000000 in ?? ()
(gdb) c
Continuing.
^C
(gdb) lx-dmesg
[    0.000000] Booting Linux on physical CPU 0x0
[    0.000000] Linux version 4.14.0-rc4_pt_study-00136-gbed2c89768ba
(soleen@xakep) (gcc version 7.1.0 (crosstool-NG
crosstool-ng-1.23.0-90-g81327dd9)) #1 SMP PREEMPT Fri Oct 13 11:24:46
EDT 2017
... until the panic message is printed ...

Thank you,
Pavel
--
To unsubscribe from this list: send the line "unsubscribe sparclinux" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Will Deacon Oct. 13, 2017, 3:44 p.m. UTC | #11
Hi Pavel,

On Fri, Oct 13, 2017 at 11:09:41AM -0400, Pavel Tatashin wrote:
> > It shouldn't be difficult to use section mappings with my patch, I just
> > don't really see the need to try to optimise TLB pressure when you're
> > running with KASAN enabled which already has something like a 3x slowdown
> > afaik. If it ends up being a big deal, we can always do that later, but
> > my main aim here is to divorce kasan from vmemmap because they should be
> > completely unrelated.
> 
> Yes, I understand that kasan makes the system slow, but my point is why
> make it even slower? However, I am OK adding your patch to the series,
> BTW, symmetric changes will be needed for x86 as well sometime later.
> 
> >
> > This certainly doesn't sound right; mapping the shadow with pages shouldn't
> > lead to problems. I also can't seem to reproduce this myself -- could you
> > share your full .config and a pointer to the git tree that you're using,
> > please?
> 
> Config is attached. I am using my patch series + your patch + today's
> clone from https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

Great, I hit the same problem with your .config. It might actually be
CONFIG_DEBUG_MEMORY_INIT which does it.

> Also, in a separate e-mail i sent out the qemu arguments.
> 
> >
> >> I feel, this patch requires more work, and I am troubled with using
> >> base pages instead of large pages.
> >
> > I'm happy to try fixing this, because I think splitting up kasan and vmemmap
> > is the right thing to do here.
> 
> Thank you very much.

Thanks for sharing the .config and tree. It looks like the problem is that
kimg_shadow_start and kimg_shadow_end are not page-aligned. Whilst I fix
them up in kasan_map_populate, they remain unaligned when passed to
kasan_populate_zero_shadow, which confuses the loop termination conditions
in e.g. zero_pte_populate and the shadow isn't configured properly.

Fixup diff below; please merge in with my original patch.

Will

--->8

diff --git a/arch/arm64/mm/kasan_init.c b/arch/arm64/mm/kasan_init.c
index b922826d9908..207b1acb823a 100644
--- a/arch/arm64/mm/kasan_init.c
+++ b/arch/arm64/mm/kasan_init.c
@@ -146,7 +146,7 @@ asmlinkage void __init kasan_early_init(void)
 static void __init kasan_map_populate(unsigned long start, unsigned long end,
				      int node)
 {
-	kasan_pgd_populate(start & PAGE_MASK, PAGE_ALIGN(end), node, false);
+	kasan_pgd_populate(start, end, node, false);
 }
 
 /*
@@ -183,8 +183,8 @@ void __init kasan_init(void)
	struct memblock_region *reg;
	int i;
 
-	kimg_shadow_start = (u64)kasan_mem_to_shadow(_text);
-	kimg_shadow_end = (u64)kasan_mem_to_shadow(_end);
+	kimg_shadow_start = (u64)kasan_mem_to_shadow(_text) & PAGE_MASK;
+	kimg_shadow_end = PAGE_ALIGN((u64)kasan_mem_to_shadow(_end));
 
	mod_shadow_start = (u64)kasan_mem_to_shadow((void *)MODULES_VADDR);
	mod_shadow_end = (u64)kasan_mem_to_shadow((void *)MODULES_END);


--
To unsubscribe from this list: send the line "unsubscribe sparclinux" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Pavel Tatashin Oct. 13, 2017, 3:54 p.m. UTC | #12
> Thanks for sharing the .config and tree. It looks like the problem is that
> kimg_shadow_start and kimg_shadow_end are not page-aligned. Whilst I fix
> them up in kasan_map_populate, they remain unaligned when passed to
> kasan_populate_zero_shadow, which confuses the loop termination conditions
> in e.g. zero_pte_populate and the shadow isn't configured properly.

This makes sense. Thank you. I will insert these changes into your
patch, and send out a new series soon after sanity checking it.

Pavel
--
To unsubscribe from this list: send the line "unsubscribe sparclinux" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Pavel Tatashin Oct. 13, 2017, 4 p.m. UTC | #13
BTW, don't we need the same alignments inside the for_each_memblock() loop?

How about change kasan_map_populate() to accept regular VA start, end
address, and convert them internally after aligning to PAGE_SIZE?

Thank you,
Pavel


On Fri, Oct 13, 2017 at 11:54 AM, Pavel Tatashin
<pasha.tatashin@oracle.com> wrote:
>> Thanks for sharing the .config and tree. It looks like the problem is that
>> kimg_shadow_start and kimg_shadow_end are not page-aligned. Whilst I fix
>> them up in kasan_map_populate, they remain unaligned when passed to
>> kasan_populate_zero_shadow, which confuses the loop termination conditions
>> in e.g. zero_pte_populate and the shadow isn't configured properly.
>
> This makes sense. Thank you. I will insert these changes into your
> patch, and send out a new series soon after sanity checking it.
>
> Pavel
--
To unsubscribe from this list: send the line "unsubscribe sparclinux" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Will Deacon Oct. 13, 2017, 4:18 p.m. UTC | #14
On Fri, Oct 13, 2017 at 12:00:27PM -0400, Pavel Tatashin wrote:
> BTW, don't we need the same aligments inside for_each_memblock() loop?

Hmm, yes actually, given that we shift them right for the shadow address.

> How about change kasan_map_populate() to accept regular VA start, end
> address, and convert them internally after aligning to PAGE_SIZE?

That's what my original patch did, but it doesn't help on its own because
kasan_populate_zero_shadow would need the same change.

Will
--
To unsubscribe from this list: send the line "unsubscribe sparclinux" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox series

Patch

diff --git a/arch/arm64/mm/kasan_init.c b/arch/arm64/mm/kasan_init.c
index 81f03959a4ab..cb4af2951c90 100644
--- a/arch/arm64/mm/kasan_init.c
+++ b/arch/arm64/mm/kasan_init.c
@@ -28,6 +28,66 @@ 
 
 static pgd_t tmp_pg_dir[PTRS_PER_PGD] __initdata __aligned(PGD_SIZE);
 
+/* Creates mappings for kasan during early boot. The mapped memory is zeroed */
+static int __meminit kasan_map_populate(unsigned long start, unsigned long end,
+					int node)
+{
+	unsigned long addr, pfn, next;
+	unsigned long long size;
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *pte;
+	int ret;
+
+	ret = vmemmap_populate(start, end, node);
+	/*
+	 * We might have partially populated memory, so check for no entries,
+	 * and zero only those that actually exist.
+	 */
+	for (addr = start; addr < end; addr = next) {
+		pgd = pgd_offset_k(addr);
+		if (pgd_none(*pgd)) {
+			next = pgd_addr_end(addr, end);
+			continue;
+		}
+
+		pud = pud_offset(pgd, addr);
+		if (pud_none(*pud)) {
+			next = pud_addr_end(addr, end);
+			continue;
+		}
+		if (pud_sect(*pud)) {
+			/* This is PUD size page */
+			next = pud_addr_end(addr, end);
+			size = PUD_SIZE;
+			pfn = pud_pfn(*pud);
+		} else {
+			pmd = pmd_offset(pud, addr);
+			if (pmd_none(*pmd)) {
+				next = pmd_addr_end(addr, end);
+				continue;
+			}
+			if (pmd_sect(*pmd)) {
+				/* This is PMD size page */
+				next = pmd_addr_end(addr, end);
+				size = PMD_SIZE;
+				pfn = pmd_pfn(*pmd);
+			} else {
+				pte = pte_offset_kernel(pmd, addr);
+				next = addr + PAGE_SIZE;
+				if (pte_none(*pte))
+					continue;
+				/* This is base size page */
+				size = PAGE_SIZE;
+				pfn = pte_pfn(*pte);
+			}
+		}
+		memset(phys_to_virt(PFN_PHYS(pfn)), 0, size);
+	}
+	return ret;
+}
+
 /*
  * The p*d_populate functions call virt_to_phys implicitly so they can't be used
  * directly on kernel symbols (bm_p*d). All the early functions are called too
@@ -161,11 +221,11 @@  void __init kasan_init(void)
 
 	clear_pgds(KASAN_SHADOW_START, KASAN_SHADOW_END);
 
-	vmemmap_populate(kimg_shadow_start, kimg_shadow_end,
-			 pfn_to_nid(virt_to_pfn(lm_alias(_text))));
+	kasan_map_populate(kimg_shadow_start, kimg_shadow_end,
+			   pfn_to_nid(virt_to_pfn(lm_alias(_text))));
 
 	/*
-	 * vmemmap_populate() has populated the shadow region that covers the
+	 * kasan_map_populate() has populated the shadow region that covers the
 	 * kernel image with SWAPPER_BLOCK_SIZE mappings, so we have to round
 	 * the start and end addresses to SWAPPER_BLOCK_SIZE as well, to prevent
 	 * kasan_populate_zero_shadow() from replacing the page table entries
@@ -191,9 +251,9 @@  void __init kasan_init(void)
 		if (start >= end)
 			break;
 
-		vmemmap_populate((unsigned long)kasan_mem_to_shadow(start),
-				(unsigned long)kasan_mem_to_shadow(end),
-				pfn_to_nid(virt_to_pfn(start)));
+		kasan_map_populate((unsigned long)kasan_mem_to_shadow(start),
+				   (unsigned long)kasan_mem_to_shadow(end),
+				   pfn_to_nid(virt_to_pfn(start)));
 	}
 
 	/*