[RFC,7/7] virtual memory for OPAL boot

Message ID 20180921080511.22026-8-npiggin@gmail.com
State New
Headers show
Series
  • virtual memory patches
Related show

Checks

Context Check Description
snowpatch_ozlabs/make_check success Test make_check on branch master
snowpatch_ozlabs/apply_patch success master/apply_patch Successfully applied

Commit Message

Nicholas Piggin Sept. 21, 2018, 8:05 a.m.
This is a rewrite of the old patch which is now a real implementation
that boots on a POWER9. Sometimes it seems to hang, probably has a
lot of bugs left, and there's some hacks and warts, but for the most
part this should be a realistic design.

The main difference is that the VM mapping code, rather than setting up
bolted mappings immediately (which can not work in general with the
small hash table when you have a lot of memory to map), sets up
an extent. The hash table then gets populated on demand via
page faults.

The mappings are still entirely 1:1 so there is no need for anything
more than a simple extent with RW, EX, CI attributes. Actually even
then we have a relaxed mode which tells you that you forgot to map
something and sets a default RW page for you (which works fine except
for MMIO).

The most intrusive part of this is finding all the MMIOs that need
to be mapped CI. I work around it with a hack to put a big CI mapping
at the 0x6.... address, but I think it would be better in the long
run to explicitly map everything fine-grained, and unmap if they are
not in use.

If we eventually merge this, then in the medium-term this may be useful
for catching boot and bring-up bugs. Everything can run fine in
real-mode (the only code path differences are the IO accesses). We could put
more guard pages between memory regions, stacks, we can more carefully
unmap memory after it's freed, only map MMIO when it's to be used, and
do more per-cpu mappings. Provide debug modes that give out full pages
with guard pages around them for memory allocation calls.

In the long term I would like to think about keeping the virtual map
information around (not the hash table, but the vmm extents) after
boot, and have the guest OS ability to provide a virtual environment
for skiboot runtime. In general I think it would help contain memory
scribble bugs and such things.

Thanks,
Nick
---
 core/Makefile.inc    |   2 +-
 core/cpu.c           |   7 +
 core/exceptions.c    |  51 +++
 core/flash.c         |   4 +-
 core/init.c          | 107 +++++--
 core/lock.c          |  10 +-
 core/mem_region.c    |  84 +++--
 core/opal.c          |  14 +-
 core/vm.c            | 723 +++++++++++++++++++++++++++++++++++++++++++
 hdata/spira.c        |  21 +-
 hw/fake-nvram.c      |  12 +-
 hw/homer.c           |   5 +
 hw/lpc-uart.c        |  31 +-
 hw/lpc.c             |   2 +
 hw/phb4.c            |  16 +-
 hw/slw.c             |   4 +-
 hw/xive.c            |   1 +
 hw/xscom.c           |   3 +-
 include/cmpxchg.h    |  12 +
 include/cpu.h        |   9 +
 include/io.h         |  57 +++-
 include/mem_region.h |   1 +
 include/processor.h  |  12 +-
 include/skiboot.h    |  26 ++
 libstb/container.c   |  12 +-
 skiboot.lds.S        |  77 +++--
 26 files changed, 1198 insertions(+), 105 deletions(-)
 create mode 100644 core/vm.c

Patch

diff --git a/core/Makefile.inc b/core/Makefile.inc
index d3635059..e057f479 100644
--- a/core/Makefile.inc
+++ b/core/Makefile.inc
@@ -1,7 +1,7 @@ 
 # -*-Makefile-*-
 
 SUBDIRS += core
-CORE_OBJS = relocate.o console.o stack.o init.o chip.o mem_region.o
+CORE_OBJS = relocate.o console.o stack.o init.o chip.o mem_region.o vm.o
 CORE_OBJS += malloc.o lock.o cpu.o utils.o fdt.o opal.o interrupts.o timebase.o
 CORE_OBJS += opal-msg.o pci.o pci-iov.o pci-virt.o pci-slot.o pcie-slot.o
 CORE_OBJS += pci-opal.o fast-reboot.o device.o exceptions.o trace.o affinity.o
diff --git a/core/cpu.c b/core/cpu.c
index cc5b88c5..1a9d6bac 100644
--- a/core/cpu.c
+++ b/core/cpu.c
@@ -465,15 +465,22 @@  static void cpu_idle_p9(enum cpu_wake_cause wake_on)
 	isync();
 
 	if (sreset_enabled) {
+		bool vm_setup = cpu->vm_setup;
+
 		/* stop with EC=1 (sreset) and ESL=1 (enable thread switch). */
 		/* PSSCR SD=0 ESL=1 EC=1 PSSL=0 TR=3 MTL=0 RL=1 */
 		psscr = PPC_BIT(42) | PPC_BIT(43) |
 			PPC_BITMASK(54, 55) | PPC_BIT(63);
+		if (vm_setup)
+			vm_exit();
 		enter_p9_pm_state(psscr);
+		if (vm_setup)
+			vm_enter();
 	} else {
 		/* stop with EC=0 (resumes) which does not require sreset. */
 		/* PSSCR SD=0 ESL=0 EC=0 PSSL=0 TR=3 MTL=0 RL=1 */
 		psscr = PPC_BITMASK(54, 55) | PPC_BIT(63);
+		/* Can run with VM enabled */
 		enter_p9_pm_lite_state(psscr);
 	}
 
diff --git a/core/exceptions.c b/core/exceptions.c
index e205ac6e..670d218a 100644
--- a/core/exceptions.c
+++ b/core/exceptions.c
@@ -44,11 +44,15 @@  void exception_entry(struct stack_frame *stack);
 
 void exception_entry(struct stack_frame *stack)
 {
+	struct cpu_thread *c = this_cpu();
 	uint64_t nip;
 	uint64_t msr;
 	const size_t max = 320;
 	char buf[max];
 	size_t l;
+	bool vm_setup = c->vm_setup;
+
+	c->vm_setup = false;
 
 	switch (stack->type) {
 	case 0x500:
@@ -72,6 +76,47 @@  void exception_entry(struct stack_frame *stack)
 		break;
 	}
 
+	if (msr & MSR_DR) {
+		if (stack->type == 0x300) {
+			if (vm_dsi(nip, stack->dar,
+					!!(stack->dsisr & DSISR_ISSTORE)))
+				goto out;
+		}
+		if (stack->type == 0x380) {
+			if (vm_dslb(nip, stack->dar))
+				goto out;
+		}
+	}
+
+	if (msr & MSR_IR) {
+		if (stack->type == 0x400) {
+			if (vm_isi(nip))
+				goto out;
+		}
+		if (stack->type == 0x480) {
+			if (vm_islb(nip))
+				goto out;
+		}
+	}
+
+	if ((msr & (MSR_IR|MSR_DR)) && (msr & MSR_RI)) {
+		if (stack->type == 0x200) {
+			l = 0;
+			l += snprintf(buf + l, max - l,
+				"Recoverable MCE with VM=on at "REG"   ", nip);
+			l += snprintf_symbol(buf + l, max - l, nip);
+			l += snprintf(buf + l, max - l, "  MSR "REG, nip);
+			prerror("%s\n", buf);
+			dump_regs(stack);
+			prerror("Attempting to continue with VM=off\n");
+			vm_setup = false;
+			msr &= ~(MSR_IR|MSR_DR);
+			mtspr(SPR_HSRR1, msr);
+			goto out;
+		}
+	}
+
+fatal:
 	prerror("***********************************************\n");
 	l = 0;
 	if (stack->type == 0x200) {
@@ -87,6 +132,12 @@  void exception_entry(struct stack_frame *stack)
 	dump_regs(stack);
 
 	abort();
+
+out:
+	if (!(msr & MSR_RI))
+		goto fatal;
+
+	c->vm_setup = vm_setup;
 }
 
 static int64_t opal_register_exc_handler(uint64_t opal_exception __unused,
diff --git a/core/flash.c b/core/flash.c
index a4ffc7a7..cf1a942b 100644
--- a/core/flash.c
+++ b/core/flash.c
@@ -830,10 +830,12 @@  done_reading:
 	 * FIXME: TEMPORARY HACK: Don't verify VERSION until all bits of code
 	 * to produce a signed VERSION partition are upstream for a while.
 	 */
+#if 0
+// XXX: this checkstops
 	if (id != RESOURCE_ID_VERSION)
 		secureboot_verify(id, buf, *len);
 	trustedboot_measure(id, buf, *len);
-
+#endif
 	/* Find subpartition */
 	if (subid != RESOURCE_SUBID_NONE) {
 		memmove(buf, bufp, content_size);
diff --git a/core/init.c b/core/init.c
index a7e70c9e..2f507981 100644
--- a/core/init.c
+++ b/core/init.c
@@ -320,6 +320,7 @@  bool start_preload_kernel(void)
 	int loaded;
 
 	/* Try to load an external kernel payload through the platform hooks */
+	vm_map_global("KERNEL", (unsigned long)KERNEL_LOAD_BASE, KERNEL_LOAD_SIZE, true, false);
 	kernel_size = KERNEL_LOAD_SIZE;
 	loaded = start_preload_resource(RESOURCE_ID_KERNEL,
 					RESOURCE_SUBID_NONE,
@@ -331,6 +332,7 @@  bool start_preload_kernel(void)
 		return false;
 	}
 
+	vm_map_global("INITRAMFS", (unsigned long)INITRAMFS_LOAD_BASE, INITRAMFS_LOAD_SIZE, true, false);
 	initramfs_size = INITRAMFS_LOAD_SIZE;
 	loaded = start_preload_resource(RESOURCE_ID_INITRAMFS,
 					RESOURCE_SUBID_NONE,
@@ -347,7 +349,7 @@  bool start_preload_kernel(void)
 static bool load_kernel(void)
 {
 	void *stb_container = NULL;
-	struct elf_hdr *kh;
+	struct elf_hdr *kh, *t;
 	int loaded;
 
 	prlog(PR_NOTICE, "INIT: Waiting for kernel...\n");
@@ -355,6 +357,8 @@  static bool load_kernel(void)
 	loaded = wait_for_resource_loaded(RESOURCE_ID_KERNEL,
 					  RESOURCE_SUBID_NONE);
 
+	vm_unmap_global((unsigned long)KERNEL_LOAD_BASE, KERNEL_LOAD_SIZE);
+
 	if (loaded != OPAL_SUCCESS) {
 		printf("INIT: platform wait for kernel load failed\n");
 		kernel_size = 0;
@@ -386,7 +390,7 @@  static bool load_kernel(void)
 		if (kernel_entry < EXCEPTION_VECTORS_END) {
 			cpu_set_sreset_enable(false);
 			memcpy(NULL, old_vectors, EXCEPTION_VECTORS_END);
-			sync_icache();
+			sync_icache(0);
 		}
 	} else {
 		if (!kernel_size) {
@@ -407,21 +411,33 @@  static bool load_kernel(void)
 	      "INIT: Kernel loaded, size: %zu bytes (0 = unknown preload)\n",
 	      kernel_size);
 
-	if (kh->ei_ident != ELF_IDENT) {
+	/*
+	 * This has to be mapped in place because of some calculations
+	 * relative to pointer value.
+	 */
+	vm_map_global("kernel header", (unsigned long)kh, sizeof(*kh), false, false);
+	t = kh;
+	if (t->ei_ident != ELF_IDENT) {
 		prerror("INIT: ELF header not found. Assuming raw binary.\n");
 		return true;
 	}
 
-	if (kh->ei_class == ELF_CLASS_64) {
-		if (!try_load_elf64(kh))
+	if (t->ei_class == ELF_CLASS_64) {
+		if (!try_load_elf64(t)) {
+			vm_unmap_global((unsigned long)kh, sizeof(*kh));
 			return false;
-	} else if (kh->ei_class == ELF_CLASS_32) {
-		if (!try_load_elf32(kh))
+		}
+	} else if (t->ei_class == ELF_CLASS_32) {
+		if (!try_load_elf32(t)) {
+			vm_unmap_global((unsigned long)kh, sizeof(*kh));
 			return false;
+		}
 	} else {
 		prerror("INIT: Neither ELF32 not ELF64 ?\n");
+		vm_unmap_global((unsigned long)kh, sizeof(*kh));
 		return false;
 	}
+	vm_unmap_global((unsigned long)kh, sizeof(*kh));
 
 	if (chip_quirk(QUIRK_MAMBO_CALLOUTS)) {
 		secureboot_verify(RESOURCE_ID_KERNEL,
@@ -444,6 +460,7 @@  static void load_initramfs(void)
 	loaded = wait_for_resource_loaded(RESOURCE_ID_INITRAMFS,
 					  RESOURCE_SUBID_NONE);
 
+	vm_unmap_global((unsigned long)INITRAMFS_LOAD_BASE, INITRAMFS_LOAD_SIZE);
 	if (loaded != OPAL_SUCCESS || !initramfs_size)
 		return;
 
@@ -481,6 +498,7 @@  void __noreturn load_and_boot_kernel(bool is_reboot)
 	const struct dt_property *memprop;
 	const char *cmdline, *stdoutp;
 	uint64_t mem_top;
+	uint32_t *t;
 
 	memprop = dt_find_property(dt_root, DT_PRIVATE "maxmem");
 	if (memprop)
@@ -584,11 +602,16 @@  void __noreturn load_and_boot_kernel(bool is_reboot)
 
 	fdt_set_boot_cpuid_phys(fdt, this_cpu()->pir);
 
+	t = vm_map(kernel_entry, 4, false);
 	/* Check there is something there before we branch to it */
-	if (*(uint32_t *)kernel_entry == 0) {
+	if (*t == 0) {
 		prlog(PR_EMERG, "FATAL: Kernel is zeros, can't execute!\n");
 		assert(0);
 	}
+	vm_unmap(kernel_entry, 4);
+
+	/* Go back to realmode and tear down our VM before booting kernel */
+	vm_destroy();
 
 	if (kernel_32bit)
 		start_kernel32(kernel_entry, fdt, mem_top);
@@ -749,23 +772,35 @@  static void setup_branch_null_catcher(void)
 
 void setup_reset_vector(void)
 {
+	static char patch[0x100];
 	uint32_t *src, *dst;
+	uint32_t *t;
+	uint32_t len = (void *)&reset_patch_end - (void *)&reset_patch_start;
 
 	/* Copy the reset code over the entry point. */
 	src = &reset_patch_start;
+	t = vm_map((unsigned long)src, len, false);
+	memcpy(patch, t, len);
+	vm_unmap((unsigned long)src, len);
+
 	dst = (uint32_t *)0x100;
-	while(src < &reset_patch_end)
-		*(dst++) = *(src++);
-	sync_icache();
+	t = vm_map((unsigned long)dst, len, true);
+	memcpy(t, patch, len);
+	sync_icache((unsigned long)t);
+	vm_unmap((unsigned long)dst, len);
 	cpu_set_sreset_enable(true);
 }
 
 void copy_exception_vectors(void)
 {
+	void *t;
+
+	t = vm_map(0x0, 0x2000, true);
+
 	/* Backup previous vectors as this could contain a kernel
 	 * image.
 	 */
-	memcpy(old_vectors, NULL, EXCEPTION_VECTORS_END);
+	memcpy(old_vectors, t, EXCEPTION_VECTORS_END);
 
 	/* Copy from 0x100 to EXCEPTION_VECTORS_END, avoid below 0x100 as
 	 * this is the boot flag used by CPUs still potentially entering
@@ -773,9 +808,10 @@  void copy_exception_vectors(void)
 	 */
 	BUILD_ASSERT((&reset_patch_end - &reset_patch_start) <
 			EXCEPTION_VECTORS_END - 0x100);
-	memcpy((void *)0x100, (void *)(SKIBOOT_BASE + 0x100),
+	memcpy(t + 0x100, (void *)(SKIBOOT_BASE + 0x100),
 			EXCEPTION_VECTORS_END - 0x100);
-	sync_icache();
+	sync_icache((unsigned long)t);
+	vm_unmap(0x0, 0x2000);
 }
 
 static void per_thread_sanity_checks(void)
@@ -839,14 +875,23 @@  static uint32_t romem_csum;
 
 static void checksum_romem(void)
 {
+	void *t;
+	unsigned long size;
 	uint32_t csum;
 
 	romem_csum = 0;
 
-	csum = mem_csum(_start, _romem_end);
+	size = (unsigned long)_romem_end - (unsigned long)_start;
+	t = vm_map((unsigned long)_start, size, false);
+	csum = mem_csum(t, t + size);
 	romem_csum ^= csum;
-	csum = mem_csum(__builtin_kernel_start, __builtin_kernel_end);
+	vm_unmap((unsigned long)_start, size);
+
+	size = (unsigned long)__builtin_kernel_end - (unsigned long)__builtin_kernel_start;
+	t = vm_map((unsigned long)__builtin_kernel_start, size, false);
+	csum = mem_csum(t, t + size);
 	romem_csum ^= csum;
+	vm_unmap((unsigned long)__builtin_kernel_start, size);
 }
 
 bool verify_romem(void)
@@ -864,6 +909,8 @@  bool verify_romem(void)
 /* Called from head.S, thus no prototype. */
 void main_cpu_entry(const void *fdt);
 
+void mem_dump_allocs(void);
+
 void __noreturn __nomcount main_cpu_entry(const void *fdt)
 {
 	/*
@@ -921,7 +968,7 @@  void __noreturn __nomcount main_cpu_entry(const void *fdt)
 	prlog(PR_DEBUG, "initial console log level: memory %d, driver %d\n",
 	       (debug_descriptor.console_log_levels >> 4),
 	       (debug_descriptor.console_log_levels & 0x0f));
-	prlog(PR_TRACE, "OPAL is Powered By Linked-List Technology.\n");
+	prlog(PR_TRACE, "OPAL is Powered By Linked-List Technology. Now with more indirection.\n");
 
 #ifdef SKIBOOT_GCOV
 	skiboot_gcov_done();
@@ -933,6 +980,8 @@  void __noreturn __nomcount main_cpu_entry(const void *fdt)
 	/* Now locks can be used */
 	init_locks();
 
+	vm_init();
+
 	/* Create the OPAL call table early on, entries can be overridden
 	 * later on (FSP console code for example)
 	 */
@@ -958,7 +1007,20 @@  void __noreturn __nomcount main_cpu_entry(const void *fdt)
 		if (parse_hdat(false) < 0)
 			abort();
 	} else {
+		void *t;
+		uint32_t size;
+
+		t = vm_map((unsigned long)fdt, sizeof(struct fdt_header), false);
+		size = fdt_totalsize(t);
+		vm_unmap((unsigned long)fdt, sizeof(struct fdt_header));
+
+		/*
+		 * Would be nice to make this a local map, but it seems
+		 * to need to be expanded in place.
+		 */
+		vm_map_global("fdt", (unsigned long)fdt, size, false, false);
 		dt_expand(fdt);
+		vm_unmap_global((unsigned long)fdt, size);
 	}
 	dt_add_cpufeatures(dt_root);
 
@@ -1009,6 +1071,8 @@  void __noreturn __nomcount main_cpu_entry(const void *fdt)
 	 */
 	init_cpu_max_pir();
 
+	vm_init_stacks();
+
 	/*
 	 * Now, we init our memory map from the device-tree, and immediately
 	 * reserve areas which we know might contain data coming from
@@ -1018,6 +1082,8 @@  void __noreturn __nomcount main_cpu_entry(const void *fdt)
 	 */
 	mem_region_init();
 
+	mem_dump_allocs();
+
 	/* Reserve HOMER and OCC area */
 	homer_init();
 
@@ -1168,6 +1234,7 @@  void __noreturn __nomcount main_cpu_entry(const void *fdt)
 	probe_phb3();
 
 	/* Probe PHB4 on P9 */
+	enable_machine_check();
 	probe_phb4();
 
 	/* Probe NPUs */
@@ -1220,7 +1287,7 @@  void __noreturn __nomcount main_cpu_entry(const void *fdt)
 	prd_register_reserved_memory();
 
 	/* On P9, switch to radix mode by default */
-	cpu_set_radix_mode();
+//	cpu_set_radix_mode();
 
 	checksum_romem();
 
@@ -1257,5 +1324,9 @@  void __noreturn __nomcount secondary_cpu_entry(void)
 
 	prlog(PR_DEBUG, "INIT: CPU PIR 0x%04x called in\n", cpu->pir);
 
+	enable_machine_check();
+
+	vm_init_secondary();
+
 	__secondary_cpu_entry();
 }
diff --git a/core/lock.c b/core/lock.c
index fca8f465..f6dee774 100644
--- a/core/lock.c
+++ b/core/lock.c
@@ -194,14 +194,14 @@  static inline void add_lock_request(struct lock *l) { };
 static inline void remove_lock_request(void) { };
 #endif /* #if defined(DEADLOCK_CHECKER) && defined(DEBUG_LOCKS) */
 
-bool lock_held_by_me(struct lock *l)
+bool __nomcount lock_held_by_me(struct lock *l)
 {
 	uint64_t pir64 = this_cpu()->pir;
 
 	return l->lock_val == ((pir64 << 32) | 1);
 }
 
-bool try_lock_caller(struct lock *l, const char *owner)
+bool __nomcount try_lock_caller(struct lock *l, const char *owner)
 {
 	struct cpu_thread *cpu = this_cpu();
 
@@ -220,7 +220,7 @@  bool try_lock_caller(struct lock *l, const char *owner)
 	return false;
 }
 
-void lock_caller(struct lock *l, const char *owner)
+void __nomcount lock_caller(struct lock *l, const char *owner)
 {
 	bool timeout_warn = false;
 	unsigned long start = 0;
@@ -259,7 +259,7 @@  void lock_caller(struct lock *l, const char *owner)
 	remove_lock_request();
 }
 
-void unlock(struct lock *l)
+void __nomcount unlock(struct lock *l)
 {
 	struct cpu_thread *cpu = this_cpu();
 
@@ -283,7 +283,7 @@  void unlock(struct lock *l)
 	}
 }
 
-bool lock_recursive_caller(struct lock *l, const char *caller)
+bool __nomcount lock_recursive_caller(struct lock *l, const char *caller)
 {
 	if (bust_locks)
 		return false;
diff --git a/core/mem_region.c b/core/mem_region.c
index 7aac4e1c..81883b93 100644
--- a/core/mem_region.c
+++ b/core/mem_region.c
@@ -66,24 +66,27 @@  static struct mem_region skiboot_os_reserve = {
 	.type		= REGION_OS,
 };
 
-struct mem_region skiboot_heap = {
-	.name		= "ibm,firmware-heap",
-	.start		= HEAP_BASE,
-	.len		= HEAP_SIZE,
-	.type		= REGION_SKIBOOT_HEAP,
-};
-
 static struct mem_region skiboot_code_and_text = {
 	.name		= "ibm,firmware-code",
 	.start		= SKIBOOT_BASE,
 	.len		= HEAP_BASE - SKIBOOT_BASE,
+	.vm_mapped_len	= HEAP_BASE - SKIBOOT_BASE,
 	.type		= REGION_SKIBOOT_FIRMWARE,
 };
 
+struct mem_region skiboot_heap = {
+	.name		= "ibm,firmware-heap",
+	.start		= HEAP_BASE,
+	.len		= HEAP_SIZE,
+	.vm_mapped_len	= HEAP_SIZE,
+	.type		= REGION_SKIBOOT_HEAP,
+};
+
 static struct mem_region skiboot_after_heap = {
 	.name		= "ibm,firmware-data",
 	.start		= HEAP_BASE + HEAP_SIZE,
 	.len		= SKIBOOT_BASE + SKIBOOT_SIZE - (HEAP_BASE + HEAP_SIZE),
+	.vm_mapped_len	= SKIBOOT_BASE + SKIBOOT_SIZE - (HEAP_BASE + HEAP_SIZE),
 	.type		= REGION_SKIBOOT_FIRMWARE,
 };
 
@@ -153,14 +156,6 @@  static struct alloc_hdr *next_hdr(const struct mem_region *region,
 #if POISON_MEM_REGION == 1
 static void mem_poison(struct free_hdr *f)
 {
-	size_t poison_size = (void*)tailer(f) - (void*)(f+1);
-
-	/* We only poison up to a limit, as otherwise boot is
-	 * kinda slow */
-	if (poison_size > POISON_MEM_REGION_LIMIT)
-		poison_size = POISON_MEM_REGION_LIMIT;
-
-	memset(f+1, POISON_MEM_REGION_WITH, poison_size);
 }
 #else
 static inline void mem_poison(struct free_hdr *f __unused) { }
@@ -170,21 +165,41 @@  static inline void mem_poison(struct free_hdr *f __unused) { }
 static void init_allocatable_region(struct mem_region *region)
 {
 	struct free_hdr *f = region_start(region);
+	unsigned long num_longs;
+	unsigned long *t;
+
 	assert(region->type == REGION_SKIBOOT_HEAP ||
 	       region->type == REGION_MEMORY);
-	f->hdr.num_longs = region->len / sizeof(long);
+
+	num_longs = region->len / sizeof(long);
+
+	printf("init_allocatable_region %s %llx-%llx vm_mapped_len:%llx\n", region->name, region->start, region->start + region->len, region->vm_mapped_len);
+	if (!region->vm_mapped_len) {
+		/* SKIBOOT_BASE-SIZE regions already come mapped */
+		region->vm_mapped_len = PAGE_SIZE;
+		vm_map_global(region->name, region->start, PAGE_SIZE, true, false);
+	}
+
+	assert(PAGE_SIZE >= sizeof(*f));
+	assert(region->len >= PAGE_SIZE*2);
+
+	f->hdr.num_longs = num_longs;
 	f->hdr.free = true;
 	f->hdr.prev_free = false;
-	*tailer(f) = f->hdr.num_longs;
 	list_head_init(&region->free_list);
 	list_add(&region->free_list, &f->list);
-	mem_poison(f);
+
+	t = vm_map((unsigned long)tailer(f), sizeof(long), true);
+//	*tailer(f) = num_longs;
+	*t = num_longs;
+	vm_unmap((unsigned long)tailer(f), sizeof(long));
 }
 
 static void make_free(struct mem_region *region, struct free_hdr *f,
 		      const char *location, bool skip_poison)
 {
 	struct alloc_hdr *next;
+	unsigned long *t;
 
 	if (!skip_poison)
 		mem_poison(f);
@@ -208,7 +223,10 @@  static void make_free(struct mem_region *region, struct free_hdr *f,
 	}
 
 	/* Fix up tailer. */
-	*tailer(f) = f->hdr.num_longs;
+	t = vm_map((unsigned long)tailer(f), sizeof(long), true);
+//	*tailer(f) = f->hdr.num_longs;
+	*t = f->hdr.num_longs;
+	vm_unmap((unsigned long)tailer(f), sizeof(long));
 
 	/* If next is free, coalesce it */
 	next = next_hdr(region, &f->hdr);
@@ -305,17 +323,17 @@  void mem_dump_allocs(void)
 	struct alloc_hdr *h, *i;
 
 	/* Second pass: populate property data */
-	prlog(PR_INFO, "Memory regions:\n");
+	prlog(PR_NOTICE, "Memory regions:\n");
 	list_for_each(&regions, region, list) {
 		if (!(region->type == REGION_SKIBOOT_HEAP ||
 		      region->type == REGION_MEMORY))
 			continue;
-		prlog(PR_INFO, "  0x%012llx..%012llx : %s\n",
+		prlog(PR_NOTICE, "  0x%012llx..%012llx : %s\n",
 		       (long long)region->start,
 		       (long long)(region->start + region->len - 1),
 		       region->name);
 		if (region->free_list.n.next == NULL) {
-			prlog(PR_INFO, "    no allocs\n");
+			prlog(PR_NOTICE, "    no allocs\n");
 			continue;
 		}
 
@@ -397,6 +415,7 @@  static void *__mem_alloc(struct mem_region *region, size_t size, size_t align,
 	size_t alloc_longs, offset;
 	struct free_hdr *f;
 	struct alloc_hdr *next;
+	unsigned long newsz;
 
 	/* Align must be power of 2. */
 	assert(!((align - 1) & align));
@@ -452,6 +471,17 @@  found:
 		next->prev_free = false;
 	}
 
+	newsz = ((void *)((unsigned long *)f + alloc_longs + offset) - region_start(region) + sizeof(struct free_hdr));
+	if (newsz > region->vm_mapped_len) {
+		/* TODO: unmap on free */
+		newsz += PAGE_SIZE-1;
+		newsz &= ~(PAGE_SIZE-1);
+		vm_map_global(location,
+			region->start + region->vm_mapped_len,
+			newsz - region->vm_mapped_len, true, false);
+		region->vm_mapped_len = newsz;
+	}
+
 	if (offset != 0) {
 		struct free_hdr *pre = f;
 
@@ -696,6 +726,7 @@  static struct mem_region *new_region(const char *name,
 	region->name = name;
 	region->start = start;
 	region->len = len;
+	region->vm_mapped_len = 0;
 	region->node = node;
 	region->type = type;
 	region->free_list.n.next = NULL;
@@ -772,6 +803,10 @@  static bool add_region(struct mem_region *region)
 {
 	struct mem_region *r;
 
+	prlog(PR_NOTICE, "Add region  0x%012llx..%012llx : %s\n",
+	       (long long)region->start,
+	       (long long)(region->start + region->len - 1),
+	       region->name);
 	if (mem_regions_finalised) {
 		prerror("MEM: add_region(%s@0x%"PRIx64") called after finalise!\n",
 				region->name, region->start);
@@ -1103,6 +1138,11 @@  void mem_region_init(void)
 		if ((start + len) > top_of_ram)
 			top_of_ram = start + len;
 		unlock(&mem_region_lock);
+
+		prlog(PR_NOTICE, "init region  0x%012llx..%012llx : %s\n",
+		       (long long)region->start,
+		       (long long)(region->start + region->len - 1),
+		       region->name);
 	}
 
 	/*
diff --git a/core/opal.c b/core/opal.c
index 63a08510..17b60d27 100644
--- a/core/opal.c
+++ b/core/opal.c
@@ -64,14 +64,20 @@  void opal_table_init(void)
 {
 	struct opal_table_entry *s = __opal_table_start;
 	struct opal_table_entry *e = __opal_table_end;
+	uint64_t *t;
+	uint64_t len;
+
+	len = (unsigned long)e - (unsigned long)s;
 
 	prlog(PR_DEBUG, "OPAL table: %p .. %p, branch table: %p\n",
 	      s, e, opal_branch_table);
+	t = vm_map((unsigned long)&opal_branch_table[0], len, true);
 	while(s < e) {
-		opal_branch_table[s->token] = function_entry_address(s->func);
+		t[s->token] = function_entry_address(s->func);
 		opal_num_args[s->token] = s->nargs;
 		s++;
 	}
+	vm_unmap((unsigned long)&opal_branch_table[0], len);
 }
 
 /* Called from head.S, thus no prototype */
@@ -320,9 +326,13 @@  opal_call(OPAL_QUIESCE, opal_quiesce, 2);
 
 void __opal_register(uint64_t token, void *func, unsigned int nargs)
 {
+	uint64_t *t;
+
 	assert(token <= OPAL_LAST);
 
-	opal_branch_table[token] = function_entry_address(func);
+	t = vm_map((unsigned long)&opal_branch_table[token], sizeof(uint64_t), true);
+	*t = function_entry_address(func);
+	vm_unmap((unsigned long)&opal_branch_table[token], sizeof(uint64_t));
 	opal_num_args[token] = nargs;
 }
 
diff --git a/core/vm.c b/core/vm.c
new file mode 100644
index 00000000..b5bb78ae
--- /dev/null
+++ b/core/vm.c
@@ -0,0 +1,723 @@ 
+/* Copyright 2018 IBM Corp.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * 	http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <skiboot.h>
+#include <opal.h>
+#include <stack.h>
+#include <cpu.h>
+#include <timebase.h>
+#include <trace.h>
+#include <ccan/list/list.h>
+#include <ccan/str/str.h>
+#include <ccan/container_of/container_of.h>
+
+static bool vm_setup = false;
+
+#define SLB_SZ		(256UL*1024*1024)
+#define SLB_NR		32
+#define LOCAL_SLB_NR	2
+#define GLOBAL_SLB_NR	(SLB_NR - LOCAL_SLB_NR)
+#define LOCAL_SLB_BASE	GLOBAL_SLB_NR
+
+static void __nomcount slb_install(unsigned long esid, unsigned long vsid, unsigned int index)
+{
+	unsigned long rs;
+	unsigned long rb;
+
+	rs = vsid << (63-51);		/* 256MB VSID */
+	rs |= 1UL << (63-53);		/* Kp = 1 */
+
+	rb = esid << (63-35);		/* 256MB ESID */
+	rb |= 1UL << (63-36);		/* V = 1 */
+	rb |= index;
+
+	asm volatile("slbmte %0,%1" : : "r"(rs), "r"(rb) : "memory");
+}
+
+#if 0
+static void slb_remove(unsigned long esid)
+{
+	asm volatile("isync ; slbie %0 ; isync" : : "r"(esid << 28) : "memory");
+}
+#endif
+
+static void slb_remove_all(void)
+{
+	asm volatile("isync ; slbmte %0,%0 ; slbia ; isync" : : "r"(0) : "memory");
+}
+
+static void __nomcount slb_add(unsigned long ea)
+{
+	struct cpu_thread *cpu = this_cpu();
+	uint64_t esid = ea >> 28;
+	uint64_t vsid = ea >> 28;
+
+	slb_install(esid, vsid, cpu->vm_slb_rr);
+
+	cpu->vm_slb_rr++;
+	if (cpu->vm_slb_rr == GLOBAL_SLB_NR)
+		cpu->vm_slb_rr = 0;
+}
+
+struct hpte {
+	unsigned long dword[2];
+};
+
+struct hpteg {
+	struct hpte hpte[8];
+};
+
+static struct hpteg *htab;
+static unsigned long htab_nr_bytes;
+static unsigned long htab_nr_ptegs;
+static unsigned long htab_pteg_mask;
+
+static struct lock htab_global_lock;
+static struct lock htab_local_lock;
+
+static void __nomcount htab_install(unsigned long va, unsigned long pa, int rw, int ex, int ci, int global)
+{
+	unsigned long hash;
+	struct hpteg *hpteg;
+	struct hpte *hpte;
+	unsigned long ava = va >> 23;
+	unsigned long arpn = pa >> 12;
+	unsigned long dw0, dw1;
+	unsigned int hstart, hend;
+	unsigned int i;
+
+	dw0 = (ava << (63-56)) | 0x1;
+	if (!global)
+		dw0 |= 0x8;
+
+	dw1 = (arpn << (63-43 - 8));
+	if (!rw)
+		dw1 |= (1UL << (63 - 0)) | (1UL << (63 - 63 + 1));
+	if (!ex)
+		dw1 |= (1UL << (63 - 61));
+	dw1 |= (1UL << (63 - 60 + 1)); /* WIMG = 0010 */
+	if (ci)
+		dw1 |= (1UL << (63 - 60)) | (1UL << (63 - 60 + 2)); /* + I, G: cache-inhibited — WIMG = 0111? verify */
+	dw1 |= (1UL << (63 - 55)) | (1UL << (63 - 56)); /* R=C=1 */
+
+	hash = ((va >> 12) & 0xffff) ^ ((va >> 28) & 0x7fffffffffUL);
+	hpteg = &htab[hash & htab_pteg_mask];
+
+	if (global) {
+		lock(&htab_global_lock);
+		hstart = 0;
+		hend = 3;
+	} else {
+		lock(&htab_local_lock);
+		hstart = 4;
+		hend = 7;
+	}
+
+	for (i = hstart; i <= hend; i++) {
+		unsigned long _dw0;
+
+		hpte = &hpteg->hpte[i];
+
+		_dw0 = be64_to_cpu(hpte->dword[0]);
+		if (_dw0 & 1) {
+			if (_dw0 >> 7 == ava) {
+				if (global) {
+					/* Replace insertion */
+					goto install;
+				}
+
+				printf("HTAB collision va:%lx pa:%lx rw:%d ex:%d global:%d\n", va, pa, rw, ex, global);
+				backtrace();
+				assert(0);
+			}
+
+			continue;
+		}
+
+		assert(!hpte->dword[0]);
+		assert(!hpte->dword[1]);
+		goto install;
+	}
+
+	if (!global)
+		assert(0);
+
+	i = (mftb() >> 4) % (hend + 1);
+	hpte = &hpteg->hpte[i];
+
+install:
+	hpte->dword[1] = cpu_to_be64(dw1);
+	eieio();
+	hpte->dword[0] = cpu_to_be64(dw0);
+	asm volatile("ptesync" ::: "memory");
+
+	if (global)
+		unlock(&htab_global_lock);
+	else
+		unlock(&htab_local_lock);
+}
+
+static void htab_remove(unsigned long va, int global)
+{
+	unsigned long hash;
+	struct hpteg *hpteg;
+	unsigned long ava = va >> 23;
+	unsigned long dw0;
+	unsigned int hstart, hend;
+	unsigned int i;
+
+	dw0 = (ava << (63-56)) | 0x1;
+	if (!global)
+		dw0 |= 0x8;
+
+	hash = ((va >> 12) & 0xffff) ^ ((va >> 28) & 0x7fffffffffUL);
+	hpteg = &htab[hash & htab_pteg_mask];
+
+	if (global) {
+		lock(&htab_global_lock);
+		hstart = 0;
+		hend = 3;
+	} else {
+		lock(&htab_local_lock);
+		hstart = 4;
+		hend = 7;
+	}
+
+	for (i = hstart; i <= hend; i++) {
+		struct hpte *hpte = &hpteg->hpte[i];
+		unsigned long _dw0;
+
+		_dw0 = be64_to_cpu(hpte->dword[0]);
+
+		if (!(_dw0 & 1)) {
+			assert(!hpte->dword[0]);
+			assert(!hpte->dword[1]);
+			continue;
+		}
+
+		if (_dw0 != dw0) {
+			assert(_dw0 >> 7 != ava);
+			continue;
+		}
+
+		hpte->dword[0] = 0;
+		eieio();
+		hpte->dword[1] = 0;
+		asm volatile("ptesync" ::: "memory");
+
+		goto found;
+	}
+
+	if (!global)
+		assert(0);
+
+found:
+	if (global) {
+		asm volatile("ptesync" ::: "memory");
+		asm volatile("tlbie %0,%1" : : "r"(va & ~0xfffULL), "r"(0));
+		asm volatile("eieio ; tlbsync ; ptesync" ::: "memory");
+		unlock(&htab_global_lock);
+	} else {
+		asm volatile("ptesync" ::: "memory");
+		asm volatile("tlbiel %0" : : "r"(va & ~0xfffULL));
+		asm volatile("ptesync" ::: "memory");
+		unlock(&htab_local_lock);
+	}
+}
+
+struct vm_map {
+	struct list_node list;
+
+	const char *name;
+	uint64_t address;
+	uint64_t length;
+	bool writeable;
+	bool ci;
+};
+
+static struct list_head vm_maps = LIST_HEAD_INIT(vm_maps);
+static struct lock vm_maps_lock;
+static unsigned long nr_vm_maps;
+
+void vm_map_global(const char *name, unsigned long addr, unsigned long len, bool rw, bool ci)
+{
+	struct cpu_thread *c = this_cpu();
+	bool vm_setup = c->vm_setup;
+	struct vm_map *new;
+//	struct vm_map *vmm;
+
+	new = zalloc(sizeof(*new));
+	assert(new);
+
+	new->name = name;
+	new->address = addr;
+	new->length = len;
+	new->writeable = rw;
+	new->ci = ci;
+
+	/* Can not take a d-side fault while holding this lock */
+	if (vm_setup)
+		vm_exit();
+	lock(&vm_maps_lock);
+#if 0
+	list_for_each(&vm_maps, vmm, list) {
+		if (addr >= vmm->address + vmm->length)
+			continue;
+		if (addr + len <= vmm->address)
+			continue;
+		if (ci)
+			continue;
+		break;
+		unlock(&vm_maps_lock);
+		/* XXX: all MMIOs are mapped up front */
+		if (ci)
+			return;
+
+		// printf("vm_map_global %s %lx-%lx collided with vmm:%s %llx-%llx\n", name, addr, addr + len, vmm->name, vmm->address, vmm->address + vmm->length);
+		// assert(0);
+		return;
+	}
+#endif
+
+	list_add(&vm_maps, &new->list);
+	nr_vm_maps++;
+	unlock(&vm_maps_lock);
+	if (vm_setup)
+		vm_enter();
+}
+
+void vm_unmap_global(unsigned long addr, unsigned long len)
+{
+	struct cpu_thread *c = this_cpu();
+	bool vm_setup = c->vm_setup;
+	unsigned long end = addr + len;
+	struct vm_map *vmm;
+
+	/* Can not take a d-side fault while holding this lock */
+	if (vm_setup)
+		vm_exit();
+	lock(&vm_maps_lock);
+	list_for_each(&vm_maps, vmm, list) {
+		if (addr != vmm->address)
+			continue;
+		if (len != vmm->length)
+			continue;
+		goto found;
+	}
+	vmm = NULL;
+	unlock(&vm_maps_lock);
+	printf("unmap did't find anything\n");
+	backtrace();
+	goto out;
+//	assert(0);
+
+found:
+	list_del(&vmm->list);
+
+	if (vm_setup) {
+		while (addr < end) {
+			htab_remove(addr, 1);
+			addr += PAGE_SIZE;
+		}
+	}
+
+	nr_vm_maps--;
+	unlock(&vm_maps_lock);
+out:
+	if (vm_setup)
+		vm_enter();
+
+	if (vmm)
+		free(vmm);
+}
+
+
+/*
+ * Map [addr, addr + len) into this CPU's private segment and return a
+ * pointer usable for access.  Must be paired with vm_unmap(); nested
+ * local mappings are not supported (asserted).
+ *
+ * When the VM is not active on this CPU the real address is returned
+ * unchanged (vm_unmap() still balances the vm_local_map flag).
+ */
+void *vm_map(unsigned long addr, unsigned long len, bool rw)
+{
+	struct cpu_thread *c = this_cpu();
+	unsigned long va;
+	unsigned long esid = (0x0800000000000000ULL + ((unsigned long)c->pir << 30)) >> 28;
+	unsigned long vsid = (0x0800000000000000ULL + ((unsigned long)c->pir << 30)) >> 28; /* per-cpu VA */
+	unsigned long end = addr + len;
+	unsigned long offset = addr & (PAGE_SIZE - 1);
+
+	/* Can't do nested mappings */
+	assert(!c->vm_local_map);
+	c->vm_local_map = true;
+
+	if (!c->vm_setup)
+		return (void *)addr;
+
+	/* Page-align the range before installing translations */
+	end = (end + (PAGE_SIZE - 1)) & ~(PAGE_SIZE - 1);
+	addr &= ~(PAGE_SIZE - 1);
+	len = end - addr;
+
+	assert(len < (1 << 28)); /* same segment */
+
+	/* Install HPTEs for the whole range up front.  NOTE(review): the
+	 * final htab_install() argument is 0 here but 1 on the vm_dsi
+	 * fault path -- presumably a local/global distinction; confirm
+	 * against htab_install's definition. */
+	va = vsid << 28;
+	while (addr < end) {
+		htab_install(va, addr, rw, 0, 0, 0);
+		va += PAGE_SIZE;
+		addr += PAGE_SIZE;
+	}
+
+	return (void *)(esid << 28) + offset;
+}
+
+/*
+ * Tear down a per-CPU mapping created by vm_map().  addr/len must be
+ * the same values that were passed to vm_map().
+ */
+void vm_unmap(unsigned long addr, unsigned long len)
+{
+	struct cpu_thread *c = this_cpu();
+	unsigned long va;
+	unsigned long vsid = (0x0800000000000000ULL + ((unsigned long)c->pir << 30)) >> 28; /* per-cpu VA */
+	unsigned long end = addr + len;
+
+	assert(c->vm_local_map);
+	c->vm_local_map = false;
+
+	/* vm_map() returned the real address in this case; nothing to undo */
+	if (!c->vm_setup)
+		return;
+
+	end = (end + (PAGE_SIZE - 1)) & ~(PAGE_SIZE - 1);
+	addr &= ~(PAGE_SIZE - 1);
+	len = end - addr;
+
+	assert(len < (1 << 28)); /* same segment */
+
+	va = vsid << 28;
+	while (addr < end) {
+		htab_remove(va, 0);
+		va += PAGE_SIZE;
+		addr += PAGE_SIZE;
+	}
+}
+
+struct prte {
+	unsigned long dword[2];
+};
+
+static struct prte *prtab;
+
+/*
+ * Per-CPU MMU bring-up: point the thread at the shared partition
+ * table, clear the SLB and install this CPU's private segment.
+ */
+static void vm_init_cpu(void)
+{
+	struct cpu_thread *c = this_cpu();
+	unsigned long esid = (0x0800000000000000ULL + ((unsigned long)c->pir << 30)) >> 28;
+	unsigned long vsid = (0x0800000000000000ULL + ((unsigned long)c->pir << 30)) >> 28;
+
+	/* NOTE(review): assumed to select HPT translation and mask
+	 * unwanted LPCR features -- confirm the individual bit meanings
+	 * against the Power ISA LPCR definition */
+	mtspr(SPR_LPCR, mfspr(SPR_LPCR) &
+		~(PPC_BITMASK(0,3) | PPC_BIT(41) | PPC_BIT(43) | PPC_BIT(54)));
+	mtspr(SPR_LPID, 0);
+	mtspr(SPR_PID, 0);
+	mtspr(SPR_HRMOR, 0);
+	mtspr(SPR_PTCR, (unsigned long)prtab);
+
+	slb_remove_all();
+	slb_install(esid, vsid, LOCAL_SLB_BASE);
+}
+
+/* MMU bring-up entry point for secondary CPUs: set up per-CPU MMU
+ * state, then switch to virtual mode. */
+void vm_init_secondary(void)
+{
+	vm_init_cpu();
+	vm_enter();
+}
+
+/*
+ * True while this CPU must use real-mode (cache-inhibited) accessors:
+ * either the VM has not been brought up globally, or it is currently
+ * disabled on this CPU.  Used by the io.h accessors to choose between
+ * the cache-inhibited and normal load/store forms.
+ */
+bool vm_realmode(void)
+{
+	struct cpu_thread *c = this_cpu();
+
+	return !vm_setup || !c->vm_setup;
+}
+
+/*
+ * Switch this CPU to virtual mode by setting MSR IR/DR (and RI).
+ * Calling it while already in virtual mode is tolerated but warned
+ * about, since it likely indicates unbalanced enter/exit calls.
+ */
+void vm_enter(void)
+{
+	struct cpu_thread *c = this_cpu();
+
+	assert(vm_setup);
+	if (c->vm_setup) {
+		mtmsr(mfmsr() | (MSR_RI|MSR_IR|MSR_DR));
+		printf("CPU:%d vm_enter already entered\n", c->pir);
+		backtrace();
+		return;
+	}
+	c->vm_setup = true;
+	mtmsr(mfmsr() | (MSR_RI|MSR_IR|MSR_DR));
+}
+
+/*
+ * Drop this CPU back to real mode by clearing MSR IR/DR (and RI).
+ * Calling it while already in real mode is tolerated but warned
+ * about, since it likely indicates unbalanced enter/exit calls.
+ */
+void vm_exit(void)
+{
+	struct cpu_thread *c = this_cpu();
+
+	assert(vm_setup);
+	if (!c->vm_setup) {
+		mtmsr(mfmsr() & ~(MSR_RI|MSR_IR|MSR_DR));
+		printf("CPU:%d vm_exit already exited\n", c->pir);
+		backtrace();
+		return;
+	}
+	c->vm_setup = false;
+	mtmsr(mfmsr() & ~(MSR_RI|MSR_IR|MSR_DR));
+}
+
+/* SLB miss on a data access: install a segment covering the faulting
+ * effective address.  Always handled. */
+bool __nomcount vm_dslb(uint64_t nia, uint64_t dar)
+{
+	(void)nia;
+
+	slb_add(dar);
+
+	return true;
+}
+
+/* SLB miss on an instruction fetch: install a segment covering the
+ * faulting NIA.  Always handled. */
+bool __nomcount vm_islb(uint64_t nia)
+{
+	slb_add(nia);
+	return true;
+}
+
+/* When false, a fault on an unmapped address installs a default 1:1
+ * page and warns instead of failing hard. */
+static bool vm_strict = false;
+
+/*
+ * Hash fault on a data access.  Look the faulting address up in the
+ * global extent list and install the matching HPTE on demand.
+ * Returns false when the access is invalid (the caller then treats
+ * the fault as fatal).
+ */
+bool __nomcount vm_dsi(uint64_t nia, uint64_t dar, bool store)
+{
+	struct vm_map *vmm;
+
+	(void)nia;
+
+	lock(&vm_maps_lock);
+	list_for_each(&vm_maps, vmm, list) {
+		if (dar >= vmm->address && dar < vmm->address + vmm->length)
+			goto found;
+	}
+	if (!vm_strict) {
+		/* The 0x0006... range is the MMIO window (see vm_init), so
+		 * map it cache-inhibited; other low addresses get a normal
+		 * cacheable RW page.  NOTE(review): a DAR at or above
+		 * 0x0800... installs no HPTE yet still returns true, which
+		 * would re-fault forever -- confirm intended. */
+		if (dar >= 0x0006000000000000 &&
+			dar < 0x0006f00000000000)
+			htab_install(dar, dar, 1, 0, 1, 1);
+		else if (dar < 0x0800000000000000ULL)
+			htab_install(dar, dar, 1, 0, 0, 1);
+		unlock(&vm_maps_lock);
+		printf("Page fault with no VMM at NIA:0x%016llx DAR:0x%016llx, store:%d\n", nia, dar, store);
+		backtrace();
+		return true;
+	}
+	unlock(&vm_maps_lock);
+	printf("  vmm not found\n");
+	return false;
+found:
+	if (store && !vmm->writeable) {
+		/* Store to a read-only extent: in relaxed mode upgrade the
+		 * page and warn; in strict mode fail the access. */
+		if (!vm_strict) {
+			htab_install(dar, dar, store, 0, 0, 1);
+			unlock(&vm_maps_lock);
+			printf("Page fault with RO VMM:%s at NIA:0x%016llx DAR:0x%016llx, store:%d\n", vmm->name, nia, dar, store);
+			backtrace();
+			return true;
+		}
+		unlock(&vm_maps_lock);
+		printf("  vmm not writeable\n");
+		return false;
+	}
+
+	htab_install(dar, dar, vmm->writeable, 0, vmm->ci, 1);
+	unlock(&vm_maps_lock);
+
+	return true;
+}
+
+/*
+ * Hash fault on an instruction fetch.  Only skiboot's text section
+ * (_stext.._etext) is executable; any other NIA is a fatal fault.
+ */
+bool __nomcount vm_isi(uint64_t nia)
+{
+	if (nia < (unsigned long)_stext)
+		return false;
+	if (nia >= (unsigned long)_etext)
+		return false;
+
+	htab_install(nia, nia, 0, 1, 0, 1);
+	return true;
+}
+
+/* Per-CPU job: drop this CPU back to real mode. */
+static void cpu_stop_vm(void *arg __unused)
+{
+	vm_exit();
+}
+
+/* Per-CPU job: clear SLB and partition table pointer, run only after
+ * every CPU has left virtual mode. */
+static void cpu_cleanup_vm(void *arg __unused)
+{
+	slb_remove_all();
+	mtspr(SPR_PTCR, 0);
+}
+
+/*
+ * Take every CPU out of virtual mode, then clear their MMU state.
+ * The two phases are separated (with a full wait in between) so no
+ * CPU clears the partition table pointer while another might still
+ * be using translation.
+ */
+static void cpu_all_destroy_vm(void)
+{
+	struct cpu_thread *cpu;
+	struct cpu_job **jobs;
+
+	/* PIRs run 0..cpu_max_pir inclusive, hence cpu_max_pir + 1 slots */
+	jobs = zalloc(sizeof(struct cpu_job *) * (cpu_max_pir + 1));
+	assert(jobs);
+
+	/* Stop all CPUs */
+	for_each_available_cpu(cpu) {
+		if (cpu == this_cpu())
+			continue;
+		jobs[cpu->pir] = cpu_queue_job(cpu, "cpu_stop_vm",
+						cpu_stop_vm, NULL);
+	}
+
+	/* this cpu */
+	cpu_stop_vm(NULL);
+
+	/* Cleanup after all stop */
+	for_each_available_cpu(cpu) {
+		if (jobs[cpu->pir])
+			cpu_wait_job(jobs[cpu->pir], true);
+	}
+
+	for_each_available_cpu(cpu) {
+		if (cpu == this_cpu())
+			continue;
+		jobs[cpu->pir] = cpu_queue_job(cpu, "cpu_cleanup_vm",
+						cpu_cleanup_vm, NULL);
+	}
+
+	/* this cpu */
+	cpu_cleanup_vm(NULL);
+
+	for_each_available_cpu(cpu) {
+		if (jobs[cpu->pir])
+			cpu_wait_job(jobs[cpu->pir], true);
+	}
+
+	free(jobs);
+}
+
+/*
+ * Bring up the boot-time virtual memory environment: allocate the
+ * partition table and hash table, register the global 1:1 mappings
+ * for skiboot's own sections and known firmware regions, and enable
+ * translation on the boot CPU.
+ */
+void vm_init(void)
+{
+	unsigned long stack_start = SKIBOOT_BASE + SKIBOOT_SIZE;
+	unsigned long stack_end = stack_start + (cpu_max_pir + 1)*STACK_SIZE;
+
+	/* 64kB, 64kB-aligned partition table, zeroed */
+	prtab = memalign(64*1024, 64*1024);
+	assert(prtab);
+	memset(prtab, 0, 64*1024);
+
+	/* 256kB size-aligned hash table, zeroed */
+	htab_nr_bytes = 1UL<<18;
+	htab_nr_ptegs = htab_nr_bytes / sizeof(struct hpteg);
+	htab_pteg_mask = htab_nr_ptegs - 1;
+	htab = memalign(1UL<<18, htab_nr_bytes);
+	assert(htab);
+	memset(htab, 0, htab_nr_bytes);
+
+	prtab[0].dword[0] = cpu_to_be64((unsigned long)htab);
+	prtab[0].dword[1] = 0;
+
+	eieio();
+
+	vm_init_cpu();
+
+	cleanup_global_tlb();
+
+	vm_map_global("text", (unsigned long)_stext,
+		(unsigned long)_etext - (unsigned long)_stext,
+		false, false);
+	vm_map_global("rodata", (unsigned long)__rodata_start,
+		(unsigned long)__rodata_end - (unsigned long)__rodata_start,
+		false, false);
+	vm_map_global("data", (unsigned long)_sdata,
+		(unsigned long)_edata - (unsigned long)_sdata,
+		true, false);
+	vm_map_global("bss", (unsigned long)_sbss,
+		(unsigned long)_ebss - (unsigned long)_sbss,
+		true, false);
+	vm_map_global("sym map", (unsigned long)__sym_map_start,
+		(unsigned long)__sym_map_end - (unsigned long)__sym_map_start,
+		false, false);
+	vm_map_global("heap", HEAP_BASE, HEAP_SIZE, true, false);
+	vm_map_global("mem console", INMEM_CON_START, INMEM_CON_LEN, true, false);
+	vm_map_global("hbrt console", HBRT_CON_START, HBRT_CON_LEN, false, false);
+	vm_map_global("spira heap", SPIRA_HEAP_BASE, SPIRA_HEAP_SIZE, false, false);
+	vm_map_global("PSI TCE table", PSI_TCE_TABLE_BASE, PSI_TCE_TABLE_SIZE_P8, false, false);
+	vm_map_global("early stack", stack_start, stack_end - stack_start, true, false);
+
+	/* XXX: should map individual MMIOs? lot of work */
+	vm_map_global("MMIO", 0x0006000000000000ULL, 0x0001000000000000ULL, true, true);
+
+	printf("VMM: SETUP\n");
+	printf(" PRTAB:%p\n", prtab);
+	printf(" HTAB: %p\n", htab);
+	printf(" Global mappings\n");
+	printf("  text   %lx-%lx\n", (unsigned long)_stext, (unsigned long)_etext);
+	printf("  rodata %lx-%lx\n", (unsigned long)__rodata_start, (unsigned long)__rodata_end);
+	printf("  data   %lx-%lx\n", (unsigned long)_sdata, (unsigned long)_edata);
+	printf("  sym    %lx-%lx\n", (unsigned long)__sym_map_start, (unsigned long)__sym_map_end);
+	printf("  bss    %lx-%lx\n", (unsigned long)_sbss, (unsigned long)_ebss);
+	/* NOTE(review): the end printed here (SKIBOOT_BASE + SKIBOOT_SIZE)
+	 * differs from the HEAP_SIZE actually mapped above -- confirm
+	 * which is intended */
+	printf("  heap   %lx-%lx\n", (unsigned long)HEAP_BASE, (unsigned long)SKIBOOT_BASE + SKIBOOT_SIZE);
+	printf("  early stack %lx-%lx\n", stack_start, stack_end);
+
+	vm_setup = true;
+
+	vm_enter();
+}
+
+/*
+ * Once the final CPU stacks are in place, rewrite the boot-time
+ * "early stack" extent to cover the full stack area.
+ */
+void vm_init_stacks(void)
+{
+	unsigned long stack_start = SKIBOOT_BASE + SKIBOOT_SIZE;
+	unsigned long stack_end = stack_start + (cpu_max_pir + 1)*STACK_SIZE;
+	struct cpu_thread *c = this_cpu();
+	struct vm_map *vmm;
+
+	/* Can not take a d-side fault while holding this lock */
+	if (c->vm_setup)
+		mtmsr(mfmsr() & ~MSR_DR);
+	lock(&vm_maps_lock);
+	list_for_each(&vm_maps, vmm, list) {
+		if (vmm->address >= stack_end)
+			continue;
+		if (vmm->address + vmm->length <= stack_start)
+			continue;
+		goto found;
+	}
+	unlock(&vm_maps_lock);
+	assert(0);
+
+found:
+	/* Reuse the overlapping (early stack) extent in place */
+	vmm->name = "stacks";
+	vmm->address = stack_start;
+	vmm->length = stack_end - stack_start;
+	unlock(&vm_maps_lock);
+	if (c->vm_setup)
+		mtmsr(mfmsr() | MSR_DR);
+
+	printf("VMM: STACKS\n");
+	printf(" Global mappings\n");
+	printf("  stacks %lx-%lx\n", stack_start, stack_end);
+
+	/* NOTE(review): this walk runs without vm_maps_lock -- confirm no
+	 * concurrent map/unmap can happen at this point of boot */
+	list_for_each(&vm_maps, vmm, list)
+		printf("VMM %s %llx-%llx\n", vmm->name, vmm->address, vmm->address + vmm->length);
+}
+
+/*
+ * Tear the whole VM environment down on every CPU, then free the
+ * extent list, hash table and partition table.
+ */
+void vm_destroy(void)
+{
+	assert(vm_setup);
+
+	printf("VMM: TEARDOWN\n");
+
+	cpu_all_destroy_vm();
+
+	vm_setup = false;
+
+	while (!list_empty(&vm_maps)) {
+		struct vm_map *vmm;
+		vmm = list_pop(&vm_maps, struct vm_map, list);
+		free(vmm);
+	}
+
+	free(htab);
+	free(prtab);
+}
diff --git a/hdata/spira.c b/hdata/spira.c
index 2dd0dd36..95016838 100644
--- a/hdata/spira.c
+++ b/hdata/spira.c
@@ -1666,11 +1666,18 @@  static void fixup_spira(void)
 
 int parse_hdat(bool is_opal)
 {
+	int ret = 0;
+
 	cpu_type = PVR_TYPE(mfspr(SPR_PVR));
 
 	prlog(PR_DEBUG, "Parsing HDAT...\n");
 
+	vm_map_global("SPIRA", SKIBOOT_BASE + SPIRA_OFF, sizeof(spira), true, false);
 	fixup_spira();
+	vm_unmap_global(SKIBOOT_BASE + SPIRA_OFF, sizeof(spira));
+
+	vm_map_global("SPIRA", SKIBOOT_BASE + SPIRA_OFF, sizeof(spira), false, false);
+	vm_map_global("SPIRA-H", SKIBOOT_BASE + SPIRAH_OFF, sizeof(spirah), false, false);
 
 	/*
 	 * Basic DT root stuff
@@ -1691,9 +1698,12 @@  int parse_hdat(bool is_opal)
 	dt_init_led_node();
 
 	/* Parse SPPACA and/or PCIA */
-	if (!pcia_parse())
-		if (paca_parse() < 0)
-			return -1;
+	if (!pcia_parse()) {
+		if (paca_parse() < 0) {
+			ret = -1;
+			goto out;
+		}
+	}
 
 	/* IPL params */
 	add_iplparams();
@@ -1740,6 +1750,9 @@  int parse_hdat(bool is_opal)
 		node_stb_parse();
 
 	prlog(PR_DEBUG, "Parsing HDAT...done\n");
+out:
+	vm_unmap_global(SKIBOOT_BASE + SPIRA_OFF, sizeof(spira));
+	vm_unmap_global(SKIBOOT_BASE + SPIRAH_OFF, sizeof(spirah));
 
-	return 0;
+	return ret;
 }
diff --git a/hw/fake-nvram.c b/hw/fake-nvram.c
index 236ad5b9..97f3f31e 100644
--- a/hw/fake-nvram.c
+++ b/hw/fake-nvram.c
@@ -36,12 +36,16 @@  int fake_nvram_info(uint32_t *total_size)
 
 int fake_nvram_start_read(void *dst, uint32_t src, uint32_t len)
 {
+	void *t;
+
 	if (!nvram_region)
 		return -ENODEV;
 
+	t = vm_map(nvram_region->start + src, len, false);
 	lock(&fake_nvram_lock);
-	memcpy(dst, (void *) (nvram_region->start + src), len);
+	memcpy(dst, t, len);
 	unlock(&fake_nvram_lock);
+	vm_unmap(nvram_region->start + src, len);
 
 	nvram_read_complete(true);
 
@@ -50,12 +54,16 @@  int fake_nvram_start_read(void *dst, uint32_t src, uint32_t len)
 
 int fake_nvram_write(uint32_t offset, void *src, uint32_t size)
 {
+	void *t;
+
 	if (!nvram_region)
 		return OPAL_HARDWARE;
 
+	t = vm_map(nvram_region->start + offset, size, true);
 	lock(&fake_nvram_lock);
-	memcpy((void *) (nvram_region->start + offset), src, size);
+	memcpy(t, src, size);
 	unlock(&fake_nvram_lock);
+	vm_unmap(nvram_region->start + offset, size);
 
 	return 0;
 }
diff --git a/hw/homer.c b/hw/homer.c
index a0a0733e..07b987e6 100644
--- a/hw/homer.c
+++ b/hw/homer.c
@@ -121,6 +121,9 @@  static void homer_init_chip(struct proc_chip *chip)
 
 		chip->homer_base = hbase;
 		chip->homer_size = hsize;
+		/* slw late init and xive late init want to write to HOMER */
+		/* XXX: make it read only until then? */
+		vm_map_global("HOMER Image", hbase, hsize, true, false);
 	}
 
 	/*
@@ -147,6 +150,7 @@  static void homer_init_chip(struct proc_chip *chip)
 		chip->slw_base = sbase;
 		chip->slw_bar_size = ssize;
 		chip->slw_image_size = ssize; /* will be adjusted later */
+		/* XXX */
 	}
 
 	if (read_pba_bar(chip, bar_occ_common, &obase, &osize)) {
@@ -154,6 +158,7 @@  static void homer_init_chip(struct proc_chip *chip)
 		      obase, osize / 0x100000);
 		chip->occ_common_base = obase;
 		chip->occ_common_size = osize;
+		vm_map_global("OCC Common Area", obase, osize, false, false);
 	}
 }
 
diff --git a/hw/lpc-uart.c b/hw/lpc-uart.c
index 365bf3e2..7f01662b 100644
--- a/hw/lpc-uart.c
+++ b/hw/lpc-uart.c
@@ -600,6 +600,8 @@  void early_uart_init(void)
 	if (!mmio_uart_base)
 		return;
 
+	vm_map_global("uart", (unsigned long)mmio_uart_base, 8, true, true);
+
 	clk = dt_prop_get_u32(uart_node, "clock-frequency");
 	baud = dt_prop_get_u32(uart_node, "current-speed");
 
@@ -608,6 +610,7 @@  void early_uart_init(void)
 		prlog(PR_DEBUG, "UART: Using UART at %p\n", mmio_uart_base);
 	} else {
 		prerror("UART: Early init failed!");
+		vm_unmap_global((unsigned long)mmio_uart_base, 8);
 		mmio_uart_base = NULL;
 	}
 }
@@ -619,9 +622,6 @@  void uart_init(void)
 	char *path __unused;
 	const uint32_t *irqp;
 
-	/* Clean up after early_uart_init() */
-	mmio_uart_base = NULL;
-
 	/* UART lock is in the console path and thus must block
 	 * printf re-entrancy
 	 */
@@ -639,13 +639,28 @@  void uart_init(void)
 	 * directly mapped UARTs in simulation environments
 	 */
 	if (n->parent == dt_root) {
+		void *base;
+
 		printf("UART: Found at root !\n");
-		mmio_uart_base = (void *)dt_translate_address(n, 0, NULL);
-		if (!mmio_uart_base) {
+
+		base = (void *)dt_translate_address(n, 0, NULL);
+		if (!base) {
 			printf("UART: Failed to translate address !\n");
 			return;
 		}
 
+		if (mmio_uart_base != base) {
+			void *old;
+
+			vm_map_global("uart", (unsigned long)base, 8, true, true);
+			old = mmio_uart_base;
+			mmio_uart_base = base;
+
+			/* Clean up after early_uart_init() */
+			if (old)
+				vm_unmap_global((unsigned long)old, 8);
+		}
+
 		/* If it has an interrupt properly, we consider this to be
 		 * a direct XICS/XIVE interrupt
 		 */
@@ -674,6 +689,12 @@  void uart_init(void)
 			lpc_irq = be32_to_cpu(*irqp);
 			prlog(PR_DEBUG, "UART: Using LPC IRQ %d\n", lpc_irq);
 		}
+
+		/* Clean up after early_uart_init(): unmap the old base
+		 * before clearing the pointer (not after, which would
+		 * unmap address 0 and leak the early mapping) */
+		if (mmio_uart_base) {
+			vm_unmap_global((unsigned long)mmio_uart_base, 8);
+			mmio_uart_base = NULL;
+		}
 	}
 
 
diff --git a/hw/lpc.c b/hw/lpc.c
index 0eccad82..a935b8fe 100644
--- a/hw/lpc.c
+++ b/hw/lpc.c
@@ -1179,6 +1179,8 @@  static void lpc_init_chip_p9(struct dt_node *opb_node)
 	if (!lpc_node)
 		return;
 
+	vm_map_global("lpc", addr, 0x100000000UL, true, true);
+
 	lpc = zalloc(sizeof(struct lpcm));
 	assert(lpc);
 	lpc->chip_id = gcid;
diff --git a/hw/phb4.c b/hw/phb4.c
index 34dabb05..776a1dd4 100644
--- a/hw/phb4.c
+++ b/hw/phb4.c
@@ -5386,6 +5386,13 @@  static void phb4_create(struct dt_node *np)
 	if (!phb4_calculate_windows(p))
 		return;
 
+	vm_map_global("PHB REGS", (unsigned long)p->regs, 1024*1024, true, true);
+	vm_map_global("PHB INT MMIO", (unsigned long)p->int_mmio, 1024*1024, true, true);
+	if (p->mm0_size)
+		vm_map_global("PHB MM0", p->mm0_base, p->mm0_size, true, true);
+	if (p->mm1_size)
+		vm_map_global("PHB MM1", p->mm1_base, p->mm1_size, true, true);
+
 	/* Get the various XSCOM register bases from the device-tree */
 	prop = dt_require_property(np, "ibm,xscom-bases", 5 * sizeof(uint32_t));
 	p->pe_xscom = ((const uint32_t *)prop->prop)[0];
@@ -5573,6 +5580,7 @@  static void phb4_probe_stack(struct dt_node *stk_node, uint32_t pec_index,
 	uint64_t val, phb_bar = 0, irq_bar = 0, bar_en;
 	uint64_t mmio0_bar = 0, mmio0_bmask, mmio0_sz;
 	uint64_t mmio1_bar, mmio1_bmask, mmio1_sz;
+	uint64_t bar_sz;
 	uint64_t reg[4];
 	void *foo;
 	uint64_t mmio_win[4];
@@ -5602,7 +5610,8 @@  static void phb4_probe_stack(struct dt_node *stk_node, uint32_t pec_index,
 	bar_en = 0;
 
 	/* Initialize PHB register BAR */
-	phys_map_get(gcid, PHB4_REG_SPC, phb_num, &phb_bar, NULL);
+	phys_map_get(gcid, PHB4_REG_SPC, phb_num, &phb_bar, &bar_sz);
+	vm_map_global("PHB BAR", phb_bar, bar_sz, true, true);
 	rc = xscom_write(gcid, nest_stack + XPEC_NEST_STK_PHB_REG_BAR,
 			 phb_bar << 8);
 
@@ -5616,18 +5625,21 @@  static void phb4_probe_stack(struct dt_node *stk_node, uint32_t pec_index,
 	bar_en |= XPEC_NEST_STK_BAR_EN_PHB;
 
 	/* Same with INT BAR (ESB) */
-	phys_map_get(gcid, PHB4_XIVE_ESB, phb_num, &irq_bar, NULL);
+	phys_map_get(gcid, PHB4_XIVE_ESB, phb_num, &irq_bar, &bar_sz);
+	vm_map_global("IRQ BAR", irq_bar, bar_sz, true, true);
 	xscom_write(gcid, nest_stack + XPEC_NEST_STK_IRQ_BAR, irq_bar << 8);
 	bar_en |= XPEC_NEST_STK_BAR_EN_INT;
 
 
 	/* Same with MMIO windows */
 	phys_map_get(gcid, PHB4_64BIT_MMIO, phb_num, &mmio0_bar, &mmio0_sz);
+	vm_map_global("MMIO0 BAR", mmio0_bar, mmio0_sz, true, true);
 	mmio0_bmask =  (~(mmio0_sz - 1)) & 0x00FFFFFFFFFFFFFFULL;
 	xscom_write(gcid, nest_stack + XPEC_NEST_STK_MMIO_BAR0, mmio0_bar << 8);
 	xscom_write(gcid, nest_stack + XPEC_NEST_STK_MMIO_BAR0_MASK, mmio0_bmask << 8);
 
 	phys_map_get(gcid, PHB4_32BIT_MMIO, phb_num, &mmio1_bar, &mmio1_sz);
+	vm_map_global("MMIO1 BAR", mmio1_bar, mmio1_sz, true, true);
 	mmio1_bmask =  (~(mmio1_sz - 1)) & 0x00FFFFFFFFFFFFFFULL;
 	xscom_write(gcid, nest_stack + XPEC_NEST_STK_MMIO_BAR1, mmio1_bar << 8);
 	xscom_write(gcid, nest_stack + XPEC_NEST_STK_MMIO_BAR1_MASK, mmio1_bmask << 8);
diff --git a/hw/slw.c b/hw/slw.c
index dfa9189b..32d6628d 100644
--- a/hw/slw.c
+++ b/hw/slw.c
@@ -164,7 +164,7 @@  static void slw_patch_reset(void)
 		*(sav++) = *(dst);
 		*(dst++) = *(src++);
 	}
-	sync_icache();
+	sync_icache(0);
 }
 
 static void slw_unpatch_reset(void)
@@ -180,7 +180,7 @@  static void slw_unpatch_reset(void)
 		*(dst++) = *(sav++);
 		src++;
 	}
-	sync_icache();
+	sync_icache(0);
 }
 
 static bool slw_general_init(struct proc_chip *chip, struct cpu_thread *c)
diff --git a/hw/xive.c b/hw/xive.c
index 515f154d..6a4eeefa 100644
--- a/hw/xive.c
+++ b/hw/xive.c
@@ -1620,6 +1620,7 @@  static bool xive_configure_bars(struct xive *x)
 
 	/* IC BAR */
 	phys_map_get(chip_id, XIVE_IC, 0, (uint64_t *)&x->ic_base, &x->ic_size);
+	vm_map_global("xive", (unsigned long)x->ic_base, x->ic_size, true, true);
 	val = (uint64_t)x->ic_base | CQ_IC_BAR_VALID;
 	if (IC_PAGE_SIZE == 0x10000) {
 		val |= CQ_IC_BAR_64K;
diff --git a/hw/xscom.c b/hw/xscom.c
index 05b2c14f..18a4f3cb 100644
--- a/hw/xscom.c
+++ b/hw/xscom.c
@@ -890,6 +890,7 @@  void xscom_init(void)
 		assert(reg);
 
 		chip->xscom_base = dt_translate_address(xn, 0, NULL);
+		vm_map_global("xscom", chip->xscom_base, 0x100000000UL, true, true);
 
 		/* Grab processor type and EC level */
 		xscom_init_chip_info(chip);
@@ -903,7 +904,7 @@  void xscom_init(void)
 		prlog(PR_NOTICE, "CHIP: Chip ID %04x type: %s DD%x.%x%d\n",
 		      gcid, chip_name, chip->ec_level >> 4,
 		      chip->ec_level & 0xf, chip->ec_rev);
-		prlog(PR_DEBUG, "XSCOM: Base address: 0x%llx\n", chip->xscom_base);
+		prlog(PR_NOTICE, "XSCOM: Base address: 0x%llx\n", chip->xscom_base);
 	}
 
 	/* Collect details to trigger xstop via XSCOM write */
diff --git a/include/cmpxchg.h b/include/cmpxchg.h
index 28911c08..b194cc8d 100644
--- a/include/cmpxchg.h
+++ b/include/cmpxchg.h
@@ -71,6 +71,18 @@  static inline uint32_t cmpxchg32(uint32_t *mem, uint32_t old, uint32_t new)
 
 	return prev;
 }
+
+static inline uint64_t cmpxchg64(uint64_t *mem, uint64_t old, uint64_t new)
+{
+	uint64_t prev;
+
+	sync();
+	prev = __cmpxchg64(mem, old,new);
+	sync();
+
+	return prev;
+}
+
 #endif /* __TEST_ */
 
 #endif /* __CMPXCHG_H */
diff --git a/include/cpu.h b/include/cpu.h
index 2fe47982..23a497e1 100644
--- a/include/cpu.h
+++ b/include/cpu.h
@@ -82,10 +82,19 @@  struct cpu_thread {
 	struct bt_entry			stack_bot_bt[CPU_BACKTRACE_SIZE];
 	unsigned int			stack_bot_bt_count;
 #endif
+	/*
+	 * Per-thread VM parameters
+	 */
+	uint8_t				vm_slb_rr; /* RR allocator */
+	bool				vm_setup; /* virtual memory is up */
+	bool				vm_local_map; /* local mapping */
+	bool				vm_local_slb; /* local SLB used */
+
 	struct lock			job_lock;
 	struct list_head		job_queue;
 	uint32_t			job_count;
 	bool				job_has_no_return;
+
 	/*
 	 * Per-core mask tracking for threads in HMI handler and
 	 * a cleanup done bit.
diff --git a/include/io.h b/include/io.h
index c056c37e..cc896404 100644
--- a/include/io.h
+++ b/include/io.h
@@ -20,6 +20,7 @@ 
 #ifndef __ASSEMBLY__
 
 #include <compiler.h>
+#include <skiboot.h>
 #include <stdint.h>
 #include <processor.h>
 #include <ccan/endian/endian.h>
@@ -35,8 +36,14 @@ 
 static inline uint8_t __in_8(const volatile uint8_t *addr)
 {
 	uint8_t val;
-	asm volatile("lbzcix %0,0,%1" :
+
+	if (vm_realmode())
+		asm volatile("lbzcix %0,0,%1" :
+		     "=r"(val) : "r"(addr), "m"(*addr) : "memory");
+	else
+		asm volatile("lbzx %0,0,%1" :
 		     "=r"(val) : "r"(addr), "m"(*addr) : "memory");
+
 	return val;
 }
 
@@ -49,8 +56,14 @@  static inline uint8_t in_8(const volatile uint8_t *addr)
 static inline uint16_t __in_be16(const volatile uint16_t *addr)
 {
 	uint16_t val;
-	asm volatile("lhzcix %0,0,%1" :
+
+	if (vm_realmode())
+		asm volatile("lhzcix %0,0,%1" :
 		     "=r"(val) : "r"(addr), "m"(*addr) : "memory");
+	else
+		asm volatile("lhzx %0,0,%1" :
+		     "=r"(val) : "r"(addr), "m"(*addr) : "memory");
+
 	return val;
 }
 
@@ -68,8 +81,14 @@  static inline uint16_t in_le16(const volatile uint16_t *addr)
 static inline uint32_t __in_be32(const volatile uint32_t *addr)
 {
 	uint32_t val;
-	asm volatile("lwzcix %0,0,%1" :
+
+	if (vm_realmode())
+		asm volatile("lwzcix %0,0,%1" :
+		     "=r"(val) : "r"(addr), "m"(*addr) : "memory");
+	else
+		asm volatile("lwzx %0,0,%1" :
 		     "=r"(val) : "r"(addr), "m"(*addr) : "memory");
+
 	return val;
 }
 
@@ -87,8 +106,14 @@  static inline uint32_t in_le32(const volatile uint32_t *addr)
 static inline uint64_t __in_be64(const volatile uint64_t *addr)
 {
 	uint64_t val;
-	asm volatile("ldcix %0,0,%1" :
+
+	if (vm_realmode())
+		asm volatile("ldcix %0,0,%1" :
 		     "=r"(val) : "r"(addr), "m"(*addr) : "memory");
+	else
+		asm volatile("ldx %0,0,%1" :
+		     "=r"(val) : "r"(addr), "m"(*addr) : "memory");
+
 	return val;
 }
 
@@ -105,7 +130,11 @@  static inline uint64_t in_le64(const volatile uint64_t *addr)
 
 static inline void __out_8(volatile uint8_t *addr, uint8_t val)
 {
-	asm volatile("stbcix %0,0,%1"
+	if (vm_realmode())
+		asm volatile("stbcix %0,0,%1"
+		     : : "r"(val), "r"(addr), "m"(*addr) : "memory");
+	else
+		asm volatile("stbx %0,0,%1"
 		     : : "r"(val), "r"(addr), "m"(*addr) : "memory");
 }
 
@@ -117,7 +146,11 @@  static inline void out_8(volatile uint8_t *addr, uint8_t val)
 
 static inline void __out_be16(volatile uint16_t *addr, uint16_t val)
 {
-	asm volatile("sthcix %0,0,%1"
+	if (vm_realmode())
+		asm volatile("sthcix %0,0,%1"
+		     : : "r"(val), "r"(addr), "m"(*addr) : "memory");
+	else
+		asm volatile("sthx %0,0,%1"
 		     : : "r"(val), "r"(addr), "m"(*addr) : "memory");
 }
 
@@ -134,7 +167,11 @@  static inline void out_le16(volatile uint16_t *addr, uint16_t val)
 
 static inline void __out_be32(volatile uint32_t *addr, uint32_t val)
 {
-	asm volatile("stwcix %0,0,%1"
+	if (vm_realmode())
+		asm volatile("stwcix %0,0,%1"
+		     : : "r"(val), "r"(addr), "m"(*addr) : "memory");
+	else
+		asm volatile("stwx %0,0,%1"
 		     : : "r"(val), "r"(addr), "m"(*addr) : "memory");
 }
 
@@ -151,7 +188,11 @@  static inline void out_le32(volatile uint32_t *addr, uint32_t val)
 
 static inline void __out_be64(volatile uint64_t *addr, uint64_t val)
 {
-	asm volatile("stdcix %0,0,%1"
+	if (vm_realmode())
+		asm volatile("stdcix %0,0,%1"
+		     : : "r"(val), "r"(addr), "m"(*addr) : "memory");
+	else
+		asm volatile("stdx %0,0,%1"
 		     : : "r"(val), "r"(addr), "m"(*addr) : "memory");
 }
 
diff --git a/include/mem_region.h b/include/mem_region.h
index d9e490af..a18494d4 100644
--- a/include/mem_region.h
+++ b/include/mem_region.h
@@ -46,6 +46,7 @@  struct mem_region {
 	struct list_node list;
 	const char *name;
 	uint64_t start, len;
+	uint64_t vm_mapped_len;
 	struct dt_node *node;
 	enum mem_region_type type;
 	struct list_head free_list;
diff --git a/include/processor.h b/include/processor.h
index 6b262b45..396ee775 100644
--- a/include/processor.h
+++ b/include/processor.h
@@ -53,6 +53,7 @@ 
 #define SPR_SRR1	0x01b	/* RW: Exception save/restore reg 1 */
 #define SPR_CFAR	0x01c	/* RW: Come From Address Register */
 #define SPR_AMR		0x01d	/* RW: Authority Mask Register */
+#define SPR_PID		0x030	/* RW: PID register */
 #define SPR_IAMR	0x03d	/* RW: Instruction Authority Mask Register */
 #define SPR_RPR		0x0ba   /* RW: Relative Priority Register */
 #define SPR_TBRL	0x10c	/* RO: Timebase low */
@@ -75,10 +76,12 @@ 
 #define SPR_HSRR1	0x13b	/* RW: HV Exception save/restore reg 1 */
 #define SPR_TFMR	0x13d
 #define SPR_LPCR	0x13e
+#define SPR_LPID	0x13f	/* RW: LPID register */
 #define SPR_HMER	0x150	/* Hypervisor Maintenance Exception */
 #define SPR_HMEER	0x151	/* HMER interrupt enable mask */
 #define SPR_PCR		0x152
 #define SPR_AMOR	0x15d
+#define SPR_PTCR	0x1d0	/* RW: Partition table control register */
 #define SPR_PSSCR	0x357   /* RW: Stop status and control (ISA 3) */
 #define SPR_TSCR	0x399
 #define SPR_HID0	0x3f0
@@ -89,6 +92,11 @@ 
 #define SPR_PIR		0x3ff	/* RO: Processor Identification */
 
 
+/* Bits in DSISR */
+
+#define	DSISR_ISSTORE		0x02000000
+
+
 /* Bits in LPCR */
 
 /* Powersave Exit Cause Enable is different for P7 and P8 */
@@ -324,9 +332,9 @@  static inline void isync(void)
 /*
  * Cache sync
  */
-static inline void sync_icache(void)
+static inline void sync_icache(unsigned long ptr)
 {
-	asm volatile("sync; icbi 0,%0; sync; isync" : : "r" (0) : "memory");
+	asm volatile("sync; icbi 0,%0; sync; isync" : : "r" (ptr) : "memory");
 }
 
 /*
diff --git a/include/skiboot.h b/include/skiboot.h
index 8b53c768..4eb8bfa0 100644
--- a/include/skiboot.h
+++ b/include/skiboot.h
@@ -56,8 +56,13 @@  extern char __sym_map_end[];
 extern char _romem_end[];
 
 #ifndef __TESTING__
+extern char _stext[], _etext[];
 /* Readonly section start and end. */
 extern char __rodata_start[], __rodata_end[];
+extern char _sdata[], _edata[];
+extern char __sym_map_start[], __sym_map_end[];
+extern char _sbss[], _ebss[];
+extern char _end[];
 
 static inline bool is_rodata(const void *p)
 {
@@ -298,4 +303,25 @@  extern int fake_nvram_info(uint32_t *total_size);
 extern int fake_nvram_start_read(void *dst, uint32_t src, uint32_t len);
 extern int fake_nvram_write(uint32_t offset, void *src, uint32_t size);
 
+/* core/vm.c */
+#define PAGE_SIZE 4096
+
+bool vm_realmode(void);
+void vm_map_global(const char *name, unsigned long addr, unsigned long len, bool rw, bool ci);
+void vm_unmap_global(unsigned long addr, unsigned long len);
+void *vm_map(unsigned long addr, unsigned long len, bool rw);
+void vm_unmap(unsigned long addr, unsigned long len);
+void vm_init(void);
+void vm_init_stacks(void);
+void vm_destroy(void);
+void vm_init_secondary(void);
+void vm_enter(void);
+void vm_exit(void);
+void vm_exit_cleanup(void);
+void vm_map_stacks(void);
+bool vm_dslb(uint64_t nia, uint64_t dar);
+bool vm_islb(uint64_t nia);
+bool vm_dsi(uint64_t nia, uint64_t dar, bool store);
+bool vm_isi(uint64_t nia);
+
 #endif /* __SKIBOOT_H */
diff --git a/libstb/container.c b/libstb/container.c
index a720fbbf..aef169e1 100644
--- a/libstb/container.c
+++ b/libstb/container.c
@@ -19,14 +19,20 @@ 
 
 bool stb_is_container(const void *buf, size_t size)
 {
+	uint32_t *t;
 	ROM_container_raw *c;
+	bool ret = true;
 
 	c = (ROM_container_raw*) buf;
 	if (!buf || size < SECURE_BOOT_HEADERS_SIZE)
 		return false;
-	if (be32_to_cpu(c->magic_number) != ROM_MAGIC_NUMBER )
-		return false;
-	return true;
+
+	t = vm_map((unsigned long)&c->magic_number, sizeof(*t), false);
+	if (be32_to_cpu(*t) != ROM_MAGIC_NUMBER)
+		ret = false;
+	vm_unmap((unsigned long)&c->magic_number, sizeof(*t));
+
+	return ret;
 }
 
 uint32_t stb_payload_magic(const void *buf, size_t size)
diff --git a/skiboot.lds.S b/skiboot.lds.S
index 8fae2084..64029069 100644
--- a/skiboot.lds.S
+++ b/skiboot.lds.S
@@ -52,18 +52,41 @@  SECTIONS
 		KEEP(*(.cpuctrl.data))
 	}
 
+	/* Do I need to keep these ? */
+	.dynsym : { *(.dynsym)	}
+	.dynstr : { *(.dynstr)	}
+
+	/* Relocations */
 	. = ALIGN(0x10);
+	.dynamic : {
+		__dynamic_start = .;
+		*(.dynamic)
+		__dynamic_end = .;
+	}
+
+	. = ALIGN(0x10);
+	.rela.dyn : {
+		__rela_dyn_start = .;
+		*(.rela*)
+		__rela_dyn_end = .;
+	}
+
+	.hash   : { *(.hash)   }
+	.dynsym : { *(.dynsym) }
+	.dynstr : { *(.dynstr) }
+
+	. = ALIGN(0x1000);
 	_stext = .;
  	.text : {
 		*(.text*)
 		*(.sfpr)
 	}
 	_etext = .;
+	. = ALIGN(0x1000);
 
+	__rodata_start = .;
 	.rodata : {
-		__rodata_start = .;
 		*(.rodata .rodata.*)
-		__rodata_end = .;
 	}
 
 	. = ALIGN(0x10);
@@ -87,43 +110,43 @@  SECTIONS
 		*(.toc)
 	}
 
-	. = ALIGN(0x10);
-	.opal_table : {
-		__opal_table_start = .;
-		KEEP(*(.opal_table))
-		__opal_table_end = .;
-	}
-
 	.platforms : {
 		__platforms_start = .;
 		KEEP(*(.platforms))
 		__platforms_end = .;
 	}
 
-	/* Do I need to keep these ? */
-	.dynsym : { *(.dynsym)	}
-	.dynstr : { *(.dynstr)	}
-
-	/* Relocations */
 	. = ALIGN(0x10);
-	.dynamic : {
-		__dynamic_start = .;
-		*(.dynamic)
-		__dynamic_end = .;
+	.opal_table : {
+		__opal_table_start = .;
+		KEEP(*(.opal_table))
+		__opal_table_end = .;
 	}
+	__rodata_end = .;
 
-	. = ALIGN(0x10);
-	.rela.dyn : {
-		__rela_dyn_start = .;
-		*(.rela*)
-		__rela_dyn_end = .;
+	. = ALIGN(0x1000);
+
+	_sdata = .;
+	.data : {
+		/*
+		 * A couple of things that need to be 4K aligned and
+		 * to reside in their own pages for the sake of TCE
+		 * mappings
+		 */
+		. = ALIGN(0x1000);
+		*(.data.memcons);
+		. = ALIGN(0x1000);
+		*(.data.boot_trace);
+		. = ALIGN(0x1000);
+		*(.data*)
+		*(.force.data)
+		*(.toc1)
+		*(.branch_lt)
 	}
+	_edata = .;
 
-	.hash   : { *(.hash)   }
-	.dynsym : { *(.dynsym) }
-	.dynstr : { *(.dynstr) }
+	. = ALIGN(0x1000);
 
-	. = ALIGN(0x10);
 	.sym_map : {
 		__sym_map_start = . ;
 		KEEP(*(.sym_map))