diff mbox series

[RFC] Virtual Memory for OPAL boot

Message ID 20200428074459.1323794-1-npiggin@gmail.com
State New
Headers show
Series [RFC] Virtual Memory for OPAL boot | expand

Checks

Context Check Description
snowpatch_ozlabs/apply_patch fail Failed to apply to any branch
snowpatch_ozlabs/apply_patch warning Failed to apply on branch master (0f1937ef40fca0c3212a9dff1010b832a24fb063)

Commit Message

Nicholas Piggin April 28, 2020, 7:44 a.m. UTC
vm_map_global / vm_unmap_global sets up all-CPUs visible 1:1 mappings.
vm_map / vm_unmap creates a per-cpu mapping, which cannot be nested.

A list of global extents + a local extent per cpu is kept to describe
active mappings. Fault handlers look these up to install translations.

Booting with virtual memory is all well and good, and it can help find
bugs. The bigger benefit is that a logical virtual map is created in
the process, which can be given to the OS and used to create a virtual
memory environment for the OPAL runtime to execute in.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
- Countless fixes and improvements since last posted. This boots mambo
  and a P9 witherspoon I have, and is the basis for later skiboot and
  kernel patches which do actually make OPAL calls in virtual mode using
  a specific mm context created for it.

 core/Makefile.inc    |   2 +-
 core/cpu.c           |  22 +-
 core/exceptions.c    |  68 +++-
 core/fast-reboot.c   |  14 +-
 core/init.c          | 173 ++++++--
 core/mem_region.c    | 145 +++++--
 core/opal.c          |  38 +-
 core/platform.c      |  15 +-
 core/vm.c            | 942 +++++++++++++++++++++++++++++++++++++++++++
 hdata/spira.c        |  35 +-
 hw/fake-nvram.c      |  12 +-
 hw/homer.c           |  15 +-
 hw/lpc-uart.c        |  32 +-
 hw/lpc.c             |   6 +
 hw/phb4.c            |   9 +-
 hw/psi.c             |   2 +
 hw/slw.c             |   4 +-
 hw/xive.c            |   5 +
 hw/xscom.c           |   4 +
 include/cmpxchg.h    |   3 +
 include/cpu.h        |  22 +
 include/elf-abi.h    |  21 +-
 include/io.h         | 119 ++++--
 include/mem_region.h |   1 +
 include/platform.h   |   4 +-
 include/processor.h  |  13 +-
 include/skiboot.h    |  27 ++
 libstb/container.c   |  12 +-
 libstb/cvc.c         |   3 +
 libstb/secureboot.c  |   5 +-
 libstb/trustedboot.c |   6 +-
 skiboot.lds.S        |  26 +-
 32 files changed, 1650 insertions(+), 155 deletions(-)
 create mode 100644 core/vm.c

Comments

Cédric Le Goater April 29, 2020, 9:49 a.m. UTC | #1
On 4/28/20 9:44 AM, Nicholas Piggin wrote:
> vm_map_global / vm_unmap_global sets up all-CPUs visible 1:1 mappings.
> vm_map / vm_unmap creates a per-cpu mapping, which cannot be nested.
> 
> A list of global extents + a local extent per cpu is kept to describe
> active mappings. Fault handlers look these up to install translations.
> 
> Booting with virtual memory is all well and good, and it can help find
> bugs. The bigger benefit is that a logical virtual map is created in
> the process, which can be given to the OS and used to create a virtual
> memory environment for the OPAL runtime to execute in.

Is the goal to turn OPAL into a kernel driver and the OPAL calls into
simple function calls?

Thanks,

C.


> 
> Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
> ---
> - Countless fixes and improvements since last posted. This boots mambo
>   and a P9 witherspoon I have, and is the basis for later skiboot and
>   kernel patches which do actually make OPAL calls in virtual mode using
>   a specific mm context created for it.
> 
>  core/Makefile.inc    |   2 +-
>  core/cpu.c           |  22 +-
>  core/exceptions.c    |  68 +++-
>  core/fast-reboot.c   |  14 +-
>  core/init.c          | 173 ++++++--
>  core/mem_region.c    | 145 +++++--
>  core/opal.c          |  38 +-
>  core/platform.c      |  15 +-
>  core/vm.c            | 942 +++++++++++++++++++++++++++++++++++++++++++
>  hdata/spira.c        |  35 +-
>  hw/fake-nvram.c      |  12 +-
>  hw/homer.c           |  15 +-
>  hw/lpc-uart.c        |  32 +-
>  hw/lpc.c             |   6 +
>  hw/phb4.c            |   9 +-
>  hw/psi.c             |   2 +
>  hw/slw.c             |   4 +-
>  hw/xive.c            |   5 +
>  hw/xscom.c           |   4 +
>  include/cmpxchg.h    |   3 +
>  include/cpu.h        |  22 +
>  include/elf-abi.h    |  21 +-
>  include/io.h         | 119 ++++--
>  include/mem_region.h |   1 +
>  include/platform.h   |   4 +-
>  include/processor.h  |  13 +-
>  include/skiboot.h    |  27 ++
>  libstb/container.c   |  12 +-
>  libstb/cvc.c         |   3 +
>  libstb/secureboot.c  |   5 +-
>  libstb/trustedboot.c |   6 +-
>  skiboot.lds.S        |  26 +-
>  32 files changed, 1650 insertions(+), 155 deletions(-)
>  create mode 100644 core/vm.c
> 
> diff --git a/core/Makefile.inc b/core/Makefile.inc
> index 829800e5b..7a4bb6797 100644
> --- a/core/Makefile.inc
> +++ b/core/Makefile.inc
> @@ -3,7 +3,7 @@
>  # -*-Makefile-*-
>  
>  SUBDIRS += core
> -CORE_OBJS = relocate.o console.o stack.o init.o chip.o mem_region.o
> +CORE_OBJS = relocate.o console.o stack.o init.o chip.o mem_region.o vm.o
>  CORE_OBJS += malloc.o lock.o cpu.o utils.o fdt.o opal.o interrupts.o timebase.o
>  CORE_OBJS += opal-msg.o pci.o pci-virt.o pci-slot.o pcie-slot.o
>  CORE_OBJS += pci-opal.o fast-reboot.o device.o exceptions.o trace.o affinity.o
> diff --git a/core/cpu.c b/core/cpu.c
> index 37d9f41a8..30f9c6e70 100644
> --- a/core/cpu.c
> +++ b/core/cpu.c
> @@ -416,6 +416,10 @@ static unsigned int cpu_idle_p8(enum cpu_wake_cause wake_on)
>  	}
>  	isync();
>  
> +	/* P8 must enter nap with VM disabled */
> +	if (cpu->vm_setup)
> +		vm_exit();
> +
>  	/* Enter nap */
>  	vec = enter_p8_pm_state(false);
>  
> @@ -476,11 +480,19 @@ static unsigned int cpu_idle_p9(enum cpu_wake_cause wake_on)
>  		/* PSSCR SD=0 ESL=1 EC=1 PSSL=0 TR=3 MTL=0 RL=1 */
>  		psscr = PPC_BIT(42) | PPC_BIT(43) |
>  			PPC_BITMASK(54, 55) | PPC_BIT(63);
> +		/*
> +		 * stop with EC=1 wakes with vm off. P9 can stop with vm
> +		 * enabled, but it's simpler to disable now and so it wakes
> +		 * in the proper state.
> +		 */
> +		if (cpu->vm_setup)
> +			vm_exit();
>  		vec = enter_p9_pm_state(psscr);
>  	} else {
>  		/* stop with EC=0 (resumes) which does not require sreset. */
>  		/* PSSCR SD=0 ESL=0 EC=0 PSSL=0 TR=3 MTL=0 RL=1 */
>  		psscr = PPC_BITMASK(54, 55) | PPC_BIT(63);
> +		/* Can run with VM enabled */
>  		enter_p9_pm_lite_state(psscr);
>  	}
>  
> @@ -499,6 +511,7 @@ static unsigned int cpu_idle_p9(enum cpu_wake_cause wake_on)
>  static void cpu_idle_pm(enum cpu_wake_cause wake_on)
>  {
>  	unsigned int vec;
> +	bool was_vm_setup = this_cpu()->vm_setup;
>  
>  	switch(proc_gen) {
>  	case proc_gen_p8:
> @@ -523,12 +536,17 @@ static void cpu_idle_pm(enum cpu_wake_cause wake_on)
>  		default:
>  			break;
>  		}
> -		mtmsrd(MSR_RI, 1);
>  
>  	} else if (vec == 0x200) {
>  		exception_entry_pm_mce();
>  		enable_machine_check();
> +	}
> +
> +	if (vec != 0) {
> +		/* 0x100 or 0x200 */
>  		mtmsrd(MSR_RI, 1);
> +		if (was_vm_setup)
> +			vm_enter();
>  	}
>  }
>  
> @@ -1361,7 +1379,7 @@ static int64_t opal_return_cpu(void)
>  		printf("OPAL in_opal_call=%u\n", this_cpu()->in_opal_call);
>  	}
>  
> -	__secondary_cpu_entry();
> +	__return_cpu_entry();
>  
>  	return OPAL_HARDWARE; /* Should not happen */
>  }
> diff --git a/core/exceptions.c b/core/exceptions.c
> index 389548d16..35c14f8af 100644
> --- a/core/exceptions.c
> +++ b/core/exceptions.c
> @@ -33,7 +33,7 @@ static void dump_regs(struct stack_frame *stack)
>  
>  #define EXCEPTION_MAX_STR 320
>  
> -static void handle_mce(struct stack_frame *stack, uint64_t nip, uint64_t msr, bool *fatal)
> +static void handle_mce(struct stack_frame *stack, uint64_t nip, uint64_t msr, bool *fatal, bool *vm_setup)
>  {
>  	uint64_t mce_flags, mce_addr;
>  	const char *mce_err;
> @@ -44,12 +44,28 @@ static void handle_mce(struct stack_frame *stack, uint64_t nip, uint64_t msr, bo
>  	decode_mce(stack->srr0, stack->srr1, stack->dsisr, stack->dar,
>  			&mce_flags, &mce_err, &mce_addr);
>  
> -	/* Try to recover. */
> -	if (mce_flags & MCE_ERAT_ERROR) {
> -		/* Real-mode still uses ERAT, flush transient bitflips */
> +	/* Try to recover */
> +	if ((mce_flags & (MCE_SLB_ERROR|MCE_TABLE_WALK)) &&
> +			(msr & (MSR_IR|MSR_DR)) &&
> +			!this_cpu()->vm_local_map_inuse) {
> +		/* Try to turn off VM if non-linear map is not in use. */
> +		*vm_setup = false;
> +		stack->srr1 &= ~(MSR_IR|MSR_DR);
> +		mce_fix = "Disabling virtual memory";
> +
> +	} else if (mce_flags & MCE_ERAT_ERROR) {
>  		flush_erat();
>  		mce_fix = "ERAT flush";
>  
> +	} else if (mce_flags & MCE_TLB_ERROR) {
> +		cleanup_global_tlb();
> +		mce_fix = "global TLB flush";
> +
> +	} else if (mce_flags & MCE_TLB_ERROR) {
> +		cleanup_global_tlb();
> +		stack->srr0 += 4;
> +		mce_fix = "global TLB flush and skip instruction";
> +
>  	} else {
>  		*fatal = true;
>  	}
> @@ -83,6 +99,8 @@ static void handle_mce(struct stack_frame *stack, uint64_t nip, uint64_t msr, bo
>  
>  void exception_entry(struct stack_frame *stack)
>  {
> +	struct cpu_thread *c = this_cpu();
> +	bool vm_setup = c->vm_setup;
>  	bool fatal = false;
>  	bool hv;
>  	uint64_t nip;
> @@ -90,6 +108,8 @@ void exception_entry(struct stack_frame *stack)
>  	char buf[EXCEPTION_MAX_STR];
>  	size_t l;
>  
> +	c->vm_setup = false;
> +
>  	switch (stack->type) {
>  	case 0x500:
>  	case 0x980:
> @@ -134,9 +154,44 @@ void exception_entry(struct stack_frame *stack)
>  		break;
>  
>  	case 0x200:
> -		handle_mce(stack, nip, msr, &fatal);
> +		handle_mce(stack, nip, msr, &fatal, &vm_setup);
>  		goto no_symbol;
>  
> +	case 0x300:
> +		if (vm_dsi(nip, stack->dar, stack->dsisr))
> +			goto out;
> +		fatal = true;
> +		l += snprintf(buf + l, EXCEPTION_MAX_STR - l,
> +			"Fatal %s address "REG" at "REG"   ",
> +			(stack->dsisr & DSISR_ISSTORE) ? "store" : "load",
> +			stack->dar, nip);
> +		break;
> +
> +	case 0x380:
> +		if (vm_dslb(nip, stack->dar))
> +			goto out;
> +		fatal = true;
> +		l += snprintf(buf + l, EXCEPTION_MAX_STR - l,
> +			"Fatal load/store address "REG" at "REG"   ",
> +			stack->dar, nip);
> +		break;
> +
> +	case 0x400:
> +		if (vm_isi(nip))
> +			goto out;
> +		fatal = true;
> +		l += snprintf(buf + l, EXCEPTION_MAX_STR - l,
> +			"Fatal ifetch at "REG"   ", nip);
> +		break;
> +
> +	case 0x480:
> +		if (vm_islb(nip))
> +			goto out;
> +		fatal = true;
> +		l += snprintf(buf + l, EXCEPTION_MAX_STR - l,
> +			"Fatal ifetch at "REG"   ", nip);
> +		break;
> +
>  	case 0x700: {
>  		struct trap_table_entry *tte;
>  
> @@ -185,11 +240,14 @@ no_symbol:
>  		for (;;) ;
>  	}
>  
> +out:
> +	assert(!fatal);
>  	if (hv) {
>  		/* Set up for SRR return */
>  		stack->srr0 = nip;
>  		stack->srr1 = msr;
>  	}
> +	c->vm_setup = vm_setup;
>  }
>  
>  void exception_entry_pm_sreset(void)
> diff --git a/core/fast-reboot.c b/core/fast-reboot.c
> index 03777543a..e7f3b5c67 100644
> --- a/core/fast-reboot.c
> +++ b/core/fast-reboot.c
> @@ -381,6 +381,9 @@ void __noreturn fast_reboot_entry(void)
>  	cpu_set_sreset_enable(true);
>  	cpu_set_ipi_enable(true);
>  
> +	/* Enter virtual memory mode */
> +	vm_init(true);
> +
>  	prlog(PR_INFO, "RESET: Releasing secondaries...\n");
>  
>  	/* Release everybody */
> @@ -401,6 +404,7 @@ void __noreturn fast_reboot_entry(void)
>  	fast_boot_release = false;
>  
>  	if (!chip_quirk(QUIRK_MAMBO_CALLOUTS)) {
> +		void *t;
>  		/*
>  		 * mem_region_clear_unused avoids these preload regions
>  		 * so it can run along side image preloading. Clear these
> @@ -410,8 +414,14 @@ void __noreturn fast_reboot_entry(void)
>  		 * Mambo may have embedded payload here, so don't clear
>  		 * it at all.
>  		 */
> -		memset(KERNEL_LOAD_BASE, 0, KERNEL_LOAD_SIZE);
> -		memset(INITRAMFS_LOAD_BASE, 0, INITRAMFS_LOAD_SIZE);
> +
> +		t = vm_map((unsigned long)KERNEL_LOAD_BASE, KERNEL_LOAD_SIZE, true);
> +		memset(t, 0, KERNEL_LOAD_SIZE);
> +		vm_unmap((unsigned long)t, KERNEL_LOAD_SIZE);
> +
> +		t = vm_map((unsigned long)INITRAMFS_LOAD_BASE, INITRAMFS_LOAD_SIZE, true);
> +		memset(t, 0, INITRAMFS_LOAD_SIZE);
> +		vm_unmap((unsigned long)t, INITRAMFS_LOAD_SIZE);
>  	}
>  
>  	/* Start preloading kernel and ramdisk */
> diff --git a/core/init.c b/core/init.c
> index 2bb48845d..95c0339cf 100644
> --- a/core/init.c
> +++ b/core/init.c
> @@ -94,6 +94,7 @@ static bool try_load_elf64_le(struct elf_hdr *header)
>  	uint64_t load_base = (uint64_t)kh;
>  	struct elf64le_phdr *ph;
>  	unsigned int i;
> +	bool ret = false;
>  
>  	printf("INIT: 64-bit LE kernel discovered\n");
>  
> @@ -105,6 +106,9 @@ static bool try_load_elf64_le(struct elf_hdr *header)
>  	 * but it will not work for any ELF binary.
>  	 */
>  	ph = (struct elf64le_phdr *)(load_base + le64_to_cpu(kh->e_phoff));
> +	vm_map_global("KERNEL ELF Program Headers", (unsigned long)ph,
> +			le16_to_cpu(kh->e_phnum)*sizeof(struct elf64le_phdr),
> +			false, false);
>  	for (i = 0; i < le16_to_cpu(kh->e_phnum); i++, ph++) {
>  		if (le32_to_cpu(ph->p_type) != ELF_PTYPE_LOAD)
>  			continue;
> @@ -121,7 +125,7 @@ static bool try_load_elf64_le(struct elf_hdr *header)
>  
>  	if (!kernel_entry) {
>  		prerror("INIT: Failed to find kernel entry !\n");
> -		return false;
> +		goto out_unmap;
>  	}
>  	kernel_entry += load_base;
>  	kernel_32bit = false;
> @@ -133,7 +137,12 @@ static bool try_load_elf64_le(struct elf_hdr *header)
>  	prlog(PR_DEBUG, "INIT: 64-bit kernel entry at 0x%llx, size 0x%lx\n",
>  	      kernel_entry, kernel_size);
>  
> -	return true;
> +	ret = true;
> +
> +out_unmap:
> +	vm_unmap_global((unsigned long)ph, le16_to_cpu(kh->e_phnum)*sizeof(struct elf64le_phdr));
> +
> +	return ret;
>  }
>  
>  static bool try_load_elf64(struct elf_hdr *header)
> @@ -144,12 +153,17 @@ static bool try_load_elf64(struct elf_hdr *header)
>  	struct elf64be_phdr *ph;
>  	struct elf64be_shdr *sh;
>  	unsigned int i;
> +	bool ret = false;
> +
> +	vm_map_global("KERNEL ELF64 Header", (unsigned long)header,
> +			sizeof(struct elf64be_hdr), false, false);
>  
>  	/* Check it's a ppc64 LE ELF */
>  	if (khle->ei_ident == ELF_IDENT		&&
>  	    khle->ei_data == ELF_DATA_LSB	&&
>  	    le16_to_cpu(khle->e_machine) == ELF_MACH_PPC64) {
> -		return try_load_elf64_le(header);
> +		ret = try_load_elf64_le(header);
> +		goto out_unmap1;
>  	}
>  
>  	/* Check it's a ppc64 ELF */
> @@ -157,7 +171,7 @@ static bool try_load_elf64(struct elf_hdr *header)
>  	    kh->ei_data != ELF_DATA_MSB		||
>  	    be16_to_cpu(kh->e_machine) != ELF_MACH_PPC64) {
>  		prerror("INIT: Kernel doesn't look like an ppc64 ELF\n");
> -		return false;
> +		goto out_unmap1;
>  	}
>  
>  	/* Look for a loadable program header that has our entry in it
> @@ -168,6 +182,8 @@ static bool try_load_elf64(struct elf_hdr *header)
>  	 * but it will not work for any ELF binary.
>  	 */
>  	ph = (struct elf64be_phdr *)(load_base + be64_to_cpu(kh->e_phoff));
> +	vm_map_global("KERNEL ELF Program Headers", (unsigned long)ph,
> +			be16_to_cpu(kh->e_phnum)*sizeof(struct elf64be_phdr), false, false);
>  	for (i = 0; i < be16_to_cpu(kh->e_phnum); i++, ph++) {
>  		if (be32_to_cpu(ph->p_type) != ELF_PTYPE_LOAD)
>  			continue;
> @@ -184,7 +200,7 @@ static bool try_load_elf64(struct elf_hdr *header)
>  
>  	if (!kernel_entry) {
>  		prerror("INIT: Failed to find kernel entry !\n");
> -		return false;
> +		goto out_unmap2;
>  	}
>  
>  	/* For the normal big-endian ELF ABI, the kernel entry points
> @@ -194,6 +210,8 @@ static bool try_load_elf64(struct elf_hdr *header)
>  	 * to assuming it obeys the ABI.
>  	 */
>  	sh = (struct elf64be_shdr *)(load_base + be64_to_cpu(kh->e_shoff));
> +	vm_map_global("KERNEL ELF Section Headers", (unsigned long)sh,
> +			be16_to_cpu(kh->e_shnum)*sizeof(struct elf64be_shdr), false, false);
>  	for (i = 0; i < be16_to_cpu(kh->e_shnum); i++, sh++) {
>  		if (be64_to_cpu(sh->sh_addr) <= be64_to_cpu(kh->e_entry) &&
>  		    (be64_to_cpu(sh->sh_addr) + be64_to_cpu(sh->sh_size)) >
> @@ -218,7 +236,15 @@ static bool try_load_elf64(struct elf_hdr *header)
>  	printf("INIT: 64-bit kernel entry at 0x%llx, size 0x%lx\n",
>  	       kernel_entry, kernel_size);
>  
> -	return true;
> +	ret = true;
> +
> +	vm_unmap_global((unsigned long)sh, be16_to_cpu(kh->e_shnum)*sizeof(struct elf64be_shdr));
> +out_unmap2:
> +	vm_unmap_global((unsigned long)ph, be16_to_cpu(kh->e_phnum)*sizeof(struct elf64be_phdr));
> +out_unmap1:
> +	vm_unmap_global((unsigned long)header, sizeof(struct elf64be_hdr));
> +
> +	return ret;
>  }
>  
>  static bool try_load_elf32_le(struct elf_hdr *header)
> @@ -334,6 +360,7 @@ bool start_preload_kernel(void)
>  	int loaded;
>  
>  	/* Try to load an external kernel payload through the platform hooks */
> +	vm_map_global("KERNEL", (unsigned long)KERNEL_LOAD_BASE, KERNEL_LOAD_SIZE, true, false);
>  	kernel_size = KERNEL_LOAD_SIZE;
>  	loaded = start_preload_resource(RESOURCE_ID_KERNEL,
>  					RESOURCE_SUBID_NONE,
> @@ -342,9 +369,11 @@ bool start_preload_kernel(void)
>  	if (loaded != OPAL_SUCCESS) {
>  		printf("INIT: platform start load kernel failed\n");
>  		kernel_size = 0;
> +		vm_unmap_global((unsigned long)KERNEL_LOAD_BASE, KERNEL_LOAD_SIZE);
>  		return false;
>  	}
>  
> +	vm_map_global("INITRAMFS", (unsigned long)INITRAMFS_LOAD_BASE, INITRAMFS_LOAD_SIZE, true, false);
>  	initramfs_size = INITRAMFS_LOAD_SIZE;
>  	loaded = start_preload_resource(RESOURCE_ID_INITRAMFS,
>  					RESOURCE_SUBID_NONE,
> @@ -352,6 +381,7 @@ bool start_preload_kernel(void)
>  	if (loaded != OPAL_SUCCESS) {
>  		printf("INIT: platform start load initramfs failed\n");
>  		initramfs_size = 0;
> +		vm_unmap_global((unsigned long)INITRAMFS_LOAD_BASE, INITRAMFS_LOAD_SIZE);
>  		return false;
>  	}
>  
> @@ -361,13 +391,16 @@ bool start_preload_kernel(void)
>  static bool load_kernel(void)
>  {
>  	void *stb_container = NULL;
> -	struct elf_hdr *kh;
> +	struct elf_hdr *kh, *t;
> +	uint32_t ei_ident;
> +	uint8_t ei_class;
>  	int loaded;
>  
>  	prlog(PR_NOTICE, "INIT: Waiting for kernel...\n");
>  
>  	loaded = wait_for_resource_loaded(RESOURCE_ID_KERNEL,
>  					  RESOURCE_SUBID_NONE);
> +	vm_unmap_global((unsigned long)KERNEL_LOAD_BASE, KERNEL_LOAD_SIZE);
>  
>  	if (loaded != OPAL_SUCCESS) {
>  		printf("INIT: platform wait for kernel load failed\n");
> @@ -383,8 +416,10 @@ static bool load_kernel(void)
>  				((uint64_t)__builtin_kernel_start) -
>  				SKIBOOT_BASE + boot_offset;
>  			printf("Using built-in kernel\n");
> +			vm_map_global("KERNEL", (unsigned long)KERNEL_LOAD_BASE, kernel_size, true, false);
>  			memmove(KERNEL_LOAD_BASE, (void*)builtin_base,
>  				kernel_size);
> +			vm_unmap_global((unsigned long)KERNEL_LOAD_BASE, kernel_size);
>  		}
>  	}
>  
> @@ -400,7 +435,7 @@ static bool load_kernel(void)
>  		if (kernel_entry < EXCEPTION_VECTORS_END) {
>  			cpu_set_sreset_enable(false);
>  			memcpy_null(NULL, old_vectors, EXCEPTION_VECTORS_END);
> -			sync_icache();
> +			sync_icache(0);
>  		} else {
>  			/* Hack for STB in Mambo, assume at least 4kb in mem */
>  			if (!kernel_size)
> @@ -431,15 +466,20 @@ static bool load_kernel(void)
>  	      "INIT: Kernel loaded, size: %zu bytes (0 = unknown preload)\n",
>  	      kernel_size);
>  
> -	if (kh->ei_ident != ELF_IDENT) {
> +	t = vm_map((unsigned long)kh, sizeof(*kh), false);
> +	ei_ident = t->ei_ident;
> +	ei_class = t->ei_class;
> +	vm_unmap((unsigned long)t, sizeof(*kh));
> +
> +	if (ei_ident != ELF_IDENT) {
>  		prerror("INIT: ELF header not found. Assuming raw binary.\n");
>  		return true;
>  	}
>  
> -	if (kh->ei_class == ELF_CLASS_64) {
> +	if (ei_class == ELF_CLASS_64) {
>  		if (!try_load_elf64(kh))
>  			return false;
> -	} else if (kh->ei_class == ELF_CLASS_32) {
> +	} else if (ei_class == ELF_CLASS_32) {
>  		if (!try_load_elf32(kh))
>  			return false;
>  	} else {
> @@ -467,7 +507,7 @@ static void load_initramfs(void)
>  
>  	loaded = wait_for_resource_loaded(RESOURCE_ID_INITRAMFS,
>  					  RESOURCE_SUBID_NONE);
> -
> +	vm_unmap_global((unsigned long)INITRAMFS_LOAD_BASE, INITRAMFS_LOAD_SIZE);
>  	if (loaded != OPAL_SUCCESS || !initramfs_size)
>  		return;
>  
> @@ -539,6 +579,7 @@ void __noreturn load_and_boot_kernel(bool is_reboot)
>  	const struct dt_property *memprop;
>  	const char *cmdline, *stdoutp;
>  	uint64_t mem_top;
> +	uint32_t *t;
>  
>  	memprop = dt_find_property(dt_root, DT_PRIVATE "maxmem");
>  	if (memprop)
> @@ -613,11 +654,13 @@ void __noreturn load_and_boot_kernel(bool is_reboot)
>  
>  	fdt_set_boot_cpuid_phys(fdt, this_cpu()->pir);
>  
> +	t = vm_map(kernel_entry, 4, false);
>  	/* Check there is something there before we branch to it */
> -	if (*(uint32_t *)kernel_entry == 0) {
> +	if (*t == 0) {
>  		prlog(PR_EMERG, "FATAL: Kernel is zeros, can't execute!\n");
>  		assert(0);
>  	}
> +	vm_unmap(kernel_entry, 4);
>  
>  	if (platform.exit)
>  		platform.exit();
> @@ -629,7 +672,10 @@ void __noreturn load_and_boot_kernel(bool is_reboot)
>  	printf("INIT: Starting kernel at 0x%llx, fdt at %p %u bytes\n",
>  	       kernel_entry, fdt, fdt_totalsize(fdt));
>  
> -	/* Disable machine checks on all */
> +	/* Go back to realmode and tear down our VM before booting kernel */
> +	vm_destroy();
> +
> +	/* Disable machine checks, RI on all */
>  	cpu_disable_ME_RI_all();
>  
>  	patch_traps(false);
> @@ -835,37 +881,60 @@ static void setup_branch_null_catcher(void)
>  
>  void copy_sreset_vector(void)
>  {
> +	static char patch[0x100];
>  	uint32_t *src, *dst;
> +	uint32_t *t;
> +	uint32_t len = (void *)&reset_patch_end - (void *)&reset_patch_start;
>  
>  	/* Copy the reset code over the entry point. */
>  	src = &reset_patch_start;
> +	t = vm_map((unsigned long)src, len, false);
> +	memcpy(patch, t, len);
> +	vm_unmap((unsigned long)src, len);
> +
>  	dst = (uint32_t *)0x100;
> -	while(src < &reset_patch_end)
> -		*(dst++) = *(src++);
> -	sync_icache();
> +	t = vm_map((unsigned long)dst, len, true);
> +	memcpy(t, patch, len);
> +	sync_icache((unsigned long)t);
> +	vm_unmap((unsigned long)dst, len);
>  }
>  
>  void copy_sreset_vector_fast_reboot(void)
>  {
> +	static char patch[0x100];
>  	uint32_t *src, *dst;
> +	uint32_t *t;
> +	uint32_t len = (void *)&reset_fast_reboot_patch_end -
> +			(void *)&reset_fast_reboot_patch_start;
>  
>  	/* Copy the reset code over the entry point. */
>  	src = &reset_fast_reboot_patch_start;
> +	t = vm_map((unsigned long)src, len, false);
> +	memcpy(patch, t, len);
> +	vm_unmap((unsigned long)src, len);
> +
>  	dst = (uint32_t *)0x100;
> -	while(src < &reset_fast_reboot_patch_end)
> -		*(dst++) = *(src++);
> -	sync_icache();
> +	t = vm_map((unsigned long)dst, len, true);
> +	memcpy(t, patch, len);
> +	sync_icache((unsigned long)t);
> +	vm_unmap((unsigned long)dst, len);
>  }
>  
>  void copy_exception_vectors(void)
>  {
> +	void *t;
> +
> +	t = vm_map(0x0, EXCEPTION_VECTORS_END, true);
> +
>  	/* Copy from 0x100 to EXCEPTION_VECTORS_END, avoid below 0x100 as
>  	 * this is the boot flag used by CPUs still potentially entering
>  	 * skiboot.
>  	 */
> -	memcpy((void *)0x100, (void *)(SKIBOOT_BASE + 0x100),
> +	memcpy(t + 0x100, (void *)(SKIBOOT_BASE + 0x100),
>  			EXCEPTION_VECTORS_END - 0x100);
> -	sync_icache();
> +
> +	sync_icache((unsigned long)t);
> +	vm_unmap(0x0, EXCEPTION_VECTORS_END);
>  }
>  
>  /*
> @@ -879,15 +948,16 @@ void patch_traps(bool enable)
>  	for (tte = __trap_table_start; tte < __trap_table_end; tte++) {
>  		uint32_t *insn;
>  
> -		insn = (uint32_t *)tte->address;
> +		insn = vm_map(tte->address, sizeof(uint32_t), true);
>  		if (enable) {
>  			*insn = PPC_INST_TRAP;
>  		} else {
>  			*insn = PPC_INST_NOP;
>  		}
> +		sync_icache((unsigned long)insn);
> +		vm_unmap(tte->address, sizeof(uint32_t));
>  	}
>  
> -	sync_icache();
>  }
>  
>  static void per_thread_sanity_checks(void)
> @@ -937,19 +1007,22 @@ void pci_nvram_init(void)
>  static uint32_t mem_csum(void *_p, void *_e)
>  {
>  	size_t len = _e - _p;
> -	uint32_t *p = _p;
> +	uint32_t *t;
>  	uint32_t v1 = 0, v2 = 0;
>  	uint32_t csum;
>  	unsigned int i;
>  
> +	t = vm_map((unsigned long)_p, len, false);
> +
>  	for (i = 0; i < len; i += 4) {
> -		uint32_t v = *p++;
> +		uint32_t v = *t++;
>  		v1 += v;
>  		v2 += v1;
>  	}
> -
>  	csum = v1 ^ v2;
>  
> +	vm_unmap((unsigned long)_p, len);
> +
>  	return csum;
>  }
>  
> @@ -963,6 +1036,8 @@ static void checksum_romem(void)
>  	if (chip_quirk(QUIRK_SLOW_SIM))
>  		return;
>  
> +	/* Called in real mode */
> +
>  	csum = mem_csum(_start, _head_end);
>  	romem_csum ^= csum;
>  
> @@ -1054,7 +1129,7 @@ void __noreturn __nomcount main_cpu_entry(const void *fdt)
>  	prlog(PR_DEBUG, "initial console log level: memory %d, driver %d\n",
>  	       (debug_descriptor.console_log_levels >> 4),
>  	       (debug_descriptor.console_log_levels & 0x0f));
> -	prlog(PR_TRACE, "OPAL is Powered By Linked-List Technology.\n");
> +	prlog(PR_TRACE, "OPAL is Powered By Linked-List Technology. Now with more indirection.\n");
>  
>  #ifdef SKIBOOT_GCOV
>  	skiboot_gcov_done();
> @@ -1066,6 +1141,9 @@ void __noreturn __nomcount main_cpu_entry(const void *fdt)
>  	/* Now locks can be used */
>  	init_locks();
>  
> +	/* Enter virtual memory mode */
> +	vm_init(false);
> +
>  	/* Create the OPAL call table early on, entries can be overridden
>  	 * later on (FSP console code for example)
>  	 */
> @@ -1091,7 +1169,20 @@ void __noreturn __nomcount main_cpu_entry(const void *fdt)
>  		if (parse_hdat(false) < 0)
>  			abort();
>  	} else {
> +		void *t;
> +		uint32_t size;
> +
> +		t = vm_map((unsigned long)fdt, sizeof(struct fdt_header), false);
> +		size = fdt_totalsize(t);
> +		vm_unmap((unsigned long)fdt, sizeof(struct fdt_header));
> +
> +		/*
> +		 * Would be nice to make this a local map, but it seems
> +		 * to need to be expanded in place.
> +		 */
> +		vm_map_global("fdt", (unsigned long)fdt, size, false, false);
>  		dt_expand(fdt);
> +		vm_unmap_global((unsigned long)fdt, size);
>  	}
>  	dt_add_cpufeatures(dt_root);
>  
> @@ -1142,6 +1233,8 @@ void __noreturn __nomcount main_cpu_entry(const void *fdt)
>  	 */
>  	init_cpu_max_pir();
>  
> +	vm_init_stacks();
> +
>  	/*
>  	 * Now, we init our memory map from the device-tree, and immediately
>  	 * reserve areas which we know might contain data coming from
> @@ -1393,6 +1486,30 @@ void __noreturn __secondary_cpu_entry(void)
>  	enable_machine_check();
>  	mtmsrd(MSR_RI, 1);
>  
> +	vm_init_secondary();
> +
> +	/* Some XIVE setup */
> +	xive_cpu_callin(cpu);
> +
> +	/* Wait for work to do */
> +	while(true) {
> +		if (cpu_check_jobs(cpu))
> +			cpu_process_jobs();
> +		else
> +			cpu_idle_job();
> +	}
> +}
> +
> +void __noreturn __return_cpu_entry(void)
> +{
> +	struct cpu_thread *cpu = this_cpu();
> +
> +	/* Secondary CPU called in */
> +	cpu_callin(cpu);
> +
> +	enable_machine_check();
> +	mtmsrd(MSR_RI, 1);
> +
>  	/* Some XIVE setup */
>  	xive_cpu_callin(cpu);
>  
> diff --git a/core/mem_region.c b/core/mem_region.c
> index 36de2d094..69f24d630 100644
> --- a/core/mem_region.c
> +++ b/core/mem_region.c
> @@ -25,7 +25,7 @@
>  #define POISON_MEM_REGION	0
>  #endif
>  #define POISON_MEM_REGION_WITH	0x99
> -#define POISON_MEM_REGION_LIMIT 1*1024*1024*1024
> +#define POISON_MEM_REGION_LIMIT (128*1024*1024 - PAGE_SIZE)
>  
>  /* Locking: The mem_region_lock protects the regions list from concurrent
>   * updates. Additions to, or removals from, the region list must be done
> @@ -57,24 +57,27 @@ static struct mem_region skiboot_os_reserve = {
>  	.type		= REGION_OS,
>  };
>  
> -struct mem_region skiboot_heap = {
> -	.name		= "ibm,firmware-heap",
> -	.start		= HEAP_BASE,
> -	.len		= HEAP_SIZE,
> -	.type		= REGION_SKIBOOT_HEAP,
> -};
> -
>  static struct mem_region skiboot_code_and_text = {
>  	.name		= "ibm,firmware-code",
>  	.start		= SKIBOOT_BASE,
>  	.len		= HEAP_BASE - SKIBOOT_BASE,
> +	.vm_mapped_len	= HEAP_BASE - SKIBOOT_BASE,
>  	.type		= REGION_SKIBOOT_FIRMWARE,
>  };
>  
> +struct mem_region skiboot_heap = {
> +	.name		= "ibm,firmware-heap",
> +	.start		= HEAP_BASE,
> +	.len		= HEAP_SIZE,
> +	.vm_mapped_len	= HEAP_SIZE,
> +	.type		= REGION_SKIBOOT_HEAP,
> +};
> +
>  static struct mem_region skiboot_after_heap = {
>  	.name		= "ibm,firmware-data",
>  	.start		= HEAP_BASE + HEAP_SIZE,
>  	.len		= SKIBOOT_BASE + SKIBOOT_SIZE - (HEAP_BASE + HEAP_SIZE),
> +	.vm_mapped_len	= SKIBOOT_BASE + SKIBOOT_SIZE - (HEAP_BASE + HEAP_SIZE),
>  	.type		= REGION_SKIBOOT_FIRMWARE,
>  };
>  
> @@ -141,17 +144,40 @@ static struct alloc_hdr *next_hdr(const struct mem_region *region,
>  	return next;
>  }
>  
> +static unsigned long vm_map_limit(const struct mem_region *region,
> +				  const struct alloc_hdr *hdr,
> +				  unsigned long size)
> +{
> +	unsigned long end = region->start + region->len;
> +	unsigned long limit;
> +
> +	assert((unsigned long)hdr >= region->start);
> +
> +	limit = (unsigned long)hdr + size;
> +	assert(limit <= end);
> +
> +	if (limit + sizeof(struct free_hdr) <= end)
> +		limit += sizeof(struct free_hdr);
> +
> +	return limit - region->start;
> +}
> +
>  #if POISON_MEM_REGION == 1
>  static void mem_poison(struct free_hdr *f)
>  {
> -	size_t poison_size = (void*)tailer(f) - (void*)(f+1);
> +	unsigned long start = (unsigned long)(f + 1);
> +	unsigned long *t = tailer(f);
> +	size_t poison_size = (unsigned long)t - start;
> +	void *mem;
>  
>  	/* We only poison up to a limit, as otherwise boot is
>  	 * kinda slow */
>  	if (poison_size > POISON_MEM_REGION_LIMIT)
>  		poison_size = POISON_MEM_REGION_LIMIT;
>  
> -	memset(f+1, POISON_MEM_REGION_WITH, poison_size);
> +	mem = vm_map(start, poison_size, true);
> +	memset(mem, POISON_MEM_REGION_WITH, poison_size);
> +	vm_unmap(start, poison_size);
>  }
>  #endif
>  
> @@ -159,14 +185,36 @@ static void mem_poison(struct free_hdr *f)
>  static void init_allocatable_region(struct mem_region *region)
>  {
>  	struct free_hdr *f = region_start(region);
> +	unsigned long num_longs;
> +	unsigned long *t;
> +
>  	assert(region->type == REGION_SKIBOOT_HEAP ||
>  	       region->type == REGION_MEMORY);
> -	f->hdr.num_longs = region->len / sizeof(long);
> +
> +	num_longs = region->len / sizeof(long);
> +
> +	assert(PAGE_SIZE >= sizeof(*f));
> +	assert(region->len >= PAGE_SIZE*2);
> +
> +	list_head_init(&region->free_list);
> +
> +	if (!region->vm_mapped_len) {
> +		/* SKIBOOT_BASE-SIZE regions already come mapped */
> +		vm_map_global(region->name, region->start, sizeof(struct free_hdr), true, false);
> +		region->vm_mapped_len = sizeof(struct free_hdr);
> +	} else {
> +		assert(region == &skiboot_heap);
> +	}
> +
> +	f->hdr.num_longs = num_longs;
>  	f->hdr.free = true;
>  	f->hdr.prev_free = false;
> -	*tailer(f) = f->hdr.num_longs;
> -	list_head_init(&region->free_list);
>  	list_add(&region->free_list, &f->list);
> +
> +	t = vm_map((unsigned long)tailer(f), sizeof(long), true);
> +	*t = num_longs;
> +	vm_unmap((unsigned long)tailer(f), sizeof(long));
> +
>  #if POISON_MEM_REGION == 1
>  	mem_poison(f);
>  #endif
> @@ -176,6 +224,9 @@ static void make_free(struct mem_region *region, struct free_hdr *f,
>  		      const char *location, bool skip_poison)
>  {
>  	struct alloc_hdr *next;
> +	unsigned long *t;
> +	unsigned long new_end;
> +	unsigned long new_sz;
>  
>  #if POISON_MEM_REGION == 1
>  	if (!skip_poison)
> @@ -202,20 +253,33 @@ static void make_free(struct mem_region *region, struct free_hdr *f,
>  		list_add(&region->free_list, &f->list);
>  	}
>  
> -	/* Fix up tailer. */
> -	*tailer(f) = f->hdr.num_longs;
> -
> -	/* If next is free, coalesce it */
> +	/* If next is free coalesce it, else mark us as free. */
>  	next = next_hdr(region, &f->hdr);
>  	if (next) {
> -		next->prev_free = true;
>  		if (next->free) {
>  			struct free_hdr *next_free = (void *)next;
>  			list_del_from(&region->free_list, &next_free->list);
> -			/* Maximum of one level of recursion */
> -			make_free(region, next_free, location, true);
> +			f->hdr.num_longs += next_free->hdr.num_longs;
> +		} else {
> +			assert(!next->prev_free);
> +			next->prev_free = true;
> +			goto no_unmap;
>  		}
>  	}
> +
> +	/* Freed to the end, may have to trim mapping */
> +	new_end = (unsigned long)f + sizeof(struct free_hdr);
> +	new_sz = new_end - region->start;
> +	if (region != &skiboot_heap && new_sz < region->vm_mapped_len) {
> +		vm_unmap_global(new_end, region->vm_mapped_len - new_sz);
> +		region->vm_mapped_len = new_sz;
> +	}
> +
> +no_unmap:
> +	/* Fix up tailer. */
> +	t = vm_map((unsigned long)tailer(f), sizeof(long), true);
> +	*t = f->hdr.num_longs;
> +	vm_unmap((unsigned long)tailer(f), sizeof(long));
>  }
>  
>  /* Can we fit this many longs with this alignment in this free block? */
> @@ -253,11 +317,12 @@ static void discard_excess(struct mem_region *region,
>  		post->hdr.num_longs = hdr->num_longs - alloc_longs;
>  		post->hdr.prev_free = false;
>  
> +		/* No coalescing required. */
> +		make_free(region, post, location, skip_poison);
> +
>  		/* Trim our block. */
>  		hdr->num_longs = alloc_longs;
>  
> -		/* This coalesces as required. */
> -		make_free(region, post, location, skip_poison);
>  	}
>  }
>  
> @@ -445,6 +510,18 @@ found:
>  	if (next) {
>  		assert(next->prev_free);
>  		next->prev_free = false;
> +	} else {
> +		unsigned long new_sz;
> +
> +		/* Took from the end, may have to expand mapping */
> +		new_sz = vm_map_limit(region, &f->hdr, (alloc_longs + offset) * sizeof(long));
> +		if (new_sz > region->vm_mapped_len) {
> +			assert(region != &skiboot_heap);
> +			vm_map_global(region->name,
> +				region->start + region->vm_mapped_len,
> +				new_sz - region->vm_mapped_len, true, false);
> +			region->vm_mapped_len = new_sz;
> +		}
>  	}
>  
>  	if (offset != 0) {
> @@ -536,6 +613,7 @@ bool mem_resize(struct mem_region *region, void *mem, size_t len,
>  {
>  	struct alloc_hdr *hdr, *next;
>  	struct free_hdr *f;
> +	unsigned long new_sz;
>  
>  	/* This should be a constant. */
>  	assert(is_rodata(location));
> @@ -566,6 +644,15 @@ bool mem_resize(struct mem_region *region, void *mem, size_t len,
>  	if (!next || !next->free || hdr->num_longs + next->num_longs < len)
>  		return false;
>  
> +	new_sz = vm_map_limit(region, hdr, len * sizeof(long));
> +	if (new_sz > region->vm_mapped_len) {
> +		assert(region != &skiboot_heap);
> +		vm_map_global(region->name,
> +			region->start + region->vm_mapped_len,
> +			new_sz - region->vm_mapped_len, true, false);
> +		region->vm_mapped_len = new_sz;
> +	}
> +
>  	/* OK, it's free and big enough, absorb it. */
>  	f = (struct free_hdr *)next;
>  	list_del_from(&region->free_list, &f->list);
> @@ -691,6 +778,7 @@ static struct mem_region *new_region(const char *name,
>  	region->name = name;
>  	region->start = start;
>  	region->len = len;
> +	region->vm_mapped_len = 0;
>  	region->node = node;
>  	region->type = type;
>  	region->free_list.n.next = NULL;
> @@ -1199,6 +1287,7 @@ void mem_region_release_unused(void)
>  			continue;
>  
>  		used_len = allocated_length(r);
> +		assert(used_len <= r->vm_mapped_len);
>  
>  		prlog(PR_INFO, "    %s: %llu/%llu used\n",
>  		       r->name, (long long)used_len, (long long)r->len);
> @@ -1227,6 +1316,10 @@ void mem_region_release_unused(void)
>  			}
>  			list_add(&regions, &for_linux->list);
>  		}
> +		if (r->vm_mapped_len > used_len) {
> +			vm_unmap_global(r->start + used_len, r->vm_mapped_len - used_len);
> +			r->vm_mapped_len = used_len;
> +		}
>  	}
>  	unlock(&mem_region_lock);
>  }
> @@ -1271,9 +1364,13 @@ static void mem_clear_range(uint64_t s, uint64_t e)
>  		return;
>  	}
>  
> -	prlog(PR_DEBUG, "Clearing region %llx-%llx\n",
> -	      (long long)s, (long long)e);
> +	/*
> +	 * Large clear thrashes the small hash table, with parallel clearing
> +	 * this can livelock. Clear in real mode.
> +	 */
> +	vm_exit();
>  	memset((void *)s, 0, e - s);
> +	vm_enter();
>  }
>  
>  struct mem_region_clear_job_args {
> diff --git a/core/opal.c b/core/opal.c
> index 46518c445..9ab7391d1 100644
> --- a/core/opal.c
> +++ b/core/opal.c
> @@ -44,19 +44,39 @@ static uint64_t opal_dynamic_events;
>  extern uint32_t attn_trigger;
>  extern uint32_t hir_trigger;
>  
> +void __opal_register(uint64_t token, void *func, unsigned int nargs)
> +{
> +	uint64_t f;
> +	uint64_t *t;
> +	u8 *a;
> +
> +	assert(token <= OPAL_LAST);
> +
> +	f = function_entry_address(func);
> +
> +	t = vm_map((unsigned long)&opal_branch_table[token], sizeof(*t), true);
> +	*t = f;
> +	vm_unmap((unsigned long)&opal_branch_table[token], sizeof(*t));
> +
> +	a = vm_map((unsigned long)&opal_num_args[token], sizeof(*a), true);
> +	*a = nargs;
> +	vm_unmap((unsigned long)&opal_num_args[token], sizeof(*a));
> +}
>  
>  void opal_table_init(void)
>  {
>  	struct opal_table_entry *s = __opal_table_start;
>  	struct opal_table_entry *e = __opal_table_end;
> +	struct opal_table_entry *te;
> +	size_t size = (void *)e - (void *)s;
>  
>  	prlog(PR_DEBUG, "OPAL table: %p .. %p, branch table: %p\n",
>  	      s, e, opal_branch_table);
> -	while(s < e) {
> -		((uint64_t *)opal_branch_table)[s->token] = function_entry_address(s->func);
> -		((u8 *)opal_num_args)[s->token] = s->nargs;
> -		s++;
> -	}
> +
> +	vm_map_global("OPAL table", (unsigned long)s, size, false, false);
> +	for (te = s; te < e; te++)
> +		__opal_register(te->token, te->func, te->nargs);
> +	vm_unmap_global((unsigned long)s, size);
>  }
>  
>  /* Called from head.S, thus no prototype */
> @@ -317,14 +337,6 @@ int64_t opal_quiesce(uint32_t quiesce_type, int32_t cpu_target)
>  }
>  opal_call(OPAL_QUIESCE, opal_quiesce, 2);
>  
> -void __opal_register(uint64_t token, void *func, unsigned int nargs)
> -{
> -	assert(token <= OPAL_LAST);
> -
> -	((uint64_t *)opal_branch_table)[token] = function_entry_address(func);
> -	((u8 *)opal_num_args)[token] = nargs;
> -}
> -
>  /*
>   * add_opal_firmware_exports_node: adds properties to the device-tree which
>   * the OS will then change into sysfs nodes.
> diff --git a/core/platform.c b/core/platform.c
> index 8f4a3b877..839cf97ee 100644
> --- a/core/platform.c
> +++ b/core/platform.c
> @@ -242,8 +242,10 @@ void set_bmc_platform(const struct bmc_platform *bmc)
>  
>  void probe_platform(void)
>  {
> -	struct platform *platforms = &__platforms_start;
> -	unsigned int i;
> +	struct platform *s = __platforms_start;
> +	struct platform *e = __platforms_end;
> +	struct platform *p;
> +	size_t size = (void *)e - (void *)s;
>  
>  	/* Detect Manufacturing mode */
>  	if (dt_find_property(dt_root, "ibm,manufacturing-mode")) {
> @@ -257,12 +259,15 @@ void probe_platform(void)
>  		manufacturing_mode = true;
>  	}
>  
> -	for (i = 0; &platforms[i] < &__platforms_end; i++) {
> -		if (platforms[i].probe && platforms[i].probe()) {
> -			platform = platforms[i];
> +	vm_map_global("Platform table", (unsigned long)s, size, false, false);
> +	for (p = s; p < e; p++) {
> +		if (p->probe && p->probe()) {
> +			platform = *p;
>  			break;
>  		}
>  	}
> +	vm_unmap_global((unsigned long)s, size);
> +
>  	if (!platform.name) {
>  		platform = generic_platform;
>  		if (platform.probe)
> diff --git a/core/vm.c b/core/vm.c
> new file mode 100644
> index 000000000..84534796c
> --- /dev/null
> +++ b/core/vm.c
> @@ -0,0 +1,942 @@
> +/* Copyright 2018 IBM Corp.
> + *
> + * Licensed under the Apache License, Version 2.0 (the "License");
> + * you may not use this file except in compliance with the License.
> + * You may obtain a copy of the License at
> + *
> + * 	http://www.apache.org/licenses/LICENSE-2.0
> + *
> + * Unless required by applicable law or agreed to in writing, software
> + * distributed under the License is distributed on an "AS IS" BASIS,
> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
> + * implied.
> + * See the License for the specific language governing permissions and
> + * limitations under the License.
> + */
> +
> +#include <ccan/container_of/container_of.h>
> +#include <ccan/list/list.h>
> +#include <ccan/str/str.h>
> +#include <cmpxchg.h>
> +#include <cpu.h>
> +#include <opal.h>
> +#include <skiboot.h>
> +#include <stack.h>
> +#include <timebase.h>
> +#include <trace.h>
> +
> +static bool vm_setup = false;
> +static bool vm_globals_allocated = false;
> +
> +#define SLB_SZ			(256UL*1024*1024)
> +#define SLB_NR			32
> +#define LOCAL_SLB_NR		2
> +#define GLOBAL_SLB_NR		(SLB_NR - LOCAL_SLB_NR)
> +#define LOCAL_SLB_BASE		GLOBAL_SLB_NR
> +
> +#define LOCAL_EA_PERCPU		(SLB_SZ)
> +#define LOCAL_EA_BEGIN		0x0008000000000000ULL
> +#define LOCAL_EA_END		0x0009000000000000ULL
> +
> +static void __nomcount slb_install(unsigned long esid, unsigned long vsid, unsigned int index)
> +{
> +	unsigned long rs;
> +	unsigned long rb;
> +
> +	rs = vsid << (63-51);		/* 256MB VSID */
> +	rs |= 1UL << (63-53);		/* Kp = 1 */
> +	if (PAGE_SIZE == 0x10000) {
> +		rs |= 1UL << (63-55);		/* L = 1 */
> +		rs |= 1UL << (63-59);		/* LP = 01 */
> +	}
> +
> +	rb = esid << (63-35);		/* 256MB ESID */
> +	rb |= 1UL << (63-36);		/* V = 1 */
> +	rb |= index;
> +
> +	asm volatile("slbmte %0,%1" : : "r"(rs), "r"(rb) : "memory");
> +}
> +
> +#if 0
> +static void slb_remove(unsigned long esid)
> +{
> +	asm volatile("isync ; slbie %0 ; isync" : : "r"(esid << 28) : "memory");
> +}
> +#endif
> +
> +static void slb_remove_all(void)
> +{
> +	asm volatile("isync ; slbmte %0,%0 ; slbia ; isync" : : "r"(0) : "memory");
> +}
> +
> +static void __nomcount slb_add(unsigned long ea)
> +{
> +	struct cpu_thread *cpu = this_cpu();
> +	uint64_t esid = ea >> 28;
> +	uint64_t vsid = ea >> 28;
> +
> +	slb_install(esid, vsid, cpu->vm_slb_rr);
> +
> +	cpu->vm_slb_rr++;
> +	if (cpu->vm_slb_rr == GLOBAL_SLB_NR)
> +		cpu->vm_slb_rr = 0;
> +}
> +
> +struct hpte {
> +	beint64_t dword[2];
> +};
> +
> +struct hpteg {
> +	struct hpte hpte[8];
> +};
> +
> +static struct hpteg *htab;
> +static unsigned long htab_shift;
> +static unsigned long htab_pteg_mask;
> +
> +static struct lock htab_lock;
> +
> +static void __nomcount htab_install(unsigned long va, unsigned long pa, int rw, int ex, int ci, bool local)
> +{
> +	unsigned long hash;
> +	struct hpteg *hpteg;
> +	struct hpte *hpte;
> +	unsigned long ava = va >> 23;
> +	unsigned long arpn = pa >> 12;
> +	unsigned long dw0, dw1;
> +	unsigned long _dw0;
> +	unsigned long _ava;
> +	unsigned int hstart, hend;
> +	unsigned int i;
> +
> +	if (PAGE_SIZE == 0x10000)
> +		arpn >>= 4;
> +
> +	dw0 = ava << (63-56); /* AVA = ava */
> +	dw0 |= 0x1; /* V = 1 */
> +	if (PAGE_SIZE == 0x10000)
> +		dw0 |= 0x4; /* L = 1 */
> +	if (local)
> +		dw0 |= 0x8; /* SW[0] = 1 */
> +
> +	if (PAGE_SIZE == 0x10000) {
> +		dw1 = (arpn << (63-43 - 4)); /* ARPN||LP-4 = arpn */
> +		dw1 |= (0x1 << (63-43 - 8)); /* LP = 0001 */
> +	} else
> +		dw1 = (arpn << (63-43 - 8)); /* ARPN||LP = arpn */
> +	if (!rw)
> +		dw1 |= (1UL << (63 - 0)) | (1UL << (63 - 63 + 1)); /* pp = 110 */
> +	if (!ex)
> +		dw1 |= (1UL << (63 - 61)); /* N = 1 */
> +	dw1 |= (1UL << (63 - 60 + 1)); /* WIMG = 0010 */
> +	if (ci)
> +		dw1 |= (1UL << (63 - 60)) | (1UL << (63 - 60 + 2)); /* WIMG = 0111 */
> +	dw1 |= (1UL << (63 - 55)) | (1UL << (63 - 56)); /* R=C=1 */
> +
> +	if (PAGE_SIZE == 0x10000)
> +		hash = ((va >> 16) & 0xfff) ^ ((va >> 28) & 0x7fffffffffUL);
> +	else
> +		hash = ((va >> 12) & 0xffff) ^ ((va >> 28) & 0x7fffffffffUL);
> +	hpteg = &htab[hash & htab_pteg_mask];
> +
> +	lock(&htab_lock);
> +
> +	hstart = 0;
> +	hend = 7;
> +
> +	for (i = hstart; i <= hend; i++) {
> +		hpte = &hpteg->hpte[i];
> +
> +		_dw0 = be64_to_cpu(hpte->dword[0]);
> +		if (_dw0 & 1) {
> +			_ava = _dw0 >> (63 - 56);
> +			if (_ava == ava) {
> +				assert(!local);
> +				/* This could happen with racing global fault */
> +				assert(dw0 == _dw0);
> +				assert(dw1 == be64_to_cpu(hpte->dword[1]));
> +				goto out;
> +			}
> +
> +			continue;
> +		}
> +
> +		assert(!_dw0);
> +		goto install;
> +	}
> +
> +	i = mftb();
> +	i = (i ^ (i >> 4)) & 0x7;
> +	hpte = &hpteg->hpte[i];
> +
> +install:
> +	hpte->dword[1] = cpu_to_be64(dw1);
> +	eieio();
> +	hpte->dword[0] = cpu_to_be64(dw0);
> +	asm volatile("ptesync" ::: "memory");
> +out:
> +	unlock(&htab_lock);
> +}
> +
> +static void htab_remove(unsigned long va, int local)
> +{
> +	struct cpu_thread *c = this_cpu();
> +	bool vm_setup = c->vm_setup;
> +	unsigned long hash;
> +	struct hpteg *hpteg;
> +	unsigned long ava = va >> 23;
> +	unsigned long dw0;
> +	unsigned long rb;
> +	unsigned int hstart, hend;
> +	unsigned int i;
> +
> +	dw0 = ava << (63-56);
> +	dw0 |= 0x1;
> +	if (PAGE_SIZE == 0x10000)
> +		dw0 |= 0x4;
> +	if (local)
> +		dw0 |= 0x8;
> +
> +	if (PAGE_SIZE == 0x10000)
> +		hash = ((va >> 16) & 0xfff) ^ ((va >> 28) & 0x7fffffffffUL);
> +	else
> +		hash = ((va >> 12) & 0xffff) ^ ((va >> 28) & 0x7fffffffffUL);
> +	hpteg = &htab[hash & htab_pteg_mask];
> +
> +	if (vm_setup)
> +		vm_exit();
> +	lock(&htab_lock);
> +	hstart = 0;
> +	hend = 7;
> +
> +	for (i = hstart; i <= hend; i++) {
> +		struct hpte *hpte = &hpteg->hpte[i];
> +		beint64_t _raw_dw0;
> +		uint64_t _dw0;
> +
> +		_raw_dw0 = hpte->dword[0];
> +		_dw0 = be64_to_cpu(_raw_dw0);
> +
> +		if (!(_dw0 & 1)) {
> +			assert(!_raw_dw0);
> +			continue;
> +		}
> +
> +		if (_dw0 != dw0)
> +			continue;
> +
> +		hpte->dword[0] = 0;
> +		eieio();
> +		hpte->dword[1] = 0;
> +
> +		break;
> +	}
> +
> +	if (PAGE_SIZE == 0x10000) {
> +		rb = (va >> 16) << (63 - 47); /* AVA||LP-4 */
> +		rb |= 0x1 << (63 - 51); /* LP=0001 */
> +		rb |= 0x1; /* L=1 */
> +	} else {
> +		rb = va & ~0xfffUL;
> +	}
> +
> +	unlock(&htab_lock);
> +
> +	if (vm_setup)
> +		vm_enter();
> +
> +	if (local) {
> +		asm volatile("ptesync" ::: "memory");
> +		asm volatile("tlbiel %0" : : "r"(rb));
> +		asm volatile("ptesync" ::: "memory");
> +	} else {
> +		asm volatile("ptesync" ::: "memory");
> +		asm volatile("tlbie %0,%1" : : "r"(rb), "r"(0));
> +		asm volatile("eieio ; tlbsync ; ptesync" ::: "memory");
> +
> +	}
> +}
> +
> +/*
> + * Try to fix problems in callers if !strict.
> + */
> +static bool vm_strict = false;
> +
> +static struct list_head vm_maps = LIST_HEAD_INIT(vm_maps);
> +static struct lock vm_maps_lock;
> +static unsigned long nr_vm_maps;
> +
> +static void __vm_map(const char *name, unsigned long addr, unsigned long len, unsigned long pa, bool r, bool w, bool x, bool ci, bool local)
> +{
> +	struct cpu_thread *c = this_cpu();
> +	bool vm_setup = c->vm_setup;
> +	struct vm_map *new;
> +	struct vm_map *vmm;
> +
> +	if (local) {
> +		new = &c->vm_local_map;
> +		new->name = name;
> +		new->address = addr;
> +		new->length = len;
> +		new->pa = pa;
> +		new->readable = r;
> +		new->writeable = w;
> +		new->executable = x;
> +		new->ci = ci;
> +
> +		return;
> +	}
> +
> +	new = zalloc(sizeof(*new));
> +	assert(new);
> +
> +	new->name = name;
> +	new->address = addr;
> +	new->length = len;
> +	new->pa = pa;
> +	new->readable = r;
> +	new->writeable = w;
> +	new->executable = x;
> +	new->ci = ci;
> +
> +	/* Can not take a d-side fault while holding this lock */
> +	if (vm_setup)
> +		vm_exit();
> +	lock(&vm_maps_lock);
> +
> +	list_for_each(&vm_maps, vmm, list) {
> +		unsigned long ps = addr & ~(PAGE_SIZE - 1);
> +		unsigned long pe = (addr + len + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1);
> +		unsigned long vmm_ps = vmm->address & ~(PAGE_SIZE - 1);
> +		unsigned long vmm_pe = (vmm->address + vmm->length + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1);
> +		bool mergeable = false;
> +		bool samepage = false;
> +
> +		/* Ensure no overlap */
> +		assert(addr + len <= vmm->address || addr >= vmm->address + vmm->length);
> +
> +		if (ps > vmm_pe)
> +			continue; /* Sort */
> +		if (pe < vmm_ps) {
> +			/* Not same or adjacent page is easy */
> +			list_add_before(&vm_maps, &new->list, &vmm->list);
> +			goto found;
> +		}
> +		if (pe > vmm_ps || ps < vmm_pe)
> +			samepage = true;
> +
> +		mergeable =	/* XXX: check pa */ 1 &&
> +				(vmm->ci == ci) &&
> +				(vmm->readable == r) &&
> +				(vmm->writeable == w) &&
> +				(vmm->executable == x);
> +		samepage = false;
> +
> +		if (samepage && !mergeable) {
> +			printf("VMM: %s (%lx-%lx) mismatched permissions with same page mapping %s (%llx-%llx)\n", name, addr, addr + len, vmm->name, vmm->address, vmm->address + vmm->length);
> +			assert(vmm->pa == pa);
> +			assert(vmm->ci == ci);
> +			assert(vmm->readable == r);
> +			assert(vmm->writeable == w);
> +			assert(vmm->executable == x);
> +		}
> +
> +		if (!strcmp(name, vmm->name) && mergeable) {
> +			if (addr == vmm->address + vmm->length) {
> +				free(new);
> +				vmm->length += len;
> +				goto done;
> +			}
> +
> +			if (addr + len == vmm->address) {
> +				free(new);
> +				vmm->address = addr;
> +				vmm->pa = pa;
> +				vmm->length += len;
> +				goto done;
> +			}
> +		}
> +
> +		if (addr >= vmm->address + vmm->length)
> +			continue;
> +		if (addr + len <= vmm->address) {
> +			list_add_before(&vm_maps, &new->list, &vmm->list);
> +			goto found;
> +		}
> +
> +		assert(0);
> +	}
> +	list_add_tail(&vm_maps, &new->list);
> +found:
> +	nr_vm_maps++;
> +done:
> +	unlock(&vm_maps_lock);
> +	if (vm_setup)
> +		vm_enter();
> +}
> +
> +static void __vm_unmap(unsigned long addr, unsigned long len, bool local)
> +{
> +	struct cpu_thread *c = this_cpu();
> +	bool vm_setup = c->vm_setup;
> +	unsigned long end = addr + len;
> +	struct vm_map *vmm, *to_free = NULL;
> +
> +	if (local) {
> +		vmm = &c->vm_local_map;
> +		assert(addr == vmm->address);
> +		assert(len == vmm->length);
> +		memset(vmm, 0, sizeof(struct vm_map));
> +
> +		if (vm_setup) {
> +			while (addr < end) {
> +				htab_remove(addr, local);
> +				addr += PAGE_SIZE;
> +			}
> +		}
> +
> +		return;
> +	}
> +
> +	/* Can not take a d-side fault while holding this lock */
> +	if (vm_setup)
> +		vm_exit();
> +	lock(&vm_maps_lock);
> +	list_for_each(&vm_maps, vmm, list) {
> +		struct vm_map *new;
> +
> +		if (addr + len <= vmm->address)
> +			continue;
> +		if (addr >= vmm->address + vmm->length)
> +			continue;
> +		if (addr == vmm->address && len == vmm->length) {
> +			to_free = vmm;
> +			goto found;
> +		}
> +
> +		if (addr == vmm->address) {
> +			vmm->address += len;
> +			vmm->pa += len;
> +			vmm->length -= len;
> +			goto done;
> +		}
> +
> +		if (addr + len == vmm->address + vmm->length) {
> +			vmm->length -= len;
> +			goto done;
> +		}
> +
> +		/* Unmaps will never span multiple because they always apply to a previous map, so this is a split */
> +		new = zalloc(sizeof(*new));
> +		assert(new);
> +		memcpy(new, vmm, sizeof(*new));
> +		list_add_before(&vm_maps, &new->list, &vmm->list);
> +		nr_vm_maps++;
> +
> +		new->length = addr - new->address;
> +		vmm->address += new->length + len;
> +		vmm->pa += new->length + len;
> +		vmm->length -= new->length + len;
> +		goto done;
> +	}
> +	vmm = NULL;
> +	unlock(&vm_maps_lock);
> +	if (!vm_strict) {
> +		prerror("unmap didn't find anything\n");
> +		backtrace();
> +		goto out;
> +	}
> +	assert(0);
> +
> +found:
> +	list_del(&vmm->list);
> +	nr_vm_maps--;
> +done:
> +	if (vm_setup) {
> +		while (addr < end) {
> +			htab_remove(addr, local);
> +			addr += PAGE_SIZE;
> +		}
> +	}
> +
> +	unlock(&vm_maps_lock);
> +out:
> +	if (vm_setup)
> +		vm_enter();
> +
> +	if (to_free)
> +		free(to_free);
> +}
> +
> +
> +void vm_map_global(const char *name, unsigned long addr, unsigned long len, bool rw, bool ci)
> +{
> +	assert(this_cpu()->state != cpu_state_os);
> +	__vm_map(name, addr, len, addr, true, rw, false, ci, false);
> +}
> +
> +void vm_map_global_text(const char *name, unsigned long addr, unsigned long len)
> +{
> +	assert(this_cpu()->state != cpu_state_os);
> +	__vm_map(name, addr, len, addr, true, false, true, false, false);
> +}
> +
> +void vm_unmap_global(unsigned long addr, unsigned long len)
> +{
> +	assert(this_cpu()->state != cpu_state_os);
> +	__vm_unmap(addr, len, false);
> +}
> +
> +
> +void *vm_map(unsigned long addr, unsigned long len, bool rw)
> +{
> +	struct cpu_thread *c = this_cpu();
> +	unsigned long newaddr;
> +	unsigned long end;
> +	unsigned long offset = addr & (PAGE_SIZE - 1);
> +
> +	end = (addr + len + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1);
> +	addr &= ~(PAGE_SIZE - 1);
> +	len = end - addr;
> +
> +	assert(len <= LOCAL_EA_PERCPU);
> +
> +	/* Can't do nested mappings */
> +	assert(!c->vm_local_map_inuse);
> +	c->vm_local_map_inuse = true;
> +
> +	if (c->vm_setup) {
> +		newaddr = LOCAL_EA_BEGIN + LOCAL_EA_PERCPU * c->pir;
> +		__vm_map("local", newaddr, len, addr, true, rw, false, false, true);
> +	} else {
> +		newaddr = addr;
> +	}
> +
> +	return (void *)newaddr + offset;
> +}
> +
> +void vm_unmap(unsigned long addr, unsigned long len)
> +{
> +	struct cpu_thread *c = this_cpu();
> +	unsigned long newaddr;
> +	unsigned long end;
> +
> +	end = (addr + len + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1);
> +	addr &= ~(PAGE_SIZE - 1);
> +	len = end - addr;
> +
> +	assert(len <= LOCAL_EA_PERCPU);
> +
> +	assert(c->vm_local_map_inuse);
> +	c->vm_local_map_inuse = false;
> +
> +	if (c->vm_setup) {
> +		newaddr = LOCAL_EA_BEGIN + LOCAL_EA_PERCPU * c->pir;
> +		__vm_unmap(newaddr, len, true);
> +	}
> +}
> +
> +struct prte {
> +	beint64_t dword[2];
> +};
> +
> +static struct prte *prtab;
> +static unsigned long old_lpcr;
> +static unsigned long new_lpcr;
> +
> +static void vm_init_cpu(void)
> +{
> +	struct cpu_thread *c = this_cpu();
> +	unsigned long ea = LOCAL_EA_BEGIN + LOCAL_EA_PERCPU * c->pir;
> +	unsigned long esid = ea >> 28;
> +	unsigned long vsid = ea >> 28;
> +
> +	mtspr(SPR_LPCR, new_lpcr);
> +
> +	mtspr(SPR_LPID, 0);
> +	mtspr(SPR_PID, 0);
> +	mtspr(SPR_HRMOR, 0);
> +	mtspr(SPR_PTCR, (unsigned long)prtab);
> +	mtspr(SPR_AMR, 0);
> +	mtspr(SPR_IAMR, 0);
> +	mtspr(SPR_AMOR, 0);
> +	mtspr(SPR_UAMOR, 0);
> +
> +	slb_remove_all();
> +	slb_install(esid, vsid, LOCAL_SLB_BASE);
> +}
> +
> +void vm_init_secondary(void)
> +{
> +	vm_init_cpu();
> +	vm_enter();
> +}
> +
> +bool vm_realmode(void)
> +{
> +	struct cpu_thread *c = this_cpu();
> +
> +	return !vm_setup || !c->vm_setup;
> +}
> +
> +void vm_enter(void)
> +{
> +	struct cpu_thread *c = this_cpu();
> +
> +	assert(vm_setup);
> +	if (c->vm_setup) {
> +		prerror("CPU:%d vm_enter already entered\n", c->pir);
> +		backtrace();
> +	}
> +	if (c->vm_local_map_inuse) {
> +		prerror("CPU:%d vm_enter local map inuse\n", c->pir);
> +		backtrace();
> +	}
> +
> +	c->vm_setup = true;
> +	mtmsr(mfmsr() | (MSR_IR|MSR_DR));
> +}
> +
> +/*
> + * Leave virtual mode on the calling CPU: clear the per-CPU vm_setup flag
> + * and turn off instruction and data relocation (MSR_IR | MSR_DR).
> + *
> + * Global VM setup must already have happened (asserted). Calling this
> + * when the CPU is already out of virtual mode, or while its local
> + * mapping is still in use, is only reported (message + backtrace); the
> + * function then carries on regardless.
> + */
> +void vm_exit(void)
> +{
> +	struct cpu_thread *c = this_cpu();
> +
> +	assert(vm_setup);
> +	if (!c->vm_setup) {
> +		prerror("CPU:%d vm_exit already exited\n", c->pir);
> +		backtrace();
> +	}
> +	if (c->vm_local_map_inuse) {
> +		/* Name this function, not vm_enter, so the log is not misleading */
> +		prerror("CPU:%d vm_exit local map inuse\n", c->pir);
> +		backtrace();
> +	}
> +	c->vm_setup = false;
> +	mtmsr(mfmsr() & ~(MSR_IR|MSR_DR));
> +}
> +
> +bool __nomcount vm_dslb(uint64_t nia, uint64_t dar)
> +{
> +	/*
> +	 * Per-cpu map ranges are bolted to per-cpu SLBs.
> +	 */
> +	assert((dar < LOCAL_EA_BEGIN) ||
> +		(dar >= LOCAL_EA_END));
> +
> +	(void)nia;
> +	slb_add(dar);
> +
> +	return true;
> +}
> +
> +bool __nomcount vm_islb(uint64_t nia)
> +{
> +	slb_add(nia);
> +
> +	return true;
> +}
> +
> +bool __nomcount vm_dsi(uint64_t nia, uint64_t dar, uint32_t dsisr)
> +{
> +	struct cpu_thread *c = this_cpu();
> +	struct vm_map *vmm;
> +	uint64_t pa;
> +	bool store = !!(dsisr & DSISR_ISSTORE);
> +	bool ret = true;
> +	bool local;
> +
> +	if (dsisr & 0xbdffffffU) {
> +		printf("Page fault bad dsisr at 0x%016llx dar=0x%016llx dsisr=0x%08x\n", nia, dar, dsisr);
> +		return false;
> +	}
> +
> +	if ((dar >= LOCAL_EA_BEGIN) && (dar < LOCAL_EA_END)) {
> +		local = true;
> +		vmm = &c->vm_local_map;
> +		if (dar >= vmm->address && dar < vmm->address + vmm->length)
> +			goto found;
> +		goto not_found;
> +	}
> +
> +	local = false;
> +
> +	lock(&vm_maps_lock);
> +	list_for_each(&vm_maps, vmm, list) {
> +		assert(vmm->pa == vmm->address);
> +		if (dar >= vmm->address && dar < vmm->address + vmm->length)
> +			goto found;
> +	}
> +	if (!vm_strict) {
> +		if (dar >= 0x0006000000000000 && dar < 0x0007000000000000)
> +			/* MMIO */
> +			htab_install(dar, dar, 1, 0, 1, false);
> +		else if (dar < LOCAL_EA_BEGIN)
> +			htab_install(dar, dar, 1, 0, 0, false);
> +		else
> +			ret = false;
> +		unlock(&vm_maps_lock);
> +		prerror("Page fault with no VMM at NIA:0x%016llx DAR:0x%016llx, store:%d\n", nia, dar, store);
> +		backtrace();
> +		list_for_each(&vm_maps, vmm, list)
> +			prlog(PR_DEBUG, "%28s 0x%08llx-0x%08llx\n", vmm->name,
> +				vmm->address, vmm->address + vmm->length);
> +		goto out;
> +	}
> +	unlock(&vm_maps_lock);
> +not_found:
> +	prerror("  vmm not found\n");
> +	ret = false;
> +	assert(0);
> +	goto out;
> +
> +found:
> +	pa = vmm->pa + (dar & ~(PAGE_SIZE - 1)) - vmm->address;
> +	if (!vmm->readable) {
> +		if (!local)
> +			unlock(&vm_maps_lock);
> +		prerror("  vmm not readable\n");
> +		ret = false;
> +		assert(0);
> +		goto out;
> +	}
> +	if (store && !vmm->writeable) {
> +		if (!vm_strict) {
> +			htab_install(dar, pa, store, 0, vmm->ci, local);
> +			if (!local)
> +				unlock(&vm_maps_lock);
> +			prerror("Page fault store to RO VMM:%s at NIA:0x%016llx DAR:0x%016llx\n", vmm->name, nia, dar);
> +			backtrace();
> +			goto out;
> +		}
> +		if (!local)
> +			unlock(&vm_maps_lock);
> +		prerror("  vmm not writeable\n");
> +		ret = false;
> +		assert(0);
> +		goto out;
> +	}
> +
> +	htab_install(dar, pa, vmm->writeable, vmm->executable, vmm->ci, local);
> +	if (!local)
> +		unlock(&vm_maps_lock);
> +
> +out:
> +	return ret;
> +}
> +
> +bool __nomcount vm_isi(uint64_t nia)
> +{
> +	struct vm_map *vmm;
> +
> +	lock(&vm_maps_lock);
> +	list_for_each(&vm_maps, vmm, list) {
> +		assert(vmm->pa == vmm->address);
> +		if (nia >= vmm->address && nia < vmm->address + vmm->length) {
> +			if (!vmm->executable)
> +				prerror("Page fault at NIA:0x%016llx NX mapping!\n", nia);
> +			goto found;
> +		}
> +	}
> +
> +	prerror("Page fault, no mapping for NIA:0x%016llx !\n", nia);
> +
> +found:
> +	unlock(&vm_maps_lock);
> +	htab_install(nia, nia, 0, 1, 0, false);
> +
> +	return true;
> +}
> +
> +static void cpu_stop_vm(void *arg __unused)
> +{
> +	vm_exit();
> +}
> +
> +static void cpu_cleanup_vm(void *arg __unused)
> +{
> +	slb_remove_all();
> +	mtspr(SPR_PTCR, 0);
> +	mtspr(SPR_LPCR, old_lpcr);
> +}
> +
> +/*
> + * Tear down virtual memory on every available CPU.
> + *
> + * Runs in two phases, each queued as a job on every other available CPU
> + * and executed directly on the calling CPU:
> + *   1. cpu_stop_vm    - each CPU leaves virtual mode.
> + *   2. cpu_cleanup_vm - each CPU drops its SLB entries and restores
> + *                       PTCR/LPCR.
> + * Phase 2 is only queued after every phase 1 job has been waited for.
> + * cleanup_global_tlb() is called once both phases have completed.
> + */
> +static void cpu_all_destroy_vm(void)
> +{
> +	struct cpu_thread *cpu;
> +	struct cpu_job **jobs;
> +
> +	/*
> +	 * jobs[] is indexed by PIR, so it needs cpu_max_pir + 1 slots.
> +	 * The parentheses matter: without them only cpu_max_pir pointers
> +	 * (plus one byte) are allocated and the last slot overflows.
> +	 */
> +	jobs = zalloc(sizeof(*jobs) * (cpu_max_pir + 1));
> +	assert(jobs);
> +
> +	/* Stop all CPUs */
> +	for_each_available_cpu(cpu) {
> +		if (cpu == this_cpu())
> +			continue;
> +		jobs[cpu->pir] = cpu_queue_job(cpu, "cpu_stop_vm",
> +						cpu_stop_vm, NULL);
> +	}
> +
> +	/* this cpu */
> +	cpu_stop_vm(NULL);
> +
> +	/* Cleanup after all stop */
> +	for_each_available_cpu(cpu) {
> +		/* This CPU's slot was never filled and is still NULL from zalloc */
> +		if (jobs[cpu->pir])
> +			cpu_wait_job(jobs[cpu->pir], true);
> +	}
> +
> +	for_each_available_cpu(cpu) {
> +		if (cpu == this_cpu())
> +			continue;
> +		jobs[cpu->pir] = cpu_queue_job(cpu, "cpu_cleanup_vm",
> +						cpu_cleanup_vm, NULL);
> +	}
> +
> +	/* this cpu */
> +	cpu_cleanup_vm(NULL);
> +
> +	for_each_available_cpu(cpu) {
> +		if (jobs[cpu->pir])
> +			cpu_wait_job(jobs[cpu->pir], true);
> +	}
> +
> +	free(jobs);
> +
> +	cleanup_global_tlb();
> +}
> +
> +static void print_maps(void)
> +{
> +	struct vm_map *vmm;
> +
> +	prlog(PR_DEBUG, " %lu Global mappings\n", nr_vm_maps);
> +	list_for_each(&vm_maps, vmm, list) {
> +		prlog(PR_DEBUG, "%28s 0x%08llx-0x%08llx\n", vmm->name,
> +			vmm->address, vmm->address + vmm->length);
> +	}
> +}
> +
> +void vm_init(bool fast_reboot)
> +{
> +	unsigned long stack_start = SKIBOOT_BASE + SKIBOOT_SIZE;
> +	unsigned long stack_end = stack_start + (cpu_max_pir + 1)*STACK_SIZE;
> +	unsigned long sym_start = (unsigned long)__sym_map_start;
> +	unsigned long sym_size = (unsigned long)__sym_map_end - sym_start;
> +	unsigned long htab_nr_bytes;
> +	unsigned long htab_nr_ptegs;
> +
> +	old_lpcr = mfspr(SPR_LPCR);
> +	new_lpcr = (old_lpcr & ~(PPC_BITMASK(0,3) | PPC_BIT(41) | PPC_BIT(43)))
> +								| PPC_BIT(54);
> +
> +	prtab = memalign(64*1024, 64*1024);
> +	assert(prtab);
> +	memset(prtab, 0, 64*1024);
> +
> +	htab_shift = 18; /* 256kB table */
> +	htab_nr_bytes = 1UL << htab_shift;
> +	htab_nr_ptegs = htab_nr_bytes / sizeof(struct hpteg);
> +	htab_pteg_mask = htab_nr_ptegs - 1;
> +	htab = memalign(1UL << htab_shift, htab_nr_bytes);
> +	assert(htab);
> +	memset(htab, 0, htab_nr_bytes);
> +
> +	prtab[0].dword[0] = cpu_to_be64((unsigned long)htab | (htab_shift - 18));
> +	prtab[0].dword[1] = 0;
> +
> +	eieio();
> +
> +	vm_init_cpu();
> +
> +	cleanup_global_tlb();
> +
> +	if (vm_globals_allocated) {
> +		assert(fast_reboot);
> +		goto done;
> +	}
> +
> +	assert(!fast_reboot);
> +	vm_globals_allocated = true;
> +
> +	vm_map_global_text("OPAL text", (unsigned long)_stext,
> +			   (unsigned long)_etext - (unsigned long)_stext);
> +	vm_map_global("OPAL rodata", (unsigned long)__rodata_start,
> +		      (unsigned long)__vm_mapped_romem_end - (unsigned long)__rodata_start,
> +		      false, false);
> +	vm_map_global("OPAL data", (unsigned long)_sdata,
> +		      (unsigned long)_edata - (unsigned long)_sdata,
> +		      true, false);
> +	vm_map_global("OPAL symbols", sym_start, sym_size, false, false);
> +	vm_map_global("OPAL bss", (unsigned long)_sbss,
> +		      (unsigned long)_ebss - (unsigned long)_sbss,
> +		      true, false);
> +	vm_map_global("OPAL heap", HEAP_BASE, HEAP_SIZE, true, false);
> +	vm_map_global("Memory console", INMEM_CON_START, INMEM_CON_LEN, true, false);
> +	vm_map_global("Hostboot console", HBRT_CON_START, HBRT_CON_LEN, false, false);
> +	vm_map_global("SPIRA heap", SPIRA_HEAP_BASE, SPIRA_HEAP_SIZE, false, false);
> +	vm_map_global("PSI TCE table", PSI_TCE_TABLE_BASE, PSI_TCE_TABLE_SIZE, false, false);
> +	vm_map_global("OPAL boot stacks", stack_start, stack_end - stack_start, true, false);
> +
> +done:
> +	prlog(PR_DEBUG, "VMM: SETUP\n");
> +	prlog(PR_DEBUG, " PRTAB:%p\n", prtab);
> +	prlog(PR_DEBUG, " HTAB: %p\n", htab);
> +	print_maps();
> +
> +	vm_setup = true;
> +
> +	vm_enter();
> +}
> +
> +void vm_init_stacks(void)
> +{
> +	unsigned long stack_start = SKIBOOT_BASE + SKIBOOT_SIZE;
> +	unsigned long stack_end = stack_start + (cpu_max_pir + 1)*STACK_SIZE;
> +	struct cpu_thread *c = this_cpu();
> +	struct vm_map *vmm;
> +
> +	/* Can not take a d-side fault while holding this lock */
> +	if (c->vm_setup)
> +		mtmsr(mfmsr() & ~MSR_DR);
> +	lock(&vm_maps_lock);
> +	list_for_each(&vm_maps, vmm, list) {
> +		if (vmm->address >= stack_end)
> +			continue;
> +		if (vmm->address + vmm->length <= stack_start)
> +			continue;
> +		goto found;
> +	}
> +	unlock(&vm_maps_lock);
> +	assert(0);
> +
> +found:
> +	vmm->name = "OPAL stacks";
> +	vmm->address = stack_start;
> +	vmm->length = stack_end - stack_start;
> +	unlock(&vm_maps_lock);
> +	if (c->vm_setup)
> +		mtmsr(mfmsr() | MSR_DR);
> +}
> +
> +void vm_destroy(void)
> +{
> +	assert(vm_setup);
> +
> +	prlog(PR_DEBUG, "VMM: TEARDOWN\n");
> +	print_maps();
> +
> +	cpu_all_destroy_vm();
> +
> +	vm_setup = false;
> +
> +	if (0) { /* XXX: leave for VMM enabled fast-reboot */
> +		while (!list_empty(&vm_maps)) {
> +			struct vm_map *vmm;
> +			vmm = list_pop(&vm_maps, struct vm_map, list);
> +			free(vmm);
> +		}
> +	}
> +
> +	free(htab);
> +	htab = NULL;
> +	free(prtab);
> +	prtab = NULL;
> +}
> diff --git a/hdata/spira.c b/hdata/spira.c
> index 35d6109d3..870903bd8 100644
> --- a/hdata/spira.c
> +++ b/hdata/spira.c
> @@ -1703,11 +1703,20 @@ static void fixup_spira(void)
>  static void update_spirah_addr(void)
>  {
>  #if !defined(TEST)
> +	beint64_t *spirah_offset;
> +	beint64_t *spira_offset;
> +
>  	if (proc_gen < proc_gen_p9)
>  		return;
>  
> -	naca.spirah_addr = CPU_TO_BE64(SPIRAH_OFF);
> -	naca.spira_addr = CPU_TO_BE64(SPIRA_OFF);
> +	spirah_offset = vm_map((u64)&naca, sizeof(u64), true);
> +	*spirah_offset = CPU_TO_BE64(SPIRAH_OFF);
> +	vm_unmap((unsigned long)spirah_offset, sizeof(u64));
> +
> +	spira_offset = vm_map((u64)&naca + 0x30, sizeof(u64), true);
> +	*spira_offset = CPU_TO_BE64(SPIRA_OFF);
> +	vm_unmap((unsigned long)spira_offset, sizeof(u64));
> +
>  	spirah.ntuples.hs_data_area.addr = CPU_TO_BE64(SPIRA_HEAP_BASE - SKIBOOT_BASE);
>  	spirah.ntuples.mdump_res.addr = CPU_TO_BE64(MDRT_TABLE_BASE - SKIBOOT_BASE);
>  #endif
> @@ -1715,13 +1724,24 @@ static void update_spirah_addr(void)
>  
>  int parse_hdat(bool is_opal)
>  {
> +	int ret = 0;
> +
>  	cpu_type = PVR_TYPE(mfspr(SPR_PVR));
>  
>  	prlog(PR_DEBUG, "Parsing HDAT...\n");
>  
> +	vm_map_global("SPIRA", SKIBOOT_BASE + SPIRA_OFF, sizeof(spira), true, false);
>  	fixup_spira();
> +	vm_unmap_global(SKIBOOT_BASE + SPIRA_OFF, sizeof(spira));
>  
> +	vm_map_global("SPIRA-H", SKIBOOT_BASE + SPIRAH_OFF, sizeof(spirah), true, false);
>  	update_spirah_addr();
> +	vm_unmap_global(SKIBOOT_BASE + SPIRAH_OFF, sizeof(spirah));
> +
> +	/* Downgrade to read-only */
> +
> +	vm_map_global("SPIRA", SKIBOOT_BASE + SPIRA_OFF, sizeof(spira), false, false);
> +	vm_map_global("SPIRA-H", SKIBOOT_BASE + SPIRAH_OFF, sizeof(spirah), false, false);
>  
>  	/*
>  	 * Basic DT root stuff
> @@ -1742,8 +1762,10 @@ int parse_hdat(bool is_opal)
>  	dt_init_led_node();
>  
>  	/* Parse PCIA */
> -	if (!pcia_parse())
> -		return -1;
> +	if (!pcia_parse()) {
> +		ret = -1;
> +		goto out;
> +	}
>  
>  	/* IPL params */
>  	add_iplparams();
> @@ -1789,6 +1811,9 @@ int parse_hdat(bool is_opal)
>  		node_stb_parse();
>  
>  	prlog(PR_DEBUG, "Parsing HDAT...done\n");
> +out:
> +	vm_unmap_global(SKIBOOT_BASE + SPIRA_OFF, sizeof(spira));
> +	vm_unmap_global(SKIBOOT_BASE + SPIRAH_OFF, sizeof(spirah));
>  
> -	return 0;
> +	return ret;
>  }
> diff --git a/hw/fake-nvram.c b/hw/fake-nvram.c
> index 44adde4a3..d1ed62e9e 100644
> --- a/hw/fake-nvram.c
> +++ b/hw/fake-nvram.c
> @@ -23,12 +23,16 @@ int fake_nvram_info(uint32_t *total_size)
>  
>  int fake_nvram_start_read(void *dst, uint32_t src, uint32_t len)
>  {
> +	void *t;
> +
>  	if (!nvram_region)
>  		return -ENODEV;
>  
> +	t = vm_map(nvram_region->start + src, len, false);
>  	lock(&fake_nvram_lock);
> -	memcpy(dst, (void *) (nvram_region->start + src), len);
> +	memcpy(dst, t, len);
>  	unlock(&fake_nvram_lock);
> +	vm_unmap(nvram_region->start + src, len);
>  
>  	nvram_read_complete(true);
>  
> @@ -37,12 +41,16 @@ int fake_nvram_start_read(void *dst, uint32_t src, uint32_t len)
>  
>  int fake_nvram_write(uint32_t offset, void *src, uint32_t size)
>  {
> +	void *t;
> +
>  	if (!nvram_region)
>  		return OPAL_HARDWARE;
>  
> +	t = vm_map(nvram_region->start + offset, size, true);
>  	lock(&fake_nvram_lock);
> -	memcpy((void *) (nvram_region->start + offset), src, size);
> +	memcpy(t, src, size);
>  	unlock(&fake_nvram_lock);
> +	vm_unmap(nvram_region->start + offset, size);
>  
>  	return 0;
>  }
> diff --git a/hw/homer.c b/hw/homer.c
> index c5dbd58e3..58d629d23 100644
> --- a/hw/homer.c
> +++ b/hw/homer.c
> @@ -108,6 +108,9 @@ static void homer_init_chip(struct proc_chip *chip)
>  
>  		chip->homer_base = hbase;
>  		chip->homer_size = hsize;
> +		/* slw late init and xive late init want to write to HOMER */
> +		/* XXX: make it read only until then? */
> +		vm_map_global("HOMER Image", hbase, hsize, true, false);
>  	}
>  
>  	/*
> @@ -134,13 +137,21 @@ static void homer_init_chip(struct proc_chip *chip)
>  		chip->slw_base = sbase;
>  		chip->slw_bar_size = ssize;
>  		chip->slw_image_size = ssize; /* will be adjusted later */
> +		/* XXX */
>  	}
>  
>  	if (read_pba_bar(chip, bar_occ_common, &obase, &osize)) {
> -		prlog(PR_DEBUG, "  OCC Common Area at 0x%llx size %lldMB\n",
> -		      obase, osize / 0x100000);
> +		static uint64_t homer_obase = 0;
> +
>  		chip->occ_common_base = obase;
>  		chip->occ_common_size = osize;
> +
> +		prlog(PR_DEBUG, "  OCC Common Area at 0x%llx size %lldMB\n",
> +		      obase, osize / 0x100000);
> +		if (obase != homer_obase) {
> +			vm_map_global("OCC Common Area", obase, osize, false, false);
> +			homer_obase = obase;
> +		}
>  	}
>  }
>  
> diff --git a/hw/lpc-uart.c b/hw/lpc-uart.c
> index 979a617c3..898fc4b1c 100644
> --- a/hw/lpc-uart.c
> +++ b/hw/lpc-uart.c
> @@ -59,7 +59,7 @@ static uint32_t uart_base;
>  static bool has_irq = false, irq_ok, rx_full, tx_full;
>  static uint8_t tx_room;
>  static uint8_t cached_ier;
> -static void *mmio_uart_base;
> +void *mmio_uart_base;
>  static int uart_console_policy = UART_CONSOLE_OPAL;
>  static int lpc_irq = -1;
>  
> @@ -591,6 +591,8 @@ void early_uart_init(void)
>  	if (!mmio_uart_base)
>  		return;
>  
> +	vm_map_global("UART MMIO", (unsigned long)mmio_uart_base, 8, true, true);
> +
>  	clk = dt_prop_get_u32(uart_node, "clock-frequency");
>  	baud = dt_prop_get_u32(uart_node, "current-speed");
>  
> @@ -599,6 +601,7 @@ void early_uart_init(void)
>  		prlog(PR_DEBUG, "UART: Using UART at %p\n", mmio_uart_base);
>  	} else {
>  		prerror("UART: Early init failed!");
> +		vm_unmap_global((unsigned long)mmio_uart_base, 8);
>  		mmio_uart_base = NULL;
>  	}
>  }
> @@ -610,9 +613,6 @@ void uart_init(void)
>  	char *path __unused;
>  	const be32 *irqp;
>  
> -	/* Clean up after early_uart_init() */
> -	mmio_uart_base = NULL;
> -
>  	/* UART lock is in the console path and thus must block
>  	 * printf re-entrancy
>  	 */
> @@ -630,13 +630,28 @@ void uart_init(void)
>  	 * directly mapped UARTs in simulation environments
>  	 */
>  	if (n->parent == dt_root) {
> +		void *base;
> +
>  		printf("UART: Found at root !\n");
> -		mmio_uart_base = (void *)dt_translate_address(n, 0, NULL);
> -		if (!mmio_uart_base) {
> +
> +		base = (void *)dt_translate_address(n, 0, NULL);
> +		if (!base) {
>  			printf("UART: Failed to translate address !\n");
>  			return;
>  		}
>  
> +		if (mmio_uart_base != base) {
> +			void *old;
> +
> +			vm_map_global("UART MMIO", (unsigned long)base, 8, true, true);
> +			old = mmio_uart_base;
> +			mmio_uart_base = base;
> +
> +			/* Clean up after early_uart_init() */
> +			if (old)
> +				vm_unmap_global((unsigned long)old, 8);
> +		}
> +
>  		/* If it has an interrupt properly, we consider this to be
>  		 * a direct XICS/XIVE interrupt
>  		 */
> @@ -665,6 +680,11 @@ void uart_init(void)
>  			lpc_irq = be32_to_cpu(*irqp);
>  			prlog(PR_DEBUG, "UART: Using LPC IRQ %d\n", lpc_irq);
>  		}
> +
> +		if (mmio_uart_base) {
> +//			vm_unmap_global((unsigned long)mmio_uart_base, 8);
> +			mmio_uart_base = NULL;
> +		}
>  	}
>  
>  
> diff --git a/hw/lpc.c b/hw/lpc.c
> index c2a07a0db..cb2fed2a2 100644
> --- a/hw/lpc.c
> +++ b/hw/lpc.c
> @@ -1239,6 +1239,7 @@ static void lpc_init_chip_p8(struct dt_node *xn)
>  	chip->lpc = lpc;
>  }
>  
> +void *mmio_uart_base;
>  static void lpc_init_chip_p9(struct dt_node *opb_node)
>  {
>  	uint32_t gcid = dt_get_chip_id(opb_node);
> @@ -1261,6 +1262,11 @@ static void lpc_init_chip_p9(struct dt_node *opb_node)
>  	if (!lpc_node)
>  		return;
>  
> +
> +	if (mmio_uart_base)
> +		vm_unmap_global((unsigned long)mmio_uart_base, 8);
> +	vm_map_global("LPC MMIO", addr, 0x100000000UL /* XXX: size? */, true, true);
> +
>  	lpc = zalloc(sizeof(struct lpcm));
>  	assert(lpc);
>  	lpc->chip_id = gcid;
> diff --git a/hw/phb4.c b/hw/phb4.c
> index 60e797cf6..2447c6722 100644
> --- a/hw/phb4.c
> +++ b/hw/phb4.c
> @@ -5830,6 +5830,7 @@ static void phb4_probe_stack(struct dt_node *stk_node, uint32_t pec_index,
>  	uint64_t val, phb_bar = 0, irq_bar = 0, bar_en;
>  	uint64_t mmio0_bar = 0, mmio0_bmask, mmio0_sz;
>  	uint64_t mmio1_bar = 0, mmio1_bmask, mmio1_sz;
> +	uint64_t bar_sz;
>  	void *foo;
>  	__be64 mmio_win[4];
>  	unsigned int mmio_win_sz;
> @@ -5858,7 +5859,8 @@ static void phb4_probe_stack(struct dt_node *stk_node, uint32_t pec_index,
>  	bar_en = 0;
>  
>  	/* Initialize PHB register BAR */
> -	phys_map_get(gcid, PHB4_REG_SPC, phb_num, &phb_bar, NULL);
> +	phys_map_get(gcid, PHB4_REG_SPC, phb_num, &phb_bar, &bar_sz);
> +	vm_map_global("PHB REGS", phb_bar, bar_sz, true, true);
>  	rc = xscom_write(gcid, nest_stack + XPEC_NEST_STK_PHB_REG_BAR,
>  			 phb_bar << 8);
>  
> @@ -5872,18 +5874,21 @@ static void phb4_probe_stack(struct dt_node *stk_node, uint32_t pec_index,
>  	bar_en |= XPEC_NEST_STK_BAR_EN_PHB;
>  
>  	/* Same with INT BAR (ESB) */
> -	phys_map_get(gcid, PHB4_XIVE_ESB, phb_num, &irq_bar, NULL);
> +	phys_map_get(gcid, PHB4_XIVE_ESB, phb_num, &irq_bar, &bar_sz);
> +	vm_map_global("PHB IRQ", irq_bar, bar_sz, true, true);
>  	xscom_write(gcid, nest_stack + XPEC_NEST_STK_IRQ_BAR, irq_bar << 8);
>  	bar_en |= XPEC_NEST_STK_BAR_EN_INT;
>  
>  
>  	/* Same with MMIO windows */
>  	phys_map_get(gcid, PHB4_64BIT_MMIO, phb_num, &mmio0_bar, &mmio0_sz);
> +	vm_map_global("PHB MMIO0", mmio0_bar, mmio0_sz, true, true);
>  	mmio0_bmask =  (~(mmio0_sz - 1)) & 0x00FFFFFFFFFFFFFFULL;
>  	xscom_write(gcid, nest_stack + XPEC_NEST_STK_MMIO_BAR0, mmio0_bar << 8);
>  	xscom_write(gcid, nest_stack + XPEC_NEST_STK_MMIO_BAR0_MASK, mmio0_bmask << 8);
>  
>  	phys_map_get(gcid, PHB4_32BIT_MMIO, phb_num, &mmio1_bar, &mmio1_sz);
> +	vm_map_global("PHB MMIO1", mmio1_bar, mmio1_sz, true, true);
>  	mmio1_bmask =  (~(mmio1_sz - 1)) & 0x00FFFFFFFFFFFFFFULL;
>  	xscom_write(gcid, nest_stack + XPEC_NEST_STK_MMIO_BAR1, mmio1_bar << 8);
>  	xscom_write(gcid, nest_stack + XPEC_NEST_STK_MMIO_BAR1_MASK, mmio1_bmask << 8);
> diff --git a/hw/psi.c b/hw/psi.c
> index 63fcb257e..45f11c6b9 100644
> --- a/hw/psi.c
> +++ b/hw/psi.c
> @@ -908,6 +908,8 @@ static bool psi_init_psihb(struct dt_node *psihb)
>  
>  	list_add(&psis, &psi->list);
>  
> +	vm_map_global("PSI", (unsigned long)psi->regs, 0x100, true, true);
> +
>  	val = in_be64(psi->regs + PSIHB_CR);
>  	if (val & PSIHB_CR_FSP_LINK_ACTIVE) {
>  		lock(&psi_lock);
> diff --git a/hw/slw.c b/hw/slw.c
> index beb129a86..ccb100087 100644
> --- a/hw/slw.c
> +++ b/hw/slw.c
> @@ -151,7 +151,7 @@ static void slw_patch_reset(void)
>  		*(sav++) = *(dst);
>  		*(dst++) = *(src++);
>  	}
> -	sync_icache();
> +	sync_icache(0);
>  }
>  
>  static void slw_unpatch_reset(void)
> @@ -167,7 +167,7 @@ static void slw_unpatch_reset(void)
>  		*(dst++) = *(sav++);
>  		src++;
>  	}
> -	sync_icache();
> +	sync_icache(0);
>  }
>  
>  static bool slw_general_init(struct proc_chip *chip, struct cpu_thread *c)
> diff --git a/hw/xive.c b/hw/xive.c
> index 9a36f1ab2..c6aed7c9f 100644
> --- a/hw/xive.c
> +++ b/hw/xive.c
> @@ -1397,6 +1397,7 @@ static bool xive_configure_bars(struct xive *x)
>  
>  	/* IC BAR */
>  	phys_map_get(chip_id, XIVE_IC, 0, (uint64_t *)&x->ic_base, &x->ic_size);
> +	vm_map_global("XIVE IC", (unsigned long)x->ic_base, x->ic_size, true, true);
>  	val = (uint64_t)x->ic_base | CQ_IC_BAR_VALID;
>  	if (IC_PAGE_SIZE == 0x10000) {
>  		val |= CQ_IC_BAR_64K;
> @@ -1412,6 +1413,8 @@ static bool xive_configure_bars(struct xive *x)
>  	 * all phys_map_get(XIVE_TM) calls.
>  	 */
>  	phys_map_get(0, XIVE_TM, 0, (uint64_t *)&x->tm_base, &x->tm_size);
> +	if (chip_id == 0)
> +		vm_map_global("XIVE TM", (unsigned long)x->tm_base, x->tm_size, true, true);
>  	val = (uint64_t)x->tm_base | CQ_TM_BAR_VALID;
>  	if (TM_PAGE_SIZE == 0x10000) {
>  		x->tm_shift = 16;
> @@ -1427,6 +1430,7 @@ static bool xive_configure_bars(struct xive *x)
>  
>  	/* PC BAR. Clear first, write mask, then write value */
>  	phys_map_get(chip_id, XIVE_PC, 0, (uint64_t *)&x->pc_base, &x->pc_size);
> +	vm_map_global("XIVE PC", (unsigned long)x->pc_base, x->pc_size, true, true);
>  	xive_regwx(x, CQ_PC_BAR, 0);
>  	if (x->last_reg_error)
>  		return false;
> @@ -1441,6 +1445,7 @@ static bool xive_configure_bars(struct xive *x)
>  
>  	/* VC BAR. Clear first, write mask, then write value */
>  	phys_map_get(chip_id, XIVE_VC, 0, (uint64_t *)&x->vc_base, &x->vc_size);
> +	vm_map_global("XIVE VC", (unsigned long)x->vc_base, x->vc_size, true, true);
>  	xive_regwx(x, CQ_VC_BAR, 0);
>  	if (x->last_reg_error)
>  		return false;
> diff --git a/hw/xscom.c b/hw/xscom.c
> index 0eda567fc..ef1a83fd4 100644
> --- a/hw/xscom.c
> +++ b/hw/xscom.c
> @@ -931,6 +931,7 @@ void xscom_init(void)
>  		const struct dt_property *reg;
>  		struct proc_chip *chip;
>  		const char *chip_name;
> +		u64 size;
>  		static const char *chip_names[] = {
>  			"UNKNOWN", "P8E", "P8", "P8NVL", "P9N", "P9C", "P9P"
>  		};
> @@ -945,6 +946,9 @@ void xscom_init(void)
>  		assert(reg);
>  
>  		chip->xscom_base = dt_translate_address(xn, 0, NULL);
> +		size = dt_property_get_u64(reg, 1);
> +
> +		vm_map_global("XSCOM MMIO", chip->xscom_base, size, true, true);
>  
>  		/* Grab processor type and EC level */
>  		xscom_init_chip_info(chip);
> diff --git a/include/cmpxchg.h b/include/cmpxchg.h
> index 0304e9134..835743cf5 100644
> --- a/include/cmpxchg.h
> +++ b/include/cmpxchg.h
> @@ -5,6 +5,9 @@
>  #define __CMPXCHG_H
>  
>  #ifndef __TEST__
> +#include <stdint.h>
> +#include <processor.h>
> +
>  /*
>   * Bare cmpxchg, no barriers.
>   */
> diff --git a/include/cpu.h b/include/cpu.h
> index 8ef20e35b..026328904 100644
> --- a/include/cpu.h
> +++ b/include/cpu.h
> @@ -12,6 +12,19 @@
>  #include <stack.h>
>  #include <timer.h>
>  
> +struct vm_map {
> +	struct list_node list;
> +
> +	const char *name;
> +	uint64_t address;
> +	uint64_t pa;
> +	uint64_t length;
> +	bool readable;
> +	bool writeable;
> +	bool executable;
> +	bool ci;
> +};
> +
>  /*
>   * cpu_thread is our internal structure representing each
>   * thread in the system
> @@ -71,10 +84,19 @@ struct cpu_thread {
>  	struct bt_entry			stack_bot_bt[CPU_BACKTRACE_SIZE];
>  	struct bt_metadata		stack_bot_bt_metadata;
>  #endif
> +	/*
> +	 * Per-thread VM parameters
> +	 */
> +	struct vm_map			vm_local_map; /* per-cpu map */
> +	bool				vm_local_map_inuse;
> +	uint8_t				vm_slb_rr; /* RR allocator */
> +	bool				vm_setup; /* virtual memory is up */
> +
>  	struct lock			job_lock;
>  	struct list_head		job_queue;
>  	uint32_t			job_count;
>  	bool				job_has_no_return;
> +
>  	/*
>  	 * Per-core mask tracking for threads in HMI handler and
>  	 * a cleanup done bit.
> diff --git a/include/elf-abi.h b/include/elf-abi.h
> index 29c757642..34b95d337 100644
> --- a/include/elf-abi.h
> +++ b/include/elf-abi.h
> @@ -21,7 +21,16 @@
>  static inline uint64_t function_entry_address(void *func)
>  {
>  #ifdef ELF_ABI_v2
> -	u32 *insn = func;
> +	u32 *ret = func;
> +	u32 *i;
> +	u32 insn;
> +	u32 insn2;
> +
> +	i = vm_map((unsigned long)func, sizeof(insn)*2, false);
> +	insn = *i;
> +	insn2 = *(i+1);
> +	vm_unmap((unsigned long)func, sizeof(insn)*2);
> +
>  	/*
>  	 * A PPC64 ABIv2 function may have a local and a global entry
>  	 * point. We use the local entry point for branch tables called
> @@ -38,12 +47,12 @@ static inline uint64_t function_entry_address(void *func)
>  	 * lis   r2,XXXX
>  	 * addi  r2,r2,XXXX
>  	 */
> -	if ((((*insn & OP_RT_RA_MASK) == ADDIS_R2_R12) ||
> -	     ((*insn & OP_RT_RA_MASK) == LIS_R2)) &&
> -	    ((*(insn+1) & OP_RT_RA_MASK) == ADDI_R2_R2))
> -		return (uint64_t)(insn + 2);
> +	if ((((insn & OP_RT_RA_MASK) == ADDIS_R2_R12) ||
> +	     ((insn & OP_RT_RA_MASK) == LIS_R2)) &&
> +	    ((insn2 & OP_RT_RA_MASK) == ADDI_R2_R2))
> +		return (uint64_t)(ret + 2);
>  	else
> -		return (uint64_t)func;
> +		return (uint64_t)ret;
>  #else
>  	return *(uint64_t *)func;
>  #endif
> diff --git a/include/io.h b/include/io.h
> index f00021dcd..5c1bd41b4 100644
> --- a/include/io.h
> +++ b/include/io.h
> @@ -7,6 +7,7 @@
>  #ifndef __ASSEMBLY__
>  
>  #include <compiler.h>
> +#include <skiboot.h>
>  #include <stdint.h>
>  #include <processor.h>
>  #include <types.h>
> @@ -23,8 +24,13 @@
>  static inline uint8_t __in_8(const volatile uint8_t *addr)
>  {
>  	uint8_t val;
> -	asm volatile("lbzcix %0,0,%1" :
> -		     "=r"(val) : "r"(addr), "m"(*addr) : "memory");
> +
> +	if (vm_realmode())
> +		asm volatile("lbzcix %0,0,%1" :
> +		     "=r"(val) : "r"(addr), "m"(*addr));
> +	else
> +		val = *addr;
> +
>  	return val;
>  }
>  
> @@ -37,8 +43,13 @@ static inline uint8_t in_8(const volatile uint8_t *addr)
>  static inline uint16_t __in_be16(const volatile beint16_t *addr)
>  {
>  	__be16 val;
> -	asm volatile("lhzcix %0,0,%1" :
> -		     "=r"(val) : "r"(addr), "m"(*addr) : "memory");
> +
> +	if (vm_realmode())
> +		asm volatile("lhzcix %0,0,%1" :
> +		     "=r"(val) : "r"(addr), "m"(*addr));
> +	else
> +		val = *addr;
> +
>  	return be16_to_cpu(val);
>  }
>  
> @@ -51,8 +62,13 @@ static inline uint16_t in_be16(const volatile beint16_t *addr)
>  static inline uint16_t __in_le16(const volatile leint16_t *addr)
>  {
>  	__le16 val;
> -	asm volatile("lhzcix %0,0,%1" :
> -		     "=r"(val) : "r"(addr), "m"(*addr) : "memory");
> +
> +	if (vm_realmode())
> +		asm volatile("lhzcix %0,0,%1" :
> +		     "=r"(val) : "r"(addr), "m"(*addr));
> +	else
> +		val = *addr;
> +
>  	return le16_to_cpu(val);
>  }
>  
> @@ -65,8 +81,13 @@ static inline uint16_t in_le16(const volatile leint16_t *addr)
>  static inline uint32_t __in_be32(const volatile beint32_t *addr)
>  {
>  	__be32 val;
> -	asm volatile("lwzcix %0,0,%1" :
> -		     "=r"(val) : "r"(addr), "m"(*addr) : "memory");
> +
> +	if (vm_realmode())
> +		asm volatile("lwzcix %0,0,%1" :
> +		     "=r"(val) : "r"(addr), "m"(*addr));
> +	else
> +		val = *addr;
> +
>  	return be32_to_cpu(val);
>  }
>  
> @@ -79,8 +100,13 @@ static inline uint32_t in_be32(const volatile beint32_t *addr)
>  static inline uint32_t __in_le32(const volatile leint32_t *addr)
>  {
>  	__le32 val;
> -	asm volatile("lwzcix %0,0,%1" :
> -		     "=r"(val) : "r"(addr), "m"(*addr) : "memory");
> +
> +	if (vm_realmode())
> +		asm volatile("lwzcix %0,0,%1" :
> +		     "=r"(val) : "r"(addr), "m"(*addr));
> +	else
> +		val = *addr;
> +
>  	return le32_to_cpu(val);
>  }
>  
> @@ -93,8 +119,13 @@ static inline uint32_t in_le32(const volatile leint32_t *addr)
>  static inline uint64_t __in_be64(const volatile beint64_t *addr)
>  {
>  	__be64 val;
> -	asm volatile("ldcix %0,0,%1" :
> -		     "=r"(val) : "r"(addr), "m"(*addr) : "memory");
> +
> +	if (vm_realmode())
> +		asm volatile("ldcix %0,0,%1" :
> +		     "=r"(val) : "r"(addr), "m"(*addr));
> +	else
> +		val = *addr;
> +
>  	return be64_to_cpu(val);
>  }
>  
> @@ -107,8 +138,13 @@ static inline uint64_t in_be64(const volatile beint64_t *addr)
>  static inline uint64_t __in_le64(const volatile leint64_t *addr)
>  {
>  	__le64 val;
> -	asm volatile("ldcix %0,0,%1" :
> -		     "=r"(val) : "r"(addr), "m"(*addr) : "memory");
> +
> +	if (vm_realmode())
> +		asm volatile("ldcix %0,0,%1" :
> +		     "=r"(val) : "r"(addr), "m"(*addr));
> +	else
> +		val = *addr;
> +
>  	return le64_to_cpu(val);
>  }
>  
> @@ -120,8 +156,11 @@ static inline uint64_t in_le64(const volatile leint64_t *addr)
>  
>  static inline void __out_8(volatile uint8_t *addr, uint8_t val)
>  {
> -	asm volatile("stbcix %0,0,%1"
> -		     : : "r"(val), "r"(addr), "m"(*addr) : "memory");
> +	if (vm_realmode())
> +		asm volatile("stbcix %0,0,%1"
> +		     : : "r"(val), "r"(addr), "m"(*addr));
> +	else
> +		*addr = val;
>  }
>  
>  static inline void out_8(volatile uint8_t *addr, uint8_t val)
> @@ -132,8 +171,12 @@ static inline void out_8(volatile uint8_t *addr, uint8_t val)
>  
>  static inline void __out_be16(volatile beint16_t *addr, uint16_t val)
>  {
> -	asm volatile("sthcix %0,0,%1"
> -		     : : "r"(cpu_to_be16(val)), "r"(addr), "m"(*addr) : "memory");
> +	__be16 __val = cpu_to_be16(val);
> +	if (vm_realmode())
> +		asm volatile("sthcix %0,0,%1"
> +		     : : "r"(__val), "r"(addr), "m"(*addr));
> +	else
> +		*addr = __val;
>  }
>  
>  static inline void out_be16(volatile beint16_t *addr, uint16_t val)
> @@ -144,8 +187,12 @@ static inline void out_be16(volatile beint16_t *addr, uint16_t val)
>  
>  static inline void __out_le16(volatile leint16_t *addr, uint16_t val)
>  {
> -	asm volatile("sthcix %0,0,%1"
> -		     : : "r"(cpu_to_le16(val)), "r"(addr), "m"(*addr) : "memory");
> +	__le16 __val = cpu_to_le16(val);
> +	if (vm_realmode())
> +		asm volatile("sthcix %0,0,%1"
> +		     : : "r"(__val), "r"(addr), "m"(*addr));
> +	else
> +		*addr = __val;
>  }
>  
>  static inline void out_le16(volatile leint16_t *addr, uint16_t val)
> @@ -156,8 +203,12 @@ static inline void out_le16(volatile leint16_t *addr, uint16_t val)
>  
>  static inline void __out_be32(volatile beint32_t *addr, uint32_t val)
>  {
> -	asm volatile("stwcix %0,0,%1"
> -		     : : "r"(cpu_to_be32(val)), "r"(addr), "m"(*addr) : "memory");
> +	__be32 __val = cpu_to_be32(val);
> +	if (vm_realmode())
> +		asm volatile("stwcix %0,0,%1"
> +		     : : "r"(__val), "r"(addr), "m"(*addr));
> +	else
> +		*addr = __val;
>  }
>  
>  static inline void out_be32(volatile beint32_t *addr, uint32_t val)
> @@ -168,8 +219,12 @@ static inline void out_be32(volatile beint32_t *addr, uint32_t val)
>  
>  static inline void __out_le32(volatile leint32_t *addr, uint32_t val)
>  {
> -	asm volatile("stwcix %0,0,%1"
> -		     : : "r"(cpu_to_le32(val)), "r"(addr), "m"(*addr) : "memory");
> +	__le32 __val = cpu_to_le32(val);
> +	if (vm_realmode())
> +		asm volatile("stwcix %0,0,%1"
> +		     : : "r"(__val), "r"(addr), "m"(*addr));
> +	else
> +		*addr = __val;
>  }
>  
>  static inline void out_le32(volatile leint32_t *addr, uint32_t val)
> @@ -180,8 +235,12 @@ static inline void out_le32(volatile leint32_t *addr, uint32_t val)
>  
>  static inline void __out_be64(volatile beint64_t *addr, uint64_t val)
>  {
> -	asm volatile("stdcix %0,0,%1"
> -		     : : "r"(cpu_to_be64(val)), "r"(addr), "m"(*addr) : "memory");
> +	__be64 __val = cpu_to_be64(val);
> +	if (vm_realmode())
> +		asm volatile("stdcix %0,0,%1"
> +		     : : "r"(__val), "r"(addr), "m"(*addr));
> +	else
> +		*addr = __val;
>  }
>  
>  static inline void out_be64(volatile beint64_t *addr, uint64_t val)
> @@ -192,8 +251,12 @@ static inline void out_be64(volatile beint64_t *addr, uint64_t val)
>  
>  static inline void __out_le64(volatile leint64_t *addr, uint64_t val)
>  {
> -	asm volatile("stdcix %0,0,%1"
> -		     : : "r"(cpu_to_le64(val)), "r"(addr), "m"(*addr) : "memory");
> +	__le64 __val = cpu_to_le64(val);
> +	if (vm_realmode())
> +		asm volatile("stdcix %0,0,%1"
> +		     : : "r"(__val), "r"(addr), "m"(*addr));
> +	else
> +		*addr = __val;
>  }
>  
>  static inline void out_le64(volatile leint64_t *addr, uint64_t val)
> diff --git a/include/mem_region.h b/include/mem_region.h
> index 3e3818a66..47c3bd70c 100644
> --- a/include/mem_region.h
> +++ b/include/mem_region.h
> @@ -33,6 +33,7 @@ struct mem_region {
>  	struct list_node list;
>  	const char *name;
>  	uint64_t start, len;
> +	uint64_t vm_mapped_len;
>  	struct dt_node *node;
>  	enum mem_region_type type;
>  	struct list_head free_list;
> diff --git a/include/platform.h b/include/platform.h
> index 6aa263ae0..e431a5fe0 100644
> --- a/include/platform.h
> +++ b/include/platform.h
> @@ -298,8 +298,8 @@ struct platform {
>  	void (*vpd_iohub_load)(struct dt_node *hub_node);
>  };
>  
> -extern struct platform __platforms_start;
> -extern struct platform __platforms_end;
> +extern struct platform __platforms_start[];
> +extern struct platform __platforms_end[];
>  
>  extern struct platform	platform;
>  extern const struct bmc_platform *bmc_platform;
> diff --git a/include/processor.h b/include/processor.h
> index 7ba251bb4..9d197ffc1 100644
> --- a/include/processor.h
> +++ b/include/processor.h
> @@ -39,7 +39,9 @@
>  #define SPR_SRR1	0x01b	/* RW: Exception save/restore reg 1 */
>  #define SPR_CFAR	0x01c	/* RW: Come From Address Register */
>  #define SPR_AMR		0x01d	/* RW: Authority Mask Register */
> +#define SPR_PID		0x030	/* RW: PID register */
>  #define SPR_IAMR	0x03d	/* RW: Instruction Authority Mask Register */
> +#define SPR_UAMOR	0x09d
>  #define SPR_RPR		0x0ba   /* RW: Relative Priority Register */
>  #define SPR_TBRL	0x10c	/* RO: Timebase low */
>  #define SPR_TBRU	0x10d	/* RO: Timebase high */
> @@ -61,10 +63,12 @@
>  #define SPR_HSRR1	0x13b	/* RW: HV Exception save/restore reg 1 */
>  #define SPR_TFMR	0x13d
>  #define SPR_LPCR	0x13e
> +#define SPR_LPID	0x13f	/* RW: LPID register */
>  #define SPR_HMER	0x150	/* Hypervisor Maintenance Exception */
>  #define SPR_HMEER	0x151	/* HMER interrupt enable mask */
>  #define SPR_PCR		0x152
>  #define SPR_AMOR	0x15d
> +#define SPR_PTCR	0x1d0	/* RW: Partition table control register */
>  #define SPR_PSSCR	0x357   /* RW: Stop status and control (ISA 3) */
>  #define SPR_TSCR	0x399
>  #define SPR_HID0	0x3f0
> @@ -80,6 +84,11 @@
>  #define SPR_SRR1_PM_WAKE_SRESET	0x100000
>  #define SPR_SRR1_PM_WAKE_MCE	0x3c0000	/* Use reserved value for MCE */
>  
> +/* Bits in DSISR */
> +
> +#define	DSISR_ISSTORE		0x02000000
> +
> +
>  /* Bits in LPCR */
>  
>  /* Powersave Exit Cause Enable is different on each generation */
> @@ -318,9 +327,9 @@ static inline void isync(void)
>  /*
>   * Cache sync
>   */
> -static inline void sync_icache(void)
> +static inline void sync_icache(unsigned long ptr)
>  {
> -	asm volatile("sync; icbi 0,%0; sync; isync" : : "r" (0) : "memory");
> +	asm volatile("sync; icbi 0,%0; sync; isync" : : "r" (ptr) : "memory");
>  }
>  
>  /*
> diff --git a/include/skiboot.h b/include/skiboot.h
> index 30ff500c5..aacb425f7 100644
> --- a/include/skiboot.h
> +++ b/include/skiboot.h
> @@ -42,10 +42,16 @@ extern char _stext[];
>  extern char _etext[];
>  extern char __sym_map_end[];
>  extern char _romem_end[];
> +extern char __vm_mapped_romem_end[];
>  
>  #ifndef __TESTING__
> +extern char _stext[], _etext[];
>  /* Readonly section start and end. */
>  extern char __rodata_start[], __rodata_end[];
> +extern char _sdata[], _edata[];
> +extern char __sym_map_start[], __sym_map_end[];
> +extern char _sbss[], _ebss[];
> +extern char _end[];
>  
>  static inline bool is_rodata(const void *p)
>  {
> @@ -184,6 +190,7 @@ extern void disable_fast_reboot(const char *reason);
>  extern void add_fast_reboot_dt_entries(void);
>  extern void fast_reboot(void);
>  extern void __noreturn __secondary_cpu_entry(void);
> +extern void __noreturn __return_cpu_entry(void);
>  extern void __noreturn load_and_boot_kernel(bool is_reboot);
>  extern void cleanup_local_tlb(void);
>  extern void cleanup_global_tlb(void);
> @@ -336,4 +343,24 @@ extern int fake_nvram_info(uint32_t *total_size);
>  extern int fake_nvram_start_read(void *dst, uint32_t src, uint32_t len);
>  extern int fake_nvram_write(uint32_t offset, void *src, uint32_t size);
>  
> +/* core/vm.c */
> +bool vm_realmode(void);
> +void vm_map_global(const char *name, unsigned long addr, unsigned long len, bool rw, bool ci);
> +void vm_map_global_text(const char *name, unsigned long addr, unsigned long len);
> +void vm_unmap_global(unsigned long addr, unsigned long len);
> +void *vm_map(unsigned long addr, unsigned long len, bool rw);
> +void vm_unmap(unsigned long addr, unsigned long len);
> +void vm_init(bool fast_reboot);
> +void vm_init_stacks(void);
> +void vm_destroy(void);
> +void vm_init_secondary(void);
> +void vm_enter(void);
> +void vm_exit(void);
> +void vm_exit_cleanup(void);
> +void vm_map_stacks(void);
> +bool vm_dslb(uint64_t nia, uint64_t dar);
> +bool vm_islb(uint64_t nia);
> +bool vm_dsi(uint64_t nia, uint64_t dar, uint32_t dsisr);
> +bool vm_isi(uint64_t nia);
> +
>  #endif /* __SKIBOOT_H */
> diff --git a/libstb/container.c b/libstb/container.c
> index eca54cf63..2b8f22f70 100644
> --- a/libstb/container.c
> +++ b/libstb/container.c
> @@ -6,14 +6,20 @@
>  
>  bool stb_is_container(const void *buf, size_t size)
>  {
> +	beint32_t *t;
>  	ROM_container_raw *c;
> +	bool ret = true;;
>  
>  	c = (ROM_container_raw*) buf;
>  	if (!buf || size < SECURE_BOOT_HEADERS_SIZE)
>  		return false;
> -	if (be32_to_cpu(c->magic_number) != ROM_MAGIC_NUMBER )
> -		return false;
> -	return true;
> +
> +	t = vm_map((unsigned long)&c->magic_number, sizeof(*t), false);
> +	if (be32_to_cpu(*t) != ROM_MAGIC_NUMBER)
> +		ret = false;
> +	vm_unmap((unsigned long)&c->magic_number, sizeof(*t));
> +
> +	return ret;
>  }
>  
>  uint32_t stb_payload_magic(const void *buf, size_t size)
> diff --git a/libstb/cvc.c b/libstb/cvc.c
> index 663e53953..08b2eea60 100644
> --- a/libstb/cvc.c
> +++ b/libstb/cvc.c
> @@ -155,6 +155,9 @@ static int cvc_reserved_mem_init(struct dt_node *parent) {
>  		return -1;
>  	}
>  	addr = dt_get_address(cvc_resv_mem, 0, &size);
> +	if (size == 0) // MAMBO HACK
> +		size = 64*1024;
> +	vm_map_global_text("STB-CVC", addr, size);
>  	cvc_register(addr, addr + size-1);
>  
>  	exports = dt_find_by_path(dt_root, "/ibm,opal/firmware/exports");
> diff --git a/libstb/secureboot.c b/libstb/secureboot.c
> index c86972161..dc3bda3d2 100644
> --- a/libstb/secureboot.c
> +++ b/libstb/secureboot.c
> @@ -164,6 +164,7 @@ int secureboot_verify(enum resource_id id, void *buf, size_t len)
>  {
>  	const char *name;
>  	__be64 log;
> +	void *vbuf;
>  	int rc = -1;
>  
>  	name = flash_map_resource_name(id);
> @@ -181,7 +182,9 @@ int secureboot_verify(enum resource_id id, void *buf, size_t len)
>  		return -1;
>          }
>  
> -	rc = call_cvc_verify(buf, len, hw_key_hash, hw_key_hash_size, &log);
> +	vbuf = vm_map((unsigned long)buf, len, false);
> +	rc = call_cvc_verify(vbuf, len, hw_key_hash, hw_key_hash_size, &log);
> +	vm_unmap((unsigned long)buf, len);
>  
>  	if (rc == OPAL_SUCCESS) {
>  		prlog(PR_NOTICE, "%s verified\n", name);
> diff --git a/libstb/trustedboot.c b/libstb/trustedboot.c
> index 413862e63..910354f7b 100644
> --- a/libstb/trustedboot.c
> +++ b/libstb/trustedboot.c
> @@ -161,7 +161,7 @@ out_free:
>  int trustedboot_measure(enum resource_id id, void *buf, size_t len)
>  {
>  	uint8_t digest[SHA512_DIGEST_LENGTH];
> -	void *buf_aux;
> +	void *buf_aux, *vbuf;
>  	size_t len_aux;
>  	const char *name;
>  	TPM_Pcr pcr;
> @@ -219,7 +219,9 @@ int trustedboot_measure(enum resource_id id, void *buf, size_t len)
>  		len_aux = len;
>  	}
>  
> -	rc = call_cvc_sha512(buf_aux, len_aux, digest, SHA512_DIGEST_LENGTH);
> +	vbuf = vm_map((unsigned long)buf_aux, len_aux, false);
> +	rc = call_cvc_sha512(vbuf, len_aux, digest, SHA512_DIGEST_LENGTH);
> +	vm_unmap((unsigned long)buf_aux, len_aux);
>  
>  	if (rc == OPAL_SUCCESS) {
>  		prlog(PR_NOTICE, "%s hash calculated\n", name);
> diff --git a/skiboot.lds.S b/skiboot.lds.S
> index b136e4004..9d21681ab 100644
> --- a/skiboot.lds.S
> +++ b/skiboot.lds.S
> @@ -123,12 +123,26 @@ SECTIONS
>  		__rodata_end = .;
>  	}
>  
> +	. = ALIGN(0x100);
> +	.got : {
> +		__toc_start = . + 0x8000;
> +		*(.got)
> +		*(.toc)
> +	}
> +
> +	. = ALIGN(0x10);
> +	.opd : {
> +		*(.opd)
> +	}
> +
>  	. = ALIGN(0x10);
>  	.trap_table : {
>  		__trap_table_start = .;
>  		KEEP(*(.trap_table))
>  		__trap_table_end = .;
>  	}
> +	__vm_mapped_romem_end = .;
> +	. = ALIGN(PAGE_SIZE);
>  
>  	. = ALIGN(0x10);
>  	.init : {
> @@ -139,18 +153,6 @@ SECTIONS
>  		__ctors_end = .;
>  	}
>  
> -	. = ALIGN(0x10);
> -	.opd : {
> -		*(.opd)
> -	}
> -  
> -	. = ALIGN(0x100);
> -	.got : {
> -		__toc_start = . + 0x8000;
> -		*(.got)
> -		*(.toc)
> -	}
> -
>  	. = ALIGN(0x10);
>  	.opal_table : {
>  		__opal_table_start = .;
>
Nicholas Piggin April 29, 2020, 11:33 p.m. UTC | #2
Excerpts from Cédric Le Goater's message of April 29, 2020 7:49 pm:
> On 4/28/20 9:44 AM, Nicholas Piggin wrote:
>> vm_map_global / vm_unmap_global sets up all-CPUs visible 1:1 mappings.
>> vm_map / vm_unmap creates a per-cpu mapping, and which can not be nested.
>> 
>> A list of global extents + a local extent per cpu is kept to describe
>> active mappings. Fault handlers look these up to install translations.
>> 
>> Booting with virtual memory is all well and good, and it can help find
>> bugs. The bigger benefit is that a logical virtual map is created in
>> the process, which can be given to the OS and used to create a virtual
>> memory environment for the OPAL runtime to execute in.
> 
> The goal is to turn OPAL into a kernel driver and the OPAL calls into 
> simple function calls ?  

Yes. Kind of.

Not so closely coupled to the kernel, for practical reasons, but I would
like skiboot to be able to use a few of the most useful facilities of
the kernel.

printk and trap/bug/symbol printing are obvious and easy. Virtual memory is
important for security and also for keeping firmware safe(r) from the kernel
and vice versa, and we want to get rid of real mode after boot as far as
possible.

I'll post the skiboot and Linux patches to do these again soon.

There are other features we could provide to skiboot which sound good, but
we should be very careful about adding them, and do so only if there is a
real benefit that outweighs the complexity: ftrace, sleep/wake/timer, perf
interrupts, maybe tunables that can be exported in a sysfs directory. I don't
know; there are lots of ideas, but we should start very small and be careful.

The opposite way to go about this is to have the firmware provide code
(e.g., bytecode like AML or eBPF) and then have the kernel execute that
in kernel context. I kind of considered that, but it seems backwards and
pointless when we have open source firmware: we can do it this way and
mostly keep the same APIs etc.

Thanks,
Nick
diff mbox series

Patch

diff --git a/core/Makefile.inc b/core/Makefile.inc
index 829800e5b..7a4bb6797 100644
--- a/core/Makefile.inc
+++ b/core/Makefile.inc
@@ -3,7 +3,7 @@ 
 # -*-Makefile-*-
 
 SUBDIRS += core
-CORE_OBJS = relocate.o console.o stack.o init.o chip.o mem_region.o
+CORE_OBJS = relocate.o console.o stack.o init.o chip.o mem_region.o vm.o
 CORE_OBJS += malloc.o lock.o cpu.o utils.o fdt.o opal.o interrupts.o timebase.o
 CORE_OBJS += opal-msg.o pci.o pci-virt.o pci-slot.o pcie-slot.o
 CORE_OBJS += pci-opal.o fast-reboot.o device.o exceptions.o trace.o affinity.o
diff --git a/core/cpu.c b/core/cpu.c
index 37d9f41a8..30f9c6e70 100644
--- a/core/cpu.c
+++ b/core/cpu.c
@@ -416,6 +416,10 @@  static unsigned int cpu_idle_p8(enum cpu_wake_cause wake_on)
 	}
 	isync();
 
+	/* P8 must enter nap with VM disabled */
+	if (cpu->vm_setup)
+		vm_exit();
+
 	/* Enter nap */
 	vec = enter_p8_pm_state(false);
 
@@ -476,11 +480,19 @@  static unsigned int cpu_idle_p9(enum cpu_wake_cause wake_on)
 		/* PSSCR SD=0 ESL=1 EC=1 PSSL=0 TR=3 MTL=0 RL=1 */
 		psscr = PPC_BIT(42) | PPC_BIT(43) |
 			PPC_BITMASK(54, 55) | PPC_BIT(63);
+		/*
+		 * stop with EC=1 wakes with vm off. P9 can stop with vm
+		 * enabled, but it's simpler to disable now and so it wakes
+		 * in the proper state.
+		 */
+		if (cpu->vm_setup)
+			vm_exit();
 		vec = enter_p9_pm_state(psscr);
 	} else {
 		/* stop with EC=0 (resumes) which does not require sreset. */
 		/* PSSCR SD=0 ESL=0 EC=0 PSSL=0 TR=3 MTL=0 RL=1 */
 		psscr = PPC_BITMASK(54, 55) | PPC_BIT(63);
+		/* Can run with VM enabled */
 		enter_p9_pm_lite_state(psscr);
 	}
 
@@ -499,6 +511,7 @@  static unsigned int cpu_idle_p9(enum cpu_wake_cause wake_on)
 static void cpu_idle_pm(enum cpu_wake_cause wake_on)
 {
 	unsigned int vec;
+	bool was_vm_setup = this_cpu()->vm_setup;
 
 	switch(proc_gen) {
 	case proc_gen_p8:
@@ -523,12 +536,17 @@  static void cpu_idle_pm(enum cpu_wake_cause wake_on)
 		default:
 			break;
 		}
-		mtmsrd(MSR_RI, 1);
 
 	} else if (vec == 0x200) {
 		exception_entry_pm_mce();
 		enable_machine_check();
+	}
+
+	if (vec != 0) {
+		/* 0x100 or 0x200 */
 		mtmsrd(MSR_RI, 1);
+		if (was_vm_setup)
+			vm_enter();
 	}
 }
 
@@ -1361,7 +1379,7 @@  static int64_t opal_return_cpu(void)
 		printf("OPAL in_opal_call=%u\n", this_cpu()->in_opal_call);
 	}
 
-	__secondary_cpu_entry();
+	__return_cpu_entry();
 
 	return OPAL_HARDWARE; /* Should not happen */
 }
diff --git a/core/exceptions.c b/core/exceptions.c
index 389548d16..35c14f8af 100644
--- a/core/exceptions.c
+++ b/core/exceptions.c
@@ -33,7 +33,7 @@  static void dump_regs(struct stack_frame *stack)
 
 #define EXCEPTION_MAX_STR 320
 
-static void handle_mce(struct stack_frame *stack, uint64_t nip, uint64_t msr, bool *fatal)
+static void handle_mce(struct stack_frame *stack, uint64_t nip, uint64_t msr, bool *fatal, bool *vm_setup)
 {
 	uint64_t mce_flags, mce_addr;
 	const char *mce_err;
@@ -44,12 +44,28 @@  static void handle_mce(struct stack_frame *stack, uint64_t nip, uint64_t msr, bo
 	decode_mce(stack->srr0, stack->srr1, stack->dsisr, stack->dar,
 			&mce_flags, &mce_err, &mce_addr);
 
-	/* Try to recover. */
-	if (mce_flags & MCE_ERAT_ERROR) {
-		/* Real-mode still uses ERAT, flush transient bitflips */
+	/* Try to recover */
+	if ((mce_flags & (MCE_SLB_ERROR|MCE_TABLE_WALK)) &&
+			(msr & (MSR_IR|MSR_DR)) &&
+			!this_cpu()->vm_local_map_inuse) {
+		/* Try to turn off VM if non-linear map is not in use. */
+		*vm_setup = false;
+		stack->srr1 &= ~(MSR_IR|MSR_DR);
+		mce_fix = "Disabling virtual memory";
+
+	} else if (mce_flags & MCE_ERAT_ERROR) {
 		flush_erat();
 		mce_fix = "ERAT flush";
 
+	} else if (mce_flags & MCE_TLB_ERROR) {
+		cleanup_global_tlb();
+		mce_fix = "global TLB flush";
+
+	} else if (mce_flags & MCE_TLBIE_ERROR) {
+		cleanup_global_tlb();
+		stack->srr0 += 4;	/* resume after the failing instruction */
+		mce_fix = "global TLB flush and skip instruction";
+
 	} else {
 		*fatal = true;
 	}
@@ -83,6 +99,8 @@  static void handle_mce(struct stack_frame *stack, uint64_t nip, uint64_t msr, bo
 
 void exception_entry(struct stack_frame *stack)
 {
+	struct cpu_thread *c = this_cpu();
+	bool vm_setup = c->vm_setup;
 	bool fatal = false;
 	bool hv;
 	uint64_t nip;
@@ -90,6 +108,8 @@  void exception_entry(struct stack_frame *stack)
 	char buf[EXCEPTION_MAX_STR];
 	size_t l;
 
+	c->vm_setup = false;
+
 	switch (stack->type) {
 	case 0x500:
 	case 0x980:
@@ -134,9 +154,44 @@  void exception_entry(struct stack_frame *stack)
 		break;
 
 	case 0x200:
-		handle_mce(stack, nip, msr, &fatal);
+		handle_mce(stack, nip, msr, &fatal, &vm_setup);
 		goto no_symbol;
 
+	case 0x300:
+		if (vm_dsi(nip, stack->dar, stack->dsisr))
+			goto out;
+		fatal = true;
+		l += snprintf(buf + l, EXCEPTION_MAX_STR - l,
+			"Fatal %s address "REG" at "REG"   ",
+			(stack->dsisr & DSISR_ISSTORE) ? "store" : "load",
+			stack->dar, nip);
+		break;
+
+	case 0x380:
+		if (vm_dslb(nip, stack->dar))
+			goto out;
+		fatal = true;
+		l += snprintf(buf + l, EXCEPTION_MAX_STR - l,
+			"Fatal load/store address "REG" at "REG"   ",
+			stack->dar, nip);
+		break;
+
+	case 0x400:
+		if (vm_isi(nip))
+			goto out;
+		fatal = true;
+		l += snprintf(buf + l, EXCEPTION_MAX_STR - l,
+			"Fatal ifetch at "REG"   ", nip);
+		break;
+
+	case 0x480:
+		if (vm_islb(nip))
+			goto out;
+		fatal = true;
+		l += snprintf(buf + l, EXCEPTION_MAX_STR - l,
+			"Fatal ifetch at "REG"   ", nip);
+		break;
+
 	case 0x700: {
 		struct trap_table_entry *tte;
 
@@ -185,11 +240,14 @@  no_symbol:
 		for (;;) ;
 	}
 
+out:
+	assert(!fatal);
 	if (hv) {
 		/* Set up for SRR return */
 		stack->srr0 = nip;
 		stack->srr1 = msr;
 	}
+	c->vm_setup = vm_setup;
 }
 
 void exception_entry_pm_sreset(void)
diff --git a/core/fast-reboot.c b/core/fast-reboot.c
index 03777543a..e7f3b5c67 100644
--- a/core/fast-reboot.c
+++ b/core/fast-reboot.c
@@ -381,6 +381,9 @@  void __noreturn fast_reboot_entry(void)
 	cpu_set_sreset_enable(true);
 	cpu_set_ipi_enable(true);
 
+	/* Enter virtual memory mode */
+	vm_init(true);
+
 	prlog(PR_INFO, "RESET: Releasing secondaries...\n");
 
 	/* Release everybody */
@@ -401,6 +404,7 @@  void __noreturn fast_reboot_entry(void)
 	fast_boot_release = false;
 
 	if (!chip_quirk(QUIRK_MAMBO_CALLOUTS)) {
+		void *t;
 		/*
 		 * mem_region_clear_unused avoids these preload regions
 		 * so it can run along side image preloading. Clear these
@@ -410,8 +414,14 @@  void __noreturn fast_reboot_entry(void)
 		 * Mambo may have embedded payload here, so don't clear
 		 * it at all.
 		 */
-		memset(KERNEL_LOAD_BASE, 0, KERNEL_LOAD_SIZE);
-		memset(INITRAMFS_LOAD_BASE, 0, INITRAMFS_LOAD_SIZE);
+
+		t = vm_map((unsigned long)KERNEL_LOAD_BASE, KERNEL_LOAD_SIZE, true);
+		memset(t, 0, KERNEL_LOAD_SIZE);
+		vm_unmap((unsigned long)t, KERNEL_LOAD_SIZE);
+
+		t = vm_map((unsigned long)INITRAMFS_LOAD_BASE, INITRAMFS_LOAD_SIZE, true);
+		memset(t, 0, INITRAMFS_LOAD_SIZE);
+		vm_unmap((unsigned long)t, INITRAMFS_LOAD_SIZE);
 	}
 
 	/* Start preloading kernel and ramdisk */
diff --git a/core/init.c b/core/init.c
index 2bb48845d..95c0339cf 100644
--- a/core/init.c
+++ b/core/init.c
@@ -94,6 +94,7 @@  static bool try_load_elf64_le(struct elf_hdr *header)
 	uint64_t load_base = (uint64_t)kh;
 	struct elf64le_phdr *ph;
 	unsigned int i;
+	bool ret = false;
 
 	printf("INIT: 64-bit LE kernel discovered\n");
 
@@ -105,6 +106,9 @@  static bool try_load_elf64_le(struct elf_hdr *header)
 	 * but it will not work for any ELF binary.
 	 */
 	ph = (struct elf64le_phdr *)(load_base + le64_to_cpu(kh->e_phoff));
+	vm_map_global("KERNEL ELF Program Headers", (unsigned long)ph,
+			le16_to_cpu(kh->e_phnum)*sizeof(struct elf64le_phdr),
+			false, false);
 	for (i = 0; i < le16_to_cpu(kh->e_phnum); i++, ph++) {
 		if (le32_to_cpu(ph->p_type) != ELF_PTYPE_LOAD)
 			continue;
@@ -121,7 +125,7 @@  static bool try_load_elf64_le(struct elf_hdr *header)
 
 	if (!kernel_entry) {
 		prerror("INIT: Failed to find kernel entry !\n");
-		return false;
+		goto out_unmap;
 	}
 	kernel_entry += load_base;
 	kernel_32bit = false;
@@ -133,7 +137,12 @@  static bool try_load_elf64_le(struct elf_hdr *header)
 	prlog(PR_DEBUG, "INIT: 64-bit kernel entry at 0x%llx, size 0x%lx\n",
 	      kernel_entry, kernel_size);
 
-	return true;
+	ret = true;
+
+out_unmap:
+	vm_unmap_global((unsigned long)(load_base + le64_to_cpu(kh->e_phoff)), le16_to_cpu(kh->e_phnum)*sizeof(struct elf64le_phdr)); /* ph was advanced by the scan loop; unmap the mapped base */
+
+	return ret;
 }
 
 static bool try_load_elf64(struct elf_hdr *header)
@@ -144,12 +153,17 @@  static bool try_load_elf64(struct elf_hdr *header)
 	struct elf64be_phdr *ph;
 	struct elf64be_shdr *sh;
 	unsigned int i;
+	bool ret = false;
+
+	vm_map_global("KERNEL ELF64 Header", (unsigned long)header,
+			sizeof(struct elf64be_hdr), false, false);
 
 	/* Check it's a ppc64 LE ELF */
 	if (khle->ei_ident == ELF_IDENT		&&
 	    khle->ei_data == ELF_DATA_LSB	&&
 	    le16_to_cpu(khle->e_machine) == ELF_MACH_PPC64) {
-		return try_load_elf64_le(header);
+		ret = try_load_elf64_le(header);
+		goto out_unmap1;
 	}
 
 	/* Check it's a ppc64 ELF */
@@ -157,7 +171,7 @@  static bool try_load_elf64(struct elf_hdr *header)
 	    kh->ei_data != ELF_DATA_MSB		||
 	    be16_to_cpu(kh->e_machine) != ELF_MACH_PPC64) {
 		prerror("INIT: Kernel doesn't look like an ppc64 ELF\n");
-		return false;
+		goto out_unmap1;
 	}
 
 	/* Look for a loadable program header that has our entry in it
@@ -168,6 +182,8 @@  static bool try_load_elf64(struct elf_hdr *header)
 	 * but it will not work for any ELF binary.
 	 */
 	ph = (struct elf64be_phdr *)(load_base + be64_to_cpu(kh->e_phoff));
+	vm_map_global("KERNEL ELF Program Headers", (unsigned long)ph,
+			be16_to_cpu(kh->e_phnum)*sizeof(struct elf64be_phdr), false, false);
 	for (i = 0; i < be16_to_cpu(kh->e_phnum); i++, ph++) {
 		if (be32_to_cpu(ph->p_type) != ELF_PTYPE_LOAD)
 			continue;
@@ -184,7 +200,7 @@  static bool try_load_elf64(struct elf_hdr *header)
 
 	if (!kernel_entry) {
 		prerror("INIT: Failed to find kernel entry !\n");
-		return false;
+		goto out_unmap2;
 	}
 
 	/* For the normal big-endian ELF ABI, the kernel entry points
@@ -194,6 +210,8 @@  static bool try_load_elf64(struct elf_hdr *header)
 	 * to assuming it obeys the ABI.
 	 */
 	sh = (struct elf64be_shdr *)(load_base + be64_to_cpu(kh->e_shoff));
+	vm_map_global("KERNEL ELF Section Headers", (unsigned long)sh,
+			be16_to_cpu(kh->e_shnum)*sizeof(struct elf64be_shdr), false, false);
 	for (i = 0; i < be16_to_cpu(kh->e_shnum); i++, sh++) {
 		if (be64_to_cpu(sh->sh_addr) <= be64_to_cpu(kh->e_entry) &&
 		    (be64_to_cpu(sh->sh_addr) + be64_to_cpu(sh->sh_size)) >
@@ -218,7 +236,15 @@  static bool try_load_elf64(struct elf_hdr *header)
 	printf("INIT: 64-bit kernel entry at 0x%llx, size 0x%lx\n",
 	       kernel_entry, kernel_size);
 
-	return true;
+	ret = true;
+
+	vm_unmap_global((unsigned long)(load_base + be64_to_cpu(kh->e_shoff)), be16_to_cpu(kh->e_shnum)*sizeof(struct elf64be_shdr)); /* sh was advanced by the scan loop; unmap the mapped base */
+out_unmap2:
+	vm_unmap_global((unsigned long)(load_base + be64_to_cpu(kh->e_phoff)), be16_to_cpu(kh->e_phnum)*sizeof(struct elf64be_phdr)); /* ph was advanced by the scan loop; unmap the mapped base */
+out_unmap1:
+	vm_unmap_global((unsigned long)header, sizeof(struct elf64be_hdr));
+
+	return ret;
 }
 
 static bool try_load_elf32_le(struct elf_hdr *header)
@@ -334,6 +360,7 @@  bool start_preload_kernel(void)
 	int loaded;
 
 	/* Try to load an external kernel payload through the platform hooks */
+	vm_map_global("KERNEL", (unsigned long)KERNEL_LOAD_BASE, KERNEL_LOAD_SIZE, true, false);
 	kernel_size = KERNEL_LOAD_SIZE;
 	loaded = start_preload_resource(RESOURCE_ID_KERNEL,
 					RESOURCE_SUBID_NONE,
@@ -342,9 +369,11 @@  bool start_preload_kernel(void)
 	if (loaded != OPAL_SUCCESS) {
 		printf("INIT: platform start load kernel failed\n");
 		kernel_size = 0;
+		vm_unmap_global((unsigned long)KERNEL_LOAD_BASE, KERNEL_LOAD_SIZE);
 		return false;
 	}
 
+	vm_map_global("INITRAMFS", (unsigned long)INITRAMFS_LOAD_BASE, INITRAMFS_LOAD_SIZE, true, false);
 	initramfs_size = INITRAMFS_LOAD_SIZE;
 	loaded = start_preload_resource(RESOURCE_ID_INITRAMFS,
 					RESOURCE_SUBID_NONE,
@@ -352,6 +381,7 @@  bool start_preload_kernel(void)
 	if (loaded != OPAL_SUCCESS) {
 		printf("INIT: platform start load initramfs failed\n");
 		initramfs_size = 0;
+		vm_unmap_global((unsigned long)INITRAMFS_LOAD_BASE, INITRAMFS_LOAD_SIZE);
 		return false;
 	}
 
@@ -361,13 +391,16 @@  bool start_preload_kernel(void)
 static bool load_kernel(void)
 {
 	void *stb_container = NULL;
-	struct elf_hdr *kh;
+	struct elf_hdr *kh, *t;
+	uint32_t ei_ident;
+	uint8_t ei_class;
 	int loaded;
 
 	prlog(PR_NOTICE, "INIT: Waiting for kernel...\n");
 
 	loaded = wait_for_resource_loaded(RESOURCE_ID_KERNEL,
 					  RESOURCE_SUBID_NONE);
+	vm_unmap_global((unsigned long)KERNEL_LOAD_BASE, KERNEL_LOAD_SIZE);
 
 	if (loaded != OPAL_SUCCESS) {
 		printf("INIT: platform wait for kernel load failed\n");
@@ -383,8 +416,10 @@  static bool load_kernel(void)
 				((uint64_t)__builtin_kernel_start) -
 				SKIBOOT_BASE + boot_offset;
 			printf("Using built-in kernel\n");
+			vm_map_global("KERNEL", (unsigned long)KERNEL_LOAD_BASE, kernel_size, true, false);
 			memmove(KERNEL_LOAD_BASE, (void*)builtin_base,
 				kernel_size);
+			vm_unmap_global((unsigned long)KERNEL_LOAD_BASE, kernel_size);
 		}
 	}
 
@@ -400,7 +435,7 @@  static bool load_kernel(void)
 		if (kernel_entry < EXCEPTION_VECTORS_END) {
 			cpu_set_sreset_enable(false);
 			memcpy_null(NULL, old_vectors, EXCEPTION_VECTORS_END);
-			sync_icache();
+			sync_icache(0);
 		} else {
 			/* Hack for STB in Mambo, assume at least 4kb in mem */
 			if (!kernel_size)
@@ -431,15 +466,20 @@  static bool load_kernel(void)
 	      "INIT: Kernel loaded, size: %zu bytes (0 = unknown preload)\n",
 	      kernel_size);
 
-	if (kh->ei_ident != ELF_IDENT) {
+	t = vm_map((unsigned long)kh, sizeof(*kh), false);
+	ei_ident = t->ei_ident;
+	ei_class = t->ei_class;
+	vm_unmap((unsigned long)t, sizeof(*kh));
+
+	if (ei_ident != ELF_IDENT) {
 		prerror("INIT: ELF header not found. Assuming raw binary.\n");
 		return true;
 	}
 
-	if (kh->ei_class == ELF_CLASS_64) {
+	if (ei_class == ELF_CLASS_64) {
 		if (!try_load_elf64(kh))
 			return false;
-	} else if (kh->ei_class == ELF_CLASS_32) {
+	} else if (ei_class == ELF_CLASS_32) {
 		if (!try_load_elf32(kh))
 			return false;
 	} else {
@@ -467,7 +507,7 @@  static void load_initramfs(void)
 
 	loaded = wait_for_resource_loaded(RESOURCE_ID_INITRAMFS,
 					  RESOURCE_SUBID_NONE);
-
+	vm_unmap_global((unsigned long)INITRAMFS_LOAD_BASE, INITRAMFS_LOAD_SIZE);
 	if (loaded != OPAL_SUCCESS || !initramfs_size)
 		return;
 
@@ -539,6 +579,7 @@  void __noreturn load_and_boot_kernel(bool is_reboot)
 	const struct dt_property *memprop;
 	const char *cmdline, *stdoutp;
 	uint64_t mem_top;
+	uint32_t *t;
 
 	memprop = dt_find_property(dt_root, DT_PRIVATE "maxmem");
 	if (memprop)
@@ -613,11 +654,13 @@  void __noreturn load_and_boot_kernel(bool is_reboot)
 
 	fdt_set_boot_cpuid_phys(fdt, this_cpu()->pir);
 
+	t = vm_map(kernel_entry, 4, false);
 	/* Check there is something there before we branch to it */
-	if (*(uint32_t *)kernel_entry == 0) {
+	if (*t == 0) {
 		prlog(PR_EMERG, "FATAL: Kernel is zeros, can't execute!\n");
 		assert(0);
 	}
+	vm_unmap(kernel_entry, 4);
 
 	if (platform.exit)
 		platform.exit();
@@ -629,7 +672,10 @@  void __noreturn load_and_boot_kernel(bool is_reboot)
 	printf("INIT: Starting kernel at 0x%llx, fdt at %p %u bytes\n",
 	       kernel_entry, fdt, fdt_totalsize(fdt));
 
-	/* Disable machine checks on all */
+	/* Go back to realmode and tear down our VM before booting kernel */
+	vm_destroy();
+
+	/* Disable machine checks, RI on all */
 	cpu_disable_ME_RI_all();
 
 	patch_traps(false);
@@ -835,37 +881,60 @@  static void setup_branch_null_catcher(void)
 
 void copy_sreset_vector(void)
 {
+	static char patch[0x100];
 	uint32_t *src, *dst;
+	uint32_t *t;
+	uint32_t len = (void *)&reset_patch_end - (void *)&reset_patch_start;
 
 	/* Copy the reset code over the entry point. */
 	src = &reset_patch_start;
+	t = vm_map((unsigned long)src, len, false);
+	memcpy(patch, t, len);
+	vm_unmap((unsigned long)src, len);
+
 	dst = (uint32_t *)0x100;
-	while(src < &reset_patch_end)
-		*(dst++) = *(src++);
-	sync_icache();
+	t = vm_map((unsigned long)dst, len, true);
+	memcpy(t, patch, len);
+	sync_icache((unsigned long)t);
+	vm_unmap((unsigned long)dst, len);
 }
 
 void copy_sreset_vector_fast_reboot(void)
 {
+	static char patch[0x100];
 	uint32_t *src, *dst;
+	uint32_t *t;
+	uint32_t len = (void *)&reset_fast_reboot_patch_end -
+			(void *)&reset_fast_reboot_patch_start;
 
 	/* Copy the reset code over the entry point. */
 	src = &reset_fast_reboot_patch_start;
+	t = vm_map((unsigned long)src, len, false);
+	memcpy(patch, t, len);
+	vm_unmap((unsigned long)src, len);
+
 	dst = (uint32_t *)0x100;
-	while(src < &reset_fast_reboot_patch_end)
-		*(dst++) = *(src++);
-	sync_icache();
+	t = vm_map((unsigned long)dst, len, true);
+	memcpy(t, patch, len);
+	sync_icache((unsigned long)t);
+	vm_unmap((unsigned long)dst, len);
 }
 
 void copy_exception_vectors(void)
 {
+	void *t;
+
+	t = vm_map(0x0, EXCEPTION_VECTORS_END, true);
+
 	/* Copy from 0x100 to EXCEPTION_VECTORS_END, avoid below 0x100 as
 	 * this is the boot flag used by CPUs still potentially entering
 	 * skiboot.
 	 */
-	memcpy((void *)0x100, (void *)(SKIBOOT_BASE + 0x100),
+	memcpy(t + 0x100, (void *)(SKIBOOT_BASE + 0x100),
 			EXCEPTION_VECTORS_END - 0x100);
-	sync_icache();
+
+	sync_icache((unsigned long)t);
+	vm_unmap(0x0, EXCEPTION_VECTORS_END);
 }
 
 /*
@@ -879,15 +948,16 @@  void patch_traps(bool enable)
 	for (tte = __trap_table_start; tte < __trap_table_end; tte++) {
 		uint32_t *insn;
 
-		insn = (uint32_t *)tte->address;
+		insn = vm_map(tte->address, sizeof(uint32_t), true);
 		if (enable) {
 			*insn = PPC_INST_TRAP;
 		} else {
 			*insn = PPC_INST_NOP;
 		}
+		sync_icache((unsigned long)insn);
+		vm_unmap(tte->address, sizeof(uint32_t));
 	}
 
-	sync_icache();
 }
 
 static void per_thread_sanity_checks(void)
@@ -937,19 +1007,22 @@  void pci_nvram_init(void)
 static uint32_t mem_csum(void *_p, void *_e)
 {
 	size_t len = _e - _p;
-	uint32_t *p = _p;
+	uint32_t *t;
 	uint32_t v1 = 0, v2 = 0;
 	uint32_t csum;
 	unsigned int i;
 
+	t = vm_map((unsigned long)_p, len, false);
+
 	for (i = 0; i < len; i += 4) {
-		uint32_t v = *p++;
+		uint32_t v = *t++;
 		v1 += v;
 		v2 += v1;
 	}
-
 	csum = v1 ^ v2;
 
+	vm_unmap((unsigned long)_p, len);
+
 	return csum;
 }
 
@@ -963,6 +1036,8 @@  static void checksum_romem(void)
 	if (chip_quirk(QUIRK_SLOW_SIM))
 		return;
 
+	/* Called in real mode */
+
 	csum = mem_csum(_start, _head_end);
 	romem_csum ^= csum;
 
@@ -1054,7 +1129,7 @@  void __noreturn __nomcount main_cpu_entry(const void *fdt)
 	prlog(PR_DEBUG, "initial console log level: memory %d, driver %d\n",
 	       (debug_descriptor.console_log_levels >> 4),
 	       (debug_descriptor.console_log_levels & 0x0f));
-	prlog(PR_TRACE, "OPAL is Powered By Linked-List Technology.\n");
+	prlog(PR_TRACE, "OPAL is Powered By Linked-List Technology. Now with more indirection.\n");
 
 #ifdef SKIBOOT_GCOV
 	skiboot_gcov_done();
@@ -1066,6 +1141,9 @@  void __noreturn __nomcount main_cpu_entry(const void *fdt)
 	/* Now locks can be used */
 	init_locks();
 
+	/* Enter virtual memory mode */
+	vm_init(false);
+
 	/* Create the OPAL call table early on, entries can be overridden
 	 * later on (FSP console code for example)
 	 */
@@ -1091,7 +1169,20 @@  void __noreturn __nomcount main_cpu_entry(const void *fdt)
 		if (parse_hdat(false) < 0)
 			abort();
 	} else {
+		void *t;
+		uint32_t size;
+
+		t = vm_map((unsigned long)fdt, sizeof(struct fdt_header), false);
+		size = fdt_totalsize(t);
+		vm_unmap((unsigned long)fdt, sizeof(struct fdt_header));
+
+		/*
+		 * Would be nice to make this a local map, but it seems
+		 * to need to be expanded in place.
+		 */
+		vm_map_global("fdt", (unsigned long)fdt, size, false, false);
 		dt_expand(fdt);
+		vm_unmap_global((unsigned long)fdt, size);
 	}
 	dt_add_cpufeatures(dt_root);
 
@@ -1142,6 +1233,8 @@  void __noreturn __nomcount main_cpu_entry(const void *fdt)
 	 */
 	init_cpu_max_pir();
 
+	vm_init_stacks();
+
 	/*
 	 * Now, we init our memory map from the device-tree, and immediately
 	 * reserve areas which we know might contain data coming from
@@ -1393,6 +1486,30 @@  void __noreturn __secondary_cpu_entry(void)
 	enable_machine_check();
 	mtmsrd(MSR_RI, 1);
 
+	vm_init_secondary();
+
+	/* Some XIVE setup */
+	xive_cpu_callin(cpu);
+
+	/* Wait for work to do */
+	while(true) {
+		if (cpu_check_jobs(cpu))
+			cpu_process_jobs();
+		else
+			cpu_idle_job();
+	}
+}
+
+void __noreturn __return_cpu_entry(void)
+{
+	struct cpu_thread *cpu = this_cpu();
+
+	/* Secondary CPU called in */
+	cpu_callin(cpu);
+
+	enable_machine_check();
+	mtmsrd(MSR_RI, 1);
+
 	/* Some XIVE setup */
 	xive_cpu_callin(cpu);
 
diff --git a/core/mem_region.c b/core/mem_region.c
index 36de2d094..69f24d630 100644
--- a/core/mem_region.c
+++ b/core/mem_region.c
@@ -25,7 +25,7 @@ 
 #define POISON_MEM_REGION	0
 #endif
 #define POISON_MEM_REGION_WITH	0x99
-#define POISON_MEM_REGION_LIMIT 1*1024*1024*1024
+#define POISON_MEM_REGION_LIMIT (128*1024*1024 - PAGE_SIZE)
 
 /* Locking: The mem_region_lock protects the regions list from concurrent
  * updates. Additions to, or removals from, the region list must be done
@@ -57,24 +57,27 @@  static struct mem_region skiboot_os_reserve = {
 	.type		= REGION_OS,
 };
 
-struct mem_region skiboot_heap = {
-	.name		= "ibm,firmware-heap",
-	.start		= HEAP_BASE,
-	.len		= HEAP_SIZE,
-	.type		= REGION_SKIBOOT_HEAP,
-};
-
 static struct mem_region skiboot_code_and_text = {
 	.name		= "ibm,firmware-code",
 	.start		= SKIBOOT_BASE,
 	.len		= HEAP_BASE - SKIBOOT_BASE,
+	.vm_mapped_len	= HEAP_BASE - SKIBOOT_BASE,
 	.type		= REGION_SKIBOOT_FIRMWARE,
 };
 
+struct mem_region skiboot_heap = {
+	.name		= "ibm,firmware-heap",
+	.start		= HEAP_BASE,
+	.len		= HEAP_SIZE,
+	.vm_mapped_len	= HEAP_SIZE,
+	.type		= REGION_SKIBOOT_HEAP,
+};
+
 static struct mem_region skiboot_after_heap = {
 	.name		= "ibm,firmware-data",
 	.start		= HEAP_BASE + HEAP_SIZE,
 	.len		= SKIBOOT_BASE + SKIBOOT_SIZE - (HEAP_BASE + HEAP_SIZE),
+	.vm_mapped_len	= SKIBOOT_BASE + SKIBOOT_SIZE - (HEAP_BASE + HEAP_SIZE),
 	.type		= REGION_SKIBOOT_FIRMWARE,
 };
 
@@ -141,17 +144,40 @@  static struct alloc_hdr *next_hdr(const struct mem_region *region,
 	return next;
 }
 
+static unsigned long vm_map_limit(const struct mem_region *region,
+				  const struct alloc_hdr *hdr,
+				  unsigned long size)
+{
+	unsigned long end = region->start + region->len;
+	unsigned long limit;
+
+	assert((unsigned long)hdr >= region->start);
+
+	limit = (unsigned long)hdr + size;
+	assert(limit <= end);
+
+	if (limit + sizeof(struct free_hdr) <= end)
+		limit += sizeof(struct free_hdr);
+
+	return limit - region->start;
+}
+
 #if POISON_MEM_REGION == 1
 static void mem_poison(struct free_hdr *f)
 {
-	size_t poison_size = (void*)tailer(f) - (void*)(f+1);
+	unsigned long start = (unsigned long)(f + 1);
+	unsigned long *t = tailer(f);
+	size_t poison_size = (unsigned long)t - start;
+	void *mem;
 
 	/* We only poison up to a limit, as otherwise boot is
 	 * kinda slow */
 	if (poison_size > POISON_MEM_REGION_LIMIT)
 		poison_size = POISON_MEM_REGION_LIMIT;
 
-	memset(f+1, POISON_MEM_REGION_WITH, poison_size);
+	mem = vm_map(start, poison_size, true);
+	memset(mem, POISON_MEM_REGION_WITH, poison_size);
+	vm_unmap(start, poison_size);
 }
 #endif
 
@@ -159,14 +185,36 @@  static void mem_poison(struct free_hdr *f)
 static void init_allocatable_region(struct mem_region *region)
 {
 	struct free_hdr *f = region_start(region);
+	unsigned long num_longs;
+	unsigned long *t;
+
 	assert(region->type == REGION_SKIBOOT_HEAP ||
 	       region->type == REGION_MEMORY);
-	f->hdr.num_longs = region->len / sizeof(long);
+
+	num_longs = region->len / sizeof(long);
+
+	assert(PAGE_SIZE >= sizeof(*f));
+	assert(region->len >= PAGE_SIZE*2);
+
+	list_head_init(&region->free_list);
+
+	if (!region->vm_mapped_len) {
+		/* SKIBOOT_BASE-SIZE regions already come mapped */
+		vm_map_global(region->name, region->start, sizeof(struct free_hdr), true, false);
+		region->vm_mapped_len = sizeof(struct free_hdr);
+	} else {
+		assert(region == &skiboot_heap);
+	}
+
+	f->hdr.num_longs = num_longs;
 	f->hdr.free = true;
 	f->hdr.prev_free = false;
-	*tailer(f) = f->hdr.num_longs;
-	list_head_init(&region->free_list);
 	list_add(&region->free_list, &f->list);
+
+	t = vm_map((unsigned long)tailer(f), sizeof(long), true);
+	*t = num_longs;
+	vm_unmap((unsigned long)tailer(f), sizeof(long));
+
 #if POISON_MEM_REGION == 1
 	mem_poison(f);
 #endif
@@ -176,6 +224,9 @@  static void make_free(struct mem_region *region, struct free_hdr *f,
 		      const char *location, bool skip_poison)
 {
 	struct alloc_hdr *next;
+	unsigned long *t;
+	unsigned long new_end;
+	unsigned long new_sz;
 
 #if POISON_MEM_REGION == 1
 	if (!skip_poison)
@@ -202,20 +253,33 @@  static void make_free(struct mem_region *region, struct free_hdr *f,
 		list_add(&region->free_list, &f->list);
 	}
 
-	/* Fix up tailer. */
-	*tailer(f) = f->hdr.num_longs;
-
-	/* If next is free, coalesce it */
+	/* If next is free coalesce it, else mark us as free. */
 	next = next_hdr(region, &f->hdr);
 	if (next) {
-		next->prev_free = true;
 		if (next->free) {
 			struct free_hdr *next_free = (void *)next;
 			list_del_from(&region->free_list, &next_free->list);
-			/* Maximum of one level of recursion */
-			make_free(region, next_free, location, true);
+			f->hdr.num_longs += next_free->hdr.num_longs;
+		} else {
+			assert(!next->prev_free);
+			next->prev_free = true;
+			goto no_unmap;
 		}
 	}
+
+	/* Freed to the end, may have to trim mapping */
+	new_end = (unsigned long)f + sizeof(struct free_hdr);
+	new_sz = new_end - region->start;
+	if (region != &skiboot_heap && new_sz < region->vm_mapped_len) {
+		vm_unmap_global(new_end, region->vm_mapped_len - new_sz);
+		region->vm_mapped_len = new_sz;
+	}
+
+no_unmap:
+	/* Fix up tailer. */
+	t = vm_map((unsigned long)tailer(f), sizeof(long), true);
+	*t = f->hdr.num_longs;
+	vm_unmap((unsigned long)tailer(f), sizeof(long));
 }
 
 /* Can we fit this many longs with this alignment in this free block? */
@@ -253,11 +317,12 @@  static void discard_excess(struct mem_region *region,
 		post->hdr.num_longs = hdr->num_longs - alloc_longs;
 		post->hdr.prev_free = false;
 
+		/* No coalescing required. */
+		make_free(region, post, location, skip_poison);
+
 		/* Trim our block. */
 		hdr->num_longs = alloc_longs;
 
-		/* This coalesces as required. */
-		make_free(region, post, location, skip_poison);
 	}
 }
 
@@ -445,6 +510,18 @@  found:
 	if (next) {
 		assert(next->prev_free);
 		next->prev_free = false;
+	} else {
+		unsigned long new_sz;
+
+		/* Took from the end, may have to expand mapping */
+		new_sz = vm_map_limit(region, &f->hdr, (alloc_longs + offset) * sizeof(long));
+		if (new_sz > region->vm_mapped_len) {
+			assert(region != &skiboot_heap);
+			vm_map_global(region->name,
+				region->start + region->vm_mapped_len,
+				new_sz - region->vm_mapped_len, true, false);
+			region->vm_mapped_len = new_sz;
+		}
 	}
 
 	if (offset != 0) {
@@ -536,6 +613,7 @@  bool mem_resize(struct mem_region *region, void *mem, size_t len,
 {
 	struct alloc_hdr *hdr, *next;
 	struct free_hdr *f;
+	unsigned long new_sz;
 
 	/* This should be a constant. */
 	assert(is_rodata(location));
@@ -566,6 +644,15 @@  bool mem_resize(struct mem_region *region, void *mem, size_t len,
 	if (!next || !next->free || hdr->num_longs + next->num_longs < len)
 		return false;
 
+	new_sz = vm_map_limit(region, hdr, len * sizeof(long));
+	if (new_sz > region->vm_mapped_len) {
+		assert(region != &skiboot_heap);
+		vm_map_global(region->name,
+			region->start + region->vm_mapped_len,
+			new_sz - region->vm_mapped_len, true, false);
+		region->vm_mapped_len = new_sz;
+	}
+
 	/* OK, it's free and big enough, absorb it. */
 	f = (struct free_hdr *)next;
 	list_del_from(&region->free_list, &f->list);
@@ -691,6 +778,7 @@  static struct mem_region *new_region(const char *name,
 	region->name = name;
 	region->start = start;
 	region->len = len;
+	region->vm_mapped_len = 0;
 	region->node = node;
 	region->type = type;
 	region->free_list.n.next = NULL;
@@ -1199,6 +1287,7 @@  void mem_region_release_unused(void)
 			continue;
 
 		used_len = allocated_length(r);
+		assert(used_len <= r->vm_mapped_len);
 
 		prlog(PR_INFO, "    %s: %llu/%llu used\n",
 		       r->name, (long long)used_len, (long long)r->len);
@@ -1227,6 +1316,10 @@  void mem_region_release_unused(void)
 			}
 			list_add(&regions, &for_linux->list);
 		}
+		if (r->vm_mapped_len > used_len) {
+			vm_unmap_global(r->start + used_len, r->vm_mapped_len - used_len);
+			r->vm_mapped_len = used_len;
+		}
 	}
 	unlock(&mem_region_lock);
 }
@@ -1271,9 +1364,13 @@  static void mem_clear_range(uint64_t s, uint64_t e)
 		return;
 	}
 
-	prlog(PR_DEBUG, "Clearing region %llx-%llx\n",
-	      (long long)s, (long long)e);
+	/*
+	 * Large clear thrashes the small hash table, with parallel clearing
+	 * this can livelock. Clear in real mode.
+	 */
+	vm_exit();
 	memset((void *)s, 0, e - s);
+	vm_enter();
 }
 
 struct mem_region_clear_job_args {
diff --git a/core/opal.c b/core/opal.c
index 46518c445..9ab7391d1 100644
--- a/core/opal.c
+++ b/core/opal.c
@@ -44,19 +44,39 @@  static uint64_t opal_dynamic_events;
 extern uint32_t attn_trigger;
 extern uint32_t hir_trigger;
 
+void __opal_register(uint64_t token, void *func, unsigned int nargs)
+{
+	uint64_t f;
+	uint64_t *t;
+	u8 *a;
+
+	assert(token <= OPAL_LAST);
+
+	f = function_entry_address(func);
+
+	t = vm_map((unsigned long)&opal_branch_table[token], sizeof(*t), true);
+	*t = f;
+	vm_unmap((unsigned long)&opal_branch_table[token], sizeof(*t));
+
+	a = vm_map((unsigned long)&opal_num_args[token], sizeof(*a), true);
+	*a = nargs;
+	vm_unmap((unsigned long)&opal_num_args[token], sizeof(*a));
+}
 
 void opal_table_init(void)
 {
 	struct opal_table_entry *s = __opal_table_start;
 	struct opal_table_entry *e = __opal_table_end;
+	struct opal_table_entry *te;
+	size_t size = (void *)e - (void *)s;
 
 	prlog(PR_DEBUG, "OPAL table: %p .. %p, branch table: %p\n",
 	      s, e, opal_branch_table);
-	while(s < e) {
-		((uint64_t *)opal_branch_table)[s->token] = function_entry_address(s->func);
-		((u8 *)opal_num_args)[s->token] = s->nargs;
-		s++;
-	}
+
+	vm_map_global("OPAL table", (unsigned long)s, size, false, false);
+	for (te = s; te < e; te++)
+		__opal_register(te->token, te->func, te->nargs);
+	vm_unmap_global((unsigned long)s, size);
 }
 
 /* Called from head.S, thus no prototype */
@@ -317,14 +337,6 @@  int64_t opal_quiesce(uint32_t quiesce_type, int32_t cpu_target)
 }
 opal_call(OPAL_QUIESCE, opal_quiesce, 2);
 
-void __opal_register(uint64_t token, void *func, unsigned int nargs)
-{
-	assert(token <= OPAL_LAST);
-
-	((uint64_t *)opal_branch_table)[token] = function_entry_address(func);
-	((u8 *)opal_num_args)[token] = nargs;
-}
-
 /*
  * add_opal_firmware_exports_node: adds properties to the device-tree which
  * the OS will then change into sysfs nodes.
diff --git a/core/platform.c b/core/platform.c
index 8f4a3b877..839cf97ee 100644
--- a/core/platform.c
+++ b/core/platform.c
@@ -242,8 +242,10 @@  void set_bmc_platform(const struct bmc_platform *bmc)
 
 void probe_platform(void)
 {
-	struct platform *platforms = &__platforms_start;
-	unsigned int i;
+	struct platform *s = __platforms_start;
+	struct platform *e = __platforms_end;
+	struct platform *p;
+	size_t size = (void *)e - (void *)s;
 
 	/* Detect Manufacturing mode */
 	if (dt_find_property(dt_root, "ibm,manufacturing-mode")) {
@@ -257,12 +259,15 @@  void probe_platform(void)
 		manufacturing_mode = true;
 	}
 
-	for (i = 0; &platforms[i] < &__platforms_end; i++) {
-		if (platforms[i].probe && platforms[i].probe()) {
-			platform = platforms[i];
+	vm_map_global("Platform table", (unsigned long)s, size, false, false);
+	for (p = s; p < e; p++) {
+		if (p->probe && p->probe()) {
+			platform = *p;
 			break;
 		}
 	}
+	vm_unmap_global((unsigned long)s, size);
+
 	if (!platform.name) {
 		platform = generic_platform;
 		if (platform.probe)
diff --git a/core/vm.c b/core/vm.c
new file mode 100644
index 000000000..84534796c
--- /dev/null
+++ b/core/vm.c
@@ -0,0 +1,942 @@ 
+/* Copyright 2018 IBM Corp.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * 	http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <ccan/container_of/container_of.h>
+#include <ccan/list/list.h>
+#include <ccan/str/str.h>
+#include <cmpxchg.h>
+#include <cpu.h>
+#include <opal.h>
+#include <skiboot.h>
+#include <stack.h>
+#include <timebase.h>
+#include <trace.h>
+
+static bool vm_setup = false;
+static bool vm_globals_allocated = false;
+
+#define SLB_SZ			(256UL*1024*1024)
+#define SLB_NR			32
+#define LOCAL_SLB_NR		2
+#define GLOBAL_SLB_NR		(SLB_NR - LOCAL_SLB_NR)
+#define LOCAL_SLB_BASE		GLOBAL_SLB_NR
+
+#define LOCAL_EA_PERCPU		(SLB_SZ)
+#define LOCAL_EA_BEGIN		0x0008000000000000ULL
+#define LOCAL_EA_END		0x0009000000000000ULL
+
+static void __nomcount slb_install(unsigned long esid, unsigned long vsid, unsigned int index)
+{
+	unsigned long rs;
+	unsigned long rb;
+
+	rs = vsid << (63-51);		/* 256MB VSID */
+	rs |= 1UL << (63-53);		/* Kp = 1 */
+	if (PAGE_SIZE == 0x10000) {
+		rs |= 1UL << (63-55);		/* L = 1 */
+		rs |= 1UL << (63-59);		/* LP = 01 */
+	}
+
+	rb = esid << (63-35);		/* 256MB ESID */
+	rb |= 1UL << (63-36);		/* V = 1 */
+	rb |= index;
+
+	asm volatile("slbmte %0,%1" : : "r"(rs), "r"(rb) : "memory");
+}
+
+#if 0
+static void slb_remove(unsigned long esid)
+{
+	asm volatile("isync ; slbie %0 ; isync" : : "r"(esid << 28) : "memory");
+}
+#endif
+
+static void slb_remove_all(void)
+{
+	asm volatile("isync ; slbmte %0,%0 ; slbia ; isync" : : "r"(0) : "memory");
+}
+
+static void __nomcount slb_add(unsigned long ea)
+{
+	struct cpu_thread *cpu = this_cpu();
+	uint64_t esid = ea >> 28;
+	uint64_t vsid = ea >> 28;
+
+	slb_install(esid, vsid, cpu->vm_slb_rr);
+
+	cpu->vm_slb_rr++;
+	if (cpu->vm_slb_rr == GLOBAL_SLB_NR)
+		cpu->vm_slb_rr = 0;
+}
+
+struct hpte {
+	beint64_t dword[2];
+};
+
+struct hpteg {
+	struct hpte hpte[8];
+};
+
+static struct hpteg *htab;
+static unsigned long htab_shift;
+static unsigned long htab_pteg_mask;
+
+static struct lock htab_lock;
+
+static void __nomcount htab_install(unsigned long va, unsigned long pa, int rw, int ex, int ci, bool local)
+{
+	unsigned long hash;
+	struct hpteg *hpteg;
+	struct hpte *hpte;
+	unsigned long ava = va >> 23;
+	unsigned long arpn = pa >> 12;
+	unsigned long dw0, dw1;
+	unsigned long _dw0;
+	unsigned long _ava;
+	unsigned int hstart, hend;
+	unsigned int i;
+
+	if (PAGE_SIZE == 0x10000)
+		arpn >>= 4;
+
+	dw0 = ava << (63-56); /* AVA = ava */
+	dw0 |= 0x1; /* V = 1 */
+	if (PAGE_SIZE == 0x10000)
+		dw0 |= 0x4; /* L = 1 */
+	if (local)
+		dw0 |= 0x8; /* SW[0] = 1 */
+
+	if (PAGE_SIZE == 0x10000) {
+		dw1 = (arpn << (63-43 - 4)); /* ARPN||LP-4 = arpn */
+		dw1 |= (0x1 << (63-43 - 8)); /* LP = 0001 */
+	} else
+		dw1 = (arpn << (63-43 - 8)); /* ARPN||LP = arpn */
+	if (!rw)
+		dw1 |= (1UL << (63 - 0)) | (1UL << (63 - 63 + 1)); /* pp = 110 */
+	if (!ex)
+		dw1 |= (1UL << (63 - 61)); /* N = 1 */
+	dw1 |= (1UL << (63 - 60 + 1)); /* WIMG = 0010 */
+	if (ci)
+		dw1 |= (1UL << (63 - 60)) | (1UL << (63 - 60 + 2)); /* WIMG = 0111 */
+	dw1 |= (1UL << (63 - 55)) | (1UL << (63 - 56)); /* R=C=1 */
+
+	if (PAGE_SIZE == 0x10000)
+		hash = ((va >> 16) & 0xfff) ^ ((va >> 28) & 0x7fffffffffUL);
+	else
+		hash = ((va >> 12) & 0xffff) ^ ((va >> 28) & 0x7fffffffffUL);
+	hpteg = &htab[hash & htab_pteg_mask];
+
+	lock(&htab_lock);
+
+	hstart = 0;
+	hend = 7;
+
+	for (i = hstart; i <= hend; i++) {
+		hpte = &hpteg->hpte[i];
+
+		_dw0 = be64_to_cpu(hpte->dword[0]);
+		if (_dw0 & 1) {
+			_ava = _dw0 >> (63 - 56);
+			if (_ava == ava) {
+				assert(!local);
+				/* This could happen with racing global fault */
+				assert(dw0 == _dw0);
+				assert(dw1 == be64_to_cpu(hpte->dword[1]));
+				goto out;
+			}
+
+			continue;
+		}
+
+		assert(!_dw0);
+		goto install;
+	}
+
+	i = mftb();
+	i = (i ^ (i >> 4)) & 0x7;
+	hpte = &hpteg->hpte[i];
+
+install:
+	hpte->dword[1] = cpu_to_be64(dw1);
+	eieio();
+	hpte->dword[0] = cpu_to_be64(dw0);
+	asm volatile("ptesync" ::: "memory");
+out:
+	unlock(&htab_lock);
+}
+
+static void htab_remove(unsigned long va, int local)
+{
+	struct cpu_thread *c = this_cpu();
+	bool vm_setup = c->vm_setup;
+	unsigned long hash;
+	struct hpteg *hpteg;
+	unsigned long ava = va >> 23;
+	unsigned long dw0;
+	unsigned long rb;
+	unsigned int hstart, hend;
+	unsigned int i;
+
+	dw0 = ava << (63-56);
+	dw0 |= 0x1;
+	if (PAGE_SIZE == 0x10000)
+		dw0 |= 0x4;
+	if (local)
+		dw0 |= 0x8;
+
+	if (PAGE_SIZE == 0x10000)
+		hash = ((va >> 16) & 0xfff) ^ ((va >> 28) & 0x7fffffffffUL);
+	else
+		hash = ((va >> 12) & 0xffff) ^ ((va >> 28) & 0x7fffffffffUL);
+	hpteg = &htab[hash & htab_pteg_mask];
+
+	if (vm_setup)
+		vm_exit();
+	lock(&htab_lock);
+	hstart = 0;
+	hend = 7;
+
+	for (i = hstart; i <= hend; i++) {
+		struct hpte *hpte = &hpteg->hpte[i];
+		beint64_t _raw_dw0;
+		uint64_t _dw0;
+
+		_raw_dw0 = hpte->dword[0];
+		_dw0 = be64_to_cpu(_raw_dw0);
+
+		if (!(_dw0 & 1)) {
+			assert(!_raw_dw0);
+			continue;
+		}
+
+		if (_dw0 != dw0)
+			continue;
+
+		hpte->dword[0] = 0;
+		eieio();
+		hpte->dword[1] = 0;
+
+		break;
+	}
+
+	if (PAGE_SIZE == 0x10000) {
+		rb = (va >> 16) << (63 - 47); /* AVA||LP-4 */
+		rb |= 0x1 << (63 - 51); /* LP=0001 */
+		rb |= 0x1; /* L=1 */
+	} else {
+		rb = va & ~0xfffUL;
+	}
+
+	unlock(&htab_lock);
+
+	if (vm_setup)
+		vm_enter();
+
+	if (local) {
+		asm volatile("ptesync" ::: "memory");
+		asm volatile("tlbiel %0" : : "r"(rb));
+		asm volatile("ptesync" ::: "memory");
+	} else {
+		asm volatile("ptesync" ::: "memory");
+		asm volatile("tlbie %0,%1" : : "r"(rb), "r"(0));
+		asm volatile("eieio ; tlbsync ; ptesync" ::: "memory");
+
+	}
+}
+
+/*
+ * Try to fix problems in callers if !strict.
+ */
+static bool vm_strict = false;
+
+static struct list_head vm_maps = LIST_HEAD_INIT(vm_maps);
+static struct lock vm_maps_lock;
+static unsigned long nr_vm_maps;
+
+static void __vm_map(const char *name, unsigned long addr, unsigned long len, unsigned long pa, bool r, bool w, bool x, bool ci, bool local)
+{
+	struct cpu_thread *c = this_cpu();
+	bool vm_setup = c->vm_setup;
+	struct vm_map *new;
+	struct vm_map *vmm;
+
+	if (local) {
+		new = &c->vm_local_map;
+		new->name = name;
+		new->address = addr;
+		new->length = len;
+		new->pa = pa;
+		new->readable = r;
+		new->writeable = w;
+		new->executable = x;
+		new->ci = ci;
+
+		return;
+	}
+
+	new = zalloc(sizeof(*new));
+	assert(new);
+
+	new->name = name;
+	new->address = addr;
+	new->length = len;
+	new->pa = pa;
+	new->readable = r;
+	new->writeable = w;
+	new->executable = x;
+	new->ci = ci;
+
+	/* Can not take a d-side fault while holding this lock */
+	if (vm_setup)
+		vm_exit();
+	lock(&vm_maps_lock);
+
+	list_for_each(&vm_maps, vmm, list) {
+		unsigned long ps = addr & ~(PAGE_SIZE - 1);
+		unsigned long pe = (addr + len + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1);
+		unsigned long vmm_ps = vmm->address & ~(PAGE_SIZE - 1);
+		unsigned long vmm_pe = (vmm->address + vmm->length + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1);
+		bool mergeable = false;
+		bool samepage = false;
+
+		/* Ensure no overlap */
+		assert(addr + len <= vmm->address || addr >= vmm->address + vmm->length);
+
+		if (ps > vmm_pe)
+			continue; /* Sort */
+		if (pe < vmm_ps) {
+			/* Not same or adjacent page is easy */
+			list_add_before(&vm_maps, &new->list, &vmm->list);
+			goto found;
+		}
+		if (pe > vmm_ps || ps < vmm_pe)
+			samepage = true;
+
+		mergeable =	/* XXX: check pa */ 1 &&
+				(vmm->ci == ci) &&
+				(vmm->readable == r) &&
+				(vmm->writeable == w) &&
+				(vmm->executable == x);
+		samepage = false; /* NOTE(review): unconditionally clears samepage, so the mismatch check below never runs - confirm this is intended */
+
+		if (samepage && !mergeable) {
+			printf("VMM: %s (%lx-%lx) mismatched permissions with same page mapping %s (%llx-%llx)\n", name, addr, addr + len, vmm->name, vmm->address, vmm->address + vmm->length);
+			assert(vmm->pa == pa);
+			assert(vmm->ci == ci);
+			assert(vmm->readable == r);
+			assert(vmm->writeable == w);
+			assert(vmm->executable == x);
+		}
+
+		if (!strcmp(name, vmm->name) && mergeable) {
+			if (addr == vmm->address + vmm->length) {
+				free(new);
+				vmm->length += len;
+				goto done;
+			}
+
+			if (addr + len == vmm->address) {
+				free(new);
+				vmm->address = addr;
+				vmm->pa = pa;
+				vmm->length += len;
+				goto done;
+			}
+		}
+
+		if (addr >= vmm->address + vmm->length)
+			continue;
+		if (addr + len <= vmm->address) {
+			list_add_before(&vm_maps, &new->list, &vmm->list);
+			goto found;
+		}
+
+		assert(0);
+	}
+	list_add_tail(&vm_maps, &new->list);
+found:
+	nr_vm_maps++;
+done:
+	unlock(&vm_maps_lock);
+	if (vm_setup)
+		vm_enter();
+}
+
+static void __vm_unmap(unsigned long addr, unsigned long len, bool local)
+{
+	struct cpu_thread *c = this_cpu();
+	bool vm_setup = c->vm_setup;
+	unsigned long end = addr + len;
+	struct vm_map *vmm, *to_free = NULL;
+
+	if (local) {
+		vmm = &c->vm_local_map;
+		assert(addr == vmm->address);
+		assert(len == vmm->length);
+		memset(vmm, 0, sizeof(struct vm_map));
+
+		if (vm_setup) {
+			while (addr < end) {
+				htab_remove(addr, local);
+				addr += PAGE_SIZE;
+			}
+		}
+
+		return;
+	}
+
+	/* Can not take a d-side fault while holding this lock */
+	if (vm_setup)
+		vm_exit();
+	lock(&vm_maps_lock);
+	list_for_each(&vm_maps, vmm, list) {
+		struct vm_map *new;
+
+		if (addr + len <= vmm->address)
+			continue;
+		if (addr >= vmm->address + vmm->length)
+			continue;
+		if (addr == vmm->address && len == vmm->length) {
+			to_free = vmm;
+			goto found;
+		}
+
+		if (addr == vmm->address) {
+			vmm->address += len;
+			vmm->pa += len;
+			vmm->length -= len;
+			goto done;
+		}
+
+		if (addr + len == vmm->address + vmm->length) {
+			vmm->length -= len;
+			goto done;
+		}
+
+		/* Unmaps will never span multiple because they always apply to a previous map, so this is a split */
+		new = zalloc(sizeof(*new));
+		assert(new);
+		memcpy(new, vmm, sizeof(*new));
+		list_add_before(&vm_maps, &new->list, &vmm->list);
+		nr_vm_maps++;
+
+		new->length = addr - new->address;
+		vmm->address += new->length + len;
+		vmm->pa += new->length + len;
+		vmm->length -= new->length + len;
+		goto done;
+	}
+	vmm = NULL;
+	unlock(&vm_maps_lock);
+	if (!vm_strict) {
+		prerror("unmap didn't find anything\n");
+		backtrace();
+		goto out;
+	}
+	assert(0);
+
+found:
+	list_del(&vmm->list);
+	nr_vm_maps--;
+done:
+	if (vm_setup) {
+		while (addr < end) {
+			htab_remove(addr, local);
+			addr += PAGE_SIZE;
+		}
+	}
+
+	unlock(&vm_maps_lock);
+out:
+	if (vm_setup)
+		vm_enter();
+
+	if (to_free)
+		free(to_free);
+}
+
+
+void vm_map_global(const char *name, unsigned long addr, unsigned long len, bool rw, bool ci)
+{
+	assert(this_cpu()->state != cpu_state_os);
+	__vm_map(name, addr, len, addr, true, rw, false, ci, false);
+}
+
+void vm_map_global_text(const char *name, unsigned long addr, unsigned long len)
+{
+	assert(this_cpu()->state != cpu_state_os);
+	__vm_map(name, addr, len, addr, true, false, true, false, false);
+}
+
+void vm_unmap_global(unsigned long addr, unsigned long len)
+{
+	assert(this_cpu()->state != cpu_state_os);
+	__vm_unmap(addr, len, false);
+}
+
+
+void *vm_map(unsigned long addr, unsigned long len, bool rw)
+{
+	struct cpu_thread *c = this_cpu();
+	unsigned long newaddr;
+	unsigned long end;
+	unsigned long offset = addr & (PAGE_SIZE - 1);
+
+	end = (addr + len + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1);
+	addr &= ~(PAGE_SIZE - 1);
+	len = end - addr;
+
+	assert(len <= LOCAL_EA_PERCPU);
+
+	/* Can't do nested mappings */
+	assert(!c->vm_local_map_inuse);
+	c->vm_local_map_inuse = true;
+
+	if (c->vm_setup) {
+		newaddr = LOCAL_EA_BEGIN + LOCAL_EA_PERCPU * c->pir;
+		__vm_map("local", newaddr, len, addr, true, rw, false, false, true);
+	} else {
+		newaddr = addr;
+	}
+
+	return (void *)newaddr + offset;
+}
+
+void vm_unmap(unsigned long addr, unsigned long len)
+{
+	struct cpu_thread *c = this_cpu();
+	unsigned long newaddr;
+	unsigned long end;
+
+	end = (addr + len + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1);
+	addr &= ~(PAGE_SIZE - 1);
+	len = end - addr;
+
+	assert(len <= LOCAL_EA_PERCPU);
+
+	assert(c->vm_local_map_inuse);
+	c->vm_local_map_inuse = false;
+
+	if (c->vm_setup) {
+		newaddr = LOCAL_EA_BEGIN + LOCAL_EA_PERCPU * c->pir;
+		__vm_unmap(newaddr, len, true);
+	}
+}
+
+struct prte {
+	beint64_t dword[2];
+};
+
+static struct prte *prtab;
+static unsigned long old_lpcr;
+static unsigned long new_lpcr;
+
+static void vm_init_cpu(void)
+{
+	struct cpu_thread *c = this_cpu();
+	unsigned long ea = LOCAL_EA_BEGIN + LOCAL_EA_PERCPU * c->pir;
+	unsigned long esid = ea >> 28;
+	unsigned long vsid = ea >> 28;
+
+	mtspr(SPR_LPCR, new_lpcr);
+
+	mtspr(SPR_LPID, 0);
+	mtspr(SPR_PID, 0);
+	mtspr(SPR_HRMOR, 0);
+	mtspr(SPR_PTCR, (unsigned long)prtab);
+	mtspr(SPR_AMR, 0);
+	mtspr(SPR_IAMR, 0);
+	mtspr(SPR_AMOR, 0);
+	mtspr(SPR_UAMOR, 0);
+
+	slb_remove_all();
+	slb_install(esid, vsid, LOCAL_SLB_BASE);
+}
+
+void vm_init_secondary(void)
+{
+	vm_init_cpu();
+	vm_enter();
+}
+
+bool vm_realmode(void)
+{
+	struct cpu_thread *c = this_cpu();
+
+	return !vm_setup || !c->vm_setup;
+}
+
+void vm_enter(void)
+{
+	struct cpu_thread *c = this_cpu();
+
+	assert(vm_setup);
+	if (c->vm_setup) {
+		prerror("CPU:%d vm_enter already entered\n", c->pir);
+		backtrace();
+	}
+	if (c->vm_local_map_inuse) {
+		prerror("CPU:%d vm_enter local map inuse\n", c->pir);
+		backtrace();
+	}
+
+	c->vm_setup = true;
+	mtmsr(mfmsr() | (MSR_IR|MSR_DR));
+}
+
+void vm_exit(void)
+{
+	struct cpu_thread *c = this_cpu();
+
+	assert(vm_setup);
+	if (!c->vm_setup) {
+		prerror("CPU:%d vm_exit already exited\n", c->pir);
+		backtrace();
+	}
+	if (c->vm_local_map_inuse) {
+		prerror("CPU:%d vm_exit local map inuse\n", c->pir);
+		backtrace();
+	}
+	c->vm_setup = false; /* record that this CPU no longer runs translated */
+	mtmsr(mfmsr() & ~(MSR_IR|MSR_DR)); /* clear the MSR IR and DR bits */
+}
+
+bool __nomcount vm_dslb(uint64_t nia, uint64_t dar)
+{
+	/*
+	 * Per-cpu map ranges are bolted to per-cpu SLBs.
+	 */
+	assert((dar < LOCAL_EA_BEGIN) ||
+		(dar >= LOCAL_EA_END));
+
+	(void)nia;
+	slb_add(dar);
+
+	return true;
+}
+
+bool __nomcount vm_islb(uint64_t nia)
+{
+	slb_add(nia);
+
+	return true;
+}
+
+bool __nomcount vm_dsi(uint64_t nia, uint64_t dar, uint32_t dsisr)
+{
+	struct cpu_thread *c = this_cpu();
+	struct vm_map *vmm;
+	uint64_t pa;
+	bool store = !!(dsisr & DSISR_ISSTORE);
+	bool ret = true;
+	bool local;
+
+	if (dsisr & 0xbdffffffU) {
+		printf("Page fault bad dsisr at 0x%016llx dar=0x%016llx dsisr=0x%08x\n", nia, dar, dsisr);
+		return false;
+	}
+
+	if ((dar >= LOCAL_EA_BEGIN) && (dar < LOCAL_EA_END)) {
+		local = true;
+		vmm = &c->vm_local_map;
+		if (dar >= vmm->address && dar < vmm->address + vmm->length)
+			goto found;
+		goto not_found;
+	}
+
+	local = false;
+
+	lock(&vm_maps_lock);
+	list_for_each(&vm_maps, vmm, list) {
+		assert(vmm->pa == vmm->address);
+		if (dar >= vmm->address && dar < vmm->address + vmm->length)
+			goto found;
+	}
+	if (!vm_strict) {
+		if (dar >= 0x0006000000000000 && dar < 0x0007000000000000)
+			/* MMIO */
+			htab_install(dar, dar, 1, 0, 1, false);
+		else if (dar < LOCAL_EA_BEGIN)
+			htab_install(dar, dar, 1, 0, 0, false);
+		else
+			ret = false;
+		unlock(&vm_maps_lock);
+		prerror("Page fault with no VMM at NIA:0x%016llx DAR:0x%016llx, store:%d\n", nia, dar, store);
+		backtrace();
+		list_for_each(&vm_maps, vmm, list)
+			prlog(PR_DEBUG, "%28s 0x%08llx-0x%08llx\n", vmm->name,
+				vmm->address, vmm->address + vmm->length);
+		goto out;
+	}
+	unlock(&vm_maps_lock);
+not_found:
+	prerror("  vmm not found\n");
+	ret = false;
+	assert(0);
+	goto out;
+
+found:
+	pa = vmm->pa + (dar & ~(PAGE_SIZE - 1)) - vmm->address;
+	if (!vmm->readable) {
+		if (!local)
+			unlock(&vm_maps_lock);
+		prerror("  vmm not readable\n");
+		ret = false;
+		assert(0);
+		goto out;
+	}
+	if (store && !vmm->writeable) {
+		if (!vm_strict) {
+			htab_install(dar, pa, store, 0, vmm->ci, local);
+			if (!local)
+				unlock(&vm_maps_lock);
+			prerror("Page fault store to RO VMM:%s at NIA:0x%016llx DAR:0x%016llx\n", vmm->name, nia, dar);
+			backtrace();
+			goto out;
+		}
+		if (!local)
+			unlock(&vm_maps_lock);
+		prerror("  vmm not writeable\n");
+		ret = false;
+		assert(0);
+		goto out;
+	}
+
+	htab_install(dar, pa, vmm->writeable, vmm->executable, vmm->ci, local);
+	if (!local)
+		unlock(&vm_maps_lock);
+
+out:
+	return ret;
+}
+
+bool __nomcount vm_isi(uint64_t nia)
+{
+	struct vm_map *vmm;
+
+	lock(&vm_maps_lock);
+	list_for_each(&vm_maps, vmm, list) {
+		assert(vmm->pa == vmm->address);
+		if (nia >= vmm->address && nia < vmm->address + vmm->length) {
+			if (!vmm->executable)
+				prerror("Page fault at NIA:0x%016llx NX mapping!\n", nia);
+			goto found;
+		}
+	}
+
+	prerror("Page fault, no mapping for NIA:0x%016llx !\n", nia);
+
+found:
+	unlock(&vm_maps_lock);
+	htab_install(nia, nia, 0, 1, 0, false);
+
+	return true;
+}
+
+static void cpu_stop_vm(void *arg __unused)
+{
+	vm_exit();
+}
+
+static void cpu_cleanup_vm(void *arg __unused)
+{
+	slb_remove_all();
+	mtspr(SPR_PTCR, 0);
+	mtspr(SPR_LPCR, old_lpcr);
+}
+
+static void cpu_all_destroy_vm(void)
+{
+	struct cpu_thread *cpu;
+	struct cpu_job **jobs;
+
+	jobs = zalloc(sizeof(struct cpu_job *) * (cpu_max_pir + 1)); /* indexed by PIR, 0..cpu_max_pir */
+	assert(jobs);
+
+	/* Stop all CPUs */
+	for_each_available_cpu(cpu) {
+		if (cpu == this_cpu())
+			continue;
+		jobs[cpu->pir] = cpu_queue_job(cpu, "cpu_stop_vm",
+						cpu_stop_vm, NULL);
+	}
+
+	/* this cpu */
+	cpu_stop_vm(NULL);
+
+	/* Cleanup only after all CPUs have stopped */
+	for_each_available_cpu(cpu) {
+		if (jobs[cpu->pir])
+			cpu_wait_job(jobs[cpu->pir], true);
+	}
+
+	for_each_available_cpu(cpu) {
+		if (cpu == this_cpu())
+			continue;
+		jobs[cpu->pir] = cpu_queue_job(cpu, "cpu_cleanup_vm",
+						cpu_cleanup_vm, NULL);
+	}
+
+	/* this cpu */
+	cpu_cleanup_vm(NULL);
+
+	for_each_available_cpu(cpu) {
+		if (jobs[cpu->pir])
+			cpu_wait_job(jobs[cpu->pir], true);
+	}
+
+	free(jobs);
+
+	cleanup_global_tlb();
+}
+
+static void print_maps(void)
+{
+	struct vm_map *vmm;
+
+	prlog(PR_DEBUG, " %lu Global mappings\n", nr_vm_maps);
+	list_for_each(&vm_maps, vmm, list) {
+		prlog(PR_DEBUG, "%28s 0x%08llx-0x%08llx\n", vmm->name,
+			vmm->address, vmm->address + vmm->length);
+	}
+}
+
+void vm_init(bool fast_reboot)
+{
+	unsigned long stack_start = SKIBOOT_BASE + SKIBOOT_SIZE;
+	unsigned long stack_end = stack_start + (cpu_max_pir + 1)*STACK_SIZE;
+	unsigned long sym_start = (unsigned long)__sym_map_start;
+	unsigned long sym_size = (unsigned long)__sym_map_end - sym_start;
+	unsigned long htab_nr_bytes;
+	unsigned long htab_nr_ptegs;
+
+	old_lpcr = mfspr(SPR_LPCR);
+	new_lpcr = (old_lpcr & ~(PPC_BITMASK(0,3) | PPC_BIT(41) | PPC_BIT(43)))
+								| PPC_BIT(54);
+
+	prtab = memalign(64*1024, 64*1024);
+	assert(prtab);
+	memset(prtab, 0, 64*1024);
+
+	htab_shift = 18; /* 256kB table */
+	htab_nr_bytes = 1UL << htab_shift;
+	htab_nr_ptegs = htab_nr_bytes / sizeof(struct hpteg);
+	htab_pteg_mask = htab_nr_ptegs - 1;
+	htab = memalign(1UL << htab_shift, htab_nr_bytes);
+	assert(htab);
+	memset(htab, 0, htab_nr_bytes);
+
+	prtab[0].dword[0] = cpu_to_be64((unsigned long)htab | (htab_shift - 18));
+	prtab[0].dword[1] = 0;
+
+	eieio();
+
+	vm_init_cpu();
+
+	cleanup_global_tlb();
+
+	if (vm_globals_allocated) {
+		assert(fast_reboot);
+		goto done;
+	}
+
+	assert(!fast_reboot);
+	vm_globals_allocated = true;
+
+	vm_map_global_text("OPAL text", (unsigned long)_stext,
+			   (unsigned long)_etext - (unsigned long)_stext);
+	vm_map_global("OPAL rodata", (unsigned long)__rodata_start,
+		      (unsigned long)__vm_mapped_romem_end - (unsigned long)__rodata_start,
+		      false, false);
+	vm_map_global("OPAL data", (unsigned long)_sdata,
+		      (unsigned long)_edata - (unsigned long)_sdata,
+		      true, false);
+	vm_map_global("OPAL symbols", sym_start, sym_size, false, false);
+	vm_map_global("OPAL bss", (unsigned long)_sbss,
+		      (unsigned long)_ebss - (unsigned long)_sbss,
+		      true, false);
+	vm_map_global("OPAL heap", HEAP_BASE, HEAP_SIZE, true, false);
+	vm_map_global("Memory console", INMEM_CON_START, INMEM_CON_LEN, true, false);
+	vm_map_global("Hostboot console", HBRT_CON_START, HBRT_CON_LEN, false, false);
+	vm_map_global("SPIRA heap", SPIRA_HEAP_BASE, SPIRA_HEAP_SIZE, false, false);
+	vm_map_global("PSI TCE table", PSI_TCE_TABLE_BASE, PSI_TCE_TABLE_SIZE, false, false);
+	vm_map_global("OPAL boot stacks", stack_start, stack_end - stack_start, true, false);
+
+done:
+	prlog(PR_DEBUG, "VMM: SETUP\n");
+	prlog(PR_DEBUG, " PRTAB:%p\n", prtab);
+	prlog(PR_DEBUG, " HTAB: %p\n", htab);
+	print_maps();
+
+	vm_setup = true;
+
+	vm_enter();
+}
+
+void vm_init_stacks(void)
+{
+	unsigned long stack_start = SKIBOOT_BASE + SKIBOOT_SIZE;
+	unsigned long stack_end = stack_start + (cpu_max_pir + 1)*STACK_SIZE;
+	struct cpu_thread *c = this_cpu();
+	struct vm_map *vmm;
+
+	/* Can not take a d-side fault while holding this lock */
+	if (c->vm_setup)
+		mtmsr(mfmsr() & ~MSR_DR);
+	lock(&vm_maps_lock);
+	list_for_each(&vm_maps, vmm, list) {
+		if (vmm->address >= stack_end)
+			continue;
+		if (vmm->address + vmm->length <= stack_start)
+			continue;
+		goto found;
+	}
+	unlock(&vm_maps_lock);
+	assert(0);
+
+found:
+	vmm->name = "OPAL stacks"; /* rename and resize the first map overlapping the stack range */
+	vmm->address = stack_start;
+	vmm->length = stack_end - stack_start;
+	unlock(&vm_maps_lock);
+	if (c->vm_setup)
+		mtmsr(mfmsr() | MSR_DR);
+}
+
+void vm_destroy(void)
+{
+	assert(vm_setup);
+
+	prlog(PR_DEBUG, "VMM: TEARDOWN\n");
+	print_maps();
+
+	cpu_all_destroy_vm();
+
+	vm_setup = false;
+
+	if (0) { /* XXX: leave for VMM enabled fast-reboot */
+		while (!list_empty(&vm_maps)) {
+			struct vm_map *vmm;
+			vmm = list_pop(&vm_maps, struct vm_map, list);
+			free(vmm);
+		}
+	}
+
+	free(htab);
+	htab = NULL;
+	free(prtab);
+	prtab = NULL;
+}
diff --git a/hdata/spira.c b/hdata/spira.c
index 35d6109d3..870903bd8 100644
--- a/hdata/spira.c
+++ b/hdata/spira.c
@@ -1703,11 +1703,20 @@  static void fixup_spira(void)
 static void update_spirah_addr(void)
 {
 #if !defined(TEST)
+	beint64_t *spirah_offset;
+	beint64_t *spira_offset;
+
 	if (proc_gen < proc_gen_p9)
 		return;
 
-	naca.spirah_addr = CPU_TO_BE64(SPIRAH_OFF);
-	naca.spira_addr = CPU_TO_BE64(SPIRA_OFF);
+	spirah_offset = vm_map((u64)&naca, sizeof(u64), true);
+	*spirah_offset = CPU_TO_BE64(SPIRAH_OFF);
+	vm_unmap((unsigned long)spirah_offset, sizeof(u64));
+
+	spira_offset = vm_map((u64)&naca + 0x30, sizeof(u64), true);
+	*spira_offset = CPU_TO_BE64(SPIRA_OFF);
+	vm_unmap((unsigned long)spira_offset, sizeof(u64));
+
 	spirah.ntuples.hs_data_area.addr = CPU_TO_BE64(SPIRA_HEAP_BASE - SKIBOOT_BASE);
 	spirah.ntuples.mdump_res.addr = CPU_TO_BE64(MDRT_TABLE_BASE - SKIBOOT_BASE);
 #endif
@@ -1715,13 +1724,24 @@  static void update_spirah_addr(void)
 
 int parse_hdat(bool is_opal)
 {
+	int ret = 0;
+
 	cpu_type = PVR_TYPE(mfspr(SPR_PVR));
 
 	prlog(PR_DEBUG, "Parsing HDAT...\n");
 
+	vm_map_global("SPIRA", SKIBOOT_BASE + SPIRA_OFF, sizeof(spira), true, false);
 	fixup_spira();
+	vm_unmap_global(SKIBOOT_BASE + SPIRA_OFF, sizeof(spira));
 
+	vm_map_global("SPIRA-H", SKIBOOT_BASE + SPIRAH_OFF, sizeof(spirah), true, false);
 	update_spirah_addr();
+	vm_unmap_global(SKIBOOT_BASE + SPIRAH_OFF, sizeof(spirah));
+
+	/* Downgrade to read-only */
+
+	vm_map_global("SPIRA", SKIBOOT_BASE + SPIRA_OFF, sizeof(spira), false, false);
+	vm_map_global("SPIRA-H", SKIBOOT_BASE + SPIRAH_OFF, sizeof(spirah), false, false);
 
 	/*
 	 * Basic DT root stuff
@@ -1742,8 +1762,10 @@  int parse_hdat(bool is_opal)
 	dt_init_led_node();
 
 	/* Parse PCIA */
-	if (!pcia_parse())
-		return -1;
+	if (!pcia_parse()) {
+		ret = -1;
+		goto out;
+	}
 
 	/* IPL params */
 	add_iplparams();
@@ -1789,6 +1811,9 @@  int parse_hdat(bool is_opal)
 		node_stb_parse();
 
 	prlog(PR_DEBUG, "Parsing HDAT...done\n");
+out:
+	vm_unmap_global(SKIBOOT_BASE + SPIRA_OFF, sizeof(spira));
+	vm_unmap_global(SKIBOOT_BASE + SPIRAH_OFF, sizeof(spirah));
 
-	return 0;
+	return ret;
 }
diff --git a/hw/fake-nvram.c b/hw/fake-nvram.c
index 44adde4a3..d1ed62e9e 100644
--- a/hw/fake-nvram.c
+++ b/hw/fake-nvram.c
@@ -23,12 +23,16 @@  int fake_nvram_info(uint32_t *total_size)
 
 int fake_nvram_start_read(void *dst, uint32_t src, uint32_t len)
 {
+	void *t;
+
 	if (!nvram_region)
 		return -ENODEV;
 
+	t = vm_map(nvram_region->start + src, len, false);
 	lock(&fake_nvram_lock);
-	memcpy(dst, (void *) (nvram_region->start + src), len);
+	memcpy(dst, t, len);
 	unlock(&fake_nvram_lock);
+	vm_unmap(nvram_region->start + src, len);
 
 	nvram_read_complete(true);
 
@@ -37,12 +41,16 @@  int fake_nvram_start_read(void *dst, uint32_t src, uint32_t len)
 
 int fake_nvram_write(uint32_t offset, void *src, uint32_t size)
 {
+	void *t;
+
 	if (!nvram_region)
 		return OPAL_HARDWARE;
 
+	t = vm_map(nvram_region->start + offset, size, true);
 	lock(&fake_nvram_lock);
-	memcpy((void *) (nvram_region->start + offset), src, size);
+	memcpy(t, src, size);
 	unlock(&fake_nvram_lock);
+	vm_unmap(nvram_region->start + offset, size);
 
 	return 0;
 }
diff --git a/hw/homer.c b/hw/homer.c
index c5dbd58e3..58d629d23 100644
--- a/hw/homer.c
+++ b/hw/homer.c
@@ -108,6 +108,9 @@  static void homer_init_chip(struct proc_chip *chip)
 
 		chip->homer_base = hbase;
 		chip->homer_size = hsize;
+		/* slw late init and xive late init want to write to HOMER */
+		/* XXX: make it read only until then? */
+		vm_map_global("HOMER Image", hbase, hsize, true, false);
 	}
 
 	/*
@@ -134,13 +137,21 @@  static void homer_init_chip(struct proc_chip *chip)
 		chip->slw_base = sbase;
 		chip->slw_bar_size = ssize;
 		chip->slw_image_size = ssize; /* will be adjusted later */
+		/* XXX */
 	}
 
 	if (read_pba_bar(chip, bar_occ_common, &obase, &osize)) {
-		prlog(PR_DEBUG, "  OCC Common Area at 0x%llx size %lldMB\n",
-		      obase, osize / 0x100000);
+		static uint64_t homer_obase = 0;
+
 		chip->occ_common_base = obase;
 		chip->occ_common_size = osize;
+
+		prlog(PR_DEBUG, "  OCC Common Area at 0x%llx size %lldMB\n",
+		      obase, osize / 0x100000);
+		if (obase != homer_obase) {
+			vm_map_global("OCC Common Area", obase, osize, false, false);
+			homer_obase = obase;
+		}
 	}
 }
 
diff --git a/hw/lpc-uart.c b/hw/lpc-uart.c
index 979a617c3..898fc4b1c 100644
--- a/hw/lpc-uart.c
+++ b/hw/lpc-uart.c
@@ -59,7 +59,7 @@  static uint32_t uart_base;
 static bool has_irq = false, irq_ok, rx_full, tx_full;
 static uint8_t tx_room;
 static uint8_t cached_ier;
-static void *mmio_uart_base;
+void *mmio_uart_base;
 static int uart_console_policy = UART_CONSOLE_OPAL;
 static int lpc_irq = -1;
 
@@ -591,6 +591,8 @@  void early_uart_init(void)
 	if (!mmio_uart_base)
 		return;
 
+	vm_map_global("UART MMIO", (unsigned long)mmio_uart_base, 8, true, true);
+
 	clk = dt_prop_get_u32(uart_node, "clock-frequency");
 	baud = dt_prop_get_u32(uart_node, "current-speed");
 
@@ -599,6 +601,7 @@  void early_uart_init(void)
 		prlog(PR_DEBUG, "UART: Using UART at %p\n", mmio_uart_base);
 	} else {
 		prerror("UART: Early init failed!");
+		vm_unmap_global((unsigned long)mmio_uart_base, 8);
 		mmio_uart_base = NULL;
 	}
 }
@@ -610,9 +613,6 @@  void uart_init(void)
 	char *path __unused;
 	const be32 *irqp;
 
-	/* Clean up after early_uart_init() */
-	mmio_uart_base = NULL;
-
 	/* UART lock is in the console path and thus must block
 	 * printf re-entrancy
 	 */
@@ -630,13 +630,28 @@  void uart_init(void)
 	 * directly mapped UARTs in simulation environments
 	 */
 	if (n->parent == dt_root) {
+		void *base;
+
 		printf("UART: Found at root !\n");
-		mmio_uart_base = (void *)dt_translate_address(n, 0, NULL);
-		if (!mmio_uart_base) {
+
+		base = (void *)dt_translate_address(n, 0, NULL);
+		if (!base) {
 			printf("UART: Failed to translate address !\n");
 			return;
 		}
 
+		if (mmio_uart_base != base) {
+			void *old;
+
+			vm_map_global("UART MMIO", (unsigned long)base, 8, true, true);
+			old = mmio_uart_base;
+			mmio_uart_base = base;
+
+			/* Clean up after early_uart_init() */
+			if (old)
+				vm_unmap_global((unsigned long)old, 8);
+		}
+
 		/* If it has an interrupt properly, we consider this to be
 		 * a direct XICS/XIVE interrupt
 		 */
@@ -665,6 +680,11 @@  void uart_init(void)
 			lpc_irq = be32_to_cpu(*irqp);
 			prlog(PR_DEBUG, "UART: Using LPC IRQ %d\n", lpc_irq);
 		}
+
+		if (mmio_uart_base) {
+//			vm_unmap_global((unsigned long)mmio_uart_base, 8);
+			mmio_uart_base = NULL;
+		}
 	}
 
 
diff --git a/hw/lpc.c b/hw/lpc.c
index c2a07a0db..cb2fed2a2 100644
--- a/hw/lpc.c
+++ b/hw/lpc.c
@@ -1239,6 +1239,7 @@  static void lpc_init_chip_p8(struct dt_node *xn)
 	chip->lpc = lpc;
 }
 
+extern void *mmio_uart_base; /* defined in hw/lpc-uart.c; declare only to avoid a duplicate definition */
 static void lpc_init_chip_p9(struct dt_node *opb_node)
 {
 	uint32_t gcid = dt_get_chip_id(opb_node);
@@ -1261,6 +1262,11 @@  static void lpc_init_chip_p9(struct dt_node *opb_node)
 	if (!lpc_node)
 		return;
 
+
+	if (mmio_uart_base)
+		vm_unmap_global((unsigned long)mmio_uart_base, 8);
+	vm_map_global("LPC MMIO", addr, 0x100000000UL /* XXX: size? */, true, true);
+
 	lpc = zalloc(sizeof(struct lpcm));
 	assert(lpc);
 	lpc->chip_id = gcid;
diff --git a/hw/phb4.c b/hw/phb4.c
index 60e797cf6..2447c6722 100644
--- a/hw/phb4.c
+++ b/hw/phb4.c
@@ -5830,6 +5830,7 @@  static void phb4_probe_stack(struct dt_node *stk_node, uint32_t pec_index,
 	uint64_t val, phb_bar = 0, irq_bar = 0, bar_en;
 	uint64_t mmio0_bar = 0, mmio0_bmask, mmio0_sz;
 	uint64_t mmio1_bar = 0, mmio1_bmask, mmio1_sz;
+	uint64_t bar_sz;
 	void *foo;
 	__be64 mmio_win[4];
 	unsigned int mmio_win_sz;
@@ -5858,7 +5859,8 @@  static void phb4_probe_stack(struct dt_node *stk_node, uint32_t pec_index,
 	bar_en = 0;
 
 	/* Initialize PHB register BAR */
-	phys_map_get(gcid, PHB4_REG_SPC, phb_num, &phb_bar, NULL);
+	phys_map_get(gcid, PHB4_REG_SPC, phb_num, &phb_bar, &bar_sz);
+	vm_map_global("PHB REGS", phb_bar, bar_sz, true, true);
 	rc = xscom_write(gcid, nest_stack + XPEC_NEST_STK_PHB_REG_BAR,
 			 phb_bar << 8);
 
@@ -5872,18 +5874,21 @@  static void phb4_probe_stack(struct dt_node *stk_node, uint32_t pec_index,
 	bar_en |= XPEC_NEST_STK_BAR_EN_PHB;
 
 	/* Same with INT BAR (ESB) */
-	phys_map_get(gcid, PHB4_XIVE_ESB, phb_num, &irq_bar, NULL);
+	phys_map_get(gcid, PHB4_XIVE_ESB, phb_num, &irq_bar, &bar_sz);
+	vm_map_global("PHB IRQ", irq_bar, bar_sz, true, true);
 	xscom_write(gcid, nest_stack + XPEC_NEST_STK_IRQ_BAR, irq_bar << 8);
 	bar_en |= XPEC_NEST_STK_BAR_EN_INT;
 
 
 	/* Same with MMIO windows */
 	phys_map_get(gcid, PHB4_64BIT_MMIO, phb_num, &mmio0_bar, &mmio0_sz);
+	vm_map_global("PHB MMIO0", mmio0_bar, mmio0_sz, true, true);
 	mmio0_bmask =  (~(mmio0_sz - 1)) & 0x00FFFFFFFFFFFFFFULL;
 	xscom_write(gcid, nest_stack + XPEC_NEST_STK_MMIO_BAR0, mmio0_bar << 8);
 	xscom_write(gcid, nest_stack + XPEC_NEST_STK_MMIO_BAR0_MASK, mmio0_bmask << 8);
 
 	phys_map_get(gcid, PHB4_32BIT_MMIO, phb_num, &mmio1_bar, &mmio1_sz);
+	vm_map_global("PHB MMIO1", mmio1_bar, mmio1_sz, true, true);
 	mmio1_bmask =  (~(mmio1_sz - 1)) & 0x00FFFFFFFFFFFFFFULL;
 	xscom_write(gcid, nest_stack + XPEC_NEST_STK_MMIO_BAR1, mmio1_bar << 8);
 	xscom_write(gcid, nest_stack + XPEC_NEST_STK_MMIO_BAR1_MASK, mmio1_bmask << 8);
diff --git a/hw/psi.c b/hw/psi.c
index 63fcb257e..45f11c6b9 100644
--- a/hw/psi.c
+++ b/hw/psi.c
@@ -908,6 +908,8 @@  static bool psi_init_psihb(struct dt_node *psihb)
 
 	list_add(&psis, &psi->list);
 
+	vm_map_global("PSI", (unsigned long)psi->regs, 0x100, true, true);
+
 	val = in_be64(psi->regs + PSIHB_CR);
 	if (val & PSIHB_CR_FSP_LINK_ACTIVE) {
 		lock(&psi_lock);
diff --git a/hw/slw.c b/hw/slw.c
index beb129a86..ccb100087 100644
--- a/hw/slw.c
+++ b/hw/slw.c
@@ -151,7 +151,7 @@  static void slw_patch_reset(void)
 		*(sav++) = *(dst);
 		*(dst++) = *(src++);
 	}
-	sync_icache();
+	sync_icache(0);
 }
 
 static void slw_unpatch_reset(void)
@@ -167,7 +167,7 @@  static void slw_unpatch_reset(void)
 		*(dst++) = *(sav++);
 		src++;
 	}
-	sync_icache();
+	sync_icache(0);
 }
 
 static bool slw_general_init(struct proc_chip *chip, struct cpu_thread *c)
diff --git a/hw/xive.c b/hw/xive.c
index 9a36f1ab2..c6aed7c9f 100644
--- a/hw/xive.c
+++ b/hw/xive.c
@@ -1397,6 +1397,7 @@  static bool xive_configure_bars(struct xive *x)
 
 	/* IC BAR */
 	phys_map_get(chip_id, XIVE_IC, 0, (uint64_t *)&x->ic_base, &x->ic_size);
+	vm_map_global("XIVE IC", (unsigned long)x->ic_base, x->ic_size, true, true);
 	val = (uint64_t)x->ic_base | CQ_IC_BAR_VALID;
 	if (IC_PAGE_SIZE == 0x10000) {
 		val |= CQ_IC_BAR_64K;
@@ -1412,6 +1413,8 @@  static bool xive_configure_bars(struct xive *x)
 	 * all phys_map_get(XIVE_TM) calls.
 	 */
 	phys_map_get(0, XIVE_TM, 0, (uint64_t *)&x->tm_base, &x->tm_size);
+	if (chip_id == 0)
+		vm_map_global("XIVE TM", (unsigned long)x->tm_base, x->tm_size, true, true);
 	val = (uint64_t)x->tm_base | CQ_TM_BAR_VALID;
 	if (TM_PAGE_SIZE == 0x10000) {
 		x->tm_shift = 16;
@@ -1427,6 +1430,7 @@  static bool xive_configure_bars(struct xive *x)
 
 	/* PC BAR. Clear first, write mask, then write value */
 	phys_map_get(chip_id, XIVE_PC, 0, (uint64_t *)&x->pc_base, &x->pc_size);
+	vm_map_global("XIVE PC", (unsigned long)x->pc_base, x->pc_size, true, true);
 	xive_regwx(x, CQ_PC_BAR, 0);
 	if (x->last_reg_error)
 		return false;
@@ -1441,6 +1445,7 @@  static bool xive_configure_bars(struct xive *x)
 
 	/* VC BAR. Clear first, write mask, then write value */
 	phys_map_get(chip_id, XIVE_VC, 0, (uint64_t *)&x->vc_base, &x->vc_size);
+	vm_map_global("XIVE VC", (unsigned long)x->vc_base, x->vc_size, true, true);
 	xive_regwx(x, CQ_VC_BAR, 0);
 	if (x->last_reg_error)
 		return false;
diff --git a/hw/xscom.c b/hw/xscom.c
index 0eda567fc..ef1a83fd4 100644
--- a/hw/xscom.c
+++ b/hw/xscom.c
@@ -931,6 +931,7 @@  void xscom_init(void)
 		const struct dt_property *reg;
 		struct proc_chip *chip;
 		const char *chip_name;
+		u64 size;
 		static const char *chip_names[] = {
 			"UNKNOWN", "P8E", "P8", "P8NVL", "P9N", "P9C", "P9P"
 		};
@@ -945,6 +946,9 @@  void xscom_init(void)
 		assert(reg);
 
 		chip->xscom_base = dt_translate_address(xn, 0, NULL);
+		size = dt_property_get_u64(reg, 1);
+
+		vm_map_global("XSCOM MMIO", chip->xscom_base, size, true, true);
 
 		/* Grab processor type and EC level */
 		xscom_init_chip_info(chip);
diff --git a/include/cmpxchg.h b/include/cmpxchg.h
index 0304e9134..835743cf5 100644
--- a/include/cmpxchg.h
+++ b/include/cmpxchg.h
@@ -5,6 +5,9 @@ 
 #define __CMPXCHG_H
 
 #ifndef __TEST__
+#include <stdint.h>
+#include <processor.h>
+
 /*
  * Bare cmpxchg, no barriers.
  */
diff --git a/include/cpu.h b/include/cpu.h
index 8ef20e35b..026328904 100644
--- a/include/cpu.h
+++ b/include/cpu.h
@@ -12,6 +12,19 @@ 
 #include <stack.h>
 #include <timer.h>
 
+struct vm_map {	/* describes one mapped extent; struct cpu_thread embeds one as its per-cpu map */
+	struct list_node list;	/* list linkage; presumably for the list of global extents -- TODO confirm */
+
+	const char *name;	/* label; presumably the name string given to vm_map_global() -- TODO confirm */
+	uint64_t address;	/* presumably the effective (virtual) address of the extent -- TODO confirm */
+	uint64_t pa;		/* presumably the physical address backing it -- TODO confirm */
+	uint64_t length;	/* extent length; presumably in bytes -- TODO confirm */
+	bool readable;		/* access attribute: reads allowed (as the name suggests) */
+	bool writeable;		/* access attribute: writes allowed (as the name suggests) */
+	bool executable;	/* access attribute: instruction fetch allowed (as the name suggests) */
+	bool ci;		/* presumably "cache inhibited", cf. the ci argument of vm_map_global() -- TODO confirm */
+};
+
 /*
  * cpu_thread is our internal structure representing each
  * thread in the system
@@ -71,10 +84,19 @@  struct cpu_thread {
 	struct bt_entry			stack_bot_bt[CPU_BACKTRACE_SIZE];
 	struct bt_metadata		stack_bot_bt_metadata;
 #endif
+	/*
+	 * Per-thread VM parameters
+	 */
+	struct vm_map			vm_local_map; /* per-cpu map */
+	bool				vm_local_map_inuse;
+	uint8_t				vm_slb_rr; /* RR allocator */
+	bool				vm_setup; /* virtual memory is up */
+
 	struct lock			job_lock;
 	struct list_head		job_queue;
 	uint32_t			job_count;
 	bool				job_has_no_return;
+
 	/*
 	 * Per-core mask tracking for threads in HMI handler and
 	 * a cleanup done bit.
diff --git a/include/elf-abi.h b/include/elf-abi.h
index 29c757642..34b95d337 100644
--- a/include/elf-abi.h
+++ b/include/elf-abi.h
@@ -21,7 +21,16 @@ 
 static inline uint64_t function_entry_address(void *func)
 {
 #ifdef ELF_ABI_v2
-	u32 *insn = func;
+	u32 *ret = func;
+	u32 *i;
+	u32 insn;
+	u32 insn2;
+
+	i = vm_map((unsigned long)func, sizeof(insn)*2, false);
+	insn = *i;
+	insn2 = *(i+1);
+	vm_unmap((unsigned long)func, sizeof(insn)*2);
+
 	/*
 	 * A PPC64 ABIv2 function may have a local and a global entry
 	 * point. We use the local entry point for branch tables called
@@ -38,12 +47,12 @@  static inline uint64_t function_entry_address(void *func)
 	 * lis   r2,XXXX
 	 * addi  r2,r2,XXXX
 	 */
-	if ((((*insn & OP_RT_RA_MASK) == ADDIS_R2_R12) ||
-	     ((*insn & OP_RT_RA_MASK) == LIS_R2)) &&
-	    ((*(insn+1) & OP_RT_RA_MASK) == ADDI_R2_R2))
-		return (uint64_t)(insn + 2);
+	if ((((insn & OP_RT_RA_MASK) == ADDIS_R2_R12) ||
+	     ((insn & OP_RT_RA_MASK) == LIS_R2)) &&
+	    ((insn2 & OP_RT_RA_MASK) == ADDI_R2_R2))
+		return (uint64_t)(ret + 2);
 	else
-		return (uint64_t)func;
+		return (uint64_t)ret;
 #else
 	return *(uint64_t *)func;
 #endif
diff --git a/include/io.h b/include/io.h
index f00021dcd..5c1bd41b4 100644
--- a/include/io.h
+++ b/include/io.h
@@ -7,6 +7,7 @@ 
 #ifndef __ASSEMBLY__
 
 #include <compiler.h>
+#include <skiboot.h>
 #include <stdint.h>
 #include <processor.h>
 #include <types.h>
@@ -23,8 +24,13 @@ 
 static inline uint8_t __in_8(const volatile uint8_t *addr)
 {
 	uint8_t val;
-	asm volatile("lbzcix %0,0,%1" :
-		     "=r"(val) : "r"(addr), "m"(*addr) : "memory");
+
+	if (vm_realmode()) /* raw 8-bit I/O read: explicit lbzcix while vm_realmode() is true */
+		asm volatile("lbzcix %0,0,%1" :
+		     "=r"(val) : "r"(addr), "m"(*addr)); /* NOTE(review): the old code also had a "memory" clobber -- confirm dropping it is intended */
+	else /* else a plain volatile load; presumably the VM mapping supplies cache-inhibit -- TODO confirm */
+		val = *addr;
+
 	return val;
 }
 
@@ -37,8 +43,13 @@  static inline uint8_t in_8(const volatile uint8_t *addr)
 static inline uint16_t __in_be16(const volatile beint16_t *addr)
 {
 	__be16 val;
-	asm volatile("lhzcix %0,0,%1" :
-		     "=r"(val) : "r"(addr), "m"(*addr) : "memory");
+
+	if (vm_realmode()) /* raw 16-bit big-endian I/O read: explicit lhzcix while vm_realmode() is true */
+		asm volatile("lhzcix %0,0,%1" :
+		     "=r"(val) : "r"(addr), "m"(*addr)); /* NOTE(review): the old code also had a "memory" clobber -- confirm dropping it is intended */
+	else /* else a plain volatile load; presumably the VM mapping supplies cache-inhibit -- TODO confirm */
+		val = *addr;
+
 	return be16_to_cpu(val);
 }
 
@@ -51,8 +62,13 @@  static inline uint16_t in_be16(const volatile beint16_t *addr)
 static inline uint16_t __in_le16(const volatile leint16_t *addr)
 {
 	__le16 val;
-	asm volatile("lhzcix %0,0,%1" :
-		     "=r"(val) : "r"(addr), "m"(*addr) : "memory");
+
+	if (vm_realmode()) /* raw 16-bit little-endian I/O read: explicit lhzcix while vm_realmode() is true */
+		asm volatile("lhzcix %0,0,%1" :
+		     "=r"(val) : "r"(addr), "m"(*addr)); /* NOTE(review): the old code also had a "memory" clobber -- confirm dropping it is intended */
+	else /* else a plain volatile load; presumably the VM mapping supplies cache-inhibit -- TODO confirm */
+		val = *addr;
+
 	return le16_to_cpu(val);
 }
 
@@ -65,8 +81,13 @@  static inline uint16_t in_le16(const volatile leint16_t *addr)
 static inline uint32_t __in_be32(const volatile beint32_t *addr)
 {
 	__be32 val;
-	asm volatile("lwzcix %0,0,%1" :
-		     "=r"(val) : "r"(addr), "m"(*addr) : "memory");
+
+	if (vm_realmode()) /* raw 32-bit big-endian I/O read: explicit lwzcix while vm_realmode() is true */
+		asm volatile("lwzcix %0,0,%1" :
+		     "=r"(val) : "r"(addr), "m"(*addr)); /* NOTE(review): the old code also had a "memory" clobber -- confirm dropping it is intended */
+	else /* else a plain volatile load; presumably the VM mapping supplies cache-inhibit -- TODO confirm */
+		val = *addr;
+
 	return be32_to_cpu(val);
 }
 
@@ -79,8 +100,13 @@  static inline uint32_t in_be32(const volatile beint32_t *addr)
 static inline uint32_t __in_le32(const volatile leint32_t *addr)
 {
 	__le32 val;
-	asm volatile("lwzcix %0,0,%1" :
-		     "=r"(val) : "r"(addr), "m"(*addr) : "memory");
+
+	if (vm_realmode()) /* raw 32-bit little-endian I/O read: explicit lwzcix while vm_realmode() is true */
+		asm volatile("lwzcix %0,0,%1" :
+		     "=r"(val) : "r"(addr), "m"(*addr)); /* NOTE(review): the old code also had a "memory" clobber -- confirm dropping it is intended */
+	else /* else a plain volatile load; presumably the VM mapping supplies cache-inhibit -- TODO confirm */
+		val = *addr;
+
 	return le32_to_cpu(val);
 }
 
@@ -93,8 +119,13 @@  static inline uint32_t in_le32(const volatile leint32_t *addr)
 static inline uint64_t __in_be64(const volatile beint64_t *addr)
 {
 	__be64 val;
-	asm volatile("ldcix %0,0,%1" :
-		     "=r"(val) : "r"(addr), "m"(*addr) : "memory");
+
+	if (vm_realmode()) /* raw 64-bit big-endian I/O read: explicit ldcix while vm_realmode() is true */
+		asm volatile("ldcix %0,0,%1" :
+		     "=r"(val) : "r"(addr), "m"(*addr)); /* NOTE(review): the old code also had a "memory" clobber -- confirm dropping it is intended */
+	else /* else a plain volatile load; presumably the VM mapping supplies cache-inhibit -- TODO confirm */
+		val = *addr;
+
 	return be64_to_cpu(val);
 }
 
@@ -107,8 +138,13 @@  static inline uint64_t in_be64(const volatile beint64_t *addr)
 static inline uint64_t __in_le64(const volatile leint64_t *addr)
 {
 	__le64 val;
-	asm volatile("ldcix %0,0,%1" :
-		     "=r"(val) : "r"(addr), "m"(*addr) : "memory");
+
+	if (vm_realmode()) /* raw 64-bit little-endian I/O read: explicit ldcix while vm_realmode() is true */
+		asm volatile("ldcix %0,0,%1" :
+		     "=r"(val) : "r"(addr), "m"(*addr)); /* NOTE(review): the old code also had a "memory" clobber -- confirm dropping it is intended */
+	else /* else a plain volatile load; presumably the VM mapping supplies cache-inhibit -- TODO confirm */
+		val = *addr;
+
 	return le64_to_cpu(val);
 }
 
@@ -120,8 +156,11 @@  static inline uint64_t in_le64(const volatile leint64_t *addr)
 
 static inline void __out_8(volatile uint8_t *addr, uint8_t val)
 {
-	asm volatile("stbcix %0,0,%1"
-		     : : "r"(val), "r"(addr), "m"(*addr) : "memory");
+	if (vm_realmode()) /* raw 8-bit I/O write: explicit stbcix while vm_realmode() is true */
+		asm volatile("stbcix %0,0,%1"
+		     : : "r"(val), "r"(addr), "m"(*addr) : "memory"); /* clobber kept: the asm stores to *addr, which is only listed as an input */
+	else /* else a plain volatile store; presumably the VM mapping supplies cache-inhibit -- TODO confirm */
+		*addr = val;
 }
 
 static inline void out_8(volatile uint8_t *addr, uint8_t val)
@@ -132,8 +171,12 @@  static inline void out_8(volatile uint8_t *addr, uint8_t val)
 
 static inline void __out_be16(volatile beint16_t *addr, uint16_t val)
 {
-	asm volatile("sthcix %0,0,%1"
-		     : : "r"(cpu_to_be16(val)), "r"(addr), "m"(*addr) : "memory");
+	__be16 __val = cpu_to_be16(val); /* convert once so both paths store the same big-endian value */
+	if (vm_realmode()) /* raw 16-bit I/O write: explicit sthcix while vm_realmode() is true */
+		asm volatile("sthcix %0,0,%1"
+		     : : "r"(__val), "r"(addr), "m"(*addr) : "memory"); /* clobber kept: the asm stores to *addr, which is only listed as an input */
+	else /* else a plain volatile store; presumably the VM mapping supplies cache-inhibit -- TODO confirm */
+		*addr = __val;
 }
 
 static inline void out_be16(volatile beint16_t *addr, uint16_t val)
@@ -144,8 +187,12 @@  static inline void out_be16(volatile beint16_t *addr, uint16_t val)
 
 static inline void __out_le16(volatile leint16_t *addr, uint16_t val)
 {
-	asm volatile("sthcix %0,0,%1"
-		     : : "r"(cpu_to_le16(val)), "r"(addr), "m"(*addr) : "memory");
+	__le16 __val = cpu_to_le16(val); /* convert once so both paths store the same little-endian value */
+	if (vm_realmode()) /* raw 16-bit I/O write: explicit sthcix while vm_realmode() is true */
+		asm volatile("sthcix %0,0,%1"
+		     : : "r"(__val), "r"(addr), "m"(*addr) : "memory"); /* clobber kept: the asm stores to *addr, which is only listed as an input */
+	else /* else a plain volatile store; presumably the VM mapping supplies cache-inhibit -- TODO confirm */
+		*addr = __val;
 }
 
 static inline void out_le16(volatile leint16_t *addr, uint16_t val)
@@ -156,8 +203,12 @@  static inline void out_le16(volatile leint16_t *addr, uint16_t val)
 
 static inline void __out_be32(volatile beint32_t *addr, uint32_t val)
 {
-	asm volatile("stwcix %0,0,%1"
-		     : : "r"(cpu_to_be32(val)), "r"(addr), "m"(*addr) : "memory");
+	__be32 __val = cpu_to_be32(val); /* convert once so both paths store the same big-endian value */
+	if (vm_realmode()) /* raw 32-bit I/O write: explicit stwcix while vm_realmode() is true */
+		asm volatile("stwcix %0,0,%1"
+		     : : "r"(__val), "r"(addr), "m"(*addr) : "memory"); /* clobber kept: the asm stores to *addr, which is only listed as an input */
+	else /* else a plain volatile store; presumably the VM mapping supplies cache-inhibit -- TODO confirm */
+		*addr = __val;
 }
 
 static inline void out_be32(volatile beint32_t *addr, uint32_t val)
@@ -168,8 +219,12 @@  static inline void out_be32(volatile beint32_t *addr, uint32_t val)
 
 static inline void __out_le32(volatile leint32_t *addr, uint32_t val)
 {
-	asm volatile("stwcix %0,0,%1"
-		     : : "r"(cpu_to_le32(val)), "r"(addr), "m"(*addr) : "memory");
+	__le32 __val = cpu_to_le32(val); /* convert once so both paths store the same little-endian value */
+	if (vm_realmode()) /* raw 32-bit I/O write: explicit stwcix while vm_realmode() is true */
+		asm volatile("stwcix %0,0,%1"
+		     : : "r"(__val), "r"(addr), "m"(*addr) : "memory"); /* clobber kept: the asm stores to *addr, which is only listed as an input */
+	else /* else a plain volatile store; presumably the VM mapping supplies cache-inhibit -- TODO confirm */
+		*addr = __val;
 }
 
 static inline void out_le32(volatile leint32_t *addr, uint32_t val)
@@ -180,8 +235,12 @@  static inline void out_le32(volatile leint32_t *addr, uint32_t val)
 
 static inline void __out_be64(volatile beint64_t *addr, uint64_t val)
 {
-	asm volatile("stdcix %0,0,%1"
-		     : : "r"(cpu_to_be64(val)), "r"(addr), "m"(*addr) : "memory");
+	__be64 __val = cpu_to_be64(val); /* convert once so both paths store the same big-endian value */
+	if (vm_realmode()) /* raw 64-bit I/O write: explicit stdcix while vm_realmode() is true */
+		asm volatile("stdcix %0,0,%1"
+		     : : "r"(__val), "r"(addr), "m"(*addr) : "memory"); /* clobber kept: the asm stores to *addr, which is only listed as an input */
+	else /* else a plain volatile store; presumably the VM mapping supplies cache-inhibit -- TODO confirm */
+		*addr = __val;
 }
 
 static inline void out_be64(volatile beint64_t *addr, uint64_t val)
@@ -192,8 +251,12 @@  static inline void out_be64(volatile beint64_t *addr, uint64_t val)
 
 static inline void __out_le64(volatile leint64_t *addr, uint64_t val)
 {
-	asm volatile("stdcix %0,0,%1"
-		     : : "r"(cpu_to_le64(val)), "r"(addr), "m"(*addr) : "memory");
+	__le64 __val = cpu_to_le64(val); /* convert once so both paths store the same little-endian value */
+	if (vm_realmode()) /* raw 64-bit I/O write: explicit stdcix while vm_realmode() is true */
+		asm volatile("stdcix %0,0,%1"
+		     : : "r"(__val), "r"(addr), "m"(*addr) : "memory"); /* clobber kept: the asm stores to *addr, which is only listed as an input */
+	else /* else a plain volatile store; presumably the VM mapping supplies cache-inhibit -- TODO confirm */
+		*addr = __val;
 }
 
 static inline void out_le64(volatile leint64_t *addr, uint64_t val)
diff --git a/include/mem_region.h b/include/mem_region.h
index 3e3818a66..47c3bd70c 100644
--- a/include/mem_region.h
+++ b/include/mem_region.h
@@ -33,6 +33,7 @@  struct mem_region {
 	struct list_node list;
 	const char *name;
 	uint64_t start, len;
+	uint64_t vm_mapped_len;
 	struct dt_node *node;
 	enum mem_region_type type;
 	struct list_head free_list;
diff --git a/include/platform.h b/include/platform.h
index 6aa263ae0..e431a5fe0 100644
--- a/include/platform.h
+++ b/include/platform.h
@@ -298,8 +298,8 @@  struct platform {
 	void (*vpd_iohub_load)(struct dt_node *hub_node);
 };
 
-extern struct platform __platforms_start;
-extern struct platform __platforms_end;
+extern struct platform __platforms_start[];
+extern struct platform __platforms_end[];
 
 extern struct platform	platform;
 extern const struct bmc_platform *bmc_platform;
diff --git a/include/processor.h b/include/processor.h
index 7ba251bb4..9d197ffc1 100644
--- a/include/processor.h
+++ b/include/processor.h
@@ -39,7 +39,9 @@ 
 #define SPR_SRR1	0x01b	/* RW: Exception save/restore reg 1 */
 #define SPR_CFAR	0x01c	/* RW: Come From Address Register */
 #define SPR_AMR		0x01d	/* RW: Authority Mask Register */
+#define SPR_PID		0x030	/* RW: PID register */
 #define SPR_IAMR	0x03d	/* RW: Instruction Authority Mask Register */
+#define SPR_UAMOR	0x09d
 #define SPR_RPR		0x0ba   /* RW: Relative Priority Register */
 #define SPR_TBRL	0x10c	/* RO: Timebase low */
 #define SPR_TBRU	0x10d	/* RO: Timebase high */
@@ -61,10 +63,12 @@ 
 #define SPR_HSRR1	0x13b	/* RW: HV Exception save/restore reg 1 */
 #define SPR_TFMR	0x13d
 #define SPR_LPCR	0x13e
+#define SPR_LPID	0x13f	/* RW: LPID register */
 #define SPR_HMER	0x150	/* Hypervisor Maintenance Exception */
 #define SPR_HMEER	0x151	/* HMER interrupt enable mask */
 #define SPR_PCR		0x152
 #define SPR_AMOR	0x15d
+#define SPR_PTCR	0x1d0	/* RW: Partition table control register */
 #define SPR_PSSCR	0x357   /* RW: Stop status and control (ISA 3) */
 #define SPR_TSCR	0x399
 #define SPR_HID0	0x3f0
@@ -80,6 +84,11 @@ 
 #define SPR_SRR1_PM_WAKE_SRESET	0x100000
 #define SPR_SRR1_PM_WAKE_MCE	0x3c0000	/* Use reserved value for MCE */
 
+/* Bits in DSISR */
+
+#define	DSISR_ISSTORE		0x02000000
+
+
 /* Bits in LPCR */
 
 /* Powersave Exit Cause Enable is different on each generation */
@@ -318,9 +327,9 @@  static inline void isync(void)
 /*
  * Cache sync
  */
-static inline void sync_icache(void)
+static inline void sync_icache(unsigned long ptr) /* ptr: value handed to icbi as its address operand */
 {
-	asm volatile("sync; icbi 0,%0; sync; isync" : : "r" (0) : "memory");
+	asm volatile("sync; icbi 0,%0; sync; isync" : : "r" (ptr) : "memory"); /* the old code always passed 0 here */
 }
 
 /*
diff --git a/include/skiboot.h b/include/skiboot.h
index 30ff500c5..aacb425f7 100644
--- a/include/skiboot.h
+++ b/include/skiboot.h
@@ -42,10 +42,16 @@  extern char _stext[];
 extern char _etext[];
 extern char __sym_map_end[];
 extern char _romem_end[];
+extern char __vm_mapped_romem_end[];
 
 #ifndef __TESTING__
+extern char _stext[], _etext[];
 /* Readonly section start and end. */
 extern char __rodata_start[], __rodata_end[];
+extern char _sdata[], _edata[];
+extern char __sym_map_start[], __sym_map_end[];
+extern char _sbss[], _ebss[];
+extern char _end[];
 
 static inline bool is_rodata(const void *p)
 {
@@ -184,6 +190,7 @@  extern void disable_fast_reboot(const char *reason);
 extern void add_fast_reboot_dt_entries(void);
 extern void fast_reboot(void);
 extern void __noreturn __secondary_cpu_entry(void);
+extern void __noreturn __return_cpu_entry(void);
 extern void __noreturn load_and_boot_kernel(bool is_reboot);
 extern void cleanup_local_tlb(void);
 extern void cleanup_global_tlb(void);
@@ -336,4 +343,24 @@  extern int fake_nvram_info(uint32_t *total_size);
 extern int fake_nvram_start_read(void *dst, uint32_t src, uint32_t len);
 extern int fake_nvram_write(uint32_t offset, void *src, uint32_t size);
 
+/* core/vm.c */
+bool vm_realmode(void);
+void vm_map_global(const char *name, unsigned long addr, unsigned long len, bool rw, bool ci);
+void vm_map_global_text(const char *name, unsigned long addr, unsigned long len);
+void vm_unmap_global(unsigned long addr, unsigned long len);
+void *vm_map(unsigned long addr, unsigned long len, bool rw);
+void vm_unmap(unsigned long addr, unsigned long len);
+void vm_init(bool fast_reboot);
+void vm_init_stacks(void);
+void vm_destroy(void);
+void vm_init_secondary(void);
+void vm_enter(void);
+void vm_exit(void);
+void vm_exit_cleanup(void);
+void vm_map_stacks(void);
+bool vm_dslb(uint64_t nia, uint64_t dar);
+bool vm_islb(uint64_t nia);
+bool vm_dsi(uint64_t nia, uint64_t dar, uint32_t dsisr);
+bool vm_isi(uint64_t nia);
+
 #endif /* __SKIBOOT_H */
diff --git a/libstb/container.c b/libstb/container.c
index eca54cf63..2b8f22f70 100644
--- a/libstb/container.c
+++ b/libstb/container.c
@@ -6,14 +6,20 @@ 
 
 bool stb_is_container(const void *buf, size_t size)
 {
+	beint32_t *t; /* the magic word as returned by the temporary vm_map() below */
 	ROM_container_raw *c;
+	bool ret = true; /* stays true unless the magic number mismatches */
 
 	c = (ROM_container_raw*) buf;
 	if (!buf || size < SECURE_BOOT_HEADERS_SIZE)
 		return false;
-	if (be32_to_cpu(c->magic_number) != ROM_MAGIC_NUMBER )
-		return false;
-	return true;
+
+	t = vm_map((unsigned long)&c->magic_number, sizeof(*t), false); /* rw=false: only read through it */
+	if (be32_to_cpu(*t) != ROM_MAGIC_NUMBER)
+		ret = false;
+	vm_unmap((unsigned long)&c->magic_number, sizeof(*t)); /* unmap on both outcomes before returning */
+
+	return ret;
 }
 
 uint32_t stb_payload_magic(const void *buf, size_t size)
diff --git a/libstb/cvc.c b/libstb/cvc.c
index 663e53953..08b2eea60 100644
--- a/libstb/cvc.c
+++ b/libstb/cvc.c
@@ -155,6 +155,9 @@  static int cvc_reserved_mem_init(struct dt_node *parent) {
 		return -1;
 	}
 	addr = dt_get_address(cvc_resv_mem, 0, &size);
+	if (size == 0) // MAMBO HACK
+		size = 64*1024;
+	vm_map_global_text("STB-CVC", addr, size);
 	cvc_register(addr, addr + size-1);
 
 	exports = dt_find_by_path(dt_root, "/ibm,opal/firmware/exports");
diff --git a/libstb/secureboot.c b/libstb/secureboot.c
index c86972161..dc3bda3d2 100644
--- a/libstb/secureboot.c
+++ b/libstb/secureboot.c
@@ -164,6 +164,7 @@  int secureboot_verify(enum resource_id id, void *buf, size_t len)
 {
 	const char *name;
 	__be64 log;
+	void *vbuf;
 	int rc = -1;
 
 	name = flash_map_resource_name(id);
@@ -181,7 +182,9 @@  int secureboot_verify(enum resource_id id, void *buf, size_t len)
 		return -1;
         }
 
-	rc = call_cvc_verify(buf, len, hw_key_hash, hw_key_hash_size, &log);
+	vbuf = vm_map((unsigned long)buf, len, false);
+	rc = call_cvc_verify(vbuf, len, hw_key_hash, hw_key_hash_size, &log);
+	vm_unmap((unsigned long)buf, len);
 
 	if (rc == OPAL_SUCCESS) {
 		prlog(PR_NOTICE, "%s verified\n", name);
diff --git a/libstb/trustedboot.c b/libstb/trustedboot.c
index 413862e63..910354f7b 100644
--- a/libstb/trustedboot.c
+++ b/libstb/trustedboot.c
@@ -161,7 +161,7 @@  out_free:
 int trustedboot_measure(enum resource_id id, void *buf, size_t len)
 {
 	uint8_t digest[SHA512_DIGEST_LENGTH];
-	void *buf_aux;
+	void *buf_aux, *vbuf;
 	size_t len_aux;
 	const char *name;
 	TPM_Pcr pcr;
@@ -219,7 +219,9 @@  int trustedboot_measure(enum resource_id id, void *buf, size_t len)
 		len_aux = len;
 	}
 
-	rc = call_cvc_sha512(buf_aux, len_aux, digest, SHA512_DIGEST_LENGTH);
+	vbuf = vm_map((unsigned long)buf_aux, len_aux, false);
+	rc = call_cvc_sha512(vbuf, len_aux, digest, SHA512_DIGEST_LENGTH);
+	vm_unmap((unsigned long)buf_aux, len_aux);
 
 	if (rc == OPAL_SUCCESS) {
 		prlog(PR_NOTICE, "%s hash calculated\n", name);
diff --git a/skiboot.lds.S b/skiboot.lds.S
index b136e4004..9d21681ab 100644
--- a/skiboot.lds.S
+++ b/skiboot.lds.S
@@ -123,12 +123,26 @@  SECTIONS
 		__rodata_end = .;
 	}
 
+	. = ALIGN(0x100);
+	.got : {
+		__toc_start = . + 0x8000;
+		*(.got)
+		*(.toc)
+	}
+
+	. = ALIGN(0x10);
+	.opd : {
+		*(.opd)
+	}
+
 	. = ALIGN(0x10);
 	.trap_table : {
 		__trap_table_start = .;
 		KEEP(*(.trap_table))
 		__trap_table_end = .;
 	}
+	__vm_mapped_romem_end = .;
+	. = ALIGN(PAGE_SIZE);
 
 	. = ALIGN(0x10);
 	.init : {
@@ -139,18 +153,6 @@  SECTIONS
 		__ctors_end = .;
 	}
 
-	. = ALIGN(0x10);
-	.opd : {
-		*(.opd)
-	}
-  
-	. = ALIGN(0x100);
-	.got : {
-		__toc_start = . + 0x8000;
-		*(.got)
-		*(.toc)
-	}
-
 	. = ALIGN(0x10);
 	.opal_table : {
 		__opal_table_start = .;