[RFC] virtual memory for OPAL boot

Message ID 20180827021558.8699-1-npiggin@gmail.com
State New
Headers show
Series
  • [RFC] virtual memory for OPAL boot
Related show

Checks

Context Check Description
snowpatch_ozlabs/make_check success Test make_check on branch master
snowpatch_ozlabs/apply_patch success master/apply_patch Successfully applied

Commit Message

Nicholas Piggin Aug. 27, 2018, 2:15 a.m.
I tried hacking on this a bit more. This turns on HPT virtual memory
quite early in boot. There is a global EA=RA map for "global" mappings
which are things that are always mapped and shared, like text and heap.
Then there are transient per-CPU mappings that use their own private
addresses for temporary mappings of things that are accessed carefully
(e.g., like the 0 page interrupt vectors).

VM gets shut down right before the kernel is booted.

This rearranges skiboot.lds.S a bit to put the most similar regions
together as possible, which makes it easier to map things with specific
protections. Everything but text is no-execute, rodata is read only, etc.

Not too sure where I'm going with this. I think it's good to minimise
the amount of time spent in real mode in general to catch bugs. Maybe
this is unintrusive enough to be worthwhile. But this is only boot, I
would like to get to a point where OPAL services run mostly in virtual
mode too, but that would look much different and probably require VM
provided by the OS.

Anyway this "works" (in mambo), it's fairly unintrusive, most code
changes are just juggling link locations around.
---
 asm/head.S           | 106 +--------
 asm/misc.S           | 110 +++++++++
 core/Makefile.inc    |   2 +-
 core/cpu.c           |   4 +
 core/init.c          |  60 +++--
 core/mem_region.c    |  64 +++--
 core/opal.c          |   5 +-
 core/stack.c         |   2 -
 core/vm.c            | 538 +++++++++++++++++++++++++++++++++++++++++++
 hw/fake-nvram.c      |  12 +-
 hw/slw.c             |   4 +-
 include/cpu.h        |   8 +
 include/mem_region.h |   1 +
 include/processor.h  |   7 +-
 include/skiboot.h    |  19 ++
 libstb/container.c   |  12 +-
 skiboot.lds.S        |  94 ++++----
 17 files changed, 853 insertions(+), 195 deletions(-)
 create mode 100644 core/vm.c

Comments

Oliver Aug. 27, 2018, 6:16 a.m. | #1
On Mon, Aug 27, 2018 at 12:15 PM, Nicholas Piggin <npiggin@gmail.com> wrote:
> I tried hacking on this a bit more. This turns on HPT virtual memory
> quite early in boot. There is a global EA=RA map for "global" mappings
> which are things that are always mapped and shared, like text and heap.
> Then there are transient per-CPU mappings that use their own private
> addresses for temporary mappings of things that are accessed carefully
> (e.g., like the 0 page interrupt vectors).

cool

> VM gets shut down right before the kernel is booted.
>
> This rearranges skiboot.lds.S a bit to put the most similar regions
> together as possible, which makes it easier to map things with specific
> protections. Everything but text is no-execute, rodata is read only, etc.
>
> Not too sure where I'm going with this. I think it's good to minimise
> the amount of time spent in real mode in general to catch bugs. Maybe
> this is unintrusive enough to be worthwhile. But this is only boot, I
> would like to get to a point where OPAL services run mostly in virtual
> mode too, but that would look much different and probably require VM
> provided by the OS.
>
> Anyway this "works" (in mambo), it's fairly unintrusive, most code
> changes are just juggling link locations around.

We'll need to have a think about how we're going to deal with I/O if
we want to do this on real hardware, or even on mambo before
xscom_init() is called. Currently we use the explicit cache inhibited
load/store instructions for accessing MMIO regions in skiboot and
those are only available in hypervisor real mode. So we'll probably
need some kind of instruction patching mechanism if we want to boot in
virtual mode and switch to real mode at runtime.

Alternatively we could leave them as-is and emulate them at boot time.
It might be a bit slow, but MMIOs aren't exactly fast to begin with.

> ---
>  asm/head.S           | 106 +--------
>  asm/misc.S           | 110 +++++++++
>  core/Makefile.inc    |   2 +-
>  core/cpu.c           |   4 +
>  core/init.c          |  60 +++--
>  core/mem_region.c    |  64 +++--
>  core/opal.c          |   5 +-
>  core/stack.c         |   2 -
>  core/vm.c            | 538 +++++++++++++++++++++++++++++++++++++++++++
>  hw/fake-nvram.c      |  12 +-
>  hw/slw.c             |   4 +-
>  include/cpu.h        |   8 +
>  include/mem_region.h |   1 +
>  include/processor.h  |   7 +-
>  include/skiboot.h    |  19 ++
>  libstb/container.c   |  12 +-
>  skiboot.lds.S        |  94 ++++----
>  17 files changed, 853 insertions(+), 195 deletions(-)
>  create mode 100644 core/vm.c
>
> diff --git a/asm/head.S b/asm/head.S
> index 803fbf1a..729cfe87 100644
> --- a/asm/head.S
> +++ b/asm/head.S
> @@ -23,13 +23,6 @@
>
>  #define EPAPR_MAGIC    0x65504150
>
> -/* Power management instructions */
> -#define PPC_INST_NAP           .long 0x4c000364
> -#define PPC_INST_SLEEP         .long 0x4c0003a4
> -#define PPC_INST_RVWINKLE      .long 0x4c0003e4
> -
> -#define PPC_INST_STOP          .long 0x4c0002e4
> -
>  #define GET_STACK(stack_reg,pir_reg)                                   \
>         sldi    stack_reg,pir_reg,STACK_SHIFT;                          \
>         addis   stack_reg,stack_reg,CPU_STACKS_OFFSET@ha;               \
> @@ -520,104 +513,6 @@ disable_machine_check:
>  1:     mtlr    %r0
>         blr
>
> -pm_save_regs:
> -       SAVE_GPR(2,%r1)
> -       SAVE_GPR(14,%r1)
> -       SAVE_GPR(15,%r1)
> -       SAVE_GPR(16,%r1)
> -       SAVE_GPR(17,%r1)
> -       SAVE_GPR(18,%r1)
> -       SAVE_GPR(19,%r1)
> -       SAVE_GPR(20,%r1)
> -       SAVE_GPR(21,%r1)
> -       SAVE_GPR(22,%r1)
> -       SAVE_GPR(23,%r1)
> -       SAVE_GPR(24,%r1)
> -       SAVE_GPR(25,%r1)
> -       SAVE_GPR(26,%r1)
> -       SAVE_GPR(27,%r1)
> -       SAVE_GPR(28,%r1)
> -       SAVE_GPR(29,%r1)
> -       SAVE_GPR(30,%r1)
> -       SAVE_GPR(31,%r1)
> -       mfcr    %r4
> -       mfxer   %r5
> -       mfspr   %r6,SPR_HSPRG0
> -       mfspr   %r7,SPR_HSPRG1
> -       stw     %r4,STACK_CR(%r1)
> -       stw     %r5,STACK_XER(%r1)
> -       std     %r6,STACK_GPR0(%r1)
> -       std     %r7,STACK_GPR1(%r1)
> -       blr
> -
> -.global enter_p8_pm_state
> -enter_p8_pm_state:
> -       /* Before entering map or rvwinkle, we create a stack frame
> -        * and save our non-volatile registers.
> -        *
> -        * We also save these SPRs:
> -        *
> -        *  - HSPRG0    in GPR0 slot
> -        *  - HSPRG1    in GPR1 slot
> -        *
> -        *  - xxx TODO: HIDs
> -        *  - TODO: Mask MSR:ME during the process
> -        *
> -        * On entry, r3 indicates:
> -        *
> -        *    0 = nap
> -        *    1 = rvwinkle
> -        */
> -       mflr    %r0
> -       std     %r0,16(%r1)
> -       stdu    %r1,-STACK_FRAMESIZE(%r1)
> -
> -       bl      pm_save_regs
> -
> -       /* Save stack pointer in struct cpu_thread */
> -       std     %r1,CPUTHREAD_SAVE_R1(%r13)
> -
> -       /* Winkle or nap ? */
> -       cmpli   %cr0,0,%r3,0
> -       bne     1f
> -
> -       /* nap sequence */
> -       ptesync
> -0:     ld      %r0,CPUTHREAD_SAVE_R1(%r13)
> -       cmpd    cr0,%r0,%r0
> -       bne     0b
> -       PPC_INST_NAP
> -       b       .
> -
> -       /* rvwinkle sequence */
> -1:     ptesync
> -0:     ld      %r0,CPUTHREAD_SAVE_R1(%r13)
> -       cmpd    cr0,%r0,%r0
> -       bne     0b
> -       PPC_INST_RVWINKLE
> -       b       .
> -
> -.global enter_p9_pm_lite_state
> -enter_p9_pm_lite_state:
> -       mtspr   SPR_PSSCR,%r3
> -       PPC_INST_STOP
> -       blr
> -
> -.global enter_p9_pm_state
> -enter_p9_pm_state:
> -       mflr    %r0
> -       std     %r0,16(%r1)
> -       stdu    %r1,-STACK_FRAMESIZE(%r1)
> -
> -       bl      pm_save_regs
> -
> -       /* Save stack pointer in struct cpu_thread */
> -       std     %r1,CPUTHREAD_SAVE_R1(%r13)
> -
> -       mtspr   SPR_PSSCR,%r3
> -       PPC_INST_STOP
> -       b       .
> -
>  /* This is a little piece of code that is copied down to
>   * 0x100 for handling power management wakeups
>   */
> @@ -633,6 +528,7 @@ reset_patch_start:
>  .global reset_patch_end
>  reset_patch_end:
>
> +.global reset_wakeup
>  reset_wakeup:
>         /* Get PIR */
>         mfspr   %r31,SPR_PIR
> diff --git a/asm/misc.S b/asm/misc.S
> index 381590b9..916acf9c 100644
> --- a/asm/misc.S
> +++ b/asm/misc.S
> @@ -123,3 +123,113 @@ cleanup_global_tlb:
>         ptesync
>
>         blr
> +
> +
> +/* Power management instructions */
> +#define PPC_INST_NAP           .long 0x4c000364
> +#define PPC_INST_SLEEP         .long 0x4c0003a4
> +#define PPC_INST_RVWINKLE      .long 0x4c0003e4
> +
> +#define PPC_INST_STOP          .long 0x4c0002e4
> +
> +#define SAVE_GPR(reg,sp)       std %r##reg,STACK_GPR##reg(sp)
> +#define REST_GPR(reg,sp)       ld %r##reg,STACK_GPR##reg(sp)
> +
> +pm_save_regs:
> +       SAVE_GPR(2,%r1)
> +       SAVE_GPR(14,%r1)
> +       SAVE_GPR(15,%r1)
> +       SAVE_GPR(16,%r1)
> +       SAVE_GPR(17,%r1)
> +       SAVE_GPR(18,%r1)
> +       SAVE_GPR(19,%r1)
> +       SAVE_GPR(20,%r1)
> +       SAVE_GPR(21,%r1)
> +       SAVE_GPR(22,%r1)
> +       SAVE_GPR(23,%r1)
> +       SAVE_GPR(24,%r1)
> +       SAVE_GPR(25,%r1)
> +       SAVE_GPR(26,%r1)
> +       SAVE_GPR(27,%r1)
> +       SAVE_GPR(28,%r1)
> +       SAVE_GPR(29,%r1)
> +       SAVE_GPR(30,%r1)
> +       SAVE_GPR(31,%r1)
> +       mfcr    %r4
> +       mfxer   %r5
> +       mfspr   %r6,SPR_HSPRG0
> +       mfspr   %r7,SPR_HSPRG1
> +       stw     %r4,STACK_CR(%r1)
> +       stw     %r5,STACK_XER(%r1)
> +       std     %r6,STACK_GPR0(%r1)
> +       std     %r7,STACK_GPR1(%r1)
> +       blr
> +
> +.global enter_p8_pm_state
> +enter_p8_pm_state:
> +       /* Before entering map or rvwinkle, we create a stack frame
> +        * and save our non-volatile registers.
> +        *
> +        * We also save these SPRs:
> +        *
> +        *  - HSPRG0    in GPR0 slot
> +        *  - HSPRG1    in GPR1 slot
> +        *
> +        *  - xxx TODO: HIDs
> +        *  - TODO: Mask MSR:ME during the process
> +        *
> +        * On entry, r3 indicates:
> +        *
> +        *    0 = nap
> +        *    1 = rvwinkle
> +        */
> +       mflr    %r0
> +       std     %r0,16(%r1)
> +       stdu    %r1,-STACK_FRAMESIZE(%r1)
> +
> +       bl      pm_save_regs
> +
> +       /* Save stack pointer in struct cpu_thread */
> +       std     %r1,CPUTHREAD_SAVE_R1(%r13)
> +
> +       /* Winkle or nap ? */
> +       cmpli   %cr0,0,%r3,0
> +       bne     1f
> +
> +       /* nap sequence */
> +       ptesync
> +0:     ld      %r0,CPUTHREAD_SAVE_R1(%r13)
> +       cmpd    cr0,%r0,%r0
> +       bne     0b
> +       PPC_INST_NAP
> +       b       .
> +
> +       /* rvwinkle sequence */
> +1:     ptesync
> +0:     ld      %r0,CPUTHREAD_SAVE_R1(%r13)
> +       cmpd    cr0,%r0,%r0
> +       bne     0b
> +       PPC_INST_RVWINKLE
> +       b       .
> +
> +.global enter_p9_pm_lite_state
> +enter_p9_pm_lite_state:
> +       mtspr   SPR_PSSCR,%r3
> +       PPC_INST_STOP
> +       blr
> +
> +.global enter_p9_pm_state
> +enter_p9_pm_state:
> +       mflr    %r0
> +       std     %r0,16(%r1)
> +       stdu    %r1,-STACK_FRAMESIZE(%r1)
> +
> +       bl      pm_save_regs
> +
> +       /* Save stack pointer in struct cpu_thread */
> +       std     %r1,CPUTHREAD_SAVE_R1(%r13)
> +
> +       mtspr   SPR_PSSCR,%r3
> +       PPC_INST_STOP
> +       b       .
> +
> diff --git a/core/Makefile.inc b/core/Makefile.inc
> index d3635059..e057f479 100644
> --- a/core/Makefile.inc
> +++ b/core/Makefile.inc
> @@ -1,7 +1,7 @@
>  # -*-Makefile-*-
>
>  SUBDIRS += core
> -CORE_OBJS = relocate.o console.o stack.o init.o chip.o mem_region.o
> +CORE_OBJS = relocate.o console.o stack.o init.o chip.o mem_region.o vm.o
>  CORE_OBJS += malloc.o lock.o cpu.o utils.o fdt.o opal.o interrupts.o timebase.o
>  CORE_OBJS += opal-msg.o pci.o pci-iov.o pci-virt.o pci-slot.o pcie-slot.o
>  CORE_OBJS += pci-opal.o fast-reboot.o device.o exceptions.o trace.o affinity.o
> diff --git a/core/cpu.c b/core/cpu.c
> index 88477f82..15829ede 100644
> --- a/core/cpu.c
> +++ b/core/cpu.c
> @@ -469,12 +469,16 @@ static void cpu_idle_p9(enum cpu_wake_cause wake_on)
>                 /* PSSCR SD=0 ESL=1 EC=1 PSSL=0 TR=3 MTL=0 RL=1 */
>                 psscr = PPC_BIT(42) | PPC_BIT(43) |
>                         PPC_BITMASK(54, 55) | PPC_BIT(63);
> +               vm_exit();
>                 enter_p9_pm_state(psscr);
> +               vm_enter();
>         } else {
>                 /* stop with EC=0 (resumes) which does not require sreset. */
>                 /* PSSCR SD=0 ESL=0 EC=0 PSSL=0 TR=3 MTL=0 RL=1 */
>                 psscr = PPC_BITMASK(54, 55) | PPC_BIT(63);
> +               // vm_exit();
>                 enter_p9_pm_lite_state(psscr);
> +               // vm_enter();
>         }
>
>         /* Clear doorbell */
> diff --git a/core/init.c b/core/init.c
> index ca6c468c..8e99ab42 100644
> --- a/core/init.c
> +++ b/core/init.c
> @@ -347,7 +347,7 @@ bool start_preload_kernel(void)
>  static bool load_kernel(void)
>  {
>         void *stb_container = NULL;
> -       struct elf_hdr *kh;
> +       struct elf_hdr *kh, *t;
>         int loaded;
>
>         prlog(PR_NOTICE, "INIT: Waiting for kernel...\n");
> @@ -386,7 +386,7 @@ static bool load_kernel(void)
>                 if (kernel_entry < EXCEPTION_VECTORS_END) {
>                         cpu_set_sreset_enable(false);
>                         memcpy(NULL, old_vectors, EXCEPTION_VECTORS_END);
> -                       sync_icache();
> +                       sync_icache(0);
>                 }
>         } else {
>                 if (!kernel_size) {
> @@ -407,21 +407,25 @@ static bool load_kernel(void)
>               "INIT: Kernel loaded, size: %zu bytes (0 = unknown preload)\n",
>               kernel_size);
>
> -       if (kh->ei_ident != ELF_IDENT) {
> +//     t = vm_map((unsigned long)kh, sizeof(*kh));
> +       vm_map_global((unsigned long)kh, sizeof(*kh));
> +       t = kh;
> +       if (t->ei_ident != ELF_IDENT) {
>                 prerror("INIT: ELF header not found. Assuming raw binary.\n");
>                 return true;
>         }
>
> -       if (kh->ei_class == ELF_CLASS_64) {
> -               if (!try_load_elf64(kh))
> +       if (t->ei_class == ELF_CLASS_64) {
> +               if (!try_load_elf64(t))
>                         return false;
> -       } else if (kh->ei_class == ELF_CLASS_32) {
> -               if (!try_load_elf32(kh))
> +       } else if (t->ei_class == ELF_CLASS_32) {
> +               if (!try_load_elf32(t))
>                         return false;
>         } else {
>                 prerror("INIT: Neither ELF32 not ELF64 ?\n");
>                 return false;
>         }
> +//     vm_unmap((unsigned long)kh, sizeof(*kh));
>
>         if (chip_quirk(QUIRK_MAMBO_CALLOUTS)) {
>                 secureboot_verify(RESOURCE_ID_KERNEL,
> @@ -481,6 +485,7 @@ void __noreturn load_and_boot_kernel(bool is_reboot)
>         const struct dt_property *memprop;
>         const char *cmdline, *stdoutp;
>         uint64_t mem_top;
> +       uint32_t *t;
>
>         memprop = dt_find_property(dt_root, DT_PRIVATE "maxmem");
>         if (memprop)
> @@ -580,13 +585,23 @@ void __noreturn load_and_boot_kernel(bool is_reboot)
>
>         debug_descriptor.state_flags |= OPAL_BOOT_COMPLETE;
>
> +       printf("%s:%d\n", __FILE__, __LINE__);
>         fdt_set_boot_cpuid_phys(fdt, this_cpu()->pir);
> +       printf("%s:%d\n", __FILE__, __LINE__);
>
> +       t = vm_map(kernel_entry, 4);
> +       printf("%s:%d\n", __FILE__, __LINE__);
>         /* Check there is something there before we branch to it */
> -       if (*(uint32_t *)kernel_entry == 0) {
> +       if (*t == 0) {
>                 prlog(PR_EMERG, "FATAL: Kernel is zeros, can't execute!\n");
>                 assert(0);
>         }
> +       printf("%s:%d\n", __FILE__, __LINE__);
> +       vm_unmap(kernel_entry, 4);
> +       printf("%s:%d\n", __FILE__, __LINE__);
> +
> +       /* Go back to realmode and tear down our VM before booting kernel */
> +       vm_destroy();
>
>         if (kernel_32bit)
>                 start_kernel32(kernel_entry, fdt, mem_top);
> @@ -747,23 +762,35 @@ static void setup_branch_null_catcher(void)
>
>  void setup_reset_vector(void)
>  {
> +       static char patch[0x100];
>         uint32_t *src, *dst;
> +       uint32_t *t;
> +       uint32_t len = (void *)&reset_patch_end - (void *)&reset_patch_start;
>
>         /* Copy the reset code over the entry point. */
>         src = &reset_patch_start;
> +       t = vm_map((unsigned long)src, len);
> +       memcpy(patch, t, len);
> +       vm_unmap((unsigned long)src, len);
> +
>         dst = (uint32_t *)0x100;
> -       while(src < &reset_patch_end)
> -               *(dst++) = *(src++);
> -       sync_icache();
> +       t = vm_map((unsigned long)dst, len);
> +       memcpy(t, patch, len);
> +       sync_icache((unsigned long)t);
> +       vm_unmap((unsigned long)dst, len);
>         cpu_set_sreset_enable(true);
>  }
>
>  void copy_exception_vectors(void)
>  {
> +       void *t;
> +
> +       t = vm_map(0x0, 0x2000);
> +
>         /* Backup previous vectors as this could contain a kernel
>          * image.
>          */
> -       memcpy(old_vectors, NULL, EXCEPTION_VECTORS_END);
> +       memcpy(old_vectors, t, EXCEPTION_VECTORS_END);
>
>         /* Copy from 0x100 to EXCEPTION_VECTORS_END, avoid below 0x100 as
>          * this is the boot flag used by CPUs still potentially entering
> @@ -771,9 +798,10 @@ void copy_exception_vectors(void)
>          */
>         BUILD_ASSERT((&reset_patch_end - &reset_patch_start) <
>                         EXCEPTION_VECTORS_END - 0x100);
> -       memcpy((void *)0x100, (void *)(SKIBOOT_BASE + 0x100),
> +       memcpy(t + 0x100, (void *)(SKIBOOT_BASE + 0x100),
>                         EXCEPTION_VECTORS_END - 0x100);
> -       sync_icache();
> +       sync_icache((unsigned long)t);
> +       vm_unmap(0x0, 0x2000);
>  }
>
>  static void per_thread_sanity_checks(void)
> @@ -971,6 +999,8 @@ void __noreturn __nomcount main_cpu_entry(const void *fdt)
>          */
>         mem_region_init();
>
> +       vm_init();
> +

This is a bit too late to really be useful. Ideally we'd want to be in
virtual mode before the HDAT parser runs or the FDT is expanded.

>         /* Reserve HOMER and OCC area */
>         homer_init();
>
> @@ -1184,6 +1214,8 @@ void __noreturn __secondary_cpu_entry(void)
>  {
>         struct cpu_thread *cpu = this_cpu();
>
> +       vm_init_secondary();
> +
>         /* Secondary CPU called in */
>         cpu_callin(cpu);
>
> diff --git a/core/mem_region.c b/core/mem_region.c
> index bd387f3c..5c427523 100644
> --- a/core/mem_region.c
> +++ b/core/mem_region.c
> @@ -65,24 +65,27 @@ static struct mem_region skiboot_os_reserve = {
>         .type           = REGION_OS,
>  };
>
> -struct mem_region skiboot_heap = {
> -       .name           = "ibm,firmware-heap",
> -       .start          = HEAP_BASE,
> -       .len            = HEAP_SIZE,
> -       .type           = REGION_SKIBOOT_HEAP,
> -};
> -
>  static struct mem_region skiboot_code_and_text = {
>         .name           = "ibm,firmware-code",
>         .start          = SKIBOOT_BASE,
>         .len            = HEAP_BASE - SKIBOOT_BASE,
> +       .vm_mapped_len  = HEAP_BASE - SKIBOOT_BASE,
>         .type           = REGION_SKIBOOT_FIRMWARE,
>  };
>
> +struct mem_region skiboot_heap = {
> +       .name           = "ibm,firmware-heap",
> +       .start          = HEAP_BASE,
> +       .len            = HEAP_SIZE,
> +       .vm_mapped_len  = HEAP_SIZE,
> +       .type           = REGION_SKIBOOT_HEAP,
> +};
> +
>  static struct mem_region skiboot_after_heap = {
>         .name           = "ibm,firmware-data",
>         .start          = HEAP_BASE + HEAP_SIZE,
>         .len            = SKIBOOT_BASE + SKIBOOT_SIZE - (HEAP_BASE + HEAP_SIZE),
> +       .vm_mapped_len  = SKIBOOT_BASE + SKIBOOT_SIZE - (HEAP_BASE + HEAP_SIZE),
>         .type           = REGION_SKIBOOT_FIRMWARE,
>  };
>
> @@ -137,14 +140,6 @@ static struct alloc_hdr *next_hdr(const struct mem_region *region,
>  #if POISON_MEM_REGION == 1
>  static void mem_poison(struct free_hdr *f)
>  {
> -       size_t poison_size = (void*)tailer(f) - (void*)(f+1);
> -
> -       /* We only poison up to a limit, as otherwise boot is
> -        * kinda slow */
> -       if (poison_size > POISON_MEM_REGION_LIMIT)
> -               poison_size = POISON_MEM_REGION_LIMIT;
> -
> -       memset(f+1, POISON_MEM_REGION_WITH, poison_size);
>  }
>  #else
>  static inline void mem_poison(struct free_hdr *f __unused) { }
> @@ -154,21 +149,40 @@ static inline void mem_poison(struct free_hdr *f __unused) { }
>  static void init_allocatable_region(struct mem_region *region)
>  {
>         struct free_hdr *f = region_start(region);
> +       unsigned int num_longs;
> +       unsigned long *t;
> +
>         assert(region->type == REGION_SKIBOOT_HEAP ||
>                region->type == REGION_MEMORY);
> -       f->hdr.num_longs = region->len / sizeof(long);
> +
> +       num_longs = region->len / sizeof(long);
> +
> +       if (!region->vm_mapped_len) {
> +               /* SKIBOOT_BASE-SIZE regions already come mapped */
> +               region->vm_mapped_len = PAGE_SIZE;
> +               vm_map_global((unsigned long)f, PAGE_SIZE);
> +       }
> +
> +       assert(PAGE_SIZE >= sizeof(*f));
> +       assert(region->len >= PAGE_SIZE*2);
> +
> +       f->hdr.num_longs = num_longs;
>         f->hdr.free = true;
>         f->hdr.prev_free = false;
> -       *tailer(f) = f->hdr.num_longs;
>         list_head_init(&region->free_list);
>         list_add(&region->free_list, &f->list);
> -       mem_poison(f);
> +
> +       t = vm_map((unsigned long)tailer(f), sizeof(long));
> +//     *tailer(f) = num_longs;
> +       *t = num_longs;
> +       vm_unmap((unsigned long)tailer(f), sizeof(long));
>  }
>
>  static void make_free(struct mem_region *region, struct free_hdr *f,
>                       const char *location, bool skip_poison)
>  {
>         struct alloc_hdr *next;
> +       unsigned long *t;
>
>         if (!skip_poison)
>                 mem_poison(f);
> @@ -192,7 +206,10 @@ static void make_free(struct mem_region *region, struct free_hdr *f,
>         }
>
>         /* Fix up tailer. */
> -       *tailer(f) = f->hdr.num_longs;
> +       t = vm_map((unsigned long)tailer(f), sizeof(long));
> +//     *tailer(f) = f->hdr.num_longs;
> +       *t = f->hdr.num_longs;
> +       vm_unmap((unsigned long)tailer(f), sizeof(long));
>
>         /* If next is free, coalesce it */
>         next = next_hdr(region, &f->hdr);
> @@ -381,6 +398,7 @@ static void *__mem_alloc(struct mem_region *region, size_t size, size_t align,
>         size_t alloc_longs, offset;
>         struct free_hdr *f;
>         struct alloc_hdr *next;
> +       unsigned long newsz;
>
>         /* Align must be power of 2. */
>         assert(!((align - 1) & align));
> @@ -455,6 +473,14 @@ found:
>                 /* This coalesces as required. */
>                 make_free(region, pre, location, true);
>         }
> +
> +       newsz = ((void *)((unsigned long *)f + alloc_longs) - region_start(region) + sizeof(struct free_hdr));
> +       if (newsz > region->vm_mapped_len) {
> +               newsz += PAGE_SIZE-1;
> +               newsz &= ~(PAGE_SIZE-1);
> +               vm_map_global((unsigned long)region_start(region) + region->vm_mapped_len, newsz - region->vm_mapped_len);
> +               region->vm_mapped_len = newsz;
> +       }
>
>         /* We might be too long; put the rest back. */
>         discard_excess(region, &f->hdr, alloc_longs, location, true);
> diff --git a/core/opal.c b/core/opal.c
> index 7ffca9c1..14469062 100644
> --- a/core/opal.c
> +++ b/core/opal.c
> @@ -320,9 +320,12 @@ opal_call(OPAL_QUIESCE, opal_quiesce, 2);
>
>  void __opal_register(uint64_t token, void *func, unsigned int nargs)
>  {
> +       uint64_t *t;
>         assert(token <= OPAL_LAST);
>
> -       opal_branch_table[token] = function_entry_address(func);
> +       t = vm_map((unsigned long)&opal_branch_table[token], sizeof(uint64_t));
> +       *t = function_entry_address(func);
> +       vm_unmap((unsigned long)&opal_branch_table[token], sizeof(uint64_t));
>         opal_num_args[token] = nargs;
>  }
>
> diff --git a/core/stack.c b/core/stack.c
> index 73700ce5..3a86a376 100644
> --- a/core/stack.c
> +++ b/core/stack.c
> @@ -26,8 +26,6 @@
>  #define STACK_BUF_ENTRIES      60
>  static struct bt_entry bt_buf[STACK_BUF_ENTRIES];
>
> -extern uint32_t _stext, _etext;
> -
>  /* Dumps backtrace to buffer */
>  void __nomcount ___backtrace(struct bt_entry *entries, unsigned int *count,
>                                 unsigned long r1,
> diff --git a/core/vm.c b/core/vm.c
> new file mode 100644
> index 00000000..f97f6f2d
> --- /dev/null
> +++ b/core/vm.c
> @@ -0,0 +1,538 @@
> +/* Copyright 2018 IBM Corp.
> + *
> + * Licensed under the Apache License, Version 2.0 (the "License");
> + * you may not use this file except in compliance with the License.
> + * You may obtain a copy of the License at
> + *
> + *     http://www.apache.org/licenses/LICENSE-2.0
> + *
> + * Unless required by applicable law or agreed to in writing, software
> + * distributed under the License is distributed on an "AS IS" BASIS,
> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
> + * implied.
> + * See the License for the specific language governing permissions and
> + * limitations under the License.
> + */
> +
> +#include <skiboot.h>
> +#include <opal.h>
> +#include <stack.h>
> +#include <cpu.h>
> +#include <trace.h>
> +#include <ccan/str/str.h>
> +#include <ccan/container_of/container_of.h>
> +
> +static bool vm_setup = false;
> +
> +#define SLB_SZ         (256UL*1024*1024)
> +#define SLB_NR         32
> +#define LOCAL_SLB_NR   2
> +#define GLOBAL_SLB_NR  (SLB_NR - LOCAL_SLB_NR)
> +#define LOCAL_SLB_BASE GLOBAL_SLB_NR
> +
> +struct slbe {
> +       int valid;
> +       unsigned long esid;
> +       unsigned long vsid;
> +};
> +
> +/* Entry 31 is reserved for local SLB maps */
> +static struct slbe global_slb[GLOBAL_SLB_NR];
> +static int global_slb_used = 0;
> +
> +static void slb_install(unsigned long esid, unsigned long vsid, unsigned int index)
> +{
> +       unsigned long rs;
> +       unsigned long rb;
> +
> +       rs = vsid << (63-51);           /* 256MB VSID */
> +       rs |= 1UL << (63-53);           /* Kp = 1 */
> +
> +       rb = esid << (63-35);           /* 256MB ESID */
> +       rb |= 1UL << (63-36);           /* V = 1 */
> +       rb |= index;
> +
> +       asm volatile("slbmte %0,%1" : : "r"(rs), "r"(rb) : "memory");
> +}
> +
> +#if 0
> +static void slb_remove(unsigned long esid)
> +{
> +       asm volatile("isync ; slbie %0 ; isync" : : "r"(esid << 28) : "memory");
> +}
> +#endif
> +
> +static void slb_remove_all(void)
> +{
> +       asm volatile("isync ; slbmte %0,%0 ; slbia ; isync" : : "r"(0) : "memory");
> +}
> +
> +static bool global_slb_hit(unsigned long esid)
> +{
> +       int i;
> +
> +       for (i = 0; i < global_slb_used; i++) {
> +               struct slbe *s = &global_slb[i];
> +
> +               if (!s->valid)
> +                       continue;
> +
> +//             printf("global slb hit esid:%lx s->esid:%lx\n", esid, s->esid);
> +               if (s->esid == esid)
> +                       return true;
> +       }
> +       return false;
> +}
> +
> +static void global_slb_add(unsigned long esid, unsigned long vsid)
> +{
> +       struct slbe *s = &global_slb[global_slb_used];
> +
> +       assert(!global_slb_hit(esid));
> +       assert(global_slb_used < GLOBAL_SLB_NR);
> +       global_slb_used++;
> +
> +       s->valid = 1;
> +       s->esid = esid;
> +       s->vsid = vsid;
> +}
> +
> +static void sync_global_slb(void)
> +{
> +       int i;
> +
> +//     slb_remove_all();
> +       for (i = 0; i < GLOBAL_SLB_NR; i++) {
> +               struct slbe *s = &global_slb[i];
> +
> +               if (!s->valid)
> +                       continue;
> +               slb_install(s->esid, s->vsid, i);
> +       }
> +}
> +
> +static void cpu_sync_global_slb(void *arg __unused)
> +{
> +       // printf("CPU PIR 0x%04x cpu_sync_global_slb\n", this_cpu()->pir);
> +       sync_global_slb();
> +}
> +
> +static void cpu_all_sync_global_slb(void)
> +{
> +       struct cpu_thread *cpu;
> +
> +       cpu_sync_global_slb(NULL);
> +       return;
> +
> +       /* XXX: deadlocks allocating memory */
> +       for_each_available_cpu(cpu) {
> +               if (cpu == this_cpu()) {
> +                       cpu_sync_global_slb(NULL);
> +                       continue;
> +               }
> +               cpu_wait_job(cpu_queue_job(cpu, "cpu_sync_global_slb",
> +                       cpu_sync_global_slb, NULL), true);
> +       }
> +}
> +
> +struct hpte {
> +       unsigned long dword[2];
> +};
> +
> +struct hpteg {
> +       struct hpte hpte[8];
> +};
> +
> +static struct hpteg *htab;
> +static unsigned long htab_nr_bytes;
> +static unsigned long htab_nr_ptegs;
> +static unsigned long htab_pteg_mask;
> +
> +static void htab_install(unsigned long va, unsigned long pa, int rw, int ex, int global)
> +{
> +       unsigned long hash;
> +       struct hpteg *hpteg;
> +       unsigned int i;
> +
> +       hash = ((va >> 12) & 0xffff) ^ ((va >> 28) & 0x7fffffffffUL);
> +       hpteg = &htab[hash & htab_pteg_mask];
> +
> +       for (i = 0; i < 8; i++) {
> +               struct hpte *hpte = &hpteg->hpte[i];
> +               unsigned long ava = va >> 23;
> +               unsigned long arpn = pa >> 12;
> +               unsigned long dw0, dw1;
> +
> +               dw0 = be64_to_cpu(hpte->dword[0]);
> +               if (dw0 & 1) {
> +                       if (dw0 >> 7 == ava) {
> +                               printf("HTAB collision va:%lx pa:%lx rw:%d ex:%d global:%d\n", va, pa, rw, ex, global);
> +                               assert(0);
> +                               return;
> +                       }
> +                       continue;
> +               }
> +
> +               assert(!hpte->dword[0]);
> +               assert(!hpte->dword[1]);
> +
> +               dw0 = (ava << (63-56)) | 0x1;
> +               if (!global)
> +                       dw0 |= 0x8;
> +
> +               dw1 = (arpn << (63-43 - 8));
> +               if (!rw)
> +                       dw1 |= (1UL << (63 - 0)) | (1UL << (63 - 63 + 1));
> +               if (!ex)
> +                       dw1 |= (1UL << (63 - 61));
> +               dw1 |= (1UL << (63 - 60 + 1)); /* WIMG = 0010 */
> +
> +               hpte->dword[1] = cpu_to_be64(dw1);
> +               eieio();
> +               hpte->dword[0] = cpu_to_be64(dw0);
> +
> +               return;
> +       }
> +       assert(0);
> +}
> +
> +static void htab_remove(unsigned long va, int global)
> +{
> +       unsigned long hash;
> +       struct hpteg *hpteg;
> +       unsigned int i;
> +
> +       hash = ((va >> 12) & 0xffff) ^ ((va >> 28) & 0x7fffffffffUL);
> +       hpteg = &htab[hash & htab_pteg_mask];
> +
> +       for (i = 0; i < 8; i++) {
> +               struct hpte *hpte = &hpteg->hpte[i];
> +               unsigned long ava = va >> 23;
> +               unsigned long dw0;
> +
> +               dw0 = be64_to_cpu(hpte->dword[0]);
> +
> +               if (!(dw0 & 1)) {
> +                       assert(!hpte->dword[0]);
> +                       assert(!hpte->dword[1]);
> +                       continue;
> +               }
> +
> +               if (dw0 >> 7 != ava)
> +                       continue;
> +
> +               if (global)
> +                       assert(!(dw0 & 0x8));
> +               else
> +                       assert(dw0 & 0x8);
> +
> +               hpte->dword[0] = 0;
> +               eieio();
> +               hpte->dword[1] = 0;
> +               eieio();
> +
> +               if (global) {
> +                       asm volatile("tlbie %0,%1" : : "r"(ava<<12), "r"(0));
> +                       asm volatile("eieio ; tlbsync ; ptesync" ::: "memory");
> +               } else {
> +                       asm volatile("tlbiel %0" : : "r"(ava<<12));
> +                       asm volatile("ptesync" ::: "memory");
> +               }
> +               return;
> +       }
> +       assert(0);
> +}
> +
> +void vm_map_global(unsigned long addr, unsigned long len)
> +{
> +       unsigned long va;
> +       unsigned long esid = addr >> 28;
> +       unsigned long end = addr + len;
> +       bool need_sync;
> +
> +       assert(vm_setup);
> +       assert((addr >= SKIBOOT_BASE + SKIBOOT_SIZE) || (addr + len) <= SKIBOOT_BASE);
> +
> +       end = (end + (PAGE_SIZE - 1)) & ~(PAGE_SIZE - 1);
> +       addr &= ~(PAGE_SIZE - 1);
> +       len = end - addr;
> +
> +       // printf("vm_map_global: %lx-%lx\n", addr, addr + len);
> +
> +       if (!global_slb_hit(esid)) {
> +               global_slb_add(esid, esid);
> +               need_sync = true;
> +       }
> +
> +       for (va = addr; va < end; va += PAGE_SIZE) {
> +               if (va >> 28 != esid) {
> +                       esid = va >> 28;
> +                       if (!global_slb_hit(esid)) {
> +                               global_slb_add(esid, esid);
> +                               need_sync = true;
> +                       }
> +               }
> +
> +               htab_install(va, va, 1, 0, 1);
> +       }
> +
> +       if (need_sync)
> +               cpu_all_sync_global_slb();
> +}
> +
> +void *vm_map(unsigned long addr, unsigned long len)
> +{
> +       struct cpu_thread *c = this_cpu();
> +       unsigned long va;
> +       unsigned long esid = (0x0800000000000000ULL + (c->pir << 28)) >> 28;
> +       unsigned long vsid = (unsigned long)c->pir << 30; /* per-cpu VA */
> +       unsigned long end = addr + len;
> +       unsigned long offset = addr & (PAGE_SIZE - 1);
> +
> +       /* Can't do nested mappings */
> +       assert(!c->vm_local_map);
> +       c->vm_local_map = true;
> +
> +       if (!c->vm_setup)
> +               return (void *)addr;
> +
> +       end = (end + (PAGE_SIZE - 1)) & ~(PAGE_SIZE - 1);
> +       addr &= ~(PAGE_SIZE - 1);
> +       len = end - addr;
> +
> +       // printf("vm_map: %lx-%lx esid:%lx\n", addr, addr + len, esid);
> +
> +       assert(len < (1 << 28)); /* same segment */
> +
> +       va = vsid << 28;
> +       while (addr < end) {
> +               htab_install(va, addr, 1, 0, 0);
> +               va += PAGE_SIZE;
> +               addr += PAGE_SIZE;
> +       }
> +
> +       printf("vm_map: %lx-%lx esid:%lx vsid:%lx addr=%lx\n", addr, addr + len, esid, vsid, (esid<<28)+offset);
> +
> +       return (void *)(esid << 28) + offset;
> +}
> +
> +void vm_unmap(unsigned long addr, unsigned long len)
> +{
> +       struct cpu_thread *c = this_cpu();
> +       unsigned long va;
> +       unsigned long vsid = (unsigned long)c->pir << 30; /* per-cpu VA */
> +       unsigned long end = addr + len;
> +
> +       assert(c->vm_local_map);
> +       c->vm_local_map = false;
> +
> +       if (!c->vm_setup)
> +               return;
> +
> +       end = (end + (PAGE_SIZE - 1)) & ~(PAGE_SIZE - 1);
> +       addr &= ~(PAGE_SIZE - 1);
> +       len = end - addr;
> +
> +       // printf("vm_unmap: %lx-%lx esid:%lx\n", addr, addr + len, esid);
> +
> +       assert(len < (1 << 28)); /* same segment */
> +
> +       va = vsid << 28;
> +       while (addr < end) {
> +               htab_remove(va, 0);
> +               va += PAGE_SIZE;
> +               addr += PAGE_SIZE;
> +       }
> +}
> +
> +struct prte {
> +       unsigned long dword[2];
> +};
> +
> +static struct prte *prtab;
> +
> +static unsigned long stack_end = SKIBOOT_BASE + SKIBOOT_SIZE;
> +

> +void vm_map_stacks(void)
> +{
> +       unsigned long start = stack_end;
> +       unsigned long end = start + (cpu_max_pir + 1)*STACK_SIZE;
> +       unsigned long va;
> +
> +       if (start == end)
> +               return;
> +
> +       for (va = start; va < end; va += PAGE_SIZE)
> +               htab_install(va, va, 1, 0, 1);
> +
> +       stack_end = end;
> +}

I'd look at having each thread map its own stack rather than doing it
all at once. That way we can enter virtual mode before the DT has been
expanded since we need the DT to find cpu_max_pir.

> +static void vm_init_cpu(void)
> +{
> +       struct cpu_thread *c = this_cpu();
> +       unsigned long esid = (0x0800000000000000ULL + (c->pir << 28)) >> 28;
> +       unsigned long vsid = (unsigned long)c->pir << 30; /* per-cpu VA */
> +
> +       mtspr(SPR_LPCR, mfspr(SPR_LPCR) &
> +               ~(PPC_BITMASK(0,3) | PPC_BIT(41) | PPC_BIT(43) | PPC_BIT(54)));
> +       mtspr(SPR_LPID, 0);
> +       mtspr(SPR_PID, 0);
> +       mtspr(SPR_HRMOR, 0);

If HRMOR is non-zero we'll fail an assert long before we get here.
IIRC HRMOR is replicated across threads on the same core so you need
to rendezvous all the threads on a core at an address with the high
bit set (bypasses HRMOR) to safely update it. Hostboot and the FSP
should always load us with HRMOR set to zero so it shouldn't matter.

> +       mtspr(SPR_PTCR, (unsigned long)prtab);
> +
> +       sync_global_slb();
> +
> +       slb_install(esid, vsid, LOCAL_SLB_BASE);
> +}
> +
> +static void vm_cleanup_cpu(void)
> +{
> +       slb_remove_all();
> +       // XXX: have the last thread per core clear this reg
> +       // mtspr(SPR_PTCR, 0);
> +}
> +
> +void vm_init_secondary(void)
> +{
> +       vm_init_cpu();
> +       vm_enter();
> +}
> +
> +void vm_enter(void)
> +{
> +       struct cpu_thread *c = this_cpu();
> +
> +       assert(vm_setup);
> +       assert(!c->vm_setup);
> +       c->vm_setup = true;
> +       mtmsr(mfmsr() | (MSR_IR|MSR_DR));
> +}
> +
> +void vm_exit(void)
> +{
> +       struct cpu_thread *c = this_cpu();
> +
> +       assert(vm_setup);
> +       assert(c->vm_setup);
> +       c->vm_setup = false;
> +       mtmsr(mfmsr() & ~(MSR_IR|MSR_DR));
> +}
> +
> +static void cpu_stop_vm(void *arg __unused)
> +{
> +       printf("CPU PIR 0x%04x cpu_stop_vm\n", this_cpu()->pir);
> +       vm_exit();
> +       vm_cleanup_cpu();
> +}
> +
> +static void cpu_all_stop_vm(void)
> +{
> +       struct cpu_thread *cpu;
> +       struct cpu_job **jobs;
> +
> +       jobs = zalloc(sizeof(struct cpu_job *) * cpu_max_pir + 1);
> +       assert(jobs);
> +
> +       for_each_available_cpu(cpu) {
> +               if (cpu == this_cpu())
> +                       continue;
> +               jobs[cpu->pir] = cpu_queue_job(cpu, "cpu_stop_vm",
> +                                               cpu_stop_vm, NULL);
> +       }
> +
> +       /* this cpu */
> +       cpu_stop_vm(NULL);
> +
> +       for_each_available_cpu(cpu) {
> +               if (jobs[cpu->pir])
> +                       cpu_wait_job(jobs[cpu->pir], true);
> +       }
> +
> +       free(jobs);
> +}
> +
> +void vm_init(void)
> +{
> +       unsigned long va;
> +
> +//     prtab = local_alloc(0, 64*1024, 64*1024);
> +       prtab = memalign(64*1024, 64*1024);
> +       assert(prtab);
> +       memset(prtab, 0, 64*1024);
> +
> +       global_slb_add(SKIBOOT_BASE >> 28, SKIBOOT_BASE >> 28);
> +
> +       htab_nr_bytes = 1UL<<18;
> +       htab_nr_ptegs = htab_nr_bytes / sizeof(struct hpteg);
> +       htab_pteg_mask = htab_nr_ptegs - 1;
> +//     htab = local_alloc(0, htab_nr_bytes, 1UL<<18);
> +       htab = memalign(1UL<<18, htab_nr_bytes);

I'd just statically allocate some space for it in the skiboot memory
map. That would allow entering virtual mode earlier too.

> +       assert(htab);
> +       memset(htab, 0, htab_nr_bytes);
> +
> +       prtab[0].dword[0] = cpu_to_be64((unsigned long)htab);
> +       prtab[0].dword[1] = 0;
> +
> +       eieio();
> +
> +       vm_init_cpu();
> +
> +//     for (va = (unsigned long)_stext; va < HEAP_BASE; va += PAGE_SIZE) {
> +       for (va = (unsigned long)_stext; va < (unsigned long)_end; va += PAGE_SIZE) {
> +               if (va >= (unsigned long)_stext && va <= (unsigned long)_etext)
> +                       htab_install(va, va, 0, 1, 1); /* text */
> +               else if (va >= (unsigned long)__rodata_start &&
> +                               va <= (unsigned long)__rodata_end)
> +                       htab_install(va, va, 0, 0, 1);
> +               else if (va >= (unsigned long)_sdata &&
> +                               va <= (unsigned long)_edata)
> +                       htab_install(va, va, 1, 0, 1);
> +               else if (va >= (unsigned long)__sym_map_start &&
> +                               va <= (unsigned long)__sym_map_end)
> +                       htab_install(va, va, 0, 0, 1);
> +               else if (va >= (unsigned long)_sbss &&
> +                               va <= (unsigned long)_ebss)
> +                       htab_install(va, va, 1, 0, 1);
> +       }
> +       for (; va < SKIBOOT_BASE + SKIBOOT_SIZE; va += PAGE_SIZE)
> +               htab_install(va, va, 1, 0, 1);
> +
> +       vm_map_stacks();
> +
> +       printf("VMM: SETUP\n");
> +       printf(" PRTAB:%p\n", prtab);
> +       printf(" HTAB: %p\n", htab);
> +       printf(" Global mappings\n");
> +       printf("  text   %lx-%lx\n", (unsigned long)_stext, (unsigned long)_etext);
> +       printf("  rodata %lx-%lx\n", (unsigned long)__rodata_start, (unsigned long)__rodata_end);
> +       printf("  data   %lx-%lx\n", (unsigned long)_sdata, (unsigned long)_edata);
> +       printf("  sym    %lx-%lx\n", (unsigned long)__sym_map_start, (unsigned long)__sym_map_end);
> +       printf("  bss    %lx-%lx\n", (unsigned long)_sbss, (unsigned long)_ebss);
> +       printf("  heap   %lx-%lx\n", (unsigned long)HEAP_BASE, (unsigned long)SKIBOOT_BASE + SKIBOOT_SIZE);
> +       printf("  stacks %lx-%lx\n", (unsigned long)SKIBOOT_BASE + SKIBOOT_SIZE, stack_end);
> +
> +       eieio();
> +
> +       vm_setup = true;
> +
> +       vm_enter();
> +}
> +
> +void vm_destroy(void)
> +{
> +       assert(vm_setup);
> +
> +       printf("VMM: TEARDOWN\n");
> +
> +       cpu_all_stop_vm();
> +
> +       vm_setup = false;
> +
> +       /* XXX: don't need to remove */
> +//     for (va = SKIBOOT_BASE; va < SKIBOOT_BASE + SKIBOOT_SIZE + (cpu_max_pir + 1) * STACK_SIZE; va += PAGE_SIZE)
> +//             htab_remove(va, 1);
> +
> +       free(htab);
> +       free(prtab);
> +}
> diff --git a/hw/fake-nvram.c b/hw/fake-nvram.c
> index 236ad5b9..50b76eb5 100644
> --- a/hw/fake-nvram.c
> +++ b/hw/fake-nvram.c
> @@ -36,12 +36,16 @@ int fake_nvram_info(uint32_t *total_size)
>
>  int fake_nvram_start_read(void *dst, uint32_t src, uint32_t len)
>  {
> +       void *t;
> +
>         if (!nvram_region)
>                 return -ENODEV;
>
> +       t = vm_map(nvram_region->start + src, len);
>         lock(&fake_nvram_lock);
> -       memcpy(dst, (void *) (nvram_region->start + src), len);
> +       memcpy(dst, t, len);
>         unlock(&fake_nvram_lock);
> +       vm_unmap(nvram_region->start + src, len);
>
>         nvram_read_complete(true);
>
> @@ -50,12 +54,16 @@ int fake_nvram_start_read(void *dst, uint32_t src, uint32_t len)
>
>  int fake_nvram_write(uint32_t offset, void *src, uint32_t size)
>  {
> +       void *t;
> +
>         if (!nvram_region)
>                 return OPAL_HARDWARE;
>
> +       t = vm_map(nvram_region->start + offset, size);
>         lock(&fake_nvram_lock);
> -       memcpy((void *) (nvram_region->start + offset), src, size);
> +       memcpy(t, src, size);
>         unlock(&fake_nvram_lock);
> +       vm_unmap(nvram_region->start + offset, size);
>
>         return 0;
>  }
> diff --git a/hw/slw.c b/hw/slw.c
> index dfa9189b..32d6628d 100644
> --- a/hw/slw.c
> +++ b/hw/slw.c
> @@ -164,7 +164,7 @@ static void slw_patch_reset(void)
>                 *(sav++) = *(dst);
>                 *(dst++) = *(src++);
>         }
> -       sync_icache();
> +       sync_icache(0);
>  }
>
>  static void slw_unpatch_reset(void)
> @@ -180,7 +180,7 @@ static void slw_unpatch_reset(void)
>                 *(dst++) = *(sav++);
>                 src++;
>         }
> -       sync_icache();
> +       sync_icache(0);
>  }
>
>  static bool slw_general_init(struct proc_chip *chip, struct cpu_thread *c)
> diff --git a/include/cpu.h b/include/cpu.h
> index 2fe47982..64e35a51 100644
> --- a/include/cpu.h
> +++ b/include/cpu.h
> @@ -82,10 +82,18 @@ struct cpu_thread {
>         struct bt_entry                 stack_bot_bt[CPU_BACKTRACE_SIZE];
>         unsigned int                    stack_bot_bt_count;
>  #endif
> +       /*
> +        * Per-thread VM parameters
> +        */
> +       bool                            vm_setup; /* virtual memory is up */
> +       bool                            vm_local_map; /* local mapping */
> +       bool                            vm_local_slb; /* local SLB used */
> +
>         struct lock                     job_lock;
>         struct list_head                job_queue;
>         uint32_t                        job_count;
>         bool                            job_has_no_return;
> +
>         /*
>          * Per-core mask tracking for threads in HMI handler and
>          * a cleanup done bit.
> diff --git a/include/mem_region.h b/include/mem_region.h
> index 018dfa0e..415cbf4d 100644
> --- a/include/mem_region.h
> +++ b/include/mem_region.h
> @@ -46,6 +46,7 @@ struct mem_region {
>         struct list_node list;
>         const char *name;
>         uint64_t start, len;
> +       uint64_t vm_mapped_len;
>         struct dt_node *node;
>         enum mem_region_type type;
>         struct list_head free_list;
> diff --git a/include/processor.h b/include/processor.h
> index 6b262b45..6f815bb4 100644
> --- a/include/processor.h
> +++ b/include/processor.h
> @@ -53,6 +53,7 @@
>  #define SPR_SRR1       0x01b   /* RW: Exception save/restore reg 1 */
>  #define SPR_CFAR       0x01c   /* RW: Come From Address Register */
>  #define SPR_AMR                0x01d   /* RW: Authority Mask Register */
> +#define SPR_PID                0x030   /* RW: PID register */
>  #define SPR_IAMR       0x03d   /* RW: Instruction Authority Mask Register */
>  #define SPR_RPR                0x0ba   /* RW: Relative Priority Register */
>  #define SPR_TBRL       0x10c   /* RO: Timebase low */
> @@ -75,10 +76,12 @@
>  #define SPR_HSRR1      0x13b   /* RW: HV Exception save/restore reg 1 */
>  #define SPR_TFMR       0x13d
>  #define SPR_LPCR       0x13e
> +#define SPR_LPID       0x13f   /* RW: LPID register */
>  #define SPR_HMER       0x150   /* Hypervisor Maintenance Exception */
>  #define SPR_HMEER      0x151   /* HMER interrupt enable mask */
>  #define SPR_PCR                0x152
>  #define SPR_AMOR       0x15d
> +#define SPR_PTCR       0x1d0   /* RW: Partition table control register */
>  #define SPR_PSSCR      0x357   /* RW: Stop status and control (ISA 3) */
>  #define SPR_TSCR       0x399
>  #define SPR_HID0       0x3f0
> @@ -324,9 +327,9 @@ static inline void isync(void)
>  /*
>   * Cache sync
>   */
> -static inline void sync_icache(void)
> +static inline void sync_icache(unsigned long ptr)
>  {
> -       asm volatile("sync; icbi 0,%0; sync; isync" : : "r" (0) : "memory");
> +       asm volatile("sync; icbi 0,%0; sync; isync" : : "r" (ptr) : "memory");
>  }
>
>  /*
> diff --git a/include/skiboot.h b/include/skiboot.h
> index bba76c12..246a7344 100644
> --- a/include/skiboot.h
> +++ b/include/skiboot.h
> @@ -49,8 +49,13 @@ struct mem_region;
>  extern struct mem_region *mem_region_next(struct mem_region *region);
>
>  #ifndef __TESTING__
> +extern char _stext[], _etext[];
>  /* Readonly section start and end. */
>  extern char __rodata_start[], __rodata_end[];
> +extern char _sdata[], _edata[];
> +extern char __sym_map_start[], __sym_map_end[];
> +extern char _sbss[], _ebss[];
> +extern char _end[];
>
>  static inline bool is_rodata(const void *p)
>  {
> @@ -291,4 +296,18 @@ extern int fake_nvram_info(uint32_t *total_size);
>  extern int fake_nvram_start_read(void *dst, uint32_t src, uint32_t len);
>  extern int fake_nvram_write(uint32_t offset, void *src, uint32_t size);
>
> +/* core/vm.c */
> +#define PAGE_SIZE 4096
> +
> +void vm_map_global(unsigned long addr, unsigned long len);
> +void *vm_map(unsigned long addr, unsigned long len);
> +void vm_unmap(unsigned long addr, unsigned long len);
> +void vm_init(void);
> +void vm_destroy(void);
> +void vm_init_secondary(void);
> +void vm_enter(void);
> +void vm_exit(void);
> +void vm_exit_cleanup(void);
> +void vm_map_stacks(void);
> +
>  #endif /* __SKIBOOT_H */
> diff --git a/libstb/container.c b/libstb/container.c
> index a720fbbf..68111796 100644
> --- a/libstb/container.c
> +++ b/libstb/container.c
> @@ -19,14 +19,20 @@
>
>  bool stb_is_container(const void *buf, size_t size)
>  {
> +       uint32_t *t;
>         ROM_container_raw *c;
> +       bool ret = true;;
>
>         c = (ROM_container_raw*) buf;
>         if (!buf || size < SECURE_BOOT_HEADERS_SIZE)
>                 return false;
> -       if (be32_to_cpu(c->magic_number) != ROM_MAGIC_NUMBER )
> -               return false;
> -       return true;
> +
> +       t = vm_map((unsigned long)&c->magic_number, sizeof(*t));
> +       if (be32_to_cpu(*t) != ROM_MAGIC_NUMBER)
> +               ret = false;
> +       vm_unmap((unsigned long)&c->magic_number, sizeof(*t));
> +
> +       return ret;
>  }
>
>  uint32_t stb_payload_magic(const void *buf, size_t size)
> diff --git a/skiboot.lds.S b/skiboot.lds.S
> index a6e71077..a21e9af9 100644
> --- a/skiboot.lds.S
> +++ b/skiboot.lds.S
> @@ -51,35 +51,41 @@ SECTIONS
>                 KEEP(*(.cpuctrl.data))
>         }
>
> +       /* Do I need to keep these ? */
> +       .dynsym : { *(.dynsym)  }
> +       .dynstr : { *(.dynstr)  }
> +
> +       /* Relocations */
>         . = ALIGN(0x10);
> +       .dynamic : {
> +               __dynamic_start = .;
> +               *(.dynamic)
> +               __dynamic_end = .;
> +       }
> +
> +       . = ALIGN(0x10);
> +       .rela.dyn : {
> +               __rela_dyn_start = .;
> +               *(.rela*)
> +               __rela_dyn_end = .;
> +       }
> +
> +       .hash   : { *(.hash)   }
> +       .dynsym : { *(.dynsym) }
> +       .dynstr : { *(.dynstr) }
> +
> +       . = ALIGN(0x1000);
>         _stext = .;
>         .text : {
>                 *(.text*)
>                 *(.sfpr)
>         }
>         _etext = .;
> +       . = ALIGN(0x1000);
>
> +       __rodata_start = .;
>         .rodata : {
> -               __rodata_start = .;
>                 *(.rodata .rodata.*)
> -               __rodata_end = .;
> -       }
> -
> -       .data : {
> -               /*
> -                * A couple of things that need to be 4K aligned and
> -                * to reside in their own pages for the sake of TCE
> -                * mappings
> -                */
> -               . = ALIGN(0x1000);
> -               *(.data.memcons);
> -               . = ALIGN(0x1000);
> -               *(.data.boot_trace);
> -               . = ALIGN(0x1000);
> -               *(.data*)
> -               *(.force.data)
> -               *(.toc1)
> -               *(.branch_lt)
>         }
>
>         . = ALIGN(0x10);
> @@ -103,43 +109,43 @@ SECTIONS
>                 *(.toc)
>         }
>
> -       . = ALIGN(0x10);
> -       .opal_table : {
> -               __opal_table_start = .;
> -               KEEP(*(.opal_table))
> -               __opal_table_end = .;
> -       }
> -
>         .platforms : {
>                 __platforms_start = .;
>                 KEEP(*(.platforms))
>                 __platforms_end = .;
>         }
>
> -       /* Do I need to keep these ? */
> -       .dynsym : { *(.dynsym)  }
> -       .dynstr : { *(.dynstr)  }
> -
> -       /* Relocations */
>         . = ALIGN(0x10);
> -       .dynamic : {
> -               __dynamic_start = .;
> -               *(.dynamic)
> -               __dynamic_end = .;
> +       .opal_table : {
> +               __opal_table_start = .;
> +               KEEP(*(.opal_table))
> +               __opal_table_end = .;
>         }
> +       __rodata_end = .;
>
> -       . = ALIGN(0x10);
> -       .rela.dyn : {
> -               __rela_dyn_start = .;
> -               *(.rela*)
> -               __rela_dyn_end = .;
> +       . = ALIGN(0x1000);
> +
> +       _sdata = .;
> +       .data : {
> +               /*
> +                * A couple of things that need to be 4K aligned and
> +                * to reside in their own pages for the sake of TCE
> +                * mappings
> +                */
> +               . = ALIGN(0x1000);
> +               *(.data.memcons);
> +               . = ALIGN(0x1000);
> +               *(.data.boot_trace);
> +               . = ALIGN(0x1000);
> +               *(.data*)
> +               *(.force.data)
> +               *(.toc1)
> +               *(.branch_lt)
>         }
> +       _edata = .;
>
> -       .hash   : { *(.hash)   }
> -       .dynsym : { *(.dynsym) }
> -       .dynstr : { *(.dynstr) }
> +       . = ALIGN(0x1000);
>
> -       . = ALIGN(0x10);
>         .sym_map : {
>                 __sym_map_start = . ;
>                 KEEP(*(.sym_map))
> --
> 2.18.0
>
> _______________________________________________
> Skiboot mailing list
> Skiboot@lists.ozlabs.org
> https://lists.ozlabs.org/listinfo/skiboot
Nicholas Piggin Aug. 27, 2018, 7:22 a.m. | #2
On Mon, 27 Aug 2018 16:16:06 +1000
Oliver <oohall@gmail.com> wrote:

> On Mon, Aug 27, 2018 at 12:15 PM, Nicholas Piggin <npiggin@gmail.com> wrote:
> > I tried hacking on this a bit more. This turns on HPT virtual memory
> > quite early in boot. There is a global EA=RA map for "global" mappings
> > which are things that are always mapped and shared, like text and heap.
> > Then there are transient per-CPU mappings that use their own private
> > addresses for temporary mappings of things that are accessed carefully
> > (e.g., like the 0 page interrupt vectors).  
> 
> cool
> 
> > VM gets shut down right before the kernel is booted.
> >
> > This rearranges skiboot.lds.S a bit to put the most similar regions
> > together as possible, which makes it easier to map things with specific
> > protections. Everything but text is no-execute, rodata is read only, etc.
> >
> > Not too sure where I'm going with this. I think it's good to minimise
> > the amount of time spent in real mode in general to catch bugs. Maybe
> > this is unintrusive enough to be worthwhile. But this is only boot, I
> > would like to get to a point where OPAL services run mostly in virtual
> > mode too, but that would look much different and probably require VM
> > provided by the OS.
> >
> > Anyway this "works" (in mambo), it's fairly unintrusive, most code
> > changes are just juggling a link locations around.  
> 
> We'll need to have a think about how we're going to deal with I/O if
> we want to do this on real hardware, or even on mambo before
> xscom_init() is called. Currently we use the explicit cache inhibited
> load/store instructions for accessing MMIO regions in skiboot and
> those are only available in hypervisor real mode. So we'll probably
> need some kind of instruction patching mechanism if we want to boot in
> virtual mode and switch to real mode at runtime.

Yeah I expect that will be the hard part on real hardware.

> Alternatively we could leave them as-is and emulate them at boot time.
> It might be a bit slow, but MMIOs aren't exactly fast to begin with.

Well, you don't need to do patching or interrupts; the happy medium, I
think, would just be a test and branch. Every thread knows whether or
not it's currently running with relocation on.

If we could enable virtual mode *really* early ahead of most of the
MMIOs, maybe we could always do them with cache inhibited mappings and
provide an exceptional case of explicit _rm accessors like the kernel
does.

I think that would actually be quite possible -- the vm code currently
needs the memory allocator up for the hash table, but we could allocate
that more simply much earlier. Or possibly we could do the
mem_region_init earlier.


> > @@ -971,6 +999,8 @@ void __noreturn __nomcount main_cpu_entry(const void *fdt)
> >          */
> >         mem_region_init();
> >
> > +       vm_init();
> > +  
> 
> This is a bit too late to really be useful. Ideally we'd want to be in
> virtual mode before the HDAT parser runs or the FDT is expanded.

Yeah... well it did catch a couple of NULL pointer bugs already. Let's
say it's a bit too late to be really useful.


> > +void vm_map_stacks(void)
> > +{
> > +       unsigned long start = stack_end;
> > +       unsigned long end = start + (cpu_max_pir + 1)*STACK_SIZE;
> > +       unsigned long va;
> > +
> > +       if (start == end)
> > +               return;
> > +
> > +       for (va = start; va < end; va += PAGE_SIZE)
> > +               htab_install(va, va, 1, 0, 1);
> > +
> > +       stack_end = end;
> > +}  
> 
> I'd look at having each thread map it's own stack rather than doing it
> all at once. That way we can enter virtual mode before the DT has been
> expanded since we need the DT to find cpu_max_pir.

We need to map secondary stacks because the boot CPU sets them up
before calling in secondaries. We don't want to go to real mode for
that. It should be fine keeping this part here around init_all_cpus()
time and moving the rest of the vm init earlier though.

> 
> > +static void vm_init_cpu(void)
> > +{
> > +       struct cpu_thread *c = this_cpu();
> > +       unsigned long esid = (0x0800000000000000ULL + (c->pir << 28)) >> 28;
> > +       unsigned long vsid = (unsigned long)c->pir << 30; /* per-cpu VA */
> > +
> > +       mtspr(SPR_LPCR, mfspr(SPR_LPCR) &
> > +               ~(PPC_BITMASK(0,3) | PPC_BIT(41) | PPC_BIT(43) | PPC_BIT(54)));
> > +       mtspr(SPR_LPID, 0);
> > +       mtspr(SPR_PID, 0);
> > +       mtspr(SPR_HRMOR, 0);  
> 
> If HRMOR is non-zero we'll fail an assert long before we get here.
> IIRC HRMOR is replicated across threads on the same core so you need
> to rendezvous all the threads on a core at an address with the high
> bit set (bypasses HRMOR) to safely update it. Hostboot and the FSP
> should always load us with HRMOR set to zero so it shouldn't matter.

Okay I'll get rid of it.

> > +void vm_init(void)
> > +{
> > +       unsigned long va;
> > +
> > +//     prtab = local_alloc(0, 64*1024, 64*1024);
> > +       prtab = memalign(64*1024, 64*1024);
> > +       assert(prtab);
> > +       memset(prtab, 0, 64*1024);
> > +
> > +       global_slb_add(SKIBOOT_BASE >> 28, SKIBOOT_BASE >> 28);
> > +
> > +       htab_nr_bytes = 1UL<<18;
> > +       htab_nr_ptegs = htab_nr_bytes / sizeof(struct hpteg);
> > +       htab_pteg_mask = htab_nr_ptegs - 1;
> > +//     htab = local_alloc(0, htab_nr_bytes, 1UL<<18);
> > +       htab = memalign(1UL<<18, htab_nr_bytes);  
> 
> I'd just statically allocate some space for it in the skiboot memory
> map. That would allow entering virtual mode earlier too.

Yeah.

Thanks,
Nick
Oliver Aug. 30, 2018, 8:30 a.m. | #3
On Mon, Aug 27, 2018 at 5:22 PM, Nicholas Piggin <npiggin@gmail.com> wrote:
> On Mon, 27 Aug 2018 16:16:06 +1000
> Oliver <oohall@gmail.com> wrote:
>
>> On Mon, Aug 27, 2018 at 12:15 PM, Nicholas Piggin <npiggin@gmail.com> wrote:
>> > I tried hacking on this a bit more. This turns on HPT virtual memory
>> > quite early in boot. There is a global EA=RA map for "global" mappings
>> > which are things that are always mapped and shared, like text and heap.
>> > Then there are transient per-CPU mappings that use their own private
>> > addresses for temporary mappings of things that are accessed carefully
>> > (e.g., like the 0 page interrupt vectors).
>>
>> cool
>>
>> > VM gets shut down right before the kernel is booted.
>> >
>> > This rearranges skiboot.lds.S a bit to put the most similar regions
>> > together as possible, which makes it easier to map things with specific
>> > protections. Everything but text is no-execute, rodata is read only, etc.
>> >
>> > Not too sure where I'm going with this. I think it's good to minimise
>> > the amount of time spent in real mode in general to catch bugs. Maybe
>> > this is unintrusive enough to be worthwhile. But this is only boot, I
>> > would like to get to a point where OPAL services run mostly in virtual
>> > mode too, but that would look much different and probably require VM
>> > provided by the OS.
>> >
>> > Anyway this "works" (in mambo), it's fairly unintrusive, most code
>> > changes are just juggling a link locations around.
>>
>> We'll need to have a think about how we're going to deal with I/O if
>> we want to do this on real hardware, or even on mambo before
>> xscom_init() is called. Currently we use the explicit cache inhibited
>> load/store instructions for accessing MMIO regions in skiboot and
>> those are only available in hypervisor real mode. So we'll probably
>> need some kind of instruction patching mechanism if we want to boot in
>> virtual mode and switch to real mode at runtime.
>
> Yeah I expect that will be the hard part on real hardware.
>
>> Alternatively we could leave them as-is and emulate them at boot time.
>> It might be a bit slow, but MMIOs aren't exactly fast to begin with.
>
> Well you don't need to do patching or interrupts, the happy medium I
> think would just be a test and branch. Every thread knows whether or
> not it's currently running with relocation on.
>
> If we could enable virtual mode *really* early ahead of most of the
> MMIOs, maybe we could always do them with cache inhibited mappings and
> provide an exceptional case of explicit _rm accessors like the kernel
> does.
>
> I think that would actually be quite possible -- the vm code currently
> needs the memory allocator up for the hash table, but we could allocate
> that more simply much earlier. Or possibly we could do the
> mem_region_init earlier.
>
>
>> > @@ -971,6 +999,8 @@ void __noreturn __nomcount main_cpu_entry(const void *fdt)
>> >          */
>> >         mem_region_init();
>> >
>> > +       vm_init();
>> > +
>>
>> This is a bit too late to really be useful. Ideally we'd want to be in
>> virtual mode before the HDAT parser runs or the FDT is expanded.
>
> Yeah... well it did catch a couple of NULL pointer bugs already. Let's
> say it's a bit too late to be really useful.

Er yeah, poor choice of words. I mean it'd be way more useful to me
personally if we turned it on earlier since I get to fix all the HDAT
bugs ;)

>> > +void vm_map_stacks(void)
>> > +{
>> > +       unsigned long start = stack_end;
>> > +       unsigned long end = start + (cpu_max_pir + 1)*STACK_SIZE;
>> > +       unsigned long va;
>> > +
>> > +       if (start == end)
>> > +               return;
>> > +
>> > +       for (va = start; va < end; va += PAGE_SIZE)
>> > +               htab_install(va, va, 1, 0, 1);
>> > +
>> > +       stack_end = end;
>> > +}
>>
>> I'd look at having each thread map it's own stack rather than doing it
>> all at once. That way we can enter virtual mode before the DT has been
>> expanded since we need the DT to find cpu_max_pir.
>
> We need to map secondary stacks because the boot CPU sets them up
> before calling in secondaries. We don't want to go to real mode for
> that. It should be fine keeping this part here  around init_all_cpus()
> time and moving the rest of the vm init earlier though.

Oh right, I forgot the boot CPU had to fill out the cpu_thread
structures at the top of the stack. You're right, leaving it here should
be fine.

>> > +static void vm_init_cpu(void)
>> > +{
>> > +       struct cpu_thread *c = this_cpu();
>> > +       unsigned long esid = (0x0800000000000000ULL + (c->pir << 28)) >> 28;
>> > +       unsigned long vsid = (unsigned long)c->pir << 30; /* per-cpu VA */
>> > +
>> > +       mtspr(SPR_LPCR, mfspr(SPR_LPCR) &
>> > +               ~(PPC_BITMASK(0,3) | PPC_BIT(41) | PPC_BIT(43) | PPC_BIT(54)));
>> > +       mtspr(SPR_LPID, 0);
>> > +       mtspr(SPR_PID, 0);
>> > +       mtspr(SPR_HRMOR, 0);
>>
>> If HRMOR is non-zero we'll fail an assert long before we get here.
>> IIRC HRMOR is replicated across threads on the same core so you need
>> to rendezvous all the threads on a core at an address with the high
>> bit set (bypasses HRMOR) to safely update it. Hostboot and the FSP
>> should always load us with HRMOR set to zero so it shouldn't matter.
>
> Okay I'll get rid of it.
>
>> > +void vm_init(void)
>> > +{
>> > +       unsigned long va;
>> > +
>> > +//     prtab = local_alloc(0, 64*1024, 64*1024);
>> > +       prtab = memalign(64*1024, 64*1024);
>> > +       assert(prtab);
>> > +       memset(prtab, 0, 64*1024);
>> > +
>> > +       global_slb_add(SKIBOOT_BASE >> 28, SKIBOOT_BASE >> 28);
>> > +
>> > +       htab_nr_bytes = 1UL<<18;
>> > +       htab_nr_ptegs = htab_nr_bytes / sizeof(struct hpteg);
>> > +       htab_pteg_mask = htab_nr_ptegs - 1;
>> > +//     htab = local_alloc(0, htab_nr_bytes, 1UL<<18);
>> > +       htab = memalign(1UL<<18, htab_nr_bytes);
>>
>> I'd just statically allocate some space for it in the skiboot memory
>> map. That would allow entering virtual mode earlier too.
>
> Yeah.
>
> Thanks,
> Nick

Patch

diff --git a/asm/head.S b/asm/head.S
index 803fbf1a..729cfe87 100644
--- a/asm/head.S
+++ b/asm/head.S
@@ -23,13 +23,6 @@ 
 
 #define EPAPR_MAGIC	0x65504150
 
-/* Power management instructions */
-#define PPC_INST_NAP		.long 0x4c000364
-#define PPC_INST_SLEEP		.long 0x4c0003a4
-#define PPC_INST_RVWINKLE	.long 0x4c0003e4
-
-#define PPC_INST_STOP		.long 0x4c0002e4
-
 #define GET_STACK(stack_reg,pir_reg)					\
 	sldi	stack_reg,pir_reg,STACK_SHIFT;				\
 	addis	stack_reg,stack_reg,CPU_STACKS_OFFSET@ha;		\
@@ -520,104 +513,6 @@  disable_machine_check:
 1:	mtlr	%r0
 	blr
 
-pm_save_regs:
-	SAVE_GPR(2,%r1)
-	SAVE_GPR(14,%r1)
-	SAVE_GPR(15,%r1)
-	SAVE_GPR(16,%r1)
-	SAVE_GPR(17,%r1)
-	SAVE_GPR(18,%r1)
-	SAVE_GPR(19,%r1)
-	SAVE_GPR(20,%r1)
-	SAVE_GPR(21,%r1)
-	SAVE_GPR(22,%r1)
-	SAVE_GPR(23,%r1)
-	SAVE_GPR(24,%r1)
-	SAVE_GPR(25,%r1)
-	SAVE_GPR(26,%r1)
-	SAVE_GPR(27,%r1)
-	SAVE_GPR(28,%r1)
-	SAVE_GPR(29,%r1)
-	SAVE_GPR(30,%r1)
-	SAVE_GPR(31,%r1)
-	mfcr	%r4
-	mfxer	%r5
-	mfspr	%r6,SPR_HSPRG0
-	mfspr	%r7,SPR_HSPRG1
-	stw	%r4,STACK_CR(%r1)
-	stw	%r5,STACK_XER(%r1)
-	std	%r6,STACK_GPR0(%r1)
-	std	%r7,STACK_GPR1(%r1)
-	blr
-
-.global enter_p8_pm_state
-enter_p8_pm_state:
-	/* Before entering map or rvwinkle, we create a stack frame
-	 * and save our non-volatile registers.
-	 *
-	 * We also save these SPRs:
-	 *
-	 *  - HSPRG0	in GPR0 slot
-	 *  - HSPRG1	in GPR1 slot
-	 *
-	 *  - xxx TODO: HIDs
-	 *  - TODO: Mask MSR:ME during the process
-	 *
-	 * On entry, r3 indicates:
-	 *
-	 *    0 = nap
-	 *    1 = rvwinkle
-	 */
-	mflr	%r0
-	std	%r0,16(%r1)
-	stdu	%r1,-STACK_FRAMESIZE(%r1)
-
-	bl	pm_save_regs
-
-	/* Save stack pointer in struct cpu_thread */
-	std	%r1,CPUTHREAD_SAVE_R1(%r13)
-
-	/* Winkle or nap ? */
-	cmpli	%cr0,0,%r3,0
-	bne	1f
-
-	/* nap sequence */
-	ptesync
-0:	ld	%r0,CPUTHREAD_SAVE_R1(%r13)
-	cmpd	cr0,%r0,%r0
-	bne	0b
-	PPC_INST_NAP
-	b	.
-
-	/* rvwinkle sequence */
-1:	ptesync
-0:	ld	%r0,CPUTHREAD_SAVE_R1(%r13)
-	cmpd	cr0,%r0,%r0
-	bne	0b
-	PPC_INST_RVWINKLE
-	b	.
-
-.global enter_p9_pm_lite_state
-enter_p9_pm_lite_state:
-	mtspr	SPR_PSSCR,%r3
-	PPC_INST_STOP
-	blr
-
-.global enter_p9_pm_state
-enter_p9_pm_state:
-	mflr	%r0
-	std	%r0,16(%r1)
-	stdu	%r1,-STACK_FRAMESIZE(%r1)
-
-	bl	pm_save_regs
-
-	/* Save stack pointer in struct cpu_thread */
-	std	%r1,CPUTHREAD_SAVE_R1(%r13)
-
-	mtspr	SPR_PSSCR,%r3
-	PPC_INST_STOP
-	b	.
-
 /* This is a little piece of code that is copied down to
  * 0x100 for handling power management wakeups
  */
@@ -633,6 +528,7 @@  reset_patch_start:
 .global reset_patch_end
 reset_patch_end:
 
+.global reset_wakeup
 reset_wakeup:
 	/* Get PIR */
 	mfspr	%r31,SPR_PIR
diff --git a/asm/misc.S b/asm/misc.S
index 381590b9..916acf9c 100644
--- a/asm/misc.S
+++ b/asm/misc.S
@@ -123,3 +123,113 @@  cleanup_global_tlb:
 	ptesync
 
 	blr
+
+
+/* Power management instructions */
+#define PPC_INST_NAP		.long 0x4c000364
+#define PPC_INST_SLEEP		.long 0x4c0003a4
+#define PPC_INST_RVWINKLE	.long 0x4c0003e4
+
+#define PPC_INST_STOP		.long 0x4c0002e4
+
+#define SAVE_GPR(reg,sp)	std %r##reg,STACK_GPR##reg(sp)
+#define REST_GPR(reg,sp)	ld %r##reg,STACK_GPR##reg(sp)
+
+pm_save_regs:
+	SAVE_GPR(2,%r1)
+	SAVE_GPR(14,%r1)
+	SAVE_GPR(15,%r1)
+	SAVE_GPR(16,%r1)
+	SAVE_GPR(17,%r1)
+	SAVE_GPR(18,%r1)
+	SAVE_GPR(19,%r1)
+	SAVE_GPR(20,%r1)
+	SAVE_GPR(21,%r1)
+	SAVE_GPR(22,%r1)
+	SAVE_GPR(23,%r1)
+	SAVE_GPR(24,%r1)
+	SAVE_GPR(25,%r1)
+	SAVE_GPR(26,%r1)
+	SAVE_GPR(27,%r1)
+	SAVE_GPR(28,%r1)
+	SAVE_GPR(29,%r1)
+	SAVE_GPR(30,%r1)
+	SAVE_GPR(31,%r1)
+	mfcr	%r4
+	mfxer	%r5
+	mfspr	%r6,SPR_HSPRG0
+	mfspr	%r7,SPR_HSPRG1
+	stw	%r4,STACK_CR(%r1)
+	stw	%r5,STACK_XER(%r1)
+	std	%r6,STACK_GPR0(%r1)
+	std	%r7,STACK_GPR1(%r1)
+	blr
+
+.global enter_p8_pm_state
+enter_p8_pm_state:
+	/* Before entering nap or rvwinkle, we create a stack frame
+	 * and save our non-volatile registers.
+	 *
+	 * We also save these SPRs:
+	 *
+	 *  - HSPRG0	in GPR0 slot
+	 *  - HSPRG1	in GPR1 slot
+	 *
+	 *  - xxx TODO: HIDs
+	 *  - TODO: Mask MSR:ME during the process
+	 *
+	 * On entry, r3 indicates:
+	 *
+	 *    0 = nap
+	 *    1 = rvwinkle
+	 */
+	mflr	%r0
+	std	%r0,16(%r1)
+	stdu	%r1,-STACK_FRAMESIZE(%r1)
+
+	bl	pm_save_regs
+
+	/* Save stack pointer in struct cpu_thread */
+	std	%r1,CPUTHREAD_SAVE_R1(%r13)
+
+	/* Winkle or nap ? */
+	cmpli	%cr0,0,%r3,0
+	bne	1f
+
+	/* nap sequence */
+	ptesync
+0:	ld	%r0,CPUTHREAD_SAVE_R1(%r13)
+	cmpd	cr0,%r0,%r0
+	bne	0b
+	PPC_INST_NAP
+	b	.
+
+	/* rvwinkle sequence */
+1:	ptesync
+0:	ld	%r0,CPUTHREAD_SAVE_R1(%r13)
+	cmpd	cr0,%r0,%r0
+	bne	0b
+	PPC_INST_RVWINKLE
+	b	.
+
+.global enter_p9_pm_lite_state
+enter_p9_pm_lite_state:
+	mtspr	SPR_PSSCR,%r3
+	PPC_INST_STOP
+	blr
+
+.global enter_p9_pm_state
+enter_p9_pm_state:
+	mflr	%r0
+	std	%r0,16(%r1)
+	stdu	%r1,-STACK_FRAMESIZE(%r1)
+
+	bl	pm_save_regs
+
+	/* Save stack pointer in struct cpu_thread */
+	std	%r1,CPUTHREAD_SAVE_R1(%r13)
+
+	mtspr	SPR_PSSCR,%r3
+	PPC_INST_STOP
+	b	.
+
diff --git a/core/Makefile.inc b/core/Makefile.inc
index d3635059..e057f479 100644
--- a/core/Makefile.inc
+++ b/core/Makefile.inc
@@ -1,7 +1,7 @@ 
 # -*-Makefile-*-
 
 SUBDIRS += core
-CORE_OBJS = relocate.o console.o stack.o init.o chip.o mem_region.o
+CORE_OBJS = relocate.o console.o stack.o init.o chip.o mem_region.o vm.o
 CORE_OBJS += malloc.o lock.o cpu.o utils.o fdt.o opal.o interrupts.o timebase.o
 CORE_OBJS += opal-msg.o pci.o pci-iov.o pci-virt.o pci-slot.o pcie-slot.o
 CORE_OBJS += pci-opal.o fast-reboot.o device.o exceptions.o trace.o affinity.o
diff --git a/core/cpu.c b/core/cpu.c
index 88477f82..15829ede 100644
--- a/core/cpu.c
+++ b/core/cpu.c
@@ -469,12 +469,16 @@  static void cpu_idle_p9(enum cpu_wake_cause wake_on)
 		/* PSSCR SD=0 ESL=1 EC=1 PSSL=0 TR=3 MTL=0 RL=1 */
 		psscr = PPC_BIT(42) | PPC_BIT(43) |
 			PPC_BITMASK(54, 55) | PPC_BIT(63);
+		vm_exit();
 		enter_p9_pm_state(psscr);
+		vm_enter();
 	} else {
 		/* stop with EC=0 (resumes) which does not require sreset. */
 		/* PSSCR SD=0 ESL=0 EC=0 PSSL=0 TR=3 MTL=0 RL=1 */
 		psscr = PPC_BITMASK(54, 55) | PPC_BIT(63);
+		// vm_exit();
 		enter_p9_pm_lite_state(psscr);
+		// vm_enter();
 	}
 
 	/* Clear doorbell */
diff --git a/core/init.c b/core/init.c
index ca6c468c..8e99ab42 100644
--- a/core/init.c
+++ b/core/init.c
@@ -347,7 +347,7 @@  bool start_preload_kernel(void)
 static bool load_kernel(void)
 {
 	void *stb_container = NULL;
-	struct elf_hdr *kh;
+	struct elf_hdr *kh, *t;
 	int loaded;
 
 	prlog(PR_NOTICE, "INIT: Waiting for kernel...\n");
@@ -386,7 +386,7 @@  static bool load_kernel(void)
 		if (kernel_entry < EXCEPTION_VECTORS_END) {
 			cpu_set_sreset_enable(false);
 			memcpy(NULL, old_vectors, EXCEPTION_VECTORS_END);
-			sync_icache();
+			sync_icache(0);
 		}
 	} else {
 		if (!kernel_size) {
@@ -407,21 +407,25 @@  static bool load_kernel(void)
 	      "INIT: Kernel loaded, size: %zu bytes (0 = unknown preload)\n",
 	      kernel_size);
 
-	if (kh->ei_ident != ELF_IDENT) {
+//	t = vm_map((unsigned long)kh, sizeof(*kh));
+	vm_map_global((unsigned long)kh, sizeof(*kh));
+	t = kh;
+	if (t->ei_ident != ELF_IDENT) {
 		prerror("INIT: ELF header not found. Assuming raw binary.\n");
 		return true;
 	}
 
-	if (kh->ei_class == ELF_CLASS_64) {
-		if (!try_load_elf64(kh))
+	if (t->ei_class == ELF_CLASS_64) {
+		if (!try_load_elf64(t))
 			return false;
-	} else if (kh->ei_class == ELF_CLASS_32) {
-		if (!try_load_elf32(kh))
+	} else if (t->ei_class == ELF_CLASS_32) {
+		if (!try_load_elf32(t))
 			return false;
 	} else {
 		prerror("INIT: Neither ELF32 not ELF64 ?\n");
 		return false;
 	}
+//	vm_unmap((unsigned long)kh, sizeof(*kh));
 
 	if (chip_quirk(QUIRK_MAMBO_CALLOUTS)) {
 		secureboot_verify(RESOURCE_ID_KERNEL,
@@ -481,6 +485,7 @@  void __noreturn load_and_boot_kernel(bool is_reboot)
 	const struct dt_property *memprop;
 	const char *cmdline, *stdoutp;
 	uint64_t mem_top;
+	uint32_t *t;
 
 	memprop = dt_find_property(dt_root, DT_PRIVATE "maxmem");
 	if (memprop)
@@ -580,13 +585,23 @@  void __noreturn load_and_boot_kernel(bool is_reboot)
 
 	debug_descriptor.state_flags |= OPAL_BOOT_COMPLETE;
 
+	printf("%s:%d\n", __FILE__, __LINE__);
 	fdt_set_boot_cpuid_phys(fdt, this_cpu()->pir);
+	printf("%s:%d\n", __FILE__, __LINE__);
 
+	t = vm_map(kernel_entry, 4);
+	printf("%s:%d\n", __FILE__, __LINE__);
 	/* Check there is something there before we branch to it */
-	if (*(uint32_t *)kernel_entry == 0) {
+	if (*t == 0) {
 		prlog(PR_EMERG, "FATAL: Kernel is zeros, can't execute!\n");
 		assert(0);
 	}
+	printf("%s:%d\n", __FILE__, __LINE__);
+	vm_unmap(kernel_entry, 4);
+	printf("%s:%d\n", __FILE__, __LINE__);
+
+	/* Go back to realmode and tear down our VM before booting kernel */
+	vm_destroy();
 
 	if (kernel_32bit)
 		start_kernel32(kernel_entry, fdt, mem_top);
@@ -747,23 +762,35 @@  static void setup_branch_null_catcher(void)
 
 void setup_reset_vector(void)
 {
+	static char patch[0x100];
 	uint32_t *src, *dst;
+	uint32_t *t;
+	uint32_t len = (void *)&reset_patch_end - (void *)&reset_patch_start;
 
 	/* Copy the reset code over the entry point. */
 	src = &reset_patch_start;
+	t = vm_map((unsigned long)src, len);
+	memcpy(patch, t, len);
+	vm_unmap((unsigned long)src, len);
+
 	dst = (uint32_t *)0x100;
-	while(src < &reset_patch_end)
-		*(dst++) = *(src++);
-	sync_icache();
+	t = vm_map((unsigned long)dst, len);
+	memcpy(t, patch, len);
+	sync_icache((unsigned long)t);
+	vm_unmap((unsigned long)dst, len);
 	cpu_set_sreset_enable(true);
 }
 
 void copy_exception_vectors(void)
 {
+	void *t;
+
+	t = vm_map(0x0, 0x2000);
+
 	/* Backup previous vectors as this could contain a kernel
 	 * image.
 	 */
-	memcpy(old_vectors, NULL, EXCEPTION_VECTORS_END);
+	memcpy(old_vectors, t, EXCEPTION_VECTORS_END);
 
 	/* Copy from 0x100 to EXCEPTION_VECTORS_END, avoid below 0x100 as
 	 * this is the boot flag used by CPUs still potentially entering
@@ -771,9 +798,10 @@  void copy_exception_vectors(void)
 	 */
 	BUILD_ASSERT((&reset_patch_end - &reset_patch_start) <
 			EXCEPTION_VECTORS_END - 0x100);
-	memcpy((void *)0x100, (void *)(SKIBOOT_BASE + 0x100),
+	memcpy(t + 0x100, (void *)(SKIBOOT_BASE + 0x100),
 			EXCEPTION_VECTORS_END - 0x100);
-	sync_icache();
+	sync_icache((unsigned long)t);
+	vm_unmap(0x0, 0x2000);
 }
 
 static void per_thread_sanity_checks(void)
@@ -971,6 +999,8 @@  void __noreturn __nomcount main_cpu_entry(const void *fdt)
 	 */
 	mem_region_init();
 
+	vm_init();
+
 	/* Reserve HOMER and OCC area */
 	homer_init();
 
@@ -1184,6 +1214,8 @@  void __noreturn __secondary_cpu_entry(void)
 {
 	struct cpu_thread *cpu = this_cpu();
 
+	vm_init_secondary();
+
 	/* Secondary CPU called in */
 	cpu_callin(cpu);
 
diff --git a/core/mem_region.c b/core/mem_region.c
index bd387f3c..5c427523 100644
--- a/core/mem_region.c
+++ b/core/mem_region.c
@@ -65,24 +65,27 @@  static struct mem_region skiboot_os_reserve = {
 	.type		= REGION_OS,
 };
 
-struct mem_region skiboot_heap = {
-	.name		= "ibm,firmware-heap",
-	.start		= HEAP_BASE,
-	.len		= HEAP_SIZE,
-	.type		= REGION_SKIBOOT_HEAP,
-};
-
 static struct mem_region skiboot_code_and_text = {
 	.name		= "ibm,firmware-code",
 	.start		= SKIBOOT_BASE,
 	.len		= HEAP_BASE - SKIBOOT_BASE,
+	.vm_mapped_len	= HEAP_BASE - SKIBOOT_BASE,
 	.type		= REGION_SKIBOOT_FIRMWARE,
 };
 
+struct mem_region skiboot_heap = {
+	.name		= "ibm,firmware-heap",
+	.start		= HEAP_BASE,
+	.len		= HEAP_SIZE,
+	.vm_mapped_len	= HEAP_SIZE,
+	.type		= REGION_SKIBOOT_HEAP,
+};
+
 static struct mem_region skiboot_after_heap = {
 	.name		= "ibm,firmware-data",
 	.start		= HEAP_BASE + HEAP_SIZE,
 	.len		= SKIBOOT_BASE + SKIBOOT_SIZE - (HEAP_BASE + HEAP_SIZE),
+	.vm_mapped_len	= SKIBOOT_BASE + SKIBOOT_SIZE - (HEAP_BASE + HEAP_SIZE),
 	.type		= REGION_SKIBOOT_FIRMWARE,
 };
 
@@ -137,14 +140,6 @@  static struct alloc_hdr *next_hdr(const struct mem_region *region,
 #if POISON_MEM_REGION == 1
 static void mem_poison(struct free_hdr *f)
 {
-	size_t poison_size = (void*)tailer(f) - (void*)(f+1);
-
-	/* We only poison up to a limit, as otherwise boot is
-	 * kinda slow */
-	if (poison_size > POISON_MEM_REGION_LIMIT)
-		poison_size = POISON_MEM_REGION_LIMIT;
-
-	memset(f+1, POISON_MEM_REGION_WITH, poison_size);
 }
 #else
 static inline void mem_poison(struct free_hdr *f __unused) { }
@@ -154,21 +149,40 @@  static inline void mem_poison(struct free_hdr *f __unused) { }
 static void init_allocatable_region(struct mem_region *region)
 {
 	struct free_hdr *f = region_start(region);
+	unsigned int num_longs;
+	unsigned long *t;
+
 	assert(region->type == REGION_SKIBOOT_HEAP ||
 	       region->type == REGION_MEMORY);
-	f->hdr.num_longs = region->len / sizeof(long);
+
+	num_longs = region->len / sizeof(long);
+
+	if (!region->vm_mapped_len) {
+		/* SKIBOOT_BASE-SIZE regions already come mapped */
+		region->vm_mapped_len = PAGE_SIZE;
+		vm_map_global((unsigned long)f, PAGE_SIZE);
+	}
+
+	assert(PAGE_SIZE >= sizeof(*f));
+	assert(region->len >= PAGE_SIZE*2);
+
+	f->hdr.num_longs = num_longs;
 	f->hdr.free = true;
 	f->hdr.prev_free = false;
-	*tailer(f) = f->hdr.num_longs;
 	list_head_init(&region->free_list);
 	list_add(&region->free_list, &f->list);
-	mem_poison(f);
+
+	t = vm_map((unsigned long)tailer(f), sizeof(long));
+//	*tailer(f) = num_longs;
+	*t = num_longs;
+	vm_unmap((unsigned long)tailer(f), sizeof(long));
 }
 
 static void make_free(struct mem_region *region, struct free_hdr *f,
 		      const char *location, bool skip_poison)
 {
 	struct alloc_hdr *next;
+	unsigned long *t;
 
 	if (!skip_poison)
 		mem_poison(f);
@@ -192,7 +206,10 @@  static void make_free(struct mem_region *region, struct free_hdr *f,
 	}
 
 	/* Fix up tailer. */
-	*tailer(f) = f->hdr.num_longs;
+	t = vm_map((unsigned long)tailer(f), sizeof(long));
+//	*tailer(f) = f->hdr.num_longs;
+	*t = f->hdr.num_longs;
+	vm_unmap((unsigned long)tailer(f), sizeof(long));
 
 	/* If next is free, coalesce it */
 	next = next_hdr(region, &f->hdr);
@@ -381,6 +398,7 @@  static void *__mem_alloc(struct mem_region *region, size_t size, size_t align,
 	size_t alloc_longs, offset;
 	struct free_hdr *f;
 	struct alloc_hdr *next;
+	unsigned long newsz;
 
 	/* Align must be power of 2. */
 	assert(!((align - 1) & align));
@@ -455,6 +473,14 @@  found:
 		/* This coalesces as required. */
 		make_free(region, pre, location, true);
 	}
+	
+	newsz = ((void *)((unsigned long *)f + alloc_longs) - region_start(region) + sizeof(struct free_hdr));
+	if (newsz > region->vm_mapped_len) {
+		newsz += PAGE_SIZE-1;
+		newsz &= ~(PAGE_SIZE-1);
+		vm_map_global((unsigned long)region_start(region) + region->vm_mapped_len, newsz - region->vm_mapped_len);
+		region->vm_mapped_len = newsz;
+	}
 
 	/* We might be too long; put the rest back. */
 	discard_excess(region, &f->hdr, alloc_longs, location, true);
diff --git a/core/opal.c b/core/opal.c
index 7ffca9c1..14469062 100644
--- a/core/opal.c
+++ b/core/opal.c
@@ -320,9 +320,12 @@  opal_call(OPAL_QUIESCE, opal_quiesce, 2);
 
 void __opal_register(uint64_t token, void *func, unsigned int nargs)
 {
+	uint64_t *t;
 	assert(token <= OPAL_LAST);
 
-	opal_branch_table[token] = function_entry_address(func);
+	t = vm_map((unsigned long)&opal_branch_table[token], sizeof(uint64_t));
+	*t = function_entry_address(func);
+	vm_unmap((unsigned long)&opal_branch_table[token], sizeof(uint64_t));
 	opal_num_args[token] = nargs;
 }
 
diff --git a/core/stack.c b/core/stack.c
index 73700ce5..3a86a376 100644
--- a/core/stack.c
+++ b/core/stack.c
@@ -26,8 +26,6 @@ 
 #define STACK_BUF_ENTRIES	60
 static struct bt_entry bt_buf[STACK_BUF_ENTRIES];
 
-extern uint32_t _stext, _etext;
-
 /* Dumps backtrace to buffer */
 void __nomcount ___backtrace(struct bt_entry *entries, unsigned int *count,
 				unsigned long r1,
diff --git a/core/vm.c b/core/vm.c
new file mode 100644
index 00000000..f97f6f2d
--- /dev/null
+++ b/core/vm.c
@@ -0,0 +1,538 @@ 
+/* Copyright 2018 IBM Corp.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * 	http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <skiboot.h>
+#include <opal.h>
+#include <stack.h>
+#include <cpu.h>
+#include <trace.h>
+#include <ccan/str/str.h>
+#include <ccan/container_of/container_of.h>
+
+static bool vm_setup = false;
+
+#define SLB_SZ		(256UL*1024*1024)
+#define SLB_NR		32
+#define LOCAL_SLB_NR	2
+#define GLOBAL_SLB_NR	(SLB_NR - LOCAL_SLB_NR)
+#define LOCAL_SLB_BASE	GLOBAL_SLB_NR
+
+struct slbe {
+	int valid;
+	unsigned long esid;
+	unsigned long vsid;
+};
+
+/* Entry 31 is reserved for local SLB maps */
+static struct slbe global_slb[GLOBAL_SLB_NR];
+static int global_slb_used = 0;
+
+static void slb_install(unsigned long esid, unsigned long vsid, unsigned int index)
+{
+	unsigned long rs;
+	unsigned long rb;
+
+	rs = vsid << (63-51);		/* 256MB VSID */
+	rs |= 1UL << (63-53);		/* Kp = 1 */
+
+	rb = esid << (63-35);		/* 256MB ESID */
+	rb |= 1UL << (63-36);		/* V = 1 */
+	rb |= index;
+
+	asm volatile("slbmte %0,%1" : : "r"(rs), "r"(rb) : "memory");
+}
+
+#if 0
+static void slb_remove(unsigned long esid)
+{
+	asm volatile("isync ; slbie %0 ; isync" : : "r"(esid << 28) : "memory");
+}
+#endif
+
+static void slb_remove_all(void)
+{
+	asm volatile("isync ; slbmte %0,%0 ; slbia ; isync" : : "r"(0) : "memory");
+}
+
+static bool global_slb_hit(unsigned long esid)
+{
+	int i;
+
+	for (i = 0; i < global_slb_used; i++) {
+		struct slbe *s = &global_slb[i];
+
+		if (!s->valid)
+			continue;
+
+//		printf("global slb hit esid:%lx s->esid:%lx\n", esid, s->esid);
+		if (s->esid == esid)
+			return true;
+	}
+	return false;
+}
+
+static void global_slb_add(unsigned long esid, unsigned long vsid)
+{
+	struct slbe *s = &global_slb[global_slb_used];
+
+	assert(!global_slb_hit(esid));
+	assert(global_slb_used < GLOBAL_SLB_NR);
+	global_slb_used++;
+
+	s->valid = 1;
+	s->esid = esid;
+	s->vsid = vsid;
+}
+
+static void sync_global_slb(void)
+{
+	int i;
+
+//	slb_remove_all();
+	for (i = 0; i < GLOBAL_SLB_NR; i++) {
+		struct slbe *s = &global_slb[i];
+
+		if (!s->valid)
+			continue;
+		slb_install(s->esid, s->vsid, i);
+	}
+}
+
+static void cpu_sync_global_slb(void *arg __unused)
+{
+	// printf("CPU PIR 0x%04x cpu_sync_global_slb\n", this_cpu()->pir);
+	sync_global_slb();
+}
+
+static void cpu_all_sync_global_slb(void)
+{
+	struct cpu_thread *cpu;
+
+	cpu_sync_global_slb(NULL);
+	return;
+
+	/* XXX: deadlocks allocating memory */
+	for_each_available_cpu(cpu) {
+		if (cpu == this_cpu()) {
+			cpu_sync_global_slb(NULL);
+			continue;
+		}
+		cpu_wait_job(cpu_queue_job(cpu, "cpu_sync_global_slb",
+			cpu_sync_global_slb, NULL), true);
+	}
+}
+
+struct hpte {
+	unsigned long dword[2];
+};
+
+struct hpteg {
+	struct hpte hpte[8];
+};
+
+static struct hpteg *htab;
+static unsigned long htab_nr_bytes;
+static unsigned long htab_nr_ptegs;
+static unsigned long htab_pteg_mask;
+
+static void htab_install(unsigned long va, unsigned long pa, int rw, int ex, int global)
+{
+	unsigned long hash;
+	struct hpteg *hpteg;
+	unsigned int i;
+
+	hash = ((va >> 12) & 0xffff) ^ ((va >> 28) & 0x7fffffffffUL);
+	hpteg = &htab[hash & htab_pteg_mask];
+
+	for (i = 0; i < 8; i++) {
+		struct hpte *hpte = &hpteg->hpte[i];
+		unsigned long ava = va >> 23;
+		unsigned long arpn = pa >> 12;
+		unsigned long dw0, dw1;
+
+		dw0 = be64_to_cpu(hpte->dword[0]);
+		if (dw0 & 1) {
+			if (dw0 >> 7 == ava) {
+				printf("HTAB collision va:%lx pa:%lx rw:%d ex:%d global:%d\n", va, pa, rw, ex, global);
+				assert(0);
+				return;
+			}
+			continue;
+		}
+
+		assert(!hpte->dword[0]);
+		assert(!hpte->dword[1]);
+
+		dw0 = (ava << (63-56)) | 0x1;
+		if (!global)
+			dw0 |= 0x8;
+
+		dw1 = (arpn << (63-43 - 8));
+		if (!rw)
+			dw1 |= (1UL << (63 - 0)) | (1UL << (63 - 63 + 1));
+		if (!ex)
+			dw1 |= (1UL << (63 - 61));
+		dw1 |= (1UL << (63 - 60 + 1)); /* WIMG = 0010 */
+
+		hpte->dword[1] = cpu_to_be64(dw1);
+		eieio();
+		hpte->dword[0] = cpu_to_be64(dw0);
+
+		return;
+	}
+	assert(0);
+}
+
+static void htab_remove(unsigned long va, int global)
+{
+	unsigned long hash;
+	struct hpteg *hpteg;
+	unsigned int i;
+
+	hash = ((va >> 12) & 0xffff) ^ ((va >> 28) & 0x7fffffffffUL);
+	hpteg = &htab[hash & htab_pteg_mask];
+
+	for (i = 0; i < 8; i++) {
+		struct hpte *hpte = &hpteg->hpte[i];
+		unsigned long ava = va >> 23;
+		unsigned long dw0;
+
+		dw0 = be64_to_cpu(hpte->dword[0]);
+
+		if (!(dw0 & 1)) {
+			assert(!hpte->dword[0]);
+			assert(!hpte->dword[1]);
+			continue;
+		}
+
+		if (dw0 >> 7 != ava)
+			continue;
+
+		if (global)
+			assert(!(dw0 & 0x8));
+		else
+			assert(dw0 & 0x8);
+
+		hpte->dword[0] = 0;
+		eieio();
+		hpte->dword[1] = 0;
+		eieio();
+
+		if (global) {
+			asm volatile("tlbie %0,%1" : : "r"(ava<<12), "r"(0));
+			asm volatile("eieio ; tlbsync ; ptesync" ::: "memory");
+		} else {
+			asm volatile("tlbiel %0" : : "r"(ava<<12));
+			asm volatile("ptesync" ::: "memory");
+		}
+		return;
+	}
+	assert(0);
+}
+
+void vm_map_global(unsigned long addr, unsigned long len)
+{
+	unsigned long va;
+	unsigned long esid = addr >> 28;
+	unsigned long end = addr + len;
+	bool need_sync;
+
+	assert(vm_setup);
+	assert((addr >= SKIBOOT_BASE + SKIBOOT_SIZE) || (addr + len) <= SKIBOOT_BASE);
+
+	end = (end + (PAGE_SIZE - 1)) & ~(PAGE_SIZE - 1);
+	addr &= ~(PAGE_SIZE - 1);
+	len = end - addr;
+
+	// printf("vm_map_global: %lx-%lx\n", addr, addr + len);
+
+	if (!global_slb_hit(esid)) {
+		global_slb_add(esid, esid);
+		need_sync = true;
+	}
+
+	for (va = addr; va < end; va += PAGE_SIZE) {
+		if (va >> 28 != esid) {
+			esid = va >> 28;
+			if (!global_slb_hit(esid)) {
+				global_slb_add(esid, esid);
+				need_sync = true;
+			}
+		}
+
+		htab_install(va, va, 1, 0, 1);
+	}
+
+	if (need_sync)
+		cpu_all_sync_global_slb();
+}
+
+void *vm_map(unsigned long addr, unsigned long len)
+{
+	struct cpu_thread *c = this_cpu();
+	unsigned long va;
+	unsigned long esid = (0x0800000000000000ULL + (c->pir << 28)) >> 28;
+	unsigned long vsid = (unsigned long)c->pir << 30; /* per-cpu VA */
+	unsigned long end = addr + len;
+	unsigned long offset = addr & (PAGE_SIZE - 1);
+
+	/* Can't do nested mappings */
+	assert(!c->vm_local_map);
+	c->vm_local_map = true;
+
+	if (!c->vm_setup)
+		return (void *)addr;
+
+	end = (end + (PAGE_SIZE - 1)) & ~(PAGE_SIZE - 1);
+	addr &= ~(PAGE_SIZE - 1);
+	len = end - addr;
+
+	// printf("vm_map: %lx-%lx esid:%lx\n", addr, addr + len, esid);
+
+	assert(len < (1 << 28)); /* same segment */
+
+	va = vsid << 28;
+	while (addr < end) {
+		htab_install(va, addr, 1, 0, 0);
+		va += PAGE_SIZE;
+		addr += PAGE_SIZE;
+	}
+
+	printf("vm_map: %lx-%lx esid:%lx vsid:%lx addr=%lx\n", addr, addr + len, esid, vsid, (esid<<28)+offset);
+
+	return (void *)(esid << 28) + offset;
+}
+
+void vm_unmap(unsigned long addr, unsigned long len)
+{
+	struct cpu_thread *c = this_cpu();
+	unsigned long va;
+	unsigned long vsid = (unsigned long)c->pir << 30; /* per-cpu VA */
+	unsigned long end = addr + len;
+
+	assert(c->vm_local_map);
+	c->vm_local_map = false;
+
+	if (!c->vm_setup)
+		return;
+
+	end = (end + (PAGE_SIZE - 1)) & ~(PAGE_SIZE - 1);
+	addr &= ~(PAGE_SIZE - 1);
+	len = end - addr;
+
+	// printf("vm_unmap: %lx-%lx esid:%lx\n", addr, addr + len, esid);
+
+	assert(len < (1 << 28)); /* same segment */
+
+	va = vsid << 28;
+	while (addr < end) {
+		htab_remove(va, 0);
+		va += PAGE_SIZE;
+		addr += PAGE_SIZE;
+	}
+}
+
+struct prte {
+	unsigned long dword[2];
+};
+
+static struct prte *prtab;
+
+static unsigned long stack_end = SKIBOOT_BASE + SKIBOOT_SIZE;
+
+void vm_map_stacks(void)
+{
+	unsigned long start = stack_end;
+	unsigned long end = start + (cpu_max_pir + 1)*STACK_SIZE;
+	unsigned long va;
+
+	if (start == end)
+		return;
+
+	for (va = start; va < end; va += PAGE_SIZE)
+		htab_install(va, va, 1, 0, 1);
+
+	stack_end = end;
+}
+
+static void vm_init_cpu(void)
+{
+	struct cpu_thread *c = this_cpu();
+	unsigned long esid = (0x0800000000000000ULL + (c->pir << 28)) >> 28;
+	unsigned long vsid = (unsigned long)c->pir << 30; /* per-cpu VA */
+
+	mtspr(SPR_LPCR, mfspr(SPR_LPCR) &
+		~(PPC_BITMASK(0,3) | PPC_BIT(41) | PPC_BIT(43) | PPC_BIT(54)));
+	mtspr(SPR_LPID, 0);
+	mtspr(SPR_PID, 0);
+	mtspr(SPR_HRMOR, 0);
+	mtspr(SPR_PTCR, (unsigned long)prtab);
+
+	sync_global_slb();
+
+	slb_install(esid, vsid, LOCAL_SLB_BASE);
+}
+
+static void vm_cleanup_cpu(void)
+{
+	slb_remove_all();
+	// XXX: have the last thread per core clear this reg
+	// mtspr(SPR_PTCR, 0);
+}
+
+void vm_init_secondary(void)
+{
+	vm_init_cpu();
+	vm_enter();
+}
+
+void vm_enter(void)
+{
+	struct cpu_thread *c = this_cpu();
+
+	assert(vm_setup);
+	assert(!c->vm_setup);
+	c->vm_setup = true;
+	mtmsr(mfmsr() | (MSR_IR|MSR_DR));
+}
+
+void vm_exit(void)
+{
+	struct cpu_thread *c = this_cpu();
+
+	assert(vm_setup);
+	assert(c->vm_setup);
+	c->vm_setup = false;
+	mtmsr(mfmsr() & ~(MSR_IR|MSR_DR));
+}
+
+static void cpu_stop_vm(void *arg __unused)
+{
+	printf("CPU PIR 0x%04x cpu_stop_vm\n", this_cpu()->pir);
+	vm_exit();
+	vm_cleanup_cpu();
+}
+
+static void cpu_all_stop_vm(void)
+{
+	struct cpu_thread *cpu;
+	struct cpu_job **jobs;
+
+	jobs = zalloc(sizeof(struct cpu_job *) * cpu_max_pir + 1);
+	assert(jobs);
+
+	for_each_available_cpu(cpu) {
+		if (cpu == this_cpu())
+			continue;
+		jobs[cpu->pir] = cpu_queue_job(cpu, "cpu_stop_vm",
+						cpu_stop_vm, NULL);
+	}
+
+	/* this cpu */
+	cpu_stop_vm(NULL);
+
+	for_each_available_cpu(cpu) {
+		if (jobs[cpu->pir])
+			cpu_wait_job(jobs[cpu->pir], true);
+	}
+
+	free(jobs);
+}
+
+void vm_init(void)
+{
+	unsigned long va;
+
+//	prtab = local_alloc(0, 64*1024, 64*1024);
+	prtab = memalign(64*1024, 64*1024);
+	assert(prtab);
+	memset(prtab, 0, 64*1024);
+
+	global_slb_add(SKIBOOT_BASE >> 28, SKIBOOT_BASE >> 28);
+
+	htab_nr_bytes = 1UL<<18;
+	htab_nr_ptegs = htab_nr_bytes / sizeof(struct hpteg);
+	htab_pteg_mask = htab_nr_ptegs - 1;
+//	htab = local_alloc(0, htab_nr_bytes, 1UL<<18);
+	htab = memalign(1UL<<18, htab_nr_bytes);
+	assert(htab);
+	memset(htab, 0, htab_nr_bytes);
+
+	prtab[0].dword[0] = cpu_to_be64((unsigned long)htab);
+	prtab[0].dword[1] = 0;
+
+	eieio();
+
+	vm_init_cpu();
+
+//	for (va = (unsigned long)_stext; va < HEAP_BASE; va += PAGE_SIZE) {
+	for (va = (unsigned long)_stext; va < (unsigned long)_end; va += PAGE_SIZE) {
+		if (va >= (unsigned long)_stext && va <= (unsigned long)_etext)
+			htab_install(va, va, 0, 1, 1); /* text */
+		else if (va >= (unsigned long)__rodata_start &&
+				va <= (unsigned long)__rodata_end)
+			htab_install(va, va, 0, 0, 1);
+		else if (va >= (unsigned long)_sdata &&
+				va <= (unsigned long)_edata)
+			htab_install(va, va, 1, 0, 1);
+		else if (va >= (unsigned long)__sym_map_start &&
+				va <= (unsigned long)__sym_map_end)
+			htab_install(va, va, 0, 0, 1);
+		else if (va >= (unsigned long)_sbss &&
+				va <= (unsigned long)_ebss)
+			htab_install(va, va, 1, 0, 1);
+	}
+	for (; va < SKIBOOT_BASE + SKIBOOT_SIZE; va += PAGE_SIZE)
+		htab_install(va, va, 1, 0, 1);
+
+	vm_map_stacks();
+
+	printf("VMM: SETUP\n");
+	printf(" PRTAB:%p\n", prtab);
+	printf(" HTAB: %p\n", htab);
+	printf(" Global mappings\n");
+	printf("  text   %lx-%lx\n", (unsigned long)_stext, (unsigned long)_etext);
+	printf("  rodata %lx-%lx\n", (unsigned long)__rodata_start, (unsigned long)__rodata_end);
+	printf("  data   %lx-%lx\n", (unsigned long)_sdata, (unsigned long)_edata);
+	printf("  sym    %lx-%lx\n", (unsigned long)__sym_map_start, (unsigned long)__sym_map_end);
+	printf("  bss    %lx-%lx\n", (unsigned long)_sbss, (unsigned long)_ebss);
+	printf("  heap   %lx-%lx\n", (unsigned long)HEAP_BASE, (unsigned long)SKIBOOT_BASE + SKIBOOT_SIZE);
+	printf("  stacks %lx-%lx\n", (unsigned long)SKIBOOT_BASE + SKIBOOT_SIZE, stack_end);
+
+	eieio();
+
+	vm_setup = true;
+
+	vm_enter();
+}
+
+void vm_destroy(void)
+{
+	assert(vm_setup);
+
+	printf("VMM: TEARDOWN\n");
+
+	cpu_all_stop_vm();
+
+	vm_setup = false;
+
+	/* XXX: don't need to remove */
+//	for (va = SKIBOOT_BASE; va < SKIBOOT_BASE + SKIBOOT_SIZE + (cpu_max_pir + 1) * STACK_SIZE; va += PAGE_SIZE)
+//		htab_remove(va, 1);
+
+	free(htab);
+	free(prtab);
+}
diff --git a/hw/fake-nvram.c b/hw/fake-nvram.c
index 236ad5b9..50b76eb5 100644
--- a/hw/fake-nvram.c
+++ b/hw/fake-nvram.c
@@ -36,12 +36,16 @@  int fake_nvram_info(uint32_t *total_size)
 
 int fake_nvram_start_read(void *dst, uint32_t src, uint32_t len)
 {
+	void *t;
+
 	if (!nvram_region)
 		return -ENODEV;
 
+	t = vm_map(nvram_region->start + src, len);
 	lock(&fake_nvram_lock);
-	memcpy(dst, (void *) (nvram_region->start + src), len);
+	memcpy(dst, t, len);
 	unlock(&fake_nvram_lock);
+	vm_unmap(nvram_region->start + src, len);
 
 	nvram_read_complete(true);
 
@@ -50,12 +54,16 @@  int fake_nvram_start_read(void *dst, uint32_t src, uint32_t len)
 
 int fake_nvram_write(uint32_t offset, void *src, uint32_t size)
 {
+	void *t;
+
 	if (!nvram_region)
 		return OPAL_HARDWARE;
 
+	t = vm_map(nvram_region->start + offset, size);
 	lock(&fake_nvram_lock);
-	memcpy((void *) (nvram_region->start + offset), src, size);
+	memcpy(t, src, size);
 	unlock(&fake_nvram_lock);
+	vm_unmap(nvram_region->start + offset, size);
 
 	return 0;
 }
diff --git a/hw/slw.c b/hw/slw.c
index dfa9189b..32d6628d 100644
--- a/hw/slw.c
+++ b/hw/slw.c
@@ -164,7 +164,7 @@  static void slw_patch_reset(void)
 		*(sav++) = *(dst);
 		*(dst++) = *(src++);
 	}
-	sync_icache();
+	sync_icache(0);
 }
 
 static void slw_unpatch_reset(void)
@@ -180,7 +180,7 @@  static void slw_unpatch_reset(void)
 		*(dst++) = *(sav++);
 		src++;
 	}
-	sync_icache();
+	sync_icache(0);
 }
 
 static bool slw_general_init(struct proc_chip *chip, struct cpu_thread *c)
diff --git a/include/cpu.h b/include/cpu.h
index 2fe47982..64e35a51 100644
--- a/include/cpu.h
+++ b/include/cpu.h
@@ -82,10 +82,18 @@  struct cpu_thread {
 	struct bt_entry			stack_bot_bt[CPU_BACKTRACE_SIZE];
 	unsigned int			stack_bot_bt_count;
 #endif
+	/*
+	 * Per-thread VM parameters
+	 */
+	bool				vm_setup; /* virtual memory is up */
+	bool				vm_local_map; /* local mapping */
+	bool				vm_local_slb; /* local SLB used */
+
 	struct lock			job_lock;
 	struct list_head		job_queue;
 	uint32_t			job_count;
 	bool				job_has_no_return;
+
 	/*
 	 * Per-core mask tracking for threads in HMI handler and
 	 * a cleanup done bit.
diff --git a/include/mem_region.h b/include/mem_region.h
index 018dfa0e..415cbf4d 100644
--- a/include/mem_region.h
+++ b/include/mem_region.h
@@ -46,6 +46,7 @@  struct mem_region {
 	struct list_node list;
 	const char *name;
 	uint64_t start, len;
+	uint64_t vm_mapped_len;
 	struct dt_node *node;
 	enum mem_region_type type;
 	struct list_head free_list;
diff --git a/include/processor.h b/include/processor.h
index 6b262b45..6f815bb4 100644
--- a/include/processor.h
+++ b/include/processor.h
@@ -53,6 +53,7 @@ 
 #define SPR_SRR1	0x01b	/* RW: Exception save/restore reg 1 */
 #define SPR_CFAR	0x01c	/* RW: Come From Address Register */
 #define SPR_AMR		0x01d	/* RW: Authority Mask Register */
+#define SPR_PID		0x030	/* RW: PID register */
 #define SPR_IAMR	0x03d	/* RW: Instruction Authority Mask Register */
 #define SPR_RPR		0x0ba   /* RW: Relative Priority Register */
 #define SPR_TBRL	0x10c	/* RO: Timebase low */
@@ -75,10 +76,12 @@ 
 #define SPR_HSRR1	0x13b	/* RW: HV Exception save/restore reg 1 */
 #define SPR_TFMR	0x13d
 #define SPR_LPCR	0x13e
+#define SPR_LPID	0x13f	/* RW: LPID register */
 #define SPR_HMER	0x150	/* Hypervisor Maintenance Exception */
 #define SPR_HMEER	0x151	/* HMER interrupt enable mask */
 #define SPR_PCR		0x152
 #define SPR_AMOR	0x15d
+#define SPR_PTCR	0x1d0	/* RW: Partition table control register */
 #define SPR_PSSCR	0x357   /* RW: Stop status and control (ISA 3) */
 #define SPR_TSCR	0x399
 #define SPR_HID0	0x3f0
@@ -324,9 +327,9 @@  static inline void isync(void)
 /*
  * Cache sync
  */
-static inline void sync_icache(void)
+static inline void sync_icache(unsigned long ptr)
 {
-	asm volatile("sync; icbi 0,%0; sync; isync" : : "r" (0) : "memory");
+	asm volatile("sync; icbi 0,%0; sync; isync" : : "r" (ptr) : "memory");
 }
 
 /*
diff --git a/include/skiboot.h b/include/skiboot.h
index bba76c12..246a7344 100644
--- a/include/skiboot.h
+++ b/include/skiboot.h
@@ -49,8 +49,13 @@  struct mem_region;
 extern struct mem_region *mem_region_next(struct mem_region *region);
 
 #ifndef __TESTING__
+extern char _stext[], _etext[];
 /* Readonly section start and end. */
 extern char __rodata_start[], __rodata_end[];
+extern char _sdata[], _edata[];
+extern char __sym_map_start[], __sym_map_end[];
+extern char _sbss[], _ebss[];
+extern char _end[];
 
 static inline bool is_rodata(const void *p)
 {
@@ -291,4 +296,18 @@  extern int fake_nvram_info(uint32_t *total_size);
 extern int fake_nvram_start_read(void *dst, uint32_t src, uint32_t len);
 extern int fake_nvram_write(uint32_t offset, void *src, uint32_t size);
 
+/* core/vm.c */
+#define PAGE_SIZE 4096
+
+void vm_map_global(unsigned long addr, unsigned long len);
+void *vm_map(unsigned long addr, unsigned long len);
+void vm_unmap(unsigned long addr, unsigned long len);
+void vm_init(void);
+void vm_destroy(void);
+void vm_init_secondary(void);
+void vm_enter(void);
+void vm_exit(void);
+void vm_exit_cleanup(void);
+void vm_map_stacks(void);
+
 #endif /* __SKIBOOT_H */
diff --git a/libstb/container.c b/libstb/container.c
index a720fbbf..68111796 100644
--- a/libstb/container.c
+++ b/libstb/container.c
@@ -19,14 +19,20 @@ 
 
 bool stb_is_container(const void *buf, size_t size)
 {
+	uint32_t *t;
 	ROM_container_raw *c;
+	bool ret = true;
 
 	c = (ROM_container_raw*) buf;
 	if (!buf || size < SECURE_BOOT_HEADERS_SIZE)
 		return false;
-	if (be32_to_cpu(c->magic_number) != ROM_MAGIC_NUMBER )
-		return false;
-	return true;
+
+	t = vm_map((unsigned long)&c->magic_number, sizeof(*t));
+	if (be32_to_cpu(*t) != ROM_MAGIC_NUMBER)
+		ret = false;
+	vm_unmap((unsigned long)&c->magic_number, sizeof(*t));
+
+	return ret;
 }
 
 uint32_t stb_payload_magic(const void *buf, size_t size)
diff --git a/skiboot.lds.S b/skiboot.lds.S
index a6e71077..a21e9af9 100644
--- a/skiboot.lds.S
+++ b/skiboot.lds.S
@@ -51,35 +51,41 @@  SECTIONS
 		KEEP(*(.cpuctrl.data))
 	}
 
+	/* Do I need to keep these ? */
+	.dynsym : { *(.dynsym)	}
+	.dynstr : { *(.dynstr)	}
+
+	/* Relocations */
 	. = ALIGN(0x10);
+	.dynamic : {
+		__dynamic_start = .;
+		*(.dynamic)
+		__dynamic_end = .;
+	}
+
+	. = ALIGN(0x10);
+	.rela.dyn : {
+		__rela_dyn_start = .;
+		*(.rela*)
+		__rela_dyn_end = .;
+	}
+
+	.hash   : { *(.hash)   }
+	.dynsym : { *(.dynsym) }
+	.dynstr : { *(.dynstr) }
+
+	. = ALIGN(0x1000);
 	_stext = .;
  	.text : {
 		*(.text*)
 		*(.sfpr)
 	}
 	_etext = .;
+	. = ALIGN(0x1000);
 
+	__rodata_start = .;
 	.rodata : {
-		__rodata_start = .;
 		*(.rodata .rodata.*)
-		__rodata_end = .;
-	}
-
-	.data : {
-		/*
-		 * A couple of things that need to be 4K aligned and
-		 * to reside in their own pages for the sake of TCE
-		 * mappings
-		 */
-		. = ALIGN(0x1000);
-		*(.data.memcons);
-		. = ALIGN(0x1000);
-		*(.data.boot_trace);
-		. = ALIGN(0x1000);
-		*(.data*)
-		*(.force.data)
-		*(.toc1)
-		*(.branch_lt)
 	}
 
 	. = ALIGN(0x10);
@@ -103,43 +109,43 @@  SECTIONS
 		*(.toc)
 	}
 
-	. = ALIGN(0x10);
-	.opal_table : {
-		__opal_table_start = .;
-		KEEP(*(.opal_table))
-		__opal_table_end = .;
-	}
-
 	.platforms : {
 		__platforms_start = .;
 		KEEP(*(.platforms))
 		__platforms_end = .;
 	}
 
-	/* Do I need to keep these ? */
-	.dynsym : { *(.dynsym)	}
-	.dynstr : { *(.dynstr)	}
-
-	/* Relocations */
 	. = ALIGN(0x10);
-	.dynamic : {
-		__dynamic_start = .;
-		*(.dynamic)
-		__dynamic_end = .;
+	.opal_table : {
+		__opal_table_start = .;
+		KEEP(*(.opal_table))
+		__opal_table_end = .;
 	}
+	__rodata_end = .;
 
-	. = ALIGN(0x10);
-	.rela.dyn : {
-		__rela_dyn_start = .;
-		*(.rela*)
-		__rela_dyn_end = .;
+	. = ALIGN(0x1000);
+
+	_sdata = .;
+	.data : {
+		/*
+		 * A couple of things that need to be 4K aligned and
+		 * to reside in their own pages for the sake of TCE
+		 * mappings
+		 */
+		. = ALIGN(0x1000);
+		*(.data.memcons);
+		. = ALIGN(0x1000);
+		*(.data.boot_trace);
+		. = ALIGN(0x1000);
+		*(.data*)
+		*(.force.data)
+		*(.toc1)
+		*(.branch_lt)
 	}
+	_edata = .;
 
-	.hash   : { *(.hash)   }
-	.dynsym : { *(.dynsym) }
-	.dynstr : { *(.dynstr) }
+	. = ALIGN(0x1000);
 
-	. = ALIGN(0x10);
 	.sym_map : {
 		__sym_map_start = . ;
 		KEEP(*(.sym_map))