[v2,4/4] target-i386: Add NPT support

Message ID fb0c41863e19c760eca91a71a86eb391147d083d.1522769774.git.jan.kiszka@web.de
State New
Series target-i386: Enhance SVM support

Commit Message

Jan Kiszka April 3, 2018, 3:36 p.m. UTC
From: Jan Kiszka <jan.kiszka@siemens.com>

This implements NPT support for SVM by hooking into
x86_cpu_handle_mmu_fault, where the stage-1 page table is read. Whether
this second-stage translation is needed, and how it is performed, is
decided during vmrun and stored in hflags as well as in nested_cr3 and
nested_pg_mode.

As get_hphys performs a direct cpu_vmexit in case of NPT faults, we
need retaddr in that function. To avoid changing the signature of
x86_cpu_handle_mmu_fault, the value is passed from tlb_fill to
get_hphys via the CPU state.
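
Concretely, the patched tlb_fill (see the mem_helper.c hunk below)
boils down to:

    void tlb_fill(CPUState *cs, target_ulong addr, int size,
                  MMUAccessType access_type, int mmu_idx, uintptr_t retaddr)
    {
        X86CPU *cpu = X86_CPU(cs);
        CPUX86State *env = &cpu->env;
        int ret;

        /* Stash the host return address; get_hphys() reads it back as
         * env->retaddr when it raises SVM_EXIT_NPF via cpu_vmexit. */
        env->retaddr = retaddr;
        ret = x86_cpu_handle_mmu_fault(cs, addr, size, access_type, mmu_idx);
        if (ret) {
            raise_exception_err_ra(env, cs->exception_index, env->error_code,
                                   retaddr);
        }
    }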

This was tested successfully via the Jailhouse hypervisor.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
---
 target/i386/cpu.c         |   2 +-
 target/i386/cpu.h         |   6 ++
 target/i386/excp_helper.c | 216 +++++++++++++++++++++++++++++++++++++++++++++-
 target/i386/mem_helper.c  |   6 +-
 target/i386/svm.h         |  14 +++
 target/i386/svm_helper.c  |  22 +++++
 6 files changed, 260 insertions(+), 6 deletions(-)

Comments

Paolo Bonzini June 27, 2018, 12:14 p.m. UTC | #1
On 03/04/2018 17:36, Jan Kiszka wrote:
>  
> +static hwaddr get_hphys(CPUState *cs, hwaddr gphys, MMUAccessType access_type,
> +                        int *prot)
> +{
> +    CPUX86State *env = &X86_CPU(cs)->env;
> +    uint64_t rsvd_mask = PG_HI_RSVD_MASK;
> +    uint64_t ptep, pte;
> +    uint64_t exit_info_1 = 0;
> +    target_ulong pde_addr, pte_addr;
> +    uint32_t page_offset;
> +    int page_size;
> +
> +    if (likely(!(env->hflags & HF_NPT_MASK))) {
> +        return gphys;
> +    }

hflags are a somewhat limited resource.  Can this go in hflags2?

> 
> +
> +        env->nested_pg_mode = 0;
> +        if (env->cr[4] & CR4_PAE_MASK) {
> +            env->nested_pg_mode |= SVM_NPT_PAE;
> +        }
> +        if (env->hflags & HF_LMA_MASK) {
> +            env->nested_pg_mode |= SVM_NPT_LMA;
> +        }
> +        if (env->efer & MSR_EFER_NXE) {
> +            env->nested_pg_mode |= SVM_NPT_NXE;
> +        }
> +    }
> +

This needs to be migrated.  You can put it in a subsection, conditional
on hflags & HF_SVMI_MASK.
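
Something along these lines, roughly (untested sketch; the subsection
and function names are placeholders, the fields are the ones this
patch adds):

    static bool svm_npt_needed(void *opaque)
    {
        X86CPU *cpu = opaque;

        /* Only meaningful between vmrun and vmexit. */
        return !!(cpu->env.hflags & HF_SVMI_MASK);
    }

    static const VMStateDescription vmstate_svm_npt = {
        .name = "cpu/svm-npt",
        .version_id = 1,
        .minimum_version_id = 1,
        .needed = svm_npt_needed,
        .fields = (VMStateField[]) {
            VMSTATE_UINT64(env.nested_cr3, X86CPU),
            VMSTATE_UINT32(env.nested_pg_mode, X86CPU),
            VMSTATE_END_OF_LIST()
        }
    };

plus an entry in the .subsections list of vmstate_x86_cpu in
target/i386/machine.c.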

Also, do you need to flush the TLB unconditionally, even if CR0.PG is zero?

Otherwise looks good.  I have queued patches 1-3, but hopefully this one
can go in the next release too.  Sorry for the delayed review.

Paolo
Jan Kiszka June 30, 2018, 5:25 a.m. UTC | #2
On 2018-06-27 14:14, Paolo Bonzini wrote:
> On 03/04/2018 17:36, Jan Kiszka wrote:
>>  
>> +static hwaddr get_hphys(CPUState *cs, hwaddr gphys, MMUAccessType access_type,
>> +                        int *prot)
>> +{
>> +    CPUX86State *env = &X86_CPU(cs)->env;
>> +    uint64_t rsvd_mask = PG_HI_RSVD_MASK;
>> +    uint64_t ptep, pte;
>> +    uint64_t exit_info_1 = 0;
>> +    target_ulong pde_addr, pte_addr;
>> +    uint32_t page_offset;
>> +    int page_size;
>> +
>> +    if (likely(!(env->hflags & HF_NPT_MASK))) {
>> +        return gphys;
>> +    }
> 
> hflags are a somewhat limited resource.  Can this go in hflags2?
> 

Will have a look - I don't see why not. Or is there any special
semantic difference between the two fields?

>>
>> +
>> +        env->nested_pg_mode = 0;
>> +        if (env->cr[4] & CR4_PAE_MASK) {
>> +            env->nested_pg_mode |= SVM_NPT_PAE;
>> +        }
>> +        if (env->hflags & HF_LMA_MASK) {
>> +            env->nested_pg_mode |= SVM_NPT_LMA;
>> +        }
>> +        if (env->efer & MSR_EFER_NXE) {
>> +            env->nested_pg_mode |= SVM_NPT_NXE;
>> +        }
>> +    }
>> +
> 
> This needs to be migrated.  You can put it in a subsection, conditional
> on hflags & HF_SVMI_MASK.

OK.

> 
> Also, do you need to flush the TLB unconditionally, even if CR0.PG is zero?
I cannot follow you here yet. What flush are you referring to?

Also, CR0.PG does not reflect whether NPT is on, which now also
contributes to our TLB.

> 
> Otherwise looks good.  I have queued patches 1-3, but hopefully this one
> can go in the next release too.  Sorry for the delayed review.

No problem.

Thanks,
Jan
Paolo Bonzini June 30, 2018, 6:05 a.m. UTC | #3
On 30/06/2018 07:25, Jan Kiszka wrote:
> On 2018-06-27 14:14, Paolo Bonzini wrote:
>> On 03/04/2018 17:36, Jan Kiszka wrote:
>>>  
>>> +static hwaddr get_hphys(CPUState *cs, hwaddr gphys, MMUAccessType access_type,
>>> +                        int *prot)
>>> +{
>>> +    CPUX86State *env = &X86_CPU(cs)->env;
>>> +    uint64_t rsvd_mask = PG_HI_RSVD_MASK;
>>> +    uint64_t ptep, pte;
>>> +    uint64_t exit_info_1 = 0;
>>> +    target_ulong pde_addr, pte_addr;
>>> +    uint32_t page_offset;
>>> +    int page_size;
>>> +
>>> +    if (likely(!(env->hflags & HF_NPT_MASK))) {
>>> +        return gphys;
>>> +    }
>>
>> hflags are a somewhat limited resource.  Can this go in hflags2?
>>
> 
> Will have a look - I don't see why not. Or is there any special
> semantic difference between the two fields?

Yes, hflags become flags of the translation block, while hflags2 is
just random processor state. If translate.c uses a flag, it must be in
hflags, but here hflags2 should be safe.
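
That is, something like (sketch only - use whichever HF2_* bit is
actually free in cpu.h):

    /* hflags2 */
    #define HF2_NPT_SHIFT    5 /* Nested Paging enabled */
    #define HF2_NPT_MASK     (1 << HF2_NPT_SHIFT)

with get_hphys, helper_vmrun and do_vmexit then testing and updating
env->hflags2 instead of env->hflags.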

Thanks,

Paolo

>>>
>>> +
>>> +        env->nested_pg_mode = 0;
>>> +        if (env->cr[4] & CR4_PAE_MASK) {
>>> +            env->nested_pg_mode |= SVM_NPT_PAE;
>>> +        }
>>> +        if (env->hflags & HF_LMA_MASK) {
>>> +            env->nested_pg_mode |= SVM_NPT_LMA;
>>> +        }
>>> +        if (env->efer & MSR_EFER_NXE) {
>>> +            env->nested_pg_mode |= SVM_NPT_NXE;
>>> +        }
>>> +    }
>>> +
>>
>> This needs to be migrated.  You can put it in a subsection, conditional
>> on hflags & HF_SVMI_MASK.
> 
> OK.
> 
>>
>> Also, do you need to flush the TLB unconditionally, even if CR0.PG is zero?
> I cannot follow you here yet. What flush are you referring to?
> 
> Also, CR0.PG does not reflect whether NPT is on, which now also
> contributes to our TLB.
> 
>>
>> Otherwise looks good.  I have queued patches 1-3, but hopefully this one
>> can go in the next release too.  Sorry for the delayed review.
> 
> No problem.
> 
> Thanks,
> Jan
>
Jan Kiszka June 30, 2018, 6:07 a.m. UTC | #4
On 2018-06-30 08:05, Paolo Bonzini wrote:
> On 30/06/2018 07:25, Jan Kiszka wrote:
>> On 2018-06-27 14:14, Paolo Bonzini wrote:
>>> On 03/04/2018 17:36, Jan Kiszka wrote:
>>>>  
>>>> +static hwaddr get_hphys(CPUState *cs, hwaddr gphys, MMUAccessType access_type,
>>>> +                        int *prot)
>>>> +{
>>>> +    CPUX86State *env = &X86_CPU(cs)->env;
>>>> +    uint64_t rsvd_mask = PG_HI_RSVD_MASK;
>>>> +    uint64_t ptep, pte;
>>>> +    uint64_t exit_info_1 = 0;
>>>> +    target_ulong pde_addr, pte_addr;
>>>> +    uint32_t page_offset;
>>>> +    int page_size;
>>>> +
>>>> +    if (likely(!(env->hflags & HF_NPT_MASK))) {
>>>> +        return gphys;
>>>> +    }
>>>
>>> hflags are a somewhat limited resource.  Can this go in hflags2?
>>>
>>
>> Will have a look - I don't see why not. Or is there any special
>> semantic difference between the two fields?
> 
> Yes, hflags become flags of the translation block, while hflags2 is
> just random processor state. If translate.c uses a flag, it must be in
> hflags, but here hflags2 should be safe.

Indeed. We only change it at vmentry/exit, and that is already a
translation block barrier.

v2 is on the way.

Jan

Patch

diff --git a/target/i386/cpu.c b/target/i386/cpu.c
index 555ae79d29..0d14254ca1 100644
--- a/target/i386/cpu.c
+++ b/target/i386/cpu.c
@@ -261,7 +261,7 @@  static void x86_cpu_vendor_words2str(char *dst, uint32_t vendor1,
 #define TCG_EXT3_FEATURES (CPUID_EXT3_LAHF_LM | CPUID_EXT3_SVM | \
           CPUID_EXT3_CR8LEG | CPUID_EXT3_ABM | CPUID_EXT3_SSE4A)
 #define TCG_EXT4_FEATURES 0
-#define TCG_SVM_FEATURES 0
+#define TCG_SVM_FEATURES CPUID_SVM_NPT
 #define TCG_KVM_FEATURES 0
 #define TCG_7_0_EBX_FEATURES (CPUID_7_0_EBX_SMEP | CPUID_7_0_EBX_SMAP | \
           CPUID_7_0_EBX_BMI1 | CPUID_7_0_EBX_BMI2 | CPUID_7_0_EBX_ADX | \
diff --git a/target/i386/cpu.h b/target/i386/cpu.h
index d711634c2f..106a70e1cf 100644
--- a/target/i386/cpu.h
+++ b/target/i386/cpu.h
@@ -177,6 +177,7 @@  typedef enum X86Seg {
 #define HF_IOBPT_SHIFT      24 /* an io breakpoint enabled */
 #define HF_MPX_EN_SHIFT     25 /* MPX Enabled (CR4+XCR0+BNDCFGx) */
 #define HF_MPX_IU_SHIFT     26 /* BND registers in-use */
+#define HF_NPT_SHIFT        27 /* Nested Paging enabled */
 
 #define HF_CPL_MASK          (3 << HF_CPL_SHIFT)
 #define HF_INHIBIT_IRQ_MASK  (1 << HF_INHIBIT_IRQ_SHIFT)
@@ -202,6 +203,7 @@  typedef enum X86Seg {
 #define HF_IOBPT_MASK        (1 << HF_IOBPT_SHIFT)
 #define HF_MPX_EN_MASK       (1 << HF_MPX_EN_SHIFT)
 #define HF_MPX_IU_MASK       (1 << HF_MPX_IU_SHIFT)
+#define HF_NPT_MASK          (1 << HF_NPT_SHIFT)
 
 /* hflags2 */
 
@@ -1201,12 +1203,16 @@  typedef struct CPUX86State {
     uint16_t intercept_dr_read;
     uint16_t intercept_dr_write;
     uint32_t intercept_exceptions;
+    uint64_t nested_cr3;
+    uint32_t nested_pg_mode;
     uint8_t v_tpr;
 
     /* KVM states, automatically cleared on reset */
     uint8_t nmi_injected;
     uint8_t nmi_pending;
 
+    uintptr_t retaddr;
+
     /* Fields up to this point are cleared by a CPU reset */
     struct {} end_reset_fields;
 
diff --git a/target/i386/excp_helper.c b/target/i386/excp_helper.c
index cb4d1b7d33..e3bb59284f 100644
--- a/target/i386/excp_helper.c
+++ b/target/i386/excp_helper.c
@@ -157,6 +157,209 @@  int x86_cpu_handle_mmu_fault(CPUState *cs, vaddr addr, int size,
 
 #else
 
+static hwaddr get_hphys(CPUState *cs, hwaddr gphys, MMUAccessType access_type,
+                        int *prot)
+{
+    CPUX86State *env = &X86_CPU(cs)->env;
+    uint64_t rsvd_mask = PG_HI_RSVD_MASK;
+    uint64_t ptep, pte;
+    uint64_t exit_info_1 = 0;
+    target_ulong pde_addr, pte_addr;
+    uint32_t page_offset;
+    int page_size;
+
+    if (likely(!(env->hflags & HF_NPT_MASK))) {
+        return gphys;
+    }
+
+    if (!(env->nested_pg_mode & SVM_NPT_NXE)) {
+        rsvd_mask |= PG_NX_MASK;
+    }
+
+    if (env->nested_pg_mode & SVM_NPT_PAE) {
+        uint64_t pde, pdpe;
+        target_ulong pdpe_addr;
+
+#ifdef TARGET_X86_64
+        if (env->nested_pg_mode & SVM_NPT_LMA) {
+            uint64_t pml5e;
+            uint64_t pml4e_addr, pml4e;
+
+            pml5e = env->nested_cr3;
+            ptep = PG_NX_MASK | PG_USER_MASK | PG_RW_MASK;
+
+            pml4e_addr = (pml5e & PG_ADDRESS_MASK) +
+                    (((gphys >> 39) & 0x1ff) << 3);
+            pml4e = x86_ldq_phys(cs, pml4e_addr);
+            if (!(pml4e & PG_PRESENT_MASK)) {
+                goto do_fault;
+            }
+            if (pml4e & (rsvd_mask | PG_PSE_MASK)) {
+                goto do_fault_rsvd;
+            }
+            if (!(pml4e & PG_ACCESSED_MASK)) {
+                pml4e |= PG_ACCESSED_MASK;
+                x86_stl_phys_notdirty(cs, pml4e_addr, pml4e);
+            }
+            ptep &= pml4e ^ PG_NX_MASK;
+            pdpe_addr = (pml4e & PG_ADDRESS_MASK) +
+                    (((gphys >> 30) & 0x1ff) << 3);
+            pdpe = x86_ldq_phys(cs, pdpe_addr);
+            if (!(pdpe & PG_PRESENT_MASK)) {
+                goto do_fault;
+            }
+            if (pdpe & rsvd_mask) {
+                goto do_fault_rsvd;
+            }
+            ptep &= pdpe ^ PG_NX_MASK;
+            if (!(pdpe & PG_ACCESSED_MASK)) {
+                pdpe |= PG_ACCESSED_MASK;
+                x86_stl_phys_notdirty(cs, pdpe_addr, pdpe);
+            }
+            if (pdpe & PG_PSE_MASK) {
+                /* 1 GB page */
+                page_size = 1024 * 1024 * 1024;
+                pte_addr = pdpe_addr;
+                pte = pdpe;
+                goto do_check_protect;
+            }
+        } else
+#endif
+        {
+            pdpe_addr = (env->nested_cr3 & ~0x1f) + ((gphys >> 27) & 0x18);
+            pdpe = x86_ldq_phys(cs, pdpe_addr);
+            if (!(pdpe & PG_PRESENT_MASK)) {
+                goto do_fault;
+            }
+            rsvd_mask |= PG_HI_USER_MASK;
+            if (pdpe & (rsvd_mask | PG_NX_MASK)) {
+                goto do_fault_rsvd;
+            }
+            ptep = PG_NX_MASK | PG_USER_MASK | PG_RW_MASK;
+        }
+
+        pde_addr = (pdpe & PG_ADDRESS_MASK) + (((gphys >> 21) & 0x1ff) << 3);
+        pde = x86_ldq_phys(cs, pde_addr);
+        if (!(pde & PG_PRESENT_MASK)) {
+            goto do_fault;
+        }
+        if (pde & rsvd_mask) {
+            goto do_fault_rsvd;
+        }
+        ptep &= pde ^ PG_NX_MASK;
+        if (pde & PG_PSE_MASK) {
+            /* 2 MB page */
+            page_size = 2048 * 1024;
+            pte_addr = pde_addr;
+            pte = pde;
+            goto do_check_protect;
+        }
+        /* 4 KB page */
+        if (!(pde & PG_ACCESSED_MASK)) {
+            pde |= PG_ACCESSED_MASK;
+            x86_stl_phys_notdirty(cs, pde_addr, pde);
+        }
+        pte_addr = (pde & PG_ADDRESS_MASK) + (((gphys >> 12) & 0x1ff) << 3);
+        pte = x86_ldq_phys(cs, pte_addr);
+        if (!(pte & PG_PRESENT_MASK)) {
+            goto do_fault;
+        }
+        if (pte & rsvd_mask) {
+            goto do_fault_rsvd;
+        }
+        /* combine pde and pte nx, user and rw protections */
+        ptep &= pte ^ PG_NX_MASK;
+        page_size = 4096;
+    } else {
+        uint32_t pde;
+
+        /* page directory entry */
+        pde_addr = (env->nested_cr3 & ~0xfff) + ((gphys >> 20) & 0xffc);
+        pde = x86_ldl_phys(cs, pde_addr);
+        if (!(pde & PG_PRESENT_MASK)) {
+            goto do_fault;
+        }
+        ptep = pde | PG_NX_MASK;
+
+        /* if PSE bit is set, then we use a 4MB page */
+        if ((pde & PG_PSE_MASK) && (env->cr[4] & CR4_PSE_MASK)) {
+            page_size = 4096 * 1024;
+            pte_addr = pde_addr;
+
+            /* Bits 20-13 provide bits 39-32 of the address, bit 21 is reserved.
+             * Leave bits 20-13 in place for setting accessed/dirty bits below.
+             */
+            pte = pde | ((pde & 0x1fe000LL) << (32 - 13));
+            rsvd_mask = 0x200000;
+            goto do_check_protect_pse36;
+        }
+
+        if (!(pde & PG_ACCESSED_MASK)) {
+            pde |= PG_ACCESSED_MASK;
+            x86_stl_phys_notdirty(cs, pde_addr, pde);
+        }
+
+        /* page table entry */
+        pte_addr = (pde & ~0xfff) + ((gphys >> 10) & 0xffc);
+        pte = x86_ldl_phys(cs, pte_addr);
+        if (!(pte & PG_PRESENT_MASK)) {
+            goto do_fault;
+        }
+        /* combine pde and pte user and rw protections */
+        ptep &= pte | PG_NX_MASK;
+        page_size = 4096;
+        rsvd_mask = 0;
+    }
+
+ do_check_protect:
+    rsvd_mask |= (page_size - 1) & PG_ADDRESS_MASK & ~PG_PSE_PAT_MASK;
+ do_check_protect_pse36:
+    if (pte & rsvd_mask) {
+        goto do_fault_rsvd;
+    }
+    ptep ^= PG_NX_MASK;
+
+    if (!(ptep & PG_USER_MASK)) {
+        goto do_fault_protect;
+    }
+    if (ptep & PG_NX_MASK) {
+        if (access_type == MMU_INST_FETCH) {
+            goto do_fault_protect;
+        }
+        *prot &= ~PAGE_EXEC;
+    }
+    if (!(ptep & PG_RW_MASK)) {
+        if (access_type == MMU_DATA_STORE) {
+            goto do_fault_protect;
+        }
+        *prot &= ~PAGE_WRITE;
+    }
+
+    pte &= PG_ADDRESS_MASK & ~(page_size - 1);
+    page_offset = gphys & (page_size - 1);
+    return pte + page_offset;
+
+ do_fault_rsvd:
+    exit_info_1 |= SVM_NPTEXIT_RSVD;
+ do_fault_protect:
+    exit_info_1 |= SVM_NPTEXIT_P;
+ do_fault:
+    x86_stq_phys(cs, env->vm_vmcb + offsetof(struct vmcb, control.exit_info_2),
+                 gphys);
+    exit_info_1 |= SVM_NPTEXIT_US;
+    if (access_type == MMU_DATA_STORE) {
+        exit_info_1 |= SVM_NPTEXIT_RW;
+    } else if (access_type == MMU_INST_FETCH) {
+        exit_info_1 |= SVM_NPTEXIT_ID;
+    }
+    if (prot) {
+        exit_info_1 |= SVM_NPTEXIT_GPA;
+    } else { /* page table access */
+        exit_info_1 |= SVM_NPTEXIT_GPT;
+    }
+    cpu_vmexit(env, SVM_EXIT_NPF, exit_info_1, env->retaddr);
+}
+
 /* return value:
  * -1 = cannot handle fault
  * 0  = nothing more to do
@@ -224,6 +427,7 @@  int x86_cpu_handle_mmu_fault(CPUState *cs, vaddr addr, int size,
             if (la57) {
                 pml5e_addr = ((env->cr[3] & ~0xfff) +
                         (((addr >> 48) & 0x1ff) << 3)) & a20_mask;
+                pml5e_addr = get_hphys(cs, pml5e_addr, MMU_DATA_STORE, NULL);
                 pml5e = x86_ldq_phys(cs, pml5e_addr);
                 if (!(pml5e & PG_PRESENT_MASK)) {
                     goto do_fault;
@@ -243,6 +447,7 @@  int x86_cpu_handle_mmu_fault(CPUState *cs, vaddr addr, int size,
 
             pml4e_addr = ((pml5e & PG_ADDRESS_MASK) +
                     (((addr >> 39) & 0x1ff) << 3)) & a20_mask;
+            pml4e_addr = get_hphys(cs, pml4e_addr, MMU_DATA_STORE, NULL);
             pml4e = x86_ldq_phys(cs, pml4e_addr);
             if (!(pml4e & PG_PRESENT_MASK)) {
                 goto do_fault;
@@ -257,6 +462,7 @@  int x86_cpu_handle_mmu_fault(CPUState *cs, vaddr addr, int size,
             ptep &= pml4e ^ PG_NX_MASK;
             pdpe_addr = ((pml4e & PG_ADDRESS_MASK) + (((addr >> 30) & 0x1ff) << 3)) &
                 a20_mask;
+            pdpe_addr = get_hphys(cs, pdpe_addr, MMU_DATA_STORE, NULL);
             pdpe = x86_ldq_phys(cs, pdpe_addr);
             if (!(pdpe & PG_PRESENT_MASK)) {
                 goto do_fault;
@@ -282,6 +488,7 @@  int x86_cpu_handle_mmu_fault(CPUState *cs, vaddr addr, int size,
             /* XXX: load them when cr3 is loaded ? */
             pdpe_addr = ((env->cr[3] & ~0x1f) + ((addr >> 27) & 0x18)) &
                 a20_mask;
+            pdpe_addr = get_hphys(cs, pdpe_addr, MMU_DATA_STORE, NULL);
             pdpe = x86_ldq_phys(cs, pdpe_addr);
             if (!(pdpe & PG_PRESENT_MASK)) {
                 goto do_fault;
@@ -295,6 +502,7 @@  int x86_cpu_handle_mmu_fault(CPUState *cs, vaddr addr, int size,
 
         pde_addr = ((pdpe & PG_ADDRESS_MASK) + (((addr >> 21) & 0x1ff) << 3)) &
             a20_mask;
+        pde_addr = get_hphys(cs, pde_addr, MMU_DATA_STORE, NULL);
         pde = x86_ldq_phys(cs, pde_addr);
         if (!(pde & PG_PRESENT_MASK)) {
             goto do_fault;
@@ -317,6 +525,7 @@  int x86_cpu_handle_mmu_fault(CPUState *cs, vaddr addr, int size,
         }
         pte_addr = ((pde & PG_ADDRESS_MASK) + (((addr >> 12) & 0x1ff) << 3)) &
             a20_mask;
+        pte_addr = get_hphys(cs, pte_addr, MMU_DATA_STORE, NULL);
         pte = x86_ldq_phys(cs, pte_addr);
         if (!(pte & PG_PRESENT_MASK)) {
             goto do_fault;
@@ -333,6 +542,7 @@  int x86_cpu_handle_mmu_fault(CPUState *cs, vaddr addr, int size,
         /* page directory entry */
         pde_addr = ((env->cr[3] & ~0xfff) + ((addr >> 20) & 0xffc)) &
             a20_mask;
+        pde_addr = get_hphys(cs, pde_addr, MMU_DATA_STORE, NULL);
         pde = x86_ldl_phys(cs, pde_addr);
         if (!(pde & PG_PRESENT_MASK)) {
             goto do_fault;
@@ -360,6 +570,7 @@  int x86_cpu_handle_mmu_fault(CPUState *cs, vaddr addr, int size,
         /* page directory entry */
         pte_addr = ((pde & ~0xfff) + ((addr >> 10) & 0xffc)) &
             a20_mask;
+        pte_addr = get_hphys(cs, pte_addr, MMU_DATA_STORE, NULL);
         pte = x86_ldl_phys(cs, pte_addr);
         if (!(pte & PG_PRESENT_MASK)) {
             goto do_fault;
@@ -442,12 +653,13 @@  do_check_protect_pse36:
 
     /* align to page_size */
     pte &= PG_ADDRESS_MASK & ~(page_size - 1);
+    page_offset = addr & (page_size - 1);
+    paddr = get_hphys(cs, pte + page_offset, is_write1, &prot);
 
     /* Even if 4MB pages, we map only one 4KB page in the cache to
        avoid filling it too fast */
     vaddr = addr & TARGET_PAGE_MASK;
-    page_offset = vaddr & (page_size - 1);
-    paddr = pte + page_offset;
+    paddr &= TARGET_PAGE_MASK;
 
     assert(prot & (1 << is_write1));
     tlb_set_page_with_attrs(cs, vaddr, paddr, cpu_get_mem_attrs(env),
diff --git a/target/i386/mem_helper.c b/target/i386/mem_helper.c
index a8ae694a9c..30c26b9d9c 100644
--- a/target/i386/mem_helper.c
+++ b/target/i386/mem_helper.c
@@ -202,13 +202,13 @@  void helper_boundl(CPUX86State *env, target_ulong a0, int v)
 void tlb_fill(CPUState *cs, target_ulong addr, int size,
               MMUAccessType access_type, int mmu_idx, uintptr_t retaddr)
 {
+    X86CPU *cpu = X86_CPU(cs);
+    CPUX86State *env = &cpu->env;
     int ret;
 
+    env->retaddr = retaddr;
     ret = x86_cpu_handle_mmu_fault(cs, addr, size, access_type, mmu_idx);
     if (ret) {
-        X86CPU *cpu = X86_CPU(cs);
-        CPUX86State *env = &cpu->env;
-
         raise_exception_err_ra(env, cs->exception_index, env->error_code, retaddr);
     }
 }
diff --git a/target/i386/svm.h b/target/i386/svm.h
index 922c8fd39c..23a3a040b8 100644
--- a/target/i386/svm.h
+++ b/target/i386/svm.h
@@ -130,6 +130,20 @@ 
 
 #define SVM_CR0_SELECTIVE_MASK (1 << 3 | 1) /* TS and MP */
 
+#define SVM_NPT_ENABLED     (1 << 0)
+
+#define SVM_NPT_PAE         (1 << 0)
+#define SVM_NPT_LMA         (1 << 1)
+#define SVM_NPT_NXE         (1 << 2)
+
+#define SVM_NPTEXIT_P       (1ULL << 0)
+#define SVM_NPTEXIT_RW      (1ULL << 1)
+#define SVM_NPTEXIT_US      (1ULL << 2)
+#define SVM_NPTEXIT_RSVD    (1ULL << 3)
+#define SVM_NPTEXIT_ID      (1ULL << 4)
+#define SVM_NPTEXIT_GPA     (1ULL << 32)
+#define SVM_NPTEXIT_GPT     (1ULL << 33)
+
 struct QEMU_PACKED vmcb_control_area {
 	uint16_t intercept_cr_read;
 	uint16_t intercept_cr_write;
diff --git a/target/i386/svm_helper.c b/target/i386/svm_helper.c
index e3288955f1..209881cf16 100644
--- a/target/i386/svm_helper.c
+++ b/target/i386/svm_helper.c
@@ -124,6 +124,7 @@  void helper_vmrun(CPUX86State *env, int aflag, int next_eip_addend)
 {
     CPUState *cs = CPU(x86_env_get_cpu(env));
     target_ulong addr;
+    uint64_t nested_ctl;
     uint32_t event_inj;
     uint32_t int_ctl;
 
@@ -206,6 +207,26 @@  void helper_vmrun(CPUX86State *env, int aflag, int next_eip_addend)
                                                   control.intercept_exceptions
                                                   ));
 
+    nested_ctl = x86_ldq_phys(cs, env->vm_vmcb + offsetof(struct vmcb,
+                                                          control.nested_ctl));
+    if (nested_ctl & SVM_NPT_ENABLED) {
+        env->nested_cr3 = x86_ldq_phys(cs,
+                                env->vm_vmcb + offsetof(struct vmcb,
+                                                        control.nested_cr3));
+        env->hflags |= HF_NPT_MASK;
+
+        env->nested_pg_mode = 0;
+        if (env->cr[4] & CR4_PAE_MASK) {
+            env->nested_pg_mode |= SVM_NPT_PAE;
+        }
+        if (env->hflags & HF_LMA_MASK) {
+            env->nested_pg_mode |= SVM_NPT_LMA;
+        }
+        if (env->efer & MSR_EFER_NXE) {
+            env->nested_pg_mode |= SVM_NPT_NXE;
+        }
+    }
+
     /* enable intercepts */
     env->hflags |= HF_SVMI_MASK;
 
@@ -616,6 +637,7 @@  void do_vmexit(CPUX86State *env, uint32_t exit_code, uint64_t exit_info_1)
         x86_stl_phys(cs,
                  env->vm_vmcb + offsetof(struct vmcb, control.int_state), 0);
     }
+    env->hflags &= ~HF_NPT_MASK;
 
     /* Save the VM state in the vmcb */
     svm_save_seg(env, env->vm_vmcb + offsetof(struct vmcb, save.es),