
[RFC,03/14,v7] target-i386: implement cpu_get_memory_mapping()

Message ID 4F4EE1EB.9080409@cn.fujitsu.com
State New

Commit Message

Wen Congyang March 1, 2012, 2:41 a.m. UTC
Walk the CPU's page tables and collect all virtual-to-physical address
mappings. Then add these mappings to the memory mapping list.
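
For context, a caller (such as the dump code later in this series) might
drive this roughly as follows. This is only a sketch: the
memory_mapping_list_init()/memory_mapping_list_free() helpers are assumed
to come from PATCH 01, and error handling is simplified.

    static int get_all_memory_mappings(MemoryMappingList *list)
    {
        CPUState *env;

        memory_mapping_list_init(list);
        /* collect the mappings seen by every virtual CPU */
        for (env = first_cpu; env != NULL; env = env->next_cpu) {
            if (cpu_get_memory_mapping(list, env) < 0) {
                memory_mapping_list_free(list);
                return -1;
            }
        }
        return 0;
    }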

Signed-off-by: Wen Congyang <wency@cn.fujitsu.com>
---
 Makefile.target                   |    1 +
 configure                         |    4 +
 cpu-all.h                         |   10 ++
 target-i386/arch_memory_mapping.c |  262 +++++++++++++++++++++++++++++++++++++
 4 files changed, 277 insertions(+), 0 deletions(-)
 create mode 100644 target-i386/arch_memory_mapping.c

Comments

Hatayama, Daisuke March 1, 2012, 6:13 a.m. UTC | #1
From: Wen Congyang <wency@cn.fujitsu.com>
Subject: [RFC][PATCH 03/14 v7] target-i386: implement cpu_get_memory_mapping()
Date: Thu, 01 Mar 2012 10:41:47 +0800

> +int cpu_get_memory_mapping(MemoryMappingList *list, CPUState *env)
> +{
> +    if (env->cr[4] & CR4_PAE_MASK) {
> +#ifdef TARGET_X86_64
> +        if (env->hflags & HF_LMA_MASK) {
> +            target_phys_addr_t pml4e_addr;
> +
> +            pml4e_addr = (env->cr[3] & ~0xfff) & env->a20_mask;
> +            walk_pml4e(list, pml4e_addr, env->a20_mask);
> +        } else
> +#endif
> +        {
> +            target_phys_addr_t pdpe_addr;
> +
> +            pdpe_addr = (env->cr[3] & ~0x1f) & env->a20_mask;
> +            walk_pdpe2(list, pdpe_addr, env->a20_mask);
> +        }
> +    } else {
> +        target_phys_addr_t pde_addr;
> +        bool pse;
> +
> +        pde_addr = (env->cr[3] & ~0xfff) & env->a20_mask;
> +        pse = !!(env->cr[4] & CR4_PSE_MASK);
> +        walk_pde2(list, pde_addr, env->a20_mask, pse);
> +    }
> +
> +    return 0;
> +}

Does this assume paging mode? I don't know qemu very well, but the qemu
dump command runs externally to the guest machine, so I think the machine
could be in a state with paging disabled, where CR4 doesn't refer to a
page table as expected.

Thanks.
HATAYAMA, Daisuke
Wen Congyang March 1, 2012, 6:21 a.m. UTC | #2
At 03/01/2012 02:13 PM, HATAYAMA Daisuke Wrote:
> From: Wen Congyang <wency@cn.fujitsu.com>
> Subject: [RFC][PATCH 03/14 v7] target-i386: implement cpu_get_memory_mapping()
> Date: Thu, 01 Mar 2012 10:41:47 +0800
> 
>> +int cpu_get_memory_mapping(MemoryMappingList *list, CPUState *env)
>> +{
>> +    if (env->cr[4] & CR4_PAE_MASK) {
>> +#ifdef TARGET_X86_64
>> +        if (env->hflags & HF_LMA_MASK) {
>> +            target_phys_addr_t pml4e_addr;
>> +
>> +            pml4e_addr = (env->cr[3] & ~0xfff) & env->a20_mask;
>> +            walk_pml4e(list, pml4e_addr, env->a20_mask);
>> +        } else
>> +#endif
>> +        {
>> +            target_phys_addr_t pdpe_addr;
>> +
>> +            pdpe_addr = (env->cr[3] & ~0x1f) & env->a20_mask;
>> +            walk_pdpe2(list, pdpe_addr, env->a20_mask);
>> +        }
>> +    } else {
>> +        target_phys_addr_t pde_addr;
>> +        bool pse;
>> +
>> +        pde_addr = (env->cr[3] & ~0xfff) & env->a20_mask;
>> +        pse = !!(env->cr[4] & CR4_PSE_MASK);
>> +        walk_pde2(list, pde_addr, env->a20_mask, pse);
>> +    }
>> +
>> +    return 0;
>> +}
> 
> Does this assume paging mode? I don't know qemu very well, but the qemu
> dump command runs externally to the guest machine, so I think the machine
> could be in a state with paging disabled, where CR4 doesn't refer to a
> page table as expected.

CR4? I think you want to say CR3.

Yes, the guest may be in a state with paging disabled. I will fix it.
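
A minimal sketch of such a guard, using the existing CR0_PG_MASK
definition from target-i386/cpu.h (whether to return an error or fall
back to a 1:1 physical mapping is a separate design decision):

    int cpu_get_memory_mapping(MemoryMappingList *list, CPUState *env)
    {
        if (!(env->cr[0] & CR0_PG_MASK)) {
            /* paging is disabled: CR3 does not point at a page table */
            return -1;
        }

        /* ... walk the page tables as in the patch below ... */
        return 0;
    }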

Thanks
Wen Congyang

> 
> Thanks.
> HATAYAMA, Daisuke
> 
>
Hatayama, Daisuke March 2, 2012, 2:16 a.m. UTC | #3
From: Wen Congyang <wency@cn.fujitsu.com>
Subject: Re: [RFC][PATCH 03/14 v7] target-i386: implement cpu_get_memory_mapping()
Date: Thu, 01 Mar 2012 14:21:37 +0800

> At 03/01/2012 02:13 PM, HATAYAMA Daisuke Wrote:
>> From: Wen Congyang <wency@cn.fujitsu.com>
>> Subject: [RFC][PATCH 03/14 v7] target-i386: implement cpu_get_memory_mapping()
>> Date: Thu, 01 Mar 2012 10:41:47 +0800
>> 
>>> +int cpu_get_memory_mapping(MemoryMappingList *list, CPUState *env)
>>> +{
>>> +    if (env->cr[4] & CR4_PAE_MASK) {
>>> +#ifdef TARGET_X86_64
>>> +        if (env->hflags & HF_LMA_MASK) {
>>> +            target_phys_addr_t pml4e_addr;
>>> +
>>> +            pml4e_addr = (env->cr[3] & ~0xfff) & env->a20_mask;
>>> +            walk_pml4e(list, pml4e_addr, env->a20_mask);
>>> +        } else
>>> +#endif
>>> +        {
>>> +            target_phys_addr_t pdpe_addr;
>>> +
>>> +            pdpe_addr = (env->cr[3] & ~0x1f) & env->a20_mask;
>>> +            walk_pdpe2(list, pdpe_addr, env->a20_mask);
>>> +        }
>>> +    } else {
>>> +        target_phys_addr_t pde_addr;
>>> +        bool pse;
>>> +
>>> +        pde_addr = (env->cr[3] & ~0xfff) & env->a20_mask;
>>> +        pse = !!(env->cr[4] & CR4_PSE_MASK);
>>> +        walk_pde2(list, pde_addr, env->a20_mask, pse);
>>> +    }
>>> +
>>> +    return 0;
>>> +}
>> 
>> Does this assume paging mode? I don't know qemu very well, but the qemu
>> dump command runs externally to the guest machine, so I think the machine
>> could be in a state with paging disabled, where CR4 doesn't refer to a
>> page table as expected.
> 
> CR4? I think you want to say CR3.
> 
> Yes, the guest may be in a state with paging disabled. I will fix it.
> 

Hmmm, now I think the dump command needs an option to specify whether to
do paging during dumping. Always doing paging is problematic. Also, the
generated format should be as simple as possible, unlike the format this
current version generates. My reasons are as follows:

  - The qemu dump command runs outside of the guest machine. If the
    machine is in a state with paging disabled, CR3 doesn't hold a page
    table address, and the qemu dump command cannot do paging.

  - We cannot do paging if the guest machine's state is severe, for
    example when the page table data has been corrupted for some reason.
    In general, we should rely on a minimum amount of data during dumping.

  - There's also a kdump-specific issue. On kdump there are two kernels,
    the 1st kernel and the 2nd kernel; when a crash happens, execution is
    transferred from the 1st to the 2nd, and the 2nd kernel then copies
    out the 1st kernel's memory image. The problem is that in a
    catastrophic situation, kdump can hang even in the 2nd kernel. At
    that point, CR3 refers to the 2nd kernel's page table, so paging in
    that situation loses the 1st kernel's memory.

  - OTOH, gdb cannot perform paging, so, for gdb support, qemu dump needs
    a paging mode. The window in which qemu dump can produce a dump that
    gdb can read is limited to machine states with paging enabled in the
    1st kernel, but I think there is no choice.

    * There's a way to recover the 1st kernel's image as a linear image
      from a dumpfile generated in the 2nd kernel without paging, but it
      relies on kernel-specific information, so I don't think qemu should
      do this.

  - Well, it's possible to generate a dumpfile that allows both physical
    and linear address access together; that's just what Wen is doing
    now. But I think it would be better to do this more simply: in
    non-paging mode, produce the dumpfile in raw format; in paging mode,
    produce it in linear format.

    * For example, the current implementation assigns both a virtual and
      a physical address to a single PT_LOAD entry. But the memory area
      covered by a single PT_LOAD is restricted to one that is contiguous
      both physically and virtually. Due to this, I guess the number of
      program headers can grow seriously in the worst case; it might
      reach ELF's size limit. (See the sketch after this list.)

    * Also, because of this, it's necessary to reduce the number of
      program headers as much as possible; qemu dump now tries to merge
      them in PATCH 01, but that looks too complicated to me.
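
For reference, the PT_LOAD bookkeeping under discussion looks roughly
like this (a sketch using the standard <elf.h> types, not the series'
actual dump code):

    #include <elf.h>
    #include <stdint.h>

    /* Build one PT_LOAD header. Because p_vaddr and p_paddr are single
     * base addresses, one PT_LOAD can only describe a region that is
     * contiguous in both spaces; fragmented mappings each need their own
     * header, which is why the merge logic in PATCH 01 matters. */
    static Elf64_Phdr make_load_phdr(uint64_t vaddr, uint64_t paddr,
                                     uint64_t len, uint64_t file_off)
    {
        Elf64_Phdr phdr = {
            .p_type   = PT_LOAD,
            .p_offset = file_off,      /* where the data sits in the file */
            .p_vaddr  = vaddr,         /* virtual start of the mapping    */
            .p_paddr  = paddr,         /* physical start of the mapping   */
            .p_filesz = len,
            .p_memsz  = len,
        };
        return phdr;
    }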

How do other people think?

Thanks.
HATAYAMA, Daisuke

Patch

diff --git a/Makefile.target b/Makefile.target
index 9227e4e..a87e678 100644
--- a/Makefile.target
+++ b/Makefile.target
@@ -84,6 +84,7 @@  libobj-y += op_helper.o helper.o
 ifeq ($(TARGET_BASE_ARCH), i386)
 libobj-y += cpuid.o
 endif
+libobj-$(CONFIG_HAVE_GET_MEMORY_MAPPING) += arch_memory_mapping.o
 libobj-$(TARGET_SPARC64) += vis_helper.o
 libobj-$(CONFIG_NEED_MMU) += mmu.o
 libobj-$(TARGET_ARM) += neon_helper.o iwmmxt_helper.o
diff --git a/configure b/configure
index f9d5330..ddc54f5 100755
--- a/configure
+++ b/configure
@@ -3630,6 +3630,10 @@  case "$target_arch2" in
       fi
     fi
 esac
+case "$target_arch2" in
+  i386|x86_64)
+    echo "CONFIG_HAVE_GET_MEMORY_MAPPING=y" >> $config_target_mak
+esac
 if test "$target_arch2" = "ppc64" -a "$fdt" = "yes"; then
   echo "CONFIG_PSERIES=y" >> $config_target_mak
 fi
diff --git a/cpu-all.h b/cpu-all.h
index e2c3c49..cb72680 100644
--- a/cpu-all.h
+++ b/cpu-all.h
@@ -22,6 +22,7 @@ 
 #include "qemu-common.h"
 #include "qemu-tls.h"
 #include "cpu-common.h"
+#include "memory_mapping.h"
 
 /* some important defines:
  *
@@ -523,4 +524,13 @@  void dump_exec_info(FILE *f, fprintf_function cpu_fprintf);
 int cpu_memory_rw_debug(CPUState *env, target_ulong addr,
                         uint8_t *buf, int len, int is_write);
 
+#if defined(CONFIG_HAVE_GET_MEMORY_MAPPING)
+int cpu_get_memory_mapping(MemoryMappingList *list, CPUState *env);
+#else
+static inline int cpu_get_memory_mapping(MemoryMappingList *list, CPUState *env)
+{
+    return -1;
+}
+#endif
+
 #endif /* CPU_ALL_H */
diff --git a/target-i386/arch_memory_mapping.c b/target-i386/arch_memory_mapping.c
new file mode 100644
index 0000000..8dcc010
--- /dev/null
+++ b/target-i386/arch_memory_mapping.c
@@ -0,0 +1,262 @@
+/*
+ * i386 memory mapping
+ *
+ * Copyright Fujitsu, Corp. 2011
+ *
+ * Authors:
+ *     Wen Congyang <wency@cn.fujitsu.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#include "cpu.h"
+#include "cpu-all.h"
+
+/* PAE Paging or IA-32e Paging */
+static void walk_pte(MemoryMappingList *list, target_phys_addr_t pte_start_addr,
+                     int32_t a20_mask, target_ulong start_line_addr)
+{
+    target_phys_addr_t pte_addr, start_paddr;
+    uint64_t pte;
+    target_ulong start_vaddr;
+    int i;
+
+    for (i = 0; i < 512; i++) {
+        pte_addr = (pte_start_addr + i * 8) & a20_mask;
+        pte = ldq_phys(pte_addr);
+        if (!(pte & PG_PRESENT_MASK)) {
+            /* not present */
+            continue;
+        }
+
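+        /* drop flags (bits 11:0) and the NX bit (63) to get the frame */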
+        start_paddr = (pte & ~0xfff) & ~(0x1ULL << 63);
+        if (cpu_physical_memory_is_io(start_paddr)) {
+            /* I/O region */
+            continue;
+        }
+
+        start_vaddr = start_line_addr | ((i & 0x1ff) << 12);
+        memory_mapping_list_add_sorted(list, start_paddr, start_vaddr, 1 << 12);
+    }
+}
+
+/* 32-bit Paging */
+static void walk_pte2(MemoryMappingList *list,
+                      target_phys_addr_t pte_start_addr, int32_t a20_mask,
+                      target_ulong start_line_addr)
+{
+    target_phys_addr_t pte_addr, start_paddr;
+    uint32_t pte;
+    target_ulong start_vaddr;
+    int i;
+
+    for (i = 0; i < 1024; i++) {
+        pte_addr = (pte_start_addr + i * 4) & a20_mask;
+        pte = ldl_phys(pte_addr);
+        if (!(pte & PG_PRESENT_MASK)) {
+            /* not present */
+            continue;
+        }
+
+        start_paddr = pte & ~0xfff;
+        if (cpu_physical_memory_is_io(start_paddr)) {
+            /* I/O region */
+            continue;
+        }
+
+        start_vaddr = start_line_addr | ((i & 0x3ff) << 12);
+        memory_mapping_list_add_sorted(list, start_paddr, start_vaddr, 1 << 12);
+    }
+}
+
+/* PAE Paging or IA-32e Paging */
+static void walk_pde(MemoryMappingList *list, target_phys_addr_t pde_start_addr,
+                     int32_t a20_mask, target_ulong start_line_addr)
+{
+    target_phys_addr_t pde_addr, pte_start_addr, start_paddr;
+    uint64_t pde;
+    target_ulong line_addr, start_vaddr;
+    int i;
+
+    for (i = 0; i < 512; i++) {
+        pde_addr = (pde_start_addr + i * 8) & a20_mask;
+        pde = ldq_phys(pde_addr);
+        if (!(pde & PG_PRESENT_MASK)) {
+            /* not present */
+            continue;
+        }
+
+        line_addr = start_line_addr | ((i & 0x1ff) << 21);
+        if (pde & PG_PSE_MASK) {
+            /* 2 MB page */
+            start_paddr = (pde & ~0x1fffff) & ~(0x1ULL << 63);
+            if (cpu_physical_memory_is_io(start_paddr)) {
+                /* I/O region */
+                continue;
+            }
+            start_vaddr = line_addr;
+            memory_mapping_list_add_sorted(list, start_paddr, start_vaddr, 1 << 21);
+            continue;
+        }
+
+        pte_start_addr = (pde & ~0xfff) & a20_mask;
+        walk_pte(list, pte_start_addr, a20_mask, line_addr);
+    }
+}
+
+/* 32-bit Paging */
+static void walk_pde2(MemoryMappingList *list,
+                      target_phys_addr_t pde_start_addr, int32_t a20_mask,
+                      bool pse)
+{
+    target_phys_addr_t pde_addr, pte_start_addr, start_paddr;
+    uint32_t pde;
+    target_ulong line_addr, start_vaddr;
+    int i;
+
+    for (i = 0; i < 1024; i++) {
+        pde_addr = (pde_start_addr + i * 4) & a20_mask;
+        pde = ldl_phys(pde_addr);
+        if (!(pde & PG_PRESENT_MASK)) {
+            /* not present */
+            continue;
+        }
+
+        line_addr = (((unsigned int)i & 0x3ff) << 22);
+        if ((pde & PG_PSE_MASK) && pse) {
+            /* 4 MB page */
+            /* PSE-36: PDE bits 20:13 supply physical address bits 39:32 */
+            start_paddr = (pde & ~0x3fffff)
+                          | ((target_phys_addr_t)(pde & 0x1fe000) << 19);
+            if (cpu_physical_memory_is_io(start_paddr)) {
+                /* I/O region */
+                continue;
+            }
+            start_vaddr = line_addr;
+            memory_mapping_list_add_sorted(list, start_paddr, start_vaddr, 1 << 22);
+            continue;
+        }
+
+        pte_start_addr = (pde & ~0xfff) & a20_mask;
+        walk_pte2(list, pte_start_addr, a20_mask, line_addr);
+    }
+}
+
+/* PAE Paging */
+static void walk_pdpe2(MemoryMappingList *list,
+                       target_phys_addr_t pdpe_start_addr, int32_t a20_mask)
+{
+    target_phys_addr_t pdpe_addr, pde_start_addr;
+    uint64_t pdpe;
+    target_ulong line_addr;
+    int i;
+
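+    /* the PAE page-directory-pointer table has just 4 entries */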
+    for (i = 0; i < 4; i++) {
+        pdpe_addr = (pdpe_start_addr + i * 8) & a20_mask;
+        pdpe = ldq_phys(pdpe_addr);
+        if (!(pdpe & PG_PRESENT_MASK)) {
+            /* not present */
+            continue;
+        }
+
+        line_addr = (((unsigned int)i & 0x3) << 30);
+        pde_start_addr = (pdpe & ~0xfff) & a20_mask;
+        walk_pde(list, pde_start_addr, a20_mask, line_addr);
+    }
+}
+
+#ifdef TARGET_X86_64
+/* IA-32e Paging */
+static void walk_pdpe(MemoryMappingList *list,
+                      target_phys_addr_t pdpe_start_addr, int32_t a20_mask,
+                      target_ulong start_line_addr)
+{
+    target_phys_addr_t pdpe_addr, pde_start_addr, start_paddr;
+    uint64_t pdpe;
+    target_ulong line_addr, start_vaddr;
+    int i;
+
+    for (i = 0; i < 512; i++) {
+        pdpe_addr = (pdpe_start_addr + i * 8) & a20_mask;
+        pdpe = ldq_phys(pdpe_addr);
+        if (!(pdpe & PG_PRESENT_MASK)) {
+            /* not present */
+            continue;
+        }
+
+        line_addr = start_line_addr | ((i & 0x1ffULL) << 30);
+        if (pdpe & PG_PSE_MASK) {
+            /* 1 GB page */
+            start_paddr = (pdpe & ~0x3fffffff) & ~(0x1ULL << 63);
+            if (cpu_physical_memory_is_io(start_paddr)) {
+                /* I/O region */
+                continue;
+            }
+            start_vaddr = line_addr;
+            memory_mapping_list_add_sorted(list, start_paddr, start_vaddr, 1 << 30);
+            continue;
+        }
+
+        pde_start_addr = (pdpe & ~0xfff) & a20_mask;
+        walk_pde(list, pde_start_addr, a20_mask, line_addr);
+    }
+}
+
+/* IA-32e Paging */
+static void walk_pml4e(MemoryMappingList *list,
+                       target_phys_addr_t pml4e_start_addr, int32_t a20_mask)
+{
+    target_phys_addr_t pml4e_addr, pdpe_start_addr;
+    uint64_t pml4e;
+    target_ulong line_addr;
+    int i;
+
+    for (i = 0; i < 512; i++) {
+        pml4e_addr = (pml4e_start_addr + i * 8) & a20_mask;
+        pml4e = ldq_phys(pml4e_addr);
+        if (!(pml4e & PG_PRESENT_MASK)) {
+            /* not present */
+            continue;
+        }
+
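+        /* force bits 63:48 to 1: canonical form for kernel-half addresses */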
+        line_addr = ((i & 0x1ffULL) << 39) | (0xffffULL << 48);
+        pdpe_start_addr = (pml4e & ~0xfff) & a20_mask;
+        walk_pdpe(list, pdpe_start_addr, a20_mask, line_addr);
+    }
+}
+#endif
+
+int cpu_get_memory_mapping(MemoryMappingList *list, CPUState *env)
+{
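+    /* choose the walk by paging mode: IA-32e, PAE, or 32-bit paging */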
+    if (env->cr[4] & CR4_PAE_MASK) {
+#ifdef TARGET_X86_64
+        if (env->hflags & HF_LMA_MASK) {
+            target_phys_addr_t pml4e_addr;
+
+            pml4e_addr = (env->cr[3] & ~0xfff) & env->a20_mask;
+            walk_pml4e(list, pml4e_addr, env->a20_mask);
+        } else
+#endif
+        {
+            target_phys_addr_t pdpe_addr;
+
+            pdpe_addr = (env->cr[3] & ~0x1f) & env->a20_mask;
+            walk_pdpe2(list, pdpe_addr, env->a20_mask);
+        }
+    } else {
+        target_phys_addr_t pde_addr;
+        bool pse;
+
+        pde_addr = (env->cr[3] & ~0xfff) & env->a20_mask;
+        pse = !!(env->cr[4] & CR4_PSE_MASK);
+        walk_pde2(list, pde_addr, env->a20_mask, pse);
+    }
+
+    return 0;
+}