Patchwork [4/6] Implement multi-level page tables.

login
register
mail settings
Submitter Richard Henderson
Date Feb. 11, 2010, 11:51 p.m.
Message ID <baaef727b66e66ead2318731b0660f1bd0e398ff.1265933757.git.rth@twiddle.net>
Download mbox | patch
Permalink /patch/45171/
State New
Headers show

Comments

Richard Henderson - Feb. 11, 2010, 11:51 p.m.
Use TARGET_VIRT_ADDR_SPACE_BITS for the virtual memory map based off
of l1_map.  This rewrites page_find_alloc, page_flush_tb, and
walk_memory_regions.

Use TARGET_PHYS_ADDR_SPACE_BITS for the physical memory map based off
of l1_phys_map.  This rewrites phys_page_find_alloc and
phys_page_for_each.
---
 cpu-all.h |    7 +-
 exec.c    |  442 +++++++++++++++++++++++++++++++++++++------------------------
 2 files changed, 271 insertions(+), 178 deletions(-)

Patch

diff --git a/cpu-all.h b/cpu-all.h
index b81641f..510f0b4 100644
--- a/cpu-all.h
+++ b/cpu-all.h
@@ -745,8 +745,11 @@  extern unsigned long qemu_host_page_mask;
 #define PAGE_RESERVED  0x0020
 
 void page_dump(FILE *f);
-int walk_memory_regions(void *,
-    int (*fn)(void *, unsigned long, unsigned long, unsigned long));
+
+typedef int (*walk_memory_regions_fn)(void *, unsigned long,
+                                      unsigned long, unsigned long);
+int walk_memory_regions(void *, walk_memory_regions_fn);
+
 int page_get_flags(target_ulong address);
 void page_set_flags(target_ulong start, target_ulong end, int flags);
 int page_check_range(target_ulong start, target_ulong len, int flags);
diff --git a/exec.c b/exec.c
index ebbe6d0..14f15a6 100644
--- a/exec.c
+++ b/exec.c
@@ -141,28 +141,47 @@  typedef struct PhysPageDesc {
     ram_addr_t region_offset;
 } PhysPageDesc;
 
+/* Size of the L2 (and L3, etc) page tables.  */
 #define L2_BITS 10
-#if defined(CONFIG_USER_ONLY) && defined(TARGET_VIRT_ADDR_SPACE_BITS)
-/* XXX: this is a temporary hack for alpha target.
- *      In the future, this is to be replaced by a multi-level table
- *      to actually be able to handle the complete 64 bits address space.
- */
-#define L1_BITS (TARGET_VIRT_ADDR_SPACE_BITS - L2_BITS - TARGET_PAGE_BITS)
+#define L2_SIZE (1 << L2_BITS)
+
+/* The bits remaining after N lower levels of page tables.  */
+#define P_L1_BITS_REM \
+    ((TARGET_PHYS_ADDR_SPACE_BITS - TARGET_PAGE_BITS) % L2_BITS)
+#define V_L1_BITS_REM \
+    ((TARGET_VIRT_ADDR_SPACE_BITS - TARGET_PAGE_BITS) % L2_BITS)
+
+/* Size of the L1 page table.  Avoid silly small sizes.  */
+#if P_L1_BITS_REM < 4
+#define P_L1_BITS  (P_L1_BITS_REM + L2_BITS)
 #else
-#define L1_BITS (32 - L2_BITS - TARGET_PAGE_BITS)
+#define P_L1_BITS  P_L1_BITS_REM
 #endif
 
-#define L1_SIZE (1 << L1_BITS)
-#define L2_SIZE (1 << L2_BITS)
+#if V_L1_BITS_REM < 4
+#define V_L1_BITS  (V_L1_BITS_REM + L2_BITS)
+#else
+#define V_L1_BITS  V_L1_BITS_REM
+#endif
+
+#define P_L1_SIZE  ((target_phys_addr_t)1 << P_L1_BITS)
+#define V_L1_SIZE  ((target_ulong)1 << V_L1_BITS)
+
+#define P_L1_SHIFT (TARGET_PHYS_ADDR_SPACE_BITS - TARGET_PAGE_BITS - P_L1_BITS)
+#define V_L1_SHIFT (TARGET_VIRT_ADDR_SPACE_BITS - TARGET_PAGE_BITS - V_L1_BITS)
 
 unsigned long qemu_real_host_page_size;
 unsigned long qemu_host_page_bits;
 unsigned long qemu_host_page_size;
 unsigned long qemu_host_page_mask;
 
-/* XXX: for system emulation, it could just be an array */
-static PageDesc *l1_map[L1_SIZE];
-static PhysPageDesc **l1_phys_map;
+/* This is a multi-level map on the virtual address space.
+   The bottom level has pointers to PageDesc.  */
+static void *l1_map[V_L1_SIZE];
+
+/* This is a multi-level map on the physical address space.
+   The bottom level has pointers to PhysPageDesc.  */
+static void *l1_phys_map[P_L1_SIZE];
 
 #if !defined(CONFIG_USER_ONLY)
 static void io_mem_init(void);
@@ -247,130 +266,158 @@  static void page_init(void)
     while ((1 << qemu_host_page_bits) < qemu_host_page_size)
         qemu_host_page_bits++;
     qemu_host_page_mask = ~(qemu_host_page_size - 1);
-    l1_phys_map = qemu_vmalloc(L1_SIZE * sizeof(void *));
-    memset(l1_phys_map, 0, L1_SIZE * sizeof(void *));
 
 #if !defined(_WIN32) && defined(CONFIG_USER_ONLY)
     {
-        long long startaddr, endaddr;
         FILE *f;
-        int n;
 
-        mmap_lock();
         last_brk = (unsigned long)sbrk(0);
+
         f = fopen("/proc/self/maps", "r");
         if (f) {
+            mmap_lock();
+
             do {
-                n = fscanf (f, "%llx-%llx %*[^\n]\n", &startaddr, &endaddr);
-                if (n == 2) {
-                    startaddr = MIN(startaddr,
-                                    (1ULL << TARGET_PHYS_ADDR_SPACE_BITS) - 1);
-                    endaddr = MIN(endaddr,
-                                    (1ULL << TARGET_PHYS_ADDR_SPACE_BITS) - 1);
-                    page_set_flags(startaddr & TARGET_PAGE_MASK,
-                                   TARGET_PAGE_ALIGN(endaddr),
-                                   PAGE_RESERVED); 
+                unsigned long startaddr, endaddr;
+                int n;
+
+                n = fscanf (f, "%lx-%lx %*[^\n]\n", &startaddr, &endaddr);
+
+                if (n == 2 && h2g_valid(startaddr)) {
+                    startaddr = h2g(startaddr) & TARGET_PAGE_MASK;
+
+                    if (h2g_valid(endaddr)) {
+                        endaddr = h2g(endaddr);
+                    } else {
+                        endaddr = ~0ul;
+                    }
+                    page_set_flags(startaddr, endaddr, PAGE_RESERVED); 
                 }
             } while (!feof(f));
+
             fclose(f);
+            mmap_unlock();
         }
-        mmap_unlock();
     }
 #endif
 }
 
-static inline PageDesc **page_l1_map(target_ulong index)
+static PageDesc *page_find_alloc(target_ulong index, int alloc)
 {
-#if TARGET_LONG_BITS > 32
-    /* Host memory outside guest VM.  For 32-bit targets we have already
-       excluded high addresses.  */
-    if (index > ((target_ulong)L2_SIZE * L1_SIZE))
-        return NULL;
+#if defined(CONFIG_USER_ONLY)
+    /* We can't use qemu_malloc because it may recurse into a locked mutex.
+       Neither can we record the new pages we reserve while allocating a
+       given page because that may recurse into an unallocated page table
+       entry.  Stuff the allocations we do make into a queue and process
+       them after having completed one entire page table allocation.  */
+
+    unsigned long reserve[2 * (V_L1_SHIFT / L2_BITS)];
+    int reserve_idx = 0;
+
+# define ALLOC(P, SIZE)                                 \
+    do {                                                \
+        P = mmap(NULL, SIZE, PROT_READ | PROT_WRITE,    \
+                 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);   \
+        if (h2g_valid(P)) {                             \
+            reserve[reserve_idx] = h2g(P);              \
+            reserve[reserve_idx + 1] = SIZE;            \
+            reserve_idx += 2;                           \
+        }                                               \
+    } while (0)
+#else
+# define ALLOC(P, SIZE) \
+    do { P = qemu_mallocz(SIZE); } while (0)
 #endif
-    return &l1_map[index >> L2_BITS];
-}
 
-static inline PageDesc *page_find_alloc(target_ulong index)
-{
-    PageDesc **lp, *p;
-    lp = page_l1_map(index);
-    if (!lp)
-        return NULL;
+    PageDesc *pd;
+    void **lp;
+    int i;
 
-    p = *lp;
-    if (!p) {
-        /* allocate if not found */
-#if defined(CONFIG_USER_ONLY)
-        size_t len = sizeof(PageDesc) * L2_SIZE;
-        /* Don't use qemu_malloc because it may recurse.  */
-        p = mmap(NULL, len, PROT_READ | PROT_WRITE,
-                 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
-        *lp = p;
-        if (h2g_valid(p)) {
-            unsigned long addr = h2g(p);
-            page_set_flags(addr & TARGET_PAGE_MASK,
-                           TARGET_PAGE_ALIGN(addr + len),
-                           PAGE_RESERVED); 
+    /* Level 1.  Always allocated.  */
+    lp = l1_map + ((index >> V_L1_SHIFT) & (V_L1_SIZE - 1));
+
+    /* Level 2..N-1.  */
+    for (i = V_L1_SHIFT / L2_BITS - 1; i > 0; i--) {
+        void **p = *lp;
+
+        if (p == NULL) {
+            if (!alloc) {
+                return NULL;
+            }
+            ALLOC(p, sizeof(void *) * L2_SIZE);
+            *lp = p;
         }
-#else
-        p = qemu_mallocz(sizeof(PageDesc) * L2_SIZE);
-        *lp = p;
-#endif
+
+        lp = p + ((index >> (i * L2_BITS)) & (L2_SIZE - 1));
+    }
+
+    pd = *lp;
+    if (pd == NULL) {
+        if (!alloc) {
+            return NULL;
+        }
+        ALLOC(pd, sizeof(PageDesc) * L2_SIZE);
+        *lp = pd;
     }
-    return p + (index & (L2_SIZE - 1));
+
+#undef ALLOC
+#if defined(CONFIG_USER_ONLY)
+    for (i = 0; i < reserve_idx; i += 2) {
+        unsigned long addr = reserve[i];
+        unsigned long len = reserve[i + 1];
+
+        page_set_flags(addr & TARGET_PAGE_MASK,
+                       TARGET_PAGE_ALIGN(addr + len),
+                       PAGE_RESERVED);
+    }
+#endif
+
+    return pd + (index & (L2_SIZE - 1));
 }
 
 static inline PageDesc *page_find(target_ulong index)
 {
-    PageDesc **lp, *p;
-    lp = page_l1_map(index);
-    if (!lp)
-        return NULL;
-
-    p = *lp;
-    if (!p) {
-        return NULL;
-    }
-    return p + (index & (L2_SIZE - 1));
+    return page_find_alloc(index, 0);
 }
 
 static PhysPageDesc *phys_page_find_alloc(target_phys_addr_t index, int alloc)
 {
-    void **lp, **p;
     PhysPageDesc *pd;
+    void **lp;
+    int i;
 
-    p = (void **)l1_phys_map;
-#if TARGET_PHYS_ADDR_SPACE_BITS > 32
+    /* Level 1.  Always allocated.  */
+    lp = l1_phys_map + ((index >> P_L1_SHIFT) & (P_L1_SIZE - 1));
 
-#if TARGET_PHYS_ADDR_SPACE_BITS > (32 + L1_BITS)
-#error unsupported TARGET_PHYS_ADDR_SPACE_BITS
-#endif
-    lp = p + ((index >> (L1_BITS + L2_BITS)) & (L1_SIZE - 1));
-    p = *lp;
-    if (!p) {
-        /* allocate if not found */
-        if (!alloc)
-            return NULL;
-        p = qemu_vmalloc(sizeof(void *) * L1_SIZE);
-        memset(p, 0, sizeof(void *) * L1_SIZE);
-        *lp = p;
+    /* Level 2..N-1.  */
+    for (i = P_L1_SHIFT / L2_BITS - 1; i > 0; i--) {
+        void **p = *lp;
+        if (p == NULL) {
+            if (!alloc) {
+                return NULL;
+            }
+            *lp = p = qemu_mallocz(sizeof(void *) * L2_SIZE);
+        }
+        lp = p + ((index >> (i * L2_BITS)) & (L2_SIZE - 1));
     }
-#endif
-    lp = p + ((index >> L2_BITS) & (L1_SIZE - 1));
+
     pd = *lp;
-    if (!pd) {
+    if (pd == NULL) {
         int i;
-        /* allocate if not found */
-        if (!alloc)
+
+        if (!alloc) {
             return NULL;
-        pd = qemu_vmalloc(sizeof(PhysPageDesc) * L2_SIZE);
-        *lp = pd;
+        }
+
+        *lp = pd = qemu_malloc(sizeof(PhysPageDesc) * L2_SIZE);
+
         for (i = 0; i < L2_SIZE; i++) {
-          pd[i].phys_offset = IO_MEM_UNASSIGNED;
-          pd[i].region_offset = (index + i) << TARGET_PAGE_BITS;
+            pd[i].phys_offset = IO_MEM_UNASSIGNED;
+            pd[i].region_offset = (index + i) << TARGET_PAGE_BITS;
         }
     }
-    return ((PhysPageDesc *)pd) + (index & (L2_SIZE - 1));
+
+    return pd + (index & (L2_SIZE - 1));
 }
 
 static inline PhysPageDesc *phys_page_find(target_phys_addr_t index)
@@ -596,23 +643,36 @@  static inline void invalidate_page_bitmap(PageDesc *p)
     p->code_write_count = 0;
 }
 
-/* set to NULL all the 'first_tb' fields in all PageDescs */
-static void page_flush_tb(void)
+/* Set to NULL all the 'first_tb' fields in all PageDescs.  At LEVEL 0,
+   *LP is an array of L2_SIZE PageDescs; above that, L2_SIZE pointers.  */
+static void page_flush_tb_1 (int level, void **lp)
 {
-    int i, j;
-    PageDesc *p;
+    int i;
 
-    for(i = 0; i < L1_SIZE; i++) {
-        p = l1_map[i];
-        if (p) {
-            for(j = 0; j < L2_SIZE; j++) {
-                p->first_tb = NULL;
-                invalidate_page_bitmap(p);
-                p++;
-            }
+    if (*lp == NULL) {
+        return;
+    }
+    if (level == 0) {
+        PageDesc *pd = *lp;
+        for (i = 0; i < L2_SIZE; ++i) {
+            pd[i].first_tb = NULL;
+            invalidate_page_bitmap(pd + i);
+        }
+    } else {
+        void **pp = *lp;
+        for (i = 0; i < L2_SIZE; ++i) {
+            page_flush_tb_1 (level - 1, pp + i);
         }
     }
 }
+            
+static void page_flush_tb(void)
+{
+    int i;
+    for (i = 0; i < V_L1_SIZE; i++) {
+        page_flush_tb_1(V_L1_SHIFT / L2_BITS - 1, l1_map + i);
+    }
+}
 
 /* flush all the translation blocks */
 /* XXX: tb_flush is currently not thread safe */
@@ -1104,7 +1164,7 @@  static inline void tb_alloc_page(TranslationBlock *tb,
     TranslationBlock *last_first_tb;
 
     tb->page_addr[n] = page_addr;
-    p = page_find_alloc(page_addr >> TARGET_PAGE_BITS);
+    p = page_find_alloc(page_addr >> TARGET_PAGE_BITS, 1);
     tb->page_next[n] = p->first_tb;
     last_first_tb = p->first_tb;
     p->first_tb = (TranslationBlock *)((long)tb | n);
@@ -1644,50 +1704,37 @@  static int cpu_notify_migration_log(int enable)
     return 0;
 }
 
-static void phys_page_for_each_in_l1_map(PhysPageDesc **phys_map,
-                                         CPUPhysMemoryClient *client)
+static void phys_page_for_each_1(CPUPhysMemoryClient *client,
+                                 int level, void **lp)
 {
-    PhysPageDesc *pd;
-    int l1, l2;
+    int i;
 
-    for (l1 = 0; l1 < L1_SIZE; ++l1) {
-        pd = phys_map[l1];
-        if (!pd) {
-            continue;
-        }
-        for (l2 = 0; l2 < L2_SIZE; ++l2) {
-            if (pd[l2].phys_offset == IO_MEM_UNASSIGNED) {
-                continue;
+    if (*lp == NULL) {
+        return;
+    }
+    if (level == 0) {   /* leaf table: L2_SIZE PhysPageDesc entries */
+        PhysPageDesc *pd = *lp;
+        for (i = 0; i < L2_SIZE; ++i) {
+            if (pd[i].phys_offset != IO_MEM_UNASSIGNED) {
+                client->set_memory(client, pd[i].region_offset,
+                                   TARGET_PAGE_SIZE, pd[i].phys_offset);
             }
-            client->set_memory(client, pd[l2].region_offset,
-                               TARGET_PAGE_SIZE, pd[l2].phys_offset);
+        }
+    } else {            /* interior table: L2_SIZE child pointers */
+        void **pp = *lp;
+        for (i = 0; i < L2_SIZE; ++i) {
+            phys_page_for_each_1(client, level - 1, pp + i);
         }
     }
 }
 
 static void phys_page_for_each(CPUPhysMemoryClient *client)
 {
-#if TARGET_PHYS_ADDR_SPACE_BITS > 32
-
-#if TARGET_PHYS_ADDR_SPACE_BITS > (32 + L1_BITS)
-#error unsupported TARGET_PHYS_ADDR_SPACE_BITS
-#endif
-    void **phys_map = (void **)l1_phys_map;
-    int l1;
-    if (!l1_phys_map) {
-        return;
-    }
-    for (l1 = 0; l1 < L1_SIZE; ++l1) {
-        if (phys_map[l1]) {
-            phys_page_for_each_in_l1_map(phys_map[l1], client);
-        }
-    }
-#else
-    if (!l1_phys_map) {
-        return;
+    int i;
+    for (i = 0; i < P_L1_SIZE; ++i) {   /* walk every top-level slot */
+        phys_page_for_each_1(client, P_L1_SHIFT / L2_BITS - 1,
+                             l1_phys_map + i);
     }
-    phys_page_for_each_in_l1_map(l1_phys_map, client);
-#endif
 }
 
 void cpu_register_phys_memory_client(CPUPhysMemoryClient *client)
@@ -2158,44 +2205,87 @@  int tlb_set_page_exec(CPUState *env, target_ulong vaddr,
  * Walks guest process memory "regions" one by one
  * and calls callback function 'fn' for each region.
  */
-int walk_memory_regions(void *priv,
-    int (*fn)(void *, unsigned long, unsigned long, unsigned long))
+
+struct walk_memory_regions_data
 {
-    unsigned long start, end;
-    PageDesc *p = NULL;
-    int i, j, prot, prot1;
-    int rc = 0;
+    walk_memory_regions_fn fn;
+    void *priv;
+    unsigned long start;
+    int prot;
+};
 
-    start = end = -1;
-    prot = 0;
+static int walk_memory_regions_end(struct walk_memory_regions_data *data,
+                                   unsigned long end, int new_prot)
+{
+    if (data->start != -1ul) {
+        int rc = data->fn(data->priv, data->start, end, data->prot);
+        if (rc != 0) {
+            return rc;
+        }
+    }
+
+    data->start = (new_prot ? end : -1ul);
+    data->prot = new_prot;
+
+    return 0;
+}
+
+static int walk_memory_regions_1(struct walk_memory_regions_data *data,
+                                 unsigned long base, int level, void **lp)
+{
+    unsigned long pa, i;  /* i is unsigned long: shifts may exceed int width */
+    int rc;
+
+    if (*lp == NULL) {
+        return walk_memory_regions_end(data, base, 0);
+    }
 
-    for (i = 0; i <= L1_SIZE; i++) {
-        p = (i < L1_SIZE) ? l1_map[i] : NULL;
-        for (j = 0; j < L2_SIZE; j++) {
-            prot1 = (p == NULL) ? 0 : p[j].flags;
-            /*
-             * "region" is one continuous chunk of memory
-             * that has same protection flags set.
-             */
-            if (prot1 != prot) {
-                end = (i << (32 - L1_BITS)) | (j << TARGET_PAGE_BITS);
-                if (start != -1) {
-                    rc = (*fn)(priv, start, end, prot);
-                    /* callback can stop iteration by returning != 0 */
-                    if (rc != 0)
-                        return (rc);
+    if (level == 0) {
+        PageDesc *pd = *lp;
+        for (i = 0; i < L2_SIZE; ++i) {
+            int prot = pd[i].flags;
+
+            pa = base | (i << TARGET_PAGE_BITS);
+            if (prot != data->prot) {
+                rc = walk_memory_regions_end(data, pa, prot);
+                if (rc != 0) {
+                    return rc;
                 }
-                if (prot1 != 0)
-                    start = end;
-                else
-                    start = -1;
-                prot = prot1;
             }
-            if (p == NULL)
-                break;
+        }
+    } else {
+        void **pp = *lp;
+        for (i = 0; i < L2_SIZE; ++i) {
+            pa = base | (i << (TARGET_PAGE_BITS + L2_BITS * level));
+            rc = walk_memory_regions_1(data, pa, level - 1, pp + i);
+            if (rc != 0) {
+                return rc;
+            }
         }
     }
-    return (rc);
+
+    return 0;
+}
+
+int walk_memory_regions(void *priv, walk_memory_regions_fn fn)
+{
+    struct walk_memory_regions_data data;
+    unsigned long i;
+
+    data.fn = fn;
+    data.priv = priv;
+    data.start = -1ul;
+    data.prot = 0;
+
+    for (i = 0; i < V_L1_SIZE; i++) {
+        int rc = walk_memory_regions_1(&data, i << V_L1_SHIFT,
+                                       V_L1_SHIFT / L2_BITS - 1, l1_map + i);
+        if (rc != 0) {
+            return rc;
+        }
+    }
+
+    return walk_memory_regions_end(&data, 0, 0);
 }
 
 static int dump_region(void *priv, unsigned long start,