Patchwork [RFC,V3,07/12] xen: Introduce the Xen mapcache

login
register
mail settings
Submitter Anthony PERARD
Date Sept. 17, 2010, 11:15 a.m.
Message ID <1284722107-28550-8-git-send-email-anthony.perard@citrix.com>
Download mbox | patch
Permalink /patch/65068/
State New
Headers show

Comments

Anthony PERARD - Sept. 17, 2010, 11:15 a.m.
From: Anthony PERARD <anthony.perard@citrix.com>

The mapcache maps chunks of guest memory on demand and unmaps them when
they are no longer needed.

Each call to qemu_get_ram_ptr makes a call to qemu_map_cache with the
lock option, so mapcache will not unmap these ram_ptr.

Signed-off-by: Anthony PERARD <anthony.perard@citrix.com>
---
 Makefile.target |    2 +-
 exec.c          |   36 ++++++-
 hw/xen.h        |    4 +
 xen-all.c       |   63 ++++++++++++
 xen-stub.c      |    4 +
 xen_mapcache.c  |  302 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 xen_mapcache.h  |   26 +++++
 7 files changed, 432 insertions(+), 5 deletions(-)
 create mode 100644 xen_mapcache.c
 create mode 100644 xen_mapcache.h
Blue Swirl - Sept. 17, 2010, 7:07 p.m.
On Fri, Sep 17, 2010 at 11:15 AM,  <anthony.perard@citrix.com> wrote:
> From: Anthony PERARD <anthony.perard@citrix.com>
>
> The mapcache maps chunks of guest memory on demand and unmaps them when
> they are no longer needed.
>
> Each call to qemu_get_ram_ptr makes a call to qemu_map_cache with the
> lock option, so mapcache will not unmap these ram_ptr.
>
> Signed-off-by: Anthony PERARD <anthony.perard@citrix.com>
> ---
>  Makefile.target |    2 +-
>  exec.c          |   36 ++++++-
>  hw/xen.h        |    4 +
>  xen-all.c       |   63 ++++++++++++
>  xen-stub.c      |    4 +
>  xen_mapcache.c  |  302 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  xen_mapcache.h  |   26 +++++
>  7 files changed, 432 insertions(+), 5 deletions(-)
>  create mode 100644 xen_mapcache.c
>  create mode 100644 xen_mapcache.h
>
> diff --git a/Makefile.target b/Makefile.target
> index 6b390e6..ea14393 100644
> --- a/Makefile.target
> +++ b/Makefile.target
> @@ -183,7 +183,7 @@ QEMU_CFLAGS += $(VNC_PNG_CFLAGS)
>
>  # xen backend driver support
>  obj-$(CONFIG_XEN) += xen_machine_pv.o xen_domainbuild.o
> -obj-$(CONFIG_XEN) += xen-all.o
> +obj-$(CONFIG_XEN) += xen-all.o xen_mapcache.o
>  obj-$(CONFIG_NO_XEN) += xen-stub.o
>
>  # xen full virtualized machine
> diff --git a/exec.c b/exec.c
> index 380dab5..f5888eb 100644
> --- a/exec.c
> +++ b/exec.c
> @@ -60,6 +60,9 @@
>  #endif
>  #endif
>
> +#include "hw/xen.h"
> +#include "xen_mapcache.h"
> +
>  //#define DEBUG_TB_INVALIDATE
>  //#define DEBUG_FLUSH
>  //#define DEBUG_TLB
> @@ -2833,6 +2836,7 @@ ram_addr_t qemu_ram_alloc_from_ptr(DeviceState *dev, const char *name,
>         }
>     }
>
> +    new_block->offset = find_ram_offset(size);
>     if (host) {
>         new_block->host = host;
>     } else {
> @@ -2856,15 +2860,17 @@ ram_addr_t qemu_ram_alloc_from_ptr(DeviceState *dev, const char *name,
>                                    PROT_EXEC|PROT_READ|PROT_WRITE,
>                                    MAP_SHARED | MAP_ANONYMOUS, -1, 0);
>  #else
> -            new_block->host = qemu_vmalloc(size);
> +            if (xen_enabled()) {
> +                xen_ram_alloc(new_block->offset, size);
> +            } else {
> +                new_block->host = qemu_vmalloc(size);
> +            }
>  #endif
>  #ifdef MADV_MERGEABLE
>             madvise(new_block->host, size, MADV_MERGEABLE);
>  #endif
>         }
>     }
> -
> -    new_block->offset = find_ram_offset(size);
>     new_block->length = size;
>
>     QLIST_INSERT_HEAD(&ram_list.blocks, new_block, next);
> @@ -2905,7 +2911,11 @@ void qemu_ram_free(ram_addr_t addr)
>  #if defined(TARGET_S390X) && defined(CONFIG_KVM)
>                 munmap(block->host, block->length);
>  #else
> -                qemu_vfree(block->host);
> +                if (xen_enabled()) {
> +                    qemu_invalidate_entry(block->host);
> +                } else {
> +                    qemu_vfree(block->host);
> +                }
>  #endif
>             }
>             qemu_free(block);
> @@ -2931,6 +2941,14 @@ void *qemu_get_ram_ptr(ram_addr_t addr)
>         if (addr - block->offset < block->length) {
>             QLIST_REMOVE(block, next);
>             QLIST_INSERT_HEAD(&ram_list.blocks, block, next);
> +            if (xen_enabled()) {
> +                /* We need to check if the requested address is in the RAM
> +                 * because we don't want to map the entire memory in QEMU.
> +                 */
> +                if (block->offset == 0)

Please add braces around the body of this single-statement 'if'.

> +                    return qemu_map_cache(addr, 0, 1);
> +                block->host = qemu_map_cache(block->offset, block->length, 1);
> +            }
>             return block->host + (addr - block->offset);
>         }
>     }
> @@ -2949,11 +2967,18 @@ ram_addr_t qemu_ram_addr_from_host(void *ptr)
>     uint8_t *host = ptr;
>
>     QLIST_FOREACH(block, &ram_list.blocks, next) {
> +        /* This case happens when the block is not mapped. */
> +        if (block->host == NULL)

Please add braces around the body of this single-statement 'if'.

> +            continue;
>         if (host - block->host < block->length) {
>             return block->offset + (host - block->host);
>         }
>     }
>
> +    if (xen_enabled()) {
> +        return qemu_ram_addr_from_mapcache(ptr);
> +    }
> +
>     fprintf(stderr, "Bad ram pointer %p\n", ptr);
>     abort();
>
> @@ -3728,6 +3753,9 @@ void cpu_physical_memory_unmap(void *buffer, target_phys_addr_t len,
>     if (is_write) {
>         cpu_physical_memory_write(bounce.addr, bounce.buffer, access_len);
>     }
> +    if (xen_enabled()) {
> +        qemu_invalidate_entry(buffer);
> +    }
>     qemu_vfree(bounce.buffer);
>     bounce.buffer = NULL;
>     cpu_notify_map_clients();
> diff --git a/hw/xen.h b/hw/xen.h
> index c5189b1..2b62ff5 100644
> --- a/hw/xen.h
> +++ b/hw/xen.h
> @@ -34,4 +34,8 @@ void xen_piix_pci_write_config_client(uint32_t address, uint32_t val, int len);
>
>  int xen_init(int smp_cpus);
>
> +#ifdef NEED_CPU_H
> +void xen_ram_alloc(ram_addr_t ram_addr, ram_addr_t size);
> +#endif
> +
>  #endif /* QEMU_HW_XEN_H */
> diff --git a/xen-all.c b/xen-all.c
> index 765f87a..4e0b061 100644
> --- a/xen-all.c
> +++ b/xen-all.c
> @@ -12,6 +12,8 @@
>  #include "hw/xen_common.h"
>  #include "hw/xen_backend.h"
>
> +#include "xen_mapcache.h"
> +
>  /* Xen specific function for piix pci */
>
>  int xen_pci_slot_get_pirq(PCIDevice *pci_dev, int irq_num)
> @@ -52,6 +54,63 @@ qemu_irq *i8259_xen_init(void)
>     return qemu_allocate_irqs(i8259_set_irq, NULL, 16);
>  }
>
> +
> +/* Memory Ops */
> +
> +static void xen_ram_init(ram_addr_t ram_size)
> +{
> +    RAMBlock *new_block;
> +    ram_addr_t below_4g_mem_size, above_4g_mem_size = 0;
> +
> +    new_block = qemu_mallocz(sizeof (*new_block));
> +    pstrcpy(new_block->idstr, sizeof (new_block->idstr), "xen.ram");
> +    new_block->host = NULL;
> +    new_block->offset = 0;
> +    new_block->length = ram_size;
> +
> +    QLIST_INSERT_HEAD(&ram_list.blocks, new_block, next);
> +
> +    ram_list.phys_dirty = qemu_realloc(ram_list.phys_dirty,
> +                                       new_block->length >> TARGET_PAGE_BITS);
> +    memset(ram_list.phys_dirty + (new_block->offset >> TARGET_PAGE_BITS),
> +           0xff, new_block->length >> TARGET_PAGE_BITS);
> +
> +    if (ram_size >= 0xe0000000 ) {
> +        above_4g_mem_size = ram_size - 0xe0000000;
> +        below_4g_mem_size = 0xe0000000;
> +    } else {
> +        below_4g_mem_size = ram_size;
> +    }
> +
> +    cpu_register_physical_memory(0, below_4g_mem_size, new_block->offset);
> +#if TARGET_PHYS_ADDR_BITS > 32
> +    if (above_4g_mem_size > 0) {
> +        cpu_register_physical_memory(0x100000000ULL, above_4g_mem_size,
> +                                     new_block->offset + below_4g_mem_size);
> +    }
> +#endif
> +}
> +
> +void xen_ram_alloc(ram_addr_t ram_addr, ram_addr_t size)
> +{
> +    unsigned long nr_pfn;
> +    xen_pfn_t *pfn_list;
> +    int i;
> +
> +    nr_pfn = size >> TARGET_PAGE_BITS;
> +    pfn_list = qemu_malloc(sizeof (*pfn_list) * nr_pfn);
> +
> +    for (i = 0; i < nr_pfn; i++)

Please add braces around the body of this single-statement 'for' loop.

> +        pfn_list[i] = (ram_addr >> TARGET_PAGE_BITS) + i;
> +
> +    if (xc_domain_memory_populate_physmap(xen_xc, xen_domid, nr_pfn, 0, 0, pfn_list)) {
> +        hw_error("xen: failed to populate ram at %lx", ram_addr);
> +    }
> +
> +    qemu_free(pfn_list);
> +}
> +
> +
>  /* Initialise Xen */
>
>  int xen_init(int smp_cpus)
> @@ -62,5 +121,9 @@ int xen_init(int smp_cpus)
>         return -1;
>     }
>
> +    /* Init RAM management */
> +    qemu_map_cache_init();
> +    xen_ram_init(ram_size);
> +
>     return 0;
>  }
> diff --git a/xen-stub.c b/xen-stub.c
> index 07e64bc..c9f477d 100644
> --- a/xen-stub.c
> +++ b/xen-stub.c
> @@ -24,6 +24,10 @@ void xen_piix_pci_write_config_client(uint32_t address, uint32_t val, int len)
>  {
>  }
>
> +void xen_ram_alloc(ram_addr_t ram_addr, ram_addr_t size)
> +{
> +}
> +
>  int xen_init(int smp_cpus)
>  {
>     return -ENOSYS;
> diff --git a/xen_mapcache.c b/xen_mapcache.c
> new file mode 100644
> index 0000000..8e3bf6c
> --- /dev/null
> +++ b/xen_mapcache.c
> @@ -0,0 +1,302 @@
> +#include "config.h"
> +
> +#include "hw/xen_backend.h"
> +#include "blockdev.h"
> +
> +#include <xen/hvm/params.h>
> +#include <sys/mman.h>
> +
> +#include "xen_mapcache.h"
> +
> +
> +//#define MAPCACHE_DEBUG
> +
> +#ifdef MAPCACHE_DEBUG
> +#define DPRINTF(fmt, ...) do { \
> +    fprintf(stderr, "xen_mapcache: " fmt, ## __VA_ARGS__); \
> +} while (0)
> +#else
> +#define DPRINTF(fmt, ...) do { } while (0)
> +#endif
> +
> +#if defined(MAPCACHE)
> +
> +#define BITS_PER_LONG (sizeof(long)*8)

Please add spaces around '*'; the #defines below also need more spaces around their operators.

> +#define BITS_TO_LONGS(bits) \
> +    (((bits)+BITS_PER_LONG-1)/BITS_PER_LONG)
> +#define DECLARE_BITMAP(name,bits) \
> +    unsigned long name[BITS_TO_LONGS(bits)]
> +#define test_bit(bit,map) \
> +    (!!((map)[(bit)/BITS_PER_LONG] & (1UL << ((bit)%BITS_PER_LONG))))
> +
> +typedef struct MapCacheEntry {
> +    unsigned long paddr_index;
> +    uint8_t *vaddr_base;
> +    DECLARE_BITMAP(valid_mapping, MCACHE_BUCKET_SIZE>>XC_PAGE_SHIFT);
> +    uint8_t lock;
> +    struct MapCacheEntry *next;
> +} MapCacheEntry;
> +
> +typedef struct MapCacheRev {
> +    uint8_t *vaddr_req;
> +    unsigned long paddr_index;
> +    QTAILQ_ENTRY(MapCacheRev) next;
> +} MapCacheRev;
> +
> +typedef struct MapCache {
> +    MapCacheEntry *entry;
> +    unsigned long nr_buckets;
> +    QTAILQ_HEAD(map_cache_head, MapCacheRev) locked_entries;
> +
> +    /* For most cases (>99.9%), the page address is the same. */
> +    unsigned long last_address_index;
> +    uint8_t      *last_address_vaddr;
> +} MapCache;
> +
> +static MapCache *mapcache;
> +
> +
> +int qemu_map_cache_init(void)
> +{
> +    unsigned long size;
> +
> +    mapcache = qemu_mallocz(sizeof (MapCache));
> +
> +    QTAILQ_INIT(&mapcache->locked_entries);
> +    mapcache->last_address_index = ~0UL;
> +
> +    mapcache->nr_buckets = (((MAX_MCACHE_SIZE >> XC_PAGE_SHIFT) +
> +                   (1UL << (MCACHE_BUCKET_SHIFT - XC_PAGE_SHIFT)) - 1) >>
> +                  (MCACHE_BUCKET_SHIFT - XC_PAGE_SHIFT));
> +
> +    /*
> +     * Use mmap() directly: lets us allocate a big hash table with no up-front
> +     * cost in storage space. The OS will allocate memory only for the buckets
> +     * that we actually use. All others will contain all zeroes.
> +     */
> +    size = mapcache->nr_buckets * sizeof(MapCacheEntry);
> +    size = (size + XC_PAGE_SIZE - 1) & ~(XC_PAGE_SIZE - 1);
> +    DPRINTF("qemu_map_cache_init, nr_buckets = %lx size %lu\n", mapcache->nr_buckets, size);
> +    mapcache->entry = mmap(NULL, size, PROT_READ|PROT_WRITE,
> +                          MAP_SHARED|MAP_ANON, -1, 0);
> +    if (mapcache->entry == MAP_FAILED) {
> +        errno = ENOMEM;

Is this needed? Can't we just keep whatever value mmap() left in errno?

> +        return -1;
> +    }
> +
> +    return 0;
> +}
> +
> +static void qemu_remap_bucket(MapCacheEntry *entry,
> +                              target_phys_addr_t size,
> +                              unsigned long address_index)
> +{
> +    uint8_t *vaddr_base;
> +    xen_pfn_t *pfns;
> +    int *err;
> +    unsigned int i, j;

There are a lot of size >> XC_PAGE_SHIFT uses here. I think it would
be clearer to add size >>= XC_PAGE_SHIFT or a new variable.

> +
> +    pfns = qemu_mallocz((size >> XC_PAGE_SHIFT) * sizeof (xen_pfn_t));
> +    err = qemu_mallocz((size >> XC_PAGE_SHIFT) * sizeof (int));
> +
> +    if (entry->vaddr_base != NULL) {
> +        errno = munmap(entry->vaddr_base, size);
> +        if (errno) {
> +            fprintf(stderr, "unmap fails %d\n", errno);

munmap() returns -1 on error and sets errno itself, so please don't overwrite errno with its return value. Check the return value instead and report the failure with perror().

> +            exit(-1);
> +        }
> +    }
> +
> +    for (i = 0; i < size >> XC_PAGE_SHIFT; i++) {
> +        pfns[i] = (address_index << (MCACHE_BUCKET_SHIFT-XC_PAGE_SHIFT)) + i;
> +    }
> +
> +    vaddr_base = xc_map_foreign_bulk(xen_xc, xen_domid, PROT_READ|PROT_WRITE,
> +                                     pfns, err,
> +                                     size >> XC_PAGE_SHIFT);
> +    if (vaddr_base == NULL) {
> +        fprintf(stderr, "xc_map_foreign_bulk error %d\n", errno);

perror()?

> +        exit(-1);
> +    }
> +
> +    entry->vaddr_base  = vaddr_base;
> +    entry->paddr_index = address_index;
> +
> +    for (i = 0; i < size >> XC_PAGE_SHIFT; i += BITS_PER_LONG) {
> +        unsigned long word = 0;
> +        j = ((i + BITS_PER_LONG) > (size >> XC_PAGE_SHIFT)) ?
> +            (size >> XC_PAGE_SHIFT) % BITS_PER_LONG : BITS_PER_LONG;

Maybe this would be clearer with 'if'.

> +        while (j > 0) {
> +            word = (word << 1) | !err[i + --j];

You are mixing bitwise OR with logical NOT; is this intentional and correct?

> +        }
> +        entry->valid_mapping[i / BITS_PER_LONG] = word;
> +    }
> +
> +    qemu_free(pfns);
> +    qemu_free(err);
> +}
> +
> +uint8_t *qemu_map_cache(target_phys_addr_t phys_addr, target_phys_addr_t size, uint8_t lock)
> +{
> +    MapCacheEntry *entry, *pentry = NULL;
> +    unsigned long address_index  = phys_addr >> MCACHE_BUCKET_SHIFT;
> +    unsigned long address_offset = phys_addr & (MCACHE_BUCKET_SIZE-1);

'unsigned long' will not be wide enough on a 32-bit host (or in 32-bit
user space) for a 64-bit target. I can't remember whether this was a
supported case for Xen anyway.

How about address_offset >>= XC_PAGE_SHIFT?

> +
> +    if (address_index == mapcache->last_address_index && !lock)

Please add braces around the body of this single-statement 'if'.

> +        return mapcache->last_address_vaddr + address_offset;
> +
> +    entry = &mapcache->entry[address_index % mapcache->nr_buckets];
> +
> +    while (entry && entry->lock && entry->paddr_index != address_index && entry->vaddr_base) {
> +        pentry = entry;
> +        entry = entry->next;
> +    }
> +    if (!entry) {
> +        entry = qemu_mallocz(sizeof(MapCacheEntry));
> +        pentry->next = entry;
> +        qemu_remap_bucket(entry, size ? : MCACHE_BUCKET_SIZE, address_index);
> +    } else if (!entry->lock) {
> +        if (!entry->vaddr_base || entry->paddr_index != address_index || !test_bit(address_offset>>XC_PAGE_SHIFT, entry->valid_mapping))

I suspect this line is too long. Please also add braces.

> +            qemu_remap_bucket(entry, size ? : MCACHE_BUCKET_SIZE, address_index);
> +    }
> +
> +    if (!test_bit(address_offset>>XC_PAGE_SHIFT, entry->valid_mapping)) {
> +        mapcache->last_address_index = ~0UL;
> +        return NULL;
> +    }
> +
> +    mapcache->last_address_index = address_index;
> +    mapcache->last_address_vaddr = entry->vaddr_base;
> +    if (lock) {
> +        MapCacheRev *reventry = qemu_mallocz(sizeof(MapCacheRev));
> +        entry->lock++;
> +        reventry->vaddr_req = mapcache->last_address_vaddr + address_offset;
> +        reventry->paddr_index = mapcache->last_address_index;
> +        QTAILQ_INSERT_TAIL(&mapcache->locked_entries, reventry, next);
> +    }
> +
> +    return mapcache->last_address_vaddr + address_offset;
> +}
> +
> +ram_addr_t qemu_ram_addr_from_mapcache(void *ptr)
> +{
> +    MapCacheRev *reventry;
> +    unsigned long paddr_index;
> +    int found = 0;
> +
> +    QTAILQ_FOREACH(reventry, &mapcache->locked_entries, next) {
> +        if (reventry->vaddr_req == ptr) {
> +            paddr_index = reventry->paddr_index;
> +            found = 1;
> +            break;
> +        }
> +    }
> +    if (!found) {
> +        fprintf(stderr, "qemu_ram_addr_from_mapcache, could not find %p\n", ptr);
> +        QTAILQ_FOREACH(reventry, &mapcache->locked_entries, next) {
> +            DPRINTF("   %lx -> %p is present\n", reventry->paddr_index, reventry->vaddr_req);
> +        }
> +        abort();
> +        return 0;
> +    }
> +
> +    return paddr_index << MCACHE_BUCKET_SHIFT;
> +}
> +
> +void qemu_invalidate_entry(uint8_t *buffer)
> +{
> +    MapCacheEntry *entry = NULL, *pentry = NULL;
> +    MapCacheRev *reventry;
> +    unsigned long paddr_index;
> +    int found = 0;
> +
> +    if (mapcache->last_address_vaddr == buffer)
> +        mapcache->last_address_index =  ~0UL;
> +
> +    QTAILQ_FOREACH(reventry, &mapcache->locked_entries, next) {
> +        if (reventry->vaddr_req == buffer) {
> +            paddr_index = reventry->paddr_index;
> +            found = 1;
> +            break;
> +        }
> +    }
> +    if (!found) {
> +        DPRINTF("qemu_invalidate_entry, could not find %p\n", buffer);
> +        QTAILQ_FOREACH(reventry, &mapcache->locked_entries, next) {
> +            DPRINTF("   %lx -> %p is present\n", reventry->paddr_index, reventry->vaddr_req);
> +        }
> +        return;
> +    }
> +    QTAILQ_REMOVE(&mapcache->locked_entries, reventry, next);
> +    qemu_free(reventry);
> +
> +    entry = &mapcache->entry[paddr_index % mapcache->nr_buckets];
> +    while (entry && entry->paddr_index != paddr_index) {
> +        pentry = entry;
> +        entry = entry->next;
> +    }
> +    if (!entry) {
> +        DPRINTF("Trying to unmap address %p that is not in the mapcache!\n", buffer);
> +        return;
> +    }
> +    entry->lock--;
> +    if (entry->lock > 0 || pentry == NULL)
> +        return;
> +
> +    pentry->next = entry->next;
> +    errno = munmap(entry->vaddr_base, MCACHE_BUCKET_SIZE);
> +    if (errno) {
> +        fprintf(stderr, "unmap fails %d\n", errno);

Please see my previous munmap comments.

> +        exit(-1);
> +    }
> +    qemu_free(entry);
> +}
> +
> +void qemu_invalidate_map_cache(void)
> +{
> +    unsigned long i;
> +    MapCacheRev *reventry;
> +
> +    qemu_aio_flush();
> +
> +    QTAILQ_FOREACH(reventry, &mapcache->locked_entries, next) {
> +        DPRINTF("There should be no locked mappings at this time, but %lx -> %p is present\n", reventry->paddr_index, reventry->vaddr_req);

Probably too long line.

> +    }
> +
> +    mapcache_lock();
> +
> +    for (i = 0; i < mapcache->nr_buckets; i++) {
> +        MapCacheEntry *entry = &mapcache->entry[i];
> +
> +        if (entry->vaddr_base == NULL)
> +            continue;
> +
> +        errno = munmap(entry->vaddr_base, MCACHE_BUCKET_SIZE);
> +        if (errno) {
> +            fprintf(stderr, "unmap fails %d\n", errno);
> +            exit(-1);
> +        }
> +
> +        entry->paddr_index = 0;
> +        entry->vaddr_base  = NULL;
> +    }
> +
> +    mapcache->last_address_index =  ~0UL;
> +    mapcache->last_address_vaddr = NULL;
> +
> +    mapcache_unlock();
> +}
> +#else
> +uint8_t *qemu_map_cache(target_phys_addr_t phys_addr, uint8_t lock)
> +{
> +    return qemu_get_ram_ptr(phys_addr);
> +}
> +
> +void qemu_invalidate_map_cache(void)
> +{
> +}
> +
> +void qemu_invalidate_entry(uint8_t *buffer)
> +{
> +}
> +#endif /* !MAPCACHE */
> diff --git a/xen_mapcache.h b/xen_mapcache.h
> new file mode 100644
> index 0000000..5a6730f
> --- /dev/null
> +++ b/xen_mapcache.h
> @@ -0,0 +1,26 @@
> +#ifndef XEN_MAPCACHE_H
> +#define XEN_MAPCACHE_H
> +
> +#if (defined(__i386__) || defined(__x86_64__))
> +#  define MAPCACHE

xen_mapcache.c could be split into two files, xen-mapcache-stub.c and
xen-mapcache.c. configure could perform the check for i386 or x86_64
host and define CONFIG_XEN_MAPCACHE=y appropriately. Then
Makefile.target would compile the correct file based on that.

Patch

diff --git a/Makefile.target b/Makefile.target
index 6b390e6..ea14393 100644
--- a/Makefile.target
+++ b/Makefile.target
@@ -183,7 +183,7 @@  QEMU_CFLAGS += $(VNC_PNG_CFLAGS)
 
 # xen backend driver support
 obj-$(CONFIG_XEN) += xen_machine_pv.o xen_domainbuild.o
-obj-$(CONFIG_XEN) += xen-all.o
+obj-$(CONFIG_XEN) += xen-all.o xen_mapcache.o
 obj-$(CONFIG_NO_XEN) += xen-stub.o
 
 # xen full virtualized machine
diff --git a/exec.c b/exec.c
index 380dab5..f5888eb 100644
--- a/exec.c
+++ b/exec.c
@@ -60,6 +60,9 @@ 
 #endif
 #endif
 
+#include "hw/xen.h"
+#include "xen_mapcache.h"
+
 //#define DEBUG_TB_INVALIDATE
 //#define DEBUG_FLUSH
 //#define DEBUG_TLB
@@ -2833,6 +2836,7 @@  ram_addr_t qemu_ram_alloc_from_ptr(DeviceState *dev, const char *name,
         }
     }
 
+    new_block->offset = find_ram_offset(size);
     if (host) {
         new_block->host = host;
     } else {
@@ -2856,15 +2860,17 @@  ram_addr_t qemu_ram_alloc_from_ptr(DeviceState *dev, const char *name,
                                    PROT_EXEC|PROT_READ|PROT_WRITE,
                                    MAP_SHARED | MAP_ANONYMOUS, -1, 0);
 #else
-            new_block->host = qemu_vmalloc(size);
+            if (xen_enabled()) {
+                xen_ram_alloc(new_block->offset, size);
+            } else {
+                new_block->host = qemu_vmalloc(size);
+            }
 #endif
 #ifdef MADV_MERGEABLE
             madvise(new_block->host, size, MADV_MERGEABLE);
 #endif
         }
     }
-
-    new_block->offset = find_ram_offset(size);
     new_block->length = size;
 
     QLIST_INSERT_HEAD(&ram_list.blocks, new_block, next);
@@ -2905,7 +2911,11 @@  void qemu_ram_free(ram_addr_t addr)
 #if defined(TARGET_S390X) && defined(CONFIG_KVM)
                 munmap(block->host, block->length);
 #else
-                qemu_vfree(block->host);
+                if (xen_enabled()) {
+                    qemu_invalidate_entry(block->host);
+                } else {
+                    qemu_vfree(block->host);
+                }
 #endif
             }
             qemu_free(block);
@@ -2931,6 +2941,14 @@  void *qemu_get_ram_ptr(ram_addr_t addr)
         if (addr - block->offset < block->length) {
             QLIST_REMOVE(block, next);
             QLIST_INSERT_HEAD(&ram_list.blocks, block, next);
+            if (xen_enabled()) {
+                /* We need to check if the requested address is in the RAM
+                 * because we don't want to map the entire memory in QEMU.
+                 */
+                if (block->offset == 0)
+                    return qemu_map_cache(addr, 0, 1);
+                block->host = qemu_map_cache(block->offset, block->length, 1);
+            }
             return block->host + (addr - block->offset);
         }
     }
@@ -2949,11 +2967,18 @@  ram_addr_t qemu_ram_addr_from_host(void *ptr)
     uint8_t *host = ptr;
 
     QLIST_FOREACH(block, &ram_list.blocks, next) {
+        /* This case happens when the block is not mapped. */
+        if (block->host == NULL)
+            continue;
         if (host - block->host < block->length) {
             return block->offset + (host - block->host);
         }
     }
 
+    if (xen_enabled()) {
+        return qemu_ram_addr_from_mapcache(ptr);
+    }
+
     fprintf(stderr, "Bad ram pointer %p\n", ptr);
     abort();
 
@@ -3728,6 +3753,9 @@  void cpu_physical_memory_unmap(void *buffer, target_phys_addr_t len,
     if (is_write) {
         cpu_physical_memory_write(bounce.addr, bounce.buffer, access_len);
     }
+    if (xen_enabled()) {
+        qemu_invalidate_entry(buffer);
+    }
     qemu_vfree(bounce.buffer);
     bounce.buffer = NULL;
     cpu_notify_map_clients();
diff --git a/hw/xen.h b/hw/xen.h
index c5189b1..2b62ff5 100644
--- a/hw/xen.h
+++ b/hw/xen.h
@@ -34,4 +34,8 @@  void xen_piix_pci_write_config_client(uint32_t address, uint32_t val, int len);
 
 int xen_init(int smp_cpus);
 
+#ifdef NEED_CPU_H
+void xen_ram_alloc(ram_addr_t ram_addr, ram_addr_t size);
+#endif
+
 #endif /* QEMU_HW_XEN_H */
diff --git a/xen-all.c b/xen-all.c
index 765f87a..4e0b061 100644
--- a/xen-all.c
+++ b/xen-all.c
@@ -12,6 +12,8 @@ 
 #include "hw/xen_common.h"
 #include "hw/xen_backend.h"
 
+#include "xen_mapcache.h"
+
 /* Xen specific function for piix pci */
 
 int xen_pci_slot_get_pirq(PCIDevice *pci_dev, int irq_num)
@@ -52,6 +54,63 @@  qemu_irq *i8259_xen_init(void)
     return qemu_allocate_irqs(i8259_set_irq, NULL, 16);
 }
 
+
+/* Memory Ops */
+
+static void xen_ram_init(ram_addr_t ram_size)
+{
+    RAMBlock *new_block;
+    ram_addr_t below_4g_mem_size, above_4g_mem_size = 0;
+
+    new_block = qemu_mallocz(sizeof (*new_block));
+    pstrcpy(new_block->idstr, sizeof (new_block->idstr), "xen.ram");
+    new_block->host = NULL;
+    new_block->offset = 0;
+    new_block->length = ram_size;
+
+    QLIST_INSERT_HEAD(&ram_list.blocks, new_block, next);
+
+    ram_list.phys_dirty = qemu_realloc(ram_list.phys_dirty,
+                                       new_block->length >> TARGET_PAGE_BITS);
+    memset(ram_list.phys_dirty + (new_block->offset >> TARGET_PAGE_BITS),
+           0xff, new_block->length >> TARGET_PAGE_BITS);
+
+    if (ram_size >= 0xe0000000 ) {
+        above_4g_mem_size = ram_size - 0xe0000000;
+        below_4g_mem_size = 0xe0000000;
+    } else {
+        below_4g_mem_size = ram_size;
+    }
+
+    cpu_register_physical_memory(0, below_4g_mem_size, new_block->offset);
+#if TARGET_PHYS_ADDR_BITS > 32
+    if (above_4g_mem_size > 0) {
+        cpu_register_physical_memory(0x100000000ULL, above_4g_mem_size,
+                                     new_block->offset + below_4g_mem_size);
+    }
+#endif
+}
+
+void xen_ram_alloc(ram_addr_t ram_addr, ram_addr_t size)
+{
+    unsigned long nr_pfn;
+    xen_pfn_t *pfn_list;
+    int i;
+
+    nr_pfn = size >> TARGET_PAGE_BITS;
+    pfn_list = qemu_malloc(sizeof (*pfn_list) * nr_pfn);
+
+    for (i = 0; i < nr_pfn; i++)
+        pfn_list[i] = (ram_addr >> TARGET_PAGE_BITS) + i;
+
+    if (xc_domain_memory_populate_physmap(xen_xc, xen_domid, nr_pfn, 0, 0, pfn_list)) {
+        hw_error("xen: failed to populate ram at %lx", ram_addr);
+    }
+
+    qemu_free(pfn_list);
+}
+
+
 /* Initialise Xen */
 
 int xen_init(int smp_cpus)
@@ -62,5 +121,9 @@  int xen_init(int smp_cpus)
         return -1;
     }
 
+    /* Init RAM management */
+    qemu_map_cache_init();
+    xen_ram_init(ram_size);
+
     return 0;
 }
diff --git a/xen-stub.c b/xen-stub.c
index 07e64bc..c9f477d 100644
--- a/xen-stub.c
+++ b/xen-stub.c
@@ -24,6 +24,10 @@  void xen_piix_pci_write_config_client(uint32_t address, uint32_t val, int len)
 {
 }
 
+void xen_ram_alloc(ram_addr_t ram_addr, ram_addr_t size)
+{
+}
+
 int xen_init(int smp_cpus)
 {
     return -ENOSYS;
diff --git a/xen_mapcache.c b/xen_mapcache.c
new file mode 100644
index 0000000..8e3bf6c
--- /dev/null
+++ b/xen_mapcache.c
@@ -0,0 +1,302 @@ 
+#include "config.h"
+
+#include "hw/xen_backend.h"
+#include "blockdev.h"
+
+#include <xen/hvm/params.h>
+#include <sys/mman.h>
+
+#include "xen_mapcache.h"
+
+
+/* Uncomment to make DPRINTF() print trace messages on stderr. */
+//#define MAPCACHE_DEBUG
+
+#ifdef MAPCACHE_DEBUG
+/* Debug build: printf-style trace prefixed with "xen_mapcache: ". */
+#define DPRINTF(fmt, ...) do { \
+    fprintf(stderr, "xen_mapcache: " fmt, ## __VA_ARGS__); \
+} while (0)
+#else
+/* Tracing disabled: DPRINTF() expands to an empty statement. */
+#define DPRINTF(fmt, ...) do { } while (0)
+#endif
+
+#if defined(MAPCACHE)
+
+/* Number of bits in a long (assumes 8-bit bytes). */
+#define BITS_PER_LONG (sizeof(long)*8)
+/* Number of longs needed to hold 'bits' bits, rounded up. */
+#define BITS_TO_LONGS(bits) \
+    (((bits)+BITS_PER_LONG-1)/BITS_PER_LONG)
+/* Declare 'name' as an array of unsigned long with room for 'bits' bits. */
+#define DECLARE_BITMAP(name,bits) \
+    unsigned long name[BITS_TO_LONGS(bits)]
+/* Evaluate to 1 if bit number 'bit' is set in the long array 'map', else 0. */
+#define test_bit(bit,map) \
+    (!!((map)[(bit)/BITS_PER_LONG] & (1UL << ((bit)%BITS_PER_LONG))))
+
+/* One mapped bucket of guest memory; entries that hash to the same slot are chained. */
+typedef struct MapCacheEntry {
+    unsigned long paddr_index;  /* bucket index: phys_addr >> MCACHE_BUCKET_SHIFT */
+    uint8_t *vaddr_base;        /* host address of the mapping, NULL when unmapped */
+    /* One bit per page of the bucket, set when that page was mapped without error. */
+    DECLARE_BITMAP(valid_mapping, MCACHE_BUCKET_SIZE>>XC_PAGE_SHIFT);
+    uint8_t lock;               /* count of outstanding locked lookups on this entry */
+    struct MapCacheEntry *next; /* next entry in the same hash slot, or NULL */
+} MapCacheEntry;
+
+/* Reverse-lookup record created for every locked qemu_map_cache() call. */
+typedef struct MapCacheRev {
+    uint8_t *vaddr_req;             /* exact pointer handed back to the caller */
+    unsigned long paddr_index;      /* bucket index that pointer belongs to */
+    QTAILQ_ENTRY(MapCacheRev) next; /* link in MapCache.locked_entries */
+} MapCacheRev;
+
+/* Global state of the map cache. */
+typedef struct MapCache {
+    MapCacheEntry *entry;       /* hash table of nr_buckets head entries (mmap'ed) */
+    unsigned long nr_buckets;   /* number of slots in 'entry' */
+    /* Records of all locked mappings currently handed out. */
+    QTAILQ_HEAD(map_cache_head, MapCacheRev) locked_entries;
+
+    /* For most cases (>99.9%), the page address is the same. */
+    unsigned long last_address_index; /* bucket index of the last lookup, ~0UL if none */
+    uint8_t      *last_address_vaddr; /* vaddr_base of that bucket */
+} MapCache;
+
+static MapCache *mapcache;
+
+
+/*
+ * Allocate the global mapcache and its bucket hash table.
+ *
+ * Returns 0 on success, or -1 with errno set to ENOMEM when the table
+ * cannot be mmap()ed.
+ */
+int qemu_map_cache_init(void)
+{
+    const unsigned int bucket_page_shift = MCACHE_BUCKET_SHIFT - XC_PAGE_SHIFT;
+    unsigned long table_bytes;
+    void *table;
+
+    mapcache = qemu_mallocz(sizeof (MapCache));
+    mapcache->last_address_index = ~0UL;
+    QTAILQ_INIT(&mapcache->locked_entries);
+
+    /* Buckets needed to cover MAX_MCACHE_SIZE worth of pages, rounded up. */
+    mapcache->nr_buckets = ((MAX_MCACHE_SIZE >> XC_PAGE_SHIFT) +
+                            (1UL << bucket_page_shift) - 1) >> bucket_page_shift;
+
+    /*
+     * The table comes straight from mmap(): a large hash table costs no
+     * storage up front, because the OS only backs the buckets that are
+     * actually touched and every other bucket reads as zeroes.
+     */
+    table_bytes = mapcache->nr_buckets * sizeof(MapCacheEntry);
+    /* Round the table size up to a whole number of pages. */
+    table_bytes = (table_bytes + XC_PAGE_SIZE - 1) & ~(XC_PAGE_SIZE - 1);
+    DPRINTF("qemu_map_cache_init, nr_buckets = %lx size %lu\n", mapcache->nr_buckets, table_bytes);
+
+    table = mmap(NULL, table_bytes, PROT_READ|PROT_WRITE,
+                 MAP_SHARED|MAP_ANON, -1, 0);
+    mapcache->entry = table;
+    if (table == MAP_FAILED) {
+        errno = ENOMEM;
+        return -1;
+    }
+
+    return 0;
+}
+
+/*
+ * (Re)map the guest pages backing one bucket into 'entry'.
+ *
+ * entry:         cache entry to fill; an existing mapping is unmapped first.
+ * size:          number of bytes to map (size >> XC_PAGE_SHIFT pages).
+ * address_index: bucket index; the first pfn mapped is
+ *                address_index << (MCACHE_BUCKET_SHIFT - XC_PAGE_SHIFT).
+ *
+ * On return entry->valid_mapping holds one bit per requested page, set when
+ * xc_map_foreign_bulk() reported no error for that page; all other bits are
+ * clear.  Exits the process if munmap() or xc_map_foreign_bulk() fails.
+ */
+static void qemu_remap_bucket(MapCacheEntry *entry,
+                              target_phys_addr_t size,
+                              unsigned long address_index)
+{
+    target_phys_addr_t nb_pfn = size >> XC_PAGE_SHIFT;
+    uint8_t *vaddr_base;
+    xen_pfn_t *pfns;
+    int *err;
+    unsigned int i, j;
+
+    pfns = qemu_mallocz(nb_pfn * sizeof (xen_pfn_t));
+    err = qemu_mallocz(nb_pfn * sizeof (int));
+
+    /*
+     * munmap() returns -1 and sets errno on failure, so test its return
+     * value instead of assigning it to errno (which lost the real error).
+     * NOTE(review): the old mapping is released with the new 'size'; the
+     * entry does not record the size it was mapped with -- confirm callers
+     * never remap an entry with a different size.
+     */
+    if (entry->vaddr_base != NULL && munmap(entry->vaddr_base, size) != 0) {
+        fprintf(stderr, "unmap fails %d\n", errno);
+        exit(-1);
+    }
+
+    for (i = 0; i < nb_pfn; i++) {
+        pfns[i] = (address_index << (MCACHE_BUCKET_SHIFT-XC_PAGE_SHIFT)) + i;
+    }
+
+    vaddr_base = xc_map_foreign_bulk(xen_xc, xen_domid, PROT_READ|PROT_WRITE,
+                                     pfns, err,
+                                     nb_pfn);
+    if (vaddr_base == NULL) {
+        fprintf(stderr, "xc_map_foreign_bulk error %d\n", errno);
+        exit(-1);
+    }
+
+    entry->vaddr_base  = vaddr_base;
+    entry->paddr_index = address_index;
+
+    /* Drop bits left over from a previous, possibly larger, mapping. */
+    for (i = 0; i < BITS_TO_LONGS(MCACHE_BUCKET_SIZE >> XC_PAGE_SHIFT); i++) {
+        entry->valid_mapping[i] = 0;
+    }
+
+    /* Build the bitmap one word at a time; bit k of a word is page i + k. */
+    for (i = 0; i < nb_pfn; i += BITS_PER_LONG) {
+        unsigned long word = 0;
+        /* The last word may cover fewer than BITS_PER_LONG pages. */
+        j = ((i + BITS_PER_LONG) > nb_pfn) ?
+            nb_pfn % BITS_PER_LONG : BITS_PER_LONG;
+        while (j > 0) {
+            word = (word << 1) | !err[i + --j];
+        }
+        entry->valid_mapping[i / BITS_PER_LONG] = word;
+    }
+
+    qemu_free(pfns);
+    qemu_free(err);
+}
+
+/*
+ * Return a host pointer for guest physical address phys_addr, mapping the
+ * bucket that contains it when necessary.
+ *
+ * phys_addr: guest physical address to translate.
+ * size:      bytes to map if the bucket has to be (re)mapped; 0 selects
+ *            MCACHE_BUCKET_SIZE.
+ * lock:      when non-zero, the entry's lock count is incremented and the
+ *            returned pointer is recorded in mapcache->locked_entries.
+ *
+ * Returns NULL when the page containing phys_addr is not marked valid in
+ * the entry's bitmap.
+ */
+uint8_t *qemu_map_cache(target_phys_addr_t phys_addr, target_phys_addr_t size, uint8_t lock)
+{
+    MapCacheEntry *entry, *pentry = NULL;
+    unsigned long address_index  = phys_addr >> MCACHE_BUCKET_SHIFT;
+    unsigned long address_offset = phys_addr & (MCACHE_BUCKET_SIZE-1);
+
+    /* Fast path: same bucket as the previous lookup and no lock requested. */
+    if (address_index == mapcache->last_address_index && !lock)
+        return mapcache->last_address_vaddr + address_offset;
+
+    entry = &mapcache->entry[address_index % mapcache->nr_buckets];
+
+    /*
+     * Walk past entries that are locked and map a different bucket; stop at
+     * the first entry that is unlocked, unmapped, or already maps this bucket.
+     */
+    while (entry && entry->lock && entry->paddr_index != address_index && entry->vaddr_base) {
+        pentry = entry;
+        entry = entry->next;
+    }
+    if (!entry) {
+        /* Every entry in the chain is pinned elsewhere: append a new one. */
+        entry = qemu_mallocz(sizeof(MapCacheEntry));
+        pentry->next = entry;
+        qemu_remap_bucket(entry, size ? : MCACHE_BUCKET_SIZE, address_index);
+    } else if (!entry->lock) {
+        /* An unlocked entry may be reused; remap unless it already covers the page. */
+        if (!entry->vaddr_base || entry->paddr_index != address_index || !test_bit(address_offset>>XC_PAGE_SHIFT, entry->valid_mapping))
+            qemu_remap_bucket(entry, size ? : MCACHE_BUCKET_SIZE, address_index);
+    }
+
+    /* The requested page could not be mapped: forget the fast-path hint. */
+    if (!test_bit(address_offset>>XC_PAGE_SHIFT, entry->valid_mapping)) {
+        mapcache->last_address_index = ~0UL;
+        return NULL;
+    }
+
+    mapcache->last_address_index = address_index;
+    mapcache->last_address_vaddr = entry->vaddr_base;
+    if (lock) {
+        /* Record the pointer so it can be found again when it is released. */
+        MapCacheRev *reventry = qemu_mallocz(sizeof(MapCacheRev));
+        entry->lock++;
+        reventry->vaddr_req = mapcache->last_address_vaddr + address_offset;
+        reventry->paddr_index = mapcache->last_address_index;
+        QTAILQ_INSERT_TAIL(&mapcache->locked_entries, reventry, next);
+    }
+
+    return mapcache->last_address_vaddr + address_offset;
+}
+
+/*
+ * Translate a pointer previously returned by a locked qemu_map_cache() call
+ * back into a guest RAM address.
+ *
+ * ptr: pointer to look up in mapcache->locked_entries.
+ *
+ * Returns the bucket base address plus the offset of ptr inside the bucket's
+ * mapping.  Aborts if ptr is not a recorded locked mapping.
+ */
+ram_addr_t qemu_ram_addr_from_mapcache(void *ptr)
+{
+    MapCacheEntry *entry;
+    MapCacheRev *reventry;
+    unsigned long paddr_index = 0;
+    int found = 0;
+
+    QTAILQ_FOREACH(reventry, &mapcache->locked_entries, next) {
+        if (reventry->vaddr_req == ptr) {
+            paddr_index = reventry->paddr_index;
+            found = 1;
+            break;
+        }
+    }
+    if (!found) {
+        fprintf(stderr, "qemu_ram_addr_from_mapcache, could not find %p\n", ptr);
+        QTAILQ_FOREACH(reventry, &mapcache->locked_entries, next) {
+            DPRINTF("   %lx -> %p is present\n", reventry->paddr_index, reventry->vaddr_req);
+        }
+        abort();
+    }
+
+    /*
+     * The reverse record only stores the bucket index, so find the mapped
+     * entry for that bucket to recover how far into it ptr points.
+     */
+    entry = &mapcache->entry[paddr_index % mapcache->nr_buckets];
+    while (entry && (entry->paddr_index != paddr_index || !entry->vaddr_base)) {
+        entry = entry->next;
+    }
+    if (!entry) {
+        /* No mapped entry left for this bucket: fall back to the bucket base. */
+        DPRINTF("Trying to find address %p that is not in the mapcache!\n", ptr);
+        return paddr_index << MCACHE_BUCKET_SHIFT;
+    }
+
+    return (paddr_index << MCACHE_BUCKET_SHIFT) +
+           ((uint8_t *)ptr - entry->vaddr_base);
+}
+
+/*
+ * Release one locked mapping previously returned by qemu_map_cache().
+ *
+ * buffer: the exact pointer that was returned for the locked request.
+ *
+ * Removes the reverse-lookup record and decrements the entry's lock count.
+ * When the count reaches zero and the entry is a chained (non-head) entry,
+ * it is unlinked, unmapped and freed; head entries stay mapped.  Returns
+ * silently if buffer is not a recorded locked mapping.  Exits the process
+ * if munmap() fails.
+ */
+void qemu_invalidate_entry(uint8_t *buffer)
+{
+    MapCacheEntry *entry = NULL, *pentry = NULL;
+    MapCacheRev *reventry;
+    unsigned long paddr_index = 0;
+    int found = 0;
+
+    /* Drop the fast-path hint if it refers to this buffer. */
+    if (mapcache->last_address_vaddr == buffer)
+        mapcache->last_address_index =  ~0UL;
+
+    QTAILQ_FOREACH(reventry, &mapcache->locked_entries, next) {
+        if (reventry->vaddr_req == buffer) {
+            paddr_index = reventry->paddr_index;
+            found = 1;
+            break;
+        }
+    }
+    if (!found) {
+        DPRINTF("qemu_invalidate_entry, could not find %p\n", buffer);
+        QTAILQ_FOREACH(reventry, &mapcache->locked_entries, next) {
+            DPRINTF("   %lx -> %p is present\n", reventry->paddr_index, reventry->vaddr_req);
+        }
+        return;
+    }
+    QTAILQ_REMOVE(&mapcache->locked_entries, reventry, next);
+    qemu_free(reventry);
+
+    entry = &mapcache->entry[paddr_index % mapcache->nr_buckets];
+    while (entry && entry->paddr_index != paddr_index) {
+        pentry = entry;
+        entry = entry->next;
+    }
+    if (!entry) {
+        DPRINTF("Trying to unmap address %p that is not in the mapcache!\n", buffer);
+        return;
+    }
+    entry->lock--;
+    /* Still in use, or a head entry that lives inside the table itself. */
+    if (entry->lock > 0 || pentry == NULL)
+        return;
+
+    pentry->next = entry->next;
+    /*
+     * munmap() returns -1 and sets errno on failure; test the return value
+     * rather than overwriting errno with it.
+     */
+    if (munmap(entry->vaddr_base, MCACHE_BUCKET_SIZE) != 0) {
+        fprintf(stderr, "unmap fails %d\n", errno);
+        exit(-1);
+    }
+    qemu_free(entry);
+}
+
+/*
+ * Unmap every head entry of the hash table and reset the fast-path hint.
+ *
+ * Pending AIO is flushed first.  Locked mappings still on record are only
+ * reported through DPRINTF().  Exits the process if munmap() fails.
+ */
+void qemu_invalidate_map_cache(void)
+{
+    unsigned long i;
+    MapCacheRev *reventry;
+
+    qemu_aio_flush();
+
+    QTAILQ_FOREACH(reventry, &mapcache->locked_entries, next) {
+        DPRINTF("There should be no locked mappings at this time, but %lx -> %p is present\n", reventry->paddr_index, reventry->vaddr_req);
+    }
+
+    mapcache_lock();
+
+    for (i = 0; i < mapcache->nr_buckets; i++) {
+        MapCacheEntry *entry = &mapcache->entry[i];
+
+        if (entry->vaddr_base == NULL)
+            continue;
+
+        /*
+         * munmap() returns -1 and sets errno on failure; test the return
+         * value rather than overwriting errno with it.
+         */
+        if (munmap(entry->vaddr_base, MCACHE_BUCKET_SIZE) != 0) {
+            fprintf(stderr, "unmap fails %d\n", errno);
+            exit(-1);
+        }
+
+        entry->paddr_index = 0;
+        entry->vaddr_base  = NULL;
+    }
+
+    mapcache->last_address_index =  ~0UL;
+    mapcache->last_address_vaddr = NULL;
+
+    mapcache_unlock();
+}
+#else
+/*
+ * Fallback used when MAPCACHE is not defined: forwards to
+ * qemu_get_ram_ptr().  'size' and 'lock' are accepted only so the
+ * definition matches the prototype in xen_mapcache.h; they are ignored.
+ */
+uint8_t *qemu_map_cache(target_phys_addr_t phys_addr, target_phys_addr_t size, uint8_t lock)
+{
+    return qemu_get_ram_ptr(phys_addr);
+}
+
+/* Fallback used when MAPCACHE is not defined: nothing is cached, so nothing to do. */
+void qemu_invalidate_map_cache(void)
+{
+}
+
+/* Fallback used when MAPCACHE is not defined: 'buffer' is ignored. */
+void qemu_invalidate_entry(uint8_t *buffer)
+{
+}
+#endif /* !MAPCACHE */
diff --git a/xen_mapcache.h b/xen_mapcache.h
new file mode 100644
index 0000000..5a6730f
--- /dev/null
+++ b/xen_mapcache.h
@@ -0,0 +1,26 @@ 
+#ifndef XEN_MAPCACHE_H
+#define XEN_MAPCACHE_H
+
+/*
+ * The map cache is enabled (MAPCACHE defined) only on x86 hosts.
+ * MAX_MCACHE_SIZE is the amount of guest memory the bucket table is sized
+ * for; MCACHE_BUCKET_SHIFT is log2 of the bucket size in bytes.
+ */
+#if (defined(__i386__) || defined(__x86_64__))
+#  define MAPCACHE
+#  if defined(__i386__)
+#    define MAX_MCACHE_SIZE    0x40000000 /* 1GB max for x86 */
+#    define MCACHE_BUCKET_SHIFT 16
+#  elif defined(__x86_64__)
+#    define MAX_MCACHE_SIZE    0x1000000000 /* 64GB max for x86_64 */
+#    define MCACHE_BUCKET_SHIFT 20
+#  endif
+/* Size in bytes of one bucket. */
+#  define MCACHE_BUCKET_SIZE (1UL << MCACHE_BUCKET_SHIFT)
+#endif
+
+/* Set up the map cache; returns 0 on success, -1 on failure. */
+int      qemu_map_cache_init(void);
+/* Map guest physical address phys_addr; lock != 0 pins the mapping. */
+uint8_t  *qemu_map_cache(target_phys_addr_t phys_addr, target_phys_addr_t size, uint8_t lock);
+/* Translate a pointer from a locked qemu_map_cache() call back to a RAM address. */
+ram_addr_t qemu_ram_addr_from_mapcache(void *ptr);
+/* Release one locked mapping identified by the pointer that was returned. */
+void     qemu_invalidate_entry(uint8_t *buffer);
+/* Unmap the cached buckets. */
+void     qemu_invalidate_map_cache(void);
+
+/* Locking hooks around cache invalidation; both currently expand to no-ops. */
+#define mapcache_lock()   ((void)0)
+#define mapcache_unlock() ((void)0)
+
+
+#endif /* !XEN_MAPCACHE_H */