diff mbox series

[v3,2/5] softmmu: Support concurrent bounce buffers

Message ID 20230907130410.498935-3-mnissler@rivosinc.com
State New
Headers show
Series Support message-based DMA in vfio-user server | expand

Commit Message

Mattias Nissler Sept. 7, 2023, 1:04 p.m. UTC
When DMA memory can't be directly accessed, as is the case when
running the device model in a separate process without shareable DMA
file descriptors, bounce buffering is used.

It is not uncommon for device models to request mapping of several DMA
regions at the same time. Examples include:
 * net devices, e.g. when transmitting a packet that is split across
   several TX descriptors (observed with igb)
 * USB host controllers, when handling a packet with multiple data TRBs
   (observed with xhci)

Previously, qemu only provided a single bounce buffer per AddressSpace
and would fail DMA map requests while the buffer was already in use. In
turn, this would cause DMA failures that ultimately manifest as hardware
errors from the guest perspective.

This change allocates DMA bounce buffers dynamically instead of
supporting only a single buffer. Thus, multiple DMA mappings work
correctly also when RAM can't be mmap()-ed.

The total bounce buffer allocation size is limited individually for each
AddressSpace. The default limit is 4096 bytes, matching the previous
maximum buffer size. A new x-max-bounce-buffer-size parameter is
provided to configure the limit for PCI devices.

Signed-off-by: Mattias Nissler <mnissler@rivosinc.com>
---
 hw/pci/pci.c                |  8 ++++
 include/exec/memory.h       | 14 ++----
 include/hw/pci/pci_device.h |  3 ++
 softmmu/memory.c            |  3 +-
 softmmu/physmem.c           | 94 +++++++++++++++++++++++++------------
 5 files changed, 80 insertions(+), 42 deletions(-)

Comments

Peter Xu Sept. 13, 2023, 7:11 p.m. UTC | #1
On Thu, Sep 07, 2023 at 06:04:07AM -0700, Mattias Nissler wrote:
> When DMA memory can't be directly accessed, as is the case when
> running the device model in a separate process without shareable DMA
> file descriptors, bounce buffering is used.
> 
> It is not uncommon for device models to request mapping of several DMA
> regions at the same time. Examples include:
>  * net devices, e.g. when transmitting a packet that is split across
>    several TX descriptors (observed with igb)
>  * USB host controllers, when handling a packet with multiple data TRBs
>    (observed with xhci)
> 
> Previously, qemu only provided a single bounce buffer per AddressSpace
> and would fail DMA map requests while the buffer was already in use. In
> turn, this would cause DMA failures that ultimately manifest as hardware
> errors from the guest perspective.
> 
> This change allocates DMA bounce buffers dynamically instead of
> supporting only a single buffer. Thus, multiple DMA mappings work
> correctly also when RAM can't be mmap()-ed.
> 
> The total bounce buffer allocation size is limited individually for each
> AddressSpace. The default limit is 4096 bytes, matching the previous
> maximum buffer size. A new x-max-bounce-buffer-size parameter is
> provided to configure the limit for PCI devices.
> 
> Signed-off-by: Mattias Nissler <mnissler@rivosinc.com>
> ---
>  hw/pci/pci.c                |  8 ++++
>  include/exec/memory.h       | 14 ++----
>  include/hw/pci/pci_device.h |  3 ++
>  softmmu/memory.c            |  3 +-
>  softmmu/physmem.c           | 94 +++++++++++++++++++++++++------------
>  5 files changed, 80 insertions(+), 42 deletions(-)
> 
> diff --git a/hw/pci/pci.c b/hw/pci/pci.c
> index 881d774fb6..8c4541b394 100644
> --- a/hw/pci/pci.c
> +++ b/hw/pci/pci.c
> @@ -85,6 +85,8 @@ static Property pci_props[] = {
>                      QEMU_PCIE_ERR_UNC_MASK_BITNR, true),
>      DEFINE_PROP_BIT("x-pcie-ari-nextfn-1", PCIDevice, cap_present,
>                      QEMU_PCIE_ARI_NEXTFN_1_BITNR, false),
> +    DEFINE_PROP_SIZE("x-max-bounce-buffer-size", PCIDevice,
> +                     max_bounce_buffer_size, 4096),
>      DEFINE_PROP_END_OF_LIST()
>  };
>  
> @@ -1208,6 +1210,8 @@ static PCIDevice *do_pci_register_device(PCIDevice *pci_dev,
>                         "bus master container", UINT64_MAX);
>      address_space_init(&pci_dev->bus_master_as,
>                         &pci_dev->bus_master_container_region, pci_dev->name);
> +    pci_dev->bus_master_as.max_bounce_buffer_size =
> +        pci_dev->max_bounce_buffer_size;
>  
>      if (phase_check(PHASE_MACHINE_READY)) {
>          pci_init_bus_master(pci_dev);
> @@ -2664,6 +2668,10 @@ static void pci_device_class_init(ObjectClass *klass, void *data)
>      k->unrealize = pci_qdev_unrealize;
>      k->bus_type = TYPE_PCI_BUS;
>      device_class_set_props(k, pci_props);
> +    object_class_property_set_description(
> +        klass, "x-max-bounce-buffer-size",
> +        "Maximum buffer size allocated for bounce buffers used for mapped "
> +        "access to indirect DMA memory");
>  }
>  
>  static void pci_device_class_base_init(ObjectClass *klass, void *data)
> diff --git a/include/exec/memory.h b/include/exec/memory.h
> index 7d68936157..5577542b5e 100644
> --- a/include/exec/memory.h
> +++ b/include/exec/memory.h
> @@ -1081,14 +1081,6 @@ typedef struct AddressSpaceMapClient {
>      QLIST_ENTRY(AddressSpaceMapClient) link;
>  } AddressSpaceMapClient;
>  
> -typedef struct {
> -    MemoryRegion *mr;
> -    void *buffer;
> -    hwaddr addr;
> -    hwaddr len;
> -    bool in_use;
> -} BounceBuffer;
> -
>  /**
>   * struct AddressSpace: describes a mapping of addresses to #MemoryRegion objects
>   */
> @@ -1106,8 +1098,10 @@ struct AddressSpace {
>      QTAILQ_HEAD(, MemoryListener) listeners;
>      QTAILQ_ENTRY(AddressSpace) address_spaces_link;
>  
> -    /* Bounce buffer to use for this address space. */
> -    BounceBuffer bounce;
> +    /* Maximum DMA bounce buffer size used for indirect memory map requests */
> +    uint64_t max_bounce_buffer_size;
> +    /* Total size of bounce buffers currently allocated, atomically accessed */
> +    uint64_t bounce_buffer_size;
>      /* List of callbacks to invoke when buffers free up */
>      QemuMutex map_client_list_lock;
>      QLIST_HEAD(, AddressSpaceMapClient) map_client_list;
> diff --git a/include/hw/pci/pci_device.h b/include/hw/pci/pci_device.h
> index d3dd0f64b2..f4027c5379 100644
> --- a/include/hw/pci/pci_device.h
> +++ b/include/hw/pci/pci_device.h
> @@ -160,6 +160,9 @@ struct PCIDevice {
>      /* ID of standby device in net_failover pair */
>      char *failover_pair_id;
>      uint32_t acpi_index;
> +
> +    /* Maximum DMA bounce buffer size used for indirect memory map requests */
> +    uint64_t max_bounce_buffer_size;
>  };
>  
>  static inline int pci_intx(PCIDevice *pci_dev)
> diff --git a/softmmu/memory.c b/softmmu/memory.c
> index 5c9622c3d6..e02799359c 100644
> --- a/softmmu/memory.c
> +++ b/softmmu/memory.c
> @@ -3105,7 +3105,8 @@ void address_space_init(AddressSpace *as, MemoryRegion *root, const char *name)
>      as->ioeventfds = NULL;
>      QTAILQ_INIT(&as->listeners);
>      QTAILQ_INSERT_TAIL(&address_spaces, as, address_spaces_link);
> -    as->bounce.in_use = false;
> +    as->max_bounce_buffer_size = 4096;

Instead of hard-code this 4k again (besides the pci property), maybe we can
have address_space_init_with_bouncebuffer() and always pass it in from pci
do_realize?  Then by default no bounce buffer supported for the AS only if
requested.

> +    as->bounce_buffer_size = 0;
>      qemu_mutex_init(&as->map_client_list_lock);
>      QLIST_INIT(&as->map_client_list);
>      as->name = g_strdup(name ? name : "anonymous");
> diff --git a/softmmu/physmem.c b/softmmu/physmem.c
> index f40cc564b8..e3d1cf5fba 100644
> --- a/softmmu/physmem.c
> +++ b/softmmu/physmem.c
> @@ -2926,6 +2926,20 @@ void cpu_flush_icache_range(hwaddr start, hwaddr len)
>                                       NULL, len, FLUSH_CACHE);
>  }
>  
> +/*
> + * A magic value stored in the first 8 bytes of the bounce buffer struct. Used
> + * to detect illegal pointers passed to address_space_unmap.
> + */
> +#define BOUNCE_BUFFER_MAGIC 0xb4017ceb4ffe12ed
> +
> +typedef struct {
> +    uint64_t magic;
> +    MemoryRegion *mr;
> +    hwaddr addr;
> +    size_t len;
> +    uint8_t buffer[];
> +} BounceBuffer;
> +
>  static void
>  address_space_unregister_map_client_do(AddressSpaceMapClient *client)
>  {
> @@ -2953,7 +2967,7 @@ void address_space_register_map_client(AddressSpace *as, QEMUBH *bh)
>      QLIST_INSERT_HEAD(&as->map_client_list, client, link);
>      /* Write map_client_list before reading bounce_buffer_size.  */
>      smp_mb();
> -    if (!qatomic_read(&as->bounce.in_use)) {
> +    if (qatomic_read(&as->bounce_buffer_size) < as->max_bounce_buffer_size) {
>          address_space_notify_map_clients_locked(as);
>      }
>      qemu_mutex_unlock(&as->map_client_list_lock);
> @@ -3081,31 +3095,36 @@ void *address_space_map(AddressSpace *as,
>      RCU_READ_LOCK_GUARD();
>      fv = address_space_to_flatview(as);
>      mr = flatview_translate(fv, addr, &xlat, &l, is_write, attrs);
> +    memory_region_ref(mr);
>  
>      if (!memory_access_is_direct(mr, is_write)) {
> -        if (qatomic_xchg(&as->bounce.in_use, true)) {
> +        size_t size = qatomic_add_fetch(&as->bounce_buffer_size, l);
> +        if (size > as->max_bounce_buffer_size) {
> +            size_t excess = size - as->max_bounce_buffer_size;
> +            l -= excess;
> +            qatomic_sub(&as->bounce_buffer_size, excess);
> +        }
> +
> +        if (l == 0) {
>              *plen = 0;
>              return NULL;

MR refcount leak?

>          }
> -        /* Avoid unbounded allocations */
> -        l = MIN(l, TARGET_PAGE_SIZE);
> -        as->bounce.buffer = qemu_memalign(TARGET_PAGE_SIZE, l);
> -        as->bounce.addr = addr;
> -        as->bounce.len = l;
>  
> -        memory_region_ref(mr);
> -        as->bounce.mr = mr;
> +        BounceBuffer *bounce = g_malloc0(l + sizeof(BounceBuffer));
> +        bounce->magic = BOUNCE_BUFFER_MAGIC;
> +        bounce->mr = mr;
> +        bounce->addr = addr;
> +        bounce->len = l;
> +
>          if (!is_write) {
>              flatview_read(fv, addr, MEMTXATTRS_UNSPECIFIED,
> -                          as->bounce.buffer, l);
> +                          bounce->buffer, l);
>          }
>  
>          *plen = l;
> -        return as->bounce.buffer;
> +        return bounce->buffer;
>      }
>  
> -
> -    memory_region_ref(mr);
>      *plen = flatview_extend_translation(fv, addr, len, mr, xlat,
>                                          l, is_write, attrs);
>      fuzz_dma_read_cb(addr, *plen, mr);
> @@ -3119,31 +3138,44 @@ void *address_space_map(AddressSpace *as,
>  void address_space_unmap(AddressSpace *as, void *buffer, hwaddr len,
>                           bool is_write, hwaddr access_len)
>  {
> -    if (buffer != as->bounce.buffer) {
> -        MemoryRegion *mr;
> -        ram_addr_t addr1;
> +    MemoryRegion *mr;
> +    ram_addr_t addr1;
> +
> +    mr = memory_region_from_host(buffer, &addr1);
> +    if (mr == NULL) {
> +        BounceBuffer *bounce = container_of(buffer, BounceBuffer, buffer);
> +        if (bounce->magic != BOUNCE_BUFFER_MAGIC) {
> +            error_report(
> +                "Unmap request for %p, which neither corresponds to a memory "
> +                "region, nor looks like a bounce buffer, ignoring!",
> +                buffer);
> +            return;

Would assert() be better here? So that when trigger we can have the user
stack already.

> +        }
>  
> -        mr = memory_region_from_host(buffer, &addr1);
> -        assert(mr != NULL);
>          if (is_write) {
> -            invalidate_and_set_dirty(mr, addr1, access_len);
> +            address_space_write(as, bounce->addr, MEMTXATTRS_UNSPECIFIED,
> +                                bounce->buffer, access_len);
>          }
> -        if (xen_enabled()) {
> -            xen_invalidate_map_cache_entry(buffer);
> +
> +        memory_region_unref(bounce->mr);
> +        uint64_t previous_buffer_size =
> +            qatomic_fetch_sub(&as->bounce_buffer_size, bounce->len);
> +        if (previous_buffer_size == as->max_bounce_buffer_size) {

Can there be race condition that someone just temporarily boosted
bounce_buffer_size, so that it won't exactly equal to max but actually
larger (but we should still notify)?

If the current context has already a bounce buffer and is going to unmap it
(len>0), IIUC it means we should always notify because there will
definitely be some space left, and the requesters can retry to see whether
the size fit.

> +            /* Write bounce_buffer_size before reading map_client_list. */
> +            smp_mb();

I know it comes from the old code.. but I don't know why this is needed;
mutex lock should contain an mb() already before the list iterations later.
Just to raise it up, maybe Paolo would like to comment.

> +            address_space_notify_map_clients(as);
>          }
> -        memory_region_unref(mr);
> +        bounce->magic = ~BOUNCE_BUFFER_MAGIC;
> +        g_free(bounce);
>          return;
>      }
> +
> +    if (xen_enabled()) {
> +        xen_invalidate_map_cache_entry(buffer);
> +    }
>      if (is_write) {
> -        address_space_write(as, as->bounce.addr, MEMTXATTRS_UNSPECIFIED,
> -                            as->bounce.buffer, access_len);
> -    }
> -    qemu_vfree(as->bounce.buffer);
> -    as->bounce.buffer = NULL;
> -    memory_region_unref(as->bounce.mr);
> -    /* Clear in_use before reading map_client_list.  */
> -    qatomic_set_mb(&as->bounce.in_use, false);
> -    address_space_notify_map_clients(as);
> +        invalidate_and_set_dirty(mr, addr1, access_len);
> +    }
>  }
>  
>  void *cpu_physical_memory_map(hwaddr addr,
> -- 
> 2.34.1
>
Stefan Hajnoczi Sept. 14, 2023, 6:49 p.m. UTC | #2
On Thu, Sep 07, 2023 at 06:04:07AM -0700, Mattias Nissler wrote:
> When DMA memory can't be directly accessed, as is the case when
> running the device model in a separate process without shareable DMA
> file descriptors, bounce buffering is used.
> 
> It is not uncommon for device models to request mapping of several DMA
> regions at the same time. Examples include:
>  * net devices, e.g. when transmitting a packet that is split across
>    several TX descriptors (observed with igb)
>  * USB host controllers, when handling a packet with multiple data TRBs
>    (observed with xhci)
> 
> Previously, qemu only provided a single bounce buffer per AddressSpace
> and would fail DMA map requests while the buffer was already in use. In
> turn, this would cause DMA failures that ultimately manifest as hardware
> errors from the guest perspective.
> 
> This change allocates DMA bounce buffers dynamically instead of
> supporting only a single buffer. Thus, multiple DMA mappings work
> correctly also when RAM can't be mmap()-ed.
> 
> The total bounce buffer allocation size is limited individually for each
> AddressSpace. The default limit is 4096 bytes, matching the previous
> maximum buffer size. A new x-max-bounce-buffer-size parameter is
> provided to configure the limit for PCI devices.
> 
> Signed-off-by: Mattias Nissler <mnissler@rivosinc.com>
> ---
>  hw/pci/pci.c                |  8 ++++
>  include/exec/memory.h       | 14 ++----
>  include/hw/pci/pci_device.h |  3 ++
>  softmmu/memory.c            |  3 +-
>  softmmu/physmem.c           | 94 +++++++++++++++++++++++++------------
>  5 files changed, 80 insertions(+), 42 deletions(-)
> 
> diff --git a/hw/pci/pci.c b/hw/pci/pci.c
> index 881d774fb6..8c4541b394 100644
> --- a/hw/pci/pci.c
> +++ b/hw/pci/pci.c
> @@ -85,6 +85,8 @@ static Property pci_props[] = {
>                      QEMU_PCIE_ERR_UNC_MASK_BITNR, true),
>      DEFINE_PROP_BIT("x-pcie-ari-nextfn-1", PCIDevice, cap_present,
>                      QEMU_PCIE_ARI_NEXTFN_1_BITNR, false),
> +    DEFINE_PROP_SIZE("x-max-bounce-buffer-size", PCIDevice,
> +                     max_bounce_buffer_size, 4096),
>      DEFINE_PROP_END_OF_LIST()
>  };
>  
> @@ -1208,6 +1210,8 @@ static PCIDevice *do_pci_register_device(PCIDevice *pci_dev,
>                         "bus master container", UINT64_MAX);
>      address_space_init(&pci_dev->bus_master_as,
>                         &pci_dev->bus_master_container_region, pci_dev->name);
> +    pci_dev->bus_master_as.max_bounce_buffer_size =
> +        pci_dev->max_bounce_buffer_size;
>  
>      if (phase_check(PHASE_MACHINE_READY)) {
>          pci_init_bus_master(pci_dev);
> @@ -2664,6 +2668,10 @@ static void pci_device_class_init(ObjectClass *klass, void *data)
>      k->unrealize = pci_qdev_unrealize;
>      k->bus_type = TYPE_PCI_BUS;
>      device_class_set_props(k, pci_props);
> +    object_class_property_set_description(
> +        klass, "x-max-bounce-buffer-size",
> +        "Maximum buffer size allocated for bounce buffers used for mapped "
> +        "access to indirect DMA memory");
>  }
>  
>  static void pci_device_class_base_init(ObjectClass *klass, void *data)
> diff --git a/include/exec/memory.h b/include/exec/memory.h
> index 7d68936157..5577542b5e 100644
> --- a/include/exec/memory.h
> +++ b/include/exec/memory.h
> @@ -1081,14 +1081,6 @@ typedef struct AddressSpaceMapClient {
>      QLIST_ENTRY(AddressSpaceMapClient) link;
>  } AddressSpaceMapClient;
>  
> -typedef struct {
> -    MemoryRegion *mr;
> -    void *buffer;
> -    hwaddr addr;
> -    hwaddr len;
> -    bool in_use;
> -} BounceBuffer;
> -
>  /**
>   * struct AddressSpace: describes a mapping of addresses to #MemoryRegion objects
>   */
> @@ -1106,8 +1098,10 @@ struct AddressSpace {
>      QTAILQ_HEAD(, MemoryListener) listeners;
>      QTAILQ_ENTRY(AddressSpace) address_spaces_link;
>  
> -    /* Bounce buffer to use for this address space. */
> -    BounceBuffer bounce;
> +    /* Maximum DMA bounce buffer size used for indirect memory map requests */
> +    uint64_t max_bounce_buffer_size;
> +    /* Total size of bounce buffers currently allocated, atomically accessed */
> +    uint64_t bounce_buffer_size;
>      /* List of callbacks to invoke when buffers free up */
>      QemuMutex map_client_list_lock;
>      QLIST_HEAD(, AddressSpaceMapClient) map_client_list;
> diff --git a/include/hw/pci/pci_device.h b/include/hw/pci/pci_device.h
> index d3dd0f64b2..f4027c5379 100644
> --- a/include/hw/pci/pci_device.h
> +++ b/include/hw/pci/pci_device.h
> @@ -160,6 +160,9 @@ struct PCIDevice {
>      /* ID of standby device in net_failover pair */
>      char *failover_pair_id;
>      uint32_t acpi_index;
> +
> +    /* Maximum DMA bounce buffer size used for indirect memory map requests */
> +    uint64_t max_bounce_buffer_size;
>  };
>  
>  static inline int pci_intx(PCIDevice *pci_dev)
> diff --git a/softmmu/memory.c b/softmmu/memory.c
> index 5c9622c3d6..e02799359c 100644
> --- a/softmmu/memory.c
> +++ b/softmmu/memory.c
> @@ -3105,7 +3105,8 @@ void address_space_init(AddressSpace *as, MemoryRegion *root, const char *name)
>      as->ioeventfds = NULL;
>      QTAILQ_INIT(&as->listeners);
>      QTAILQ_INSERT_TAIL(&address_spaces, as, address_spaces_link);
> -    as->bounce.in_use = false;
> +    as->max_bounce_buffer_size = 4096;
> +    as->bounce_buffer_size = 0;
>      qemu_mutex_init(&as->map_client_list_lock);
>      QLIST_INIT(&as->map_client_list);
>      as->name = g_strdup(name ? name : "anonymous");
> diff --git a/softmmu/physmem.c b/softmmu/physmem.c
> index f40cc564b8..e3d1cf5fba 100644
> --- a/softmmu/physmem.c
> +++ b/softmmu/physmem.c
> @@ -2926,6 +2926,20 @@ void cpu_flush_icache_range(hwaddr start, hwaddr len)
>                                       NULL, len, FLUSH_CACHE);
>  }
>  
> +/*
> + * A magic value stored in the first 8 bytes of the bounce buffer struct. Used
> + * to detect illegal pointers passed to address_space_unmap.
> + */
> +#define BOUNCE_BUFFER_MAGIC 0xb4017ceb4ffe12ed
> +
> +typedef struct {
> +    uint64_t magic;
> +    MemoryRegion *mr;
> +    hwaddr addr;
> +    size_t len;
> +    uint8_t buffer[];
> +} BounceBuffer;
> +
>  static void
>  address_space_unregister_map_client_do(AddressSpaceMapClient *client)
>  {
> @@ -2953,7 +2967,7 @@ void address_space_register_map_client(AddressSpace *as, QEMUBH *bh)
>      QLIST_INSERT_HEAD(&as->map_client_list, client, link);
>      /* Write map_client_list before reading bounce_buffer_size.  */
>      smp_mb();
> -    if (!qatomic_read(&as->bounce.in_use)) {
> +    if (qatomic_read(&as->bounce_buffer_size) < as->max_bounce_buffer_size) {
>          address_space_notify_map_clients_locked(as);
>      }
>      qemu_mutex_unlock(&as->map_client_list_lock);
> @@ -3081,31 +3095,36 @@ void *address_space_map(AddressSpace *as,
>      RCU_READ_LOCK_GUARD();
>      fv = address_space_to_flatview(as);
>      mr = flatview_translate(fv, addr, &xlat, &l, is_write, attrs);
> +    memory_region_ref(mr);
>  
>      if (!memory_access_is_direct(mr, is_write)) {
> -        if (qatomic_xchg(&as->bounce.in_use, true)) {
> +        size_t size = qatomic_add_fetch(&as->bounce_buffer_size, l);
> +        if (size > as->max_bounce_buffer_size) {
> +            size_t excess = size - as->max_bounce_buffer_size;
> +            l -= excess;
> +            qatomic_sub(&as->bounce_buffer_size, excess);

I think two threads can race here. as->bounce_buffer_size will be
corrupted (smaller than it should be) and l will be wrong as well. A
cmpxchg loop would solve the race.

> +        }
> +
> +        if (l == 0) {
>              *plen = 0;
>              return NULL;
>          }
> -        /* Avoid unbounded allocations */
> -        l = MIN(l, TARGET_PAGE_SIZE);
> -        as->bounce.buffer = qemu_memalign(TARGET_PAGE_SIZE, l);
> -        as->bounce.addr = addr;
> -        as->bounce.len = l;
>  
> -        memory_region_ref(mr);
> -        as->bounce.mr = mr;
> +        BounceBuffer *bounce = g_malloc0(l + sizeof(BounceBuffer));
> +        bounce->magic = BOUNCE_BUFFER_MAGIC;
> +        bounce->mr = mr;
> +        bounce->addr = addr;
> +        bounce->len = l;
> +
>          if (!is_write) {
>              flatview_read(fv, addr, MEMTXATTRS_UNSPECIFIED,
> -                          as->bounce.buffer, l);
> +                          bounce->buffer, l);
>          }
>  
>          *plen = l;
> -        return as->bounce.buffer;
> +        return bounce->buffer;
>      }
>  
> -
> -    memory_region_ref(mr);
>      *plen = flatview_extend_translation(fv, addr, len, mr, xlat,
>                                          l, is_write, attrs);
>      fuzz_dma_read_cb(addr, *plen, mr);
> @@ -3119,31 +3138,44 @@ void *address_space_map(AddressSpace *as,
>  void address_space_unmap(AddressSpace *as, void *buffer, hwaddr len,
>                           bool is_write, hwaddr access_len)
>  {
> -    if (buffer != as->bounce.buffer) {
> -        MemoryRegion *mr;
> -        ram_addr_t addr1;
> +    MemoryRegion *mr;
> +    ram_addr_t addr1;
> +
> +    mr = memory_region_from_host(buffer, &addr1);
> +    if (mr == NULL) {
> +        BounceBuffer *bounce = container_of(buffer, BounceBuffer, buffer);
> +        if (bounce->magic != BOUNCE_BUFFER_MAGIC) {
> +            error_report(
> +                "Unmap request for %p, which neither corresponds to a memory "
> +                "region, nor looks like a bounce buffer, ignoring!",
> +                buffer);
> +            return;
> +        }
>  
> -        mr = memory_region_from_host(buffer, &addr1);
> -        assert(mr != NULL);
>          if (is_write) {
> -            invalidate_and_set_dirty(mr, addr1, access_len);
> +            address_space_write(as, bounce->addr, MEMTXATTRS_UNSPECIFIED,
> +                                bounce->buffer, access_len);
>          }
> -        if (xen_enabled()) {
> -            xen_invalidate_map_cache_entry(buffer);
> +
> +        memory_region_unref(bounce->mr);
> +        uint64_t previous_buffer_size =
> +            qatomic_fetch_sub(&as->bounce_buffer_size, bounce->len);
> +        if (previous_buffer_size == as->max_bounce_buffer_size) {
> +            /* Write bounce_buffer_size before reading map_client_list. */
> +            smp_mb();
> +            address_space_notify_map_clients(as);
>          }
> -        memory_region_unref(mr);
> +        bounce->magic = ~BOUNCE_BUFFER_MAGIC;
> +        g_free(bounce);
>          return;
>      }
> +
> +    if (xen_enabled()) {
> +        xen_invalidate_map_cache_entry(buffer);
> +    }
>      if (is_write) {
> -        address_space_write(as, as->bounce.addr, MEMTXATTRS_UNSPECIFIED,
> -                            as->bounce.buffer, access_len);
> -    }
> -    qemu_vfree(as->bounce.buffer);
> -    as->bounce.buffer = NULL;
> -    memory_region_unref(as->bounce.mr);
> -    /* Clear in_use before reading map_client_list.  */
> -    qatomic_set_mb(&as->bounce.in_use, false);
> -    address_space_notify_map_clients(as);
> +        invalidate_and_set_dirty(mr, addr1, access_len);
> +    }
>  }
>  
>  void *cpu_physical_memory_map(hwaddr addr,
> -- 
> 2.34.1
>
Mattias Nissler Sept. 15, 2023, 9:32 a.m. UTC | #3
On Wed, Sep 13, 2023 at 9:11 PM Peter Xu <peterx@redhat.com> wrote:
>
> On Thu, Sep 07, 2023 at 06:04:07AM -0700, Mattias Nissler wrote:
> > When DMA memory can't be directly accessed, as is the case when
> > running the device model in a separate process without shareable DMA
> > file descriptors, bounce buffering is used.
> >
> > It is not uncommon for device models to request mapping of several DMA
> > regions at the same time. Examples include:
> >  * net devices, e.g. when transmitting a packet that is split across
> >    several TX descriptors (observed with igb)
> >  * USB host controllers, when handling a packet with multiple data TRBs
> >    (observed with xhci)
> >
> > Previously, qemu only provided a single bounce buffer per AddressSpace
> > and would fail DMA map requests while the buffer was already in use. In
> > turn, this would cause DMA failures that ultimately manifest as hardware
> > errors from the guest perspective.
> >
> > This change allocates DMA bounce buffers dynamically instead of
> > supporting only a single buffer. Thus, multiple DMA mappings work
> > correctly also when RAM can't be mmap()-ed.
> >
> > The total bounce buffer allocation size is limited individually for each
> > AddressSpace. The default limit is 4096 bytes, matching the previous
> > maximum buffer size. A new x-max-bounce-buffer-size parameter is
> > provided to configure the limit for PCI devices.
> >
> > Signed-off-by: Mattias Nissler <mnissler@rivosinc.com>
> > ---
> >  hw/pci/pci.c                |  8 ++++
> >  include/exec/memory.h       | 14 ++----
> >  include/hw/pci/pci_device.h |  3 ++
> >  softmmu/memory.c            |  3 +-
> >  softmmu/physmem.c           | 94 +++++++++++++++++++++++++------------
> >  5 files changed, 80 insertions(+), 42 deletions(-)
> >
> > diff --git a/hw/pci/pci.c b/hw/pci/pci.c
> > index 881d774fb6..8c4541b394 100644
> > --- a/hw/pci/pci.c
> > +++ b/hw/pci/pci.c
> > @@ -85,6 +85,8 @@ static Property pci_props[] = {
> >                      QEMU_PCIE_ERR_UNC_MASK_BITNR, true),
> >      DEFINE_PROP_BIT("x-pcie-ari-nextfn-1", PCIDevice, cap_present,
> >                      QEMU_PCIE_ARI_NEXTFN_1_BITNR, false),
> > +    DEFINE_PROP_SIZE("x-max-bounce-buffer-size", PCIDevice,
> > +                     max_bounce_buffer_size, 4096),
> >      DEFINE_PROP_END_OF_LIST()
> >  };
> >
> > @@ -1208,6 +1210,8 @@ static PCIDevice *do_pci_register_device(PCIDevice *pci_dev,
> >                         "bus master container", UINT64_MAX);
> >      address_space_init(&pci_dev->bus_master_as,
> >                         &pci_dev->bus_master_container_region, pci_dev->name);
> > +    pci_dev->bus_master_as.max_bounce_buffer_size =
> > +        pci_dev->max_bounce_buffer_size;
> >
> >      if (phase_check(PHASE_MACHINE_READY)) {
> >          pci_init_bus_master(pci_dev);
> > @@ -2664,6 +2668,10 @@ static void pci_device_class_init(ObjectClass *klass, void *data)
> >      k->unrealize = pci_qdev_unrealize;
> >      k->bus_type = TYPE_PCI_BUS;
> >      device_class_set_props(k, pci_props);
> > +    object_class_property_set_description(
> > +        klass, "x-max-bounce-buffer-size",
> > +        "Maximum buffer size allocated for bounce buffers used for mapped "
> > +        "access to indirect DMA memory");
> >  }
> >
> >  static void pci_device_class_base_init(ObjectClass *klass, void *data)
> > diff --git a/include/exec/memory.h b/include/exec/memory.h
> > index 7d68936157..5577542b5e 100644
> > --- a/include/exec/memory.h
> > +++ b/include/exec/memory.h
> > @@ -1081,14 +1081,6 @@ typedef struct AddressSpaceMapClient {
> >      QLIST_ENTRY(AddressSpaceMapClient) link;
> >  } AddressSpaceMapClient;
> >
> > -typedef struct {
> > -    MemoryRegion *mr;
> > -    void *buffer;
> > -    hwaddr addr;
> > -    hwaddr len;
> > -    bool in_use;
> > -} BounceBuffer;
> > -
> >  /**
> >   * struct AddressSpace: describes a mapping of addresses to #MemoryRegion objects
> >   */
> > @@ -1106,8 +1098,10 @@ struct AddressSpace {
> >      QTAILQ_HEAD(, MemoryListener) listeners;
> >      QTAILQ_ENTRY(AddressSpace) address_spaces_link;
> >
> > -    /* Bounce buffer to use for this address space. */
> > -    BounceBuffer bounce;
> > +    /* Maximum DMA bounce buffer size used for indirect memory map requests */
> > +    uint64_t max_bounce_buffer_size;
> > +    /* Total size of bounce buffers currently allocated, atomically accessed */
> > +    uint64_t bounce_buffer_size;
> >      /* List of callbacks to invoke when buffers free up */
> >      QemuMutex map_client_list_lock;
> >      QLIST_HEAD(, AddressSpaceMapClient) map_client_list;
> > diff --git a/include/hw/pci/pci_device.h b/include/hw/pci/pci_device.h
> > index d3dd0f64b2..f4027c5379 100644
> > --- a/include/hw/pci/pci_device.h
> > +++ b/include/hw/pci/pci_device.h
> > @@ -160,6 +160,9 @@ struct PCIDevice {
> >      /* ID of standby device in net_failover pair */
> >      char *failover_pair_id;
> >      uint32_t acpi_index;
> > +
> > +    /* Maximum DMA bounce buffer size used for indirect memory map requests */
> > +    uint64_t max_bounce_buffer_size;
> >  };
> >
> >  static inline int pci_intx(PCIDevice *pci_dev)
> > diff --git a/softmmu/memory.c b/softmmu/memory.c
> > index 5c9622c3d6..e02799359c 100644
> > --- a/softmmu/memory.c
> > +++ b/softmmu/memory.c
> > @@ -3105,7 +3105,8 @@ void address_space_init(AddressSpace *as, MemoryRegion *root, const char *name)
> >      as->ioeventfds = NULL;
> >      QTAILQ_INIT(&as->listeners);
> >      QTAILQ_INSERT_TAIL(&address_spaces, as, address_spaces_link);
> > -    as->bounce.in_use = false;
> > +    as->max_bounce_buffer_size = 4096;
>
> Instead of hard-code this 4k again (besides the pci property), maybe we can
> have address_space_init_with_bouncebuffer() and always pass it in from pci
> do_realize?  Then by default no bounce buffer supported for the AS only if
> requested.

I haven't verified in a running configuration, but I believe bounce
buffering is also used with non-PCI code, for example
sysbus_ahci_realize grabs &address_space_memory. So, IMO it makes
sense to keep at the old default for non-PCI address spaces, unless we
create additional knobs to set the limit for these?


>
> > +    as->bounce_buffer_size = 0;
> >      qemu_mutex_init(&as->map_client_list_lock);
> >      QLIST_INIT(&as->map_client_list);
> >      as->name = g_strdup(name ? name : "anonymous");
> > diff --git a/softmmu/physmem.c b/softmmu/physmem.c
> > index f40cc564b8..e3d1cf5fba 100644
> > --- a/softmmu/physmem.c
> > +++ b/softmmu/physmem.c
> > @@ -2926,6 +2926,20 @@ void cpu_flush_icache_range(hwaddr start, hwaddr len)
> >                                       NULL, len, FLUSH_CACHE);
> >  }
> >
> > +/*
> > + * A magic value stored in the first 8 bytes of the bounce buffer struct. Used
> > + * to detect illegal pointers passed to address_space_unmap.
> > + */
> > +#define BOUNCE_BUFFER_MAGIC 0xb4017ceb4ffe12ed
> > +
> > +typedef struct {
> > +    uint64_t magic;
> > +    MemoryRegion *mr;
> > +    hwaddr addr;
> > +    size_t len;
> > +    uint8_t buffer[];
> > +} BounceBuffer;
> > +
> >  static void
> >  address_space_unregister_map_client_do(AddressSpaceMapClient *client)
> >  {
> > @@ -2953,7 +2967,7 @@ void address_space_register_map_client(AddressSpace *as, QEMUBH *bh)
> >      QLIST_INSERT_HEAD(&as->map_client_list, client, link);
> >      /* Write map_client_list before reading bounce_buffer_size.  */
> >      smp_mb();
> > -    if (!qatomic_read(&as->bounce.in_use)) {
> > +    if (qatomic_read(&as->bounce_buffer_size) < as->max_bounce_buffer_size) {
> >          address_space_notify_map_clients_locked(as);
> >      }
> >      qemu_mutex_unlock(&as->map_client_list_lock);
> > @@ -3081,31 +3095,36 @@ void *address_space_map(AddressSpace *as,
> >      RCU_READ_LOCK_GUARD();
> >      fv = address_space_to_flatview(as);
> >      mr = flatview_translate(fv, addr, &xlat, &l, is_write, attrs);
> > +    memory_region_ref(mr);
> >
> >      if (!memory_access_is_direct(mr, is_write)) {
> > -        if (qatomic_xchg(&as->bounce.in_use, true)) {
> > +        size_t size = qatomic_add_fetch(&as->bounce_buffer_size, l);
> > +        if (size > as->max_bounce_buffer_size) {
> > +            size_t excess = size - as->max_bounce_buffer_size;
> > +            l -= excess;
> > +            qatomic_sub(&as->bounce_buffer_size, excess);
> > +        }
> > +
> > +        if (l == 0) {
> >              *plen = 0;
> >              return NULL;
>
> MR refcount leak?

Oops, fixing.

>
> >          }
> > -        /* Avoid unbounded allocations */
> > -        l = MIN(l, TARGET_PAGE_SIZE);
> > -        as->bounce.buffer = qemu_memalign(TARGET_PAGE_SIZE, l);
> > -        as->bounce.addr = addr;
> > -        as->bounce.len = l;
> >
> > -        memory_region_ref(mr);
> > -        as->bounce.mr = mr;
> > +        BounceBuffer *bounce = g_malloc0(l + sizeof(BounceBuffer));
> > +        bounce->magic = BOUNCE_BUFFER_MAGIC;
> > +        bounce->mr = mr;
> > +        bounce->addr = addr;
> > +        bounce->len = l;
> > +
> >          if (!is_write) {
> >              flatview_read(fv, addr, MEMTXATTRS_UNSPECIFIED,
> > -                          as->bounce.buffer, l);
> > +                          bounce->buffer, l);
> >          }
> >
> >          *plen = l;
> > -        return as->bounce.buffer;
> > +        return bounce->buffer;
> >      }
> >
> > -
> > -    memory_region_ref(mr);
> >      *plen = flatview_extend_translation(fv, addr, len, mr, xlat,
> >                                          l, is_write, attrs);
> >      fuzz_dma_read_cb(addr, *plen, mr);
> > @@ -3119,31 +3138,44 @@ void *address_space_map(AddressSpace *as,
> >  void address_space_unmap(AddressSpace *as, void *buffer, hwaddr len,
> >                           bool is_write, hwaddr access_len)
> >  {
> > -    if (buffer != as->bounce.buffer) {
> > -        MemoryRegion *mr;
> > -        ram_addr_t addr1;
> > +    MemoryRegion *mr;
> > +    ram_addr_t addr1;
> > +
> > +    mr = memory_region_from_host(buffer, &addr1);
> > +    if (mr == NULL) {
> > +        BounceBuffer *bounce = container_of(buffer, BounceBuffer, buffer);
> > +        if (bounce->magic != BOUNCE_BUFFER_MAGIC) {
> > +            error_report(
> > +                "Unmap request for %p, which neither corresponds to a memory "
> > +                "region, nor looks like a bounce buffer, ignoring!",
> > +                buffer);
> > +            return;
>
> Would assert() be better here? So that when trigger we can have the user
> stack already.

Yes, and it's something in the should-never-happen bucket, so assert
is more appropriate. Changing.

>
> > +        }
> >
> > -        mr = memory_region_from_host(buffer, &addr1);
> > -        assert(mr != NULL);
> >          if (is_write) {
> > -            invalidate_and_set_dirty(mr, addr1, access_len);
> > +            address_space_write(as, bounce->addr, MEMTXATTRS_UNSPECIFIED,
> > +                                bounce->buffer, access_len);
> >          }
> > -        if (xen_enabled()) {
> > -            xen_invalidate_map_cache_entry(buffer);
> > +
> > +        memory_region_unref(bounce->mr);
> > +        uint64_t previous_buffer_size =
> > +            qatomic_fetch_sub(&as->bounce_buffer_size, bounce->len);
> > +        if (previous_buffer_size == as->max_bounce_buffer_size) {
>
> Can there be race condition that someone just temporarily boosted
> bounce_buffer_size, so that it won't exactly equal to max but actually
> larger (but we should still notify)?

Ah, yes, we may have overshot in the allocation path. So this should
be >= then, but see below...

>
> If the current context has already a bounce buffer and is going to unmap it
> (len>0), IIUC it means we should always notify because there will
> definitely be some space left, and the requesters can retry to see whether
> the size fit.

The current semantics are actually more awkward: If the requested size
doesn't fit, a caller currently gets a mapping for the portion that is
below the limit, with an adjusted *plen value. So a caller that wants
more would then register their callback and get a callback when more
space frees up. I carried this over from the existing code, but I find
it extremely obscure and error-prone. I would rather change it to the
semantics you suggest, namely rejecting the entire request, but that's
a breaking API change that's hard to validate. So, maybe it's best to
do what you suggest and notify unconditionally when we free up space
and leave it to the caller to deal with whatever situation they find
(which is already the case). So, I'll drop the condition here for now,
let me know if you have a better suggestion.

>
> > +            /* Write bounce_buffer_size before reading map_client_list. */
> > +            smp_mb();
>
> I know it comes from the old code.. but I don't know why this is needed;
> mutex lock should contain an mb() already before the list iterations later.
> Just to raise it up, maybe Paolo would like to comment.

Hm, are you sure that qemu_mutex_lock includes a full memory barrier?
The atomics docs say that pthread_mutex_lock is guaranteed to have
acquire semantics, but that doesn't guarantee that previous writes are
visible elsewhere. We need a release of bounce_buffer_size and an
acquire of map_client_list. The latter is implied by qemu_mutex_lock,
so I could arguably change this to smp_wmb().

(I hope this makes sense, memory fencing isn't my forte)

>
> > +            address_space_notify_map_clients(as);
> >          }
> > -        memory_region_unref(mr);
> > +        bounce->magic = ~BOUNCE_BUFFER_MAGIC;
> > +        g_free(bounce);
> >          return;
> >      }
> > +
> > +    if (xen_enabled()) {
> > +        xen_invalidate_map_cache_entry(buffer);
> > +    }
> >      if (is_write) {
> > -        address_space_write(as, as->bounce.addr, MEMTXATTRS_UNSPECIFIED,
> > -                            as->bounce.buffer, access_len);
> > -    }
> > -    qemu_vfree(as->bounce.buffer);
> > -    as->bounce.buffer = NULL;
> > -    memory_region_unref(as->bounce.mr);
> > -    /* Clear in_use before reading map_client_list.  */
> > -    qatomic_set_mb(&as->bounce.in_use, false);
> > -    address_space_notify_map_clients(as);
> > +        invalidate_and_set_dirty(mr, addr1, access_len);
> > +    }
> >  }
> >
> >  void *cpu_physical_memory_map(hwaddr addr,
> > --
> > 2.34.1
> >
>
> --
> Peter Xu
>
Mattias Nissler Sept. 15, 2023, 9:54 a.m. UTC | #4
On Thu, Sep 14, 2023 at 8:49 PM Stefan Hajnoczi <stefanha@redhat.com> wrote:
>
> On Thu, Sep 07, 2023 at 06:04:07AM -0700, Mattias Nissler wrote:
> > When DMA memory can't be directly accessed, as is the case when
> > running the device model in a separate process without shareable DMA
> > file descriptors, bounce buffering is used.
> >
> > It is not uncommon for device models to request mapping of several DMA
> > regions at the same time. Examples include:
> >  * net devices, e.g. when transmitting a packet that is split across
> >    several TX descriptors (observed with igb)
> >  * USB host controllers, when handling a packet with multiple data TRBs
> >    (observed with xhci)
> >
> > Previously, qemu only provided a single bounce buffer per AddressSpace
> > and would fail DMA map requests while the buffer was already in use. In
> > turn, this would cause DMA failures that ultimately manifest as hardware
> > errors from the guest perspective.
> >
> > This change allocates DMA bounce buffers dynamically instead of
> > supporting only a single buffer. Thus, multiple DMA mappings work
> > correctly also when RAM can't be mmap()-ed.
> >
> > The total bounce buffer allocation size is limited individually for each
> > AddressSpace. The default limit is 4096 bytes, matching the previous
> > maximum buffer size. A new x-max-bounce-buffer-size parameter is
> > provided to configure the limit for PCI devices.
> >
> > Signed-off-by: Mattias Nissler <mnissler@rivosinc.com>
> > ---
> >  hw/pci/pci.c                |  8 ++++
> >  include/exec/memory.h       | 14 ++----
> >  include/hw/pci/pci_device.h |  3 ++
> >  softmmu/memory.c            |  3 +-
> >  softmmu/physmem.c           | 94 +++++++++++++++++++++++++------------
> >  5 files changed, 80 insertions(+), 42 deletions(-)
> >
> > diff --git a/hw/pci/pci.c b/hw/pci/pci.c
> > index 881d774fb6..8c4541b394 100644
> > --- a/hw/pci/pci.c
> > +++ b/hw/pci/pci.c
> > @@ -85,6 +85,8 @@ static Property pci_props[] = {
> >                      QEMU_PCIE_ERR_UNC_MASK_BITNR, true),
> >      DEFINE_PROP_BIT("x-pcie-ari-nextfn-1", PCIDevice, cap_present,
> >                      QEMU_PCIE_ARI_NEXTFN_1_BITNR, false),
> > +    DEFINE_PROP_SIZE("x-max-bounce-buffer-size", PCIDevice,
> > +                     max_bounce_buffer_size, 4096),
> >      DEFINE_PROP_END_OF_LIST()
> >  };
> >
> > @@ -1208,6 +1210,8 @@ static PCIDevice *do_pci_register_device(PCIDevice *pci_dev,
> >                         "bus master container", UINT64_MAX);
> >      address_space_init(&pci_dev->bus_master_as,
> >                         &pci_dev->bus_master_container_region, pci_dev->name);
> > +    pci_dev->bus_master_as.max_bounce_buffer_size =
> > +        pci_dev->max_bounce_buffer_size;
> >
> >      if (phase_check(PHASE_MACHINE_READY)) {
> >          pci_init_bus_master(pci_dev);
> > @@ -2664,6 +2668,10 @@ static void pci_device_class_init(ObjectClass *klass, void *data)
> >      k->unrealize = pci_qdev_unrealize;
> >      k->bus_type = TYPE_PCI_BUS;
> >      device_class_set_props(k, pci_props);
> > +    object_class_property_set_description(
> > +        klass, "x-max-bounce-buffer-size",
> > +        "Maximum buffer size allocated for bounce buffers used for mapped "
> > +        "access to indirect DMA memory");
> >  }
> >
> >  static void pci_device_class_base_init(ObjectClass *klass, void *data)
> > diff --git a/include/exec/memory.h b/include/exec/memory.h
> > index 7d68936157..5577542b5e 100644
> > --- a/include/exec/memory.h
> > +++ b/include/exec/memory.h
> > @@ -1081,14 +1081,6 @@ typedef struct AddressSpaceMapClient {
> >      QLIST_ENTRY(AddressSpaceMapClient) link;
> >  } AddressSpaceMapClient;
> >
> > -typedef struct {
> > -    MemoryRegion *mr;
> > -    void *buffer;
> > -    hwaddr addr;
> > -    hwaddr len;
> > -    bool in_use;
> > -} BounceBuffer;
> > -
> >  /**
> >   * struct AddressSpace: describes a mapping of addresses to #MemoryRegion objects
> >   */
> > @@ -1106,8 +1098,10 @@ struct AddressSpace {
> >      QTAILQ_HEAD(, MemoryListener) listeners;
> >      QTAILQ_ENTRY(AddressSpace) address_spaces_link;
> >
> > -    /* Bounce buffer to use for this address space. */
> > -    BounceBuffer bounce;
> > +    /* Maximum DMA bounce buffer size used for indirect memory map requests */
> > +    uint64_t max_bounce_buffer_size;
> > +    /* Total size of bounce buffers currently allocated, atomically accessed */
> > +    uint64_t bounce_buffer_size;
> >      /* List of callbacks to invoke when buffers free up */
> >      QemuMutex map_client_list_lock;
> >      QLIST_HEAD(, AddressSpaceMapClient) map_client_list;
> > diff --git a/include/hw/pci/pci_device.h b/include/hw/pci/pci_device.h
> > index d3dd0f64b2..f4027c5379 100644
> > --- a/include/hw/pci/pci_device.h
> > +++ b/include/hw/pci/pci_device.h
> > @@ -160,6 +160,9 @@ struct PCIDevice {
> >      /* ID of standby device in net_failover pair */
> >      char *failover_pair_id;
> >      uint32_t acpi_index;
> > +
> > +    /* Maximum DMA bounce buffer size used for indirect memory map requests */
> > +    uint64_t max_bounce_buffer_size;
> >  };
> >
> >  static inline int pci_intx(PCIDevice *pci_dev)
> > diff --git a/softmmu/memory.c b/softmmu/memory.c
> > index 5c9622c3d6..e02799359c 100644
> > --- a/softmmu/memory.c
> > +++ b/softmmu/memory.c
> > @@ -3105,7 +3105,8 @@ void address_space_init(AddressSpace *as, MemoryRegion *root, const char *name)
> >      as->ioeventfds = NULL;
> >      QTAILQ_INIT(&as->listeners);
> >      QTAILQ_INSERT_TAIL(&address_spaces, as, address_spaces_link);
> > -    as->bounce.in_use = false;
> > +    as->max_bounce_buffer_size = 4096;
> > +    as->bounce_buffer_size = 0;
> >      qemu_mutex_init(&as->map_client_list_lock);
> >      QLIST_INIT(&as->map_client_list);
> >      as->name = g_strdup(name ? name : "anonymous");
> > diff --git a/softmmu/physmem.c b/softmmu/physmem.c
> > index f40cc564b8..e3d1cf5fba 100644
> > --- a/softmmu/physmem.c
> > +++ b/softmmu/physmem.c
> > @@ -2926,6 +2926,20 @@ void cpu_flush_icache_range(hwaddr start, hwaddr len)
> >                                       NULL, len, FLUSH_CACHE);
> >  }
> >
> > +/*
> > + * A magic value stored in the first 8 bytes of the bounce buffer struct. Used
> > + * to detect illegal pointers passed to address_space_unmap.
> > + */
> > +#define BOUNCE_BUFFER_MAGIC 0xb4017ceb4ffe12ed
> > +
> > +typedef struct {
> > +    uint64_t magic;
> > +    MemoryRegion *mr;
> > +    hwaddr addr;
> > +    size_t len;
> > +    uint8_t buffer[];
> > +} BounceBuffer;
> > +
> >  static void
> >  address_space_unregister_map_client_do(AddressSpaceMapClient *client)
> >  {
> > @@ -2953,7 +2967,7 @@ void address_space_register_map_client(AddressSpace *as, QEMUBH *bh)
> >      QLIST_INSERT_HEAD(&as->map_client_list, client, link);
> >      /* Write map_client_list before reading bounce_buffer_size.  */
> >      smp_mb();
> > -    if (!qatomic_read(&as->bounce.in_use)) {
> > +    if (qatomic_read(&as->bounce_buffer_size) < as->max_bounce_buffer_size) {
> >          address_space_notify_map_clients_locked(as);
> >      }
> >      qemu_mutex_unlock(&as->map_client_list_lock);
> > @@ -3081,31 +3095,36 @@ void *address_space_map(AddressSpace *as,
> >      RCU_READ_LOCK_GUARD();
> >      fv = address_space_to_flatview(as);
> >      mr = flatview_translate(fv, addr, &xlat, &l, is_write, attrs);
> > +    memory_region_ref(mr);
> >
> >      if (!memory_access_is_direct(mr, is_write)) {
> > -        if (qatomic_xchg(&as->bounce.in_use, true)) {
> > +        size_t size = qatomic_add_fetch(&as->bounce_buffer_size, l);
> > +        if (size > as->max_bounce_buffer_size) {
> > +            size_t excess = size - as->max_bounce_buffer_size;
> > +            l -= excess;
> > +            qatomic_sub(&as->bounce_buffer_size, excess);
>
> I think two threads can race here. as->bounce_buffer_size will be
> corrupted (smaller than it should be) and l will be wrong as well. A
> cmpxchg loop would solve the race.

Ah, thanks for pointing this out. I think the case you have in mind is this:
1. Thread A bumps the size to be larger than the max
2. Thread B bumps the size further
3. Thread B now computes an incorrect excess and sutracts more than it added.

I should be good if I ensure that each thread will only subtract what
it has previously added to enforce the invariant that additions and
subtractions for each map/unmap pair cancel each other out. Let me
know if there's a reason to still prefer the cmpxchg loop.

>
> > +        }
> > +
> > +        if (l == 0) {
> >              *plen = 0;
> >              return NULL;
> >          }
> > -        /* Avoid unbounded allocations */
> > -        l = MIN(l, TARGET_PAGE_SIZE);
> > -        as->bounce.buffer = qemu_memalign(TARGET_PAGE_SIZE, l);
> > -        as->bounce.addr = addr;
> > -        as->bounce.len = l;
> >
> > -        memory_region_ref(mr);
> > -        as->bounce.mr = mr;
> > +        BounceBuffer *bounce = g_malloc0(l + sizeof(BounceBuffer));
> > +        bounce->magic = BOUNCE_BUFFER_MAGIC;
> > +        bounce->mr = mr;
> > +        bounce->addr = addr;
> > +        bounce->len = l;
> > +
> >          if (!is_write) {
> >              flatview_read(fv, addr, MEMTXATTRS_UNSPECIFIED,
> > -                          as->bounce.buffer, l);
> > +                          bounce->buffer, l);
> >          }
> >
> >          *plen = l;
> > -        return as->bounce.buffer;
> > +        return bounce->buffer;
> >      }
> >
> > -
> > -    memory_region_ref(mr);
> >      *plen = flatview_extend_translation(fv, addr, len, mr, xlat,
> >                                          l, is_write, attrs);
> >      fuzz_dma_read_cb(addr, *plen, mr);
> > @@ -3119,31 +3138,44 @@ void *address_space_map(AddressSpace *as,
> >  void address_space_unmap(AddressSpace *as, void *buffer, hwaddr len,
> >                           bool is_write, hwaddr access_len)
> >  {
> > -    if (buffer != as->bounce.buffer) {
> > -        MemoryRegion *mr;
> > -        ram_addr_t addr1;
> > +    MemoryRegion *mr;
> > +    ram_addr_t addr1;
> > +
> > +    mr = memory_region_from_host(buffer, &addr1);
> > +    if (mr == NULL) {
> > +        BounceBuffer *bounce = container_of(buffer, BounceBuffer, buffer);
> > +        if (bounce->magic != BOUNCE_BUFFER_MAGIC) {
> > +            error_report(
> > +                "Unmap request for %p, which neither corresponds to a memory "
> > +                "region, nor looks like a bounce buffer, ignoring!",
> > +                buffer);
> > +            return;
> > +        }
> >
> > -        mr = memory_region_from_host(buffer, &addr1);
> > -        assert(mr != NULL);
> >          if (is_write) {
> > -            invalidate_and_set_dirty(mr, addr1, access_len);
> > +            address_space_write(as, bounce->addr, MEMTXATTRS_UNSPECIFIED,
> > +                                bounce->buffer, access_len);
> >          }
> > -        if (xen_enabled()) {
> > -            xen_invalidate_map_cache_entry(buffer);
> > +
> > +        memory_region_unref(bounce->mr);
> > +        uint64_t previous_buffer_size =
> > +            qatomic_fetch_sub(&as->bounce_buffer_size, bounce->len);
> > +        if (previous_buffer_size == as->max_bounce_buffer_size) {
> > +            /* Write bounce_buffer_size before reading map_client_list. */
> > +            smp_mb();
> > +            address_space_notify_map_clients(as);
> >          }
> > -        memory_region_unref(mr);
> > +        bounce->magic = ~BOUNCE_BUFFER_MAGIC;
> > +        g_free(bounce);
> >          return;
> >      }
> > +
> > +    if (xen_enabled()) {
> > +        xen_invalidate_map_cache_entry(buffer);
> > +    }
> >      if (is_write) {
> > -        address_space_write(as, as->bounce.addr, MEMTXATTRS_UNSPECIFIED,
> > -                            as->bounce.buffer, access_len);
> > -    }
> > -    qemu_vfree(as->bounce.buffer);
> > -    as->bounce.buffer = NULL;
> > -    memory_region_unref(as->bounce.mr);
> > -    /* Clear in_use before reading map_client_list.  */
> > -    qatomic_set_mb(&as->bounce.in_use, false);
> > -    address_space_notify_map_clients(as);
> > +        invalidate_and_set_dirty(mr, addr1, access_len);
> > +    }
> >  }
> >
> >  void *cpu_physical_memory_map(hwaddr addr,
> > --
> > 2.34.1
> >
Peter Xu Sept. 15, 2023, 3:57 p.m. UTC | #5
On Fri, Sep 15, 2023 at 11:32:31AM +0200, Mattias Nissler wrote:
> > > @@ -3105,7 +3105,8 @@ void address_space_init(AddressSpace *as, MemoryRegion *root, const char *name)
> > >      as->ioeventfds = NULL;
> > >      QTAILQ_INIT(&as->listeners);
> > >      QTAILQ_INSERT_TAIL(&address_spaces, as, address_spaces_link);
> > > -    as->bounce.in_use = false;
> > > +    as->max_bounce_buffer_size = 4096;
> >
> > Instead of hard-code this 4k again (besides the pci property), maybe we can
> > have address_space_init_with_bouncebuffer() and always pass it in from pci
> > do_realize?  Then by default no bounce buffer supported for the AS only if
> > requested.
> 
> I haven't verified in a running configuration, but I believe bounce
> buffering is also used with non-PCI code, for example
> sysbus_ahci_realize grabs &address_space_memory. So, IMO it makes
> sense to keep at the old default for non-PCI address spaces, unless we
> create additional knobs to set the limit for these?

Oh okay, in that case do we want to have a macro defining the default for
all (as 4K), then also use the macro in the pci property for the default
value?  Maybe it's slightly better than the hard-coded.

> > > +            /* Write bounce_buffer_size before reading map_client_list. */
> > > +            smp_mb();
> >
> > I know it comes from the old code.. but I don't know why this is needed;
> > mutex lock should contain an mb() already before the list iterations later.
> > Just to raise it up, maybe Paolo would like to comment.
> 
> Hm, are you sure that qemu_mutex_lock includes a full memory barrier?

No. :)

> The atomics docs say that pthread_mutex_lock is guaranteed to have
> acquire semantics, but that doesn't guarantee that previous writes are
> visible elsewhere. We need a release of bounce_buffer_size and an
> acquire of map_client_list. The latter is implied by qemu_mutex_lock,
> so I could arguably change this to smp_wmb().

Yeah I think I made such mistake before, sorry.  I think some day I looked
into x86 impl and I believe it was mb() there but I always kept that
impression in mind that it will always be, but not really.  I think you're
right that mutex_lock semantics only guarantees an REQUIRE, and that's not
the same as mb(), at least not always.

Changing it to wmb() makes sense to me but indeed that may be more suitable
for another patch.  Maybe easier to just leave it as-is as it shouldn't be
super hot path anyway.

Thanks,
Stefan Hajnoczi Sept. 15, 2023, 8:40 p.m. UTC | #6
On Fri, 15 Sept 2023 at 05:55, Mattias Nissler <mnissler@rivosinc.com> wrote:
>
> On Thu, Sep 14, 2023 at 8:49 PM Stefan Hajnoczi <stefanha@redhat.com> wrote:
> >
> > On Thu, Sep 07, 2023 at 06:04:07AM -0700, Mattias Nissler wrote:
> > > When DMA memory can't be directly accessed, as is the case when
> > > running the device model in a separate process without shareable DMA
> > > file descriptors, bounce buffering is used.
> > >
> > > It is not uncommon for device models to request mapping of several DMA
> > > regions at the same time. Examples include:
> > >  * net devices, e.g. when transmitting a packet that is split across
> > >    several TX descriptors (observed with igb)
> > >  * USB host controllers, when handling a packet with multiple data TRBs
> > >    (observed with xhci)
> > >
> > > Previously, qemu only provided a single bounce buffer per AddressSpace
> > > and would fail DMA map requests while the buffer was already in use. In
> > > turn, this would cause DMA failures that ultimately manifest as hardware
> > > errors from the guest perspective.
> > >
> > > This change allocates DMA bounce buffers dynamically instead of
> > > supporting only a single buffer. Thus, multiple DMA mappings work
> > > correctly also when RAM can't be mmap()-ed.
> > >
> > > The total bounce buffer allocation size is limited individually for each
> > > AddressSpace. The default limit is 4096 bytes, matching the previous
> > > maximum buffer size. A new x-max-bounce-buffer-size parameter is
> > > provided to configure the limit for PCI devices.
> > >
> > > Signed-off-by: Mattias Nissler <mnissler@rivosinc.com>
> > > ---
> > >  hw/pci/pci.c                |  8 ++++
> > >  include/exec/memory.h       | 14 ++----
> > >  include/hw/pci/pci_device.h |  3 ++
> > >  softmmu/memory.c            |  3 +-
> > >  softmmu/physmem.c           | 94 +++++++++++++++++++++++++------------
> > >  5 files changed, 80 insertions(+), 42 deletions(-)
> > >
> > > diff --git a/hw/pci/pci.c b/hw/pci/pci.c
> > > index 881d774fb6..8c4541b394 100644
> > > --- a/hw/pci/pci.c
> > > +++ b/hw/pci/pci.c
> > > @@ -85,6 +85,8 @@ static Property pci_props[] = {
> > >                      QEMU_PCIE_ERR_UNC_MASK_BITNR, true),
> > >      DEFINE_PROP_BIT("x-pcie-ari-nextfn-1", PCIDevice, cap_present,
> > >                      QEMU_PCIE_ARI_NEXTFN_1_BITNR, false),
> > > +    DEFINE_PROP_SIZE("x-max-bounce-buffer-size", PCIDevice,
> > > +                     max_bounce_buffer_size, 4096),
> > >      DEFINE_PROP_END_OF_LIST()
> > >  };
> > >
> > > @@ -1208,6 +1210,8 @@ static PCIDevice *do_pci_register_device(PCIDevice *pci_dev,
> > >                         "bus master container", UINT64_MAX);
> > >      address_space_init(&pci_dev->bus_master_as,
> > >                         &pci_dev->bus_master_container_region, pci_dev->name);
> > > +    pci_dev->bus_master_as.max_bounce_buffer_size =
> > > +        pci_dev->max_bounce_buffer_size;
> > >
> > >      if (phase_check(PHASE_MACHINE_READY)) {
> > >          pci_init_bus_master(pci_dev);
> > > @@ -2664,6 +2668,10 @@ static void pci_device_class_init(ObjectClass *klass, void *data)
> > >      k->unrealize = pci_qdev_unrealize;
> > >      k->bus_type = TYPE_PCI_BUS;
> > >      device_class_set_props(k, pci_props);
> > > +    object_class_property_set_description(
> > > +        klass, "x-max-bounce-buffer-size",
> > > +        "Maximum buffer size allocated for bounce buffers used for mapped "
> > > +        "access to indirect DMA memory");
> > >  }
> > >
> > >  static void pci_device_class_base_init(ObjectClass *klass, void *data)
> > > diff --git a/include/exec/memory.h b/include/exec/memory.h
> > > index 7d68936157..5577542b5e 100644
> > > --- a/include/exec/memory.h
> > > +++ b/include/exec/memory.h
> > > @@ -1081,14 +1081,6 @@ typedef struct AddressSpaceMapClient {
> > >      QLIST_ENTRY(AddressSpaceMapClient) link;
> > >  } AddressSpaceMapClient;
> > >
> > > -typedef struct {
> > > -    MemoryRegion *mr;
> > > -    void *buffer;
> > > -    hwaddr addr;
> > > -    hwaddr len;
> > > -    bool in_use;
> > > -} BounceBuffer;
> > > -
> > >  /**
> > >   * struct AddressSpace: describes a mapping of addresses to #MemoryRegion objects
> > >   */
> > > @@ -1106,8 +1098,10 @@ struct AddressSpace {
> > >      QTAILQ_HEAD(, MemoryListener) listeners;
> > >      QTAILQ_ENTRY(AddressSpace) address_spaces_link;
> > >
> > > -    /* Bounce buffer to use for this address space. */
> > > -    BounceBuffer bounce;
> > > +    /* Maximum DMA bounce buffer size used for indirect memory map requests */
> > > +    uint64_t max_bounce_buffer_size;
> > > +    /* Total size of bounce buffers currently allocated, atomically accessed */
> > > +    uint64_t bounce_buffer_size;
> > >      /* List of callbacks to invoke when buffers free up */
> > >      QemuMutex map_client_list_lock;
> > >      QLIST_HEAD(, AddressSpaceMapClient) map_client_list;
> > > diff --git a/include/hw/pci/pci_device.h b/include/hw/pci/pci_device.h
> > > index d3dd0f64b2..f4027c5379 100644
> > > --- a/include/hw/pci/pci_device.h
> > > +++ b/include/hw/pci/pci_device.h
> > > @@ -160,6 +160,9 @@ struct PCIDevice {
> > >      /* ID of standby device in net_failover pair */
> > >      char *failover_pair_id;
> > >      uint32_t acpi_index;
> > > +
> > > +    /* Maximum DMA bounce buffer size used for indirect memory map requests */
> > > +    uint64_t max_bounce_buffer_size;
> > >  };
> > >
> > >  static inline int pci_intx(PCIDevice *pci_dev)
> > > diff --git a/softmmu/memory.c b/softmmu/memory.c
> > > index 5c9622c3d6..e02799359c 100644
> > > --- a/softmmu/memory.c
> > > +++ b/softmmu/memory.c
> > > @@ -3105,7 +3105,8 @@ void address_space_init(AddressSpace *as, MemoryRegion *root, const char *name)
> > >      as->ioeventfds = NULL;
> > >      QTAILQ_INIT(&as->listeners);
> > >      QTAILQ_INSERT_TAIL(&address_spaces, as, address_spaces_link);
> > > -    as->bounce.in_use = false;
> > > +    as->max_bounce_buffer_size = 4096;
> > > +    as->bounce_buffer_size = 0;
> > >      qemu_mutex_init(&as->map_client_list_lock);
> > >      QLIST_INIT(&as->map_client_list);
> > >      as->name = g_strdup(name ? name : "anonymous");
> > > diff --git a/softmmu/physmem.c b/softmmu/physmem.c
> > > index f40cc564b8..e3d1cf5fba 100644
> > > --- a/softmmu/physmem.c
> > > +++ b/softmmu/physmem.c
> > > @@ -2926,6 +2926,20 @@ void cpu_flush_icache_range(hwaddr start, hwaddr len)
> > >                                       NULL, len, FLUSH_CACHE);
> > >  }
> > >
> > > +/*
> > > + * A magic value stored in the first 8 bytes of the bounce buffer struct. Used
> > > + * to detect illegal pointers passed to address_space_unmap.
> > > + */
> > > +#define BOUNCE_BUFFER_MAGIC 0xb4017ceb4ffe12ed
> > > +
> > > +typedef struct {
> > > +    uint64_t magic;
> > > +    MemoryRegion *mr;
> > > +    hwaddr addr;
> > > +    size_t len;
> > > +    uint8_t buffer[];
> > > +} BounceBuffer;
> > > +
> > >  static void
> > >  address_space_unregister_map_client_do(AddressSpaceMapClient *client)
> > >  {
> > > @@ -2953,7 +2967,7 @@ void address_space_register_map_client(AddressSpace *as, QEMUBH *bh)
> > >      QLIST_INSERT_HEAD(&as->map_client_list, client, link);
> > >      /* Write map_client_list before reading bounce_buffer_size.  */
> > >      smp_mb();
> > > -    if (!qatomic_read(&as->bounce.in_use)) {
> > > +    if (qatomic_read(&as->bounce_buffer_size) < as->max_bounce_buffer_size) {
> > >          address_space_notify_map_clients_locked(as);
> > >      }
> > >      qemu_mutex_unlock(&as->map_client_list_lock);
> > > @@ -3081,31 +3095,36 @@ void *address_space_map(AddressSpace *as,
> > >      RCU_READ_LOCK_GUARD();
> > >      fv = address_space_to_flatview(as);
> > >      mr = flatview_translate(fv, addr, &xlat, &l, is_write, attrs);
> > > +    memory_region_ref(mr);
> > >
> > >      if (!memory_access_is_direct(mr, is_write)) {
> > > -        if (qatomic_xchg(&as->bounce.in_use, true)) {
> > > +        size_t size = qatomic_add_fetch(&as->bounce_buffer_size, l);
> > > +        if (size > as->max_bounce_buffer_size) {
> > > +            size_t excess = size - as->max_bounce_buffer_size;
> > > +            l -= excess;
> > > +            qatomic_sub(&as->bounce_buffer_size, excess);
> >
> > I think two threads can race here. as->bounce_buffer_size will be
> > corrupted (smaller than it should be) and l will be wrong as well. A
> > cmpxchg loop would solve the race.
>
> Ah, thanks for pointing this out. I think the case you have in mind is this:
> 1. Thread A bumps the size to be larger than the max
> 2. Thread B bumps the size further
> 3. Thread B now computes an incorrect excess and sutracts more than it added.

Yes, that's the case.

> I should be good if I ensure that each thread will only subtract what
> it has previously added to enforce the invariant that additions and
> subtractions for each map/unmap pair cancel each other out. Let me
> know if there's a reason to still prefer the cmpxchg loop.

Cool, it would be nice to avoid the cmpxchg loop.

Stefan
diff mbox series

Patch

diff --git a/hw/pci/pci.c b/hw/pci/pci.c
index 881d774fb6..8c4541b394 100644
--- a/hw/pci/pci.c
+++ b/hw/pci/pci.c
@@ -85,6 +85,8 @@  static Property pci_props[] = {
                     QEMU_PCIE_ERR_UNC_MASK_BITNR, true),
     DEFINE_PROP_BIT("x-pcie-ari-nextfn-1", PCIDevice, cap_present,
                     QEMU_PCIE_ARI_NEXTFN_1_BITNR, false),
+    DEFINE_PROP_SIZE("x-max-bounce-buffer-size", PCIDevice,
+                     max_bounce_buffer_size, 4096),
     DEFINE_PROP_END_OF_LIST()
 };
 
@@ -1208,6 +1210,8 @@  static PCIDevice *do_pci_register_device(PCIDevice *pci_dev,
                        "bus master container", UINT64_MAX);
     address_space_init(&pci_dev->bus_master_as,
                        &pci_dev->bus_master_container_region, pci_dev->name);
+    pci_dev->bus_master_as.max_bounce_buffer_size =
+        pci_dev->max_bounce_buffer_size;
 
     if (phase_check(PHASE_MACHINE_READY)) {
         pci_init_bus_master(pci_dev);
@@ -2664,6 +2668,10 @@  static void pci_device_class_init(ObjectClass *klass, void *data)
     k->unrealize = pci_qdev_unrealize;
     k->bus_type = TYPE_PCI_BUS;
     device_class_set_props(k, pci_props);
+    object_class_property_set_description(
+        klass, "x-max-bounce-buffer-size",
+        "Maximum buffer size allocated for bounce buffers used for mapped "
+        "access to indirect DMA memory");
 }
 
 static void pci_device_class_base_init(ObjectClass *klass, void *data)
diff --git a/include/exec/memory.h b/include/exec/memory.h
index 7d68936157..5577542b5e 100644
--- a/include/exec/memory.h
+++ b/include/exec/memory.h
@@ -1081,14 +1081,6 @@  typedef struct AddressSpaceMapClient {
     QLIST_ENTRY(AddressSpaceMapClient) link;
 } AddressSpaceMapClient;
 
-typedef struct {
-    MemoryRegion *mr;
-    void *buffer;
-    hwaddr addr;
-    hwaddr len;
-    bool in_use;
-} BounceBuffer;
-
 /**
  * struct AddressSpace: describes a mapping of addresses to #MemoryRegion objects
  */
@@ -1106,8 +1098,10 @@  struct AddressSpace {
     QTAILQ_HEAD(, MemoryListener) listeners;
     QTAILQ_ENTRY(AddressSpace) address_spaces_link;
 
-    /* Bounce buffer to use for this address space. */
-    BounceBuffer bounce;
+    /* Maximum DMA bounce buffer size used for indirect memory map requests */
+    uint64_t max_bounce_buffer_size;
+    /* Total size of bounce buffers currently allocated, atomically accessed */
+    uint64_t bounce_buffer_size;
     /* List of callbacks to invoke when buffers free up */
     QemuMutex map_client_list_lock;
     QLIST_HEAD(, AddressSpaceMapClient) map_client_list;
diff --git a/include/hw/pci/pci_device.h b/include/hw/pci/pci_device.h
index d3dd0f64b2..f4027c5379 100644
--- a/include/hw/pci/pci_device.h
+++ b/include/hw/pci/pci_device.h
@@ -160,6 +160,9 @@  struct PCIDevice {
     /* ID of standby device in net_failover pair */
     char *failover_pair_id;
     uint32_t acpi_index;
+
+    /* Maximum DMA bounce buffer size used for indirect memory map requests */
+    uint64_t max_bounce_buffer_size;
 };
 
 static inline int pci_intx(PCIDevice *pci_dev)
diff --git a/softmmu/memory.c b/softmmu/memory.c
index 5c9622c3d6..e02799359c 100644
--- a/softmmu/memory.c
+++ b/softmmu/memory.c
@@ -3105,7 +3105,8 @@  void address_space_init(AddressSpace *as, MemoryRegion *root, const char *name)
     as->ioeventfds = NULL;
     QTAILQ_INIT(&as->listeners);
     QTAILQ_INSERT_TAIL(&address_spaces, as, address_spaces_link);
-    as->bounce.in_use = false;
+    as->max_bounce_buffer_size = 4096;
+    as->bounce_buffer_size = 0;
     qemu_mutex_init(&as->map_client_list_lock);
     QLIST_INIT(&as->map_client_list);
     as->name = g_strdup(name ? name : "anonymous");
diff --git a/softmmu/physmem.c b/softmmu/physmem.c
index f40cc564b8..e3d1cf5fba 100644
--- a/softmmu/physmem.c
+++ b/softmmu/physmem.c
@@ -2926,6 +2926,20 @@  void cpu_flush_icache_range(hwaddr start, hwaddr len)
                                      NULL, len, FLUSH_CACHE);
 }
 
+/*
+ * A magic value stored in the first 8 bytes of the bounce buffer struct. Used
+ * to detect illegal pointers passed to address_space_unmap.
+ */
+#define BOUNCE_BUFFER_MAGIC 0xb4017ceb4ffe12ed
+
+typedef struct {
+    uint64_t magic;
+    MemoryRegion *mr;
+    hwaddr addr;
+    size_t len;
+    uint8_t buffer[];
+} BounceBuffer;
+
 static void
 address_space_unregister_map_client_do(AddressSpaceMapClient *client)
 {
@@ -2953,7 +2967,7 @@  void address_space_register_map_client(AddressSpace *as, QEMUBH *bh)
     QLIST_INSERT_HEAD(&as->map_client_list, client, link);
     /* Write map_client_list before reading bounce_buffer_size.  */
     smp_mb();
-    if (!qatomic_read(&as->bounce.in_use)) {
+    if (qatomic_read(&as->bounce_buffer_size) < as->max_bounce_buffer_size) {
         address_space_notify_map_clients_locked(as);
     }
     qemu_mutex_unlock(&as->map_client_list_lock);
@@ -3081,31 +3095,36 @@  void *address_space_map(AddressSpace *as,
     RCU_READ_LOCK_GUARD();
     fv = address_space_to_flatview(as);
     mr = flatview_translate(fv, addr, &xlat, &l, is_write, attrs);
+    memory_region_ref(mr);
 
     if (!memory_access_is_direct(mr, is_write)) {
-        if (qatomic_xchg(&as->bounce.in_use, true)) {
+        size_t size = qatomic_add_fetch(&as->bounce_buffer_size, l);
+        if (size > as->max_bounce_buffer_size) {
+            size_t excess = size - as->max_bounce_buffer_size;
+            l -= excess;
+            qatomic_sub(&as->bounce_buffer_size, excess);
+        }
+
+        if (l == 0) {
             *plen = 0;
             return NULL;
         }
-        /* Avoid unbounded allocations */
-        l = MIN(l, TARGET_PAGE_SIZE);
-        as->bounce.buffer = qemu_memalign(TARGET_PAGE_SIZE, l);
-        as->bounce.addr = addr;
-        as->bounce.len = l;
 
-        memory_region_ref(mr);
-        as->bounce.mr = mr;
+        BounceBuffer *bounce = g_malloc0(l + sizeof(BounceBuffer));
+        bounce->magic = BOUNCE_BUFFER_MAGIC;
+        bounce->mr = mr;
+        bounce->addr = addr;
+        bounce->len = l;
+
         if (!is_write) {
             flatview_read(fv, addr, MEMTXATTRS_UNSPECIFIED,
-                          as->bounce.buffer, l);
+                          bounce->buffer, l);
         }
 
         *plen = l;
-        return as->bounce.buffer;
+        return bounce->buffer;
     }
 
-
-    memory_region_ref(mr);
     *plen = flatview_extend_translation(fv, addr, len, mr, xlat,
                                         l, is_write, attrs);
     fuzz_dma_read_cb(addr, *plen, mr);
@@ -3119,31 +3138,44 @@  void *address_space_map(AddressSpace *as,
 void address_space_unmap(AddressSpace *as, void *buffer, hwaddr len,
                          bool is_write, hwaddr access_len)
 {
-    if (buffer != as->bounce.buffer) {
-        MemoryRegion *mr;
-        ram_addr_t addr1;
+    MemoryRegion *mr;
+    ram_addr_t addr1;
+
+    mr = memory_region_from_host(buffer, &addr1);
+    if (mr == NULL) {
+        BounceBuffer *bounce = container_of(buffer, BounceBuffer, buffer);
+        if (bounce->magic != BOUNCE_BUFFER_MAGIC) {
+            error_report(
+                "Unmap request for %p, which neither corresponds to a memory "
+                "region, nor looks like a bounce buffer, ignoring!",
+                buffer);
+            return;
+        }
 
-        mr = memory_region_from_host(buffer, &addr1);
-        assert(mr != NULL);
         if (is_write) {
-            invalidate_and_set_dirty(mr, addr1, access_len);
+            address_space_write(as, bounce->addr, MEMTXATTRS_UNSPECIFIED,
+                                bounce->buffer, access_len);
         }
-        if (xen_enabled()) {
-            xen_invalidate_map_cache_entry(buffer);
+
+        memory_region_unref(bounce->mr);
+        uint64_t previous_buffer_size =
+            qatomic_fetch_sub(&as->bounce_buffer_size, bounce->len);
+        if (previous_buffer_size == as->max_bounce_buffer_size) {
+            /* Write bounce_buffer_size before reading map_client_list. */
+            smp_mb();
+            address_space_notify_map_clients(as);
         }
-        memory_region_unref(mr);
+        bounce->magic = ~BOUNCE_BUFFER_MAGIC;
+        g_free(bounce);
         return;
     }
+
+    if (xen_enabled()) {
+        xen_invalidate_map_cache_entry(buffer);
+    }
     if (is_write) {
-        address_space_write(as, as->bounce.addr, MEMTXATTRS_UNSPECIFIED,
-                            as->bounce.buffer, access_len);
-    }
-    qemu_vfree(as->bounce.buffer);
-    as->bounce.buffer = NULL;
-    memory_region_unref(as->bounce.mr);
-    /* Clear in_use before reading map_client_list.  */
-    qatomic_set_mb(&as->bounce.in_use, false);
-    address_space_notify_map_clients(as);
+        invalidate_and_set_dirty(mr, addr1, access_len);
+    }
 }
 
 void *cpu_physical_memory_map(hwaddr addr,