diff mbox series

[v9,2/5] softmmu: Support concurrent bounce buffers

Message ID 20240507094210.300566-3-mnissler@rivosinc.com
State New
Headers show
Series Support message-based DMA in vfio-user server | expand

Commit Message

Mattias Nissler May 7, 2024, 9:42 a.m. UTC
When DMA memory can't be directly accessed, as is the case when
running the device model in a separate process without shareable DMA
file descriptors, bounce buffering is used.

It is not uncommon for device models to request mapping of several DMA
regions at the same time. Examples include:
 * net devices, e.g. when transmitting a packet that is split across
   several TX descriptors (observed with igb)
 * USB host controllers, when handling a packet with multiple data TRBs
   (observed with xhci)

Previously, qemu only provided a single bounce buffer per AddressSpace
and would fail DMA map requests while the buffer was already in use. In
turn, this would cause DMA failures that ultimately manifest as hardware
errors from the guest perspective.

This change allocates DMA bounce buffers dynamically instead of
supporting only a single buffer. Thus, multiple DMA mappings work
correctly also when RAM can't be mmap()-ed.

The total bounce buffer allocation size is limited individually for each
AddressSpace. The default limit is 4096 bytes, matching the previous
maximum buffer size. A new x-max-bounce-buffer-size parameter is
provided to configure the limit for PCI devices.

Signed-off-by: Mattias Nissler <mnissler@rivosinc.com>
---
 hw/pci/pci.c                |  8 ++++
 include/exec/memory.h       | 14 +++----
 include/hw/pci/pci_device.h |  3 ++
 system/memory.c             |  5 ++-
 system/physmem.c            | 82 ++++++++++++++++++++++++++-----------
 5 files changed, 76 insertions(+), 36 deletions(-)

Comments

Philippe Mathieu-Daudé May 7, 2024, 12:57 p.m. UTC | #1
On 7/5/24 11:42, Mattias Nissler wrote:
> When DMA memory can't be directly accessed, as is the case when
> running the device model in a separate process without shareable DMA
> file descriptors, bounce buffering is used.
> 
> It is not uncommon for device models to request mapping of several DMA
> regions at the same time. Examples include:
>   * net devices, e.g. when transmitting a packet that is split across
>     several TX descriptors (observed with igb)
>   * USB host controllers, when handling a packet with multiple data TRBs
>     (observed with xhci)
> 
> Previously, qemu only provided a single bounce buffer per AddressSpace
> and would fail DMA map requests while the buffer was already in use. In
> turn, this would cause DMA failures that ultimately manifest as hardware
> errors from the guest perspective.
> 
> This change allocates DMA bounce buffers dynamically instead of
> supporting only a single buffer. Thus, multiple DMA mappings work
> correctly also when RAM can't be mmap()-ed.
> 
> The total bounce buffer allocation size is limited individually for each
> AddressSpace. The default limit is 4096 bytes, matching the previous
> maximum buffer size. A new x-max-bounce-buffer-size parameter is
> provided to configure the limit for PCI devices.
> 
> Signed-off-by: Mattias Nissler <mnissler@rivosinc.com>
> ---
>   hw/pci/pci.c                |  8 ++++
>   include/exec/memory.h       | 14 +++----
>   include/hw/pci/pci_device.h |  3 ++
>   system/memory.c             |  5 ++-
>   system/physmem.c            | 82 ++++++++++++++++++++++++++-----------
>   5 files changed, 76 insertions(+), 36 deletions(-)


> diff --git a/include/exec/memory.h b/include/exec/memory.h
> index d417d7f363..2ea1e99da2 100644
> --- a/include/exec/memory.h
> +++ b/include/exec/memory.h
> @@ -1117,13 +1117,7 @@ typedef struct AddressSpaceMapClient {
>       QLIST_ENTRY(AddressSpaceMapClient) link;
>   } AddressSpaceMapClient;
>   
> -typedef struct {
> -    MemoryRegion *mr;
> -    void *buffer;
> -    hwaddr addr;
> -    hwaddr len;
> -    bool in_use;
> -} BounceBuffer;
> +#define DEFAULT_MAX_BOUNCE_BUFFER_SIZE (4096)
>   
>   /**
>    * struct AddressSpace: describes a mapping of addresses to #MemoryRegion objects
> @@ -1143,8 +1137,10 @@ struct AddressSpace {
>       QTAILQ_HEAD(, MemoryListener) listeners;
>       QTAILQ_ENTRY(AddressSpace) address_spaces_link;
>   
> -    /* Bounce buffer to use for this address space. */
> -    BounceBuffer bounce;
> +    /* Maximum DMA bounce buffer size used for indirect memory map requests */
> +    uint32_t max_bounce_buffer_size;

Alternatively size_t.

> +    /* Total size of bounce buffers currently allocated, atomically accessed */
> +    uint32_t bounce_buffer_size;

Ditto.

>       /* List of callbacks to invoke when buffers free up */
>       QemuMutex map_client_list_lock;
>       QLIST_HEAD(, AddressSpaceMapClient) map_client_list;
> diff --git a/include/hw/pci/pci_device.h b/include/hw/pci/pci_device.h
> index d3dd0f64b2..253b48a688 100644
> --- a/include/hw/pci/pci_device.h
> +++ b/include/hw/pci/pci_device.h
> @@ -160,6 +160,9 @@ struct PCIDevice {
>       /* ID of standby device in net_failover pair */
>       char *failover_pair_id;
>       uint32_t acpi_index;
> +
> +    /* Maximum DMA bounce buffer size used for indirect memory map requests */
> +    uint32_t max_bounce_buffer_size;

Ditto.

>   };


> diff --git a/system/physmem.c b/system/physmem.c
> index 632da6508a..cd61758da0 100644
> --- a/system/physmem.c
> +++ b/system/physmem.c
> @@ -3046,6 +3046,20 @@ void cpu_flush_icache_range(hwaddr start, hwaddr len)
>                                        NULL, len, FLUSH_CACHE);
>   }
>   
> +/*
> + * A magic value stored in the first 8 bytes of the bounce buffer struct. Used
> + * to detect illegal pointers passed to address_space_unmap.
> + */
> +#define BOUNCE_BUFFER_MAGIC 0xb4017ceb4ffe12ed
> +
> +typedef struct {
> +    uint64_t magic;
> +    MemoryRegion *mr;
> +    hwaddr addr;
> +    uint32_t len;
> +    uint8_t buffer[];
> +} BounceBuffer;

Eh, you moved it back here. Never mind.

> +
>   static void
>   address_space_unregister_map_client_do(AddressSpaceMapClient *client)
>   {
> @@ -3071,9 +3085,9 @@ void address_space_register_map_client(AddressSpace *as, QEMUBH *bh)
>       qemu_mutex_lock(&as->map_client_list_lock);
>       client->bh = bh;
>       QLIST_INSERT_HEAD(&as->map_client_list, client, link);
> -    /* Write map_client_list before reading in_use.  */
> +    /* Write map_client_list before reading bounce_buffer_size. */
>       smp_mb();
> -    if (!qatomic_read(&as->bounce.in_use)) {
> +    if (qatomic_read(&as->bounce_buffer_size) < as->max_bounce_buffer_size) {
>           address_space_notify_map_clients_locked(as);
>       }
>       qemu_mutex_unlock(&as->map_client_list_lock);
> @@ -3203,28 +3217,40 @@ void *address_space_map(AddressSpace *as,
>       mr = flatview_translate(fv, addr, &xlat, &l, is_write, attrs);
>   
>       if (!memory_access_is_direct(mr, is_write)) {
> -        if (qatomic_xchg(&as->bounce.in_use, true)) {
> +        uint32_t used = qatomic_read(&as->bounce_buffer_size);

Nitpicking again, size_t seems clearer. Otherwise LGTM.
Mattias Nissler May 7, 2024, 2:04 p.m. UTC | #2
On Tue, May 7, 2024 at 2:57 PM Philippe Mathieu-Daudé <philmd@linaro.org> wrote:
>
> On 7/5/24 11:42, Mattias Nissler wrote:
> > When DMA memory can't be directly accessed, as is the case when
> > running the device model in a separate process without shareable DMA
> > file descriptors, bounce buffering is used.
> >
> > It is not uncommon for device models to request mapping of several DMA
> > regions at the same time. Examples include:
> >   * net devices, e.g. when transmitting a packet that is split across
> >     several TX descriptors (observed with igb)
> >   * USB host controllers, when handling a packet with multiple data TRBs
> >     (observed with xhci)
> >
> > Previously, qemu only provided a single bounce buffer per AddressSpace
> > and would fail DMA map requests while the buffer was already in use. In
> > turn, this would cause DMA failures that ultimately manifest as hardware
> > errors from the guest perspective.
> >
> > This change allocates DMA bounce buffers dynamically instead of
> > supporting only a single buffer. Thus, multiple DMA mappings work
> > correctly also when RAM can't be mmap()-ed.
> >
> > The total bounce buffer allocation size is limited individually for each
> > AddressSpace. The default limit is 4096 bytes, matching the previous
> > maximum buffer size. A new x-max-bounce-buffer-size parameter is
> > provided to configure the limit for PCI devices.
> >
> > Signed-off-by: Mattias Nissler <mnissler@rivosinc.com>
> > ---
> >   hw/pci/pci.c                |  8 ++++
> >   include/exec/memory.h       | 14 +++----
> >   include/hw/pci/pci_device.h |  3 ++
> >   system/memory.c             |  5 ++-
> >   system/physmem.c            | 82 ++++++++++++++++++++++++++-----------
> >   5 files changed, 76 insertions(+), 36 deletions(-)
>
>
> > diff --git a/include/exec/memory.h b/include/exec/memory.h
> > index d417d7f363..2ea1e99da2 100644
> > --- a/include/exec/memory.h
> > +++ b/include/exec/memory.h
> > @@ -1117,13 +1117,7 @@ typedef struct AddressSpaceMapClient {
> >       QLIST_ENTRY(AddressSpaceMapClient) link;
> >   } AddressSpaceMapClient;
> >
> > -typedef struct {
> > -    MemoryRegion *mr;
> > -    void *buffer;
> > -    hwaddr addr;
> > -    hwaddr len;
> > -    bool in_use;
> > -} BounceBuffer;
> > +#define DEFAULT_MAX_BOUNCE_BUFFER_SIZE (4096)
> >
> >   /**
> >    * struct AddressSpace: describes a mapping of addresses to #MemoryRegion objects
> > @@ -1143,8 +1137,10 @@ struct AddressSpace {
> >       QTAILQ_HEAD(, MemoryListener) listeners;
> >       QTAILQ_ENTRY(AddressSpace) address_spaces_link;
> >
> > -    /* Bounce buffer to use for this address space. */
> > -    BounceBuffer bounce;
> > +    /* Maximum DMA bounce buffer size used for indirect memory map requests */
> > +    uint32_t max_bounce_buffer_size;
>
> Alternatively size_t.

While switching things over, I was surprised to find that
DEFINE_PROP_SIZE wants a uint64_t field rather than a size_t field.
There is a DEFINE_PROP_SIZE32 variant for uint32_t though. Considering
my options, assuming that we want to use size_t for everything other
than the property:

(1) Make PCIDevice::max_bounce_buffer_size size_t and have the
preprocessor select DEFINE_PROP_SIZE/DEFINE_PROP_SIZE32. This makes
the qdev property type depend on the host. Ugh.

(2) Make PCIDevice::max_bounce_buffer_size uint64_t and clamp if
needed when used. Weird to allow larger values that are then clamped,
although it probably doesn't matter in practice since address space is
limited to 4GB anyways.

(3) Make PCIDevice::max_bounce_buffer_size uint32_t and accept the
limitation that the largest bounce buffer limit is 4GB even on 64-bit
hosts.

#3 seemed most pragmatic, so I'll go with that.


>
> > +    /* Total size of bounce buffers currently allocated, atomically accessed */
> > +    uint32_t bounce_buffer_size;
>
> Ditto.
>
> >       /* List of callbacks to invoke when buffers free up */
> >       QemuMutex map_client_list_lock;
> >       QLIST_HEAD(, AddressSpaceMapClient) map_client_list;
> > diff --git a/include/hw/pci/pci_device.h b/include/hw/pci/pci_device.h
> > index d3dd0f64b2..253b48a688 100644
> > --- a/include/hw/pci/pci_device.h
> > +++ b/include/hw/pci/pci_device.h
> > @@ -160,6 +160,9 @@ struct PCIDevice {
> >       /* ID of standby device in net_failover pair */
> >       char *failover_pair_id;
> >       uint32_t acpi_index;
> > +
> > +    /* Maximum DMA bounce buffer size used for indirect memory map requests */
> > +    uint32_t max_bounce_buffer_size;
>
> Ditto.
>
> >   };
>
>
> > diff --git a/system/physmem.c b/system/physmem.c
> > index 632da6508a..cd61758da0 100644
> > --- a/system/physmem.c
> > +++ b/system/physmem.c
> > @@ -3046,6 +3046,20 @@ void cpu_flush_icache_range(hwaddr start, hwaddr len)
> >                                        NULL, len, FLUSH_CACHE);
> >   }
> >
> > +/*
> > + * A magic value stored in the first 8 bytes of the bounce buffer struct. Used
> > + * to detect illegal pointers passed to address_space_unmap.
> > + */
> > +#define BOUNCE_BUFFER_MAGIC 0xb4017ceb4ffe12ed
> > +
> > +typedef struct {
> > +    uint64_t magic;
> > +    MemoryRegion *mr;
> > +    hwaddr addr;
> > +    uint32_t len;
> > +    uint8_t buffer[];
> > +} BounceBuffer;
>
> Eh, you moved it back here. Never mind.
>
> > +
> >   static void
> >   address_space_unregister_map_client_do(AddressSpaceMapClient *client)
> >   {
> > @@ -3071,9 +3085,9 @@ void address_space_register_map_client(AddressSpace *as, QEMUBH *bh)
> >       qemu_mutex_lock(&as->map_client_list_lock);
> >       client->bh = bh;
> >       QLIST_INSERT_HEAD(&as->map_client_list, client, link);
> > -    /* Write map_client_list before reading in_use.  */
> > +    /* Write map_client_list before reading bounce_buffer_size. */
> >       smp_mb();
> > -    if (!qatomic_read(&as->bounce.in_use)) {
> > +    if (qatomic_read(&as->bounce_buffer_size) < as->max_bounce_buffer_size) {
> >           address_space_notify_map_clients_locked(as);
> >       }
> >       qemu_mutex_unlock(&as->map_client_list_lock);
> > @@ -3203,28 +3217,40 @@ void *address_space_map(AddressSpace *as,
> >       mr = flatview_translate(fv, addr, &xlat, &l, is_write, attrs);
> >
> >       if (!memory_access_is_direct(mr, is_write)) {
> > -        if (qatomic_xchg(&as->bounce.in_use, true)) {
> > +        uint32_t used = qatomic_read(&as->bounce_buffer_size);
>
> Nitpicking again, size_t seems clearer. Otherwise LGTM.
Philippe Mathieu-Daudé May 7, 2024, 2:46 p.m. UTC | #3
On 7/5/24 16:04, Mattias Nissler wrote:
> On Tue, May 7, 2024 at 2:57 PM Philippe Mathieu-Daudé <philmd@linaro.org> wrote:
>>
>> On 7/5/24 11:42, Mattias Nissler wrote:
>>> When DMA memory can't be directly accessed, as is the case when
>>> running the device model in a separate process without shareable DMA
>>> file descriptors, bounce buffering is used.
>>>
>>> It is not uncommon for device models to request mapping of several DMA
>>> regions at the same time. Examples include:
>>>    * net devices, e.g. when transmitting a packet that is split across
>>>      several TX descriptors (observed with igb)
>>>    * USB host controllers, when handling a packet with multiple data TRBs
>>>      (observed with xhci)
>>>
>>> Previously, qemu only provided a single bounce buffer per AddressSpace
>>> and would fail DMA map requests while the buffer was already in use. In
>>> turn, this would cause DMA failures that ultimately manifest as hardware
>>> errors from the guest perspective.
>>>
>>> This change allocates DMA bounce buffers dynamically instead of
>>> supporting only a single buffer. Thus, multiple DMA mappings work
>>> correctly also when RAM can't be mmap()-ed.
>>>
>>> The total bounce buffer allocation size is limited individually for each
>>> AddressSpace. The default limit is 4096 bytes, matching the previous
>>> maximum buffer size. A new x-max-bounce-buffer-size parameter is
>>> provided to configure the limit for PCI devices.
>>>
>>> Signed-off-by: Mattias Nissler <mnissler@rivosinc.com>
>>> ---
>>>    hw/pci/pci.c                |  8 ++++
>>>    include/exec/memory.h       | 14 +++----
>>>    include/hw/pci/pci_device.h |  3 ++
>>>    system/memory.c             |  5 ++-
>>>    system/physmem.c            | 82 ++++++++++++++++++++++++++-----------
>>>    5 files changed, 76 insertions(+), 36 deletions(-)


>>>    /**
>>>     * struct AddressSpace: describes a mapping of addresses to #MemoryRegion objects
>>> @@ -1143,8 +1137,10 @@ struct AddressSpace {
>>>        QTAILQ_HEAD(, MemoryListener) listeners;
>>>        QTAILQ_ENTRY(AddressSpace) address_spaces_link;
>>>
>>> -    /* Bounce buffer to use for this address space. */
>>> -    BounceBuffer bounce;
>>> +    /* Maximum DMA bounce buffer size used for indirect memory map requests */
>>> +    uint32_t max_bounce_buffer_size;
>>
>> Alternatively size_t.
> 
> While switching things over, I was surprised to find that
> DEFINE_PROP_SIZE wants a uint64_t field rather than a size_t field.
> There is a DEFINE_PROP_SIZE32 variant for uint32_t though. Considering
> my options, assuming that we want to use size_t for everything other
> than the property:
> 
> (1) Make PCIDevice::max_bounce_buffer_size size_t and have the
> preprocessor select DEFINE_PROP_SIZE/DEFINE_PROP_SIZE32. This makes
> the qdev property type depend on the host. Ugh.
> 
> (2) Make PCIDevice::max_bounce_buffer_size uint64_t and clamp if
> needed when used. Weird to allow larger values that are then clamped,
> although it probably doesn't matter in practice since address space is
> limited to 4GB anyways.
> 
> (3) Make PCIDevice::max_bounce_buffer_size uint32_t and accept the
> limitation that the largest bounce buffer limit is 4GB even on 64-bit
> hosts.
> 
> #3 seemed most pragmatic, so I'll go with that.

LGTM, thanks for updating.

> 
> 
>>
>>> +    /* Total size of bounce buffers currently allocated, atomically accessed */
>>> +    uint32_t bounce_buffer_size;
>>
>> Ditto.
Mattias Nissler May 8, 2024, 6:33 a.m. UTC | #4
On Tue, May 7, 2024 at 4:46 PM Philippe Mathieu-Daudé <philmd@linaro.org> wrote:
>
> On 7/5/24 16:04, Mattias Nissler wrote:
> > On Tue, May 7, 2024 at 2:57 PM Philippe Mathieu-Daudé <philmd@linaro.org> wrote:
> >>
> >> On 7/5/24 11:42, Mattias Nissler wrote:
> >>> When DMA memory can't be directly accessed, as is the case when
> >>> running the device model in a separate process without shareable DMA
> >>> file descriptors, bounce buffering is used.
> >>>
> >>> It is not uncommon for device models to request mapping of several DMA
> >>> regions at the same time. Examples include:
> >>>    * net devices, e.g. when transmitting a packet that is split across
> >>>      several TX descriptors (observed with igb)
> >>>    * USB host controllers, when handling a packet with multiple data TRBs
> >>>      (observed with xhci)
> >>>
> >>> Previously, qemu only provided a single bounce buffer per AddressSpace
> >>> and would fail DMA map requests while the buffer was already in use. In
> >>> turn, this would cause DMA failures that ultimately manifest as hardware
> >>> errors from the guest perspective.
> >>>
> >>> This change allocates DMA bounce buffers dynamically instead of
> >>> supporting only a single buffer. Thus, multiple DMA mappings work
> >>> correctly also when RAM can't be mmap()-ed.
> >>>
> >>> The total bounce buffer allocation size is limited individually for each
> >>> AddressSpace. The default limit is 4096 bytes, matching the previous
> >>> maximum buffer size. A new x-max-bounce-buffer-size parameter is
> >>> provided to configure the limit for PCI devices.
> >>>
> >>> Signed-off-by: Mattias Nissler <mnissler@rivosinc.com>
> >>> ---
> >>>    hw/pci/pci.c                |  8 ++++
> >>>    include/exec/memory.h       | 14 +++----
> >>>    include/hw/pci/pci_device.h |  3 ++
> >>>    system/memory.c             |  5 ++-
> >>>    system/physmem.c            | 82 ++++++++++++++++++++++++++-----------
> >>>    5 files changed, 76 insertions(+), 36 deletions(-)
>
>
> >>>    /**
> >>>     * struct AddressSpace: describes a mapping of addresses to #MemoryRegion objects
> >>> @@ -1143,8 +1137,10 @@ struct AddressSpace {
> >>>        QTAILQ_HEAD(, MemoryListener) listeners;
> >>>        QTAILQ_ENTRY(AddressSpace) address_spaces_link;
> >>>
> >>> -    /* Bounce buffer to use for this address space. */
> >>> -    BounceBuffer bounce;
> >>> +    /* Maximum DMA bounce buffer size used for indirect memory map requests */
> >>> +    uint32_t max_bounce_buffer_size;
> >>
> >> Alternatively size_t.
> >
> > While switching things over, I was surprised to find that
> > DEFINE_PROP_SIZE wants a uint64_t field rather than a size_t field.
> > There is a DEFINE_PROP_SIZE32 variant for uint32_t though. Considering
> > my options, assuming that we want to use size_t for everything other
> > than the property:
> >
> > (1) Make PCIDevice::max_bounce_buffer_size size_t and have the
> > preprocessor select DEFINE_PROP_SIZE/DEFINE_PROP_SIZE32. This makes
> > the qdev property type depend on the host. Ugh.
> >
> > (2) Make PCIDevice::max_bounce_buffer_size uint64_t and clamp if
> > needed when used. Weird to allow larger values that are then clamped,
> > although it probably doesn't matter in practice since address space is
> > limited to 4GB anyways.
> >
> > (3) Make PCIDevice::max_bounce_buffer_size uint32_t and accept the
> > limitation that the largest bounce buffer limit is 4GB even on 64-bit
> > hosts.
> >
> > #3 seemed most pragmatic, so I'll go with that.
>
> LGTM, thanks for updating.

No problem, can I ask you to provide a formal R-B on the v10 #4 patch
[1] then, so the series will be ready to go in?

[1] https://lists.nongnu.org/archive/html/qemu-devel/2024-05/msg01382.html

>
> >
> >
> >>
> >>> +    /* Total size of bounce buffers currently allocated, atomically accessed */
> >>> +    uint32_t bounce_buffer_size;
> >>
> >> Ditto.
>
Mattias Nissler May 13, 2024, 6:29 a.m. UTC | #5
Phil,

Did you accidentally miss this in your pull request [1] or did you
leave it out intentionally?

It's still missing the Reviewed bit after the atomic cmpxchg change
prompted by the size_t switch, so that might be the reason - I'm just
trying to figure out what the next step is for this patch.

Thanks,
Mattias

[1] https://lists.nongnu.org/archive/html/qemu-devel/2024-05/msg01712.html

On Wed, May 8, 2024 at 8:33 AM Mattias Nissler <mnissler@rivosinc.com> wrote:
>
> On Tue, May 7, 2024 at 4:46 PM Philippe Mathieu-Daudé <philmd@linaro.org> wrote:
> >
> > On 7/5/24 16:04, Mattias Nissler wrote:
> > > On Tue, May 7, 2024 at 2:57 PM Philippe Mathieu-Daudé <philmd@linaro.org> wrote:
> > >>
> > >> On 7/5/24 11:42, Mattias Nissler wrote:
> > >>> When DMA memory can't be directly accessed, as is the case when
> > >>> running the device model in a separate process without shareable DMA
> > >>> file descriptors, bounce buffering is used.
> > >>>
> > >>> It is not uncommon for device models to request mapping of several DMA
> > >>> regions at the same time. Examples include:
> > >>>    * net devices, e.g. when transmitting a packet that is split across
> > >>>      several TX descriptors (observed with igb)
> > >>>    * USB host controllers, when handling a packet with multiple data TRBs
> > >>>      (observed with xhci)
> > >>>
> > >>> Previously, qemu only provided a single bounce buffer per AddressSpace
> > >>> and would fail DMA map requests while the buffer was already in use. In
> > >>> turn, this would cause DMA failures that ultimately manifest as hardware
> > >>> errors from the guest perspective.
> > >>>
> > >>> This change allocates DMA bounce buffers dynamically instead of
> > >>> supporting only a single buffer. Thus, multiple DMA mappings work
> > >>> correctly also when RAM can't be mmap()-ed.
> > >>>
> > >>> The total bounce buffer allocation size is limited individually for each
> > >>> AddressSpace. The default limit is 4096 bytes, matching the previous
> > >>> maximum buffer size. A new x-max-bounce-buffer-size parameter is
> > >>> provided to configure the limit for PCI devices.
> > >>>
> > >>> Signed-off-by: Mattias Nissler <mnissler@rivosinc.com>
> > >>> ---
> > >>>    hw/pci/pci.c                |  8 ++++
> > >>>    include/exec/memory.h       | 14 +++----
> > >>>    include/hw/pci/pci_device.h |  3 ++
> > >>>    system/memory.c             |  5 ++-
> > >>>    system/physmem.c            | 82 ++++++++++++++++++++++++++-----------
> > >>>    5 files changed, 76 insertions(+), 36 deletions(-)
> >
> >
> > >>>    /**
> > >>>     * struct AddressSpace: describes a mapping of addresses to #MemoryRegion objects
> > >>> @@ -1143,8 +1137,10 @@ struct AddressSpace {
> > >>>        QTAILQ_HEAD(, MemoryListener) listeners;
> > >>>        QTAILQ_ENTRY(AddressSpace) address_spaces_link;
> > >>>
> > >>> -    /* Bounce buffer to use for this address space. */
> > >>> -    BounceBuffer bounce;
> > >>> +    /* Maximum DMA bounce buffer size used for indirect memory map requests */
> > >>> +    uint32_t max_bounce_buffer_size;
> > >>
> > >> Alternatively size_t.
> > >
> > > While switching things over, I was surprised to find that
> > > DEFINE_PROP_SIZE wants a uint64_t field rather than a size_t field.
> > > There is a DEFINE_PROP_SIZE32 variant for uint32_t though. Considering
> > > my options, assuming that we want to use size_t for everything other
> > > than the property:
> > >
> > > (1) Make PCIDevice::max_bounce_buffer_size size_t and have the
> > > preprocessor select DEFINE_PROP_SIZE/DEFINE_PROP_SIZE32. This makes
> > > the qdev property type depend on the host. Ugh.
> > >
> > > (2) Make PCIDevice::max_bounce_buffer_size uint64_t and clamp if
> > > needed when used. Weird to allow larger values that are then clamped,
> > > although it probably doesn't matter in practice since address space is
> > > limited to 4GB anyways.
> > >
> > > (3) Make PCIDevice::max_bounce_buffer_size uint32_t and accept the
> > > limitation that the largest bounce buffer limit is 4GB even on 64-bit
> > > hosts.
> > >
> > > #3 seemed most pragmatic, so I'll go with that.
> >
> > LGTM, thanks for updating.
>
> No problem, can I ask you to provide a formal R-B on the v10 #4 patch
> [1] then, so the series will be ready to go in?
>
> [1] https://lists.nongnu.org/archive/html/qemu-devel/2024-05/msg01382.html
>
> >
> > >
> > >
> > >>
> > >>> +    /* Total size of bounce buffers currently allocated, atomically accessed */
> > >>> +    uint32_t bounce_buffer_size;
> > >>
> > >> Ditto.
> >
diff mbox series

Patch

diff --git a/hw/pci/pci.c b/hw/pci/pci.c
index 324c1302d2..69934bfbbf 100644
--- a/hw/pci/pci.c
+++ b/hw/pci/pci.c
@@ -85,6 +85,8 @@  static Property pci_props[] = {
                     QEMU_PCIE_ERR_UNC_MASK_BITNR, true),
     DEFINE_PROP_BIT("x-pcie-ari-nextfn-1", PCIDevice, cap_present,
                     QEMU_PCIE_ARI_NEXTFN_1_BITNR, false),
+    DEFINE_PROP_UINT32("x-max-bounce-buffer-size", PCIDevice,
+                       max_bounce_buffer_size, DEFAULT_MAX_BOUNCE_BUFFER_SIZE),
     DEFINE_PROP_END_OF_LIST()
 };
 
@@ -1204,6 +1206,8 @@  static PCIDevice *do_pci_register_device(PCIDevice *pci_dev,
                        "bus master container", UINT64_MAX);
     address_space_init(&pci_dev->bus_master_as,
                        &pci_dev->bus_master_container_region, pci_dev->name);
+    pci_dev->bus_master_as.max_bounce_buffer_size =
+        pci_dev->max_bounce_buffer_size;
 
     if (phase_check(PHASE_MACHINE_READY)) {
         pci_init_bus_master(pci_dev);
@@ -2633,6 +2637,10 @@  static void pci_device_class_init(ObjectClass *klass, void *data)
     k->unrealize = pci_qdev_unrealize;
     k->bus_type = TYPE_PCI_BUS;
     device_class_set_props(k, pci_props);
+    object_class_property_set_description(
+        klass, "x-max-bounce-buffer-size",
+        "Maximum buffer size allocated for bounce buffers used for mapped "
+        "access to indirect DMA memory");
 }
 
 static void pci_device_class_base_init(ObjectClass *klass, void *data)
diff --git a/include/exec/memory.h b/include/exec/memory.h
index d417d7f363..2ea1e99da2 100644
--- a/include/exec/memory.h
+++ b/include/exec/memory.h
@@ -1117,13 +1117,7 @@  typedef struct AddressSpaceMapClient {
     QLIST_ENTRY(AddressSpaceMapClient) link;
 } AddressSpaceMapClient;
 
-typedef struct {
-    MemoryRegion *mr;
-    void *buffer;
-    hwaddr addr;
-    hwaddr len;
-    bool in_use;
-} BounceBuffer;
+#define DEFAULT_MAX_BOUNCE_BUFFER_SIZE (4096)
 
 /**
  * struct AddressSpace: describes a mapping of addresses to #MemoryRegion objects
@@ -1143,8 +1137,10 @@  struct AddressSpace {
     QTAILQ_HEAD(, MemoryListener) listeners;
     QTAILQ_ENTRY(AddressSpace) address_spaces_link;
 
-    /* Bounce buffer to use for this address space. */
-    BounceBuffer bounce;
+    /* Maximum DMA bounce buffer size used for indirect memory map requests */
+    uint32_t max_bounce_buffer_size;
+    /* Total size of bounce buffers currently allocated, atomically accessed */
+    uint32_t bounce_buffer_size;
     /* List of callbacks to invoke when buffers free up */
     QemuMutex map_client_list_lock;
     QLIST_HEAD(, AddressSpaceMapClient) map_client_list;
diff --git a/include/hw/pci/pci_device.h b/include/hw/pci/pci_device.h
index d3dd0f64b2..253b48a688 100644
--- a/include/hw/pci/pci_device.h
+++ b/include/hw/pci/pci_device.h
@@ -160,6 +160,9 @@  struct PCIDevice {
     /* ID of standby device in net_failover pair */
     char *failover_pair_id;
     uint32_t acpi_index;
+
+    /* Maximum DMA bounce buffer size used for indirect memory map requests */
+    uint32_t max_bounce_buffer_size;
 };
 
 static inline int pci_intx(PCIDevice *pci_dev)
diff --git a/system/memory.c b/system/memory.c
index 642a449f8c..c288ed354a 100644
--- a/system/memory.c
+++ b/system/memory.c
@@ -3174,7 +3174,8 @@  void address_space_init(AddressSpace *as, MemoryRegion *root, const char *name)
     as->ioeventfds = NULL;
     QTAILQ_INIT(&as->listeners);
     QTAILQ_INSERT_TAIL(&address_spaces, as, address_spaces_link);
-    as->bounce.in_use = false;
+    as->max_bounce_buffer_size = DEFAULT_MAX_BOUNCE_BUFFER_SIZE;
+    as->bounce_buffer_size = 0;
     qemu_mutex_init(&as->map_client_list_lock);
     QLIST_INIT(&as->map_client_list);
     as->name = g_strdup(name ? name : "anonymous");
@@ -3184,7 +3185,7 @@  void address_space_init(AddressSpace *as, MemoryRegion *root, const char *name)
 
 static void do_address_space_destroy(AddressSpace *as)
 {
-    assert(!qatomic_read(&as->bounce.in_use));
+    assert(qatomic_read(&as->bounce_buffer_size) == 0);
     assert(QLIST_EMPTY(&as->map_client_list));
     qemu_mutex_destroy(&as->map_client_list_lock);
 
diff --git a/system/physmem.c b/system/physmem.c
index 632da6508a..cd61758da0 100644
--- a/system/physmem.c
+++ b/system/physmem.c
@@ -3046,6 +3046,20 @@  void cpu_flush_icache_range(hwaddr start, hwaddr len)
                                      NULL, len, FLUSH_CACHE);
 }
 
+/*
+ * A magic value stored in the first 8 bytes of the bounce buffer struct. Used
+ * to detect illegal pointers passed to address_space_unmap.
+ */
+#define BOUNCE_BUFFER_MAGIC 0xb4017ceb4ffe12ed
+
+typedef struct {
+    uint64_t magic;
+    MemoryRegion *mr;
+    hwaddr addr;
+    uint32_t len;
+    uint8_t buffer[];
+} BounceBuffer;
+
 static void
 address_space_unregister_map_client_do(AddressSpaceMapClient *client)
 {
@@ -3071,9 +3085,9 @@  void address_space_register_map_client(AddressSpace *as, QEMUBH *bh)
     qemu_mutex_lock(&as->map_client_list_lock);
     client->bh = bh;
     QLIST_INSERT_HEAD(&as->map_client_list, client, link);
-    /* Write map_client_list before reading in_use.  */
+    /* Write map_client_list before reading bounce_buffer_size. */
     smp_mb();
-    if (!qatomic_read(&as->bounce.in_use)) {
+    if (qatomic_read(&as->bounce_buffer_size) < as->max_bounce_buffer_size) {
         address_space_notify_map_clients_locked(as);
     }
     qemu_mutex_unlock(&as->map_client_list_lock);
@@ -3203,28 +3217,40 @@  void *address_space_map(AddressSpace *as,
     mr = flatview_translate(fv, addr, &xlat, &l, is_write, attrs);
 
     if (!memory_access_is_direct(mr, is_write)) {
-        if (qatomic_xchg(&as->bounce.in_use, true)) {
+        uint32_t used = qatomic_read(&as->bounce_buffer_size);
+        for (;;) {
+            hwaddr alloc = MIN(as->max_bounce_buffer_size - used, l);
+            uint32_t new_size = used + alloc;
+            uint32_t actual =
+                qatomic_cmpxchg(&as->bounce_buffer_size, used, new_size);
+            if (actual == used) {
+                l = alloc;
+                break;
+            }
+            used = actual;
+        }
+
+        if (l == 0) {
             *plen = 0;
             return NULL;
         }
-        /* Avoid unbounded allocations */
-        l = MIN(l, TARGET_PAGE_SIZE);
-        as->bounce.buffer = qemu_memalign(TARGET_PAGE_SIZE, l);
-        as->bounce.addr = addr;
-        as->bounce.len = l;
 
+        BounceBuffer *bounce = g_malloc0(l + sizeof(BounceBuffer));
+        bounce->magic = BOUNCE_BUFFER_MAGIC;
         memory_region_ref(mr);
-        as->bounce.mr = mr;
+        bounce->mr = mr;
+        bounce->addr = addr;
+        bounce->len = l;
+
         if (!is_write) {
             flatview_read(fv, addr, MEMTXATTRS_UNSPECIFIED,
-                          as->bounce.buffer, l);
+                          bounce->buffer, l);
         }
 
         *plen = l;
-        return as->bounce.buffer;
+        return bounce->buffer;
     }
 
-
     memory_region_ref(mr);
     *plen = flatview_extend_translation(fv, addr, len, mr, xlat,
                                         l, is_write, attrs);
@@ -3239,12 +3265,11 @@  void *address_space_map(AddressSpace *as,
 void address_space_unmap(AddressSpace *as, void *buffer, hwaddr len,
                          bool is_write, hwaddr access_len)
 {
-    if (buffer != as->bounce.buffer) {
-        MemoryRegion *mr;
-        ram_addr_t addr1;
+    MemoryRegion *mr;
+    ram_addr_t addr1;
 
-        mr = memory_region_from_host(buffer, &addr1);
-        assert(mr != NULL);
+    mr = memory_region_from_host(buffer, &addr1);
+    if (mr != NULL) {
         if (is_write) {
             invalidate_and_set_dirty(mr, addr1, access_len);
         }
@@ -3254,15 +3279,22 @@  void address_space_unmap(AddressSpace *as, void *buffer, hwaddr len,
         memory_region_unref(mr);
         return;
     }
+
+
+    BounceBuffer *bounce = container_of(buffer, BounceBuffer, buffer);
+    assert(bounce->magic == BOUNCE_BUFFER_MAGIC);
+
     if (is_write) {
-        address_space_write(as, as->bounce.addr, MEMTXATTRS_UNSPECIFIED,
-                            as->bounce.buffer, access_len);
-    }
-    qemu_vfree(as->bounce.buffer);
-    as->bounce.buffer = NULL;
-    memory_region_unref(as->bounce.mr);
-    /* Clear in_use before reading map_client_list.  */
-    qatomic_set_mb(&as->bounce.in_use, false);
+        address_space_write(as, bounce->addr, MEMTXATTRS_UNSPECIFIED,
+                            bounce->buffer, access_len);
+    }
+
+    qatomic_sub(&as->bounce_buffer_size, bounce->len);
+    bounce->magic = ~BOUNCE_BUFFER_MAGIC;
+    memory_region_unref(bounce->mr);
+    g_free(bounce);
+    /* Write bounce_buffer_size before reading map_client_list. */
+    smp_mb();
     address_space_notify_map_clients(as);
 }