diff mbox series

[v2,2/3] s390x/pci: Honor DMA limits set by vfio

Message ID 1600122570-12941-3-git-send-email-mjrosato@linux.ibm.com
State New
Headers show
Series s390x/pci: Accomodate vfio DMA limiting | expand

Commit Message

Matthew Rosato Sept. 14, 2020, 10:29 p.m. UTC
When an s390 guest is using lazy unmapping, it can result in a very
large number of oustanding DMA requests, far beyond the default
limit configured for vfio.  Let's track DMA usage similar to vfio
in the host, and trigger the guest to flush their DMA mappings
before vfio runs out.

Signed-off-by: Matthew Rosato <mjrosato@linux.ibm.com>
---
 hw/s390x/s390-pci-bus.c  | 99 +++++++++++++++++++++++++++++++++++++++++++++---
 hw/s390x/s390-pci-bus.h  |  9 +++++
 hw/s390x/s390-pci-inst.c | 29 +++++++++++---
 hw/s390x/s390-pci-inst.h |  3 ++
 4 files changed, 129 insertions(+), 11 deletions(-)

Comments

Cornelia Huck Sept. 15, 2020, 11:28 a.m. UTC | #1
On Mon, 14 Sep 2020 18:29:29 -0400
Matthew Rosato <mjrosato@linux.ibm.com> wrote:

> When an s390 guest is using lazy unmapping, it can result in a very
> large number of oustanding DMA requests, far beyond the default
> limit configured for vfio.  Let's track DMA usage similar to vfio
> in the host, and trigger the guest to flush their DMA mappings
> before vfio runs out.
> 
> Signed-off-by: Matthew Rosato <mjrosato@linux.ibm.com>
> ---
>  hw/s390x/s390-pci-bus.c  | 99 +++++++++++++++++++++++++++++++++++++++++++++---
>  hw/s390x/s390-pci-bus.h  |  9 +++++
>  hw/s390x/s390-pci-inst.c | 29 +++++++++++---
>  hw/s390x/s390-pci-inst.h |  3 ++
>  4 files changed, 129 insertions(+), 11 deletions(-)
> 

(...)

> @@ -737,6 +742,82 @@ static void s390_pci_iommu_free(S390pciState *s, PCIBus *bus, int32_t devfn)
>      object_unref(OBJECT(iommu));
>  }
>  
> +static bool s390_sync_dma_avail(int fd, unsigned int *avail)

Not sure I like the name. It sounds like the function checks whether
"sync dma" is available. Maybe s390_update_dma_avail()?

> +{
> +    struct vfio_iommu_type1_info *info;
> +    uint32_t argsz;
> +    bool rval = false;
> +    int ret;
> +
> +    if (avail == NULL) {
> +        return false;
> +    }
> +
> +    argsz = sizeof(struct vfio_iommu_type1_info);
> +    info = g_malloc0(argsz);
> +    info->argsz = argsz;
> +    /*
> +     * If the specified argsz is not large enough to contain all
> +     * capabilities it will be updated upon return.  In this case
> +     * use the updated value to get the entire capability chain.
> +     */
> +    ret = ioctl(fd, VFIO_IOMMU_GET_INFO, info);
> +    if (argsz != info->argsz) {
> +        argsz = info->argsz;
> +        info = g_realloc(info, argsz);
> +        info->argsz = argsz;
> +        ret = ioctl(fd, VFIO_IOMMU_GET_INFO, info);
> +    }
> +
> +    if (ret) {
> +        goto out;
> +    }
> +
> +    /* If the capability exists, update with the current value */
> +    rval = vfio_get_info_dma_avail(info, avail);

Adding vfio specific things into the generic s390 pci emulation code
looks a bit ugly... I'd prefer to factor that out into a separate file,
especially if you plan to add more vfio-specific things in the future.

> +
> +out:
> +    g_free(info);
> +    return rval;
> +}
> +

(...)

> diff --git a/hw/s390x/s390-pci-inst.c b/hw/s390x/s390-pci-inst.c
> index 2f7a7d7..6af9af4 100644
> --- a/hw/s390x/s390-pci-inst.c
> +++ b/hw/s390x/s390-pci-inst.c
> @@ -32,6 +32,9 @@
>          }                                                          \
>      } while (0)
>  
> +#define INC_DMA_AVAIL(iommu) if (iommu->dma_limit) iommu->dma_limit->avail++;
> +#define DEC_DMA_AVAIL(iommu) if (iommu->dma_limit) iommu->dma_limit->avail--;

Hm... maybe lowercase inline functions might be better here?

> +
>  static void s390_set_status_code(CPUS390XState *env,
>                                   uint8_t r, uint64_t status_code)
>  {

(...)

> @@ -620,6 +629,7 @@ int rpcit_service_call(S390CPU *cpu, uint8_t r1, uint8_t r2, uintptr_t ra)
>      S390PCIIOMMU *iommu;
>      S390IOTLBEntry entry;
>      hwaddr start, end;
> +    uint32_t dma_avail;
>  
>      if (env->psw.mask & PSW_MASK_PSTATE) {
>          s390_program_interrupt(env, PGM_PRIVILEGED, ra);
> @@ -675,8 +685,9 @@ int rpcit_service_call(S390CPU *cpu, uint8_t r1, uint8_t r2, uintptr_t ra)
>          }
>  
>          start += entry.len;
> -        while (entry.iova < start && entry.iova < end) {
> -            s390_pci_update_iotlb(iommu, &entry);
> +        dma_avail = 1; /* Assume non-zero dma_avail to start */
> +        while (entry.iova < start && entry.iova < end && dma_avail > 0) {
> +            dma_avail = s390_pci_update_iotlb(iommu, &entry);
>              entry.iova += PAGE_SIZE;
>              entry.translated_addr += PAGE_SIZE;
>          }
> @@ -689,7 +700,13 @@ err:
>          s390_pci_generate_error_event(error, pbdev->fh, pbdev->fid, start, 0);
>      } else {
>          pbdev->fmb.counter[ZPCI_FMB_CNT_RPCIT]++;
> -        setcc(cpu, ZPCI_PCI_LS_OK);
> +        if (dma_avail > 0) {

When I compile this (with a headers update), the compiler moans here
about an uninitialized variable.

> +            setcc(cpu, ZPCI_PCI_LS_OK);
> +        } else {
> +            /* vfio DMA mappings are exhausted, trigger a RPCIT */
> +            setcc(cpu, ZPCI_PCI_LS_ERR);
> +            s390_set_status_code(env, r1, ZPCI_RPCIT_ST_INSUFF_RES);
> +        }
>      }
>      return 0;
>  }

(...)
Thomas Huth Sept. 15, 2020, 12:54 p.m. UTC | #2
On 15/09/2020 00.29, Matthew Rosato wrote:
> When an s390 guest is using lazy unmapping, it can result in a very
> large number of oustanding DMA requests, far beyond the default
> limit configured for vfio.  Let's track DMA usage similar to vfio
> in the host, and trigger the guest to flush their DMA mappings
> before vfio runs out.
> 
> Signed-off-by: Matthew Rosato <mjrosato@linux.ibm.com>
> ---
>  hw/s390x/s390-pci-bus.c  | 99 +++++++++++++++++++++++++++++++++++++++++++++---
>  hw/s390x/s390-pci-bus.h  |  9 +++++
>  hw/s390x/s390-pci-inst.c | 29 +++++++++++---
>  hw/s390x/s390-pci-inst.h |  3 ++
>  4 files changed, 129 insertions(+), 11 deletions(-)
> 
> diff --git a/hw/s390x/s390-pci-bus.c b/hw/s390x/s390-pci-bus.c
> index 92146a2..23474cd 100644
> --- a/hw/s390x/s390-pci-bus.c
> +++ b/hw/s390x/s390-pci-bus.c
> @@ -11,6 +11,8 @@
>   * directory.
>   */
>  
> +#include <sys/ioctl.h>
> +
>  #include "qemu/osdep.h"
>  #include "qapi/error.h"
>  #include "qapi/visitor.h"
> @@ -24,6 +26,9 @@
>  #include "qemu/error-report.h"
>  #include "qemu/module.h"
>  
> +#include "hw/vfio/pci.h"
> +#include "hw/vfio/vfio-common.h"
> +
>  #ifndef DEBUG_S390PCI_BUS
>  #define DEBUG_S390PCI_BUS  0
>  #endif
> @@ -737,6 +742,82 @@ static void s390_pci_iommu_free(S390pciState *s, PCIBus *bus, int32_t devfn)
>      object_unref(OBJECT(iommu));
>  }
>  
> +static bool s390_sync_dma_avail(int fd, unsigned int *avail)
> +{
> +    struct vfio_iommu_type1_info *info;

You could use g_autofree to get rid of the g_free() at the end.

> +    uint32_t argsz;
> +    bool rval = false;
> +    int ret;
> +
> +    if (avail == NULL) {
> +        return false;
> +    }

Since this is a "static" local function, and calling it with avail ==
NULL does not make too much sense, I think I'd rather turn this into an
assert() instead.

> +    argsz = sizeof(struct vfio_iommu_type1_info);
> +    info = g_malloc0(argsz);
> +    info->argsz = argsz;
> +    /*
> +     * If the specified argsz is not large enough to contain all
> +     * capabilities it will be updated upon return.  In this case
> +     * use the updated value to get the entire capability chain.
> +     */
> +    ret = ioctl(fd, VFIO_IOMMU_GET_INFO, info);
> +    if (argsz != info->argsz) {
> +        argsz = info->argsz;
> +        info = g_realloc(info, argsz);
> +        info->argsz = argsz;
> +        ret = ioctl(fd, VFIO_IOMMU_GET_INFO, info);
> +    }
> +
> +    if (ret) {
> +        goto out;
> +    }
> +
> +    /* If the capability exists, update with the current value */
> +    rval = vfio_get_info_dma_avail(info, avail);
> +
> +out:
> +    g_free(info);
> +    return rval;
> +}
> +
> +static S390PCIDMACount *s390_start_dma_count(S390pciState *s, VFIODevice *vdev)
> +{
> +    int id = vdev->group->container->fd;
> +    S390PCIDMACount *cnt;
> +    uint32_t avail;
> +
> +    if (!s390_sync_dma_avail(id, &avail)) {
> +        return NULL;
> +    }
> +
> +    QTAILQ_FOREACH(cnt, &s->zpci_dma_limit, link) {
> +        if (cnt->id  == id) {
> +            cnt->users++;
> +            return cnt;
> +        }
> +    }
> +
> +    cnt = g_new0(S390PCIDMACount, 1);
> +    cnt->id = id;
> +    cnt->users = 1;
> +    cnt->avail = avail;
> +    QTAILQ_INSERT_TAIL(&s->zpci_dma_limit, cnt, link);
> +    return cnt;
> +}
> +
> +static void s390_end_dma_count(S390pciState *s, S390PCIDMACount *cnt)
> +{
> +    if (cnt == NULL) {
> +        return;
> +    }

Either use assert() or drop this completely (since you're checking it at
the caller site already).

> +    cnt->users--;
> +    if (cnt->users == 0) {
> +        QTAILQ_REMOVE(&s->zpci_dma_limit, cnt, link);
> +    }
> +}
> +
>  static void s390_pcihost_realize(DeviceState *dev, Error **errp)
>  {
>      PCIBus *b;
> @@ -764,6 +845,7 @@ static void s390_pcihost_realize(DeviceState *dev, Error **errp)
>      s->bus_no = 0;
>      QTAILQ_INIT(&s->pending_sei);
>      QTAILQ_INIT(&s->zpci_devs);
> +    QTAILQ_INIT(&s->zpci_dma_limit);
>  
>      css_register_io_adapters(CSS_IO_ADAPTER_PCI, true, false,
>                               S390_ADAPTER_SUPPRESSIBLE, errp);
> @@ -902,6 +984,7 @@ static void s390_pcihost_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
>  {
>      S390pciState *s = S390_PCI_HOST_BRIDGE(hotplug_dev);
>      PCIDevice *pdev = NULL;
> +    VFIOPCIDevice *vpdev = NULL;
>      S390PCIBusDevice *pbdev = NULL;
>  
>      if (object_dynamic_cast(OBJECT(dev), TYPE_PCI_BRIDGE)) {
> @@ -941,17 +1024,20 @@ static void s390_pcihost_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
>              }
>          }
>  
> +        pbdev->pdev = pdev;
> +        pbdev->iommu = s390_pci_get_iommu(s, pci_get_bus(pdev), pdev->devfn);
> +        pbdev->iommu->pbdev = pbdev;
> +        pbdev->state = ZPCI_FS_DISABLED;
> +
>          if (object_dynamic_cast(OBJECT(dev), "vfio-pci")) {
>              pbdev->fh |= FH_SHM_VFIO;
> +            vpdev = container_of(pbdev->pdev, VFIOPCIDevice, pdev);
> +            pbdev->iommu->dma_limit = s390_start_dma_count(s,
> +                                                           &vpdev->vbasedev);
>          } else {
>              pbdev->fh |= FH_SHM_EMUL;
>          }
>  
> -        pbdev->pdev = pdev;
> -        pbdev->iommu = s390_pci_get_iommu(s, pci_get_bus(pdev), pdev->devfn);
> -        pbdev->iommu->pbdev = pbdev;
> -        pbdev->state = ZPCI_FS_DISABLED;
> -
>          if (s390_pci_msix_init(pbdev)) {
>              error_setg(errp, "MSI-X support is mandatory "
>                         "in the S390 architecture");
> @@ -1004,6 +1090,9 @@ static void s390_pcihost_unplug(HotplugHandler *hotplug_dev, DeviceState *dev,
>          pbdev->fid = 0;
>          QTAILQ_REMOVE(&s->zpci_devs, pbdev, link);
>          g_hash_table_remove(s->zpci_table, &pbdev->idx);
> +        if (pbdev->iommu->dma_limit) {
> +            s390_end_dma_count(s, pbdev->iommu->dma_limit);
> +        }
>          qdev_unrealize(dev);
>      }
>  }

 Thomas
Matthew Rosato Sept. 15, 2020, 2:16 p.m. UTC | #3
On 9/15/20 7:28 AM, Cornelia Huck wrote:
> On Mon, 14 Sep 2020 18:29:29 -0400
> Matthew Rosato <mjrosato@linux.ibm.com> wrote:
> 
>> When an s390 guest is using lazy unmapping, it can result in a very
>> large number of oustanding DMA requests, far beyond the default
>> limit configured for vfio.  Let's track DMA usage similar to vfio
>> in the host, and trigger the guest to flush their DMA mappings
>> before vfio runs out.
>>
>> Signed-off-by: Matthew Rosato <mjrosato@linux.ibm.com>
>> ---
>>   hw/s390x/s390-pci-bus.c  | 99 +++++++++++++++++++++++++++++++++++++++++++++---
>>   hw/s390x/s390-pci-bus.h  |  9 +++++
>>   hw/s390x/s390-pci-inst.c | 29 +++++++++++---
>>   hw/s390x/s390-pci-inst.h |  3 ++
>>   4 files changed, 129 insertions(+), 11 deletions(-)
>>
> 
> (...)
> 
>> @@ -737,6 +742,82 @@ static void s390_pci_iommu_free(S390pciState *s, PCIBus *bus, int32_t devfn)
>>       object_unref(OBJECT(iommu));
>>   }
>>   
>> +static bool s390_sync_dma_avail(int fd, unsigned int *avail)
> 
> Not sure I like the name. It sounds like the function checks whether
> "sync dma" is available. Maybe s390_update_dma_avail()?
> 

Sounds fine to me.

>> +{
>> +    struct vfio_iommu_type1_info *info;
>> +    uint32_t argsz;
>> +    bool rval = false;
>> +    int ret;
>> +
>> +    if (avail == NULL) {
>> +        return false;
>> +    }
>> +
>> +    argsz = sizeof(struct vfio_iommu_type1_info);
>> +    info = g_malloc0(argsz);
>> +    info->argsz = argsz;
>> +    /*
>> +     * If the specified argsz is not large enough to contain all
>> +     * capabilities it will be updated upon return.  In this case
>> +     * use the updated value to get the entire capability chain.
>> +     */
>> +    ret = ioctl(fd, VFIO_IOMMU_GET_INFO, info);
>> +    if (argsz != info->argsz) {
>> +        argsz = info->argsz;
>> +        info = g_realloc(info, argsz);
>> +        info->argsz = argsz;
>> +        ret = ioctl(fd, VFIO_IOMMU_GET_INFO, info);
>> +    }
>> +
>> +    if (ret) {
>> +        goto out;
>> +    }
>> +
>> +    /* If the capability exists, update with the current value */
>> +    rval = vfio_get_info_dma_avail(info, avail);
> 
> Adding vfio specific things into the generic s390 pci emulation code
> looks a bit ugly... I'd prefer to factor that out into a separate file,
> especially if you plan to add more vfio-specific things in the future.
> 

Fair.   hw/s390x/s390-pci-vfio.* ?

>> +
>> +out:
>> +    g_free(info);
>> +    return rval;
>> +}
>> +
> 
> (...)
> 
>> diff --git a/hw/s390x/s390-pci-inst.c b/hw/s390x/s390-pci-inst.c
>> index 2f7a7d7..6af9af4 100644
>> --- a/hw/s390x/s390-pci-inst.c
>> +++ b/hw/s390x/s390-pci-inst.c
>> @@ -32,6 +32,9 @@
>>           }                                                          \
>>       } while (0)
>>   
>> +#define INC_DMA_AVAIL(iommu) if (iommu->dma_limit) iommu->dma_limit->avail++;
>> +#define DEC_DMA_AVAIL(iommu) if (iommu->dma_limit) iommu->dma_limit->avail--;
> 
> Hm... maybe lowercase inline functions might be better here?
> 

OK

>> +
>>   static void s390_set_status_code(CPUS390XState *env,
>>                                    uint8_t r, uint64_t status_code)
>>   {
> 
> (...)
> 
>> @@ -620,6 +629,7 @@ int rpcit_service_call(S390CPU *cpu, uint8_t r1, uint8_t r2, uintptr_t ra)
>>       S390PCIIOMMU *iommu;
>>       S390IOTLBEntry entry;
>>       hwaddr start, end;
>> +    uint32_t dma_avail;
>>   
>>       if (env->psw.mask & PSW_MASK_PSTATE) {
>>           s390_program_interrupt(env, PGM_PRIVILEGED, ra);
>> @@ -675,8 +685,9 @@ int rpcit_service_call(S390CPU *cpu, uint8_t r1, uint8_t r2, uintptr_t ra)
>>           }
>>   
>>           start += entry.len;
>> -        while (entry.iova < start && entry.iova < end) {
>> -            s390_pci_update_iotlb(iommu, &entry);
>> +        dma_avail = 1; /* Assume non-zero dma_avail to start */
>> +        while (entry.iova < start && entry.iova < end && dma_avail > 0) {
>> +            dma_avail = s390_pci_update_iotlb(iommu, &entry);
>>               entry.iova += PAGE_SIZE;
>>               entry.translated_addr += PAGE_SIZE;
>>           }
>> @@ -689,7 +700,13 @@ err:
>>           s390_pci_generate_error_event(error, pbdev->fh, pbdev->fid, start, 0);
>>       } else {
>>           pbdev->fmb.counter[ZPCI_FMB_CNT_RPCIT]++;
>> -        setcc(cpu, ZPCI_PCI_LS_OK);
>> +        if (dma_avail > 0) {
> 
> When I compile this (with a headers update), the compiler moans here
> about an uninitialized variable.
> 

D'oh.  Obviously dma_avail needs to be initialized outside of the 
if/else -- I'll double-check the logic here and fix.

>> +            setcc(cpu, ZPCI_PCI_LS_OK);
>> +        } else {
>> +            /* vfio DMA mappings are exhausted, trigger a RPCIT */
>> +            setcc(cpu, ZPCI_PCI_LS_ERR);
>> +            s390_set_status_code(env, r1, ZPCI_RPCIT_ST_INSUFF_RES);
>> +        }
>>       }
>>       return 0;
>>   }
> 
> (...)
>
Matthew Rosato Sept. 15, 2020, 2:18 p.m. UTC | #4
On 9/15/20 8:54 AM, Thomas Huth wrote:
> On 15/09/2020 00.29, Matthew Rosato wrote:
>> When an s390 guest is using lazy unmapping, it can result in a very
>> large number of oustanding DMA requests, far beyond the default
>> limit configured for vfio.  Let's track DMA usage similar to vfio
>> in the host, and trigger the guest to flush their DMA mappings
>> before vfio runs out.
>>
>> Signed-off-by: Matthew Rosato <mjrosato@linux.ibm.com>
>> ---
>>   hw/s390x/s390-pci-bus.c  | 99 +++++++++++++++++++++++++++++++++++++++++++++---
>>   hw/s390x/s390-pci-bus.h  |  9 +++++
>>   hw/s390x/s390-pci-inst.c | 29 +++++++++++---
>>   hw/s390x/s390-pci-inst.h |  3 ++
>>   4 files changed, 129 insertions(+), 11 deletions(-)
>>
>> diff --git a/hw/s390x/s390-pci-bus.c b/hw/s390x/s390-pci-bus.c
>> index 92146a2..23474cd 100644
>> --- a/hw/s390x/s390-pci-bus.c
>> +++ b/hw/s390x/s390-pci-bus.c
>> @@ -11,6 +11,8 @@
>>    * directory.
>>    */
>>   
>> +#include <sys/ioctl.h>
>> +
>>   #include "qemu/osdep.h"
>>   #include "qapi/error.h"
>>   #include "qapi/visitor.h"
>> @@ -24,6 +26,9 @@
>>   #include "qemu/error-report.h"
>>   #include "qemu/module.h"
>>   
>> +#include "hw/vfio/pci.h"
>> +#include "hw/vfio/vfio-common.h"
>> +
>>   #ifndef DEBUG_S390PCI_BUS
>>   #define DEBUG_S390PCI_BUS  0
>>   #endif
>> @@ -737,6 +742,82 @@ static void s390_pci_iommu_free(S390pciState *s, PCIBus *bus, int32_t devfn)
>>       object_unref(OBJECT(iommu));
>>   }
>>   
>> +static bool s390_sync_dma_avail(int fd, unsigned int *avail)
>> +{
>> +    struct vfio_iommu_type1_info *info;
> 
> You could use g_autofree to get rid of the g_free() at the end.
> 

OK

>> +    uint32_t argsz;
>> +    bool rval = false;
>> +    int ret;
>> +
>> +    if (avail == NULL) {
>> +        return false;
>> +    }
> 
> Since this is a "static" local function, and calling it with avail ==
> NULL does not make too much sense, I think I'd rather turn this into an
> assert() instead. >

Sure, sounds good.


>> +    argsz = sizeof(struct vfio_iommu_type1_info);
>> +    info = g_malloc0(argsz);
>> +    info->argsz = argsz;
>> +    /*
>> +     * If the specified argsz is not large enough to contain all
>> +     * capabilities it will be updated upon return.  In this case
>> +     * use the updated value to get the entire capability chain.
>> +     */
>> +    ret = ioctl(fd, VFIO_IOMMU_GET_INFO, info);
>> +    if (argsz != info->argsz) {
>> +        argsz = info->argsz;
>> +        info = g_realloc(info, argsz);
>> +        info->argsz = argsz;
>> +        ret = ioctl(fd, VFIO_IOMMU_GET_INFO, info);
>> +    }
>> +
>> +    if (ret) {
>> +        goto out;
>> +    }
>> +
>> +    /* If the capability exists, update with the current value */
>> +    rval = vfio_get_info_dma_avail(info, avail);
>> +
>> +out:
>> +    g_free(info);
>> +    return rval;
>> +}
>> +
>> +static S390PCIDMACount *s390_start_dma_count(S390pciState *s, VFIODevice *vdev)
>> +{
>> +    int id = vdev->group->container->fd;
>> +    S390PCIDMACount *cnt;
>> +    uint32_t avail;
>> +
>> +    if (!s390_sync_dma_avail(id, &avail)) {
>> +        return NULL;
>> +    }
>> +
>> +    QTAILQ_FOREACH(cnt, &s->zpci_dma_limit, link) {
>> +        if (cnt->id  == id) {
>> +            cnt->users++;
>> +            return cnt;
>> +        }
>> +    }
>> +
>> +    cnt = g_new0(S390PCIDMACount, 1);
>> +    cnt->id = id;
>> +    cnt->users = 1;
>> +    cnt->avail = avail;
>> +    QTAILQ_INSERT_TAIL(&s->zpci_dma_limit, cnt, link);
>> +    return cnt;
>> +}
>> +
>> +static void s390_end_dma_count(S390pciState *s, S390PCIDMACount *cnt)
>> +{
>> +    if (cnt == NULL) {
>> +        return;
>> +    }
> 
> Either use assert() or drop this completely (since you're checking it at
> the caller site already).
> 

Fair - I'll assert() here.  Thanks!
Cornelia Huck Sept. 15, 2020, 2:50 p.m. UTC | #5
On Tue, 15 Sep 2020 10:16:23 -0400
Matthew Rosato <mjrosato@linux.ibm.com> wrote:

> On 9/15/20 7:28 AM, Cornelia Huck wrote:
> > On Mon, 14 Sep 2020 18:29:29 -0400
> > Matthew Rosato <mjrosato@linux.ibm.com> wrote:
> >   
> >> When an s390 guest is using lazy unmapping, it can result in a very
> >> large number of oustanding DMA requests, far beyond the default
> >> limit configured for vfio.  Let's track DMA usage similar to vfio
> >> in the host, and trigger the guest to flush their DMA mappings
> >> before vfio runs out.
> >>
> >> Signed-off-by: Matthew Rosato <mjrosato@linux.ibm.com>
> >> ---
> >>   hw/s390x/s390-pci-bus.c  | 99 +++++++++++++++++++++++++++++++++++++++++++++---
> >>   hw/s390x/s390-pci-bus.h  |  9 +++++
> >>   hw/s390x/s390-pci-inst.c | 29 +++++++++++---
> >>   hw/s390x/s390-pci-inst.h |  3 ++
> >>   4 files changed, 129 insertions(+), 11 deletions(-)
> >>  
> > 
> > (...)
> >   
> >> @@ -737,6 +742,82 @@ static void s390_pci_iommu_free(S390pciState *s, PCIBus *bus, int32_t devfn)
> >>       object_unref(OBJECT(iommu));
> >>   }
> >>   
> >> +static bool s390_sync_dma_avail(int fd, unsigned int *avail)  
> > 
> > Not sure I like the name. It sounds like the function checks whether
> > "sync dma" is available. Maybe s390_update_dma_avail()?
> >   
> 
> Sounds fine to me.
> 
> >> +{
> >> +    struct vfio_iommu_type1_info *info;
> >> +    uint32_t argsz;
> >> +    bool rval = false;
> >> +    int ret;
> >> +
> >> +    if (avail == NULL) {
> >> +        return false;
> >> +    }
> >> +
> >> +    argsz = sizeof(struct vfio_iommu_type1_info);
> >> +    info = g_malloc0(argsz);
> >> +    info->argsz = argsz;
> >> +    /*
> >> +     * If the specified argsz is not large enough to contain all
> >> +     * capabilities it will be updated upon return.  In this case
> >> +     * use the updated value to get the entire capability chain.
> >> +     */
> >> +    ret = ioctl(fd, VFIO_IOMMU_GET_INFO, info);
> >> +    if (argsz != info->argsz) {
> >> +        argsz = info->argsz;
> >> +        info = g_realloc(info, argsz);
> >> +        info->argsz = argsz;
> >> +        ret = ioctl(fd, VFIO_IOMMU_GET_INFO, info);
> >> +    }
> >> +
> >> +    if (ret) {
> >> +        goto out;
> >> +    }
> >> +
> >> +    /* If the capability exists, update with the current value */
> >> +    rval = vfio_get_info_dma_avail(info, avail);  
> > 
> > Adding vfio specific things into the generic s390 pci emulation code
> > looks a bit ugly... I'd prefer to factor that out into a separate file,
> > especially if you plan to add more vfio-specific things in the future.
> >   
> 
> Fair.   hw/s390x/s390-pci-vfio.* ?

Sounds good to me.
diff mbox series

Patch

diff --git a/hw/s390x/s390-pci-bus.c b/hw/s390x/s390-pci-bus.c
index 92146a2..23474cd 100644
--- a/hw/s390x/s390-pci-bus.c
+++ b/hw/s390x/s390-pci-bus.c
@@ -11,6 +11,8 @@ 
  * directory.
  */
 
+#include <sys/ioctl.h>
+
 #include "qemu/osdep.h"
 #include "qapi/error.h"
 #include "qapi/visitor.h"
@@ -24,6 +26,9 @@ 
 #include "qemu/error-report.h"
 #include "qemu/module.h"
 
+#include "hw/vfio/pci.h"
+#include "hw/vfio/vfio-common.h"
+
 #ifndef DEBUG_S390PCI_BUS
 #define DEBUG_S390PCI_BUS  0
 #endif
@@ -737,6 +742,82 @@  static void s390_pci_iommu_free(S390pciState *s, PCIBus *bus, int32_t devfn)
     object_unref(OBJECT(iommu));
 }
 
+static bool s390_sync_dma_avail(int fd, unsigned int *avail)
+{
+    struct vfio_iommu_type1_info *info;
+    uint32_t argsz;
+    bool rval = false;
+    int ret;
+
+    if (avail == NULL) {
+        return false;
+    }
+
+    argsz = sizeof(struct vfio_iommu_type1_info);
+    info = g_malloc0(argsz);
+    info->argsz = argsz;
+    /*
+     * If the specified argsz is not large enough to contain all
+     * capabilities it will be updated upon return.  In this case
+     * use the updated value to get the entire capability chain.
+     */
+    ret = ioctl(fd, VFIO_IOMMU_GET_INFO, info);
+    if (argsz != info->argsz) {
+        argsz = info->argsz;
+        info = g_realloc(info, argsz);
+        info->argsz = argsz;
+        ret = ioctl(fd, VFIO_IOMMU_GET_INFO, info);
+    }
+
+    if (ret) {
+        goto out;
+    }
+
+    /* If the capability exists, update with the current value */
+    rval = vfio_get_info_dma_avail(info, avail);
+
+out:
+    g_free(info);
+    return rval;
+}
+
+static S390PCIDMACount *s390_start_dma_count(S390pciState *s, VFIODevice *vdev)
+{
+    int id = vdev->group->container->fd;
+    S390PCIDMACount *cnt;
+    uint32_t avail;
+
+    if (!s390_sync_dma_avail(id, &avail)) {
+        return NULL;
+    }
+
+    QTAILQ_FOREACH(cnt, &s->zpci_dma_limit, link) {
+        if (cnt->id  == id) {
+            cnt->users++;
+            return cnt;
+        }
+    }
+
+    cnt = g_new0(S390PCIDMACount, 1);
+    cnt->id = id;
+    cnt->users = 1;
+    cnt->avail = avail;
+    QTAILQ_INSERT_TAIL(&s->zpci_dma_limit, cnt, link);
+    return cnt;
+}
+
+static void s390_end_dma_count(S390pciState *s, S390PCIDMACount *cnt)
+{
+    if (cnt == NULL) {
+        return;
+    }
+
+    cnt->users--;
+    if (cnt->users == 0) {
+        QTAILQ_REMOVE(&s->zpci_dma_limit, cnt, link);
+    }
+}
+
 static void s390_pcihost_realize(DeviceState *dev, Error **errp)
 {
     PCIBus *b;
@@ -764,6 +845,7 @@  static void s390_pcihost_realize(DeviceState *dev, Error **errp)
     s->bus_no = 0;
     QTAILQ_INIT(&s->pending_sei);
     QTAILQ_INIT(&s->zpci_devs);
+    QTAILQ_INIT(&s->zpci_dma_limit);
 
     css_register_io_adapters(CSS_IO_ADAPTER_PCI, true, false,
                              S390_ADAPTER_SUPPRESSIBLE, errp);
@@ -902,6 +984,7 @@  static void s390_pcihost_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
 {
     S390pciState *s = S390_PCI_HOST_BRIDGE(hotplug_dev);
     PCIDevice *pdev = NULL;
+    VFIOPCIDevice *vpdev = NULL;
     S390PCIBusDevice *pbdev = NULL;
 
     if (object_dynamic_cast(OBJECT(dev), TYPE_PCI_BRIDGE)) {
@@ -941,17 +1024,20 @@  static void s390_pcihost_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
             }
         }
 
+        pbdev->pdev = pdev;
+        pbdev->iommu = s390_pci_get_iommu(s, pci_get_bus(pdev), pdev->devfn);
+        pbdev->iommu->pbdev = pbdev;
+        pbdev->state = ZPCI_FS_DISABLED;
+
         if (object_dynamic_cast(OBJECT(dev), "vfio-pci")) {
             pbdev->fh |= FH_SHM_VFIO;
+            vpdev = container_of(pbdev->pdev, VFIOPCIDevice, pdev);
+            pbdev->iommu->dma_limit = s390_start_dma_count(s,
+                                                           &vpdev->vbasedev);
         } else {
             pbdev->fh |= FH_SHM_EMUL;
         }
 
-        pbdev->pdev = pdev;
-        pbdev->iommu = s390_pci_get_iommu(s, pci_get_bus(pdev), pdev->devfn);
-        pbdev->iommu->pbdev = pbdev;
-        pbdev->state = ZPCI_FS_DISABLED;
-
         if (s390_pci_msix_init(pbdev)) {
             error_setg(errp, "MSI-X support is mandatory "
                        "in the S390 architecture");
@@ -1004,6 +1090,9 @@  static void s390_pcihost_unplug(HotplugHandler *hotplug_dev, DeviceState *dev,
         pbdev->fid = 0;
         QTAILQ_REMOVE(&s->zpci_devs, pbdev, link);
         g_hash_table_remove(s->zpci_table, &pbdev->idx);
+        if (pbdev->iommu->dma_limit) {
+            s390_end_dma_count(s, pbdev->iommu->dma_limit);
+        }
         qdev_unrealize(dev);
     }
 }
diff --git a/hw/s390x/s390-pci-bus.h b/hw/s390x/s390-pci-bus.h
index 0458059..f166fd9 100644
--- a/hw/s390x/s390-pci-bus.h
+++ b/hw/s390x/s390-pci-bus.h
@@ -270,6 +270,13 @@  typedef struct S390IOTLBEntry {
     uint64_t perm;
 } S390IOTLBEntry;
 
+typedef struct S390PCIDMACount {
+    int id;
+    int users;
+    uint32_t avail;
+    QTAILQ_ENTRY(S390PCIDMACount) link;
+} S390PCIDMACount;
+
 struct S390PCIIOMMU {
     Object parent_obj;
     S390PCIBusDevice *pbdev;
@@ -281,6 +288,7 @@  struct S390PCIIOMMU {
     uint64_t pba;
     uint64_t pal;
     GHashTable *iotlb;
+    S390PCIDMACount *dma_limit;
 };
 
 typedef struct S390PCIIOMMUTable {
@@ -356,6 +364,7 @@  struct S390pciState {
     GHashTable *zpci_table;
     QTAILQ_HEAD(, SeiContainer) pending_sei;
     QTAILQ_HEAD(, S390PCIBusDevice) zpci_devs;
+    QTAILQ_HEAD(, S390PCIDMACount) zpci_dma_limit;
 };
 
 S390pciState *s390_get_phb(void);
diff --git a/hw/s390x/s390-pci-inst.c b/hw/s390x/s390-pci-inst.c
index 2f7a7d7..6af9af4 100644
--- a/hw/s390x/s390-pci-inst.c
+++ b/hw/s390x/s390-pci-inst.c
@@ -32,6 +32,9 @@ 
         }                                                          \
     } while (0)
 
+#define INC_DMA_AVAIL(iommu) if (iommu->dma_limit) iommu->dma_limit->avail++;
+#define DEC_DMA_AVAIL(iommu) if (iommu->dma_limit) iommu->dma_limit->avail--;
+
 static void s390_set_status_code(CPUS390XState *env,
                                  uint8_t r, uint64_t status_code)
 {
@@ -572,7 +575,8 @@  int pcistg_service_call(S390CPU *cpu, uint8_t r1, uint8_t r2, uintptr_t ra)
     return 0;
 }
 
-static void s390_pci_update_iotlb(S390PCIIOMMU *iommu, S390IOTLBEntry *entry)
+static uint32_t s390_pci_update_iotlb(S390PCIIOMMU *iommu,
+                                      S390IOTLBEntry *entry)
 {
     S390IOTLBEntry *cache = g_hash_table_lookup(iommu->iotlb, &entry->iova);
     IOMMUTLBEntry notify = {
@@ -585,14 +589,15 @@  static void s390_pci_update_iotlb(S390PCIIOMMU *iommu, S390IOTLBEntry *entry)
 
     if (entry->perm == IOMMU_NONE) {
         if (!cache) {
-            return;
+            goto out;
         }
         g_hash_table_remove(iommu->iotlb, &entry->iova);
+        INC_DMA_AVAIL(iommu);
     } else {
         if (cache) {
             if (cache->perm == entry->perm &&
                 cache->translated_addr == entry->translated_addr) {
-                return;
+                goto out;
             }
 
             notify.perm = IOMMU_NONE;
@@ -606,9 +611,13 @@  static void s390_pci_update_iotlb(S390PCIIOMMU *iommu, S390IOTLBEntry *entry)
         cache->len = PAGE_SIZE;
         cache->perm = entry->perm;
         g_hash_table_replace(iommu->iotlb, &cache->iova, cache);
+        DEC_DMA_AVAIL(iommu);
     }
 
     memory_region_notify_iommu(&iommu->iommu_mr, 0, notify);
+
+out:
+    return iommu->dma_limit ? iommu->dma_limit->avail : 1;
 }
 
 int rpcit_service_call(S390CPU *cpu, uint8_t r1, uint8_t r2, uintptr_t ra)
@@ -620,6 +629,7 @@  int rpcit_service_call(S390CPU *cpu, uint8_t r1, uint8_t r2, uintptr_t ra)
     S390PCIIOMMU *iommu;
     S390IOTLBEntry entry;
     hwaddr start, end;
+    uint32_t dma_avail;
 
     if (env->psw.mask & PSW_MASK_PSTATE) {
         s390_program_interrupt(env, PGM_PRIVILEGED, ra);
@@ -675,8 +685,9 @@  int rpcit_service_call(S390CPU *cpu, uint8_t r1, uint8_t r2, uintptr_t ra)
         }
 
         start += entry.len;
-        while (entry.iova < start && entry.iova < end) {
-            s390_pci_update_iotlb(iommu, &entry);
+        dma_avail = 1; /* Assume non-zero dma_avail to start */
+        while (entry.iova < start && entry.iova < end && dma_avail > 0) {
+            dma_avail = s390_pci_update_iotlb(iommu, &entry);
             entry.iova += PAGE_SIZE;
             entry.translated_addr += PAGE_SIZE;
         }
@@ -689,7 +700,13 @@  err:
         s390_pci_generate_error_event(error, pbdev->fh, pbdev->fid, start, 0);
     } else {
         pbdev->fmb.counter[ZPCI_FMB_CNT_RPCIT]++;
-        setcc(cpu, ZPCI_PCI_LS_OK);
+        if (dma_avail > 0) {
+            setcc(cpu, ZPCI_PCI_LS_OK);
+        } else {
+            /* vfio DMA mappings are exhausted, trigger a RPCIT */
+            setcc(cpu, ZPCI_PCI_LS_ERR);
+            s390_set_status_code(env, r1, ZPCI_RPCIT_ST_INSUFF_RES);
+        }
     }
     return 0;
 }
diff --git a/hw/s390x/s390-pci-inst.h b/hw/s390x/s390-pci-inst.h
index fa3bf8b..8ee3a3c 100644
--- a/hw/s390x/s390-pci-inst.h
+++ b/hw/s390x/s390-pci-inst.h
@@ -254,6 +254,9 @@  typedef struct ClpReqRspQueryPciGrp {
 #define ZPCI_STPCIFC_ST_INVAL_DMAAS   28
 #define ZPCI_STPCIFC_ST_ERROR_RECOVER 40
 
+/* Refresh PCI Translations status codes */
+#define ZPCI_RPCIT_ST_INSUFF_RES      16
+
 /* FIB function controls */
 #define ZPCI_FIB_FC_ENABLED     0x80
 #define ZPCI_FIB_FC_ERROR       0x40