diff mbox series

[qemu,v3,4/6] spapr_iommu: Do not replay mappings from just created DMA window

Message ID 20190227085149.38596-5-aik@ozlabs.ru
State New
Headers show
Series spapr_pci, vfio: NVIDIA V100 + POWER9 passthrough | expand

Commit Message

Alexey Kardashevskiy Feb. 27, 2019, 8:51 a.m. UTC
On sPAPR vfio_listener_region_add() is called in 2 situations:
1. a new listener is registered from vfio_connect_container();
2. a new IOMMU Memory Region is added from rtas_ibm_create_pe_dma_window().

In both cases vfio_listener_region_add() calls
memory_region_iommu_replay() to notify newly registered IOMMU notifiers
about existing mappings which is totally desirable for case 1.

However for case 2 it is nothing but noop as the window has just been
created and has no valid mappings so replaying those does not do anything.
It is barely noticeable with usual guests but if the window happens to be
really big, such no-op replay might take minutes and trigger RCU stall
warnings in the guest.

For example, a upcoming GPU RAM memory region mapped at 64TiB (right
after SPAPR_PCI_LIMIT) causes a 64bit DMA window to be at least 128TiB
which is (128<<40)/0x10000=2.147.483.648 TCEs to replay.

This mitigates the problem by adding an "skipping_replay" flag to
sPAPRTCETable and defining sPAPR own IOMMU MR replay() hook which does
exactly the same thing as the generic one except it returns early if
@skipping_replay==true.

When "ibm,create-pe-dma-window" is complete, the guest will map only
required regions of the huge DMA window.

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---
 include/hw/ppc/spapr.h  |  1 +
 hw/ppc/spapr_iommu.c    | 31 +++++++++++++++++++++++++++++++
 hw/ppc/spapr_rtas_ddw.c |  7 +++++++
 3 files changed, 39 insertions(+)

Comments

Greg Kurz Feb. 27, 2019, 2:33 p.m. UTC | #1
On Wed, 27 Feb 2019 19:51:47 +1100
Alexey Kardashevskiy <aik@ozlabs.ru> wrote:

> On sPAPR vfio_listener_region_add() is called in 2 situations:
> 1. a new listener is registered from vfio_connect_container();
> 2. a new IOMMU Memory Region is added from rtas_ibm_create_pe_dma_window().
> 
> In both cases vfio_listener_region_add() calls
> memory_region_iommu_replay() to notify newly registered IOMMU notifiers
> about existing mappings which is totally desirable for case 1.
> 
> However for case 2 it is nothing but noop as the window has just been
> created and has no valid mappings so replaying those does not do anything.
> It is barely noticeable with usual guests but if the window happens to be
> really big, such no-op replay might take minutes and trigger RCU stall
> warnings in the guest.
> 
> For example, a upcoming GPU RAM memory region mapped at 64TiB (right
> after SPAPR_PCI_LIMIT) causes a 64bit DMA window to be at least 128TiB
> which is (128<<40)/0x10000=2.147.483.648 TCEs to replay.
> 
> This mitigates the problem by adding an "skipping_replay" flag to
> sPAPRTCETable and defining sPAPR own IOMMU MR replay() hook which does
> exactly the same thing as the generic one except it returns early if
> @skipping_replay==true.
> 
> When "ibm,create-pe-dma-window" is complete, the guest will map only
> required regions of the huge DMA window.
> 
> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> ---
>  include/hw/ppc/spapr.h  |  1 +
>  hw/ppc/spapr_iommu.c    | 31 +++++++++++++++++++++++++++++++
>  hw/ppc/spapr_rtas_ddw.c |  7 +++++++
>  3 files changed, 39 insertions(+)
> 
> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
> index 86b0488..358bb38 100644
> --- a/include/hw/ppc/spapr.h
> +++ b/include/hw/ppc/spapr.h
> @@ -727,6 +727,7 @@ struct sPAPRTCETable {
>      uint64_t *mig_table;
>      bool bypass;
>      bool need_vfio;
> +    bool skipping_replay;
>      int fd;
>      MemoryRegion root;
>      IOMMUMemoryRegion iommu;
> diff --git a/hw/ppc/spapr_iommu.c b/hw/ppc/spapr_iommu.c
> index 37e98f9..8f23179 100644
> --- a/hw/ppc/spapr_iommu.c
> +++ b/hw/ppc/spapr_iommu.c
> @@ -141,6 +141,36 @@ static IOMMUTLBEntry spapr_tce_translate_iommu(IOMMUMemoryRegion *iommu,
>      return ret;
>  }
>  
> +static void spapr_tce_replay(IOMMUMemoryRegion *iommu_mr, IOMMUNotifier *n)
> +{
> +    MemoryRegion *mr = MEMORY_REGION(iommu_mr);
> +    IOMMUMemoryRegionClass *imrc = IOMMU_MEMORY_REGION_GET_CLASS(iommu_mr);
> +    hwaddr addr, granularity;
> +    IOMMUTLBEntry iotlb;
> +    sPAPRTCETable *tcet = container_of(iommu_mr, sPAPRTCETable, iommu);
> +
> +    if (tcet->skipping_replay) {
> +        return;
> +    }
> +
> +    granularity = memory_region_iommu_get_min_page_size(iommu_mr);
> +
> +    for (addr = 0; addr < memory_region_size(mr); addr += granularity) {
> +        iotlb = imrc->translate(iommu_mr, addr, IOMMU_NONE, n->iommu_idx);
> +        if (iotlb.perm != IOMMU_NONE) {
> +            n->notify(n, &iotlb);
> +        }
> +
> +        /*
> +         * if (2^64 - MR size) < granularity, it's possible to get an
> +         * infinite loop here.  This should catch such a wraparound.
> +         */
> +        if ((addr + granularity) < addr) {
> +            break;
> +        }
> +    }
> +}

It is a bit unfortunate to duplicate all that code. What about making
a memory_region_iommu_replay_generic() helper out of it and call it
from spapr_tce_replay() and memory_region_iommu_replay() ?

Apart from that, LGTM.

> +
>  static int spapr_tce_table_pre_save(void *opaque)
>  {
>      sPAPRTCETable *tcet = SPAPR_TCE_TABLE(opaque);
> @@ -659,6 +689,7 @@ static void spapr_iommu_memory_region_class_init(ObjectClass *klass, void *data)
>      IOMMUMemoryRegionClass *imrc = IOMMU_MEMORY_REGION_CLASS(klass);
>  
>      imrc->translate = spapr_tce_translate_iommu;
> +    imrc->replay = spapr_tce_replay;
>      imrc->get_min_page_size = spapr_tce_get_min_page_size;
>      imrc->notify_flag_changed = spapr_tce_notify_flag_changed;
>      imrc->get_attr = spapr_tce_get_attr;
> diff --git a/hw/ppc/spapr_rtas_ddw.c b/hw/ppc/spapr_rtas_ddw.c
> index cb8a410..9cc020d 100644
> --- a/hw/ppc/spapr_rtas_ddw.c
> +++ b/hw/ppc/spapr_rtas_ddw.c
> @@ -171,8 +171,15 @@ static void rtas_ibm_create_pe_dma_window(PowerPCCPU *cpu,
>      }
>  
>      win_addr = (windows == 0) ? sphb->dma_win_addr : sphb->dma64_win_addr;
> +    /*
> +     * We have just created a window, we know for the fact that it is empty,
> +     * use a hack to avoid iterating over the table as it is quite possible
> +     * to have billions of TCEs, all empty.
> +     */
> +    tcet->skipping_replay = true;
>      spapr_tce_table_enable(tcet, page_shift, win_addr,
>                             1ULL << (window_shift - page_shift));
> +    tcet->skipping_replay = false;
>      if (!tcet->nb_table) {
>          goto hw_error_exit;
>      }
Alexey Kardashevskiy Feb. 27, 2019, 11:59 p.m. UTC | #2
On 28/02/2019 01:33, Greg Kurz wrote:
> On Wed, 27 Feb 2019 19:51:47 +1100
> Alexey Kardashevskiy <aik@ozlabs.ru> wrote:
> 
>> On sPAPR vfio_listener_region_add() is called in 2 situations:
>> 1. a new listener is registered from vfio_connect_container();
>> 2. a new IOMMU Memory Region is added from rtas_ibm_create_pe_dma_window().
>>
>> In both cases vfio_listener_region_add() calls
>> memory_region_iommu_replay() to notify newly registered IOMMU notifiers
>> about existing mappings which is totally desirable for case 1.
>>
>> However for case 2 it is nothing but noop as the window has just been
>> created and has no valid mappings so replaying those does not do anything.
>> It is barely noticeable with usual guests but if the window happens to be
>> really big, such no-op replay might take minutes and trigger RCU stall
>> warnings in the guest.
>>
>> For example, a upcoming GPU RAM memory region mapped at 64TiB (right
>> after SPAPR_PCI_LIMIT) causes a 64bit DMA window to be at least 128TiB
>> which is (128<<40)/0x10000=2.147.483.648 TCEs to replay.
>>
>> This mitigates the problem by adding an "skipping_replay" flag to
>> sPAPRTCETable and defining sPAPR own IOMMU MR replay() hook which does
>> exactly the same thing as the generic one except it returns early if
>> @skipping_replay==true.
>>
>> When "ibm,create-pe-dma-window" is complete, the guest will map only
>> required regions of the huge DMA window.
>>
>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>> ---
>>  include/hw/ppc/spapr.h  |  1 +
>>  hw/ppc/spapr_iommu.c    | 31 +++++++++++++++++++++++++++++++
>>  hw/ppc/spapr_rtas_ddw.c |  7 +++++++
>>  3 files changed, 39 insertions(+)
>>
>> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
>> index 86b0488..358bb38 100644
>> --- a/include/hw/ppc/spapr.h
>> +++ b/include/hw/ppc/spapr.h
>> @@ -727,6 +727,7 @@ struct sPAPRTCETable {
>>      uint64_t *mig_table;
>>      bool bypass;
>>      bool need_vfio;
>> +    bool skipping_replay;
>>      int fd;
>>      MemoryRegion root;
>>      IOMMUMemoryRegion iommu;
>> diff --git a/hw/ppc/spapr_iommu.c b/hw/ppc/spapr_iommu.c
>> index 37e98f9..8f23179 100644
>> --- a/hw/ppc/spapr_iommu.c
>> +++ b/hw/ppc/spapr_iommu.c
>> @@ -141,6 +141,36 @@ static IOMMUTLBEntry spapr_tce_translate_iommu(IOMMUMemoryRegion *iommu,
>>      return ret;
>>  }
>>  
>> +static void spapr_tce_replay(IOMMUMemoryRegion *iommu_mr, IOMMUNotifier *n)
>> +{
>> +    MemoryRegion *mr = MEMORY_REGION(iommu_mr);
>> +    IOMMUMemoryRegionClass *imrc = IOMMU_MEMORY_REGION_GET_CLASS(iommu_mr);
>> +    hwaddr addr, granularity;
>> +    IOMMUTLBEntry iotlb;
>> +    sPAPRTCETable *tcet = container_of(iommu_mr, sPAPRTCETable, iommu);
>> +
>> +    if (tcet->skipping_replay) {
>> +        return;
>> +    }
>> +
>> +    granularity = memory_region_iommu_get_min_page_size(iommu_mr);
>> +
>> +    for (addr = 0; addr < memory_region_size(mr); addr += granularity) {
>> +        iotlb = imrc->translate(iommu_mr, addr, IOMMU_NONE, n->iommu_idx);
>> +        if (iotlb.perm != IOMMU_NONE) {
>> +            n->notify(n, &iotlb);
>> +        }
>> +
>> +        /*
>> +         * if (2^64 - MR size) < granularity, it's possible to get an
>> +         * infinite loop here.  This should catch such a wraparound.
>> +         */
>> +        if ((addr + granularity) < addr) {
>> +            break;
>> +        }
>> +    }
>> +}
> 
> It is a bit unfortunate to duplicate all that code. What about making
> a memory_region_iommu_replay_generic() helper out of it and call it
> from spapr_tce_replay() and memory_region_iommu_replay() ?


I really do not want to mess with generic code to solve our local sPAPR
problem, especially when there is a way not to do so.

And as a next step, I was thinking of removing (i.e. making this replay
a no-op) from QEMU later and do replay in KVM instead when an IOMMU
group is attaching to KVM as this is the only case when we need replay
and KVM has a lot better idea what TCEs are actually valid and can skip
most of them. This is a bit bigger thing as it requires a KVM capability
"KVM replays mappings" but when we get it, spapr_tce_replay() will
become no-op.


> Apart from that, LGTM.

Well. It is a hack, I just do not have taste to tell how nasty it is :)


> 
>> +
>>  static int spapr_tce_table_pre_save(void *opaque)
>>  {
>>      sPAPRTCETable *tcet = SPAPR_TCE_TABLE(opaque);
>> @@ -659,6 +689,7 @@ static void spapr_iommu_memory_region_class_init(ObjectClass *klass, void *data)
>>      IOMMUMemoryRegionClass *imrc = IOMMU_MEMORY_REGION_CLASS(klass);
>>  
>>      imrc->translate = spapr_tce_translate_iommu;
>> +    imrc->replay = spapr_tce_replay;
>>      imrc->get_min_page_size = spapr_tce_get_min_page_size;
>>      imrc->notify_flag_changed = spapr_tce_notify_flag_changed;
>>      imrc->get_attr = spapr_tce_get_attr;
>> diff --git a/hw/ppc/spapr_rtas_ddw.c b/hw/ppc/spapr_rtas_ddw.c
>> index cb8a410..9cc020d 100644
>> --- a/hw/ppc/spapr_rtas_ddw.c
>> +++ b/hw/ppc/spapr_rtas_ddw.c
>> @@ -171,8 +171,15 @@ static void rtas_ibm_create_pe_dma_window(PowerPCCPU *cpu,
>>      }
>>  
>>      win_addr = (windows == 0) ? sphb->dma_win_addr : sphb->dma64_win_addr;
>> +    /*
>> +     * We have just created a window, we know for the fact that it is empty,
>> +     * use a hack to avoid iterating over the table as it is quite possible
>> +     * to have billions of TCEs, all empty.
>> +     */
>> +    tcet->skipping_replay = true;
>>      spapr_tce_table_enable(tcet, page_shift, win_addr,
>>                             1ULL << (window_shift - page_shift));
>> +    tcet->skipping_replay = false;
>>      if (!tcet->nb_table) {
>>          goto hw_error_exit;
>>      }
>
David Gibson Feb. 28, 2019, 3:49 a.m. UTC | #3
On Thu, Feb 28, 2019 at 10:59:56AM +1100, Alexey Kardashevskiy wrote:
> 
> 
> On 28/02/2019 01:33, Greg Kurz wrote:
> > On Wed, 27 Feb 2019 19:51:47 +1100
> > Alexey Kardashevskiy <aik@ozlabs.ru> wrote:
> > 
> >> On sPAPR vfio_listener_region_add() is called in 2 situations:
> >> 1. a new listener is registered from vfio_connect_container();
> >> 2. a new IOMMU Memory Region is added from rtas_ibm_create_pe_dma_window().
> >>
> >> In both cases vfio_listener_region_add() calls
> >> memory_region_iommu_replay() to notify newly registered IOMMU notifiers
> >> about existing mappings which is totally desirable for case 1.
> >>
> >> However for case 2 it is nothing but noop as the window has just been
> >> created and has no valid mappings so replaying those does not do anything.
> >> It is barely noticeable with usual guests but if the window happens to be
> >> really big, such no-op replay might take minutes and trigger RCU stall
> >> warnings in the guest.
> >>
> >> For example, a upcoming GPU RAM memory region mapped at 64TiB (right
> >> after SPAPR_PCI_LIMIT) causes a 64bit DMA window to be at least 128TiB
> >> which is (128<<40)/0x10000=2.147.483.648 TCEs to replay.
> >>
> >> This mitigates the problem by adding an "skipping_replay" flag to
> >> sPAPRTCETable and defining sPAPR own IOMMU MR replay() hook which does
> >> exactly the same thing as the generic one except it returns early if
> >> @skipping_replay==true.
> >>
> >> When "ibm,create-pe-dma-window" is complete, the guest will map only
> >> required regions of the huge DMA window.
> >>
> >> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> >> ---
> >>  include/hw/ppc/spapr.h  |  1 +
> >>  hw/ppc/spapr_iommu.c    | 31 +++++++++++++++++++++++++++++++
> >>  hw/ppc/spapr_rtas_ddw.c |  7 +++++++
> >>  3 files changed, 39 insertions(+)
> >>
> >> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
> >> index 86b0488..358bb38 100644
> >> --- a/include/hw/ppc/spapr.h
> >> +++ b/include/hw/ppc/spapr.h
> >> @@ -727,6 +727,7 @@ struct sPAPRTCETable {
> >>      uint64_t *mig_table;
> >>      bool bypass;
> >>      bool need_vfio;
> >> +    bool skipping_replay;
> >>      int fd;
> >>      MemoryRegion root;
> >>      IOMMUMemoryRegion iommu;
> >> diff --git a/hw/ppc/spapr_iommu.c b/hw/ppc/spapr_iommu.c
> >> index 37e98f9..8f23179 100644
> >> --- a/hw/ppc/spapr_iommu.c
> >> +++ b/hw/ppc/spapr_iommu.c
> >> @@ -141,6 +141,36 @@ static IOMMUTLBEntry spapr_tce_translate_iommu(IOMMUMemoryRegion *iommu,
> >>      return ret;
> >>  }
> >>  
> >> +static void spapr_tce_replay(IOMMUMemoryRegion *iommu_mr, IOMMUNotifier *n)
> >> +{
> >> +    MemoryRegion *mr = MEMORY_REGION(iommu_mr);
> >> +    IOMMUMemoryRegionClass *imrc = IOMMU_MEMORY_REGION_GET_CLASS(iommu_mr);
> >> +    hwaddr addr, granularity;
> >> +    IOMMUTLBEntry iotlb;
> >> +    sPAPRTCETable *tcet = container_of(iommu_mr, sPAPRTCETable, iommu);
> >> +
> >> +    if (tcet->skipping_replay) {
> >> +        return;
> >> +    }
> >> +
> >> +    granularity = memory_region_iommu_get_min_page_size(iommu_mr);
> >> +
> >> +    for (addr = 0; addr < memory_region_size(mr); addr += granularity) {
> >> +        iotlb = imrc->translate(iommu_mr, addr, IOMMU_NONE, n->iommu_idx);
> >> +        if (iotlb.perm != IOMMU_NONE) {
> >> +            n->notify(n, &iotlb);
> >> +        }
> >> +
> >> +        /*
> >> +         * if (2^64 - MR size) < granularity, it's possible to get an
> >> +         * infinite loop here.  This should catch such a wraparound.
> >> +         */
> >> +        if ((addr + granularity) < addr) {
> >> +            break;
> >> +        }
> >> +    }
> >> +}
> > 
> > It is a bit unfortunate to duplicate all that code. What about making
> > a memory_region_iommu_replay_generic() helper out of it and call it
> > from spapr_tce_replay() and memory_region_iommu_replay() ?
> 
> 
> I really do not want to mess with generic code to solve our local sPAPR
> problem, especially when there is a way not to do so.

Well, the thing is, I think we're actually the only user of the
current generic replay - everything else has more efficient structure
aware replay hooks AFAIK.  Which makes this hack even hackier.

> And as a next step, I was thinking of removing (i.e. making this replay
> a no-op) from QEMU later and do replay in KVM instead when an IOMMU
> group is attaching to KVM as this is the only case when we need replay
> and KVM has a lot better idea what TCEs are actually valid and can skip
> most of them. This is a bit bigger thing as it requires a KVM capability
> "KVM replays mappings" but when we get it, spapr_tce_replay() will
> become no-op.

That's a good idea longer term.

> > Apart from that, LGTM.
> 
> Well. It is a hack, I just do not have taste to tell how nasty it is
> :)

As an interim step until the kernel change, I think we can do a bit
better than this.  First, as Greg suggests we should have the
"generic" replay be a helper and have the spapr one call that with a
little in the way of extra checking.

Second, rather than having an explicit "skip_replay" flag, what we
really want here is to have the replay be a fast no-op if there are no
existing mappings rather than a slow no-op.  So instead I think we
should have a flag which records if any mappings have been made in the
region yet, initialized to false.  The new replay would do nothing if
it's still false.
Alexey Kardashevskiy Feb. 28, 2019, 5:37 a.m. UTC | #4
On 28/02/2019 14:49, David Gibson wrote:
> On Thu, Feb 28, 2019 at 10:59:56AM +1100, Alexey Kardashevskiy wrote:
>>
>>
>> On 28/02/2019 01:33, Greg Kurz wrote:
>>> On Wed, 27 Feb 2019 19:51:47 +1100
>>> Alexey Kardashevskiy <aik@ozlabs.ru> wrote:
>>>
>>>> On sPAPR vfio_listener_region_add() is called in 2 situations:
>>>> 1. a new listener is registered from vfio_connect_container();
>>>> 2. a new IOMMU Memory Region is added from rtas_ibm_create_pe_dma_window().
>>>>
>>>> In both cases vfio_listener_region_add() calls
>>>> memory_region_iommu_replay() to notify newly registered IOMMU notifiers
>>>> about existing mappings which is totally desirable for case 1.
>>>>
>>>> However for case 2 it is nothing but noop as the window has just been
>>>> created and has no valid mappings so replaying those does not do anything.
>>>> It is barely noticeable with usual guests but if the window happens to be
>>>> really big, such no-op replay might take minutes and trigger RCU stall
>>>> warnings in the guest.
>>>>
>>>> For example, a upcoming GPU RAM memory region mapped at 64TiB (right
>>>> after SPAPR_PCI_LIMIT) causes a 64bit DMA window to be at least 128TiB
>>>> which is (128<<40)/0x10000=2.147.483.648 TCEs to replay.
>>>>
>>>> This mitigates the problem by adding an "skipping_replay" flag to
>>>> sPAPRTCETable and defining sPAPR own IOMMU MR replay() hook which does
>>>> exactly the same thing as the generic one except it returns early if
>>>> @skipping_replay==true.
>>>>
>>>> When "ibm,create-pe-dma-window" is complete, the guest will map only
>>>> required regions of the huge DMA window.
>>>>
>>>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>>>> ---
>>>>  include/hw/ppc/spapr.h  |  1 +
>>>>  hw/ppc/spapr_iommu.c    | 31 +++++++++++++++++++++++++++++++
>>>>  hw/ppc/spapr_rtas_ddw.c |  7 +++++++
>>>>  3 files changed, 39 insertions(+)
>>>>
>>>> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
>>>> index 86b0488..358bb38 100644
>>>> --- a/include/hw/ppc/spapr.h
>>>> +++ b/include/hw/ppc/spapr.h
>>>> @@ -727,6 +727,7 @@ struct sPAPRTCETable {
>>>>      uint64_t *mig_table;
>>>>      bool bypass;
>>>>      bool need_vfio;
>>>> +    bool skipping_replay;
>>>>      int fd;
>>>>      MemoryRegion root;
>>>>      IOMMUMemoryRegion iommu;
>>>> diff --git a/hw/ppc/spapr_iommu.c b/hw/ppc/spapr_iommu.c
>>>> index 37e98f9..8f23179 100644
>>>> --- a/hw/ppc/spapr_iommu.c
>>>> +++ b/hw/ppc/spapr_iommu.c
>>>> @@ -141,6 +141,36 @@ static IOMMUTLBEntry spapr_tce_translate_iommu(IOMMUMemoryRegion *iommu,
>>>>      return ret;
>>>>  }
>>>>  
>>>> +static void spapr_tce_replay(IOMMUMemoryRegion *iommu_mr, IOMMUNotifier *n)
>>>> +{
>>>> +    MemoryRegion *mr = MEMORY_REGION(iommu_mr);
>>>> +    IOMMUMemoryRegionClass *imrc = IOMMU_MEMORY_REGION_GET_CLASS(iommu_mr);
>>>> +    hwaddr addr, granularity;
>>>> +    IOMMUTLBEntry iotlb;
>>>> +    sPAPRTCETable *tcet = container_of(iommu_mr, sPAPRTCETable, iommu);
>>>> +
>>>> +    if (tcet->skipping_replay) {
>>>> +        return;
>>>> +    }
>>>> +
>>>> +    granularity = memory_region_iommu_get_min_page_size(iommu_mr);
>>>> +
>>>> +    for (addr = 0; addr < memory_region_size(mr); addr += granularity) {
>>>> +        iotlb = imrc->translate(iommu_mr, addr, IOMMU_NONE, n->iommu_idx);
>>>> +        if (iotlb.perm != IOMMU_NONE) {
>>>> +            n->notify(n, &iotlb);
>>>> +        }
>>>> +
>>>> +        /*
>>>> +         * if (2^64 - MR size) < granularity, it's possible to get an
>>>> +         * infinite loop here.  This should catch such a wraparound.
>>>> +         */
>>>> +        if ((addr + granularity) < addr) {
>>>> +            break;
>>>> +        }
>>>> +    }
>>>> +}
>>>
>>> It is a bit unfortunate to duplicate all that code. What about making
>>> a memory_region_iommu_replay_generic() helper out of it and call it
>>> from spapr_tce_replay() and memory_region_iommu_replay() ?
>>
>>
>> I really do not want to mess with generic code to solve our local sPAPR
>> problem, especially when there is a way not to do so.
> 
> Well, the thing is, I think we're actually the only user of the
> current generic replay - everything else has more efficient structure
> aware replay hooks AFAIK.  Which makes this hack even hackier.


If that so, then we are better off removing that loop from
memory_region_iommu_replay() at all rather than keeping it generic.


>> And as a next step, I was thinking of removing (i.e. making this replay
>> a no-op) from QEMU later and do replay in KVM instead when an IOMMU
>> group is attaching to KVM as this is the only case when we need replay
>> and KVM has a lot better idea what TCEs are actually valid and can skip
>> most of them. This is a bit bigger thing as it requires a KVM capability
>> "KVM replays mappings" but when we get it, spapr_tce_replay() will
>> become no-op.
> 
> That's a good idea longer term.
> 
>>> Apart from that, LGTM.
>>
>> Well. It is a hack, I just do not have taste to tell how nasty it is
>> :)
> 
> As an interim step until the kernel change, I think we can do a bit
> better than this.  First, as Greg suggests we should have the
> "generic" replay be a helper and have the spapr one call that with a
> little in the way of extra checking.
> 
> Second, rather than having an explicit "skip_replay" flag, what we
> really want here is to have the replay be a fast no-op if there are no
> existing mappings rather than a slow no-op.  So instead I think we
> should have a flag which records if any mappings have been made in the
> region yet, initialized to false.
> The new replay would do nothing if
> it's still false.

If QEMU controlled the mappings - sure. But it does not - KVM does it
instead via that fast path. So QEMU does not know if there are mappings
until it reads all TCEs from mmap'ed KVM TCE table which will fault in
all these pages.

We could implement some tricks such are allow reading (or ioctl) from
that KVM TCE fd and it could tell what is mapped and what is not in a
very condensed format (for example a bit per every 256MB of the guest
address space)  ooooor  implement different behavior for mapping with RW
or readonly - the latter would fail if there is no backing page
allocated yet - and then QEMU could skip these regions when replaying.
David Gibson March 5, 2019, 3:28 a.m. UTC | #5
On Thu, Feb 28, 2019 at 04:37:25PM +1100, Alexey Kardashevskiy wrote:
> On 28/02/2019 14:49, David Gibson wrote:
> > On Thu, Feb 28, 2019 at 10:59:56AM +1100, Alexey Kardashevskiy wrote:
> >>
> >>
> >> On 28/02/2019 01:33, Greg Kurz wrote:
> >>> On Wed, 27 Feb 2019 19:51:47 +1100
> >>> Alexey Kardashevskiy <aik@ozlabs.ru> wrote:
> >>>
> >>>> On sPAPR vfio_listener_region_add() is called in 2 situations:
> >>>> 1. a new listener is registered from vfio_connect_container();
> >>>> 2. a new IOMMU Memory Region is added from rtas_ibm_create_pe_dma_window().
> >>>>
> >>>> In both cases vfio_listener_region_add() calls
> >>>> memory_region_iommu_replay() to notify newly registered IOMMU notifiers
> >>>> about existing mappings which is totally desirable for case 1.
> >>>>
> >>>> However for case 2 it is nothing but noop as the window has just been
> >>>> created and has no valid mappings so replaying those does not do anything.
> >>>> It is barely noticeable with usual guests but if the window happens to be
> >>>> really big, such no-op replay might take minutes and trigger RCU stall
> >>>> warnings in the guest.
> >>>>
> >>>> For example, a upcoming GPU RAM memory region mapped at 64TiB (right
> >>>> after SPAPR_PCI_LIMIT) causes a 64bit DMA window to be at least 128TiB
> >>>> which is (128<<40)/0x10000=2.147.483.648 TCEs to replay.
> >>>>
> >>>> This mitigates the problem by adding an "skipping_replay" flag to
> >>>> sPAPRTCETable and defining sPAPR own IOMMU MR replay() hook which does
> >>>> exactly the same thing as the generic one except it returns early if
> >>>> @skipping_replay==true.
> >>>>
> >>>> When "ibm,create-pe-dma-window" is complete, the guest will map only
> >>>> required regions of the huge DMA window.
> >>>>
> >>>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> >>>> ---
> >>>>  include/hw/ppc/spapr.h  |  1 +
> >>>>  hw/ppc/spapr_iommu.c    | 31 +++++++++++++++++++++++++++++++
> >>>>  hw/ppc/spapr_rtas_ddw.c |  7 +++++++
> >>>>  3 files changed, 39 insertions(+)
> >>>>
> >>>> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
> >>>> index 86b0488..358bb38 100644
> >>>> --- a/include/hw/ppc/spapr.h
> >>>> +++ b/include/hw/ppc/spapr.h
> >>>> @@ -727,6 +727,7 @@ struct sPAPRTCETable {
> >>>>      uint64_t *mig_table;
> >>>>      bool bypass;
> >>>>      bool need_vfio;
> >>>> +    bool skipping_replay;
> >>>>      int fd;
> >>>>      MemoryRegion root;
> >>>>      IOMMUMemoryRegion iommu;
> >>>> diff --git a/hw/ppc/spapr_iommu.c b/hw/ppc/spapr_iommu.c
> >>>> index 37e98f9..8f23179 100644
> >>>> --- a/hw/ppc/spapr_iommu.c
> >>>> +++ b/hw/ppc/spapr_iommu.c
> >>>> @@ -141,6 +141,36 @@ static IOMMUTLBEntry spapr_tce_translate_iommu(IOMMUMemoryRegion *iommu,
> >>>>      return ret;
> >>>>  }
> >>>>  
> >>>> +static void spapr_tce_replay(IOMMUMemoryRegion *iommu_mr, IOMMUNotifier *n)
> >>>> +{
> >>>> +    MemoryRegion *mr = MEMORY_REGION(iommu_mr);
> >>>> +    IOMMUMemoryRegionClass *imrc = IOMMU_MEMORY_REGION_GET_CLASS(iommu_mr);
> >>>> +    hwaddr addr, granularity;
> >>>> +    IOMMUTLBEntry iotlb;
> >>>> +    sPAPRTCETable *tcet = container_of(iommu_mr, sPAPRTCETable, iommu);
> >>>> +
> >>>> +    if (tcet->skipping_replay) {
> >>>> +        return;
> >>>> +    }
> >>>> +
> >>>> +    granularity = memory_region_iommu_get_min_page_size(iommu_mr);
> >>>> +
> >>>> +    for (addr = 0; addr < memory_region_size(mr); addr += granularity) {
> >>>> +        iotlb = imrc->translate(iommu_mr, addr, IOMMU_NONE, n->iommu_idx);
> >>>> +        if (iotlb.perm != IOMMU_NONE) {
> >>>> +            n->notify(n, &iotlb);
> >>>> +        }
> >>>> +
> >>>> +        /*
> >>>> +         * if (2^64 - MR size) < granularity, it's possible to get an
> >>>> +         * infinite loop here.  This should catch such a wraparound.
> >>>> +         */
> >>>> +        if ((addr + granularity) < addr) {
> >>>> +            break;
> >>>> +        }
> >>>> +    }
> >>>> +}
> >>>
> >>> It is a bit unfortunate to duplicate all that code. What about making
> >>> a memory_region_iommu_replay_generic() helper out of it and call it
> >>> from spapr_tce_replay() and memory_region_iommu_replay() ?
> >>
> >>
> >> I really do not want to mess with generic code to solve our local sPAPR
> >> problem, especially when there is a way not to do so.
> > 
> > Well, the thing is, I think we're actually the only user of the
> > current generic replay - everything else has more efficient structure
> > aware replay hooks AFAIK.  Which makes this hack even hackier.
> 
> If that so, then we are better off removing that loop from
> memory_region_iommu_replay() at all rather than keeping it generic.

Well.. maybe.  In theory this logic should work, albeit slowly, for
any IOMMU implementation, it's just that everyone else has more
efficient implementations at the moment.

> >> And as a next step, I was thinking of removing (i.e. making this replay
> >> a no-op) from QEMU later and do replay in KVM instead when an IOMMU
> >> group is attaching to KVM as this is the only case when we need replay
> >> and KVM has a lot better idea what TCEs are actually valid and can skip
> >> most of them. This is a bit bigger thing as it requires a KVM capability
> >> "KVM replays mappings" but when we get it, spapr_tce_replay() will
> >> become no-op.
> > 
> > That's a good idea longer term.
> > 
> >>> Apart from that, LGTM.
> >>
> >> Well. It is a hack, I just do not have taste to tell how nasty it is
> >> :)
> > 
> > As an interim step until the kernel change, I think we can do a bit
> > better than this.  First, as Greg suggests we should have the
> > "generic" replay be a helper and have the spapr one call that with a
> > little in the way of extra checking.
> > 
> > Second, rather than having an explicit "skip_replay" flag, what we
> > really want here is to have the replay be a fast no-op if there are no
> > existing mappings rather than a slow no-op.  So instead I think we
> > should have a flag which records if any mappings have been made in the
> > region yet, initialized to false.
> > The new replay would do nothing if
> > it's still false.
> 
> If QEMU controlled the mappings - sure. But it does not - KVM does it
> instead via that fast path. So QEMU does not know if there are mappings
> until it reads all TCEs from mmap'ed KVM TCE table which will fault in
> all these pages.

Oops, I forgot that; good point.

> We could implement some tricks such are allow reading (or ioctl) from
> that KVM TCE fd and it could tell what is mapped and what is not in a
> very condensed format (for example a bit per every 256MB of the guest
> address space)  ooooor  implement different behavior for mapping with RW
> or readonly - the latter would fail if there is no backing page
> allocated yet - and then QEMU could skip these regions when replaying.

Urgh, yeah, this is trickier than I initially thought.  About the
cleanest approach I can see is to delay allocation of the IOMMU table
until the first H_PUT_TCE, and only then actually bind the table to
KVM and allow for H_PUT_TCE acceleration.

Seems pretty awkward, though.  Ok, your hack seems the best way
forward in the short to medium term, just make sure there are some
comments there explaining why the hack is valuable.
diff mbox series

Patch

diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
index 86b0488..358bb38 100644
--- a/include/hw/ppc/spapr.h
+++ b/include/hw/ppc/spapr.h
@@ -727,6 +727,7 @@  struct sPAPRTCETable {
     uint64_t *mig_table;
     bool bypass;
     bool need_vfio;
+    bool skipping_replay;
     int fd;
     MemoryRegion root;
     IOMMUMemoryRegion iommu;
diff --git a/hw/ppc/spapr_iommu.c b/hw/ppc/spapr_iommu.c
index 37e98f9..8f23179 100644
--- a/hw/ppc/spapr_iommu.c
+++ b/hw/ppc/spapr_iommu.c
@@ -141,6 +141,36 @@  static IOMMUTLBEntry spapr_tce_translate_iommu(IOMMUMemoryRegion *iommu,
     return ret;
 }
 
+static void spapr_tce_replay(IOMMUMemoryRegion *iommu_mr, IOMMUNotifier *n)
+{
+    MemoryRegion *mr = MEMORY_REGION(iommu_mr);
+    IOMMUMemoryRegionClass *imrc = IOMMU_MEMORY_REGION_GET_CLASS(iommu_mr);
+    hwaddr addr, granularity;
+    IOMMUTLBEntry iotlb;
+    sPAPRTCETable *tcet = container_of(iommu_mr, sPAPRTCETable, iommu);
+
+    if (tcet->skipping_replay) {
+        return;
+    }
+
+    granularity = memory_region_iommu_get_min_page_size(iommu_mr);
+
+    for (addr = 0; addr < memory_region_size(mr); addr += granularity) {
+        iotlb = imrc->translate(iommu_mr, addr, IOMMU_NONE, n->iommu_idx);
+        if (iotlb.perm != IOMMU_NONE) {
+            n->notify(n, &iotlb);
+        }
+
+        /*
+         * if (2^64 - MR size) < granularity, it's possible to get an
+         * infinite loop here.  This should catch such a wraparound.
+         */
+        if ((addr + granularity) < addr) {
+            break;
+        }
+    }
+}
+
 static int spapr_tce_table_pre_save(void *opaque)
 {
     sPAPRTCETable *tcet = SPAPR_TCE_TABLE(opaque);
@@ -659,6 +689,7 @@  static void spapr_iommu_memory_region_class_init(ObjectClass *klass, void *data)
     IOMMUMemoryRegionClass *imrc = IOMMU_MEMORY_REGION_CLASS(klass);
 
     imrc->translate = spapr_tce_translate_iommu;
+    imrc->replay = spapr_tce_replay;
     imrc->get_min_page_size = spapr_tce_get_min_page_size;
     imrc->notify_flag_changed = spapr_tce_notify_flag_changed;
     imrc->get_attr = spapr_tce_get_attr;
diff --git a/hw/ppc/spapr_rtas_ddw.c b/hw/ppc/spapr_rtas_ddw.c
index cb8a410..9cc020d 100644
--- a/hw/ppc/spapr_rtas_ddw.c
+++ b/hw/ppc/spapr_rtas_ddw.c
@@ -171,8 +171,15 @@  static void rtas_ibm_create_pe_dma_window(PowerPCCPU *cpu,
     }
 
     win_addr = (windows == 0) ? sphb->dma_win_addr : sphb->dma64_win_addr;
+    /*
+     * We have just created a window, we know for the fact that it is empty,
+     * use a hack to avoid iterating over the table as it is quite possible
+     * to have billions of TCEs, all empty.
+     */
+    tcet->skipping_replay = true;
     spapr_tce_table_enable(tcet, page_shift, win_addr,
                            1ULL << (window_shift - page_shift));
+    tcet->skipping_replay = false;
     if (!tcet->nb_table) {
         goto hw_error_exit;
     }