
[2/3] exec: simplify address_space_get_iotlb_entry

Message ID 1496404254-17429-3-git-send-email-peterx@redhat.com
State New

Commit Message

Peter Xu June 2, 2017, 11:50 a.m. UTC
This patch lets address_space_get_iotlb_entry() use the newly
introduced page_mask parameter in address_space_do_translate(). With
that we can be sure the IOTLB is aligned to the page mask, and we also
nicely support huge pages again, which regressed when a764040 was
introduced.

Fixes: a764040 ("exec: abstract address_space_do_translate()")
Signed-off-by: Peter Xu <peterx@redhat.com>
---
 exec.c | 29 ++++++++++-------------------
 1 file changed, 10 insertions(+), 19 deletions(-)
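
For reference, below is a rough reconstruction of how the simplified
function reads with both hunks applied. It is assembled from the diff
quoted in the comments below; the error path after the
io_mem_unassigned check is elided, so treat this as a sketch rather
than the verbatim QEMU source.

IOMMUTLBEntry address_space_get_iotlb_entry(AddressSpace *as, hwaddr addr,
                                            bool is_write)
{
    MemoryRegionSection section;
    hwaddr xlat, page_mask;

    /*
     * This can never be MMIO, and we don't really care about plen,
     * but page mask.
     */
    section = address_space_do_translate(as, addr, &xlat, NULL,
                                         &page_mask, is_write, false);

    /* Illegal translation */
    if (section.mr == &io_mem_unassigned) {
        /* ... error path elided, not part of this patch's hunks ... */
    }

    /* Turn the in-region offset into an address-space offset */
    xlat += section.offset_within_address_space -
        section.offset_within_region;

    return (IOMMUTLBEntry) {
        .target_as = section.address_space,
        .iova = addr & ~page_mask,
        .translated_addr = xlat & ~page_mask,
        .addr_mask = page_mask,
        /* IOTLBs are for DMAs, and DMA only allows on RAMs. */
        .perm = IOMMU_RW,
    };
}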

Comments

Michael S. Tsirkin June 2, 2017, 4:49 p.m. UTC | #1
On Fri, Jun 02, 2017 at 07:50:53PM +0800, Peter Xu wrote:
> This patch let address_space_get_iotlb_entry() to use the newly
> introduced page_mask parameter in address_space_do_translate(). Then we
> will be sure the IOTLB can be aligned to page mask, also we should
> nicely support huge pages now when introducing a764040.
> 
> Fixes: a764040 ("exec: abstract address_space_do_translate()")
> Signed-off-by: Peter Xu <peterx@redhat.com>
> ---
>  exec.c | 29 ++++++++++-------------------
>  1 file changed, 10 insertions(+), 19 deletions(-)
> 
> diff --git a/exec.c b/exec.c
> index 63a3ff0..1f86253 100644
> --- a/exec.c
> +++ b/exec.c
> @@ -544,14 +544,14 @@ IOMMUTLBEntry address_space_get_iotlb_entry(AddressSpace *as, hwaddr addr,
>                                              bool is_write)
>  {
>      MemoryRegionSection section;
> -    hwaddr xlat, plen;
> +    hwaddr xlat, page_mask;
>  
> -    /* Try to get maximum page mask during translation. */
> -    plen = (hwaddr)-1;
> -
> -    /* This can never be MMIO. */
> -    section = address_space_do_translate(as, addr, &xlat, &plen,
> -                                         NULL, is_write, false);
> +    /*
> +     * This can never be MMIO, and we don't really care about plen,
> +     * but page mask.
> +     */
> +    section = address_space_do_translate(as, addr, &xlat, NULL,
> +                                         &page_mask, is_write, false);
>  
>      /* Illegal translation */
>      if (section.mr == &io_mem_unassigned) {


Can we just use section.size - xlat here?

> @@ -562,20 +562,11 @@ IOMMUTLBEntry address_space_get_iotlb_entry(AddressSpace *as, hwaddr addr,
>      xlat += section.offset_within_address_space -
>          section.offset_within_region;
>  
> -    if (plen == (hwaddr)-1) {
> -        /* If not specified during translation, use default mask */
> -        plen = TARGET_PAGE_MASK;
> -    } else {
> -        /* Make it a valid page mask */
> -        assert(plen);
> -        plen = pow2floor(plen) - 1;
> -    }
> -
>      return (IOMMUTLBEntry) {
>          .target_as = section.address_space,
> -        .iova = addr & ~plen,
> -        .translated_addr = xlat & ~plen,
> -        .addr_mask = plen,
> +        .iova = addr & ~page_mask,
> +        .translated_addr = xlat & ~page_mask,
> +        .addr_mask = page_mask,
>          /* IOTLBs are for DMAs, and DMA only allows on RAMs. */

BTW this comment is pretty confusing. What does it mean?

>          .perm = IOMMU_RW,
>      };

Looks like we should change IOMMUTLBEntry to pass size and not mask -
then we could simply pass info from section as is. iova would be
addr - xlat.
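
A minimal sketch of what such a length-based entry could look like
(hypothetical type and field names, not the actual QEMU IOMMUTLBEntry,
which carries addr_mask):

/*
 * Sketch only: an IOTLB entry that carries an explicit byte length
 * instead of a power-of-two addr_mask.  Assumes QEMU's hwaddr,
 * AddressSpace and IOMMUAccessFlags types from "exec/memory.h".
 */
typedef struct IOMMUTLBEntryLen {
    AddressSpace    *target_as;
    hwaddr           iova;            /* e.g. addr - xlat, window start */
    hwaddr           translated_addr; /* window start in target_as */
    hwaddr           len;             /* arbitrary length, replaces addr_mask */
    IOMMUAccessFlags perm;
} IOMMUTLBEntryLen;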

> -- 
> 2.7.4
Peter Xu June 5, 2017, 3:07 a.m. UTC | #2
On Fri, Jun 02, 2017 at 07:49:58PM +0300, Michael S. Tsirkin wrote:
> On Fri, Jun 02, 2017 at 07:50:53PM +0800, Peter Xu wrote:
> > This patch let address_space_get_iotlb_entry() to use the newly
> > introduced page_mask parameter in address_space_do_translate(). Then we
> > will be sure the IOTLB can be aligned to page mask, also we should
> > nicely support huge pages now when introducing a764040.
> > 
> > Fixes: a764040 ("exec: abstract address_space_do_translate()")
> > Signed-off-by: Peter Xu <peterx@redhat.com>
> > ---
> >  exec.c | 29 ++++++++++-------------------
> >  1 file changed, 10 insertions(+), 19 deletions(-)
> > 
> > diff --git a/exec.c b/exec.c
> > index 63a3ff0..1f86253 100644
> > --- a/exec.c
> > +++ b/exec.c
> > @@ -544,14 +544,14 @@ IOMMUTLBEntry address_space_get_iotlb_entry(AddressSpace *as, hwaddr addr,
> >                                              bool is_write)
> >  {
> >      MemoryRegionSection section;
> > -    hwaddr xlat, plen;
> > +    hwaddr xlat, page_mask;
> >  
> > -    /* Try to get maximum page mask during translation. */
> > -    plen = (hwaddr)-1;
> > -
> > -    /* This can never be MMIO. */
> > -    section = address_space_do_translate(as, addr, &xlat, &plen,
> > -                                         NULL, is_write, false);
> > +    /*
> > +     * This can never be MMIO, and we don't really care about plen,
> > +     * but page mask.
> > +     */
> > +    section = address_space_do_translate(as, addr, &xlat, NULL,
> > +                                         &page_mask, is_write, false);
> >  
> >      /* Illegal translation */
> >      if (section.mr == &io_mem_unassigned) {
> 
> 
> Can we just use section.size - xlat here?

I replied in the other thread with what I think, so I'll skip
repeating it here.

> 
> > @@ -562,20 +562,11 @@ IOMMUTLBEntry address_space_get_iotlb_entry(AddressSpace *as, hwaddr addr,
> >      xlat += section.offset_within_address_space -
> >          section.offset_within_region;
> >  
> > -    if (plen == (hwaddr)-1) {
> > -        /* If not specified during translation, use default mask */
> > -        plen = TARGET_PAGE_MASK;
> > -    } else {
> > -        /* Make it a valid page mask */
> > -        assert(plen);
> > -        plen = pow2floor(plen) - 1;
> > -    }
> > -
> >      return (IOMMUTLBEntry) {
> >          .target_as = section.address_space,
> > -        .iova = addr & ~plen,
> > -        .translated_addr = xlat & ~plen,
> > -        .addr_mask = plen,
> > +        .iova = addr & ~page_mask,
> > +        .translated_addr = xlat & ~page_mask,
> > +        .addr_mask = page_mask,
> >          /* IOTLBs are for DMAs, and DMA only allows on RAMs. */
> 
> BTW this comment is pretty confusing. What does it mean?

This function, address_space_get_iotlb_entry(), is for a device to get
an IOTLB entry when it requests page translations for DMA, and DMA
should only be allowed to RAM, right? So we need IOMMU_RW permission
here.

Maybe I should add one more check above on the returned MR - it should
be RAM-typed as well. But I don't really know whether that's too
strict: if the guest sets up the IOMMU page table so that an IOVA
points to a non-RAM region, I think it should still be legal from the
hypervisor's POV (I'd see it as a guest OS bug, though)?
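
Something like this, roughly (a sketch only, using the existing
memory_region_is_ram() helper; not part of this series):

    /*
     * Possible extra check before building the IOTLB entry: reject
     * translations that do not land in RAM.  May well be too strict,
     * since a (buggy) guest could map an IOVA onto a non-RAM region.
     */
    if (!memory_region_is_ram(section.mr)) {
        return (IOMMUTLBEntry) { .perm = IOMMU_NONE };
    }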

> 
> >          .perm = IOMMU_RW,
> >      };
> 
> Looks like we should change IOMMUTLBEntry to pass size and not mask -
> then we could simply pass info from section as is. iova would be
> addr - xlat.

I'm not sure whether that would be a good interface for the IOTLB.
AFAIU, at least for VT-d, the IOMMU translation is page aligned as
defined by the spec, so it makes sense that (again, at least for VT-d)
we'd better just use page_mask/addr_mask here.

That's also my understanding of IOMMUs in general - I assume they
always do translations with page masks (never arbitrary lengths),
though the page size can differ from platform to platform. That's why
the IOTLB interface here uses addr_mask, so it works for all
platforms. I don't know whether I'm 100% correct here though.
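
(For example, under that convention a 4K page is described by
addr_mask = 0xfff and a 2M huge page by addr_mask = 0x1fffff, with
iova/translated_addr holding the page-aligned base addresses.)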

Maybe David/Paolo/... would comment as well?

(CC David)

Thanks,
Paolo Bonzini June 6, 2017, 2:34 p.m. UTC | #3
On 05/06/2017 05:07, Peter Xu wrote:
> I don't sure whether it'll be a good interface for IOTLB. AFAIU at
> least for VT-d, the IOMMU translation is page aligned which is defined
> by spec, so it makes sense that (again at least for VT-d) here we'd
> better just use page_mask/addr_mask.
> 
> That's also how I know about IOMMU in general - I assume it do the
> translations always with page masks (never arbitary length), though
> page size can differ from platfrom to platform, that's why here the
> IOTLB interface used addr_mask, then it works for all platforms. I
> don't know whether I'm 100% correct here though.
> 
> Maybe David/Paolo/... would comment as well?

I would ask David.  There are PowerPC MMUs that allow fast lookup of
arbitrarily-sized windows (not necessarily power of two), so maybe the
IOMMUs can do the same.

Paolo
David Gibson June 6, 2017, 11:47 p.m. UTC | #4
On Tue, Jun 06, 2017 at 04:34:30PM +0200, Paolo Bonzini wrote:
> 
> 
> On 05/06/2017 05:07, Peter Xu wrote:
> > I don't sure whether it'll be a good interface for IOTLB. AFAIU at
> > least for VT-d, the IOMMU translation is page aligned which is defined
> > by spec, so it makes sense that (again at least for VT-d) here we'd
> > better just use page_mask/addr_mask.
> > 
> > That's also how I know about IOMMU in general - I assume it do the
> > translations always with page masks (never arbitary length), though
> > page size can differ from platfrom to platform, that's why here the
> > IOTLB interface used addr_mask, then it works for all platforms. I
> > don't know whether I'm 100% correct here though.
> > 
> > Maybe David/Paolo/... would comment as well?
> 
> I would ask David.  There are PowerPC MMUs that allow fast lookup of
> arbitrarily-sized windows (not necessarily power of two),

Uh.. I'm not sure what you mean here.  You might be thinking of the
BATs which really old (32-bit) PowerPC MMUs had - those allow
arbitrarily large block translations, but they do have to be a power of
two.

> so maybe the
> IOMMUs can do the same.

The only Power IOMMU I know about uses a fixed, power-of-two page size
per DMA window.
Peter Xu June 7, 2017, 3:44 a.m. UTC | #5
On Wed, Jun 07, 2017 at 09:47:05AM +1000, David Gibson wrote:
> On Tue, Jun 06, 2017 at 04:34:30PM +0200, Paolo Bonzini wrote:
> > 
> > 
> > On 05/06/2017 05:07, Peter Xu wrote:
> > > I don't sure whether it'll be a good interface for IOTLB. AFAIU at
> > > least for VT-d, the IOMMU translation is page aligned which is defined
> > > by spec, so it makes sense that (again at least for VT-d) here we'd
> > > better just use page_mask/addr_mask.
> > > 
> > > That's also how I know about IOMMU in general - I assume it do the
> > > translations always with page masks (never arbitary length), though
> > > page size can differ from platfrom to platform, that's why here the
> > > IOTLB interface used addr_mask, then it works for all platforms. I
> > > don't know whether I'm 100% correct here though.
> > > 
> > > Maybe David/Paolo/... would comment as well?
> > 
> > I would ask David.  There are PowerPC MMUs that allow fast lookup of
> > arbitrarily-sized windows (not necessarily power of two),
> 
> Uh.. I'm not sure what you mean here.  You might be thinking of the
> BATs which really old (32-bit) PowerPC MMUs had - those allow
> arbitrary large block translations, but they do have to be a power of
> two.
> 
> > so maybe the
> > IOMMUs can do the same.
> 
> The only Power IOMMU I know about uses a fixed, power-of-two page size
> per DMA window.

If so, I would still be inclined to keep using masks for the QEMU
IOTLB. Then my first two patches should still stand.

I am just afraid that not using masks will make the emulation diverge
from real hardware and bring trouble one day.

The vhost IOTLB interface does not need to be strictly aligned to the
QEMU IOMMU IOTLB definition, and that's how it works now (the current
vhost iotlb allows arbitrary lengths, and I think that's good). So IMHO
we don't really need to worry about performance - after all, we can
customize everything for vhost, just like patch 3 did (yeah, it can be
done better...).

Thanks,
Paolo Bonzini June 7, 2017, 1:01 p.m. UTC | #6
On 07/06/2017 01:47, David Gibson wrote:
>> I would ask David.  There are PowerPC MMUs that allow fast lookup of
>> arbitrarily-sized windows (not necessarily power of two),
>
> Uh.. I'm not sure what you mean here.  You might be thinking of the
> BATs which really old (32-bit) PowerPC MMUs had - those allow
> arbitrary large block translations, but they do have to be a power of
> two.

Oh, that is what I was thinking about.  Thanks for figuring it out! :)

Paolo
Michael S. Tsirkin June 7, 2017, 1:07 p.m. UTC | #7
On Wed, Jun 07, 2017 at 11:44:43AM +0800, Peter Xu wrote:
> On Wed, Jun 07, 2017 at 09:47:05AM +1000, David Gibson wrote:
> > On Tue, Jun 06, 2017 at 04:34:30PM +0200, Paolo Bonzini wrote:
> > > 
> > > 
> > > On 05/06/2017 05:07, Peter Xu wrote:
> > > > I don't sure whether it'll be a good interface for IOTLB. AFAIU at
> > > > least for VT-d, the IOMMU translation is page aligned which is defined
> > > > by spec, so it makes sense that (again at least for VT-d) here we'd
> > > > better just use page_mask/addr_mask.
> > > > 
> > > > That's also how I know about IOMMU in general - I assume it do the
> > > > translations always with page masks (never arbitary length), though
> > > > page size can differ from platfrom to platform, that's why here the
> > > > IOTLB interface used addr_mask, then it works for all platforms. I
> > > > don't know whether I'm 100% correct here though.
> > > > 
> > > > Maybe David/Paolo/... would comment as well?
> > > 
> > > I would ask David.  There are PowerPC MMUs that allow fast lookup of
> > > arbitrarily-sized windows (not necessarily power of two),
> > 
> > Uh.. I'm not sure what you mean here.  You might be thinking of the
> > BATs which really old (32-bit) PowerPC MMUs had - those allow
> > arbitrary large block translations, but they do have to be a power of
> > two.
> > 
> > > so maybe the
> > > IOMMUs can do the same.
> > 
> > The only Power IOMMU I know about uses a fixed, power-of-two page size
> > per DMA window.
> 
> If so, I would still be inclined to keep using masks for QEMU IOTLB.
> Then, my first two patches should still stand.
> 
> I am just afraid that not using masks will diverge the emulation from
> real hardware and brings trouble one day.
> 
> For vhost IOTLB interface, it does not need to be strictly aligned to
> QEMU IOMMU IOTLB definition, and that's how it's working now (current
> vhost iotlb allows arbitary length, and I think it's good). So imho we
> don't really need to worry about the performance - after all, we can
> do everything customized for vhost, just like what patch 3 did (yeah,
> it can be better...).
> 
> Thanks,

Pre-faulting is also something that does not happen on real hardware.
And it's about security, so it's a bigger issue.

If I had to choose between that and using non-power-of-2 in
the API, I'd go for non-power-of-2. Let backends that can only
support powers of 2 split it up into multiple transactions.



> -- 
> Peter Xu
Peter Xu June 8, 2017, 6:11 a.m. UTC | #8
On Wed, Jun 07, 2017 at 04:07:20PM +0300, Michael S. Tsirkin wrote:
> On Wed, Jun 07, 2017 at 11:44:43AM +0800, Peter Xu wrote:
> > On Wed, Jun 07, 2017 at 09:47:05AM +1000, David Gibson wrote:
> > > On Tue, Jun 06, 2017 at 04:34:30PM +0200, Paolo Bonzini wrote:
> > > > 
> > > > 
> > > > On 05/06/2017 05:07, Peter Xu wrote:
> > > > > I don't sure whether it'll be a good interface for IOTLB. AFAIU at
> > > > > least for VT-d, the IOMMU translation is page aligned which is defined
> > > > > by spec, so it makes sense that (again at least for VT-d) here we'd
> > > > > better just use page_mask/addr_mask.
> > > > > 
> > > > > That's also how I know about IOMMU in general - I assume it do the
> > > > > translations always with page masks (never arbitary length), though
> > > > > page size can differ from platfrom to platform, that's why here the
> > > > > IOTLB interface used addr_mask, then it works for all platforms. I
> > > > > don't know whether I'm 100% correct here though.
> > > > > 
> > > > > Maybe David/Paolo/... would comment as well?
> > > > 
> > > > I would ask David.  There are PowerPC MMUs that allow fast lookup of
> > > > arbitrarily-sized windows (not necessarily power of two),
> > > 
> > > Uh.. I'm not sure what you mean here.  You might be thinking of the
> > > BATs which really old (32-bit) PowerPC MMUs had - those allow
> > > arbitrary large block translations, but they do have to be a power of
> > > two.
> > > 
> > > > so maybe the
> > > > IOMMUs can do the same.
> > > 
> > > The only Power IOMMU I know about uses a fixed, power-of-two page size
> > > per DMA window.
> > 
> > If so, I would still be inclined to keep using masks for QEMU IOTLB.
> > Then, my first two patches should still stand.
> > 
> > I am just afraid that not using masks will diverge the emulation from
> > real hardware and brings trouble one day.
> > 
> > For vhost IOTLB interface, it does not need to be strictly aligned to
> > QEMU IOMMU IOTLB definition, and that's how it's working now (current
> > vhost iotlb allows arbitary length, and I think it's good). So imho we
> > don't really need to worry about the performance - after all, we can
> > do everything customized for vhost, just like what patch 3 did (yeah,
> > it can be better...).
> > 
> > Thanks,
> 
> Pre-faults is also something that does not happen on real hardware.
> And it's about security so a bigger issue.
> 
> If I had to choose between that and using non-power-of-2 in
> the API, I'd go for non-power-of-2. Let backends that can only
> support power of 2 split it up to multiple transactions.

The problem is that when I was fixing the issue that vhost had with
PT (a764040, "exec: abstract address_space_do_translate()"), I broke
the IOTLB translation a bit (it was using page masks). IMHO we need to
fix that first for correctness (patch 1/2).

For patch 3, if we can have Jason's patch to allow dynamic
iommu_platform switching, that'll be the best; then I can rewrite
patch 3 with the switching logic rather than caching anything. But
IMHO that can be separated from patch 1/2 if you like.

Or do you have a better suggestion on how we should fix it?

Thanks,
Michael S. Tsirkin June 8, 2017, 6:59 p.m. UTC | #9
On Thu, Jun 08, 2017 at 02:11:50PM +0800, Peter Xu wrote:
> On Wed, Jun 07, 2017 at 04:07:20PM +0300, Michael S. Tsirkin wrote:
> > On Wed, Jun 07, 2017 at 11:44:43AM +0800, Peter Xu wrote:
> > > On Wed, Jun 07, 2017 at 09:47:05AM +1000, David Gibson wrote:
> > > > On Tue, Jun 06, 2017 at 04:34:30PM +0200, Paolo Bonzini wrote:
> > > > > 
> > > > > 
> > > > > On 05/06/2017 05:07, Peter Xu wrote:
> > > > > > I don't sure whether it'll be a good interface for IOTLB. AFAIU at
> > > > > > least for VT-d, the IOMMU translation is page aligned which is defined
> > > > > > by spec, so it makes sense that (again at least for VT-d) here we'd
> > > > > > better just use page_mask/addr_mask.
> > > > > > 
> > > > > > That's also how I know about IOMMU in general - I assume it do the
> > > > > > translations always with page masks (never arbitary length), though
> > > > > > page size can differ from platfrom to platform, that's why here the
> > > > > > IOTLB interface used addr_mask, then it works for all platforms. I
> > > > > > don't know whether I'm 100% correct here though.
> > > > > > 
> > > > > > Maybe David/Paolo/... would comment as well?
> > > > > 
> > > > > I would ask David.  There are PowerPC MMUs that allow fast lookup of
> > > > > arbitrarily-sized windows (not necessarily power of two),
> > > > 
> > > > Uh.. I'm not sure what you mean here.  You might be thinking of the
> > > > BATs which really old (32-bit) PowerPC MMUs had - those allow
> > > > arbitrary large block translations, but they do have to be a power of
> > > > two.
> > > > 
> > > > > so maybe the
> > > > > IOMMUs can do the same.
> > > > 
> > > > The only Power IOMMU I know about uses a fixed, power-of-two page size
> > > > per DMA window.
> > > 
> > > If so, I would still be inclined to keep using masks for QEMU IOTLB.
> > > Then, my first two patches should still stand.
> > > 
> > > I am just afraid that not using masks will diverge the emulation from
> > > real hardware and brings trouble one day.
> > > 
> > > For vhost IOTLB interface, it does not need to be strictly aligned to
> > > QEMU IOMMU IOTLB definition, and that's how it's working now (current
> > > vhost iotlb allows arbitary length, and I think it's good). So imho we
> > > don't really need to worry about the performance - after all, we can
> > > do everything customized for vhost, just like what patch 3 did (yeah,
> > > it can be better...).
> > > 
> > > Thanks,
> > 
> > Pre-faults is also something that does not happen on real hardware.
> > And it's about security so a bigger issue.
> > 
> > If I had to choose between that and using non-power-of-2 in
> > the API, I'd go for non-power-of-2. Let backends that can only
> > support power of 2 split it up to multiple transactions.
> 
> The problem is that when I was fixing the problem that vhost had with
> PT (a764040, "exec: abstract address_space_do_translate()"), I did
> broke the IOTLB translation a bit (it was using page masks). IMHO we
> need to fix it first for correctness (patch 1/2).
> 
> For patch 3, if we can have Jason's patch to allow dynamic
> iommu_platform switching, that'll be the best, then I can rewrite
> patch 3 with the switching logic rather than caching anything. But
> IMHO that can be separated from patch 1/2 if you like.
> 
> Or do you have better suggestion on how should we fix it?
> 
> Thanks,

Can we drop masks completely and replace them with a length? I think
we should do that instead of trying to fix masks.

> -- 
> Peter Xu
Peter Xu June 9, 2017, 1:58 a.m. UTC | #10
On Thu, Jun 08, 2017 at 09:59:50PM +0300, Michael S. Tsirkin wrote:
> On Thu, Jun 08, 2017 at 02:11:50PM +0800, Peter Xu wrote:
> > On Wed, Jun 07, 2017 at 04:07:20PM +0300, Michael S. Tsirkin wrote:
> > > On Wed, Jun 07, 2017 at 11:44:43AM +0800, Peter Xu wrote:
> > > > On Wed, Jun 07, 2017 at 09:47:05AM +1000, David Gibson wrote:
> > > > > On Tue, Jun 06, 2017 at 04:34:30PM +0200, Paolo Bonzini wrote:
> > > > > > 
> > > > > > 
> > > > > > On 05/06/2017 05:07, Peter Xu wrote:
> > > > > > > I don't sure whether it'll be a good interface for IOTLB. AFAIU at
> > > > > > > least for VT-d, the IOMMU translation is page aligned which is defined
> > > > > > > by spec, so it makes sense that (again at least for VT-d) here we'd
> > > > > > > better just use page_mask/addr_mask.
> > > > > > > 
> > > > > > > That's also how I know about IOMMU in general - I assume it do the
> > > > > > > translations always with page masks (never arbitary length), though
> > > > > > > page size can differ from platfrom to platform, that's why here the
> > > > > > > IOTLB interface used addr_mask, then it works for all platforms. I
> > > > > > > don't know whether I'm 100% correct here though.
> > > > > > > 
> > > > > > > Maybe David/Paolo/... would comment as well?
> > > > > > 
> > > > > > I would ask David.  There are PowerPC MMUs that allow fast lookup of
> > > > > > arbitrarily-sized windows (not necessarily power of two),
> > > > > 
> > > > > Uh.. I'm not sure what you mean here.  You might be thinking of the
> > > > > BATs which really old (32-bit) PowerPC MMUs had - those allow
> > > > > arbitrary large block translations, but they do have to be a power of
> > > > > two.
> > > > > 
> > > > > > so maybe the
> > > > > > IOMMUs can do the same.
> > > > > 
> > > > > The only Power IOMMU I know about uses a fixed, power-of-two page size
> > > > > per DMA window.
> > > > 
> > > > If so, I would still be inclined to keep using masks for QEMU IOTLB.
> > > > Then, my first two patches should still stand.
> > > > 
> > > > I am just afraid that not using masks will diverge the emulation from
> > > > real hardware and brings trouble one day.
> > > > 
> > > > For vhost IOTLB interface, it does not need to be strictly aligned to
> > > > QEMU IOMMU IOTLB definition, and that's how it's working now (current
> > > > vhost iotlb allows arbitary length, and I think it's good). So imho we
> > > > don't really need to worry about the performance - after all, we can
> > > > do everything customized for vhost, just like what patch 3 did (yeah,
> > > > it can be better...).
> > > > 
> > > > Thanks,
> > > 
> > > Pre-faults is also something that does not happen on real hardware.
> > > And it's about security so a bigger issue.
> > > 
> > > If I had to choose between that and using non-power-of-2 in
> > > the API, I'd go for non-power-of-2. Let backends that can only
> > > support power of 2 split it up to multiple transactions.
> > 
> > The problem is that when I was fixing the problem that vhost had with
> > PT (a764040, "exec: abstract address_space_do_translate()"), I did
> > broke the IOTLB translation a bit (it was using page masks). IMHO we
> > need to fix it first for correctness (patch 1/2).
> > 
> > For patch 3, if we can have Jason's patch to allow dynamic
> > iommu_platform switching, that'll be the best, then I can rewrite
> > patch 3 with the switching logic rather than caching anything. But
> > IMHO that can be separated from patch 1/2 if you like.
> > 
> > Or do you have better suggestion on how should we fix it?
> > 
> > Thanks,
> 
> Can we drop masks completely and replace with length? I think we
> should do that instead of trying to fix masks.

Do you mean to change IOMMUTLBEntry.addr_mask into a length?

Again, I am not sure this is good... At least we need to get an ack
from David, since spapr should be the initial user of it, and possibly
also from Alex, since vfio (IIUC both in QEMU and the kernel) assumes
that addr_mask is a page mask rather than an arbitrary length.

(CC Alex)

Thanks,
David Gibson June 9, 2017, 2:37 a.m. UTC | #11
On Fri, Jun 09, 2017 at 09:58:47AM +0800, Peter Xu wrote:
> On Thu, Jun 08, 2017 at 09:59:50PM +0300, Michael S. Tsirkin wrote:
> > On Thu, Jun 08, 2017 at 02:11:50PM +0800, Peter Xu wrote:
> > > On Wed, Jun 07, 2017 at 04:07:20PM +0300, Michael S. Tsirkin wrote:
> > > > On Wed, Jun 07, 2017 at 11:44:43AM +0800, Peter Xu wrote:
> > > > > On Wed, Jun 07, 2017 at 09:47:05AM +1000, David Gibson wrote:
> > > > > > On Tue, Jun 06, 2017 at 04:34:30PM +0200, Paolo Bonzini wrote:
> > > > > > > 
> > > > > > > 
> > > > > > > On 05/06/2017 05:07, Peter Xu wrote:
> > > > > > > > I don't sure whether it'll be a good interface for IOTLB. AFAIU at
> > > > > > > > least for VT-d, the IOMMU translation is page aligned which is defined
> > > > > > > > by spec, so it makes sense that (again at least for VT-d) here we'd
> > > > > > > > better just use page_mask/addr_mask.
> > > > > > > > 
> > > > > > > > That's also how I know about IOMMU in general - I assume it do the
> > > > > > > > translations always with page masks (never arbitary length), though
> > > > > > > > page size can differ from platfrom to platform, that's why here the
> > > > > > > > IOTLB interface used addr_mask, then it works for all platforms. I
> > > > > > > > don't know whether I'm 100% correct here though.
> > > > > > > > 
> > > > > > > > Maybe David/Paolo/... would comment as well?
> > > > > > > 
> > > > > > > I would ask David.  There are PowerPC MMUs that allow fast lookup of
> > > > > > > arbitrarily-sized windows (not necessarily power of two),
> > > > > > 
> > > > > > Uh.. I'm not sure what you mean here.  You might be thinking of the
> > > > > > BATs which really old (32-bit) PowerPC MMUs had - those allow
> > > > > > arbitrary large block translations, but they do have to be a power of
> > > > > > two.
> > > > > > 
> > > > > > > so maybe the
> > > > > > > IOMMUs can do the same.
> > > > > > 
> > > > > > The only Power IOMMU I know about uses a fixed, power-of-two page size
> > > > > > per DMA window.
> > > > > 
> > > > > If so, I would still be inclined to keep using masks for QEMU IOTLB.
> > > > > Then, my first two patches should still stand.
> > > > > 
> > > > > I am just afraid that not using masks will diverge the emulation from
> > > > > real hardware and brings trouble one day.
> > > > > 
> > > > > For vhost IOTLB interface, it does not need to be strictly aligned to
> > > > > QEMU IOMMU IOTLB definition, and that's how it's working now (current
> > > > > vhost iotlb allows arbitary length, and I think it's good). So imho we
> > > > > don't really need to worry about the performance - after all, we can
> > > > > do everything customized for vhost, just like what patch 3 did (yeah,
> > > > > it can be better...).
> > > > > 
> > > > > Thanks,
> > > > 
> > > > Pre-faults is also something that does not happen on real hardware.
> > > > And it's about security so a bigger issue.
> > > > 
> > > > If I had to choose between that and using non-power-of-2 in
> > > > the API, I'd go for non-power-of-2. Let backends that can only
> > > > support power of 2 split it up to multiple transactions.
> > > 
> > > The problem is that when I was fixing the problem that vhost had with
> > > PT (a764040, "exec: abstract address_space_do_translate()"), I did
> > > broke the IOTLB translation a bit (it was using page masks). IMHO we
> > > need to fix it first for correctness (patch 1/2).
> > > 
> > > For patch 3, if we can have Jason's patch to allow dynamic
> > > iommu_platform switching, that'll be the best, then I can rewrite
> > > patch 3 with the switching logic rather than caching anything. But
> > > IMHO that can be separated from patch 1/2 if you like.
> > > 
> > > Or do you have better suggestion on how should we fix it?
> > > 
> > > Thanks,
> > 
> > Can we drop masks completely and replace with length? I think we
> > should do that instead of trying to fix masks.
> 
> Do you mean to modify IOMMUTLBEntry.addr_mask into length?
> 
> Again, I am not sure this is good... At least we need to get ack from
> David since spapr should be the initial user of it, and possibly also
> Alex since vfio should be assuming that (IIUC both in QEMU and kernel)
> addr_mask is page masks rather than arbirary length.

So, I don't see that using size instead of mask would be a particular
problem for spapr.  However, I also don't see any advantage to
switching.
Michael S. Tsirkin June 11, 2017, 10:09 a.m. UTC | #12
On Fri, Jun 09, 2017 at 09:58:47AM +0800, Peter Xu wrote:
> > > The problem is that when I was fixing the problem that vhost had with
> > > PT (a764040, "exec: abstract address_space_do_translate()"), I did
> > > broke the IOTLB translation a bit (it was using page masks). IMHO we
> > > need to fix it first for correctness (patch 1/2).
> > > 
> > > For patch 3, if we can have Jason's patch to allow dynamic
> > > iommu_platform switching, that'll be the best, then I can rewrite
> > > patch 3 with the switching logic rather than caching anything. But
> > > IMHO that can be separated from patch 1/2 if you like.
> > > 
> > > Or do you have better suggestion on how should we fix it?
> > > 
> > > Thanks,
> > 
> > Can we drop masks completely and replace with length? I think we
> > should do that instead of trying to fix masks.
> 
> Do you mean to modify IOMMUTLBEntry.addr_mask into length?

I think it's better than alternatives.

> Again, I am not sure this is good... At least we need to get ack from
> David since spapr should be the initial user of it, and possibly also
> Alex since vfio should be assuming that (IIUC both in QEMU and kernel)
> addr_mask is page masks rather than arbirary length.
> 
> (CC Alex)
> 
> Thanks,

Callbacks that need powers of two can easily split up the range.


> -- 
> Peter Xu
David Gibson June 11, 2017, 12:10 p.m. UTC | #13
On Sun, Jun 11, 2017 at 01:09:26PM +0300, Michael S. Tsirkin wrote:
> On Fri, Jun 09, 2017 at 09:58:47AM +0800, Peter Xu wrote:
> > > > The problem is that when I was fixing the problem that vhost had with
> > > > PT (a764040, "exec: abstract address_space_do_translate()"), I did
> > > > broke the IOTLB translation a bit (it was using page masks). IMHO we
> > > > need to fix it first for correctness (patch 1/2).
> > > > 
> > > > For patch 3, if we can have Jason's patch to allow dynamic
> > > > iommu_platform switching, that'll be the best, then I can rewrite
> > > > patch 3 with the switching logic rather than caching anything. But
> > > > IMHO that can be separated from patch 1/2 if you like.
> > > > 
> > > > Or do you have better suggestion on how should we fix it?
> > > > 
> > > > Thanks,
> > > 
> > > Can we drop masks completely and replace with length? I think we
> > > should do that instead of trying to fix masks.
> > 
> > Do you mean to modify IOMMUTLBEntry.addr_mask into length?
> 
> I think it's better than alternatives.
> 
> > Again, I am not sure this is good... At least we need to get ack from
> > David since spapr should be the initial user of it, and possibly also
> > Alex since vfio should be assuming that (IIUC both in QEMU and kernel)
> > addr_mask is page masks rather than arbirary length.
> > 
> > (CC Alex)
> > 
> > Thanks,
> 
> Callbacks that need powers of two can easily split up the range.

I think I missed part of the thread.  What's the original use case for
non-power-of-two IOTLB entries?  It certainly won't happen on Power.
Peter Xu June 12, 2017, 2:34 a.m. UTC | #14
On Sun, Jun 11, 2017 at 08:10:15PM +0800, David Gibson wrote:
> On Sun, Jun 11, 2017 at 01:09:26PM +0300, Michael S. Tsirkin wrote:
> > On Fri, Jun 09, 2017 at 09:58:47AM +0800, Peter Xu wrote:
> > > > > The problem is that when I was fixing the problem that vhost had with
> > > > > PT (a764040, "exec: abstract address_space_do_translate()"), I did
> > > > > broke the IOTLB translation a bit (it was using page masks). IMHO we
> > > > > need to fix it first for correctness (patch 1/2).
> > > > > 
> > > > > For patch 3, if we can have Jason's patch to allow dynamic
> > > > > iommu_platform switching, that'll be the best, then I can rewrite
> > > > > patch 3 with the switching logic rather than caching anything. But
> > > > > IMHO that can be separated from patch 1/2 if you like.
> > > > > 
> > > > > Or do you have better suggestion on how should we fix it?
> > > > > 
> > > > > Thanks,
> > > > 
> > > > Can we drop masks completely and replace with length? I think we
> > > > should do that instead of trying to fix masks.
> > > 
> > > Do you mean to modify IOMMUTLBEntry.addr_mask into length?
> > 
> > I think it's better than alternatives.
> > 
> > > Again, I am not sure this is good... At least we need to get ack from
> > > David since spapr should be the initial user of it, and possibly also
> > > Alex since vfio should be assuming that (IIUC both in QEMU and kernel)
> > > addr_mask is page masks rather than arbirary length.
> > > 
> > > (CC Alex)
> > > 
> > > Thanks,
> > 
> > Callbacks that need powers of two can easily split up the range.
> 
> I think I missed part of the thread.  What's the original use case for
> non-power-of-two IOTLB entries?  It certainly won't happen on Power.

Currently address_space_get_iotlb_entry() doesn't really follow that
rule: addr_mask can be an arbitrary length. This series tries to fix
that, while Michael is questioning whether we should really fix it at
all.

Michael,

Even just for performance's sake, I still think we should fix it.
Let's consider the simplest worst case: we have a single page mapped
with the IOVA range (2M page):

 [0x0, 0x200000)

And if the guest accesses IOVA with the following pattern:

 0x1fffff, 0x1ffffe, 0x1ffffd, ...

Then we'll get this:

- request 0x1fffff, cache miss, will get iotlb [0x1fffff, 0x200000)
- request 0x1ffffe, cache miss, will get iotlb [0x1ffffe, 0x200000)
- request 0x1ffffd, cache miss, will get iotlb [0x1ffffd, 0x200000)
- ...

We'll miss the cache all the way down until we access 0x0. Whereas if
we use the page mask, we'll get:

- request 0x1fffff, cache miss, will get iotlb [0x0, 0x200000)
- request 0x1ffffe, cache hit
- request 0x1ffffd, cache hit
- ...

We'll only miss at the first IO.
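
In other words, with a page-aligned entry the lookup on the consumer
side can be a simple mask-and-compare, roughly like this (a
hypothetical helper, not actual vhost code):

static bool iotlb_entry_hit(const IOMMUTLBEntry *e, hwaddr iova)
{
    /* With .iova/.addr_mask describing the whole 2M page, every access
     * in [0x0, 0x200000) matches this single cached entry. */
    return (iova & ~e->addr_mask) == e->iova;
}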
Michael S. Tsirkin June 12, 2017, 3:07 a.m. UTC | #15
On Mon, Jun 12, 2017 at 10:34:43AM +0800, Peter Xu wrote:
> On Sun, Jun 11, 2017 at 08:10:15PM +0800, David Gibson wrote:
> > On Sun, Jun 11, 2017 at 01:09:26PM +0300, Michael S. Tsirkin wrote:
> > > On Fri, Jun 09, 2017 at 09:58:47AM +0800, Peter Xu wrote:
> > > > > > The problem is that when I was fixing the problem that vhost had with
> > > > > > PT (a764040, "exec: abstract address_space_do_translate()"), I did
> > > > > > broke the IOTLB translation a bit (it was using page masks). IMHO we
> > > > > > need to fix it first for correctness (patch 1/2).
> > > > > > 
> > > > > > For patch 3, if we can have Jason's patch to allow dynamic
> > > > > > iommu_platform switching, that'll be the best, then I can rewrite
> > > > > > patch 3 with the switching logic rather than caching anything. But
> > > > > > IMHO that can be separated from patch 1/2 if you like.
> > > > > > 
> > > > > > Or do you have better suggestion on how should we fix it?
> > > > > > 
> > > > > > Thanks,
> > > > > 
> > > > > Can we drop masks completely and replace with length? I think we
> > > > > should do that instead of trying to fix masks.
> > > > 
> > > > Do you mean to modify IOMMUTLBEntry.addr_mask into length?
> > > 
> > > I think it's better than alternatives.
> > > 
> > > > Again, I am not sure this is good... At least we need to get ack from
> > > > David since spapr should be the initial user of it, and possibly also
> > > > Alex since vfio should be assuming that (IIUC both in QEMU and kernel)
> > > > addr_mask is page masks rather than arbirary length.
> > > > 
> > > > (CC Alex)
> > > > 
> > > > Thanks,
> > > 
> > > Callbacks that need powers of two can easily split up the range.
> > 
> > I think I missed part of the thread.  What's the original use case for
> > non-power-of-two IOTLB entries?  It certainly won't happen on Power.
> 
> Currently address_space_get_iotlb_entry() didn't really follow the
> rule, addr_mask can be arbitary length. This series tried to fix it,
> while Michael was questioning about whether we should really fix that
> at all.
> 
> Michael,
> 
> Even if for performance's sake, I should still think we should fix it.
> Let's consider a most simple worst case: we have a single page mapped
> with IOVA range (2M page):
> 
>  [0x0, 0x200000)
> 
> And if guest access IOVA using the following patern:
> 
>  0x1fffff, 0x1ffffe, 0x1ffffd, ...
> 
> Then now we'll get this:
> 
> - request 0x1fffff, cache miss, will get iotlb [0x1fffff, 0x200000)
> - request 0x1ffffe, cache miss, will get iotlb [0x1ffffe, 0x200000)
> - request 0x1ffffd, cache miss, will get iotlb [0x1ffffd, 0x200000)
> - ...

We pass an offset too, do we not? So the callee can figure out that
the region starts at 0x0 and avoid the 2nd and 3rd misses.


> We'll all cache miss along the way until we access 0x0. While if we
> are with page mask, we'll get:
> 
> - request 0x1fffff, cache miss, will get iotlb [0x0, 0x200000)
> - request 0x1ffffe, cache hit
> - request 0x1ffffd, cache hit
> - ...
> 
> We'll only miss at the first IO.

I think we should send as much info as we can.
There should be a way to find the full region start and length.

> -- 
> Peter Xu
Peter Xu June 12, 2017, 4:04 a.m. UTC | #16
On Mon, Jun 12, 2017 at 06:07:04AM +0300, Michael S. Tsirkin wrote:
> On Mon, Jun 12, 2017 at 10:34:43AM +0800, Peter Xu wrote:
> > On Sun, Jun 11, 2017 at 08:10:15PM +0800, David Gibson wrote:
> > > On Sun, Jun 11, 2017 at 01:09:26PM +0300, Michael S. Tsirkin wrote:
> > > > On Fri, Jun 09, 2017 at 09:58:47AM +0800, Peter Xu wrote:
> > > > > > > The problem is that when I was fixing the problem that vhost had with
> > > > > > > PT (a764040, "exec: abstract address_space_do_translate()"), I did
> > > > > > > broke the IOTLB translation a bit (it was using page masks). IMHO we
> > > > > > > need to fix it first for correctness (patch 1/2).
> > > > > > > 
> > > > > > > For patch 3, if we can have Jason's patch to allow dynamic
> > > > > > > iommu_platform switching, that'll be the best, then I can rewrite
> > > > > > > patch 3 with the switching logic rather than caching anything. But
> > > > > > > IMHO that can be separated from patch 1/2 if you like.
> > > > > > > 
> > > > > > > Or do you have better suggestion on how should we fix it?
> > > > > > > 
> > > > > > > Thanks,
> > > > > > 
> > > > > > Can we drop masks completely and replace with length? I think we
> > > > > > should do that instead of trying to fix masks.
> > > > > 
> > > > > Do you mean to modify IOMMUTLBEntry.addr_mask into length?
> > > > 
> > > > I think it's better than alternatives.
> > > > 
> > > > > Again, I am not sure this is good... At least we need to get ack from
> > > > > David since spapr should be the initial user of it, and possibly also
> > > > > Alex since vfio should be assuming that (IIUC both in QEMU and kernel)
> > > > > addr_mask is page masks rather than arbirary length.
> > > > > 
> > > > > (CC Alex)
> > > > > 
> > > > > Thanks,
> > > > 
> > > > Callbacks that need powers of two can easily split up the range.
> > > 
> > > I think I missed part of the thread.  What's the original use case for
> > > non-power-of-two IOTLB entries?  It certainly won't happen on Power.
> > 
> > Currently address_space_get_iotlb_entry() didn't really follow the
> > rule, addr_mask can be arbitary length. This series tried to fix it,
> > while Michael was questioning about whether we should really fix that
> > at all.
> > 
> > Michael,
> > 
> > Even if for performance's sake, I should still think we should fix it.
> > Let's consider a most simple worst case: we have a single page mapped
> > with IOVA range (2M page):
> > 
> >  [0x0, 0x200000)
> > 
> > And if guest access IOVA using the following patern:
> > 
> >  0x1fffff, 0x1ffffe, 0x1ffffd, ...
> > 
> > Then now we'll get this:
> > 
> > - request 0x1fffff, cache miss, will get iotlb [0x1fffff, 0x200000)
> > - request 0x1ffffe, cache miss, will get iotlb [0x1ffffe, 0x200000)
> > - request 0x1ffffd, cache miss, will get iotlb [0x1ffffd, 0x200000)
> > - ...
> 
> We pass an offset too, do we not? So callee can figure out
> the region starts at 0x0 and avoid 2nd and 3rd misses.

Here when you say "offset", do you mean the offset in
MemoryRegionSection?

In address_space_get_iotlb_entry() we have this:

    section = address_space_do_translate(as, addr, &xlat, &plen,
                                         is_write, false);

One thing to mention is that, IMHO, we cannot really assume the xlat
is valid over the whole "section" range - the section can be a huge
GPA range, while the xlat may only be valid for a single 4K page. The
only safe region we can use here is (xlat, xlat+plen); outside that,
we know nothing is valid.

Please correct me if I missed the point.

> 
> 
> > We'll all cache miss along the way until we access 0x0. While if we
> > are with page mask, we'll get:
> > 
> > - request 0x1fffff, cache miss, will get iotlb [0x0, 0x200000)
> > - request 0x1ffffe, cache hit
> > - request 0x1ffffd, cache hit
> > - ...
> > 
> > We'll only miss at the first IO.
> 
> I think we should send as much info as we can.
> There should be a way to find full region start and length.

Thanks,
Michael S. Tsirkin June 14, 2017, 6:34 p.m. UTC | #17
On Mon, Jun 12, 2017 at 12:04:58PM +0800, Peter Xu wrote:
> On Mon, Jun 12, 2017 at 06:07:04AM +0300, Michael S. Tsirkin wrote:
> > On Mon, Jun 12, 2017 at 10:34:43AM +0800, Peter Xu wrote:
> > > On Sun, Jun 11, 2017 at 08:10:15PM +0800, David Gibson wrote:
> > > > On Sun, Jun 11, 2017 at 01:09:26PM +0300, Michael S. Tsirkin wrote:
> > > > > On Fri, Jun 09, 2017 at 09:58:47AM +0800, Peter Xu wrote:
> > > > > > > > The problem is that when I was fixing the problem that vhost had with
> > > > > > > > PT (a764040, "exec: abstract address_space_do_translate()"), I did
> > > > > > > > broke the IOTLB translation a bit (it was using page masks). IMHO we
> > > > > > > > need to fix it first for correctness (patch 1/2).
> > > > > > > > 
> > > > > > > > For patch 3, if we can have Jason's patch to allow dynamic
> > > > > > > > iommu_platform switching, that'll be the best, then I can rewrite
> > > > > > > > patch 3 with the switching logic rather than caching anything. But
> > > > > > > > IMHO that can be separated from patch 1/2 if you like.
> > > > > > > > 
> > > > > > > > Or do you have better suggestion on how should we fix it?
> > > > > > > > 
> > > > > > > > Thanks,
> > > > > > > 
> > > > > > > Can we drop masks completely and replace with length? I think we
> > > > > > > should do that instead of trying to fix masks.
> > > > > > 
> > > > > > Do you mean to modify IOMMUTLBEntry.addr_mask into length?
> > > > > 
> > > > > I think it's better than alternatives.
> > > > > 
> > > > > > Again, I am not sure this is good... At least we need to get ack from
> > > > > > David since spapr should be the initial user of it, and possibly also
> > > > > > Alex since vfio should be assuming that (IIUC both in QEMU and kernel)
> > > > > > addr_mask is page masks rather than arbirary length.
> > > > > > 
> > > > > > (CC Alex)
> > > > > > 
> > > > > > Thanks,
> > > > > 
> > > > > Callbacks that need powers of two can easily split up the range.
> > > > 
> > > > I think I missed part of the thread.  What's the original use case for
> > > > non-power-of-two IOTLB entries?  It certainly won't happen on Power.
> > > 
> > > Currently address_space_get_iotlb_entry() didn't really follow the
> > > rule, addr_mask can be arbitary length. This series tried to fix it,
> > > while Michael was questioning about whether we should really fix that
> > > at all.
> > > 
> > > Michael,
> > > 
> > > Even if for performance's sake, I should still think we should fix it.
> > > Let's consider a most simple worst case: we have a single page mapped
> > > with IOVA range (2M page):
> > > 
> > >  [0x0, 0x200000)
> > > 
> > > And if guest access IOVA using the following patern:
> > > 
> > >  0x1fffff, 0x1ffffe, 0x1ffffd, ...
> > > 
> > > Then now we'll get this:
> > > 
> > > - request 0x1fffff, cache miss, will get iotlb [0x1fffff, 0x200000)
> > > - request 0x1ffffe, cache miss, will get iotlb [0x1ffffe, 0x200000)
> > > - request 0x1ffffd, cache miss, will get iotlb [0x1ffffd, 0x200000)
> > > - ...
> > 
> > We pass an offset too, do we not? So callee can figure out
> > the region starts at 0x0 and avoid 2nd and 3rd misses.
> 
> Here when you say "offset", do you mean the offset in
> MemoryRegionSection?
> 
> In address_space_get_iotlb_entry() we have this:
> 
>     section = address_space_do_translate(as, addr, &xlat, &plen,
>                                          is_write, false);
> 
> One thing to mention is that, imho we cannot really assume the xlat is
> valid on the whole "section" range - the section can be a huge GPA
> range, while the xlat may only be valid on a single 4K page. The only
> safe region we can use here is (xlat, xlat+plen). Outside that, we
> should know nothing valid.
> 
> Please correct me if I didn't really catch the point..

IIUC the section is the translation result. If so, all of it is valid,
not just one page.

> > 
> > 
> > > We'll all cache miss along the way until we access 0x0. While if we
> > > are with page mask, we'll get:
> > > 
> > > - request 0x1fffff, cache miss, will get iotlb [0x0, 0x200000)
> > > - request 0x1ffffe, cache hit
> > > - request 0x1ffffd, cache hit
> > > - ...
> > > 
> > > We'll only miss at the first IO.
> > 
> > I think we should send as much info as we can.
> > There should be a way to find full region start and length.
> 
> Thanks,
> 
> -- 
> Peter Xu
Peter Xu June 15, 2017, 2:31 a.m. UTC | #18
On Wed, Jun 14, 2017 at 09:34:52PM +0300, Michael S. Tsirkin wrote:
> On Mon, Jun 12, 2017 at 12:04:58PM +0800, Peter Xu wrote:
> > On Mon, Jun 12, 2017 at 06:07:04AM +0300, Michael S. Tsirkin wrote:
> > > On Mon, Jun 12, 2017 at 10:34:43AM +0800, Peter Xu wrote:
> > > > On Sun, Jun 11, 2017 at 08:10:15PM +0800, David Gibson wrote:
> > > > > On Sun, Jun 11, 2017 at 01:09:26PM +0300, Michael S. Tsirkin wrote:
> > > > > > On Fri, Jun 09, 2017 at 09:58:47AM +0800, Peter Xu wrote:
> > > > > > > > > The problem is that when I was fixing the problem that vhost had with
> > > > > > > > > PT (a764040, "exec: abstract address_space_do_translate()"), I did
> > > > > > > > > broke the IOTLB translation a bit (it was using page masks). IMHO we
> > > > > > > > > need to fix it first for correctness (patch 1/2).
> > > > > > > > > 
> > > > > > > > > For patch 3, if we can have Jason's patch to allow dynamic
> > > > > > > > > iommu_platform switching, that'll be the best, then I can rewrite
> > > > > > > > > patch 3 with the switching logic rather than caching anything. But
> > > > > > > > > IMHO that can be separated from patch 1/2 if you like.
> > > > > > > > > 
> > > > > > > > > Or do you have better suggestion on how should we fix it?
> > > > > > > > > 
> > > > > > > > > Thanks,
> > > > > > > > 
> > > > > > > > Can we drop masks completely and replace with length? I think we
> > > > > > > > should do that instead of trying to fix masks.
> > > > > > > 
> > > > > > > Do you mean to modify IOMMUTLBEntry.addr_mask into length?
> > > > > > 
> > > > > > I think it's better than alternatives.
> > > > > > 
> > > > > > > Again, I am not sure this is good... At least we need to get ack from
> > > > > > > David since spapr should be the initial user of it, and possibly also
> > > > > > > Alex since vfio should be assuming that (IIUC both in QEMU and kernel)
> > > > > > > addr_mask is page masks rather than arbirary length.
> > > > > > > 
> > > > > > > (CC Alex)
> > > > > > > 
> > > > > > > Thanks,
> > > > > > 
> > > > > > Callbacks that need powers of two can easily split up the range.
> > > > > 
> > > > > I think I missed part of the thread.  What's the original use case for
> > > > > non-power-of-two IOTLB entries?  It certainly won't happen on Power.
> > > > 
> > > > Currently address_space_get_iotlb_entry() didn't really follow the
> > > > rule, addr_mask can be arbitary length. This series tried to fix it,
> > > > while Michael was questioning about whether we should really fix that
> > > > at all.
> > > > 
> > > > Michael,
> > > > 
> > > > Even if for performance's sake, I should still think we should fix it.
> > > > Let's consider a most simple worst case: we have a single page mapped
> > > > with IOVA range (2M page):
> > > > 
> > > >  [0x0, 0x200000)
> > > > 
> > > > And if guest access IOVA using the following patern:
> > > > 
> > > >  0x1fffff, 0x1ffffe, 0x1ffffd, ...
> > > > 
> > > > Then now we'll get this:
> > > > 
> > > > - request 0x1fffff, cache miss, will get iotlb [0x1fffff, 0x200000)
> > > > - request 0x1ffffe, cache miss, will get iotlb [0x1ffffe, 0x200000)
> > > > - request 0x1ffffd, cache miss, will get iotlb [0x1ffffd, 0x200000)
> > > > - ...
> > > 
> > > We pass an offset too, do we not? So callee can figure out
> > > the region starts at 0x0 and avoid 2nd and 3rd misses.
> > 
> > Here when you say "offset", do you mean the offset in
> > MemoryRegionSection?
> > 
> > In address_space_get_iotlb_entry() we have this:
> > 
> >     section = address_space_do_translate(as, addr, &xlat, &plen,
> >                                          is_write, false);
> > 
> > One thing to mention is that, imho we cannot really assume the xlat is
> > valid on the whole "section" range - the section can be a huge GPA
> > range, while the xlat may only be valid on a single 4K page. The only
> > safe region we can use here is (xlat, xlat+plen). Outside that, we
> > should know nothing valid.

[1]

> > 
> > Please correct me if I didn't really catch the point..
> 
> IIUC section is the translation result. If so all of it is valid,
> not just one page.

It should be only one page (though I think that depends on how the
translation logic is implemented). Please check this gdb session on
current master:

(gdb) b address_space_get_iotlb_entry 
Breakpoint 2 at 0x55555576803a: file /root/git/qemu/exec.c, line 512.
... (until break)
(gdb) bt
#0  0x000055555576803a in address_space_get_iotlb_entry (as=0x55555841bbb0, addr=4294959104, is_write=false) at /root/git/qemu/exec.c:512
#1  0x00005555558322dd in vhost_device_iotlb_miss (dev=0x5555568b03a0, iova=4294959104, write=0) at /root/git/qemu/hw/virtio/vhost.c:982
#2  0x0000555555834ad9 in vhost_backend_handle_iotlb_msg (dev=0x5555568b03a0, imsg=0x7fffffffd428) at /root/git/qemu/hw/virtio/vhost-backend.c:334
#3  0x00005555558347be in vhost_kernel_iotlb_read (opaque=0x5555568b03a0) at /root/git/qemu/hw/virtio/vhost-backend.c:204
#4  0x0000555555c7e7c3 in aio_dispatch_handlers (ctx=0x5555568091b0) at /root/git/qemu/util/aio-posix.c:399
#5  0x0000555555c7e956 in aio_dispatch (ctx=0x5555568091b0) at /root/git/qemu/util/aio-posix.c:430
#6  0x0000555555c7a722 in aio_ctx_dispatch (source=0x5555568091b0, callback=0x0, user_data=0x0) at /root/git/qemu/util/async.c:261
#7  0x00007f98c62f3703 in g_main_context_dispatch () at /lib64/libglib-2.0.so.0
#8  0x0000555555c7d36c in glib_pollfds_poll () at /root/git/qemu/util/main-loop.c:213
#9  0x0000555555c7d468 in os_host_main_loop_wait (timeout=29311167) at /root/git/qemu/util/main-loop.c:261
#10 0x0000555555c7d521 in main_loop_wait (nonblocking=0) at /root/git/qemu/util/main-loop.c:517
#11 0x00005555558fe234 in main_loop () at /root/git/qemu/vl.c:1918
#12 0x0000555555905fe8 in main (argc=21, argv=0x7fffffffda48, envp=0x7fffffffdaf8) at /root/git/qemu/vl.c:4752
(gdb) s
517         plen = (hwaddr)-1;
(gdb)
520         section = address_space_do_translate(as, addr, &xlat, &plen,
(gdb) n
524         if (section.mr == &io_mem_unassigned) {
(gdb) p/x xlat
$2 = 0x73900000
(gdb) p/x addr
$3 = 0xffffe000
(gdb) p/x plen
$4 = 0x1000
(gdb) p section
$5 = {mr = 0x5555569ad790, address_space = 0x5555562f9b40 <address_space_memory>, offset_within_region = 1048576, size = 0x0000000000000000000000007ff00000, 
  offset_within_address_space = 1048576, readonly = false}

Here $2-$4 show that we are translating IOVA 0xffffe000 into the range
[0x73900000, 0x73901000), while the section has size 0x7ff00000,
covering [0x100000000, 0x17ff00000). That's the guest's upper RAM,
corresponding to what we can see in "info mtree -f" (the guest has 4G
RAM; this is the upper 2G):

  0000000100000000-000000017fffffff (prio 0, ram): pc.ram @0000000080000000

Obviously, we cannot assume we have a linear mapping over this whole
range. So I don't think we can really use the section info (though the
section can be used to obtain some offset information, or the address
space the range belongs to) to build up the IOTLB - only the plen.
So we need to fix the current code.

And this is exactly what I meant above at [1].  Hope this clarifies
the issue a bit.  Thanks,

> 
> > > 
> > > 
> > > > We'll all cache miss along the way until we access 0x0. While if we
> > > > are with page mask, we'll get:
> > > > 
> > > > - request 0x1fffff, cache miss, will get iotlb [0x0, 0x200000)
> > > > - request 0x1ffffe, cache hit
> > > > - request 0x1ffffd, cache hit
> > > > - ...
> > > > 
> > > > We'll only miss at the first IO.
> > > 
> > > I think we should send as much info as we can.
> > > There should be a way to find full region start and length.
> > 
> > Thanks,
> > 
> > -- 
> > Peter Xu
Peter Xu June 15, 2017, 2:57 a.m. UTC | #19
On Thu, Jun 15, 2017 at 10:31:11AM +0800, Peter Xu wrote:
> On Wed, Jun 14, 2017 at 09:34:52PM +0300, Michael S. Tsirkin wrote:
> > On Mon, Jun 12, 2017 at 12:04:58PM +0800, Peter Xu wrote:
> > > On Mon, Jun 12, 2017 at 06:07:04AM +0300, Michael S. Tsirkin wrote:
> > > > On Mon, Jun 12, 2017 at 10:34:43AM +0800, Peter Xu wrote:
> > > > > On Sun, Jun 11, 2017 at 08:10:15PM +0800, David Gibson wrote:
> > > > > > On Sun, Jun 11, 2017 at 01:09:26PM +0300, Michael S. Tsirkin wrote:
> > > > > > > On Fri, Jun 09, 2017 at 09:58:47AM +0800, Peter Xu wrote:
> > > > > > > > > > The problem is that when I was fixing the problem that vhost had with
> > > > > > > > > > PT (a764040, "exec: abstract address_space_do_translate()"), I did
> > > > > > > > > > broke the IOTLB translation a bit (it was using page masks). IMHO we
> > > > > > > > > > need to fix it first for correctness (patch 1/2).
> > > > > > > > > > 
> > > > > > > > > > For patch 3, if we can have Jason's patch to allow dynamic
> > > > > > > > > > iommu_platform switching, that'll be the best, then I can rewrite
> > > > > > > > > > patch 3 with the switching logic rather than caching anything. But
> > > > > > > > > > IMHO that can be separated from patch 1/2 if you like.
> > > > > > > > > > 
> > > > > > > > > > Or do you have better suggestion on how should we fix it?
> > > > > > > > > > 
> > > > > > > > > > Thanks,
> > > > > > > > > 
> > > > > > > > > Can we drop masks completely and replace with length? I think we
> > > > > > > > > should do that instead of trying to fix masks.
> > > > > > > > 
> > > > > > > > Do you mean to modify IOMMUTLBEntry.addr_mask into length?
> > > > > > > 
> > > > > > > I think it's better than alternatives.
> > > > > > > 
> > > > > > > > Again, I am not sure this is good... At least we need to get ack from
> > > > > > > > David since spapr should be the initial user of it, and possibly also
> > > > > > > > Alex since vfio should be assuming that (IIUC both in QEMU and kernel)
> > > > > > > > addr_mask is a page mask rather than an arbitrary length.
> > > > > > > > 
> > > > > > > > (CC Alex)
> > > > > > > > 
> > > > > > > > Thanks,
> > > > > > > 
> > > > > > > Callbacks that need powers of two can easily split up the range.
> > > > > > 
> > > > > > I think I missed part of the thread.  What's the original use case for
> > > > > > non-power-of-two IOTLB entries?  It certainly won't happen on Power.
> > > > > 
> > > > > Currently address_space_get_iotlb_entry() didn't really follow the
> > > > > rule; addr_mask can be an arbitrary length. This series tried to fix it,
> > > > > while Michael was questioning whether we should really fix that
> > > > > at all.
> > > > > 
> > > > > Michael,
> > > > > 
> > > > > Even just for performance's sake, I still think we should fix it.
> > > > > Let's consider the simplest worst case: we have a single page mapped
> > > > > with this IOVA range (a 2M page):
> > > > > 
> > > > >  [0x0, 0x200000)
> > > > > 
> > > > > And if the guest accesses IOVA using the following pattern:
> > > > > 
> > > > >  0x1fffff, 0x1ffffe, 0x1ffffd, ...
> > > > > 
> > > > > Then now we'll get this:
> > > > > 
> > > > > - request 0x1fffff, cache miss, will get iotlb [0x1fffff, 0x200000)
> > > > > - request 0x1ffffe, cache miss, will get iotlb [0x1ffffe, 0x200000)
> > > > > - request 0x1ffffd, cache miss, will get iotlb [0x1ffffd, 0x200000)
> > > > > - ...
> > > > 
> > > > We pass an offset too, do we not? So callee can figure out
> > > > the region starts at 0x0 and avoid 2nd and 3rd misses.
> > > 
> > > Here when you say "offset", do you mean the offset in
> > > MemoryRegionSection?
> > > 
> > > In address_space_get_iotlb_entry() we have this:
> > > 
> > >     section = address_space_do_translate(as, addr, &xlat, &plen,
> > >                                          is_write, false);
> > > 
> > > One thing to mention is that, imho we cannot really assume the xlat is
> > > valid on the whole "section" range - the section can be a huge GPA
> > > range, while the xlat may only be valid on a single 4K page. The only
> > > safe region we can use here is (xlat, xlat+plen). Outside that, we
> > > should know nothing valid.
> 
> [1]
> 
> > > 
> > > Please correct me if I didn't really catch the point..
> > 
> > IIUC section is the translation result. If so all of it is valid,
> > not just one page.
> 
> It should be only one page (I think that depends on how we implemented
> the translation logic). Please check this (gdb session on current
> master):
> 
> (gdb) b address_space_get_iotlb_entry 
> Breakpoint 2 at 0x55555576803a: file /root/git/qemu/exec.c, line 512.
> ... (until break)
> (gdb) bt
> #0  0x000055555576803a in address_space_get_iotlb_entry (as=0x55555841bbb0, addr=4294959104, is_write=false) at /root/git/qemu/exec.c:512
> #1  0x00005555558322dd in vhost_device_iotlb_miss (dev=0x5555568b03a0, iova=4294959104, write=0) at /root/git/qemu/hw/virtio/vhost.c:982
> #2  0x0000555555834ad9 in vhost_backend_handle_iotlb_msg (dev=0x5555568b03a0, imsg=0x7fffffffd428) at /root/git/qemu/hw/virtio/vhost-backend.c:334
> #3  0x00005555558347be in vhost_kernel_iotlb_read (opaque=0x5555568b03a0) at /root/git/qemu/hw/virtio/vhost-backend.c:204
> #4  0x0000555555c7e7c3 in aio_dispatch_handlers (ctx=0x5555568091b0) at /root/git/qemu/util/aio-posix.c:399
> #5  0x0000555555c7e956 in aio_dispatch (ctx=0x5555568091b0) at /root/git/qemu/util/aio-posix.c:430
> #6  0x0000555555c7a722 in aio_ctx_dispatch (source=0x5555568091b0, callback=0x0, user_data=0x0) at /root/git/qemu/util/async.c:261
> #7  0x00007f98c62f3703 in g_main_context_dispatch () at /lib64/libglib-2.0.so.0
> #8  0x0000555555c7d36c in glib_pollfds_poll () at /root/git/qemu/util/main-loop.c:213
> #9  0x0000555555c7d468 in os_host_main_loop_wait (timeout=29311167) at /root/git/qemu/util/main-loop.c:261
> #10 0x0000555555c7d521 in main_loop_wait (nonblocking=0) at /root/git/qemu/util/main-loop.c:517
> #11 0x00005555558fe234 in main_loop () at /root/git/qemu/vl.c:1918
> #12 0x0000555555905fe8 in main (argc=21, argv=0x7fffffffda48, envp=0x7fffffffdaf8) at /root/git/qemu/vl.c:4752
> (gdb) s
> 517         plen = (hwaddr)-1;
> (gdb)
> 520         section = address_space_do_translate(as, addr, &xlat, &plen,
> (gdb) n
> 524         if (section.mr == &io_mem_unassigned) {
> (gdb) p/x xlat
> $2 = 0x73900000
> (gdb) p/x addr
> $3 = 0xffffe000
> (gdb) p/x plen
> $4 = 0x1000
> (gdb) p section
> $5 = {mr = 0x5555569ad790, address_space = 0x5555562f9b40 <address_space_memory>, offset_within_region = 1048576, size = 0x0000000000000000000000007ff00000, 
>   offset_within_address_space = 1048576, readonly = false}
> 
> Here $2-$4 show that we are translating IOVA 0xffffe000 into the range
> [0x73900000, 0x73901000), while the section has size
> 0x7ff00000,

> i.e. it ranges over [0x100000000, 0x17ff00000).  That's the upper RAM
> that the guest has, corresponding to what we can see in "info mtree -f"
> (the guest has 4G RAM; this is the upper 2G):
> 
>   0000000100000000-000000017fffffff (prio 0, ram): pc.ram @0000000080000000

Hmm I made a mistake here... It should be the lower 2G mem (not
upper), which ranges from:

  0000000000100000-000000007fffffff (prio 0, ram): pc.ram @0000000000100000

Though the main point still stands below...
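
Just to spell out the arithmetic behind this correction, using the gdb
values quoted above and assuming xlat at that point is still the offset
within the memory region (the later adjustment in
address_space_get_iotlb_entry() converts it into an address space
offset):

    translated = xlat + offset_within_address_space - offset_within_region
               = 0x73900000 + 0x100000 - 0x100000
               = 0x73900000

which falls inside [0x100000, 0x80000000), i.e. the lower 2G alias of
pc.ram, not the upper one.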

> 
> Obviously, we cannot assume a linear mapping over this whole range. So
> I don't think we can really use the section info to build up the IOTLB
> (though the section can be used to obtain some offset information, or
> the address space the range belongs to); we can only use plen.  Hence
> we need to fix the current code.
> 
> And this is exactly what I meant above at [1].  Hope this clarifies a
> bit on the issue.  Thanks,
> 
> > 
> > > > 
> > > > 
> > > > > We'll all cache miss along the way until we access 0x0. While if we
> > > > > are with page mask, we'll get:
> > > > > 
> > > > > - request 0x1fffff, cache miss, will get iotlb [0x0, 0x200000)
> > > > > - request 0x1ffffe, cache hit
> > > > > - request 0x1ffffd, cache hit
> > > > > - ...
> > > > > 
> > > > > We'll only miss at the first IO.
> > > > 
> > > > I think we should send as much info as we can.
> > > > There should be a way to find full region start and length.
> > > 
> > > Thanks,
> > > 
> > > -- 
> > > Peter Xu
> 
> -- 
> Peter Xu
>
Michael S. Tsirkin June 16, 2017, 3:33 p.m. UTC | #20
On Thu, Jun 15, 2017 at 10:31:11AM +0800, Peter Xu wrote:
> On Wed, Jun 14, 2017 at 09:34:52PM +0300, Michael S. Tsirkin wrote:
> > On Mon, Jun 12, 2017 at 12:04:58PM +0800, Peter Xu wrote:
> > > On Mon, Jun 12, 2017 at 06:07:04AM +0300, Michael S. Tsirkin wrote:
> > > > On Mon, Jun 12, 2017 at 10:34:43AM +0800, Peter Xu wrote:
> > > > > On Sun, Jun 11, 2017 at 08:10:15PM +0800, David Gibson wrote:
> > > > > > On Sun, Jun 11, 2017 at 01:09:26PM +0300, Michael S. Tsirkin wrote:
> > > > > > > On Fri, Jun 09, 2017 at 09:58:47AM +0800, Peter Xu wrote:
> > > > > > > > > > The problem is that when I was fixing the problem that vhost had with
> > > > > > > > > > PT (a764040, "exec: abstract address_space_do_translate()"), I did
> > > > > > > > > > break the IOTLB translation a bit (it was using page masks). IMHO we
> > > > > > > > > > need to fix it first for correctness (patch 1/2).
> > > > > > > > > > 
> > > > > > > > > > For patch 3, if we can have Jason's patch to allow dynamic
> > > > > > > > > > iommu_platform switching, that'll be the best, then I can rewrite
> > > > > > > > > > patch 3 with the switching logic rather than caching anything. But
> > > > > > > > > > IMHO that can be separated from patch 1/2 if you like.
> > > > > > > > > > 
> > > > > > > > > > Or do you have better suggestion on how should we fix it?
> > > > > > > > > > 
> > > > > > > > > > Thanks,
> > > > > > > > > 
> > > > > > > > > Can we drop masks completely and replace with length? I think we
> > > > > > > > > should do that instead of trying to fix masks.
> > > > > > > > 
> > > > > > > > Do you mean to modify IOMMUTLBEntry.addr_mask into length?
> > > > > > > 
> > > > > > > I think it's better than alternatives.
> > > > > > > 
> > > > > > > > Again, I am not sure this is good... At least we need to get ack from
> > > > > > > > David since spapr should be the initial user of it, and possibly also
> > > > > > > > Alex since vfio should be assuming that (IIUC both in QEMU and kernel)
> > > > > > > > addr_mask is a page mask rather than an arbitrary length.
> > > > > > > > 
> > > > > > > > (CC Alex)
> > > > > > > > 
> > > > > > > > Thanks,
> > > > > > > 
> > > > > > > Callbacks that need powers of two can easily split up the range.
> > > > > > 
> > > > > > I think I missed part of the thread.  What's the original use case for
> > > > > > non-power-of-two IOTLB entries?  It certainly won't happen on Power.
> > > > > 
> > > > > Currently address_space_get_iotlb_entry() didn't really follow the
> > > > > rule; addr_mask can be an arbitrary length. This series tried to fix it,
> > > > > while Michael was questioning whether we should really fix that
> > > > > at all.
> > > > > 
> > > > > Michael,
> > > > > 
> > > > > Even just for performance's sake, I still think we should fix it.
> > > > > Let's consider the simplest worst case: we have a single page mapped
> > > > > with this IOVA range (a 2M page):
> > > > > 
> > > > >  [0x0, 0x200000)
> > > > > 
> > > > > And if the guest accesses IOVA using the following pattern:
> > > > > 
> > > > >  0x1fffff, 0x1ffffe, 0x1ffffd, ...
> > > > > 
> > > > > Then now we'll get this:
> > > > > 
> > > > > - request 0x1fffff, cache miss, will get iotlb [0x1fffff, 0x200000)
> > > > > - request 0x1ffffe, cache miss, will get iotlb [0x1ffffe, 0x200000)
> > > > > - request 0x1ffffd, cache miss, will get iotlb [0x1ffffd, 0x200000)
> > > > > - ...
> > > > 
> > > > We pass an offset too, do we not? So callee can figure out
> > > > the region starts at 0x0 and avoid 2nd and 3rd misses.
> > > 
> > > Here when you say "offset", do you mean the offset in
> > > MemoryRegionSection?
> > > 
> > > In address_space_get_iotlb_entry() we have this:
> > > 
> > >     section = address_space_do_translate(as, addr, &xlat, &plen,
> > >                                          is_write, false);
> > > 
> > > One thing to mention is that, imho we cannot really assume the xlat is
> > > valid on the whole "section" range - the section can be a huge GPA
> > > range, while the xlat may only be valid on a single 4K page. The only
> > > safe region we can use here is (xlat, xlat+plen). Outside that, we
> > > should know nothing valid.
> 
> [1]
> 
> > > 
> > > Please correct me if I didn't really catch the point..
> > 
> > IIUC section is the translation result. If so all of it is valid,
> > not just one page.
> 
> It should be only one page (I think that depends on how we implemented
> the translation logic). Please check this (gdb session on current
> master):
> 
> (gdb) b address_space_get_iotlb_entry 
> Breakpoint 2 at 0x55555576803a: file /root/git/qemu/exec.c, line 512.
> ... (until break)
> (gdb) bt
> #0  0x000055555576803a in address_space_get_iotlb_entry (as=0x55555841bbb0, addr=4294959104, is_write=false) at /root/git/qemu/exec.c:512
> #1  0x00005555558322dd in vhost_device_iotlb_miss (dev=0x5555568b03a0, iova=4294959104, write=0) at /root/git/qemu/hw/virtio/vhost.c:982
> #2  0x0000555555834ad9 in vhost_backend_handle_iotlb_msg (dev=0x5555568b03a0, imsg=0x7fffffffd428) at /root/git/qemu/hw/virtio/vhost-backend.c:334
> #3  0x00005555558347be in vhost_kernel_iotlb_read (opaque=0x5555568b03a0) at /root/git/qemu/hw/virtio/vhost-backend.c:204
> #4  0x0000555555c7e7c3 in aio_dispatch_handlers (ctx=0x5555568091b0) at /root/git/qemu/util/aio-posix.c:399
> #5  0x0000555555c7e956 in aio_dispatch (ctx=0x5555568091b0) at /root/git/qemu/util/aio-posix.c:430
> #6  0x0000555555c7a722 in aio_ctx_dispatch (source=0x5555568091b0, callback=0x0, user_data=0x0) at /root/git/qemu/util/async.c:261
> #7  0x00007f98c62f3703 in g_main_context_dispatch () at /lib64/libglib-2.0.so.0
> #8  0x0000555555c7d36c in glib_pollfds_poll () at /root/git/qemu/util/main-loop.c:213
> #9  0x0000555555c7d468 in os_host_main_loop_wait (timeout=29311167) at /root/git/qemu/util/main-loop.c:261
> #10 0x0000555555c7d521 in main_loop_wait (nonblocking=0) at /root/git/qemu/util/main-loop.c:517
> #11 0x00005555558fe234 in main_loop () at /root/git/qemu/vl.c:1918
> #12 0x0000555555905fe8 in main (argc=21, argv=0x7fffffffda48, envp=0x7fffffffdaf8) at /root/git/qemu/vl.c:4752
> (gdb) s
> 517         plen = (hwaddr)-1;
> (gdb)
> 520         section = address_space_do_translate(as, addr, &xlat, &plen,
> (gdb) n
> 524         if (section.mr == &io_mem_unassigned) {
> (gdb) p/x xlat
> $2 = 0x73900000
> (gdb) p/x addr
> $3 = 0xffffe000
> (gdb) p/x plen
> $4 = 0x1000
> (gdb) p section
> $5 = {mr = 0x5555569ad790, address_space = 0x5555562f9b40 <address_space_memory>, offset_within_region = 1048576, size = 0x0000000000000000000000007ff00000, 
>   offset_within_address_space = 1048576, readonly = false}
> 
> Here $2-$4 show that we are translating IOVA 0xffffe000 into the range
> [0x73900000, 0x73901000), while the section has size 0x7ff00000, i.e.
> it ranges over [0x100000000, 0x17ff00000).  That's the upper RAM that
> the guest has, corresponding to what we can see in "info mtree -f"
> (the guest has 4G RAM; this is the upper 2G):
> 
>   0000000100000000-000000017fffffff (prio 0, ram): pc.ram @0000000080000000
> 
> Obviously, we cannot assume a linear mapping over this whole range. So
> I don't think we can really use the section info to build up the IOTLB
> (though the section can be used to obtain some offset information, or
> the address space the range belongs to); we can only use plen.  Hence
> we need to fix the current code.
> 
> And this is exactly what I meant above at [1].  Hope this clarifies a
> bit on the issue.  Thanks,

I agree we need to take the IOMMU PTE into account.  What I was saying
is that e.g. with a huge PTE we can figure out how it overlaps with the
section and pass that exact info.  Whatever is valid in both the PTE
and the section is always safe to access, but might not be a power of two.
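
A rough sketch of that idea (all names here are invented for
illustration, and xlat is assumed to already be the offset of the
translated address within the section):

    /* Hypothetical sketch, not existing QEMU code. */
    hwaddr off_in_pte = iova & pte_mask;          /* position inside the IOMMU PTE   */
    hwaddr back = MIN(off_in_pte, xlat);          /* how far we can extend backwards */
    hwaddr fwd  = MIN(pte_size - off_in_pte,      /* how far we can extend forwards  */
                      section_size - xlat);
    /*
     * [iova - back, iova + fwd) is valid under both the PTE and the
     * section, but back + fwd is not necessarily a power of two -- hence
     * the suggestion to let IOMMUTLBEntry carry a length instead of a mask.
     */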

> > 
> > > > 
> > > > 
> > > > > We'll keep missing the cache all the way until we access 0x0. While if we
> > > > > are with page mask, we'll get:
> > > > > 
> > > > > - request 0x1fffff, cache miss, will get iotlb [0x0, 0x200000)
> > > > > - request 0x1ffffe, cache hit
> > > > > - request 0x1ffffd, cache hit
> > > > > - ...
> > > > > 
> > > > > We'll only miss at the first IO.
> > > > 
> > > > I think we should send as much info as we can.
> > > > There should be a way to find full region start and length.
> > > 
> > > Thanks,
> > > 
> > > -- 
> > > Peter Xu
> 
> -- 
> Peter Xu
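
(Side note on the "callbacks that need powers of two can easily split up
the range" point quoted above: below is a minimal sketch of such a split
using QEMU's pow2floor()/ctz64() helpers; the helper itself and its emit
callback are made up for illustration, not existing code.)

    /* Assumes qemu/osdep.h and qemu/host-utils.h for hwaddr, MIN, ctz64, pow2floor. */
    static void split_mapping(hwaddr iova, hwaddr pa, hwaddr len,
                              void (*emit)(hwaddr iova, hwaddr pa, hwaddr mask))
    {
        while (len) {
            /* Largest naturally aligned power-of-two block at this position. */
            hwaddr align = (iova | pa) ? 1ULL << ctz64(iova | pa) : pow2floor(len);
            hwaddr step  = MIN(pow2floor(len), align);

            emit(iova, pa, step - 1);      /* step - 1 is a valid addr_mask */
            iova += step;
            pa   += step;
            len  -= step;
        }
    }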
diff mbox

Patch

diff --git a/exec.c b/exec.c
index 63a3ff0..1f86253 100644
--- a/exec.c
+++ b/exec.c
@@ -544,14 +544,14 @@  IOMMUTLBEntry address_space_get_iotlb_entry(AddressSpace *as, hwaddr addr,
                                             bool is_write)
 {
     MemoryRegionSection section;
-    hwaddr xlat, plen;
+    hwaddr xlat, page_mask;
 
-    /* Try to get maximum page mask during translation. */
-    plen = (hwaddr)-1;
-
-    /* This can never be MMIO. */
-    section = address_space_do_translate(as, addr, &xlat, &plen,
-                                         NULL, is_write, false);
+    /*
+     * This can never be MMIO, and we don't really care about plen,
+     * but page mask.
+     */
+    section = address_space_do_translate(as, addr, &xlat, NULL,
+                                         &page_mask, is_write, false);
 
     /* Illegal translation */
     if (section.mr == &io_mem_unassigned) {
@@ -562,20 +562,11 @@  IOMMUTLBEntry address_space_get_iotlb_entry(AddressSpace *as, hwaddr addr,
     xlat += section.offset_within_address_space -
         section.offset_within_region;
 
-    if (plen == (hwaddr)-1) {
-        /* If not specified during translation, use default mask */
-        plen = TARGET_PAGE_MASK;
-    } else {
-        /* Make it a valid page mask */
-        assert(plen);
-        plen = pow2floor(plen) - 1;
-    }
-
     return (IOMMUTLBEntry) {
         .target_as = section.address_space,
-        .iova = addr & ~plen,
-        .translated_addr = xlat & ~plen,
-        .addr_mask = plen,
+        .iova = addr & ~page_mask,
+        .translated_addr = xlat & ~page_mask,
+        .addr_mask = page_mask,
         /* IOTLBs are for DMAs, and DMA only allows on RAMs. */
         .perm = IOMMU_RW,
     };
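
For the 2M example discussed in the thread, and assuming the vIOMMU
reports page_mask = 0x1fffff for that mapping, the patched code above
now returns (illustrative values only):

    .iova            = 0x1fffff & ~0x1fffff   /* = 0x0                          */
    .translated_addr = xlat     & ~0x1fffff   /* 2M-aligned translated address  */
    .addr_mask       = 0x1fffff               /* the entry covers the whole 2M  */

so subsequent accesses to 0x1ffffe, 0x1ffffd, ... hit the cached entry in
the vhost device IOTLB instead of missing one by one.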