Message ID | f21a7b6a8f141b87f75687904a76f3728ea639a8.1523304324.git.digetx@gmail.com |
---|---|
State | Deferred |
Headers | show |
Series | Tegra GART fixes and improvements | expand |
On Mon, Apr 09, 2018 at 11:07:22PM +0300, Dmitry Osipenko wrote: > Currently GART writes one page entry at a time. More optimal would be to > aggregate the writes and flush BUS buffer in the end, this gives map/unmap > 10-40% (depending on size of mapping) performance boost compared to a > flushing after each entry update. > > Signed-off-by: Dmitry Osipenko <digetx@gmail.com> > --- > drivers/iommu/tegra-gart.c | 63 +++++++++++++++++++++++++++++++++++----------- > 1 file changed, 48 insertions(+), 15 deletions(-) > > diff --git a/drivers/iommu/tegra-gart.c b/drivers/iommu/tegra-gart.c > index 4a0607669d34..9f59f5f17661 100644 > --- a/drivers/iommu/tegra-gart.c > +++ b/drivers/iommu/tegra-gart.c > @@ -36,7 +36,7 @@ > #define GART_APERTURE_SIZE SZ_32M > > /* bitmap of the page sizes currently supported */ > -#define GART_IOMMU_PGSIZES (SZ_4K) > +#define GART_IOMMU_PGSIZES GENMASK(24, 12) That doesn't look right. The GART really only supports 4 KiB pages. You seem to be "emulating" more page sizes here in order to improve mapping performance. That seems wrong to me. I'm wondering if this couldn't be improved by a similar factor by simply moving the flushing into an implementation of ->iotlb_sync(). That said, it seems like ->iotlb_sync() is only used for unmapping, but I don't see a reason why iommu_map() wouldn't need to call it as well after going through several calls to ->map(). It seems to me like a driver that implements ->iotlb_sync() would want to use it to optimize for both the mapping and unmapping cases. Joerg, I've gone over the git log and header files and I see no mention of why the TLB flush interface isn't used for mapping. Do you recall any special reasons why the same shouldn't be applied for mapping? Would you accept any patches doing this? 
Thierry > > #define GART_REG_BASE 0x24 > #define GART_CONFIG (0x24 - GART_REG_BASE) > @@ -269,25 +269,21 @@ static void gart_iommu_domain_free(struct iommu_domain *domain) > kfree(gart_domain); > } > > -static int gart_iommu_map(struct iommu_domain *domain, unsigned long iova, > - phys_addr_t pa, size_t bytes, int prot) > +static int gart_iommu_map_page(struct gart_device *gart, > + unsigned long iova, > + phys_addr_t pa) > { > - struct gart_domain *gart_domain = to_gart_domain(domain); > - struct gart_device *gart = gart_domain->gart; > unsigned long flags; > unsigned long pfn; > unsigned long pte; > > - if (!gart_iova_range_valid(gart, iova, bytes)) > - return -EINVAL; > - > - spin_lock_irqsave(&gart->pte_lock, flags); > pfn = __phys_to_pfn(pa); > if (!pfn_valid(pfn)) { > dev_err(gart->dev, "Invalid page: %pa\n", &pa); > - spin_unlock_irqrestore(&gart->pte_lock, flags); > return -EINVAL; > } > + > + spin_lock_irqsave(&gart->pte_lock, flags); > if (gart_debug) { > pte = gart_read_pte(gart, iova); > if (pte & GART_ENTRY_PHYS_ADDR_VALID) { > @@ -297,8 +293,41 @@ static int gart_iommu_map(struct iommu_domain *domain, unsigned long iova, > } > } > gart_set_pte(gart, iova, GART_PTE(pfn)); > + spin_unlock_irqrestore(&gart->pte_lock, flags); > + > + return 0; > +} > + > +static int gart_iommu_map(struct iommu_domain *domain, unsigned long iova, > + phys_addr_t pa, size_t bytes, int prot) > +{ > + struct gart_domain *gart_domain = to_gart_domain(domain); > + struct gart_device *gart = gart_domain->gart; > + size_t mapped; > + int ret = -1; > + > + if (!gart_iova_range_valid(gart, iova, bytes)) > + return -EINVAL; > + > + for (mapped = 0; mapped < bytes; mapped += GART_PAGE_SIZE) { > + ret = gart_iommu_map_page(gart, iova + mapped, pa + mapped); > + if (ret) > + break; > + } > + > FLUSH_GART_REGS(gart); > + return ret; > +} > + > +static int gart_iommu_unmap_page(struct gart_device *gart, > + unsigned long iova) > +{ > + unsigned long flags; > + > + 
spin_lock_irqsave(&gart->pte_lock, flags); > + gart_set_pte(gart, iova, 0); > spin_unlock_irqrestore(&gart->pte_lock, flags); > + > return 0; > } > > @@ -307,16 +336,20 @@ static size_t gart_iommu_unmap(struct iommu_domain *domain, unsigned long iova, > { > struct gart_domain *gart_domain = to_gart_domain(domain); > struct gart_device *gart = gart_domain->gart; > - unsigned long flags; > + size_t unmapped; > + int ret; > > if (!gart_iova_range_valid(gart, iova, bytes)) > return 0; > > - spin_lock_irqsave(&gart->pte_lock, flags); > - gart_set_pte(gart, iova, 0); > + for (unmapped = 0; unmapped < bytes; unmapped += GART_PAGE_SIZE) { > + ret = gart_iommu_unmap_page(gart, iova + unmapped); > + if (ret) > + break; > + } > + > FLUSH_GART_REGS(gart); > - spin_unlock_irqrestore(&gart->pte_lock, flags); > - return bytes; > + return unmapped; > } > > static phys_addr_t gart_iommu_iova_to_phys(struct iommu_domain *domain, > -- > 2.16.3 >
On 27.04.2018 13:02, Thierry Reding wrote: > On Mon, Apr 09, 2018 at 11:07:22PM +0300, Dmitry Osipenko wrote: >> Currently GART writes one page entry at a time. More optimal would be to >> aggregate the writes and flush BUS buffer in the end, this gives map/unmap >> 10-40% (depending on size of mapping) performance boost compared to a >> flushing after each entry update. >> >> Signed-off-by: Dmitry Osipenko <digetx@gmail.com> >> --- >> drivers/iommu/tegra-gart.c | 63 +++++++++++++++++++++++++++++++++++----------- >> 1 file changed, 48 insertions(+), 15 deletions(-) >> >> diff --git a/drivers/iommu/tegra-gart.c b/drivers/iommu/tegra-gart.c >> index 4a0607669d34..9f59f5f17661 100644 >> --- a/drivers/iommu/tegra-gart.c >> +++ b/drivers/iommu/tegra-gart.c >> @@ -36,7 +36,7 @@ >> #define GART_APERTURE_SIZE SZ_32M >> >> /* bitmap of the page sizes currently supported */ >> -#define GART_IOMMU_PGSIZES (SZ_4K) >> +#define GART_IOMMU_PGSIZES GENMASK(24, 12) > > That doesn't look right. The GART really only supports 4 KiB pages. You > seem to be "emulating" more page sizes here in order to improve mapping > performance. That seems wrong to me. I'm wondering if this couldn't be > improved by a similar factor by simply moving the flushing into an > implementation of ->iotlb_sync(). > > That said, it seems like ->iotlb_sync() is only used for unmapping, but > I don't see a reason why iommu_map() wouldn't need to call it as well > after going through several calls to ->map(). It seems to me like a > driver that implements ->iotlb_sync() would want to use it to optimize > for both the mapping and unmapping cases. I vaguely remember looking at the IOMMU syncing, but decided to try at first the way this patch is. I'll be happy to switch to map/unmap syncing, let's see what Joerg would suggest. > Joerg, I've gone over the git log and header files and I see no mention > of why the TLB flush interface isn't used for mapping. 
Do you recall any > special reasons why the same shouldn't be applied for mapping? Would you > accept any patches doing this? -- To unsubscribe from this list: send the line "unsubscribe linux-tegra" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Hi Thierry, On 27/04/18 11:02, Thierry Reding wrote: > On Mon, Apr 09, 2018 at 11:07:22PM +0300, Dmitry Osipenko wrote: >> Currently GART writes one page entry at a time. More optimal would be to >> aggregate the writes and flush BUS buffer in the end, this gives map/unmap >> 10-40% (depending on size of mapping) performance boost compared to a >> flushing after each entry update. >> >> Signed-off-by: Dmitry Osipenko <digetx@gmail.com> >> --- >> drivers/iommu/tegra-gart.c | 63 +++++++++++++++++++++++++++++++++++----------- >> 1 file changed, 48 insertions(+), 15 deletions(-) >> >> diff --git a/drivers/iommu/tegra-gart.c b/drivers/iommu/tegra-gart.c >> index 4a0607669d34..9f59f5f17661 100644 >> --- a/drivers/iommu/tegra-gart.c >> +++ b/drivers/iommu/tegra-gart.c >> @@ -36,7 +36,7 @@ >> #define GART_APERTURE_SIZE SZ_32M >> >> /* bitmap of the page sizes currently supported */ >> -#define GART_IOMMU_PGSIZES (SZ_4K) >> +#define GART_IOMMU_PGSIZES GENMASK(24, 12) > > That doesn't look right. The GART really only supports 4 KiB pages. You > seem to be "emulating" more page sizes here in order to improve mapping > performance. That seems wrong to me. I'm wondering if this couldn't be > improved by a similar factor by simply moving the flushing into an > implementation of ->iotlb_sync(). > > That said, it seems like ->iotlb_sync() is only used for unmapping, but > I don't see a reason why iommu_map() wouldn't need to call it as well > after going through several calls to ->map(). It seems to me like a > driver that implements ->iotlb_sync() would want to use it to optimize > for both the mapping and unmapping cases. > > Joerg, I've gone over the git log and header files and I see no mention > of why the TLB flush interface isn't used for mapping. Do you recall any > special reasons why the same shouldn't be applied for mapping? Would you > accept any patches doing this? 
In general, requiring TLB maintenance when transitioning from an invalid entry to a valid one tends to be the exception rather than the norm, and I think we ended up at the consensus that it wasn't worth the complication of trying to cater for this in the generic iotlb API. To be fair, on simple hardware which doesn't implement multiple page sizes with associated walk depth/TLB pressure benefits for larger ones, there's no need for the IOMMU API (and/or the owner of the domain) to try harder to use them, so handling "compound" page sizes within the driver is a more reasonable thing to do. There is already some precedent for this in other drivers (e.g. mtk_iommu_v1). Robin. > > Thierry > >> >> #define GART_REG_BASE 0x24 >> #define GART_CONFIG (0x24 - GART_REG_BASE) >> @@ -269,25 +269,21 @@ static void gart_iommu_domain_free(struct iommu_domain *domain) >> kfree(gart_domain); >> } >> >> -static int gart_iommu_map(struct iommu_domain *domain, unsigned long iova, >> - phys_addr_t pa, size_t bytes, int prot) >> +static int gart_iommu_map_page(struct gart_device *gart, >> + unsigned long iova, >> + phys_addr_t pa) >> { >> - struct gart_domain *gart_domain = to_gart_domain(domain); >> - struct gart_device *gart = gart_domain->gart; >> unsigned long flags; >> unsigned long pfn; >> unsigned long pte; >> >> - if (!gart_iova_range_valid(gart, iova, bytes)) >> - return -EINVAL; >> - >> - spin_lock_irqsave(&gart->pte_lock, flags); >> pfn = __phys_to_pfn(pa); >> if (!pfn_valid(pfn)) { >> dev_err(gart->dev, "Invalid page: %pa\n", &pa); >> - spin_unlock_irqrestore(&gart->pte_lock, flags); >> return -EINVAL; >> } >> + >> + spin_lock_irqsave(&gart->pte_lock, flags); >> if (gart_debug) { >> pte = gart_read_pte(gart, iova); >> if (pte & GART_ENTRY_PHYS_ADDR_VALID) { >> @@ -297,8 +293,41 @@ static int gart_iommu_map(struct iommu_domain *domain, unsigned long iova, >> } >> } >> gart_set_pte(gart, iova, GART_PTE(pfn)); >> + spin_unlock_irqrestore(&gart->pte_lock, flags); >> + >> + return 
0; >> +} >> + >> +static int gart_iommu_map(struct iommu_domain *domain, unsigned long iova, >> + phys_addr_t pa, size_t bytes, int prot) >> +{ >> + struct gart_domain *gart_domain = to_gart_domain(domain); >> + struct gart_device *gart = gart_domain->gart; >> + size_t mapped; >> + int ret = -1; >> + >> + if (!gart_iova_range_valid(gart, iova, bytes)) >> + return -EINVAL; >> + >> + for (mapped = 0; mapped < bytes; mapped += GART_PAGE_SIZE) { >> + ret = gart_iommu_map_page(gart, iova + mapped, pa + mapped); >> + if (ret) >> + break; >> + } >> + >> FLUSH_GART_REGS(gart); >> + return ret; >> +} >> + >> +static int gart_iommu_unmap_page(struct gart_device *gart, >> + unsigned long iova) >> +{ >> + unsigned long flags; >> + >> + spin_lock_irqsave(&gart->pte_lock, flags); >> + gart_set_pte(gart, iova, 0); >> spin_unlock_irqrestore(&gart->pte_lock, flags); >> + >> return 0; >> } >> >> @@ -307,16 +336,20 @@ static size_t gart_iommu_unmap(struct iommu_domain *domain, unsigned long iova, >> { >> struct gart_domain *gart_domain = to_gart_domain(domain); >> struct gart_device *gart = gart_domain->gart; >> - unsigned long flags; >> + size_t unmapped; >> + int ret; >> >> if (!gart_iova_range_valid(gart, iova, bytes)) >> return 0; >> >> - spin_lock_irqsave(&gart->pte_lock, flags); >> - gart_set_pte(gart, iova, 0); >> + for (unmapped = 0; unmapped < bytes; unmapped += GART_PAGE_SIZE) { >> + ret = gart_iommu_unmap_page(gart, iova + unmapped); >> + if (ret) >> + break; >> + } >> + >> FLUSH_GART_REGS(gart); >> - spin_unlock_irqrestore(&gart->pte_lock, flags); >> - return bytes; >> + return unmapped; >> } >> >> static phys_addr_t gart_iommu_iova_to_phys(struct iommu_domain *domain, >> -- >> 2.16.3 >> >> >> >> _______________________________________________ >> iommu mailing list >> iommu@lists.linux-foundation.org >> https://lists.linuxfoundation.org/mailman/listinfo/iommu -- To unsubscribe from this list: send the line "unsubscribe linux-tegra" in the body of a message to 
majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 27.04.2018 15:36, Robin Murphy wrote: > Hi Thierry, > > On 27/04/18 11:02, Thierry Reding wrote: >> On Mon, Apr 09, 2018 at 11:07:22PM +0300, Dmitry Osipenko wrote: >>> Currently GART writes one page entry at a time. More optimal would be to >>> aggregate the writes and flush BUS buffer in the end, this gives map/unmap >>> 10-40% (depending on size of mapping) performance boost compared to a >>> flushing after each entry update. >>> >>> Signed-off-by: Dmitry Osipenko <digetx@gmail.com> >>> --- >>> drivers/iommu/tegra-gart.c | 63 +++++++++++++++++++++++++++++++++++----------- >>> 1 file changed, 48 insertions(+), 15 deletions(-) >>> >>> diff --git a/drivers/iommu/tegra-gart.c b/drivers/iommu/tegra-gart.c >>> index 4a0607669d34..9f59f5f17661 100644 >>> --- a/drivers/iommu/tegra-gart.c >>> +++ b/drivers/iommu/tegra-gart.c >>> @@ -36,7 +36,7 @@ >>> #define GART_APERTURE_SIZE SZ_32M >>> /* bitmap of the page sizes currently supported */ >>> -#define GART_IOMMU_PGSIZES (SZ_4K) >>> +#define GART_IOMMU_PGSIZES GENMASK(24, 12) >> >> That doesn't look right. The GART really only supports 4 KiB pages. You >> seem to be "emulating" more page sizes here in order to improve mapping >> performance. That seems wrong to me. I'm wondering if this couldn't be >> improved by a similar factor by simply moving the flushing into an >> implementation of ->iotlb_sync(). >> >> That said, it seems like ->iotlb_sync() is only used for unmapping, but >> I don't see a reason why iommu_map() wouldn't need to call it as well >> after going through several calls to ->map(). It seems to me like a >> driver that implements ->iotlb_sync() would want to use it to optimize >> for both the mapping and unmapping cases. >> >> Joerg, I've gone over the git log and header files and I see no mention >> of why the TLB flush interface isn't used for mapping. Do you recall any >> special reasons why the same shouldn't be applied for mapping? Would you >> accept any patches doing this? 
> > In general, requiring TLB maintenance when transitioning from an invalid entry > to a valid one tends to be the exception rather than the norm, and I think we > ended up at the consensus that it wasn't worth the complication of trying to > cater for this in the generic iotlb API. > > To be fair, on simple hardware which doesn't implement multiple page sizes with > associated walk depth/TLB pressure benefits for larger ones, there's no need for > the IOMMU API (and/or the owner of the domain) to try harder to use them, so > handling "compound" page sizes within the driver is a more reasonable thing to > do. There is already some precedent for this in other drivers (e.g. mtk_iommu_v1). Probably the best variant would be to give an explicit control over syncing to a user of the IOMMU API, like for example device driver may perform multiple mappings / unmappings and then sync/flush in the end. I'm not sure that it's really worth the hassle to shuffle the API right now, maybe we can implement it later if needed. Joerg, do you have objections to a 'compound page' approach? -- To unsubscribe from this list: send the line "unsubscribe linux-tegra" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Fri, Apr 27, 2018 at 12:02:02PM +0200, Thierry Reding wrote: > Joerg, I've gone over the git log and header files and I see no mention > of why the TLB flush interface isn't used for mapping. Do you recall any > special reasons why the same shouldn't be applied for mapping? Would you > accept any patches doing this? Yeah, the reason was that we assumed that only emulated IOMMUs ever need flushing in the mapping path, and there is no reason to optimize for that. Especially when a call to iotlb_sync() in the mapping path would hurt most other users. Does the tegra-gart actually need it too? Joerg -- To unsubscribe from this list: send the line "unsubscribe linux-tegra" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Mon, May 07, 2018 at 12:19:01AM +0300, Dmitry Osipenko wrote: > Probably the best variant would be to give an explicit control over syncing to a > user of the IOMMU API, like for example device driver may perform multiple > mappings / unmappings and then sync/flush in the end. I'm not sure that it's > really worth the hassle to shuffle the API right now, maybe we can implement it > later if needed. Joerg, do you have objections to a 'compound page' approach? Have you measured the performance difference on both variants? The compound-page approach only works for cases when the physical memory you map is contiguous and correctly aligned. If it is really needed I would prefer a separate iotlb_sync_map() call-back that is just NULL when not needed. This way all users that don't need it only get a minimal penalty in the mapping path and you don't have any requirements on the physical memory you map to get good performance. Joerg -- To unsubscribe from this list: send the line "unsubscribe linux-tegra" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 07.05.2018 10:59, Joerg Roedel wrote: > On Fri, Apr 27, 2018 at 12:02:02PM +0200, Thierry Reding wrote: >> Joerg, I've gone over the git log and header files and I see no mention >> of why the TLB flush interface isn't used for mapping. Do you recall any >> special reasons why the same shouldn't be applied for mapping? Would you >> accept any patches doing this? > > Yeah, the reason was that we assumed that only emulated IOMMUs ever need > flushing in the mapping path, and there is no reason to optimize for > that. Especially when a call to iotlb_sync() in the mapping path would > hurt most other users. > > Does the tegra-gart actually need it too? Tegra-GART is exactly an emulated IOMMU, it doesn't have anything like TLB and it's simply a remapping table. What is actually needed for the GART is to make sure that the remapping table modifications reach GART HW before anything tries to touch the modified page entries. Presumably there is some kind of HW buffer that accumulates multiple register writes and issues them in bursts to improve performance, read-after-write is the way to flush that buffer. -- To unsubscribe from this list: send the line "unsubscribe linux-tegra" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 07.05.2018 11:04, Joerg Roedel wrote: > On Mon, May 07, 2018 at 12:19:01AM +0300, Dmitry Osipenko wrote: >> Probably the best variant would be to give an explicit control over syncing to a >> user of the IOMMU API, like for example device driver may perform multiple >> mappings / unmappings and then sync/flush in the end. I'm not sure that it's >> really worth the hassle to shuffle the API right now, maybe we can implement it >> later if needed. Joerg, do you have objections to a 'compound page' approach? > > Have you measured the performance difference on both variants? The > compound-page approach only works for cases when the physical memory you > map contiguous and correctly aligned. Yes, previously I actually only tested mapping of the contiguous allocations (used for memory isolation purposes). But now I've re-tested all variants and got somewhat interesting results. Firstly it is not that easy to test a really sparse mapping simply because memory allocator produces sparse allocation only when memory is _really_ fragmented. Pretty much all of the time the sparse allocations are contiguous or they consist of a very few chunks that do not impose any noticeable performance impact. Secondly, the interesting part is that mapping / unmapping of a contiguous allocation (CMA using DMA API) is slower by ~50% than doing it for a sparse allocation (get_pages using bare IOMMU API). /I think/ it's a shortcoming of the arch/arm/mm/dma-mapping.c, which also suffers from other inflexibilities that Thierry faced recently. Though I haven't really tried to figure out what is the bottleneck yet and Thierry was going to re-write ARM's dma-mapping implementation anyway, I'll take a closer look at this issue a bit later. I've implemented the iotlb_sync_map() and tested things with it. The end result is the same as for the compound page approach, simply because actual allocations are pretty much always contiguous. 
> If it is really needed I would prefer a separate iotlb_sync_map() > call-back that is just NULL when not needed. This way all users that > don't need it only get a minimal penalty in the mapping path and you > don't have any requirements on the physical memory you map to get good > performance. Summarizing, the iotlb_sync_map() is indeed the better way. As you rightly noticed, that approach is also optimal for the non-contiguous cases as we won't have to flush on mapping of each contiguous chunk of the sparse allocation, but after the whole mapping is done. Thierry, Robin and Joerg - thanks for your input, I'll prepare patches implementing the iotlb_sync_map. -- To unsubscribe from this list: send the line "unsubscribe linux-tegra" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 07.05.2018 18:51, Dmitry Osipenko wrote: [snip] > Secondly, the interesting part is that mapping / unmapping of a contiguous > allocation (CMA using DMA API) is slower by ~50% then doing it for a sparse > allocation (get_pages using bare IOMMU API). /I think/ it's a shortcoming of the > arch/arm/mm/dma-mapping.c, which also suffers from other inflexibilities that > Thierry faced recently. Though I haven't really tried to figure out what is the > bottleneck yet and Thierry was going to re-write ARM's dma-mapping > implementation anyway, I'll take a closer look at this issue a bit later. Please scratch my accusation of ARM's dma-mapping, it's not the culprit at all. I completely forgot that in a case of sparse allocation displays framebuffer IOMMU mapping is "pinned" to the GART and hence it's not getting dynamically mapped / unmapped during my testing. I also forgot to set CPU freq governor to "performance", that reduced 50% to 20% of the above perf difference. The rest of the testing is unaffected, flushing after whole mapping is still much more efficient than flushing after modification of each page entry. And yet again, performance of sparse mapping is nearly the same as of contiguous mapping unless sparse allocation is large and _very_ fragmented. -- To unsubscribe from this list: send the line "unsubscribe linux-tegra" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
diff --git a/drivers/iommu/tegra-gart.c b/drivers/iommu/tegra-gart.c index 4a0607669d34..9f59f5f17661 100644 --- a/drivers/iommu/tegra-gart.c +++ b/drivers/iommu/tegra-gart.c @@ -36,7 +36,7 @@ #define GART_APERTURE_SIZE SZ_32M /* bitmap of the page sizes currently supported */ -#define GART_IOMMU_PGSIZES (SZ_4K) +#define GART_IOMMU_PGSIZES GENMASK(24, 12) #define GART_REG_BASE 0x24 #define GART_CONFIG (0x24 - GART_REG_BASE) @@ -269,25 +269,21 @@ static void gart_iommu_domain_free(struct iommu_domain *domain) kfree(gart_domain); } -static int gart_iommu_map(struct iommu_domain *domain, unsigned long iova, - phys_addr_t pa, size_t bytes, int prot) +static int gart_iommu_map_page(struct gart_device *gart, + unsigned long iova, + phys_addr_t pa) { - struct gart_domain *gart_domain = to_gart_domain(domain); - struct gart_device *gart = gart_domain->gart; unsigned long flags; unsigned long pfn; unsigned long pte; - if (!gart_iova_range_valid(gart, iova, bytes)) - return -EINVAL; - - spin_lock_irqsave(&gart->pte_lock, flags); pfn = __phys_to_pfn(pa); if (!pfn_valid(pfn)) { dev_err(gart->dev, "Invalid page: %pa\n", &pa); - spin_unlock_irqrestore(&gart->pte_lock, flags); return -EINVAL; } + + spin_lock_irqsave(&gart->pte_lock, flags); if (gart_debug) { pte = gart_read_pte(gart, iova); if (pte & GART_ENTRY_PHYS_ADDR_VALID) { @@ -297,8 +293,41 @@ static int gart_iommu_map(struct iommu_domain *domain, unsigned long iova, } } gart_set_pte(gart, iova, GART_PTE(pfn)); + spin_unlock_irqrestore(&gart->pte_lock, flags); + + return 0; +} + +static int gart_iommu_map(struct iommu_domain *domain, unsigned long iova, + phys_addr_t pa, size_t bytes, int prot) +{ + struct gart_domain *gart_domain = to_gart_domain(domain); + struct gart_device *gart = gart_domain->gart; + size_t mapped; + int ret = -1; + + if (!gart_iova_range_valid(gart, iova, bytes)) + return -EINVAL; + + for (mapped = 0; mapped < bytes; mapped += GART_PAGE_SIZE) { + ret = gart_iommu_map_page(gart, iova + mapped, pa + 
mapped); + if (ret) + break; + } + FLUSH_GART_REGS(gart); + return ret; +} + +static int gart_iommu_unmap_page(struct gart_device *gart, + unsigned long iova) +{ + unsigned long flags; + + spin_lock_irqsave(&gart->pte_lock, flags); + gart_set_pte(gart, iova, 0); spin_unlock_irqrestore(&gart->pte_lock, flags); + return 0; } @@ -307,16 +336,20 @@ static size_t gart_iommu_unmap(struct iommu_domain *domain, unsigned long iova, { struct gart_domain *gart_domain = to_gart_domain(domain); struct gart_device *gart = gart_domain->gart; - unsigned long flags; + size_t unmapped; + int ret; if (!gart_iova_range_valid(gart, iova, bytes)) return 0; - spin_lock_irqsave(&gart->pte_lock, flags); - gart_set_pte(gart, iova, 0); + for (unmapped = 0; unmapped < bytes; unmapped += GART_PAGE_SIZE) { + ret = gart_iommu_unmap_page(gart, iova + unmapped); + if (ret) + break; + } + FLUSH_GART_REGS(gart); - spin_unlock_irqrestore(&gart->pte_lock, flags); - return bytes; + return unmapped; } static phys_addr_t gart_iommu_iova_to_phys(struct iommu_domain *domain,
Currently GART writes one page entry at a time. More optimal would be to aggregate the writes and flush BUS buffer in the end, this gives map/unmap 10-40% (depending on size of mapping) performance boost compared to a flushing after each entry update. Signed-off-by: Dmitry Osipenko <digetx@gmail.com> --- drivers/iommu/tegra-gart.c | 63 +++++++++++++++++++++++++++++++++++----------- 1 file changed, 48 insertions(+), 15 deletions(-)