[v1,4/4] iommu/tegra: gart: Optimize map/unmap

Message ID f21a7b6a8f141b87f75687904a76f3728ea639a8.1523304324.git.digetx@gmail.com
State New
Headers show
Series
  • Tegra GART fixes and improvements
Related show

Commit Message

Dmitry Osipenko April 9, 2018, 8:07 p.m.
Currently GART writes one page entry at a time. More optimal would be to
aggregate the writes and flush BUS buffer in the end, this gives map/unmap
10-40% (depending on size of mapping) performance boost compared to a
flushing after each entry update.

Signed-off-by: Dmitry Osipenko <digetx@gmail.com>
---
 drivers/iommu/tegra-gart.c | 63 +++++++++++++++++++++++++++++++++++-----------
 1 file changed, 48 insertions(+), 15 deletions(-)

Comments

Thierry Reding April 27, 2018, 10:02 a.m. | #1
On Mon, Apr 09, 2018 at 11:07:22PM +0300, Dmitry Osipenko wrote:
> Currently GART writes one page entry at a time. More optimal would be to
> aggregate the writes and flush BUS buffer in the end, this gives map/unmap
> 10-40% (depending on size of mapping) performance boost compared to a
> flushing after each entry update.
> 
> Signed-off-by: Dmitry Osipenko <digetx@gmail.com>
> ---
>  drivers/iommu/tegra-gart.c | 63 +++++++++++++++++++++++++++++++++++-----------
>  1 file changed, 48 insertions(+), 15 deletions(-)
> 
> diff --git a/drivers/iommu/tegra-gart.c b/drivers/iommu/tegra-gart.c
> index 4a0607669d34..9f59f5f17661 100644
> --- a/drivers/iommu/tegra-gart.c
> +++ b/drivers/iommu/tegra-gart.c
> @@ -36,7 +36,7 @@
>  #define GART_APERTURE_SIZE	SZ_32M
>  
>  /* bitmap of the page sizes currently supported */
> -#define GART_IOMMU_PGSIZES	(SZ_4K)
> +#define GART_IOMMU_PGSIZES	GENMASK(24, 12)

That doesn't look right. The GART really only supports 4 KiB pages. You
seem to be "emulating" more page sizes here in order to improve mapping
performance. That seems wrong to me. I'm wondering if this couldn't be
improved by a similar factor by simply moving the flushing into an
implementation of ->iotlb_sync().

That said, it seems like ->iotlb_sync() is only used for unmapping, but
I don't see a reason why iommu_map() wouldn't need to call it as well
after going through several calls to ->map(). It seems to me like a
driver that implements ->iotlb_sync() would want to use it to optimize
for both the mapping and unmapping cases.

Joerg, I've gone over the git log and header files and I see no mention
of why the TLB flush interface isn't used for mapping. Do you recall any
special reasons why the same shouldn't be applied for mapping? Would you
accept any patches doing this?

Thierry

>  
>  #define GART_REG_BASE		0x24
>  #define GART_CONFIG		(0x24 - GART_REG_BASE)
> @@ -269,25 +269,21 @@ static void gart_iommu_domain_free(struct iommu_domain *domain)
>  	kfree(gart_domain);
>  }
>  
> -static int gart_iommu_map(struct iommu_domain *domain, unsigned long iova,
> -			  phys_addr_t pa, size_t bytes, int prot)
> +static int gart_iommu_map_page(struct gart_device *gart,
> +			       unsigned long iova,
> +			       phys_addr_t pa)
>  {
> -	struct gart_domain *gart_domain = to_gart_domain(domain);
> -	struct gart_device *gart = gart_domain->gart;
>  	unsigned long flags;
>  	unsigned long pfn;
>  	unsigned long pte;
>  
> -	if (!gart_iova_range_valid(gart, iova, bytes))
> -		return -EINVAL;
> -
> -	spin_lock_irqsave(&gart->pte_lock, flags);
>  	pfn = __phys_to_pfn(pa);
>  	if (!pfn_valid(pfn)) {
>  		dev_err(gart->dev, "Invalid page: %pa\n", &pa);
> -		spin_unlock_irqrestore(&gart->pte_lock, flags);
>  		return -EINVAL;
>  	}
> +
> +	spin_lock_irqsave(&gart->pte_lock, flags);
>  	if (gart_debug) {
>  		pte = gart_read_pte(gart, iova);
>  		if (pte & GART_ENTRY_PHYS_ADDR_VALID) {
> @@ -297,8 +293,41 @@ static int gart_iommu_map(struct iommu_domain *domain, unsigned long iova,
>  		}
>  	}
>  	gart_set_pte(gart, iova, GART_PTE(pfn));
> +	spin_unlock_irqrestore(&gart->pte_lock, flags);
> +
> +	return 0;
> +}
> +
> +static int gart_iommu_map(struct iommu_domain *domain, unsigned long iova,
> +			  phys_addr_t pa, size_t bytes, int prot)
> +{
> +	struct gart_domain *gart_domain = to_gart_domain(domain);
> +	struct gart_device *gart = gart_domain->gart;
> +	size_t mapped;
> +	int ret = -1;
> +
> +	if (!gart_iova_range_valid(gart, iova, bytes))
> +		return -EINVAL;
> +
> +	for (mapped = 0; mapped < bytes; mapped += GART_PAGE_SIZE) {
> +		ret = gart_iommu_map_page(gart, iova + mapped, pa + mapped);
> +		if (ret)
> +			break;
> +	}
> +
>  	FLUSH_GART_REGS(gart);
> +	return ret;
> +}
> +
> +static int gart_iommu_unmap_page(struct gart_device *gart,
> +				 unsigned long iova)
> +{
> +	unsigned long flags;
> +
> +	spin_lock_irqsave(&gart->pte_lock, flags);
> +	gart_set_pte(gart, iova, 0);
>  	spin_unlock_irqrestore(&gart->pte_lock, flags);
> +
>  	return 0;
>  }
>  
> @@ -307,16 +336,20 @@ static size_t gart_iommu_unmap(struct iommu_domain *domain, unsigned long iova,
>  {
>  	struct gart_domain *gart_domain = to_gart_domain(domain);
>  	struct gart_device *gart = gart_domain->gart;
> -	unsigned long flags;
> +	size_t unmapped;
> +	int ret;
>  
>  	if (!gart_iova_range_valid(gart, iova, bytes))
>  		return 0;
>  
> -	spin_lock_irqsave(&gart->pte_lock, flags);
> -	gart_set_pte(gart, iova, 0);
> +	for (unmapped = 0; unmapped < bytes; unmapped += GART_PAGE_SIZE) {
> +		ret = gart_iommu_unmap_page(gart, iova + unmapped);
> +		if (ret)
> +			break;
> +	}
> +
>  	FLUSH_GART_REGS(gart);
> -	spin_unlock_irqrestore(&gart->pte_lock, flags);
> -	return bytes;
> +	return unmapped;
>  }
>  
>  static phys_addr_t gart_iommu_iova_to_phys(struct iommu_domain *domain,
> -- 
> 2.16.3
>
Dmitry Osipenko April 27, 2018, 12:01 p.m. | #2
On 27.04.2018 13:02, Thierry Reding wrote:
> On Mon, Apr 09, 2018 at 11:07:22PM +0300, Dmitry Osipenko wrote:
>> Currently GART writes one page entry at a time. More optimal would be to
>> aggregate the writes and flush BUS buffer in the end, this gives map/unmap
>> 10-40% (depending on size of mapping) performance boost compared to a
>> flushing after each entry update.
>>
>> Signed-off-by: Dmitry Osipenko <digetx@gmail.com>
>> ---
>>  drivers/iommu/tegra-gart.c | 63 +++++++++++++++++++++++++++++++++++-----------
>>  1 file changed, 48 insertions(+), 15 deletions(-)
>>
>> diff --git a/drivers/iommu/tegra-gart.c b/drivers/iommu/tegra-gart.c
>> index 4a0607669d34..9f59f5f17661 100644
>> --- a/drivers/iommu/tegra-gart.c
>> +++ b/drivers/iommu/tegra-gart.c
>> @@ -36,7 +36,7 @@
>>  #define GART_APERTURE_SIZE	SZ_32M
>>  
>>  /* bitmap of the page sizes currently supported */
>> -#define GART_IOMMU_PGSIZES	(SZ_4K)
>> +#define GART_IOMMU_PGSIZES	GENMASK(24, 12)
> 
> That doesn't look right. The GART really only supports 4 KiB pages. You
> seem to be "emulating" more page sizes here in order to improve mapping
> performance. That seems wrong to me. I'm wondering if this couldn't be
> improved by a similar factor by simply moving the flushing into an
> implementation of ->iotlb_sync().
> 
> That said, it seems like ->iotlb_sync() is only used for unmapping, but
> I don't see a reason why iommu_map() wouldn't need to call it as well
> after going through several calls to ->map(). It seems to me like a
> driver that implements ->iotlb_sync() would want to use it to optimize
> for both the mapping and unmapping cases.

I vaguely remember looking at the IOMMU syncing, but decided to try at first the
way this patch is. I'll be happy to switch to map/unmap syncing, let's see what
Joerg would suggest.

> Joerg, I've gone over the git log and header files and I see no mention
> of why the TLB flush interface isn't used for mapping. Do you recall any
> special reasons why the same shouldn't be applied for mapping? Would you
> accept any patches doing this?
--
To unsubscribe from this list: send the line "unsubscribe linux-tegra" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Robin Murphy April 27, 2018, 12:36 p.m. | #3
Hi Thierry,

On 27/04/18 11:02, Thierry Reding wrote:
> On Mon, Apr 09, 2018 at 11:07:22PM +0300, Dmitry Osipenko wrote:
>> Currently GART writes one page entry at a time. More optimal would be to
>> aggregate the writes and flush BUS buffer in the end, this gives map/unmap
>> 10-40% (depending on size of mapping) performance boost compared to a
>> flushing after each entry update.
>>
>> Signed-off-by: Dmitry Osipenko <digetx@gmail.com>
>> ---
>>   drivers/iommu/tegra-gart.c | 63 +++++++++++++++++++++++++++++++++++-----------
>>   1 file changed, 48 insertions(+), 15 deletions(-)
>>
>> diff --git a/drivers/iommu/tegra-gart.c b/drivers/iommu/tegra-gart.c
>> index 4a0607669d34..9f59f5f17661 100644
>> --- a/drivers/iommu/tegra-gart.c
>> +++ b/drivers/iommu/tegra-gart.c
>> @@ -36,7 +36,7 @@
>>   #define GART_APERTURE_SIZE	SZ_32M
>>   
>>   /* bitmap of the page sizes currently supported */
>> -#define GART_IOMMU_PGSIZES	(SZ_4K)
>> +#define GART_IOMMU_PGSIZES	GENMASK(24, 12)
> 
> That doesn't look right. The GART really only supports 4 KiB pages. You
> seem to be "emulating" more page sizes here in order to improve mapping
> performance. That seems wrong to me. I'm wondering if this couldn't be
> improved by a similar factor by simply moving the flushing into an
> implementation of ->iotlb_sync().
> 
> That said, it seems like ->iotlb_sync() is only used for unmapping, but
> I don't see a reason why iommu_map() wouldn't need to call it as well
> after going through several calls to ->map(). It seems to me like a
> driver that implements ->iotlb_sync() would want to use it to optimize
> for both the mapping and unmapping cases.
> 
> Joerg, I've gone over the git log and header files and I see no mention
> of why the TLB flush interface isn't used for mapping. Do you recall any
> special reasons why the same shouldn't be applied for mapping? Would you
> accept any patches doing this?

In general, requiring TLB maintenance when transitioning from an invalid 
entry to a valid one tends to be the exception rather than the norm, and 
I think we ended up at the consensus that it wasn't worth the 
complication of trying to cater for this in the generic iotlb API.

To be fair, on simple hardware which doesn't implement multiple page 
sizes with associated walk depth/TLB pressure benefits for larger ones, 
there's no need for the IOMMU API (and/or the owner of the domain) to 
try harder to use them, so handling "compound" page sizes within the 
driver is a more reasonable thing to do. There is already some precedent 
for this in other drivers (e.g. mtk_iommu_v1).

Robin.

> 
> Thierry
> 
>>   
>>   #define GART_REG_BASE		0x24
>>   #define GART_CONFIG		(0x24 - GART_REG_BASE)
>> @@ -269,25 +269,21 @@ static void gart_iommu_domain_free(struct iommu_domain *domain)
>>   	kfree(gart_domain);
>>   }
>>   
>> -static int gart_iommu_map(struct iommu_domain *domain, unsigned long iova,
>> -			  phys_addr_t pa, size_t bytes, int prot)
>> +static int gart_iommu_map_page(struct gart_device *gart,
>> +			       unsigned long iova,
>> +			       phys_addr_t pa)
>>   {
>> -	struct gart_domain *gart_domain = to_gart_domain(domain);
>> -	struct gart_device *gart = gart_domain->gart;
>>   	unsigned long flags;
>>   	unsigned long pfn;
>>   	unsigned long pte;
>>   
>> -	if (!gart_iova_range_valid(gart, iova, bytes))
>> -		return -EINVAL;
>> -
>> -	spin_lock_irqsave(&gart->pte_lock, flags);
>>   	pfn = __phys_to_pfn(pa);
>>   	if (!pfn_valid(pfn)) {
>>   		dev_err(gart->dev, "Invalid page: %pa\n", &pa);
>> -		spin_unlock_irqrestore(&gart->pte_lock, flags);
>>   		return -EINVAL;
>>   	}
>> +
>> +	spin_lock_irqsave(&gart->pte_lock, flags);
>>   	if (gart_debug) {
>>   		pte = gart_read_pte(gart, iova);
>>   		if (pte & GART_ENTRY_PHYS_ADDR_VALID) {
>> @@ -297,8 +293,41 @@ static int gart_iommu_map(struct iommu_domain *domain, unsigned long iova,
>>   		}
>>   	}
>>   	gart_set_pte(gart, iova, GART_PTE(pfn));
>> +	spin_unlock_irqrestore(&gart->pte_lock, flags);
>> +
>> +	return 0;
>> +}
>> +
>> +static int gart_iommu_map(struct iommu_domain *domain, unsigned long iova,
>> +			  phys_addr_t pa, size_t bytes, int prot)
>> +{
>> +	struct gart_domain *gart_domain = to_gart_domain(domain);
>> +	struct gart_device *gart = gart_domain->gart;
>> +	size_t mapped;
>> +	int ret = -1;
>> +
>> +	if (!gart_iova_range_valid(gart, iova, bytes))
>> +		return -EINVAL;
>> +
>> +	for (mapped = 0; mapped < bytes; mapped += GART_PAGE_SIZE) {
>> +		ret = gart_iommu_map_page(gart, iova + mapped, pa + mapped);
>> +		if (ret)
>> +			break;
>> +	}
>> +
>>   	FLUSH_GART_REGS(gart);
>> +	return ret;
>> +}
>> +
>> +static int gart_iommu_unmap_page(struct gart_device *gart,
>> +				 unsigned long iova)
>> +{
>> +	unsigned long flags;
>> +
>> +	spin_lock_irqsave(&gart->pte_lock, flags);
>> +	gart_set_pte(gart, iova, 0);
>>   	spin_unlock_irqrestore(&gart->pte_lock, flags);
>> +
>>   	return 0;
>>   }
>>   
>> @@ -307,16 +336,20 @@ static size_t gart_iommu_unmap(struct iommu_domain *domain, unsigned long iova,
>>   {
>>   	struct gart_domain *gart_domain = to_gart_domain(domain);
>>   	struct gart_device *gart = gart_domain->gart;
>> -	unsigned long flags;
>> +	size_t unmapped;
>> +	int ret;
>>   
>>   	if (!gart_iova_range_valid(gart, iova, bytes))
>>   		return 0;
>>   
>> -	spin_lock_irqsave(&gart->pte_lock, flags);
>> -	gart_set_pte(gart, iova, 0);
>> +	for (unmapped = 0; unmapped < bytes; unmapped += GART_PAGE_SIZE) {
>> +		ret = gart_iommu_unmap_page(gart, iova + unmapped);
>> +		if (ret)
>> +			break;
>> +	}
>> +
>>   	FLUSH_GART_REGS(gart);
>> -	spin_unlock_irqrestore(&gart->pte_lock, flags);
>> -	return bytes;
>> +	return unmapped;
>>   }
>>   
>>   static phys_addr_t gart_iommu_iova_to_phys(struct iommu_domain *domain,
>> -- 
>> 2.16.3
>>
>>
>>
>> _______________________________________________
>> iommu mailing list
>> iommu@lists.linux-foundation.org
>> https://lists.linuxfoundation.org/mailman/listinfo/iommu
--
To unsubscribe from this list: send the line "unsubscribe linux-tegra" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Dmitry Osipenko May 6, 2018, 9:19 p.m. | #4
On 27.04.2018 15:36, Robin Murphy wrote:
> Hi Thierry,
> 
> On 27/04/18 11:02, Thierry Reding wrote:
>> On Mon, Apr 09, 2018 at 11:07:22PM +0300, Dmitry Osipenko wrote:
>>> Currently GART writes one page entry at a time. More optimal would be to
>>> aggregate the writes and flush BUS buffer in the end, this gives map/unmap
>>> 10-40% (depending on size of mapping) performance boost compared to a
>>> flushing after each entry update.
>>>
>>> Signed-off-by: Dmitry Osipenko <digetx@gmail.com>
>>> ---
>>>   drivers/iommu/tegra-gart.c | 63 +++++++++++++++++++++++++++++++++++-----------
>>>   1 file changed, 48 insertions(+), 15 deletions(-)
>>>
>>> diff --git a/drivers/iommu/tegra-gart.c b/drivers/iommu/tegra-gart.c
>>> index 4a0607669d34..9f59f5f17661 100644
>>> --- a/drivers/iommu/tegra-gart.c
>>> +++ b/drivers/iommu/tegra-gart.c
>>> @@ -36,7 +36,7 @@
>>>   #define GART_APERTURE_SIZE    SZ_32M
>>>     /* bitmap of the page sizes currently supported */
>>> -#define GART_IOMMU_PGSIZES    (SZ_4K)
>>> +#define GART_IOMMU_PGSIZES    GENMASK(24, 12)
>>
>> That doesn't look right. The GART really only supports 4 KiB pages. You
>> seem to be "emulating" more page sizes here in order to improve mapping
>> performance. That seems wrong to me. I'm wondering if this couldn't be
>> improved by a similar factor by simply moving the flushing into an
>> implementation of ->iotlb_sync().
>>
>> That said, it seems like ->iotlb_sync() is only used for unmapping, but
>> I don't see a reason why iommu_map() wouldn't need to call it as well
>> after going through several calls to ->map(). It seems to me like a
>> driver that implements ->iotlb_sync() would want to use it to optimize
>> for both the mapping and unmapping cases.
>>
>> Joerg, I've gone over the git log and header files and I see no mention
>> of why the TLB flush interface isn't used for mapping. Do you recall any
>> special reasons why the same shouldn't be applied for mapping? Would you
>> accept any patches doing this?
> 
> In general, requiring TLB maintenance when transitioning from an invalid entry
> to a valid one tends to be the exception rather than the norm, and I think we
> ended up at the consensus that it wasn't worth the complication of trying to
> cater for this in the generic iotlb API.
> 
> To be fair, on simple hardware which doesn't implement multiple page sizes with
> associated walk depth/TLB pressure benefits for larger ones, there's no need for
> the IOMMU API (and/or the owner of the domain) to try harder to use them, so
> handling "compound" page sizes within the driver is a more reasonable thing to
> do. There is already some precedent for this in other drivers (e.g. mtk_iommu_v1).
Probably the best variant would be to give an explicit control over syncing to a
user of the IOMMU API, like for example device driver may perform multiple
mappings / unmappings and then sync/flush in the end. I'm not sure that it's
really worth the hassle to shuffle the API right now, maybe we can implement it
later if needed. Joerg, do you have objections to a 'compound page' approach?
--
To unsubscribe from this list: send the line "unsubscribe linux-tegra" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Joerg Roedel May 7, 2018, 7:59 a.m. | #5
On Fri, Apr 27, 2018 at 12:02:02PM +0200, Thierry Reding wrote:
> Joerg, I've gone over the git log and header files and I see no mention
> of why the TLB flush interface isn't used for mapping. Do you recall any
> special reasons why the same shouldn't be applied for mapping? Would you
> accept any patches doing this?

Yeah, the reason was that we assumed that only emulated IOMMUs ever need
flushing in the mapping path, and there is no reason to optimize for
that. Especially when a call to iotlb_sync() in the mapping path would
hurt most other users.

Does the tegra-gart actually need it too?



	Joerg

--
To unsubscribe from this list: send the line "unsubscribe linux-tegra" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Joerg Roedel May 7, 2018, 8:04 a.m. | #6
On Mon, May 07, 2018 at 12:19:01AM +0300, Dmitry Osipenko wrote:
> Probably the best variant would be to give an explicit control over syncing to a
> user of the IOMMU API, like for example device driver may perform multiple
> mappings / unmappings and then sync/flush in the end. I'm not sure that it's
> really worth the hassle to shuffle the API right now, maybe we can implement it
> later if needed. Joerg, do you have objections to a 'compound page' approach?

Have you measured the performance difference on both variants? The
compound-page approach only works for cases when the physical memory you
map contiguous and correctly aligned.

If it is really needed I would prefer a separate iotlb_sync_map()
call-back that is just NULL when not needed. This way all users that
don't need it only get a minimal penalty in the mapping path and you
don't have any requirements on the physical memory you map to get good
performance.


	Joerg

--
To unsubscribe from this list: send the line "unsubscribe linux-tegra" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Dmitry Osipenko May 7, 2018, 3:46 p.m. | #7
On 07.05.2018 10:59, Joerg Roedel wrote:
> On Fri, Apr 27, 2018 at 12:02:02PM +0200, Thierry Reding wrote:
>> Joerg, I've gone over the git log and header files and I see no mention
>> of why the TLB flush interface isn't used for mapping. Do you recall any
>> special reasons why the same shouldn't be applied for mapping? Would you
>> accept any patches doing this?
> 
> Yeah, the reason was that we assumed that only emulated IOMMUs ever need
> flushing in the mapping path, and there is no reason to optimize for
> that. Especially when a call to iotlb_sync() in the mapping path would
> hurt most other users.
> 
> Does the tegra-gart actually need it too?

Tegra-GART is exactly an emulated IOMMU, it doesn't have anything like TLB and
it's simply a remapping table. What is actually needed for the GART - is to make
sure that the remapping table modifications reach GART HW before anything tries
to touch the modified page entries. Presumable there is some kind of a HW buffer
that accumulates multiple registers writes and issues them in bursts to improve
performance, read-after-write is the way to flush that buffer.
--
To unsubscribe from this list: send the line "unsubscribe linux-tegra" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Dmitry Osipenko May 7, 2018, 3:51 p.m. | #8
On 07.05.2018 11:04, Joerg Roedel wrote:
> On Mon, May 07, 2018 at 12:19:01AM +0300, Dmitry Osipenko wrote:
>> Probably the best variant would be to give an explicit control over syncing to a
>> user of the IOMMU API, like for example device driver may perform multiple
>> mappings / unmappings and then sync/flush in the end. I'm not sure that it's
>> really worth the hassle to shuffle the API right now, maybe we can implement it
>> later if needed. Joerg, do you have objections to a 'compound page' approach?
> 
> Have you measured the performance difference on both variants? The
> compound-page approach only works for cases when the physical memory you
> map contiguous and correctly aligned.

Yes, previously I actually only tested mapping of the contiguous allocations
(used for memory isolation purposes). But now I've re-tested all variants and
got somewhat interesting results.

Firstly it is not that easy to test a really sparse mapping simply because
memory allocator produces sparse allocation only when memory is _really_
fragmented. Pretty much all of the time the sparse allocations are contiguous or
they consist of a very few chunks that do not impose any noticeable performance
impact.

Secondly, the interesting part is that mapping / unmapping of a contiguous
allocation (CMA using DMA API) is slower by ~50% then doing it for a sparse
allocation (get_pages using bare IOMMU API). /I think/ it's a shortcoming of the
arch/arm/mm/dma-mapping.c, which also suffers from other inflexibilities that
Thierry faced recently. Though I haven't really tried to figure out what is the
bottleneck yet and Thierry was going to re-write ARM's dma-mapping
implementation anyway, I'll take a closer look at this issue a bit later.

I've implemented the iotlb_sync_map() and tested things with it. The end result
is the same as for the compound page approach, simply because actual allocations
are pretty much always contiguous.

> If it is really needed I would prefer a separate iotlb_sync_map()
> call-back that is just NULL when not needed. This way all users that
> don't need it only get a minimal penalty in the mapping path and you
> don't have any requirements on the physical memory you map to get good
> performance.
Summarizing, the iotlb_sync_map() is indeed better way. As you rightly noticed,
that approach is also optimal for the non-contiguous cases as we won't have to
flush on mapping of each contiguous chunk of the sparse allocation, but after
the whole mapping is done.

Thierry, Robin and Joerg - thanks for your input, I'll prepare patches
implementing the iotlb_sync_map.
--
To unsubscribe from this list: send the line "unsubscribe linux-tegra" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Dmitry Osipenko May 7, 2018, 5:38 p.m. | #9
On 07.05.2018 18:51, Dmitry Osipenko wrote:

[snip]

> Secondly, the interesting part is that mapping / unmapping of a contiguous
> allocation (CMA using DMA API) is slower by ~50% then doing it for a sparse
> allocation (get_pages using bare IOMMU API). /I think/ it's a shortcoming of the
> arch/arm/mm/dma-mapping.c, which also suffers from other inflexibilities that
> Thierry faced recently. Though I haven't really tried to figure out what is the
> bottleneck yet and Thierry was going to re-write ARM's dma-mapping
> implementation anyway, I'll take a closer look at this issue a bit later.

Please scratch my accusation of ARM's dma-mapping, it's not the culprit at all.
I completely forgot that in a case of sparse allocation displays framebuffer
IOMMU mapping is "pinned" to the GART and hence it's not getting dynamically
mapped / unmapped during of my testing. I also forgot to set CPU freq governor
to "perfomance", that reduced 50% to 20% of the above perf difference. The rest
of the testing is unaffected, flushing after whole mapping is still much more
efficient than flushing after modification of each page entry. And yet again,
performance of sparse mapping is nearly the same as of contiguous mapping unless
sparse allocation is large and _very_ fragmented.
--
To unsubscribe from this list: send the line "unsubscribe linux-tegra" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Patch

diff --git a/drivers/iommu/tegra-gart.c b/drivers/iommu/tegra-gart.c
index 4a0607669d34..9f59f5f17661 100644
--- a/drivers/iommu/tegra-gart.c
+++ b/drivers/iommu/tegra-gart.c
@@ -36,7 +36,7 @@ 
 #define GART_APERTURE_SIZE	SZ_32M
 
 /* bitmap of the page sizes currently supported */
-#define GART_IOMMU_PGSIZES	(SZ_4K)
+#define GART_IOMMU_PGSIZES	GENMASK(24, 12)
 
 #define GART_REG_BASE		0x24
 #define GART_CONFIG		(0x24 - GART_REG_BASE)
@@ -269,25 +269,21 @@  static void gart_iommu_domain_free(struct iommu_domain *domain)
 	kfree(gart_domain);
 }
 
-static int gart_iommu_map(struct iommu_domain *domain, unsigned long iova,
-			  phys_addr_t pa, size_t bytes, int prot)
+static int gart_iommu_map_page(struct gart_device *gart,
+			       unsigned long iova,
+			       phys_addr_t pa)
 {
-	struct gart_domain *gart_domain = to_gart_domain(domain);
-	struct gart_device *gart = gart_domain->gart;
 	unsigned long flags;
 	unsigned long pfn;
 	unsigned long pte;
 
-	if (!gart_iova_range_valid(gart, iova, bytes))
-		return -EINVAL;
-
-	spin_lock_irqsave(&gart->pte_lock, flags);
 	pfn = __phys_to_pfn(pa);
 	if (!pfn_valid(pfn)) {
 		dev_err(gart->dev, "Invalid page: %pa\n", &pa);
-		spin_unlock_irqrestore(&gart->pte_lock, flags);
 		return -EINVAL;
 	}
+
+	spin_lock_irqsave(&gart->pte_lock, flags);
 	if (gart_debug) {
 		pte = gart_read_pte(gart, iova);
 		if (pte & GART_ENTRY_PHYS_ADDR_VALID) {
@@ -297,8 +293,41 @@  static int gart_iommu_map(struct iommu_domain *domain, unsigned long iova,
 		}
 	}
 	gart_set_pte(gart, iova, GART_PTE(pfn));
+	spin_unlock_irqrestore(&gart->pte_lock, flags);
+
+	return 0;
+}
+
+static int gart_iommu_map(struct iommu_domain *domain, unsigned long iova,
+			  phys_addr_t pa, size_t bytes, int prot)
+{
+	struct gart_domain *gart_domain = to_gart_domain(domain);
+	struct gart_device *gart = gart_domain->gart;
+	size_t mapped;
+	int ret = -1;
+
+	if (!gart_iova_range_valid(gart, iova, bytes))
+		return -EINVAL;
+
+	for (mapped = 0; mapped < bytes; mapped += GART_PAGE_SIZE) {
+		ret = gart_iommu_map_page(gart, iova + mapped, pa + mapped);
+		if (ret)
+			break;
+	}
+
 	FLUSH_GART_REGS(gart);
+	return ret;
+}
+
+static int gart_iommu_unmap_page(struct gart_device *gart,
+				 unsigned long iova)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&gart->pte_lock, flags);
+	gart_set_pte(gart, iova, 0);
 	spin_unlock_irqrestore(&gart->pte_lock, flags);
+
 	return 0;
 }
 
@@ -307,16 +336,20 @@  static size_t gart_iommu_unmap(struct iommu_domain *domain, unsigned long iova,
 {
 	struct gart_domain *gart_domain = to_gart_domain(domain);
 	struct gart_device *gart = gart_domain->gart;
-	unsigned long flags;
+	size_t unmapped;
+	int ret;
 
 	if (!gart_iova_range_valid(gart, iova, bytes))
 		return 0;
 
-	spin_lock_irqsave(&gart->pte_lock, flags);
-	gart_set_pte(gart, iova, 0);
+	for (unmapped = 0; unmapped < bytes; unmapped += GART_PAGE_SIZE) {
+		ret = gart_iommu_unmap_page(gart, iova + unmapped);
+		if (ret)
+			break;
+	}
+
 	FLUSH_GART_REGS(gart);
-	spin_unlock_irqrestore(&gart->pte_lock, flags);
-	return bytes;
+	return unmapped;
 }
 
 static phys_addr_t gart_iommu_iova_to_phys(struct iommu_domain *domain,