diff mbox

[kernel,v7,26/31] powerpc/iommu: Add userspace view of TCE table

Message ID 1427468115-2224-27-git-send-email-aik@ozlabs.ru (mailing list archive)
State Superseded
Delegated to: Benjamin Herrenschmidt
Headers show

Commit Message

Alexey Kardashevskiy March 27, 2015, 2:55 p.m. UTC
In order to support memory pre-registration, we need a way to track
the use of every registered memory region and only allow unregistration
if a region is not in use anymore. So we need a way to tell from what
region the just cleared TCE was from.

This adds a userspace view of the TCE table into iommu_table struct.
It contains userspace address, one per TCE entry. The table is only
allocated when the ownership over an IOMMU group is taken which means
it is only used from outside of the powernv code (such as VFIO).

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---
 arch/powerpc/include/asm/iommu.h          |  6 ++++++
 arch/powerpc/kernel/iommu.c               |  7 +++++++
 arch/powerpc/platforms/powernv/pci-ioda.c | 23 ++++++++++++++++++++++-
 3 files changed, 35 insertions(+), 1 deletion(-)

Comments

Alex Williamson April 2, 2015, 8:50 p.m. UTC | #1
Should have sent this with the other comments, but found it hiding on my
desktop...

On Sat, 2015-03-28 at 01:55 +1100, Alexey Kardashevskiy wrote:
> In order to support memory pre-registration, we need a way to track
> the use of every registered memory region and only allow unregistration
> if a region is not in use anymore. So we need a way to tell from what
> region the just cleared TCE was from.
> 
> This adds a userspace view of the TCE table into iommu_table struct.
> It contains userspace address, one per TCE entry. The table is only
> allocated when the ownership over an IOMMU group is taken which means
> it is only used from outside of the powernv code (such as VFIO).
> 
> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> ---
>  arch/powerpc/include/asm/iommu.h          |  6 ++++++
>  arch/powerpc/kernel/iommu.c               |  7 +++++++
>  arch/powerpc/platforms/powernv/pci-ioda.c | 23 ++++++++++++++++++++++-
>  3 files changed, 35 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
> index 2c08c91..a768a4d 100644
> --- a/arch/powerpc/include/asm/iommu.h
> +++ b/arch/powerpc/include/asm/iommu.h
> @@ -106,9 +106,15 @@ struct iommu_table {
>  	unsigned long *it_map;       /* A simple allocation bitmap for now */
>  	unsigned long  it_page_shift;/* table iommu page size */
>  	struct iommu_table_group *it_group;
> +	unsigned long *it_userspace; /* userspace view of the table */
>  	struct iommu_table_ops *it_ops;
>  };
>  
> +#define IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry) \
> +		((tbl)->it_userspace ? \
> +			&((tbl)->it_userspace[(entry) - (tbl)->it_offset]) : \
> +			NULL)
> +
>  /* Pure 2^n version of get_order */
>  static inline __attribute_const__
>  int get_iommu_order(unsigned long size, struct iommu_table *tbl)
> diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
> index 0bcd988..82102d1 100644
> --- a/arch/powerpc/kernel/iommu.c
> +++ b/arch/powerpc/kernel/iommu.c
> @@ -38,6 +38,7 @@
>  #include <linux/pci.h>
>  #include <linux/iommu.h>
>  #include <linux/sched.h>
> +#include <linux/vmalloc.h>
>  #include <asm/io.h>
>  #include <asm/prom.h>
>  #include <asm/iommu.h>
> @@ -1069,6 +1070,9 @@ static int iommu_table_take_ownership(struct iommu_table *tbl)
>  		spin_unlock(&tbl->pools[i].lock);
>  	spin_unlock_irqrestore(&tbl->large_pool.lock, flags);
>  
> +	BUG_ON(tbl->it_userspace);
> +	tbl->it_userspace = vzalloc(sizeof(*tbl->it_userspace) * tbl->it_size);
> +

-ENOMEM?

>  	return 0;
>  }
>  
> @@ -1102,6 +1106,9 @@ static void iommu_table_release_ownership(struct iommu_table *tbl)
>  {
>  	unsigned long flags, i, sz = (tbl->it_size + 7) >> 3;
>  
> +	vfree(tbl->it_userspace);
> +	tbl->it_userspace = NULL;
> +
>  	spin_lock_irqsave(&tbl->large_pool.lock, flags);
>  	for (i = 0; i < tbl->nr_pools; i++)
>  		spin_lock(&tbl->pools[i].lock);
> diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
> index bc36cf1..036f3c1 100644
> --- a/arch/powerpc/platforms/powernv/pci-ioda.c
> +++ b/arch/powerpc/platforms/powernv/pci-ioda.c
> @@ -26,6 +26,7 @@
>  #include <linux/iommu.h>
>  #include <linux/mmzone.h>
>  #include <linux/sizes.h>
> +#include <linux/vmalloc.h>
>  
>  #include <asm/mmzone.h>
>  #include <asm/sections.h>
> @@ -1469,6 +1470,9 @@ static void pnv_pci_free_table(struct iommu_table *tbl)
>  	if (!tbl->it_size)
>  		return;
>  
> +	if (tbl->it_userspace)

Not necessary

> +		vfree(tbl->it_userspace);
> +

Why no NULL setting this time?

>  	pnv_free_tce_table(tbl->it_base, size, tbl->it_indirect_levels);
>  	iommu_reset_table(tbl, "ioda2");
>  }
> @@ -1656,9 +1660,26 @@ static void pnv_ioda2_set_ownership(struct iommu_table_group *table_group,
>  	pnv_pci_ioda2_set_bypass(pe, !enable);
>  }
>  
> +static long pnv_pci_ioda2_create_table_with_uas(
> +		struct iommu_table_group *table_group,
> +		int num, __u32 page_shift, __u64 window_size, __u32 levels,
> +		struct iommu_table *tbl)
> +{
> +	long ret = pnv_pci_ioda2_create_table(table_group, num,
> +			page_shift, window_size, levels, tbl);
> +
> +	if (ret)
> +		return ret;
> +
> +	BUG_ON(tbl->it_userspace);
> +	tbl->it_userspace = vzalloc(sizeof(*tbl->it_userspace) * tbl->it_size);

-ENOMEM

> +
> +	return 0;
> +}
> +
>  static struct iommu_table_group_ops pnv_pci_ioda2_ops = {
>  	.set_ownership = pnv_ioda2_set_ownership,
> -	.create_table = pnv_pci_ioda2_create_table,
> +	.create_table = pnv_pci_ioda2_create_table_with_uas,
>  	.set_window = pnv_pci_ioda2_set_window,
>  	.unset_window = pnv_pci_ioda2_unset_window,
>  };
Alexey Kardashevskiy April 8, 2015, 3:22 a.m. UTC | #2
On 04/03/2015 07:50 AM, Alex Williamson wrote:
>
> Should have sent this with the other comments, but found it hiding on my
> desktop...
>
> On Sat, 2015-03-28 at 01:55 +1100, Alexey Kardashevskiy wrote:
>> In order to support memory pre-registration, we need a way to track
>> the use of every registered memory region and only allow unregistration
>> if a region is not in use anymore. So we need a way to tell from what
>> region the just cleared TCE was from.
>>
>> This adds a userspace view of the TCE table into iommu_table struct.
>> It contains userspace address, one per TCE entry. The table is only
>> allocated when the ownership over an IOMMU group is taken which means
>> it is only used from outside of the powernv code (such as VFIO).
>>
>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>> ---
>>   arch/powerpc/include/asm/iommu.h          |  6 ++++++
>>   arch/powerpc/kernel/iommu.c               |  7 +++++++
>>   arch/powerpc/platforms/powernv/pci-ioda.c | 23 ++++++++++++++++++++++-
>>   3 files changed, 35 insertions(+), 1 deletion(-)
>>
>> diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
>> index 2c08c91..a768a4d 100644
>> --- a/arch/powerpc/include/asm/iommu.h
>> +++ b/arch/powerpc/include/asm/iommu.h
>> @@ -106,9 +106,15 @@ struct iommu_table {
>>   	unsigned long *it_map;       /* A simple allocation bitmap for now */
>>   	unsigned long  it_page_shift;/* table iommu page size */
>>   	struct iommu_table_group *it_group;
>> +	unsigned long *it_userspace; /* userspace view of the table */
>>   	struct iommu_table_ops *it_ops;
>>   };
>>
>> +#define IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry) \
>> +		((tbl)->it_userspace ? \
>> +			&((tbl)->it_userspace[(entry) - (tbl)->it_offset]) : \
>> +			NULL)
>> +
>>   /* Pure 2^n version of get_order */
>>   static inline __attribute_const__
>>   int get_iommu_order(unsigned long size, struct iommu_table *tbl)
>> diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
>> index 0bcd988..82102d1 100644
>> --- a/arch/powerpc/kernel/iommu.c
>> +++ b/arch/powerpc/kernel/iommu.c
>> @@ -38,6 +38,7 @@
>>   #include <linux/pci.h>
>>   #include <linux/iommu.h>
>>   #include <linux/sched.h>
>> +#include <linux/vmalloc.h>
>>   #include <asm/io.h>
>>   #include <asm/prom.h>
>>   #include <asm/iommu.h>
>> @@ -1069,6 +1070,9 @@ static int iommu_table_take_ownership(struct iommu_table *tbl)
>>   		spin_unlock(&tbl->pools[i].lock);
>>   	spin_unlock_irqrestore(&tbl->large_pool.lock, flags);
>>
>> +	BUG_ON(tbl->it_userspace);
>> +	tbl->it_userspace = vzalloc(sizeof(*tbl->it_userspace) * tbl->it_size);
>> +
>
> -ENOMEM?
>
>>   	return 0;
>>   }
>>
>> @@ -1102,6 +1106,9 @@ static void iommu_table_release_ownership(struct iommu_table *tbl)
>>   {
>>   	unsigned long flags, i, sz = (tbl->it_size + 7) >> 3;
>>
>> +	vfree(tbl->it_userspace);
>> +	tbl->it_userspace = NULL;
>> +
>>   	spin_lock_irqsave(&tbl->large_pool.lock, flags);
>>   	for (i = 0; i < tbl->nr_pools; i++)
>>   		spin_lock(&tbl->pools[i].lock);
>> diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
>> index bc36cf1..036f3c1 100644
>> --- a/arch/powerpc/platforms/powernv/pci-ioda.c
>> +++ b/arch/powerpc/platforms/powernv/pci-ioda.c
>> @@ -26,6 +26,7 @@
>>   #include <linux/iommu.h>
>>   #include <linux/mmzone.h>
>>   #include <linux/sizes.h>
>> +#include <linux/vmalloc.h>
>>
>>   #include <asm/mmzone.h>
>>   #include <asm/sections.h>
>> @@ -1469,6 +1470,9 @@ static void pnv_pci_free_table(struct iommu_table *tbl)
>>   	if (!tbl->it_size)
>>   		return;
>>
>> +	if (tbl->it_userspace)
>
> Not necessary

Out of curiosity - why? Is every single implementation is known for 
checking the argument?


>
>> +		vfree(tbl->it_userspace);
>> +
>
> Why no NULL setting this time?

iommu_reset_table() (2 lines below) will do memset(0) on the entire struct.


>
>>   	pnv_free_tce_table(tbl->it_base, size, tbl->it_indirect_levels);
>>   	iommu_reset_table(tbl, "ioda2");
>>   }
>> @@ -1656,9 +1660,26 @@ static void pnv_ioda2_set_ownership(struct iommu_table_group *table_group,
>>   	pnv_pci_ioda2_set_bypass(pe, !enable);
>>   }
>>
>> +static long pnv_pci_ioda2_create_table_with_uas(
>> +		struct iommu_table_group *table_group,
>> +		int num, __u32 page_shift, __u64 window_size, __u32 levels,
>> +		struct iommu_table *tbl)
>> +{
>> +	long ret = pnv_pci_ioda2_create_table(table_group, num,
>> +			page_shift, window_size, levels, tbl);
>> +
>> +	if (ret)
>> +		return ret;
>> +
>> +	BUG_ON(tbl->it_userspace);
>> +	tbl->it_userspace = vzalloc(sizeof(*tbl->it_userspace) * tbl->it_size);
>
> -ENOMEM
>
>> +
>> +	return 0;
>> +}
>> +
>>   static struct iommu_table_group_ops pnv_pci_ioda2_ops = {
>>   	.set_ownership = pnv_ioda2_set_ownership,
>> -	.create_table = pnv_pci_ioda2_create_table,
>> +	.create_table = pnv_pci_ioda2_create_table_with_uas,
>>   	.set_window = pnv_pci_ioda2_set_window,
>>   	.unset_window = pnv_pci_ioda2_unset_window,
>>   };
>
>
>


Thanks for the review! What is overall resume? Another respin?
Alex Williamson April 8, 2015, 3:43 p.m. UTC | #3
On Wed, 2015-04-08 at 13:22 +1000, Alexey Kardashevskiy wrote:
> On 04/03/2015 07:50 AM, Alex Williamson wrote:
> >
> > Should have sent this with the other comments, but found it hiding on my
> > desktop...
> >
> > On Sat, 2015-03-28 at 01:55 +1100, Alexey Kardashevskiy wrote:
> >> In order to support memory pre-registration, we need a way to track
> >> the use of every registered memory region and only allow unregistration
> >> if a region is not in use anymore. So we need a way to tell from what
> >> region the just cleared TCE was from.
> >>
> >> This adds a userspace view of the TCE table into iommu_table struct.
> >> It contains userspace address, one per TCE entry. The table is only
> >> allocated when the ownership over an IOMMU group is taken which means
> >> it is only used from outside of the powernv code (such as VFIO).
> >>
> >> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> >> ---
> >>   arch/powerpc/include/asm/iommu.h          |  6 ++++++
> >>   arch/powerpc/kernel/iommu.c               |  7 +++++++
> >>   arch/powerpc/platforms/powernv/pci-ioda.c | 23 ++++++++++++++++++++++-
> >>   3 files changed, 35 insertions(+), 1 deletion(-)
> >>
> >> diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
> >> index 2c08c91..a768a4d 100644
> >> --- a/arch/powerpc/include/asm/iommu.h
> >> +++ b/arch/powerpc/include/asm/iommu.h
> >> @@ -106,9 +106,15 @@ struct iommu_table {
> >>   	unsigned long *it_map;       /* A simple allocation bitmap for now */
> >>   	unsigned long  it_page_shift;/* table iommu page size */
> >>   	struct iommu_table_group *it_group;
> >> +	unsigned long *it_userspace; /* userspace view of the table */
> >>   	struct iommu_table_ops *it_ops;
> >>   };
> >>
> >> +#define IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry) \
> >> +		((tbl)->it_userspace ? \
> >> +			&((tbl)->it_userspace[(entry) - (tbl)->it_offset]) : \
> >> +			NULL)
> >> +
> >>   /* Pure 2^n version of get_order */
> >>   static inline __attribute_const__
> >>   int get_iommu_order(unsigned long size, struct iommu_table *tbl)
> >> diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
> >> index 0bcd988..82102d1 100644
> >> --- a/arch/powerpc/kernel/iommu.c
> >> +++ b/arch/powerpc/kernel/iommu.c
> >> @@ -38,6 +38,7 @@
> >>   #include <linux/pci.h>
> >>   #include <linux/iommu.h>
> >>   #include <linux/sched.h>
> >> +#include <linux/vmalloc.h>
> >>   #include <asm/io.h>
> >>   #include <asm/prom.h>
> >>   #include <asm/iommu.h>
> >> @@ -1069,6 +1070,9 @@ static int iommu_table_take_ownership(struct iommu_table *tbl)
> >>   		spin_unlock(&tbl->pools[i].lock);
> >>   	spin_unlock_irqrestore(&tbl->large_pool.lock, flags);
> >>
> >> +	BUG_ON(tbl->it_userspace);
> >> +	tbl->it_userspace = vzalloc(sizeof(*tbl->it_userspace) * tbl->it_size);
> >> +
> >
> > -ENOMEM?
> >
> >>   	return 0;
> >>   }
> >>
> >> @@ -1102,6 +1106,9 @@ static void iommu_table_release_ownership(struct iommu_table *tbl)
> >>   {
> >>   	unsigned long flags, i, sz = (tbl->it_size + 7) >> 3;
> >>
> >> +	vfree(tbl->it_userspace);
> >> +	tbl->it_userspace = NULL;
> >> +
> >>   	spin_lock_irqsave(&tbl->large_pool.lock, flags);
> >>   	for (i = 0; i < tbl->nr_pools; i++)
> >>   		spin_lock(&tbl->pools[i].lock);
> >> diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
> >> index bc36cf1..036f3c1 100644
> >> --- a/arch/powerpc/platforms/powernv/pci-ioda.c
> >> +++ b/arch/powerpc/platforms/powernv/pci-ioda.c
> >> @@ -26,6 +26,7 @@
> >>   #include <linux/iommu.h>
> >>   #include <linux/mmzone.h>
> >>   #include <linux/sizes.h>
> >> +#include <linux/vmalloc.h>
> >>
> >>   #include <asm/mmzone.h>
> >>   #include <asm/sections.h>
> >> @@ -1469,6 +1470,9 @@ static void pnv_pci_free_table(struct iommu_table *tbl)
> >>   	if (!tbl->it_size)
> >>   		return;
> >>
> >> +	if (tbl->it_userspace)
> >
> > Not necessary
> 
> Out of curiosity - why? Is every single implementation is known for 
> checking the argument?

AFAIK, all flavors of free in the kernel accept NULL pointers and do the
right thing.  I verified this one does too.

> >> +		vfree(tbl->it_userspace);
> >> +
> >
> > Why no NULL setting this time?
> 
> iommu_reset_table() (2 lines below) will do memset(0) on the entire struct.

So then should iommu_reset_table() handle the vfree() as well?

> >>   	pnv_free_tce_table(tbl->it_base, size, tbl->it_indirect_levels);
> >>   	iommu_reset_table(tbl, "ioda2");
> >>   }
> >> @@ -1656,9 +1660,26 @@ static void pnv_ioda2_set_ownership(struct iommu_table_group *table_group,
> >>   	pnv_pci_ioda2_set_bypass(pe, !enable);
> >>   }
> >>
> >> +static long pnv_pci_ioda2_create_table_with_uas(
> >> +		struct iommu_table_group *table_group,
> >> +		int num, __u32 page_shift, __u64 window_size, __u32 levels,
> >> +		struct iommu_table *tbl)
> >> +{
> >> +	long ret = pnv_pci_ioda2_create_table(table_group, num,
> >> +			page_shift, window_size, levels, tbl);
> >> +
> >> +	if (ret)
> >> +		return ret;
> >> +
> >> +	BUG_ON(tbl->it_userspace);
> >> +	tbl->it_userspace = vzalloc(sizeof(*tbl->it_userspace) * tbl->it_size);
> >
> > -ENOMEM
> >
> >> +
> >> +	return 0;
> >> +}
> >> +
> >>   static struct iommu_table_group_ops pnv_pci_ioda2_ops = {
> >>   	.set_ownership = pnv_ioda2_set_ownership,
> >> -	.create_table = pnv_pci_ioda2_create_table,
> >> +	.create_table = pnv_pci_ioda2_create_table_with_uas,
> >>   	.set_window = pnv_pci_ioda2_set_window,
> >>   	.unset_window = pnv_pci_ioda2_unset_window,
> >>   };
> >
> >
> >
> 
> 
> Thanks for the review! What is overall resume? Another respin?

Is there another option?  It seems like there are too many issues to
simply fold cleanups onto the end of the series.  Thanks,

Alex
Alexey Kardashevskiy April 9, 2015, 3:21 a.m. UTC | #4
On 04/09/2015 01:43 AM, Alex Williamson wrote:
> On Wed, 2015-04-08 at 13:22 +1000, Alexey Kardashevskiy wrote:
>> On 04/03/2015 07:50 AM, Alex Williamson wrote:
>>>
>>> Should have sent this with the other comments, but found it hiding on my
>>> desktop...
>>>
>>> On Sat, 2015-03-28 at 01:55 +1100, Alexey Kardashevskiy wrote:
>>>> In order to support memory pre-registration, we need a way to track
>>>> the use of every registered memory region and only allow unregistration
>>>> if a region is not in use anymore. So we need a way to tell from what
>>>> region the just cleared TCE was from.
>>>>
>>>> This adds a userspace view of the TCE table into iommu_table struct.
>>>> It contains userspace address, one per TCE entry. The table is only
>>>> allocated when the ownership over an IOMMU group is taken which means
>>>> it is only used from outside of the powernv code (such as VFIO).
>>>>
>>>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>>>> ---
>>>>    arch/powerpc/include/asm/iommu.h          |  6 ++++++
>>>>    arch/powerpc/kernel/iommu.c               |  7 +++++++
>>>>    arch/powerpc/platforms/powernv/pci-ioda.c | 23 ++++++++++++++++++++++-
>>>>    3 files changed, 35 insertions(+), 1 deletion(-)
>>>>
>>>> diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
>>>> index 2c08c91..a768a4d 100644
>>>> --- a/arch/powerpc/include/asm/iommu.h
>>>> +++ b/arch/powerpc/include/asm/iommu.h
>>>> @@ -106,9 +106,15 @@ struct iommu_table {
>>>>    	unsigned long *it_map;       /* A simple allocation bitmap for now */
>>>>    	unsigned long  it_page_shift;/* table iommu page size */
>>>>    	struct iommu_table_group *it_group;
>>>> +	unsigned long *it_userspace; /* userspace view of the table */
>>>>    	struct iommu_table_ops *it_ops;
>>>>    };
>>>>
>>>> +#define IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry) \
>>>> +		((tbl)->it_userspace ? \
>>>> +			&((tbl)->it_userspace[(entry) - (tbl)->it_offset]) : \
>>>> +			NULL)
>>>> +
>>>>    /* Pure 2^n version of get_order */
>>>>    static inline __attribute_const__
>>>>    int get_iommu_order(unsigned long size, struct iommu_table *tbl)
>>>> diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
>>>> index 0bcd988..82102d1 100644
>>>> --- a/arch/powerpc/kernel/iommu.c
>>>> +++ b/arch/powerpc/kernel/iommu.c
>>>> @@ -38,6 +38,7 @@
>>>>    #include <linux/pci.h>
>>>>    #include <linux/iommu.h>
>>>>    #include <linux/sched.h>
>>>> +#include <linux/vmalloc.h>
>>>>    #include <asm/io.h>
>>>>    #include <asm/prom.h>
>>>>    #include <asm/iommu.h>
>>>> @@ -1069,6 +1070,9 @@ static int iommu_table_take_ownership(struct iommu_table *tbl)
>>>>    		spin_unlock(&tbl->pools[i].lock);
>>>>    	spin_unlock_irqrestore(&tbl->large_pool.lock, flags);
>>>>
>>>> +	BUG_ON(tbl->it_userspace);
>>>> +	tbl->it_userspace = vzalloc(sizeof(*tbl->it_userspace) * tbl->it_size);
>>>> +
>>>
>>> -ENOMEM?
>>>
>>>>    	return 0;
>>>>    }
>>>>
>>>> @@ -1102,6 +1106,9 @@ static void iommu_table_release_ownership(struct iommu_table *tbl)
>>>>    {
>>>>    	unsigned long flags, i, sz = (tbl->it_size + 7) >> 3;
>>>>
>>>> +	vfree(tbl->it_userspace);
>>>> +	tbl->it_userspace = NULL;
>>>> +
>>>>    	spin_lock_irqsave(&tbl->large_pool.lock, flags);
>>>>    	for (i = 0; i < tbl->nr_pools; i++)
>>>>    		spin_lock(&tbl->pools[i].lock);
>>>> diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
>>>> index bc36cf1..036f3c1 100644
>>>> --- a/arch/powerpc/platforms/powernv/pci-ioda.c
>>>> +++ b/arch/powerpc/platforms/powernv/pci-ioda.c
>>>> @@ -26,6 +26,7 @@
>>>>    #include <linux/iommu.h>
>>>>    #include <linux/mmzone.h>
>>>>    #include <linux/sizes.h>
>>>> +#include <linux/vmalloc.h>
>>>>
>>>>    #include <asm/mmzone.h>
>>>>    #include <asm/sections.h>
>>>> @@ -1469,6 +1470,9 @@ static void pnv_pci_free_table(struct iommu_table *tbl)
>>>>    	if (!tbl->it_size)
>>>>    		return;
>>>>
>>>> +	if (tbl->it_userspace)
>>>
>>> Not necessary
>>
>> Out of curiosity - why? Is every single implementation is known for
>> checking the argument?
>
> AFAIK, all flavors of free in the kernel accept NULL pointers and do the
> right thing.  I verified this one does too.
>
>>>> +		vfree(tbl->it_userspace);
>>>> +
>>>
>>> Why no NULL setting this time?
>>
>> iommu_reset_table() (2 lines below) will do memset(0) on the entire struct.
>
> So then should iommu_reset_table() handle the vfree() as well?


I wanted to keep vfree() in the same file with vzalloc(). Bad idea?
But I'll move vfree() to iommu_reset_table() anyway.


>
>>>>    	pnv_free_tce_table(tbl->it_base, size, tbl->it_indirect_levels);
>>>>    	iommu_reset_table(tbl, "ioda2");
>>>>    }
>>>> @@ -1656,9 +1660,26 @@ static void pnv_ioda2_set_ownership(struct iommu_table_group *table_group,
>>>>    	pnv_pci_ioda2_set_bypass(pe, !enable);
>>>>    }
>>>>
>>>> +static long pnv_pci_ioda2_create_table_with_uas(
>>>> +		struct iommu_table_group *table_group,
>>>> +		int num, __u32 page_shift, __u64 window_size, __u32 levels,
>>>> +		struct iommu_table *tbl)
>>>> +{
>>>> +	long ret = pnv_pci_ioda2_create_table(table_group, num,
>>>> +			page_shift, window_size, levels, tbl);
>>>> +
>>>> +	if (ret)
>>>> +		return ret;
>>>> +
>>>> +	BUG_ON(tbl->it_userspace);
>>>> +	tbl->it_userspace = vzalloc(sizeof(*tbl->it_userspace) * tbl->it_size);
>>>
>>> -ENOMEM
>>>
>>>> +
>>>> +	return 0;
>>>> +}
>>>> +
>>>>    static struct iommu_table_group_ops pnv_pci_ioda2_ops = {
>>>>    	.set_ownership = pnv_ioda2_set_ownership,
>>>> -	.create_table = pnv_pci_ioda2_create_table,
>>>> +	.create_table = pnv_pci_ioda2_create_table_with_uas,
>>>>    	.set_window = pnv_pci_ioda2_set_window,
>>>>    	.unset_window = pnv_pci_ioda2_unset_window,
>>>>    };
>>>
>>>
>>>
>>
>>
>> Thanks for the review! What is overall resume? Another respin?
>
> Is there another option?  It seems like there are too many issues to
> simply fold cleanups onto the end of the series.  Thanks,

I'll repost indeed, I meant it would help me if you could tell that you 
agree with the patchset and Ben can pull this stuff to his tree. Thanks!
Alex Williamson April 9, 2015, 3:46 a.m. UTC | #5
On Thu, 2015-04-09 at 13:21 +1000, Alexey Kardashevskiy wrote:
> On 04/09/2015 01:43 AM, Alex Williamson wrote:
> > On Wed, 2015-04-08 at 13:22 +1000, Alexey Kardashevskiy wrote:
> >> On 04/03/2015 07:50 AM, Alex Williamson wrote:
> >>>
> >>> Should have sent this with the other comments, but found it hiding on my
> >>> desktop...
> >>>
> >>> On Sat, 2015-03-28 at 01:55 +1100, Alexey Kardashevskiy wrote:
> >>>> In order to support memory pre-registration, we need a way to track
> >>>> the use of every registered memory region and only allow unregistration
> >>>> if a region is not in use anymore. So we need a way to tell from what
> >>>> region the just cleared TCE was from.
> >>>>
> >>>> This adds a userspace view of the TCE table into iommu_table struct.
> >>>> It contains userspace address, one per TCE entry. The table is only
> >>>> allocated when the ownership over an IOMMU group is taken which means
> >>>> it is only used from outside of the powernv code (such as VFIO).
> >>>>
> >>>> Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
> >>>> ---
> >>>>    arch/powerpc/include/asm/iommu.h          |  6 ++++++
> >>>>    arch/powerpc/kernel/iommu.c               |  7 +++++++
> >>>>    arch/powerpc/platforms/powernv/pci-ioda.c | 23 ++++++++++++++++++++++-
> >>>>    3 files changed, 35 insertions(+), 1 deletion(-)
> >>>>
> >>>> diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
> >>>> index 2c08c91..a768a4d 100644
> >>>> --- a/arch/powerpc/include/asm/iommu.h
> >>>> +++ b/arch/powerpc/include/asm/iommu.h
> >>>> @@ -106,9 +106,15 @@ struct iommu_table {
> >>>>    	unsigned long *it_map;       /* A simple allocation bitmap for now */
> >>>>    	unsigned long  it_page_shift;/* table iommu page size */
> >>>>    	struct iommu_table_group *it_group;
> >>>> +	unsigned long *it_userspace; /* userspace view of the table */
> >>>>    	struct iommu_table_ops *it_ops;
> >>>>    };
> >>>>
> >>>> +#define IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry) \
> >>>> +		((tbl)->it_userspace ? \
> >>>> +			&((tbl)->it_userspace[(entry) - (tbl)->it_offset]) : \
> >>>> +			NULL)
> >>>> +
> >>>>    /* Pure 2^n version of get_order */
> >>>>    static inline __attribute_const__
> >>>>    int get_iommu_order(unsigned long size, struct iommu_table *tbl)
> >>>> diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
> >>>> index 0bcd988..82102d1 100644
> >>>> --- a/arch/powerpc/kernel/iommu.c
> >>>> +++ b/arch/powerpc/kernel/iommu.c
> >>>> @@ -38,6 +38,7 @@
> >>>>    #include <linux/pci.h>
> >>>>    #include <linux/iommu.h>
> >>>>    #include <linux/sched.h>
> >>>> +#include <linux/vmalloc.h>
> >>>>    #include <asm/io.h>
> >>>>    #include <asm/prom.h>
> >>>>    #include <asm/iommu.h>
> >>>> @@ -1069,6 +1070,9 @@ static int iommu_table_take_ownership(struct iommu_table *tbl)
> >>>>    		spin_unlock(&tbl->pools[i].lock);
> >>>>    	spin_unlock_irqrestore(&tbl->large_pool.lock, flags);
> >>>>
> >>>> +	BUG_ON(tbl->it_userspace);
> >>>> +	tbl->it_userspace = vzalloc(sizeof(*tbl->it_userspace) * tbl->it_size);
> >>>> +
> >>>
> >>> -ENOMEM?
> >>>
> >>>>    	return 0;
> >>>>    }
> >>>>
> >>>> @@ -1102,6 +1106,9 @@ static void iommu_table_release_ownership(struct iommu_table *tbl)
> >>>>    {
> >>>>    	unsigned long flags, i, sz = (tbl->it_size + 7) >> 3;
> >>>>
> >>>> +	vfree(tbl->it_userspace);
> >>>> +	tbl->it_userspace = NULL;
> >>>> +
> >>>>    	spin_lock_irqsave(&tbl->large_pool.lock, flags);
> >>>>    	for (i = 0; i < tbl->nr_pools; i++)
> >>>>    		spin_lock(&tbl->pools[i].lock);
> >>>> diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
> >>>> index bc36cf1..036f3c1 100644
> >>>> --- a/arch/powerpc/platforms/powernv/pci-ioda.c
> >>>> +++ b/arch/powerpc/platforms/powernv/pci-ioda.c
> >>>> @@ -26,6 +26,7 @@
> >>>>    #include <linux/iommu.h>
> >>>>    #include <linux/mmzone.h>
> >>>>    #include <linux/sizes.h>
> >>>> +#include <linux/vmalloc.h>
> >>>>
> >>>>    #include <asm/mmzone.h>
> >>>>    #include <asm/sections.h>
> >>>> @@ -1469,6 +1470,9 @@ static void pnv_pci_free_table(struct iommu_table *tbl)
> >>>>    	if (!tbl->it_size)
> >>>>    		return;
> >>>>
> >>>> +	if (tbl->it_userspace)
> >>>
> >>> Not necessary
> >>
> >> Out of curiosity - why? Is every single implementation is known for
> >> checking the argument?
> >
> > AFAIK, all flavors of free in the kernel accept NULL pointers and do the
> > right thing.  I verified this one does too.
> >
> >>>> +		vfree(tbl->it_userspace);
> >>>> +
> >>>
> >>> Why no NULL setting this time?
> >>
> >> iommu_reset_table() (2 lines below) will do memset(0) on the entire struct.
> >
> > So then should iommu_reset_table() handle the vfree() as well?
> 
> 
> I wanted to keep vfree() in the same file with vzalloc(). Bad idea?
> But I'll move vfree() to iommu_reset_table() anyway.

No, that's probably a good idea to have some locality between the alloc
and free.  If it doesn't make sense, don't do it.

> >>>>    	pnv_free_tce_table(tbl->it_base, size, tbl->it_indirect_levels);
> >>>>    	iommu_reset_table(tbl, "ioda2");
> >>>>    }
> >>>> @@ -1656,9 +1660,26 @@ static void pnv_ioda2_set_ownership(struct iommu_table_group *table_group,
> >>>>    	pnv_pci_ioda2_set_bypass(pe, !enable);
> >>>>    }
> >>>>
> >>>> +static long pnv_pci_ioda2_create_table_with_uas(
> >>>> +		struct iommu_table_group *table_group,
> >>>> +		int num, __u32 page_shift, __u64 window_size, __u32 levels,
> >>>> +		struct iommu_table *tbl)
> >>>> +{
> >>>> +	long ret = pnv_pci_ioda2_create_table(table_group, num,
> >>>> +			page_shift, window_size, levels, tbl);
> >>>> +
> >>>> +	if (ret)
> >>>> +		return ret;
> >>>> +
> >>>> +	BUG_ON(tbl->it_userspace);
> >>>> +	tbl->it_userspace = vzalloc(sizeof(*tbl->it_userspace) * tbl->it_size);
> >>>
> >>> -ENOMEM
> >>>
> >>>> +
> >>>> +	return 0;
> >>>> +}
> >>>> +
> >>>>    static struct iommu_table_group_ops pnv_pci_ioda2_ops = {
> >>>>    	.set_ownership = pnv_ioda2_set_ownership,
> >>>> -	.create_table = pnv_pci_ioda2_create_table,
> >>>> +	.create_table = pnv_pci_ioda2_create_table_with_uas,
> >>>>    	.set_window = pnv_pci_ioda2_set_window,
> >>>>    	.unset_window = pnv_pci_ioda2_unset_window,
> >>>>    };
> >>>
> >>>
> >>>
> >>
> >>
> >> Thanks for the review! What is overall resume? Another respin?
> >
> > Is there another option?  It seems like there are too many issues to
> > simply fold cleanups onto the end of the series.  Thanks,
> 
> I'll repost indeed, I meant it would help me if you could tell that you 
> agree with the patchset and Ben can pull this stuff to his tree. Thanks!

It doesn't seem like there's much choice in the matter, it's hard to
define the API that's being used between the vfio code and the powerpc
code and the patch series is completely intermixed between the two.  I
haven't found any blocker bugs in this version, but it will take at
least another spin to get an ack.  Thanks,

Alex
diff mbox

Patch

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index 2c08c91..a768a4d 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -106,9 +106,15 @@  struct iommu_table {
 	unsigned long *it_map;       /* A simple allocation bitmap for now */
 	unsigned long  it_page_shift;/* table iommu page size */
 	struct iommu_table_group *it_group;
+	unsigned long *it_userspace; /* userspace view of the table */
 	struct iommu_table_ops *it_ops;
 };
 
+#define IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry) \
+		((tbl)->it_userspace ? \
+			&((tbl)->it_userspace[(entry) - (tbl)->it_offset]) : \
+			NULL)
+
 /* Pure 2^n version of get_order */
 static inline __attribute_const__
 int get_iommu_order(unsigned long size, struct iommu_table *tbl)
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index 0bcd988..82102d1 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -38,6 +38,7 @@ 
 #include <linux/pci.h>
 #include <linux/iommu.h>
 #include <linux/sched.h>
+#include <linux/vmalloc.h>
 #include <asm/io.h>
 #include <asm/prom.h>
 #include <asm/iommu.h>
@@ -1069,6 +1070,9 @@  static int iommu_table_take_ownership(struct iommu_table *tbl)
 		spin_unlock(&tbl->pools[i].lock);
 	spin_unlock_irqrestore(&tbl->large_pool.lock, flags);
 
+	BUG_ON(tbl->it_userspace);
+	tbl->it_userspace = vzalloc(sizeof(*tbl->it_userspace) * tbl->it_size);
+
 	return 0;
 }
 
@@ -1102,6 +1106,9 @@  static void iommu_table_release_ownership(struct iommu_table *tbl)
 {
 	unsigned long flags, i, sz = (tbl->it_size + 7) >> 3;
 
+	vfree(tbl->it_userspace);
+	tbl->it_userspace = NULL;
+
 	spin_lock_irqsave(&tbl->large_pool.lock, flags);
 	for (i = 0; i < tbl->nr_pools; i++)
 		spin_lock(&tbl->pools[i].lock);
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index bc36cf1..036f3c1 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -26,6 +26,7 @@ 
 #include <linux/iommu.h>
 #include <linux/mmzone.h>
 #include <linux/sizes.h>
+#include <linux/vmalloc.h>
 
 #include <asm/mmzone.h>
 #include <asm/sections.h>
@@ -1469,6 +1470,9 @@  static void pnv_pci_free_table(struct iommu_table *tbl)
 	if (!tbl->it_size)
 		return;
 
+	if (tbl->it_userspace)
+		vfree(tbl->it_userspace);
+
 	pnv_free_tce_table(tbl->it_base, size, tbl->it_indirect_levels);
 	iommu_reset_table(tbl, "ioda2");
 }
@@ -1656,9 +1660,26 @@  static void pnv_ioda2_set_ownership(struct iommu_table_group *table_group,
 	pnv_pci_ioda2_set_bypass(pe, !enable);
 }
 
+static long pnv_pci_ioda2_create_table_with_uas(
+		struct iommu_table_group *table_group,
+		int num, __u32 page_shift, __u64 window_size, __u32 levels,
+		struct iommu_table *tbl)
+{
+	long ret = pnv_pci_ioda2_create_table(table_group, num,
+			page_shift, window_size, levels, tbl);
+
+	if (ret)
+		return ret;
+
+	BUG_ON(tbl->it_userspace);
+	tbl->it_userspace = vzalloc(sizeof(*tbl->it_userspace) * tbl->it_size);
+
+	return 0;
+}
+
 static struct iommu_table_group_ops pnv_pci_ioda2_ops = {
 	.set_ownership = pnv_ioda2_set_ownership,
-	.create_table = pnv_pci_ioda2_create_table,
+	.create_table = pnv_pci_ioda2_create_table_with_uas,
 	.set_window = pnv_pci_ioda2_set_window,
 	.unset_window = pnv_pci_ioda2_unset_window,
 };