diff mbox series

[v2,1/1] powerpc/iommu: Enable remaining IOMMU Pagesizes present in LoPAR

Message ID 20210407195613.131140-1-leobras.c@gmail.com (mailing list archive)
State Superseded, archived
Headers show
Series [v2,1/1] powerpc/iommu: Enable remaining IOMMU Pagesizes present in LoPAR | expand
Related show

Checks

Context Check Description
snowpatch_ozlabs/apply_patch success Successfully applied on branch powerpc/merge (571b0d1ccf5cd3dc1b9866a908769ee23f7d127e)
snowpatch_ozlabs/build-ppc64le fail Build failed!
snowpatch_ozlabs/build-ppc64be fail Build failed!
snowpatch_ozlabs/build-ppc64e success Build succeeded
snowpatch_ozlabs/build-pmac32 success Build succeeded
snowpatch_ozlabs/checkpatch success total: 0 errors, 0 warnings, 0 checks, 67 lines checked
snowpatch_ozlabs/needsstable success Patch has no Fixes tags

Commit Message

Leonardo Brás April 7, 2021, 7:56 p.m. UTC
According to LoPAR, ibm,query-pe-dma-window output named "IO Page Sizes"
will let the OS know all possible pagesizes that can be used for creating a
new DDW.

Currently Linux will only try using 3 of the 8 available options:
4K, 64K and 16M. According to LoPAR, Hypervisor may also offer 32M, 64M,
128M, 256M and 16G.

Enabling bigger pages would be interesting for direct mapping systems
with a lot of RAM, while using less TCE entries.

Signed-off-by: Leonardo Bras <leobras.c@gmail.com>
---
 arch/powerpc/platforms/pseries/iommu.c | 49 ++++++++++++++++++++++----
 1 file changed, 42 insertions(+), 7 deletions(-)

Comments

kernel test robot April 8, 2021, 12:22 a.m. UTC | #1
Hi Leonardo,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on powerpc/next]
[also build test WARNING on v5.12-rc6 next-20210407]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/Leonardo-Bras/powerpc-iommu-Enable-remaining-IOMMU-Pagesizes-present-in-LoPAR/20210408-035800
base:   https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git next
config: powerpc-allyesconfig (attached as .config)
compiler: powerpc64-linux-gcc (GCC) 9.3.0
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/0day-ci/linux/commit/faa8b10e5b9652dbd56ed8e759a1cc09b95805be
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review Leonardo-Bras/powerpc-iommu-Enable-remaining-IOMMU-Pagesizes-present-in-LoPAR/20210408-035800
        git checkout faa8b10e5b9652dbd56ed8e759a1cc09b95805be
        # save the attached .config to linux build tree
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-9.3.0 make.cross ARCH=powerpc 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All warnings (new ones prefixed by >>):

   In file included from include/vdso/const.h:5,
                    from include/linux/const.h:4,
                    from include/linux/bits.h:5,
                    from include/linux/bitops.h:6,
                    from include/linux/kernel.h:11,
                    from include/asm-generic/bug.h:20,
                    from arch/powerpc/include/asm/bug.h:109,
                    from include/linux/bug.h:5,
                    from include/linux/mmdebug.h:5,
                    from include/linux/gfp.h:5,
                    from include/linux/slab.h:15,
                    from arch/powerpc/platforms/pseries/iommu.c:15:
   arch/powerpc/platforms/pseries/iommu.c: In function 'iommu_get_page_shift':
>> include/uapi/linux/const.h:20:19: warning: conversion from 'long long unsigned int' to 'unsigned int' changes value from '17179869184' to '0' [-Woverflow]
      20 | #define __AC(X,Y) (X##Y)
         |                   ^~~~~~
   include/uapi/linux/const.h:21:18: note: in expansion of macro '__AC'
      21 | #define _AC(X,Y) __AC(X,Y)
         |                  ^~~~
   include/linux/sizes.h:48:19: note: in expansion of macro '_AC'
      48 | #define SZ_16G    _AC(0x400000000, ULL)
         |                   ^~~
   arch/powerpc/platforms/pseries/iommu.c:1120:42: note: in expansion of macro 'SZ_16G'
    1120 |   { QUERY_DDW_PGSIZE_16G,  __builtin_ctz(SZ_16G)  },
         |                                          ^~~~~~


vim +20 include/uapi/linux/const.h

9d291e787b2b71 include/asm-x86_64/const.h Vivek Goyal         2007-05-02   6  
9d291e787b2b71 include/asm-x86_64/const.h Vivek Goyal         2007-05-02   7  /* Some constant macros are used in both assembler and
9d291e787b2b71 include/asm-x86_64/const.h Vivek Goyal         2007-05-02   8   * C code.  Therefore we cannot annotate them always with
6df95fd7ad9a84 include/linux/const.h      Randy Dunlap        2007-05-08   9   * 'UL' and other type specifiers unilaterally.  We
9d291e787b2b71 include/asm-x86_64/const.h Vivek Goyal         2007-05-02  10   * use the following macros to deal with this.
74ef649fe847fd include/linux/const.h      Jeremy Fitzhardinge 2008-01-30  11   *
74ef649fe847fd include/linux/const.h      Jeremy Fitzhardinge 2008-01-30  12   * Similarly, _AT() will cast an expression with a type in C, but
74ef649fe847fd include/linux/const.h      Jeremy Fitzhardinge 2008-01-30  13   * leave it unchanged in asm.
9d291e787b2b71 include/asm-x86_64/const.h Vivek Goyal         2007-05-02  14   */
9d291e787b2b71 include/asm-x86_64/const.h Vivek Goyal         2007-05-02  15  
9d291e787b2b71 include/asm-x86_64/const.h Vivek Goyal         2007-05-02  16  #ifdef __ASSEMBLY__
9d291e787b2b71 include/asm-x86_64/const.h Vivek Goyal         2007-05-02  17  #define _AC(X,Y)	X
74ef649fe847fd include/linux/const.h      Jeremy Fitzhardinge 2008-01-30  18  #define _AT(T,X)	X
9d291e787b2b71 include/asm-x86_64/const.h Vivek Goyal         2007-05-02  19  #else
9d291e787b2b71 include/asm-x86_64/const.h Vivek Goyal         2007-05-02 @20  #define __AC(X,Y)	(X##Y)
9d291e787b2b71 include/asm-x86_64/const.h Vivek Goyal         2007-05-02  21  #define _AC(X,Y)	__AC(X,Y)
74ef649fe847fd include/linux/const.h      Jeremy Fitzhardinge 2008-01-30  22  #define _AT(T,X)	((T)(X))
9d291e787b2b71 include/asm-x86_64/const.h Vivek Goyal         2007-05-02  23  #endif
9d291e787b2b71 include/asm-x86_64/const.h Vivek Goyal         2007-05-02  24  

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org
Michael Ellerman April 8, 2021, 5:37 a.m. UTC | #2
Leonardo Bras <leobras.c@gmail.com> writes:
> According to LoPAR, ibm,query-pe-dma-window output named "IO Page Sizes"
> will let the OS know all possible pagesizes that can be used for creating a
> new DDW.
>
> Currently Linux will only try using 3 of the 8 available options:
> 4K, 64K and 16M. According to LoPAR, Hypervisor may also offer 32M, 64M,
> 128M, 256M and 16G.

Do we know of any hardware & hypervisor combination that will actually
give us bigger pages?

> Enabling bigger pages would be interesting for direct mapping systems
> with a lot of RAM, while using less TCE entries.
>
> Signed-off-by: Leonardo Bras <leobras.c@gmail.com>
> ---
>  arch/powerpc/platforms/pseries/iommu.c | 49 ++++++++++++++++++++++----
>  1 file changed, 42 insertions(+), 7 deletions(-)
>
> diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
> index 9fc5217f0c8e..6cda1c92597d 100644
> --- a/arch/powerpc/platforms/pseries/iommu.c
> +++ b/arch/powerpc/platforms/pseries/iommu.c
> @@ -53,6 +53,20 @@ enum {
>  	DDW_EXT_QUERY_OUT_SIZE = 2
>  };

A comment saying where the values come from would be good.

> +#define QUERY_DDW_PGSIZE_4K	0x01
> +#define QUERY_DDW_PGSIZE_64K	0x02
> +#define QUERY_DDW_PGSIZE_16M	0x04
> +#define QUERY_DDW_PGSIZE_32M	0x08
> +#define QUERY_DDW_PGSIZE_64M	0x10
> +#define QUERY_DDW_PGSIZE_128M	0x20
> +#define QUERY_DDW_PGSIZE_256M	0x40
> +#define QUERY_DDW_PGSIZE_16G	0x80

I'm not sure the #defines really gain us much vs just putting the
literal values in the array below?

> +struct iommu_ddw_pagesize {
> +	u32 mask;
> +	int shift;
> +};
> +
>  static struct iommu_table_group *iommu_pseries_alloc_group(int node)
>  {
>  	struct iommu_table_group *table_group;
> @@ -1099,6 +1113,31 @@ static void reset_dma_window(struct pci_dev *dev, struct device_node *par_dn)
>  			 ret);
>  }
>  
> +/* Returns page shift based on "IO Page Sizes" output at ibm,query-pe-dma-window. See LoPAR */
> +static int iommu_get_page_shift(u32 query_page_size)
> +{
> +	const struct iommu_ddw_pagesize ddw_pagesize[] = {
> +		{ QUERY_DDW_PGSIZE_16G,  __builtin_ctz(SZ_16G)  },
> +		{ QUERY_DDW_PGSIZE_256M, __builtin_ctz(SZ_256M) },
> +		{ QUERY_DDW_PGSIZE_128M, __builtin_ctz(SZ_128M) },
> +		{ QUERY_DDW_PGSIZE_64M,  __builtin_ctz(SZ_64M)  },
> +		{ QUERY_DDW_PGSIZE_32M,  __builtin_ctz(SZ_32M)  },
> +		{ QUERY_DDW_PGSIZE_16M,  __builtin_ctz(SZ_16M)  },
> +		{ QUERY_DDW_PGSIZE_64K,  __builtin_ctz(SZ_64K)  },
> +		{ QUERY_DDW_PGSIZE_4K,   __builtin_ctz(SZ_4K)   }
> +	};


cheers
Leonardo Brás April 8, 2021, 6:20 a.m. UTC | #3
Hello Michael, thank you for this feedback!
Comments inline:

On Thu, 2021-04-08 at 15:37 +1000, Michael Ellerman wrote:
> Leonardo Bras <leobras.c@gmail.com> writes:
> > According to LoPAR, ibm,query-pe-dma-window output named "IO Page Sizes"
> > will let the OS know all possible pagesizes that can be used for creating a
> > new DDW.
> > 
> > Currently Linux will only try using 3 of the 8 available options:
> > 4K, 64K and 16M. According to LoPAR, Hypervisor may also offer 32M, 64M,
> > 128M, 256M and 16G.
> 
> Do we know of any hardware & hypervisor combination that will actually
> give us bigger pages?
> 
> > Enabling bigger pages would be interesting for direct mapping systems
> > with a lot of RAM, while using less TCE entries.
> > 
> > Signed-off-by: Leonardo Bras <leobras.c@gmail.com>
> > ---
> >  arch/powerpc/platforms/pseries/iommu.c | 49 ++++++++++++++++++++++----
> >  1 file changed, 42 insertions(+), 7 deletions(-)
> > 
> > diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
> > index 9fc5217f0c8e..6cda1c92597d 100644
> > --- a/arch/powerpc/platforms/pseries/iommu.c
> > +++ b/arch/powerpc/platforms/pseries/iommu.c
> > @@ -53,6 +53,20 @@ enum {
> >  	DDW_EXT_QUERY_OUT_SIZE = 2
> >  };
> 
> A comment saying where the values come from would be good.

Sure, I will add the information about LoPAR.

> 
> > +#define QUERY_DDW_PGSIZE_4K	0x01
> > +#define QUERY_DDW_PGSIZE_64K	0x02
> > +#define QUERY_DDW_PGSIZE_16M	0x04
> > +#define QUERY_DDW_PGSIZE_32M	0x08
> > +#define QUERY_DDW_PGSIZE_64M	0x10
> > +#define QUERY_DDW_PGSIZE_128M	0x20
> > +#define QUERY_DDW_PGSIZE_256M	0x40
> > +#define QUERY_DDW_PGSIZE_16G	0x80
> 
> I'm not sure the #defines really gain us much vs just putting the
> literal values in the array below?

My v1 did not use the define approach, what do you think of that?
http://patchwork.ozlabs.org/project/linuxppc-dev/patch/20210322190943.715368-1-leobras.c@gmail.com/

> 
> > +struct iommu_ddw_pagesize {
> > +	u32 mask;
> > +	int shift;
> > +};
> > +
> >  static struct iommu_table_group *iommu_pseries_alloc_group(int node)
> >  {
> >  	struct iommu_table_group *table_group;
> > @@ -1099,6 +1113,31 @@ static void reset_dma_window(struct pci_dev *dev, struct device_node *par_dn)
> >  			 ret);
> >  }
> >  
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > 
> > +/* Returns page shift based on "IO Page Sizes" output at ibm,query-pe-dma-window. See LoPAR */
> > +static int iommu_get_page_shift(u32 query_page_size)
> > +{
> > +	const struct iommu_ddw_pagesize ddw_pagesize[] = {
> > +		{ QUERY_DDW_PGSIZE_16G,  __builtin_ctz(SZ_16G)  },
> > +		{ QUERY_DDW_PGSIZE_256M, __builtin_ctz(SZ_256M) },
> > +		{ QUERY_DDW_PGSIZE_128M, __builtin_ctz(SZ_128M) },
> > +		{ QUERY_DDW_PGSIZE_64M,  __builtin_ctz(SZ_64M)  },
> > +		{ QUERY_DDW_PGSIZE_32M,  __builtin_ctz(SZ_32M)  },
> > +		{ QUERY_DDW_PGSIZE_16M,  __builtin_ctz(SZ_16M)  },
> > +		{ QUERY_DDW_PGSIZE_64K,  __builtin_ctz(SZ_64K)  },
> > +		{ QUERY_DDW_PGSIZE_4K,   __builtin_ctz(SZ_4K)   }
> > +	};
> 
> 
> cheers

Best regards,
Leonardo Bras
Leonardo Brás April 8, 2021, 6:35 a.m. UTC | #4
On Thu, 2021-04-08 at 03:20 -0300, Leonardo Bras wrote:
> > > +#define QUERY_DDW_PGSIZE_4K	0x01
> > > +#define QUERY_DDW_PGSIZE_64K	0x02
> > > +#define QUERY_DDW_PGSIZE_16M	0x04
> > > +#define QUERY_DDW_PGSIZE_32M	0x08
> > > +#define QUERY_DDW_PGSIZE_64M	0x10
> > > +#define QUERY_DDW_PGSIZE_128M	0x20
> > > +#define QUERY_DDW_PGSIZE_256M	0x40
> > > +#define QUERY_DDW_PGSIZE_16G	0x80
> > 
> > I'm not sure the #defines really gain us much vs just putting the
> > literal values in the array below?
> 
> My v1 did not use the define approach, what do you think of that?
> http://patchwork.ozlabs.org/project/linuxppc-dev/patch/20210322190943.715368-1-leobras.c@gmail.com/
> 
> 
(of course, it would be that without the pageshift defines also, using
the __builtin_ctz() approach suggested by Alexey.)
Alexey Kardashevskiy April 8, 2021, 7:13 a.m. UTC | #5
On 08/04/2021 15:37, Michael Ellerman wrote:
> Leonardo Bras <leobras.c@gmail.com> writes:
>> According to LoPAR, ibm,query-pe-dma-window output named "IO Page Sizes"
>> will let the OS know all possible pagesizes that can be used for creating a
>> new DDW.
>>
>> Currently Linux will only try using 3 of the 8 available options:
>> 4K, 64K and 16M. According to LoPAR, Hypervisor may also offer 32M, 64M,
>> 128M, 256M and 16G.
> 
> Do we know of any hardware & hypervisor combination that will actually
> give us bigger pages?


On P8 16MB host pages and 16MB hardware iommu pages worked.

On P9, VM's 16MB IOMMU pages worked on top of 2MB host pages + 2MB 
hardware IOMMU pages.


> 
>> Enabling bigger pages would be interesting for direct mapping systems
>> with a lot of RAM, while using less TCE entries.
>>
>> Signed-off-by: Leonardo Bras <leobras.c@gmail.com>
>> ---
>>   arch/powerpc/platforms/pseries/iommu.c | 49 ++++++++++++++++++++++----
>>   1 file changed, 42 insertions(+), 7 deletions(-)
>>
>> diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
>> index 9fc5217f0c8e..6cda1c92597d 100644
>> --- a/arch/powerpc/platforms/pseries/iommu.c
>> +++ b/arch/powerpc/platforms/pseries/iommu.c
>> @@ -53,6 +53,20 @@ enum {
>>   	DDW_EXT_QUERY_OUT_SIZE = 2
>>   };
> 
> A comment saying where the values come from would be good.
> 
>> +#define QUERY_DDW_PGSIZE_4K	0x01
>> +#define QUERY_DDW_PGSIZE_64K	0x02
>> +#define QUERY_DDW_PGSIZE_16M	0x04
>> +#define QUERY_DDW_PGSIZE_32M	0x08
>> +#define QUERY_DDW_PGSIZE_64M	0x10
>> +#define QUERY_DDW_PGSIZE_128M	0x20
>> +#define QUERY_DDW_PGSIZE_256M	0x40
>> +#define QUERY_DDW_PGSIZE_16G	0x80
> 
> I'm not sure the #defines really gain us much vs just putting the
> literal values in the array below?


Then someone says "uuuuu magic values" :) I do not mind either way. Thanks,



>> +struct iommu_ddw_pagesize {
>> +	u32 mask;
>> +	int shift;
>> +};
>> +
>>   static struct iommu_table_group *iommu_pseries_alloc_group(int node)
>>   {
>>   	struct iommu_table_group *table_group;
>> @@ -1099,6 +1113,31 @@ static void reset_dma_window(struct pci_dev *dev, struct device_node *par_dn)
>>   			 ret);
>>   }
>>   
>> +/* Returns page shift based on "IO Page Sizes" output at ibm,query-pe-dma-window. See LoPAR */
>> +static int iommu_get_page_shift(u32 query_page_size)
>> +{
>> +	const struct iommu_ddw_pagesize ddw_pagesize[] = {
>> +		{ QUERY_DDW_PGSIZE_16G,  __builtin_ctz(SZ_16G)  },
>> +		{ QUERY_DDW_PGSIZE_256M, __builtin_ctz(SZ_256M) },
>> +		{ QUERY_DDW_PGSIZE_128M, __builtin_ctz(SZ_128M) },
>> +		{ QUERY_DDW_PGSIZE_64M,  __builtin_ctz(SZ_64M)  },
>> +		{ QUERY_DDW_PGSIZE_32M,  __builtin_ctz(SZ_32M)  },
>> +		{ QUERY_DDW_PGSIZE_16M,  __builtin_ctz(SZ_16M)  },
>> +		{ QUERY_DDW_PGSIZE_64K,  __builtin_ctz(SZ_64K)  },
>> +		{ QUERY_DDW_PGSIZE_4K,   __builtin_ctz(SZ_4K)   }
>> +	};
> 
> 
> cheers
>
kernel test robot April 8, 2021, 7:48 a.m. UTC | #6
Hi Leonardo,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on powerpc/next]
[also build test ERROR on v5.12-rc6 next-20210407]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/Leonardo-Bras/powerpc-iommu-Enable-remaining-IOMMU-Pagesizes-present-in-LoPAR/20210408-035800
base:   https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git next
config: powerpc-randconfig-r016-20210407 (attached as .config)
compiler: powerpc64-linux-gcc (GCC) 9.3.0
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/0day-ci/linux/commit/faa8b10e5b9652dbd56ed8e759a1cc09b95805be
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review Leonardo-Bras/powerpc-iommu-Enable-remaining-IOMMU-Pagesizes-present-in-LoPAR/20210408-035800
        git checkout faa8b10e5b9652dbd56ed8e759a1cc09b95805be
        # save the attached .config to linux build tree
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-9.3.0 make.cross ARCH=powerpc 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

   In file included from include/vdso/const.h:5,
                    from include/linux/const.h:4,
                    from include/linux/bits.h:5,
                    from include/linux/bitops.h:6,
                    from include/linux/kernel.h:11,
                    from include/asm-generic/bug.h:20,
                    from arch/powerpc/include/asm/bug.h:109,
                    from include/linux/bug.h:5,
                    from include/linux/mmdebug.h:5,
                    from include/linux/gfp.h:5,
                    from include/linux/slab.h:15,
                    from arch/powerpc/platforms/pseries/iommu.c:15:
   arch/powerpc/platforms/pseries/iommu.c: In function 'iommu_get_page_shift':
>> include/uapi/linux/const.h:20:19: error: conversion from 'long long unsigned int' to 'unsigned int' changes value from '17179869184' to '0' [-Werror=overflow]
      20 | #define __AC(X,Y) (X##Y)
         |                   ^~~~~~
   include/uapi/linux/const.h:21:18: note: in expansion of macro '__AC'
      21 | #define _AC(X,Y) __AC(X,Y)
         |                  ^~~~
   include/linux/sizes.h:48:19: note: in expansion of macro '_AC'
      48 | #define SZ_16G    _AC(0x400000000, ULL)
         |                   ^~~
   arch/powerpc/platforms/pseries/iommu.c:1120:42: note: in expansion of macro 'SZ_16G'
    1120 |   { QUERY_DDW_PGSIZE_16G,  __builtin_ctz(SZ_16G)  },
         |                                          ^~~~~~
   cc1: all warnings being treated as errors


vim +20 include/uapi/linux/const.h

9d291e787b2b71 include/asm-x86_64/const.h Vivek Goyal         2007-05-02   6  
9d291e787b2b71 include/asm-x86_64/const.h Vivek Goyal         2007-05-02   7  /* Some constant macros are used in both assembler and
9d291e787b2b71 include/asm-x86_64/const.h Vivek Goyal         2007-05-02   8   * C code.  Therefore we cannot annotate them always with
6df95fd7ad9a84 include/linux/const.h      Randy Dunlap        2007-05-08   9   * 'UL' and other type specifiers unilaterally.  We
9d291e787b2b71 include/asm-x86_64/const.h Vivek Goyal         2007-05-02  10   * use the following macros to deal with this.
74ef649fe847fd include/linux/const.h      Jeremy Fitzhardinge 2008-01-30  11   *
74ef649fe847fd include/linux/const.h      Jeremy Fitzhardinge 2008-01-30  12   * Similarly, _AT() will cast an expression with a type in C, but
74ef649fe847fd include/linux/const.h      Jeremy Fitzhardinge 2008-01-30  13   * leave it unchanged in asm.
9d291e787b2b71 include/asm-x86_64/const.h Vivek Goyal         2007-05-02  14   */
9d291e787b2b71 include/asm-x86_64/const.h Vivek Goyal         2007-05-02  15  
9d291e787b2b71 include/asm-x86_64/const.h Vivek Goyal         2007-05-02  16  #ifdef __ASSEMBLY__
9d291e787b2b71 include/asm-x86_64/const.h Vivek Goyal         2007-05-02  17  #define _AC(X,Y)	X
74ef649fe847fd include/linux/const.h      Jeremy Fitzhardinge 2008-01-30  18  #define _AT(T,X)	X
9d291e787b2b71 include/asm-x86_64/const.h Vivek Goyal         2007-05-02  19  #else
9d291e787b2b71 include/asm-x86_64/const.h Vivek Goyal         2007-05-02 @20  #define __AC(X,Y)	(X##Y)
9d291e787b2b71 include/asm-x86_64/const.h Vivek Goyal         2007-05-02  21  #define _AC(X,Y)	__AC(X,Y)
74ef649fe847fd include/linux/const.h      Jeremy Fitzhardinge 2008-01-30  22  #define _AT(T,X)	((T)(X))
9d291e787b2b71 include/asm-x86_64/const.h Vivek Goyal         2007-05-02  23  #endif
9d291e787b2b71 include/asm-x86_64/const.h Vivek Goyal         2007-05-02  24  

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org
Michael Ellerman April 8, 2021, 9:04 a.m. UTC | #7
Alexey Kardashevskiy <aik@ozlabs.ru> writes:
> On 08/04/2021 15:37, Michael Ellerman wrote:
>> Leonardo Bras <leobras.c@gmail.com> writes:
>>> According to LoPAR, ibm,query-pe-dma-window output named "IO Page Sizes"
>>> will let the OS know all possible pagesizes that can be used for creating a
>>> new DDW.
>>>
>>> Currently Linux will only try using 3 of the 8 available options:
>>> 4K, 64K and 16M. According to LoPAR, Hypervisor may also offer 32M, 64M,
>>> 128M, 256M and 16G.
>> 
>> Do we know of any hardware & hypervisor combination that will actually
>> give us bigger pages?
>
>
> On P8 16MB host pages and 16MB hardware iommu pages worked.
>
> On P9, VM's 16MB IOMMU pages worked on top of 2MB host pages + 2MB 
> hardware IOMMU pages.

The current code already tries 16MB though.

I'm wondering if we're going to ask for larger sizes that have never
been tested and possibly expose bugs. But it sounds like this is mainly
targeted at future platforms.


>>> diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
>>> index 9fc5217f0c8e..6cda1c92597d 100644
>>> --- a/arch/powerpc/platforms/pseries/iommu.c
>>> +++ b/arch/powerpc/platforms/pseries/iommu.c
>>> @@ -53,6 +53,20 @@ enum {
>>>   	DDW_EXT_QUERY_OUT_SIZE = 2
>>>   };
>> 
>> A comment saying where the values come from would be good.
>> 
>>> +#define QUERY_DDW_PGSIZE_4K	0x01
>>> +#define QUERY_DDW_PGSIZE_64K	0x02
>>> +#define QUERY_DDW_PGSIZE_16M	0x04
>>> +#define QUERY_DDW_PGSIZE_32M	0x08
>>> +#define QUERY_DDW_PGSIZE_64M	0x10
>>> +#define QUERY_DDW_PGSIZE_128M	0x20
>>> +#define QUERY_DDW_PGSIZE_256M	0x40
>>> +#define QUERY_DDW_PGSIZE_16G	0x80
>> 
>> I'm not sure the #defines really gain us much vs just putting the
>> literal values in the array below?
>
> Then someone says "uuuuu magic values" :) I do not mind either way. Thanks,

Yeah that's true. But #defining them doesn't make them less magic, if
you only use them in one place :)

cheers
Michael Ellerman April 8, 2021, 9:08 a.m. UTC | #8
Leonardo Bras <leobras.c@gmail.com> writes:
> On Thu, 2021-04-08 at 03:20 -0300, Leonardo Bras wrote:
>> > > +#define QUERY_DDW_PGSIZE_4K	0x01
>> > > +#define QUERY_DDW_PGSIZE_64K	0x02
>> > > +#define QUERY_DDW_PGSIZE_16M	0x04
>> > > +#define QUERY_DDW_PGSIZE_32M	0x08
>> > > +#define QUERY_DDW_PGSIZE_64M	0x10
>> > > +#define QUERY_DDW_PGSIZE_128M	0x20
>> > > +#define QUERY_DDW_PGSIZE_256M	0x40
>> > > +#define QUERY_DDW_PGSIZE_16G	0x80
>> > 
>> > I'm not sure the #defines really gain us much vs just putting the
>> > literal values in the array below?
>> 
>> My v1 did not use the define approach, what do you think of that?
>> http://patchwork.ozlabs.org/project/linuxppc-dev/patch/20210322190943.715368-1-leobras.c@gmail.com/
>> 
>> 
> (of course, it would be that without the pageshift defines also, using
> the __builtin_ctz() approach suggested by Alexey.)

Yeah I think I like that better.

cheers
Alexey Kardashevskiy April 9, 2021, 4:36 a.m. UTC | #9
On 08/04/2021 19:04, Michael Ellerman wrote:
> Alexey Kardashevskiy <aik@ozlabs.ru> writes:
>> On 08/04/2021 15:37, Michael Ellerman wrote:
>>> Leonardo Bras <leobras.c@gmail.com> writes:
>>>> According to LoPAR, ibm,query-pe-dma-window output named "IO Page Sizes"
>>>> will let the OS know all possible pagesizes that can be used for creating a
>>>> new DDW.
>>>>
>>>> Currently Linux will only try using 3 of the 8 available options:
>>>> 4K, 64K and 16M. According to LoPAR, Hypervisor may also offer 32M, 64M,
>>>> 128M, 256M and 16G.
>>>
>>> Do we know of any hardware & hypervisor combination that will actually
>>> give us bigger pages?
>>
>>
>> On P8 16MB host pages and 16MB hardware iommu pages worked.
>>
>> On P9, VM's 16MB IOMMU pages worked on top of 2MB host pages + 2MB
>> hardware IOMMU pages.
> 
> The current code already tries 16MB though.
> 
> I'm wondering if we're going to ask for larger sizes that have never
> been tested and possibly expose bugs. But it sounds like this is mainly
> targeted at future platforms.


I tried for fun to pass through a PCI device to a guest with this patch as:

pbuild/qemu-killslof-aiku1904le-ppc64/qemu-system-ppc64 \
-nodefaults \
-chardev stdio,id=STDIO0,signal=off,mux=on \
-device spapr-vty,id=svty0,reg=0x71000110,chardev=STDIO0 \
-mon id=MON0,chardev=STDIO0,mode=readline \
-nographic \
-vga none \
-enable-kvm \
-m 16G \
-kernel ./vmldbg \
-initrd /home/aik/t/le.cpio \
-device vfio-pci,id=vfio0001_01_00_0,host=0001:01:00.0 \
-mem-prealloc \
-mem-path qemu_hp_1G_node0 \
-global spapr-pci-host-bridge.pgsz=0xffffff000 \
-machine cap-cfpc=broken,cap-ccf-assist=off \
-smp 1,threads=1 \
-L /home/aik/t/qemu-ppc64-bios/ \
-trace events=qemu_trace_events \
-d guest_errors,mmu \
-chardev socket,id=SOCKET0,server=on,wait=off,path=qemu.mon.1_1_0_0 \
-mon chardev=SOCKET0,mode=control


The guest created a huge window:

xhci_hcd 0000:00:00.0: ibm,create-pe-dma-window(2027) 0 8000000 20000000 
22 22 returned 0 (liobn = 0x80000001 starting addr = 8000000 0)

The first "22" is page_shift in hex (16GB), the second "22" is 
window_shift (so we have 1 TCE).

On the host side the window#1 was created with 1GB pages:
pci 0001:01     : [PE# fd] Setting up window#1 
800000000000000..80007ffffffffff pg=40000000


The XHCI seems working. Without the patch 16MB was the maximum.


> 
>>>> diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
>>>> index 9fc5217f0c8e..6cda1c92597d 100644
>>>> --- a/arch/powerpc/platforms/pseries/iommu.c
>>>> +++ b/arch/powerpc/platforms/pseries/iommu.c
>>>> @@ -53,6 +53,20 @@ enum {
>>>>    	DDW_EXT_QUERY_OUT_SIZE = 2
>>>>    };
>>>
>>> A comment saying where the values come from would be good.
>>>
>>>> +#define QUERY_DDW_PGSIZE_4K	0x01
>>>> +#define QUERY_DDW_PGSIZE_64K	0x02
>>>> +#define QUERY_DDW_PGSIZE_16M	0x04
>>>> +#define QUERY_DDW_PGSIZE_32M	0x08
>>>> +#define QUERY_DDW_PGSIZE_64M	0x10
>>>> +#define QUERY_DDW_PGSIZE_128M	0x20
>>>> +#define QUERY_DDW_PGSIZE_256M	0x40
>>>> +#define QUERY_DDW_PGSIZE_16G	0x80
>>>
>>> I'm not sure the #defines really gain us much vs just putting the
>>> literal values in the array below?
>>
>> Then someone says "uuuuu magic values" :) I do not mind either way. Thanks,
> 
> Yeah that's true. But #defining them doesn't make them less magic, if
> you only use them in one place :)

Defining them with "QUERY_DDW" in the names kinda tells where they are 
from. Can also grep QEMU using these to see how the other side handles 
it. Dunno.

btw the bot complained about __builtin_ctz(SZ_16G) which should be 
__builtin_ctzl(SZ_16G) so we have to ask Leonardo to repost anyway :)
Leonardo Brás April 9, 2021, 4:44 a.m. UTC | #10
Em sex., 9 de abr. de 2021 01:36, Alexey Kardashevskiy <aik@ozlabs.ru>
escreveu:

>
>
> On 08/04/2021 19:04, Michael Ellerman wrote:
> > Alexey Kardashevskiy <aik@ozlabs.ru> writes:
> >> On 08/04/2021 15:37, Michael Ellerman wrote:
> >>> Leonardo Bras <leobras.c@gmail.com> writes:
> >>>> According to LoPAR, ibm,query-pe-dma-window output named "IO Page
> Sizes"
> >>>> will let the OS know all possible pagesizes that can be used for
> creating a
> >>>> new DDW.
> >>>>
> >>>> Currently Linux will only try using 3 of the 8 available options:
> >>>> 4K, 64K and 16M. According to LoPAR, Hypervisor may also offer 32M,
> 64M,
> >>>> 128M, 256M and 16G.
> >>>
> >>> Do we know of any hardware & hypervisor combination that will actually
> >>> give us bigger pages?
> >>
> >>
> >> On P8 16MB host pages and 16MB hardware iommu pages worked.
> >>
> >> On P9, VM's 16MB IOMMU pages worked on top of 2MB host pages + 2MB
> >> hardware IOMMU pages.
> >
> > The current code already tries 16MB though.
> >
> > I'm wondering if we're going to ask for larger sizes that have never
> > been tested and possibly expose bugs. But it sounds like this is mainly
> > targeted at future platforms.
>
>
> I tried for fun to pass through a PCI device to a guest with this patch as:
>
> pbuild/qemu-killslof-aiku1904le-ppc64/qemu-system-ppc64 \
> -nodefaults \
> -chardev stdio,id=STDIO0,signal=off,mux=on \
> -device spapr-vty,id=svty0,reg=0x71000110,chardev=STDIO0 \
> -mon id=MON0,chardev=STDIO0,mode=readline \
> -nographic \
> -vga none \
> -enable-kvm \
> -m 16G \
> -kernel ./vmldbg \
> -initrd /home/aik/t/le.cpio \
> -device vfio-pci,id=vfio0001_01_00_0,host=0001:01:00.0 \
> -mem-prealloc \
> -mem-path qemu_hp_1G_node0 \
> -global spapr-pci-host-bridge.pgsz=0xffffff000 \
> -machine cap-cfpc=broken,cap-ccf-assist=off \
> -smp 1,threads=1 \
> -L /home/aik/t/qemu-ppc64-bios/ \
> -trace events=qemu_trace_events \
> -d guest_errors,mmu \
> -chardev socket,id=SOCKET0,server=on,wait=off,path=qemu.mon.1_1_0_0 \
> -mon chardev=SOCKET0,mode=control
>
>
> The guest created a huge window:
>
> xhci_hcd 0000:00:00.0: ibm,create-pe-dma-window(2027) 0 8000000 20000000
> 22 22 returned 0 (liobn = 0x80000001 starting addr = 8000000 0)
>
> The first "22" is page_shift in hex (16GB), the second "22" is
> window_shift (so we have 1 TCE).
>
> On the host side the window#1 was created with 1GB pages:
> pci 0001:01     : [PE# fd] Setting up window#1
> 800000000000000..80007ffffffffff pg=40000000
>
>
> The XHCI seems working. Without the patch 16MB was the maximum.
>
>
> >
> >>>> diff --git a/arch/powerpc/platforms/pseries/iommu.c
> b/arch/powerpc/platforms/pseries/iommu.c
> >>>> index 9fc5217f0c8e..6cda1c92597d 100644
> >>>> --- a/arch/powerpc/platforms/pseries/iommu.c
> >>>> +++ b/arch/powerpc/platforms/pseries/iommu.c
> >>>> @@ -53,6 +53,20 @@ enum {
> >>>>            DDW_EXT_QUERY_OUT_SIZE = 2
> >>>>    };
> >>>
> >>> A comment saying where the values come from would be good.
> >>>
> >>>> +#define QUERY_DDW_PGSIZE_4K       0x01
> >>>> +#define QUERY_DDW_PGSIZE_64K      0x02
> >>>> +#define QUERY_DDW_PGSIZE_16M      0x04
> >>>> +#define QUERY_DDW_PGSIZE_32M      0x08
> >>>> +#define QUERY_DDW_PGSIZE_64M      0x10
> >>>> +#define QUERY_DDW_PGSIZE_128M     0x20
> >>>> +#define QUERY_DDW_PGSIZE_256M     0x40
> >>>> +#define QUERY_DDW_PGSIZE_16G      0x80
> >>>
> >>> I'm not sure the #defines really gain us much vs just putting the
> >>> literal values in the array below?
> >>
> >> Then someone says "uuuuu magic values" :) I do not mind either way.
> Thanks,
> >
> > Yeah that's true. But #defining them doesn't make them less magic, if
> > you only use them in one place :)
>
> Defining them with "QUERY_DDW" in the names kinda tells where they are
> from. Can also grep QEMU using these to see how the other side handles
> it. Dunno.
>
> btw the bot complained about __builtin_ctz(SZ_16G) which should be
> __builtin_ctzl(SZ_16G) so we have to ask Leonardo to repost anyway :)
>

Thanks for testing!

http://patchwork.ozlabs.org/project/linuxppc-dev/patch/20210408201915.174217-1-leobras.c@gmail.com/

I sent a v3 a few hours ago, fixing this by using __builtin_ctzll() instead
of __builtin_ctz() in all sizes, and it worked like a charm.

I also reverted to the previous approach of not having QUERY_DDW defines
for masks, as Michael suggested.

I can revert back to v2 approach if you guys decide it's better.

Best regards,
Leonardo Bras
Segher Boessenkool April 12, 2021, 10:21 p.m. UTC | #11
On Fri, Apr 09, 2021 at 02:36:16PM +1000, Alexey Kardashevskiy wrote:
> On 08/04/2021 19:04, Michael Ellerman wrote:
> >>>>+#define QUERY_DDW_PGSIZE_4K	0x01
> >>>>+#define QUERY_DDW_PGSIZE_64K	0x02
> >>>>+#define QUERY_DDW_PGSIZE_16M	0x04
> >>>>+#define QUERY_DDW_PGSIZE_32M	0x08
> >>>>+#define QUERY_DDW_PGSIZE_64M	0x10
> >>>>+#define QUERY_DDW_PGSIZE_128M	0x20
> >>>>+#define QUERY_DDW_PGSIZE_256M	0x40
> >>>>+#define QUERY_DDW_PGSIZE_16G	0x80
> >>>
> >>>I'm not sure the #defines really gain us much vs just putting the
> >>>literal values in the array below?
> >>
> >>Then someone says "uuuuu magic values" :) I do not mind either way. 
> >>Thanks,
> >
> >Yeah that's true. But #defining them doesn't make them less magic, if
> >you only use them in one place :)
> 
> Defining them with "QUERY_DDW" in the names kinda tells where they are 
> from. Can also grep QEMU using these to see how the other side handles 
> it. Dunno.

And *not* defining anything reduces the mental load a lot.  You can add
a comment at the single spot you use them, explaining what this is, in a
much better way!

Comments are *good*.


Segher
Leonardo Brás April 14, 2021, 4:02 a.m. UTC | #12
On Mon, 2021-04-12 at 17:21 -0500, Segher Boessenkool wrote:
> On Fri, Apr 09, 2021 at 02:36:16PM +1000, Alexey Kardashevskiy wrote:
> > On 08/04/2021 19:04, Michael Ellerman wrote:
> > > > > > +#define QUERY_DDW_PGSIZE_4K	0x01
> > > > > > +#define QUERY_DDW_PGSIZE_64K	0x02
> > > > > > +#define QUERY_DDW_PGSIZE_16M	0x04
> > > > > > +#define QUERY_DDW_PGSIZE_32M	0x08
> > > > > > +#define QUERY_DDW_PGSIZE_64M	0x10
> > > > > > +#define QUERY_DDW_PGSIZE_128M	0x20
> > > > > > +#define QUERY_DDW_PGSIZE_256M	0x40
> > > > > > +#define QUERY_DDW_PGSIZE_16G	0x80
> > > > > 
> > > > > I'm not sure the #defines really gain us much vs just putting the
> > > > > literal values in the array below?
> > > > 
> > > > Then someone says "uuuuu magic values" :) I do not mind either way. 
> > > > Thanks,
> > > 
> > > Yeah that's true. But #defining them doesn't make them less magic, if
> > > you only use them in one place :)
> > 
> > Defining them with "QUERY_DDW" in the names kinda tells where they are 
> > from. Can also grep QEMU using these to see how the other side handles 
> > it. Dunno.
> 
> And *not* defining anything reduces the mental load a lot.  You can add
> a comment at the single spot you use them, explaining what this is, in a
> much better way!
> 
> Comments are *good*.
> 
> 
> Segher

Thanks for the feedback Alexey, Michael and Segher!

I have sent a v3 for this patch. 
http://patchwork.ozlabs.org/project/linuxppc-dev/patch/20210408201915.174217-1-leobras.c@gmail.com/

Please let me know of your feedback in it.

Best regards,
Leonardo Bras
diff mbox series

Patch

diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
index 9fc5217f0c8e..6cda1c92597d 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -53,6 +53,20 @@  enum {
 	DDW_EXT_QUERY_OUT_SIZE = 2
 };
 
+#define QUERY_DDW_PGSIZE_4K	0x01
+#define QUERY_DDW_PGSIZE_64K	0x02
+#define QUERY_DDW_PGSIZE_16M	0x04
+#define QUERY_DDW_PGSIZE_32M	0x08
+#define QUERY_DDW_PGSIZE_64M	0x10
+#define QUERY_DDW_PGSIZE_128M	0x20
+#define QUERY_DDW_PGSIZE_256M	0x40
+#define QUERY_DDW_PGSIZE_16G	0x80
+
+struct iommu_ddw_pagesize {
+	u32 mask;
+	int shift;
+};
+
 static struct iommu_table_group *iommu_pseries_alloc_group(int node)
 {
 	struct iommu_table_group *table_group;
@@ -1099,6 +1113,31 @@  static void reset_dma_window(struct pci_dev *dev, struct device_node *par_dn)
 			 ret);
 }
 
+/* Returns page shift based on "IO Page Sizes" output at ibm,query-pe-dma-window. See LoPAR */
+static int iommu_get_page_shift(u32 query_page_size)
+{
+	const struct iommu_ddw_pagesize ddw_pagesize[] = {
+		{ QUERY_DDW_PGSIZE_16G,  __builtin_ctz(SZ_16G)  },
+		{ QUERY_DDW_PGSIZE_256M, __builtin_ctz(SZ_256M) },
+		{ QUERY_DDW_PGSIZE_128M, __builtin_ctz(SZ_128M) },
+		{ QUERY_DDW_PGSIZE_64M,  __builtin_ctz(SZ_64M)  },
+		{ QUERY_DDW_PGSIZE_32M,  __builtin_ctz(SZ_32M)  },
+		{ QUERY_DDW_PGSIZE_16M,  __builtin_ctz(SZ_16M)  },
+		{ QUERY_DDW_PGSIZE_64K,  __builtin_ctz(SZ_64K)  },
+		{ QUERY_DDW_PGSIZE_4K,   __builtin_ctz(SZ_4K)   }
+	};
+
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(ddw_pagesize); i++) {
+		if (query_page_size & ddw_pagesize[i].mask)
+			return ddw_pagesize[i].shift;
+	}
+
+	/* No valid page size found. */
+	return 0;
+}
+
 /*
  * If the PE supports dynamic dma windows, and there is space for a table
  * that can map all pages in a linear offset, then setup such a table,
@@ -1206,13 +1245,9 @@  static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn)
 			goto out_failed;
 		}
 	}
-	if (query.page_size & 4) {
-		page_shift = 24; /* 16MB */
-	} else if (query.page_size & 2) {
-		page_shift = 16; /* 64kB */
-	} else if (query.page_size & 1) {
-		page_shift = 12; /* 4kB */
-	} else {
+
+	page_shift = iommu_get_page_shift(query.page_size);
+	if (!page_shift) {
 		dev_dbg(&dev->dev, "no supported direct page size in mask %x",
 			  query.page_size);
 		goto out_failed;