diff mbox series

[mm,v3,1/6] mm: Use mm_zero_struct_page from SPARC on all 64b architectures

Message ID 20181015202656.2171.92963.stgit@localhost.localdomain
State Not Applicable
Delegated to: David Miller
Headers show
Series Deferred page init improvements | expand

Commit Message

Alexander Duyck Oct. 15, 2018, 8:26 p.m. UTC
This change makes it so that we use the same approach that was already in
use on Sparc on all the architectures that support a 64b long.

This is mostly motivated by the fact that 8 to 10 store/move instructions
are likely always going to be faster than having to call into a function
that is not specialized for handling page init.

An added advantage to doing it this way is that the compiler can get away
with combining writes in the __init_single_page call. As a result the
memset call will be reduced to only about 4 write operations, or at least
that is what I am seeing with GCC 6.2 as the flags, LRU pointers, and
count/mapcount seem to be cancelling out at least 4 of the 8 assignments on
my system.

One change I had to make to the function was to reduce the minimum page
size to 56 to support some powerpc64 configurations.

Signed-off-by: Alexander Duyck <alexander.h.duyck@linux.intel.com>
---
 arch/sparc/include/asm/pgtable_64.h |   30 ------------------------------
 include/linux/mm.h                  |   34 ++++++++++++++++++++++++++++++++++
 2 files changed, 34 insertions(+), 30 deletions(-)

Comments

Pavel Tatashin Oct. 16, 2018, 7:01 p.m. UTC | #1
On 10/15/18 4:26 PM, Alexander Duyck wrote:
> This change makes it so that we use the same approach that was already in
> use on Sparc on all the archtectures that support a 64b long.
> 
> This is mostly motivated by the fact that 8 to 10 store/move instructions
> are likely always going to be faster than having to call into a function
> that is not specialized for handling page init.
> 
> An added advantage to doing it this way is that the compiler can get away
> with combining writes in the __init_single_page call. As a result the
> memset call will be reduced to only about 4 write operations, or at least
> that is what I am seeing with GCC 6.2 as the flags, LRU poitners, and
> count/mapcount seem to be cancelling out at least 4 of the 8 assignments on
> my system.
> 
> One change I had to make to the function was to reduce the minimum page
> size to 56 to support some powerpc64 configurations.
> 
> Signed-off-by: Alexander Duyck <alexander.h.duyck@linux.intel.com>


I have tested on Broadcom's Stingray cpu with 48G RAM:
__init_single_page() takes 19.30ns / 64-byte struct page
With the change it takes 17.33ns / 64-byte struct page

Please add this data and also the data from Intel to the description.

Thank you,
Pavel

> ---
>  arch/sparc/include/asm/pgtable_64.h |   30 ------------------------------
>  include/linux/mm.h                  |   34 ++++++++++++++++++++++++++++++++++
>  2 files changed, 34 insertions(+), 30 deletions(-)
> 
> diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h
> index 1393a8ac596b..22500c3be7a9 100644
> --- a/arch/sparc/include/asm/pgtable_64.h
> +++ b/arch/sparc/include/asm/pgtable_64.h
> @@ -231,36 +231,6 @@
>  extern struct page *mem_map_zero;
>  #define ZERO_PAGE(vaddr)	(mem_map_zero)
>  
> -/* This macro must be updated when the size of struct page grows above 80
> - * or reduces below 64.
> - * The idea that compiler optimizes out switch() statement, and only
> - * leaves clrx instructions
> - */
> -#define	mm_zero_struct_page(pp) do {					\
> -	unsigned long *_pp = (void *)(pp);				\
> -									\
> -	 /* Check that struct page is either 64, 72, or 80 bytes */	\
> -	BUILD_BUG_ON(sizeof(struct page) & 7);				\
> -	BUILD_BUG_ON(sizeof(struct page) < 64);				\
> -	BUILD_BUG_ON(sizeof(struct page) > 80);				\
> -									\
> -	switch (sizeof(struct page)) {					\
> -	case 80:							\
> -		_pp[9] = 0;	/* fallthrough */			\
> -	case 72:							\
> -		_pp[8] = 0;	/* fallthrough */			\
> -	default:							\
> -		_pp[7] = 0;						\
> -		_pp[6] = 0;						\
> -		_pp[5] = 0;						\
> -		_pp[4] = 0;						\
> -		_pp[3] = 0;						\
> -		_pp[2] = 0;						\
> -		_pp[1] = 0;						\
> -		_pp[0] = 0;						\
> -	}								\
> -} while (0)
> -
>  /* PFNs are real physical page numbers.  However, mem_map only begins to record
>   * per-page information starting at pfn_base.  This is to handle systems where
>   * the first physical page in the machine is at some huge physical address,
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index bb0de406f8e7..ec6e57a0c14e 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -102,8 +102,42 @@ static inline void set_max_mapnr(unsigned long limit) { }
>   * zeroing by defining this macro in <asm/pgtable.h>.
>   */

The comment above becomes outdated. Please change, we use optimized
mm_zero_struct_page on every 64-bit platform.

>  #ifndef mm_zero_struct_page
> +#if BITS_PER_LONG == 64
> +/* This function must be updated when the size of struct page grows above 80
> + * or reduces below 64. The idea that compiler optimizes out switch()
> + * statement, and only leaves move/store instructions
> + */
> +#define	mm_zero_struct_page(pp) __mm_zero_struct_page(pp)
> +static inline void __mm_zero_struct_page(struct page *page)
> +{
> +	unsigned long *_pp = (void *)page;
> +
> +	 /* Check that struct page is either 56, 64, 72, or 80 bytes */
> +	BUILD_BUG_ON(sizeof(struct page) & 7);
> +	BUILD_BUG_ON(sizeof(struct page) < 56);
> +	BUILD_BUG_ON(sizeof(struct page) > 80);
> +
> +	switch (sizeof(struct page)) {
> +	case 80:
> +		_pp[9] = 0;	/* fallthrough */
> +	case 72:
> +		_pp[8] = 0;	/* fallthrough */
> +	default:
> +		_pp[7] = 0;	/* fallthrough */
> +	case 56:
> +		_pp[6] = 0;
> +		_pp[5] = 0;
> +		_pp[4] = 0;
> +		_pp[3] = 0;
> +		_pp[2] = 0;
> +		_pp[1] = 0;
> +		_pp[0] = 0;
> +	}
> +}
> +#else
>  #define mm_zero_struct_page(pp)  ((void)memset((pp), 0, sizeof(struct page)))
>  #endif
> +#endif
>  
>  /*
>   * Default maximum number of active map areas, this limits the number of vmas
>
Mike Rapoport Oct. 17, 2018, 7:30 a.m. UTC | #2
On Tue, Oct 16, 2018 at 03:01:11PM -0400, Pavel Tatashin wrote:
> 
> 
> On 10/15/18 4:26 PM, Alexander Duyck wrote:
> > This change makes it so that we use the same approach that was already in
> > use on Sparc on all the archtectures that support a 64b long.
> > 
> > This is mostly motivated by the fact that 8 to 10 store/move instructions
> > are likely always going to be faster than having to call into a function
> > that is not specialized for handling page init.
> > 
> > An added advantage to doing it this way is that the compiler can get away
> > with combining writes in the __init_single_page call. As a result the
> > memset call will be reduced to only about 4 write operations, or at least
> > that is what I am seeing with GCC 6.2 as the flags, LRU poitners, and
> > count/mapcount seem to be cancelling out at least 4 of the 8 assignments on
> > my system.
> > 
> > One change I had to make to the function was to reduce the minimum page
> > size to 56 to support some powerpc64 configurations.
> > 
> > Signed-off-by: Alexander Duyck <alexander.h.duyck@linux.intel.com>
> 
> 
> I have tested on Broadcom's Stingray cpu with 48G RAM:
> __init_single_page() takes 19.30ns / 64-byte struct page
> Wit the change it takes 17.33ns / 64-byte struct page
 
I gave it a run on an OpenPower (S812LC 8348-21C) with Power8 processor and
with 128G of RAM. My results for 64-byte struct page were:

before: 4.6788ns
after: 4.5882ns

My two cents :)

> Please add this data and also the data from Intel to the description.
> 
> Thank you,
> Pavel
> 
> > ---
> >  arch/sparc/include/asm/pgtable_64.h |   30 ------------------------------
> >  include/linux/mm.h                  |   34 ++++++++++++++++++++++++++++++++++
> >  2 files changed, 34 insertions(+), 30 deletions(-)
> > 
> > diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h
> > index 1393a8ac596b..22500c3be7a9 100644
> > --- a/arch/sparc/include/asm/pgtable_64.h
> > +++ b/arch/sparc/include/asm/pgtable_64.h
> > @@ -231,36 +231,6 @@
> >  extern struct page *mem_map_zero;
> >  #define ZERO_PAGE(vaddr)	(mem_map_zero)
> >  
> > -/* This macro must be updated when the size of struct page grows above 80
> > - * or reduces below 64.
> > - * The idea that compiler optimizes out switch() statement, and only
> > - * leaves clrx instructions
> > - */
> > -#define	mm_zero_struct_page(pp) do {					\
> > -	unsigned long *_pp = (void *)(pp);				\
> > -									\
> > -	 /* Check that struct page is either 64, 72, or 80 bytes */	\
> > -	BUILD_BUG_ON(sizeof(struct page) & 7);				\
> > -	BUILD_BUG_ON(sizeof(struct page) < 64);				\
> > -	BUILD_BUG_ON(sizeof(struct page) > 80);				\
> > -									\
> > -	switch (sizeof(struct page)) {					\
> > -	case 80:							\
> > -		_pp[9] = 0;	/* fallthrough */			\
> > -	case 72:							\
> > -		_pp[8] = 0;	/* fallthrough */			\
> > -	default:							\
> > -		_pp[7] = 0;						\
> > -		_pp[6] = 0;						\
> > -		_pp[5] = 0;						\
> > -		_pp[4] = 0;						\
> > -		_pp[3] = 0;						\
> > -		_pp[2] = 0;						\
> > -		_pp[1] = 0;						\
> > -		_pp[0] = 0;						\
> > -	}								\
> > -} while (0)
> > -
> >  /* PFNs are real physical page numbers.  However, mem_map only begins to record
> >   * per-page information starting at pfn_base.  This is to handle systems where
> >   * the first physical page in the machine is at some huge physical address,
> > diff --git a/include/linux/mm.h b/include/linux/mm.h
> > index bb0de406f8e7..ec6e57a0c14e 100644
> > --- a/include/linux/mm.h
> > +++ b/include/linux/mm.h
> > @@ -102,8 +102,42 @@ static inline void set_max_mapnr(unsigned long limit) { }
> >   * zeroing by defining this macro in <asm/pgtable.h>.
> >   */
> 
> The comment above becomes outdated. Please change, we use optimized
> mm_zero_struct_page on every 64-bit platform.
> 
> >  #ifndef mm_zero_struct_page
> > +#if BITS_PER_LONG == 64
> > +/* This function must be updated when the size of struct page grows above 80
> > + * or reduces below 64. The idea that compiler optimizes out switch()
> > + * statement, and only leaves move/store instructions
> > + */
> > +#define	mm_zero_struct_page(pp) __mm_zero_struct_page(pp)
> > +static inline void __mm_zero_struct_page(struct page *page)
> > +{
> > +	unsigned long *_pp = (void *)page;
> > +
> > +	 /* Check that struct page is either 56, 64, 72, or 80 bytes */
> > +	BUILD_BUG_ON(sizeof(struct page) & 7);
> > +	BUILD_BUG_ON(sizeof(struct page) < 56);
> > +	BUILD_BUG_ON(sizeof(struct page) > 80);
> > +
> > +	switch (sizeof(struct page)) {
> > +	case 80:
> > +		_pp[9] = 0;	/* fallthrough */
> > +	case 72:
> > +		_pp[8] = 0;	/* fallthrough */
> > +	default:
> > +		_pp[7] = 0;	/* fallthrough */
> > +	case 56:
> > +		_pp[6] = 0;
> > +		_pp[5] = 0;
> > +		_pp[4] = 0;
> > +		_pp[3] = 0;
> > +		_pp[2] = 0;
> > +		_pp[1] = 0;
> > +		_pp[0] = 0;
> > +	}
> > +}
> > +#else
> >  #define mm_zero_struct_page(pp)  ((void)memset((pp), 0, sizeof(struct page)))
> >  #endif
> > +#endif
> >  
> >  /*
> >   * Default maximum number of active map areas, this limits the number of vmas
> > 
>
Michal Hocko Oct. 17, 2018, 8:47 a.m. UTC | #3
On Mon 15-10-18 13:26:56, Alexander Duyck wrote:
> This change makes it so that we use the same approach that was already in
> use on Sparc on all the archtectures that support a 64b long.
> 
> This is mostly motivated by the fact that 8 to 10 store/move instructions
> are likely always going to be faster than having to call into a function
> that is not specialized for handling page init.
> 
> An added advantage to doing it this way is that the compiler can get away
> with combining writes in the __init_single_page call. As a result the
> memset call will be reduced to only about 4 write operations, or at least
> that is what I am seeing with GCC 6.2 as the flags, LRU poitners, and
> count/mapcount seem to be cancelling out at least 4 of the 8 assignments on
> my system.
> 
> One change I had to make to the function was to reduce the minimum page
> size to 56 to support some powerpc64 configurations.

This really begs for numbers. I do not mind the change itself with some
minor comments below.

[...]
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index bb0de406f8e7..ec6e57a0c14e 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -102,8 +102,42 @@ static inline void set_max_mapnr(unsigned long limit) { }
>   * zeroing by defining this macro in <asm/pgtable.h>.
>   */
>  #ifndef mm_zero_struct_page

Do we still need this ifdef? I guess we can wait for an arch which
doesn't like this change and then add the override. I would rather go
simple if possible.

> +#if BITS_PER_LONG == 64
> +/* This function must be updated when the size of struct page grows above 80
> + * or reduces below 64. The idea that compiler optimizes out switch()
> + * statement, and only leaves move/store instructions
> + */
> +#define	mm_zero_struct_page(pp) __mm_zero_struct_page(pp)
> +static inline void __mm_zero_struct_page(struct page *page)
> +{
> +	unsigned long *_pp = (void *)page;
> +
> +	 /* Check that struct page is either 56, 64, 72, or 80 bytes */
> +	BUILD_BUG_ON(sizeof(struct page) & 7);
> +	BUILD_BUG_ON(sizeof(struct page) < 56);
> +	BUILD_BUG_ON(sizeof(struct page) > 80);
> +
> +	switch (sizeof(struct page)) {
> +	case 80:
> +		_pp[9] = 0;	/* fallthrough */
> +	case 72:
> +		_pp[8] = 0;	/* fallthrough */
> +	default:
> +		_pp[7] = 0;	/* fallthrough */
> +	case 56:
> +		_pp[6] = 0;
> +		_pp[5] = 0;
> +		_pp[4] = 0;
> +		_pp[3] = 0;
> +		_pp[2] = 0;
> +		_pp[1] = 0;
> +		_pp[0] = 0;
> +	}

This just hit my eyes. I have to confess I have never seen default: to
be not the last one in the switch. Can we have case 64 instead or does gcc
complain? I would be surprised with the set of BUILD_BUG_ONs.

> +}
> +#else
>  #define mm_zero_struct_page(pp)  ((void)memset((pp), 0, sizeof(struct page)))
>  #endif
> +#endif
>  
>  /*
>   * Default maximum number of active map areas, this limits the number of vmas
>
Alexander Duyck Oct. 17, 2018, 2:52 p.m. UTC | #4
On 10/17/2018 12:30 AM, Mike Rapoport wrote:
> On Tue, Oct 16, 2018 at 03:01:11PM -0400, Pavel Tatashin wrote:
>>
>>
>> On 10/15/18 4:26 PM, Alexander Duyck wrote:
>>> This change makes it so that we use the same approach that was already in
>>> use on Sparc on all the archtectures that support a 64b long.
>>>
>>> This is mostly motivated by the fact that 8 to 10 store/move instructions
>>> are likely always going to be faster than having to call into a function
>>> that is not specialized for handling page init.
>>>
>>> An added advantage to doing it this way is that the compiler can get away
>>> with combining writes in the __init_single_page call. As a result the
>>> memset call will be reduced to only about 4 write operations, or at least
>>> that is what I am seeing with GCC 6.2 as the flags, LRU poitners, and
>>> count/mapcount seem to be cancelling out at least 4 of the 8 assignments on
>>> my system.
>>>
>>> One change I had to make to the function was to reduce the minimum page
>>> size to 56 to support some powerpc64 configurations.
>>>
>>> Signed-off-by: Alexander Duyck <alexander.h.duyck@linux.intel.com>
>>
>>
>> I have tested on Broadcom's Stingray cpu with 48G RAM:
>> __init_single_page() takes 19.30ns / 64-byte struct page
>> Wit the change it takes 17.33ns / 64-byte struct page
>   
> I gave it a run on an OpenPower (S812LC 8348-21C) with Power8 processor and
> with 128G of RAM. My results for 64-byte struct page were:
> 
> before: 4.6788ns
> after: 4.5882ns
> 
> My two cents :)

Thanks. I will add this and Pavel's data to the patch description.

- Alex
Alexander Duyck Oct. 17, 2018, 3:07 p.m. UTC | #5
On 10/17/2018 1:47 AM, Michal Hocko wrote:
> On Mon 15-10-18 13:26:56, Alexander Duyck wrote:
>> This change makes it so that we use the same approach that was already in
>> use on Sparc on all the archtectures that support a 64b long.
>>
>> This is mostly motivated by the fact that 8 to 10 store/move instructions
>> are likely always going to be faster than having to call into a function
>> that is not specialized for handling page init.
>>
>> An added advantage to doing it this way is that the compiler can get away
>> with combining writes in the __init_single_page call. As a result the
>> memset call will be reduced to only about 4 write operations, or at least
>> that is what I am seeing with GCC 6.2 as the flags, LRU poitners, and
>> count/mapcount seem to be cancelling out at least 4 of the 8 assignments on
>> my system.
>>
>> One change I had to make to the function was to reduce the minimum page
>> size to 56 to support some powerpc64 configurations.
> 
> This really begs for numbers. I do not mind the change itself with some
> minor comments below.
> 
> [...]
>> diff --git a/include/linux/mm.h b/include/linux/mm.h
>> index bb0de406f8e7..ec6e57a0c14e 100644
>> --- a/include/linux/mm.h
>> +++ b/include/linux/mm.h
>> @@ -102,8 +102,42 @@ static inline void set_max_mapnr(unsigned long limit) { }
>>    * zeroing by defining this macro in <asm/pgtable.h>.
>>    */
>>   #ifndef mm_zero_struct_page
> 
> Do we still need this ifdef? I guess we can wait for an arch which
> doesn't like this change and then add the override. I would rather go
> simple if possible.

We probably don't, but as soon as I remove it somebody will probably 
complain somewhere. I guess I could drop it for now and see if anybody 
screams. Adding it back should be pretty straight forward since it would 
only be 2 lines.

>> +#if BITS_PER_LONG == 64
>> +/* This function must be updated when the size of struct page grows above 80
>> + * or reduces below 64. The idea that compiler optimizes out switch()
>> + * statement, and only leaves move/store instructions
>> + */
>> +#define	mm_zero_struct_page(pp) __mm_zero_struct_page(pp)
>> +static inline void __mm_zero_struct_page(struct page *page)
>> +{
>> +	unsigned long *_pp = (void *)page;
>> +
>> +	 /* Check that struct page is either 56, 64, 72, or 80 bytes */
>> +	BUILD_BUG_ON(sizeof(struct page) & 7);
>> +	BUILD_BUG_ON(sizeof(struct page) < 56);
>> +	BUILD_BUG_ON(sizeof(struct page) > 80);
>> +
>> +	switch (sizeof(struct page)) {
>> +	case 80:
>> +		_pp[9] = 0;	/* fallthrough */
>> +	case 72:
>> +		_pp[8] = 0;	/* fallthrough */
>> +	default:
>> +		_pp[7] = 0;	/* fallthrough */
>> +	case 56:
>> +		_pp[6] = 0;
>> +		_pp[5] = 0;
>> +		_pp[4] = 0;
>> +		_pp[3] = 0;
>> +		_pp[2] = 0;
>> +		_pp[1] = 0;
>> +		_pp[0] = 0;
>> +	}
> 
> This just hit my eyes. I have to confess I have never seen default: to
> be not the last one in the switch. Can we have case 64 instead or does gcc
> complain? I would be surprised with the set of BUILD_BUG_ONs.

I can probably just replace the "default:" with "case 64:". I think I 
have seen other switch statements in the kernel without a default so 
odds are it should be okay.
Pavel Tatashin Oct. 17, 2018, 3:12 p.m. UTC | #6
On 10/17/18 11:07 AM, Alexander Duyck wrote:
> On 10/17/2018 1:47 AM, Michal Hocko wrote:
>> On Mon 15-10-18 13:26:56, Alexander Duyck wrote:
>>> This change makes it so that we use the same approach that was
>>> already in
>>> use on Sparc on all the archtectures that support a 64b long.
>>>
>>> This is mostly motivated by the fact that 8 to 10 store/move
>>> instructions
>>> are likely always going to be faster than having to call into a function
>>> that is not specialized for handling page init.
>>>
>>> An added advantage to doing it this way is that the compiler can get
>>> away
>>> with combining writes in the __init_single_page call. As a result the
>>> memset call will be reduced to only about 4 write operations, or at
>>> least
>>> that is what I am seeing with GCC 6.2 as the flags, LRU poitners, and
>>> count/mapcount seem to be cancelling out at least 4 of the 8
>>> assignments on
>>> my system.
>>>
>>> One change I had to make to the function was to reduce the minimum page
>>> size to 56 to support some powerpc64 configurations.
>>
>> This really begs for numbers. I do not mind the change itself with some
>> minor comments below.
>>
>> [...]
>>> diff --git a/include/linux/mm.h b/include/linux/mm.h
>>> index bb0de406f8e7..ec6e57a0c14e 100644
>>> --- a/include/linux/mm.h
>>> +++ b/include/linux/mm.h
>>> @@ -102,8 +102,42 @@ static inline void set_max_mapnr(unsigned long
>>> limit) { }
>>>    * zeroing by defining this macro in <asm/pgtable.h>.
>>>    */
>>>   #ifndef mm_zero_struct_page
>>
>> Do we still need this ifdef? I guess we can wait for an arch which
>> doesn't like this change and then add the override. I would rather go
>> simple if possible.
> 
> We probably don't, but as soon as I remove it somebody will probably
> complain somewhere. I guess I could drop it for now and see if anybody
> screams. Adding it back should be pretty straight forward since it would
> only be 2 lines.
> 
>>> +#if BITS_PER_LONG == 64
>>> +/* This function must be updated when the size of struct page grows
>>> above 80
>>> + * or reduces below 64. The idea that compiler optimizes out switch()
>>> + * statement, and only leaves move/store instructions
>>> + */
>>> +#define    mm_zero_struct_page(pp) __mm_zero_struct_page(pp)
>>> +static inline void __mm_zero_struct_page(struct page *page)
>>> +{
>>> +    unsigned long *_pp = (void *)page;
>>> +
>>> +     /* Check that struct page is either 56, 64, 72, or 80 bytes */
>>> +    BUILD_BUG_ON(sizeof(struct page) & 7);
>>> +    BUILD_BUG_ON(sizeof(struct page) < 56);
>>> +    BUILD_BUG_ON(sizeof(struct page) > 80);
>>> +
>>> +    switch (sizeof(struct page)) {
>>> +    case 80:
>>> +        _pp[9] = 0;    /* fallthrough */
>>> +    case 72:
>>> +        _pp[8] = 0;    /* fallthrough */
>>> +    default:
>>> +        _pp[7] = 0;    /* fallthrough */
>>> +    case 56:
>>> +        _pp[6] = 0;
>>> +        _pp[5] = 0;
>>> +        _pp[4] = 0;
>>> +        _pp[3] = 0;
>>> +        _pp[2] = 0;
>>> +        _pp[1] = 0;
>>> +        _pp[0] = 0;
>>> +    }
>>
>> This just hit my eyes. I have to confess I have never seen default: to
>> be not the last one in the switch. Can we have case 64 instead or does
>> gcc
>> complain? I would be surprised with the set of BUILD_BUG_ONs.

It was me, C does not really care where default is placed, I was trying
to keep stores sequential for better cache locality, but "case 64"
should be OK, and even better for this purpose.

Pavel

> 
> I can probably just replace the "default:" with "case 64:". I think I
> have seen other switch statements in the kernel without a default so
> odds are it should be okay.
>
David Laight Oct. 17, 2018, 3:40 p.m. UTC | #7
From: Pavel Tatashin
> Sent: 17 October 2018 16:12
> On 10/17/18 11:07 AM, Alexander Duyck wrote:
> > On 10/17/2018 1:47 AM, Michal Hocko wrote:
> >> On Mon 15-10-18 13:26:56, Alexander Duyck wrote:
> >>> This change makes it so that we use the same approach that was
> >>> already in
> >>> use on Sparc on all the archtectures that support a 64b long.
> >>>
> >>> This is mostly motivated by the fact that 8 to 10 store/move
> >>> instructions
> >>> are likely always going to be faster than having to call into a function
> >>> that is not specialized for handling page init.
> >>>
> >>> An added advantage to doing it this way is that the compiler can get
> >>> away
> >>> with combining writes in the __init_single_page call. As a result the
> >>> memset call will be reduced to only about 4 write operations, or at
> >>> least
> >>> that is what I am seeing with GCC 6.2 as the flags, LRU poitners, and
> >>> count/mapcount seem to be cancelling out at least 4 of the 8
> >>> assignments on
> >>> my system.
> >>>
> >>> One change I had to make to the function was to reduce the minimum page
> >>> size to 56 to support some powerpc64 configurations.
> >>
> >> This really begs for numbers. I do not mind the change itself with some
> >> minor comments below.
> >>
> >> [...]
> >>> diff --git a/include/linux/mm.h b/include/linux/mm.h
> >>> index bb0de406f8e7..ec6e57a0c14e 100644
> >>> --- a/include/linux/mm.h
> >>> +++ b/include/linux/mm.h
> >>> @@ -102,8 +102,42 @@ static inline void set_max_mapnr(unsigned long
> >>> limit) { }
> >>>    * zeroing by defining this macro in <asm/pgtable.h>.
> >>>    */
> >>>   #ifndef mm_zero_struct_page
> >>
> >> Do we still need this ifdef? I guess we can wait for an arch which
> >> doesn't like this change and then add the override. I would rather go
> >> simple if possible.
> >
> > We probably don't, but as soon as I remove it somebody will probably
> > complain somewhere. I guess I could drop it for now and see if anybody
> > screams. Adding it back should be pretty straight forward since it would
> > only be 2 lines.
> >
> >>> +#if BITS_PER_LONG == 64
> >>> +/* This function must be updated when the size of struct page grows
> >>> above 80
> >>> + * or reduces below 64. The idea that compiler optimizes out switch()
> >>> + * statement, and only leaves move/store instructions
> >>> + */
> >>> +#define    mm_zero_struct_page(pp) __mm_zero_struct_page(pp)
> >>> +static inline void __mm_zero_struct_page(struct page *page)
> >>> +{
> >>> +    unsigned long *_pp = (void *)page;
> >>> +
> >>> +     /* Check that struct page is either 56, 64, 72, or 80 bytes */
> >>> +    BUILD_BUG_ON(sizeof(struct page) & 7);
> >>> +    BUILD_BUG_ON(sizeof(struct page) < 56);
> >>> +    BUILD_BUG_ON(sizeof(struct page) > 80);
> >>> +
> >>> +    switch (sizeof(struct page)) {
> >>> +    case 80:
> >>> +        _pp[9] = 0;    /* fallthrough */
> >>> +    case 72:
> >>> +        _pp[8] = 0;    /* fallthrough */
> >>> +    default:
> >>> +        _pp[7] = 0;    /* fallthrough */
> >>> +    case 56:
> >>> +        _pp[6] = 0;
> >>> +        _pp[5] = 0;
> >>> +        _pp[4] = 0;
> >>> +        _pp[3] = 0;
> >>> +        _pp[2] = 0;
> >>> +        _pp[1] = 0;
> >>> +        _pp[0] = 0;
> >>> +    }
> >>
> >> This just hit my eyes. I have to confess I have never seen default: to
> >> be not the last one in the switch. Can we have case 64 instead or does
> >> gcc
> >> complain? I would be surprised with the set of BUILD_BUG_ONs.
> 
> It was me, C does not really care where default is placed, I was trying
> to keep stores sequential for better cache locality, but "case 64"
> should be OK, and even better for this purpose.

You'd need to put memory barriers between them to force sequential stores.
I'm also surprised that gcc doesn't inline the memset().

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)
Alexander Duyck Oct. 17, 2018, 4:31 p.m. UTC | #8
On 10/17/2018 8:40 AM, David Laight wrote:
> From: Pavel Tatashin
>> Sent: 17 October 2018 16:12
>> On 10/17/18 11:07 AM, Alexander Duyck wrote:
>>> On 10/17/2018 1:47 AM, Michal Hocko wrote:
>>>> On Mon 15-10-18 13:26:56, Alexander Duyck wrote:
>>>>> This change makes it so that we use the same approach that was
>>>>> already in
>>>>> use on Sparc on all the archtectures that support a 64b long.
>>>>>
>>>>> This is mostly motivated by the fact that 8 to 10 store/move
>>>>> instructions
>>>>> are likely always going to be faster than having to call into a function
>>>>> that is not specialized for handling page init.
>>>>>
>>>>> An added advantage to doing it this way is that the compiler can get
>>>>> away
>>>>> with combining writes in the __init_single_page call. As a result the
>>>>> memset call will be reduced to only about 4 write operations, or at
>>>>> least
>>>>> that is what I am seeing with GCC 6.2 as the flags, LRU poitners, and
>>>>> count/mapcount seem to be cancelling out at least 4 of the 8
>>>>> assignments on
>>>>> my system.
>>>>>
>>>>> One change I had to make to the function was to reduce the minimum page
>>>>> size to 56 to support some powerpc64 configurations.
>>>>
>>>> This really begs for numbers. I do not mind the change itself with some
>>>> minor comments below.
>>>>
>>>> [...]
>>>>> diff --git a/include/linux/mm.h b/include/linux/mm.h
>>>>> index bb0de406f8e7..ec6e57a0c14e 100644
>>>>> --- a/include/linux/mm.h
>>>>> +++ b/include/linux/mm.h
>>>>> @@ -102,8 +102,42 @@ static inline void set_max_mapnr(unsigned long
>>>>> limit) { }
>>>>>     * zeroing by defining this macro in <asm/pgtable.h>.
>>>>>     */
>>>>>    #ifndef mm_zero_struct_page
>>>>
>>>> Do we still need this ifdef? I guess we can wait for an arch which
>>>> doesn't like this change and then add the override. I would rather go
>>>> simple if possible.
>>>
>>> We probably don't, but as soon as I remove it somebody will probably
>>> complain somewhere. I guess I could drop it for now and see if anybody
>>> screams. Adding it back should be pretty straight forward since it would
>>> only be 2 lines.
>>>
>>>>> +#if BITS_PER_LONG == 64
>>>>> +/* This function must be updated when the size of struct page grows
>>>>> above 80
>>>>> + * or reduces below 64. The idea that compiler optimizes out switch()
>>>>> + * statement, and only leaves move/store instructions
>>>>> + */
>>>>> +#define    mm_zero_struct_page(pp) __mm_zero_struct_page(pp)
>>>>> +static inline void __mm_zero_struct_page(struct page *page)
>>>>> +{
>>>>> +    unsigned long *_pp = (void *)page;
>>>>> +
>>>>> +     /* Check that struct page is either 56, 64, 72, or 80 bytes */
>>>>> +    BUILD_BUG_ON(sizeof(struct page) & 7);
>>>>> +    BUILD_BUG_ON(sizeof(struct page) < 56);
>>>>> +    BUILD_BUG_ON(sizeof(struct page) > 80);
>>>>> +
>>>>> +    switch (sizeof(struct page)) {
>>>>> +    case 80:
>>>>> +        _pp[9] = 0;    /* fallthrough */
>>>>> +    case 72:
>>>>> +        _pp[8] = 0;    /* fallthrough */
>>>>> +    default:
>>>>> +        _pp[7] = 0;    /* fallthrough */
>>>>> +    case 56:
>>>>> +        _pp[6] = 0;
>>>>> +        _pp[5] = 0;
>>>>> +        _pp[4] = 0;
>>>>> +        _pp[3] = 0;
>>>>> +        _pp[2] = 0;
>>>>> +        _pp[1] = 0;
>>>>> +        _pp[0] = 0;
>>>>> +    }
>>>>
>>>> This just hit my eyes. I have to confess I have never seen default: to
>>>> be not the last one in the switch. Can we have case 64 instead or does
>>>> gcc
>>>> complain? I would be surprised with the set of BUILD_BUG_ONs.
>>
>> It was me, C does not really care where default is placed, I was trying
>> to keep stores sequential for better cache locality, but "case 64"
>> should be OK, and even better for this purpose.
> 
> You'd need to put memory barriers between them to force sequential stores.
> I'm also surprised that gcc doesn't inline the memset().
> 
> 	David

We don't need them to be sequential. The general idea is we have to 
fill a given amount of space with 0s. After that we have some calls that 
are initializing the memory that doesn't have to be zero. Ideally the 
compiler is smart enough to realize that since we don't have barriers 
and we are performing assignments after the assignment of zero it can 
just combine the two writes into one and drop the zero assignment.

- Alex
Michal Hocko Oct. 17, 2018, 4:34 p.m. UTC | #9
On Wed 17-10-18 08:07:06, Alexander Duyck wrote:
> On 10/17/2018 1:47 AM, Michal Hocko wrote:
> > On Mon 15-10-18 13:26:56, Alexander Duyck wrote:
[...]
> > > diff --git a/include/linux/mm.h b/include/linux/mm.h
> > > index bb0de406f8e7..ec6e57a0c14e 100644
> > > --- a/include/linux/mm.h
> > > +++ b/include/linux/mm.h
> > > @@ -102,8 +102,42 @@ static inline void set_max_mapnr(unsigned long limit) { }
> > >    * zeroing by defining this macro in <asm/pgtable.h>.
> > >    */
> > >   #ifndef mm_zero_struct_page
> > 
> > Do we still need this ifdef? I guess we can wait for an arch which
> > doesn't like this change and then add the override. I would rather go
> > simple if possible.
> 
> We probably don't, but as soon as I remove it somebody will probably
> complain somewhere. I guess I could drop it for now and see if anybody
> screams. Adding it back should be pretty straight forward since it would
> only be 2 lines.

Let's make it simpler please. If somebody really cares then this is
trivial to add later.
 
> > > +#if BITS_PER_LONG == 64
> > > +/* This function must be updated when the size of struct page grows above 80
> > > + * or reduces below 64. The idea that compiler optimizes out switch()
> > > + * statement, and only leaves move/store instructions
> > > + */
> > > +#define	mm_zero_struct_page(pp) __mm_zero_struct_page(pp)
> > > +static inline void __mm_zero_struct_page(struct page *page)
> > > +{
> > > +	unsigned long *_pp = (void *)page;
> > > +
> > > +	 /* Check that struct page is either 56, 64, 72, or 80 bytes */
> > > +	BUILD_BUG_ON(sizeof(struct page) & 7);
> > > +	BUILD_BUG_ON(sizeof(struct page) < 56);
> > > +	BUILD_BUG_ON(sizeof(struct page) > 80);
> > > +
> > > +	switch (sizeof(struct page)) {
> > > +	case 80:
> > > +		_pp[9] = 0;	/* fallthrough */
> > > +	case 72:
> > > +		_pp[8] = 0;	/* fallthrough */
> > > +	default:
> > > +		_pp[7] = 0;	/* fallthrough */
> > > +	case 56:
> > > +		_pp[6] = 0;
> > > +		_pp[5] = 0;
> > > +		_pp[4] = 0;
> > > +		_pp[3] = 0;
> > > +		_pp[2] = 0;
> > > +		_pp[1] = 0;
> > > +		_pp[0] = 0;
> > > +	}
> > 
> > This just hit my eyes. I have to confess I have never seen default: to
> > be not the last one in the switch. Can we have case 64 instead or does gcc
> > complain? I would be surprised with the set of BUILD_BUG_ONs.
> 
> I can probably just replace the "default:" with "case 64:". I think I have
> seen other switch statements in the kernel without a default so odds are it
> should be okay.

Please do, there shouldn't be any need to obfuscate the code more than
necessary.
Pavel Tatashin Oct. 17, 2018, 5:08 p.m. UTC | #10
On 10/17/18 12:31 PM, Alexander Duyck wrote:
> On 10/17/2018 8:40 AM, David Laight wrote:
>> From: Pavel Tatashin
>>> Sent: 17 October 2018 16:12
>>> On 10/17/18 11:07 AM, Alexander Duyck wrote:
>>>> On 10/17/2018 1:47 AM, Michal Hocko wrote:
>>>>> On Mon 15-10-18 13:26:56, Alexander Duyck wrote:
>>>>>> This change makes it so that we use the same approach that was
>>>>>> already in
>>>>>> use on Sparc on all the architectures that support a 64b long.
>>>>>>
>>>>>> This is mostly motivated by the fact that 8 to 10 store/move
>>>>>> instructions
>>>>>> are likely always going to be faster than having to call into a
>>>>>> function
>>>>>> that is not specialized for handling page init.
>>>>>>
>>>>>> An added advantage to doing it this way is that the compiler can get
>>>>>> away
>>>>>> with combining writes in the __init_single_page call. As a result the
>>>>>> memset call will be reduced to only about 4 write operations, or at
>>>>>> least
>>>>>> that is what I am seeing with GCC 6.2 as the flags, LRU pointers, and
>>>>>> count/mapcount seem to be cancelling out at least 4 of the 8
>>>>>> assignments on
>>>>>> my system.
>>>>>>
>>>>>> One change I had to make to the function was to reduce the minimum
>>>>>> page
>>>>>> size to 56 to support some powerpc64 configurations.
>>>>>
>>>>> This really begs for numbers. I do not mind the change itself with
>>>>> some
>>>>> minor comments below.
>>>>>
>>>>> [...]
>>>>>> diff --git a/include/linux/mm.h b/include/linux/mm.h
>>>>>> index bb0de406f8e7..ec6e57a0c14e 100644
>>>>>> --- a/include/linux/mm.h
>>>>>> +++ b/include/linux/mm.h
>>>>>> @@ -102,8 +102,42 @@ static inline void set_max_mapnr(unsigned long
>>>>>> limit) { }
>>>>>>     * zeroing by defining this macro in <asm/pgtable.h>.
>>>>>>     */
>>>>>>    #ifndef mm_zero_struct_page
>>>>>
>>>>> Do we still need this ifdef? I guess we can wait for an arch which
>>>>> doesn't like this change and then add the override. I would rather go
>>>>> simple if possible.
>>>>
>>>> We probably don't, but as soon as I remove it somebody will probably
>>>> complain somewhere. I guess I could drop it for now and see if anybody
>>>> screams. Adding it back should be pretty straight forward since it
>>>> would
>>>> only be 2 lines.
>>>>
>>>>>> +#if BITS_PER_LONG == 64
>>>>>> +/* This function must be updated when the size of struct page grows
>>>>>> above 80
>>>>>> + * or reduces below 64. The idea that compiler optimizes out
>>>>>> switch()
>>>>>> + * statement, and only leaves move/store instructions
>>>>>> + */
>>>>>> +#define    mm_zero_struct_page(pp) __mm_zero_struct_page(pp)
>>>>>> +static inline void __mm_zero_struct_page(struct page *page)
>>>>>> +{
>>>>>> +    unsigned long *_pp = (void *)page;
>>>>>> +
>>>>>> +     /* Check that struct page is either 56, 64, 72, or 80 bytes */
>>>>>> +    BUILD_BUG_ON(sizeof(struct page) & 7);
>>>>>> +    BUILD_BUG_ON(sizeof(struct page) < 56);
>>>>>> +    BUILD_BUG_ON(sizeof(struct page) > 80);
>>>>>> +
>>>>>> +    switch (sizeof(struct page)) {
>>>>>> +    case 80:
>>>>>> +        _pp[9] = 0;    /* fallthrough */
>>>>>> +    case 72:
>>>>>> +        _pp[8] = 0;    /* fallthrough */
>>>>>> +    default:
>>>>>> +        _pp[7] = 0;    /* fallthrough */
>>>>>> +    case 56:
>>>>>> +        _pp[6] = 0;
>>>>>> +        _pp[5] = 0;
>>>>>> +        _pp[4] = 0;
>>>>>> +        _pp[3] = 0;
>>>>>> +        _pp[2] = 0;
>>>>>> +        _pp[1] = 0;
>>>>>> +        _pp[0] = 0;
>>>>>> +    }
>>>>>
>>>>> This just hit my eyes. I have to confess I have never seen default: to
>>>>> be not the last one in the switch. Can we have case 64 instead or does
>>>>> gcc
>>>>> complain? I would be surprised with the set of BUILD_BUG_ONs.
>>>
>>> It was me, C does not really care where default is placed, I was trying
>>> to keep stores sequential for better cache locality, but "case 64"
>>> should be OK, and even better for this purpose.
>>
>> You'd need to put memory barriers between them to force sequential
>> stores.
>> I'm also surprised that gcc doesn't inline the memset().

I meant sequential only as a hint; there is no reason for them to be
strictly sequential, and a barrier is one of the reasons why memset() is
slower compared to having these stores here, as most memset()
implementations include a barrier. As Alex said, the compiler will most
likely drop some unnecessary stores anyway because of the inlines in
__init_single_page().

Pavel

>>
>>     David
> 
> We don't need them to be sequential. The general idea is that we have to
> fill a given amount of space with 0s. After that we have some calls that
> are initializing the memory that doesn't have to be zero. Ideally the
> compiler is smart enough to realize that since we don't have barriers
> and we are performing assignments after the assignment of zero it can
> just combine the two writes into one and drop the zero assignment.
> 
> - Alex
diff mbox series

Patch

diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h
index 1393a8ac596b..22500c3be7a9 100644
--- a/arch/sparc/include/asm/pgtable_64.h
+++ b/arch/sparc/include/asm/pgtable_64.h
@@ -231,36 +231,6 @@ 
 extern struct page *mem_map_zero;
 #define ZERO_PAGE(vaddr)	(mem_map_zero)
 
-/* This macro must be updated when the size of struct page grows above 80
- * or reduces below 64.
- * The idea that compiler optimizes out switch() statement, and only
- * leaves clrx instructions
- */
-#define	mm_zero_struct_page(pp) do {					\
-	unsigned long *_pp = (void *)(pp);				\
-									\
-	 /* Check that struct page is either 64, 72, or 80 bytes */	\
-	BUILD_BUG_ON(sizeof(struct page) & 7);				\
-	BUILD_BUG_ON(sizeof(struct page) < 64);				\
-	BUILD_BUG_ON(sizeof(struct page) > 80);				\
-									\
-	switch (sizeof(struct page)) {					\
-	case 80:							\
-		_pp[9] = 0;	/* fallthrough */			\
-	case 72:							\
-		_pp[8] = 0;	/* fallthrough */			\
-	default:							\
-		_pp[7] = 0;						\
-		_pp[6] = 0;						\
-		_pp[5] = 0;						\
-		_pp[4] = 0;						\
-		_pp[3] = 0;						\
-		_pp[2] = 0;						\
-		_pp[1] = 0;						\
-		_pp[0] = 0;						\
-	}								\
-} while (0)
-
 /* PFNs are real physical page numbers.  However, mem_map only begins to record
  * per-page information starting at pfn_base.  This is to handle systems where
  * the first physical page in the machine is at some huge physical address,
diff --git a/include/linux/mm.h b/include/linux/mm.h
index bb0de406f8e7..ec6e57a0c14e 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -102,8 +102,42 @@  static inline void set_max_mapnr(unsigned long limit) { }
  * zeroing by defining this macro in <asm/pgtable.h>.
  */
 #ifndef mm_zero_struct_page
+#if BITS_PER_LONG == 64
+/* This function must be updated when the size of struct page grows above 80
+ * or reduces below 56. The idea is that the compiler optimizes out the
+ * switch() statement, and only leaves move/store instructions.
+ */
+#define	mm_zero_struct_page(pp) __mm_zero_struct_page(pp)
+static inline void __mm_zero_struct_page(struct page *page)
+{
+	unsigned long *_pp = (void *)page;
+
+	 /* Check that struct page is either 56, 64, 72, or 80 bytes */
+	BUILD_BUG_ON(sizeof(struct page) & 7);
+	BUILD_BUG_ON(sizeof(struct page) < 56);
+	BUILD_BUG_ON(sizeof(struct page) > 80);
+
+	switch (sizeof(struct page)) {
+	case 80:
+		_pp[9] = 0;	/* fallthrough */
+	case 72:
+		_pp[8] = 0;	/* fallthrough */
+	default:
+		_pp[7] = 0;	/* fallthrough */
+	case 56:
+		_pp[6] = 0;
+		_pp[5] = 0;
+		_pp[4] = 0;
+		_pp[3] = 0;
+		_pp[2] = 0;
+		_pp[1] = 0;
+		_pp[0] = 0;
+	}
+}
+#else
 #define mm_zero_struct_page(pp)  ((void)memset((pp), 0, sizeof(struct page)))
 #endif
+#endif
 
 /*
  * Default maximum number of active map areas, this limits the number of vmas