diff mbox series

[v6,6/8] powerpc/pmem: Avoid the barrier in flush routines

Message ID 20200629135722.73558-7-aneesh.kumar@linux.ibm.com (mailing list archive)
State Superseded
Headers show
Series Support new pmem flush and sync instructions for POWER | expand

Commit Message

Aneesh Kumar K V June 29, 2020, 1:57 p.m. UTC
nvdimm expect the flush routines to just mark the cache clean. The barrier
that mark the store globally visible is done in nvdimm_flush().

Update the papr_scm driver to a simplified nvdim_flush callback that do
only the required barrier.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
 arch/powerpc/lib/pmem.c                   |  6 ------
 arch/powerpc/platforms/pseries/papr_scm.c | 13 +++++++++++++
 2 files changed, 13 insertions(+), 6 deletions(-)

Comments

Michal Suchánek June 29, 2020, 4:09 p.m. UTC | #1
Hello,

On Mon, Jun 29, 2020 at 07:27:20PM +0530, Aneesh Kumar K.V wrote:
> nvdimm expect the flush routines to just mark the cache clean. The barrier
> that mark the store globally visible is done in nvdimm_flush().
> 
> Update the papr_scm driver to a simplified nvdim_flush callback that do
> only the required barrier.
> 
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
> ---
>  arch/powerpc/lib/pmem.c                   |  6 ------
>  arch/powerpc/platforms/pseries/papr_scm.c | 13 +++++++++++++
>  2 files changed, 13 insertions(+), 6 deletions(-)
> 
> diff --git a/arch/powerpc/lib/pmem.c b/arch/powerpc/lib/pmem.c
> index 5a61aaeb6930..21210fa676e5 100644
> --- a/arch/powerpc/lib/pmem.c
> +++ b/arch/powerpc/lib/pmem.c
> @@ -19,9 +19,6 @@ static inline void __clean_pmem_range(unsigned long start, unsigned long stop)
>  
>  	for (i = 0; i < size >> shift; i++, addr += bytes)
>  		asm volatile(PPC_DCBSTPS(%0, %1): :"i"(0), "r"(addr): "memory");
> -
> -
> -	asm volatile(PPC_PHWSYNC ::: "memory");
>  }
>  
>  static inline void __flush_pmem_range(unsigned long start, unsigned long stop)
> @@ -34,9 +31,6 @@ static inline void __flush_pmem_range(unsigned long start, unsigned long stop)
>  
>  	for (i = 0; i < size >> shift; i++, addr += bytes)
>  		asm volatile(PPC_DCBFPS(%0, %1): :"i"(0), "r"(addr): "memory");
> -
> -
> -	asm volatile(PPC_PHWSYNC ::: "memory");
>  }
>  
>  static inline void clean_pmem_range(unsigned long start, unsigned long stop)
> diff --git a/arch/powerpc/platforms/pseries/papr_scm.c b/arch/powerpc/platforms/pseries/papr_scm.c
> index 9c569078a09f..9a9a0766f8b6 100644
> --- a/arch/powerpc/platforms/pseries/papr_scm.c
> +++ b/arch/powerpc/platforms/pseries/papr_scm.c
> @@ -630,6 +630,18 @@ static int papr_scm_ndctl(struct nvdimm_bus_descriptor *nd_desc,
>  
>  	return 0;
>  }
> +/*
> + * We have made sure the pmem writes are done such that before calling this
> + * all the caches are flushed/clean. We use dcbf/dcbfps to ensure this. Here
> + * we just need to add the necessary barrier to make sure the above flushes
> + * are have updated persistent storage before any data access or data transfer
> + * caused by subsequent instructions is initiated.
> + */
> +static int papr_scm_flush_sync(struct nd_region *nd_region, struct bio *bio)
> +{
> +	arch_pmem_flush_barrier();
> +	return 0;
> +}
>  
>  static ssize_t flags_show(struct device *dev,
>  			  struct device_attribute *attr, char *buf)
> @@ -743,6 +755,7 @@ static int papr_scm_nvdimm_init(struct papr_scm_priv *p)
>  	ndr_desc.mapping = &mapping;
>  	ndr_desc.num_mappings = 1;
>  	ndr_desc.nd_set = &p->nd_set;
> +	ndr_desc.flush = papr_scm_flush_sync;

AFAICT currently the only device that implements flush is virtio_pmem.
How does the nfit driver get away without implementing flush?
Also the flush takes arguments that are completely unused but a user of
the pmem region must assume they are used, and call flush() on the
region rather than arch_pmem_flush_barrier() directly.  This may not
work well with md as discussed with earlier iteration of the patchest.

Thanks

Michal
Aneesh Kumar K V June 29, 2020, 8:40 p.m. UTC | #2
Michal Suchánek <msuchanek@suse.de> writes:

> Hello,
>
> On Mon, Jun 29, 2020 at 07:27:20PM +0530, Aneesh Kumar K.V wrote:
>> nvdimm expect the flush routines to just mark the cache clean. The barrier
>> that mark the store globally visible is done in nvdimm_flush().
>> 
>> Update the papr_scm driver to a simplified nvdim_flush callback that do
>> only the required barrier.
>> 
>> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
>> ---
>>  arch/powerpc/lib/pmem.c                   |  6 ------
>>  arch/powerpc/platforms/pseries/papr_scm.c | 13 +++++++++++++
>>  2 files changed, 13 insertions(+), 6 deletions(-)
>> 
>> diff --git a/arch/powerpc/lib/pmem.c b/arch/powerpc/lib/pmem.c
>> index 5a61aaeb6930..21210fa676e5 100644
>> --- a/arch/powerpc/lib/pmem.c
>> +++ b/arch/powerpc/lib/pmem.c
>> @@ -19,9 +19,6 @@ static inline void __clean_pmem_range(unsigned long start, unsigned long stop)
>>  
>>  	for (i = 0; i < size >> shift; i++, addr += bytes)
>>  		asm volatile(PPC_DCBSTPS(%0, %1): :"i"(0), "r"(addr): "memory");
>> -
>> -
>> -	asm volatile(PPC_PHWSYNC ::: "memory");
>>  }
>>  
>>  static inline void __flush_pmem_range(unsigned long start, unsigned long stop)
>> @@ -34,9 +31,6 @@ static inline void __flush_pmem_range(unsigned long start, unsigned long stop)
>>  
>>  	for (i = 0; i < size >> shift; i++, addr += bytes)
>>  		asm volatile(PPC_DCBFPS(%0, %1): :"i"(0), "r"(addr): "memory");
>> -
>> -
>> -	asm volatile(PPC_PHWSYNC ::: "memory");
>>  }
>>  
>>  static inline void clean_pmem_range(unsigned long start, unsigned long stop)
>> diff --git a/arch/powerpc/platforms/pseries/papr_scm.c b/arch/powerpc/platforms/pseries/papr_scm.c
>> index 9c569078a09f..9a9a0766f8b6 100644
>> --- a/arch/powerpc/platforms/pseries/papr_scm.c
>> +++ b/arch/powerpc/platforms/pseries/papr_scm.c
>> @@ -630,6 +630,18 @@ static int papr_scm_ndctl(struct nvdimm_bus_descriptor *nd_desc,
>>  
>>  	return 0;
>>  }
>> +/*
>> + * We have made sure the pmem writes are done such that before calling this
>> + * all the caches are flushed/clean. We use dcbf/dcbfps to ensure this. Here
>> + * we just need to add the necessary barrier to make sure the above flushes
>> + * are have updated persistent storage before any data access or data transfer
>> + * caused by subsequent instructions is initiated.
>> + */
>> +static int papr_scm_flush_sync(struct nd_region *nd_region, struct bio *bio)
>> +{
>> +	arch_pmem_flush_barrier();
>> +	return 0;
>> +}
>>  
>>  static ssize_t flags_show(struct device *dev,
>>  			  struct device_attribute *attr, char *buf)
>> @@ -743,6 +755,7 @@ static int papr_scm_nvdimm_init(struct papr_scm_priv *p)
>>  	ndr_desc.mapping = &mapping;
>>  	ndr_desc.num_mappings = 1;
>>  	ndr_desc.nd_set = &p->nd_set;
>> +	ndr_desc.flush = papr_scm_flush_sync;
>
> AFAICT currently the only device that implements flush is virtio_pmem.
> How does the nfit driver get away without implementing flush?

generic_nvdimm_flush does the required barrier for nfit. The reason for
adding ndr_desc.flush call back for papr_scm was to avoid the usage
of iomem based deep flushing (ndr_region_data.flush_wpq) which is not
supported by papr_scm.

BTW we do return NULL for ndrd_get_flush_wpq() on power. So the upstream
code also does the same thing, but in a different way.


> Also the flush takes arguments that are completely unused but a user of
> the pmem region must assume they are used, and call flush() on the
> region rather than arch_pmem_flush_barrier() directly.

The bio argument can help a pmem driver to do range based flushing in
case of pmem_make_request. If bio is null then we must assume a full
device flush. 

>This may not
> work well with md as discussed with earlier iteration of the patchest.
>

dm-writecache needs some major changes to work with asynchronous pmem
devices. 

-aneesh
Dan Williams June 30, 2020, 1:50 a.m. UTC | #3
On Mon, Jun 29, 2020 at 1:41 PM Aneesh Kumar K.V
<aneesh.kumar@linux.ibm.com> wrote:
>
> Michal Suchánek <msuchanek@suse.de> writes:
>
> > Hello,
> >
> > On Mon, Jun 29, 2020 at 07:27:20PM +0530, Aneesh Kumar K.V wrote:
> >> nvdimm expect the flush routines to just mark the cache clean. The barrier
> >> that mark the store globally visible is done in nvdimm_flush().
> >>
> >> Update the papr_scm driver to a simplified nvdim_flush callback that do
> >> only the required barrier.
> >>
> >> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
> >> ---
> >>  arch/powerpc/lib/pmem.c                   |  6 ------
> >>  arch/powerpc/platforms/pseries/papr_scm.c | 13 +++++++++++++
> >>  2 files changed, 13 insertions(+), 6 deletions(-)
> >>
> >> diff --git a/arch/powerpc/lib/pmem.c b/arch/powerpc/lib/pmem.c
> >> index 5a61aaeb6930..21210fa676e5 100644
> >> --- a/arch/powerpc/lib/pmem.c
> >> +++ b/arch/powerpc/lib/pmem.c
> >> @@ -19,9 +19,6 @@ static inline void __clean_pmem_range(unsigned long start, unsigned long stop)
> >>
> >>      for (i = 0; i < size >> shift; i++, addr += bytes)
> >>              asm volatile(PPC_DCBSTPS(%0, %1): :"i"(0), "r"(addr): "memory");
> >> -
> >> -
> >> -    asm volatile(PPC_PHWSYNC ::: "memory");
> >>  }
> >>
> >>  static inline void __flush_pmem_range(unsigned long start, unsigned long stop)
> >> @@ -34,9 +31,6 @@ static inline void __flush_pmem_range(unsigned long start, unsigned long stop)
> >>
> >>      for (i = 0; i < size >> shift; i++, addr += bytes)
> >>              asm volatile(PPC_DCBFPS(%0, %1): :"i"(0), "r"(addr): "memory");
> >> -
> >> -
> >> -    asm volatile(PPC_PHWSYNC ::: "memory");
> >>  }
> >>
> >>  static inline void clean_pmem_range(unsigned long start, unsigned long stop)
> >> diff --git a/arch/powerpc/platforms/pseries/papr_scm.c b/arch/powerpc/platforms/pseries/papr_scm.c
> >> index 9c569078a09f..9a9a0766f8b6 100644
> >> --- a/arch/powerpc/platforms/pseries/papr_scm.c
> >> +++ b/arch/powerpc/platforms/pseries/papr_scm.c
> >> @@ -630,6 +630,18 @@ static int papr_scm_ndctl(struct nvdimm_bus_descriptor *nd_desc,
> >>
> >>      return 0;
> >>  }
> >> +/*
> >> + * We have made sure the pmem writes are done such that before calling this
> >> + * all the caches are flushed/clean. We use dcbf/dcbfps to ensure this. Here
> >> + * we just need to add the necessary barrier to make sure the above flushes
> >> + * are have updated persistent storage before any data access or data transfer
> >> + * caused by subsequent instructions is initiated.
> >> + */
> >> +static int papr_scm_flush_sync(struct nd_region *nd_region, struct bio *bio)
> >> +{
> >> +    arch_pmem_flush_barrier();
> >> +    return 0;
> >> +}
> >>
> >>  static ssize_t flags_show(struct device *dev,
> >>                        struct device_attribute *attr, char *buf)
> >> @@ -743,6 +755,7 @@ static int papr_scm_nvdimm_init(struct papr_scm_priv *p)
> >>      ndr_desc.mapping = &mapping;
> >>      ndr_desc.num_mappings = 1;
> >>      ndr_desc.nd_set = &p->nd_set;
> >> +    ndr_desc.flush = papr_scm_flush_sync;
> >
> > AFAICT currently the only device that implements flush is virtio_pmem.
> > How does the nfit driver get away without implementing flush?
>
> generic_nvdimm_flush does the required barrier for nfit. The reason for
> adding ndr_desc.flush call back for papr_scm was to avoid the usage
> of iomem based deep flushing (ndr_region_data.flush_wpq) which is not
> supported by papr_scm.
>
> BTW we do return NULL for ndrd_get_flush_wpq() on power. So the upstream
> code also does the same thing, but in a different way.
>
>
> > Also the flush takes arguments that are completely unused but a user of
> > the pmem region must assume they are used, and call flush() on the
> > region rather than arch_pmem_flush_barrier() directly.
>
> The bio argument can help a pmem driver to do range based flushing in
> case of pmem_make_request. If bio is null then we must assume a full
> device flush.

The bio argument isn't for range based flushing, it is for flush
operations that need to complete asynchronously.

There's no mechanism for the block layer to communicate range based
cache flushing, block-device flushing is assumed to be the device's
entire cache. For pmem that would be the entirety of the cpu cache.
Instead of modeling the cpu cache as a storage device cache it is
modeled as page-cache. Once the fs-layer writes back page-cache /
cpu-cache the storage device is only responsible for flushing those
cache-writes into the persistence domain.

Additionally there is a concept of deep-flush that relegates some
power-fail scenarios to a smaller failure domain. For example consider
the difference between a write arriving at the head of a device-queue
and successfully traversing a device-queue to media. The expectation
of pmem applications is that data is persisted once they reach the
equivalent of the x86 ADR domain, deep-flush is past ADR.
Michal Suchánek June 30, 2020, 8:54 a.m. UTC | #4
On Mon, Jun 29, 2020 at 06:50:15PM -0700, Dan Williams wrote:
> On Mon, Jun 29, 2020 at 1:41 PM Aneesh Kumar K.V
> <aneesh.kumar@linux.ibm.com> wrote:
> >
> > Michal Suchánek <msuchanek@suse.de> writes:
> >
> > > Hello,
> > >
> > > On Mon, Jun 29, 2020 at 07:27:20PM +0530, Aneesh Kumar K.V wrote:
> > >> nvdimm expect the flush routines to just mark the cache clean. The barrier
> > >> that mark the store globally visible is done in nvdimm_flush().
> > >>
> > >> Update the papr_scm driver to a simplified nvdim_flush callback that do
> > >> only the required barrier.
> > >>
> > >> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
> > >> ---
> > >>  arch/powerpc/lib/pmem.c                   |  6 ------
> > >>  arch/powerpc/platforms/pseries/papr_scm.c | 13 +++++++++++++
> > >>  2 files changed, 13 insertions(+), 6 deletions(-)
> > >>
> > >> diff --git a/arch/powerpc/lib/pmem.c b/arch/powerpc/lib/pmem.c
> > >> index 5a61aaeb6930..21210fa676e5 100644
> > >> --- a/arch/powerpc/lib/pmem.c
> > >> +++ b/arch/powerpc/lib/pmem.c
> > >> @@ -19,9 +19,6 @@ static inline void __clean_pmem_range(unsigned long start, unsigned long stop)
> > >>
> > >>      for (i = 0; i < size >> shift; i++, addr += bytes)
> > >>              asm volatile(PPC_DCBSTPS(%0, %1): :"i"(0), "r"(addr): "memory");
> > >> -
> > >> -
> > >> -    asm volatile(PPC_PHWSYNC ::: "memory");
> > >>  }
> > >>
> > >>  static inline void __flush_pmem_range(unsigned long start, unsigned long stop)
> > >> @@ -34,9 +31,6 @@ static inline void __flush_pmem_range(unsigned long start, unsigned long stop)
> > >>
> > >>      for (i = 0; i < size >> shift; i++, addr += bytes)
> > >>              asm volatile(PPC_DCBFPS(%0, %1): :"i"(0), "r"(addr): "memory");
> > >> -
> > >> -
> > >> -    asm volatile(PPC_PHWSYNC ::: "memory");
> > >>  }
> > >>
> > >>  static inline void clean_pmem_range(unsigned long start, unsigned long stop)
> > >> diff --git a/arch/powerpc/platforms/pseries/papr_scm.c b/arch/powerpc/platforms/pseries/papr_scm.c
> > >> index 9c569078a09f..9a9a0766f8b6 100644
> > >> --- a/arch/powerpc/platforms/pseries/papr_scm.c
> > >> +++ b/arch/powerpc/platforms/pseries/papr_scm.c
> > >> @@ -630,6 +630,18 @@ static int papr_scm_ndctl(struct nvdimm_bus_descriptor *nd_desc,
> > >>
> > >>      return 0;
> > >>  }
> > >> +/*
> > >> + * We have made sure the pmem writes are done such that before calling this
> > >> + * all the caches are flushed/clean. We use dcbf/dcbfps to ensure this. Here
> > >> + * we just need to add the necessary barrier to make sure the above flushes
> > >> + * are have updated persistent storage before any data access or data transfer
> > >> + * caused by subsequent instructions is initiated.
> > >> + */
> > >> +static int papr_scm_flush_sync(struct nd_region *nd_region, struct bio *bio)
> > >> +{
> > >> +    arch_pmem_flush_barrier();
> > >> +    return 0;
> > >> +}
> > >>
> > >>  static ssize_t flags_show(struct device *dev,
> > >>                        struct device_attribute *attr, char *buf)
> > >> @@ -743,6 +755,7 @@ static int papr_scm_nvdimm_init(struct papr_scm_priv *p)
> > >>      ndr_desc.mapping = &mapping;
> > >>      ndr_desc.num_mappings = 1;
> > >>      ndr_desc.nd_set = &p->nd_set;
> > >> +    ndr_desc.flush = papr_scm_flush_sync;
> > >
> > > AFAICT currently the only device that implements flush is virtio_pmem.
> > > How does the nfit driver get away without implementing flush?
> >
> > generic_nvdimm_flush does the required barrier for nfit. The reason for
> > adding ndr_desc.flush call back for papr_scm was to avoid the usage
> > of iomem based deep flushing (ndr_region_data.flush_wpq) which is not
> > supported by papr_scm.
> >
> > BTW we do return NULL for ndrd_get_flush_wpq() on power. So the upstream
> > code also does the same thing, but in a different way.
> >
> >
> > > Also the flush takes arguments that are completely unused but a user of
> > > the pmem region must assume they are used, and call flush() on the
> > > region rather than arch_pmem_flush_barrier() directly.
> >
> > The bio argument can help a pmem driver to do range based flushing in
> > case of pmem_make_request. If bio is null then we must assume a full
> > device flush.
> 
> The bio argument isn't for range based flushing, it is for flush
> operations that need to complete asynchronously.
How does the block layer determine that the pmem device needs
asynchronous fushing?

The flush() was designed for the purpose with the bio argument and only
virtio_pmem which is fulshed asynchronously used it. Now that papr_scm
resuses it fir different purpose how do you tell?

Thanks

Michal
Aneesh Kumar K V June 30, 2020, 9:20 a.m. UTC | #5
On 6/30/20 2:24 PM, Michal Suchánek wrote:
> On Mon, Jun 29, 2020 at 06:50:15PM -0700, Dan Williams wrote:
>> On Mon, Jun 29, 2020 at 1:41 PM Aneesh Kumar K.V
>> <aneesh.kumar@linux.ibm.com> wrote:
>>>
>>> Michal Suchánek <msuchanek@suse.de> writes:
>>>
>>>> Hello,
>>>>
>>>> On Mon, Jun 29, 2020 at 07:27:20PM +0530, Aneesh Kumar K.V wrote:
>>>>> nvdimm expect the flush routines to just mark the cache clean. The barrier
>>>>> that mark the store globally visible is done in nvdimm_flush().
>>>>>
>>>>> Update the papr_scm driver to a simplified nvdim_flush callback that do
>>>>> only the required barrier.
>>>>>
>>>>> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
>>>>> ---
>>>>>   arch/powerpc/lib/pmem.c                   |  6 ------
>>>>>   arch/powerpc/platforms/pseries/papr_scm.c | 13 +++++++++++++
>>>>>   2 files changed, 13 insertions(+), 6 deletions(-)
>>>>>
>>>>> diff --git a/arch/powerpc/lib/pmem.c b/arch/powerpc/lib/pmem.c
>>>>> index 5a61aaeb6930..21210fa676e5 100644
>>>>> --- a/arch/powerpc/lib/pmem.c
>>>>> +++ b/arch/powerpc/lib/pmem.c
>>>>> @@ -19,9 +19,6 @@ static inline void __clean_pmem_range(unsigned long start, unsigned long stop)
>>>>>
>>>>>       for (i = 0; i < size >> shift; i++, addr += bytes)
>>>>>               asm volatile(PPC_DCBSTPS(%0, %1): :"i"(0), "r"(addr): "memory");
>>>>> -
>>>>> -
>>>>> -    asm volatile(PPC_PHWSYNC ::: "memory");
>>>>>   }
>>>>>
>>>>>   static inline void __flush_pmem_range(unsigned long start, unsigned long stop)
>>>>> @@ -34,9 +31,6 @@ static inline void __flush_pmem_range(unsigned long start, unsigned long stop)
>>>>>
>>>>>       for (i = 0; i < size >> shift; i++, addr += bytes)
>>>>>               asm volatile(PPC_DCBFPS(%0, %1): :"i"(0), "r"(addr): "memory");
>>>>> -
>>>>> -
>>>>> -    asm volatile(PPC_PHWSYNC ::: "memory");
>>>>>   }
>>>>>
>>>>>   static inline void clean_pmem_range(unsigned long start, unsigned long stop)
>>>>> diff --git a/arch/powerpc/platforms/pseries/papr_scm.c b/arch/powerpc/platforms/pseries/papr_scm.c
>>>>> index 9c569078a09f..9a9a0766f8b6 100644
>>>>> --- a/arch/powerpc/platforms/pseries/papr_scm.c
>>>>> +++ b/arch/powerpc/platforms/pseries/papr_scm.c
>>>>> @@ -630,6 +630,18 @@ static int papr_scm_ndctl(struct nvdimm_bus_descriptor *nd_desc,
>>>>>
>>>>>       return 0;
>>>>>   }
>>>>> +/*
>>>>> + * We have made sure the pmem writes are done such that before calling this
>>>>> + * all the caches are flushed/clean. We use dcbf/dcbfps to ensure this. Here
>>>>> + * we just need to add the necessary barrier to make sure the above flushes
>>>>> + * are have updated persistent storage before any data access or data transfer
>>>>> + * caused by subsequent instructions is initiated.
>>>>> + */
>>>>> +static int papr_scm_flush_sync(struct nd_region *nd_region, struct bio *bio)
>>>>> +{
>>>>> +    arch_pmem_flush_barrier();
>>>>> +    return 0;
>>>>> +}
>>>>>
>>>>>   static ssize_t flags_show(struct device *dev,
>>>>>                         struct device_attribute *attr, char *buf)
>>>>> @@ -743,6 +755,7 @@ static int papr_scm_nvdimm_init(struct papr_scm_priv *p)
>>>>>       ndr_desc.mapping = &mapping;
>>>>>       ndr_desc.num_mappings = 1;
>>>>>       ndr_desc.nd_set = &p->nd_set;
>>>>> +    ndr_desc.flush = papr_scm_flush_sync;
>>>>
>>>> AFAICT currently the only device that implements flush is virtio_pmem.
>>>> How does the nfit driver get away without implementing flush?
>>>
>>> generic_nvdimm_flush does the required barrier for nfit. The reason for
>>> adding ndr_desc.flush call back for papr_scm was to avoid the usage
>>> of iomem based deep flushing (ndr_region_data.flush_wpq) which is not
>>> supported by papr_scm.
>>>
>>> BTW we do return NULL for ndrd_get_flush_wpq() on power. So the upstream
>>> code also does the same thing, but in a different way.
>>>
>>>
>>>> Also the flush takes arguments that are completely unused but a user of
>>>> the pmem region must assume they are used, and call flush() on the
>>>> region rather than arch_pmem_flush_barrier() directly.
>>>
>>> The bio argument can help a pmem driver to do range based flushing in
>>> case of pmem_make_request. If bio is null then we must assume a full
>>> device flush.
>>
>> The bio argument isn't for range based flushing, it is for flush
>> operations that need to complete asynchronously.
> How does the block layer determine that the pmem device needs
> asynchronous fushing?
> 

	set_bit(ND_REGION_ASYNC, &ndr_desc.flags);
	
and dax_synchronous(dev)

> The flush() was designed for the purpose with the bio argument and only
> virtio_pmem which is fulshed asynchronously used it. Now that papr_scm
> resuses it fir different purpose how do you tell?
> 

-aneesh
Dan Williams June 30, 2020, 7:45 p.m. UTC | #6
On Tue, Jun 30, 2020 at 2:21 AM Aneesh Kumar K.V
<aneesh.kumar@linux.ibm.com> wrote:
[..]
> >> The bio argument isn't for range based flushing, it is for flush
> >> operations that need to complete asynchronously.
> > How does the block layer determine that the pmem device needs
> > asynchronous fushing?
> >
>
>         set_bit(ND_REGION_ASYNC, &ndr_desc.flags);
>
> and dax_synchronous(dev)

Yes, but I think it is overkill to have an indirect function call just
for a single instruction.

How about something like this instead, to share a common pmem_wmb()
across x86 and powerpc.

diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c
index 20ff30c2ab93..b14009060c83 100644
--- a/drivers/nvdimm/region_devs.c
+++ b/drivers/nvdimm/region_devs.c
@@ -1180,6 +1180,13 @@ int nvdimm_flush(struct nd_region *nd_region,
struct bio *bio)
 {
        int rc = 0;

+       /*
+        * pmem_wmb() is needed to 'sfence' all previous writes such
+        * that they are architecturally visible for the platform buffer
+        * flush.
+        */
+       pmem_wmb();
+
        if (!nd_region->flush)
                rc = generic_nvdimm_flush(nd_region);
        else {
@@ -1206,17 +1213,14 @@ int generic_nvdimm_flush(struct nd_region *nd_region)
        idx = this_cpu_add_return(flush_idx, hash_32(current->pid + idx, 8));

        /*
-        * The first wmb() is needed to 'sfence' all previous writes
-        * such that they are architecturally visible for the platform
-        * buffer flush.  Note that we've already arranged for pmem
-        * writes to avoid the cache via memcpy_flushcache().  The final
-        * wmb() ensures ordering for the NVDIMM flush write.
+        * Note that we've already arranged for pmem writes to avoid the
+        * cache via memcpy_flushcache().  The final wmb() ensures
+        * ordering for the NVDIMM flush write.
         */
-       wmb();
        for (i = 0; i < nd_region->ndr_mappings; i++)
                if (ndrd_get_flush_wpq(ndrd, i, 0))
                        writeq(1, ndrd_get_flush_wpq(ndrd, i, idx));
-       wmb();
+       pmem_wmb();

        return 0;
 }
Aneesh Kumar K V July 1, 2020, 3:09 a.m. UTC | #7
On 7/1/20 1:15 AM, Dan Williams wrote:
> On Tue, Jun 30, 2020 at 2:21 AM Aneesh Kumar K.V
> <aneesh.kumar@linux.ibm.com> wrote:
> [..]
>>>> The bio argument isn't for range based flushing, it is for flush
>>>> operations that need to complete asynchronously.
>>> How does the block layer determine that the pmem device needs
>>> asynchronous fushing?
>>>
>>
>>          set_bit(ND_REGION_ASYNC, &ndr_desc.flags);
>>
>> and dax_synchronous(dev)
> 
> Yes, but I think it is overkill to have an indirect function call just
> for a single instruction.
> 
> How about something like this instead, to share a common pmem_wmb()
> across x86 and powerpc.
> 
> diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c
> index 20ff30c2ab93..b14009060c83 100644
> --- a/drivers/nvdimm/region_devs.c
> +++ b/drivers/nvdimm/region_devs.c
> @@ -1180,6 +1180,13 @@ int nvdimm_flush(struct nd_region *nd_region,
> struct bio *bio)
>   {
>          int rc = 0;
> 
> +       /*
> +        * pmem_wmb() is needed to 'sfence' all previous writes such
> +        * that they are architecturally visible for the platform buffer
> +        * flush.
> +        */
> +       pmem_wmb();
> +
>          if (!nd_region->flush)
>                  rc = generic_nvdimm_flush(nd_region);
>          else {
> @@ -1206,17 +1213,14 @@ int generic_nvdimm_flush(struct nd_region *nd_region)
>          idx = this_cpu_add_return(flush_idx, hash_32(current->pid + idx, 8));
> 
>          /*
> -        * The first wmb() is needed to 'sfence' all previous writes
> -        * such that they are architecturally visible for the platform
> -        * buffer flush.  Note that we've already arranged for pmem
> -        * writes to avoid the cache via memcpy_flushcache().  The final
> -        * wmb() ensures ordering for the NVDIMM flush write.
> +        * Note that we've already arranged for pmem writes to avoid the
> +        * cache via memcpy_flushcache().  The final wmb() ensures
> +        * ordering for the NVDIMM flush write.
>           */
> -       wmb();


The series already convert this to pmem_wmb().

>          for (i = 0; i < nd_region->ndr_mappings; i++)
>                  if (ndrd_get_flush_wpq(ndrd, i, 0))
>                          writeq(1, ndrd_get_flush_wpq(ndrd, i, idx));
> -       wmb();
> +       pmem_wmb();


Should this be pmem_wmb()? This is ordering the above writeq() right?

> 
>          return 0;
>   }
> 

This still results in two pmem_wmb() on platforms that doesn't have 
flush_wpq. I was trying to avoid that by adding a nd_region->flush call 
back.

-aneesh
Dan Williams July 1, 2020, 5:08 a.m. UTC | #8
On Tue, Jun 30, 2020 at 8:09 PM Aneesh Kumar K.V
<aneesh.kumar@linux.ibm.com> wrote:
>
> On 7/1/20 1:15 AM, Dan Williams wrote:
> > On Tue, Jun 30, 2020 at 2:21 AM Aneesh Kumar K.V
> > <aneesh.kumar@linux.ibm.com> wrote:
> > [..]
> >>>> The bio argument isn't for range based flushing, it is for flush
> >>>> operations that need to complete asynchronously.
> >>> How does the block layer determine that the pmem device needs
> >>> asynchronous fushing?
> >>>
> >>
> >>          set_bit(ND_REGION_ASYNC, &ndr_desc.flags);
> >>
> >> and dax_synchronous(dev)
> >
> > Yes, but I think it is overkill to have an indirect function call just
> > for a single instruction.
> >
> > How about something like this instead, to share a common pmem_wmb()
> > across x86 and powerpc.
> >
> > diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c
> > index 20ff30c2ab93..b14009060c83 100644
> > --- a/drivers/nvdimm/region_devs.c
> > +++ b/drivers/nvdimm/region_devs.c
> > @@ -1180,6 +1180,13 @@ int nvdimm_flush(struct nd_region *nd_region,
> > struct bio *bio)
> >   {
> >          int rc = 0;
> >
> > +       /*
> > +        * pmem_wmb() is needed to 'sfence' all previous writes such
> > +        * that they are architecturally visible for the platform buffer
> > +        * flush.
> > +        */
> > +       pmem_wmb();
> > +
> >          if (!nd_region->flush)
> >                  rc = generic_nvdimm_flush(nd_region);
> >          else {
> > @@ -1206,17 +1213,14 @@ int generic_nvdimm_flush(struct nd_region *nd_region)
> >          idx = this_cpu_add_return(flush_idx, hash_32(current->pid + idx, 8));
> >
> >          /*
> > -        * The first wmb() is needed to 'sfence' all previous writes
> > -        * such that they are architecturally visible for the platform
> > -        * buffer flush.  Note that we've already arranged for pmem
> > -        * writes to avoid the cache via memcpy_flushcache().  The final
> > -        * wmb() ensures ordering for the NVDIMM flush write.
> > +        * Note that we've already arranged for pmem writes to avoid the
> > +        * cache via memcpy_flushcache().  The final wmb() ensures
> > +        * ordering for the NVDIMM flush write.
> >           */
> > -       wmb();
>
>
> The series already convert this to pmem_wmb().
>
> >          for (i = 0; i < nd_region->ndr_mappings; i++)
> >                  if (ndrd_get_flush_wpq(ndrd, i, 0))
> >                          writeq(1, ndrd_get_flush_wpq(ndrd, i, idx));
> > -       wmb();
> > +       pmem_wmb();
>
>
> Should this be pmem_wmb()? This is ordering the above writeq() right?

Correct, this can just be wmb().

>
> >
> >          return 0;
> >   }
> >
>
> This still results in two pmem_wmb() on platforms that doesn't have
> flush_wpq. I was trying to avoid that by adding a nd_region->flush call
> back.

How about skip or exit early out of generic_nvdimm_flush if
ndrd->flush_wpq is NULL? That still saves an indirect branch at the
cost of another conditional, but that should still be worth it.
diff mbox series

Patch

diff --git a/arch/powerpc/lib/pmem.c b/arch/powerpc/lib/pmem.c
index 5a61aaeb6930..21210fa676e5 100644
--- a/arch/powerpc/lib/pmem.c
+++ b/arch/powerpc/lib/pmem.c
@@ -19,9 +19,6 @@  static inline void __clean_pmem_range(unsigned long start, unsigned long stop)
 
 	for (i = 0; i < size >> shift; i++, addr += bytes)
 		asm volatile(PPC_DCBSTPS(%0, %1): :"i"(0), "r"(addr): "memory");
-
-
-	asm volatile(PPC_PHWSYNC ::: "memory");
 }
 
 static inline void __flush_pmem_range(unsigned long start, unsigned long stop)
@@ -34,9 +31,6 @@  static inline void __flush_pmem_range(unsigned long start, unsigned long stop)
 
 	for (i = 0; i < size >> shift; i++, addr += bytes)
 		asm volatile(PPC_DCBFPS(%0, %1): :"i"(0), "r"(addr): "memory");
-
-
-	asm volatile(PPC_PHWSYNC ::: "memory");
 }
 
 static inline void clean_pmem_range(unsigned long start, unsigned long stop)
diff --git a/arch/powerpc/platforms/pseries/papr_scm.c b/arch/powerpc/platforms/pseries/papr_scm.c
index 9c569078a09f..9a9a0766f8b6 100644
--- a/arch/powerpc/platforms/pseries/papr_scm.c
+++ b/arch/powerpc/platforms/pseries/papr_scm.c
@@ -630,6 +630,18 @@  static int papr_scm_ndctl(struct nvdimm_bus_descriptor *nd_desc,
 
 	return 0;
 }
+/*
+ * We have made sure the pmem writes are done such that before calling this
+ * all the caches are flushed/clean. We use dcbf/dcbfps to ensure this. Here
+ * we just need to add the necessary barrier to make sure the above flushes
+ * are have updated persistent storage before any data access or data transfer
+ * caused by subsequent instructions is initiated.
+ */
+static int papr_scm_flush_sync(struct nd_region *nd_region, struct bio *bio)
+{
+	arch_pmem_flush_barrier();
+	return 0;
+}
 
 static ssize_t flags_show(struct device *dev,
 			  struct device_attribute *attr, char *buf)
@@ -743,6 +755,7 @@  static int papr_scm_nvdimm_init(struct papr_scm_priv *p)
 	ndr_desc.mapping = &mapping;
 	ndr_desc.num_mappings = 1;
 	ndr_desc.nd_set = &p->nd_set;
+	ndr_desc.flush = papr_scm_flush_sync;
 
 	if (p->is_volatile)
 		p->region = nvdimm_volatile_region_create(p->bus, &ndr_desc);