Patchwork [v6,04/46] percpu_rwlock: Implement the core design of Per-CPU Reader-Writer Locks

login
register
mail settings
Submitter Lai Jiangshan
Date Feb. 25, 2013, 3:53 p.m.
Message ID <CACvQF53bdh4_BxF0y1fnTVR+T2OmRc0jmWQYftsvx92-fg-Lug@mail.gmail.com>
Download mbox | patch
Permalink /patch/222943/
State Not Applicable
Delegated to: David Miller
Headers show

Comments

Lai Jiangshan - Feb. 25, 2013, 3:53 p.m.
Hi, Srivatsa,

The target of the whole patchset is nice for me.
A question: How did you find out the such usages of
"preempt_disable()" and convert them? did all are converted?

And I think the lock is too complex and reinvent the wheel, why don't
you reuse the lglock?
I wrote an untested draft here.

Thanks,
Lai

PS: Some HA tools(I'm writing one) which takes checkpoints of
virtual-machines frequently, I guess this patchset can speedup the
tools.

From 01db542693a1b7fc6f9ece45d57cb529d9be5b66 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Mon, 25 Feb 2013 23:14:27 +0800
Subject: [PATCH] lglock: add read-preference local-global rwlock

locality via lglock(trylock)
read-preference read-write-lock via fallback rwlock_t

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
---
 include/linux/lglock.h |   31 +++++++++++++++++++++++++++++++
 kernel/lglock.c        |   45 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 76 insertions(+), 0 deletions(-)
Srivatsa S. Bhat - Feb. 25, 2013, 7:26 p.m.
Hi Lai,

On 02/25/2013 09:23 PM, Lai Jiangshan wrote:
> Hi, Srivatsa,
> 
> The target of the whole patchset is nice for me.

Cool! Thanks :-)

> A question: How did you find out the such usages of
> "preempt_disable()" and convert them? did all are converted?
> 

Well, I scanned through the source tree for usages which implicitly
disabled CPU offline and converted them over. Its not limited to uses
of preempt_disable() alone - even spin_locks, rwlocks, local_irq_disable()
etc also help disable CPU offline. So I tried to dig out all such uses
and converted them. However, since the merge window is open, a lot of
new code is flowing into the tree. So I'll have to rescan the tree to
see if there are any more places to convert.

> And I think the lock is too complex and reinvent the wheel, why don't
> you reuse the lglock?

lglocks? No way! ;-) See below...

> I wrote an untested draft here.
> 
> Thanks,
> Lai
> 
> PS: Some HA tools(I'm writing one) which takes checkpoints of
> virtual-machines frequently, I guess this patchset can speedup the
> tools.
> 
> From 01db542693a1b7fc6f9ece45d57cb529d9be5b66 Mon Sep 17 00:00:00 2001
> From: Lai Jiangshan <laijs@cn.fujitsu.com>
> Date: Mon, 25 Feb 2013 23:14:27 +0800
> Subject: [PATCH] lglock: add read-preference local-global rwlock
> 
> locality via lglock(trylock)
> read-preference read-write-lock via fallback rwlock_t
> 
> Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
> ---
>  include/linux/lglock.h |   31 +++++++++++++++++++++++++++++++
>  kernel/lglock.c        |   45 +++++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 76 insertions(+), 0 deletions(-)
> 
> diff --git a/include/linux/lglock.h b/include/linux/lglock.h
> index 0d24e93..30fe887 100644
> --- a/include/linux/lglock.h
> +++ b/include/linux/lglock.h
> @@ -67,4 +67,35 @@ void lg_local_unlock_cpu(struct lglock *lg, int cpu);
>  void lg_global_lock(struct lglock *lg);
>  void lg_global_unlock(struct lglock *lg);
> 
> +struct lgrwlock {
> +	unsigned long __percpu *fallback_reader_refcnt;
> +	struct lglock lglock;
> +	rwlock_t fallback_rwlock;
> +};
> +
> +#define DEFINE_LGRWLOCK(name)						\
> +	static DEFINE_PER_CPU(arch_spinlock_t, name ## _lock)		\
> +	= __ARCH_SPIN_LOCK_UNLOCKED;					\
> +	static DEFINE_PER_CPU(unsigned long, name ## _refcnt);		\
> +	struct lgrwlock name = {					\
> +		.fallback_reader_refcnt = &name ## _refcnt,		\
> +		.lglock = { .lock = &name ## _lock } }
> +
> +#define DEFINE_STATIC_LGRWLOCK(name)					\
> +	static DEFINE_PER_CPU(arch_spinlock_t, name ## _lock)		\
> +	= __ARCH_SPIN_LOCK_UNLOCKED;					\
> +	static DEFINE_PER_CPU(unsigned long, name ## _refcnt);		\
> +	static struct lgrwlock name = {					\
> +		.fallback_reader_refcnt = &name ## _refcnt,		\
> +		.lglock = { .lock = &name ## _lock } }
> +
> +static inline void lg_rwlock_init(struct lgrwlock *lgrw, char *name)
> +{
> +	lg_lock_init(&lgrw->lglock, name);
> +}
> +
> +void lg_rwlock_local_read_lock(struct lgrwlock *lgrw);
> +void lg_rwlock_local_read_unlock(struct lgrwlock *lgrw);
> +void lg_rwlock_global_write_lock(struct lgrwlock *lgrw);
> +void lg_rwlock_global_write_unlock(struct lgrwlock *lgrw);
>  #endif
> diff --git a/kernel/lglock.c b/kernel/lglock.c
> index 6535a66..463543a 100644
> --- a/kernel/lglock.c
> +++ b/kernel/lglock.c
> @@ -87,3 +87,48 @@ void lg_global_unlock(struct lglock *lg)
>  	preempt_enable();
>  }
>  EXPORT_SYMBOL(lg_global_unlock);
> +
> +void lg_rwlock_local_read_lock(struct lgrwlock *lgrw)
> +{
> +	struct lglock *lg = &lgrw->lglock;
> +
> +	preempt_disable();
> +	if (likely(!__this_cpu_read(*lgrw->fallback_reader_refcnt))) {
> +		if (likely(arch_spin_trylock(this_cpu_ptr(lg->lock)))) {
> +			rwlock_acquire_read(&lg->lock_dep_map, 0, 0, _RET_IP_);
> +			return;
> +		}
> +		read_lock(&lgrw->fallback_rwlock);
> +	}
> +
> +	__this_cpu_inc(*lgrw->fallback_reader_refcnt);
> +}
> +EXPORT_SYMBOL(lg_rwlock_local_read_lock);
> +
> +void lg_rwlock_local_read_unlock(struct lgrwlock *lgrw)
> +{
> +	if (likely(!__this_cpu_read(*lgrw->fallback_reader_refcnt))) {
> +		lg_local_unlock(&lgrw->lglock);
> +		return;
> +	}
> +
> +	if (!__this_cpu_dec_return(*lgrw->fallback_reader_refcnt))
> +		read_unlock(&lgrw->fallback_rwlock);
> +
> +	preempt_enable();
> +}
> +EXPORT_SYMBOL(lg_rwlock_local_read_unlock);
> +

If I read the code above correctly, all you are doing is implementing a
recursive reader-side primitive (ie., allowing the reader to call these
functions recursively, without resulting in a self-deadlock).

But the thing is, making the reader-side recursive is the least of our
problems! Our main challenge is to make the locking extremely flexible
and also safe-guard it against circular-locking-dependencies and deadlocks.
Please take a look at the changelog of patch 1 - it explains the situation
with an example.

> +void lg_rwlock_global_write_lock(struct lgrwlock *lgrw)
> +{
> +	lg_global_lock(&lgrw->lglock);

This does a for-loop on all CPUs and takes their locks one-by-one. That's
exactly what we want to prevent, because that is the _source_ of all our
deadlock woes in this case. In the presence of perfect lock ordering
guarantees, this wouldn't have been a problem (that's why lglocks are
being used successfully elsewhere in the kernel). In the stop-machine()
removal case, the over-flexibility of preempt_disable() forces us to provide
an equally flexible locking alternative. Hence we can't use such per-cpu
locking schemes.

You might note that, for exactly this reason, I haven't actually used any
per-cpu _locks_ in this synchronization scheme, though it is named as
"per-cpu rwlocks". The only per-cpu component here are the refcounts, and
we consciously avoid waiting/spinning on them (because then that would be
equivalent to having per-cpu locks, which are deadlock-prone). We use
global rwlocks to get the deadlock-safety that we need.

> +	write_lock(&lgrw->fallback_rwlock);
> +}
> +EXPORT_SYMBOL(lg_rwlock_global_write_lock);
> +
> +void lg_rwlock_global_write_unlock(struct lgrwlock *lgrw)
> +{
> +	write_unlock(&lgrw->fallback_rwlock);
> +	lg_global_unlock(&lgrw->lglock);
> +}
> +EXPORT_SYMBOL(lg_rwlock_global_write_unlock);
> 

Regards,
Srivatsa S. Bhat

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Lai Jiangshan - Feb. 26, 2013, 12:17 a.m.
On Tue, Feb 26, 2013 at 3:26 AM, Srivatsa S. Bhat
<srivatsa.bhat@linux.vnet.ibm.com> wrote:
> Hi Lai,
>
> On 02/25/2013 09:23 PM, Lai Jiangshan wrote:
>> Hi, Srivatsa,
>>
>> The target of the whole patchset is nice for me.
>
> Cool! Thanks :-)
>
>> A question: How did you find out the such usages of
>> "preempt_disable()" and convert them? did all are converted?
>>
>
> Well, I scanned through the source tree for usages which implicitly
> disabled CPU offline and converted them over. Its not limited to uses
> of preempt_disable() alone - even spin_locks, rwlocks, local_irq_disable()
> etc also help disable CPU offline. So I tried to dig out all such uses
> and converted them. However, since the merge window is open, a lot of
> new code is flowing into the tree. So I'll have to rescan the tree to
> see if there are any more places to convert.
>
>> And I think the lock is too complex and reinvent the wheel, why don't
>> you reuse the lglock?
>
> lglocks? No way! ;-) See below...
>
>> I wrote an untested draft here.
>>
>> Thanks,
>> Lai
>>
>> PS: Some HA tools(I'm writing one) which takes checkpoints of
>> virtual-machines frequently, I guess this patchset can speedup the
>> tools.
>>
>> From 01db542693a1b7fc6f9ece45d57cb529d9be5b66 Mon Sep 17 00:00:00 2001
>> From: Lai Jiangshan <laijs@cn.fujitsu.com>
>> Date: Mon, 25 Feb 2013 23:14:27 +0800
>> Subject: [PATCH] lglock: add read-preference local-global rwlock
>>
>> locality via lglock(trylock)
>> read-preference read-write-lock via fallback rwlock_t
>>
>> Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
>> ---
>>  include/linux/lglock.h |   31 +++++++++++++++++++++++++++++++
>>  kernel/lglock.c        |   45 +++++++++++++++++++++++++++++++++++++++++++++
>>  2 files changed, 76 insertions(+), 0 deletions(-)
>>
>> diff --git a/include/linux/lglock.h b/include/linux/lglock.h
>> index 0d24e93..30fe887 100644
>> --- a/include/linux/lglock.h
>> +++ b/include/linux/lglock.h
>> @@ -67,4 +67,35 @@ void lg_local_unlock_cpu(struct lglock *lg, int cpu);
>>  void lg_global_lock(struct lglock *lg);
>>  void lg_global_unlock(struct lglock *lg);
>>
>> +struct lgrwlock {
>> +     unsigned long __percpu *fallback_reader_refcnt;
>> +     struct lglock lglock;
>> +     rwlock_t fallback_rwlock;
>> +};
>> +
>> +#define DEFINE_LGRWLOCK(name)                                                \
>> +     static DEFINE_PER_CPU(arch_spinlock_t, name ## _lock)           \
>> +     = __ARCH_SPIN_LOCK_UNLOCKED;                                    \
>> +     static DEFINE_PER_CPU(unsigned long, name ## _refcnt);          \
>> +     struct lgrwlock name = {                                        \
>> +             .fallback_reader_refcnt = &name ## _refcnt,             \
>> +             .lglock = { .lock = &name ## _lock } }
>> +
>> +#define DEFINE_STATIC_LGRWLOCK(name)                                 \
>> +     static DEFINE_PER_CPU(arch_spinlock_t, name ## _lock)           \
>> +     = __ARCH_SPIN_LOCK_UNLOCKED;                                    \
>> +     static DEFINE_PER_CPU(unsigned long, name ## _refcnt);          \
>> +     static struct lgrwlock name = {                                 \
>> +             .fallback_reader_refcnt = &name ## _refcnt,             \
>> +             .lglock = { .lock = &name ## _lock } }
>> +
>> +static inline void lg_rwlock_init(struct lgrwlock *lgrw, char *name)
>> +{
>> +     lg_lock_init(&lgrw->lglock, name);
>> +}
>> +
>> +void lg_rwlock_local_read_lock(struct lgrwlock *lgrw);
>> +void lg_rwlock_local_read_unlock(struct lgrwlock *lgrw);
>> +void lg_rwlock_global_write_lock(struct lgrwlock *lgrw);
>> +void lg_rwlock_global_write_unlock(struct lgrwlock *lgrw);
>>  #endif
>> diff --git a/kernel/lglock.c b/kernel/lglock.c
>> index 6535a66..463543a 100644
>> --- a/kernel/lglock.c
>> +++ b/kernel/lglock.c
>> @@ -87,3 +87,48 @@ void lg_global_unlock(struct lglock *lg)
>>       preempt_enable();
>>  }
>>  EXPORT_SYMBOL(lg_global_unlock);
>> +
>> +void lg_rwlock_local_read_lock(struct lgrwlock *lgrw)
>> +{
>> +     struct lglock *lg = &lgrw->lglock;
>> +
>> +     preempt_disable();
>> +     if (likely(!__this_cpu_read(*lgrw->fallback_reader_refcnt))) {
>> +             if (likely(arch_spin_trylock(this_cpu_ptr(lg->lock)))) {
>> +                     rwlock_acquire_read(&lg->lock_dep_map, 0, 0, _RET_IP_);
>> +                     return;
>> +             }
>> +             read_lock(&lgrw->fallback_rwlock);
>> +     }
>> +
>> +     __this_cpu_inc(*lgrw->fallback_reader_refcnt);
>> +}
>> +EXPORT_SYMBOL(lg_rwlock_local_read_lock);
>> +
>> +void lg_rwlock_local_read_unlock(struct lgrwlock *lgrw)
>> +{
>> +     if (likely(!__this_cpu_read(*lgrw->fallback_reader_refcnt))) {
>> +             lg_local_unlock(&lgrw->lglock);
>> +             return;
>> +     }
>> +
>> +     if (!__this_cpu_dec_return(*lgrw->fallback_reader_refcnt))
>> +             read_unlock(&lgrw->fallback_rwlock);
>> +
>> +     preempt_enable();
>> +}
>> +EXPORT_SYMBOL(lg_rwlock_local_read_unlock);
>> +
>
> If I read the code above correctly, all you are doing is implementing a
> recursive reader-side primitive (ie., allowing the reader to call these
> functions recursively, without resulting in a self-deadlock).
>
> But the thing is, making the reader-side recursive is the least of our
> problems! Our main challenge is to make the locking extremely flexible
> and also safe-guard it against circular-locking-dependencies and deadlocks.
> Please take a look at the changelog of patch 1 - it explains the situation
> with an example.


My lock fixes your requirements(I read patch 1-6 before I sent). In
readsite, lglock 's lock is token via trylock, the lglock doesn't
contribute to deadlocks, we can consider it doesn't exist when we find
deadlock from it. And global fallback rwlock doesn't result to
deadlocks because it is read-preference(you need to inc the
fallback_reader_refcnt inside the cpu-hotplug write-side, I don't do
it in generic lgrwlock)


If lg_rwlock_local_read_lock() spins, which means
lg_rwlock_local_read_lock() spins on fallback_rwlock, and which means
lg_rwlock_global_write_lock() took the lgrwlock successfully and
return, and which means lg_rwlock_local_read_lock() will stop spinning
when the write side finished.


>
>> +void lg_rwlock_global_write_lock(struct lgrwlock *lgrw)
>> +{
>> +     lg_global_lock(&lgrw->lglock);
>
> This does a for-loop on all CPUs and takes their locks one-by-one. That's
> exactly what we want to prevent, because that is the _source_ of all our
> deadlock woes in this case. In the presence of perfect lock ordering
> guarantees, this wouldn't have been a problem (that's why lglocks are
> being used successfully elsewhere in the kernel). In the stop-machine()
> removal case, the over-flexibility of preempt_disable() forces us to provide
> an equally flexible locking alternative. Hence we can't use such per-cpu
> locking schemes.
>
> You might note that, for exactly this reason, I haven't actually used any
> per-cpu _locks_ in this synchronization scheme, though it is named as
> "per-cpu rwlocks". The only per-cpu component here are the refcounts, and
> we consciously avoid waiting/spinning on them (because then that would be
> equivalent to having per-cpu locks, which are deadlock-prone). We use
> global rwlocks to get the deadlock-safety that we need.
>
>> +     write_lock(&lgrw->fallback_rwlock);
>> +}
>> +EXPORT_SYMBOL(lg_rwlock_global_write_lock);
>> +
>> +void lg_rwlock_global_write_unlock(struct lgrwlock *lgrw)
>> +{
>> +     write_unlock(&lgrw->fallback_rwlock);
>> +     lg_global_unlock(&lgrw->lglock);
>> +}
>> +EXPORT_SYMBOL(lg_rwlock_global_write_unlock);
>>
>
> Regards,
> Srivatsa S. Bhat
>
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Lai Jiangshan - Feb. 26, 2013, 12:19 a.m.
On Tue, Feb 26, 2013 at 8:17 AM, Lai Jiangshan <eag0628@gmail.com> wrote:
> On Tue, Feb 26, 2013 at 3:26 AM, Srivatsa S. Bhat
> <srivatsa.bhat@linux.vnet.ibm.com> wrote:
>> Hi Lai,
>>
>> On 02/25/2013 09:23 PM, Lai Jiangshan wrote:
>>> Hi, Srivatsa,
>>>
>>> The target of the whole patchset is nice for me.
>>
>> Cool! Thanks :-)
>>
>>> A question: How did you find out the such usages of
>>> "preempt_disable()" and convert them? did all are converted?
>>>
>>
>> Well, I scanned through the source tree for usages which implicitly
>> disabled CPU offline and converted them over. Its not limited to uses
>> of preempt_disable() alone - even spin_locks, rwlocks, local_irq_disable()
>> etc also help disable CPU offline. So I tried to dig out all such uses
>> and converted them. However, since the merge window is open, a lot of
>> new code is flowing into the tree. So I'll have to rescan the tree to
>> see if there are any more places to convert.
>>
>>> And I think the lock is too complex and reinvent the wheel, why don't
>>> you reuse the lglock?
>>
>> lglocks? No way! ;-) See below...
>>
>>> I wrote an untested draft here.
>>>
>>> Thanks,
>>> Lai
>>>
>>> PS: Some HA tools(I'm writing one) which takes checkpoints of
>>> virtual-machines frequently, I guess this patchset can speedup the
>>> tools.
>>>
>>> From 01db542693a1b7fc6f9ece45d57cb529d9be5b66 Mon Sep 17 00:00:00 2001
>>> From: Lai Jiangshan <laijs@cn.fujitsu.com>
>>> Date: Mon, 25 Feb 2013 23:14:27 +0800
>>> Subject: [PATCH] lglock: add read-preference local-global rwlock
>>>
>>> locality via lglock(trylock)
>>> read-preference read-write-lock via fallback rwlock_t
>>>
>>> Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
>>> ---
>>>  include/linux/lglock.h |   31 +++++++++++++++++++++++++++++++
>>>  kernel/lglock.c        |   45 +++++++++++++++++++++++++++++++++++++++++++++
>>>  2 files changed, 76 insertions(+), 0 deletions(-)
>>>
>>> diff --git a/include/linux/lglock.h b/include/linux/lglock.h
>>> index 0d24e93..30fe887 100644
>>> --- a/include/linux/lglock.h
>>> +++ b/include/linux/lglock.h
>>> @@ -67,4 +67,35 @@ void lg_local_unlock_cpu(struct lglock *lg, int cpu);
>>>  void lg_global_lock(struct lglock *lg);
>>>  void lg_global_unlock(struct lglock *lg);
>>>
>>> +struct lgrwlock {
>>> +     unsigned long __percpu *fallback_reader_refcnt;
>>> +     struct lglock lglock;
>>> +     rwlock_t fallback_rwlock;
>>> +};
>>> +
>>> +#define DEFINE_LGRWLOCK(name)                                                \
>>> +     static DEFINE_PER_CPU(arch_spinlock_t, name ## _lock)           \
>>> +     = __ARCH_SPIN_LOCK_UNLOCKED;                                    \
>>> +     static DEFINE_PER_CPU(unsigned long, name ## _refcnt);          \
>>> +     struct lgrwlock name = {                                        \
>>> +             .fallback_reader_refcnt = &name ## _refcnt,             \
>>> +             .lglock = { .lock = &name ## _lock } }
>>> +
>>> +#define DEFINE_STATIC_LGRWLOCK(name)                                 \
>>> +     static DEFINE_PER_CPU(arch_spinlock_t, name ## _lock)           \
>>> +     = __ARCH_SPIN_LOCK_UNLOCKED;                                    \
>>> +     static DEFINE_PER_CPU(unsigned long, name ## _refcnt);          \
>>> +     static struct lgrwlock name = {                                 \
>>> +             .fallback_reader_refcnt = &name ## _refcnt,             \
>>> +             .lglock = { .lock = &name ## _lock } }
>>> +
>>> +static inline void lg_rwlock_init(struct lgrwlock *lgrw, char *name)
>>> +{
>>> +     lg_lock_init(&lgrw->lglock, name);
>>> +}
>>> +
>>> +void lg_rwlock_local_read_lock(struct lgrwlock *lgrw);
>>> +void lg_rwlock_local_read_unlock(struct lgrwlock *lgrw);
>>> +void lg_rwlock_global_write_lock(struct lgrwlock *lgrw);
>>> +void lg_rwlock_global_write_unlock(struct lgrwlock *lgrw);
>>>  #endif
>>> diff --git a/kernel/lglock.c b/kernel/lglock.c
>>> index 6535a66..463543a 100644
>>> --- a/kernel/lglock.c
>>> +++ b/kernel/lglock.c
>>> @@ -87,3 +87,48 @@ void lg_global_unlock(struct lglock *lg)
>>>       preempt_enable();
>>>  }
>>>  EXPORT_SYMBOL(lg_global_unlock);
>>> +
>>> +void lg_rwlock_local_read_lock(struct lgrwlock *lgrw)
>>> +{
>>> +     struct lglock *lg = &lgrw->lglock;
>>> +
>>> +     preempt_disable();
>>> +     if (likely(!__this_cpu_read(*lgrw->fallback_reader_refcnt))) {
>>> +             if (likely(arch_spin_trylock(this_cpu_ptr(lg->lock)))) {
>>> +                     rwlock_acquire_read(&lg->lock_dep_map, 0, 0, _RET_IP_);
>>> +                     return;
>>> +             }
>>> +             read_lock(&lgrw->fallback_rwlock);
>>> +     }
>>> +
>>> +     __this_cpu_inc(*lgrw->fallback_reader_refcnt);
>>> +}
>>> +EXPORT_SYMBOL(lg_rwlock_local_read_lock);
>>> +
>>> +void lg_rwlock_local_read_unlock(struct lgrwlock *lgrw)
>>> +{
>>> +     if (likely(!__this_cpu_read(*lgrw->fallback_reader_refcnt))) {
>>> +             lg_local_unlock(&lgrw->lglock);
>>> +             return;
>>> +     }
>>> +
>>> +     if (!__this_cpu_dec_return(*lgrw->fallback_reader_refcnt))
>>> +             read_unlock(&lgrw->fallback_rwlock);
>>> +
>>> +     preempt_enable();
>>> +}
>>> +EXPORT_SYMBOL(lg_rwlock_local_read_unlock);
>>> +
>>
>> If I read the code above correctly, all you are doing is implementing a
>> recursive reader-side primitive (ie., allowing the reader to call these
>> functions recursively, without resulting in a self-deadlock).
>>
>> But the thing is, making the reader-side recursive is the least of our
>> problems! Our main challenge is to make the locking extremely flexible
>> and also safe-guard it against circular-locking-dependencies and deadlocks.
>> Please take a look at the changelog of patch 1 - it explains the situation
>> with an example.
>
>
> My lock fixes your requirements(I read patch 1-6 before I sent). In

s/fixes/fits/

> readsite, lglock 's lock is token via trylock, the lglock doesn't
> contribute to deadlocks, we can consider it doesn't exist when we find
> deadlock from it. And global fallback rwlock doesn't result to
> deadlocks because it is read-preference(you need to inc the
> fallback_reader_refcnt inside the cpu-hotplug write-side, I don't do
> it in generic lgrwlock)
>
>
> If lg_rwlock_local_read_lock() spins, which means
> lg_rwlock_local_read_lock() spins on fallback_rwlock, and which means
> lg_rwlock_global_write_lock() took the lgrwlock successfully and
> return, and which means lg_rwlock_local_read_lock() will stop spinning
> when the write side finished.
>
>
>>
>>> +void lg_rwlock_global_write_lock(struct lgrwlock *lgrw)
>>> +{
>>> +     lg_global_lock(&lgrw->lglock);
>>
>> This does a for-loop on all CPUs and takes their locks one-by-one. That's
>> exactly what we want to prevent, because that is the _source_ of all our
>> deadlock woes in this case. In the presence of perfect lock ordering
>> guarantees, this wouldn't have been a problem (that's why lglocks are
>> being used successfully elsewhere in the kernel). In the stop-machine()
>> removal case, the over-flexibility of preempt_disable() forces us to provide
>> an equally flexible locking alternative. Hence we can't use such per-cpu
>> locking schemes.
>>
>> You might note that, for exactly this reason, I haven't actually used any
>> per-cpu _locks_ in this synchronization scheme, though it is named as
>> "per-cpu rwlocks". The only per-cpu component here are the refcounts, and
>> we consciously avoid waiting/spinning on them (because then that would be
>> equivalent to having per-cpu locks, which are deadlock-prone). We use
>> global rwlocks to get the deadlock-safety that we need.
>>
>>> +     write_lock(&lgrw->fallback_rwlock);
>>> +}
>>> +EXPORT_SYMBOL(lg_rwlock_global_write_lock);
>>> +
>>> +void lg_rwlock_global_write_unlock(struct lgrwlock *lgrw)
>>> +{
>>> +     write_unlock(&lgrw->fallback_rwlock);
>>> +     lg_global_unlock(&lgrw->lglock);
>>> +}
>>> +EXPORT_SYMBOL(lg_rwlock_global_write_unlock);
>>>
>>
>> Regards,
>> Srivatsa S. Bhat
>>
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Srivatsa S. Bhat - Feb. 26, 2013, 9:02 a.m.
On 02/26/2013 05:47 AM, Lai Jiangshan wrote:
> On Tue, Feb 26, 2013 at 3:26 AM, Srivatsa S. Bhat
> <srivatsa.bhat@linux.vnet.ibm.com> wrote:
>> Hi Lai,
>>
>> On 02/25/2013 09:23 PM, Lai Jiangshan wrote:
>>> Hi, Srivatsa,
>>>
>>> The target of the whole patchset is nice for me.
>>
>> Cool! Thanks :-)
>>
[...]
>>> I wrote an untested draft here.
>>>
>>> Thanks,
>>> Lai
>>>
>>> PS: Some HA tools(I'm writing one) which takes checkpoints of
>>> virtual-machines frequently, I guess this patchset can speedup the
>>> tools.
>>>
>>> From 01db542693a1b7fc6f9ece45d57cb529d9be5b66 Mon Sep 17 00:00:00 2001
>>> From: Lai Jiangshan <laijs@cn.fujitsu.com>
>>> Date: Mon, 25 Feb 2013 23:14:27 +0800
>>> Subject: [PATCH] lglock: add read-preference local-global rwlock
>>>
>>> locality via lglock(trylock)
>>> read-preference read-write-lock via fallback rwlock_t
>>>
>>> Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
>>> ---
>>>  include/linux/lglock.h |   31 +++++++++++++++++++++++++++++++
>>>  kernel/lglock.c        |   45 +++++++++++++++++++++++++++++++++++++++++++++
>>>  2 files changed, 76 insertions(+), 0 deletions(-)
>>>
>>> diff --git a/include/linux/lglock.h b/include/linux/lglock.h
>>> index 0d24e93..30fe887 100644
>>> --- a/include/linux/lglock.h
>>> +++ b/include/linux/lglock.h
>>> @@ -67,4 +67,35 @@ void lg_local_unlock_cpu(struct lglock *lg, int cpu);
>>>  void lg_global_lock(struct lglock *lg);
>>>  void lg_global_unlock(struct lglock *lg);
>>>
>>> +struct lgrwlock {
>>> +     unsigned long __percpu *fallback_reader_refcnt;
>>> +     struct lglock lglock;
>>> +     rwlock_t fallback_rwlock;
>>> +};
>>> +
>>> +#define DEFINE_LGRWLOCK(name)                                                \
>>> +     static DEFINE_PER_CPU(arch_spinlock_t, name ## _lock)           \
>>> +     = __ARCH_SPIN_LOCK_UNLOCKED;                                    \
>>> +     static DEFINE_PER_CPU(unsigned long, name ## _refcnt);          \
>>> +     struct lgrwlock name = {                                        \
>>> +             .fallback_reader_refcnt = &name ## _refcnt,             \
>>> +             .lglock = { .lock = &name ## _lock } }
>>> +
>>> +#define DEFINE_STATIC_LGRWLOCK(name)                                 \
>>> +     static DEFINE_PER_CPU(arch_spinlock_t, name ## _lock)           \
>>> +     = __ARCH_SPIN_LOCK_UNLOCKED;                                    \
>>> +     static DEFINE_PER_CPU(unsigned long, name ## _refcnt);          \
>>> +     static struct lgrwlock name = {                                 \
>>> +             .fallback_reader_refcnt = &name ## _refcnt,             \
>>> +             .lglock = { .lock = &name ## _lock } }
>>> +
>>> +static inline void lg_rwlock_init(struct lgrwlock *lgrw, char *name)
>>> +{
>>> +     lg_lock_init(&lgrw->lglock, name);
>>> +}
>>> +
>>> +void lg_rwlock_local_read_lock(struct lgrwlock *lgrw);
>>> +void lg_rwlock_local_read_unlock(struct lgrwlock *lgrw);
>>> +void lg_rwlock_global_write_lock(struct lgrwlock *lgrw);
>>> +void lg_rwlock_global_write_unlock(struct lgrwlock *lgrw);
>>>  #endif
>>> diff --git a/kernel/lglock.c b/kernel/lglock.c
>>> index 6535a66..463543a 100644
>>> --- a/kernel/lglock.c
>>> +++ b/kernel/lglock.c
>>> @@ -87,3 +87,48 @@ void lg_global_unlock(struct lglock *lg)
>>>       preempt_enable();
>>>  }
>>>  EXPORT_SYMBOL(lg_global_unlock);
>>> +
>>> +void lg_rwlock_local_read_lock(struct lgrwlock *lgrw)
>>> +{
>>> +     struct lglock *lg = &lgrw->lglock;
>>> +
>>> +     preempt_disable();
>>> +     if (likely(!__this_cpu_read(*lgrw->fallback_reader_refcnt))) {
>>> +             if (likely(arch_spin_trylock(this_cpu_ptr(lg->lock)))) {
>>> +                     rwlock_acquire_read(&lg->lock_dep_map, 0, 0, _RET_IP_);
>>> +                     return;
>>> +             }
>>> +             read_lock(&lgrw->fallback_rwlock);
>>> +     }
>>> +
>>> +     __this_cpu_inc(*lgrw->fallback_reader_refcnt);
>>> +}
>>> +EXPORT_SYMBOL(lg_rwlock_local_read_lock);
>>> +
>>> +void lg_rwlock_local_read_unlock(struct lgrwlock *lgrw)
>>> +{
>>> +     if (likely(!__this_cpu_read(*lgrw->fallback_reader_refcnt))) {
>>> +             lg_local_unlock(&lgrw->lglock);
>>> +             return;
>>> +     }
>>> +
>>> +     if (!__this_cpu_dec_return(*lgrw->fallback_reader_refcnt))
>>> +             read_unlock(&lgrw->fallback_rwlock);
>>> +
>>> +     preempt_enable();
>>> +}
>>> +EXPORT_SYMBOL(lg_rwlock_local_read_unlock);
>>> +
>>
>> If I read the code above correctly, all you are doing is implementing a
>> recursive reader-side primitive (ie., allowing the reader to call these
>> functions recursively, without resulting in a self-deadlock).
>>
>> But the thing is, making the reader-side recursive is the least of our
>> problems! Our main challenge is to make the locking extremely flexible
>> and also safe-guard it against circular-locking-dependencies and deadlocks.
>> Please take a look at the changelog of patch 1 - it explains the situation
>> with an example.
> 
> 
> My lock fixes your requirements(I read patch 1-6 before I sent). In
> readsite, lglock 's lock is token via trylock, the lglock doesn't
> contribute to deadlocks, we can consider it doesn't exist when we find
> deadlock from it. And global fallback rwlock doesn't result to
> deadlocks because it is read-preference(you need to inc the
> fallback_reader_refcnt inside the cpu-hotplug write-side, I don't do
> it in generic lgrwlock)
>

Ah, since you hadn't mentioned the increment at the writer-side in your
previous email, I had missed the bigger picture of what you were trying
to achieve.
 
> 
> If lg_rwlock_local_read_lock() spins, which means
> lg_rwlock_local_read_lock() spins on fallback_rwlock, and which means
> lg_rwlock_global_write_lock() took the lgrwlock successfully and
> return, and which means lg_rwlock_local_read_lock() will stop spinning
> when the write side finished.
> 

Unfortunately, I see quite a few issues with the code above. IIUC, the
writer and the reader both increment the same counters. So how will the
unlock() code in the reader path know when to unlock which of the locks?
(The counter-dropping-to-zero logic is not safe, since it can be updated
due to different reasons). And now that I look at it again, in the absence
of the writer, the reader is allowed to be recursive at the heavy cost of
taking the global rwlock for read, every 2nd time you nest (because the
spinlock is non-recursive). Also, this lg_rwlock implementation uses 3
different data-structures - a per-cpu spinlock, a global rwlock and
a per-cpu refcnt, and its not immediately apparent why you need those many
or even those many varieties. Also I see that this doesn't handle the
case of interrupt-handlers also being readers.

IMHO, the per-cpu rwlock scheme that I have implemented in this patchset
has a clean, understandable design and just enough data-structures/locks
to achieve its goal and has several optimizations (like reducing the
interrupts-disabled time etc) included - all in a very straight-forward
manner. Since this is non-trivial, IMHO, starting from a clean slate is
actually better than trying to retrofit the logic into some locking scheme
which we actively want to avoid (and hence effectively we aren't even
borrowing anything from!).

To summarize, if you are just pointing out that we can implement the same
logic by altering lglocks, then sure, I acknowledge the possibility.
However, I don't think doing that actually makes it better; it either
convolutes the logic unnecessarily, or ends up looking _very_ similar to
the implementation in this patchset, from what I can see.

Regards,
Srivatsa S. Bhat

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Lai Jiangshan - Feb. 26, 2013, 12:59 p.m.
On Tue, Feb 26, 2013 at 5:02 PM, Srivatsa S. Bhat
<srivatsa.bhat@linux.vnet.ibm.com> wrote:
> On 02/26/2013 05:47 AM, Lai Jiangshan wrote:
>> On Tue, Feb 26, 2013 at 3:26 AM, Srivatsa S. Bhat
>> <srivatsa.bhat@linux.vnet.ibm.com> wrote:
>>> Hi Lai,
>>>
>>> On 02/25/2013 09:23 PM, Lai Jiangshan wrote:
>>>> Hi, Srivatsa,
>>>>
>>>> The target of the whole patchset is nice for me.
>>>
>>> Cool! Thanks :-)
>>>
> [...]
>>>> I wrote an untested draft here.
>>>>
>>>> Thanks,
>>>> Lai
>>>>
>>>> PS: Some HA tools(I'm writing one) which takes checkpoints of
>>>> virtual-machines frequently, I guess this patchset can speedup the
>>>> tools.
>>>>
>>>> From 01db542693a1b7fc6f9ece45d57cb529d9be5b66 Mon Sep 17 00:00:00 2001
>>>> From: Lai Jiangshan <laijs@cn.fujitsu.com>
>>>> Date: Mon, 25 Feb 2013 23:14:27 +0800
>>>> Subject: [PATCH] lglock: add read-preference local-global rwlock
>>>>
>>>> locality via lglock(trylock)
>>>> read-preference read-write-lock via fallback rwlock_t
>>>>
>>>> Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
>>>> ---
>>>>  include/linux/lglock.h |   31 +++++++++++++++++++++++++++++++
>>>>  kernel/lglock.c        |   45 +++++++++++++++++++++++++++++++++++++++++++++
>>>>  2 files changed, 76 insertions(+), 0 deletions(-)
>>>>
>>>> diff --git a/include/linux/lglock.h b/include/linux/lglock.h
>>>> index 0d24e93..30fe887 100644
>>>> --- a/include/linux/lglock.h
>>>> +++ b/include/linux/lglock.h
>>>> @@ -67,4 +67,35 @@ void lg_local_unlock_cpu(struct lglock *lg, int cpu);
>>>>  void lg_global_lock(struct lglock *lg);
>>>>  void lg_global_unlock(struct lglock *lg);
>>>>
>>>> +struct lgrwlock {
>>>> +     unsigned long __percpu *fallback_reader_refcnt;
>>>> +     struct lglock lglock;
>>>> +     rwlock_t fallback_rwlock;
>>>> +};
>>>> +
>>>> +#define DEFINE_LGRWLOCK(name)                                                \
>>>> +     static DEFINE_PER_CPU(arch_spinlock_t, name ## _lock)           \
>>>> +     = __ARCH_SPIN_LOCK_UNLOCKED;                                    \
>>>> +     static DEFINE_PER_CPU(unsigned long, name ## _refcnt);          \
>>>> +     struct lgrwlock name = {                                        \
>>>> +             .fallback_reader_refcnt = &name ## _refcnt,             \
>>>> +             .lglock = { .lock = &name ## _lock } }
>>>> +
>>>> +#define DEFINE_STATIC_LGRWLOCK(name)                                 \
>>>> +     static DEFINE_PER_CPU(arch_spinlock_t, name ## _lock)           \
>>>> +     = __ARCH_SPIN_LOCK_UNLOCKED;                                    \
>>>> +     static DEFINE_PER_CPU(unsigned long, name ## _refcnt);          \
>>>> +     static struct lgrwlock name = {                                 \
>>>> +             .fallback_reader_refcnt = &name ## _refcnt,             \
>>>> +             .lglock = { .lock = &name ## _lock } }
>>>> +
>>>> +static inline void lg_rwlock_init(struct lgrwlock *lgrw, char *name)
>>>> +{
>>>> +     lg_lock_init(&lgrw->lglock, name);
>>>> +}
>>>> +
>>>> +void lg_rwlock_local_read_lock(struct lgrwlock *lgrw);
>>>> +void lg_rwlock_local_read_unlock(struct lgrwlock *lgrw);
>>>> +void lg_rwlock_global_write_lock(struct lgrwlock *lgrw);
>>>> +void lg_rwlock_global_write_unlock(struct lgrwlock *lgrw);
>>>>  #endif
>>>> diff --git a/kernel/lglock.c b/kernel/lglock.c
>>>> index 6535a66..463543a 100644
>>>> --- a/kernel/lglock.c
>>>> +++ b/kernel/lglock.c
>>>> @@ -87,3 +87,48 @@ void lg_global_unlock(struct lglock *lg)
>>>>       preempt_enable();
>>>>  }
>>>>  EXPORT_SYMBOL(lg_global_unlock);
>>>> +
>>>> +void lg_rwlock_local_read_lock(struct lgrwlock *lgrw)
>>>> +{
>>>> +     struct lglock *lg = &lgrw->lglock;
>>>> +
>>>> +     preempt_disable();
>>>> +     if (likely(!__this_cpu_read(*lgrw->fallback_reader_refcnt))) {
>>>> +             if (likely(arch_spin_trylock(this_cpu_ptr(lg->lock)))) {
>>>> +                     rwlock_acquire_read(&lg->lock_dep_map, 0, 0, _RET_IP_);
>>>> +                     return;
>>>> +             }
>>>> +             read_lock(&lgrw->fallback_rwlock);
>>>> +     }
>>>> +
>>>> +     __this_cpu_inc(*lgrw->fallback_reader_refcnt);
>>>> +}
>>>> +EXPORT_SYMBOL(lg_rwlock_local_read_lock);
>>>> +
>>>> +void lg_rwlock_local_read_unlock(struct lgrwlock *lgrw)
>>>> +{
>>>> +     if (likely(!__this_cpu_read(*lgrw->fallback_reader_refcnt))) {
>>>> +             lg_local_unlock(&lgrw->lglock);
>>>> +             return;
>>>> +     }
>>>> +
>>>> +     if (!__this_cpu_dec_return(*lgrw->fallback_reader_refcnt))
>>>> +             read_unlock(&lgrw->fallback_rwlock);
>>>> +
>>>> +     preempt_enable();
>>>> +}
>>>> +EXPORT_SYMBOL(lg_rwlock_local_read_unlock);
>>>> +
>>>
>>> If I read the code above correctly, all you are doing is implementing a
>>> recursive reader-side primitive (ie., allowing the reader to call these
>>> functions recursively, without resulting in a self-deadlock).
>>>
>>> But the thing is, making the reader-side recursive is the least of our
>>> problems! Our main challenge is to make the locking extremely flexible
>>> and also safe-guard it against circular-locking-dependencies and deadlocks.
>>> Please take a look at the changelog of patch 1 - it explains the situation
>>> with an example.
>>
>>
>> My lock fixes your requirements(I read patch 1-6 before I sent). In
>> readsite, lglock 's lock is token via trylock, the lglock doesn't
>> contribute to deadlocks, we can consider it doesn't exist when we find
>> deadlock from it. And global fallback rwlock doesn't result to
>> deadlocks because it is read-preference(you need to inc the
>> fallback_reader_refcnt inside the cpu-hotplug write-side, I don't do
>> it in generic lgrwlock)
>>
>
> Ah, since you hadn't mentioned the increment at the writer-side in your
> previous email, I had missed the bigger picture of what you were trying
> to achieve.
>
>>
>> If lg_rwlock_local_read_lock() spins, which means
>> lg_rwlock_local_read_lock() spins on fallback_rwlock, and which means
>> lg_rwlock_global_write_lock() took the lgrwlock successfully and
>> return, and which means lg_rwlock_local_read_lock() will stop spinning
>> when the write side finished.
>>
>
> Unfortunately, I see quite a few issues with the code above. IIUC, the
> writer and the reader both increment the same counters. So how will the
> unlock() code in the reader path know when to unlock which of the locks?

The same as your code, the reader(which nested in write C.S.) just dec
the counters.

> (The counter-dropping-to-zero logic is not safe, since it can be updated
> due to different reasons). And now that I look at it again, in the absence
> of the writer, the reader is allowed to be recursive at the heavy cost of
> taking the global rwlock for read, every 2nd time you nest (because the
> spinlock is non-recursive).

(I did not understand your comments of this part)
nested reader is considered seldom. But if N(>=2) nested readers happen,
the overhead is:
    1 spin_try_lock() + 1 read_lock() + (N-1) __this_cpu_inc()

> Also, this lg_rwlock implementation uses 3
> different data-structures - a per-cpu spinlock, a global rwlock and
> a per-cpu refcnt, and its not immediately apparent why you need those many
> or even those many varieties.

data-structures is the same as yours.
fallback_reader_refcnt <--> reader_refcnt
per-cpu spinlock <--> write_signal
fallback_rwlock  <---> global_rwlock

> Also I see that this doesn't handle the
> case of interrupt-handlers also being readers.

handled. nested reader will see the ref or take the fallback_rwlock

>
> IMHO, the per-cpu rwlock scheme that I have implemented in this patchset
> has a clean, understandable design and just enough data-structures/locks
> to achieve its goal and has several optimizations (like reducing the
> interrupts-disabled time etc) included - all in a very straight-forward
> manner. Since this is non-trivial, IMHO, starting from a clean slate is
> actually better than trying to retrofit the logic into some locking scheme
> which we actively want to avoid (and hence effectively we aren't even
> borrowing anything from!).
>
> To summarize, if you are just pointing out that we can implement the same
> logic by altering lglocks, then sure, I acknowledge the possibility.
> However, I don't think doing that actually makes it better; it either
> convolutes the logic unnecessarily, or ends up looking _very_ similar to
> the implementation in this patchset, from what I can see.
>
> Regards,
> Srivatsa S. Bhat
>
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Lai Jiangshan - Feb. 26, 2013, 1:34 p.m.
On Tue, Feb 26, 2013 at 3:26 AM, Srivatsa S. Bhat
<srivatsa.bhat@linux.vnet.ibm.com> wrote:
> Hi Lai,
>
> On 02/25/2013 09:23 PM, Lai Jiangshan wrote:
>> Hi, Srivatsa,
>>
>> The target of the whole patchset is nice for me.
>
> Cool! Thanks :-)
>
>> A question: How did you find out the such usages of
>> "preempt_disable()" and convert them? did all are converted?
>>
>
> Well, I scanned through the source tree for usages which implicitly
> disabled CPU offline and converted them over.

How do you scan? could you show the way you scan the source tree.
I can follow your instructions for double checking.

> Its not limited to uses
> of preempt_disable() alone - even spin_locks, rwlocks, local_irq_disable()
> etc also help disable CPU offline. So I tried to dig out all such uses
> and converted them. However, since the merge window is open, a lot of
> new code is flowing into the tree. So I'll have to rescan the tree to
> see if there are any more places to convert.

I remember some code has such assumption:
    preempt_disable() (or something else)
    //the code assume that the cpu_online_map can't be changed.
    preempt_enable()

It is very hard to find out all such kinds of assumptions and fixes them.
(I notice your code mainly fixes code around send_xxxx())


>
>> And I think the lock is too complex and reinvent the wheel, why don't
>> you reuse the lglock?
>
> lglocks? No way! ;-) See below...
>
>> I wrote an untested draft here.
>>
>> Thanks,
>> Lai
>>
>> PS: Some HA tools(I'm writing one) which takes checkpoints of
>> virtual-machines frequently, I guess this patchset can speedup the
>> tools.
>>
>> From 01db542693a1b7fc6f9ece45d57cb529d9be5b66 Mon Sep 17 00:00:00 2001
>> From: Lai Jiangshan <laijs@cn.fujitsu.com>
>> Date: Mon, 25 Feb 2013 23:14:27 +0800
>> Subject: [PATCH] lglock: add read-preference local-global rwlock
>>
>> locality via lglock(trylock)
>> read-preference read-write-lock via fallback rwlock_t
>>
>> Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
>> ---
>>  include/linux/lglock.h |   31 +++++++++++++++++++++++++++++++
>>  kernel/lglock.c        |   45 +++++++++++++++++++++++++++++++++++++++++++++
>>  2 files changed, 76 insertions(+), 0 deletions(-)
>>
>> diff --git a/include/linux/lglock.h b/include/linux/lglock.h
>> index 0d24e93..30fe887 100644
>> --- a/include/linux/lglock.h
>> +++ b/include/linux/lglock.h
>> @@ -67,4 +67,35 @@ void lg_local_unlock_cpu(struct lglock *lg, int cpu);
>>  void lg_global_lock(struct lglock *lg);
>>  void lg_global_unlock(struct lglock *lg);
>>
>> +struct lgrwlock {
>> +     unsigned long __percpu *fallback_reader_refcnt;
>> +     struct lglock lglock;
>> +     rwlock_t fallback_rwlock;
>> +};
>> +
>> +#define DEFINE_LGRWLOCK(name)                                                \
>> +     static DEFINE_PER_CPU(arch_spinlock_t, name ## _lock)           \
>> +     = __ARCH_SPIN_LOCK_UNLOCKED;                                    \
>> +     static DEFINE_PER_CPU(unsigned long, name ## _refcnt);          \
>> +     struct lgrwlock name = {                                        \
>> +             .fallback_reader_refcnt = &name ## _refcnt,             \
>> +             .lglock = { .lock = &name ## _lock } }
>> +
>> +#define DEFINE_STATIC_LGRWLOCK(name)                                 \
>> +     static DEFINE_PER_CPU(arch_spinlock_t, name ## _lock)           \
>> +     = __ARCH_SPIN_LOCK_UNLOCKED;                                    \
>> +     static DEFINE_PER_CPU(unsigned long, name ## _refcnt);          \
>> +     static struct lgrwlock name = {                                 \
>> +             .fallback_reader_refcnt = &name ## _refcnt,             \
>> +             .lglock = { .lock = &name ## _lock } }
>> +
>> +static inline void lg_rwlock_init(struct lgrwlock *lgrw, char *name)
>> +{
>> +     lg_lock_init(&lgrw->lglock, name);
>> +}
>> +
>> +void lg_rwlock_local_read_lock(struct lgrwlock *lgrw);
>> +void lg_rwlock_local_read_unlock(struct lgrwlock *lgrw);
>> +void lg_rwlock_global_write_lock(struct lgrwlock *lgrw);
>> +void lg_rwlock_global_write_unlock(struct lgrwlock *lgrw);
>>  #endif
>> diff --git a/kernel/lglock.c b/kernel/lglock.c
>> index 6535a66..463543a 100644
>> --- a/kernel/lglock.c
>> +++ b/kernel/lglock.c
>> @@ -87,3 +87,48 @@ void lg_global_unlock(struct lglock *lg)
>>       preempt_enable();
>>  }
>>  EXPORT_SYMBOL(lg_global_unlock);
>> +
>> +void lg_rwlock_local_read_lock(struct lgrwlock *lgrw)
>> +{
>> +     struct lglock *lg = &lgrw->lglock;
>> +
>> +     preempt_disable();
>> +     if (likely(!__this_cpu_read(*lgrw->fallback_reader_refcnt))) {
>> +             if (likely(arch_spin_trylock(this_cpu_ptr(lg->lock)))) {
>> +                     rwlock_acquire_read(&lg->lock_dep_map, 0, 0, _RET_IP_);
>> +                     return;
>> +             }
>> +             read_lock(&lgrw->fallback_rwlock);
>> +     }
>> +
>> +     __this_cpu_inc(*lgrw->fallback_reader_refcnt);
>> +}
>> +EXPORT_SYMBOL(lg_rwlock_local_read_lock);
>> +
>> +void lg_rwlock_local_read_unlock(struct lgrwlock *lgrw)
>> +{
>> +     if (likely(!__this_cpu_read(*lgrw->fallback_reader_refcnt))) {
>> +             lg_local_unlock(&lgrw->lglock);
>> +             return;
>> +     }
>> +
>> +     if (!__this_cpu_dec_return(*lgrw->fallback_reader_refcnt))
>> +             read_unlock(&lgrw->fallback_rwlock);
>> +
>> +     preempt_enable();
>> +}
>> +EXPORT_SYMBOL(lg_rwlock_local_read_unlock);
>> +
>
> If I read the code above correctly, all you are doing is implementing a
> recursive reader-side primitive (ie., allowing the reader to call these
> functions recursively, without resulting in a self-deadlock).
>
> But the thing is, making the reader-side recursive is the least of our
> problems! Our main challenge is to make the locking extremely flexible
> and also safe-guard it against circular-locking-dependencies and deadlocks.
> Please take a look at the changelog of patch 1 - it explains the situation
> with an example.
>
>> +void lg_rwlock_global_write_lock(struct lgrwlock *lgrw)
>> +{
>> +     lg_global_lock(&lgrw->lglock);
>
> This does a for-loop on all CPUs and takes their locks one-by-one. That's
> exactly what we want to prevent, because that is the _source_ of all our
> deadlock woes in this case. In the presence of perfect lock ordering
> guarantees, this wouldn't have been a problem (that's why lglocks are
> being used successfully elsewhere in the kernel). In the stop-machine()
> removal case, the over-flexibility of preempt_disable() forces us to provide
> an equally flexible locking alternative. Hence we can't use such per-cpu
> locking schemes.
>
> You might note that, for exactly this reason, I haven't actually used any
> per-cpu _locks_ in this synchronization scheme, though it is named as
> "per-cpu rwlocks". The only per-cpu component here are the refcounts, and
> we consciously avoid waiting/spinning on them (because then that would be
> equivalent to having per-cpu locks, which are deadlock-prone). We use
> global rwlocks to get the deadlock-safety that we need.
>
>> +     write_lock(&lgrw->fallback_rwlock);
>> +}
>> +EXPORT_SYMBOL(lg_rwlock_global_write_lock);
>> +
>> +void lg_rwlock_global_write_unlock(struct lgrwlock *lgrw)
>> +{
>> +     write_unlock(&lgrw->fallback_rwlock);
>> +     lg_global_unlock(&lgrw->lglock);
>> +}
>> +EXPORT_SYMBOL(lg_rwlock_global_write_unlock);
>>
>
> Regards,
> Srivatsa S. Bhat
>
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Srivatsa S. Bhat - Feb. 26, 2013, 2:22 p.m.
Hi Lai,

I'm really not convinced that piggy-backing on lglocks would help
us in any way. But still, let me try to address some of the points
you raised...

On 02/26/2013 06:29 PM, Lai Jiangshan wrote:
> On Tue, Feb 26, 2013 at 5:02 PM, Srivatsa S. Bhat
> <srivatsa.bhat@linux.vnet.ibm.com> wrote:
>> On 02/26/2013 05:47 AM, Lai Jiangshan wrote:
>>> On Tue, Feb 26, 2013 at 3:26 AM, Srivatsa S. Bhat
>>> <srivatsa.bhat@linux.vnet.ibm.com> wrote:
>>>> Hi Lai,
>>>>
>>>> On 02/25/2013 09:23 PM, Lai Jiangshan wrote:
>>>>> Hi, Srivatsa,
>>>>>
>>>>> The target of the whole patchset is nice for me.
>>>>
>>>> Cool! Thanks :-)
>>>>
>> [...]
>>
>> Unfortunately, I see quite a few issues with the code above. IIUC, the
>> writer and the reader both increment the same counters. So how will the
>> unlock() code in the reader path know when to unlock which of the locks?
> 
> The same as your code, the reader(which nested in write C.S.) just dec
> the counters.

And that works fine in my case because the writer and the reader update
_two_ _different_ counters. If both of them update the same counter, there
will be a semantic clash - an increment of the counter can either mean that
a new writer became active, or it can also indicate a nested reader. A decrement
can also similarly have 2 meanings. And thus it will be difficult to decide
the right action to take, based on the value of the counter.

> 
>> (The counter-dropping-to-zero logic is not safe, since it can be updated
>> due to different reasons). And now that I look at it again, in the absence
>> of the writer, the reader is allowed to be recursive at the heavy cost of
>> taking the global rwlock for read, every 2nd time you nest (because the
>> spinlock is non-recursive).
> 
> (I did not understand your comments of this part)
> nested reader is considered seldom.

No, nested readers can be _quite_ frequent. Because, potentially all users
of preempt_disable() are readers - and its well-known how frequently we
nest preempt_disable(). As a simple example, any atomic reader who calls
smp_call_function() will become a nested reader, because smp_call_function()
itself is a reader. So reader nesting is expected to be quite frequent.

> But if N(>=2) nested readers happen,
> the overhead is:
>     1 spin_try_lock() + 1 read_lock() + (N-1) __this_cpu_inc()
> 

In my patch, its just this_cpu_inc(). Note that these are _very_ hot paths.
So every bit of optimization that you can add is worthwhile.

And your read_lock() is a _global_ lock - thus, it can lead to a lot of
cache-line bouncing. That's *exactly* why I have used per-cpu refcounts in
my synchronization scheme, to avoid taking the global rwlock as much as possible.

Another important point to note is that, the overhead we are talking about
here, exists even when _not_ performing hotplug. And its the replacement to
the super-fast preempt_disable(). So its extremely important to consciously
minimize this overhead - else we'll end up slowing down the system significantly.

>> Also, this lg_rwlock implementation uses 3
>> different data-structures - a per-cpu spinlock, a global rwlock and
>> a per-cpu refcnt, and its not immediately apparent why you need those many
>> or even those many varieties.
> 
> data-structures is the same as yours.
> fallback_reader_refcnt <--> reader_refcnt

This has semantic problems, as noted above.

> per-cpu spinlock <--> write_signal

Acquire/release of (spin) lock is costlier than inc/dec of a counter, IIUC.

> fallback_rwlock  <---> global_rwlock
> 
>> Also I see that this doesn't handle the
>> case of interrupt-handlers also being readers.
> 
> handled. nested reader will see the ref or take the fallback_rwlock
>

I'm not referring to simple nested readers here, but interrupt handlers who
can act as readers. For starters, the arch_spin_trylock() is not safe when
interrupt handlers can also run the same code, right? You'll need to save
and restore interrupts at critical points in the code. Also, the __foo()
variants used to read/update the counters are not interrupt-safe. And,
the unlock() code in the reader path is again going to be confused about
what to do when interrupt handlers interrupt regular readers, due to the
messed up refcount.
 
>>
>> IMHO, the per-cpu rwlock scheme that I have implemented in this patchset
>> has a clean, understandable design and just enough data-structures/locks
>> to achieve its goal and has several optimizations (like reducing the
>> interrupts-disabled time etc) included - all in a very straight-forward
>> manner. Since this is non-trivial, IMHO, starting from a clean slate is
>> actually better than trying to retrofit the logic into some locking scheme
>> which we actively want to avoid (and hence effectively we aren't even
>> borrowing anything from!).
>>
>> To summarize, if you are just pointing out that we can implement the same
>> logic by altering lglocks, then sure, I acknowledge the possibility.
>> However, I don't think doing that actually makes it better; it either
>> convolutes the logic unnecessarily, or ends up looking _very_ similar to
>> the implementation in this patchset, from what I can see.
>>

 
Regards,
Srivatsa S. Bhat

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Srivatsa S. Bhat - Feb. 26, 2013, 3:17 p.m.
On 02/26/2013 07:04 PM, Lai Jiangshan wrote:
> On Tue, Feb 26, 2013 at 3:26 AM, Srivatsa S. Bhat
> <srivatsa.bhat@linux.vnet.ibm.com> wrote:
>> Hi Lai,
>>
>> On 02/25/2013 09:23 PM, Lai Jiangshan wrote:
>>> Hi, Srivatsa,
>>>
>>> The target of the whole patchset is nice for me.
>>
>> Cool! Thanks :-)
>>
>>> A question: How did you find out the such usages of
>>> "preempt_disable()" and convert them? did all are converted?
>>>
>>
>> Well, I scanned through the source tree for usages which implicitly
>> disabled CPU offline and converted them over.
> 
> How do you scan? could you show the way you scan the source tree.
> I can follow your instructions for double checking.
> 

Its nothing special. I grepped the source tree for anything dealing with
cpu_online_mask or its derivatives and also for functions/constructs that
rely on the cpumasks internally (eg: smp_call_function). Then I audited all
such call-sites and converted them (if needed) accordingly.

>> Its not limited to uses
>> of preempt_disable() alone - even spin_locks, rwlocks, local_irq_disable()
>> etc also help disable CPU offline. So I tried to dig out all such uses
>> and converted them. However, since the merge window is open, a lot of
>> new code is flowing into the tree. So I'll have to rescan the tree to
>> see if there are any more places to convert.
> 
> I remember some code has such assumption:
>     preempt_disable() (or something else)
>     //the code assume that the cpu_online_map can't be changed.
>     preempt_enable()
> 
> It is very hard to find out all such kinds of assumptions and fixes them.
> (I notice your code mainly fixes code around send_xxxx())
> 

The conversion can be carried out using the method I mentioned above.

Regards,
Srivatsa S. Bhat

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Lai Jiangshan - Feb. 26, 2013, 4:25 p.m.
On Tue, Feb 26, 2013 at 10:22 PM, Srivatsa S. Bhat
<srivatsa.bhat@linux.vnet.ibm.com> wrote:
>
> Hi Lai,
>
> I'm really not convinced that piggy-backing on lglocks would help
> us in any way. But still, let me try to address some of the points
> you raised...
>
> On 02/26/2013 06:29 PM, Lai Jiangshan wrote:
>> On Tue, Feb 26, 2013 at 5:02 PM, Srivatsa S. Bhat
>> <srivatsa.bhat@linux.vnet.ibm.com> wrote:
>>> On 02/26/2013 05:47 AM, Lai Jiangshan wrote:
>>>> On Tue, Feb 26, 2013 at 3:26 AM, Srivatsa S. Bhat
>>>> <srivatsa.bhat@linux.vnet.ibm.com> wrote:
>>>>> Hi Lai,
>>>>>
>>>>> On 02/25/2013 09:23 PM, Lai Jiangshan wrote:
>>>>>> Hi, Srivatsa,
>>>>>>
>>>>>> The target of the whole patchset is nice for me.
>>>>>
>>>>> Cool! Thanks :-)
>>>>>
>>> [...]
>>>
>>> Unfortunately, I see quite a few issues with the code above. IIUC, the
>>> writer and the reader both increment the same counters. So how will the
>>> unlock() code in the reader path know when to unlock which of the locks?
>>
>> The same as your code, the reader(which nested in write C.S.) just dec
>> the counters.
>
> And that works fine in my case because the writer and the reader update
> _two_ _different_ counters.

I can't find any magic in your code, they are the same counter.

        /*
         * It is desirable to allow the writer to acquire the percpu-rwlock
         * for read (if necessary), without deadlocking or getting complaints
         * from lockdep. To achieve that, just increment the reader_refcnt of
         * this CPU - that way, any attempt by the writer to acquire the
         * percpu-rwlock for read, will get treated as a case of nested percpu
         * reader, which is safe, from a locking perspective.
         */
        this_cpu_inc(pcpu_rwlock->rw_state->reader_refcnt);


> If both of them update the same counter, there
> will be a semantic clash - an increment of the counter can either mean that
> a new writer became active, or it can also indicate a nested reader. A decrement
> can also similarly have 2 meanings. And thus it will be difficult to decide
> the right action to take, based on the value of the counter.
>
>>
>>> (The counter-dropping-to-zero logic is not safe, since it can be updated
>>> due to different reasons). And now that I look at it again, in the absence
>>> of the writer, the reader is allowed to be recursive at the heavy cost of
>>> taking the global rwlock for read, every 2nd time you nest (because the
>>> spinlock is non-recursive).
>>
>> (I did not understand your comments of this part)
>> nested reader is considered seldom.
>
> No, nested readers can be _quite_ frequent. Because, potentially all users
> of preempt_disable() are readers - and its well-known how frequently we
> nest preempt_disable(). As a simple example, any atomic reader who calls
> smp_call_function() will become a nested reader, because smp_call_function()
> itself is a reader. So reader nesting is expected to be quite frequent.
>
>> But if N(>=2) nested readers happen,
>> the overhead is:
>>     1 spin_try_lock() + 1 read_lock() + (N-1) __this_cpu_inc()
>>
>
> In my patch, its just this_cpu_inc(). Note that these are _very_ hot paths.
> So every bit of optimization that you can add is worthwhile.
>
> And your read_lock() is a _global_ lock - thus, it can lead to a lot of
> cache-line bouncing. That's *exactly* why I have used per-cpu refcounts in
> my synchronization scheme, to avoid taking the global rwlock as much as possible.
>
> Another important point to note is that, the overhead we are talking about
> here, exists even when _not_ performing hotplug. And its the replacement to
> the super-fast preempt_disable(). So its extremely important to consciously
> minimize this overhead - else we'll end up slowing down the system significantly.
>

All I was considered is "nested reader is seldom", so I always
fallback to rwlock when nested.
If you like, I can add 6 lines of code, the overhead is
1 spin_try_lock()(fast path)  + N  __this_cpu_inc()

The overhead of your code is
2 smp_mb() + N __this_cpu_inc()

I don't see how much different.

>>> Also, this lg_rwlock implementation uses 3
>>> different data-structures - a per-cpu spinlock, a global rwlock and
>>> a per-cpu refcnt, and its not immediately apparent why you need those many
>>> or even those many varieties.
>>
>> data-structures is the same as yours.
>> fallback_reader_refcnt <--> reader_refcnt
>
> This has semantic problems, as noted above.
>
>> per-cpu spinlock <--> write_signal
>
> Acquire/release of (spin) lock is costlier than inc/dec of a counter, IIUC.
>
>> fallback_rwlock  <---> global_rwlock
>>
>>> Also I see that this doesn't handle the
>>> case of interrupt-handlers also being readers.
>>
>> handled. nested reader will see the ref or take the fallback_rwlock
>>

Sorry, _reentrance_ read_lock() will see the ref or take the fallback_rwlock

>
> I'm not referring to simple nested readers here, but interrupt handlers who
> can act as readers. For starters, the arch_spin_trylock() is not safe when
> interrupt handlers can also run the same code, right? You'll need to save
> and restore interrupts at critical points in the code. Also, the __foo()
> variants used to read/update the counters are not interrupt-safe.

I must missed something.

Could you elaborate more why arch_spin_trylock() is not safe when
interrupt handlers can also run the same code?

Could you elaborate more why __this_cpu_op variants is not
interrupt-safe since they are always called paired.


> And,
> the unlock() code in the reader path is again going to be confused about
> what to do when interrupt handlers interrupt regular readers, due to the
> messed up refcount.

I still can't understand.

>
>>>
>>> IMHO, the per-cpu rwlock scheme that I have implemented in this patchset
>>> has a clean, understandable design and just enough data-structures/locks
>>> to achieve its goal and has several optimizations (like reducing the
>>> interrupts-disabled time etc) included - all in a very straight-forward
>>> manner. Since this is non-trivial, IMHO, starting from a clean slate is
>>> actually better than trying to retrofit the logic into some locking scheme
>>> which we actively want to avoid (and hence effectively we aren't even
>>> borrowing anything from!).
>>>
>>> To summarize, if you are just pointing out that we can implement the same
>>> logic by altering lglocks, then sure, I acknowledge the possibility.
>>> However, I don't think doing that actually makes it better; it either
>>> convolutes the logic unnecessarily, or ends up looking _very_ similar to
>>> the implementation in this patchset, from what I can see.
>>>
>
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Srivatsa S. Bhat - Feb. 26, 2013, 7:30 p.m.
On 02/26/2013 09:55 PM, Lai Jiangshan wrote:
> On Tue, Feb 26, 2013 at 10:22 PM, Srivatsa S. Bhat
> <srivatsa.bhat@linux.vnet.ibm.com> wrote:
>>
>> Hi Lai,
>>
>> I'm really not convinced that piggy-backing on lglocks would help
>> us in any way. But still, let me try to address some of the points
>> you raised...
>>
>> On 02/26/2013 06:29 PM, Lai Jiangshan wrote:
>>> On Tue, Feb 26, 2013 at 5:02 PM, Srivatsa S. Bhat
>>> <srivatsa.bhat@linux.vnet.ibm.com> wrote:
>>>> On 02/26/2013 05:47 AM, Lai Jiangshan wrote:
>>>>> On Tue, Feb 26, 2013 at 3:26 AM, Srivatsa S. Bhat
>>>>> <srivatsa.bhat@linux.vnet.ibm.com> wrote:
>>>>>> Hi Lai,
>>>>>>
>>>>>> On 02/25/2013 09:23 PM, Lai Jiangshan wrote:
>>>>>>> Hi, Srivatsa,
>>>>>>>
>>>>>>> The target of the whole patchset is nice for me.
>>>>>>
>>>>>> Cool! Thanks :-)
>>>>>>
>>>> [...]
>>>>
>>>> Unfortunately, I see quite a few issues with the code above. IIUC, the
>>>> writer and the reader both increment the same counters. So how will the
>>>> unlock() code in the reader path know when to unlock which of the locks?
>>>
>>> The same as your code, the reader(which nested in write C.S.) just dec
>>> the counters.
>>
>> And that works fine in my case because the writer and the reader update
>> _two_ _different_ counters.
> 
> I can't find any magic in your code, they are the same counter.
> 
>         /*
>          * It is desirable to allow the writer to acquire the percpu-rwlock
>          * for read (if necessary), without deadlocking or getting complaints
>          * from lockdep. To achieve that, just increment the reader_refcnt of
>          * this CPU - that way, any attempt by the writer to acquire the
>          * percpu-rwlock for read, will get treated as a case of nested percpu
>          * reader, which is safe, from a locking perspective.
>          */
>         this_cpu_inc(pcpu_rwlock->rw_state->reader_refcnt);
>

Whoa! Hold on, were you really referring to _this_ increment when you said
that, in your patch you would increment the refcnt at the writer? Then I guess
there is a major disconnect in our conversations. (I had assumed that you were
referring to the update of writer_signal, and were just trying to have a single
refcnt instead of reader_refcnt and writer_signal).

So, please let me clarify things a bit here. Forget about the above increment
of reader_refcnt at the writer side. Its almost utterly insignificant for our
current discussion. We can simply replace it with a check as shown below, at
the reader side:

void percpu_read_lock_irqsafe()
{
	if (current == active_writer)
		return;

	/* Rest of the code */
}

Now, assuming that, in your patch, you were trying to use the per-cpu refcnt
to allow the writer to safely take the reader path, you can simply get rid of
that percpu-refcnt, as demonstrated above.

So that would reduce your code to the following (after simplification):

lg_rwlock_local_read_lock()
{
	if (current == active_writer)
		return;
	if (arch_spin_trylock(per-cpu-spinlock))
		return;
	read_lock(global-rwlock);
}

Now, let us assume that hotplug is not happening, meaning, nobody is running
the writer side code. Now let's see what happens at the reader side in your
patch. As I mentioned earlier, the readers are _very_ frequent and can be in
very hot paths. And they also happen to be nested quite often. 

So, a non-nested reader acquires the per-cpu spinlock. Every subsequent nested
reader on that CPU has to acquire the global rwlock for read. Right there you
have 2 significant performance issues -
1. Acquiring the (spin) lock is costly
2. Acquiring the global rwlock causes cacheline bouncing, which hurts
   performance.

And why do we care so much about performance here? Because, the existing
kernel does an efficient preempt_disable() here - which is an optimized
per-cpu counter increment. Replacing that with such heavy primitives on the
reader side can be very bad.

Now, how does my patchset tackle this? My scheme just requires an increment
of a per-cpu refcnt (reader_refcnt) and memory barrier. Which is acceptable
from a performance-perspective, because IMHO its not horrendously worse than
a preempt_disable().

> 
>> If both of them update the same counter, there
>> will be a semantic clash - an increment of the counter can either mean that
>> a new writer became active, or it can also indicate a nested reader. A decrement
>> can also similarly have 2 meanings. And thus it will be difficult to decide
>> the right action to take, based on the value of the counter.
>>
>>>
>>>> (The counter-dropping-to-zero logic is not safe, since it can be updated
>>>> due to different reasons). And now that I look at it again, in the absence
>>>> of the writer, the reader is allowed to be recursive at the heavy cost of
>>>> taking the global rwlock for read, every 2nd time you nest (because the
>>>> spinlock is non-recursive).
>>>
>>> (I did not understand your comments of this part)
>>> nested reader is considered seldom.
>>
>> No, nested readers can be _quite_ frequent. Because, potentially all users
>> of preempt_disable() are readers - and its well-known how frequently we
>> nest preempt_disable(). As a simple example, any atomic reader who calls
>> smp_call_function() will become a nested reader, because smp_call_function()
>> itself is a reader. So reader nesting is expected to be quite frequent.
>>
>>> But if N(>=2) nested readers happen,
>>> the overhead is:
>>>     1 spin_try_lock() + 1 read_lock() + (N-1) __this_cpu_inc()
>>>
>>
>> In my patch, its just this_cpu_inc(). Note that these are _very_ hot paths.
>> So every bit of optimization that you can add is worthwhile.
>>
>> And your read_lock() is a _global_ lock - thus, it can lead to a lot of
>> cache-line bouncing. That's *exactly* why I have used per-cpu refcounts in
>> my synchronization scheme, to avoid taking the global rwlock as much as possible.
>>
>> Another important point to note is that, the overhead we are talking about
>> here, exists even when _not_ performing hotplug. And its the replacement to
>> the super-fast preempt_disable(). So its extremely important to consciously
>> minimize this overhead - else we'll end up slowing down the system significantly.
>>
> 
> All I was considered is "nested reader is seldom", so I always
> fallback to rwlock when nested.
> If you like, I can add 6 lines of code, the overhead is
> 1 spin_try_lock()(fast path)  + N  __this_cpu_inc()
> 

I'm assuming that calculation is no longer valid, considering that
we just discussed how the per-cpu refcnt that you were using is quite
unnecessary and can be removed.

IIUC, the overhead with your code, as per above discussion would be:
1 spin_try_lock() [non-nested] + N read_lock(global_rwlock).

Note that I'm referring to the scenario when hotplug is _not_ happening
(ie., nobody is running writer side code).

> The overhead of your code is
> 2 smp_mb() + N __this_cpu_inc()
> 

Right. And as you can see, this is much much better than the overhead
shown above.

> I don't see how much different.
> 
>>>> Also, this lg_rwlock implementation uses 3
>>>> different data-structures - a per-cpu spinlock, a global rwlock and
>>>> a per-cpu refcnt, and its not immediately apparent why you need those many
>>>> or even those many varieties.
>>>
>>> data-structures is the same as yours.
>>> fallback_reader_refcnt <--> reader_refcnt
>>
>> This has semantic problems, as noted above.
>>
>>> per-cpu spinlock <--> write_signal
>>
>> Acquire/release of (spin) lock is costlier than inc/dec of a counter, IIUC.
>>
>>> fallback_rwlock  <---> global_rwlock
>>>
>>>> Also I see that this doesn't handle the
>>>> case of interrupt-handlers also being readers.
>>>
>>> handled. nested reader will see the ref or take the fallback_rwlock
>>>
> 
> Sorry, _reentrance_ read_lock() will see the ref or take the fallback_rwlock
> 
>>
>> I'm not referring to simple nested readers here, but interrupt handlers who
>> can act as readers. For starters, the arch_spin_trylock() is not safe when
>> interrupt handlers can also run the same code, right? You'll need to save
>> and restore interrupts at critical points in the code. Also, the __foo()
>> variants used to read/update the counters are not interrupt-safe.
> 
> I must missed something.
> 
> Could you elaborate more why arch_spin_trylock() is not safe when
> interrupt handlers can also run the same code?
> 
> Could you elaborate more why __this_cpu_op variants is not
> interrupt-safe since they are always called paired.
>

Take a look at include/linux/percpu.h. You'll note that __this_cpu_*
operations map to __this_cpu_generic_to_op(), which doesn't disable interrupts
while doing the update. Hence you can get inconsistent results if an interrupt
hits the CPU at that time and the interrupt handler tries to do the same thing.
In contrast, if you use this_cpu_inc() for example, interrupts are explicitly
disabled during the update and hence you won't get inconsistent results.

> 
>> And,
>> the unlock() code in the reader path is again going to be confused about
>> what to do when interrupt handlers interrupt regular readers, due to the
>> messed up refcount.
> 
> I still can't understand.
>

The primary reason _I_ was using the refcnt vs the reason _you_ were using the
refcnt, appears to be very different. Maybe that's why the above statement
didn't make sense. In your case, IIUC, you can simply get rid of the refcnt
and replace it with the simple check I mentioned above. Whereas, I use
refcnts to keep the reader-side synchronization fast (and for reader-writer
communication).

 
Regards,
Srivatsa S. Bhat

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Lai Jiangshan - Feb. 27, 2013, 12:33 a.m.
On Wed, Feb 27, 2013 at 3:30 AM, Srivatsa S. Bhat
<srivatsa.bhat@linux.vnet.ibm.com> wrote:
> On 02/26/2013 09:55 PM, Lai Jiangshan wrote:
>> On Tue, Feb 26, 2013 at 10:22 PM, Srivatsa S. Bhat
>> <srivatsa.bhat@linux.vnet.ibm.com> wrote:
>>>
>>> Hi Lai,
>>>
>>> I'm really not convinced that piggy-backing on lglocks would help
>>> us in any way. But still, let me try to address some of the points
>>> you raised...
>>>
>>> On 02/26/2013 06:29 PM, Lai Jiangshan wrote:
>>>> On Tue, Feb 26, 2013 at 5:02 PM, Srivatsa S. Bhat
>>>> <srivatsa.bhat@linux.vnet.ibm.com> wrote:
>>>>> On 02/26/2013 05:47 AM, Lai Jiangshan wrote:
>>>>>> On Tue, Feb 26, 2013 at 3:26 AM, Srivatsa S. Bhat
>>>>>> <srivatsa.bhat@linux.vnet.ibm.com> wrote:
>>>>>>> Hi Lai,
>>>>>>>
>>>>>>> On 02/25/2013 09:23 PM, Lai Jiangshan wrote:
>>>>>>>> Hi, Srivatsa,
>>>>>>>>
>>>>>>>> The target of the whole patchset is nice for me.
>>>>>>>
>>>>>>> Cool! Thanks :-)
>>>>>>>
>>>>> [...]
>>>>>
>>>>> Unfortunately, I see quite a few issues with the code above. IIUC, the
>>>>> writer and the reader both increment the same counters. So how will the
>>>>> unlock() code in the reader path know when to unlock which of the locks?
>>>>
>>>> The same as your code, the reader(which nested in write C.S.) just dec
>>>> the counters.
>>>
>>> And that works fine in my case because the writer and the reader update
>>> _two_ _different_ counters.
>>
>> I can't find any magic in your code, they are the same counter.
>>
>>         /*
>>          * It is desirable to allow the writer to acquire the percpu-rwlock
>>          * for read (if necessary), without deadlocking or getting complaints
>>          * from lockdep. To achieve that, just increment the reader_refcnt of
>>          * this CPU - that way, any attempt by the writer to acquire the
>>          * percpu-rwlock for read, will get treated as a case of nested percpu
>>          * reader, which is safe, from a locking perspective.
>>          */
>>         this_cpu_inc(pcpu_rwlock->rw_state->reader_refcnt);
>>
>
> Whoa! Hold on, were you really referring to _this_ increment when you said
> that, in your patch you would increment the refcnt at the writer? Then I guess
> there is a major disconnect in our conversations. (I had assumed that you were
> referring to the update of writer_signal, and were just trying to have a single
> refcnt instead of reader_refcnt and writer_signal).

https://github.com/laijs/linux/commit/53e5053d5b724bea7c538b11743d0f420d98f38d

Sorry the name "fallback_reader_refcnt" misled you.

>
> So, please let me clarify things a bit here. Forget about the above increment
> of reader_refcnt at the writer side. Its almost utterly insignificant for our
> current discussion. We can simply replace it with a check as shown below, at
> the reader side:
>
> void percpu_read_lock_irqsafe()
> {
>         if (current == active_writer)
>                 return;
>
>         /* Rest of the code */
> }
>
> Now, assuming that, in your patch, you were trying to use the per-cpu refcnt
> to allow the writer to safely take the reader path, you can simply get rid of
> that percpu-refcnt, as demonstrated above.
>
> So that would reduce your code to the following (after simplification):
>
> lg_rwlock_local_read_lock()
> {
>         if (current == active_writer)
>                 return;
>         if (arch_spin_trylock(per-cpu-spinlock))
>                 return;
>         read_lock(global-rwlock);
> }
>
> Now, let us assume that hotplug is not happening, meaning, nobody is running
> the writer side code. Now let's see what happens at the reader side in your
> patch. As I mentioned earlier, the readers are _very_ frequent and can be in
> very hot paths. And they also happen to be nested quite often.
>
> So, a non-nested reader acquires the per-cpu spinlock. Every subsequent nested
> reader on that CPU has to acquire the global rwlock for read. Right there you
> have 2 significant performance issues -
> 1. Acquiring the (spin) lock is costly
> 2. Acquiring the global rwlock causes cacheline bouncing, which hurts
>    performance.
>
> And why do we care so much about performance here? Because, the existing
> kernel does an efficient preempt_disable() here - which is an optimized
> per-cpu counter increment. Replacing that with such heavy primitives on the
> reader side can be very bad.
>
> Now, how does my patchset tackle this? My scheme just requires an increment
> of a per-cpu refcnt (reader_refcnt) and memory barrier. Which is acceptable
> from a performance-perspective, because IMHO its not horrendously worse than
> a preempt_disable().
>
>>
>>> If both of them update the same counter, there
>>> will be a semantic clash - an increment of the counter can either mean that
>>> a new writer became active, or it can also indicate a nested reader. A decrement
>>> can also similarly have 2 meanings. And thus it will be difficult to decide
>>> the right action to take, based on the value of the counter.
>>>
>>>>
>>>>> (The counter-dropping-to-zero logic is not safe, since it can be updated
>>>>> due to different reasons). And now that I look at it again, in the absence
>>>>> of the writer, the reader is allowed to be recursive at the heavy cost of
>>>>> taking the global rwlock for read, every 2nd time you nest (because the
>>>>> spinlock is non-recursive).
>>>>
>>>> (I did not understand your comments of this part)
>>>> nested reader is considered seldom.
>>>
>>> No, nested readers can be _quite_ frequent. Because, potentially all users
>>> of preempt_disable() are readers - and its well-known how frequently we
>>> nest preempt_disable(). As a simple example, any atomic reader who calls
>>> smp_call_function() will become a nested reader, because smp_call_function()
>>> itself is a reader. So reader nesting is expected to be quite frequent.
>>>
>>>> But if N(>=2) nested readers happen,
>>>> the overhead is:
>>>>     1 spin_try_lock() + 1 read_lock() + (N-1) __this_cpu_inc()
>>>>
>>>
>>> In my patch, its just this_cpu_inc(). Note that these are _very_ hot paths.
>>> So every bit of optimization that you can add is worthwhile.
>>>
>>> And your read_lock() is a _global_ lock - thus, it can lead to a lot of
>>> cache-line bouncing. That's *exactly* why I have used per-cpu refcounts in
>>> my synchronization scheme, to avoid taking the global rwlock as much as possible.
>>>
>>> Another important point to note is that, the overhead we are talking about
>>> here, exists even when _not_ performing hotplug. And its the replacement to
>>> the super-fast preempt_disable(). So its extremely important to consciously
>>> minimize this overhead - else we'll end up slowing down the system significantly.
>>>
>>
>> All I was considered is "nested reader is seldom", so I always
>> fallback to rwlock when nested.
>> If you like, I can add 6 lines of code, the overhead is
>> 1 spin_try_lock()(fast path)  + N  __this_cpu_inc()
>>
>
> I'm assuming that calculation is no longer valid, considering that
> we just discussed how the per-cpu refcnt that you were using is quite
> unnecessary and can be removed.
>
> IIUC, the overhead with your code, as per above discussion would be:
> 1 spin_try_lock() [non-nested] + N read_lock(global_rwlock).

https://github.com/laijs/linux/commit/46334544bb7961550b7065e015da76f6dab21f16

Again, I'm so sorry the name "fallback_reader_refcnt" misled you.

>
> Note that I'm referring to the scenario when hotplug is _not_ happening
> (ie., nobody is running writer side code).
>
>> The overhead of your code is
>> 2 smp_mb() + N __this_cpu_inc()
>>
>
> Right. And as you can see, this is much much better than the overhead
> shown above.

I will write a test to compare it to "1 spin_try_lock()(fast path)  +
N  __this_cpu_inc()"

>
>> I don't see how much different.
>>
>>>>> Also, this lg_rwlock implementation uses 3
>>>>> different data-structures - a per-cpu spinlock, a global rwlock and
>>>>> a per-cpu refcnt, and its not immediately apparent why you need those many
>>>>> or even those many varieties.
>>>>
>>>> data-structures is the same as yours.
>>>> fallback_reader_refcnt <--> reader_refcnt
>>>
>>> This has semantic problems, as noted above.
>>>
>>>> per-cpu spinlock <--> write_signal
>>>
>>> Acquire/release of (spin) lock is costlier than inc/dec of a counter, IIUC.
>>>
>>>> fallback_rwlock  <---> global_rwlock
>>>>
>>>>> Also I see that this doesn't handle the
>>>>> case of interrupt-handlers also being readers.
>>>>
>>>> handled. nested reader will see the ref or take the fallback_rwlock
>>>>
>>
>> Sorry, _reentrance_ read_lock() will see the ref or take the fallback_rwlock
>>
>>>
>>> I'm not referring to simple nested readers here, but interrupt handlers who
>>> can act as readers. For starters, the arch_spin_trylock() is not safe when
>>> interrupt handlers can also run the same code, right? You'll need to save
>>> and restore interrupts at critical points in the code. Also, the __foo()
>>> variants used to read/update the counters are not interrupt-safe.
>>
>> I must missed something.
>>
>> Could you elaborate more why arch_spin_trylock() is not safe when
>> interrupt handlers can also run the same code?
>>
>> Could you elaborate more why __this_cpu_op variants is not
>> interrupt-safe since they are always called paired.
>>
>
> Take a look at include/linux/percpu.h. You'll note that __this_cpu_*
> operations map to __this_cpu_generic_to_op(), which doesn't disable interrupts
> while doing the update. Hence you can get inconsistent results if an interrupt
> hits the CPU at that time and the interrupt handler tries to do the same thing.
> In contrast, if you use this_cpu_inc() for example, interrupts are explicitly
> disabled during the update and hence you won't get inconsistent results.


xx_lock()/xx_unlock() are must called paired, if interrupts happens
the value of the data is recovered after the interrupts return.

the same reason, preempt_disable() itself it is not irqsafe,
but preempt_disable()/preempt_enable() are called paired, so they are
all irqsafe.

>
>>
>>> And,
>>> the unlock() code in the reader path is again going to be confused about
>>> what to do when interrupt handlers interrupt regular readers, due to the
>>> messed up refcount.
>>
>> I still can't understand.
>>
>
> The primary reason _I_ was using the refcnt vs the reason _you_ were using the
> refcnt, appears to be very different.  Maybe that's why the above statement
> didn't make sense. In your case, IIUC, you can simply get rid of the refcnt
> and replace it with the simple check I mentioned above. Whereas, I use
> refcnts to keep the reader-side synchronization fast (and for reader-writer
> communication).
>
>
> Regards,
> Srivatsa S. Bhat
>
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Michel Lespinasse - Feb. 27, 2013, 11:11 a.m.
Hi Srivatsa,

I think there is some elegance in Lai's proposal of using a local
trylock for the reader uncontended case and global rwlock to deal with
the contended case without deadlocks. He apparently didn't realize
initially that nested read locks are common, and he seems to have
confused you because of that, but I think his proposal could be
changed easily to account for that and result in short, easily
understood code. What about the following:

- local_refcnt is a local lock count; it indicates how many recursive
locks are taken using the local lglock
- lglock is used by readers for local locking; it must be acquired
before local_refcnt becomes nonzero and released after local_refcnt
goes back to zero.
- fallback_rwlock is used by readers for global locking; it is
acquired when fallback_reader_refcnt is zero and the trylock fails on
lglock

+void lg_rwlock_local_read_lock(struct lgrwlock *lgrw)
+{
+       preempt_disable();
+
+       if (__this_cpu_read(*lgrw->local_refcnt) ||
+           arch_spin_trylock(this_cpu_ptr(lgrw->lglock->lock))) {
+               __this_cpu_inc(*lgrw->local_refcnt);
+
rwlock_acquire_read(&lgrw->fallback_rwlock->lock_dep_map, 0, 0,
_RET_IP_);
+       } else {
+               read_lock(&lgrw->fallback_rwlock);
+       }
+}
+EXPORT_SYMBOL(lg_rwlock_local_read_lock);
+
+void lg_rwlock_local_read_unlock(struct lgrwlock *lgrw)
+{
+       if (likely(__this_cpu_read(*lgrw->local_refcnt))) {
+               rwlock_release(&lgrw->fallback_rwlock->lock_dep_map,
1, _RET_IP_);
+               if (!__this_cpu_dec_return(*lgrw->local_refcnt))
+                       arch_spin_unlock(this_cpu_ptr(lgrw->lglock->lock));
+       } else {
+               read_unlock(&lgrw->fallback_rwlock);
+       }
+
+       preempt_enable();
+}
+EXPORT_SYMBOL(lg_rwlock_local_read_unlock);
+
+void lg_rwlock_global_write_lock(struct lgrwlock *lgrw)
+{
+       int i;
+
+       preempt_disable();
+
+       for_each_possible_cpu(i)
+               arch_spin_lock(per_cpu_ptr(lgrw->lglock->lock, i));
+       write_lock(&lgrw->fallback_rwlock);
+}
+EXPORT_SYMBOL(lg_rwlock_global_write_lock);
+
+void lg_rwlock_global_write_unlock(struct lgrwlock *lgrw)
+{
+       int i;
+
+       write_unlock(&lgrw->fallback_rwlock);
+       for_each_possible_cpu(i)
+               arch_spin_unlock(per_cpu_ptr(lgrw->lglock->lock, i));
+
+       preempt_enable();
+}
+EXPORT_SYMBOL(lg_rwlock_global_write_unlock);

This is to me relatively easier to understand than Srivatsa's
proposal. Now I'm not sure who wins efficiency wise, but I think it
should be relatively close as readers at least don't touch shared
state in the uncontended case (even with some recursion going on).

There is an interesting case where lg_rwlock_local_read_lock could be
interrupted after getting the local lglock but before incrementing
local_refcnt to 1; if that happens any nested readers within that
interrupt will have to take the global rwlock read side. I think this
is perfectly acceptable as this should not be a common case though
(and thus the global rwlock cache line probably wouldn't even bounce
between cpus then).
Oleg Nesterov - Feb. 27, 2013, 7:25 p.m.
On 02/27, Michel Lespinasse wrote:
>
> +void lg_rwlock_local_read_lock(struct lgrwlock *lgrw)
> +{
> +       preempt_disable();
> +
> +       if (__this_cpu_read(*lgrw->local_refcnt) ||
> +           arch_spin_trylock(this_cpu_ptr(lgrw->lglock->lock))) {
> +               __this_cpu_inc(*lgrw->local_refcnt);

Please look at __this_cpu_generic_to_op(). You need this_cpu_inc()
to avoid the race with irs. The same for _read_unlock.

But otherwise I agree, looks like a clever and working idea to me.
And simple!

> There is an interesting case where lg_rwlock_local_read_lock could be
> interrupted after getting the local lglock but before incrementing
> local_refcnt to 1; if that happens any nested readers within that
> interrupt will have to take the global rwlock read side. I think this
> is perfectly acceptable

Agreed.

Or interrupt can do spin_trylock(percpu-lock) after we take the global
->fallback_rwlock (if we race with write_lock + write_unlock), but I do
not see any possible deadlock in this case.

Oleg.

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Srivatsa S. Bhat - Feb. 27, 2013, 9:19 p.m.
On 02/27/2013 06:03 AM, Lai Jiangshan wrote:
> On Wed, Feb 27, 2013 at 3:30 AM, Srivatsa S. Bhat
> <srivatsa.bhat@linux.vnet.ibm.com> wrote:
>> On 02/26/2013 09:55 PM, Lai Jiangshan wrote:
>>> On Tue, Feb 26, 2013 at 10:22 PM, Srivatsa S. Bhat
>>> <srivatsa.bhat@linux.vnet.ibm.com> wrote:
>>>>
>>>> Hi Lai,
>>>>
>>>> I'm really not convinced that piggy-backing on lglocks would help
>>>> us in any way. But still, let me try to address some of the points
>>>> you raised...
>>>>
>>>> On 02/26/2013 06:29 PM, Lai Jiangshan wrote:
>>>>> On Tue, Feb 26, 2013 at 5:02 PM, Srivatsa S. Bhat
>>>>> <srivatsa.bhat@linux.vnet.ibm.com> wrote:
>>>>>> On 02/26/2013 05:47 AM, Lai Jiangshan wrote:
>>>>>>> On Tue, Feb 26, 2013 at 3:26 AM, Srivatsa S. Bhat
>>>>>>> <srivatsa.bhat@linux.vnet.ibm.com> wrote:
>>>>>>>> Hi Lai,
>>>>>>>>
>>>>>>>> On 02/25/2013 09:23 PM, Lai Jiangshan wrote:
>>>>>>>>> Hi, Srivatsa,
>>>>>>>>>
>>>>>>>>> The target of the whole patchset is nice for me.
>>>>>>>>
>>>>>>>> Cool! Thanks :-)
>>>>>>>>
>>>>>> [...]
>>>>>>
>>>>>> Unfortunately, I see quite a few issues with the code above. IIUC, the
>>>>>> writer and the reader both increment the same counters. So how will the
>>>>>> unlock() code in the reader path know when to unlock which of the locks?
>>>>>
>>>>> The same as your code, the reader(which nested in write C.S.) just dec
>>>>> the counters.
>>>>
>>>> And that works fine in my case because the writer and the reader update
>>>> _two_ _different_ counters.
>>>
>>> I can't find any magic in your code, they are the same counter.
>>>
>>>         /*
>>>          * It is desirable to allow the writer to acquire the percpu-rwlock
>>>          * for read (if necessary), without deadlocking or getting complaints
>>>          * from lockdep. To achieve that, just increment the reader_refcnt of
>>>          * this CPU - that way, any attempt by the writer to acquire the
>>>          * percpu-rwlock for read, will get treated as a case of nested percpu
>>>          * reader, which is safe, from a locking perspective.
>>>          */
>>>         this_cpu_inc(pcpu_rwlock->rw_state->reader_refcnt);
>>>
>>
>> Whoa! Hold on, were you really referring to _this_ increment when you said
>> that, in your patch you would increment the refcnt at the writer? Then I guess
>> there is a major disconnect in our conversations. (I had assumed that you were
>> referring to the update of writer_signal, and were just trying to have a single
>> refcnt instead of reader_refcnt and writer_signal).
> 
> https://github.com/laijs/linux/commit/53e5053d5b724bea7c538b11743d0f420d98f38d
> 
> Sorry the name "fallback_reader_refcnt" misled you.
> 
[...]

>>> All I was considered is "nested reader is seldom", so I always
>>> fallback to rwlock when nested.
>>> If you like, I can add 6 lines of code, the overhead is
>>> 1 spin_try_lock()(fast path)  + N  __this_cpu_inc()
>>>
>>
>> I'm assuming that calculation is no longer valid, considering that
>> we just discussed how the per-cpu refcnt that you were using is quite
>> unnecessary and can be removed.
>>
>> IIUC, the overhead with your code, as per above discussion would be:
>> 1 spin_try_lock() [non-nested] + N read_lock(global_rwlock).
> 
> https://github.com/laijs/linux/commit/46334544bb7961550b7065e015da76f6dab21f16
> 
> Again, I'm so sorry the name "fallback_reader_refcnt" misled you.
> 

At this juncture I really have to admit that I don't understand your
intentions at all. What are you really trying to prove? Without giving
a single good reason why my code is inferior, why are you even bringing
up the discussion about a complete rewrite of the synchronization code?
http://article.gmane.org/gmane.linux.kernel.cross-arch/17103
http://article.gmane.org/gmane.linux.power-management.general/31345

I'm beginning to add 2 + 2 together based on the kinds of questions you
have been asking...

You posted a patch in this thread and started a discussion around it without
even establishing a strong reason to do so. Now you point me to your git
tree where your patches have even more traces of ideas being borrowed from
my patchset (apart from my own ideas/code, there are traces of others' ideas
being borrowed too - for example, it was Oleg who originally proposed the
idea of splitting up the counter into 2 parts and I'm seeing that it is
slowly crawling into your code with no sign of appropriate credits).
http://article.gmane.org/gmane.linux.network/260288

And in reply to my mail pointing out the performance implications of the
global read_lock at the reader side in your code, you said you'll come up
with a comparison between that and my patchset.
http://article.gmane.org/gmane.linux.network/260288
The issue has been well-documented in my patch description of patch 4.
http://article.gmane.org/gmane.linux.kernel/1443258

Are you really trying to pit bits and pieces of my own ideas/versions
against one another and claiming them as your own?

You projected the work involved in handling the locking issues pertaining
to CPU_DYING notifiers etc as a TODO, despite the fact that I had explicitly
noted in my cover letter that I had audited and taken care of all of them.
http://article.gmane.org/gmane.linux.documentation/9727
http://article.gmane.org/gmane.linux.documentation/9520

You failed to acknowledge (on purpose?) that I had done a tree-wide
conversion despite the fact that you were replying to the very thread which
had the 46 patches which did exactly that (and I had also mentioned it
explicitly in my cover letter).
http://article.gmane.org/gmane.linux.documentation/9727
http://article.gmane.org/gmane.linux.documentation/9520

You then started probing more and more about the technique I used to do
the tree-wide conversion.
http://article.gmane.org/gmane.linux.kernel.cross-arch/17111

You also retorted saying you did go through my patch descriptions, so
its not like you have missed reading them.
http://article.gmane.org/gmane.linux.power-management.general/31345

Each of these when considered individually, might appear like innocuous and
honest attempts at evaluating my code. But when put together, I'm beginning
to sense a whole different angle to it altogether, as if you are trying
to spin your own patch series, complete with the locking framework _and_
the tree-wide conversion, heavily borrowed from mine. At the beginning of
this discussion, I predicted that the lglock version that you are proposing
would end up being either less efficient than my version or look very similar
to my version. http://article.gmane.org/gmane.linux.kernel/1447139

I thought it was just the former till now, but its not hard to see how it
is getting closer to becoming the latter too. So yeah, I'm not amused.

Maybe (and hopefully) you are just trying out different ideas on your own,
and I'm just being paranoid. I really hope that is the case. If you are just
trying to review my code, then please stop sending patches with borrowed ideas
with your sole Signed-off-by, and purposefully ignoring the work already done
in my patchset, because it is really starting to look suspicious, at least
to me.

Don't get me wrong - I'll whole-heartedly acknowledge and appreciate if
_your_ code is better than mine. I just don't like the idea of somebody
plagiarizing my ideas/code (or even others' ideas for that matter).
However, I sincerely apologize in advance if I misunderstood/misjudged your
intentions; I just wanted to voice my concerns out loud at this point,
considering the bad feeling I got by looking at your responses collectively.

Regards,
Srivatsa S. Bhat

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Michel Lespinasse - Feb. 28, 2013, 11:34 a.m.
On Thu, Feb 28, 2013 at 3:25 AM, Oleg Nesterov <oleg@redhat.com> wrote:
> On 02/27, Michel Lespinasse wrote:
>>
>> +void lg_rwlock_local_read_lock(struct lgrwlock *lgrw)
>> +{
>> +       preempt_disable();
>> +
>> +       if (__this_cpu_read(*lgrw->local_refcnt) ||
>> +           arch_spin_trylock(this_cpu_ptr(lgrw->lglock->lock))) {
>> +               __this_cpu_inc(*lgrw->local_refcnt);
>
> Please look at __this_cpu_generic_to_op(). You need this_cpu_inc()
> to avoid the race with irs. The same for _read_unlock.

Hmmm, I was thinking that this was safe because while interrupts might
modify local_refcnt to acquire a nested read lock, they are expected
to release that lock as well which would set local_refcnt back to its
original value ???
Oleg Nesterov - Feb. 28, 2013, 6 p.m.
On 02/28, Michel Lespinasse wrote:
>
> On Thu, Feb 28, 2013 at 3:25 AM, Oleg Nesterov <oleg@redhat.com> wrote:
> > On 02/27, Michel Lespinasse wrote:
> >>
> >> +void lg_rwlock_local_read_lock(struct lgrwlock *lgrw)
> >> +{
> >> +       preempt_disable();
> >> +
> >> +       if (__this_cpu_read(*lgrw->local_refcnt) ||
> >> +           arch_spin_trylock(this_cpu_ptr(lgrw->lglock->lock))) {
> >> +               __this_cpu_inc(*lgrw->local_refcnt);
> >
> > Please look at __this_cpu_generic_to_op(). You need this_cpu_inc()
> > to avoid the race with irs. The same for _read_unlock.
>
> Hmmm, I was thinking that this was safe because while interrupts might
> modify local_refcnt to acquire a nested read lock, they are expected
> to release that lock as well which would set local_refcnt back to its
> original value ???

Yes, yes, this is correct.

I meant that (in general, x86 is fine) __this_cpu_inc() itself is not
irq-safe. It simply does "pcp += 1".

this_cpu_inc() is fine, _this_cpu_generic_to_op() does cli/sti around.

I know this only because I did the same mistake recently, and Srivatsa
explained the problem to me ;)

Oleg.

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Oleg Nesterov - Feb. 28, 2013, 6:20 p.m.
On 02/28, Oleg Nesterov wrote:
> On 02/28, Michel Lespinasse wrote:
> >
> > On Thu, Feb 28, 2013 at 3:25 AM, Oleg Nesterov <oleg@redhat.com> wrote:
> > > On 02/27, Michel Lespinasse wrote:
> > >>
> > >> +void lg_rwlock_local_read_lock(struct lgrwlock *lgrw)
> > >> +{
> > >> +       preempt_disable();
> > >> +
> > >> +       if (__this_cpu_read(*lgrw->local_refcnt) ||
> > >> +           arch_spin_trylock(this_cpu_ptr(lgrw->lglock->lock))) {
> > >> +               __this_cpu_inc(*lgrw->local_refcnt);
> > >
> > > Please look at __this_cpu_generic_to_op(). You need this_cpu_inc()
> > > to avoid the race with irs. The same for _read_unlock.
> >
> > Hmmm, I was thinking that this was safe because while interrupts might
> > modify local_refcnt to acquire a nested read lock, they are expected
> > to release that lock as well which would set local_refcnt back to its
> > original value ???
>
> Yes, yes, this is correct.
>
> I meant that (in general, x86 is fine) __this_cpu_inc() itself is not
> irq-safe. It simply does "pcp += 1".
>
> this_cpu_inc() is fine, _this_cpu_generic_to_op() does cli/sti around.

Just in case, it is not that I really understand why __this_cpu_inc() can
race with irq in this particular case (given that irq handler should
restore the counter).

So perhaps I am wrong again. The comments in include/linux/percpu.h look
confusing to me, and I simply know nothing about !x86 architectures. But
since, say, preempt_disable() doesn't do anything special then probably
__this_cpu_inc() is fine too.

In short: please ignore me ;)

Oleg.

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Lai Jiangshan - March 1, 2013, 5:50 p.m.
On 28/02/13 05:19, Srivatsa S. Bhat wrote:
> On 02/27/2013 06:03 AM, Lai Jiangshan wrote:
>> On Wed, Feb 27, 2013 at 3:30 AM, Srivatsa S. Bhat
>> <srivatsa.bhat@linux.vnet.ibm.com> wrote:
>>> On 02/26/2013 09:55 PM, Lai Jiangshan wrote:
>>>> On Tue, Feb 26, 2013 at 10:22 PM, Srivatsa S. Bhat
>>>> <srivatsa.bhat@linux.vnet.ibm.com> wrote:
>>>>>
>>>>> Hi Lai,
>>>>>
>>>>> I'm really not convinced that piggy-backing on lglocks would help
>>>>> us in any way. But still, let me try to address some of the points
>>>>> you raised...
>>>>>
>>>>> On 02/26/2013 06:29 PM, Lai Jiangshan wrote:
>>>>>> On Tue, Feb 26, 2013 at 5:02 PM, Srivatsa S. Bhat
>>>>>> <srivatsa.bhat@linux.vnet.ibm.com> wrote:
>>>>>>> On 02/26/2013 05:47 AM, Lai Jiangshan wrote:
>>>>>>>> On Tue, Feb 26, 2013 at 3:26 AM, Srivatsa S. Bhat
>>>>>>>> <srivatsa.bhat@linux.vnet.ibm.com> wrote:
>>>>>>>>> Hi Lai,
>>>>>>>>>
>>>>>>>>> On 02/25/2013 09:23 PM, Lai Jiangshan wrote:
>>>>>>>>>> Hi, Srivatsa,
>>>>>>>>>>
>>>>>>>>>> The target of the whole patchset is nice for me.
>>>>>>>>>
>>>>>>>>> Cool! Thanks :-)
>>>>>>>>>
>>>>>>> [...]
>>>>>>>
>>>>>>> Unfortunately, I see quite a few issues with the code above. IIUC, the
>>>>>>> writer and the reader both increment the same counters. So how will the
>>>>>>> unlock() code in the reader path know when to unlock which of the locks?
>>>>>>
>>>>>> The same as your code, the reader(which nested in write C.S.) just dec
>>>>>> the counters.
>>>>>
>>>>> And that works fine in my case because the writer and the reader update
>>>>> _two_ _different_ counters.
>>>>
>>>> I can't find any magic in your code, they are the same counter.
>>>>
>>>>         /*
>>>>          * It is desirable to allow the writer to acquire the percpu-rwlock
>>>>          * for read (if necessary), without deadlocking or getting complaints
>>>>          * from lockdep. To achieve that, just increment the reader_refcnt of
>>>>          * this CPU - that way, any attempt by the writer to acquire the
>>>>          * percpu-rwlock for read, will get treated as a case of nested percpu
>>>>          * reader, which is safe, from a locking perspective.
>>>>          */
>>>>         this_cpu_inc(pcpu_rwlock->rw_state->reader_refcnt);
>>>>
>>>
>>> Whoa! Hold on, were you really referring to _this_ increment when you said
>>> that, in your patch you would increment the refcnt at the writer? Then I guess
>>> there is a major disconnect in our conversations. (I had assumed that you were
>>> referring to the update of writer_signal, and were just trying to have a single
>>> refcnt instead of reader_refcnt and writer_signal).
>>
>> https://github.com/laijs/linux/commit/53e5053d5b724bea7c538b11743d0f420d98f38d
>>
>> Sorry the name "fallback_reader_refcnt" misled you.
>>
> [...]
> 
>>>> All I was considered is "nested reader is seldom", so I always
>>>> fallback to rwlock when nested.
>>>> If you like, I can add 6 lines of code, the overhead is
>>>> 1 spin_try_lock()(fast path)  + N  __this_cpu_inc()
>>>>
>>>
>>> I'm assuming that calculation is no longer valid, considering that
>>> we just discussed how the per-cpu refcnt that you were using is quite
>>> unnecessary and can be removed.
>>>
>>> IIUC, the overhead with your code, as per above discussion would be:
>>> 1 spin_try_lock() [non-nested] + N read_lock(global_rwlock).
>>
>> https://github.com/laijs/linux/commit/46334544bb7961550b7065e015da76f6dab21f16
>>
>> Again, I'm so sorry the name "fallback_reader_refcnt" misled you.
>>
> 
> At this juncture I really have to admit that I don't understand your
> intentions at all. What are you really trying to prove? Without giving
> a single good reason why my code is inferior, why are you even bringing
> up the discussion about a complete rewrite of the synchronization code?
> http://article.gmane.org/gmane.linux.kernel.cross-arch/17103
> http://article.gmane.org/gmane.linux.power-management.general/31345
> 
> I'm beginning to add 2 + 2 together based on the kinds of questions you
> have been asking...
> 
> You posted a patch in this thread and started a discussion around it without
> even establishing a strong reason to do so. Now you point me to your git
> tree where your patches have even more traces of ideas being borrowed from
> my patchset (apart from my own ideas/code, there are traces of others' ideas
> being borrowed too - for example, it was Oleg who originally proposed the
> idea of splitting up the counter into 2 parts and I'm seeing that it is
> slowly crawling into your code with no sign of appropriate credits).
> http://article.gmane.org/gmane.linux.network/260288
> 
> And in reply to my mail pointing out the performance implications of the
> global read_lock at the reader side in your code, you said you'll come up
> with a comparison between that and my patchset.
> http://article.gmane.org/gmane.linux.network/260288
> The issue has been well-documented in my patch description of patch 4.
> http://article.gmane.org/gmane.linux.kernel/1443258
> 
> Are you really trying to pit bits and pieces of my own ideas/versions
> against one another and claiming them as your own?
> 
> You projected the work involved in handling the locking issues pertaining
> to CPU_DYING notifiers etc as a TODO, despite the fact that I had explicitly
> noted in my cover letter that I had audited and taken care of all of them.
> http://article.gmane.org/gmane.linux.documentation/9727
> http://article.gmane.org/gmane.linux.documentation/9520
> 
> You failed to acknowledge (on purpose?) that I had done a tree-wide
> conversion despite the fact that you were replying to the very thread which
> had the 46 patches which did exactly that (and I had also mentioned it
> explicitly in my cover letter).
> http://article.gmane.org/gmane.linux.documentation/9727
> http://article.gmane.org/gmane.linux.documentation/9520
> 
> You then started probing more and more about the technique I used to do
> the tree-wide conversion.
> http://article.gmane.org/gmane.linux.kernel.cross-arch/17111
> 
> You also retorted saying you did go through my patch descriptions, so
> its not like you have missed reading them.
> http://article.gmane.org/gmane.linux.power-management.general/31345
> 
> Each of these when considered individually, might appear like innocuous and
> honest attempts at evaluating my code. But when put together, I'm beginning
> to sense a whole different angle to it altogether, as if you are trying
> to spin your own patch series, complete with the locking framework _and_
> the tree-wide conversion, heavily borrowed from mine. At the beginning of
> this discussion, I predicted that the lglock version that you are proposing
> would end up being either less efficient than my version or look very similar
> to my version. http://article.gmane.org/gmane.linux.kernel/1447139
> 
> I thought it was just the former till now, but its not hard to see how it
> is getting closer to becoming the latter too. So yeah, I'm not amused.
> 
> Maybe (and hopefully) you are just trying out different ideas on your own,
> and I'm just being paranoid. I really hope that is the case. If you are just
> trying to review my code, then please stop sending patches with borrowed ideas
> with your sole Signed-off-by, and purposefully ignoring the work already done
> in my patchset, because it is really starting to look suspicious, at least
> to me.
> 
> Don't get me wrong - I'll whole-heartedly acknowledge and appreciate if
> _your_ code is better than mine. I just don't like the idea of somebody
> plagiarizing my ideas/code (or even others' ideas for that matter).
> However, I sincerely apologize in advance if I misunderstood/misjudged your
> intentions; I just wanted to voice my concerns out loud at this point,
> considering the bad feeling I got by looking at your responses collectively.
> 

Hi, Srivatsa

I'm sorry, big apology to you.
I'm bad in communication and I did be wrong.
I tended to improve the codes but in false direction.

Thanks,
Lai
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Tejun Heo - March 1, 2013, 6:10 p.m.
Hello, Srivatsa.

On Thu, Feb 28, 2013 at 02:49:53AM +0530, Srivatsa S. Bhat wrote:
> Don't get me wrong - I'll whole-heartedly acknowledge and appreciate if
> _your_ code is better than mine. I just don't like the idea of somebody
> plagiarizing my ideas/code (or even others' ideas for that matter).
> However, I sincerely apologize in advance if I misunderstood/misjudged your
> intentions; I just wanted to voice my concerns out loud at this point,
> considering the bad feeling I got by looking at your responses collectively.

Although I don't know Lai personally, from my experience working with
him past several months on workqueue, I strongly doubt he has dark
urterior intenions.  The biggest problem probably is that his
communication in English isn't very fluent yet and thus doesn't carry
the underlying intentions or tones very well and he tends to bombard
patches in quick succession without much explanation inbetween (during
3.8 cycle, I was receiving several workqueue patchsets days apart
trying to solve the same problem with completely different approaches
way before the discussion on the previous postings reach any kind of
consensus).  A lot of that also probably comes from communication
problems.

Anyways, I really don't think he's trying to take claim of your work.
At least for now, you kinda need to speculate what he's trying to do
and then confirm that back to him, but once you get used to that part,
he's pretty nice to work with and I'm sure his communication will
improve in time (I think it has gotten a lot better than only some
months ago).

Thanks!
Srivatsa S. Bhat - March 1, 2013, 7:47 p.m.
On 03/01/2013 11:20 PM, Lai Jiangshan wrote:
> On 28/02/13 05:19, Srivatsa S. Bhat wrote:
>> On 02/27/2013 06:03 AM, Lai Jiangshan wrote:
>>> On Wed, Feb 27, 2013 at 3:30 AM, Srivatsa S. Bhat
>>> <srivatsa.bhat@linux.vnet.ibm.com> wrote:
>>>> On 02/26/2013 09:55 PM, Lai Jiangshan wrote:
>>>>> On Tue, Feb 26, 2013 at 10:22 PM, Srivatsa S. Bhat
>>>>> <srivatsa.bhat@linux.vnet.ibm.com> wrote:
>>>>>>
>>>>>> Hi Lai,
>>>>>>
>>>>>> I'm really not convinced that piggy-backing on lglocks would help
>>>>>> us in any way. But still, let me try to address some of the points
>>>>>> you raised...
>>>>>>
>>>>>> On 02/26/2013 06:29 PM, Lai Jiangshan wrote:
>>>>>>> On Tue, Feb 26, 2013 at 5:02 PM, Srivatsa S. Bhat
>>>>>>> <srivatsa.bhat@linux.vnet.ibm.com> wrote:
>>>>>>>> On 02/26/2013 05:47 AM, Lai Jiangshan wrote:
>>>>>>>>> On Tue, Feb 26, 2013 at 3:26 AM, Srivatsa S. Bhat
>>>>>>>>> <srivatsa.bhat@linux.vnet.ibm.com> wrote:
>>>>>>>>>> Hi Lai,
>>>>>>>>>>
>>>>>>>>>> On 02/25/2013 09:23 PM, Lai Jiangshan wrote:
>>>>>>>>>>> Hi, Srivatsa,
>>>>>>>>>>>
>>>>>>>>>>> The target of the whole patchset is nice for me.
>>>>>>>>>>
>>>>>>>>>> Cool! Thanks :-)
>>>>>>>>>>
>>>>>>>> [...]
>>>>>>>>
>>>>>>>> Unfortunately, I see quite a few issues with the code above. IIUC, the
>>>>>>>> writer and the reader both increment the same counters. So how will the
>>>>>>>> unlock() code in the reader path know when to unlock which of the locks?
>>>>>>>
>>>>>>> The same as your code, the reader(which nested in write C.S.) just dec
>>>>>>> the counters.
>>>>>>
>>>>>> And that works fine in my case because the writer and the reader update
>>>>>> _two_ _different_ counters.
>>>>>
>>>>> I can't find any magic in your code, they are the same counter.
>>>>>
>>>>>         /*
>>>>>          * It is desirable to allow the writer to acquire the percpu-rwlock
>>>>>          * for read (if necessary), without deadlocking or getting complaints
>>>>>          * from lockdep. To achieve that, just increment the reader_refcnt of
>>>>>          * this CPU - that way, any attempt by the writer to acquire the
>>>>>          * percpu-rwlock for read, will get treated as a case of nested percpu
>>>>>          * reader, which is safe, from a locking perspective.
>>>>>          */
>>>>>         this_cpu_inc(pcpu_rwlock->rw_state->reader_refcnt);
>>>>>
>>>>
>>>> Whoa! Hold on, were you really referring to _this_ increment when you said
>>>> that, in your patch you would increment the refcnt at the writer? Then I guess
>>>> there is a major disconnect in our conversations. (I had assumed that you were
>>>> referring to the update of writer_signal, and were just trying to have a single
>>>> refcnt instead of reader_refcnt and writer_signal).
>>>
>>> https://github.com/laijs/linux/commit/53e5053d5b724bea7c538b11743d0f420d98f38d
>>>
>>> Sorry the name "fallback_reader_refcnt" misled you.
>>>
>> [...]
>>
>>>>> All I was considered is "nested reader is seldom", so I always
>>>>> fallback to rwlock when nested.
>>>>> If you like, I can add 6 lines of code, the overhead is
>>>>> 1 spin_try_lock()(fast path)  + N  __this_cpu_inc()
>>>>>
>>>>
>>>> I'm assuming that calculation is no longer valid, considering that
>>>> we just discussed how the per-cpu refcnt that you were using is quite
>>>> unnecessary and can be removed.
>>>>
>>>> IIUC, the overhead with your code, as per above discussion would be:
>>>> 1 spin_try_lock() [non-nested] + N read_lock(global_rwlock).
>>>
>>> https://github.com/laijs/linux/commit/46334544bb7961550b7065e015da76f6dab21f16
>>>
>>> Again, I'm so sorry the name "fallback_reader_refcnt" misled you.
>>>
>>
>> At this juncture I really have to admit that I don't understand your
>> intentions at all. What are you really trying to prove? Without giving
>> a single good reason why my code is inferior, why are you even bringing
>> up the discussion about a complete rewrite of the synchronization code?
>> http://article.gmane.org/gmane.linux.kernel.cross-arch/17103
>> http://article.gmane.org/gmane.linux.power-management.general/31345
>>
>> I'm beginning to add 2 + 2 together based on the kinds of questions you
>> have been asking...
>>
>> You posted a patch in this thread and started a discussion around it without
>> even establishing a strong reason to do so. Now you point me to your git
>> tree where your patches have even more traces of ideas being borrowed from
>> my patchset (apart from my own ideas/code, there are traces of others' ideas
>> being borrowed too - for example, it was Oleg who originally proposed the
>> idea of splitting up the counter into 2 parts and I'm seeing that it is
>> slowly crawling into your code with no sign of appropriate credits).
>> http://article.gmane.org/gmane.linux.network/260288
>>
>> And in reply to my mail pointing out the performance implications of the
>> global read_lock at the reader side in your code, you said you'll come up
>> with a comparison between that and my patchset.
>> http://article.gmane.org/gmane.linux.network/260288
>> The issue has been well-documented in my patch description of patch 4.
>> http://article.gmane.org/gmane.linux.kernel/1443258
>>
>> Are you really trying to pit bits and pieces of my own ideas/versions
>> against one another and claiming them as your own?
>>
>> You projected the work involved in handling the locking issues pertaining
>> to CPU_DYING notifiers etc as a TODO, despite the fact that I had explicitly
>> noted in my cover letter that I had audited and taken care of all of them.
>> http://article.gmane.org/gmane.linux.documentation/9727
>> http://article.gmane.org/gmane.linux.documentation/9520
>>
>> You failed to acknowledge (on purpose?) that I had done a tree-wide
>> conversion despite the fact that you were replying to the very thread which
>> had the 46 patches which did exactly that (and I had also mentioned it
>> explicitly in my cover letter).
>> http://article.gmane.org/gmane.linux.documentation/9727
>> http://article.gmane.org/gmane.linux.documentation/9520
>>
>> You then started probing more and more about the technique I used to do
>> the tree-wide conversion.
>> http://article.gmane.org/gmane.linux.kernel.cross-arch/17111
>>
>> You also retorted saying you did go through my patch descriptions, so
>> its not like you have missed reading them.
>> http://article.gmane.org/gmane.linux.power-management.general/31345
>>
>> Each of these when considered individually, might appear like innocuous and
>> honest attempts at evaluating my code. But when put together, I'm beginning
>> to sense a whole different angle to it altogether, as if you are trying
>> to spin your own patch series, complete with the locking framework _and_
>> the tree-wide conversion, heavily borrowed from mine. At the beginning of
>> this discussion, I predicted that the lglock version that you are proposing
>> would end up being either less efficient than my version or look very similar
>> to my version. http://article.gmane.org/gmane.linux.kernel/1447139
>>
>> I thought it was just the former till now, but its not hard to see how it
>> is getting closer to becoming the latter too. So yeah, I'm not amused.
>>
>> Maybe (and hopefully) you are just trying out different ideas on your own,
>> and I'm just being paranoid. I really hope that is the case. If you are just
>> trying to review my code, then please stop sending patches with borrowed ideas
>> with your sole Signed-off-by, and purposefully ignoring the work already done
>> in my patchset, because it is really starting to look suspicious, at least
>> to me.
>>
>> Don't get me wrong - I'll whole-heartedly acknowledge and appreciate if
>> _your_ code is better than mine. I just don't like the idea of somebody
>> plagiarizing my ideas/code (or even others' ideas for that matter).
>> However, I sincerely apologize in advance if I misunderstood/misjudged your
>> intentions; I just wanted to voice my concerns out loud at this point,
>> considering the bad feeling I got by looking at your responses collectively.
>>
> 
> Hi, Srivatsa
> 
> I'm sorry, big apology to you.
> I'm bad in communication and I did be wrong.
> I tended to improve the codes but in false direction.
> 

OK, in that case, I'm extremely sorry too, for jumping on you like that.
I hope you'll forgive me for the uneasiness it caused.

Now that I understand that you were simply trying to help, I would like to
express my gratitude for your time, effort and inputs in improving the design
of the stop-machine replacement.

I'm looking forward to working with you on this as well as future endeavours,
so I sincerely hope that we can put this unfortunate incident behind us and
collaborate effectively with renewed mutual trust and good-will.

Thank you very much!

Regards,
Srivatsa S. Bhat

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Srivatsa S. Bhat - March 1, 2013, 7:59 p.m.
Hi Tejun,

On 03/01/2013 11:40 PM, Tejun Heo wrote:
> Hello, Srivatsa.
> 
> On Thu, Feb 28, 2013 at 02:49:53AM +0530, Srivatsa S. Bhat wrote:
>> Don't get me wrong - I'll whole-heartedly acknowledge and appreciate if
>> _your_ code is better than mine. I just don't like the idea of somebody
>> plagiarizing my ideas/code (or even others' ideas for that matter).
>> However, I sincerely apologize in advance if I misunderstood/misjudged your
>> intentions; I just wanted to voice my concerns out loud at this point,
>> considering the bad feeling I got by looking at your responses collectively.
> 
> Although I don't know Lai personally, from my experience working with
> him past several months on workqueue, I strongly doubt he has dark
> urterior intenions.  The biggest problem probably is that his
> communication in English isn't very fluent yet and thus doesn't carry
> the underlying intentions or tones very well and he tends to bombard
> patches in quick succession without much explanation inbetween (during
> 3.8 cycle, I was receiving several workqueue patchsets days apart
> trying to solve the same problem with completely different approaches
> way before the discussion on the previous postings reach any kind of
> consensus).  A lot of that also probably comes from communication
> problems.
> 
> Anyways, I really don't think he's trying to take claim of your work.
> At least for now, you kinda need to speculate what he's trying to do
> and then confirm that back to him, but once you get used to that part,
> he's pretty nice to work with and I'm sure his communication will
> improve in time (I think it has gotten a lot better than only some
> months ago).
>

Thank you so much for helping clear the air, Tejun! I'm truly sorry again
for having drawn the wrong conclusions based on gaps in our communication.
It was a mistake on my part and I would like to request Lai to kindly
forgive me. I'm looking forward to working with Lai effectively, going
forward.

Thank you!

Regards,
Srivatsa S. Bhat

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Lai Jiangshan - March 5, 2013, 4:25 p.m.
On 02/03/13 03:47, Srivatsa S. Bhat wrote:
> On 03/01/2013 11:20 PM, Lai Jiangshan wrote:
>> On 28/02/13 05:19, Srivatsa S. Bhat wrote:
>>> On 02/27/2013 06:03 AM, Lai Jiangshan wrote:
>>>> On Wed, Feb 27, 2013 at 3:30 AM, Srivatsa S. Bhat
>>>> <srivatsa.bhat@linux.vnet.ibm.com> wrote:
>>>>> On 02/26/2013 09:55 PM, Lai Jiangshan wrote:
>>>>>> On Tue, Feb 26, 2013 at 10:22 PM, Srivatsa S. Bhat
>>>>>> <srivatsa.bhat@linux.vnet.ibm.com> wrote:
>>>>>>>
>>>>>>> Hi Lai,
>>>>>>>
>>>>>>> I'm really not convinced that piggy-backing on lglocks would help
>>>>>>> us in any way. But still, let me try to address some of the points
>>>>>>> you raised...
>>>>>>>
>>>>>>> On 02/26/2013 06:29 PM, Lai Jiangshan wrote:
>>>>>>>> On Tue, Feb 26, 2013 at 5:02 PM, Srivatsa S. Bhat
>>>>>>>> <srivatsa.bhat@linux.vnet.ibm.com> wrote:
>>>>>>>>> On 02/26/2013 05:47 AM, Lai Jiangshan wrote:
>>>>>>>>>> On Tue, Feb 26, 2013 at 3:26 AM, Srivatsa S. Bhat
>>>>>>>>>> <srivatsa.bhat@linux.vnet.ibm.com> wrote:
>>>>>>>>>>> Hi Lai,
>>>>>>>>>>>
>>>>>>>>>>> On 02/25/2013 09:23 PM, Lai Jiangshan wrote:
>>>>>>>>>>>> Hi, Srivatsa,
>>>>>>>>>>>>
>>>>>>>>>>>> The target of the whole patchset is nice for me.
>>>>>>>>>>>
>>>>>>>>>>> Cool! Thanks :-)
>>>>>>>>>>>
>>>>>>>>> [...]
>>>>>>>>>
>>>>>>>>> Unfortunately, I see quite a few issues with the code above. IIUC, the
>>>>>>>>> writer and the reader both increment the same counters. So how will the
>>>>>>>>> unlock() code in the reader path know when to unlock which of the locks?
>>>>>>>>
>>>>>>>> The same as your code, the reader(which nested in write C.S.) just dec
>>>>>>>> the counters.
>>>>>>>
>>>>>>> And that works fine in my case because the writer and the reader update
>>>>>>> _two_ _different_ counters.
>>>>>>
>>>>>> I can't find any magic in your code, they are the same counter.
>>>>>>
>>>>>>         /*
>>>>>>          * It is desirable to allow the writer to acquire the percpu-rwlock
>>>>>>          * for read (if necessary), without deadlocking or getting complaints
>>>>>>          * from lockdep. To achieve that, just increment the reader_refcnt of
>>>>>>          * this CPU - that way, any attempt by the writer to acquire the
>>>>>>          * percpu-rwlock for read, will get treated as a case of nested percpu
>>>>>>          * reader, which is safe, from a locking perspective.
>>>>>>          */
>>>>>>         this_cpu_inc(pcpu_rwlock->rw_state->reader_refcnt);
>>>>>>
>>>>>
>>>>> Whoa! Hold on, were you really referring to _this_ increment when you said
>>>>> that, in your patch you would increment the refcnt at the writer? Then I guess
>>>>> there is a major disconnect in our conversations. (I had assumed that you were
>>>>> referring to the update of writer_signal, and were just trying to have a single
>>>>> refcnt instead of reader_refcnt and writer_signal).
>>>>
>>>> https://github.com/laijs/linux/commit/53e5053d5b724bea7c538b11743d0f420d98f38d
>>>>
>>>> Sorry the name "fallback_reader_refcnt" misled you.
>>>>
>>> [...]
>>>
>>>>>> All I was considered is "nested reader is seldom", so I always
>>>>>> fallback to rwlock when nested.
>>>>>> If you like, I can add 6 lines of code, the overhead is
>>>>>> 1 spin_try_lock()(fast path)  + N  __this_cpu_inc()
>>>>>>
>>>>>
>>>>> I'm assuming that calculation is no longer valid, considering that
>>>>> we just discussed how the per-cpu refcnt that you were using is quite
>>>>> unnecessary and can be removed.
>>>>>
>>>>> IIUC, the overhead with your code, as per above discussion would be:
>>>>> 1 spin_try_lock() [non-nested] + N read_lock(global_rwlock).
>>>>
>>>> https://github.com/laijs/linux/commit/46334544bb7961550b7065e015da76f6dab21f16
>>>>
>>>> Again, I'm so sorry the name "fallback_reader_refcnt" misled you.
>>>>
>>>
>>> At this juncture I really have to admit that I don't understand your
>>> intentions at all. What are you really trying to prove? Without giving
>>> a single good reason why my code is inferior, why are you even bringing
>>> up the discussion about a complete rewrite of the synchronization code?
>>> http://article.gmane.org/gmane.linux.kernel.cross-arch/17103
>>> http://article.gmane.org/gmane.linux.power-management.general/31345
>>>
>>> I'm beginning to add 2 + 2 together based on the kinds of questions you
>>> have been asking...
>>>
>>> You posted a patch in this thread and started a discussion around it without
>>> even establishing a strong reason to do so. Now you point me to your git
>>> tree where your patches have even more traces of ideas being borrowed from
>>> my patchset (apart from my own ideas/code, there are traces of others' ideas
>>> being borrowed too - for example, it was Oleg who originally proposed the
>>> idea of splitting up the counter into 2 parts and I'm seeing that it is
>>> slowly crawling into your code with no sign of appropriate credits).
>>> http://article.gmane.org/gmane.linux.network/260288
>>>
>>> And in reply to my mail pointing out the performance implications of the
>>> global read_lock at the reader side in your code, you said you'll come up
>>> with a comparison between that and my patchset.
>>> http://article.gmane.org/gmane.linux.network/260288
>>> The issue has been well-documented in my patch description of patch 4.
>>> http://article.gmane.org/gmane.linux.kernel/1443258
>>>
>>> Are you really trying to pit bits and pieces of my own ideas/versions
>>> against one another and claiming them as your own?
>>>
>>> You projected the work involved in handling the locking issues pertaining
>>> to CPU_DYING notifiers etc as a TODO, despite the fact that I had explicitly
>>> noted in my cover letter that I had audited and taken care of all of them.
>>> http://article.gmane.org/gmane.linux.documentation/9727
>>> http://article.gmane.org/gmane.linux.documentation/9520
>>>
>>> You failed to acknowledge (on purpose?) that I had done a tree-wide
>>> conversion despite the fact that you were replying to the very thread which
>>> had the 46 patches which did exactly that (and I had also mentioned it
>>> explicitly in my cover letter).
>>> http://article.gmane.org/gmane.linux.documentation/9727
>>> http://article.gmane.org/gmane.linux.documentation/9520
>>>
>>> You then started probing more and more about the technique I used to do
>>> the tree-wide conversion.
>>> http://article.gmane.org/gmane.linux.kernel.cross-arch/17111
>>>
>>> You also retorted saying you did go through my patch descriptions, so
>>> its not like you have missed reading them.
>>> http://article.gmane.org/gmane.linux.power-management.general/31345
>>>
>>> Each of these when considered individually, might appear like innocuous and
>>> honest attempts at evaluating my code. But when put together, I'm beginning
>>> to sense a whole different angle to it altogether, as if you are trying
>>> to spin your own patch series, complete with the locking framework _and_
>>> the tree-wide conversion, heavily borrowed from mine. At the beginning of
>>> this discussion, I predicted that the lglock version that you are proposing
>>> would end up being either less efficient than my version or look very similar
>>> to my version. http://article.gmane.org/gmane.linux.kernel/1447139
>>>
>>> I thought it was just the former till now, but its not hard to see how it
>>> is getting closer to becoming the latter too. So yeah, I'm not amused.
>>>
>>> Maybe (and hopefully) you are just trying out different ideas on your own,
>>> and I'm just being paranoid. I really hope that is the case. If you are just
>>> trying to review my code, then please stop sending patches with borrowed ideas
>>> with your sole Signed-off-by, and purposefully ignoring the work already done
>>> in my patchset, because it is really starting to look suspicious, at least
>>> to me.
>>>
>>> Don't get me wrong - I'll whole-heartedly acknowledge and appreciate if
>>> _your_ code is better than mine. I just don't like the idea of somebody
>>> plagiarizing my ideas/code (or even others' ideas for that matter).
>>> However, I sincerely apologize in advance if I misunderstood/misjudged your
>>> intentions; I just wanted to voice my concerns out loud at this point,
>>> considering the bad feeling I got by looking at your responses collectively.
>>>
>>
>> Hi, Srivatsa
>>
>> I'm sorry, big apology to you.
>> I'm bad in communication and I did be wrong.
>> I tended to improve the codes but in false direction.
>>
> 
> OK, in that case, I'm extremely sorry too, for jumping on you like that.
> I hope you'll forgive me for the uneasiness it caused.
> 
> Now that I understand that you were simply trying to help, I would like to
> express my gratitude for your time, effort and inputs in improving the design
> of the stop-machine replacement.
> 
> I'm looking forward to working with you on this as well as future endeavours,
> so I sincerely hope that we can put this unfortunate incident behind us and
> collaborate effectively with renewed mutual trust and good-will.
> 
> Thank you very much!
> 

Hi, Srivatsa,

I'm sorry again, I delayed your works.

I have some thinkings about the way how to get this work done.

First step: (2~3 patches)
Use preempt_disable() to implement get_online_cpu_atomic(), and add lockdep for it.

Second step:
Conversion patches.

We can send the patchset of the above steps at first.
{
It does not change any behavior of the kernel.
and it is annotation(instead of direct preempt_diable() without comments sometimes),
so I expected they can be merged very early.
}

Third step:
After all people confide the conversion patches covered all cases and cpuhotplug site is ready for it,
we will implement get_online_cpu_atomic() via locks and remove stop_machine() from cpuhotplug.

Any thought?

Thanks,
Lai

If I have time, I will help you for the patches of the first step.
(I was assigned bad job in office-time, I can only do kernel-dev work in night.)

And for step2, I will write a checklist or spatch-script.


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Srivatsa S. Bhat - March 5, 2013, 6:27 p.m.
Hi Lai,

On 03/05/2013 09:55 PM, Lai Jiangshan wrote:
> Hi, Srivatsa,
> 
> I'm sorry again, I delayed your works.
>

No, you didn't :-) I have been busy with some internal work lately,
so I haven't been able to go through the recent discussions and
review the new code carefully.. I'll get to it as soon as I can.
 
> I have some thinkings about the way how to get this work done.
> 
> First step: (2~3 patches)
> Use preempt_disable() to implement get_online_cpu_atomic(), and add lockdep for it.
> 
> Second step:
> Conversion patches.
> 
> We can send the patchset of the above steps at first.
> {
> It does not change any behavior of the kernel.
> and it is annotation(instead of direct preempt_diable() without comments sometimes),
> so I expected they can be merged very early.
> }
> 
> Third step:
> After all people confide the conversion patches covered all cases and cpuhotplug site is ready for it,
> we will implement get_online_cpu_atomic() via locks and remove stop_machine() from cpuhotplug.
> 
> Any thought?
> 

That sounds like a good plan. It might involve slightly more churn
than just directly changing the locking scheme, but it is safer.
And the extra churn is anyway limited only to the implementation of
get/put_online_cpus_atomic().. so that should be fine IMHO.

> 
> If I have time, I will help you for the patches of the first step.
> (I was assigned bad job in office-time, I can only do kernel-dev work in night.)
> 
> And for step2, I will write a checklist or spatch-script.
> 

Do look at the conversion already done in this v6 as well. In
addition to that, we will have to account for the new kernel
code that went in recently.

I'll get back to working on the above mentioned aspects soon.

Regards,
Srivatsa S. Bhat

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Patch

diff --git a/include/linux/lglock.h b/include/linux/lglock.h
index 0d24e93..30fe887 100644
--- a/include/linux/lglock.h
+++ b/include/linux/lglock.h
@@ -67,4 +67,35 @@  void lg_local_unlock_cpu(struct lglock *lg, int cpu);
 void lg_global_lock(struct lglock *lg);
 void lg_global_unlock(struct lglock *lg);

+struct lgrwlock {
+	unsigned long __percpu *fallback_reader_refcnt;
+	struct lglock lglock;
+	rwlock_t fallback_rwlock;
+};
+
+#define DEFINE_LGRWLOCK(name)						\
+	static DEFINE_PER_CPU(arch_spinlock_t, name ## _lock)		\
+	= __ARCH_SPIN_LOCK_UNLOCKED;					\
+	static DEFINE_PER_CPU(unsigned long, name ## _refcnt);		\
+	struct lgrwlock name = {					\
+		.fallback_reader_refcnt = &name ## _refcnt,		\
+		.lglock = { .lock = &name ## _lock } }
+
+#define DEFINE_STATIC_LGRWLOCK(name)					\
+	static DEFINE_PER_CPU(arch_spinlock_t, name ## _lock)		\
+	= __ARCH_SPIN_LOCK_UNLOCKED;					\
+	static DEFINE_PER_CPU(unsigned long, name ## _refcnt);		\
+	static struct lgrwlock name = {					\
+		.fallback_reader_refcnt = &name ## _refcnt,		\
+		.lglock = { .lock = &name ## _lock } }
+
+static inline void lg_rwlock_init(struct lgrwlock *lgrw, char *name)
+{
+	lg_lock_init(&lgrw->lglock, name);
+}
+
+void lg_rwlock_local_read_lock(struct lgrwlock *lgrw);
+void lg_rwlock_local_read_unlock(struct lgrwlock *lgrw);
+void lg_rwlock_global_write_lock(struct lgrwlock *lgrw);
+void lg_rwlock_global_write_unlock(struct lgrwlock *lgrw);
 #endif
diff --git a/kernel/lglock.c b/kernel/lglock.c
index 6535a66..463543a 100644
--- a/kernel/lglock.c
+++ b/kernel/lglock.c
@@ -87,3 +87,48 @@  void lg_global_unlock(struct lglock *lg)
 	preempt_enable();
 }
 EXPORT_SYMBOL(lg_global_unlock);
+
+void lg_rwlock_local_read_lock(struct lgrwlock *lgrw)
+{
+	struct lglock *lg = &lgrw->lglock;
+
+	preempt_disable();
+	if (likely(!__this_cpu_read(*lgrw->fallback_reader_refcnt))) {
+		if (likely(arch_spin_trylock(this_cpu_ptr(lg->lock)))) {
+			rwlock_acquire_read(&lg->lock_dep_map, 0, 0, _RET_IP_);
+			return;
+		}
+		read_lock(&lgrw->fallback_rwlock);
+	}
+
+	__this_cpu_inc(*lgrw->fallback_reader_refcnt);
+}
+EXPORT_SYMBOL(lg_rwlock_local_read_lock);
+
+void lg_rwlock_local_read_unlock(struct lgrwlock *lgrw)
+{
+	if (likely(!__this_cpu_read(*lgrw->fallback_reader_refcnt))) {
+		lg_local_unlock(&lgrw->lglock);
+		return;
+	}
+
+	if (!__this_cpu_dec_return(*lgrw->fallback_reader_refcnt))
+		read_unlock(&lgrw->fallback_rwlock);
+
+	preempt_enable();
+}
+EXPORT_SYMBOL(lg_rwlock_local_read_unlock);
+
+void lg_rwlock_global_write_lock(struct lgrwlock *lgrw)
+{
+	lg_global_lock(&lgrw->lglock);
+	write_lock(&lgrw->fallback_rwlock);
+}
+EXPORT_SYMBOL(lg_rwlock_global_write_lock);
+
+void lg_rwlock_global_write_unlock(struct lgrwlock *lgrw)
+{
+	write_unlock(&lgrw->fallback_rwlock);
+	lg_global_unlock(&lgrw->lglock);
+}
+EXPORT_SYMBOL(lg_rwlock_global_write_unlock);