diff mbox

neighbour.c: Avoid GC directly after state change

Message ID 1426105727-64654-1-git-send-email-netdev@emagii.com
State Rejected, archived
Delegated to: David Miller
Headers show

Commit Message

Ulf Samuelsson March 11, 2015, 8:28 p.m. UTC
From: Ulf Samuelsson <ulf@emagii.com>

The neighbour state is changed in the ARP timer handler.
If the state is changed to NUD_STALE, then the neighbour
entry becomes a candidate for garbage collection.

The garbage collection is handled by a "periodic work" routine.

When :

	* noone refers to the entry
	* the state is no longer valid (I.E: NUD_STALE).
	* the timeout value has  been reached or state is FAILED

the "periodic work" routine will notify
the stack that the entry should be deleted.

A user application monitoring and controlling the neighbour table
using NETLINK may fail, if the "period work" routine is run
directly after the state has been changed to NUD_STALE,
but before the user application has had a chance to change
the state to something valid.

The "period work" routine will detect the NUD_STALE state
and if the timeout value has been reached, it will notify the stack
that the entry should be deleted.

The patch adds a check in the periodic work routine
which will skip test for garbage collection
unless a number of ticks has passed since the last time
the neighbour entry state was changed.

The feature is controlled through Kconfig

The featuree is enabled by setting ARP_GC_APPLY_GUARDBAND
The guardband time (in ticks) is set in ARP_GC_GUARDBAND
Default time is 100 ms if HZ_### is set.

Signed-off-by: Ulf Samuelsson <ulf@emagii.com>
---
 net/Kconfig          |   32 ++++++++++++++++++++++++++++++++
 net/core/neighbour.c |   15 ++++++++++++---
 2 files changed, 44 insertions(+), 3 deletions(-)

Comments

YOSHIFUJI Hideaki / 吉藤英明 March 15, 2015, 8:27 a.m. UTC | #1
Hello.

Ulf Samuelsson wrote:
> From: Ulf Samuelsson <ulf@emagii.com>
> 
> The neighbour state is changed in the ARP timer handler.
> If the state is changed to NUD_STALE, then the neighbour
> entry becomes a candidate for garbage collection.
> 
> The garbage collection is handled by a "periodic work" routine.
> 
> When :
> 
> 	* noone refers to the entry
> 	* the state is no longer valid (I.E: NUD_STALE).

NUD_STALE is still valid.

> 	* the timeout value has  been reached or state is FAILED
> 
> the "periodic work" routine will notify
> the stack that the entry should be deleted.
> 
> A user application monitoring and controlling the neighbour table
> using NETLINK may fail, if the "period work" routine is run
> directly after the state has been changed to NUD_STALE,
> but before the user application has had a chance to change
> the state to something valid.
> 
> The "period work" routine will detect the NUD_STALE state
> and if the timeout value has been reached, it will notify the stack
> that the entry should be deleted.
> 
> The patch adds a check in the periodic work routine
> which will skip test for garbage collection
> unless a number of ticks has passed since the last time
> the neighbour entry state was changed.
> 
> The feature is controlled through Kconfig
> 
> The featuree is enabled by setting ARP_GC_APPLY_GUARDBAND
> The guardband time (in ticks) is set in ARP_GC_GUARDBAND
> Default time is 100 ms if HZ_### is set.

We have "lower limit" not to start releasing neighbour entries.
Try increasing gc_thresh1.

--yoshfuji

> 
> Signed-off-by: Ulf Samuelsson <ulf@emagii.com>
> ---
>  net/Kconfig          |   32 ++++++++++++++++++++++++++++++++
>  net/core/neighbour.c |   15 ++++++++++++---
>  2 files changed, 44 insertions(+), 3 deletions(-)
> 
> diff --git a/net/Kconfig b/net/Kconfig
> index 44dd578..099a5dd 100644
> --- a/net/Kconfig
> +++ b/net/Kconfig
> @@ -77,6 +77,38 @@ config INET
>  	  Short answer: say Y.
>  
>  if INET
> +
> +#
> +# Core Network configuration
> +#
> +
> +config ARP_GC_APPLY_GUARDBAND
> +	bool "IP: ARP: Avoid garbage collection directly after state change"
> +	default n
> +	---help---
> +	  With this item selected, an entry in the neighbour table
> +	  will not be garbage collected directly after the ARP state
> +	  has changed to STALE of FAILED
> +	  This allows an application program change the state to something valid
> +	  before garbage colllection occurs.
> +
> +	  If unsure, say N.
> +
> +config ARP_GC_GUARDBAND
> +	int "Guardband time on garbage collection"
> +	depends on ARP_GC_APPLY_GUARDBAND
> +	default 10 if HZ_100
> +	default 25 if HZ_250
> +	default 30 if HZ_300
> +	default 100 if HZ_1000
> +	default 100
> +
> +	---help---
> +	  The number of ticks to delay garbage collection
> +	  after the neighbour entry has been updated
> +	  A delay of 100 ms is reasonable.
> +	  With CONFIG_HZ = 250, this value should be 25
> +
>  source "net/ipv4/Kconfig"
>  source "net/ipv6/Kconfig"
>  source "net/netlabel/Kconfig"
> diff --git a/net/core/neighbour.c b/net/core/neighbour.c
> index 70fe9e1..194195d 100644
> --- a/net/core/neighbour.c
> +++ b/net/core/neighbour.c
> @@ -786,13 +786,23 @@ static void neigh_periodic_work(struct work_struct *work)
>  
>  			state = n->nud_state;
>  			if (state & (NUD_PERMANENT | NUD_IN_TIMER)) {
> -				write_unlock(&n->lock);
>  				goto next_elt;
>  			}
>  
>  			if (time_before(n->used, n->confirmed))
>  				n->used = n->confirmed;
>  
> +#if defined(CONFIG_ARP_GC_APPLY_GUARDBAND)
> +			/* Do not garbage collect directly after we
> +			 * updated n->state to allow applications to
> +			 * react to the event
> +			 */
> +			if (time_before(jiffies,
> +					n->updated + CONFIG_ARP_GC_GUARDBAND)) {
> +				goto next_elt;
> +			}
> +#endif
> +
>  			if (atomic_read(&n->refcnt) == 1 &&
>  			    (state == NUD_FAILED ||
>  			     time_after(jiffies, n->used + NEIGH_VAR(n->parms, GC_STALETIME)))) {
> @@ -802,9 +812,8 @@ static void neigh_periodic_work(struct work_struct *work)
>  				neigh_cleanup_and_release(n);
>  				continue;
>  			}
> -			write_unlock(&n->lock);
> -
>  next_elt:
> +			write_unlock(&n->lock);
>  			np = &n->next;
>  		}
>  		/*
> 

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Ulf Samuelsson March 15, 2015, 7:34 p.m. UTC | #2
Den 2015-03-15 09:27, YOSHIFUJI Hideaki skrev:
> Hello.
>
> Ulf Samuelsson wrote:
>> From: Ulf Samuelsson <ulf@emagii.com>
>>
>> The neighbour state is changed in the ARP timer handler.
>> If the state is changed to NUD_STALE, then the neighbour
>> entry becomes a candidate for garbage collection.
>>
>> The garbage collection is handled by a "periodic work" routine.
>>
>> When :
>>
>> 	* noone refers to the entry
>> 	* the state is no longer valid (I.E: NUD_STALE).
> NUD_STALE is still valid.
Yes, my fault.
The condition which causes garbage collection to be skipped is.


     if (state & (NUD_PERMANENT | NUD_IN_TIMER)) {

     NUD_STALE is not part of that, so GC will not be skipped,
     and therefore the patch is needed if you want to be able
     to use the API to modify the neigh statemachine..
>
>> 	* the timeout value has  been reached or state is FAILED
>>
>> the "periodic work" routine will notify
>> the stack that the entry should be deleted.
>>
>> A user application monitoring and controlling the neighbour table
>> using NETLINK may fail, if the "period work" routine is run
>> directly after the state has been changed to NUD_STALE,
>> but before the user application has had a chance to change
>> the state to something valid.
>>
>> The "period work" routine will detect the NUD_STALE state
>> and if the timeout value has been reached, it will notify the stack
>> that the entry should be deleted.
>>
>> The patch adds a check in the periodic work routine
>> which will skip test for garbage collection
>> unless a number of ticks has passed since the last time
>> the neighbour entry state was changed.
>>
>> The feature is controlled through Kconfig
>>
>> The featuree is enabled by setting ARP_GC_APPLY_GUARDBAND
>> The guardband time (in ticks) is set in ARP_GC_GUARDBAND
>> Default time is 100 ms if HZ_### is set.
> We have "lower limit" not to start releasing neighbour entries.
> Try increasing gc_thresh1.
Why would  that work?

The only place where this is used is

     "if (atomic_read(&tbl->entries) < tbl->gc_thresh1)"

tbl->entries is related to how many entries there are in the neighbour 
table.

The only way I think this would work, is if this is raised so high that
garbage collection does not occur.

That is not the intention.

It does not solve the race condition between the timer_handler and the 
periodic_work.

BR
Ulf Samuelsson

>
> --yoshfuji
>
>> Signed-off-by: Ulf Samuelsson <ulf@emagii.com>
>> ---
>>   net/Kconfig          |   32 ++++++++++++++++++++++++++++++++
>>   net/core/neighbour.c |   15 ++++++++++++---
>>   2 files changed, 44 insertions(+), 3 deletions(-)
>>
>> diff --git a/net/Kconfig b/net/Kconfig
>> index 44dd578..099a5dd 100644
>> --- a/net/Kconfig
>> +++ b/net/Kconfig
>> @@ -77,6 +77,38 @@ config INET
>>   	  Short answer: say Y.
>>   
>>   if INET
>> +
>> +#
>> +# Core Network configuration
>> +#
>> +
>> +config ARP_GC_APPLY_GUARDBAND
>> +	bool "IP: ARP: Avoid garbage collection directly after state change"
>> +	default n
>> +	---help---
>> +	  With this item selected, an entry in the neighbour table
>> +	  will not be garbage collected directly after the ARP state
>> +	  has changed to STALE of FAILED
>> +	  This allows an application program change the state to something valid
>> +	  before garbage colllection occurs.
>> +
>> +	  If unsure, say N.
>> +
>> +config ARP_GC_GUARDBAND
>> +	int "Guardband time on garbage collection"
>> +	depends on ARP_GC_APPLY_GUARDBAND
>> +	default 10 if HZ_100
>> +	default 25 if HZ_250
>> +	default 30 if HZ_300
>> +	default 100 if HZ_1000
>> +	default 100
>> +
>> +	---help---
>> +	  The number of ticks to delay garbage collection
>> +	  after the neighbour entry has been updated
>> +	  A delay of 100 ms is reasonable.
>> +	  With CONFIG_HZ = 250, this value should be 25
>> +
>>   source "net/ipv4/Kconfig"
>>   source "net/ipv6/Kconfig"
>>   source "net/netlabel/Kconfig"
>> diff --git a/net/core/neighbour.c b/net/core/neighbour.c
>> index 70fe9e1..194195d 100644
>> --- a/net/core/neighbour.c
>> +++ b/net/core/neighbour.c
>> @@ -786,13 +786,23 @@ static void neigh_periodic_work(struct work_struct *work)
>>   
>>   			state = n->nud_state;
>>   			if (state & (NUD_PERMANENT | NUD_IN_TIMER)) {
>> -				write_unlock(&n->lock);
>>   				goto next_elt;
>>   			}
>>   
>>   			if (time_before(n->used, n->confirmed))
>>   				n->used = n->confirmed;
>>   
>> +#if defined(CONFIG_ARP_GC_APPLY_GUARDBAND)
>> +			/* Do not garbage collect directly after we
>> +			 * updated n->state to allow applications to
>> +			 * react to the event
>> +			 */
>> +			if (time_before(jiffies,
>> +					n->updated + CONFIG_ARP_GC_GUARDBAND)) {
>> +				goto next_elt;
>> +			}
>> +#endif
>> +
>>   			if (atomic_read(&n->refcnt) == 1 &&
>>   			    (state == NUD_FAILED ||
>>   			     time_after(jiffies, n->used + NEIGH_VAR(n->parms, GC_STALETIME)))) {
>> @@ -802,9 +812,8 @@ static void neigh_periodic_work(struct work_struct *work)
>>   				neigh_cleanup_and_release(n);
>>   				continue;
>>   			}
>> -			write_unlock(&n->lock);
>> -
>>   next_elt:
>> +			write_unlock(&n->lock);
>>   			np = &n->next;
>>   		}
>>   		/*
>>
Hideaki Yoshifuji March 16, 2015, 4:57 a.m. UTC | #3
Hello.

Ulf Samuelsson wrote:
> Den 2015-03-15 09:27, YOSHIFUJI Hideaki skrev:
>> Hello.
>>
>> Ulf Samuelsson wrote:
>>> From: Ulf Samuelsson <ulf@emagii.com>
>>>
>>> The neighbour state is changed in the ARP timer handler.
>>> If the state is changed to NUD_STALE, then the neighbour
>>> entry becomes a candidate for garbage collection.
>>>
>>> The garbage collection is handled by a "periodic work" routine.
>>>
>>> When :
>>>
>>>     * noone refers to the entry
>>>     * the state is no longer valid (I.E: NUD_STALE).
>> NUD_STALE is still valid.
> Yes, my fault.
> The condition which causes garbage collection to be skipped is.
>
>
>      if (state & (NUD_PERMANENT | NUD_IN_TIMER)) {
>
>      NUD_STALE is not part of that, so GC will not be skipped,
>      and therefore the patch is needed if you want to be able
>      to use the API to modify the neigh statemachine..
>>
>>>     * the timeout value has  been reached or state is FAILED
>>>
>>> the "periodic work" routine will notify
>>> the stack that the entry should be deleted.
>>>
>>> A user application monitoring and controlling the neighbour table
>>> using NETLINK may fail, if the "period work" routine is run
>>> directly after the state has been changed to NUD_STALE,
>>> but before the user application has had a chance to change
>>> the state to something valid.
>>>
>>> The "period work" routine will detect the NUD_STALE state
>>> and if the timeout value has been reached, it will notify the stack
>>> that the entry should be deleted.
>>>
>>> The patch adds a check in the periodic work routine
>>> which will skip test for garbage collection
>>> unless a number of ticks has passed since the last time
>>> the neighbour entry state was changed.
>>>
>>> The feature is controlled through Kconfig
>>>
>>> The featuree is enabled by setting ARP_GC_APPLY_GUARDBAND
>>> The guardband time (in ticks) is set in ARP_GC_GUARDBAND
>>> Default time is 100 ms if HZ_### is set.
>> We have "lower limit" not to start releasing neighbour entries.
>> Try increasing gc_thresh1.
> Why would  that work?
>
> The only place where this is used is
>
>      "if (atomic_read(&tbl->entries) < tbl->gc_thresh1)"
>
> tbl->entries is related to how many entries there are in the neighbour table.
>
> The only way I think this would work, is if this is raised so high that
> garbage collection does not occur.
>
> That is not the intention.
>
> It does not solve the race condition between the timer_handler and the periodic_work.

I don't think it is a race.

You can try increasing gc_staletime to hold each entry based
on last usage.  Plus, you can "confirm" neighbors by
MSG_CONFIRM.

Note that if the number of entries becomes high, "forced GC" will
drop valid, "not connected" entries as well.

--yoshfuji

>
> BR
> Ulf Samuelsson
>
>>
>> --yoshfuji
>>
>>> Signed-off-by: Ulf Samuelsson <ulf@emagii.com>
>>> ---
>>>   net/Kconfig          |   32 ++++++++++++++++++++++++++++++++
>>>   net/core/neighbour.c |   15 ++++++++++++---
>>>   2 files changed, 44 insertions(+), 3 deletions(-)
>>>
>>> diff --git a/net/Kconfig b/net/Kconfig
>>> index 44dd578..099a5dd 100644
>>> --- a/net/Kconfig
>>> +++ b/net/Kconfig
>>> @@ -77,6 +77,38 @@ config INET
>>>         Short answer: say Y.
>>>   if INET
>>> +
>>> +#
>>> +# Core Network configuration
>>> +#
>>> +
>>> +config ARP_GC_APPLY_GUARDBAND
>>> +    bool "IP: ARP: Avoid garbage collection directly after state change"
>>> +    default n
>>> +    ---help---
>>> +      With this item selected, an entry in the neighbour table
>>> +      will not be garbage collected directly after the ARP state
>>> +      has changed to STALE of FAILED
>>> +      This allows an application program change the state to something valid
>>> +      before garbage colllection occurs.
>>> +
>>> +      If unsure, say N.
>>> +
>>> +config ARP_GC_GUARDBAND
>>> +    int "Guardband time on garbage collection"
>>> +    depends on ARP_GC_APPLY_GUARDBAND
>>> +    default 10 if HZ_100
>>> +    default 25 if HZ_250
>>> +    default 30 if HZ_300
>>> +    default 100 if HZ_1000
>>> +    default 100
>>> +
>>> +    ---help---
>>> +      The number of ticks to delay garbage collection
>>> +      after the neighbour entry has been updated
>>> +      A delay of 100 ms is reasonable.
>>> +      With CONFIG_HZ = 250, this value should be 25
>>> +
>>>   source "net/ipv4/Kconfig"
>>>   source "net/ipv6/Kconfig"
>>>   source "net/netlabel/Kconfig"
>>> diff --git a/net/core/neighbour.c b/net/core/neighbour.c
>>> index 70fe9e1..194195d 100644
>>> --- a/net/core/neighbour.c
>>> +++ b/net/core/neighbour.c
>>> @@ -786,13 +786,23 @@ static void neigh_periodic_work(struct work_struct *work)
>>>               state = n->nud_state;
>>>               if (state & (NUD_PERMANENT | NUD_IN_TIMER)) {
>>> -                write_unlock(&n->lock);
>>>                   goto next_elt;
>>>               }
>>>               if (time_before(n->used, n->confirmed))
>>>                   n->used = n->confirmed;
>>> +#if defined(CONFIG_ARP_GC_APPLY_GUARDBAND)
>>> +            /* Do not garbage collect directly after we
>>> +             * updated n->state to allow applications to
>>> +             * react to the event
>>> +             */
>>> +            if (time_before(jiffies,
>>> +                    n->updated + CONFIG_ARP_GC_GUARDBAND)) {
>>> +                goto next_elt;
>>> +            }
>>> +#endif
>>> +
>>>               if (atomic_read(&n->refcnt) == 1 &&
>>>                   (state == NUD_FAILED ||
>>>                    time_after(jiffies, n->used + NEIGH_VAR(n->parms, GC_STALETIME)))) {
>>> @@ -802,9 +812,8 @@ static void neigh_periodic_work(struct work_struct *work)
>>>                   neigh_cleanup_and_release(n);
>>>                   continue;
>>>               }
>>> -            write_unlock(&n->lock);
>>> -
>>>   next_elt:
>>> +            write_unlock(&n->lock);
>>>               np = &n->next;
>>>           }
>>>           /*
>>>
>
>
Ulf Samuelsson March 16, 2015, 7:55 p.m. UTC | #4
Den 2015-03-16 05:57, YOSHIFUJI Hideaki/吉藤英明 skrev:
> Hello.
>
> Ulf Samuelsson wrote:
>> Den 2015-03-15 09:27, YOSHIFUJI Hideaki skrev:
>>> Hello.
>>>
>>> Ulf Samuelsson wrote:
>>>> From: Ulf Samuelsson <ulf@emagii.com>
>>>>
>>>> The neighbour state is changed in the ARP timer handler.
>>>> If the state is changed to NUD_STALE, then the neighbour
>>>> entry becomes a candidate for garbage collection.
>>>>
>>>> The garbage collection is handled by a "periodic work" routine.
>>>>
>>>> When :
>>>>
>>>>     * noone refers to the entry
>>>>     * the state is no longer valid (I.E: NUD_STALE).
>>> NUD_STALE is still valid.
>> Yes, my fault.
>> The condition which causes garbage collection to be skipped is.
>>
>>
>>      if (state & (NUD_PERMANENT | NUD_IN_TIMER)) {
>>
>>      NUD_STALE is not part of that, so GC will not be skipped,
>>      and therefore the patch is needed if you want to be able
>>      to use the API to modify the neigh statemachine..
>>>
>>>>     * the timeout value has  been reached or state is FAILED
>>>>
>>>> the "periodic work" routine will notify
>>>> the stack that the entry should be deleted.
>>>>
>>>> A user application monitoring and controlling the neighbour table
>>>> using NETLINK may fail, if the "period work" routine is run
>>>> directly after the state has been changed to NUD_STALE,
>>>> but before the user application has had a chance to change
>>>> the state to something valid.
>>>>
>>>> The "period work" routine will detect the NUD_STALE state
>>>> and if the timeout value has been reached, it will notify the stack
>>>> that the entry should be deleted.
>>>>
>>>> The patch adds a check in the periodic work routine
>>>> which will skip test for garbage collection
>>>> unless a number of ticks has passed since the last time
>>>> the neighbour entry state was changed.
>>>>
>>>> The feature is controlled through Kconfig
>>>>
>>>> The featuree is enabled by setting ARP_GC_APPLY_GUARDBAND
>>>> The guardband time (in ticks) is set in ARP_GC_GUARDBAND
>>>> Default time is 100 ms if HZ_### is set.
>>> We have "lower limit" not to start releasing neighbour entries.
>>> Try increasing gc_thresh1.
>> Why would  that work?
>>
>> The only place where this is used is
>>
>>      "if (atomic_read(&tbl->entries) < tbl->gc_thresh1)"
>>
>> tbl->entries is related to how many entries there are in the 
>> neighbour table.
>>
>> The only way I think this would work, is if this is raised so high that
>> garbage collection does not occur.
>>
>> That is not the intention.
>>
>> It does not solve the race condition between the timer_handler and 
>> the periodic_work.
>
> I don't think it is a race.
And I think you are wrong and my logging shows that it is.

>
> You can try increasing gc_staletime to hold each entry based
> on last usage.  Plus, you can "confirm" neighbors by
> MSG_CONFIRM.
>
> Note that if the number of entries becomes high, "forced GC" will
> drop valid, "not connected" entries as well.

I can try increasing gc_staletime, but its a waste of time, because it 
is not the last usage which is interesting.
What is interesting is the time when the entry state was updated by the 
timer handler.

Pls explain further MSG_CONFIRM.

The problem occurs when the periodic_work routine runs immediately
after the timer handler has changes the state to NUD_STALE and
the entry has reached the expiry time.

I suggest you read the code, and you will understand what I mean.



Ulf

>
> --yoshfuji
>
>>
>> BR
>> Ulf Samuelsson
>>
>>>
>>> --yoshfuji
>>>
>>>> Signed-off-by: Ulf Samuelsson <ulf@emagii.com>
>>>> ---
>>>>   net/Kconfig          |   32 ++++++++++++++++++++++++++++++++
>>>>   net/core/neighbour.c |   15 ++++++++++++---
>>>>   2 files changed, 44 insertions(+), 3 deletions(-)
>>>>
>>>> diff --git a/net/Kconfig b/net/Kconfig
>>>> index 44dd578..099a5dd 100644
>>>> --- a/net/Kconfig
>>>> +++ b/net/Kconfig
>>>> @@ -77,6 +77,38 @@ config INET
>>>>         Short answer: say Y.
>>>>   if INET
>>>> +
>>>> +#
>>>> +# Core Network configuration
>>>> +#
>>>> +
>>>> +config ARP_GC_APPLY_GUARDBAND
>>>> +    bool "IP: ARP: Avoid garbage collection directly after state 
>>>> change"
>>>> +    default n
>>>> +    ---help---
>>>> +      With this item selected, an entry in the neighbour table
>>>> +      will not be garbage collected directly after the ARP state
>>>> +      has changed to STALE of FAILED
>>>> +      This allows an application program change the state to 
>>>> something valid
>>>> +      before garbage colllection occurs.
>>>> +
>>>> +      If unsure, say N.
>>>> +
>>>> +config ARP_GC_GUARDBAND
>>>> +    int "Guardband time on garbage collection"
>>>> +    depends on ARP_GC_APPLY_GUARDBAND
>>>> +    default 10 if HZ_100
>>>> +    default 25 if HZ_250
>>>> +    default 30 if HZ_300
>>>> +    default 100 if HZ_1000
>>>> +    default 100
>>>> +
>>>> +    ---help---
>>>> +      The number of ticks to delay garbage collection
>>>> +      after the neighbour entry has been updated
>>>> +      A delay of 100 ms is reasonable.
>>>> +      With CONFIG_HZ = 250, this value should be 25
>>>> +
>>>>   source "net/ipv4/Kconfig"
>>>>   source "net/ipv6/Kconfig"
>>>>   source "net/netlabel/Kconfig"
>>>> diff --git a/net/core/neighbour.c b/net/core/neighbour.c
>>>> index 70fe9e1..194195d 100644
>>>> --- a/net/core/neighbour.c
>>>> +++ b/net/core/neighbour.c
>>>> @@ -786,13 +786,23 @@ static void neigh_periodic_work(struct 
>>>> work_struct *work)
>>>>               state = n->nud_state;
>>>>               if (state & (NUD_PERMANENT | NUD_IN_TIMER)) {
>>>> -                write_unlock(&n->lock);
>>>>                   goto next_elt;
>>>>               }
>>>>               if (time_before(n->used, n->confirmed))
>>>>                   n->used = n->confirmed;
>>>> +#if defined(CONFIG_ARP_GC_APPLY_GUARDBAND)
>>>> +            /* Do not garbage collect directly after we
>>>> +             * updated n->state to allow applications to
>>>> +             * react to the event
>>>> +             */
>>>> +            if (time_before(jiffies,
>>>> +                    n->updated + CONFIG_ARP_GC_GUARDBAND)) {
>>>> +                goto next_elt;
>>>> +            }
>>>> +#endif
>>>> +
>>>>               if (atomic_read(&n->refcnt) == 1 &&
>>>>                   (state == NUD_FAILED ||
>>>>                    time_after(jiffies, n->used + 
>>>> NEIGH_VAR(n->parms, GC_STALETIME)))) {
>>>> @@ -802,9 +812,8 @@ static void neigh_periodic_work(struct 
>>>> work_struct *work)
>>>>                   neigh_cleanup_and_release(n);
>>>>                   continue;
>>>>               }
>>>> -            write_unlock(&n->lock);
>>>> -
>>>>   next_elt:
>>>> +            write_unlock(&n->lock);
>>>>               np = &n->next;
>>>>           }
>>>>           /*
>>>>
>>
>>
>
Hideaki Yoshifuji March 17, 2015, 12:31 p.m. UTC | #5
Hello,

Ulf Samuelsson wrote:
> Den 2015-03-16 05:57, YOSHIFUJI Hideaki/吉藤英明 skrev:
>> Hello.
>>
>> Ulf Samuelsson wrote:
>>> Den 2015-03-15 09:27, YOSHIFUJI Hideaki skrev:
>>>> Hello.
>>>>
>>>> Ulf Samuelsson wrote:
>>>>> From: Ulf Samuelsson <ulf@emagii.com>
>>>>>
>>>>> The neighbour state is changed in the ARP timer handler.
>>>>> If the state is changed to NUD_STALE, then the neighbour
>>>>> entry becomes a candidate for garbage collection.
>>>>>
>>>>> The garbage collection is handled by a "periodic work" routine.
>>>>>
>>>>> When :
>>>>>
>>>>>     * noone refers to the entry
>>>>>     * the state is no longer valid (I.E: NUD_STALE).
>>>> NUD_STALE is still valid.
>>> Yes, my fault.
>>> The condition which causes garbage collection to be skipped is.
>>>
>>>
>>>      if (state & (NUD_PERMANENT | NUD_IN_TIMER)) {
>>>
>>>      NUD_STALE is not part of that, so GC will not be skipped,
>>>      and therefore the patch is needed if you want to be able
>>>      to use the API to modify the neigh statemachine..
>>>>
>>>>>     * the timeout value has  been reached or state is FAILED
>>>>>
>>>>> the "periodic work" routine will notify
>>>>> the stack that the entry should be deleted.
>>>>>
>>>>> A user application monitoring and controlling the neighbour table
>>>>> using NETLINK may fail, if the "period work" routine is run
>>>>> directly after the state has been changed to NUD_STALE,
>>>>> but before the user application has had a chance to change
>>>>> the state to something valid.
>>>>>
>>>>> The "period work" routine will detect the NUD_STALE state
>>>>> and if the timeout value has been reached, it will notify the stack
>>>>> that the entry should be deleted.
>>>>>
>>>>> The patch adds a check in the periodic work routine
>>>>> which will skip test for garbage collection
>>>>> unless a number of ticks has passed since the last time
>>>>> the neighbour entry state was changed.
>>>>>
>>>>> The feature is controlled through Kconfig
>>>>>
>>>>> The featuree is enabled by setting ARP_GC_APPLY_GUARDBAND
>>>>> The guardband time (in ticks) is set in ARP_GC_GUARDBAND
>>>>> Default time is 100 ms if HZ_### is set.
>>>> We have "lower limit" not to start releasing neighbour entries.
>>>> Try increasing gc_thresh1.
>>> Why would  that work?
>>>
>>> The only place where this is used is
>>>
>>>      "if (atomic_read(&tbl->entries) < tbl->gc_thresh1)"
>>>
>>> tbl->entries is related to how many entries there are in the neighbour table.
>>>
>>> The only way I think this would work, is if this is raised so high that
>>> garbage collection does not occur.
>>>
>>> That is not the intention.
>>>
>>> It does not solve the race condition between the timer_handler and the periodic_work.
>>
>> I don't think it is a race.
> And I think you are wrong and my logging shows that it is.

We do not gurantee holding "stale" entries more than
gc_thresh1, so it shall not be called as a race at all.

>
>>
>> You can try increasing gc_staletime to hold each entry based
>> on last usage.  Plus, you can "confirm" neighbors by
>> MSG_CONFIRM.
>>
>> Note that if the number of entries becomes high, "forced GC" will
>> drop valid, "not connected" entries as well.
>
> I can try increasing gc_staletime, but its a waste of time, because it is not the last usage which is interesting.
> What is interesting is the time when the entry state was updated by the timer handler.
>
> Pls explain further MSG_CONFIRM.

Typically, base reachable time is set to 30sec and gc_staletime is
set to 60sec.  So, entries are expected to be held for a while
after it has become "stale", no?

MSG_CONFIRM is a sendmsg() flag.  It allows user-space application
to confirm reachability of neighbor.  It refreshes "confirmed"
time. In neigh_periodic_work(), "used" time is updated to
"confirmed" time if "used" time is before "confirmed" time.

ping(8), ping6(8), tftpd(8) use that flag, for example.


> The problem occurs when the periodic_work routine runs immediately
> after the timer handler has changes the state to NUD_STALE and
> the entry has reached the expiry time.

It is what we expect.

In neigh_periodic_work(), you may try to release non-STALE
entries first, and then STALE entries if the number of entries is
still high.

--yoshfuji
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Ulf Samuelsson March 17, 2015, 11:27 p.m. UTC | #6
Den 2015-03-17 13:31, YOSHIFUJI Hideaki skrev:
> Hello,
>
> Ulf Samuelsson wrote:
>> Den 2015-03-16 05:57, YOSHIFUJI Hideaki/吉藤英明 skrev:
>>> Hello.
>>>
>>> Ulf Samuelsson wrote:
>>>> Den 2015-03-15 09:27, YOSHIFUJI Hideaki skrev:
>>>>> Hello.
>>>>>
>>>>> Ulf Samuelsson wrote:
>>>>>> From: Ulf Samuelsson <ulf@emagii.com>
>>>>>>
>>>>>> The neighbour state is changed in the ARP timer handler.
>>>>>> If the state is changed to NUD_STALE, then the neighbour
>>>>>> entry becomes a candidate for garbage collection.
>>>>>>
>>>>>> The garbage collection is handled by a "periodic work" routine.
>>>>>>
>>>>>> When :
>>>>>>
>>>>>>     * noone refers to the entry
>>>>>>     * the state is no longer valid (I.E: NUD_STALE).
>>>>> NUD_STALE is still valid.
>>>> Yes, my fault.
>>>> The condition which causes garbage collection to be skipped is.
>>>>
>>>>
>>>>      if (state & (NUD_PERMANENT | NUD_IN_TIMER)) {
>>>>
>>>>      NUD_STALE is not part of that, so GC will not be skipped,
>>>>      and therefore the patch is needed if you want to be able
>>>>      to use the API to modify the neigh statemachine..
>>>>>
>>>>>>     * the timeout value has  been reached or state is FAILED
>>>>>>
>>>>>> the "periodic work" routine will notify
>>>>>> the stack that the entry should be deleted.
>>>>>>
>>>>>> A user application monitoring and controlling the neighbour table
>>>>>> using NETLINK may fail, if the "period work" routine is run
>>>>>> directly after the state has been changed to NUD_STALE,
>>>>>> but before the user application has had a chance to change
>>>>>> the state to something valid.
>>>>>>
>>>>>> The "period work" routine will detect the NUD_STALE state
>>>>>> and if the timeout value has been reached, it will notify the stack
>>>>>> that the entry should be deleted.
>>>>>>
>>>>>> The patch adds a check in the periodic work routine
>>>>>> which will skip test for garbage collection
>>>>>> unless a number of ticks has passed since the last time
>>>>>> the neighbour entry state was changed.
>>>>>>
>>>>>> The feature is controlled through Kconfig
>>>>>>
>>>>>> The featuree is enabled by setting ARP_GC_APPLY_GUARDBAND
>>>>>> The guardband time (in ticks) is set in ARP_GC_GUARDBAND
>>>>>> Default time is 100 ms if HZ_### is set.
>>>>> We have "lower limit" not to start releasing neighbour entries.
>>>>> Try increasing gc_thresh1.
>>>> Why would  that work?
>>>>
>>>> The only place where this is used is
>>>>
>>>>      "if (atomic_read(&tbl->entries) < tbl->gc_thresh1)"
>>>>
>>>> tbl->entries is related to how many entries there are in the 
>>>> neighbour table.
>>>>
>>>> The only way I think this would work, is if this is raised so high 
>>>> that
>>>> garbage collection does not occur.
>>>>
>>>> That is not the intention.
>>>>
>>>> It does not solve the race condition between the timer_handler and 
>>>> the periodic_work.
>>>
>>> I don't think it is a race.
>> And I think you are wrong and my logging shows that it is.
>
> We do not gurantee holding "stale" entries more than
> gc_thresh1, so it shall not be called as a race at all.


And that is a problem, which results in traffic loss.

>
>>
>>>
>>> You can try increasing gc_staletime to hold each entry based
>>> on last usage.  Plus, you can "confirm" neighbors by
>>> MSG_CONFIRM.
>>>
>>> Note that if the number of entries becomes high, "forced GC" will
>>> drop valid, "not connected" entries as well.
>>
>> I can try increasing gc_staletime, but its a waste of time, because 
>> it is not the last usage which is interesting.
>> What is interesting is the time when the entry state was updated by 
>> the timer handler.
>>
>> Pls explain further MSG_CONFIRM.
>
> Typically, base reachable time is set to 30sec and gc_staletime is
> set to 60sec.  So, entries are expected to be held for a while
> after it has become "stale", no?
>
> MSG_CONFIRM is a sendmsg() flag.  It allows user-space application
> to confirm reachability of neighbor.  It refreshes "confirmed"
> time. In neigh_periodic_work(), "used" time is updated to
> "confirmed" time if "used" time is before "confirmed" time.
>
May work, if the application was totally redesigned,

> ping(8), ping6(8), tftpd(8) use that flag, for example.
>
>
>> The problem occurs when the periodic_work routine runs immediately
>> after the timer handler has changes the state to NUD_STALE and
>> the entry has reached the expiry time.
>
> It is what we expect.
>
> In neigh_periodic_work(), you may try to release non-STALE
> entries first, and then STALE entries if the number of entries is
> still high.
That sounds much more complex than the proposed fix.

>
> --yoshfuji

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/net/Kconfig b/net/Kconfig
index 44dd578..099a5dd 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -77,6 +77,38 @@  config INET
 	  Short answer: say Y.
 
 if INET
+
+#
+# Core Network configuration
+#
+
+config ARP_GC_APPLY_GUARDBAND
+	bool "IP: ARP: Avoid garbage collection directly after state change"
+	default n
+	---help---
+	  With this item selected, an entry in the neighbour table
+	  will not be garbage collected directly after the ARP state
+	  has changed to STALE of FAILED
+	  This allows an application program change the state to something valid
+	  before garbage colllection occurs.
+
+	  If unsure, say N.
+
+config ARP_GC_GUARDBAND
+	int "Guardband time on garbage collection"
+	depends on ARP_GC_APPLY_GUARDBAND
+	default 10 if HZ_100
+	default 25 if HZ_250
+	default 30 if HZ_300
+	default 100 if HZ_1000
+	default 100
+
+	---help---
+	  The number of ticks to delay garbage collection
+	  after the neighbour entry has been updated
+	  A delay of 100 ms is reasonable.
+	  With CONFIG_HZ = 250, this value should be 25
+
 source "net/ipv4/Kconfig"
 source "net/ipv6/Kconfig"
 source "net/netlabel/Kconfig"
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 70fe9e1..194195d 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -786,13 +786,23 @@  static void neigh_periodic_work(struct work_struct *work)
 
 			state = n->nud_state;
 			if (state & (NUD_PERMANENT | NUD_IN_TIMER)) {
-				write_unlock(&n->lock);
 				goto next_elt;
 			}
 
 			if (time_before(n->used, n->confirmed))
 				n->used = n->confirmed;
 
+#if defined(CONFIG_ARP_GC_APPLY_GUARDBAND)
+			/* Do not garbage collect directly after we
+			 * updated n->state to allow applications to
+			 * react to the event
+			 */
+			if (time_before(jiffies,
+					n->updated + CONFIG_ARP_GC_GUARDBAND)) {
+				goto next_elt;
+			}
+#endif
+
 			if (atomic_read(&n->refcnt) == 1 &&
 			    (state == NUD_FAILED ||
 			     time_after(jiffies, n->used + NEIGH_VAR(n->parms, GC_STALETIME)))) {
@@ -802,9 +812,8 @@  static void neigh_periodic_work(struct work_struct *work)
 				neigh_cleanup_and_release(n);
 				continue;
 			}
-			write_unlock(&n->lock);
-
 next_elt:
+			write_unlock(&n->lock);
 			np = &n->next;
 		}
 		/*