diff mbox

[net-next,2/3] tipc: byte-based overload control on socket receive queue

Message ID 1360969067-29956-3-git-send-email-paul.gortmaker@windriver.com
State Accepted, archived
Delegated to: David Miller
Headers show

Commit Message

Paul Gortmaker Feb. 15, 2013, 10:57 p.m. UTC
From: Ying Xue <ying.xue@windriver.com>

Change overload control to be purely byte-based, using
sk->sk_rmem_alloc as byte counter, and compare it to a calculated
upper limit for the socket receive queue.

For all connection messages, irrespective of message importance,
the overload limit is set to a constant value (i.e, 67MB). This
limit should normally never be reached because of the lower
limit used by the flow control algorithm, and is there only
as a last resort in case a faulty peer doesn't respect the send
window limit.

For datagram messages, message importance is taken into account
when calculating the overload limit. The calculation is based
on sk->sk_rcvbuf, and is hence configurable via the socket option
SO_RCVBUF.

Cc: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
---
 net/tipc/socket.c | 77 ++++++++++++++++++++++++++++---------------------------
 1 file changed, 39 insertions(+), 38 deletions(-)

Comments

Neil Horman Feb. 18, 2013, 2:47 p.m. UTC | #1
On Fri, Feb 15, 2013 at 05:57:46PM -0500, Paul Gortmaker wrote:
> From: Ying Xue <ying.xue@windriver.com>
> 
> Change overload control to be purely byte-based, using
> sk->sk_rmem_alloc as byte counter, and compare it to a calculated
> upper limit for the socket receive queue.
> 
> For all connection messages, irrespective of message importance,
> the overload limit is set to a constant value (i.e, 67MB). This
> limit should normally never be reached because of the lower
> limit used by the flow control algorithm, and is there only
> as a last resort in case a faulty peer doesn't respect the send
> window limit.
> 
> For datagram messages, message importance is taken into account
> when calculating the overload limit. The calculation is based
> on sk->sk_rcvbuf, and is hence configurable via the socket option
> SO_RCVBUF.
> 
> Cc: Neil Horman <nhorman@tuxdriver.com>
> Signed-off-by: Ying Xue <ying.xue@windriver.com>
> Signed-off-by: Jon Maloy <jon.maloy@ericsson.com>
> Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>

This looks a lot better, thank you.  I still have a few questions though.  See
inline.

> ---
>  net/tipc/socket.c | 77 ++++++++++++++++++++++++++++---------------------------
>  1 file changed, 39 insertions(+), 38 deletions(-)
> 
> diff --git a/net/tipc/socket.c b/net/tipc/socket.c
> index f6ceecd..cbe2f6e 100644
> --- a/net/tipc/socket.c
> +++ b/net/tipc/socket.c
> @@ -43,7 +43,8 @@
>  #define SS_LISTENING	-1	/* socket is listening */
>  #define SS_READY	-2	/* socket is connectionless */
>  
> -#define OVERLOAD_LIMIT_BASE	10000
> +#define CONN_OVERLOAD_LIMIT	((TIPC_FLOW_CONTROL_WIN * 2 + 1) * \
> +				SKB_TRUESIZE(TIPC_MAX_USER_MSG_SIZE))
>  #define CONN_TIMEOUT_DEFAULT	8000	/* default connect timeout = 8s */
>  
>  struct tipc_sock {
> @@ -202,7 +203,6 @@ static int tipc_create(struct net *net, struct socket *sock, int protocol,
>  
>  	sock_init_data(sock, sk);
>  	sk->sk_backlog_rcv = backlog_rcv;
> -	sk->sk_rcvbuf = TIPC_FLOW_CONTROL_WIN * 2 * TIPC_MAX_USER_MSG_SIZE * 2;
>  	sk->sk_data_ready = tipc_data_ready;
>  	sk->sk_write_space = tipc_write_space;
>  	tipc_sk(sk)->p = tp_ptr;
> @@ -1142,34 +1142,6 @@ static void tipc_data_ready(struct sock *sk, int len)
>  }
>  
>  /**
> - * rx_queue_full - determine if receive queue can accept another message
> - * @msg: message to be added to queue
> - * @queue_size: current size of queue
> - * @base: nominal maximum size of queue
> - *
> - * Returns 1 if queue is unable to accept message, 0 otherwise
> - */
> -static int rx_queue_full(struct tipc_msg *msg, u32 queue_size, u32 base)
> -{
> -	u32 threshold;
> -	u32 imp = msg_importance(msg);
> -
> -	if (imp == TIPC_LOW_IMPORTANCE)
> -		threshold = base;
> -	else if (imp == TIPC_MEDIUM_IMPORTANCE)
> -		threshold = base * 2;
> -	else if (imp == TIPC_HIGH_IMPORTANCE)
> -		threshold = base * 100;
> -	else
> -		return 0;
> -
> -	if (msg_connected(msg))
> -		threshold *= 4;
> -
> -	return queue_size >= threshold;
> -}
> -
> -/**
>   * filter_connect - Handle all incoming messages for a connection-based socket
>   * @tsock: TIPC socket
>   * @msg: message
> @@ -1247,6 +1219,36 @@ static u32 filter_connect(struct tipc_sock *tsock, struct sk_buff **buf)
>  }
>  
>  /**
> + * rcvbuf_limit - get proper overload limit of socket receive queue
> + * @sk: socket
> + * @buf: message
> + *
> + * For all connection oriented messages, irrespective of importance,
> + * the default overload value (i.e. 67MB) is set as limit.
> + *
> + * For all connectionless messages, by default new queue limits are
> + * as belows:
> + *
> + * TIPC_LOW_IMPORTANCE       (5MB)
> + * TIPC_MEDIUM_IMPORTANCE    (10MB)
> + * TIPC_HIGH_IMPORTANCE      (20MB)
> + * TIPC_CRITICAL_IMPORTANCE  (40MB)
> + *
> + * Returns overload limit according to corresponding message importance
> + */
> +static unsigned int rcvbuf_limit(struct sock *sk, struct sk_buff *buf)
> +{
> +	struct tipc_msg *msg = buf_msg(buf);
> +	unsigned int limit;
> +
> +	if (msg_connected(msg))
> +		limit = CONN_OVERLOAD_LIMIT;
This still strikes me as a bit weird.  If you really can't tolerate the default
rmem settings in proc, have you considered separating the rmem and wmem values
out into their own sysctls?  Decnet is an example of a protocol that does this
IIRC.  If you did that, then you wouldn't be mysteriously violating the
sk_rcvbuf value that gets set on all new sockets (and you could make this
legitimately tunable).

> +	else
> +		limit = sk->sk_rcvbuf << (msg_importance(msg) + 5);
Same here.  You're using sk_rcvbuf as a base value, rather than a maximum value.
It seems to me, sk_rcvbuf should be the maximum backlog at which you will store
incoming messages.  You can discard lower importance messages at fractions of
sk_rcvbuf if you need to.  If you create separate rmem and wmem sysctls you can
still make the queue limits you document above work, and you won't violate the
receive buffer value set in the socket.

You might also consider looking into adding support for memory accounting, so
that you can play a little more fast and loose with memory limits on an
individual socket when the system as a whole isn't under pressure (tcp and sctp
are good examples of this).

Neil

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jon Maloy Feb. 19, 2013, 8:07 a.m. UTC | #2
On 02/18/2013 09:47 AM, Neil Horman wrote:
> On Fri, Feb 15, 2013 at 05:57:46PM -0500, Paul Gortmaker wrote:
>> From: Ying Xue <ying.xue@windriver.com>
>>
>> Change overload control to be purely byte-based, using
>> sk->sk_rmem_alloc as byte counter, and compare it to a calculated
>> upper limit for the socket receive queue.

[...]

>> + *
>> + * For all connectionless messages, by default new queue limits are
>> + * as belows:
>> + *
>> + * TIPC_LOW_IMPORTANCE       (5MB)
>> + * TIPC_MEDIUM_IMPORTANCE    (10MB)
>> + * TIPC_HIGH_IMPORTANCE      (20MB)
>> + * TIPC_CRITICAL_IMPORTANCE  (40MB)
>> + *
>> + * Returns overload limit according to corresponding message importance
>> + */
>> +static unsigned int rcvbuf_limit(struct sock *sk, struct sk_buff *buf)
>> +{
>> +	struct tipc_msg *msg = buf_msg(buf);
>> +	unsigned int limit;
>> +
>> +	if (msg_connected(msg))
>> +		limit = CONN_OVERLOAD_LIMIT;
> This still strikes me as a bit wierd.  If you really can't tolerate the default
> rmem settings in proc, have you considered separating the rmem and wmem values
> out into their own sysctls?  

Initially we tried to set this value as default for sk_rcvbuf, and then use
fractions of it as limits, as you suggest below. The problem we found was that
if we want to change this via setsockopt(SOL_SOCKET, SO_RCVBUF) the value range
we can use is very limited, and doesn't fit our purposes.

We did consider to introduce a separate setsockopt at TIPC level for this,
but thought it had a value in itself to use the mechanism that is already there. 
Hence the "re-interpretation" of sk_rcvbuf as we do below.
Considering the weird doubling of this parameter that is done elsewhere in the
code we thought that having our own interpretation might be acceptable.
We did of course see the potential issue with this, that is why we cc-ed
you for comments.
Now I see that David already pulled the series, so I am a little uncertain 
about how to proceed. 
Any comments from David on this?

///jon

Decnet is an example of a protocol that does this
> IIRC.  If you did that, then you wouldn't be mysteriously violating the
> sk_rcvbuf value that gets set on all new sockets (and you could make this
> legitimately tunable).
> 
>> +	else
>> +		limit = sk->sk_rcvbuf << (msg_importance(msg) + 5);
> Same here.  You're using sk_rcvbuf as a base value, rather than a maximum value.
> It seems to me, sk_rcvbuf should be the maximum backlog at which you will store
> incomming messages.  You can discard lower importance messages at fractions of
> sk_rcvbuf if you need to.  If you create separate rmem and wmem sysctls you can
> still make the queue limits you document above work, and you won't violate the
> receive buffer value set in the socket.

> 
> You might also consider looking into adding support for memory accounting, so
> that you can play a little more fast and loose with memory limits on an
> individual socket when the system as a whole isn't under pressure (tcp and sctp
> aer good examples of this).
> 
> Neil
> 

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Neil Horman Feb. 19, 2013, 2:26 p.m. UTC | #3
On Tue, Feb 19, 2013 at 09:07:54AM +0100, Jon Maloy wrote:
> On 02/18/2013 09:47 AM, Neil Horman wrote:
> > On Fri, Feb 15, 2013 at 05:57:46PM -0500, Paul Gortmaker wrote:
> >> From: Ying Xue <ying.xue@windriver.com>
> >>
> >> Change overload control to be purely byte-based, using
> >> sk->sk_rmem_alloc as byte counter, and compare it to a calculated
> >> upper limit for the socket receive queue.
> 
> [...]
> 
> >> + *
> >> + * For all connectionless messages, by default new queue limits are
> >> + * as belows:
> >> + *
> >> + * TIPC_LOW_IMPORTANCE       (5MB)
> >> + * TIPC_MEDIUM_IMPORTANCE    (10MB)
> >> + * TIPC_HIGH_IMPORTANCE      (20MB)
> >> + * TIPC_CRITICAL_IMPORTANCE  (40MB)
> >> + *
> >> + * Returns overload limit according to corresponding message importance
> >> + */
> >> +static unsigned int rcvbuf_limit(struct sock *sk, struct sk_buff *buf)
> >> +{
> >> +	struct tipc_msg *msg = buf_msg(buf);
> >> +	unsigned int limit;
> >> +
> >> +	if (msg_connected(msg))
> >> +		limit = CONN_OVERLOAD_LIMIT;
> > This still strikes me as a bit wierd.  If you really can't tolerate the default
> > rmem settings in proc, have you considered separating the rmem and wmem values
> > out into their own sysctls?  
> 
> Initially we tried to set this value as default for sk_rcvbuf, and then use
> fractions of it as limits, as you suggest below. The problem we found was that
> if we want to change this via setsockopt(SOL_SOCKET, SO_RCVBUF) the value range
> we can use is very limited, and doesn't fit our purposes.
> 
Can you elaborate on this please?  The above doesn't really explain why you
can't do what I suggested.  Not asserting that what you say is untrue, mind you,
I'm just trying to understand what it is about TIPC that requires such a
specific reception buffer envelope, and how enforcing queue limits here is so
important when packets could just as easily be dropped at the ip layer (with
ostensibly no fatal failure).

> We did consider to introduce a separate setsockopt at TIPC level for this,
> but thought it had a value in itself to use the mechanism that is already there. 
> Hence the "re-interpretation" of sk_rcvbuf as we do below.
> Considering the weird doubling of this parameter that is done elsewhere in the
> code we thought that having our own interpretation might be acceptable.
That's quite different IMHO.  The comments in sock_setsockopt make it pretty
clear that the doubling of the rcvbuf value is done to account for the sk_buff
overhead of packet reception, and thats documented in the socket(7) man page.
What you have here is the ability to set sk_rcvbuf, and then have that setting
be ignored, but only to within several different limits, depending on various
conditions, all of which are not visible to user space.

> We did of course see the potential issue with this, that is why we cc-ed
> you for comments.
I appreciate that.

> Now I see that David already pulled the series, so I am a little uncertain 
> about how to proceed. 
I saw that too, and asked him about this.  A follow-on patch (if we wind up
deciding one is warranted) is the way to go here.

Regards
Neil

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jon Maloy Feb. 19, 2013, 5:54 p.m. UTC | #4
On 02/19/2013 03:26 PM, Neil Horman wrote:
> On Tue, Feb 19, 2013 at 09:07:54AM +0100, Jon Maloy wrote:
>> On 02/18/2013 09:47 AM, Neil Horman wrote:
>>> On Fri, Feb 15, 2013 at 05:57:46PM -0500, Paul Gortmaker wrote:
>>>> From: Ying Xue <ying.xue@windriver.com>
>>>>
>>>> Change overload control to be purely byte-based, using
>>>> sk->sk_rmem_alloc as byte counter, and compare it to a calculated
>>>> upper limit for the socket receive queue.
>>
>> [...]
>>
>>>> + *
>>>> + * For all connectionless messages, by default new queue limits are
>>>> + * as belows:
>>>> + *
>>>> + * TIPC_LOW_IMPORTANCE       (5MB)
>>>> + * TIPC_MEDIUM_IMPORTANCE    (10MB)
>>>> + * TIPC_HIGH_IMPORTANCE      (20MB)
>>>> + * TIPC_CRITICAL_IMPORTANCE  (40MB)
>>>> + *
>>>> + * Returns overload limit according to corresponding message importance
>>>> + */
>>>> +static unsigned int rcvbuf_limit(struct sock *sk, struct sk_buff *buf)
>>>> +{
>>>> +	struct tipc_msg *msg = buf_msg(buf);
>>>> +	unsigned int limit;
>>>> +
>>>> +	if (msg_connected(msg))
>>>> +		limit = CONN_OVERLOAD_LIMIT;
>>> This still strikes me as a bit wierd.  If you really can't tolerate the default
>>> rmem settings in proc, have you considered separating the rmem and wmem values
>>> out into their own sysctls?  
>>
>> Initially we tried to set this value as default for sk_rcvbuf, and then use
>> fractions of it as limits, as you suggest below. The problem we found was that
>> if we want to change this via setsockopt(SOL_SOCKET, SO_RCVBUF) the value range
>> we can use is very limited, and doesn't fit our purposes.
>>
> Can you elaborate on this please?  The above doesn't really explain why you
> can't do what I suggested.  Not asserting that what you say is untrue, mind you,
> I'm just trying to understand what it is about TIPC that requires such a
> specific reception buffer envelope, 
	
There are two reasons for this. 
The first one due to the message oriented nature of the flow control for 
connections. Max message size is 65k, and max number of unacked messages 
(at socket level, that is) before the sending process will take a break 
is 1024. 
So, simple maths gives that we must allow for 64MB + sk_overhead to guarantee 
that a connection never is broken because of receiver overload. Contrary to TCP,
we don't have the luxury to just drop packets and expect them to be retransmitted,
because in TIPC they have already passed through the retransmission and 
defragmentation layers, and are committed for delivery when they reach the 
receiving socket.
You may question the wisdom of having a message oriented flow control algorithm, 
but this the way it is now. We do actually have a working prototype where we 
introduce purely byte-based flow control, but it is not ready for delivery yet, 
and compatibility constraints will require that we still keep this high limit 
in some cases.

The second reason is that TIPC provides SOCK_RDM instead of SOCK_DGRAM as its
basic datagram service. (It is the only transport service doing this afaik.)
So, the risk of having datagram messages rejected (not dropped), for any reason, 
must be extremely low. ("Rejected" here means that we notify the sender when a
message cannot be delivered, we don't just silently drop it.)
Given that we also often have seen designs with many clients sending towards
one server we have empirical experience that we must have a high threshold here.
One can discuss whether 2MB or 5MB is the adequate limit for the lowest level, 
we don't really know since this is a new design, but we have every reason to believe
that the upper limit permitted by setsockopt(SOL_SOCK) (425,984 bytes according
to a quick test I made) is not enough for us.

> and how enforcing queue limits here is so
> important when packets could just as easily be dropped at the ip layer (with
> ostensibly no fatal failure).

There is no IP layer. TIPC and its retransmission layer sits directly on top of
L2/Ethernet. Nothing can be dropped below that layer, for obvious reasons,
and to drop anything above that layer *has* fatal consequences, because TIPC
guarantees delivery both for connection oriented and connectionless (as far as 
ever possible) services. 
Anyway, only the receiving socket contains the info making it possible to
select which messages to reject, in the rare cases where that becomes
unavoidable.

> 
>> We did consider to introduce a separate setsockopt at TIPC level for this,
>> but thought it had a value in itself to use the mechanism that is already there. 
>> Hence the "re-interpretation" of sk_rcvbuf as we do below.
>> Considering the weird doubling of this parameter that is done elsewhere in the
>> code we thought that having our own interpretation might be acceptable.
> Thats quite different IMHO.  The comments in sock_setsockopt make it pretty
> clear that the doubling of the rcvbuf value is done to account for the sk_buff
> overhead of packet reception, and thats documented in the socket(7) man page.
> What you have here is the ability to set sk_rcvbuf, and then have that setting
> be ignored, but only to within several different limits, depending on various
> conditions, all of which are not visible to user space.
> 
>> We did of course see the potential issue with this, that is why we cc-ed
>> you for comments.
> I appreciate that.
> 
>> Now I see that David already pulled the series, so I am a little uncertain 
>> about how to proceed. 
> I saw that too, and asked him about this.  A follow-on patch (if we wind up
> deciding one is warranted) is the way to go here.

Good. The best solution I see now, if you think the times-32 scaling is 
unacceptable, would be to introduce a setsockopt() at SOL_TIPC level, and allow
for much wider limits than SOL_SOCK permits now. That we need these wider limits
is beyond doubt, as I see it.

Regards
///jon

> 
> Regards
> Neil
> 

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Neil Horman Feb. 19, 2013, 7:18 p.m. UTC | #5
On Tue, Feb 19, 2013 at 06:54:14PM +0100, Jon Maloy wrote:
> On 02/19/2013 03:26 PM, Neil Horman wrote:
> > On Tue, Feb 19, 2013 at 09:07:54AM +0100, Jon Maloy wrote:
> >> On 02/18/2013 09:47 AM, Neil Horman wrote:
> >>> On Fri, Feb 15, 2013 at 05:57:46PM -0500, Paul Gortmaker wrote:
> >>>> From: Ying Xue <ying.xue@windriver.com>
> >>>>
> >>>> Change overload control to be purely byte-based, using
> >>>> sk->sk_rmem_alloc as byte counter, and compare it to a calculated
> >>>> upper limit for the socket receive queue.
> >>
> >> [...]
> >>
> >>>> + *
> >>>> + * For all connectionless messages, by default new queue limits are
> >>>> + * as belows:
> >>>> + *
> >>>> + * TIPC_LOW_IMPORTANCE       (5MB)
> >>>> + * TIPC_MEDIUM_IMPORTANCE    (10MB)
> >>>> + * TIPC_HIGH_IMPORTANCE      (20MB)
> >>>> + * TIPC_CRITICAL_IMPORTANCE  (40MB)
> >>>> + *
> >>>> + * Returns overload limit according to corresponding message importance
> >>>> + */
> >>>> +static unsigned int rcvbuf_limit(struct sock *sk, struct sk_buff *buf)
> >>>> +{
> >>>> +	struct tipc_msg *msg = buf_msg(buf);
> >>>> +	unsigned int limit;
> >>>> +
> >>>> +	if (msg_connected(msg))
> >>>> +		limit = CONN_OVERLOAD_LIMIT;
> >>> This still strikes me as a bit wierd.  If you really can't tolerate the default
> >>> rmem settings in proc, have you considered separating the rmem and wmem values
> >>> out into their own sysctls?  
> >>
> >> Initially we tried to set this value as default for sk_rcvbuf, and then use
> >> fractions of it as limits, as you suggest below. The problem we found was that
> >> if we want to change this via setsockopt(SOL_SOCKET, SO_RCVBUF) the value range
> >> we can use is very limited, and doesn't fit our purposes.
> >>
> > Can you elaborate on this please?  The above doesn't really explain why you
> > can't do what I suggested.  Not asserting that what you say is untrue, mind you,
> > I'm just trying to understand what it is about TIPC that requires such a
> > specific reception buffer envelope, 
> 	
> There are two reasons for this. 
> The first one due to the message oriented nature of the flow control for 
> connections. Max message size is 65k, and max number of unacked messages 
> (at socket level, that is) before the sending process will take a break 
> is 1024. 
> So, simple maths gives that we must allow for 64MB + sk_overhead to guarantee 
> that a connection never is broken because of receiver overload. Contrary to TCP,
Note, this is false, due to the fact that, it presumes that the sender will
honor the congestion window.  Granted, that would be a sender violation, but it's
still possible for packets to get lost due to receiver overrun.  The fact that
you ack packets before accepting them to the receive queue is the problem that
needs fixing in this case, but that looks like it can be easily accomplished
(see below).

> we don't have the luxury to just drop packets and expect them to be retransmitted,
> because in TIPC they have already passed through the retransmission and 
> defragmentation layers, and are committed for delivery when they reach the 
> receiving socket.
The problem isn't that you don't have the luxury of droping packets, the problem
is that you've decided to ack an incomming packet before you've determined if
you have the space to accept it (I sympathise with you here, I had to go through
this same exercise with SCTP a few years ago).  Just trying to balance your
rcvbuf space so that you don't have to worry about it causes more problems than
it solves in the end.

The thing to do is just move your rcvbuf check lower in the stack, in this case
to tipc_recv_msg.  You have a tipc_msg struct there, from which you can
determine if the message is data, get the desination port, and convert that to a
tipc_port pointer, which contains a usr_handle pointer.  From my read, the
usr_handle pointer is only ever set to a socket structure, so if you have a non
NULL usr_handle, cast it to a struct sock, and read is sk_rcvbuf.  If its over
capacity, drop the packet before you ack it.

> You may question the wisdom of having a message oriented flow control algorithm, 
> but this the way it is now. We do actually have a working prototype where we 
> introduce purely byte-based flow control, but it is not ready for delivery yet, 
> and compatibility constraints will require that we still keep this high limit 
> in some cases.
> 
I do believe that a byte oriented algorithm is better, but for now a message
based mechanism is fine by me.  My only concern is that you honor the limits
placed on the socket.  I'm not sure what compatibility requirements enjoin you
from having a lower rcvbuf limit, but it would seem to me, just not acking a
message until you are sure you can accept it would fix those problems.  Even if
that isn't the case, honor the limits, tie them to the tunables in /proc (or
create your own), and document what the required limits are.

> The second reason is that TIPC provides SOCK_RDM instead of SOCK_DGRAM as its
> basic datagram service. (It is the only transport service doing this afaik.)
> So, the risk of having datagram messages rejected (not dropped), for any reason, 
> must be extremely low. ("Rejected" here means that we notify the sender when a
> message cannot be delivered, we don't just silently drop it.)
Thats fine, I'm not talking about rejecting messages, I'm specifically referring
to dropping them - i.e. behaving as if they never arrived at the host at all
(save for maybe increasing a stats counter, so the admin knows what's going on).
One way or another the sender has to be prepared to handle an unacked frame.  If
you can't, then the protocol is broken.

> Given that we also often have seen designs with many clients sending towards
> one server we have empirical experience that we must have a high threshold here.
> One can discuss whether 2MB or 5MB is the adequate limit for the lowest level, 
> we don't really now since this a new design, but we have every reason to believe
> that the upper limit permitted by setsockopt(SOL_SOCK) (425,984 bytes according
> to a quick test I made) is not enough for us.
I'm not trying to force you to a lower sk_rcvbuf value, I'm fine with whatever
you want to set it to, my only request here is that you honor the limits set on
your socket at least semi accurately.  If you need to set that limit higher by
default, do so (theres sysctl values for that already:
sysctl_rmem_[min|default|max], or you can build your own, though I would
recommend you use the former).

Note also, that the upper limit of sk_rcvbuf isn't 425,984 bytes.  That sounds
more like what you have sysctl_rmem_max set to on your system at the moment.
Bump that up if you need it higher (do the same with sysctl_rmem_default, for
apps that need to expect your old buffer size value.

> 
> > and how enforcing queue limits here is so
> > important when packets could just as easily be dropped at the ip layer (with
> > ostensibly no fatal failure).
> 
> There is no IP layer. TIPC and its retransmission layer sits directly on top of
> L2/Ethernet. Nothing can be dropped below that layer, for obvious reasons,
> and to drop anything above that layer *has* fatal consequences, because TIPC
> guarantees delivery both for connection oriented and connectionless (as far as 
> ever possible) services. 
Sorry, I forgot TIPC sat directly on top of L2.  Regardless however, as noted
above, you can certainly still do discards below your ack point in your stack.

> Anyway, only the receiving socket contains the info making it possible to
> select which messages to reject, in the rare cases where that becomes
> unavoidable.
> 
Yup, and you can interrogate that from tipc_recv_msg

> > 
> >> We did consider to introduce a separate setsockopt at TIPC level for this,
> >> but thought it had a value in itself to use the mechanism that is already there. 
> >> Hence the "re-interpretation" of sk_rcvbuf as we do below.
> >> Considering the weird doubling of this parameter that is done elsewhere in the
> >> code we thought that having our own interpretation might be acceptable.
> > Thats quite different IMHO.  The comments in sock_setsockopt make it pretty
> > clear that the doubling of the rcvbuf value is done to account for the sk_buff
> > overhead of packet reception, and thats documented in the socket(7) man page.
> > What you have here is the ability to set sk_rcvbuf, and then have that setting
> > be ignored, but only to within several different limits, depending on various
> > conditions, all of which are not visible to user space.
> > 
> >> We did of course see the potential issue with this, that is why we cc-ed
> >> you for comments.
> > I appreciate that.
> > 
> >> Now I see that David already pulled the series, so I am a little uncertain 
> >> about how to proceed. 
> > I saw that too, and asked him about this.  A follow-on patch (if we wind up
> > deciding one is warranted) is the way to go here.
> 
> Good. The best solution I see now, if you think the times-32 scaling is 
> unacceptable, would be to introduce a setsockopt() at SOL_TIPC level, and allow
> for much wider limits than SOL_SOCK permits now. That we need these wider limits
> is beyond doubt, as I see it.
> 
No.  See above, don't do a special socket option.  Just use the existing sysctl
to define your default and upper limits, and if they're not big enough, set them
higher,  you can use /etc/sysctl.conf to automate this.

Neil

> Regards
> ///jon
> 
> > 
> > Regards
> > Neil
> > 
> 
> 
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jon Maloy Feb. 19, 2013, 8:16 p.m. UTC | #6
On 02/19/2013 08:18 PM, Neil Horman wrote:
> On Tue, Feb 19, 2013 at 06:54:14PM +0100, Jon Maloy wrote:
>> On 02/19/2013 03:26 PM, Neil Horman wrote:
>>> On Tue, Feb 19, 2013 at 09:07:54AM +0100, Jon Maloy wrote:
>>>> On 02/18/2013 09:47 AM, Neil Horman wrote:
>>>>> On Fri, Feb 15, 2013 at 05:57:46PM -0500, Paul Gortmaker wrote:
>>>>>> From: Ying Xue <ying.xue@windriver.com>
>>>>>>
>>>>>> Change overload control to be purely byte-based, using
>>>>>> sk->sk_rmem_alloc as byte counter, and compare it to a calculated
>>>>>> upper limit for the socket receive queue.
>>>>
>>>> [...]
>>>>
>>>>>> + *
>>>>>> + * For all connectionless messages, by default new queue limits are
>>>>>> + * as belows:
>>>>>> + *
>>>>>> + * TIPC_LOW_IMPORTANCE       (5MB)
>>>>>> + * TIPC_MEDIUM_IMPORTANCE    (10MB)
>>>>>> + * TIPC_HIGH_IMPORTANCE      (20MB)
>>>>>> + * TIPC_CRITICAL_IMPORTANCE  (40MB)
>>>>>> + *
>>>>>> + * Returns overload limit according to corresponding message importance
>>>>>> + */
>>>>>> +static unsigned int rcvbuf_limit(struct sock *sk, struct sk_buff *buf)
>>>>>> +{
>>>>>> +	struct tipc_msg *msg = buf_msg(buf);
>>>>>> +	unsigned int limit;
>>>>>> +
>>>>>> +	if (msg_connected(msg))
>>>>>> +		limit = CONN_OVERLOAD_LIMIT;
>>>>> This still strikes me as a bit wierd.  If you really can't tolerate the default
>>>>> rmem settings in proc, have you considered separating the rmem and wmem values
>>>>> out into their own sysctls?  
>>>>
>>>> Initially we tried to set this value as default for sk_rcvbuf, and then use
>>>> fractions of it as limits, as you suggest below. The problem we found was that
>>>> if we want to change this via setsockopt(SOL_SOCKET, SO_RCVBUF) the value range
>>>> we can use is very limited, and doesn't fit our purposes.
>>>>
>>> Can you elaborate on this please?  The above doesn't really explain why you
>>> can't do what I suggested.  Not asserting that what you say is untrue, mind you,
>>> I'm just trying to understand what it is about TIPC that requires such a
>>> specific reception buffer envelope, 
>> 	
>> There are two reasons for this. 
>> The first one due to the message oriented nature of the flow control for 
>> connections. Max message size is 65k, and max number of unacked messages 
>> (at socket level, that is) before the sending process will take a break 
>> is 1024. 
>> So, simple maths gives that we must allow for 64MB + sk_overhead to guarantee 
>> that a connection never is broken because of receiver overload. Contrary to TCP,
> Note, this is false, due to the fact that, it presumes that the sender will
> honor the congestion window.  Granted, that would be a sender violation, but its
> still possible for packets to get lost due to receiver overrun.  

The reason for this high limit is exactly to guard against crazy or malevolent 
senders. If they respect their send window they will never hit this limit on
connections.

The fact that
> you ack packets before accepting them to the receive queue is the problem that
> needs fixing in this case, but that looks like it can be easily accomplished
> (see below).
> 
>> we don't have the luxury to just drop packets and expect them to be retransmitted,
>> because in TIPC they have already passed through the retransmission and 
>> defragmentation layers, and are committed for delivery when they reach the 
>> receiving socket.
> The problem isn't that you don't have the luxury of droping packets, the problem
> is that you've decided to ack an incomming packet before you've determined if
> you have the space to accept it (I sympathise with you here, I had to go through
> this same exercise with SCTP a few years ago).  Just trying to balance your
> rcvbuf space so that you don't have to worry about it causes more problems than
> it solves in the end.

I think you are missing an important point here. There is no one-to-one relation
between a link and a socket. The link serves *all* sockets on a node, with one shared
send/retransmission queue and one shared packet number sequence.  So dropping 
packets on arrival as you suggest will not only hurt the process sending too many 
messages, but everybody else sending anything between the two nodes.
Granted, the retransmission will take care of the dropped packet, but in the 
meantime no other packets can be delivered through from that link, to any 
socket. Basically, all TIPC communication between the two involved
nodes in the given direction would grind to a halt until the slow or overwhelmed 
receiver process has decided to work off his receive queue, something that may 
never happen if it is faulty.
You may see this as a flaw, but it is a consequence of that TIPC is a radically 
different design than IP based protocols, designed for completely different needs.
The shared link concept is a fundamental feature that has a lot of other advantages 
which I won't elaborate on here.

> 
> The thing to do is just move your rcvbuf check lower in the stack, in this case
> to tipc_recv_msg.  You have a tipc_msg struct there, from which you can
> determine if the message is data, get the desination port, and convert that to a
> tipc_port pointer, which contains a usr_handle pointer.  From my read, the
> usr_handle pointer is only ever set to a socket structure, 

Unfortunately not. There are still remnants of the "native" interface present, 
which is not socket based at all. We are working on that.

so if you have a non
> NULL usr_handle, cast it to a struct sock, and read its sk_rcvbuf.  If it's over
> capacity, drop the packet before you ack it.

In theory this is possible, but extremely awkward with the current locking
structure (We are working on that too). Anyway, I hope I have convinced
you with the above that dropping packets at the link level is not a viable
option.

> 
>> You may question the wisdom of having a message oriented flow control algorithm, 
>> but this the way it is now. We do actually have a working prototype where we 
>> introduce purely byte-based flow control, but it is not ready for delivery yet, 
>> and compatibility constraints will require that we still keep this high limit 
>> in some cases.
>>
> I do believe that a byte oriented algorithm is better, but for now a message
> based mechanism is fine by me.  My only concern is that you honor the limits
> placed on the socket.  I'm not sure what compatibilty requirement enjoin you
> from having a lower rcvbuf limit, but it would seem to me, just not acking a
> message until you are sure you can accept it would fix those problems.  Even if
> that isn't the case, honor the limits, tie them to the tunables in /proc (or
> create your own), and document what the required limits are.
> 
>> The second reason is that TIPC provides SOCK_RDM instead of SOCK_DGRAM as its
>> basic datagram service. (It is the only transport service doing this afaik.)
>> So, the risk of having datagram messages rejected (not dropped), for any reason, 
>> must be extremely low. ("Rejected" here means that we notify the sender when a
>> message cannot be delivered, we don't just silently drop it.)
> Thats fine, I'm not talking about rejecting messages, I'm specifically referring
> to dropping them - i.e. behaving as if they never arrived at the host at all
> (save for maybe increasing a stats counter, so the admin knows whats going on).
> One way or another the sender has to be prepared to handle an unacked frame.  If
> you can't, then the protocol is broken.
> 
>> Given that we also often have seen designs with many clients sending towards
>> one server we have empirical experience that we must have a high threshold here.
>> One can discuss whether 2MB or 5MB is the adequate limit for the lowest level, 
>> we don't really know since this is a new design, but we have every reason to believe
>> that the upper limit permitted by setsockopt(SOL_SOCK) (425,984 bytes according
>> to a quick test I made) is not enough for us.
> I'm not trying to force you to a lower sk_rcvbuf value, I'm fine with whatever
> you want to set it to, my only request here is that you honor the limits set on
> your socket at least semi accurately.  If you need to set that limit higher by
> default, do so (theres sysctl values for that already:
> sysctl_rmem_[min|default|max], or you can build your own, though I would
> recommend you use the former).

Ok, I wasn't aware of that. Now, if we could set these parameters from inside 
the module, when a socket is created, I think we have what we need. We 
don't want to force every socket creator to set these limits explicitly, unless
he has some very particular needs and knows what he is doing.

///jon

> 
> Note also, that the upper limit of sk_rcvbuf isn't 425,984 bytes.  That sounds
> more like what you have sysctl_rmem_max set to on your system at the moment.
> Bump that up if you need it higher (do the same with sysctl_rmem_default, for
> apps that need to expect your old buffer size value.)
> 
>>
>>> and how enforcing queue limits here is so
>>> important when packets could just as easily be dropped at the ip layer (with
>>> ostensibly no fatal failure).
>>
>> There is no IP layer. TIPC and its retransmission layer sits directly on top of
>> L2/Ethernet. Nothing can be dropped below that layer, for obvious reasons,
>> and to drop anything above that layer *has* fatal consequences, because TIPC
>> guarantees delivery both for connection oriented and connectionless (as far as 
>> ever possible) services. 
> Sorry, I forgot TIPC sat directly on top of L2.  Regardless however, as noted
> above, you can certainly still do discards below your ack point in your stack.
> 
>> Anyway, only the receiving socket contains the info making it possible to
>> select which messages to reject, in the rare cases where that becomes
>> unavoidable.
>>
> Yup, and you can interrogate that from tipc_recv_msg
> 
>>>
>>>> We did consider to introduce a separate setsockopt at TIPC level for this,
>>>> but thought it had a value in itself to use the mechanism that is already there. 
>>>> Hence the "re-interpretation" of sk_rcvbuf as we do below.
>>>> Considering the weird doubling of this parameter that is done elsewhere in the
>>>> code we thought that having our own interpretation might be acceptable.
>>> Thats quite different IMHO.  The comments in sock_setsockopt make it pretty
>>> clear that the doubling of the rcvbuf value is done to account for the sk_buff
>>> overhead of packet reception, and thats documented in the socket(7) man page.
>>> What you have here is the ability to set sk_rcvbuf, and then have that setting
>>> be ignored, but only to within several different limits, depending on various
>>> conditions, all of which are not visible to user space.
>>>
>>>> We did of course see the potential issue with this, that is why we cc-ed
>>>> you for comments.
>>> I appreciate that.
>>>
>>>> Now I see that David already pulled the series, so I am a little uncertain 
>>>> about how to proceed. 
>>> I saw that too, and asked him about this.  A follow-on patch (if we wind up
>>> deciding one is warranted) is the way to go here.
>>
>> Good. The best solution I see now, if you think the times-32 scaling is 
>> unacceptable, would be to introduce a setsockopt() at SOL_TIPC level, and allow
>> for much wider limits than SOL_SOCK permits now. That we need these wider limits
>> is beyond doubt, as I see it.
>>
> No  See above, don't do a special socket option.  Just use the existing sysctl
> to define your default and upper limits, and if they're not big enough, set them
> higher,  you can use /etc/sysctl.conf to automate this.
> 
> Neil
> 
>> Regards
>> ///jon
>>
>>>
>>> Regards
>>> Neil
>>>
>>
>>

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Neil Horman Feb. 19, 2013, 9:44 p.m. UTC | #7
On Tue, Feb 19, 2013 at 09:16:40PM +0100, Jon Maloy wrote:
> On 02/19/2013 08:18 PM, Neil Horman wrote:
> > On Tue, Feb 19, 2013 at 06:54:14PM +0100, Jon Maloy wrote:
> >> On 02/19/2013 03:26 PM, Neil Horman wrote:
> >>> On Tue, Feb 19, 2013 at 09:07:54AM +0100, Jon Maloy wrote:
> >>>> On 02/18/2013 09:47 AM, Neil Horman wrote:
> >>>>> On Fri, Feb 15, 2013 at 05:57:46PM -0500, Paul Gortmaker wrote:
> >>>>>> From: Ying Xue <ying.xue@windriver.com>
><snip>

> >> 	
> >> There are two reasons for this. 
> >> The first one due to the message oriented nature of the flow control for 
> >> connections. Max message size is 65k, and max number of unacked messages 
> >> (at socket level, that is) before the sending process will take a break 
> >> is 1024. 
> >> So, simple maths gives that we must allow for 64MB + sk_overhead to guarantee 
> >> that a connection never is broken because of receiver overload. Contrary to TCP,
> > Note, this is false, due to the fact that, it presumes that the sender will
> > honor the congestion window.  Granted, that would be a sender violation, but its
> > still possible for packets to get lost due to receiver overrun.  
> 
> The reason for this high limit is exactly to guard against crazy or malevolent 
> senders. If they respect their send window they will never hit this limit on
> connections.
> 
Nope, you don't get to have it both ways - if a sender is malevolent or crazy,
what makes you think it will respect its send window?

> The fact that
> > you ack packets before accepting them to the receive queue is the problem that
> > needs fixing in this case, but that looks like it can be easily accomplished
> > (see below).
> > 
> >> we don't have the luxury to just drop packets and expect them to be retransmitted,
> >> because in TIPC they have already passed through the retransmission and 
> >> defragmentation layers, and are committed for delivery when they reach the 
> >> receiving socket.
> > The problem isn't that you don't have the luxury of droping packets, the problem
> > is that you've decided to ack an incomming packet before you've determined if
> > you have the space to accept it (I sympathise with you here, I had to go through
> > this same exercise with SCTP a few years ago).  Just trying to balance your
> > rcvbuf space so that you don't have to worry about it causes more problems than
> > it solves in the end.
> 
> I think you are missing an important point here. There is no one-to-one relation
> between a link and a socket. The link serves *all* socket on a node, with one shared
> send/retransmission queue and one shared packet number sequence.  So dropping 
> packets on arrival as you suggest will not only hurt the process sending too many 
> messages, but everybody else sending anything between the two nodes.
No, I get that, but the fact that TIPC has a shared retransmit queue between all
sockets on the link isn't an excuse to violate the limits set on an individual
socket.

> Granted, the retransmission will take care of the dropped packet, but in the 
> meantime no other packets can be delivered through from that link, to any 
> socket. Basically, all TIPC communication between the two involved
> nodes in the given direction would grind to a halt until the slow or overwhelmed 
> receiver process has decided to work off his receive queue, something that may 
> never happen if it is faulty.
Sounds like a bug.  That should be fixed.

> You may see this as a flaw, but it is a consequence of that TIPC is a radically 
> different design than IP based protocols, designed for completely different needs.
> The shared link concept is a fundamental feature that has a lot of other advantages 
> which I won't elaborate on here.
> 
Very well.  While I'm thinking of it though, you also seem to be making a large
leap in reasoning - you seem to be repeatedly equating my request to have you
honor the limits of sk_rcvbuf, with a desire to have that limit be smaller than
what you currently have it set to.  Thats absolutely untrue.  I don't care if
you set your limit to UINT_MAX/2 on a socket, I just want what you set to be
honored.  If you set your sk_rcvbuf high enough (using setsockopt or the proc
tunables I mentioned below), checking to make sure you're complying with those values
should do nothing from a rx path standpoint (i.e. if you set your rcvbuf high
enough, you will never hit the limit anyway, unless of course you have a
malevolent sender, in which case all bets are off anyway).

> > 
> > The thing to do is just move your rcvbuf check lower in the stack, in this case
> > to tipc_recv_msg.  You have a tipc_msg struct there, from which you can
> > determine if the message is data, get the desination port, and convert that to a
> > tipc_port pointer, which contains a usr_handle pointer.  From my read, the
> > usr_handle pointer is only ever set to a socket structure, 
> 
> Unfortunately not. There are still remnants of the "native" interface present, 
> which is not socket based at all. We are working on that.
> 
Ugh, you're right.  Well, thats still just two possible types in usr_handle, you
could easily add a boolean flag to mark those ports which had sockets assigned
to them, at least until you cleaned out the native interface

> so if you have a non
> > NULL usr_handle, cast it to a struct sock, and read is sk_rcvbuf.  If its over
> > capacity, drop the packet before you ack it.
> 
> In theory this is possible, but extremely awkward with the current locking
> structure (We are working on that too). Anyway, I hope I have convinced
> you with the above that dropping packets at the link level is not a viable
> option.
> 
The only locking that I see you doing to check the sk_rcvbuf value in
its current location is the tipc_port_lock, and you do that with all the same
locks held farther up the call stack (tipc_net_lock, tipc_node_lock).  The fact
of the matter is, moving that check lower down into tipc_recv_msg doesn't
require any nesting changes to your locking at all.

> > 
> >> You may question the wisdom of having a message oriented flow control algorithm, 
> >> but this the way it is now. We do actually have a working prototype where we 
> >> introduce purely byte-based flow control, but it is not ready for delivery yet, 
> >> and compatibility constraints will require that we still keep this high limit 
> >> in some cases.
> >>
> > I do believe that a byte oriented algorithm is better, but for now a message
> > based mechanism is fine by me.  My only concern is that you honor the limits
> > placed on the socket.  I'm not sure what compatibilty requirement enjoin you
> > from having a lower rcvbuf limit, but it would seem to me, just not acking a
> > message until you are sure you can accept it would fix those problems.  Even if
> > that isn't the case, honor the limits, tie them to the tunables in /proc (or
> > create your own), and document what the required limits are.
> > 
> >> The second reason is that TIPC provides SOCK_RDM instead of SOCK_DGRAM as its
> >> basic datagram service. (It is the only transport service doing this afaik.)
> >> So, the risk of having datagram messages rejected (not dropped), for any reason, 
> >> must be extremely low. ("Rejected" here means that we notify the sender when a
> >> message cannot be delivered, we don't just silently drop it.)
> > Thats fine, I'm not talking about rejecting messages, I'm specifically referring
> > to dropping them - i.e. behaving as if they never arrived at the host at all
> > (save for maybe increasing a stats counter, so the admin knows whats going on).
> > One way or another the sender has to be prepared to handle an unacked frame.  If
> > you can't, then the protocol is broken.
> > 
> >> Given that we also often have seen designs with many clients sending towards
> >> one server we have empirical experience that we must have a high threshold here.
> >> One can discuss whether 2MB or 5MB is the adequate limit for the lowest level, 
> >> we don't really know since this is a new design, but we have every reason to believe
> >> that the upper limit permitted by setsockopt(SOL_SOCK) (425,984 bytes according
> >> to a quick test I made) is not enough for us.
> > I'm not trying to force you to a lower sk_rcvbuf value, I'm fine with whatever
> > you want to set it to, my only request here is that you honor the limits set on
> > your socket at least semi accurately.  If you need to set that limit higher by
> > default, do so (theres sysctl values for that already:
> > sysctl_rmem_[min|default|max], or you can build your own, though I would
> > recommend you use the former).
> 
> Ok, I wasn't aware of that. Now, if we could set these parameters from inside 
> the module, when a socket is created, I think we have what we need. We 
> don't want to force every socket creator to set these limits explicitly, unless
> he has some very particular needs and knows what he is doing.
> 
I've tried to explain this several times now.  You don't have to have set this value to
what you want programatically, nor do you need to force the value from within
the module code itself, you can do it administratively.  If you set:

/proc/sys/net/core/rmem_default

to the value that you want all your sockets to have, any socket that gets
initialized with sock_init_data will inherit that value.  Note that, when doing
so, you may also have to set:

/proc/sys/net/core/rmem_max

As you can't adminstratively set your default socket rcvbuf value to something
larger than the maximum allowed value without raising the maximum allowed value
first.

Then all you have to do is make sure those values are set during boot up, and for
users, it will appear as though nothing has changed.

Neil

> 
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jon Maloy Feb. 21, 2013, 10:24 a.m. UTC | #8
On 02/19/2013 10:44 PM, Neil Horman wrote:
> On Tue, Feb 19, 2013 at 09:16:40PM +0100, Jon Maloy wrote:
>> On 02/19/2013 08:18 PM, Neil Horman wrote:
>>> On Tue, Feb 19, 2013 at 06:54:14PM +0100, Jon Maloy wrote:
>>>> On 02/19/2013 03:26 PM, Neil Horman wrote:
>>>>> On Tue, Feb 19, 2013 at 09:07:54AM +0100, Jon Maloy wrote:
>>>>>> On 02/18/2013 09:47 AM, Neil Horman wrote:
>>>>>>> On Fri, Feb 15, 2013 at 05:57:46PM -0500, Paul Gortmaker wrote:
>>>>>>>> From: Ying Xue <ying.xue@windriver.com>
>> <snip>
> 
>>>> 	
>>>> There are two reasons for this. 
>>>> The first one due to the message oriented nature of the flow control for 
>>>> connections. Max message size is 65k, and max number of unacked messages 
>>>> (at socket level, that is) before the sending process will take a break 
>>>> is 1024. 
>>>> So, simple maths gives that we must allow for 64MB + sk_overhead to guarantee 
>>>> that a connection never is broken because of receiver overload. Contrary to TCP,
>>> Note, this is false, due to the fact that, it presumes that the sender will
>>> honor the congestion window.  Granted, that would be a sender violation, but its
>>> still possible for packets to get lost due to receiver overrun.  
>>
>> The reason for this high limit is exactly to guard against crazy or malevolent 
>> senders. If they respect their send window they will never hit this limit on
>> connections.
>>
> Nope, You don't get to have it both ways - If a sender is malevolent or crazy,
> what makes you think it will respect its send window?

Nothing. That is why we need this extra security measure that this limit provides.
The normal senders will never hit it, the crazy ones will, and have their 
connections broken as a consequence. 
Sorry I didn't express this clearly enough.

> 
>> The fact that
>>> you ack packets before accepting them to the receive queue is the problem that
>>> needs fixing in this case, but that looks like it can be easily accomplished
>>> (see below).
>>>
[...]
> No, I get that, but the fact that TIPC has a shared retransmit queue between all
> sockets on the link isn't an excuse to violate the limits set on an individual
> socket.
> 
>> Granted, the retransmission will take care of the dropped packet, but in the 
>> meantime no other packets can be delivered through from that link, to any 
>> socket. Basically, all TIPC communication between the two involved
>> nodes in the given direction would grind to a halt until the slow or overwhelmed 
>> receiver process has decided to work off his receive queue, something that may 
>> never happen if it is faulty.
> Sounds like a bug.  That should be fixed.

This is not a bug, but an inherent  property of any protocol providing
sequential, guaranteed delivery of packets, TCP inclusive.
If you lose/drop a packet in the sequence, no subsequent packets in
the stream can be delivered until the missing one has been retransmitted and
delivered. To make this delivery dependent on the whims of each and every one of the 
potentially hundreds of receiving processes is simply not a good idea.

I could even show that your proposal would cause almost immediate
deadlock, due to the order incoming and outgoing data paths are grabbing 
the locks. But I would rather drop this part of the discussion; we can 
achieve what we want anyway, with much simpler means. See below.

> 
>> You may see this as a flaw, but it is a consequence of that TIPC is a radically 
>> different design than IP based protocols, designed for completely different needs.
>> The shared link concept is a fundamental feature that has a lot of other advantages 
>> which I won't elaborate on here.
>>
> Very well.  While I'm thinking of it though, you also seem to be making a large
> leap in reasoning - you seem to be repeatedly equating my request to have you
> honor the limits of sk_rcvbuf, with a desire to have that limit be smaller than
> what you currently have it set to.  

Not at all. Just like you, I am trying to find a way to tune up the defaults 
and the the limits using existing mechanisms, so that we can honour the 
nominal value of sk_rcvbuf. 
I think what you suggested may be a way to achieve this, but I am a little 
worried about side effects. I'll explain further down.

> Thats absolutely untrue.  I don't care if
> you set your limit to UINT_MAX/2 on a socket, I just want what you set to be

[...]

>>
>> Ok, I wasn't aware of that. Now, if we could set these parameters from inside 
>> the module, when a socket is created, I think we have what we need. We 
>> don't want to force every socket creator to set these limits explicitly, unless
>> he has some very particular needs and knows what he is doing.
>>
> I've tried to explain this several times now.  You don't have to have set this value to
> what you want programatically, nor do you need to force the value from within
> the module code itself, you can do it administratively.  

This is exactly what I want to avoid. To force the users to set special parameters
during boot up just to make TIPC work would become another obstacle for its 
adoption. TIPC is extremely easy to install and use, and it should remain so.

A programmatic solution is much better, if we can solve it from inside the module,
and this is what I am convinced that we can.
The following is an approach we discussed (inside the team) earlier, but dropped
due to the perceived impossibility to set the sk_rcvbuf to the desired values.
Now this limitation is not there, as I understand, and we can revive our 
proposal.

1: When a TIPC socket is created, sk_rcvbuf is set to 64MB+ (as it was before the 
   latest patch). This is the limit used as last resort against connected peers
   not respecting the connection send window.

2: Datagram messages are checked against fractions of this value, according to
   their importance priority. E.g. LOW->sk_rcvbuf/16,  MEDIUM->sk_rcvbuf/8
   HIGH->sk_rcvbuf/4 and CRITICAL->sk_rcvbuf/2.

3: In the unlikely event that anybody wants to change these limits, he can change
   sk_rcvbuf via setsockopt(SOL_SOCK) either by first changing
   /proc/sys/net/core/rmem_max as you suggest, or via a dedicated setsockopt(SOL_TIPC).
   The latter option would have the advantage that it could enforce a lower limit of
   sk_rcvbuf of 64MB, something that would be unthinkable with SOL_SOCK, because it 
   would obviously affect other protocols.

What do you think?

///jon

If you set:
> 
> /proc/sys/net/core/rmem_default

But this will 
> 
> to the value that you want all your sockets to have, any socket that gets
> initalized with sock_init_data will inherit that value.  Note that, when doing
> so, you may also have to set:
> 
> 
> 
> As you can't adminstratively set your default socket rcvbuf value to something
> larger than the maximum allowed value without raising the maximum allowed value
> first.
> 
> Then all you have to do is make sure those values are set during boot up, and for
> users, it will appear as though nothing has changed.
> 
> Neil
> 
>>

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Neil Horman Feb. 21, 2013, 3:07 p.m. UTC | #9
On Thu, Feb 21, 2013 at 11:24:19AM +0100, Jon Maloy wrote:
> On 02/19/2013 10:44 PM, Neil Horman wrote:
> > On Tue, Feb 19, 2013 at 09:16:40PM +0100, Jon Maloy wrote:
> >> On 02/19/2013 08:18 PM, Neil Horman wrote:
> >>> On Tue, Feb 19, 2013 at 06:54:14PM +0100, Jon Maloy wrote:
> >>>> On 02/19/2013 03:26 PM, Neil Horman wrote:
> >>>>> On Tue, Feb 19, 2013 at 09:07:54AM +0100, Jon Maloy wrote:
> >>>>>> On 02/18/2013 09:47 AM, Neil Horman wrote:
> >>>>>>> On Fri, Feb 15, 2013 at 05:57:46PM -0500, Paul Gortmaker wrote:
> >>>>>>>> From: Ying Xue <ying.xue@windriver.com>
> >> <snip>
> > 
> >>>> 	
> >>>> There are two reasons for this. 
> >>>> The first one due to the message oriented nature of the flow control for 
> >>>> connections. Max message size is 65k, and max number of unacked messages 
> >>>> (at socket level, that is) before the sending process will take a break 
> >>>> is 1024. 
> >>>> So, simple maths gives that we must allow for 64MB + sk_overhead to guarantee 
> >>>> that a connection never is broken because of receiver overload. Contrary to TCP,
> >>> Note, this is false, due to the fact that, it presumes that the sender will
> >>> honor the congestion window.  Granted, that would be a sender violation, but its
> >>> still possible for packets to get lost due to receiver overrun.  
> >>
> >> The reason for this high limit is exactly to guard against crazy or malevolent 
> >> senders. If they respect their send window they will never hit this limit on
> >> connections.
> >>
> > Nope, You don't get to have it both ways - If a sender is malevolent or crazy,
> > what makes you think it will respect its send window?
> 
> Nothing. That is why we need this extra security measure that this limit provides.
> The normal senders will never hit it, the crazy ones will, and have their 
> connections broken as a consequence. 
> Sorry I didn't express this clearly enough.
This isn't a security measure.  If a sender is well
behaved, then as you say, they won't violate their send window, and will
ostensibly behave properly (regardless of what limit you set on sk_rcvbuf, be it
the large one you currently employ, or something bigger or smaller).  If they
are malicious, then they will hit the limit, and they won't care (because
they're
malicious).  They may have their connections broken, but so what?  Its the
receiver that needs to be able to protect itself.  The static limit you set using
this implementation is likely fine for many systems that have sufficient memory,
but consider a system that is memory constrained.  If a user administratively
tunes down sk_rcvbuf to avoid over-consumption of memory on a TIPC socket, that
condition will be ignored (or at least severly violated).  Thats bad, and may
lead to memory exhaustion despite having administratively taken action to fix
it.

> 
> > 
> >> The fact that
> >>> you ack packets before accepting them to the receive queue is the problem that
> >>> needs fixing in this case, but that looks like it can be easily accomplished
> >>> (see below).
> >>>
> [...]
> > No, I get that, but the fact that TIPC has a shared retransmit queue between all
> > sockets on the link isn't an excuse to violate the limits set on an individual
> > socket.
> > 
> >> Granted, the retransmission will take care of the dropped packet, but in the 
> >> meantime no other packets can be delivered through from that link, to any 
> >> socket. Basically, all TIPC communication between the two involved
> >> nodes in the given direction would grind to a halt until the slow or overwhelmed 
> >> receiver process has decided to work off his receive queue, something that may 
> >> never happen if it is faulty.
> > Sounds like a bug.  That should be fixed.
> 
> This is not a bug, but an inherent  property of any protocol providing
> sequential, guaranteed delivery of packets, TCP inclusive.
> If you lose/drop a packet in the sequence, no subsequent packets in
> the stream can be delivered until the missing one has been retransmitted and
> delivered. To make this delivery dependent of the whims of each and any of the 
> potentially hundreds of receiving processes is simply not a good idea.
> 
Clearly sequential packet delivery implies the need to queue and postpone
application delivery of packets behind the reception of a lost packet in
sequence if it is dropped.  The problem I'm referring to here is that you seem
to have built TIPC around the notion of a single sequence namespace per system,
rather than one per socket.  TCP, employing sequential delivery, means that
dropping a packet stalls reception, but only for the socket in question.  The
fact you created TIPC with a design point in which packet loss blocked an entire
system, rather than only the intended receiving application is the issue here,
and doing so doesn't give you the freedom to violate other control points in the
network stack.

> I could even show that your proposal would cause almost immediate
> deadlock, due to the order incoming and outgoing data paths are grabbing 
> the locks. But I would rather drop this part of the discussion; we can 
> achieve what we want anyway, with much simpler means. See below.
> 
Perhaps you could, but if you were to deomonstrate how lowering your recieve
buffer limit (or even just honoring whatever is set), results in a deadlock
(ostensibly within the TIPC protocol code), how do you justify that as a reason
why you should be allowed to simply violate your set socket limits, rather than
recognizing the deadlock as a coding bug that needs to be fixed?  If you like,
we can begin a separate thread on that subject and look into fixing the issue.

> > 
> >> You may see this as a flaw, but it is a consequence of that TIPC is a radically 
> >> different design than IP based protocols, designed for completely different needs.
> >> The shared link concept is a fundamental feature that has a lot of other advantages 
> >> which I won't elaborate on here.
> >>
> > Very well.  While I'm thinking of it though, you also seem to be making a large
> > leap in reasoning - you seem to be repeatedly equating my request to have you
> > honor the limits of sk_rcvbuf, with a desire to have that limit be smaller than
> > what you currently have it set to.  
> 
> Not at all. Just like you, I am trying to find a way to tune up the defaults 
> and the the limits using existing mechanisms, so that we can honour the 
> nominal value of sk_rcvbuf. 
> I think what you suggested may be a way to achieve this, but I am a little 
> worried about side effects. I'll explain further down.
> 
> > Thats absolutely untrue.  I don't care if
> > you set your limit to UINT_MAX/2 on a socket, I just want what you set to be
> 
> [...]
> 
> >>
> >> Ok, I wasn't aware of that. Now, if we could set these parameters from inside 
> >> the module, when a socket is created, we I think we have what we need. We 
> >> don't want to force every socket creator to set these limits explicitly, unless
> >> he has some very particular needs and knows what he is doing.
> >>
> > I've tried to explain this several times now.  You don't have to have set this value to
> > what you want programatically, nor do you need to force the value from within
> > the module code itself, you can do it administratively.  
> 
> This is exactly what I want to avoid. To force the users to set special parameters
> during boot up just to make TIPC work would become another obstacle for the its 
> adoption. TIPC is extremely easy to install and use, and it should remain so.
> 
Theres nothing special about the default values for socket limits.  They're well
known, and well understood.  If the defaults don't work for your protocol, you
can adjust them.  Its two lines in /etc/sysctl.conf, thats it.  And if you were
to implement the memory pressue interface in the protocol, you could likely
skip doing even that.

> A programmatic solution is much better, if we can solve it from inside the module,
> and this is what I am convinced that we can.
> The following is an approach we discussed (inside the team) earlier, but dropped
> due to the perceived impossibility to set the sk_rcvbuf to the desired values.
> Now this limitation is not there, as I understand, and we can revive our 
> proposal.
> 
> 1: When a TIPC socket is created, sk_rcvbuf is set to 64MB+ (as it was before the 
>    latest patch). This is the limit used as last resort against connected peers
>    not respecting the connection send window.
> 
As I noted above, as long as there is a method to change the administrative
default, thats fine.

> 2: Datagram messages are checked against fractions of this value, according to
>    their importance priority. E.g. LOW->sk_rcvbuf/16,  MEDIUM->sk_rcvbuf/8
>    HIGH->sk_rcvbuf/4 and CRITICAL->sk_rcvbuf/2.
> 
Yes, this is what I proposed in my initial email.

> 3: In the unlikely event that anybody wants to change these limits, he can change
>    sk_rcvbuf via setsockopt(SOL_SOCK) either by first changing
>    /proc/sys/net/core/rmem_max as you suggest, or via a dedicated setsockopt(SOL_TIPC).
>    The latter option would have the advantage that it could enforce a lower limit of
>    skr_rcvbuf of 64MB, something that would be unthinkable with SOL_SOCK, because it 
>    would obviously affect other protocols.
Theres really no need for a private TIPC socket option here.  You do this at the
socket level, and if you want to set it higher than rmem_max, you adjust
rmem_max first.  Its just the way the system works.  Every other protocol
conforms to it, theres no reason TIPC shouldn't also.
> 
> What do you think?
> 
Yes, this is for the most part, good.
Thanks
Neil
> ///jon
> 
> If you set:
> > 
> > /proc/sys/net/core/rmem_default
> 
> But this will 
> > 
> > to the value that you want all your sockets to have, any socket that gets
> > initalized with sock_init_data will inherit that value.  Note that, when doing
> > so, you may also have to set:
> > 
> > 
> > 
> > As you can't adminstratively set your default socket rcvbuf value to something
> > larger than the maximum allowed value without raising the maximum allowed value
> > first.
> > 
> > Then all you have to do is make sure those values are set during boot up, and for
> > users, it will appear as though nothing has changed.
> > 
> > Neil
> > 
> >>
> 
> 
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jon Maloy Feb. 21, 2013, 4:54 p.m. UTC | #10
On 02/21/2013 04:07 PM, Neil Horman wrote:
> On Thu, Feb 21, 2013 at 11:24:19AM +0100, Jon Maloy wrote:
>> On 02/19/2013 10:44 PM, Neil Horman wrote:
>>> On Tue, Feb 19, 2013 at 09:16:40PM +0100, Jon Maloy wrote:
>>>> On 02/19/2013 08:18 PM, Neil Horman wrote:
>>>>> On Tue, Feb 19, 2013 at 06:54:14PM +0100, Jon Maloy wrote:
>>>>>> On 02/19/2013 03:26 PM, Neil Horman wrote:
>>>>>>> On Tue, Feb 19, 2013 at 09:07:54AM +0100, Jon Maloy wrote:
>>>>>>>> On 02/18/2013 09:47 AM, Neil Horman wrote:
>>>>>>>>> On Fri, Feb 15, 2013 at 05:57:46PM -0500, Paul Gortmaker wrote:
>>>>>>>>>> From: Ying Xue <ying.xue@windriver.com>
>>>> <snip>
>>>
>>>>>> 	
>>>>>> There are two reasons for this. 
>>>>>> The first one due to the message oriented nature of the flow control for 
>>>>>> connections. Max message size is 65k, and max number of unacked messages 
>>>>>> (at socket level, that is) before the sending process will take a break 
>>>>>> is 1024. 
>>>>>> So, simple maths gives that we must allow for 64MB + sk_overhead to guarantee 
>>>>>> that a connection never is broken because of receiver overload. Contrary to TCP,
>>>>> Note, this is false, due to the fact that, it presumes that the sender will
>>>>> honor the congestion window.  Granted, that would be a sender violation, but its
>>>>> still possible for packets to get lost due to receiver overrun.  
>>>>
>>>> The reason for this high limit is exactly to guard against crazy or malevolent 
>>>> senders. If they respect their send window they will never hit this limit on
>>>> connections.
>>>>
>>> Nope, You don't get to have it both ways - If a sender is malevolent or crazy,
>>> what makes you think it will respect its send window?
>>
>> Nothing. That is why we need this extra security measure that this limit provides.
>> The normal senders will never hit it, the crazy ones will, and have their 
>> connections broken as a consequence. 
>> Sorry I didn't express this clearly enough.
> This isn't a security measure.  If a sender is well
> behaved, then as you say, they won't violate their send window, and will
> ostensibly behave properly (regardless of what limit you set on sk_rcvbuf, be it
> the large one you currently employ, or something bigger or smaller).  If they
> are malicious, then they will hit the limit, and they won't care (because
> they're malicious).  They may have their connections broken, but so what?  Its the
> receiver that needs to be able to protect itself.  The static limit you set using
> this implementation is likely fine for many systems that have sufficient memory,
> but consider a system that is memory constrained. 

The default limit is insanely high, of course, but until we have introduced
byte-oriented flow control this is what we have to deal with, for reasons I
explained earlier.

If anybody want to turn it down, it will probably work well in most cases.
The risk of having connections broken rises from 0 to some low percentage. 
If somebody decides to do this, it is probably because he can accept
that risk, or because he knows the apps he is running are nice, e.g. never send 
out streams of 64k messages the way I described as a worst-case scenario.

> If a user administratively
> tunes down sk_rcvbuf to avoid over-consumption of memory on a TIPC socket, that
> condition will be ignored (or at least severly violated).  Thats bad, and may
> lead to memory exhaustion despite having administratively taken action to fix
> it.
> 
[...]

> Clearly sequential packet delivery implies the need to queue and postpone
> application delivery of packets behind the reception of a lost packet in
> sequence if it is dropped.  The problem I'm referring to here is that you seem
> to have built TIPC around the notion of a single sequence namespace per system,
> rather than one per socket.  

This is the way TIPC and other link-oriented protocols, e.g. LINX, are designed, 
and they have this well-known flip-side. They were never intended to become 
another TCP.
Many users accept this, and value that the advantages you get from this design 
more than outweigh the disadvantages. If they don't, then they should use TCP 
or SCTP instead.

Anyway the design doesn't by necessity mean that we are bound to violate the 
system limits; I think we have established that now.

> TCP, employing sequential delivery, means that
> dropping a packet stalls reception, but only for the socket in question.  The
> fact you created TIPC with a design point in which packet loss blocked an entire
> system, rather than only the intended receiving application is the issue here,
> and doing so doesn't give you the freedom to violate other control points in the
> network stack.
> 
>> I could even show that your proposal would cause almost immediate
>> deadlock, due to the order incoming and outgoing data paths are grabbing 
>> the locks. But I would rather drop this part of the discussion; we can 
>> achieve what we want anyway, with much simpler means. See below.
>>
> Perhaps you could, but if you were to deomonstrate how lowering your recieve
> buffer limit (or even just honoring whatever is set), results in a deadlock
> (ostensibly within the TIPC protocol code), how do you justify that as a reason
> why you should be allowed to simply violate your set socket limits, rather than
> recognizing the deadlock as a coding bug that needs to be fixed?  If you like,
> we can begin a separate thread on that subject and look into fixing the issue.

I wouldn't call it a bug, because it doesn't cause deadlock in the current code,
but it is clearly a design that can be improved.
We do have plans and prototypes ready to greatly simplify the locking structure, 
e.g. by getting rid of the port lock and replace most of the other ones with 
RCU-locks.
There are however a few other changes we need to get in place first, notably to
remove the remnants of the native interface, to make that possible. Stay tuned.

> 
>>>
>>>> You may see this as a flaw, but it is a consequence of that TIPC is a radically 
>>>> different design than IP based protocols, designed for completely different needs.
>>>> The shared link concept is a fundamental feature that has a lot of other advantages 
>>>> which I won't elaborate on here.
>>>>
>>> Very well.  While I'm thinking of it though, you also seem to be making a large
>>> leap in reasoning - you seem to be repeatedly equating my request to have you
>>> honor the limits of sk_rcvbuf, with a desire to have that limit be smaller than
>>> what you currently have it set to.  
>>

[...]

>> The following is an approach we discussed (inside the team) earlier, but dropped
>> due to the perceived impossibility to set the sk_rcvbuf to the desired values.
>> Now this limitation is not there, as I understand, and we can revive our 
>> proposal.
>>
>> 1: When a TIPC socket is created, sk_rcvbuf is set to 64MB+ (as it was before the 
>>    latest patch). This is the limit used as last resort against connected peers
>>    not respecting the connection send window.
>>
> As I noted above, as long as there is a method to change the administrative
> default, thats fine.
> 
>> 2: Datagram messages are checked against fractions of this value, according to
>>    their importance priority. E.g. LOW->sk_rcvbuf/16,  MEDIUM->sk_rcvbuf/8
>>    HIGH->sk_rcvbuf/4 and CRITICAL->sk_rcvbuf/2.
>>
> Yes, this is what I proposed in my initial email.
> 
>> 3: In the unlikely event that anybody wants to change these limits, he can change
>>    sk_rcvbuf via setsockopt(SOL_SOCK) either by first changing
>>    /proc/sys/net/core/rmem_max as you suggest, or via a dedicated setsockopt(SOL_TIPC).
>>    The latter option would have the advantage that it could enforce a lower limit of
>>    skr_rcvbuf of 64MB, something that would be unthinkable with SOL_SOCK, because it 
>>    would obviously affect other protocols.
> Theres really no need for a private TIPC socket option here.  You do this at the
> socket level, and if you want to set it higher than rmem_max, you adjust
> rmem_max first.  Its just the way the system works.  Every other protocol
> conforms to it, theres no reason TIPC shouldn't also.
>>
>> What do you think?
>>
> Yes, this is for the most part, good.

Happy to hear that. So we'll go ahead and make this change.

Thank you for your feedback.
///jon

> Thanks
> Neil
>> ///jon
>>
>> If you set:
>>>
>>> /proc/sys/net/core/rmem_default
>>
>> But this will 
>>>
>>> to the value that you want all your sockets to have, any socket that gets
>>> initalized with sock_init_data will inherit that value.  Note that, when doing
>>> so, you may also have to set:
>>>
>>>
>>>
>>> As you can't adminstratively set your default socket rcvbuf value to something
>>> larger than the maximum allowed value without raising the maximum allowed value
>>> first.
>>>
>>> Then all you have to do is make sure those values are set during boot up, and for
>>> users, it will appear as though nothing has changed.
>>>
>>> Neil
>>>
>>>>
>>
>>

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Neil Horman Feb. 21, 2013, 6:16 p.m. UTC | #11
On Thu, Feb 21, 2013 at 05:54:12PM +0100, Jon Maloy wrote:
> On 02/21/2013 04:07 PM, Neil Horman wrote:
> > On Thu, Feb 21, 2013 at 11:24:19AM +0100, Jon Maloy wrote:
> >> On 02/19/2013 10:44 PM, Neil Horman wrote:
> >>> On Tue, Feb 19, 2013 at 09:16:40PM +0100, Jon Maloy wrote:
> >>>> On 02/19/2013 08:18 PM, Neil Horman wrote:
> >>>>> On Tue, Feb 19, 2013 at 06:54:14PM +0100, Jon Maloy wrote:
> >>>>>> On 02/19/2013 03:26 PM, Neil Horman wrote:
> >>>>>>> On Tue, Feb 19, 2013 at 09:07:54AM +0100, Jon Maloy wrote:
> >>>>>>>> On 02/18/2013 09:47 AM, Neil Horman wrote:
> >>>>>>>>> On Fri, Feb 15, 2013 at 05:57:46PM -0500, Paul Gortmaker wrote:
> >>>>>>>>>> From: Ying Xue <ying.xue@windriver.com>
> >>>> <snip>
> >>>
> >>>>>> 	
> >>>>>> There are two reasons for this. 
> >>>>>> The first one due to the message oriented nature of the flow control for 
> >>>>>> connections. Max message size is 65k, and max number of unacked messages 
> >>>>>> (at socket level, that is) before the sending process will take a break 
> >>>>>> is 1024. 
> >>>>>> So, simple maths gives that we must allow for 64MB + sk_overhead to guarantee 
> >>>>>> that a connection never is broken because of receiver overload. Contrary to TCP,
> >>>>> Note, this is false, due to the fact that, it presumes that the sender will
> >>>>> honor the congestion window.  Granted, that would be a sender violation, but its
> >>>>> still possible for packets to get lost due to receiver overrun.  
> >>>>
> >>>> The reason for this high limit is exactly to guard against crazy or malevolent 
> >>>> senders. If they respect their send window they will never hit this limit on
> >>>> connections.
> >>>>
> >>> Nope, You don't get to have it both ways - If a sender is malevolent or crazy,
> >>> what makes you think it will respect its send window?
> >>
> >> Nothing. That is why we need this extra security measure that this limit provides.
> >> The normal senders will never hit it, the crazy ones will, and have their 
> >> connections broken as a consequence. 
> >> Sorry I didn't express this clearly enough.
> > This isn't a security measure.  If a sender is well
> > behaved, then as you say, they won't violate their send window, and will
> > ostensibly behave properly (regardless of what limit you set on sk_rcvbuf, be it
> > the large one you currently employ, or something bigger or smaller).  If they
> > are malicious, then they will hit the limit, and they won't care (because
> > they're malicious).  They may have their connections broken, but so what?  Its the
> > receiver that needs to be able to protect itself.  The static limit you set using
> > this implementation is likely fine for many systems that have sufficient memory,
> > but consider a system that is memory constrained. 
> 
> The default limit is insanely high of course, but until we have introduced byte 
> oriented flow control this is what we have to deal with, for reasons I explained
> earlier.
> 
> If anybody want to turn it down, it will probably work well in most cases.
> The risk of having connections broken rises from 0 to some low percentage. 
> If somebody decides to do this, it is probably because he can accept
> that risk, or because he knows the apps he is running are nice, e.g. never send 
> out streams of 64k messages the way I described as a worst-case scenario.
> 
> > If a user administratively
> > tunes down sk_rcvbuf to avoid over-consumption of memory on a TIPC socket, that
> > condition will be ignored (or at least severly violated).  Thats bad, and may
> > lead to memory exhaustion despite having administratively taken action to fix
> > it.
> > 
> [...]
> 
> > Clearly sequential packet delivery implies the need to queue and postpone
> > application delivery of packets behind the reception of a lost packet in
> > sequence if it is dropped.  The problem I'm referring to here is that you seem
> > to have built TIPC around the notion of a single sequence namespace per system,
> > rather than one per socket.  
> 
> This is the way TIPC and other link-oriented protocols, e.g. LINX, are designed, 
> and they have this well-known flip-side. They were never intended to become 
> another TCP.
> Many users accept this, and value that the advantages you get from this design 
> more than outweigh the disadvantages. If they don't, then they should use TCP 
> or SCTP instead.
> 
> Anyway the design doesn't by necessity mean that we are bound to violate the 
> system limits; I think we have established that now.
> 
Agreed, you're not bound to deterministically violate the system limits - my
only request is that you honor them should you reach them.

> >> I could even show that your proposal would cause almost immediate
> >> deadlock, due to the order incoming and outgoing data paths are grabbing 
> >> the locks. But I would rather drop this part of the discussion; we can 
> >> achieve what we want anyway, with much simpler means. See below.
> >>
> > Perhaps you could, but if you were to deomonstrate how lowering your recieve
> > buffer limit (or even just honoring whatever is set), results in a deadlock
> > (ostensibly within the TIPC protocol code), how do you justify that as a reason
> > why you should be allowed to simply violate your set socket limits, rather than
> > recognizing the deadlock as a coding bug that needs to be fixed?  If you like,
> > we can begin a separate thread on that subject and look into fixing the issue.
> 
> I wouldn't call it a bug, because it doesn't cause deadlock in the current code,
> but it is clearly a design that can be improved.
I don't understand this - Above you said you could demonstrate how my proposal
(which was to drop packets when they surpassed the sk_rcvbuf limit), would cause
deadlock - if that happens, you have a locking bug.  If the only reason this
does not happen currently is because you allow for a large overrun of your
set sk_rcvbuf, then ostensibly a lockup can still be triggered if you have a
misbehaving sender that is willing to send frames past its congestion window.
So I think the root question here is: Does the code currently deadlock if you
drop frames in the receive path?  If not, then we should be ok, regardless of
what the rcvbuf value is set to.  If so, then we have a bug that needs fixing.

> We do have plans and prototypes ready to greatly simplify the locking structure, 
> e.g. by getting rid of the port lock and replace most of the other ones with 
> RCU-locks.
> There are however a few other changes we need to get in place first, notably to
> remove the remnants of the native interface, to make that possible. Stay tuned.
> 
Okey dokey.

> >>
> > As I noted above, as long as there is a method to change the administrative
> > default, thats fine.
> > 
> >> 2: Datagram messages are checked against fractions of this value, according to
> >>    their importance priority. E.g. LOW->sk_rcvbuf/16,  MEDIUM->sk_rcvbuf/8
> >>    HIGH->sk_rcvbuf/4 and CRITICAL->sk_rcvbuf/2.
> >>
> > Yes, this is what I proposed in my initial email.
> > 
> >> 3: In the unlikely event that anybody wants to change these limits, he can change
> >>    sk_rcvbuf via setsockopt(SOL_SOCK) either by first changing
> >>    /proc/sys/net/core/rmem_max as you suggest, or via a dedicated setsockopt(SOL_TIPC).
> >>    The latter option would have the advantage that it could enforce a lower limit of
> >>    skr_rcvbuf of 64MB, something that would be unthinkable with SOL_SOCK, because it 
> >>    would obviously affect other protocols.
> > Theres really no need for a private TIPC socket option here.  You do this at the
> > socket level, and if you want to set it higher than rmem_max, you adjust
> > rmem_max first.  Its just the way the system works.  Every other protocol
> > conforms to it, theres no reason TIPC shouldn't also.
> >>
> >> What do you think?
> >>
> > Yes, this is for the most part, good.
> 
> Happy to hear that. So we'll go ahead and make this change.
> 
> Thank you for your feedback.
Sounds good, thank you!
Neil

> ///jon
> 
> > Thanks
> > Neil
> >> ///jon
> >>
> >> If you set:
> >>>
> >>> /proc/sys/net/core/rmem_default
> >>
> >> But this will 
> >>>
> >>> to the value that you want all your sockets to have, any socket that gets
> >>> initalized with sock_init_data will inherit that value.  Note that, when doing
> >>> so, you may also have to set:
> >>>
> >>>
> >>>
> >>> As you can't adminstratively set your default socket rcvbuf value to something
> >>> larger than the maximum allowed value without raising the maximum allowed value
> >>> first.
> >>>
> >>> Then all you have to do is make sure those values are set during boot up, and for
> >>> users, it will appear as though nothing has changed.
> >>>
> >>> Neil
> >>>
> >>>>
> >>
> >>
> 
> 
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jon Maloy Feb. 21, 2013, 9:05 p.m. UTC | #12
On 02/21/2013 07:16 PM, Neil Horman wrote:
> On Thu, Feb 21, 2013 at 05:54:12PM +0100, Jon Maloy wrote:
>> On 02/21/2013 04:07 PM, Neil Horman wrote:
>>> On Thu, Feb 21, 2013 at 11:24:19AM +0100, Jon Maloy wrote:
>>>> On 02/19/2013 10:44 PM, Neil Horman wrote:
>>>>> On Tue, Feb 19, 2013 at 09:16:40PM +0100, Jon Maloy wrote:
>>>>>> On 02/19/2013 08:18 PM, Neil Horman wrote:
>>>>>>> On Tue, Feb 19, 2013 at 06:54:14PM +0100, Jon Maloy wrote:
>>>>>>>> On 02/19/2013 03:26 PM, Neil Horman wrote:
>>>>>>>>> On Tue, Feb 19, 2013 at 09:07:54AM +0100, Jon Maloy wrote:
>>>>>>>>>> On 02/18/2013 09:47 AM, Neil Horman wrote:
>>>>>>>>>>> On Fri, Feb 15, 2013 at 05:57:46PM -0500, Paul Gortmaker wrote:
>>>>>>>>>>>> From: Ying Xue <ying.xue@windriver.com>
>>>>>> <snip>
>> I wouldn't call it a bug, because it doesn't cause deadlock in the current code,
>> but it is clearly a design that can be improved.
> I don't understand this - Above you said you could demonstrate how my proposal
> (which was to drop packets when they surpassed the sk_rcvbuf limit), would cause
> deadlock - if that happens, you have a locking bug.  If the only reason this
> does not happen currently is because you allow for a large overrun of your
> set sk_rcvbuf, then ostensibly a lockup can still be triggered if you have a
> misbehaving sender that is willing to send frames past its congestion window.
> So I think the root question here is: Does the code currently deadlock if you
> drop frames in the receive path? 
No. We can drop as many as we want, the retransmission protocol will
take care of that, and that part is pretty robust by now.
But it *would* deadlock if we tried to read fields in the sock structure, with
the necessary grabbing of locks that involves, from within the scope of
tipc_recv_msg, which is at a completely different level in the stack.

Since we don't do that in the current code, there is no deadlock problem.

///jon

[...]
>
>>>>
>>

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Neil Horman Feb. 21, 2013, 9:35 p.m. UTC | #13
On Thu, Feb 21, 2013 at 10:05:37PM +0100, Jon Maloy wrote:
> On 02/21/2013 07:16 PM, Neil Horman wrote:
> > On Thu, Feb 21, 2013 at 05:54:12PM +0100, Jon Maloy wrote:
> >> On 02/21/2013 04:07 PM, Neil Horman wrote:
> >>> On Thu, Feb 21, 2013 at 11:24:19AM +0100, Jon Maloy wrote:
> >>>> On 02/19/2013 10:44 PM, Neil Horman wrote:
> >>>>> On Tue, Feb 19, 2013 at 09:16:40PM +0100, Jon Maloy wrote:
> >>>>>> On 02/19/2013 08:18 PM, Neil Horman wrote:
> >>>>>>> On Tue, Feb 19, 2013 at 06:54:14PM +0100, Jon Maloy wrote:
> >>>>>>>> On 02/19/2013 03:26 PM, Neil Horman wrote:
> >>>>>>>>> On Tue, Feb 19, 2013 at 09:07:54AM +0100, Jon Maloy wrote:
> >>>>>>>>>> On 02/18/2013 09:47 AM, Neil Horman wrote:
> >>>>>>>>>>> On Fri, Feb 15, 2013 at 05:57:46PM -0500, Paul Gortmaker wrote:
> >>>>>>>>>>>> From: Ying Xue <ying.xue@windriver.com>
> >>>>>> <snip>
> >> I wouldn't call it a bug, because it doesn't cause deadlock in the current code,
> >> but it is clearly a design that can be improved.
> > I don't understand this - Above you said you could demonstrate how my proposal
> > (which was to drop packets when they surpassed the sk_rcvbuf limit), would cause
> > deadlock - if that happens, you have a locking bug.  If the only reason this
> > does not happen currently is because you allow for a large overrun of your
> > set sk_rcvbuf, then ostensibly a lockup can still be triggered if you have a
> > misbehaving sender that is willing to send frames past its congestion window.
> > So I think the root question here is: Does the code currently deadlock if you
> > drop frames in the receive path? 
> No. We can drop as as many as we want, the retransmission protocol will
> take hand of that, and that part is pretty robust by now.
> But it *would* deadlock if we tried to read fields in the sock structure, with
> the necessary grabbing of locks that involves, from within the scope of
> tipc_recv_msg, which is at a completely different level in the stack.
> 
> Since we don't do that in the current code, there is no deadlock problem.
> 
Theres tons of protocols that read socket structures that low in the receive
path, in fact (with the exception of TIPC) they all do, specifically for the
purpose of being able to check, among other things, the socket buffer receive
limit.  See sctp_rcv, tcp_v4_rcv, tcp_v6_rcv, _udp4_lib_rcv, etc for examples.
Looking at tipc_recv_msg, it looks to me like you need to grab the
tipc_port_lock for a short critical section to get the tipc_port struct, from
which (as we previously discussed, you can get the socket).  Presuming you've
done appropriate reference counting on your socket, thats it.  One lock, that
you take and release in several other places in the same call path.

Neil

> ///jon
> 
> [...]
> >
> >>>>
> >>
> 
> 
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jon Maloy Feb. 22, 2013, 11:18 a.m. UTC | #14
On 02/21/2013 10:35 PM, Neil Horman wrote:
> On Thu, Feb 21, 2013 at 10:05:37PM +0100, Jon Maloy wrote:
>> On 02/21/2013 07:16 PM, Neil Horman wrote:
>>> On Thu, Feb 21, 2013 at 05:54:12PM +0100, Jon Maloy wrote:
>>>> On 02/21/2013 04:07 PM, Neil Horman wrote:
>>>>> On Thu, Feb 21, 2013 at 11:24:19AM +0100, Jon Maloy wrote:
>>>>>> On 02/19/2013 10:44 PM, Neil Horman wrote:
>>>>>>> On Tue, Feb 19, 2013 at 09:16:40PM +0100, Jon Maloy wrote:
>>>>>>>> On 02/19/2013 08:18 PM, Neil Horman wrote:
>>>>>>>>> On Tue, Feb 19, 2013 at 06:54:14PM +0100, Jon Maloy wrote:
>>>>>>>>>> On 02/19/2013 03:26 PM, Neil Horman wrote:
>>>>>>>>>>> On Tue, Feb 19, 2013 at 09:07:54AM +0100, Jon Maloy wrote:
>>>>>>>>>>>> On 02/18/2013 09:47 AM, Neil Horman wrote:
>>>>>>>>>>>>> On Fri, Feb 15, 2013 at 05:57:46PM -0500, Paul Gortmaker wrote:
>>>>>>>>>>>>>> From: Ying Xue <ying.xue@windriver.com>
>>>>>>>> <snip>
>>>> I wouldn't call it a bug, because it doesn't cause deadlock in the current code,
>>>> but it is clearly a design that can be improved.
>>> I don't understand this - Above you said you could demonstrate how my proposal
>>> (which was to drop packets when they surpassed the sk_rcvbuf limit), would cause
>>> deadlock - if that happens, you have a locking bug.  If the only reason this
>>> does not happen currently is because you allow for a large overrun of your
>>> set sk_rcvbuf, then ostensibly a lockup can still be triggered if you have a
>>> misbehaving sender that is willing to send frames past its congestion window.
>>> So I think the root question here is: Does the code currently deadlock if you
>>> drop frames in the receive path? 
>> No. We can drop as as many as we want, the retransmission protocol will
>> take hand of that, and that part is pretty robust by now.
>> But it *would* deadlock if we tried to read fields in the sock structure, with
>> the necessary grabbing of locks that involves, from within the scope of
>> tipc_recv_msg, which is at a completely different level in the stack.
>>
>> Since we don't do that in the current code, there is no deadlock problem.
>>
> Theres tons of protocols that read socket structures that low in the receive
> path, in fact (with the exception of TIPC) they all do, specifically for the
> purpose of being able to check, among other things, the socket buffer recieve
> limit.  See sctp_rcv, tcp_v4_rcv, tcp_v6_rcv, _udp4_lib_rcv, etc for examples.

All those examples are trivial in this respect, because they either have
no retransmission layer at all, or the retransmission is bound to the socket 
name space itself. If you first have to lookup the socket even to perform the 
most basic protocol check it goes without saying that you can first take a 
quick look into sk_rcvbuf before going on. Besides, it doesn't cost any 
performance with this approach.

There are, or have  been, other protocols, TIPC, LINX, HDLC, LAPB, MTP2 etc. 
where reliability is ensured at a logical data link layer (L2.5), and where 
this kind of early endpoint access is less obvious, but not impossible.
TIPC is the only one of these present in the Linux kernel and sporting
a socket API, though; that's why we are having this discussion.

> Looking at tipc_recv_msg, it looks to me like you need to grab the
> tipc_port_lock for a short critical section to get the tipc_port struct, from
> which (as we previously discussed, you can get the socket).  Presuming you've
> done appropriate reference counting on your socket, thats it.  One lock, that
> you take and release in several other places in the same call path.

Ok. Currently it looks like this:

Outgoing data path:
------------------

grab socket lock
   grab net lock
     grab node_lock
       <send packet(s)>
     release node lock
   release net lock
release socket lock


Incoming data path:
-------------------

grab net lock (read mode)
    <do some packet sanity check>
    grab node lock
       <perform link level protocol check>
    release node lock'
release net lock
grab port lock
  grab socket lock
     <check sk_rcvbuf>
     <deliver or reject>
  release socket lock
release port lock

As I interpreted your suggestion, incoming data path
would become as follows:
-----------------------

grab net lock (read mode)
   <do some packet sanity check>
   grab node lock
-->   grab port lock
        grab socket lock
           <check sk_rcvbuf>
        release socket lock
-->   release port lock
      <perform link level protocol check>
    release node lock
release net lock
grab port lock
  grab socket lock
     <check sk_rcvbuf>
     <deliver or reject>
  release socket lock
release port lock
 
I.e., deadlock would occur almost immediately.


Now, having slept on it, I see we could also do:
-----------------------------------------------

grab port lock
   grab socket lock
       check sk_rcvbuf>
   release socket lock
release port lock
grab net lock (read mode)
     <do some packet sanity check>
     grab node lock
        <perform link level protocol check>
     release node lock'
release net lock'
grab port lock
  grab socket lock
     <check sk_rcvbuf>
     <deliver or reject>
  release socket lock
release port lock

If this is what you meant, then you are right.
It would work, although it would severely impact performance.

But the fact that it works technically doesn't mean it is the 
right thing to do, because of the way it would mess up the 
overall packet flow. 
This is not a good solution! 

And I think there are better ways...
If we really want to improve the solution we agreed on
yesterday we should rather go for a scheme adding back-pressure
to the sender socket, even for connectionless datagram messages, 
not only connections as we do now. We have some ideas there, and 
you are welcome to participate in the discussions. 
Maybe another thread at tipc-discussion?

Regards
///jon




> 
> Neil
> 
>> ///jon
>>
>> [...]
>>>
>>>>>>
>>>>
>>
>>

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Laight Feb. 22, 2013, 11:54 a.m. UTC | #15
> grab net lock (read mode)
>    <do some packet sanity check>
>    grab node lock
> -->   grab port lock
>         grab socket lock
>            <check sk_rcvbuf>
>         release socket lock
> -->   release port lock
>       <perform link level protocol check>
>     release node lock
> release net lock
> grab port lock
>   grab socket lock
>      <check sk_rcvbuf>
>      <deliver or reject>
>   release socket lock
> release port lock

You probably don't need to grab the socket lock itself, you
only need to be able to read a 'receive buffer full' marker.
That could (probably) be done with some careful coding in the
socket create/delete paths.

Another option is to remember that the socket became full
the previous time, and only do the early check of the
receive buffer state when it had previously been full.

	David



--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Neil Horman Feb. 22, 2013, 12:08 p.m. UTC | #16
On Fri, Feb 22, 2013 at 12:18:52PM +0100, Jon Maloy wrote:
> On 02/21/2013 10:35 PM, Neil Horman wrote:
> > On Thu, Feb 21, 2013 at 10:05:37PM +0100, Jon Maloy wrote:
> >> On 02/21/2013 07:16 PM, Neil Horman wrote:
> >>> On Thu, Feb 21, 2013 at 05:54:12PM +0100, Jon Maloy wrote:
> >>>> On 02/21/2013 04:07 PM, Neil Horman wrote:
> >>>>> On Thu, Feb 21, 2013 at 11:24:19AM +0100, Jon Maloy wrote:
> >>>>>> On 02/19/2013 10:44 PM, Neil Horman wrote:
> >>>>>>> On Tue, Feb 19, 2013 at 09:16:40PM +0100, Jon Maloy wrote:
> >>>>>>>> On 02/19/2013 08:18 PM, Neil Horman wrote:
> >>>>>>>>> On Tue, Feb 19, 2013 at 06:54:14PM +0100, Jon Maloy wrote:
> >>>>>>>>>> On 02/19/2013 03:26 PM, Neil Horman wrote:
> >>>>>>>>>>> On Tue, Feb 19, 2013 at 09:07:54AM +0100, Jon Maloy wrote:
> >>>>>>>>>>>> On 02/18/2013 09:47 AM, Neil Horman wrote:
> >>>>>>>>>>>>> On Fri, Feb 15, 2013 at 05:57:46PM -0500, Paul Gortmaker wrote:
> >>>>>>>>>>>>>> From: Ying Xue <ying.xue@windriver.com>
> >>>>>>>> <snip>
><snip>
> 
> grab net lock (read mode)
>    <do some packet sanity check>
>    grab node lock
> -->   grab port lock
>         grab socket lock
>            <check sk_rcvbuf>
>         release socket lock
> -->   release port lock
>       <perform link level protocol check>
>     release node lock
> release net lock
> grab port lock
>   grab socket lock
>      <check sk_rcvbuf>
>      <deliver or reject>
>   release socket lock
> release port lock
>  
> I.e., deadlock would occur almost immediately.
> 
> 
> Now, having slept on it, I see we could also do:
> -----------------------------------------------
> 
> grab port lock
>    grab socket lock
>        check sk_rcvbuf>
>    release socket lock
> release port lock
> grab net lock (read mode)
>      <do some packet sanity check>
>      grab node lock
>         <perform link level protocol check>
>      release node lock'
> release net lock'
> grab port lock
>   grab socket lock
>      <check sk_rcvbuf>
>      <deliver or reject>
>   release socket lock
> release port lock
> 
> If this is what you meant, then you are right.
Yes, this is one of the options I had meant.  Another one would be to pass a
reference of b_ptr up to the dispatch function (as everthing else is discernable
from the message buffer), and send the ack from there.

> It would work, although it would severely impact performance.
> 
> But the fact that it works technically doesn't mean it is the 
> right thing to do, because of the way it would mess up the 
> overall packet flow. 
> This is not a good solution! 
> 
How many times have we gone over this?
A) Impacting performance isn't an excuse for being able to overwhelm a system by
flooding it with traffic

B) I'm not advocating that you lower you receive buffer limit by default, only
that if someone chooses to, they be able to do so (and accept the performance
consequences thereof).

> And I think there are better ways...
> If we really want to improve the solution we agreed on
> yesterday we should rather go for a scheme adding back-pressure
> to the sender socket, even for connectionless datagram messages, 
As long as you drop frames when you supersede the limits set by the user on your
socket buffer, sure.

> not only connections as we do now. We have some ideas there, and 
> you are welcome to participate in the discussions. 
> Maybe another thread at tipc-discussion?
> 
Sure

> Regards
> ///jon
> 
> 
> 
> 
> > 
> > Neil
> > 
> >> ///jon
> >>
> >> [...]
> >>>
> >>>>>>
> >>>>
> >>
> >>
> 
> 
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index f6ceecd..cbe2f6e 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -43,7 +43,8 @@ 
 #define SS_LISTENING	-1	/* socket is listening */
 #define SS_READY	-2	/* socket is connectionless */
 
-#define OVERLOAD_LIMIT_BASE	10000
+#define CONN_OVERLOAD_LIMIT	((TIPC_FLOW_CONTROL_WIN * 2 + 1) * \
+				SKB_TRUESIZE(TIPC_MAX_USER_MSG_SIZE))
 #define CONN_TIMEOUT_DEFAULT	8000	/* default connect timeout = 8s */
 
 struct tipc_sock {
@@ -202,7 +203,6 @@  static int tipc_create(struct net *net, struct socket *sock, int protocol,
 
 	sock_init_data(sock, sk);
 	sk->sk_backlog_rcv = backlog_rcv;
-	sk->sk_rcvbuf = TIPC_FLOW_CONTROL_WIN * 2 * TIPC_MAX_USER_MSG_SIZE * 2;
 	sk->sk_data_ready = tipc_data_ready;
 	sk->sk_write_space = tipc_write_space;
 	tipc_sk(sk)->p = tp_ptr;
@@ -1142,34 +1142,6 @@  static void tipc_data_ready(struct sock *sk, int len)
 }
 
 /**
- * rx_queue_full - determine if receive queue can accept another message
- * @msg: message to be added to queue
- * @queue_size: current size of queue
- * @base: nominal maximum size of queue
- *
- * Returns 1 if queue is unable to accept message, 0 otherwise
- */
-static int rx_queue_full(struct tipc_msg *msg, u32 queue_size, u32 base)
-{
-	u32 threshold;
-	u32 imp = msg_importance(msg);
-
-	if (imp == TIPC_LOW_IMPORTANCE)
-		threshold = base;
-	else if (imp == TIPC_MEDIUM_IMPORTANCE)
-		threshold = base * 2;
-	else if (imp == TIPC_HIGH_IMPORTANCE)
-		threshold = base * 100;
-	else
-		return 0;
-
-	if (msg_connected(msg))
-		threshold *= 4;
-
-	return queue_size >= threshold;
-}
-
-/**
  * filter_connect - Handle all incoming messages for a connection-based socket
  * @tsock: TIPC socket
  * @msg: message
@@ -1247,6 +1219,36 @@  static u32 filter_connect(struct tipc_sock *tsock, struct sk_buff **buf)
 }
 
 /**
+ * rcvbuf_limit - get proper overload limit of socket receive queue
+ * @sk: socket
+ * @buf: message
+ *
+ * For all connection oriented messages, irrespective of importance,
+ * the default overload value (i.e. 67MB) is set as limit.
+ *
+ * For all connectionless messages, by default new queue limits are
+ * as below:
+ *
+ * TIPC_LOW_IMPORTANCE       (5MB)
+ * TIPC_MEDIUM_IMPORTANCE    (10MB)
+ * TIPC_HIGH_IMPORTANCE      (20MB)
+ * TIPC_CRITICAL_IMPORTANCE  (40MB)
+ *
+ * Returns overload limit according to corresponding message importance
+ */
+static unsigned int rcvbuf_limit(struct sock *sk, struct sk_buff *buf)
+{
+	struct tipc_msg *msg = buf_msg(buf);
+	unsigned int limit;
+
+	if (msg_connected(msg))
+		limit = CONN_OVERLOAD_LIMIT;
+	else
+		limit = sk->sk_rcvbuf << (msg_importance(msg) + 5);
+	return limit;
+}
+
+/**
  * filter_rcv - validate incoming message
  * @sk: socket
  * @buf: message
@@ -1262,7 +1264,7 @@  static u32 filter_rcv(struct sock *sk, struct sk_buff *buf)
 {
 	struct socket *sock = sk->sk_socket;
 	struct tipc_msg *msg = buf_msg(buf);
-	u32 recv_q_len;
+	unsigned int limit = rcvbuf_limit(sk, buf);
 	u32 res = TIPC_OK;
 
 	/* Reject message if it is wrong sort of message for socket */
@@ -1279,15 +1281,13 @@  static u32 filter_rcv(struct sock *sk, struct sk_buff *buf)
 	}
 
 	/* Reject message if there isn't room to queue it */
-	recv_q_len = skb_queue_len(&sk->sk_receive_queue);
-	if (unlikely(recv_q_len >= (OVERLOAD_LIMIT_BASE / 2))) {
-		if (rx_queue_full(msg, recv_q_len, OVERLOAD_LIMIT_BASE / 2))
-			return TIPC_ERR_OVERLOAD;
-	}
+	if (sk_rmem_alloc_get(sk) + buf->truesize >= limit)
+		return TIPC_ERR_OVERLOAD;
 
-	/* Enqueue message (finally!) */
+	/* Enqueue message */
 	TIPC_SKB_CB(buf)->handle = 0;
 	__skb_queue_tail(&sk->sk_receive_queue, buf);
+	skb_set_owner_r(buf, sk);
 
 	sk->sk_data_ready(sk, 0);
 	return TIPC_OK;
@@ -1336,7 +1336,7 @@  static u32 dispatch(struct tipc_port *tport, struct sk_buff *buf)
 	if (!sock_owned_by_user(sk)) {
 		res = filter_rcv(sk, buf);
 	} else {
-		if (sk_add_backlog(sk, buf, sk->sk_rcvbuf))
+		if (sk_add_backlog(sk, buf, rcvbuf_limit(sk, buf)))
 			res = TIPC_ERR_OVERLOAD;
 		else
 			res = TIPC_OK;
@@ -1570,6 +1570,7 @@  static int accept(struct socket *sock, struct socket *new_sock, int flags)
 	} else {
 		__skb_dequeue(&sk->sk_receive_queue);
 		__skb_queue_head(&new_sk->sk_receive_queue, buf);
+		skb_set_owner_r(buf, new_sk);
 	}
 	release_sock(new_sk);