diff mbox

[net-next,v3,3/3] net: sctp: Add support for MSG_MORE on SCTP

Message ID 063D6719AE5E284EB5DD2968C1650D6D1727AF02@AcuExch.aculab.com
State Changes Requested, archived
Delegated to: David Miller
Headers show

Commit Message

David Laight July 22, 2014, 8:59 a.m. UTC
If MSG_MORE is set then the data chunk will be buffered until either
a full packet would be generated, or something causes a chunk to be
sent (eg data without MSG_MORE or a heartbeat).

The MSG_MORE flag is saved 'per association' along with a copy
of the SCTP_NODELAY/Nagle flag.

It is expected that an application will only set MSG_MORE when it
has an additional data chunk ready to send. The sends could be done
with a single sendmmsg() system call.

Signed-off-by: David Laight <david.laight@aculab.com>
---

Resend with corrected subject line.

Changes from v2:
- MSG_MORE is now saved per association (not per socket)
- The first data chunk is also not sent

 include/net/sctp/structs.h |  9 ++++++++-
 net/sctp/endpointola.c     |  3 +++
 net/sctp/output.c          | 16 ++++++++++++----
 net/sctp/socket.c          | 24 +++++++++++++++++++++---
 4 files changed, 44 insertions(+), 8 deletions(-)

Comments

Vladislav Yasevich July 22, 2014, 1:23 p.m. UTC | #1
On 07/22/2014 04:59 AM, David Laight wrote:
> If MSG_MORE is set then the data chunk will be buffered until either
> a full packet would be generated, or something causes a chunk to be
> sent (eg data without MSG_MORE or a heartbeat).

heartbeat will not cause a data flush.  Only SACKs do that as they control
congestion and flow.

That might actually be a good solution to the problem of an application incorrectly
using MSG_MORE.  When a SACK drops inflight to 0, clear MSG_MORE from the
association, thus allowing any queued data (even less than MTU) to be flushed.

This way, when the data flow just starts, you can use MSG_MORE to control
bundling.  However, if the app stops sending, even if it forgot to clear MSG_MORE,
we'll clear it ourselves once inflight drops to 0.

> 
> The MSG_MORE flag is saved 'per association' along with a copy
> of the SCTP_NODELAY/Nagle flag.
> 
> It is expected that an application will only set MSG_MORE when it
> has an additional data chunk ready to send. The sends could be done
> with a single sendmmsg() system call.

Is that really true?  If the application has 5 messages and it sends all
5 with the sendmmsg(), then MSG_MORE will never get cleared and a flush
would not get triggered.


> 
> Signed-off-by: David Laight <david.laight@aculab.com>
> ---
> 
> Resend with corrected subject line.
> 
> Changes from v2:
> - MSG_MORE is now saved per association (not per socket)
> - The first data chunk is also not sent
> 
>  include/net/sctp/structs.h |  9 ++++++++-
>  net/sctp/endpointola.c     |  3 +++
>  net/sctp/output.c          | 16 ++++++++++++----
>  net/sctp/socket.c          | 24 +++++++++++++++++++++---
>  4 files changed, 44 insertions(+), 8 deletions(-)
> 
> diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
> index 0dfcc92..441320a 100644
> --- a/include/net/sctp/structs.h
> +++ b/include/net/sctp/structs.h
> @@ -209,7 +209,11 @@ struct sctp_sock {
>  	struct sctp_assocparams assocparams;
>  	int user_frag;
>  	__u32 autoclose;
> -	__u8 nodelay;
> +
> +#define	SCTP_F_TX_NODELAY	0
> +#define	SCTP_F_TX_NAGLE		1	/* SCTP_NODELAY not set */
> +#define	SCTP_F_TX_MSG_MORE	2	/* MSG_MORE set on last send */

Why SCTP_F_TX and not just SCTP_TX_?  What does the _F stand for?

> +	__u8 tx_delay;
>  	__u8 disable_fragments;
>  	__u8 v4mapped;
>  	__u8 frag_interleave;
> @@ -1581,6 +1585,9 @@ struct sctp_association {
>  	/* Flag that path mtu update is pending */
>  	__u8   pmtu_pending;
>  
> +	/* SCTP_F_TX_xxx, Nagle copied from socket */
> +	__u8 tx_delay;
> +
>  	/* Association : The smallest PMTU discovered for all of the
>  	 * PMTU	       : peer's transport addresses.
>  	 */
> diff --git a/net/sctp/endpointola.c b/net/sctp/endpointola.c
> index 3d9f429..077220f 100644
> --- a/net/sctp/endpointola.c
> +++ b/net/sctp/endpointola.c
> @@ -221,6 +221,9 @@ void sctp_endpoint_add_asoc(struct sctp_endpoint *ep,
>  	/* Increment the backlog value for a TCP-style listening socket. */
>  	if (sctp_style(sk, TCP) && sctp_sstate(sk, LISTENING))
>  		sk->sk_ack_backlog++;
> +
> +	/* Cache SCTP_NODELAY (aka Nagle) state */
> +	asoc->tx_delay = sctp_sk(sk)->tx_delay;

state inheritance like this is usually done in sctp_association_init().

>  }
>  
>  /* Free the endpoint structure.  Delay cleanup until
> diff --git a/net/sctp/output.c b/net/sctp/output.c
> index 7f28a8e..275a1ab 100644
> --- a/net/sctp/output.c
> +++ b/net/sctp/output.c
> @@ -679,22 +679,30 @@ static sctp_xmit_t sctp_packet_can_append_data(struct sctp_packet *packet,
>  	    flight_size >= transport->cwnd)
>  		return SCTP_XMIT_RWND_FULL;
>  
> +	/* If MSG_MORE is set we probably shouldn't create a new message.
> +	 * However unless we also implement a timeout (preferable settable
> +	 * as a socket option) then data could easily be left unsent.
> +	 * Instead we ignore MSG_MORE on the first data chunk.
> +	 * This makes the implementation of MSG_MORE the same as the
> +	 * implementation of Nagle.
> +	 */
> +
>  	/* Nagle's algorithm to solve small-packet problem:
>  	 * Inhibit the sending of new chunks when new outgoing data arrives
>  	 * if any previously transmitted data on the connection remains
>  	 * unacknowledged.
>  	 */
>  
> -	if (sctp_sk(asoc->base.sk)->nodelay)
> -		/* Nagle disabled */
> +	if (asoc->tx_delay == SCTP_F_TX_NODELAY)
> +		/* Nagle disabled and MSG_MORE unset */
>  		return SCTP_XMIT_OK;
>  
>  	if (!sctp_packet_empty(packet))
>  		/* Append to packet */
>  		return SCTP_XMIT_OK;
>  
> -	if (inflight == 0)
> -		/* Nothing unacked */
> +	if (inflight == 0 && !(asoc->tx_delay & SCTP_F_TX_MSG_MORE))
> +		/* Nothing unacked and application isn't going to send more */
>  		return SCTP_XMIT_OK;
>  	
>  	if (!sctp_state(asoc, ESTABLISHED))
> diff --git a/net/sctp/socket.c b/net/sctp/socket.c
> index fee06b9..73a421d 100644
> --- a/net/sctp/socket.c
> +++ b/net/sctp/socket.c
> @@ -1927,6 +1927,18 @@ static int sctp_sendmsg(struct kiocb *iocb, struct sock *sk,
>  		pr_debug("%s: we associated primitively\n", __func__);
>  	}
>  
> +	/* Setting MSG_MORE currently has the same effect as enabling Nagle.
> +	 * This means that the user can't force bundling of the first two data
> +	 * chunks.  It does mean that all the data chunks will be sent
> +	 * without an extra timer.
> +	 * It is enough to save the last value since any data sent with
> +	 * MSG_MORE clear will already have been sent (subject to flow control).
> +	 */
> +	if (msg->msg_flags & MSG_MORE)
> +		asoc->tx_delay |= SCTP_F_TX_MSG_MORE;
> +	else
> +		asoc->tx_delay &= ~SCTP_F_TX_MSG_MORE;
> +
>  	/* Break the message into multiple chunks of maximum size. */
>  	datamsg = sctp_datamsg_from_user(asoc, sinfo, msg, msg_len);
>  	if (IS_ERR(datamsg)) {
> @@ -2814,6 +2826,7 @@ static int sctp_setsockopt_primary_addr(struct sock *sk, char __user *optval,
>  static int sctp_setsockopt_nodelay(struct sock *sk, char __user *optval,
>  				   unsigned int optlen)
>  {
> +	struct sctp_association *asoc;
>  	int val;
>  
>  	if (optlen < sizeof(int))
> @@ -2821,7 +2834,12 @@ static int sctp_setsockopt_nodelay(struct sock *sk, char __user *optval,
>  	if (get_user(val, (int __user *)optval))
>  		return -EFAULT;
>  
> -	sctp_sk(sk)->nodelay = (val == 0) ? 0 : 1;
> +	val = val == 0 ? SCTP_F_TX_NAGLE : SCTP_F_TX_NODELAY;
> +	sctp_sk(sk)->tx_delay = val;
> +
> +	/* Update cached value on each asoc (clears SCTP_F_TX_MSG_MORE) */
> +	list_for_each_entry(asoc, &sctp_sk(sk)->ep->asocs, asocs)
> +		asoc->tx_delay = val;
>  	return 0;
>  }
>  
> @@ -3968,7 +3986,7 @@ static int sctp_init_sock(struct sock *sk)
>  	sp->disable_fragments = 0;
>  
>  	/* Enable Nagle algorithm by default.  */
> -	sp->nodelay           = 0;
> +	sp->tx_delay          = SCTP_F_TX_NAGLE;
>  
>  	/* Enable by default. */
>  	sp->v4mapped          = 1;
> @@ -5020,7 +5038,7 @@ static int sctp_getsockopt_nodelay(struct sock *sk, int len,
>  		return -EINVAL;
>  
>  	len = sizeof(int);
> -	val = (sctp_sk(sk)->nodelay == 1);
> +	val = sctp_sk(sk)->tx_delay & SCTP_F_TX_NAGLE ? 0 : 1;
>  	if (put_user(len, optlen))
>  		return -EFAULT;
>  	if (copy_to_user(optval, &val, len))
> 

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Laight July 22, 2014, 1:45 p.m. UTC | #2
From: Vlad Yasevich 
> On 07/22/2014 04:59 AM, David Laight wrote:
> > If MSG_MORE is set then the data chunk will be buffered until either
> > a full packet would be generated, or something causes a chunk to be
> > sent (eg data without MSG_MORE or a heartbeat).
> 
> heartbeat will not cause a data flush.  Only SACKs do that as they control
> congestion and flow.

Experimentally heartbeats cause data to be sent.
I'd only got one IP address at each end - so don't know what happens to
any 'probe' HBs.
I frigged my code to set MSG_MORE if the data chunk was over 200 bytes.
A single long fragment would be collected by a short one, or by a heartbeat.
I did wonder if that was a bug - but it would happen for data delayed by Nagle.

> That's might actually be a good solution to the problem of the an incorrectly
> using MSG_MORE.  When a SACK drops inflight to 0, clear MSG_MORE from the
> association thus allowing any queued data (even less then MTU) to be flushed.
> 
> This way, when the data flow just starts, you can use MSG_MORE to control
> bundling.  However, the app stops sending, even if it forgot to clear MSG_MORE,
> we'll clear it ourselves once inflight drops to 0.

That will only work if there is enough data to actually send something.
So more likely to cause confusion than be useful.

All this is really defining the behaviour for 'broken' apps.
It wouldn't be completely unreasonable if a single message sent with 'MSG_MORE'
never actually got sent.

> > The MSG_MORE flag is saved 'per association' along with a copy
> > of the SCTP_NODELAY/Nagle flag.
> >
> > It is expected that an application will only set MSG_MORE when it
> > has an additional data chunk ready to send. The sends could be done
> > with a single sendmmsg() system call.
> 
> Is that really true?  If the application has 5 messages and it sends all
> 5 with the sendmmsg(), then MSG_MORE will never get cleared and a flush
> would not get triggered.

Hmmm.... I was thinking that each entry in msgvec[] had its own flags field.
But I guess that is only used for receive.

I looked at that code at the beginning to see if it made sense to just bundle
data sent with sendmmsg() - and decided it wasn't worth the effort because
sendmmsg() is mostly a system call saver; I'd still need to implement the
same code inside sctp.
This is made more difficult for 1-many sockets because the buffers for sendmmsg()
can all have different addresses.

One option for sendmmsg() is set a different flag on all but the last buffer
if there are no addresses present, and treat it the same as MSG_MORE for sctp.

I'm on holiday for the next 2 weeks. I'll probably manage to read netdev, but
won't be able to repost the patches from here - and certainly can't test
anything.

It would be nice to get the current patches (especially 1/3 and 2/3) in now.

	David




--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Vladislav Yasevich July 22, 2014, 2:54 p.m. UTC | #3
On 07/22/2014 09:45 AM, David Laight wrote:
> From: Vlad Yasevich 
>> On 07/22/2014 04:59 AM, David Laight wrote:
>>> If MSG_MORE is set then the data chunk will be buffered until either
>>> a full packet would be generated, or something causes a chunk to be
>>> sent (eg data without MSG_MORE or a heartbeat).
>>
>> heartbeat will not cause a data flush.  Only SACKs do that as they control
>> congestion and flow.
> 
> Experimentally heartbeats cause data to be sent.
> I'd only got one IP address at each end - so don't know what happens to
> any 'probe' HBs.
> I frigged my code to set MSG_MORE if the data chunk was over 200 bytes.
> A single long fragment would be collected by a short one, or by a heartbeat.
> I did wonder if that was a bug - but it would happen for data delayed by Nagle.

That sounds kind of like a bug.  The end of packet usually causes a flush of the outqueue,
but any delayed data shouldn't be sent as HB should not cause a data flush.

The delay is either due to Nagle or closed windows (congestion or receive) and HB
doesn't affect either.

> 
>> That's might actually be a good solution to the problem of the an incorrectly
>> using MSG_MORE.  When a SACK drops inflight to 0, clear MSG_MORE from the
>> association thus allowing any queued data (even less then MTU) to be flushed.
>>
>> This way, when the data flow just starts, you can use MSG_MORE to control
>> bundling.  However, the app stops sending, even if it forgot to clear MSG_MORE,
>> we'll clear it ourselves once inflight drops to 0.
> 
> That will only work if there is enough data to actually send something.
> So more likely to cause confusion than be useful.
> 
> All this is really defining the behaviour for 'broken' apps.
> It wouldn't be completely unreasonable if a single message sent with 'MSG_MORE'
> never actually got sent.

Actually, this code opens up a way to eat kernel memory.  Consider an application
that does:
   while (1) {
       socket(IPPROTO_SCTP);
       connect()
       sendmsg(MSG_MORE); <- write 1 byte
       close();
   }

Because we send with MSG_MORE, the 1 byte gets queued to the association.  The close()
causes us to enter SHUTDOWN_PENDING state and we never flush the buffer and close
the association.

This is a malicious example.  Similarly, a broken application could just forget to
clear MSG_MORE, and when we end up in a condition where the amount of queued data is
smaller than MTU and all inflight data has been acked, we'd once again be stuck.

Just because an application isn't doing the right thing, we can't assume it's merely broken.  It
could be malicious.  We need to make this behavior robust.

>>> The MSG_MORE flag is saved 'per association' along with a copy
>>> of the SCTP_NODELAY/Nagle flag.
>>>
>>> It is expected that an application will only set MSG_MORE when it
>>> has an additional data chunk ready to send. The sends could be done
>>> with a single sendmmsg() system call.
>>
>> Is that really true?  If the application has 5 messages and it sends all
>> 5 with the sendmmsg(), then MSG_MORE will never get cleared and a flush
>> would not get triggered.
> 
> Hmmm.... I was thinking that each entry in msgvec[] had its own flags field.
> But I guess that is only used for receive.
> 
> I looked at that code at the beginning to see if it made sense to just bundle
> data sent with sendmmsg() - and decided it wasn't worth the effort because
> semdmmsg() is mostly a system call saver, I'd still need to implement the
> same code inside sctp.
> This is made more difficult for 1-many sockets because the buffers for sendmmsg()
> can all have different addresses.
> 
> One option for sendmmsg() is set a different flag on all but the last buffer
> if there are no addresses present, and treat it the same as MSG_MORE for sctp.
> 
> I'm on holiday for the next 2 weeks. I'll probably manage to read netdev, but
> won't be able to repost the patches from here - and certainly can't test
> anything.
> 
> It would be nice to get the current patches (especially 1/3 and 2/3) in now.

I've acked patches 1 and 2.  This one needs a little bit more work.

-vlad

> 
> 	David
> 
> 
> 
> 

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Laight July 22, 2014, 3:22 p.m. UTC | #4
...
> > All this is really defining the behaviour for 'broken' apps.
> > It wouldn't be completely unreasonable if a single message sent with 'MSG_MORE'
> > never actually got sent.
> 
> Actually, this code opens up a way to eat kernel memory.  Consider an application
> that does:
>    while (1) {
>        socket(IPPROTO_SCTP);
>        connect()
>        sendmsg(MSG_MORE); <- write 1 byte
>        close();
>    }
> 
> Because we send with MSG_MORE, the 1 byte gets queued to the association.  The close()
> causes us to enter SHUTDOWN_PENDING state and we never flush the buffer and close
> the association.
> 
> This is a malicious example.  Similarly, a broken application could just forget to
> clear MSG_MORE and when we end up in a condition where the amount of queued data is
> smaller then MTU and all inflight data has been acked, we'd once again be stuck.

From a system point of view that one doesn't really matter.

> Just because application isn't doing the right thing, we can't assume it's broken.  It
> could be malicious.  We need to make this behavior robust.

Hmmm....
The amount of data per-socket is limited by the mtu and socket buffer size,
so a malicious app would have to create a lot of sockets to use a significant
amount of kernel memory.

If close() blocks forever, isn't there a worse problem with a remote app
that causes the window to be zero - and so data can't be sent and the
close will block forever anyway?
I've never liked protocols that wait forever for send data to drain on disconnect.
Seems like an 'accident' waiting to happen.

I'm not sure what would be needed to be done to cause queued send data to be
sent when SCTP_F_TX_MSG_MORE is cleared by close() or set/clear SCTP_NODELAY.
It is probably a simple call to the correct function.

	David




--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Laight July 22, 2014, 3:25 p.m. UTC | #5
From: Vlad Yasevich
> On 07/22/2014 09:45 AM, David Laight wrote:
> > From: Vlad Yasevich
> >> On 07/22/2014 04:59 AM, David Laight wrote:
> >>> If MSG_MORE is set then the data chunk will be buffered until either
> >>> a full packet would be generated, or something causes a chunk to be
> >>> sent (eg data without MSG_MORE or a heartbeat).
> >>
> >> heartbeat will not cause a data flush.  Only SACKs do that as they control
> >> congestion and flow.
> >
> > Experimentally heartbeats cause data to be sent.
> > I'd only got one IP address at each end - so don't know what happens to
> > any 'probe' HBs.
> > I frigged my code to set MSG_MORE if the data chunk was over 200 bytes.
> > A single long fragment would be collected by a short one, or by a heartbeat.
> > I did wonder if that was a bug - but it would happen for data delayed by Nagle.
> 
> That sounds kind like a bug.  The end of packet usually causes a flush of the outqueue,
> but any delayed data shouldn't be sent as HB should not cause a data flush.

I think the data is already in the relevant queue, it would have been sent
earlier if there had been a packet with anything in it.
So maybe the HB does get bundled with pending data.

Try applying my patch and doing some simple tests...

	David



--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Vladislav Yasevich July 22, 2014, 4:38 p.m. UTC | #6
On 07/22/2014 11:22 AM, David Laight wrote:
> ...
>>> All this is really defining the behaviour for 'broken' apps.
>>> It wouldn't be completely unreasonable if a single message sent with 'MSG_MORE'
>>> never actually got sent.
>>
>> Actually, this code opens up a way to eat kernel memory.  Consider an application
>> that does:
>>    while (1) {
>>        socket(IPPROTO_SCTP);
>>        connect()
>>        sendmsg(MSG_MORE); <- write 1 byte
>>        close();
>>    }
>>
>> Because we send with MSG_MORE, the 1 byte gets queued to the association.  The close()
>> causes us to enter SHUTDOWN_PENDING state and we never flush the buffer and close
>> the association.
>>
>> This is a malicious example.  Similarly, a broken application could just forget to
>> clear MSG_MORE and when we end up in a condition where the amount of queued data is
>> smaller then MTU and all inflight data has been acked, we'd once again be stuck.
> 
> From a system point of view that one doesn't really matter.
> 
>> Just because application isn't doing the right thing, we can't assume it's broken.  It
>> could be malicious.  We need to make this behavior robust.
> 
> Hmmm....
> The amount of data per-socket is limited by the mtu and socket buffer size,
> so a malicious app would have to create a lot of sockets to use a significant
> amount of kernel memory.

It's not just the data; there are also all the corresponding data structures, which are particularly
fat in sctp.

> 
> If close() blocks forever, isn't there a worse problem with a remote app
> that causes the window to be zero - and so data can't be sent and the
> close will block forever anyway?

No, this doesn't happen since these are treated as retransmissions, and
once we enter SHUTDOWN-PENDING, we can start the shutdown-guard timer
once we've hit the max retransmit count.  This solves this issue.

> I've never liked protocols that wait forever for send data to drain on disconnect.
> Seems like an 'accident' waiting to happen.
> 
> I'm not sure what would be needed to be done to cause queued send data to be
> sent when SCTP_F_TX_MSG_MORE is cleared by close() or set/clear SCTP_NODELAY.
> It is probably a simple call to the correct function.

OK. I'll try to come up with some code for this.

-vlad

> 
> 	David
> 
> 
> 
> 

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Laight July 22, 2014, 4:46 p.m. UTC | #7
From: Vlad Yasevich ...
> > I'm not sure what would be needed to be done to cause queued send data to be
> > sent when SCTP_F_TX_MSG_MORE is cleared by close() or set/clear SCTP_NODELAY.
> > It is probably a simple call to the correct function.
> 
> OK. I'll try to come up with some code for this.

Thanks.

	David



--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index 0dfcc92..441320a 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -209,7 +209,11 @@  struct sctp_sock {
 	struct sctp_assocparams assocparams;
 	int user_frag;
 	__u32 autoclose;
-	__u8 nodelay;
+
+#define	SCTP_F_TX_NODELAY	0
+#define	SCTP_F_TX_NAGLE		1	/* SCTP_NODELAY not set */
+#define	SCTP_F_TX_MSG_MORE	2	/* MSG_MORE set on last send */
+	__u8 tx_delay;
 	__u8 disable_fragments;
 	__u8 v4mapped;
 	__u8 frag_interleave;
@@ -1581,6 +1585,9 @@  struct sctp_association {
 	/* Flag that path mtu update is pending */
 	__u8   pmtu_pending;
 
+	/* SCTP_F_TX_xxx, Nagle copied from socket */
+	__u8 tx_delay;
+
 	/* Association : The smallest PMTU discovered for all of the
 	 * PMTU	       : peer's transport addresses.
 	 */
diff --git a/net/sctp/endpointola.c b/net/sctp/endpointola.c
index 3d9f429..077220f 100644
--- a/net/sctp/endpointola.c
+++ b/net/sctp/endpointola.c
@@ -221,6 +221,9 @@  void sctp_endpoint_add_asoc(struct sctp_endpoint *ep,
 	/* Increment the backlog value for a TCP-style listening socket. */
 	if (sctp_style(sk, TCP) && sctp_sstate(sk, LISTENING))
 		sk->sk_ack_backlog++;
+
+	/* Cache SCTP_NODELAY (aka Nagle) state */
+	asoc->tx_delay = sctp_sk(sk)->tx_delay;
 }
 
 /* Free the endpoint structure.  Delay cleanup until
diff --git a/net/sctp/output.c b/net/sctp/output.c
index 7f28a8e..275a1ab 100644
--- a/net/sctp/output.c
+++ b/net/sctp/output.c
@@ -679,22 +679,30 @@  static sctp_xmit_t sctp_packet_can_append_data(struct sctp_packet *packet,
 	    flight_size >= transport->cwnd)
 		return SCTP_XMIT_RWND_FULL;
 
+	/* If MSG_MORE is set we probably shouldn't create a new message.
+	 * However unless we also implement a timeout (preferable settable
+	 * as a socket option) then data could easily be left unsent.
+	 * Instead we ignore MSG_MORE on the first data chunk.
+	 * This makes the implementation of MSG_MORE the same as the
+	 * implementation of Nagle.
+	 */
+
 	/* Nagle's algorithm to solve small-packet problem:
 	 * Inhibit the sending of new chunks when new outgoing data arrives
 	 * if any previously transmitted data on the connection remains
 	 * unacknowledged.
 	 */
 
-	if (sctp_sk(asoc->base.sk)->nodelay)
-		/* Nagle disabled */
+	if (asoc->tx_delay == SCTP_F_TX_NODELAY)
+		/* Nagle disabled and MSG_MORE unset */
 		return SCTP_XMIT_OK;
 
 	if (!sctp_packet_empty(packet))
 		/* Append to packet */
 		return SCTP_XMIT_OK;
 
-	if (inflight == 0)
-		/* Nothing unacked */
+	if (inflight == 0 && !(asoc->tx_delay & SCTP_F_TX_MSG_MORE))
+		/* Nothing unacked and application isn't going to send more */
 		return SCTP_XMIT_OK;
 	
 	if (!sctp_state(asoc, ESTABLISHED))
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index fee06b9..73a421d 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -1927,6 +1927,18 @@  static int sctp_sendmsg(struct kiocb *iocb, struct sock *sk,
 		pr_debug("%s: we associated primitively\n", __func__);
 	}
 
+	/* Setting MSG_MORE currently has the same effect as enabling Nagle.
+	 * This means that the user can't force bundling of the first two data
+	 * chunks.  It does mean that all the data chunks will be sent
+	 * without an extra timer.
+	 * It is enough to save the last value since any data sent with
+	 * MSG_MORE clear will already have been sent (subject to flow control).
+	 */
+	if (msg->msg_flags & MSG_MORE)
+		asoc->tx_delay |= SCTP_F_TX_MSG_MORE;
+	else
+		asoc->tx_delay &= ~SCTP_F_TX_MSG_MORE;
+
 	/* Break the message into multiple chunks of maximum size. */
 	datamsg = sctp_datamsg_from_user(asoc, sinfo, msg, msg_len);
 	if (IS_ERR(datamsg)) {
@@ -2814,6 +2826,7 @@  static int sctp_setsockopt_primary_addr(struct sock *sk, char __user *optval,
 static int sctp_setsockopt_nodelay(struct sock *sk, char __user *optval,
 				   unsigned int optlen)
 {
+	struct sctp_association *asoc;
 	int val;
 
 	if (optlen < sizeof(int))
@@ -2821,7 +2834,12 @@  static int sctp_setsockopt_nodelay(struct sock *sk, char __user *optval,
 	if (get_user(val, (int __user *)optval))
 		return -EFAULT;
 
-	sctp_sk(sk)->nodelay = (val == 0) ? 0 : 1;
+	val = val == 0 ? SCTP_F_TX_NAGLE : SCTP_F_TX_NODELAY;
+	sctp_sk(sk)->tx_delay = val;
+
+	/* Update cached value on each asoc (clears SCTP_F_TX_MSG_MORE) */
+	list_for_each_entry(asoc, &sctp_sk(sk)->ep->asocs, asocs)
+		asoc->tx_delay = val;
 	return 0;
 }
 
@@ -3968,7 +3986,7 @@  static int sctp_init_sock(struct sock *sk)
 	sp->disable_fragments = 0;
 
 	/* Enable Nagle algorithm by default.  */
-	sp->nodelay           = 0;
+	sp->tx_delay          = SCTP_F_TX_NAGLE;
 
 	/* Enable by default. */
 	sp->v4mapped          = 1;
@@ -5020,7 +5038,7 @@  static int sctp_getsockopt_nodelay(struct sock *sk, int len,
 		return -EINVAL;
 
 	len = sizeof(int);
-	val = (sctp_sk(sk)->nodelay == 1);
+	val = sctp_sk(sk)->tx_delay & SCTP_F_TX_NAGLE ? 0 : 1;
 	if (put_user(len, optlen))
 		return -EFAULT;
 	if (copy_to_user(optval, &val, len))