[v2,net-next] tcp: avoid tx starvation by SYNACK packets

Message ID 1340440962.17495.39.camel@edumazet-glaptop
State Rejected, archived
Delegated to: David Miller

Commit Message

Eric Dumazet June 23, 2012, 8:42 a.m. UTC
From: Eric Dumazet <edumazet@google.com>

On Sat, 2012-06-23 at 00:34 -0700, Vijay Subramanian wrote:

> This patch ([PATCH net-next] tcp: avoid tx starvation by SYNACK
> packets) is neither in net/net-next trees nor on patchwork. Maybe it
> was missed since it was sent during the merge window. Is this not
> needed anymore or is it being tested currently?

You're right, thanks for the reminder!

[PATCH v2 net-next] tcp: avoid tx starvation by SYNACK packets

pfifo_fast being the default Qdisc, it's pretty easy to fill it with
(small) SYNACK packets while the host is under a synflood attack.

Packets of established TCP sessions are then dropped at the Qdisc
layer and the host appears almost dead.

Avoid this problem by assigning TC_PRIO_FILLER priority to SYNACKs
generated in SYNCOOKIE mode, so that these packets are enqueued into
pfifo_fast's lowest-priority band (band 2).

Other packets, queued to band 0 or 1, are dequeued before any SYNACK
packets waiting in band 2.

When not under synflood, SYNACK priority follows the listener's
sk_priority policy.

Reported-by: Hans Schillstrom <hans.schillstrom@ericsson.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Jesper Dangaard Brouer <brouer@redhat.com>
Cc: Neal Cardwell <ncardwell@google.com>
Cc: Tom Herbert <therbert@google.com>
Cc: Vijay Subramanian <subramanian.vijay@gmail.com>
---
 net/dccp/ipv4.c                  |    2 ++
 net/ipv4/ip_output.c             |    2 +-
 net/ipv4/tcp_ipv4.c              |    7 ++++++-
 net/ipv6/inet6_connection_sock.c |    1 +
 net/ipv6/ip6_output.c            |    2 +-
 net/ipv6/tcp_ipv6.c              |   11 ++++++++---
 6 files changed, 19 insertions(+), 6 deletions(-)
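
For reference, pfifo_fast picks a band from skb->priority through the
prio2band table in net/sched/sch_generic.c. A condensed sketch of that
mapping (reproduced from memory from the kernel source of that era, so
treat the exact table as an approximation) shows why TC_PRIO_FILLER
lands in the lowest band:

	/* skb->priority is folded into one of three bands;
	 * a lower band number is dequeued first
	 */
	static const u8 prio2band[TC_PRIO_MAX + 1] = {
		1, 2, 2, 2, 1, 2, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1
	};

	/* band selection on enqueue: TC_PRIO_FILLER (1) -> band 2,
	 * TC_PRIO_BESTEFFORT (0) -> band 1, TC_PRIO_CONTROL (7) -> band 0;
	 * dequeue drains band 0 first, then band 1, then band 2
	 */
	int band = prio2band[skb->priority & TC_PRIO_MAX];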




Comments

Hans Schillstrom June 25, 2012, 6:24 a.m. UTC | #1
Hi Eric
On Saturday 23 June 2012 10:42:42 Eric Dumazet wrote:
> From: Eric Dumazet <edumazet@google.com>
> 
> On Sat, 2012-06-23 at 00:34 -0700, Vijay Subramanian wrote:
> 
> > This patch ([PATCH net-next] tcp: avoid tx starvation by SYNACK
> > packets) is neither in net/net-next trees nor on patchwork. Maybe it
> > was missed since it was sent during the merge window. Is this not
> > needed anymore or is it being tested currently?
> 
> You're right, thanks for the reminder!

We have been running this patch for a while now,
so I added a "Tested-by:".

> 
> [PATCH v2 net-next] tcp: avoid tx starvation by SYNACK packets
> 
> pfifo_fast being the default Qdisc, it's pretty easy to fill it with
> (small) SYNACK packets while the host is under a synflood attack.
> 
> Packets of established TCP sessions are then dropped at the Qdisc
> layer and the host appears almost dead.
> 
> Avoid this problem by assigning TC_PRIO_FILLER priority to SYNACKs
> generated in SYNCOOKIE mode, so that these packets are enqueued into
> pfifo_fast's lowest-priority band (band 2).
> 
> Other packets, queued to band 0 or 1, are dequeued before any SYNACK
> packets waiting in band 2.
> 
> When not under synflood, SYNACK priority follows the listener's
> sk_priority policy.
> 
> Reported-by: Hans Schillstrom <hans.schillstrom@ericsson.com>
> Signed-off-by: Eric Dumazet <edumazet@google.com>
Tested-by: Hans Schillstrom <hans.schillstrom@ericsson.com>


> Cc: Jesper Dangaard Brouer <brouer@redhat.com>
> Cc: Neal Cardwell <ncardwell@google.com>
> Cc: Tom Herbert <therbert@google.com>
> Cc: Vijay Subramanian <subramanian.vijay@gmail.com>
> ---
>  net/dccp/ipv4.c                  |    2 ++
>  net/ipv4/ip_output.c             |    2 +-
>  net/ipv4/tcp_ipv4.c              |    7 ++++++-
>  net/ipv6/inet6_connection_sock.c |    1 +
>  net/ipv6/ip6_output.c            |    2 +-
>  net/ipv6/tcp_ipv6.c              |   11 ++++++++---
>  6 files changed, 19 insertions(+), 6 deletions(-)
> 
> diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
> index 3eb76b5..045176f 100644
> --- a/net/dccp/ipv4.c
> +++ b/net/dccp/ipv4.c
> @@ -515,6 +515,7 @@ static int dccp_v4_send_response(struct sock *sk, struct request_sock *req,
>  
>  		dh->dccph_checksum = dccp_v4_csum_finish(skb, ireq->loc_addr,
>  							      ireq->rmt_addr);
> +		skb->priority = sk->sk_priority;
>  		err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
>  					    ireq->rmt_addr,
>  					    ireq->opt);
> @@ -556,6 +557,7 @@ static void dccp_v4_ctl_send_reset(struct sock *sk, struct sk_buff *rxskb)
>  	skb_dst_set(skb, dst_clone(dst));
>  
>  	bh_lock_sock(ctl_sk);
> +	skb->priority = ctl_sk->sk_priority;
>  	err = ip_build_and_send_pkt(skb, ctl_sk,
>  				    rxiph->daddr, rxiph->saddr, NULL);
>  	bh_unlock_sock(ctl_sk);
> diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
> index 0f3185a..71c6c20 100644
> --- a/net/ipv4/ip_output.c
> +++ b/net/ipv4/ip_output.c
> @@ -155,7 +155,7 @@ int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
>  		ip_options_build(skb, &opt->opt, daddr, rt, 0);
>  	}
>  
> -	skb->priority = sk->sk_priority;
> +	/* skb->priority is set by the caller */
>  	skb->mark = sk->sk_mark;
>  
>  	/* Send it out. */
> diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
> index b52934f..5ef4131 100644
> --- a/net/ipv4/tcp_ipv4.c
> +++ b/net/ipv4/tcp_ipv4.c
> @@ -81,7 +81,7 @@
>  #include <linux/stddef.h>
>  #include <linux/proc_fs.h>
>  #include <linux/seq_file.h>
> -
> +#include <linux/pkt_sched.h>
>  #include <linux/crypto.h>
>  #include <linux/scatterlist.h>
>  
> @@ -821,6 +821,7 @@ static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
>   *	Send a SYN-ACK after having received a SYN.
>   *	This still operates on a request_sock only, not on a big
>   *	socket.
> + *	nocache is set for SYN-ACK sent in SYNCOOKIE mode
>   */
>  static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
>  			      struct request_sock *req,
> @@ -843,6 +844,10 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
>  		__tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr);
>  
>  		skb_set_queue_mapping(skb, queue_mapping);
> +
> +		/* SYNACK sent in SYNCOOKIE mode have low priority */
> +		skb->priority = nocache ? TC_PRIO_FILLER : sk->sk_priority;
> +
>  		err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
>  					    ireq->rmt_addr,
>  					    ireq->opt);
> diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c
> index e6cee52..5812a74 100644
> --- a/net/ipv6/inet6_connection_sock.c
> +++ b/net/ipv6/inet6_connection_sock.c
> @@ -248,6 +248,7 @@ int inet6_csk_xmit(struct sk_buff *skb, struct flowi *fl_unused)
>  	/* Restore final destination back after routing done */
>  	fl6.daddr = np->daddr;
>  
> +	skb->priority = sk->sk_priority;
>  	res = ip6_xmit(sk, skb, &fl6, np->opt, np->tclass);
>  	rcu_read_unlock();
>  	return res;
> diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
> index a233a7c..a93378a 100644
> --- a/net/ipv6/ip6_output.c
> +++ b/net/ipv6/ip6_output.c
> @@ -228,7 +228,7 @@ int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
>  	hdr->saddr = fl6->saddr;
>  	hdr->daddr = *first_hop;
>  
> -	skb->priority = sk->sk_priority;
> +	/* skb->priority is set by the caller */
>  	skb->mark = sk->sk_mark;
>  
>  	mtu = dst_mtu(dst);
> diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
> index 26a8862..f664452 100644
> --- a/net/ipv6/tcp_ipv6.c
> +++ b/net/ipv6/tcp_ipv6.c
> @@ -43,6 +43,7 @@
>  #include <linux/ipv6.h>
>  #include <linux/icmpv6.h>
>  #include <linux/random.h>
> +#include <linux/pkt_sched.h>
>  
>  #include <net/tcp.h>
>  #include <net/ndisc.h>
> @@ -479,7 +480,8 @@ out:
>  
>  static int tcp_v6_send_synack(struct sock *sk, struct request_sock *req,
>  			      struct request_values *rvp,
> -			      u16 queue_mapping)
> +			      u16 queue_mapping,
> +			      bool syncookie)
>  {
>  	struct inet6_request_sock *treq = inet6_rsk(req);
>  	struct ipv6_pinfo *np = inet6_sk(sk);
> @@ -515,6 +517,7 @@ static int tcp_v6_send_synack(struct sock *sk, struct request_sock *req,
>  	if (skb) {
>  		__tcp_v6_send_check(skb, &treq->loc_addr, &treq->rmt_addr);
>  
> +		skb->priority = syncookie ? TC_PRIO_FILLER : sk->sk_priority;
>  		fl6.daddr = treq->rmt_addr;
>  		skb_set_queue_mapping(skb, queue_mapping);
>  		err = ip6_xmit(sk, skb, &fl6, opt, np->tclass);
> @@ -531,7 +534,7 @@ static int tcp_v6_rtx_synack(struct sock *sk, struct request_sock *req,
>  			     struct request_values *rvp)
>  {
>  	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
> -	return tcp_v6_send_synack(sk, req, rvp, 0);
> +	return tcp_v6_send_synack(sk, req, rvp, 0, false);
>  }
>  
>  static void tcp_v6_reqsk_destructor(struct request_sock *req)
> @@ -909,6 +912,7 @@ static void tcp_v6_send_response(struct sk_buff *skb, u32 seq, u32 ack, u32 win,
>  	dst = ip6_dst_lookup_flow(ctl_sk, &fl6, NULL, false);
>  	if (!IS_ERR(dst)) {
>  		skb_dst_set(buff, dst);
> +		skb->priority = ctl_sk->sk_priority;
>  		ip6_xmit(ctl_sk, buff, &fl6, NULL, tclass);
>  		TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
>  		if (rst)
> @@ -1217,7 +1221,8 @@ have_isn:
>  
>  	if (tcp_v6_send_synack(sk, req,
>  			       (struct request_values *)&tmp_ext,
> -			       skb_get_queue_mapping(skb)) ||
> +			       skb_get_queue_mapping(skb),
> +			       want_cookie) ||
>  	    want_cookie)
>  		goto drop_and_free;
>  
> 
> 
>
David Miller June 25, 2012, 10:43 p.m. UTC | #2
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Sat, 23 Jun 2012 10:42:42 +0200

> From: Eric Dumazet <edumazet@google.com>
> 
> On Sat, 2012-06-23 at 00:34 -0700, Vijay Subramanian wrote:
> 
>> This patch ([PATCH net-next] tcp: avoid tx starvation by SYNACK
>> packets) is neither in net/net-next trees nor on patchwork. Maybe it
>> was missed since it was sent during the merge window. Is this not
>> needed anymore or is it being tested currently?
> 
> You're right, thanks for the reminder!
> 
> [PATCH v2 net-next] tcp: avoid tx starvation by SYNACK packets

I don't agree with this change.

What is the point in having real classification configuration if
arbitrary places in the network stack are going to override SKB
priority with a fixed priority setting?

I bet the person who set listening socket priority really meant it and
does not expect you to override it.
Eric Dumazet June 26, 2012, 4:51 a.m. UTC | #3
On Mon, 2012-06-25 at 15:43 -0700, David Miller wrote:

> I don't agree with this change.
> 
> What is the point in having real classification configuration if
> arbitrary places in the network stack are going to override SKB
> priority with a fixed priority setting?
> 
> I bet the person who set listening socket priority really meant it and
> does not expect you to override it.


If I add a test on listener_sk->sk_priority being 0, would you accept
the patch? If classification is done after the tcp stack, it won't be
hurt by the initial skb priority?

instead of:

	/* SYNACK sent in SYNCOOKIE mode have low priority */
	skb->priority = nocache ? TC_PRIO_FILLER : sk->sk_priority;

Having:

	/* SYNACK sent in SYNCOOKIE mode have low priority */
	skb->priority = (nocache && !sk->sk_priority) ?
			TC_PRIO_FILLER : sk->sk_priority;


David Miller June 26, 2012, 4:55 a.m. UTC | #4
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Tue, 26 Jun 2012 06:51:36 +0200

> On Mon, 2012-06-25 at 15:43 -0700, David Miller wrote:
> 
>> I don't agree with this change.
>> 
>> What is the point in having real classification configuration if
>> arbitrary places in the network stack are going to override SKB
>> priority with a fixed priority setting?
>> 
>> I bet the person who set listening socket priority really meant it and
>> does not expect you to override it.
> 
> 
> If I add a test on listener_sk->sk_priority being 0, would you accept
> the patch? If classification is done after the tcp stack, it won't be
> hurt by the initial skb priority?

It's better than your original patch, but it suffers from the same
fundamental problem.

No user is going to expect that TCP on its own has chosen a
non-default priority, and only for some packet types.  It's completely
unexpected behavior.

A SYN flood consumes so much more RX work than the TX for the SYNACKs
ever can.

So whilst I understand your desire to handle all elements of this kind
of attack, this one is reaching too far.

Hans Schillstrom June 26, 2012, 5:34 a.m. UTC | #5
On Tuesday 26 June 2012 06:55:37 David Miller wrote:
> From: Eric Dumazet <eric.dumazet@gmail.com>
> Date: Tue, 26 Jun 2012 06:51:36 +0200
> 
> > On Mon, 2012-06-25 at 15:43 -0700, David Miller wrote:
> > 
> >> I don't agree with this change.
> >> 
> >> What is the point in having real classification configuration if
> >> arbitrary places in the network stack are going to override SKB
> >> priority with a fixed priority setting?
> >> 
> >> I bet the person who set listening socket priority really meant it and
> >> does not expect you to override it.
> > 
> > 
> > If I add a test on listener_sk->sk_priority being 0, would you accept
> > the patch? If classification is done after the tcp stack, it won't be
> > hurt by the initial skb priority?
> 
> It's better than your original patch, but it suffers from the same
> fundamental problem.
> 
> No user is going to expect that TCP on its own has chosen a
> non-default priority, and only for some packet types.  It's completely
> unexpected behavior.
> 
> A SYN flood consumes so much more RX work than the TX for the SYNACKs
> ever can.
> 
> So whilst I understand your desire to handle all elements of this kind
> of attack, this one is reaching too far.
> 

This patch didn't give much gain actually.
The big cycle consumer during a syn attack is the SHA sum right now,
so from that perspective it's better to add AES crypto (by using AES-NI)
to the syn cookies instead of the SHA sum, even if only newer x86_64 can use it.


Eric Dumazet June 26, 2012, 5:02 p.m. UTC | #6
On Tue, 2012-06-26 at 07:34 +0200, Hans Schillstrom wrote:

> This patch didn't give much gain actually.

With a 100Mbps link it does.

With a 1Gbps link we are CPU bound for sure.

> The big cycle consumer during a syn attack is the SHA sum right now,
> so from that perspective it's better to add AES crypto (by using AES-NI)
> to the syn cookies instead of the SHA sum, even if only newer x86_64 can use it.

My dev machine is able to process ~280.000 SYN (and synack) per second
(tg3, mono queue), and sha_transform() takes ~10 % of the time according
to perf.

With David patch using jhash instead of SHA, I reach ~315.000 SYN per
second.
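
For anyone without David's patch at hand, the idea is roughly the
following (a hypothetical sketch, not the actual patch: the helper name
and the exact way the secret and count are mixed in are illustrative):

	#include <linux/jhash.h>

	/* one jhash pass over the connection 4-tuple, seeded with a
	 * per-boot secret, instead of a full sha_transform() round
	 */
	static u32 cookie_hash_jhash(__be32 saddr, __be32 daddr,
				     __be16 sport, __be16 dport,
				     u32 count, int c)
	{
		return jhash_3words((__force u32)saddr, (__force u32)daddr,
				    ((u32)ntohs(sport) << 16) | ntohs(dport),
				    syncookie_secret[c][0] + count);
	}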



Hans Schillstrom June 27, 2012, 5:23 a.m. UTC | #7
On Tuesday 26 June 2012 19:02:36 Eric Dumazet wrote:
> On Tue, 2012-06-26 at 07:34 +0200, Hans Schillstrom wrote:
> 
> > This patch didn't give much gain actually.
> 
> With a 100Mbps link it does.
 
I was testing with a patched igb driver with TCP SYN IRQs on one core only;
there was some fault in the previous setup (RPS was also involved), because now it gives a boost of ~15%.

> With a 1Gbps link we are CPU bound for sure.

True.

> 
> > The big cycle consumer during a syn attack is the SHA sum right now,
> > so from that perspective it's better to add AES crypto (by using AES-NI)
> > to the syn cookies instead of the SHA sum, even if only newer x86_64 can use it.
> 
> My dev machine is able to process ~280.000 SYN (and synack) per second
> (tg3, mono queue), and sha_transform() takes ~10 % of the time according
> to perf.

My test machine is not that fast :-(
I have only 170.000 syn/synack per sec, and sha_transform() takes ~9.6%;
I have seen peaks of 16% (during 10 sec samples).

> 
> With David patch using jhash instead of SHA, I reach ~315.000 SYN per
> second.

I have similar results from ~170k to ~199k synack/sec.

BTW,
cookie_hash() did not show up in the perf results (< 0.08%).

Jesper Dangaard Brouer June 27, 2012, 6:32 a.m. UTC | #8
On Tue, 2012-06-26 at 19:02 +0200, Eric Dumazet wrote:
> On Tue, 2012-06-26 at 07:34 +0200, Hans Schillstrom wrote:
> 
> > This patch didn't give much gain actually.
> 
> With a 100Mbps link it does.
> 
> With a 1Gbps link we are CPU bound for sure.

I'm using a 10G link.

> > The big cycle consumer during a syn attack is the SHA sum right now,
> > so from that perspective it's better to add AES crypto (by using AES-NI)
> > to the syn cookies instead of the SHA sum, even if only newer x86_64 can use it.

How are you avoiding the lock bh_lock_sock_nested(sk) in tcp_v4_rcv()?


> My dev machine is able to process ~280.000 SYN (and synack) per second
> (tg3, mono queue), and sha_transform() takes ~10 % of the time according
> to perf.

With my parallel SYN cookie/brownies patches, I could easily process 750
Kpps (limited by the generator; I think the owners of the big machine did
a test where they reached 1400 Kpps).

I also had ~10% CPU usage from sha_transform(), but across all cores...


> With David patch using jhash instead of SHA, I reach ~315.000 SYN per
> second.

IMHO a faster hash is not the answer... parallel processing of SYN
packets is a better answer.  But I do think adding this faster hash as
a sysctl switch might be a good idea for people with smaller embedded
hardware.  Using it as the default might be "dangerous" and open an
attack vector on SYN cookies in Linux.
David Miller June 27, 2012, 6:54 a.m. UTC | #9
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Wed, 27 Jun 2012 08:32:13 +0200

> Using it as the default might be "dangerous" and open an attack vector
> on SYN cookies in Linux.

If it's dangerous for syncookies then it's just as dangerous for
the routing hash and the socket hashes where we use it already.

Therefore, this sounds like a baseless claim to me.
Jesper Dangaard Brouer June 27, 2012, 7:24 a.m. UTC | #10
On Tue, 2012-06-26 at 23:54 -0700, David Miller wrote:
> From: Jesper Dangaard Brouer <brouer@redhat.com>
> Date: Wed, 27 Jun 2012 08:32:13 +0200
> 
> > Using it as the default might be "dangerous" and open an attack vector
> > on SYN cookies in Linux.
> 
> If it's dangerous for syncookies then it's just as dangerous for
> the routing hash and the socket hashes where we use it already.
> 
> Therefore, this sounds like a baseless claim to me.

Yes, you are right. Looking at your patch again, you also use
syncookie_secret[c] as initval.  So, it should be safe.

But I still believe that we need to solve these SYN issues by parallel
processing of packets. (It seems Eric and Hans are looking at a
single-core SYN processing scheme, but I might have missed their point).


Eric Dumazet June 27, 2012, 7:30 a.m. UTC | #11
On Wed, 2012-06-27 at 09:24 +0200, Jesper Dangaard Brouer wrote:

> But I still believe that we need to solve these SYN issues by parallel
> processing of packets. (It seems Eric and Hans are looking at a
> single-core SYN processing scheme, but I might have missed their point).

Yep

Parallel processing will only benefit multiqueue setups.

Many Linux servers in colocations are still using a mono-queue NIC, and
the default Linux configuration is to use a single CPU to handle all
incoming frames (no RPS/RFS).

Sometimes the hw IRQ itself is distributed among several CPUs, but at any
single moment, only one CPU is serving the NAPI poll.

As long as the LISTEN processing is locking the socket, there is no
point distributing SYN packets to multiple CPUs; this only adds
contention and poor performance because of false sharing.

My plan is to get rid of the socket lock for LISTEN and use RCU instead.




Jesper Dangaard Brouer June 27, 2012, 7:54 a.m. UTC | #12
On Wed, 2012-06-27 at 09:30 +0200, Eric Dumazet wrote:
> On Wed, 2012-06-27 at 09:24 +0200, Jesper Dangaard Brouer wrote:
> 
> > But I still believe that we need to solve these SYN issues by parallel
> > processing of packets. (It seems Eric and Hans are looking at a
> > single-core SYN processing scheme, but I might have missed their point).
> 
> Yep
> 
> Parallel processing will only benefit multiqueue setups.
> 
> Many Linux servers in colocations are still using a mono-queue NIC, and
> the default Linux configuration is to use a single CPU to handle all
> incoming frames (no RPS/RFS).

I see, your target is different from mine (now I understand your
motivation).  It's good, as optimizing the single-queue case would also
be a benefit once we implement parallel processing / take advantage of
the multi-queue devices.


> Sometimes the hw IRQ itself is distributed among several CPUs, but at any
> single moment, only one CPU is serving the NAPI poll.
> 
> As long as the LISTEN processing is locking the socket, there is no
> point distributing SYN packets to multiple CPUs; this only adds
> contention and poor performance because of false sharing.
> 
> My plan is to get rid of the socket lock for LISTEN and use RCU instead.

Well, that would lead to parallel SYN processing, wouldn't it?


Eric Dumazet June 27, 2012, 8:02 a.m. UTC | #13
On Wed, 2012-06-27 at 09:54 +0200, Jesper Dangaard Brouer wrote:

> Well, that would lead to parallel SYN processing, wouldn't it?

I think we already discussed the issues of the current code.

Telling people to spread SYNs to several CPUs is a good way to have a
freeze in case of a synflood, because 15 CPUs are busy looping while one
is making progress.

That's why Intel felt the need for a hardware filter to direct all SYN
packets to a single queue.



David Miller June 27, 2012, 8:13 a.m. UTC | #14
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Wed, 27 Jun 2012 09:30:16 +0200

> Many Linux servers in colocations are still using a mono-queue NIC, and
> the default Linux configuration is to use a single CPU to handle all
> incoming frames (no RPS/RFS).

Even worse, many are virtualized guests with a single virtual netdev
queue :-)
Jesper Dangaard Brouer June 27, 2012, 8:21 a.m. UTC | #15
On Wed, 2012-06-27 at 10:02 +0200, Eric Dumazet wrote:
> On Wed, 2012-06-27 at 09:54 +0200, Jesper Dangaard Brouer wrote:
> 
> > Well, that would lead to parallel SYN processing, wouldn't it?
> 
> I think we already discussed of the current issues of current code.
> 
> Telling people to spread SYNs to several CPUs is a good way to have a
> freeze in case of a synflood, because 15 CPUs are busy looping while one
> is making progress.

Yes, that was also what I experienced (contention on the spinlock); I then
tried to address it with my parallel SYN cookie patches, and it worked
amazingly well...


> That's why Intel felt the need for a hardware filter to direct all SYN
> packets to a single queue.

It works because we have a spinlock problem in the code... Perhaps they
did it because we have a locking/contention problem, not the other
way around ;-)  How about fixing the code instead? ;-)))




David Miller June 27, 2012, 8:22 a.m. UTC | #16
From: Hans Schillstrom <hans.schillstrom@ericsson.com>
Date: Wed, 27 Jun 2012 07:23:03 +0200

> On Tuesday 26 June 2012 19:02:36 Eric Dumazet wrote:
>> With David patch using jhash instead of SHA, I reach ~315.000 SYN per
>> second.
> 
> I have similar results from ~170k to ~199k synack/sec.

Eric and Hans, I'm going to add Tested-by: tags for both of you
when I commit this patch if you don't mind. :-)
Jesper Dangaard Brouer June 27, 2012, 8:25 a.m. UTC | #17
On Wed, 2012-06-27 at 01:22 -0700, David Miller wrote:
> From: Hans Schillstrom <hans.schillstrom@ericsson.com>
> Date: Wed, 27 Jun 2012 07:23:03 +0200
> 
> > On Tuesday 26 June 2012 19:02:36 Eric Dumazet wrote:
> >> With David patch using jhash instead of SHA, I reach ~315.000 SYN per
> >> second.
> > 
> > I have similar results from ~170k to ~199k synack/sec.
> 
> Eric and Hans, I'm going to add Tested-by: tags for both of you
> when I commit this patch if you don't mind. :-)

You can also add my

Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>

(I agree with your patch, and it does not have an attack vector...)

Hans Schillstrom June 27, 2012, 8:30 a.m. UTC | #18
On Wednesday 27 June 2012 10:22:04 David Miller wrote:
> From: Hans Schillstrom <hans.schillstrom@ericsson.com>
> Date: Wed, 27 Jun 2012 07:23:03 +0200
> 
> > On Tuesday 26 June 2012 19:02:36 Eric Dumazet wrote:
> >> With David patch using jhash instead of SHA, I reach ~315.000 SYN per
> >> second.
> > 
> > I have similar results from ~170k to ~199k synack/sec.
> 
> Eric and Hans, I'm going to add Tested-by: tags for both of you
> when I commit this patch if you don't mind. :-)
> 

No problems, go ahead

Eric Dumazet June 27, 2012, 8:40 a.m. UTC | #19
On Wed, 2012-06-27 at 01:22 -0700, David Miller wrote:
> From: Hans Schillstrom <hans.schillstrom@ericsson.com>
> Date: Wed, 27 Jun 2012 07:23:03 +0200
> 
> > On Tuesday 26 June 2012 19:02:36 Eric Dumazet wrote:
> >> With David patch using jhash instead of SHA, I reach ~315.000 SYN per
> >> second.
> > 
> > I have similar results from ~170k to ~199k synack/sec.
> 
> Eric and Hans, I'm going to add Tested-by: tags for both of you
> when I commit this patch if you don't mind. :-)

Well, please send your complete patch first (with the IPv6 part)?



Eric Dumazet June 27, 2012, 8:45 a.m. UTC | #20
On Wed, 2012-06-27 at 10:21 +0200, Jesper Dangaard Brouer wrote:

> It works because we have a spinlock problem in the code... Perhaps they
> did it because we have a locking/contention problem, not the other
> way around ;-)  How about fixing the code instead? ;-)))

The socket lock is currently mandatory.

It's really _hard_ to remove it; your attempts added a lot of races.

I want to do it properly, adding the needed RCU and arrays of spinlocks
where appropriate.

Hopefully, it's easier than the RCU conversion I did for the lookups of
ESTABLISHED/TIMEWAIT sockets.


David Miller June 27, 2012, 8:48 a.m. UTC | #21
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Wed, 27 Jun 2012 10:40:00 +0200

> On Wed, 2012-06-27 at 01:22 -0700, David Miller wrote:
>> From: Hans Schillstrom <hans.schillstrom@ericsson.com>
>> Date: Wed, 27 Jun 2012 07:23:03 +0200
>> 
>> > On Tuesday 26 June 2012 19:02:36 Eric Dumazet wrote:
>> >> With David patch using jhash instead of SHA, I reach ~315.000 SYN per
>> >> second.
>> > 
>> > I have similar results from ~170k to ~199k synack/sec.
>> 
>> Eric and Hans, I'm going to add Tested-by: tags for both of you
>> when I commit this patch if you don't mind. :-)
> 
> Well, please send your complete patch first (with the IPv6 part)?

Indeed, I'll do that soon, thanks Eric.
Jesper Dangaard Brouer June 27, 2012, 9:23 a.m. UTC | #22
On Wed, 2012-06-27 at 10:45 +0200, Eric Dumazet wrote:
> On Wed, 2012-06-27 at 10:21 +0200, Jesper Dangaard Brouer wrote:
> 
> > It works because we have a spinlock problem in the code... Perhaps they
> > did it because we have a locking/contention problem, not the other
> > way around ;-)  How about fixing the code instead? ;-)))
> 
> The socket lock is currently mandatory.
>
> It's really _hard_ to remove it; your attempts added a lot of races.

Yes, it's really hard to remove completely.  That's why I chose _only_
to handle the SYN cookie "overload" case and leave the rest locked, and
I also introduced extra locking in the latest patches.  I know it was
not perfect, hence the RFC tag, but I hope I didn't add that many races.


> I want to do it properly, adding the needed RCU and arrays of spinlocks
> where appropriate.

I really appreciate that you will attempt to fix this properly.  Like a
real network ninja ;-).


Florian Westphal June 27, 2012, 7:50 p.m. UTC | #23
David Miller <davem@davemloft.net> wrote:
> From: Jesper Dangaard Brouer <brouer@redhat.com>
> Date: Wed, 27 Jun 2012 08:32:13 +0200
> 
> > Using it as the default might be "dangerous" and open an attack vector
> > on SYN cookies in Linux.
> 
> If it's dangerous for syncookies then it's just as dangerous for
> the routing hash and the socket hashes where we use it already.
> Therefore, this sounds like a baseless claim to me.

I doubt using jhash is safe for syncookies.

There are several differences to other uses in the kernel:
- all hash input except the u32 cookie_secret[2] is known
- we transmit the hash result (i.e., it's visible to a 3rd party)
- we do not re-seed the secret, ever

It should be quite easy to recompute cookie_secret[] from known syncookie
values?
Eric Dumazet June 27, 2012, 9:39 p.m. UTC | #24
On Wed, 2012-06-27 at 21:50 +0200, Florian Westphal wrote:

> I doubt using jhash is safe for syncookies.
> 
> There are several differences to other uses in the kernel:
> - all hash input except the u32 cookie_secret[2] is known
> - we transmit the hash result (i.e., it's visible to a 3rd party)
> - we do not re-seed the secret, ever
> 
> It should be quite easy to recompute cookie_secret[] from known syncookie
> values?

We could re-seed the secrets every MSL seconds, a bit like in
tcp_cookie_generator().

This would require check_tcp_syn_cookie() doing two checks (the most
recent seed, and the previous one if the first check failed).
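
Roughly (a sketch only: the helper name and the secret-generation
bookkeeping below are hypothetical, not taken from any posted patch):

	/* try the freshest secret first, then fall back to the previous
	 * generation, so cookies minted just before a re-seed remain
	 * valid for roughly one MSL
	 */
	static u32 check_syn_cookie_reseeded(const struct sk_buff *skb,
					     u32 cookie)
	{
		u32 mss;

		mss = check_cookie_with_secret(skb, cookie, cur_secret);
		if (mss == 0)
			mss = check_cookie_with_secret(skb, cookie, prev_secret);
		return mss;	/* 0 => invalid under both seeds */
	}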



David Miller June 27, 2012, 10:23 p.m. UTC | #25
From: Florian Westphal <fw@strlen.de>
Date: Wed, 27 Jun 2012 21:50:32 +0200

> - we transmit the hash result (i.e., it's visible to a 3rd party)

Indeed, that's right.
David Miller June 27, 2012, 10:23 p.m. UTC | #26
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Wed, 27 Jun 2012 23:39:20 +0200

> On Wed, 2012-06-27 at 21:50 +0200, Florian Westphal wrote:
> 
>> I doubt using jhash is safe for syncookies.
>> 
>> There are several differences to other uses in the kernel:
>> - all hash input except the u32 cookie_secret[2] is known
>> - we transmit the hash result (i.e., it's visible to a 3rd party)
>> - we do not re-seed the secret, ever
>> 
>> It should be quite easy to recompute cookie_secret[] from known syncookie
>> values?
> 
> We could re-seed the secrets every MSL seconds, a bit like in
> tcp_cookie_generator().
> 
> This would require check_tcp_syn_cookie() doing two checks (the most
> recent seed, and the previous one if the first check failed).

That could help, but I'm leaning towards not doing this at all.  Like
for the normal sequence number generation, we really can't do this.

Patch

diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index 3eb76b5..045176f 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -515,6 +515,7 @@  static int dccp_v4_send_response(struct sock *sk, struct request_sock *req,
 
 		dh->dccph_checksum = dccp_v4_csum_finish(skb, ireq->loc_addr,
 							      ireq->rmt_addr);
+		skb->priority = sk->sk_priority;
 		err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
 					    ireq->rmt_addr,
 					    ireq->opt);
@@ -556,6 +557,7 @@  static void dccp_v4_ctl_send_reset(struct sock *sk, struct sk_buff *rxskb)
 	skb_dst_set(skb, dst_clone(dst));
 
 	bh_lock_sock(ctl_sk);
+	skb->priority = ctl_sk->sk_priority;
 	err = ip_build_and_send_pkt(skb, ctl_sk,
 				    rxiph->daddr, rxiph->saddr, NULL);
 	bh_unlock_sock(ctl_sk);
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 0f3185a..71c6c20 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -155,7 +155,7 @@  int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
 		ip_options_build(skb, &opt->opt, daddr, rt, 0);
 	}
 
-	skb->priority = sk->sk_priority;
+	/* skb->priority is set by the caller */
 	skb->mark = sk->sk_mark;
 
 	/* Send it out. */
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index b52934f..5ef4131 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -81,7 +81,7 @@ 
 #include <linux/stddef.h>
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
-
+#include <linux/pkt_sched.h>
 #include <linux/crypto.h>
 #include <linux/scatterlist.h>
 
@@ -821,6 +821,7 @@  static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
  *	Send a SYN-ACK after having received a SYN.
  *	This still operates on a request_sock only, not on a big
  *	socket.
+ *	nocache is set for SYN-ACK sent in SYNCOOKIE mode
  */
 static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
 			      struct request_sock *req,
@@ -843,6 +844,10 @@  static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
 		__tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr);
 
 		skb_set_queue_mapping(skb, queue_mapping);
+
+		/* SYNACK sent in SYNCOOKIE mode have low priority */
+		skb->priority = nocache ? TC_PRIO_FILLER : sk->sk_priority;
+
 		err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
 					    ireq->rmt_addr,
 					    ireq->opt);
diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c
index e6cee52..5812a74 100644
--- a/net/ipv6/inet6_connection_sock.c
+++ b/net/ipv6/inet6_connection_sock.c
@@ -248,6 +248,7 @@  int inet6_csk_xmit(struct sk_buff *skb, struct flowi *fl_unused)
 	/* Restore final destination back after routing done */
 	fl6.daddr = np->daddr;
 
+	skb->priority = sk->sk_priority;
 	res = ip6_xmit(sk, skb, &fl6, np->opt, np->tclass);
 	rcu_read_unlock();
 	return res;
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index a233a7c..a93378a 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -228,7 +228,7 @@  int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
 	hdr->saddr = fl6->saddr;
 	hdr->daddr = *first_hop;
 
-	skb->priority = sk->sk_priority;
+	/* skb->priority is set by the caller */
 	skb->mark = sk->sk_mark;
 
 	mtu = dst_mtu(dst);
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 26a8862..f664452 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -43,6 +43,7 @@ 
 #include <linux/ipv6.h>
 #include <linux/icmpv6.h>
 #include <linux/random.h>
+#include <linux/pkt_sched.h>
 
 #include <net/tcp.h>
 #include <net/ndisc.h>
@@ -479,7 +480,8 @@  out:
 
 static int tcp_v6_send_synack(struct sock *sk, struct request_sock *req,
 			      struct request_values *rvp,
-			      u16 queue_mapping)
+			      u16 queue_mapping,
+			      bool syncookie)
 {
 	struct inet6_request_sock *treq = inet6_rsk(req);
 	struct ipv6_pinfo *np = inet6_sk(sk);
@@ -515,6 +517,7 @@  static int tcp_v6_send_synack(struct sock *sk, struct request_sock *req,
 	if (skb) {
 		__tcp_v6_send_check(skb, &treq->loc_addr, &treq->rmt_addr);
 
+		skb->priority = syncookie ? TC_PRIO_FILLER : sk->sk_priority;
 		fl6.daddr = treq->rmt_addr;
 		skb_set_queue_mapping(skb, queue_mapping);
 		err = ip6_xmit(sk, skb, &fl6, opt, np->tclass);
@@ -531,7 +534,7 @@  static int tcp_v6_rtx_synack(struct sock *sk, struct request_sock *req,
 			     struct request_values *rvp)
 {
 	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
-	return tcp_v6_send_synack(sk, req, rvp, 0);
+	return tcp_v6_send_synack(sk, req, rvp, 0, false);
 }
 
 static void tcp_v6_reqsk_destructor(struct request_sock *req)
@@ -909,6 +912,7 @@  static void tcp_v6_send_response(struct sk_buff *skb, u32 seq, u32 ack, u32 win,
 	dst = ip6_dst_lookup_flow(ctl_sk, &fl6, NULL, false);
 	if (!IS_ERR(dst)) {
 		skb_dst_set(buff, dst);
+		skb->priority = ctl_sk->sk_priority;
 		ip6_xmit(ctl_sk, buff, &fl6, NULL, tclass);
 		TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
 		if (rst)
@@ -1217,7 +1221,8 @@  have_isn:
 
 	if (tcp_v6_send_synack(sk, req,
 			       (struct request_values *)&tmp_ext,
-			       skb_get_queue_mapping(skb)) ||
+			       skb_get_queue_mapping(skb),
+			       want_cookie) ||
 	    want_cookie)
 		goto drop_and_free;