diff mbox

PROBLEM: Linux kernel 2.6.31 IPv4 TCP fails to open huge amount of outgoing connections (unable to bind ... )

Message ID 1271877975.7895.3171.camel@edumazet-laptop
State Accepted, archived
Delegated to: David Miller
Headers show

Commit Message

Eric Dumazet April 21, 2010, 7:26 p.m. UTC
Le mercredi 21 avril 2010 à 22:58 +0400, Evgeniy Polyakov a écrit :

> Damn it, I tried multiple times :)
> You are right of course!
> 

Here is a formal patch then :)

[PATCH] tcp: bind() fix when many ports are bound

Port autoselection done by kernel only works when number of bound
sockets is under a threshold (typically 30000).

When this threshold is over, we must check if there is a conflict before
exiting first loop in inet_csk_get_port()

Change inet_csk_bind_conflict() to forbid two reuse-enabled sockets to
bind on same (address,port) tuple (with a non ANY address)

Same change for inet6_csk_bind_conflict()

Reported-by: Gaspar Chilingarov <gasparch@gmail.com>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
 net/ipv4/inet_connection_sock.c  |   16 +++++++++++-----
 net/ipv6/inet6_connection_sock.c |   15 ++++++++++-----
 2 files changed, 21 insertions(+), 10 deletions(-)



--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Comments

Evgeniy Polyakov April 21, 2010, 8:08 p.m. UTC | #1
On Wed, Apr 21, 2010 at 09:26:15PM +0200, Eric Dumazet (eric.dumazet@gmail.com) wrote:
> Le mercredi 21 avril 2010 à 22:58 +0400, Evgeniy Polyakov a écrit :
> 
> > Damn it, I tried multiple times :)
> > You are right of course!
> > 
> 
> Here is a formal patch then :)

Ack. Thank you Eric!
David Miller April 23, 2010, 2:06 a.m. UTC | #2
From: Evgeniy Polyakov <zbr@ioremap.net>
Date: Thu, 22 Apr 2010 00:08:02 +0400

> On Wed, Apr 21, 2010 at 09:26:15PM +0200, Eric Dumazet (eric.dumazet@gmail.com) wrote:
>> Le mercredi 21 avril 2010 à 22:58 +0400, Evgeniy Polyakov a écrit :
>> 
>> > Damn it, I tried multiple times :)
>> > You are right of course!
>> > 
>> 
>> Here is a formal patch then :)
> 
> Ack. Thank you Eric!

This code is way too complex :-)

Applied, thanks everyone!
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Michael S. Tsirkin April 25, 2010, 2:26 p.m. UTC | #3
On Wed, Apr 21, 2010 at 09:26:15PM +0200, Eric Dumazet wrote:
> Le mercredi 21 avril 2010 à 22:58 +0400, Evgeniy Polyakov a écrit :
> 
> > Damn it, I tried multiple times :)
> > You are right of course!
> > 
> 
> Here is a formal patch then :)
> 
> [PATCH] tcp: bind() fix when many ports are bound
> 
> Port autoselection done by kernel only works when number of bound
> sockets is under a threshold (typically 30000).
> 
> When this threshold is over, we must check if there is a conflict before
> exiting first loop in inet_csk_get_port()
> 
> Change inet_csk_bind_conflict() to forbid two reuse-enabled sockets to
> bind on same (address,port) tuple (with a non ANY address)
> 
> Same change for inet6_csk_bind_conflict()
> 
> Reported-by: Gaspar Chilingarov <gasparch@gmail.com>
> Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
> ---
>  net/ipv4/inet_connection_sock.c  |   16 +++++++++++-----
>  net/ipv6/inet6_connection_sock.c |   15 ++++++++++-----
>  2 files changed, 21 insertions(+), 10 deletions(-)
> 
> diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
> index e0a3e35..78cbc39 100644
> --- a/net/ipv4/inet_connection_sock.c
> +++ b/net/ipv4/inet_connection_sock.c
> @@ -70,13 +70,17 @@ int inet_csk_bind_conflict(const struct sock *sk,
>  		    (!sk->sk_bound_dev_if ||
>  		     !sk2->sk_bound_dev_if ||
>  		     sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
> +			const __be32 sk2_rcv_saddr = inet_rcv_saddr(sk2);
> +
>  			if (!reuse || !sk2->sk_reuse ||
>  			    sk2->sk_state == TCP_LISTEN) {
> -				const __be32 sk2_rcv_saddr = inet_rcv_saddr(sk2);
>  				if (!sk2_rcv_saddr || !sk_rcv_saddr ||
>  				    sk2_rcv_saddr == sk_rcv_saddr)
>  					break;
> -			}
> +			} else if (reuse && sk2->sk_reuse &&
> +				   sk2_rcv_saddr &&
> +				   sk2_rcv_saddr == sk_rcv_saddr)
> +				break;
>  		}
>  	}
>  	return node != NULL;
> @@ -120,9 +124,11 @@ again:
>  						smallest_size = tb->num_owners;
>  						smallest_rover = rover;
>  						if (atomic_read(&hashinfo->bsockets) > (high - low) + 1) {
> -							spin_unlock(&head->lock);
> -							snum = smallest_rover;
> -							goto have_snum;
> +							if (!inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb)) {
> +								spin_unlock(&head->lock);
> +								snum = smallest_rover;
> +								goto have_snum;
> +							}
>  						}
>  					}
>  					goto next;
> diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c
> index 0c5e3c3..fb6959c 100644
> --- a/net/ipv6/inet6_connection_sock.c
> +++ b/net/ipv6/inet6_connection_sock.c
> @@ -42,11 +42,16 @@ int inet6_csk_bind_conflict(const struct sock *sk,
>  		if (sk != sk2 &&
>  		    (!sk->sk_bound_dev_if ||
>  		     !sk2->sk_bound_dev_if ||
> -		     sk->sk_bound_dev_if == sk2->sk_bound_dev_if) &&
> -		    (!sk->sk_reuse || !sk2->sk_reuse ||
> -		     sk2->sk_state == TCP_LISTEN) &&
> -		     ipv6_rcv_saddr_equal(sk, sk2))
> -			break;
> +		     sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
> +			if ((!sk->sk_reuse || !sk2->sk_reuse ||
> +			     sk2->sk_state == TCP_LISTEN) &&
> +			     ipv6_rcv_saddr_equal(sk, sk2))
> +				break;
> +			else if (sk->sk_reuse && sk2->sk_reuse &&
> +				!ipv6_addr_any(inet6_rcv_saddr(sk2)) &&
> +				ipv6_rcv_saddr_equal(sk, sk2))
> +				break;
> +		}
>  	}
>  
>  	return node != NULL;
> 

With this applied, my box crashes on boot:
rhel6 beta userspace, v2.6.34-rc5-204-gddc9b34 kernel.
2.6.34-rc5 kernel boots fine.
the crash seems to be around net/ipv6/inet6_connection_sock.c:50
after reverting fda48a0d7a8412cedacda46a9c0bf8ef9cd13559,
the crash goes away.

I created https://bugzilla.kernel.org/show_bug.cgi?id=15847
to track this.

Oops below:

BUG: unable to handle kernel NULL pointer dereference at
0000000000000004
IP: [<ffffffffa02b99aa>] inet6_csk_bind_conflict+0x6a/0x110 [ipv6]
PGD 0 
Oops: 0000 [#1] SMP 
last sysfs file:
/sys/devices/pci0000:00/0000:00:01.0/0000:01:00.0/net/eth0/ifindex
CPU 9 
Modules linked in: ip6t_REJECT nf_conntrack_ipv6 ip6table_filter
ip6_tables ipv6 dm_mirror dm_region_hash dm_log igb i2c_i801 sg iTCO_wdt
iTCO_vendor_support shpchp ioatdma dca pcspkr sr_mod cdrom ext4 mbcache
jbd2 sd_mod ata_generic crc_t10dif pata_acpi ahci pata_jmicron radeon
ttm drm_kms_helper drm i2c_algo_bit i2c_core dm_mod [last unloaded:
scsi_wait_scan]

Pid: 1640, comm: master Not tainted 2.6.34-rc5-mst #1 X8DTN/X8DTN
RIP: 0010:[<ffffffffa02b99aa>]  [<ffffffffa02b99aa>]
inet6_csk_bind_conflict+0x6a/0x110 [ipv6]
RSP: 0018:ffff8803357a7d98  EFLAGS: 00010293
RAX: 0000000000000000 RBX: ffff880335709440 RCX: 0000000000000000
RDX: 0000000000020011 RSI: ffff880335709440 RDI: ffff880334c61e78
RBP: ffff8803357a7db8 R08: 0000000000000019 R09: 0000000000000019
R10: 00000000000000d4 R11: 0000000000000400 R12: ffff880335709468
R13: ffff880334c61800 R14: ffff880335489500 R15: ffffffff8225d700
FS:  00007feacd26f7c0(0000) GS:ffff8801c5700000(0000)
knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000000000000004 CR3: 00000003341ef000 CR4: 00000000000006e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
Process master (pid: 1640, threadinfo ffff8803357a6000, task
ffff880334225540)
Stack:
 0000000000000000 ffffffff8225b500 ffffc9001251ced0 ffff880334c61800
<0> ffff8803357a7e48 ffffffff81418fa8 ffff880300000019 ffffffff8149ceb6
<0> 0000000536306140 0000000000000246 ffff8803357a7e08 0000000000000246
Call Trace:
 [<ffffffff81418fa8>] inet_csk_get_port+0x238/0x450
 [<ffffffff8149ceb6>] ? _raw_spin_lock_bh+0x16/0x40
 [<ffffffff8149ce15>] ? _raw_read_unlock_bh+0x15/0x20
 [<ffffffffa0290226>] ? ipv6_chk_addr+0xe6/0x100 [ipv6]
Evgeniy Polyakov April 25, 2010, 3:56 p.m. UTC | #4
On Sun, Apr 25, 2010 at 05:26:42PM +0300, Michael S. Tsirkin (mst@redhat.com) wrote:

> > diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c
> > index 0c5e3c3..fb6959c 100644
> > --- a/net/ipv6/inet6_connection_sock.c
> > +++ b/net/ipv6/inet6_connection_sock.c
> > @@ -42,11 +42,16 @@ int inet6_csk_bind_conflict(const struct sock *sk,
> >  		if (sk != sk2 &&
> >  		    (!sk->sk_bound_dev_if ||
> >  		     !sk2->sk_bound_dev_if ||
> > -		     sk->sk_bound_dev_if == sk2->sk_bound_dev_if) &&
> > -		    (!sk->sk_reuse || !sk2->sk_reuse ||
> > -		     sk2->sk_state == TCP_LISTEN) &&
> > -		     ipv6_rcv_saddr_equal(sk, sk2))
> > -			break;
> > +		     sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
> > +			if ((!sk->sk_reuse || !sk2->sk_reuse ||
> > +			     sk2->sk_state == TCP_LISTEN) &&
> > +			     ipv6_rcv_saddr_equal(sk, sk2))
> > +				break;
> > +			else if (sk->sk_reuse && sk2->sk_reuse &&
> > +				!ipv6_addr_any(inet6_rcv_saddr(sk2)) &&

I suppose above line is guilty when inet6_rcv_saddr() returns NULL?
Eric Dumazet April 25, 2010, 4:13 p.m. UTC | #5
Le dimanche 25 avril 2010 à 19:56 +0400, Evgeniy Polyakov a écrit :
> On Sun, Apr 25, 2010 at 05:26:42PM +0300, Michael S. Tsirkin (mst@redhat.com) wrote:
> 
> > > diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c
> > > index 0c5e3c3..fb6959c 100644
> > > --- a/net/ipv6/inet6_connection_sock.c
> > > +++ b/net/ipv6/inet6_connection_sock.c
> > > @@ -42,11 +42,16 @@ int inet6_csk_bind_conflict(const struct sock *sk,
> > >  		if (sk != sk2 &&
> > >  		    (!sk->sk_bound_dev_if ||
> > >  		     !sk2->sk_bound_dev_if ||
> > > -		     sk->sk_bound_dev_if == sk2->sk_bound_dev_if) &&
> > > -		    (!sk->sk_reuse || !sk2->sk_reuse ||
> > > -		     sk2->sk_state == TCP_LISTEN) &&
> > > -		     ipv6_rcv_saddr_equal(sk, sk2))
> > > -			break;
> > > +		     sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
> > > +			if ((!sk->sk_reuse || !sk2->sk_reuse ||
> > > +			     sk2->sk_state == TCP_LISTEN) &&
> > > +			     ipv6_rcv_saddr_equal(sk, sk2))
> > > +				break;
> > > +			else if (sk->sk_reuse && sk2->sk_reuse &&
> > > +				!ipv6_addr_any(inet6_rcv_saddr(sk2)) &&
> 
> I suppose above line is guilty when inet6_rcv_saddr() returns NULL?
> 

Oh its a typo

we should test ipv6_addr_any(inet6_rcv_saddr(sk))

instead of ipv6_addr_any(inet6_rcv_saddr(sk2))

(sk is AF_INET6, while sk2 could be AF_INET)

I'll submit a patch promptly


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index e0a3e35..78cbc39 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -70,13 +70,17 @@  int inet_csk_bind_conflict(const struct sock *sk,
 		    (!sk->sk_bound_dev_if ||
 		     !sk2->sk_bound_dev_if ||
 		     sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
+			const __be32 sk2_rcv_saddr = inet_rcv_saddr(sk2);
+
 			if (!reuse || !sk2->sk_reuse ||
 			    sk2->sk_state == TCP_LISTEN) {
-				const __be32 sk2_rcv_saddr = inet_rcv_saddr(sk2);
 				if (!sk2_rcv_saddr || !sk_rcv_saddr ||
 				    sk2_rcv_saddr == sk_rcv_saddr)
 					break;
-			}
+			} else if (reuse && sk2->sk_reuse &&
+				   sk2_rcv_saddr &&
+				   sk2_rcv_saddr == sk_rcv_saddr)
+				break;
 		}
 	}
 	return node != NULL;
@@ -120,9 +124,11 @@  again:
 						smallest_size = tb->num_owners;
 						smallest_rover = rover;
 						if (atomic_read(&hashinfo->bsockets) > (high - low) + 1) {
-							spin_unlock(&head->lock);
-							snum = smallest_rover;
-							goto have_snum;
+							if (!inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb)) {
+								spin_unlock(&head->lock);
+								snum = smallest_rover;
+								goto have_snum;
+							}
 						}
 					}
 					goto next;
diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c
index 0c5e3c3..fb6959c 100644
--- a/net/ipv6/inet6_connection_sock.c
+++ b/net/ipv6/inet6_connection_sock.c
@@ -42,11 +42,16 @@  int inet6_csk_bind_conflict(const struct sock *sk,
 		if (sk != sk2 &&
 		    (!sk->sk_bound_dev_if ||
 		     !sk2->sk_bound_dev_if ||
-		     sk->sk_bound_dev_if == sk2->sk_bound_dev_if) &&
-		    (!sk->sk_reuse || !sk2->sk_reuse ||
-		     sk2->sk_state == TCP_LISTEN) &&
-		     ipv6_rcv_saddr_equal(sk, sk2))
-			break;
+		     sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
+			if ((!sk->sk_reuse || !sk2->sk_reuse ||
+			     sk2->sk_state == TCP_LISTEN) &&
+			     ipv6_rcv_saddr_equal(sk, sk2))
+				break;
+			else if (sk->sk_reuse && sk2->sk_reuse &&
+				!ipv6_addr_any(inet6_rcv_saddr(sk2)) &&
+				ipv6_rcv_saddr_equal(sk, sk2))
+				break;
+		}
 	}
 
 	return node != NULL;