diff mbox

[net-next,v4,3/3] net: reserve ports for applications using fixed port numbers

Message ID 1266271241-6293-4-git-send-email-opurdila@ixiacom.com
State Changes Requested, archived
Delegated to: David Miller
Headers show

Commit Message

Octavian Purdila Feb. 15, 2010, 10 p.m. UTC
This patch introduces /proc/sys/net/ipv4/ip_local_reserved_ports
(bitmap type) which allows users to reserve ports for third-party
applications.

The reserved ports will not be used by automatic port assignments
(e.g. when calling connect() or bind() with port number 0). Explicit
port allocation behavior is unchanged.

Signed-off-by: Octavian Purdila <opurdila@ixiacom.com>
Signed-off-by: WANG Cong <amwang@redhat.com>
Cc: Neil Horman <nhorman@tuxdriver.com>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
---
 Documentation/networking/ip-sysctl.txt |   12 ++++++++++++
 drivers/infiniband/core/cma.c          |    7 ++++++-
 include/net/ip.h                       |    6 ++++++
 net/ipv4/af_inet.c                     |    8 +++++++-
 net/ipv4/inet_connection_sock.c        |    6 ++++++
 net/ipv4/inet_hashtables.c             |    2 ++
 net/ipv4/sysctl_net_ipv4.c             |   17 +++++++++++++++++
 net/ipv4/udp.c                         |    3 ++-
 net/sctp/socket.c                      |    2 ++
 9 files changed, 60 insertions(+), 3 deletions(-)

Comments

Amerigo Wang Feb. 16, 2010, 9:37 a.m. UTC | #1
Octavian Purdila wrote:
> This patch introduces /proc/sys/net/ipv4/ip_local_reserved_ports
> (bitmap type) which allows users to reserve ports for third-party
> applications.
> 
> The reserved ports will not be used by automatic port assignments
> (e.g. when calling connect() or bind() with port number 0). Explicit
> port allocation behavior is unchanged.
> 
> Signed-off-by: Octavian Purdila <opurdila@ixiacom.com>
> Signed-off-by: WANG Cong <amwang@redhat.com>
> Cc: Neil Horman <nhorman@tuxdriver.com>
> Cc: Eric Dumazet <eric.dumazet@gmail.com>
> ---
>  Documentation/networking/ip-sysctl.txt |   12 ++++++++++++
>  drivers/infiniband/core/cma.c          |    7 ++++++-
>  include/net/ip.h                       |    6 ++++++
>  net/ipv4/af_inet.c                     |    8 +++++++-
>  net/ipv4/inet_connection_sock.c        |    6 ++++++
>  net/ipv4/inet_hashtables.c             |    2 ++
>  net/ipv4/sysctl_net_ipv4.c             |   17 +++++++++++++++++
>  net/ipv4/udp.c                         |    3 ++-
>  net/sctp/socket.c                      |    2 ++
>  9 files changed, 60 insertions(+), 3 deletions(-)
> 
> diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
> index 2dc7a1d..23be7a4 100644
> --- a/Documentation/networking/ip-sysctl.txt
> +++ b/Documentation/networking/ip-sysctl.txt
> @@ -564,6 +564,18 @@ ip_local_port_range - 2 INTEGERS
>  	(i.e. by default) range 1024-4999 is enough to issue up to
>  	2000 connections per second to systems supporting timestamps.
>  
> +ip_local_reserved_ports - BITMAP of 65536 ports
> +	Specify the ports which are reserved for known third-party
> +	applications. These ports will not be used by automatic port assignments
> +	(e.g. when calling connect() or bind() with port number 0). Explicit
> +	port allocation behavior is unchanged.
> +
> +	Reserving ports is done by writing positive numbers in this proc entry,
> +	clearing them is done by writing negative numbers (e.g. 8080 reserves
> +	port number, -8080 makes it available for automatic assignment again).
> +
> +	Default: Empty
> +
>  ip_nonlocal_bind - BOOLEAN
>  	If set, allows processes to bind() to non-local IP addresses,
>  	which can be quite useful - but may break some applications.
> diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
> index cc9b594..8248fc6 100644
> --- a/drivers/infiniband/core/cma.c
> +++ b/drivers/infiniband/core/cma.c
> @@ -1979,6 +1979,8 @@ retry:
>  	/* FIXME: add proper port randomization per like inet_csk_get_port */
>  	do {
>  		ret = idr_get_new_above(ps, bind_list, next_port, &port);
> +		if (inet_is_reserved_local_port(port))
> +			ret = -EAGAIN;
>  	} while ((ret == -EAGAIN) && idr_pre_get(ps, GFP_KERNEL));
>  
>  	if (ret)
> @@ -2997,10 +2999,13 @@ static int __init cma_init(void)
>  {
>  	int ret, low, high, remaining;
>  
> -	get_random_bytes(&next_port, sizeof next_port);
>  	inet_get_local_port_range(&low, &high);
> +again:
> +	get_random_bytes(&next_port, sizeof next_port);
>  	remaining = (high - low) + 1;
>  	next_port = ((unsigned int) next_port % remaining) + low;
> +	if (inet_is_reserved_local_port(next_port))
> +		goto again;
>  
>  	cma_wq = create_singlethread_workqueue("rdma_cm");
>  	if (!cma_wq)
> diff --git a/include/net/ip.h b/include/net/ip.h
> index fb63371..2e24256 100644
> --- a/include/net/ip.h
> +++ b/include/net/ip.h
> @@ -184,6 +184,12 @@ extern struct local_ports {
>  } sysctl_local_ports;
>  extern void inet_get_local_port_range(int *low, int *high);
>  
> +extern unsigned long *sysctl_local_reserved_ports;
> +static inline int inet_is_reserved_local_port(int port)
> +{
> +	return test_bit(port, sysctl_local_reserved_ports);
> +}
> +
>  extern int sysctl_ip_default_ttl;
>  extern int sysctl_ip_nonlocal_bind;
>  
> diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
> index 7d12c6a..06810b0 100644
> --- a/net/ipv4/af_inet.c
> +++ b/net/ipv4/af_inet.c
> @@ -1546,9 +1546,13 @@ static int __init inet_init(void)
>  
>  	BUILD_BUG_ON(sizeof(struct inet_skb_parm) > sizeof(dummy_skb->cb));
>  
> +	sysctl_local_reserved_ports = kzalloc(65536 / 8, GFP_KERNEL);
> +	if (!sysctl_local_reserved_ports)
> +		goto out;
> +


I think we should also consider the ports in ip_local_port_range,
since we can only reserve the ports in that range.


>  	rc = proto_register(&tcp_prot, 1);
>  	if (rc)
> -		goto out;
> +		goto out_free_reserved_ports;
>  
>  	rc = proto_register(&udp_prot, 1);
>  	if (rc)
> @@ -1647,6 +1651,8 @@ out_unregister_udp_proto:
>  	proto_unregister(&udp_prot);
>  out_unregister_tcp_proto:
>  	proto_unregister(&tcp_prot);
> +out_free_reserved_ports:
> +	kfree(sysctl_local_reserved_ports);
>  	goto out;
>  }
>  
> diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
> index 8da6429..1acb462 100644
> --- a/net/ipv4/inet_connection_sock.c
> +++ b/net/ipv4/inet_connection_sock.c
> @@ -37,6 +37,9 @@ struct local_ports sysctl_local_ports __read_mostly = {
>  	.range = { 32768, 61000 },
>  };
>  
> +unsigned long *sysctl_local_reserved_ports;
> +EXPORT_SYMBOL(sysctl_local_reserved_ports);
> +
>  void inet_get_local_port_range(int *low, int *high)
>  {
>  	unsigned seq;
> @@ -108,6 +111,8 @@ again:
>  
>  		smallest_size = -1;
>  		do {
> +			if (inet_is_reserved_local_port(rover))
> +				goto next_nolock;
>  			head = &hashinfo->bhash[inet_bhashfn(net, rover,
>  					hashinfo->bhash_size)];
>  			spin_lock(&head->lock);
> @@ -130,6 +135,7 @@ again:
>  			break;
>  		next:
>  			spin_unlock(&head->lock);
> +		next_nolock:
>  			if (++rover > high)
>  				rover = low;
>  		} while (--remaining > 0);
> diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
> index 2b79377..d3e160a 100644
> --- a/net/ipv4/inet_hashtables.c
> +++ b/net/ipv4/inet_hashtables.c
> @@ -456,6 +456,8 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
>  		local_bh_disable();
>  		for (i = 1; i <= remaining; i++) {
>  			port = low + (i + offset) % remaining;
> +			if (inet_is_reserved_local_port(port))
> +				continue;
>  			head = &hinfo->bhash[inet_bhashfn(net, port,
>  					hinfo->bhash_size)];
>  			spin_lock(&head->lock);
> diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
> index 7e3712c..ce3597a 100644
> --- a/net/ipv4/sysctl_net_ipv4.c
> +++ b/net/ipv4/sysctl_net_ipv4.c
> @@ -298,6 +298,13 @@ static struct ctl_table ipv4_table[] = {
>  		.mode		= 0644,
>  		.proc_handler	= ipv4_local_port_range,
>  	},
> +	{
> +		.procname	= "ip_local_reserved_ports",
> +		.data		= NULL, /* initialized in sysctl_ipv4_init */
> +		.maxlen		= 65536,
> +		.mode		= 0644,
> +		.proc_handler	= proc_dobitmap,
> +	},


Isn't there an off-by-one here?

In patch 2/3, you use 0 to set the fist bit, then how about 65535 which
writes 65536th bit? This is beyond the range of port number.


>  #ifdef CONFIG_IP_MULTICAST
>  	{
>  		.procname	= "igmp_max_memberships",
> @@ -721,6 +728,16 @@ static __net_initdata struct pernet_operations ipv4_sysctl_ops = {
>  static __init int sysctl_ipv4_init(void)
>  {
>  	struct ctl_table_header *hdr;
> +	struct ctl_table *i;
> +
> +	for (i = ipv4_table; i->procname; i++) {
> +		if (strcmp(i->procname, "ip_local_reserved_ports") == 0) {
> +			i->data = sysctl_local_reserved_ports;
> +			break;
> +		}
> +	}
> +	if (!i->procname[0])
> +		return -EINVAL;
>  
>  	hdr = register_sysctl_paths(net_ipv4_ctl_path, ipv4_table);
>  	if (hdr == NULL)
> diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
> index 608a544..bfd0a6a 100644
> --- a/net/ipv4/udp.c
> +++ b/net/ipv4/udp.c
> @@ -232,7 +232,8 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,
>  			 */
>  			do {
>  				if (low <= snum && snum <= high &&
> -				    !test_bit(snum >> udptable->log, bitmap))
> +				    !test_bit(snum >> udptable->log, bitmap) &&
> +				    !inet_is_reserved_local_port(snum))
>  					goto found;
>  				snum += rand;
>  			} while (snum != first);
> diff --git a/net/sctp/socket.c b/net/sctp/socket.c
> index f6d1e59..1f839d0 100644
> --- a/net/sctp/socket.c
> +++ b/net/sctp/socket.c
> @@ -5432,6 +5432,8 @@ static long sctp_get_port_local(struct sock *sk, union sctp_addr *addr)
>  			rover++;
>  			if ((rover < low) || (rover > high))
>  				rover = low;
> +			if (inet_is_reserved_local_port(rover))
> +				continue;
>  			index = sctp_phashfn(rover);
>  			head = &sctp_port_hashtable[index];
>  			sctp_spin_lock(&head->lock);

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Octavian Purdila Feb. 16, 2010, 11:06 a.m. UTC | #2
On Tuesday 16 February 2010 11:37:04 you wrote:
> >  	BUILD_BUG_ON(sizeof(struct inet_skb_parm) > sizeof(dummy_skb->cb));
> >
> > +	sysctl_local_reserved_ports = kzalloc(65536 / 8, GFP_KERNEL);
> > +	if (!sysctl_local_reserved_ports)
> > +		goto out;
> > +
> 
> I think we should also consider the ports in ip_local_port_range,
> since we can only reserve the ports in that range.
> 

That is subject to changes at runtime, which means we will have to readjust 
the bitmap at runtime which introduces the need for additional synchronization 
operations which I would rather avoid. 

> > +	{
> > +		.procname	= "ip_local_reserved_ports",
> > +		.data		= NULL, /* initialized in sysctl_ipv4_init */
> > +		.maxlen		= 65536,
> > +		.mode		= 0644,
> > +		.proc_handler	= proc_dobitmap,
> > +	},
> 
> Isn't there an off-by-one here?
> 
> In patch 2/3, you use 0 to set the fist bit, then how about 65535 which
> writes 65536th bit? This is beyond the range of port number.
> 

This seems fine to me, 65535 is the value used by both the port checking 
function and the proc read/write function. And it translates indeed to  
65536th bit, but that is also bit 65535 if you start counting bits from 0 
instead of 1. The usual computing/natural arithmetic confusion for the meaning 
of first :)

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Amerigo Wang Feb. 16, 2010, 1:06 p.m. UTC | #3
Octavian Purdila wrote:
> On Tuesday 16 February 2010 11:37:04 you wrote:
>>>  	BUILD_BUG_ON(sizeof(struct inet_skb_parm) > sizeof(dummy_skb->cb));
>>>
>>> +	sysctl_local_reserved_ports = kzalloc(65536 / 8, GFP_KERNEL);
>>> +	if (!sysctl_local_reserved_ports)
>>> +		goto out;
>>> +
>> I think we should also consider the ports in ip_local_port_range,
>> since we can only reserve the ports in that range.
>>
> 
> That is subject to changes at runtime, which means we will have to readjust 
> the bitmap at runtime which introduces the need for additional synchronization 
> operations which I would rather avoid. 

Why? As long as the bitmap is global, this will not be hard.

Consider that if one user writes a port number which is beyond
the ip_local_port_range into ip_local_reserved_ports, we should
not accept this, because it doesn't make any sense. But with your
patch, we do.


> 
>>> +	{
>>> +		.procname	= "ip_local_reserved_ports",
>>> +		.data		= NULL, /* initialized in sysctl_ipv4_init */
>>> +		.maxlen		= 65536,
>>> +		.mode		= 0644,
>>> +		.proc_handler	= proc_dobitmap,
>>> +	},
>> Isn't there an off-by-one here?
>>
>> In patch 2/3, you use 0 to set the fist bit, then how about 65535 which
>> writes 65536th bit? This is beyond the range of port number.
>>
> 
> This seems fine to me, 65535 is the value used by both the port checking 
> function and the proc read/write function. And it translates indeed to  
> 65536th bit, but that is also bit 65535 if you start counting bits from 0 
> instead of 1. The usual computing/natural arithmetic confusion for the meaning 
> of first :)
> 

Oh, I see.

Thanks.

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eric Dumazet Feb. 16, 2010, 1:20 p.m. UTC | #4
Le mardi 16 février 2010 à 21:06 +0800, Cong Wang a écrit :
> Octavian Purdila wrote:
> > On Tuesday 16 February 2010 11:37:04 you wrote:
> >>>  	BUILD_BUG_ON(sizeof(struct inet_skb_parm) > sizeof(dummy_skb->cb));
> >>>
> >>> +	sysctl_local_reserved_ports = kzalloc(65536 / 8, GFP_KERNEL);
> >>> +	if (!sysctl_local_reserved_ports)
> >>> +		goto out;
> >>> +
> >> I think we should also consider the ports in ip_local_port_range,
> >> since we can only reserve the ports in that range.
> >>
> > 
> > That is subject to changes at runtime, which means we will have to readjust 
> > the bitmap at runtime which introduces the need for additional synchronization 
> > operations which I would rather avoid. 
> 
> Why? As long as the bitmap is global, this will not be hard.
> 
> Consider that if one user writes a port number which is beyond
> the ip_local_port_range into ip_local_reserved_ports, we should
> not accept this, because it doesn't make any sense. But with your
> patch, we do.

I disagree with you. This is perfectly OK.

A port not being flagged in ip_local_reserved_ports doesnt mean it can
be used for allocation.

If you want to really block ports from being used at boot, you could for
example :

# temporarly reduce the ip_local_port_range
echo "61000 61001" >/proc/sys/net/ipv4/ip_local_port_range
# Build our bitmap (could be slow, if a remote database is read)
for port in $LIST_RESERVED_PORT
do
  echo $port >/proc/sys/net/ipv4/ip_local_reserved_ports
done
echo "10000 61000" >/proc/sys/net/ipv4/ip_local_port_range


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Octavian Purdila Feb. 16, 2010, 2:25 p.m. UTC | #5
On Tuesday 16 February 2010 15:06:26 you wrote:
> Octavian Purdila wrote:
> > On Tuesday 16 February 2010 11:37:04 you wrote:
> >>>  	BUILD_BUG_ON(sizeof(struct inet_skb_parm) > sizeof(dummy_skb->cb));
> >>>
> >>> +	sysctl_local_reserved_ports = kzalloc(65536 / 8, GFP_KERNEL);
> >>> +	if (!sysctl_local_reserved_ports)
> >>> +		goto out;
> >>> +
> >>
> >> I think we should also consider the ports in ip_local_port_range,
> >> since we can only reserve the ports in that range.
> >
> > That is subject to changes at runtime, which means we will have to
> > readjust the bitmap at runtime which introduces the need for additional
> > synchronization operations which I would rather avoid.
> 
> Why? As long as the bitmap is global, this will not be hard.
> 

For the more important point see bellow, but with regard to reallocation, this 
means we need to at least use rcu_read_lock() in the fast path to avoid races 
between freeing the old bitmap and doing a read in progress. 

Granted, that is a light operation, but would it makes things so much more 
complicated just so that we save one memory page (assuming the range is the 
default [32000 64000] one).

> Consider that if one user writes a port number which is beyond
> the ip_local_port_range into ip_local_reserved_ports, we should
> not accept this, because it doesn't make any sense. But with your
> patch, we do.
> 

I think it should be allowed. I see ip_local_reserved_ports and ip_local_range 
as independent settings that can be change at any time.

That way I can flag port 8080 even if the current range is [32000, 64000] and 
then later I can expand the range to [1024, 64000] without loosing the 8080 
reservation.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Octavian Purdila Feb. 17, 2010, 4:01 p.m. UTC | #6
On Wednesday 17 February 2010 18:39:28 you wrote:
> Le jeudi 18 février 2010 à 00:13 +0800, Cong Wang a écrit :
> > I don't think so, if you want to avoid race condition, you just need to
> > write the reserved ports before any networking application starts, IOW,
> > as early as possible during boot.
> 
> Sure, but I was thinking retrieving the list of reserved port by a
> database query, using network :)
> 
> Anyway, I just feel your argument is not applicable.
> 
> Our kernel is capable of doing an intersection for us, we dont need
> to forbid user to mark a port as 'reserved' if this port is already
> blacklisted by another mechanism (for example, if this port is already
> in use)
> 

Also I believe that ip_local_port_range purpose is not to reserve *specific* 
ports. Changing this setting helps with things like increasing the port space 
for NAT or for a higher connection rate.

We add the new option for reserving *specific* ports. 

So, even from a functional perspective, it makes more sense to me to keep them 
independent, as they serve different purposes.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Amerigo Wang Feb. 17, 2010, 4:07 p.m. UTC | #7
Octavian Purdila wrote:
> On Tuesday 16 February 2010 15:06:26 you wrote:
>> Octavian Purdila wrote:
>>> On Tuesday 16 February 2010 11:37:04 you wrote:
>>>>>  	BUILD_BUG_ON(sizeof(struct inet_skb_parm) > sizeof(dummy_skb->cb));
>>>>>
>>>>> +	sysctl_local_reserved_ports = kzalloc(65536 / 8, GFP_KERNEL);
>>>>> +	if (!sysctl_local_reserved_ports)
>>>>> +		goto out;
>>>>> +
>>>> I think we should also consider the ports in ip_local_port_range,
>>>> since we can only reserve the ports in that range.
>>> That is subject to changes at runtime, which means we will have to
>>> readjust the bitmap at runtime which introduces the need for additional
>>> synchronization operations which I would rather avoid.
>> Why? As long as the bitmap is global, this will not be hard.
>>
> 
> For the more important point see bellow, but with regard to reallocation, this 
> means we need to at least use rcu_read_lock() in the fast path to avoid races 
> between freeing the old bitmap and doing a read in progress. 
> 
> Granted, that is a light operation, but would it makes things so much more 
> complicated just so that we save one memory page (assuming the range is the 
> default [32000 64000] one).


Why not just allocate the bitmap for all ports? 65535/8 bytes are
needed.

> 
>> Consider that if one user writes a port number which is beyond
>> the ip_local_port_range into ip_local_reserved_ports, we should
>> not accept this, because it doesn't make any sense. But with your
>> patch, we do.
>>
> 
> I think it should be allowed. I see ip_local_reserved_ports and ip_local_range 
> as independent settings that can be change at any time.


According to the original purpose, they are not.

> 
> That way I can flag port 8080 even if the current range is [32000, 64000] and 
> then later I can expand the range to [1024, 64000] without loosing the 8080 
> reservation.

Then its meaning is changed, bind(0) will never have chance to get 8080,
thus reserving 8080 for this purpose fails.

I want to always keep its original meaning, if the local_port_range goes
out, then local_reserved_port should be empty at the same time, you have
to reset it after changing local_port_range.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Amerigo Wang Feb. 17, 2010, 4:13 p.m. UTC | #8
Eric Dumazet wrote:
> Le mardi 16 février 2010 à 21:06 +0800, Cong Wang a écrit :
>> Octavian Purdila wrote:
>>> On Tuesday 16 February 2010 11:37:04 you wrote:
>>>>>  	BUILD_BUG_ON(sizeof(struct inet_skb_parm) > sizeof(dummy_skb->cb));
>>>>>
>>>>> +	sysctl_local_reserved_ports = kzalloc(65536 / 8, GFP_KERNEL);
>>>>> +	if (!sysctl_local_reserved_ports)
>>>>> +		goto out;
>>>>> +
>>>> I think we should also consider the ports in ip_local_port_range,
>>>> since we can only reserve the ports in that range.
>>>>
>>> That is subject to changes at runtime, which means we will have to readjust 
>>> the bitmap at runtime which introduces the need for additional synchronization 
>>> operations which I would rather avoid. 
>> Why? As long as the bitmap is global, this will not be hard.
>>
>> Consider that if one user writes a port number which is beyond
>> the ip_local_port_range into ip_local_reserved_ports, we should
>> not accept this, because it doesn't make any sense. But with your
>> patch, we do.
> 
> I disagree with you. This is perfectly OK.
> 
> A port not being flagged in ip_local_reserved_ports doesnt mean it can
> be used for allocation.
> 
> If you want to really block ports from being used at boot, you could for
> example :
> 
> # temporarly reduce the ip_local_port_range
> echo "61000 61001" >/proc/sys/net/ipv4/ip_local_port_range
> # Build our bitmap (could be slow, if a remote database is read)
> for port in $LIST_RESERVED_PORT
> do
>   echo $port >/proc/sys/net/ipv4/ip_local_reserved_ports
> done
> echo "10000 61000" >/proc/sys/net/ipv4/ip_local_port_range
> 
> 

I don't think so, if you want to avoid race condition, you just need to
write the reserved ports before any networking application starts, IOW,
as early as possible during boot.

Thanks.



--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eric Dumazet Feb. 17, 2010, 4:39 p.m. UTC | #9
Le jeudi 18 février 2010 à 00:13 +0800, Cong Wang a écrit :

> I don't think so, if you want to avoid race condition, you just need to
> write the reserved ports before any networking application starts, IOW,
> as early as possible during boot.
> 

Sure, but I was thinking retrieving the list of reserved port by a
database query, using network :)

Anyway, I just feel your argument is not applicable.

Our kernel is capable of doing an intersection for us, we dont need
to forbid user to mark a port as 'reserved' if this port is already
blacklisted by another mechanism (for example, if this port is already
in use)



--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Amerigo Wang Feb. 20, 2010, 8 a.m. UTC | #10
Eric Dumazet wrote:
> Le jeudi 18 février 2010 à 00:13 +0800, Cong Wang a écrit :
> 
>> I don't think so, if you want to avoid race condition, you just need to
>> write the reserved ports before any networking application starts, IOW,
>> as early as possible during boot.
>>
> 
> Sure, but I was thinking retrieving the list of reserved port by a
> database query, using network :)
> 
> Anyway, I just feel your argument is not applicable.
> 
> Our kernel is capable of doing an intersection for us, we dont need
> to forbid user to mark a port as 'reserved' if this port is already
> blacklisted by another mechanism (for example, if this port is already
> in use)
> 

Oh, I see your points.

But this still could make people confused, like me. I think we'd better
document this.

Thanks.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 2dc7a1d..23be7a4 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -564,6 +564,18 @@  ip_local_port_range - 2 INTEGERS
 	(i.e. by default) range 1024-4999 is enough to issue up to
 	2000 connections per second to systems supporting timestamps.
 
+ip_local_reserved_ports - BITMAP of 65536 ports
+	Specify the ports which are reserved for known third-party
+	applications. These ports will not be used by automatic port assignments
+	(e.g. when calling connect() or bind() with port number 0). Explicit
+	port allocation behavior is unchanged.
+
+	Reserving ports is done by writing positive numbers in this proc entry,
+	clearing them is done by writing negative numbers (e.g. 8080 reserves
+	port number, -8080 makes it available for automatic assignment again).
+
+	Default: Empty
+
 ip_nonlocal_bind - BOOLEAN
 	If set, allows processes to bind() to non-local IP addresses,
 	which can be quite useful - but may break some applications.
diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
index cc9b594..8248fc6 100644
--- a/drivers/infiniband/core/cma.c
+++ b/drivers/infiniband/core/cma.c
@@ -1979,6 +1979,8 @@  retry:
 	/* FIXME: add proper port randomization per like inet_csk_get_port */
 	do {
 		ret = idr_get_new_above(ps, bind_list, next_port, &port);
+		if (inet_is_reserved_local_port(port))
+			ret = -EAGAIN;
 	} while ((ret == -EAGAIN) && idr_pre_get(ps, GFP_KERNEL));
 
 	if (ret)
@@ -2997,10 +2999,13 @@  static int __init cma_init(void)
 {
 	int ret, low, high, remaining;
 
-	get_random_bytes(&next_port, sizeof next_port);
 	inet_get_local_port_range(&low, &high);
+again:
+	get_random_bytes(&next_port, sizeof next_port);
 	remaining = (high - low) + 1;
 	next_port = ((unsigned int) next_port % remaining) + low;
+	if (inet_is_reserved_local_port(next_port))
+		goto again;
 
 	cma_wq = create_singlethread_workqueue("rdma_cm");
 	if (!cma_wq)
diff --git a/include/net/ip.h b/include/net/ip.h
index fb63371..2e24256 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -184,6 +184,12 @@  extern struct local_ports {
 } sysctl_local_ports;
 extern void inet_get_local_port_range(int *low, int *high);
 
+extern unsigned long *sysctl_local_reserved_ports;
+static inline int inet_is_reserved_local_port(int port)
+{
+	return test_bit(port, sysctl_local_reserved_ports);
+}
+
 extern int sysctl_ip_default_ttl;
 extern int sysctl_ip_nonlocal_bind;
 
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 7d12c6a..06810b0 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1546,9 +1546,13 @@  static int __init inet_init(void)
 
 	BUILD_BUG_ON(sizeof(struct inet_skb_parm) > sizeof(dummy_skb->cb));
 
+	sysctl_local_reserved_ports = kzalloc(65536 / 8, GFP_KERNEL);
+	if (!sysctl_local_reserved_ports)
+		goto out;
+
 	rc = proto_register(&tcp_prot, 1);
 	if (rc)
-		goto out;
+		goto out_free_reserved_ports;
 
 	rc = proto_register(&udp_prot, 1);
 	if (rc)
@@ -1647,6 +1651,8 @@  out_unregister_udp_proto:
 	proto_unregister(&udp_prot);
 out_unregister_tcp_proto:
 	proto_unregister(&tcp_prot);
+out_free_reserved_ports:
+	kfree(sysctl_local_reserved_ports);
 	goto out;
 }
 
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 8da6429..1acb462 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -37,6 +37,9 @@  struct local_ports sysctl_local_ports __read_mostly = {
 	.range = { 32768, 61000 },
 };
 
+unsigned long *sysctl_local_reserved_ports;
+EXPORT_SYMBOL(sysctl_local_reserved_ports);
+
 void inet_get_local_port_range(int *low, int *high)
 {
 	unsigned seq;
@@ -108,6 +111,8 @@  again:
 
 		smallest_size = -1;
 		do {
+			if (inet_is_reserved_local_port(rover))
+				goto next_nolock;
 			head = &hashinfo->bhash[inet_bhashfn(net, rover,
 					hashinfo->bhash_size)];
 			spin_lock(&head->lock);
@@ -130,6 +135,7 @@  again:
 			break;
 		next:
 			spin_unlock(&head->lock);
+		next_nolock:
 			if (++rover > high)
 				rover = low;
 		} while (--remaining > 0);
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 2b79377..d3e160a 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -456,6 +456,8 @@  int __inet_hash_connect(struct inet_timewait_death_row *death_row,
 		local_bh_disable();
 		for (i = 1; i <= remaining; i++) {
 			port = low + (i + offset) % remaining;
+			if (inet_is_reserved_local_port(port))
+				continue;
 			head = &hinfo->bhash[inet_bhashfn(net, port,
 					hinfo->bhash_size)];
 			spin_lock(&head->lock);
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 7e3712c..ce3597a 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -298,6 +298,13 @@  static struct ctl_table ipv4_table[] = {
 		.mode		= 0644,
 		.proc_handler	= ipv4_local_port_range,
 	},
+	{
+		.procname	= "ip_local_reserved_ports",
+		.data		= NULL, /* initialized in sysctl_ipv4_init */
+		.maxlen		= 65536,
+		.mode		= 0644,
+		.proc_handler	= proc_dobitmap,
+	},
 #ifdef CONFIG_IP_MULTICAST
 	{
 		.procname	= "igmp_max_memberships",
@@ -721,6 +728,16 @@  static __net_initdata struct pernet_operations ipv4_sysctl_ops = {
 static __init int sysctl_ipv4_init(void)
 {
 	struct ctl_table_header *hdr;
+	struct ctl_table *i;
+
+	for (i = ipv4_table; i->procname; i++) {
+		if (strcmp(i->procname, "ip_local_reserved_ports") == 0) {
+			i->data = sysctl_local_reserved_ports;
+			break;
+		}
+	}
+	if (!i->procname[0])
+		return -EINVAL;
 
 	hdr = register_sysctl_paths(net_ipv4_ctl_path, ipv4_table);
 	if (hdr == NULL)
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 608a544..bfd0a6a 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -232,7 +232,8 @@  int udp_lib_get_port(struct sock *sk, unsigned short snum,
 			 */
 			do {
 				if (low <= snum && snum <= high &&
-				    !test_bit(snum >> udptable->log, bitmap))
+				    !test_bit(snum >> udptable->log, bitmap) &&
+				    !inet_is_reserved_local_port(snum))
 					goto found;
 				snum += rand;
 			} while (snum != first);
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index f6d1e59..1f839d0 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -5432,6 +5432,8 @@  static long sctp_get_port_local(struct sock *sk, union sctp_addr *addr)
 			rover++;
 			if ((rover < low) || (rover > high))
 				rover = low;
+			if (inet_is_reserved_local_port(rover))
+				continue;
 			index = sctp_phashfn(rover);
 			head = &sctp_port_hashtable[index];
 			sctp_spin_lock(&head->lock);