diff mbox

[net-next] sockopt: Make SO_BINDTODEVICE readable

Message ID 50858ABD.2000206@hp.com
State RFC, archived
Delegated to: David Miller
Headers show

Commit Message

Brian Haley Oct. 22, 2012, 6:04 p.m. UTC
On 10/19/2012 05:55 AM, Pavel Emelyanov wrote:
> The SO_BINDTODEVICE option is the only SOL_SOCKET one that can be set, but
> cannot be get via sockopt API. The only way we can find the device id a
> socket is bound to is via sock-diag interface. But the diag works only on
> hashed sockets, while the opt in question can be set for yet unhashed one.
> 
> That said, in order to know what device a socket is bound to (we do want
> to know this in checkpoint-restore project) I propose to make this option
> getsockopt-able and report the respective device index.
> 
> Another solution to the problem might be to teach the sock-diag reporting
> info on unhashed sockets. Should I go this way instead?
> 
> Signed-off-by: Pavel Emelyanov <xemul@parallels.com>
> 
> ---
> 
> diff --git a/net/core/sock.c b/net/core/sock.c
> index 8a146cf..c49412c 100644
> --- a/net/core/sock.c
> +++ b/net/core/sock.c
> @@ -1074,6 +1074,9 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
>  	case SO_NOFCS:
>  		v.val = sock_flag(sk, SOCK_NOFCS);
>  		break;
> +	case SO_BINDTODEVICE:
> +		v.val = sk->sk_bound_dev_if;
> +		break;
>  	default:
>  		return -ENOPROTOOPT;
>  	}

Doesn't this make the set and get non-symmetrical?  For example, setsockopt()
would take "eth0", but getsockopt() would return 2.  The following patch would
return a string, or -ENODEV if not set.

-Brian

---

Change getsockopt(SO_BINDTODEVICE) to be symmetrical with setsockopt() by
returning the interface name as a string.

Signed-off-by: Brian Haley <brian.haley@hp.com>

 	}
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Comments

Pavel Emelyanov Oct. 22, 2012, 8:28 p.m. UTC | #1
On 10/22/2012 10:04 PM, Brian Haley wrote:
> On 10/19/2012 05:55 AM, Pavel Emelyanov wrote:
>> The SO_BINDTODEVICE option is the only SOL_SOCKET one that can be set, but
>> cannot be get via sockopt API. The only way we can find the device id a
>> socket is bound to is via sock-diag interface. But the diag works only on
>> hashed sockets, while the opt in question can be set for yet unhashed one.
>>
>> That said, in order to know what device a socket is bound to (we do want
>> to know this in checkpoint-restore project) I propose to make this option
>> getsockopt-able and report the respective device index.
>>
>> Another solution to the problem might be to teach the sock-diag reporting
>> info on unhashed sockets. Should I go this way instead?
>>
>> Signed-off-by: Pavel Emelyanov <xemul@parallels.com>
>>
>> ---
>>
>> diff --git a/net/core/sock.c b/net/core/sock.c
>> index 8a146cf..c49412c 100644
>> --- a/net/core/sock.c
>> +++ b/net/core/sock.c
>> @@ -1074,6 +1074,9 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
>>  	case SO_NOFCS:
>>  		v.val = sock_flag(sk, SOCK_NOFCS);
>>  		break;
>> +	case SO_BINDTODEVICE:
>> +		v.val = sk->sk_bound_dev_if;
>> +		break;
>>  	default:
>>  		return -ENOPROTOOPT;
>>  	}
> 
> Doesn't this make the set and get non-symmetrical?  For example, setsockopt()
> would take "eth0", but getsockopt() would return 2.

It will, but since device name and index are two equal device "IDs" I assumed
it would be OK.

However, some comments inline.

> The following patch would return a string, or -ENODEV if not set.
> 
> -Brian
> 
> ---
> 
> Change getsockopt(SO_BINDTODEVICE) to be symmetrical with setsockopt() by
> returning the interface name as a string.
> 
> Signed-off-by: Brian Haley <brian.haley@hp.com>
> 
> diff --git a/net/core/sock.c b/net/core/sock.c
> index c49412c..69b9d92 100644
> --- a/net/core/sock.c
> +++ b/net/core/sock.c
> @@ -505,7 +505,8 @@ struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
>  }
>  EXPORT_SYMBOL(sk_dst_check);
> 
> -static int sock_bindtodevice(struct sock *sk, char __user *optval, int optlen)
> +static int sock_setbindtodevice(struct sock *sk, char __user *optval,
> +				int optlen)
>  {
>  	int ret = -ENOPROTOOPT;
>  #ifdef CONFIG_NETDEVICES
> @@ -562,6 +563,49 @@ out:
>  	return ret;
>  }
> 
> +static int sock_getbindtodevice(struct sock *sk, char __user *optval,
> +				int __user *optlen, int len)
> +{
> +	int ret = -ENOPROTOOPT;
> +#ifdef CONFIG_NETDEVICES
> +	struct net *net = sock_net(sk);
> +	struct net_device *dev;
> +	char devname[IFNAMSIZ];
> +
> +	ret = 0;
> +	if (sk->sk_bound_dev_if == 0)
> +		goto out;

It will return 0 if device is not set, thus making it impossible to detect
this situation.

> +	ret = -EINVAL;
> +	if (len < IFNAMSIZ)
> +		goto out;
> +	if (len > IFNAMSIZ)
> +		len = IFNAMSIZ;
> +
> +	rcu_read_lock();
> +	dev = dev_get_by_index_rcu(net, sk->sk_bound_dev_if);
> +	if (dev)
> +		strcpy(dev->name, devname);
> +	rcu_read_unlock();
> +	ret = -ENODEV;
> +	if (!dev)
> +		goto out;
> +
> +	ret = -EFAULT;
> +	if (copy_to_user(optval, devname, len))
> +		goto out;
> +
> +	if (put_user(len, optlen))
> +		goto out;

What's the point in reporting IFNAMSIZ to the userspace always, taking
into account that this constant is exported there anyway?

> +	ret = 0;
> +
> +out:
> +#endif
> +
> +	return ret;
> +}
> +
>  static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
>  {
>  	if (valbool)
> @@ -589,7 +633,7 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
>  	 */
> 
>  	if (optname == SO_BINDTODEVICE)
> -		return sock_bindtodevice(sk, optval, optlen);
> +		return sock_setbindtodevice(sk, optval, optlen);
> 
>  	if (optlen < sizeof(int))
>  		return -EINVAL;
> @@ -1074,9 +1118,10 @@ int sock_getsockopt(struct socket *sock, int level, int
> optname,
>  	case SO_NOFCS:
>  		v.val = sock_flag(sk, SOCK_NOFCS);
>  		break;
> +
>  	case SO_BINDTODEVICE:
> -		v.val = sk->sk_bound_dev_if;
> -		break;
> +		return sock_getbindtodevice(sk, optval, optlen, len);
> +
>  	default:
>  		return -ENOPROTOOPT;
>  	}
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> .
> 


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eric Dumazet Oct. 22, 2012, 8:45 p.m. UTC | #2
On Mon, 2012-10-22 at 14:04 -0400, Brian Haley wrote:

> +	char devname[IFNAMSIZ];
> +
> +	ret = 0;
> +	if (sk->sk_bound_dev_if == 0)
> +		goto out;
> +
> +	ret = -EINVAL;
> +	if (len < IFNAMSIZ)
> +		goto out;
> +	if (len > IFNAMSIZ)
> +		len = IFNAMSIZ;
> +
> +	rcu_read_lock();
> +	dev = dev_get_by_index_rcu(net, sk->sk_bound_dev_if);
> +	if (dev)
> +		strcpy(dev->name, devname);
> +	rcu_read_unlock();
> +	ret = -ENODEV;

You probably meant

	strcpy(devname, dev->name)

By the way, this is not really safe in case device is renamed



--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Pavel Emelyanov Oct. 22, 2012, 9:10 p.m. UTC | #3
On 10/23/2012 12:45 AM, Eric Dumazet wrote:
> On Mon, 2012-10-22 at 14:04 -0400, Brian Haley wrote:
> 
>> +	char devname[IFNAMSIZ];
>> +
>> +	ret = 0;
>> +	if (sk->sk_bound_dev_if == 0)
>> +		goto out;
>> +
>> +	ret = -EINVAL;
>> +	if (len < IFNAMSIZ)
>> +		goto out;
>> +	if (len > IFNAMSIZ)
>> +		len = IFNAMSIZ;
>> +
>> +	rcu_read_lock();
>> +	dev = dev_get_by_index_rcu(net, sk->sk_bound_dev_if);
>> +	if (dev)
>> +		strcpy(dev->name, devname);
>> +	rcu_read_unlock();
>> +	ret = -ENODEV;
> 
> You probably meant
> 
> 	strcpy(devname, dev->name)
> 
> By the way, this is not really safe in case device is renamed

Good point, actually. Getting a device name may be not very safe in terms of --
once we have the name there's no 100% guarantee, that this name corresponds to the
actual device the socket is bound to (it could be renamed after we strcpy-ed its
name). This problem doesn't exist when we get device index, as it cannot be changed.

Thanks,
Pavel
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Brian Haley Oct. 22, 2012, 9:20 p.m. UTC | #4
On 10/22/2012 04:45 PM, Eric Dumazet wrote:
> On Mon, 2012-10-22 at 14:04 -0400, Brian Haley wrote:
> 
>> +	char devname[IFNAMSIZ];
>> +
>> +	ret = 0;
>> +	if (sk->sk_bound_dev_if == 0)
>> +		goto out;
>> +
>> +	ret = -EINVAL;
>> +	if (len < IFNAMSIZ)
>> +		goto out;
>> +	if (len > IFNAMSIZ)
>> +		len = IFNAMSIZ;
>> +
>> +	rcu_read_lock();
>> +	dev = dev_get_by_index_rcu(net, sk->sk_bound_dev_if);
>> +	if (dev)
>> +		strcpy(dev->name, devname);
>> +	rcu_read_unlock();
>> +	ret = -ENODEV;
> 
> You probably meant
> 
> 	strcpy(devname, dev->name)

Yes, that was a stupid mistake, I'll fix it.

> By the way, this is not really safe in case device is renamed

It's not much different from what's there:

	setsockopt("foo");

	rename foo -> bar

	index = getsockopt();
	if_indextoname(index) -> "bar"

I more raised the issue since you pass a 'char *' to setsockopt() but an 'int *'
to getsockopt(), I don't think any other value is non-symmetrical like this.

-Brian
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Brian Haley Oct. 22, 2012, 9:23 p.m. UTC | #5
On 10/22/2012 04:28 PM, Pavel Emelyanov wrote:
> On 10/22/2012 10:04 PM, Brian Haley wrote:
>> On 10/19/2012 05:55 AM, Pavel Emelyanov wrote:
>>> The SO_BINDTODEVICE option is the only SOL_SOCKET one that can be set, but
>>> cannot be get via sockopt API. The only way we can find the device id a
>>> socket is bound to is via sock-diag interface. But the diag works only on
>>> hashed sockets, while the opt in question can be set for yet unhashed one.
>>>
>>> That said, in order to know what device a socket is bound to (we do want
>>> to know this in checkpoint-restore project) I propose to make this option
>>> getsockopt-able and report the respective device index.
>>>
>>> Another solution to the problem might be to teach the sock-diag reporting
>>> info on unhashed sockets. Should I go this way instead?
>>>
>>> Signed-off-by: Pavel Emelyanov <xemul@parallels.com>
>>>
>>> ---
>>>
>>> diff --git a/net/core/sock.c b/net/core/sock.c
>>> index 8a146cf..c49412c 100644
>>> --- a/net/core/sock.c
>>> +++ b/net/core/sock.c
>>> @@ -1074,6 +1074,9 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
>>>  	case SO_NOFCS:
>>>  		v.val = sock_flag(sk, SOCK_NOFCS);
>>>  		break;
>>> +	case SO_BINDTODEVICE:
>>> +		v.val = sk->sk_bound_dev_if;
>>> +		break;
>>>  	default:
>>>  		return -ENOPROTOOPT;
>>>  	}
>>
>> Doesn't this make the set and get non-symmetrical?  For example, setsockopt()
>> would take "eth0", but getsockopt() would return 2.
> 
> It will, but since device name and index are two equal device "IDs" I assumed
> it would be OK.
> 
> However, some comments inline.
> 
>> The following patch would return a string, or -ENODEV if not set.

Sorry, my description is not quite right, this should return something like this
to be correct:

  0 on success
    optlen zero if interface not set
    optlen > zero if set and optval filled-in

  -errno on failure

>> +static int sock_getbindtodevice(struct sock *sk, char __user *optval,
>> +				int __user *optlen, int len)
>> +{
>> +	int ret = -ENOPROTOOPT;
>> +#ifdef CONFIG_NETDEVICES
>> +	struct net *net = sock_net(sk);
>> +	struct net_device *dev;
>> +	char devname[IFNAMSIZ];
>> +
>> +	ret = 0;
>> +	if (sk->sk_bound_dev_if == 0)
>> +		goto out;
> 
> It will return 0 if device is not set, thus making it impossible to detect
> this situation.

See below.

>> +	ret = -EINVAL;
>> +	if (len < IFNAMSIZ)
>> +		goto out;
>> +	if (len > IFNAMSIZ)
>> +		len = IFNAMSIZ;
>> +
>> +	rcu_read_lock();
>> +	dev = dev_get_by_index_rcu(net, sk->sk_bound_dev_if);
>> +	if (dev)
>> +		strcpy(dev->name, devname);
>> +	rcu_read_unlock();
>> +	ret = -ENODEV;
>> +	if (!dev)
>> +		goto out;
>> +
>> +	ret = -EFAULT;
>> +	if (copy_to_user(optval, devname, len))
>> +		goto out;
>> +
>> +	if (put_user(len, optlen))
>> +		goto out;
> 
> What's the point in reporting IFNAMSIZ to the userspace always, taking
> into account that this constant is exported there anyway?

I should have put an RFC on the patch :)

In the case that there is no interface, the length returned would be zero,
indicating nothing was there.

I can post another version.

-Brian
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eric Dumazet Oct. 22, 2012, 9:37 p.m. UTC | #6
On Mon, 2012-10-22 at 17:20 -0400, Brian Haley wrote:

> It's not much different from what's there:
> 
> 	setsockopt("foo");
> 
> 	rename foo -> bar
> 
> 	index = getsockopt();
> 	if_indextoname(index) -> "bar"
> 
> I more raised the issue since you pass a 'char *' to setsockopt() but an 'int *'
> to getsockopt(), I don't think any other value is non-symmetrical like this.
> 
> -Brian

I meant another cpu can be changing dev->name[] content while the
strcpy() is done, and you get a mangled devname, like "for" or "bao"
instead of "foo" or "bar"

But yes, I obviously understood your point about "char *" and "int *"


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Brian Haley Oct. 22, 2012, 9:47 p.m. UTC | #7
On 10/22/2012 05:37 PM, Eric Dumazet wrote:
> On Mon, 2012-10-22 at 17:20 -0400, Brian Haley wrote:
> 
>> It's not much different from what's there:
>>
>> 	setsockopt("foo");
>>
>> 	rename foo -> bar
>>
>> 	index = getsockopt();
>> 	if_indextoname(index) -> "bar"
>>
>> I more raised the issue since you pass a 'char *' to setsockopt() but an 'int *'
>> to getsockopt(), I don't think any other value is non-symmetrical like this.
>>
>> -Brian
> 
> I meant another cpu can be changing dev->name[] content while the
> strcpy() is done, and you get a mangled devname, like "for" or "bao"
> instead of "foo" or "bar"

Even when holding the rcu_read_lock()?  I'd have to hold the rtnl lock there?

-Brian
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eric Dumazet Oct. 22, 2012, 9:52 p.m. UTC | #8
On Mon, 2012-10-22 at 17:47 -0400, Brian Haley wrote:
> On 10/22/2012 05:37 PM, Eric Dumazet wrote:
> > On Mon, 2012-10-22 at 17:20 -0400, Brian Haley wrote:
> > 
> >> It's not much different from what's there:
> >>
> >> 	setsockopt("foo");
> >>
> >> 	rename foo -> bar
> >>
> >> 	index = getsockopt();
> >> 	if_indextoname(index) -> "bar"
> >>
> >> I more raised the issue since you pass a 'char *' to setsockopt() but an 'int *'
> >> to getsockopt(), I don't think any other value is non-symmetrical like this.
> >>
> >> -Brian
> > 
> > I meant another cpu can be changing dev->name[] content while the
> > strcpy() is done, and you get a mangled devname, like "for" or "bao"
> > instead of "foo" or "bar"
> 
> Even when holding the rcu_read_lock()?  I'd have to hold the rtnl lock there?

Yes, rcu_read_lock() only makes sure the device doesnt disappear.

But its name can be changed.

You could use a seqcount_t, so that readers dont have to lock rtnl.

But do we really want to return a name here, I am not yet convinced.






--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Ben Hutchings Oct. 22, 2012, 9:58 p.m. UTC | #9
On Mon, 2012-10-22 at 23:52 +0200, Eric Dumazet wrote:
> On Mon, 2012-10-22 at 17:47 -0400, Brian Haley wrote:
> > On 10/22/2012 05:37 PM, Eric Dumazet wrote:
> > > On Mon, 2012-10-22 at 17:20 -0400, Brian Haley wrote:
> > > 
> > >> It's not much different from what's there:
> > >>
> > >> 	setsockopt("foo");
> > >>
> > >> 	rename foo -> bar
> > >>
> > >> 	index = getsockopt();
> > >> 	if_indextoname(index) -> "bar"
> > >>
> > >> I more raised the issue since you pass a 'char *' to setsockopt() but an 'int *'
> > >> to getsockopt(), I don't think any other value is non-symmetrical like this.
> > >>
> > >> -Brian
> > > 
> > > I meant another cpu can be changing dev->name[] content while the
> > > strcpy() is done, and you get a mangled devname, like "for" or "bao"
> > > instead of "foo" or "bar"
> > 
> > Even when holding the rcu_read_lock()?  I'd have to hold the rtnl lock there?
> 
> Yes, rcu_read_lock() only makes sure the device doesnt disappear.
> 
> But its name can be changed.
> 
> You could use a seqcount_t, so that readers dont have to lock rtnl.
> 
> But do we really want to return a name here, I am not yet convinced.

If setsockopt() takes a name then it makes no sense that getsockopt()
would return an index.  Perhaps an SO_BINDTOIFINDEX would be useful, but
let's not make SO_BINDTODEVICE mean two different things.

Ben.
Brian Haley Oct. 23, 2012, 9:43 p.m. UTC | #10
On 10/22/2012 05:52 PM, Eric Dumazet wrote:

>>> I meant another cpu can be changing dev->name[] content while the
>>> strcpy() is done, and you get a mangled devname, like "for" or "bao"
>>> instead of "foo" or "bar"
>>
>> Even when holding the rcu_read_lock()?  I'd have to hold the rtnl lock there?
> 
> Yes, rcu_read_lock() only makes sure the device doesnt disappear.
> 
> But its name can be changed.

There's a similar bug in the SIOCGIFNAME/dev_ifname() code too then, but I would
think this rare enough that it doesn't happen in practice.

-Brian

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/net/core/sock.c b/net/core/sock.c
index c49412c..69b9d92 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -505,7 +505,8 @@  struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
 }
 EXPORT_SYMBOL(sk_dst_check);

-static int sock_bindtodevice(struct sock *sk, char __user *optval, int optlen)
+static int sock_setbindtodevice(struct sock *sk, char __user *optval,
+				int optlen)
 {
 	int ret = -ENOPROTOOPT;
 #ifdef CONFIG_NETDEVICES
@@ -562,6 +563,49 @@  out:
 	return ret;
 }

+static int sock_getbindtodevice(struct sock *sk, char __user *optval,
+				int __user *optlen, int len)
+{
+	int ret = -ENOPROTOOPT;
+#ifdef CONFIG_NETDEVICES
+	struct net *net = sock_net(sk);
+	struct net_device *dev;
+	char devname[IFNAMSIZ];
+
+	ret = 0;
+	if (sk->sk_bound_dev_if == 0)
+		goto out;
+
+	ret = -EINVAL;
+	if (len < IFNAMSIZ)
+		goto out;
+	if (len > IFNAMSIZ)
+		len = IFNAMSIZ;
+
+	rcu_read_lock();
+	dev = dev_get_by_index_rcu(net, sk->sk_bound_dev_if);
+	if (dev)
+		strcpy(dev->name, devname);
+	rcu_read_unlock();
+	ret = -ENODEV;
+	if (!dev)
+		goto out;
+
+	ret = -EFAULT;
+	if (copy_to_user(optval, devname, len))
+		goto out;
+
+	if (put_user(len, optlen))
+		goto out;
+
+	ret = 0;
+
+out:
+#endif
+
+	return ret;
+}
+
 static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
 {
 	if (valbool)
@@ -589,7 +633,7 @@  int sock_setsockopt(struct socket *sock, int level, int optname,
 	 */

 	if (optname == SO_BINDTODEVICE)
-		return sock_bindtodevice(sk, optval, optlen);
+		return sock_setbindtodevice(sk, optval, optlen);

 	if (optlen < sizeof(int))
 		return -EINVAL;
@@ -1074,9 +1118,10 @@  int sock_getsockopt(struct socket *sock, int level, int
optname,
 	case SO_NOFCS:
 		v.val = sock_flag(sk, SOCK_NOFCS);
 		break;
+
 	case SO_BINDTODEVICE:
-		v.val = sk->sk_bound_dev_if;
-		break;
+		return sock_getbindtodevice(sk, optval, optlen, len);
+
 	default:
 		return -ENOPROTOOPT;