diff mbox series

[bpf-next] : add sock_ops R/W access to ipv4 tos

Message ID 20180326153657.2229959-1-tehnerd@fb.com
State Accepted, archived
Delegated to: BPF Maintainers
Headers show
Series [bpf-next] : add sock_ops R/W access to ipv4 tos | expand

Commit Message

Nikita V. Shirokov March 26, 2018, 3:36 p.m. UTC
bpf: Add sock_ops R/W access to ipv4 tos

    Sample usage for tos:

      bpf_getsockopt(skops, SOL_IP, IP_TOS, &v, sizeof(v))

    where skops is a pointer to the ctx (struct bpf_sock_ops).

Signed-off-by: Nikita V. Shirokov <tehnerd@fb.com>
---
 net/core/filter.c | 35 +++++++++++++++++++++++++++++++++++
 1 file changed, 35 insertions(+)

Comments

Daniel Borkmann March 28, 2018, 1:47 p.m. UTC | #1
On 03/26/2018 05:36 PM, Nikita V. Shirokov wrote:
>     bpf: Add sock_ops R/W access to ipv4 tos
> 
>     Sample usage for tos:
> 
>       bpf_getsockopt(skops, SOL_IP, IP_TOS, &v, sizeof(v))
> 
>     where skops is a pointer to the ctx (struct bpf_sock_ops).
> 
> Signed-off-by: Nikita V. Shirokov <tehnerd@fb.com>
> ---
>  net/core/filter.c | 35 +++++++++++++++++++++++++++++++++++
>  1 file changed, 35 insertions(+)
> 
> diff --git a/net/core/filter.c b/net/core/filter.c
> index 00c711c..afd8255 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -3462,6 +3462,27 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock,
>  			ret = -EINVAL;
>  		}
>  #ifdef CONFIG_INET
> +	} else if (level == SOL_IP) {
> +		if (optlen != sizeof(int) || sk->sk_family != AF_INET)
> +			return -EINVAL;
> +
> +		val = *((int *)optval);
> +		/* Only some options are supported */
> +		switch (optname) {
> +		case IP_TOS:
> +			if (val < -1 || val > 0xff) {
> +				ret = -EINVAL;
> +			} else {
> +				struct inet_sock *inet = inet_sk(sk);
> +
> +				if (val == -1)
> +					val = 0;
> +				inet->tos = val;

Should this not have the exact same semantics given the helper resembles
the normal setsockopt? do_ip_setsockopt() does the following when setting
IP_TOS:

        case IP_TOS:    /* This sets both TOS and Precedence */
                if (sk->sk_type == SOCK_STREAM) {
                        val &= ~INET_ECN_MASK;
                        val |= inet->tos & INET_ECN_MASK;
                }
                if (inet->tos != val) {
                        inet->tos = val;
                        sk->sk_priority = rt_tos2priority(val);
                        sk_dst_reset(sk);
                }
                break;

E.g., why don't we need to set sk->sk_priority as well, or reset the dst
entry here?

> +			}
> +			break;
> +		default:
> +			ret = -EINVAL;
> +		}
>  #if IS_ENABLED(CONFIG_IPV6)
>  	} else if (level == SOL_IPV6) {
>  		if (optlen != sizeof(int) || sk->sk_family != AF_INET6)
> @@ -3561,6 +3582,20 @@ BPF_CALL_5(bpf_getsockopt, struct bpf_sock_ops_kern *, bpf_sock,
>  		} else {
>  			goto err_clear;
>  		}
> +	} else if (level == SOL_IP) {
> +		struct inet_sock *inet = inet_sk(sk);
> +
> +		if (optlen != sizeof(int) || sk->sk_family != AF_INET)
> +			goto err_clear;
> +
> +		/* Only some options are supported */
> +		switch (optname) {
> +		case IP_TOS:
> +			*((int *)optval) = (int)inet->tos;

This part is fine though, same as in do_ip_getsockopt().

> +			break;
> +		default:
> +			goto err_clear;
> +		}
>  #if IS_ENABLED(CONFIG_IPV6)
>  	} else if (level == SOL_IPV6) {
>  		struct ipv6_pinfo *np = inet6_sk(sk);
>
Nikita V. Shirokov March 28, 2018, 5:41 p.m. UTC | #2
>>On 03/26/2018 05:36 PM, Nikita V. Shirokov wrote:
>>     bpf: Add sock_ops R/W access to ipv4 tos
>>
>>     Sample usage for tos:
>>
>>       bpf_getsockopt(skops, SOL_IP, IP_TOS, &v, sizeof(v))
>>
>>     where skops is a pointer to the ctx (struct bpf_sock_ops).
>>
>> Signed-off-by: Nikita V. Shirokov <tehnerd@fb.com>
>> ---
>>  net/core/filter.c | 35 +++++++++++++++++++++++++++++++++++
>>  1 file changed, 35 insertions(+)
>>
>> diff --git a/net/core/filter.c b/net/core/filter.c
>> index 00c711c..afd8255 100644
>> --- a/net/core/filter.c
>> +++ b/net/core/filter.c
>> @@ -3462,6 +3462,27 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock,
>>                        ret = -EINVAL;
>>                }
>>  #ifdef CONFIG_INET
>> +     } else if (level == SOL_IP) {
>> +             if (optlen != sizeof(int) || sk->sk_family != AF_INET)
>> +                     return -EINVAL;
>> +
>> +             val = *((int *)optval);
>> +             /* Only some options are supported */
>> +             switch (optname) {
>> +             case IP_TOS:
>> +                     if (val < -1 || val > 0xff) {
>> +                             ret = -EINVAL;
>> +                     } else {
>> +                             struct inet_sock *inet = inet_sk(sk);
>> +
>> +                             if (val == -1)
>> +                                     val = 0;
>> +                             inet->tos = val;
>
>Should this not have the exact same semantics given the helper resembles
>the normal setsockopt? do_ip_setsockopt() does the following when setting
>IP_TOS:
>
>        case IP_TOS:    /* This sets both TOS and Precedence */
>                if (sk->sk_type == SOCK_STREAM) {
>                        val &= ~INET_ECN_MASK;
>                        val |= inet->tos & INET_ECN_MASK;
>                }
>                if (inet->tos != val) {
>                        inet->tos = val;
>                        sk->sk_priority = rt_tos2priority(val);
>                        sk_dst_reset(sk);
>                }
>                break;
>
>E.g. why we don't need to set sk->sk_priority as well or reset the dst
>entry here?
>

It feels like initially (with the commit for IP_TOS in ip_sockglue.c) there was some use case in mind
where reflecting tos into the priority was needed, plus some policy-based routing (that's why the dst reset).
But e.g. for IPv6 (IPV6_TCLASS, the same as TOS but in the IPv6 world) we do just this: set the new tclass value
and call it a day. In my opinion this approach is more flexible (e.g. we have a separate
bpf_setsockopt for SO_PRIORITY), as it does only what we want (I can imagine a few use cases
where we want to change just the TOS without changing the priority).

However, if you feel strongly and want to reproduce the same behavior as in the regular setsockopt, I can
totally make a v2. Please let me know.

>> +                     }
>> +                     break;
>> +             default:
>> +                     ret = -EINVAL;
>> +             }
>>  #if IS_ENABLED(CONFIG_IPV6)
>>        } else if (level == SOL_IPV6) {
>>                if (optlen != sizeof(int) || sk->sk_family != AF_INET6)
>> @@ -3561,6 +3582,20 @@ BPF_CALL_5(bpf_getsockopt, struct bpf_sock_ops_kern *, bpf_sock,
>>                } else {
>>                        goto err_clear;
>>                }
>> +     } else if (level == SOL_IP) {
>> +             struct inet_sock *inet = inet_sk(sk);
>> +
>> +             if (optlen != sizeof(int) || sk->sk_family != AF_INET)
>> +                     goto err_clear;
>> +
>> +             /* Only some options are supported */
>> +             switch (optname) {
>> +             case IP_TOS:
>> +                     *((int *)optval) = (int)inet->tos;
>
>This part is fine though, same as in do_ip_getsockopt().
>
>> +                     break;
>> +             default:
>> +                     goto err_clear;
>> +             }
>>  #if IS_ENABLED(CONFIG_IPV6)
>>        } else if (level == SOL_IPV6) {
>>                struct ipv6_pinfo *np = inet6_sk(sk);
>>
Daniel Borkmann March 28, 2018, 7:09 p.m. UTC | #3
On 03/28/2018 07:41 PM, Nikita Shirokov wrote:
>>> On 03/26/2018 05:36 PM, Nikita V. Shirokov wrote:
>>>      bpf: Add sock_ops R/W access to ipv4 tos
>>>
>>>      Sample usage for tos:
>>>
>>>        bpf_getsockopt(skops, SOL_IP, IP_TOS, &v, sizeof(v))
>>>
>>>      where skops is a pointer to the ctx (struct bpf_sock_ops).
>>>
>>> Signed-off-by: Nikita V. Shirokov <tehnerd@fb.com>
>>> ---
>>>   net/core/filter.c | 35 +++++++++++++++++++++++++++++++++++
>>>   1 file changed, 35 insertions(+)
>>>
>>> diff --git a/net/core/filter.c b/net/core/filter.c
>>> index 00c711c..afd8255 100644
>>> --- a/net/core/filter.c
>>> +++ b/net/core/filter.c
>>> @@ -3462,6 +3462,27 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock,
>>>                         ret = -EINVAL;
>>>                 }
>>>   #ifdef CONFIG_INET
>>> +     } else if (level == SOL_IP) {
>>> +             if (optlen != sizeof(int) || sk->sk_family != AF_INET)
>>> +                     return -EINVAL;
>>> +
>>> +             val = *((int *)optval);
>>> +             /* Only some options are supported */
>>> +             switch (optname) {
>>> +             case IP_TOS:
>>> +                     if (val < -1 || val > 0xff) {
>>> +                             ret = -EINVAL;
>>> +                     } else {
>>> +                             struct inet_sock *inet = inet_sk(sk);
>>> +
>>> +                             if (val == -1)
>>> +                                     val = 0;
>>> +                             inet->tos = val;
>>
>> Should this not have the exact same semantics given the helper resembles
>> the normal setsockopt? do_ip_setsockopt() does the following when setting
>> IP_TOS:
>>
>>         case IP_TOS:    /* This sets both TOS and Precedence */
>>                 if (sk->sk_type == SOCK_STREAM) {
>>                         val &= ~INET_ECN_MASK;
>>                         val |= inet->tos & INET_ECN_MASK;
>>                 }
>>                 if (inet->tos != val) {
>>                         inet->tos = val;
>>                         sk->sk_priority = rt_tos2priority(val);
>>                         sk_dst_reset(sk);
>>                 }
>>                 break;
>>
>> E.g. why we don't need to set sk->sk_priority as well or reset the dst
>> entry here?
> 
> it feels like initially (w/ commit for IP_TOS in ip_sockglue.c) there were some usecase in mind
> where reflection of tos to prio was needed + some policy based routing (thats why dst_reset).
> but e.g. for ipv6 (IPV6_TCLASS, same as TOS but in ipv6 world) we do just this - set new tclass value
> and call it the day. in my opinion this aproach is more flexible (e.g. we have separate
> bpf_setsockopt for SOL_PRIORITY) as it did only what we want (i can imagine few usecases
> where we want just to change TOS w/o changing priority)

Ok, fair point, that way the behavior is exactly the same as in v6 case.

Applied to bpf-next, thanks Nikita!
diff mbox series

Patch

diff --git a/net/core/filter.c b/net/core/filter.c
index 00c711c..afd8255 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3462,6 +3462,27 @@  BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock,
 			ret = -EINVAL;
 		}
 #ifdef CONFIG_INET
+	} else if (level == SOL_IP) {
+		if (optlen != sizeof(int) || sk->sk_family != AF_INET)
+			return -EINVAL;
+
+		val = *((int *)optval);
+		/* Only some options are supported */
+		switch (optname) {
+		case IP_TOS:
+			if (val < -1 || val > 0xff) {
+				ret = -EINVAL;
+			} else {
+				struct inet_sock *inet = inet_sk(sk);
+
+				if (val == -1)
+					val = 0;
+				inet->tos = val;
+			}
+			break;
+		default:
+			ret = -EINVAL;
+		}
 #if IS_ENABLED(CONFIG_IPV6)
 	} else if (level == SOL_IPV6) {
 		if (optlen != sizeof(int) || sk->sk_family != AF_INET6)
@@ -3561,6 +3582,20 @@  BPF_CALL_5(bpf_getsockopt, struct bpf_sock_ops_kern *, bpf_sock,
 		} else {
 			goto err_clear;
 		}
+	} else if (level == SOL_IP) {
+		struct inet_sock *inet = inet_sk(sk);
+
+		if (optlen != sizeof(int) || sk->sk_family != AF_INET)
+			goto err_clear;
+
+		/* Only some options are supported */
+		switch (optname) {
+		case IP_TOS:
+			*((int *)optval) = (int)inet->tos;
+			break;
+		default:
+			goto err_clear;
+		}
 #if IS_ENABLED(CONFIG_IPV6)
 	} else if (level == SOL_IPV6) {
 		struct ipv6_pinfo *np = inet6_sk(sk);