diff mbox series

[net-next] bpf: expose sk_priority through struct bpf_sock_ops

Message ID 20171110191709.13051-1-vlad@dumitrescu.ro
State Changes Requested, archived
Delegated to: David Miller
Headers show
Series [net-next] bpf: expose sk_priority through struct bpf_sock_ops | expand

Commit Message

Vlad Dumitrescu Nov. 10, 2017, 7:17 p.m. UTC
From: Vlad Dumitrescu <vladum@google.com>

Allows BPF_PROG_TYPE_SOCK_OPS programs to read sk_priority.

Signed-off-by: Vlad Dumitrescu <vladum@google.com>
---
 include/uapi/linux/bpf.h       |  1 +
 net/core/filter.c              | 11 +++++++++++
 tools/include/uapi/linux/bpf.h |  1 +
 3 files changed, 13 insertions(+)

Comments

Daniel Borkmann Nov. 10, 2017, 9:07 p.m. UTC | #1
On 11/10/2017 08:17 PM, Vlad Dumitrescu wrote:
> From: Vlad Dumitrescu <vladum@google.com>
> 
> Allows BPF_PROG_TYPE_SOCK_OPS programs to read sk_priority.
> 
> Signed-off-by: Vlad Dumitrescu <vladum@google.com>
> ---
>   include/uapi/linux/bpf.h       |  1 +
>   net/core/filter.c              | 11 +++++++++++
>   tools/include/uapi/linux/bpf.h |  1 +
>   3 files changed, 13 insertions(+)
> 
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index e880ae6434ee..9757a2002513 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -947,6 +947,7 @@ struct bpf_sock_ops {
>   	__u32 local_ip6[4];	/* Stored in network byte order */
>   	__u32 remote_port;	/* Stored in network byte order */
>   	__u32 local_port;	/* stored in host byte order */
> +	__u32 priority;
>   };
>   
>   /* List of known BPF sock_ops operators.
> diff --git a/net/core/filter.c b/net/core/filter.c
> index 61c791f9f628..a6329642d047 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -4449,6 +4449,17 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
>   		*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
>   				      offsetof(struct sock_common, skc_num));
>   		break;
> +
> +	case offsetof(struct bpf_sock_ops, priority):
> +		BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_priority) != 4);
> +
> +		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
> +						struct bpf_sock_ops_kern, sk),
> +				      si->dst_reg, si->src_reg,
> +				      offsetof(struct bpf_sock_ops_kern, sk));
> +		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
> +				      offsetof(struct sock, sk_priority));
> +		break;

Hm, I don't think this would work, I actually think your initial patch was ok.
bpf_setsockopt() as well as bpf_getsockopt() check for sk_fullsock(sk) right
before accessing options on either socket or TCP level, and bail out with error
otherwise; in such cases we'd read something else here and assume it's sk_priority.

>   	}
>   	return insn - insn_buf;
>   }
> diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
> index e880ae6434ee..9757a2002513 100644
> --- a/tools/include/uapi/linux/bpf.h
> +++ b/tools/include/uapi/linux/bpf.h
> @@ -947,6 +947,7 @@ struct bpf_sock_ops {
>   	__u32 local_ip6[4];	/* Stored in network byte order */
>   	__u32 remote_port;	/* Stored in network byte order */
>   	__u32 local_port;	/* stored in host byte order */
> +	__u32 priority;
>   };
>   
>   /* List of known BPF sock_ops operators.
>
Alexei Starovoitov Nov. 11, 2017, 4:06 a.m. UTC | #2
On 11/11/17 6:07 AM, Daniel Borkmann wrote:
> On 11/10/2017 08:17 PM, Vlad Dumitrescu wrote:
>> From: Vlad Dumitrescu <vladum@google.com>
>>
>> Allows BPF_PROG_TYPE_SOCK_OPS programs to read sk_priority.
>>
>> Signed-off-by: Vlad Dumitrescu <vladum@google.com>
>> ---
>>   include/uapi/linux/bpf.h       |  1 +
>>   net/core/filter.c              | 11 +++++++++++
>>   tools/include/uapi/linux/bpf.h |  1 +
>>   3 files changed, 13 insertions(+)
>>
>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
>> index e880ae6434ee..9757a2002513 100644
>> --- a/include/uapi/linux/bpf.h
>> +++ b/include/uapi/linux/bpf.h
>> @@ -947,6 +947,7 @@ struct bpf_sock_ops {
>>       __u32 local_ip6[4];    /* Stored in network byte order */
>>       __u32 remote_port;    /* Stored in network byte order */
>>       __u32 local_port;    /* stored in host byte order */
>> +    __u32 priority;
>>   };
>>     /* List of known BPF sock_ops operators.
>> diff --git a/net/core/filter.c b/net/core/filter.c
>> index 61c791f9f628..a6329642d047 100644
>> --- a/net/core/filter.c
>> +++ b/net/core/filter.c
>> @@ -4449,6 +4449,17 @@ static u32 sock_ops_convert_ctx_access(enum
>> bpf_access_type type,
>>           *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
>>                         offsetof(struct sock_common, skc_num));
>>           break;
>> +
>> +    case offsetof(struct bpf_sock_ops, priority):
>> +        BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_priority) != 4);
>> +
>> +        *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
>> +                        struct bpf_sock_ops_kern, sk),
>> +                      si->dst_reg, si->src_reg,
>> +                      offsetof(struct bpf_sock_ops_kern, sk));
>> +        *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
>> +                      offsetof(struct sock, sk_priority));
>> +        break;
>
> Hm, I don't think this would work, I actually think your initial patch
> was ok.
> bpf_setsockopt() as well as bpf_getsockopt() check for sk_fullsock(sk)
> right
> before accessing options on either socket or TCP level, and bail out
> with error
> otherwise; in such cases we'd read something else here and assume it's
> sk_priority.

even if it's not fullsock, it will just read zero, no? what's a problem
with that?
In non-fullsock hooks like BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB
the program author will know that it's meaningless to read sk_priority,
so returning zero with minimal checks is fine.
While adding extra runtime if (sk_fullsock(sk)) is unnecessary,
since the safety is not compromised.
Daniel Borkmann Nov. 11, 2017, 8:46 p.m. UTC | #3
On 11/11/2017 05:06 AM, Alexei Starovoitov wrote:
> On 11/11/17 6:07 AM, Daniel Borkmann wrote:
>> On 11/10/2017 08:17 PM, Vlad Dumitrescu wrote:
>>> From: Vlad Dumitrescu <vladum@google.com>
>>>
>>> Allows BPF_PROG_TYPE_SOCK_OPS programs to read sk_priority.
>>>
>>> Signed-off-by: Vlad Dumitrescu <vladum@google.com>
>>> ---
>>>   include/uapi/linux/bpf.h       |  1 +
>>>   net/core/filter.c              | 11 +++++++++++
>>>   tools/include/uapi/linux/bpf.h |  1 +
>>>   3 files changed, 13 insertions(+)
>>>
>>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
>>> index e880ae6434ee..9757a2002513 100644
>>> --- a/include/uapi/linux/bpf.h
>>> +++ b/include/uapi/linux/bpf.h
>>> @@ -947,6 +947,7 @@ struct bpf_sock_ops {
>>>       __u32 local_ip6[4];    /* Stored in network byte order */
>>>       __u32 remote_port;    /* Stored in network byte order */
>>>       __u32 local_port;    /* stored in host byte order */
>>> +    __u32 priority;
>>>   };
>>>     /* List of known BPF sock_ops operators.
>>> diff --git a/net/core/filter.c b/net/core/filter.c
>>> index 61c791f9f628..a6329642d047 100644
>>> --- a/net/core/filter.c
>>> +++ b/net/core/filter.c
>>> @@ -4449,6 +4449,17 @@ static u32 sock_ops_convert_ctx_access(enum
>>> bpf_access_type type,
>>>           *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
>>>                         offsetof(struct sock_common, skc_num));
>>>           break;
>>> +
>>> +    case offsetof(struct bpf_sock_ops, priority):
>>> +        BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_priority) != 4);
>>> +
>>> +        *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
>>> +                        struct bpf_sock_ops_kern, sk),
>>> +                      si->dst_reg, si->src_reg,
>>> +                      offsetof(struct bpf_sock_ops_kern, sk));
>>> +        *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
>>> +                      offsetof(struct sock, sk_priority));
>>> +        break;
>>
>> Hm, I don't think this would work, I actually think your initial patch
>> was ok.
>> bpf_setsockopt() as well as bpf_getsockopt() check for sk_fullsock(sk)
>> right
>> before accessing options on either socket or TCP level, and bail out
>> with error
>> otherwise; in such cases we'd read something else here and assume it's
>> sk_priority.
> 
> even if it's not fullsock, it will just read zero, no? what's a problem
> with that?
> In non-fullsock hooks like BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB
> the program author will know that it's meaningless to read sk_priority,
> so returning zero with minimal checks is fine.
> While adding extra runtime if (sk_fullsock(sk)) is unnecessary,
> since the safety is not compromised.

Hm, on my kernel, struct sock has the 4 bytes sk_priority at offset 440,
struct request_sock itself is only 232 byte long in total, and the struct
inet_timewait_sock is 208 byte long, so you'd be accessing out of bounds
that way, so it cannot be ignored and assumed zero.

If we don't care about error when !fullsock, then you could code the
sk_fullsock(sk) check in BPF itself above in the ctx conversion, and
set it to 0 manually when !fullsock. It might make it harder in the
future to change sk_fullsock() itself, but in any case sk_fullsock()
helper should get a comment in its function saying that when contents
are changed, also above BPF bits need to be adjusted to remain an
equivalent test.
Alexei Starovoitov Nov. 11, 2017, 10:38 p.m. UTC | #4
On 11/12/17 4:46 AM, Daniel Borkmann wrote:
> On 11/11/2017 05:06 AM, Alexei Starovoitov wrote:
>> On 11/11/17 6:07 AM, Daniel Borkmann wrote:
>>> On 11/10/2017 08:17 PM, Vlad Dumitrescu wrote:
>>>> From: Vlad Dumitrescu <vladum@google.com>
>>>>
>>>> Allows BPF_PROG_TYPE_SOCK_OPS programs to read sk_priority.
>>>>
>>>> Signed-off-by: Vlad Dumitrescu <vladum@google.com>
>>>> ---
>>>>   include/uapi/linux/bpf.h       |  1 +
>>>>   net/core/filter.c              | 11 +++++++++++
>>>>   tools/include/uapi/linux/bpf.h |  1 +
>>>>   3 files changed, 13 insertions(+)
>>>>
>>>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
>>>> index e880ae6434ee..9757a2002513 100644
>>>> --- a/include/uapi/linux/bpf.h
>>>> +++ b/include/uapi/linux/bpf.h
>>>> @@ -947,6 +947,7 @@ struct bpf_sock_ops {
>>>>       __u32 local_ip6[4];    /* Stored in network byte order */
>>>>       __u32 remote_port;    /* Stored in network byte order */
>>>>       __u32 local_port;    /* stored in host byte order */
>>>> +    __u32 priority;
>>>>   };
>>>>     /* List of known BPF sock_ops operators.
>>>> diff --git a/net/core/filter.c b/net/core/filter.c
>>>> index 61c791f9f628..a6329642d047 100644
>>>> --- a/net/core/filter.c
>>>> +++ b/net/core/filter.c
>>>> @@ -4449,6 +4449,17 @@ static u32 sock_ops_convert_ctx_access(enum
>>>> bpf_access_type type,
>>>>           *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
>>>>                         offsetof(struct sock_common, skc_num));
>>>>           break;
>>>> +
>>>> +    case offsetof(struct bpf_sock_ops, priority):
>>>> +        BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_priority) != 4);
>>>> +
>>>> +        *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
>>>> +                        struct bpf_sock_ops_kern, sk),
>>>> +                      si->dst_reg, si->src_reg,
>>>> +                      offsetof(struct bpf_sock_ops_kern, sk));
>>>> +        *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
>>>> +                      offsetof(struct sock, sk_priority));
>>>> +        break;
>>>
>>> Hm, I don't think this would work, I actually think your initial patch
>>> was ok.
>>> bpf_setsockopt() as well as bpf_getsockopt() check for sk_fullsock(sk)
>>> right
>>> before accessing options on either socket or TCP level, and bail out
>>> with error
>>> otherwise; in such cases we'd read something else here and assume it's
>>> sk_priority.
>>
>> even if it's not fullsock, it will just read zero, no? what's a problem
>> with that?
>> In non-fullsock hooks like BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB
>> the program author will know that it's meaningless to read sk_priority,
>> so returning zero with minimal checks is fine.
>> While adding extra runtime if (sk_fullsock(sk)) is unnecessary,
>> since the safety is not compromised.
>
> Hm, on my kernel, struct sock has the 4 bytes sk_priority at offset 440,
> struct request_sock itself is only 232 byte long in total, and the struct
> inet_timewait_sock is 208 byte long, so you'd be accessing out of bounds
> that way, so it cannot be ignored and assumed zero.

I thought we always pass fully allocated sock but technically not 
fullsock yet. My mistake. We do: tcp_timeout_init((struct sock *)req))
so yeah ctx rewrite approach won't work.
Let's go back to access via helper.
Vlad Dumitrescu Nov. 13, 2017, 7 p.m. UTC | #5
On Sat, Nov 11, 2017 at 2:38 PM, Alexei Starovoitov <ast@fb.com> wrote:
>
> On 11/12/17 4:46 AM, Daniel Borkmann wrote:
>>
>> On 11/11/2017 05:06 AM, Alexei Starovoitov wrote:
>>>
>>> On 11/11/17 6:07 AM, Daniel Borkmann wrote:
>>>>
>>>> On 11/10/2017 08:17 PM, Vlad Dumitrescu wrote:
>>>>>
>>>>> From: Vlad Dumitrescu <vladum@google.com>
>>>>>
>>>>> Allows BPF_PROG_TYPE_SOCK_OPS programs to read sk_priority.
>>>>>
>>>>> Signed-off-by: Vlad Dumitrescu <vladum@google.com>
>>>>> ---
>>>>>   include/uapi/linux/bpf.h       |  1 +
>>>>>   net/core/filter.c              | 11 +++++++++++
>>>>>   tools/include/uapi/linux/bpf.h |  1 +
>>>>>   3 files changed, 13 insertions(+)
>>>>>
>>>>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
>>>>> index e880ae6434ee..9757a2002513 100644
>>>>> --- a/include/uapi/linux/bpf.h
>>>>> +++ b/include/uapi/linux/bpf.h
>>>>> @@ -947,6 +947,7 @@ struct bpf_sock_ops {
>>>>>       __u32 local_ip6[4];    /* Stored in network byte order */
>>>>>       __u32 remote_port;    /* Stored in network byte order */
>>>>>       __u32 local_port;    /* stored in host byte order */
>>>>> +    __u32 priority;
>>>>>   };
>>>>>     /* List of known BPF sock_ops operators.
>>>>> diff --git a/net/core/filter.c b/net/core/filter.c
>>>>> index 61c791f9f628..a6329642d047 100644
>>>>> --- a/net/core/filter.c
>>>>> +++ b/net/core/filter.c
>>>>> @@ -4449,6 +4449,17 @@ static u32 sock_ops_convert_ctx_access(enum
>>>>> bpf_access_type type,
>>>>>           *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
>>>>>                         offsetof(struct sock_common, skc_num));
>>>>>           break;
>>>>> +
>>>>> +    case offsetof(struct bpf_sock_ops, priority):
>>>>> +        BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_priority) != 4);
>>>>> +
>>>>> +        *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
>>>>> +                        struct bpf_sock_ops_kern, sk),
>>>>> +                      si->dst_reg, si->src_reg,
>>>>> +                      offsetof(struct bpf_sock_ops_kern, sk));
>>>>> +        *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
>>>>> +                      offsetof(struct sock, sk_priority));
>>>>> +        break;
>>>>
>>>>
>>>> Hm, I don't think this would work, I actually think your initial patch
>>>> was ok.
>>>> bpf_setsockopt() as well as bpf_getsockopt() check for sk_fullsock(sk)
>>>> right
>>>> before accessing options on either socket or TCP level, and bail out
>>>> with error
>>>> otherwise; in such cases we'd read something else here and assume it's
>>>> sk_priority.
>>>
>>>
>>> even if it's not fullsock, it will just read zero, no? what's a problem
>>> with that?
>>> In non-fullsock hooks like BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB
>>> the program author will know that it's meaningless to read sk_priority,
>>> so returning zero with minimal checks is fine.
>>> While adding extra runtime if (sk_fullsock(sk)) is unnecessary,
>>> since the safety is not compromised.
>>
>>
>> Hm, on my kernel, struct sock has the 4 bytes sk_priority at offset 440,
>> struct request_sock itself is only 232 byte long in total, and the struct
>> inet_timewait_sock is 208 byte long, so you'd be accessing out of bounds
>> that way, so it cannot be ignored and assumed zero.
>
>
> I thought we always pass fully allocated sock but technically not fullsock yet. My mistake. We do: tcp_timeout_init((struct sock *)req))
> so yeah ctx rewrite approach won't work.
> Let's go back to access via helper.
>

TIL. Thanks!

Is there anything else needed from me to get the helper approach accepted?
Lawrence Brakmo Nov. 13, 2017, 8:09 p.m. UTC | #6
On 11/13/17, 11:01 AM, "Vlad Dumitrescu" <vlad@dumitrescu.ro> wrote:

    On Sat, Nov 11, 2017 at 2:38 PM, Alexei Starovoitov <ast@fb.com> wrote:
    >

    > On 11/12/17 4:46 AM, Daniel Borkmann wrote:

    >>

    >> On 11/11/2017 05:06 AM, Alexei Starovoitov wrote:

    >>>

    >>> On 11/11/17 6:07 AM, Daniel Borkmann wrote:

    >>>>

    >>>> On 11/10/2017 08:17 PM, Vlad Dumitrescu wrote:

    >>>>>

    >>>>> From: Vlad Dumitrescu <vladum@google.com>

    >>>>>

    >>>>> Allows BPF_PROG_TYPE_SOCK_OPS programs to read sk_priority.

    >>>>>

    >>>>> Signed-off-by: Vlad Dumitrescu <vladum@google.com>

    >>>>> ---

    >>>>>   include/uapi/linux/bpf.h       |  1 +

    >>>>>   net/core/filter.c              | 11 +++++++++++

    >>>>>   tools/include/uapi/linux/bpf.h |  1 +

    >>>>>   3 files changed, 13 insertions(+)

    >>>>>

    >>>>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h

    >>>>> index e880ae6434ee..9757a2002513 100644

    >>>>> --- a/include/uapi/linux/bpf.h

    >>>>> +++ b/include/uapi/linux/bpf.h

    >>>>> @@ -947,6 +947,7 @@ struct bpf_sock_ops {

    >>>>>       __u32 local_ip6[4];    /* Stored in network byte order */

    >>>>>       __u32 remote_port;    /* Stored in network byte order */

    >>>>>       __u32 local_port;    /* stored in host byte order */

    >>>>> +    __u32 priority;

    >>>>>   };

    >>>>>     /* List of known BPF sock_ops operators.

    >>>>> diff --git a/net/core/filter.c b/net/core/filter.c

    >>>>> index 61c791f9f628..a6329642d047 100644

    >>>>> --- a/net/core/filter.c

    >>>>> +++ b/net/core/filter.c

    >>>>> @@ -4449,6 +4449,17 @@ static u32 sock_ops_convert_ctx_access(enum

    >>>>> bpf_access_type type,

    >>>>>           *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,

    >>>>>                         offsetof(struct sock_common, skc_num));

    >>>>>           break;

    >>>>> +

    >>>>> +    case offsetof(struct bpf_sock_ops, priority):

    >>>>> +        BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_priority) != 4);

    >>>>> +

    >>>>> +        *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(

    >>>>> +                        struct bpf_sock_ops_kern, sk),

    >>>>> +                      si->dst_reg, si->src_reg,

    >>>>> +                      offsetof(struct bpf_sock_ops_kern, sk));

    >>>>> +        *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,

    >>>>> +                      offsetof(struct sock, sk_priority));

    >>>>> +        break;

    >>>>

    >>>>

    >>>> Hm, I don't think this would work, I actually think your initial patch

    >>>> was ok.

    >>>> bpf_setsockopt() as well as bpf_getsockopt() check for sk_fullsock(sk)

    >>>> right

    >>>> before accessing options on either socket or TCP level, and bail out

    >>>> with error

    >>>> otherwise; in such cases we'd read something else here and assume it's

    >>>> sk_priority.

    >>>

    >>>

    >>> even if it's not fullsock, it will just read zero, no? what's a problem

    >>> with that?

    >>> In non-fullsock hooks like BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB

    >>> the program author will know that it's meaningless to read sk_priority,

    >>> so returning zero with minimal checks is fine.

    >>> While adding extra runtime if (sk_fullsock(sk)) is unnecessary,

    >>> since the safety is not compromised.

    >>

    >>

    >> Hm, on my kernel, struct sock has the 4 bytes sk_priority at offset 440,

    >> struct request_sock itself is only 232 byte long in total, and the struct

    >> inet_timewait_sock is 208 byte long, so you'd be accessing out of bounds

    >> that way, so it cannot be ignored and assumed zero.

    >

    >

    > I thought we always pass fully allocated sock but technically not fullsock yet. My mistake. We do: tcp_timeout_init((struct sock *)req))

    > so yeah ctx rewrite approach won't work.

    > Let's go back to access via helper.

    >

    
    TIL. Thanks!
    
    Is there anything else needed from me to get the helper approach accepted?

I plan to add access to TCP state variables (cwnd, rtt, etc.) and I have been thinking
about this issue. I think it is possible to access it directly as long as we use a value
like 0xffffffff to represent an invalid value (e.g. not fullsock). The ctx rewrite just
needs to add a conditional to determine what to return. I would probably add a
field into the internal kernel struct to indicate if it is fullsock or not (we should
know when we call tcp_call_bpf whether it is a fullsock or not based on context).

Let me do a sample patch that I can send for review and get feedback from
Alexi and Daniel.
Daniel Borkmann Nov. 13, 2017, 8:20 p.m. UTC | #7
On 11/13/2017 09:09 PM, Lawrence Brakmo wrote:
> On 11/13/17, 11:01 AM, "Vlad Dumitrescu" <vlad@dumitrescu.ro> wrote:
> 
>     On Sat, Nov 11, 2017 at 2:38 PM, Alexei Starovoitov <ast@fb.com> wrote:
>     >
>     > On 11/12/17 4:46 AM, Daniel Borkmann wrote:
>     >>
>     >> On 11/11/2017 05:06 AM, Alexei Starovoitov wrote:
>     >>>
>     >>> On 11/11/17 6:07 AM, Daniel Borkmann wrote:
>     >>>>
>     >>>> On 11/10/2017 08:17 PM, Vlad Dumitrescu wrote:
>     >>>>>
>     >>>>> From: Vlad Dumitrescu <vladum@google.com>
>     >>>>>
>     >>>>> Allows BPF_PROG_TYPE_SOCK_OPS programs to read sk_priority.
>     >>>>>
>     >>>>> Signed-off-by: Vlad Dumitrescu <vladum@google.com>
>     >>>>> ---
>     >>>>>   include/uapi/linux/bpf.h       |  1 +
>     >>>>>   net/core/filter.c              | 11 +++++++++++
>     >>>>>   tools/include/uapi/linux/bpf.h |  1 +
>     >>>>>   3 files changed, 13 insertions(+)
>     >>>>>
>     >>>>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
>     >>>>> index e880ae6434ee..9757a2002513 100644
>     >>>>> --- a/include/uapi/linux/bpf.h
>     >>>>> +++ b/include/uapi/linux/bpf.h
>     >>>>> @@ -947,6 +947,7 @@ struct bpf_sock_ops {
>     >>>>>       __u32 local_ip6[4];    /* Stored in network byte order */
>     >>>>>       __u32 remote_port;    /* Stored in network byte order */
>     >>>>>       __u32 local_port;    /* stored in host byte order */
>     >>>>> +    __u32 priority;
>     >>>>>   };
>     >>>>>     /* List of known BPF sock_ops operators.
>     >>>>> diff --git a/net/core/filter.c b/net/core/filter.c
>     >>>>> index 61c791f9f628..a6329642d047 100644
>     >>>>> --- a/net/core/filter.c
>     >>>>> +++ b/net/core/filter.c
>     >>>>> @@ -4449,6 +4449,17 @@ static u32 sock_ops_convert_ctx_access(enum
>     >>>>> bpf_access_type type,
>     >>>>>           *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
>     >>>>>                         offsetof(struct sock_common, skc_num));
>     >>>>>           break;
>     >>>>> +
>     >>>>> +    case offsetof(struct bpf_sock_ops, priority):
>     >>>>> +        BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_priority) != 4);
>     >>>>> +
>     >>>>> +        *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
>     >>>>> +                        struct bpf_sock_ops_kern, sk),
>     >>>>> +                      si->dst_reg, si->src_reg,
>     >>>>> +                      offsetof(struct bpf_sock_ops_kern, sk));
>     >>>>> +        *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
>     >>>>> +                      offsetof(struct sock, sk_priority));
>     >>>>> +        break;
>     >>>>
>     >>>>
>     >>>> Hm, I don't think this would work, I actually think your initial patch
>     >>>> was ok.
>     >>>> bpf_setsockopt() as well as bpf_getsockopt() check for sk_fullsock(sk)
>     >>>> right
>     >>>> before accessing options on either socket or TCP level, and bail out
>     >>>> with error
>     >>>> otherwise; in such cases we'd read something else here and assume it's
>     >>>> sk_priority.
>     >>>
>     >>>
>     >>> even if it's not fullsock, it will just read zero, no? what's a problem
>     >>> with that?
>     >>> In non-fullsock hooks like BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB
>     >>> the program author will know that it's meaningless to read sk_priority,
>     >>> so returning zero with minimal checks is fine.
>     >>> While adding extra runtime if (sk_fullsock(sk)) is unnecessary,
>     >>> since the safety is not compromised.
>     >>
>     >>
>     >> Hm, on my kernel, struct sock has the 4 bytes sk_priority at offset 440,
>     >> struct request_sock itself is only 232 byte long in total, and the struct
>     >> inet_timewait_sock is 208 byte long, so you'd be accessing out of bounds
>     >> that way, so it cannot be ignored and assumed zero.
>     >
>     >
>     > I thought we always pass fully allocated sock but technically not fullsock yet. My mistake. We do: tcp_timeout_init((struct sock *)req))
>     > so yeah ctx rewrite approach won't work.
>     > Let's go back to access via helper.
>     >
>     
>     TIL. Thanks!
>     
>     Is there anything else needed from me to get the helper approach accepted?
> 
> I plan to add access to TCP state variables (cwnd, rtt, etc.) and I have been thinking
> about this issue. I think it is possible to access it directly as long as we use a value
> like 0xffffffff to represent an invalid value (e.g. not fullsock). The ctx rewrite just
> needs to add a conditional to determine what to return. I would probably add a
> field into the internal kernel struct to indicate if it is fullsock or not (we should
> know when we call tcp_call_bpf whether it is a fullsock or not based on context).
> 
> Let me do a sample patch that I can send for review and get feedback from
> Alexi and Daniel.

Agree, if the mov op from the ctx rewrite to read(/write) a sk member, for
example, is just a BPF_W, then we know upper reg bits are zero anyway for the
success case, so we might be able to utilize this for writing a signed error
back to the user if !fullsk.
Alexei Starovoitov Nov. 13, 2017, 10:51 p.m. UTC | #8
On 11/14/17 4:20 AM, Daniel Borkmann wrote:
> On 11/13/2017 09:09 PM, Lawrence Brakmo wrote:
>> On 11/13/17, 11:01 AM, "Vlad Dumitrescu" <vlad@dumitrescu.ro> wrote:
>>
>>     On Sat, Nov 11, 2017 at 2:38 PM, Alexei Starovoitov <ast@fb.com> wrote:
>>     >
>>     > On 11/12/17 4:46 AM, Daniel Borkmann wrote:
>>     >>
>>     >> On 11/11/2017 05:06 AM, Alexei Starovoitov wrote:
>>     >>>
>>     >>> On 11/11/17 6:07 AM, Daniel Borkmann wrote:
>>     >>>>
>>     >>>> On 11/10/2017 08:17 PM, Vlad Dumitrescu wrote:
>>     >>>>>
>>     >>>>> From: Vlad Dumitrescu <vladum@google.com>
>>     >>>>>
>>     >>>>> Allows BPF_PROG_TYPE_SOCK_OPS programs to read sk_priority.
>>     >>>>>
>>     >>>>> Signed-off-by: Vlad Dumitrescu <vladum@google.com>
>>     >>>>> ---
>>     >>>>>   include/uapi/linux/bpf.h       |  1 +
>>     >>>>>   net/core/filter.c              | 11 +++++++++++
>>     >>>>>   tools/include/uapi/linux/bpf.h |  1 +
>>     >>>>>   3 files changed, 13 insertions(+)
>>     >>>>>
>>     >>>>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
>>     >>>>> index e880ae6434ee..9757a2002513 100644
>>     >>>>> --- a/include/uapi/linux/bpf.h
>>     >>>>> +++ b/include/uapi/linux/bpf.h
>>     >>>>> @@ -947,6 +947,7 @@ struct bpf_sock_ops {
>>     >>>>>       __u32 local_ip6[4];    /* Stored in network byte order */
>>     >>>>>       __u32 remote_port;    /* Stored in network byte order */
>>     >>>>>       __u32 local_port;    /* stored in host byte order */
>>     >>>>> +    __u32 priority;
>>     >>>>>   };
>>     >>>>>     /* List of known BPF sock_ops operators.
>>     >>>>> diff --git a/net/core/filter.c b/net/core/filter.c
>>     >>>>> index 61c791f9f628..a6329642d047 100644
>>     >>>>> --- a/net/core/filter.c
>>     >>>>> +++ b/net/core/filter.c
>>     >>>>> @@ -4449,6 +4449,17 @@ static u32 sock_ops_convert_ctx_access(enum
>>     >>>>> bpf_access_type type,
>>     >>>>>           *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
>>     >>>>>                         offsetof(struct sock_common, skc_num));
>>     >>>>>           break;
>>     >>>>> +
>>     >>>>> +    case offsetof(struct bpf_sock_ops, priority):
>>     >>>>> +        BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_priority) != 4);
>>     >>>>> +
>>     >>>>> +        *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
>>     >>>>> +                        struct bpf_sock_ops_kern, sk),
>>     >>>>> +                      si->dst_reg, si->src_reg,
>>     >>>>> +                      offsetof(struct bpf_sock_ops_kern, sk));
>>     >>>>> +        *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
>>     >>>>> +                      offsetof(struct sock, sk_priority));
>>     >>>>> +        break;
>>     >>>>
>>     >>>>
>>     >>>> Hm, I don't think this would work, I actually think your initial patch
>>     >>>> was ok.
>>     >>>> bpf_setsockopt() as well as bpf_getsockopt() check for sk_fullsock(sk)
>>     >>>> right
>>     >>>> before accessing options on either socket or TCP level, and bail out
>>     >>>> with error
>>     >>>> otherwise; in such cases we'd read something else here and assume it's
>>     >>>> sk_priority.
>>     >>>
>>     >>>
>>     >>> even if it's not fullsock, it will just read zero, no? what's a problem
>>     >>> with that?
>>     >>> In non-fullsock hooks like BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB
>>     >>> the program author will know that it's meaningless to read sk_priority,
>>     >>> so returning zero with minimal checks is fine.
>>     >>> While adding extra runtime if (sk_fullsock(sk)) is unnecessary,
>>     >>> since the safety is not compromised.
>>     >>
>>     >>
>>     >> Hm, on my kernel, struct sock has the 4 bytes sk_priority at offset 440,
>>     >> struct request_sock itself is only 232 byte long in total, and the struct
>>     >> inet_timewait_sock is 208 byte long, so you'd be accessing out of bounds
>>     >> that way, so it cannot be ignored and assumed zero.
>>     >
>>     >
>>     > I thought we always pass fully allocated sock but technically not fullsock yet. My mistake. We do: tcp_timeout_init((struct sock *)req))
>>     > so yeah ctx rewrite approach won't work.
>>     > Let's go back to access via helper.
>>     >
>>
>>     TIL. Thanks!
>>
>>     Is there anything else needed from me to get the helper approach accepted?
>>
>> I plan to add access to TCP state variables (cwnd, rtt, etc.) and I have been thinking
>> about this issue. I think it is possible to access it directly as long as we use a value
>> like 0xffffffff to represent an invalid value (e.g. not fullsock). The ctx rewrite just
>> needs to add a conditional to determine what to return. I would probably add a
>> field into the internal kernel struct to indicate if it is fullsock or not (we should
>> know when we call tcp_call_bpf whether it is a fullsock or not based on context).
>>
>> Let me do a sample patch that I can send for review and get feedback from
>> Alexi and Daniel.
>
> Agree, if the mov op from the ctx rewrite to read(/write) a sk member, for
> example, is just a BPF_W, then we know upper reg bits are zero anyway for the
> success case, so we might be able to utilize this for writing a signed error
> back to the user if !fullsk.

it can be __u64 in bpf_sock_ops too, while real read is 32-bit or less,
then guaranteed no conflicts if we return (s64)-enoent or (s64)-einval
in case of !fullsock.
I like the idea of copying boolean value of sk_fullsock() into hidden
part of bpf_sock_ops_kern, since it's been accessed by tcp_call_bpf()
anyway.
diff mbox series

Patch

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index e880ae6434ee..9757a2002513 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -947,6 +947,7 @@  struct bpf_sock_ops {
 	__u32 local_ip6[4];	/* Stored in network byte order */
 	__u32 remote_port;	/* Stored in network byte order */
 	__u32 local_port;	/* stored in host byte order */
+	__u32 priority;
 };
 
 /* List of known BPF sock_ops operators.
diff --git a/net/core/filter.c b/net/core/filter.c
index 61c791f9f628..a6329642d047 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -4449,6 +4449,17 @@  static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
 		*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
 				      offsetof(struct sock_common, skc_num));
 		break;
+
+	case offsetof(struct bpf_sock_ops, priority):
+		BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_priority) != 4);
+
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+						struct bpf_sock_ops_kern, sk),
+				      si->dst_reg, si->src_reg,
+				      offsetof(struct bpf_sock_ops_kern, sk));
+		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
+				      offsetof(struct sock, sk_priority));
+		break;
 	}
 	return insn - insn_buf;
 }
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index e880ae6434ee..9757a2002513 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -947,6 +947,7 @@  struct bpf_sock_ops {
 	__u32 local_ip6[4];	/* Stored in network byte order */
 	__u32 remote_port;	/* Stored in network byte order */
 	__u32 local_port;	/* stored in host byte order */
+	__u32 priority;
 };
 
 /* List of known BPF sock_ops operators.