[v2,bpf-next,4/9] bpf: add bpf helper bpf_skb_ecn_set_ce
diff mbox series

Message ID 20190223010703.678070-5-brakmo@fb.com
State Changes Requested
Delegated to: BPF Maintainers
Headers show
Series
  • bpf: Network Resource Manager (NRM)
Related show

Commit Message

Lawrence Brakmo Feb. 23, 2019, 1:06 a.m. UTC
This patch adds a new bpf helper BPF_FUNC_skb_ecn_set_ce
"int bpf_skb_ecn_set_ce(struct sk_buff *skb)". It is added to
BPF_PROG_TYPE_CGROUP_SKB typed bpf_prog which currently can
be attached to the ingress and egress path. The helper is needed
because this type of bpf_prog cannot modify the skb directly.

This helper is used to set the ECN field of ECN capable IP packets to ce
(congestion encountered) in the IPv6 or IPv4 header of the skb. It can be
used by a bpf_prog to manage egress or ingress network bandwidth limit
per cgroupv2 by inducing an ECN response in the TCP sender.
This works best when using DCTCP.

Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
---
 include/uapi/linux/bpf.h | 10 +++++++++-
 net/core/filter.c        | 14 ++++++++++++++
 2 files changed, 23 insertions(+), 1 deletion(-)

Comments

Daniel Borkmann Feb. 23, 2019, 1:14 a.m. UTC | #1
On 02/23/2019 02:06 AM, brakmo wrote:
> This patch adds a new bpf helper BPF_FUNC_skb_ecn_set_ce
> "int bpf_skb_ecn_set_ce(struct sk_buff *skb)". It is added to
> BPF_PROG_TYPE_CGROUP_SKB typed bpf_prog which currently can
> be attached to the ingress and egress path. The helper is needed
> because his type of bpf_prog cannot modify the skb directly.
> 
> This helper is used to set the ECN field of ECN capable IP packets to ce
> (congestion encountered) in the IPv6 or IPv4 header of the skb. It can be
> used by a bpf_prog to manage egress or ingress network bandwdith limit
> per cgroupv2 by inducing an ECN response in the TCP sender.
> This works best when using DCTCP.
> 
> Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
> ---
>  include/uapi/linux/bpf.h | 10 +++++++++-
>  net/core/filter.c        | 14 ++++++++++++++
>  2 files changed, 23 insertions(+), 1 deletion(-)
> 
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 95b5058fa945..fc646f3eaf9b 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -2365,6 +2365,13 @@ union bpf_attr {
>   *		Make a tcp_sock enter CWR state.
>   *	Return
>   *		0 on success, or a negative error in case of failure.
> + *
> + * int bpf_skb_ecn_set_ce(struct sk_buf *skb)
> + *	Description
> + *		Sets ECN of IP header to ce (congestion encountered) if
> + *		current value is ect (ECN capable). Works with IPv6 and IPv4.
> + *	Return
> + *		1 if set, 0 if not set.
>   */
>  #define __BPF_FUNC_MAPPER(FN)		\
>  	FN(unspec),			\
> @@ -2464,7 +2471,8 @@ union bpf_attr {
>  	FN(spin_unlock),		\
>  	FN(sk_fullsock),		\
>  	FN(tcp_sock),			\
> -	FN(tcp_enter_cwr),
> +	FN(tcp_enter_cwr),		\
> +	FN(skb_ecn_set_ce),
>  
>  /* integer value in 'imm' field of BPF_CALL instruction selects which helper
>   * function eBPF program intends to call
> diff --git a/net/core/filter.c b/net/core/filter.c
> index ca57ef25279c..955369c6ed30 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -5444,6 +5444,18 @@ static const struct bpf_func_proto bpf_tcp_enter_cwr_proto = {
>  	.ret_type    = RET_INTEGER,
>  	.arg1_type    = ARG_PTR_TO_TCP_SOCK,
>  };
> +
> +BPF_CALL_1(bpf_skb_ecn_set_ce, struct sk_buff *, skb)
> +{
> +	return INET_ECN_set_ce(skb);

Hm, but as mentioned last time, don't we have to ensure here that skb
is writable (aka skb->data private to us before writing into it)?

> +}
> +
> +static const struct bpf_func_proto bpf_skb_ecn_set_ce_proto = {
> +	.func		= bpf_skb_ecn_set_ce,
> +	.gpl_only	= false,
> +	.ret_type	= RET_INTEGER,
> +	.arg1_type	= ARG_PTR_TO_CTX,
> +};
>  #endif /* CONFIG_INET */
>  
>  bool bpf_helper_changes_pkt_data(void *func)
> @@ -5610,6 +5622,8 @@ cg_skb_func_proto(enum bpf_func_id func_id, struct bpf_prog *prog)
>  		} else {
>  			return NULL;
>  		}
> +	case BPF_FUNC_skb_ecn_set_ce:
> +		return &bpf_skb_ecn_set_ce_proto;
>  #endif
>  	default:
>  		return sk_filter_func_proto(func_id, prog);
> 

Thanks,
Daniel
Martin KaFai Lau Feb. 23, 2019, 7:30 a.m. UTC | #2
On Sat, Feb 23, 2019 at 02:14:26AM +0100, Daniel Borkmann wrote:
> On 02/23/2019 02:06 AM, brakmo wrote:
> > This patch adds a new bpf helper BPF_FUNC_skb_ecn_set_ce
> > "int bpf_skb_ecn_set_ce(struct sk_buff *skb)". It is added to
> > BPF_PROG_TYPE_CGROUP_SKB typed bpf_prog which currently can
> > be attached to the ingress and egress path. The helper is needed
> > because his type of bpf_prog cannot modify the skb directly.
> > 
> > This helper is used to set the ECN field of ECN capable IP packets to ce
> > (congestion encountered) in the IPv6 or IPv4 header of the skb. It can be
> > used by a bpf_prog to manage egress or ingress network bandwdith limit
> > per cgroupv2 by inducing an ECN response in the TCP sender.
> > This works best when using DCTCP.
> > 
> > Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
> > ---
> >  include/uapi/linux/bpf.h | 10 +++++++++-
> >  net/core/filter.c        | 14 ++++++++++++++
> >  2 files changed, 23 insertions(+), 1 deletion(-)
> > 
> > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> > index 95b5058fa945..fc646f3eaf9b 100644
> > --- a/include/uapi/linux/bpf.h
> > +++ b/include/uapi/linux/bpf.h
> > @@ -2365,6 +2365,13 @@ union bpf_attr {
> >   *		Make a tcp_sock enter CWR state.
> >   *	Return
> >   *		0 on success, or a negative error in case of failure.
> > + *
> > + * int bpf_skb_ecn_set_ce(struct sk_buf *skb)
> > + *	Description
> > + *		Sets ECN of IP header to ce (congestion encountered) if
> > + *		current value is ect (ECN capable). Works with IPv6 and IPv4.
> > + *	Return
> > + *		1 if set, 0 if not set.
> >   */
> >  #define __BPF_FUNC_MAPPER(FN)		\
> >  	FN(unspec),			\
> > @@ -2464,7 +2471,8 @@ union bpf_attr {
> >  	FN(spin_unlock),		\
> >  	FN(sk_fullsock),		\
> >  	FN(tcp_sock),			\
> > -	FN(tcp_enter_cwr),
> > +	FN(tcp_enter_cwr),		\
> > +	FN(skb_ecn_set_ce),
> >  
> >  /* integer value in 'imm' field of BPF_CALL instruction selects which helper
> >   * function eBPF program intends to call
> > diff --git a/net/core/filter.c b/net/core/filter.c
> > index ca57ef25279c..955369c6ed30 100644
> > --- a/net/core/filter.c
> > +++ b/net/core/filter.c
> > @@ -5444,6 +5444,18 @@ static const struct bpf_func_proto bpf_tcp_enter_cwr_proto = {
> >  	.ret_type    = RET_INTEGER,
> >  	.arg1_type    = ARG_PTR_TO_TCP_SOCK,
> >  };
> > +
> > +BPF_CALL_1(bpf_skb_ecn_set_ce, struct sk_buff *, skb)
> > +{
> > +	return INET_ECN_set_ce(skb);
> 
> Hm, but as mentioned last time, don't we have to ensure here that skb
> is writable (aka skb->data private to us before writing into it)?
INET_ECN_set_ce(skb) is also called from a few net/sched/sch_*.c files,
but I don't see how they ensure that an skb is writable.

Maybe I have missed something there that can also be borrowed and
reused here?

Thanks,
Martin

> 
> > +}
> > +
> > +static const struct bpf_func_proto bpf_skb_ecn_set_ce_proto = {
> > +	.func		= bpf_skb_ecn_set_ce,
> > +	.gpl_only	= false,
> > +	.ret_type	= RET_INTEGER,
> > +	.arg1_type	= ARG_PTR_TO_CTX,
> > +};
> >  #endif /* CONFIG_INET */
> >  
> >  bool bpf_helper_changes_pkt_data(void *func)
> > @@ -5610,6 +5622,8 @@ cg_skb_func_proto(enum bpf_func_id func_id, struct bpf_prog *prog)
> >  		} else {
> >  			return NULL;
> >  		}
> > +	case BPF_FUNC_skb_ecn_set_ce:
> > +		return &bpf_skb_ecn_set_ce_proto;
> >  #endif
> >  	default:
> >  		return sk_filter_func_proto(func_id, prog);
> > 
> 
> Thanks,
> Daniel
Daniel Borkmann Feb. 25, 2019, 10:10 a.m. UTC | #3
On 02/23/2019 08:30 AM, Martin Lau wrote:
> On Sat, Feb 23, 2019 at 02:14:26AM +0100, Daniel Borkmann wrote:
>> On 02/23/2019 02:06 AM, brakmo wrote:
>>> This patch adds a new bpf helper BPF_FUNC_skb_ecn_set_ce
>>> "int bpf_skb_ecn_set_ce(struct sk_buff *skb)". It is added to
>>> BPF_PROG_TYPE_CGROUP_SKB typed bpf_prog which currently can
>>> be attached to the ingress and egress path. The helper is needed
>>> because his type of bpf_prog cannot modify the skb directly.
>>>
>>> This helper is used to set the ECN field of ECN capable IP packets to ce
>>> (congestion encountered) in the IPv6 or IPv4 header of the skb. It can be
>>> used by a bpf_prog to manage egress or ingress network bandwdith limit
>>> per cgroupv2 by inducing an ECN response in the TCP sender.
>>> This works best when using DCTCP.
>>>
>>> Signed-off-by: Lawrence Brakmo <brakmo@fb.com>
>>> ---
>>>  include/uapi/linux/bpf.h | 10 +++++++++-
>>>  net/core/filter.c        | 14 ++++++++++++++
>>>  2 files changed, 23 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
>>> index 95b5058fa945..fc646f3eaf9b 100644
>>> --- a/include/uapi/linux/bpf.h
>>> +++ b/include/uapi/linux/bpf.h
>>> @@ -2365,6 +2365,13 @@ union bpf_attr {
>>>   *		Make a tcp_sock enter CWR state.
>>>   *	Return
>>>   *		0 on success, or a negative error in case of failure.
>>> + *
>>> + * int bpf_skb_ecn_set_ce(struct sk_buf *skb)
>>> + *	Description
>>> + *		Sets ECN of IP header to ce (congestion encountered) if
>>> + *		current value is ect (ECN capable). Works with IPv6 and IPv4.
>>> + *	Return
>>> + *		1 if set, 0 if not set.
>>>   */
>>>  #define __BPF_FUNC_MAPPER(FN)		\
>>>  	FN(unspec),			\
>>> @@ -2464,7 +2471,8 @@ union bpf_attr {
>>>  	FN(spin_unlock),		\
>>>  	FN(sk_fullsock),		\
>>>  	FN(tcp_sock),			\
>>> -	FN(tcp_enter_cwr),
>>> +	FN(tcp_enter_cwr),		\
>>> +	FN(skb_ecn_set_ce),
>>>  
>>>  /* integer value in 'imm' field of BPF_CALL instruction selects which helper
>>>   * function eBPF program intends to call
>>> diff --git a/net/core/filter.c b/net/core/filter.c
>>> index ca57ef25279c..955369c6ed30 100644
>>> --- a/net/core/filter.c
>>> +++ b/net/core/filter.c
>>> @@ -5444,6 +5444,18 @@ static const struct bpf_func_proto bpf_tcp_enter_cwr_proto = {
>>>  	.ret_type    = RET_INTEGER,
>>>  	.arg1_type    = ARG_PTR_TO_TCP_SOCK,
>>>  };
>>> +
>>> +BPF_CALL_1(bpf_skb_ecn_set_ce, struct sk_buff *, skb)
>>> +{
>>> +	return INET_ECN_set_ce(skb);
>>
>> Hm, but as mentioned last time, don't we have to ensure here that skb
>> is writable (aka skb->data private to us before writing into it)?
> INET_ECN_set_ce(skb) is also called from a few net/sched/sch_*.c
> but I don't see how they ensure if a skb is writable.
> 
> May be I have missed something there that can also be borrowed and
> reused here?

My understanding is that before doing any writes into skb, we should make
sure the data area is private to us (and offset in linear data). In tc BPF
(ingress, egress) we use bpf_try_make_writable() helper for this, others
like act_{pedit,skbmod} or ovs have similar logic before writing into skb,
note that in all these cases it's mostly about generic writes, so location
could also be L4, for example.

The difference between the above helper and the net/sched/sch_*.c instances
could be that, for the qdisc case, INET_ECN_set_ce() is only called on egress,
and that there may be a convention that qdiscs specifically may mangle
it, whereas the helper could be called on both ingress and egress and confuse
other subsystems, since they won't see the original packet or may race by
seeing a partially updated (invalid) packet.

Eric, do you have a chance to clarify? Perhaps it would then make sense to
disallow the helper in the cgroup ingress path.
Eric Dumazet Feb. 25, 2019, 4:52 p.m. UTC | #4
On 02/25/2019 02:10 AM, Daniel Borkmann wrote:

> My understanding is that before doing any writes into skb, we should make
> sure the data area is private to us (and offset in linear data). In tc BPF
> (ingress, egress) we use bpf_try_make_writable() helper for this, others
> like act_{pedit,skbmod} or ovs have similar logic before writing into skb,
> note that in all these cases it's mostly about generic writes, so location
> could also be L4, for example.
> 
> Difference of above helper compared to net/sched/sch_*.c instances could
> be that it's i) for the qdisc case it's only on egress INET_ECN_set_ce()
> and that there may be a convention that qdiscs specifically may mangle
> it whereas the helper could be called on ingress and egress and confuse
> other subsystems since they won't see original or race by seeing partially
> updated (invalid) packet.
> 
> Eric, have a chance to clarify? Perhaps then would make sense to disallow
> the helper in cgroup ingress path.

Good observations Daniel, thanks for bringing this up.

skb_ensure_writable() seems like a big hammer for the case where we change some bits in the IP header.

TCP cloned packets certainly can have their headers mangled, so maybe
we need to use something using skb_header_cloned() instead of skb_cloned()

Patch
diff mbox series

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 95b5058fa945..fc646f3eaf9b 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2365,6 +2365,13 @@  union bpf_attr {
  *		Make a tcp_sock enter CWR state.
  *	Return
  *		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_skb_ecn_set_ce(struct sk_buff *skb)
+ *	Description
+ *		Sets ECN of IP header to ce (congestion encountered) if
+ *		current value is ect (ECN capable). Works with IPv6 and IPv4.
+ *	Return
+ *		1 if set, 0 if not set.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -2464,7 +2471,8 @@  union bpf_attr {
 	FN(spin_unlock),		\
 	FN(sk_fullsock),		\
 	FN(tcp_sock),			\
-	FN(tcp_enter_cwr),
+	FN(tcp_enter_cwr),		\
+	FN(skb_ecn_set_ce),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
diff --git a/net/core/filter.c b/net/core/filter.c
index ca57ef25279c..955369c6ed30 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -5444,6 +5444,18 @@  static const struct bpf_func_proto bpf_tcp_enter_cwr_proto = {
 	.ret_type    = RET_INTEGER,
 	.arg1_type    = ARG_PTR_TO_TCP_SOCK,
 };
+
+BPF_CALL_1(bpf_skb_ecn_set_ce, struct sk_buff *, skb)
+{
+	return INET_ECN_set_ce(skb);
+}
+
+static const struct bpf_func_proto bpf_skb_ecn_set_ce_proto = {
+	.func		= bpf_skb_ecn_set_ce,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+};
 #endif /* CONFIG_INET */
 
 bool bpf_helper_changes_pkt_data(void *func)
@@ -5610,6 +5622,8 @@  cg_skb_func_proto(enum bpf_func_id func_id, struct bpf_prog *prog)
 		} else {
 			return NULL;
 		}
+	case BPF_FUNC_skb_ecn_set_ce:
+		return &bpf_skb_ecn_set_ce_proto;
 #endif
 	default:
 		return sk_filter_func_proto(func_id, prog);