diff mbox series

[net-next,RFC] ipv6: elide flowlabel check if no exclusive leases exist

Message ID 20190517155625.117835-1-willemdebruijn.kernel@gmail.com
State RFC
Delegated to: David Miller
Headers show
Series [net-next,RFC] ipv6: elide flowlabel check if no exclusive leases exist | expand

Commit Message

Willem de Bruijn May 17, 2019, 3:56 p.m. UTC
From: Willem de Bruijn <willemb@google.com>

Processes can request ipv6 flowlabels with cmsg IPV6_FLOWINFO.
If not set, by default an autogenerated flowlabel is selected.

Explicit flowlabels require a control operation per label plus a
datapath check on every connection (every datagram if unconnected).

This is particularly expensive on unconnected sockets with many
connections, such as QUIC.

In the common case, where no lease is exclusive, the check can be
safely elided, as both lease request and check trivially succeed.
Indeed, autoflowlabel does the same (even with exclusive leases).

Elide the check if no process has requested an exclusive lease.

This is an optimization. Robust applications still have to revert to
requesting leases if the fast path fails due to an exclusive lease.

This is decidedly an RFC patch:
- need to update all fl6_sock_lookup callers, not just udp
- behavior should be per-netns isolated

Other approaches considered:
- a single "get all flowlabels, non-exclusive" flowlabel get request
  if set, elide fl6_sock_lookup and fail exclusive lease requests

- sysctls (only useful if on by default, with static_branch)
  A) "non-exclusive mode", failing all exclusive lease requests:
     processes already have to be robust against lease failure
  B) just bypass check in fl6_sock_lookup, like autoflowlabel

Signed-off-by: Willem de Bruijn <willemb@google.com>
---
 include/net/ipv6.h       | 11 +++++++++++
 net/ipv6/ip6_flowlabel.c |  6 ++++++
 net/ipv6/udp.c           |  8 ++++----
 3 files changed, 21 insertions(+), 4 deletions(-)

Comments

Eric Dumazet May 17, 2019, 8:32 p.m. UTC | #1
On 5/17/19 8:56 AM, Willem de Bruijn wrote:
> From: Willem de Bruijn <willemb@google.com>
> 
> Processes can request ipv6 flowlabels with cmsg IPV6_FLOWINFO.
> If not set, by default an autogenerated flowlabel is selected.
> 
> Explicit flowlabels require a control operation per label plus a
> datapath check on every connection (every datagram if unconnected).
> 
> This is particularly expensive on unconnected sockets with many
> connections, such as QUIC.
> 
> In the common case, where no lease is exclusive, the check can be
> safely elided, as both lease request and check trivially succeed.
> Indeed, autoflowlabel does the same (even with exclusive leases).
> 
> Elide the check if no process has requested an exclusive lease.
> 
> This is an optimization. Robust applications still have to revert to
> requesting leases if the fast path fails due to an exclusive lease.
> 
> This is decidedly an RFC patch:
> - need to update all fl6_sock_lookup callers, not just udp
> - behavior should be per-netns isolated
> 
> Other approaches considered:
> - a single "get all flowlabels, non-exclusive" flowlabel get request
>   if set, elide fl6_sock_lookup and fail exclusive lease requests
> 
> - sysctls (only useful if on by default, with static_branch)
>   A) "non-exclusive mode", failing all exclusive lease requests:
>      processes already have to be robust against lease failure
>   B) just bypass check in fl6_sock_lookup, like autoflowlabel
> 
> Signed-off-by: Willem de Bruijn <willemb@google.com>
> ---
>  include/net/ipv6.h       | 11 +++++++++++
>  net/ipv6/ip6_flowlabel.c |  6 ++++++
>  net/ipv6/udp.c           |  8 ++++----
>  3 files changed, 21 insertions(+), 4 deletions(-)
> 
> diff --git a/include/net/ipv6.h b/include/net/ipv6.h
> index daf80863d3a50..8881cee572410 100644
> --- a/include/net/ipv6.h
> +++ b/include/net/ipv6.h
> @@ -17,6 +17,7 @@
>  #include <linux/hardirq.h>
>  #include <linux/jhash.h>
>  #include <linux/refcount.h>
> +#include <linux/jump_label.h>
>  #include <net/if_inet6.h>
>  #include <net/ndisc.h>
>  #include <net/flow.h>
> @@ -343,7 +344,17 @@ static inline void txopt_put(struct ipv6_txoptions *opt)
>  		kfree_rcu(opt, rcu);
>  }
>  
> +extern struct static_key_false ipv6_flowlabel_exclusive;
>  struct ip6_flowlabel *fl6_sock_lookup(struct sock *sk, __be32 label);
> +static inline struct ip6_flowlabel *fl6_sock_verify(struct sock *sk,
> +						    __be32 label)
> +{
> +	if (static_branch_unlikely(&ipv6_flowlabel_exclusive))
> +		return fl6_sock_lookup(sk, label) ? : ERR_PTR(-ENOENT);
> +
> +	return NULL;
> +}
> +
>  struct ipv6_txoptions *fl6_merge_options(struct ipv6_txoptions *opt_space,
>  					 struct ip6_flowlabel *fl,
>  					 struct ipv6_txoptions *fopt);
> diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c
> index be5f3d7ceb966..d5f4233b04e0c 100644
> --- a/net/ipv6/ip6_flowlabel.c
> +++ b/net/ipv6/ip6_flowlabel.c
> @@ -57,6 +57,8 @@ static DEFINE_SPINLOCK(ip6_fl_lock);
>  
>  static DEFINE_SPINLOCK(ip6_sk_fl_lock);
>  
> +DEFINE_STATIC_KEY_FALSE(ipv6_flowlabel_exclusive);
> +
>  #define for_each_fl_rcu(hash, fl)				\
>  	for (fl = rcu_dereference_bh(fl_ht[(hash)]);		\
>  	     fl != NULL;					\
> @@ -98,6 +100,8 @@ static void fl_free_rcu(struct rcu_head *head)
>  {
>  	struct ip6_flowlabel *fl = container_of(head, struct ip6_flowlabel, rcu);
>  
> +	if (fl->share != IPV6_FL_S_NONE && fl->share != IPV6_FL_S_ANY)
> +		static_branch_dec(&ipv6_flowlabel_exclusive);

static_branch_dec() can not be invoked from a rcu call back.

>  	if (fl->share == IPV6_FL_S_PROCESS)
>  		put_pid(fl->owner.pid);
>  	kfree(fl->opt);
> @@ -423,6 +427,8 @@ fl_create(struct net *net, struct sock *sk, struct in6_flowlabel_req *freq,
>  	}
>  	fl->dst = freq->flr_dst;
>  	atomic_set(&fl->users, 1);
> +	if (fl->share != IPV6_FL_S_ANY)
> +		static_branch_inc(&ipv6_flowlabel_exclusive);


Can this be used by unpriv users ?

If yes, then you want to use static_key_false_deferred instead
Willem de Bruijn May 17, 2019, 9:51 p.m. UTC | #2
On Fri, May 17, 2019 at 4:32 PM Eric Dumazet <eric.dumazet@gmail.com> wrote:
>
>
>
> On 5/17/19 8:56 AM, Willem de Bruijn wrote:
> > From: Willem de Bruijn <willemb@google.com>
> >
> > Processes can request ipv6 flowlabels with cmsg IPV6_FLOWINFO.
> > If not set, by default an autogenerated flowlabel is selected.
> >
> > Explicit flowlabels require a control operation per label plus a
> > datapath check on every connection (every datagram if unconnected).
> >
> > This is particularly expensive on unconnected sockets with many
> > connections, such as QUIC.
> >
> > In the common case, where no lease is exclusive, the check can be
> > safely elided, as both lease request and check trivially succeed.
> > Indeed, autoflowlabel does the same (even with exclusive leases).
> >
> > Elide the check if no process has requested an exclusive lease.
> >
> > This is an optimization. Robust applications still have to revert to
> > requesting leases if the fast path fails due to an exclusive lease.
> >
> > This is decidedly an RFC patch:
> > - need to update all fl6_sock_lookup callers, not just udp
> > - behavior should be per-netns isolated
> >
> > Other approaches considered:
> > - a single "get all flowlabels, non-exclusive" flowlabel get request
> >   if set, elide fl6_sock_lookup and fail exclusive lease requests
> >
> > - sysctls (only useful if on by default, with static_branch)
> >   A) "non-exclusive mode", failing all exclusive lease requests:
> >      processes already have to be robust against lease failure
> >   B) just bypass check in fl6_sock_lookup, like autoflowlabel
> >
> > Signed-off-by: Willem de Bruijn <willemb@google.com>
> > ---
> >  include/net/ipv6.h       | 11 +++++++++++
> >  net/ipv6/ip6_flowlabel.c |  6 ++++++
> >  net/ipv6/udp.c           |  8 ++++----
> >  3 files changed, 21 insertions(+), 4 deletions(-)
> >
> > diff --git a/include/net/ipv6.h b/include/net/ipv6.h
> > index daf80863d3a50..8881cee572410 100644
> > --- a/include/net/ipv6.h
> > +++ b/include/net/ipv6.h
> > @@ -17,6 +17,7 @@
> >  #include <linux/hardirq.h>
> >  #include <linux/jhash.h>
> >  #include <linux/refcount.h>
> > +#include <linux/jump_label.h>
> >  #include <net/if_inet6.h>
> >  #include <net/ndisc.h>
> >  #include <net/flow.h>
> > @@ -343,7 +344,17 @@ static inline void txopt_put(struct ipv6_txoptions *opt)
> >               kfree_rcu(opt, rcu);
> >  }
> >
> > +extern struct static_key_false ipv6_flowlabel_exclusive;
> >  struct ip6_flowlabel *fl6_sock_lookup(struct sock *sk, __be32 label);
> > +static inline struct ip6_flowlabel *fl6_sock_verify(struct sock *sk,
> > +                                                 __be32 label)
> > +{
> > +     if (static_branch_unlikely(&ipv6_flowlabel_exclusive))
> > +             return fl6_sock_lookup(sk, label) ? : ERR_PTR(-ENOENT);
> > +
> > +     return NULL;
> > +}
> > +
> >  struct ipv6_txoptions *fl6_merge_options(struct ipv6_txoptions *opt_space,
> >                                        struct ip6_flowlabel *fl,
> >                                        struct ipv6_txoptions *fopt);
> > diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c
> > index be5f3d7ceb966..d5f4233b04e0c 100644
> > --- a/net/ipv6/ip6_flowlabel.c
> > +++ b/net/ipv6/ip6_flowlabel.c
> > @@ -57,6 +57,8 @@ static DEFINE_SPINLOCK(ip6_fl_lock);
> >
> >  static DEFINE_SPINLOCK(ip6_sk_fl_lock);
> >
> > +DEFINE_STATIC_KEY_FALSE(ipv6_flowlabel_exclusive);
> > +
> >  #define for_each_fl_rcu(hash, fl)                            \
> >       for (fl = rcu_dereference_bh(fl_ht[(hash)]);            \
> >            fl != NULL;                                        \
> > @@ -98,6 +100,8 @@ static void fl_free_rcu(struct rcu_head *head)
> >  {
> >       struct ip6_flowlabel *fl = container_of(head, struct ip6_flowlabel, rcu);
> >
> > +     if (fl->share != IPV6_FL_S_NONE && fl->share != IPV6_FL_S_ANY)
> > +             static_branch_dec(&ipv6_flowlabel_exclusive);
>
> static_branch_dec() can not be invoked from a rcu call back.
>
> >       if (fl->share == IPV6_FL_S_PROCESS)
> >               put_pid(fl->owner.pid);
> >       kfree(fl->opt);
> > @@ -423,6 +427,8 @@ fl_create(struct net *net, struct sock *sk, struct in6_flowlabel_req *freq,
> >       }
> >       fl->dst = freq->flr_dst;
> >       atomic_set(&fl->users, 1);
> > +     if (fl->share != IPV6_FL_S_ANY)
> > +             static_branch_inc(&ipv6_flowlabel_exclusive);
>
>
> Can this be used by unpriv users ?
>
> If yes, then you want to use static_key_false_deferred instead

Ah of course. Yes, any user can exercise this API. Thanks, Eric. I'll
take a look at both points.
diff mbox series

Patch

diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index daf80863d3a50..8881cee572410 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -17,6 +17,7 @@ 
 #include <linux/hardirq.h>
 #include <linux/jhash.h>
 #include <linux/refcount.h>
+#include <linux/jump_label.h>
 #include <net/if_inet6.h>
 #include <net/ndisc.h>
 #include <net/flow.h>
@@ -343,7 +344,17 @@  static inline void txopt_put(struct ipv6_txoptions *opt)
 		kfree_rcu(opt, rcu);
 }
 
+extern struct static_key_false ipv6_flowlabel_exclusive;
 struct ip6_flowlabel *fl6_sock_lookup(struct sock *sk, __be32 label);
+static inline struct ip6_flowlabel *fl6_sock_verify(struct sock *sk,
+						    __be32 label)
+{
+	if (static_branch_unlikely(&ipv6_flowlabel_exclusive))
+		return fl6_sock_lookup(sk, label) ? : ERR_PTR(-ENOENT);
+
+	return NULL;
+}
+
 struct ipv6_txoptions *fl6_merge_options(struct ipv6_txoptions *opt_space,
 					 struct ip6_flowlabel *fl,
 					 struct ipv6_txoptions *fopt);
diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c
index be5f3d7ceb966..d5f4233b04e0c 100644
--- a/net/ipv6/ip6_flowlabel.c
+++ b/net/ipv6/ip6_flowlabel.c
@@ -57,6 +57,8 @@  static DEFINE_SPINLOCK(ip6_fl_lock);
 
 static DEFINE_SPINLOCK(ip6_sk_fl_lock);
 
+DEFINE_STATIC_KEY_FALSE(ipv6_flowlabel_exclusive);
+
 #define for_each_fl_rcu(hash, fl)				\
 	for (fl = rcu_dereference_bh(fl_ht[(hash)]);		\
 	     fl != NULL;					\
@@ -98,6 +100,8 @@  static void fl_free_rcu(struct rcu_head *head)
 {
 	struct ip6_flowlabel *fl = container_of(head, struct ip6_flowlabel, rcu);
 
+	if (fl->share != IPV6_FL_S_NONE && fl->share != IPV6_FL_S_ANY)
+		static_branch_dec(&ipv6_flowlabel_exclusive);
 	if (fl->share == IPV6_FL_S_PROCESS)
 		put_pid(fl->owner.pid);
 	kfree(fl->opt);
@@ -423,6 +427,8 @@  fl_create(struct net *net, struct sock *sk, struct in6_flowlabel_req *freq,
 	}
 	fl->dst = freq->flr_dst;
 	atomic_set(&fl->users, 1);
+	if (fl->share != IPV6_FL_S_ANY)
+		static_branch_inc(&ipv6_flowlabel_exclusive);
 	switch (fl->share) {
 	case IPV6_FL_S_EXCL:
 	case IPV6_FL_S_ANY:
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 07fa579dfb96c..859a1cbd54906 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -1331,8 +1331,8 @@  int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 		if (np->sndflow) {
 			fl6.flowlabel = sin6->sin6_flowinfo&IPV6_FLOWINFO_MASK;
 			if (fl6.flowlabel&IPV6_FLOWLABEL_MASK) {
-				flowlabel = fl6_sock_lookup(sk, fl6.flowlabel);
-				if (!flowlabel)
+				flowlabel = fl6_sock_verify(sk, fl6.flowlabel);
+				if (IS_ERR(flowlabel))
 					return -EINVAL;
 			}
 		}
@@ -1383,8 +1383,8 @@  int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 			return err;
 		}
 		if ((fl6.flowlabel&IPV6_FLOWLABEL_MASK) && !flowlabel) {
-			flowlabel = fl6_sock_lookup(sk, fl6.flowlabel);
-			if (!flowlabel)
+			flowlabel = fl6_sock_verify(sk, fl6.flowlabel);
+			if (IS_ERR(flowlabel))
 				return -EINVAL;
 		}
 		if (!(opt->opt_nflen|opt->opt_flen))