Message ID | 20190517155625.117835-1-willemdebruijn.kernel@gmail.com |
---|---|
State | RFC |
Delegated to: | David Miller |
Headers | show |
Series | [net-next,RFC] ipv6: elide flowlabel check if no exclusive leases exist | expand |
On 5/17/19 8:56 AM, Willem de Bruijn wrote: > From: Willem de Bruijn <willemb@google.com> > > Processes can request ipv6 flowlabels with cmsg IPV6_FLOWINFO. > If not set, by default an autogenerated flowlabel is selected. > > Explicit flowlabels require a control operation per label plus a > datapath check on every connection (every datagram if unconnected). > > This is particularly expensive on unconnected sockets with many > connections, such as QUIC. > > In the common case, where no lease is exclusive, the check can be > safely elided, as both lease request and check trivially succeed. > Indeed, autoflowlabel does the same (even with exclusive leases). > > Elide the check if no process has requested an exclusive lease. > > This is an optimization. Robust applications still have to revert to > requesting leases if the fast path fails due to an exclusive lease. > > This is decidedly an RFC patch: > - need to update all fl6_sock_lookup callers, not just udp > - behavior should be per-netns isolated > > Other approaches considered: > - a single "get all flowlabels, non-exclusive" flowlabel get request > if set, elide fl6_sock_lookup and fail exclusive lease requests > > - sysctls (only useful if on by default, with static_branch) > A) "non-exclusive mode", failing all exclusive lease requests: > processes already have to be robust against lease failure > B) just bypass check in fl6_sock_lookup, like autoflowlabel > > Signed-off-by: Willem de Bruijn <willemb@google.com> > --- > include/net/ipv6.h | 11 +++++++++++ > net/ipv6/ip6_flowlabel.c | 6 ++++++ > net/ipv6/udp.c | 8 ++++---- > 3 files changed, 21 insertions(+), 4 deletions(-) > > diff --git a/include/net/ipv6.h b/include/net/ipv6.h > index daf80863d3a50..8881cee572410 100644 > --- a/include/net/ipv6.h > +++ b/include/net/ipv6.h > @@ -17,6 +17,7 @@ > #include <linux/hardirq.h> > #include <linux/jhash.h> > #include <linux/refcount.h> > +#include <linux/jump_label.h> > #include <net/if_inet6.h> > #include <net/ndisc.h> > #include <net/flow.h> > @@ -343,7 +344,17 @@ static inline void txopt_put(struct ipv6_txoptions *opt) > kfree_rcu(opt, rcu); > } > > +extern struct static_key_false ipv6_flowlabel_exclusive; > struct ip6_flowlabel *fl6_sock_lookup(struct sock *sk, __be32 label); > +static inline struct ip6_flowlabel *fl6_sock_verify(struct sock *sk, > + __be32 label) > +{ > + if (static_branch_unlikely(&ipv6_flowlabel_exclusive)) > + return fl6_sock_lookup(sk, label) ? : ERR_PTR(-ENOENT); > + > + return NULL; > +} > + > struct ipv6_txoptions *fl6_merge_options(struct ipv6_txoptions *opt_space, > struct ip6_flowlabel *fl, > struct ipv6_txoptions *fopt); > diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c > index be5f3d7ceb966..d5f4233b04e0c 100644 > --- a/net/ipv6/ip6_flowlabel.c > +++ b/net/ipv6/ip6_flowlabel.c > @@ -57,6 +57,8 @@ static DEFINE_SPINLOCK(ip6_fl_lock); > > static DEFINE_SPINLOCK(ip6_sk_fl_lock); > > +DEFINE_STATIC_KEY_FALSE(ipv6_flowlabel_exclusive); > + > #define for_each_fl_rcu(hash, fl) \ > for (fl = rcu_dereference_bh(fl_ht[(hash)]); \ > fl != NULL; \ > @@ -98,6 +100,8 @@ static void fl_free_rcu(struct rcu_head *head) > { > struct ip6_flowlabel *fl = container_of(head, struct ip6_flowlabel, rcu); > > + if (fl->share != IPV6_FL_S_NONE && fl->share != IPV6_FL_S_ANY) > + static_branch_dec(&ipv6_flowlabel_exclusive); static_branch_dec() can not be invoked from a rcu call back. > if (fl->share == IPV6_FL_S_PROCESS) > put_pid(fl->owner.pid); > kfree(fl->opt); > @@ -423,6 +427,8 @@ fl_create(struct net *net, struct sock *sk, struct in6_flowlabel_req *freq, > } > fl->dst = freq->flr_dst; > atomic_set(&fl->users, 1); > + if (fl->share != IPV6_FL_S_ANY) > + static_branch_inc(&ipv6_flowlabel_exclusive); Can this be used by unpriv users ? If yes, then you want to use static_key_false_deferred instead
On Fri, May 17, 2019 at 4:32 PM Eric Dumazet <eric.dumazet@gmail.com> wrote: > > > > On 5/17/19 8:56 AM, Willem de Bruijn wrote: > > From: Willem de Bruijn <willemb@google.com> > > > > Processes can request ipv6 flowlabels with cmsg IPV6_FLOWINFO. > > If not set, by default an autogenerated flowlabel is selected. > > > > Explicit flowlabels require a control operation per label plus a > > datapath check on every connection (every datagram if unconnected). > > > > This is particularly expensive on unconnected sockets with many > > connections, such as QUIC. > > > > In the common case, where no lease is exclusive, the check can be > > safely elided, as both lease request and check trivially succeed. > > Indeed, autoflowlabel does the same (even with exclusive leases). > > > > Elide the check if no process has requested an exclusive lease. > > > > This is an optimization. Robust applications still have to revert to > > requesting leases if the fast path fails due to an exclusive lease. > > > > This is decidedly an RFC patch: > > - need to update all fl6_sock_lookup callers, not just udp > > - behavior should be per-netns isolated > > > > Other approaches considered: > > - a single "get all flowlabels, non-exclusive" flowlabel get request > > if set, elide fl6_sock_lookup and fail exclusive lease requests > > > > - sysctls (only useful if on by default, with static_branch) > > A) "non-exclusive mode", failing all exclusive lease requests: > > processes already have to be robust against lease failure > > B) just bypass check in fl6_sock_lookup, like autoflowlabel > > > > Signed-off-by: Willem de Bruijn <willemb@google.com> > > --- > > include/net/ipv6.h | 11 +++++++++++ > > net/ipv6/ip6_flowlabel.c | 6 ++++++ > > net/ipv6/udp.c | 8 ++++---- > > 3 files changed, 21 insertions(+), 4 deletions(-) > > > > diff --git a/include/net/ipv6.h b/include/net/ipv6.h > > index daf80863d3a50..8881cee572410 100644 > > --- a/include/net/ipv6.h > > +++ b/include/net/ipv6.h > > @@ -17,6 +17,7 @@ > > #include <linux/hardirq.h> > > #include <linux/jhash.h> > > #include <linux/refcount.h> > > +#include <linux/jump_label.h> > > #include <net/if_inet6.h> > > #include <net/ndisc.h> > > #include <net/flow.h> > > @@ -343,7 +344,17 @@ static inline void txopt_put(struct ipv6_txoptions *opt) > > kfree_rcu(opt, rcu); > > } > > > > +extern struct static_key_false ipv6_flowlabel_exclusive; > > struct ip6_flowlabel *fl6_sock_lookup(struct sock *sk, __be32 label); > > +static inline struct ip6_flowlabel *fl6_sock_verify(struct sock *sk, > > + __be32 label) > > +{ > > + if (static_branch_unlikely(&ipv6_flowlabel_exclusive)) > > + return fl6_sock_lookup(sk, label) ? : ERR_PTR(-ENOENT); > > + > > + return NULL; > > +} > > + > > struct ipv6_txoptions *fl6_merge_options(struct ipv6_txoptions *opt_space, > > struct ip6_flowlabel *fl, > > struct ipv6_txoptions *fopt); > > diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c > > index be5f3d7ceb966..d5f4233b04e0c 100644 > > --- a/net/ipv6/ip6_flowlabel.c > > +++ b/net/ipv6/ip6_flowlabel.c > > @@ -57,6 +57,8 @@ static DEFINE_SPINLOCK(ip6_fl_lock); > > > > static DEFINE_SPINLOCK(ip6_sk_fl_lock); > > > > +DEFINE_STATIC_KEY_FALSE(ipv6_flowlabel_exclusive); > > + > > #define for_each_fl_rcu(hash, fl) \ > > for (fl = rcu_dereference_bh(fl_ht[(hash)]); \ > > fl != NULL; \ > > @@ -98,6 +100,8 @@ static void fl_free_rcu(struct rcu_head *head) > > { > > struct ip6_flowlabel *fl = container_of(head, struct ip6_flowlabel, rcu); > > > > + if (fl->share != IPV6_FL_S_NONE && fl->share != IPV6_FL_S_ANY) > > + static_branch_dec(&ipv6_flowlabel_exclusive); > > static_branch_dec() can not be invoked from a rcu call back. > > > if (fl->share == IPV6_FL_S_PROCESS) > > put_pid(fl->owner.pid); > > kfree(fl->opt); > > @@ -423,6 +427,8 @@ fl_create(struct net *net, struct sock *sk, struct in6_flowlabel_req *freq, > > } > > fl->dst = freq->flr_dst; > > atomic_set(&fl->users, 1); > > + if (fl->share != IPV6_FL_S_ANY) > > + static_branch_inc(&ipv6_flowlabel_exclusive); > > > Can this be used by unpriv users ? > > If yes, then you want to use static_key_false_deferred instead Ah of course. Yes, any user can exercise this API. Thanks, Eric. I'll take a look at both points.
diff --git a/include/net/ipv6.h b/include/net/ipv6.h index daf80863d3a50..8881cee572410 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -17,6 +17,7 @@ #include <linux/hardirq.h> #include <linux/jhash.h> #include <linux/refcount.h> +#include <linux/jump_label.h> #include <net/if_inet6.h> #include <net/ndisc.h> #include <net/flow.h> @@ -343,7 +344,17 @@ static inline void txopt_put(struct ipv6_txoptions *opt) kfree_rcu(opt, rcu); } +extern struct static_key_false ipv6_flowlabel_exclusive; struct ip6_flowlabel *fl6_sock_lookup(struct sock *sk, __be32 label); +static inline struct ip6_flowlabel *fl6_sock_verify(struct sock *sk, + __be32 label) +{ + if (static_branch_unlikely(&ipv6_flowlabel_exclusive)) + return fl6_sock_lookup(sk, label) ? : ERR_PTR(-ENOENT); + + return NULL; +} + struct ipv6_txoptions *fl6_merge_options(struct ipv6_txoptions *opt_space, struct ip6_flowlabel *fl, struct ipv6_txoptions *fopt); diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c index be5f3d7ceb966..d5f4233b04e0c 100644 --- a/net/ipv6/ip6_flowlabel.c +++ b/net/ipv6/ip6_flowlabel.c @@ -57,6 +57,8 @@ static DEFINE_SPINLOCK(ip6_fl_lock); static DEFINE_SPINLOCK(ip6_sk_fl_lock); +DEFINE_STATIC_KEY_FALSE(ipv6_flowlabel_exclusive); + #define for_each_fl_rcu(hash, fl) \ for (fl = rcu_dereference_bh(fl_ht[(hash)]); \ fl != NULL; \ @@ -98,6 +100,8 @@ static void fl_free_rcu(struct rcu_head *head) { struct ip6_flowlabel *fl = container_of(head, struct ip6_flowlabel, rcu); + if (fl->share != IPV6_FL_S_NONE && fl->share != IPV6_FL_S_ANY) + static_branch_dec(&ipv6_flowlabel_exclusive); if (fl->share == IPV6_FL_S_PROCESS) put_pid(fl->owner.pid); kfree(fl->opt); @@ -423,6 +427,8 @@ fl_create(struct net *net, struct sock *sk, struct in6_flowlabel_req *freq, } fl->dst = freq->flr_dst; atomic_set(&fl->users, 1); + if (fl->share != IPV6_FL_S_ANY) + static_branch_inc(&ipv6_flowlabel_exclusive); switch (fl->share) { case IPV6_FL_S_EXCL: case IPV6_FL_S_ANY: diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 07fa579dfb96c..859a1cbd54906 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1331,8 +1331,8 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (np->sndflow) { fl6.flowlabel = sin6->sin6_flowinfo&IPV6_FLOWINFO_MASK; if (fl6.flowlabel&IPV6_FLOWLABEL_MASK) { - flowlabel = fl6_sock_lookup(sk, fl6.flowlabel); - if (!flowlabel) + flowlabel = fl6_sock_verify(sk, fl6.flowlabel); + if (IS_ERR(flowlabel)) return -EINVAL; } } @@ -1383,8 +1383,8 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) return err; } if ((fl6.flowlabel&IPV6_FLOWLABEL_MASK) && !flowlabel) { - flowlabel = fl6_sock_lookup(sk, fl6.flowlabel); - if (!flowlabel) + flowlabel = fl6_sock_verify(sk, fl6.flowlabel); + if (IS_ERR(flowlabel)) return -EINVAL; } if (!(opt->opt_nflen|opt->opt_flen))