Message ID | 20190917173949.19982-1-dsahern@kernel.org |
---|---|
State | Accepted |
Delegated to: | David Miller |
Headers | show |
Series | [net] ipv4: Revert removal of rt_uses_gateway | expand |
Hello, On Tue, 17 Sep 2019, David Ahern wrote: > From: David Ahern <dsahern@gmail.com> > > Julian noted that rt_uses_gateway has a more subtle use than 'is gateway > set': > https://lore.kernel.org/netdev/alpine.LFD.2.21.1909151104060.2546@ja.home.ssi.bg/ > > Revert that part of the commit referenced in the Fixes tag. > > Currently, there are no u8 holes in 'struct rtable'. There is a 4-byte hole > in the second cacheline which contains the gateway declaration. So move > rt_gw_family down to the gateway declarations since they are always used > together, and then re-use that u8 for rt_uses_gateway. End result is that > rtable size is unchanged. > > Fixes: 1550c171935d ("ipv4: Prepare rtable for IPv6 gateway") > Reported-by: Julian Anastasov <ja@ssi.bg> > Signed-off-by: David Ahern <dsahern@gmail.com> Looks good to me, thanks! Reviewed-by: Julian Anastasov <ja@ssi.bg> > --- > drivers/infiniband/core/addr.c | 2 +- > include/net/route.h | 3 ++- > net/ipv4/inet_connection_sock.c | 4 ++-- > net/ipv4/ip_forward.c | 2 +- > net/ipv4/ip_output.c | 2 +- > net/ipv4/route.c | 36 +++++++++++++++++++++--------------- > net/ipv4/xfrm4_policy.c | 1 + > 7 files changed, 29 insertions(+), 21 deletions(-) > > diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c > index 9b76a8fcdd24..bf539c34ccd3 100644 > --- a/drivers/infiniband/core/addr.c > +++ b/drivers/infiniband/core/addr.c > @@ -352,7 +352,7 @@ static bool has_gateway(const struct dst_entry *dst, sa_family_t family) > > if (family == AF_INET) { > rt = container_of(dst, struct rtable, dst); > - return rt->rt_gw_family == AF_INET; > + return rt->rt_uses_gateway; > } > > rt6 = container_of(dst, struct rt6_info, dst); > diff --git a/include/net/route.h b/include/net/route.h > index dfce19c9fa96..6c516840380d 100644 > --- a/include/net/route.h > +++ b/include/net/route.h > @@ -53,10 +53,11 @@ struct rtable { > unsigned int rt_flags; > __u16 rt_type; > __u8 rt_is_input; > - u8 rt_gw_family; > + __u8 rt_uses_gateway; > > int rt_iif; > > + u8 rt_gw_family; > /* Info on neighbour */ > union { > __be32 rt_gw4; > diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c > index f5c163d4771b..a9183543ca30 100644 > --- a/net/ipv4/inet_connection_sock.c > +++ b/net/ipv4/inet_connection_sock.c > @@ -560,7 +560,7 @@ struct dst_entry *inet_csk_route_req(const struct sock *sk, > rt = ip_route_output_flow(net, fl4, sk); > if (IS_ERR(rt)) > goto no_route; > - if (opt && opt->opt.is_strictroute && rt->rt_gw_family) > + if (opt && opt->opt.is_strictroute && rt->rt_uses_gateway) > goto route_err; > rcu_read_unlock(); > return &rt->dst; > @@ -598,7 +598,7 @@ struct dst_entry *inet_csk_route_child_sock(const struct sock *sk, > rt = ip_route_output_flow(net, fl4, sk); > if (IS_ERR(rt)) > goto no_route; > - if (opt && opt->opt.is_strictroute && rt->rt_gw_family) > + if (opt && opt->opt.is_strictroute && rt->rt_uses_gateway) > goto route_err; > return &rt->dst; > > diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c > index 06f6f280b9ff..00ec819f949b 100644 > --- a/net/ipv4/ip_forward.c > +++ b/net/ipv4/ip_forward.c > @@ -123,7 +123,7 @@ int ip_forward(struct sk_buff *skb) > > rt = skb_rtable(skb); > > - if (opt->is_strictroute && rt->rt_gw_family) > + if (opt->is_strictroute && rt->rt_uses_gateway) > goto sr_failed; > > IPCB(skb)->flags |= IPSKB_FORWARDED; > diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c > index cc7ef0d05bbd..da521790cd63 100644 > --- a/net/ipv4/ip_output.c > +++ b/net/ipv4/ip_output.c > @@ -499,7 +499,7 @@ int __ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl, > skb_dst_set_noref(skb, &rt->dst); > > packet_routed: > - if (inet_opt && inet_opt->opt.is_strictroute && rt->rt_gw_family) > + if (inet_opt && inet_opt->opt.is_strictroute && rt->rt_uses_gateway) > goto no_route; > > /* OK, we know where to send it, allocate and build IP header. */ > diff --git a/net/ipv4/route.c b/net/ipv4/route.c > index b6a6f18c3dd1..7dcce724c78b 100644 > --- a/net/ipv4/route.c > +++ b/net/ipv4/route.c > @@ -635,6 +635,7 @@ static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnh > > if (fnhe->fnhe_gw) { > rt->rt_flags |= RTCF_REDIRECTED; > + rt->rt_uses_gateway = 1; > rt->rt_gw_family = AF_INET; > rt->rt_gw4 = fnhe->fnhe_gw; > } > @@ -1313,7 +1314,7 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst) > mtu = READ_ONCE(dst->dev->mtu); > > if (unlikely(ip_mtu_locked(dst))) { > - if (rt->rt_gw_family && mtu > 576) > + if (rt->rt_uses_gateway && mtu > 576) > mtu = 576; > } > > @@ -1569,6 +1570,7 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr, > struct fib_nh_common *nhc = FIB_RES_NHC(*res); > > if (nhc->nhc_gw_family && nhc->nhc_scope == RT_SCOPE_LINK) { > + rt->rt_uses_gateway = 1; > rt->rt_gw_family = nhc->nhc_gw_family; > /* only INET and INET6 are supported */ > if (likely(nhc->nhc_gw_family == AF_INET)) > @@ -1634,6 +1636,7 @@ struct rtable *rt_dst_alloc(struct net_device *dev, > rt->rt_iif = 0; > rt->rt_pmtu = 0; > rt->rt_mtu_locked = 0; > + rt->rt_uses_gateway = 0; > rt->rt_gw_family = 0; > rt->rt_gw4 = 0; > INIT_LIST_HEAD(&rt->rt_uncached); > @@ -2694,6 +2697,7 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or > rt->rt_genid = rt_genid_ipv4(net); > rt->rt_flags = ort->rt_flags; > rt->rt_type = ort->rt_type; > + rt->rt_uses_gateway = ort->rt_uses_gateway; > rt->rt_gw_family = ort->rt_gw_family; > if (rt->rt_gw_family == AF_INET) > rt->rt_gw4 = ort->rt_gw4; > @@ -2778,21 +2782,23 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, > if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr)) > goto nla_put_failure; > } > - if (rt->rt_gw_family == AF_INET && > - nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gw4)) { > - goto nla_put_failure; > - } else if (rt->rt_gw_family == AF_INET6) { > - int alen = sizeof(struct in6_addr); > - struct nlattr *nla; > - struct rtvia *via; > - > - nla = nla_reserve(skb, RTA_VIA, alen + 2); > - if (!nla) > + if (rt->rt_uses_gateway) { > + if (rt->rt_gw_family == AF_INET && > + nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gw4)) { > goto nla_put_failure; > - > - via = nla_data(nla); > - via->rtvia_family = AF_INET6; > - memcpy(via->rtvia_addr, &rt->rt_gw6, alen); > + } else if (rt->rt_gw_family == AF_INET6) { > + int alen = sizeof(struct in6_addr); > + struct nlattr *nla; > + struct rtvia *via; > + > + nla = nla_reserve(skb, RTA_VIA, alen + 2); > + if (!nla) > + goto nla_put_failure; > + > + via = nla_data(nla); > + via->rtvia_family = AF_INET6; > + memcpy(via->rtvia_addr, &rt->rt_gw6, alen); > + } > } > > expires = rt->dst.expires; > diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c > index cdef8f9a3b01..35b84b52b702 100644 > --- a/net/ipv4/xfrm4_policy.c > +++ b/net/ipv4/xfrm4_policy.c > @@ -85,6 +85,7 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, > xdst->u.rt.rt_flags = rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST | > RTCF_LOCAL); > xdst->u.rt.rt_type = rt->rt_type; > + xdst->u.rt.rt_uses_gateway = rt->rt_uses_gateway; > xdst->u.rt.rt_gw_family = rt->rt_gw_family; > if (rt->rt_gw_family == AF_INET) > xdst->u.rt.rt_gw4 = rt->rt_gw4; > -- > 2.11.0 Regards -- Julian Anastasov <ja@ssi.bg>
On 9/17/19 12:50 PM, Julian Anastasov wrote: > > Looks good to me, thanks! > > Reviewed-by: Julian Anastasov <ja@ssi.bg> > BTW, do you have any tests for the rt_uses_gateway paths - showing why it is needed? All of the pmtu, redirect, fib tests, etc worked fine without the special flag. Sure, the 'ip ro get' had extra data; it seems like that could be handled.
Hello, On Tue, 17 Sep 2019, David Ahern wrote: > On 9/17/19 12:50 PM, Julian Anastasov wrote: > > > > Looks good to me, thanks! > > > > Reviewed-by: Julian Anastasov <ja@ssi.bg> > > > > BTW, do you have any tests for the rt_uses_gateway paths - showing why > it is needed? No special tests. > All of the pmtu, redirect, fib tests, etc worked fine without the > special flag. Sure, the 'ip ro get' had extra data; it seems like that > could be handled. I'll explain. In the period before the route cache was removed in 3.6, there were two fields: rt_dst and rt_gateway. For targets on LAN both contained the target. For targets via gateway (nh_gw), rt_dst still stores the target but rt_gateway stored the nh_gw. In short, rt_gateway always contained the next hop IP for neigh resolving. In 3.6, rt_dst was removed and only rt_gateway remained to store nh_gw. As fnhe_rth/nh_pcpu_rth_output were used to cache the output route, rt_gateway can not contain any IP because the route can be used for any target on the LAN. This is true even now, rt_gateway is 0 for cached routes that are not DST_HOST, i.e. not for single target IP. Why this matters? There are users such as IPVS, TEE, raw.c (IP_HDRINCL) that use FLOWI_FLAG_KNOWN_NH to request route with rt_gateway != 0 to be returned, i.e. they want rt_gateway to store a next hop IP for neigh resolving but to put different IP in iph->daddr. This is honoured by rt_nexthop(), it prefers rt_gateway before the iph->daddr. With this FLOWI flag the routing will avoid returning routes with rt_gateway = 0 (cached in NH), instead it will allocate DST_HOST route which can safely hold IP in rt_gateway. So, in 3.7 commit 155e8336c373 ("ipv4: introduce rt_uses_gateway") was created to restore the knowledge that rt_dst != rt_gateway means route via GW and also commit c92b96553a80 ("ipv4: Add FLOWI_FLAG_KNOWN_NH") to make sure packets are routed by requested next hop and not by iph->daddr. You see the places that need to know if rt_gateway contains nh_gw (via GW) or just a requested next hop (when nh_gw is 0). It matters for cases where strict source routes should be on connected network. In simple tests things are working without rt_uses_gateway flag because it is a corner case to see above cases combined with strict source routing or MTU locking. Not sure if we can use some trick to support it differently without such flag. Regards -- Julian Anastasov <ja@ssi.bg>
On Tue, 17 Sep 2019 10:39:49 -0700, David Ahern wrote: > From: David Ahern <dsahern@gmail.com> > > Julian noted that rt_uses_gateway has a more subtle use than 'is gateway > set': > https://lore.kernel.org/netdev/alpine.LFD.2.21.1909151104060.2546@ja.home.ssi.bg/ > > Revert that part of the commit referenced in the Fixes tag. > > Currently, there are no u8 holes in 'struct rtable'. There is a 4-byte hole > in the second cacheline which contains the gateway declaration. So move > rt_gw_family down to the gateway declarations since they are always used > together, and then re-use that u8 for rt_uses_gateway. End result is that > rtable size is unchanged. > > Fixes: 1550c171935d ("ipv4: Prepare rtable for IPv6 gateway") > Reported-by: Julian Anastasov <ja@ssi.bg> > Signed-off-by: David Ahern <dsahern@gmail.com> I'm assuming the mix of u8 and __u8 is intentional, since this is a partial revert :) Applied, queued, thanks!
On 9/20/19 7:30 PM, Jakub Kicinski wrote:
> I'm assuming the mix of u8 and __u8 is intentional, since this is a partial revert :)
original patch for rt_uses_gateway used __u8 so yes put that back.
not sure why I used u8 for the new field, but left it on the move.
diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index 9b76a8fcdd24..bf539c34ccd3 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -352,7 +352,7 @@ static bool has_gateway(const struct dst_entry *dst, sa_family_t family) if (family == AF_INET) { rt = container_of(dst, struct rtable, dst); - return rt->rt_gw_family == AF_INET; + return rt->rt_uses_gateway; } rt6 = container_of(dst, struct rt6_info, dst); diff --git a/include/net/route.h b/include/net/route.h index dfce19c9fa96..6c516840380d 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -53,10 +53,11 @@ struct rtable { unsigned int rt_flags; __u16 rt_type; __u8 rt_is_input; - u8 rt_gw_family; + __u8 rt_uses_gateway; int rt_iif; + u8 rt_gw_family; /* Info on neighbour */ union { __be32 rt_gw4; diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index f5c163d4771b..a9183543ca30 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -560,7 +560,7 @@ struct dst_entry *inet_csk_route_req(const struct sock *sk, rt = ip_route_output_flow(net, fl4, sk); if (IS_ERR(rt)) goto no_route; - if (opt && opt->opt.is_strictroute && rt->rt_gw_family) + if (opt && opt->opt.is_strictroute && rt->rt_uses_gateway) goto route_err; rcu_read_unlock(); return &rt->dst; @@ -598,7 +598,7 @@ struct dst_entry *inet_csk_route_child_sock(const struct sock *sk, rt = ip_route_output_flow(net, fl4, sk); if (IS_ERR(rt)) goto no_route; - if (opt && opt->opt.is_strictroute && rt->rt_gw_family) + if (opt && opt->opt.is_strictroute && rt->rt_uses_gateway) goto route_err; return &rt->dst; diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index 06f6f280b9ff..00ec819f949b 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c @@ -123,7 +123,7 @@ int ip_forward(struct sk_buff *skb) rt = skb_rtable(skb); - if (opt->is_strictroute && rt->rt_gw_family) + if (opt->is_strictroute && rt->rt_uses_gateway) goto sr_failed; IPCB(skb)->flags |= IPSKB_FORWARDED; diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index cc7ef0d05bbd..da521790cd63 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -499,7 +499,7 @@ int __ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl, skb_dst_set_noref(skb, &rt->dst); packet_routed: - if (inet_opt && inet_opt->opt.is_strictroute && rt->rt_gw_family) + if (inet_opt && inet_opt->opt.is_strictroute && rt->rt_uses_gateway) goto no_route; /* OK, we know where to send it, allocate and build IP header. */ diff --git a/net/ipv4/route.c b/net/ipv4/route.c index b6a6f18c3dd1..7dcce724c78b 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -635,6 +635,7 @@ static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnh if (fnhe->fnhe_gw) { rt->rt_flags |= RTCF_REDIRECTED; + rt->rt_uses_gateway = 1; rt->rt_gw_family = AF_INET; rt->rt_gw4 = fnhe->fnhe_gw; } @@ -1313,7 +1314,7 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst) mtu = READ_ONCE(dst->dev->mtu); if (unlikely(ip_mtu_locked(dst))) { - if (rt->rt_gw_family && mtu > 576) + if (rt->rt_uses_gateway && mtu > 576) mtu = 576; } @@ -1569,6 +1570,7 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr, struct fib_nh_common *nhc = FIB_RES_NHC(*res); if (nhc->nhc_gw_family && nhc->nhc_scope == RT_SCOPE_LINK) { + rt->rt_uses_gateway = 1; rt->rt_gw_family = nhc->nhc_gw_family; /* only INET and INET6 are supported */ if (likely(nhc->nhc_gw_family == AF_INET)) @@ -1634,6 +1636,7 @@ struct rtable *rt_dst_alloc(struct net_device *dev, rt->rt_iif = 0; rt->rt_pmtu = 0; rt->rt_mtu_locked = 0; + rt->rt_uses_gateway = 0; rt->rt_gw_family = 0; rt->rt_gw4 = 0; INIT_LIST_HEAD(&rt->rt_uncached); @@ -2694,6 +2697,7 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or rt->rt_genid = rt_genid_ipv4(net); rt->rt_flags = ort->rt_flags; rt->rt_type = ort->rt_type; + rt->rt_uses_gateway = ort->rt_uses_gateway; rt->rt_gw_family = ort->rt_gw_family; if (rt->rt_gw_family == AF_INET) rt->rt_gw4 = ort->rt_gw4; @@ -2778,21 +2782,23 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr)) goto nla_put_failure; } - if (rt->rt_gw_family == AF_INET && - nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gw4)) { - goto nla_put_failure; - } else if (rt->rt_gw_family == AF_INET6) { - int alen = sizeof(struct in6_addr); - struct nlattr *nla; - struct rtvia *via; - - nla = nla_reserve(skb, RTA_VIA, alen + 2); - if (!nla) + if (rt->rt_uses_gateway) { + if (rt->rt_gw_family == AF_INET && + nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gw4)) { goto nla_put_failure; - - via = nla_data(nla); - via->rtvia_family = AF_INET6; - memcpy(via->rtvia_addr, &rt->rt_gw6, alen); + } else if (rt->rt_gw_family == AF_INET6) { + int alen = sizeof(struct in6_addr); + struct nlattr *nla; + struct rtvia *via; + + nla = nla_reserve(skb, RTA_VIA, alen + 2); + if (!nla) + goto nla_put_failure; + + via = nla_data(nla); + via->rtvia_family = AF_INET6; + memcpy(via->rtvia_addr, &rt->rt_gw6, alen); + } } expires = rt->dst.expires; diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index cdef8f9a3b01..35b84b52b702 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -85,6 +85,7 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, xdst->u.rt.rt_flags = rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST | RTCF_LOCAL); xdst->u.rt.rt_type = rt->rt_type; + xdst->u.rt.rt_uses_gateway = rt->rt_uses_gateway; xdst->u.rt.rt_gw_family = rt->rt_gw_family; if (rt->rt_gw_family == AF_INET) xdst->u.rt.rt_gw4 = rt->rt_gw4;