diff mbox series

[RFC,net-next,03/19] ipv6: Clear nexthop flags upon netdev up

Message ID 20171231161513.25785-4-idosch@mellanox.com
State RFC, archived
Delegated to: David Miller
Headers show
Series [RFC,net-next,01/19] ipv6: Remove redundant route flushing during namespace dismantle | expand

Commit Message

Ido Schimmel Dec. 31, 2017, 4:14 p.m. UTC
Previous patch marked nexthops with the 'dead' and 'linkdown' flags.
Clear these flags when the netdev comes back up.

Signed-off-by: Ido Schimmel <idosch@mellanox.com>
---
 include/net/ip6_route.h |  1 +
 net/ipv6/addrconf.c     |  3 +++
 net/ipv6/route.c        | 29 +++++++++++++++++++++++++++++
 3 files changed, 33 insertions(+)

Comments

David Ahern Jan. 2, 2018, 4:20 p.m. UTC | #1
On 12/31/17 9:14 AM, Ido Schimmel wrote:
> Previous patch marked nexthops with the 'dead' and 'linkdown' flags.
> Clear these flags when the netdev comes back up.
> 
> Signed-off-by: Ido Schimmel <idosch@mellanox.com>
> ---
>  include/net/ip6_route.h |  1 +
>  net/ipv6/addrconf.c     |  3 +++
>  net/ipv6/route.c        | 29 +++++++++++++++++++++++++++++
>  3 files changed, 33 insertions(+)
> 
> diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
> index 18e442ea93d8..caad39198c2a 100644
> --- a/include/net/ip6_route.h
> +++ b/include/net/ip6_route.h
> @@ -169,6 +169,7 @@ void rt6_ifdown(struct net *net, struct net_device *dev);
>  void rt6_mtu_change(struct net_device *dev, unsigned int mtu);
>  void rt6_remove_prefsrc(struct inet6_ifaddr *ifp);
>  void rt6_clean_tohost(struct net *net, struct in6_addr *gateway);
> +void rt6_sync_up(struct net_device *dev, unsigned int nh_flags);
>  
>  static inline const struct rt6_info *skb_rt6_info(const struct sk_buff *skb)
>  {
> diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
> index ed06b1190f05..b6405568ed7b 100644
> --- a/net/ipv6/addrconf.c
> +++ b/net/ipv6/addrconf.c
> @@ -3484,6 +3484,9 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event,
>  			if (run_pending)
>  				addrconf_dad_run(idev);
>  
> +			/* Device has an address by now */
> +			rt6_sync_up(dev, RTNH_F_DEAD);
> +

Seems like this should be in the NETDEV_UP section, say after
addrconf_permanent_addr.
Ido Schimmel Jan. 3, 2018, 7:44 a.m. UTC | #2
Hi David,

On Tue, Jan 02, 2018 at 09:20:47AM -0700, David Ahern wrote:
> On 12/31/17 9:14 AM, Ido Schimmel wrote:
> > Previous patch marked nexthops with the 'dead' and 'linkdown' flags.
> > Clear these flags when the netdev comes back up.
> > 
> > Signed-off-by: Ido Schimmel <idosch@mellanox.com>
> > ---
> >  include/net/ip6_route.h |  1 +
> >  net/ipv6/addrconf.c     |  3 +++
> >  net/ipv6/route.c        | 29 +++++++++++++++++++++++++++++
> >  3 files changed, 33 insertions(+)
> > 
> > diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
> > index 18e442ea93d8..caad39198c2a 100644
> > --- a/include/net/ip6_route.h
> > +++ b/include/net/ip6_route.h
> > @@ -169,6 +169,7 @@ void rt6_ifdown(struct net *net, struct net_device *dev);
> >  void rt6_mtu_change(struct net_device *dev, unsigned int mtu);
> >  void rt6_remove_prefsrc(struct inet6_ifaddr *ifp);
> >  void rt6_clean_tohost(struct net *net, struct in6_addr *gateway);
> > +void rt6_sync_up(struct net_device *dev, unsigned int nh_flags);
> >  
> >  static inline const struct rt6_info *skb_rt6_info(const struct sk_buff *skb)
> >  {
> > diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
> > index ed06b1190f05..b6405568ed7b 100644
> > --- a/net/ipv6/addrconf.c
> > +++ b/net/ipv6/addrconf.c
> > @@ -3484,6 +3484,9 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event,
> >  			if (run_pending)
> >  				addrconf_dad_run(idev);
> >  
> > +			/* Device has an address by now */
> > +			rt6_sync_up(dev, RTNH_F_DEAD);
> > +
> 
> Seems like this should be in the NETDEV_UP section, say after
> addrconf_permanent_addr.

Unless the `keep_addr_on_down` sysctl is set, then at this stage the
netdev doesn't have an IP address and we shouldn't clear the dead flag
just yet.

This is consistent with IPv4 that clears the dead flag from nexthops in
a multipath route only if the nexthop device has an IP address. When the
last IPv4 address is removed from a netdev all the routes using it are
flushed and there's nothing to clear upon NETDEV_UP.

Assuming you're OK with that, I can reword the commit message to make it
clearer.
David Ahern Jan. 3, 2018, 3:32 p.m. UTC | #3
On 1/3/18 12:44 AM, Ido Schimmel wrote:
>>> diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
>>> index ed06b1190f05..b6405568ed7b 100644
>>> --- a/net/ipv6/addrconf.c
>>> +++ b/net/ipv6/addrconf.c
>>> @@ -3484,6 +3484,9 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event,
>>>  			if (run_pending)
>>>  				addrconf_dad_run(idev);
>>>  
>>> +			/* Device has an address by now */
>>> +			rt6_sync_up(dev, RTNH_F_DEAD);
>>> +
>>
>> Seems like this should be in the NETDEV_UP section, say after
>> addrconf_permanent_addr.
> 
> Unless the `keep_addr_on_down` sysctl is set, then at this stage the
> netdev doesn't have an IP address and we shouldn't clear the dead flag
> just yet.
> 
> This is consistent with IPv4 that clears the dead flag from nexthops in
> a multipath route only if the nexthop device has an IP address. When the
> last IPv4 address is removed from a netdev all the routes using it are
> flushed and there's nothing to clear upon NETDEV_UP.

I have a bug about that IPv4 handling from the FRR team:

$ ip link add dummy1 type dummy
$ ip li set dummy1 up
$ ip route add 1.1.1.0/24 dev dummy1

$ ip addr add dev dummy1 2.2.2.1/24
$ ip ro ls | grep dummy1
1.1.1.0/24 dev dummy1 scope link
2.2.2.0/24 dev dummy1 proto kernel scope link src 2.2.2.1

$ ip addr del dev dummy1 2.2.2.1/24
$ ip ro ls | grep dummy1
<no output>

The 1.1.1.0/24 route was removed as well as the 2.2.2.0 connected route.
Ido Schimmel Jan. 3, 2018, 4:43 p.m. UTC | #4
On Wed, Jan 03, 2018 at 08:32:51AM -0700, David Ahern wrote:
> On 1/3/18 12:44 AM, Ido Schimmel wrote:
> >>> diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
> >>> index ed06b1190f05..b6405568ed7b 100644
> >>> --- a/net/ipv6/addrconf.c
> >>> +++ b/net/ipv6/addrconf.c
> >>> @@ -3484,6 +3484,9 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event,
> >>>  			if (run_pending)
> >>>  				addrconf_dad_run(idev);
> >>>  
> >>> +			/* Device has an address by now */
> >>> +			rt6_sync_up(dev, RTNH_F_DEAD);
> >>> +
> >>
> >> Seems like this should be in the NETDEV_UP section, say after
> >> addrconf_permanent_addr.
> > 
> > Unless the `keep_addr_on_down` sysctl is set, then at this stage the
> > netdev doesn't have an IP address and we shouldn't clear the dead flag
> > just yet.
> > 
> > This is consistent with IPv4 that clears the dead flag from nexthops in
> > a multipath route only if the nexthop device has an IP address. When the
> > last IPv4 address is removed from a netdev all the routes using it are
> > flushed and there's nothing to clear upon NETDEV_UP.
> 
> I have a bug about that IPv4 handling from the FRR team:
> 
> $ ip link add dummy1 type dummy
> $ ip li set dummy1 up
> $ ip route add 1.1.1.0/24 dev dummy1
> 
> $ ip addr add dev dummy1 2.2.2.1/24
> $ ip ro ls | grep dummy1
> 1.1.1.0/24 dev dummy1 scope link
> 2.2.2.0/24 dev dummy1 proto kernel scope link src 2.2.2.1
> 
> $ ip addr del dev dummy1 2.2.2.1/24
> $ ip ro ls | grep dummy1
> <no outpu>
> 
> The 1.1.1.0/24 route was removed as well the 2.2.2.0 connected route.

If you're going to skip the flushing in this case, at least mark the
nexthops as dead.

And this is my second reason to have rt6_sync_up() where I put it. I'm
preparing another set which sends FIB_EVENT_NH_ADD events from
rt6_sync_up() similar to what we have in fib_sync_up(). When mlxsw (others
in the future) processes the event it needs to add the nexthop back to
the forwarding plane. To do that, it needs to have a RIF for the
nexthop device. For the nexthop device to have a RIF, it needs at least
one IP address configured on the netdev.

Agree / disagree?
David Ahern Jan. 3, 2018, 4:56 p.m. UTC | #5
On 1/3/18 9:43 AM, Ido Schimmel wrote:
> On Wed, Jan 03, 2018 at 08:32:51AM -0700, David Ahern wrote:
>> On 1/3/18 12:44 AM, Ido Schimmel wrote:
>>>>> diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
>>>>> index ed06b1190f05..b6405568ed7b 100644
>>>>> --- a/net/ipv6/addrconf.c
>>>>> +++ b/net/ipv6/addrconf.c
>>>>> @@ -3484,6 +3484,9 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event,
>>>>>  			if (run_pending)
>>>>>  				addrconf_dad_run(idev);
>>>>>  
>>>>> +			/* Device has an address by now */
>>>>> +			rt6_sync_up(dev, RTNH_F_DEAD);
>>>>> +
>>>>
>>>> Seems like this should be in the NETDEV_UP section, say after
>>>> addrconf_permanent_addr.
>>>
>>> Unless the `keep_addr_on_down` sysctl is set, then at this stage the
>>> netdev doesn't have an IP address and we shouldn't clear the dead flag
>>> just yet.
>>>
>>> This is consistent with IPv4 that clears the dead flag from nexthops in
>>> a multipath route only if the nexthop device has an IP address. When the
>>> last IPv4 address is removed from a netdev all the routes using it are
>>> flushed and there's nothing to clear upon NETDEV_UP.
>>
>> I have a bug about that IPv4 handling from the FRR team:
>>
>> $ ip link add dummy1 type dummy
>> $ ip li set dummy1 up
>> $ ip route add 1.1.1.0/24 dev dummy1
>>
>> $ ip addr add dev dummy1 2.2.2.1/24
>> $ ip ro ls | grep dummy1
>> 1.1.1.0/24 dev dummy1 scope link
>> 2.2.2.0/24 dev dummy1 proto kernel scope link src 2.2.2.1
>>
>> $ ip addr del dev dummy1 2.2.2.1/24
>> $ ip ro ls | grep dummy1
>> <no outpu>
>>
>> The 1.1.1.0/24 route was removed as well the 2.2.2.0 connected route.
> 
> If you're going to skip the flushing in this case, at least mark the
> nexthops as dead.

On a down event, yes. If the device is still up then a route such as:
$ ip route add 1.1.1.0/24 dev dummy1
should still be usable even without an address on it.

> 
> And this is my second reason to have rt6_sync_up() where I put it. I'm
> preparing another set which sends FIB_EVENT_NH_ADD events from
> rt6_sync_up() similar to what we've in fib_sync_up(). When mlxsw (others

On a tangent here, but I have been meaning to ask why you have
FIB_EVENT_NH_ADD events as opposed to handling netdev events. What does
a FIB_EVENT_NH_ADD provide that you can't do from a netdev event handler?


> in the future) processes the event it needs to add the nexthop back to
> the forwarding plane. To do that, it needs to have a RIF for the
> nexthop device. For the nexthop device to have a RIF, it needs at least
> one IP address configured on the netdev.

Why is that?
$ ip addr sh dev swp1s0.51
44: swp1s0.51@swp1s0: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc
noqueue master vrf1101 state UP group default qlen 1000
    link/ether 7c:fe:90:e8:3a:7d brd ff:ff:ff:ff:ff:ff

$ ip ro add vrf vrf1101 1.1.1.0/24 dev swp1s0.51

$ ip ro ls vrf vrf1101
unreachable default metric 8192
1.1.1.0/24 dev swp1s0.51 scope link offload

In this case, I take it mlxsw allocates a rif because of the vlan. The
above does not work on just swp1s0 -- ie., that route is not offloaded:

$ # ip ro ls
...
1.1.1.0/24 dev swp1s0 scope link
...

Interesting.
Ido Schimmel Jan. 3, 2018, 5:40 p.m. UTC | #6
On Wed, Jan 03, 2018 at 09:56:02AM -0700, David Ahern wrote:
> On 1/3/18 9:43 AM, Ido Schimmel wrote:
> > On Wed, Jan 03, 2018 at 08:32:51AM -0700, David Ahern wrote:
> >> On 1/3/18 12:44 AM, Ido Schimmel wrote:
> >>>>> diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
> >>>>> index ed06b1190f05..b6405568ed7b 100644
> >>>>> --- a/net/ipv6/addrconf.c
> >>>>> +++ b/net/ipv6/addrconf.c
> >>>>> @@ -3484,6 +3484,9 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event,
> >>>>>  			if (run_pending)
> >>>>>  				addrconf_dad_run(idev);
> >>>>>  
> >>>>> +			/* Device has an address by now */
> >>>>> +			rt6_sync_up(dev, RTNH_F_DEAD);
> >>>>> +
> >>>>
> >>>> Seems like this should be in the NETDEV_UP section, say after
> >>>> addrconf_permanent_addr.
> >>>
> >>> Unless the `keep_addr_on_down` sysctl is set, then at this stage the
> >>> netdev doesn't have an IP address and we shouldn't clear the dead flag
> >>> just yet.
> >>>
> >>> This is consistent with IPv4 that clears the dead flag from nexthops in
> >>> a multipath route only if the nexthop device has an IP address. When the
> >>> last IPv4 address is removed from a netdev all the routes using it are
> >>> flushed and there's nothing to clear upon NETDEV_UP.
> >>
> >> I have a bug about that IPv4 handling from the FRR team:
> >>
> >> $ ip link add dummy1 type dummy
> >> $ ip li set dummy1 up
> >> $ ip route add 1.1.1.0/24 dev dummy1
> >>
> >> $ ip addr add dev dummy1 2.2.2.1/24
> >> $ ip ro ls | grep dummy1
> >> 1.1.1.0/24 dev dummy1 scope link
> >> 2.2.2.0/24 dev dummy1 proto kernel scope link src 2.2.2.1
> >>
> >> $ ip addr del dev dummy1 2.2.2.1/24
> >> $ ip ro ls | grep dummy1
> >> <no outpu>
> >>
> >> The 1.1.1.0/24 route was removed as well the 2.2.2.0 connected route.
> > 
> > If you're going to skip the flushing in this case, at least mark the
> > nexthops as dead.
> 
> On a down event, yes. If the device is still up then a route such as:
> $ ip route add 1.1.1.0/24 dev dummy1
> should still be usable even without an address on it.

mlxsw will trap all the packets hitting the route until you assign an IP
address to dummy1.

> > And this is my second reason to have rt6_sync_up() where I put it. I'm
> > preparing another set which sends FIB_EVENT_NH_ADD events from
> > rt6_sync_up() similar to what we've in fib_sync_up(). When mlxsw (others
> 
> On a tangent here, but I have been meaning to ask why you have
> FIB_EVENT_NH_ADD events as opposed to handling netdev events. What does
> a FIB_EVENT_NH_ADD provide that you can't do from a netdev event handler?

It'll make switch drivers more complex than they already are. Why should
every driver need to duplicate the logic in call_fib_nh_notifiers()?

> > in the future) processes the event it needs to add the nexthop back to
> > the forwarding plane. To do that, it needs to have a RIF for the
> > nexthop device. For the nexthop device to have a RIF, it needs at least
> > one IP address configured on the netdev.
> 
> Why is that?
> $ ip addr sh dev swp1s0.51
> 44: swp1s0.51@swp1s0: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc
> noqueue master vrf1101 state UP group default qlen 1000
>     link/ether 7c:fe:90:e8:3a:7d brd ff:ff:ff:ff:ff:ff
> 
> $ ip ro add vrf vrf1101 1.1.1.0/24 dev swp1s0.51
> 
> $ ip ro ls vrf vrf1101
> unreachable default metric 8192
> 1.1.1.0/24 dev swp1s0.51 scope link offload
> 
> In this case, I take it mlxsw allocates a rif because of the vlan. The
> above does not work on just swp1s0 -- ie., that route is not offloaded:
> 
> $ # ip ro ls
> ...
> 1.1.1.0/24 dev swp1s0 scope link
> ...
> 
> Interesting.

It allocates the RIF because of the enslavement to a VRF, which is an
explicit indication the user wants to use the interface for L3
forwarding.

David, can we please get back to the issue at hand? What's the problem
with the location of the call to rt6_sync_up()?
David Ahern Jan. 3, 2018, 6:47 p.m. UTC | #7
On 1/3/18 10:40 AM, Ido Schimmel wrote:
> David, can we please get back to the issue at hand? What's the problem
> with the location of the call to rt6_sync_up()?

My original comment was asking why do it on NETDEV_CHANGE when it should
only be needed on NETDEV_UP.
Ido Schimmel Jan. 3, 2018, 8:53 p.m. UTC | #8
On Wed, Jan 03, 2018 at 11:47:16AM -0700, David Ahern wrote:
> On 1/3/18 10:40 AM, Ido Schimmel wrote:
> > David, can we please get back to the issue at hand? What's the problem
> > with the location of the call to rt6_sync_up()?
> 
> My original comment was asking why do it on NETDEV_CHANGE when it should
> only be needed on NETDEV_UP.

I can condition the call to rt6_sync_up() on the event being NETDEV_UP,
but the location needs to stay the same. Before that the interface still
doesn't have an IP address.

Reason for this requirement is that rt6_sync_up() is going to generate
FIB_EVENT_NH_ADD events that instruct switch drivers to populate their
adjacency tables with the notified nexthop. For this to happen, the
nexthop device needs to have L3 configuration (e.g., RIF in mlxsw) which
is dependent on the presence of an IP address.
David Ahern Jan. 3, 2018, 11:08 p.m. UTC | #9
On 1/3/18 1:53 PM, Ido Schimmel wrote:
> On Wed, Jan 03, 2018 at 11:47:16AM -0700, David Ahern wrote:
>> On 1/3/18 10:40 AM, Ido Schimmel wrote:
>>> David, can we please get back to the issue at hand? What's the problem
>>> with the location of the call to rt6_sync_up()?
>>
>> My original comment was asking why do it on NETDEV_CHANGE when it should
>> only be needed on NETDEV_UP.
> 
> I can condition the call to rt6_sync_up() on the event being NETDEV_UP,
> but the location needs to stay the same. Before that the interface still
> doesn't have an IP address.

it's fine as is. The NETDEV_CHANGE section has a break after calling
rt6_sync_up(RTNH_F_LINKDOWN) which is added later. To your point,
addrconf_dev_config will add a linklocal address.
diff mbox series

Patch

diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index 18e442ea93d8..caad39198c2a 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -169,6 +169,7 @@  void rt6_ifdown(struct net *net, struct net_device *dev);
 void rt6_mtu_change(struct net_device *dev, unsigned int mtu);
 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp);
 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway);
+void rt6_sync_up(struct net_device *dev, unsigned int nh_flags);
 
 static inline const struct rt6_info *skb_rt6_info(const struct sk_buff *skb)
 {
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index ed06b1190f05..b6405568ed7b 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -3484,6 +3484,9 @@  static int addrconf_notify(struct notifier_block *this, unsigned long event,
 			if (run_pending)
 				addrconf_dad_run(idev);
 
+			/* Device has an address by now */
+			rt6_sync_up(dev, RTNH_F_DEAD);
+
 			/*
 			 * If the MTU changed during the interface down,
 			 * when the interface up, the changed MTU must be
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index f5eda0aeab55..4796d87e0b93 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -3459,6 +3459,35 @@  void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
 	fib6_clean_all(net, fib6_clean_tohost, gateway);
 }
 
+struct arg_netdev_event {
+	const struct net_device *dev;
+	unsigned int nh_flags;
+};
+
+static int fib6_ifup(struct rt6_info *rt, void *p_arg)
+{
+	const struct arg_netdev_event *arg = p_arg;
+	const struct net *net = dev_net(arg->dev);
+
+	if (rt != net->ipv6.ip6_null_entry && rt->dst.dev == arg->dev)
+		rt->rt6i_nh_flags &= ~arg->nh_flags;
+
+	return 0;
+}
+
+void rt6_sync_up(struct net_device *dev, unsigned int nh_flags)
+{
+	struct arg_netdev_event arg = {
+		.dev = dev,
+		.nh_flags = nh_flags,
+	};
+
+	if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
+		arg.nh_flags |= RTNH_F_LINKDOWN;
+
+	fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
+}
+
 struct arg_dev_net {
 	struct net_device *dev;
 	struct net *net;