diff mbox series

[v3,net-next,2/5] net/ipv6: Address checks need to consider the L3 domain

Message ID 20180307035841.774-3-dsahern@gmail.com
State Changes Requested, archived
Delegated to: David Miller
Headers show
Series [v3,net-next,1/5] net/ipv6: Refactor gateway validation on route add | expand

Commit Message

David Ahern March 7, 2018, 3:58 a.m. UTC
ipv6_chk_addr_and_flags determines if an address is a local address. It
is called by ip6_route_info_create to validate a gateway address is not a
local address. It currently does not consider L3 domains and as a result
does not allow a route to be added in one VRF if the nexthop points to
an address in a second VRF. e.g.,

    $ ip route add 2001:db8:1::/64 vrf r2 via 2001:db8:102::23
    Error: Invalid gateway address.

where 2001:db8:102::23 is an address on an interface in vrf r1.

Resolve by comparing the l3mdev for the passed in device and requiring an
l3mdev match with the device containing an address. The intent of checking
for an address on the specified device versus any device in the domain is
mantained by a new argument to skip the check between the passed in device
and the device with the address.

Update the handful of users of ipv6_chk_addr with a NULL dev argument:
- anycast to call ipv6_chk_addr_and_flags. If the device is given by the
  user, look for the given address across the L3 domain. If the index is
  not given, the default table is presumed so only addresses on devices
  not enslaved are considered.

- ip6_tnl_rcv_ctl - local address must exist on device, remote address
  can not exist in L3 domain; only remote check needs to be updated but
  do both for consistency.

ip6_validate_gw needs to handle 2 cases - one where the device is given
as part of the nexthop spec and the other where the device is resolved.
There is at least 1 VRF case where deferring the check to only after
the route lookup has resolved the device fails with an unintuitive error
"RTNETLINK answers: No route to host" as opposed to the preferred
"Error: Gateway can not be a local address." The 'no route to host'
error is because of the fallback to a full lookup.

Signed-off-by: David Ahern <dsahern@gmail.com>
---
 include/net/addrconf.h |  4 ++--
 net/ipv6/addrconf.c    | 26 ++++++++++++++++++++++----
 net/ipv6/anycast.c     |  9 ++++++---
 net/ipv6/datagram.c    |  5 +++--
 net/ipv6/ip6_tunnel.c  | 12 ++++++++----
 net/ipv6/ndisc.c       |  2 +-
 net/ipv6/route.c       | 37 ++++++++++++++++++++++++++++---------
 7 files changed, 70 insertions(+), 25 deletions(-)

Comments

Kirill Tkhai March 7, 2018, 11:53 a.m. UTC | #1
On 07.03.2018 06:58, David Ahern wrote:
> ipv6_chk_addr_and_flags determines if an address is a local address. It
> is called by ip6_route_info_create to validate a gateway address is not a
> local address. It currently does not consider L3 domains and as a result
> does not allow a route to be added in one VRF if the nexthop points to
> an address in a second VRF. e.g.,
> 
>     $ ip route add 2001:db8:1::/64 vrf r2 via 2001:db8:102::23
>     Error: Invalid gateway address.
> 
> where 2001:db8:102::23 is an address on an interface in vrf r1.
> 
> Resolve by comparing the l3mdev for the passed in device and requiring an
> l3mdev match with the device containing an address. The intent of checking
> for an address on the specified device versus any device in the domain is
> mantained by a new argument to skip the check between the passed in device
> and the device with the address.
> 
> Update the handful of users of ipv6_chk_addr with a NULL dev argument:
> - anycast to call ipv6_chk_addr_and_flags. If the device is given by the
>   user, look for the given address across the L3 domain. If the index is
>   not given, the default table is presumed so only addresses on devices
>   not enslaved are considered.
> 
> - ip6_tnl_rcv_ctl - local address must exist on device, remote address
>   can not exist in L3 domain; only remote check needs to be updated but
>   do both for consistency.
> 
> ip6_validate_gw needs to handle 2 cases - one where the device is given
> as part of the nexthop spec and the other where the device is resolved.
> There is at least 1 VRF case where deferring the check to only after
> the route lookup has resolved the device fails with an unintuitive error
> "RTNETLINK answers: No route to host" as opposed to the preferred
> "Error: Gateway can not be a local address." The 'no route to host'
> error is because of the fallback to a full lookup.
> 
> Signed-off-by: David Ahern <dsahern@gmail.com>
> ---
>  include/net/addrconf.h |  4 ++--
>  net/ipv6/addrconf.c    | 26 ++++++++++++++++++++++----
>  net/ipv6/anycast.c     |  9 ++++++---
>  net/ipv6/datagram.c    |  5 +++--
>  net/ipv6/ip6_tunnel.c  | 12 ++++++++----
>  net/ipv6/ndisc.c       |  2 +-
>  net/ipv6/route.c       | 37 ++++++++++++++++++++++++++++---------
>  7 files changed, 70 insertions(+), 25 deletions(-)
> 
> diff --git a/include/net/addrconf.h b/include/net/addrconf.h
> index c4185a7b0e90..132e5b95167a 100644
> --- a/include/net/addrconf.h
> +++ b/include/net/addrconf.h
> @@ -69,8 +69,8 @@ int addrconf_set_dstaddr(struct net *net, void __user *arg);
>  int ipv6_chk_addr(struct net *net, const struct in6_addr *addr,
>  		  const struct net_device *dev, int strict);
>  int ipv6_chk_addr_and_flags(struct net *net, const struct in6_addr *addr,
> -			    const struct net_device *dev, int strict,
> -			    u32 banned_flags);
> +			    const struct net_device *dev, bool skip_dev_check,
> +			    int strict, u32 banned_flags);

This function already has 5 arguments, while this patch adds one more.
Can't we use new flags argument for both of them?

Also, the name of function and input parameters are already so big, that they
don't fit a single line already, while your patch adds more parameters.
Can't we make it more slim? Something like ipv6_chk_addr_fl() instead of current
name.

>  #if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
>  int ipv6_chk_home_addr(struct net *net, const struct in6_addr *addr);
> diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
> index b5fd116c046a..17d5d3f42d21 100644
> --- a/net/ipv6/addrconf.c
> +++ b/net/ipv6/addrconf.c
> @@ -1851,22 +1851,40 @@ static int ipv6_count_addresses(const struct inet6_dev *idev)
>  int ipv6_chk_addr(struct net *net, const struct in6_addr *addr,
>  		  const struct net_device *dev, int strict)
>  {
> -	return ipv6_chk_addr_and_flags(net, addr, dev, strict, IFA_F_TENTATIVE);
> +	return ipv6_chk_addr_and_flags(net, addr, dev, !dev,
> +				       strict, IFA_F_TENTATIVE);
>  }
>  EXPORT_SYMBOL(ipv6_chk_addr);

This function was not introduced by this commit, but since the commit modifies it,
and the function is pretty simple, we could declare it as static inline in header
in separate patch.

>  
> +/* device argument is used to find the L3 domain of interest. If
> + * skip_dev_check is set, then the ifp device is not checked against
> + * the passed in dev argument. So the 2 cases for addresses checks are:
> + *   1. does the address exist in the L3 domain that dev is part of
> + *      (skip_dev_check = true), or
> + *
> + *   2. does the address exist on the specific device
> + *      (skip_dev_check = false)
> + */
>  int ipv6_chk_addr_and_flags(struct net *net, const struct in6_addr *addr,
> -			    const struct net_device *dev, int strict,
> -			    u32 banned_flags)
> +			    const struct net_device *dev, bool skip_dev_check,
> +			    int strict, u32 banned_flags)
>  {
>  	unsigned int hash = inet6_addr_hash(net, addr);
> +	const struct net_device *l3mdev;
>  	struct inet6_ifaddr *ifp;
>  	u32 ifp_flags;
>  
>  	rcu_read_lock();
> +
> +	l3mdev = l3mdev_master_dev_rcu(dev);
> +
>  	hlist_for_each_entry_rcu(ifp, &inet6_addr_lst[hash], addr_lst) {
>  		if (!net_eq(dev_net(ifp->idev->dev), net))
>  			continue;
> +
> +		if (l3mdev_master_dev_rcu(ifp->idev->dev) != l3mdev)
> +			continue;
> +
>  		/* Decouple optimistic from tentative for evaluation here.
>  		 * Ban optimistic addresses explicitly, when required.
>  		 */
> @@ -1875,7 +1893,7 @@ int ipv6_chk_addr_and_flags(struct net *net, const struct in6_addr *addr,
>  			    : ifp->flags;
>  		if (ipv6_addr_equal(&ifp->addr, addr) &&
>  		    !(ifp_flags&banned_flags) &&
> -		    (!dev || ifp->idev->dev == dev ||
> +		    (skip_dev_check || ifp->idev->dev == dev ||
>  		     !(ifp->scope&(IFA_LINK|IFA_HOST) || strict))) {
>  			rcu_read_unlock();
>  			return 1;

There are two logical pieces in changes of this function:

1)You become always pass not NULL dev and add skip_dev_check argument.
2)l3mdev_master_dev_rcu() check is introduced.

They should go in separate patches.

> diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c
> index c61718dba2e6..d580d4d456a5 100644
> --- a/net/ipv6/anycast.c
> +++ b/net/ipv6/anycast.c
> @@ -66,7 +66,11 @@ int ipv6_sock_ac_join(struct sock *sk, int ifindex, const struct in6_addr *addr)
>  		return -EPERM;
>  	if (ipv6_addr_is_multicast(addr))
>  		return -EINVAL;
> -	if (ipv6_chk_addr(net, addr, NULL, 0))
> +
> +	if (ifindex)
> +		dev = __dev_get_by_index(net, ifindex);
> +
> +	if (ipv6_chk_addr_and_flags(net, addr, dev, true, 0, IFA_F_TENTATIVE))
>  		return -EINVAL;
>  
>  	pac = sock_kmalloc(sk, sizeof(struct ipv6_ac_socklist), GFP_KERNEL);
> @@ -90,8 +94,7 @@ int ipv6_sock_ac_join(struct sock *sk, int ifindex, const struct in6_addr *addr)
>  			dev = __dev_get_by_flags(net, IFF_UP,
>  						 IFF_UP | IFF_LOOPBACK);
>  		}
> -	} else
> -		dev = __dev_get_by_index(net, ifindex);
> +	}

The hunk moving __dev_get_by_index() dereference may go as separate change, as it's a refactoring.
This will make the review easier.

>  
>  	if (!dev) {
>  		err = -ENODEV;
> diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
> index fbf08ce3f5ab..b27333d7b099 100644
> --- a/net/ipv6/datagram.c
> +++ b/net/ipv6/datagram.c
> @@ -801,8 +801,9 @@ int ip6_datagram_send_ctl(struct net *net, struct sock *sk,
>  			if (addr_type != IPV6_ADDR_ANY) {
>  				int strict = __ipv6_addr_src_scope(addr_type) <= IPV6_ADDR_SCOPE_LINKLOCAL;
>  				if (!(inet_sk(sk)->freebind || inet_sk(sk)->transparent) &&
> -				    !ipv6_chk_addr(net, &src_info->ipi6_addr,
> -						   strict ? dev : NULL, 0) &&
> +				    !ipv6_chk_addr_and_flags(net, &src_info->ipi6_addr,
> +							     dev, !strict, 0,
> +							     IFA_F_TENTATIVE) &&
>  				    !ipv6_chk_acast_addr_src(net, dev,
>  							     &src_info->ipi6_addr))
>  					err = -EINVAL;
> diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
> index 56c4967f1868..1ce8244e8aee 100644
> --- a/net/ipv6/ip6_tunnel.c
> +++ b/net/ipv6/ip6_tunnel.c
> @@ -758,9 +758,11 @@ int ip6_tnl_rcv_ctl(struct ip6_tnl *t,
>  			ldev = dev_get_by_index_rcu(net, p->link);
>  
>  		if ((ipv6_addr_is_multicast(laddr) ||
> -		     likely(ipv6_chk_addr(net, laddr, ldev, 0))) &&
> +		     likely(ipv6_chk_addr_and_flags(net, laddr, ldev, false,
> +						    0, IFA_F_TENTATIVE))) &&
>  		    ((p->flags & IP6_TNL_F_ALLOW_LOCAL_REMOTE) ||
> -		     likely(!ipv6_chk_addr(net, raddr, NULL, 0))))
> +		     likely(!ipv6_chk_addr_and_flags(net, raddr, ldev, true,
> +						     0, IFA_F_TENTATIVE))))
>  			ret = 1;
>  	}
>  	return ret;
> @@ -990,12 +992,14 @@ int ip6_tnl_xmit_ctl(struct ip6_tnl *t,
>  		if (p->link)
>  			ldev = dev_get_by_index_rcu(net, p->link);
>  
> -		if (unlikely(!ipv6_chk_addr(net, laddr, ldev, 0)))
> +		if (unlikely(!ipv6_chk_addr_and_flags(net, laddr, ldev, false,
> +						      0, IFA_F_TENTATIVE)))
>  			pr_warn("%s xmit: Local address not yet configured!\n",
>  				p->name);
>  		else if (!(p->flags & IP6_TNL_F_ALLOW_LOCAL_REMOTE) &&
>  			 !ipv6_addr_is_multicast(raddr) &&
> -			 unlikely(ipv6_chk_addr(net, raddr, NULL, 0)))
> +			 unlikely(ipv6_chk_addr_and_flags(net, raddr, ldev,
> +							  true, 0, IFA_F_TENTATIVE)))
>  			pr_warn("%s xmit: Routing loop! Remote address found on this node!\n",
>  				p->name);
>  		else
> diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
> index 0a19ce3a6f7f..13bf775c7f1a 100644
> --- a/net/ipv6/ndisc.c
> +++ b/net/ipv6/ndisc.c
> @@ -707,7 +707,7 @@ static void ndisc_solicit(struct neighbour *neigh, struct sk_buff *skb)
>  	int probes = atomic_read(&neigh->probes);
>  
>  	if (skb && ipv6_chk_addr_and_flags(dev_net(dev), &ipv6_hdr(skb)->saddr,
> -					   dev, 1,
> +					   dev, false, 1,
>  					   IFA_F_TENTATIVE|IFA_F_OPTIMISTIC))
>  		saddr = &ipv6_hdr(skb)->saddr;
>  	probes -= NEIGH_VAR(neigh->parms, UCAST_PROBES);
> diff --git a/net/ipv6/route.c b/net/ipv6/route.c
> index 3851c3ccfd7a..bbc62799eb3b 100644
> --- a/net/ipv6/route.c
> +++ b/net/ipv6/route.c
> @@ -2633,18 +2633,25 @@ static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
>  	const struct in6_addr *gw_addr = &cfg->fc_gateway;
>  	int gwa_type = ipv6_addr_type(gw_addr);
>  	const struct net_device *dev = *_dev;
> +	bool need_local_addr_check = !dev;
>  	int err = -EINVAL;
>  
> -	/* if gw_addr is local we will fail to detect this in case
> -	 * address is still TENTATIVE (DAD in progress). rt6_lookup()
> -	 * will return already-added prefix route via interface that
> -	 * prefix route was assigned to, which might be non-loopback.
> +	/* if route spec contains the device, check if gateway address
> +	 * is a local address in the same L3 domain
>  	 */
> -	if (ipv6_chk_addr_and_flags(net, gw_addr,
> -				    gwa_type & IPV6_ADDR_LINKLOCAL ?
> -				    dev : NULL, 0, 0)) {
> -		NL_SET_ERR_MSG(extack, "Invalid gateway address");
> -		goto out;
> +	if (dev) {
> +		/* if gw_addr is local we will fail to detect this in case
> +		 * address is still TENTATIVE (DAD in progress). rt6_lookup()
> +		 * will return already-added prefix route via interface that
> +		 * prefix route was assigned to, which might be non-loopback.
> +		 */
> +		if (ipv6_chk_addr_and_flags(net, gw_addr, dev,
> +					    gwa_type & IPV6_ADDR_LINKLOCAL ?
> +					    false : true, 0, 0)) {
> +			NL_SET_ERR_MSG(extack,
> +				       "Gateway can not be a local address");
> +			goto out;
> +		}

Why do these two "if" go as tree, not as single "if (a && b)"?

>  	}
>  
>  	if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) {
> @@ -2683,6 +2690,18 @@ static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
>  			       "Egress device can not be loopback device for this route");
>  		goto out;
>  	}
> +
> +	/* if we did not check gw_addr above, do so now that the
> +	 * egress device has been resolved.
> +	 */
> +	if (need_local_addr_check &&

Do we really need a variable with so long name? Can't we use "local_check" or something like this?

> +	    ipv6_chk_addr_and_flags(net, gw_addr, dev,
> +				    gwa_type & IPV6_ADDR_LINKLOCAL ?
> +				    false : true, 0, 0)) {

Second time there is "gwa_type & IPV6_ADDR_LINKLOCAL ? false : true". gwa_type is constant,
it doesn't change. Repeating constant expressions have to be be cached into local variable
for improving readability.

> +		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
> +		goto out;
> +	}
> +
>  	err = 0;
>  out:
>  	return err;
> 

Thanks,
Kirill
Kirill Tkhai March 7, 2018, 11:59 a.m. UTC | #2
On 07.03.2018 14:53, Kirill Tkhai wrote:
> On 07.03.2018 06:58, David Ahern wrote:
>> ipv6_chk_addr_and_flags determines if an address is a local address. It
>> is called by ip6_route_info_create to validate a gateway address is not a
>> local address. It currently does not consider L3 domains and as a result
>> does not allow a route to be added in one VRF if the nexthop points to
>> an address in a second VRF. e.g.,
>>
>>     $ ip route add 2001:db8:1::/64 vrf r2 via 2001:db8:102::23
>>     Error: Invalid gateway address.
>>
>> where 2001:db8:102::23 is an address on an interface in vrf r1.
>>
>> Resolve by comparing the l3mdev for the passed in device and requiring an
>> l3mdev match with the device containing an address. The intent of checking
>> for an address on the specified device versus any device in the domain is
>> mantained by a new argument to skip the check between the passed in device
>> and the device with the address.
>>
>> Update the handful of users of ipv6_chk_addr with a NULL dev argument:
>> - anycast to call ipv6_chk_addr_and_flags. If the device is given by the
>>   user, look for the given address across the L3 domain. If the index is
>>   not given, the default table is presumed so only addresses on devices
>>   not enslaved are considered.
>>
>> - ip6_tnl_rcv_ctl - local address must exist on device, remote address
>>   can not exist in L3 domain; only remote check needs to be updated but
>>   do both for consistency.
>>
>> ip6_validate_gw needs to handle 2 cases - one where the device is given
>> as part of the nexthop spec and the other where the device is resolved.
>> There is at least 1 VRF case where deferring the check to only after
>> the route lookup has resolved the device fails with an unintuitive error
>> "RTNETLINK answers: No route to host" as opposed to the preferred
>> "Error: Gateway can not be a local address." The 'no route to host'
>> error is because of the fallback to a full lookup.
>>
>> Signed-off-by: David Ahern <dsahern@gmail.com>
>> ---
>>  include/net/addrconf.h |  4 ++--
>>  net/ipv6/addrconf.c    | 26 ++++++++++++++++++++++----
>>  net/ipv6/anycast.c     |  9 ++++++---
>>  net/ipv6/datagram.c    |  5 +++--
>>  net/ipv6/ip6_tunnel.c  | 12 ++++++++----
>>  net/ipv6/ndisc.c       |  2 +-
>>  net/ipv6/route.c       | 37 ++++++++++++++++++++++++++++---------
>>  7 files changed, 70 insertions(+), 25 deletions(-)
>>
>> diff --git a/include/net/addrconf.h b/include/net/addrconf.h
>> index c4185a7b0e90..132e5b95167a 100644
>> --- a/include/net/addrconf.h
>> +++ b/include/net/addrconf.h
>> @@ -69,8 +69,8 @@ int addrconf_set_dstaddr(struct net *net, void __user *arg);
>>  int ipv6_chk_addr(struct net *net, const struct in6_addr *addr,
>>  		  const struct net_device *dev, int strict);
>>  int ipv6_chk_addr_and_flags(struct net *net, const struct in6_addr *addr,
>> -			    const struct net_device *dev, int strict,
>> -			    u32 banned_flags);
>> +			    const struct net_device *dev, bool skip_dev_check,
>> +			    int strict, u32 banned_flags);
> 
> This function already has 5 arguments, while this patch adds one more.
> Can't we use new flags argument for both of them?

Them are skip_dev_check and strict.
 
> Also, the name of function and input parameters are already so big, that they
> don't fit a single line already, while your patch adds more parameters.
> Can't we make it more slim? Something like ipv6_chk_addr_fl() instead of current
> name.
> 
>>  #if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
>>  int ipv6_chk_home_addr(struct net *net, const struct in6_addr *addr);
>> diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
>> index b5fd116c046a..17d5d3f42d21 100644
>> --- a/net/ipv6/addrconf.c
>> +++ b/net/ipv6/addrconf.c
>> @@ -1851,22 +1851,40 @@ static int ipv6_count_addresses(const struct inet6_dev *idev)
>>  int ipv6_chk_addr(struct net *net, const struct in6_addr *addr,
>>  		  const struct net_device *dev, int strict)
>>  {
>> -	return ipv6_chk_addr_and_flags(net, addr, dev, strict, IFA_F_TENTATIVE);
>> +	return ipv6_chk_addr_and_flags(net, addr, dev, !dev,
>> +				       strict, IFA_F_TENTATIVE);
>>  }
>>  EXPORT_SYMBOL(ipv6_chk_addr);
> 
> This function was not introduced by this commit, but since the commit modifies it,
> and the function is pretty simple, we could declare it as static inline in header
> in separate patch.
> 
>>  
>> +/* device argument is used to find the L3 domain of interest. If
>> + * skip_dev_check is set, then the ifp device is not checked against
>> + * the passed in dev argument. So the 2 cases for addresses checks are:
>> + *   1. does the address exist in the L3 domain that dev is part of
>> + *      (skip_dev_check = true), or
>> + *
>> + *   2. does the address exist on the specific device
>> + *      (skip_dev_check = false)
>> + */
>>  int ipv6_chk_addr_and_flags(struct net *net, const struct in6_addr *addr,
>> -			    const struct net_device *dev, int strict,
>> -			    u32 banned_flags)
>> +			    const struct net_device *dev, bool skip_dev_check,
>> +			    int strict, u32 banned_flags)
>>  {
>>  	unsigned int hash = inet6_addr_hash(net, addr);
>> +	const struct net_device *l3mdev;
>>  	struct inet6_ifaddr *ifp;
>>  	u32 ifp_flags;
>>  
>>  	rcu_read_lock();
>> +
>> +	l3mdev = l3mdev_master_dev_rcu(dev);
>> +
>>  	hlist_for_each_entry_rcu(ifp, &inet6_addr_lst[hash], addr_lst) {
>>  		if (!net_eq(dev_net(ifp->idev->dev), net))
>>  			continue;
>> +
>> +		if (l3mdev_master_dev_rcu(ifp->idev->dev) != l3mdev)
>> +			continue;
>> +
>>  		/* Decouple optimistic from tentative for evaluation here.
>>  		 * Ban optimistic addresses explicitly, when required.
>>  		 */
>> @@ -1875,7 +1893,7 @@ int ipv6_chk_addr_and_flags(struct net *net, const struct in6_addr *addr,
>>  			    : ifp->flags;
>>  		if (ipv6_addr_equal(&ifp->addr, addr) &&
>>  		    !(ifp_flags&banned_flags) &&
>> -		    (!dev || ifp->idev->dev == dev ||
>> +		    (skip_dev_check || ifp->idev->dev == dev ||
>>  		     !(ifp->scope&(IFA_LINK|IFA_HOST) || strict))) {
>>  			rcu_read_unlock();
>>  			return 1;
> 
> There are two logical pieces in changes of this function:
> 
> 1)You become always pass not NULL dev and add skip_dev_check argument.
> 2)l3mdev_master_dev_rcu() check is introduced.
> 
> They should go in separate patches.
> 
>> diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c
>> index c61718dba2e6..d580d4d456a5 100644
>> --- a/net/ipv6/anycast.c
>> +++ b/net/ipv6/anycast.c
>> @@ -66,7 +66,11 @@ int ipv6_sock_ac_join(struct sock *sk, int ifindex, const struct in6_addr *addr)
>>  		return -EPERM;
>>  	if (ipv6_addr_is_multicast(addr))
>>  		return -EINVAL;
>> -	if (ipv6_chk_addr(net, addr, NULL, 0))
>> +
>> +	if (ifindex)
>> +		dev = __dev_get_by_index(net, ifindex);
>> +
>> +	if (ipv6_chk_addr_and_flags(net, addr, dev, true, 0, IFA_F_TENTATIVE))
>>  		return -EINVAL;
>>  
>>  	pac = sock_kmalloc(sk, sizeof(struct ipv6_ac_socklist), GFP_KERNEL);
>> @@ -90,8 +94,7 @@ int ipv6_sock_ac_join(struct sock *sk, int ifindex, const struct in6_addr *addr)
>>  			dev = __dev_get_by_flags(net, IFF_UP,
>>  						 IFF_UP | IFF_LOOPBACK);
>>  		}
>> -	} else
>> -		dev = __dev_get_by_index(net, ifindex);
>> +	}
> 
> The hunk moving __dev_get_by_index() dereference may go as separate change, as it's a refactoring.
> This will make the review easier.
> 
>>  
>>  	if (!dev) {
>>  		err = -ENODEV;
>> diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
>> index fbf08ce3f5ab..b27333d7b099 100644
>> --- a/net/ipv6/datagram.c
>> +++ b/net/ipv6/datagram.c
>> @@ -801,8 +801,9 @@ int ip6_datagram_send_ctl(struct net *net, struct sock *sk,
>>  			if (addr_type != IPV6_ADDR_ANY) {
>>  				int strict = __ipv6_addr_src_scope(addr_type) <= IPV6_ADDR_SCOPE_LINKLOCAL;
>>  				if (!(inet_sk(sk)->freebind || inet_sk(sk)->transparent) &&
>> -				    !ipv6_chk_addr(net, &src_info->ipi6_addr,
>> -						   strict ? dev : NULL, 0) &&
>> +				    !ipv6_chk_addr_and_flags(net, &src_info->ipi6_addr,
>> +							     dev, !strict, 0,
>> +							     IFA_F_TENTATIVE) &&
>>  				    !ipv6_chk_acast_addr_src(net, dev,
>>  							     &src_info->ipi6_addr))
>>  					err = -EINVAL;
>> diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
>> index 56c4967f1868..1ce8244e8aee 100644
>> --- a/net/ipv6/ip6_tunnel.c
>> +++ b/net/ipv6/ip6_tunnel.c
>> @@ -758,9 +758,11 @@ int ip6_tnl_rcv_ctl(struct ip6_tnl *t,
>>  			ldev = dev_get_by_index_rcu(net, p->link);
>>  
>>  		if ((ipv6_addr_is_multicast(laddr) ||
>> -		     likely(ipv6_chk_addr(net, laddr, ldev, 0))) &&
>> +		     likely(ipv6_chk_addr_and_flags(net, laddr, ldev, false,
>> +						    0, IFA_F_TENTATIVE))) &&
>>  		    ((p->flags & IP6_TNL_F_ALLOW_LOCAL_REMOTE) ||
>> -		     likely(!ipv6_chk_addr(net, raddr, NULL, 0))))
>> +		     likely(!ipv6_chk_addr_and_flags(net, raddr, ldev, true,
>> +						     0, IFA_F_TENTATIVE))))
>>  			ret = 1;
>>  	}
>>  	return ret;
>> @@ -990,12 +992,14 @@ int ip6_tnl_xmit_ctl(struct ip6_tnl *t,
>>  		if (p->link)
>>  			ldev = dev_get_by_index_rcu(net, p->link);
>>  
>> -		if (unlikely(!ipv6_chk_addr(net, laddr, ldev, 0)))
>> +		if (unlikely(!ipv6_chk_addr_and_flags(net, laddr, ldev, false,
>> +						      0, IFA_F_TENTATIVE)))
>>  			pr_warn("%s xmit: Local address not yet configured!\n",
>>  				p->name);
>>  		else if (!(p->flags & IP6_TNL_F_ALLOW_LOCAL_REMOTE) &&
>>  			 !ipv6_addr_is_multicast(raddr) &&
>> -			 unlikely(ipv6_chk_addr(net, raddr, NULL, 0)))
>> +			 unlikely(ipv6_chk_addr_and_flags(net, raddr, ldev,
>> +							  true, 0, IFA_F_TENTATIVE)))
>>  			pr_warn("%s xmit: Routing loop! Remote address found on this node!\n",
>>  				p->name);
>>  		else
>> diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
>> index 0a19ce3a6f7f..13bf775c7f1a 100644
>> --- a/net/ipv6/ndisc.c
>> +++ b/net/ipv6/ndisc.c
>> @@ -707,7 +707,7 @@ static void ndisc_solicit(struct neighbour *neigh, struct sk_buff *skb)
>>  	int probes = atomic_read(&neigh->probes);
>>  
>>  	if (skb && ipv6_chk_addr_and_flags(dev_net(dev), &ipv6_hdr(skb)->saddr,
>> -					   dev, 1,
>> +					   dev, false, 1,
>>  					   IFA_F_TENTATIVE|IFA_F_OPTIMISTIC))
>>  		saddr = &ipv6_hdr(skb)->saddr;
>>  	probes -= NEIGH_VAR(neigh->parms, UCAST_PROBES);
>> diff --git a/net/ipv6/route.c b/net/ipv6/route.c
>> index 3851c3ccfd7a..bbc62799eb3b 100644
>> --- a/net/ipv6/route.c
>> +++ b/net/ipv6/route.c
>> @@ -2633,18 +2633,25 @@ static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
>>  	const struct in6_addr *gw_addr = &cfg->fc_gateway;
>>  	int gwa_type = ipv6_addr_type(gw_addr);
>>  	const struct net_device *dev = *_dev;
>> +	bool need_local_addr_check = !dev;
>>  	int err = -EINVAL;
>>  
>> -	/* if gw_addr is local we will fail to detect this in case
>> -	 * address is still TENTATIVE (DAD in progress). rt6_lookup()
>> -	 * will return already-added prefix route via interface that
>> -	 * prefix route was assigned to, which might be non-loopback.
>> +	/* if route spec contains the device, check if gateway address
>> +	 * is a local address in the same L3 domain
>>  	 */
>> -	if (ipv6_chk_addr_and_flags(net, gw_addr,
>> -				    gwa_type & IPV6_ADDR_LINKLOCAL ?
>> -				    dev : NULL, 0, 0)) {
>> -		NL_SET_ERR_MSG(extack, "Invalid gateway address");
>> -		goto out;
>> +	if (dev) {
>> +		/* if gw_addr is local we will fail to detect this in case
>> +		 * address is still TENTATIVE (DAD in progress). rt6_lookup()
>> +		 * will return already-added prefix route via interface that
>> +		 * prefix route was assigned to, which might be non-loopback.
>> +		 */
>> +		if (ipv6_chk_addr_and_flags(net, gw_addr, dev,
>> +					    gwa_type & IPV6_ADDR_LINKLOCAL ?
>> +					    false : true, 0, 0)) {
>> +			NL_SET_ERR_MSG(extack,
>> +				       "Gateway can not be a local address");
>> +			goto out;
>> +		}
> 
> Why do these two "if" go as tree, not as single "if (a && b)"?
> 
>>  	}
>>  
>>  	if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) {
>> @@ -2683,6 +2690,18 @@ static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
>>  			       "Egress device can not be loopback device for this route");
>>  		goto out;
>>  	}
>> +
>> +	/* if we did not check gw_addr above, do so now that the
>> +	 * egress device has been resolved.
>> +	 */
>> +	if (need_local_addr_check &&
> 
> Do we really need a variable with so long name? Can't we use "local_check" or something like this?
> 
>> +	    ipv6_chk_addr_and_flags(net, gw_addr, dev,
>> +				    gwa_type & IPV6_ADDR_LINKLOCAL ?
>> +				    false : true, 0, 0)) {
> 
> Second time there is "gwa_type & IPV6_ADDR_LINKLOCAL ? false : true". gwa_type is constant,
> it doesn't change. Repeating constant expressions have to be be cached into local variable
> for improving readability.
> 
>> +		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
>> +		goto out;
>> +	}
>> +
>>  	err = 0;
>>  out:
>>  	return err;
>>
> 
> Thanks,
> Kirill
>
David Ahern March 7, 2018, 5:28 p.m. UTC | #3
On 3/7/18 4:53 AM, Kirill Tkhai wrote:
>> diff --git a/include/net/addrconf.h b/include/net/addrconf.h
>> index c4185a7b0e90..132e5b95167a 100644
>> --- a/include/net/addrconf.h
>> +++ b/include/net/addrconf.h
>> @@ -69,8 +69,8 @@ int addrconf_set_dstaddr(struct net *net, void __user *arg);
>>  int ipv6_chk_addr(struct net *net, const struct in6_addr *addr,
>>  		  const struct net_device *dev, int strict);
>>  int ipv6_chk_addr_and_flags(struct net *net, const struct in6_addr *addr,
>> -			    const struct net_device *dev, int strict,
>> -			    u32 banned_flags);
>> +			    const struct net_device *dev, bool skip_dev_check,
>> +			    int strict, u32 banned_flags);
> 
> This function already has 5 arguments, while this patch adds one more.
> Can't we use new flags argument for both of them?
> 
> Also, the name of function and input parameters are already so big, that they
> don't fit a single line already, while your patch adds more parameters.
> Can't we make it more slim? Something like ipv6_chk_addr_fl() instead of current
> name.

I think I can combine strict and the new skip_dev_check. I am going to
leave the function name as is.

> 
>>  #if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
>>  int ipv6_chk_home_addr(struct net *net, const struct in6_addr *addr);
>> diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
>> index b5fd116c046a..17d5d3f42d21 100644
>> --- a/net/ipv6/addrconf.c
>> +++ b/net/ipv6/addrconf.c
>> @@ -1851,22 +1851,40 @@ static int ipv6_count_addresses(const struct inet6_dev *idev)
>>  int ipv6_chk_addr(struct net *net, const struct in6_addr *addr,
>>  		  const struct net_device *dev, int strict)
>>  {
>> -	return ipv6_chk_addr_and_flags(net, addr, dev, strict, IFA_F_TENTATIVE);
>> +	return ipv6_chk_addr_and_flags(net, addr, dev, !dev,
>> +				       strict, IFA_F_TENTATIVE);
>>  }
>>  EXPORT_SYMBOL(ipv6_chk_addr);
> 
> This function was not introduced by this commit, but since the commit modifies it,
> and the function is pretty simple, we could declare it as static inline in header
> in separate patch.

That function is needed by netfilter code. Any consolidation is outside
the scope of this patch set.

> 
>>  
>> +/* device argument is used to find the L3 domain of interest. If
>> + * skip_dev_check is set, then the ifp device is not checked against
>> + * the passed in dev argument. So the 2 cases for addresses checks are:
>> + *   1. does the address exist in the L3 domain that dev is part of
>> + *      (skip_dev_check = true), or
>> + *
>> + *   2. does the address exist on the specific device
>> + *      (skip_dev_check = false)
>> + */
>>  int ipv6_chk_addr_and_flags(struct net *net, const struct in6_addr *addr,
>> -			    const struct net_device *dev, int strict,
>> -			    u32 banned_flags)
>> +			    const struct net_device *dev, bool skip_dev_check,
>> +			    int strict, u32 banned_flags)
>>  {
>>  	unsigned int hash = inet6_addr_hash(net, addr);
>> +	const struct net_device *l3mdev;
>>  	struct inet6_ifaddr *ifp;
>>  	u32 ifp_flags;
>>  
>>  	rcu_read_lock();
>> +
>> +	l3mdev = l3mdev_master_dev_rcu(dev);
>> +
>>  	hlist_for_each_entry_rcu(ifp, &inet6_addr_lst[hash], addr_lst) {
>>  		if (!net_eq(dev_net(ifp->idev->dev), net))
>>  			continue;
>> +
>> +		if (l3mdev_master_dev_rcu(ifp->idev->dev) != l3mdev)
>> +			continue;
>> +
>>  		/* Decouple optimistic from tentative for evaluation here.
>>  		 * Ban optimistic addresses explicitly, when required.
>>  		 */
>> @@ -1875,7 +1893,7 @@ int ipv6_chk_addr_and_flags(struct net *net, const struct in6_addr *addr,
>>  			    : ifp->flags;
>>  		if (ipv6_addr_equal(&ifp->addr, addr) &&
>>  		    !(ifp_flags&banned_flags) &&
>> -		    (!dev || ifp->idev->dev == dev ||
>> +		    (skip_dev_check || ifp->idev->dev == dev ||
>>  		     !(ifp->scope&(IFA_LINK|IFA_HOST) || strict))) {
>>  			rcu_read_unlock();
>>  			return 1;
> 
> There are two logical pieces in changes of this function:
> 
> 1)You become always pass not NULL dev and add skip_dev_check argument.
> 2)l3mdev_master_dev_rcu() check is introduced.

dev can still be null; l3mdev lookup handles a null argument.

> 
> They should go in separate patches.

The change needs to go in together since I am altering how this function
works.

> 
>> diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c
>> index c61718dba2e6..d580d4d456a5 100644
>> --- a/net/ipv6/anycast.c
>> +++ b/net/ipv6/anycast.c
>> @@ -66,7 +66,11 @@ int ipv6_sock_ac_join(struct sock *sk, int ifindex, const struct in6_addr *addr)
>>  		return -EPERM;
>>  	if (ipv6_addr_is_multicast(addr))
>>  		return -EINVAL;
>> -	if (ipv6_chk_addr(net, addr, NULL, 0))
>> +
>> +	if (ifindex)
>> +		dev = __dev_get_by_index(net, ifindex);
>> +
>> +	if (ipv6_chk_addr_and_flags(net, addr, dev, true, 0, IFA_F_TENTATIVE))
>>  		return -EINVAL;
>>  
>>  	pac = sock_kmalloc(sk, sizeof(struct ipv6_ac_socklist), GFP_KERNEL);
>> @@ -90,8 +94,7 @@ int ipv6_sock_ac_join(struct sock *sk, int ifindex, const struct in6_addr *addr)
>>  			dev = __dev_get_by_flags(net, IFF_UP,
>>  						 IFF_UP | IFF_LOOPBACK);
>>  		}
>> -	} else
>> -		dev = __dev_get_by_index(net, ifindex);
>> +	}
> 
> The hunk moving __dev_get_by_index() dereference may go as separate change, as it's a refactoring.
> This will make the review easier.

Moving 1 line of code up a few lines should not need a standalone patch;
I think the above is understandable.

> 
>>  
>>  	if (!dev) {
>>  		err = -ENODEV;
>> diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
>> index fbf08ce3f5ab..b27333d7b099 100644
>> --- a/net/ipv6/datagram.c
>> +++ b/net/ipv6/datagram.c
>> @@ -801,8 +801,9 @@ int ip6_datagram_send_ctl(struct net *net, struct sock *sk,
>>  			if (addr_type != IPV6_ADDR_ANY) {
>>  				int strict = __ipv6_addr_src_scope(addr_type) <= IPV6_ADDR_SCOPE_LINKLOCAL;
>>  				if (!(inet_sk(sk)->freebind || inet_sk(sk)->transparent) &&
>> -				    !ipv6_chk_addr(net, &src_info->ipi6_addr,
>> -						   strict ? dev : NULL, 0) &&
>> +				    !ipv6_chk_addr_and_flags(net, &src_info->ipi6_addr,
>> +							     dev, !strict, 0,
>> +							     IFA_F_TENTATIVE) &&
>>  				    !ipv6_chk_acast_addr_src(net, dev,
>>  							     &src_info->ipi6_addr))
>>  					err = -EINVAL;
>> diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
>> index 56c4967f1868..1ce8244e8aee 100644
>> --- a/net/ipv6/ip6_tunnel.c
>> +++ b/net/ipv6/ip6_tunnel.c
>> @@ -758,9 +758,11 @@ int ip6_tnl_rcv_ctl(struct ip6_tnl *t,
>>  			ldev = dev_get_by_index_rcu(net, p->link);
>>  
>>  		if ((ipv6_addr_is_multicast(laddr) ||
>> -		     likely(ipv6_chk_addr(net, laddr, ldev, 0))) &&
>> +		     likely(ipv6_chk_addr_and_flags(net, laddr, ldev, false,
>> +						    0, IFA_F_TENTATIVE))) &&
>>  		    ((p->flags & IP6_TNL_F_ALLOW_LOCAL_REMOTE) ||
>> -		     likely(!ipv6_chk_addr(net, raddr, NULL, 0))))
>> +		     likely(!ipv6_chk_addr_and_flags(net, raddr, ldev, true,
>> +						     0, IFA_F_TENTATIVE))))
>>  			ret = 1;
>>  	}
>>  	return ret;
>> @@ -990,12 +992,14 @@ int ip6_tnl_xmit_ctl(struct ip6_tnl *t,
>>  		if (p->link)
>>  			ldev = dev_get_by_index_rcu(net, p->link);
>>  
>> -		if (unlikely(!ipv6_chk_addr(net, laddr, ldev, 0)))
>> +		if (unlikely(!ipv6_chk_addr_and_flags(net, laddr, ldev, false,
>> +						      0, IFA_F_TENTATIVE)))
>>  			pr_warn("%s xmit: Local address not yet configured!\n",
>>  				p->name);
>>  		else if (!(p->flags & IP6_TNL_F_ALLOW_LOCAL_REMOTE) &&
>>  			 !ipv6_addr_is_multicast(raddr) &&
>> -			 unlikely(ipv6_chk_addr(net, raddr, NULL, 0)))
>> +			 unlikely(ipv6_chk_addr_and_flags(net, raddr, ldev,
>> +							  true, 0, IFA_F_TENTATIVE)))
>>  			pr_warn("%s xmit: Routing loop! Remote address found on this node!\n",
>>  				p->name);
>>  		else
>> diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
>> index 0a19ce3a6f7f..13bf775c7f1a 100644
>> --- a/net/ipv6/ndisc.c
>> +++ b/net/ipv6/ndisc.c
>> @@ -707,7 +707,7 @@ static void ndisc_solicit(struct neighbour *neigh, struct sk_buff *skb)
>>  	int probes = atomic_read(&neigh->probes);
>>  
>>  	if (skb && ipv6_chk_addr_and_flags(dev_net(dev), &ipv6_hdr(skb)->saddr,
>> -					   dev, 1,
>> +					   dev, false, 1,
>>  					   IFA_F_TENTATIVE|IFA_F_OPTIMISTIC))
>>  		saddr = &ipv6_hdr(skb)->saddr;
>>  	probes -= NEIGH_VAR(neigh->parms, UCAST_PROBES);
>> diff --git a/net/ipv6/route.c b/net/ipv6/route.c
>> index 3851c3ccfd7a..bbc62799eb3b 100644
>> --- a/net/ipv6/route.c
>> +++ b/net/ipv6/route.c
>> @@ -2633,18 +2633,25 @@ static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
>>  	const struct in6_addr *gw_addr = &cfg->fc_gateway;
>>  	int gwa_type = ipv6_addr_type(gw_addr);
>>  	const struct net_device *dev = *_dev;
>> +	bool need_local_addr_check = !dev;
>>  	int err = -EINVAL;
>>  
>> -	/* if gw_addr is local we will fail to detect this in case
>> -	 * address is still TENTATIVE (DAD in progress). rt6_lookup()
>> -	 * will return already-added prefix route via interface that
>> -	 * prefix route was assigned to, which might be non-loopback.
>> +	/* if route spec contains the device, check if gateway address
>> +	 * is a local address in the same L3 domain
>>  	 */
>> -	if (ipv6_chk_addr_and_flags(net, gw_addr,
>> -				    gwa_type & IPV6_ADDR_LINKLOCAL ?
>> -				    dev : NULL, 0, 0)) {
>> -		NL_SET_ERR_MSG(extack, "Invalid gateway address");
>> -		goto out;
>> +	if (dev) {
>> +		/* if gw_addr is local we will fail to detect this in case
>> +		 * address is still TENTATIVE (DAD in progress). rt6_lookup()
>> +		 * will return already-added prefix route via interface that
>> +		 * prefix route was assigned to, which might be non-loopback.
>> +		 */
>> +		if (ipv6_chk_addr_and_flags(net, gw_addr, dev,
>> +					    gwa_type & IPV6_ADDR_LINKLOCAL ?
>> +					    false : true, 0, 0)) {
>> +			NL_SET_ERR_MSG(extack,
>> +				       "Gateway can not be a local address");
>> +			goto out;
>> +		}
> 
> Why do these two "if" go as tree, not as single "if (a && b)"?

yes, I should combine that.


> 
>>  	}
>>  
>>  	if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) {
>> @@ -2683,6 +2690,18 @@ static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
>>  			       "Egress device can not be loopback device for this route");
>>  		goto out;
>>  	}
>> +
>> +	/* if we did not check gw_addr above, do so now that the
>> +	 * egress device has been resolved.
>> +	 */
>> +	if (need_local_addr_check &&
> 
> Do we really need a variable with so long name? Can't we use "local_check" or something like this?

given the limited use of the flag, I believe the longer name is more
readable. Making it shorter does not allow the 2 checks to consolidate
into 1 line, so nothing is gained by an overly short variable name.

> 
>> +	    ipv6_chk_addr_and_flags(net, gw_addr, dev,
>> +				    gwa_type & IPV6_ADDR_LINKLOCAL ?
>> +				    false : true, 0, 0)) {
> 
> Second time there is "gwa_type & IPV6_ADDR_LINKLOCAL ? false : true". gwa_type is constant,
> it doesn't change. Repeating constant expressions have to be be cached into local variable
> for improving readability.

they don't have to be but I will make a bool for it.


> 
>> +		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
>> +		goto out;
>> +	}
>> +
>>  	err = 0;
>>  out:
>>  	return err;
>>
> 
> Thanks,
> Kirill
>
David Ahern March 7, 2018, 7:53 p.m. UTC | #4
On 3/7/18 10:28 AM, David Ahern wrote:
> On 3/7/18 4:53 AM, Kirill Tkhai wrote:
>>> diff --git a/include/net/addrconf.h b/include/net/addrconf.h
>>> index c4185a7b0e90..132e5b95167a 100644
>>> --- a/include/net/addrconf.h
>>> +++ b/include/net/addrconf.h
>>> @@ -69,8 +69,8 @@ int addrconf_set_dstaddr(struct net *net, void __user *arg);
>>>  int ipv6_chk_addr(struct net *net, const struct in6_addr *addr,
>>>  		  const struct net_device *dev, int strict);
>>>  int ipv6_chk_addr_and_flags(struct net *net, const struct in6_addr *addr,
>>> -			    const struct net_device *dev, int strict,
>>> -			    u32 banned_flags);
>>> +			    const struct net_device *dev, bool skip_dev_check,
>>> +			    int strict, u32 banned_flags);
>>
>> This function already has 5 arguments, while this patch adds one more.
>> Can't we use new flags argument for both of them?
>>
>> Also, the name of function and input parameters are already so big, that they
>> don't fit a single line already, while your patch adds more parameters.
>> Can't we make it more slim? Something like ipv6_chk_addr_fl() instead of current
>> name.
> 
> I think I can combine strict and the new skip_dev_check. I am going to
> leave the function name as is.
> 

Upon further review, I can not combine those flags; I missed a level of
() around the scope check which is what strict modifies. They need to be
separate arguments.
diff mbox series

Patch

diff --git a/include/net/addrconf.h b/include/net/addrconf.h
index c4185a7b0e90..132e5b95167a 100644
--- a/include/net/addrconf.h
+++ b/include/net/addrconf.h
@@ -69,8 +69,8 @@  int addrconf_set_dstaddr(struct net *net, void __user *arg);
 int ipv6_chk_addr(struct net *net, const struct in6_addr *addr,
 		  const struct net_device *dev, int strict);
 int ipv6_chk_addr_and_flags(struct net *net, const struct in6_addr *addr,
-			    const struct net_device *dev, int strict,
-			    u32 banned_flags);
+			    const struct net_device *dev, bool skip_dev_check,
+			    int strict, u32 banned_flags);
 
 #if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
 int ipv6_chk_home_addr(struct net *net, const struct in6_addr *addr);
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index b5fd116c046a..17d5d3f42d21 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -1851,22 +1851,40 @@  static int ipv6_count_addresses(const struct inet6_dev *idev)
 int ipv6_chk_addr(struct net *net, const struct in6_addr *addr,
 		  const struct net_device *dev, int strict)
 {
-	return ipv6_chk_addr_and_flags(net, addr, dev, strict, IFA_F_TENTATIVE);
+	return ipv6_chk_addr_and_flags(net, addr, dev, !dev,
+				       strict, IFA_F_TENTATIVE);
 }
 EXPORT_SYMBOL(ipv6_chk_addr);
 
+/* device argument is used to find the L3 domain of interest. If
+ * skip_dev_check is set, then the ifp device is not checked against
+ * the passed in dev argument. So the 2 cases for addresses checks are:
+ *   1. does the address exist in the L3 domain that dev is part of
+ *      (skip_dev_check = true), or
+ *
+ *   2. does the address exist on the specific device
+ *      (skip_dev_check = false)
+ */
 int ipv6_chk_addr_and_flags(struct net *net, const struct in6_addr *addr,
-			    const struct net_device *dev, int strict,
-			    u32 banned_flags)
+			    const struct net_device *dev, bool skip_dev_check,
+			    int strict, u32 banned_flags)
 {
 	unsigned int hash = inet6_addr_hash(net, addr);
+	const struct net_device *l3mdev;
 	struct inet6_ifaddr *ifp;
 	u32 ifp_flags;
 
 	rcu_read_lock();
+
+	l3mdev = l3mdev_master_dev_rcu(dev);
+
 	hlist_for_each_entry_rcu(ifp, &inet6_addr_lst[hash], addr_lst) {
 		if (!net_eq(dev_net(ifp->idev->dev), net))
 			continue;
+
+		if (l3mdev_master_dev_rcu(ifp->idev->dev) != l3mdev)
+			continue;
+
 		/* Decouple optimistic from tentative for evaluation here.
 		 * Ban optimistic addresses explicitly, when required.
 		 */
@@ -1875,7 +1893,7 @@  int ipv6_chk_addr_and_flags(struct net *net, const struct in6_addr *addr,
 			    : ifp->flags;
 		if (ipv6_addr_equal(&ifp->addr, addr) &&
 		    !(ifp_flags&banned_flags) &&
-		    (!dev || ifp->idev->dev == dev ||
+		    (skip_dev_check || ifp->idev->dev == dev ||
 		     !(ifp->scope&(IFA_LINK|IFA_HOST) || strict))) {
 			rcu_read_unlock();
 			return 1;
diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c
index c61718dba2e6..d580d4d456a5 100644
--- a/net/ipv6/anycast.c
+++ b/net/ipv6/anycast.c
@@ -66,7 +66,11 @@  int ipv6_sock_ac_join(struct sock *sk, int ifindex, const struct in6_addr *addr)
 		return -EPERM;
 	if (ipv6_addr_is_multicast(addr))
 		return -EINVAL;
-	if (ipv6_chk_addr(net, addr, NULL, 0))
+
+	if (ifindex)
+		dev = __dev_get_by_index(net, ifindex);
+
+	if (ipv6_chk_addr_and_flags(net, addr, dev, true, 0, IFA_F_TENTATIVE))
 		return -EINVAL;
 
 	pac = sock_kmalloc(sk, sizeof(struct ipv6_ac_socklist), GFP_KERNEL);
@@ -90,8 +94,7 @@  int ipv6_sock_ac_join(struct sock *sk, int ifindex, const struct in6_addr *addr)
 			dev = __dev_get_by_flags(net, IFF_UP,
 						 IFF_UP | IFF_LOOPBACK);
 		}
-	} else
-		dev = __dev_get_by_index(net, ifindex);
+	}
 
 	if (!dev) {
 		err = -ENODEV;
diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
index fbf08ce3f5ab..b27333d7b099 100644
--- a/net/ipv6/datagram.c
+++ b/net/ipv6/datagram.c
@@ -801,8 +801,9 @@  int ip6_datagram_send_ctl(struct net *net, struct sock *sk,
 			if (addr_type != IPV6_ADDR_ANY) {
 				int strict = __ipv6_addr_src_scope(addr_type) <= IPV6_ADDR_SCOPE_LINKLOCAL;
 				if (!(inet_sk(sk)->freebind || inet_sk(sk)->transparent) &&
-				    !ipv6_chk_addr(net, &src_info->ipi6_addr,
-						   strict ? dev : NULL, 0) &&
+				    !ipv6_chk_addr_and_flags(net, &src_info->ipi6_addr,
+							     dev, !strict, 0,
+							     IFA_F_TENTATIVE) &&
 				    !ipv6_chk_acast_addr_src(net, dev,
 							     &src_info->ipi6_addr))
 					err = -EINVAL;
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index 56c4967f1868..1ce8244e8aee 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -758,9 +758,11 @@  int ip6_tnl_rcv_ctl(struct ip6_tnl *t,
 			ldev = dev_get_by_index_rcu(net, p->link);
 
 		if ((ipv6_addr_is_multicast(laddr) ||
-		     likely(ipv6_chk_addr(net, laddr, ldev, 0))) &&
+		     likely(ipv6_chk_addr_and_flags(net, laddr, ldev, false,
+						    0, IFA_F_TENTATIVE))) &&
 		    ((p->flags & IP6_TNL_F_ALLOW_LOCAL_REMOTE) ||
-		     likely(!ipv6_chk_addr(net, raddr, NULL, 0))))
+		     likely(!ipv6_chk_addr_and_flags(net, raddr, ldev, true,
+						     0, IFA_F_TENTATIVE))))
 			ret = 1;
 	}
 	return ret;
@@ -990,12 +992,14 @@  int ip6_tnl_xmit_ctl(struct ip6_tnl *t,
 		if (p->link)
 			ldev = dev_get_by_index_rcu(net, p->link);
 
-		if (unlikely(!ipv6_chk_addr(net, laddr, ldev, 0)))
+		if (unlikely(!ipv6_chk_addr_and_flags(net, laddr, ldev, false,
+						      0, IFA_F_TENTATIVE)))
 			pr_warn("%s xmit: Local address not yet configured!\n",
 				p->name);
 		else if (!(p->flags & IP6_TNL_F_ALLOW_LOCAL_REMOTE) &&
 			 !ipv6_addr_is_multicast(raddr) &&
-			 unlikely(ipv6_chk_addr(net, raddr, NULL, 0)))
+			 unlikely(ipv6_chk_addr_and_flags(net, raddr, ldev,
+							  true, 0, IFA_F_TENTATIVE)))
 			pr_warn("%s xmit: Routing loop! Remote address found on this node!\n",
 				p->name);
 		else
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 0a19ce3a6f7f..13bf775c7f1a 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -707,7 +707,7 @@  static void ndisc_solicit(struct neighbour *neigh, struct sk_buff *skb)
 	int probes = atomic_read(&neigh->probes);
 
 	if (skb && ipv6_chk_addr_and_flags(dev_net(dev), &ipv6_hdr(skb)->saddr,
-					   dev, 1,
+					   dev, false, 1,
 					   IFA_F_TENTATIVE|IFA_F_OPTIMISTIC))
 		saddr = &ipv6_hdr(skb)->saddr;
 	probes -= NEIGH_VAR(neigh->parms, UCAST_PROBES);
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 3851c3ccfd7a..bbc62799eb3b 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -2633,18 +2633,25 @@  static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
 	const struct in6_addr *gw_addr = &cfg->fc_gateway;
 	int gwa_type = ipv6_addr_type(gw_addr);
 	const struct net_device *dev = *_dev;
+	bool need_local_addr_check = !dev;
 	int err = -EINVAL;
 
-	/* if gw_addr is local we will fail to detect this in case
-	 * address is still TENTATIVE (DAD in progress). rt6_lookup()
-	 * will return already-added prefix route via interface that
-	 * prefix route was assigned to, which might be non-loopback.
+	/* if route spec contains the device, check if gateway address
+	 * is a local address in the same L3 domain
 	 */
-	if (ipv6_chk_addr_and_flags(net, gw_addr,
-				    gwa_type & IPV6_ADDR_LINKLOCAL ?
-				    dev : NULL, 0, 0)) {
-		NL_SET_ERR_MSG(extack, "Invalid gateway address");
-		goto out;
+	if (dev) {
+		/* if gw_addr is local we will fail to detect this in case
+		 * address is still TENTATIVE (DAD in progress). rt6_lookup()
+		 * will return already-added prefix route via interface that
+		 * prefix route was assigned to, which might be non-loopback.
+		 */
+		if (ipv6_chk_addr_and_flags(net, gw_addr, dev,
+					    gwa_type & IPV6_ADDR_LINKLOCAL ?
+					    false : true, 0, 0)) {
+			NL_SET_ERR_MSG(extack,
+				       "Gateway can not be a local address");
+			goto out;
+		}
 	}
 
 	if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) {
@@ -2683,6 +2690,18 @@  static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
 			       "Egress device can not be loopback device for this route");
 		goto out;
 	}
+
+	/* if we did not check gw_addr above, do so now that the
+	 * egress device has been resolved.
+	 */
+	if (need_local_addr_check &&
+	    ipv6_chk_addr_and_flags(net, gw_addr, dev,
+				    gwa_type & IPV6_ADDR_LINKLOCAL ?
+				    false : true, 0, 0)) {
+		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
+		goto out;
+	}
+
 	err = 0;
 out:
 	return err;