diff mbox

[net-next,03/13] net: l3mdev: Allow send on enslaved interface

Message ID 1462419210-10463-4-git-send-email-dsa@cumulusnetworks.com
State Changes Requested, archived
Delegated to: David Miller
Headers show

Commit Message

David Ahern May 5, 2016, 3:33 a.m. UTC
Allow udp and raw sockets to send by oif that is an enslaved interface
versus the l3mdev/VRF device. For example, this allows BFD to use ifindex
from IP_PKTINFO on a receive to send a response without the need to
convert to the VRF index. It also allows ping and ping6 to work when
specifying an enslaved interface (e.g., ping -I swp1 <ip>) which is
a natural use case.

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
---
 drivers/net/vrf.c   |  2 ++
 net/ipv4/route.c    |  4 ++++
 net/l3mdev/l3mdev.c | 20 +++++++++++++++-----
 3 files changed, 21 insertions(+), 5 deletions(-)

Comments

Julian Anastasov May 5, 2016, 7:40 a.m. UTC | #1
Hello,

On Wed, 4 May 2016, David Ahern wrote:

> Allow udp and raw sockets to send by oif that is an enslaved interface
> versus the l3mdev/VRF device. For example, this allows BFD to use ifindex
> from IP_PKTINFO on a receive to send a response without the need to
> convert to the VRF index. It also allows ping and ping6 to work when
> specifying an enslaved interface (e.g., ping -I swp1 <ip>) which is
> a natural use case.
> 
> Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
> ---
>  drivers/net/vrf.c   |  2 ++
>  net/ipv4/route.c    |  4 ++++
>  net/l3mdev/l3mdev.c | 20 +++++++++++++++-----
>  3 files changed, 21 insertions(+), 5 deletions(-)
> 

> diff --git a/net/ipv4/route.c b/net/ipv4/route.c
> index 8c8c655bb2c4..a1f2830d8110 100644
> --- a/net/ipv4/route.c
> +++ b/net/ipv4/route.c
> @@ -2146,6 +2146,7 @@ struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
>  	unsigned int flags = 0;
>  	struct fib_result res;
>  	struct rtable *rth;
> +	int master_idx;
>  	int orig_oif;
>  	int err = -ENETUNREACH;
>  
> @@ -2155,6 +2156,9 @@ struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
>  
>  	orig_oif = fl4->flowi4_oif;
>  
> +	master_idx = l3mdev_master_ifindex_by_index(net, fl4->flowi4_oif);
> +	if (master_idx)
> +		fl4->flowi4_oif = master_idx;

	Changing flowi4_oif at this point can have
bad effects. I remember a recent commit for __mkroute_output
where route caching is disabled if traffic is redirected
to loopback. I think such a change can affect route
caching; for example, we now use the nexthop on master_idx to
cache routes for orig_oif. In the past, such caching problems
always caused lookups to return the wrong cached result
for other users. But these are only my fears; I don't know
the actual result of this change. Maybe you are trying to
change flowi4_oif in one place instead of in every caller.

Regards

--
Julian Anastasov <ja@ssi.bg>
David Ahern May 5, 2016, 2:50 p.m. UTC | #2
On 5/5/16 1:40 AM, Julian Anastasov wrote:
>> diff --git a/net/ipv4/route.c b/net/ipv4/route.c
>> index 8c8c655bb2c4..a1f2830d8110 100644
>> --- a/net/ipv4/route.c
>> +++ b/net/ipv4/route.c
>> @@ -2146,6 +2146,7 @@ struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
>>   	unsigned int flags = 0;
>>   	struct fib_result res;
>>   	struct rtable *rth;
>> +	int master_idx;
>>   	int orig_oif;
>>   	int err = -ENETUNREACH;
>>
>> @@ -2155,6 +2156,9 @@ struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
>>
>>   	orig_oif = fl4->flowi4_oif;
>>
>> +	master_idx = l3mdev_master_ifindex_by_index(net, fl4->flowi4_oif);
>> +	if (master_idx)
>> +		fl4->flowi4_oif = master_idx;
>
> 	Changing flowi4_oif at this point can have
> bad effects. I remember a recent commit for __mkroute_output
> where route caching is disabled if traffic is redirected
> to loopback. I think such a change can affect route
> caching; for example, we now use the nexthop on master_idx to
> cache routes for orig_oif. In the past, such caching problems
> always caused lookups to return the wrong cached result
> for other users. But these are only my fears; I don't know
> the actual result of this change. Maybe you are trying to
> change flowi4_oif in one place instead of in every caller.

Yes. VRFs require the oif to be the master index so that the FIB rules 
direct the lookup to the proper table. Without it we get the wrong result.
diff mbox

Patch

diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c
index 2f2aac1b598f..3a04b8cac757 100644
--- a/drivers/net/vrf.c
+++ b/drivers/net/vrf.c
@@ -646,6 +646,8 @@  static int vrf_get_saddr(struct net_device *dev, struct flowi4 *fl4)
 
 	fl4->flowi4_flags |= FLOWI_FLAG_SKIP_NH_OIF;
 	fl4->flowi4_iif = LOOPBACK_IFINDEX;
+	/* make sure oif is set to VRF device for lookup */
+	fl4->flowi4_oif = dev->ifindex;
 	fl4->flowi4_tos = tos & IPTOS_RT_MASK;
 	fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
 			     RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 8c8c655bb2c4..a1f2830d8110 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -2146,6 +2146,7 @@  struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
 	unsigned int flags = 0;
 	struct fib_result res;
 	struct rtable *rth;
+	int master_idx;
 	int orig_oif;
 	int err = -ENETUNREACH;
 
@@ -2155,6 +2156,9 @@  struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
 
 	orig_oif = fl4->flowi4_oif;
 
+	master_idx = l3mdev_master_ifindex_by_index(net, fl4->flowi4_oif);
+	if (master_idx)
+		fl4->flowi4_oif = master_idx;
 	fl4->flowi4_iif = LOOPBACK_IFINDEX;
 	fl4->flowi4_tos = tos & IPTOS_RT_MASK;
 	fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
diff --git a/net/l3mdev/l3mdev.c b/net/l3mdev/l3mdev.c
index 0fe4211e646f..0fd8cc1417cd 100644
--- a/net/l3mdev/l3mdev.c
+++ b/net/l3mdev/l3mdev.c
@@ -112,12 +112,19 @@  struct dst_entry *l3mdev_get_rt6_dst(struct net *net,
 	struct dst_entry *dst = NULL;
 	struct net_device *dev;
 
-	dev = dev_get_by_index(net, fl6->flowi6_oif);
-	if (dev) {
-		if (netif_is_l3_master(dev) &&
-		    dev->l3mdev_ops->l3mdev_get_rt6_dst)
+	if (fl6->flowi6_oif) {
+		rcu_read_lock();
+
+		dev = dev_get_by_index_rcu(net, fl6->flowi6_oif);
+		if (dev && netif_is_l3_slave(dev))
+			dev = netdev_master_upper_dev_get_rcu(dev);
+
+		if (dev && netif_is_l3_master(dev) &&
+		    dev->l3mdev_ops->l3mdev_get_rt6_dst) {
 			dst = dev->l3mdev_ops->l3mdev_get_rt6_dst(dev, fl6);
-		dev_put(dev);
+		}
+
+		rcu_read_unlock();
 	}
 
 	return dst;
@@ -141,6 +148,9 @@  int l3mdev_get_saddr(struct net *net, int ifindex, struct flowi4 *fl4)
 		rcu_read_lock();
 
 		dev = dev_get_by_index_rcu(net, ifindex);
+		if (dev && netif_is_l3_slave(dev))
+			dev = netdev_master_upper_dev_get_rcu(dev);
+
 		if (dev && netif_is_l3_master(dev) &&
 		    dev->l3mdev_ops->l3mdev_get_saddr) {
 			rc = dev->l3mdev_ops->l3mdev_get_saddr(dev, fl4);