diff mbox

[net,v3] openvswitch: Fix egress tunnel info.

Message ID 1445563036-9623-1-git-send-email-pshelar@nicira.com
State Accepted, archived
Delegated to: David Miller
Headers show

Commit Message

Pravin B Shelar Oct. 23, 2015, 1:17 a.m. UTC
While transitioning to netdev based vport we broke OVS
feature which allows user to retrieve tunnel packet egress
information for lwtunnel devices.  Following patch fixes it
by introducing ndo operation to get the tunnel egress info.
Same ndo operation can be used for lwtunnel devices and compat
ovs-tnl-vport devices. So after adding such device operation
we can remove similar operation from ovs-vport.

Fixes: 614732eaa12d ("openvswitch: Use regular VXLAN net_device device").
Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
--
v2-v3:
- Remove unused tun_info
v1-v2:
- changed ndo operation name to ndo_fill_metadata_dst()
- Fix geneve stats update
---
 drivers/net/geneve.c           |   40 ++++++++++++++++++++++-----
 drivers/net/vxlan.c            |   41 ++++++++++++++++++++++++++++
 include/linux/netdevice.h      |    7 +++++
 include/net/dst_metadata.h     |   32 ++++++++++++++++++++++
 net/core/dev.c                 |   27 ++++++++++++++++++
 net/ipv4/ip_gre.c              |   46 +++++++++++++++++++++++++------
 net/openvswitch/actions.c      |    9 ++----
 net/openvswitch/datapath.c     |    5 +--
 net/openvswitch/datapath.h     |    1 -
 net/openvswitch/flow_netlink.c |   18 +++++-------
 net/openvswitch/flow_netlink.h |    6 ++--
 net/openvswitch/vport-geneve.c |   13 ---------
 net/openvswitch/vport-gre.c    |    8 -----
 net/openvswitch/vport-vxlan.c  |   19 -------------
 net/openvswitch/vport.c        |   58 ----------------------------------------
 net/openvswitch/vport.h        |   35 ------------------------
 16 files changed, 192 insertions(+), 173 deletions(-)

Comments

David Miller Oct. 23, 2015, 2:39 a.m. UTC | #1
From: Pravin B Shelar <pshelar@nicira.com>
Date: Thu, 22 Oct 2015 18:17:16 -0700

> While transitioning to netdev based vport we broke OVS
> feature which allows user to retrieve tunnel packet egress
> information for lwtunnel devices.  Following patch fixes it
> by introducing ndo operation to get the tunnel egress info.
> Same ndo operation can be used for lwtunnel devices and compat
> ovs-tnl-vport devices. So after adding such device operation
> we can remove similar operation from ovs-vport.
> 
> Fixes: 614732eaa12d ("openvswitch: Use regular VXLAN net_device device").
> Signed-off-by: Pravin B Shelar <pshelar@nicira.com>

Applied, thanks.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jiri Benc Oct. 23, 2015, 12:17 p.m. UTC | #2
On Thu, 22 Oct 2015 18:17:16 -0700, Pravin B Shelar wrote:
> While transitioning to netdev based vport we broke OVS
> feature which allows user to retrieve tunnel packet egress
> information for lwtunnel devices.  Following patch fixes it
> by introducing ndo operation to get the tunnel egress info.
> Same ndo operation can be used for lwtunnel devices and compat
> ovs-tnl-vport devices. So after adding such device operation
> we can remove similar operation from ovs-vport.
> 
> Fixes: 614732eaa12d ("openvswitch: Use regular VXLAN net_device device").
> Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
> --
> v2-v3:
> - Remove unused tun_info
> v1-v2:
> - changed ndo operation name to ndo_fill_metadata_dst()
> - Fix geneve stats update

This looks good overall, thanks. I see some issues with the patch but
most of it can be fixed in net-next.git. See below.

[...]
> --- a/drivers/net/vxlan.c
> +++ b/drivers/net/vxlan.c
> @@ -2337,6 +2337,46 @@ static int vxlan_change_mtu(struct net_device *dev, int new_mtu)
>  	return 0;
>  }
>  
> +static int egress_ipv4_tun_info(struct net_device *dev, struct sk_buff *skb,
> +				struct ip_tunnel_info *info,
> +				__be16 sport, __be16 dport)
> +{
> +	struct vxlan_dev *vxlan = netdev_priv(dev);
> +	struct rtable *rt;
> +	struct flowi4 fl4;
> +
> +	memset(&fl4, 0, sizeof(fl4));
> +	fl4.flowi4_tos = RT_TOS(info->key.tos);
> +	fl4.flowi4_mark = skb->mark;
> +	fl4.flowi4_proto = IPPROTO_UDP;
> +	fl4.daddr = info->key.u.ipv4.dst;
> +
> +	rt = ip_route_output_key(vxlan->net, &fl4);
> +	if (IS_ERR(rt))
> +		return PTR_ERR(rt);
> +	ip_rt_put(rt);
> +
> +	info->key.u.ipv4.src = fl4.saddr;
> +	info->key.tp_src = sport;
> +	info->key.tp_dst = dport;
> +	return 0;
> +}

Do you plan to address the introduced code duplication for net-next.git?

> +
> +static int vxlan_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
> +{
> +	struct vxlan_dev *vxlan = netdev_priv(dev);
> +	struct ip_tunnel_info *info = skb_tunnel_info(skb);
> +	__be16 sport, dport;
> +
> +	sport = udp_flow_src_port(dev_net(dev), skb, vxlan->cfg.port_min,
> +				  vxlan->cfg.port_max, true);
> +	dport = info->key.tp_dst ? : vxlan->cfg.dst_port;
> +
> +	if (ip_tunnel_info_af(info) == AF_INET)
> +		return egress_ipv4_tun_info(dev, skb, info, sport, dport);
> +	return -EINVAL;

What about IPv6? There's IPv6 support for metadata based vxlan in
net.git, thus this should have IPv6 support, too. But then, this is
currently used only by ovs which got the IPv6 support only in
net-next.git, thus it may be enough to fix it there.

[...]
> --- a/include/net/dst_metadata.h
> +++ b/include/net/dst_metadata.h
[...]
> +static inline struct ip_tunnel_info *skb_tunnel_info_unclone(struct sk_buff *skb)
> +{
> +	struct metadata_dst *dst;
> +
> +	dst = tun_dst_unclone(skb);
> +	if (IS_ERR(dst))
> +		return NULL;
> +
> +	return &dst->u.tun_info;
> +}

This doesn't do what the name suggests and is, actually, ovs specific.
The ip_tunnel_info can be provided as a part of lwtstate and this
function should handle that case, too. This is not a problem for
net.git, as the function just returns EINVAL in such case, but should
be addressed for net-next.git. As ovs is currently the only user, I'd
be also fine with just a comment stating that, so it's clear for future
users of this function that it needs to be extended before it can be
used out of ovs.

[...]
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -99,6 +99,7 @@
>  #include <linux/rtnetlink.h>
>  #include <linux/stat.h>
>  #include <net/dst.h>
> +#include <net/dst_metadata.h>
>  #include <net/pkt_sched.h>
>  #include <net/checksum.h>
>  #include <net/xfrm.h>
> @@ -682,6 +683,32 @@ int dev_get_iflink(const struct net_device *dev)
>  EXPORT_SYMBOL(dev_get_iflink);
>  
>  /**
> + *	dev_fill_metadata_dst - Retrieve tunnel egress information.
> + *	@dev: targeted interface
> + *	@skb: The packet.
> + *
> + *	For better visibility of tunnel traffic OVS needs to retrieve
> + *	egress tunnel information for a packet. Following API allows
> + *	user to get this info.
> + */
> +int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
> +{
> +	struct ip_tunnel_info *info;
> +
> +	if (!dev->netdev_ops  || !dev->netdev_ops->ndo_fill_metadata_dst)
> +		return -EINVAL;
> +
> +	info = skb_tunnel_info_unclone(skb);
> +	if (!info)
> +		return -ENOMEM;

ENOMEM is a wrong error code to return. skb_tunnel_info_unclone should
return the error code returned by tun_dst_unclone, in particular the
EINVAL case which will be much more common than the ENOMEM case.

> +	if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX)))
> +		return -EINVAL;

It would be much better to check the mode before copying the metadata.

[...]
> --- a/net/openvswitch/flow_netlink.c
> +++ b/net/openvswitch/flow_netlink.c
[...]
> @@ -749,13 +749,12 @@ static int ipv4_tun_to_nlattr(struct sk_buff *skb,
>  	return 0;
>  }
>  
> -int ovs_nla_put_egress_tunnel_key(struct sk_buff *skb,
> -				  const struct ip_tunnel_info *egress_tun_info,
> -				  const void *egress_tun_opts)
> +int ovs_nla_put_tunnel_info(struct sk_buff *skb,
> +			    struct ip_tunnel_info *tun_info)
>  {
> -	return __ipv4_tun_to_nlattr(skb, &egress_tun_info->key,
> -				    egress_tun_opts,
> -				    egress_tun_info->options_len);
> +	return __ipv4_tun_to_nlattr(skb, &tun_info->key,
> +				    ip_tunnel_info_opts(tun_info),
> +				    tun_info->options_len);
>  }

This should at least check whether the tun_info is indeed IPv4. Actual
IPv6 support for this function can be added to net-next.git.

 Jiri
Pravin B Shelar Oct. 23, 2015, 5:30 p.m. UTC | #3
On Fri, Oct 23, 2015 at 5:17 AM, Jiri Benc <jbenc@redhat.com> wrote:
> On Thu, 22 Oct 2015 18:17:16 -0700, Pravin B Shelar wrote:
>> While transitioning to netdev based vport we broke OVS
>> feature which allows user to retrieve tunnel packet egress
>> information for lwtunnel devices.  Following patch fixes it
>> by introducing ndo operation to get the tunnel egress info.
>> Same ndo operation can be used for lwtunnel devices and compat
>> ovs-tnl-vport devices. So after adding such device operation
>> we can remove similar operation from ovs-vport.
>>
>> Fixes: 614732eaa12d ("openvswitch: Use regular VXLAN net_device device").
>> Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
>> --
>> v2-v3:
>> - Remove unused tun_info
>> v1-v2:
>> - changed ndo operation name to ndo_fill_metadata_dst()
>> - Fix geneve stats update
>
> This looks good overall, thanks. I see some issues with the patch but
> most of it can be fixed in net-next.git. See below.
>
Thanks for the review.

> [...]
>> --- a/drivers/net/vxlan.c
>> +++ b/drivers/net/vxlan.c
>> @@ -2337,6 +2337,46 @@ static int vxlan_change_mtu(struct net_device *dev, int new_mtu)
>>       return 0;
>>  }
>>
>> +static int egress_ipv4_tun_info(struct net_device *dev, struct sk_buff *skb,
>> +                             struct ip_tunnel_info *info,
>> +                             __be16 sport, __be16 dport)
>> +{
>> +     struct vxlan_dev *vxlan = netdev_priv(dev);
>> +     struct rtable *rt;
>> +     struct flowi4 fl4;
>> +
>> +     memset(&fl4, 0, sizeof(fl4));
>> +     fl4.flowi4_tos = RT_TOS(info->key.tos);
>> +     fl4.flowi4_mark = skb->mark;
>> +     fl4.flowi4_proto = IPPROTO_UDP;
>> +     fl4.daddr = info->key.u.ipv4.dst;
>> +
>> +     rt = ip_route_output_key(vxlan->net, &fl4);
>> +     if (IS_ERR(rt))
>> +             return PTR_ERR(rt);
>> +     ip_rt_put(rt);
>> +
>> +     info->key.u.ipv4.src = fl4.saddr;
>> +     info->key.tp_src = sport;
>> +     info->key.tp_dst = dport;
>> +     return 0;
>> +}
>
> Do you plan to address the introduced code duplication for net-next.git?
>
I see lot of refactoring scope for vxlan code even without this patch.
I am planing to address it in net-next.

>> +
>> +static int vxlan_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
>> +{
>> +     struct vxlan_dev *vxlan = netdev_priv(dev);
>> +     struct ip_tunnel_info *info = skb_tunnel_info(skb);
>> +     __be16 sport, dport;
>> +
>> +     sport = udp_flow_src_port(dev_net(dev), skb, vxlan->cfg.port_min,
>> +                               vxlan->cfg.port_max, true);
>> +     dport = info->key.tp_dst ? : vxlan->cfg.dst_port;
>> +
>> +     if (ip_tunnel_info_af(info) == AF_INET)
>> +             return egress_ipv4_tun_info(dev, skb, info, sport, dport);
>> +     return -EINVAL;
>
> What about IPv6? There's IPv6 support for metadata based vxlan in
> net.git, thus this should have IPv6 support, too. But then, this is
> currently used only by ovs which got the IPv6 support only in
> net-next.git, thus it may be enough to fix it there.
>
I did choose to implement only for ipv4 since it is pretty late for
fix so wanted to keep simple as possible. IPv6 support is not there
yet anyways.

> [...]
>> --- a/include/net/dst_metadata.h
>> +++ b/include/net/dst_metadata.h
> [...]
>> +static inline struct ip_tunnel_info *skb_tunnel_info_unclone(struct sk_buff *skb)
>> +{
>> +     struct metadata_dst *dst;
>> +
>> +     dst = tun_dst_unclone(skb);
>> +     if (IS_ERR(dst))
>> +             return NULL;
>> +
>> +     return &dst->u.tun_info;
>> +}
>
> This doesn't do what the name suggests and is, actually, ovs specific.
> The ip_tunnel_info can be provided as a part of lwtstate and this
> function should handle that case, too. This is not a problem for
> net.git, as the function just returns EINVAL in such case, but should
> be addressed for net-next.git. As ovs is currently the only user, I'd
> be also fine with just a comment stating that, so it's clear for future
> users of this function that it needs to be extended before it can be
> used out of ovs.
>

I considered lwstate, but I am reluctant to add this complexity
without a usecase.

> [...]
>> --- a/net/core/dev.c
>> +++ b/net/core/dev.c
>> @@ -99,6 +99,7 @@
>>  #include <linux/rtnetlink.h>
>>  #include <linux/stat.h>
>>  #include <net/dst.h>
>> +#include <net/dst_metadata.h>
>>  #include <net/pkt_sched.h>
>>  #include <net/checksum.h>
>>  #include <net/xfrm.h>
>> @@ -682,6 +683,32 @@ int dev_get_iflink(const struct net_device *dev)
>>  EXPORT_SYMBOL(dev_get_iflink);
>>
>>  /**
>> + *   dev_fill_metadata_dst - Retrieve tunnel egress information.
>> + *   @dev: targeted interface
>> + *   @skb: The packet.
>> + *
>> + *   For better visibility of tunnel traffic OVS needs to retrieve
>> + *   egress tunnel information for a packet. Following API allows
>> + *   user to get this info.
>> + */
>> +int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
>> +{
>> +     struct ip_tunnel_info *info;
>> +
>> +     if (!dev->netdev_ops  || !dev->netdev_ops->ndo_fill_metadata_dst)
>> +             return -EINVAL;
>> +
>> +     info = skb_tunnel_info_unclone(skb);
>> +     if (!info)
>> +             return -ENOMEM;
>
> ENOMEM is a wrong error code to return. skb_tunnel_info_unclone should
> return the error code returned by tun_dst_unclone, in particular the
> EINVAL case which will be much more common than the ENOMEM case.
>
I agree in general it is true, but this is only called in OVS case. In
that context ENOMEM is common error case than any other case. But I
see your point, I will send patch to fix the return code.


>> +     if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX)))
>> +             return -EINVAL;
>
> It would be much better to check the mode before copying the metadata.
>
This is pretty rare case, Thats why I would rather keep the code
simple and not to call skb_tunnel_info() and then
skb_tunnel_info_unclone() to optimize this case.

> [...]
>> --- a/net/openvswitch/flow_netlink.c
>> +++ b/net/openvswitch/flow_netlink.c
> [...]
>> @@ -749,13 +749,12 @@ static int ipv4_tun_to_nlattr(struct sk_buff *skb,
>>       return 0;
>>  }
>>
>> -int ovs_nla_put_egress_tunnel_key(struct sk_buff *skb,
>> -                               const struct ip_tunnel_info *egress_tun_info,
>> -                               const void *egress_tun_opts)
>> +int ovs_nla_put_tunnel_info(struct sk_buff *skb,
>> +                         struct ip_tunnel_info *tun_info)
>>  {
>> -     return __ipv4_tun_to_nlattr(skb, &egress_tun_info->key,
>> -                                 egress_tun_opts,
>> -                                 egress_tun_info->options_len);
>> +     return __ipv4_tun_to_nlattr(skb, &tun_info->key,
>> +                                 ip_tunnel_info_opts(tun_info),
>> +                                 tun_info->options_len);
>>  }
>
> This should at least check whether the tun_info is indeed IPv4. Actual
> IPv6 support for this function can be added to net-next.git.
>
net tree only supports IPv4 tunnels. I am not sure value of this
check, specially since we need differ changes on net-next.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c
index cde29f8..445071c 100644
--- a/drivers/net/geneve.c
+++ b/drivers/net/geneve.c
@@ -594,14 +594,12 @@  static struct rtable *geneve_get_rt(struct sk_buff *skb,
 	rt = ip_route_output_key(geneve->net, fl4);
 	if (IS_ERR(rt)) {
 		netdev_dbg(dev, "no route to %pI4\n", &fl4->daddr);
-		dev->stats.tx_carrier_errors++;
-		return rt;
+		return ERR_PTR(-ENETUNREACH);
 	}
 	if (rt->dst.dev == dev) { /* is this necessary? */
 		netdev_dbg(dev, "circular route to %pI4\n", &fl4->daddr);
-		dev->stats.collisions++;
 		ip_rt_put(rt);
-		return ERR_PTR(-EINVAL);
+		return ERR_PTR(-ELOOP);
 	}
 	return rt;
 }
@@ -627,12 +625,12 @@  static netdev_tx_t geneve_xmit(struct sk_buff *skb, struct net_device *dev)
 	struct ip_tunnel_info *info = NULL;
 	struct rtable *rt = NULL;
 	const struct iphdr *iip; /* interior IP header */
+	int err = -EINVAL;
 	struct flowi4 fl4;
 	__u8 tos, ttl;
 	__be16 sport;
 	bool udp_csum;
 	__be16 df;
-	int err;
 
 	if (geneve->collect_md) {
 		info = skb_tunnel_info(skb);
@@ -647,7 +645,7 @@  static netdev_tx_t geneve_xmit(struct sk_buff *skb, struct net_device *dev)
 	rt = geneve_get_rt(skb, dev, &fl4, info);
 	if (IS_ERR(rt)) {
 		netdev_dbg(dev, "no route to %pI4\n", &fl4.daddr);
-		dev->stats.tx_carrier_errors++;
+		err = PTR_ERR(rt);
 		goto tx_error;
 	}
 
@@ -699,10 +697,37 @@  static netdev_tx_t geneve_xmit(struct sk_buff *skb, struct net_device *dev)
 tx_error:
 	dev_kfree_skb(skb);
 err:
-	dev->stats.tx_errors++;
+	if (err == -ELOOP)
+		dev->stats.collisions++;
+	else if (err == -ENETUNREACH)
+		dev->stats.tx_carrier_errors++;
+	else
+		dev->stats.tx_errors++;
 	return NETDEV_TX_OK;
 }
 
+static int geneve_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
+{
+	struct ip_tunnel_info *info = skb_tunnel_info(skb);
+	struct geneve_dev *geneve = netdev_priv(dev);
+	struct rtable *rt;
+	struct flowi4 fl4;
+
+	if (ip_tunnel_info_af(info) != AF_INET)
+		return -EINVAL;
+
+	rt = geneve_get_rt(skb, dev, &fl4, info);
+	if (IS_ERR(rt))
+		return PTR_ERR(rt);
+
+	ip_rt_put(rt);
+	info->key.u.ipv4.src = fl4.saddr;
+	info->key.tp_src = udp_flow_src_port(geneve->net, skb,
+					     1, USHRT_MAX, true);
+	info->key.tp_dst = geneve->dst_port;
+	return 0;
+}
+
 static const struct net_device_ops geneve_netdev_ops = {
 	.ndo_init		= geneve_init,
 	.ndo_uninit		= geneve_uninit,
@@ -713,6 +738,7 @@  static const struct net_device_ops geneve_netdev_ops = {
 	.ndo_change_mtu		= eth_change_mtu,
 	.ndo_validate_addr	= eth_validate_addr,
 	.ndo_set_mac_address	= eth_mac_addr,
+	.ndo_fill_metadata_dst	= geneve_fill_metadata_dst,
 };
 
 static void geneve_get_drvinfo(struct net_device *dev,
diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index afdc65f..c1587ec 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -2337,6 +2337,46 @@  static int vxlan_change_mtu(struct net_device *dev, int new_mtu)
 	return 0;
 }
 
+static int egress_ipv4_tun_info(struct net_device *dev, struct sk_buff *skb,
+				struct ip_tunnel_info *info,
+				__be16 sport, __be16 dport)
+{
+	struct vxlan_dev *vxlan = netdev_priv(dev);
+	struct rtable *rt;
+	struct flowi4 fl4;
+
+	memset(&fl4, 0, sizeof(fl4));
+	fl4.flowi4_tos = RT_TOS(info->key.tos);
+	fl4.flowi4_mark = skb->mark;
+	fl4.flowi4_proto = IPPROTO_UDP;
+	fl4.daddr = info->key.u.ipv4.dst;
+
+	rt = ip_route_output_key(vxlan->net, &fl4);
+	if (IS_ERR(rt))
+		return PTR_ERR(rt);
+	ip_rt_put(rt);
+
+	info->key.u.ipv4.src = fl4.saddr;
+	info->key.tp_src = sport;
+	info->key.tp_dst = dport;
+	return 0;
+}
+
+static int vxlan_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
+{
+	struct vxlan_dev *vxlan = netdev_priv(dev);
+	struct ip_tunnel_info *info = skb_tunnel_info(skb);
+	__be16 sport, dport;
+
+	sport = udp_flow_src_port(dev_net(dev), skb, vxlan->cfg.port_min,
+				  vxlan->cfg.port_max, true);
+	dport = info->key.tp_dst ? : vxlan->cfg.dst_port;
+
+	if (ip_tunnel_info_af(info) == AF_INET)
+		return egress_ipv4_tun_info(dev, skb, info, sport, dport);
+	return -EINVAL;
+}
+
 static const struct net_device_ops vxlan_netdev_ops = {
 	.ndo_init		= vxlan_init,
 	.ndo_uninit		= vxlan_uninit,
@@ -2351,6 +2391,7 @@  static const struct net_device_ops vxlan_netdev_ops = {
 	.ndo_fdb_add		= vxlan_fdb_add,
 	.ndo_fdb_del		= vxlan_fdb_delete,
 	.ndo_fdb_dump		= vxlan_fdb_dump,
+	.ndo_fill_metadata_dst	= vxlan_fill_metadata_dst,
 };
 
 /* Info for udev, that this is a virtual tunnel endpoint */
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 2d15e38..210d11a 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1054,6 +1054,10 @@  typedef u16 (*select_queue_fallback_t)(struct net_device *dev,
  *	This function is used to pass protocol port error state information
  *	to the switch driver. The switch driver can react to the proto_down
  *      by doing a phys down on the associated switch port.
+ * int (*ndo_fill_metadata_dst)(struct net_device *dev, struct sk_buff *skb);
+ *	This function is used to get egress tunnel information for given skb.
+ *	This is useful for retrieving outer tunnel header parameters while
+ *	sampling packet.
  *
  */
 struct net_device_ops {
@@ -1227,6 +1231,8 @@  struct net_device_ops {
 	int			(*ndo_get_iflink)(const struct net_device *dev);
 	int			(*ndo_change_proto_down)(struct net_device *dev,
 							 bool proto_down);
+	int			(*ndo_fill_metadata_dst)(struct net_device *dev,
+						       struct sk_buff *skb);
 };
 
 /**
@@ -2203,6 +2209,7 @@  void dev_add_offload(struct packet_offload *po);
 void dev_remove_offload(struct packet_offload *po);
 
 int dev_get_iflink(const struct net_device *dev);
+int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb);
 struct net_device *__dev_get_by_flags(struct net *net, unsigned short flags,
 				      unsigned short mask);
 struct net_device *dev_get_by_name(struct net *net, const char *name);
diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h
index af9d538..ce00971 100644
--- a/include/net/dst_metadata.h
+++ b/include/net/dst_metadata.h
@@ -60,6 +60,38 @@  static inline struct metadata_dst *tun_rx_dst(int md_size)
 	return tun_dst;
 }
 
+static inline struct metadata_dst *tun_dst_unclone(struct sk_buff *skb)
+{
+	struct metadata_dst *md_dst = skb_metadata_dst(skb);
+	int md_size = md_dst->u.tun_info.options_len;
+	struct metadata_dst *new_md;
+
+	if (!md_dst)
+		return ERR_PTR(-EINVAL);
+
+	new_md = metadata_dst_alloc(md_size, GFP_ATOMIC);
+	if (!new_md)
+		return ERR_PTR(-ENOMEM);
+
+	memcpy(&new_md->u.tun_info, &md_dst->u.tun_info,
+	       sizeof(struct ip_tunnel_info) + md_size);
+	skb_dst_drop(skb);
+	dst_hold(&new_md->dst);
+	skb_dst_set(skb, &new_md->dst);
+	return new_md;
+}
+
+static inline struct ip_tunnel_info *skb_tunnel_info_unclone(struct sk_buff *skb)
+{
+	struct metadata_dst *dst;
+
+	dst = tun_dst_unclone(skb);
+	if (IS_ERR(dst))
+		return NULL;
+
+	return &dst->u.tun_info;
+}
+
 static inline struct metadata_dst *ip_tun_rx_dst(struct sk_buff *skb,
 						 __be16 flags,
 						 __be64 tunnel_id,
diff --git a/net/core/dev.c b/net/core/dev.c
index 6bb6470..c14748d 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -99,6 +99,7 @@ 
 #include <linux/rtnetlink.h>
 #include <linux/stat.h>
 #include <net/dst.h>
+#include <net/dst_metadata.h>
 #include <net/pkt_sched.h>
 #include <net/checksum.h>
 #include <net/xfrm.h>
@@ -682,6 +683,32 @@  int dev_get_iflink(const struct net_device *dev)
 EXPORT_SYMBOL(dev_get_iflink);
 
 /**
+ *	dev_fill_metadata_dst - Retrieve tunnel egress information.
+ *	@dev: targeted interface
+ *	@skb: The packet.
+ *
+ *	For better visibility of tunnel traffic OVS needs to retrieve
+ *	egress tunnel information for a packet. Following API allows
+ *	user to get this info.
+ */
+int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
+{
+	struct ip_tunnel_info *info;
+
+	if (!dev->netdev_ops  || !dev->netdev_ops->ndo_fill_metadata_dst)
+		return -EINVAL;
+
+	info = skb_tunnel_info_unclone(skb);
+	if (!info)
+		return -ENOMEM;
+	if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX)))
+		return -EINVAL;
+
+	return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb);
+}
+EXPORT_SYMBOL_GPL(dev_fill_metadata_dst);
+
+/**
  *	__dev_get_by_name	- find a device by its name
  *	@net: the applicable net namespace
  *	@name: name to find
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index bd0679d..6145214 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -498,10 +498,26 @@  static struct sk_buff *gre_handle_offloads(struct sk_buff *skb,
 					csum ? SKB_GSO_GRE_CSUM : SKB_GSO_GRE);
 }
 
+static struct rtable *gre_get_rt(struct sk_buff *skb,
+				 struct net_device *dev,
+				 struct flowi4 *fl,
+				 const struct ip_tunnel_key *key)
+{
+	struct net *net = dev_net(dev);
+
+	memset(fl, 0, sizeof(*fl));
+	fl->daddr = key->u.ipv4.dst;
+	fl->saddr = key->u.ipv4.src;
+	fl->flowi4_tos = RT_TOS(key->tos);
+	fl->flowi4_mark = skb->mark;
+	fl->flowi4_proto = IPPROTO_GRE;
+
+	return ip_route_output_key(net, fl);
+}
+
 static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev)
 {
 	struct ip_tunnel_info *tun_info;
-	struct net *net = dev_net(dev);
 	const struct ip_tunnel_key *key;
 	struct flowi4 fl;
 	struct rtable *rt;
@@ -516,14 +532,7 @@  static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev)
 		goto err_free_skb;
 
 	key = &tun_info->key;
-	memset(&fl, 0, sizeof(fl));
-	fl.daddr = key->u.ipv4.dst;
-	fl.saddr = key->u.ipv4.src;
-	fl.flowi4_tos = RT_TOS(key->tos);
-	fl.flowi4_mark = skb->mark;
-	fl.flowi4_proto = IPPROTO_GRE;
-
-	rt = ip_route_output_key(net, &fl);
+	rt = gre_get_rt(skb, dev, &fl, key);
 	if (IS_ERR(rt))
 		goto err_free_skb;
 
@@ -566,6 +575,24 @@  err_free_skb:
 	dev->stats.tx_dropped++;
 }
 
+static int gre_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
+{
+	struct ip_tunnel_info *info = skb_tunnel_info(skb);
+	struct rtable *rt;
+	struct flowi4 fl4;
+
+	if (ip_tunnel_info_af(info) != AF_INET)
+		return -EINVAL;
+
+	rt = gre_get_rt(skb, dev, &fl4, &info->key);
+	if (IS_ERR(rt))
+		return PTR_ERR(rt);
+
+	ip_rt_put(rt);
+	info->key.u.ipv4.src = fl4.saddr;
+	return 0;
+}
+
 static netdev_tx_t ipgre_xmit(struct sk_buff *skb,
 			      struct net_device *dev)
 {
@@ -1023,6 +1050,7 @@  static const struct net_device_ops gre_tap_netdev_ops = {
 	.ndo_change_mtu		= ip_tunnel_change_mtu,
 	.ndo_get_stats64	= ip_tunnel_get_stats64,
 	.ndo_get_iflink		= ip_tunnel_get_iflink,
+	.ndo_fill_metadata_dst	= gre_fill_metadata_dst,
 };
 
 static void ipgre_tap_setup(struct net_device *dev)
diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index c6a39bf..0bf0f40 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -768,7 +768,6 @@  static int output_userspace(struct datapath *dp, struct sk_buff *skb,
 			    struct sw_flow_key *key, const struct nlattr *attr,
 			    const struct nlattr *actions, int actions_len)
 {
-	struct ip_tunnel_info info;
 	struct dp_upcall_info upcall;
 	const struct nlattr *a;
 	int rem;
@@ -796,11 +795,9 @@  static int output_userspace(struct datapath *dp, struct sk_buff *skb,
 			if (vport) {
 				int err;
 
-				upcall.egress_tun_info = &info;
-				err = ovs_vport_get_egress_tun_info(vport, skb,
-								    &upcall);
-				if (err)
-					upcall.egress_tun_info = NULL;
+				err = dev_fill_metadata_dst(vport->dev, skb);
+				if (!err)
+					upcall.egress_tun_info = skb_tunnel_info(skb);
 			}
 
 			break;
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index b816ff8..c5d08ee 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -490,9 +490,8 @@  static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb,
 
 	if (upcall_info->egress_tun_info) {
 		nla = nla_nest_start(user_skb, OVS_PACKET_ATTR_EGRESS_TUN_KEY);
-		err = ovs_nla_put_egress_tunnel_key(user_skb,
-						    upcall_info->egress_tun_info,
-						    upcall_info->egress_tun_opts);
+		err = ovs_nla_put_tunnel_info(user_skb,
+					      upcall_info->egress_tun_info);
 		BUG_ON(err);
 		nla_nest_end(user_skb, nla);
 	}
diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h
index f88038a..67bdecd 100644
--- a/net/openvswitch/datapath.h
+++ b/net/openvswitch/datapath.h
@@ -117,7 +117,6 @@  struct ovs_skb_cb {
  */
 struct dp_upcall_info {
 	struct ip_tunnel_info *egress_tun_info;
-	const void *egress_tun_opts;
 	const struct nlattr *userdata;
 	const struct nlattr *actions;
 	int actions_len;
diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c
index bd710bc..38536c1 100644
--- a/net/openvswitch/flow_netlink.c
+++ b/net/openvswitch/flow_netlink.c
@@ -717,7 +717,7 @@  static int __ipv4_tun_to_nlattr(struct sk_buff *skb,
 	if ((output->tun_flags & TUNNEL_OAM) &&
 	    nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_OAM))
 		return -EMSGSIZE;
-	if (tun_opts) {
+	if (swkey_tun_opts_len) {
 		if (output->tun_flags & TUNNEL_GENEVE_OPT &&
 		    nla_put(skb, OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS,
 			    swkey_tun_opts_len, tun_opts))
@@ -749,13 +749,12 @@  static int ipv4_tun_to_nlattr(struct sk_buff *skb,
 	return 0;
 }
 
-int ovs_nla_put_egress_tunnel_key(struct sk_buff *skb,
-				  const struct ip_tunnel_info *egress_tun_info,
-				  const void *egress_tun_opts)
+int ovs_nla_put_tunnel_info(struct sk_buff *skb,
+			    struct ip_tunnel_info *tun_info)
 {
-	return __ipv4_tun_to_nlattr(skb, &egress_tun_info->key,
-				    egress_tun_opts,
-				    egress_tun_info->options_len);
+	return __ipv4_tun_to_nlattr(skb, &tun_info->key,
+				    ip_tunnel_info_opts(tun_info),
+				    tun_info->options_len);
 }
 
 static int metadata_from_nlattrs(struct net *net, struct sw_flow_match *match,
@@ -2383,10 +2382,7 @@  static int set_action_to_attr(const struct nlattr *a, struct sk_buff *skb)
 		if (!start)
 			return -EMSGSIZE;
 
-		err = ipv4_tun_to_nlattr(skb, &tun_info->key,
-					 tun_info->options_len ?
-					     ip_tunnel_info_opts(tun_info) : NULL,
-					 tun_info->options_len);
+		err = ovs_nla_put_tunnel_info(skb, tun_info);
 		if (err)
 			return err;
 		nla_nest_end(skb, start);
diff --git a/net/openvswitch/flow_netlink.h b/net/openvswitch/flow_netlink.h
index 6ca3f0b..47dd142 100644
--- a/net/openvswitch/flow_netlink.h
+++ b/net/openvswitch/flow_netlink.h
@@ -55,9 +55,9 @@  int ovs_nla_put_mask(const struct sw_flow *flow, struct sk_buff *skb);
 int ovs_nla_get_match(struct net *, struct sw_flow_match *,
 		      const struct nlattr *key, const struct nlattr *mask,
 		      bool log);
-int ovs_nla_put_egress_tunnel_key(struct sk_buff *,
-				  const struct ip_tunnel_info *,
-				  const void *egress_tun_opts);
+
+int ovs_nla_put_tunnel_info(struct sk_buff *skb,
+			    struct ip_tunnel_info *tun_info);
 
 bool ovs_nla_get_ufid(struct sw_flow_id *, const struct nlattr *, bool log);
 int ovs_nla_get_identifier(struct sw_flow_id *sfid, const struct nlattr *ufid,
diff --git a/net/openvswitch/vport-geneve.c b/net/openvswitch/vport-geneve.c
index 2735e9c..5f8aaaa 100644
--- a/net/openvswitch/vport-geneve.c
+++ b/net/openvswitch/vport-geneve.c
@@ -52,18 +52,6 @@  static int geneve_get_options(const struct vport *vport,
 	return 0;
 }
 
-static int geneve_get_egress_tun_info(struct vport *vport, struct sk_buff *skb,
-				      struct dp_upcall_info *upcall)
-{
-	struct geneve_port *geneve_port = geneve_vport(vport);
-	struct net *net = ovs_dp_get_net(vport->dp);
-	__be16 dport = htons(geneve_port->port_no);
-	__be16 sport = udp_flow_src_port(net, skb, 1, USHRT_MAX, true);
-
-	return ovs_tunnel_get_egress_info(upcall, ovs_dp_get_net(vport->dp),
-					  skb, IPPROTO_UDP, sport, dport);
-}
-
 static struct vport *geneve_tnl_create(const struct vport_parms *parms)
 {
 	struct net *net = ovs_dp_get_net(parms->dp);
@@ -130,7 +118,6 @@  static struct vport_ops ovs_geneve_vport_ops = {
 	.get_options	= geneve_get_options,
 	.send		= ovs_netdev_send,
 	.owner          = THIS_MODULE,
-	.get_egress_tun_info	= geneve_get_egress_tun_info,
 };
 
 static int __init ovs_geneve_tnl_init(void)
diff --git a/net/openvswitch/vport-gre.c b/net/openvswitch/vport-gre.c
index 4d24481..64225bf 100644
--- a/net/openvswitch/vport-gre.c
+++ b/net/openvswitch/vport-gre.c
@@ -84,18 +84,10 @@  static struct vport *gre_create(const struct vport_parms *parms)
 	return ovs_netdev_link(vport, parms->name);
 }
 
-static int gre_get_egress_tun_info(struct vport *vport, struct sk_buff *skb,
-				   struct dp_upcall_info *upcall)
-{
-	return ovs_tunnel_get_egress_info(upcall, ovs_dp_get_net(vport->dp),
-					  skb, IPPROTO_GRE, 0, 0);
-}
-
 static struct vport_ops ovs_gre_vport_ops = {
 	.type		= OVS_VPORT_TYPE_GRE,
 	.create		= gre_create,
 	.send		= ovs_netdev_send,
-	.get_egress_tun_info	= gre_get_egress_tun_info,
 	.destroy	= ovs_netdev_tunnel_destroy,
 	.owner		= THIS_MODULE,
 };
diff --git a/net/openvswitch/vport-vxlan.c b/net/openvswitch/vport-vxlan.c
index c11413d..e1c9c08 100644
--- a/net/openvswitch/vport-vxlan.c
+++ b/net/openvswitch/vport-vxlan.c
@@ -146,31 +146,12 @@  static struct vport *vxlan_create(const struct vport_parms *parms)
 	return ovs_netdev_link(vport, parms->name);
 }
 
-static int vxlan_get_egress_tun_info(struct vport *vport, struct sk_buff *skb,
-				     struct dp_upcall_info *upcall)
-{
-	struct vxlan_dev *vxlan = netdev_priv(vport->dev);
-	struct net *net = ovs_dp_get_net(vport->dp);
-	__be16 dst_port = vxlan_dev_dst_port(vxlan);
-	__be16 src_port;
-	int port_min;
-	int port_max;
-
-	inet_get_local_port_range(net, &port_min, &port_max);
-	src_port = udp_flow_src_port(net, skb, 0, 0, true);
-
-	return ovs_tunnel_get_egress_info(upcall, net,
-					  skb, IPPROTO_UDP,
-					  src_port, dst_port);
-}
-
 static struct vport_ops ovs_vxlan_netdev_vport_ops = {
 	.type			= OVS_VPORT_TYPE_VXLAN,
 	.create			= vxlan_create,
 	.destroy		= ovs_netdev_tunnel_destroy,
 	.get_options		= vxlan_get_options,
 	.send			= ovs_netdev_send,
-	.get_egress_tun_info	= vxlan_get_egress_tun_info,
 };
 
 static int __init ovs_vxlan_tnl_init(void)
diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c
index 12a36ac..320c765 100644
--- a/net/openvswitch/vport.c
+++ b/net/openvswitch/vport.c
@@ -479,61 +479,3 @@  void ovs_vport_deferred_free(struct vport *vport)
 	call_rcu(&vport->rcu, free_vport_rcu);
 }
 EXPORT_SYMBOL_GPL(ovs_vport_deferred_free);
-
-int ovs_tunnel_get_egress_info(struct dp_upcall_info *upcall,
-			       struct net *net,
-			       struct sk_buff *skb,
-			       u8 ipproto,
-			       __be16 tp_src,
-			       __be16 tp_dst)
-{
-	struct ip_tunnel_info *egress_tun_info = upcall->egress_tun_info;
-	const struct ip_tunnel_info *tun_info = skb_tunnel_info(skb);
-	const struct ip_tunnel_key *tun_key;
-	u32 skb_mark = skb->mark;
-	struct rtable *rt;
-	struct flowi4 fl;
-
-	if (unlikely(!tun_info))
-		return -EINVAL;
-	if (ip_tunnel_info_af(tun_info) != AF_INET)
-		return -EINVAL;
-
-	tun_key = &tun_info->key;
-
-	/* Route lookup to get srouce IP address.
-	 * The process may need to be changed if the corresponding process
-	 * in vports ops changed.
-	 */
-	rt = ovs_tunnel_route_lookup(net, tun_key, skb_mark, &fl, ipproto);
-	if (IS_ERR(rt))
-		return PTR_ERR(rt);
-
-	ip_rt_put(rt);
-
-	/* Generate egress_tun_info based on tun_info,
-	 * saddr, tp_src and tp_dst
-	 */
-	ip_tunnel_key_init(&egress_tun_info->key,
-			   fl.saddr, tun_key->u.ipv4.dst,
-			   tun_key->tos,
-			   tun_key->ttl,
-			   tp_src, tp_dst,
-			   tun_key->tun_id,
-			   tun_key->tun_flags);
-	egress_tun_info->options_len = tun_info->options_len;
-	egress_tun_info->mode = tun_info->mode;
-	upcall->egress_tun_opts = ip_tunnel_info_opts(egress_tun_info);
-	return 0;
-}
-EXPORT_SYMBOL_GPL(ovs_tunnel_get_egress_info);
-
-int ovs_vport_get_egress_tun_info(struct vport *vport, struct sk_buff *skb,
-				  struct dp_upcall_info *upcall)
-{
-	/* get_egress_tun_info() is only implemented on tunnel ports. */
-	if (unlikely(!vport->ops->get_egress_tun_info))
-		return -EINVAL;
-
-	return vport->ops->get_egress_tun_info(vport, skb, upcall);
-}
diff --git a/net/openvswitch/vport.h b/net/openvswitch/vport.h
index a413f3a..d341ad6 100644
--- a/net/openvswitch/vport.h
+++ b/net/openvswitch/vport.h
@@ -27,7 +27,6 @@ 
 #include <linux/skbuff.h>
 #include <linux/spinlock.h>
 #include <linux/u64_stats_sync.h>
-#include <net/route.h>
 
 #include "datapath.h"
 
@@ -53,16 +52,6 @@  int ovs_vport_set_upcall_portids(struct vport *, const struct nlattr *pids);
 int ovs_vport_get_upcall_portids(const struct vport *, struct sk_buff *);
 u32 ovs_vport_find_upcall_portid(const struct vport *, struct sk_buff *);
 
-int ovs_tunnel_get_egress_info(struct dp_upcall_info *upcall,
-			       struct net *net,
-			       struct sk_buff *,
-			       u8 ipproto,
-			       __be16 tp_src,
-			       __be16 tp_dst);
-
-int ovs_vport_get_egress_tun_info(struct vport *vport, struct sk_buff *skb,
-				  struct dp_upcall_info *upcall);
-
 /**
  * struct vport_portids - array of netlink portids of a vport.
  *                        must be protected by rcu.
@@ -140,8 +129,6 @@  struct vport_parms {
  * have any configuration.
  * @send: Send a packet on the device.
  * zero for dropped packets or negative for error.
- * @get_egress_tun_info: Get the egress tunnel 5-tuple and other info for
- * a packet.
  */
 struct vport_ops {
 	enum ovs_vport_type type;
@@ -154,9 +141,6 @@  struct vport_ops {
 	int (*get_options)(const struct vport *, struct sk_buff *);
 
 	void (*send)(struct vport *, struct sk_buff *);
-	int (*get_egress_tun_info)(struct vport *, struct sk_buff *,
-				   struct dp_upcall_info *upcall);
-
 	struct module *owner;
 	struct list_head list;
 };
@@ -215,25 +199,6 @@  static inline const char *ovs_vport_name(struct vport *vport)
 int ovs_vport_ops_register(struct vport_ops *ops);
 void ovs_vport_ops_unregister(struct vport_ops *ops);
 
-static inline struct rtable *ovs_tunnel_route_lookup(struct net *net,
-						     const struct ip_tunnel_key *key,
-						     u32 mark,
-						     struct flowi4 *fl,
-						     u8 protocol)
-{
-	struct rtable *rt;
-
-	memset(fl, 0, sizeof(*fl));
-	fl->daddr = key->u.ipv4.dst;
-	fl->saddr = key->u.ipv4.src;
-	fl->flowi4_tos = RT_TOS(key->tos);
-	fl->flowi4_mark = mark;
-	fl->flowi4_proto = protocol;
-
-	rt = ip_route_output_key(net, fl);
-	return rt;
-}
-
 static inline void ovs_vport_send(struct vport *vport, struct sk_buff *skb)
 {
 	vport->ops->send(vport, skb);