diff mbox

[net-next,V4,4/4] net/sched: Introduce act_tunnel_key

Message ID 1472647584-6713-5-git-send-email-hadarh@mellanox.com
State Changes Requested, archived
Delegated to: David Miller
Headers show

Commit Message

Hadar Hen Zion Aug. 31, 2016, 12:46 p.m. UTC
From: Amir Vadai <amir@vadai.me>

This action could be used before redirecting packets to a shared tunnel
device, or when redirecting packets arriving from such a device.

The action will release the metadata created by the tunnel device
(decap), or set the metadata with the specified values for encap
operation.

For example, the following flower filter will forward all ICMP packets
destined to 11.11.11.2 through the shared vxlan device 'vxlan0'. Before
redirecting, metadata for the vxlan tunnel is created using the
tunnel_key action and its arguments:

$ tc filter add dev net0 protocol ip parent ffff: \
    flower \
      ip_proto 1 \
      dst_ip 11.11.11.2 \
    action tunnel_key set \
      src_ip 11.11.0.1 \
      dst_ip 11.11.0.2 \
      id 11 \
    action mirred egress redirect dev vxlan0

Signed-off-by: Amir Vadai <amir@vadai.me>
Signed-off-by: Hadar Hen Zion <hadarh@mellanox.com>
---
 include/net/tc_act/tc_tunnel_key.h        |  30 +++
 include/uapi/linux/tc_act/tc_tunnel_key.h |  42 ++++
 net/sched/Kconfig                         |  11 +
 net/sched/Makefile                        |   1 +
 net/sched/act_tunnel_key.c                | 349 ++++++++++++++++++++++++++++++
 5 files changed, 433 insertions(+)
 create mode 100644 include/net/tc_act/tc_tunnel_key.h
 create mode 100644 include/uapi/linux/tc_act/tc_tunnel_key.h
 create mode 100644 net/sched/act_tunnel_key.c

Comments

Jiri Pirko Aug. 31, 2016, 3:39 p.m. UTC | #1
Wed, Aug 31, 2016 at 02:46:24PM CEST, hadarh@mellanox.com wrote:
>From: Amir Vadai <amir@vadai.me>
>
>This action could be used before redirecting packets to a shared tunnel
>device, or when redirecting packets arriving from a such a device.
>
>The action will release the metadata created by the tunnel device
>(decap), or set the metadata with the specified values for encap
>operation.
>
>For example, the following flower filter will forward all ICMP packets
>destined to 11.11.11.2 through the shared vxlan device 'vxlan0'. Before
>redirecting, a metadata for the vxlan tunnel is created using the
>tunnel_key action and it's arguments:
>
>$ filter add dev net0 protocol ip parent ffff: \
>    flower \
>      ip_proto 1 \
>      dst_ip 11.11.11.2 \
>    action tunnel_key set \
>      src_ip 11.11.0.1 \
>      dst_ip 11.11.0.2 \
>      id 11 \
>    action mirred egress redirect dev vxlan0
>
>Signed-off-by: Amir Vadai <amir@vadai.me>
>Signed-off-by: Hadar Hen Zion <hadarh@mellanox.com>


Looks fine to me
Reviewed-by: Jiri Pirko <jiri@mellanox.com>
Shmulik Ladkani Aug. 31, 2016, 5:44 p.m. UTC | #2
Hi,

On Wed, 31 Aug 2016 15:46:24 +0300 Hadar Hen Zion <hadarh@mellanox.com> wrote:
> +static int tunnel_key_init(struct net *net, struct nlattr *nla,
> +			   struct nlattr *est, struct tc_action **a,
> +			   int ovr, int bind)
> +{
> +	struct tc_action_net *tn = net_generic(net, tunnel_key_net_id);
> +	struct nlattr *tb[TCA_TUNNEL_KEY_MAX + 1];
> +	struct metadata_dst *metadata = NULL;
> +	struct tc_tunnel_key *parm;
> +	struct tcf_tunnel_key *t;
> +	struct tcf_tunnel_key_params *params_old;
> +	struct tcf_tunnel_key_params *params_new;
> +	__be64 key_id;
> +	bool exists = false;
> +	int ret = 0;
> +	int err;
> +
> +	if (!nla)
> +		return -EINVAL;
> +
> +	err = nla_parse_nested(tb, TCA_TUNNEL_KEY_MAX, nla, tunnel_key_policy);
> +	if (err < 0)
> +		return err;
> +
> +	if (!tb[TCA_TUNNEL_KEY_PARMS])
> +		return -EINVAL;
> +
> +	parm = nla_data(tb[TCA_TUNNEL_KEY_PARMS]);
> +	exists = tcf_hash_check(tn, parm->index, a, bind);
> +	if (exists && bind)
> +		return 0;
> +
> +	switch (parm->t_action) {
> +	case TCA_TUNNEL_KEY_ACT_RELEASE:
> +		break;
> +	case TCA_TUNNEL_KEY_ACT_SET:
> +		if (!tb[TCA_TUNNEL_KEY_ENC_KEY_ID]) {
> +			ret = -EINVAL;
> +			goto err_out;
> +		}
> +
> +		key_id = key32_to_tunnel_id(nla_get_be32(tb[TCA_TUNNEL_KEY_ENC_KEY_ID]));
> +
> +		if (tb[TCA_TUNNEL_KEY_ENC_IPV4_SRC] &&
> +		    tb[TCA_TUNNEL_KEY_ENC_IPV4_DST]) {
> +			__be32 saddr;
> +			__be32 daddr;
> +
> +			saddr = nla_get_in_addr(tb[TCA_TUNNEL_KEY_ENC_IPV4_SRC]);
> +			daddr = nla_get_in_addr(tb[TCA_TUNNEL_KEY_ENC_IPV4_DST]);
> +
> +			metadata = __ip_tun_set_dst(saddr, daddr, 0, 0,
> +						    TUNNEL_KEY, key_id, 0);
> +		} else if (tb[TCA_TUNNEL_KEY_ENC_IPV6_SRC] &&
> +			   tb[TCA_TUNNEL_KEY_ENC_IPV6_DST]) {
> +			struct in6_addr saddr;
> +			struct in6_addr daddr;
> +
> +			saddr = nla_get_in6_addr(tb[TCA_TUNNEL_KEY_ENC_IPV6_SRC]);
> +			daddr = nla_get_in6_addr(tb[TCA_TUNNEL_KEY_ENC_IPV6_DST]);
> +
> +			metadata = __ipv6_tun_set_dst(&saddr, &daddr, 0, 0, 0,
> +						      TUNNEL_KEY, key_id, 0);
> +		}
> +
> +		if (!metadata) {
> +			ret = -EINVAL;
> +			goto err_out;
> +		}
> +
> +		metadata->u.tun_info.mode |= IP_TUNNEL_INFO_TX;
> +		break;
> +	default:
> +		goto err_out;
> +	}
> +
> +	if (!exists) {
> +		ret = tcf_hash_create(tn, parm->index, est, a,
> +				      &act_tunnel_key_ops, bind, true);
> +		if (ret)
> +			return ret;
> +
> +		ret = ACT_P_CREATED;
> +	} else {
> +		tcf_hash_release(*a, bind);
> +		if (!ovr)
> +			return -EEXIST;
> +	}
> +
> +	t = to_tunnel_key(*a);
> +
> +	ASSERT_RTNL();
> +	params_new = kzalloc(sizeof(*params_new),
> +			     GFP_KERNEL);

nit: Fits oneline. Fix if patch needs other amendments.

> +	if (unlikely(!params_new)) {
> +		if (ovr)
> +			tcf_hash_release(*a, bind);
> +		return -ENOMEM;

Seems we need to call tcf_hash_release regardless 'ovr':
In case (!exist), we've created a new hash few lines above.
Therefore in failure, don't we need a tcf_hash_release()?
Am I missing something?

> +	}
> +
> +	params_old = rtnl_dereference(t->params);
> +
> +	t->tcf_action = parm->action;
> +	params_new->tcft_action = parm->t_action;
> +	params_new->tcft_enc_metadata = metadata;
> +
> +	rcu_assign_pointer(t->params, params_new);
> +
> +	if (params_old)
> +		kfree_rcu(params_old, rcu);
> +
> +	if (ret == ACT_P_CREATED)
> +		tcf_hash_insert(tn, *a);
> +
> +	return ret;
> +
> +err_out:
> +	if (exists)
> +		tcf_hash_release(*a, bind);
> +	return ret;
> +}
> +
> +static void tunnel_key_release(struct tc_action *a, int bind)
> +{
> +	struct tcf_tunnel_key *t = to_tunnel_key(a);
> +	struct tcf_tunnel_key_params *params;
> +
> +	rcu_read_lock();
> +	params = rcu_dereference(t->params);
> +
> +	if (params->tcft_action == TCA_TUNNEL_KEY_ACT_SET)
> +		dst_release(&params->tcft_enc_metadata->dst);
> +
> +	rcu_read_unlock();

Not an RCU expert, maybe I'm off...
This alters params in some way (dst_release), so shouldn't it be
considered an UPDATE, involving 'params' replacement?
Current code declares it as an rcu read section.

Thanks,
Shmulik
Eric Dumazet Aug. 31, 2016, 6:39 p.m. UTC | #3
On Wed, Aug 31, 2016 at 5:46 AM, Hadar Hen Zion <hadarh@mellanox.com> wrote:
>
> From: Amir Vadai <amir@vadai.me>
>
> This action could be used before redirecting packets to a shared tunnel
> device, or when redirecting packets arriving from a such a device.
>
>
> +
> +struct tcf_tunnel_key_params {
> +       struct rcu_head         rcu;
> +       int                     tcft_action;

Also add " int action;"

(see why later)

> +       struct metadata_dst     *tcft_enc_metadata;
> +};
> +



> +
> +static int tunnel_key_act(struct sk_buff *skb, const struct tc_action *a,
> +                         struct tcf_result *res)
> +{
> +       struct tcf_tunnel_key *t = to_tunnel_key(a);
> +       struct tcf_tunnel_key_params *params;
> +       int action;
> +
> +       rcu_read_lock();
> +
> +       params = rcu_dereference(t->params);
> +
> +       tcf_lastuse_update(&t->tcf_tm);
> +       bstats_cpu_update(this_cpu_ptr(t->common.cpu_bstats), skb);
> +       action = t->tcf_action;

Ideally, you should read param->action instead of t->tcf_action to be
completely clean.

> +
> +       switch (params->tcft_action) {
> +       case TCA_TUNNEL_KEY_ACT_RELEASE:
> +               skb_dst_drop(skb);
> +               break;
> +       case TCA_TUNNEL_KEY_ACT_SET:
> +               skb_dst_drop(skb);
> +               skb_dst_set(skb, dst_clone(&params->tcft_enc_metadata->dst));
> +               break;
> +       default:
> +               WARN_ONCE(1, "Bad tunnel_key action.\n");
> +               break;
> +       }
> +
> +       rcu_read_unlock();
> +
> +       return action;
> +}
>
Hadar Hen-Zion Sept. 1, 2016, 9:28 a.m. UTC | #4
On Wed, Aug 31, 2016 at 9:39 PM, Eric Dumazet <edumazet@google.com> wrote:
> On Wed, Aug 31, 2016 at 5:46 AM, Hadar Hen Zion <hadarh@mellanox.com> wrote:
>>
>> From: Amir Vadai <amir@vadai.me>
>>
>> This action could be used before redirecting packets to a shared tunnel
>> device, or when redirecting packets arriving from a such a device.
>>
>>
>> +
>> +struct tcf_tunnel_key_params {
>> +       struct rcu_head         rcu;
>> +       int                     tcft_action;
>
> Also add " int action;"
>
> (see why later)
>
>> +       struct metadata_dst     *tcft_enc_metadata;
>> +};
>> +
>
>
>
>> +
>> +static int tunnel_key_act(struct sk_buff *skb, const struct tc_action *a,
>> +                         struct tcf_result *res)
>> +{
>> +       struct tcf_tunnel_key *t = to_tunnel_key(a);
>> +       struct tcf_tunnel_key_params *params;
>> +       int action;
>> +
>> +       rcu_read_lock();
>> +
>> +       params = rcu_dereference(t->params);
>> +
>> +       tcf_lastuse_update(&t->tcf_tm);
>> +       bstats_cpu_update(this_cpu_ptr(t->common.cpu_bstats), skb);
>> +       action = t->tcf_action;
>
> Ideally, you should read param->action instead of t->tcf_action to be
> completely clean.

As you suggested above, I can do it by adding "int action" to struct
tcf_tunnel_key_paramse.
But, it means that act_tunnel_key would have a different behavior than
all the other actions and even though
"struct tc_action" has a designated parameters to store this action we
won't use it.
So it won't be completely clean...

Do you think we have a cleaner way to protect it?

>
>> +
>> +       switch (params->tcft_action) {
>> +       case TCA_TUNNEL_KEY_ACT_RELEASE:
>> +               skb_dst_drop(skb);
>> +               break;
>> +       case TCA_TUNNEL_KEY_ACT_SET:
>> +               skb_dst_drop(skb);
>> +               skb_dst_set(skb, dst_clone(&params->tcft_enc_metadata->dst));
>> +               break;
>> +       default:
>> +               WARN_ONCE(1, "Bad tunnel_key action.\n");
>> +               break;
>> +       }
>> +
>> +       rcu_read_unlock();
>> +
>> +       return action;
>> +}
>>
Hadar Hen-Zion Sept. 1, 2016, 11:59 a.m. UTC | #5
On Wed, Aug 31, 2016 at 8:44 PM, Shmulik Ladkani
<shmulik.ladkani@gmail.com> wrote:
> Hi,
>
> On Wed, 31 Aug 2016 15:46:24 +0300 Hadar Hen Zion <hadarh@mellanox.com> wrote:
>> +static int tunnel_key_init(struct net *net, struct nlattr *nla,
>> +                        struct nlattr *est, struct tc_action **a,
>> +                        int ovr, int bind)
>> +{
>> +     struct tc_action_net *tn = net_generic(net, tunnel_key_net_id);
>> +     struct nlattr *tb[TCA_TUNNEL_KEY_MAX + 1];
>> +     struct metadata_dst *metadata = NULL;
>> +     struct tc_tunnel_key *parm;
>> +     struct tcf_tunnel_key *t;
>> +     struct tcf_tunnel_key_params *params_old;
>> +     struct tcf_tunnel_key_params *params_new;
>> +     __be64 key_id;
>> +     bool exists = false;
>> +     int ret = 0;
>> +     int err;
>> +
>> +     if (!nla)
>> +             return -EINVAL;
>> +
>> +     err = nla_parse_nested(tb, TCA_TUNNEL_KEY_MAX, nla, tunnel_key_policy);
>> +     if (err < 0)
>> +             return err;
>> +
>> +     if (!tb[TCA_TUNNEL_KEY_PARMS])
>> +             return -EINVAL;
>> +
>> +     parm = nla_data(tb[TCA_TUNNEL_KEY_PARMS]);
>> +     exists = tcf_hash_check(tn, parm->index, a, bind);
>> +     if (exists && bind)
>> +             return 0;
>> +
>> +     switch (parm->t_action) {
>> +     case TCA_TUNNEL_KEY_ACT_RELEASE:
>> +             break;
>> +     case TCA_TUNNEL_KEY_ACT_SET:
>> +             if (!tb[TCA_TUNNEL_KEY_ENC_KEY_ID]) {
>> +                     ret = -EINVAL;
>> +                     goto err_out;
>> +             }
>> +
>> +             key_id = key32_to_tunnel_id(nla_get_be32(tb[TCA_TUNNEL_KEY_ENC_KEY_ID]));
>> +
>> +             if (tb[TCA_TUNNEL_KEY_ENC_IPV4_SRC] &&
>> +                 tb[TCA_TUNNEL_KEY_ENC_IPV4_DST]) {
>> +                     __be32 saddr;
>> +                     __be32 daddr;
>> +
>> +                     saddr = nla_get_in_addr(tb[TCA_TUNNEL_KEY_ENC_IPV4_SRC]);
>> +                     daddr = nla_get_in_addr(tb[TCA_TUNNEL_KEY_ENC_IPV4_DST]);
>> +
>> +                     metadata = __ip_tun_set_dst(saddr, daddr, 0, 0,
>> +                                                 TUNNEL_KEY, key_id, 0);
>> +             } else if (tb[TCA_TUNNEL_KEY_ENC_IPV6_SRC] &&
>> +                        tb[TCA_TUNNEL_KEY_ENC_IPV6_DST]) {
>> +                     struct in6_addr saddr;
>> +                     struct in6_addr daddr;
>> +
>> +                     saddr = nla_get_in6_addr(tb[TCA_TUNNEL_KEY_ENC_IPV6_SRC]);
>> +                     daddr = nla_get_in6_addr(tb[TCA_TUNNEL_KEY_ENC_IPV6_DST]);
>> +
>> +                     metadata = __ipv6_tun_set_dst(&saddr, &daddr, 0, 0, 0,
>> +                                                   TUNNEL_KEY, key_id, 0);
>> +             }
>> +
>> +             if (!metadata) {
>> +                     ret = -EINVAL;
>> +                     goto err_out;
>> +             }
>> +
>> +             metadata->u.tun_info.mode |= IP_TUNNEL_INFO_TX;
>> +             break;
>> +     default:
>> +             goto err_out;
>> +     }
>> +
>> +     if (!exists) {
>> +             ret = tcf_hash_create(tn, parm->index, est, a,
>> +                                   &act_tunnel_key_ops, bind, true);
>> +             if (ret)
>> +                     return ret;
>> +
>> +             ret = ACT_P_CREATED;
>> +     } else {
>> +             tcf_hash_release(*a, bind);
>> +             if (!ovr)
>> +                     return -EEXIST;
>> +     }
>> +
>> +     t = to_tunnel_key(*a);
>> +
>> +     ASSERT_RTNL();
>> +     params_new = kzalloc(sizeof(*params_new),
>> +                          GFP_KERNEL);
>
> nit: Fits oneline. Fix if patch needs other amendments.

Sure, will do.
>
>> +     if (unlikely(!params_new)) {
>> +             if (ovr)
>> +                     tcf_hash_release(*a, bind);
>> +             return -ENOMEM;
>
> Seems we need to call tcf_hash_release regardless 'ovr':
> In case (!exist), we've created a new hash few lines above.
> Therefore in failure, don't we need a tcf_hash_release()?
> Am I missing something?

You are right, "if (ovr)" line should be removed.
>
>> +     }
>> +
>> +     params_old = rtnl_dereference(t->params);
>> +
>> +     t->tcf_action = parm->action;
>> +     params_new->tcft_action = parm->t_action;
>> +     params_new->tcft_enc_metadata = metadata;
>> +
>> +     rcu_assign_pointer(t->params, params_new);
>> +
>> +     if (params_old)
>> +             kfree_rcu(params_old, rcu);
>> +
>> +     if (ret == ACT_P_CREATED)
>> +             tcf_hash_insert(tn, *a);
>> +
>> +     return ret;
>> +
>> +err_out:
>> +     if (exists)
>> +             tcf_hash_release(*a, bind);
>> +     return ret;
>> +}
>> +
>> +static void tunnel_key_release(struct tc_action *a, int bind)
>> +{
>> +     struct tcf_tunnel_key *t = to_tunnel_key(a);
>> +     struct tcf_tunnel_key_params *params;
>> +
>> +     rcu_read_lock();
>> +     params = rcu_dereference(t->params);
>> +
>> +     if (params->tcft_action == TCA_TUNNEL_KEY_ACT_SET)
>> +             dst_release(&params->tcft_enc_metadata->dst);
>> +
>> +     rcu_read_unlock();
>
> Not an RCU expert, maybe I'm off...
> This alters params in some way (dst_release), so shouldn't it be
> considered an UPDATE, involving 'params' replacement?
> Current code declares it as an rcu read section.
>
dst_release function is using call_rcu to release the dst, so i think
we are safe here.


> Thanks,
> Shmulik
Eric Dumazet Sept. 1, 2016, 1:16 p.m. UTC | #6
On Thu, 2016-09-01 at 12:28 +0300, Hadar Hen Zion wrote:

> 
> As you suggested above, I can do it by adding "int action" to struct
> tcf_tunnel_key_paramse.
> But, it means that act_tunnel_key would have a different behavior than
> all the other actions and even though
> "struct tc_action" has a designated parameters to store this action we
> won't use it.
> So it won't be completely clean...
> 
> Do you think we have a cleaner way to protect it?

Fact that the act_ modules had a spinlock made them all share the same
structure.

Now we want RCU protection, here is the thing.

Say you want to access 3 different fields, A, B and C.

If you put A and B in the rcu protected pointer, but leave C in the
'control part, protected by spinlock' 

Then your fast path wont be able to have a consistent view of 3
variables A, B C.

It might read an old value of A & B, and the recently updated C,

Or it might read an old C, and the updated values of A & B

As Cong very kindly pointed to us/me, if we want to be 'clean', we want
to make sure we read a consistent 3-tuple.

I will send updates when I have time to act_mirred.c
Hadar Hen-Zion Sept. 1, 2016, 1:46 p.m. UTC | #7
On Thu, Sep 1, 2016 at 4:16 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> On Thu, 2016-09-01 at 12:28 +0300, Hadar Hen Zion wrote:
>
>>
>> As you suggested above, I can do it by adding "int action" to struct
>> tcf_tunnel_key_paramse.
>> But, it means that act_tunnel_key would have a different behavior than
>> all the other actions and even though
>> "struct tc_action" has a designated parameters to store this action we
>> won't use it.
>> So it won't be completely clean...
>>
>> Do you think we have a cleaner way to protect it?
>
> Fact that the act_ modules had a spinlock made them all share the same
> structure.
>
> Now we want RCU protection, here is the thing.
>
> Say you want to access 3 different fields, A, B and C.
>
> If you put A and B in the rcu protected pointer, but leave C in the
> 'control part, protected by spinlock'
>
> Then your fast path wont be able to have a consistent view of 3
> variables A, B C.
>
> It might read an old value of A & B, and the recently updated C,
>
> Or it might read an old C, and the updated values of A & B

Yes, agree.

I'll add 'action' to struct tcf_tunnel_key_params.

Thanks,
Hadar


>
> As Cong very kindly pointed to us/me, if we want to be 'clean', we want
> to make sure we read a consistent 3-tuple.
>
> I will send updates when I have time to act_mirred.c
>
>
Shmulik Ladkani Sept. 1, 2016, 1:58 p.m. UTC | #8
On Thu, 1 Sep 2016 14:59:28 +0300 Hadar Hen Zion <hadarh@dev.mellanox.co.il> wrote:
> > Seems we need to call tcf_hash_release regardless 'ovr':
> > In case (!exist), we've created a new hash few lines above.
> > Therefore in failure, don't we need a tcf_hash_release()?
> > Am I missing something?  
> 
> You are right, "if (ovr)" line should be removed.

Looking at it again, seems the right condition should be (pls verify):

		if (ret == ACT_P_CREATED)
			tcf_hash_release(*a, bind);

Thanks,
Shmulik
diff mbox

Patch

diff --git a/include/net/tc_act/tc_tunnel_key.h b/include/net/tc_act/tc_tunnel_key.h
new file mode 100644
index 0000000..8610504
--- /dev/null
+++ b/include/net/tc_act/tc_tunnel_key.h
@@ -0,0 +1,30 @@ 
+/*
+ * Copyright (c) 2016, Amir Vadai <amir@vadai.me>
+ * Copyright (c) 2016, Mellanox Technologies. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#ifndef __NET_TC_TUNNEL_KEY_H
+#define __NET_TC_TUNNEL_KEY_H
+
+#include <net/act_api.h>
+
+struct tcf_tunnel_key_params {	/* one parameter set of a tunnel_key action, swapped as a unit via RCU */
+	struct rcu_head		rcu;	/* used by kfree_rcu() when the set is replaced */
+	int			tcft_action;	/* TCA_TUNNEL_KEY_ACT_SET or TCA_TUNNEL_KEY_ACT_RELEASE */
+	struct metadata_dst     *tcft_enc_metadata;	/* encap metadata dst; only filled in for ACT_SET */
+};
+
+struct tcf_tunnel_key {	/* per-action state; to_tunnel_key() casts a struct tc_action pointer to this */
+	struct tc_action	      common;	/* generic action state; must stay the first member for that cast */
+	struct tcf_tunnel_key_params *params;	/* current parameters; read with rcu_dereference(), published with rcu_assign_pointer() */
+};
+
+/*
+ * Cast a struct tc_action pointer to its enclosing struct tcf_tunnel_key.
+ * The argument is parenthesised so that expressions such as "*ap" or
+ * "base + i" are cast as a whole.
+ */
+#define to_tunnel_key(a) ((struct tcf_tunnel_key *)(a))
+
+#endif /* __NET_TC_TUNNEL_KEY_H */
+
diff --git a/include/uapi/linux/tc_act/tc_tunnel_key.h b/include/uapi/linux/tc_act/tc_tunnel_key.h
new file mode 100644
index 0000000..f9ddf53
--- /dev/null
+++ b/include/uapi/linux/tc_act/tc_tunnel_key.h
@@ -0,0 +1,42 @@ 
+/*
+ * Copyright (c) 2016, Amir Vadai <amir@vadai.me>
+ * Copyright (c) 2016, Mellanox Technologies. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#ifndef __LINUX_TC_TUNNEL_KEY_H
+#define __LINUX_TC_TUNNEL_KEY_H
+
+#include <linux/pkt_cls.h>
+
+#define TCA_ACT_TUNNEL_KEY 17	/* action type id; used as .type of the tunnel_key tc_action_ops */
+
+#define TCA_TUNNEL_KEY_ACT_SET	    1	/* t_action: replace the packet's dst with the configured tunnel metadata */
+#define TCA_TUNNEL_KEY_ACT_RELEASE  2	/* t_action: drop the packet's dst */
+
+struct tc_tunnel_key {	/* payload of the TCA_TUNNEL_KEY_PARMS netlink attribute */
+	tc_gen;	/* NOTE(review): presumably expands to the generic index/refcnt/bindcnt/action fields from <linux/pkt_cls.h> - confirm */
+	int t_action;	/* TCA_TUNNEL_KEY_ACT_SET or TCA_TUNNEL_KEY_ACT_RELEASE */
+};
+
+enum {	/* netlink attribute types nested inside a tunnel_key action */
+	TCA_TUNNEL_KEY_UNSPEC,
+	TCA_TUNNEL_KEY_TM,	/* struct tcf_t, emitted by the dump code */
+	TCA_TUNNEL_KEY_PARMS,	/* struct tc_tunnel_key */
+	TCA_TUNNEL_KEY_ENC_IPV4_SRC,	/* be32 */
+	TCA_TUNNEL_KEY_ENC_IPV4_DST,	/* be32 */
+	TCA_TUNNEL_KEY_ENC_IPV6_SRC,	/* struct in6_addr */
+	TCA_TUNNEL_KEY_ENC_IPV6_DST,	/* struct in6_addr */
+	TCA_TUNNEL_KEY_ENC_KEY_ID,	/* be32 (policy is NLA_U32; read/written with nla_get_be32()/nla_put_be32()) */
+	TCA_TUNNEL_KEY_PAD,	/* padding attribute type passed to nla_put_64bit() */
+	__TCA_TUNNEL_KEY_MAX,
+};
+
+#define TCA_TUNNEL_KEY_MAX (__TCA_TUNNEL_KEY_MAX - 1)	/* highest valid attribute type */
+
+#endif
+
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index ccf931b..72e3426 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -761,6 +761,17 @@  config NET_ACT_IFE
 	  To compile this code as a module, choose M here: the
 	  module will be called act_ife.
 
+config NET_ACT_TUNNEL_KEY
+        tristate "IP tunnel metadata manipulation"
+        depends on NET_CLS_ACT
+        ---help---
+	  Say Y here to set/release ip tunnel metadata.
+
+	  If unsure, say N.
+
+	  To compile this code as a module, choose M here: the
+	  module will be called act_tunnel_key.
+
 config NET_IFE_SKBMARK
         tristate "Support to encoding decoding skb mark on IFE action"
         depends on NET_ACT_IFE
diff --git a/net/sched/Makefile b/net/sched/Makefile
index ae088a5..b9d046b 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -22,6 +22,7 @@  obj-$(CONFIG_NET_ACT_CONNMARK)	+= act_connmark.o
 obj-$(CONFIG_NET_ACT_IFE)	+= act_ife.o
 obj-$(CONFIG_NET_IFE_SKBMARK)	+= act_meta_mark.o
 obj-$(CONFIG_NET_IFE_SKBPRIO)	+= act_meta_skbprio.o
+obj-$(CONFIG_NET_ACT_TUNNEL_KEY)+= act_tunnel_key.o
 obj-$(CONFIG_NET_SCH_FIFO)	+= sch_fifo.o
 obj-$(CONFIG_NET_SCH_CBQ)	+= sch_cbq.o
 obj-$(CONFIG_NET_SCH_HTB)	+= sch_htb.o
diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c
new file mode 100644
index 0000000..62c4202
--- /dev/null
+++ b/net/sched/act_tunnel_key.c
@@ -0,0 +1,349 @@ 
+/*
+ * Copyright (c) 2016, Amir Vadai <amir@vadai.me>
+ * Copyright (c) 2016, Mellanox Technologies. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+#include <net/dst.h>
+#include <net/dst_metadata.h>
+
+#include <linux/tc_act/tc_tunnel_key.h>
+#include <net/tc_act/tc_tunnel_key.h>
+
+#define TUNNEL_KEY_TAB_MASK     15	/* mask argument handed to tc_action_net_init() */
+
+static int tunnel_key_net_id;	/* per-netns id; its address is stored in tunnel_key_net_ops.id */
+static struct tc_action_ops act_tunnel_key_ops;	/* declared early for tunnel_key_init(); initialised after the callbacks */
+
+/*
+ * Packet-path handler of the tunnel_key action.
+ *
+ * Calls tcf_lastuse_update() and bstats_cpu_update() for accounting, then
+ * applies the configured sub-action to @skb under rcu_read_lock():
+ * ACT_RELEASE drops the skb's dst, ACT_SET drops it and installs a clone of
+ * the configured metadata dst.  Any other sub-action triggers a one-time
+ * warning and leaves @skb untouched.  @res is not used.
+ *
+ * Returns the control action stored in t->tcf_action.
+ *
+ * NOTE(review): the returned control action is read from the action itself
+ * while the sub-action comes from the RCU-protected parameter set, so the
+ * two values are not guaranteed to belong to the same update.
+ */
+static int tunnel_key_act(struct sk_buff *skb, const struct tc_action *a,
+			  struct tcf_result *res)
+{
+	struct tcf_tunnel_key *tk = to_tunnel_key(a);
+	struct tcf_tunnel_key_params *p;
+	int verdict;
+	int mode;
+
+	rcu_read_lock();
+
+	p = rcu_dereference(tk->params);
+
+	tcf_lastuse_update(&tk->tcf_tm);
+	bstats_cpu_update(this_cpu_ptr(tk->common.cpu_bstats), skb);
+	verdict = tk->tcf_action;
+
+	mode = p->tcft_action;
+	if (mode == TCA_TUNNEL_KEY_ACT_RELEASE) {
+		skb_dst_drop(skb);
+	} else if (mode == TCA_TUNNEL_KEY_ACT_SET) {
+		/* the skb takes its own reference on the shared metadata */
+		skb_dst_drop(skb);
+		skb_dst_set(skb, dst_clone(&p->tcft_enc_metadata->dst));
+	} else {
+		WARN_ONCE(1, "Bad tunnel_key action.\n");
+	}
+
+	rcu_read_unlock();
+
+	return verdict;
+}
+
+/* Netlink validation policy for the attributes parsed by tunnel_key_init(). */
+static const struct nla_policy tunnel_key_policy[TCA_TUNNEL_KEY_MAX + 1] = {
+	[TCA_TUNNEL_KEY_ENC_KEY_ID]	= { .type = NLA_U32 },
+	[TCA_TUNNEL_KEY_ENC_IPV4_SRC]	= { .type = NLA_U32 },
+	[TCA_TUNNEL_KEY_ENC_IPV4_DST]	= { .type = NLA_U32 },
+	[TCA_TUNNEL_KEY_ENC_IPV6_SRC]	= { .len = sizeof(struct in6_addr) },
+	[TCA_TUNNEL_KEY_ENC_IPV6_DST]	= { .len = sizeof(struct in6_addr) },
+	[TCA_TUNNEL_KEY_PARMS]		= { .len = sizeof(struct tc_tunnel_key) },
+};
+
+/*
+ * tunnel_key_init - create a tunnel_key action or replace its parameters
+ * @net:  network namespace whose action table is used
+ * @nla:  nested TCA_TUNNEL_KEY_* attributes describing the action
+ * @est:  rate estimator attribute, passed through to tcf_hash_create()
+ * @a:    in/out action pointer, filled by tcf_hash_check()/tcf_hash_create()
+ * @ovr:  non-zero when an already existing action may be overwritten
+ * @bind: non-zero when the action is being bound
+ *
+ * For TCA_TUNNEL_KEY_ACT_SET a key id plus either both IPv4 or both IPv6
+ * tunnel addresses are required; they are turned into a metadata dst that
+ * is marked IP_TUNNEL_INFO_TX.  The new parameter set is published with
+ * rcu_assign_pointer() and the previous one is freed with kfree_rcu().
+ *
+ * Returns ACT_P_CREATED when a new action was created, 0 when an existing
+ * action was updated or merely bound, or a negative errno.  On every
+ * failure the not-yet-published parameter set and metadata dst are freed
+ * and the reference taken by tcf_hash_check() is dropped.
+ */
+static int tunnel_key_init(struct net *net, struct nlattr *nla,
+			   struct nlattr *est, struct tc_action **a,
+			   int ovr, int bind)
+{
+	struct tc_action_net *tn = net_generic(net, tunnel_key_net_id);
+	struct nlattr *tb[TCA_TUNNEL_KEY_MAX + 1];
+	struct metadata_dst *metadata = NULL;
+	struct tc_tunnel_key *parm;
+	struct tcf_tunnel_key *t;
+	struct tcf_tunnel_key_params *params_old;
+	struct tcf_tunnel_key_params *params_new = NULL;
+	__be64 key_id;
+	bool exists = false;
+	int ret = 0;
+	int err;
+
+	if (!nla)
+		return -EINVAL;
+
+	err = nla_parse_nested(tb, TCA_TUNNEL_KEY_MAX, nla, tunnel_key_policy);
+	if (err < 0)
+		return err;
+
+	if (!tb[TCA_TUNNEL_KEY_PARMS])
+		return -EINVAL;
+
+	parm = nla_data(tb[TCA_TUNNEL_KEY_PARMS]);
+	exists = tcf_hash_check(tn, parm->index, a, bind);
+	if (exists && bind)
+		return 0;
+
+	switch (parm->t_action) {
+	case TCA_TUNNEL_KEY_ACT_RELEASE:
+		break;
+	case TCA_TUNNEL_KEY_ACT_SET:
+		if (!tb[TCA_TUNNEL_KEY_ENC_KEY_ID]) {
+			ret = -EINVAL;
+			goto err_out;
+		}
+
+		key_id = key32_to_tunnel_id(nla_get_be32(tb[TCA_TUNNEL_KEY_ENC_KEY_ID]));
+
+		if (tb[TCA_TUNNEL_KEY_ENC_IPV4_SRC] &&
+		    tb[TCA_TUNNEL_KEY_ENC_IPV4_DST]) {
+			__be32 saddr;
+			__be32 daddr;
+
+			saddr = nla_get_in_addr(tb[TCA_TUNNEL_KEY_ENC_IPV4_SRC]);
+			daddr = nla_get_in_addr(tb[TCA_TUNNEL_KEY_ENC_IPV4_DST]);
+
+			metadata = __ip_tun_set_dst(saddr, daddr, 0, 0,
+						    TUNNEL_KEY, key_id, 0);
+		} else if (tb[TCA_TUNNEL_KEY_ENC_IPV6_SRC] &&
+			   tb[TCA_TUNNEL_KEY_ENC_IPV6_DST]) {
+			struct in6_addr saddr;
+			struct in6_addr daddr;
+
+			saddr = nla_get_in6_addr(tb[TCA_TUNNEL_KEY_ENC_IPV6_SRC]);
+			daddr = nla_get_in6_addr(tb[TCA_TUNNEL_KEY_ENC_IPV6_DST]);
+
+			metadata = __ipv6_tun_set_dst(&saddr, &daddr, 0, 0, 0,
+						      TUNNEL_KEY, key_id, 0);
+		}
+
+		/* no complete address pair given, or the helper returned NULL */
+		if (!metadata) {
+			ret = -EINVAL;
+			goto err_out;
+		}
+
+		metadata->u.tun_info.mode |= IP_TUNNEL_INFO_TX;
+		break;
+	default:
+		/* an unknown sub-action must not be reported as success */
+		ret = -EINVAL;
+		goto err_out;
+	}
+
+	/*
+	 * Allocate the parameter set before the action table is touched, so
+	 * that an allocation failure never has to tear down a freshly
+	 * created action.
+	 */
+	params_new = kzalloc(sizeof(*params_new), GFP_KERNEL);
+	if (unlikely(!params_new)) {
+		ret = -ENOMEM;
+		goto err_out;
+	}
+
+	if (!exists) {
+		ret = tcf_hash_create(tn, parm->index, est, a,
+				      &act_tunnel_key_ops, bind, true);
+		if (ret)
+			goto err_out;
+
+		ret = ACT_P_CREATED;
+	} else if (!ovr) {
+		/* err_out drops the reference taken by tcf_hash_check() */
+		ret = -EEXIST;
+		goto err_out;
+	} else {
+		tcf_hash_release(*a, bind);
+	}
+
+	t = to_tunnel_key(*a);
+
+	ASSERT_RTNL();
+	params_old = rtnl_dereference(t->params);
+
+	t->tcf_action = parm->action;
+	params_new->tcft_action = parm->t_action;
+	params_new->tcft_enc_metadata = metadata;
+
+	rcu_assign_pointer(t->params, params_new);
+
+	/*
+	 * NOTE(review): only the old parameter set is freed here; the
+	 * reference it holds on its metadata dst is not dropped.  Confirm
+	 * who releases that dst when an ACT_SET action is replaced.
+	 */
+	if (params_old)
+		kfree_rcu(params_old, rcu);
+
+	if (ret == ACT_P_CREATED)
+		tcf_hash_insert(tn, *a);
+
+	return ret;
+
+err_out:
+	/* neither object has been published yet, so free them directly */
+	kfree(params_new);
+	if (metadata)
+		dst_release(&metadata->dst);
+	if (exists)
+		tcf_hash_release(*a, bind);
+	return ret;
+}
+
+/*
+ * Cleanup callback of a tunnel_key action (act_tunnel_key_ops.cleanup).
+ *
+ * Drops the reference that the current parameter set holds on its encap
+ * metadata dst (ACT_SET only) and frees the parameter set after an RCU
+ * grace period; no other code in this file frees the current set.
+ * @bind is unused.
+ *
+ * t->params is only assigned at the end of tunnel_key_init(), so it is
+ * still NULL when an action is torn down before that point; that case is
+ * skipped instead of dereferenced.
+ *
+ * NOTE(review): freeing the parameters assumes this callback runs only
+ * when the action itself is being destroyed - confirm against the action
+ * core.
+ */
+static void tunnel_key_release(struct tc_action *a, int bind)
+{
+	struct tcf_tunnel_key *t = to_tunnel_key(a);
+	struct tcf_tunnel_key_params *params;
+
+	rcu_read_lock();
+	params = rcu_dereference(t->params);
+
+	if (params) {
+		if (params->tcft_action == TCA_TUNNEL_KEY_ACT_SET)
+			dst_release(&params->tcft_enc_metadata->dst);
+
+		kfree_rcu(params, rcu);
+	}
+
+	rcu_read_unlock();
+}
+
+/*
+ * Append the tunnel source and destination addresses of @info to @skb,
+ * using the IPv4 or IPv6 attribute pair that matches the address family
+ * reported by ip_tunnel_info_af().
+ *
+ * Returns 0 when both attributes were added, -EINVAL when an attribute
+ * could not be added or the family is neither AF_INET nor AF_INET6.
+ */
+static int tunnel_key_dump_addresses(struct sk_buff *skb,
+				     const struct ip_tunnel_info *info)
+{
+	switch (ip_tunnel_info_af(info)) {
+	case AF_INET:
+		if (nla_put_in_addr(skb, TCA_TUNNEL_KEY_ENC_IPV4_SRC,
+				    info->key.u.ipv4.src) ||
+		    nla_put_in_addr(skb, TCA_TUNNEL_KEY_ENC_IPV4_DST,
+				    info->key.u.ipv4.dst))
+			return -EINVAL;
+		return 0;
+	case AF_INET6:
+		if (nla_put_in6_addr(skb, TCA_TUNNEL_KEY_ENC_IPV6_SRC,
+				     &info->key.u.ipv6.src) ||
+		    nla_put_in6_addr(skb, TCA_TUNNEL_KEY_ENC_IPV6_DST,
+				     &info->key.u.ipv6.dst))
+			return -EINVAL;
+		return 0;
+	default:
+		return -EINVAL;
+	}
+}
+
+/*
+ * Dump the action's configuration and timestamps into @skb as netlink
+ * attributes.
+ *
+ * Emits TCA_TUNNEL_KEY_PARMS, then (for ACT_SET) the key id and the tunnel
+ * addresses, then TCA_TUNNEL_KEY_TM.  @bind and @ref are subtracted from
+ * the reported bind and reference counts.
+ *
+ * Returns skb->len on success; on any failure the message is trimmed back
+ * to where this dump started and -1 is returned.
+ */
+static int tunnel_key_dump(struct sk_buff *skb, struct tc_action *a,
+			   int bind, int ref)
+{
+	unsigned char *start = skb_tail_pointer(skb);
+	struct tcf_tunnel_key *tk = to_tunnel_key(a);
+	struct tcf_tunnel_key_params *p;
+	struct tc_tunnel_key opt = {
+		.index    = tk->tcf_index,
+		.refcnt   = tk->tcf_refcnt - ref,
+		.bindcnt  = tk->tcf_bindcnt - bind,
+		.action   = tk->tcf_action,
+	};
+	struct tcf_t tm;
+	int len;
+
+	rcu_read_lock();
+	p = rcu_dereference(tk->params);
+
+	opt.t_action = p->tcft_action;
+	if (nla_put(skb, TCA_TUNNEL_KEY_PARMS, sizeof(opt), &opt))
+		goto trim;
+
+	if (p->tcft_action == TCA_TUNNEL_KEY_ACT_SET) {
+		struct ip_tunnel_info *info = &p->tcft_enc_metadata->u.tun_info;
+
+		if (nla_put_be32(skb, TCA_TUNNEL_KEY_ENC_KEY_ID,
+				 tunnel_id_to_key32(info->key.tun_id)))
+			goto trim;
+		if (tunnel_key_dump_addresses(skb, info))
+			goto trim;
+	}
+
+	tcf_tm_dump(&tm, &tk->tcf_tm);
+	if (nla_put_64bit(skb, TCA_TUNNEL_KEY_TM, sizeof(tm),
+			  &tm, TCA_TUNNEL_KEY_PAD))
+		goto trim;
+
+	len = skb->len;
+	rcu_read_unlock();
+
+	return len;
+
+trim:
+	/* discard everything this dump appended */
+	nlmsg_trim(skb, start);
+	rcu_read_unlock();
+
+	return -1;
+}
+
+/*
+ * Walk callback: hands the per-namespace tunnel_key action table of @net
+ * to tcf_generic_walker() and returns its result.
+ */
+static int tunnel_key_walker(struct net *net, struct sk_buff *skb,
+			     struct netlink_callback *cb, int type,
+			     const struct tc_action_ops *ops)
+{
+	return tcf_generic_walker(net_generic(net, tunnel_key_net_id),
+				  skb, cb, type, ops);
+}
+
+/*
+ * Lookup callback: searches the per-namespace tunnel_key action table of
+ * @net for @index via tcf_hash_search() and returns its result.
+ */
+static int tunnel_key_search(struct net *net, struct tc_action **a, u32 index)
+{
+	return tcf_hash_search(net_generic(net, tunnel_key_net_id), a, index);
+}
+
+/* Callback table registered for the "tunnel_key" action kind. */
+static struct tc_action_ops act_tunnel_key_ops = {
+	.owner   = THIS_MODULE,
+	.kind    = "tunnel_key",
+	.type    = TCA_ACT_TUNNEL_KEY,
+	.size    = sizeof(struct tcf_tunnel_key),
+	.init    = tunnel_key_init,
+	.act     = tunnel_key_act,
+	.dump    = tunnel_key_dump,
+	.cleanup = tunnel_key_release,
+	.walk    = tunnel_key_walker,
+	.lookup  = tunnel_key_search,
+};
+
+/*
+ * Per-namespace init: sets up the tunnel_key action table of @net with
+ * tc_action_net_init() and returns its result.
+ */
+static __net_init int tunnel_key_init_net(struct net *net)
+{
+	return tc_action_net_init(net_generic(net, tunnel_key_net_id),
+				  &act_tunnel_key_ops, TUNNEL_KEY_TAB_MASK);
+}
+
+/* Per-namespace exit: passes the tunnel_key action table of @net to tc_action_net_exit(). */
+static void __net_exit tunnel_key_exit_net(struct net *net)
+{
+	tc_action_net_exit(net_generic(net, tunnel_key_net_id));
+}
+
+/* Per-namespace hooks; .size reserves one struct tc_action_net per namespace. */
+static struct pernet_operations tunnel_key_net_ops = {
+	.id   = &tunnel_key_net_id,
+	.size = sizeof(struct tc_action_net),
+	.init = tunnel_key_init_net,
+	.exit = tunnel_key_exit_net,
+};
+
+/*
+ * Module init: registers the tunnel_key action ops together with its
+ * per-namespace operations.  Returns the result of tcf_register_action().
+ */
+static int __init tunnel_key_init_module(void)
+{
+	int rc = tcf_register_action(&act_tunnel_key_ops, &tunnel_key_net_ops);
+
+	return rc;
+}
+
+static void __exit tunnel_key_cleanup_module(void)	/* module exit: unregisters what tunnel_key_init_module() registered */
+{
+	tcf_unregister_action(&act_tunnel_key_ops, &tunnel_key_net_ops);
+}
+
+module_init(tunnel_key_init_module);	/* registers the action */
+module_exit(tunnel_key_cleanup_module);	/* unregisters the action */
+
+MODULE_AUTHOR("Amir Vadai <amir@vadai.me>");
+MODULE_DESCRIPTION("ip tunnel manipulation actions");
+MODULE_LICENSE("GPL v2");	/* NOTE(review): the file header grants "version 2 ... or any later version" - confirm this string matches the intended license */