diff mbox

[v3,net-next,1/1] net_sched: Introduce skbmod action

Message ID 1472386756-23085-1-git-send-email-jhs@emojatatu.com
State Changes Requested, archived
Delegated to: David Miller
Headers show

Commit Message

Jamal Hadi Salim Aug. 28, 2016, 12:19 p.m. UTC
From: Jamal Hadi Salim <jhs@mojatatu.com>

This action is intended to be an upgrade from a usability perspective
from pedit (as well as operational debugability).
Compare this:

sudo tc filter add dev $ETH parent 1: protocol ip prio 10 \
u32 match ip protocol 1 0xff flowid 1:2 \
action pedit munge offset -14 u8 set 0x02 \
munge offset -13 u8 set 0x15 \
munge offset -12 u8 set 0x15 \
munge offset -11 u8 set 0x15 \
munge offset -10 u16 set 0x1515 \
pipe

to:

sudo tc filter add dev $ETH parent 1: protocol ip prio 10 \
u32 match ip protocol 1 0xff flowid 1:2 \
action skbmod dmac 02:15:15:15:15:15

Also try to do a MAC address swap with pedit or worse
try to debug a policy with destination mac, source mac and
etherype. Then make few rules out of those and you'll get my point.

In the future common use cases on pedit can be migrated to this action
(as an example different fields in ip v4/6, transports like tcp/udp/sctp
etc). For this first cut, this allows modifying basic ethernet header.

Signed-off-by: Jamal Hadi Salim <jhs@mojatatu.com>
---
 include/net/tc_act/tc_skbmod.h        |  34 +++++
 include/uapi/linux/tc_act/tc_skbmod.h |  49 +++++++
 net/sched/Kconfig                     |  11 ++
 net/sched/Makefile                    |   1 +
 net/sched/act_skbmod.c                | 257 ++++++++++++++++++++++++++++++++++
 5 files changed, 352 insertions(+)
 create mode 100644 include/net/tc_act/tc_skbmod.h
 create mode 100644 include/uapi/linux/tc_act/tc_skbmod.h
 create mode 100644 net/sched/act_skbmod.c

Comments

Alexei Starovoitov Aug. 28, 2016, 2:48 p.m. UTC | #1
On Sun, Aug 28, 2016 at 08:19:16AM -0400, Jamal Hadi Salim wrote:
> --- /dev/null
> +++ b/include/uapi/linux/tc_act/tc_skbmod.h
> @@ -0,0 +1,49 @@
> +/*
> + * Copyright (c) 2016, Jamal Hadi Salim
> + *
> + * This program is free software; you can redistribute it and/or modify it
> + * under the terms and conditions of the GNU General Public License,
> + * version 2, as published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope it will be useful, but WITHOUT
> + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
> + * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
> + * more details.
> + *
> + * You should have received a copy of the GNU General Public License along with
> + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
> + * Place - Suite 330, Boston, MA 02111-1307 USA.

the address is incorrect.
That's why it's recommended to skip this section in all headers.
First paragraph in the above 'This program is ... GPL' would have been enough.
Eric Dumazet Aug. 28, 2016, 4:07 p.m. UTC | #2
On Sun, 2016-08-28 at 08:19 -0400, Jamal Hadi Salim wrote:
> From: Jamal Hadi Salim <jhs@mojatatu.com>

...

> +static int tcf_skbmod_run(struct sk_buff *skb, const struct tc_action *a,
> +			  struct tcf_result *res)
> +{
> +	struct tcf_skbmod *d = to_skbmod(a);
> +
> +	spin_lock(&d->tcf_lock);
> +	tcf_lastuse_update(&d->tcf_tm);
> +	bstats_update(&d->tcf_bstats, skb);
> +
> +	if (d->flags & SKBMOD_F_DMAC)
> +		ether_addr_copy(eth_hdr(skb)->h_dest, d->eth_dst);
> +	if (d->flags & SKBMOD_F_SMAC)
> +		ether_addr_copy(eth_hdr(skb)->h_source, d->eth_src);
> +	if (d->flags & SKBMOD_F_ETYPE)
> +		eth_hdr(skb)->h_proto = d->eth_type;
> +	if (d->flags & SKBMOD_F_SWAPMAC) {
> +		u8 tmpaddr[ETH_ALEN];
> +		/*XXX: I am sure we can come up with something more efficient */
> +		ether_addr_copy(tmpaddr, eth_hdr(skb)->h_dest);
> +		ether_addr_copy(eth_hdr(skb)->h_dest, eth_hdr(skb)->h_source);
> +		ether_addr_copy(eth_hdr(skb)->h_source, tmpaddr);
> +	}
> +
> +	spin_unlock(&d->tcf_lock);
> +	return d->tcf_action;
> +}


Adding an action with a spinlock held in fast path in 2016 is
a way to tell people : It is a toy, do not use it for real.

Sorry guys. Friends do not let friends do that anymore.
Cong Wang Aug. 29, 2016, 5:10 a.m. UTC | #3
On Sun, Aug 28, 2016 at 9:07 AM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
>
> Adding an action with a spinlock held in fast path in 2016 is
> a way to tell people : It is a toy, do not use it for real.
>
> Sorry guys. Friends do not let friends do that anymore.
>

Please stop joking, this is not funny at all:

% git grep tcf_lock -- net/sched
Jamal Hadi Salim Aug. 29, 2016, 10:35 a.m. UTC | #4
On 16-08-28 10:48 AM, Alexei Starovoitov wrote:
> On Sun, Aug 28, 2016 at 08:19:16AM -0400, Jamal Hadi Salim wrote:

> the address is incorrect.
> That's why it's recommended to skip this section in all headers.
> First paragraph in the above 'This program is ... GPL' would have been enough.
>

I am sure i cutnpasted that from some other file in net/sched. Will fix.

cheers,
jamal
Jamal Hadi Salim Aug. 29, 2016, 10:38 a.m. UTC | #5
On 16-08-28 12:07 PM, Eric Dumazet wrote:

> Adding an action with a spinlock held in fast path in 2016 is
> a way to tell people : It is a toy, do not use it for real.
>
> Sorry guys. Friends do not let friends do that anymore.

Generally agree.
Cong says he is working on generic patches. Let me see if i can
try something. Trickery with RCU is not one of my virtues -
so i may get it wrong but hope you can comment.

cheers,
jamal
Daniel Borkmann Aug. 29, 2016, 11 a.m. UTC | #6
On 08/29/2016 12:38 PM, Jamal Hadi Salim wrote:
> On 16-08-28 12:07 PM, Eric Dumazet wrote:
>
>> Adding an action with a spinlock held in fast path in 2016 is
>> a way to tell people : It is a toy, do not use it for real.
>>
>> Sorry guys. Friends do not let friends do that anymore.
>
> Generally agree.
> Cong says he is working on generic patches. Let me see if i can
> try something. Trickery with RCU is not one of my virtues -
> so i may get it wrong but hope you can comment.

You could probably just keep these meta data in a separate struct
managed by RCU that tcf_skbmod points to.

One more thing I commented on your last patch already is that you
would also need f.e. skb_ensure_writable() before mangling.
Jamal Hadi Salim Aug. 29, 2016, 11:35 a.m. UTC | #7
On 16-08-29 06:38 AM, Jamal Hadi Salim wrote:
> Let me see if i can
> try something. Trickery with RCU is not one of my virtues -
> so i may get it wrong but hope you can comment.

Something like attached? Compile tested.
I tried to emulate gact - but greping around I havent
seen sample code which uses  rcu_read_un/lock + synchronize_rcu
this way
[so be gentle if i got it wrong ;->]

I am not sure where Cong is going but probably introducing
a rcu head in each policy will allow to swap and free old one
with call_rcu may be a better approach. The cost may come in
slowing down replace operations.

cheers,
jamal
static int tcf_skbmod_run(struct sk_buff *skb, const struct tc_action *a,
			  struct tcf_result *res)
{
	struct tcf_skbmod *d = to_skbmod(a);
	int action = READ_ONCE(d->tcf_action);
	u64 flags = READ_ONCE(d->flags);

	tcf_lastuse_update(&d->tcf_tm);
	bstats_cpu_update(this_cpu_ptr(d->common.cpu_bstats), skb);
	if (unlikely(action == TC_ACT_SHOT)) {
		qstats_drop_inc(this_cpu_ptr(d->common.cpu_qstats));
		return action;
	}

	flags = READ_ONCE(d->flags);
	rcu_read_lock();
	if (flags & SKBMOD_F_DMAC)
		ether_addr_copy(eth_hdr(skb)->h_dest, d->eth_dst);
	if (flags & SKBMOD_F_SMAC)
		ether_addr_copy(eth_hdr(skb)->h_source, d->eth_src);
	if (flags & SKBMOD_F_ETYPE)
		eth_hdr(skb)->h_proto = d->eth_type;
	if (flags & SKBMOD_F_SWAPMAC) {
		u8 tmpaddr[ETH_ALEN];
		/*XXX: I am sure we can come up with something more efficient */
		ether_addr_copy(tmpaddr, eth_hdr(skb)->h_dest);
		ether_addr_copy(eth_hdr(skb)->h_dest, eth_hdr(skb)->h_source);
		ether_addr_copy(eth_hdr(skb)->h_source, tmpaddr);
	}
	rcu_read_unlock();

	return action;
}

static int tcf_skbmod_init(struct net *net, struct nlattr *nla,
			   struct nlattr *est, struct tc_action **a,
			   int ovr, int bind)
{
	struct tc_action_net *tn = net_generic(net, skbmod_net_id);
	struct nlattr *tb[TCA_SKBMOD_MAX + 1];
	struct tc_skbmod *parm;
	struct tcf_skbmod *d;
	u32 lflags = 0;
	u8 *daddr = NULL;
	u8 *saddr = NULL;
	u16 eth_type = 0;

	bool exists = false;
	int ret = 0, err;

	if (nla == NULL)
		return -EINVAL;

	err = nla_parse_nested(tb, TCA_SKBMOD_MAX, nla, skbmod_policy);
	if (err < 0)
		return err;

	if (tb[TCA_SKBMOD_PARMS] == NULL)
		return -EINVAL;

	if (tb[TCA_SKBMOD_DMAC]) {
		daddr = nla_data(tb[TCA_SKBMOD_DMAC]);
		lflags |= SKBMOD_F_DMAC;
	}

	if (tb[TCA_SKBMOD_SMAC]) {
		saddr = nla_data(tb[TCA_SKBMOD_SMAC]);
		lflags |= SKBMOD_F_SMAC;
	}

	if (tb[TCA_SKBMOD_ETYPE]) {
		eth_type = nla_get_u16(tb[TCA_SKBMOD_ETYPE]);
		lflags |= SKBMOD_F_ETYPE;
	}

	parm = nla_data(tb[TCA_SKBMOD_PARMS]);

	if (parm->flags & SKBMOD_F_SWAPMAC)
		lflags = SKBMOD_F_SWAPMAC;

	exists = tcf_hash_check(tn, parm->index, a, bind);
	if (exists && bind)
		return 0;

	if (!lflags) {
		return -EINVAL;
	}

	if (!exists) {
		ret = tcf_hash_create(tn, parm->index, est, a,
				      &act_skbmod_ops, bind, false);
		if (ret)
			return ret;

		d = to_skbmod(*a);
		ret = ACT_P_CREATED;
	} else {
		d = to_skbmod(*a);
		tcf_hash_release(*a, bind);
		if (!ovr)
			return -EEXIST;
	}

	ASSERT_RTNL();
	d->flags = lflags;
	d->tcf_action = parm->action;

	if (ovr)
		spin_lock_bh(&d->tcf_lock);

	if (lflags & SKBMOD_F_DMAC)
		ether_addr_copy(d->eth_dst, daddr);
	if (lflags & SKBMOD_F_SMAC)
		ether_addr_copy(d->eth_src, saddr);
	if (lflags & SKBMOD_F_ETYPE)
		d->eth_type = htons(eth_type);


	if (ovr) {
		spin_unlock_bh(&d->tcf_lock);
		synchronize_rcu();
	}

	if (ret == ACT_P_CREATED)
		tcf_hash_insert(tn, *a);
	return ret;
}
Jamal Hadi Salim Aug. 29, 2016, 11:40 a.m. UTC | #8
On 16-08-29 07:00 AM, Daniel Borkmann wrote:

> You could probably just keep these meta data in a separate struct
> managed by RCU that tcf_skbmod points to.
>

Which seem like could be made generic for all actions. Maybe thats
what Cong is thinking.

> One more thing I commented on your last patch already is that you
> would also need f.e. skb_ensure_writable() before mangling.


Sorry missed that. Let me give it an attempt. I think challenge is
going to be what length to use. For now it is ethernet; but i had
a change which swapped VLANs that i took out.

cheers,
jamal
Jamal Hadi Salim Aug. 29, 2016, 11:55 a.m. UTC | #9
On 16-08-29 07:40 AM, Jamal Hadi Salim wrote:
> On 16-08-29 07:00 AM, Daniel Borkmann wrote:

> Sorry missed that. Let me give it an attempt. I think challenge is
> going to be what length to use. For now it is ethernet; but i had
> a change which swapped VLANs that i took out.
>

something like this?

-----
/* XXX: if you are going to edit more fields beyond ethernet header
  * (example when you add IP header replacement or vlan swap)
  * then MAX_EDIT_LEN needs to change appropriately
*/
#define MAX_EDIT_LEN ETH_HLEN
static int tcf_skbmod_run(struct sk_buff *skb, const struct tc_action *a,
                           struct tcf_result *res)
{
         struct tcf_skbmod *d = to_skbmod(a);
         int action = READ_ONCE(d->tcf_action);
         int err;
         u64 flags;

         err = skb_ensure_writable(skb, ETH_HLEN);
         if (unlikely(err)) /* best policy is to drop on the floor */
                 action = TC_ACT_SHOT;

         tcf_lastuse_update(&d->tcf_tm);
         bstats_cpu_update(this_cpu_ptr(d->common.cpu_bstats), skb);
         if (unlikely(action == TC_ACT_SHOT)) {
                 qstats_drop_inc(this_cpu_ptr(d->common.cpu_qstats));
                 return action;
         }
----

cheers,
jamal
Eric Dumazet Aug. 29, 2016, 1:27 p.m. UTC | #10
On Sun, 2016-08-28 at 22:10 -0700, Cong Wang wrote:
> On Sun, Aug 28, 2016 at 9:07 AM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> >
> > Adding an action with a spinlock held in fast path in 2016 is
> > a way to tell people : It is a toy, do not use it for real.
> >
> > Sorry guys. Friends do not let friends do that anymore.
> >
> 
> Please stop joking, this is not funny at all:
> 
> % git grep tcf_lock -- net/sched

This is absolutely trivial to write this skbmod without taking a lock.

If you guys do not care about that, I will simply NACK any new lazy
code.

Do not try to call me funny, simply acknowledge the mere facts and try
to keep your calm Cong.
Eric Dumazet Aug. 29, 2016, 6:20 p.m. UTC | #11
On Mon, 2016-08-29 at 07:35 -0400, Jamal Hadi Salim wrote:
>        flags = READ_ONCE(d->flags);
>         rcu_read_lock();
>         if (flags & SKBMOD_F_DMAC)
>                 ether_addr_copy(eth_hdr(skb)->h_dest, d->eth_dst);
>         if (flags & SKBMOD_F_SMAC)
>                 ether_addr_copy(eth_hdr(skb)->h_source, d->eth_src);
>         if (flags & SKBMOD_F_ETYPE)
>                 eth_hdr(skb)->h_proto = d->eth_type;
>         if (flags & SKBMOD_F_SWAPMAC) {
>                 u8 tmpaddr[ETH_ALEN];
>                 /*XXX: I am sure we can come up with something more
> efficient */
>                 ether_addr_copy(tmpaddr, eth_hdr(skb)->h_dest);
>                 ether_addr_copy(eth_hdr(skb)->h_dest,
> eth_hdr(skb)->h_source);
>                 ether_addr_copy(eth_hdr(skb)->h_source, tmpaddr);
>         }
>         rcu_read_unlock();

You would need to store everything in an object, managed by rcu.

struct my_rcu_safe_struct {
     u32 flags;
     u8   eth_dst[ETH_ALEN];
     u8   eth_src[ETH_ALEN];
     __be16 eth_type;
};

And then allocate a new one when you need to update the infos (from
tcf_skbmod_init(())

RCU : Read Copy Update.

Then in the reader you would use 

rcu_read_lock();
myptr = rcu_dereference(d->ptr);
if (myptr) {
        if (myptr->flags & SKBMOD_F_DMAC)
                 ether_addr_copy(eth_hdr(skb)->h_dest, myptr->eth_dst);
         if (myptr->flags & SKBMOD_F_SMAC)
                 ether_addr_copy(eth_hdr(skb)->h_source, myptr->eth_src);
         if (myptr->flags & SKBMOD_F_ETYPE)
                 eth_hdr(skb)->h_proto = myptr->eth_type;
}
rcu_read_unlock();
Jamal Hadi Salim Aug. 30, 2016, 11:12 a.m. UTC | #12
On 16-08-29 02:20 PM, Eric Dumazet wrote:

>
> You would need to store everything in an object, managed by rcu.
>
> struct my_rcu_safe_struct {
>      u32 flags;
>      u8   eth_dst[ETH_ALEN];
>      u8   eth_src[ETH_ALEN];
>      __be16 eth_type;
> };
>
> And then allocate a new one when you need to update the infos (from
> tcf_skbmod_init(())
>
> RCU : Read Copy Update.
>
> Then in the reader you would use
>
> rcu_read_lock();
> myptr = rcu_dereference(d->ptr);
> if (myptr) {
>         if (myptr->flags & SKBMOD_F_DMAC)
>                  ether_addr_copy(eth_hdr(skb)->h_dest, myptr->eth_dst);
>          if (myptr->flags & SKBMOD_F_SMAC)
>                  ether_addr_copy(eth_hdr(skb)->h_source, myptr->eth_src);
>          if (myptr->flags & SKBMOD_F_ETYPE)
>                  eth_hdr(skb)->h_proto = myptr->eth_type;
> }
> rcu_read_unlock();

Thanks Eric.
This is the approach i thought Cong was going to take but in a way that
it applies to all actions.
It requires I do this extra allocation per update/create - I am not sure 
how much of a big deal that is (we take pride in our update rate).
Let me work and post something simple that captures these ideas
then wait to see what Cong has in mind for the general approach.

cheers,
jamal
Jamal Hadi Salim Aug. 30, 2016, 11:57 a.m. UTC | #13
On 16-08-30 07:12 AM, Jamal Hadi Salim wrote:

>
> Thanks Eric.
> This is the approach i thought Cong was going to take but in a way that
> it applies to all actions.
> It requires I do this extra allocation per update/create - I am not sure
> how much of a big deal that is (we take pride in our update rate).
> Let me work and post something simple that captures these ideas
> then wait to see what Cong has in mind for the general approach.

As attached here ... compile tested.

cheers,
jamal
From header file:
-----
struct tcf_skbmod_params {
        u64     flags; /*up to 64 types of operations; extend if needed */
        u8      eth_dst[ETH_ALEN];
        u16     eth_type;
        u8      eth_src[ETH_ALEN];
};

struct tcf_skbmod {
        struct tc_action        common;
        struct tcf_skbmod_params  *skbmod_p;
};
#define to_skbmod(a) ((struct tcf_skbmod *)a)
------

#define MAX_EDIT_LEN ETH_HLEN
static int tcf_skbmod_run(struct sk_buff *skb, const struct tc_action *a,
			  struct tcf_result *res)
{
	struct tcf_skbmod *d = to_skbmod(a);
	int action = READ_ONCE(d->tcf_action);
	struct tcf_skbmod_params *p;
	u64 flags;
	int err;

	/* XXX: if you are going to edit more fields beyond ethernet header
	 * (example when you add IP header replacement or vlan swap)
	 * then MAX_EDIT_LEN needs to change appropriately
	*/
	err = skb_ensure_writable(skb, ETH_HLEN);
	if (unlikely(err)) /* best policy is to drop on the floor */
		action = TC_ACT_SHOT;

	tcf_lastuse_update(&d->tcf_tm);
	bstats_cpu_update(this_cpu_ptr(d->common.cpu_bstats), skb);
	if (unlikely(action == TC_ACT_SHOT)) {
		qstats_drop_inc(this_cpu_ptr(d->common.cpu_qstats));
		return action;
	}

	rcu_read_lock();
	p = rcu_dereference(d->skbmod_p);
	flags = p->flags;
	if (flags & SKBMOD_F_DMAC)
		ether_addr_copy(eth_hdr(skb)->h_dest, p->eth_dst);
	if (flags & SKBMOD_F_SMAC)
		ether_addr_copy(eth_hdr(skb)->h_source, p->eth_src);
	if (flags & SKBMOD_F_ETYPE)
		eth_hdr(skb)->h_proto = p->eth_type;
	if (flags & SKBMOD_F_SWAPMAC) {
		u8 tmpaddr[ETH_ALEN];
		/*XXX: I am sure we can come up with something more efficient */
		ether_addr_copy(tmpaddr, eth_hdr(skb)->h_dest);
		ether_addr_copy(eth_hdr(skb)->h_dest, eth_hdr(skb)->h_source);
		ether_addr_copy(eth_hdr(skb)->h_source, tmpaddr);
	}
	rcu_read_unlock();

	return action;
}

static int tcf_skbmod_init(struct net *net, struct nlattr *nla,
			   struct nlattr *est, struct tc_action **a,
			   int ovr, int bind)
{
	struct tc_action_net *tn = net_generic(net, skbmod_net_id);
	struct nlattr *tb[TCA_SKBMOD_MAX + 1];
	struct tc_skbmod *parm;
	struct tcf_skbmod *d;
	struct tcf_skbmod_params *p, *p_old;
	u32 lflags = 0;
	u8 *daddr = NULL;
	u8 *saddr = NULL;
	u16 eth_type = 0;
	bool exists = false;
	int ret = 0, err;

	if (nla == NULL)
		return -EINVAL;

	err = nla_parse_nested(tb, TCA_SKBMOD_MAX, nla, skbmod_policy);
	if (err < 0)
		return err;

	if (tb[TCA_SKBMOD_PARMS] == NULL)
		return -EINVAL;

	if (tb[TCA_SKBMOD_DMAC]) {
		daddr = nla_data(tb[TCA_SKBMOD_DMAC]);
		lflags |= SKBMOD_F_DMAC;
	}

	if (tb[TCA_SKBMOD_SMAC]) {
		saddr = nla_data(tb[TCA_SKBMOD_SMAC]);
		lflags |= SKBMOD_F_SMAC;
	}

	if (tb[TCA_SKBMOD_ETYPE]) {
		eth_type = nla_get_u16(tb[TCA_SKBMOD_ETYPE]);
		lflags |= SKBMOD_F_ETYPE;
	}

	parm = nla_data(tb[TCA_SKBMOD_PARMS]);

	if (parm->flags & SKBMOD_F_SWAPMAC)
		lflags = SKBMOD_F_SWAPMAC;

	exists = tcf_hash_check(tn, parm->index, a, bind);
	if (exists && bind)
		return 0;

	if (!lflags) {
		return -EINVAL;
	}

	if (!exists) {
		ret = tcf_hash_create(tn, parm->index, est, a,
				      &act_skbmod_ops, bind, false);
		if (ret)
			return ret;

		d = to_skbmod(*a);
		ret = ACT_P_CREATED;
	} else {
		d = to_skbmod(*a);
		tcf_hash_release(*a, bind);
		if (!ovr)
			return -EEXIST;
	}

	ASSERT_RTNL();
	p = kzalloc(sizeof(struct tcf_skbmod_params), GFP_KERNEL);
	if (unlikely (!p)) {
		if (ovr)
			tcf_hash_release(*a, bind);
		return -ENOMEM;
	}

	p->flags = lflags;
	d->tcf_action = parm->action;

	p_old = rtnl_dereference(d->skbmod_p);

	if (ovr)
		spin_lock_bh(&d->tcf_lock);

	if (lflags & SKBMOD_F_DMAC)
		ether_addr_copy(p->eth_dst, daddr);
	if (lflags & SKBMOD_F_SMAC)
		ether_addr_copy(p->eth_src, saddr);
	if (lflags & SKBMOD_F_ETYPE)
		p->eth_type = htons(eth_type);

	rcu_assign_pointer(d->skbmod_p, p);
	if (ovr) {
		spin_unlock_bh(&d->tcf_lock);
		synchronize_rcu();
	}

	kfree(p_old);

	if (ret == ACT_P_CREATED)
		tcf_hash_insert(tn, *a);
	return ret;
}
Eric Dumazet Aug. 30, 2016, 12:20 p.m. UTC | #14
On Tue, 2016-08-30 at 07:12 -0400, Jamal Hadi Salim wrote:
> On 16-08-29 02:20 PM, Eric Dumazet wrote:
> 
> >
> > You would need to store everything in an object, managed by rcu.
> >
> > struct my_rcu_safe_struct {
> >      u32 flags;
> >      u8   eth_dst[ETH_ALEN];
> >      u8   eth_src[ETH_ALEN];
> >      __be16 eth_type;
> > };
> >
> > And then allocate a new one when you need to update the infos (from
> > tcf_skbmod_init(())
> >
> > RCU : Read Copy Update.
> >
> > Then in the reader you would use
> >
> > rcu_read_lock();
> > myptr = rcu_dereference(d->ptr);
> > if (myptr) {
> >         if (myptr->flags & SKBMOD_F_DMAC)
> >                  ether_addr_copy(eth_hdr(skb)->h_dest, myptr->eth_dst);
> >          if (myptr->flags & SKBMOD_F_SMAC)
> >                  ether_addr_copy(eth_hdr(skb)->h_source, myptr->eth_src);
> >          if (myptr->flags & SKBMOD_F_ETYPE)
> >                  eth_hdr(skb)->h_proto = myptr->eth_type;
> > }
> > rcu_read_unlock();
> 
> Thanks Eric.
> This is the approach i thought Cong was going to take but in a way that
> it applies to all actions.
> It requires I do this extra allocation per update/create - I am not sure 
> how much of a big deal that is (we take pride in our update rate).
> Let me work and post something simple that captures these ideas
> then wait to see what Cong has in mind for the general approach.


The percpu stats infra is already there, you have to change
tcf_hash_create() last parameter to 'true'.

For example you have to add in struct my_rcu_safe_struct a 'struct
rcu_head rcu;'  field to be able to use kfree_rcu() instead of
synchronize_rcu()

Some very simple actions do not even need rcu_dereference()

RCU have many different variants.

Sometimes READ_ONCE() and WRITE_ONCE() are enough, when all the 'flags'
can be in a single integer, and this avoids an extra dereference and
possible cache miss.
Eric Dumazet Aug. 30, 2016, 12:35 p.m. UTC | #15
On Tue, 2016-08-30 at 07:57 -0400, Jamal Hadi Salim wrote:
>         rcu_assign_pointer(d->skbmod_p, p);
>         if (ovr) {
>                 spin_unlock_bh(&d->tcf_lock);
>                 synchronize_rcu();
>         }
> 
>         kfree(p_old);

Overall patch looks good Jamal, thanks.


synchronize_rcu() might bee to expensive if you plan to change actions
hundred of times per second.

You could instead add a 'struct rcu_head rcu;'  field in struct
tcf_skbmod_params  (but make sure this is not exported to user space)

Then :

        if (ovr)
                 spin_unlock_bh(&d->tcf_lock);
         kfree_rcu(p_old, rcu);
Eric Dumazet Aug. 30, 2016, 12:44 p.m. UTC | #16
On Tue, 2016-08-30 at 07:57 -0400, Jamal Hadi Salim wrote:
>         if (flags & SKBMOD_F_SWAPMAC) {
>                 u8 tmpaddr[ETH_ALEN];
>                 /*XXX: I am sure we can come up with something more efficient */
>                 ether_addr_copy(tmpaddr, eth_hdr(skb)->h_dest);
>                 ether_addr_copy(eth_hdr(skb)->h_dest, eth_hdr(skb)->h_source);
>                 ether_addr_copy(eth_hdr(skb)->h_source, tmpaddr);
>         }

While ether_addr_copy() is accepting u8 pointers, data must be aligned to u16 at least.

(See comments in include/linux/etherdevice.h)

Some arches/compilers might do things here that would generate a trap
if tmpaddr is not aligned.
Jamal Hadi Salim Sept. 6, 2016, 12:03 p.m. UTC | #17
On 16-08-30 08:35 AM, Eric Dumazet wrote:

> synchronize_rcu() might bee to expensive if you plan to change actions
> hundred of times per second.
>
> You could instead add a 'struct rcu_head rcu;'  field in struct
> tcf_skbmod_params  (but make sure this is not exported to user space)
>
> Then :
>
>         if (ovr)
>                  spin_unlock_bh(&d->tcf_lock);
>          kfree_rcu(p_old, rcu);
>

Ok, working on this variant. Will post today or tommorow.

cheers,
jamal
Jamal Hadi Salim Sept. 6, 2016, 12:08 p.m. UTC | #18
ng
On 16-08-30 08:44 AM, Eric Dumazet wrote:
> On Tue, 2016-08-30 at 07:57 -0400, Jamal Hadi Salim wrote:
>>         if (flags & SKBMOD_F_SWAPMAC) {
>>                 u8 tmpaddr[ETH_ALEN];
>>                 /*XXX: I am sure we can come up with something more efficient */
>>                 ether_addr_copy(tmpaddr, eth_hdr(skb)->h_dest);
>>                 ether_addr_copy(eth_hdr(skb)->h_dest, eth_hdr(skb)->h_source);
>>                 ether_addr_copy(eth_hdr(skb)->h_source, tmpaddr);
>>         }
>
> While ether_addr_copy() is accepting u8 pointers, data must be aligned to u16 at least.
>
> (See comments in include/linux/etherdevice.h)
>
> Some arches/compilers might do things here that would generate a trap
> if tmpaddr is not aligned.
>

Hrm. How do you suggest dealing with this?

cheers,
jamal
Eric Dumazet Sept. 6, 2016, 2:25 p.m. UTC | #19
On Tue, 2016-09-06 at 08:08 -0400, Jamal Hadi Salim wrote:
> ng
> On 16-08-30 08:44 AM, Eric Dumazet wrote:
> > On Tue, 2016-08-30 at 07:57 -0400, Jamal Hadi Salim wrote:
> >>         if (flags & SKBMOD_F_SWAPMAC) {
> >>                 u8 tmpaddr[ETH_ALEN];
> >>                 /*XXX: I am sure we can come up with something more efficient */
> >>                 ether_addr_copy(tmpaddr, eth_hdr(skb)->h_dest);
> >>                 ether_addr_copy(eth_hdr(skb)->h_dest, eth_hdr(skb)->h_source);
> >>                 ether_addr_copy(eth_hdr(skb)->h_source, tmpaddr);
> >>         }
> >
> > While ether_addr_copy() is accepting u8 pointers, data must be aligned to u16 at least.
> >
> > (See comments in include/linux/etherdevice.h)
> >
> > Some arches/compilers might do things here that would generate a trap
> > if tmpaddr is not aligned.
> >
> 
> Hrm. How do you suggest dealing with this?

Just use u16 in the array ?

u16 tmpaddr[ETH_ALEN / 2]; /* ether_addr_copy() requirement */

ether_addr_copy((u8 *)tmpaddr, eth_hdr(skb)->h_dest);
...
Jamal Hadi Salim Sept. 8, 2016, 10:38 a.m. UTC | #20
On 16-09-06 10:25 AM, Eric Dumazet wrote:

>
> Just use u16 in the array ?
>
> u16 tmpaddr[ETH_ALEN / 2]; /* ether_addr_copy() requirement */
>
> ether_addr_copy((u8 *)tmpaddr, eth_hdr(skb)->h_dest);
> ...
>

Ok - thanks Eric. BTW:
Nobody has answered my other question:
I can get stats to increment with:
bstats_update(&d->tcf_bstats, skb);
but not:
bstats_cpu_update(this_cpu_ptr(d->common.cpu_bstats), skb);

Note, test is on a VM.

cheers,
jamal
Eric Dumazet Sept. 8, 2016, 4:02 p.m. UTC | #21
On Thu, 2016-09-08 at 06:38 -0400, Jamal Hadi Salim wrote:
> On 16-09-06 10:25 AM, Eric Dumazet wrote:
> 
> >
> > Just use u16 in the array ?
> >
> > u16 tmpaddr[ETH_ALEN / 2]; /* ether_addr_copy() requirement */
> >
> > ether_addr_copy((u8 *)tmpaddr, eth_hdr(skb)->h_dest);
> > ...
> >
> 
> Ok - thanks Eric. BTW:
> Nobody has answered my other question:
> I can get stats to increment with:
> bstats_update(&d->tcf_bstats, skb);
> but not:
> bstats_cpu_update(this_cpu_ptr(d->common.cpu_bstats), skb);

This simply works for me.

But maybe you read the stats at the wrong place ?

Pleas give us more details, thanks !

lpaa24:~# tc -s action list action mirred ; tc -s -d qd sh dev ifb10

	action order 0: mirred (Egress Redirect to device ifb10) stolen
 	index 3 ref 1 bind 1 installed 18 sec
 	Action statistics:
	Sent 6647111559 bytes 4433436 pkt (dropped 0, overlimits 14473 requeues 0) 
	backlog 0b 0p requeues 0 
qdisc mq 1: root 
 Sent 6394816791 bytes 4225732 pkt (dropped 88025, overlimits 0 requeues 0) 
 backlog 108069552b 71406p requeues 0 
qdisc netem 8016: parent 1:5 limit 100000 delay 50.0ms loss 2%
 Sent 776873149 bytes 513412 pkt (dropped 10766, overlimits 0 requeues 0) 
 rate 721506Kbit 59595pps backlog 6952420b 4594p requeues 0 
qdisc netem 8018: parent 1:7 limit 100000 delay 50.0ms loss 2%
 Sent 1290830041 bytes 852967 pkt (dropped 18497, overlimits 0 requeues 0) 
 rate 1246Mbit 102941pps backlog 76607984b 50616p requeues 0 
qdisc netem 8013: parent 1:2 limit 100000 delay 50.0ms loss 2%
 Sent 783039400 bytes 517411 pkt (dropped 10512, overlimits 0 requeues 0) 
 rate 471897Kbit 38979pps backlog 688870b 455p requeues 0 
qdisc netem 8015: parent 1:4 limit 100000 delay 50.0ms loss 2%
 Sent 715503442 bytes 472806 pkt (dropped 9734, overlimits 0 requeues 0) 
 rate 432373Kbit 35713pps backlog 1627550b 1075p requeues 0 
qdisc netem 8019: parent 1:8 limit 100000 delay 50.0ms loss 2%
 Sent 735599060 bytes 486064 pkt (dropped 9915, overlimits 0 requeues 0) 
 rate 441280Kbit 36448pps backlog 2747910b 1815p requeues 0 
qdisc netem 8017: parent 1:6 limit 100000 delay 50.0ms loss 2%
 Sent 755679663 bytes 499311 pkt (dropped 10130, overlimits 0 requeues 0) 
 rate 445158Kbit 36765pps backlog 2174290b 1439p requeues 0 
qdisc netem 8014: parent 1:3 limit 100000 delay 50.0ms loss 2%
 Sent 689812958 bytes 455833 pkt (dropped 9583, overlimits 0 requeues 0) 
 rate 566772Kbit 46812pps backlog 13988176b 9244p requeues 0 
qdisc netem 8012: parent 1:1 limit 100000 delay 50.0ms loss 2%
 Sent 647479078 bytes 427928 pkt (dropped 8888, overlimits 0 requeues 0) 
 rate 559420Kbit 46207pps backlog 3282352b 2168p requeues 0
Eric Dumazet Sept. 8, 2016, 4:11 p.m. UTC | #22
On Thu, 2016-09-08 at 09:02 -0700, Eric Dumazet wrote:
> On Thu, 2016-09-08 at 06:38 -0400, Jamal Hadi Salim wrote:
> > On 16-09-06 10:25 AM, Eric Dumazet wrote:
> > 
> > >
> > > Just use u16 in the array ?
> > >
> > > u16 tmpaddr[ETH_ALEN / 2]; /* ether_addr_copy() requirement */
> > >
> > > ether_addr_copy((u8 *)tmpaddr, eth_hdr(skb)->h_dest);
> > > ...
> > >
> > 
> > Ok - thanks Eric. BTW:
> > Nobody has answered my other question:
> > I can get stats to increment with:
> > bstats_update(&d->tcf_bstats, skb);
> > but not:
> > bstats_cpu_update(this_cpu_ptr(d->common.cpu_bstats), skb);
> 
> This simply works for me.
> 
> But maybe you read the stats at the wrong place ?
> 
> Pleas give us more details, thanks !

And double check the tcf_hash_create() call :

Last parameter is a boolean, and you must set it to true if you want per
cpu stats !
Jamal Hadi Salim Sept. 9, 2016, 12:33 p.m. UTC | #23
On 16-09-08 12:11 PM, Eric Dumazet wrote:
> On Thu, 2016-09-08 at 09:02 -0700, Eric Dumazet wrote:
>> On Thu, 2016-09-08 at 06:38 -0400, Jamal Hadi Salim wrote:
>>> On 16-09-06 10:25 AM, Eric Dumazet wrote:

>> This simply works for me.
>>
>> But maybe you read the stats at the wrong place ?
>>
>> Pleas give us more details, thanks !
>
> And double check the tcf_hash_create() call :
>
> Last parameter is a boolean, and you must set it to true if you want per
> cpu stats !
>

Dammit, yes that was it;-> Sigh. I shouldve caught it earlier.
Thanks! Now i feel more peaceful submitting the patch.

cheers,
jamal

>
diff mbox

Patch

diff --git a/include/net/tc_act/tc_skbmod.h b/include/net/tc_act/tc_skbmod.h
new file mode 100644
index 0000000..8ea0b25
--- /dev/null
+++ b/include/net/tc_act/tc_skbmod.h
@@ -0,0 +1,34 @@ 
+/*
+ * Copyright (c) 2016, Jamal Hadi Salim
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Jamal Hadi Salim <jhs@mojatatu.com>
+ */
+
+#ifndef __NET_TC_SKBMOD_H
+#define __NET_TC_SKBMOD_H
+
+#include <net/act_api.h>
+#include <linux/tc_act/tc_skbmod.h>
+
+struct tcf_skbmod {
+	struct tc_action	common;
+	u64	flags; /*up to 64 types of operations; extend if needed */
+	u8	eth_dst[ETH_ALEN];
+	u16	eth_type;
+	u8	eth_src[ETH_ALEN];
+};
+#define to_skbmod(a) ((struct tcf_skbmod *)a)
+
+#endif /* __NET_TC_SKBMOD_H */
diff --git a/include/uapi/linux/tc_act/tc_skbmod.h b/include/uapi/linux/tc_act/tc_skbmod.h
new file mode 100644
index 0000000..a1fdff5
--- /dev/null
+++ b/include/uapi/linux/tc_act/tc_skbmod.h
@@ -0,0 +1,49 @@ 
+/*
+ * Copyright (c) 2016, Jamal Hadi Salim
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * Author: Jamal Hadi Salim
+ */
+
+#ifndef __LINUX_TC_SKBMOD_H
+#define __LINUX_TC_SKBMOD_H
+
+#include <linux/pkt_cls.h>
+
+#define TCA_ACT_SKBMOD 15
+
+#define SKBMOD_F_DMAC	0x1
+#define SKBMOD_F_SMAC	0x2
+#define SKBMOD_F_ETYPE	0x4
+#define SKBMOD_F_SWAPMAC 0x8
+
+struct tc_skbmod {
+	tc_gen;
+	__u64 flags;
+};
+
+enum {
+	TCA_SKBMOD_UNSPEC,
+	TCA_SKBMOD_TM,
+	TCA_SKBMOD_PARMS,
+	TCA_SKBMOD_DMAC,
+	TCA_SKBMOD_SMAC,
+	TCA_SKBMOD_ETYPE,
+	TCA_SKBMOD_PAD,
+	__TCA_SKBMOD_MAX
+};
+#define TCA_SKBMOD_MAX (__TCA_SKBMOD_MAX - 1)
+
+#endif
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index ccf931b..34b556d 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -749,6 +749,17 @@  config NET_ACT_CONNMARK
 	  To compile this code as a module, choose M here: the
 	  module will be called act_connmark.
 
+config NET_ACT_SKBMOD
+        tristate "skb data modification action"
+        depends on NET_CLS_ACT
+        ---help---
+	  Say Y here to allow modification of skb data
+
+	  If unsure, say N.
+
+	  To compile this code as a module, choose M here: the
+	  module will be called act_skbmod.
+
 config NET_ACT_IFE
         tristate "Inter-FE action based on IETF ForCES InterFE LFB"
         depends on NET_CLS_ACT
diff --git a/net/sched/Makefile b/net/sched/Makefile
index ae088a5..e82eff8 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -19,6 +19,7 @@  obj-$(CONFIG_NET_ACT_CSUM)	+= act_csum.o
 obj-$(CONFIG_NET_ACT_VLAN)	+= act_vlan.o
 obj-$(CONFIG_NET_ACT_BPF)	+= act_bpf.o
 obj-$(CONFIG_NET_ACT_CONNMARK)	+= act_connmark.o
+obj-$(CONFIG_NET_ACT_SKBMOD)	+= act_skbmod.o
 obj-$(CONFIG_NET_ACT_IFE)	+= act_ife.o
 obj-$(CONFIG_NET_IFE_SKBMARK)	+= act_meta_mark.o
 obj-$(CONFIG_NET_IFE_SKBPRIO)	+= act_meta_skbprio.o
diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c
new file mode 100644
index 0000000..506ad28
--- /dev/null
+++ b/net/sched/act_skbmod.c
@@ -0,0 +1,257 @@ 
+/*
+ * copyright Jamal Hadi Salim (2016)
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Jamal Hadi Salim <jhs@mojatatu.com>
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+
+#include <linux/tc_act/tc_skbmod.h>
+#include <net/tc_act/tc_skbmod.h>
+
+#define SKBMOD_TAB_MASK     15
+
+static int skbmod_net_id;
+static struct tc_action_ops act_skbmod_ops;
+
+static int tcf_skbmod_run(struct sk_buff *skb, const struct tc_action *a,
+			  struct tcf_result *res)
+{
+	struct tcf_skbmod *d = to_skbmod(a);
+
+	spin_lock(&d->tcf_lock);
+	tcf_lastuse_update(&d->tcf_tm);
+	bstats_update(&d->tcf_bstats, skb);
+
+	if (d->flags & SKBMOD_F_DMAC)
+		ether_addr_copy(eth_hdr(skb)->h_dest, d->eth_dst);
+	if (d->flags & SKBMOD_F_SMAC)
+		ether_addr_copy(eth_hdr(skb)->h_source, d->eth_src);
+	if (d->flags & SKBMOD_F_ETYPE)
+		eth_hdr(skb)->h_proto = d->eth_type;
+	if (d->flags & SKBMOD_F_SWAPMAC) {
+		u8 tmpaddr[ETH_ALEN];
+		/*XXX: I am sure we can come up with something more efficient */
+		ether_addr_copy(tmpaddr, eth_hdr(skb)->h_dest);
+		ether_addr_copy(eth_hdr(skb)->h_dest, eth_hdr(skb)->h_source);
+		ether_addr_copy(eth_hdr(skb)->h_source, tmpaddr);
+	}
+
+	spin_unlock(&d->tcf_lock);
+	return d->tcf_action;
+}
+
+static const struct nla_policy skbmod_policy[TCA_SKBMOD_MAX + 1] = {
+	[TCA_SKBMOD_PARMS]		= { .len = sizeof(struct tc_skbmod) },
+	[TCA_SKBMOD_DMAC]		= { .len = ETH_ALEN },
+	[TCA_SKBMOD_SMAC]		= { .len = ETH_ALEN },
+	[TCA_SKBMOD_ETYPE]		= { .type = NLA_U16 },
+};
+
+static int tcf_skbmod_init(struct net *net, struct nlattr *nla,
+			   struct nlattr *est, struct tc_action **a,
+			   int ovr, int bind)
+{
+	struct tc_action_net *tn = net_generic(net, skbmod_net_id);
+	struct nlattr *tb[TCA_SKBMOD_MAX + 1];
+	struct tc_skbmod *parm;
+	struct tcf_skbmod *d;
+	u32 lflags = 0;
+	u8 *daddr = NULL;
+	u8 *saddr = NULL;
+	u16 eth_type = 0;
+
+	bool exists = false;
+	int ret = 0, err;
+
+	if (nla == NULL)
+		return -EINVAL;
+
+	err = nla_parse_nested(tb, TCA_SKBMOD_MAX, nla, skbmod_policy);
+	if (err < 0)
+		return err;
+
+	if (tb[TCA_SKBMOD_PARMS] == NULL)
+		return -EINVAL;
+
+	if (tb[TCA_SKBMOD_DMAC]) {
+		daddr = nla_data(tb[TCA_SKBMOD_DMAC]);
+		lflags |= SKBMOD_F_DMAC;
+	}
+
+	if (tb[TCA_SKBMOD_SMAC]) {
+		saddr = nla_data(tb[TCA_SKBMOD_SMAC]);
+		lflags |= SKBMOD_F_SMAC;
+	}
+
+	if (tb[TCA_SKBMOD_ETYPE]) {
+		eth_type = nla_get_u16(tb[TCA_SKBMOD_ETYPE]);
+		lflags |= SKBMOD_F_ETYPE;
+	}
+
+	parm = nla_data(tb[TCA_SKBMOD_PARMS]);
+
+	if (parm->flags & SKBMOD_F_SWAPMAC)
+		lflags = SKBMOD_F_SWAPMAC;
+
+	exists = tcf_hash_check(tn, parm->index, a, bind);
+	if (exists && bind)
+		return 0;
+
+	if (!lflags) {
+		return -EINVAL;
+	}
+
+	if (!exists) {
+		ret = tcf_hash_create(tn, parm->index, est, a,
+				      &act_skbmod_ops, bind, false);
+		if (ret)
+			return ret;
+
+		d = to_skbmod(*a);
+		ret = ACT_P_CREATED;
+	} else {
+		d = to_skbmod(*a);
+		tcf_hash_release(*a, bind);
+		if (!ovr)
+			return -EEXIST;
+	}
+
+	spin_lock_bh(&d->tcf_lock);
+
+	if (lflags & SKBMOD_F_DMAC)
+		ether_addr_copy(d->eth_dst, daddr);
+	if (lflags & SKBMOD_F_SMAC)
+		ether_addr_copy(d->eth_src, saddr);
+	if (lflags & SKBMOD_F_ETYPE)
+		d->eth_type = htons(eth_type);
+
+	d->flags = lflags;
+	d->tcf_action = parm->action;
+
+	spin_unlock_bh(&d->tcf_lock);
+
+	if (ret == ACT_P_CREATED)
+		tcf_hash_insert(tn, *a);
+	return ret;
+}
+
+static int tcf_skbmod_dump(struct sk_buff *skb, struct tc_action *a,
+			   int bind, int ref)
+{
+	unsigned char *b = skb_tail_pointer(skb);
+	struct tcf_skbmod *d = to_skbmod(a);
+	struct tc_skbmod opt = {
+		.index   = d->tcf_index,
+		.refcnt  = d->tcf_refcnt - ref,
+		.bindcnt = d->tcf_bindcnt - bind,
+		.action  = d->tcf_action,
+		.flags  = d->flags,
+	};
+	struct tcf_t t;
+
+	if (nla_put(skb, TCA_SKBMOD_PARMS, sizeof(opt), &opt))
+		goto nla_put_failure;
+	if ((d->flags & SKBMOD_F_DMAC) &&
+	    nla_put(skb, TCA_SKBMOD_DMAC, ETH_ALEN, d->eth_dst))
+		goto nla_put_failure;
+	if ((d->flags & SKBMOD_F_SMAC) &&
+	    nla_put(skb, TCA_SKBMOD_SMAC, ETH_ALEN, d->eth_src))
+		goto nla_put_failure;
+	if ((d->flags & SKBMOD_F_ETYPE) &&
+	    nla_put_u16(skb, TCA_SKBMOD_ETYPE, ntohs(d->eth_type)))
+		goto nla_put_failure;
+
+	tcf_tm_dump(&t, &d->tcf_tm);
+	if (nla_put_64bit(skb, TCA_SKBMOD_TM, sizeof(t), &t, TCA_SKBMOD_PAD))
+		goto nla_put_failure;
+	return skb->len;
+
+nla_put_failure:
+	nlmsg_trim(skb, b);
+	return -1;
+}
+
+static int tcf_skbmod_walker(struct net *net, struct sk_buff *skb,
+			     struct netlink_callback *cb, int type,
+			     const struct tc_action_ops *ops)
+{
+	struct tc_action_net *tn = net_generic(net, skbmod_net_id);
+
+	return tcf_generic_walker(tn, skb, cb, type, ops);
+}
+
+static int tcf_skbmod_search(struct net *net, struct tc_action **a, u32 index)
+{
+	struct tc_action_net *tn = net_generic(net, skbmod_net_id);
+
+	return tcf_hash_search(tn, a, index);
+}
+
+static struct tc_action_ops act_skbmod_ops = {
+	.kind		=	"skbmod",
+	.type		=	TCA_ACT_SKBMOD,
+	.owner		=	THIS_MODULE,
+	.act		=	tcf_skbmod_run,
+	.dump		=	tcf_skbmod_dump,
+	.init		=	tcf_skbmod_init,
+	.walk		=	tcf_skbmod_walker,
+	.lookup		=	tcf_skbmod_search,
+	.size		=	sizeof(struct tcf_skbmod),
+};
+
+static __net_init int skbmod_init_net(struct net *net)
+{
+	struct tc_action_net *tn = net_generic(net, skbmod_net_id);
+
+	return tc_action_net_init(tn, &act_skbmod_ops, SKBMOD_TAB_MASK);
+}
+
+static void __net_exit skbmod_exit_net(struct net *net)
+{
+	struct tc_action_net *tn = net_generic(net, skbmod_net_id);
+
+	tc_action_net_exit(tn);
+}
+
+static struct pernet_operations skbmod_net_ops = {
+	.init = skbmod_init_net,
+	.exit = skbmod_exit_net,
+	.id   = &skbmod_net_id,
+	.size = sizeof(struct tc_action_net),
+};
+
+MODULE_AUTHOR("Jamal Hadi Salim, <jhs@mojatatu.com>");
+MODULE_DESCRIPTION("SKB data mod-ing");
+MODULE_LICENSE("GPL");
+
+static int __init skbmod_init_module(void)
+{
+	return tcf_register_action(&act_skbmod_ops, &skbmod_net_ops);
+}
+
+static void __exit skbmod_cleanup_module(void)
+{
+	tcf_unregister_action(&act_skbmod_ops, &skbmod_net_ops);
+}
+
+module_init(skbmod_init_module);
+module_exit(skbmod_cleanup_module);