diff mbox

[net-next,RFC,v2,2/3] ipv4: add support for light weight tunnel encap attributes

Message ID 1434689355-4088-3-git-send-email-roopa@cumulusnetworks.com
State RFC, archived
Delegated to: David Miller
Headers show

Commit Message

Roopa Prabhu June 19, 2015, 4:49 a.m. UTC
From: Roopa Prabhu <roopa@cumulusnetworks.com>

Introduces two netlink attributes RTA_ENCAP_TYPE and
RTA_ENCAP to support attaching encap information to ipv4 routes.

RTA_ENCAP is a nested attribute as suggested by Thomas
(and also as Robert had it in his series). RTA_ENCAP
netlink policy is declared by the light weight tunnel
drivers that support this encap type.

fib code calls the following for each nexthop:
	- new route handler:
		lwt build state (that parses RTA_ENCAP and returns
	  	lwt state that lives in every fib_nh)
	- del dump hanlder:
		lwt release handler to release lwt state data
	- route dump hanlder:
		lwt dump encap to fill RTA_ENCAP data
	- during input route lookup
		sets dst->output to lwtunnel_output which
		in turn calls the corresponding lwt tunnel
		output function which applies the required
		encap and xmits the packet

Signed-off-by: Roopa Prabhu <roopa@cumulusnetworks.com>
---
 include/net/ip_fib.h           |    7 ++-
 include/net/route.h            |    3 ++
 include/uapi/linux/rtnetlink.h |    3 +-
 net/ipv4/fib_frontend.c        |    8 ++++
 net/ipv4/fib_semantics.c       |   93 +++++++++++++++++++++++++++++++++++++++-
 net/ipv4/route.c               |   33 +++++++++++++-
 6 files changed, 142 insertions(+), 5 deletions(-)

Comments

Julian Anastasov June 19, 2015, 6:59 a.m. UTC | #1
Hello,

On Thu, 18 Jun 2015, Roopa Prabhu wrote:

> @@ -366,6 +371,7 @@ static inline size_t fib_nlmsg_size(struct fib_info *fi)
>  	payload += nla_total_size((RTAX_MAX * nla_total_size(4)));
>  
>  	if (fi->fib_nhs) {
> +		size_t nh_encapsize = 0;

	Var not in #ifdef. Any warnings with CONFIG_LWTUNNEL=n?

>  		/* Also handles the special case fib_nhs == 1 */
>  
>  		/* each nexthop is packed in an attribute */
> @@ -374,8 +380,23 @@ static inline size_t fib_nlmsg_size(struct fib_info *fi)
>  		/* may contain flow and gateway attribute */
>  		nhsize += 2 * nla_total_size(4);
>  
> +#ifdef CONFIG_LWTUNNEL
> +		/* grab encap info */
> +		for_nexthops(fi) {
> +			if (nh->nh_lwtstate) {
> +				/* RTA_ENCAP_TYPE */
> +				nh_encapsize += lwtunnel_get_encap_size(
> +						nh->nh_lwtstate);

	New labels not in #ifdef:

> +
> +err_inval:
> +	ret = -EINVAL;
> +
> +errout:
> +	return ret;
>  }

	Some other places may need changes:

- nh_comp: there is logic that decides if same fib_info
is reused from many fib nodes. There should be check
if NH matches by nh_lwtstate.

- xfrm4_fill_dst: not sure about this but some fields
are copied.

Regards

--
Julian Anastasov <ja@ssi.bg>
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Roopa Prabhu June 19, 2015, 2:19 p.m. UTC | #2
On 6/18/15, 11:59 PM, Julian Anastasov wrote:
> 	Hello,
>
> On Thu, 18 Jun 2015, Roopa Prabhu wrote:
>
>> @@ -366,6 +371,7 @@ static inline size_t fib_nlmsg_size(struct fib_info *fi)
>>   	payload += nla_total_size((RTAX_MAX * nla_total_size(4)));
>>   
>>   	if (fi->fib_nhs) {
>> +		size_t nh_encapsize = 0;
> 	Var not in #ifdef. Any warnings with CONFIG_LWTUNNEL=n?
>
>>   		/* Also handles the special case fib_nhs == 1 */
>>   
>>   		/* each nexthop is packed in an attribute */
>> @@ -374,8 +380,23 @@ static inline size_t fib_nlmsg_size(struct fib_info *fi)
>>   		/* may contain flow and gateway attribute */
>>   		nhsize += 2 * nla_total_size(4);
>>   
>> +#ifdef CONFIG_LWTUNNEL
>> +		/* grab encap info */
>> +		for_nexthops(fi) {
>> +			if (nh->nh_lwtstate) {
>> +				/* RTA_ENCAP_TYPE */
>> +				nh_encapsize += lwtunnel_get_encap_size(
>> +						nh->nh_lwtstate);
> 	New labels not in #ifdef:
Will check and fix all warnings with CONFIG_LWTUNNEL off
>
>> +
>> +err_inval:
>> +	ret = -EINVAL;
>> +
>> +errout:
>> +	return ret;
>>   }
> 	Some other places may need changes:
>
> - nh_comp: there is logic that decides if same fib_info
> is reused from many fib nodes. There should be check
> if NH matches by nh_lwtstate.

yes, i will add that.
>
> - xfrm4_fill_dst: not sure about this but some fields
> are copied.
>
I have not picked up xfrm4_fill_dst specifically, but this infra is 
supposed to be similar to that.
I will look.

Thanks.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
Robert Shearman June 19, 2015, 2:55 p.m. UTC | #3
On 19/06/15 15:19, roopa wrote:
> On 6/18/15, 11:59 PM, Julian Anastasov wrote:
>>     Hello,
>>
>> On Thu, 18 Jun 2015, Roopa Prabhu wrote:
>>
>>> @@ -366,6 +371,7 @@ static inline size_t fib_nlmsg_size(struct
>>> fib_info *fi)
>>>       payload += nla_total_size((RTAX_MAX * nla_total_size(4)));
>>>       if (fi->fib_nhs) {
>>> +        size_t nh_encapsize = 0;
>>     Var not in #ifdef. Any warnings with CONFIG_LWTUNNEL=n?
>>
>>>           /* Also handles the special case fib_nhs == 1 */
>>>           /* each nexthop is packed in an attribute */
>>> @@ -374,8 +380,23 @@ static inline size_t fib_nlmsg_size(struct
>>> fib_info *fi)
>>>           /* may contain flow and gateway attribute */
>>>           nhsize += 2 * nla_total_size(4);
>>> +#ifdef CONFIG_LWTUNNEL
>>> +        /* grab encap info */
>>> +        for_nexthops(fi) {
>>> +            if (nh->nh_lwtstate) {
>>> +                /* RTA_ENCAP_TYPE */
>>> +                nh_encapsize += lwtunnel_get_encap_size(
>>> +                        nh->nh_lwtstate);
>>     New labels not in #ifdef:
> Will check and fix all warnings with CONFIG_LWTUNNEL off
>>
>>> +
>>> +err_inval:
>>> +    ret = -EINVAL;
>>> +
>>> +errout:
>>> +    return ret;
>>>   }
>>     Some other places may need changes:
>>
>> - nh_comp: there is logic that decides if same fib_info
>> is reused from many fib nodes. There should be check
>> if NH matches by nh_lwtstate.
>
> yes, i will add that.

One other place - fib_nh_match. This is used when deleting a route to 
verify that any supplied rtnetlink properties match the route in the fib.

Thanks,
Rob
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
Roopa Prabhu June 19, 2015, 3:15 p.m. UTC | #4
On 6/19/15, 7:55 AM, Robert Shearman wrote:
> On 19/06/15 15:19, roopa wrote:
>> On 6/18/15, 11:59 PM, Julian Anastasov wrote:
>>>       Some other places may need changes:
>>>
>>> - nh_comp: there is logic that decides if same fib_info
>>> is reused from many fib nodes. There should be check
>>> if NH matches by nh_lwtstate.
>>
>> yes, i will add that.
>
> One other place - fib_nh_match. This is used when deleting a route to 
> verify that any supplied rtnetlink properties match the route in the fib.
ack, thanks!.

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
Robert Shearman June 19, 2015, 3:19 p.m. UTC | #5
On 19/06/15 05:49, Roopa Prabhu wrote:
> From: Roopa Prabhu <roopa@cumulusnetworks.com>
>
> Introduces two netlink attributes RTA_ENCAP_TYPE and
> RTA_ENCAP to support attaching encap information to ipv4 routes.

Surely RTA_ENCAP_TYPE should be part of RTA_ENCAP, since the type 
doesn't make sense without the data and vice versa?

>
> RTA_ENCAP is a nested attribute as suggested by Thomas
> (and also as Robert had it in his series). RTA_ENCAP
> netlink policy is declared by the light weight tunnel
> drivers that support this encap type.
>
> fib code calls the following for each nexthop:
> 	- new route handler:
> 		lwt build state (that parses RTA_ENCAP and returns
> 	  	lwt state that lives in every fib_nh)
> 	- del dump hanlder:
> 		lwt release handler to release lwt state data
> 	- route dump hanlder:
> 		lwt dump encap to fill RTA_ENCAP data
> 	- during input route lookup
> 		sets dst->output to lwtunnel_output which
> 		in turn calls the corresponding lwt tunnel
> 		output function which applies the required
> 		encap and xmits the packet
>
> Signed-off-by: Roopa Prabhu <roopa@cumulusnetworks.com>

Thanks,
Rob
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
Roopa Prabhu June 19, 2015, 3:28 p.m. UTC | #6
On 6/19/15, 8:19 AM, Robert Shearman wrote:
> On 19/06/15 05:49, Roopa Prabhu wrote:
>> From: Roopa Prabhu <roopa@cumulusnetworks.com>
>>
>> Introduces two netlink attributes RTA_ENCAP_TYPE and
>> RTA_ENCAP to support attaching encap information to ipv4 routes.
>
> Surely RTA_ENCAP_TYPE should be part of RTA_ENCAP, since the type 
> doesn't make sense without the data and vice versa?
I went back and forth on this. And started with what you are saying 
above. But then I wanted RTA_ENCAP netlink policy to be declared by 
individual lwtunnel drivers.
And to determine which RTA_ENCAP netlink policy to pick, you need to 
know the RTA_ENCAP policy type (or lwtunnel type)
which is encoded in RTA_ENCAP_TYPE. And I did not want to introduce 
another level of nest in RTA_ENCAP (because for nexthops we are already 
2 levels deep when parsing RTA_ENCAP).

Hence, fib code first looks for RTA_ENCAP and if RTA_ENCAP is specified, 
RTA_ENCAP_TYPE is a required attribute. My iproute2 patches handles this 
and makes sure
there is an  RTA_ENCAP_TYPE specified with RTA_ENCAP.

thanks,
Roopa


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
Robert Shearman June 19, 2015, 5:17 p.m. UTC | #7
On 19/06/15 16:28, roopa wrote:
> On 6/19/15, 8:19 AM, Robert Shearman wrote:
>> On 19/06/15 05:49, Roopa Prabhu wrote:
>>> From: Roopa Prabhu <roopa@cumulusnetworks.com>
>>>
>>> Introduces two netlink attributes RTA_ENCAP_TYPE and
>>> RTA_ENCAP to support attaching encap information to ipv4 routes.
>>
>> Surely RTA_ENCAP_TYPE should be part of RTA_ENCAP, since the type
>> doesn't make sense without the data and vice versa?
> I went back and forth on this. And started with what you are saying
> above. But then I wanted RTA_ENCAP netlink policy to be declared by
> individual lwtunnel drivers.
> And to determine which RTA_ENCAP netlink policy to pick, you need to
> know the RTA_ENCAP policy type (or lwtunnel type)
> which is encoded in RTA_ENCAP_TYPE. And I did not want to introduce
> another level of nest in RTA_ENCAP (because for nexthops we are already
> 2 levels deep when parsing RTA_ENCAP).

No need for that - use the example of how RTA_MULTIPATH is used for 
ipv4/ipv6:

+----------------------+
| RTA_MULTIPATH        |
+----------------------+
| +------------------+ |
| | struct rtnexthop | |
| +------------------+ |
| | RTA_GATEWAY, etc.| |
| +------------------+ |
+----------------------+

You could do similar for RTA_ENCAP where the type is stored in the data 
prior to the nested attributes starting. E.g.:

+----------------------+
| RTA_ENCAP            |
+----------------------+
| +------------------+ |
| | struct rtencap   | |
| +------------------+ |
| | MPLS_IPTUNNEL_DST| |
| +------------------+ |
+----------------------+

struct rtencap {
	__u16 rte_type;
};

>
> Hence, fib code first looks for RTA_ENCAP and if RTA_ENCAP is specified,
> RTA_ENCAP_TYPE is a required attribute. My iproute2 patches handles this
> and makes sure
> there is an  RTA_ENCAP_TYPE specified with RTA_ENCAP.

No doubt, but surely it's better to present an unambiguous API to 
userspace if possible?

Thanks,
Rob
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
Roopa Prabhu June 19, 2015, 6:42 p.m. UTC | #8
On 6/19/15, 10:17 AM, Robert Shearman wrote:
>
> No need for that - use the example of how RTA_MULTIPATH is used for 
> ipv4/ipv6:
>
> +----------------------+
> | RTA_MULTIPATH        |
> +----------------------+
> | +------------------+ |
> | | struct rtnexthop | |
> | +------------------+ |
> | | RTA_GATEWAY, etc.| |
> | +------------------+ |
> +----------------------+
>
> You could do similar for RTA_ENCAP where the type is stored in the 
> data prior to the nested attributes starting. E.g.:
>
> +----------------------+
> | RTA_ENCAP            |
> +----------------------+
> | +------------------+ |
> | | struct rtencap   | |
> | +------------------+ |
> | | MPLS_IPTUNNEL_DST| |
> | +------------------+ |
> +----------------------+
>
> struct rtencap {
>     __u16 rte_type;
> };
I did think about that...but today the rtnextop seems like it was 
written a struct initially and then extended with attributes only 
because the struct could not be extended (I maybe wrong). But half the 
fields are in a struct and the others are attributes. It gets confusing.
And i was trying to avoid that.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
Thomas Graf June 21, 2015, 8:20 p.m. UTC | #9
On 06/18/15 at 09:49pm, Roopa Prabhu wrote:
> From: Roopa Prabhu <roopa@cumulusnetworks.com>
> 
> Introduces two netlink attributes RTA_ENCAP_TYPE and
> RTA_ENCAP to support attaching encap information to ipv4 routes.
> 
> RTA_ENCAP is a nested attribute as suggested by Thomas
> (and also as Robert had it in his series). RTA_ENCAP
> netlink policy is declared by the light weight tunnel
> drivers that support this encap type.
> 
> fib code calls the following for each nexthop:
> 	- new route handler:
> 		lwt build state (that parses RTA_ENCAP and returns
> 	  	lwt state that lives in every fib_nh)
> 	- del dump hanlder:
> 		lwt release handler to release lwt state data
> 	- route dump hanlder:
> 		lwt dump encap to fill RTA_ENCAP data
> 	- during input route lookup
> 		sets dst->output to lwtunnel_output which
> 		in turn calls the corresponding lwt tunnel
> 		output function which applies the required
> 		encap and xmits the packet

Thanks for putting in the flag! I think introducing helpers for the
lwt work would help as it can centralize the required ifdef magic
and be defined as a static inline nop or macros so the core routing
code doesn't need to have ifdefs spread around.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
Roopa Prabhu June 22, 2015, 2:30 a.m. UTC | #10
On 6/21/15, 1:20 PM, Thomas Graf wrote:
> On 06/18/15 at 09:49pm, Roopa Prabhu wrote:
>> From: Roopa Prabhu <roopa@cumulusnetworks.com>
>>
>> Introduces two netlink attributes RTA_ENCAP_TYPE and
>> RTA_ENCAP to support attaching encap information to ipv4 routes.
>>
>> RTA_ENCAP is a nested attribute as suggested by Thomas
>> (and also as Robert had it in his series). RTA_ENCAP
>> netlink policy is declared by the light weight tunnel
>> drivers that support this encap type.
>>
>> fib code calls the following for each nexthop:
>> 	- new route handler:
>> 		lwt build state (that parses RTA_ENCAP and returns
>> 	  	lwt state that lives in every fib_nh)
>> 	- del dump hanlder:
>> 		lwt release handler to release lwt state data
>> 	- route dump hanlder:
>> 		lwt dump encap to fill RTA_ENCAP data
>> 	- during input route lookup
>> 		sets dst->output to lwtunnel_output which
>> 		in turn calls the corresponding lwt tunnel
>> 		output function which applies the required
>> 		encap and xmits the packet
> Thanks for putting in the flag! I think introducing helpers for the
> lwt work would help as it can centralize the required ifdef magic
> and be defined as a static inline nop or macros so the core routing
> code doesn't need to have ifdefs spread around.
yes that would be better, let me see what I can do.

thanks,
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
Thomas Graf July 3, 2015, 10 a.m. UTC | #11
On 06/18/15 at 09:49pm, Roopa Prabhu wrote:
> +#ifdef CONFIG_LWTUNNEL
> +		if (fi->fib_nh->nh_lwtstate) {
> +			struct lwtunnel_state *lwtstate;
> +
> +			lwtstate = fi->fib_nh->nh_lwtstate;
> +			if (nla_put_u16(skb, RTA_ENCAP_TYPE, lwtstate->type))
> +				goto nla_put_failure;
> +			lwtunnel_fill_encap(skb, lwtstate);
> +		}
>  	}
> +#endif

Misplaced #endif ;-)

Other than that I managed to rebase my changes onto yours and it
looks clean.

Since we also discussed this a bit at NFWS, I'm enclosing a quick
summary:

 * Overall consensus that a lightweight flow based encapsulation
   makes sense.
 * Realization that what we actually want is stackable skb metadata
   between layers without over engineering it.
 * Consensus to avoid adding it to skb_shared_info and try to reuse
   the skb dst field.
 * New dst_metadata type similar to xfrm_dst which can carry metadata
   such as encapsulation instructions/information.
 * Can be made stackable to implement nested encapsulation if needed.
   Left out in the beginning to keep it simple.
 * Possible optimization option by putting the dst_metadata into a
   per cpu scratch buffer or stack without taking a reference and
   only force the reference & allocation when the skb is about to
   be queued. The regular fast path should never queue a skb with
   dst metadata attached.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Roopa Prabhu July 5, 2015, 6:21 a.m. UTC | #12
On 7/3/15, 3:00 AM, Thomas Graf wrote:
> On 06/18/15 at 09:49pm, Roopa Prabhu wrote:
>> +#ifdef CONFIG_LWTUNNEL
>> +		if (fi->fib_nh->nh_lwtstate) {
>> +			struct lwtunnel_state *lwtstate;
>> +
>> +			lwtstate = fi->fib_nh->nh_lwtstate;
>> +			if (nla_put_u16(skb, RTA_ENCAP_TYPE, lwtstate->type))
>> +				goto nla_put_failure;
>> +			lwtunnel_fill_encap(skb, lwtstate);
>> +		}
>>   	}
>> +#endif
> Misplaced #endif ;-)
Thx. I have fixed this since,...did not realize it came in as part of 
this RFC series.
>
> Other than that I managed to rebase my changes onto yours and it
> looks clean.
Glad to know!. thanks Thomas. I had a few more changes (mostly 
cleanup/bug fixes, ipv6 support and mostly earlier feedback from you)
in my local clone, pushed it to my github tree just now.
This also tries to not use CONFIG_LWTUNNEL all over the place. I had it 
that way initially also because of fib struct members
under #ifdef CONFIG_LWTUNNEL. (If we think at a later point that it is 
better to #ifdef CONFIG_LWTUNNEL fib struct members,
I can bring some of that back in). And, Only control path (rtnetlink) 
for ipv6 mpls iptunnels has been tested.

>
> Since we also discussed this a bit at NFWS, I'm enclosing a quick
> summary:
>
>   * Overall consensus that a lightweight flow based encapsulation
>     makes sense.
>   * Realization that what we actually want is stackable skb metadata
>     between layers without over engineering it.
>   * Consensus to avoid adding it to skb_shared_info and try to reuse
>     the skb dst field.
>   * New dst_metadata type similar to xfrm_dst which can carry metadata
>     such as encapsulation instructions/information.
>   * Can be made stackable to implement nested encapsulation if needed.
>     Left out in the beginning to keep it simple.
>   * Possible optimization option by putting the dst_metadata into a
>     per cpu scratch buffer or stack without taking a reference and
>     only force the reference & allocation when the skb is about to
>     be queued. The regular fast path should never queue a skb with
>     dst metadata attached.
Thanks for the summary. this helps.
I have been thinking of moving lwtstate from rtable to struct dst_entry.
I will also look at the dst_metadata.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Thomas Graf July 6, 2015, 1:03 p.m. UTC | #13
On 07/04/15 at 11:21pm, roopa wrote:
> Thx. I have fixed this since,...did not realize it came in as part of this
> RFC series.
> >
> >Other than that I managed to rebase my changes onto yours and it
> >looks clean.
> Glad to know!. thanks Thomas. I had a few more changes (mostly cleanup/bug
> fixes, ipv6 support and mostly earlier feedback from you)
> in my local clone, pushed it to my github tree just now.

> This also tries to not use CONFIG_LWTUNNEL all over the place. I had it that
> way initially also because of fib struct members

The integration into the routing code looks much better now. Any
chance you can rebase your tree and squash it into logical
commits? I'm rebasing again onto yours and I think we should submit
all of this in a single series so everything can be reviewed in a
single thread.

> under #ifdef CONFIG_LWTUNNEL. (If we think at a later point that it is
> better to #ifdef CONFIG_LWTUNNEL fib struct members,
> I can bring some of that back in). And, Only control path (rtnetlink) for
> ipv6 mpls iptunnels has been tested.

All of this is only useful if distros enable this by default which
minimize the value of a config option quite a bit. I think what you
have your latest tree is a good balance.

> I have been thinking of moving lwtstate from rtable to struct dst_entry.
> I will also look at the dst_metadata.

I thought about the same. The introduction of dst_metadata makes
that a bit pointless because we don't want to store a pointer to the
tunnel metadata but allocate the metadata as a dst directly to avoid
the indirection. We need the separate code paths anyway until v6 and
v4 routing are merged so we would save very little while penalizing
everyone except rtable and fib6.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Roopa Prabhu July 6, 2015, 3:24 p.m. UTC | #14
On 7/6/15, 6:03 AM, Thomas Graf wrote:
> The integration into the routing code looks much better now. Any
> chance you can rebase your tree and squash it into logical
> commits?
Will do today.
>   I'm rebasing again onto yours and I think we should submit
> all of this in a single series so everything can be reviewed in a
> single thread.
agreed.
>
> All of this is only useful if distros enable this by default which
> minimize the value of a config option quite a bit. I think what you
> have your latest tree is a good balance.
ok
>
>> I have been thinking of moving lwtstate from rtable to struct dst_entry.
>> I will also look at the dst_metadata.
> I thought about the same. The introduction of dst_metadata makes
> that a bit pointless because we don't want to store a pointer to the
> tunnel metadata but allocate the metadata as a dst directly to avoid
> the indirection. We need the separate code paths anyway until v6 and
> v4 routing are merged so we would save very little while penalizing
> everyone except rtable and fib6.
agreed,

thanks,
Roopa

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 54271ed..49f18d7 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -44,7 +44,9 @@  struct fib_config {
 	u32			fc_flow;
 	u32			fc_nlflags;
 	struct nl_info		fc_nlinfo;
- };
+	struct nlattr		*fc_encap;
+	u16			fc_encap_type;
+};
 
 struct fib_info;
 struct rtable;
@@ -89,6 +91,9 @@  struct fib_nh {
 	struct rtable __rcu * __percpu *nh_pcpu_rth_output;
 	struct rtable __rcu	*nh_rth_input;
 	struct fnhe_hash_bucket	__rcu *nh_exceptions;
+#ifdef CONFIG_LWTUNNEL
+	struct lwtunnel_state	*nh_lwtstate;
+#endif
 };
 
 /*
diff --git a/include/net/route.h b/include/net/route.h
index fe22d03..39a6495 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -66,6 +66,9 @@  struct rtable {
 
 	struct list_head	rt_uncached;
 	struct uncached_list	*rt_uncached_list;
+#ifdef CONFIG_LWTUNNEL
+	struct lwtunnel_state   *rt_lwtstate;
+#endif
 };
 
 static inline bool rt_is_input_route(const struct rtable *rt)
diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h
index 17fb02f..6c089ad 100644
--- a/include/uapi/linux/rtnetlink.h
+++ b/include/uapi/linux/rtnetlink.h
@@ -308,6 +308,8 @@  enum rtattr_type_t {
 	RTA_VIA,
 	RTA_NEWDST,
 	RTA_PREF,
+	RTA_ENCAP_TYPE,
+	RTA_ENCAP,
 	__RTA_MAX
 };
 
@@ -357,7 +359,6 @@  struct rtvia {
 };
 
 /* RTM_CACHEINFO */
-
 struct rta_cacheinfo {
 	__u32	rta_clntref;
 	__u32	rta_lastuse;
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 872494e..fbe0630 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -591,6 +591,8 @@  const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = {
 	[RTA_METRICS]		= { .type = NLA_NESTED },
 	[RTA_MULTIPATH]		= { .len = sizeof(struct rtnexthop) },
 	[RTA_FLOW]		= { .type = NLA_U32 },
+	[RTA_ENCAP_TYPE]	= { .type = NLA_U16 },
+	[RTA_ENCAP]		= { .type = NLA_NESTED },
 };
 
 static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
@@ -656,6 +658,12 @@  static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
 		case RTA_TABLE:
 			cfg->fc_table = nla_get_u32(attr);
 			break;
+		case RTA_ENCAP:
+			cfg->fc_encap = attr;
+			break;
+		case RTA_ENCAP_TYPE:
+			cfg->fc_encap_type = nla_get_u16(attr);
+			break;
 		}
 	}
 
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 28ec3c1..54dd287 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -42,6 +42,7 @@ 
 #include <net/ip_fib.h>
 #include <net/netlink.h>
 #include <net/nexthop.h>
+#include <net/lwtunnel.h>
 
 #include "fib_lookup.h"
 
@@ -208,6 +209,10 @@  static void free_fib_info_rcu(struct rcu_head *head)
 	change_nexthops(fi) {
 		if (nexthop_nh->nh_dev)
 			dev_put(nexthop_nh->nh_dev);
+#ifdef CONFIG_LWTUNNEL
+		if (nexthop_nh->nh_lwtstate)
+			lwtunnel_state_put(nexthop_nh->nh_lwtstate);
+#endif
 		free_nh_exceptions(nexthop_nh);
 		rt_fibinfo_free_cpus(nexthop_nh->nh_pcpu_rth_output);
 		rt_fibinfo_free(&nexthop_nh->nh_rth_input);
@@ -366,6 +371,7 @@  static inline size_t fib_nlmsg_size(struct fib_info *fi)
 	payload += nla_total_size((RTAX_MAX * nla_total_size(4)));
 
 	if (fi->fib_nhs) {
+		size_t nh_encapsize = 0;
 		/* Also handles the special case fib_nhs == 1 */
 
 		/* each nexthop is packed in an attribute */
@@ -374,8 +380,23 @@  static inline size_t fib_nlmsg_size(struct fib_info *fi)
 		/* may contain flow and gateway attribute */
 		nhsize += 2 * nla_total_size(4);
 
+#ifdef CONFIG_LWTUNNEL
+		/* grab encap info */
+		for_nexthops(fi) {
+			if (nh->nh_lwtstate) {
+				/* RTA_ENCAP_TYPE */
+				nh_encapsize += lwtunnel_get_encap_size(
+						nh->nh_lwtstate);
+				/* RTA_ENCAP */
+				nh_encapsize +=  nla_total_size(2);
+			}
+		} endfor_nexthops(fi);
+#endif
+
 		/* all nexthops are packed in a nested attribute */
-		payload += nla_total_size(fi->fib_nhs * nhsize);
+		payload += nla_total_size((fi->fib_nhs * nhsize) +
+					  nh_encapsize);
+
 	}
 
 	return payload;
@@ -452,6 +473,9 @@  static int fib_count_nexthops(struct rtnexthop *rtnh, int remaining)
 static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
 		       int remaining, struct fib_config *cfg)
 {
+	struct net *net = cfg->fc_nlinfo.nl_net;
+	int ret;
+
 	change_nexthops(fi) {
 		int attrlen;
 
@@ -475,12 +499,40 @@  static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
 			if (nexthop_nh->nh_tclassid)
 				fi->fib_net->ipv4.fib_num_tclassid_users++;
 #endif
+#ifdef CONFIG_LWTUNNEL
+			nla = nla_find(attrs, attrlen, RTA_ENCAP);
+			if (nla) {
+				struct lwtunnel_state *lwtstate;
+				struct net_device *dev = NULL;
+				struct nlattr *nla_entype;
+
+				nla_entype = nla_find(attrs, attrlen,
+						      RTA_ENCAP_TYPE);
+				if (!nla_entype)
+					goto err_inval;
+				if (cfg->fc_oif)
+					dev = __dev_get_by_index(net, cfg->fc_oif);
+				ret = lwtunnel_build_state(dev, nla_get_u16(
+							   nla_entype),
+							   nla, &lwtstate);
+				if (ret)
+					goto errout;
+				lwtunnel_state_get(lwtstate);
+				nexthop_nh->nh_lwtstate = lwtstate;
+			}
+#endif
 		}
 
 		rtnh = rtnh_next(rtnh, &remaining);
 	} endfor_nexthops(fi);
 
 	return 0;
+
+err_inval:
+	ret = -EINVAL;
+
+errout:
+	return ret;
 }
 
 #endif
@@ -875,6 +927,25 @@  struct fib_info *fib_create_info(struct fib_config *cfg)
 	} else {
 		struct fib_nh *nh = fi->fib_nh;
 
+#ifdef CONFIG_LWTUNNEL
+		if (cfg->fc_encap) {
+			struct lwtunnel_state *lwtstate;
+			struct net_device *dev = NULL;
+			int ret;
+
+			if (cfg->fc_encap_type == LWTUNNEL_ENCAP_NONE)
+				goto err_inval;
+			if (cfg->fc_oif)
+				dev = __dev_get_by_index(net, cfg->fc_oif);
+			ret = lwtunnel_build_state(dev, cfg->fc_encap_type,
+						   cfg->fc_encap, &lwtstate);
+			if (ret)
+				goto err_inval;
+
+			lwtunnel_state_get(lwtstate);
+			nh->nh_lwtstate = lwtstate;
+		}
+#endif
 		nh->nh_oif = cfg->fc_oif;
 		nh->nh_gw = cfg->fc_gw;
 		nh->nh_flags = cfg->fc_flags;
@@ -1034,7 +1105,17 @@  int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
 		    nla_put_u32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid))
 			goto nla_put_failure;
 #endif
+#ifdef CONFIG_LWTUNNEL
+		if (fi->fib_nh->nh_lwtstate) {
+			struct lwtunnel_state *lwtstate;
+
+			lwtstate = fi->fib_nh->nh_lwtstate;
+			if (nla_put_u16(skb, RTA_ENCAP_TYPE, lwtstate->type))
+				goto nla_put_failure;
+			lwtunnel_fill_encap(skb, lwtstate);
+		}
 	}
+#endif
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
 	if (fi->fib_nhs > 1) {
 		struct rtnexthop *rtnh;
@@ -1061,6 +1142,16 @@  int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
 			    nla_put_u32(skb, RTA_FLOW, nh->nh_tclassid))
 				goto nla_put_failure;
 #endif
+#ifdef CONFIG_LWTUNNEL
+			if (nh->nh_lwtstate) {
+				struct lwtunnel_state *lwtstate;
+
+				lwtstate = nh->nh_lwtstate;
+				if (nla_put_u16(skb, RTA_ENCAP_TYPE, lwtstate->type))
+					goto nla_put_failure;
+				lwtunnel_fill_encap(skb, lwtstate);
+			}
+#endif
 			/* length of rtnetlink header + attributes */
 			rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *) rtnh;
 		} endfor_nexthops(fi);
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index f605598..c6549f9 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -102,6 +102,7 @@ 
 #include <net/tcp.h>
 #include <net/icmp.h>
 #include <net/xfrm.h>
+#include <net/lwtunnel.h>
 #include <net/netevent.h>
 #include <net/rtnetlink.h>
 #ifdef CONFIG_SYSCTL
@@ -1355,6 +1356,9 @@  static void ipv4_dst_destroy(struct dst_entry *dst)
 		list_del(&rt->rt_uncached);
 		spin_unlock_bh(&ul->lock);
 	}
+#ifdef CONFIG_LWTUNNEL
+	lwtunnel_state_put(rt->rt_lwtstate);
+#endif
 }
 
 void rt_flush_dev(struct net_device *dev)
@@ -1403,6 +1407,14 @@  static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
 #ifdef CONFIG_IP_ROUTE_CLASSID
 		rt->dst.tclassid = nh->nh_tclassid;
 #endif
+#ifdef CONFIG_LWTUNNEL
+		if (nh->nh_lwtstate) {
+			lwtunnel_state_get(nh->nh_lwtstate);
+			rt->rt_lwtstate = nh->nh_lwtstate;
+		} else {
+			rt->rt_lwtstate = NULL;
+		}
+#endif
 		if (unlikely(fnhe))
 			cached = rt_bind_exception(rt, fnhe, daddr);
 		else if (!(rt->dst.flags & DST_NOCACHE))
@@ -1488,6 +1500,9 @@  static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 	rth->rt_gateway	= 0;
 	rth->rt_uses_gateway = 0;
 	INIT_LIST_HEAD(&rth->rt_uncached);
+#ifdef CONFIG_LWTUNNEL
+	rth->rt_lwtstate = NULL;
+#endif
 	if (our) {
 		rth->dst.input= ip_local_deliver;
 		rth->rt_flags |= RTCF_LOCAL;
@@ -1618,12 +1633,19 @@  static int __mkroute_input(struct sk_buff *skb,
 	rth->rt_gateway	= 0;
 	rth->rt_uses_gateway = 0;
 	INIT_LIST_HEAD(&rth->rt_uncached);
+#ifdef CONFIG_LWTUNNEL
+	rth->rt_lwtstate = NULL;
+#endif
 	RT_CACHE_STAT_INC(in_slow_tot);
 
 	rth->dst.input = ip_forward;
 	rth->dst.output = ip_output;
 
 	rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag);
+#ifdef CONFIG_LWTUNNEL
+	if (lwtunnel_output_redirect(rth->rt_lwtstate))
+		rth->dst.output = lwtunnel_output;
+#endif
 	skb_dst_set(skb, &rth->dst);
 out:
 	err = 0;
@@ -1792,6 +1814,9 @@  local_input:
 	rth->rt_gateway	= 0;
 	rth->rt_uses_gateway = 0;
 	INIT_LIST_HEAD(&rth->rt_uncached);
+#ifdef CONFIG_LWTUNNEL
+	rth->rt_lwtstate = NULL;
+#endif
 	RT_CACHE_STAT_INC(in_slow_tot);
 	if (res.type == RTN_UNREACHABLE) {
 		rth->dst.input= ip_error;
@@ -1981,7 +2006,9 @@  add:
 	rth->rt_gateway = 0;
 	rth->rt_uses_gateway = 0;
 	INIT_LIST_HEAD(&rth->rt_uncached);
-
+#ifdef CONFIG_LWTUNNEL
+	rth->rt_lwtstate = NULL;
+#endif
 	RT_CACHE_STAT_INC(out_slow_tot);
 
 	if (flags & RTCF_LOCAL)
@@ -2261,7 +2288,9 @@  struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or
 		rt->rt_uses_gateway = ort->rt_uses_gateway;
 
 		INIT_LIST_HEAD(&rt->rt_uncached);
-
+#ifdef CONFIG_LWTUNNEL
+		rt->rt_lwtstate = NULL;
+#endif
 		dst_free(new);
 	}