diff mbox

[net-next] mpls: allow TTL propagation to/from IP packets to be configured

Message ID 1485808575-16852-1-git-send-email-rshearma@brocade.com
State Changes Requested, archived
Delegated to: David Miller
Headers show

Commit Message

Robert Shearman Jan. 30, 2017, 8:36 p.m. UTC
It is sometimes desirable to present an MPLS transport network as a
single hop to traffic transiting it because it prevents confusion when
diagnosing failures. An example of where confusion can be generated is
when addresses used in the provider network overlap with addresses in
the overlay network and the addresses get exposed through ICMP errors
generated as packets transit the provider network.

Therefore, provide the ability to control whether the TTL value from
an MPLS packet is propagated to an IPv4/IPv6 packet when the last
label is popped through the addition of a new per-namespace sysctl:
"net.mpls.ip_ttl_propagate" which defaults to enabled.

Use the same sysctl to control whether the TTL is propagated from IP
packets into the MPLS header. If the TTL isn't propagated then a
default TTL value is used which can be configured via a new sysctl:
"net.mpls.default_ttl".

Signed-off-by: Robert Shearman <rshearma@brocade.com>
---
 Documentation/networking/mpls-sysctl.txt | 19 +++++++++
 include/net/netns/mpls.h                 |  3 ++
 net/mpls/af_mpls.c                       | 70 ++++++++++++++++++++++++--------
 net/mpls/mpls_iptunnel.c                 | 12 +++++-
 4 files changed, 85 insertions(+), 19 deletions(-)

Comments

Eric W. Biederman Jan. 31, 2017, 12:17 a.m. UTC | #1
Robert Shearman <rshearma@brocade.com> writes:

> It is sometimes desirable to present an MPLS transport network as a
> single hop to traffic transiting it because it prevents confusion when
> diagnosing failures. An example of where confusion can be generated is
> when addresses used in the provider network overlap with addresses in
> the overlay network and the addresses get exposed through ICMP errors
> generated as packets transit the provider network.
>
> Therefore, provide the ability to control whether the TTL value from
> an MPLS packet is propagated to an IPv4/IPv6 packet when the last
> label is popped through the addition of a new per-namespace sysctl:
> "net.mpls.ip_ttl_propagate" which defaults to enabled.
>
> Use the same sysctl to control whether the TTL is propagated from IP
> packets into the MPLS header. If the TTL isn't propagated then a
> default TTL value is used which can be configured via a new sysctl:
> "net.mpls.default_ttl".

Instead of having a global sysctl can we please have a different way
to configure the ingress/egress?

My general memory is that this makes sense for a slightly different
tunnel type.   Making it a per mpls tunnel property instead of global
property feels like it should be much more maintainable.

Similarly with the related behavior of what to do if the mpls ttl is
exhausted during the trip through the tunnel.  Drop or dig through the
packet and send an ICMP error message at the ip layer.

Eric
David Ahern Jan. 31, 2017, 12:41 a.m. UTC | #2
On 1/30/17 1:36 PM, Robert Shearman wrote:
> It is sometimes desirable to present an MPLS transport network as a
> single hop to traffic transiting it because it prevents confusion when
> diagnosing failures. An example of where confusion can be generated is
> when addresses used in the provider network overlap with addresses in
> the overlay network and the addresses get exposed through ICMP errors
> generated as packets transit the provider network.
> 
> Therefore, provide the ability to control whether the TTL value from
> an MPLS packet is propagated to an IPv4/IPv6 packet when the last
> label is popped through the addition of a new per-namespace sysctl:
> "net.mpls.ip_ttl_propagate" which defaults to enabled.
> 
> Use the same sysctl to control whether the TTL is propagated from IP
> packets into the MPLS header. If the TTL isn't propagated then a
> default TTL value is used which can be configured via a new sysctl:
> "net.mpls.default_ttl".
> 
> Signed-off-by: Robert Shearman <rshearma@brocade.com>
> ---
>  Documentation/networking/mpls-sysctl.txt | 19 +++++++++
>  include/net/netns/mpls.h                 |  3 ++
>  net/mpls/af_mpls.c                       | 70 ++++++++++++++++++++++++--------
>  net/mpls/mpls_iptunnel.c                 | 12 +++++-
>  4 files changed, 85 insertions(+), 19 deletions(-)
> 
> diff --git a/Documentation/networking/mpls-sysctl.txt b/Documentation/networking/mpls-sysctl.txt
> index 15d8d16934fd..b8f0725ff09e 100644
> --- a/Documentation/networking/mpls-sysctl.txt
> +++ b/Documentation/networking/mpls-sysctl.txt
> @@ -19,6 +19,25 @@ platform_labels - INTEGER
>  	Possible values: 0 - 1048575
>  	Default: 0
>  
> +ip_ttl_propagate - BOOL
> +	Control whether TTL is propagated from the IPv4/IPv6 header to
> +	the MPLS header on imposing labels and propagated from the
> +	MPLS header to the IPv4/IPv6 header on popping the last label.
> +
> +	If disabled, the MPLS transport network will appear as a
> +	single hop to transit traffic.
> +
> +	0 - disabled
> +	1 - enabled (default)
> +

It seems like you are going after RFC 3443 with this change. Can you add a comment to that effect? i.e., ip_ttl_propagate enabled is the Uniform Model and ip_ttl_propagate disabled is the Short Pipe Model.
David Ahern Jan. 31, 2017, 1:09 a.m. UTC | #3
On 1/30/17 1:36 PM, Robert Shearman wrote:
> @@ -243,24 +245,29 @@ static bool mpls_egress(struct mpls_route *rt, struct sk_buff *skb,
>  		payload_type = ip_hdr(skb)->version;
>  
>  	switch (payload_type) {
> -	case MPT_IPV4: {
> -		struct iphdr *hdr4 = ip_hdr(skb);
> -		skb->protocol = htons(ETH_P_IP);
> -		csum_replace2(&hdr4->check,
> -			      htons(hdr4->ttl << 8),
> -			      htons(dec.ttl << 8));
> -		hdr4->ttl = dec.ttl;
> +	case MPT_IPV4:
> +		if (net->mpls.ip_ttl_propagate) {
> +			struct iphdr *hdr4 = ip_hdr(skb);
> +
> +			skb->protocol = htons(ETH_P_IP);

The protocol setting here and ...

> +			csum_replace2(&hdr4->check,
> +				      htons(hdr4->ttl << 8),
> +				      htons(dec.ttl << 8));
> +			hdr4->ttl = dec.ttl;
> +		}
>  		success = true;
>  		break;
> -	}
> -	case MPT_IPV6: {
> -		struct ipv6hdr *hdr6 = ipv6_hdr(skb);
> -		skb->protocol = htons(ETH_P_IPV6);
> -		hdr6->hop_limit = dec.ttl;
> +	case MPT_IPV6:
> +		if (net->mpls.ip_ttl_propagate) {
> +			struct ipv6hdr *hdr6 = ipv6_hdr(skb);
> +
> +			skb->protocol = htons(ETH_P_IPV6);

here need to be done outside of net->mpls.ip_ttl_propagate otherwise ...

> +			hdr6->hop_limit = dec.ttl;
> +		}
>  		success = true;
>  		break;
> -	}
>  	case MPT_UNSPEC:
> +		/* Should have decided which protocol it is by now */
>  		break;
>  	}
>  

disabling ip_ttl_propagate causes a corrupted packet to show up at the end host (after the LSP):

IP4:
16:54:08.895372 46:a9:1c:9f:30:ba > fa:61:57:d6:1a:7d, ethertype MPLS unicast (0x8847), length 98: MPLS (label 282624, exp 0, ttl 84)
	(label 433380, exp 0, ttl 0)
	(label 262160, exp 7, [S], ttl 182)
	0x0000:  ac10 0101 0a0a 0a0a 0800 1677 05d3 0001  ...........w....
	0x0010:  30e0 8f58 0000 0000 4fa9 0d00 0000 0000  0..X....O.......
	0x0020:  1011 1213 1415 1617 1819 1a1b 1c1d 1e1f  ................
	0x0030:  2021 2223 2425 2627 2829 2a2b 2c2d 2e2f  .!"#$%&'()*+,-./
	0x0040:  3031 3233 3435 3637                      01234567


IPv6:
16:57:40.517520 46:a9:1c:9f:30:ba > fa:61:57:d6:1a:7d, ethertype MPLS unicast (0x8847), length 118: MPLS (label 393290, exp 5, ttl 240)
	(label 1027, exp 5, ttl 64)
	(label 131072, exp 0, ttl 1)
	(label 0 (IPv4 explicit NULL), exp 0, ttl 0)
	(label 0 (IPv4 explicit NULL), exp 0, ttl 0)
	(label 0 (IPv4 explicit NULL), exp 0, ttl 1)
	(label 196608, exp 0, ttl 1)
	(label 0 (IPv4 explicit NULL), exp 0, ttl 0)
	(label 0 (IPv4 explicit NULL), exp 0, ttl 0)
	(label 0 (IPv4 explicit NULL), exp 0, ttl 1)
	(label 524297, exp 1, [S], ttl 102)
	0x0000:  0628 0001 04e1 8f58 0000 0000 3be5 0700  .(.....X....;...
	0x0010:  0000 0000 1011 1213 1415 1617 1819 1a1b  ................
	0x0020:  1c1d 1e1f 2021 2223 2425 2627 2829 2a2b  .....!"#$%&'()*+
	0x0030:  2c2d 2e2f 3031 3233 3435 3637            ,-./01234567
Robert Shearman Jan. 31, 2017, 11:59 a.m. UTC | #4
On 31/01/17 00:17, Eric W. Biederman wrote:
> Robert Shearman <rshearma@brocade.com> writes:
>
>> It is sometimes desirable to present an MPLS transport network as a
>> single hop to traffic transiting it because it prevents confusion when
>> diagnosing failures. An example of where confusion can be generated is
>> when addresses used in the provider network overlap with addresses in
>> the overlay network and the addresses get exposed through ICMP errors
>> generated as packets transit the provider network.
>>
>> Therefore, provide the ability to control whether the TTL value from
>> an MPLS packet is propagated to an IPv4/IPv6 packet when the last
>> label is popped through the addition of a new per-namespace sysctl:
>> "net.mpls.ip_ttl_propagate" which defaults to enabled.
>>
>> Use the same sysctl to control whether the TTL is propagated from IP
>> packets into the MPLS header. If the TTL isn't propagated then a
>> default TTL value is used which can be configured via a new sysctl:
>> "net.mpls.default_ttl".
>
> Instead of having a global sysctl can we please have a different way
> to configure the ingress/egress?
>
> My general memory is that this makes sense for a slightly different
> tunnel type.   Making it a per mpls tunnel property instead of global
> property feels like it should be much more maintainable.

RFC 3443 that David Ahern referenced does indeed imply that this should
be a per-LSP property. However, it says:

>    We also note here that signaling the LSP type (Pipe, Short Pipe or
>    Uniform Model) is out of the scope of this document, and that is also
>    not addressed in the current versions of the label distribution
>    protocols, e.g. LDP [MPLS-LDP] and RSVP-TE [MPLS-RSVP].  Currently,
>    the LSP type is configured by the network operator manually by means
>    of either a command line or network management interface.

AIUI, the situation of label distribution protocols not signaling this
property hasn't changed since this RFC was written, which limits the
usefulness of a per-LSP property, and perhaps also indicates a lack of
desire from users of this.

Do you still feel it's worth implementing on a per-LSP basis? If so, any 
opinion on how it should be done for the pop case? Either a new per-path 
RTA attribute can be added, e.g. RTA_TTL_PROPAGATE, or a new rtnh flag 
could be added, e.g. RTNH_F_TTL_PROPAGATE.

> Similarly with the related behavior of what to do if the mpls ttl is
> exhausted during the trip through the tunnel.  Drop or dig through the
> packet and send an ICMP error message at the ip layer.

That's an interesting suggestion, but I don't think it will be useful 
when carrying another LSP over the LSP in question, since the LSR will 
have no idea what the label is being used for (i.e. the payload). If 
there is only one label in the packet then the router should know what 
the payload is of the label and since this is implicitly IPv4 or IPv6 at 
the moment (since those are the only types of traffic for which the 
labels can be used) then surely the ICMP should always be generated in 
that case?

Thanks,
Rob
Robert Shearman Jan. 31, 2017, noon UTC | #5
On 31/01/17 00:41, David Ahern wrote:
> On 1/30/17 1:36 PM, Robert Shearman wrote:
>> It is sometimes desirable to present an MPLS transport network as a
>> single hop to traffic transiting it because it prevents confusion when
>> diagnosing failures. An example of where confusion can be generated is
>> when addresses used in the provider network overlap with addresses in
>> the overlay network and the addresses get exposed through ICMP errors
>> generated as packets transit the provider network.
>>
>> Therefore, provide the ability to control whether the TTL value from
>> an MPLS packet is propagated to an IPv4/IPv6 packet when the last
>> label is popped through the addition of a new per-namespace sysctl:
>> "net.mpls.ip_ttl_propagate" which defaults to enabled.
>>
>> Use the same sysctl to control whether the TTL is propagated from IP
>> packets into the MPLS header. If the TTL isn't propagated then a
>> default TTL value is used which can be configured via a new sysctl:
>> "net.mpls.default_ttl".
>>
>> Signed-off-by: Robert Shearman <rshearma@brocade.com>
>> ---
>>  Documentation/networking/mpls-sysctl.txt | 19 +++++++++
>>  include/net/netns/mpls.h                 |  3 ++
>>  net/mpls/af_mpls.c                       | 70 ++++++++++++++++++++++++--------
>>  net/mpls/mpls_iptunnel.c                 | 12 +++++-
>>  4 files changed, 85 insertions(+), 19 deletions(-)
>>
>> diff --git a/Documentation/networking/mpls-sysctl.txt b/Documentation/networking/mpls-sysctl.txt
>> index 15d8d16934fd..b8f0725ff09e 100644
>> --- a/Documentation/networking/mpls-sysctl.txt
>> +++ b/Documentation/networking/mpls-sysctl.txt
>> @@ -19,6 +19,25 @@ platform_labels - INTEGER
>>  	Possible values: 0 - 1048575
>>  	Default: 0
>>
>> +ip_ttl_propagate - BOOL
>> +	Control whether TTL is propagated from the IPv4/IPv6 header to
>> +	the MPLS header on imposing labels and propagated from the
>> +	MPLS header to the IPv4/IPv6 header on popping the last label.
>> +
>> +	If disabled, the MPLS transport network will appear as a
>> +	single hop to transit traffic.
>> +
>> +	0 - disabled
>> +	1 - enabled (default)
>> +
>
> It seems like you are going after RFC 3443 with this change. Can you add comment to that effect? i.e.,  ip_ttl_propagate enabled is the Uniform Model and ip_ttl_propagate disabled is the Short Pipe Model.
>

Good idea, will add it in the appropriate place depending on the chosen API.

Thanks,
Rob
Robert Shearman Jan. 31, 2017, 12:01 p.m. UTC | #6
On 31/01/17 01:09, David Ahern wrote:
> On 1/30/17 1:36 PM, Robert Shearman wrote:
>> @@ -243,24 +245,29 @@ static bool mpls_egress(struct mpls_route *rt, struct sk_buff *skb,
>>  		payload_type = ip_hdr(skb)->version;
>>
>>  	switch (payload_type) {
>> -	case MPT_IPV4: {
>> -		struct iphdr *hdr4 = ip_hdr(skb);
>> -		skb->protocol = htons(ETH_P_IP);
>> -		csum_replace2(&hdr4->check,
>> -			      htons(hdr4->ttl << 8),
>> -			      htons(dec.ttl << 8));
>> -		hdr4->ttl = dec.ttl;
>> +	case MPT_IPV4:
>> +		if (net->mpls.ip_ttl_propagate) {
>> +			struct iphdr *hdr4 = ip_hdr(skb);
>> +
>> +			skb->protocol = htons(ETH_P_IP);
>
> The protocol setting here and ...
>
>> +			csum_replace2(&hdr4->check,
>> +				      htons(hdr4->ttl << 8),
>> +				      htons(dec.ttl << 8));
>> +			hdr4->ttl = dec.ttl;
>> +		}
>>  		success = true;
>>  		break;
>> -	}
>> -	case MPT_IPV6: {
>> -		struct ipv6hdr *hdr6 = ipv6_hdr(skb);
>> -		skb->protocol = htons(ETH_P_IPV6);
>> -		hdr6->hop_limit = dec.ttl;
>> +	case MPT_IPV6:
>> +		if (net->mpls.ip_ttl_propagate) {
>> +			struct ipv6hdr *hdr6 = ipv6_hdr(skb);
>> +
>> +			skb->protocol = htons(ETH_P_IPV6);
>
> here need to be done outside of net->mpls.ip_ttl_propagate otherwise ...
>
>> +			hdr6->hop_limit = dec.ttl;
>> +		}
>>  		success = true;
>>  		break;
>> -	}
>>  	case MPT_UNSPEC:
>> +		/* Should have decided which protocol it is by now */
>>  		break;
>>  	}
>>
>
> disabling ip_ttl_propagate causes a corrupted packet to show up at the end host (after the LSP):

Oops, good catch. Will fix in v2.

Thanks,
Rob
Eric W. Biederman Feb. 3, 2017, 3:21 a.m. UTC | #7
Robert Shearman <rshearma@brocade.com> writes:

> On 31/01/17 00:17, Eric W. Biederman wrote:
>> Robert Shearman <rshearma@brocade.com> writes:
>>
>>> It is sometimes desirable to present an MPLS transport network as a
>>> single hop to traffic transiting it because it prevents confusion when
>>> diagnosing failures. An example of where confusion can be generated is
>>> when addresses used in the provider network overlap with addresses in
>>> the overlay network and the addresses get exposed through ICMP errors
>>> generated as packets transit the provider network.
>>>
>>> Therefore, provide the ability to control whether the TTL value from
>>> an MPLS packet is propagated to an IPv4/IPv6 packet when the last
>>> label is popped through the addition of a new per-namespace sysctl:
>>> "net.mpls.ip_ttl_propagate" which defaults to enabled.
>>>
>>> Use the same sysctl to control whether the TTL is propagated from IP
>>> packets into the MPLS header. If the TTL isn't propagated then a
>>> default TTL value is used which can be configured via a new sysctl:
>>> "net.mpls.default_ttl".
>>
>> Instead of having a global sysctl can we please have a different way
>> to configure the ingress/egress?
>>
>> My general memory is that this makes sense for a slightly different
>> tunnel type.   Making it a per mpls tunnel property instead of global
>> property feels like it should be much more maintainable.
>
> RFC 3443 that David Ahern referenced does indeed infer that this
> should be a per-LSP property. However, it says:
>
>>    We also note here that signaling the LSP type (Pipe, Short Pipe or
>>    Uniform Model) is out of the scope of this document, and that is also
>>    not addressed in the current versions of the label distribution
>>    protocols, e.g. LDP [MPLS-LDP] and RSVP-TE [MPLS-RSVP].  Currently,
>>    the LSP type is configured by the network operator manually by means
>>    of either a command line or network management interface.
>
> AIUI, the situation of label distribution protocols not signaling this
> property hasn't changed from when this RFC has written, which limits
> the usefulness of a per-LSP property, and perhaps also indicates a
> lack of desire from users of this.
>
> Do you still feel it's worth implementing on a per-LSP basis? If so,
> any opinion on how it should be done for the pop case? Either a new
> per-path RTA attribute can be added, e.g. RTA_TTL_PROPAGATE, or a new
> rtnh flag could be added, e.g. RTNH_F_TTL_PROPAGATE.

My brain is mostly elsewhere right now so I don't have an opinion
on how it should be implemented.  However, Linux fundamentally gets used
in interesting ways, and if we don't implement the option as per mpls exit
now someone will come along and need to do the work later.

Perhaps it will only be used with hard coded static configurations, and
it is fundamentally a per tunnel property.

It will be less work to maintain, and the code will run faster in the
long run if we don't have two code paths to maintain.

Eric
David Ahern Feb. 3, 2017, 4:02 a.m. UTC | #8
On 2/2/17 8:21 PM, Eric W. Biederman wrote:
> 
> My brain is mostly elswhere right now so I don't have an implementation
> on how it should be implemented.   However Linux fundamentally gets used
> interesting ways, and if we don't implement the option as per mpls exit
> now someone will come along and need to do the work later.
> 
> Perhaps it will only be used with hard coded static configurations, and
> it is fundamentally a per tunnel property.
> 
> It will be less work to maintain, and the code will run faster in the
> long run if we don't have two code paths to maintain.

I can see the argument for per-tunnel knobs, but looking at ios and nx-os docs it seems appropriate to have a global knob as well. In that regard, having sysctl knobs solves the global setting and when/if needed we can add MPLS_IPTUNNEL_ZZZZZ encap attributes for the per-tunnel settings.

Does that seem reasonable?
Robert Shearman March 8, 2017, 12:46 a.m. UTC | #9
It is sometimes desirable to present an MPLS transport network as a
single hop to traffic transiting it because it prevents confusion when
diagnosing failures. An example of where confusion can be generated is
when addresses used in the provider network overlap with addresses in
the overlay network and the addresses get exposed through ICMP errors
generated as packets transit the provider network.

In addition, RFC 3443 defines two methods of deriving TTL for an
outgoing packet: Uniform Model where the TTL is propagated to/from the
MPLS header and both Pipe Models and Short Pipe Models (with and
without PHP) where the TTL is not propagated to/from the MPLS header.

Changes in v2:
 - add references to RFC 3443 as suggested by David Ahern
 - fix setting of skb->protocol as noticed by David Ahern
 - implement per-route/per-LWT configurability as suggested by Eric
   Biederman
 - split into two patches for ease of review

Robert Shearman (2):
  mpls: allow TTL propagation to IP packets to be configured
  mpls: allow TTL propagation from IP packets to be configured

 Documentation/networking/mpls-sysctl.txt | 19 ++++++
 include/net/mpls_iptunnel.h              |  2 +
 include/net/netns/mpls.h                 |  3 +
 include/uapi/linux/mpls_iptunnel.h       |  2 +
 include/uapi/linux/rtnetlink.h           |  1 +
 net/mpls/af_mpls.c                       | 99 ++++++++++++++++++++++++++------
 net/mpls/internal.h                      |  7 +++
 net/mpls/mpls_iptunnel.c                 | 64 ++++++++++++++++-----
 8 files changed, 168 insertions(+), 29 deletions(-)
Robert Shearman March 10, 2017, 8:43 p.m. UTC | #10
It is sometimes desirable to present an MPLS transport network as a
single hop to traffic transiting it because it prevents confusion when
diagnosing failures. An example of where confusion can be generated is
when addresses used in the provider network overlap with addresses in
the overlay network and the addresses get exposed through ICMP errors
generated as packets transit the provider network.

In addition, RFC 3443 defines two methods of deriving TTL for an
outgoing packet: Uniform Model where the TTL is propagated to/from the
MPLS header and both Pipe Models and Short Pipe Models (with and
without PHP) where the TTL is not propagated to/from the MPLS header.

Changes in v3:
 - decrement ttl on popping last label when not doing ttl propagation,
   as suggested by David Ahern.
 - add comment to describe what the somewhat complex conditionals are
   doing to work out what ttl to use in mpls_iptunnel.c.
 - rearrange fields in struct netns_mpls to keep the platform
   label fields together, as suggested by David Ahern.

Changes in v2:
 - add references to RFC 3443 as suggested by David Ahern
 - fix setting of skb->protocol as noticed by David Ahern
 - implement per-route/per-LWT configurability as suggested by Eric
   Biederman
 - split into two patches for ease of review

Robert Shearman (2):
  mpls: allow TTL propagation to IP packets to be configured
  mpls: allow TTL propagation from IP packets to be configured

 Documentation/networking/mpls-sysctl.txt | 19 +++++++
 include/net/mpls_iptunnel.h              |  2 +
 include/net/netns/mpls.h                 |  3 +
 include/uapi/linux/mpls_iptunnel.h       |  2 +
 include/uapi/linux/rtnetlink.h           |  1 +
 net/mpls/af_mpls.c                       | 98 +++++++++++++++++++++++++++++---
 net/mpls/internal.h                      |  7 +++
 net/mpls/mpls_iptunnel.c                 | 73 +++++++++++++++++++-----
 8 files changed, 184 insertions(+), 21 deletions(-)
Roopa Prabhu March 13, 2017, 8:28 p.m. UTC | #11
On 3/10/17, 12:43 PM, Robert Shearman wrote:
> It is sometimes desirable to present an MPLS transport network as a
> single hop to traffic transiting it because it prevents confusion when
> diagnosing failures. An example of where confusion can be generated is
> when addresses used in the provider network overlap with addresses in
> the overlay network and the addresses get exposed through ICMP errors
> generated as packets transit the provider network.
>
> In addition, RFC 3443 defines two methods of deriving TTL for an
> outgoing packet: Uniform Model where the TTL is propagated to/from the
> MPLS header and both Pipe Models and Short Pipe Models (with and
> without PHP) where the TTL is not propagated to/from the MPLS header.
>
> Changes in v3:
>  - decrement ttl on popping last label when not doing ttl propagation,
>    as suggested by David Ahern.
>  - add comment to describe what the somewhat complex conditionals are
>    doing to work out what ttl to use in mpls_iptunnel.c.
>  - rearrange fields fields in struct netns_mpls to keep the platform
>    label fields together, as suggested by David Ahern.
>
> Changes in v2:
>  - add references to RFC 3443 as suggested by David Ahern
>  - fix setting of skb->protocol as noticed by David Ahern
>  - implement per-route/per-LWT configurability as suggested by Eric
>    Biederman
>  - split into two patches for ease of review
>
> Robert Shearman (2):
>   mpls: allow TTL propagation to IP packets to be configured
>   mpls: allow TTL propagation from IP packets to be configured
>
>  
Acked-by: Roopa Prabhu <roopa@cumulusnetworks.com>
David Miller March 13, 2017, 10:29 p.m. UTC | #12
From: Robert Shearman <rshearma@brocade.com>
Date: Fri, 10 Mar 2017 20:43:23 +0000

> It is sometimes desirable to present an MPLS transport network as a
> single hop to traffic transiting it because it prevents confusion when
> diagnosing failures. An example of where confusion can be generated is
> when addresses used in the provider network overlap with addresses in
> the overlay network and the addresses get exposed through ICMP errors
> generated as packets transit the provider network.
> 
> In addition, RFC 3443 defines two methods of deriving TTL for an
> outgoing packet: Uniform Model where the TTL is propagated to/from the
> MPLS header and both Pipe Models and Short Pipe Models (with and
> without PHP) where the TTL is not propagated to/from the MPLS header.
> 
> Changes in v3:
>  - decrement ttl on popping last label when not doing ttl propagation,
>    as suggested by David Ahern.
>  - add comment to describe what the somewhat complex conditionals are
>    doing to work out what ttl to use in mpls_iptunnel.c.
>  - rearrange fields fields in struct netns_mpls to keep the platform
>    label fields together, as suggested by David Ahern.
> 
> Changes in v2:
>  - add references to RFC 3443 as suggested by David Ahern
>  - fix setting of skb->protocol as noticed by David Ahern
>  - implement per-route/per-LWT configurability as suggested by Eric
>    Biederman
>  - split into two patches for ease of review

Series applied, thanks.
diff mbox

Patch

diff --git a/Documentation/networking/mpls-sysctl.txt b/Documentation/networking/mpls-sysctl.txt
index 15d8d16934fd..b8f0725ff09e 100644
--- a/Documentation/networking/mpls-sysctl.txt
+++ b/Documentation/networking/mpls-sysctl.txt
@@ -19,6 +19,25 @@  platform_labels - INTEGER
 	Possible values: 0 - 1048575
 	Default: 0
 
+ip_ttl_propagate - BOOL
+	Control whether TTL is propagated from the IPv4/IPv6 header to
+	the MPLS header on imposing labels and propagated from the
+	MPLS header to the IPv4/IPv6 header on popping the last label.
+
+	If disabled, the MPLS transport network will appear as a
+	single hop to transit traffic.
+
+	0 - disabled
+	1 - enabled (default)
+
+default_ttl - INTEGER
+	Default TTL value to use for MPLS packets where it cannot be
+	propagated from an IP header, either because one isn't present
+	or ip_ttl_propagate has been disabled.
+
+	Possible values: 1 - 255
+	Default: 255
+
 conf/<interface>/input - BOOL
 	Control whether packets can be input on this interface.
 
diff --git a/include/net/netns/mpls.h b/include/net/netns/mpls.h
index d29203651c01..1b68aed6e1b9 100644
--- a/include/net/netns/mpls.h
+++ b/include/net/netns/mpls.h
@@ -10,7 +10,10 @@  struct ctl_table_header;
 
 struct netns_mpls {
 	size_t platform_labels;
+	int ip_ttl_propagate;
+	int default_ttl;
 	struct mpls_route __rcu * __rcu *platform_label;
+
 	struct ctl_table_header *ctl;
 };
 
diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
index 64d3bf269a26..bf5f0792e8a2 100644
--- a/net/mpls/af_mpls.c
+++ b/net/mpls/af_mpls.c
@@ -31,7 +31,9 @@ 
 #define MPLS_NEIGH_TABLE_UNSPEC (NEIGH_LINK_TABLE + 1)
 
 static int zero = 0;
+static int one = 1;
 static int label_limit = (1 << 20) - 1;
+static int ttl_max = 255;
 
 static void rtmsg_lfib(int event, u32 label, struct mpls_route *rt,
 		       struct nlmsghdr *nlh, struct net *net, u32 portid,
@@ -219,8 +221,8 @@  static struct mpls_nh *mpls_select_multipath(struct mpls_route *rt,
 	return &rt->rt_nh[nh_index];
 }
 
-static bool mpls_egress(struct mpls_route *rt, struct sk_buff *skb,
-			struct mpls_entry_decoded dec)
+static bool mpls_egress(struct net *net, struct mpls_route *rt,
+			struct sk_buff *skb, struct mpls_entry_decoded dec)
 {
 	enum mpls_payload_type payload_type;
 	bool success = false;
@@ -243,24 +245,29 @@  static bool mpls_egress(struct mpls_route *rt, struct sk_buff *skb,
 		payload_type = ip_hdr(skb)->version;
 
 	switch (payload_type) {
-	case MPT_IPV4: {
-		struct iphdr *hdr4 = ip_hdr(skb);
-		skb->protocol = htons(ETH_P_IP);
-		csum_replace2(&hdr4->check,
-			      htons(hdr4->ttl << 8),
-			      htons(dec.ttl << 8));
-		hdr4->ttl = dec.ttl;
+	case MPT_IPV4:
+		if (net->mpls.ip_ttl_propagate) {
+			struct iphdr *hdr4 = ip_hdr(skb);
+
+			skb->protocol = htons(ETH_P_IP);
+			csum_replace2(&hdr4->check,
+				      htons(hdr4->ttl << 8),
+				      htons(dec.ttl << 8));
+			hdr4->ttl = dec.ttl;
+		}
 		success = true;
 		break;
-	}
-	case MPT_IPV6: {
-		struct ipv6hdr *hdr6 = ipv6_hdr(skb);
-		skb->protocol = htons(ETH_P_IPV6);
-		hdr6->hop_limit = dec.ttl;
+	case MPT_IPV6:
+		if (net->mpls.ip_ttl_propagate) {
+			struct ipv6hdr *hdr6 = ipv6_hdr(skb);
+
+			skb->protocol = htons(ETH_P_IPV6);
+			hdr6->hop_limit = dec.ttl;
+		}
 		success = true;
 		break;
-	}
 	case MPT_UNSPEC:
+		/* Should have decided which protocol it is by now */
 		break;
 	}
 
@@ -360,7 +367,7 @@  static int mpls_forward(struct sk_buff *skb, struct net_device *dev,
 
 	if (unlikely(!new_header_size && dec.bos)) {
 		/* Penultimate hop popping */
-		if (!mpls_egress(rt, skb, dec))
+		if (!mpls_egress(dev_net(out_dev), rt, skb, dec))
 			goto err;
 	} else {
 		bool bos;
@@ -1764,6 +1771,9 @@  static int mpls_platform_labels(struct ctl_table *table, int write,
 	return ret;
 }
 
+#define MPLS_NS_SYSCTL_OFFSET(field)		\
+	(&((struct net *)0)->field)
+
 static const struct ctl_table mpls_table[] = {
 	{
 		.procname	= "platform_labels",
@@ -1772,21 +1782,47 @@  static const struct ctl_table mpls_table[] = {
 		.mode		= 0644,
 		.proc_handler	= mpls_platform_labels,
 	},
+	{
+		.procname	= "ip_ttl_propagate",
+		.data		= MPLS_NS_SYSCTL_OFFSET(mpls.ip_ttl_propagate),
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &one,
+	},
+	{
+		.procname	= "default_ttl",
+		.data		= MPLS_NS_SYSCTL_OFFSET(mpls.default_ttl),
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &one,
+		.extra2		= &ttl_max,
+	},
 	{ }
 };
 
 static int mpls_net_init(struct net *net)
 {
 	struct ctl_table *table;
+	int i;
 
 	net->mpls.platform_labels = 0;
 	net->mpls.platform_label = NULL;
+	net->mpls.ip_ttl_propagate = 1;
+	net->mpls.default_ttl = 255;
 
 	table = kmemdup(mpls_table, sizeof(mpls_table), GFP_KERNEL);
 	if (table == NULL)
 		return -ENOMEM;
 
-	table[0].data = net;
+	/* Table data contains only offsets relative to the base of
+	 * the struct net at this point, so make them absolute.
+	 */
+	for (i = 0; i < ARRAY_SIZE(mpls_table) - 1; i++)
+		table[i].data = (char *)net + (uintptr_t)table[i].data;
+
 	net->mpls.ctl = register_net_sysctl(net, "net/mpls", table);
 	if (net->mpls.ctl == NULL) {
 		kfree(table);
diff --git a/net/mpls/mpls_iptunnel.c b/net/mpls/mpls_iptunnel.c
index 67b7a955de65..c6a8e1c7c5f5 100644
--- a/net/mpls/mpls_iptunnel.c
+++ b/net/mpls/mpls_iptunnel.c
@@ -49,6 +49,7 @@  static int mpls_xmit(struct sk_buff *skb)
 	struct rtable *rt = NULL;
 	struct rt6_info *rt6 = NULL;
 	struct mpls_dev *out_mdev;
+	struct net *net;
 	int err = 0;
 	bool bos;
 	int i;
@@ -56,13 +57,20 @@  static int mpls_xmit(struct sk_buff *skb)
 
 	/* Find the output device */
 	out_dev = dst->dev;
+	net = dev_net(out_dev);
 
 	/* Obtain the ttl */
 	if (dst->ops->family == AF_INET) {
-		ttl = ip_hdr(skb)->ttl;
+		if (net->mpls.ip_ttl_propagate)
+			ttl = ip_hdr(skb)->ttl;
+		else
+			ttl = net->mpls.default_ttl;
 		rt = (struct rtable *)dst;
 	} else if (dst->ops->family == AF_INET6) {
-		ttl = ipv6_hdr(skb)->hop_limit;
+		if (net->mpls.ip_ttl_propagate)
+			ttl = ipv6_hdr(skb)->hop_limit;
+		else
+			ttl = net->mpls.default_ttl;
 		rt6 = (struct rt6_info *)dst;
 	} else {
 		goto drop;