diff mbox series

[ovs-dev,net-next] openvswitch: add TTL decrement action

Message ID 20191112102518.4406-1-mcroce@redhat.com
State Awaiting Upstream
Headers show
Series [ovs-dev,net-next] openvswitch: add TTL decrement action | expand

Commit Message

Matteo Croce Nov. 12, 2019, 10:25 a.m. UTC
New action to decrement TTL instead of setting it to a fixed value.
This action will decrement the TTL and, in case of expired TTL, send the
packet to userspace via output_userspace() to take care of it.

Supports both IPv4 and IPv6 via the ttl and hop_limit fields, respectively.

Tested with a corresponding change in the userspace:

    # ovs-dpctl dump-flows
    in_port(2),eth(),eth_type(0x0800), packets:0, bytes:0, used:never, actions:dec_ttl,1
    in_port(1),eth(),eth_type(0x0800), packets:0, bytes:0, used:never, actions:dec_ttl,2
    in_port(1),eth(),eth_type(0x0806), packets:0, bytes:0, used:never, actions:2
    in_port(2),eth(),eth_type(0x0806), packets:0, bytes:0, used:never, actions:1

    # ping -c1 192.168.0.2 -t 42
    IP (tos 0x0, ttl 41, id 61647, offset 0, flags [DF], proto ICMP (1), length 84)
        192.168.0.1 > 192.168.0.2: ICMP echo request, id 386, seq 1, length 64
    # ping -c1 192.168.0.2 -t 120
    IP (tos 0x0, ttl 119, id 62070, offset 0, flags [DF], proto ICMP (1), length 84)
        192.168.0.1 > 192.168.0.2: ICMP echo request, id 388, seq 1, length 64
    # ping -c1 192.168.0.2 -t 1
    #

Co-authored-by: Bindiya Kurle <bindiyakurle@gmail.com>
Signed-off-by: Bindiya Kurle <bindiyakurle@gmail.com>
Signed-off-by: Matteo Croce <mcroce@redhat.com>
---
 include/uapi/linux/openvswitch.h |  2 ++
 net/openvswitch/actions.c        | 46 ++++++++++++++++++++++++++++++++
 net/openvswitch/flow_netlink.c   |  6 +++++
 3 files changed, 54 insertions(+)

Comments

Simon Horman Nov. 12, 2019, 3 p.m. UTC | #1
On Tue, Nov 12, 2019 at 11:25:18AM +0100, Matteo Croce wrote:
> New action to decrement TTL instead of setting it to a fixed value.
> This action will decrement the TTL and, in case of expired TTL, send the
> packet to userspace via output_userspace() to take care of it.
> 
> Supports both IPv4 and IPv6 via the ttl and hop_limit fields, respectively.
> 
> Tested with a corresponding change in the userspace:
> 
>     # ovs-dpctl dump-flows
>     in_port(2),eth(),eth_type(0x0800), packets:0, bytes:0, used:never, actions:dec_ttl,1
>     in_port(1),eth(),eth_type(0x0800), packets:0, bytes:0, used:never, actions:dec_ttl,2
>     in_port(1),eth(),eth_type(0x0806), packets:0, bytes:0, used:never, actions:2
>     in_port(2),eth(),eth_type(0x0806), packets:0, bytes:0, used:never, actions:1
> 
>     # ping -c1 192.168.0.2 -t 42
>     IP (tos 0x0, ttl 41, id 61647, offset 0, flags [DF], proto ICMP (1), length 84)
>         192.168.0.1 > 192.168.0.2: ICMP echo request, id 386, seq 1, length 64
>     # ping -c1 192.168.0.2 -t 120
>     IP (tos 0x0, ttl 119, id 62070, offset 0, flags [DF], proto ICMP (1), length 84)
>         192.168.0.1 > 192.168.0.2: ICMP echo request, id 388, seq 1, length 64
>     # ping -c1 192.168.0.2 -t 1
>     #
> 
> Co-authored-by: Bindiya Kurle <bindiyakurle@gmail.com>
> Signed-off-by: Bindiya Kurle <bindiyakurle@gmail.com>
> Signed-off-by: Matteo Croce <mcroce@redhat.com>

Usually OVS achieves this behaviour by matching on the TTL and
setting it to the desired value, pre-calculated as TTL -1.
With that in mind could you explain the motivation for this
change?

> ---
>  include/uapi/linux/openvswitch.h |  2 ++
>  net/openvswitch/actions.c        | 46 ++++++++++++++++++++++++++++++++
>  net/openvswitch/flow_netlink.c   |  6 +++++
>  3 files changed, 54 insertions(+)
> 
> diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h
> index 1887a451c388..a3bdb1ecd1e7 100644
> --- a/include/uapi/linux/openvswitch.h
> +++ b/include/uapi/linux/openvswitch.h
> @@ -890,6 +890,7 @@ struct check_pkt_len_arg {
>   * @OVS_ACTION_ATTR_CHECK_PKT_LEN: Check the packet length and execute a set
>   * of actions if greater than the specified packet length, else execute
>   * another set of actions.
> + * @OVS_ACTION_ATTR_DEC_TTL: Decrement the IP TTL.
>   *
>   * Only a single header can be set with a single %OVS_ACTION_ATTR_SET.  Not all
>   * fields within a header are modifiable, e.g. the IPv4 protocol and fragment
> @@ -925,6 +926,7 @@ enum ovs_action_attr {
>  	OVS_ACTION_ATTR_METER,        /* u32 meter ID. */
>  	OVS_ACTION_ATTR_CLONE,        /* Nested OVS_CLONE_ATTR_*.  */
>  	OVS_ACTION_ATTR_CHECK_PKT_LEN, /* Nested OVS_CHECK_PKT_LEN_ATTR_*. */
> +	OVS_ACTION_ATTR_DEC_TTL,      /* Decrement ttl action */
>  
>  	__OVS_ACTION_ATTR_MAX,	      /* Nothing past this will be accepted
>  				       * from userspace. */
> diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
> index 12936c151cc0..077b7f309c93 100644
> --- a/net/openvswitch/actions.c
> +++ b/net/openvswitch/actions.c
> @@ -1174,6 +1174,43 @@ static int execute_check_pkt_len(struct datapath *dp, struct sk_buff *skb,
>  			     nla_len(actions), last, clone_flow_key);
>  }
>  
> +static int execute_dec_ttl(struct sk_buff *skb, struct sw_flow_key *key)
> +{
> +	int err;
> +
> +	if (skb->protocol == htons(ETH_P_IPV6)) {
> +		struct ipv6hdr *nh = ipv6_hdr(skb);
> +
> +		err = skb_ensure_writable(skb, skb_network_offset(skb) +
> +					  sizeof(*nh));
> +		if (unlikely(err))
> +			return err;
> +
> +		if (nh->hop_limit <= 1)
> +			return -EHOSTUNREACH;
> +
> +		key->ip.ttl = --nh->hop_limit;
> +	} else {
> +		struct iphdr *nh = ip_hdr(skb);
> +		u8 old_ttl;
> +
> +		err = skb_ensure_writable(skb, skb_network_offset(skb) +
> +					  sizeof(*nh));
> +		if (unlikely(err))
> +			return err;
> +
> +		if (nh->ttl <= 1)
> +			return -EHOSTUNREACH;
> +
> +		old_ttl = nh->ttl--;
> +		csum_replace2(&nh->check, htons(old_ttl << 8),
> +			      htons(nh->ttl << 8));
> +		key->ip.ttl = nh->ttl;
> +	}

The above may send packets with TTL = 0, is that desired?

> +
> +	return 0;
> +}
> +
>  /* Execute a list of actions against 'skb'. */
>  static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
>  			      struct sw_flow_key *key,
> @@ -1345,6 +1382,15 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
>  
>  			break;
>  		}
> +
> +		case OVS_ACTION_ATTR_DEC_TTL:
> +			err = execute_dec_ttl(skb, key);
> +			if (err == -EHOSTUNREACH) {
> +				output_userspace(dp, skb, key, a, attr,
> +						 len, OVS_CB(skb)->cutlen);
> +				OVS_CB(skb)->cutlen = 0;
> +			}
> +			break;
>  		}
>  
>  		if (unlikely(err)) {
> diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c
> index 65c2e3458ff5..d17f2d4b420f 100644
> --- a/net/openvswitch/flow_netlink.c
> +++ b/net/openvswitch/flow_netlink.c
> @@ -79,6 +79,7 @@ static bool actions_may_change_flow(const struct nlattr *actions)
>  		case OVS_ACTION_ATTR_SET_MASKED:
>  		case OVS_ACTION_ATTR_METER:
>  		case OVS_ACTION_ATTR_CHECK_PKT_LEN:
> +		case OVS_ACTION_ATTR_DEC_TTL:
>  		default:
>  			return true;
>  		}
> @@ -3005,6 +3006,7 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr,
>  			[OVS_ACTION_ATTR_METER] = sizeof(u32),
>  			[OVS_ACTION_ATTR_CLONE] = (u32)-1,
>  			[OVS_ACTION_ATTR_CHECK_PKT_LEN] = (u32)-1,
> +			[OVS_ACTION_ATTR_DEC_TTL] = 0,
>  		};
>  		const struct ovs_action_push_vlan *vlan;
>  		int type = nla_type(a);
> @@ -3233,6 +3235,10 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr,
>  			break;
>  		}
>  
> +		case OVS_ACTION_ATTR_DEC_TTL:
> +			/* Nothing to do.  */
> +			break;
> +
>  		default:
>  			OVS_NLERR(log, "Unknown Action type %d", type);
>  			return -EINVAL;
> -- 
> 2.23.0
>
Matteo Croce Nov. 12, 2019, 3:46 p.m. UTC | #2
On Tue, Nov 12, 2019 at 4:00 PM Simon Horman <simon.horman@netronome.com> wrote:
>
> On Tue, Nov 12, 2019 at 11:25:18AM +0100, Matteo Croce wrote:
> > New action to decrement TTL instead of setting it to a fixed value.
> > This action will decrement the TTL and, in case of expired TTL, send the
> > packet to userspace via output_userspace() to take care of it.
> >
> > Supports both IPv4 and IPv6 via the ttl and hop_limit fields, respectively.
> >
>
> Usually OVS achieves this behaviour by matching on the TTL and
> setting it to the desired value, pre-calculated as TTL -1.
> With that in mind could you explain the motivation for this
> change?
>

Hi,

the problem is that OVS creates a flow for each ttl it see. I can let
vswitchd create 255 flows with like this:

$ for i in {2..255}; do ping 192.168.0.2 -t $i -c1 -w1 &>/dev/null & done
$ ovs-dpctl dump-flows |fgrep -c 'set(ipv4(ttl'
255


> > @@ -1174,6 +1174,43 @@ static int execute_check_pkt_len(struct datapath *dp, struct sk_buff *skb,
> >                            nla_len(actions), last, clone_flow_key);
> >  }
> >
> > +static int execute_dec_ttl(struct sk_buff *skb, struct sw_flow_key *key)
> > +{
> > +     int err;
> > +
> > +     if (skb->protocol == htons(ETH_P_IPV6)) {
> > +             struct ipv6hdr *nh = ipv6_hdr(skb);
> > +
> > +             err = skb_ensure_writable(skb, skb_network_offset(skb) +
> > +                                       sizeof(*nh));
> > +             if (unlikely(err))
> > +                     return err;
> > +
> > +             if (nh->hop_limit <= 1)
> > +                     return -EHOSTUNREACH;
> > +
> > +             key->ip.ttl = --nh->hop_limit;
> > +     } else {
> > +             struct iphdr *nh = ip_hdr(skb);
> > +             u8 old_ttl;
> > +
> > +             err = skb_ensure_writable(skb, skb_network_offset(skb) +
> > +                                       sizeof(*nh));
> > +             if (unlikely(err))
> > +                     return err;
> > +
> > +             if (nh->ttl <= 1)
> > +                     return -EHOSTUNREACH;
> > +
> > +             old_ttl = nh->ttl--;
> > +             csum_replace2(&nh->check, htons(old_ttl << 8),
> > +                           htons(nh->ttl << 8));
> > +             key->ip.ttl = nh->ttl;
> > +     }
>
> The above may send packets with TTL = 0, is that desired?
>

If TTL is 1 or 0, execute_dec_ttl() returns -EHOSTUNREACH, and the
caller will just send the packet to userspace and then free it.
I think this is enough, am I missing something?

Bye,
Pravin Shelar Nov. 13, 2019, 4:54 a.m. UTC | #3
On Tue, Nov 12, 2019 at 2:25 AM Matteo Croce <mcroce@redhat.com> wrote:
>
> New action to decrement TTL instead of setting it to a fixed value.
> This action will decrement the TTL and, in case of expired TTL, send the
> packet to userspace via output_userspace() to take care of it.
>
> Supports both IPv4 and IPv6 via the ttl and hop_limit fields, respectively.
>
> Tested with a corresponding change in the userspace:
>
>     # ovs-dpctl dump-flows
>     in_port(2),eth(),eth_type(0x0800), packets:0, bytes:0, used:never, actions:dec_ttl,1
>     in_port(1),eth(),eth_type(0x0800), packets:0, bytes:0, used:never, actions:dec_ttl,2
>     in_port(1),eth(),eth_type(0x0806), packets:0, bytes:0, used:never, actions:2
>     in_port(2),eth(),eth_type(0x0806), packets:0, bytes:0, used:never, actions:1
>
>     # ping -c1 192.168.0.2 -t 42
>     IP (tos 0x0, ttl 41, id 61647, offset 0, flags [DF], proto ICMP (1), length 84)
>         192.168.0.1 > 192.168.0.2: ICMP echo request, id 386, seq 1, length 64
>     # ping -c1 192.168.0.2 -t 120
>     IP (tos 0x0, ttl 119, id 62070, offset 0, flags [DF], proto ICMP (1), length 84)
>         192.168.0.1 > 192.168.0.2: ICMP echo request, id 388, seq 1, length 64
>     # ping -c1 192.168.0.2 -t 1
>     #
>
> Co-authored-by: Bindiya Kurle <bindiyakurle@gmail.com>
> Signed-off-by: Bindiya Kurle <bindiyakurle@gmail.com>
> Signed-off-by: Matteo Croce <mcroce@redhat.com>
> ---
>  include/uapi/linux/openvswitch.h |  2 ++
>  net/openvswitch/actions.c        | 46 ++++++++++++++++++++++++++++++++
>  net/openvswitch/flow_netlink.c   |  6 +++++
>  3 files changed, 54 insertions(+)
>
> diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h
> index 1887a451c388..a3bdb1ecd1e7 100644
> --- a/include/uapi/linux/openvswitch.h
> +++ b/include/uapi/linux/openvswitch.h
> @@ -890,6 +890,7 @@ struct check_pkt_len_arg {
>   * @OVS_ACTION_ATTR_CHECK_PKT_LEN: Check the packet length and execute a set
>   * of actions if greater than the specified packet length, else execute
>   * another set of actions.
> + * @OVS_ACTION_ATTR_DEC_TTL: Decrement the IP TTL.
>   *
>   * Only a single header can be set with a single %OVS_ACTION_ATTR_SET.  Not all
>   * fields within a header are modifiable, e.g. the IPv4 protocol and fragment
> @@ -925,6 +926,7 @@ enum ovs_action_attr {
>         OVS_ACTION_ATTR_METER,        /* u32 meter ID. */
>         OVS_ACTION_ATTR_CLONE,        /* Nested OVS_CLONE_ATTR_*.  */
>         OVS_ACTION_ATTR_CHECK_PKT_LEN, /* Nested OVS_CHECK_PKT_LEN_ATTR_*. */
> +       OVS_ACTION_ATTR_DEC_TTL,      /* Decrement ttl action */
>
>         __OVS_ACTION_ATTR_MAX,        /* Nothing past this will be accepted
>                                        * from userspace. */
> diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
> index 12936c151cc0..077b7f309c93 100644
> --- a/net/openvswitch/actions.c
> +++ b/net/openvswitch/actions.c
> @@ -1174,6 +1174,43 @@ static int execute_check_pkt_len(struct datapath *dp, struct sk_buff *skb,
>                              nla_len(actions), last, clone_flow_key);
>  }
>
> +static int execute_dec_ttl(struct sk_buff *skb, struct sw_flow_key *key)
> +{
> +       int err;
> +
> +       if (skb->protocol == htons(ETH_P_IPV6)) {
> +               struct ipv6hdr *nh = ipv6_hdr(skb);
> +
> +               err = skb_ensure_writable(skb, skb_network_offset(skb) +
> +                                         sizeof(*nh));
> +               if (unlikely(err))
> +                       return err;
> +
> +               if (nh->hop_limit <= 1)
> +                       return -EHOSTUNREACH;
> +
> +               key->ip.ttl = --nh->hop_limit;
> +       } else {
> +               struct iphdr *nh = ip_hdr(skb);
> +               u8 old_ttl;
> +
> +               err = skb_ensure_writable(skb, skb_network_offset(skb) +
> +                                         sizeof(*nh));
> +               if (unlikely(err))
> +                       return err;
> +
> +               if (nh->ttl <= 1)
> +                       return -EHOSTUNREACH;
> +
> +               old_ttl = nh->ttl--;
> +               csum_replace2(&nh->check, htons(old_ttl << 8),
> +                             htons(nh->ttl << 8));
> +               key->ip.ttl = nh->ttl;
> +       }
> +
> +       return 0;
> +}
> +
>  /* Execute a list of actions against 'skb'. */
>  static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
>                               struct sw_flow_key *key,
> @@ -1345,6 +1382,15 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
>
>                         break;
>                 }
> +
> +               case OVS_ACTION_ATTR_DEC_TTL:
> +                       err = execute_dec_ttl(skb, key);
> +                       if (err == -EHOSTUNREACH) {
> +                               output_userspace(dp, skb, key, a, attr,
> +                                                len, OVS_CB(skb)->cutlen);
> +                               OVS_CB(skb)->cutlen = 0;
> +                       }
This needs to be programmable rather than fixed action. Can you add
nested actions list as argument to execute in case of this exception.
This way we can implement rate limiting or port redirections for
handling such packet.
Simon Horman Nov. 13, 2019, 6:57 a.m. UTC | #4
On Tue, Nov 12, 2019 at 04:46:12PM +0100, Matteo Croce wrote:
> On Tue, Nov 12, 2019 at 4:00 PM Simon Horman <simon.horman@netronome.com> wrote:
> >
> > On Tue, Nov 12, 2019 at 11:25:18AM +0100, Matteo Croce wrote:
> > > New action to decrement TTL instead of setting it to a fixed value.
> > > This action will decrement the TTL and, in case of expired TTL, send the
> > > packet to userspace via output_userspace() to take care of it.
> > >
> > > Supports both IPv4 and IPv6 via the ttl and hop_limit fields, respectively.
> > >
> >
> > Usually OVS achieves this behaviour by matching on the TTL and
> > setting it to the desired value, pre-calculated as TTL -1.
> > With that in mind could you explain the motivation for this
> > change?
> >
> 
> Hi,
> 
> the problem is that OVS creates a flow for each ttl it see. I can let
> vswitchd create 255 flows with like this:
> 
> $ for i in {2..255}; do ping 192.168.0.2 -t $i -c1 -w1 &>/dev/null & done
> $ ovs-dpctl dump-flows |fgrep -c 'set(ipv4(ttl'
> 255

Hi,

so the motivation is to reduce the number of megaflows in the case
where flows otherwise match but the TTL differs. I think this makes sense
and the absence of this feature may date back to designs made before
megaflow support was added - just guessing.

I think this is a reasonable feature but I think it would be good
to explain the motivation in the changelog.

> > > @@ -1174,6 +1174,43 @@ static int execute_check_pkt_len(struct datapath *dp, struct sk_buff *skb,
> > >                            nla_len(actions), last, clone_flow_key);
> > >  }
> > >
> > > +static int execute_dec_ttl(struct sk_buff *skb, struct sw_flow_key *key)
> > > +{
> > > +     int err;
> > > +
> > > +     if (skb->protocol == htons(ETH_P_IPV6)) {
> > > +             struct ipv6hdr *nh = ipv6_hdr(skb);
> > > +
> > > +             err = skb_ensure_writable(skb, skb_network_offset(skb) +
> > > +                                       sizeof(*nh));
> > > +             if (unlikely(err))
> > > +                     return err;
> > > +
> > > +             if (nh->hop_limit <= 1)
> > > +                     return -EHOSTUNREACH;
> > > +
> > > +             key->ip.ttl = --nh->hop_limit;
> > > +     } else {
> > > +             struct iphdr *nh = ip_hdr(skb);
> > > +             u8 old_ttl;
> > > +
> > > +             err = skb_ensure_writable(skb, skb_network_offset(skb) +
> > > +                                       sizeof(*nh));
> > > +             if (unlikely(err))
> > > +                     return err;
> > > +
> > > +             if (nh->ttl <= 1)
> > > +                     return -EHOSTUNREACH;
> > > +
> > > +             old_ttl = nh->ttl--;
> > > +             csum_replace2(&nh->check, htons(old_ttl << 8),
> > > +                           htons(nh->ttl << 8));
> > > +             key->ip.ttl = nh->ttl;
> > > +     }
> >
> > The above may send packets with TTL = 0, is that desired?
> >
> 
> If TTL is 1 or 0, execute_dec_ttl() returns -EHOSTUNREACH, and the
> caller will just send the packet to userspace and then free it.
> I think this is enough, am I missing something?

No, you are not. I was missing something.
I now think this logic is fine.
Ben Pfaff Nov. 18, 2019, 4:19 p.m. UTC | #5
On Tue, Nov 12, 2019 at 04:46:12PM +0100, Matteo Croce wrote:
> On Tue, Nov 12, 2019 at 4:00 PM Simon Horman <simon.horman@netronome.com> wrote:
> >
> > On Tue, Nov 12, 2019 at 11:25:18AM +0100, Matteo Croce wrote:
> > > New action to decrement TTL instead of setting it to a fixed value.
> > > This action will decrement the TTL and, in case of expired TTL, send the
> > > packet to userspace via output_userspace() to take care of it.
> > >
> > > Supports both IPv4 and IPv6 via the ttl and hop_limit fields, respectively.
> > >
> >
> > Usually OVS achieves this behaviour by matching on the TTL and
> > setting it to the desired value, pre-calculated as TTL -1.
> > With that in mind could you explain the motivation for this
> > change?
> >
> 
> Hi,
> 
> the problem is that OVS creates a flow for each ttl it see. I can let
> vswitchd create 255 flows with like this:
> 
> $ for i in {2..255}; do ping 192.168.0.2 -t $i -c1 -w1 &>/dev/null & done
> $ ovs-dpctl dump-flows |fgrep -c 'set(ipv4(ttl'
> 255

Sure, you can easily invent a situation.  In real traffic there's not
usually such a variety of TTLs for a flow that matches on the number of
fields that OVS usually needs to match.  Do you see a real problem given
actual traffic in practice?
Matteo Croce Nov. 22, 2019, 3:32 p.m. UTC | #6
On Mon, Nov 18, 2019 at 5:20 PM Ben Pfaff <blp@ovn.org> wrote:
>
> On Tue, Nov 12, 2019 at 04:46:12PM +0100, Matteo Croce wrote:
> > On Tue, Nov 12, 2019 at 4:00 PM Simon Horman <simon.horman@netronome.com> wrote:
> > >
> > > On Tue, Nov 12, 2019 at 11:25:18AM +0100, Matteo Croce wrote:
> > > > New action to decrement TTL instead of setting it to a fixed value.
> > > > This action will decrement the TTL and, in case of expired TTL, send the
> > > > packet to userspace via output_userspace() to take care of it.
> > > >
> > > > Supports both IPv4 and IPv6 via the ttl and hop_limit fields, respectively.
> > > >
> > >
> > > Usually OVS achieves this behaviour by matching on the TTL and
> > > setting it to the desired value, pre-calculated as TTL -1.
> > > With that in mind could you explain the motivation for this
> > > change?
> > >
> >
> > Hi,
> >
> > the problem is that OVS creates a flow for each ttl it see. I can let
> > vswitchd create 255 flows with like this:
> >
> > $ for i in {2..255}; do ping 192.168.0.2 -t $i -c1 -w1 &>/dev/null & done
> > $ ovs-dpctl dump-flows |fgrep -c 'set(ipv4(ttl'
> > 255
>
> Sure, you can easily invent a situation.  In real traffic there's not
> usually such a variety of TTLs for a flow that matches on the number of
> fields that OVS usually needs to match.  Do you see a real problem given
> actual traffic in practice?
>

Hi Ben,

yes, my situation was a bit artificious, but you can get a similar
situation in practice.

Imagine a router with some subnetworks behind it on N levels, with
some nodes hosting virtual machines.
Windows and Linux have different default TTL values, 128 and 64
respectively, so you could see N*2 different TTL values.

Bye,



--
Matteo Croce
per aspera ad upstream
diff mbox series

Patch

diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h
index 1887a451c388..a3bdb1ecd1e7 100644
--- a/include/uapi/linux/openvswitch.h
+++ b/include/uapi/linux/openvswitch.h
@@ -890,6 +890,7 @@  struct check_pkt_len_arg {
  * @OVS_ACTION_ATTR_CHECK_PKT_LEN: Check the packet length and execute a set
  * of actions if greater than the specified packet length, else execute
  * another set of actions.
+ * @OVS_ACTION_ATTR_DEC_TTL: Decrement the IP TTL.
  *
  * Only a single header can be set with a single %OVS_ACTION_ATTR_SET.  Not all
  * fields within a header are modifiable, e.g. the IPv4 protocol and fragment
@@ -925,6 +926,7 @@  enum ovs_action_attr {
 	OVS_ACTION_ATTR_METER,        /* u32 meter ID. */
 	OVS_ACTION_ATTR_CLONE,        /* Nested OVS_CLONE_ATTR_*.  */
 	OVS_ACTION_ATTR_CHECK_PKT_LEN, /* Nested OVS_CHECK_PKT_LEN_ATTR_*. */
+	OVS_ACTION_ATTR_DEC_TTL,      /* Decrement ttl action */
 
 	__OVS_ACTION_ATTR_MAX,	      /* Nothing past this will be accepted
 				       * from userspace. */
diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index 12936c151cc0..077b7f309c93 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -1174,6 +1174,43 @@  static int execute_check_pkt_len(struct datapath *dp, struct sk_buff *skb,
 			     nla_len(actions), last, clone_flow_key);
 }
 
+static int execute_dec_ttl(struct sk_buff *skb, struct sw_flow_key *key)
+{
+	int err;
+
+	if (skb->protocol == htons(ETH_P_IPV6)) {
+		struct ipv6hdr *nh = ipv6_hdr(skb);
+
+		err = skb_ensure_writable(skb, skb_network_offset(skb) +
+					  sizeof(*nh));
+		if (unlikely(err))
+			return err;
+
+		if (nh->hop_limit <= 1)
+			return -EHOSTUNREACH;
+
+		key->ip.ttl = --nh->hop_limit;
+	} else {
+		struct iphdr *nh = ip_hdr(skb);
+		u8 old_ttl;
+
+		err = skb_ensure_writable(skb, skb_network_offset(skb) +
+					  sizeof(*nh));
+		if (unlikely(err))
+			return err;
+
+		if (nh->ttl <= 1)
+			return -EHOSTUNREACH;
+
+		old_ttl = nh->ttl--;
+		csum_replace2(&nh->check, htons(old_ttl << 8),
+			      htons(nh->ttl << 8));
+		key->ip.ttl = nh->ttl;
+	}
+
+	return 0;
+}
+
 /* Execute a list of actions against 'skb'. */
 static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
 			      struct sw_flow_key *key,
@@ -1345,6 +1382,15 @@  static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
 
 			break;
 		}
+
+		case OVS_ACTION_ATTR_DEC_TTL:
+			err = execute_dec_ttl(skb, key);
+			if (err == -EHOSTUNREACH) {
+				output_userspace(dp, skb, key, a, attr,
+						 len, OVS_CB(skb)->cutlen);
+				OVS_CB(skb)->cutlen = 0;
+			}
+			break;
 		}
 
 		if (unlikely(err)) {
diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c
index 65c2e3458ff5..d17f2d4b420f 100644
--- a/net/openvswitch/flow_netlink.c
+++ b/net/openvswitch/flow_netlink.c
@@ -79,6 +79,7 @@  static bool actions_may_change_flow(const struct nlattr *actions)
 		case OVS_ACTION_ATTR_SET_MASKED:
 		case OVS_ACTION_ATTR_METER:
 		case OVS_ACTION_ATTR_CHECK_PKT_LEN:
+		case OVS_ACTION_ATTR_DEC_TTL:
 		default:
 			return true;
 		}
@@ -3005,6 +3006,7 @@  static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr,
 			[OVS_ACTION_ATTR_METER] = sizeof(u32),
 			[OVS_ACTION_ATTR_CLONE] = (u32)-1,
 			[OVS_ACTION_ATTR_CHECK_PKT_LEN] = (u32)-1,
+			[OVS_ACTION_ATTR_DEC_TTL] = 0,
 		};
 		const struct ovs_action_push_vlan *vlan;
 		int type = nla_type(a);
@@ -3233,6 +3235,10 @@  static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr,
 			break;
 		}
 
+		case OVS_ACTION_ATTR_DEC_TTL:
+			/* Nothing to do.  */
+			break;
+
 		default:
 			OVS_NLERR(log, "Unknown Action type %d", type);
 			return -EINVAL;