diff mbox series

[net-next,v6,04/11] ipv4: Dump route exceptions if requested

Message ID b5aacd9a3a3f4b256dfd091cdd8771d0f6a1aea2.1560987611.git.sbrivio@redhat.com
State Changes Requested
Delegated to: David Miller
Headers show
Series Fix listing (IPv4, IPv6) and flushing (IPv6) of cached route exceptions | expand

Commit Message

Stefano Brivio June 19, 2019, 11:59 p.m. UTC
Since commit 4895c771c7f0 ("ipv4: Add FIB nexthop exceptions."), cached
exception routes are stored as a separate entity, so they are not dumped
on a FIB dump, even if the RTM_F_CLONED flag is passed.

This implies that the command 'ip route list cache' doesn't return any
result anymore.

If the RTM_F_CLONED is passed, and strict checking requested, retrieve
nexthop exception routes and dump them. If no strict checking is
requested, filtering can't be performed consistently: dump everything in
that case.

With this, we need to add an argument to the netlink callback in order to
track how many entries were already dumped for the last leaf included in
a partial netlink dump.

Note that this is only as accurate as the existing tracking mechanism for
leaves: if a partial dump is restarted after exceptions are removed or
expired, we might skip some non-dumped entries. To improve this, we could
attach a 'sernum' attribute (similar to the one used for IPv6) to nexthop
entities, and bump this counter whenever exceptions change.

Listing of exception routes (modified routes pre-3.5) was tested against
these versions of kernel and iproute2:

                    iproute2
kernel         4.14.0   4.15.0   4.19.0   5.0.0   5.1.0
 3.5-rc4         +        +        +        +       +
 4.4
 4.9
 4.14
 4.15
 4.19
 5.0
 5.1
 fixed           +        +        +        +       +

v6:
   - Rebased onto net-next
   - Loop over nexthop paths too. Move loop over fnhe buckets to route.c,
     avoids need to export rt_fill_info() and to touch exceptions from
     fib_trie.c. Pass NULL as flow to rt_fill_info(), it now allows that
     (all suggested by David Ahern)

Fixes: 4895c771c7f0 ("ipv4: Add FIB nexthop exceptions.")
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
---
 include/net/route.h |  4 +++
 net/ipv4/fib_trie.c | 60 ++++++++++++++++++++++++++++++++++++---------
 net/ipv4/route.c    | 56 ++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 108 insertions(+), 12 deletions(-)

Comments

David Ahern June 20, 2019, 1:31 p.m. UTC | #1
On 6/19/19 5:59 PM, Stefano Brivio wrote:
> diff --git a/include/net/route.h b/include/net/route.h
> index 065b47754f05..e7f65388a6d4 100644
> --- a/include/net/route.h
> +++ b/include/net/route.h
> @@ -44,6 +44,7 @@
>  #define RT_CONN_FLAGS_TOS(sk,tos)   (RT_TOS(tos) | sock_flag(sk, SOCK_LOCALROUTE))
>  
>  struct fib_nh;
> +struct fib_alias;
>  struct fib_info;
>  struct uncached_list;
>  struct rtable {

we should not expose fib_alias to route.c.

> @@ -230,6 +231,9 @@ void fib_modify_prefix_metric(struct in_ifaddr *ifa, u32 new_metric);
>  void rt_add_uncached_list(struct rtable *rt);
>  void rt_del_uncached_list(struct rtable *rt);
>  
> +int fnhe_dump_buckets(struct fib_alias *fa, int nhsel, struct sk_buff *skb,
> +		      struct netlink_callback *cb, int *fa_index, int fa_start);
> +
>  static inline void ip_rt_put(struct rtable *rt)
>  {
>  	/* dst_release() accepts a NULL parameter.
> diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
> index 94e5d83db4db..03f51e5192e5 100644
> --- a/net/ipv4/fib_trie.c
> +++ b/net/ipv4/fib_trie.c
> @@ -2078,28 +2078,51 @@ void fib_free_table(struct fib_table *tb)
>  	call_rcu(&tb->rcu, __trie_free_rcu);
>  }
>  
> +static int fib_dump_fnhe_from_leaf(struct fib_alias *fa, struct sk_buff *skb,
> +				   struct netlink_callback *cb,
> +				   int *fa_index, int fa_start)
> +{
> +	struct fib_info *fi = fa->fa_info;
> +	int nhsel;
> +
> +	if (!fi || fi->fib_flags & RTNH_F_DEAD)
> +		return 0;
> +
> +	for (nhsel = 0; nhsel < fib_info_num_path(fi); nhsel++) {
> +		int err;
> +
> +		err = fnhe_dump_buckets(fa, nhsel, skb, cb, fa_index, fa_start);
> +		if (err)
> +			return err;
> +	}
> +
> +	return 0;
> +}

fib_info would be the better argument to pass in to the fnhe dump, and
I think the loop over where the bucket is should be in route.c as well.
So how about fib_info_dump_fnhe() as the helper exposed from route.c,
and it does the loop over nexthops and calls fnhe_dump_buckets.

As for the loop, you could fill an skb without finishing a bucket inside
of a nexthop so you need top track which nexthop is current as well.
Stefano Brivio June 20, 2019, 7:01 p.m. UTC | #2
On Thu, 20 Jun 2019 07:31:32 -0600
David Ahern <dsahern@gmail.com> wrote:

> On 6/19/19 5:59 PM, Stefano Brivio wrote:
> > diff --git a/include/net/route.h b/include/net/route.h
> > index 065b47754f05..e7f65388a6d4 100644
> > --- a/include/net/route.h
> > +++ b/include/net/route.h
> > @@ -44,6 +44,7 @@
> >  #define RT_CONN_FLAGS_TOS(sk,tos)   (RT_TOS(tos) | sock_flag(sk, SOCK_LOCALROUTE))
> >  
> >  struct fib_nh;
> > +struct fib_alias;
> >  struct fib_info;
> >  struct uncached_list;
> >  struct rtable {  
> 
> we should not expose fib_alias to route.c.

I was also not enthusiastic about that, but...:

> > @@ -230,6 +231,9 @@ void fib_modify_prefix_metric(struct in_ifaddr *ifa, u32 new_metric);
> >  void rt_add_uncached_list(struct rtable *rt);
> >  void rt_del_uncached_list(struct rtable *rt);
> >  
> > +int fnhe_dump_buckets(struct fib_alias *fa, int nhsel, struct sk_buff *skb,
> > +		      struct netlink_callback *cb, int *fa_index, int fa_start);
> > +
> >  static inline void ip_rt_put(struct rtable *rt)
> >  {
> >  	/* dst_release() accepts a NULL parameter.
> > diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
> > index 94e5d83db4db..03f51e5192e5 100644
> > --- a/net/ipv4/fib_trie.c
> > +++ b/net/ipv4/fib_trie.c
> > @@ -2078,28 +2078,51 @@ void fib_free_table(struct fib_table *tb)
> >  	call_rcu(&tb->rcu, __trie_free_rcu);
> >  }
> >  
> > +static int fib_dump_fnhe_from_leaf(struct fib_alias *fa, struct sk_buff *skb,
> > +				   struct netlink_callback *cb,
> > +				   int *fa_index, int fa_start)
> > +{
> > +	struct fib_info *fi = fa->fa_info;
> > +	int nhsel;
> > +
> > +	if (!fi || fi->fib_flags & RTNH_F_DEAD)
> > +		return 0;
> > +
> > +	for (nhsel = 0; nhsel < fib_info_num_path(fi); nhsel++) {
> > +		int err;
> > +
> > +		err = fnhe_dump_buckets(fa, nhsel, skb, cb, fa_index, fa_start);
> > +		if (err)
> > +			return err;
> > +	}
> > +
> > +	return 0;
> > +}  
> 
> fib_info would be the better argument to pass in to the fnhe dump, and

...we need to pass the table ID to rt_fill_info(). Sure, I can pass
that explicitly, but doing so kind of tells me I'm not passing the
right argument, with sufficient information. What do you think?

> I think the loop over where the bucket is should be in route.c as well.
> So how about fib_info_dump_fnhe() as the helper exposed from route.c,
> and it does the loop over nexthops and calls fnhe_dump_buckets.

Yes, I could do that conveniently if I'm passing a fib_info there. I'm
stlll undecided if it's worth it, I guess I don't really have a
preference.

> As for the loop, you could fill an skb without finishing a bucket inside
> of a nexthop so you need top track which nexthop is current as well.

I think this is not a problem, and also checked that selftests trigger
this. Buckets are transparent to the counter for partial dumps (s_fa),
they are just an arbitrary grouping from that perspective, just like
items on the chain for the same bucket.

Take this example, s_i values in [], s_fa values in ():

  node (fa) #1 [1]
    nexthop #1
    bucket #1 -> #0 in chain (1)
    bucket #2 -> #0 in chain (2) -> #1 in chain (3) -> #2 in chain (4)
    bucket #3 -> #0 in chain (5) -> #1 in chain (6)

    nexthop #2
    bucket #1 -> #0 in chain (7) -> #1 in chain (8)
    bucket #2 -> #0 in chain (9)
  --
  node (fa) #2 [2]
    nexthop #1
    bucket #1 -> #0 in chain (1) -> #1 in chain (2)
    bucket #2 -> #0 in chain (3)


If I stop at (3), (4), (7) for "node #1", or at (2) for "node #2", it
doesn't really matter, because nexthops and buckets are always
traversed in the same way (walking flattens all that).

For IPv4, I could even drop the in-tree/in-node distinction (s_i/s_fa).
But accounting becomes terribly inconvenient though, and it would be
inconsistent with what I needed to do for IPv6 (skip/skip_in_node): we
have 'sernum' there, which is used to mark what node we need to restart
from in case of changes. Within a node, however, I can't make any
assumptions like that, so if the fib6 tree changes, I'll restart from
the beginning of the node (see discussion with Martin on v1).

My idea would be to keep it like it is at the moment, and later make it
as "accurate" as it is on IPv6, introducing something like 'sernum'. If
we start with this, it will be more convenient to do that later.
David Ahern June 20, 2019, 7:21 p.m. UTC | #3
On 6/20/19 1:01 PM, Stefano Brivio wrote:
>>> diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
>>> index 94e5d83db4db..03f51e5192e5 100644
>>> --- a/net/ipv4/fib_trie.c
>>> +++ b/net/ipv4/fib_trie.c
>>> @@ -2078,28 +2078,51 @@ void fib_free_table(struct fib_table *tb)
>>>  	call_rcu(&tb->rcu, __trie_free_rcu);
>>>  }
>>>  
>>> +static int fib_dump_fnhe_from_leaf(struct fib_alias *fa, struct sk_buff *skb,
>>> +				   struct netlink_callback *cb,
>>> +				   int *fa_index, int fa_start)
>>> +{
>>> +	struct fib_info *fi = fa->fa_info;
>>> +	int nhsel;
>>> +
>>> +	if (!fi || fi->fib_flags & RTNH_F_DEAD)
>>> +		return 0;
>>> +
>>> +	for (nhsel = 0; nhsel < fib_info_num_path(fi); nhsel++) {
>>> +		int err;
>>> +
>>> +		err = fnhe_dump_buckets(fa, nhsel, skb, cb, fa_index, fa_start);
>>> +		if (err)
>>> +			return err;
>>> +	}
>>> +
>>> +	return 0;
>>> +}  
>>
>> fib_info would be the better argument to pass in to the fnhe dump, and
> 
> ...we need to pass the table ID to rt_fill_info(). Sure, I can pass
> that explicitly, but doing so kind of tells me I'm not passing the
> right argument, with sufficient information. What do you think?

I think that is preferable to passing fib_alias.

> 
>> I think the loop over where the bucket is should be in route.c as well.
>> So how about fib_info_dump_fnhe() as the helper exposed from route.c,
>> and it does the loop over nexthops and calls fnhe_dump_buckets.
> 
> Yes, I could do that conveniently if I'm passing a fib_info there. I'm
> stlll undecided if it's worth it, I guess I don't really have a
> preference.
> 
>> As for the loop, you could fill an skb without finishing a bucket inside
>> of a nexthop so you need top track which nexthop is current as well.
> 
> I think this is not a problem, and also checked that selftests trigger
> this. Buckets are transparent to the counter for partial dumps (s_fa),
> they are just an arbitrary grouping from that perspective, just like
> items on the chain for the same bucket.
> 
> Take this example, s_i values in [], s_fa values in ():
> 
>   node (fa) #1 [1]
>     nexthop #1
>     bucket #1 -> #0 in chain (1)
>     bucket #2 -> #0 in chain (2) -> #1 in chain (3) -> #2 in chain (4)
>     bucket #3 -> #0 in chain (5) -> #1 in chain (6)
> 
>     nexthop #2
>     bucket #1 -> #0 in chain (7) -> #1 in chain (8)
>     bucket #2 -> #0 in chain (9)
>   --
>   node (fa) #2 [2]
>     nexthop #1
>     bucket #1 -> #0 in chain (1) -> #1 in chain (2)
>     bucket #2 -> #0 in chain (3)
> 
> 
> If I stop at (3), (4), (7) for "node #1", or at (2) for "node #2", it
> doesn't really matter, because nexthops and buckets are always
> traversed in the same way (walking flattens all that).
> 
> For IPv4, I could even drop the in-tree/in-node distinction (s_i/s_fa).
> But accounting becomes terribly inconvenient though, and it would be
> inconsistent with what I needed to do for IPv6 (skip/skip_in_node): we
> have 'sernum' there, which is used to mark what node we need to restart
> from in case of changes. Within a node, however, I can't make any
> assumptions like that, so if the fib6 tree changes, I'll restart from
> the beginning of the node (see discussion with Martin on v1).
> 
> My idea would be to keep it like it is at the moment, and later make it
> as "accurate" as it is on IPv6, introducing something like 'sernum'. If
> we start with this, it will be more convenient to do that later.
> 

ok, if you have it covered. can you add that description above to the
commit message. Be good to capture how it is covered.
diff mbox series

Patch

diff --git a/include/net/route.h b/include/net/route.h
index 065b47754f05..e7f65388a6d4 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -44,6 +44,7 @@ 
 #define RT_CONN_FLAGS_TOS(sk,tos)   (RT_TOS(tos) | sock_flag(sk, SOCK_LOCALROUTE))
 
 struct fib_nh;
+struct fib_alias;
 struct fib_info;
 struct uncached_list;
 struct rtable {
@@ -230,6 +231,9 @@  void fib_modify_prefix_metric(struct in_ifaddr *ifa, u32 new_metric);
 void rt_add_uncached_list(struct rtable *rt);
 void rt_del_uncached_list(struct rtable *rt);
 
+int fnhe_dump_buckets(struct fib_alias *fa, int nhsel, struct sk_buff *skb,
+		      struct netlink_callback *cb, int *fa_index, int fa_start);
+
 static inline void ip_rt_put(struct rtable *rt)
 {
 	/* dst_release() accepts a NULL parameter.
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 94e5d83db4db..03f51e5192e5 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -2078,28 +2078,51 @@  void fib_free_table(struct fib_table *tb)
 	call_rcu(&tb->rcu, __trie_free_rcu);
 }
 
+static int fib_dump_fnhe_from_leaf(struct fib_alias *fa, struct sk_buff *skb,
+				   struct netlink_callback *cb,
+				   int *fa_index, int fa_start)
+{
+	struct fib_info *fi = fa->fa_info;
+	int nhsel;
+
+	if (!fi || fi->fib_flags & RTNH_F_DEAD)
+		return 0;
+
+	for (nhsel = 0; nhsel < fib_info_num_path(fi); nhsel++) {
+		int err;
+
+		err = fnhe_dump_buckets(fa, nhsel, skb, cb, fa_index, fa_start);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
 static int fn_trie_dump_leaf(struct key_vector *l, struct fib_table *tb,
 			     struct sk_buff *skb, struct netlink_callback *cb,
 			     struct fib_dump_filter *filter)
 {
 	unsigned int flags = NLM_F_MULTI;
 	__be32 xkey = htonl(l->key);
+	int i, s_i, i_fa, s_fa, err;
 	struct fib_alias *fa;
-	int i, s_i;
 
-	if (filter->filter_set)
+	if (filter->filter_set ||
+	    !filter->dump_exceptions || !filter->dump_routes)
 		flags |= NLM_F_DUMP_FILTERED;
 
 	s_i = cb->args[4];
+	s_fa = cb->args[5];
 	i = 0;
 
 	/* rcu_read_lock is hold by caller */
 	hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) {
-		int err;
-
 		if (i < s_i)
 			goto next;
 
+		i_fa = 0;
+
 		if (tb->tb_id != fa->tb_id)
 			goto next;
 
@@ -2116,21 +2139,34 @@  static int fn_trie_dump_leaf(struct key_vector *l, struct fib_table *tb,
 				goto next;
 		}
 
-		err = fib_dump_info(skb, NETLINK_CB(cb->skb).portid,
-				    cb->nlh->nlmsg_seq, RTM_NEWROUTE,
-				    tb->tb_id, fa->fa_type,
-				    xkey, KEYLENGTH - fa->fa_slen,
-				    fa->fa_tos, fa->fa_info, flags);
-		if (err < 0) {
-			cb->args[4] = i;
-			return err;
+		if (filter->dump_routes && !s_fa) {
+			err = fib_dump_info(skb, NETLINK_CB(cb->skb).portid,
+					    cb->nlh->nlmsg_seq, RTM_NEWROUTE,
+					    tb->tb_id, fa->fa_type,
+					    xkey, KEYLENGTH - fa->fa_slen,
+					    fa->fa_tos, fa->fa_info, flags);
+			if (err < 0)
+				goto stop;
+			i_fa++;
 		}
+
+		if (filter->dump_exceptions) {
+			err = fib_dump_fnhe_from_leaf(fa, skb, cb, &i_fa, s_fa);
+			if (err < 0)
+				goto stop;
+		}
+
 next:
 		i++;
 	}
 
 	cb->args[4] = i;
 	return skb->len;
+
+stop:
+	cb->args[4] = i;
+	cb->args[5] = i_fa;
+	return err;
 }
 
 /* rcu_read_lock needs to be hold by caller from readside */
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 052a80373b1d..b3961a135dfc 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -2813,6 +2813,62 @@  static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
 	return -EMSGSIZE;
 }
 
+int fnhe_dump_buckets(struct fib_alias *fa, int nhsel, struct sk_buff *skb,
+		      struct netlink_callback *cb, int *fa_index, int fa_start)
+{
+	struct net *net = sock_net(cb->skb->sk);
+	struct fnhe_hash_bucket *bucket;
+	struct fib_nh_common *nhc;
+	int i, genid;
+
+	nhc = fib_info_nhc(fa->fa_info, nhsel);
+	if (nhc->nhc_flags & RTNH_F_DEAD)
+		return 0;
+
+	bucket = rcu_dereference(nhc->nhc_exceptions);
+	if (!bucket)
+		return 0;
+
+	genid = fnhe_genid(net);
+
+	for (i = 0; i < FNHE_HASH_SIZE; i++) {
+		struct fib_nh_exception *fnhe;
+
+		for (fnhe = rcu_dereference(bucket[i].chain); fnhe;
+		     fnhe = rcu_dereference(fnhe->fnhe_next)) {
+			struct rtable *rt;
+			int err;
+
+			if (*fa_index < fa_start)
+				goto next;
+
+			if (fnhe->fnhe_genid != genid)
+				goto next;
+
+			if (fnhe->fnhe_expires &&
+			    time_after(jiffies, fnhe->fnhe_expires))
+				goto next;
+
+			rt = rcu_dereference(fnhe->fnhe_rth_input);
+			if (!rt)
+				rt = rcu_dereference(fnhe->fnhe_rth_output);
+			if (!rt)
+				goto next;
+
+			err = rt_fill_info(net, fnhe->fnhe_daddr, 0, rt,
+					   fa->tb_id, NULL, skb,
+					   NETLINK_CB(cb->skb).portid,
+					   cb->nlh->nlmsg_seq);
+			if (err)
+				return err;
+next:
+			(*fa_index)++;
+		}
+	}
+
+	return 0;
+}
+
 static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst,
 						   u8 ip_proto, __be16 sport,
 						   __be16 dport)