[net-next,v2,09/11] ipv4: fib: Add an API to request a FIB dump

Message ID 1479911670-4525-10-git-send-email-jiri@resnulli.us
State Changes Requested, archived
Delegated to: David Miller

Commit Message

Jiri Pirko Nov. 23, 2016, 2:34 p.m. UTC
From: Ido Schimmel <idosch@mellanox.com>

Commit b90eb7549499 ("fib: introduce FIB notification infrastructure")
introduced a new notification chain to notify listeners (e.g., switchdev
drivers) about addition and deletion of routes.

However, upon registration to the chain the FIB tables can already be
populated, which means potential listeners will have an incomplete view
of the tables.

Solve that by adding an API to request a FIB dump. The dump itself is
done using RCU in order not to starve consumers that need RTNL to make
progress.

For each net namespace the integrity of the dump is ensured by reading
the atomic change sequence counter before and after the dump. This
allows us to avoid the problematic situation in which the dumping
process sends an ENTRY_ADD notification following an ENTRY_DEL generated by
another process holding RTNL.

Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
---
 include/net/ip_fib.h |   1 +
 net/ipv4/fib_trie.c  | 117 +++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 118 insertions(+)
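
For context, here is a minimal consumer-side sketch of how a listener
might use this API. The driver name, callback body, and retry policy are
illustrative assumptions, not part of this patch:

static int mydrv_fib_event(struct notifier_block *nb,
			   unsigned long event, void *ptr)
{
	/* ptr is a struct fib_notifier_info *; program ENTRY_ADD /
	 * ENTRY_DEL and rule events into the device here.
	 */
	return NOTIFY_DONE;
}

static struct notifier_block mydrv_fib_nb = {
	.notifier_call = mydrv_fib_event,
};

static int mydrv_fib_init(void)
{
	int retries = 10;	/* arbitrary illustrative limit */
	int err;

	err = register_fib_notifier(&mydrv_fib_nb);
	if (err)
		return err;

	while (retries--) {
		/* false means a concurrent FIB change raced with the
		 * dump; the driver must flush whatever it programmed
		 * and try again.
		 */
		if (fib_notifier_dump(&mydrv_fib_nb))
			return 0;
	}

	unregister_fib_notifier(&mydrv_fib_nb);
	return -EBUSY;
}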

Comments

Hannes Frederic Sowa Nov. 23, 2016, 5:47 p.m. UTC | #1
On 23.11.2016 15:34, Jiri Pirko wrote:
> From: Ido Schimmel <idosch@mellanox.com>
> 
> Commit b90eb7549499 ("fib: introduce FIB notification infrastructure")
> introduced a new notification chain to notify listeners (e.g., switchdev
> drivers) about addition and deletion of routes.
> 
> However, upon registration to the chain the FIB tables can already be
> populated, which means potential listeners will have an incomplete view
> of the tables.
> 
> Solve that by adding an API to request a FIB dump. The dump itself is
> done using RCU in order not to starve consumers that need RTNL to make
> progress.
> 
> For each net namespace the integrity of the dump is ensured by reading
> the atomic change sequence counter before and after the dump. This
> allows us to avoid the problematic situation in which the dumping
> process sends an ENTRY_ADD notification following an ENTRY_DEL generated by
> another process holding RTNL.
> 
> Signed-off-by: Ido Schimmel <idosch@mellanox.com>
> Signed-off-by: Jiri Pirko <jiri@mellanox.com>
> ---
>  include/net/ip_fib.h |   1 +
>  net/ipv4/fib_trie.c  | 117 +++++++++++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 118 insertions(+)
> 
> diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
> index 6c67b93..c76303e 100644
> --- a/include/net/ip_fib.h
> +++ b/include/net/ip_fib.h
> @@ -221,6 +221,7 @@ enum fib_event_type {
>  	FIB_EVENT_RULE_DEL,
>  };
>  
> +bool fib_notifier_dump(struct notifier_block *nb);
>  int register_fib_notifier(struct notifier_block *nb);
>  int unregister_fib_notifier(struct notifier_block *nb);
>  int call_fib_notifiers(struct net *net, enum fib_event_type event_type,
> diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
> index b1d2d09..9770edfe 100644
> --- a/net/ipv4/fib_trie.c
> +++ b/net/ipv4/fib_trie.c
> @@ -86,6 +86,67 @@
>  
>  static ATOMIC_NOTIFIER_HEAD(fib_chain);
>  
> +static int call_fib_notifier(struct notifier_block *nb, struct net *net,
> +			     enum fib_event_type event_type,
> +			     struct fib_notifier_info *info)
> +{
> +	info->net = net;
> +	return nb->notifier_call(nb, event_type, info);
> +}
> +
> +static void fib_rules_notify(struct net *net, struct notifier_block *nb,
> +			     enum fib_event_type event_type)
> +{
> +#ifdef CONFIG_IP_MULTIPLE_TABLES
> +	struct fib_notifier_info info;
> +
> +	if (net->ipv4.fib_has_custom_rules)
> +		call_fib_notifier(nb, net, event_type, &info);
> +#endif
> +}
> +
> +static void fib_notify(struct net *net, struct notifier_block *nb,
> +		       enum fib_event_type event_type);
> +
> +static int call_fib_entry_notifier(struct notifier_block *nb, struct net *net,
> +				   enum fib_event_type event_type, u32 dst,
> +				   int dst_len, struct fib_info *fi,
> +				   u8 tos, u8 type, u32 tb_id, u32 nlflags)
> +{
> +	struct fib_entry_notifier_info info = {
> +		.dst = dst,
> +		.dst_len = dst_len,
> +		.fi = fi,
> +		.tos = tos,
> +		.type = type,
> +		.tb_id = tb_id,
> +		.nlflags = nlflags,
> +	};
> +	return call_fib_notifier(nb, net, event_type, &info.info);
> +}
> +
> +bool fib_notifier_dump(struct notifier_block *nb)
> +{
> +	struct net *net;
> +	bool ret = true;



> +	rcu_read_lock();
> +	for_each_net_rcu(net) {
> +		int fib_seq = atomic_read(&net->ipv4.fib_seq);
> +
> +		fib_rules_notify(net, nb, FIB_EVENT_RULE_ADD);
> +		fib_notify(net, nb, FIB_EVENT_ENTRY_ADD);
> +		if (atomic_read(&net->ipv4.fib_seq) != fib_seq) {
> +			ret = false;
> +			goto out_unlock;
> +		}

Hmm, I think you need to read the sequence counter under rtnl_lock to
have an ordering with the rest of the updates to the RCU trie. Otherwise
you don't know if the fib trie has the correct view with regard to the
incoming notifications as a whole. This is also necessary during restarts.

You can also try to register the notifier after the dump and check the
sequence number after registering the notifier; maybe that is easier
(and a restart unregisters and does the same).

Bye,
Hannes
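
A single-namespace fragment of what that second alternative could look
like; fib_dump() is a hypothetical dump-only helper that walks the
tables without touching the notifier chain:

again:
	seq = atomic_read(&net->ipv4.fib_seq);
	fib_dump(nb);		/* hypothetical: dump only, not registered yet */
	register_fib_notifier(nb);
	/* Any change between the dump and the registration bumped the
	 * counter; re-check now that we are on the chain. Events that
	 * arrive right after registration may cause a spurious retry,
	 * which is safe.
	 */
	if (atomic_read(&net->ipv4.fib_seq) != seq) {
		unregister_fib_notifier(nb);
		goto again;
	}
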
Ido Schimmel Nov. 23, 2016, 7:53 p.m. UTC | #2
On Wed, Nov 23, 2016 at 06:47:03PM +0100, Hannes Frederic Sowa wrote:
> Hmm, I think you need to read the sequence counter under rtnl_lock to
> have an ordering with the rest of the updates to the RCU trie. Otherwise
> you don't know if the fib trie has the correct view with regard to the
> incoming notifications as a whole. This is also necessary during restarts.

I spent quite a lot of time thinking about this specific issue, but I
couldn't convince myself that the read should be done under RTNL and I'm
not sure I understand your reasoning. Can you please elaborate?

If, before each notification is sent, we call atomic_inc() and then call
atomic_read() at the end, then how can we be tricked?

Thanks for looking into this!
Hannes Frederic Sowa Nov. 23, 2016, 11:04 p.m. UTC | #3
On 23.11.2016 20:53, Ido Schimmel wrote:
> On Wed, Nov 23, 2016 at 06:47:03PM +0100, Hannes Frederic Sowa wrote:
>> Hmm, I think you need to read the sequence counter under rtnl_lock to
>> have an ordering with the rest of the updates to the RCU trie. Otherwise
>> you don't know if the fib trie has the correct view with regard to the
>> incoming notifications as a whole. This is also necessary during restarts.
>
> I spent quite a lot of time thinking about this specific issue, but I
> couldn't convince myself that the read should be done under RTNL and I'm
> not sure I understand your reasoning. Can you please elaborate?
>
> If, before each notification is sent, we call atomic_inc() and then call
> atomic_read() at the end, then how can we be tricked?

The race I suspect can happen is:

<CPU0> fib_register()

<CPU1> delete route by notifier
<CPU1> enqueue delete cmd into ordered queue

<CPU0> starts dump
<CPU0> sees deleted route by CPU1 because route not yet removed from RCU
<CPU0> enqueues route for addition

sometime later, in the ordered queue:

delete route -> route not in hw, nop
add route from dump -> route added to hardware

The result should actually have been that the route isn't in hw.

Bye,
Hannes
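
To spell out why the unsynchronized counter read misses this race, here
is the same timeline annotated with assumed sequence values (assuming,
as in fib_table_delete(), that the notifier runs before the route is
unlinked from the trie):

   CPU1 (holds RTNL)                  CPU0 (RCU dump, no RTNL)
   -----------------                  ------------------------
   atomic_inc(fib_seq)  (1 -> 2)
   notify ENTRY_DEL; driver
   queues the delete
                                      reads fib_seq -> 2
                                      walks trie; the deleted route is
                                      still visible in the RCU view,
                                      so ENTRY_ADD is sent for it
   unlinks route from trie
                                      re-reads fib_seq -> 2, dump
                                      appears consistent

The driver's ordered queue then runs the DEL (a nop) followed by the
stale ADD, leaving a route in hardware that the kernel no longer has.
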
Ido Schimmel Nov. 24, 2016, 8:47 a.m. UTC | #4
On Thu, Nov 24, 2016 at 12:04:57AM +0100, Hannes Frederic Sowa wrote:
> On 23.11.2016 20:53, Ido Schimmel wrote:
> > On Wed, Nov 23, 2016 at 06:47:03PM +0100, Hannes Frederic Sowa wrote:
> >> Hmm, I think you need to read the sequence counter under rtnl_lock to
> >> have an ordering with the rest of the updates to the RCU trie. Otherwise
> >> you don't know if the fib trie has the correct view with regard to the
> >> incoming notifications as a whole. This is also necessary during restarts.
> >
> > I spent quite a lot of time thinking about this specific issue, but I
> > couldn't convince myself that the read should be done under RTNL and I'm
> > not sure I understand your reasoning. Can you please elaborate?
> >
> > If, before each notification is sent, we call atomic_inc() and then call
> > atomic_read() at the end, then how can we be tricked?
> 
> The race I suspect can happen is:
> 
> <CPU0> fib_register()
> 
> <CPU1> delete route by notifier
> <CPU1> enqueue delete cmd into ordered queue
> 
> <CPU0> starts dump
> <CPU0> sees deleted route by CPU1 because route not yet removed from RCU
> <CPU0> enqueues route for addition

Yea, I missed this trivial case... My mind was fixated on problems that
could happen after the dump had already started. :(

Regarding your suggestion, I think the API will be more useful if we
don't bundle fib_register() and fib_dump() together. We can do the
following instead:

1) Sum 'fib_seq' (doesn't need to be atomic_t anymore) from all net
namespaces under RTNL
2) Dump FIB tables under RCU
3) Do 1) again
4) Compare results from 1) and 3) and retry (according to sysctl limit)
if results differ. Before each retry the module's callback (if passed)
will be invoked.

Sounds OK?
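
A sketch of what that flow could look like; fib4_seq_sum(),
fib_dump_all(), and leaving the retry loop to the caller are
illustrative assumptions:

static unsigned int fib4_seq_sum(void)
{
	struct net *net;
	unsigned int seq = 0;

	/* Writers bump fib_seq under RTNL, so reading the sum under
	 * RTNL orders the snapshot against all trie updates.
	 */
	ASSERT_RTNL();
	for_each_net(net)
		seq += net->ipv4.fib_seq;	/* plain int is enough now */
	return seq;
}

bool fib_notifier_dump(struct notifier_block *nb)
{
	unsigned int seq_before, seq_after;

	rtnl_lock();
	seq_before = fib4_seq_sum();	/* step 1 */
	rtnl_unlock();

	fib_dump_all(nb);		/* step 2: RCU walk of all tables */

	rtnl_lock();
	seq_after = fib4_seq_sum();	/* step 3 */
	rtnl_unlock();

	/* Step 4: the caller retries (up to a sysctl limit) on false,
	 * invoking the module's callback before each retry.
	 */
	return seq_before == seq_after;
}
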
Hannes Frederic Sowa Nov. 24, 2016, 12:34 p.m. UTC | #5
On 24.11.2016 09:47, Ido Schimmel wrote:
> On Thu, Nov 24, 2016 at 12:04:57AM +0100, Hannes Frederic Sowa wrote:
>> On 23.11.2016 20:53, Ido Schimmel wrote:
>>> On Wed, Nov 23, 2016 at 06:47:03PM +0100, Hannes Frederic Sowa wrote:
>>>> Hmm, I think you need to read the sequence counter under rtnl_lock to
>>>> have an ordering with the rest of the updates to the RCU trie. Otherwise
>>>> you don't know if the fib trie has the correct view with regard to the
>>>> incoming notifications as a whole. This is also necessary during restarts.
>>>
>>> I spent quite a lot of time thinking about this specific issue, but I
>>> couldn't convince myself that the read should be done under RTNL and I'm
>>> not sure I understand your reasoning. Can you please elaborate?
>>>
>>> If, before each notification is sent, we call atomic_inc() and then call
>>> atomic_read() at the end, then how can we be tricked?
>>
>> The race I suspect can happen is:
>>
>> <CPU0> fib_register()
>>
>> <CPU1> delete route by notifier
>> <CPU1> enqueue delete cmd into ordered queue
>>
>> <CPU0> starts dump
>> <CPU0> sees deleted route by CPU1 because route not yet removed from RCU
>> <CPU0> enqueues route for addition
> 
> Yea, I missed this trivial case... My mind was fixated on problems that
> could happen after the dump had already started. :(
> 
> Regarding your suggestion, I think the API will be more useful if we
> don't bundle fib_register() and fib_dump() together. We can do the
> following instead:
> 
> 1) Sum 'fib_seq' (doesn't need to be atomic_t anymore) from all net
> namespaces under RTNL

You only support init_net anyway, no?

I didn't fully understand what you mean by sum. Using one for the whole
system?

We already have net->ipv4.rt_genid as a per-namespace routing change
counter; have you looked at that?

> 2) Dump FIB tables under RCU
> 3) Do 1) again
> 4) Compare results from 1) and 3) and retry (according to sysctl limit)
> if results differ. Before each retry the module's callback (if passed)
> will be invoked.
> 
> Sounds OK?

Ah, you want to sum up the fib_seq counters from all namespaces. Now I got it.

Not sure if that is such a good idea actually. It might cause problems
later on if offloading one day becomes a per-netns knob for the
respective admins.

But semantically it should work.

If it turns out to be much easier than doing it per-netns, I think this
approach should work.

Bye,
Hannes

Patch

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 6c67b93..c76303e 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -221,6 +221,7 @@ enum fib_event_type {
 	FIB_EVENT_RULE_DEL,
 };
 
+bool fib_notifier_dump(struct notifier_block *nb);
 int register_fib_notifier(struct notifier_block *nb);
 int unregister_fib_notifier(struct notifier_block *nb);
 int call_fib_notifiers(struct net *net, enum fib_event_type event_type,
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index b1d2d09..9770edfe 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -86,6 +86,67 @@
 
 static ATOMIC_NOTIFIER_HEAD(fib_chain);
 
+static int call_fib_notifier(struct notifier_block *nb, struct net *net,
+			     enum fib_event_type event_type,
+			     struct fib_notifier_info *info)
+{
+	info->net = net;
+	return nb->notifier_call(nb, event_type, info);
+}
+
+static void fib_rules_notify(struct net *net, struct notifier_block *nb,
+			     enum fib_event_type event_type)
+{
+#ifdef CONFIG_IP_MULTIPLE_TABLES
+	struct fib_notifier_info info;
+
+	if (net->ipv4.fib_has_custom_rules)
+		call_fib_notifier(nb, net, event_type, &info);
+#endif
+}
+
+static void fib_notify(struct net *net, struct notifier_block *nb,
+		       enum fib_event_type event_type);
+
+static int call_fib_entry_notifier(struct notifier_block *nb, struct net *net,
+				   enum fib_event_type event_type, u32 dst,
+				   int dst_len, struct fib_info *fi,
+				   u8 tos, u8 type, u32 tb_id, u32 nlflags)
+{
+	struct fib_entry_notifier_info info = {
+		.dst = dst,
+		.dst_len = dst_len,
+		.fi = fi,
+		.tos = tos,
+		.type = type,
+		.tb_id = tb_id,
+		.nlflags = nlflags,
+	};
+	return call_fib_notifier(nb, net, event_type, &info.info);
+}
+
+bool fib_notifier_dump(struct notifier_block *nb)
+{
+	struct net *net;
+	bool ret = true;
+
+	rcu_read_lock();
+	for_each_net_rcu(net) {
+		int fib_seq = atomic_read(&net->ipv4.fib_seq);
+
+		fib_rules_notify(net, nb, FIB_EVENT_RULE_ADD);
+		fib_notify(net, nb, FIB_EVENT_ENTRY_ADD);
+		if (atomic_read(&net->ipv4.fib_seq) != fib_seq) {
+			ret = false;
+			goto out_unlock;
+		}
+	}
+out_unlock:
+	rcu_read_unlock();
+	return ret;
+}
+EXPORT_SYMBOL(fib_notifier_dump);
+
 int register_fib_notifier(struct notifier_block *nb)
 {
 	return atomic_notifier_chain_register(&fib_chain, nb);
@@ -1902,6 +1963,62 @@ int fib_table_flush(struct net *net, struct fib_table *tb)
 	return found;
 }
 
+static void fib_leaf_notify(struct net *net, struct key_vector *l,
+			    struct fib_table *tb, struct notifier_block *nb,
+			    enum fib_event_type event_type)
+{
+	struct fib_alias *fa;
+
+	hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) {
+		struct fib_info *fi = fa->fa_info;
+
+		if (!fi)
+			continue;
+
+		/* local and main table can share the same trie,
+		 * so don't notify twice for the same entry.
+		 */
+		if (tb->tb_id != fa->tb_id)
+			continue;
+
+		call_fib_entry_notifier(nb, net, event_type, l->key,
+					KEYLENGTH - fa->fa_slen, fi, fa->fa_tos,
+					fa->fa_type, fa->tb_id, 0);
+	}
+}
+
+static void fib_table_notify(struct net *net, struct fib_table *tb,
+			     struct notifier_block *nb,
+			     enum fib_event_type event_type)
+{
+	struct trie *t = (struct trie *)tb->tb_data;
+	struct key_vector *l, *tp = t->kv;
+	t_key key = 0;
+
+	while ((l = leaf_walk_rcu(&tp, key)) != NULL) {
+		fib_leaf_notify(net, l, tb, nb, event_type);
+
+		key = l->key + 1;
+		/* stop in case of wrap around */
+		if (key < l->key)
+			break;
+	}
+}
+
+static void fib_notify(struct net *net, struct notifier_block *nb,
+		       enum fib_event_type event_type)
+{
+	unsigned int h;
+
+	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
+		struct hlist_head *head = &net->ipv4.fib_table_hash[h];
+		struct fib_table *tb;
+
+		hlist_for_each_entry_rcu(tb, head, tb_hlist)
+			fib_table_notify(net, tb, nb, event_type);
+	}
+}
+
 static void __trie_free_rcu(struct rcu_head *head)
 {
 	struct fib_table *tb = container_of(head, struct fib_table, rcu);