diff mbox

[net] ipv6: do not overwrite inetpeer metrics prematurely

Message ID 20140306095054.EEC5EE6E04@unicorn.suse.cz
State Changes Requested, archived
Delegated to: David Miller
Headers show

Commit Message

Michal Kubecek March 6, 2014, 9:50 a.m. UTC
If an IPv6 host route with metrics exists, an attempt to add a
new route for the same target with different metrics fails but
rewrites the metrics anyway:

12sp0:~ # ip route add fec0::1 dev eth0 rto_min 1000
12sp0:~ # ip -6 route show
fe80::/64 dev eth0  proto kernel  metric 256
fec0::1 dev eth0  metric 1024  rto_min lock 1s
12sp0:~ # ip route add fec0::1 dev eth0 rto_min 1500
RTNETLINK answers: File exists
12sp0:~ # ip -6 route show
fe80::/64 dev eth0  proto kernel  metric 256
fec0::1 dev eth0  metric 1024  rto_min lock 1.5s

This is caused by all IPv6 host routes using the metrics in
their inetpeer (or the shared default). This also holds for the
new route created in ip6_route_add() which shares the metrics
with the already existing route and thus ip6_route_add()
rewrites the metrics even if the new route ends up not being
used at all.

Allocate separate metrics in ip6_route_add() and copy them into
inetpeer (and update dst->_metrics) just before the new route is
actually inserted in fib6_add_rt2node().

Signed-off-by: Michal Kubecek <mkubecek@suse.cz>
---
 net/ipv6/ip6_fib.c | 28 ++++++++++++++++++++++++++++
 net/ipv6/route.c   | 10 +++++-----
 2 files changed, 33 insertions(+), 5 deletions(-)

Comments

David Miller March 6, 2014, 7:24 p.m. UTC | #1
From: Michal Kubecek <mkubecek@suse.cz>
Date: Thu,  6 Mar 2014 10:50:54 +0100 (CET)

> If an IPv6 host route with metrics exists, an attempt to add a
> new route for the same target with different metrics fails but
> rewrites the metrics anyway:
> 
> 12sp0:~ # ip route add fec0::1 dev eth0 rto_min 1000
> 12sp0:~ # ip -6 route show
> fe80::/64 dev eth0  proto kernel  metric 256
> fec0::1 dev eth0  metric 1024  rto_min lock 1s
> 12sp0:~ # ip route add fec0::1 dev eth0 rto_min 1500
> RTNETLINK answers: File exists
> 12sp0:~ # ip -6 route show
> fe80::/64 dev eth0  proto kernel  metric 256
> fec0::1 dev eth0  metric 1024  rto_min lock 1.5s
> 
> This is caused by all IPv6 host routes using the metrics in
> their inetpeer (or the shared default). This also holds for the
> new route created in ip6_route_add() which shares the metrics
> with the already existing route and thus ip6_route_add()
> rewrites the metrics even if the new route ends up not being
> used at all.
> 
> Allocate separate metrics in ip6_route_add() and copy them into
> inetpeer (and update dst->_metrics) just before the new route is
> actually inserted in fib6_add_rt2node().
> 
> Signed-off-by: Michal Kubecek <mkubecek@suse.cz>

Thanks for working on this.

> +static void rt6_metrics_to_peer(struct rt6_info *rt)
> +{
> +	struct inet_peer *peer = rt6_has_peer(rt) ? rt6_peer_ptr(rt) : NULL;
> +	struct dst_entry *dst = &rt->dst;
> +	unsigned long old = dst->_metrics;
> +
> +	if (!(rt->dst.flags & DST_HOST) || dst_metrics_read_only(dst))
> +		return;
> +	if (peer && dst_metrics_ptr(dst) == peer->metrics)
> +		return;
> +
> +	dst->ops->cow_metrics(dst, old);
> +	if (dst->_metrics != old) {
> +		u32 *old_p = __DST_METRICS_PTR(old);
> +
> +		memcpy(dst_metrics_ptr(dst), old_p, RTAX_MAX * sizeof(u32));
> +		kfree(old_p);
> +	}
> +}

Hmmm...  if inet_metrics_new() is true then ->cow_metrics() will copy the
metrics from old to new.  So you therefore shouldn't have to do the copy
explicitly here.

If inet_metrics_new() is not true, you are overwriting non-new metrics.

If there is some reason why what rt6_metrics_to_peer() is doing is OK, I'd
like you to explain this in the commit message.

Thanks!
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Michal Kubecek March 6, 2014, 8:06 p.m. UTC | #2
On Thu, Mar 06, 2014 at 02:24:25PM -0500, David Miller wrote:
> 
> > +static void rt6_metrics_to_peer(struct rt6_info *rt)
> > +{
> > +	struct inet_peer *peer = rt6_has_peer(rt) ? rt6_peer_ptr(rt) : NULL;
> > +	struct dst_entry *dst = &rt->dst;
> > +	unsigned long old = dst->_metrics;
> > +
> > +	if (!(rt->dst.flags & DST_HOST) || dst_metrics_read_only(dst))
> > +		return;
> > +	if (peer && dst_metrics_ptr(dst) == peer->metrics)
> > +		return;
> > +
> > +	dst->ops->cow_metrics(dst, old);
> > +	if (dst->_metrics != old) {
> > +		u32 *old_p = __DST_METRICS_PTR(old);
> > +
> > +		memcpy(dst_metrics_ptr(dst), old_p, RTAX_MAX * sizeof(u32));
> > +		kfree(old_p);
> > +	}
> > +}
> 
> Hmmm...  if inet_metrics_new() is true then ->cow_metrics() will copy the
> metrics from old to new.  So you therefore shouldn't have to do the copy
> explicitly here.

If we are replacing an existing host route with metrics, e.g.

  ip route add fec0::1 dev eth0 rto_min 1000
  ip route change fec0::1 dev eth0 rto_min 1500

then peer will be the existing inetpeer and inet_metrics_new() will be
false. However, we still need to copy the new metrics from the netlink
message over the old ones.

> If inet_metrics_new() is not true, you are overwriting non-new metrics.

The only problem with this is IMHO that if inet_metrics_new() is true,
i.e. when adding a new route with new inetpeer (or old inetpeer whose
metrics were not used before), the memcpy() is done twice, once in
ipv6_cow_metrics() and once in rt6_metrics_to_peer(). We are copying the
same data twice so that the result is correct but it's not efficient and
it's not nice.

The only way I can see to avoid this (except using own metrics always
instead of those in struct inetpeer as we do for non-host routes) would
be not to call ipv6_cow_metrics() at all and write a special function
for this purpose in net/ipv6/route.c which would duplicate the parts of
ipv6_cow_metrics() we really need (and add the free()). Do you think
this is the way to go?

                                                        Michal Kubecek

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Miller March 7, 2014, 8:52 p.m. UTC | #3
From: Michal Kubecek <mkubecek@suse.cz>
Date: Thu, 6 Mar 2014 21:06:58 +0100

> On Thu, Mar 06, 2014 at 02:24:25PM -0500, David Miller wrote:
>> 
>> > +static void rt6_metrics_to_peer(struct rt6_info *rt)
>> > +{
>> > +	struct inet_peer *peer = rt6_has_peer(rt) ? rt6_peer_ptr(rt) : NULL;
>> > +	struct dst_entry *dst = &rt->dst;
>> > +	unsigned long old = dst->_metrics;
>> > +
>> > +	if (!(rt->dst.flags & DST_HOST) || dst_metrics_read_only(dst))
>> > +		return;
>> > +	if (peer && dst_metrics_ptr(dst) == peer->metrics)
>> > +		return;
>> > +
>> > +	dst->ops->cow_metrics(dst, old);
>> > +	if (dst->_metrics != old) {
>> > +		u32 *old_p = __DST_METRICS_PTR(old);
>> > +
>> > +		memcpy(dst_metrics_ptr(dst), old_p, RTAX_MAX * sizeof(u32));
>> > +		kfree(old_p);
>> > +	}
>> > +}
>> 
>> Hmmm...  if inet_metrics_new() is true then ->cow_metrics() will copy the
>> metrics from old to new.  So you therefore shouldn't have to do the copy
>> explicitly here.
> 
> If we are replacing an existing host route with metrics, e.g.
> 
>   ip route add fec0::1 dev eth0 rto_min 1000
>   ip route change fec0::1 dev eth0 rto_min 1500
> 
> then peer will be the existing inetpeer and inet_metrics_new() will be
> false. However, we still need to copy the new metrics from the netlink
> message over the old ones.
> 
>> If inet_metrics_new() is not true, you are overwriting non-new metrics.
> 
> The only problem with this is IMHO that if inet_metrics_new() is true,
> i.e. when adding a new route with new inetpeer (or old inetpeer whose
> metrics were not used before), the memcpy() is done twice, once in
> ipv6_cow_metrics() and once in rt6_metrics_to_peer(). We are copying the
> same data twice so that the result is correct but it's not efficient and
> it's not nice.
> 
> The only way I can see to avoid this (except using own metrics always
> instead of those in struct inetpeer as we do for non-host routes) would
> be not to call ipv6_cow_metrics() at all and write a special function
> for this purpose in net/ipv6/route.c which would duplicate the parts of
> ipv6_cow_metrics() we really need (and add the free()). Do you think
> this is the way to go?

Thank you for explaining all of this, I would like to think about this
some more.

My initial suspicion is that the something about the test in cow
metrics might need to be adjusted.

The conceptual attributes we have built for inetpeer metrics, that of
"newness" and "read-only", might not be built adequately for the task
at hand here.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Michal Kubecek March 7, 2014, 9:38 p.m. UTC | #4
On Fri, Mar 07, 2014 at 03:52:58PM -0500, David Miller wrote:
> From: Michal Kubecek <mkubecek@suse.cz>
> Date: Thu, 6 Mar 2014 21:06:58 +0100
> 
> > On Thu, Mar 06, 2014 at 02:24:25PM -0500, David Miller wrote:
> >> 
> >> > +static void rt6_metrics_to_peer(struct rt6_info *rt)
> >> > +{
> >> > +	struct inet_peer *peer = rt6_has_peer(rt) ? rt6_peer_ptr(rt) : NULL;
> >> > +	struct dst_entry *dst = &rt->dst;
> >> > +	unsigned long old = dst->_metrics;
> >> > +
> >> > +	if (!(rt->dst.flags & DST_HOST) || dst_metrics_read_only(dst))
> >> > +		return;
> >> > +	if (peer && dst_metrics_ptr(dst) == peer->metrics)
> >> > +		return;
> >> > +
> >> > +	dst->ops->cow_metrics(dst, old);
> >> > +	if (dst->_metrics != old) {
> >> > +		u32 *old_p = __DST_METRICS_PTR(old);
> >> > +
> >> > +		memcpy(dst_metrics_ptr(dst), old_p, RTAX_MAX * sizeof(u32));
> >> > +		kfree(old_p);
> >> > +	}
> >> > +}
> >> 
> >> Hmmm...  if inet_metrics_new() is true then ->cow_metrics() will copy the
> >> metrics from old to new.  So you therefore shouldn't have to do the copy
> >> explicitly here.
> > 
> > If we are replacing an existing host route with metrics, e.g.
> > 
> >   ip route add fec0::1 dev eth0 rto_min 1000
> >   ip route change fec0::1 dev eth0 rto_min 1500
> > 
> > then peer will be the existing inetpeer and inet_metrics_new() will be
> > false. However, we still need to copy the new metrics from the netlink
> > message over the old ones.
> > 
> >> If inet_metrics_new() is not true, you are overwriting non-new metrics.
> > 
> > The only problem with this is IMHO that if inet_metrics_new() is true,
> > i.e. when adding a new route with new inetpeer (or old inetpeer whose
> > metrics were not used before), the memcpy() is done twice, once in
> > ipv6_cow_metrics() and once in rt6_metrics_to_peer(). We are copying the
> > same data twice so that the result is correct but it's not efficient and
> > it's not nice.
> > 
> > The only way I can see to avoid this (except using own metrics always
> > instead of those in struct inetpeer as we do for non-host routes) would
> > be not to call ipv6_cow_metrics() at all and write a special function
> > for this purpose in net/ipv6/route.c which would duplicate the parts of
> > ipv6_cow_metrics() we really need (and add the free()). Do you think
> > this is the way to go?
> 
> Thank you for explaining all of this, I would like to think about this
> some more.
> 
> My initial suspicion is that the something about the test in cow
> metrics might need to be adjusted.
> 
> The conceptual attributes we have built for inetpeer metrics, that of
> "newness" and "read-only", might not be built adequately for the task
> at hand here.

Today I also realized another problem with current code: if we already
have inetpeer with metrics and use "ip route change" with metrics, e.g.

  ip route add fec0::1 dev eth0 rto_min 1000
  ip route change fec0::1 dev eth0 hoplimit 10

only the metrics listed in the netlink message (hoplimit here) are
modified and the rest (rto_min here) is preserved. This is inconsistent
with non-host IPv6 routes and IPv4 routes where the whole set of metrics
is replaced.

                                                         Michal Kubecek

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Hannes Frederic Sowa March 8, 2014, 8:06 a.m. UTC | #5
On Fri, Mar 07, 2014 at 03:52:58PM -0500, David Miller wrote:
> Thank you for explaining all of this, I would like to think about this
> some more.
> 
> My initial suspicion is that the something about the test in cow
> metrics might need to be adjusted.

Hmm, you mean we can do the check when looking up the route and testing that
on cloning or cowing and maybe do the copy then?

Just trying to think out loud:

I think we always should have the metrics (if we have them) in inet_peer
in case the dst is a DST_HOST. We cannot do that directly in ip6_route_add
because we would also publish the metrics in case we don't actually
install the route because of an error.  So this is a no-go, we need to
stage them first.

In case we do that in ip6_rt_copy / ip6_pol_route:

We don't reinsert DST_HOST routes into fib so they will never go through
ip6_rt_copy, as such they are never pushed through ipv6_cow_metrics logic.

A new check would be needed in ip6_pol_route for DST_HOST routes and
check inet_peer metrics pointer with dst->metrics one. As we don't set
RTF_CACHE in rt6i_flags afterwards we would need to do that on every DST_HOST
result every time (if metrics pointer is writable).

So moving the metrics from staging area into inet_peer metrics in
rt2node seems like the best solution to me so far. I am fine with the
current approach. I also agree that current patch should not have a
performance impact, as we don't reinsert DST_HOST nodes into the fib,
so the new inline rt6_metrics_to_peer should be ok to protect against
this common case.

> The conceptual attributes we have built for inetpeer metrics, that of
> "newness" and "read-only", might not be built adequately for the task
> at hand here.

One question regarding the patch:

In case fc_mx is set we now always kzalloc a non-inet-peer region to
store the metrics for staging. I would find it a bit easier to switch
away from dst_metric_set and use array notation to make it more clear we
don't operate on inet_peer metrics below the install_route: mark in
ip6_route_add. This shouldn't alter the behaviour but just make it clear we
don't operate on inet_peer metric storage.

Hope this made any sense,

  Hannes

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Hannes Frederic Sowa March 8, 2014, 8:34 a.m. UTC | #6
On Fri, Mar 07, 2014 at 10:38:42PM +0100, Michal Kubecek wrote:
> Today I also realized another problem with current code: if we already
> have inetpeer with metrics and use "ip route change" with metrics, e.g.
> 
>   ip route add fec0::1 dev eth0 rto_min 1000
>   ip route change fec0::1 dev eth0 hoplimit 10

Hmm, those routes seem to not have RTF_NONEXTHOP (no gateway). I am
a bit concerend that rt6_select returns !RTF_CACHE node and we do end
up doing the inetpeer lookup on every lookup on them. Maybe you have
already investigated?

Thanks,

  Hannes

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Miller March 10, 2014, 12:26 a.m. UTC | #7
From: Hannes Frederic Sowa <hannes@stressinduktion.org>
Date: Sat, 8 Mar 2014 09:06:16 +0100

> I think we always should have the metrics (if we have them) in inet_peer
> in case the dst is a DST_HOST. We cannot do that directly in ip6_route_add
> because we would also publish the metrics in case we don't actually
> install the route because of an error.  So this is a no-go, we need to
> stage them first.

Absolutely, we cannot make the metrics visible until the final commit
of the route insert.

This worked transparently before we placed metrics into the inetpeer
cache, because the metrics lives in the dst itself.

I also agree with Michal's observarion of another unintended
behavioral change by this conversion.  Specifically the handling of
metric changes of existing routes.  The original code replaced all of
the metrics completely with defaults overlayed with whatever was
provided in the route change request, whereas the current inetpeer
code only updates metrics explicitly provided.

This is part of why we need to remove that DST_HOST check currently
guarding the kzalloc()'d metrics in ip6_route_add().

> So moving the metrics from staging area into inet_peer metrics in
> rt2node seems like the best solution to me so far. I am fine with the
> current approach.

I agree.

> In case fc_mx is set we now always kzalloc a non-inet-peer region to
> store the metrics for staging. I would find it a bit easier to switch
> away from dst_metric_set and use array notation to make it more clear we
> don't operate on inet_peer metrics below the install_route: mark in
> ip6_route_add. This shouldn't alter the behaviour but just make it clear we
> don't operate on inet_peer metric storage.

In fact, thinking about this some more, it's almost silly to allocate
this temporary array at all.

A different, potentially much cleaner approach, would be to pass the
cfg->fc_mx down into the __ip6_ins_rt() code path.

Then there is _no_ ambiguity.  These are metrics from a netlink
request and we must write them into the final route which we end
up with.

This makes it's way down into fib6_add_rt2node(), and right before the
insertion we make the metrics writable, clear them, and copy in the
explicit values provided from the netlink message.

All of these operations can error, and that's fine, the call chain
can handle errors signalled from here already.

Michal, Hannes, what do you think?
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Hannes Frederic Sowa March 10, 2014, 12:52 a.m. UTC | #8
On Sun, Mar 09, 2014 at 08:26:58PM -0400, David Miller wrote:
> In fact, thinking about this some more, it's almost silly to allocate
> this temporary array at all.
> 
> A different, potentially much cleaner approach, would be to pass the
> cfg->fc_mx down into the __ip6_ins_rt() code path.

Maybe some people call this a layering violation, but I like the idea! :)

> Then there is _no_ ambiguity.  These are metrics from a netlink
> request and we must write them into the final route which we end
> up with.

Exactly, we can be sure that we don't accidentally always lookup
the inetpeer if we try to reinsert a cloned route where !(rt->rt6i_flags &
(RTF_NONEXTHOP | RTF_GATEWAY)) validates to true and DST_HOST is set.

This addresses the feedback from my other mail because I think if
we insert interface routes (ip r a 2000::/xx dev foo) we don't set
RTF_NONEXTHOP. (Maybe we should change that in rtm_to_fib6_config.)

> This makes it's way down into fib6_add_rt2node(), and right before the
> insertion we make the metrics writable, clear them, and copy in the
> explicit values provided from the netlink message.
> 
> All of these operations can error, and that's fine, the call chain
> can handle errors signalled from here already.
> 
> Michal, Hannes, what do you think?

So we need to change __ip6_ins_rt to take fib6_config as the second
argument instead, still have the info pointer in fc_nlinfo and only have
to wrap the info pointer in ip6_ins_rt into another structure. fib6_add
seems to be straightforward from there.

Sounds good to me!

Thanks,

  Hannes

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 075602f..3067715 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -638,6 +638,32 @@  static inline bool rt6_qualify_for_ecmp(struct rt6_info *rt)
 	       RTF_GATEWAY;
 }
 
+/*	Called just before inserting a new route into the tree.
+ *	If it is a host route and has its own metrics allocated, copy
+ *	them to its inetpeer (create and/or bind if needed) and free
+ *	the temporary block.
+ */
+
+static void rt6_metrics_to_peer(struct rt6_info *rt)
+{
+	struct inet_peer *peer = rt6_has_peer(rt) ? rt6_peer_ptr(rt) : NULL;
+	struct dst_entry *dst = &rt->dst;
+	unsigned long old = dst->_metrics;
+
+	if (!(rt->dst.flags & DST_HOST) || dst_metrics_read_only(dst))
+		return;
+	if (peer && dst_metrics_ptr(dst) == peer->metrics)
+		return;
+
+	dst->ops->cow_metrics(dst, old);
+	if (dst->_metrics != old) {
+		u32 *old_p = __DST_METRICS_PTR(old);
+
+		memcpy(dst_metrics_ptr(dst), old_p, RTAX_MAX * sizeof(u32));
+		kfree(old_p);
+	}
+}
+
 /*
  *	Insert routing information in a node.
  */
@@ -752,6 +778,7 @@  static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt,
 
 add:
 		rt->dst.rt6_next = iter;
+		rt6_metrics_to_peer(rt);
 		*ins = rt;
 		rt->rt6i_node = fn;
 		atomic_inc(&rt->rt6i_ref);
@@ -770,6 +797,7 @@  add:
 			pr_warn("NLM_F_REPLACE set, but no existing node found!\n");
 			return -ENOENT;
 		}
+		rt6_metrics_to_peer(rt);
 		*ins = rt;
 		rt->rt6i_node = fn;
 		rt->dst.rt6_next = iter->dst.rt6_next;
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 11dac21..4ba981e 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -324,8 +324,10 @@  static void ip6_dst_destroy(struct dst_entry *dst)
 	struct rt6_info *rt = (struct rt6_info *)dst;
 	struct inet6_dev *idev = rt->rt6i_idev;
 	struct dst_entry *from = dst->from;
+	struct inet_peer *peer = rt6_has_peer(rt) ? rt6_peer_ptr(rt) : NULL;
 
-	if (!(rt->dst.flags & DST_HOST))
+	if (!(dst->flags & DST_HOST) || !peer ||
+	    DST_METRICS_PTR(dst) != peer->metrics)
 		dst_destroy_metrics_generic(dst);
 
 	if (idev) {
@@ -336,10 +338,8 @@  static void ip6_dst_destroy(struct dst_entry *dst)
 	dst->from = NULL;
 	dst_release(from);
 
-	if (rt6_has_peer(rt)) {
-		struct inet_peer *peer = rt6_peer_ptr(rt);
+	if (peer)
 		inet_putpeer(peer);
-	}
 }
 
 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
@@ -1546,7 +1546,7 @@  int ip6_route_add(struct fib6_config *cfg)
 	if (rt->rt6i_dst.plen == 128)
 	       rt->dst.flags |= DST_HOST;
 
-	if (!(rt->dst.flags & DST_HOST) && cfg->fc_mx) {
+	if (cfg->fc_mx) {
 		u32 *metrics = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
 		if (!metrics) {
 			err = -ENOMEM;