Message ID | 1495747655.6465.113.camel@edumazet-glaptop3.roam.corp.google.com |
---|---|
State | Accepted, archived |
Delegated to: | David Miller |
Headers | show |
Hello, On Thu, 25 May 2017, Eric Dumazet wrote: > From: Eric Dumazet <edumazet@google.com> > > Andrey Konovalov reported crashes in ipv4_mtu() > > I could reproduce the issue with KASAN kernels, between > 10.246.7.151 and 10.246.7.152 : > > 1) 20 concurrent netperf -t TCP_RR -H 10.246.7.152 -l 1000 & > > 2) At the same time run following loop : > while : > do > ip ro add 10.246.7.152 dev eth0 src 10.246.7.151 mtu 1500 > ip ro del 10.246.7.152 dev eth0 src 10.246.7.151 mtu 1500 > done > > > Cong Wang attempted to add back rt->fi in commit > 82486aa6f1b9 ("ipv4: restore rt->fi for reference counting") > but this proved to add some issues that were complex to solve. > > Instead, I suggested to add a refcount to the metrics themselves, > being a standalone object (in particular, no reference to other objects) > > I tried to make this patch as small as possible to ease its backport, > instead of being super clean. Note that we believe that only ipv4 dst > need to take care of the metric refcount. But if this is wrong, > this patch adds the basic infrastructure to extend this to other > families. > > Many thanks to Julian Anastasov for reviewing this patch, and Cong Wang > for his efforts on this problem. > > Fixes: 2860583fe840 ("ipv4: Kill rt->fi") > Signed-off-by: Eric Dumazet <edumazet@google.com> > Reported-by: Andrey Konovalov <andreyknvl@google.com> Nice work, thanks! Reviewed-by: Julian Anastasov <ja@ssi.bg> > --- > include/net/dst.h | 8 +++++++- > include/net/ip_fib.h | 10 +++++----- > net/core/dst.c | 23 ++++++++++++++--------- > net/ipv4/fib_semantics.c | 17 ++++++++++------- > net/ipv4/route.c | 10 +++++++++- > 5 files changed, 45 insertions(+), 23 deletions(-) > > diff --git a/include/net/dst.h b/include/net/dst.h > index 049af33da3b6c95897d544670cea65c542317673..cfc0437841665d7ed46a714915c50d723c24901c 100644 > --- a/include/net/dst.h > +++ b/include/net/dst.h > @@ -107,10 +107,16 @@ struct dst_entry { > }; > }; > > +struct dst_metrics { > + u32 metrics[RTAX_MAX]; > + atomic_t refcnt; > +}; > +extern const struct dst_metrics dst_default_metrics; > + > u32 *dst_cow_metrics_generic(struct dst_entry *dst, unsigned long old); > -extern const u32 dst_default_metrics[]; > > #define DST_METRICS_READ_ONLY 0x1UL > +#define DST_METRICS_REFCOUNTED 0x2UL > #define DST_METRICS_FLAGS 0x3UL > #define __DST_METRICS_PTR(Y) \ > ((u32 *)((Y) & ~DST_METRICS_FLAGS)) > diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h > index 6692c5758b332d468f1e0611ecc4f3e03ae03b2b..f7f6aa789c6174c41ca9739206d586c559c1f3a1 100644 > --- a/include/net/ip_fib.h > +++ b/include/net/ip_fib.h > @@ -114,11 +114,11 @@ struct fib_info { > __be32 fib_prefsrc; > u32 fib_tb_id; > u32 fib_priority; > - u32 *fib_metrics; > -#define fib_mtu fib_metrics[RTAX_MTU-1] > -#define fib_window fib_metrics[RTAX_WINDOW-1] > -#define fib_rtt fib_metrics[RTAX_RTT-1] > -#define fib_advmss fib_metrics[RTAX_ADVMSS-1] > + struct dst_metrics *fib_metrics; > +#define fib_mtu fib_metrics->metrics[RTAX_MTU-1] > +#define fib_window fib_metrics->metrics[RTAX_WINDOW-1] > +#define fib_rtt fib_metrics->metrics[RTAX_RTT-1] > +#define fib_advmss fib_metrics->metrics[RTAX_ADVMSS-1] > int fib_nhs; > #ifdef CONFIG_IP_ROUTE_MULTIPATH > int fib_weight; > diff --git a/net/core/dst.c b/net/core/dst.c > index 960e503b5a529a2c4f1866f49c150493ee98d7da..6192f11beec9077de964e2aeff4f78547f08b8da 100644 > --- a/net/core/dst.c > +++ b/net/core/dst.c > @@ -151,13 +151,13 @@ int dst_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb) > } > EXPORT_SYMBOL(dst_discard_out); > > -const u32 dst_default_metrics[RTAX_MAX + 1] = { > +const struct dst_metrics dst_default_metrics = { > /* This initializer is needed to force linker to place this variable > * into const section. Otherwise it might end into bss section. > * We really want to avoid false sharing on this variable, and catch > * any writes on it. > */ > - [RTAX_MAX] = 0xdeadbeef, > + .refcnt = ATOMIC_INIT(1), > }; > > void dst_init(struct dst_entry *dst, struct dst_ops *ops, > @@ -169,7 +169,7 @@ void dst_init(struct dst_entry *dst, struct dst_ops *ops, > if (dev) > dev_hold(dev); > dst->ops = ops; > - dst_init_metrics(dst, dst_default_metrics, true); > + dst_init_metrics(dst, dst_default_metrics.metrics, true); > dst->expires = 0UL; > dst->path = dst; > dst->from = NULL; > @@ -314,25 +314,30 @@ EXPORT_SYMBOL(dst_release); > > u32 *dst_cow_metrics_generic(struct dst_entry *dst, unsigned long old) > { > - u32 *p = kmalloc(sizeof(u32) * RTAX_MAX, GFP_ATOMIC); > + struct dst_metrics *p = kmalloc(sizeof(*p), GFP_ATOMIC); > > if (p) { > - u32 *old_p = __DST_METRICS_PTR(old); > + struct dst_metrics *old_p = (struct dst_metrics *)__DST_METRICS_PTR(old); > unsigned long prev, new; > > - memcpy(p, old_p, sizeof(u32) * RTAX_MAX); > + atomic_set(&p->refcnt, 1); > + memcpy(p->metrics, old_p->metrics, sizeof(p->metrics)); > > new = (unsigned long) p; > prev = cmpxchg(&dst->_metrics, old, new); > > if (prev != old) { > kfree(p); > - p = __DST_METRICS_PTR(prev); > + p = (struct dst_metrics *)__DST_METRICS_PTR(prev); > if (prev & DST_METRICS_READ_ONLY) > p = NULL; > + } else if (prev & DST_METRICS_REFCOUNTED) { > + if (atomic_dec_and_test(&old_p->refcnt)) > + kfree(old_p); > } > } > - return p; > + BUILD_BUG_ON(offsetof(struct dst_metrics, metrics) != 0); > + return (u32 *)p; > } > EXPORT_SYMBOL(dst_cow_metrics_generic); > > @@ -341,7 +346,7 @@ void __dst_destroy_metrics_generic(struct dst_entry *dst, unsigned long old) > { > unsigned long prev, new; > > - new = ((unsigned long) dst_default_metrics) | DST_METRICS_READ_ONLY; > + new = ((unsigned long) &dst_default_metrics) | DST_METRICS_READ_ONLY; > prev = cmpxchg(&dst->_metrics, old, new); > if (prev == old) > kfree(__DST_METRICS_PTR(old)); > diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c > index da449ddb8cc172bd9091c00057a69a095f98b56d..ad9ad4aab5da7c7d11c3b80edbdfcbdd3d7153fe 100644 > --- a/net/ipv4/fib_semantics.c > +++ b/net/ipv4/fib_semantics.c > @@ -203,6 +203,7 @@ static void rt_fibinfo_free_cpus(struct rtable __rcu * __percpu *rtp) > static void free_fib_info_rcu(struct rcu_head *head) > { > struct fib_info *fi = container_of(head, struct fib_info, rcu); > + struct dst_metrics *m; > > change_nexthops(fi) { > if (nexthop_nh->nh_dev) > @@ -213,8 +214,9 @@ static void free_fib_info_rcu(struct rcu_head *head) > rt_fibinfo_free(&nexthop_nh->nh_rth_input); > } endfor_nexthops(fi); > > - if (fi->fib_metrics != (u32 *) dst_default_metrics) > - kfree(fi->fib_metrics); > + m = fi->fib_metrics; > + if (m != &dst_default_metrics && atomic_dec_and_test(&m->refcnt)) > + kfree(m); > kfree(fi); > } > > @@ -971,11 +973,11 @@ fib_convert_metrics(struct fib_info *fi, const struct fib_config *cfg) > val = 255; > if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK)) > return -EINVAL; > - fi->fib_metrics[type - 1] = val; > + fi->fib_metrics->metrics[type - 1] = val; > } > > if (ecn_ca) > - fi->fib_metrics[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA; > + fi->fib_metrics->metrics[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA; > > return 0; > } > @@ -1033,11 +1035,12 @@ struct fib_info *fib_create_info(struct fib_config *cfg) > goto failure; > fib_info_cnt++; > if (cfg->fc_mx) { > - fi->fib_metrics = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL); > + fi->fib_metrics = kzalloc(sizeof(*fi->fib_metrics), GFP_KERNEL); > if (!fi->fib_metrics) > goto failure; > + atomic_set(&fi->fib_metrics->refcnt, 1); > } else > - fi->fib_metrics = (u32 *) dst_default_metrics; > + fi->fib_metrics = (struct dst_metrics *)&dst_default_metrics; > > fi->fib_net = net; > fi->fib_protocol = cfg->fc_protocol; > @@ -1238,7 +1241,7 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event, > if (fi->fib_priority && > nla_put_u32(skb, RTA_PRIORITY, fi->fib_priority)) > goto nla_put_failure; > - if (rtnetlink_put_metrics(skb, fi->fib_metrics) < 0) > + if (rtnetlink_put_metrics(skb, fi->fib_metrics->metrics) < 0) > goto nla_put_failure; > > if (fi->fib_prefsrc && > diff --git a/net/ipv4/route.c b/net/ipv4/route.c > index 655d9eebe43e16a59102edcd3ea4bc177c6b341d..6883b3d4ba8f69de2cb924612d60f5671a219a84 100644 > --- a/net/ipv4/route.c > +++ b/net/ipv4/route.c > @@ -1385,8 +1385,12 @@ static void rt_add_uncached_list(struct rtable *rt) > > static void ipv4_dst_destroy(struct dst_entry *dst) > { > + struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst); > struct rtable *rt = (struct rtable *) dst; > > + if (p != &dst_default_metrics && atomic_dec_and_test(&p->refcnt)) > + kfree(p); > + > if (!list_empty(&rt->rt_uncached)) { > struct uncached_list *ul = rt->rt_uncached_list; > > @@ -1438,7 +1442,11 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr, > rt->rt_gateway = nh->nh_gw; > rt->rt_uses_gateway = 1; > } > - dst_init_metrics(&rt->dst, fi->fib_metrics, true); > + dst_init_metrics(&rt->dst, fi->fib_metrics->metrics, true); > + if (fi->fib_metrics != &dst_default_metrics) { > + rt->dst._metrics |= DST_METRICS_REFCOUNTED; > + atomic_inc(&fi->fib_metrics->refcnt); > + } > #ifdef CONFIG_IP_ROUTE_CLASSID > rt->dst.tclassid = nh->nh_tclassid; > #endif Regards -- Julian Anastasov <ja@ssi.bg>
On Thu, May 25, 2017 at 2:27 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote: > From: Eric Dumazet <edumazet@google.com> > > Andrey Konovalov reported crashes in ipv4_mtu() > > I could reproduce the issue with KASAN kernels, between > 10.246.7.151 and 10.246.7.152 : > > 1) 20 concurrent netperf -t TCP_RR -H 10.246.7.152 -l 1000 & > > 2) At the same time run following loop : > while : > do > ip ro add 10.246.7.152 dev eth0 src 10.246.7.151 mtu 1500 > ip ro del 10.246.7.152 dev eth0 src 10.246.7.151 mtu 1500 > done > > > Cong Wang attempted to add back rt->fi in commit > 82486aa6f1b9 ("ipv4: restore rt->fi for reference counting") > but this proved to add some issues that were complex to solve. > > Instead, I suggested to add a refcount to the metrics themselves, > being a standalone object (in particular, no reference to other objects) > > I tried to make this patch as small as possible to ease its backport, > instead of being super clean. Note that we believe that only ipv4 dst > need to take care of the metric refcount. But if this is wrong, > this patch adds the basic infrastructure to extend this to other > families. > > Many thanks to Julian Anastasov for reviewing this patch, and Cong Wang > for his efforts on this problem. > > Fixes: 2860583fe840 ("ipv4: Kill rt->fi") > Signed-off-by: Eric Dumazet <edumazet@google.com> > Reported-by: Andrey Konovalov <andreyknvl@google.com> Nice work!! Thanks for your effort of making it small! Just one nit below. > -const u32 dst_default_metrics[RTAX_MAX + 1] = { > +const struct dst_metrics dst_default_metrics = { > /* This initializer is needed to force linker to place this variable > * into const section. Otherwise it might end into bss section. > * We really want to avoid false sharing on this variable, and catch > * any writes on it. > */ > - [RTAX_MAX] = 0xdeadbeef, > + .refcnt = ATOMIC_INIT(1), > }; The code comment above is no longer needed since we have to initialize refcnt to 1, instead of merely for const section. Anyway, Acked-by: Cong Wang <xiyou.wangcong@gmail.com>
On Fri, May 26, 2017 at 10:08 AM, Cong Wang <xiyou.wangcong@gmail.com> wrote: > On Thu, May 25, 2017 at 2:27 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote: > Just one nit below. > >> -const u32 dst_default_metrics[RTAX_MAX + 1] = { >> +const struct dst_metrics dst_default_metrics = { >> /* This initializer is needed to force linker to place this variable >> * into const section. Otherwise it might end into bss section. >> * We really want to avoid false sharing on this variable, and catch >> * any writes on it. >> */ >> - [RTAX_MAX] = 0xdeadbeef, >> + .refcnt = ATOMIC_INIT(1), >> }; > > The code comment above is no longer needed since > we have to initialize refcnt to 1, instead of merely for const > section. I believe the comment is still needed, because normally we make sure dst_default_metrics.refcnt is never touched (incremened nor decremented) So its value should not really matter ? I found that ATOMIC_INIT(1) was less ugly than the 0xdeadbeef
On Fri, May 26, 2017 at 10:13 AM, Eric Dumazet <edumazet@google.com> wrote: > On Fri, May 26, 2017 at 10:08 AM, Cong Wang <xiyou.wangcong@gmail.com> wrote: >> On Thu, May 25, 2017 at 2:27 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote: > >> Just one nit below. >> >>> -const u32 dst_default_metrics[RTAX_MAX + 1] = { >>> +const struct dst_metrics dst_default_metrics = { >>> /* This initializer is needed to force linker to place this variable >>> * into const section. Otherwise it might end into bss section. >>> * We really want to avoid false sharing on this variable, and catch >>> * any writes on it. >>> */ >>> - [RTAX_MAX] = 0xdeadbeef, >>> + .refcnt = ATOMIC_INIT(1), >>> }; >> >> The code comment above is no longer needed since >> we have to initialize refcnt to 1, instead of merely for const >> section. > > I believe the comment is still needed, because normally we make sure > dst_default_metrics.refcnt is never touched (incremened nor > decremented) > > So its value should not really matter ? Oh, you're right, we always have the check on != &dst_default_metrics. > > I found that ATOMIC_INIT(1) was less ugly than the 0xdeadbeef Of course.
From: Eric Dumazet <eric.dumazet@gmail.com> Date: Thu, 25 May 2017 14:27:35 -0700 > From: Eric Dumazet <edumazet@google.com> > > Andrey Konovalov reported crashes in ipv4_mtu() > > I could reproduce the issue with KASAN kernels, between > 10.246.7.151 and 10.246.7.152 : > > 1) 20 concurrent netperf -t TCP_RR -H 10.246.7.152 -l 1000 & > > 2) At the same time run following loop : > while : > do > ip ro add 10.246.7.152 dev eth0 src 10.246.7.151 mtu 1500 > ip ro del 10.246.7.152 dev eth0 src 10.246.7.151 mtu 1500 > done > > > Cong Wang attempted to add back rt->fi in commit > 82486aa6f1b9 ("ipv4: restore rt->fi for reference counting") > but this proved to add some issues that were complex to solve. > > Instead, I suggested to add a refcount to the metrics themselves, > being a standalone object (in particular, no reference to other objects) > > I tried to make this patch as small as possible to ease its backport, > instead of being super clean. Note that we believe that only ipv4 dst > need to take care of the metric refcount. But if this is wrong, > this patch adds the basic infrastructure to extend this to other > families. > > Many thanks to Julian Anastasov for reviewing this patch, and Cong Wang > for his efforts on this problem. > > Fixes: 2860583fe840 ("ipv4: Kill rt->fi") > Signed-off-by: Eric Dumazet <edumazet@google.com> > Reported-by: Andrey Konovalov <andreyknvl@google.com> Applied, thanks everyone for following through on this bug fix. And sorry for introducing the problem in the first place :)
diff --git a/include/net/dst.h b/include/net/dst.h index 049af33da3b6c95897d544670cea65c542317673..cfc0437841665d7ed46a714915c50d723c24901c 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -107,10 +107,16 @@ struct dst_entry { }; }; +struct dst_metrics { + u32 metrics[RTAX_MAX]; + atomic_t refcnt; +}; +extern const struct dst_metrics dst_default_metrics; + u32 *dst_cow_metrics_generic(struct dst_entry *dst, unsigned long old); -extern const u32 dst_default_metrics[]; #define DST_METRICS_READ_ONLY 0x1UL +#define DST_METRICS_REFCOUNTED 0x2UL #define DST_METRICS_FLAGS 0x3UL #define __DST_METRICS_PTR(Y) \ ((u32 *)((Y) & ~DST_METRICS_FLAGS)) diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index 6692c5758b332d468f1e0611ecc4f3e03ae03b2b..f7f6aa789c6174c41ca9739206d586c559c1f3a1 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -114,11 +114,11 @@ struct fib_info { __be32 fib_prefsrc; u32 fib_tb_id; u32 fib_priority; - u32 *fib_metrics; -#define fib_mtu fib_metrics[RTAX_MTU-1] -#define fib_window fib_metrics[RTAX_WINDOW-1] -#define fib_rtt fib_metrics[RTAX_RTT-1] -#define fib_advmss fib_metrics[RTAX_ADVMSS-1] + struct dst_metrics *fib_metrics; +#define fib_mtu fib_metrics->metrics[RTAX_MTU-1] +#define fib_window fib_metrics->metrics[RTAX_WINDOW-1] +#define fib_rtt fib_metrics->metrics[RTAX_RTT-1] +#define fib_advmss fib_metrics->metrics[RTAX_ADVMSS-1] int fib_nhs; #ifdef CONFIG_IP_ROUTE_MULTIPATH int fib_weight; diff --git a/net/core/dst.c b/net/core/dst.c index 960e503b5a529a2c4f1866f49c150493ee98d7da..6192f11beec9077de964e2aeff4f78547f08b8da 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -151,13 +151,13 @@ int dst_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb) } EXPORT_SYMBOL(dst_discard_out); -const u32 dst_default_metrics[RTAX_MAX + 1] = { +const struct dst_metrics dst_default_metrics = { /* This initializer is needed to force linker to place this variable * into const section. Otherwise it might end into bss section. * We really want to avoid false sharing on this variable, and catch * any writes on it. */ - [RTAX_MAX] = 0xdeadbeef, + .refcnt = ATOMIC_INIT(1), }; void dst_init(struct dst_entry *dst, struct dst_ops *ops, @@ -169,7 +169,7 @@ void dst_init(struct dst_entry *dst, struct dst_ops *ops, if (dev) dev_hold(dev); dst->ops = ops; - dst_init_metrics(dst, dst_default_metrics, true); + dst_init_metrics(dst, dst_default_metrics.metrics, true); dst->expires = 0UL; dst->path = dst; dst->from = NULL; @@ -314,25 +314,30 @@ EXPORT_SYMBOL(dst_release); u32 *dst_cow_metrics_generic(struct dst_entry *dst, unsigned long old) { - u32 *p = kmalloc(sizeof(u32) * RTAX_MAX, GFP_ATOMIC); + struct dst_metrics *p = kmalloc(sizeof(*p), GFP_ATOMIC); if (p) { - u32 *old_p = __DST_METRICS_PTR(old); + struct dst_metrics *old_p = (struct dst_metrics *)__DST_METRICS_PTR(old); unsigned long prev, new; - memcpy(p, old_p, sizeof(u32) * RTAX_MAX); + atomic_set(&p->refcnt, 1); + memcpy(p->metrics, old_p->metrics, sizeof(p->metrics)); new = (unsigned long) p; prev = cmpxchg(&dst->_metrics, old, new); if (prev != old) { kfree(p); - p = __DST_METRICS_PTR(prev); + p = (struct dst_metrics *)__DST_METRICS_PTR(prev); if (prev & DST_METRICS_READ_ONLY) p = NULL; + } else if (prev & DST_METRICS_REFCOUNTED) { + if (atomic_dec_and_test(&old_p->refcnt)) + kfree(old_p); } } - return p; + BUILD_BUG_ON(offsetof(struct dst_metrics, metrics) != 0); + return (u32 *)p; } EXPORT_SYMBOL(dst_cow_metrics_generic); @@ -341,7 +346,7 @@ void __dst_destroy_metrics_generic(struct dst_entry *dst, unsigned long old) { unsigned long prev, new; - new = ((unsigned long) dst_default_metrics) | DST_METRICS_READ_ONLY; + new = ((unsigned long) &dst_default_metrics) | DST_METRICS_READ_ONLY; prev = cmpxchg(&dst->_metrics, old, new); if (prev == old) kfree(__DST_METRICS_PTR(old)); diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index da449ddb8cc172bd9091c00057a69a095f98b56d..ad9ad4aab5da7c7d11c3b80edbdfcbdd3d7153fe 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -203,6 +203,7 @@ static void rt_fibinfo_free_cpus(struct rtable __rcu * __percpu *rtp) static void free_fib_info_rcu(struct rcu_head *head) { struct fib_info *fi = container_of(head, struct fib_info, rcu); + struct dst_metrics *m; change_nexthops(fi) { if (nexthop_nh->nh_dev) @@ -213,8 +214,9 @@ static void free_fib_info_rcu(struct rcu_head *head) rt_fibinfo_free(&nexthop_nh->nh_rth_input); } endfor_nexthops(fi); - if (fi->fib_metrics != (u32 *) dst_default_metrics) - kfree(fi->fib_metrics); + m = fi->fib_metrics; + if (m != &dst_default_metrics && atomic_dec_and_test(&m->refcnt)) + kfree(m); kfree(fi); } @@ -971,11 +973,11 @@ fib_convert_metrics(struct fib_info *fi, const struct fib_config *cfg) val = 255; if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK)) return -EINVAL; - fi->fib_metrics[type - 1] = val; + fi->fib_metrics->metrics[type - 1] = val; } if (ecn_ca) - fi->fib_metrics[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA; + fi->fib_metrics->metrics[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA; return 0; } @@ -1033,11 +1035,12 @@ struct fib_info *fib_create_info(struct fib_config *cfg) goto failure; fib_info_cnt++; if (cfg->fc_mx) { - fi->fib_metrics = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL); + fi->fib_metrics = kzalloc(sizeof(*fi->fib_metrics), GFP_KERNEL); if (!fi->fib_metrics) goto failure; + atomic_set(&fi->fib_metrics->refcnt, 1); } else - fi->fib_metrics = (u32 *) dst_default_metrics; + fi->fib_metrics = (struct dst_metrics *)&dst_default_metrics; fi->fib_net = net; fi->fib_protocol = cfg->fc_protocol; @@ -1238,7 +1241,7 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event, if (fi->fib_priority && nla_put_u32(skb, RTA_PRIORITY, fi->fib_priority)) goto nla_put_failure; - if (rtnetlink_put_metrics(skb, fi->fib_metrics) < 0) + if (rtnetlink_put_metrics(skb, fi->fib_metrics->metrics) < 0) goto nla_put_failure; if (fi->fib_prefsrc && diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 655d9eebe43e16a59102edcd3ea4bc177c6b341d..6883b3d4ba8f69de2cb924612d60f5671a219a84 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1385,8 +1385,12 @@ static void rt_add_uncached_list(struct rtable *rt) static void ipv4_dst_destroy(struct dst_entry *dst) { + struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst); struct rtable *rt = (struct rtable *) dst; + if (p != &dst_default_metrics && atomic_dec_and_test(&p->refcnt)) + kfree(p); + if (!list_empty(&rt->rt_uncached)) { struct uncached_list *ul = rt->rt_uncached_list; @@ -1438,7 +1442,11 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr, rt->rt_gateway = nh->nh_gw; rt->rt_uses_gateway = 1; } - dst_init_metrics(&rt->dst, fi->fib_metrics, true); + dst_init_metrics(&rt->dst, fi->fib_metrics->metrics, true); + if (fi->fib_metrics != &dst_default_metrics) { + rt->dst._metrics |= DST_METRICS_REFCOUNTED; + atomic_inc(&fi->fib_metrics->refcnt); + } #ifdef CONFIG_IP_ROUTE_CLASSID rt->dst.tclassid = nh->nh_tclassid; #endif