Message ID | 20200328191259.17145-1-xiyou.wangcong@gmail.com |
---|---|
State | Accepted |
Delegated to: | David Miller |
Headers | show |
Series | [net] net_sched: add a temporary refcnt for struct tcindex_data | expand |
On Sat, Mar 28, 2020 at 12:12:59PM -0700, Cong Wang wrote: > Although we intentionally use an ordered workqueue for all tc > filter works, the ordering is not guaranteed by RCU work, > given that tcf_queue_work() is esstenially a call_rcu(). > > This problem is demostrated by Thomas: > > CPU 0: > tcf_queue_work() > tcf_queue_work(&r->rwork, tcindex_destroy_rexts_work); > > -> Migration to CPU 1 > > CPU 1: > tcf_queue_work(&p->rwork, tcindex_destroy_work); > > so the 2nd work could be queued before the 1st one, which leads > to a free-after-free. > > Enforcing this order in RCU work is hard as it requires to change > RCU code too. Fortunately we can workaround this problem in tcindex > filter by taking a temporary refcnt, we only refcnt it right before > we begin to destroy it. This simplifies the code a lot as a full > refcnt requires much more changes in tcindex_set_parms(). > > Reported-by: syzbot+46f513c3033d592409d2@syzkaller.appspotmail.com > Fixes: 3d210534cc93 ("net_sched: fix a race condition in tcindex_destroy()") > Cc: Thomas Gleixner <tglx@linutronix.de> > Cc: Paul E. McKenney <paulmck@kernel.org> > Cc: Jamal Hadi Salim <jhs@mojatatu.com> > Cc: Jiri Pirko <jiri@resnulli.us> > Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com> Looks plausible, but what did you do to verify that the structures were in fact being freed? See below for more detail. 
Thanx, Paul > --- > net/sched/cls_tcindex.c | 44 +++++++++++++++++++++++++++++++++++------ > 1 file changed, 38 insertions(+), 6 deletions(-) > > diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c > index 9904299424a1..065345832a69 100644 > --- a/net/sched/cls_tcindex.c > +++ b/net/sched/cls_tcindex.c > @@ -11,6 +11,7 @@ > #include <linux/skbuff.h> > #include <linux/errno.h> > #include <linux/slab.h> > +#include <linux/refcount.h> > #include <net/act_api.h> > #include <net/netlink.h> > #include <net/pkt_cls.h> > @@ -26,9 +27,12 @@ > #define DEFAULT_HASH_SIZE 64 /* optimized for diffserv */ > > > +struct tcindex_data; > + > struct tcindex_filter_result { > struct tcf_exts exts; > struct tcf_result res; > + struct tcindex_data *p; > struct rcu_work rwork; > }; > > @@ -49,6 +53,7 @@ struct tcindex_data { > u32 hash; /* hash table size; 0 if undefined */ > u32 alloc_hash; /* allocated size */ > u32 fall_through; /* 0: only classify if explicit match */ > + refcount_t refcnt; /* a temporary refcnt for perfect hash */ > struct rcu_work rwork; > }; > > @@ -57,6 +62,20 @@ static inline int tcindex_filter_is_set(struct tcindex_filter_result *r) > return tcf_exts_has_actions(&r->exts) || r->res.classid; > } > > +static void tcindex_data_get(struct tcindex_data *p) > +{ > + refcount_inc(&p->refcnt); > +} > + > +static void tcindex_data_put(struct tcindex_data *p) > +{ > + if (refcount_dec_and_test(&p->refcnt)) { > + kfree(p->perfect); > + kfree(p->h); > + kfree(p); > + } > +} > + > static struct tcindex_filter_result *tcindex_lookup(struct tcindex_data *p, > u16 key) > { > @@ -141,6 +160,7 @@ static void __tcindex_destroy_rexts(struct tcindex_filter_result *r) > { > tcf_exts_destroy(&r->exts); > tcf_exts_put_net(&r->exts); > + tcindex_data_put(r->p); > } > > static void tcindex_destroy_rexts_work(struct work_struct *work) > @@ -212,6 +232,8 @@ static int tcindex_delete(struct tcf_proto *tp, void *arg, bool *last, > else > __tcindex_destroy_fexts(f); > } else { > + 
tcindex_data_get(p); Good! You need this one to prevent the counter prematurely reaching zero. > + > if (tcf_exts_get_net(&r->exts)) > tcf_queue_work(&r->rwork, tcindex_destroy_rexts_work); > else > @@ -228,9 +250,7 @@ static void tcindex_destroy_work(struct work_struct *work) > struct tcindex_data, > rwork); > > - kfree(p->perfect); > - kfree(p->h); > - kfree(p); > + tcindex_data_put(p); But what did you do to verify that the counter actually reaches zero? > } > > static inline int > @@ -248,9 +268,11 @@ static const struct nla_policy tcindex_policy[TCA_TCINDEX_MAX + 1] = { > }; > > static int tcindex_filter_result_init(struct tcindex_filter_result *r, > + struct tcindex_data *p, > struct net *net) > { > memset(r, 0, sizeof(*r)); > + r->p = p; > return tcf_exts_init(&r->exts, net, TCA_TCINDEX_ACT, > TCA_TCINDEX_POLICE); > } > @@ -290,6 +312,7 @@ static int tcindex_alloc_perfect_hash(struct net *net, struct tcindex_data *cp) > TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE); > if (err < 0) > goto errout; > + cp->perfect[i].p = cp; > } > > return 0; > @@ -334,6 +357,7 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, > cp->alloc_hash = p->alloc_hash; > cp->fall_through = p->fall_through; > cp->tp = tp; > + refcount_set(&cp->refcnt, 1); /* Paired with tcindex_destroy_work() */ The bit about checking that the counter reaches zero is underscored by the apparent initialization to the value 1. 
> if (tb[TCA_TCINDEX_HASH]) > cp->hash = nla_get_u32(tb[TCA_TCINDEX_HASH]); > @@ -366,7 +390,7 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, > } > cp->h = p->h; > > - err = tcindex_filter_result_init(&new_filter_result, net); > + err = tcindex_filter_result_init(&new_filter_result, cp, net); > if (err < 0) > goto errout_alloc; > if (old_r) > @@ -434,7 +458,7 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, > goto errout_alloc; > f->key = handle; > f->next = NULL; > - err = tcindex_filter_result_init(&f->result, net); > + err = tcindex_filter_result_init(&f->result, cp, net); > if (err < 0) { > kfree(f); > goto errout_alloc; > @@ -447,7 +471,7 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, > } > > if (old_r && old_r != r) { > - err = tcindex_filter_result_init(old_r, net); > + err = tcindex_filter_result_init(old_r, cp, net); > if (err < 0) { > kfree(f); > goto errout_alloc; > @@ -571,6 +595,14 @@ static void tcindex_destroy(struct tcf_proto *tp, bool rtnl_held, > for (i = 0; i < p->hash; i++) { > struct tcindex_filter_result *r = p->perfect + i; > > + /* tcf_queue_work() does not guarantee the ordering we > + * want, so we have to take this refcnt temporarily to > + * ensure 'p' is freed after all tcindex_filter_result > + * here. Imperfect hash does not need this, because it > + * uses linked lists rather than an array. > + */ > + tcindex_data_get(p); > + > tcf_unbind_filter(tp, &r->res); > if (tcf_exts_get_net(&r->exts)) > tcf_queue_work(&r->rwork, > -- > 2.21.1 >
On Mon, Mar 30, 2020 at 2:35 PM Paul E. McKenney <paulmck@kernel.org> wrote: > > On Sat, Mar 28, 2020 at 12:12:59PM -0700, Cong Wang wrote: > > Although we intentionally use an ordered workqueue for all tc > > filter works, the ordering is not guaranteed by RCU work, > > given that tcf_queue_work() is esstenially a call_rcu(). > > > > This problem is demostrated by Thomas: > > > > CPU 0: > > tcf_queue_work() > > tcf_queue_work(&r->rwork, tcindex_destroy_rexts_work); > > > > -> Migration to CPU 1 > > > > CPU 1: > > tcf_queue_work(&p->rwork, tcindex_destroy_work); > > > > so the 2nd work could be queued before the 1st one, which leads > > to a free-after-free. > > > > Enforcing this order in RCU work is hard as it requires to change > > RCU code too. Fortunately we can workaround this problem in tcindex > > filter by taking a temporary refcnt, we only refcnt it right before > > we begin to destroy it. This simplifies the code a lot as a full > > refcnt requires much more changes in tcindex_set_parms(). > > > > Reported-by: syzbot+46f513c3033d592409d2@syzkaller.appspotmail.com > > Fixes: 3d210534cc93 ("net_sched: fix a race condition in tcindex_destroy()") > > Cc: Thomas Gleixner <tglx@linutronix.de> > > Cc: Paul E. McKenney <paulmck@kernel.org> > > Cc: Jamal Hadi Salim <jhs@mojatatu.com> > > Cc: Jiri Pirko <jiri@resnulli.us> > > Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com> > > Looks plausible, but what did you do to verify that the structures > were in fact being freed? See below for more detail. I ran the syzbot reproducer for about 20 minutes, there was no memory leak reported after scanning. Thanks.
On Mon, Mar 30, 2020 at 04:24:42PM -0700, Cong Wang wrote: > On Mon, Mar 30, 2020 at 2:35 PM Paul E. McKenney <paulmck@kernel.org> wrote: > > > > On Sat, Mar 28, 2020 at 12:12:59PM -0700, Cong Wang wrote: > > > Although we intentionally use an ordered workqueue for all tc > > > filter works, the ordering is not guaranteed by RCU work, > > > given that tcf_queue_work() is esstenially a call_rcu(). > > > > > > This problem is demostrated by Thomas: > > > > > > CPU 0: > > > tcf_queue_work() > > > tcf_queue_work(&r->rwork, tcindex_destroy_rexts_work); > > > > > > -> Migration to CPU 1 > > > > > > CPU 1: > > > tcf_queue_work(&p->rwork, tcindex_destroy_work); > > > > > > so the 2nd work could be queued before the 1st one, which leads > > > to a free-after-free. > > > > > > Enforcing this order in RCU work is hard as it requires to change > > > RCU code too. Fortunately we can workaround this problem in tcindex > > > filter by taking a temporary refcnt, we only refcnt it right before > > > we begin to destroy it. This simplifies the code a lot as a full > > > refcnt requires much more changes in tcindex_set_parms(). > > > > > > Reported-by: syzbot+46f513c3033d592409d2@syzkaller.appspotmail.com > > > Fixes: 3d210534cc93 ("net_sched: fix a race condition in tcindex_destroy()") > > > Cc: Thomas Gleixner <tglx@linutronix.de> > > > Cc: Paul E. McKenney <paulmck@kernel.org> > > > Cc: Jamal Hadi Salim <jhs@mojatatu.com> > > > Cc: Jiri Pirko <jiri@resnulli.us> > > > Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com> > > > > Looks plausible, but what did you do to verify that the structures > > were in fact being freed? See below for more detail. > > I ran the syzbot reproducer for about 20 minutes, there was no > memory leak reported after scanning. And if you (say) set the initial reference count to two instead of one, there is a memory leak reported, correct? Thanx, Paul
On Mon, Mar 30, 2020 at 7:30 PM Paul E. McKenney <paulmck@kernel.org> wrote: > > On Mon, Mar 30, 2020 at 04:24:42PM -0700, Cong Wang wrote: > > On Mon, Mar 30, 2020 at 2:35 PM Paul E. McKenney <paulmck@kernel.org> wrote: > > > > > > On Sat, Mar 28, 2020 at 12:12:59PM -0700, Cong Wang wrote: > > > > Although we intentionally use an ordered workqueue for all tc > > > > filter works, the ordering is not guaranteed by RCU work, > > > > given that tcf_queue_work() is esstenially a call_rcu(). > > > > > > > > This problem is demostrated by Thomas: > > > > > > > > CPU 0: > > > > tcf_queue_work() > > > > tcf_queue_work(&r->rwork, tcindex_destroy_rexts_work); > > > > > > > > -> Migration to CPU 1 > > > > > > > > CPU 1: > > > > tcf_queue_work(&p->rwork, tcindex_destroy_work); > > > > > > > > so the 2nd work could be queued before the 1st one, which leads > > > > to a free-after-free. > > > > > > > > Enforcing this order in RCU work is hard as it requires to change > > > > RCU code too. Fortunately we can workaround this problem in tcindex > > > > filter by taking a temporary refcnt, we only refcnt it right before > > > > we begin to destroy it. This simplifies the code a lot as a full > > > > refcnt requires much more changes in tcindex_set_parms(). > > > > > > > > Reported-by: syzbot+46f513c3033d592409d2@syzkaller.appspotmail.com > > > > Fixes: 3d210534cc93 ("net_sched: fix a race condition in tcindex_destroy()") > > > > Cc: Thomas Gleixner <tglx@linutronix.de> > > > > Cc: Paul E. McKenney <paulmck@kernel.org> > > > > Cc: Jamal Hadi Salim <jhs@mojatatu.com> > > > > Cc: Jiri Pirko <jiri@resnulli.us> > > > > Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com> > > > > > > Looks plausible, but what did you do to verify that the structures > > > were in fact being freed? See below for more detail. > > > > I ran the syzbot reproducer for about 20 minutes, there was no > > memory leak reported after scanning. 
> > And if you (say) set the initial reference count to two instead of one, > there is a memory leak reported, correct? No, I didn't do an A/B test. I just added a printk right before the kfree(), if it helps to convince you, here is one portion of the kernel log: [ 39.159298] a.out (703) used greatest stack depth: 11624 bytes left [ 39.166365] a.out (701) used greatest stack depth: 11352 bytes left [ 39.453257] freeing struct tcindex_data. [ 39.573554] freeing struct tcindex_data. [ 39.681540] freeing struct tcindex_data. [ 39.781158] freeing struct tcindex_data. [ 39.877726] freeing struct tcindex_data. [ 39.985515] freeing struct tcindex_data. [ 40.097687] freeing struct tcindex_data. [ 40.213691] freeing struct tcindex_data. [ 40.271465] device bridge_slave_1 left promiscuous mode [ 40.274078] bridge0: port 2(bridge_slave_1) entered disabled state [ 40.297258] device bridge_slave_0 left promiscuous mode [ 40.299377] bridge0: port 1(bridge_slave_0) entered disabled state [ 40.733355] device hsr_slave_0 left promiscuous mode [ 40.749322] device hsr_slave_1 left promiscuous mode [ 40.784220] team0 (unregistering): Port device team_slave_1 removed [ 40.792641] team0 (unregistering): Port device team_slave_0 removed [ 40.806302] bond0 (unregistering): (slave bond_slave_1): Releasing backup interface [ 40.836972] bond0 (unregistering): (slave bond_slave_0): Releasing backup interface [ 40.931688] bond0 (unregistering): Released all slaves [ 44.149970] freeing struct tcindex_data. [ 44.159568] freeing struct tcindex_data. [ 44.172786] freeing struct tcindex_data. [ 44.813214] freeing struct tcindex_data. [ 44.821857] freeing struct tcindex_data. [ 44.825064] freeing struct tcindex_data. [ 44.826889] freeing struct tcindex_data. [ 45.294254] freeing struct tcindex_data. [ 45.297980] freeing struct tcindex_data. [ 45.300623] freeing struct tcindex_data. 
And no memory leak of course: [root@localhost tmp]# echo scan > /sys/kernel/debug/kmemleak [root@localhost tmp]# echo scan > /sys/kernel/debug/kmemleak [root@localhost tmp]# cat /sys/kernel/debug/kmemleak Thanks.
On Mon, Mar 30, 2020 at 07:54:09PM -0700, Cong Wang wrote: > On Mon, Mar 30, 2020 at 7:30 PM Paul E. McKenney <paulmck@kernel.org> wrote: > > > > On Mon, Mar 30, 2020 at 04:24:42PM -0700, Cong Wang wrote: > > > On Mon, Mar 30, 2020 at 2:35 PM Paul E. McKenney <paulmck@kernel.org> wrote: > > > > > > > > On Sat, Mar 28, 2020 at 12:12:59PM -0700, Cong Wang wrote: > > > > > Although we intentionally use an ordered workqueue for all tc > > > > > filter works, the ordering is not guaranteed by RCU work, > > > > > given that tcf_queue_work() is esstenially a call_rcu(). > > > > > > > > > > This problem is demostrated by Thomas: > > > > > > > > > > CPU 0: > > > > > tcf_queue_work() > > > > > tcf_queue_work(&r->rwork, tcindex_destroy_rexts_work); > > > > > > > > > > -> Migration to CPU 1 > > > > > > > > > > CPU 1: > > > > > tcf_queue_work(&p->rwork, tcindex_destroy_work); > > > > > > > > > > so the 2nd work could be queued before the 1st one, which leads > > > > > to a free-after-free. > > > > > > > > > > Enforcing this order in RCU work is hard as it requires to change > > > > > RCU code too. Fortunately we can workaround this problem in tcindex > > > > > filter by taking a temporary refcnt, we only refcnt it right before > > > > > we begin to destroy it. This simplifies the code a lot as a full > > > > > refcnt requires much more changes in tcindex_set_parms(). > > > > > > > > > > Reported-by: syzbot+46f513c3033d592409d2@syzkaller.appspotmail.com > > > > > Fixes: 3d210534cc93 ("net_sched: fix a race condition in tcindex_destroy()") > > > > > Cc: Thomas Gleixner <tglx@linutronix.de> > > > > > Cc: Paul E. McKenney <paulmck@kernel.org> > > > > > Cc: Jamal Hadi Salim <jhs@mojatatu.com> > > > > > Cc: Jiri Pirko <jiri@resnulli.us> > > > > > Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com> > > > > > > > > Looks plausible, but what did you do to verify that the structures > > > > were in fact being freed? See below for more detail. 
> > > > > > I ran the syzbot reproducer for about 20 minutes, there was no > > > memory leak reported after scanning. > > > > And if you (say) set the initial reference count to two instead of one, > > there is a memory leak reported, correct? > > No, I didn't do an A/B test. I just added a printk right before the kfree(), > if it helps to convince you, here is one portion of the kernel log: > > [ 39.159298] a.out (703) used greatest stack depth: 11624 bytes left > [ 39.166365] a.out (701) used greatest stack depth: 11352 bytes left > [ 39.453257] freeing struct tcindex_data. > [ 39.573554] freeing struct tcindex_data. > [ 39.681540] freeing struct tcindex_data. > [ 39.781158] freeing struct tcindex_data. > [ 39.877726] freeing struct tcindex_data. > [ 39.985515] freeing struct tcindex_data. > [ 40.097687] freeing struct tcindex_data. > [ 40.213691] freeing struct tcindex_data. > [ 40.271465] device bridge_slave_1 left promiscuous mode > [ 40.274078] bridge0: port 2(bridge_slave_1) entered disabled state > [ 40.297258] device bridge_slave_0 left promiscuous mode > [ 40.299377] bridge0: port 1(bridge_slave_0) entered disabled state > [ 40.733355] device hsr_slave_0 left promiscuous mode > [ 40.749322] device hsr_slave_1 left promiscuous mode > [ 40.784220] team0 (unregistering): Port device team_slave_1 removed > [ 40.792641] team0 (unregistering): Port device team_slave_0 removed > [ 40.806302] bond0 (unregistering): (slave bond_slave_1): Releasing > backup interface > [ 40.836972] bond0 (unregistering): (slave bond_slave_0): Releasing > backup interface > [ 40.931688] bond0 (unregistering): Released all slaves > [ 44.149970] freeing struct tcindex_data. > [ 44.159568] freeing struct tcindex_data. > [ 44.172786] freeing struct tcindex_data. > [ 44.813214] freeing struct tcindex_data. > [ 44.821857] freeing struct tcindex_data. > [ 44.825064] freeing struct tcindex_data. > [ 44.826889] freeing struct tcindex_data. > [ 45.294254] freeing struct tcindex_data. 
> [ 45.297980] freeing struct tcindex_data. > [ 45.300623] freeing struct tcindex_data. > > And no memory leak of course: > > [root@localhost tmp]# echo scan > /sys/kernel/debug/kmemleak > [root@localhost tmp]# echo scan > /sys/kernel/debug/kmemleak > [root@localhost tmp]# cat /sys/kernel/debug/kmemleak Much more convincing, thank you! Feel free to add: Reviewed-by: Paul E. McKenney <paulmck@kernel.org> Thanx, Paul
From: Cong Wang <xiyou.wangcong@gmail.com> Date: Sat, 28 Mar 2020 12:12:59 -0700 > Although we intentionally use an ordered workqueue for all tc > filter works, the ordering is not guaranteed by RCU work, > given that tcf_queue_work() is esstenially a call_rcu(). > > This problem is demostrated by Thomas: > > CPU 0: > tcf_queue_work() > tcf_queue_work(&r->rwork, tcindex_destroy_rexts_work); > > -> Migration to CPU 1 > > CPU 1: > tcf_queue_work(&p->rwork, tcindex_destroy_work); > > so the 2nd work could be queued before the 1st one, which leads > to a free-after-free. > > Enforcing this order in RCU work is hard as it requires to change > RCU code too. Fortunately we can workaround this problem in tcindex > filter by taking a temporary refcnt, we only refcnt it right before > we begin to destroy it. This simplifies the code a lot as a full > refcnt requires much more changes in tcindex_set_parms(). > > Reported-by: syzbot+46f513c3033d592409d2@syzkaller.appspotmail.com > Fixes: 3d210534cc93 ("net_sched: fix a race condition in tcindex_destroy()") > Cc: Thomas Gleixner <tglx@linutronix.de> > Cc: Paul E. McKenney <paulmck@kernel.org> > Cc: Jamal Hadi Salim <jhs@mojatatu.com> > Cc: Jiri Pirko <jiri@resnulli.us> > Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com> Applied and queued up for -stable, thanks.
diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c index 9904299424a1..065345832a69 100644 --- a/net/sched/cls_tcindex.c +++ b/net/sched/cls_tcindex.c @@ -11,6 +11,7 @@ #include <linux/skbuff.h> #include <linux/errno.h> #include <linux/slab.h> +#include <linux/refcount.h> #include <net/act_api.h> #include <net/netlink.h> #include <net/pkt_cls.h> @@ -26,9 +27,12 @@ #define DEFAULT_HASH_SIZE 64 /* optimized for diffserv */ +struct tcindex_data; + struct tcindex_filter_result { struct tcf_exts exts; struct tcf_result res; + struct tcindex_data *p; struct rcu_work rwork; }; @@ -49,6 +53,7 @@ struct tcindex_data { u32 hash; /* hash table size; 0 if undefined */ u32 alloc_hash; /* allocated size */ u32 fall_through; /* 0: only classify if explicit match */ + refcount_t refcnt; /* a temporary refcnt for perfect hash */ struct rcu_work rwork; }; @@ -57,6 +62,20 @@ static inline int tcindex_filter_is_set(struct tcindex_filter_result *r) return tcf_exts_has_actions(&r->exts) || r->res.classid; } +static void tcindex_data_get(struct tcindex_data *p) +{ + refcount_inc(&p->refcnt); +} + +static void tcindex_data_put(struct tcindex_data *p) +{ + if (refcount_dec_and_test(&p->refcnt)) { + kfree(p->perfect); + kfree(p->h); + kfree(p); + } +} + static struct tcindex_filter_result *tcindex_lookup(struct tcindex_data *p, u16 key) { @@ -141,6 +160,7 @@ static void __tcindex_destroy_rexts(struct tcindex_filter_result *r) { tcf_exts_destroy(&r->exts); tcf_exts_put_net(&r->exts); + tcindex_data_put(r->p); } static void tcindex_destroy_rexts_work(struct work_struct *work) @@ -212,6 +232,8 @@ static int tcindex_delete(struct tcf_proto *tp, void *arg, bool *last, else __tcindex_destroy_fexts(f); } else { + tcindex_data_get(p); + if (tcf_exts_get_net(&r->exts)) tcf_queue_work(&r->rwork, tcindex_destroy_rexts_work); else @@ -228,9 +250,7 @@ static void tcindex_destroy_work(struct work_struct *work) struct tcindex_data, rwork); - kfree(p->perfect); - kfree(p->h); - kfree(p); + 
tcindex_data_put(p); } static inline int @@ -248,9 +268,11 @@ static const struct nla_policy tcindex_policy[TCA_TCINDEX_MAX + 1] = { }; static int tcindex_filter_result_init(struct tcindex_filter_result *r, + struct tcindex_data *p, struct net *net) { memset(r, 0, sizeof(*r)); + r->p = p; return tcf_exts_init(&r->exts, net, TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE); } @@ -290,6 +312,7 @@ static int tcindex_alloc_perfect_hash(struct net *net, struct tcindex_data *cp) TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE); if (err < 0) goto errout; + cp->perfect[i].p = cp; } return 0; @@ -334,6 +357,7 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, cp->alloc_hash = p->alloc_hash; cp->fall_through = p->fall_through; cp->tp = tp; + refcount_set(&cp->refcnt, 1); /* Paired with tcindex_destroy_work() */ if (tb[TCA_TCINDEX_HASH]) cp->hash = nla_get_u32(tb[TCA_TCINDEX_HASH]); @@ -366,7 +390,7 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, } cp->h = p->h; - err = tcindex_filter_result_init(&new_filter_result, net); + err = tcindex_filter_result_init(&new_filter_result, cp, net); if (err < 0) goto errout_alloc; if (old_r) @@ -434,7 +458,7 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, goto errout_alloc; f->key = handle; f->next = NULL; - err = tcindex_filter_result_init(&f->result, net); + err = tcindex_filter_result_init(&f->result, cp, net); if (err < 0) { kfree(f); goto errout_alloc; @@ -447,7 +471,7 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, } if (old_r && old_r != r) { - err = tcindex_filter_result_init(old_r, net); + err = tcindex_filter_result_init(old_r, cp, net); if (err < 0) { kfree(f); goto errout_alloc; @@ -571,6 +595,14 @@ static void tcindex_destroy(struct tcf_proto *tp, bool rtnl_held, for (i = 0; i < p->hash; i++) { struct tcindex_filter_result *r = p->perfect + i; + /* tcf_queue_work() does not guarantee the ordering we + * want, so we have to take 
this refcnt temporarily to + * ensure 'p' is freed after all tcindex_filter_result + * here. Imperfect hash does not need this, because it + * uses linked lists rather than an array. + */ + tcindex_data_get(p); + tcf_unbind_filter(tp, &r->res); if (tcf_exts_get_net(&r->exts)) tcf_queue_work(&r->rwork,
Although we intentionally use an ordered workqueue for all tc filter works, the ordering is not guaranteed by RCU work, given that tcf_queue_work() is essentially a call_rcu(). This problem is demonstrated by Thomas: CPU 0: tcf_queue_work() tcf_queue_work(&r->rwork, tcindex_destroy_rexts_work); -> Migration to CPU 1 CPU 1: tcf_queue_work(&p->rwork, tcindex_destroy_work); so the 2nd work could be queued before the 1st one, which leads to a free-after-free. Enforcing this order in RCU work is hard as it requires to change RCU code too. Fortunately we can workaround this problem in tcindex filter by taking a temporary refcnt, we only refcnt it right before we begin to destroy it. This simplifies the code a lot as a full refcnt requires much more changes in tcindex_set_parms(). Reported-by: syzbot+46f513c3033d592409d2@syzkaller.appspotmail.com Fixes: 3d210534cc93 ("net_sched: fix a race condition in tcindex_destroy()") Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Paul E. McKenney <paulmck@kernel.org> Cc: Jamal Hadi Salim <jhs@mojatatu.com> Cc: Jiri Pirko <jiri@resnulli.us> Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com> --- net/sched/cls_tcindex.c | 44 +++++++++++++++++++++++++++++++++++------ 1 file changed, 38 insertions(+), 6 deletions(-)