diff mbox series

[nf-next,2/8] netfilter: nf_tables: nf_tables_gettable: use call_rcu

Message ID 20180527093153.13114-3-fw@strlen.de
State Accepted
Delegated to: Pablo Neira
Headers show
Series netfilter: nf_tables: make get and dump operations lockless | expand

Commit Message

Florian Westphal May 27, 2018, 9:31 a.m. UTC
We can make all dumps and lookups lockless.

Dumps currently only hold the nfnl mutex on the dump request itself.
Dumps can span multiple syscalls, dump continuation doesn't acquire the
nfnl mutex anywhere, i.e. the dump callbacks in nf_tables already use
rcu and never rely on nfnl mutex being held.

So, just switch all dumpers to rcu.
This requires taking a module reference before dropping the rcu lock
so rmmod is blocked, we also need to hold module reference over
the entire dump operation sequence. netlink already supports this
via the .module member in the netlink_dump_control struct.

For the non-dump case (i.e. lookup of a specific table, chain, etc.),
we need to switch to the _rcu list iteration primitive and make sure we
use GFP_ATOMIC.

This initial patch also adds the new
nft_netlink_dump_start_rcu() helper that takes care of the
get_ref, drop-rcu-lock, start-dump, get-rcu-lock, put-ref
sequence.

The helper will be reused for all dumps in the followup patches.

Signed-off-by: Florian Westphal <fw@strlen.de>
---
 net/netfilter/nf_tables_api.c | 28 ++++++++++++++++++++++++----
 1 file changed, 24 insertions(+), 4 deletions(-)

Comments

kernel test robot May 29, 2018, 4:11 a.m. UTC | #1
Hi Florian,

I love your patch! Perhaps something to improve:

[auto build test WARNING on nf-next/master]

url:    https://github.com/0day-ci/linux/commits/Florian-Westphal/netfilter-nf_tables-make-get-and-dump-operations-lockless/20180529-071211
base:   https://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf-next.git master
reproduce:
        # apt-get install sparse
        make ARCH=x86_64 allmodconfig
        make C=1 CF=-D__CHECK_ENDIAN__


sparse warnings: (new ones prefixed by >>)

   net/netfilter/nf_tables_api.c:1193:31: sparse: incorrect type in return expression (different address spaces) @@    expected struct nft_stats [noderef] <asn:3>* @@    got sn:3>* @@
   net/netfilter/nf_tables_api.c:1193:31:    expected struct nft_stats [noderef] <asn:3>*
   net/netfilter/nf_tables_api.c:1193:31:    got void *
   net/netfilter/nf_tables_api.c:1196:31: sparse: incorrect type in return expression (different address spaces) @@    expected struct nft_stats [noderef] <asn:3>* @@    got sn:3>* @@
   net/netfilter/nf_tables_api.c:1196:31:    expected struct nft_stats [noderef] <asn:3>*
   net/netfilter/nf_tables_api.c:1196:31:    got void *
   net/netfilter/nf_tables_api.c:1200:31: sparse: incorrect type in return expression (different address spaces) @@    expected struct nft_stats [noderef] <asn:3>* @@    got sn:3>* @@
   net/netfilter/nf_tables_api.c:1200:31:    expected struct nft_stats [noderef] <asn:3>*
   net/netfilter/nf_tables_api.c:1200:31:    got void *
   net/netfilter/nf_tables_api.c:1223:28: sparse: cast between address spaces (<asn:3>-><asn:4>)
   net/netfilter/nf_tables_api.c:1223:28: sparse: incompatible types in comparison expression (different address spaces)
   net/netfilter/nf_tables_api.c:1472:23: sparse: incorrect type in assignment (different address spaces) @@    expected struct nft_stats *stats @@    got struct nft_stats struct nft_stats *stats @@
   net/netfilter/nf_tables_api.c:1480:29: sparse: incorrect type in argument 1 (different address spaces) @@    expected void [noderef] <asn:3>*__pdata @@    got [noderef] <asn:3>*__pdata @@
   net/netfilter/nf_tables_api.c:1484:38: sparse: incorrect type in assignment (different address spaces) @@    expected struct nft_stats [noderef] <asn:3>*stats @@    got [noderef] <asn:3>*stats @@
   net/netfilter/nf_tables_api.c:1498:37: sparse: incorrect type in argument 1 (different address spaces) @@    expected void [noderef] <asn:3>*__pdata @@    got [noderef] <asn:3>*__pdata @@
   net/netfilter/nf_tables_api.c:2772:16: sparse: incorrect type in return expression (different base types) @@    expected unsigned long long @@    got restricted unsigned long long @@
   net/netfilter/nf_tables_api.c:2822:47: sparse: incorrect type in argument 3 (different base types) @@    expected restricted __be64 [usertype] value @@    got __be64 [usertype] value @@
   net/netfilter/nf_tables_api.c:3481:47: sparse: incorrect type in argument 3 (different base types) @@    expected restricted __be64 [usertype] value @@    got __be64 [usertype] value @@
   net/netfilter/nf_tables_api.c:3495:55: sparse: incorrect type in argument 3 (different base types) @@    expected restricted __be64 [usertype] value @@    got __be64 [usertype] value @@
>> include/linux/rcupdate.h:686:9: sparse: context imbalance in 'nft_netlink_dump_start_rcu' - unexpected unlock

vim +/nft_netlink_dump_start_rcu +686 include/linux/rcupdate.h

^1da177e4 Linus Torvalds   2005-04-16  636  
^1da177e4 Linus Torvalds   2005-04-16  637  /*
^1da177e4 Linus Torvalds   2005-04-16  638   * So where is rcu_write_lock()?  It does not exist, as there is no
^1da177e4 Linus Torvalds   2005-04-16  639   * way for writers to lock out RCU readers.  This is a feature, not
^1da177e4 Linus Torvalds   2005-04-16  640   * a bug -- this property is what provides RCU's performance benefits.
^1da177e4 Linus Torvalds   2005-04-16  641   * Of course, writers must coordinate with each other.  The normal
^1da177e4 Linus Torvalds   2005-04-16  642   * spinlock primitives work well for this, but any other technique may be
^1da177e4 Linus Torvalds   2005-04-16  643   * used as well.  RCU does not care how the writers keep out of each
^1da177e4 Linus Torvalds   2005-04-16  644   * others' way, as long as they do so.
^1da177e4 Linus Torvalds   2005-04-16  645   */
3d76c0829 Paul E. McKenney 2009-09-28  646  
3d76c0829 Paul E. McKenney 2009-09-28  647  /**
ca5ecddfa Paul E. McKenney 2010-04-28  648   * rcu_read_unlock() - marks the end of an RCU read-side critical section.
3d76c0829 Paul E. McKenney 2009-09-28  649   *
f27bc4873 Paul E. McKenney 2014-05-04  650   * In most situations, rcu_read_unlock() is immune from deadlock.
f27bc4873 Paul E. McKenney 2014-05-04  651   * However, in kernels built with CONFIG_RCU_BOOST, rcu_read_unlock()
f27bc4873 Paul E. McKenney 2014-05-04  652   * is responsible for deboosting, which it does via rt_mutex_unlock().
f27bc4873 Paul E. McKenney 2014-05-04  653   * Unfortunately, this function acquires the scheduler's runqueue and
f27bc4873 Paul E. McKenney 2014-05-04  654   * priority-inheritance spinlocks.  This means that deadlock could result
f27bc4873 Paul E. McKenney 2014-05-04  655   * if the caller of rcu_read_unlock() already holds one of these locks or
ce36f2f3e Oleg Nesterov    2014-09-28  656   * any lock that is ever acquired while holding them; or any lock which
ce36f2f3e Oleg Nesterov    2014-09-28  657   * can be taken from interrupt context because rcu_boost()->rt_mutex_lock()
ce36f2f3e Oleg Nesterov    2014-09-28  658   * does not disable irqs while taking ->wait_lock.
f27bc4873 Paul E. McKenney 2014-05-04  659   *
f27bc4873 Paul E. McKenney 2014-05-04  660   * That said, RCU readers are never priority boosted unless they were
f27bc4873 Paul E. McKenney 2014-05-04  661   * preempted.  Therefore, one way to avoid deadlock is to make sure
f27bc4873 Paul E. McKenney 2014-05-04  662   * that preemption never happens within any RCU read-side critical
f27bc4873 Paul E. McKenney 2014-05-04  663   * section whose outermost rcu_read_unlock() is called with one of
f27bc4873 Paul E. McKenney 2014-05-04  664   * rt_mutex_unlock()'s locks held.  Such preemption can be avoided in
f27bc4873 Paul E. McKenney 2014-05-04  665   * a number of ways, for example, by invoking preempt_disable() before
f27bc4873 Paul E. McKenney 2014-05-04  666   * critical section's outermost rcu_read_lock().
f27bc4873 Paul E. McKenney 2014-05-04  667   *
f27bc4873 Paul E. McKenney 2014-05-04  668   * Given that the set of locks acquired by rt_mutex_unlock() might change
f27bc4873 Paul E. McKenney 2014-05-04  669   * at any time, a somewhat more future-proofed approach is to make sure
f27bc4873 Paul E. McKenney 2014-05-04  670   * that that preemption never happens within any RCU read-side critical
f27bc4873 Paul E. McKenney 2014-05-04  671   * section whose outermost rcu_read_unlock() is called with irqs disabled.
f27bc4873 Paul E. McKenney 2014-05-04  672   * This approach relies on the fact that rt_mutex_unlock() currently only
f27bc4873 Paul E. McKenney 2014-05-04  673   * acquires irq-disabled locks.
f27bc4873 Paul E. McKenney 2014-05-04  674   *
f27bc4873 Paul E. McKenney 2014-05-04  675   * The second of these two approaches is best in most situations,
f27bc4873 Paul E. McKenney 2014-05-04  676   * however, the first approach can also be useful, at least to those
f27bc4873 Paul E. McKenney 2014-05-04  677   * developers willing to keep abreast of the set of locks acquired by
f27bc4873 Paul E. McKenney 2014-05-04  678   * rt_mutex_unlock().
f27bc4873 Paul E. McKenney 2014-05-04  679   *
3d76c0829 Paul E. McKenney 2009-09-28  680   * See rcu_read_lock() for more information.
3d76c0829 Paul E. McKenney 2009-09-28  681   */
bc33f24bd Paul E. McKenney 2009-08-22  682  static inline void rcu_read_unlock(void)
bc33f24bd Paul E. McKenney 2009-08-22  683  {
f78f5b90c Paul E. McKenney 2015-06-18  684  	RCU_LOCKDEP_WARN(!rcu_is_watching(),
bde23c689 Heiko Carstens   2012-02-01  685  			 "rcu_read_unlock() used illegally while idle");
bc33f24bd Paul E. McKenney 2009-08-22 @686  	__release(RCU);
bc33f24bd Paul E. McKenney 2009-08-22  687  	__rcu_read_unlock();
d24209bb6 Paul E. McKenney 2015-01-21  688  	rcu_lock_release(&rcu_lock_map); /* Keep acq info for rls diags. */
bc33f24bd Paul E. McKenney 2009-08-22  689  }
^1da177e4 Linus Torvalds   2005-04-16  690  

:::::: The code at line 686 was first introduced by commit
:::::: bc33f24bdca8b6e97376e3a182ab69e6cdefa989 rcu: Consolidate sparse and lockdep declarations in include/linux/rcupdate.h

:::::: TO: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
:::::: CC: Ingo Molnar <mingo@elte.hu>

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Florian Westphal May 29, 2018, 9:33 a.m. UTC | #2
kbuild test robot <lkp@intel.com> wrote:
> >> include/linux/rcupdate.h:686:9: sparse: context imbalance in 'nft_netlink_dump_start_rcu' - unexpected unlock

Yep, I forgot to mention this in the change log.

I don't know how to fix this.
nft_netlink_dump_start_rcu() is called with rcu read lock held.

But we call netlink_dump_start and that might sleep.

So I'm grabbing the module ref, release rcu read lock and re-acquire
it on return.

I didn't find out how to annotate this.

Given large number of similar warnings in alike-patterned code
I guess there isn't a way to do this.
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox series

Patch

diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index c7e74545a1a8..c22795db2953 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -373,7 +373,7 @@  static struct nft_table *nft_table_lookup(const struct net *net,
 	if (nla == NULL)
 		return ERR_PTR(-EINVAL);
 
-	list_for_each_entry(table, &net->nft.tables, list) {
+	list_for_each_entry_rcu(table, &net->nft.tables, list) {
 		if (!nla_strcmp(nla, table->name) &&
 		    table->family == family &&
 		    nft_active_genmask(table, genmask))
@@ -546,6 +546,24 @@  static int nf_tables_dump_tables(struct sk_buff *skb,
 	return skb->len;
 }
 
+static int nft_netlink_dump_start_rcu(struct sock *nlsk, struct sk_buff *skb,
+				      const struct nlmsghdr *nlh,
+				      struct netlink_dump_control *c)
+{
+	int err;
+
+	if (!try_module_get(THIS_MODULE))
+		return -EINVAL;
+
+	rcu_read_unlock();
+	err = netlink_dump_start(nlsk, skb, nlh, c);
+	rcu_read_lock();
+	module_put(THIS_MODULE);
+
+	return err;
+}
+
+/* called with rcu_read_lock held */
 static int nf_tables_gettable(struct net *net, struct sock *nlsk,
 			      struct sk_buff *skb, const struct nlmsghdr *nlh,
 			      const struct nlattr * const nla[],
@@ -561,8 +579,10 @@  static int nf_tables_gettable(struct net *net, struct sock *nlsk,
 	if (nlh->nlmsg_flags & NLM_F_DUMP) {
 		struct netlink_dump_control c = {
 			.dump = nf_tables_dump_tables,
+			.module = THIS_MODULE,
 		};
-		return netlink_dump_start(nlsk, skb, nlh, &c);
+
+		return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c);
 	}
 
 	table = nft_table_lookup(net, nla[NFTA_TABLE_NAME], family, genmask);
@@ -571,7 +591,7 @@  static int nf_tables_gettable(struct net *net, struct sock *nlsk,
 		return PTR_ERR(table);
 	}
 
-	skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+	skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC);
 	if (!skb2)
 		return -ENOMEM;
 
@@ -5676,7 +5696,7 @@  static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = {
 		.policy		= nft_table_policy,
 	},
 	[NFT_MSG_GETTABLE] = {
-		.call		= nf_tables_gettable,
+		.call_rcu	= nf_tables_gettable,
 		.attr_count	= NFTA_TABLE_MAX,
 		.policy		= nft_table_policy,
 	},