[iptables,v3,04/11] nft-cache: Introduce cache levels
diff mbox series

Message ID 20191008161447.6595-5-phil@nwl.cc
State Changes Requested
Delegated to: Pablo Neira
Headers show
Series
  • Improve iptables-nft performance with large rulesets
Related show

Commit Message

Phil Sutter Oct. 8, 2019, 4:14 p.m. UTC
Replace the simple have_cache boolean by a cache level indicator
defining how complete the cache is. Since have_cache indicated full
cache (including rules), make code depending on it check for cache level
NFT_CL_RULES.

Core cache fetching routine __nft_build_cache() accepts a new level via
parameter and raises cache completeness to that level.

Signed-off-by: Phil Sutter <phil@nwl.cc>
---
 iptables/nft-cache.c | 51 +++++++++++++++++++++++++++++++-------------
 iptables/nft.h       |  9 +++++++-
 2 files changed, 44 insertions(+), 16 deletions(-)

Comments

Pablo Neira Ayuso Oct. 9, 2019, 9:37 a.m. UTC | #1
Hi Phil,

On Tue, Oct 08, 2019 at 06:14:40PM +0200, Phil Sutter wrote:
> Replace the simple have_cache boolean by a cache level indicator
> defining how complete the cache is. Since have_cache indicated full
> cache (including rules), make code depending on it check for cache level
> NFT_CL_RULES.
> 
> Core cache fetching routine __nft_build_cache() accepts a new level via
> parameter and raises cache completeness to that level.
> 
> Signed-off-by: Phil Sutter <phil@nwl.cc>
> ---
>  iptables/nft-cache.c | 51 +++++++++++++++++++++++++++++++-------------
>  iptables/nft.h       |  9 +++++++-
>  2 files changed, 44 insertions(+), 16 deletions(-)
> 
> diff --git a/iptables/nft-cache.c b/iptables/nft-cache.c
> index 5444419a5cc3b..22a87e94efd76 100644
> --- a/iptables/nft-cache.c
> +++ b/iptables/nft-cache.c
> @@ -224,30 +224,49 @@ static int fetch_rule_cache(struct nft_handle *h)
>  	return 0;
>  }
>  
> -static void __nft_build_cache(struct nft_handle *h)
> +static void __nft_build_cache(struct nft_handle *h, enum nft_cache_level level)
>  {
>  	uint32_t genid_start, genid_stop;
>  
> +	if (level <= h->cache_level)
> +		return;
>  retry:
>  	mnl_genid_get(h, &genid_start);
> -	fetch_table_cache(h);
> -	fetch_chain_cache(h);
> -	fetch_rule_cache(h);
> -	h->have_cache = true;
> -	mnl_genid_get(h, &genid_stop);
>  
> +	switch (h->cache_level) {
> +	case NFT_CL_NONE:
> +		fetch_table_cache(h);
> +		if (level == NFT_CL_TABLES)
> +			break;
> +		/* fall through */
> +	case NFT_CL_TABLES:

If the existing level is TABLES and use wants chains, then you have to
invalidate the existing table cache, then fetch the tables and chains
to make sure cache is consistent. I mean, extending an existing cache
might lead to inconsistencies.

Am I missing anything?

> +		fetch_chain_cache(h);
> +		if (level == NFT_CL_CHAINS)
> +			break;
> +		/* fall through */
> +	case NFT_CL_CHAINS:
> +		fetch_rule_cache(h);
> +		if (level == NFT_CL_RULES)
> +			break;
> +		/* fall through */
> +	case NFT_CL_RULES:
> +		break;
> +	}
> +
> +	mnl_genid_get(h, &genid_stop);
>  	if (genid_start != genid_stop) {
>  		flush_chain_cache(h, NULL);
>  		goto retry;
>  	}
>  
> +	h->cache_level = level;
>  	h->nft_genid = genid_start;
>  }
Pablo Neira Ayuso Oct. 9, 2019, 10:29 a.m. UTC | #2
On Wed, Oct 09, 2019 at 11:37:23AM +0200, Pablo Neira Ayuso wrote:
> Hi Phil,
> 
> On Tue, Oct 08, 2019 at 06:14:40PM +0200, Phil Sutter wrote:
> > Replace the simple have_cache boolean by a cache level indicator
> > defining how complete the cache is. Since have_cache indicated full
> > cache (including rules), make code depending on it check for cache level
> > NFT_CL_RULES.
> > 
> > Core cache fetching routine __nft_build_cache() accepts a new level via
> > parameter and raises cache completeness to that level.
> > 
> > Signed-off-by: Phil Sutter <phil@nwl.cc>
> > ---
> >  iptables/nft-cache.c | 51 +++++++++++++++++++++++++++++++-------------
> >  iptables/nft.h       |  9 +++++++-
> >  2 files changed, 44 insertions(+), 16 deletions(-)
> > 
> > diff --git a/iptables/nft-cache.c b/iptables/nft-cache.c
> > index 5444419a5cc3b..22a87e94efd76 100644
> > --- a/iptables/nft-cache.c
> > +++ b/iptables/nft-cache.c
> > @@ -224,30 +224,49 @@ static int fetch_rule_cache(struct nft_handle *h)
> >  	return 0;
> >  }
> >  
> > -static void __nft_build_cache(struct nft_handle *h)
> > +static void __nft_build_cache(struct nft_handle *h, enum nft_cache_level level)
> >  {
> >  	uint32_t genid_start, genid_stop;
> >  
> > +	if (level <= h->cache_level)
> > +		return;
> >  retry:
> >  	mnl_genid_get(h, &genid_start);
> > -	fetch_table_cache(h);
> > -	fetch_chain_cache(h);
> > -	fetch_rule_cache(h);
> > -	h->have_cache = true;
> > -	mnl_genid_get(h, &genid_stop);
> >  
> > +	switch (h->cache_level) {
> > +	case NFT_CL_NONE:
> > +		fetch_table_cache(h);
> > +		if (level == NFT_CL_TABLES)
> > +			break;
> > +		/* fall through */
> > +	case NFT_CL_TABLES:
> 
> If the existing level is TABLES and use wants chains, then you have to
> invalidate the existing table cache, then fetch the tables and chains
> to make sure cache is consistent. I mean, extending an existing cache
> might lead to inconsistencies.
> 
> Am I missing anything?

If I'm correct, I wonder if we should go for splitting the parsing
from the evaluation phase here. Probably generate the rule by pointing
to the table and chain as string, then evaluate the ruleset update
batch to obtain the cache level in one go. This is the approach that
I had in mind with nftables, so you might avoid dumping the ruleset
over and over in an environment where dynamic updates are frequent.

The idea is to avoid fetching a cache, then canceling it by the rule
coming afterwards just because the cache is incomplete. So the cache
that is required is calculated once, then you go to the kernel and
fetch it (making sure generation number tells you that your cache is
consistent).

Makes sense to you?
Phil Sutter Oct. 10, 2019, 10:09 p.m. UTC | #3
Hi Pablo,

On Wed, Oct 09, 2019 at 12:29:01PM +0200, Pablo Neira Ayuso wrote:
> On Wed, Oct 09, 2019 at 11:37:23AM +0200, Pablo Neira Ayuso wrote:
> > Hi Phil,
> > 
> > On Tue, Oct 08, 2019 at 06:14:40PM +0200, Phil Sutter wrote:
> > > Replace the simple have_cache boolean by a cache level indicator
> > > defining how complete the cache is. Since have_cache indicated full
> > > cache (including rules), make code depending on it check for cache level
> > > NFT_CL_RULES.
> > > 
> > > Core cache fetching routine __nft_build_cache() accepts a new level via
> > > parameter and raises cache completeness to that level.
> > > 
> > > Signed-off-by: Phil Sutter <phil@nwl.cc>
> > > ---
> > >  iptables/nft-cache.c | 51 +++++++++++++++++++++++++++++++-------------
> > >  iptables/nft.h       |  9 +++++++-
> > >  2 files changed, 44 insertions(+), 16 deletions(-)
> > > 
> > > diff --git a/iptables/nft-cache.c b/iptables/nft-cache.c
> > > index 5444419a5cc3b..22a87e94efd76 100644
> > > --- a/iptables/nft-cache.c
> > > +++ b/iptables/nft-cache.c
> > > @@ -224,30 +224,49 @@ static int fetch_rule_cache(struct nft_handle *h)
> > >  	return 0;
> > >  }
> > >  
> > > -static void __nft_build_cache(struct nft_handle *h)
> > > +static void __nft_build_cache(struct nft_handle *h, enum nft_cache_level level)
> > >  {
> > >  	uint32_t genid_start, genid_stop;
> > >  
> > > +	if (level <= h->cache_level)
> > > +		return;
> > >  retry:
> > >  	mnl_genid_get(h, &genid_start);
> > > -	fetch_table_cache(h);
> > > -	fetch_chain_cache(h);
> > > -	fetch_rule_cache(h);
> > > -	h->have_cache = true;
> > > -	mnl_genid_get(h, &genid_stop);
> > >  
> > > +	switch (h->cache_level) {
> > > +	case NFT_CL_NONE:
> > > +		fetch_table_cache(h);
> > > +		if (level == NFT_CL_TABLES)
> > > +			break;
> > > +		/* fall through */
> > > +	case NFT_CL_TABLES:
> > 
> > If the existing level is TABLES and use wants chains, then you have to
> > invalidate the existing table cache, then fetch the tables and chains
> > to make sure cache is consistent. I mean, extending an existing cache
> > might lead to inconsistencies.
> > 
> > Am I missing anything?

Hmm, this is a valid point indeed. At least one can't depend on stored
genid to match the local cache's state which defeats its purpose.

> If I'm correct, I wonder if we should go for splitting the parsing
> from the evaluation phase here. Probably generate the rule by pointing
> to the table and chain as string, then evaluate the ruleset update
> batch to obtain the cache level in one go. This is the approach that
> I had in mind with nftables, so you might avoid dumping the ruleset
> over and over in an environment where dynamic updates are frequent.
> 
> The idea is to avoid fetching a cache, then canceling it by the rule
> coming afterwards just because the cache is incomplete. So the cache
> that is required is calculated once, then you go to the kernel and
> fetch it (making sure generation number tells you that your cache is
> consistent).
> 
> Makes sense to you?

Well, I understand your approach and it's a proper way to deal with the
situation, but I fear this means quite some effort. I imagine we either
extend the xtables-restore table delete logic to add and disable more
dependency commands based on kernel ruleset contents or add these
dependency commands when iterating over the command set and populating
the cache.

The thing is, we do this mostly just for xtables-restore --noflush
logic, which in turn is probably not used often but popular among "power
users" trying to speed up the bunch of iptables commands they have to
enter at once.

Maybe we could go with a simpler solution for now, which is to check
kernel genid again and drop the local cache if it differs from what's
stored. If it doesn't, the current cache is still up to date and we may
just fetch what's missing. Or does that leave room for a race condition?

Thanks, Phil
Pablo Neira Ayuso Oct. 11, 2019, 9:28 a.m. UTC | #4
On Fri, Oct 11, 2019 at 12:09:11AM +0200, Phil Sutter wrote:
> Hi Pablo,
> 
> On Wed, Oct 09, 2019 at 12:29:01PM +0200, Pablo Neira Ayuso wrote:
> > On Wed, Oct 09, 2019 at 11:37:23AM +0200, Pablo Neira Ayuso wrote:
> > > Hi Phil,
> > > 
> > > On Tue, Oct 08, 2019 at 06:14:40PM +0200, Phil Sutter wrote:
> > > > Replace the simple have_cache boolean by a cache level indicator
> > > > defining how complete the cache is. Since have_cache indicated full
> > > > cache (including rules), make code depending on it check for cache level
> > > > NFT_CL_RULES.
> > > > 
> > > > Core cache fetching routine __nft_build_cache() accepts a new level via
> > > > parameter and raises cache completeness to that level.
> > > > 
> > > > Signed-off-by: Phil Sutter <phil@nwl.cc>
> > > > ---
> > > >  iptables/nft-cache.c | 51 +++++++++++++++++++++++++++++++-------------
> > > >  iptables/nft.h       |  9 +++++++-
> > > >  2 files changed, 44 insertions(+), 16 deletions(-)
> > > > 
> > > > diff --git a/iptables/nft-cache.c b/iptables/nft-cache.c
> > > > index 5444419a5cc3b..22a87e94efd76 100644
> > > > --- a/iptables/nft-cache.c
> > > > +++ b/iptables/nft-cache.c
> > > > @@ -224,30 +224,49 @@ static int fetch_rule_cache(struct nft_handle *h)
> > > >  	return 0;
> > > >  }
> > > >  
> > > > -static void __nft_build_cache(struct nft_handle *h)
> > > > +static void __nft_build_cache(struct nft_handle *h, enum nft_cache_level level)
> > > >  {
> > > >  	uint32_t genid_start, genid_stop;
> > > >  
> > > > +	if (level <= h->cache_level)
> > > > +		return;
> > > >  retry:
> > > >  	mnl_genid_get(h, &genid_start);
> > > > -	fetch_table_cache(h);
> > > > -	fetch_chain_cache(h);
> > > > -	fetch_rule_cache(h);
> > > > -	h->have_cache = true;
> > > > -	mnl_genid_get(h, &genid_stop);
> > > >  
> > > > +	switch (h->cache_level) {
> > > > +	case NFT_CL_NONE:
> > > > +		fetch_table_cache(h);
> > > > +		if (level == NFT_CL_TABLES)
> > > > +			break;
> > > > +		/* fall through */
> > > > +	case NFT_CL_TABLES:
> > > 
> > > If the existing level is TABLES and use wants chains, then you have to
> > > invalidate the existing table cache, then fetch the tables and chains
> > > to make sure cache is consistent. I mean, extending an existing cache
> > > might lead to inconsistencies.
> > > 
> > > Am I missing anything?
> 
> Hmm, this is a valid point indeed. At least one can't depend on stored
> genid to match the local cache's state which defeats its purpose.

Exactly.

> > If I'm correct, I wonder if we should go for splitting the parsing
> > from the evaluation phase here. Probably generate the rule by pointing
> > to the table and chain as string, then evaluate the ruleset update
> > batch to obtain the cache level in one go. This is the approach that
> > I had in mind with nftables, so you might avoid dumping the ruleset
> > over and over in an environment where dynamic updates are frequent.
> > 
> > The idea is to avoid fetching a cache, then canceling it by the rule
> > coming afterwards just because the cache is incomplete. So the cache
> > that is required is calculated once, then you go to the kernel and
> > fetch it (making sure generation number tells you that your cache is
> > consistent).
> > 
> > Makes sense to you?
> 
> Well, I understand your approach and it's a proper way to deal with the
> situation, but I fear this means quite some effort. I imagine we either
> extend the xtables-restore table delete logic to add and disable more
> dependency commands based on kernel ruleset contents or add these
> dependency commands when iterating over the command set and populating
> the cache.

It's always more work to make a bit of architectural changes, yes.

> The thing is, we do this mostly just for xtables-restore --noflush
> logic, which in turn is probably not used often but popular among "power
> users" trying to speed up the bunch of iptables commands they have to
> enter at once.

Yes, this is the kind of user that deals with frequent dynamic
updates.

> Maybe we could go with a simpler solution for now, which is to check
> kernel genid again and drop the local cache if it differs from what's
> stored. If it doesn't, the current cache is still up to date and we may
> just fetch what's missing. Or does that leave room for a race condition?

You could also just parse the ruleset twice in userspace, once to
calculate the cache you need and another to actually create the
transaction batch and push it into the kernel. That's a bit poor man
approach, but it might work. You would need to invoke
xtables_restore_parse() twice.

This more simpler approach probably requires less changes to the
existing codebase, that can be "reverted" later on if someone needs to
speed up this by removing this parsing happening twice.
Pablo Neira Ayuso Oct. 11, 2019, 10:20 a.m. UTC | #5
On Fri, Oct 11, 2019 at 12:09:11AM +0200, Phil Sutter wrote:
[...]
> Maybe we could go with a simpler solution for now, which is to check
> kernel genid again and drop the local cache if it differs from what's
> stored. If it doesn't, the current cache is still up to date and we may
> just fetch what's missing. Or does that leave room for a race condition?

My concern with this approach is that, in the dynamic ruleset update
scenarios, assuming very frequent updates, you might lose race when
building the cache in stages. Hence, forcing you to restart from
scratch in the middle of the transaction handling.

I prefer to calculate the cache that is needed in one go by analyzing
the batch, it's simpler. Note that we might lose race still since
kernel might tell us we're working on an obsolete generation number ID
cache, forcing us to restart.
Pablo Neira Ayuso Oct. 14, 2019, 10 a.m. UTC | #6
On Fri, Oct 11, 2019 at 01:24:52PM +0200, Phil Sutter wrote:
> Hi,
> 
> On Fri, Oct 11, 2019 at 11:28:23AM +0200, Pablo Neira Ayuso wrote:
> [...]
> > You could also just parse the ruleset twice in userspace, once to
> > calculate the cache you need and another to actually create the
> > transaction batch and push it into the kernel. That's a bit poor man
> > approach, but it might work. You would need to invoke
> > xtables_restore_parse() twice.
> 
> The problem with parsing twice is having to cache input which may be
> huge for xtables-restore.
> 
> On Fri, Oct 11, 2019 at 12:20:52PM +0200, Pablo Neira Ayuso wrote:
> > On Fri, Oct 11, 2019 at 12:09:11AM +0200, Phil Sutter wrote:
> > [...]
> > > Maybe we could go with a simpler solution for now, which is to check
> > > kernel genid again and drop the local cache if it differs from what's
> > > stored. If it doesn't, the current cache is still up to date and we may
> > > just fetch what's missing. Or does that leave room for a race condition?
> > 
> > My concern with this approach is that, in the dynamic ruleset update
> > scenarios, assuming very frequent updates, you might lose race when
> > building the cache in stages. Hence, forcing you to restart from
> > scratch in the middle of the transaction handling.
> 
> In a very busy environment there's always trouble, simply because we
> can't atomically fetch ruleset from kernel and adjust and submit our
> batch. Dealing with that means we're back at xtables-lock.
> 
> > I prefer to calculate the cache that is needed in one go by analyzing
> > the batch, it's simpler. Note that we might lose race still since
> > kernel might tell us we're working on an obsolete generation number ID
> > cache, forcing us to restart.
> 
> My idea for conditional cache reset is based on the assumption that
> conflicts are rare and we want to optimize for non-conflict case. So
> core logic would be:
> 
> 1) fetch kernel genid into genid_start
> 2) if cache level > NFT_CL_NONE and cache genid != genid_start:
>    2a) drop local caches
>    2b) set cache level to NFT_CL_NONE
> 3) call cache fetchers based on cache level and desired level
> 4) fetch kernel genid into genid_end
> 5) if genid_start != genid_end goto 1
> 
> So this is basically the old algorithm but with (2) added. What do you
> think?

Please, make testcases to validate that races don't happen. Debugging
cache inconsistencies is not easy, that's why I like the idea of
calculating the cache first, then build it in one go. I'm fine with
starting with a more simple approach in the short term. Note that
reports from users on these cache inconsistency problems are usually
sparse, which is usually a bit frustrating. I understand a larger
rework might to accomodate the more simple approach will take more
time.

Patch
diff mbox series

diff --git a/iptables/nft-cache.c b/iptables/nft-cache.c
index 5444419a5cc3b..22a87e94efd76 100644
--- a/iptables/nft-cache.c
+++ b/iptables/nft-cache.c
@@ -224,30 +224,49 @@  static int fetch_rule_cache(struct nft_handle *h)
 	return 0;
 }
 
-static void __nft_build_cache(struct nft_handle *h)
+static void __nft_build_cache(struct nft_handle *h, enum nft_cache_level level)
 {
 	uint32_t genid_start, genid_stop;
 
+	if (level <= h->cache_level)
+		return;
 retry:
 	mnl_genid_get(h, &genid_start);
-	fetch_table_cache(h);
-	fetch_chain_cache(h);
-	fetch_rule_cache(h);
-	h->have_cache = true;
-	mnl_genid_get(h, &genid_stop);
 
+	switch (h->cache_level) {
+	case NFT_CL_NONE:
+		fetch_table_cache(h);
+		if (level == NFT_CL_TABLES)
+			break;
+		/* fall through */
+	case NFT_CL_TABLES:
+		fetch_chain_cache(h);
+		if (level == NFT_CL_CHAINS)
+			break;
+		/* fall through */
+	case NFT_CL_CHAINS:
+		fetch_rule_cache(h);
+		if (level == NFT_CL_RULES)
+			break;
+		/* fall through */
+	case NFT_CL_RULES:
+		break;
+	}
+
+	mnl_genid_get(h, &genid_stop);
 	if (genid_start != genid_stop) {
 		flush_chain_cache(h, NULL);
 		goto retry;
 	}
 
+	h->cache_level = level;
 	h->nft_genid = genid_start;
 }
 
 void nft_build_cache(struct nft_handle *h)
 {
-	if (!h->have_cache)
-		__nft_build_cache(h);
+	if (h->cache_level < NFT_CL_RULES)
+		__nft_build_cache(h, NFT_CL_RULES);
 }
 
 void nft_fake_cache(struct nft_handle *h)
@@ -263,7 +282,7 @@  void nft_fake_cache(struct nft_handle *h)
 
 		h->cache->table[type].chains = nftnl_chain_list_alloc();
 	}
-	h->have_cache = true;
+	h->cache_level = NFT_CL_RULES;
 	mnl_genid_get(h, &h->nft_genid);
 }
 
@@ -331,19 +350,22 @@  static int flush_cache(struct nft_handle *h, struct nft_cache *c,
 
 void flush_chain_cache(struct nft_handle *h, const char *tablename)
 {
-	if (!h->have_cache)
+	if (!h->cache_level)
 		return;
 
 	if (flush_cache(h, h->cache, tablename))
-		h->have_cache = false;
+		h->cache_level = NFT_CL_NONE;
 }
 
 void nft_rebuild_cache(struct nft_handle *h)
 {
-	if (h->have_cache)
+	enum nft_cache_level level = h->cache_level;
+
+	if (h->cache_level)
 		__nft_flush_cache(h);
 
-	__nft_build_cache(h);
+	h->cache_level = NFT_CL_NONE;
+	__nft_build_cache(h, level);
 }
 
 void nft_release_cache(struct nft_handle *h)
@@ -354,8 +376,7 @@  void nft_release_cache(struct nft_handle *h)
 
 struct nftnl_table_list *nftnl_table_list_get(struct nft_handle *h)
 {
-	if (!h->cache->tables)
-		fetch_table_cache(h);
+	__nft_build_cache(h, NFT_CL_TABLES);
 
 	return h->cache->tables;
 }
diff --git a/iptables/nft.h b/iptables/nft.h
index 451c266016d8d..9ae3122a1c515 100644
--- a/iptables/nft.h
+++ b/iptables/nft.h
@@ -27,6 +27,13 @@  struct builtin_table {
 	struct builtin_chain chains[NF_INET_NUMHOOKS];
 };
 
+enum nft_cache_level {
+	NFT_CL_NONE,
+	NFT_CL_TABLES,
+	NFT_CL_CHAINS,
+	NFT_CL_RULES
+};
+
 struct nft_cache {
 	struct nftnl_table_list		*tables;
 	struct {
@@ -53,7 +60,7 @@  struct nft_handle {
 	unsigned int		cache_index;
 	struct nft_cache	__cache[2];
 	struct nft_cache	*cache;
-	bool			have_cache;
+	enum nft_cache_level	cache_level;
 	bool			restore;
 	bool			noflush;
 	int8_t			config_done;