Patchwork [3/3] netfilter: ctnetlink: allow userspace to set labels

login
register
mail settings
Submitter Florian Westphal
Date Nov. 15, 2012, 3:55 p.m.
Message ID <1352994915-3859-4-git-send-email-fw@strlen.de>
Download mbox | patch
Permalink /patch/199332/
State Not Applicable
Headers show

Comments

Florian Westphal - Nov. 15, 2012, 3:55 p.m.
Signed-off-by: Florian Westphal <fw@strlen.de>
---
 include/net/netfilter/nf_conntrack_labels.h |    3 ++
 net/netfilter/nf_conntrack_labels.c         |   34 +++++++++++++++++++++++++++
 net/netfilter/nf_conntrack_netlink.c        |   23 ++++++++++++++++++
 3 files changed, 60 insertions(+), 0 deletions(-)
Pablo Neira - Nov. 27, 2012, 11:18 a.m.
Hi Florian,

On Thu, Nov 15, 2012 at 04:55:15PM +0100, Florian Westphal wrote:
> Signed-off-by: Florian Westphal <fw@strlen.de>
> ---
>  include/net/netfilter/nf_conntrack_labels.h |    3 ++
>  net/netfilter/nf_conntrack_labels.c         |   34 +++++++++++++++++++++++++++
>  net/netfilter/nf_conntrack_netlink.c        |   23 ++++++++++++++++++
>  3 files changed, 60 insertions(+), 0 deletions(-)
> 
> diff --git a/include/net/netfilter/nf_conntrack_labels.h b/include/net/netfilter/nf_conntrack_labels.h
> index fdd88fb..e058b2b 100644
> --- a/include/net/netfilter/nf_conntrack_labels.h
> +++ b/include/net/netfilter/nf_conntrack_labels.h
> @@ -52,3 +52,6 @@ static inline void nf_conntrack_labels_fini(struct net *net) {}
>  
>  bool nf_connlabel_match(const struct nf_conn *ct, u16 bit);
>  int nf_connlabel_set(struct nf_conn *ct, u16 bit);
> +
> +int nfnetlink_connlabel_set(struct nf_conn *ct, const void *data,
> +			    unsigned int length);
> diff --git a/net/netfilter/nf_conntrack_labels.c b/net/netfilter/nf_conntrack_labels.c
> index fe9c0c6..8bff33c 100644
> --- a/net/netfilter/nf_conntrack_labels.c
> +++ b/net/netfilter/nf_conntrack_labels.c
> @@ -52,6 +52,40 @@ int nf_connlabel_set(struct nf_conn *ct, u16 bit)
>  }
>  EXPORT_SYMBOL_GPL(nf_connlabel_set);
>  
> +#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
> +int nfnetlink_connlabel_set(struct nf_conn *ct, const void *data, unsigned int length)
> +{
> +	const size_t maxblen = (1024 / BITS_PER_LONG) * sizeof(long);
> +	struct nf_conn_labels *labels;
> +	unsigned int size;
> +
> +	if (length > maxblen)
> +		return -EMSGSIZE;
> +
> +	labels = nf_ct_labels_find(ct);
> +	if (!labels)
> +		return -ENOSPC;
> +
> +	size = labels->words * sizeof(long);
> +
> +	if (size < length)
> +		length = size;
> +
> +	if (length)
> +		memcpy(labels->bits, data, length);
> +
> +	if (size > length) {
> +		unsigned int pad = size - length;
> +		char *mem = (void *) labels->bits;
> +		memset(mem + pad, 0, pad);
> +	}
> +
> +	nf_conntrack_event_cache(IPCT_LABEL, ct);
> +	return 0;
> +}

Via ctnetlink_new_conntrack, we should be able to create and set the
connlabel if we want to support state-sync of connlabels.

That requires calling _ext_add(...) to allocate the label, based on
cda[CTA_LABELS], and set it. In that case we're safe to memcpy without
interfering with any ongoing bit testing since that conntrack is not
in the hashes yet.

For the update case, I think we'll have to iterate over the mask and
use xchg to update words, thus, we avoid any interference ongoing bit
testing.

> +EXPORT_SYMBOL_GPL(nfnetlink_connlabel_set);
> +#endif
> +
>  static struct nf_ct_ext_type labels_extend __read_mostly = {
>  	.len    = sizeof(struct nf_conn_labels),
>  	.align  = __alignof__(struct nf_conn_labels),
> diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
> index 43a1247..834fe99 100644
> --- a/net/netfilter/nf_conntrack_netlink.c
> +++ b/net/netfilter/nf_conntrack_netlink.c
> @@ -1229,6 +1229,16 @@ ctnetlink_change_nat(struct nf_conn *ct, const struct nlattr * const cda[])
>  }
>  
>  static inline int
> +ctnetlink_attach_label(struct nf_conn *ct, const struct nlattr * const cda[])
> +{
> +#ifdef CONFIG_NF_CONNTRACK_LABELS
> +	return nfnetlink_connlabel_set(ct, nla_data(cda[CTA_LABELS]), nla_len(cda[CTA_LABELS]));
> +#else
> +	return -EOPNOTSUPP;
> +#endif
> +}
> +
> +static inline int
>  ctnetlink_change_helper(struct nf_conn *ct, const struct nlattr * const cda[])
>  {
>  	struct nf_conntrack_helper *helper;
> @@ -1441,6 +1451,11 @@ ctnetlink_change_conntrack(struct nf_conn *ct,
>  			return err;
>  	}
>  #endif
> +	if (cda[CTA_LABELS]) {
> +		err = ctnetlink_attach_label(ct, cda);
> +		if (err < 0)
> +			return err;
> +	}
>  
>  	return 0;
>  }
> @@ -1649,6 +1664,9 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb,
>  			else
>  				events = IPCT_NEW;
>  
> +			if (cda[CTA_LABELS] && ctnetlink_attach_label(ct, cda) == 0)
> +				events |= (1 << IPCT_LABEL);
> +
>  			nf_conntrack_eventmask_report((1 << IPCT_REPLY) |
>  						      (1 << IPCT_ASSURED) |
>  						      (1 << IPCT_HELPER) |
> @@ -1946,6 +1964,11 @@ ctnetlink_nfqueue_parse_ct(const struct nlattr *cda[], struct nf_conn *ct)
>  		if (err < 0)
>  			return err;
>  	}
> +	if (cda[CTA_LABELS]) {
> +		err = ctnetlink_attach_label(ct, cda);
> +		if (err < 0)
> +			return err;
> +	}
>  #if defined(CONFIG_NF_CONNTRACK_MARK)
>  	if (cda[CTA_MARK])
>  		ct->mark = ntohl(nla_get_be32(cda[CTA_MARK]));
> -- 
> 1.7.8.6
> 
> --
> To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Florian Westphal - Nov. 27, 2012, 11:50 a.m.
Pablo Neira Ayuso <pablo@netfilter.org> wrote:
> > +#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
> > +int nfnetlink_connlabel_set(struct nf_conn *ct, const void *data, unsigned int length)
> > +{
[..]
> > +	labels = nf_ct_labels_find(ct);
> > +	if (!labels)
> > +		return -ENOSPC;
[..]

> Via ctnetlink_new_conntrack, we should be able to create and set the
> connlabel if we want to support state-sync of connlabels.

Right.   Good point.

> That requires calling _ext_add(...) to allocate the label, based on
> cda[CTA_LABELS], and set it. In that case we're safe to memcpy without
> interfering with any ongoing bit testing since that conntrack is not
> in the hashes yet.

True.  So we can't race with other _ext_add() callers either.
I'll add this functionality, thanks for pointing this out.

> For the update case, I think we'll have to iterate over the mask and
> use xchg to update words, thus, we avoid any interference ongoing bit
> testing.

Could you elaborate?
Why is memcpy not good enough here?
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Pablo Neira - Nov. 27, 2012, 12:31 p.m.
On Tue, Nov 27, 2012 at 12:50:00PM +0100, Florian Westphal wrote:
> Pablo Neira Ayuso <pablo@netfilter.org> wrote:
> > > +#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
> > > +int nfnetlink_connlabel_set(struct nf_conn *ct, const void *data, unsigned int length)
> > > +{
> [..]
> > > +	labels = nf_ct_labels_find(ct);
> > > +	if (!labels)
> > > +		return -ENOSPC;
> [..]
> 
> > Via ctnetlink_new_conntrack, we should be able to create and set the
> > connlabel if we want to support state-sync of connlabels.
> 
> Right.   Good point.
> 
> > That requires calling _ext_add(...) to allocate the label, based on
> > cda[CTA_LABELS], and set it. In that case we're safe to memcpy without
> > interfering with any ongoing bit testing since that conntrack is not
> > in the hashes yet.
> 
> True.  So we can't race with other _ext_add() callers either.
> I'll add this functionality, thanks for pointing this out.
> 
> > For the update case, I think we'll have to iterate over the mask and
> > use xchg to update words, thus, we avoid any interference ongoing bit
> > testing.
> 
> Could you elaborate?
> Why is memcpy not good enough here?

while updating the connlabel via memcpy, some test_bit on the
connlabel may be already happening. I was suggesting some way to avoid
racing with it.
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Pablo Neira - Nov. 27, 2012, 12:39 p.m.
While at it, can you also address a couple of nitpicking suggestions?
See below.

On Thu, Nov 15, 2012 at 04:55:15PM +0100, Florian Westphal wrote:
> Signed-off-by: Florian Westphal <fw@strlen.de>
> ---
>  include/net/netfilter/nf_conntrack_labels.h |    3 ++
>  net/netfilter/nf_conntrack_labels.c         |   34 +++++++++++++++++++++++++++
>  net/netfilter/nf_conntrack_netlink.c        |   23 ++++++++++++++++++
>  3 files changed, 60 insertions(+), 0 deletions(-)
> 
> diff --git a/include/net/netfilter/nf_conntrack_labels.h b/include/net/netfilter/nf_conntrack_labels.h
> index fdd88fb..e058b2b 100644
> --- a/include/net/netfilter/nf_conntrack_labels.h
> +++ b/include/net/netfilter/nf_conntrack_labels.h
> @@ -52,3 +52,6 @@ static inline void nf_conntrack_labels_fini(struct net *net) {}
>  
>  bool nf_connlabel_match(const struct nf_conn *ct, u16 bit);
>  int nf_connlabel_set(struct nf_conn *ct, u16 bit);
> +
> +int nfnetlink_connlabel_set(struct nf_conn *ct, const void *data,
> +			    unsigned int length);
> diff --git a/net/netfilter/nf_conntrack_labels.c b/net/netfilter/nf_conntrack_labels.c
> index fe9c0c6..8bff33c 100644
> --- a/net/netfilter/nf_conntrack_labels.c
> +++ b/net/netfilter/nf_conntrack_labels.c
> @@ -52,6 +52,40 @@ int nf_connlabel_set(struct nf_conn *ct, u16 bit)
>  }
>  EXPORT_SYMBOL_GPL(nf_connlabel_set);
>  
> +#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
> +int nfnetlink_connlabel_set(struct nf_conn *ct, const void *data, unsigned int length)

Probably rename this to nf_connlabel_set? It is not specific of
nfnetlink (although it is the only client of it so far).

> +{
> +	const size_t maxblen = (1024 / BITS_PER_LONG) * sizeof(long);
> +	struct nf_conn_labels *labels;
> +	unsigned int size;
> +
> +	if (length > maxblen)
> +		return -EMSGSIZE;
> +
> +	labels = nf_ct_labels_find(ct);
> +	if (!labels)
> +		return -ENOSPC;
> +
> +	size = labels->words * sizeof(long);
> +
> +	if (size < length)
> +		length = size;
> +
> +	if (length)
> +		memcpy(labels->bits, data, length);
> +
> +	if (size > length) {
> +		unsigned int pad = size - length;
> +		char *mem = (void *) labels->bits;
> +		memset(mem + pad, 0, pad);
> +	}
> +
> +	nf_conntrack_event_cache(IPCT_LABEL, ct);
> +	return 0;
> +}
> +EXPORT_SYMBOL_GPL(nfnetlink_connlabel_set);
> +#endif
> +
>  static struct nf_ct_ext_type labels_extend __read_mostly = {
>  	.len    = sizeof(struct nf_conn_labels),
>  	.align  = __alignof__(struct nf_conn_labels),
> diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
> index 43a1247..834fe99 100644
> --- a/net/netfilter/nf_conntrack_netlink.c
> +++ b/net/netfilter/nf_conntrack_netlink.c
> @@ -1229,6 +1229,16 @@ ctnetlink_change_nat(struct nf_conn *ct, const struct nlattr * const cda[])
>  }
>  
>  static inline int
> +ctnetlink_attach_label(struct nf_conn *ct, const struct nlattr * const cda[])
> +{
> +#ifdef CONFIG_NF_CONNTRACK_LABELS
> +	return nfnetlink_connlabel_set(ct, nla_data(cda[CTA_LABELS]), nla_len(cda[CTA_LABELS]));
> +#else
> +	return -EOPNOTSUPP;
> +#endif
> +}

Better move this function above to include/net/netfilter/nf_conntrack_labels.h.

> +
> +static inline int
>  ctnetlink_change_helper(struct nf_conn *ct, const struct nlattr * const cda[])
>  {
>  	struct nf_conntrack_helper *helper;
> @@ -1441,6 +1451,11 @@ ctnetlink_change_conntrack(struct nf_conn *ct,
>  			return err;
>  	}
>  #endif
> +	if (cda[CTA_LABELS]) {
> +		err = ctnetlink_attach_label(ct, cda);
> +		if (err < 0)
> +			return err;
> +	}
>  
>  	return 0;
>  }
> @@ -1649,6 +1664,9 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb,
>  			else
>  				events = IPCT_NEW;
>  
> +			if (cda[CTA_LABELS] && ctnetlink_attach_label(ct, cda) == 0)
> +				events |= (1 << IPCT_LABEL);
> +
>  			nf_conntrack_eventmask_report((1 << IPCT_REPLY) |
>  						      (1 << IPCT_ASSURED) |
>  						      (1 << IPCT_HELPER) |
> @@ -1946,6 +1964,11 @@ ctnetlink_nfqueue_parse_ct(const struct nlattr *cda[], struct nf_conn *ct)
>  		if (err < 0)
>  			return err;
>  	}
> +	if (cda[CTA_LABELS]) {
> +		err = ctnetlink_attach_label(ct, cda);
> +		if (err < 0)
> +			return err;
> +	}
>  #if defined(CONFIG_NF_CONNTRACK_MARK)
>  	if (cda[CTA_MARK])
>  		ct->mark = ntohl(nla_get_be32(cda[CTA_MARK]));
> -- 
> 1.7.8.6
> 
> --
> To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Florian Westphal - Nov. 27, 2012, 1:09 p.m.
Pablo Neira Ayuso <pablo@netfilter.org> wrote:
> > > For the update case, I think we'll have to iterate over the mask and
> > > use xchg to update words, thus, we avoid any interference ongoing bit
> > > testing.
> > 
> > Could you elaborate?
> > Why is memcpy not good enough here?
> 
> while updating the connlabel via memcpy, some test_bit on the
> connlabel may be already happening. I was suggesting some way to avoid
> racing with it.

I don't understand why its racing.

Is there a case where we update a word, and test_bit can return
"bit is set", even if the bit in the word is neither currently
set nor about to be set?

If not, then I don't see the race; either the test happens
before we copied the word, or afterwards; regardless of copy vs.
xchg?
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Pablo Neira - Nov. 27, 2012, 2:13 p.m.
On Tue, Nov 27, 2012 at 02:09:04PM +0100, Florian Westphal wrote:
> Pablo Neira Ayuso <pablo@netfilter.org> wrote:
> > > > For the update case, I think we'll have to iterate over the mask and
> > > > use xchg to update words, thus, we avoid any interference ongoing bit
> > > > testing.
> > > 
> > > Could you elaborate?
> > > Why is memcpy not good enough here?
> > 
> > while updating the connlabel via memcpy, some test_bit on the
> > connlabel may be already happening. I was suggesting some way to avoid
> > racing with it.
> 
> I don't understand why its racing.
> 
> Is there a case where we update a word, and test_bit can return
> "bit is set", even if the bit in the word is neither currently
> set nor about to be set?
> 
> If not, then I don't see the race; either the test happens
> before we copied the word, or afterwards; regardless of copy vs.
> xchg?

I was thinking on the case in which we are setting bits via the
connlabel extension and modifying this from ctnetlink at the same
time.

But I don't see any way to make it any better, I think your approach
is fine for the update case.
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Florian Westphal - Nov. 27, 2012, 2:24 p.m.
Pablo Neira Ayuso <pablo@netfilter.org> wrote:
> I was thinking on the case in which we are setting bits via the
> connlabel extension and modifying this from ctnetlink at the same
> time.

Indeed, in that case we might scribble over a bit that has been set
the instant before.

And yes, this might be a problem.
The only way to fix it (AFAICS) would be to add a new interface to
allow (un)setting specific bits from userspace, so that userspace
could request "set this bit" or "clear that bit", rather than the
current "dump/modify/replace" cycle.
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Pablo Neira - Nov. 30, 2012, 1:58 p.m.
On Tue, Nov 27, 2012 at 03:24:33PM +0100, Florian Westphal wrote:
> Pablo Neira Ayuso <pablo@netfilter.org> wrote:
> > I was thinking on the case in which we are setting bits via the
> > connlabel extension and modifying this from ctnetlink at the same
> > time.
> 
> Indeed, in that case we might scribble over a bit that has been set
> the instant before.
> 
> And yes, this might be a problem.
> The only way to fix it (AFAICS) would be to add a new interface to
> allow (un)setting specific bits from userspace, so that userspace
> could request "set this bit" or "clear that bit", rather than the
> current "dump/modify/replace" cycle.

I see. That replacement operation still seems useful to me though.

Quick idea: I think we can also support atomic replacement at word
size using xchg, so the replacement operation can still happen at word
level. Setting many bits at once would be also faster with that.
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Florian Westphal - Nov. 30, 2012, 2:02 p.m.
Pablo Neira Ayuso <pablo@netfilter.org> wrote:
> Quick idea: I think we can also support atomic replacement at word
> size using xchg, so the replacement operation can still happen at word
> level. Setting many bits at once would be also faster with that.

Unfortunately, no -- the interface is too rudimentary.
Example: You want to set bis 0, 2, 6; but leave all other
bit that are set intact.

So you first need to make a dump to fetch the current labels set.
Then, you set bits 0, 2, 6 and send the new state to the kernel.

But between the dump and the set operation, a new bit might
have been set.  So even when using xchg it will be un-set again...
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Pablo Neira - Nov. 30, 2012, 6:34 p.m.
On Fri, Nov 30, 2012 at 03:02:54PM +0100, Florian Westphal wrote:
> Pablo Neira Ayuso <pablo@netfilter.org> wrote:
> > Quick idea: I think we can also support atomic replacement at word
> > size using xchg, so the replacement operation can still happen at word
> > level. Setting many bits at once would be also faster with that.
> 
> Unfortunately, no -- the interface is too rudimentary.
> Example: You want to set bis 0, 2, 6; but leave all other
> bit that are set intact.
> 
> So you first need to make a dump to fetch the current labels set.
> Then, you set bits 0, 2, 6 and send the new state to the kernel.
> 
> But between the dump and the set operation, a new bit might
> have been set.  So even when using xchg it will be un-set again...

what about cmpxchg inside a loop? I think we can assume that the
probability of interference while updating a word is low.
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Florian Westphal - Nov. 30, 2012, 9:36 p.m.
Pablo Neira Ayuso <pablo@netfilter.org> wrote:
> On Fri, Nov 30, 2012 at 03:02:54PM +0100, Florian Westphal wrote:
[..]
> > Unfortunately, no -- the interface is too rudimentary.
> > Example: You want to set bis 0, 2, 6; but leave all other
> > bit that are set intact.
> > 
> > So you first need to make a dump to fetch the current labels set.
> > Then, you set bits 0, 2, 6 and send the new state to the kernel.
> > 
> > But between the dump and the set operation, a new bit might
> > have been set.  So even when using xchg it will be un-set again...
> 
> what about cmpxchg inside a loop? I think we can assume that the
> probability of interference while updating a word is low.

I'll try to give an example.

Userspace tool wants to set bit 0 on all labels, and remove all
others.  Except label 1<<31, which should be left alone.

With the proposed interface, you would do something like

dump_conntracks();

for_each_ct_object(..) {
	u32 word = get_ct_labels(ct);

	word &= 0x80000000; /* clear all but bit 31 */
	word |= 1;	    /* set bit 0 */
	send_change_to_kernel(ct); /* tell kernel */
}

No matter what xchg tricks you do in the kernel: if 1<<31 was set
after the dump completed, it will be un-set again via
send_change_to_kernel(), i.e. we clear bit 1<<31, even though we didn't
want to.  I don't see how this can be solved; kernel has no idea that
userspace doesn't wish to alter 1<<31.

We would have to add explicit support for setting/clearing single bits
to make this work, then userspace could say 'set bit 0'. 'clear bit 1',
etc.

HOWEVER, i've failed to come up with a plausible usage scenario where
such an interface would be useful/required. 8-}

So, I'd propose to leave things as they are, i.e. userspace commits
the entire connlabel bitvector. The ruleset would presumably
re-set required labels anyway on the next packet.

If someone can come up with a usage scenario where this
isn't sufficient, we could always add such a 'clear/set bit' command
later.

What do you think?

Cheers,
Florian
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Pablo Neira - Dec. 3, 2012, 11:04 a.m.
Hi Florian,

On Fri, Nov 30, 2012 at 10:36:31PM +0100, Florian Westphal wrote:
> Pablo Neira Ayuso <pablo@netfilter.org> wrote:
> > On Fri, Nov 30, 2012 at 03:02:54PM +0100, Florian Westphal wrote:
> [..]
> > > Unfortunately, no -- the interface is too rudimentary.
> > > Example: You want to set bis 0, 2, 6; but leave all other
> > > bit that are set intact.
> > > 
> > > So you first need to make a dump to fetch the current labels set.
> > > Then, you set bits 0, 2, 6 and send the new state to the kernel.
> > > 
> > > But between the dump and the set operation, a new bit might
> > > have been set.  So even when using xchg it will be un-set again...
> > 
> > what about cmpxchg inside a loop? I think we can assume that the
> > probability of interference while updating a word is low.
> 
> I'll try to give an example.
> 
> Userspace tool wants to set bit 0 on all labels, and remove all
> others.  Except label 1<<31, which should be left alone.
> 
> With the proposed interface, you would do something like
> 
> dump_conntracks();
> 
> for_each_ct_object(..) {
> 	u32 word = get_ct_labels(ct);
> 
> 	word &= 0x80000000; /* clear all but bit 31 */
> 	word |= 1;	    /* set bit 0 */
> 	send_change_to_kernel(ct); /* tell kernel */
> }
>
> No matter what xchg tricks you do in the kernel: if 1<<31 was set
> after the dump completed, it will be un-set again via
> send_change_to_kernel(), i.e. we clear bit 1<<31, even though we didn't
> want to.  I don't see how this can be solved; kernel has no idea that
> userspace doesn't wish to alter 1<<31.

We can return -EAGAIN to userspace with cmpxchg. From kernel-space:

old = word;
word |= flags & mask; /* to set/unset a bunch of bits */
if (xchgcmp(&word, new, old) != old)
        return -EAGAIN;

So user-space has to refresh its connlabel and try again with the
fresh one. But that's too much for for user-space.

An alternative for this would be like:

do {
        old = word;
        word |= flags & mask;
} while (xchgcmp(&word, new, old) != old;

So we make sure that we are operating with the fresh word.

I'm assuming we're fine if kernel just set some bit and later on
we explicitly unset it.

Note that I'm also assuming that we pass flags and mask as attribute
as we do for nf_conntrack_tcp.c (see netlink attributes).

> We would have to add explicit support for setting/clearing single bits
> to make this work, then userspace could say 'set bit 0'. 'clear bit 1',
> etc.
> 
> HOWEVER, i've failed to come up with a plausible usage scenario where
> such an interface would be useful/required. 8-}
>
> So, I'd propose to leave things as they are, i.e. userspace commits
> the entire connlabel bitvector. The ruleset would presumably
> re-set required labels anyway on the next packet.
> 
> If someone can come up with a usage scenario where this
> isn't sufficient, we could always add such a 'clear/set bit' command
> later.
> 
> What do you think?

If you think we can make it with the suggestions above, good. If not,
we can keep this back and revisit it later once we get a clear
use-case. You have leave the libnetfilter_conntrack changes on some
branch.
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Florian Westphal - Dec. 3, 2012, 11:13 a.m.
Pablo Neira Ayuso <pablo@netfilter.org> wrote:
> > No matter what xchg tricks you do in the kernel: if 1<<31 was set
> > after the dump completed, it will be un-set again via
> > send_change_to_kernel(), i.e. we clear bit 1<<31, even though we didn't
> > want to.  I don't see how this can be solved; kernel has no idea that
> > userspace doesn't wish to alter 1<<31.
> 
> We can return -EAGAIN to userspace with cmpxchg. From kernel-space:
> 
> old = word;
> word |= flags & mask; /* to set/unset a bunch of bits */
> if (xchgcmp(&word, new, old) != old)
>         return -EAGAIN;

Ah.  you're pulling a mask parameter out of your hat :-)

> I'm assuming we're fine if kernel just set some bit and later on
> we explicitly unset it.

Right.

> Note that I'm also assuming that we pass flags and mask as attribute
> as we do for nf_conntrack_tcp.c (see netlink attributes).

Yes, there is no such thing at the moment.
Userspace just sends a CTA_LABEL attribute, which is a bit-vector
(u32 array).

I can add CTA_LABEL_MASK, of course, and change the memcpy accordingly.

In fact, I think I'll do so to see how much additional code it would be.

Thanks for your hint,
Florian
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Pablo Neira - Dec. 3, 2012, 12:58 p.m.
On Mon, Dec 03, 2012 at 12:13:32PM +0100, Florian Westphal wrote:
> Pablo Neira Ayuso <pablo@netfilter.org> wrote:
> > > No matter what xchg tricks you do in the kernel: if 1<<31 was set
> > > after the dump completed, it will be un-set again via
> > > send_change_to_kernel(), i.e. we clear bit 1<<31, even though we didn't
> > > want to.  I don't see how this can be solved; kernel has no idea that
> > > userspace doesn't wish to alter 1<<31.
> > 
> > We can return -EAGAIN to userspace with cmpxchg. From kernel-space:
> > 
> > old = word;
> > word |= flags & mask; /* to set/unset a bunch of bits */
> > if (xchgcmp(&word, new, old) != old)
> >         return -EAGAIN;
> 
> Ah.  you're pulling a mask parameter out of your hat :-)

I did :-)

BTW, -EAGAIN already has a meaning for nfnetlink, so some other error
should be returned for the approach above.

Not sure if you checked the other approach I mentioned:

do {
        old = word;
        word |= flags & mask;
} while (xchgcmp(&word, new, old) != old);

So ctnetlink would keep trying until no interference happen.
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Patch

diff --git a/include/net/netfilter/nf_conntrack_labels.h b/include/net/netfilter/nf_conntrack_labels.h
index fdd88fb..e058b2b 100644
--- a/include/net/netfilter/nf_conntrack_labels.h
+++ b/include/net/netfilter/nf_conntrack_labels.h
@@ -52,3 +52,6 @@  static inline void nf_conntrack_labels_fini(struct net *net) {}
 
 bool nf_connlabel_match(const struct nf_conn *ct, u16 bit);
 int nf_connlabel_set(struct nf_conn *ct, u16 bit);
+
+int nfnetlink_connlabel_set(struct nf_conn *ct, const void *data,
+			    unsigned int length);
diff --git a/net/netfilter/nf_conntrack_labels.c b/net/netfilter/nf_conntrack_labels.c
index fe9c0c6..8bff33c 100644
--- a/net/netfilter/nf_conntrack_labels.c
+++ b/net/netfilter/nf_conntrack_labels.c
@@ -52,6 +52,40 @@  int nf_connlabel_set(struct nf_conn *ct, u16 bit)
 }
 EXPORT_SYMBOL_GPL(nf_connlabel_set);
 
+#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
+int nfnetlink_connlabel_set(struct nf_conn *ct, const void *data, unsigned int length)
+{
+	const size_t maxblen = (1024 / BITS_PER_LONG) * sizeof(long);
+	struct nf_conn_labels *labels;
+	unsigned int size;
+
+	if (length > maxblen)
+		return -EMSGSIZE;
+
+	labels = nf_ct_labels_find(ct);
+	if (!labels)
+		return -ENOSPC;
+
+	size = labels->words * sizeof(long);
+
+	if (size < length)
+		length = size;
+
+	if (length)
+		memcpy(labels->bits, data, length);
+
+	if (size > length) {
+		unsigned int pad = size - length;
+		char *mem = (void *) labels->bits;
+		memset(mem + pad, 0, pad);
+	}
+
+	nf_conntrack_event_cache(IPCT_LABEL, ct);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nfnetlink_connlabel_set);
+#endif
+
 static struct nf_ct_ext_type labels_extend __read_mostly = {
 	.len    = sizeof(struct nf_conn_labels),
 	.align  = __alignof__(struct nf_conn_labels),
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 43a1247..834fe99 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -1229,6 +1229,16 @@  ctnetlink_change_nat(struct nf_conn *ct, const struct nlattr * const cda[])
 }
 
 static inline int
+ctnetlink_attach_label(struct nf_conn *ct, const struct nlattr * const cda[])
+{
+#ifdef CONFIG_NF_CONNTRACK_LABELS
+	return nfnetlink_connlabel_set(ct, nla_data(cda[CTA_LABELS]), nla_len(cda[CTA_LABELS]));
+#else
+	return -EOPNOTSUPP;
+#endif
+}
+
+static inline int
 ctnetlink_change_helper(struct nf_conn *ct, const struct nlattr * const cda[])
 {
 	struct nf_conntrack_helper *helper;
@@ -1441,6 +1451,11 @@  ctnetlink_change_conntrack(struct nf_conn *ct,
 			return err;
 	}
 #endif
+	if (cda[CTA_LABELS]) {
+		err = ctnetlink_attach_label(ct, cda);
+		if (err < 0)
+			return err;
+	}
 
 	return 0;
 }
@@ -1649,6 +1664,9 @@  ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb,
 			else
 				events = IPCT_NEW;
 
+			if (cda[CTA_LABELS] && ctnetlink_attach_label(ct, cda) == 0)
+				events |= (1 << IPCT_LABEL);
+
 			nf_conntrack_eventmask_report((1 << IPCT_REPLY) |
 						      (1 << IPCT_ASSURED) |
 						      (1 << IPCT_HELPER) |
@@ -1946,6 +1964,11 @@  ctnetlink_nfqueue_parse_ct(const struct nlattr *cda[], struct nf_conn *ct)
 		if (err < 0)
 			return err;
 	}
+	if (cda[CTA_LABELS]) {
+		err = ctnetlink_attach_label(ct, cda);
+		if (err < 0)
+			return err;
+	}
 #if defined(CONFIG_NF_CONNTRACK_MARK)
 	if (cda[CTA_MARK])
 		ct->mark = ntohl(nla_get_be32(cda[CTA_MARK]));