diff mbox

[nf-next,2/2] netfilter: x_tables: fix cgroup's NF_INET_LOCAL_IN sk lookups

Message ID 20fdc704558880831cbbaa8bba5e4855591cd4ba.1427209409.git.daniel@iogearbox.net
State Not Applicable
Delegated to: Pablo Neira
Headers show

Commit Message

Daniel Borkmann March 24, 2015, 3:30 p.m. UTC
While originally only being intended for outgoing traffic, commit
a00e76349f35 ("netfilter: x_tables: allow to use cgroup match for
LOCAL_IN nf hooks") enabled xt_cgroups for the NF_INET_LOCAL_IN hook
as well, in order to allow for nfacct accounting.

This basically was under the assumption that socket early demux will
resolve it. It's correct that demux happens after PRE_ROUTING, but
before LOCAL_IN.

However, that as-is only partially works, i.e. it works for the case
of established TCP and connected UDP sockets when early demux is
enabled, but not for various other ingress scenarios e.g. unconnected
UDP, request sockets, etc.

Instead of reverting commit a00e76349f35, I think it's worth to fix
it up as there are applications requiring xt_cgroup to match on
ingress and egress side. In order to do so, we need to perform a
full lookup on skb->sk (ingress) miss, similarly as being done in
xt_socket.

Therefore, we need to make use of shared helpers xt_sk_lookup() and
xt_sk_lookup6(). Thanks to Daniel for the report and also additional
testing.

Reported-by: Daniel Mack <daniel@zonque.org>
Fixes: a00e76349f35 ("netfilter: x_tables: allow to use cgroup match for LOCAL_IN nf hooks")
Reference: http://thread.gmane.org/gmane.linux.network/355527
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Tested-by: Daniel Mack <daniel@zonque.org>
Cc: Alexey Perevalov <a.perevalov@samsung.com>
Cc: Florian Westphal <fw@strlen.de>
---
 net/netfilter/Kconfig     |  5 +++
 net/netfilter/xt_cgroup.c | 86 +++++++++++++++++++++++++++++++++++++----------
 2 files changed, 73 insertions(+), 18 deletions(-)

Comments

Pablo Neira Ayuso March 25, 2015, 4:03 p.m. UTC | #1
On Tue, Mar 24, 2015 at 04:30:29PM +0100, Daniel Borkmann wrote:
> diff --git a/net/netfilter/xt_cgroup.c b/net/netfilter/xt_cgroup.c
> index 7198d66..cd2468d 100644
> --- a/net/netfilter/xt_cgroup.c
> +++ b/net/netfilter/xt_cgroup.c
> @@ -16,8 +16,11 @@
>  #include <linux/module.h>
>  #include <linux/netfilter/x_tables.h>
>  #include <linux/netfilter/xt_cgroup.h>
> +
>  #include <net/sock.h>
>  
> +#include "xt_sk_helper.h"
> +
>  MODULE_LICENSE("GPL");
>  MODULE_AUTHOR("Daniel Borkmann <dborkman@redhat.com>");
>  MODULE_DESCRIPTION("Xtables: process control group matching");
> @@ -34,38 +37,85 @@ static int cgroup_mt_check(const struct xt_mtchk_param *par)
>  	return 0;
>  }
>  
> -static bool
> -cgroup_mt(const struct sk_buff *skb, struct xt_action_param *par)
> +static bool cgroup_mt(const struct sk_buff *skb,
> +		      const struct xt_action_param *par,
> +		      struct sock *(*cgroup_mt_slow)(const struct sk_buff *skb,
> +						     const struct net_device *indev))
>  {
>  	const struct xt_cgroup_info *info = par->matchinfo;
> +	struct sock *sk = skb->sk;
> +	u32 sk_classid;
> +
> +	if (sk) {
> +		sk_classid = sk->sk_classid;
> +	} else {
> +		if (par->in != NULL)
> +			sk = cgroup_mt_slow(skb, par->in);

Is this working with timewait sock?

> +		if (sk == NULL)
> +			return false;
> +
> +		sk_classid = sk->sk_classid;
> +		sock_gen_put(sk);
> +	}
> +
> +	return (info->id == sk_classid) ^ info->invert;
> +}
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Daniel Borkmann March 25, 2015, 4:39 p.m. UTC | #2
On 03/25/2015 05:03 PM, Pablo Neira Ayuso wrote:
> On Tue, Mar 24, 2015 at 04:30:29PM +0100, Daniel Borkmann wrote:
>> diff --git a/net/netfilter/xt_cgroup.c b/net/netfilter/xt_cgroup.c
>> index 7198d66..cd2468d 100644
>> --- a/net/netfilter/xt_cgroup.c
>> +++ b/net/netfilter/xt_cgroup.c
>> @@ -16,8 +16,11 @@
>>   #include <linux/module.h>
>>   #include <linux/netfilter/x_tables.h>
>>   #include <linux/netfilter/xt_cgroup.h>
>> +
>>   #include <net/sock.h>
>>
>> +#include "xt_sk_helper.h"
>> +
>>   MODULE_LICENSE("GPL");
>>   MODULE_AUTHOR("Daniel Borkmann <dborkman@redhat.com>");
>>   MODULE_DESCRIPTION("Xtables: process control group matching");
>> @@ -34,38 +37,85 @@ static int cgroup_mt_check(const struct xt_mtchk_param *par)
>>   	return 0;
>>   }
>>
>> -static bool
>> -cgroup_mt(const struct sk_buff *skb, struct xt_action_param *par)
>> +static bool cgroup_mt(const struct sk_buff *skb,
>> +		      const struct xt_action_param *par,
>> +		      struct sock *(*cgroup_mt_slow)(const struct sk_buff *skb,
>> +						     const struct net_device *indev))
>>   {
>>   	const struct xt_cgroup_info *info = par->matchinfo;
>> +	struct sock *sk = skb->sk;
>> +	u32 sk_classid;
>> +
>> +	if (sk) {
>> +		sk_classid = sk->sk_classid;
>> +	} else {
>> +		if (par->in != NULL)
>> +			sk = cgroup_mt_slow(skb, par->in);
>
> Is this working with timewait sock?

Yes, all socket objects that are allocated (sk_alloc()) get a
sk_classid of the current task. Given that both share the same
lookup handler, we don't ignore them here as some xt_socket
flags could after the lookup optionally do.

>> +		if (sk == NULL)
>> +			return false;
>> +
>> +		sk_classid = sk->sk_classid;
>> +		sock_gen_put(sk);
>> +	}
>> +
>> +	return (info->id == sk_classid) ^ info->invert;
>> +}
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Pablo Neira Ayuso March 25, 2015, 5:17 p.m. UTC | #3
On Wed, Mar 25, 2015 at 05:39:04PM +0100, Daniel Borkmann wrote:
> On 03/25/2015 05:03 PM, Pablo Neira Ayuso wrote:
> >On Tue, Mar 24, 2015 at 04:30:29PM +0100, Daniel Borkmann wrote:
> >>diff --git a/net/netfilter/xt_cgroup.c b/net/netfilter/xt_cgroup.c
> >>index 7198d66..cd2468d 100644
> >>--- a/net/netfilter/xt_cgroup.c
> >>+++ b/net/netfilter/xt_cgroup.c
> >>@@ -16,8 +16,11 @@
> >>  #include <linux/module.h>
> >>  #include <linux/netfilter/x_tables.h>
> >>  #include <linux/netfilter/xt_cgroup.h>
> >>+
> >>  #include <net/sock.h>
> >>
> >>+#include "xt_sk_helper.h"
> >>+
> >>  MODULE_LICENSE("GPL");
> >>  MODULE_AUTHOR("Daniel Borkmann <dborkman@redhat.com>");
> >>  MODULE_DESCRIPTION("Xtables: process control group matching");
> >>@@ -34,38 +37,85 @@ static int cgroup_mt_check(const struct xt_mtchk_param *par)
> >>  	return 0;
> >>  }
> >>
> >>-static bool
> >>-cgroup_mt(const struct sk_buff *skb, struct xt_action_param *par)
> >>+static bool cgroup_mt(const struct sk_buff *skb,
> >>+		      const struct xt_action_param *par,
> >>+		      struct sock *(*cgroup_mt_slow)(const struct sk_buff *skb,
> >>+						     const struct net_device *indev))
> >>  {
> >>  	const struct xt_cgroup_info *info = par->matchinfo;
> >>+	struct sock *sk = skb->sk;
> >>+	u32 sk_classid;
> >>+
> >>+	if (sk) {
> >>+		sk_classid = sk->sk_classid;
> >>+	} else {
> >>+		if (par->in != NULL)
> >>+			sk = cgroup_mt_slow(skb, par->in);
> >
> >Is this working with timewait sock?
> 
> Yes, all socket objects that are allocated (sk_alloc()) get a
> sk_classid of the current task. Given that both share the same
> lookup handler, we don't ignore them here as some xt_socket
> flags could after the lookup optionally do.

I mean, we may get a packet from the input path while in TIME_WAIT, and
sk will be actually a inet_timewait_sock, which has a different
layout (no sk_classid).

> >>+		if (sk == NULL)
> >>+			return false;
> >>+
> >>+		sk_classid = sk->sk_classid;
> >>+		sock_gen_put(sk);
> >>+	}
> >>+
> >>+	return (info->id == sk_classid) ^ info->invert;
> >>+}
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Daniel Borkmann March 25, 2015, 5:27 p.m. UTC | #4
On 03/25/2015 06:17 PM, Pablo Neira Ayuso wrote:
...
> I mean, we may get a packet from the input path while in TIME_WAIT, and
> sk will be actually a inet_timewait_sock, which has a different
> layout (no sk_classid).

Sorry, you are correct, thanks for the hint. We would actually
need to test if we deal with a full socket, iow sk_fullsock(sk).
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Pablo Neira Ayuso March 25, 2015, 8:26 p.m. UTC | #5
On Tue, Mar 24, 2015 at 04:30:29PM +0100, Daniel Borkmann wrote:
> While originally only being intended for outgoing traffic, commit
> a00e76349f35 ("netfilter: x_tables: allow to use cgroup match for
> LOCAL_IN nf hooks") enabled xt_cgroups for the NF_INET_LOCAL_IN hook
> as well, in order to allow for nfacct accounting.
> 
> This basically was under the assumption that socket early demux will
> resolve it. It's correct that demux happens after PRE_ROUTING, but
> before LOCAL_IN.
> 
> However, that as-is only partially works, i.e. it works for the case
> of established TCP and connected UDP sockets when early demux is
> enabled, but not for various other ingress scenarios e.g. unconnected
> UDP, request sockets, etc.
>
> Instead of reverting commit a00e76349f35, I think it's worth to fix
> it up as there are applications requiring xt_cgroup to match on
> ingress and egress side. In order to do so, we need to perform a
> full lookup on skb->sk (ingress) miss, similarly as being done in
> xt_socket.
> 
> Therefore, we need to make use of shared helpers xt_sk_lookup() and
> xt_sk_lookup6(). Thanks to Daniel for the report and also additional
> testing.

So this is basically needed when early demux is disabled?

This is a rather large rework, I would like to know what scenarios
we're not currently catching with the existing code.
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Daniel Borkmann March 25, 2015, 9:34 p.m. UTC | #6
On 03/25/2015 09:26 PM, Pablo Neira Ayuso wrote:
> On Tue, Mar 24, 2015 at 04:30:29PM +0100, Daniel Borkmann wrote:
>> While originally only being intended for outgoing traffic, commit
>> a00e76349f35 ("netfilter: x_tables: allow to use cgroup match for
>> LOCAL_IN nf hooks") enabled xt_cgroups for the NF_INET_LOCAL_IN hook
>> as well, in order to allow for nfacct accounting.
>>
>> This basically was under the assumption that socket early demux will
>> resolve it. It's correct that demux happens after PRE_ROUTING, but
>> before LOCAL_IN.
>>
>> However, that as-is only partially works, i.e. it works for the case
>> of established TCP and connected UDP sockets when early demux is
>> enabled, but not for various other ingress scenarios e.g. unconnected
>> UDP, request sockets, etc.
>>
>> Instead of reverting commit a00e76349f35, I think it's worth to fix
>> it up as there are applications requiring xt_cgroup to match on
>> ingress and egress side. In order to do so, we need to perform a
>> full lookup on skb->sk (ingress) miss, similarly as being done in
>> xt_socket.
>>
>> Therefore, we need to make use of shared helpers xt_sk_lookup() and
>> xt_sk_lookup6(). Thanks to Daniel for the report and also additional
>> testing.
>
> So this is basically needed when early demux is disabled?
>
> This is a rather large rework, I would like to know what scenarios
> we're not currently catching with the existing code.

Hm, perhaps Daniel can elaborate better, what I have seen in my
testing when xt_cgroup fails to match the cgroup on ingress traffic
is i) early demux sysctl disabled, ii) udp on unconnected sockets
(which I understand is the majority of udp traffic), iii) tcp and
udp (any kind) on localhost communications. Daniel's original report
can be found here [1].

Thanks,
Daniel

   [1] http://thread.gmane.org/gmane.linux.network/355527
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Daniel Mack March 25, 2015, 9:54 p.m. UTC | #7
On 03/25/2015 10:34 PM, Daniel Borkmann wrote:
> On 03/25/2015 09:26 PM, Pablo Neira Ayuso wrote:

>> So this is basically needed when early demux is disabled?
>>
>> This is a rather large rework, I would like to know what scenarios
>> we're not currently catching with the existing code.
> 
> Hm, perhaps Daniel can elaborate better, what I have seen in my
> testing when xt_cgroup fails to match the cgroup on ingress traffic
> is i) early demux sysctl disabled, ii) udp on unconnected sockets
> (which I understand is the majority of udp traffic), iii) tcp and
> udp (any kind) on localhost communications. Daniel's original report
> can be found here [1].

Currently, ingress matching fails if the xt_cgroup module's match
callback is called with skb->sk == NULL, which is the case in the
scenarios described above. Also, according to Cong, this is as well
always the case if the ingress network device is 'lo'.

We want to use xt_cgroup to realize a per-application firewall for both
filtering and accounting. For this, being able to catch every network
packet that is destined for or originated by a task that is assigned to
a certain net_cls CGroup controller is essential. Also, the match has to
be effective regardless of the network interface in use.

In my tests, Daniel's patches work perfectly fine.


Thanks,
Daniel

--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index 971cd75..044bd22 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -960,8 +960,13 @@  config NETFILTER_XT_MATCH_BPF
 
 config NETFILTER_XT_MATCH_CGROUP
 	tristate '"control group" match support'
+	depends on NETFILTER_XTABLES
 	depends on NETFILTER_ADVANCED
+	depends on !NF_CONNTRACK || NF_CONNTRACK
+	depends on (IPV6 || IPV6=n)
 	depends on CGROUPS
+	select NF_DEFRAG_IPV4
+	select NF_DEFRAG_IPV6 if IP6_NF_IPTABLES
 	select CGROUP_NET_CLASSID
 	---help---
 	Socket/process control group matching allows you to match locally
diff --git a/net/netfilter/xt_cgroup.c b/net/netfilter/xt_cgroup.c
index 7198d66..cd2468d 100644
--- a/net/netfilter/xt_cgroup.c
+++ b/net/netfilter/xt_cgroup.c
@@ -16,8 +16,11 @@ 
 #include <linux/module.h>
 #include <linux/netfilter/x_tables.h>
 #include <linux/netfilter/xt_cgroup.h>
+
 #include <net/sock.h>
 
+#include "xt_sk_helper.h"
+
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Daniel Borkmann <dborkman@redhat.com>");
 MODULE_DESCRIPTION("Xtables: process control group matching");
@@ -34,38 +37,85 @@  static int cgroup_mt_check(const struct xt_mtchk_param *par)
 	return 0;
 }
 
-static bool
-cgroup_mt(const struct sk_buff *skb, struct xt_action_param *par)
+static bool cgroup_mt(const struct sk_buff *skb,
+		      const struct xt_action_param *par,
+		      struct sock *(*cgroup_mt_slow)(const struct sk_buff *skb,
+						     const struct net_device *indev))
 {
 	const struct xt_cgroup_info *info = par->matchinfo;
+	struct sock *sk = skb->sk;
+	u32 sk_classid;
+
+	if (sk) {
+		sk_classid = sk->sk_classid;
+	} else {
+		if (par->in != NULL)
+			sk = cgroup_mt_slow(skb, par->in);
+		if (sk == NULL)
+			return false;
+
+		sk_classid = sk->sk_classid;
+		sock_gen_put(sk);
+	}
+
+	return (info->id == sk_classid) ^ info->invert;
+}
 
-	if (skb->sk == NULL)
-		return false;
+static bool
+cgroup_mt_v4(const struct sk_buff *skb, struct xt_action_param *par)
+{
+	return cgroup_mt(skb, par, xt_sk_lookup);
+}
 
-	return (info->id == skb->sk->sk_classid) ^ info->invert;
+#ifdef XT_HAVE_IPV6
+static bool
+cgroup_mt_v6(const struct sk_buff *skb, struct xt_action_param *par)
+{
+	return cgroup_mt(skb, par, xt_sk_lookup6);
 }
+#endif
 
-static struct xt_match cgroup_mt_reg __read_mostly = {
-	.name       = "cgroup",
-	.revision   = 0,
-	.family     = NFPROTO_UNSPEC,
-	.checkentry = cgroup_mt_check,
-	.match      = cgroup_mt,
-	.matchsize  = sizeof(struct xt_cgroup_info),
-	.me         = THIS_MODULE,
-	.hooks      = (1 << NF_INET_LOCAL_OUT) |
-		      (1 << NF_INET_POST_ROUTING) |
-		      (1 << NF_INET_LOCAL_IN),
+static struct xt_match cgroup_mt_reg[] __read_mostly = {
+	{
+		.name       = "cgroup",
+		.revision   = 0,
+		.family     = NFPROTO_IPV4,
+		.checkentry = cgroup_mt_check,
+		.match      = cgroup_mt_v4,
+		.matchsize  = sizeof(struct xt_cgroup_info),
+		.me         = THIS_MODULE,
+		.hooks      = (1 << NF_INET_LOCAL_OUT) |
+			      (1 << NF_INET_POST_ROUTING) |
+			      (1 << NF_INET_LOCAL_IN),
+	},
+#ifdef XT_HAVE_IPV6
+	{
+		.name       = "cgroup",
+		.revision   = 0,
+		.family     = NFPROTO_IPV6,
+		.checkentry = cgroup_mt_check,
+		.match      = cgroup_mt_v6,
+		.matchsize  = sizeof(struct xt_cgroup_info),
+		.me         = THIS_MODULE,
+		.hooks      = (1 << NF_INET_LOCAL_OUT) |
+			      (1 << NF_INET_POST_ROUTING) |
+			      (1 << NF_INET_LOCAL_IN),
+	}
+#endif
 };
 
 static int __init cgroup_mt_init(void)
 {
-	return xt_register_match(&cgroup_mt_reg);
+	nf_defrag_ipv4_enable();
+#ifdef XT_HAVE_IPV6
+	nf_defrag_ipv6_enable();
+#endif
+	return xt_register_matches(cgroup_mt_reg, ARRAY_SIZE(cgroup_mt_reg));
 }
 
 static void __exit cgroup_mt_exit(void)
 {
-	xt_unregister_match(&cgroup_mt_reg);
+	xt_unregister_matches(cgroup_mt_reg, ARRAY_SIZE(cgroup_mt_reg));
 }
 
 module_init(cgroup_mt_init);