diff mbox

[v7,5/6] net: ipv4, ipv6: run cgroup eBPF egress programs

Message ID 1477390454-12553-6-git-send-email-daniel@zonque.org
State Changes Requested, archived
Delegated to: David Miller
Headers show

Commit Message

Daniel Mack Oct. 25, 2016, 10:14 a.m. UTC
If the cgroup associated with the receiving socket has an eBPF
programs installed, run them from ip_output(), ip6_output() and
ip_mc_output().

eBPF programs used in this context are expected to either return 1 to
let the packet pass, or != 1 to drop them. The programs have access to
the skb through bpf_skb_load_bytes(), and the payload starts at the
network headers (L3).

Note that cgroup_bpf_run_filter() is stubbed out as static inline nop
for !CONFIG_CGROUP_BPF, and is otherwise guarded by a static key if
the feature is unused.

Signed-off-by: Daniel Mack <daniel@zonque.org>
Acked-by: Alexei Starovoitov <ast@kernel.org>
---
 net/ipv4/ip_output.c  | 17 +++++++++++++++++
 net/ipv6/ip6_output.c |  9 +++++++++
 2 files changed, 26 insertions(+)

Comments

David Miller Oct. 31, 2016, 4:40 p.m. UTC | #1
From: Daniel Mack <daniel@zonque.org>
Date: Tue, 25 Oct 2016 12:14:13 +0200

> @@ -312,6 +314,13 @@ int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb)
>  	skb->dev = dev;
>  	skb->protocol = htons(ETH_P_IP);
>  
> +	ret = cgroup_bpf_run_filter(sk_to_full_sk(sk), skb,
> +				    BPF_CGROUP_INET_EGRESS);
> +	if (ret) {
> +		kfree_skb(skb);
> +		return ret;
> +	}
> +
>  	/*
>  	 *	Multicasts are looped back for other local users
>  	 */
> @@ -364,12 +373,20 @@ int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb)
>  int ip_output(struct net *net, struct sock *sk, struct sk_buff *skb)
>  {
>  	struct net_device *dev = skb_dst(skb)->dev;
> +	int ret;
>  
>  	IP_UPD_PO_STATS(net, IPSTATS_MIB_OUT, skb->len);
>  
>  	skb->dev = dev;
>  	skb->protocol = htons(ETH_P_IP);
>  
> +	ret = cgroup_bpf_run_filter(sk_to_full_sk(sk), skb,
> +				    BPF_CGROUP_INET_EGRESS);
> +	if (ret) {
> +		kfree_skb(skb);
> +		return ret;
> +	}
> +
>  	return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING,
>  			    net, sk, skb, NULL, dev,
>  			    ip_finish_output,

The "sk" here is not necessarily the application socket.  It could be
a UDP tunnel socket or similar encapsulation object.

"skb->sk" is always the application socket, so is probably what you
need to pass down into the cgroup bpf run filter hook.
Daniel Borkmann Nov. 2, 2016, 1:17 a.m. UTC | #2
On 10/31/2016 05:40 PM, David Miller wrote:
> From: Daniel Mack <daniel@zonque.org>
> Date: Tue, 25 Oct 2016 12:14:13 +0200
>
>> @@ -312,6 +314,13 @@ int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb)
>>   	skb->dev = dev;
>>   	skb->protocol = htons(ETH_P_IP);
>>
>> +	ret = cgroup_bpf_run_filter(sk_to_full_sk(sk), skb,
>> +				    BPF_CGROUP_INET_EGRESS);
>> +	if (ret) {
>> +		kfree_skb(skb);
>> +		return ret;
>> +	}
>> +
>>   	/*
>>   	 *	Multicasts are looped back for other local users
>>   	 */
>> @@ -364,12 +373,20 @@ int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb)
>>   int ip_output(struct net *net, struct sock *sk, struct sk_buff *skb)
>>   {
>>   	struct net_device *dev = skb_dst(skb)->dev;
>> +	int ret;
>>
>>   	IP_UPD_PO_STATS(net, IPSTATS_MIB_OUT, skb->len);
>>
>>   	skb->dev = dev;
>>   	skb->protocol = htons(ETH_P_IP);
>>
>> +	ret = cgroup_bpf_run_filter(sk_to_full_sk(sk), skb,
>> +				    BPF_CGROUP_INET_EGRESS);
>> +	if (ret) {
>> +		kfree_skb(skb);
>> +		return ret;
>> +	}
>> +
>>   	return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING,
>>   			    net, sk, skb, NULL, dev,
>>   			    ip_finish_output,
>
> The "sk" here is not necessarily the application socket.  It could be
> a UDP tunnel socket or similar encapsulation object.
>
> "skb->sk" is always the application socket, so is probably what you
> need to pass down into the cgroup bpf run filter hook.

Wouldn't that mean however, when you go through stacked devices that
you'd run the same eBPF cgroup program for skb->sk multiple times?
(Except for things like crossing netns due to the skb_orphan() there.)

For example, when you're doing accounting through this facility, then
TX part is counted X times instead of ideally just once, and for RX
afaik, we seem to short-cut sk_filter_trim_cap() on the encaps anyway.

I guess one could work around this by marking the skb with a verdict
once it was accounted for the first time, either from within eBPF prog
or from stack side. For eBPF we'd need to implement our own .is_valid_access()
callback for the verifier to allow writing into skb meta data like skb->mark
(sk filter can only write into cb[] for passing data between tail calls)
plus making sure in either case that no other user is mangling that flag
further on; hmm, probably rather unintuitive and fragile.

Unless I'm missing something obvious, I would currently think that just
passing "sk" might be more correct.
diff mbox

Patch

diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 05d1058..ee4b249 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -74,6 +74,7 @@ 
 #include <net/checksum.h>
 #include <net/inetpeer.h>
 #include <net/lwtunnel.h>
+#include <linux/bpf-cgroup.h>
 #include <linux/igmp.h>
 #include <linux/netfilter_ipv4.h>
 #include <linux/netfilter_bridge.h>
@@ -303,6 +304,7 @@  int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 {
 	struct rtable *rt = skb_rtable(skb);
 	struct net_device *dev = rt->dst.dev;
+	int ret;
 
 	/*
 	 *	If the indicated interface is up and running, send the packet.
@@ -312,6 +314,13 @@  int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 	skb->dev = dev;
 	skb->protocol = htons(ETH_P_IP);
 
+	ret = cgroup_bpf_run_filter(sk_to_full_sk(sk), skb,
+				    BPF_CGROUP_INET_EGRESS);
+	if (ret) {
+		kfree_skb(skb);
+		return ret;
+	}
+
 	/*
 	 *	Multicasts are looped back for other local users
 	 */
@@ -364,12 +373,20 @@  int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 int ip_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 {
 	struct net_device *dev = skb_dst(skb)->dev;
+	int ret;
 
 	IP_UPD_PO_STATS(net, IPSTATS_MIB_OUT, skb->len);
 
 	skb->dev = dev;
 	skb->protocol = htons(ETH_P_IP);
 
+	ret = cgroup_bpf_run_filter(sk_to_full_sk(sk), skb,
+				    BPF_CGROUP_INET_EGRESS);
+	if (ret) {
+		kfree_skb(skb);
+		return ret;
+	}
+
 	return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING,
 			    net, sk, skb, NULL, dev,
 			    ip_finish_output,
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 6001e78..1947026 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -39,6 +39,7 @@ 
 #include <linux/module.h>
 #include <linux/slab.h>
 
+#include <linux/bpf-cgroup.h>
 #include <linux/netfilter.h>
 #include <linux/netfilter_ipv6.h>
 
@@ -143,6 +144,7 @@  int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 {
 	struct net_device *dev = skb_dst(skb)->dev;
 	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
+	int ret;
 
 	if (unlikely(idev->cnf.disable_ipv6)) {
 		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
@@ -150,6 +152,13 @@  int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 		return 0;
 	}
 
+	ret = cgroup_bpf_run_filter(sk_to_full_sk(sk), skb,
+				    BPF_CGROUP_INET_EGRESS);
+	if (ret) {
+		kfree_skb(skb);
+		return ret;
+	}
+
 	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
 			    net, sk, skb, NULL, dev,
 			    ip6_finish_output,