diff mbox

[net] netlink, mmap: transform mmap skb into full skb on taps

Message ID 20150911.124205.1992250802389107845.davem@davemloft.net
State RFC, archived
Delegated to: David Miller
Headers show

Commit Message

David Miller Sept. 11, 2015, 7:42 p.m. UTC
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 11 Sep 2015 12:25:45 +0200

> Already calling into skb_clone() is an issue itself, as the data
> area is user space buffer, and skb_clone() as well as skb_copy()
> access skb_shinfo() area. :/ So in that regard netlink mmap skbs are
> even further restrictive on what we can do than netlink large skbs.

Indeed, this is fatal.

So we'd still need something special like your
netlink_to_full_skb_clone to elide trying to touch the skb_shinfo
area.

I thought briefly about somehow cobbling up extra space in the ring
entries so we could have a real skb_shinfo() there, but that's illegal
too as the user could scribble all over it randomly while we interpret
the contents.  We don't own that memory.  So this doesn't work.

We could rename the clone_preserves_destructor and have it also mean
that the SKB lacks frags and skb_shinfo() should not be inspected.

Something like this:

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Comments

Daniel Borkmann Sept. 11, 2015, 8:35 p.m. UTC | #1
On 09/11/2015 09:42 PM, David Miller wrote:
> From: Daniel Borkmann <daniel@iogearbox.net>
> Date: Fri, 11 Sep 2015 12:25:45 +0200
>
>> Already calling into skb_clone() is an issue itself, as the data
>> area is user space buffer, and skb_clone() as well as skb_copy()
>> access skb_shinfo() area. :/ So in that regard netlink mmap skbs are
>> even further restrictive on what we can do than netlink large skbs.
>
> Indeed, this is fatal.
>
> So we'd still need something special like your
> netlink_to_full_skb_clone to elide trying to touch the skb_shinfo
> area.
>
> I thought briefly about somehow cobbling up extra space in the ring
> entries so we could have a real skb_shinfo() there, but that's illegal
> too as the user could scribble all over it randomly while we interpret
> the contents.  We don't own that memory.  So this doesn't work.

Yes, agreed.

> We could rename the clone_preserves_destructor and have it also mean
> that the SKB lacks frags and skb_shinfo() should not be inspected.
>
> Something like this:
[...]
> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
> index 2738d35..898c53d 100644
> --- a/include/linux/skbuff.h
> +++ b/include/linux/skbuff.h
[...]
> @@ -2220,7 +2221,8 @@ static inline void skb_orphan(struct sk_buff *skb)
>    */
>   static inline int skb_orphan_frags(struct sk_buff *skb, gfp_t gfp_mask)
>   {
> -	if (likely(!(skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY)))
> +	if (likely(!(skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) ||
> +		   skb->private_buffers))

(These two would need to be swapped.)

>   		return 0;
>   	return skb_copy_ubufs(skb, gfp_mask);
>   }
> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> index dad4dd3..54f9d6e 100644
> --- a/net/core/skbuff.c
> +++ b/net/core/skbuff.c
> @@ -825,7 +825,10 @@ static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb)
>   	n->hdr_len = skb->nohdr ? skb_headroom(skb) : skb->hdr_len;
>   	n->cloned = 1;
>   	n->nohdr = 0;
> -	n->destructor = NULL;
> +	if (!skb->private_buffers)
> +		n->destructor = NULL;
> +	else
> +		C(destructor);
>   	C(tail);
>   	C(end);
>   	C(head);

We would also have to conditionally skip the __skb_clone()'s ...

   atomic_inc(&(skb_shinfo(skb)->dataref));

Thus, the issue here is that while netlink_alloc_large_skb() and
netlink_ring_setup_skb() would set both skb->private_buffers = 1,
the large skb case would actually need to inspect dataref count
(which it also can legally do) to properly release the vmalloc'ed
area again, while the other case must not even touch it. So if I
see this correctly, it looks like it's unfortunately not possible
to combine the two cases in a single flag. :/

If there's a good case to burn this flag outside of netlink for e.g.
vmalloc backend memory on skbs, it could be solved like that, while
the mmap case be declared netlink's problem. ;) I currently don't
have a better idea than to copy these guys, hmmm.

[...]
> diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
> index 7f86d3b..523adac 100644
> --- a/net/netlink/af_netlink.c
> +++ b/net/netlink/af_netlink.c
> @@ -854,6 +855,14 @@ static void netlink_ring_set_copied(struct sock *sk, struct sk_buff *skb)
>   #define netlink_mmap_sendmsg(sk, msg, dst_portid, dst_group, scm)	0
>   #endif /* CONFIG_NETLINK_MMAP */
>
> +static bool skb_can_release_head(struct sk_buff *skb)
> +{
> +	if (!skb->cloned ||
> +	    !atomic_dec_return(&(skb_shinfo(skb)->dataref)))
> +		return true;
> +	return false;
> +}
> +
>   static void netlink_skb_destructor(struct sk_buff *skb)
>   {
>   #ifdef CONFIG_NETLINK_MMAP
> @@ -866,31 +875,35 @@ static void netlink_skb_destructor(struct sk_buff *skb)
[...]
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Miller Sept. 11, 2015, 9:34 p.m. UTC | #2
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 11 Sep 2015 22:35:08 +0200

> On 09/11/2015 09:42 PM, David Miller wrote:
>> @@ -2220,7 +2221,8 @@ static inline void skb_orphan(struct sk_buff
>> *skb)
>>    */
>>   static inline int skb_orphan_frags(struct sk_buff *skb, gfp_t
>>   gfp_mask)
>>   {
>> -	if (likely(!(skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY)))
>> +	if (likely(!(skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) ||
>> +		   skb->private_buffers))
> 
> (These two would need to be swapped.)

Right, good catch.

>> -	n->destructor = NULL;
>> +	if (!skb->private_buffers)
>> +		n->destructor = NULL;
>> +	else
>> +		C(destructor);
>>   	C(tail);
>>   	C(end);
>>   	C(head);
> 
> We would also have to conditionally skip the __skb_clone()'s ...
> 
>   atomic_inc(&(skb_shinfo(skb)->dataref));

Sigh, the solution was too good to be true I guess. :-/ That's right,
we can't touch skb_shinfo for mmap skbs.

Another approach would be to put the mmap user data into a page frag,
but that obviously has some costs associated with it.  However,
nothing in netlink is ready for fragged skbs yet.  It's the reason why
we have the large skb via vmalloc facility.

Long term fixing that is probably the way to go, but that would be an
enormous project.

So for now I'm going to apply your patch Daniel, thanks.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Daniel Borkmann Sept. 11, 2015, 10:18 p.m. UTC | #3
On 09/11/2015 11:34 PM, David Miller wrote:
...
> Another approach would be to put the mmap user data into a page frag,
> but that obviously has some costs associated with it.  However,
> nothing in netlink is ready for fragged skbs yet.  It's the reason why
> we have the large skb via vmalloc facility.
>
> Long term fixing that is probably the way to go, but that would be an
> enormous project.

Yeah, that could resolve it.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/include/linux/netlink.h b/include/linux/netlink.h
index 639e9b8..47d5875 100644
--- a/include/linux/netlink.h
+++ b/include/linux/netlink.h
@@ -97,22 +97,6 @@  int netlink_attachskb(struct sock *sk, struct sk_buff *skb,
 void netlink_detachskb(struct sock *sk, struct sk_buff *skb);
 int netlink_sendskb(struct sock *sk, struct sk_buff *skb);
 
-static inline struct sk_buff *
-netlink_skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
-{
-	struct sk_buff *nskb;
-
-	nskb = skb_clone(skb, gfp_mask);
-	if (!nskb)
-		return NULL;
-
-	/* This is a large skb, set destructor callback to release head */
-	if (is_vmalloc_addr(skb->head))
-		nskb->destructor = skb->destructor;
-
-	return nskb;
-}
-
 /*
  *	skb should fit one page. This choice is good for headerless malloc.
  *	But we should limit to 8K so that userspace does not have to
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 2738d35..898c53d 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -584,8 +584,9 @@  struct sk_buff {
 				fclone:2,
 				peeked:1,
 				head_frag:1,
-				xmit_more:1;
-	/* one bit hole */
+				xmit_more:1,
+				private_buffers:1;
+
 	kmemcheck_bitfield_end(flags1);
 
 	/* fields enclosed in headers_start/headers_end are copied
@@ -2220,7 +2221,8 @@  static inline void skb_orphan(struct sk_buff *skb)
  */
 static inline int skb_orphan_frags(struct sk_buff *skb, gfp_t gfp_mask)
 {
-	if (likely(!(skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY)))
+	if (likely(!(skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) ||
+		   skb->private_buffers))
 		return 0;
 	return skb_copy_ubufs(skb, gfp_mask);
 }
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index dad4dd3..54f9d6e 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -825,7 +825,10 @@  static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb)
 	n->hdr_len = skb->nohdr ? skb_headroom(skb) : skb->hdr_len;
 	n->cloned = 1;
 	n->nohdr = 0;
-	n->destructor = NULL;
+	if (!skb->private_buffers)
+		n->destructor = NULL;
+	else
+		C(destructor);
 	C(tail);
 	C(end);
 	C(head);
@@ -1675,6 +1678,9 @@  int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len)
 		to     += copy;
 	}
 
+	if (skb->private_buffers)
+		goto out;
+
 	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
 		int end;
 		skb_frag_t *f = &skb_shinfo(skb)->frags[i];
@@ -1720,7 +1726,7 @@  int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len)
 		}
 		start = end;
 	}
-
+out:
 	if (!len)
 		return 0;
 
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 6fcbd21..2ec5425 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -1075,7 +1075,7 @@  static void nl_fib_input(struct sk_buff *skb)
 	    nlmsg_len(nlh) < sizeof(*frn))
 		return;
 
-	skb = netlink_skb_clone(skb, GFP_KERNEL);
+	skb = skb_clone(skb, GFP_KERNEL);
 	if (!skb)
 		return;
 	nlh = nlmsg_hdr(skb);
diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c
index 70277b1..4c612481 100644
--- a/net/netfilter/nfnetlink.c
+++ b/net/netfilter/nfnetlink.c
@@ -291,7 +291,7 @@  static void nfnetlink_rcv_batch(struct sk_buff *skb, struct nlmsghdr *nlh,
 replay:
 	status = 0;
 
-	skb = netlink_skb_clone(oskb, GFP_KERNEL);
+	skb = skb_clone(oskb, GFP_KERNEL);
 	if (!skb)
 		return netlink_ack(oskb, nlh, -ENOMEM);
 
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 7f86d3b..523adac 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -719,6 +719,7 @@  static void netlink_ring_setup_skb(struct sk_buff *skb, struct sock *sk,
 	skb->end	= skb->tail + size;
 	skb->len	= 0;
 
+	skb->private_buffers = 1;
 	skb->destructor	= netlink_skb_destructor;
 	NETLINK_CB(skb).flags |= NETLINK_SKB_MMAPED;
 	NETLINK_CB(skb).sk = sk;
@@ -854,6 +855,14 @@  static void netlink_ring_set_copied(struct sock *sk, struct sk_buff *skb)
 #define netlink_mmap_sendmsg(sk, msg, dst_portid, dst_group, scm)	0
 #endif /* CONFIG_NETLINK_MMAP */
 
+static bool skb_can_release_head(struct sk_buff *skb)
+{
+	if (!skb->cloned ||
+	    !atomic_dec_return(&(skb_shinfo(skb)->dataref)))
+		return true;
+	return false;
+}
+
 static void netlink_skb_destructor(struct sk_buff *skb)
 {
 #ifdef CONFIG_NETLINK_MMAP
@@ -866,31 +875,35 @@  static void netlink_skb_destructor(struct sk_buff *skb)
 	 * the status. In the direction userspace to kernel, the status is
 	 * always reset here after the packet was processed and freed.
 	 */
-	if (netlink_skb_is_mmaped(skb)) {
-		hdr = netlink_mmap_hdr(skb);
-		sk = NETLINK_CB(skb).sk;
+	if (!netlink_skb_is_mmaped(skb))
+		goto not_mmaped;
 
-		if (NETLINK_CB(skb).flags & NETLINK_SKB_TX) {
-			netlink_set_status(hdr, NL_MMAP_STATUS_UNUSED);
-			ring = &nlk_sk(sk)->tx_ring;
-		} else {
-			if (!(NETLINK_CB(skb).flags & NETLINK_SKB_DELIVERED)) {
-				hdr->nm_len = 0;
-				netlink_set_status(hdr, NL_MMAP_STATUS_VALID);
-			}
-			ring = &nlk_sk(sk)->rx_ring;
-		}
+	if (!skb_can_release_head(skb))
+		goto clone_refs_remain;
 
-		WARN_ON(atomic_read(&ring->pending) == 0);
-		atomic_dec(&ring->pending);
-		sock_put(sk);
+	hdr = netlink_mmap_hdr(skb);
+	sk = NETLINK_CB(skb).sk;
 
-		skb->head = NULL;
+	if (NETLINK_CB(skb).flags & NETLINK_SKB_TX) {
+		netlink_set_status(hdr, NL_MMAP_STATUS_UNUSED);
+		ring = &nlk_sk(sk)->tx_ring;
+	} else {
+		if (!(NETLINK_CB(skb).flags & NETLINK_SKB_DELIVERED)) {
+			hdr->nm_len = 0;
+			netlink_set_status(hdr, NL_MMAP_STATUS_VALID);
+		}
+		ring = &nlk_sk(sk)->rx_ring;
 	}
+
+	WARN_ON(atomic_read(&ring->pending) == 0);
+	atomic_dec(&ring->pending);
+	sock_put(sk);
+clone_refs_remain:
+	skb->head = NULL;
+not_mmaped:
 #endif
 	if (is_vmalloc_addr(skb->head)) {
-		if (!skb->cloned ||
-		    !atomic_dec_return(&(skb_shinfo(skb)->dataref)))
+		if (skb_can_release_head(skb))
 			vfree(skb->head);
 
 		skb->head = NULL;
@@ -1669,6 +1682,7 @@  static struct sk_buff *netlink_alloc_large_skb(unsigned int size,
 	if (data == NULL)
 		return NULL;
 
+	skb->private_buffers = 1;
 	skb = __build_skb(data, size);
 	if (skb == NULL)
 		vfree(data);