diff mbox

[PATCHv2,net-next] netlink: allow large data transfers from user-space

Message ID 1370277599-27072-1-git-send-email-pablo@netfilter.org
State Changes Requested, archived
Headers show

Commit Message

Pablo Neira Ayuso June 3, 2013, 4:39 p.m. UTC
I can hit ENOBUFS in the sendmsg() path with a large batch that is
composed of many netlink messages. Here that limit is 8 MBytes of
skbuff data area as kmalloc does not manage to get more than that.

While discussing atomic rule-set for nftables with Patrick McHardy,
we decided to put all rule-set updates that need to be applied
atomically in one single batch to simplify the existing approach.
However, as explained above, the existing netlink code limits us
to a maximum of ~20000 rules that fit in one single batch without
hitting ENOBUFS. iptables does not have such limitation as it is
using vmalloc.

This patch adds netlink_alloc_large_skb() which is only used in
the netlink_sendmsg() path. It uses alloc_skb if the memory
requested is <= one memory page, that should be the common case
for most subsystems, else vmalloc for higher memory allocations.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
v1: initial version
v2: Use NLMSG_GOODSIZE instead of PAGE_SIZE, suggested by Eric Dumazet.

 net/netlink/af_netlink.c |   37 +++++++++++++++++++++++++++++++++++--
 1 file changed, 35 insertions(+), 2 deletions(-)

Comments

Patrick McHardy June 3, 2013, 5:01 p.m. UTC | #1
On Mon, Jun 03, 2013 at 06:39:59PM +0200, Pablo Neira Ayuso wrote:
> I can hit ENOBUFS in the sendmsg() path with a large batch that is
> composed of many netlink messages. Here that limit is 8 MBytes of
> skbuff data area as kmalloc does not manage to get more than that.
> 
> While discussing atomic rule-set for nftables with Patrick McHardy,
> we decided to put all rule-set updates that need to be applied
> atomically in one single batch to simplify the existing approach.
> However, as explained above, the existing netlink code limits us
> to a maximum of ~20000 rules that fit in one single batch without
> hitting ENOBUFS. iptables does not have such limitation as it is
> using vmalloc.
> 
> This patch adds netlink_alloc_large_skb() which is only used in
> the netlink_sendmsg() path. It uses alloc_skb if the memory
> requested is <= one memory page, that should be the common case
> for most subsystems, else vmalloc for higher memory allocations.

I know I suggested to do this - just wondering right now, how will we
indiciate to userspace that a change has been applied atomically when
sending notifications? Not sure whether it matters unless userspace
will be able to get a dump while we're in the middle of updating
the ruleset. I guess that won't be possible, right?

> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
> ---
> v1: initial version
> v2: Use NLMSG_GOODSIZE instead of PAGE_SIZE, suggested by Eric Dumazet.
> 
>  net/netlink/af_netlink.c |   37 +++++++++++++++++++++++++++++++++++--
>  1 file changed, 35 insertions(+), 2 deletions(-)
> 
> diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
> index 12ac6b4..7c71d07 100644
> --- a/net/netlink/af_netlink.c
> +++ b/net/netlink/af_netlink.c
> @@ -750,6 +750,10 @@ static void netlink_skb_destructor(struct sk_buff *skb)
>  		skb->data = NULL;
>  	}
>  #endif
> +	if (is_vmalloc_addr(skb->head)) {
> +		vfree(skb->head);
> +		skb->data = NULL;
> +	}
>  	if (skb->sk != NULL)
>  		sock_rfree(skb);
>  }
> @@ -1420,6 +1424,35 @@ struct sock *netlink_getsockbyfilp(struct file *filp)
>  	return sock;
>  }
>  
> +static struct sk_buff *netlink_alloc_large_skb(unsigned int size)
> +{
> +	struct sk_buff *skb;
> +	void *data;
> +
> +	if (size <= NLMSG_GOODSIZE)
> +		return alloc_skb(size, GFP_KERNEL);
> +
> +	skb = alloc_skb_head(GFP_KERNEL);
> +	if (skb == NULL)
> +		return NULL;
> +
> +	data = vmalloc(size);
> +	if (data == NULL)
> +		goto err;
> +
> +	skb->head	= data;
> +	skb->data	= data;
> +	skb_reset_tail_pointer(skb);
> +	skb->end	= skb->tail + size;
> +	skb->len	= 0;
> +	skb->destructor = netlink_skb_destructor;
> +
> +	return skb;
> +err:
> +	kfree_skb(skb);
> +	return NULL;
> +}
> +
>  /*
>   * Attach a skb to a netlink socket.
>   * The caller must hold a reference to the destination socket. On error, the
> @@ -1510,7 +1543,7 @@ static struct sk_buff *netlink_trim(struct sk_buff *skb, gfp_t allocation)
>  		return skb;
>  
>  	delta = skb->end - skb->tail;
> -	if (delta * 2 < skb->truesize)
> +	if (is_vmalloc_addr(skb->head) || delta * 2 < skb->truesize)
>  		return skb;
>  
>  	if (skb_shared(skb)) {
> @@ -2096,7 +2129,7 @@ static int netlink_sendmsg(struct kiocb *kiocb, struct socket *sock,
>  	if (len > sk->sk_sndbuf - 32)
>  		goto out;
>  	err = -ENOBUFS;
> -	skb = alloc_skb(len, GFP_KERNEL);
> +	skb = netlink_alloc_large_skb(len);
>  	if (skb == NULL)
>  		goto out;
>  
> -- 
> 1.7.10.4
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eric Dumazet June 3, 2013, 5:12 p.m. UTC | #2
On Mon, 2013-06-03 at 18:39 +0200, Pablo Neira Ayuso wrote:

> +	if (is_vmalloc_addr(skb->head)) {
> +		vfree(skb->head);
> +		skb->data = NULL;
> +	}
>  

You probably meant :

	skb->head = NULL;

Now, cross our fingers we do not skb_orphan() these skb too early :)



--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Pablo Neira Ayuso June 3, 2013, 5:29 p.m. UTC | #3
Hi Patrick!

On Mon, Jun 03, 2013 at 07:01:37PM +0200, Patrick McHardy wrote:
> On Mon, Jun 03, 2013 at 06:39:59PM +0200, Pablo Neira Ayuso wrote:
> > I can hit ENOBUFS in the sendmsg() path with a large batch that is
> > composed of many netlink messages. Here that limit is 8 MBytes of
> > skbuff data area as kmalloc does not manage to get more than that.
> > 
> > While discussing atomic rule-set for nftables with Patrick McHardy,
> > we decided to put all rule-set updates that need to be applied
> > atomically in one single batch to simplify the existing approach.
> > However, as explained above, the existing netlink code limits us
> > to a maximum of ~20000 rules that fit in one single batch without
> > hitting ENOBUFS. iptables does not have such limitation as it is
> > using vmalloc.
> > 
> > This patch adds netlink_alloc_large_skb() which is only used in
> > the netlink_sendmsg() path. It uses alloc_skb if the memory
> > requested is <= one memory page, that should be the common case
> > for most subsystems, else vmalloc for higher memory allocations.
> 
> I know I suggested to do this - just wondering right now, how will
> we indiciate to userspace that a change has been applied atomically
> when sending notifications? Not sure whether it matters unless
> userspace will be able to get a dump while we're in the middle of
> updating the ruleset. I guess that won't be possible, right?

Userspace gets dump messages with the NLM_F_DUMP_INTR flag set in case
of interference, so it knows it has to retry the dump to get a fresh
rule-set. The current nftables code does not work that way, it needs a
small patch I have here though.

Regards.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Pablo Neira Ayuso June 3, 2013, 5:41 p.m. UTC | #4
Hi Eric!

On Mon, Jun 03, 2013 at 10:12:39AM -0700, Eric Dumazet wrote:
> On Mon, 2013-06-03 at 18:39 +0200, Pablo Neira Ayuso wrote:
> 
> > +	if (is_vmalloc_addr(skb->head)) {
> > +		vfree(skb->head);
> > +		skb->data = NULL;
> > +	}
> >  
> 
> You probably meant :
> 
> 	skb->head = NULL;
> 
> Now, cross our fingers we do not skb_orphan() these skb too early :)

skb_release_all checks for skb->data != NULL to release skb->head as
it would have been allocated via kmalloc. This didn't look very
intuitive to me when I hit a panic while testing my patch either.

So unless I'm missing anything, that skb->data is OK with the existing
code we have.

Regards.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eric Dumazet June 3, 2013, 6 p.m. UTC | #5
On Mon, 2013-06-03 at 19:41 +0200, Pablo Neira Ayuso wrote:
> Hi Eric!
> 
> On Mon, Jun 03, 2013 at 10:12:39AM -0700, Eric Dumazet wrote:
> > On Mon, 2013-06-03 at 18:39 +0200, Pablo Neira Ayuso wrote:
> > 
> > > +	if (is_vmalloc_addr(skb->head)) {
> > > +		vfree(skb->head);
> > > +		skb->data = NULL;
> > > +	}
> > >  
> > 
> > You probably meant :
> > 
> > 	skb->head = NULL;
> > 
> > Now, cross our fingers we do not skb_orphan() these skb too early :)
> 
> skb_release_all checks for skb->data != NULL to release skb->head as
> it would have been allocated via kmalloc. This didn't look very
> intuitive to me when I hit a panic while testing my patch either.

Oh well, this is probably an old typo because of skb_release_data() name

So we indeed do :

	if (likely(skb->data))
		skb_release_data(skb);

But it really should be

	if (likely(skb->head))
		skb_release_head(skb);

as skb->data is not pointing to the beginning of the area to be freed.

BTW, __alloc_skb_head() inits skb->data to NULL, and leaves skb->head
uninitialized. Thats is really not nice...




--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Pablo Neira Ayuso June 3, 2013, 7:21 p.m. UTC | #6
On Mon, Jun 03, 2013 at 11:00:37AM -0700, Eric Dumazet wrote:
> On Mon, 2013-06-03 at 19:41 +0200, Pablo Neira Ayuso wrote:
> > Hi Eric!
> > 
> > On Mon, Jun 03, 2013 at 10:12:39AM -0700, Eric Dumazet wrote:
> > > On Mon, 2013-06-03 at 18:39 +0200, Pablo Neira Ayuso wrote:
> > > 
> > > > +	if (is_vmalloc_addr(skb->head)) {
> > > > +		vfree(skb->head);
> > > > +		skb->data = NULL;
> > > > +	}
> > > >  
> > > 
> > > You probably meant :
> > > 
> > > 	skb->head = NULL;
> > > 
> > > Now, cross our fingers we do not skb_orphan() these skb too early :)
> > 
> > skb_release_all checks for skb->data != NULL to release skb->head as
> > it would have been allocated via kmalloc. This didn't look very
> > intuitive to me when I hit a panic while testing my patch either.
> 
> Oh well, this is probably an old typo because of skb_release_data() name
> 
> So we indeed do :
> 
> 	if (likely(skb->data))
> 		skb_release_data(skb);
> 
> But it really should be
> 
> 	if (likely(skb->head))
> 		skb_release_head(skb);
> 
> as skb->data is not pointing to the beginning of the area to be freed.

Indeed, this change is recent, happened in 0ebd0ac.

> BTW, __alloc_skb_head() inits skb->data to NULL, and leaves skb->head
> uninitialized. Thats is really not nice...

Should be skb->head = NULL and leave skb->data unset.

I'll send a patch to fix that.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 12ac6b4..7c71d07 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -750,6 +750,10 @@  static void netlink_skb_destructor(struct sk_buff *skb)
 		skb->data = NULL;
 	}
 #endif
+	if (is_vmalloc_addr(skb->head)) {
+		vfree(skb->head);
+		skb->data = NULL;
+	}
 	if (skb->sk != NULL)
 		sock_rfree(skb);
 }
@@ -1420,6 +1424,35 @@  struct sock *netlink_getsockbyfilp(struct file *filp)
 	return sock;
 }
 
+static struct sk_buff *netlink_alloc_large_skb(unsigned int size)
+{
+	struct sk_buff *skb;
+	void *data;
+
+	if (size <= NLMSG_GOODSIZE)
+		return alloc_skb(size, GFP_KERNEL);
+
+	skb = alloc_skb_head(GFP_KERNEL);
+	if (skb == NULL)
+		return NULL;
+
+	data = vmalloc(size);
+	if (data == NULL)
+		goto err;
+
+	skb->head	= data;
+	skb->data	= data;
+	skb_reset_tail_pointer(skb);
+	skb->end	= skb->tail + size;
+	skb->len	= 0;
+	skb->destructor = netlink_skb_destructor;
+
+	return skb;
+err:
+	kfree_skb(skb);
+	return NULL;
+}
+
 /*
  * Attach a skb to a netlink socket.
  * The caller must hold a reference to the destination socket. On error, the
@@ -1510,7 +1543,7 @@  static struct sk_buff *netlink_trim(struct sk_buff *skb, gfp_t allocation)
 		return skb;
 
 	delta = skb->end - skb->tail;
-	if (delta * 2 < skb->truesize)
+	if (is_vmalloc_addr(skb->head) || delta * 2 < skb->truesize)
 		return skb;
 
 	if (skb_shared(skb)) {
@@ -2096,7 +2129,7 @@  static int netlink_sendmsg(struct kiocb *kiocb, struct socket *sock,
 	if (len > sk->sk_sndbuf - 32)
 		goto out;
 	err = -ENOBUFS;
-	skb = alloc_skb(len, GFP_KERNEL);
+	skb = netlink_alloc_large_skb(len);
 	if (skb == NULL)
 		goto out;