diff mbox

vhost-net: fall back to vmalloc if high-order allocation fails

Message ID 87k3r31vbc.fsf@silenus.orebokech.com
State Rejected, archived
Delegated to: David Miller
Headers show

Commit Message

Romain Francoise Jan. 23, 2013, 8:46 p.m. UTC
Creating a vhost-net device allocates an object large enough (34320 bytes
on x86-64) to trigger an order-4 allocation, which may fail if memory if
fragmented:

 libvirtd: page allocation failure: order:4, mode:0x2000d0
 ...
 SLAB: Unable to allocate memory on node 0 (gfp=0xd0)
   cache: size-65536, object size: 65536, order: 4
   node 0: slabs: 8/8, objs: 8/8, free: 0

In that situation, rather than forcing the caller to use regular
virtio-net, try to allocate the descriptor with vmalloc().

Signed-off-by: Romain Francoise <romain@orebokech.com>
---
 drivers/vhost/net.c | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

Comments

Michael S. Tsirkin Jan. 23, 2013, 9:04 p.m. UTC | #1
On Wed, Jan 23, 2013 at 09:46:47PM +0100, Romain Francoise wrote:
> Creating a vhost-net device allocates an object large enough (34320 bytes
> on x86-64) to trigger an order-4 allocation, which may fail if memory if
> fragmented:
> 
>  libvirtd: page allocation failure: order:4, mode:0x2000d0
>  ...
>  SLAB: Unable to allocate memory on node 0 (gfp=0xd0)
>    cache: size-65536, object size: 65536, order: 4
>    node 0: slabs: 8/8, objs: 8/8, free: 0
> 
> In that situation, rather than forcing the caller to use regular
> virtio-net, try to allocate the descriptor with vmalloc().
> 
> Signed-off-by: Romain Francoise <romain@orebokech.com>

Thanks for the patch.
Hmm, I haven't seen this.
Maybe we should try and reduce our memory usage,
I will look into this.

> ---
>  drivers/vhost/net.c | 18 +++++++++++++++---
>  1 file changed, 15 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> index ebd08b2..1ded79b 100644
> --- a/drivers/vhost/net.c
> +++ b/drivers/vhost/net.c
> @@ -18,6 +18,7 @@
>  #include <linux/rcupdate.h>
>  #include <linux/file.h>
>  #include <linux/slab.h>
> +#include <linux/vmalloc.h>
>  
>  #include <linux/net.h>
>  #include <linux/if_packet.h>
> @@ -603,12 +604,23 @@ static void handle_rx_net(struct vhost_work *work)
>  	handle_rx(net);
>  }
>  
> +static void vhost_net_kvfree(void *addr)
> +{
> +	if (is_vmalloc_addr(addr))
> +		vfree(addr);
> +	else
> +		kfree(addr);
> +}
> +
>  static int vhost_net_open(struct inode *inode, struct file *f)
>  {
> -	struct vhost_net *n = kmalloc(sizeof *n, GFP_KERNEL);
> +	struct vhost_net *n;
>  	struct vhost_dev *dev;
>  	int r;
>  
> +	n = kmalloc(sizeof *n, GFP_KERNEL | __GFP_NOWARN);
> +	if (!n)
> +		n = vmalloc(sizeof *n);
>  	if (!n)
>  		return -ENOMEM;
>  
> @@ -617,7 +629,7 @@ static int vhost_net_open(struct inode *inode, struct file *f)
>  	n->vqs[VHOST_NET_VQ_RX].handle_kick = handle_rx_kick;
>  	r = vhost_dev_init(dev, n->vqs, VHOST_NET_VQ_MAX);
>  	if (r < 0) {
> -		kfree(n);
> +		vhost_net_kvfree(n);
>  		return r;
>  	}
>  
> @@ -719,7 +731,7 @@ static int vhost_net_release(struct inode *inode, struct file *f)
>  	/* We do an extra flush before freeing memory,
>  	 * since jobs can re-queue themselves. */
>  	vhost_net_flush(n);
> -	kfree(n);
> +	vhost_net_kvfree(n);
>  	return 0;
>  }
>  
> -- 
> 1.8.1.1
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Laight Jan. 24, 2013, 9:45 a.m. UTC | #2
> +	n = kmalloc(sizeof *n, GFP_KERNEL | __GFP_NOWARN);
> +	if (!n)
> +		n = vmalloc(sizeof *n);

I'm slightly confused by this construct.
I thought kmalloc(... GFP_KERNEL) would sleep waiting for
memory (rather than return NULL).

I realise that (for multi-page sizes) that kmalloc() and
vmalloc() both need to find a contiguous block of kernel
virtual addresses - in different address ranges, and
that vmalloc() then has to find physical memory pages
(which will not be contiguous).

I think this means that kmalloc() is likely to be faster
(if it doesn't have to sleep), but that vmalloc() is
allocating from a much larger resource.

This make me that that large allocations that are not
temporary should probably be allocated with vmalloc().

Is there a 'NO_SLEEP' flag to kmalloc()? is that all
GFP_ATOMIC requests? If so you might try a non-sleeping
kmalloc() with a vmalloc() if it fails.

This all looks as though there should be a GFP_NONCONTIG
flag (or similar) so that kmalloc() can make a decision itself.

Of at least a wrapper - like the one for free().

	David



--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Michael S. Tsirkin Jan. 24, 2013, 10:06 a.m. UTC | #3
On Thu, Jan 24, 2013 at 09:45:50AM -0000, David Laight wrote:
> > +	n = kmalloc(sizeof *n, GFP_KERNEL | __GFP_NOWARN);
> > +	if (!n)
> > +		n = vmalloc(sizeof *n);
> 
> I'm slightly confused by this construct.
> I thought kmalloc(... GFP_KERNEL) would sleep waiting for
> memory (rather than return NULL).
> 
> I realise that (for multi-page sizes) that kmalloc() and
> vmalloc() both need to find a contiguous block of kernel
> virtual addresses - in different address ranges, and
> that vmalloc() then has to find physical memory pages
> (which will not be contiguous).
> 
> I think this means that kmalloc() is likely to be faster
> (if it doesn't have to sleep), but that vmalloc() is
> allocating from a much larger resource.
> 
> This make me that that large allocations that are not
> temporary should probably be allocated with vmalloc().

vmalloc has some issues for example afaik it's not backed by
a huge page so  I think its use puts more stress on the TLB cache.

> Is there a 'NO_SLEEP' flag to kmalloc()? is that all
> GFP_ATOMIC requests? If so you might try a non-sleeping
> kmalloc() with a vmalloc() if it fails.
> 
> This all looks as though there should be a GFP_NONCONTIG
> flag (or similar) so that kmalloc() can make a decision itself.
> 
> Of at least a wrapper - like the one for free().
> 
> 	David
> 
> 
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Laight Jan. 24, 2013, 10:37 a.m. UTC | #4
> > I think this means that kmalloc() is likely to be faster
> > (if it doesn't have to sleep), but that vmalloc() is
> > allocating from a much larger resource.
> >
> > This make me that that large allocations that are not
> > temporary should probably be allocated with vmalloc().
> 
> vmalloc has some issues for example afaik it's not backed by
> a huge page so  I think its use puts more stress on the TLB cache.

Thinks further ...
64bit systems are likely to have enough kernel VA to be able
to map all of physical memory into contiguous VA.

I don't know if Linux does that, but I know code to map it was added
to NetBSD amd64 in order to speed up kernel accesses to 'random'
pages - it might have been partially backed out due to bugs!

If physical memory is mapped like that then kmalloc() requests
can any of physical memory and be unlikely to fail - since user
pages can be moved in order to generate contiguous free blocks.

Doesn't help with 32bit systems - they had to stop mapping all
of physical memory into kernel space a long time ago.

	David



--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jason Wang Jan. 25, 2013, 5:15 a.m. UTC | #5
On Friday, January 25, 2013 03:03:13 AM Cong Wang wrote:
> ["Followup-To:" header set to gmane.linux.network.]
> 
> On Wed, 23 Jan 2013 at 20:46 GMT, Romain Francoise <romain@orebokech.com> 
wrote:
> > Creating a vhost-net device allocates an object large enough (34320 bytes
> > on x86-64) to trigger an order-4 allocation, which may fail if memory if
> > 
> > fragmented:
> >  libvirtd: page allocation failure: order:4, mode:0x2000d0
> >  ...
> >  SLAB: Unable to allocate memory on node 0 (gfp=0xd0)
> >  
> >    cache: size-65536, object size: 65536, order: 4
> >    node 0: slabs: 8/8, objs: 8/8, free: 0
> > 
> > In that situation, rather than forcing the caller to use regular
> > virtio-net, try to allocate the descriptor with vmalloc().
> 
> The real problem is vhost_net struct is really big, it
> should be reduced rather than workarounded like this.
> 

Looks like iov in vhost_virtqueue is a little big:

    struct iovec iov[UIO_MAXIOV];

Maybe we can use pointer and allocate it like indirect in 
vhost_dev_alloc_iovecs().

> > +static void vhost_net_kvfree(void *addr)
> > +{
> > +	if (is_vmalloc_addr(addr))
> > +		vfree(addr);
> > +	else
> > +		kfree(addr);
> > +}
> > +
> 
> This kind of stuff should really go to mm, not netdev.
> 
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Miller Jan. 28, 2013, 3:11 a.m. UTC | #6
From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Wed, 23 Jan 2013 23:04:11 +0200

> Maybe we should try and reduce our memory usage,
> I will look into this.

As has been pointed out, 32K of the size is from those iovecs in
the queues.

The size of this structure is frankly offensive, and even if you add
some levels of indirection even just one iovec chunk is 16K on 64-bit
which is in my opinion still unacceptably large.

TCP sockets aren't even %25 the size of this beast. :-)

I'm not going to apply this vmalloc patch, because if I apply it the
fundamental problem here just gets swept under the carpet even longer.

Sorry.


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Romain Francoise Jan. 28, 2013, 9:23 a.m. UTC | #7
David Miller <davem@davemloft.net> writes:

> I'm not going to apply this vmalloc patch, because if I apply it the
> fundamental problem here just gets swept under the carpet even longer.

No problem, I'll keep this as a local change until vhost-net's allocation
strategy gains some sanity.

Thanks.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Romain Francoise June 28, 2013, 7:16 a.m. UTC | #8
"Michael S. Tsirkin" <mst@redhat.com> writes:

> On Wed, Jan 23, 2013 at 09:46:47PM +0100, Romain Francoise wrote:
>> Creating a vhost-net device allocates an object large enough (34320 bytes
>> on x86-64) to trigger an order-4 allocation, which may fail if memory if
>> fragmented:
>> 
>>  libvirtd: page allocation failure: order:4, mode:0x2000d0
>>  ...
>>  SLAB: Unable to allocate memory on node 0 (gfp=0xd0)
>>    cache: size-65536, object size: 65536, order: 4
>>    node 0: slabs: 8/8, objs: 8/8, free: 0
>> 
>> In that situation, rather than forcing the caller to use regular
>> virtio-net, try to allocate the descriptor with vmalloc().
>> 
>> Signed-off-by: Romain Francoise <romain@orebokech.com>

> Thanks for the patch.
> Hmm, I haven't seen this.
> Maybe we should try and reduce our memory usage,
> I will look into this.

Did you get a chance to investigate this? I'm still getting the same
allocation failures with v3.10-rc7 after reverting my local patch.

Thanks.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index ebd08b2..1ded79b 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -18,6 +18,7 @@ 
 #include <linux/rcupdate.h>
 #include <linux/file.h>
 #include <linux/slab.h>
+#include <linux/vmalloc.h>
 
 #include <linux/net.h>
 #include <linux/if_packet.h>
@@ -603,12 +604,23 @@  static void handle_rx_net(struct vhost_work *work)
 	handle_rx(net);
 }
 
+static void vhost_net_kvfree(void *addr)
+{
+	if (is_vmalloc_addr(addr))
+		vfree(addr);
+	else
+		kfree(addr);
+}
+
 static int vhost_net_open(struct inode *inode, struct file *f)
 {
-	struct vhost_net *n = kmalloc(sizeof *n, GFP_KERNEL);
+	struct vhost_net *n;
 	struct vhost_dev *dev;
 	int r;
 
+	n = kmalloc(sizeof *n, GFP_KERNEL | __GFP_NOWARN);
+	if (!n)
+		n = vmalloc(sizeof *n);
 	if (!n)
 		return -ENOMEM;
 
@@ -617,7 +629,7 @@  static int vhost_net_open(struct inode *inode, struct file *f)
 	n->vqs[VHOST_NET_VQ_RX].handle_kick = handle_rx_kick;
 	r = vhost_dev_init(dev, n->vqs, VHOST_NET_VQ_MAX);
 	if (r < 0) {
-		kfree(n);
+		vhost_net_kvfree(n);
 		return r;
 	}
 
@@ -719,7 +731,7 @@  static int vhost_net_release(struct inode *inode, struct file *f)
 	/* We do an extra flush before freeing memory,
 	 * since jobs can re-queue themselves. */
 	vhost_net_flush(n);
-	kfree(n);
+	vhost_net_kvfree(n);
 	return 0;
 }