diff mbox series

[bpf-next,02/11] xsk: introduce xdp_umem_page

Message ID 20180604120601.18123-3-bjorn.topel@gmail.com
State Accepted, archived
Delegated to: BPF Maintainers
Headers show
Series AF_XDP: introducing zero-copy support | expand

Commit Message

Björn Töpel June 4, 2018, 12:05 p.m. UTC
From: Björn Töpel <bjorn.topel@intel.com>

The xdp_umem_page holds the address for a page. Trade memory for
faster lookup. Later, we'll add DMA address here as well.

Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
---
 include/net/xdp_sock.h |  7 ++++++-
 net/xdp/xdp_umem.c     | 15 ++++++++++++++-
 net/xdp/xdp_umem.h     |  3 +--
 3 files changed, 21 insertions(+), 4 deletions(-)

Comments

Jiri Slaby March 13, 2019, 9:39 a.m. UTC | #1
On 04. 06. 18, 14:05, Björn Töpel wrote:
> From: Björn Töpel <bjorn.topel@intel.com>
> 
> The xdp_umem_page holds the address for a page. Trade memory for
> faster lookup. Later, we'll add DMA address here as well.
...
> --- a/include/net/xdp_sock.h
> +++ b/include/net/xdp_sock.h
...
> --- a/net/xdp/xdp_umem.c
> +++ b/net/xdp/xdp_umem.c
> @@ -65,6 +65,9 @@ static void xdp_umem_release(struct xdp_umem *umem)
>  		goto out;
>  
>  	mmput(mm);
> +	kfree(umem->pages);
> +	umem->pages = NULL;
> +

Are you sure about the placement of kfree here? Why is it dependent on
task && mm above?

IMO the kfree should be below "out:":

>  	xdp_umem_unaccount_pages(umem);
>  out:
>  	kfree(umem);
Syzkaller reported this memleak:
r0 = socket$xdp(0x2c, 0x3, 0x0)
setsockopt$XDP_UMEM_REG(r0, 0x11b, 0x4,
&(0x7f0000000100)={&(0x7f0000000000)=""/210, 0x20000, 0x1000, 0x7}, 0x18)
BUG: memory leak
unreferenced object 0xffff88003648de68 (size 512):
  comm "syz-executor.3", pid 11688, jiffies 4295555546 (age 15.752s)
  hex dump (first 32 bytes):
    00 00 40 23 00 88 ff ff 00 00 00 00 00 00 00 00  ..@#............
    00 10 40 23 00 88 ff ff 00 00 00 00 00 00 00 00  ..@#............
  backtrace:
    [<ffffffffa9f8346c>] xsk_setsockopt+0x40c/0x510 net/xdp/xsk.c:539
    [<ffffffffa9935c41>] SyS_setsockopt+0x171/0x370 net/socket.c:1862
    [<ffffffffa800b28c>] do_syscall_64+0x26c/0x6e0
arch/x86/entry/common.c:284
    [<ffffffffaa00009a>] entry_SYSCALL_64_after_hwframe+0x3d/0xa2
    [<ffffffffffffffff>] 0xffffffffffffffff


Given the size of the leak, it looks like umem->pages is leaked:
mr->len/page_size*sizeof(*umem->pages)
0x20000/4096*16=512

So I added a check, and really, task is NULL in my testcase -- the
program is gone when the deferred work triggers. But umem->pages is not
freed.

Moving the free after "out:", no leaks happen anymore.

Easily reproducible with:
#include <err.h>
#include <stdlib.h>
#include <unistd.h>

#include <sys/socket.h>
#include <sys/types.h>
#include <sys/wait.h>

#include <linux/if_xdp.h>

void fun()
{
        static char buffer[0x20000] __attribute__((aligned(4096)));
        struct xdp_umem_reg mr = {
                (unsigned long)buffer,
                0x20000,
                0x1000,
                0x7,
         //&(0x7f0000000100)={&(0x7f0000000000)=""/210, 0x20000, 0x1000,
0x7}
        };
        int r0;
        r0 = socket(AF_XDP, SOCK_RAW, 0);
        if (r0 < 0)
                err(1, "socket");
        if (setsockopt(r0, SOL_XDP, XDP_UMEM_REG, &mr, sizeof(mr)) < 0)
                err(1, "setsockopt");
        close(r0);
}

int main()
{
        int a;
        while (1) {
                for (a = 0; a < 40; a++)
                        if (!fork()) {
                                fun();
                                exit(0);
                        }
                for (a = 0; a < 100; a++)
                        wait(NULL);
        }
}


thanks,
Björn Töpel March 13, 2019, 11:23 a.m. UTC | #2
On 2019-03-13 10:39, Jiri Slaby wrote:
> On 04. 06. 18, 14:05, Björn Töpel wrote:
>> From: Björn Töpel <bjorn.topel@intel.com>
>> 
>> The xdp_umem_page holds the address for a page. Trade memory for 
>> faster lookup. Later, we'll add DMA address here as well.
> ...
>> --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h
> ...
>> --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -65,6 +65,9 @@
>> static void xdp_umem_release(struct xdp_umem *umem) goto out;
>> 
>> mmput(mm); +	kfree(umem->pages); +	umem->pages = NULL; +
> 
> Are you sure about the placement of kfree here? Why is it dependent
> on task && mm above?
> 
> IMO the kfree should be below "out:":
> 
>> xdp_umem_unaccount_pages(umem); out: kfree(umem);
> Syzkaller reported this memleak: r0 = socket$xdp(0x2c, 0x3, 0x0) 
> setsockopt$XDP_UMEM_REG(r0, 0x11b, 0x4, 
> &(0x7f0000000100)={&(0x7f0000000000)=""/210, 0x20000, 0x1000, 0x7},
> 0x18) BUG: memory leak unreferenced object 0xffff88003648de68 (size
> 512): comm "syz-executor.3", pid 11688, jiffies 4295555546 (age
> 15.752s) hex dump (first 32 bytes): 00 00 40 23 00 88 ff ff 00 00 00
> 00 00 00 00 00  ..@#............ 00 10 40 23 00 88 ff ff 00 00 00 00
> 00 00 00 00  ..@#............ backtrace: [<ffffffffa9f8346c>]
> xsk_setsockopt+0x40c/0x510 net/xdp/xsk.c:539 [<ffffffffa9935c41>]
> SyS_setsockopt+0x171/0x370 net/socket.c:1862 [<ffffffffa800b28c>]
> do_syscall_64+0x26c/0x6e0 arch/x86/entry/common.c:284 
> [<ffffffffaa00009a>] entry_SYSCALL_64_after_hwframe+0x3d/0xa2 
> [<ffffffffffffffff>] 0xffffffffffffffff
> 
> 
> Given the size of the leak, it looks like umem->pages is leaked: 
> mr->len/page_size*sizeof(*umem->pages) 0x20000/4096*16=512
> 
> So I added a check, and really, task is NULL in my testcase -- the 
> program is gone when the deferred work triggers. But umem->pages is
> not freed.
> 
> Moving the free after "out:", no leaks happen anymore.
> 
> Easily reproducible with: #include <err.h> #include <stdlib.h> 
> #include <unistd.h>
> 
> #include <sys/socket.h> #include <sys/types.h> #include <sys/wait.h>
> 
> #include <linux/if_xdp.h>
> 
> void fun() { static char buffer[0x20000]
> __attribute__((aligned(4096))); struct xdp_umem_reg mr = { (unsigned
> long)buffer, 0x20000, 0x1000, 0x7, 
> //&(0x7f0000000100)={&(0x7f0000000000)=""/210, 0x20000, 0x1000, 0x7} 
> }; int r0; r0 = socket(AF_XDP, SOCK_RAW, 0); if (r0 < 0) err(1,
> "socket"); if (setsockopt(r0, SOL_XDP, XDP_UMEM_REG, &mr, sizeof(mr))
> < 0) err(1, "setsockopt"); close(r0); }
> 
> int main() { int a; while (1) { for (a = 0; a < 40; a++) if (!fork())
> { fun(); exit(0); } for (a = 0; a < 100; a++) wait(NULL); } }
> 
> 
> thanks,
> 

Nice catch, Jiri! Thank you!

It turns out that the whole task/pid dance is useless. It was a
left-over from the first AF_XDP RFC when we did per task accounting,
instead of per user accounting.

I will do some testing with the patch below, and then submit it as a
proper patch.


Cheers,
Björn

--

From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= <bjorn.topel@intel.com>
Date: Wed, 13 Mar 2019 12:00:51 +0100
Subject: [PATCH] xsk: fix umem memory leak on cleanup
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When the umem is cleaned up, the task that created it might already be
gone. If the task was gone, the xdp_umem_release function did not free
the pages member of struct xdp_umem.

It turned out that the task lookup was not needed at all; The code was
a left-over when we moved from task accounting to user accounting [1].

This patch fixes the memory leak by removing the task lookup logic
completely.

[1] 
https://lore.kernel.org/netdev/20180131135356.19134-3-bjorn.topel@gmail.com/

Link: 
https://lore.kernel.org/netdev/c1cb2ca8-6a14-3980-8672-f3de0bb38dfd@suse.cz/
Fixes: c0c77d8fb787 ("xsk: add user memory registration support sockopt")
Reported-by: Jiri Slaby <jslaby@suse.cz>
Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
---
  include/net/xdp_sock.h |  1 -
  net/xdp/xdp_umem.c     | 19 +------------------
  2 files changed, 1 insertion(+), 19 deletions(-)

diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
index 61cf7dbb6782..d074b6d60f8a 100644
--- a/include/net/xdp_sock.h
+++ b/include/net/xdp_sock.h
@@ -36,7 +36,6 @@ struct xdp_umem {
  	u32 headroom;
  	u32 chunk_size_nohr;
  	struct user_struct *user;
-	struct pid *pid;
  	unsigned long address;
  	refcount_t users;
  	struct work_struct work;
diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c
index 77520eacee8f..989e52386c35 100644
--- a/net/xdp/xdp_umem.c
+++ b/net/xdp/xdp_umem.c
@@ -193,9 +193,6 @@ static void xdp_umem_unaccount_pages(struct xdp_umem 
*umem)

  static void xdp_umem_release(struct xdp_umem *umem)
  {
-	struct task_struct *task;
-	struct mm_struct *mm;
-
  	xdp_umem_clear_dev(umem);

  	ida_simple_remove(&umem_ida, umem->id);
@@ -214,21 +211,10 @@ static void xdp_umem_release(struct xdp_umem *umem)

  	xdp_umem_unpin_pages(umem);

-	task = get_pid_task(umem->pid, PIDTYPE_PID);
-	put_pid(umem->pid);
-	if (!task)
-		goto out;
-	mm = get_task_mm(task);
-	put_task_struct(task);
-	if (!mm)
-		goto out;
-
-	mmput(mm);
  	kfree(umem->pages);
  	umem->pages = NULL;

  	xdp_umem_unaccount_pages(umem);
-out:
  	kfree(umem);
  }

@@ -357,7 +343,6 @@ static int xdp_umem_reg(struct xdp_umem *umem, 
struct xdp_umem_reg *mr)
  	if (size_chk < 0)
  		return -EINVAL;

-	umem->pid = get_task_pid(current, PIDTYPE_PID);
  	umem->address = (unsigned long)addr;
  	umem->chunk_mask = ~((u64)chunk_size - 1);
  	umem->size = size;
@@ -373,7 +358,7 @@ static int xdp_umem_reg(struct xdp_umem *umem, 
struct xdp_umem_reg *mr)

  	err = xdp_umem_account_pages(umem);
  	if (err)
-		goto out;
+		return err;

  	err = xdp_umem_pin_pages(umem);
  	if (err)
@@ -392,8 +377,6 @@ static int xdp_umem_reg(struct xdp_umem *umem, 
struct xdp_umem_reg *mr)

  out_account:
  	xdp_umem_unaccount_pages(umem);
-out:
-	put_pid(umem->pid);
  	return err;
  }
diff mbox series

Patch

diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
index 3a6cd88f179d..caf343a7e224 100644
--- a/include/net/xdp_sock.h
+++ b/include/net/xdp_sock.h
@@ -20,10 +20,14 @@  struct xdp_umem_props {
 	u64 size;
 };
 
+struct xdp_umem_page {
+	void *addr;
+};
+
 struct xdp_umem {
 	struct xsk_queue *fq;
 	struct xsk_queue *cq;
-	struct page **pgs;
+	struct xdp_umem_page *pages;
 	struct xdp_umem_props props;
 	u32 headroom;
 	u32 chunk_size_nohr;
@@ -32,6 +36,7 @@  struct xdp_umem {
 	unsigned long address;
 	refcount_t users;
 	struct work_struct work;
+	struct page **pgs;
 	u32 npgs;
 };
 
diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c
index 2793a503223e..aca826011f6c 100644
--- a/net/xdp/xdp_umem.c
+++ b/net/xdp/xdp_umem.c
@@ -65,6 +65,9 @@  static void xdp_umem_release(struct xdp_umem *umem)
 		goto out;
 
 	mmput(mm);
+	kfree(umem->pages);
+	umem->pages = NULL;
+
 	xdp_umem_unaccount_pages(umem);
 out:
 	kfree(umem);
@@ -155,7 +158,7 @@  static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
 	u32 chunk_size = mr->chunk_size, headroom = mr->headroom;
 	unsigned int chunks, chunks_per_page;
 	u64 addr = mr->addr, size = mr->len;
-	int size_chk, err;
+	int size_chk, err, i;
 
 	if (chunk_size < XDP_UMEM_MIN_CHUNK_SIZE || chunk_size > PAGE_SIZE) {
 		/* Strictly speaking we could support this, if:
@@ -213,6 +216,16 @@  static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
 	err = xdp_umem_pin_pages(umem);
 	if (err)
 		goto out_account;
+
+	umem->pages = kcalloc(umem->npgs, sizeof(*umem->pages), GFP_KERNEL);
+	if (!umem->pages) {
+		err = -ENOMEM;
+		goto out_account;
+	}
+
+	for (i = 0; i < umem->npgs; i++)
+		umem->pages[i].addr = page_address(umem->pgs[i]);
+
 	return 0;
 
 out_account:
diff --git a/net/xdp/xdp_umem.h b/net/xdp/xdp_umem.h
index 9433e8af650a..40e8fa4a92af 100644
--- a/net/xdp/xdp_umem.h
+++ b/net/xdp/xdp_umem.h
@@ -10,8 +10,7 @@ 
 
 static inline char *xdp_umem_get_data(struct xdp_umem *umem, u64 addr)
 {
-	return page_address(umem->pgs[addr >> PAGE_SHIFT]) +
-		(addr & (PAGE_SIZE - 1));
+	return umem->pages[addr >> PAGE_SHIFT].addr + (addr & (PAGE_SIZE - 1));
 }
 
 bool xdp_umem_validate_queues(struct xdp_umem *umem);