[net-next,v2,2/2] page_pool: remove hold/release count from tracepoints

Message ID: 20191114163715.4184099-3-jonathan.lemon@gmail.com
State: Changes Requested
Delegated to: David Miller
Series: Change page_pool timeout handling

Commit Message

Jonathan Lemon Nov. 14, 2019, 4:37 p.m. UTC
When the last page is released from the page pool, it is possible
that the delayed removal thread sees inflight == 0, and frees the
pool.  While the freed pointer is only copied by the tracepoint
and not dereferenced, it really isn't correct.  Avoid this case by
reporting the page release before releasing the page.

This also removes a second atomic operation from the release path.

Signed-off-by: Jonathan Lemon <jonathan.lemon@gmail.com>
---
 include/trace/events/page_pool.h | 24 ++++++++++--------------
 net/core/page_pool.c             |  8 +++++---
 2 files changed, 15 insertions(+), 17 deletions(-)
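
For illustration, a minimal sketch of the race described above (simplified
pseudocode, not the kernel source; the delayed-removal worker is condensed
into comments):

    /* Old ordering on the CPU releasing the last in-flight page: */
    atomic_inc(&pool->pages_state_release_cnt);
    /* Here the delayed-removal worker on another CPU may compute
     * inflight = hold_cnt - release_cnt == 0 and free the pool, so ...
     */
    trace_page_pool_state_release(pool, page,
                                  atomic_read(&pool->pages_state_release_cnt));
    /* ... the tracepoint copies (but never dereferences) a pointer to a
     * pool that may already be freed.  The patch moves the trace call
     * before the atomic_inc to close this window.
     */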

Comments

Jesper Dangaard Brouer Nov. 14, 2019, 9:07 p.m. UTC | #1
On Thu, 14 Nov 2019 08:37:15 -0800
Jonathan Lemon <jonathan.lemon@gmail.com> wrote:

> When the last page is released from the page pool, it is possible
> that the delayed removal thread sees inflight == 0, and frees the
> pool.  While the freed pointer is only copied by the tracepoint
> and not dereferenced, it really isn't correct.  Avoid this case by
> reporting the page release before releasing the page.

I don't like this patch!

I'm actually using these counters in the current version of my bpftrace
leak detector for page_pool:

https://github.com/xdp-project/xdp-project/blob/master/areas/mem/bpftrace/page_pool_track_leaks01.bt

> This also removes a second atomic operation from the release path.
> 
> Signed-off-by: Jonathan Lemon <jonathan.lemon@gmail.com>
> ---
>  include/trace/events/page_pool.h | 24 ++++++++++--------------
>  net/core/page_pool.c             |  8 +++++---
>  2 files changed, 15 insertions(+), 17 deletions(-)
[...]

> @@ -222,9 +222,11 @@ static void __page_pool_clean_page(struct page_pool *pool,
>  			     DMA_ATTR_SKIP_CPU_SYNC);
>  	page->dma_addr = 0;
>  skip_dma_unmap:
> +	trace_page_pool_page_release(pool, page);
> +	/* This may be the last page returned, releasing the pool, so
> +	 * it is not safe to reference pool afterwards.
> +	 */
>  	atomic_inc(&pool->pages_state_release_cnt);
> -	trace_page_pool_state_release(pool, page,
> -			      atomic_read(&pool->pages_state_release_cnt));
>  }

I would prefer that you do an atomic_inc_return, and send the cnt to the
existing tracepoint.  I'm not dereferencing the pool in my tracepoint
use-case, and as Alexei wrote, this would still be 'safe' (as in not
crashing) for a tracepoint if someone does.
Jonathan Lemon Nov. 14, 2019, 9:56 p.m. UTC | #2
On 14 Nov 2019, at 13:07, Jesper Dangaard Brouer wrote:
> I would prefer that you do an atomic_inc_return, and send the cnt to the
> existing tracepoint.  I'm not dereferencing the pool in my tracepoint
> use-case, and as Alexei wrote, this would still be 'safe' (as in not
> crashing) for a tracepoint if someone does.

Okay, will make that change, and send out a revision.
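
For reference, a minimal sketch of what that revision might look like at the
tail of __page_pool_clean_page() (hypothetical code following Jesper's
suggestion, not the posted follow-up; assume a "u32 count;" local declared
at the top of the function):

    skip_dma_unmap:
            /* One atomic op bumps the release counter and returns the new
             * value for the existing tracepoint.  The pool pointer is only
             * copied by the tracepoint, never dereferenced, so this remains
             * safe even if this release allows the pool to be freed.
             */
            count = atomic_inc_return(&pool->pages_state_release_cnt);
            trace_page_pool_state_release(pool, page, count);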

Patch

diff --git a/include/trace/events/page_pool.h b/include/trace/events/page_pool.h
index 47b5ee880aa9..0adf9aed9f5b 100644
--- a/include/trace/events/page_pool.h
+++ b/include/trace/events/page_pool.h
@@ -35,50 +35,46 @@  TRACE_EVENT(page_pool_inflight,
 	  __entry->pool, __entry->inflight, __entry->hold, __entry->release)
 );
 
-TRACE_EVENT(page_pool_state_release,
+TRACE_EVENT(page_pool_page_release,
 
 	TP_PROTO(const struct page_pool *pool,
-		 const struct page *page, u32 release),
+		 const struct page *page),
 
-	TP_ARGS(pool, page, release),
+	TP_ARGS(pool, page),
 
 	TP_STRUCT__entry(
 		__field(const struct page_pool *,	pool)
 		__field(const struct page *,		page)
-		__field(u32,				release)
 	),
 
 	TP_fast_assign(
 		__entry->pool		= pool;
 		__entry->page		= page;
-		__entry->release	= release;
 	),
 
-	TP_printk("page_pool=%p page=%p release=%u",
-		  __entry->pool, __entry->page, __entry->release)
+	TP_printk("page_pool=%p page=%p",
+		  __entry->pool, __entry->page)
 );
 
-TRACE_EVENT(page_pool_state_hold,
+TRACE_EVENT(page_pool_page_hold,
 
 	TP_PROTO(const struct page_pool *pool,
-		 const struct page *page, u32 hold),
+		 const struct page *page),
 
-	TP_ARGS(pool, page, hold),
+	TP_ARGS(pool, page),
 
 	TP_STRUCT__entry(
 		__field(const struct page_pool *,	pool)
 		__field(const struct page *,		page)
-		__field(u32,				hold)
 	),
 
 	TP_fast_assign(
 		__entry->pool	= pool;
 		__entry->page	= page;
-		__entry->hold	= hold;
 	),
 
-	TP_printk("page_pool=%p page=%p hold=%u",
-		  __entry->pool, __entry->page, __entry->hold)
+	TP_printk("page_pool=%p page=%p",
+		  __entry->pool, __entry->page)
 );
 
 #endif /* _TRACE_PAGE_POOL_H */
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index bfe96326335d..1e66341fdac8 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -163,7 +163,7 @@  static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,
 	/* Track how many pages are held 'in-flight' */
 	pool->pages_state_hold_cnt++;
 
-	trace_page_pool_state_hold(pool, page, pool->pages_state_hold_cnt);
+	trace_page_pool_page_hold(pool, page);
 
 	/* When page just alloc'ed it should/must have refcnt 1. */
 	return page;
@@ -222,9 +222,11 @@  static void __page_pool_clean_page(struct page_pool *pool,
 			     DMA_ATTR_SKIP_CPU_SYNC);
 	page->dma_addr = 0;
 skip_dma_unmap:
+	trace_page_pool_page_release(pool, page);
+	/* This may be the last page returned, releasing the pool, so
+	 * it is not safe to reference pool afterwards.
+	 */
 	atomic_inc(&pool->pages_state_release_cnt);
-	trace_page_pool_state_release(pool, page,
-			      atomic_read(&pool->pages_state_release_cnt));
 }
 
 /* unmap the page and clean our state */