diff mbox series

[net-next,v1,1/2] xdp: revert forced mem allocator removal for page_pool

Message ID 157323722276.10408.11333995838112864686.stgit@firesoul
State Changes Requested
Delegated to: David Miller
Headers show
Series Change XDP lifetime guarantees for page_pool objects | expand

Commit Message

Jesper Dangaard Brouer Nov. 8, 2019, 6:20 p.m. UTC
Forced removal of XDP mem allocator, specifically related to page_pool, turned
out to be a wrong approach.  Special thanks to Jonathan Lemon for convincing me.
This patch is a partial revert of commit d956a048cd3f (“xdp: force mem allocator
removal and periodic warning”).

It is much better to provide a guarantee that page_pool object stays valid
until 'inflight' pages reach zero, making it safe to remove.

We keep the periodic warning via a work-queue, but increase the interval to
5 minutes. The reason is to have a way to catch bugs where inflight
pages/packets never reach zero, indicating some kind of leak. These kinds of
bugs have been observed while converting drivers over to use the page_pool API.

Details on when to crash the kernel. If page_pool API is misused and
somehow __page_pool_free() is invoked while there are still inflight
frames, then (like before) a WARN() is triggered and not a BUG(). This can
potentially lead to use-after-free, which we try to catch via poisoning the
page_pool object memory with some NULL pointers. Doing it this way, the WARN()
pinpoints the driver that (likely) freed the page_pool prematurely, and the
crash-dump for the inflight page/packet shows who to blame for the late return.

Fixes: d956a048cd3f (“xdp: force mem allocator removal and periodic warning”)
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
---
 include/trace/events/xdp.h |   35 +++--------------------------------
 net/core/page_pool.c       |    8 ++++++--
 net/core/xdp.c             |   36 +++++++++++++-----------------------
 3 files changed, 22 insertions(+), 57 deletions(-)

Comments

Jonathan Lemon Nov. 8, 2019, 7:16 p.m. UTC | #1
On 8 Nov 2019, at 10:20, Jesper Dangaard Brouer wrote:

> Forced removal of XDP mem allocator, specifically related to 
> page_pool, turned
> out to be a wrong approach.  Special thanks to Jonathan Lemon for 
> convincing me.
> This patch is a partial revert of commit d956a048cd3f (“xdp: force 
> mem allocator
> removal and periodic warning”).
>
> It is much better to provide a guarantee that page_pool object stays 
> valid
> until 'inflight' pages reach zero, making it safe to remove.
>
> We keep the periodic warning via a work-queue, but increased interval 
> to
> 5-minutes. The reason is to have a way to catch bugs, where inflight
> pages/packets never reach zero, indicating some kind of leak. These 
> kind of
> bugs have been observed while converting drivers over to use page_pool 
> API.
>
> Details on when to crash the kernel. If page_pool API is misused and
> somehow __page_pool_free() is invoked while there are still inflight
> frames, then (like before) a WARN() is triggered and not a BUG(). This 
> can
> potentially lead to use-after-free, which we try to catch via 
> poisoning the
> page_pool object memory with some NULL pointers. Doing it this way,
> pinpoint both the driver (likely) prematurely freeing page_pool via 
> WARN(),
> and crash-dump for inflight page/packet show who to blame for late 
> return.
>
> Fixes: d956a048cd3f (“xdp: force mem allocator removal and periodic 
> warning”)
> Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
> ---
>  include/trace/events/xdp.h |   35 +++--------------------------------
>  net/core/page_pool.c       |    8 ++++++--
>  net/core/xdp.c             |   36 
> +++++++++++++-----------------------
>  3 files changed, 22 insertions(+), 57 deletions(-)
>
> diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h
> index c7e3c9c5bad3..a3ead2b1f00e 100644
> --- a/include/trace/events/xdp.h
> +++ b/include/trace/events/xdp.h
> @@ -318,9 +318,9 @@ __MEM_TYPE_MAP(__MEM_TYPE_TP_FN)
>  TRACE_EVENT(mem_disconnect,
>
>  	TP_PROTO(const struct xdp_mem_allocator *xa,
> -		 bool safe_to_remove, bool force),
> +		 bool safe_to_remove),
>
> -	TP_ARGS(xa, safe_to_remove, force),
> +	TP_ARGS(xa, safe_to_remove),
>
>  	TP_STRUCT__entry(
>  		__field(const struct xdp_mem_allocator *,	xa)
> @@ -328,7 +328,6 @@ TRACE_EVENT(mem_disconnect,
>  		__field(u32,		mem_type)
>  		__field(const void *,	allocator)
>  		__field(bool,		safe_to_remove)
> -		__field(bool,		force)
>  		__field(int,		disconnect_cnt)
>  	),
>
> @@ -338,17 +337,15 @@ TRACE_EVENT(mem_disconnect,
>  		__entry->mem_type	= xa->mem.type;
>  		__entry->allocator	= xa->allocator;
>  		__entry->safe_to_remove	= safe_to_remove;
> -		__entry->force		= force;
>  		__entry->disconnect_cnt	= xa->disconnect_cnt;
>  	),
>
>  	TP_printk("mem_id=%d mem_type=%s allocator=%p"
> -		  " safe_to_remove=%s force=%s disconnect_cnt=%d",
> +		  " safe_to_remove=%s disconnect_cnt=%d",
>  		  __entry->mem_id,
>  		  __print_symbolic(__entry->mem_type, __MEM_TYPE_SYM_TAB),
>  		  __entry->allocator,
>  		  __entry->safe_to_remove ? "true" : "false",
> -		  __entry->force ? "true" : "false",
>  		  __entry->disconnect_cnt
>  	)
>  );
> @@ -387,32 +384,6 @@ TRACE_EVENT(mem_connect,
>  	)
>  );
>
> -TRACE_EVENT(mem_return_failed,
> -
> -	TP_PROTO(const struct xdp_mem_info *mem,
> -		 const struct page *page),
> -
> -	TP_ARGS(mem, page),
> -
> -	TP_STRUCT__entry(
> -		__field(const struct page *,	page)
> -		__field(u32,		mem_id)
> -		__field(u32,		mem_type)
> -	),
> -
> -	TP_fast_assign(
> -		__entry->page		= page;
> -		__entry->mem_id		= mem->id;
> -		__entry->mem_type	= mem->type;
> -	),
> -
> -	TP_printk("mem_id=%d mem_type=%s page=%p",
> -		  __entry->mem_id,
> -		  __print_symbolic(__entry->mem_type, __MEM_TYPE_SYM_TAB),
> -		  __entry->page
> -	)
> -);
> -
>  #endif /* _TRACE_XDP_H */
>
>  #include <trace/define_trace.h>
> diff --git a/net/core/page_pool.c b/net/core/page_pool.c
> index 5bc65587f1c4..226f2eb30418 100644
> --- a/net/core/page_pool.c
> +++ b/net/core/page_pool.c
> @@ -346,7 +346,7 @@ static void __warn_in_flight(struct page_pool 
> *pool)
>
>  	distance = _distance(hold_cnt, release_cnt);
>
> -	/* Drivers should fix this, but only problematic when DMA is used */
> +	/* BUG but warn as kernel should crash later */
>  	WARN(1, "Still in-flight pages:%d hold:%u released:%u",
>  	     distance, hold_cnt, release_cnt);
>  }
> @@ -360,12 +360,16 @@ void __page_pool_free(struct page_pool *pool)
>  	WARN(pool->alloc.count, "API usage violation");
>  	WARN(!ptr_ring_empty(&pool->ring), "ptr_ring is not empty");
>
> -	/* Can happen due to forced shutdown */
>  	if (!__page_pool_safe_to_destroy(pool))
>  		__warn_in_flight(pool);

If it's not safe to destroy, we shouldn't be getting here.



>  	ptr_ring_cleanup(&pool->ring, NULL);
>
> +	/* Make sure kernel will crash on use-after-free */
> +	pool->ring.queue = NULL;
> +	pool->alloc.cache[PP_ALLOC_CACHE_SIZE - 1] = NULL;
> +	pool->alloc.count = PP_ALLOC_CACHE_SIZE;

The pool is going to be freed.  This is useless code; if we're
really concerned about use-after-free, the correct place for catching
this is with the memory-allocator tools, not scattering things like
this ad-hoc over the codebase.


> +
>  	if (pool->p.flags & PP_FLAG_DMA_MAP)
>  		put_device(pool->p.dev);
>
> diff --git a/net/core/xdp.c b/net/core/xdp.c
> index 20781ad5f9c3..8673f199d9f4 100644
> --- a/net/core/xdp.c
> +++ b/net/core/xdp.c
> @@ -85,7 +85,7 @@ static void __xdp_mem_allocator_rcu_free(struct 
> rcu_head *rcu)
>  	kfree(xa);
>  }
>
> -static bool __mem_id_disconnect(int id, bool force)
> +static bool __mem_id_disconnect(int id)
>  {
>  	struct xdp_mem_allocator *xa;
>  	bool safe_to_remove = true;
> @@ -104,30 +104,26 @@ static bool __mem_id_disconnect(int id, bool 
> force)
>  	if (xa->mem.type == MEM_TYPE_PAGE_POOL)
>  		safe_to_remove = page_pool_request_shutdown(xa->page_pool);
>
> -	trace_mem_disconnect(xa, safe_to_remove, force);
> +	trace_mem_disconnect(xa, safe_to_remove);
>
> -	if ((safe_to_remove || force) &&
> +	if ((safe_to_remove) &&

Remove the extra parentheses.
Jesper Dangaard Brouer Nov. 9, 2019, 4:11 p.m. UTC | #2
On Fri, 08 Nov 2019 11:16:43 -0800
"Jonathan Lemon" <jonathan.lemon@gmail.com> wrote:

> > diff --git a/net/core/page_pool.c b/net/core/page_pool.c
> > index 5bc65587f1c4..226f2eb30418 100644
> > --- a/net/core/page_pool.c
> > +++ b/net/core/page_pool.c
> > @@ -346,7 +346,7 @@ static void __warn_in_flight(struct page_pool 
> > *pool)
> >
> >  	distance = _distance(hold_cnt, release_cnt);
> >
> > -	/* Drivers should fix this, but only problematic when DMA is used */
> > +	/* BUG but warn as kernel should crash later */
> >  	WARN(1, "Still in-flight pages:%d hold:%u released:%u",
> >  	     distance, hold_cnt, release_cnt);

Because this is kept as a WARN, I set pool->ring.queue = NULL later.

> >  }
> > @@ -360,12 +360,16 @@ void __page_pool_free(struct page_pool *pool)
> >  	WARN(pool->alloc.count, "API usage violation");
> >  	WARN(!ptr_ring_empty(&pool->ring), "ptr_ring is not empty");
> >
> > -	/* Can happen due to forced shutdown */
> >  	if (!__page_pool_safe_to_destroy(pool))
> >  		__warn_in_flight(pool);  
> 
> If it's not safe to destroy, we shouldn't be getting here.

Don't make such assumptions. The API is going to be used by driver
developers, and they are always a little too creative...

The page_pool is a separate facility; it is not tied to the
xdp_rxq_info memory model.  Some drivers use page_pool directly, e.g.
drivers/net/ethernet/stmicro/stmmac.  This case can easily be triggered
when someone extends that driver.

 
> >  	ptr_ring_cleanup(&pool->ring, NULL);
> >
> > +	/* Make sure kernel will crash on use-after-free */
> > +	pool->ring.queue = NULL;
> > +	pool->alloc.cache[PP_ALLOC_CACHE_SIZE - 1] = NULL;
> > +	pool->alloc.count = PP_ALLOC_CACHE_SIZE;  
> 
> The pool is going to be freed.  This is useless code; if we're
> really concerned about use-after-free, the correct place for catching
> this is with the memory-allocator tools, not scattering things like
> this ad-hoc over the codebase.

No, I need this code here, because we kept the above WARN() and didn't
change that into a BUG().  It is obviously not a full solution for
use-after-free detection.  The memory subsystem has kmemleak to catch
this kind of stuff, but nobody runs this in production.  I need this
here to catch some obvious runtime cases.
Jonathan Lemon Nov. 9, 2019, 5:34 p.m. UTC | #3
On 9 Nov 2019, at 8:11, Jesper Dangaard Brouer wrote:

> On Fri, 08 Nov 2019 11:16:43 -0800
> "Jonathan Lemon" <jonathan.lemon@gmail.com> wrote:
>
>>> diff --git a/net/core/page_pool.c b/net/core/page_pool.c
>>> index 5bc65587f1c4..226f2eb30418 100644
>>> --- a/net/core/page_pool.c
>>> +++ b/net/core/page_pool.c
>>> @@ -346,7 +346,7 @@ static void __warn_in_flight(struct page_pool
>>> *pool)
>>>
>>>  	distance = _distance(hold_cnt, release_cnt);
>>>
>>> -	/* Drivers should fix this, but only problematic when DMA is used */
>>> +	/* BUG but warn as kernel should crash later */
>>>  	WARN(1, "Still in-flight pages:%d hold:%u released:%u",
>>>  	     distance, hold_cnt, release_cnt);
>
> Because this is kept as a WARN, I set pool->ring.queue = NULL later.

... which is also an API violation, reaching into the ring internals.
I strongly dislike this.


>>>  }
>>> @@ -360,12 +360,16 @@ void __page_pool_free(struct page_pool *pool)
>>>  	WARN(pool->alloc.count, "API usage violation");
>>>  	WARN(!ptr_ring_empty(&pool->ring), "ptr_ring is not empty");
>>>
>>> -	/* Can happen due to forced shutdown */
>>>  	if (!__page_pool_safe_to_destroy(pool))
>>>  		__warn_in_flight(pool);
>>
>> If it's not safe to destroy, we shouldn't be getting here.
>
> Don't make such assumptions. The API is going to be used by driver
> developer and they are always a little too creative...

If the driver hits this case, the driver has a bug, and it isn't
safe to continue in any fashion.  The developer needs to fix their
driver in that case.  (see stmmac code)


> The page_pool is a separate facility, it is not tied to the
> xdp_rxq_info memory model.  Some drivers use page_pool directly e.g.
> drivers/net/ethernet/stmicro/stmmac.  It can easily trigger this case,
> when some extend that driver.

Yes, and I pointed out that the mem_info should likely be completely
detached from xdp.c since it really has nothing to do with XDP.
The stmmac driver is actually broken at the moment, as it tries to
free the pool immediately without a timeout.

What should be happening is that drivers just call page_pool_destroy(),
which kicks off the shutdown process if this was the last user ref,
and delays destruction if packets are in flight.



>>>  	ptr_ring_cleanup(&pool->ring, NULL);
>>>
>>> +	/* Make sure kernel will crash on use-after-free */
>>> +	pool->ring.queue = NULL;
>>> +	pool->alloc.cache[PP_ALLOC_CACHE_SIZE - 1] = NULL;
>>> +	pool->alloc.count = PP_ALLOC_CACHE_SIZE;
>>
>> The pool is going to be freed.  This is useless code; if we're
>> really concerned about use-after-free, the correct place for catching
>> this is with the memory-allocator tools, not scattering things like
>> this ad-hoc over the codebase.
>
> No, I need this code here, because we kept the above WARN() and didn't
> change that into a BUG().  It is obviously not a full solution for
> use-after-free detection.  The memory subsystem have kmemleak to catch
> this kind of stuff, but nobody runs this in production.  I need this
> here to catch some obvious runtime cases.

The WARN() indicates something went off the rails already.  I really
don't like half-assed solutions like the above; it may or may not work
properly.  If it doesn't work properly, then what's the point?
Jesper Dangaard Brouer Nov. 10, 2019, 7:59 a.m. UTC | #4
On Sat, 09 Nov 2019 09:34:50 -0800
"Jonathan Lemon" <jonathan.lemon@gmail.com> wrote:

> On 9 Nov 2019, at 8:11, Jesper Dangaard Brouer wrote:
> 
> > On Fri, 08 Nov 2019 11:16:43 -0800
> > "Jonathan Lemon" <jonathan.lemon@gmail.com> wrote:
> >  
> >>> diff --git a/net/core/page_pool.c b/net/core/page_pool.c
> >>> index 5bc65587f1c4..226f2eb30418 100644
> >>> --- a/net/core/page_pool.c
> >>> +++ b/net/core/page_pool.c
> >>> @@ -346,7 +346,7 @@ static void __warn_in_flight(struct page_pool
> >>> *pool)
> >>>
> >>>  	distance = _distance(hold_cnt, release_cnt);
> >>>
> >>> -	/* Drivers should fix this, but only problematic when DMA is used */
> >>> +	/* BUG but warn as kernel should crash later */
> >>>  	WARN(1, "Still in-flight pages:%d hold:%u released:%u",
> >>>  	     distance, hold_cnt, release_cnt);  
> >
> > Because this is kept as a WARN, I set pool->ring.queue = NULL later.  
> 
> ... which is also an API violation, reaching into the ring internals.
> I strongly dislike this.

I understand your dislike of reaching into ptr_ring "internals".
But my plan was to add this here, and then in a followup patch move this
pool->ring.queue=NULL into the ptr_ring.

 
> >>>  }
> >>> @@ -360,12 +360,16 @@ void __page_pool_free(struct page_pool *pool)
> >>>  	WARN(pool->alloc.count, "API usage violation");
> >>>  	WARN(!ptr_ring_empty(&pool->ring), "ptr_ring is not empty");
> >>>
> >>> -	/* Can happen due to forced shutdown */
> >>>  	if (!__page_pool_safe_to_destroy(pool))
> >>>  		__warn_in_flight(pool);  
> >>
> >> If it's not safe to destroy, we shouldn't be getting here.  
> >
> > Don't make such assumptions. The API is going to be used by driver
> > developer and they are always a little too creative...  
> 
> If the driver hits this case, the driver has a bug, and it isn't
> safe to continue in any fashion.  The developer needs to fix their
> driver in that case.  (see stmmac code)

The stmmac driver is NOT broken; they simply use page_pool as their
driver-level page-cache.  That is exactly what page_pool was designed
for: creating a generic page-cache for drivers to use.  They use this
to simplify their driver.  They don't use the advanced features, which
require hooking into mem model reg.

> 
> > The page_pool is a separate facility, it is not tied to the
> > xdp_rxq_info memory model.  Some drivers use page_pool directly e.g.
> > drivers/net/ethernet/stmicro/stmmac.  It can easily trigger this case,
> > when some extend that driver.  
> 
> Yes, and I pointed out that the mem_info should likely be completely
> detached from xdp.c since it really has nothing to do with XDP.
> The stmmac driver is actually broken at the moment, as it tries to
> free the pool immediately without a timeout.
> 
> What should be happening is that drivers just call page_pool_destroy(),
> which kicks off the shutdown process if this was the last user ref,
> and delays destruction if packets are in flight.

Sorry, but I'm getting frustrated with you. I've already explained to you
(offlist) that the memory model reg/unreg system has been created to
support multiple memory models (even per RX-queue).  We already have
AF_XDP zero copy, but I actually want to keep the flexibility and add
more in the future.

 
> >>>  	ptr_ring_cleanup(&pool->ring, NULL);
> >>>
> >>> +	/* Make sure kernel will crash on use-after-free */
> >>> +	pool->ring.queue = NULL;
> >>> +	pool->alloc.cache[PP_ALLOC_CACHE_SIZE - 1] = NULL;
> >>> +	pool->alloc.count = PP_ALLOC_CACHE_SIZE;  
> >>
> >> The pool is going to be freed.  This is useless code; if we're
> >> really concerned about use-after-free, the correct place for catching
> >> this is with the memory-allocator tools, not scattering things like
> >> this ad-hoc over the codebase.  
> >
> > No, I need this code here, because we kept the above WARN() and didn't
> > change that into a BUG().  It is obviously not a full solution for
> > use-after-free detection.  The memory subsystem have kmemleak to catch
> > this kind of stuff, but nobody runs this in production.  I need this
> > here to catch some obvious runtime cases.  
> 
> The WARN() indicates something went off the rails already.  I really
> don't like half-assed solutions like the above; it may or may not work
> properly.  If it doesn't work properly, then what's the point?

So, you are suggesting to use BUG_ON() instead and crash the kernel
immediately... you do know Linus hates when we do that, right?
Jonathan Lemon Nov. 10, 2019, 7:56 p.m. UTC | #5
On 9 Nov 2019, at 23:59, Jesper Dangaard Brouer wrote:

> On Sat, 09 Nov 2019 09:34:50 -0800
> "Jonathan Lemon" <jonathan.lemon@gmail.com> wrote:
>
>> On 9 Nov 2019, at 8:11, Jesper Dangaard Brouer wrote:
>>
>>> On Fri, 08 Nov 2019 11:16:43 -0800
>>> "Jonathan Lemon" <jonathan.lemon@gmail.com> wrote:
>>>
>>>>> diff --git a/net/core/page_pool.c b/net/core/page_pool.c
>>>>> index 5bc65587f1c4..226f2eb30418 100644
>>>>> --- a/net/core/page_pool.c
>>>>> +++ b/net/core/page_pool.c
>>>>> @@ -346,7 +346,7 @@ static void __warn_in_flight(struct page_pool
>>>>> *pool)
>>>>>
>>>>>  	distance = _distance(hold_cnt, release_cnt);
>>>>>
>>>>> -	/* Drivers should fix this, but only problematic when DMA is used */
>>>>> +	/* BUG but warn as kernel should crash later */
>>>>>  	WARN(1, "Still in-flight pages:%d hold:%u released:%u",
>>>>>  	     distance, hold_cnt, release_cnt);
>>>
>>> Because this is kept as a WARN, I set pool->ring.queue = NULL later.
>>
>> ... which is also an API violation, reaching into the ring internals.
>> I strongly dislike this.
>
> I understand your dislike of reaching into ptr_ring "internals".
> But my plan was to add this here, and then in a followup patch move this
> pool->ring.queue=NULL into the ptr_ring.
>
>
>>>>>  }
>>>>> @@ -360,12 +360,16 @@ void __page_pool_free(struct page_pool *pool)
>>>>>  	WARN(pool->alloc.count, "API usage violation");
>>>>>  	WARN(!ptr_ring_empty(&pool->ring), "ptr_ring is not empty");
>>>>>
>>>>> -	/* Can happen due to forced shutdown */
>>>>>  	if (!__page_pool_safe_to_destroy(pool))
>>>>>  		__warn_in_flight(pool);
>>>>
>>>> If it's not safe to destroy, we shouldn't be getting here.
>>>
>>> Don't make such assumptions. The API is going to be used by driver
>>> developer and they are always a little too creative...
>>
>> If the driver hits this case, the driver has a bug, and it isn't
>> safe to continue in any fashion.  The developer needs to fix their
>> driver in that case.  (see stmmac code)
>
> The stmmac driver is NOT broken, they simply use page_pool as their
> driver level page-cache.  That is exactly what page_pool was designed
> for, creating a generic page-cache for drivers to use.  They use this
> to simplify their driver.  They don't use the advanced features, which
> requires hooking into mem model reg.

We both know that Ilias is working on extending the lifetime of the
page mapping so it covers the time the page is held by the skb while
it transits the stack.  This work requires a timeout feature of some
sort so the pool is not destroyed until the total inflight packet count
hits zero.  This is introduced in 2050eae626bd7a6591abbf17e26f706a700b201b

Now, while it could be true that the driver is not utilizing this right
now, nor the "advanced" features, as you call it, as soon as it does, then
there is an issue.

I also think you're arguing both sides here - either the driver is not
broken, which means it's safe to destroy the pool without checking anything,
or "The API is going to be used by driver developer and they are always a
little too creative", which means the driver has a bug and there needs to
be a mechanism to handle this.



>>
>>> The page_pool is a separate facility, it is not tied to the
>>> xdp_rxq_info memory model.  Some drivers use page_pool directly e.g.
>>> drivers/net/ethernet/stmicro/stmmac.  It can easily trigger this case,
>>> when some extend that driver.
>>
>> Yes, and I pointed out that the mem_info should likely be completely
>> detached from xdp.c since it really has nothing to do with XDP.
>> The stmmac driver is actually broken at the moment, as it tries to
>> free the pool immediately without a timeout.
>>
>> What should be happening is that drivers just call page_pool_destroy(),
>> which kicks off the shutdown process if this was the last user ref,
>> and delays destruction if packets are in flight.
>
> Sorry, but I'm getting frustrated with you. I've already explained you
> (offlist), that the memory model reg/unreg system have been created to
> support multiple memory models (even per RX-queue).  We already have
> AF_XDP zero copy, but I actually want to keep the flexibility and add
> more in the future.

Again, I'm not sure what your point is here.  I have no problem with the
xdp memory models.  However, the memory models are a consumer of the pool,
and the pool should be independent of the memory model.  In other words,
it should be possible for me to use the pool and a timeout feature without
having to bother with xdp memory models at all.  Later, if I want to have
the xdp features use the pool, then I can also do that:

Use case 1:
  create pool.
  get page from pool.
  attach page to skb.
  send skb up to stack.
  skb is freed, returned to pool.

(no xdp logic is required here)


Use case 2:
  create pool
  create xdp memory model
  attach mem model to pool
  get page from pool
  send page out via xdp
  return page to xdp model.
  xdp memory model returns page to pool


Use case 3:
  create pool
  get page from pool
  copy data from page into skb
  return page to pool

(no timeout/lifetime is required IF things work correctly)

In cases 1 and 2 a timeout mechanism is required.  In all cases, it
is not safe to free the pool if the inflight counter is not 0.  So
where is the problem?  If the inflight counter is not 0 for cases
1 and 2, we cannot destroy the pool.  For case 3, there shouldn't be
outstanding packets (except for a driver bug) so the delayed
destruction never triggers.  In the case of a driver bug, the pool
destruction is permanently delayed, and there is no crash, and no
use-after-free.


>
>>>>>  	ptr_ring_cleanup(&pool->ring, NULL);
>>>>>
>>>>> +	/* Make sure kernel will crash on use-after-free */
>>>>> +	pool->ring.queue = NULL;
>>>>> +	pool->alloc.cache[PP_ALLOC_CACHE_SIZE - 1] = NULL;
>>>>> +	pool->alloc.count = PP_ALLOC_CACHE_SIZE;
>>>>
>>>> The pool is going to be freed.  This is useless code; if we're
>>>> really concerned about use-after-free, the correct place for catching
>>>> this is with the memory-allocator tools, not scattering things like
>>>> this ad-hoc over the codebase.
>>>
>>> No, I need this code here, because we kept the above WARN() and didn't
>>> change that into a BUG().  It is obviously not a full solution for
>>> use-after-free detection.  The memory subsystem have kmemleak to catch
>>> this kind of stuff, but nobody runs this in production.  I need this
>>> here to catch some obvious runtime cases.
>>
>> The WARN() indicates something went off the rails already.  I really
>> don't like half-assed solutions like the above; it may or may not work
>> properly.  If it doesn't work properly, then what's the point?
>
> So, you are suggesting to use BUG_ON() instead and crash the kernel
> immediately... you do know Linus hates when we do that, right?

No, I'm suggesting that the delayed pool destruction is mandatory for
all cases, even "non-xdp cases", as explained above, so it handles driver
screwups safely, and avoids the use-after-free case completely.

The pool itself knows it can't be freed (because it maintains the packet
in flight counter), so the pool itself should be responsible for its
delayed destruction, not external code (xdp).  This then sidesteps the
entire "crash or poison things if there was a use after free".
diff mbox series

Patch

diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h
index c7e3c9c5bad3..a3ead2b1f00e 100644
--- a/include/trace/events/xdp.h
+++ b/include/trace/events/xdp.h
@@ -318,9 +318,9 @@  __MEM_TYPE_MAP(__MEM_TYPE_TP_FN)
 TRACE_EVENT(mem_disconnect,
 
 	TP_PROTO(const struct xdp_mem_allocator *xa,
-		 bool safe_to_remove, bool force),
+		 bool safe_to_remove),
 
-	TP_ARGS(xa, safe_to_remove, force),
+	TP_ARGS(xa, safe_to_remove),
 
 	TP_STRUCT__entry(
 		__field(const struct xdp_mem_allocator *,	xa)
@@ -328,7 +328,6 @@  TRACE_EVENT(mem_disconnect,
 		__field(u32,		mem_type)
 		__field(const void *,	allocator)
 		__field(bool,		safe_to_remove)
-		__field(bool,		force)
 		__field(int,		disconnect_cnt)
 	),
 
@@ -338,17 +337,15 @@  TRACE_EVENT(mem_disconnect,
 		__entry->mem_type	= xa->mem.type;
 		__entry->allocator	= xa->allocator;
 		__entry->safe_to_remove	= safe_to_remove;
-		__entry->force		= force;
 		__entry->disconnect_cnt	= xa->disconnect_cnt;
 	),
 
 	TP_printk("mem_id=%d mem_type=%s allocator=%p"
-		  " safe_to_remove=%s force=%s disconnect_cnt=%d",
+		  " safe_to_remove=%s disconnect_cnt=%d",
 		  __entry->mem_id,
 		  __print_symbolic(__entry->mem_type, __MEM_TYPE_SYM_TAB),
 		  __entry->allocator,
 		  __entry->safe_to_remove ? "true" : "false",
-		  __entry->force ? "true" : "false",
 		  __entry->disconnect_cnt
 	)
 );
@@ -387,32 +384,6 @@  TRACE_EVENT(mem_connect,
 	)
 );
 
-TRACE_EVENT(mem_return_failed,
-
-	TP_PROTO(const struct xdp_mem_info *mem,
-		 const struct page *page),
-
-	TP_ARGS(mem, page),
-
-	TP_STRUCT__entry(
-		__field(const struct page *,	page)
-		__field(u32,		mem_id)
-		__field(u32,		mem_type)
-	),
-
-	TP_fast_assign(
-		__entry->page		= page;
-		__entry->mem_id		= mem->id;
-		__entry->mem_type	= mem->type;
-	),
-
-	TP_printk("mem_id=%d mem_type=%s page=%p",
-		  __entry->mem_id,
-		  __print_symbolic(__entry->mem_type, __MEM_TYPE_SYM_TAB),
-		  __entry->page
-	)
-);
-
 #endif /* _TRACE_XDP_H */
 
 #include <trace/define_trace.h>
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index 5bc65587f1c4..226f2eb30418 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -346,7 +346,7 @@  static void __warn_in_flight(struct page_pool *pool)
 
 	distance = _distance(hold_cnt, release_cnt);
 
-	/* Drivers should fix this, but only problematic when DMA is used */
+	/* BUG but warn as kernel should crash later */
 	WARN(1, "Still in-flight pages:%d hold:%u released:%u",
 	     distance, hold_cnt, release_cnt);
 }
@@ -360,12 +360,16 @@  void __page_pool_free(struct page_pool *pool)
 	WARN(pool->alloc.count, "API usage violation");
 	WARN(!ptr_ring_empty(&pool->ring), "ptr_ring is not empty");
 
-	/* Can happen due to forced shutdown */
 	if (!__page_pool_safe_to_destroy(pool))
 		__warn_in_flight(pool);
 
 	ptr_ring_cleanup(&pool->ring, NULL);
 
+	/* Make sure kernel will crash on use-after-free */
+	pool->ring.queue = NULL;
+	pool->alloc.cache[PP_ALLOC_CACHE_SIZE - 1] = NULL;
+	pool->alloc.count = PP_ALLOC_CACHE_SIZE;
+
 	if (pool->p.flags & PP_FLAG_DMA_MAP)
 		put_device(pool->p.dev);
 
diff --git a/net/core/xdp.c b/net/core/xdp.c
index 20781ad5f9c3..8673f199d9f4 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -85,7 +85,7 @@  static void __xdp_mem_allocator_rcu_free(struct rcu_head *rcu)
 	kfree(xa);
 }
 
-static bool __mem_id_disconnect(int id, bool force)
+static bool __mem_id_disconnect(int id)
 {
 	struct xdp_mem_allocator *xa;
 	bool safe_to_remove = true;
@@ -104,30 +104,26 @@  static bool __mem_id_disconnect(int id, bool force)
 	if (xa->mem.type == MEM_TYPE_PAGE_POOL)
 		safe_to_remove = page_pool_request_shutdown(xa->page_pool);
 
-	trace_mem_disconnect(xa, safe_to_remove, force);
+	trace_mem_disconnect(xa, safe_to_remove);
 
-	if ((safe_to_remove || force) &&
+	if ((safe_to_remove) &&
 	    !rhashtable_remove_fast(mem_id_ht, &xa->node, mem_id_rht_params))
 		call_rcu(&xa->rcu, __xdp_mem_allocator_rcu_free);
 
 	mutex_unlock(&mem_id_lock);
-	return (safe_to_remove|force);
+	return safe_to_remove;
 }
 
-#define DEFER_TIME (msecs_to_jiffies(1000))
-#define DEFER_WARN_INTERVAL (30 * HZ)
-#define DEFER_MAX_RETRIES 120
+#define DEFER_TIME (msecs_to_jiffies(1000UL))
+#define DEFER_WARN_INTERVAL (600UL * HZ)
 
 static void mem_id_disconnect_defer_retry(struct work_struct *wq)
 {
 	struct delayed_work *dwq = to_delayed_work(wq);
 	struct xdp_mem_allocator *xa = container_of(dwq, typeof(*xa), defer_wq);
-	bool force = false;
+	unsigned long defer_time;
 
-	if (xa->disconnect_cnt > DEFER_MAX_RETRIES)
-		force = true;
-
-	if (__mem_id_disconnect(xa->mem.id, force))
+	if (__mem_id_disconnect(xa->mem.id))
 		return;
 
 	/* Periodic warning */
@@ -140,7 +136,8 @@  static void mem_id_disconnect_defer_retry(struct work_struct *wq)
 	}
 
 	/* Still not ready to be disconnected, retry later */
-	schedule_delayed_work(&xa->defer_wq, DEFER_TIME);
+	defer_time = min(DEFER_WARN_INTERVAL, DEFER_TIME * xa->disconnect_cnt);
+	schedule_delayed_work(&xa->defer_wq, defer_time);
 }
 
 void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq)
@@ -161,7 +158,7 @@  void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq)
 	if (id == 0)
 		return;
 
-	if (__mem_id_disconnect(id, false))
+	if (__mem_id_disconnect(id))
 		return;
 
 	/* Could not disconnect, defer new disconnect attempt to later */
@@ -402,15 +399,8 @@  static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct,
 		/* mem->id is valid, checked in xdp_rxq_info_reg_mem_model() */
 		xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params);
 		page = virt_to_head_page(data);
-		if (likely(xa)) {
-			napi_direct &= !xdp_return_frame_no_direct();
-			page_pool_put_page(xa->page_pool, page, napi_direct);
-		} else {
-			/* Hopefully stack show who to blame for late return */
-			WARN_ONCE(1, "page_pool gone mem.id=%d", mem->id);
-			trace_mem_return_failed(mem, page);
-			put_page(page);
-		}
+		napi_direct &= !xdp_return_frame_no_direct();
+		page_pool_put_page(xa->page_pool, page, napi_direct);
 		rcu_read_unlock();
 		break;
 	case MEM_TYPE_PAGE_SHARED: