
[v10,07/12] net/mlx4_en: add page recycle to prepare rx ring for tx support

Message ID 1468955817-10604-8-git-send-email-bblanco@plumgrid.com
State Accepted, archived
Delegated to: David Miller

Commit Message

Brenden Blanco July 19, 2016, 7:16 p.m. UTC
The mlx4 driver by default allocates order-3 pages for the ring to
consume in multiple fragments. When the device has an xdp program, this
behavior will prevent tx actions since the page must be re-mapped in
TODEVICE mode, which cannot be done if the page is still shared.

Start by making the allocator configurable based on whether xdp is
running, such that order-0 pages are always used and never shared.

Since this will stress the page allocator, add a simple page cache to
each rx ring. Pages in the cache are left dma-mapped, and in drop-only
stress tests the page allocator is eliminated from the perf report.

Note that setting an xdp program will now require the rings to be
reconfigured.

Before:
 26.91%  ksoftirqd/0  [mlx4_en]         [k] mlx4_en_process_rx_cq
 17.88%  ksoftirqd/0  [mlx4_en]         [k] mlx4_en_alloc_frags
  6.00%  ksoftirqd/0  [mlx4_en]         [k] mlx4_en_free_frag
  4.49%  ksoftirqd/0  [kernel.vmlinux]  [k] get_page_from_freelist
  3.21%  swapper      [kernel.vmlinux]  [k] intel_idle
  2.73%  ksoftirqd/0  [kernel.vmlinux]  [k] bpf_map_lookup_elem
  2.57%  swapper      [mlx4_en]         [k] mlx4_en_process_rx_cq

After:
 31.72%  swapper      [kernel.vmlinux]       [k] intel_idle
  8.79%  swapper      [mlx4_en]              [k] mlx4_en_process_rx_cq
  7.54%  swapper      [kernel.vmlinux]       [k] poll_idle
  6.36%  swapper      [mlx4_core]            [k] mlx4_eq_int
  4.21%  swapper      [kernel.vmlinux]       [k] tasklet_action
  4.03%  swapper      [kernel.vmlinux]       [k] cpuidle_enter_state
  3.43%  swapper      [mlx4_en]              [k] mlx4_en_prepare_rx_desc
  2.18%  swapper      [kernel.vmlinux]       [k] native_irq_return_iret
  1.37%  swapper      [kernel.vmlinux]       [k] menu_select
  1.09%  swapper      [kernel.vmlinux]       [k] bpf_map_lookup_elem

Signed-off-by: Brenden Blanco <bblanco@plumgrid.com>
---
 drivers/net/ethernet/mellanox/mlx4/en_netdev.c | 38 +++++++++++++-
 drivers/net/ethernet/mellanox/mlx4/en_rx.c     | 70 +++++++++++++++++++++++---
 drivers/net/ethernet/mellanox/mlx4/mlx4_en.h   | 11 +++-
 3 files changed, 109 insertions(+), 10 deletions(-)
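
For readers skimming the thread: the recycle cache described above is just a
tiny per-ring stack of still-DMA-mapped pages, popped on RX refill and pushed
when a frame is released. A minimal sketch of the idea (struct and field names
follow the driver code quoted later on this page; the helper functions are
illustrative only, the real logic is in the diff at the bottom):

/* Illustrative sketch only -- not the exact driver code. */
#define MLX4_EN_CACHE_SIZE (2 * NAPI_POLL_WEIGHT)

struct mlx4_en_page_cache {
        u32                     index;  /* number of cached entries */
        struct mlx4_en_rx_alloc buf[MLX4_EN_CACHE_SIZE];
};

/* Frame released (e.g. dropped by the bpf prog): keep the mapped page if
 * there is room; otherwise the caller unmaps and frees it as usual.
 */
static bool mlx4_en_cache_push(struct mlx4_en_page_cache *cache,
                               const struct mlx4_en_rx_alloc *frame)
{
        if (cache->index >= MLX4_EN_CACHE_SIZE)
                return false;
        cache->buf[cache->index++] = *frame;
        return true;
}

/* RX refill: reuse a cached, still-mapped page before hitting the page
 * allocator.
 */
static bool mlx4_en_cache_pop(struct mlx4_en_page_cache *cache,
                              struct mlx4_en_rx_alloc *frame)
{
        if (!cache->index)
                return false;   /* fall back to a fresh allocation */
        *frame = cache->buf[--cache->index];
        return true;
}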

Comments

Alexei Starovoitov July 19, 2016, 9:49 p.m. UTC | #1
On Tue, Jul 19, 2016 at 12:16:52PM -0700, Brenden Blanco wrote:
> The mlx4 driver by default allocates order-3 pages for the ring to
> consume in multiple fragments. When the device has an xdp program, this
> behavior will prevent tx actions since the page must be re-mapped in
> TODEVICE mode, which cannot be done if the page is still shared.
> 
> Start by making the allocator configurable based on whether xdp is
> running, such that order-0 pages are always used and never shared.
> 
> Since this will stress the page allocator, add a simple page cache to
> each rx ring. Pages in the cache are left dma-mapped, and in drop-only
> stress tests the page allocator is eliminated from the perf report.
> 
> Note that setting an xdp program will now require the rings to be
> reconfigured.
> 
> Before:
>  26.91%  ksoftirqd/0  [mlx4_en]         [k] mlx4_en_process_rx_cq
>  17.88%  ksoftirqd/0  [mlx4_en]         [k] mlx4_en_alloc_frags
>   6.00%  ksoftirqd/0  [mlx4_en]         [k] mlx4_en_free_frag
>   4.49%  ksoftirqd/0  [kernel.vmlinux]  [k] get_page_from_freelist
>   3.21%  swapper      [kernel.vmlinux]  [k] intel_idle
>   2.73%  ksoftirqd/0  [kernel.vmlinux]  [k] bpf_map_lookup_elem
>   2.57%  swapper      [mlx4_en]         [k] mlx4_en_process_rx_cq
> 
> After:
>  31.72%  swapper      [kernel.vmlinux]       [k] intel_idle
>   8.79%  swapper      [mlx4_en]              [k] mlx4_en_process_rx_cq
>   7.54%  swapper      [kernel.vmlinux]       [k] poll_idle
>   6.36%  swapper      [mlx4_core]            [k] mlx4_eq_int
>   4.21%  swapper      [kernel.vmlinux]       [k] tasklet_action
>   4.03%  swapper      [kernel.vmlinux]       [k] cpuidle_enter_state
>   3.43%  swapper      [mlx4_en]              [k] mlx4_en_prepare_rx_desc
>   2.18%  swapper      [kernel.vmlinux]       [k] native_irq_return_iret
>   1.37%  swapper      [kernel.vmlinux]       [k] menu_select
>   1.09%  swapper      [kernel.vmlinux]       [k] bpf_map_lookup_elem
> 
> Signed-off-by: Brenden Blanco <bblanco@plumgrid.com>
...
> +#define MLX4_EN_CACHE_SIZE (2 * NAPI_POLL_WEIGHT)
> +struct mlx4_en_page_cache {
> +	u32 index;
> +	struct mlx4_en_rx_alloc buf[MLX4_EN_CACHE_SIZE];
> +};

amazing that this tiny recycling pool makes such a huge difference.
Acked-by: Alexei Starovoitov <ast@kernel.org>
Eric Dumazet July 25, 2016, 7:35 a.m. UTC | #2
On Tue, 2016-07-19 at 12:16 -0700, Brenden Blanco wrote:
> The mlx4 driver by default allocates order-3 pages for the ring to
> consume in multiple fragments. When the device has an xdp program, this
> behavior will prevent tx actions since the page must be re-mapped in
> TODEVICE mode, which cannot be done if the page is still shared.
> 
> Start by making the allocator configurable based on whether xdp is
> running, such that order-0 pages are always used and never shared.
> 
> Since this will stress the page allocator, add a simple page cache to
> each rx ring. Pages in the cache are left dma-mapped, and in drop-only
> stress tests the page allocator is eliminated from the perf report.
> 
> Note that setting an xdp program will now require the rings to be
> reconfigured.

Again, this has nothing to do with XDP ?

Please submit a separate patch, switching this driver to order-0
allocations.

I mentioned this order-3 vs order-0 issue earlier [1], and proposed to
send a generic patch, but had been traveling lately, and currently in
vacation.

order-3 pages are problematic when dealing with hostile traffic anyway,
so we should exclusively use order-0 pages, and page recycling like
Intel drivers.

http://lists.openwall.net/netdev/2016/04/11/88
Alexei Starovoitov Aug. 3, 2016, 5:45 p.m. UTC | #3
On Mon, Jul 25, 2016 at 09:35:20AM +0200, Eric Dumazet wrote:
> On Tue, 2016-07-19 at 12:16 -0700, Brenden Blanco wrote:
> > The mlx4 driver by default allocates order-3 pages for the ring to
> > consume in multiple fragments. When the device has an xdp program, this
> > behavior will prevent tx actions since the page must be re-mapped in
> > TODEVICE mode, which cannot be done if the page is still shared.
> > 
> > Start by making the allocator configurable based on whether xdp is
> > running, such that order-0 pages are always used and never shared.
> > 
> > Since this will stress the page allocator, add a simple page cache to
> > each rx ring. Pages in the cache are left dma-mapped, and in drop-only
> > stress tests the page allocator is eliminated from the perf report.
> > 
> > Note that setting an xdp program will now require the rings to be
> > reconfigured.
> 
> Again, this has nothing to do with XDP ?
> 
> Please submit a separate patch, switching this driver to order-0
> allocations.
> 
> I mentioned this order-3 vs order-0 issue earlier [1], and proposed to
> send a generic patch, but had been traveling lately, and currently in
> vacation.
> 
> order-3 pages are problematic when dealing with hostile traffic anyway,
> so we should exclusively use order-0 pages, and page recycling like
> Intel drivers.
> 
> http://lists.openwall.net/netdev/2016/04/11/88

Completely agree. These multi-page tricks work only for benchmarks and
not for production.
Eric,
if you can submit that patch for mlx4 that would be awesome.

I think we should default to order-0 for both mlx4 and mlx5.
Alternatively we're thinking of doing a netlink or ethtool switch to
preserve the old behavior, but frankly I don't see who needs these order-N
allocation schemes.
Jesper Dangaard Brouer Aug. 4, 2016, 4:19 p.m. UTC | #4
On Wed, 3 Aug 2016 10:45:13 -0700 Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:

> On Mon, Jul 25, 2016 at 09:35:20AM +0200, Eric Dumazet wrote:
> > On Tue, 2016-07-19 at 12:16 -0700, Brenden Blanco wrote:  
> > > The mlx4 driver by default allocates order-3 pages for the ring to
> > > consume in multiple fragments. When the device has an xdp program, this
> > > behavior will prevent tx actions since the page must be re-mapped in
> > > TODEVICE mode, which cannot be done if the page is still shared.
> > > 
> > > Start by making the allocator configurable based on whether xdp is
> > > running, such that order-0 pages are always used and never shared.
> > > 
> > > Since this will stress the page allocator, add a simple page cache to
> > > each rx ring. Pages in the cache are left dma-mapped, and in drop-only
> > > stress tests the page allocator is eliminated from the perf report.
> > > 
> > > Note that setting an xdp program will now require the rings to be
> > > reconfigured.  
> > 
> > Again, this has nothing to do with XDP ?
> > 
> > Please submit a separate patch, switching this driver to order-0
> > allocations.
> > 
> > I mentioned this order-3 vs order-0 issue earlier [1], and proposed to
> > send a generic patch, but had been traveling lately, and currently in
> > vacation.
> > 
> > order-3 pages are problematic when dealing with hostile traffic anyway,
> > so we should exclusively use order-0 pages, and page recycling like
> > Intel drivers.
> > 
> > http://lists.openwall.net/netdev/2016/04/11/88  
> 
> Completely agree. These multi-page tricks work only for benchmarks and
> not for production.
> Eric, if you can submit that patch for mlx4 that would be awesome.
> 
> I think we should default to order-0 for both mlx4 and mlx5.
> Alternatively we're thinking to do a netlink or ethtool switch to
> preserve old behavior, but frankly I don't see who needs this order-N
> allocation schemes.

I actually agree, that we should switch to order-0 allocations.

*BUT* this will cause performance regressions on platforms with
expensive DMA operations (as they no longer amortize the cost of
mapping a larger page).

Plus, the base cost of an order-0 page is 246 cycles (see [1] slide#9),
and the 10G wirespeed target is approx 201 cycles.  Thus, for these
speeds some page recycling tricks are needed.  I described how the Intel
drivers do a cool trick in [1] slide#14, but it does not address the
DMA part and costs some extra atomic ops.
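
(Where the ~201 cycle budget comes from, for reference: 10 Gbit/s at the
minimum frame size of 64 bytes, plus preamble and inter-frame gap, is
~14.88 Mpps, i.e. 1/14.88e6 = ~67.2 ns per packet, which at ~3 GHz is
67.2 * 3 = ~201 cycles per packet.)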

I've started coding on the page-pool last week, which addresses both the
DMA mapping and recycling (with fewer atomic ops). (p.s. still on
vacation this week).

http://people.netfilter.org/hawk/presentations/MM-summit2016/generic_page_pool_mm_summit2016.pdf
Alexander H Duyck Aug. 5, 2016, 12:30 a.m. UTC | #5
On Thu, Aug 4, 2016 at 9:19 AM, Jesper Dangaard Brouer
<brouer@redhat.com> wrote:
>
> On Wed, 3 Aug 2016 10:45:13 -0700 Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:
>
>> On Mon, Jul 25, 2016 at 09:35:20AM +0200, Eric Dumazet wrote:
>> > On Tue, 2016-07-19 at 12:16 -0700, Brenden Blanco wrote:
>> > > The mlx4 driver by default allocates order-3 pages for the ring to
>> > > consume in multiple fragments. When the device has an xdp program, this
>> > > behavior will prevent tx actions since the page must be re-mapped in
>> > > TODEVICE mode, which cannot be done if the page is still shared.
>> > >
>> > > Start by making the allocator configurable based on whether xdp is
>> > > running, such that order-0 pages are always used and never shared.
>> > >
>> > > Since this will stress the page allocator, add a simple page cache to
>> > > each rx ring. Pages in the cache are left dma-mapped, and in drop-only
>> > > stress tests the page allocator is eliminated from the perf report.
>> > >
>> > > Note that setting an xdp program will now require the rings to be
>> > > reconfigured.
>> >
>> > Again, this has nothing to do with XDP ?
>> >
>> > Please submit a separate patch, switching this driver to order-0
>> > allocations.
>> >
>> > I mentioned this order-3 vs order-0 issue earlier [1], and proposed to
>> > send a generic patch, but had been traveling lately, and currently in
>> > vacation.
>> >
>> > order-3 pages are problematic when dealing with hostile traffic anyway,
>> > so we should exclusively use order-0 pages, and page recycling like
>> > Intel drivers.
>> >
>> > http://lists.openwall.net/netdev/2016/04/11/88
>>
>> Completely agree. These multi-page tricks work only for benchmarks and
>> not for production.
>> Eric, if you can submit that patch for mlx4 that would be awesome.
>>
>> I think we should default to order-0 for both mlx4 and mlx5.
>> Alternatively we're thinking to do a netlink or ethtool switch to
>> preserve old behavior, but frankly I don't see who needs this order-N
>> allocation schemes.
>
> I actually agree, that we should switch to order-0 allocations.
>
> *BUT* this will cause performance regressions on platforms with
> expensive DMA operations (as they no longer amortize the cost of
> mapping a larger page).

The trick is to use page reuse like we do for the Intel NICs.  If you
can get away with just reusing the page you don't have to keep making
the expensive map/unmap calls.
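
(For readers who have not looked at the Intel RX path, the reuse pattern is
roughly the following; a simplified sketch with made-up names, assuming the
4K-page / two-halves-per-page case and omitting error handling:)

struct rx_buffer {
        struct page     *page;
        dma_addr_t      dma;            /* page mapped once, at ring setup */
        unsigned int    page_offset;    /* 0 or 2048: which half is in use */
};

/* Per frame: no dma_map/dma_unmap, only a sync of the half the NIC wrote. */
static void rx_buffer_pull(struct device *dev, struct rx_buffer *buf)
{
        dma_sync_single_range_for_cpu(dev, buf->dma, buf->page_offset,
                                      2048, DMA_FROM_DEVICE);
        /* ...attach (page, page_offset, len) to the skb; the ring's page
         * reference is handed over together with the fragment...
         */
}

/* If we are the only remaining owner, flip to the other half and reuse. */
static bool rx_buffer_try_reuse(struct device *dev, struct rx_buffer *buf)
{
        if (page_ref_count(buf->page) != 1)
                return false;           /* a previous frame is still in flight */

        buf->page_offset ^= 2048;       /* flip to the currently unused half */
        page_ref_inc(buf->page);        /* take a fresh reference for the ring */
        dma_sync_single_range_for_device(dev, buf->dma, buf->page_offset,
                                         2048, DMA_FROM_DEVICE);
        return true;
}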

> Plus, the base cost of order-0 page is 246 cycles (see [1] slide#9),
> and the 10G wirespeed target is approx 201 cycles.  Thus, for these
> speeds some page recycling tricks are needed.  I described how the Intel
> drives does a cool trick in [1] slide#14, but it does not address the
> DMA part and costs some extra atomic ops.

I'm not sure what you mean about it not addressing the DMA part.  Last
I knew we should be just as fast using the page reuse in the Intel
drivers as the Mellanox driver using the 32K page.  The only real
difference in cost is the spot where we are atomically incrementing
the page count since that is the atomic I assume you are referring to.

I had thought about it, and amortizing the atomic operation would
probably be pretty straightforward.  All we would have to do is the
same trick we use in the page frag allocator.  We could add a separate
page_count type variable to the Rx buffer info structure and decrement
that instead.  If I am not mistaken that would allow us to drop it
down to only one atomic update of the page count every 64K or so uses
of the page.
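
(Presumably something along these lines; the field and helper names are made
up, and the point is just to batch the refcount update the way the page frag
allocator already does:)

struct rx_buffer_info {
        struct page     *page;
        dma_addr_t      dma;
        unsigned int    page_offset;
        unsigned int    pagecnt_bias;   /* references we still hold, tracked locally */
};

#define RX_PAGE_PRECHARGE USHRT_MAX     /* ~64K uses per atomic refcount update */

/* Called with the page refcount at 1, i.e. only our own reference left. */
static void rx_page_precharge(struct rx_buffer_info *bi)
{
        page_ref_add(bi->page, RX_PAGE_PRECHARGE - 1);  /* the one atomic op */
        bi->pagecnt_bias = RX_PAGE_PRECHARGE;
}

/* Per frame: hand one pre-taken reference to the skb; no atomic needed. */
static void rx_page_give_frag(struct rx_buffer_info *bi)
{
        bi->pagecnt_bias--;
}

/* The page can go back into the ring only if every reference we handed out
 * has already been dropped again, i.e. refcount == our local bias.
 */
static bool rx_page_can_reuse(struct rx_buffer_info *bi)
{
        if (page_ref_count(bi->page) != bi->pagecnt_bias)
                return false;
        if (unlikely(bi->pagecnt_bias == 1))
                rx_page_precharge(bi);  /* re-arm before the bias runs out */
        return true;
}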

> I've started coding on the page-pool last week, which address both the
> DMA mapping and recycling (with less atomic ops). (p.s. still on
> vacation this week).
>
> http://people.netfilter.org/hawk/presentations/MM-summit2016/generic_page_pool_mm_summit2016.pdf

I really wonder if we couldn't get away with creating some sort of 2
tiered allocator for this.  So instead of allocating a page pool we
just reserved blocks of memory like we do with huge pages.  Then you
have essentially a huge page that is mapped to a given device for DMA
and reserved for it to use as a memory resource to allocate the order
0 pages out of.  Doing it that way would likely have multiple
advantages when working with things like IOMMU since the pages would
all belong to one linear block so it would likely consume less
resources on those devices, and it wouldn't be that far off from how
DPDK is making use of huge pages in order to improve its memory
access times and such.

- Alex
Alexei Starovoitov Aug. 5, 2016, 3:55 a.m. UTC | #6
On Thu, Aug 04, 2016 at 05:30:56PM -0700, Alexander Duyck wrote:
> On Thu, Aug 4, 2016 at 9:19 AM, Jesper Dangaard Brouer
> <brouer@redhat.com> wrote:
> >
> > On Wed, 3 Aug 2016 10:45:13 -0700 Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:
> >
> >> On Mon, Jul 25, 2016 at 09:35:20AM +0200, Eric Dumazet wrote:
> >> > On Tue, 2016-07-19 at 12:16 -0700, Brenden Blanco wrote:
> >> > > The mlx4 driver by default allocates order-3 pages for the ring to
> >> > > consume in multiple fragments. When the device has an xdp program, this
> >> > > behavior will prevent tx actions since the page must be re-mapped in
> >> > > TODEVICE mode, which cannot be done if the page is still shared.
> >> > >
> >> > > Start by making the allocator configurable based on whether xdp is
> >> > > running, such that order-0 pages are always used and never shared.
> >> > >
> >> > > Since this will stress the page allocator, add a simple page cache to
> >> > > each rx ring. Pages in the cache are left dma-mapped, and in drop-only
> >> > > stress tests the page allocator is eliminated from the perf report.
> >> > >
> >> > > Note that setting an xdp program will now require the rings to be
> >> > > reconfigured.
> >> >
> >> > Again, this has nothing to do with XDP ?
> >> >
> >> > Please submit a separate patch, switching this driver to order-0
> >> > allocations.
> >> >
> >> > I mentioned this order-3 vs order-0 issue earlier [1], and proposed to
> >> > send a generic patch, but had been traveling lately, and currently in
> >> > vacation.
> >> >
> >> > order-3 pages are problematic when dealing with hostile traffic anyway,
> >> > so we should exclusively use order-0 pages, and page recycling like
> >> > Intel drivers.
> >> >
> >> > http://lists.openwall.net/netdev/2016/04/11/88
> >>
> >> Completely agree. These multi-page tricks work only for benchmarks and
> >> not for production.
> >> Eric, if you can submit that patch for mlx4 that would be awesome.
> >>
> >> I think we should default to order-0 for both mlx4 and mlx5.
> >> Alternatively we're thinking to do a netlink or ethtool switch to
> >> preserve old behavior, but frankly I don't see who needs this order-N
> >> allocation schemes.
> >
> > I actually agree, that we should switch to order-0 allocations.
> >
> > *BUT* this will cause performance regressions on platforms with
> > expensive DMA operations (as they no longer amortize the cost of
> > mapping a larger page).

order-0 is mainly about correctness under memory pressure.
As Eric pointed out order-N is a serious issue for hostile traffic,
but even for normal traffic it's a problem. Sooner or later
only order-0 pages will be available.
Performance considerations come second.

> The trick is to use page reuse like we do for the Intel NICs.  If you
> can get away with just reusing the page you don't have to keep making
> the expensive map/unmap calls.

you mean the two packets per page trick?
I think it's trading off performance vs memory.
It's useful. I wish there was a knob to turn it on/off instead
of relying on the mtu size threshold.

> > I've started coding on the page-pool last week, which address both the
> > DMA mapping and recycling (with less atomic ops). (p.s. still on
> > vacation this week).
> >
> > http://people.netfilter.org/hawk/presentations/MM-summit2016/generic_page_pool_mm_summit2016.pdf
> 
> I really wonder if we couldn't get away with creating some sort of 2
> tiered allocator for this.  So instead of allocating a page pool we
> just reserved blocks of memory like we do with huge pages.  Then you
> have essentially a huge page that is mapped to a given device for DMA
> and reserved for it to use as a memory resource to allocate the order
> 0 pages out of.  Doing it that way would likely have multiple
> advantages when working with things like IOMMU since the pages would
> all belong to one linear block so it would likely consume less
> resources on those devices, and it wouldn't be that far off from how
> DPDK is making use of huge pages in order to improve it's memory
> access times and such.

interesting idea. Like dma_map 1GB region and then allocate
pages from it only? but the rest of the kernel won't be able
to use them? so only some smaller region then? or it will be
a boot time flag to reserve this pseudo-huge page?
I don't think any of that is needed for XDP. As demonstrated by current
mlx4 it's very fast already. No bottlenecks in page allocators.
Tiny page recycle array does the magic because most of the traffic
is not going to the stack.
This order-0 vs order-N discussion is for the main stack.
Not related to XDP.
Eric Dumazet Aug. 5, 2016, 7:15 a.m. UTC | #7
On Thu, 2016-08-04 at 18:19 +0200, Jesper Dangaard Brouer wrote:

> I actually agree, that we should switch to order-0 allocations.
> 
> *BUT* this will cause performance regressions on platforms with
> expensive DMA operations (as they no longer amortize the cost of
> mapping a larger page).


We much prefer reliable behavior, even if it is ~1 % slower than the
super-optimized thing that opens highways for attackers.

Anyway, in most cases pages are re-used, so we only call
dma_sync_single_range_for_cpu(), and there is no way to avoid this.

Using order-0 pages [1] is actually faster, since when we use high-order
pages (multiple frames per 'page') we can not reuse the pages.

[1] I had a local patch to allocate these pages using a very simple
allocator allocating max order (order-10) pages and splitting them into
order-0 pages, in order to lower TLB footprint. But I could not measure a
gain doing so on x86, at least on my lab machines.
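
(For the curious, the splitting trick in [1] presumably looks something like
the sketch below; invented names, no claim that it matches the actual local
patch.  split_page() turns one non-compound high-order allocation into
independent order-0 pages:)

struct order0_pool {
        struct page     *pages[1 << (MAX_ORDER - 1)];
        unsigned int    count;
};

static int order0_pool_refill(struct order0_pool *pool, gfp_t gfp)
{
        unsigned int order = MAX_ORDER - 1;     /* order-10: 4MB with 4K pages */
        struct page *page;
        unsigned int i;

        page = alloc_pages(gfp, order);         /* note: no __GFP_COMP */
        if (!page)
                return -ENOMEM;

        split_page(page, order);        /* now 1024 free-standing order-0 pages */
        for (i = 0; i < (1U << order); i++)
                pool->pages[pool->count++] = page + i;
        return 0;
}

static struct page *order0_pool_get(struct order0_pool *pool, gfp_t gfp)
{
        if (!pool->count && order0_pool_refill(pool, gfp))
                return alloc_page(gfp);         /* fall back to the normal path */
        return pool->pages[--pool->count];
}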
Alexander H Duyck Aug. 5, 2016, 3:15 p.m. UTC | #8
On Thu, Aug 4, 2016 at 8:55 PM, Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
> On Thu, Aug 04, 2016 at 05:30:56PM -0700, Alexander Duyck wrote:
>> On Thu, Aug 4, 2016 at 9:19 AM, Jesper Dangaard Brouer
>> <brouer@redhat.com> wrote:
>> >
>> > On Wed, 3 Aug 2016 10:45:13 -0700 Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:
>> >
>> >> On Mon, Jul 25, 2016 at 09:35:20AM +0200, Eric Dumazet wrote:
>> >> > On Tue, 2016-07-19 at 12:16 -0700, Brenden Blanco wrote:
>> >> > > The mlx4 driver by default allocates order-3 pages for the ring to
>> >> > > consume in multiple fragments. When the device has an xdp program, this
>> >> > > behavior will prevent tx actions since the page must be re-mapped in
>> >> > > TODEVICE mode, which cannot be done if the page is still shared.
>> >> > >
>> >> > > Start by making the allocator configurable based on whether xdp is
>> >> > > running, such that order-0 pages are always used and never shared.
>> >> > >
>> >> > > Since this will stress the page allocator, add a simple page cache to
>> >> > > each rx ring. Pages in the cache are left dma-mapped, and in drop-only
>> >> > > stress tests the page allocator is eliminated from the perf report.
>> >> > >
>> >> > > Note that setting an xdp program will now require the rings to be
>> >> > > reconfigured.
>> >> >
>> >> > Again, this has nothing to do with XDP ?
>> >> >
>> >> > Please submit a separate patch, switching this driver to order-0
>> >> > allocations.
>> >> >
>> >> > I mentioned this order-3 vs order-0 issue earlier [1], and proposed to
>> >> > send a generic patch, but had been traveling lately, and currently in
>> >> > vacation.
>> >> >
>> >> > order-3 pages are problematic when dealing with hostile traffic anyway,
>> >> > so we should exclusively use order-0 pages, and page recycling like
>> >> > Intel drivers.
>> >> >
>> >> > http://lists.openwall.net/netdev/2016/04/11/88
>> >>
>> >> Completely agree. These multi-page tricks work only for benchmarks and
>> >> not for production.
>> >> Eric, if you can submit that patch for mlx4 that would be awesome.
>> >>
>> >> I think we should default to order-0 for both mlx4 and mlx5.
>> >> Alternatively we're thinking to do a netlink or ethtool switch to
>> >> preserve old behavior, but frankly I don't see who needs this order-N
>> >> allocation schemes.
>> >
>> > I actually agree, that we should switch to order-0 allocations.
>> >
>> > *BUT* this will cause performance regressions on platforms with
>> > expensive DMA operations (as they no longer amortize the cost of
>> > mapping a larger page).
>
> order-0 is mainly about correctness under memory pressure.
> As Eric pointed out order-N is a serious issue for hostile traffic,
> but even for normal traffic it's a problem. Sooner or later
> only order-0 pages will be available.
> Performance considerations come second.
>
>> The trick is to use page reuse like we do for the Intel NICs.  If you
>> can get away with just reusing the page you don't have to keep making
>> the expensive map/unmap calls.
>
> you mean two packet per page trick?
> I think it's trading off performance vs memory.
> It's useful. I wish there was a knob to turn it on/off instead
> of relying on mtu size threshold.

The MTU size doesn't really play a role in the Intel drivers with
regard to page reuse anymore.  We are pretty much just treating the
page as a pair of 2K buffers.  It does have some disadvantages in that
we cannot pack the frames as tightly in the case of jumbo frames with
GRO, but at the same time jumbo frames are just not that common.

>> > I've started coding on the page-pool last week, which address both the
>> > DMA mapping and recycling (with less atomic ops). (p.s. still on
>> > vacation this week).
>> >
>> > http://people.netfilter.org/hawk/presentations/MM-summit2016/generic_page_pool_mm_summit2016.pdf
>>
>> I really wonder if we couldn't get away with creating some sort of 2
>> tiered allocator for this.  So instead of allocating a page pool we
>> just reserved blocks of memory like we do with huge pages.  Then you
>> have essentially a huge page that is mapped to a given device for DMA
>> and reserved for it to use as a memory resource to allocate the order
>> 0 pages out of.  Doing it that way would likely have multiple
>> advantages when working with things like IOMMU since the pages would
>> all belong to one linear block so it would likely consume less
>> resources on those devices, and it wouldn't be that far off from how
>> DPDK is making use of huge pages in order to improve it's memory
>> access times and such.
>
> interesting idea. Like dma_map 1GB region and then allocate
> pages from it only? but the rest of the kernel won't be able
> to use them? so only some smaller region then? or it will be
> a boot time flag to reserve this pseudo-huge page?

Yeah, something like that.  If we were already talking about
allocating a pool of pages it might make sense to just setup something
like this where you could reserve a 1GB region for a single 10G device
for instance.  Then it would make the whole thing much easier to deal
with since you would have a block of memory that should perform very
well in terms of DMA accesses.

> I don't think any of that is needed for XDP. As demonstrated by current
> mlx4 it's very fast already. No bottlenecks in page allocators.
> Tiny page recycle array does the magic because most of the traffic
> is not going to the stack.

Agreed.  If you aren't handing the frames up we really don't
even have to bother.  In the Intel drivers for instance if the frame
size is less than 256 bytes we just copy the whole thing out since it
is cheaper to just extend the header copy rather than taking the extra
hit for get_page/put_page.
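
(The copybreak path is roughly the following; a simplified sketch with
invented names:)

#define RX_COPYBREAK 256

/* Small frames: copy the payload into a fresh skb and leave the RX page
 * untouched, so it can be reused without any get_page()/put_page().
 */
static struct sk_buff *rx_copybreak(struct napi_struct *napi,
                                    struct page *page, unsigned int offset,
                                    unsigned int len)
{
        struct sk_buff *skb;

        if (len > RX_COPYBREAK)
                return NULL;            /* caller attaches the page as a frag */

        skb = napi_alloc_skb(napi, len);
        if (!skb)
                return NULL;

        memcpy(skb_put(skb, len), page_address(page) + offset, len);
        return skb;                     /* the page stays mapped in the ring */
}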

> This order-0 vs order-N discussion is for the main stack.
> Not related to XDP.

Agreed.

- Alex
David Laight Aug. 5, 2016, 3:33 p.m. UTC | #9
From: Alexander Duyck
> Sent: 05 August 2016 16:15
...
> >
> > interesting idea. Like dma_map 1GB region and then allocate
> > pages from it only? but the rest of the kernel won't be able
> > to use them? so only some smaller region then? or it will be
> > a boot time flag to reserve this pseudo-huge page?
>
> Yeah, something like that.  If we were already talking about
> allocating a pool of pages it might make sense to just setup something
> like this where you could reserve a 1GB region for a single 10G device
> for instance.  Then it would make the whole thing much easier to deal
> with since you would have a block of memory that should perform very
> well in terms of DMA accesses.

ISTM that the main kernel allocator ought to be keeping a cache
of pages that are mapped into the various IOMMU.
This might be a per-driver cache, but could be much wider.

Then if some code wants such a page it can be allocated one that is
already mapped.
Under memory pressure the pages could then be reused for other purposes.

...
> In the Intel drivers for instance if the frame
> size is less than 256 bytes we just copy the whole thing out since it
> is cheaper to just extend the header copy rather than taking the extra
> hit for get_page/put_page.

How fast is 'rep movsb' (on cached addresses) on recent x86 cpu?
It might actually be worth unconditionally copying the entire frame
on those cpus.

A long time ago we found the breakeven point for the copy to be about
1kb on sparc mbus/sbus systems - and that might not have been aligning
the copy.

	David
Alexander H Duyck Aug. 5, 2016, 4 p.m. UTC | #10
On Fri, Aug 5, 2016 at 8:33 AM, David Laight <David.Laight@aculab.com> wrote:
> From: Alexander Duyck
>> Sent: 05 August 2016 16:15
> ...
>> >
>> > interesting idea. Like dma_map 1GB region and then allocate
>> > pages from it only? but the rest of the kernel won't be able
>> > to use them? so only some smaller region then? or it will be
>> > a boot time flag to reserve this pseudo-huge page?
>>
>> Yeah, something like that.  If we were already talking about
>> allocating a pool of pages it might make sense to just setup something
>> like this where you could reserve a 1GB region for a single 10G device
>> for instance.  Then it would make the whole thing much easier to deal
>> with since you would have a block of memory that should perform very
>> well in terms of DMA accesses.
>
> ISTM that the main kernel allocator ought to be keeping a cache
> of pages that are mapped into the various IOMMU.
> This might be a per-driver cache, but could be much wider.
>
> Then if some code wants such a page it can be allocated one that is
> already mapped.
> Under memory pressure the pages could then be reused for other purposes.
>
> ...
>> In the Intel drivers for instance if the frame
>> size is less than 256 bytes we just copy the whole thing out since it
>> is cheaper to just extend the header copy rather than taking the extra
>> hit for get_page/put_page.
>
> How fast is 'rep movsb' (on cached addresses) on recent x86 cpu?
> It might actually be worth unconditionally copying the entire frame
> on those cpus.

The cost of rep movsb on modern x86 is about 1 cycle for every 16
bytes, plus some fixed amount of setup time (so copying 256 bytes is
roughly 16 cycles of data movement).  The cost of something like an
atomic operation varies, but it is usually in the tens of cycles once
you factor in a get_page/put_page, which is why, as I recall, I ended
up going with 256 as the upper limit: it allowed for the best
performance without starting to incur any penalty.

> A long time ago we found the breakeven point for the copy to be about
> 1kb on sparc mbus/sbus systems - and that might not have been aligning
> the copy.

I wouldn't know about other architectures.

- Alex
Alexei Starovoitov Aug. 8, 2016, 2:15 a.m. UTC | #11
On Fri, Aug 05, 2016 at 09:15:33AM +0200, Eric Dumazet wrote:
> On Thu, 2016-08-04 at 18:19 +0200, Jesper Dangaard Brouer wrote:
> 
> > I actually agree, that we should switch to order-0 allocations.
> > 
> > *BUT* this will cause performance regressions on platforms with
> > expensive DMA operations (as they no longer amortize the cost of
> > mapping a larger page).
> 
> 
> We much prefer reliable behavior, even it it is ~1 % slower than the
> super-optimized thing that opens highways for attackers.

+1
It's more important to have deterministic performance at fresh boot
and after long uptime, when high order-N pages are gone.

> Anyway, in most cases pages are re-used, so we only call
> dma_sync_single_range_for_cpu(), and there is no way to avoid this.
> 
> Using order-0 pages [1] is actually faster, since when we use high-order
> pages (multiple frames per 'page') we can not reuse the pages.
> 
> [1] I had a local patch to allocate these pages using a very simple
> allocator allocating max order (order-10) pages and splitting them into
> order-0 ages, in order to lower TLB footprint. But I could not measure a
> gain doing so on x86, at least on my lab machines.

Which driver was that?
I suspect that should indeed be the case for any driver that
uses build_skb and <256 copybreak.

Saeed,
could you please share the performance numbers for mlx5 order-0 vs order-N ?
You mentioned that there was some performance improvement. We need to know
how much we'll lose when we turn off order-N.
Thanks!
Jesper Dangaard Brouer Aug. 8, 2016, 8:01 a.m. UTC | #12
On Sun, 7 Aug 2016 19:15:27 -0700 Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:

> On Fri, Aug 05, 2016 at 09:15:33AM +0200, Eric Dumazet wrote:
> > On Thu, 2016-08-04 at 18:19 +0200, Jesper Dangaard Brouer wrote:
> >   
> > > I actually agree, that we should switch to order-0 allocations.
> > > 
> > > *BUT* this will cause performance regressions on platforms with
> > > expensive DMA operations (as they no longer amortize the cost of
> > > mapping a larger page).  
> > 
> > 
> > We much prefer reliable behavior, even it it is ~1 % slower than the
> > super-optimized thing that opens highways for attackers.  
> 
> +1
> It's more important to have deterministic performance at fresh boot
> and after long uptime when high order-N are gone.

Yes, exactly. Doing high order-N page allocations might look good on
benchmarks on a freshly booted system, but once the page allocator gets
fragmented (after long uptime) the performance characteristics change.
(I discussed this with Christoph Lameter during the MM summit, and he has
seen issues with this kind of fragmentation in production.)


> > Anyway, in most cases pages are re-used, so we only call
> > dma_sync_single_range_for_cpu(), and there is no way to avoid this.
> > 
> > Using order-0 pages [1] is actually faster, since when we use high-order
> > pages (multiple frames per 'page') we can not reuse the pages.
> > 
> > [1] I had a local patch to allocate these pages using a very simple
> > allocator allocating max order (order-10) pages and splitting them into
> > order-0 ages, in order to lower TLB footprint. But I could not measure a
> > gain doing so on x86, at least on my lab machines.  
> 
> Which driver was that?
> I suspect that should indeed be the case for any driver that
> uses build_skb and <256 copybreak.
> 
> Saeed,
> could you please share the performance numbers for mlx5 order-0 vs order-N ?
> You mentioned that there was some performance improvement. We need to know
> how much we'll lose when we turn off order-N.

I'm not sure the comparison will be "fair" with the mlx5 driver, because
(1) the order-N page mode (MPWQE) is a hardware feature, and (2) the
order-0 page mode is done "wrongly" (by preallocating SKBs together
with RX ring entries).

AFAIK the MPWQE (Multi-Packet Work Queue Element), or Striding RQ, is a
hardware feature of ConnectX4-Lx.  Hence the need to support two modes
in the mlx5 driver.

Commit[1] 461017cb006a ("net/mlx5e: Support RX multi-packet WQE
(Striding RQ)") states this gives a 10-15% performance improvement for
netperf TCP stream (and ability to absorb bursty traffic).

 [1] https://git.kernel.org/torvalds/c/461017cb006


The MPWQE mode uses order-5 pages.  The critical question is: what
happens to the performance when order-5 allocations get slower (or
impossible) due to page fragmentation? (Notice that the page allocator
uses a central lock for order-N pages.)
Alexei Starovoitov Aug. 8, 2016, 6:34 p.m. UTC | #13
On Mon, Aug 08, 2016 at 10:01:15AM +0200, Jesper Dangaard Brouer wrote:
> 
> On Sun, 7 Aug 2016 19:15:27 -0700 Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:
> 
> > On Fri, Aug 05, 2016 at 09:15:33AM +0200, Eric Dumazet wrote:
> > > On Thu, 2016-08-04 at 18:19 +0200, Jesper Dangaard Brouer wrote:
> > >   
> > > > I actually agree, that we should switch to order-0 allocations.
> > > > 
> > > > *BUT* this will cause performance regressions on platforms with
> > > > expensive DMA operations (as they no longer amortize the cost of
> > > > mapping a larger page).  
> > > 
> > > 
> > > We much prefer reliable behavior, even it it is ~1 % slower than the
> > > super-optimized thing that opens highways for attackers.  
> > 
> > +1
> > It's more important to have deterministic performance at fresh boot
> > and after long uptime when high order-N are gone.
> 
> Yes, exactly. Doing high order-N pages allocations might look good on
> benchmarks on a freshly booted system, but once the page allocator gets
> fragmented (after long uptime) then performance characteristics change.
> (Discussed this with Christoph Lameter during MM-summit, and he have
> seen issues with this kind of fragmentation in production)
> 
> 
> > > Anyway, in most cases pages are re-used, so we only call
> > > dma_sync_single_range_for_cpu(), and there is no way to avoid this.
> > > 
> > > Using order-0 pages [1] is actually faster, since when we use high-order
> > > pages (multiple frames per 'page') we can not reuse the pages.
> > > 
> > > [1] I had a local patch to allocate these pages using a very simple
> > > allocator allocating max order (order-10) pages and splitting them into
> > > order-0 ages, in order to lower TLB footprint. But I could not measure a
> > > gain doing so on x86, at least on my lab machines.  
> > 
> > Which driver was that?
> > I suspect that should indeed be the case for any driver that
> > uses build_skb and <256 copybreak.
> > 
> > Saeed,
> > could you please share the performance numbers for mlx5 order-0 vs order-N ?
> > You mentioned that there was some performance improvement. We need to know
> > how much we'll lose when we turn off order-N.
> 
> I'm not sure the compare will be "fair" with the mlx5 driver, because
> (1) the N-order page mode (MPWQE) is a hardware feature, plus (2) the
> order-0 page mode is done "wrongly" (by preallocating SKBs together
> with RX ring entries).
> 
> AFAIK it is a hardware feature the MPQWE (Multi-Packet Work Queue
> Element) or Striding RQ, for ConnectX4-Lx.  Thus, the need to support
> two modes in the mlx5 driver.
> 
> Commit[1] 461017cb006a ("net/mlx5e: Support RX multi-packet WQE
> (Striding RQ)") states this gives a 10-15% performance improvement for
> netperf TCP stream (and ability to absorb bursty traffic).
> 
>  [1] https://git.kernel.org/torvalds/c/461017cb006

I suspect this 10% perf improvement is due to the build_skb approach
rather than to MPWQE, which works fine with order-0 pages as well.
The request for perf numbers was for mlx5 order-0 vs order-N _with_
build_skb. In other words, using MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ
with order-0.
The old mlx5e_handle_rx_cqe path should also be converted to build_skb
even when striding_rq is not available in hw; it's a win.
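
(For reference, the build_skb pattern in question: the NIC DMAs into a buffer
laid out as headroom + packet + room for skb_shared_info, and the skb then
wraps that buffer without copying.  A generic sketch, not mlx5 code:)

static struct sk_buff *rx_build_skb(struct page *page,
                                    unsigned int headroom, unsigned int len)
{
        /* buffer layout: [ headroom | packet data (len) | skb_shared_info ] */
        unsigned int frag_size = SKB_DATA_ALIGN(headroom + len) +
                                 SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
        struct sk_buff *skb = build_skb(page_address(page), frag_size);

        if (unlikely(!skb))
                return NULL;
        skb_reserve(skb, headroom);     /* the device wrote after the headroom */
        skb_put(skb, len);              /* no memcpy; the skb wraps the RX buffer */
        return skb;
}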

> The MPWQE mode, uses order-5 pages.  The critical question is: what
> happens to the performance when order-5 allocations gets slower (or
> impossible) due to page fragmentation? (Notice the page allocator uses
> a central lock for order-N pages)

It's supposed to fall back to order-0; see mlx5e_alloc_rx_fragmented_mpwqe.
That scares me a lot, since I don't see how such logic could have
been stress-tested, and we'll be hitting it in production.
Jesper Dangaard Brouer Aug. 9, 2016, 12:14 p.m. UTC | #14
> > On Sun, 7 Aug 2016 19:15:27 -0700 Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:
[...]
> > > could you please share the performance numbers for mlx5 order-0 vs order-N ?
> > > You mentioned that there was some performance improvement. We need to know
> > > how much we'll lose when we turn off order-N.  

There is a really easy way (after XDP) to benchmark this
order-0 vs order-N for the mlx4 driver.

I simply load an XDP program that returns XDP_PASS, because loading XDP
will reallocate the RX rings to use a single frame per packet and order-0
pages (for the RX ring slots).
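
(The XDP program used for this test is essentially trivial; a minimal sketch
using current libbpf conventions is below. Note that the actual test program,
as mentioned further down, also bumps a per-CPU map counter on each packet.)

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

SEC("xdp")
int xdp_pass_prog(struct xdp_md *ctx)
{
        /* do not touch the packet data; hand every frame to the normal stack */
        return XDP_PASS;
}

char _license[] SEC("license") = "GPL";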

Result summary: (order-3 pages) 4,453,022 -> (XDP_PASS) 3,295,798 pps
 * 3295798 - 4453022 = -1157224 pps slower
 * (3295798/4453022-1)*100 = -25.98% slower
 * (1/4453022-1/3295798)*10^9 = -78.85 nanosec slower
 * Approx convert nanosec to cycles (78.85 * 4GHz) = 315 cycles slower

Where does this performance regression originate from? Well, this
change basically only altered the page allocation strategy and the
number of DMA calls in the driver.  Thus, let's look at the performance
of the page allocator (see the tool Page_bench_ and MM_slides_ page 9).

On this machine:
 * Cost of order-0: 237 cycles(tsc)  59.336 ns
 * Cost of order-3: 423 cycles(tsc) 106.029 ns

The order-3 cost is amortized, as an order-3 page (32KB) can store 21
frames of size 1536, giving a per-fragment cost of 20 cycles / 5.049 ns.
Thus, I would expect to see a (59.336 - 5.049 =) 54.287 ns performance
reduction, not 78.85 ns, which is 24.563 ns higher than expected (extra
DMA maps cannot explain this on an Intel platform).

There is a higher percentage of L3/LLC-load-misses, which is strange,
as I thought the simple XDP program (inc map cnt and return XDP_PASS)
should not touch the data.  A quick experiment with an xdp-prog that
touches the data like xdp1, but always returns XDP_PASS, shows 3209235
pps, which is only 8 ns slower ((1/3209235-1/3295798)*10^9 = 8.184 ns).
Thus, the extra 24 ns (or 16 ns) might originate from an earlier
cache miss.

Conclusion: These measurements confirm that we need a page recycle
facility for the drivers before switching to order-0 allocations.


Links:

.. _Page_bench: https://github.com/netoptimizer/prototype-kernel/blob/master/kernel/mm/bench/page_bench01.c

.. _MM_slides: http://people.netfilter.org/hawk/presentations/MM-summit2016/generic_page_pool_mm_summit2016.odp



Benchmarking notes and perf results below:

Base setup:
 * Drop packets in iptables RAW
 * Disable Ethernet flow control
 * Disable GRO (changes driver code path)
 * Mlx4 NIC CX3-pro (mlx4_core log_num_mgm_entry_size=-2)
 * CPU: i7-4790K CPU @ 4.00GHz (turbostat report 4.3GHz)

Baseline: 4.7.0-baseline+ #102 SMP PREEMPT
 * instant rx:4558943 tx:0 pps n:162 average: rx:4453022 tx:0 pps
   (instant variation TX 0.000 ns (min:0.000 max:0.000) RX 5.217 ns)

Baseline perf stat::

 $ sudo perf stat -C3 -e L1-icache-load-misses -e cycles:k -e  instructions:k -e cache-misses:k -e   cache-references:k  -e LLC-store-misses:k -e LLC-store -e LLC-load-misses:k -e  LLC-load -r 5 sleep 1

Performance counter stats for 'CPU(s) 3' (5 runs) ::

       271,417  L1-icache-load-misses  ( +-  0.69% )  (33.32%)
 4,383,371,009  cycles:k               ( +-  0.01% )  (44.51%)
 7,587,502,193  instructions:k #  1.50  insns per cycle     (+- 0.01% )(55.62%)
     5,856,640  cache-misses:k # 48.435 % of all cache refs (+- 0.01% )(66.72%)
    12,091,854  cache-references:k                         ( +-  0.04%)(66.72%)
       451,681  LLC-store-misses                           ( +-  0.13%)(66.72%)
       463,152  LLC-store                                  ( +-  0.12%)(66.68%)
     5,408,934  LLC-load-misses # 47.26% of all LL-cache hits (0.01%) (22.19%)
    11,446,060  LLC-load                                 ( +-  0.04%) (22.19%)

 Samples: 40K of event 'cycles', Event count (approx.): 43956150960 ::
  Overhead  Command        Shared Object        Symbol
 +   36.59%  ksoftirqd/3    [kernel.vmlinux]     [k] memcpy_erms
 +    6.76%  ksoftirqd/3    [mlx4_en]            [k] mlx4_en_process_rx_cq
 +    6.66%  ksoftirqd/3    [ip_tables]          [k] ipt_do_table
 +    6.03%  ksoftirqd/3    [kernel.vmlinux]     [k] __build_skb
 +    4.65%  ksoftirqd/3    [kernel.vmlinux]     [k] ip_rcv
 +    4.22%  ksoftirqd/3    [mlx4_en]            [k] mlx4_en_prepare_rx_desc
 +    3.46%  ksoftirqd/3    [mlx4_en]            [k] mlx4_en_free_frag
 +    3.37%  ksoftirqd/3    [kernel.vmlinux]     [k] __netif_receive_skb_core
 +    3.04%  ksoftirqd/3    [kernel.vmlinux]     [k] __netdev_alloc_skb
 +    2.80%  ksoftirqd/3    [kernel.vmlinux]     [k] kmem_cache_alloc
 +    2.38%  ksoftirqd/3    [kernel.vmlinux]     [k] __free_page_frag
 +    1.88%  ksoftirqd/3    [kernel.vmlinux]     [k] kmem_cache_free
 +    1.65%  ksoftirqd/3    [kernel.vmlinux]     [k] nf_iterate
 +    1.59%  ksoftirqd/3    [kernel.vmlinux]     [k] nf_hook_slow
 +    1.31%  ksoftirqd/3    [kernel.vmlinux]     [k] __rcu_read_unlock
 +    0.91%  ksoftirqd/3    [kernel.vmlinux]     [k] __alloc_page_frag
 +    0.88%  ksoftirqd/3    [kernel.vmlinux]     [k] eth_type_trans
 +    0.77%  ksoftirqd/3    [kernel.vmlinux]     [k] dev_gro_receive
 +    0.76%  ksoftirqd/3    [kernel.vmlinux]     [k] skb_release_data
 +    0.76%  ksoftirqd/3    [kernel.vmlinux]     [k] __local_bh_enable_ip
 +    0.72%  ksoftirqd/3    [kernel.vmlinux]     [k] netif_receive_skb_internal
 +    0.66%  ksoftirqd/3    [kernel.vmlinux]     [k] napi_gro_receive
 +    0.66%  ksoftirqd/3    [kernel.vmlinux]     [k] __rcu_read_lock
 +    0.65%  ksoftirqd/3    [kernel.vmlinux]     [k] skb_release_head_state
 +    0.57%  ksoftirqd/3    [kernel.vmlinux]     [k] get_page_from_freelist
 +    0.57%  ksoftirqd/3    [kernel.vmlinux]     [k] __free_pages_ok
 +    0.51%  ksoftirqd/3    [kernel.vmlinux]     [k] kfree_skb
 +    0.43%  ksoftirqd/3    [kernel.vmlinux]     [k] skb_release_all

Result-xdp-pass: loading XDP_PASS program
 * instant rx:3374269 tx:0 pps n:537 average: rx:3295798 tx:0 pps
   (instant variation TX 0.000 ns (min:0.000 max:0.000) RX 7.056 ns)

Difference: 4,453,022 -> 3,295,798 pps
 * 3295798 - 4453022 = -1157224 pps slower
 * (3295798/4453022-1)*100 = -25.98% slower
 * (1/4453022-1/3295798)*10^9 = -78.85 nanosec slower

Perf stats xdp-pass::

  Performance counter stats for 'CPU(s) 3' (5 runs):

       294,219 L1-icache-load-misses  (+-0.25% )  (33.33%)
 4,382,764,897 cycles:k               (+-0.00% )  (44.51%)
 7,223,252,624 instructions:k #  1.65  insns per cycle     (+-0.00%)(55.62%)
     7,166,907 cache-misses:k # 58.792 % of all cache refs (+-0.01%)(66.72%)
    12,190,275 cache-references:k        (+-0.03% )  (66.72%)
       525,262 LLC-store-misses          (+-0.11% )  (66.72%)
       587,354 LLC-store                 (+-0.09% )  (66.68%)
     6,647,957 LLC-load-misses # 58.23% of all LL-cache hits (+-0.02%)(22.19%)
    11,417,001 LLC-load                                      (+-0.03%)(22.19%)

There is a higher percentage of L3/LLC-load-misses, which is strange,
as I thought the simple XDP (return XDP_PASS and inc map cnt) program
would not touch the data.

Perf report xdp-pass::

 Samples: 40K of event 'cycles', Event count (approx.): 43953682891
   Overhead  Command        Shared Object     Symbol
 +   25.79%  ksoftirqd/3    [kernel.vmlinux]  [k] memcpy_erms
 +    7.29%  ksoftirqd/3    [mlx4_en]         [k] mlx4_en_process_rx_cq
 +    5.42%  ksoftirqd/3    [mlx4_en]         [k] mlx4_en_free_frag
 +    5.16%  ksoftirqd/3    [kernel.vmlinux]  [k] get_page_from_freelist
 +    4.55%  ksoftirqd/3    [ip_tables]       [k] ipt_do_table
 +    4.46%  ksoftirqd/3    [mlx4_en]         [k] mlx4_alloc_pages.isra.19
 +    3.97%  ksoftirqd/3    [kernel.vmlinux]  [k] __build_skb
 +    3.67%  ksoftirqd/3    [kernel.vmlinux]  [k] free_hot_cold_page
 +    3.46%  ksoftirqd/3    [kernel.vmlinux]  [k] ip_rcv
 +    2.71%  ksoftirqd/3    [kernel.vmlinux]  [k] __alloc_pages_nodemask
 +    2.62%  ksoftirqd/3    [kernel.vmlinux]  [k] __netif_receive_skb_core
 +    2.46%  ksoftirqd/3    [kernel.vmlinux]  [k] kmem_cache_alloc
 +    2.24%  ksoftirqd/3    [kernel.vmlinux]  [k] __netdev_alloc_skb
 +    2.15%  ksoftirqd/3    [mlx4_en]         [k] mlx4_en_prepare_rx_desc
 +    1.88%  ksoftirqd/3    [kernel.vmlinux]  [k] __free_page_frag
 +    1.55%  ksoftirqd/3    [kernel.vmlinux]  [k] kmem_cache_free
 +    1.42%  ksoftirqd/3    [kernel.vmlinux]  [k] __rcu_read_unlock
 +    1.27%  ksoftirqd/3    [kernel.vmlinux]  [k] nf_iterate
 +    1.14%  ksoftirqd/3    [kernel.vmlinux]  [k] nf_hook_slow
 +    1.05%  ksoftirqd/3    [kernel.vmlinux]  [k] alloc_pages_current
 +    0.83%  ksoftirqd/3    [kernel.vmlinux]  [k] __inc_zone_state
 +    0.73%  ksoftirqd/3    [kernel.vmlinux]  [k] __list_del_entry
 +    0.69%  ksoftirqd/3    [kernel.vmlinux]  [k] __list_add
 +    0.64%  ksoftirqd/3    [kernel.vmlinux]  [k] __local_bh_enable_ip
 +    0.64%  ksoftirqd/3    [kernel.vmlinux]  [k] __rcu_read_lock
 +    0.62%  ksoftirqd/3    [kernel.vmlinux]  [k] dev_gro_receive
 +    0.62%  ksoftirqd/3    [kernel.vmlinux]  [k] swiotlb_map_page
 +    0.61%  ksoftirqd/3    [kernel.vmlinux]  [k] skb_release_data
 +    0.60%  ksoftirqd/3    [kernel.vmlinux]  [k] __alloc_page_frag
 +    0.58%  ksoftirqd/3    [kernel.vmlinux]  [k] eth_type_trans
 +    0.57%  ksoftirqd/3    [kernel.vmlinux]  [k] policy_zonelist
 +    0.51%  ksoftirqd/3    [pps_core]        [k] 0x000000000000692d
 +    0.51%  ksoftirqd/3    [kernel.vmlinux]  [k] netif_receive_skb_internal
 +    0.50%  ksoftirqd/3    [kernel.vmlinux]  [k] napi_gro_receive
 +    0.49%  ksoftirqd/3    [kernel.vmlinux]  [k] __put_page
 +    0.49%  ksoftirqd/3    [kernel.vmlinux]  [k] skb_release_head_state
 +    0.42%  ksoftirqd/3    [kernel.vmlinux]  [k] kfree_skb
 +    0.34%  ksoftirqd/3    [pps_core]        [k] 0x0000000000006935
 +    0.33%  ksoftirqd/3    [kernel.vmlinux]  [k] skb_free_head
 +    0.32%  ksoftirqd/3    [kernel.vmlinux]  [k] __netif_receive_skb
 +    0.31%  ksoftirqd/3    [kernel.vmlinux]  [k] swiotlb_sync_single
 +    0.31%  ksoftirqd/3    [kernel.vmlinux]  [k] skb_gro_reset_offset
 +    0.29%  ksoftirqd/3    [kernel.vmlinux]  [k] swiotlb_sync_single_for_cpu
 +    0.29%  ksoftirqd/3    [kernel.vmlinux]  [k] list_del
 +    0.27%  ksoftirqd/3    [iptable_raw]     [k] iptable_raw_hook
 +    0.27%  ksoftirqd/3    [kernel.vmlinux]  [k] skb_release_all
 +    0.26%  ksoftirqd/3    [kernel.vmlinux]  [k] kfree_skbmem
 +    0.25%  ksoftirqd/3    [kernel.vmlinux]  [k] swiotlb_unmap_page
 +    0.23%  ksoftirqd/3    [kernel.vmlinux]  [k] bpf_map_lookup_elem
 +    0.22%  ksoftirqd/3    [kernel.vmlinux]  [k] percpu_array_map_lookup_elem
 +    0.20%  ksoftirqd/3    [kernel.vmlinux]  [k] __page_cache_release

In perf-diff notice the increase for:
 * get_page_from_freelist(0.57%) +4.59%,
 * mlx4_en_free_frag     (3.46%) +1.96%,
 * mlx4_alloc_pages      (0.26%) +4.20%
 * __alloc_pages_nodemask(0.14%) +2.57%
 * swiotlb_map_page      (0.04%) +0.57%

Perf diff::

 # Baseline    Delta  Shared Object        Symbol
 # ........  .......  ...................  ................................
 #
    36.59%  -10.80%  [kernel.vmlinux]     [k] memcpy_erms
     6.76%   +0.53%  [mlx4_en]            [k] mlx4_en_process_rx_cq
     6.66%   -2.11%  [ip_tables]          [k] ipt_do_table
     6.03%   -2.06%  [kernel.vmlinux]     [k] __build_skb
     4.65%   -1.18%  [kernel.vmlinux]     [k] ip_rcv
     4.22%   -2.06%  [mlx4_en]            [k] mlx4_en_prepare_rx_desc
     3.46%   +1.96%  [mlx4_en]            [k] mlx4_en_free_frag
     3.37%   -0.75%  [kernel.vmlinux]     [k] __netif_receive_skb_core
     3.04%   -0.80%  [kernel.vmlinux]     [k] __netdev_alloc_skb
     2.80%   -0.34%  [kernel.vmlinux]     [k] kmem_cache_alloc
     2.38%   -0.50%  [kernel.vmlinux]     [k] __free_page_frag
     1.88%   -0.34%  [kernel.vmlinux]     [k] kmem_cache_free
     1.65%   -0.38%  [kernel.vmlinux]     [k] nf_iterate
     1.59%   -0.45%  [kernel.vmlinux]     [k] nf_hook_slow
     1.31%   +0.11%  [kernel.vmlinux]     [k] __rcu_read_unlock
     0.91%   -0.31%  [kernel.vmlinux]     [k] __alloc_page_frag
     0.88%   -0.30%  [kernel.vmlinux]     [k] eth_type_trans
     0.77%   -0.15%  [kernel.vmlinux]     [k] dev_gro_receive
     0.76%   -0.15%  [kernel.vmlinux]     [k] skb_release_data
     0.76%   -0.12%  [kernel.vmlinux]     [k] __local_bh_enable_ip
     0.72%   -0.21%  [kernel.vmlinux]     [k] netif_receive_skb_internal
     0.66%   -0.16%  [kernel.vmlinux]     [k] napi_gro_receive
     0.66%   -0.02%  [kernel.vmlinux]     [k] __rcu_read_lock
     0.65%   -0.17%  [kernel.vmlinux]     [k] skb_release_head_state
     0.57%   +4.59%  [kernel.vmlinux]     [k] get_page_from_freelist
     0.57%           [kernel.vmlinux]     [k] __free_pages_ok
     0.51%   -0.09%  [kernel.vmlinux]     [k] kfree_skb
     0.43%   -0.15%  [kernel.vmlinux]     [k] skb_release_all
     0.42%   -0.11%  [kernel.vmlinux]     [k] skb_gro_reset_offset
     0.41%   -0.08%  [kernel.vmlinux]     [k] skb_free_head
     0.39%   -0.07%  [kernel.vmlinux]     [k] __netif_receive_skb
     0.36%   -0.08%  [iptable_raw]        [k] iptable_raw_hook
     0.34%   -0.08%  [kernel.vmlinux]     [k] kfree_skbmem
     0.28%   +0.01%  [kernel.vmlinux]     [k] swiotlb_sync_single_for_cpu
     0.26%   +4.20%  [mlx4_en]            [k] mlx4_alloc_pages.isra.19
     0.20%   +0.11%  [kernel.vmlinux]     [k] swiotlb_sync_single
     0.15%   -0.03%  [kernel.vmlinux]     [k] __do_softirq
     0.14%   +2.57%  [kernel.vmlinux]     [k] __alloc_pages_nodemask
     0.14%           [kernel.vmlinux]     [k] free_one_page
     0.13%   -0.13%  [kernel.vmlinux]     [k] _raw_spin_lock_irqsave
     0.13%   -0.12%  [kernel.vmlinux]     [k] _raw_spin_lock
     0.10%           [kernel.vmlinux]     [k] __mod_zone_page_state
     0.09%   +0.06%  [kernel.vmlinux]     [k] net_rx_action
     0.09%           [kernel.vmlinux]     [k] __rmqueue
     0.07%           [kernel.vmlinux]     [k] __zone_watermark_ok
     0.07%           [kernel.vmlinux]     [k] PageHuge
     0.06%   +0.77%  [kernel.vmlinux]     [k] __inc_zone_state
     0.06%   +0.98%  [kernel.vmlinux]     [k] alloc_pages_current
     0.06%   +0.51%  [kernel.vmlinux]     [k] policy_zonelist
     0.06%   +0.01%  [kernel.vmlinux]     [k] delay_tsc
     0.05%   -0.00%  [mlx4_en]            [k] mlx4_en_poll_rx_cq
     0.05%   +0.01%  [kernel.vmlinux]     [k] __memcpy
     0.04%   +0.57%  [kernel.vmlinux]     [k] swiotlb_map_page
     0.04%   +0.69%  [kernel.vmlinux]     [k] __list_del_entry
     0.04%           [kernel.vmlinux]     [k] free_compound_page
     0.04%           [kernel.vmlinux]     [k] __put_compound_page
     0.03%   +0.66%  [kernel.vmlinux]     [k] __list_add

Patch

diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
index c34a33d..47ae2a2 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
@@ -2529,12 +2529,33 @@  static int mlx4_en_set_tx_maxrate(struct net_device *dev, int queue_index, u32 m
 static int mlx4_xdp_set(struct net_device *dev, struct bpf_prog *prog)
 {
 	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct mlx4_en_dev *mdev = priv->mdev;
 	struct bpf_prog *old_prog;
 	int xdp_ring_num;
+	int port_up = 0;
+	int err;
 	int i;
 
 	xdp_ring_num = prog ? ALIGN(priv->rx_ring_num, MLX4_EN_NUM_UP) : 0;
 
+	/* No need to reconfigure buffers when simply swapping the
+	 * program for a new one.
+	 */
+	if (priv->xdp_ring_num == xdp_ring_num) {
+		if (prog) {
+			prog = bpf_prog_add(prog, priv->rx_ring_num - 1);
+			if (IS_ERR(prog))
+				return PTR_ERR(prog);
+		}
+		for (i = 0; i < priv->rx_ring_num; i++) {
+			/* This xchg is paired with READ_ONCE in the fastpath */
+			old_prog = xchg(&priv->rx_ring[i]->xdp_prog, prog);
+			if (old_prog)
+				bpf_prog_put(old_prog);
+		}
+		return 0;
+	}
+
 	if (priv->num_frags > 1) {
 		en_err(priv, "Cannot set XDP if MTU requires multiple frags\n");
 		return -EOPNOTSUPP;
@@ -2546,15 +2567,30 @@  static int mlx4_xdp_set(struct net_device *dev, struct bpf_prog *prog)
 			return PTR_ERR(prog);
 	}
 
+	mutex_lock(&mdev->state_lock);
+	if (priv->port_up) {
+		port_up = 1;
+		mlx4_en_stop_port(dev, 1);
+	}
+
 	priv->xdp_ring_num = xdp_ring_num;
 
-	/* This xchg is paired with READ_ONCE in the fast path */
 	for (i = 0; i < priv->rx_ring_num; i++) {
 		old_prog = xchg(&priv->rx_ring[i]->xdp_prog, prog);
 		if (old_prog)
 			bpf_prog_put(old_prog);
 	}
 
+	if (port_up) {
+		err = mlx4_en_start_port(dev);
+		if (err) {
+			en_err(priv, "Failed starting port %d for XDP change\n",
+			       priv->port);
+			queue_work(mdev->workqueue, &priv->watchdog_task);
+		}
+	}
+
+	mutex_unlock(&mdev->state_lock);
 	return 0;
 }
 
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
index 6729545..9dd5dc1 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
@@ -58,7 +58,7 @@  static int mlx4_alloc_pages(struct mlx4_en_priv *priv,
 	struct page *page;
 	dma_addr_t dma;
 
-	for (order = MLX4_EN_ALLOC_PREFER_ORDER; ;) {
+	for (order = frag_info->order; ;) {
 		gfp_t gfp = _gfp;
 
 		if (order)
@@ -71,7 +71,7 @@  static int mlx4_alloc_pages(struct mlx4_en_priv *priv,
 			return -ENOMEM;
 	}
 	dma = dma_map_page(priv->ddev, page, 0, PAGE_SIZE << order,
-			   PCI_DMA_FROMDEVICE);
+			   frag_info->dma_dir);
 	if (dma_mapping_error(priv->ddev, dma)) {
 		put_page(page);
 		return -ENOMEM;
@@ -125,7 +125,8 @@  out:
 	while (i--) {
 		if (page_alloc[i].page != ring_alloc[i].page) {
 			dma_unmap_page(priv->ddev, page_alloc[i].dma,
-				page_alloc[i].page_size, PCI_DMA_FROMDEVICE);
+				page_alloc[i].page_size,
+				priv->frag_info[i].dma_dir);
 			page = page_alloc[i].page;
 			/* Revert changes done by mlx4_alloc_pages */
 			page_ref_sub(page, page_alloc[i].page_size /
@@ -146,7 +147,7 @@  static void mlx4_en_free_frag(struct mlx4_en_priv *priv,
 
 	if (next_frag_end > frags[i].page_size)
 		dma_unmap_page(priv->ddev, frags[i].dma, frags[i].page_size,
-			       PCI_DMA_FROMDEVICE);
+			       frag_info->dma_dir);
 
 	if (frags[i].page)
 		put_page(frags[i].page);
@@ -177,7 +178,8 @@  out:
 
 		page_alloc = &ring->page_alloc[i];
 		dma_unmap_page(priv->ddev, page_alloc->dma,
-			       page_alloc->page_size, PCI_DMA_FROMDEVICE);
+			       page_alloc->page_size,
+			       priv->frag_info[i].dma_dir);
 		page = page_alloc->page;
 		/* Revert changes done by mlx4_alloc_pages */
 		page_ref_sub(page, page_alloc->page_size /
@@ -202,7 +204,7 @@  static void mlx4_en_destroy_allocator(struct mlx4_en_priv *priv,
 		       i, page_count(page_alloc->page));
 
 		dma_unmap_page(priv->ddev, page_alloc->dma,
-				page_alloc->page_size, PCI_DMA_FROMDEVICE);
+				page_alloc->page_size, frag_info->dma_dir);
 		while (page_alloc->page_offset + frag_info->frag_stride <
 		       page_alloc->page_size) {
 			put_page(page_alloc->page);
@@ -245,6 +247,12 @@  static int mlx4_en_prepare_rx_desc(struct mlx4_en_priv *priv,
 	struct mlx4_en_rx_alloc *frags = ring->rx_info +
 					(index << priv->log_rx_info);
 
+	if (ring->page_cache.index > 0) {
+		frags[0] = ring->page_cache.buf[--ring->page_cache.index];
+		rx_desc->data[0].addr = cpu_to_be64(frags[0].dma);
+		return 0;
+	}
+
 	return mlx4_en_alloc_frags(priv, rx_desc, frags, ring->page_alloc, gfp);
 }
 
@@ -503,6 +511,24 @@  void mlx4_en_recover_from_oom(struct mlx4_en_priv *priv)
 	}
 }
 
+/* When the rx ring is running in page-per-packet mode, a released frame can go
+ * directly into a small cache, to avoid unmapping or touching the page
+ * allocator. In bpf prog performance scenarios, buffers are either forwarded
+ * or dropped, never converted to skbs, so every page can come directly from
+ * this cache when it is sized to be a multiple of the napi budget.
+ */
+bool mlx4_en_rx_recycle(struct mlx4_en_rx_ring *ring,
+			struct mlx4_en_rx_alloc *frame)
+{
+	struct mlx4_en_page_cache *cache = &ring->page_cache;
+
+	if (cache->index >= MLX4_EN_CACHE_SIZE)
+		return false;
+
+	cache->buf[cache->index++] = *frame;
+	return true;
+}
+
 void mlx4_en_destroy_rx_ring(struct mlx4_en_priv *priv,
 			     struct mlx4_en_rx_ring **pring,
 			     u32 size, u16 stride)
@@ -525,6 +551,16 @@  void mlx4_en_destroy_rx_ring(struct mlx4_en_priv *priv,
 void mlx4_en_deactivate_rx_ring(struct mlx4_en_priv *priv,
 				struct mlx4_en_rx_ring *ring)
 {
+	int i;
+
+	for (i = 0; i < ring->page_cache.index; i++) {
+		struct mlx4_en_rx_alloc *frame = &ring->page_cache.buf[i];
+
+		dma_unmap_page(priv->ddev, frame->dma, frame->page_size,
+			       priv->frag_info[0].dma_dir);
+		put_page(frame->page);
+	}
+	ring->page_cache.index = 0;
 	mlx4_en_free_rx_buf(priv, ring);
 	if (ring->stride <= TXBB_SIZE)
 		ring->buf -= TXBB_SIZE;
@@ -866,6 +902,8 @@  int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud
 				bpf_warn_invalid_xdp_action(act);
 			case XDP_ABORTED:
 			case XDP_DROP:
+				if (mlx4_en_rx_recycle(ring, frags))
+					goto consumed;
 				goto next;
 			}
 		}
@@ -1021,6 +1059,7 @@  next:
 		for (nr = 0; nr < priv->num_frags; nr++)
 			mlx4_en_free_frag(priv, frags, nr);
 
+consumed:
 		++cq->mcq.cons_index;
 		index = (cq->mcq.cons_index) & ring->size_mask;
 		cqe = mlx4_en_get_cqe(cq->buf, index, priv->cqe_size) + factor;
@@ -1096,19 +1135,34 @@  static const int frag_sizes[] = {
 
 void mlx4_en_calc_rx_buf(struct net_device *dev)
 {
+	enum dma_data_direction dma_dir = PCI_DMA_FROMDEVICE;
 	struct mlx4_en_priv *priv = netdev_priv(dev);
 	int eff_mtu = MLX4_EN_EFF_MTU(dev->mtu);
+	int order = MLX4_EN_ALLOC_PREFER_ORDER;
+	u32 align = SMP_CACHE_BYTES;
 	int buf_size = 0;
 	int i = 0;
 
+	/* bpf requires buffers to be set up as 1 packet per page.
+	 * This only works when num_frags == 1.
+	 */
+	if (priv->xdp_ring_num) {
+		/* This will gain efficient xdp frame recycling at the expense
+		 * of more costly truesize accounting
+		 */
+		align = PAGE_SIZE;
+		order = 0;
+	}
+
 	while (buf_size < eff_mtu) {
+		priv->frag_info[i].order = order;
 		priv->frag_info[i].frag_size =
 			(eff_mtu > buf_size + frag_sizes[i]) ?
 				frag_sizes[i] : eff_mtu - buf_size;
 		priv->frag_info[i].frag_prefix_size = buf_size;
 		priv->frag_info[i].frag_stride =
-				ALIGN(priv->frag_info[i].frag_size,
-				      SMP_CACHE_BYTES);
+				ALIGN(priv->frag_info[i].frag_size, align);
+		priv->frag_info[i].dma_dir = dma_dir;
 		buf_size += priv->frag_info[i].frag_size;
 		i++;
 	}
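
The recycle machinery added above is deliberately simple: a per-ring LIFO of
frames whose pages stay dma-mapped while parked. mlx4_en_rx_recycle() pushes on
XDP_DROP (and, once the tx patches land, on XDP_TX completion),
mlx4_en_prepare_rx_desc() pops before falling back to mlx4_en_alloc_frags(),
and mlx4_en_deactivate_rx_ring() unmaps and frees whatever is still parked. A
toy model of the structure, with illustrative demo_* names and plain C types
standing in for the driver's:

#include <stdbool.h>

#define DEMO_CACHE_SIZE 128		/* 2 * NAPI_POLL_WEIGHT with the default budget of 64 */

struct demo_frame {
	void *page;			/* stand-in for struct page * */
	unsigned long long dma;		/* frame stays dma-mapped while parked */
	unsigned int page_size;
};

struct demo_page_cache {
	unsigned int index;		/* number of parked frames */
	struct demo_frame buf[DEMO_CACHE_SIZE];
};

/* rx completion, drop case: try to park the frame for reuse */
static bool demo_recycle(struct demo_page_cache *cache,
			 const struct demo_frame *frame)
{
	if (cache->index >= DEMO_CACHE_SIZE)
		return false;		/* full: caller unmaps and frees the page */
	cache->buf[cache->index++] = *frame;
	return true;
}

/* refill path: reuse a parked frame before touching the page allocator */
static bool demo_refill(struct demo_page_cache *cache, struct demo_frame *frame)
{
	if (!cache->index)
		return false;		/* empty: caller allocates and maps a new page */
	*frame = cache->buf[--cache->index];
	return true;
}

Sizing the cache at twice the napi budget means a poll that drops an entire
budget's worth of packets can be refilled purely from recycled frames, without
touching the page allocator or the dma-mapping code.
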
diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
index eb1238d..eff4be0 100644
--- a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
+++ b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
@@ -259,6 +259,12 @@  struct mlx4_en_rx_alloc {
 	u32		page_size;
 };
 
+#define MLX4_EN_CACHE_SIZE (2 * NAPI_POLL_WEIGHT)
+struct mlx4_en_page_cache {
+	u32 index;
+	struct mlx4_en_rx_alloc buf[MLX4_EN_CACHE_SIZE];
+};
+
 struct mlx4_en_tx_ring {
 	/* cache line used and dirtied in tx completion
 	 * (mlx4_en_free_tx_buf())
@@ -324,6 +330,7 @@  struct mlx4_en_rx_ring {
 	void *buf;
 	void *rx_info;
 	struct bpf_prog *xdp_prog;
+	struct mlx4_en_page_cache page_cache;
 	unsigned long bytes;
 	unsigned long packets;
 	unsigned long csum_ok;
@@ -443,7 +450,9 @@  struct mlx4_en_mc_list {
 struct mlx4_en_frag_info {
 	u16 frag_size;
 	u16 frag_prefix_size;
-	u16 frag_stride;
+	u32 frag_stride;
+	enum dma_data_direction dma_dir;
+	int order;
 };
 
 #ifdef CONFIG_MLX4_EN_DCB
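
On the header side, frag_stride grows from u16 to u32 and each fragment now
records its own allocation order and DMA direction. The sketch below
(illustrative demo_* names, collapsed to the single-fragment case) summarizes
the two layouts mlx4_en_calc_rx_buf() now describes; the wider stride is
presumably needed because a PAGE_SIZE-aligned stride no longer fits in 16 bits
on 64K-page configurations:

#include <linux/kernel.h>	/* ALIGN() */
#include <linux/cache.h>	/* SMP_CACHE_BYTES */
#include <linux/mm.h>		/* PAGE_SIZE */
#include <linux/dma-mapping.h>	/* enum dma_data_direction */

struct demo_frag_layout {
	int order;			/* page allocation order */
	u32 stride;			/* bytes reserved per frame in the page */
	enum dma_data_direction dma_dir;
};

static struct demo_frag_layout demo_rx_layout(unsigned int eff_mtu, bool xdp)
{
	struct demo_frag_layout l;

	if (xdp) {
		/* one packet per order-0 page: pages are never shared,
		 * so they can be recycled and later remapped for tx
		 */
		l.order = 0;
		l.stride = ALIGN(eff_mtu, PAGE_SIZE);
	} else {
		/* default: carve many frames out of higher-order pages */
		l.order = 3;		/* MLX4_EN_ALLOC_PREFER_ORDER */
		l.stride = ALIGN(eff_mtu, SMP_CACHE_BYTES);
	}
	l.dma_dir = DMA_FROM_DEVICE;	/* rx-only in this patch */
	return l;
}

In this patch dma_dir is still PCI_DMA_FROMDEVICE for both layouts; making the
direction a per-fragment property is what lets the follow-up tx support change
the mapping for XDP rings without touching the non-XDP path.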