diff mbox

[net-next,V2,1/3] tap: use build_skb() for small packet

Message ID 1502451678-17358-2-git-send-email-jasowang@redhat.com
State Accepted, archived
Delegated to: David Miller
Headers show

Commit Message

Jason Wang Aug. 11, 2017, 11:41 a.m. UTC
We use tun_alloc_skb() which calls sock_alloc_send_pskb() to allocate
skb in the past. This socket based method is not suitable for high
speed userspace like virtualization which usually:

- ignore sk_sndbuf (INT_MAX) and expect to receive the packet as fast as
  possible
- don't want to be blocked at sendmsg()

To eliminate the above overheads, this patch tries to use build_skb()
for small packet. We will do this only when the following conditions
are all met:

- TAP instead of TUN
- sk_sndbuf is INT_MAX
- caller doesn't want to be blocked
- zerocopy is not used
- packet size is small enough to use build_skb()

Pktgen from guest to host shows ~11% improvement for rx pps of tap:

Before: ~1.70Mpps
After : ~1.88Mpps

What's more important, this makes it possible to implement XDP for tap
before creating skbs.

Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/net/tun.c | 112 ++++++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 91 insertions(+), 21 deletions(-)

Comments

Eric Dumazet Aug. 16, 2017, 3:45 a.m. UTC | #1
On Fri, 2017-08-11 at 19:41 +0800, Jason Wang wrote:
> We use tun_alloc_skb() which calls sock_alloc_send_pskb() to allocate
> skb in the past. This socket based method is not suitable for high
> speed userspace like virtualization which usually:
> 
> - ignore sk_sndbuf (INT_MAX) and expect to receive the packet as fast as
>   possible
> - don't want to be block at sendmsg()
> 
> To eliminate the above overheads, this patch tries to use build_skb()
> for small packet. We will do this only when the following conditions
> are all met:
> 
> - TAP instead of TUN
> - sk_sndbuf is INT_MAX
> - caller don't want to be blocked
> - zerocopy is not used
> - packet size is smaller enough to use build_skb()
> 
> Pktgen from guest to host shows ~11% improvement for rx pps of tap:
> 
> Before: ~1.70Mpps
> After : ~1.88Mpps
> 
> What's more important, this makes it possible to implement XDP for tap
> before creating skbs.


Well well well.

You do realize that tun_build_skb() is not thread safe ?

general protection fault: 0000 [#1] SMP KASAN
Dumping ftrace buffer:
   (ftrace buffer empty)
Modules linked in:
CPU: 0 PID: 3982 Comm: syz-executor0 Not tainted 4.13.0-rc5-next-20170815+ #3
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011
task: ffff880069f265c0 task.stack: ffff880067688000
RIP: 0010:__read_once_size include/linux/compiler.h:276 [inline]
RIP: 0010:compound_head include/linux/page-flags.h:146 [inline]
RIP: 0010:put_page include/linux/mm.h:811 [inline]
RIP: 0010:__skb_frag_unref include/linux/skbuff.h:2743 [inline]
RIP: 0010:skb_release_data+0x26c/0x790 net/core/skbuff.c:568
RSP: 0018:ffff88006768ef20 EFLAGS: 00010206
RAX: 00d70cb5b39acdeb RBX: dffffc0000000000 RCX: 1ffff1000ced1e13
RDX: 0000000000000000 RSI: ffff88003ec28c38 RDI: 06b865ad9cd66f59
RBP: ffff88006768f040 R08: ffffea0000ee74a0 R09: ffffed0007ab4200
R10: 0000000000028c28 R11: 0000000000000010 R12: ffff88003c5581b0
R13: ffffed000ced1dfb R14: 1ffff1000ced1df3 R15: 06b865ad9cd66f39
FS:  00007ffbc9ef7700(0000) GS:ffff88003ec00000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 000000002001aff0 CR3: 000000003d623000 CR4: 00000000000006f0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
 skb_release_all+0x4a/0x60 net/core/skbuff.c:631
 __kfree_skb net/core/skbuff.c:645 [inline]
 kfree_skb+0x15d/0x4c0 net/core/skbuff.c:663
 __netif_receive_skb_core+0x10f8/0x33d0 net/core/dev.c:4425
 __netif_receive_skb+0x2c/0x1b0 net/core/dev.c:4456
 netif_receive_skb_internal+0x10b/0x5e0 net/core/dev.c:4527
 netif_receive_skb+0xae/0x390 net/core/dev.c:4551
 tun_rx_batched.isra.43+0x5e7/0x860 drivers/net/tun.c:1221
 tun_get_user+0x11dd/0x2150 drivers/net/tun.c:1542
 tun_chr_write_iter+0xd8/0x190 drivers/net/tun.c:1568
 call_write_iter include/linux/fs.h:1742 [inline]
 new_sync_write fs/read_write.c:457 [inline]
 __vfs_write+0x684/0x970 fs/read_write.c:470
 vfs_write+0x189/0x510 fs/read_write.c:518
 SYSC_write fs/read_write.c:565 [inline]
 SyS_write+0xef/0x220 fs/read_write.c:557
 entry_SYSCALL_64_fastpath+0x1f/0xbe
RIP: 0033:0x40bab1
RSP: 002b:00007ffbc9ef6c00 EFLAGS: 00000293 ORIG_RAX: 0000000000000001
RAX: ffffffffffffffda RBX: 0000000000000036 RCX: 000000000040bab1
RDX: 0000000000000036 RSI: 0000000020002000 RDI: 0000000000000003
RBP: 0000000000a5f870 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000293 R12: 0000000000000000
R13: 0000000000000000 R14: 00007ffbc9ef79c0 R15: 00007ffbc9ef7700
Code: c6 e8 c9 78 8d fd 4c 89 e0 48 c1 e8 03 80 3c 18 00 0f 85 93 04 00 00 4d 8b 3c 24 41 c6 45 00 00 49 8d 7f 20 48 89 f8 48 c1 e8 03 <80> 3c 18 00 0f 85 6b 04 00 00 41 80 7d 00 00 49 8b 47 20 0f 85 
RIP: __read_once_size include/linux/compiler.h:276 [inline] RSP: ffff88006768ef20
RIP: compound_head include/linux/page-flags.h:146 [inline] RSP: ffff88006768ef20
RIP: put_page include/linux/mm.h:811 [inline] RSP: ffff88006768ef20
RIP: __skb_frag_unref include/linux/skbuff.h:2743 [inline] RSP: ffff88006768ef20
RIP: skb_release_data+0x26c/0x790 net/core/skbuff.c:568 RSP: ffff88006768ef20
---[ end trace 54050eb1ec52ff83 ]---
Michael S. Tsirkin Aug. 16, 2017, 3:55 a.m. UTC | #2
On Tue, Aug 15, 2017 at 08:45:20PM -0700, Eric Dumazet wrote:
> On Fri, 2017-08-11 at 19:41 +0800, Jason Wang wrote:
> > We use tun_alloc_skb() which calls sock_alloc_send_pskb() to allocate
> > skb in the past. This socket based method is not suitable for high
> > speed userspace like virtualization which usually:
> > 
> > - ignore sk_sndbuf (INT_MAX) and expect to receive the packet as fast as
> >   possible
> > - don't want to be block at sendmsg()
> > 
> > To eliminate the above overheads, this patch tries to use build_skb()
> > for small packet. We will do this only when the following conditions
> > are all met:
> > 
> > - TAP instead of TUN
> > - sk_sndbuf is INT_MAX
> > - caller don't want to be blocked
> > - zerocopy is not used
> > - packet size is smaller enough to use build_skb()
> > 
> > Pktgen from guest to host shows ~11% improvement for rx pps of tap:
> > 
> > Before: ~1.70Mpps
> > After : ~1.88Mpps
> > 
> > What's more important, this makes it possible to implement XDP for tap
> > before creating skbs.
> 
> 
> Well well well.
> 
> You do realize that tun_build_skb() is not thread safe ?

The issue is alloc frag, isn't it?
I guess for now we can limit this to XDP mode only, and
just allocate full pages in that mode.


> general protection fault: 0000 [#1] SMP KASAN
> Dumping ftrace buffer:
>    (ftrace buffer empty)
> Modules linked in:
> CPU: 0 PID: 3982 Comm: syz-executor0 Not tainted 4.13.0-rc5-next-20170815+ #3
> Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011
> task: ffff880069f265c0 task.stack: ffff880067688000
> RIP: 0010:__read_once_size include/linux/compiler.h:276 [inline]
> RIP: 0010:compound_head include/linux/page-flags.h:146 [inline]
> RIP: 0010:put_page include/linux/mm.h:811 [inline]
> RIP: 0010:__skb_frag_unref include/linux/skbuff.h:2743 [inline]
> RIP: 0010:skb_release_data+0x26c/0x790 net/core/skbuff.c:568
> RSP: 0018:ffff88006768ef20 EFLAGS: 00010206
> RAX: 00d70cb5b39acdeb RBX: dffffc0000000000 RCX: 1ffff1000ced1e13
> RDX: 0000000000000000 RSI: ffff88003ec28c38 RDI: 06b865ad9cd66f59
> RBP: ffff88006768f040 R08: ffffea0000ee74a0 R09: ffffed0007ab4200
> R10: 0000000000028c28 R11: 0000000000000010 R12: ffff88003c5581b0
> R13: ffffed000ced1dfb R14: 1ffff1000ced1df3 R15: 06b865ad9cd66f39
> FS:  00007ffbc9ef7700(0000) GS:ffff88003ec00000(0000) knlGS:0000000000000000
> CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> CR2: 000000002001aff0 CR3: 000000003d623000 CR4: 00000000000006f0
> DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
> Call Trace:
>  skb_release_all+0x4a/0x60 net/core/skbuff.c:631
>  __kfree_skb net/core/skbuff.c:645 [inline]
>  kfree_skb+0x15d/0x4c0 net/core/skbuff.c:663
>  __netif_receive_skb_core+0x10f8/0x33d0 net/core/dev.c:4425
>  __netif_receive_skb+0x2c/0x1b0 net/core/dev.c:4456
>  netif_receive_skb_internal+0x10b/0x5e0 net/core/dev.c:4527
>  netif_receive_skb+0xae/0x390 net/core/dev.c:4551
>  tun_rx_batched.isra.43+0x5e7/0x860 drivers/net/tun.c:1221
>  tun_get_user+0x11dd/0x2150 drivers/net/tun.c:1542
>  tun_chr_write_iter+0xd8/0x190 drivers/net/tun.c:1568
>  call_write_iter include/linux/fs.h:1742 [inline]
>  new_sync_write fs/read_write.c:457 [inline]
>  __vfs_write+0x684/0x970 fs/read_write.c:470
>  vfs_write+0x189/0x510 fs/read_write.c:518
>  SYSC_write fs/read_write.c:565 [inline]
>  SyS_write+0xef/0x220 fs/read_write.c:557
>  entry_SYSCALL_64_fastpath+0x1f/0xbe
> RIP: 0033:0x40bab1
> RSP: 002b:00007ffbc9ef6c00 EFLAGS: 00000293 ORIG_RAX: 0000000000000001
> RAX: ffffffffffffffda RBX: 0000000000000036 RCX: 000000000040bab1
> RDX: 0000000000000036 RSI: 0000000020002000 RDI: 0000000000000003
> RBP: 0000000000a5f870 R08: 0000000000000000 R09: 0000000000000000
> R10: 0000000000000000 R11: 0000000000000293 R12: 0000000000000000
> R13: 0000000000000000 R14: 00007ffbc9ef79c0 R15: 00007ffbc9ef7700
> Code: c6 e8 c9 78 8d fd 4c 89 e0 48 c1 e8 03 80 3c 18 00 0f 85 93 04 00 00 4d 8b 3c 24 41 c6 45 00 00 49 8d 7f 20 48 89 f8 48 c1 e8 03 <80> 3c 18 00 0f 85 6b 04 00 00 41 80 7d 00 00 49 8b 47 20 0f 85 
> RIP: __read_once_size include/linux/compiler.h:276 [inline] RSP: ffff88006768ef20
> RIP: compound_head include/linux/page-flags.h:146 [inline] RSP: ffff88006768ef20
> RIP: put_page include/linux/mm.h:811 [inline] RSP: ffff88006768ef20
> RIP: __skb_frag_unref include/linux/skbuff.h:2743 [inline] RSP: ffff88006768ef20
> RIP: skb_release_data+0x26c/0x790 net/core/skbuff.c:568 RSP: ffff88006768ef20
> ---[ end trace 54050eb1ec52ff83 ]---
Jason Wang Aug. 16, 2017, 3:55 a.m. UTC | #3
On 2017年08月16日 11:45, Eric Dumazet wrote:
> On Fri, 2017-08-11 at 19:41 +0800, Jason Wang wrote:
>> We use tun_alloc_skb() which calls sock_alloc_send_pskb() to allocate
>> skb in the past. This socket based method is not suitable for high
>> speed userspace like virtualization which usually:
>>
>> - ignore sk_sndbuf (INT_MAX) and expect to receive the packet as fast as
>>    possible
>> - don't want to be block at sendmsg()
>>
>> To eliminate the above overheads, this patch tries to use build_skb()
>> for small packet. We will do this only when the following conditions
>> are all met:
>>
>> - TAP instead of TUN
>> - sk_sndbuf is INT_MAX
>> - caller don't want to be blocked
>> - zerocopy is not used
>> - packet size is smaller enough to use build_skb()
>>
>> Pktgen from guest to host shows ~11% improvement for rx pps of tap:
>>
>> Before: ~1.70Mpps
>> After : ~1.88Mpps
>>
>> What's more important, this makes it possible to implement XDP for tap
>> before creating skbs.
>
> Well well well.
>
> You do realize that tun_build_skb() is not thread safe ?

Ok, I think the issue is skb_page_frag_refill(); it probably needs a 
spinlock. Will prepare a patch.

Thanks

>
> general protection fault: 0000 [#1] SMP KASAN
> Dumping ftrace buffer:
>     (ftrace buffer empty)
> Modules linked in:
> CPU: 0 PID: 3982 Comm: syz-executor0 Not tainted 4.13.0-rc5-next-20170815+ #3
> Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011
> task: ffff880069f265c0 task.stack: ffff880067688000
> RIP: 0010:__read_once_size include/linux/compiler.h:276 [inline]
> RIP: 0010:compound_head include/linux/page-flags.h:146 [inline]
> RIP: 0010:put_page include/linux/mm.h:811 [inline]
> RIP: 0010:__skb_frag_unref include/linux/skbuff.h:2743 [inline]
> RIP: 0010:skb_release_data+0x26c/0x790 net/core/skbuff.c:568
> RSP: 0018:ffff88006768ef20 EFLAGS: 00010206
> RAX: 00d70cb5b39acdeb RBX: dffffc0000000000 RCX: 1ffff1000ced1e13
> RDX: 0000000000000000 RSI: ffff88003ec28c38 RDI: 06b865ad9cd66f59
> RBP: ffff88006768f040 R08: ffffea0000ee74a0 R09: ffffed0007ab4200
> R10: 0000000000028c28 R11: 0000000000000010 R12: ffff88003c5581b0
> R13: ffffed000ced1dfb R14: 1ffff1000ced1df3 R15: 06b865ad9cd66f39
> FS:  00007ffbc9ef7700(0000) GS:ffff88003ec00000(0000) knlGS:0000000000000000
> CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> CR2: 000000002001aff0 CR3: 000000003d623000 CR4: 00000000000006f0
> DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
> Call Trace:
>   skb_release_all+0x4a/0x60 net/core/skbuff.c:631
>   __kfree_skb net/core/skbuff.c:645 [inline]
>   kfree_skb+0x15d/0x4c0 net/core/skbuff.c:663
>   __netif_receive_skb_core+0x10f8/0x33d0 net/core/dev.c:4425
>   __netif_receive_skb+0x2c/0x1b0 net/core/dev.c:4456
>   netif_receive_skb_internal+0x10b/0x5e0 net/core/dev.c:4527
>   netif_receive_skb+0xae/0x390 net/core/dev.c:4551
>   tun_rx_batched.isra.43+0x5e7/0x860 drivers/net/tun.c:1221
>   tun_get_user+0x11dd/0x2150 drivers/net/tun.c:1542
>   tun_chr_write_iter+0xd8/0x190 drivers/net/tun.c:1568
>   call_write_iter include/linux/fs.h:1742 [inline]
>   new_sync_write fs/read_write.c:457 [inline]
>   __vfs_write+0x684/0x970 fs/read_write.c:470
>   vfs_write+0x189/0x510 fs/read_write.c:518
>   SYSC_write fs/read_write.c:565 [inline]
>   SyS_write+0xef/0x220 fs/read_write.c:557
>   entry_SYSCALL_64_fastpath+0x1f/0xbe
> RIP: 0033:0x40bab1
> RSP: 002b:00007ffbc9ef6c00 EFLAGS: 00000293 ORIG_RAX: 0000000000000001
> RAX: ffffffffffffffda RBX: 0000000000000036 RCX: 000000000040bab1
> RDX: 0000000000000036 RSI: 0000000020002000 RDI: 0000000000000003
> RBP: 0000000000a5f870 R08: 0000000000000000 R09: 0000000000000000
> R10: 0000000000000000 R11: 0000000000000293 R12: 0000000000000000
> R13: 0000000000000000 R14: 00007ffbc9ef79c0 R15: 00007ffbc9ef7700
> Code: c6 e8 c9 78 8d fd 4c 89 e0 48 c1 e8 03 80 3c 18 00 0f 85 93 04 00 00 4d 8b 3c 24 41 c6 45 00 00 49 8d 7f 20 48 89 f8 48 c1 e8 03 <80> 3c 18 00 0f 85 6b 04 00 00 41 80 7d 00 00 49 8b 47 20 0f 85
> RIP: __read_once_size include/linux/compiler.h:276 [inline] RSP: ffff88006768ef20
> RIP: compound_head include/linux/page-flags.h:146 [inline] RSP: ffff88006768ef20
> RIP: put_page include/linux/mm.h:811 [inline] RSP: ffff88006768ef20
> RIP: __skb_frag_unref include/linux/skbuff.h:2743 [inline] RSP: ffff88006768ef20
> RIP: skb_release_data+0x26c/0x790 net/core/skbuff.c:568 RSP: ffff88006768ef20
> ---[ end trace 54050eb1ec52ff83 ]---
>
Jason Wang Aug. 16, 2017, 3:57 a.m. UTC | #4
On 2017年08月16日 11:55, Michael S. Tsirkin wrote:
> On Tue, Aug 15, 2017 at 08:45:20PM -0700, Eric Dumazet wrote:
>> On Fri, 2017-08-11 at 19:41 +0800, Jason Wang wrote:
>>> We use tun_alloc_skb() which calls sock_alloc_send_pskb() to allocate
>>> skb in the past. This socket based method is not suitable for high
>>> speed userspace like virtualization which usually:
>>>
>>> - ignore sk_sndbuf (INT_MAX) and expect to receive the packet as fast as
>>>    possible
>>> - don't want to be block at sendmsg()
>>>
>>> To eliminate the above overheads, this patch tries to use build_skb()
>>> for small packet. We will do this only when the following conditions
>>> are all met:
>>>
>>> - TAP instead of TUN
>>> - sk_sndbuf is INT_MAX
>>> - caller don't want to be blocked
>>> - zerocopy is not used
>>> - packet size is smaller enough to use build_skb()
>>>
>>> Pktgen from guest to host shows ~11% improvement for rx pps of tap:
>>>
>>> Before: ~1.70Mpps
>>> After : ~1.88Mpps
>>>
>>> What's more important, this makes it possible to implement XDP for tap
>>> before creating skbs.
>> Well well well.
>>
>> You do realize that tun_build_skb() is not thread safe ?
> The issue is alloc frag, isn't it?
> I guess for now we can limit this to XDP mode only, and
> just allocate full pages in that mode.
>
>

Limiting this to XDP mode only does not prevent users from sending packets 
to the same queue in parallel, I think?

Thanks
Michael S. Tsirkin Aug. 16, 2017, 3:59 a.m. UTC | #5
On Wed, Aug 16, 2017 at 11:57:51AM +0800, Jason Wang wrote:
> 
> 
> On 2017年08月16日 11:55, Michael S. Tsirkin wrote:
> > On Tue, Aug 15, 2017 at 08:45:20PM -0700, Eric Dumazet wrote:
> > > On Fri, 2017-08-11 at 19:41 +0800, Jason Wang wrote:
> > > > We use tun_alloc_skb() which calls sock_alloc_send_pskb() to allocate
> > > > skb in the past. This socket based method is not suitable for high
> > > > speed userspace like virtualization which usually:
> > > > 
> > > > - ignore sk_sndbuf (INT_MAX) and expect to receive the packet as fast as
> > > >    possible
> > > > - don't want to be block at sendmsg()
> > > > 
> > > > To eliminate the above overheads, this patch tries to use build_skb()
> > > > for small packet. We will do this only when the following conditions
> > > > are all met:
> > > > 
> > > > - TAP instead of TUN
> > > > - sk_sndbuf is INT_MAX
> > > > - caller don't want to be blocked
> > > > - zerocopy is not used
> > > > - packet size is smaller enough to use build_skb()
> > > > 
> > > > Pktgen from guest to host shows ~11% improvement for rx pps of tap:
> > > > 
> > > > Before: ~1.70Mpps
> > > > After : ~1.88Mpps
> > > > 
> > > > What's more important, this makes it possible to implement XDP for tap
> > > > before creating skbs.
> > > Well well well.
> > > 
> > > You do realize that tun_build_skb() is not thread safe ?
> > The issue is alloc frag, isn't it?
> > I guess for now we can limit this to XDP mode only, and
> > just allocate full pages in that mode.
> > 
> > 
> 
> Limit this to XDP mode only does not prevent user from sending packets to
> same queue in parallel I think?
> 
> Thanks

Yes but then you can just drop the page frag allocator since
XDP is assumed not to care about truesize for most packets.
Jason Wang Aug. 16, 2017, 4:07 a.m. UTC | #6
On 2017年08月16日 11:59, Michael S. Tsirkin wrote:
> On Wed, Aug 16, 2017 at 11:57:51AM +0800, Jason Wang wrote:
>>
>> On 2017年08月16日 11:55, Michael S. Tsirkin wrote:
>>> On Tue, Aug 15, 2017 at 08:45:20PM -0700, Eric Dumazet wrote:
>>>> On Fri, 2017-08-11 at 19:41 +0800, Jason Wang wrote:
>>>>> We use tun_alloc_skb() which calls sock_alloc_send_pskb() to allocate
>>>>> skb in the past. This socket based method is not suitable for high
>>>>> speed userspace like virtualization which usually:
>>>>>
>>>>> - ignore sk_sndbuf (INT_MAX) and expect to receive the packet as fast as
>>>>>     possible
>>>>> - don't want to be block at sendmsg()
>>>>>
>>>>> To eliminate the above overheads, this patch tries to use build_skb()
>>>>> for small packet. We will do this only when the following conditions
>>>>> are all met:
>>>>>
>>>>> - TAP instead of TUN
>>>>> - sk_sndbuf is INT_MAX
>>>>> - caller don't want to be blocked
>>>>> - zerocopy is not used
>>>>> - packet size is smaller enough to use build_skb()
>>>>>
>>>>> Pktgen from guest to host shows ~11% improvement for rx pps of tap:
>>>>>
>>>>> Before: ~1.70Mpps
>>>>> After : ~1.88Mpps
>>>>>
>>>>> What's more important, this makes it possible to implement XDP for tap
>>>>> before creating skbs.
>>>> Well well well.
>>>>
>>>> You do realize that tun_build_skb() is not thread safe ?
>>> The issue is alloc frag, isn't it?
>>> I guess for now we can limit this to XDP mode only, and
>>> just allocate full pages in that mode.
>>>
>>>
>> Limit this to XDP mode only does not prevent user from sending packets to
>> same queue in parallel I think?
>>
>> Thanks
> Yes but then you can just drop the page frag allocator since
> XDP is assumed not to care about truesize for most packets.
>

Ok, let me do some test to see the numbers between the two methods first.

Thanks
diff mbox

Patch

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index d21510d..9736df4 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -105,6 +105,8 @@  do {								\
 } while (0)
 #endif
 
+#define TUN_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD)
+
 /* TUN device flags */
 
 /* IFF_ATTACH_QUEUE is never stored in device flags,
@@ -170,6 +172,7 @@  struct tun_file {
 	struct list_head next;
 	struct tun_struct *detached;
 	struct skb_array tx_array;
+	struct page_frag alloc_frag;
 };
 
 struct tun_flow_entry {
@@ -571,6 +574,8 @@  static void __tun_detach(struct tun_file *tfile, bool clean)
 		}
 		if (tun)
 			skb_array_cleanup(&tfile->tx_array);
+		if (tfile->alloc_frag.page)
+			put_page(tfile->alloc_frag.page);
 		sock_put(&tfile->sk);
 	}
 }
@@ -1190,6 +1195,61 @@  static void tun_rx_batched(struct tun_struct *tun, struct tun_file *tfile,
 	}
 }
 
+static bool tun_can_build_skb(struct tun_struct *tun, struct tun_file *tfile,
+			      int len, int noblock, bool zerocopy)
+{
+	if ((tun->flags & TUN_TYPE_MASK) != IFF_TAP)
+		return false;
+
+	if (tfile->socket.sk->sk_sndbuf != INT_MAX)
+		return false;
+
+	if (!noblock)
+		return false;
+
+	if (zerocopy)
+		return false;
+
+	if (SKB_DATA_ALIGN(len + TUN_RX_PAD) +
+	    SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) > PAGE_SIZE)
+		return false;
+
+	return true;
+}
+
+static struct sk_buff *tun_build_skb(struct tun_file *tfile,
+				     struct iov_iter *from,
+				     int len)
+{
+	struct page_frag *alloc_frag = &tfile->alloc_frag;
+	struct sk_buff *skb;
+	int buflen = SKB_DATA_ALIGN(len + TUN_RX_PAD) +
+		     SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+	char *buf;
+	size_t copied;
+
+	if (unlikely(!skb_page_frag_refill(buflen, alloc_frag, GFP_KERNEL)))
+		return ERR_PTR(-ENOMEM);
+
+	buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
+	copied = copy_page_from_iter(alloc_frag->page,
+				     alloc_frag->offset + TUN_RX_PAD,
+				     len, from);
+	if (copied != len)
+		return ERR_PTR(-EFAULT);
+
+	skb = build_skb(buf, buflen);
+	if (!skb)
+		return ERR_PTR(-ENOMEM);
+
+	skb_reserve(skb, TUN_RX_PAD);
+	skb_put(skb, len);
+	get_page(alloc_frag->page);
+	alloc_frag->offset += buflen;
+
+	return skb;
+}
+
 /* Get packet from user space buffer */
 static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
 			    void *msg_control, struct iov_iter *from,
@@ -1263,30 +1323,38 @@  static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
 			zerocopy = true;
 	}
 
-	if (!zerocopy) {
-		copylen = len;
-		if (tun16_to_cpu(tun, gso.hdr_len) > good_linear)
-			linear = good_linear;
-		else
-			linear = tun16_to_cpu(tun, gso.hdr_len);
-	}
-
-	skb = tun_alloc_skb(tfile, align, copylen, linear, noblock);
-	if (IS_ERR(skb)) {
-		if (PTR_ERR(skb) != -EAGAIN)
+	if (tun_can_build_skb(tun, tfile, len, noblock, zerocopy)) {
+		skb = tun_build_skb(tfile, from, len);
+		if (IS_ERR(skb)) {
 			this_cpu_inc(tun->pcpu_stats->rx_dropped);
-		return PTR_ERR(skb);
-	}
+			return PTR_ERR(skb);
+		}
+	} else {
+		if (!zerocopy) {
+			copylen = len;
+			if (tun16_to_cpu(tun, gso.hdr_len) > good_linear)
+				linear = good_linear;
+			else
+				linear = tun16_to_cpu(tun, gso.hdr_len);
+		}
 
-	if (zerocopy)
-		err = zerocopy_sg_from_iter(skb, from);
-	else
-		err = skb_copy_datagram_from_iter(skb, 0, from, len);
+		skb = tun_alloc_skb(tfile, align, copylen, linear, noblock);
+		if (IS_ERR(skb)) {
+			if (PTR_ERR(skb) != -EAGAIN)
+				this_cpu_inc(tun->pcpu_stats->rx_dropped);
+			return PTR_ERR(skb);
+		}
 
-	if (err) {
-		this_cpu_inc(tun->pcpu_stats->rx_dropped);
-		kfree_skb(skb);
-		return -EFAULT;
+		if (zerocopy)
+			err = zerocopy_sg_from_iter(skb, from);
+		else
+			err = skb_copy_datagram_from_iter(skb, 0, from, len);
+
+		if (err) {
+			this_cpu_inc(tun->pcpu_stats->rx_dropped);
+			kfree_skb(skb);
+			return -EFAULT;
+		}
 	}
 
 	if (virtio_net_hdr_to_skb(skb, &gso, tun_is_little_endian(tun))) {
@@ -2377,6 +2445,8 @@  static int tun_chr_open(struct inode *inode, struct file * file)
 	tfile->sk.sk_write_space = tun_sock_write_space;
 	tfile->sk.sk_sndbuf = INT_MAX;
 
+	tfile->alloc_frag.page = NULL;
+
 	file->private_data = tfile;
 	INIT_LIST_HEAD(&tfile->next);