diff mbox series

[v3] tun: fix use-after-free when register netdev failed

Message ID 1566221479-16094-1-git-send-email-yangyingliang@huawei.com
State Changes Requested
Delegated to: David Miller
Headers show
Series [v3] tun: fix use-after-free when register netdev failed | expand

Commit Message

Yang Yingliang Aug. 19, 2019, 1:31 p.m. UTC
I got a UAF report in the tun driver when doing fuzz testing:

[  466.269490] ==================================================================
[  466.271792] BUG: KASAN: use-after-free in tun_chr_read_iter+0x2ca/0x2d0
[  466.271806] Read of size 8 at addr ffff888372139250 by task tun-test/2699
[  466.271810]
[  466.271824] CPU: 1 PID: 2699 Comm: tun-test Not tainted 5.3.0-rc1-00001-g5a9433db2614-dirty #427
[  466.271833] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.1-0-ga5cab58e9a3f-prebuilt.qemu.org 04/01/2014
[  466.271838] Call Trace:
[  466.271858]  dump_stack+0xca/0x13e
[  466.271871]  ? tun_chr_read_iter+0x2ca/0x2d0
[  466.271890]  print_address_description+0x79/0x440
[  466.271906]  ? vprintk_func+0x5e/0xf0
[  466.271920]  ? tun_chr_read_iter+0x2ca/0x2d0
[  466.271935]  __kasan_report+0x15c/0x1df
[  466.271958]  ? tun_chr_read_iter+0x2ca/0x2d0
[  466.271976]  kasan_report+0xe/0x20
[  466.271987]  tun_chr_read_iter+0x2ca/0x2d0
[  466.272013]  do_iter_readv_writev+0x4b7/0x740
[  466.272032]  ? default_llseek+0x2d0/0x2d0
[  466.272072]  do_iter_read+0x1c5/0x5e0
[  466.272110]  vfs_readv+0x108/0x180
[  466.299007]  ? compat_rw_copy_check_uvector+0x440/0x440
[  466.299020]  ? fsnotify+0x888/0xd50
[  466.299040]  ? __fsnotify_parent+0xd0/0x350
[  466.299064]  ? fsnotify_first_mark+0x1e0/0x1e0
[  466.304548]  ? vfs_write+0x264/0x510
[  466.304569]  ? ksys_write+0x101/0x210
[  466.304591]  ? do_preadv+0x116/0x1a0
[  466.304609]  do_preadv+0x116/0x1a0
[  466.309829]  do_syscall_64+0xc8/0x600
[  466.309849]  entry_SYSCALL_64_after_hwframe+0x49/0xbe
[  466.309861] RIP: 0033:0x4560f9
[  466.309875] Code: 00 00 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 b8 ff ff ff f7 d8 64 89 01 48
[  466.309889] RSP: 002b:00007ffffa5166e8 EFLAGS: 00000206 ORIG_RAX: 0000000000000127
[  466.322992] RAX: ffffffffffffffda RBX: 0000000000400460 RCX: 00000000004560f9
[  466.322999] RDX: 0000000000000003 RSI: 00000000200008c0 RDI: 0000000000000003
[  466.323007] RBP: 00007ffffa516700 R08: 0000000000000004 R09: 0000000000000000
[  466.323014] R10: 0000000000000000 R11: 0000000000000206 R12: 000000000040cb10
[  466.323021] R13: 0000000000000000 R14: 00000000006d7018 R15: 0000000000000000
[  466.323057]
[  466.323064] Allocated by task 2605:
[  466.335165]  save_stack+0x19/0x80
[  466.336240]  __kasan_kmalloc.constprop.8+0xa0/0xd0
[  466.337755]  kmem_cache_alloc+0xe8/0x320
[  466.339050]  getname_flags+0xca/0x560
[  466.340229]  user_path_at_empty+0x2c/0x50
[  466.341508]  vfs_statx+0xe6/0x190
[  466.342619]  __do_sys_newstat+0x81/0x100
[  466.343908]  do_syscall_64+0xc8/0x600
[  466.345303]  entry_SYSCALL_64_after_hwframe+0x49/0xbe
[  466.347034]
[  466.347517] Freed by task 2605:
[  466.348471]  save_stack+0x19/0x80
[  466.349476]  __kasan_slab_free+0x12e/0x180
[  466.350726]  kmem_cache_free+0xc8/0x430
[  466.351874]  putname+0xe2/0x120
[  466.352921]  filename_lookup+0x257/0x3e0
[  466.354319]  vfs_statx+0xe6/0x190
[  466.355498]  __do_sys_newstat+0x81/0x100
[  466.356889]  do_syscall_64+0xc8/0x600
[  466.358037]  entry_SYSCALL_64_after_hwframe+0x49/0xbe
[  466.359567]
[  466.360050] The buggy address belongs to the object at ffff888372139100
[  466.360050]  which belongs to the cache names_cache of size 4096
[  466.363735] The buggy address is located 336 bytes inside of
[  466.363735]  4096-byte region [ffff888372139100, ffff88837213a100)
[  466.367179] The buggy address belongs to the page:
[  466.368604] page:ffffea000dc84e00 refcount:1 mapcount:0 mapping:ffff8883df1b4f00 index:0x0 compound_mapcount: 0
[  466.371582] flags: 0x2fffff80010200(slab|head)
[  466.372910] raw: 002fffff80010200 dead000000000100 dead000000000122 ffff8883df1b4f00
[  466.375209] raw: 0000000000000000 0000000000070007 00000001ffffffff 0000000000000000
[  466.377778] page dumped because: kasan: bad access detected
[  466.379730]
[  466.380288] Memory state around the buggy address:
[  466.381844]  ffff888372139100: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
[  466.384009]  ffff888372139180: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
[  466.386131] >ffff888372139200: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
[  466.388257]                                                  ^
[  466.390234]  ffff888372139280: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
[  466.392512]  ffff888372139300: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
[  466.394667] ==================================================================

tun_chr_read_iter() accessed memory that had been freed by free_netdev(),
called from tun_set_iff():

        CPUA                                     CPUB
    tun_set_iff()
      alloc_netdev_mqs()
      tun_attach()
                                            tun_chr_read_iter()
                                              tun_get()
      register_netdevice() <-- inject error
      tun_detach_all()
        synchronize_net()
                                              tun_do_read()
                                                tun_ring_recv()
                                                  schedule()
      free_netdev()
        netdev_freemem()
                                              tun_put()
                                                dev_put() <-- UAF

Call netif_set_real_num_t/rx_queues() before register_netdevice().

Call tun_attach() after register_netdevice() to make sure tfile->tun
is not published until the netdevice is registered, so the read/write
thread cannot use a tun pointer that may be freed by free_netdev().
(The tun and dev pointers are allocated by alloc_netdev_mqs(); they can
be freed by netdev_freemem().)

---
Changes in v3:
 - call netif_set_real_num_t/rx_queues() before register_netdevice()

Changes in v2:
 - add a param in tun_set_real_num_queues()
 - move tun_set_real_num_queues() out of tun_attach()
 - call tun_set_real_num_queues() before register_netdevice()
 - call tun_attach() after register_netdevice()
---

Fixes: eb0fb363f920 ("tuntap: attach queue 0 before registering netdevice")
Reported-by: Hulk Robot <hulkci@huawei.com>
Suggested-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 drivers/net/tun.c | 25 +++++++++++++------------
 1 file changed, 13 insertions(+), 12 deletions(-)

Comments

David Miller Aug. 20, 2019, 1:25 a.m. UTC | #1
From: Yang Yingliang <yangyingliang@huawei.com>
Date: Mon, 19 Aug 2019 21:31:19 +0800

> Call tun_attach() after register_netdevice() to make sure tfile->tun
> is not published until the netdevice is registered. So the read/write
> thread can not use the tun pointer that may freed by free_netdev().
> (The tun and dev pointer are allocated by alloc_netdev_mqs(), they can
> be freed by netdev_freemem().)

register_netdevice() must always be the last operation in the order of
network device setup.

At the point register_netdevice() is called, the device is visible globally
and therefore all of its software state must be fully initialized and
ready for use.

You're going to have to find another solution to these problems.
Jason Wang Aug. 20, 2019, 2:28 a.m. UTC | #2
On 2019/8/20 上午9:25, David Miller wrote:
> From: Yang Yingliang <yangyingliang@huawei.com>
> Date: Mon, 19 Aug 2019 21:31:19 +0800
>
>> Call tun_attach() after register_netdevice() to make sure tfile->tun
>> is not published until the netdevice is registered. So the read/write
>> thread can not use the tun pointer that may freed by free_netdev().
>> (The tun and dev pointer are allocated by alloc_netdev_mqs(), they can
>> be freed by netdev_freemem().)
> register_netdevice() must always be the last operation in the order of
> network device setup.
>
> At the point register_netdevice() is called, the device is visible globally
> and therefore all of it's software state must be fully initialized and
> ready for us.
>
> You're going to have to find another solution to these problems.


The device is loosely coupled with sockets/queues. Each side is allowed
to go away without caring about the other side. So in this case, there's a
small window in which the network stack thinks the device has one queue
when it actually has none; the code can then safely drop them. Maybe it's
ok here with some comments?

Or if not, we can try to hold a reference on the device before tun_attach()
and drop it after register_netdevice().

Thanks
Jason Wang Aug. 22, 2019, 2:13 a.m. UTC | #3
On 2019/8/20 上午10:28, Jason Wang wrote:
>
> On 2019/8/20 上午9:25, David Miller wrote:
>> From: Yang Yingliang <yangyingliang@huawei.com>
>> Date: Mon, 19 Aug 2019 21:31:19 +0800
>>
>>> Call tun_attach() after register_netdevice() to make sure tfile->tun
>>> is not published until the netdevice is registered. So the read/write
>>> thread can not use the tun pointer that may freed by free_netdev().
>>> (The tun and dev pointer are allocated by alloc_netdev_mqs(), they can
>>> be freed by netdev_freemem().)
>> register_netdevice() must always be the last operation in the order of
>> network device setup.
>>
>> At the point register_netdevice() is called, the device is visible 
>> globally
>> and therefore all of it's software state must be fully initialized and
>> ready for us.
>>
>> You're going to have to find another solution to these problems.
>
>
> The device is loosely coupled with sockets/queues. Each side is 
> allowed to be go away without caring the other side. So in this case, 
> there's a small window that network stack think the device has one 
> queue but actually not, the code can then safely drop them. Maybe it's 
> ok here with some comments?
>
> Or if not, we can try to hold the device before tun_attach and drop it 
> after register_netdevice().


Hi Yang:

I think maybe we can try to hold a refcnt instead of playing with the real
number of queues here. Do you want to post a V4?

Thanks


>
> Thanks
>
Yang Yingliang Aug. 22, 2019, 6:07 a.m. UTC | #4
On 2019/8/22 10:13, Jason Wang wrote:
>
> On 2019/8/20 上午10:28, Jason Wang wrote:
>>
>> On 2019/8/20 上午9:25, David Miller wrote:
>>> From: Yang Yingliang <yangyingliang@huawei.com>
>>> Date: Mon, 19 Aug 2019 21:31:19 +0800
>>>
>>>> Call tun_attach() after register_netdevice() to make sure tfile->tun
>>>> is not published until the netdevice is registered. So the read/write
>>>> thread can not use the tun pointer that may freed by free_netdev().
>>>> (The tun and dev pointer are allocated by alloc_netdev_mqs(), they can
>>>> be freed by netdev_freemem().)
>>> register_netdevice() must always be the last operation in the order of
>>> network device setup.
>>>
>>> At the point register_netdevice() is called, the device is visible 
>>> globally
>>> and therefore all of it's software state must be fully initialized and
>>> ready for us.
>>>
>>> You're going to have to find another solution to these problems.
>>
>>
>> The device is loosely coupled with sockets/queues. Each side is 
>> allowed to be go away without caring the other side. So in this case, 
>> there's a small window that network stack think the device has one 
>> queue but actually not, the code can then safely drop them. Maybe 
>> it's ok here with some comments?
>>
>> Or if not, we can try to hold the device before tun_attach and drop 
>> it after register_netdevice().
>
>
> Hi Yang:
>
> I think maybe we can try to hold refcnt instead of playing real num 
> queues here. Do you want to post a V4?
I think the refcnt can't prevent freeing the memory in this case.
When register_netdevice() fails, free_netdev() is called directly, and
dev->pcpu_refcnt and dev are freed without checking the refcnt of dev.

>
> Thanks
>
>
>>
>> Thanks
>>
>
> .
>
Yang Yingliang Aug. 22, 2019, 12:55 p.m. UTC | #5
On 2019/8/22 14:07, Yang Yingliang wrote:
>
>
> On 2019/8/22 10:13, Jason Wang wrote:
>>
>> On 2019/8/20 上午10:28, Jason Wang wrote:
>>>
>>> On 2019/8/20 上午9:25, David Miller wrote:
>>>> From: Yang Yingliang <yangyingliang@huawei.com>
>>>> Date: Mon, 19 Aug 2019 21:31:19 +0800
>>>>
>>>>> Call tun_attach() after register_netdevice() to make sure tfile->tun
>>>>> is not published until the netdevice is registered. So the read/write
>>>>> thread can not use the tun pointer that may freed by free_netdev().
>>>>> (The tun and dev pointer are allocated by alloc_netdev_mqs(), they 
>>>>> can
>>>>> be freed by netdev_freemem().)
>>>> register_netdevice() must always be the last operation in the order of
>>>> network device setup.
>>>>
>>>> At the point register_netdevice() is called, the device is visible 
>>>> globally
>>>> and therefore all of it's software state must be fully initialized and
>>>> ready for us.
>>>>
>>>> You're going to have to find another solution to these problems.
>>>
>>>
>>> The device is loosely coupled with sockets/queues. Each side is 
>>> allowed to be go away without caring the other side. So in this 
>>> case, there's a small window that network stack think the device has 
>>> one queue but actually not, the code can then safely drop them. 
>>> Maybe it's ok here with some comments?
>>>
>>> Or if not, we can try to hold the device before tun_attach and drop 
>>> it after register_netdevice().
>>
>>
>> Hi Yang:
>>
>> I think maybe we can try to hold refcnt instead of playing real num 
>> queues here. Do you want to post a V4?
> I think the refcnt can prevent freeing the memory in this case.
> When register_netdevice() failed, free_netdev() will be called directly,
> dev->pcpu_refcnt and dev are freed without checking refcnt of dev.
How about using patch v1, which uses a flag to check whether the device
registered successfully?

>
>>
>> Thanks
>>
>>
>>>
>>> Thanks
>>>
>>
>> .
>>
>
>
>
> .
>
Jason Wang Aug. 23, 2019, 3:05 a.m. UTC | #6
----- Original Message -----
> 
> 
> On 2019/8/22 14:07, Yang Yingliang wrote:
> >
> >
> > On 2019/8/22 10:13, Jason Wang wrote:
> >>
> >> On 2019/8/20 上午10:28, Jason Wang wrote:
> >>>
> >>> On 2019/8/20 上午9:25, David Miller wrote:
> >>>> From: Yang Yingliang <yangyingliang@huawei.com>
> >>>> Date: Mon, 19 Aug 2019 21:31:19 +0800
> >>>>
> >>>>> Call tun_attach() after register_netdevice() to make sure tfile->tun
> >>>>> is not published until the netdevice is registered. So the read/write
> >>>>> thread can not use the tun pointer that may freed by free_netdev().
> >>>>> (The tun and dev pointer are allocated by alloc_netdev_mqs(), they
> >>>>> can
> >>>>> be freed by netdev_freemem().)
> >>>> register_netdevice() must always be the last operation in the order of
> >>>> network device setup.
> >>>>
> >>>> At the point register_netdevice() is called, the device is visible
> >>>> globally
> >>>> and therefore all of it's software state must be fully initialized and
> >>>> ready for us.
> >>>>
> >>>> You're going to have to find another solution to these problems.
> >>>
> >>>
> >>> The device is loosely coupled with sockets/queues. Each side is
> >>> allowed to be go away without caring the other side. So in this
> >>> case, there's a small window that network stack think the device has
> >>> one queue but actually not, the code can then safely drop them.
> >>> Maybe it's ok here with some comments?
> >>>
> >>> Or if not, we can try to hold the device before tun_attach and drop
> >>> it after register_netdevice().
> >>
> >>
> >> Hi Yang:
> >>
> >> I think maybe we can try to hold refcnt instead of playing real num
> >> queues here. Do you want to post a V4?
> > I think the refcnt can prevent freeing the memory in this case.
> > When register_netdevice() failed, free_netdev() will be called directly,
> > dev->pcpu_refcnt and dev are freed without checking refcnt of dev.
> How about using patch-v1 that using a flag to check whether the device
> registered successfully.
>

As I said, it lacks sufficient locks or barriers. To be clear, I meant
something like (compile-test only):

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index db16d7a13e00..e52678f9f049 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -2828,6 +2828,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
                              (ifr->ifr_flags & TUN_FEATURES);
 
                INIT_LIST_HEAD(&tun->disabled);
+               dev_hold(dev);
                err = tun_attach(tun, file, false, ifr->ifr_flags & IFF_NAPI,
                                 ifr->ifr_flags & IFF_NAPI_FRAGS);
                if (err < 0)
@@ -2836,6 +2837,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
                err = register_netdevice(tun->dev);
                if (err < 0)
                        goto err_detach;
+               dev_put(dev);
        }
 
        netif_carrier_on(tun->dev);
@@ -2852,11 +2854,13 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
        return 0;
 
 err_detach:
+       dev_put(dev);
        tun_detach_all(dev);
        /* register_netdevice() already called tun_free_netdev() */
        goto err_free_dev;
 
 err_free_flow:
+       dev_put(dev);
        tun_flow_uninit(tun);
        security_tun_dev_free_security(tun->security);
 err_free_stat:

What's your thought?

Thanks
Yang Yingliang Aug. 23, 2019, 9:36 a.m. UTC | #7
On 2019/8/23 11:05, Jason Wang wrote:
> ----- Original Message -----
>>
>> On 2019/8/22 14:07, Yang Yingliang wrote:
>>>
>>> On 2019/8/22 10:13, Jason Wang wrote:
>>>> On 2019/8/20 上午10:28, Jason Wang wrote:
>>>>> On 2019/8/20 上午9:25, David Miller wrote:
>>>>>> From: Yang Yingliang <yangyingliang@huawei.com>
>>>>>> Date: Mon, 19 Aug 2019 21:31:19 +0800
>>>>>>
>>>>>>> Call tun_attach() after register_netdevice() to make sure tfile->tun
>>>>>>> is not published until the netdevice is registered. So the read/write
>>>>>>> thread can not use the tun pointer that may freed by free_netdev().
>>>>>>> (The tun and dev pointer are allocated by alloc_netdev_mqs(), they
>>>>>>> can
>>>>>>> be freed by netdev_freemem().)
>>>>>> register_netdevice() must always be the last operation in the order of
>>>>>> network device setup.
>>>>>>
>>>>>> At the point register_netdevice() is called, the device is visible
>>>>>> globally
>>>>>> and therefore all of it's software state must be fully initialized and
>>>>>> ready for us.
>>>>>>
>>>>>> You're going to have to find another solution to these problems.
>>>>>
>>>>> The device is loosely coupled with sockets/queues. Each side is
>>>>> allowed to be go away without caring the other side. So in this
>>>>> case, there's a small window that network stack think the device has
>>>>> one queue but actually not, the code can then safely drop them.
>>>>> Maybe it's ok here with some comments?
>>>>>
>>>>> Or if not, we can try to hold the device before tun_attach and drop
>>>>> it after register_netdevice().
>>>>
>>>> Hi Yang:
>>>>
>>>> I think maybe we can try to hold refcnt instead of playing real num
>>>> queues here. Do you want to post a V4?
>>> I think the refcnt can prevent freeing the memory in this case.
>>> When register_netdevice() failed, free_netdev() will be called directly,
>>> dev->pcpu_refcnt and dev are freed without checking refcnt of dev.
>> How about using patch-v1 that using a flag to check whether the device
>> registered successfully.
>>
> As I said, it lacks sufficient locks or barriers. To be clear, I meant
> something like (compile-test only):
>
> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
> index db16d7a13e00..e52678f9f049 100644
> --- a/drivers/net/tun.c
> +++ b/drivers/net/tun.c
> @@ -2828,6 +2828,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
>                                (ifr->ifr_flags & TUN_FEATURES);
>   
>                  INIT_LIST_HEAD(&tun->disabled);
> +               dev_hold(dev);
>                  err = tun_attach(tun, file, false, ifr->ifr_flags & IFF_NAPI,
>                                   ifr->ifr_flags & IFF_NAPI_FRAGS);
>                  if (err < 0)
> @@ -2836,6 +2837,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
>                  err = register_netdevice(tun->dev);
>                  if (err < 0)
>                          goto err_detach;
> +               dev_put(dev);
>          }
>   
>          netif_carrier_on(tun->dev);
> @@ -2852,11 +2854,13 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
>          return 0;
>   
>   err_detach:
> +       dev_put(dev);
>          tun_detach_all(dev);
>          /* register_netdevice() already called tun_free_netdev() */
>          goto err_free_dev;
>   
>   err_free_flow:
> +       dev_put(dev);
>          tun_flow_uninit(tun);
>          security_tun_dev_free_security(tun->security);
>   err_free_stat:
>
> What's your thought?

The dev pointer is freed without checking the refcount in free_netdev(), which
is called on the err_free_dev path, so I don't understand how the refcount
protects this pointer.

Thanks,
Yang

>
> Thanks
>
> .
>
Jason Wang Sept. 2, 2019, 5:32 a.m. UTC | #8
On 2019/8/23 下午5:36, Yang Yingliang wrote:
>
>
> On 2019/8/23 11:05, Jason Wang wrote:
>> ----- Original Message -----
>>>
>>> On 2019/8/22 14:07, Yang Yingliang wrote:
>>>>
>>>> On 2019/8/22 10:13, Jason Wang wrote:
>>>>> On 2019/8/20 上午10:28, Jason Wang wrote:
>>>>>> On 2019/8/20 上午9:25, David Miller wrote:
>>>>>>> From: Yang Yingliang <yangyingliang@huawei.com>
>>>>>>> Date: Mon, 19 Aug 2019 21:31:19 +0800
>>>>>>>
>>>>>>>> Call tun_attach() after register_netdevice() to make sure 
>>>>>>>> tfile->tun
>>>>>>>> is not published until the netdevice is registered. So the 
>>>>>>>> read/write
>>>>>>>> thread can not use the tun pointer that may freed by 
>>>>>>>> free_netdev().
>>>>>>>> (The tun and dev pointer are allocated by alloc_netdev_mqs(), they
>>>>>>>> can
>>>>>>>> be freed by netdev_freemem().)
>>>>>>> register_netdevice() must always be the last operation in the 
>>>>>>> order of
>>>>>>> network device setup.
>>>>>>>
>>>>>>> At the point register_netdevice() is called, the device is visible
>>>>>>> globally
>>>>>>> and therefore all of it's software state must be fully 
>>>>>>> initialized and
>>>>>>> ready for us.
>>>>>>>
>>>>>>> You're going to have to find another solution to these problems.
>>>>>>
>>>>>> The device is loosely coupled with sockets/queues. Each side is
>>>>>> allowed to be go away without caring the other side. So in this
>>>>>> case, there's a small window that network stack think the device has
>>>>>> one queue but actually not, the code can then safely drop them.
>>>>>> Maybe it's ok here with some comments?
>>>>>>
>>>>>> Or if not, we can try to hold the device before tun_attach and drop
>>>>>> it after register_netdevice().
>>>>>
>>>>> Hi Yang:
>>>>>
>>>>> I think maybe we can try to hold refcnt instead of playing real num
>>>>> queues here. Do you want to post a V4?
>>>> I think the refcnt can prevent freeing the memory in this case.
>>>> When register_netdevice() failed, free_netdev() will be called 
>>>> directly,
>>>> dev->pcpu_refcnt and dev are freed without checking refcnt of dev.
>>> How about using patch-v1 that using a flag to check whether the device
>>> registered successfully.
>>>
>> As I said, it lacks sufficient locks or barriers. To be clear, I meant
>> something like (compile-test only):
>>
>> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
>> index db16d7a13e00..e52678f9f049 100644
>> --- a/drivers/net/tun.c
>> +++ b/drivers/net/tun.c
>> @@ -2828,6 +2828,7 @@ static int tun_set_iff(struct net *net, struct 
>> file *file, struct ifreq *ifr)
>>                                (ifr->ifr_flags & TUN_FEATURES);
>>                    INIT_LIST_HEAD(&tun->disabled);
>> +               dev_hold(dev);
>>                  err = tun_attach(tun, file, false, ifr->ifr_flags & 
>> IFF_NAPI,
>>                                   ifr->ifr_flags & IFF_NAPI_FRAGS);
>>                  if (err < 0)
>> @@ -2836,6 +2837,7 @@ static int tun_set_iff(struct net *net, struct 
>> file *file, struct ifreq *ifr)
>>                  err = register_netdevice(tun->dev);
>>                  if (err < 0)
>>                          goto err_detach;
>> +               dev_put(dev);
>>          }
>>            netif_carrier_on(tun->dev);
>> @@ -2852,11 +2854,13 @@ static int tun_set_iff(struct net *net, 
>> struct file *file, struct ifreq *ifr)
>>          return 0;
>>     err_detach:
>> +       dev_put(dev);
>>          tun_detach_all(dev);
>>          /* register_netdevice() already called tun_free_netdev() */
>>          goto err_free_dev;
>>     err_free_flow:
>> +       dev_put(dev);
>>          tun_flow_uninit(tun);
>>          security_tun_dev_free_security(tun->security);
>>   err_free_stat:
>>
>> What's your thought?
>
> The dev pointer are freed without checking the refcount in 
> free_netdev() called by err_free_dev
>
> path, so I don't understand how the refcount protects this pointer.
>

The refcount is guaranteed to be zero there, isn't it?

Thanks


> Thanks,
> Yang
>
>>
>> Thanks
>>
>> .
>>
>
>
Yang Yingliang Sept. 3, 2019, 1:45 a.m. UTC | #9
On 2019/9/2 13:32, Jason Wang wrote:
>
> On 2019/8/23 下午5:36, Yang Yingliang wrote:
>>
>>
>> On 2019/8/23 11:05, Jason Wang wrote:
>>> ----- Original Message -----
>>>>
>>>> On 2019/8/22 14:07, Yang Yingliang wrote:
>>>>>
>>>>> On 2019/8/22 10:13, Jason Wang wrote:
>>>>>> On 2019/8/20 上午10:28, Jason Wang wrote:
>>>>>>> On 2019/8/20 上午9:25, David Miller wrote:
>>>>>>>> From: Yang Yingliang <yangyingliang@huawei.com>
>>>>>>>> Date: Mon, 19 Aug 2019 21:31:19 +0800
>>>>>>>>
>>>>>>>>> Call tun_attach() after register_netdevice() to make sure 
>>>>>>>>> tfile->tun
>>>>>>>>> is not published until the netdevice is registered. So the 
>>>>>>>>> read/write
>>>>>>>>> thread can not use the tun pointer that may freed by 
>>>>>>>>> free_netdev().
>>>>>>>>> (The tun and dev pointer are allocated by alloc_netdev_mqs(), 
>>>>>>>>> they
>>>>>>>>> can
>>>>>>>>> be freed by netdev_freemem().)
>>>>>>>> register_netdevice() must always be the last operation in the 
>>>>>>>> order of
>>>>>>>> network device setup.
>>>>>>>>
>>>>>>>> At the point register_netdevice() is called, the device is visible
>>>>>>>> globally
>>>>>>>> and therefore all of it's software state must be fully 
>>>>>>>> initialized and
>>>>>>>> ready for us.
>>>>>>>>
>>>>>>>> You're going to have to find another solution to these problems.
>>>>>>>
>>>>>>> The device is loosely coupled with sockets/queues. Each side is
>>>>>>> allowed to be go away without caring the other side. So in this
>>>>>>> case, there's a small window that network stack think the device 
>>>>>>> has
>>>>>>> one queue but actually not, the code can then safely drop them.
>>>>>>> Maybe it's ok here with some comments?
>>>>>>>
>>>>>>> Or if not, we can try to hold the device before tun_attach and drop
>>>>>>> it after register_netdevice().
>>>>>>
>>>>>> Hi Yang:
>>>>>>
>>>>>> I think maybe we can try to hold refcnt instead of playing real num
>>>>>> queues here. Do you want to post a V4?
>>>>> I think the refcnt can prevent freeing the memory in this case.
>>>>> When register_netdevice() failed, free_netdev() will be called 
>>>>> directly,
>>>>> dev->pcpu_refcnt and dev are freed without checking refcnt of dev.
>>>> How about using patch-v1 that using a flag to check whether the device
>>>> registered successfully.
>>>>
>>> As I said, it lacks sufficient locks or barriers. To be clear, I meant
>>> something like (compile-test only):
>>>
>>> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
>>> index db16d7a13e00..e52678f9f049 100644
>>> --- a/drivers/net/tun.c
>>> +++ b/drivers/net/tun.c
>>> @@ -2828,6 +2828,7 @@ static int tun_set_iff(struct net *net, struct 
>>> file *file, struct ifreq *ifr)
>>>                                (ifr->ifr_flags & TUN_FEATURES);
>>>                    INIT_LIST_HEAD(&tun->disabled);
>>> +               dev_hold(dev);
>>>                  err = tun_attach(tun, file, false, ifr->ifr_flags & 
>>> IFF_NAPI,
>>>                                   ifr->ifr_flags & IFF_NAPI_FRAGS);
>>>                  if (err < 0)
>>> @@ -2836,6 +2837,7 @@ static int tun_set_iff(struct net *net, struct 
>>> file *file, struct ifreq *ifr)
>>>                  err = register_netdevice(tun->dev);
>>>                  if (err < 0)
>>>                          goto err_detach;
>>> +               dev_put(dev);
>>>          }
>>>            netif_carrier_on(tun->dev);
>>> @@ -2852,11 +2854,13 @@ static int tun_set_iff(struct net *net, 
>>> struct file *file, struct ifreq *ifr)
>>>          return 0;
>>>     err_detach:
>>> +       dev_put(dev);
>>>          tun_detach_all(dev);
>>>          /* register_netdevice() already called tun_free_netdev() */
>>>          goto err_free_dev;
>>>     err_free_flow:
>>> +       dev_put(dev);
>>>          tun_flow_uninit(tun);
>>>          security_tun_dev_free_security(tun->security);
>>>   err_free_stat:
>>>
>>> What's your thought?
>>
>> The dev pointer are freed without checking the refcount in 
>> free_netdev() called by err_free_dev
>>
>> path, so I don't understand how the refcount protects this pointer.
>>
>
> The refcount are guaranteed to be zero there, isn't it?
No, it's not.

err_free_dev:
         free_netdev(dev);

void free_netdev(struct net_device *dev)
{
...
         /* pcpu_refcnt can be freed without checking refcount */
         free_percpu(dev->pcpu_refcnt);
         dev->pcpu_refcnt = NULL;

         /*  Compatibility with error handling in drivers */
         if (dev->reg_state == NETREG_UNINITIALIZED) {
                 /* dev can be freed without checking refcount */
                 netdev_freemem(dev);
                 return;
         }
...
}

>
> Thanks
>
>
>> Thanks,
>> Yang
>>
>>>
>>> Thanks
>>>
>>> .
>>>
>>
>>
>
> .
>
Jason Wang Sept. 3, 2019, 3:03 a.m. UTC | #10
On 2019/9/3 上午9:45, Yang Yingliang wrote:
>
>
> On 2019/9/2 13:32, Jason Wang wrote:
>>
>> On 2019/8/23 下午5:36, Yang Yingliang wrote:
>>>
>>>
>>> On 2019/8/23 11:05, Jason Wang wrote:
>>>> ----- Original Message -----
>>>>>
>>>>> On 2019/8/22 14:07, Yang Yingliang wrote:
>>>>>>
>>>>>> On 2019/8/22 10:13, Jason Wang wrote:
>>>>>>> On 2019/8/20 上午10:28, Jason Wang wrote:
>>>>>>>> On 2019/8/20 上午9:25, David Miller wrote:
>>>>>>>>> From: Yang Yingliang <yangyingliang@huawei.com>
>>>>>>>>> Date: Mon, 19 Aug 2019 21:31:19 +0800
>>>>>>>>>
>>>>>>>>>> Call tun_attach() after register_netdevice() to make sure 
>>>>>>>>>> tfile->tun
>>>>>>>>>> is not published until the netdevice is registered. So the 
>>>>>>>>>> read/write
>>>>>>>>>> thread can not use the tun pointer that may freed by 
>>>>>>>>>> free_netdev().
>>>>>>>>>> (The tun and dev pointer are allocated by alloc_netdev_mqs(), 
>>>>>>>>>> they
>>>>>>>>>> can
>>>>>>>>>> be freed by netdev_freemem().)
>>>>>>>>> register_netdevice() must always be the last operation in the 
>>>>>>>>> order of
>>>>>>>>> network device setup.
>>>>>>>>>
>>>>>>>>> At the point register_netdevice() is called, the device is 
>>>>>>>>> visible
>>>>>>>>> globally
>>>>>>>>> and therefore all of it's software state must be fully 
>>>>>>>>> initialized and
>>>>>>>>> ready for us.
>>>>>>>>>
>>>>>>>>> You're going to have to find another solution to these problems.
>>>>>>>>
>>>>>>>> The device is loosely coupled with sockets/queues. Each side is
>>>>>>>> allowed to be go away without caring the other side. So in this
>>>>>>>> case, there's a small window that network stack think the 
>>>>>>>> device has
>>>>>>>> one queue but actually not, the code can then safely drop them.
>>>>>>>> Maybe it's ok here with some comments?
>>>>>>>>
>>>>>>>> Or if not, we can try to hold the device before tun_attach and 
>>>>>>>> drop
>>>>>>>> it after register_netdevice().
>>>>>>>
>>>>>>> Hi Yang:
>>>>>>>
>>>>>>> I think maybe we can try to hold refcnt instead of playing real num
>>>>>>> queues here. Do you want to post a V4?
>>>>>> I think the refcnt can prevent freeing the memory in this case.
>>>>>> When register_netdevice() failed, free_netdev() will be called 
>>>>>> directly,
>>>>>> dev->pcpu_refcnt and dev are freed without checking refcnt of dev.
>>>>> How about using patch-v1 that using a flag to check whether the 
>>>>> device
>>>>> registered successfully.
>>>>>
>>>> As I said, it lacks sufficient locks or barriers. To be clear, I meant
>>>> something like (compile-test only):
>>>>
>>>> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
>>>> index db16d7a13e00..e52678f9f049 100644
>>>> --- a/drivers/net/tun.c
>>>> +++ b/drivers/net/tun.c
>>>> @@ -2828,6 +2828,7 @@ static int tun_set_iff(struct net *net, 
>>>> struct file *file, struct ifreq *ifr)
>>>>                                (ifr->ifr_flags & TUN_FEATURES);
>>>>                    INIT_LIST_HEAD(&tun->disabled);
>>>> +               dev_hold(dev);
>>>>                  err = tun_attach(tun, file, false, ifr->ifr_flags 
>>>> & IFF_NAPI,
>>>>                                   ifr->ifr_flags & IFF_NAPI_FRAGS);
>>>>                  if (err < 0)
>>>> @@ -2836,6 +2837,7 @@ static int tun_set_iff(struct net *net, 
>>>> struct file *file, struct ifreq *ifr)
>>>>                  err = register_netdevice(tun->dev);
>>>>                  if (err < 0)
>>>>                          goto err_detach;
>>>> +               dev_put(dev);
>>>>          }
>>>>            netif_carrier_on(tun->dev);
>>>> @@ -2852,11 +2854,13 @@ static int tun_set_iff(struct net *net, 
>>>> struct file *file, struct ifreq *ifr)
>>>>          return 0;
>>>>     err_detach:
>>>> +       dev_put(dev);
>>>>          tun_detach_all(dev);
>>>>          /* register_netdevice() already called tun_free_netdev() */
>>>>          goto err_free_dev;
>>>>     err_free_flow:
>>>> +       dev_put(dev);
>>>>          tun_flow_uninit(tun);
>>>>          security_tun_dev_free_security(tun->security);
>>>>   err_free_stat:
>>>>
>>>> What's your thought?
>>>
>>> The dev pointer are freed without checking the refcount in 
>>> free_netdev() called by err_free_dev
>>>
>>> path, so I don't understand how the refcount protects this pointer.
>>>
>>
>> The refcount are guaranteed to be zero there, isn't it?
> No, it's not.
>
> err_free_dev:
>         free_netdev(dev);
>
> void free_netdev(struct net_device *dev)
> {
> ...
>         /* pcpu_refcnt can be freed without checking refcount */
>         free_percpu(dev->pcpu_refcnt);
>         dev->pcpu_refcnt = NULL;
>
>         /*  Compatibility with error handling in drivers */
>         if (dev->reg_state == NETREG_UNINITIALIZED) {
>                 /* dev can be freed without checking refcount */
>                 netdev_freemem(dev);
>                 return;
>         }
> ...
> }


Right, but what I meant is in my patch, when code reaches free_netdev() 
the refcnt is zero. What did I miss?

Thanks


>
>>
>> Thanks
>>
>>
>>> Thanks,
>>> Yang
>>>
>>>>
>>>> Thanks
>>>>
>>>> .
>>>>
>>>
>>>
>>
>> .
>>
>
>
Yang Yingliang Sept. 3, 2019, 5:42 a.m. UTC | #11
On 2019/9/3 11:03, Jason Wang wrote:
>
> On 2019/9/3 上午9:45, Yang Yingliang wrote:
>>
>>
>> On 2019/9/2 13:32, Jason Wang wrote:
>>>
>>> On 2019/8/23 下午5:36, Yang Yingliang wrote:
>>>>
>>>>
>>>> On 2019/8/23 11:05, Jason Wang wrote:
>>>>> ----- Original Message -----
>>>>>>
>>>>>> On 2019/8/22 14:07, Yang Yingliang wrote:
>>>>>>>
>>>>>>> On 2019/8/22 10:13, Jason Wang wrote:
>>>>>>>> On 2019/8/20 上午10:28, Jason Wang wrote:
>>>>>>>>> On 2019/8/20 上午9:25, David Miller wrote:
>>>>>>>>>> From: Yang Yingliang <yangyingliang@huawei.com>
>>>>>>>>>> Date: Mon, 19 Aug 2019 21:31:19 +0800
>>>>>>>>>>
>>>>>>>>>>> Call tun_attach() after register_netdevice() to make sure 
>>>>>>>>>>> tfile->tun
>>>>>>>>>>> is not published until the netdevice is registered. So the 
>>>>>>>>>>> read/write
>>>>>>>>>>> thread can not use the tun pointer that may freed by 
>>>>>>>>>>> free_netdev().
>>>>>>>>>>> (The tun and dev pointer are allocated by 
>>>>>>>>>>> alloc_netdev_mqs(), they
>>>>>>>>>>> can
>>>>>>>>>>> be freed by netdev_freemem().)
>>>>>>>>>> register_netdevice() must always be the last operation in the 
>>>>>>>>>> order of
>>>>>>>>>> network device setup.
>>>>>>>>>>
>>>>>>>>>> At the point register_netdevice() is called, the device is 
>>>>>>>>>> visible
>>>>>>>>>> globally
>>>>>>>>>> and therefore all of it's software state must be fully 
>>>>>>>>>> initialized and
>>>>>>>>>> ready for us.
>>>>>>>>>>
>>>>>>>>>> You're going to have to find another solution to these problems.
>>>>>>>>>
>>>>>>>>> The device is loosely coupled with sockets/queues. Each side is
>>>>>>>>> allowed to be go away without caring the other side. So in this
>>>>>>>>> case, there's a small window that network stack think the 
>>>>>>>>> device has
>>>>>>>>> one queue but actually not, the code can then safely drop them.
>>>>>>>>> Maybe it's ok here with some comments?
>>>>>>>>>
>>>>>>>>> Or if not, we can try to hold the device before tun_attach and 
>>>>>>>>> drop
>>>>>>>>> it after register_netdevice().
>>>>>>>>
>>>>>>>> Hi Yang:
>>>>>>>>
>>>>>>>> I think maybe we can try to hold refcnt instead of playing real 
>>>>>>>> num
>>>>>>>> queues here. Do you want to post a V4?
>>>>>>> I think the refcnt can prevent freeing the memory in this case.
>>>>>>> When register_netdevice() failed, free_netdev() will be called 
>>>>>>> directly,
>>>>>>> dev->pcpu_refcnt and dev are freed without checking refcnt of dev.
>>>>>> How about using patch-v1 that using a flag to check whether the 
>>>>>> device
>>>>>> registered successfully.
>>>>>>
>>>>> As I said, it lacks sufficient locks or barriers. To be clear, I 
>>>>> meant
>>>>> something like (compile-test only):
>>>>>
>>>>> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
>>>>> index db16d7a13e00..e52678f9f049 100644
>>>>> --- a/drivers/net/tun.c
>>>>> +++ b/drivers/net/tun.c
>>>>> @@ -2828,6 +2828,7 @@ static int tun_set_iff(struct net *net, 
>>>>> struct file *file, struct ifreq *ifr)
>>>>>                                (ifr->ifr_flags & TUN_FEATURES);
>>>>>                    INIT_LIST_HEAD(&tun->disabled);
>>>>> +               dev_hold(dev);
>>>>>                  err = tun_attach(tun, file, false, ifr->ifr_flags 
>>>>> & IFF_NAPI,
>>>>>                                   ifr->ifr_flags & IFF_NAPI_FRAGS);
>>>>>                  if (err < 0)
>>>>> @@ -2836,6 +2837,7 @@ static int tun_set_iff(struct net *net, 
>>>>> struct file *file, struct ifreq *ifr)
>>>>>                  err = register_netdevice(tun->dev);
>>>>>                  if (err < 0)
>>>>>                          goto err_detach;
>>>>> +               dev_put(dev);
>>>>>          }
>>>>>            netif_carrier_on(tun->dev);
>>>>> @@ -2852,11 +2854,13 @@ static int tun_set_iff(struct net *net, 
>>>>> struct file *file, struct ifreq *ifr)
>>>>>          return 0;
>>>>>     err_detach:
>>>>> +       dev_put(dev);
>>>>>          tun_detach_all(dev);
>>>>>          /* register_netdevice() already called tun_free_netdev() */
>>>>>          goto err_free_dev;
>>>>>     err_free_flow:
>>>>> +       dev_put(dev);
>>>>>          tun_flow_uninit(tun);
>>>>>          security_tun_dev_free_security(tun->security);
>>>>>   err_free_stat:
>>>>>
>>>>> What's your thought?
>>>>
>>>> The dev pointer are freed without checking the refcount in 
>>>> free_netdev() called by err_free_dev
>>>>
>>>> path, so I don't understand how the refcount protects this pointer.
>>>>
>>>
>>> The refcount are guaranteed to be zero there, isn't it?
>> No, it's not.
>>
>> err_free_dev:
>>         free_netdev(dev);
>>
>> void free_netdev(struct net_device *dev)
>> {
>> ...
>>         /* pcpu_refcnt can be freed without checking refcount */
>>         free_percpu(dev->pcpu_refcnt);
>>         dev->pcpu_refcnt = NULL;
>>
>>         /*  Compatibility with error handling in drivers */
>>         if (dev->reg_state == NETREG_UNINITIALIZED) {
>>                 /* dev can be freed without checking refcount */
>>                 netdev_freemem(dev);
>>                 return;
>>         }
>> ...
>> }
>
>
> Right, but what I meant is in my patch, when code reaches 
> free_netdev() the refcnt is zero. What did I miss?
Yes, but it can't fix the UAF problem.
>
> Thanks
>
>
>>
>>>
>>> Thanks
>>>
>>>
>>>> Thanks,
>>>> Yang
>>>>
>>>>>
>>>>> Thanks
>>>>>
>>>>> .
>>>>>
>>>>
>>>>
>>>
>>> .
>>>
>>
>>
>
> .
>
Jason Wang Sept. 3, 2019, 6:06 a.m. UTC | #12
On 2019/9/3 下午1:42, Yang Yingliang wrote:
>
>
> On 2019/9/3 11:03, Jason Wang wrote:
>>
>> On 2019/9/3 上午9:45, Yang Yingliang wrote:
>>>
>>>
>>> On 2019/9/2 13:32, Jason Wang wrote:
>>>>
>>>> On 2019/8/23 下午5:36, Yang Yingliang wrote:
>>>>>
>>>>>
>>>>> On 2019/8/23 11:05, Jason Wang wrote:
>>>>>> ----- Original Message -----
>>>>>>>
>>>>>>> On 2019/8/22 14:07, Yang Yingliang wrote:
>>>>>>>>
>>>>>>>> On 2019/8/22 10:13, Jason Wang wrote:
>>>>>>>>> On 2019/8/20 上午10:28, Jason Wang wrote:
>>>>>>>>>> On 2019/8/20 上午9:25, David Miller wrote:
>>>>>>>>>>> From: Yang Yingliang <yangyingliang@huawei.com>
>>>>>>>>>>> Date: Mon, 19 Aug 2019 21:31:19 +0800
>>>>>>>>>>>
>>>>>>>>>>>> Call tun_attach() after register_netdevice() to make sure 
>>>>>>>>>>>> tfile->tun
>>>>>>>>>>>> is not published until the netdevice is registered. So the 
>>>>>>>>>>>> read/write
>>>>>>>>>>>> thread can not use the tun pointer that may freed by 
>>>>>>>>>>>> free_netdev().
>>>>>>>>>>>> (The tun and dev pointer are allocated by 
>>>>>>>>>>>> alloc_netdev_mqs(), they
>>>>>>>>>>>> can
>>>>>>>>>>>> be freed by netdev_freemem().)
>>>>>>>>>>> register_netdevice() must always be the last operation in 
>>>>>>>>>>> the order of
>>>>>>>>>>> network device setup.
>>>>>>>>>>>
>>>>>>>>>>> At the point register_netdevice() is called, the device is 
>>>>>>>>>>> visible
>>>>>>>>>>> globally
>>>>>>>>>>> and therefore all of it's software state must be fully 
>>>>>>>>>>> initialized and
>>>>>>>>>>> ready for us.
>>>>>>>>>>>
>>>>>>>>>>> You're going to have to find another solution to these 
>>>>>>>>>>> problems.
>>>>>>>>>>
>>>>>>>>>> The device is loosely coupled with sockets/queues. Each side is
>>>>>>>>>> allowed to be go away without caring the other side. So in this
>>>>>>>>>> case, there's a small window that network stack think the 
>>>>>>>>>> device has
>>>>>>>>>> one queue but actually not, the code can then safely drop them.
>>>>>>>>>> Maybe it's ok here with some comments?
>>>>>>>>>>
>>>>>>>>>> Or if not, we can try to hold the device before tun_attach 
>>>>>>>>>> and drop
>>>>>>>>>> it after register_netdevice().
>>>>>>>>>
>>>>>>>>> Hi Yang:
>>>>>>>>>
>>>>>>>>> I think maybe we can try to hold refcnt instead of playing 
>>>>>>>>> real num
>>>>>>>>> queues here. Do you want to post a V4?
>>>>>>>> I think the refcnt can prevent freeing the memory in this case.
>>>>>>>> When register_netdevice() failed, free_netdev() will be called 
>>>>>>>> directly,
>>>>>>>> dev->pcpu_refcnt and dev are freed without checking refcnt of dev.
>>>>>>> How about using patch-v1 that using a flag to check whether the 
>>>>>>> device
>>>>>>> registered successfully.
>>>>>>>
>>>>>> As I said, it lacks sufficient locks or barriers. To be clear, I 
>>>>>> meant
>>>>>> something like (compile-test only):
>>>>>>
>>>>>> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
>>>>>> index db16d7a13e00..e52678f9f049 100644
>>>>>> --- a/drivers/net/tun.c
>>>>>> +++ b/drivers/net/tun.c
>>>>>> @@ -2828,6 +2828,7 @@ static int tun_set_iff(struct net *net, 
>>>>>> struct file *file, struct ifreq *ifr)
>>>>>>                                (ifr->ifr_flags & TUN_FEATURES);
>>>>>> INIT_LIST_HEAD(&tun->disabled);
>>>>>> +               dev_hold(dev);
>>>>>>                  err = tun_attach(tun, file, false, 
>>>>>> ifr->ifr_flags & IFF_NAPI,
>>>>>>                                   ifr->ifr_flags & IFF_NAPI_FRAGS);
>>>>>>                  if (err < 0)
>>>>>> @@ -2836,6 +2837,7 @@ static int tun_set_iff(struct net *net, 
>>>>>> struct file *file, struct ifreq *ifr)
>>>>>>                  err = register_netdevice(tun->dev);
>>>>>>                  if (err < 0)
>>>>>>                          goto err_detach;
>>>>>> +               dev_put(dev);
>>>>>>          }
>>>>>>            netif_carrier_on(tun->dev);
>>>>>> @@ -2852,11 +2854,13 @@ static int tun_set_iff(struct net *net, 
>>>>>> struct file *file, struct ifreq *ifr)
>>>>>>          return 0;
>>>>>>     err_detach:
>>>>>> +       dev_put(dev);
>>>>>>          tun_detach_all(dev);
>>>>>>          /* register_netdevice() already called tun_free_netdev() */
>>>>>>          goto err_free_dev;
>>>>>>     err_free_flow:
>>>>>> +       dev_put(dev);
>>>>>>          tun_flow_uninit(tun);
>>>>>> security_tun_dev_free_security(tun->security);
>>>>>>   err_free_stat:
>>>>>>
>>>>>> What's your thought?
>>>>>
>>>>> The dev pointer are freed without checking the refcount in 
>>>>> free_netdev() called by err_free_dev
>>>>>
>>>>> path, so I don't understand how the refcount protects this pointer.
>>>>>
>>>>
>>>> The refcount are guaranteed to be zero there, isn't it?
>>> No, it's not.
>>>
>>> err_free_dev:
>>>         free_netdev(dev);
>>>
>>> void free_netdev(struct net_device *dev)
>>> {
>>> ...
>>>         /* pcpu_refcnt can be freed without checking refcount */
>>>         free_percpu(dev->pcpu_refcnt);
>>>         dev->pcpu_refcnt = NULL;
>>>
>>>         /*  Compatibility with error handling in drivers */
>>>         if (dev->reg_state == NETREG_UNINITIALIZED) {
>>>                 /* dev can be freed without checking refcount */
>>>                 netdev_freemem(dev);
>>>                 return;
>>>         }
>>> ...
>>> }
>>
>>
>> Right, but what I meant is in my patch, when code reaches 
>> free_netdev() the refcnt is zero. What did I miss?
> Yes, but it can't fix the UAF problem.


Well, it looks to me that the dev_put() in tun_put() won't release the 
device in this case.

Thanks


>>
>> Thanks
>>
>>
>>>
>>>>
>>>> Thanks
>>>>
>>>>
>>>>> Thanks,
>>>>> Yang
>>>>>
>>>>>>
>>>>>> Thanks
>>>>>>
>>>>>> .
>>>>>>
>>>>>
>>>>>
>>>>
>>>> .
>>>>
>>>
>>>
>>
>> .
>>
>
>
Yang Yingliang Sept. 3, 2019, 7:35 a.m. UTC | #13
On 2019/9/3 14:06, Jason Wang wrote:
>
> On 2019/9/3 下午1:42, Yang Yingliang wrote:
>>
>>
>> On 2019/9/3 11:03, Jason Wang wrote:
>>>
>>> On 2019/9/3 上午9:45, Yang Yingliang wrote:
>>>>
>>>>
>>>> On 2019/9/2 13:32, Jason Wang wrote:
>>>>>
>>>>> On 2019/8/23 下午5:36, Yang Yingliang wrote:
>>>>>>
>>>>>>
>>>>>> On 2019/8/23 11:05, Jason Wang wrote:
>>>>>>> ----- Original Message -----
>>>>>>>>
>>>>>>>> On 2019/8/22 14:07, Yang Yingliang wrote:
>>>>>>>>>
>>>>>>>>> On 2019/8/22 10:13, Jason Wang wrote:
>>>>>>>>>> On 2019/8/20 上午10:28, Jason Wang wrote:
>>>>>>>>>>> On 2019/8/20 上午9:25, David Miller wrote:
>>>>>>>>>>>> From: Yang Yingliang <yangyingliang@huawei.com>
>>>>>>>>>>>> Date: Mon, 19 Aug 2019 21:31:19 +0800
>>>>>>>>>>>>
>>>>>>>>>>>>> Call tun_attach() after register_netdevice() to make sure 
>>>>>>>>>>>>> tfile->tun
>>>>>>>>>>>>> is not published until the netdevice is registered. So the 
>>>>>>>>>>>>> read/write
>>>>>>>>>>>>> thread can not use the tun pointer that may freed by 
>>>>>>>>>>>>> free_netdev().
>>>>>>>>>>>>> (The tun and dev pointer are allocated by 
>>>>>>>>>>>>> alloc_netdev_mqs(), they
>>>>>>>>>>>>> can
>>>>>>>>>>>>> be freed by netdev_freemem().)
>>>>>>>>>>>> register_netdevice() must always be the last operation in 
>>>>>>>>>>>> the order of
>>>>>>>>>>>> network device setup.
>>>>>>>>>>>>
>>>>>>>>>>>> At the point register_netdevice() is called, the device is 
>>>>>>>>>>>> visible
>>>>>>>>>>>> globally
>>>>>>>>>>>> and therefore all of it's software state must be fully 
>>>>>>>>>>>> initialized and
>>>>>>>>>>>> ready for us.
>>>>>>>>>>>>
>>>>>>>>>>>> You're going to have to find another solution to these 
>>>>>>>>>>>> problems.
>>>>>>>>>>>
>>>>>>>>>>> The device is loosely coupled with sockets/queues. Each side is
>>>>>>>>>>> allowed to be go away without caring the other side. So in this
>>>>>>>>>>> case, there's a small window that network stack think the 
>>>>>>>>>>> device has
>>>>>>>>>>> one queue but actually not, the code can then safely drop them.
>>>>>>>>>>> Maybe it's ok here with some comments?
>>>>>>>>>>>
>>>>>>>>>>> Or if not, we can try to hold the device before tun_attach 
>>>>>>>>>>> and drop
>>>>>>>>>>> it after register_netdevice().
>>>>>>>>>>
>>>>>>>>>> Hi Yang:
>>>>>>>>>>
>>>>>>>>>> I think maybe we can try to hold refcnt instead of playing 
>>>>>>>>>> real num
>>>>>>>>>> queues here. Do you want to post a V4?
>>>>>>>>> I think the refcnt can prevent freeing the memory in this case.
>>>>>>>>> When register_netdevice() failed, free_netdev() will be called 
>>>>>>>>> directly,
>>>>>>>>> dev->pcpu_refcnt and dev are freed without checking refcnt of 
>>>>>>>>> dev.
>>>>>>>> How about using patch-v1 that using a flag to check whether the 
>>>>>>>> device
>>>>>>>> registered successfully.
>>>>>>>>
>>>>>>> As I said, it lacks sufficient locks or barriers. To be clear, I 
>>>>>>> meant
>>>>>>> something like (compile-test only):
>>>>>>>
>>>>>>> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
>>>>>>> index db16d7a13e00..e52678f9f049 100644
>>>>>>> --- a/drivers/net/tun.c
>>>>>>> +++ b/drivers/net/tun.c
>>>>>>> @@ -2828,6 +2828,7 @@ static int tun_set_iff(struct net *net, 
>>>>>>> struct file *file, struct ifreq *ifr)
>>>>>>>                                (ifr->ifr_flags & TUN_FEATURES);
>>>>>>> INIT_LIST_HEAD(&tun->disabled);
>>>>>>> +               dev_hold(dev);
>>>>>>>                  err = tun_attach(tun, file, false, 
>>>>>>> ifr->ifr_flags & IFF_NAPI,
>>>>>>>                                   ifr->ifr_flags & IFF_NAPI_FRAGS);
>>>>>>>                  if (err < 0)
>>>>>>> @@ -2836,6 +2837,7 @@ static int tun_set_iff(struct net *net, 
>>>>>>> struct file *file, struct ifreq *ifr)
>>>>>>>                  err = register_netdevice(tun->dev);
>>>>>>>                  if (err < 0)
>>>>>>>                          goto err_detach;
>>>>>>> +               dev_put(dev);
>>>>>>>          }
>>>>>>>            netif_carrier_on(tun->dev);
>>>>>>> @@ -2852,11 +2854,13 @@ static int tun_set_iff(struct net *net, 
>>>>>>> struct file *file, struct ifreq *ifr)
>>>>>>>          return 0;
>>>>>>>     err_detach:
>>>>>>> +       dev_put(dev);
>>>>>>>          tun_detach_all(dev);
>>>>>>>          /* register_netdevice() already called 
>>>>>>> tun_free_netdev() */
>>>>>>>          goto err_free_dev;
>>>>>>>     err_free_flow:
>>>>>>> +       dev_put(dev);
>>>>>>>          tun_flow_uninit(tun);
>>>>>>> security_tun_dev_free_security(tun->security);
>>>>>>>   err_free_stat:
>>>>>>>
>>>>>>> What's your thought?
>>>>>>
>>>>>> The dev pointer are freed without checking the refcount in 
>>>>>> free_netdev() called by err_free_dev
>>>>>>
>>>>>> path, so I don't understand how the refcount protects this pointer.
>>>>>>
>>>>>
>>>>> The refcount are guaranteed to be zero there, isn't it?
>>>> No, it's not.
>>>>
>>>> err_free_dev:
>>>>         free_netdev(dev);
>>>>
>>>> void free_netdev(struct net_device *dev)
>>>> {
>>>> ...
>>>>         /* pcpu_refcnt can be freed without checking refcount */
>>>>         free_percpu(dev->pcpu_refcnt);
>>>>         dev->pcpu_refcnt = NULL;
>>>>
>>>>         /*  Compatibility with error handling in drivers */
>>>>         if (dev->reg_state == NETREG_UNINITIALIZED) {
>>>>                 /* dev can be freed without checking refcount */
>>>>                 netdev_freemem(dev);
>>>>                 return;
>>>>         }
>>>> ...
>>>> }
>>>
>>>
>>> Right, but what I meant is in my patch, when code reaches 
>>> free_netdev() the refcnt is zero. What did I miss?
>> Yes, but it can't fix the UAF problem.
>
>
> Well, it looks to me that the dev_put() in tun_put() won't release the 
> device in this case.

The device is not released in tun_put().
This is how the UAF occurs:

         CPUA                                           CPUB
     tun_set_iff()
       alloc_netdev_mqs()
       tun_attach()
                                                     tun_chr_read_iter()
                                                       tun_get()
                                                       tun_do_read()
                                                         tun_ring_recv()
       register_netdevice() <-- inject error
       goto err_detach
       tun_detach_all() <-- set RCV_SHUTDOWN
       free_netdev() <-- called from
                        err_free_dev path
         netdev_freemem() <-- free the memory
                           without checking refcount
         (In this path, the refcount cannot prevent
          freeing the memory of dev, and the memory
          will be used by dev_put() called by
          tun_chr_read_iter() on CPUB.)
                                                        (Break from tun_ring_recv(), because RCV_SHUTDOWN is set)
                                                      tun_put()
                                                      dev_put() <-- use the memory freed by netdev_freemem()


>
> Thanks
>
Jason Wang Sept. 3, 2019, 10:50 a.m. UTC | #14
----- Original Message -----
> 
> 
> On 2019/9/3 14:06, Jason Wang wrote:
> >
> > On 2019/9/3 下午1:42, Yang Yingliang wrote:
> >>
> >>
> >> On 2019/9/3 11:03, Jason Wang wrote:
> >>>
> >>> On 2019/9/3 上午9:45, Yang Yingliang wrote:
> >>>>
> >>>>
> >>>> On 2019/9/2 13:32, Jason Wang wrote:
> >>>>>
> >>>>> On 2019/8/23 下午5:36, Yang Yingliang wrote:
> >>>>>>
> >>>>>>
> >>>>>> On 2019/8/23 11:05, Jason Wang wrote:
> >>>>>>> ----- Original Message -----
> >>>>>>>>
> >>>>>>>> On 2019/8/22 14:07, Yang Yingliang wrote:
> >>>>>>>>>
> >>>>>>>>> On 2019/8/22 10:13, Jason Wang wrote:
> >>>>>>>>>> On 2019/8/20 上午10:28, Jason Wang wrote:
> >>>>>>>>>>> On 2019/8/20 上午9:25, David Miller wrote:
> >>>>>>>>>>>> From: Yang Yingliang <yangyingliang@huawei.com>
> >>>>>>>>>>>> Date: Mon, 19 Aug 2019 21:31:19 +0800
> >>>>>>>>>>>>
> >>>>>>>>>>>>> Call tun_attach() after register_netdevice() to make sure
> >>>>>>>>>>>>> tfile->tun
> >>>>>>>>>>>>> is not published until the netdevice is registered. So the
> >>>>>>>>>>>>> read/write
> >>>>>>>>>>>>> thread can not use the tun pointer that may freed by
> >>>>>>>>>>>>> free_netdev().
> >>>>>>>>>>>>> (The tun and dev pointer are allocated by
> >>>>>>>>>>>>> alloc_netdev_mqs(), they
> >>>>>>>>>>>>> can
> >>>>>>>>>>>>> be freed by netdev_freemem().)
> >>>>>>>>>>>> register_netdevice() must always be the last operation in
> >>>>>>>>>>>> the order of
> >>>>>>>>>>>> network device setup.
> >>>>>>>>>>>>
> >>>>>>>>>>>> At the point register_netdevice() is called, the device is
> >>>>>>>>>>>> visible
> >>>>>>>>>>>> globally
> >>>>>>>>>>>> and therefore all of it's software state must be fully
> >>>>>>>>>>>> initialized and
> >>>>>>>>>>>> ready for us.
> >>>>>>>>>>>>
> >>>>>>>>>>>> You're going to have to find another solution to these
> >>>>>>>>>>>> problems.
> >>>>>>>>>>>
> >>>>>>>>>>> The device is loosely coupled with sockets/queues. Each side is
> >>>>>>>>>>> allowed to be go away without caring the other side. So in this
> >>>>>>>>>>> case, there's a small window that network stack think the
> >>>>>>>>>>> device has
> >>>>>>>>>>> one queue but actually not, the code can then safely drop them.
> >>>>>>>>>>> Maybe it's ok here with some comments?
> >>>>>>>>>>>
> >>>>>>>>>>> Or if not, we can try to hold the device before tun_attach
> >>>>>>>>>>> and drop
> >>>>>>>>>>> it after register_netdevice().
> >>>>>>>>>>
> >>>>>>>>>> Hi Yang:
> >>>>>>>>>>
> >>>>>>>>>> I think maybe we can try to hold refcnt instead of playing
> >>>>>>>>>> real num
> >>>>>>>>>> queues here. Do you want to post a V4?
> >>>>>>>>> I think the refcnt can prevent freeing the memory in this case.
> >>>>>>>>> When register_netdevice() failed, free_netdev() will be called
> >>>>>>>>> directly,
> >>>>>>>>> dev->pcpu_refcnt and dev are freed without checking refcnt of
> >>>>>>>>> dev.
> >>>>>>>> How about using patch-v1 that using a flag to check whether the
> >>>>>>>> device
> >>>>>>>> registered successfully.
> >>>>>>>>
> >>>>>>> As I said, it lacks sufficient locks or barriers. To be clear, I
> >>>>>>> meant
> >>>>>>> something like (compile-test only):
> >>>>>>>
> >>>>>>> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
> >>>>>>> index db16d7a13e00..e52678f9f049 100644
> >>>>>>> --- a/drivers/net/tun.c
> >>>>>>> +++ b/drivers/net/tun.c
> >>>>>>> @@ -2828,6 +2828,7 @@ static int tun_set_iff(struct net *net,
> >>>>>>> struct file *file, struct ifreq *ifr)
> >>>>>>>                                (ifr->ifr_flags & TUN_FEATURES);
> >>>>>>> INIT_LIST_HEAD(&tun->disabled);
> >>>>>>> +               dev_hold(dev);
> >>>>>>>                  err = tun_attach(tun, file, false,
> >>>>>>> ifr->ifr_flags & IFF_NAPI,
> >>>>>>>                                   ifr->ifr_flags & IFF_NAPI_FRAGS);
> >>>>>>>                  if (err < 0)
> >>>>>>> @@ -2836,6 +2837,7 @@ static int tun_set_iff(struct net *net,
> >>>>>>> struct file *file, struct ifreq *ifr)
> >>>>>>>                  err = register_netdevice(tun->dev);
> >>>>>>>                  if (err < 0)
> >>>>>>>                          goto err_detach;
> >>>>>>> +               dev_put(dev);
> >>>>>>>          }
> >>>>>>>            netif_carrier_on(tun->dev);
> >>>>>>> @@ -2852,11 +2854,13 @@ static int tun_set_iff(struct net *net,
> >>>>>>> struct file *file, struct ifreq *ifr)
> >>>>>>>          return 0;
> >>>>>>>     err_detach:
> >>>>>>> +       dev_put(dev);
> >>>>>>>          tun_detach_all(dev);
> >>>>>>>          /* register_netdevice() already called
> >>>>>>> tun_free_netdev() */
> >>>>>>>          goto err_free_dev;
> >>>>>>>     err_free_flow:
> >>>>>>> +       dev_put(dev);
> >>>>>>>          tun_flow_uninit(tun);
> >>>>>>> security_tun_dev_free_security(tun->security);
> >>>>>>>   err_free_stat:
> >>>>>>>
> >>>>>>> What's your thought?
> >>>>>>
> >>>>>> The dev pointer are freed without checking the refcount in
> >>>>>> free_netdev() called by err_free_dev
> >>>>>>
> >>>>>> path, so I don't understand how the refcount protects this pointer.
> >>>>>>
> >>>>>
> >>>>> The refcount are guaranteed to be zero there, isn't it?
> >>>> No, it's not.
> >>>>
> >>>> err_free_dev:
> >>>>         free_netdev(dev);
> >>>>
> >>>> void free_netdev(struct net_device *dev)
> >>>> {
> >>>> ...
> >>>>         /* pcpu_refcnt can be freed without checking refcount */
> >>>>         free_percpu(dev->pcpu_refcnt);
> >>>>         dev->pcpu_refcnt = NULL;
> >>>>
> >>>>         /*  Compatibility with error handling in drivers */
> >>>>         if (dev->reg_state == NETREG_UNINITIALIZED) {
> >>>>                 /* dev can be freed without checking refcount */
> >>>>                 netdev_freemem(dev);
> >>>>                 return;
> >>>>         }
> >>>> ...
> >>>> }
> >>>
> >>>
> >>> Right, but what I meant is in my patch, when code reaches
> >>> free_netdev() the refcnt is zero. What did I miss?
> >> Yes, but it can't fix the UAF problem.
> >
> >
> > Well, it looks to me that the dev_put() in tun_put() won't release the
> > device in this case.
> 
> The device is not released in tun_put().
> This is how the UAF occurs:
> 
>          CPUA                                           CPUB
>      tun_set_iff()
>        alloc_netdev_mqs()
>        tun_attach()
>                                                      tun_chr_read_iter()
>                                                        tun_get()
>                                                        tun_do_read()
>                                                          tun_ring_recv()
>        register_netdevice() <-- inject error
>        goto err_detach
>        tun_detach_all() <-- set RCV_SHUTDOWN
>        free_netdev() <-- called from
>                         err_free_dev path
>          netdev_freemem() <-- free the memory
>                            without check refcount
>          (In this path, the refcount cannot prevent
>           freeing the memory of dev, and the memory
>           will be used by dev_put() called by
>           tun_chr_read_iter() on CPUB.)
>                                                         (Break from
>                                                         tun_ring_recv(),
>                                                         because RCV_SHUTDOWN
>                                                         is set)
>                                                       tun_put()
>                                                       dev_put() <-- use the
>                                                       memory freed by
>                                                       netdev_freemem()
> 
>

My bad, thanks for your patience. Since all the evil comes from
tfile->tun, how about delaying the publishing of tfile->tun until
registration has succeeded, to make sure dev_put() and dev_hold() work?
(Compile-tested only.)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index db16d7a13e00..aab0be40d443 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -787,7 +787,8 @@ static void tun_detach_all(struct net_device *dev)
 }
 
 static int tun_attach(struct tun_struct *tun, struct file *file,
-		      bool skip_filter, bool napi, bool napi_frags)
+		      bool skip_filter, bool napi, bool napi_frags,
+		      bool publish_tun)
 {
 	struct tun_file *tfile = file->private_data;
 	struct net_device *dev = tun->dev;
@@ -870,7 +871,8 @@ static int tun_attach(struct tun_struct *tun, struct file *file,
 	 * initialized tfile; otherwise we risk using half-initialized
 	 * object.
 	 */
-	rcu_assign_pointer(tfile->tun, tun);
+	if (publish_tun)
+		rcu_assign_pointer(tfile->tun, tun);
 	rcu_assign_pointer(tun->tfiles[tun->numqueues], tfile);
 	tun->numqueues++;
 	tun_set_real_num_queues(tun);
@@ -2730,7 +2732,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
 
 		err = tun_attach(tun, file, ifr->ifr_flags & IFF_NOFILTER,
 				 ifr->ifr_flags & IFF_NAPI,
-				 ifr->ifr_flags & IFF_NAPI_FRAGS);
+				 ifr->ifr_flags & IFF_NAPI_FRAGS, true);
 		if (err < 0)
 			return err;
 
@@ -2829,13 +2831,17 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
 
 		INIT_LIST_HEAD(&tun->disabled);
 		err = tun_attach(tun, file, false, ifr->ifr_flags & IFF_NAPI,
-				 ifr->ifr_flags & IFF_NAPI_FRAGS);
+				 ifr->ifr_flags & IFF_NAPI_FRAGS, false);
 		if (err < 0)
 			goto err_free_flow;
 
 		err = register_netdevice(tun->dev);
 		if (err < 0)
 			goto err_detach;
+		/* free_netdev() won't check refcnt, to aovid race
+		 * with dev_put() we need publish tun after registration.
+		 */
+		rcu_assign_pointer(tfile->tun, tun);
 	}
 
 	netif_carrier_on(tun->dev);
@@ -2978,7 +2984,7 @@ static int tun_set_queue(struct file *file, struct ifreq *ifr)
 		if (ret < 0)
 			goto unlock;
 		ret = tun_attach(tun, file, false, tun->flags & IFF_NAPI,
-				 tun->flags & IFF_NAPI_FRAGS);
+				 tun->flags & IFF_NAPI_FRAGS, true);
 	} else if (ifr->ifr_flags & IFF_DETACH_QUEUE) {
 		tun = rtnl_dereference(tfile->tun);
 		if (!tun || !(tun->flags & IFF_MULTI_QUEUE) || tfile->detached)
Yang Yingliang Sept. 5, 2019, 2:03 a.m. UTC | #15
On 2019/9/3 18:50, Jason Wang wrote:
>
> ----- Original Message -----
>>
>> On 2019/9/3 14:06, Jason Wang wrote:
>>> On 2019/9/3 下午1:42, Yang Yingliang wrote:
>>>>
>>>> On 2019/9/3 11:03, Jason Wang wrote:
>>>>> On 2019/9/3 上午9:45, Yang Yingliang wrote:
>>>>>>
>>>>>> On 2019/9/2 13:32, Jason Wang wrote:
>>>>>>> On 2019/8/23 下午5:36, Yang Yingliang wrote:
>>>>>>>>
>>>>>>>> On 2019/8/23 11:05, Jason Wang wrote:
>>>>>>>>> ----- Original Message -----
>>>>>>>>>> On 2019/8/22 14:07, Yang Yingliang wrote:
>>>>>>>>>>> On 2019/8/22 10:13, Jason Wang wrote:
>>>>>>>>>>>> On 2019/8/20 上午10:28, Jason Wang wrote:
>>>>>>>>>>>>> On 2019/8/20 上午9:25, David Miller wrote:
>>>>>>>>>>>>>> From: Yang Yingliang <yangyingliang@huawei.com>
>>>>>>>>>>>>>> Date: Mon, 19 Aug 2019 21:31:19 +0800
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> Call tun_attach() after register_netdevice() to make sure
>>>>>>>>>>>>>>> tfile->tun
>>>>>>>>>>>>>>> is not published until the netdevice is registered. So the
>>>>>>>>>>>>>>> read/write
>>>>>>>>>>>>>>> thread can not use the tun pointer that may freed by
>>>>>>>>>>>>>>> free_netdev().
>>>>>>>>>>>>>>> (The tun and dev pointer are allocated by
>>>>>>>>>>>>>>> alloc_netdev_mqs(), they
>>>>>>>>>>>>>>> can
>>>>>>>>>>>>>>> be freed by netdev_freemem().)
>>>>>>>>>>>>>> register_netdevice() must always be the last operation in
>>>>>>>>>>>>>> the order of
>>>>>>>>>>>>>> network device setup.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> At the point register_netdevice() is called, the device is
>>>>>>>>>>>>>> visible
>>>>>>>>>>>>>> globally
>>>>>>>>>>>>>> and therefore all of it's software state must be fully
>>>>>>>>>>>>>> initialized and
>>>>>>>>>>>>>> ready for us.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> You're going to have to find another solution to these
>>>>>>>>>>>>>> problems.
>>>>>>>>>>>>> The device is loosely coupled with sockets/queues. Each side is
>>>>>>>>>>>>> allowed to be go away without caring the other side. So in this
>>>>>>>>>>>>> case, there's a small window that network stack think the
>>>>>>>>>>>>> device has
>>>>>>>>>>>>> one queue but actually not, the code can then safely drop them.
>>>>>>>>>>>>> Maybe it's ok here with some comments?
>>>>>>>>>>>>>
>>>>>>>>>>>>> Or if not, we can try to hold the device before tun_attach
>>>>>>>>>>>>> and drop
>>>>>>>>>>>>> it after register_netdevice().
>>>>>>>>>>>> Hi Yang:
>>>>>>>>>>>>
>>>>>>>>>>>> I think maybe we can try to hold refcnt instead of playing
>>>>>>>>>>>> real num
>>>>>>>>>>>> queues here. Do you want to post a V4?
>>>>>>>>>>> I think the refcnt can prevent freeing the memory in this case.
>>>>>>>>>>> When register_netdevice() failed, free_netdev() will be called
>>>>>>>>>>> directly,
>>>>>>>>>>> dev->pcpu_refcnt and dev are freed without checking refcnt of
>>>>>>>>>>> dev.
>>>>>>>>>> How about using patch-v1 that using a flag to check whether the
>>>>>>>>>> device
>>>>>>>>>> registered successfully.
>>>>>>>>>>
>>>>>>>>> As I said, it lacks sufficient locks or barriers. To be clear, I
>>>>>>>>> meant
>>>>>>>>> something like (compile-test only):
>>>>>>>>>
>>>>>>>>> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
>>>>>>>>> index db16d7a13e00..e52678f9f049 100644
>>>>>>>>> --- a/drivers/net/tun.c
>>>>>>>>> +++ b/drivers/net/tun.c
>>>>>>>>> @@ -2828,6 +2828,7 @@ static int tun_set_iff(struct net *net,
>>>>>>>>> struct file *file, struct ifreq *ifr)
>>>>>>>>>                                 (ifr->ifr_flags & TUN_FEATURES);
>>>>>>>>> INIT_LIST_HEAD(&tun->disabled);
>>>>>>>>> +               dev_hold(dev);
>>>>>>>>>                   err = tun_attach(tun, file, false,
>>>>>>>>> ifr->ifr_flags & IFF_NAPI,
>>>>>>>>>                                    ifr->ifr_flags & IFF_NAPI_FRAGS);
>>>>>>>>>                   if (err < 0)
>>>>>>>>> @@ -2836,6 +2837,7 @@ static int tun_set_iff(struct net *net,
>>>>>>>>> struct file *file, struct ifreq *ifr)
>>>>>>>>>                   err = register_netdevice(tun->dev);
>>>>>>>>>                   if (err < 0)
>>>>>>>>>                           goto err_detach;
>>>>>>>>> +               dev_put(dev);
>>>>>>>>>           }
>>>>>>>>>             netif_carrier_on(tun->dev);
>>>>>>>>> @@ -2852,11 +2854,13 @@ static int tun_set_iff(struct net *net,
>>>>>>>>> struct file *file, struct ifreq *ifr)
>>>>>>>>>           return 0;
>>>>>>>>>      err_detach:
>>>>>>>>> +       dev_put(dev);
>>>>>>>>>           tun_detach_all(dev);
>>>>>>>>>           /* register_netdevice() already called
>>>>>>>>> tun_free_netdev() */
>>>>>>>>>           goto err_free_dev;
>>>>>>>>>      err_free_flow:
>>>>>>>>> +       dev_put(dev);
>>>>>>>>>           tun_flow_uninit(tun);
>>>>>>>>> security_tun_dev_free_security(tun->security);
>>>>>>>>>    err_free_stat:
>>>>>>>>>
>>>>>>>>> What's your thought?
>>>>>>>> The dev pointer are freed without checking the refcount in
>>>>>>>> free_netdev() called by err_free_dev
>>>>>>>>
>>>>>>>> path, so I don't understand how the refcount protects this pointer.
>>>>>>>>
>>>>>>> The refcount are guaranteed to be zero there, isn't it?
>>>>>> No, it's not.
>>>>>>
>>>>>> err_free_dev:
>>>>>>          free_netdev(dev);
>>>>>>
>>>>>> void free_netdev(struct net_device *dev)
>>>>>> {
>>>>>> ...
>>>>>>          /* pcpu_refcnt can be freed without checking refcount */
>>>>>>          free_percpu(dev->pcpu_refcnt);
>>>>>>          dev->pcpu_refcnt = NULL;
>>>>>>
>>>>>>          /*  Compatibility with error handling in drivers */
>>>>>>          if (dev->reg_state == NETREG_UNINITIALIZED) {
>>>>>>                  /* dev can be freed without checking refcount */
>>>>>>                  netdev_freemem(dev);
>>>>>>                  return;
>>>>>>          }
>>>>>> ...
>>>>>> }
>>>>>
>>>>> Right, but what I meant is in my patch, when code reaches
>>>>> free_netdev() the refcnt is zero. What did I miss?
>>>> Yes, but it can't fix the UAF problem.
>>>
>>> Well, it looks to me that the dev_put() in tun_put() won't release the
>>> device in this case.
>> The device is not released in tun_put().
>> This is how the UAF occurs:
>>
>>           CPUA                                           CPUB
>>       tun_set_iff()
>>         alloc_netdev_mqs()
>>         tun_attach()
>>                                                       tun_chr_read_iter()
>>                                                         tun_get()
>>                                                         tun_do_read()
>>                                                           tun_ring_recv()
>>         register_netdevice() <-- inject error
>>         goto err_detach
>>         tun_detach_all() <-- set RCV_SHUTDOWN
>>         free_netdev() <-- called from
>>                          err_free_dev path
>>           netdev_freemem() <-- free the memory
>>                             without check refcount
>>           (In this path, the refcount cannot prevent
>>            freeing the memory of dev, and the memory
>>            will be used by dev_put() called by
>>            tun_chr_read_iter() on CPUB.)
>>                                                          (Break from
>>                                                          tun_ring_recv(),
>>                                                          because RCV_SHUTDOWN
>>                                                          is set)
>>                                                        tun_put()
>>                                                        dev_put() <-- use the
>>                                                        memory freed by
>>                                                        netdev_freemem()
>>
>>
> My bad, thanks for the patience. Since all evil come from the
> tfile->tun, how about delay the publishing of tfile->tun until the
> success of registration to make sure dev_put() and dev_hold() work.
> (Compile test only)
>
> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
> index db16d7a13e00..aab0be40d443 100644
> --- a/drivers/net/tun.c
> +++ b/drivers/net/tun.c
> @@ -787,7 +787,8 @@ static void tun_detach_all(struct net_device *dev)
>   }
>   
>   static int tun_attach(struct tun_struct *tun, struct file *file,
> -		      bool skip_filter, bool napi, bool napi_frags)
> +		      bool skip_filter, bool napi, bool napi_frags,
> +		      bool publish_tun)
>   {
>   	struct tun_file *tfile = file->private_data;
>   	struct net_device *dev = tun->dev;
> @@ -870,7 +871,8 @@ static int tun_attach(struct tun_struct *tun, struct file *file,
>   	 * initialized tfile; otherwise we risk using half-initialized
>   	 * object.
>   	 */
> -	rcu_assign_pointer(tfile->tun, tun);
> +	if (publish_tun)
> +		rcu_assign_pointer(tfile->tun, tun);
>   	rcu_assign_pointer(tun->tfiles[tun->numqueues], tfile);
>   	tun->numqueues++;
>   	tun_set_real_num_queues(tun);
> @@ -2730,7 +2732,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
>   
>   		err = tun_attach(tun, file, ifr->ifr_flags & IFF_NOFILTER,
>   				 ifr->ifr_flags & IFF_NAPI,
> -				 ifr->ifr_flags & IFF_NAPI_FRAGS);
> +				 ifr->ifr_flags & IFF_NAPI_FRAGS, true);
>   		if (err < 0)
>   			return err;
>   
> @@ -2829,13 +2831,17 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
>   
>   		INIT_LIST_HEAD(&tun->disabled);
>   		err = tun_attach(tun, file, false, ifr->ifr_flags & IFF_NAPI,
> -				 ifr->ifr_flags & IFF_NAPI_FRAGS);
> +				 ifr->ifr_flags & IFF_NAPI_FRAGS, false);
>   		if (err < 0)
>   			goto err_free_flow;
>   
>   		err = register_netdevice(tun->dev);
>   		if (err < 0)
>   			goto err_detach;
> +		/* free_netdev() won't check refcnt, to aovid race
> +		 * with dev_put() we need publish tun after registration.
> +		 */
> +		rcu_assign_pointer(tfile->tun, tun);
>   	}
>   
>   	netif_carrier_on(tun->dev);
> @@ -2978,7 +2984,7 @@ static int tun_set_queue(struct file *file, struct ifreq *ifr)
>   		if (ret < 0)
>   			goto unlock;
>   		ret = tun_attach(tun, file, false, tun->flags & IFF_NAPI,
> -				 tun->flags & IFF_NAPI_FRAGS);
> +				 tun->flags & IFF_NAPI_FRAGS, true);
>   	} else if (ifr->ifr_flags & IFF_DETACH_QUEUE) {
>   		tun = rtnl_dereference(tfile->tun);
>   		if (!tun || !(tun->flags & IFF_MULTI_QUEUE) || tfile->detached)
I tested this patch, it can fix this UAF.

But as Eric replied in my patch v1, tun_get() will return NULL as long 
as tun_set_iff() (TUNSETIFF ioctl())
has not yet been called. This could break some applications, since 
tun_get() is used from poll()
and other syscalls.

I think it should return '-EAGAIN' instead of '-EBADFD' in this case. I
made some changes to patch v1;
if that is OK, I will send a v4.

  drivers/net/tun.c | 34 ++++++++++++++++++++++++++++++----
  1 file changed, 30 insertions(+), 4 deletions(-)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index db16d7a13e00..0abc654010e3 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -115,6 +115,7 @@ do {                                \
  /* High bits in flags field are unused. */
  #define TUN_VNET_LE     0x80000000
  #define TUN_VNET_BE     0x40000000
+#define TUN_DEV_REGISTERED    0x20000000

  #define TUN_FEATURES (IFF_NO_PI | IFF_ONE_QUEUE | IFF_VNET_HDR | \
                IFF_MULTI_QUEUE | IFF_NAPI | IFF_NAPI_FRAGS)
@@ -719,8 +720,10 @@ static void __tun_detach(struct tun_file *tfile, 
bool clean)
              netif_carrier_off(tun->dev);

              if (!(tun->flags & IFF_PERSIST) &&
-                tun->dev->reg_state == NETREG_REGISTERED)
+                tun->dev->reg_state == NETREG_REGISTERED) {
+                tun->flags &= ~TUN_DEV_REGISTERED;
                  unregister_netdevice(tun->dev);
+            }
          }
          if (tun)
              xdp_rxq_info_unreg(&tfile->xdp_rxq);
@@ -884,8 +887,12 @@ static struct tun_struct *tun_get(struct tun_file 
*tfile)

      rcu_read_lock();
      tun = rcu_dereference(tfile->tun);
-    if (tun)
-        dev_hold(tun->dev);
+    if (tun) {
+        if (tun->flags & TUN_DEV_REGISTERED)
+            dev_hold(tun->dev);
+        else
+            tun = ERR_PTR(-EAGAIN);
+    }
      rcu_read_unlock();

      return tun;
@@ -1428,7 +1435,7 @@ static __poll_t tun_chr_poll(struct file *file, 
poll_table *wait)
      struct sock *sk;
      __poll_t mask = 0;

-    if (!tun)
+    if (IS_ERR_OR_NULL(tun))
          return EPOLLERR;

      sk = tfile->socket.sk;
@@ -2017,6 +2024,9 @@ static ssize_t tun_chr_write_iter(struct kiocb 
*iocb, struct iov_iter *from)
      if (!tun)
          return -EBADFD;

+    if (IS_ERR(tun))
+        return PTR_ERR(tun);
+
      result = tun_get_user(tun, tfile, NULL, from,
                    file->f_flags & O_NONBLOCK, false);

@@ -2242,6 +2252,10 @@ static ssize_t tun_chr_read_iter(struct kiocb 
*iocb, struct iov_iter *to)

      if (!tun)
          return -EBADFD;
+
+    if (IS_ERR(tun))
+        return PTR_ERR(tun);
+
      ret = tun_do_read(tun, tfile, to, file->f_flags & O_NONBLOCK, NULL);
      ret = min_t(ssize_t, ret, len);
      if (ret > 0)
@@ -2525,6 +2539,9 @@ static int tun_sendmsg(struct socket *sock, struct 
msghdr *m, size_t total_len)
      if (!tun)
          return -EBADFD;

+    if (IS_ERR(tun))
+        return PTR_ERR(tun);
+
      if (ctl && (ctl->type == TUN_MSG_PTR)) {
          struct tun_page tpage;
          int n = ctl->num;
@@ -2573,6 +2590,11 @@ static int tun_recvmsg(struct socket *sock, 
struct msghdr *m, size_t total_len,
          goto out_free;
      }

+    if (IS_ERR(tun)) {
+        ret = PTR_ERR(tun);
+        goto out_free;
+    }
+
      if (flags & ~(MSG_DONTWAIT|MSG_TRUNC|MSG_ERRQUEUE)) {
          ret = -EINVAL;
          goto out_put_tun;
@@ -2622,6 +2644,9 @@ static int tun_peek_len(struct socket *sock)
      if (!tun)
          return 0;

+    if (IS_ERR(tun))
+        return PTR_ERR(tun);
+
      ret = PTR_RING_PEEK_CALL(&tfile->tx_ring, tun_ptr_peek_len);
      tun_put(tun);

@@ -2836,6 +2861,7 @@ static int tun_set_iff(struct net *net, struct 
file *file, struct ifreq *ifr)
          err = register_netdevice(tun->dev);
          if (err < 0)
              goto err_detach;
+        tun->flags |= TUN_DEV_REGISTERED;
      }

      netif_carrier_on(tun->dev);
Jason Wang Sept. 5, 2019, 3:10 a.m. UTC | #16
On 2019/9/5 上午10:03, Yang Yingliang wrote:
>
>
> On 2019/9/3 18:50, Jason Wang wrote:
>>
>> ----- Original Message -----
>>>
>>> On 2019/9/3 14:06, Jason Wang wrote:
>>>> On 2019/9/3 下午1:42, Yang Yingliang wrote:
>>>>>
>>>>> On 2019/9/3 11:03, Jason Wang wrote:
>>>>>> On 2019/9/3 上午9:45, Yang Yingliang wrote:
>>>>>>>
>>>>>>> On 2019/9/2 13:32, Jason Wang wrote:
>>>>>>>> On 2019/8/23 下午5:36, Yang Yingliang wrote:
>>>>>>>>>
>>>>>>>>> On 2019/8/23 11:05, Jason Wang wrote:
>>>>>>>>>> ----- Original Message -----
>>>>>>>>>>> On 2019/8/22 14:07, Yang Yingliang wrote:
>>>>>>>>>>>> On 2019/8/22 10:13, Jason Wang wrote:
>>>>>>>>>>>>> On 2019/8/20 上午10:28, Jason Wang wrote:
>>>>>>>>>>>>>> On 2019/8/20 上午9:25, David Miller wrote:
>>>>>>>>>>>>>>> From: Yang Yingliang <yangyingliang@huawei.com>
>>>>>>>>>>>>>>> Date: Mon, 19 Aug 2019 21:31:19 +0800
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> Call tun_attach() after register_netdevice() to make sure
>>>>>>>>>>>>>>>> tfile->tun
>>>>>>>>>>>>>>>> is not published until the netdevice is registered. So the
>>>>>>>>>>>>>>>> read/write
>>>>>>>>>>>>>>>> thread can not use the tun pointer that may freed by
>>>>>>>>>>>>>>>> free_netdev().
>>>>>>>>>>>>>>>> (The tun and dev pointer are allocated by
>>>>>>>>>>>>>>>> alloc_netdev_mqs(), they
>>>>>>>>>>>>>>>> can
>>>>>>>>>>>>>>>> be freed by netdev_freemem().)
>>>>>>>>>>>>>>> register_netdevice() must always be the last operation in
>>>>>>>>>>>>>>> the order of
>>>>>>>>>>>>>>> network device setup.
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> At the point register_netdevice() is called, the device is
>>>>>>>>>>>>>>> visible
>>>>>>>>>>>>>>> globally
>>>>>>>>>>>>>>> and therefore all of it's software state must be fully
>>>>>>>>>>>>>>> initialized and
>>>>>>>>>>>>>>> ready for us.
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> You're going to have to find another solution to these
>>>>>>>>>>>>>>> problems.
>>>>>>>>>>>>>> The device is loosely coupled with sockets/queues. Each
>>>>>>>>>>>>>> side is
>>>>>>>>>>>>>> allowed to be go away without caring the other side. So
>>>>>>>>>>>>>> in this
>>>>>>>>>>>>>> case, there's a small window that network stack think the
>>>>>>>>>>>>>> device has
>>>>>>>>>>>>>> one queue but actually not, the code can then safely drop
>>>>>>>>>>>>>> them.
>>>>>>>>>>>>>> Maybe it's ok here with some comments?
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> Or if not, we can try to hold the device before tun_attach
>>>>>>>>>>>>>> and drop
>>>>>>>>>>>>>> it after register_netdevice().
>>>>>>>>>>>>> Hi Yang:
>>>>>>>>>>>>>
>>>>>>>>>>>>> I think maybe we can try to hold refcnt instead of playing
>>>>>>>>>>>>> real num
>>>>>>>>>>>>> queues here. Do you want to post a V4?
>>>>>>>>>>>> I think the refcnt can prevent freeing the memory in this
>>>>>>>>>>>> case.
>>>>>>>>>>>> When register_netdevice() failed, free_netdev() will be called
>>>>>>>>>>>> directly,
>>>>>>>>>>>> dev->pcpu_refcnt and dev are freed without checking refcnt of
>>>>>>>>>>>> dev.
>>>>>>>>>>> How about using patch-v1 that using a flag to check whether the
>>>>>>>>>>> device
>>>>>>>>>>> registered successfully.
>>>>>>>>>>>
>>>>>>>>>> As I said, it lacks sufficient locks or barriers. To be clear, I
>>>>>>>>>> meant
>>>>>>>>>> something like (compile-test only):
>>>>>>>>>>
>>>>>>>>>> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
>>>>>>>>>> index db16d7a13e00..e52678f9f049 100644
>>>>>>>>>> --- a/drivers/net/tun.c
>>>>>>>>>> +++ b/drivers/net/tun.c
>>>>>>>>>> @@ -2828,6 +2828,7 @@ static int tun_set_iff(struct net *net,
>>>>>>>>>> struct file *file, struct ifreq *ifr)
>>>>>>>>>>                                 (ifr->ifr_flags & TUN_FEATURES);
>>>>>>>>>> INIT_LIST_HEAD(&tun->disabled);
>>>>>>>>>> +               dev_hold(dev);
>>>>>>>>>>                   err = tun_attach(tun, file, false,
>>>>>>>>>> ifr->ifr_flags & IFF_NAPI,
>>>>>>>>>>                                    ifr->ifr_flags &
>>>>>>>>>> IFF_NAPI_FRAGS);
>>>>>>>>>>                   if (err < 0)
>>>>>>>>>> @@ -2836,6 +2837,7 @@ static int tun_set_iff(struct net *net,
>>>>>>>>>> struct file *file, struct ifreq *ifr)
>>>>>>>>>>                   err = register_netdevice(tun->dev);
>>>>>>>>>>                   if (err < 0)
>>>>>>>>>>                           goto err_detach;
>>>>>>>>>> +               dev_put(dev);
>>>>>>>>>>           }
>>>>>>>>>>             netif_carrier_on(tun->dev);
>>>>>>>>>> @@ -2852,11 +2854,13 @@ static int tun_set_iff(struct net *net,
>>>>>>>>>> struct file *file, struct ifreq *ifr)
>>>>>>>>>>           return 0;
>>>>>>>>>>      err_detach:
>>>>>>>>>> +       dev_put(dev);
>>>>>>>>>>           tun_detach_all(dev);
>>>>>>>>>>           /* register_netdevice() already called
>>>>>>>>>> tun_free_netdev() */
>>>>>>>>>>           goto err_free_dev;
>>>>>>>>>>      err_free_flow:
>>>>>>>>>> +       dev_put(dev);
>>>>>>>>>>           tun_flow_uninit(tun);
>>>>>>>>>> security_tun_dev_free_security(tun->security);
>>>>>>>>>>    err_free_stat:
>>>>>>>>>>
>>>>>>>>>> What's your thought?
>>>>>>>>> The dev pointer are freed without checking the refcount in
>>>>>>>>> free_netdev() called by err_free_dev
>>>>>>>>>
>>>>>>>>> path, so I don't understand how the refcount protects this
>>>>>>>>> pointer.
>>>>>>>>>
>>>>>>>> The refcount are guaranteed to be zero there, isn't it?
>>>>>>> No, it's not.
>>>>>>>
>>>>>>> err_free_dev:
>>>>>>>          free_netdev(dev);
>>>>>>>
>>>>>>> void free_netdev(struct net_device *dev)
>>>>>>> {
>>>>>>> ...
>>>>>>>          /* pcpu_refcnt can be freed without checking refcount */
>>>>>>>          free_percpu(dev->pcpu_refcnt);
>>>>>>>          dev->pcpu_refcnt = NULL;
>>>>>>>
>>>>>>>          /*  Compatibility with error handling in drivers */
>>>>>>>          if (dev->reg_state == NETREG_UNINITIALIZED) {
>>>>>>>                  /* dev can be freed without checking refcount */
>>>>>>>                  netdev_freemem(dev);
>>>>>>>                  return;
>>>>>>>          }
>>>>>>> ...
>>>>>>> }
>>>>>>
>>>>>> Right, but what I meant is in my patch, when code reaches
>>>>>> free_netdev() the refcnt is zero. What did I miss?
>>>>> Yes, but it can't fix the UAF problem.
>>>>
>>>> Well, it looks to me that the dev_put() in tun_put() won't release the
>>>> device in this case.
>>> The device is not released in tun_put().
>>> This is how the UAF occurs:
>>>
>>>           CPUA                                           CPUB
>>>       tun_set_iff()
>>>         alloc_netdev_mqs()
>>>         tun_attach()
>>>                                                      
>>> tun_chr_read_iter()
>>>                                                         tun_get()
>>>                                                         tun_do_read()
>>>                                                          
>>> tun_ring_recv()
>>>         register_netdevice() <-- inject error
>>>         goto err_detach
>>>         tun_detach_all() <-- set RCV_SHUTDOWN
>>>         free_netdev() <-- called from
>>>                          err_free_dev path
>>>           netdev_freemem() <-- free the memory
>>>                             without check refcount
>>>           (In this path, the refcount cannot prevent
>>>            freeing the memory of dev, and the memory
>>>            will be used by dev_put() called by
>>>            tun_chr_read_iter() on CPUB.)
>>>                                                          (Break from
>>>                                                         
>>> tun_ring_recv(),
>>>                                                          because
>>> RCV_SHUTDOWN
>>>                                                          is set)
>>>                                                        tun_put()
>>>                                                        dev_put() <--
>>> use the
>>>                                                        memory freed by
>>>                                                        netdev_freemem()
>>>
>>>
>> My bad, thanks for the patience. Since all evil come from the
>> tfile->tun, how about delay the publishing of tfile->tun until the
>> success of registration to make sure dev_put() and dev_hold() work.
>> (Compile test only)
>>
>> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
>> index db16d7a13e00..aab0be40d443 100644
>> --- a/drivers/net/tun.c
>> +++ b/drivers/net/tun.c
>> @@ -787,7 +787,8 @@ static void tun_detach_all(struct net_device *dev)
>>   }
>>     static int tun_attach(struct tun_struct *tun, struct file *file,
>> -              bool skip_filter, bool napi, bool napi_frags)
>> +              bool skip_filter, bool napi, bool napi_frags,
>> +              bool publish_tun)
>>   {
>>       struct tun_file *tfile = file->private_data;
>>       struct net_device *dev = tun->dev;
>> @@ -870,7 +871,8 @@ static int tun_attach(struct tun_struct *tun,
>> struct file *file,
>>        * initialized tfile; otherwise we risk using half-initialized
>>        * object.
>>        */
>> -    rcu_assign_pointer(tfile->tun, tun);
>> +    if (publish_tun)
>> +        rcu_assign_pointer(tfile->tun, tun);
>>       rcu_assign_pointer(tun->tfiles[tun->numqueues], tfile);
>>       tun->numqueues++;
>>       tun_set_real_num_queues(tun);
>> @@ -2730,7 +2732,7 @@ static int tun_set_iff(struct net *net, struct
>> file *file, struct ifreq *ifr)
>>             err = tun_attach(tun, file, ifr->ifr_flags & IFF_NOFILTER,
>>                    ifr->ifr_flags & IFF_NAPI,
>> -                 ifr->ifr_flags & IFF_NAPI_FRAGS);
>> +                 ifr->ifr_flags & IFF_NAPI_FRAGS, true);
>>           if (err < 0)
>>               return err;
>>   @@ -2829,13 +2831,17 @@ static int tun_set_iff(struct net *net,
>> struct file *file, struct ifreq *ifr)
>>             INIT_LIST_HEAD(&tun->disabled);
>>           err = tun_attach(tun, file, false, ifr->ifr_flags & IFF_NAPI,
>> -                 ifr->ifr_flags & IFF_NAPI_FRAGS);
>> +                 ifr->ifr_flags & IFF_NAPI_FRAGS, false);
>>           if (err < 0)
>>               goto err_free_flow;
>>             err = register_netdevice(tun->dev);
>>           if (err < 0)
>>               goto err_detach;
>> +        /* free_netdev() won't check refcnt, to aovid race
>> +         * with dev_put() we need publish tun after registration.
>> +         */
>> +        rcu_assign_pointer(tfile->tun, tun);
>>       }
>>         netif_carrier_on(tun->dev);
>> @@ -2978,7 +2984,7 @@ static int tun_set_queue(struct file *file,
>> struct ifreq *ifr)
>>           if (ret < 0)
>>               goto unlock;
>>           ret = tun_attach(tun, file, false, tun->flags & IFF_NAPI,
>> -                 tun->flags & IFF_NAPI_FRAGS);
>> +                 tun->flags & IFF_NAPI_FRAGS, true);
>>       } else if (ifr->ifr_flags & IFF_DETACH_QUEUE) {
>>           tun = rtnl_dereference(tfile->tun);
>>           if (!tun || !(tun->flags & IFF_MULTI_QUEUE) ||
>> tfile->detached)
> I tested this patch, it can fix this UAF.
>
> But as Eric replied in my patch v1, tun_get() will return NULL as long
> as tun_set_iff() (TUNSETIFF ioctl())
> has not yet been called. 


Isn't this the expected behavior? Without TUNSETIFF, the netdevice is
not attached, so tun_get() should return NULL here.


> This could break some applications, since tun_get() is used from poll()
> and other syscalls.
>
> I think it should return '-EAGIAN' instead of '-EBADFD' in this way. I
> did some change in patch v1,
> if it's OK, I will send a v4.
>
>  drivers/net/tun.c | 34 ++++++++++++++++++++++++++++++----
>  1 file changed, 30 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
> index db16d7a13e00..0abc654010e3 100644
> --- a/drivers/net/tun.c
> +++ b/drivers/net/tun.c
> @@ -115,6 +115,7 @@ do {                                \
>  /* High bits in flags field are unused. */
>  #define TUN_VNET_LE     0x80000000
>  #define TUN_VNET_BE     0x40000000
> +#define TUN_DEV_REGISTERED    0x20000000
>
>  #define TUN_FEATURES (IFF_NO_PI | IFF_ONE_QUEUE | IFF_VNET_HDR | \
>                IFF_MULTI_QUEUE | IFF_NAPI | IFF_NAPI_FRAGS)
> @@ -719,8 +720,10 @@ static void __tun_detach(struct tun_file *tfile,
> bool clean)
>              netif_carrier_off(tun->dev);
>
>              if (!(tun->flags & IFF_PERSIST) &&
> -                tun->dev->reg_state == NETREG_REGISTERED)
> +                tun->dev->reg_state == NETREG_REGISTERED) {
> +                tun->flags &= ~TUN_DEV_REGISTERED;


As I said for previous versions, it's not good to try to invent new
internal state like this: you would need to deal carefully with the
synchronization, which could mean locks or barriers. Considering that the
synchronization of tun is already complex, we had better avoid adding
more and instead use an existing mechanism, e.g. pointer publishing through RCU.

Thanks


>                  unregister_netdevice(tun->dev);
> +            }
>          }
>          if (tun)
>              xdp_rxq_info_unreg(&tfile->xdp_rxq);
> @@ -884,8 +887,12 @@ static struct tun_struct *tun_get(struct tun_file
> *tfile)
>
>      rcu_read_lock();
>      tun = rcu_dereference(tfile->tun);
> -    if (tun)
> -        dev_hold(tun->dev);
> +    if (tun) {
> +        if (tun->flags & TUN_DEV_REGISTERED)
> +            dev_hold(tun->dev);
> +        else
> +            tun = ERR_PTR(-EAGAIN);
> +    }
>      rcu_read_unlock();
>
>      return tun;
> @@ -1428,7 +1435,7 @@ static __poll_t tun_chr_poll(struct file *file,
> poll_table *wait)
>      struct sock *sk;
>      __poll_t mask = 0;
>
> -    if (!tun)
> +    if (IS_ERR_OR_NULL(tun))
>          return EPOLLERR;
>
>      sk = tfile->socket.sk;
> @@ -2017,6 +2024,9 @@ static ssize_t tun_chr_write_iter(struct kiocb
> *iocb, struct iov_iter *from)
>      if (!tun)
>          return -EBADFD;
>
> +    if (IS_ERR(tun))
> +        return PTR_ERR(tun);
> +
>      result = tun_get_user(tun, tfile, NULL, from,
>                    file->f_flags & O_NONBLOCK, false);
>
> @@ -2242,6 +2252,10 @@ static ssize_t tun_chr_read_iter(struct kiocb
> *iocb, struct iov_iter *to)
>
>      if (!tun)
>          return -EBADFD;
> +
> +    if (IS_ERR(tun))
> +        return PTR_ERR(tun);
> +
>      ret = tun_do_read(tun, tfile, to, file->f_flags & O_NONBLOCK, NULL);
>      ret = min_t(ssize_t, ret, len);
>      if (ret > 0)
> @@ -2525,6 +2539,9 @@ static int tun_sendmsg(struct socket *sock,
> struct msghdr *m, size_t total_len)
>      if (!tun)
>          return -EBADFD;
>
> +    if (IS_ERR(tun))
> +        return PTR_ERR(tun);
> +
>      if (ctl && (ctl->type == TUN_MSG_PTR)) {
>          struct tun_page tpage;
>          int n = ctl->num;
> @@ -2573,6 +2590,11 @@ static int tun_recvmsg(struct socket *sock,
> struct msghdr *m, size_t total_len,
>          goto out_free;
>      }
>
> +    if (IS_ERR(tun)) {
> +        ret = PTR_ERR(tun);
> +        goto out_free;
> +    }
> +
>      if (flags & ~(MSG_DONTWAIT|MSG_TRUNC|MSG_ERRQUEUE)) {
>          ret = -EINVAL;
>          goto out_put_tun;
> @@ -2622,6 +2644,9 @@ static int tun_peek_len(struct socket *sock)
>      if (!tun)
>          return 0;
>
> +    if (IS_ERR(tun))
> +        return PTR_ERR(tun);
> +
>      ret = PTR_RING_PEEK_CALL(&tfile->tx_ring, tun_ptr_peek_len);
>      tun_put(tun);
>
> @@ -2836,6 +2861,7 @@ static int tun_set_iff(struct net *net, struct
> file *file, struct ifreq *ifr)
>          err = register_netdevice(tun->dev);
>          if (err < 0)
>              goto err_detach;
> +        tun->flags |= TUN_DEV_REGISTERED;
>      }
>
>      netif_carrier_on(tun->dev);
Yang Yingliang Sept. 10, 2019, 2:31 a.m. UTC | #17
On 2019/9/5 11:10, Jason Wang wrote:
> On 2019/9/5 上午10:03, Yang Yingliang wrote:
>>
>> On 2019/9/3 18:50, Jason Wang wrote:
>>> ----- Original Message -----
>>>> On 2019/9/3 14:06, Jason Wang wrote:
>>>>> On 2019/9/3 下午1:42, Yang Yingliang wrote:
>>>>>> On 2019/9/3 11:03, Jason Wang wrote:
>>>>>>> On 2019/9/3 上午9:45, Yang Yingliang wrote:
>>>>>>>> On 2019/9/2 13:32, Jason Wang wrote:
>>>>>>>>> On 2019/8/23 下午5:36, Yang Yingliang wrote:
>>>>>>>>>> On 2019/8/23 11:05, Jason Wang wrote:
>>>>>>>>>>> ----- Original Message -----
>>>>>>>>>>>> On 2019/8/22 14:07, Yang Yingliang wrote:
>>>>>>>>>>>>> On 2019/8/22 10:13, Jason Wang wrote:
>>>>>>>>>>>>>> On 2019/8/20 上午10:28, Jason Wang wrote:
>>>>>>>>>>>>>>> On 2019/8/20 上午9:25, David Miller wrote:
>>>>>>>>>>>>>>>> From: Yang Yingliang <yangyingliang@huawei.com>
>>>>>>>>>>>>>>>> Date: Mon, 19 Aug 2019 21:31:19 +0800
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>>> Call tun_attach() after register_netdevice() to make sure
>>>>>>>>>>>>>>>>> tfile->tun
>>>>>>>>>>>>>>>>> is not published until the netdevice is registered. So the
>>>>>>>>>>>>>>>>> read/write
>>>>>>>>>>>>>>>>> thread can not use the tun pointer that may be freed by
>>>>>>>>>>>>>>>>> free_netdev().
>>>>>>>>>>>>>>>>> (The tun and dev pointer are allocated by
>>>>>>>>>>>>>>>>> alloc_netdev_mqs(), they
>>>>>>>>>>>>>>>>> can
>>>>>>>>>>>>>>>>> be freed by netdev_freemem().)
>>>>>>>>>>>>>>>> register_netdevice() must always be the last operation in
>>>>>>>>>>>>>>>> the order of
>>>>>>>>>>>>>>>> network device setup.
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> At the point register_netdevice() is called, the device is
>>>>>>>>>>>>>>>> visible
>>>>>>>>>>>>>>>> globally
>>>>>>>>>>>>>>>> and therefore all of its software state must be fully
>>>>>>>>>>>>>>>> initialized and
>>>>>>>>>>>>>>>> ready for use.
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> You're going to have to find another solution to these
>>>>>>>>>>>>>>>> problems.
>>>>>>>>>>>>>>> The device is loosely coupled with sockets/queues. Each
>>>>>>>>>>>>>>> side is
>>>>>>>>>>>>>>> allowed to go away without caring about the other side. So
>>>>>>>>>>>>>>> in this
>>>>>>>>>>>>>>> case, there's a small window in which the network stack thinks
>>>>>>>>>>>>>>> the device has
>>>>>>>>>>>>>>> one queue when it actually has none; the code can then safely
>>>>>>>>>>>>>>> drop them.
>>>>>>>>>>>>>>> Maybe it's ok here with some comments?
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> Or if not, we can try to hold the device before tun_attach
>>>>>>>>>>>>>>> and drop
>>>>>>>>>>>>>>> it after register_netdevice().
>>>>>>>>>>>>>> Hi Yang:
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> I think maybe we can try to hold refcnt instead of playing
>>>>>>>>>>>>>> real num
>>>>>>>>>>>>>> queues here. Do you want to post a V4?
>>>>>>>>>>>>> I think the refcnt can't prevent freeing the memory in this
>>>>>>>>>>>>> case.
>>>>>>>>>>>>> When register_netdevice() fails, free_netdev() will be called
>>>>>>>>>>>>> directly,
>>>>>>>>>>>>> and dev->pcpu_refcnt and dev are freed without checking the
>>>>>>>>>>>>> refcnt of dev.
>>>>>>>>>>>> How about using patch v1, which uses a flag to check whether the
>>>>>>>>>>>> device
>>>>>>>>>>>> was registered successfully?
>>>>>>>>>>>>
>>>>>>>>>>> As I said, it lacks sufficient locks or barriers. To be clear, I
>>>>>>>>>>> meant
>>>>>>>>>>> something like (compile-test only):
>>>>>>>>>>>
>>>>>>>>>>> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
>>>>>>>>>>> index db16d7a13e00..e52678f9f049 100644
>>>>>>>>>>> --- a/drivers/net/tun.c
>>>>>>>>>>> +++ b/drivers/net/tun.c
>>>>>>>>>>> @@ -2828,6 +2828,7 @@ static int tun_set_iff(struct net *net,
>>>>>>>>>>> struct file *file, struct ifreq *ifr)
>>>>>>>>>>>                                  (ifr->ifr_flags & TUN_FEATURES);
>>>>>>>>>>> INIT_LIST_HEAD(&tun->disabled);
>>>>>>>>>>> +               dev_hold(dev);
>>>>>>>>>>>                    err = tun_attach(tun, file, false,
>>>>>>>>>>> ifr->ifr_flags & IFF_NAPI,
>>>>>>>>>>>                                     ifr->ifr_flags &
>>>>>>>>>>> IFF_NAPI_FRAGS);
>>>>>>>>>>>                    if (err < 0)
>>>>>>>>>>> @@ -2836,6 +2837,7 @@ static int tun_set_iff(struct net *net,
>>>>>>>>>>> struct file *file, struct ifreq *ifr)
>>>>>>>>>>>                    err = register_netdevice(tun->dev);
>>>>>>>>>>>                    if (err < 0)
>>>>>>>>>>>                            goto err_detach;
>>>>>>>>>>> +               dev_put(dev);
>>>>>>>>>>>            }
>>>>>>>>>>>              netif_carrier_on(tun->dev);
>>>>>>>>>>> @@ -2852,11 +2854,13 @@ static int tun_set_iff(struct net *net,
>>>>>>>>>>> struct file *file, struct ifreq *ifr)
>>>>>>>>>>>            return 0;
>>>>>>>>>>>       err_detach:
>>>>>>>>>>> +       dev_put(dev);
>>>>>>>>>>>            tun_detach_all(dev);
>>>>>>>>>>>            /* register_netdevice() already called
>>>>>>>>>>> tun_free_netdev() */
>>>>>>>>>>>            goto err_free_dev;
>>>>>>>>>>>       err_free_flow:
>>>>>>>>>>> +       dev_put(dev);
>>>>>>>>>>>            tun_flow_uninit(tun);
>>>>>>>>>>> security_tun_dev_free_security(tun->security);
>>>>>>>>>>>     err_free_stat:
>>>>>>>>>>>
>>>>>>>>>>> What's your thought?
>>>>>>>>>> The dev pointer is freed without checking the refcount in
>>>>>>>>>> free_netdev(), called from the err_free_dev
>>>>>>>>>>
>>>>>>>>>> path, so I don't understand how the refcount protects this
>>>>>>>>>> pointer.
>>>>>>>>>>
>>>>>>>>> The refcount is guaranteed to be zero there, isn't it?
>>>>>>>> No, it's not.
>>>>>>>>
>>>>>>>> err_free_dev:
>>>>>>>>           free_netdev(dev);
>>>>>>>>
>>>>>>>> void free_netdev(struct net_device *dev)
>>>>>>>> {
>>>>>>>> ...
>>>>>>>>           /* pcpu_refcnt can be freed without checking refcount */
>>>>>>>>           free_percpu(dev->pcpu_refcnt);
>>>>>>>>           dev->pcpu_refcnt = NULL;
>>>>>>>>
>>>>>>>>           /*  Compatibility with error handling in drivers */
>>>>>>>>           if (dev->reg_state == NETREG_UNINITIALIZED) {
>>>>>>>>                   /* dev can be freed without checking refcount */
>>>>>>>>                   netdev_freemem(dev);
>>>>>>>>                   return;
>>>>>>>>           }
>>>>>>>> ...
>>>>>>>> }
>>>>>>> Right, but what I meant is in my patch, when code reaches
>>>>>>> free_netdev() the refcnt is zero. What did I miss?
>>>>>> Yes, but it can't fix the UAF problem.
>>>>> Well, it looks to me that the dev_put() in tun_put() won't release the
>>>>> device in this case.
>>>> The device is not released in tun_put().
>>>> This is how the UAF occurs:
>>>>
>>>>            CPUA                                           CPUB
>>>>        tun_set_iff()
>>>>          alloc_netdev_mqs()
>>>>          tun_attach()
>>>>                                                        tun_chr_read_iter()
>>>>                                                          tun_get()
>>>>                                                          tun_do_read()
>>>>                                                            tun_ring_recv()
>>>>          register_netdevice() <-- inject error
>>>>          goto err_detach
>>>>          tun_detach_all() <-- set RCV_SHUTDOWN
>>>>          free_netdev() <-- called from
>>>>                           err_free_dev path
>>>>            netdev_freemem() <-- free the memory
>>>>                              without check refcount
>>>>            (In this path, the refcount cannot prevent
>>>>             freeing the memory of dev, and the memory
>>>>             will be used by dev_put() called by
>>>>             tun_chr_read_iter() on CPUB.)
>>>>                                                           (Break from
>>>>                                                           tun_ring_recv(),
>>>>                                                           because RCV_SHUTDOWN
>>>>                                                           is set)
>>>>                                                         tun_put()
>>>>                                                         dev_put() <-- use the
>>>>                                                         memory freed by
>>>>                                                         netdev_freemem()
>>>>
>>>>
>>> My bad, thanks for the patience. Since all the evil comes from
>>> tfile->tun, how about delaying the publishing of tfile->tun until
>>> registration succeeds, to make sure dev_put() and dev_hold() work?
>>> (Compile test only)
>>>
>>> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
>>> index db16d7a13e00..aab0be40d443 100644
>>> --- a/drivers/net/tun.c
>>> +++ b/drivers/net/tun.c
>>> @@ -787,7 +787,8 @@ static void tun_detach_all(struct net_device *dev)
>>>    }
>>>      static int tun_attach(struct tun_struct *tun, struct file *file,
>>> -              bool skip_filter, bool napi, bool napi_frags)
>>> +              bool skip_filter, bool napi, bool napi_frags,
>>> +              bool publish_tun)
>>>    {
>>>        struct tun_file *tfile = file->private_data;
>>>        struct net_device *dev = tun->dev;
>>> @@ -870,7 +871,8 @@ static int tun_attach(struct tun_struct *tun,
>>> struct file *file,
>>>         * initialized tfile; otherwise we risk using half-initialized
>>>         * object.
>>>         */
>>> -    rcu_assign_pointer(tfile->tun, tun);
>>> +    if (publish_tun)
>>> +        rcu_assign_pointer(tfile->tun, tun);
>>>        rcu_assign_pointer(tun->tfiles[tun->numqueues], tfile);
>>>        tun->numqueues++;
>>>        tun_set_real_num_queues(tun);
>>> @@ -2730,7 +2732,7 @@ static int tun_set_iff(struct net *net, struct
>>> file *file, struct ifreq *ifr)
>>>              err = tun_attach(tun, file, ifr->ifr_flags & IFF_NOFILTER,
>>>                     ifr->ifr_flags & IFF_NAPI,
>>> -                 ifr->ifr_flags & IFF_NAPI_FRAGS);
>>> +                 ifr->ifr_flags & IFF_NAPI_FRAGS, true);
>>>            if (err < 0)
>>>                return err;
>>>    @@ -2829,13 +2831,17 @@ static int tun_set_iff(struct net *net,
>>> struct file *file, struct ifreq *ifr)
>>>              INIT_LIST_HEAD(&tun->disabled);
>>>            err = tun_attach(tun, file, false, ifr->ifr_flags & IFF_NAPI,
>>> -                 ifr->ifr_flags & IFF_NAPI_FRAGS);
>>> +                 ifr->ifr_flags & IFF_NAPI_FRAGS, false);
>>>            if (err < 0)
>>>                goto err_free_flow;
>>>              err = register_netdevice(tun->dev);
>>>            if (err < 0)
>>>                goto err_detach;
>>> +        /* free_netdev() won't check refcnt, to avoid a race
>>> +         * with dev_put() we need to publish tun after registration.
>>> +         */
>>> +        rcu_assign_pointer(tfile->tun, tun);
>>>        }
>>>          netif_carrier_on(tun->dev);
>>> @@ -2978,7 +2984,7 @@ static int tun_set_queue(struct file *file,
>>> struct ifreq *ifr)
>>>            if (ret < 0)
>>>                goto unlock;
>>>            ret = tun_attach(tun, file, false, tun->flags & IFF_NAPI,
>>> -                 tun->flags & IFF_NAPI_FRAGS);
>>> +                 tun->flags & IFF_NAPI_FRAGS, true);
>>>        } else if (ifr->ifr_flags & IFF_DETACH_QUEUE) {
>>>            tun = rtnl_dereference(tfile->tun);
>>>            if (!tun || !(tun->flags & IFF_MULTI_QUEUE) ||
>>> tfile->detached)
>> I tested this patch, it can fix this UAF.
>>
>> But as Eric replied in my patch v1, tun_get() will return NULL as long
>> as tun_set_iff() (TUNSETIFF ioctl())
>> has not yet been called.
>
> Isn't this the expected behavior? Without TUNSETIFF, it means the
> netdevice is not attached, so tun_get() should return NULL here.
>
>
>> This could break some applications, since tun_get() is used from poll()
>> and other syscalls.
>>
>> I think it should return '-EAGAIN' instead of '-EBADFD' in this case. I
>> made some changes to patch v1;
>> if it's OK, I will send a v4.
>>
>>   drivers/net/tun.c | 34 ++++++++++++++++++++++++++++++----
>>   1 file changed, 30 insertions(+), 4 deletions(-)
>>
>> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
>> index db16d7a13e00..0abc654010e3 100644
>> --- a/drivers/net/tun.c
>> +++ b/drivers/net/tun.c
>> @@ -115,6 +115,7 @@ do {                                \
>>   /* High bits in flags field are unused. */
>>   #define TUN_VNET_LE     0x80000000
>>   #define TUN_VNET_BE     0x40000000
>> +#define TUN_DEV_REGISTERED    0x20000000
>>
>>   #define TUN_FEATURES (IFF_NO_PI | IFF_ONE_QUEUE | IFF_VNET_HDR | \
>>                 IFF_MULTI_QUEUE | IFF_NAPI | IFF_NAPI_FRAGS)
>> @@ -719,8 +720,10 @@ static void __tun_detach(struct tun_file *tfile,
>> bool clean)
>>               netif_carrier_off(tun->dev);
>>
>>               if (!(tun->flags & IFF_PERSIST) &&
>> -                tun->dev->reg_state == NETREG_REGISTERED)
>> +                tun->dev->reg_state == NETREG_REGISTERED) {
>> +                tun->flags &= ~TUN_DEV_REGISTERED;
>
> As I said for previous versions, it's not good to try to invent new
> internal state like this; you would need to deal carefully with the
> synchronization, which could be locks or barriers. Considering that the
> synchronization of tun is already complex, let's try to avoid adding
> more and use an existing mechanism instead, e.g. pointer publishing
> through RCU.
OK, should I post a V4 using the diff you sent?
>
> Thanks
>
>
>>                   unregister_netdevice(tun->dev);
>> +            }
>>           }
>>           if (tun)
>>               xdp_rxq_info_unreg(&tfile->xdp_rxq);
>> @@ -884,8 +887,12 @@ static struct tun_struct *tun_get(struct tun_file
>> *tfile)
>>
>>       rcu_read_lock();
>>       tun = rcu_dereference(tfile->tun);
>> -    if (tun)
>> -        dev_hold(tun->dev);
>> +    if (tun) {
>> +        if (tun->flags & TUN_DEV_REGISTERED)
>> +            dev_hold(tun->dev);
>> +        else
>> +            tun = ERR_PTR(-EAGAIN);
>> +    }
>>       rcu_read_unlock();
>>
>>       return tun;
>> @@ -1428,7 +1435,7 @@ static __poll_t tun_chr_poll(struct file *file,
>> poll_table *wait)
>>       struct sock *sk;
>>       __poll_t mask = 0;
>>
>> -    if (!tun)
>> +    if (IS_ERR_OR_NULL(tun))
>>           return EPOLLERR;
>>
>>       sk = tfile->socket.sk;
>> @@ -2017,6 +2024,9 @@ static ssize_t tun_chr_write_iter(struct kiocb
>> *iocb, struct iov_iter *from)
>>       if (!tun)
>>           return -EBADFD;
>>
>> +    if (IS_ERR(tun))
>> +        return PTR_ERR(tun);
>> +
>>       result = tun_get_user(tun, tfile, NULL, from,
>>                     file->f_flags & O_NONBLOCK, false);
>>
>> @@ -2242,6 +2252,10 @@ static ssize_t tun_chr_read_iter(struct kiocb
>> *iocb, struct iov_iter *to)
>>
>>       if (!tun)
>>           return -EBADFD;
>> +
>> +    if (IS_ERR(tun))
>> +        return PTR_ERR(tun);
>> +
>>       ret = tun_do_read(tun, tfile, to, file->f_flags & O_NONBLOCK, NULL);
>>       ret = min_t(ssize_t, ret, len);
>>       if (ret > 0)
>> @@ -2525,6 +2539,9 @@ static int tun_sendmsg(struct socket *sock,
>> struct msghdr *m, size_t total_len)
>>       if (!tun)
>>           return -EBADFD;
>>
>> +    if (IS_ERR(tun))
>> +        return PTR_ERR(tun);
>> +
>>       if (ctl && (ctl->type == TUN_MSG_PTR)) {
>>           struct tun_page tpage;
>>           int n = ctl->num;
>> @@ -2573,6 +2590,11 @@ static int tun_recvmsg(struct socket *sock,
>> struct msghdr *m, size_t total_len,
>>           goto out_free;
>>       }
>>
>> +    if (IS_ERR(tun)) {
>> +        ret = PTR_ERR(tun);
>> +        goto out_free;
>> +    }
>> +
>>       if (flags & ~(MSG_DONTWAIT|MSG_TRUNC|MSG_ERRQUEUE)) {
>>           ret = -EINVAL;
>>           goto out_put_tun;
>> @@ -2622,6 +2644,9 @@ static int tun_peek_len(struct socket *sock)
>>       if (!tun)
>>           return 0;
>>
>> +    if (IS_ERR(tun))
>> +        return PTR_ERR(tun);
>> +
>>       ret = PTR_RING_PEEK_CALL(&tfile->tx_ring, tun_ptr_peek_len);
>>       tun_put(tun);
>>
>> @@ -2836,6 +2861,7 @@ static int tun_set_iff(struct net *net, struct
>> file *file, struct ifreq *ifr)
>>           err = register_netdevice(tun->dev);
>>           if (err < 0)
>>               goto err_detach;
>> +        tun->flags |= TUN_DEV_REGISTERED;
>>       }
>>
>>       netif_carrier_on(tun->dev);
Jason Wang Sept. 10, 2019, 2:36 a.m. UTC | #18
On 2019/9/10 上午10:31, Yang Yingliang wrote:
>>>
>>>               if (!(tun->flags & IFF_PERSIST) &&
>>> -                tun->dev->reg_state == NETREG_REGISTERED)
>>> +                tun->dev->reg_state == NETREG_REGISTERED) {
>>> +                tun->flags &= ~TUN_DEV_REGISTERED;
>>
>> As I said for previous versions, it's not good to try to invent new
>> internal state like this; you would need to deal carefully with the
>> synchronization, which could be locks or barriers. Considering that the
>> synchronization of tun is already complex, let's try to avoid adding
>> more and use an existing mechanism instead, e.g. pointer publishing
>> through RCU.
> OK, should I post a V4 using the diff you sent?


Yes, please.

Thanks
diff mbox series

Patch

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index db16d7a13e00..07d1e945385a 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -2828,14 +2828,19 @@  static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
 			      (ifr->ifr_flags & TUN_FEATURES);
 
 		INIT_LIST_HEAD(&tun->disabled);
-		err = tun_attach(tun, file, false, ifr->ifr_flags & IFF_NAPI,
-				 ifr->ifr_flags & IFF_NAPI_FRAGS);
-		if (err < 0)
-			goto err_free_flow;
+
+		netif_set_real_num_tx_queues(tun->dev, 1);
+		netif_set_real_num_rx_queues(tun->dev, 1);
 
 		err = register_netdevice(tun->dev);
 		if (err < 0)
-			goto err_detach;
+			/* register_netdevice() already called tun_free_netdev() */
+			goto err_free_dev;
+
+		err = tun_attach(tun, file, false, ifr->ifr_flags & IFF_NAPI,
+				 ifr->ifr_flags & IFF_NAPI_FRAGS);
+		if (err < 0)
+			goto err_unregister;
 	}
 
 	netif_carrier_on(tun->dev);
@@ -2851,14 +2856,10 @@  static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
 	strcpy(ifr->ifr_name, tun->dev->name);
 	return 0;
 
-err_detach:
-	tun_detach_all(dev);
-	/* register_netdevice() already called tun_free_netdev() */
-	goto err_free_dev;
+err_unregister:
+	unregister_netdevice(dev);
+	return err;
 
-err_free_flow:
-	tun_flow_uninit(tun);
-	security_tun_dev_free_security(tun->security);
 err_free_stat:
 	free_percpu(tun->pcpu_stats);
 err_free_dev: