diff mbox series

[net,1/2] tun: hold napi_mutex for all napi operations

Message ID 20190107200224.220467-1-sdf@google.com
State Changes Requested
Delegated to: David Miller
Headers show
Series [net,1/2] tun: hold napi_mutex for all napi operations | expand

Commit Message

Stanislav Fomichev Jan. 7, 2019, 8:02 p.m. UTC
BUG: unable to handle kernel NULL pointer dereference at 00000000000000d1
Call Trace:
 ? napi_gro_frags+0xa7/0x2c0
 tun_get_user+0xb50/0xf20
 tun_chr_write_iter+0x53/0x70
 new_sync_write+0xff/0x160
 vfs_write+0x191/0x1e0
 __x64_sys_write+0x5e/0xd0
 do_syscall_64+0x47/0xf0
 entry_SYSCALL_64_after_hwframe+0x44/0xa9

I think there is a subtle race between sending a packet via tap and
attaching it:

CPU0:                    CPU1:
tun_chr_ioctl(TUNSETIFF)
  tun_set_iff
    tun_attach
      rcu_assign_pointer(tfile->tun, tun);
                         tun_fops->write_iter()
                           tun_chr_write_iter
                             tun_napi_alloc_frags
                               napi_get_frags
                                 napi->skb = napi_alloc_skb
      tun_napi_init
        netif_napi_add
          napi->skb = NULL
                              napi->skb is NULL here
                              napi_gro_frags
                                napi_frags_skb
				  skb = napi->skb
				  skb_reset_mac_header(skb)
				  panic()

To fix, do the following:
* Move rcu_assign_pointer(tfile->tun, tun) to be the last thing we do
  in tun_attach(); this should guarantee that when we call tun_get()
  we always get an initialized object
* As another safeguard, always grab napi_mutex whenever doing any
  napi operation; this should prevent napi state change between
  calls to napi_get_frags and napi_gro_frags

Reported-by: syzbot <syzkaller@googlegroups.com>
Fixes: 90e33d459407 ("tun: enable napi_gro_frags() for TUN/TAP driver")

Signed-off-by: Stanislav Fomichev <sdf@google.com>
---
 drivers/net/tun.c | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

Comments

Eric Dumazet Jan. 7, 2019, 8:22 p.m. UTC | #1
On Mon, Jan 7, 2019 at 12:02 PM Stanislav Fomichev <sdf@google.com> wrote:
>
> BUG: unable to handle kernel NULL pointer dereference at 00000000000000d1
> Call Trace:
>  ? napi_gro_frags+0xa7/0x2c0
>  tun_get_user+0xb50/0xf20
>  tun_chr_write_iter+0x53/0x70
>  new_sync_write+0xff/0x160
>  vfs_write+0x191/0x1e0
>  __x64_sys_write+0x5e/0xd0
>  do_syscall_64+0x47/0xf0
>  entry_SYSCALL_64_after_hwframe+0x44/0xa9
>
> I think there is a subtle race between sending a packet via tap and
> attaching it:
>
> CPU0:                    CPU1:
> tun_chr_ioctl(TUNSETIFF)
>   tun_set_iff
>     tun_attach
>       rcu_assign_pointer(tfile->tun, tun);
>                          tun_fops->write_iter()
>                            tun_chr_write_iter
>                              tun_napi_alloc_frags
>                                napi_get_frags
>                                  napi->skb = napi_alloc_skb
>       tun_napi_init
>         netif_napi_add
>           napi->skb = NULL
>                               napi->skb is NULL here
>                               napi_gro_frags
>                                 napi_frags_skb
>                                   skb = napi->skb
>                                   skb_reset_mac_header(skb)
>                                   panic()
>
> To fix, do the following:
> * Move rcu_assign_pointer(tfile->tun, tun) to be the last thing we do
>   in tun_attach(); this should guarantee that when we call tun_get()
>   we always get an initialized object
> * As another safeguard, always grab napi_mutex whenever doing any
>   napi operation; this should prevent napi state change between
>   calls to napi_get_frags and napi_gro_frags
>
> Reported-by: syzbot <syzkaller@googlegroups.com>
> Fixes: 90e33d459407 ("tun: enable napi_gro_frags() for TUN/TAP driver")
>
> Signed-off-by: Stanislav Fomichev <sdf@google.com>
> ---
>  drivers/net/tun.c | 18 +++++++++++++++---
>  1 file changed, 15 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
> index a4fdad475594..7875f06011f2 100644
> --- a/drivers/net/tun.c
> +++ b/drivers/net/tun.c
> @@ -323,22 +323,30 @@ static void tun_napi_init(struct tun_struct *tun, struct tun_file *tfile,
>         tfile->napi_enabled = napi_en;
>         tfile->napi_frags_enabled = napi_en && napi_frags;
>         if (napi_en) {
> +               mutex_lock(&tfile->napi_mutex);
>                 netif_napi_add(tun->dev, &tfile->napi, tun_napi_poll,
>                                NAPI_POLL_WEIGHT);
>                 napi_enable(&tfile->napi);
> +               mutex_unlock(&tfile->napi_mutex);
>         }
>  }
>
>  static void tun_napi_disable(struct tun_file *tfile)
>  {
> -       if (tfile->napi_enabled)
> +       if (tfile->napi_enabled) {
> +               mutex_lock(&tfile->napi_mutex);
>                 napi_disable(&tfile->napi);
> +               mutex_unlock(&tfile->napi_mutex);
> +       }
>  }
>
>  static void tun_napi_del(struct tun_file *tfile)
>  {
> -       if (tfile->napi_enabled)
> +       if (tfile->napi_enabled) {
> +               mutex_lock(&tfile->napi_mutex);
>                 netif_napi_del(&tfile->napi);
> +               mutex_unlock(&tfile->napi_mutex);
> +       }
>  }
>
>  static bool tun_napi_frags_enabled(const struct tun_file *tfile)
> @@ -856,7 +864,6 @@ static int tun_attach(struct tun_struct *tun, struct file *file,
>                 err = 0;
>         }
>
> -       rcu_assign_pointer(tfile->tun, tun);
>         rcu_assign_pointer(tun->tfiles[tun->numqueues], tfile);
>         tun->numqueues++;
>
> @@ -876,6 +883,11 @@ static int tun_attach(struct tun_struct *tun, struct file *file,
>          * refcnt.
>          */
>
> +       /* All tun_fops depend on tun_get() returning non-null pointer.
> +        * Thus, assigning tun to a tfile should be the last init operation,
> +        * otherwise we risk using half-initialized object.
> +        */
> +       rcu_assign_pointer(tfile->tun, tun);
>  out:
>         return err;
>  }

Hmmm I believe the issue is different : We need to call
tun_napi_init() before doing the publish in the tun->tfiles[] array

My patch was :

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index a4fdad47559462fbd049a89f880cc3fb33d1151d..dc751d1cbc21a2e2687c5739b44322cd64d0cb46
100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -857,15 +857,15 @@ static int tun_attach(struct tun_struct *tun,
struct file *file,
        }

        rcu_assign_pointer(tfile->tun, tun);
+       if (!tfile->detached) {
+               tun_napi_init(tun, tfile, napi, napi_frags);
+               sock_hold(&tfile->sk);
+       }
        rcu_assign_pointer(tun->tfiles[tun->numqueues], tfile);
        tun->numqueues++;

-       if (tfile->detached) {
+       if (tfile->detached)
                tun_enable_queue(tfile);
-       } else {
-               sock_hold(&tfile->sk);
-               tun_napi_init(tun, tfile, napi, napi_frags);
-       }

        if (rtnl_dereference(tun->xdp_prog))
                sock_set_flag(&tfile->sk, SOCK_XDP);
Stanislav Fomichev Jan. 7, 2019, 9:02 p.m. UTC | #2
On 01/07, Eric Dumazet wrote:
> On Mon, Jan 7, 2019 at 12:02 PM Stanislav Fomichev <sdf@google.com> wrote:
> >
> > BUG: unable to handle kernel NULL pointer dereference at 00000000000000d1
> > Call Trace:
> >  ? napi_gro_frags+0xa7/0x2c0
> >  tun_get_user+0xb50/0xf20
> >  tun_chr_write_iter+0x53/0x70
> >  new_sync_write+0xff/0x160
> >  vfs_write+0x191/0x1e0
> >  __x64_sys_write+0x5e/0xd0
> >  do_syscall_64+0x47/0xf0
> >  entry_SYSCALL_64_after_hwframe+0x44/0xa9
> >
> > I think there is a subtle race between sending a packet via tap and
> > attaching it:
> >
> > CPU0:                    CPU1:
> > tun_chr_ioctl(TUNSETIFF)
> >   tun_set_iff
> >     tun_attach
> >       rcu_assign_pointer(tfile->tun, tun);
> >                          tun_fops->write_iter()
> >                            tun_chr_write_iter
> >                              tun_napi_alloc_frags
> >                                napi_get_frags
> >                                  napi->skb = napi_alloc_skb
> >       tun_napi_init
> >         netif_napi_add
> >           napi->skb = NULL
> >                               napi->skb is NULL here
> >                               napi_gro_frags
> >                                 napi_frags_skb
> >                                   skb = napi->skb
> >                                   skb_reset_mac_header(skb)
> >                                   panic()
> >
> > To fix, do the following:
> > * Move rcu_assign_pointer(tfile->tun, tun) to be the last thing we do
> >   in tun_attach(); this should guarantee that when we call tun_get()
> >   we always get an initialized object
> > * As another safeguard, always grab napi_mutex whenever doing any
> >   napi operation; this should prevent napi state change between
> >   calls to napi_get_frags and napi_gro_frags
> >
> > Reported-by: syzbot <syzkaller@googlegroups.com>
> > Fixes: 90e33d459407 ("tun: enable napi_gro_frags() for TUN/TAP driver")
> >
> > Signed-off-by: Stanislav Fomichev <sdf@google.com>
> > ---
> >  drivers/net/tun.c | 18 +++++++++++++++---
> >  1 file changed, 15 insertions(+), 3 deletions(-)
> >
> > diff --git a/drivers/net/tun.c b/drivers/net/tun.c
> > index a4fdad475594..7875f06011f2 100644
> > --- a/drivers/net/tun.c
> > +++ b/drivers/net/tun.c
> > @@ -323,22 +323,30 @@ static void tun_napi_init(struct tun_struct *tun, struct tun_file *tfile,
> >         tfile->napi_enabled = napi_en;
> >         tfile->napi_frags_enabled = napi_en && napi_frags;
> >         if (napi_en) {
> > +               mutex_lock(&tfile->napi_mutex);
> >                 netif_napi_add(tun->dev, &tfile->napi, tun_napi_poll,
> >                                NAPI_POLL_WEIGHT);
> >                 napi_enable(&tfile->napi);
> > +               mutex_unlock(&tfile->napi_mutex);
> >         }
> >  }
> >
> >  static void tun_napi_disable(struct tun_file *tfile)
> >  {
> > -       if (tfile->napi_enabled)
> > +       if (tfile->napi_enabled) {
> > +               mutex_lock(&tfile->napi_mutex);
> >                 napi_disable(&tfile->napi);
> > +               mutex_unlock(&tfile->napi_mutex);
> > +       }
> >  }
> >
> >  static void tun_napi_del(struct tun_file *tfile)
> >  {
> > -       if (tfile->napi_enabled)
> > +       if (tfile->napi_enabled) {
> > +               mutex_lock(&tfile->napi_mutex);
> >                 netif_napi_del(&tfile->napi);
> > +               mutex_unlock(&tfile->napi_mutex);
> > +       }
> >  }
> >
> >  static bool tun_napi_frags_enabled(const struct tun_file *tfile)
> > @@ -856,7 +864,6 @@ static int tun_attach(struct tun_struct *tun, struct file *file,
> >                 err = 0;
> >         }
> >
> > -       rcu_assign_pointer(tfile->tun, tun);
> >         rcu_assign_pointer(tun->tfiles[tun->numqueues], tfile);
> >         tun->numqueues++;
> >
> > @@ -876,6 +883,11 @@ static int tun_attach(struct tun_struct *tun, struct file *file,
> >          * refcnt.
> >          */
> >
> > +       /* All tun_fops depend on tun_get() returning non-null pointer.
> > +        * Thus, assigning tun to a tfile should be the last init operation,
> > +        * otherwise we risk using half-initialized object.
> > +        */
> > +       rcu_assign_pointer(tfile->tun, tun);
> >  out:
> >         return err;
> >  }
> 
> Hmmm I believe the issue is different : We need to call
> tun_napi_init() before doing the publish in the tun->tfiles[] array
> 
> My patch was :
Still fails with your patch.

Maybe the best way is to move both of those publishes (tfile->tun and
tun->tfiles[]) to the end of tun_attach? It looks like tfile->tun
is a publish for syscall side (most of tun_socket_ops and tun_fops call
tun_get which looks at tun->dev and bail out if it's NULL) and tun->tfiles
is (mostly, but not really) for napi side.

Here is a repro I'm using if you want to poke it:

syz-execprog -threaded -collide -repeat=0 -procs=6 <rep.syz>

# See https://goo.gl/kgGztJ for information about syzkaller reproducers.
#{"threaded":true,"collide":true,"repeat":true,"procs":6,"sandbox":"none","fault_call":-1,"tun":true,"tmpdir":true,"cgroups":true,"netdev":true,"resetnet":true,"segv":true}
openat$tun(0xffffffffffffff9c, 0x0, 0x0, 0x0)
r0 = openat$tun(0xffffffffffffff9c,
&(0x7f0000001a00)='/dev/net/tun\x00', 0x0, 0x0)
ioctl$TUNSETIFF(r0, 0x400454ca, &(0x7f0000000300)={"6e72300100",
0x1132})
r1 = socket$kcm(0x2, 0x3, 0x2)
ioctl$PERF_EVENT_IOC_SET_FILTER(r1, 0x8914,
&(0x7f0000000780)="6e7230010060a19ef9d2c673d9a1571cb9e1369bcd61ef7e49793ae18712eceb1daa769497800b7fbbd35b170c10751d39aeb660d863e49b8c4f3b3dad48902b5b2d6cfd0abd372c63bcf5d70df3fd4d2e8d443c88bc0e5637dd82fc3435bed4de5d693c9a781c863e05d8a6f8689a5be29216061f3ff53f8b6b396678e7ba155ef9152d7e43b1eccb2331eb8eb1ed5586dcf8b3b0b999361a44ff2c22c2abbef42dd24eabe6723346a6e46c0499a21442d8d00dcb57f013ff7595edd0ff076930de3675d34117a44eb0e4f832936da44e57e43a3e36bd48d2a85bf4fd4a804e83f2f3cf378a435af5e287d4e27337b4ada11b26219832ec6b2b38446b3b95fe3771e9f42ca30fb21e12f0a3d8bc2d85454af9fcc0232d8fd909448b01f46c593d31ea1c926465e35a4199079c3ca41128b17cb01fbf5b522be0fd02022ada37fecc14b6c8c8831883b85a1106f2f867020d529f17a350f20dd3bf51a98cfda70c2e3638a483fd3f87940bb478b07c4c110394c0093d17955089f2ca97bbe075124c9b1ff6500d536a95d96f03d48596e008bf0a028b539cec796cec9bf585eb80fe3e0d26")
perf_event_open$cgroup(&(0x7f0000000440)={0x7, 0x70, 0x1, 0x0, 0x6e36,
0x38000000, 0x0, 0x1, 0x21060, 0x0, 0x0, 0x0, 0x20, 0x3, 0xfc50, 0x9,
0x8000, 0x0, 0x0, 0x0, 0x46c8, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
0x0, 0x0, 0x8, 0x3f, 0x401, 0x8001, 0x8, 0x9, 0x0, 0x0, 0x1, 0x1,
@perf_config_ext={0x100000000, 0x6}, 0x10, 0x5, 0x2, 0x7,
0xfffffffffffffff9, 0x0, 0x7}, 0xffffffffffffffff, 0x5,
0xffffffffffffffff, 0x0)
# 0x2 = O_RDWR
r2 = openat$tun(0xffffffffffffff9c,
&(0x7f0000001a00)='/dev/net/tun\x00', 0x2, 0x0)
#  IFF_TAP         0x0002
#  IFF_NAPI        0x0010
#  IFF_NAPI_FRAGS  0x0020
#  IFF_MULTI_QUEUE 0x0100
#  IFF_MULTICAST = 0x1000
ioctl$TUNSETIFF(r2, 0x400454ca, &(0x7f0000000300)={"6e72300100",
0x1132})
write$cgroup_int(r2, &(0x7f0000000000), 0x17b)

> 
> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
> index a4fdad47559462fbd049a89f880cc3fb33d1151d..dc751d1cbc21a2e2687c5739b44322cd64d0cb46
> 100644
> --- a/drivers/net/tun.c
> +++ b/drivers/net/tun.c
> @@ -857,15 +857,15 @@ static int tun_attach(struct tun_struct *tun,
> struct file *file,
>         }
> 
>         rcu_assign_pointer(tfile->tun, tun);
> +       if (!tfile->detached) {
> +               tun_napi_init(tun, tfile, napi, napi_frags);
> +               sock_hold(&tfile->sk);
> +       }
>         rcu_assign_pointer(tun->tfiles[tun->numqueues], tfile);
>         tun->numqueues++;
> 
> -       if (tfile->detached) {
> +       if (tfile->detached)
>                 tun_enable_queue(tfile);
> -       } else {
> -               sock_hold(&tfile->sk);
> -               tun_napi_init(tun, tfile, napi, napi_frags);
> -       }
> 
>         if (rtnl_dereference(tun->xdp_prog))
>                 sock_set_flag(&tfile->sk, SOCK_XDP);
Eric Dumazet Jan. 7, 2019, 9:10 p.m. UTC | #3
On 01/07/2019 01:02 PM, Stanislav Fomichev wrote:
> On 01/07, Eric Dumazet wrote:
>> On Mon, Jan 7, 2019 at 12:02 PM Stanislav Fomichev <sdf@google.com> wrote:
>>>
>>> BUG: unable to handle kernel NULL pointer dereference at 00000000000000d1
>>> Call Trace:
>>>  ? napi_gro_frags+0xa7/0x2c0
>>>  tun_get_user+0xb50/0xf20
>>>  tun_chr_write_iter+0x53/0x70
>>>  new_sync_write+0xff/0x160
>>>  vfs_write+0x191/0x1e0
>>>  __x64_sys_write+0x5e/0xd0
>>>  do_syscall_64+0x47/0xf0
>>>  entry_SYSCALL_64_after_hwframe+0x44/0xa9
>>>
>>> I think there is a subtle race between sending a packet via tap and
>>> attaching it:
>>>
>>> CPU0:                    CPU1:
>>> tun_chr_ioctl(TUNSETIFF)
>>>   tun_set_iff
>>>     tun_attach
>>>       rcu_assign_pointer(tfile->tun, tun);
>>>                          tun_fops->write_iter()
>>>                            tun_chr_write_iter
>>>                              tun_napi_alloc_frags
>>>                                napi_get_frags
>>>                                  napi->skb = napi_alloc_skb
>>>       tun_napi_init
>>>         netif_napi_add
>>>           napi->skb = NULL
>>>                               napi->skb is NULL here
>>>                               napi_gro_frags
>>>                                 napi_frags_skb
>>>                                   skb = napi->skb
>>>                                   skb_reset_mac_header(skb)
>>>                                   panic()
>>>
>>> To fix, do the following:
>>> * Move rcu_assign_pointer(tfile->tun, tun) to be the last thing we do
>>>   in tun_attach(); this should guarantee that when we call tun_get()
>>>   we always get an initialized object
>>> * As another safeguard, always grab napi_mutex whenever doing any
>>>   napi operation; this should prevent napi state change between
>>>   calls to napi_get_frags and napi_gro_frags
>>>
>>> Reported-by: syzbot <syzkaller@googlegroups.com>
>>> Fixes: 90e33d459407 ("tun: enable napi_gro_frags() for TUN/TAP driver")
>>>
>>> Signed-off-by: Stanislav Fomichev <sdf@google.com>
>>> ---
>>>  drivers/net/tun.c | 18 +++++++++++++++---
>>>  1 file changed, 15 insertions(+), 3 deletions(-)
>>>
>>> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
>>> index a4fdad475594..7875f06011f2 100644
>>> --- a/drivers/net/tun.c
>>> +++ b/drivers/net/tun.c
>>> @@ -323,22 +323,30 @@ static void tun_napi_init(struct tun_struct *tun, struct tun_file *tfile,
>>>         tfile->napi_enabled = napi_en;
>>>         tfile->napi_frags_enabled = napi_en && napi_frags;
>>>         if (napi_en) {
>>> +               mutex_lock(&tfile->napi_mutex);
>>>                 netif_napi_add(tun->dev, &tfile->napi, tun_napi_poll,
>>>                                NAPI_POLL_WEIGHT);
>>>                 napi_enable(&tfile->napi);
>>> +               mutex_unlock(&tfile->napi_mutex);
>>>         }
>>>  }
>>>
>>>  static void tun_napi_disable(struct tun_file *tfile)
>>>  {
>>> -       if (tfile->napi_enabled)
>>> +       if (tfile->napi_enabled) {
>>> +               mutex_lock(&tfile->napi_mutex);
>>>                 napi_disable(&tfile->napi);
>>> +               mutex_unlock(&tfile->napi_mutex);
>>> +       }
>>>  }
>>>
>>>  static void tun_napi_del(struct tun_file *tfile)
>>>  {
>>> -       if (tfile->napi_enabled)
>>> +       if (tfile->napi_enabled) {
>>> +               mutex_lock(&tfile->napi_mutex);
>>>                 netif_napi_del(&tfile->napi);
>>> +               mutex_unlock(&tfile->napi_mutex);
>>> +       }
>>>  }
>>>
>>>  static bool tun_napi_frags_enabled(const struct tun_file *tfile)
>>> @@ -856,7 +864,6 @@ static int tun_attach(struct tun_struct *tun, struct file *file,
>>>                 err = 0;
>>>         }
>>>
>>> -       rcu_assign_pointer(tfile->tun, tun);
>>>         rcu_assign_pointer(tun->tfiles[tun->numqueues], tfile);
>>>         tun->numqueues++;
>>>
>>> @@ -876,6 +883,11 @@ static int tun_attach(struct tun_struct *tun, struct file *file,
>>>          * refcnt.
>>>          */
>>>
>>> +       /* All tun_fops depend on tun_get() returning non-null pointer.
>>> +        * Thus, assigning tun to a tfile should be the last init operation,
>>> +        * otherwise we risk using half-initialized object.
>>> +        */
>>> +       rcu_assign_pointer(tfile->tun, tun);
>>>  out:
>>>         return err;
>>>  }
>>
>> Hmmm I believe the issue is different : We need to call
>> tun_napi_init() before doing the publish in the tun->tfiles[] array
>>
>> My patch was :
> Still fails with your patch.
> 
> Maybe the best way is to move both of those publishes (tfile->tun and
> tun->tfiles[]) to the end of tun_attach? It looks like tfile->tun
> is a publish for syscall side (most of tun_socket_ops and tun_fops call
> tun_get which looks at tun->dev and bail out if it's NULL) and tun->tfiles
> is (mostly, but not really) for napi side.
> 
> Here is a repro I'm using if you want to poke it:
> 
> syz-execprog -threaded -collide -repeat=0 -procs=6 <rep.syz>
> 
> # See https://goo.gl/kgGztJ for information about syzkaller reproducers.
> #{"threaded":true,"collide":true,"repeat":true,"procs":6,"sandbox":"none","fault_call":-1,"tun":true,"tmpdir":true,"cgroups":true,"netdev":true,"resetnet":true,"segv":true}
> openat$tun(0xffffffffffffff9c, 0x0, 0x0, 0x0)
> r0 = openat$tun(0xffffffffffffff9c,
> &(0x7f0000001a00)='/dev/net/tun\x00', 0x0, 0x0)
> ioctl$TUNSETIFF(r0, 0x400454ca, &(0x7f0000000300)={"6e72300100",
> 0x1132})
> r1 = socket$kcm(0x2, 0x3, 0x2)
> ioctl$PERF_EVENT_IOC_SET_FILTER(r1, 0x8914,
> &(0x7f0000000780)="6e7230010060a19ef9d2c673d9a1571cb9e1369bcd61ef7e49793ae18712eceb1daa769497800b7fbbd35b170c10751d39aeb660d863e49b8c4f3b3dad48902b5b2d6cfd0abd372c63bcf5d70df3fd4d2e8d443c88bc0e5637dd82fc3435bed4de5d693c9a781c863e05d8a6f8689a5be29216061f3ff53f8b6b396678e7ba155ef9152d7e43b1eccb2331eb8eb1ed5586dcf8b3b0b999361a44ff2c22c2abbef42dd24eabe6723346a6e46c0499a21442d8d00dcb57f013ff7595edd0ff076930de3675d34117a44eb0e4f832936da44e57e43a3e36bd48d2a85bf4fd4a804e83f2f3cf378a435af5e287d4e27337b4ada11b26219832ec6b2b38446b3b95fe3771e9f42ca30fb21e12f0a3d8bc2d85454af9fcc0232d8fd909448b01f46c593d31ea1c926465e35a4199079c3ca41128b17cb01fbf5b522be0fd02022ada37fecc14b6c8c8831883b85a1106f2f867020d529f17a350f20dd3bf51a98cfda70c2e3638a483fd3f87940bb478b07c4c110394c0093d17955089f2ca97bbe075124c9b1ff6500d536a95d96f03d48596e008bf0a028b539cec796cec9bf585eb80fe3e0d26")
> perf_event_open$cgroup(&(0x7f0000000440)={0x7, 0x70, 0x1, 0x0, 0x6e36,
> 0x38000000, 0x0, 0x1, 0x21060, 0x0, 0x0, 0x0, 0x20, 0x3, 0xfc50, 0x9,
> 0x8000, 0x0, 0x0, 0x0, 0x46c8, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
> 0x0, 0x0, 0x8, 0x3f, 0x401, 0x8001, 0x8, 0x9, 0x0, 0x0, 0x1, 0x1,
> @perf_config_ext={0x100000000, 0x6}, 0x10, 0x5, 0x2, 0x7,
> 0xfffffffffffffff9, 0x0, 0x7}, 0xffffffffffffffff, 0x5,
> 0xffffffffffffffff, 0x0)
> # 0x2 = O_RDWR
> r2 = openat$tun(0xffffffffffffff9c,
> &(0x7f0000001a00)='/dev/net/tun\x00', 0x2, 0x0)
> #  IFF_TAP         0x0002
> #  IFF_NAPI        0x0010
> #  IFF_NAPI_FRAGS  0x0020
> #  IFF_MULTI_QUEUE 0x0100
> #  IFF_MULTICAST = 0x1000
> ioctl$TUNSETIFF(r2, 0x400454ca, &(0x7f0000000300)={"6e72300100",
> 0x1132})
> write$cgroup_int(r2, &(0x7f0000000000), 0x17b)
> 
>

I dunno, I would prefer to not throw all these 
mutex_lock(&tfile->napi_mutex)/mutex_unlock(&tfile->napi_mutex) all over the places.

Please publish a minimal patches, or a patch series explaining why each fix is needed.

I do not really see why tun_napi_disable() and/or tun_napi_del() needs extra synchronization.
Stanislav Fomichev Jan. 7, 2019, 9:29 p.m. UTC | #4
On 01/07, Eric Dumazet wrote:
> 
> 
> On 01/07/2019 01:02 PM, Stanislav Fomichev wrote:
> > On 01/07, Eric Dumazet wrote:
> >> On Mon, Jan 7, 2019 at 12:02 PM Stanislav Fomichev <sdf@google.com> wrote:
> >>>
> >>> BUG: unable to handle kernel NULL pointer dereference at 00000000000000d1
> >>> Call Trace:
> >>>  ? napi_gro_frags+0xa7/0x2c0
> >>>  tun_get_user+0xb50/0xf20
> >>>  tun_chr_write_iter+0x53/0x70
> >>>  new_sync_write+0xff/0x160
> >>>  vfs_write+0x191/0x1e0
> >>>  __x64_sys_write+0x5e/0xd0
> >>>  do_syscall_64+0x47/0xf0
> >>>  entry_SYSCALL_64_after_hwframe+0x44/0xa9
> >>>
> >>> I think there is a subtle race between sending a packet via tap and
> >>> attaching it:
> >>>
> >>> CPU0:                    CPU1:
> >>> tun_chr_ioctl(TUNSETIFF)
> >>>   tun_set_iff
> >>>     tun_attach
> >>>       rcu_assign_pointer(tfile->tun, tun);
> >>>                          tun_fops->write_iter()
> >>>                            tun_chr_write_iter
> >>>                              tun_napi_alloc_frags
> >>>                                napi_get_frags
> >>>                                  napi->skb = napi_alloc_skb
> >>>       tun_napi_init
> >>>         netif_napi_add
> >>>           napi->skb = NULL
> >>>                               napi->skb is NULL here
> >>>                               napi_gro_frags
> >>>                                 napi_frags_skb
> >>>                                   skb = napi->skb
> >>>                                   skb_reset_mac_header(skb)
> >>>                                   panic()
> >>>
> >>> To fix, do the following:
> >>> * Move rcu_assign_pointer(tfile->tun, tun) to be the last thing we do
> >>>   in tun_attach(); this should guarantee that when we call tun_get()
> >>>   we always get an initialized object
> >>> * As another safeguard, always grab napi_mutex whenever doing any
> >>>   napi operation; this should prevent napi state change between
> >>>   calls to napi_get_frags and napi_gro_frags
> >>>
> >>> Reported-by: syzbot <syzkaller@googlegroups.com>
> >>> Fixes: 90e33d459407 ("tun: enable napi_gro_frags() for TUN/TAP driver")
> >>>
> >>> Signed-off-by: Stanislav Fomichev <sdf@google.com>
> >>> ---
> >>>  drivers/net/tun.c | 18 +++++++++++++++---
> >>>  1 file changed, 15 insertions(+), 3 deletions(-)
> >>>
> >>> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
> >>> index a4fdad475594..7875f06011f2 100644
> >>> --- a/drivers/net/tun.c
> >>> +++ b/drivers/net/tun.c
> >>> @@ -323,22 +323,30 @@ static void tun_napi_init(struct tun_struct *tun, struct tun_file *tfile,
> >>>         tfile->napi_enabled = napi_en;
> >>>         tfile->napi_frags_enabled = napi_en && napi_frags;
> >>>         if (napi_en) {
> >>> +               mutex_lock(&tfile->napi_mutex);
> >>>                 netif_napi_add(tun->dev, &tfile->napi, tun_napi_poll,
> >>>                                NAPI_POLL_WEIGHT);
> >>>                 napi_enable(&tfile->napi);
> >>> +               mutex_unlock(&tfile->napi_mutex);
> >>>         }
> >>>  }
> >>>
> >>>  static void tun_napi_disable(struct tun_file *tfile)
> >>>  {
> >>> -       if (tfile->napi_enabled)
> >>> +       if (tfile->napi_enabled) {
> >>> +               mutex_lock(&tfile->napi_mutex);
> >>>                 napi_disable(&tfile->napi);
> >>> +               mutex_unlock(&tfile->napi_mutex);
> >>> +       }
> >>>  }
> >>>
> >>>  static void tun_napi_del(struct tun_file *tfile)
> >>>  {
> >>> -       if (tfile->napi_enabled)
> >>> +       if (tfile->napi_enabled) {
> >>> +               mutex_lock(&tfile->napi_mutex);
> >>>                 netif_napi_del(&tfile->napi);
> >>> +               mutex_unlock(&tfile->napi_mutex);
> >>> +       }
> >>>  }
> >>>
> >>>  static bool tun_napi_frags_enabled(const struct tun_file *tfile)
> >>> @@ -856,7 +864,6 @@ static int tun_attach(struct tun_struct *tun, struct file *file,
> >>>                 err = 0;
> >>>         }
> >>>
> >>> -       rcu_assign_pointer(tfile->tun, tun);
> >>>         rcu_assign_pointer(tun->tfiles[tun->numqueues], tfile);
> >>>         tun->numqueues++;
> >>>
> >>> @@ -876,6 +883,11 @@ static int tun_attach(struct tun_struct *tun, struct file *file,
> >>>          * refcnt.
> >>>          */
> >>>
> >>> +       /* All tun_fops depend on tun_get() returning non-null pointer.
> >>> +        * Thus, assigning tun to a tfile should be the last init operation,
> >>> +        * otherwise we risk using half-initialized object.
> >>> +        */
> >>> +       rcu_assign_pointer(tfile->tun, tun);
> >>>  out:
> >>>         return err;
> >>>  }
> >>
> >> Hmmm I believe the issue is different : We need to call
> >> tun_napi_init() before doing the publish in the tun->tfiles[] array
> >>
> >> My patch was :
> > Still fails with your patch.
> > 
> > Maybe the best way is to move both of those publishes (tfile->tun and
> > tun->tfiles[]) to the end of tun_attach? It looks like tfile->tun
> > is a publish for syscall side (most of tun_socket_ops and tun_fops call
> > tun_get which looks at tun->dev and bail out if it's NULL) and tun->tfiles
> > is (mostly, but not really) for napi side.
> > 
> > Here is a repro I'm using if you want to poke it:
> > 
> > syz-execprog -threaded -collide -repeat=0 -procs=6 <rep.syz>
> > 
> > # See https://goo.gl/kgGztJ for information about syzkaller reproducers.
> > #{"threaded":true,"collide":true,"repeat":true,"procs":6,"sandbox":"none","fault_call":-1,"tun":true,"tmpdir":true,"cgroups":true,"netdev":true,"resetnet":true,"segv":true}
> > openat$tun(0xffffffffffffff9c, 0x0, 0x0, 0x0)
> > r0 = openat$tun(0xffffffffffffff9c,
> > &(0x7f0000001a00)='/dev/net/tun\x00', 0x0, 0x0)
> > ioctl$TUNSETIFF(r0, 0x400454ca, &(0x7f0000000300)={"6e72300100",
> > 0x1132})
> > r1 = socket$kcm(0x2, 0x3, 0x2)
> > ioctl$PERF_EVENT_IOC_SET_FILTER(r1, 0x8914,
> > &(0x7f0000000780)="6e7230010060a19ef9d2c673d9a1571cb9e1369bcd61ef7e49793ae18712eceb1daa769497800b7fbbd35b170c10751d39aeb660d863e49b8c4f3b3dad48902b5b2d6cfd0abd372c63bcf5d70df3fd4d2e8d443c88bc0e5637dd82fc3435bed4de5d693c9a781c863e05d8a6f8689a5be29216061f3ff53f8b6b396678e7ba155ef9152d7e43b1eccb2331eb8eb1ed5586dcf8b3b0b999361a44ff2c22c2abbef42dd24eabe6723346a6e46c0499a21442d8d00dcb57f013ff7595edd0ff076930de3675d34117a44eb0e4f832936da44e57e43a3e36bd48d2a85bf4fd4a804e83f2f3cf378a435af5e287d4e27337b4ada11b26219832ec6b2b38446b3b95fe3771e9f42ca30fb21e12f0a3d8bc2d85454af9fcc0232d8fd909448b01f46c593d31ea1c926465e35a4199079c3ca41128b17cb01fbf5b522be0fd02022ada37fecc14b6c8c8831883b85a1106f2f867020d529f17a350f20dd3bf51a98cfda70c2e3638a483fd3f87940bb478b07c4c110394c0093d17955089f2ca97bbe075124c9b1ff6500d536a95d96f03d48596e008bf0a028b539cec796cec9bf585eb80fe3e0d26")
> > perf_event_open$cgroup(&(0x7f0000000440)={0x7, 0x70, 0x1, 0x0, 0x6e36,
> > 0x38000000, 0x0, 0x1, 0x21060, 0x0, 0x0, 0x0, 0x20, 0x3, 0xfc50, 0x9,
> > 0x8000, 0x0, 0x0, 0x0, 0x46c8, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
> > 0x0, 0x0, 0x8, 0x3f, 0x401, 0x8001, 0x8, 0x9, 0x0, 0x0, 0x1, 0x1,
> > @perf_config_ext={0x100000000, 0x6}, 0x10, 0x5, 0x2, 0x7,
> > 0xfffffffffffffff9, 0x0, 0x7}, 0xffffffffffffffff, 0x5,
> > 0xffffffffffffffff, 0x0)
> > # 0x2 = O_RDWR
> > r2 = openat$tun(0xffffffffffffff9c,
> > &(0x7f0000001a00)='/dev/net/tun\x00', 0x2, 0x0)
> > #  IFF_TAP         0x0002
> > #  IFF_NAPI        0x0010
> > #  IFF_NAPI_FRAGS  0x0020
> > #  IFF_MULTI_QUEUE 0x0100
> > #  IFF_MULTICAST = 0x1000
> > ioctl$TUNSETIFF(r2, 0x400454ca, &(0x7f0000000300)={"6e72300100",
> > 0x1132})
> > write$cgroup_int(r2, &(0x7f0000000000), 0x17b)
> > 
> >
> 
> I dunno, I would prefer to not throw all these 
> mutex_lock(&tfile->napi_mutex)/mutex_unlock(&tfile->napi_mutex) all over the places.
> 
> Please publish a minimal patches, or a patch series explaining why each fix is needed.
> 
> I do not really see why tun_napi_disable() and/or tun_napi_del() needs extra synchronization.
They don't, I've added them mostly for consistency.

The main issue I was trying to fix (and the one I think I'm hitting) is
the one in tun_get_user, where we do the following:
1. mutex_lock(&tfile->napi_mutex);
2. skb = tun_napi_alloc_frags(tfile, copylen, from);
  * skb = napi_get_frags(&tfile->napi);
    * napi->skb = napi_alloc_skb()
3. <something happens here that sets napi->skb to NULL>
4. napi_gro_frags(&tfile->napi);
  * skb = napi_frags_skb(napi);
    * struct sk_buff *skb = napi->skb;
    * skb_reset_mac_header(skb);
    * null pointer deref
5. mutex_unlock(&tfile->napi_mutex);

We can basically always grab napi_mutex for all napi-related operations to
make sure nothing happens to napi->skb in between napi_get_frags and
napi_frags_skb or don't publish tfile->tun so tun_get_user never gets called.

I'll send a v2 without napi_mutex'es, I've added them mostly as an
additional safeguard. Publishing both tfile->tun and tun->tfiles at
the end of tun_attach should be enough.
diff mbox series

Patch

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index a4fdad475594..7875f06011f2 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -323,22 +323,30 @@  static void tun_napi_init(struct tun_struct *tun, struct tun_file *tfile,
 	tfile->napi_enabled = napi_en;
 	tfile->napi_frags_enabled = napi_en && napi_frags;
 	if (napi_en) {
+		mutex_lock(&tfile->napi_mutex);
 		netif_napi_add(tun->dev, &tfile->napi, tun_napi_poll,
 			       NAPI_POLL_WEIGHT);
 		napi_enable(&tfile->napi);
+		mutex_unlock(&tfile->napi_mutex);
 	}
 }
 
 static void tun_napi_disable(struct tun_file *tfile)
 {
-	if (tfile->napi_enabled)
+	if (tfile->napi_enabled) {
+		mutex_lock(&tfile->napi_mutex);
 		napi_disable(&tfile->napi);
+		mutex_unlock(&tfile->napi_mutex);
+	}
 }
 
 static void tun_napi_del(struct tun_file *tfile)
 {
-	if (tfile->napi_enabled)
+	if (tfile->napi_enabled) {
+		mutex_lock(&tfile->napi_mutex);
 		netif_napi_del(&tfile->napi);
+		mutex_unlock(&tfile->napi_mutex);
+	}
 }
 
 static bool tun_napi_frags_enabled(const struct tun_file *tfile)
@@ -856,7 +864,6 @@  static int tun_attach(struct tun_struct *tun, struct file *file,
 		err = 0;
 	}
 
-	rcu_assign_pointer(tfile->tun, tun);
 	rcu_assign_pointer(tun->tfiles[tun->numqueues], tfile);
 	tun->numqueues++;
 
@@ -876,6 +883,11 @@  static int tun_attach(struct tun_struct *tun, struct file *file,
 	 * refcnt.
 	 */
 
+	/* All tun_fops depend on tun_get() returning non-null pointer.
+	 * Thus, assigning tun to a tfile should be the last init operation,
+	 * otherwise we risk using half-initialized object.
+	 */
+	rcu_assign_pointer(tfile->tun, tun);
 out:
 	return err;
 }