diff mbox series

[v2,net-next,2/2] tun: enable napi_gro_frags() for TUN/TAP driver

Message ID 20170922021715.2618-3-peterpenkov96@gmail.com
State Superseded, archived
Delegated to: David Miller
Headers show
Series Improve code coverage of syzkaller | expand

Commit Message

Petar Penkov Sept. 22, 2017, 2:17 a.m. UTC
Add a TUN/TAP receive mode that exercises the napi_gro_frags()
interface. This mode is available only in TAP mode, as the interface
expects packets with Ethernet headers.

Furthermore, packets follow the layout of the iovec_iter that was
received. The first iovec is the linear data, and every one after the
first is a fragment. If there are more fragments than the max number,
drop the packet. Additionally, invoke eth_get_headlen() to exercise flow
dissector code and to verify that the header resides in the linear data.

The napi_gro_frags() mode requires setting the IFF_NAPI_FRAGS option.
This is imposed because this mode is intended for testing via tools like
syzkaller and packetdrill, and the increased flexibility it provides can
introduce security vulnerabilities. This flag is accepted only if the
device is in TAP mode and has the IFF_NAPI flag set as well. This is
done because both of these are explicit requirements for correct
operation in this mode.

Signed-off-by: Petar Penkov <peterpenkov96@gmail.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Mahesh Bandewar <maheshb@google.com>
Cc: Willem de Bruijn <willemb@google.com>
Cc: davem@davemloft.net
Cc: ppenkov@stanford.edu
---
 drivers/net/tun.c           | 131 ++++++++++++++++++++++++++++++++++++++++++--
 include/uapi/linux/if_tun.h |   1 +
 2 files changed, 126 insertions(+), 6 deletions(-)

Comments

Willem de Bruijn Sept. 22, 2017, 2:06 p.m. UTC | #1
> @@ -2061,6 +2174,9 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
>         if (tfile->detached)
>                 return -EINVAL;
>
> +       if ((ifr->ifr_flags & IFF_NAPI_FRAGS) && !capable(CAP_NET_ADMIN))
> +               return -EPERM;
> +

This should perhaps be moved into the !dev branch, directly below the
ns_capable check.

>         dev = __dev_get_by_name(net, ifr->ifr_name);
>         if (dev) {
>                 if (ifr->ifr_flags & IFF_TUN_EXCL)
> @@ -2185,6 +2301,9 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
>         tun->flags = (tun->flags & ~TUN_FEATURES) |
>                 (ifr->ifr_flags & TUN_FEATURES);
>
> +       if (!(tun->flags & IFF_NAPI) || (tun->flags & TUN_TYPE_MASK) != IFF_TAP)
> +               tun->flags = tun->flags & ~IFF_NAPI_FRAGS;
> +

Similarly, this check only need to be performed in that branch.
Instead of reverting to non-frags mode, a tun_set_iff with the wrong
set of flags should probably fail hard.
On Fri, Sep 22, 2017 at 7:06 AM, Willem de Bruijn
<willemdebruijn.kernel@gmail.com> wrote:
>> @@ -2061,6 +2174,9 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
>>         if (tfile->detached)
>>                 return -EINVAL;
>>
>> +       if ((ifr->ifr_flags & IFF_NAPI_FRAGS) && !capable(CAP_NET_ADMIN))
>> +               return -EPERM;
>> +
>
> This should perhaps be moved into the !dev branch, directly below the
> ns_capable check.
>
Hmm, does that mean fail only on creation but allow to attach if
exists? That would be wrong, isn't it? Correct me if I'm wrong but we
want to prevent both these scenarios if user does not have sufficient
privileges (i.e. NET_ADMIN in init-ns).

>>         dev = __dev_get_by_name(net, ifr->ifr_name);
>>         if (dev) {
>>                 if (ifr->ifr_flags & IFF_TUN_EXCL)
>> @@ -2185,6 +2301,9 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
>>         tun->flags = (tun->flags & ~TUN_FEATURES) |
>>                 (ifr->ifr_flags & TUN_FEATURES);
>>
>> +       if (!(tun->flags & IFF_NAPI) || (tun->flags & TUN_TYPE_MASK) != IFF_TAP)
>> +               tun->flags = tun->flags & ~IFF_NAPI_FRAGS;
>> +
>
> Similarly, this check only need to be performed in that branch.
> Instead of reverting to non-frags mode, a tun_set_iff with the wrong
> set of flags should probably fail hard.
Yes, agree, wrong set of flags should fail hard and probably be done
before attach or open, no?
Petar Penkov Sept. 22, 2017, 5:48 p.m. UTC | #3
On Fri, Sep 22, 2017 at 9:51 AM, Mahesh Bandewar (महेश बंडेवार)
<maheshb@google.com> wrote:
> On Fri, Sep 22, 2017 at 7:06 AM, Willem de Bruijn
> <willemdebruijn.kernel@gmail.com> wrote:
>>> @@ -2061,6 +2174,9 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
>>>         if (tfile->detached)
>>>                 return -EINVAL;
>>>
>>> +       if ((ifr->ifr_flags & IFF_NAPI_FRAGS) && !capable(CAP_NET_ADMIN))
>>> +               return -EPERM;
>>> +
>>
>> This should perhaps be moved into the !dev branch, directly below the
>> ns_capable check.
>>
> Hmm, does that mean fail only on creation but allow to attach if
> exists? That would be wrong, isn't it? Correct me if I'm wrong but we
> want to prevent both these scenarios if user does not have sufficient
> privileges (i.e. NET_ADMIN in init-ns).
>
My understanding is we want to protect both scenarios.
>>>         dev = __dev_get_by_name(net, ifr->ifr_name);
>>>         if (dev) {
>>>                 if (ifr->ifr_flags & IFF_TUN_EXCL)
>>> @@ -2185,6 +2301,9 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
>>>         tun->flags = (tun->flags & ~TUN_FEATURES) |
>>>                 (ifr->ifr_flags & TUN_FEATURES);
>>>
>>> +       if (!(tun->flags & IFF_NAPI) || (tun->flags & TUN_TYPE_MASK) != IFF_TAP)
>>> +               tun->flags = tun->flags & ~IFF_NAPI_FRAGS;
>>> +
>>
>> Similarly, this check only need to be performed in that branch.
>> Instead of reverting to non-frags mode, a tun_set_iff with the wrong
>> set of flags should probably fail hard.
> Yes, agree, wrong set of flags should fail hard and probably be done
> before attach or open, no?
Agreed, in v3 I will push this check before the conditional so both
branches can be rejected with EINVAL.
Willem de Bruijn Sept. 22, 2017, 5:50 p.m. UTC | #4
On Fri, Sep 22, 2017 at 1:48 PM, Petar Penkov <peterpenkov96@gmail.com> wrote:
> On Fri, Sep 22, 2017 at 9:51 AM, Mahesh Bandewar (महेश बंडेवार)
> <maheshb@google.com> wrote:
>> On Fri, Sep 22, 2017 at 7:06 AM, Willem de Bruijn
>> <willemdebruijn.kernel@gmail.com> wrote:
>>>> @@ -2061,6 +2174,9 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
>>>>         if (tfile->detached)
>>>>                 return -EINVAL;
>>>>
>>>> +       if ((ifr->ifr_flags & IFF_NAPI_FRAGS) && !capable(CAP_NET_ADMIN))
>>>> +               return -EPERM;
>>>> +
>>>
>>> This should perhaps be moved into the !dev branch, directly below the
>>> ns_capable check.
>>>
>> Hmm, does that mean fail only on creation but allow to attach if
>> exists? That would be wrong, isn't it? Correct me if I'm wrong but we
>> want to prevent both these scenarios if user does not have sufficient
>> privileges (i.e. NET_ADMIN in init-ns).

Ok.

>>
> My understanding is we want to protect both scenarios.
>>>>         dev = __dev_get_by_name(net, ifr->ifr_name);
>>>>         if (dev) {
>>>>                 if (ifr->ifr_flags & IFF_TUN_EXCL)
>>>> @@ -2185,6 +2301,9 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
>>>>         tun->flags = (tun->flags & ~TUN_FEATURES) |
>>>>                 (ifr->ifr_flags & TUN_FEATURES);
>>>>
>>>> +       if (!(tun->flags & IFF_NAPI) || (tun->flags & TUN_TYPE_MASK) != IFF_TAP)
>>>> +               tun->flags = tun->flags & ~IFF_NAPI_FRAGS;
>>>> +
>>>
>>> Similarly, this check only need to be performed in that branch.
>>> Instead of reverting to non-frags mode, a tun_set_iff with the wrong
>>> set of flags should probably fail hard.
>> Yes, agree, wrong set of flags should fail hard and probably be done
>> before attach or open, no?
> Agreed, in v3 I will push this check before the conditional so both
> branches can be rejected with EINVAL.

Sounds great.
diff mbox series

Patch

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index f16407242b18..e398f019c590 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -75,6 +75,7 @@ 
 #include <linux/skb_array.h>
 #include <linux/bpf.h>
 #include <linux/bpf_trace.h>
+#include <linux/mutex.h>
 
 #include <linux/uaccess.h>
 
@@ -121,7 +122,8 @@  do {								\
 #define TUN_VNET_BE     0x40000000
 
 #define TUN_FEATURES (IFF_NO_PI | IFF_ONE_QUEUE | IFF_VNET_HDR | \
-		      IFF_MULTI_QUEUE | IFF_NAPI)
+		      IFF_MULTI_QUEUE | IFF_NAPI | IFF_NAPI_FRAGS)
+
 #define GOODCOPY_LEN 128
 
 #define FLT_EXACT_COUNT 8
@@ -173,6 +175,7 @@  struct tun_file {
 		unsigned int ifindex;
 	};
 	struct napi_struct napi;
+	struct mutex napi_mutex;	/* Protects access to the above napi */
 	struct list_head next;
 	struct tun_struct *detached;
 	struct skb_array tx_array;
@@ -277,6 +280,7 @@  static void tun_napi_init(struct tun_struct *tun, struct tun_file *tfile,
 		netif_napi_add(tun->dev, &tfile->napi, tun_napi_poll,
 			       NAPI_POLL_WEIGHT);
 		napi_enable(&tfile->napi);
+		mutex_init(&tfile->napi_mutex);
 	}
 }
 
@@ -292,6 +296,11 @@  static void tun_napi_del(struct tun_struct *tun, struct tun_file *tfile)
 		netif_napi_del(&tfile->napi);
 }
 
+static bool tun_napi_frags_enabled(const struct tun_struct *tun)
+{
+	return READ_ONCE(tun->flags) & IFF_NAPI_FRAGS;
+}
+
 #ifdef CONFIG_TUN_VNET_CROSS_LE
 static inline bool tun_legacy_is_little_endian(struct tun_struct *tun)
 {
@@ -1036,7 +1045,8 @@  static void tun_poll_controller(struct net_device *dev)
 	 * supports polling, which enables bridge devices in virt setups to
 	 * still use netconsole
 	 * If NAPI is enabled, however, we need to schedule polling for all
-	 * queues.
+	 * queues unless we are using napi_gro_frags(), which we call in
+	 * process context and not in NAPI context.
 	 */
 	struct tun_struct *tun = netdev_priv(dev);
 
@@ -1044,6 +1054,9 @@  static void tun_poll_controller(struct net_device *dev)
 		struct tun_file *tfile;
 		int i;
 
+		if (tun_napi_frags_enabled(tun))
+			return;
+
 		rcu_read_lock();
 		for (i = 0; i < tun->numqueues; i++) {
 			tfile = rcu_dereference(tun->tfiles[i]);
@@ -1266,6 +1279,64 @@  static unsigned int tun_chr_poll(struct file *file, poll_table *wait)
 	return mask;
 }
 
+static struct sk_buff *tun_napi_alloc_frags(struct tun_file *tfile,
+					    size_t len,
+					    const struct iov_iter *it)
+{
+	struct sk_buff *skb;
+	size_t linear;
+	int err;
+	int i;
+
+	if (it->nr_segs > MAX_SKB_FRAGS + 1)
+		return ERR_PTR(-ENOMEM);
+
+	local_bh_disable();
+	skb = napi_get_frags(&tfile->napi);
+	local_bh_enable();
+	if (!skb)
+		return ERR_PTR(-ENOMEM);
+
+	linear = iov_iter_single_seg_count(it);
+	err = __skb_grow(skb, linear);
+	if (err)
+		goto free;
+
+	skb->len = len;
+	skb->data_len = len - linear;
+	skb->truesize += skb->data_len;
+
+	for (i = 1; i < it->nr_segs; i++) {
+		size_t fragsz = it->iov[i].iov_len;
+		unsigned long offset;
+		struct page *page;
+		void *data;
+
+		if (fragsz == 0 || fragsz > PAGE_SIZE) {
+			err = -EINVAL;
+			goto free;
+		}
+
+		local_bh_disable();
+		data = napi_alloc_frag(fragsz);
+		local_bh_enable();
+		if (!data) {
+			err = -ENOMEM;
+			goto free;
+		}
+
+		page = virt_to_head_page(data);
+		offset = data - page_address(page);
+		skb_fill_page_desc(skb, i - 1, page, offset, fragsz);
+	}
+
+	return skb;
+free:
+	/* frees skb and all frags allocated with napi_alloc_frag() */
+	napi_free_frags(&tfile->napi);
+	return ERR_PTR(err);
+}
+
 /* prepad is the amount to reserve at front.  len is length after that.
  * linear is a hint as to how much to copy (usually headers). */
 static struct sk_buff *tun_alloc_skb(struct tun_file *tfile,
@@ -1478,6 +1549,7 @@  static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
 	int err;
 	u32 rxhash;
 	int skb_xdp = 1;
+	bool frags = tun_napi_frags_enabled(tun);
 
 	if (!(tun->dev->flags & IFF_UP))
 		return -EIO;
@@ -1535,7 +1607,7 @@  static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
 			zerocopy = true;
 	}
 
-	if (tun_can_build_skb(tun, tfile, len, noblock, zerocopy)) {
+	if (!frags && tun_can_build_skb(tun, tfile, len, noblock, zerocopy)) {
 		/* For the packet that is not easy to be processed
 		 * (e.g gso or jumbo packet), we will do it at after
 		 * skb was created with generic XDP routine.
@@ -1556,10 +1628,24 @@  static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
 				linear = tun16_to_cpu(tun, gso.hdr_len);
 		}
 
-		skb = tun_alloc_skb(tfile, align, copylen, linear, noblock);
+		if (frags) {
+			mutex_lock(&tfile->napi_mutex);
+			skb = tun_napi_alloc_frags(tfile, copylen, from);
+			/* tun_napi_alloc_frags() enforces a layout for the skb.
+			 * If zerocopy is enabled, then this layout will be
+			 * overwritten by zerocopy_sg_from_iter().
+			 */
+			zerocopy = false;
+		} else {
+			skb = tun_alloc_skb(tfile, align, copylen, linear,
+					    noblock);
+		}
+
 		if (IS_ERR(skb)) {
 			if (PTR_ERR(skb) != -EAGAIN)
 				this_cpu_inc(tun->pcpu_stats->rx_dropped);
+			if (frags)
+				mutex_unlock(&tfile->napi_mutex);
 			return PTR_ERR(skb);
 		}
 
@@ -1571,6 +1657,11 @@  static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
 		if (err) {
 			this_cpu_inc(tun->pcpu_stats->rx_dropped);
 			kfree_skb(skb);
+			if (frags) {
+				tfile->napi.skb = NULL;
+				mutex_unlock(&tfile->napi_mutex);
+			}
+
 			return -EFAULT;
 		}
 	}
@@ -1578,6 +1669,11 @@  static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
 	if (virtio_net_hdr_to_skb(skb, &gso, tun_is_little_endian(tun))) {
 		this_cpu_inc(tun->pcpu_stats->rx_frame_errors);
 		kfree_skb(skb);
+		if (frags) {
+			tfile->napi.skb = NULL;
+			mutex_unlock(&tfile->napi_mutex);
+		}
+
 		return -EINVAL;
 	}
 
@@ -1603,7 +1699,8 @@  static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
 		skb->dev = tun->dev;
 		break;
 	case IFF_TAP:
-		skb->protocol = eth_type_trans(skb, tun->dev);
+		if (!frags)
+			skb->protocol = eth_type_trans(skb, tun->dev);
 		break;
 	}
 
@@ -1638,7 +1735,23 @@  static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
 
 	rxhash = __skb_get_hash_symmetric(skb);
 
-	if (tun->flags & IFF_NAPI) {
+	if (frags) {
+		/* Exercise flow dissector code path. */
+		u32 headlen = eth_get_headlen(skb->data, skb_headlen(skb));
+
+		if (headlen > skb_headlen(skb) || headlen < ETH_HLEN) {
+			this_cpu_inc(tun->pcpu_stats->rx_dropped);
+			napi_free_frags(&tfile->napi);
+			mutex_unlock(&tfile->napi_mutex);
+			WARN_ON(1);
+			return -ENOMEM;
+		}
+
+		local_bh_disable();
+		napi_gro_frags(&tfile->napi);
+		local_bh_enable();
+		mutex_unlock(&tfile->napi_mutex);
+	} else if (tun->flags & IFF_NAPI) {
 		struct sk_buff_head *queue = &tfile->sk.sk_write_queue;
 		int queue_len;
 
@@ -2061,6 +2174,9 @@  static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
 	if (tfile->detached)
 		return -EINVAL;
 
+	if ((ifr->ifr_flags & IFF_NAPI_FRAGS) && !capable(CAP_NET_ADMIN))
+		return -EPERM;
+
 	dev = __dev_get_by_name(net, ifr->ifr_name);
 	if (dev) {
 		if (ifr->ifr_flags & IFF_TUN_EXCL)
@@ -2185,6 +2301,9 @@  static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
 	tun->flags = (tun->flags & ~TUN_FEATURES) |
 		(ifr->ifr_flags & TUN_FEATURES);
 
+	if (!(tun->flags & IFF_NAPI) || (tun->flags & TUN_TYPE_MASK) != IFF_TAP)
+		tun->flags = tun->flags & ~IFF_NAPI_FRAGS;
+
 	/* Make sure persistent devices do not get stuck in
 	 * xoff state.
 	 */
diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h
index 30b6184884eb..365ade5685c9 100644
--- a/include/uapi/linux/if_tun.h
+++ b/include/uapi/linux/if_tun.h
@@ -61,6 +61,7 @@ 
 #define IFF_TUN		0x0001
 #define IFF_TAP		0x0002
 #define IFF_NAPI	0x0010
+#define IFF_NAPI_FRAGS	0x0020
 #define IFF_NO_PI	0x1000
 /* This flag has no real effect */
 #define IFF_ONE_QUEUE	0x2000