[bpf-next,v3,3/4] xsk: use state member for socket synchronization

Message ID 20190904114913.17217-4-bjorn.topel@gmail.com
State Accepted
Delegated to: BPF Maintainers
Series xsk: various CPU barrier and {READ, WRITE}_ONCE

Commit Message

Björn Töpel Sept. 4, 2019, 11:49 a.m. UTC
From: Björn Töpel <bjorn.topel@intel.com>

Prior to the state variable being introduced by Ilya, the dev member
was used to determine whether the socket was bound or not. However,
when dev was read, proper SMP barriers and READ_ONCE were missing. In
order to address the missing barriers and READ_ONCE, we start using
the state variable as a point of synchronization. The state member
reads/writes are paired with proper SMP barriers, and from this it
follows that the members described above do not need READ_ONCE when
used in conjunction with the state check.

In all syscalls and the xsk_rcv path we check whether state is
XSK_BOUND. If that is the case, we issue an SMP read barrier, which
implies that dev, umem and all rings are correctly set up. Note that
no READ_ONCE is needed for these variables when they are used while
state is XSK_BOUND (plus the read barrier).

To summarize: the struct xdp_sock members dev, queue_id, umem, fq,
cq, tx, rx, and state were read lock-less, with incorrect barriers
and missing {READ, WRITE}_ONCE. Now, umem, fq, cq, tx, rx, and state
are read lock-less. When these members are updated, WRITE_ONCE is
used. When read, READ_ONCE is only used when the read is done outside
the control mutex (e.g. mmap) or is not synchronized with the state
member (XSK_BOUND plus smp_rmb()).

Note that dev and queue_id do not need WRITE_ONCE or READ_ONCE, due
to the introduced state synchronization (XSK_BOUND plus smp_rmb()).

Introducing the state check also fixes a race, found by syzkaller, in
xsk_poll() where umem could be accessed while stale.

Suggested-by: Hillf Danton <hdanton@sina.com>
Reported-by: syzbot+c82697e3043781e08802@syzkaller.appspotmail.com
Fixes: 77cd0d7b3f25 ("xsk: add support for need_wakeup flag in AF_XDP rings")
Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
---
 net/xdp/xsk.c | 54 +++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 39 insertions(+), 15 deletions(-)
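
As a quick illustration of the pattern the patch relies on: the bind
path fully initializes dev, umem and the rings, issues smp_wmb(), and
only then stores XSK_BOUND with WRITE_ONCE(); readers such as
xsk_is_bound() pair READ_ONCE() on state with smp_rmb() before
touching any of those members. The sketch below is a minimal
user-space analogue using C11 release/acquire atomics in place of the
kernel barrier macros; the names fake_xsk, fake_bind and
fake_is_bound are made up for this example and are not part of the
kernel sources.

/*
 * Minimal user-space analogue of the publish/consume pattern used in
 * the patch. C11 release/acquire stands in for smp_wmb()/smp_rmb()
 * plus WRITE_ONCE()/READ_ONCE(); struct and function names are
 * illustrative only.
 */
#include <stdatomic.h>
#include <stdbool.h>

enum fake_state { XSK_READY = 0, XSK_BOUND, XSK_UNBOUND };

struct fake_xsk {
	void *dev;			/* stand-ins for dev, umem, rings */
	void *umem;
	_Atomic enum fake_state state;
};

/* Writer (bind path): set up all members, then publish the socket. */
static void fake_bind(struct fake_xsk *xs, void *dev, void *umem)
{
	xs->dev = dev;
	xs->umem = umem;
	/* Release store: everything written above is visible to any
	 * reader that observes XSK_BOUND with an acquire load.
	 */
	atomic_store_explicit(&xs->state, XSK_BOUND, memory_order_release);
}

/* Reader (syscalls, rcv path): only dereference members once bound. */
static bool fake_is_bound(struct fake_xsk *xs)
{
	return atomic_load_explicit(&xs->state, memory_order_acquire) ==
	       XSK_BOUND;
}

int main(void)
{
	static struct fake_xsk xs;	/* zero-init: state == XSK_READY */
	int dev_stub, umem_stub;

	fake_bind(&xs, &dev_stub, &umem_stub);
	return fake_is_bound(&xs) ? 0 : 1;
}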

Comments

Jonathan Lemon Sept. 4, 2019, 4:40 p.m. UTC | #1
On 4 Sep 2019, at 4:49, Björn Töpel wrote:

> From: Björn Töpel <bjorn.topel@intel.com>
>
> [...]
> Signed-off-by: Björn Töpel <bjorn.topel@intel.com>

Acked-by: Jonathan Lemon <jonathan.lemon@gmail.com>

Patch

diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 8c9056f06989..c2f1af3b6a7c 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -186,10 +186,23 @@  static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
 	return err;
 }
 
+static bool xsk_is_bound(struct xdp_sock *xs)
+{
+	if (READ_ONCE(xs->state) == XSK_BOUND) {
+		/* Matches smp_wmb() in bind(). */
+		smp_rmb();
+		return true;
+	}
+	return false;
+}
+
 int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
 {
 	u32 len;
 
+	if (!xsk_is_bound(xs))
+		return -EINVAL;
+
 	if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index)
 		return -EINVAL;
 
@@ -387,7 +400,7 @@  static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
 	struct sock *sk = sock->sk;
 	struct xdp_sock *xs = xdp_sk(sk);
 
-	if (unlikely(!xs->dev))
+	if (unlikely(!xsk_is_bound(xs)))
 		return -ENXIO;
 	if (unlikely(!(xs->dev->flags & IFF_UP)))
 		return -ENETDOWN;
@@ -403,10 +416,15 @@  static unsigned int xsk_poll(struct file *file, struct socket *sock,
 			     struct poll_table_struct *wait)
 {
 	unsigned int mask = datagram_poll(file, sock, wait);
-	struct sock *sk = sock->sk;
-	struct xdp_sock *xs = xdp_sk(sk);
-	struct net_device *dev = xs->dev;
-	struct xdp_umem *umem = xs->umem;
+	struct xdp_sock *xs = xdp_sk(sock->sk);
+	struct net_device *dev;
+	struct xdp_umem *umem;
+
+	if (unlikely(!xsk_is_bound(xs)))
+		return mask;
+
+	dev = xs->dev;
+	umem = xs->umem;
 
 	if (umem->need_wakeup)
 		dev->netdev_ops->ndo_xsk_wakeup(dev, xs->queue_id,
@@ -442,10 +460,9 @@  static void xsk_unbind_dev(struct xdp_sock *xs)
 {
 	struct net_device *dev = xs->dev;
 
-	if (!dev || xs->state != XSK_BOUND)
+	if (xs->state != XSK_BOUND)
 		return;
-
-	xs->state = XSK_UNBOUND;
+	WRITE_ONCE(xs->state, XSK_UNBOUND);
 
 	/* Wait for driver to stop using the xdp socket. */
 	xdp_del_sk_umem(xs->umem, xs);
@@ -520,7 +537,9 @@  static int xsk_release(struct socket *sock)
 	local_bh_enable();
 
 	xsk_delete_from_maps(xs);
+	mutex_lock(&xs->mutex);
 	xsk_unbind_dev(xs);
+	mutex_unlock(&xs->mutex);
 
 	xskq_destroy(xs->rx);
 	xskq_destroy(xs->tx);
@@ -632,12 +651,12 @@  static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
 		}
 
 		umem_xs = xdp_sk(sock->sk);
-		if (!umem_xs->umem) {
-			/* No umem to inherit. */
+		if (!xsk_is_bound(umem_xs)) {
 			err = -EBADF;
 			sockfd_put(sock);
 			goto out_unlock;
-		} else if (umem_xs->dev != dev || umem_xs->queue_id != qid) {
+		}
+		if (umem_xs->dev != dev || umem_xs->queue_id != qid) {
 			err = -EINVAL;
 			sockfd_put(sock);
 			goto out_unlock;
@@ -671,10 +690,15 @@  static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
 	xdp_add_sk_umem(xs->umem, xs);
 
 out_unlock:
-	if (err)
+	if (err) {
 		dev_put(dev);
-	else
-		xs->state = XSK_BOUND;
+	} else {
+		/* Matches smp_rmb() in bind() for shared umem
+		 * sockets, and xsk_is_bound().
+		 */
+		smp_wmb();
+		WRITE_ONCE(xs->state, XSK_BOUND);
+	}
 out_release:
 	mutex_unlock(&xs->mutex);
 	rtnl_unlock();
@@ -927,7 +951,7 @@  static int xsk_mmap(struct file *file, struct socket *sock,
 	unsigned long pfn;
 	struct page *qpg;
 
-	if (xs->state != XSK_READY)
+	if (READ_ONCE(xs->state) != XSK_READY)
 		return -EBUSY;
 
 	if (offset == XDP_PGOFF_RX_RING) {