[RFC,net-next,5/6] rds: support for zcopy completion notification

Message ID 9ff41bc8f61a112138287b5029369a9910477811.1516147540.git.sowmini.varadhan@oracle.com
State RFC, archived
Delegated to: David Miller
Series rds: zerocopy support

Commit Message

Sowmini Varadhan Jan. 17, 2018, 12:20 p.m. UTC
RDS removes a datagram from the retransmit queue when an ACK is
received. The ACK indicates that the receiver has queued the
RDS datagram, so that the sender can safely forget the datagram.

If the datagram to be removed had pinned pages set up, add
an entry to the rs->rs_znotify_queue so that the notification
will be sent up via rds_rm_zerocopy_callback() when the
rds_message is eventually freed by rds_message_purge.

Signed-off-by: Sowmini Varadhan <sowmini.varadhan@oracle.com>
---
 net/rds/af_rds.c  |    3 ++
 net/rds/message.c |   67 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
 net/rds/rds.h     |   13 +++++++++-
 net/rds/recv.c    |    3 ++
 net/rds/send.c    |    7 +++++
 5 files changed, 91 insertions(+), 2 deletions(-)

Comments

Willem de Bruijn Jan. 18, 2018, 12:23 a.m. UTC | #1
On Wed, Jan 17, 2018 at 7:20 AM, Sowmini Varadhan
<sowmini.varadhan@oracle.com> wrote:
> RDS removes a datagram from the retransmit queue when an ACK is
> received. The ACK indicates that the receiver has queued the
> RDS datagram, so that the sender can safely forget the datagram.
>
> If the datagram to be removed had pinned pages set up, add
> an entry to the rs->rs_znotify_queue so that the notification
> will be sent up via rds_rm_zerocopy_callback() when the
> rds_message is eventually freed by rds_message_purge.
>
> Signed-off-by: Sowmini Varadhan <sowmini.varadhan@oracle.com>
> ---

> +static void rds_rm_zerocopy_callback(struct rds_sock *rs)
> +{
> +       struct sock *sk = rds_rs_to_sk(rs);
> +       struct sk_buff *skb;
> +       struct sock_exterr_skb *serr;
> +       struct sk_buff_head *q;
> +       unsigned long flags;
> +       struct sk_buff *tail;
> +       u32 *ptr;
> +       int ncookies = 0, i;
> +       struct rds_znotifier *znotif, *ztmp;
> +       LIST_HEAD(tmp_list);
> +
> +       spin_lock_irqsave(&rs->rs_lock, flags);
> +       list_splice(&rs->rs_znotify_queue, &tmp_list);
> +       INIT_LIST_HEAD(&rs->rs_znotify_queue);
> +       spin_unlock_irqrestore(&rs->rs_lock, flags);
> +
> +       list_for_each_entry_safe(znotif, ztmp, &tmp_list, z_list)
> +               ncookies++;
> +       if (ncookies == 0)
> +               return;
> +       skb = alloc_skb(ncookies * sizeof(u32), GFP_ATOMIC);
> +       if (!skb) {
> +               spin_lock_irqsave(&rs->rs_lock, flags);
> +               list_splice(&tmp_list, &rs->rs_znotify_queue);
> +               spin_unlock_irqrestore(&rs->rs_lock, flags);
> +               return;
> +       }

TCP zerocopy avoids this issue by allocating the notification skb when the
zerocopy packet is created, which would be rds_message_copy_from_user.

This does not add an allocation if you use the same trick of stashing
the intermediate notification object (op_mmp_znotifier) in skb->cb. Though
alloc_skb is probably more expensive than that kzalloc, if nothing else
because of more zeroing.
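
A rough sketch of that approach (names like RDS_ZCOPY_SKB() and
rds_zcopy_notifier_alloc() are illustrative, not from this patch): allocate
the error-queue skb up front in rds_message_copy_from_user() and let the
rds_znotifier live inside its cb, so the completion path has nothing left
to allocate:

#define RDS_ZCOPY_SKB(__skb) ((struct rds_znotifier *)&((__skb)->cb[0]))

static inline struct sk_buff *rds_skb_from_znotifier(struct rds_znotifier *z)
{
	return container_of((void *)z, struct sk_buff, cb);
}

/* Sketch: called from rds_message_copy_from_user() when zerocopy is
 * requested; rds_rm_zerocopy_callback() then cannot fail with -ENOMEM.
 */
static int rds_zcopy_notifier_alloc(struct rds_message *rm, u32 cookie)
{
	struct sk_buff *skb = alloc_skb(0, GFP_KERNEL);

	if (!skb)
		return -ENOMEM;
	BUILD_BUG_ON(sizeof(struct rds_znotifier) > sizeof(skb->cb));
	rm->data.op_mmp_znotifier = RDS_ZCOPY_SKB(skb);
	rm->data.op_mmp_znotifier->z_cookie = cookie;
	return 0;
}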

> +       serr = SKB_EXT_ERR(skb);
> +       serr->ee.ee_errno = 0;
> +       serr->ee.ee_origin = SO_EE_ORIGIN_ZEROCOPY;
> +       serr->ee.ee_data = ncookies;

This changes the semantics of these fields. Please add a new SO_EE_CODE flag,
even if the semantics can be derived from the packet family (for now).
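
For instance (the flag name and value below are hypothetical, not existing
UAPI):

/* Hypothetical addition to include/uapi/linux/errqueue.h */
#define SO_EE_CODE_ZEROCOPY_COOKIES	2

/* rds_rm_zerocopy_callback() would then mark the new semantics explicitly
 * instead of silently reusing the TCP meaning of ee_data:
 */
serr->ee.ee_origin = SO_EE_ORIGIN_ZEROCOPY;
serr->ee.ee_code = SO_EE_CODE_ZEROCOPY_COOKIES;
serr->ee.ee_data = ncookies;	/* a cookie count, not a zckey range */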

Even better would be if we can avoid the cookies completely. I understand
the issue with concurrent send threads racing on obtaining a zckey value. If
the sender could learn which zckey was chosen for a call, would that suffice?

I suspect that even with concurrent senders, notifications arrive largely
in order, in which case we could just maintain the existing semantics and
even reuse that implementation.

> +       serr->ee.ee_info = 0;
> +       serr->ee.ee_code |= SO_EE_CODE_ZEROCOPY_COPIED;

Why set the copied bit here?

> +       ptr = skb_put(skb, ncookies * sizeof(u32));
> +
> +       i = 0;
> +       list_for_each_entry_safe(znotif, ztmp, &tmp_list, z_list) {
> +               list_del(&znotif->z_list);
> +               ptr[i++] = znotif->z_cookie;
> +               mm_unaccount_pinned_pages(&znotif->z_mmp);
> +               kfree(znotif);
> +       }
> +       WARN_ON(!list_empty(&tmp_list));
> +       q = &sk->sk_error_queue;
> +       spin_lock_irqsave(&q->lock, flags);
> +       tail = skb_peek_tail(q);
> +       if (!tail ||
> +           SKB_EXT_ERR(tail)->ee.ee_origin != SO_EE_ORIGIN_ZEROCOPY)  {
> +               __skb_queue_tail(q, skb);
> +               skb = NULL;
> +       }

This drops the packet if the branch is not taken. In the TCP case this condition
means that we can try to coalesce packets, but that is not the case here.
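
A sketch of one way to fix this, reusing the locals of the function quoted
above: queue unconditionally, since RDS has no coalescing opportunity to
fall back on.

	/* Sketch: always queue the notification rather than dropping it
	 * when the tail check fails; RDS cannot coalesce here.
	 */
	q = &sk->sk_error_queue;
	spin_lock_irqsave(&q->lock, flags);
	__skb_queue_tail(q, skb);
	spin_unlock_irqrestore(&q->lock, flags);
	sk->sk_error_report(sk);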

> +       spin_unlock_irqrestore(&q->lock, flags);
> +       sk->sk_error_report(sk);
> +       consume_skb(skb);
> +}
> +
>  /*
>   * This relies on dma_map_sg() not touching sg[].page during merging.
>   */
> @@ -66,11 +127,15 @@ static void rds_message_purge(struct rds_message *rm)
>         for (i = 0; i < rm->data.op_nents; i++) {
>                 rdsdebug("putting data page %p\n", (void *)sg_page(&rm->data.op_sg[i]));
>                 /* XXX will have to put_page for page refs */
> -               __free_page(sg_page(&rm->data.op_sg[i]));
> +               if (!rm->data.op_zcopy)
> +                       __free_page(sg_page(&rm->data.op_sg[i]));
> +               else
> +                       put_page(sg_page(&rm->data.op_sg[i]));
>         }
>         rm->data.op_nents = 0;
>         spin_lock_irqsave(&rm->m_rs_lock, flags);
>         if (rm->m_rs) {
> +               rds_rm_zerocopy_callback(rm->m_rs);
>                 sock_put(rds_rs_to_sk(rm->m_rs));
>                 rm->m_rs = NULL;
>         }
> diff --git a/net/rds/rds.h b/net/rds/rds.h
> index 374ae83..de5015a 100644
> --- a/net/rds/rds.h
> +++ b/net/rds/rds.h
> @@ -356,6 +356,12 @@ static inline u32 rds_rdma_cookie_offset(rds_rdma_cookie_t cookie)
>  #define RDS_MSG_PAGEVEC                7
>  #define RDS_MSG_FLUSH          8
>
> +struct rds_znotifier {
> +       struct list_head        z_list;
> +       u32                     z_cookie;
> +       struct mmpin            z_mmp;
> +};
> +
>  struct rds_message {
>         refcount_t              m_refcount;
>         struct list_head        m_sock_item;
> @@ -431,11 +437,14 @@ struct rds_message {
>                 } rdma;
>                 struct rm_data_op {
>                         unsigned int            op_active:1;
> -                       unsigned int            op_notify:1;
> +                       unsigned int            op_notify:1,
> +                                               op_zcopy:1,

not necessary if op_mmp_znotifier is NULL unless set in
rds_message_copy_from_user

> +                                               op_pad_to_32:30;
>                         unsigned int            op_nents;
>                         unsigned int            op_count;
>                         unsigned int            op_dmasg;
>                         unsigned int            op_dmaoff;
> +                       struct rds_znotifier    *op_mmp_znotifier;
>                         struct scatterlist      *op_sg;
>                 } data;
Sowmini Varadhan Jan. 18, 2018, 11:40 a.m. UTC | #2
On (01/17/18 19:23), Willem de Bruijn wrote:
> > +static void rds_rm_zerocopy_callback(struct rds_sock *rs)
     :
> > +       skb = alloc_skb(ncookies * sizeof(u32), GFP_ATOMIC);
> 
> TCP zerocopy avoids this issue by allocating the notification skb when the
> zerocopy packet is created, which would be rds_message_copy_from_user.

Right, I could allocate the skb when we set up the zcopy data structures.

> This does not add an allocation if you use the same trick of stashing
> the intermediate notification object (op_mmp_znotifier) in skb->cb. Though
> alloc_skb is probably more expensive than that kzalloc, if nothing else
> because of more zeroing.

I would have liked to reuse skb->cb, but fell back to alloc_skb in order to
support a flexible number (and size) of cookie notifications.

If we choose the "max number of cookies" option discussed in the previous
thread, I think I should be able to do this with the existing 48-byte cb
and pack eight 32-bit cookies in after the sock_extended_err; maybe that's
sufficient.
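
A sketch of that layout (the struct and helper names are made up for
illustration): sizeof(struct sock_extended_err) is 16 bytes, so eight u32
cookies exactly fill the remaining 32 bytes of the 48-byte cb.

/* Sketch only. Keeping a struct sock_extended_err as the first member
 * preserves SKB_EXT_ERR()-style access to the ee fields.
 */
struct rds_zcookie_cb {
	struct sock_extended_err ee;	/* 16 bytes */
	u32 zcookies[8];		/* 8 * 4 = 32 bytes */
};

static inline struct rds_zcookie_cb *RDS_ZCOOKIE_CB(struct sk_buff *skb)
{
	BUILD_BUG_ON(sizeof(struct rds_zcookie_cb) > sizeof(skb->cb));
	return (struct rds_zcookie_cb *)skb->cb;
}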

> > +       serr = SKB_EXT_ERR(skb);
> > +       serr->ee.ee_errno = 0;
> > +       serr->ee.ee_origin = SO_EE_ORIGIN_ZEROCOPY;
> > +       serr->ee.ee_data = ncookies;
> 
> This changes the semantics of these fields. Please add a new SO_EE_CODE flag,
> even if the semantics can be derived from the packet family (for now).

sounds good, I'll take care of this (and other review comments)
for the next rev.

> Even better would be if we can avoid the cookies completely. I understand
> the issue with concurrent send threads racing on obtaining a zckey value. If
> the sender could learn which zckey was chosen for a call, would that suffice?

I'm not sure I understand the proposal: you want sendmsg to return
the cookie ("zckey") for you? How?

Even if we mangled sendmsg() to return something other than
the existing POSIX semantics, how would the application be asynchronously
notified that a send has completed (on a per-message/per-datagram basis)?

> I suspect that even with concurrent senders, notifications arrive largely
> in order, in which case we could just maintain the existing semantics and
> even reuse that implementation.

not so. rds-stress [1] with -d8 -t8 quickly disproves this on my 10G ixgbe
connection. 

When you have multiple threads writing to a socket, you cannot know the
"order of send" unless you bottleneck all the threads through a single
point of send. rds-stress is an example of this sort of usage (fairly
typical in our applications).

[1] http://public-yum.oracle.com/repo/OracleLinux/OL6/ofed_UEK/x86_64//getPackageSource/rds-tools-2.0.7-1.12.el6.src.rpm

Consider two RDS sockets fd1 and fd2, each one sending to the same pair of
IP addresses: if fd1 gets bound to TCP sock1 and fd2 to TCP sock2, it's very
possible that send completions are not in the same order as the sends
themselves. The application cannot know which socket gets bound to which TCP
connection (or even whether, or how many, TCP connections are involved:
mprds strives to make this transparent to the application).

The same problem exists for pure-datagram sockets like PF_PACKET, UDP, etc.:
the application may send buf1 (datagram1) and buf2 (datagram2), and buf2 may
make it to the destination before buf1. "Notifications arrive largely in
order" may be mostly true of a single-stream TCP connection, but not of the
datagram models (or even of threaded TCP apps like iperf?).

> > +       tail = skb_peek_tail(q);
> > +       if (!tail ||
> > +           SKB_EXT_ERR(tail)->ee.ee_origin != SO_EE_ORIGIN_ZEROCOPY)  {
> > +               __skb_queue_tail(q, skb);
> > +               skb = NULL;
> > +       }
> 
> This drops the packet if the branch is not taken. In the TCP case
> this condition
> means that we can try to coalesce packets, but that is not the case here.

Good point; I'll check into this and fix it (the same applies to the
comments on patches 4/6 and 6/6).

> >                 } rdma;
> >                 struct rm_data_op {
> >                         unsigned int            op_active:1;
> > -                       unsigned int            op_notify:1;
> > +                       unsigned int            op_notify:1,
> > +                                               op_zcopy:1,
                                                     :
> > +                       struct rds_znotifier    *op_mmp_znotifier;
> 
> not necessary if op_mmp_znotifier is NULL unless set in
> rds_message_copy_from_user

To make sure I don't misunderstand: you are suggesting that we don't
need op_zcopy, but can just check for the NULL-ness of
op_mmp_znotifier (yes, true, I agree)? Or something else?

--Sowmini
Willem de Bruijn Jan. 18, 2018, 10:46 p.m. UTC | #3
>> This changes the semantics of these fields. Please add a new SO_EE_CODE flag,
>> even if the semantics can be derived from the packet family (for now).
>
> sounds good, I'll take care of this (and other review comments)
> for the next rev.
>
>> Even better would be if we can avoid the cookies completely. I understand
>> the issue with concurrent send threads racing on obtaining a zckey value. If
>> the sender could learn which zckey was chosen for a call, would that suffice?
>
> I'm not sure I understand the proposal: you want sendmsg to return
> the cookie ("zckey") for you? How?
>
> Even if we mangled sendmsg() to return something other than
> the existing POSIX semantics, how would the application be asynchronously
> notified that a send has completed (on a per-message/per-datagram basis)?

I'm purposely glossing over how the kernel returns this item for now; I was
just wondering whether we can then assume mostly in-order delivery and reuse
the existing notification interface from TCP zerocopy.

From your experiments, it sounds like this is not the case, in which case
there is little benefit to trying to force linear IDs derived from
sk->sk_zckey.

>> I suspect that even with concurrent senders, notifications arrive largely
>> in order, in which case we could just maintain the existing semantics and
>> even reuse that implementation.
>
> not so. rds-stress [1] with -d8 -t8 quickly disproves this on my 10G ixgbe
> connection.

Okay. In that case, the cmsg cookie approach sounds fine. I had the same
in an early TCP zerocopy test version, as a matter of fact.
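
For reference, a userspace sketch of reaping cookie notifications with this
patch's layout (ee_data carries the count, the u32 cookies travel in the
payload, and delivery is via SOL_IP/IP_RECVERR per the recv.c hunk below);
the function name and buffer sizes are illustrative:

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <netinet/in.h>
#include <linux/errqueue.h>

static void rds_reap_zcookies(int fd)
{
	uint32_t cookies[64];	/* payload: one u32 per completed send */
	char control[CMSG_SPACE(sizeof(struct sock_extended_err))];
	struct iovec iov = { .iov_base = cookies, .iov_len = sizeof(cookies) };
	struct msghdr msg = {
		.msg_iov = &iov, .msg_iovlen = 1,
		.msg_control = control, .msg_controllen = sizeof(control),
	};
	struct cmsghdr *cm;

	if (recvmsg(fd, &msg, MSG_ERRQUEUE) < 0)
		return;		/* nothing queued (EAGAIN) or a real error */
	for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm)) {
		struct sock_extended_err ee;

		if (cm->cmsg_level != SOL_IP || cm->cmsg_type != IP_RECVERR)
			continue;
		memcpy(&ee, CMSG_DATA(cm), sizeof(ee));
		if (ee.ee_origin != SO_EE_ORIGIN_ZEROCOPY)
			continue;
		for (uint32_t i = 0; i < ee.ee_data && i < 64; i++)
			printf("send complete, cookie %u\n", cookies[i]);
	}
}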

>> >                 } rdma;
>> >                 struct rm_data_op {
>> >                         unsigned int            op_active:1;
>> > -                       unsigned int            op_notify:1;
>> > +                       unsigned int            op_notify:1,
>> > +                                               op_zcopy:1,
>                                                      :
>> > +                       struct rds_znotifier    *op_mmp_znotifier;
>>
>> not necessary if op_mmp_znotifier is NULL unless set in
>> rds_message_copy_from_user
>
> To make sure I don't misunderstand: you are suggesting that we don't
> need op_zcopy, but can just check for the NULL-ness of
> op_mmp_znotifier (yes, true, I agree)? Or something else?

Yes, that's what I meant.
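
i.e., roughly this, as an untested sketch against the rds_message_purge()
hunk below (and assuming the notifier is not cleared until the pages have
been released):

	/* Sketch: infer zerocopy from op_mmp_znotifier, which is only
	 * non-NULL when rds_message_copy_from_user() set it up, instead
	 * of carrying a separate op_zcopy bit.
	 */
	bool zcopy = rm->data.op_mmp_znotifier != NULL;

	for (i = 0; i < rm->data.op_nents; i++) {
		if (zcopy)
			put_page(sg_page(&rm->data.op_sg[i]));
		else
			__free_page(sg_page(&rm->data.op_sg[i]));
	}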

Patch

diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c
index b405f77..23126db 100644
--- a/net/rds/af_rds.c
+++ b/net/rds/af_rds.c
@@ -183,6 +183,8 @@  static unsigned int rds_poll(struct file *file, struct socket *sock,
 		mask |= (POLLIN | POLLRDNORM);
 	if (rs->rs_snd_bytes < rds_sk_sndbuf(rs))
 		mask |= (POLLOUT | POLLWRNORM);
+	if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
+		mask |= POLLERR;
 	read_unlock_irqrestore(&rs->rs_recv_lock, flags);
 
 	/* clear state any time we wake a seen-congested socket */
@@ -511,6 +513,7 @@  static int __rds_create(struct socket *sock, struct sock *sk, int protocol)
 	INIT_LIST_HEAD(&rs->rs_send_queue);
 	INIT_LIST_HEAD(&rs->rs_recv_queue);
 	INIT_LIST_HEAD(&rs->rs_notify_queue);
+	INIT_LIST_HEAD(&rs->rs_znotify_queue);
 	INIT_LIST_HEAD(&rs->rs_cong_list);
 	spin_lock_init(&rs->rs_rdma_lock);
 	rs->rs_rdma_keys = RB_ROOT;
diff --git a/net/rds/message.c b/net/rds/message.c
index ef3daaf..25c74b3 100644
--- a/net/rds/message.c
+++ b/net/rds/message.c
@@ -33,6 +33,9 @@ 
 #include <linux/kernel.h>
 #include <linux/slab.h>
 #include <linux/export.h>
+#include <linux/skbuff.h>
+#include <linux/list.h>
+#include <linux/errqueue.h>
 
 #include "rds.h"
 
@@ -53,6 +56,64 @@  void rds_message_addref(struct rds_message *rm)
 }
 EXPORT_SYMBOL_GPL(rds_message_addref);
 
+static void rds_rm_zerocopy_callback(struct rds_sock *rs)
+{
+	struct sock *sk = rds_rs_to_sk(rs);
+	struct sk_buff *skb;
+	struct sock_exterr_skb *serr;
+	struct sk_buff_head *q;
+	unsigned long flags;
+	struct sk_buff *tail;
+	u32 *ptr;
+	int ncookies = 0, i;
+	struct rds_znotifier *znotif, *ztmp;
+	LIST_HEAD(tmp_list);
+
+	spin_lock_irqsave(&rs->rs_lock, flags);
+	list_splice(&rs->rs_znotify_queue, &tmp_list);
+	INIT_LIST_HEAD(&rs->rs_znotify_queue);
+	spin_unlock_irqrestore(&rs->rs_lock, flags);
+
+	list_for_each_entry_safe(znotif, ztmp, &tmp_list, z_list)
+		ncookies++;
+	if (ncookies == 0)
+		return;
+	skb = alloc_skb(ncookies * sizeof(u32), GFP_ATOMIC);
+	if (!skb) {
+		spin_lock_irqsave(&rs->rs_lock, flags);
+		list_splice(&tmp_list, &rs->rs_znotify_queue);
+		spin_unlock_irqrestore(&rs->rs_lock, flags);
+		return;
+	}
+	serr = SKB_EXT_ERR(skb);
+	serr->ee.ee_errno = 0;
+	serr->ee.ee_origin = SO_EE_ORIGIN_ZEROCOPY;
+	serr->ee.ee_data = ncookies;
+	serr->ee.ee_info = 0;
+	serr->ee.ee_code |= SO_EE_CODE_ZEROCOPY_COPIED;
+	ptr = skb_put(skb, ncookies * sizeof(u32));
+
+	i = 0;
+	list_for_each_entry_safe(znotif, ztmp, &tmp_list, z_list) {
+		list_del(&znotif->z_list);
+		ptr[i++] = znotif->z_cookie;
+		mm_unaccount_pinned_pages(&znotif->z_mmp);
+		kfree(znotif);
+	}
+	WARN_ON(!list_empty(&tmp_list));
+	q = &sk->sk_error_queue;
+	spin_lock_irqsave(&q->lock, flags);
+	tail = skb_peek_tail(q);
+	if (!tail ||
+	    SKB_EXT_ERR(tail)->ee.ee_origin != SO_EE_ORIGIN_ZEROCOPY)  {
+		__skb_queue_tail(q, skb);
+		skb = NULL;
+	}
+	spin_unlock_irqrestore(&q->lock, flags);
+	sk->sk_error_report(sk);
+	consume_skb(skb);
+}
+
 /*
  * This relies on dma_map_sg() not touching sg[].page during merging.
  */
@@ -66,11 +127,15 @@  static void rds_message_purge(struct rds_message *rm)
 	for (i = 0; i < rm->data.op_nents; i++) {
 		rdsdebug("putting data page %p\n", (void *)sg_page(&rm->data.op_sg[i]));
 		/* XXX will have to put_page for page refs */
-		__free_page(sg_page(&rm->data.op_sg[i]));
+		if (!rm->data.op_zcopy)
+			__free_page(sg_page(&rm->data.op_sg[i]));
+		else
+			put_page(sg_page(&rm->data.op_sg[i]));
 	}
 	rm->data.op_nents = 0;
 	spin_lock_irqsave(&rm->m_rs_lock, flags);
 	if (rm->m_rs) {
+		rds_rm_zerocopy_callback(rm->m_rs);
 		sock_put(rds_rs_to_sk(rm->m_rs));
 		rm->m_rs = NULL;
 	}
diff --git a/net/rds/rds.h b/net/rds/rds.h
index 374ae83..de5015a 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -356,6 +356,12 @@  static inline u32 rds_rdma_cookie_offset(rds_rdma_cookie_t cookie)
 #define RDS_MSG_PAGEVEC		7
 #define RDS_MSG_FLUSH		8
 
+struct rds_znotifier {
+	struct list_head	z_list;
+	u32			z_cookie;
+	struct mmpin		z_mmp;
+};
+
 struct rds_message {
 	refcount_t		m_refcount;
 	struct list_head	m_sock_item;
@@ -431,11 +437,14 @@  struct rds_message {
 		} rdma;
 		struct rm_data_op {
 			unsigned int		op_active:1;
-			unsigned int		op_notify:1;
+			unsigned int		op_notify:1,
+						op_zcopy:1,
+						op_pad_to_32:30;
 			unsigned int		op_nents;
 			unsigned int		op_count;
 			unsigned int		op_dmasg;
 			unsigned int		op_dmaoff;
+			struct rds_znotifier	*op_mmp_znotifier;
 			struct scatterlist	*op_sg;
 		} data;
 	};
@@ -588,6 +597,8 @@  struct rds_sock {
 	/* Socket receive path trace points*/
 	u8			rs_rx_traces;
 	u8			rs_rx_trace[RDS_MSG_RX_DGRAM_TRACE_MAX];
+
+	struct list_head	rs_znotify_queue; /* zerocopy completion */
 };
 
 static inline struct rds_sock *rds_sk_to_rs(const struct sock *sk)
diff --git a/net/rds/recv.c b/net/rds/recv.c
index b25bcfe..043f667 100644
--- a/net/rds/recv.c
+++ b/net/rds/recv.c
@@ -594,6 +594,9 @@  int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
 
 	if (msg_flags & MSG_OOB)
 		goto out;
+	if (msg_flags & MSG_ERRQUEUE)
+		return sock_recv_errqueue(sk, msg, size, SOL_IP, IP_RECVERR,
+					  msg_flags);
 
 	while (1) {
 		/* If there are pending notifications, do those - and nothing else */
diff --git a/net/rds/send.c b/net/rds/send.c
index 5ac0925..5c38ce3 100644
--- a/net/rds/send.c
+++ b/net/rds/send.c
@@ -635,7 +635,14 @@  static void rds_send_remove_from_sock(struct list_head *messages, int status)
 		if (test_and_clear_bit(RDS_MSG_ON_SOCK, &rm->m_flags)) {
 			struct rm_rdma_op *ro = &rm->rdma;
 			struct rds_notifier *notifier;
+			struct rds_znotifier *znotifier;
 
+			if (rm->data.op_zcopy) {
+				znotifier = rm->data.op_mmp_znotifier;
+				list_add_tail(&znotifier->z_list,
+					      &rs->rs_znotify_queue);
+				rm->data.op_mmp_znotifier = NULL;
+			}
 			list_del_init(&rm->m_sock_item);
 			rds_send_sndbuf_remove(rs, rm);