diff mbox

[2/2,net-next] openvswitch: Use skb_zerocopy() for upcall

Message ID 63f4476190ef4044f7c09b225b4e93cd929bf0be.1383901577.git.tgraf@suug.ch
State Superseded, archived
Delegated to: David Miller
Headers show

Commit Message

Thomas Graf Nov. 8, 2013, 9:15 a.m. UTC
Use of skb_zerocopy() avoids the expensive call to memcpy() when
copying the packet data into the Netlink skb. Completes checksum
through skb_checksum_help() if needed.

Netlink messages must be properly padded and aligned to meet
sanity checks of the user space counterpart.

Cost of memcpy is significantly reduced from:
+   7.48%       vhost-8471  [k] memcpy
+   5.57%     ovs-vswitchd  [k] memcpy
+   2.81%       vhost-8471  [k] csum_partial_copy_generic

to:
+   5.72%     ovs-vswitchd  [k] memcpy
+   3.32%       vhost-5153  [k] memcpy
+   0.68%       vhost-5153  [k] skb_zerocopy

(megaflows disabled)

Signed-off-by: Thomas Graf <tgraf@suug.ch>
---
 net/openvswitch/datapath.c | 52 +++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 45 insertions(+), 7 deletions(-)

Comments

Ben Hutchings Nov. 9, 2013, 10:02 p.m. UTC | #1
On Fri, 2013-11-08 at 10:15 +0100, Thomas Graf wrote:
> Use of skb_zerocopy() avoids the expensive call to memcpy() when
> copying the packet data into the Netlink skb. Completes checksum
> through skb_checksum_help() if needed.
> 
> Netlink messages must be properly padded and aligned to meet
> sanity checks of the user space counterpart.
> 
> Cost of memcpy is significantly reduced from:
> +   7.48%       vhost-8471  [k] memcpy
> +   5.57%     ovs-vswitchd  [k] memcpy
> +   2.81%       vhost-8471  [k] csum_partial_copy_generic
> 
> to:
> +   5.72%     ovs-vswitchd  [k] memcpy
> +   3.32%       vhost-5153  [k] memcpy
> +   0.68%       vhost-5153  [k] skb_zerocopy
> 
> (megaflows disabled)
> 
> Signed-off-by: Thomas Graf <tgraf@suug.ch>
> ---
>  net/openvswitch/datapath.c | 52 +++++++++++++++++++++++++++++++++++++++-------
>  1 file changed, 45 insertions(+), 7 deletions(-)
> 
> diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
> index 1408adc..3f170e3 100644
> --- a/net/openvswitch/datapath.c
> +++ b/net/openvswitch/datapath.c
[...]
> @@ -441,13 +449,43 @@ static int queue_userspace_packet(struct net *net, int dp_ifindex,
>  			  nla_len(upcall_info->userdata),
>  			  nla_data(upcall_info->userdata));
>  
> -	nla = __nla_reserve(user_skb, OVS_PACKET_ATTR_PACKET, skb->len);
> +	/* Only reserve room for attribute header, packet data is added
> +	 * in skb_zerocopy() */
> +	if (!(nla = nla_reserve(user_skb, OVS_PACKET_ATTR_PACKET, 0)))
> +		goto out;
> +	nla->nla_len = nla_attr_size(skb->len);
>  
> -	skb_copy_and_csum_dev(skb, nla_data(nla));
> +	skb_zerocopy(user_skb, skb, skb->len, hlen);
>  
> -	genlmsg_end(user_skb, upcall);
> -	err = genlmsg_unicast(net, user_skb, upcall_info->portid);
> +	/* OVS user space expects the size of the message to be aligned to
> +	 * NLA_ALIGNTO. Aligning nlmsg_len is not enough, the actual bytes
> +	 * read must match nlmsg_len.
> +	 */
> +	plen = NLA_ALIGN(user_skb->len) - user_skb->len;
> +	if (plen > 0) {
> +		int nr_frags = skb_shinfo(user_skb)->nr_frags;
> +
> +		if (nr_frags) {
> +			skb_frag_t *frag;
> +
> +			frag = &skb_shinfo(user_skb)->frags[nr_frags -1];
> +			skb_frag_size_add(frag, plen);

It looks like this is effectively padding with whatever happens to
follow the original packet content.  This could result in a small
information leak.  If the fragment has non-zero offset and already
extends to the end of a page, this could result in a segfault as the
next page may be unmapped.

Perhaps you could add the padding as an extra fragment pointing to a
preallocated zero page.  If the skb already has the maximum number of
fragments, you would have to copy the last fragment in order to add
padding.

> +			BUG_ON(frag->size > PAGE_SIZE);
[...]

I'm not sure that's a reasonable assumption either.  We certainly allow
fragments to be larger than PAGE_SIZE in the transmit path.

Ben.
Thomas Graf Nov. 10, 2013, 11:40 p.m. UTC | #2
On 11/09/13 at 10:02pm, Ben Hutchings wrote:
> On Fri, 2013-11-08 at 10:15 +0100, Thomas Graf wrote:
> > Use of skb_zerocopy() avoids the expensive call to memcpy() when
> > copying the packet data into the Netlink skb. Completes checksum
> > through skb_checksum_help() if needed.
> > 
> > Netlink messages must be properly padded and aligned to meet
> > sanity checks of the user space counterpart.
> > 
> > Cost of memcpy is significantly reduced from:
> > +   7.48%       vhost-8471  [k] memcpy
> > +   5.57%     ovs-vswitchd  [k] memcpy
> > +   2.81%       vhost-8471  [k] csum_partial_copy_generic
> > 
> > to:
> > +   5.72%     ovs-vswitchd  [k] memcpy
> > +   3.32%       vhost-5153  [k] memcpy
> > +   0.68%       vhost-5153  [k] skb_zerocopy
> > 
> > (megaflows disabled)
> > 
> > Signed-off-by: Thomas Graf <tgraf@suug.ch>
> > ---
> >  net/openvswitch/datapath.c | 52 +++++++++++++++++++++++++++++++++++++++-------
> >  1 file changed, 45 insertions(+), 7 deletions(-)
> > 
> > diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
> > index 1408adc..3f170e3 100644
> > --- a/net/openvswitch/datapath.c
> > +++ b/net/openvswitch/datapath.c
> [...]
> > @@ -441,13 +449,43 @@ static int queue_userspace_packet(struct net *net, int dp_ifindex,
> >  			  nla_len(upcall_info->userdata),
> >  			  nla_data(upcall_info->userdata));
> >  
> > -	nla = __nla_reserve(user_skb, OVS_PACKET_ATTR_PACKET, skb->len);
> > +	/* Only reserve room for attribute header, packet data is added
> > +	 * in skb_zerocopy() */
> > +	if (!(nla = nla_reserve(user_skb, OVS_PACKET_ATTR_PACKET, 0)))
> > +		goto out;
> > +	nla->nla_len = nla_attr_size(skb->len);
> >  
> > -	skb_copy_and_csum_dev(skb, nla_data(nla));
> > +	skb_zerocopy(user_skb, skb, skb->len, hlen);
> >  
> > -	genlmsg_end(user_skb, upcall);
> > -	err = genlmsg_unicast(net, user_skb, upcall_info->portid);
> > +	/* OVS user space expects the size of the message to be aligned to
> > +	 * NLA_ALIGNTO. Aligning nlmsg_len is not enough, the actual bytes
> > +	 * read must match nlmsg_len.
> > +	 */
> > +	plen = NLA_ALIGN(user_skb->len) - user_skb->len;
> > +	if (plen > 0) {
> > +		int nr_frags = skb_shinfo(user_skb)->nr_frags;
> > +
> > +		if (nr_frags) {
> > +			skb_frag_t *frag;
> > +
> > +			frag = &skb_shinfo(user_skb)->frags[nr_frags -1];
> > +			skb_frag_size_add(frag, plen);
> 
> It looks like this is effectively padding with whatever happens to
> follow the original packet content.  This could result in a small
> information leak.  If the fragment has non-zero offset and already
> extends to the end of a page, this could result in a segfault as the
> next page may be unmapped.
> 
> Perhaps you could add the padding as an extra fragment pointing to a
> preallocated zero page.  If the skb already has the maximum number of
> fragments, you would have to copy the last fragment in order to add
> padding.

You are right and thanks for the review Ben.

Realizing how complex this becomes I'm leaning towards avoiding
padding altogether by fixing OVS user space to no longer enforce
it, signal this capability via a flag to the kernel and only
perform zerocopy for enabled OVS user space counterparts.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jesse Gross Nov. 11, 2013, 1:55 a.m. UTC | #3
On Mon, Nov 11, 2013 at 7:40 AM, Thomas Graf <tgraf@suug.ch> wrote:
> On 11/09/13 at 10:02pm, Ben Hutchings wrote:
>> On Fri, 2013-11-08 at 10:15 +0100, Thomas Graf wrote:
>> > Use of skb_zerocopy() avoids the expensive call to memcpy() when
>> > copying the packet data into the Netlink skb. Completes checksum
>> > through skb_checksum_help() if needed.
>> >
>> > Netlink messages must be properly padded and aligned to meet
>> > sanity checks of the user space counterpart.
>> >
>> > Cost of memcpy is significantly reduced from:
>> > +   7.48%       vhost-8471  [k] memcpy
>> > +   5.57%     ovs-vswitchd  [k] memcpy
>> > +   2.81%       vhost-8471  [k] csum_partial_copy_generic
>> >
>> > to:
>> > +   5.72%     ovs-vswitchd  [k] memcpy
>> > +   3.32%       vhost-5153  [k] memcpy
>> > +   0.68%       vhost-5153  [k] skb_zerocopy
>> >
>> > (megaflows disabled)
>> >
>> > Signed-off-by: Thomas Graf <tgraf@suug.ch>
>> > ---
>> >  net/openvswitch/datapath.c | 52 +++++++++++++++++++++++++++++++++++++++-------
>> >  1 file changed, 45 insertions(+), 7 deletions(-)
>> >
>> > diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
>> > index 1408adc..3f170e3 100644
>> > --- a/net/openvswitch/datapath.c
>> > +++ b/net/openvswitch/datapath.c
>> [...]
>> > @@ -441,13 +449,43 @@ static int queue_userspace_packet(struct net *net, int dp_ifindex,
>> >                       nla_len(upcall_info->userdata),
>> >                       nla_data(upcall_info->userdata));
>> >
>> > -   nla = __nla_reserve(user_skb, OVS_PACKET_ATTR_PACKET, skb->len);
>> > +   /* Only reserve room for attribute header, packet data is added
>> > +    * in skb_zerocopy() */
>> > +   if (!(nla = nla_reserve(user_skb, OVS_PACKET_ATTR_PACKET, 0)))
>> > +           goto out;
>> > +   nla->nla_len = nla_attr_size(skb->len);
>> >
>> > -   skb_copy_and_csum_dev(skb, nla_data(nla));
>> > +   skb_zerocopy(user_skb, skb, skb->len, hlen);
>> >
>> > -   genlmsg_end(user_skb, upcall);
>> > -   err = genlmsg_unicast(net, user_skb, upcall_info->portid);
>> > +   /* OVS user space expects the size of the message to be aligned to
>> > +    * NLA_ALIGNTO. Aligning nlmsg_len is not enough, the actual bytes
>> > +    * read must match nlmsg_len.
>> > +    */
>> > +   plen = NLA_ALIGN(user_skb->len) - user_skb->len;
>> > +   if (plen > 0) {
>> > +           int nr_frags = skb_shinfo(user_skb)->nr_frags;
>> > +
>> > +           if (nr_frags) {
>> > +                   skb_frag_t *frag;
>> > +
>> > +                   frag = &skb_shinfo(user_skb)->frags[nr_frags -1];
>> > +                   skb_frag_size_add(frag, plen);
>>
>> It looks like this is effectively padding with whatever happens to
>> follow the original packet content.  This could result in a small
>> information leak.  If the fragment has non-zero offset and already
>> extends to the end of a page, this could result in a segfault as the
>> next page may be unmapped.
>>
>> Perhaps you could add the padding as an extra fragment pointing to a
>> preallocated zero page.  If the skb already has the maximum number of
>> fragments, you would have to copy the last fragment in order to add
>> padding.
>
> You are right and thanks for the review Ben.
>
> Realizing how complex this becomes I'm leaning towards avoiding
> padding altogether by fixing OVS user space to no longer enforce
> it, signal this capability via a flag to the kernel and only
> perform zerocopy for enabled OVS user space counterparts.

It seems like at a minimum it would be a good idea to start by
patching userspace now. That would at least begin to limit the scope
of the problem.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index 1408adc..3f170e3 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -381,10 +381,11 @@  static size_t key_attr_size(void)
 }
 
 static size_t upcall_msg_size(const struct sk_buff *skb,
-			      const struct nlattr *userdata)
+			      const struct nlattr *userdata,
+			      unsigned int hdrlen)
 {
 	size_t size = NLMSG_ALIGN(sizeof(struct ovs_header))
-		+ nla_total_size(skb->len) /* OVS_PACKET_ATTR_PACKET */
+		+ nla_total_size(hdrlen) /* OVS_PACKET_ATTR_PACKET */
 		+ nla_total_size(key_attr_size()); /* OVS_PACKET_ATTR_KEY */
 
 	/* OVS_PACKET_ATTR_USERDATA */
@@ -402,6 +403,7 @@  static int queue_userspace_packet(struct net *net, int dp_ifindex,
 	struct sk_buff *nskb = NULL;
 	struct sk_buff *user_skb; /* to be queued to userspace */
 	struct nlattr *nla;
+	unsigned int plen, hlen;
 	int err;
 
 	if (vlan_tx_tag_present(skb)) {
@@ -422,7 +424,13 @@  static int queue_userspace_packet(struct net *net, int dp_ifindex,
 		goto out;
 	}
 
-	user_skb = genlmsg_new(upcall_msg_size(skb, upcall_info->userdata), GFP_ATOMIC);
+	/* Complete checksum if needed */
+	if (skb->ip_summed == CHECKSUM_PARTIAL &&
+	    (err = skb_checksum_help(skb)))
+		goto out;
+
+	hlen = skb_zerocopy_headlen(skb);
+	user_skb = genlmsg_new(upcall_msg_size(skb, upcall_info->userdata, hlen), GFP_ATOMIC);
 	if (!user_skb) {
 		err = -ENOMEM;
 		goto out;
@@ -441,13 +449,43 @@  static int queue_userspace_packet(struct net *net, int dp_ifindex,
 			  nla_len(upcall_info->userdata),
 			  nla_data(upcall_info->userdata));
 
-	nla = __nla_reserve(user_skb, OVS_PACKET_ATTR_PACKET, skb->len);
+	/* Only reserve room for attribute header, packet data is added
+	 * in skb_zerocopy() */
+	if (!(nla = nla_reserve(user_skb, OVS_PACKET_ATTR_PACKET, 0)))
+		goto out;
+	nla->nla_len = nla_attr_size(skb->len);
 
-	skb_copy_and_csum_dev(skb, nla_data(nla));
+	skb_zerocopy(user_skb, skb, skb->len, hlen);
 
-	genlmsg_end(user_skb, upcall);
-	err = genlmsg_unicast(net, user_skb, upcall_info->portid);
+	/* OVS user space expects the size of the message to be aligned to
+	 * NLA_ALIGNTO. Aligning nlmsg_len is not enough, the actual bytes
+	 * read must match nlmsg_len.
+	 */
+	plen = NLA_ALIGN(user_skb->len) - user_skb->len;
+	if (plen > 0) {
+		int nr_frags = skb_shinfo(user_skb)->nr_frags;
+
+		if (nr_frags) {
+			skb_frag_t *frag;
+
+			frag = &skb_shinfo(user_skb)->frags[nr_frags -1];
+			skb_frag_size_add(frag, plen);
+			BUG_ON(frag->size > PAGE_SIZE);
+
+			user_skb->truesize += plen;
+			user_skb->len += plen;
+			user_skb->data_len += plen;
+		} else {
+			/* The linear headroom is aligned to NLMSG_ALIGN by
+			 * genlmsg_new(), room must be available.
+			 */
+			memset(skb_put(user_skb, plen), 0, plen);
+		}
+	}
 
+	((struct nlmsghdr *) user_skb->data)->nlmsg_len = user_skb->len;
+
+	err = genlmsg_unicast(net, user_skb, upcall_info->portid);
 out:
 	kfree_skb(nskb);
 	return err;