diff mbox

[V7,2/4,net-next] skbuff: Add userspace zero-copy buffers in skb

Message ID 1306610588.5180.87.camel@localhost.localdomain
State RFC, archived
Delegated to: David Miller
Headers show

Commit Message

Shirley Ma May 28, 2011, 7:23 p.m. UTC
This patch adds userspace buffers support in skb shared info. A new 
struct skb_ubuf_info is needed to maintain the userspace buffers
argument and index, a callback is used to notify userspace to release
the buffers once lower device has done DMA (Last reference to that skb
has gone).

If there is any userspace apps to reference these userspace buffers,
then these userspaces buffers will be copied into kernel. This way we
can prevent userspace apps to hold these userspace buffers too long.

One userspace buffer info pointer is added in skb_shared_info. Is that
safe to use destructor_arg? From the comments, this destructor_arg is
used for destructor, destructor calls doesn't wait for last reference to
that skb is gone.

Signed-off-by: Shirley Ma <xma@us.ibm.com>
---
 include/linux/skbuff.h |   24 ++++++++++++++++
 net/core/skbuff.c      |   73 ++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 97 insertions(+), 0 deletions(-)



--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Comments

Shirley Ma June 27, 2011, 3:45 p.m. UTC | #1
Hello Dave,

To support skb zero-copy, a pointer is needed to add to skb share info.
Do you agree with this approach? If not, do you have any other
suggestions?

Thanks
Shirley


On Sat, 2011-05-28 at 12:23 -0700, Shirley Ma wrote:
>                              From: 
> Shirley Ma <mashirle@us.ibm.com>
>                                To: 
> David Miller <davem@davemloft.net>,
> mst@redhat.com, Eric Dumazet
> <eric.dumazet@gmail.com>, Avi
> Kivity <avi@redhat.com>, Arnd
> Bergmann <arnd@arndb.de>
>                                Cc: 
> netdev@vger.kernel.org,
> kvm@vger.kernel.org,
> linux-kernel@vger.kernel.org
>                           Subject: 
> [PATCH V7 2/4 net-next] skbuff: Add
> userspace zero-copy buffers in skb
>                              Date: 
> 05/28/2011 12:23:08 PM
> 
> 
> This patch adds userspace buffers support in skb shared info. A new 
> struct skb_ubuf_info is needed to maintain the userspace buffers
> argument and index, a callback is used to notify userspace to release
> the buffers once lower device has done DMA (Last reference to that skb
> has gone).
> 
> If there is any userspace apps to reference these userspace buffers,
> then these userspaces buffers will be copied into kernel. This way we
> can prevent userspace apps to hold these userspace buffers too long.
> 
> One userspace buffer info pointer is added in skb_shared_info. Is that
> safe to use destructor_arg? From the comments, this destructor_arg is
> used for destructor, destructor calls doesn't wait for last reference
> to
> that skb is gone.
> 
> Signed-off-by: Shirley Ma <xma@us.ibm.com>
> ---
>  include/linux/skbuff.h |   24 ++++++++++++++++
>  net/core/skbuff.c      |   73
> ++++++++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 97 insertions(+), 0 deletions(-)
> 
> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
> index e8b78ce..37a2cb4 100644
> --- a/include/linux/skbuff.h
> +++ b/include/linux/skbuff.h
> @@ -189,6 +189,17 @@ enum {
>         SKBTX_DRV_NEEDS_SK_REF = 1 << 3,
>  };
> 
> +/*
> + * The callback notifies userspace to release buffers when skb DMA is
> done in
> + * lower device, the skb last reference should be 0 when calling
> this.
> + * The desc is used to track userspace buffer index.
> + */
> +struct ubuf_info {
> +       void (*callback)(void *);
> +       void *arg;
> +       unsigned long desc;
> +};
> +
>  /* This data is invariant across clones and lives at
>   * the end of the header data, ie. at skb->end.
>   */
> @@ -203,6 +214,9 @@ struct skb_shared_info {
>         struct sk_buff  *frag_list;
>         struct skb_shared_hwtstamps hwtstamps;
> 
> +       /* DMA mapping from/to userspace buffers info */
> +       void            *ubuf_arg;
> +
>         /*
>          * Warning : all fields before dataref are cleared in
> __alloc_skb()
>          */
> @@ -2261,5 +2275,15 @@ static inline void
> skb_checksum_none_assert(struct sk_buff *skb)
>  }
> 
>  bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off);
> +
> +/*
> + *     skb *uarg - is the buffer from userspace
> + *     @skb: buffer to check
> + */
> +static inline int skb_ubuf(const struct sk_buff *skb)
> +{
> +       return skb_shinfo(skb)->ubuf_arg != NULL;
> +}
> +
>  #endif /* __KERNEL__ */
>  #endif /* _LINUX_SKBUFF_H */
> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> index 46cbd28..115c5ca 100644
> --- a/net/core/skbuff.c
> +++ b/net/core/skbuff.c
> @@ -329,6 +329,19 @@ static void skb_release_data(struct sk_buff *skb)
>                                 put_page(skb_shinfo(skb)->frags[i].page);
>                 }
> 
> +               /*
> +                * If skb buf is from userspace, we need to notify the
> caller
> +                * the lower device DMA has done;
> +                */
> +               if (skb_ubuf(skb)) {
> +                       struct ubuf_info *uarg;
> +
> +                       uarg = (struct ubuf_info
> *)skb_shinfo(skb)->ubuf_arg;
> +
> +                       if (uarg->callback)
> +                               uarg->callback(uarg);
> +               }
> +
>                 if (skb_has_frag_list(skb))
>                         skb_drop_fraglist(skb);
> 
> @@ -481,6 +494,9 @@ bool skb_recycle_check(struct sk_buff *skb, int
> skb_size)
>         if (irqs_disabled())
>                 return false;
> 
> +       if (skb_ubuf(skb))
> +               return false;
> +
>         if (skb_is_nonlinear(skb) || skb->fclone !=
> SKB_FCLONE_UNAVAILABLE)
>                 return false;
> 
> @@ -573,6 +589,7 @@ static struct sk_buff *__skb_clone(struct sk_buff
> *n, struct sk_buff *skb)
>         atomic_set(&n->users, 1);
> 
>         atomic_inc(&(skb_shinfo(skb)->dataref));
> +       skb_shinfo(skb)->ubuf_arg = NULL;
>         skb->cloned = 1;
> 
>         return n;
> @@ -596,6 +613,51 @@ struct sk_buff *skb_morph(struct sk_buff *dst,
> struct sk_buff *src)
>  }
>  EXPORT_SYMBOL_GPL(skb_morph);
> 
> +/* skb frags copy userspace buffers to kernel */
> +static int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask)
> +{
> +       int i;
> +       int num_frags = skb_shinfo(skb)->nr_frags;
> +       struct page *page, *head = NULL;
> +       struct ubuf_info *uarg = skb_shinfo(skb)->ubuf_arg;
> +
> +       for (i = 0; i < num_frags; i++) {
> +               u8 *vaddr;
> +               skb_frag_t *f = &skb_shinfo(skb)->frags[i];
> +
> +               page = alloc_page(GFP_ATOMIC);
> +               if (!page) {
> +                       while (head) {
> +                               put_page(head);
> +                               head = (struct page *)head->private;
> +                       }
> +                       return -ENOMEM;
> +               }
> +               vaddr = kmap_skb_frag(&skb_shinfo(skb)->frags[i]);
> +               memcpy(page_address(page),
> +                      vaddr + f->page_offset, f->size);
> +               kunmap_skb_frag(vaddr);
> +               page->private = (unsigned long)head;
> +               head = page;
> +       }
> +
> +       /* skb frags release userspace buffers */
> +       for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
> +               put_page(skb_shinfo(skb)->frags[i].page);
> +
> +       uarg->callback(uarg);
> +       skb_shinfo(skb)->ubuf_arg = NULL;
> +
> +       /* skb frags point to kernel buffers */
> +       for (i = skb_shinfo(skb)->nr_frags; i > 0; i--) {
> +               skb_shinfo(skb)->frags[i - 1].page_offset = 0;
> +               skb_shinfo(skb)->frags[i - 1].page = head;
> +               head = (struct page *)head->private;
> +       }
> +       return 0;
> +}
> +
> +
>  /**
>   *     skb_clone       -       duplicate an sk_buff
>   *     @skb: buffer to clone
> @@ -614,6 +676,11 @@ struct sk_buff *skb_clone(struct sk_buff *skb,
> gfp_t gfp_mask)
>  {
>         struct sk_buff *n;
> 
> +       if (skb_ubuf(skb)) {
> +               if (skb_copy_ubufs(skb, gfp_mask))
> +                       return NULL;
> +       }
> +
>         n = skb + 1;
>         if (skb->fclone == SKB_FCLONE_ORIG &&
>             n->fclone == SKB_FCLONE_UNAVAILABLE) {
> @@ -731,6 +798,12 @@ struct sk_buff *pskb_copy(struct sk_buff *skb,
> gfp_t gfp_mask)
>         if (skb_shinfo(skb)->nr_frags) {
>                 int i;
> 
> +               if (skb_ubuf(skb)) {
> +                       if (skb_copy_ubufs(skb, gfp_mask)) {
> +                               kfree(n);
> +                               goto out;
> +                       }
> +               }
>                 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
>                         skb_shinfo(n)->frags[i] =
> skb_shinfo(skb)->frags[i];
>                         get_page(skb_shinfo(n)->frags[i].page); 

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Miller June 27, 2011, 10:54 p.m. UTC | #2
From: Shirley Ma <mashirle@us.ibm.com>
Date: Mon, 27 Jun 2011 08:45:10 -0700

> To support skb zero-copy, a pointer is needed to add to skb share info.
> Do you agree with this approach? If not, do you have any other
> suggestions?

I really can't form an opinion unless I am shown the complete
implementation, what this give us in return, what the impact is, etc.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Shirley Ma June 28, 2011, 4:51 p.m. UTC | #3
On Mon, 2011-06-27 at 15:54 -0700, David Miller wrote:
> From: Shirley Ma <mashirle@us.ibm.com>
> Date: Mon, 27 Jun 2011 08:45:10 -0700
> 
> > To support skb zero-copy, a pointer is needed to add to skb share
> info.
> > Do you agree with this approach? If not, do you have any other
> > suggestions?
> 
> I really can't form an opinion unless I am shown the complete
> implementation, what this give us in return, what the impact is, etc. 

zero-copy skb buffers can save significant CPUs. Right now, I only
implements macvtap/vhost zero-copy between KVM guest and host. The
performance is as follow:

Single TCP_STREAM 120 secs test results 2.6.39-rc3 over ixgbe 10Gb NIC
results:

Message BW(Gb/s)qemu-kvm (NumCPU)vhost-net(NumCPU) PerfTop irq/s
4K      7408.57         92.1%           22.6%           1229
4K(Orig)4913.17         118.1%          84.1%           2086    

8K      9129.90         89.3%           23.3%           1141
8K(Orig)7094.55         115.9%          84.7%           2157

16K     9178.81         89.1%           23.3%           1139
16K(Orig)8927.1         118.7%          83.4%           2262

64K     9171.43         88.4%           24.9%           1253
64K(Orig)9085.85        115.9%          82.4%           2229

You can see the overall CPU saved 50% w/i zero-copy.

The impact is every skb allocation consumed one more pointer in skb
share info, and a pointer check in skb release when last reference is
gone.

For skb clone, skb expand private head and skb copy, it still keeps copy
the buffers to kernel, so we can avoid user application, like tcpdump to
hold the user-space buffers too long.

Thanks
Shirley

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Rick Jones June 28, 2011, 5:19 p.m. UTC | #4
On 06/28/2011 09:51 AM, Shirley Ma wrote:
> On Mon, 2011-06-27 at 15:54 -0700, David Miller wrote:
>> From: Shirley Ma<mashirle@us.ibm.com>
>> Date: Mon, 27 Jun 2011 08:45:10 -0700
>>
>>> To support skb zero-copy, a pointer is needed to add to skb share
>> info.
>>> Do you agree with this approach? If not, do you have any other
>>> suggestions?
>>
>> I really can't form an opinion unless I am shown the complete
>> implementation, what this give us in return, what the impact is, etc.
>
> zero-copy skb buffers can save significant CPUs. Right now, I only
> implements macvtap/vhost zero-copy between KVM guest and host. The
> performance is as follow:
>
> Single TCP_STREAM 120 secs test results 2.6.39-rc3 over ixgbe 10Gb NIC
> results:
>
> Message BW(Gb/s)qemu-kvm (NumCPU)vhost-net(NumCPU) PerfTop irq/s
> 4K      7408.57         92.1%           22.6%           1229
> 4K(Orig)4913.17         118.1%          84.1%           2086
>
> 8K      9129.90         89.3%           23.3%           1141
> 8K(Orig)7094.55         115.9%          84.7%           2157
>
> 16K     9178.81         89.1%           23.3%           1139
> 16K(Orig)8927.1         118.7%          83.4%           2262
>
> 64K     9171.43         88.4%           24.9%           1253
> 64K(Orig)9085.85        115.9%          82.4%           2229
>
> You can see the overall CPU saved 50% w/i zero-copy.

While this isn't the copy between netperf and the stack, at some point 
you may want to enable netperf's "DIRTY" mode (./configure 
--enable-dirty) to cause it to start either dirtying buffers before 
send, or reading from buffers after receive.  I cannot guarantee that 
there hasn't been bitrot in that area of netperf though :)  Particularly 
in a TCP_MAERTS test.  The "DIRTY" mode code will not do anything in a 
TCP_SENDFILE test.

A simple sanity check of the effect of the changes on a TCP_RR test 
would probably be goodness as well.

happy benchmarking,

rick jones
one of these days I'll have to find a good way to get accurate overall 
CPU utilization from within a guest and teach netperf about it.

>
> The impact is every skb allocation consumed one more pointer in skb
> share info, and a pointer check in skb release when last reference is
> gone.
>
> For skb clone, skb expand private head and skb copy, it still keeps copy
> the buffers to kernel, so we can avoid user application, like tcpdump to
> hold the user-space buffers too long.
>
> Thanks
> Shirley
>
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Miller June 28, 2011, 11:35 p.m. UTC | #5
From: Shirley Ma <mashirle@us.ibm.com>
Date: Tue, 28 Jun 2011 09:51:32 -0700

> On Mon, 2011-06-27 at 15:54 -0700, David Miller wrote:
>> From: Shirley Ma <mashirle@us.ibm.com>
>> Date: Mon, 27 Jun 2011 08:45:10 -0700
>> 
>> > To support skb zero-copy, a pointer is needed to add to skb share
>> info.
>> > Do you agree with this approach? If not, do you have any other
>> > suggestions?
>> 
>> I really can't form an opinion unless I am shown the complete
>> implementation, what this give us in return, what the impact is, etc. 
 ..
> You can see the overall CPU saved 50% w/i zero-copy.
> 
> The impact is every skb allocation consumed one more pointer in skb
> share info, and a pointer check in skb release when last reference is
> gone.
> 
> For skb clone, skb expand private head and skb copy, it still keeps copy
> the buffers to kernel, so we can avoid user application, like tcpdump to
> hold the user-space buffers too long.

Ok, now show me the "complete implementation".  I'm as interested in
the code as I am in the numbers, that's why I asked for both.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Shirley Ma June 29, 2011, 4:28 a.m. UTC | #6
I have submitted this patchset from Version 1 to Version 7 already in
the past few months.
Here is the link to the patchset:

http://lists.openwall.net/netdev/2011/05/28/

I am working on V8 now.

Thanks
Shirley


On Tue, 2011-06-28 at 16:35 -0700, David Miller wrote:
> From: Shirley Ma <mashirle@us.ibm.com>
> Date: Tue, 28 Jun 2011 09:51:32 -0700
> 
> > On Mon, 2011-06-27 at 15:54 -0700, David Miller wrote:
> >> From: Shirley Ma <mashirle@us.ibm.com>
> >> Date: Mon, 27 Jun 2011 08:45:10 -0700
> >> 
> >> > To support skb zero-copy, a pointer is needed to add to skb share
> >> info.
> >> > Do you agree with this approach? If not, do you have any other
> >> > suggestions?
> >> 
> >> I really can't form an opinion unless I am shown the complete
> >> implementation, what this give us in return, what the impact is,
> etc. 
>  ..
> > You can see the overall CPU saved 50% w/i zero-copy.
> > 
> > The impact is every skb allocation consumed one more pointer in skb
> > share info, and a pointer check in skb release when last reference
> is
> > gone.
> > 
> > For skb clone, skb expand private head and skb copy, it still keeps
> copy
> > the buffers to kernel, so we can avoid user application, like
> tcpdump to
> > hold the user-space buffers too long.
> 
> Ok, now show me the "complete implementation".  I'm as interested in
> the code as I am in the numbers, that's why I asked for both. 

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Michael S. Tsirkin April 30, 2012, 9:12 a.m. UTC | #7
On Tue, Jun 28, 2011 at 10:19:48AM -0700, Rick Jones wrote:
> one of these days I'll have to find a good way to get accurate
> overall CPU utilization from within a guest and teach netperf about
> it.

I think the cleanest way would be to run another netperf server on the
host.  netperf would get a flag with host address and get cpu
utilization info.

This is what we currently do manually: run mpstat on the host.

Thoughts?

By the way, could you point me to code used by netperf
to measure CPU utilization on Linux? I'd like to figure
out why isn't the result always consistent with e.g. mpstat.

Thanks!
Rick Jones April 30, 2012, 4:54 p.m. UTC | #8
On 04/30/2012 02:12 AM, Michael S. Tsirkin wrote:
> On Tue, Jun 28, 2011 at 10:19:48AM -0700, Rick Jones wrote:
>> one of these days I'll have to find a good way to get accurate
>> overall CPU utilization from within a guest and teach netperf about
>> it.
>
> I think the cleanest way would be to run another netperf server on the
> host.  netperf would get a flag with host address and get cpu
> utilization info.
>
> This is what we currently do manually: run mpstat on the host.
>
> Thoughts?

I might be able to enhance the LOC_CPU/REM_CPU calibration tests to be 
bona fide CPU utilization tests.

> By the way, could you point me to code used by netperf
> to measure CPU utilization on Linux? I'd like to figure
> out why isn't the result always consistent with e.g. mpstat.

That would be src/netcpu_procstat.c .  That code is automagically 
selected by the configure script when it determines the compilation is 
happening under Linux.  You can, if you wish, manually set it though I 
suspect the only other mechanism known to netperf that would function 
under Linux is the "looper" (aka CPU soaker) method.

rick
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index e8b78ce..37a2cb4 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -189,6 +189,17 @@  enum {
 	SKBTX_DRV_NEEDS_SK_REF = 1 << 3,
 };
 
+/*
+ * The callback notifies userspace to release buffers when skb DMA is done in
+ * lower device, the skb last reference should be 0 when calling this.
+ * The desc is used to track userspace buffer index.
+ */
+struct ubuf_info {
+	void (*callback)(void *);
+	void *arg;
+	unsigned long desc;
+};
+
 /* This data is invariant across clones and lives at
  * the end of the header data, ie. at skb->end.
  */
@@ -203,6 +214,9 @@  struct skb_shared_info {
 	struct sk_buff	*frag_list;
 	struct skb_shared_hwtstamps hwtstamps;
 
+	/* DMA mapping from/to userspace buffers info */
+	void		*ubuf_arg;
+
 	/*
 	 * Warning : all fields before dataref are cleared in __alloc_skb()
 	 */
@@ -2261,5 +2275,15 @@  static inline void skb_checksum_none_assert(struct sk_buff *skb)
 }
 
 bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off);
+
+/*
+ *	skb *uarg - is the buffer from userspace
+ *	@skb: buffer to check
+ */
+static inline int skb_ubuf(const struct sk_buff *skb)
+{
+	return skb_shinfo(skb)->ubuf_arg != NULL;
+}
+
 #endif	/* __KERNEL__ */
 #endif	/* _LINUX_SKBUFF_H */
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 46cbd28..115c5ca 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -329,6 +329,19 @@  static void skb_release_data(struct sk_buff *skb)
 				put_page(skb_shinfo(skb)->frags[i].page);
 		}
 
+		/*
+		 * If skb buf is from userspace, we need to notify the caller
+		 * the lower device DMA has done;
+		 */
+		if (skb_ubuf(skb)) {
+			struct ubuf_info *uarg;
+
+			uarg = (struct ubuf_info *)skb_shinfo(skb)->ubuf_arg;
+
+			if (uarg->callback)
+				uarg->callback(uarg);
+		}
+
 		if (skb_has_frag_list(skb))
 			skb_drop_fraglist(skb);
 
@@ -481,6 +494,9 @@  bool skb_recycle_check(struct sk_buff *skb, int skb_size)
 	if (irqs_disabled())
 		return false;
 
+	if (skb_ubuf(skb))
+		return false;
+
 	if (skb_is_nonlinear(skb) || skb->fclone != SKB_FCLONE_UNAVAILABLE)
 		return false;
 
@@ -573,6 +589,7 @@  static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb)
 	atomic_set(&n->users, 1);
 
 	atomic_inc(&(skb_shinfo(skb)->dataref));
+	skb_shinfo(skb)->ubuf_arg = NULL;
 	skb->cloned = 1;
 
 	return n;
@@ -596,6 +613,51 @@  struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src)
 }
 EXPORT_SYMBOL_GPL(skb_morph);
 
+/* skb frags copy userspace buffers to kernel */
+static int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask)
+{
+	int i;
+	int num_frags = skb_shinfo(skb)->nr_frags;
+	struct page *page, *head = NULL;
+	struct ubuf_info *uarg = skb_shinfo(skb)->ubuf_arg;
+
+	for (i = 0; i < num_frags; i++) {
+		u8 *vaddr;
+		skb_frag_t *f = &skb_shinfo(skb)->frags[i];
+
+		page = alloc_page(GFP_ATOMIC);
+		if (!page) {
+			while (head) {
+				put_page(head);
+				head = (struct page *)head->private;
+			}
+			return -ENOMEM;
+		}
+		vaddr = kmap_skb_frag(&skb_shinfo(skb)->frags[i]);
+		memcpy(page_address(page),
+		       vaddr + f->page_offset, f->size);
+		kunmap_skb_frag(vaddr);
+		page->private = (unsigned long)head;
+		head = page;
+	}
+
+	/* skb frags release userspace buffers */
+	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
+		put_page(skb_shinfo(skb)->frags[i].page);
+
+	uarg->callback(uarg);
+	skb_shinfo(skb)->ubuf_arg = NULL;
+
+	/* skb frags point to kernel buffers */
+	for (i = skb_shinfo(skb)->nr_frags; i > 0; i--) {
+		skb_shinfo(skb)->frags[i - 1].page_offset = 0;
+		skb_shinfo(skb)->frags[i - 1].page = head;
+		head = (struct page *)head->private;
+	}
+	return 0;
+}
+
+
 /**
  *	skb_clone	-	duplicate an sk_buff
  *	@skb: buffer to clone
@@ -614,6 +676,11 @@  struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
 {
 	struct sk_buff *n;
 
+	if (skb_ubuf(skb)) {
+		if (skb_copy_ubufs(skb, gfp_mask))
+			return NULL;
+	}
+
 	n = skb + 1;
 	if (skb->fclone == SKB_FCLONE_ORIG &&
 	    n->fclone == SKB_FCLONE_UNAVAILABLE) {
@@ -731,6 +798,12 @@  struct sk_buff *pskb_copy(struct sk_buff *skb, gfp_t gfp_mask)
 	if (skb_shinfo(skb)->nr_frags) {
 		int i;
 
+		if (skb_ubuf(skb)) {
+			if (skb_copy_ubufs(skb, gfp_mask)) {
+				kfree(n);
+				goto out;
+			}
+		}
 		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
 			skb_shinfo(n)->frags[i] = skb_shinfo(skb)->frags[i];
 			get_page(skb_shinfo(n)->frags[i].page);