
[RFC,v2,03/14] xsk: add umem fill queue support and mmap

Message ID 20180327165919.17933-4-bjorn.topel@gmail.com
State RFC, archived
Delegated to: David Miller
Series: Introducing AF_XDP support

Commit Message

Björn Töpel March 27, 2018, 4:59 p.m. UTC
From: Magnus Karlsson <magnus.karlsson@intel.com>

Here, we add another setsockopt for registered user memory (umem)
called XDP_UMEM_FILL_QUEUE. Using this socket option, the process can
ask the kernel to allocate a queue (ring buffer) and also mmap it
(XDP_UMEM_PGOFF_FILL_QUEUE) into the process.

The queue is used to explicitly pass ownership of umem frames from the
user process to the kernel. These frames will in a later patch be
filled in with Rx packet data by the kernel.

Signed-off-by: Magnus Karlsson <magnus.karlsson@intel.com>
---
 include/uapi/linux/if_xdp.h | 15 +++++++++++
 net/xdp/Makefile            |  2 +-
 net/xdp/xdp_umem.c          |  5 ++++
 net/xdp/xdp_umem.h          |  2 ++
 net/xdp/xsk.c               | 65 ++++++++++++++++++++++++++++++++++++++++++++-
 net/xdp/xsk_queue.c         | 54 +++++++++++++++++++++++++++++++++++++
 net/xdp/xsk_queue.h         | 49 ++++++++++++++++++++++++++++++++++
 7 files changed, 190 insertions(+), 2 deletions(-)
 create mode 100644 net/xdp/xsk_queue.c
 create mode 100644 net/xdp/xsk_queue.h
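
For illustration, a minimal user-space sketch of the flow the commit
message describes (hypothetical sizes; SOL_XDP and the XDP_UMEM_REG
setup come from other patches in the series):

#include <stddef.h>
#include <sys/mman.h>
#include <sys/socket.h>
#include <linux/if_xdp.h>

#define FQ_ENTRIES 1024	/* must be a power of two, see xsk_init_queue() */

/* Returns the mmapped fill queue, or NULL on error.  'sfd' is an
 * AF_XDP socket with a umem already registered via XDP_UMEM_REG. */
static struct xdp_umem_queue *umem_fill_queue_create(int sfd)
{
	int entries = FQ_ENTRIES;
	size_t ring_size;
	void *map;

	/* Ask the kernel to allocate the fill queue (ring buffer)... */
	if (setsockopt(sfd, SOL_XDP, XDP_UMEM_FILL_QUEUE,
		       &entries, sizeof(entries)))
		return NULL;

	/* ...and mmap it into the process, matching the size computed
	 * by xskq_umem_get_ring_size() in this patch. */
	ring_size = sizeof(struct xdp_umem_queue) + entries * sizeof(__u32);
	map = mmap(NULL, ring_size, PROT_READ | PROT_WRITE,
		   MAP_SHARED | MAP_POPULATE, sfd,
		   XDP_UMEM_PGOFF_FILL_QUEUE);

	return map == MAP_FAILED ? NULL : (struct xdp_umem_queue *)map;
}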

Comments

Michael S. Tsirkin April 12, 2018, 2:15 a.m. UTC | #1
On Tue, Mar 27, 2018 at 06:59:08PM +0200, Björn Töpel wrote:
> @@ -30,4 +31,18 @@ struct xdp_umem_reg {
>  	__u32 frame_headroom; /* Frame head room */
>  };
>  
> +/* Pgoff for mmaping the rings */
> +#define XDP_UMEM_PGOFF_FILL_QUEUE	0x100000000
> +
> +struct xdp_queue {
> +	__u32 head_idx __attribute__((aligned(64)));
> +	__u32 tail_idx __attribute__((aligned(64)));
> +};
> +
> +/* Used for the fill and completion queues for buffers */
> +struct xdp_umem_queue {
> +	struct xdp_queue ptrs;
> +	__u32 desc[0] __attribute__((aligned(64)));
> +};
> +
>  #endif /* _LINUX_IF_XDP_H */

So IIUC it's a head/tail ring of 32 bit descriptors.

In my experience (from implementing ptr_ring) this
implies that head/tail cache lines bounce a lot between
CPUs. Caching will help some. You are also forced to
use barriers to check validity which is slow on
some architectures.

If instead you can use a special descriptor value (e.g. 0) as
a valid signal, things work much better:

- you read descriptor atomically, if it's not 0 it's fine
- same with write - write 0 to pass it to the other side
- there is a data dependency so no need for barriers (except on dec alpha)
- no need for power of 2 limitations, you can make it any size you like
- easy to resize too

architecture (if not implementation) would be shared with ptr_ring
so some of the optimization ideas like batched updates could
be lifted from there.

When I was building ptr_ring, any head/tail design underperformed
storing valid flag with data itself. YMMV.
Magnus Karlsson April 12, 2018, 7:38 a.m. UTC | #2
> -----Original Message-----
> From: Michael S. Tsirkin [mailto:mst@redhat.com]
> Sent: Thursday, April 12, 2018 4:16 AM
> To: Björn Töpel <bjorn.topel@gmail.com>
> Cc: Karlsson, Magnus <magnus.karlsson@intel.com>; Duyck, Alexander H
> <alexander.h.duyck@intel.com>; alexander.duyck@gmail.com;
> john.fastabend@gmail.com; ast@fb.com; brouer@redhat.com;
> willemdebruijn.kernel@gmail.com; daniel@iogearbox.net;
> netdev@vger.kernel.org; michael.lundkvist@ericsson.com; Brandeburg,
> Jesse <jesse.brandeburg@intel.com>; Singhai, Anjali
> <anjali.singhai@intel.com>; Zhang, Qi Z <qi.z.zhang@intel.com>;
> ravineet.singh@ericsson.com
> Subject: Re: [RFC PATCH v2 03/14] xsk: add umem fill queue support and
> mmap
> 
> On Tue, Mar 27, 2018 at 06:59:08PM +0200, Björn Töpel wrote:
> > @@ -30,4 +31,18 @@ struct xdp_umem_reg {
> >  	__u32 frame_headroom; /* Frame head room */  };
> >
> > +/* Pgoff for mmaping the rings */
> > +#define XDP_UMEM_PGOFF_FILL_QUEUE	0x100000000
> > +
> > +struct xdp_queue {
> > +	__u32 head_idx __attribute__((aligned(64)));
> > +	__u32 tail_idx __attribute__((aligned(64))); };
> > +
> > +/* Used for the fill and completion queues for buffers */ struct
> > +xdp_umem_queue {
> > +	struct xdp_queue ptrs;
> > +	__u32 desc[0] __attribute__((aligned(64))); };
> > +
> >  #endif /* _LINUX_IF_XDP_H */
> 
> So IIUC it's a head/tail ring of 32 bit descriptors.
> 
> In my experience (from implementing ptr_ring) this implies that head/tail
> cache lines bounce a lot between CPUs. Caching will help some. You are also
> forced to use barriers to check validity which is slow on some architectures.
> 
> If instead you can use a special descriptor value (e.g. 0) as a valid signal,
> things work much better:
> 
> - you read descriptor atomically, if it's not 0 it's fine
> - same with write - write 0 to pass it to the other side
> - there is a data dependency so no need for barriers (except on dec alpha)
> - no need for power of 2 limitations, you can make it any size you like
> - easy to resize too
> 
> architecture (if not implementation) would be shared with ptr_ring so some
> of the optimization ideas like batched updates could be lifted from there.
> 
> When I was building ptr_ring, any head/tail design underperformed storing
> valid flag with data itself. YMMV.
> 
> --
> MST

I think you are definitely right in that there are ways in which
we can improve performance here. That said, the current queue
performs slightly better than the previous one we had that was
more or less a copy of one of your first virtio 1.1 proposals
from a little over a year ago. It had bidirectional queues and a
valid flag in the descriptor itself. The reason we abandoned this
was not poor performance (it was good), but a need to go to
unidirectional queues. Maybe I should have only changed that
aspect and kept the valid flag.

Anyway, I will take a look at ptr_ring and run some experiments
along the lines of what you propose to get some
numbers. Considering your experience with these kinds of
structures, you are likely right. I just need to convince
myself :-).

/Magnus
Jesper Dangaard Brouer April 12, 2018, 8:54 a.m. UTC | #3
On Thu, 12 Apr 2018 07:38:25 +0000
"Karlsson, Magnus" <magnus.karlsson@intel.com> wrote:

> > -----Original Message-----
> > From: Michael S. Tsirkin [mailto:mst@redhat.com]
> > Sent: Thursday, April 12, 2018 4:16 AM
> > To: Björn Töpel <bjorn.topel@gmail.com>
> > Cc: Karlsson, Magnus <magnus.karlsson@intel.com>; Duyck, Alexander H
> > <alexander.h.duyck@intel.com>; alexander.duyck@gmail.com;
> > john.fastabend@gmail.com; ast@fb.com; brouer@redhat.com;
> > willemdebruijn.kernel@gmail.com; daniel@iogearbox.net;
> > netdev@vger.kernel.org; michael.lundkvist@ericsson.com; Brandeburg,
> > Jesse <jesse.brandeburg@intel.com>; Singhai, Anjali
> > <anjali.singhai@intel.com>; Zhang, Qi Z <qi.z.zhang@intel.com>;
> > ravineet.singh@ericsson.com
> > Subject: Re: [RFC PATCH v2 03/14] xsk: add umem fill queue support and
> > mmap
> > 
> > On Tue, Mar 27, 2018 at 06:59:08PM +0200, Björn Töpel wrote:  
> > > @@ -30,4 +31,18 @@ struct xdp_umem_reg {
> > >  	__u32 frame_headroom; /* Frame head room */  };
> > >
> > > +/* Pgoff for mmaping the rings */
> > > +#define XDP_UMEM_PGOFF_FILL_QUEUE	0x100000000
> > > +
> > > +struct xdp_queue {
> > > +	__u32 head_idx __attribute__((aligned(64)));
> > > +	__u32 tail_idx __attribute__((aligned(64))); };
> > > +
> > > +/* Used for the fill and completion queues for buffers */ struct
> > > +xdp_umem_queue {
> > > +	struct xdp_queue ptrs;
> > > +	__u32 desc[0] __attribute__((aligned(64))); };
> > > +
> > >  #endif /* _LINUX_IF_XDP_H */  
> > 
> > So IIUC it's a head/tail ring of 32 bit descriptors.
> > 
> > In my experience (from implementing ptr_ring) this implies that head/tail
> > cache lines bounce a lot between CPUs. Caching will help some. You are also
> > forced to use barriers to check validity which is slow on some architectures.
> > 
> > If instead you can use a special descriptor value (e.g. 0) as a valid signal,
> > things work much better:
> > 
> > - you read descriptor atomically, if it's not 0 it's fine
> > - same with write - write 0 to pass it to the other side
> > - there is a data dependency so no need for barriers (except on dec alpha)
> > - no need for power of 2 limitations, you can make it any size you like
> > - easy to resize too
> > 
> > architecture (if not implementation) would be shared with ptr_ring so some
> > of the optimization ideas like batched updates could be lifted from there.
> > 
> > When I was building ptr_ring, any head/tail design underperformed storing
> > valid flag with data itself. YMMV.

I fully agree with MST here. This is also my experience.  I even
dropped my own Array-based Lock-Free (ALF) queue implementation[1] in
favor of ptr_ring.  (In the ALF queue I try to amortize this cost by
bulking, but this causes the queue to become non-wait-free.)

[1] https://github.com/netoptimizer/prototype-kernel/blob/master/kernel/include/linux/alf_queue.h

> I think you are definitely right in that there are ways in which
> we can improve performance here. That said, the current queue
> performs slightly better than the previous one we had that was
> more or less a copy of one of your first virtio 1.1 proposals
> from little over a year ago. It had bidirectional queues and a
> valid flag in the descriptor itself. The reason we abandoned this
> was not poor performance (it was good), but a need to go to
> unidirectional queues. Maybe I should have only changed that
> aspect and kept the valid flag.
> 
> Anyway, I will take a look at ptr_ring and run some experiments
> along the lines of what you propose to get some
> numbers. Considering your experience with these kind of
> structures, you are likely right. I just need to convince
> myself :-).

When benchmarking, be careful that you don't measure the "wrong"
queue situation.  When doing this kind of "overload" benchmarking, you
will likely create a situation where the queue is always full (which
hopefully isn't a production use-case).  In the almost/always-full
queue situation, syncing on the element values (as MST proposes)
will still cause the cache-line bouncing that we want to avoid.

MST has explained and addressed this situation for ptr_ring in:
 commit fb9de9704775 ("ptr_ring: batch ring zeroing")
 https://git.kernel.org/torvalds/c/fb9de9704775
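
For context, the idea in that commit, roughly sketched on the consumer
side (illustrative only, not the actual ptr_ring code): slots are
handed back to the producer a batch at a time instead of one by one,
and the batch is zeroed back-to-front so the producer never sees a
partially released batch:

#include <linux/compiler.h>

#define RING_SIZE	1000
#define CONSUMER_BATCH	16	/* e.g. roughly a cache line of entries */

struct flag_ring {
	unsigned int cons_head;		/* next slot to read (consumer only) */
	unsigned int cons_tail;		/* first slot not yet handed back */
	unsigned long slot[RING_SIZE];	/* 0 == empty, != 0 == valid entry */
};

static unsigned long ring_consume_batched(struct flag_ring *r)
{
	unsigned long val = READ_ONCE(r->slot[r->cons_head]);
	unsigned int head, i;

	if (!val)			/* ring empty */
		return 0;

	head = r->cons_head + 1;

	/* Hand slots back only when a full batch has been consumed (or
	 * we hit the end of the array), and zero them back-to-front:
	 * the producer keeps seeing a non-zero slot, and thus "full",
	 * until the whole batch is released.  This dirties the
	 * producer's cache line once per batch instead of once per
	 * entry. */
	if (head - r->cons_tail >= CONSUMER_BATCH || head == RING_SIZE) {
		for (i = head; i-- > r->cons_tail; )
			WRITE_ONCE(r->slot[i], 0);
		r->cons_tail = (head == RING_SIZE) ? 0 : head;
	}
	r->cons_head = (head == RING_SIZE) ? 0 : head;

	return val;
}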
Michael S. Tsirkin April 12, 2018, 2:04 p.m. UTC | #4
On Thu, Apr 12, 2018 at 07:38:25AM +0000, Karlsson, Magnus wrote:
> I think you are definitely right in that there are ways in which
> we can improve performance here. That said, the current queue
> performs slightly better than the previous one we had that was
> more or less a copy of one of your first virtio 1.1 proposals
> from little over a year ago. It had bidirectional queues and a
> valid flag in the descriptor itself. The reason we abandoned this
> was not poor performance (it was good), but a need to go to
> unidirectional queues. Maybe I should have only changed that
> aspect and kept the valid flag.

Is there a summary about unidirectional queues anywhere?  I'm curious to
know whether there are any lessons here to be learned for virtio
or ptr_ring.
Magnus Karlsson April 12, 2018, 3:19 p.m. UTC | #5
> -----Original Message-----
> From: Michael S. Tsirkin [mailto:mst@redhat.com]
> Sent: Thursday, April 12, 2018 4:05 PM
> To: Karlsson, Magnus <magnus.karlsson@intel.com>
> Cc: Björn Töpel <bjorn.topel@gmail.com>; Duyck, Alexander H
> <alexander.h.duyck@intel.com>; alexander.duyck@gmail.com;
> john.fastabend@gmail.com; ast@fb.com; brouer@redhat.com;
> willemdebruijn.kernel@gmail.com; daniel@iogearbox.net;
> netdev@vger.kernel.org; michael.lundkvist@ericsson.com; Brandeburg,
> Jesse <jesse.brandeburg@intel.com>; Singhai, Anjali
> <anjali.singhai@intel.com>; Zhang, Qi Z <qi.z.zhang@intel.com>;
> ravineet.singh@ericsson.com
> Subject: Re: [RFC PATCH v2 03/14] xsk: add umem fill queue support and
> mmap
> 
> On Thu, Apr 12, 2018 at 07:38:25AM +0000, Karlsson, Magnus wrote:
> > I think you are definitely right in that there are ways in which we
> > can improve performance here. That said, the current queue performs
> > slightly better than the previous one we had that was more or less a
> > copy of one of your first virtio 1.1 proposals from little over a year
> > ago. It had bidirectional queues and a valid flag in the descriptor
> > itself. The reason we abandoned this was not poor performance (it was
> > good), but a need to go to unidirectional queues. Maybe I should have
> > only changed that aspect and kept the valid flag.
> 
> Is there a summary about unidirectional queues anywhere?  I'm curious to
> know whether there are any lessons here to be learned for virtio or ptr_ring.

I did a quick hack in which I used your ptr_ring for the fill queue
instead of our head/tail-based one. In the corner cases (usually empty
or usually full), there is basically no difference. But for the case
when the queue is always half full, the ptr_ring implementation boosts
the performance from 5.6 to 5.7 Mpps (as there is no cache line
bouncing in this case) on my system (which is slower than Björn's,
used for the numbers in the RFC).

So I think this should be implemented properly so we can get some real
numbers, especially since 0.1 Mpps with copies will likely become much
more with zero-copy, as we are really chasing cycles there. We will
get back a better evaluation in a few days.

Thanks: Magnus

> --
> MST
Magnus Karlsson April 23, 2018, 10:26 a.m. UTC | #6
> -----Original Message-----
> From: Karlsson, Magnus
> Sent: Thursday, April 12, 2018 5:20 PM
> To: Michael S. Tsirkin <mst@redhat.com>
> Cc: Björn Töpel <bjorn.topel@gmail.com>; Duyck, Alexander H
> <alexander.h.duyck@intel.com>; alexander.duyck@gmail.com;
> john.fastabend@gmail.com; ast@fb.com; brouer@redhat.com;
> willemdebruijn.kernel@gmail.com; daniel@iogearbox.net;
> netdev@vger.kernel.org; michael.lundkvist@ericsson.com; Brandeburg,
> Jesse <jesse.brandeburg@intel.com>; Singhai, Anjali
> <anjali.singhai@intel.com>; Zhang, Qi Z <qi.z.zhang@intel.com>;
> ravineet.singh@ericsson.com
> Subject: RE: [RFC PATCH v2 03/14] xsk: add umem fill queue support and
> mmap
> 
> 
> 
> > -----Original Message-----
> > From: Michael S. Tsirkin [mailto:mst@redhat.com]
> > Sent: Thursday, April 12, 2018 4:05 PM
> > To: Karlsson, Magnus <magnus.karlsson@intel.com>
> > Cc: Björn Töpel <bjorn.topel@gmail.com>; Duyck, Alexander H
> > <alexander.h.duyck@intel.com>; alexander.duyck@gmail.com;
> > john.fastabend@gmail.com; ast@fb.com; brouer@redhat.com;
> > willemdebruijn.kernel@gmail.com; daniel@iogearbox.net;
> > netdev@vger.kernel.org; michael.lundkvist@ericsson.com; Brandeburg,
> > Jesse <jesse.brandeburg@intel.com>; Singhai, Anjali
> > <anjali.singhai@intel.com>; Zhang, Qi Z <qi.z.zhang@intel.com>;
> > ravineet.singh@ericsson.com
> > Subject: Re: [RFC PATCH v2 03/14] xsk: add umem fill queue support and
> > mmap
> >
> > On Thu, Apr 12, 2018 at 07:38:25AM +0000, Karlsson, Magnus wrote:
> > > I think you are definitely right in that there are ways in which we
> > > can improve performance here. That said, the current queue performs
> > > slightly better than the previous one we had that was more or less a
> > > copy of one of your first virtio 1.1 proposals from little over a
> > > year ago. It had bidirectional queues and a valid flag in the
> > > descriptor itself. The reason we abandoned this was not poor
> > > performance (it was good), but a need to go to unidirectional
> > > queues. Maybe I should have only changed that aspect and kept the valid
> flag.
> >
> > Is there a summary about unidirectional queues anywhere?  I'm curious
> > to know whether there are any lessons here to be learned for virtio or
> ptr_ring.
> 
> I did a quick hack in which I used your ptr_ring for the fill queue instead of
> our head/tail based one. In the corner cases (usually empty or usually full),
> there is basically no difference. But for the case when the queue is always
> half full, the ptr_ring implementation boosts the performance from 5.6 to 5.7
> Mpps (as there is no cache line bouncing in this case) on my system (slower
> than Björn's that was used for the numbers in the RFC).
> 
> So I think this should be implemented properly so we can get some real
> numbers.
> Especially since 0.1 Mpps with copies will likely become much more with
> zero-copy as we are really chasing cycles there. We will get back a better
> evaluation in a few days.
> 
> Thanks: Magnus
> 
> > --
> > MST

Hi Michael,

Sorry for the late reply. Been travelling. Björn and I have now
made a real implementation of the ptr_ring principles in the
af_xdp code. We just added a switch in bind (only for the purpose
of this test) to be able to pick which ring implementation to use
from the user space test program. The main difference between our
version of ptr_ring and our head/tail ring is that the ptr_ring
version uses the idx field to signal if the entry is available or
not (idx == 0 indicating empty descriptor) and that it does not
use the head and tail pointers at all. Even though it is not
a "ring of pointers" in our implementation, we will still call it
ptr_ring for the purpose of this mail.

In summary, our experiments show that the two rings perform the
same in our microbenchmarks when the queues are balanced and
rarely full or empty, but the head/tail version performs better
for RX when the queues are not perfectly balanced. Why is that?
We do not know exactly, but there are a number of differences
between a ptr_ring in the kernel and one between user and kernel
space as used in af_xdp.

* The user space descriptors have to be validated as we are
  communicating between user space and kernel space. This is done
  slightly differently for the two rings due to the batching below.

* The RX and TX rings have descriptors that are larger than one
  pointer, so we need barriers here even with ptr_ring. We cannot
  rely on an address dependency because the value is not a pointer.

* Batching is performed slightly differently in the two versions. We
  avoid touching head and tail for as long as possible. At the
  worst it is once per batch, but it might be much less than that
  on the consumer side. The drawback with the accesses to the
  head/tail pointers is that they usually end up being a cache
  line bounce. But with ptr_ring, the drawback is that there are
  always N writes (setting idx = 0) for a batch size of N. The
  good thing, though, is that these will not incur any cache
  line bouncing if the rings are balanced (well, they will be
  read by the producer at some point, but only once per traversal
  of the ring).

Something to note is that we think the head/tail version provides
an easier-to-use user space interface, since the indexes start
from 0 instead of from 1 as in the ptr_ring case. With ptr_ring
you have to teach the user space application writer not to use
index 0. With the head/tail version, no such restriction is needed.
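
To make the comparison concrete, here is a hedged sketch of the two
user-space fill-queue producers being compared (hypothetical helpers,
not code from the series; the head/tail variant assumes free-running
indices with the producer advancing head_idx, which may be named
differently in our test program):

#include <linux/if_xdp.h>

/* Variant A: head/tail ring from this patch.  'idx' is a umem frame
 * index, 'nentries' a power of two. */
static int fq_produce_headtail(struct xdp_umem_queue *q, __u32 nentries,
			       __u32 idx)
{
	__u32 head = q->ptrs.head_idx;

	if (head - q->ptrs.tail_idx == nentries)
		return -1;			/* ring full */

	q->desc[head & (nentries - 1)] = idx;
	__sync_synchronize();	/* descriptor must be visible before the
				 * new head; the kernel pairs this with
				 * a read barrier */
	q->ptrs.head_idx = head + 1;
	return 0;
}

/* Variant B: ptr_ring-style.  head_idx/tail_idx are not used; a slot
 * containing 0 means "empty", which is why frame index 0 cannot be
 * used (the usability wart mentioned above).  *cached_prod is a
 * producer cursor private to user space, so nothing but the 32-bit
 * descriptor itself is written to shared memory. */
static int fq_produce_ptrring(struct xdp_umem_queue *q, __u32 nentries,
			      __u32 *cached_prod, __u32 idx /* != 0 */)
{
	__u32 slot = *cached_prod & (nentries - 1);

	if (q->desc[slot])
		return -1;			/* kernel not done with it */

	q->desc[slot] = idx;
	(*cached_prod)++;
	return 0;
}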

Here are just some of the results for a workload where user space
is faster than kernel space. This is for the case in which the user
space program has no problem keeping up with the kernel.

head/tail 16-batch

  sock0@p3p2:16 rxdrop
                 pps        
rx              9,782,718   
tx              0           

  sock0@p3p2:16 l2fwd
                 pps        
rx              2,504,235   
tx              2,504,232   


ptr_ring 16-batch

  sock0@p3p2:16 rxdrop
                 pps        
rx              9,519,373   
tx              0           

  sock0@p3p2:16 l2fwd
                 pps        
rx              2,519,265   
tx              2,519,265   


ptr_ring with batch sizes calculated as in ptr_ring.h

  sock0@p3p2:16 rxdrop
                 pps        
rx              7,470,658   
tx              0           
^C

  sock0@p3p2:16 l2fwd
                 pps        
rx              2,431,701   
tx              2,431,701   

/Magnus

Patch

diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h
index 69ccb3b0a3f2..0de1bbf2c5c7 100644
--- a/include/uapi/linux/if_xdp.h
+++ b/include/uapi/linux/if_xdp.h
@@ -22,6 +22,7 @@ 
 
 /* XDP socket options */
 #define XDP_UMEM_REG		  3
+#define XDP_UMEM_FILL_QUEUE	  4
 
 struct xdp_umem_reg {
 	__u64 addr; /* Start of packet data area */
@@ -30,4 +31,18 @@  struct xdp_umem_reg {
 	__u32 frame_headroom; /* Frame head room */
 };
 
+/* Pgoff for mmaping the rings */
+#define XDP_UMEM_PGOFF_FILL_QUEUE	0x100000000
+
+struct xdp_queue {
+	__u32 head_idx __attribute__((aligned(64)));
+	__u32 tail_idx __attribute__((aligned(64)));
+};
+
+/* Used for the fill and completion queues for buffers */
+struct xdp_umem_queue {
+	struct xdp_queue ptrs;
+	__u32 desc[0] __attribute__((aligned(64)));
+};
+
 #endif /* _LINUX_IF_XDP_H */
diff --git a/net/xdp/Makefile b/net/xdp/Makefile
index a5d736640a0f..074fb2b2d51c 100644
--- a/net/xdp/Makefile
+++ b/net/xdp/Makefile
@@ -1,2 +1,2 @@ 
-obj-$(CONFIG_XDP_SOCKETS) += xsk.o xdp_umem.o
+obj-$(CONFIG_XDP_SOCKETS) += xsk.o xdp_umem.o xsk_queue.o
 
diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c
index 8f768a7887da..f2c0768ca63e 100644
--- a/net/xdp/xdp_umem.c
+++ b/net/xdp/xdp_umem.c
@@ -66,6 +66,11 @@  static void xdp_umem_release(struct xdp_umem *umem)
 	struct mm_struct *mm;
 	unsigned long diff;
 
+	if (umem->fq) {
+		xskq_destroy(umem->fq);
+		umem->fq = NULL;
+	}
+
 	if (umem->pgs) {
 		xdp_umem_unpin_pages(umem);
 
diff --git a/net/xdp/xdp_umem.h b/net/xdp/xdp_umem.h
index d9bbbb880088..b3538dd2118c 100644
--- a/net/xdp/xdp_umem.h
+++ b/net/xdp/xdp_umem.h
@@ -18,9 +18,11 @@ 
 #include <linux/mm.h>
 #include <linux/if_xdp.h>
 
+#include "xsk_queue.h"
 #include "xdp_umem_props.h"
 
 struct xdp_umem {
+	struct xsk_queue *fq;
 	struct pid *pid;
 	struct page **pgs;
 	unsigned long address;
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index cc0f26e17baf..6ff1d1f3322f 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -32,6 +32,7 @@ 
 #include <linux/netdevice.h>
 #include <net/sock.h>
 
+#include "xsk_queue.h"
 #include "xdp_umem.h"
 
 struct xdp_sock {
@@ -47,6 +48,21 @@  static struct xdp_sock *xdp_sk(struct sock *sk)
 	return (struct xdp_sock *)sk;
 }
 
+static int xsk_init_queue(u32 entries, struct xsk_queue **queue)
+{
+	struct xsk_queue *q;
+
+	if (entries == 0 || *queue || !is_power_of_2(entries))
+		return -EINVAL;
+
+	q = xskq_create(entries);
+	if (!q)
+		return -ENOMEM;
+
+	*queue = q;
+	return 0;
+}
+
 static int xsk_release(struct socket *sock)
 {
 	struct sock *sk = sock->sk;
@@ -109,6 +125,23 @@  static int xsk_setsockopt(struct socket *sock, int level, int optname,
 		mutex_unlock(&xs->mutex);
 		return 0;
 	}
+	case XDP_UMEM_FILL_QUEUE:
+	{
+		struct xsk_queue **q;
+		int entries;
+
+		if (!xs->umem)
+			return -EINVAL;
+
+		if (copy_from_user(&entries, optval, sizeof(entries)))
+			return -EFAULT;
+
+		mutex_lock(&xs->mutex);
+		q = &xs->umem->fq;
+		err = xsk_init_queue(entries, q);
+		mutex_unlock(&xs->mutex);
+		return err;
+	}
 	default:
 		break;
 	}
@@ -116,6 +149,36 @@  static int xsk_setsockopt(struct socket *sock, int level, int optname,
 	return -ENOPROTOOPT;
 }
 
+static int xsk_mmap(struct file *file, struct socket *sock,
+		    struct vm_area_struct *vma)
+{
+	unsigned long offset = vma->vm_pgoff << PAGE_SHIFT;
+	unsigned long size = vma->vm_end - vma->vm_start;
+	struct xdp_sock *xs = xdp_sk(sock->sk);
+	struct xsk_queue *q;
+	unsigned long pfn;
+	struct page *qpg;
+	int err;
+
+	if (!xs->umem)
+		return -EINVAL;
+
+	if (offset == XDP_UMEM_PGOFF_FILL_QUEUE)
+		q = xs->umem->fq;
+	else
+		return -EINVAL;
+
+	qpg = virt_to_head_page(q->ring);
+	if (size > (PAGE_SIZE << compound_order(qpg)))
+		return -EINVAL;
+
+	pfn = virt_to_phys(q->ring) >> PAGE_SHIFT;
+	err = remap_pfn_range(vma, vma->vm_start, pfn,
+			      size, vma->vm_page_prot);
+
+	return err;
+}
+
 static struct proto xsk_proto = {
 	.name =		"XDP",
 	.owner =	THIS_MODULE,
@@ -139,7 +202,7 @@  static const struct proto_ops xsk_proto_ops = {
 	.getsockopt =	sock_no_getsockopt,
 	.sendmsg =	sock_no_sendmsg,
 	.recvmsg =	sock_no_recvmsg,
-	.mmap =		sock_no_mmap,
+	.mmap =		xsk_mmap,
 	.sendpage =	sock_no_sendpage,
 };
 
diff --git a/net/xdp/xsk_queue.c b/net/xdp/xsk_queue.c
new file mode 100644
index 000000000000..fd4bb06aa112
--- /dev/null
+++ b/net/xdp/xsk_queue.c
@@ -0,0 +1,54 @@ 
+/*
+ * XDP user-space ring structure
+ * Copyright(c) 2018 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+
+#include <linux/slab.h>
+
+#include "xsk_queue.h"
+
+struct xsk_queue *xskq_create(u32 nentries)
+{
+	struct xsk_queue *q;
+	gfp_t gfp_flags;
+	size_t size;
+
+	q = kzalloc(sizeof(*q), GFP_KERNEL);
+	if (!q)
+		return NULL;
+
+	q->nentries = nentries;
+	q->ring_mask = nentries - 1;
+
+	gfp_flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN |
+		    __GFP_COMP  | __GFP_NORETRY;
+	size = xskq_umem_get_ring_size(q);
+	q->validation = XSK_VALIDATION_RX;
+
+	q->ring = (struct xdp_queue *)__get_free_pages(gfp_flags,
+						       get_order(size));
+	if (!q->ring) {
+		kfree(q);
+		return NULL;
+	}
+
+	return q;
+}
+
+void xskq_destroy(struct xsk_queue *q)
+{
+	if (!q)
+		return;
+
+	page_frag_free(q->ring);
+	kfree(q);
+}
diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h
new file mode 100644
index 000000000000..fe845a20c153
--- /dev/null
+++ b/net/xdp/xsk_queue.h
@@ -0,0 +1,49 @@ 
+/*
+ * XDP user-space ring structure
+ * Copyright(c) 2018 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+
+#ifndef _LINUX_XDP_QUEUE_H
+#define _LINUX_XDP_QUEUE_H
+
+#include <linux/types.h>
+#include <linux/if_xdp.h>
+
+#include "xdp_umem_props.h"
+
+enum xsk_validation {
+	XSK_VALIDATION_RX,	  /* Only address to packet buffer validated */
+};
+
+struct xsk_queue {
+	enum xsk_validation validation;
+	struct xdp_umem_props *umem_props;
+	u32 ring_mask;
+	u32 nentries;
+	u32 iter_head_idx;
+	u32 cached_head;
+	u32 cached_tail;
+	struct xdp_queue *ring;
+	u64 invalid_descs;
+};
+
+/* Functions operating on UMEM queues only */
+
+static inline u32 xskq_umem_get_ring_size(struct xsk_queue *q)
+{
+	return sizeof(struct xdp_umem_queue) + q->nentries * sizeof(u32);
+}
+
+struct xsk_queue *xskq_create(u32 nentries);
+void xskq_destroy(struct xsk_queue *q_ops);
+
+#endif /* _LINUX_XDP_QUEUE_H */