diff mbox series

[RFC,01/14] packet: introduce AF_PACKET V4 userspace API

Message ID 20171031124145.9667-2-bjorn.topel@gmail.com
State RFC, archived
Delegated to: David Miller
Headers show
Series Introducing AF_PACKET V4 support | expand

Commit Message

Björn Töpel Oct. 31, 2017, 12:41 p.m. UTC
From: Björn Töpel <bjorn.topel@intel.com>

This patch adds the necessary AF_PACKET V4 structures for usage from
userspace. AF_PACKET V4 is a new interface optimized for high
performance packet processing.

Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
---
 include/uapi/linux/if_packet.h | 65 +++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 64 insertions(+), 1 deletion(-)

Comments

Willem de Bruijn Nov. 2, 2017, 1:45 a.m. UTC | #1
On Tue, Oct 31, 2017 at 9:41 PM, Björn Töpel <bjorn.topel@gmail.com> wrote:
> From: Björn Töpel <bjorn.topel@intel.com>
>
> This patch adds the necessary AF_PACKET V4 structures for usage from
> userspace. AF_PACKET V4 is a new interface optimized for high
> performance packet processing.
>
> Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
> ---
>  include/uapi/linux/if_packet.h | 65 +++++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 64 insertions(+), 1 deletion(-)
>
> +struct tpacket4_queue {
> +       struct tpacket4_desc *ring;
> +
> +       unsigned int avail_idx;
> +       unsigned int last_used_idx;
> +       unsigned int num_free;
> +       unsigned int ring_mask;
> +};
>
>  struct packet_mreq {
> @@ -294,6 +335,28 @@ struct packet_mreq {
>         unsigned char   mr_address[8];
>  };
>
> +/*
> + * struct tpacket_memreg_req is used in conjunction with PACKET_MEMREG
> + * to register user memory which should be used to store the packet
> + * data.
> + *
> + * There are some constraints for the memory being registered:
> + * - The memory area has to be memory page size aligned.
> + * - The frame size has to be a power of 2.
> + * - The frame size cannot be smaller than 2048B.
> + * - The frame size cannot be larger than the memory page size.
> + *
> + * Corollary: The number of frames that can be stored is
> + * len / frame_size.
> + *
> + */
> +struct tpacket_memreg_req {
> +       unsigned long   addr;           /* Start of packet data area */
> +       unsigned long   len;            /* Length of packet data area */
> +       unsigned int    frame_size;     /* Frame size */
> +       unsigned int    data_headroom;  /* Frame head room */
> +};

Existing packet sockets take a tpacket_req, allocate memory and let the
user process mmap this. I understand that TPACKET_V4 distinguishes
the descriptor from packet pools, but could both use the existing structs
and logic (packet_mmap)? That would avoid introducing a lot of new code
just for granting user pages to the kernel.

Also, use of unsigned long can cause problems on 32/64 bit compat
environments. Prefer fixed width types in uapi. Same for pointer in
tpacket4_queue.
Björn Töpel Nov. 2, 2017, 10:06 a.m. UTC | #2
On 2017-11-02 02:45, Willem de Bruijn wrote:
> On Tue, Oct 31, 2017 at 9:41 PM, Björn Töpel <bjorn.topel@gmail.com> wrote:
>> From: Björn Töpel <bjorn.topel@intel.com>
>>
>> This patch adds the necessary AF_PACKET V4 structures for usage from
>> userspace. AF_PACKET V4 is a new interface optimized for high
>> performance packet processing.
>>
>> Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
>> ---
>>   include/uapi/linux/if_packet.h | 65 +++++++++++++++++++++++++++++++++++++++++-
>>   1 file changed, 64 insertions(+), 1 deletion(-)
>>
>> +struct tpacket4_queue {
>> +       struct tpacket4_desc *ring;
>> +
>> +       unsigned int avail_idx;
>> +       unsigned int last_used_idx;
>> +       unsigned int num_free;
>> +       unsigned int ring_mask;
>> +};
>>
>>   struct packet_mreq {
>> @@ -294,6 +335,28 @@ struct packet_mreq {
>>          unsigned char   mr_address[8];
>>   };
>>
>> +/*
>> + * struct tpacket_memreg_req is used in conjunction with PACKET_MEMREG
>> + * to register user memory which should be used to store the packet
>> + * data.
>> + *
>> + * There are some constraints for the memory being registered:
>> + * - The memory area has to be memory page size aligned.
>> + * - The frame size has to be a power of 2.
>> + * - The frame size cannot be smaller than 2048B.
>> + * - The frame size cannot be larger than the memory page size.
>> + *
>> + * Corollary: The number of frames that can be stored is
>> + * len / frame_size.
>> + *
>> + */
>> +struct tpacket_memreg_req {
>> +       unsigned long   addr;           /* Start of packet data area */
>> +       unsigned long   len;            /* Length of packet data area */
>> +       unsigned int    frame_size;     /* Frame size */
>> +       unsigned int    data_headroom;  /* Frame head room */
>> +};
>
> Existing packet sockets take a tpacket_req, allocate memory and let the
> user process mmap this. I understand that TPACKET_V4 distinguishes
> the descriptor from packet pools, but could both use the existing structs
> and logic (packet_mmap)? That would avoid introducing a lot of new code
> just for granting user pages to the kernel.
>

We could certainly pass the "tpacket_memreg_req" fields as part of
descriptor ring setup ("tpacket_req4"), but we went with having the
memory register as a new separate setsockopt. Having it separated,
makes it easier to compare regions at the kernel side of things. "Is
this the same umem as another one?" If we go the path of passing the
range at descriptor ring setup, we need to handle all kind of
overlapping ranges to determine when a copy is needed or not, in those
cases where the packet buffer (i.e. umem) is shared between processes.

> Also, use of unsigned long can cause problems on 32/64 bit compat
> environments. Prefer fixed width types in uapi. Same for pointer in
> tpacket4_queue.

I agree; We'll change to a fixed width type in next version. Do you
(and others on the list) prefer __u32/__u64 or unsigned int / unsigned
long long?


Thanks,
Björn
Tushar Dave Nov. 2, 2017, 4:40 p.m. UTC | #3
On 11/02/2017 03:06 AM, Björn Töpel wrote:
> On 2017-11-02 02:45, Willem de Bruijn wrote:
>> On Tue, Oct 31, 2017 at 9:41 PM, Björn Töpel <bjorn.topel@gmail.com> wrote:
>>> From: Björn Töpel <bjorn.topel@intel.com>
>>>
>>> This patch adds the necessary AF_PACKET V4 structures for usage from
>>> userspace. AF_PACKET V4 is a new interface optimized for high
>>> performance packet processing.
>>>
>>> Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
>>> ---
>>>    include/uapi/linux/if_packet.h | 65 +++++++++++++++++++++++++++++++++++++++++-
>>>    1 file changed, 64 insertions(+), 1 deletion(-)
>>>
>>> +struct tpacket4_queue {
>>> +       struct tpacket4_desc *ring;
>>> +
>>> +       unsigned int avail_idx;
>>> +       unsigned int last_used_idx;
>>> +       unsigned int num_free;
>>> +       unsigned int ring_mask;
>>> +};
>>>
>>>    struct packet_mreq {
>>> @@ -294,6 +335,28 @@ struct packet_mreq {
>>>           unsigned char   mr_address[8];
>>>    };
>>>
>>> +/*
>>> + * struct tpacket_memreg_req is used in conjunction with PACKET_MEMREG
>>> + * to register user memory which should be used to store the packet
>>> + * data.
>>> + *
>>> + * There are some constraints for the memory being registered:
>>> + * - The memory area has to be memory page size aligned.
>>> + * - The frame size has to be a power of 2.
>>> + * - The frame size cannot be smaller than 2048B.
>>> + * - The frame size cannot be larger than the memory page size.
>>> + *
>>> + * Corollary: The number of frames that can be stored is
>>> + * len / frame_size.
>>> + *
>>> + */
>>> +struct tpacket_memreg_req {
>>> +       unsigned long   addr;           /* Start of packet data area */
>>> +       unsigned long   len;            /* Length of packet data area */
>>> +       unsigned int    frame_size;     /* Frame size */
>>> +       unsigned int    data_headroom;  /* Frame head room */
>>> +};
>>
>> Existing packet sockets take a tpacket_req, allocate memory and let the
>> user process mmap this. I understand that TPACKET_V4 distinguishes
>> the descriptor from packet pools, but could both use the existing structs
>> and logic (packet_mmap)? That would avoid introducing a lot of new code
>> just for granting user pages to the kernel.
>>
> 
> We could certainly pass the "tpacket_memreg_req" fields as part of
> descriptor ring setup ("tpacket_req4"), but we went with having the
> memory register as a new separate setsockopt. Having it separated,
> makes it easier to compare regions at the kernel side of things. "Is
> this the same umem as another one?" If we go the path of passing the
> range at descriptor ring setup, we need to handle all kind of
> overlapping ranges to determine when a copy is needed or not, in those
> cases where the packet buffer (i.e. umem) is shared between processes.

Is there a reason to use separate packet socket for umem? Looks like
userspace has to create separate packet socket for PACKET_MEMREG.


-Tushar>
>> Also, use of unsigned long can cause problems on 32/64 bit compat
>> environments. Prefer fixed width types in uapi. Same for pointer in
>> tpacket4_queue.
> 
> I agree; We'll change to a fixed width type in next version. Do you
> (and others on the list) prefer __u32/__u64 or unsigned int / unsigned
> long long?
> 
> 
> Thanks,
> Björn
>
Björn Töpel Nov. 2, 2017, 4:47 p.m. UTC | #4
2017-11-02 17:40 GMT+01:00 Tushar Dave <tushar.n.dave@oracle.com>:
>
>
> On 11/02/2017 03:06 AM, Björn Töpel wrote:
>>
>> On 2017-11-02 02:45, Willem de Bruijn wrote:
>>>
>>> On Tue, Oct 31, 2017 at 9:41 PM, Björn Töpel <bjorn.topel@gmail.com>
>>> wrote:
>>>>
>>>> From: Björn Töpel <bjorn.topel@intel.com>
>>>>
>>>> This patch adds the necessary AF_PACKET V4 structures for usage from
>>>> userspace. AF_PACKET V4 is a new interface optimized for high
>>>> performance packet processing.
>>>>
>>>> Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
>>>> ---
>>>>    include/uapi/linux/if_packet.h | 65
>>>> +++++++++++++++++++++++++++++++++++++++++-
>>>>    1 file changed, 64 insertions(+), 1 deletion(-)
>>>>
>>>> +struct tpacket4_queue {
>>>> +       struct tpacket4_desc *ring;
>>>> +
>>>> +       unsigned int avail_idx;
>>>> +       unsigned int last_used_idx;
>>>> +       unsigned int num_free;
>>>> +       unsigned int ring_mask;
>>>> +};
>>>>
>>>>    struct packet_mreq {
>>>> @@ -294,6 +335,28 @@ struct packet_mreq {
>>>>           unsigned char   mr_address[8];
>>>>    };
>>>>
>>>> +/*
>>>> + * struct tpacket_memreg_req is used in conjunction with PACKET_MEMREG
>>>> + * to register user memory which should be used to store the packet
>>>> + * data.
>>>> + *
>>>> + * There are some constraints for the memory being registered:
>>>> + * - The memory area has to be memory page size aligned.
>>>> + * - The frame size has to be a power of 2.
>>>> + * - The frame size cannot be smaller than 2048B.
>>>> + * - The frame size cannot be larger than the memory page size.
>>>> + *
>>>> + * Corollary: The number of frames that can be stored is
>>>> + * len / frame_size.
>>>> + *
>>>> + */
>>>> +struct tpacket_memreg_req {
>>>> +       unsigned long   addr;           /* Start of packet data area */
>>>> +       unsigned long   len;            /* Length of packet data area */
>>>> +       unsigned int    frame_size;     /* Frame size */
>>>> +       unsigned int    data_headroom;  /* Frame head room */
>>>> +};
>>>
>>>
>>> Existing packet sockets take a tpacket_req, allocate memory and let the
>>> user process mmap this. I understand that TPACKET_V4 distinguishes
>>> the descriptor from packet pools, but could both use the existing structs
>>> and logic (packet_mmap)? That would avoid introducing a lot of new code
>>> just for granting user pages to the kernel.
>>>
>>
>> We could certainly pass the "tpacket_memreg_req" fields as part of
>> descriptor ring setup ("tpacket_req4"), but we went with having the
>> memory register as a new separate setsockopt. Having it separated,
>> makes it easier to compare regions at the kernel side of things. "Is
>> this the same umem as another one?" If we go the path of passing the
>> range at descriptor ring setup, we need to handle all kind of
>> overlapping ranges to determine when a copy is needed or not, in those
>> cases where the packet buffer (i.e. umem) is shared between processes.
>
>
> Is there a reason to use separate packet socket for umem? Looks like
> userspace has to create separate packet socket for PACKET_MEMREG.
>

Let me clarify; You *can* use a separate socket for umem, but
you can also use the same/existing AF_PACKET socket for that.


Björn

>
> -Tushar>
>
>>> Also, use of unsigned long can cause problems on 32/64 bit compat
>>> environments. Prefer fixed width types in uapi. Same for pointer in
>>> tpacket4_queue.
>>
>>
>> I agree; We'll change to a fixed width type in next version. Do you
>> (and others on the list) prefer __u32/__u64 or unsigned int / unsigned
>> long long?
>>
>>
>> Thanks,
>> Björn
>>
>
Willem de Bruijn Nov. 3, 2017, 2:29 a.m. UTC | #5
>>> +/*
>>> + * struct tpacket_memreg_req is used in conjunction with PACKET_MEMREG
>>> + * to register user memory which should be used to store the packet
>>> + * data.
>>> + *
>>> + * There are some constraints for the memory being registered:
>>> + * - The memory area has to be memory page size aligned.
>>> + * - The frame size has to be a power of 2.
>>> + * - The frame size cannot be smaller than 2048B.
>>> + * - The frame size cannot be larger than the memory page size.
>>> + *
>>> + * Corollary: The number of frames that can be stored is
>>> + * len / frame_size.
>>> + *
>>> + */
>>> +struct tpacket_memreg_req {
>>> +       unsigned long   addr;           /* Start of packet data area */
>>> +       unsigned long   len;            /* Length of packet data area */
>>> +       unsigned int    frame_size;     /* Frame size */
>>> +       unsigned int    data_headroom;  /* Frame head room */
>>> +};
>>
>> Existing packet sockets take a tpacket_req, allocate memory and let the
>> user process mmap this. I understand that TPACKET_V4 distinguishes
>> the descriptor from packet pools, but could both use the existing structs
>> and logic (packet_mmap)? That would avoid introducing a lot of new code
>> just for granting user pages to the kernel.
>>
>
> We could certainly pass the "tpacket_memreg_req" fields as part of
> descriptor ring setup ("tpacket_req4"), but we went with having the
> memory register as a new separate setsockopt. Having it separated,
> makes it easier to compare regions at the kernel side of things. "Is
> this the same umem as another one?" If we go the path of passing the
> range at descriptor ring setup, we need to handle all kind of
> overlapping ranges to determine when a copy is needed or not, in those
> cases where the packet buffer (i.e. umem) is shared between processes.

That's not what I meant. Both descriptor rings and packet pools are
memory regions. Packet sockets already have logic to allocate regions
and make them available to userspace with mmap(). Packet v4 reuses
that logic for its descriptor rings. Can it use the same for its packet
pool? Why does the kernel map user memory, instead? That is a lot of
non-trivial new logic.
Björn Töpel Nov. 3, 2017, 9:54 a.m. UTC | #6
2017-11-03 3:29 GMT+01:00 Willem de Bruijn <willemdebruijn.kernel@gmail.com>:
>>>> +/*
>>>> + * struct tpacket_memreg_req is used in conjunction with PACKET_MEMREG
>>>> + * to register user memory which should be used to store the packet
>>>> + * data.
>>>> + *
>>>> + * There are some constraints for the memory being registered:
>>>> + * - The memory area has to be memory page size aligned.
>>>> + * - The frame size has to be a power of 2.
>>>> + * - The frame size cannot be smaller than 2048B.
>>>> + * - The frame size cannot be larger than the memory page size.
>>>> + *
>>>> + * Corollary: The number of frames that can be stored is
>>>> + * len / frame_size.
>>>> + *
>>>> + */
>>>> +struct tpacket_memreg_req {
>>>> +       unsigned long   addr;           /* Start of packet data area */
>>>> +       unsigned long   len;            /* Length of packet data area */
>>>> +       unsigned int    frame_size;     /* Frame size */
>>>> +       unsigned int    data_headroom;  /* Frame head room */
>>>> +};
>>>
>>> Existing packet sockets take a tpacket_req, allocate memory and let the
>>> user process mmap this. I understand that TPACKET_V4 distinguishes
>>> the descriptor from packet pools, but could both use the existing structs
>>> and logic (packet_mmap)? That would avoid introducing a lot of new code
>>> just for granting user pages to the kernel.
>>>
>>
>> We could certainly pass the "tpacket_memreg_req" fields as part of
>> descriptor ring setup ("tpacket_req4"), but we went with having the
>> memory register as a new separate setsockopt. Having it separated,
>> makes it easier to compare regions at the kernel side of things. "Is
>> this the same umem as another one?" If we go the path of passing the
>> range at descriptor ring setup, we need to handle all kind of
>> overlapping ranges to determine when a copy is needed or not, in those
>> cases where the packet buffer (i.e. umem) is shared between processes.
>
> That's not what I meant. Both descriptor rings and packet pools are
> memory regions. Packet sockets already have logic to allocate regions
> and make them available to userspace with mmap(). Packet v4 reuses
> that logic for its descriptor rings. Can it use the same for its packet
> pool? Why does the kernel map user memory, instead? That is a lot of
> non-trivial new logic.

Ah, got it. So, why do we register packet pool memory, instead of
allocating in the kernel and mapping *that* memory.

Actually, we started out with that approach, where the packet_mmap
call mapped Tx/Rx descriptor rings and the packet buffer region. We
later moved to this (register umem) approach, because it's more
flexible for user space, not having to use a AF_PACKET specific
allocator (i.e. continue to use regular mallocs, huge pages and such).

I agree that the memory register code is adding a lot of new logic,
but I believe it's worth the flexibility for user space. I'm looking
into if I can share the memory register logic from Infiniband/verbs
subsystem (drivers/infiniband/core/umem.c).


Björn
chetan L Nov. 15, 2017, 10:21 p.m. UTC | #7
>
> Actually, we started out with that approach, where the packet_mmap
> call mapped Tx/Rx descriptor rings and the packet buffer region. We
> later moved to this (register umem) approach, because it's more
> flexible for user space, not having to use a AF_PACKET specific
> allocator (i.e. continue to use regular mallocs, huge pages and such).
>


One quick question:
Any thoughts on SVM support?
Is SVM support going to be so disruptive that we will need to churn a tp_v5?

If not then to accommodate future SVM enablement do you think it might
make sense to add/stuff a control-info union in the tp4_queue (or umem
etc). And then in the future, I think setmemreg (or something else)
would need to pass the PASID in addition to the malloc'd addr.
Assumption here is that the user-app will bind PID<->PASID before
invoking the AF_ZC setup.



> Björn

Chetan
chetan L Nov. 15, 2017, 10:34 p.m. UTC | #8
On Tue, Oct 31, 2017 at 5:41 AM, Björn Töpel <bjorn.topel@gmail.com> wrote:
> From: Björn Töpel <bjorn.topel@intel.com>
>

> +/*
> + * struct tpacket_memreg_req is used in conjunction with PACKET_MEMREG
> + * to register user memory which should be used to store the packet
> + * data.
> + *
> + * There are some constraints for the memory being registered:
> + * - The memory area has to be memory page size aligned.
> + * - The frame size has to be a power of 2.
> + * - The frame size cannot be smaller than 2048B.
> + * - The frame size cannot be larger than the memory page size.
> + *
> + * Corollary: The number of frames that can be stored is
> + * len / frame_size.
> + *
> + */
> +struct tpacket_memreg_req {
> +       unsigned long   addr;           /* Start of packet data area */
> +       unsigned long   len;            /* Length of packet data area */
> +       unsigned int    frame_size;     /* Frame size */
> +       unsigned int    data_headroom;  /* Frame head room */
> +};
> +

I have not reviewed the entire patchset but I think if we could add a
version_hdr and then unionize the fields, it might be easier to add
SVM support without having to spin v5. I could be wrong though.


Chetan
David Miller Nov. 16, 2017, 1:44 a.m. UTC | #9
From: chet l <loke.chetan@gmail.com>
Date: Wed, 15 Nov 2017 14:34:32 -0800

> I have not reviewed the entire patchset but I think if we could add a
> version_hdr and then unionize the fields, it might be easier to add
> SVM support without having to spin v5. I could be wrong though.

Please, NO VERSION FIELDS!

Design things properly from the start rather than using a crutch of
being able to "adjust things later".
Jesper Dangaard Brouer Nov. 16, 2017, 4:53 p.m. UTC | #10
On Wed, 15 Nov 2017 14:21:38 -0800
chet l <loke.chetan@gmail.com> wrote:

> One quick question:
> Any thoughts on SVM support?

What is SVM ?
chetan L Nov. 16, 2017, 7:32 p.m. UTC | #11
On Wed, Nov 15, 2017 at 5:44 PM, David Miller <davem@davemloft.net> wrote:
> From: chet l <loke.chetan@gmail.com>
> Date: Wed, 15 Nov 2017 14:34:32 -0800
>
>> I have not reviewed the entire patchset but I think if we could add a
>> version_hdr and then unionize the fields, it might be easier to add
>> SVM support without having to spin v5. I could be wrong though.
>
> Please, NO VERSION FIELDS!
>
> Design things properly from the start rather than using a crutch of
> being able to "adjust things later".

Agreed. If this step in tpkt_v4 is able to follow what req1/2/3 did as
part of the setsockopt(..) API then it should be ok. If its a
different API then it will be difficult for the follow-on version(s)
to make seamless changes.

Look at tpacket_req3 for example. Since there was no hdr, I had no
option but to align the fields with tpacket_req/req2 during the setup.
I won't have access to a SMMUv3 capable ARM platform anytime soon. So
I can't actually test/write anything as of now.


Chetan
chetan L Nov. 17, 2017, 3:32 a.m. UTC | #12
On Thu, Nov 16, 2017 at 8:53 AM, Jesper Dangaard Brouer
<brouer@redhat.com> wrote:
> On Wed, 15 Nov 2017 14:21:38 -0800
> chet l <loke.chetan@gmail.com> wrote:
>
>> One quick question:
>> Any thoughts on SVM support?
>
> What is SVM ?
>

Shared Virtual Memory(PCIe based). So going back to one of your
mapping examples. The protocol can be AF_CHANNEL.
Modes could be:
AF_ZC , AF_XDP_REDIRECT

Mapping types could be:
AF_NON_SVM(current setup - no PASID needed), AF_SVM(onus is on the
user to pass the PASID as part of the setsockopt), AF_SVM++


Chetan
diff mbox series

Patch

diff --git a/include/uapi/linux/if_packet.h b/include/uapi/linux/if_packet.h
index 4df96a7dd4fa..8eabcd1b370a 100644
--- a/include/uapi/linux/if_packet.h
+++ b/include/uapi/linux/if_packet.h
@@ -56,6 +56,8 @@  struct sockaddr_ll {
 #define PACKET_QDISC_BYPASS		20
 #define PACKET_ROLLOVER_STATS		21
 #define PACKET_FANOUT_DATA		22
+#define PACKET_MEMREG			23
+#define PACKET_ZEROCOPY			24
 
 #define PACKET_FANOUT_HASH		0
 #define PACKET_FANOUT_LB		1
@@ -243,13 +245,35 @@  struct tpacket_block_desc {
 	union tpacket_bd_header_u hdr;
 };
 
+#define TP4_DESC_KERNEL	0x0080 /* The descriptor is owned by the kernel */
+#define TP4_PKT_CONT	1 /* The packet continues in the next descriptor */
+
+struct tpacket4_desc {
+	__u32 idx;
+	__u32 len;
+	__u16 offset;
+	__u8  error; /* an errno */
+	__u8  flags;
+	__u8  padding[4];
+};
+
+struct tpacket4_queue {
+	struct tpacket4_desc *ring;
+
+	unsigned int avail_idx;
+	unsigned int last_used_idx;
+	unsigned int num_free;
+	unsigned int ring_mask;
+};
+
 #define TPACKET2_HDRLEN		(TPACKET_ALIGN(sizeof(struct tpacket2_hdr)) + sizeof(struct sockaddr_ll))
 #define TPACKET3_HDRLEN		(TPACKET_ALIGN(sizeof(struct tpacket3_hdr)) + sizeof(struct sockaddr_ll))
 
 enum tpacket_versions {
 	TPACKET_V1,
 	TPACKET_V2,
-	TPACKET_V3
+	TPACKET_V3,
+	TPACKET_V4
 };
 
 /*
@@ -282,9 +306,26 @@  struct tpacket_req3 {
 	unsigned int	tp_feature_req_word;
 };
 
+/* V4 frame structure
+ *
+ * The v4 frame is contained within a frame defined by
+ * PACKET_MEMREG/struct tpacket_memreg_req. Each frame is frame_size
+ * bytes, and laid out as following:
+ *
+ * - Start.
+ * - Gap, at least data_headroom (from struct tpacket_memreg_req),
+ *   chosen so that packet data (Start+data) is at least 64B aligned.
+ */
+
+struct tpacket_req4 {
+	int		mr_fd;	 /* File descriptor for registered buffers */
+	unsigned int	desc_nr; /* Number of entries in descriptor ring */
+};
+
 union tpacket_req_u {
 	struct tpacket_req	req;
 	struct tpacket_req3	req3;
+	struct tpacket_req4	req4;
 };
 
 struct packet_mreq {
@@ -294,6 +335,28 @@  struct packet_mreq {
 	unsigned char	mr_address[8];
 };
 
+/*
+ * struct tpacket_memreg_req is used in conjunction with PACKET_MEMREG
+ * to register user memory which should be used to store the packet
+ * data.
+ *
+ * There are some constraints for the memory being registered:
+ * - The memory area has to be memory page size aligned.
+ * - The frame size has to be a power of 2.
+ * - The frame size cannot be smaller than 2048B.
+ * - The frame size cannot be larger than the memory page size.
+ *
+ * Corollary: The number of frames that can be stored is
+ * len / frame_size.
+ *
+ */
+struct tpacket_memreg_req {
+	unsigned long	addr;		/* Start of packet data area */
+	unsigned long	len;		/* Length of packet data area */
+	unsigned int	frame_size;	/* Frame size */
+	unsigned int	data_headroom;	/* Frame head room */
+};
+
 #define PACKET_MR_MULTICAST	0
 #define PACKET_MR_PROMISC	1
 #define PACKET_MR_ALLMULTI	2