diff mbox

[v2,net-next,af-packet,1/2] Enhance af-packet to provide (near zero)lossless packet capture functionality.

Message ID 1308708650-25509-2-git-send-email-loke.chetan@gmail.com
State Changes Requested, archived
Delegated to: David Miller
Headers show

Commit Message

chetan L June 22, 2011, 2:10 a.m. UTC
Added TPACKET_V3 definitions

Signed-off-by: Chetan Loke <loke.chetan@gmail.com>
---
 include/linux/if_packet.h |  128 +++++++++++++++++++++++++++++++++++++++++++++
 1 files changed, 128 insertions(+), 0 deletions(-)

Comments

David Miller July 1, 2011, 10:36 p.m. UTC | #1
From: Chetan Loke <loke.chetan@gmail.com>
Date: Tue, 21 Jun 2011 22:10:49 -0400

> +struct bd_v1 {
 -
> +	__u32	block_status;
> +	__u32	num_pkts;
> +	__u32	offset_to_first_pkt;
 -
> +	__u32	blk_len;
 -
> +	__u64	seq_num;
 ...
> +	union {
> +		struct {
> +			__u32	words[4];
> +			__u64	dword;
> +		} __attribute__ ((__packed__));
> +		struct bd_v1 bd1;
 ...
> +#define BLOCK_STATUS(x)	((x)->words[0])
> +#define BLOCK_NUM_PKTS(x)	((x)->words[1])
> +#define BLOCK_O2FP(x)		((x)->words[2])
> +#define BLOCK_LEN(x)		((x)->words[3])
> +#define BLOCK_SNUM(x)		((x)->dword)

This BLOCK_SNUM definition is buggy.  It modifies the
first 64-bit word in the block descriptor.

But the sequence number lives 16 bytes into the descriptor.

This value is only written to once and never used by anything.
I would just remove it entirely.

Next, having this overlay thing is entirely pointless.  Just refer to
the block descriptor members directly!  You certainly wouldn't have
had this sequence number bug if you had done that.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
chetan L July 5, 2011, 2:53 p.m. UTC | #2
On Fri, Jul 1, 2011 at 6:36 PM, David Miller <davem@davemloft.net> wrote:
> From: Chetan Loke <loke.chetan@gmail.com>
> Date: Tue, 21 Jun 2011 22:10:49 -0400
>
>> +struct bd_v1 {
>  -
>> +     __u32   block_status;
>> +     __u32   num_pkts;
>> +     __u32   offset_to_first_pkt;
>  -
>> +     __u32   blk_len;
>  -
>> +     __u64   seq_num;
>  ...
>> +     union {
>> +             struct {
>> +                     __u32   words[4];
>> +                     __u64   dword;
>> +             } __attribute__ ((__packed__));
>> +             struct bd_v1 bd1;
>  ...
>> +#define BLOCK_STATUS(x)      ((x)->words[0])
>> +#define BLOCK_NUM_PKTS(x)    ((x)->words[1])
>> +#define BLOCK_O2FP(x)                ((x)->words[2])
>> +#define BLOCK_LEN(x)         ((x)->words[3])
>> +#define BLOCK_SNUM(x)                ((x)->dword)
>

Sorry, I was out on the long weekend. So couldn't get to this sooner.

> This BLOCK_SNUM definition is buggy.  It modifies the
> first 64-bit word in the block descriptor.
>
> But the sequence number lives 16 bytes into the descriptor.

hmm? the words/dword are enveloped within a 'struct'. Can you please
double check?

>
> This value is only written to once and never used by anything.
> I would just remove it entirely.
>

It is used by the applications. Look at the code comments:
	/*
	 * Quite a few uses of sequence number:
	 * 1. Make sure cache flush etc worked.
	 *    Well, one can argue - why not use the increasing ts below?
	 *    But look at 2. below first.
	 * 2. When you pass around blocks to other user space decoders,
	 *    you can see which blk[s] is[are] outstanding etc.
	 * 3. Validate kernel code.
	 */


> Next, having this overlay thing is entirely pointless.  Just refer to

It is useful.
Also, future versions of the block-descriptor can append a new field.
When that happens,
none of the code needs to worry about the version etc for the unchanged fields.
Look at setsockopt - I had to add an 'union' and pass that around to
avoid minimal code churn.
So the overlay may not be pointless.

> the block descriptor members directly!  You certainly wouldn't have
> had this sequence number bug if you had done that.
>
Look at the sample app posted on:
git://lolpcap.git.sourceforge.net/gitroot/lolpcap/lolpcap

function - void validate_blk_seq_num(struct block_desc *pbd)

This function validates the block_sequence_number (which is
incremented sequentially).
The application attempts to validate the entire block layout.


Chetan Loke
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Miller July 5, 2011, 3:01 p.m. UTC | #3
From: chetan loke <loke.chetan@gmail.com>
Date: Tue, 5 Jul 2011 10:53:26 -0400

>> Next, having this overlay thing is entirely pointless.  Just refer to
> 
> It is useful.
> Also, future versions of the block-descriptor can append a new field.
> When that happens,
> none of the code needs to worry about the version etc for the unchanged fields.

That issue only exists because you haven't defined a common header
struct that the current, and all future, block descriptor variants can
include at the start of their definitions.

I still contend that all of these abstractions are too much and
unnecessary.

Use real data structures, not opaque "offset+size" poking into the
descriptors.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
chetan L July 6, 2011, 9:45 p.m. UTC | #4
On Tue, Jul 5, 2011 at 11:01 AM, David Miller <davem@davemloft.net> wrote:

>
> That issue only exists because you haven't defined a common header
> struct that the current, and all future, block descriptor variants can
> include at the start of their definitions.

what's common today may not be common tomorrow. After much thinking I
decided to not provide a generic header because I wouldn't want to
enforce anything.

new format:

union bd_header_u {
       /* renamed struct bd_v1 to hdr_v1 */
       struct hdr_v1 h1;
} __attribute__ ((__packed__));

struct block_desc {
       __u16 version;
       __u16 offset_to_priv;
       union bd_header_u hdr;
} __attribute__ ((__packed__));

Is this ok with you?


>
> Use real data structures, not opaque "offset+size" poking into the
> descriptors.
>
Used to writing firmware APIs. APIs use words/bytes so that they can
be interpreted by firmware folks too.

Chetan Loke
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Miller July 7, 2011, 7:13 a.m. UTC | #5
From: chetan loke <loke.chetan@gmail.com>
Date: Wed, 6 Jul 2011 17:45:20 -0400

> new format:
> 
> union bd_header_u {
>        /* renamed struct bd_v1 to hdr_v1 */
>        struct hdr_v1 h1;
> } __attribute__ ((__packed__));
> 
> struct block_desc {
>        __u16 version;
>        __u16 offset_to_priv;
>        union bd_header_u hdr;
> } __attribute__ ((__packed__));
> 
> Is this ok with you?

Get rid of __packed__, it's going to kill performance on RISC
platforms.  If you use __packed__, regardless of the actual alignment,
the compiler must assume that each part of the struct "might" be
unaligned.  So on architectures such as sparc where alignment matters,
a word is going to be accessed by a sequence of byte loads/stores.

Do not use packed unless absolutely enforced by a protocol or hardware
data structure, it's evil.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
chetan L July 7, 2011, 1:04 p.m. UTC | #6
On Thu, Jul 7, 2011 at 3:13 AM, David Miller <davem@davemloft.net> wrote:

> Get rid of __packed__, it's going to kill performance on RISC
> platforms.  If you use __packed__, regardless of the actual alignment,

The performance boost has been achieved by amortizing the cost of
static spin-wait/poll and not by shrinking the data-set.


> the compiler must assume that each part of the struct "might" be
> unaligned.  So on architectures such as sparc where alignment matters,
> a word is going to be accessed by a sequence of byte loads/stores.
>
Haven't worked with sparc so I didn't know. Thanks for the insight.
One also needs to analyze both the user/kernel components.The app
reads the header(hdr_size <<< blk_size) just once and then walks the
entire block. Apps operate on local copy of the variable and not on
the header.

kernel components - almost everything is cached in kbdq_core. block is
updated while closing.

> Do not use packed unless absolutely enforced by a protocol or hardware
> data structure, it's evil.
>
Depends. Why not evaluate on case-by-case basis? All I need to do is
pass this definition of the header around and only mandate how wide
the fields should be.
Once packed, I don't need to worry about padding on different
OS's/arch's. All I care about is the offset to the first pkt and other
details. The block says - you provide me offset to the first packet
and I will start walking the packets.

Another way to look at it - you pack something and then no padding is
needed(not the right example because every pkt-header will be
byte-sequenced if packed but you get the idea) :
http://git2.kernel.org/?p=linux/kernel/git/davem/net-2.6.git;a=commit;h=13fcb7bd322164c67926ffe272846d4860196dc6


Chetan Loke
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Miller July 7, 2011, 1:11 p.m. UTC | #7
From: chetan loke <loke.chetan@gmail.com>
Date: Thu, 7 Jul 2011 09:04:58 -0400

> On Thu, Jul 7, 2011 at 3:13 AM, David Miller <davem@davemloft.net> wrote:
> 
>> Get rid of __packed__, it's going to kill performance on RISC
>> platforms.  If you use __packed__, regardless of the actual alignment,
> 
> The performance boost has been achieved by amortizing the cost of
> static spin-wait/poll and not by shrinking the data-set.

Chetan, if you're implementing something for performance reasons,
getting rid of packed is non-negotiable.

We pass data structures between userspace and the kernel all the
time, and without __packed__.  We have mechanisms to ensure the
size of the individual data types, and we have mechanisms to make
sure 64-bit datums get aligned even on x86 (see "aligned_u64" and
friends")

Again, I can't seriously consider your patch if you keep the packed
attribute crap in there.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/include/linux/if_packet.h b/include/linux/if_packet.h
index 6d66ce1..e5fad08 100644
--- a/include/linux/if_packet.h
+++ b/include/linux/if_packet.h
@@ -55,6 +55,17 @@  struct tpacket_stats {
 	unsigned int	tp_drops;
 };
 
+struct tpacket_stats_v3 {
+	unsigned int	tp_packets;
+	unsigned int	tp_drops;
+	unsigned int	tp_freeze_q_cnt;
+};
+
+union tpacket_stats_u {
+	struct tpacket_stats stats1;
+	struct tpacket_stats_v3 stats3;
+};
+
 struct tpacket_auxdata {
 	__u32		tp_status;
 	__u32		tp_len;
@@ -71,6 +82,7 @@  struct tpacket_auxdata {
 #define TP_STATUS_LOSING	0x4
 #define TP_STATUS_CSUMNOTREADY	0x8
 #define TP_STATUS_VLAN_VALID   0x10 /* auxdata has valid tp_vlan_tci */
+#define TP_STATUS_BLK_TMO	0x20
 
 /* Tx ring - header status */
 #define TP_STATUS_AVAILABLE	0x0
@@ -102,12 +114,114 @@  struct tpacket2_hdr {
 	__u32		tp_nsec;
 	__u16		tp_vlan_tci;
 };
+struct tpacket3_hdr {
+	__u32		tp_status;
+	__u32		tp_len;
+	__u32		tp_snaplen;
+	__u16		tp_mac;
+	__u16		tp_net;
+	__u32		tp_sec;
+	__u32		tp_nsec;
+	__u16		tp_vlan_tci;
+	__u16		tp_padding;
+	__u32		tp_next_offset;
+};
+
+struct bd_ts {
+	unsigned int ts_sec;
+	union {
+		struct {
+			unsigned int ts_usec;
+		};
+		struct {
+			unsigned int ts_nsec;
+		};
+	};
+} __attribute__ ((__packed__));
+
+struct bd_v1 {
+	/*
+	 * If you re-order the first 5 fields then
+	 * the BLOCK_XXX macros will NOT work.
+	 */
+	__u32	block_status;
+	__u32	num_pkts;
+	__u32	offset_to_first_pkt;
+
+	/* Number of valid bytes (including padding)
+	 * blk_len <= tp_block_size
+	 */
+	__u32	blk_len;
+
+	/*
+	 * Quite a few uses of sequence number:
+	 * 1. Make sure cache flush etc worked.
+	 *    Well, one can argue - why not use the increasing ts below?
+	 *    But look at 2. below first.
+	 * 2. When you pass around blocks to other user space decoders,
+	 *    you can see which blk[s] is[are] outstanding etc.
+	 * 3. Validate kernel code.
+	 */
+	__u64	seq_num;
+
+	/*
+	 * ts_last_pkt:
+	 *
+	 * Case 1.	Block has 'N'(N >=1) packets and TMO'd(timed out)
+	 *		ts_last_pkt == 'time-stamp of last packet' and NOT the
+	 *		time when the timer fired and the block was closed.
+	 *		By providing the ts of the last packet we can absolutely
+	 *		guarantee that time-stamp wise, the first packet in the next
+	 *		block will never precede the last packet of the previous
+	 *		block.
+	 * Case 2.	Block has zero packets and TMO'd
+	 *		ts_last_pkt = time when the timer fired and the block
+	 *		was closed.
+	 * Case 3.	Block has 'N' packets and NO TMO.
+	 *		ts_last_pkt = time-stamp of the last pkt in the block.
+	 *
+	 * ts_first_pkt:
+	 *		Is always the time-stamp when the block was opened.
+	 *		Case a)	ZERO packets
+	 *			No packets to deal with but atleast you know the
+	 *			time-interval of this block.
+	 *		Case b) Non-zero packets
+	 *			Use the ts of the first packet in the block.
+	 *
+	 */
+	struct bd_ts	ts_first_pkt;
+	struct bd_ts	ts_last_pkt;
+} __attribute__ ((__packed__));
+
+struct block_desc {
+	__u16 version;
+	__u16 offset_to_priv;
+	union {
+		struct {
+			__u32	words[4];
+			__u64	dword;
+		} __attribute__ ((__packed__));
+		struct bd_v1 bd1;
+	};
+} __attribute__ ((__packed__));
+
+
 
 #define TPACKET2_HDRLEN		(TPACKET_ALIGN(sizeof(struct tpacket2_hdr)) + sizeof(struct sockaddr_ll))
+#define TPACKET3_HDRLEN		(TPACKET_ALIGN(sizeof(struct tpacket3_hdr)) + sizeof(struct sockaddr_ll))
+
+#define BLOCK_STATUS(x)	((x)->words[0])
+#define BLOCK_NUM_PKTS(x)	((x)->words[1])
+#define BLOCK_O2FP(x)		((x)->words[2])
+#define BLOCK_LEN(x)		((x)->words[3])
+#define BLOCK_SNUM(x)		((x)->dword)
+#define BLOCK_O2PRIV(x)	((x)->offset_to_priv)
+#define BLOCK_PRIV(x)		((void *)((char *)(x) + BLOCK_O2PRIV(x)))
 
 enum tpacket_versions {
 	TPACKET_V1,
 	TPACKET_V2,
+	TPACKET_V3,
 };
 
 /*
@@ -130,6 +244,20 @@  struct tpacket_req {
 	unsigned int	tp_frame_nr;	/* Total number of frames */
 };
 
+struct tpacket_req3 {
+	unsigned int	tp_block_size;	/* Minimal size of contiguous block */
+	unsigned int	tp_block_nr;	/* Number of blocks */
+	unsigned int	tp_frame_size;	/* Size of frame */
+	unsigned int	tp_frame_nr;	/* Total number of frames */
+	unsigned int	tp_retire_blk_tov; /* timeout in msecs */
+	unsigned int	tp_sizeof_priv; /* size of private data area */
+};
+
+union tpacket_req_u {
+	struct tpacket_req	req;
+	struct tpacket_req3	req3;
+};
+
 struct packet_mreq {
 	int		mr_ifindex;
 	unsigned short	mr_type;