diff mbox series

[RFC,10/14] samples/tpacket4: added tpbench

Message ID 20171031124145.9667-11-bjorn.topel@gmail.com
State RFC, archived
Delegated to: David Miller
Headers show
Series Introducing AF_PACKET V4 support | expand

Commit Message

Björn Töpel Oct. 31, 2017, 12:41 p.m. UTC
From: Björn Töpel <bjorn.topel@intel.com>

The tpbench program is benchmarking TPACKET_V2 up to
TPACKET_V4. There's a bench_all.sh script that makes testing all
versions easier.

Note that zero-copy means binding the TPACKET_V4 socket to a certain
NIC hardware queue, so you'll need to steer your traffic to a certain
NIC hardware queue. Say that you'd like your UDP traffic from port
4242 to end up in queue 16. Here, we use ethtool for this:

  ethtool -N p3p2 rx-flow-hash udp4 fn
  ethtool -N p3p2 flow-type udp4 src-port 4242 dst-port 4242 \
      action 16

running the benchmark in zero-copy mode can then be done using:

  taskset -c 16 ./tpbench -i p3p2 --rxdrop --zerocopy 17

Note that the queue is one-based and not zero-based.

Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
---
 samples/tpacket4/Makefile     |   12 +
 samples/tpacket4/bench_all.sh |   28 +
 samples/tpacket4/tpbench.c    | 1253 +++++++++++++++++++++++++++++++++++++++++
 3 files changed, 1293 insertions(+)
 create mode 100644 samples/tpacket4/Makefile
 create mode 100755 samples/tpacket4/bench_all.sh
 create mode 100644 samples/tpacket4/tpbench.c
diff mbox series

Patch

diff --git a/samples/tpacket4/Makefile b/samples/tpacket4/Makefile
new file mode 100644
index 000000000000..1dd731ffe3e9
--- /dev/null
+++ b/samples/tpacket4/Makefile
@@ -0,0 +1,12 @@ 
+# kbuild trick to avoid linker error. Can be omitted if a module is built.
+obj- := dummy.o
+
+# List of programs to build
+hostprogs-y := tpbench
+
+# Tell kbuild to always build the programs
+always := $(hostprogs-y)
+
+HOSTCFLAGS_tpbench.o += -I$(objtree)/usr/include
+
+all: tpbench
diff --git a/samples/tpacket4/bench_all.sh b/samples/tpacket4/bench_all.sh
new file mode 100755
index 000000000000..8d7ee17e1682
--- /dev/null
+++ b/samples/tpacket4/bench_all.sh
@@ -0,0 +1,28 @@ 
+#!/bin/bash
+
+DIR=`dirname "${BASH_SOURCE[0]}"`
+
+IF=p3p2
+DURATION=60
+CORE=14
+ZC=17
+
+echo "You might want to change the parameters in ${BASH_SOURCE[0]}"
+echo "${IF} cpu${CORE} duration ${DURATION}s zc ${ZC}"
+
+sudo taskset -c ${CORE} timeout -s int ${DURATION} ${DIR}/tpbench -i ${IF} --version=2 --rxdrop
+sudo taskset -c ${CORE} timeout -s int ${DURATION} ${DIR}/tpbench -i ${IF} --version=3 --rxdrop
+sudo taskset -c ${CORE} timeout -s int ${DURATION} ${DIR}/tpbench -i ${IF} --version=4 --rxdrop
+sudo taskset -c ${CORE} timeout -s int ${DURATION} ${DIR}/tpbench -i ${IF} --version=4 --rxdrop --zerocopy ${ZC}
+
+sudo taskset -c ${CORE} timeout -s int ${DURATION} ${DIR}/tpbench -i ${IF} --version=2 --txonly
+sudo taskset -c ${CORE} timeout -s int ${DURATION} ${DIR}/tpbench -i ${IF} --version=3 --txonly
+sudo taskset -c ${CORE} timeout -s int ${DURATION} ${DIR}/tpbench -i ${IF} --version=4 --txonly
+sudo taskset -c ${CORE} timeout -s int ${DURATION} ${DIR}/tpbench -i ${IF} --version=4 --txonly --zerocopy ${ZC}
+
+sudo taskset -c ${CORE} timeout -s int ${DURATION} ${DIR}/tpbench -i ${IF} --version=2 --l2fwd
+sudo taskset -c ${CORE} timeout -s int ${DURATION} ${DIR}/tpbench -i ${IF} --version=3 --l2fwd
+sudo taskset -c ${CORE} timeout -s int ${DURATION} ${DIR}/tpbench -i ${IF} --version=4 --l2fwd
+sudo taskset -c ${CORE} timeout -s int ${DURATION} ${DIR}/tpbench -i ${IF} --version=4 --l2fwd --zerocopy ${ZC}
+
+
diff --git a/samples/tpacket4/tpbench.c b/samples/tpacket4/tpbench.c
new file mode 100644
index 000000000000..46fb83009e06
--- /dev/null
+++ b/samples/tpacket4/tpbench.c
@@ -0,0 +1,1253 @@ 
+/*
+ *  tpbench
+ *  Copyright(c) 2017 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+
+#include <arpa/inet.h>
+#include <errno.h>
+#include <getopt.h>
+#include <limits.h>
+#include <linux/if_ether.h>
+#include <linux/if_packet.h>
+#include <net/ethernet.h>
+#include <net/if.h>
+#include <netinet/ether.h>
+#include <netinet/ip.h>
+#include <netinet/udp.h>
+#include <poll.h>
+#include <signal.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <sys/shm.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <time.h>
+#include <unistd.h>
+
+#define BATCH_SIZE 64 /* process pace */
+
+#define NUM_BUFFERS 131072
+#define FRAME_SIZE 2048
+
+#define BLOCK_SIZE (1 << 22) /* V2/V3 */
+#define NUM_DESCS 4096 /* V4 */
+
+static unsigned long rx_npkts;
+static unsigned long tx_npkts;
+static unsigned long start_time;
+
+/* cli options */
+enum tpacket_version {
+	PV2 = 0,
+	PV3 = 1,
+	PV4 = 2,
+};
+
+enum benchmark_type {
+	BENCH_RXDROP = 0,
+	BENCH_TXONLY = 1,
+	BENCH_L2FWD = 2,
+};
+
+static enum tpacket_version opt_tpver = PV4;
+static enum benchmark_type opt_bench = BENCH_RXDROP;
+static const char *opt_if = "";
+static int opt_zerocopy;
+
+struct tpacket2_queue {
+	void *ring;
+
+	unsigned int last_used_idx;
+	unsigned int ring_size;
+	unsigned int frame_size_log2;
+};
+
+struct tp2_queue_pair {
+	struct tpacket2_queue rx;
+	struct tpacket2_queue tx;
+	int sfd;
+	const char *interface_name;
+};
+
+struct tpacket3_rx_queue {
+	void *ring;
+	struct tpacket3_hdr *frames[BATCH_SIZE];
+
+	unsigned int last_used_idx;
+	unsigned int ring_size; /* NB! blocks, not frames */
+	unsigned int block_size_log2;
+
+	struct tpacket3_hdr *last_frame;
+	unsigned int npkts; /* >0 in block */
+};
+
+struct tp3_queue_pair {
+	struct tpacket3_rx_queue rx;
+	struct tpacket2_queue tx;
+	int sfd;
+	const char *interface_name;
+};
+
+struct tp4_umem {
+	char *buffer;
+	size_t size;
+	unsigned int frame_size;
+	unsigned int frame_size_log2;
+	unsigned int nframes;
+	int mr_fd;
+	unsigned long free_stack[NUM_BUFFERS];
+	unsigned int free_stack_idx;
+};
+
+struct tp4_queue_pair {
+	struct tpacket4_queue rx;
+	struct tpacket4_queue tx;
+	int sfd;
+	const char *interface_name;
+	struct tp4_umem *umem;
+};
+
+struct benchmark {
+	void *		(*configure)(const char *interface_name);
+	void		(*rx)(void *queue_pair, unsigned int *start,
+			      unsigned int *end);
+	void *		(*get_data)(void *queue_pair, unsigned int idx,
+				    unsigned int *len);
+	unsigned long	(*get_data_desc)(void *queue_pair, unsigned int idx,
+					 unsigned int *len,
+					 unsigned short *offset);
+	void		(*set_data_desc)(void *queue_pair, unsigned int idx,
+					 unsigned long didx);
+	void		(*process)(void *queue_pair, unsigned int start,
+				   unsigned int end);
+	void		(*rx_release)(void *queue_pair, unsigned int start,
+				      unsigned int end);
+	void		(*tx)(void *queue_pair, unsigned int start,
+			      unsigned int end);
+};
+
+static char tx_frame[1024];
+static unsigned int tx_frame_len;
+static struct benchmark benchmark;
+
+#define lassert(expr)							\
+	do {								\
+		if (!(expr)) {						\
+			fprintf(stderr, "%s:%s:%i: Assertion failed: "	\
+				#expr ": errno: %d/\"%s\"\n",		\
+				__FILE__, __func__, __LINE__,		\
+				errno, strerror(errno));		\
+			exit(EXIT_FAILURE);				\
+		}							\
+	} while (0)
+
+#define barrier() __asm__ __volatile__("" : : : "memory")
+#define u_smp_rmb() barrier()
+#define u_smp_wmb() barrier()
+#define likely(x) __builtin_expect(!!(x), 1)
+#define unlikely(x) __builtin_expect(!!(x), 0)
+#define log2(x)							\
+	((unsigned int)(8 * sizeof(unsigned long long) -	\
+			__builtin_clzll((x)) - 1))
+
+#if 0
+static void hex_dump(void *pkt, size_t length, const char *prefix)
+{
+	int i = 0;
+	const unsigned char *address = (unsigned char *)pkt;
+	const unsigned char *line = address;
+	size_t line_size = 32;
+	unsigned char c;
+
+	printf("%s | ", prefix);
+	while (length-- > 0) {
+		printf("%02X ", *address++);
+		if (!(++i % line_size) || (length == 0 && i % line_size)) {
+			if (length == 0) {
+				while (i++ % line_size)
+					printf("__ ");
+			}
+			printf(" | ");	/* right close */
+			while (line < address) {
+				c = *line++;
+				printf("%c", (c < 33 || c == 255) ? 0x2E : c);
+			}
+			printf("\n");
+			if (length > 0)
+				printf("%s | ", prefix);
+		}
+	}
+	printf("\n");
+}
+#endif
+
+static size_t gen_eth_frame(char *frame, int data)
+{
+	static const char d[] =
+		"\x3c\xfd\xfe\x9e\x7f\x71\xec\xb1\xd7\x98\x3a\xc0\x08\x00\x45\x00"
+		"\x00\x2e\x00\x00\x00\x00\x40\x11\x88\x97\x05\x08\x07\x08\xc8\x14"
+		"\x1e\x04\x10\x92\x10\x92\x00\x1a\x6d\xa3\x34\x33\x1f\x69\x40\x6b"
+		"\x54\x59\xb6\x14\x2d\x11\x44\xbf\xaf\xd9\xbe\xaa";
+
+	(void)data;
+	memcpy(frame, d, sizeof(d) - 1);
+	return sizeof(d) - 1;
+
+#if 0
+	/* XXX This generates "multicast packets" */
+	struct ether_header *eh = (struct ether_header *)frame;
+	size_t len = sizeof(struct ether_header);
+	int i;
+
+	for (i = 0; i < 6; i++) {
+		eh->ether_shost[i] = i + 0x01;
+		eh->ether_dhost[i] = i + 0x11;
+	}
+	eh->ether_type = htons(ETH_P_IP);
+
+	for (i = 0; i < 46; i++)
+		frame[len++] = data;
+
+	return len;
+#endif
+}
+
+static void setup_tx_frame(void)
+{
+	tx_frame_len = gen_eth_frame(tx_frame, 42);
+}
+
+static void swap_mac_addresses(void *data)
+{
+	struct ether_header *eth = (struct ether_header *)data;
+	struct ether_addr *src_addr = (struct ether_addr *)&eth->ether_shost;
+	struct ether_addr *dst_addr = (struct ether_addr *)&eth->ether_dhost;
+	struct ether_addr tmp;
+
+	tmp = *src_addr;
+	*src_addr = *dst_addr;
+	*dst_addr = tmp;
+}
+
+static void rx_dummy(void *queue_pair, unsigned int *start, unsigned int *end)
+{
+	(void)queue_pair;
+	*start = 0;
+	*end = BATCH_SIZE;
+}
+
+static void rx_release_dummy(void *queue_pair, unsigned int start,
+			     unsigned int end)
+{
+	(void)queue_pair;
+	(void)start;
+	(void)end;
+}
+
+static void *get_data_dummy(void *queue_pair, unsigned int idx,
+			    unsigned int *len)
+{
+	(void)queue_pair;
+	(void)idx;
+
+	*len = tx_frame_len;
+
+	return tx_frame;
+}
+
+#if 0
+static void process_hexdump(void *queue_pair, unsigned int start,
+			    unsigned int end)
+{
+	unsigned int len;
+	void *data;
+
+	while (start != end) {
+		data = benchmark.get_data(queue_pair, start, &len);
+		hex_dump(data, len, "Rx:");
+		start++;
+	}
+}
+#endif
+
+static void process_swap_mac(void *queue_pair, unsigned int start,
+			     unsigned int end)
+{
+	unsigned int len;
+	void *data;
+
+	while (start != end) {
+		data = benchmark.get_data(queue_pair, start, &len);
+		swap_mac_addresses(data);
+		start++;
+	}
+}
+
+static void run_benchmark(const char *interface_name)
+{
+	unsigned int start, end;
+	struct tp2_queue_pair *qp;
+
+	qp = benchmark.configure(interface_name);
+
+	for (;;) {
+		for (;;) {
+			benchmark.rx(qp, &start, &end);
+			if ((end - start) > 0)
+				break;
+			// XXX
+			//if (poll)
+			//	poll();
+		}
+
+		if (benchmark.process)
+			benchmark.process(qp, start, end);
+
+		benchmark.tx(qp, start, end);
+	}
+}
+
+static unsigned long get_nsecs(void)
+{
+	struct timespec ts;
+
+	clock_gettime(CLOCK_MONOTONIC, &ts);
+	return ts.tv_sec * 1000000000UL + ts.tv_nsec;
+}
+
+static void *tp2_configure(const char *interface_name)
+{
+	int sfd, noqdisc, ret, ver = TPACKET_V2;
+	struct tp2_queue_pair *tqp;
+	struct tpacket_req req = {};
+	struct sockaddr_ll ll;
+	void *rxring;
+
+	/* create PF_PACKET socket */
+	sfd = socket(PF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
+	lassert(sfd >= 0);
+	ret = setsockopt(sfd, SOL_PACKET, PACKET_VERSION, &ver, sizeof(ver));
+	lassert(ret == 0);
+
+	tqp = calloc(1, sizeof(*tqp));
+	lassert(tqp);
+
+	tqp->sfd = sfd;
+	tqp->interface_name = interface_name;
+
+	req.tp_block_size = BLOCK_SIZE;
+	req.tp_frame_size = FRAME_SIZE;
+	req.tp_block_nr = NUM_BUFFERS * FRAME_SIZE / BLOCK_SIZE;
+	req.tp_frame_nr = req.tp_block_nr * BLOCK_SIZE / FRAME_SIZE;
+
+	ret = setsockopt(sfd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
+	lassert(ret == 0);
+	ret = setsockopt(sfd, SOL_PACKET, PACKET_TX_RING, &req, sizeof(req));
+	lassert(ret == 0);
+
+	rxring = mmap(0, 2 * req.tp_block_size * req.tp_block_nr,
+		      PROT_READ | PROT_WRITE,
+		      MAP_SHARED | MAP_LOCKED | MAP_POPULATE, sfd, 0);
+	lassert(rxring != MAP_FAILED);
+
+	tqp->rx.ring = rxring;
+	tqp->rx.ring_size = NUM_BUFFERS;
+	tqp->rx.frame_size_log2 = log2(req.tp_frame_size);
+
+	tqp->tx.ring = rxring + req.tp_block_size * req.tp_block_nr;
+	tqp->tx.ring_size = NUM_BUFFERS;
+	tqp->tx.frame_size_log2 = log2(req.tp_frame_size);
+
+	ll.sll_family = PF_PACKET;
+	ll.sll_protocol = htons(ETH_P_ALL);
+	ll.sll_ifindex = if_nametoindex(interface_name);
+	ll.sll_hatype = 0;
+	ll.sll_pkttype = 0;
+	ll.sll_halen = 0;
+
+	noqdisc = 1;
+	ret = setsockopt(sfd, SOL_PACKET, PACKET_QDISC_BYPASS,
+			 &noqdisc, sizeof(noqdisc));
+	lassert(ret == 0);
+
+	ret = bind(sfd, (struct sockaddr *)&ll, sizeof(ll));
+	lassert(ret == 0);
+
+	setup_tx_frame();
+
+	return tqp;
+}
+
+static void tp2_rx(void *queue_pair, unsigned int *start, unsigned int *end)
+{
+	struct tpacket2_queue *rxq = &((struct tp2_queue_pair *)queue_pair)->rx;
+	unsigned int batch = 0;
+
+	*start = rxq->last_used_idx;
+	*end = rxq->last_used_idx;
+
+	for (;;) {
+		unsigned int idx = *end & (rxq->ring_size - 1);
+		struct tpacket2_hdr *hdr;
+
+		hdr = (struct tpacket2_hdr *)(rxq->ring +
+					      (idx << rxq->frame_size_log2));
+		if ((hdr->tp_status & TP_STATUS_USER) != TP_STATUS_USER)
+			break;
+
+		(*end)++;
+		if (++batch == BATCH_SIZE)
+			break;
+	}
+
+	rxq->last_used_idx = *end;
+	rx_npkts += (*end - *start);
+
+	/* status before data */
+	u_smp_rmb();
+}
+
+static void tp2_rx_release(void *queue_pair, unsigned int start,
+			   unsigned int end)
+{
+	struct tpacket2_queue *rxq = &((struct tp2_queue_pair *)queue_pair)->rx;
+	struct tpacket2_hdr *hdr;
+
+	while (start != end) {
+		hdr = (struct tpacket2_hdr *)(rxq->ring +
+					      ((start & (rxq->ring_size - 1))
+					       << rxq->frame_size_log2));
+
+		hdr->tp_status = TP_STATUS_KERNEL;
+		start++;
+	}
+}
+
+static void *tp2_get_data(void *queue_pair, unsigned int idx, unsigned int *len)
+{
+	struct tpacket2_queue *rxq = &((struct tp2_queue_pair *)queue_pair)->rx;
+	struct tpacket2_hdr *hdr;
+
+	hdr = (struct tpacket2_hdr *)(rxq->ring + ((idx & (rxq->ring_size - 1))
+						   << rxq->frame_size_log2));
+	*len = hdr->tp_snaplen;
+
+	return (char *)hdr + hdr->tp_mac;
+}
+
+static void tp2_tx(void *queue_pair, unsigned int start, unsigned int end)
+{
+	struct tp2_queue_pair *qp = queue_pair;
+	struct tpacket2_queue *txq = &qp->tx;
+	unsigned int len, curr = start;
+	void *data;
+	int ret;
+
+	while (curr != end) {
+		unsigned int idx = txq->last_used_idx & (txq->ring_size - 1);
+		struct tpacket2_hdr *hdr;
+
+		hdr = (struct tpacket2_hdr *)(txq->ring +
+					      (idx << txq->frame_size_log2));
+		if (hdr->tp_status &
+		    (TP_STATUS_SEND_REQUEST | TP_STATUS_SENDING)) {
+			break;
+		}
+
+		data = benchmark.get_data(queue_pair, curr, &len);
+
+		hdr->tp_snaplen = len;
+		hdr->tp_len = len;
+		memcpy((char *)hdr + TPACKET2_HDRLEN -
+		       sizeof(struct sockaddr_ll), data, len);
+
+		u_smp_wmb();
+
+		hdr->tp_status = TP_STATUS_SEND_REQUEST;
+
+		txq->last_used_idx++;
+		curr++;
+	}
+
+	ret = sendto(qp->sfd, NULL, 0, MSG_DONTWAIT, NULL, 0);
+	if (!(ret >= 0 || errno == EAGAIN || errno == ENOBUFS))
+		lassert(0);
+
+	benchmark.rx_release(queue_pair, start, end);
+
+	tx_npkts += (curr - start);
+}
+
+static void *tp3_configure(const char *interface_name)
+{
+	int sfd, noqdisc, ret, ver = TPACKET_V3;
+	struct tp3_queue_pair *tqp;
+	struct tpacket_req3 req = {};
+	struct sockaddr_ll ll;
+	void *rxring;
+
+	unsigned int blocksiz = 1 << 22, framesiz = 1 << 11;
+	unsigned int blocknum = 64;
+
+	/* create PF_PACKET socket */
+	sfd = socket(PF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
+	lassert(sfd >= 0);
+	ret = setsockopt(sfd, SOL_PACKET, PACKET_VERSION, &ver, sizeof(ver));
+	lassert(ret == 0);
+
+	tqp = calloc(1, sizeof(*tqp));
+	lassert(tqp);
+
+	tqp->sfd = sfd;
+	tqp->interface_name = interface_name;
+
+	/* XXX is is unfair to have 2 frames per block in V3? */
+	req.tp_block_size = BLOCK_SIZE;
+	req.tp_frame_size = FRAME_SIZE;
+	req.tp_block_nr = NUM_BUFFERS * FRAME_SIZE / BLOCK_SIZE;
+	req.tp_frame_nr = req.tp_block_nr * BLOCK_SIZE / FRAME_SIZE;
+	req.tp_retire_blk_tov = 0;
+	req.tp_sizeof_priv = 0;
+	req.tp_feature_req_word = 0;
+
+	ret = setsockopt(sfd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
+	lassert(ret == 0);
+	ret = setsockopt(sfd, SOL_PACKET, PACKET_TX_RING, &req, sizeof(req));
+	lassert(ret == 0);
+
+	rxring = mmap(0, 2 * req.tp_block_size * req.tp_block_nr,
+		      PROT_READ | PROT_WRITE,
+		      MAP_SHARED | MAP_LOCKED | MAP_POPULATE, sfd, 0);
+	lassert(rxring != MAP_FAILED);
+
+	tqp->rx.ring = rxring;
+	tqp->rx.ring_size = blocknum;
+	tqp->rx.block_size_log2 = log2(blocksiz);
+
+	tqp->tx.ring = rxring + req.tp_block_size * req.tp_block_nr;
+	tqp->tx.ring_size = (blocksiz * blocknum) / framesiz;
+	tqp->tx.frame_size_log2 = log2(req.tp_frame_size);
+
+	ll.sll_family = PF_PACKET;
+	ll.sll_protocol = htons(ETH_P_ALL);
+	ll.sll_ifindex = if_nametoindex(interface_name);
+	ll.sll_hatype = 0;
+	ll.sll_pkttype = 0;
+	ll.sll_halen = 0;
+
+	noqdisc = 1;
+	ret = setsockopt(sfd, SOL_PACKET, PACKET_QDISC_BYPASS,
+			 &noqdisc, sizeof(noqdisc));
+	lassert(ret == 0);
+
+	ret = bind(sfd, (struct sockaddr *)&ll, sizeof(ll));
+	lassert(ret == 0);
+
+	setup_tx_frame();
+
+	return tqp;
+}
+
+static void tp3_rx(void *queue_pair, unsigned int *start, unsigned int *end)
+{
+	struct tpacket3_rx_queue *rxq =
+		&((struct tp3_queue_pair *)queue_pair)->rx;
+	unsigned int i, npkts = BATCH_SIZE;
+	struct tpacket_block_desc *bd;
+	bool no_more_frames = false;
+
+	*start = 0;
+	*end = 0;
+
+	if (rxq->last_frame) {
+		if (rxq->npkts <= BATCH_SIZE) {
+			no_more_frames = true;
+			npkts = rxq->npkts;
+		}
+
+		for (i = 0; i < npkts; i++) {
+			rxq->last_frame = (struct tpacket3_hdr *)
+					  ((char *)rxq->last_frame +
+					   rxq->last_frame->tp_next_offset);
+			rxq->frames[i] = rxq->last_frame;
+		}
+
+		if (no_more_frames)
+			rxq->last_frame = NULL;
+
+		rxq->npkts -= npkts;
+		*end = npkts;
+		rx_npkts += npkts;
+
+		return;
+	}
+
+	bd = (struct tpacket_block_desc *)
+	     (rxq->ring + ((rxq->last_used_idx & (rxq->ring_size - 1))
+			   << rxq->block_size_log2));
+	if ((bd->hdr.bh1.block_status & TP_STATUS_USER) != TP_STATUS_USER)
+		return;
+
+	u_smp_rmb();
+
+	rxq->npkts = bd->hdr.bh1.num_pkts;
+	if (rxq->npkts <= BATCH_SIZE) {
+		no_more_frames = true;
+		npkts = rxq->npkts;
+	}
+
+	rxq->last_frame = (struct tpacket3_hdr *)
+			  ((char *)bd + bd->hdr.bh1.offset_to_first_pkt);
+	rxq->frames[0] = rxq->last_frame;
+	for (i = 1; i < npkts; i++) {
+		rxq->last_frame = (struct tpacket3_hdr *)
+				  ((char *)rxq->last_frame +
+				   rxq->last_frame->tp_next_offset);
+		rxq->frames[i] = rxq->last_frame;
+	}
+
+	if (no_more_frames)
+		rxq->last_frame = NULL;
+
+	*end = npkts;
+	rx_npkts += npkts;
+}
+
+static void tp3_rx_release(void *queue_pair, unsigned int start,
+			   unsigned int end)
+{
+	struct tpacket3_rx_queue *rxq =
+		&((struct tp3_queue_pair *)queue_pair)->rx;
+	struct tpacket_block_desc *bd;
+
+	(void)start;
+	(void)end;
+
+	if (rxq->last_frame)
+		return;
+
+	bd = (struct tpacket_block_desc *)
+	     (rxq->ring + ((rxq->last_used_idx & (rxq->ring_size - 1))
+			   << rxq->block_size_log2));
+
+	bd->hdr.bh1.block_status = TP_STATUS_KERNEL;
+	rxq->last_used_idx++;
+}
+
+static void *tp3_get_data(void *queue_pair, unsigned int idx, unsigned int *len)
+{
+	struct tpacket3_rx_queue *rxq =
+		&((struct tp3_queue_pair *)queue_pair)->rx;
+	struct tpacket3_hdr *hdr = rxq->frames[idx];
+
+	*len = hdr->tp_snaplen;
+
+	return (char *)hdr + hdr->tp_mac;
+}
+
+static void tp3_tx(void *queue_pair, unsigned int start, unsigned int end)
+{
+	struct tp3_queue_pair *qp = queue_pair;
+	struct tpacket2_queue *txq = &qp->tx;
+	unsigned int len, curr = start;
+	void *data;
+	int ret;
+
+	while (curr != end) {
+		unsigned int idx = txq->last_used_idx & (txq->ring_size - 1);
+		struct tpacket3_hdr *hdr;
+
+		hdr = (struct tpacket3_hdr *)(txq->ring +
+					      (idx << txq->frame_size_log2));
+		if (hdr->tp_status &
+		    (TP_STATUS_SEND_REQUEST | TP_STATUS_SENDING)) {
+			break;
+		}
+
+		data = benchmark.get_data(queue_pair, curr, &len);
+
+		hdr->tp_snaplen = len;
+		hdr->tp_len = len;
+		memcpy((char *)hdr + TPACKET3_HDRLEN -
+		       sizeof(struct sockaddr_ll), data, len);
+
+		u_smp_wmb();
+
+		hdr->tp_status = TP_STATUS_SEND_REQUEST;
+
+		txq->last_used_idx++;
+		curr++;
+	}
+
+	ret = sendto(qp->sfd, NULL, 0, MSG_DONTWAIT, NULL, 0);
+	if (!(ret >= 0 || errno == EAGAIN || errno == ENOBUFS))
+		lassert(0);
+
+	benchmark.rx_release(queue_pair, start, end);
+
+	tx_npkts += (curr - start);
+}
+
+static inline void push_free_stack(struct tp4_umem *umem, unsigned long idx)
+{
+	umem->free_stack[--umem->free_stack_idx] = idx;
+}
+
+static inline unsigned long pop_free_stack(struct tp4_umem *umem)
+{
+	return	umem->free_stack[umem->free_stack_idx++];
+}
+
+static struct tp4_umem *alloc_and_register_buffers(size_t nbuffers)
+{
+	struct tpacket_memreg_req req = { .frame_size = FRAME_SIZE };
+	struct tp4_umem *umem;
+	size_t i;
+	int fd, ret;
+	void *bufs;
+
+	ret = posix_memalign((void **)&bufs, getpagesize(),
+			     nbuffers * req.frame_size);
+	lassert(ret == 0);
+
+	umem = calloc(1, sizeof(*umem));
+	lassert(umem);
+	fd = socket(PF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
+	lassert(fd > 0);
+	req.addr = (unsigned long)bufs;
+	req.len = nbuffers * req.frame_size;
+	ret = setsockopt(fd, SOL_PACKET, PACKET_MEMREG, &req, sizeof(req));
+	lassert(ret == 0);
+
+	umem->frame_size = FRAME_SIZE;
+	umem->frame_size_log2 = log2(FRAME_SIZE);
+	umem->buffer = bufs;
+	umem->size = nbuffers * req.frame_size;
+	umem->nframes = nbuffers;
+	umem->mr_fd = fd;
+
+	for (i = 0; i < nbuffers; i++)
+		umem->free_stack[i] = i;
+
+	for (i = 0; i < nbuffers; i++) {
+		tx_frame_len = gen_eth_frame(bufs, 42);
+		bufs += FRAME_SIZE;
+	}
+
+	return umem;
+}
+
+static inline int tp4q_enqueue(struct tpacket4_queue *q,
+			       const struct tpacket4_desc *d,
+			       unsigned int dcnt)
+{
+	unsigned int avail_idx = q->avail_idx;
+	unsigned int i;
+	int j;
+
+	if (q->num_free < dcnt)
+		return -ENOSPC;
+
+	q->num_free -= dcnt;
+
+	for (i = 0; i < dcnt; i++) {
+		unsigned int idx = (avail_idx++) & q->ring_mask;
+
+		q->ring[idx].idx = d[i].idx;
+		q->ring[idx].len = d[i].len;
+		q->ring[idx].offset = d[i].offset;
+		q->ring[idx].error = 0;
+	}
+	u_smp_wmb();
+
+	for (j = dcnt - 1; j >= 0; j--) {
+		unsigned int idx = (q->avail_idx + j) & q->ring_mask;
+
+		q->ring[idx].flags = d[j].flags | TP4_DESC_KERNEL;
+	}
+	q->avail_idx += dcnt;
+
+	return 0;
+}
+
+static void *tp4_configure(const char *interface_name)
+{
+	int sfd, noqdisc, ret, ver = TPACKET_V4;
+	struct tpacket_req4 req = {};
+	struct tp4_queue_pair *tqp;
+	struct sockaddr_ll ll;
+	unsigned int i;
+	void *rxring;
+
+	/* create PF_PACKET socket */
+	sfd = socket(PF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
+	lassert(sfd >= 0);
+	ret = setsockopt(sfd, SOL_PACKET, PACKET_VERSION, &ver, sizeof(ver));
+	lassert(ret == 0);
+
+	tqp = calloc(1, sizeof(*tqp));
+	lassert(tqp);
+
+	tqp->sfd = sfd;
+	tqp->interface_name = interface_name;
+
+	tqp->umem = alloc_and_register_buffers(NUM_BUFFERS);
+	lassert(tqp->umem);
+
+	req.mr_fd = tqp->umem->mr_fd;
+	req.desc_nr = NUM_DESCS;
+	ret = setsockopt(sfd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
+	lassert(ret == 0);
+	ret = setsockopt(sfd, SOL_PACKET, PACKET_TX_RING, &req, sizeof(req));
+	lassert(ret == 0);
+
+	rxring = mmap(0, 2 * req.desc_nr * sizeof(struct tpacket4_desc),
+		      PROT_READ | PROT_WRITE,
+		      MAP_SHARED | MAP_LOCKED | MAP_POPULATE, sfd, 0);
+	lassert(rxring != MAP_FAILED);
+
+	tqp->rx.ring = rxring;
+	tqp->rx.num_free = req.desc_nr;
+	tqp->rx.ring_mask = req.desc_nr - 1;
+
+	tqp->tx.ring = &tqp->rx.ring[req.desc_nr];
+	tqp->tx.num_free = req.desc_nr;
+	tqp->tx.ring_mask = req.desc_nr - 1;
+
+	ll.sll_family = PF_PACKET;
+	ll.sll_protocol = htons(ETH_P_ALL);
+	ll.sll_ifindex = if_nametoindex(interface_name);
+	ll.sll_hatype = 0;
+	ll.sll_pkttype = 0;
+	ll.sll_halen = 0;
+
+	noqdisc = 1;
+	ret = setsockopt(sfd, SOL_PACKET, PACKET_QDISC_BYPASS,
+			 &noqdisc, sizeof(noqdisc));
+	lassert(ret == 0);
+
+	ret = bind(sfd, (struct sockaddr *)&ll, sizeof(ll));
+	lassert(ret == 0);
+
+	if (opt_zerocopy > 0) {
+		ret = setsockopt(sfd, SOL_PACKET, PACKET_ZEROCOPY,
+				 &opt_zerocopy, sizeof(opt_zerocopy));
+		lassert(ret == 0);
+	}
+
+	for (i = 0; i < (tqp->rx.ring_mask + 1)/4; i++) {
+		struct tpacket4_desc desc = {};
+
+		desc.idx = i;
+		ret = tp4q_enqueue(&tqp->rx, &desc, 1);
+		lassert(ret == 0);
+	}
+
+	return tqp;
+}
+
+static void tp4_rx(void *queue_pair, unsigned int *start, unsigned int *end)
+{
+	struct tpacket4_queue *q = &((struct tp4_queue_pair *)queue_pair)->rx;
+	unsigned int idx, recv_size, last_used = q->last_used_idx;
+	unsigned int uncleared = (q->avail_idx - last_used);
+
+	*start = last_used;
+	*end = last_used;
+	recv_size = (uncleared < BATCH_SIZE) ? uncleared : BATCH_SIZE;
+
+	idx = (last_used + recv_size - 1) & q->ring_mask;
+	if (q->ring[idx].flags & TP4_DESC_KERNEL)
+		return;
+
+	*end += recv_size;
+	rx_npkts += recv_size;
+	q->num_free = recv_size;
+
+	u_smp_rmb();
+}
+
+static inline void tp4_rx_release(void *queue_pair, unsigned int start,
+				  unsigned int end)
+{
+	struct tp4_queue_pair *qp = queue_pair;
+	struct tpacket4_queue *q = &qp->rx;
+	struct tpacket4_desc *src, *dst;
+	unsigned int nitems = end - start;
+
+	while (nitems--) {
+		dst = &q->ring[(q->avail_idx++) & q->ring_mask];
+		src = &q->ring[start++ & q->ring_mask];
+		*dst = *src;
+
+		u_smp_wmb();
+
+		dst->flags = TP4_DESC_KERNEL;
+	}
+
+	q->last_used_idx += q->num_free;
+	q->num_free = 0;
+}
+
+static inline void *tp4_get_data(void *queue_pair, unsigned int idx,
+				 unsigned int *len)
+{
+	struct tp4_queue_pair *qp = (struct tp4_queue_pair *)queue_pair;
+	struct tp4_umem *umem = qp->umem;
+	struct tpacket4_desc *d;
+
+	d = &qp->rx.ring[idx & qp->rx.ring_mask];
+	*len = d->len;
+
+	return (char *)umem->buffer + (d->idx << umem->frame_size_log2)
+		+ d->offset;
+}
+
+
+static inline unsigned long tp4_get_data_desc(void *queue_pair,
+					      unsigned int idx,
+					      unsigned int *len,
+					      unsigned short *offset)
+{
+	struct tp4_queue_pair *qp = queue_pair;
+	struct tpacket4_queue *q = &qp->rx;
+	struct tpacket4_desc *d;
+
+	d = &q->ring[idx & q->ring_mask];
+	*len = d->len;
+	*offset = d->offset;
+
+	return d->idx;
+}
+
+static inline unsigned long tp4_get_data_desc_dummy(void *queue_pair,
+						    unsigned int idx,
+						    unsigned int *len,
+						    unsigned short *offset)
+{
+	struct tp4_queue_pair *qp = queue_pair;
+
+	(void)idx;
+
+	*len = tx_frame_len;
+	*offset = 0;
+
+	return pop_free_stack(qp->umem);
+}
+
+static inline void tp4_set_data_desc(void *queue_pair, unsigned int idx,
+				     unsigned long didx)
+{
+	struct tp4_queue_pair *qp = queue_pair;
+	struct tpacket4_queue *q = &qp->rx;
+	struct tpacket4_desc *d;
+
+	d = &q->ring[idx & q->ring_mask];
+	d->idx = didx;
+}
+
+static inline void tp4_set_data_desc_dummy(void *queue_pair, unsigned int idx,
+					   unsigned long didx)
+{
+	struct tp4_queue_pair *qp = queue_pair;
+
+	(void)idx;
+
+	push_free_stack(qp->umem, didx);
+}
+
+static void tp4_tx(void *queue_pair, unsigned int start, unsigned int end)
+{
+	struct tp4_queue_pair *qp = (struct tp4_queue_pair *)queue_pair;
+	struct tpacket4_queue *q = &qp->tx;
+	unsigned int i, aidx, uidx, send_size, s, entries, ncleared = 0;
+	unsigned long cleared[BATCH_SIZE];
+	int ret;
+
+	entries = end - start;
+
+	if (q->num_free != NUM_DESCS) {
+		for (i = 0; i < entries; i++) {
+			uidx = q->last_used_idx & q->ring_mask;
+			if (q->ring[uidx].flags & TP4_DESC_KERNEL)
+				break;
+
+			q->last_used_idx++;
+			cleared[i] = q->ring[uidx].idx;
+			q->num_free++;
+			ncleared++;
+		}
+	}
+
+	tx_npkts += ncleared;
+
+	send_size = (q->num_free < entries) ? q->num_free : entries;
+	i = 0;
+	s = start;
+	q->num_free -= send_size;
+
+	while (send_size--) {
+		aidx = q->avail_idx++ & q->ring_mask;
+
+		q->ring[aidx].idx = benchmark.get_data_desc(
+			qp, s, &q->ring[aidx].len,
+			&q->ring[aidx].offset);
+		if (i < ncleared)
+			benchmark.set_data_desc(qp, s++, cleared[i++]);
+
+		u_smp_wmb();
+
+		q->ring[aidx].flags = TP4_DESC_KERNEL;
+	}
+
+	benchmark.rx_release(queue_pair, start, start + ncleared);
+
+	ret = sendto(qp->sfd, NULL, 0, MSG_DONTWAIT, NULL, 0);
+	if (!(ret >= 0 || errno == EAGAIN || errno == ENOBUFS))
+		lassert(0);
+}
+
+static struct benchmark benchmarks[3][3] = {
+	{ /* V2 */
+		{ .configure = tp2_configure,
+		  .rx = tp2_rx,
+		  .get_data = NULL,
+		  .get_data_desc = NULL,
+		  .set_data_desc = NULL,
+		  .process = NULL,
+		  .rx_release = NULL,
+		  .tx = tp2_rx_release,
+		},
+		{ .configure = tp2_configure,
+		  .rx = rx_dummy,
+		  .get_data = get_data_dummy,
+		  .get_data_desc = NULL,
+		  .set_data_desc = NULL,
+		  .process = NULL,
+		  .rx_release = rx_release_dummy,
+		  .tx = tp2_tx,
+		},
+		{ .configure = tp2_configure,
+		  .rx = tp2_rx,
+		  .get_data = tp2_get_data,
+		  .get_data_desc = NULL,
+		  .set_data_desc = NULL,
+		  .process = process_swap_mac,
+		  .rx_release = tp2_rx_release,
+		  .tx = tp2_tx,
+		}
+	},
+	{ /* V3 */
+		{ .configure = tp3_configure,
+		  .rx = tp3_rx,
+		  .get_data = NULL,
+		  .get_data_desc = NULL,
+		  .set_data_desc = NULL,
+		  .process = NULL,
+		  .rx_release = NULL,
+		  .tx = tp3_rx_release,
+		},
+		{ .configure = tp3_configure,
+		  .rx = rx_dummy,
+		  .get_data = get_data_dummy,
+		  .get_data_desc = NULL,
+		  .set_data_desc = NULL,
+		  .process = NULL,
+		  .rx_release = rx_release_dummy,
+		  .tx = tp3_tx,
+		},
+		{ .configure = tp3_configure,
+		  .rx = tp3_rx,
+		  .get_data = tp3_get_data,
+		  .set_data_desc = NULL,
+		  .get_data_desc = NULL,
+		  .process = process_swap_mac,
+		  .rx_release = tp3_rx_release,
+		  .tx = tp3_tx,
+		}
+	},
+	{ /* V4 */
+		{ .configure = tp4_configure,
+		  .rx = tp4_rx,
+		  .get_data = NULL,
+		  .get_data_desc = NULL,
+		  .set_data_desc = NULL,
+		  .process = NULL,
+		  .rx_release = NULL,
+		  .tx = tp4_rx_release,
+		},
+		{ .configure = tp4_configure,
+		  .rx = rx_dummy,
+		  .get_data = NULL,
+		  .get_data_desc = tp4_get_data_desc_dummy,
+		  .set_data_desc = tp4_set_data_desc_dummy,
+		  .process = NULL,
+		  .rx_release = rx_release_dummy,
+		  .tx = tp4_tx,
+		},
+		{ .configure = tp4_configure,
+		  .rx = tp4_rx,
+		  .get_data = tp4_get_data,
+		  .get_data_desc = tp4_get_data_desc,
+		  .set_data_desc = tp4_set_data_desc,
+		  .process = process_swap_mac,
+		  .rx_release = tp4_rx_release,
+		  .tx = tp4_tx,
+		}
+	}
+};
+
+static struct benchmark *get_benchmark(enum tpacket_version ver,
+				       enum benchmark_type type)
+{
+	return &benchmarks[ver][type];
+}
+
+
+
+
+static struct option long_options[] = {
+	{"version", required_argument, 0, 'v'},
+	{"rxdrop", no_argument, 0, 'r'},
+	{"txonly", no_argument, 0, 't'},
+	{"l2fwd", no_argument, 0, 'l'},
+	{"zerocopy", required_argument, 0, 'z'},
+	{"interface", required_argument, 0, 'i'},
+	{0, 0, 0, 0}
+};
+
+static void usage(void)
+{
+	const char *str =
+		"  Usage: tpbench [OPTIONS]\n"
+		"  Options:\n"
+		"  -v, --version=n	Use tpacket version n (default 4)\n"
+		"  -r, --rxdrop		Discard all incoming packets (default)\n"
+		"  -t, --txonly		Only send packets\n"
+		"  -l, --l2fwd		MAC swap L2 forwarding\n"
+		"  -z, --zerocopy=n	Enable zero-copy on queue n\n"
+		"  -i, --interface=n	Run on interface n\n"
+		"\n";
+	fprintf(stderr, "%s", str);
+	exit(EXIT_FAILURE);
+}
+
+static void parse_command_line(int argc, char **argv)
+{
+	int option_index, c, version, ret;
+
+	opterr = 0;
+
+	for (;;) {
+		c = getopt_long(argc, argv, "v:rtlz:i:", long_options,
+				&option_index);
+		if (c == -1)
+			break;
+
+		switch (c) {
+		case 'v':
+			version = atoi(optarg);
+			if (version < 2 || version > 4) {
+				fprintf(stderr,
+					"ERROR: version has to be [2,4]\n");
+				usage();
+			}
+			opt_tpver = version - 2;
+			break;
+		case 'r':
+			opt_bench = BENCH_RXDROP;
+			break;
+		case 't':
+			opt_bench = BENCH_TXONLY;
+			break;
+		case 'l':
+			opt_bench = BENCH_L2FWD;
+			break;
+		case 'z':
+			opt_zerocopy = atoi(optarg);
+			break;
+		case 'i':
+			opt_if = optarg;
+			break;
+		default:
+			usage();
+		}
+	}
+
+	if (opt_zerocopy > 0 && opt_tpver != PV4) {
+		fprintf(stderr, "ERROR: version 4 required for zero-copy\n");
+		usage();
+	}
+
+	ret = if_nametoindex(opt_if);
+	if (!ret) {
+		fprintf(stderr, "ERROR: interface \"%s\" does not exist\n",
+			opt_if);
+		usage();
+	}
+}
+
+static void print_benchmark(bool running)
+{
+	const char *bench_str = "INVALID";
+
+	if (opt_bench == BENCH_RXDROP)
+		bench_str = "rxdrop";
+	else if (opt_bench == BENCH_TXONLY)
+		bench_str = "txonly";
+	else if (opt_bench == BENCH_L2FWD)
+		bench_str = "l2fwd";
+
+	printf("%s v%d %s ", opt_if, opt_tpver + 2, bench_str);
+	if (opt_zerocopy > 0)
+		printf("zc ");
+	else
+		printf("   ");
+
+	if (running) {
+		printf("running...");
+		fflush(stdout);
+	}
+}
+
+static void sigdie(int sig)
+{
+	unsigned long stop_time = get_nsecs();
+	long dt = stop_time - start_time;
+	(void)sig;
+
+	double rx_pps = rx_npkts * 1000000000. / dt;
+	double tx_pps = tx_npkts * 1000000000. / dt;
+
+	printf("\r");
+	print_benchmark(false);
+	printf("duration %4.2fs rx: %16lupkts @ %16.2fpps tx: %16lupkts @ %16.2fpps.\n",
+	       dt / 1000000000., rx_npkts, rx_pps, tx_npkts, tx_pps);
+
+	exit(EXIT_SUCCESS);
+}
+
+int main(int argc, char **argv)
+{
+	signal(SIGINT, sigdie);
+	parse_command_line(argc, argv);
+	print_benchmark(true);
+	benchmark = *get_benchmark(opt_tpver, opt_bench);
+	start_time = get_nsecs();
+	run_benchmark(opt_if);
+
+	return 0;
+}