Patchwork [v2] netmap backend (revised)

login
register
mail settings
Submitter Luigi Rizzo
Date Jan. 22, 2013, 7:12 a.m.
Message ID <20130122071215.GA37733@onelab2.iet.unipi.it>
Download mbox | patch
Permalink /patch/214353/
State New
Headers show

Comments

Luigi Rizzo - Jan. 22, 2013, 7:12 a.m.
reposting a version without changes that implement bounded
queues in net/queue.c

Hi,
the attached patch implements a qemu backend for the "netmap" API
thus allowing machines to attach to the VALE software switch as
well as netmap-supported cards (links below).

http://info.iet.unipi.it/~luigi/netmap/
http://info.iet.unipi.it/~luigi/vale/

This is a cleaned up version of code written last summer.

guest-guest speed using an e1000 frontend (with some modifications
related to interrupt moderation, will repost an updated version later):
up to 700 Kpps using sockets, and up to 5 Mpps using netmap within
the guests. I have not tried with virtio.

	cheers
	luigi



Signed-off-by: Luigi Rizzo <rizzo@iet.unipi.it>
--
 configure         |   31 +++++
 net/Makefile.objs |    1 +
 net/clients.h     |    4 +
 net/net.c         |    3 +
 net/qemu-netmap.c |  353 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 qapi-schema.json  |    8 +-


----- End forwarded message -----
Anthony Liguori - Jan. 22, 2013, 10:50 p.m.
Hi,

Thank you for submitting your patch series.  checkpatch.pl has
detected that one or more of the patches in this series violate
the QEMU coding style.

If you believe this message was sent in error, please ignore it
or respond here with an explanation.

Otherwise, please correct the coding style issues and resubmit a
new version of the patch.

For more information about QEMU coding style, see:

http://git.qemu.org/?p=qemu.git;a=blob_plain;f=CODING_STYLE;hb=HEAD

Here is the output from checkpatch.pl:

Subject: netmap backend (revised)
ERROR: code indent should never use tabs
#144: FILE: net/net.c:622:
+^I[NET_CLIENT_OPTIONS_KIND_NETMAP]    = net_init_netmap,$

ERROR: code indent should never use tabs
#194: FILE: net/qemu-netmap.c:40:
+#define ND(fd, ... )^I// debugging$

ERROR: do not use C99 // comments
#194: FILE: net/qemu-netmap.c:40:
+#define ND(fd, ... )	// debugging

ERROR: space prohibited before that close parenthesis ')'
#194: FILE: net/qemu-netmap.c:40:
+#define ND(fd, ... )	// debugging

WARNING: __func__ should be used instead of gcc specific __FUNCTION__
#201: FILE: net/qemu-netmap.c:47:
+                __FUNCTION__, __LINE__, ##__VA_ARGS__);         \

WARNING: braces {} are necessary for all arms of this statement
#214: FILE: net/qemu-netmap.c:60:
+                if (__cnt++ < lps)                              \
[...]

ERROR: code indent should never use tabs
#224: FILE: net/qemu-netmap.c:70:
+    int^I^I^Ifd;$

ERROR: code indent should never use tabs
#225: FILE: net/qemu-netmap.c:71:
+    int^I^I^Imemsize;$

ERROR: code indent should never use tabs
#226: FILE: net/qemu-netmap.c:72:
+    void^I^I*mem;$

ERROR: code indent should never use tabs
#227: FILE: net/qemu-netmap.c:73:
+    struct netmap_if^I*nifp;$

ERROR: code indent should never use tabs
#228: FILE: net/qemu-netmap.c:74:
+    struct netmap_ring^I*rx;$

ERROR: code indent should never use tabs
#229: FILE: net/qemu-netmap.c:75:
+    struct netmap_ring^I*tx;$

ERROR: code indent should never use tabs
#230: FILE: net/qemu-netmap.c:76:
+    char^I^Ifdname[128];^I/* normally /dev/netmap */$

ERROR: code indent should never use tabs
#231: FILE: net/qemu-netmap.c:77:
+    char^I^Iifname[128];^I/* maybe the nmreq here ? */$

ERROR: code indent should never use tabs
#235: FILE: net/qemu-netmap.c:81:
+    NetClientState^Inc;$

ERROR: code indent should never use tabs
#236: FILE: net/qemu-netmap.c:82:
+    struct netmap_state^Ime;$

ERROR: code indent should never use tabs
#237: FILE: net/qemu-netmap.c:83:
+    unsigned int^Iread_poll;$

ERROR: code indent should never use tabs
#238: FILE: net/qemu-netmap.c:84:
+    unsigned int^Iwrite_poll;$

ERROR: do not use C99 // comments
#241: FILE: net/qemu-netmap.c:87:
+// a fast copy routine only for multiples of 64 bytes, non overlapped.

ERROR: code indent should never use tabs
#277: FILE: net/qemu-netmap.c:123:
+^Ierror_report("Unable to open netmap device '%s'", me->fdname);$

ERROR: code indent should never use tabs
#278: FILE: net/qemu-netmap.c:124:
+^Ireturn -1;$

ERROR: code indent should never use tabs
#286: FILE: net/qemu-netmap.c:132:
+^Ierror_report("cannot get info on %s", me->ifname);$

ERROR: code indent should never use tabs
#287: FILE: net/qemu-netmap.c:133:
+^Igoto error;$

ERROR: code indent should never use tabs
#292: FILE: net/qemu-netmap.c:138:
+^Ierror_report("Unable to register %s", me->ifname);$

ERROR: code indent should never use tabs
#293: FILE: net/qemu-netmap.c:139:
+^Igoto error;$

ERROR: code indent should never use tabs
#298: FILE: net/qemu-netmap.c:144:
+^Ierror_report("Unable to mmap");$

ERROR: code indent should never use tabs
#299: FILE: net/qemu-netmap.c:145:
+^Ime->mem = NULL;$

ERROR: code indent should never use tabs
#300: FILE: net/qemu-netmap.c:146:
+^Igoto error;$

ERROR: do not use C99 // comments
#313: FILE: net/qemu-netmap.c:159:
+// XXX do we need the can-send routine ?

ERROR: do not use C99 // comments
#343: FILE: net/qemu-netmap.c:189:
+// update the read handler

ERROR: code indent should never use tabs
#347: FILE: net/qemu-netmap.c:193:
+^Is->read_poll = enable;$

ERROR: code indent should never use tabs
#348: FILE: net/qemu-netmap.c:194:
+^Inetmap_update_fd_handler(s);$

ERROR: do not use C99 // comments
#352: FILE: net/qemu-netmap.c:198:
+// update the write handler

ERROR: code indent should never use tabs
#356: FILE: net/qemu-netmap.c:202:
+^Is->write_poll = enable;$

ERROR: code indent should never use tabs
#357: FILE: net/qemu-netmap.c:203:
+^Inetmap_update_fd_handler(s);$

WARNING: line over 80 characters
#377: FILE: net/qemu-netmap.c:223:
+static ssize_t netmap_receive_raw(NetClientState *nc, const uint8_t *buf, size_t size)

ERROR: code indent should never use tabs
#383: FILE: net/qemu-netmap.c:229:
+^I/* request an early notification to avoid running dry */$

ERROR: code indent should never use tabs
#384: FILE: net/qemu-netmap.c:230:
+^Iif (ring->avail < ring->num_slots / 2 && s->write_poll == 0) {$

ERROR: code indent should never use tabs
#385: FILE: net/qemu-netmap.c:231:
+^I    netmap_write_poll(s, 1);$

ERROR: code indent should never use tabs
#386: FILE: net/qemu-netmap.c:232:
+^I}$

ERROR: code indent should never use tabs
#387: FILE: net/qemu-netmap.c:233:
+^Iif (ring->avail == 0) { // cannot write$

ERROR: do not use C99 // comments
#387: FILE: net/qemu-netmap.c:233:
+	if (ring->avail == 0) { // cannot write

ERROR: trailing statements should be on next line
#387: FILE: net/qemu-netmap.c:233:
+	if (ring->avail == 0) { // cannot write

ERROR: code indent should never use tabs
#388: FILE: net/qemu-netmap.c:234:
+^I    return 0;$

ERROR: code indent should never use tabs
#389: FILE: net/qemu-netmap.c:235:
+^I}$

ERROR: code indent should never use tabs
#390: FILE: net/qemu-netmap.c:236:
+^Iuint32_t i = ring->cur;$

ERROR: code indent should never use tabs
#391: FILE: net/qemu-netmap.c:237:
+^Iuint32_t idx = ring->slot[i].buf_idx;$

ERROR: code indent should never use tabs
#392: FILE: net/qemu-netmap.c:238:
+^Iuint8_t *dst = (u_char *)NETMAP_BUF(ring, idx);$

ERROR: code indent should never use tabs
#394: FILE: net/qemu-netmap.c:240:
+^Iring->slot[i].len = size;$

ERROR: code indent should never use tabs
#395: FILE: net/qemu-netmap.c:241:
+^Ipkt_copy(buf, dst, size);$

ERROR: code indent should never use tabs
#396: FILE: net/qemu-netmap.c:242:
+^Iring->cur = NETMAP_RING_NEXT(ring, i);$

ERROR: code indent should never use tabs
#397: FILE: net/qemu-netmap.c:243:
+^Iring->avail--;$

ERROR: do not use C99 // comments
#402: FILE: net/qemu-netmap.c:248:
+// complete a previous send (backend --> guest), enable the fd_read callback

ERROR: space prohibited before that close parenthesis ')'
#424: FILE: net/qemu-netmap.c:270:
+    while (ring->avail > 0 && qemu_can_send_packet(&s->nc) ) {

ERROR: code indent should never use tabs
#425: FILE: net/qemu-netmap.c:271:
+^Iuint32_t i = ring->cur;$

ERROR: code indent should never use tabs
#426: FILE: net/qemu-netmap.c:272:
+^Iuint32_t idx = ring->slot[i].buf_idx;$

ERROR: code indent should never use tabs
#427: FILE: net/qemu-netmap.c:273:
+^Iuint8_t *src = (u_char *)NETMAP_BUF(ring, idx);$

ERROR: code indent should never use tabs
#428: FILE: net/qemu-netmap.c:274:
+^Iint size = ring->slot[i].len;$

ERROR: code indent should never use tabs
#430: FILE: net/qemu-netmap.c:276:
+^Iring->cur = NETMAP_RING_NEXT(ring, i);$

ERROR: code indent should never use tabs
#431: FILE: net/qemu-netmap.c:277:
+^Iring->avail--;$

ERROR: code indent should never use tabs
#432: FILE: net/qemu-netmap.c:278:
+^Isent++;$

ERROR: code indent should never use tabs
#434: FILE: net/qemu-netmap.c:280:
+^Iif (size == 0) {$

ERROR: code indent should never use tabs
#435: FILE: net/qemu-netmap.c:281:
+^I    /* the guest does not receive anymore. Packet is queued, stop$

ERROR: code indent should never use tabs
#436: FILE: net/qemu-netmap.c:282:
+^I     * reading from the backend until netmap_send_completed()$

ERROR: code indent should never use tabs
#437: FILE: net/qemu-netmap.c:283:
+^I     */$

ERROR: code indent should never use tabs
#438: FILE: net/qemu-netmap.c:284:
+^I    netmap_read_poll(s, 0);$

ERROR: code indent should never use tabs
#439: FILE: net/qemu-netmap.c:285:
+^I    return;$

ERROR: code indent should never use tabs
#440: FILE: net/qemu-netmap.c:286:
+^I}$

ERROR: do not use C99 // comments
#442: FILE: net/qemu-netmap.c:288:
+    netmap_read_poll(s, 1); // probably useless.

ERROR: do not use C99 // comments
#446: FILE: net/qemu-netmap.c:292:
+// flush and close

ERROR: do not use C99 // comments
#475: FILE: net/qemu-netmap.c:321:
+//    .receive_raw = netmap_receive_raw,

ERROR: do not use C99 // comments
#476: FILE: net/qemu-netmap.c:322:
+//    .receive_iov = netmap_receive_iov,

WARNING: line over 80 characters
#486: FILE: net/qemu-netmap.c:332:
+int net_init_netmap(const NetClientOptions *opts, const char *name, NetClientState *peer)

ERROR: code indent should never use tabs
#496: FILE: net/qemu-netmap.c:342:
+^Inetmap_opts->has_ifname ? netmap_opts->ifname : "vale0");$

WARNING: braces {} are necessary for all arms of this statement
#497: FILE: net/qemu-netmap.c:343:
+    if (netmap_open(&me))
[...]

ERROR: code indent should never use tabs
#498: FILE: net/qemu-netmap.c:344:
+^Ireturn -1;$

ERROR: do not use C99 // comments
#504: FILE: net/qemu-netmap.c:350:
+    netmap_read_poll(s, 1); // initially only poll for reads.

total: 72 errors, 5 warnings, 461 lines checked

Your patch has style problems, please review.  If any of these errors
are false positives report them to the maintainer, see
CHECKPATCH in MAINTAINERS.



Regards,

Anthony Liguori
Luigi Rizzo - Jan. 23, 2013, 8:56 a.m.
On Tue, Jan 22, 2013 at 2:50 PM, Anthony Liguori <aliguori@us.ibm.com>wrote:

> Hi,
>
> Thank you for submitting your patch series.  checkpatch.pl has
> detected that one or more of the patches in this series violate
> the QEMU coding style.
>
> If you believe this message was sent in error, please ignore it
> or respond here with an explanation.
>
> Otherwise, please correct the coding style issues and resubmit a
> new version of the patch.
>
>
will do, thanks for the feedback
luigi
Stefan Hajnoczi - Jan. 23, 2013, 11:10 a.m.
On Tue, Jan 22, 2013 at 08:12:15AM +0100, Luigi Rizzo wrote:
> reposting a version without changes that implement bounded
> queues in net/queue.c
> 
> Hi,
> the attached patch implements a qemu backend for the "netmap" API
> thus allowing machines to attach to the VALE software switch as
> well as netmap-supported cards (links below).
> 
> http://info.iet.unipi.it/~luigi/netmap/
> http://info.iet.unipi.it/~luigi/vale/
> 
> This is a cleaned up version of code written last summer.

Looks like a clean software approach to low-level packet I/O.  I guess
the biggest competitor would be a userspace library with NIC drivers
using modern IOMMUs to avoid the security/reliability problem that
previous userspace approaches suffered.  Pretty cool that netmap reuses
kernel NIC driver implementations to avoid duplicating all that code.

I wonder how/if netmaps handles advanced NIC features like multiqueue
and offloads?  But I've only read the webpage, not the papers or source
code :).

> diff --git a/net/Makefile.objs b/net/Makefile.objs
> index a08cd14..068253f 100644
> --- a/net/Makefile.objs
> +++ b/net/Makefile.objs
> @@ -10,3 +10,4 @@ common-obj-$(CONFIG_AIX) += tap-aix.o
>  common-obj-$(CONFIG_HAIKU) += tap-haiku.o
>  common-obj-$(CONFIG_SLIRP) += slirp.o
>  common-obj-$(CONFIG_VDE) += vde.o
> +common-obj-$(CONFIG_NETMAP) += qemu-netmap.o

Please drop the "qemu-" prefix.

> diff --git a/net/net.c b/net/net.c
> index cdd9b04..816c987 100644
> --- a/net/net.c
> +++ b/net/net.c
> @@ -618,6 +618,9 @@ static int (* const net_client_init_fun[NET_CLIENT_OPTIONS_KIND_MAX])(
>          [NET_CLIENT_OPTIONS_KIND_BRIDGE]    = net_init_bridge,
>  #endif
>          [NET_CLIENT_OPTIONS_KIND_HUBPORT]   = net_init_hubport,
> +#ifdef CONFIG_NETMAP
> +	[NET_CLIENT_OPTIONS_KIND_NETMAP]    = net_init_netmap,
> +#endif

Please use 4 spaces for indentation and run scripts/checkpatch.pl to
scan patches.

>  };
>  
>  
> diff --git a/net/qemu-netmap.c b/net/qemu-netmap.c
> new file mode 100644
> index 0000000..79d7c09
> --- /dev/null
> +++ b/net/qemu-netmap.c
> @@ -0,0 +1,353 @@
> +/*
> + * netmap access for qemu
> + *
> + * Copyright (c) 2012-2013 Luigi Rizzo
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a copy
> + * of this software and associated documentation files (the "Software"), to deal
> + * in the Software without restriction, including without limitation the rights
> + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
> + * copies of the Software, and to permit persons to whom the Software is
> + * furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice shall be included in
> + * all copies or substantial portions of the Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
> + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
> + * THE SOFTWARE.
> + */
> +
> +#include "config-host.h"
> +
> +/* note paths are different for -head and 1.3 */

Please drop this comment from the patch.

> +#define ND(fd, ... )	// debugging
> +#define D(format, ...)                                          \
> +        do {                                                    \
> +                struct timeval __xxts;                          \
> +                gettimeofday(&__xxts, NULL);                    \
> +                printf("%03d.%06d %s [%d] " format "\n",        \
> +                (int)__xxts.tv_sec % 1000, (int)__xxts.tv_usec, \
> +                __FUNCTION__, __LINE__, ##__VA_ARGS__);         \
> +        } while (0)
> +
> +/* rate limited, lps indicates how many per second */
> +#define RD(lps, format, ...)                                    \
> +        do {                                                    \
> +                static int t0, __cnt;                           \
> +                struct timeval __xxts;                          \
> +                gettimeofday(&__xxts, NULL);                    \
> +                if (t0 != __xxts.tv_sec) {                      \
> +                        t0 = __xxts.tv_sec;                     \
> +                        __cnt = 0;                              \
> +                }                                               \
> +                if (__cnt++ < lps)                              \
> +                        D(format, ##__VA_ARGS__);               \
> +        } while (0)

Have you seen docs/tracing.txt?  It can do fprintf(stderr) but also
SystemTap/DTrace and a simple built-in binary tracer.

> +
> +
> +
> +/*
> + * private netmap device info
> + */
> +struct netmap_state {

QEMU code should use:

typedef struct {
    ...
} NetmapState;

See ./HACKING and ./CODING_STYLE.  The code usually avoids struct tags.

> +    int			fd;
> +    int			memsize;

size_t?

> +    void		*mem;
> +    struct netmap_if	*nifp;
> +    struct netmap_ring	*rx;
> +    struct netmap_ring	*tx;
> +    char		fdname[128];	/* normally /dev/netmap */

PATH_MAX?

> +    char		ifname[128];	/* maybe the nmreq here ? */

IFNAMSIZ?

> +};
> +
> +struct nm_state {
> +    NetClientState	nc;
> +    struct netmap_state	me;
> +    unsigned int	read_poll;
> +    unsigned int	write_poll;

bool for read_poll and write_poll.

> +};
> +
> +// a fast copy routine only for multiples of 64 bytes, non overlapped.
> +static inline void
> +pkt_copy(const void *_src, void *_dst, int l)
> +{
> +        const uint64_t *src = _src;
> +        uint64_t *dst = _dst;
> +#define likely(x)       __builtin_expect(!!(x), 1)
> +#define unlikely(x)       __builtin_expect(!!(x), 0)

Already defined in include/qemu/osdep.h.

> +        if (unlikely(l >= 1024)) {
> +                bcopy(src, dst, l);
> +                return;
> +        }
> +        for (; l > 0; l -= 64) {
> +                *dst++ = *src++;
> +                *dst++ = *src++;
> +                *dst++ = *src++;
> +                *dst++ = *src++;
> +                *dst++ = *src++;
> +                *dst++ = *src++;
> +                *dst++ = *src++;
> +                *dst++ = *src++;
> +        }
> +}

I wonder how different FreeBSD bcopy() is from glibc memcpy() and if the
optimization is even a win.  The glibc code is probably hand-written
assembly that CPU vendors have contributed for specific CPU model
families.

Did you compare glibc memcpy() against pkt_copy()?

> +
> +
> +/*
> + * open a netmap device. We assume there is only one queue
> + * (which is the case for the VALE bridge).
> + */
> +static int netmap_open(struct netmap_state *me)
> +{
> +    int fd, l, err;

Please use "size_t len" instead of "int l".

> +    struct nmreq req;
> +
> +    me->fd = fd = open(me->fdname, O_RDWR);
> +    if (fd < 0) {
> +	error_report("Unable to open netmap device '%s'", me->fdname);
> +	return -1;
> +    }
> +    bzero(&req, sizeof(req));
> +    pstrcpy(req.nr_name, sizeof(req.nr_name), me->ifname);
> +    req.nr_ringid = 0;
> +    req.nr_version = NETMAP_API;
> +    err = ioctl(fd, NIOCGINFO, &req);
> +    if (err) {
> +	error_report("cannot get info on %s", me->ifname);
> +	goto error;
> +    }
> +    l = me->memsize = req.nr_memsize;
> +    err = ioctl(fd, NIOCREGIF, &req);
> +    if (err) {
> +	error_report("Unable to register %s", me->ifname);
> +	goto error;
> +    }
> +
> +    me->mem = mmap(0, l, PROT_WRITE | PROT_READ, MAP_SHARED, fd, 0);
> +    if (me->mem == MAP_FAILED) {
> +	error_report("Unable to mmap");
> +	me->mem = NULL;
> +	goto error;
> +    }
> +
> +    me->nifp = NETMAP_IF(me->mem, req.nr_offset);
> +    me->tx = NETMAP_TXRING(me->nifp, 0);
> +    me->rx = NETMAP_RXRING(me->nifp, 0);
> +    return 0;
> +
> +error:
> +    close(me->fd);
> +    return -1;
> +}
> +
> +// XXX do we need the can-send routine ?
> +static int netmap_can_send(void *opaque)
> +{
> +    struct nm_state *s = opaque;
> +
> +    return qemu_can_send_packet(&s->nc);
> +}

Yes, I think it is a good idea.  We only receive packets if our peer can
also receive them.  (Why we still need queues is another question but at
least .read_poll() should be implemented IMO.)

> +
> +static void netmap_send(void *opaque);
> +static void netmap_writable(void *opaque);
> +
> +/*
> + * set the handlers for the device
> + */
> +static void netmap_update_fd_handler(struct nm_state *s)
> +{
> +#if 1
> +    qemu_set_fd_handler2(s->me.fd,
> +                         s->read_poll  ? netmap_can_send : NULL,
> +                         s->read_poll  ? netmap_send     : NULL,
> +                         s->write_poll ? netmap_writable : NULL,
> +                         s);
> +#else
> +    qemu_set_fd_handler(s->me.fd,
> +                         s->read_poll  ? netmap_send     : NULL,
> +                         s->write_poll ? netmap_writable : NULL,
> +                         s);
> +#endif

Please drop the #if.

> +}
> +
> +// update the read handler
> +static void netmap_read_poll(struct nm_state *s, int enable)

bool enable

> +{
> +    if (s->read_poll != enable) { /* do nothing if not changed */
> +	s->read_poll = enable;
> +	netmap_update_fd_handler(s);
> +    }
> +}
> +
> +// update the write handler
> +static void netmap_write_poll(struct nm_state *s, int enable)

bool enable

> +{
> +    if (s->write_poll != enable) {
> +	s->write_poll = enable;
> +	netmap_update_fd_handler(s);
> +    }
> +}
> +
> +/*
> + * the fd_write() callback, invoked if the fd is marked as
> + * writable after a poll. Reset the handler and flush any
> + * buffered packets.
> + */
> +static void netmap_writable(void *opaque)
> +{
> +    struct nm_state *s = opaque;
> +
> +    netmap_write_poll(s, 0);
> +    qemu_flush_queued_packets(&s->nc);
> +}

It feels like the net subsystem is missing an opportunity to extract
common code for file descriptor readability/writability.  We're
basically duplicating the code from tap.c.

> +
> +/*
> + * new data guest --> backend
> + */
> +static ssize_t netmap_receive_raw(NetClientState *nc, const uint8_t *buf, size_t size)
> +{
> +    struct nm_state *s = DO_UPCAST(struct nm_state, nc, nc);
> +    struct netmap_ring *ring = s->me.tx;
> +
> +    if (ring) {
> +	/* request an early notification to avoid running dry */
> +	if (ring->avail < ring->num_slots / 2 && s->write_poll == 0) {
> +	    netmap_write_poll(s, 1);
> +	}
> +	if (ring->avail == 0) { // cannot write
> +	    return 0;
> +	}
> +	uint32_t i = ring->cur;
> +	uint32_t idx = ring->slot[i].buf_idx;
> +	uint8_t *dst = (u_char *)NETMAP_BUF(ring, idx);

Why cast to u_char instead of uint8_t directly?

> +
> +	ring->slot[i].len = size;
> +	pkt_copy(buf, dst, size);

How does netmap guarantee that the buffer is large enough for this
packet?

> +	ring->cur = NETMAP_RING_NEXT(ring, i);
> +	ring->avail--;
> +    }
> +    return size;
> +}
> +
> +// complete a previous send (backend --> guest), enable the fd_read callback

Please use /* */ instead of //.  I'm not sure if we strictly enforce it
but it's rare to see // in QEMU.

> +static void netmap_send(void *opaque)
> +{
> +    struct nm_state *s = opaque;
> +    int sent = 0;
> +    struct netmap_ring *ring = s->me.rx;
> +
> +    /* only check ring->avail, let the packet be queued
> +     * with qemu_send_packet_async() if needed
> +     * XXX until we fix the propagation on the bridge we need to stop early
> +     */
> +    while (ring->avail > 0 && qemu_can_send_packet(&s->nc) ) {
> +	uint32_t i = ring->cur;
> +	uint32_t idx = ring->slot[i].buf_idx;
> +	uint8_t *src = (u_char *)NETMAP_BUF(ring, idx);
> +	int size = ring->slot[i].len;
> +
> +	ring->cur = NETMAP_RING_NEXT(ring, i);
> +	ring->avail--;
> +	sent++;

sent is unused?

> +static void netmap_cleanup(NetClientState *nc)
> +{
> +    struct nm_state *s = DO_UPCAST(struct nm_state, nc, nc);
> +
> +    qemu_purge_queued_packets(nc);
> +
> +    netmap_read_poll(s, 0);
> +    netmap_write_poll(s, 0);

These could be replaced with a single call to netmap_poll(nc, false).

> +    close(s->me.fd);

Missing munmap?

> +int net_init_netmap(const NetClientOptions *opts, const char *name, NetClientState *peer)
> +{
> +    const NetdevNetmapOptions *netmap_opts = opts->netmap;
> +    NetClientState *nc;
> +    struct netmap_state me;
> +    struct nm_state *s;
> +
> +    pstrcpy(me.fdname, sizeof(me.fdname), name ? name : "/dev/netmap");
> +    /* set default name for the port if not supplied */
> +    pstrcpy(me.ifname, sizeof(me.ifname),
> +	netmap_opts->has_ifname ? netmap_opts->ifname : "vale0");
> +    if (netmap_open(&me))

QEMU coding style uses braces, even when the if body is only one line.

> +	return -1;
> +
> +    /* create the object -- XXX use name or ifname ? */

The name should be neither the filename nor the ifname.  It's a separate
namespace and the user gets to decide the name.
Luigi Rizzo - Jan. 23, 2013, 11:50 a.m.
On Wed, Jan 23, 2013 at 12:10:55PM +0100, Stefan Hajnoczi wrote:
> On Tue, Jan 22, 2013 at 08:12:15AM +0100, Luigi Rizzo wrote:
> > reposting a version without changes that implement bounded
> > queues in net/queue.c
> > 
> > Hi,
> > the attached patch implements a qemu backend for the "netmap" API
> > thus allowing machines to attach to the VALE software switch as
> > well as netmap-supported cards (links below).
> > 
> > http://info.iet.unipi.it/~luigi/netmap/
> > http://info.iet.unipi.it/~luigi/vale/
> > 
> > This is a cleaned up version of code written last summer.
> 
> Looks like a clean software approach to low-level packet I/O.  I guess
> the biggest competitor would be a userspace library with NIC drivers
> using modern IOMMUs to avoid the security/reliability problem that
> previous userspace approaches suffered.  Pretty cool that netmap reuses
> kernel NIC driver implementations to avoid duplicating all that code.
> 
> I wonder how/if netmaps handles advanced NIC features like multiqueue
> and offloads?  But I've only read the webpage, not the papers or source
> code :).

there are definitely more details in the papers :)

IOMMU is not strictly necessary because userspace only sees packet
buffers and a device independent replica of the descriptor ring.
NIC registers and rings are only manipulated by the kernel within
system calls.

multiqueue is completely straightforward -- netmap exposes as many
queues as the hardware implements, you can attach file descriptors to
individual queues, bind threads to queues by just using pthread_setaffinity().

offloading so far is not supported, and that's part of the design:
it adds complexity at runtime to check the various possible
combinations of offloading in various places in the stack.
A single packet format makes the driver extremely efficient.

The thing that may make a difference is tcp segmentation and reassembly,
we will look at how to support it at some point.

I'apply the other changes you suggest below, thanks.
Only some comments:

The debugging macros (D, RD() )  are taken from our existing code,
> 
> > +#define ND(fd, ... )	// debugging
> > +#define D(format, ...)                                          \
...
> > +/* rate limited, lps indicates how many per second */
> > +#define RD(lps, format, ...)                                    \
...

> Have you seen docs/tracing.txt?  It can do fprintf(stderr) but also
> SystemTap/DTrace and a simple built-in binary tracer.

will look at it. These debugging macros are the same we use in other
netmap code so i'd prefer to keep them.

> > +// a fast copy routine only for multiples of 64 bytes, non overlapped.
> > +static inline void
> > +pkt_copy(const void *_src, void *_dst, int l)
...
> > +                *dst++ = *src++;
> > +        }
> > +}
> 
> I wonder how different FreeBSD bcopy() is from glibc memcpy() and if the
> optimization is even a win.  The glibc code is probably hand-written
> assembly that CPU vendors have contributed for specific CPU model
> families.
> 
> Did you compare glibc memcpy() against pkt_copy()?

I haven't tried in detail on glibc but will run some tests.  In any
case not all systems have glibc, and on FreeBSD this pkt_copy was
a significant win for small packets (saving some 20ns each; of
course this counts only when you approach the 10 Mpps range, which
is what you get with netmap, and of course when data is in cache).

One reason pkt_copy gains something is that if it can assume there
is extra space in the buffer, it can work on large chunks avoiding the extra
jumps and instructions for the remaining 1-2-4 bytes.

> > +	ring->slot[i].len = size;
> > +	pkt_copy(buf, dst, size);
> 
> How does netmap guarantee that the buffer is large enough for this
> packet?

the netmap buffers are 2k, i'll make sure there is a check that the
size is not exceeded.

> > +    close(s->me.fd);
> 
> Missing munmap?

yes you are correct.

	cheers
	luigi
Stefan Hajnoczi - Jan. 23, 2013, 1:03 p.m.
On Wed, Jan 23, 2013 at 12:50:26PM +0100, Luigi Rizzo wrote:
> On Wed, Jan 23, 2013 at 12:10:55PM +0100, Stefan Hajnoczi wrote:
> > On Tue, Jan 22, 2013 at 08:12:15AM +0100, Luigi Rizzo wrote:
> The debugging macros (D, RD() )  are taken from our existing code,
> > 
> > > +#define ND(fd, ... )	// debugging
> > > +#define D(format, ...)                                          \
> ...
> > > +/* rate limited, lps indicates how many per second */
> > > +#define RD(lps, format, ...)                                    \
> ...
> 
> > Have you seen docs/tracing.txt?  It can do fprintf(stderr) but also
> > SystemTap/DTrace and a simple built-in binary tracer.
> 
> will look at it. These debugging macros are the same we use in other
> netmap code so i'd prefer to keep them.

Okay.  Feel free to leave them.

> > > +// a fast copy routine only for multiples of 64 bytes, non overlapped.
> > > +static inline void
> > > +pkt_copy(const void *_src, void *_dst, int l)
> ...
> > > +                *dst++ = *src++;
> > > +        }
> > > +}
> > 
> > I wonder how different FreeBSD bcopy() is from glibc memcpy() and if the
> > optimization is even a win.  The glibc code is probably hand-written
> > assembly that CPU vendors have contributed for specific CPU model
> > families.
> > 
> > Did you compare glibc memcpy() against pkt_copy()?
> 
> I haven't tried in detail on glibc but will run some tests.  In any
> case not all systems have glibc, and on FreeBSD this pkt_copy was
> a significant win for small packets (saving some 20ns each; of
> course this counts only when you approach the 10 Mpps range, which
> is what you get with netmap, and of course when data is in cache).
> 
> One reason pkt_copy gains something is that if it can assume there
> is extra space in the buffer, it can work on large chunks avoiding the extra
> jumps and instructions for the remaining 1-2-4 bytes.

I'd like to drop this code or at least make it FreeBSD-specific since
there's no guarantee that this is a good idea on any other libc.

I'm even doubtful that it's always a win on FreeBSD.  You have a
threshold to fall back to bcopy() and who knows what the "best" value
for various CPUs is.

Stefan
Luigi Rizzo - Jan. 23, 2013, 4:03 p.m.
On Wed, Jan 23, 2013 at 02:03:17PM +0100, Stefan Hajnoczi wrote:
> On Wed, Jan 23, 2013 at 12:50:26PM +0100, Luigi Rizzo wrote:
> > On Wed, Jan 23, 2013 at 12:10:55PM +0100, Stefan Hajnoczi wrote:
> > > On Tue, Jan 22, 2013 at 08:12:15AM +0100, Luigi Rizzo wrote:
...
> > > > +// a fast copy routine only for multiples of 64 bytes, non overlapped.
> > > > +static inline void
> > > > +pkt_copy(const void *_src, void *_dst, int l)
> > ...
> > > > +                *dst++ = *src++;
> > > > +        }
> > > > +}
> > > 
> > > I wonder how different FreeBSD bcopy() is from glibc memcpy() and if the
> > > optimization is even a win.  The glibc code is probably hand-written
> > > assembly that CPU vendors have contributed for specific CPU model
> > > families.
> > > 
> > > Did you compare glibc memcpy() against pkt_copy()?
> > 
> > I haven't tried in detail on glibc but will run some tests.  In any
> > case not all systems have glibc, and on FreeBSD this pkt_copy was
> > a significant win for small packets (saving some 20ns each; of
> > course this counts only when you approach the 10 Mpps range, which
> > is what you get with netmap, and of course when data is in cache).
> > 
> > One reason pkt_copy gains something is that if it can assume there
> > is extra space in the buffer, it can work on large chunks avoiding the extra
> > jumps and instructions for the remaining 1-2-4 bytes.
> 
> I'd like to drop this code or at least make it FreeBSD-specific since
> there's no guarantee that this is a good idea on any other libc.
> 
> I'm even doubtful that it's always a win on FreeBSD.  You have a
> threshold to fall back to bcopy() and who knows what the "best" value
> for various CPUs is.

indeed.
With the attached program (which however might be affected by the
fact that data is not used after copying) it seems that on a recent
linux (using gcc 4.6.2) the fastest is __builtin_memcpy()

	./testlock -m __builtin_memcpy -l 64

(by a factor of 2 or more) whereas all the other methods have
approximately the same speed.

On FreeBSD (with clang, gcc 4.2.1, gcc 4.6.4) the pkt_copy() above

	./testlock -m fastcopy -l 64

is largely better than other methods. I am a bit puzzled why
the builtin method on FreeBSD is not effective, but i will check
on some other forum...

cheers
luigi

/*
 * Copyright (C) 2012 Luigi Rizzo. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *   1. Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *   2. Redistributions in binary form must reproduce the above copyright
 *      notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * $Id: testlock.c 12015 2013-01-23 15:51:17Z luigi $
 *
 * Test program to study various ops and concurrency issues.
 * Create multiple threads, possibly bind to cpus, and run a workload.
 *
 * cc -O2 -Werror -Wall testlock.c -o testlock -lpthread
 *	you might need -lrt
 */

#include <inttypes.h>
#include <sys/types.h>
#include <pthread.h>	/* pthread_* */

#if defined(__APPLE__)

#include <libkern/OSAtomic.h>
#define atomic_add_int(p, n) OSAtomicAdd32(n, (int *)p)
#define	atomic_cmpset_32(p, o, n)	OSAtomicCompareAndSwap32(o, n, (int *)p)

#elif defined(linux)

int atomic_cmpset_32(volatile uint32_t *p, uint32_t old, uint32_t new)
{
	int ret = *p == old;
	*p = new;
	return ret;
}

#if defined(HAVE_GCC_ATOMICS)
int atomic_add_int(volatile int *p, int v)
{
        return __sync_fetch_and_add(p, v);
}
#else
inline
uint32_t atomic_add_int(uint32_t *p, int v)
{
        __asm __volatile (
        "       lock   xaddl   %0, %1 ;        "
        : "+r" (v),                     /* 0 (result) */
          "=m" (*p)                     /* 1 */
        : "m" (*p));                    /* 2 */
        return (v);
}
#endif

#else /* FreeBSD */
#include <sys/param.h>
#include <machine/atomic.h>
#include <pthread_np.h>	/* pthread w/ affinity */

#if __FreeBSD_version > 500000
#include <sys/cpuset.h>	/* cpu_set */
#if __FreeBSD_version > 800000
#define HAVE_AFFINITY
#endif

inline void prefetch (const void *x)
{
        __asm volatile("prefetcht0 %0" :: "m" (*(const unsigned long *)x));
}


#else /* FreeBSD 4.x */
int atomic_cmpset_32(volatile uint32_t *p, uint32_t old, uint32_t new)
{
	int ret = *p == old;
	*p = new;
	return ret;
}

#define PRIu64	"llu"
#endif /* FreeBSD 4.x */

#endif /* FreeBSD */

#include <signal.h>	/* signal */
#include <stdlib.h>
#include <stdio.h>
#include <poll.h>
#include <inttypes.h>	/* PRI* macros */
#include <string.h>	/* strcmp */
#include <fcntl.h>	/* open */
#include <unistd.h>	/* getopt */


#include <sys/sysctl.h>	/* sysctl */
#include <sys/time.h>	/* timersub */

static inline int min(int a, int b) { return a < b ? a : b; }

#define ONE_MILLION	1000000
/* debug support */
#define ND(format, ...)			
#define D(format, ...)				\
	fprintf(stderr, "%s [%d] " format "\n",	\
	__FUNCTION__, __LINE__, ##__VA_ARGS__)

int verbose = 0;

#if 1//def MY_RDTSC
/* Wrapper around `rdtsc' to take reliable timestamps flushing the pipeline */ 
#define my_rdtsc(t)				\
	do {					\
		u_int __regs[4];		\
						\
		do_cpuid(0, __regs);		\
		(t) = rdtsc();			\
	} while (0)

static __inline void
do_cpuid(u_int ax, u_int *p)
{
	__asm __volatile("cpuid"
			 : "=a" (p[0]), "=b" (p[1]), "=c" (p[2]), "=d" (p[3])
			 :  "0" (ax) );
}

static __inline uint64_t
rdtsc(void)
{
	uint64_t rv;

	// XXX does not work on linux-64 bit
	__asm __volatile("rdtscp" : "=A" (rv) : : "%rax");
	return (rv);
}
#endif /* 1 */

struct targ;

/*** global arguments for all threads ***/
struct glob_arg {
	struct  {
		uint32_t	ctr[1024];
	} v __attribute__ ((aligned(256) ));
	int64_t m_cycles;	/* total cycles */
	int nthreads;
	int cpus;
	int privs;	// 1 if has IO privileges
	int arg;	// microseconds in usleep
	char *test_name;
	void (*fn)(struct targ *);
	uint64_t scale;	// scaling factor
	char *scale_name;	// scaling factor
};

/*
 * Arguments for a new thread.
 */
struct targ {
	struct glob_arg *g;
	int		completed;
	u_int		*glob_ctr;
	uint64_t volatile count;
	struct timeval	tic, toc;
	int		me;
	pthread_t	thread;
	int		affinity;
};


static struct targ *ta;
static int global_nthreads;

/* control-C handler */
static void
sigint_h(int sig)
{
	int i;

	(void)sig;	/* UNUSED */
	for (i = 0; i < global_nthreads; i++) {
		/* cancel active threads. */
		if (ta[i].completed)
			continue;
		D("Cancelling thread #%d\n", i);
		pthread_cancel(ta[i].thread);
		ta[i].completed = 0;
	}
	signal(SIGINT, SIG_DFL);
}


/* sysctl wrapper to return the number of active CPUs */
static int
system_ncpus(void)
{
#ifdef linux
	return 1;
#else
	int mib[2] = { CTL_HW, HW_NCPU}, ncpus;
	size_t len = sizeof(mib);
	sysctl(mib, len / sizeof(mib[0]), &ncpus, &len, NULL, 0);
	D("system had %d cpus", ncpus);

	return (ncpus);
#endif
}

/*
 * try to get I/O privileges so we can execute cli/sti etc.
 */
int
getprivs(void)
{
	int fd = open("/dev/io", O_RDWR);
	if (fd < 0) {
		D("cannot open /dev/io, fd %d", fd);
		return 0;
	}
	return 1;
}

/* set the thread affinity. */
/* ARGSUSED */
#ifdef HAVE_AFFINITY
static int
setaffinity(pthread_t me, int i)
{
	cpuset_t cpumask;

	if (i == -1)
		return 0;

	/* Set thread affinity affinity.*/
	CPU_ZERO(&cpumask);
	CPU_SET(i, &cpumask);

	if (pthread_setaffinity_np(me, sizeof(cpuset_t), &cpumask) != 0) {
		D("Unable to set affinity");
		return 1;
	}
	return 0;
}
#endif


static void *
td_body(void *data)
{
	struct targ *t = (struct targ *) data;

#ifdef HAVE_AFFINITY
	if (0 == setaffinity(t->thread, t->affinity))
#endif
	{
		/* main loop.*/
		D("testing %ld cycles", t->g->m_cycles);
		gettimeofday(&t->tic, NULL);
		t->g->fn(t);
		gettimeofday(&t->toc, NULL);
	}
	t->completed = 1;
	return (NULL);
}

void
test_sel(struct targ *t)
{
	int64_t m;
	for (m = 0; m < t->g->m_cycles; m++) {
		fd_set r;
		struct timeval to = { 0, t->g->arg};
		FD_ZERO(&r);
		FD_SET(0,&r);
		// FD_SET(1,&r);
		select(1, &r, NULL, NULL, &to);
		t->count++;
	}
}

void
test_poll(struct targ *t)
{
	int64_t m, ms = t->g->arg/1000;
	for (m = 0; m < t->g->m_cycles; m++) {
		struct pollfd x;
		x.fd = 0;
		x.events = POLLIN;
		poll(&x, 1, ms);
		t->count++;
	}
}

void
test_usleep(struct targ *t)
{
	int64_t m;
	for (m = 0; m < t->g->m_cycles; m++) {
		usleep(t->g->arg);
		t->count++;
	}
}

void
test_cli(struct targ *t)
{
        int64_t m, i;
	if (!t->g->privs) {	
		D("%s", "privileged instructions not available");
		return;
	}
        for (m = 0; m < t->g->m_cycles; m++) {
		for (i = 0; i < ONE_MILLION; i++) {
			__asm __volatile("cli;");
			__asm __volatile("and %eax, %eax;");
			__asm __volatile("sti;");
			t->count++;
		}
        }
}

void
test_nop(struct targ *t)
{
        int64_t m, i;
        for (m = 0; m < t->g->m_cycles; m++) {
		for (i = 0; i < ONE_MILLION; i++) {
			__asm __volatile("nop;");
			__asm __volatile("nop; nop; nop; nop; nop;");
			//__asm __volatile("nop; nop; nop; nop; nop;");
			t->count++;
		}
	}
}

void
test_rdtsc1(struct targ *t)
{
        int64_t m, i;
	uint64_t v;
	(void)v;
        for (m = 0; m < t->g->m_cycles; m++) {
		for (i = 0; i < ONE_MILLION; i++) {
                	my_rdtsc(v);
			t->count++;
		}
        }
}

void
test_rdtsc(struct targ *t)
{
        int64_t m, i;
	volatile uint64_t v;
	(void)v;
        for (m = 0; m < t->g->m_cycles; m++) {
		for (i = 0; i < ONE_MILLION; i++) {
                	v = rdtsc();
			t->count++;
		}
        }
}

void
test_add(struct targ *t)
{
        int64_t m, i;
        for (m = 0; m < t->g->m_cycles; m++) {
		for (i = 0; i < ONE_MILLION; i++) {
                	t->glob_ctr[0] ++;
			t->count++;
		}
        }
}

void
test_atomic_add(struct targ *t)
{
        int64_t m, i;
        for (m = 0; m < t->g->m_cycles; m++) {
		for (i = 0; i < ONE_MILLION; i++) {
                	atomic_add_int(t->glob_ctr, 1);
			t->count++;
		}
        }
}

void
test_atomic_cmpset(struct targ *t)
{
        int64_t m, i;
        for (m = 0; m < t->g->m_cycles; m++) {
		for (i = 0; i < ONE_MILLION; i++) {
		        atomic_cmpset_32(t->glob_ctr, m, i);
			t->count++;
		}
        }
}

void
test_time(struct targ *t)
{
        int64_t m;
        for (m = 0; m < t->g->m_cycles; m++) {
#ifndef __APPLE__
		struct timespec ts;
		clock_gettime(t->g->arg, &ts);
#endif
		t->count++;
        }
}

void
test_gettimeofday(struct targ *t)
{
        int64_t m;
	struct timeval ts;
        for (m = 0; m < t->g->m_cycles; m++) {
		gettimeofday(&ts, NULL);
		t->count++;
        }
}

/*
 * getppid is the simplest system call (getpid is cached by glibc
 * so it would not be a good test)
 */
void
test_getpid(struct targ *t)
{
        int64_t m;
        for (m = 0; m < t->g->m_cycles; m++) {
		getppid();
		t->count++;
        }
}


#define likely(x)	__builtin_expect(!!(x), 1)
#define unlikely(x)	__builtin_expect(!!(x), 0)

static void
fast_bcopy(void *_src, void *_dst, int l)
{
	uint64_t *src = _src;
	uint64_t *dst = _dst;
	if (unlikely(l >= 1024)) {
		bcopy(src, dst, l);
		return;
	}
	for (; likely(l > 0); l-=64) {
		*dst++ = *src++;
		*dst++ = *src++;
		*dst++ = *src++;
		*dst++ = *src++;
		*dst++ = *src++;
		*dst++ = *src++;
		*dst++ = *src++;
		*dst++ = *src++;
	}
}

// XXX if you want to make sure there is no inlining...
// static void (*fp)(void *_src, void *_dst, int l) = fast_bcopy;

#define HU	0x3ffff
static struct glob_arg huge[HU+1];

void
test_fastcopy(struct targ *t)
{
        int64_t m;
	int len = t->g->arg;

	if (len > (int)sizeof(struct glob_arg))
		len = sizeof(struct glob_arg);
	D("fast copying %d bytes", len);
        for (m = 0; m < t->g->m_cycles; m++) {
		fast_bcopy(t->g, (void *)&huge[m & HU], len);
		t->count+=1;
        }
}

void
test_bcopy(struct targ *t)
{
        int64_t m;
	int len = t->g->arg;

	if (len > (int)sizeof(struct glob_arg))
		len = sizeof(struct glob_arg);
	D("bcopying %d bytes", len);
        for (m = 0; m < t->g->m_cycles; m++) {
		bcopy(t->g, (void *)&huge[m & HU], len);
		t->count+=1;
        }
}

void
test_builtin_memcpy(struct targ *t)
{
        int64_t m;
	int len = t->g->arg;

	if (len > (int)sizeof(struct glob_arg))
		len = sizeof(struct glob_arg);
	D("bcopying %d bytes", len);
        for (m = 0; m < t->g->m_cycles; m++) {
		__builtin_memcpy(t->g, (void *)&huge[m & HU], len);
		t->count+=1;
        }
}

void
test_memcpy(struct targ *t)
{
        int64_t m;
	int len = t->g->arg;

	if (len > (int)sizeof(struct glob_arg))
		len = sizeof(struct glob_arg);
	D("memcopying %d bytes", len);
        for (m = 0; m < t->g->m_cycles; m++) {
		memcpy((void *)&huge[m & HU], t->g, len);
		t->count+=1;
        }
}

struct entry {
	void (*fn)(struct targ *);
	char *name;
	uint64_t scale;
	uint64_t m_cycles;
};
struct entry tests[] = {
	{ test_sel, "select", 1, 1000 },
	{ test_poll, "poll", 1, 1000 },
	{ test_usleep, "usleep", 1, 1000 },
	{ test_time, "time", 1, 1000 },
	{ test_gettimeofday, "gettimeofday", 1, 1000000 },
	{ test_getpid, "getpid", 1, 1000000 },
	{ test_bcopy, "bcopy", 1000, 100000000 },
	{ test_builtin_memcpy, "__builtin_memcpy", 1000, 100000000 },
	{ test_memcpy, "memcpy", 1000, 100000000 },
	{ test_fastcopy, "fastcopy", 1000, 100000000 },
	{ test_add, "add", ONE_MILLION, 100000000 },
	{ test_nop, "nop", ONE_MILLION, 100000000 },
	{ test_atomic_add, "atomic-add", ONE_MILLION, 100000000 },
	{ test_cli, "cli", ONE_MILLION, 100000000 },
	{ test_rdtsc, "rdtsc", ONE_MILLION, 100000000 },	// unserialized
	{ test_rdtsc1, "rdtsc1", ONE_MILLION, 100000000 },	// serialized
	{ test_atomic_cmpset, "cmpset", ONE_MILLION, 100000000 },
	{ NULL, NULL, 0, 0 }
};

static void
usage(void)
{
	const char *cmd = "test";
	int i;

	fprintf(stderr,
		"Usage:\n"
		"%s arguments\n"
		"\t-m name		test name\n"
		"\t-n cycles		(millions) of cycles\n"
		"\t-l arg		bytes, usec, ... \n"
		"\t-t threads		total threads\n"
		"\t-c cores		cores to use\n"
		"\t-a n			force affinity every n cores\n"
		"\t-A n			cache contention every n bytes\n"
		"\t-w report_ms		milliseconds between reports\n"
		"",
		cmd);
	fprintf(stderr, "Available tests:\n");
	for (i = 0; tests[i].name; i++) {
		fprintf(stderr, "%12s\n", tests[i].name);
	}

	exit(0);
}

static int64_t
getnum(const char *s)
{
	int64_t n;
	char *e;

	n = strtol(s, &e, 0);
	switch (e ? *e : '\0')  {
	case 'k':
	case 'K':
		return n*1000;
	case 'm':
	case 'M':
		return n*1000*1000;
	case 'g':
	case 'G':
		return n*1000*1000*1000;
	case 't':
	case 'T':
		return n*1000*1000*1000*1000;
	default:
		return n;
	}
}

struct glob_arg g;
int
main(int argc, char **argv)
{
	int i, ch, report_interval, affinity, align;

	ND("g has size %d", (int)sizeof(g));
	report_interval = 250;	/* ms */
	affinity = 0;		/* no affinity */
	align = 0;		/* global variable */

	bzero(&g, sizeof(g));

	g.privs = getprivs();
	g.nthreads = 1;
	g.cpus = 1;
	g.m_cycles = 0;

	while ( (ch = getopt(argc, argv, "A:a:m:n:w:c:t:vl:")) != -1) {
		switch(ch) {
		default:
			D("bad option %c %s", ch, optarg);
			usage();
			break;
		case 'A':	/* align */
			align = atoi(optarg);
			break;
		case 'a':	/* force affinity */
			affinity = atoi(optarg);
			break;
		case 'n':	/* cycles */
			g.m_cycles = getnum(optarg);
			break;
		case 'w':	/* report interval */
			report_interval = atoi(optarg);
			break;
		case 'c':
			g.cpus = atoi(optarg);
			break;
		case 't':
			g.nthreads = atoi(optarg);
			break;
		case 'm':
			g.test_name = optarg;
			break;
		case 'l':
			g.arg = getnum(optarg);
			break;

		case 'v':
			verbose++;
			break;
		}
	}
	argc -= optind;
	argv += optind;
	if (!g.test_name && argc > 0)
		g.test_name = argv[0];

	if (g.test_name) {
		for (i = 0; tests[i].name; i++) {
			if (!strcmp(g.test_name, tests[i].name)) {
				g.fn = tests[i].fn;
				g.scale = tests[i].scale;
				if (g.m_cycles == 0)
					g.m_cycles = tests[i].m_cycles;
				if (g.scale == ONE_MILLION)
					g.scale_name = "M";
				else if (g.scale == 1000)
					g.scale_name = "K";
				else {
					g.scale = 1;
					g.scale_name = "";
				}
				break;
			}
		}
	}
	if (!g.fn) {
		D("%s", "missing/unknown test name");
		usage();
	}
	i = system_ncpus();
	if (g.cpus < 0 || g.cpus > i) {
		D("%d cpus is too high, have only %d cpus", g.cpus, i);
		usage();
	}
	if (g.cpus == 0)
		g.cpus = i;
	if (g.nthreads < 1) {
		D("bad nthreads %d, using 1", g.nthreads);
		g.nthreads = 1;
	}
	i = sizeof(g.v.ctr) / g.nthreads*sizeof(g.v.ctr[0]);
	if (align < 0 || align > i) {
		D("bad align %d, max is %d", align, i);
		align = i;
	}

	/* Install ^C handler. */
	global_nthreads = g.nthreads;
	signal(SIGINT, sigint_h);

	ta = calloc(g.nthreads, sizeof(*ta));
	/*
	 * Now create the desired number of threads, each one
	 * using a single descriptor.
 	 */
	D("start %d threads on %d cores", g.nthreads, g.cpus);
	for (i = 0; i < g.nthreads; i++) {
		struct targ *t = &ta[i];
		bzero(t, sizeof(*t));
		t->g = &g;
		t->me = i;
		t->glob_ctr = &g.v.ctr[(i*align)/sizeof(g.v.ctr[0])];
		D("thread %d ptr %p", i, t->glob_ctr);
		t->affinity = affinity ? (affinity*i) % g.cpus : -1;
		if (pthread_create(&t->thread, NULL, td_body, t) == -1) {
			D("Unable to create thread %d", i);
			t->completed = 1;
		}
	}
	/* the main loop */

    {
	uint64_t my_count = 0, prev = 0;
	uint64_t count = 0;
	double delta_t;
	struct timeval tic, toc;

	gettimeofday(&toc, NULL);
	for (;;) {
		struct timeval now, delta;
		uint64_t pps;
		int done = 0;

		delta.tv_sec = report_interval/1000;
		delta.tv_usec = (report_interval%1000)*1000;
		select(0, NULL, NULL, NULL, &delta);
		gettimeofday(&now, NULL);
		timersub(&now, &toc, &toc);
		my_count = 0;
		for (i = 0; i < g.nthreads; i++) {
			my_count += ta[i].count;
			if (ta[i].completed)
				done++;
		}
		pps = toc.tv_sec* ONE_MILLION + toc.tv_usec;
		if (pps < 10000)
			continue;
		pps = (my_count - prev)*ONE_MILLION / pps;
		D("%" PRIu64 " %scycles/s scale %" PRIu64 " in %dus", pps/g.scale,
			g.scale_name, g.scale, (int)(toc.tv_sec* ONE_MILLION + toc.tv_usec));
		prev = my_count;
		toc = now;
		if (done == g.nthreads)
			break;
	}
	D("total %" PRIu64 " cycles", prev);

	timerclear(&tic);
	timerclear(&toc);
	for (i = 0; i < g.nthreads; i++) {
		pthread_join(ta[i].thread, NULL);

		if (ta[i].completed == 0)
			continue;

		/*
		 * Collect threads o1utput and extract information about
		 * how log it took to send all the packets.
		 */
		count += ta[i].count;
		if (!timerisset(&tic) || timercmp(&ta[i].tic, &tic, <))
			tic = ta[i].tic;
		if (!timerisset(&toc) || timercmp(&ta[i].toc, &toc, >))
			toc = ta[i].toc;
	}

	/* print output. */
	timersub(&toc, &tic, &toc);
	delta_t = toc.tv_sec + 1e-6* toc.tv_usec;
	D("total %8.6f seconds", delta_t);
    }

	return (0);
}
/* end of file */
Luigi Rizzo - Jan. 24, 2013, 2:55 a.m.
On Wed, Jan 23, 2013 at 8:03 AM, Luigi Rizzo <rizzo@iet.unipi.it> wrote:

> > I'm even doubtful that it's always a win on FreeBSD.  You have a
> > threshold to fall back to bcopy() and who knows what the "best" value
> > for various CPUs is.
>
> indeed.
> With the attached program (which however might be affected by the
> fact that data is not used after copying) it seems that on a recent
> linux (using gcc 4.6.2) the fastest is __builtin_memcpy()
>
>         ./testlock -m __builtin_memcpy -l 64
>
> (by a factor of 2 or more) whereas all the other methods have
> approximately the same speed.
>

never mind, pilot error. in my test program i had swapped the
arguments to __builtin_memcpy(). With the correct ones,
__builtin_memcpy()  == bcopy == memcpy on both machines,
and never faster than the pkt_copy().

In fact, on the machine with FreeBSD the unrolled loop
still beats all other methods at small packet sizes.

(e.g. (memcin my test program I had swapped the
source and destination operands for __builtin_memcpy(), and
this substantially changed the memory access pattern.

With the correct operands, __builtin_memcpy == memcpy == bcopy
on both FreeBSD and Linux.
On FreeBSD pkt_copy is still faster than the other methods for
small packets, whereas on Linux they are equivalent.

If you are curious why swapping source and dst changed things
so dramatically:

the test was supposed to read from a large chunk of
memory (over 1GB) to avoid always hitting L1 or L2.
Swapping operands causes reads to hit always the same line,
thus saving a lot of misses. The difference between the two
machine then probably is due to how the cache is used on writes.

cheers
luigi
Paolo Bonzini - Jan. 24, 2013, 8:09 a.m.
Il 23/01/2013 17:03, Luigi Rizzo ha scritto:
> On Wed, Jan 23, 2013 at 02:03:17PM +0100, Stefan Hajnoczi wrote:
>> On Wed, Jan 23, 2013 at 12:50:26PM +0100, Luigi Rizzo wrote:
>>> On Wed, Jan 23, 2013 at 12:10:55PM +0100, Stefan Hajnoczi wrote:
>>>> On Tue, Jan 22, 2013 at 08:12:15AM +0100, Luigi Rizzo wrote:
> ...
>>>>> +// a fast copy routine only for multiples of 64 bytes, non overlapped.
>>>>> +static inline void
>>>>> +pkt_copy(const void *_src, void *_dst, int l)
>>> ...
>>>>> +                *dst++ = *src++;
>>>>> +        }
>>>>> +}
>>>>
>>>> I wonder how different FreeBSD bcopy() is from glibc memcpy() and if the
>>>> optimization is even a win.  The glibc code is probably hand-written
>>>> assembly that CPU vendors have contributed for specific CPU model
>>>> families.
>>>>
>>>> Did you compare glibc memcpy() against pkt_copy()?
>>>
>>> I haven't tried in detail on glibc but will run some tests.  In any
>>> case not all systems have glibc, and on FreeBSD this pkt_copy was
>>> a significant win for small packets (saving some 20ns each; of
>>> course this counts only when you approach the 10 Mpps range, which
>>> is what you get with netmap, and of course when data is in cache).
>>>
>>> One reason pkt_copy gains something is that if it can assume there
>>> is extra space in the buffer, it can work on large chunks avoiding the extra
>>> jumps and instructions for the remaining 1-2-4 bytes.
>>
>> I'd like to drop this code or at least make it FreeBSD-specific since
>> there's no guarantee that this is a good idea on any other libc.
>>
>> I'm even doubtful that it's always a win on FreeBSD.  You have a
>> threshold to fall back to bcopy() and who knows what the "best" value
>> for various CPUs is.
> 
> indeed.
> With the attached program (which however might be affected by the
> fact that data is not used after copying) it seems that on a recent
> linux (using gcc 4.6.2) the fastest is __builtin_memcpy()
> 
> 	./testlock -m __builtin_memcpy -l 64
> 
> (by a factor of 2 or more) whereas all the other methods have
> approximately the same speed.
> 
> On FreeBSD (with clang, gcc 4.2.1, gcc 4.6.4) the pkt_copy() above
> 
> 	./testlock -m fastcopy -l 64
> 
> is largely better than other methods. I am a bit puzzled why
> the builtin method on FreeBSD is not effective, but i will check
> on some other forum...

Perhaps a different default for -march/-mtune?

Paolo
Stefan Hajnoczi - Jan. 24, 2013, 8:54 a.m.
On Wed, Jan 23, 2013 at 06:55:59PM -0800, Luigi Rizzo wrote:
> On Wed, Jan 23, 2013 at 8:03 AM, Luigi Rizzo <rizzo@iet.unipi.it> wrote:
> 
> > > I'm even doubtful that it's always a win on FreeBSD.  You have a
> > > threshold to fall back to bcopy() and who knows what the "best" value
> > > for various CPUs is.
> >
> > indeed.
> > With the attached program (which however might be affected by the
> > fact that data is not used after copying) it seems that on a recent
> > linux (using gcc 4.6.2) the fastest is __builtin_memcpy()
> >
> >         ./testlock -m __builtin_memcpy -l 64
> >
> > (by a factor of 2 or more) whereas all the other methods have
> > approximately the same speed.
> >
> 
> never mind, pilot error. in my test program i had swapped the
> arguments to __builtin_memcpy(). With the correct ones,
> __builtin_memcpy()  == bcopy == memcpy on both machines,
> and never faster than the pkt_copy().

Are the bcopy()/memcpy() calls given a length that is a multiple of 64 bytes?

IIUC pkt_copy() assumes 64-byte multiple lengths and that optimization
can matches with memcpy(dst, src, (len + 63) & ~63).  Maybe it helps and
at least ensures they are doing equal amounts of byte copying.

Stefan
Luigi Rizzo - Jan. 24, 2013, 5:35 p.m.
On Thu, Jan 24, 2013 at 09:54:19AM +0100, Stefan Hajnoczi wrote:
> On Wed, Jan 23, 2013 at 06:55:59PM -0800, Luigi Rizzo wrote:
> > On Wed, Jan 23, 2013 at 8:03 AM, Luigi Rizzo <rizzo@iet.unipi.it> wrote:
> > 
> > > > I'm even doubtful that it's always a win on FreeBSD.  You have a
> > > > threshold to fall back to bcopy() and who knows what the "best" value
> > > > for various CPUs is.
> > >
> > > indeed.
> > > With the attached program (which however might be affected by the
> > > fact that data is not used after copying) it seems that on a recent
> > > linux (using gcc 4.6.2) the fastest is __builtin_memcpy()
> > >
> > >         ./testlock -m __builtin_memcpy -l 64
> > >
> > > (by a factor of 2 or more) whereas all the other methods have
> > > approximately the same speed.
> > >
> > 
> > never mind, pilot error. in my test program i had swapped the
> > arguments to __builtin_memcpy(). With the correct ones,
> > __builtin_memcpy()  == bcopy == memcpy on both machines,
> > and never faster than the pkt_copy().
> 
> Are the bcopy()/memcpy() calls given a length that is a multiple of 64 bytes?
> 
> IIUC pkt_copy() assumes 64-byte multiple lengths and that optimization
> can matches with memcpy(dst, src, (len + 63) & ~63).  Maybe it helps and
> at least ensures they are doing equal amounts of byte copying.

the length is a parameter from the command line.
For short packets, at least on the i7-2600 and freebsd the pkt_copy()
is only slightly faster than memcpy on multiples of 64, and *a lot*
faster when the length is not a multiple.
Again i am not sure whether it depends on the compiler/glibc or
simply on the CPU, unfortunately i have no way to swap machines.

luigi
Stefan Hajnoczi - Jan. 25, 2013, 7:24 a.m.
On Thu, Jan 24, 2013 at 6:35 PM, Luigi Rizzo <rizzo@iet.unipi.it> wrote:
> On Thu, Jan 24, 2013 at 09:54:19AM +0100, Stefan Hajnoczi wrote:
>> On Wed, Jan 23, 2013 at 06:55:59PM -0800, Luigi Rizzo wrote:
>> > On Wed, Jan 23, 2013 at 8:03 AM, Luigi Rizzo <rizzo@iet.unipi.it> wrote:
>> >
>> > > > I'm even doubtful that it's always a win on FreeBSD.  You have a
>> > > > threshold to fall back to bcopy() and who knows what the "best" value
>> > > > for various CPUs is.
>> > >
>> > > indeed.
>> > > With the attached program (which however might be affected by the
>> > > fact that data is not used after copying) it seems that on a recent
>> > > linux (using gcc 4.6.2) the fastest is __builtin_memcpy()
>> > >
>> > >         ./testlock -m __builtin_memcpy -l 64
>> > >
>> > > (by a factor of 2 or more) whereas all the other methods have
>> > > approximately the same speed.
>> > >
>> >
>> > never mind, pilot error. in my test program i had swapped the
>> > arguments to __builtin_memcpy(). With the correct ones,
>> > __builtin_memcpy()  == bcopy == memcpy on both machines,
>> > and never faster than the pkt_copy().
>>
>> Are the bcopy()/memcpy() calls given a length that is a multiple of 64 bytes?
>>
>> IIUC pkt_copy() assumes 64-byte multiple lengths and that optimization
>> can matches with memcpy(dst, src, (len + 63) & ~63).  Maybe it helps and
>> at least ensures they are doing equal amounts of byte copying.
>
> the length is a parameter from the command line.
> For short packets, at least on the i7-2600 and freebsd the pkt_copy()
> is only slightly faster than memcpy on multiples of 64, and *a lot*
> faster when the length is not a multiple.

How about dropping pkt_copy() and instead rounding the memcpy() length
up to the next 64 byte multiple?

Using memcpy() is more future-proof IMO, that's why I'm pushing for this.

Stefan
Luigi Rizzo - Jan. 28, 2013, 6:46 p.m.
On Thu, Jan 24, 2013 at 11:24 PM, Stefan Hajnoczi <stefanha@gmail.com>wrote:

> On Thu, Jan 24, 2013 at 6:35 PM, Luigi Rizzo <rizzo@iet.unipi.it> wrote:
>


> >> >
> >> > never mind, pilot error. in my test program i had swapped the
> >> > arguments to __builtin_memcpy(). With the correct ones,
> >> > __builtin_memcpy()  == bcopy == memcpy on both machines,
> >> > and never faster than the pkt_copy().
> >>
> >> Are the bcopy()/memcpy() calls given a length that is a multiple of 64
> bytes?
> >>
> >> IIUC pkt_copy() assumes 64-byte multiple lengths and that optimization
> >> can matches with memcpy(dst, src, (len + 63) & ~63).  Maybe it helps and
> >> at least ensures they are doing equal amounts of byte copying.
> >
> > the length is a parameter from the command line.
> > For short packets, at least on the i7-2600 and freebsd the pkt_copy()
> > is only slightly faster than memcpy on multiples of 64, and *a lot*
> > faster when the length is not a multiple.
>
> How about dropping pkt_copy() and instead rounding the memcpy() length
> up to the next 64 byte multiple?
>

> Using memcpy() is more future-proof IMO, that's why I'm pushing for this.
>
>
fair enough, i'll make this conditional and enable memcpy() rounded to 64
bytes
multiples by default (though as i said the pkt_copy() is always at least
as fast as memcpy() on all machines i tried.

cheers
luigi

Patch

diff --git a/configure b/configure
index c6172ef..cfdf8a6 100755
--- a/configure
+++ b/configure
@@ -146,6 +146,7 @@  curl=""
 curses=""
 docs=""
 fdt=""
+netmap=""
 nptl=""
 pixman=""
 sdl=""
@@ -739,6 +740,10 @@  for opt do
   ;;
   --enable-vde) vde="yes"
   ;;
+  --disable-netmap) netmap="no"
+  ;;
+  --enable-netmap) netmap="yes"
+  ;;
   --disable-xen) xen="no"
   ;;
   --enable-xen) xen="yes"
@@ -1112,6 +1117,8 @@  echo "  --disable-uuid           disable uuid support"
 echo "  --enable-uuid            enable uuid support"
 echo "  --disable-vde            disable support for vde network"
 echo "  --enable-vde             enable support for vde network"
+echo "  --disable-netmap         disable support for netmap network"
+echo "  --enable-netmap          enable support for netmap network"
 echo "  --disable-linux-aio      disable Linux AIO support"
 echo "  --enable-linux-aio       enable Linux AIO support"
 echo "  --disable-cap-ng         disable libcap-ng support"
@@ -1914,6 +1921,26 @@  EOF
 fi
 
 ##########################################
+# netmap headers probe
+if test "$netmap" != "no" ; then
+  cat > $TMPC << EOF
+#include <inttypes.h>
+#include <net/if.h>
+#include <net/netmap.h>
+#include <net/netmap_user.h>
+int main(void) { return 0; }
+EOF
+  if compile_prog "" "" ; then
+    netmap=yes
+  else
+    if test "$netmap" = "yes" ; then
+      feature_not_found "netmap"
+    fi
+    netmap=no
+  fi
+fi
+
+##########################################
 # libcap-ng library probe
 if test "$cap_ng" != "no" ; then
   cap_libs="-lcap-ng"
@@ -3314,6 +3341,7 @@  echo "NPTL support      $nptl"
 echo "GUEST_BASE        $guest_base"
 echo "PIE               $pie"
 echo "vde support       $vde"
+echo "netmap support    $netmap"
 echo "Linux AIO support $linux_aio"
 echo "ATTR/XATTR support $attr"
 echo "Install blobs     $blobs"
@@ -3438,6 +3466,9 @@  fi
 if test "$vde" = "yes" ; then
   echo "CONFIG_VDE=y" >> $config_host_mak
 fi
+if test "$netmap" = "yes" ; then
+  echo "CONFIG_NETMAP=y" >> $config_host_mak
+fi
 if test "$cap_ng" = "yes" ; then
   echo "CONFIG_LIBCAP=y" >> $config_host_mak
 fi
diff --git a/net/Makefile.objs b/net/Makefile.objs
index a08cd14..068253f 100644
--- a/net/Makefile.objs
+++ b/net/Makefile.objs
@@ -10,3 +10,4 @@  common-obj-$(CONFIG_AIX) += tap-aix.o
 common-obj-$(CONFIG_HAIKU) += tap-haiku.o
 common-obj-$(CONFIG_SLIRP) += slirp.o
 common-obj-$(CONFIG_VDE) += vde.o
+common-obj-$(CONFIG_NETMAP) += qemu-netmap.o
diff --git a/net/clients.h b/net/clients.h
index 7793294..952d076 100644
--- a/net/clients.h
+++ b/net/clients.h
@@ -52,4 +52,8 @@  int net_init_vde(const NetClientOptions *opts, const char *name,
                  NetClientState *peer);
 #endif
 
+#ifdef CONFIG_NETMAP
+int net_init_netmap(const NetClientOptions *opts, const char *name,
+                    NetClientState *peer);
+#endif
 #endif /* QEMU_NET_CLIENTS_H */
diff --git a/net/net.c b/net/net.c
index cdd9b04..816c987 100644
--- a/net/net.c
+++ b/net/net.c
@@ -618,6 +618,9 @@  static int (* const net_client_init_fun[NET_CLIENT_OPTIONS_KIND_MAX])(
         [NET_CLIENT_OPTIONS_KIND_BRIDGE]    = net_init_bridge,
 #endif
         [NET_CLIENT_OPTIONS_KIND_HUBPORT]   = net_init_hubport,
+#ifdef CONFIG_NETMAP
+	[NET_CLIENT_OPTIONS_KIND_NETMAP]    = net_init_netmap,
+#endif
 };
 
 
diff --git a/net/qemu-netmap.c b/net/qemu-netmap.c
new file mode 100644
index 0000000..79d7c09
--- /dev/null
+++ b/net/qemu-netmap.c
@@ -0,0 +1,353 @@ 
+/*
+ * netmap access for qemu
+ *
+ * Copyright (c) 2012-2013 Luigi Rizzo
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "config-host.h"
+
+/* note paths are different for -head and 1.3 */
+#include "net/net.h"
+#include "clients.h"
+#include "sysemu/sysemu.h"
+#include "qemu-common.h"
+#include "qemu/error-report.h"
+
+#include <sys/ioctl.h>
+#include <net/if.h>
+#include <sys/mman.h>
+#include <net/netmap.h>
+#include <net/netmap_user.h>
+
+#define ND(fd, ... )	// debugging
+#define D(format, ...)                                          \
+        do {                                                    \
+                struct timeval __xxts;                          \
+                gettimeofday(&__xxts, NULL);                    \
+                printf("%03d.%06d %s [%d] " format "\n",        \
+                (int)__xxts.tv_sec % 1000, (int)__xxts.tv_usec, \
+                __FUNCTION__, __LINE__, ##__VA_ARGS__);         \
+        } while (0)
+
+/* rate limited, lps indicates how many per second */
+#define RD(lps, format, ...)                                    \
+        do {                                                    \
+                static int t0, __cnt;                           \
+                struct timeval __xxts;                          \
+                gettimeofday(&__xxts, NULL);                    \
+                if (t0 != __xxts.tv_sec) {                      \
+                        t0 = __xxts.tv_sec;                     \
+                        __cnt = 0;                              \
+                }                                               \
+                if (__cnt++ < lps)                              \
+                        D(format, ##__VA_ARGS__);               \
+        } while (0)
+
+
+
+/*
+ * private netmap device info
+ */
+struct netmap_state {
+    int			fd;
+    int			memsize;
+    void		*mem;
+    struct netmap_if	*nifp;
+    struct netmap_ring	*rx;
+    struct netmap_ring	*tx;
+    char		fdname[128];	/* normally /dev/netmap */
+    char		ifname[128];	/* maybe the nmreq here ? */
+};
+
+struct nm_state {
+    NetClientState	nc;
+    struct netmap_state	me;
+    unsigned int	read_poll;
+    unsigned int	write_poll;
+};
+
+// a fast copy routine only for multiples of 64 bytes, non overlapped.
+static inline void
+pkt_copy(const void *_src, void *_dst, int l)
+{
+        const uint64_t *src = _src;
+        uint64_t *dst = _dst;
+#define likely(x)       __builtin_expect(!!(x), 1)
+#define unlikely(x)       __builtin_expect(!!(x), 0)
+        if (unlikely(l >= 1024)) {
+                bcopy(src, dst, l);
+                return;
+        }
+        for (; l > 0; l -= 64) {
+                *dst++ = *src++;
+                *dst++ = *src++;
+                *dst++ = *src++;
+                *dst++ = *src++;
+                *dst++ = *src++;
+                *dst++ = *src++;
+                *dst++ = *src++;
+                *dst++ = *src++;
+        }
+}
+
+
+/*
+ * open a netmap device. We assume there is only one queue
+ * (which is the case for the VALE bridge).
+ */
+static int netmap_open(struct netmap_state *me)
+{
+    int fd, l, err;
+    struct nmreq req;
+
+    me->fd = fd = open(me->fdname, O_RDWR);
+    if (fd < 0) {
+	error_report("Unable to open netmap device '%s'", me->fdname);
+	return -1;
+    }
+    bzero(&req, sizeof(req));
+    pstrcpy(req.nr_name, sizeof(req.nr_name), me->ifname);
+    req.nr_ringid = 0;
+    req.nr_version = NETMAP_API;
+    err = ioctl(fd, NIOCGINFO, &req);
+    if (err) {
+	error_report("cannot get info on %s", me->ifname);
+	goto error;
+    }
+    l = me->memsize = req.nr_memsize;
+    err = ioctl(fd, NIOCREGIF, &req);
+    if (err) {
+	error_report("Unable to register %s", me->ifname);
+	goto error;
+    }
+
+    me->mem = mmap(0, l, PROT_WRITE | PROT_READ, MAP_SHARED, fd, 0);
+    if (me->mem == MAP_FAILED) {
+	error_report("Unable to mmap");
+	me->mem = NULL;
+	goto error;
+    }
+
+    me->nifp = NETMAP_IF(me->mem, req.nr_offset);
+    me->tx = NETMAP_TXRING(me->nifp, 0);
+    me->rx = NETMAP_RXRING(me->nifp, 0);
+    return 0;
+
+error:
+    close(me->fd);
+    return -1;
+}
+
+// XXX do we need the can-send routine ?
+static int netmap_can_send(void *opaque)
+{
+    struct nm_state *s = opaque;
+
+    return qemu_can_send_packet(&s->nc);
+}
+
+static void netmap_send(void *opaque);
+static void netmap_writable(void *opaque);
+
+/*
+ * set the handlers for the device
+ */
+static void netmap_update_fd_handler(struct nm_state *s)
+{
+#if 1
+    qemu_set_fd_handler2(s->me.fd,
+                         s->read_poll  ? netmap_can_send : NULL,
+                         s->read_poll  ? netmap_send     : NULL,
+                         s->write_poll ? netmap_writable : NULL,
+                         s);
+#else
+    qemu_set_fd_handler(s->me.fd,
+                         s->read_poll  ? netmap_send     : NULL,
+                         s->write_poll ? netmap_writable : NULL,
+                         s);
+#endif
+}
+
+// update the read handler
+static void netmap_read_poll(struct nm_state *s, int enable)
+{
+    if (s->read_poll != enable) { /* do nothing if not changed */
+	s->read_poll = enable;
+	netmap_update_fd_handler(s);
+    }
+}
+
+// update the write handler
+static void netmap_write_poll(struct nm_state *s, int enable)
+{
+    if (s->write_poll != enable) {
+	s->write_poll = enable;
+	netmap_update_fd_handler(s);
+    }
+}
+
+/*
+ * the fd_write() callback, invoked if the fd is marked as
+ * writable after a poll. Reset the handler and flush any
+ * buffered packets.
+ */
+static void netmap_writable(void *opaque)
+{
+    struct nm_state *s = opaque;
+
+    netmap_write_poll(s, 0);
+    qemu_flush_queued_packets(&s->nc);
+}
+
+/*
+ * new data guest --> backend
+ */
+static ssize_t netmap_receive_raw(NetClientState *nc, const uint8_t *buf, size_t size)
+{
+    struct nm_state *s = DO_UPCAST(struct nm_state, nc, nc);
+    struct netmap_ring *ring = s->me.tx;
+
+    if (ring) {
+	/* request an early notification to avoid running dry */
+	if (ring->avail < ring->num_slots / 2 && s->write_poll == 0) {
+	    netmap_write_poll(s, 1);
+	}
+	if (ring->avail == 0) { // cannot write
+	    return 0;
+	}
+	uint32_t i = ring->cur;
+	uint32_t idx = ring->slot[i].buf_idx;
+	uint8_t *dst = (u_char *)NETMAP_BUF(ring, idx);
+
+	ring->slot[i].len = size;
+	pkt_copy(buf, dst, size);
+	ring->cur = NETMAP_RING_NEXT(ring, i);
+	ring->avail--;
+    }
+    return size;
+}
+
+// complete a previous send (backend --> guest), enable the fd_read callback
+static void netmap_send_completed(NetClientState *nc, ssize_t len)
+{
+    struct nm_state *s = DO_UPCAST(struct nm_state, nc, nc);
+
+    netmap_read_poll(s, 1);
+}
+
+/*
+ * netmap_send: backend -> guest
+ * there is traffic available from the network, try to send it up.
+ */
+static void netmap_send(void *opaque)
+{
+    struct nm_state *s = opaque;
+    int sent = 0;
+    struct netmap_ring *ring = s->me.rx;
+
+    /* only check ring->avail, let the packet be queued
+     * with qemu_send_packet_async() if needed
+     * XXX until we fix the propagation on the bridge we need to stop early
+     */
+    while (ring->avail > 0 && qemu_can_send_packet(&s->nc) ) {
+	uint32_t i = ring->cur;
+	uint32_t idx = ring->slot[i].buf_idx;
+	uint8_t *src = (u_char *)NETMAP_BUF(ring, idx);
+	int size = ring->slot[i].len;
+
+	ring->cur = NETMAP_RING_NEXT(ring, i);
+	ring->avail--;
+	sent++;
+        size = qemu_send_packet_async(&s->nc, src, size, netmap_send_completed);
+	if (size == 0) {
+	    /* the guest does not receive anymore. Packet is queued, stop
+	     * reading from the backend until netmap_send_completed()
+	     */
+	    netmap_read_poll(s, 0);
+	    return;
+	}
+    }
+    netmap_read_poll(s, 1); // probably useless.
+}
+
+
+// flush and close
+static void netmap_cleanup(NetClientState *nc)
+{
+    struct nm_state *s = DO_UPCAST(struct nm_state, nc, nc);
+
+    qemu_purge_queued_packets(nc);
+
+    netmap_read_poll(s, 0);
+    netmap_write_poll(s, 0);
+    close(s->me.fd);
+
+    s->me.fd = -1;
+}
+
+static void netmap_poll(NetClientState *nc, bool enable)
+{
+    struct nm_state *s = DO_UPCAST(struct nm_state, nc, nc);
+
+    netmap_read_poll(s, enable);
+    netmap_write_poll(s, enable);
+}
+
+
+/* fd support */
+
+static NetClientInfo net_netmap_info = {
+    .type = NET_CLIENT_OPTIONS_KIND_NETMAP,
+    .size = sizeof(struct nm_state),
+    .receive = netmap_receive_raw,
+//    .receive_raw = netmap_receive_raw,
+//    .receive_iov = netmap_receive_iov,
+    .poll = netmap_poll,
+    .cleanup = netmap_cleanup,
+};
+
+/* the external calls */
+
+/*
+ * ... -net netmap,ifname="..."
+ */
+int net_init_netmap(const NetClientOptions *opts, const char *name, NetClientState *peer)
+{
+    const NetdevNetmapOptions *netmap_opts = opts->netmap;
+    NetClientState *nc;
+    struct netmap_state me;
+    struct nm_state *s;
+
+    pstrcpy(me.fdname, sizeof(me.fdname), name ? name : "/dev/netmap");
+    /* set default name for the port if not supplied */
+    pstrcpy(me.ifname, sizeof(me.ifname),
+	netmap_opts->has_ifname ? netmap_opts->ifname : "vale0");
+    if (netmap_open(&me))
+	return -1;
+
+    /* create the object -- XXX use name or ifname ? */
+    nc = qemu_new_net_client(&net_netmap_info, peer, "netmap", name);
+    s = DO_UPCAST(struct nm_state, nc, nc);
+    s->me = me;
+    netmap_read_poll(s, 1); // initially only poll for reads.
+
+    return 0;
+}
diff --git a/qapi-schema.json b/qapi-schema.json
index 6d7252b..f24b745 100644
--- a/qapi-schema.json
+++ b/qapi-schema.json
@@ -2572,6 +2572,11 @@ 
   'data': {
     'hubid':     'int32' } }
 
+{ 'type': 'NetdevNetmapOptions',
+  'data': {
+    '*ifname':     'str' } }
+
+
 ##
 # @NetClientOptions
 #
@@ -2589,7 +2594,8 @@ 
     'vde':      'NetdevVdeOptions',
     'dump':     'NetdevDumpOptions',
     'bridge':   'NetdevBridgeOptions',
-    'hubport':  'NetdevHubPortOptions' } }
+    'hubport':  'NetdevHubPortOptions',
+    'netmap':   'NetdevNetmapOptions' } }
 
 ##
 # @NetLegacy