diff mbox

Re: [PATCH qemu-kvm] Add raw(af_packet) network backend to qemu

Message ID 1264798376.15980.362.camel@w-sridhar.beaverton.ibm.com
State New
Headers show

Commit Message

Sridhar Samudrala Jan. 29, 2010, 8:52 p.m. UTC
On Wed, 2010-01-27 at 14:56 -0800, Sridhar Samudrala wrote:
> On Wed, 2010-01-27 at 22:39 +0100, Arnd Bergmann wrote:
> > On Wednesday 27 January 2010, Anthony Liguori wrote:
> > > >> I think -net socket,fd should just be (trivially) extended to work with raw
> > > >> sockets out of the box, with no support for opening it. Then you can have
> > > >> libvirt or some wrapper open a raw socket and a private namespace and just pass it
> > > >> down.
> > > >>      
> > > > That'd work. Anthony?
> > > 
> > > The fundamental problem that I have with all of this is that we should 
> > > not be introducing new network backends that are based around something 
> > > only a developer is going to understand.  If I'm a user and I want to 
> > > use an external switch in VEPA mode, how in the world am I going to know 
> > > that I'm supposed to use the -net raw backend or the -net socket 
> > > backend?  It might as well be the -net butterflies backend as far as a 
> > > user is concerned.
> > 
> > My point is that we already have -net socket,fd and any user that passes
> > an fd into that already knows what he wants to do with it. Making it
> > work with raw sockets is just a natural extension to this, which works
> > on all kernels and (with separate namespaces) is reasonably secure.
> 
> Didn't realize that -net socket is already there and supports TCP and
> UDP sockets. I will look into extending -net socket to support AF_PACKET
> SOCK_RAW type sockets.

OK. Here is a patch that adds AF_PACKET/SOCK_RAW support to the -netdev socket
backend. It allows specifying an already-opened raw fd or an ifname to which a
raw socket can be bound.

   -netdev socket,fd=X,id=str
   -netdev socket,ifname=<ethX/macvlanX>,id=str

However, I found that struct NetSocketState doesn't include all the state info that
is required to support AF_PACKET raw sockets. So I had to add NetSocketRawState
and also couldn't reuse much of the existing code.

I think the -net socket backend is more geared towards AF_INET sockets. Adding support
for a new family of sockets doesn't fit nicely with the existing code.
But if this approach is more acceptable than a new -net raw,fd backend, I am fine
with it.

Thanks
Sridhar
diff mbox

Patch

diff --git a/hw/virtio-net.c b/hw/virtio-net.c
index eba578a..7d62dd9 100644
--- a/hw/virtio-net.c
+++ b/hw/virtio-net.c
@@ -15,6 +15,7 @@ 
 #include "net.h"
 #include "net/checksum.h"
 #include "net/tap.h"
+#include "net/socket.h"
 #include "qemu-timer.h"
 #include "virtio-net.h"
 
@@ -133,6 +134,9 @@  static int peer_has_vnet_hdr(VirtIONet *n)
     case NET_CLIENT_TYPE_TAP:
         n->has_vnet_hdr = tap_has_vnet_hdr(n->nic->nc.peer);
         break;
+    case NET_CLIENT_TYPE_SOCKET_RAW:
+        n->has_vnet_hdr = sock_raw_has_vnet_hdr(n->nic->nc.peer);
+        break;
     default:
         return 0;            
     }
@@ -149,6 +153,9 @@  static int peer_has_ufo(VirtIONet *n)
     case NET_CLIENT_TYPE_TAP:
         n->has_ufo = tap_has_ufo(n->nic->nc.peer);
         break;
+    case NET_CLIENT_TYPE_SOCKET_RAW:
+        n->has_ufo = sock_raw_has_ufo(n->nic->nc.peer);
+        break;
     default:
         return 0;            
     }
@@ -165,6 +172,9 @@  static void peer_using_vnet_hdr(VirtIONet *n, int using_vnet_hdr)
     case NET_CLIENT_TYPE_TAP:
         tap_using_vnet_hdr(n->nic->nc.peer, using_vnet_hdr);
         break;
+    case NET_CLIENT_TYPE_SOCKET_RAW:
+        sock_raw_using_vnet_hdr(n->nic->nc.peer, using_vnet_hdr);
+        break;
     default:
         break; 
     }
@@ -180,6 +190,9 @@  static void peer_set_offload(VirtIONet *n, int csum, int tso4, int tso6,
     case NET_CLIENT_TYPE_TAP:
         tap_set_offload(n->nic->nc.peer, csum, tso4, tso6, ecn, ufo);
         break;
+    case NET_CLIENT_TYPE_SOCKET_RAW:
+        sock_raw_set_offload(n->nic->nc.peer, csum, tso4, tso6, ecn, ufo);
+        break;
     default:
         break; 
     }
diff --git a/net.c b/net.c
index 6ef93e6..3d25d64 100644
--- a/net.c
+++ b/net.c
@@ -1002,6 +1002,11 @@  static struct {
                 .type = QEMU_OPT_STRING,
                 .help = "UDP multicast address and port number",
             },
+            {
+                .name = "ifname",
+                .type = QEMU_OPT_STRING,
+                .help = "interface name",
+            },
             { /* end of list */ }
         },
 #ifdef CONFIG_VDE
diff --git a/net.h b/net.h
index 116bb80..74b3e69 100644
--- a/net.h
+++ b/net.h
@@ -34,7 +34,8 @@  typedef enum {
     NET_CLIENT_TYPE_TAP,
     NET_CLIENT_TYPE_SOCKET,
     NET_CLIENT_TYPE_VDE,
-    NET_CLIENT_TYPE_DUMP
+    NET_CLIENT_TYPE_DUMP,
+    NET_CLIENT_TYPE_SOCKET_RAW,
 } net_client_type;
 
 typedef void (NetPoll)(VLANClientState *, bool enable);
diff --git a/net/socket.c b/net/socket.c
index 5533737..56f5bad 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -32,6 +32,327 @@ 
 #include "qemu_socket.h"
 #include "sysemu.h"
 
+#include <netpacket/packet.h>
+#include <net/ethernet.h>
+#include <net/if.h>
+#include <sys/ioctl.h>
+
+/* Maximum GSO packet size (64k) plus plenty of room for
+ * the ethernet and virtio_net headers
+ */
+#define RAW_BUFSIZE (4096 + 65536)
+
+typedef struct NetSocketRawState {
+    VLANClientState nc;              /* embedded client; DO_UPCAST() maps it back to this state */
+    int fd;                          /* raw socket descriptor */
+    uint8_t buf[RAW_BUFSIZE];        /* receive buffer filled by sock_raw_send() */
+    int promisc;                     /* not referenced by the code added in this patch */
+    unsigned int read_poll:1;        /* read handlers currently registered on fd */
+    unsigned int write_poll:1;       /* write handler currently registered on fd */
+    unsigned int has_vnet_hdr:1;     /* result of sock_raw_probe_vnet_hdr() at init */
+    unsigned int using_vnet_hdr:1;   /* set through sock_raw_using_vnet_hdr() */
+    unsigned int has_ufo:1;          /* value reported by sock_raw_has_ufo() */
+} NetSocketRawState;
+
+struct virtio_net_hdr
+{   /* a zeroed instance is written ahead of frames; sizeof() bytes are skipped on receive */
+    uint8_t flags;
+    uint8_t gso_type;
+    uint16_t hdr_len;
+    uint16_t gso_size;
+    uint16_t csum_start;
+    uint16_t csum_offset;
+};
+
+static int sock_raw_can_send(void *opaque);
+static void sock_raw_send(void *opaque);
+static void sock_raw_writable(void *opaque);
+
+static void sock_raw_update_fd_handler(NetSocketRawState *s)
+{
+    /* A direction's handlers are passed only while its poll flag is set */
+    int rd = s->read_poll, wr = s->write_poll;
+    qemu_set_fd_handler2(s->fd, rd ? sock_raw_can_send : NULL,
+                         rd ? sock_raw_send : NULL,
+                         wr ? sock_raw_writable : NULL, s);
+}
+
+static void sock_raw_read_poll(NetSocketRawState *s, int enable)
+{
+    s->read_poll = !!enable;          /* store as 0/1 in the one-bit flag */
+    sock_raw_update_fd_handler(s);    /* re-register handlers to match the flags */
+}
+
+static void sock_raw_write_poll(NetSocketRawState *s, int enable)
+{
+    s->write_poll = !!enable;         /* store as 0/1 in the one-bit flag */
+    sock_raw_update_fd_handler(s);    /* re-register handlers to match the flags */
+}
+
+static void sock_raw_writable(void *opaque)
+{
+    NetSocketRawState *s = opaque;
+    /* Write handler: stop write polling, then flush the queued packets */
+    sock_raw_write_poll(s, 0);
+    qemu_flush_queued_packets(&s->nc);
+}
+
+static ssize_t sock_raw_write_packet(NetSocketRawState *s,
+                                     const struct iovec *iov,
+                                     int iovcnt)
+{
+    ssize_t len;
+
+    do {    /* retry when a signal interrupts the write */
+        len = writev(s->fd, iov, iovcnt);
+    } while (len == -1 && errno == EINTR);
+    /* Socket full: report 0 and let sock_raw_writable() flush later */
+    if (len == -1 && errno == EAGAIN) {
+        sock_raw_write_poll(s, 1);
+        return 0;
+    }
+
+    if (len == -1) {    /* other errors go to stderr like the file's other diagnostics */
+        fprintf(stderr, "raw_write_packet: errno:%d\n", errno);
+    }
+    return len;
+}
+
+static ssize_t sock_raw_receive_iov(VLANClientState *nc,
+                                    const struct iovec *iov,
+                                    int iovcnt)
+{
+    NetSocketRawState *s = DO_UPCAST(NetSocketRawState, nc, nc);
+    struct virtio_net_hdr hdr = { 0, };
+    struct iovec iov_copy[iovcnt + 1];
+
+    /* No header on the socket, or the peer supplies it: pass through */
+    if (!s->has_vnet_hdr || s->using_vnet_hdr) {
+        return sock_raw_write_packet(s, iov, iovcnt);
+    }
+
+    /* Otherwise send a zeroed vnet header in front of the caller's vector */
+    iov_copy[0].iov_base = &hdr;
+    iov_copy[0].iov_len  = sizeof(hdr);
+    memcpy(&iov_copy[1], iov, iovcnt * sizeof(*iov));
+    return sock_raw_write_packet(s, iov_copy, iovcnt + 1);
+}
+
+static ssize_t sock_raw_receive_raw(VLANClientState *nc, const uint8_t *buf,
+                                    size_t size)
+{
+    NetSocketRawState *s = DO_UPCAST(NetSocketRawState, nc, nc);
+    struct virtio_net_hdr zero_hdr = { 0, };
+
+    /* Slot 0 holds an all-zero vnet header, slot 1 the frame itself */
+    struct iovec vec[2] = {
+        { .iov_base = &zero_hdr,   .iov_len = sizeof(zero_hdr) },
+        { .iov_base = (char *)buf, .iov_len = size },
+    };
+
+    /* A socket without vnet header support gets the frame alone */
+    if (!s->has_vnet_hdr) {
+        return sock_raw_write_packet(s, &vec[1], 1);
+    }
+
+    /* Otherwise the zeroed header goes out in front of the frame */
+    return sock_raw_write_packet(s, vec, 2);
+}
+
+static ssize_t sock_raw_receive(VLANClientState *nc, const uint8_t *buf,
+                                size_t size)
+{
+    NetSocketRawState *s = DO_UPCAST(NetSocketRawState, nc, nc);
+    struct iovec vec = { .iov_base = (char *)buf, .iov_len = size };
+
+    /* Socket carries a vnet header the peer does not supply: let
+     * sock_raw_receive_raw() prepend a zeroed one. */
+    if (s->has_vnet_hdr && !s->using_vnet_hdr) {
+        return sock_raw_receive_raw(nc, buf, size);
+    }
+
+    return sock_raw_write_packet(s, &vec, 1);
+}
+
+static int sock_raw_can_send(void *opaque)
+{
+    NetSocketRawState *s = opaque;
+    /* Registered together with the read handler; forwards qemu_can_send_packet() */
+    return qemu_can_send_packet(&s->nc);
+}
+
+ssize_t sock_raw_read_packet(int fd, uint8_t *buf, int maxlen, int flags)
+{
+    /* Thin wrapper around recv(); the result still passes through an int */
+    int nread = recv(fd, buf, maxlen, flags);
+
+    return nread;
+}
+
+static void sock_raw_send_completed(VLANClientState *nc, ssize_t len)
+{
+    NetSocketRawState *s = DO_UPCAST(NetSocketRawState, nc, nc);
+    /* Completion callback given to qemu_send_packet_async(): resume reading */
+    sock_raw_read_poll(s, 1);
+}
+
+static void sock_raw_send(void *opaque)
+{
+    NetSocketRawState *s = opaque;
+    int size;
+    do {    /* read handler: forward frames from s->fd while the peer accepts */
+        uint8_t *buf = s->buf;
+        size = sock_raw_read_packet(s->fd, s->buf, sizeof(s->buf), MSG_TRUNC);
+        if (size <= 0)
+            break;
+        /* MSG_TRUNC reports the full frame length: drop what did not fit */
+        if (size > (int)sizeof(s->buf))
+            continue;
+        if (s->has_vnet_hdr && !s->using_vnet_hdr) {
+            if (size <= (int)sizeof(struct virtio_net_hdr))
+                continue;    /* too short to hold header plus payload */
+            buf  += sizeof(struct virtio_net_hdr);
+            size -= sizeof(struct virtio_net_hdr);
+        }
+        size = qemu_send_packet_async(&s->nc, buf, size,
+                                      sock_raw_send_completed);
+        if (size == 0)
+            sock_raw_read_poll(s, 0);
+    } while (size > 0 && qemu_can_send_packet(&s->nc));
+}
+
+int sock_raw_has_ufo(VLANClientState *nc)
+{
+    NetSocketRawState *s = DO_UPCAST(NetSocketRawState, nc, nc);
+    /* Only valid for raw-socket clients; returns the stored has_ufo flag */
+    assert(nc->info->type == NET_CLIENT_TYPE_SOCKET_RAW);
+
+    return s->has_ufo;
+}
+
+int sock_raw_has_vnet_hdr(VLANClientState *nc)
+{
+    NetSocketRawState *s = DO_UPCAST(NetSocketRawState, nc, nc);
+    /* Only valid for raw-socket clients; returns the stored has_vnet_hdr flag */
+    assert(nc->info->type == NET_CLIENT_TYPE_SOCKET_RAW);
+
+    return s->has_vnet_hdr;
+}
+
+void sock_raw_using_vnet_hdr(VLANClientState *nc, int using_vnet_hdr)
+{
+    NetSocketRawState *s = DO_UPCAST(NetSocketRawState, nc, nc);
+    /* Normalise the argument to 0/1 before comparing and storing it */
+    using_vnet_hdr = using_vnet_hdr != 0;
+    /* The requested mode must equal the has_vnet_hdr flag set at init */
+    assert(nc->info->type == NET_CLIENT_TYPE_SOCKET_RAW);
+    assert(s->has_vnet_hdr == using_vnet_hdr);
+
+    s->using_vnet_hdr = using_vnet_hdr;
+}
+
+void sock_raw_set_offload(VLANClientState *nc, int csum, int tso4,
+                     int tso6, int ecn, int ufo)
+{
+    /* No-op: the requested offload flags are ignored */
+}
+
+static void sock_raw_cleanup(VLANClientState *nc)
+{
+    NetSocketRawState *s = DO_UPCAST(NetSocketRawState, nc, nc);
+    /* Cleanup hook from net_raw_info: purge queued packets first */
+    qemu_purge_queued_packets(nc);
+    /* Unregister both handlers before the descriptor is closed */
+    sock_raw_read_poll(s, 0);
+    sock_raw_write_poll(s, 0);
+    close(s->fd);
+}
+
+int sock_raw_probe_vnet_hdr(int fd)
+{
+    int val = 0;
+    socklen_t len = sizeof(val);
+
+    /* Report a vnet header only when PACKET_VNET_HDR is enabled on fd */
+    if (getsockopt(fd, SOL_PACKET, PACKET_VNET_HDR, &val, &len) != 0) {
+        return 0;    /* option unsupported, or fd is not a packet socket */
+    }
+    return val != 0;
+}
+
+static NetClientInfo net_raw_info = {
+    .type = NET_CLIENT_TYPE_SOCKET_RAW,
+    .size = sizeof(NetSocketRawState),    /* full state size, not just the client */
+    .receive = sock_raw_receive,
+    .receive_raw = NULL,                  /* sock_raw_receive_raw() is only called directly */
+    .receive_iov = sock_raw_receive_iov,
+    .cleanup = sock_raw_cleanup,
+};
+
+
+static NetSocketRawState *net_socket_fd_init_raw(VLANState *vlan,
+                                                 const char *model,
+                                                 const char *name, int fd)
+{
+    VLANClientState *nc;
+    NetSocketRawState *s;
+    /* Create the client from net_raw_info, whose .size is the full state */
+    nc = qemu_new_net_client(&net_raw_info, vlan, NULL, model, name);
+
+    s = DO_UPCAST(NetSocketRawState, nc, nc);
+    /* Header support is probed here; using_vnet_hdr starts cleared */
+    s->fd = fd;
+    s->has_vnet_hdr = sock_raw_probe_vnet_hdr(fd);
+    s->using_vnet_hdr = 0;
+    s->has_ufo = 1;            /* set unconditionally */
+    sock_raw_read_poll(s, 1);  /* register the read handlers on fd */
+
+    return s;
+}
+
+static int net_socket_raw_ifname_init(VLANState *vlan, const char *model,
+				      const char *name, const char *ifname)
+{
+    struct ifreq req;
+    struct sockaddr_ll lladdr;
+    int fd, val = 1;
+
+    /* Open an AF_PACKET raw socket, bind it to ifname and register it. */
+    fd = qemu_socket(PF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
+    if (fd < 0) {
+        fprintf(stderr, "packet socket failed\n");
+        return -1;
+    }
+    memset(&req, 0, sizeof(req));
+    strncpy(req.ifr_name, ifname, IFNAMSIZ-1);
+    if (ioctl(fd, SIOCGIFINDEX, &req) < 0) {
+        fprintf(stderr, "SIOCGIFINDEX failed\n");
+        goto fail;
+    }
+    memset(&lladdr, 0, sizeof(lladdr));
+    lladdr.sll_family   = AF_PACKET;
+    lladdr.sll_protocol = htons(ETH_P_ALL);
+    lladdr.sll_ifindex  = req.ifr_ifindex;
+    if (bind(fd, (const struct sockaddr *)&lladdr, sizeof(lladdr)) < 0) {
+        fprintf(stderr, "bind failed\n");
+        goto fail;
+    }
+
+    /* Not fatal: sock_raw_probe_vnet_hdr() decides later what the fd has */
+    if (setsockopt(fd, SOL_PACKET, PACKET_VNET_HDR, &val, sizeof(val)) < 0)
+        fprintf(stderr, "setsockopt(SOL_PACKET, PACKET_VNET_HDR) failed\n");
+
+    if (fcntl(fd, F_SETFL, fcntl(fd, F_GETFL) | O_NONBLOCK) < 0) {
+        fprintf(stderr, "fcntl(O_NONBLOCK) set failed\n");
+        goto fail;
+    }
+    net_socket_fd_init_raw(vlan, model, name, fd);
+    return 0;
+fail:
+    close(fd);    /* every error path releases the socket and reports -1 */
+    return -1;
+}
+
 typedef struct NetSocketState {
     VLANClientState nc;
     int fd;
@@ -337,6 +658,8 @@  static NetSocketState *net_socket_fd_init(VLANState *vlan,
         return net_socket_fd_init_dgram(vlan, model, name, fd, is_connected);
     case SOCK_STREAM:
         return net_socket_fd_init_stream(vlan, model, name, fd, is_connected);
+    case SOCK_RAW:
+        return (struct NetSocketState *)net_socket_fd_init_raw(vlan, model, name, fd);
     default:
         /* who knows ... this could be a eg. a pty, do warn and continue as stream */
         fprintf(stderr, "qemu: warning: socket type=%d for fd=%d is not SOCK_DGRAM or SOCK_STREAM\n", so_type, fd);
@@ -519,6 +842,22 @@  int net_init_socket(QemuOpts *opts,
             close(fd);
             return -1;
         }
+    } else if (qemu_opt_get(opts, "ifname")) {
+        const char *ifname;
+
+        if (qemu_opt_get(opts, "fd") ||
+            qemu_opt_get(opts, "connect") ||
+            qemu_opt_get(opts, "listen") ||
+            qemu_opt_get(opts, "mcast")) {
+            qemu_error("fd=, connect= and mcast= and listen= is invalid with ifname=\n");
+            return -1;
+        }
+
+        ifname = qemu_opt_get(opts, "ifname");
+
+        if (net_socket_raw_ifname_init(vlan, "socket", name, ifname) == -1) {
+            return -1;
+        }
     } else if (qemu_opt_get(opts, "listen")) {
         const char *listen;
 
diff --git a/net/socket.h b/net/socket.h
index ea46f02..cc09866 100644
--- a/net/socket.h
+++ b/net/socket.h
@@ -30,4 +30,13 @@ 
 int net_init_socket(QemuOpts *opts, Monitor *mon,
                     const char *name, VLANState *vlan);
 
+#define PACKET_VNET_HDR	15
+
+ssize_t sock_raw_read_packet(int fd, uint8_t *buf, int maxlen, int flags);
+int sock_raw_has_ufo(VLANClientState *vc);
+int sock_raw_has_vnet_hdr(VLANClientState *vc);
+void sock_raw_using_vnet_hdr(VLANClientState *vc, int using_vnet_hdr);
+int sock_raw_probe_vnet_hdr(int fd);
+void sock_raw_set_offload(VLANClientState *vc, int csum, int tso4, int tso6, int ecn, int ufo);
+
 #endif /* QEMU_NET_SOCKET_H */