Patchwork [4/4] Add support for -net bridge

login
register
mail settings
Submitter Anthony Liguori
Date Nov. 4, 2009, 12:28 a.m.
Message ID <1257294485-27015-5-git-send-email-aliguori@us.ibm.com>
Download mbox | patch
Permalink /patch/37542/
State New
Headers show

Comments

Anthony Liguori - Nov. 4, 2009, 12:28 a.m.
The most common use of -net tap is to connect a tap device to a bridge.  This
requires the use of a script and running qemu as root in order to allocate a
tap device to pass to the script.

This model is great for portability and flexibility but it's incredibly
Krumme, Chris - Nov. 4, 2009, 1:49 p.m.
Hello Anthony,

Now that I have read the whole series I say again great patch. 

> -----Original Message-----
> From: 
> qemu-devel-bounces+chris.krumme=windriver.com@nongnu.org 
> [mailto:qemu-devel-bounces+chris.krumme=windriver.com@nongnu.o
> rg] On Behalf Of Anthony Liguori
> Sent: Tuesday, November 03, 2009 6:28 PM
> To: qemu-devel@nongnu.org
> Cc: Mark McLoughlin; Anthony Liguori; Arnd Bergmann; Dustin 
> Kirkland; Michael Tsirkin; Juan Quintela
> Subject: [Qemu-devel] [PATCH 4/4] Add support for -net bridge
> 
> The most common use of -net tap is to connect a tap device to 
> a bridge.  This
> requires the use of a script and running qemu as root in 
> order to allocate a
> tap device to pass to the script.
> 
> This model is great for portability and flexibility but it's 
> incredibly
> difficult to eliminate the need to run qemu as root.  The 
> only really viable
> mechanism is to use tunctl to create a tap device, attach it 
> to a bridge as
> root, and then hand that tap device to qemu.  The problem 
> with this mechanism
> is that it requires administrator intervention whenever a 
> user wants to create
> a guest.
> 
> By essentially writing a helper that implements the most 
> common qemu-ifup
> script that can be safely given cap_net_admin, we can 
> dramatically simplify
> things for non-privileged users.  We still support -net tap 
> as a mechanism
> for advanced users and backwards compatibility.
> 
> Currently, this is very Linux centric but there's really no 
> reason why it
> couldn't be extended for other Unixes.
> 
> A typical invocation of -net bridge would be:
> 
>   qemu -net bridge -net nic,model=virtio
> 
> The default bridge that we attach to is qemubr0.  The 
> thinking is that a distro
> could preconfigure such an interface to allow out-of-the-box 
> bridged networking.
> 
> Alternatively, if a user wants to use a different bridge, 
> they can say:
> 
>   qemu -net bridge,br=br0 -net nic,model=virtio
> 
> Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
> ---
>  configure       |    1 +
>  net.c           |   20 +++++++-
>  net/tap.c       |  142 
> +++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  net/tap.h       |    2 +
>  qemu-options.hx |    3 +
>  5 files changed, 167 insertions(+), 1 deletions(-)
> 
> diff --git a/configure b/configure
> index 7c9d3a2..55a1a4f 100755
> --- a/configure
> +++ b/configure
> @@ -1896,6 +1896,7 @@ echo >> $config_host_mak
>  
>  echo "CONFIG_QEMU_SHAREDIR=\"$prefix$datasuffix\"" >> 
> $config_host_mak
>  echo "CONFIG_QEMU_CONFDIR=\"/etc/qemu\"" >> $config_host_mak
> +echo "CONFIG_QEMU_HELPERDIR=\"$prefix/libexec\"" >> $config_host_mak
>  
>  case "$cpu" in
>    
> i386|x86_64|alpha|cris|hppa|ia64|m68k|microblaze|mips|mips64|p
> pc|ppc64|s390|sparc|sparc64)
> diff --git a/net.c b/net.c
> index 37662c6..54a7a5b 100644
> --- a/net.c
> +++ b/net.c
> @@ -2541,6 +2541,22 @@ static struct {
>              },
>              { /* end of list */ }
>          },
> +    }, {
> +        .type = "bridge",
> +        .init = net_init_bridge,
> +        .desc = {
> +            NET_COMMON_PARAMS_DESC,
> +            {
> +                .name = "br",
> +                .type = QEMU_OPT_STRING,
> +                .help = "bridge name",
> +            }, {
> +                .name = "helper",
> +                .type = QEMU_OPT_STRING,
> +                .help = "command to execute to configure bridge",
> +            },
> +            { /* end of list */ }
> +        },
>      },
>      { /* end of list */ }
>  };
> @@ -2565,7 +2581,8 @@ int net_client_init(Monitor *mon, 
> QemuOpts *opts, int is_netdev)
>  #ifdef CONFIG_VDE
>              strcmp(type, "vde") != 0 &&
>  #endif
> -            strcmp(type, "socket") != 0) {
> +            strcmp(type, "socket") != 0 &&
> +            strcmp(type, "bridge") != 0) {
>              qemu_error("The '%s' network backend type is not 
> valid with -netdev\n",
>                         type);
>              return -1;
> @@ -2641,6 +2658,7 @@ static int net_host_check_device(const 
> char *device)
>  #ifdef CONFIG_VDE
>                                         ,"vde"
>  #endif
> +                                       , "bridge"
>      };
>      for (i = 0; i < sizeof(valid_param_list) / sizeof(char *); i++) {
>          if (!strncmp(valid_param_list[i], device,
> diff --git a/net/tap.c b/net/tap.c
> index bdb4a15..f5abed6 100644
> --- a/net/tap.c
> +++ b/net/tap.c
> @@ -436,3 +436,145 @@ int net_init_tap(QemuOpts *opts, 
> Monitor *mon, const char *name, VLANState *vlan
>  
>      return 0;
>  }
> +
> +#define DEFAULT_BRIDGE_INTERFACE "qemubr0"
> +#define DEFAULT_BRIDGE_HELPER CONFIG_QEMU_HELPERDIR 
> "/qemu-bridge-helper"
> +
> +static int recv_fd(int c)
> +{
> +    int fd;
> +    uint8_t msgbuf[CMSG_SPACE(sizeof(fd))];
> +    struct msghdr msg = {
> +        .msg_control = msgbuf,
> +        .msg_controllen = sizeof(msgbuf),
> +    };
> +    struct cmsghdr *cmsg;
> +    struct iovec iov;
> +    uint8_t req[1];
> +    ssize_t len;
> +
> +    cmsg = CMSG_FIRSTHDR(&msg);
> +    cmsg->cmsg_level = SOL_SOCKET;
> +    cmsg->cmsg_type = SCM_RIGHTS;
> +    cmsg->cmsg_len = CMSG_LEN(sizeof(fd));
> +    msg.msg_controllen = cmsg->cmsg_len;
> +
> +    iov.iov_base = req;
> +    iov.iov_len = sizeof(req);
> +
> +    msg.msg_iov = &iov;
> +    msg.msg_iovlen = 1;
> +
> +    len = recvmsg(c, &msg, 0);
> +    if (len > 0) {
> +        memcpy(&fd, CMSG_DATA(cmsg), sizeof(fd));
> +
> +        return fd;
> +    }
> +
> +    return len;
> +}
> +
> +static int net_bridge_run_helper(const char *helper, const 
> char *bridge)
> +{
> +    sigset_t oldmask, mask;
> +    int pid, status;
> +    char *args[5];
> +    char **parg;
> +    int sv[2];
> +
> +    sigemptyset(&mask);
> +    sigaddset(&mask, SIGCHLD);
> +    sigprocmask(SIG_BLOCK, &mask, &oldmask);
> +
> +    if (socketpair(PF_UNIX, SOCK_STREAM, 0, sv) == -1) {
> +        return -1;
> +    }
> +
> +    /* try to launch network script */
> +    pid = fork();
> +    if (pid == 0) {
> +        int open_max = sysconf(_SC_OPEN_MAX), i;
> +        char buf[32];
> +
> +        snprintf(buf, sizeof(buf), "%d", sv[1]);
> +
> +        for (i = 0; i < open_max; i++) {
> +            if (i != STDIN_FILENO &&
> +                i != STDOUT_FILENO &&
> +                i != STDERR_FILENO &&
> +                i != sv[1]) {
> +                close(i);
> +            }
> +        }
> +        parg = args;
> +        *parg++ = (char *)helper;
> +        *parg++ = (char *)"--use-vnet";
> +        *parg++ = (char *)bridge;
> +        *parg++ = buf;
> +        *parg++ = NULL;
> +        execv(helper, args);
> +        _exit(1);
> +    } else if (pid > 0) {
> +        int fd;
> +
> +        close(sv[1]);
> +
> +        do {
> +            fd = recv_fd(sv[0]);
> +        } while (fd == -1 && errno == EINTR);
> +
> +        close(sv[0]);
> +
> +        while (waitpid(pid, &status, 0) != pid) {
> +            /* loop */
> +        }
> +        sigprocmask(SIG_SETMASK, &oldmask, NULL);
> +
> +        if (fd < 0) {
> +            fprintf(stderr, "failed to recv file descriptor\n");
> +            return -1;
> +        }
> +
> +        if (WIFEXITED(status) && WEXITSTATUS(status) == 0) {
> +            return fd;
> +        }
> +    }
> +    fprintf(stderr, "failed to launch bridge helper\n");
> +    return -1;
> +}
> +
> +int net_init_bridge(QemuOpts *opts, Monitor *mon, const char 
> *name, VLANState *vlan)
> +{
> +    TAPState *s;
> +    int fd, vnet_hdr;
> +
> +    if (!qemu_opt_get(opts, "br")) {
> +        qemu_opt_set(opts, "br", DEFAULT_BRIDGE_INTERFACE);
> +    }
> +    if (!qemu_opt_get(opts, "helper")) {
> +        qemu_opt_set(opts, "helper", DEFAULT_BRIDGE_HELPER);
> +    }
> +
> +    fd = net_bridge_run_helper(qemu_opt_get(opts, "helper"),
> +                               qemu_opt_get(opts, "br"));
> +
> +    fcntl(fd, F_SETFL, O_NONBLOCK);
> +
> +    vnet_hdr = tap_probe_vnet_hdr(fd);
> +
> +    s = net_tap_fd_init(vlan, "bridge", name, fd, vnet_hdr);
> +    if (!s) {
> +        close(fd);
> +        return -1;
> +    }
> +
> +    snprintf(s->vc->info_str, sizeof(s->vc->info_str),
> +             "br=%s", qemu_opt_get(opts, "br"));
> +
> +    if (vlan) {
> +        vlan->nb_host_devs++;
> +    }
> +
> +    return 0;
> +}
> diff --git a/net/tap.h b/net/tap.h
> index 538a562..9527db4 100644
> --- a/net/tap.h
> +++ b/net/tap.h
> @@ -48,4 +48,6 @@ int tap_probe_vnet_hdr(int fd);
>  int tap_probe_has_ufo(int fd);
>  void tap_fd_set_offload(int fd, int csum, int tso4, int 
> tso6, int ecn, int ufo);
>  
> +int net_init_bridge(QemuOpts *opts, Monitor *mon, const char 
> *name, VLANState *vlan);
> +
>  #endif /* QEMU_NET_TAP_H */
> diff --git a/qemu-options.hx b/qemu-options.hx
> index d78b738..8f20aa6 100644
> --- a/qemu-options.hx
> +++ b/qemu-options.hx
> @@ -820,6 +820,9 @@ DEF("net", HAS_ARG, QEMU_OPTION_net,
>      "                default of 'sndbuf=1048576' can be 
> disabled using 'sndbuf=0'\n"
>      "                use vnet_hdr=off to avoid enabling the 
> IFF_VNET_HDR tap flag; use\n"
>      "                vnet_hdr=on to make the lack of 
> IFF_VNET_HDR support an error condition\n"
> +    "-net bridge[,vlan=n][,name=str][,br=bridge][,helper=helper]\n"
> +    "                connects to a host bridge device 'br' 
> using the program 'helper'\n"

Do you need to mention the default name qemubr0 here?

Thanks

Chris


> +    "                to create a tap device and attach it to 
> the bridge\n"
>  #endif
>      "-net 
> socket[,vlan=n][,name=str][,fd=h][,listen=[host]:port][,connec
> t=host:port]\n"
>      "                connect the vlan 'n' to another VLAN 
> using a socket connection\n"
> -- 
> 1.6.2.5
> 
> 
> 
>
Anthony Liguori - Nov. 4, 2009, 2:23 p.m.
Krumme, Chris wrote:
> Do you need to mention the default name qemubr0 here?
>   

Good suggestion.

Regards,

Anthony Liguori
Avi Kivity - Nov. 5, 2009, 2:41 p.m.
On 11/04/2009 02:28 AM, Anthony Liguori wrote:
> The most common use of -net tap is to connect a tap device to a bridge.  This
> requires the use of a script and running qemu as root in order to allocate a
> tap device to pass to the script.
>
> This model is great for portability and flexibility but it's incredibly
> difficult to eliminate the need to run qemu as root.  The only really viable
> mechanism is to use tunctl to create a tap device, attach it to a bridge as
> root, and then hand that tap device to qemu.  The problem with this mechanism
> is that it requires administrator intervention whenever a user wants to create
> a guest.
>
> By essentially writing a helper that implements the most common qemu-ifup
> script that can be safely given cap_net_admin, we can dramatically simplify
> things for non-privileged users.  We still support -net tap as a mechanism
> for advanced users and backwards compatibility.
>
> Currently, this is very Linux centric but there's really no reason why it
> couldn't be extended for other Unixes.
>
> A typical invocation of -net bridge would be:
>
>    qemu -net bridge -net nic,model=virtio
>
> The default bridge that we attach to is qemubr0.  The thinking is that a distro
> could preconfigure such an interface to allow out-of-the-box bridged networking.
>
> Alternatively, if a user wants to use a different bridge, they can say:
>
>    qemu -net bridge,br=br0 -net nic,model=virtio
>
>
> +int net_init_bridge(QemuOpts *opts, Monitor *mon, const char *name, VLANState *vlan);
> +
>    

Don't we need to tear the interface down after shutdown?
Anthony Liguori - Nov. 5, 2009, 2:45 p.m.
Avi Kivity wrote:
>> +int net_init_bridge(QemuOpts *opts, Monitor *mon, const char *name, 
>> VLANState *vlan);
>> +
>>    
>
> Don't we need to tear the interface down after shutdown?

net_init_bridge calls net_tap_fd_init which registers tap_cleanup.  That 
closes the fd and frees associated memory.

The helper does not allocate a persistent tap device so closing the file 
descriptor is sufficient for cleanup.
Avi Kivity - Nov. 5, 2009, 2:49 p.m.
On 11/05/2009 04:45 PM, Anthony Liguori wrote:
> Avi Kivity wrote:
>>> +int net_init_bridge(QemuOpts *opts, Monitor *mon, const char *name, 
>>> VLANState *vlan);
>>> +
>>
>> Don't we need to tear the interface down after shutdown?
>
> net_init_bridge calls net_tap_fd_init which registers tap_cleanup.  
> That closes the fd and frees associated memory.
>
> The helper does not allocate a persistent tap device so closing the 
> file descriptor is sufficient for cleanup.
>

Ah, so unclean shutdown will clean up as well.
Jamie Lokier - Nov. 6, 2009, 2:29 a.m.
Anthony Liguori wrote:
> Avi Kivity wrote:
> >>+int net_init_bridge(QemuOpts *opts, Monitor *mon, const char *name, 
> >>VLANState *vlan);
> >>+
> >>   
> >
> >Don't we need to tear the interface down after shutdown?
> 
> net_init_bridge calls net_tap_fd_init which registers tap_cleanup.  That 
> closes the fd and frees associated memory.
> 
> The helper does not allocate a persistent tap device so closing the file 
> descriptor is sufficient for cleanup.

I think you should at least have the option to call a/the helper when
cleaning up too.

For the same reason that -net downscript= was added.  (From the names
I get the impression that was added later than -net script=, perhaps
due to the same kind of oversight ;-)

A user-supplied helper (I'm thinking of my needs) would create per-tap
iptables/ebtables rules and perhaps routing table entries, in addition
to creating the tap interface itself.  Routing table entries are
automatically deleted when the interface is, but iptables/ebtables
rules are not.  (Although it would be a nice little kernel addition if
they could be flagged to be.)

But now, thinking a bit more clearly... why is the helper separate
from "-net script="?

Overall, I envisage this happening (sorry for making up the name
tap-up-helper, as I seem to have lost your patch mails):

    tap-up-helper=      # Sets up tap interface, adds to bridge if needed.
    script=             # Sets up IP config

       ....QEMU runs....

    downscript=         # Removes IP config
    tap-down-helper=    # If only for symmetry!

We does tap-up-helper need to be separate from script?  Does QEMU need
to do something in between calling the two?

If not, then the existing "-net script=" script could just send the
tap descriptor as well as anything else it does, couldn't it?

-- Jamie
David Woodhouse - Nov. 7, 2009, 5:29 p.m.
On Tue, 2009-11-03 at 18:28 -0600, Anthony Liguori wrote:
> The most common use of -net tap is to connect a tap device to a bridge.  This
> requires the use of a script and running qemu as root in order to allocate a
> tap device to pass to the script. 

Does it?

Tap devices can be created (and configured) in advance, and can be
chowned so that they can be opened by an otherwise unprivileged user (or
group). You don't need root privileges to use a tap device.
Anthony Liguori - Nov. 7, 2009, 10:11 p.m.
David Woodhouse wrote:
> On Tue, 2009-11-03 at 18:28 -0600, Anthony Liguori wrote:
>   
>> The most common use of -net tap is to connect a tap device to a bridge.  This
>> requires the use of a script and running qemu as root in order to allocate a
>> tap device to pass to the script. 
>>     
>
> Does it?
>
> Tap devices can be created (and configured) in advance, and can be
> chowned so that they can be opened by an otherwise unprivileged user (or
> group).

But that requires prior administrative access.

>  You don't need root privileges to use a tap device.
>   

You can access a preconfigured tap device but you cannot allocate a tap 
device and connect it to a bridge without CAP_NET_ADMIN.

Regards,

Anthony Liguori
Avi Kivity - Nov. 8, 2009, 8:27 a.m.
On 11/08/2009 12:11 AM, Anthony Liguori wrote:
>
>>  You don't need root privileges to use a tap device.
>
> You can access a preconfigured tap device but you cannot allocate a 
> tap device and connect it to a bridge without CAP_NET_ADMIN.

btw, shouldn't we, in the general case, create a bridge per user and use 
IP NAT?  If we have a global bridge, users can spoof each other's MAC 
addresses and interfere with their virtual machines.  They can also 
interfere with the real network.

That's not a concern with most one-user-per-machine configurations, but 
the default configuration should be safe.
Arnd Bergmann - Nov. 8, 2009, 8:43 a.m.
On Sunday 08 November 2009 08:27:41 Avi Kivity wrote:
> On 11/08/2009 12:11 AM, Anthony Liguori wrote:
> >
> >>  You don't need root privileges to use a tap device.
> >
> > You can access a preconfigured tap device but you cannot allocate a 
> > tap device and connect it to a bridge without CAP_NET_ADMIN.
> 
> btw, shouldn't we, in the general case, create a bridge per user and use 
> IP NAT?  If we have a global bridge, users can spoof each other's MAC 
> addresses and interfere with their virtual machines.  They can also 
> interfere with the real network.
> 
> That's not a concern with most one-user-per-machine configurations, but 
> the default configuration should be safe.

It also depends a lot on what you want to do with the virtual machine.
If you want to run a game or a legacy application in a different operating
system on your desktop, a NATed bridge is ideal, but it does not work
on a server if the guest wants to listen on a socket with its own IP address.

	Arnd <><
Avi Kivity - Nov. 8, 2009, 8:55 a.m.
On 11/08/2009 10:43 AM, Arnd Bergmann wrote:
>> btw, shouldn't we, in the general case, create a bridge per user and use
>> IP NAT?  If we have a global bridge, users can spoof each other's MAC
>> addresses and interfere with their virtual machines.  They can also
>> interfere with the real network.
>>
>> That's not a concern with most one-user-per-machine configurations, but
>> the default configuration should be safe.
>>      
> It also depends a lot on what you want to do with the virtual machine.
> If you want to run a game or a legacy application in a different operating
> system on your desktop, a NATed bridge is ideal, but it does not work
> on a server if the guest wants to listen on a socket with its own IP address.
>    

Yes.  It also depends on what the system administrator wants you to be 
able to do.  On desktop machines you are usually the system 
administrator so there is no problem.  But we should beware of making it 
easy to subvert security.

There is also the problem of accidental MAC overlap - qemu uses the same 
MAC address for all virtual machines unless overridden, so if two users 
create a virtual machine without specifying MAC addresses they will 
trample each other.  A single user could also have trouble launching two 
guests; that's not a security problem, but will lead to a lot of 
annoyance and false bug reports ("networking dies as soon as I launch a 
second guest").
Anthony Liguori - Nov. 9, 2009, 2:20 p.m.
Avi Kivity wrote:
> On 11/08/2009 12:11 AM, Anthony Liguori wrote:
>>
>>>  You don't need root privileges to use a tap device.
>>
>> You can access a preconfigured tap device but you cannot allocate a 
>> tap device and connect it to a bridge without CAP_NET_ADMIN.
>
> btw, shouldn't we, in the general case, create a bridge per user and 
> use IP NAT?  If we have a global bridge, users can spoof each other's 
> MAC addresses and interfere with their virtual machines.

qemu-bridge-helper supports that model quite well :-)  You would create 
a NAT'd bridge for each user as the administrator, then create a 
bridge.conf that consisted of per-user includes with appropriate 
permissions set on each of those files.

>   They can also interfere with the real network.
>
> That's not a concern with most one-user-per-machine configurations, 
> but the default configuration should be safe.

Let's not kid ourselves, no matter what we do we're giving a user 
elevated privileges.  Even with NAT, if the host can access the NAT'ed 
network, then you can run a privileged service (like NFS) in that 
network.  Like it or not, some networks rely on privileged services 
being trusted as part of their security model (consider NIS).

I think the best we can do is provide a tool that allows an 
administrator to grant users additional privileges in the tiniest 
increments possible.  Putting people in wheel just so they can do 
virtualization is too much.

I don't see having an fscap-based helper as creating policy.  I see it 
as adding a mechanism for administrators to create policy.
Jamie Lokier - Nov. 9, 2009, 3:39 p.m.
Anthony Liguori wrote:
> Let's not kid ourselves, no matter what we do we're giving a user 
> elevated privileges.  Even with NAT, if the host can access the NAT'ed 
> network, then you can run a privileged service (like NFS) in that 
> network.

I don't see how outgoing NAT (SNAT), where the guest can make
_outgoing_ connections to the network, allows the guest to run a
privileged service accessible to the network.  Sure, the guest can run
an NFS server, but it means nothing to the outside - it's on the
guest's own private little network.  Same as Slirp.

The guest cannot even make an outgoing request which appears to come
from an privileged port - if the SNAT rule has the appropriate options
to force the port into an unprivileged range.

For the guest's NFS server to be visible to the network requires
incoming NAT (DNAT) on the host, often called "port forwarding".  But
that is done by explicit administration; if you can do that, you can
run a privileged service on the host anyway.

> I think the best we can do is provide a tool that allows an 
> administrator to grant users additional privileges in the tiniest 
> increments possible.  Putting people in wheel just so they can do 
> virtualization is too much.
> 
> I don't see having an fscap-based helper as creating policy.  I see it 
> as adding a mechanism for administrators to create policy.

I agree with both of these.

-- Jamie
Anthony Liguori - Nov. 9, 2009, 3:43 p.m.
Jamie Lokier wrote:
> Anthony Liguori wrote:
>   
>> Let's not kid ourselves, no matter what we do we're giving a user 
>> elevated privileges.  Even with NAT, if the host can access the NAT'ed 
>> network, then you can run a privileged service (like NFS) in that 
>> network.
>>     
>
> I don't see how outgoing NAT (SNAT), where the guest can make
> _outgoing_ connections to the network, allows the guest to run a
> privileged service accessible to the network.  Sure, the guest can run
> an NFS server, but it means nothing to the outside - it's on the
> guest's own private little network.  Same as Slirp.
>
> The guest cannot even make an outgoing request which appears to come
> from an privileged port - if the SNAT rule has the appropriate options
> to force the port into an unprivileged range.
>
> For the guest's NFS server to be visible to the network requires
> incoming NAT (DNAT) on the host, often called "port forwarding".  But
> that is done by explicit administration; if you can do that, you can
> run a privileged service on the host anyway.
>   

You are correct except that I qualified this as NAT with host access 
which so far is the common model.  If the host can access the NAT'd 
network behind the NAT, then port privileges are important.
Jamie Lokier - Nov. 9, 2009, 7:19 p.m.
Anthony Liguori wrote:
> You are correct except that I qualified this as NAT with host access 
> which so far is the common model.  If the host can access the NAT'd 
> network behind the NAT, then port privileges are important.

You're right.

This is why QEMU guests should be run inside an LXC container :-)

Or in the general case, a security-conscious net-setup script should
ensure general user invocations are limited to admin-decided subnets
with admin-decided firewall rules, so that they just look like
processes with ordinary access to everything else.

Iptables being what it is, that'd have to be distro specific and
sometimes site specific.

-- Jamie
Avi Kivity - Nov. 10, 2009, 12:23 p.m.
On 11/09/2009 04:20 PM, Anthony Liguori wrote:
> Avi Kivity wrote:
>> On 11/08/2009 12:11 AM, Anthony Liguori wrote:
>>>
>>>>  You don't need root privileges to use a tap device.
>>>
>>> You can access a preconfigured tap device but you cannot allocate a 
>>> tap device and connect it to a bridge without CAP_NET_ADMIN.
>>
>> btw, shouldn't we, in the general case, create a bridge per user and 
>> use IP NAT?  If we have a global bridge, users can spoof each other's 
>> MAC addresses and interfere with their virtual machines.
>
> qemu-bridge-helper supports that model quite well :-)  You would 
> create a NAT'd bridge for each user as the administrator, then create 
> a bridge.conf that consisted of per-user includes with appropriate 
> permissions set on each of those files.

Except that the out-of-the-box experience is "go and configure things" 
again.

Really, bridging is fairly complex.  You have to set up the bridge, 
perhaps with NAT, perhaps to the physical device (possibly fighting a 
bit with NetworkManager in that case), allocate MAC addresses in a 
nonconflicting way, set up DHCP, and if you are multiuser, protect users 
from each other, and protect the host from guests.

>
>>   They can also interfere with the real network.
>>
>> That's not a concern with most one-user-per-machine configurations, 
>> but the default configuration should be safe.
>
> Let's not kid ourselves, no matter what we do we're giving a user 
> elevated privileges.  Even with NAT, if the host can access the NAT'ed 
> network, then you can run a privileged service (like NFS) in that 
> network.  Like it or not, some networks rely on privileged services 
> being trusted as part of their security model (consider NIS).

At least on commercial Linux vendor exports NFS home directories with 
the 'secure' option, requiring a privileged port to mount NFS shares.  
If your patch falls into the wrong hands, virtualization developers 
working at such a vendor might be able to access their home directories 
from guests, which would annoy IT immensely.

> I think the best we can do is provide a tool that allows an 
> administrator to grant users additional privileges in the tiniest 
> increments possible.  Putting people in wheel just so they can do 
> virtualization is too much.

I'm concerned that we're confusing the situation instead of clearing 
it.  Telling users to "edit /etc/foo/bar" so you can run bridging 
without fully explaining the impact is not responsible.  Trusting 
libvirt to do it end-to-end is fine, but in this case it can also set up 
the tap itself.

>
> I don't see having an fscap-based helper as creating policy.  I see it 
> as adding a mechanism for administrators to create policy.
>

I'll summarize my current issues with the patchset:

- by introducing an suid/fscap helper we're expanding the scope of 
security in qemu.  In addition to protecting qemu against malicious 
guests trying to gain local user privileges, we need to protect it 
against malicious local users trying to gain root (or CAP_NET_ADMIN) 
privileges

- we're introducing yet another authorization mechanism.  We have 
enough.  We already have feedback that policykit is wanted in at least 
some cases.  Given that authorization is best kept consistent in a 
distro, and that distros try to be as inconsistent from each other as 
possible, we can't please everyone.

- for the command-line user, we aren't providing a full solution; they 
still have a lot of configuration to do before things work (especially 
if they don't already have a network segment with DHCP)

- for libvirt and other management systems, it is insufficient since 
they will at least in some cases need to perform privileged 
configuration before handing the tap to the guest.  So they still need 
to be privileged and they can't use -net bridge for hotplug.

I would be happier if we drop suid/fscap from the helper.  We can have a 
user-provided script or binary gain the necessary privileges (using sudo 
or policykit or whatever).  This way we don't need to deal with local 
privilege escalation vulnerabilities.

Patch

difficult to eliminate the need to run qemu as root.  The only really viable
mechanism is to use tunctl to create a tap device, attach it to a bridge as
root, and then hand that tap device to qemu.  The problem with this mechanism
is that it requires administrator intervention whenever a user wants to create
a guest.

By essentially writing a helper that implements the most common qemu-ifup
script that can be safely given cap_net_admin, we can dramatically simplify
things for non-privileged users.  We still support -net tap as a mechanism
for advanced users and backwards compatibility.

Currently, this is very Linux centric but there's really no reason why it
couldn't be extended for other Unixes.

A typical invocation of -net bridge would be:

  qemu -net bridge -net nic,model=virtio

The default bridge that we attach to is qemubr0.  The thinking is that a distro
could preconfigure such an interface to allow out-of-the-box bridged networking.

Alternatively, if a user wants to use a different bridge, they can say:

  qemu -net bridge,br=br0 -net nic,model=virtio

Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
---
 configure       |    1 +
 net.c           |   20 +++++++-
 net/tap.c       |  142 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 net/tap.h       |    2 +
 qemu-options.hx |    3 +
 5 files changed, 167 insertions(+), 1 deletions(-)

diff --git a/configure b/configure
index 7c9d3a2..55a1a4f 100755
--- a/configure
+++ b/configure
@@ -1896,6 +1896,7 @@  echo >> $config_host_mak
 
 echo "CONFIG_QEMU_SHAREDIR=\"$prefix$datasuffix\"" >> $config_host_mak
 echo "CONFIG_QEMU_CONFDIR=\"/etc/qemu\"" >> $config_host_mak
+echo "CONFIG_QEMU_HELPERDIR=\"$prefix/libexec\"" >> $config_host_mak
 
 case "$cpu" in
   i386|x86_64|alpha|cris|hppa|ia64|m68k|microblaze|mips|mips64|ppc|ppc64|s390|sparc|sparc64)
diff --git a/net.c b/net.c
index 37662c6..54a7a5b 100644
--- a/net.c
+++ b/net.c
@@ -2541,6 +2541,22 @@  static struct {
             },
             { /* end of list */ }
         },
+    }, {
+        .type = "bridge",
+        .init = net_init_bridge,
+        .desc = {
+            NET_COMMON_PARAMS_DESC,
+            {
+                .name = "br",
+                .type = QEMU_OPT_STRING,
+                .help = "bridge name",
+            }, {
+                .name = "helper",
+                .type = QEMU_OPT_STRING,
+                .help = "command to execute to configure bridge",
+            },
+            { /* end of list */ }
+        },
     },
     { /* end of list */ }
 };
@@ -2565,7 +2581,8 @@  int net_client_init(Monitor *mon, QemuOpts *opts, int is_netdev)
 #ifdef CONFIG_VDE
             strcmp(type, "vde") != 0 &&
 #endif
-            strcmp(type, "socket") != 0) {
+            strcmp(type, "socket") != 0 &&
+            strcmp(type, "bridge") != 0) {
             qemu_error("The '%s' network backend type is not valid with -netdev\n",
                        type);
             return -1;
@@ -2641,6 +2658,7 @@  static int net_host_check_device(const char *device)
 #ifdef CONFIG_VDE
                                        ,"vde"
 #endif
+                                       , "bridge"
     };
     for (i = 0; i < sizeof(valid_param_list) / sizeof(char *); i++) {
         if (!strncmp(valid_param_list[i], device,
diff --git a/net/tap.c b/net/tap.c
index bdb4a15..f5abed6 100644
--- a/net/tap.c
+++ b/net/tap.c
@@ -436,3 +436,145 @@  int net_init_tap(QemuOpts *opts, Monitor *mon, const char *name, VLANState *vlan
 
     return 0;
 }
+
+#define DEFAULT_BRIDGE_INTERFACE "qemubr0"
+#define DEFAULT_BRIDGE_HELPER CONFIG_QEMU_HELPERDIR "/qemu-bridge-helper"
+
+static int recv_fd(int c)
+{
+    int fd;
+    uint8_t msgbuf[CMSG_SPACE(sizeof(fd))];
+    struct msghdr msg = {
+        .msg_control = msgbuf,
+        .msg_controllen = sizeof(msgbuf),
+    };
+    struct cmsghdr *cmsg;
+    struct iovec iov;
+    uint8_t req[1];
+    ssize_t len;
+
+    cmsg = CMSG_FIRSTHDR(&msg);
+    cmsg->cmsg_level = SOL_SOCKET;
+    cmsg->cmsg_type = SCM_RIGHTS;
+    cmsg->cmsg_len = CMSG_LEN(sizeof(fd));
+    msg.msg_controllen = cmsg->cmsg_len;
+
+    iov.iov_base = req;
+    iov.iov_len = sizeof(req);
+
+    msg.msg_iov = &iov;
+    msg.msg_iovlen = 1;
+
+    len = recvmsg(c, &msg, 0);
+    if (len > 0) {
+        memcpy(&fd, CMSG_DATA(cmsg), sizeof(fd));
+
+        return fd;
+    }
+
+    return len;
+}
+
+static int net_bridge_run_helper(const char *helper, const char *bridge)
+{
+    sigset_t oldmask, mask;
+    int pid, status;
+    char *args[5];
+    char **parg;
+    int sv[2];
+
+    sigemptyset(&mask);
+    sigaddset(&mask, SIGCHLD);
+    sigprocmask(SIG_BLOCK, &mask, &oldmask);
+
+    if (socketpair(PF_UNIX, SOCK_STREAM, 0, sv) == -1) {
+        return -1;
+    }
+
+    /* try to launch network script */
+    pid = fork();
+    if (pid == 0) {
+        int open_max = sysconf(_SC_OPEN_MAX), i;
+        char buf[32];
+
+        snprintf(buf, sizeof(buf), "%d", sv[1]);
+
+        for (i = 0; i < open_max; i++) {
+            if (i != STDIN_FILENO &&
+                i != STDOUT_FILENO &&
+                i != STDERR_FILENO &&
+                i != sv[1]) {
+                close(i);
+            }
+        }
+        parg = args;
+        *parg++ = (char *)helper;
+        *parg++ = (char *)"--use-vnet";
+        *parg++ = (char *)bridge;
+        *parg++ = buf;
+        *parg++ = NULL;
+        execv(helper, args);
+        _exit(1);
+    } else if (pid > 0) {
+        int fd;
+
+        close(sv[1]);
+
+        do {
+            fd = recv_fd(sv[0]);
+        } while (fd == -1 && errno == EINTR);
+
+        close(sv[0]);
+
+        while (waitpid(pid, &status, 0) != pid) {
+            /* loop */
+        }
+        sigprocmask(SIG_SETMASK, &oldmask, NULL);
+
+        if (fd < 0) {
+            fprintf(stderr, "failed to recv file descriptor\n");
+            return -1;
+        }
+
+        if (WIFEXITED(status) && WEXITSTATUS(status) == 0) {
+            return fd;
+        }
+    }
+    fprintf(stderr, "failed to launch bridge helper\n");
+    return -1;
+}
+
+int net_init_bridge(QemuOpts *opts, Monitor *mon, const char *name, VLANState *vlan)
+{
+    TAPState *s;
+    int fd, vnet_hdr;
+
+    if (!qemu_opt_get(opts, "br")) {
+        qemu_opt_set(opts, "br", DEFAULT_BRIDGE_INTERFACE);
+    }
+    if (!qemu_opt_get(opts, "helper")) {
+        qemu_opt_set(opts, "helper", DEFAULT_BRIDGE_HELPER);
+    }
+
+    fd = net_bridge_run_helper(qemu_opt_get(opts, "helper"),
+                               qemu_opt_get(opts, "br"));
+
+    fcntl(fd, F_SETFL, O_NONBLOCK);
+
+    vnet_hdr = tap_probe_vnet_hdr(fd);
+
+    s = net_tap_fd_init(vlan, "bridge", name, fd, vnet_hdr);
+    if (!s) {
+        close(fd);
+        return -1;
+    }
+
+    snprintf(s->vc->info_str, sizeof(s->vc->info_str),
+             "br=%s", qemu_opt_get(opts, "br"));
+
+    if (vlan) {
+        vlan->nb_host_devs++;
+    }
+
+    return 0;
+}
diff --git a/net/tap.h b/net/tap.h
index 538a562..9527db4 100644
--- a/net/tap.h
+++ b/net/tap.h
@@ -48,4 +48,6 @@  int tap_probe_vnet_hdr(int fd);
 int tap_probe_has_ufo(int fd);
 void tap_fd_set_offload(int fd, int csum, int tso4, int tso6, int ecn, int ufo);
 
+int net_init_bridge(QemuOpts *opts, Monitor *mon, const char *name, VLANState *vlan);
+
 #endif /* QEMU_NET_TAP_H */
diff --git a/qemu-options.hx b/qemu-options.hx
index d78b738..8f20aa6 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -820,6 +820,9 @@  DEF("net", HAS_ARG, QEMU_OPTION_net,
     "                default of 'sndbuf=1048576' can be disabled using 'sndbuf=0'\n"
     "                use vnet_hdr=off to avoid enabling the IFF_VNET_HDR tap flag; use\n"
     "                vnet_hdr=on to make the lack of IFF_VNET_HDR support an error condition\n"
+    "-net bridge[,vlan=n][,name=str][,br=bridge][,helper=helper]\n"
+    "                connects to a host bridge device 'br' using the program 'helper'\n"
+    "                to create a tap device and attach it to the bridge\n"
 #endif
     "-net socket[,vlan=n][,name=str][,fd=h][,listen=[host]:port][,connect=host:port]\n"
     "                connect the vlan 'n' to another VLAN using a socket connection\n"