Message ID | 886ef6ffeb6748f6dc4fe5431f71cb12bb74edc9.1267122331.git.mst@redhat.com |
---|---|
State | New |
Headers | show |
On 02/25/2010 12:28 PM, Michael S. Tsirkin wrote: > This adds vhost binary option to tap, to enable vhost net accelerator. > Default is off for now, we'll be able to make default on long term > when we know it's stable. > > vhostfd option can be used by management, to pass in the fd. Assigning > vhostfd implies vhost=on. > > Signed-off-by: Michael S. Tsirkin<mst@redhat.com> > Since the thinking these days is that macvtap and tap is pretty much all we'll ever need for vhost-net, perhaps we should revisit -net vhost vs. -net tap,vhost=X? I think -net vhost,fd=X makes a lot more sense than -net tap,vhost=on,vhostfd=X. Regards, Anthony Liguori > --- > net.c | 8 ++++++++ > net/tap.c | 33 +++++++++++++++++++++++++++++++++ > qemu-options.hx | 4 +++- > 3 files changed, 44 insertions(+), 1 deletions(-) > > diff --git a/net.c b/net.c > index a1bf49f..d1e23f1 100644 > --- a/net.c > +++ b/net.c > @@ -973,6 +973,14 @@ static const struct { > .name = "vnet_hdr", > .type = QEMU_OPT_BOOL, > .help = "enable the IFF_VNET_HDR flag on the tap interface" > + }, { > + .name = "vhost", > + .type = QEMU_OPT_BOOL, > + .help = "enable vhost-net network accelerator", > + }, { > + .name = "vhostfd", > + .type = QEMU_OPT_STRING, > + .help = "file descriptor of an already opened vhost net device", > }, > #endif /* _WIN32 */ > { /* end of list */ } > diff --git a/net/tap.c b/net/tap.c > index fc59fd4..65797ef 100644 > --- a/net/tap.c > +++ b/net/tap.c > @@ -41,6 +41,8 @@ > > #include "net/tap-linux.h" > > +#include "hw/vhost_net.h" > + > /* Maximum GSO packet size (64k) plus plenty of room for > * the ethernet and virtio_net headers > */ > @@ -57,6 +59,7 @@ typedef struct TAPState { > unsigned int has_vnet_hdr : 1; > unsigned int using_vnet_hdr : 1; > unsigned int has_ufo: 1; > + struct vhost_net *vhost_net; > } TAPState; > > static int launch_script(const char *setup_script, const char *ifname, int fd); > @@ -252,6 +255,10 @@ static void tap_cleanup(VLANClientState *nc) > { > TAPState *s = 
DO_UPCAST(TAPState, nc, nc); > > + if (s->vhost_net) { > + vhost_net_cleanup(s->vhost_net); > + } > + > qemu_purge_queued_packets(nc); > > if (s->down_script[0]) > @@ -307,6 +314,7 @@ static TAPState *net_tap_fd_init(VLANState *vlan, > s->has_ufo = tap_probe_has_ufo(s->fd); > tap_set_offload(&s->nc, 0, 0, 0, 0, 0); > tap_read_poll(s, 1); > + s->vhost_net = NULL; > return s; > } > > @@ -456,5 +464,30 @@ int net_init_tap(QemuOpts *opts, Monitor *mon, const char *name, VLANState *vlan > } > } > > + if (qemu_opt_get_bool(opts, "vhost", !!qemu_opt_get(opts, "vhostfd"))) { > + int vhostfd, r; > + if (qemu_opt_get(opts, "vhostfd")) { > + r = net_handle_fd_param(mon, qemu_opt_get(opts, "vhostfd")); > + if (r == -1) { > + return -1; > + } > + vhostfd = r; > + } else { > + vhostfd = -1; > + } > + s->vhost_net = vhost_net_init(&s->nc, vhostfd); > + if (!s->vhost_net) { > + qemu_error("vhost-net requested but could not be initialized\n"); > + return -1; > + } > + } else if (qemu_opt_get(opts, "vhostfd")) { > + qemu_error("vhostfd= is not valid without vhost\n"); > + return -1; > + } > + > + if (vlan) { > + vlan->nb_host_devs++; > + } > + > return 0; > } > diff --git a/qemu-options.hx b/qemu-options.hx > index f53922f..1850906 100644 > --- a/qemu-options.hx > +++ b/qemu-options.hx > @@ -879,7 +879,7 @@ DEF("net", HAS_ARG, QEMU_OPTION_net, > "-net tap[,vlan=n][,name=str],ifname=name\n" > " connect the host TAP network interface to VLAN 'n'\n" > #else > - "-net tap[,vlan=n][,name=str][,fd=h][,ifname=name][,script=file][,downscript=dfile][,sndbuf=nbytes][,vnet_hdr=on|off]\n" > + "-net tap[,vlan=n][,name=str][,fd=h][,ifname=name][,script=file][,downscript=dfile][,sndbuf=nbytes][,vnet_hdr=on|off][,vhost=on|off][,vhostfd=h]\n" > " connect the host TAP network interface to VLAN 'n' and use the\n" > " network scripts 'file' (default=" DEFAULT_NETWORK_SCRIPT ")\n" > " and 'dfile' (default=" DEFAULT_NETWORK_DOWN_SCRIPT ")\n" > @@ -889,6 +889,8 @@ DEF("net", HAS_ARG, QEMU_OPTION_net, > " 
default of 'sndbuf=1048576' can be disabled using 'sndbuf=0')\n" > " use vnet_hdr=off to avoid enabling the IFF_VNET_HDR tap flag\n" > " use vnet_hdr=on to make the lack of IFF_VNET_HDR support an error condition\n" > + " use vhost=on to enable experimental in kernel accelerator\n" > + " use 'vhostfd=h' to connect to an already opened vhost net device\n" > #endif > "-net socket[,vlan=n][,name=str][,fd=h][,listen=[host]:port][,connect=host:port]\n" > " connect the vlan 'n' to another VLAN using a socket connection\n" >
On Thu, Feb 25, 2010 at 01:47:27PM -0600, Anthony Liguori wrote: > On 02/25/2010 12:28 PM, Michael S. Tsirkin wrote: >> This adds vhost binary option to tap, to enable vhost net accelerator. >> Default is off for now, we'll be able to make default on long term >> when we know it's stable. >> >> vhostfd option can be used by management, to pass in the fd. Assigning >> vhostfd implies vhost=on. >> >> Signed-off-by: Michael S. Tsirkin<mst@redhat.com> >> > > Since the thinking these days is that macvtap and tap is pretty much all > we'll ever need for vhost-net, perhaps we should revisit -net vhost vs. > -net tap,vhost=X? > > I think -net vhost,fd=X makes a lot more sense than -net > tap,vhost=on,vhostfd=X. > > Regards, > > Anthony Liguori We'll have to duplicate all tap options. I think long term we will just make vhost=on the default. Users do not really care about vhost, it just makes tap go fater. So promoting it to 1st class options is wrong IMO. >> --- >> net.c | 8 ++++++++ >> net/tap.c | 33 +++++++++++++++++++++++++++++++++ >> qemu-options.hx | 4 +++- >> 3 files changed, 44 insertions(+), 1 deletions(-) >> >> diff --git a/net.c b/net.c >> index a1bf49f..d1e23f1 100644 >> --- a/net.c >> +++ b/net.c >> @@ -973,6 +973,14 @@ static const struct { >> .name = "vnet_hdr", >> .type = QEMU_OPT_BOOL, >> .help = "enable the IFF_VNET_HDR flag on the tap interface" >> + }, { >> + .name = "vhost", >> + .type = QEMU_OPT_BOOL, >> + .help = "enable vhost-net network accelerator", >> + }, { >> + .name = "vhostfd", >> + .type = QEMU_OPT_STRING, >> + .help = "file descriptor of an already opened vhost net device", >> }, >> #endif /* _WIN32 */ >> { /* end of list */ } >> diff --git a/net/tap.c b/net/tap.c >> index fc59fd4..65797ef 100644 >> --- a/net/tap.c >> +++ b/net/tap.c >> @@ -41,6 +41,8 @@ >> >> #include "net/tap-linux.h" >> >> +#include "hw/vhost_net.h" >> + >> /* Maximum GSO packet size (64k) plus plenty of room for >> * the ethernet and virtio_net headers >> */ >> @@ -57,6 
+59,7 @@ typedef struct TAPState { >> unsigned int has_vnet_hdr : 1; >> unsigned int using_vnet_hdr : 1; >> unsigned int has_ufo: 1; >> + struct vhost_net *vhost_net; >> } TAPState; >> >> static int launch_script(const char *setup_script, const char *ifname, int fd); >> @@ -252,6 +255,10 @@ static void tap_cleanup(VLANClientState *nc) >> { >> TAPState *s = DO_UPCAST(TAPState, nc, nc); >> >> + if (s->vhost_net) { >> + vhost_net_cleanup(s->vhost_net); >> + } >> + >> qemu_purge_queued_packets(nc); >> >> if (s->down_script[0]) >> @@ -307,6 +314,7 @@ static TAPState *net_tap_fd_init(VLANState *vlan, >> s->has_ufo = tap_probe_has_ufo(s->fd); >> tap_set_offload(&s->nc, 0, 0, 0, 0, 0); >> tap_read_poll(s, 1); >> + s->vhost_net = NULL; >> return s; >> } >> >> @@ -456,5 +464,30 @@ int net_init_tap(QemuOpts *opts, Monitor *mon, const char *name, VLANState *vlan >> } >> } >> >> + if (qemu_opt_get_bool(opts, "vhost", !!qemu_opt_get(opts, "vhostfd"))) { >> + int vhostfd, r; >> + if (qemu_opt_get(opts, "vhostfd")) { >> + r = net_handle_fd_param(mon, qemu_opt_get(opts, "vhostfd")); >> + if (r == -1) { >> + return -1; >> + } >> + vhostfd = r; >> + } else { >> + vhostfd = -1; >> + } >> + s->vhost_net = vhost_net_init(&s->nc, vhostfd); >> + if (!s->vhost_net) { >> + qemu_error("vhost-net requested but could not be initialized\n"); >> + return -1; >> + } >> + } else if (qemu_opt_get(opts, "vhostfd")) { >> + qemu_error("vhostfd= is not valid without vhost\n"); >> + return -1; >> + } >> + >> + if (vlan) { >> + vlan->nb_host_devs++; >> + } >> + >> return 0; >> } >> diff --git a/qemu-options.hx b/qemu-options.hx >> index f53922f..1850906 100644 >> --- a/qemu-options.hx >> +++ b/qemu-options.hx >> @@ -879,7 +879,7 @@ DEF("net", HAS_ARG, QEMU_OPTION_net, >> "-net tap[,vlan=n][,name=str],ifname=name\n" >> " connect the host TAP network interface to VLAN 'n'\n" >> #else >> - "-net 
tap[,vlan=n][,name=str][,fd=h][,ifname=name][,script=file][,downscript=dfile][,sndbuf=nbytes][,vnet_hdr=on|off]\n" >> + "-net tap[,vlan=n][,name=str][,fd=h][,ifname=name][,script=file][,downscript=dfile][,sndbuf=nbytes][,vnet_hdr=on|off][,vhost=on|off][,vhostfd=h]\n" >> " connect the host TAP network interface to VLAN 'n' and use the\n" >> " network scripts 'file' (default=" DEFAULT_NETWORK_SCRIPT ")\n" >> " and 'dfile' (default=" DEFAULT_NETWORK_DOWN_SCRIPT ")\n" >> @@ -889,6 +889,8 @@ DEF("net", HAS_ARG, QEMU_OPTION_net, >> " default of 'sndbuf=1048576' can be disabled using 'sndbuf=0')\n" >> " use vnet_hdr=off to avoid enabling the IFF_VNET_HDR tap flag\n" >> " use vnet_hdr=on to make the lack of IFF_VNET_HDR support an error condition\n" >> + " use vhost=on to enable experimental in kernel accelerator\n" >> + " use 'vhostfd=h' to connect to an already opened vhost net device\n" >> #endif >> "-net socket[,vlan=n][,name=str][,fd=h][,listen=[host]:port][,connect=host:port]\n" >> " connect the vlan 'n' to another VLAN using a socket connection\n" >>
On 02/26/2010 08:51 AM, Michael S. Tsirkin wrote: > On Thu, Feb 25, 2010 at 01:47:27PM -0600, Anthony Liguori wrote: > >> On 02/25/2010 12:28 PM, Michael S. Tsirkin wrote: >> >>> This adds vhost binary option to tap, to enable vhost net accelerator. >>> Default is off for now, we'll be able to make default on long term >>> when we know it's stable. >>> >>> vhostfd option can be used by management, to pass in the fd. Assigning >>> vhostfd implies vhost=on. >>> >>> Signed-off-by: Michael S. Tsirkin<mst@redhat.com> >>> >>> >> Since the thinking these days is that macvtap and tap is pretty much all >> we'll ever need for vhost-net, perhaps we should revisit -net vhost vs. >> -net tap,vhost=X? >> >> I think -net vhost,fd=X makes a lot more sense than -net >> tap,vhost=on,vhostfd=X. >> >> Regards, >> >> Anthony Liguori >> > We'll have to duplicate all tap options. > I think long term we will just make vhost=on the default. > I don't think we can. vhost only works when using KVM and it doesn't support all of the features of userspace virtio. Since it's in upstream Linux without supporting all of the virtio-net features, it's something we're going to have to deal with for a long time. Furthermore, vhost reduces a virtual machine's security. It offers an impressive performance boost (particularly when dealing with 10gbit+ networking) but for a user that doesn't have such strong networking performance requirements, I think it's reasonable for them to not want to make a security trade off. One reason I like -net vhost is that it's a much less obscure syntax and it's the sort of thing that is easy to tell users that they should use. I understand you're argument for -net tap if you assume vhost=on will become the default because that means that users never really have to be aware of vhost once it becomes the default. But as I said above, I don't think it's reasonable to make it on by default with -net tap. > Users do not really care about vhost, it just makes tap > go fater. 
So promoting it to 1st class options is wrong IMO. > Users should care about vhost because it impacts the features supported by the virtual machine and it has security ramifications. It's a great feature and I think most users will want to use it, but I do think it's something that users ought to be aware of. Regards, Anthony Liguori
On Fri, Feb 26, 2010 at 09:23:01AM -0600, Anthony Liguori wrote: > On 02/26/2010 08:51 AM, Michael S. Tsirkin wrote: >> On Thu, Feb 25, 2010 at 01:47:27PM -0600, Anthony Liguori wrote: >> >>> On 02/25/2010 12:28 PM, Michael S. Tsirkin wrote: >>> >>>> This adds vhost binary option to tap, to enable vhost net accelerator. >>>> Default is off for now, we'll be able to make default on long term >>>> when we know it's stable. >>>> >>>> vhostfd option can be used by management, to pass in the fd. Assigning >>>> vhostfd implies vhost=on. >>>> >>>> Signed-off-by: Michael S. Tsirkin<mst@redhat.com> >>>> >>>> >>> Since the thinking these days is that macvtap and tap is pretty much all >>> we'll ever need for vhost-net, perhaps we should revisit -net vhost vs. >>> -net tap,vhost=X? >>> >>> I think -net vhost,fd=X makes a lot more sense than -net >>> tap,vhost=on,vhostfd=X. >>> >>> Regards, >>> >>> Anthony Liguori >>> >> We'll have to duplicate all tap options. >> I think long term we will just make vhost=on the default. >> > > I don't think we can. vhost only works when using KVM Yes, default to on with KVM. > and it doesn't > support all of the features of userspace virtio. Since it's in upstream > Linux without supporting all of the virtio-net features, it's something > we're going to have to deal with for a long time. Speaking of vlan filtering etc? It's just a matter of time before it supports all interesting features. Kernel support is there in net-next already, userspace should be easy too. I should be able to code it up once I finish bothering about upstream merge (hint hint :)). > Furthermore, vhost reduces a virtual machine's security. It offers an > impressive performance boost (particularly when dealing with 10gbit+ > networking) but for a user that doesn't have such strong networking > performance requirements, I think it's reasonable for them to not want > to make a security trade off. It's hard for me to see how it reduces VM security. 
If it does, it's not by design and will be fixed. > One reason I like -net vhost is that it's a much less obscure syntax and > it's the sort of thing that is easy to tell users that they should use. > I understand your argument for -net tap if you assume vhost=on will > become the default because that means that users never really have to be > aware of vhost once it becomes the default. But as I said above, I > don't think it's reasonable to make it on by default with -net tap. Not yet, but we'll get there. >> Users do not really care about vhost, it just makes tap >> go faster. So promoting it to 1st class options is wrong IMO. >> > > Users should care about vhost because it impacts the features supported > by the virtual machine and it has security ramifications. It's a great > feature and I think most users will want to use it, but I do think > it's something that users ought to be aware of. > > Regards, > > Anthony Liguori
On 02/27/2010 01:44 PM, Michael S. Tsirkin wrote: >> and it doesn't >> support all of the features of userspace virtio. Since it's in upstream >> Linux without supporting all of the virtio-net features, it's something >> we're going to have to deal with for a long time. >> > Speaking of vlan filtering etc? It's just a matter of time before it > supports all interesting features. Kernel support is there in net-next > already, userspace should be easy too. I should be able to code it up > once I finish bothering about upstream merge (hint hint :)). > :-) As I've said in the past, I'm willing to live with -net tap,vhost but I really think -net vhost would be better in the long run. The only two real issues I have with the series is the ring address mapping stability and the duplicated slot management code. Both have security implications so I think it's important that they be addressed. Otherwise, I'm pretty happy with how things are. >> Furthermore, vhost reduces a virtual machine's security. It offers an >> impressive performance boost (particularly when dealing with 10gbit+ >> networking) but for a user that doesn't have such strong networking >> performance requirements, I think it's reasonable for them to not want >> to make a security trade off. >> > It's hard for me to see how it reduces VM security. If it does, it's > not by design and will be fixed. > If you have a bug in vhost-net (would never happen of course) then it's a host-kernel exploit whereas if we have a bug in virtio-net userspace, it's a local user exploit. We have a pretty robust architecture to deal with local user exploits (qemu can run unprivilieged, SELinux enforces mandatory access control) but a host-kernel can not be protected against. I'm not saying that we should never put things in the kernel, but there's definitely a security vs. performance trade off here. Regards, Anthony Liguori
On Sun, Feb 28, 2010 at 10:08:26AM -0600, Anthony Liguori wrote: > On 02/27/2010 01:44 PM, Michael S. Tsirkin wrote: >>> and it doesn't >>> support all of the features of userspace virtio. Since it's in upstream >>> Linux without supporting all of the virtio-net features, it's something >>> we're going to have to deal with for a long time. >>> >> Speaking of vlan filtering etc? It's just a matter of time before it >> supports all interesting features. Kernel support is there in net-next >> already, userspace should be easy too. I should be able to code it up >> once I finish bothering about upstream merge (hint hint :)). >> > > :-) As I've said in the past, I'm willing to live with -net tap,vhost > but I really think -net vhost would be better in the long run. > > The only two real issues I have with the series is the ring address > mapping stability This one I do not yet understand completely to be able to solve. Is the only case where PCI BAR overlays RAM? I think this case is best dealt with by disabling BAR mapping. > and the duplicated slot management code. If you look at qemu-kvm, it's even triplicated :) I just would like to get the code merged, then work at adding more infrastructure to prettify it. > Both have security implications so I think it's important that they > be addressed. Otherwise, I'm pretty happy with how things are. Care suggesting some solutions? > >>> Furthermore, vhost reduces a virtual machine's security. It offers an >>> impressive performance boost (particularly when dealing with 10gbit+ >>> networking) but for a user that doesn't have such strong networking >>> performance requirements, I think it's reasonable for them to not want >>> to make a security trade off. >>> >> It's hard for me to see how it reduces VM security. If it does, it's >> not by design and will be fixed. 
>> > > If you have a bug in vhost-net (would never happen of course) then it's > a host-kernel exploit whereas if we have a bug in virtio-net userspace, > it's a local user exploit. We have a pretty robust architecture to deal > with local user exploits (qemu can run unprivileged, SELinux enforces > mandatory access control) but a host-kernel can not be protected against. > > I'm not saying that we should never put things in the kernel, but > there's definitely a security vs. performance trade off here. > > Regards, > > Anthony Liguori Not sure I get the argument completely. Any kernel service with a bug might be exploited for privilege escalation. Yes, more kernel code gives you more attack surface, but given we use rich interfaces such as ones exposed by kvm, I am not sure by how much. Also note that vhost net does not take qemu out of the equation for everything, just for datapath operations.
On 02/28/2010 11:19 AM, Michael S. Tsirkin wrote: >> Both have security implications so I think it's important that they >> be addressed. Otherwise, I'm pretty happy with how things are. >> > Care suggesting some solutions? > The obvious thing to do would be to use the memory notifier in vhost to keep track of whenever something remaps the ring's memory region and if that happens, issue an ioctl to vhost to change the location of the ring. Also, you would need to merge the vhost slot management code with the KVM slot management code. I'm sympathetic to your arguments though. As qemu is today, the above is definitely the right thing to do. But ram is always ram and ram always has a fixed (albeit non-linear) mapping within a guest. We can probably be smarter in qemu. There are areas of MMIO/ROM address space that *sometimes* end up behaving like ram, but that's a different case. The one other case to consider is ram hot add/remove in which case, ram may be removed or added (but it's location will never change during its lifetime). Here's what I'll propose, and I'd really like to hear what Paul think about it before we start down this path. I think we should add a new API that's: void cpu_ram_add(target_phys_addr_t start, ram_addr_t size); This API would do two things. It would call qemu_ram_alloc() and cpu_register_physical_memory() just as code does today. It would also add this region into a new table. There would be: void *cpu_ram_map(target_phys_addr_t start, ram_addr_t *size); void cpu_ram_unmap(void *mem); These calls would use this new table to lookup ram addresses. These mappings are valid as long as the guest is executed. Within the table, each region would have a reference count. When it comes time to do hot add/remove, we would wait to remove a region until the reference count went to zero to avoid unmapping during DMA. cpu_ram_add() never gets called with overlapping regions. 
We'll modify cpu_register_physical_memory() to ensure that a ram mapping is never changed after initial registration. vhost no longer needs to bother keeping the dynamic table up to date so it removes all of the slot management code from vhost. KVM still needs the code to handle rom/ram mappings but we can take care of that next. virtio-net's userspace code can do the same thing as vhost and only map the ring once which should be a big performance improvement. It also introduces a place to do madvise() reset registrations. This is definitely appropriate for target-i386. I suspect it is for other architectures too. Regards, Anthony Liguori >> >>>> Furthermore, vhost reduces a virtual machine's security. It offers an >>>> impressive performance boost (particularly when dealing with 10gbit+ >>>> networking) but for a user that doesn't have such strong networking >>>> performance requirements, I think it's reasonable for them to not want >>>> to make a security trade off. >>>> >>>> >>> It's hard for me to see how it reduces VM security. If it does, it's >>> not by design and will be fixed. >>> >>> >> If you have a bug in vhost-net (would never happen of course) then it's >> a host-kernel exploit whereas if we have a bug in virtio-net userspace, >> it's a local user exploit. We have a pretty robust architecture to deal >> with local user exploits (qemu can run unprivilieged, SELinux enforces >> mandatory access control) but a host-kernel can not be protected against. >> >> I'm not saying that we should never put things in the kernel, but >> there's definitely a security vs. performance trade off here. >> >> Regards, >> >> Anthony Liguori >> > Not sure I get the argument completely. Any kernel service with a bug > might be exploited for priveledge escalation. Yes, more kernel code > gives you more attack surface, but given we use rich interfaces such as > ones exposed by kvm, I am not sure by how much. 
> > Also note that vhost net does not take qemu out of the equation for > everything, just for datapath operations. > >
On Sun, Feb 28, 2010 at 02:57:56PM -0600, Anthony Liguori wrote: > On 02/28/2010 11:19 AM, Michael S. Tsirkin wrote: >>> Both have security implications so I think it's important that they >>> be addressed. Otherwise, I'm pretty happy with how things are. >>> >> Care suggesting some solutions? >> > > The obvious thing to do would be to use the memory notifier in vhost to > keep track of whenever something remaps the ring's memory region and if > that happens, issue an ioctl to vhost to change the location of the > ring. It would be easy to do, but what I wondered about, is what happens in the guest meanwhile. Which ring address has the correct descriptors: the old one? The new one? Both? This question leads me to the belief that well-behaved guest will never encounter this. > Also, you would need to merge the vhost slot management code with > the KVM slot management code. > > I'm sympathetic to your arguments though. As qemu is today, the above > is definitely the right thing to do. But ram is always ram and ram > always has a fixed (albeit non-linear) mapping within a guest. We can > probably be smarter in qemu. > > There are areas of MMIO/ROM address space that *sometimes* end up > behaving like ram, but that's a different case. The one other case to > consider is ram hot add/remove in which case, ram may be removed or > added (but it's location will never change during its lifetime). > > Here's what I'll propose, and I'd really like to hear what Paul think > about it before we start down this path. > > I think we should add a new API that's: > > void cpu_ram_add(target_phys_addr_t start, ram_addr_t size); > > This API would do two things. It would call qemu_ram_alloc() and > cpu_register_physical_memory() just as code does today. It would also > add this region into a new table. > > There would be: > > void *cpu_ram_map(target_phys_addr_t start, ram_addr_t *size); > void cpu_ram_unmap(void *mem); > > These calls would use this new table to lookup ram addresses. 
These > mappings are valid as long as the guest is executed. Within the table, > each region would have a reference count. When it comes time to do hot > add/remove, we would wait to remove a region until the reference count > went to zero to avoid unmapping during DMA. > > cpu_ram_add() never gets called with overlapping regions. We'll modify > cpu_register_physical_memory() to ensure that a ram mapping is never > changed after initial registration. > > vhost no longer needs to bother keeping the dynamic table up to date so > it removes all of the slot management code from vhost. KVM still needs > the code to handle rom/ram mappings but we can take care of that next. > virtio-net's userspace code can do the same thing as vhost and only map > the ring once which should be a big performance improvement. > > It also introduces a place to do madvise() reset registrations. > > This is definitely appropriate for target-i386. I suspect it is for > other architectures too. > > Regards, > > Anthony Liguori > >>> >>>>> Furthermore, vhost reduces a virtual machine's security. It offers an >>>>> impressive performance boost (particularly when dealing with 10gbit+ >>>>> networking) but for a user that doesn't have such strong networking >>>>> performance requirements, I think it's reasonable for them to not want >>>>> to make a security trade off. >>>>> >>>>> >>>> It's hard for me to see how it reduces VM security. If it does, it's >>>> not by design and will be fixed. >>>> >>>> >>> If you have a bug in vhost-net (would never happen of course) then it's >>> a host-kernel exploit whereas if we have a bug in virtio-net userspace, >>> it's a local user exploit. We have a pretty robust architecture to deal >>> with local user exploits (qemu can run unprivilieged, SELinux enforces >>> mandatory access control) but a host-kernel can not be protected against. >>> >>> I'm not saying that we should never put things in the kernel, but >>> there's definitely a security vs. 
performance trade off here. >>> >>> Regards, >>> >>> Anthony Liguori >>> >> Not sure I get the argument completely. Any kernel service with a bug >> might be exploited for privilege escalation. Yes, more kernel code >> gives you more attack surface, but given we use rich interfaces such as >> ones exposed by kvm, I am not sure by how much. >> >> Also note that vhost net does not take qemu out of the equation for >> everything, just for datapath operations. >> >>
On 02/28/2010 03:01 PM, Michael S. Tsirkin wrote: > On Sun, Feb 28, 2010 at 02:57:56PM -0600, Anthony Liguori wrote: > >> On 02/28/2010 11:19 AM, Michael S. Tsirkin wrote: >> >>>> Both have security implications so I think it's important that they >>>> be addressed. Otherwise, I'm pretty happy with how things are. >>>> >>>> >>> Care suggesting some solutions? >>> >>> >> The obvious thing to do would be to use the memory notifier in vhost to >> keep track of whenever something remaps the ring's memory region and if >> that happens, issue an ioctl to vhost to change the location of the >> ring. >> > It would be easy to do, but what I wondered about, is what happens in the > guest meanwhile. Which ring address has the correct descriptors: the old > one? The new one? Both? This question leads me to the belief that well-behaved > guest will never encounter this. > This is not a question of well-behaved guests. It's a question about what our behaviour is in the face of a malicious guest. While I agree with you that that behaviour can be undefined, writing to an invalid ram location I believe could lead to guest privilege escalation. I think the two solutions we could implement would be to always use the latest mapping (which is what all code does today) or to actively prevent ram from being remapped (which is my proposal below). Regards, Anthony Liguori
> I'm sympathetic to your arguments though. As qemu is today, the above > is definitely the right thing to do. But ram is always ram and ram > always has a fixed (albeit non-linear) mapping within a guest. I think this assumption is unsafe. There are machines where RAM mappings can change. It's not uncommon for a chip select (i.e. physical memory address region) to be switchable to several different sources, one of which may be RAM. I'm pretty sure this functionality is present (but not actually implemented) on some of the current qemu targets. I agree that changing RAM mappings under an active DMA is a fairly suspect thing to do. However I think we need to avoid cache mappings between separate DMA transactions i.e. when the guest can know that no DMA will occur, and safely remap things. I'm also of the opinion that virtio devices should behave the same as any other device. i.e. if you put a virtio-net-pci device on a PCI bus behind an IOMMU, then it should see the same address space as any other PCI device in that location. Apart from anything else, failure to do this breaks nested virtualization. While qemu doesn't currently implement an IOMMU, the DMA interfaces have been designed to allow it. > void cpu_ram_add(target_phys_addr_t start, ram_addr_t size); We need to support aliased memory regions. For example the ARM RealView boards expose the first 256M RAM at both address 0x0 and 0x70000000. It's also common for systems to create aliases by ignoring certain address bits. e.g. each sim slot is allocated a fixed 256M region. Populating that slot with a 128M stick will cause the contents to be aliased in both the top and bottom halves of that region. Paul
On Sun, Feb 28, 2010 at 10:39:21PM +0000, Paul Brook wrote: > > I'm sympathetic to your arguments though. As qemu is today, the above > > is definitely the right thing to do. But ram is always ram and ram > > always has a fixed (albeit non-linear) mapping within a guest. > > I think this assumption is unsafe. There are machines where RAM mappings can > change. It's not uncommon for a chip select (i.e. physical memory address > region) to be switchable to several different sources, one of which may be > RAM. I'm pretty sure this functionality is present (but not actually > implemented) on some of the current qemu targets. > > I agree that changing RAM mappings under an active DMA is a fairly suspect > thing to do. However I think we need to avoid cache mappings between separate > DMA transactions i.e. when the guest can know that no DMA will occur, and > safely remap things. > > I'm also of the opinion that virtio devices should behave the same as any > other device. i.e. if you put a virtio-net-pci device on a PCI bus behind an > IOMMU, then it should see the same address space as any other PCI device in > that location. It already doesn't. virtio passes physical memory addresses to device instead of DMA addresses. > Apart from anything else, failure to do this breaks nested > virtualization. Assigning PV device in nested virtualization? It could work, but not sure what the point would be. > While qemu doesn't currently implement an IOMMU, the DMA > interfaces have been designed to allow it. > > > void cpu_ram_add(target_phys_addr_t start, ram_addr_t size); > > We need to support aliased memory regions. For example the ARM RealView boards > expose the first 256M RAM at both address 0x0 and 0x70000000. It's also common > for systems to create aliases by ignoring certain address bits. e.g. each sim > slot is allocated a fixed 256M region. Populating that slot with a 128M stick > will cause the contents to be aliased in both the top and bottom halves of > that region. 
> > Paul
On 03/01/2010 01:27 PM, Michael S. Tsirkin wrote: > On Sun, Feb 28, 2010 at 10:39:21PM +0000, Paul Brook wrote: > >>> I'm sympathetic to your arguments though. As qemu is today, the above >>> is definitely the right thing to do. But ram is always ram and ram >>> always has a fixed (albeit non-linear) mapping within a guest. >>> >> I think this assumption is unsafe. There are machines where RAM mappings can >> change. It's not uncommon for a chip select (i.e. physical memory address >> region) to be switchable to several different sources, one of which may be >> RAM. I'm pretty sure this functionality is present (but not actually >> implemented) on some of the current qemu targets. >> >> I agree that changing RAM mappings under an active DMA is a fairly suspect >> thing to do. However I think we need to avoid cache mappings between separate >> DMA transactions i.e. when the guest can know that no DMA will occur, and >> safely remap things. >> >> I'm also of the opinion that virtio devices should behave the same as any >> other device. i.e. if you put a virtio-net-pci device on a PCI bus behind an >> IOMMU, then it should see the same address space as any other PCI device in >> that location. >> > It already doesn't. virtio passes physical memory addresses > to device instead of DMA addresses. > That's technically a bug. >> Apart from anything else, failure to do this breaks nested >> virtualization. >> > Assigning PV device in nested virtualization? It could work, but not > sure what the point would be. > It misses the point really. vhost-net is not a device model and it shouldn't have to care about things like PCI IOMMU. If we did ever implement a PCI IOMMU, then we would perform ring translation (or not use vhost-net). Regards, Anthony Liguori >> While qemu doesn't currently implement an IOMMU, the DMA >> interfaces have been designed to allow it. 
>> >> >>> void cpu_ram_add(target_phys_addr_t start, ram_addr_t size); >>> >> We need to support aliased memory regions. For example the ARM RealView boards >> expose the first 256M RAM at both address 0x0 and 0x70000000. It's also common >> for systems to create aliases by ignoring certain address bits. e.g. each sim >> slot is allocated a fixed 256M region. Populating that slot with a 128M stick >> will cause the contents to be aliased in both the top and bottom halves of >> that region. >> >> Paul >>
On Mon, Mar 01, 2010 at 03:54:00PM -0600, Anthony Liguori wrote: > On 03/01/2010 01:27 PM, Michael S. Tsirkin wrote: >> On Sun, Feb 28, 2010 at 10:39:21PM +0000, Paul Brook wrote: >> >>>> I'm sympathetic to your arguments though. As qemu is today, the above >>>> is definitely the right thing to do. But ram is always ram and ram >>>> always has a fixed (albeit non-linear) mapping within a guest. >>>> >>> I think this assumption is unsafe. There are machines where RAM mappings can >>> change. It's not uncommon for a chip select (i.e. physical memory address >>> region) to be switchable to several different sources, one of which may be >>> RAM. I'm pretty sure this functionality is present (but not actually >>> implemented) on some of the current qemu targets. >>> >>> I agree that changing RAM mappings under an active DMA is a fairly suspect >>> thing to do. However I think we need to avoid cache mappings between separate >>> DMA transactions i.e. when the guest can know that no DMA will occur, and >>> safely remap things. >>> >>> I'm also of the opinion that virtio devices should behave the same as any >>> other device. i.e. if you put a virtio-net-pci device on a PCI bus behind an >>> IOMMU, then it should see the same address space as any other PCI device in >>> that location. >>> >> It already doesn't. virtio passes physical memory addresses >> to device instead of DMA addresses. >> > > That's technically a bug. > >>> Apart from anything else, failure to do this breaks nested >>> virtualization. >>> >> Assigning PV device in nested virtualization? It could work, but not >> sure what the point would be. >> > > It misses the point really. > > vhost-net is not a device model and it shouldn't have to care about > things like PCI IOMMU. If we did ever implement a PCI IOMMU, then we > would perform ring translation (or not use vhost-net). > > Regards, > > Anthony Liguori Right. 
>>> While qemu doesn't currently implement an IOMMU, the DMA >>> interfaces have been designed to allow it. >>> >>> >>>> void cpu_ram_add(target_phys_addr_t start, ram_addr_t size); >>>> >>> We need to support aliased memory regions. For example the ARM RealView boards >>> expose the first 256M RAM at both address 0x0 and 0x70000000. It's also common >>> for systems to create aliases by ignoring certain address bits. e.g. each sim >>> slot is allocated a fixed 256M region. Populating that slot with a 128M stick >>> will cause the contents to be aliased in both the top and bottom halves of >>> that region. >>> >>> Paul >>>
On 02/28/2010 04:39 PM, Paul Brook wrote: >> I'm sympathetic to your arguments though. As qemu is today, the above >> is definitely the right thing to do. But ram is always ram and ram >> always has a fixed (albeit non-linear) mapping within a guest. >> > I think this assumption is unsafe. There are machines where RAM mappings can > change. It's not uncommon for a chip select (i.e. physical memory address > region) to be switchable to several different sources, one of which may be > RAM. I'm pretty sure this functionality is present (but not actually > implemented) on some of the current qemu targets. > But I presume this is more about switching a dim to point at a different region in memory. It's a rare event similar to memory hot plug. Either way, if there are platforms where we don't treat ram with the new ram api, that's okay. > I agree that changing RAM mappings under an active DMA is a fairly suspect > thing to do. However I think we need to avoid cache mappings between separate > DMA transactions i.e. when the guest can know that no DMA will occur, and > safely remap things. > One thing I like about having a new ram api is it gives us a stronger interface than what we have today. Today, we don't have a strong guarantee that mappings won't be changed during a DMA transaction. With a new api, cpu_physical_memory_map() changes semantics. It only returns pointers for static ram mappings. Everything else is bounced which guarantees that an address can't change during DMA. > I'm also of the opinion that virtio devices should behave the same as any > other device. i.e. if you put a virtio-net-pci device on a PCI bus behind an > IOMMU, then it should see the same address space as any other PCI device in > that location. Apart from anything else, failure to do this breaks nested > virtualization. While qemu doesn't currently implement an IOMMU, the DMA > interfaces have been designed to allow it. > Yes, I've been working on that. 
virtio is a bit more complicated than a normal PCI device because it can be on top of two busses. It needs an additional layer of abstraction to deal with this. >> void cpu_ram_add(target_phys_addr_t start, ram_addr_t size); >> > We need to support aliased memory regions. For example the ARM RealView boards > expose the first 256M RAM at both address 0x0 and 0x70000000. It's also common > for systems to create aliases by ignoring certain address bits. e.g. each sim > slot is allocated a fixed 256M region. Populating that slot with a 128M stick > will cause the contents to be aliased in both the top and bottom halves of > that region. > Okay, I'd prefer to add an explicit aliasing API. That gives us more information to work with. Regards, Anthony Liguori > Paul >
> > I think this assumption is unsafe. There are machines where RAM mappings > > can change. It's not uncommon for a chip select (i.e. physical memory > > address region) to be switchable to several different sources, one of > > which may be RAM. I'm pretty sure this functionality is present (but not > > actually implemented) on some of the current qemu targets. > > But I presume this is more about switching a dim to point at a different > region in memory. It's a rare event similar to memory hot plug. Approximately that, yes. One use is to start with ROM at address zero, then switch to RAM once you've initialised the DRAM controller (using a small internal SRAM as a trampoline). > Either way, if there are platforms where we don't treat ram with the new > ram api, that's okay. > > > I agree that changing RAM mappings under an active DMA is a fairly > > suspect thing to do. However I think we need to avoid cache mappings > > between separate DMA transactions i.e. when the guest can know that no > > DMA will occur, and safely remap things. > > One thing I like about having a new ram api is it gives us a stronger > interface than what we have today. Today, we don't have a strong > guarantee that mappings won't be changed during a DMA transaction. > > With a new api, cpu_physical_memory_map() changes semantics. It only > returns pointers for static ram mappings. Everything else is bounced > which guarantees that an address can't change during DMA. Doesn't this mean that only the initial RAM is directly DMA-able? While memory hotplug(and unplug) may be an infrequent event, having the majority of ram be hotplug seems much more likely. Giving each guest a small base allocation, then hotplug the rest as required/paid for seems an entirely reasonable setup. I use VMs which are normally hosted on a multi-core machine with buckets of ram, but may be migrated to a single CPU host with limited ram in an emergency. Paul
On 03/02/2010 08:33 AM, Paul Brook wrote: >>> I think this assumption is unsafe. There are machines where RAM mappings >>> can change. It's not uncommon for a chip select (i.e. physical memory >>> address region) to be switchable to several different sources, one of >>> which may be RAM. I'm pretty sure this functionality is present (but not >>> actually implemented) on some of the current qemu targets. >>> >> But I presume this is more about switching a dim to point at a different >> region in memory. It's a rare event similar to memory hot plug. >> > Approximately that, yes. One use is to start with ROM at address zero, then > switch to RAM once you've initialised the DRAM controller (using a small > internal SRAM as a trampoline). > > >> Either way, if there are platforms where we don't treat ram with the new >> ram api, that's okay. >> >> >>> I agree that changing RAM mappings under an active DMA is a fairly >>> suspect thing to do. However I think we need to avoid cache mappings >>> between separate DMA transactions i.e. when the guest can know that no >>> DMA will occur, and safely remap things. >>> >> One thing I like about having a new ram api is it gives us a stronger >> interface than what we have today. Today, we don't have a strong >> guarantee that mappings won't be changed during a DMA transaction. >> >> With a new api, cpu_physical_memory_map() changes semantics. It only >> returns pointers for static ram mappings. Everything else is bounced >> which guarantees that an address can't change during DMA. >> > Doesn't this mean that only the initial RAM is directly DMA-able? > > While memory hotplug(and unplug) may be an infrequent event, having the > majority of ram be hotplug seems much more likely. Hotplug works fine for direct DMA'ing. map/unmap would maintain a reference count on the registered RAM region and hot unplug would not be allowed until that reference dropped to zero. 
For something like virtio, it means that the driver has to be unloaded in the guest before you hot unplug the region of memory if it happens to be using that region of memory for the ring storage. The key difference is that these regions are created and destroyed rarely and in such a way that the destruction is visible to the guest. If you compare that to IO memory, we currently flip IO memory's mappings dynamically without the guest really being aware (such as the VGA optimization). An API like this wouldn't work for IO memory today without some serious thinking about how to model this sort of thing. Regards, Anthony Liguori
> >> With a new api, cpu_physical_memory_map() changes semantics. It only > >> returns pointers for static ram mappings. Everything else is bounced > >> which guarantees that an address can't change during DMA. > > > > Doesn't this mean that only the initial RAM is directly DMA-able? > > > > While memory hotplug(and unplug) may be an infrequent event, having the > > majority of ram be hotplug seems much more likely. > > Hotplug works fine for direct DMA'ing. map/unmap would maintain a > reference count on the registered RAM region and hot unplug would not be > allowed until that reference dropped to zero. For something like > virtio, it means that the driver has to be unloaded in the guest before > you hot unplug the region of memory if it happens to be using that > region of memory for the ring storage. > > The key difference is that these regions are created and destroyed > rarely and in such a way that the destruction is visible to the guest. So you're making ram unmap an asynchronous process, and requiring that the address space not be reused until that unmap has completed? Paul
On 03/02/2010 08:55 AM, Paul Brook wrote: >>>> With a new api, cpu_physical_memory_map() changes semantics. It only >>>> returns pointers for static ram mappings. Everything else is bounced >>>> which guarantees that an address can't change during DMA. >>>> >>> Doesn't this mean that only the initial RAM is directly DMA-able? >>> >>> While memory hotplug(and unplug) may be an infrequent event, having the >>> majority of ram be hotplug seems much more likely. >>> >> Hotplug works fine for direct DMA'ing. map/unmap would maintain a >> reference count on the registered RAM region and hot unplug would not be >> allowed until that reference dropped to zero. For something like >> virtio, it means that the driver has to be unloaded in the guest before >> you hot unplug the region of memory if it happens to be using that >> region of memory for the ring storage. >> >> The key difference is that these regions are created and destroyed >> rarely and in such a way that the destruction is visible to the guest. >> > So you're making ram unmap an asynchronous process, and requiring that the > address space not be reused until that umap has completed? > It technically already would be. If you've got a pending DMA transaction and you try to hot unplug badness will happen. This is something that is certainly exploitable. Regards, Anthony Liguori > Paul >
> >> The key difference is that these regions are created and destroyed > >> rarely and in such a way that the destruction is visible to the guest. > > > > So you're making ram unmap an asynchronous process, and requiring that > > the address space not be reused until that unmap has completed? > > It technically already would be. If you've got a pending DMA > transaction and you try to hot unplug badness will happen. This is > something that is certainly exploitable. Hmm, I guess we probably want to make this work with all mappings then. DMA to a ram backed PCI BAR (e.g. video ram) is certainly feasible. Technically it's not the unmap that causes badness, it's freeing the underlying ram. For these reasons I'm tempted to push the refcounting down to the ram allocation level. This has a couple of nice properties. Firstly we don't care about dynamic allocation any more. We just say that mapping changes may not affect active DMA transactions. If virtio chooses to define that the vring DMA transaction starts when the device is enabled and ends when disabled, that's fine by me. This probably requires revisiting the memory barrier issues - barriers are pointless if you don't guarantee cache coherence (i.e. no bounce buffers). Secondly, ram deallocation is not guest visible. The guest visible parts (memory unmapping) can happen immediately, and we avoid a whole set of unplug/replug race conditions. We may want to delay the completion of a monitor hotplug command until the actual deallocation occurs, but that's a largely separate issue. Paul
On Tue, Mar 02, 2010 at 03:53:30PM +0000, Paul Brook wrote: > > >> The key difference is that these regions are created and destroyed > > >> rarely and in such a way that the destruction is visible to the guest. > > > > > > So you're making ram unmap an asynchronous process, and requiring that > > > the address space not be reused until that umap has completed? > > > > It technically already would be. If you've got a pending DMA > > transaction and you try to hot unplug badness will happen. This is > > something that is certainly exploitable. > > Hmm, I guess we probably want to make this work with all mappings then. DMA to > a ram backed PCI BAR (e.g. video ram) is certainly feasible. This used to be possible with PCI/PCI-X. But as far as I know, with PCI Express, devices can not access each other's BARs. > Technically it's not the unmap that causes badness, it's freeing the > underlying ram. > > For these reasons I'm tempted to push the refcounting down to the ram > allocation level. This has a couple of nice properties. > > Firstly we don't care about dynamic allocation any more. We just say that > mapping changes may not effect active DMA transactions. If virtio chooses to > define that the vring DMA transaction starts when the device is enabled and > ends when disabled, that's fine by me. This probably requires revisiting the > memory barrier issues - barriers are pointless if you don't guarantee cache > coherence (i.e. no bounce buffers). > > Secondly, ram deallocation is not guest visible. The guest visible parts > (memory unmapping) can happen immediately, and we avoid a whole set of > unplug/replug race conditions. We may want to delay the completion of a > monitor hotplug command until the actual deallocation occurs, but that's a > largely separate issue. > > Paul
On 03/02/2010 09:53 AM, Paul Brook wrote: >>>> The key difference is that these regions are created and destroyed >>>> rarely and in such a way that the destruction is visible to the guest. >>>> >>> So you're making ram unmap an asynchronous process, and requiring that >>> the address space not be reused until that umap has completed? >>> >> It technically already would be. If you've got a pending DMA >> transaction and you try to hot unplug badness will happen. This is >> something that is certainly exploitable. >> > Hmm, I guess we probably want to make this work with all mappings then. DMA to > a ram backed PCI BAR (e.g. video ram) is certainly feasible. > Technically it's not the unmap that causes badness, it's freeing the > underlying ram. > Let's avoid confusing terminology. We have RAM mappings and then we have PCI BARs that are mapped as IO_MEM_RAM. PCI BARs mapped as IO_MEM_RAM are allocated by the device and live for the duration of the device. If you did something that changed the BAR's mapping from IO_MEM_RAM to an actual IO memory type, then you'd continue to DMA to the allocated device memory instead of doing MMIO operations.[1] That's completely accurate and safe. If you did this to bare metal, I expect you'd get very similar results. This is different from DMA'ing to a RAM region and then removing the RAM region while the IO is in flight. In this case, the mapping disappears and you potentially have the guest writing to an invalid host pointer. [1] I don't think it's useful to support DMA'ing to arbitrary IO_MEM_RAM areas. Instead, I think we should always bounce to this memory. The benefit is that we avoid the complications resulting from PCI hot unplug and reference counting. > For these reasons I'm tempted to push the refcounting down to the ram > allocation level. This has a couple of nice properties. > > Firstly we don't care about dynamic allocation any more. We just say that > mapping changes may not effect active DMA transactions. 
Only if we think it's necessary to support native DMA to arbitrary IO_MEM_RAM. I contend this is never a normal or performance sensitive case and it's not worth supporting. > If virtio chooses to > define that the vring DMA transaction starts when the device is enabled and > ends when disabled, that's fine by me. This probably requires revisiting the > memory barrier issues - barriers are pointless if you don't guarantee cache > coherence (i.e. no bounce buffers). > > Secondly, ram deallocation is not guest visible. The guest visible parts > (memory unmapping) can happen immediately, and we avoid a whole set of > unplug/replug race conditions. We may want to delay the completion of a > monitor hotplug command until the actual deallocation occurs, but that's a > largely separate issue. > You can do the same thing and always bounce IO_MEM_RAM IO regions. It's just a question of whether we think it's worth the effort to do native DMA to this type of memory. I personally don't think it is at least in the beginning. Regards, Anthony Liguori > Paul >
On Sun, Feb 28, 2010 at 02:57:56PM -0600, Anthony Liguori wrote: > On 02/28/2010 11:19 AM, Michael S. Tsirkin wrote: > >>Both have security implications so I think it's important that they > >>be addressed. Otherwise, I'm pretty happy with how things are. > >Care suggesting some solutions? > > The obvious thing to do would be to use the memory notifier in vhost > to keep track of whenever something remaps the ring's memory region > and if that happens, issue an ioctl to vhost to change the location > of the ring. Also, you would need to merge the vhost slot > management code with the KVM slot management code. There are no security implications as long as vhost uses the qemu process mappings. But your point is valid. > I'm sympathetic to your arguments though. As qemu is today, the > above is definitely the right thing to do. But ram is always ram > and ram always has a fixed (albeit non-linear) mapping within a > guest. We can probably be smarter in qemu. > > There are areas of MMIO/ROM address space that *sometimes* end up > behaving like ram, but that's a different case. The one other case > to consider is ram hot add/remove in which case, ram may be removed > or added (but it's location will never change during its lifetime). > > Here's what I'll propose, and I'd really like to hear what Paul > think about it before we start down this path. > > I think we should add a new API that's: > > void cpu_ram_add(target_phys_addr_t start, ram_addr_t size); > > This API would do two things. It would call qemu_ram_alloc() and > cpu_register_physical_memory() just as code does today. It would > also add this region into a new table. > > There would be: > > void *cpu_ram_map(target_phys_addr_t start, ram_addr_t *size); > void cpu_ram_unmap(void *mem); > > These calls would use this new table to lookup ram addresses. These > mappings are valid as long as the guest is executed. Within the > table, each region would have a reference count. 
When it comes time > to do hot add/remove, we would wait to remove a region until the > reference count went to zero to avoid unmapping during DMA. > > cpu_ram_add() never gets called with overlapping regions. We'll > modify cpu_register_physical_memory() to ensure that a ram mapping > is never changed after initial registration. What is the difference between your proposal and cpu_physical_memory_map? What i'd like to see is binding between cpu_physical_memory_map and qdev devices, so that you can use different host memory mappings for device context and for CPU context (and provide the possibility for, say, map a certain memory region as read-only). > vhost no longer needs to bother keeping the dynamic table up to date > so it removes all of the slot management code from vhost. KVM still > needs the code to handle rom/ram mappings but we can take care of > that next. virtio-net's userspace code can do the same thing as > vhost and only map the ring once which should be a big performance > improvement. Pinning the host virtual addresses as you propose reduces flexibility. See above about different mappings for DMA/CPU contexes.
On Tue, Mar 02, 2010 at 10:12:05AM -0600, Anthony Liguori wrote: > On 03/02/2010 09:53 AM, Paul Brook wrote: > >>>>The key difference is that these regions are created and destroyed > >>>>rarely and in such a way that the destruction is visible to the guest. > >>>So you're making ram unmap an asynchronous process, and requiring that > >>>the address space not be reused until that umap has completed? > >>It technically already would be. If you've got a pending DMA > >>transaction and you try to hot unplug badness will happen. This is > >>something that is certainly exploitable. > >Hmm, I guess we probably want to make this work with all mappings then. DMA to > >a ram backed PCI BAR (e.g. video ram) is certainly feasible. > >Technically it's not the unmap that causes badness, it's freeing the > >underlying ram. > > Let's avoid confusing terminology. We have RAM mappings and then we > have PCI BARs that are mapped as IO_MEM_RAM. > > PCI BARs mapped as IO_MEM_RAM are allocated by the device and live > for the duration of the device. If you did something that changed > the BAR's mapping from IO_MEM_RAM to an actual IO memory type, then > you'd continue to DMA to the allocated device memory instead of > doing MMIO operations.[1] > > That's completely accurate and safe. If you did this to bare metal, > I expect you'd get very similar results. > > This is different from DMA'ing to a RAM region and then removing the > RAM region while the IO is in flight. In this case, the mapping > disappears and you potentially have the guest writing to an invalid > host pointer. > > [1] I don't think it's useful to support DMA'ing to arbitrary > IO_MEM_RAM areas. Instead, I think we should always bounce to this > memory. The benefit is that we avoid the complications resulting > from PCI hot unplug and reference counting. Agree. Thus the suggestion to tie cpu_physical_memory_map to qdev infrastructure.
On 03/02/2010 10:12 AM, Marcelo Tosatti wrote: > On Sun, Feb 28, 2010 at 02:57:56PM -0600, Anthony Liguori wrote: > >> On 02/28/2010 11:19 AM, Michael S. Tsirkin wrote: >> >>>> Both have security implications so I think it's important that they >>>> be addressed. Otherwise, I'm pretty happy with how things are. >>>> >>> Care suggesting some solutions? >>> >> The obvious thing to do would be to use the memory notifier in vhost >> to keep track of whenever something remaps the ring's memory region >> and if that happens, issue an ioctl to vhost to change the location >> of the ring. Also, you would need to merge the vhost slot >> management code with the KVM slot management code. >> > There are no security implications as long as vhost uses the qemu > process mappings. > There potentially are within a guest. If a guest can trigger a qemu bug that results in qemu writing to a different location than what the guest told it to write, malicious software may use this to escalate its privileges within a guest. >> cpu_ram_add() never gets called with overlapping regions. We'll >> modify cpu_register_physical_memory() to ensure that a ram mapping >> is never changed after initial registration. >> > What is the difference between your proposal and > cpu_physical_memory_map? > cpu_physical_memory_map() has the following semantics: - it always returns a transient mapping - it may (transparently) bounce - it may fail to bounce, caller must deal The new function I'm proposing has the following semantics: - it always returns a persistent mapping - it never bounces - it will only fail if the mapping isn't ram A caller can use the new function to implement an optimization to force the device to only work with real ram. IOW, this is something we can use in virtio, but very little else. cpu_physical_memory_map can be used in more circumstances. 
> What i'd like to see is binding between cpu_physical_memory_map and qdev > devices, so that you can use different host memory mappings for device > context and for CPU context (and provide the possibility for, say, map > a certain memory region as read-only). > We really want per-bus mappings. At the lowest level, we'll have sysbus_memory_map() but we'll also have pci_memory_map(), virtio_memory_map(), etc. Nothing should ever call cpu_physical_memory_map() directly. Regards, Anthony Liguori
On Tue, Mar 02, 2010 at 10:56:48AM -0600, Anthony Liguori wrote: > On 03/02/2010 10:12 AM, Marcelo Tosatti wrote: >> On Sun, Feb 28, 2010 at 02:57:56PM -0600, Anthony Liguori wrote: >> >>> On 02/28/2010 11:19 AM, Michael S. Tsirkin wrote: >>> >>>>> Both have security implications so I think it's important that they >>>>> be addressed. Otherwise, I'm pretty happy with how things are. >>>>> >>>> Care suggesting some solutions? >>>> >>> The obvious thing to do would be to use the memory notifier in vhost >>> to keep track of whenever something remaps the ring's memory region >>> and if that happens, issue an ioctl to vhost to change the location >>> of the ring. Also, you would need to merge the vhost slot >>> management code with the KVM slot management code. >>> >> There are no security implications as long as vhost uses the qemu >> process mappings. >> > > There potentially are within a guest. If a guest can trigger a qemu bug > that results in qemu writing to a different location than what the guest > told it to write, a malicious software may use this to escalate it's > privileges within a guest. If malicious software has access to hardware that does DMA, game is likely over :) >>> cpu_ram_add() never gets called with overlapping regions. We'll >>> modify cpu_register_physical_memory() to ensure that a ram mapping >>> is never changed after initial registration. >>> >> What is the difference between your proposal and >> cpu_physical_memory_map? >> > > cpu_physical_memory_map() has the following semantics: > > - it always returns a transient mapping > - it may (transparently) bounce > - it may fail to bounce, caller must deal > > The new function I'm proposing has the following semantics: > > - it always returns a persistent mapping > - it never bounces > - it will only fail if the mapping isn't ram > > A caller can use the new function to implement an optimization to force > the device to only work with real ram. 
IOW, this is something we can > use in virtio, but very little else. cpu_physical_memory_map can be > used in more circumstances. > >> What i'd like to see is binding between cpu_physical_memory_map and qdev >> devices, so that you can use different host memory mappings for device >> context and for CPU context (and provide the possibility for, say, map >> a certain memory region as read-only). >> > > We really want per-bus mappings. At the lowest level, we'll have > sysbus_memory_map() but we'll also have pci_memory_map(), > virtio_memory_map(), etc. > > Nothing should ever call cpu_physical_memory_map() directly. > > Regards, > > Anthony Liguori
On Tue, Mar 02, 2010 at 10:56:48AM -0600, Anthony Liguori wrote: > On 03/02/2010 10:12 AM, Marcelo Tosatti wrote: > >On Sun, Feb 28, 2010 at 02:57:56PM -0600, Anthony Liguori wrote: > >>On 02/28/2010 11:19 AM, Michael S. Tsirkin wrote: > >>>>Both have security implications so I think it's important that they > >>>>be addressed. Otherwise, I'm pretty happy with how things are. > >>>Care suggesting some solutions? > >>The obvious thing to do would be to use the memory notifier in vhost > >>to keep track of whenever something remaps the ring's memory region > >>and if that happens, issue an ioctl to vhost to change the location > >>of the ring. Also, you would need to merge the vhost slot > >>management code with the KVM slot management code. > >There are no security implications as long as vhost uses the qemu > >process mappings. > > There potentially are within a guest. If a guest can trigger a qemu > bug that results in qemu writing to a different location than what > the guest told it to write, a malicious software may use this to > escalate it's privileges within a guest. > > >>cpu_ram_add() never gets called with overlapping regions. We'll > >>modify cpu_register_physical_memory() to ensure that a ram mapping > >>is never changed after initial registration. > >What is the difference between your proposal and > >cpu_physical_memory_map? > > cpu_physical_memory_map() has the following semantics: > > - it always returns a transient mapping > - it may (transparently) bounce > - it may fail to bounce, caller must deal > > The new function I'm proposing has the following semantics: What exactly are the purposes of the new function? > - it always returns a persistent mapping For hotplug? What exactly you mean persistent? > - it never bounces > - it will only fail if the mapping isn't ram > > A caller can use the new function to implement an optimization to > force the device to only work with real ram. To bypass the address translation in exec.c? 
> IOW, this is something we can use in virtio, but very little else. > cpu_physical_memory_map can be used in more circumstances. Does not make much sense to me. The qdev <-> memory map mapping seems more important. Following your security enhancement drive, you can for example check whether the region can actually be mapped by the device and deny otherwise, or do whatever host-side memory protection tricks you'd like. And "cpu_ram_map" seems like the memory is accessed through cpu context, while it is really always device context. > >What i'd like to see is binding between cpu_physical_memory_map and qdev > >devices, so that you can use different host memory mappings for device > >context and for CPU context (and provide the possibility for, say, map > >a certain memory region as read-only). > > We really want per-bus mappings. At the lowest level, we'll have > sysbus_memory_map() but we'll also have pci_memory_map(), > virtio_memory_map(), etc. > > Nothing should ever call cpu_physical_memory_map() directly. Yep. > > Regards, > > Anthony Liguori >
On 03/02/2010 12:00 PM, Marcelo Tosatti wrote: >> - it always returns a transient mapping >> - it may (transparently) bounce >> - it may fail to bounce, caller must deal >> >> The new function I'm proposing has the following semantics: >> > What exactly are the purposes of the new function? > We need an API that can be used to obtain a persistent mapping of a given guest physical address. We don't have that today. We can potentially change cpu_physical_memory_map() to also accommodate this use case but that's a second step. >> - it always returns a persistent mapping >> > For hotplug? What exactly you mean persistent? > Hotplug cannot happen as long as a persistent mapping exists for an address within that region. This is okay. You cannot have an active device driver DMA'ing to a DIMM and then hot unplug it. The guest is responsible for making sure this doesn't happen. >> - it never bounces >> - it will only fail if the mapping isn't ram >> >> A caller can use the new function to implement an optimization to >> force the device to only work with real ram. >> > To bypass the address translation in exec.c? > No, the ultimate goal is to convert virtio ring accesses from: static inline uint32_t vring_desc_len(target_phys_addr_t desc_pa, int i) { target_phys_addr_t pa; pa = desc_pa + sizeof(VRingDesc) * i + offsetof(VRingDesc, len); return ldl_phys(pa); } len = vring_desc_len(vring.desc_pa, i) To: len = ldl_w(vring->desc[i].len); When host == arch, ldl_w is a nop. Otherwise, it's just a byte swap. ldl_phys() today turns into cpu_physical_memory_read() which is very slow. To support this, we must enforce that when a guest passes us a physical address, we can safely obtain a persistent mapping to it. This is true for any ram address. It's not true for MMIO memory. We have no way to do this with cpu_physical_memory_map(). >> IOW, this is something we can use in virtio, but very little else. >> cpu_physical_memory_map can be used in more circumstances. 
>> > Does not make much sense to me. The qdev <-> memory map mapping seems > more important. Following your security enhancement drive, you can for > example check whether the region can actually be mapped by the device > and deny otherwise, or do whatever host-side memory protection tricks > you'd like. > It's two independent things. Part of what makes virtio so complicated to convert to proper bus accessors is its use of ldl_phys/stl_phys/etc. No other device uses those functions. If we reduce virtio to just use a map() function, it simplifies the bus accessor conversion. > And "cpu_ram_map" seems like the memory is accessed through cpu context, > while it is really always device context. > Yes, but that's a separate effort. In fact, see http://wiki.qemu.org/Features/RamAPI vs. http://wiki.qemu.org/Features/PCIMemoryAPI Regards, Anthony Liguori
> The new function I'm proposing has the following semantics: > > - it always returns a persistent mapping > - it never bounces > - it will only fail if the mapping isn't ram So you're assuming that virtio rings are in ram that is not hot-pluggable or remapable, and the whole region is contiguous? That sounds like it's likely to come back and bite you. The guest has no idea which areas of ram happen to be contiguous on the host. Paul
On 03/02/2010 04:41 PM, Paul Brook wrote: >> The new function I'm proposing has the following semantics: >> >> - it always returns a persistent mapping >> - it never bounces >> - it will only fail if the mapping isn't ram >> > So you're assuming that virtio rings are in ram that is not hot-pluggable As long as the device is active, yes. This would be true with bare metal. Memory allocated for the virtio-pci ring is not reclaimable and you have to be able to reclaim the entire area of ram covered by a DIMM to hot unplug it. A user would have to unload the virtio-pci module to release the memory before hot unplug would be an option. NB, almost nothing supports memory hot remove because it's very difficult for an OS to actually do. > or > remapable, Yes, it cannot be remapable. > and the whole region is contiguous? > Yes, it has to be contiguous. > That sounds like it's likely to come back and bite you. The guest has no idea > which areas of ram happen to be contiguous on the host. > Practically speaking, with target-i386 anything that is contiguous in guest physical memory is contiguous in the host address space provided it's ram. These assumptions are important. I have a local branch (that I'll send out soon) that implements a ram API and converted virtio to make use of it. I'm seeing a ~50% increase in tx throughput. If you try to support discontiguous, remapable ram for the virtio ring, then you have to do an l1_phys_map lookup for every single ring variable access followed by a memory copy. This ends up costing an awful lot practically speaking. The changes should work equally well with syborg although I don't think I can do meaningful performance testing there (I don't actually have a syborg image to test). Regards, Anthony Liguori > Paul >
> > That sounds like it's likely to come back and bite you. The guest has no > > idea which areas of ram happen to be contiguous on the host. > > Practically speaking, with target-i386 anything that is contiguous in > guest physical memory is contiguous in the host address space provided > it's ram. > > These assumptions are important. I have a local branch (that I'll send > out soon) that implements a ram API and converted virtio to make use of > it. I'm seeing a ~50% increase in tx throughput. IMO supporting discontiguous regions is a requirement. target-i386 might get away with contiguous memory, because it omits most of the board level details. For everything else I'd expect this to be a real problem. I'm not happy about the not-remapable assumption either. In my experience this is fairly common. In fact many real x86 machines have this capability (to workaround the 4G PCI hole). By my reading the ppc440_bamboo board fails both your assumptions. I imagine the KVM-PPC folks would be upset if you decided that virtio no longer worked on this board. This is all somewhat disappointing, given virtio is supposed to be a DMA based architecture, and not rely on shared memory semantics. Paul
On Wed, Mar 03, 2010 at 08:15:15AM -0600, Anthony Liguori wrote: > On 03/02/2010 04:41 PM, Paul Brook wrote: > >>The new function I'm proposing has the following semantics: > >> > >>- it always returns a persistent mapping > >>- it never bounces > >>- it will only fail if the mapping isn't ram > >So you're assuming that virtio rings are in ram that is not hot-pluggable > > As long as the device is active, yes. This would be true with bare > metal. Memory allocated for the virtio-pci ring is not reclaimable > and you have to be able to reclaim the entire area of ram covered by > a DIMM to hot unplug it. A user would have to unload the virtio-pci > module to release the memory before hot unplug would be an option. > > NB, almost nothing supports memory hot remove because it's very > difficult for an OS to actually do. > > > or > >remapable, > > Yes, it cannot be remapable. > > > and the whole region is contiguous? > > Yes, it has to be contiguous. > > >That sounds like it's likely to come back and bite you. The guest has no idea > >which areas of ram happen to be contiguous on the host. > > Practically speaking, with target-i386 anything that is contiguous > in guest physical memory is contiguous in the host address space > provided it's ram. > > These assumptions are important. I have a local branch (that I'll > send out soon) that implements a ram API and converted virtio to > make use of it. I'm seeing a ~50% increase in tx throughput. > > If you try to support discontiguous, remapable ram for the virtio > ring, then you have to do an l1_phys_map lookup for every single > ring variable access followed by a memory copy. This ends up > costing an awful lot practically speaking. Speed up the lookup instead? > > The changes should work equally well with syborg although I don't > think I can do meaningful performance testing there (I don't > actually have a syborg image to test). > > Regards, > > Anthony Liguori > > >Paul
diff --git a/net.c b/net.c index a1bf49f..d1e23f1 100644 --- a/net.c +++ b/net.c @@ -973,6 +973,14 @@ static const struct { .name = "vnet_hdr", .type = QEMU_OPT_BOOL, .help = "enable the IFF_VNET_HDR flag on the tap interface" + }, { + .name = "vhost", + .type = QEMU_OPT_BOOL, + .help = "enable vhost-net network accelerator", + }, { + .name = "vhostfd", + .type = QEMU_OPT_STRING, + .help = "file descriptor of an already opened vhost net device", }, #endif /* _WIN32 */ { /* end of list */ } diff --git a/net/tap.c b/net/tap.c index fc59fd4..65797ef 100644 --- a/net/tap.c +++ b/net/tap.c @@ -41,6 +41,8 @@ #include "net/tap-linux.h" +#include "hw/vhost_net.h" + /* Maximum GSO packet size (64k) plus plenty of room for * the ethernet and virtio_net headers */ @@ -57,6 +59,7 @@ typedef struct TAPState { unsigned int has_vnet_hdr : 1; unsigned int using_vnet_hdr : 1; unsigned int has_ufo: 1; + struct vhost_net *vhost_net; } TAPState; static int launch_script(const char *setup_script, const char *ifname, int fd); @@ -252,6 +255,10 @@ static void tap_cleanup(VLANClientState *nc) { TAPState *s = DO_UPCAST(TAPState, nc, nc); + if (s->vhost_net) { + vhost_net_cleanup(s->vhost_net); + } + qemu_purge_queued_packets(nc); if (s->down_script[0]) @@ -307,6 +314,7 @@ static TAPState *net_tap_fd_init(VLANState *vlan, s->has_ufo = tap_probe_has_ufo(s->fd); tap_set_offload(&s->nc, 0, 0, 0, 0, 0); tap_read_poll(s, 1); + s->vhost_net = NULL; return s; } @@ -456,5 +464,30 @@ int net_init_tap(QemuOpts *opts, Monitor *mon, const char *name, VLANState *vlan } } + if (qemu_opt_get_bool(opts, "vhost", !!qemu_opt_get(opts, "vhostfd"))) { + int vhostfd, r; + if (qemu_opt_get(opts, "vhostfd")) { + r = net_handle_fd_param(mon, qemu_opt_get(opts, "vhostfd")); + if (r == -1) { + return -1; + } + vhostfd = r; + } else { + vhostfd = -1; + } + s->vhost_net = vhost_net_init(&s->nc, vhostfd); + if (!s->vhost_net) { + qemu_error("vhost-net requested but could not be initialized\n"); + return -1; + } + } 
else if (qemu_opt_get(opts, "vhostfd")) { + qemu_error("vhostfd= is not valid without vhost\n"); + return -1; + } + + if (vlan) { + vlan->nb_host_devs++; + } + return 0; } diff --git a/qemu-options.hx b/qemu-options.hx index f53922f..1850906 100644 --- a/qemu-options.hx +++ b/qemu-options.hx @@ -879,7 +879,7 @@ DEF("net", HAS_ARG, QEMU_OPTION_net, "-net tap[,vlan=n][,name=str],ifname=name\n" " connect the host TAP network interface to VLAN 'n'\n" #else - "-net tap[,vlan=n][,name=str][,fd=h][,ifname=name][,script=file][,downscript=dfile][,sndbuf=nbytes][,vnet_hdr=on|off]\n" + "-net tap[,vlan=n][,name=str][,fd=h][,ifname=name][,script=file][,downscript=dfile][,sndbuf=nbytes][,vnet_hdr=on|off][,vhost=on|off][,vhostfd=h]\n" " connect the host TAP network interface to VLAN 'n' and use the\n" " network scripts 'file' (default=" DEFAULT_NETWORK_SCRIPT ")\n" " and 'dfile' (default=" DEFAULT_NETWORK_DOWN_SCRIPT ")\n" @@ -889,6 +889,8 @@ DEF("net", HAS_ARG, QEMU_OPTION_net, " default of 'sndbuf=1048576' can be disabled using 'sndbuf=0')\n" " use vnet_hdr=off to avoid enabling the IFF_VNET_HDR tap flag\n" " use vnet_hdr=on to make the lack of IFF_VNET_HDR support an error condition\n" + " use vhost=on to enable experimental in kernel accelerator\n" + " use 'vhostfd=h' to connect to an already opened vhost net device\n" #endif "-net socket[,vlan=n][,name=str][,fd=h][,listen=[host]:port][,connect=host:port]\n" " connect the vlan 'n' to another VLAN using a socket connection\n"
This adds vhost binary option to tap, to enable vhost net accelerator. Default is off for now, we'll be able to make default on long term when we know it's stable. vhostfd option can be used by management, to pass in the fd. Assigning vhostfd implies vhost=on. Signed-off-by: Michael S. Tsirkin <mst@redhat.com> --- net.c | 8 ++++++++ net/tap.c | 33 +++++++++++++++++++++++++++++++++ qemu-options.hx | 4 +++- 3 files changed, 44 insertions(+), 1 deletions(-)