Patchwork [PATCHv2,10/12] tap: add vhost/vhostfd options

login
register
mail settings
Submitter Michael S. Tsirkin
Date Feb. 25, 2010, 6:28 p.m.
Message ID <886ef6ffeb6748f6dc4fe5431f71cb12bb74edc9.1267122331.git.mst@redhat.com>
Download mbox | patch
Permalink /patch/46266/
State New
Headers show

Comments

Michael S. Tsirkin - Feb. 25, 2010, 6:28 p.m.
This adds vhost binary option to tap, to enable vhost net accelerator.
Default is off for now, we'll be able to make default on long term
when we know it's stable.

vhostfd option can be used by management, to pass in the fd. Assigning
vhostfd implies vhost=on.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 net.c           |    8 ++++++++
 net/tap.c       |   33 +++++++++++++++++++++++++++++++++
 qemu-options.hx |    4 +++-
 3 files changed, 44 insertions(+), 1 deletions(-)
Anthony Liguori - Feb. 25, 2010, 7:47 p.m.
On 02/25/2010 12:28 PM, Michael S. Tsirkin wrote:
> This adds vhost binary option to tap, to enable vhost net accelerator.
> Default is off for now, we'll be able to make default on long term
> when we know it's stable.
>
> vhostfd option can be used by management, to pass in the fd. Assigning
> vhostfd implies vhost=on.
>
> Signed-off-by: Michael S. Tsirkin<mst@redhat.com>
>    

Since the thinking these days is that macvtap and tap is pretty much all 
we'll ever need for vhost-net, perhaps we should revisit -net vhost vs. 
-net tap,vhost=X?

I think -net vhost,fd=X makes a lot more sense than -net 
tap,vhost=on,vhostfd=X.

Regards,

Anthony Liguori

> ---
>   net.c           |    8 ++++++++
>   net/tap.c       |   33 +++++++++++++++++++++++++++++++++
>   qemu-options.hx |    4 +++-
>   3 files changed, 44 insertions(+), 1 deletions(-)
>
> diff --git a/net.c b/net.c
> index a1bf49f..d1e23f1 100644
> --- a/net.c
> +++ b/net.c
> @@ -973,6 +973,14 @@ static const struct {
>                   .name = "vnet_hdr",
>                   .type = QEMU_OPT_BOOL,
>                   .help = "enable the IFF_VNET_HDR flag on the tap interface"
> +            }, {
> +                .name = "vhost",
> +                .type = QEMU_OPT_BOOL,
> +                .help = "enable vhost-net network accelerator",
> +            }, {
> +                .name = "vhostfd",
> +                .type = QEMU_OPT_STRING,
> +                .help = "file descriptor of an already opened vhost net device",
>               },
>   #endif /* _WIN32 */
>               { /* end of list */ }
> diff --git a/net/tap.c b/net/tap.c
> index fc59fd4..65797ef 100644
> --- a/net/tap.c
> +++ b/net/tap.c
> @@ -41,6 +41,8 @@
>
>   #include "net/tap-linux.h"
>
> +#include "hw/vhost_net.h"
> +
>   /* Maximum GSO packet size (64k) plus plenty of room for
>    * the ethernet and virtio_net headers
>    */
> @@ -57,6 +59,7 @@ typedef struct TAPState {
>       unsigned int has_vnet_hdr : 1;
>       unsigned int using_vnet_hdr : 1;
>       unsigned int has_ufo: 1;
> +    struct vhost_net *vhost_net;
>   } TAPState;
>
>   static int launch_script(const char *setup_script, const char *ifname, int fd);
> @@ -252,6 +255,10 @@ static void tap_cleanup(VLANClientState *nc)
>   {
>       TAPState *s = DO_UPCAST(TAPState, nc, nc);
>
> +    if (s->vhost_net) {
> +        vhost_net_cleanup(s->vhost_net);
> +    }
> +
>       qemu_purge_queued_packets(nc);
>
>       if (s->down_script[0])
> @@ -307,6 +314,7 @@ static TAPState *net_tap_fd_init(VLANState *vlan,
>       s->has_ufo = tap_probe_has_ufo(s->fd);
>       tap_set_offload(&s->nc, 0, 0, 0, 0, 0);
>       tap_read_poll(s, 1);
> +    s->vhost_net = NULL;
>       return s;
>   }
>
> @@ -456,5 +464,30 @@ int net_init_tap(QemuOpts *opts, Monitor *mon, const char *name, VLANState *vlan
>           }
>       }
>
> +    if (qemu_opt_get_bool(opts, "vhost", !!qemu_opt_get(opts, "vhostfd"))) {
> +        int vhostfd, r;
> +        if (qemu_opt_get(opts, "vhostfd")) {
> +            r = net_handle_fd_param(mon, qemu_opt_get(opts, "vhostfd"));
> +            if (r == -1) {
> +                return -1;
> +            }
> +            vhostfd = r;
> +        } else {
> +            vhostfd = -1;
> +        }
> +        s->vhost_net = vhost_net_init(&s->nc, vhostfd);
> +        if (!s->vhost_net) {
> +            qemu_error("vhost-net requested but could not be initialized\n");
> +            return -1;
> +        }
> +    } else if (qemu_opt_get(opts, "vhostfd")) {
> +        qemu_error("vhostfd= is not valid without vhost\n");
> +        return -1;
> +    }
> +
> +    if (vlan) {
> +        vlan->nb_host_devs++;
> +    }
> +
>       return 0;
>   }
> diff --git a/qemu-options.hx b/qemu-options.hx
> index f53922f..1850906 100644
> --- a/qemu-options.hx
> +++ b/qemu-options.hx
> @@ -879,7 +879,7 @@ DEF("net", HAS_ARG, QEMU_OPTION_net,
>       "-net tap[,vlan=n][,name=str],ifname=name\n"
>       "                connect the host TAP network interface to VLAN 'n'\n"
>   #else
> -    "-net tap[,vlan=n][,name=str][,fd=h][,ifname=name][,script=file][,downscript=dfile][,sndbuf=nbytes][,vnet_hdr=on|off]\n"
> +    "-net tap[,vlan=n][,name=str][,fd=h][,ifname=name][,script=file][,downscript=dfile][,sndbuf=nbytes][,vnet_hdr=on|off][,vhost=on|off][,vhostfd=h]\n"
>       "                connect the host TAP network interface to VLAN 'n' and use the\n"
>       "                network scripts 'file' (default=" DEFAULT_NETWORK_SCRIPT ")\n"
>       "                and 'dfile' (default=" DEFAULT_NETWORK_DOWN_SCRIPT ")\n"
> @@ -889,6 +889,8 @@ DEF("net", HAS_ARG, QEMU_OPTION_net,
>       "                default of 'sndbuf=1048576' can be disabled using 'sndbuf=0')\n"
>       "                use vnet_hdr=off to avoid enabling the IFF_VNET_HDR tap flag\n"
>       "                use vnet_hdr=on to make the lack of IFF_VNET_HDR support an error condition\n"
> +    "                use vhost=on to enable experimental in kernel accelerator\n"
> +    "                use 'vhostfd=h' to connect to an already opened vhost net device\n"
>   #endif
>       "-net socket[,vlan=n][,name=str][,fd=h][,listen=[host]:port][,connect=host:port]\n"
>       "                connect the vlan 'n' to another VLAN using a socket connection\n"
>
Michael S. Tsirkin - Feb. 26, 2010, 2:51 p.m.
On Thu, Feb 25, 2010 at 01:47:27PM -0600, Anthony Liguori wrote:
> On 02/25/2010 12:28 PM, Michael S. Tsirkin wrote:
>> This adds vhost binary option to tap, to enable vhost net accelerator.
>> Default is off for now, we'll be able to make default on long term
>> when we know it's stable.
>>
>> vhostfd option can be used by management, to pass in the fd. Assigning
>> vhostfd implies vhost=on.
>>
>> Signed-off-by: Michael S. Tsirkin<mst@redhat.com>
>>    
>
> Since the thinking these days is that macvtap and tap is pretty much all  
> we'll ever need for vhost-net, perhaps we should revisit -net vhost vs.  
> -net tap,vhost=X?
>
> I think -net vhost,fd=X makes a lot more sense than -net  
> tap,vhost=on,vhostfd=X.
>
> Regards,
>
> Anthony Liguori

We'll have to duplicate all tap options.
I think long term we will just make vhost=on the default.
Users do not really care about vhost, it just makes tap
go fater. So promoting it to 1st class options is wrong IMO.

>> ---
>>   net.c           |    8 ++++++++
>>   net/tap.c       |   33 +++++++++++++++++++++++++++++++++
>>   qemu-options.hx |    4 +++-
>>   3 files changed, 44 insertions(+), 1 deletions(-)
>>
>> diff --git a/net.c b/net.c
>> index a1bf49f..d1e23f1 100644
>> --- a/net.c
>> +++ b/net.c
>> @@ -973,6 +973,14 @@ static const struct {
>>                   .name = "vnet_hdr",
>>                   .type = QEMU_OPT_BOOL,
>>                   .help = "enable the IFF_VNET_HDR flag on the tap interface"
>> +            }, {
>> +                .name = "vhost",
>> +                .type = QEMU_OPT_BOOL,
>> +                .help = "enable vhost-net network accelerator",
>> +            }, {
>> +                .name = "vhostfd",
>> +                .type = QEMU_OPT_STRING,
>> +                .help = "file descriptor of an already opened vhost net device",
>>               },
>>   #endif /* _WIN32 */
>>               { /* end of list */ }
>> diff --git a/net/tap.c b/net/tap.c
>> index fc59fd4..65797ef 100644
>> --- a/net/tap.c
>> +++ b/net/tap.c
>> @@ -41,6 +41,8 @@
>>
>>   #include "net/tap-linux.h"
>>
>> +#include "hw/vhost_net.h"
>> +
>>   /* Maximum GSO packet size (64k) plus plenty of room for
>>    * the ethernet and virtio_net headers
>>    */
>> @@ -57,6 +59,7 @@ typedef struct TAPState {
>>       unsigned int has_vnet_hdr : 1;
>>       unsigned int using_vnet_hdr : 1;
>>       unsigned int has_ufo: 1;
>> +    struct vhost_net *vhost_net;
>>   } TAPState;
>>
>>   static int launch_script(const char *setup_script, const char *ifname, int fd);
>> @@ -252,6 +255,10 @@ static void tap_cleanup(VLANClientState *nc)
>>   {
>>       TAPState *s = DO_UPCAST(TAPState, nc, nc);
>>
>> +    if (s->vhost_net) {
>> +        vhost_net_cleanup(s->vhost_net);
>> +    }
>> +
>>       qemu_purge_queued_packets(nc);
>>
>>       if (s->down_script[0])
>> @@ -307,6 +314,7 @@ static TAPState *net_tap_fd_init(VLANState *vlan,
>>       s->has_ufo = tap_probe_has_ufo(s->fd);
>>       tap_set_offload(&s->nc, 0, 0, 0, 0, 0);
>>       tap_read_poll(s, 1);
>> +    s->vhost_net = NULL;
>>       return s;
>>   }
>>
>> @@ -456,5 +464,30 @@ int net_init_tap(QemuOpts *opts, Monitor *mon, const char *name, VLANState *vlan
>>           }
>>       }
>>
>> +    if (qemu_opt_get_bool(opts, "vhost", !!qemu_opt_get(opts, "vhostfd"))) {
>> +        int vhostfd, r;
>> +        if (qemu_opt_get(opts, "vhostfd")) {
>> +            r = net_handle_fd_param(mon, qemu_opt_get(opts, "vhostfd"));
>> +            if (r == -1) {
>> +                return -1;
>> +            }
>> +            vhostfd = r;
>> +        } else {
>> +            vhostfd = -1;
>> +        }
>> +        s->vhost_net = vhost_net_init(&s->nc, vhostfd);
>> +        if (!s->vhost_net) {
>> +            qemu_error("vhost-net requested but could not be initialized\n");
>> +            return -1;
>> +        }
>> +    } else if (qemu_opt_get(opts, "vhostfd")) {
>> +        qemu_error("vhostfd= is not valid without vhost\n");
>> +        return -1;
>> +    }
>> +
>> +    if (vlan) {
>> +        vlan->nb_host_devs++;
>> +    }
>> +
>>       return 0;
>>   }
>> diff --git a/qemu-options.hx b/qemu-options.hx
>> index f53922f..1850906 100644
>> --- a/qemu-options.hx
>> +++ b/qemu-options.hx
>> @@ -879,7 +879,7 @@ DEF("net", HAS_ARG, QEMU_OPTION_net,
>>       "-net tap[,vlan=n][,name=str],ifname=name\n"
>>       "                connect the host TAP network interface to VLAN 'n'\n"
>>   #else
>> -    "-net tap[,vlan=n][,name=str][,fd=h][,ifname=name][,script=file][,downscript=dfile][,sndbuf=nbytes][,vnet_hdr=on|off]\n"
>> +    "-net tap[,vlan=n][,name=str][,fd=h][,ifname=name][,script=file][,downscript=dfile][,sndbuf=nbytes][,vnet_hdr=on|off][,vhost=on|off][,vhostfd=h]\n"
>>       "                connect the host TAP network interface to VLAN 'n' and use the\n"
>>       "                network scripts 'file' (default=" DEFAULT_NETWORK_SCRIPT ")\n"
>>       "                and 'dfile' (default=" DEFAULT_NETWORK_DOWN_SCRIPT ")\n"
>> @@ -889,6 +889,8 @@ DEF("net", HAS_ARG, QEMU_OPTION_net,
>>       "                default of 'sndbuf=1048576' can be disabled using 'sndbuf=0')\n"
>>       "                use vnet_hdr=off to avoid enabling the IFF_VNET_HDR tap flag\n"
>>       "                use vnet_hdr=on to make the lack of IFF_VNET_HDR support an error condition\n"
>> +    "                use vhost=on to enable experimental in kernel accelerator\n"
>> +    "                use 'vhostfd=h' to connect to an already opened vhost net device\n"
>>   #endif
>>       "-net socket[,vlan=n][,name=str][,fd=h][,listen=[host]:port][,connect=host:port]\n"
>>       "                connect the vlan 'n' to another VLAN using a socket connection\n"
>>
Anthony Liguori - Feb. 26, 2010, 3:23 p.m.
On 02/26/2010 08:51 AM, Michael S. Tsirkin wrote:
> On Thu, Feb 25, 2010 at 01:47:27PM -0600, Anthony Liguori wrote:
>    
>> On 02/25/2010 12:28 PM, Michael S. Tsirkin wrote:
>>      
>>> This adds vhost binary option to tap, to enable vhost net accelerator.
>>> Default is off for now, we'll be able to make default on long term
>>> when we know it's stable.
>>>
>>> vhostfd option can be used by management, to pass in the fd. Assigning
>>> vhostfd implies vhost=on.
>>>
>>> Signed-off-by: Michael S. Tsirkin<mst@redhat.com>
>>>
>>>        
>> Since the thinking these days is that macvtap and tap is pretty much all
>> we'll ever need for vhost-net, perhaps we should revisit -net vhost vs.
>> -net tap,vhost=X?
>>
>> I think -net vhost,fd=X makes a lot more sense than -net
>> tap,vhost=on,vhostfd=X.
>>
>> Regards,
>>
>> Anthony Liguori
>>      
> We'll have to duplicate all tap options.
> I think long term we will just make vhost=on the default.
>    

I don't think we can.  vhost only works when using KVM and it doesn't 
support all of the features of userspace virtio.  Since it's in upstream 
Linux without supporting all of the virtio-net features, it's something 
we're going to have to deal with for a long time.

Furthermore, vhost reduces a virtual machine's security.  It offers an 
impressive performance boost (particularly when dealing with 10gbit+ 
networking) but for a user that doesn't have such strong networking 
performance requirements, I think it's reasonable for them to not want 
to make a security trade off.

One reason I like -net vhost is that it's a much less obscure syntax and 
it's the sort of thing that is easy to tell users that they should use.  
I understand you're argument for -net tap if you assume vhost=on will 
become the default because that means that users never really have to be 
aware of vhost once it becomes the default.  But as I said above, I 
don't think it's reasonable to make it on by default with -net tap.

> Users do not really care about vhost, it just makes tap
> go fater. So promoting it to 1st class options is wrong IMO.
>    

User's should care about vhost because it impacts the features supported 
by the virtual machine and it has security ramifications.  It's a great 
feature and I think the most users will want to use it, but I do think 
it's something that users ought to be aware of.

Regards,

Anthony Liguori
Michael S. Tsirkin - Feb. 27, 2010, 7:44 p.m.
On Fri, Feb 26, 2010 at 09:23:01AM -0600, Anthony Liguori wrote:
> On 02/26/2010 08:51 AM, Michael S. Tsirkin wrote:
>> On Thu, Feb 25, 2010 at 01:47:27PM -0600, Anthony Liguori wrote:
>>    
>>> On 02/25/2010 12:28 PM, Michael S. Tsirkin wrote:
>>>      
>>>> This adds vhost binary option to tap, to enable vhost net accelerator.
>>>> Default is off for now, we'll be able to make default on long term
>>>> when we know it's stable.
>>>>
>>>> vhostfd option can be used by management, to pass in the fd. Assigning
>>>> vhostfd implies vhost=on.
>>>>
>>>> Signed-off-by: Michael S. Tsirkin<mst@redhat.com>
>>>>
>>>>        
>>> Since the thinking these days is that macvtap and tap is pretty much all
>>> we'll ever need for vhost-net, perhaps we should revisit -net vhost vs.
>>> -net tap,vhost=X?
>>>
>>> I think -net vhost,fd=X makes a lot more sense than -net
>>> tap,vhost=on,vhostfd=X.
>>>
>>> Regards,
>>>
>>> Anthony Liguori
>>>      
>> We'll have to duplicate all tap options.
>> I think long term we will just make vhost=on the default.
>>    
>
> I don't think we can.  vhost only works when using KVM

Yes, default to on with KVM.

> and it doesn't  
> support all of the features of userspace virtio.  Since it's in upstream  
> Linux without supporting all of the virtio-net features, it's something  
> we're going to have to deal with for a long time.

Speaking of vlan filtering etc?  It's just a matter of time before it
supports all interesting features. Kernel support is there in net-next
already, userspace should be easy too. I should be able to code it up
once I finish bothering about upstream merge (hint hint :)).

> Furthermore, vhost reduces a virtual machine's security.  It offers an  
> impressive performance boost (particularly when dealing with 10gbit+  
> networking) but for a user that doesn't have such strong networking  
> performance requirements, I think it's reasonable for them to not want  
> to make a security trade off.

It's hard for me to see how it reduces VM security. If it does, it's
not by design and will be fixed.

> One reason I like -net vhost is that it's a much less obscure syntax and  
> it's the sort of thing that is easy to tell users that they should use.   
> I understand you're argument for -net tap if you assume vhost=on will  
> become the default because that means that users never really have to be  
> aware of vhost once it becomes the default.  But as I said above, I  
> don't think it's reasonable to make it on by default with -net tap.

Not yet, but we'll get there.

>> Users do not really care about vhost, it just makes tap
>> go fater. So promoting it to 1st class options is wrong IMO.
>>    
>
> User's should care about vhost because it impacts the features supported  
> by the virtual machine and it has security ramifications.  It's a great  
> feature and I think the most users will want to use it, but I do think  
> it's something that users ought to be aware of.
>
> Regards,
>
> Anthony Liguori
Anthony Liguori - Feb. 28, 2010, 4:08 p.m.
On 02/27/2010 01:44 PM, Michael S. Tsirkin wrote:
>> and it doesn't
>> support all of the features of userspace virtio.  Since it's in upstream
>> Linux without supporting all of the virtio-net features, it's something
>> we're going to have to deal with for a long time.
>>      
> Speaking of vlan filtering etc?  It's just a matter of time before it
> supports all interesting features. Kernel support is there in net-next
> already, userspace should be easy too. I should be able to code it up
> once I finish bothering about upstream merge (hint hint :)).
>    

:-)  As I've said in the past, I'm willing to live with -net tap,vhost 
but I really think -net vhost would be better in the long run.

The only two real issues I have with the series is the ring address 
mapping stability and the duplicated slot management code.  Both have 
security implications so I think it's important that they be addressed.  
Otherwise, I'm pretty happy with how things are.

>> Furthermore, vhost reduces a virtual machine's security.  It offers an
>> impressive performance boost (particularly when dealing with 10gbit+
>> networking) but for a user that doesn't have such strong networking
>> performance requirements, I think it's reasonable for them to not want
>> to make a security trade off.
>>      
> It's hard for me to see how it reduces VM security. If it does, it's
> not by design and will be fixed.
>    

If you have a bug in vhost-net (would never happen of course) then it's 
a host-kernel exploit whereas if we have a bug in virtio-net userspace, 
it's a local user exploit.  We have a pretty robust architecture to deal 
with local user exploits (qemu can run unprivilieged, SELinux enforces 
mandatory access control) but a host-kernel can not be protected against.

I'm not saying that we should never put things in the kernel, but 
there's definitely a security vs. performance trade off here.

Regards,

Anthony Liguori
Michael S. Tsirkin - Feb. 28, 2010, 5:19 p.m.
On Sun, Feb 28, 2010 at 10:08:26AM -0600, Anthony Liguori wrote:
> On 02/27/2010 01:44 PM, Michael S. Tsirkin wrote:
>>> and it doesn't
>>> support all of the features of userspace virtio.  Since it's in upstream
>>> Linux without supporting all of the virtio-net features, it's something
>>> we're going to have to deal with for a long time.
>>>      
>> Speaking of vlan filtering etc?  It's just a matter of time before it
>> supports all interesting features. Kernel support is there in net-next
>> already, userspace should be easy too. I should be able to code it up
>> once I finish bothering about upstream merge (hint hint :)).
>>    
>
> :-)  As I've said in the past, I'm willing to live with -net tap,vhost  
> but I really think -net vhost would be better in the long run.
>
> The only two real issues I have with the series is the ring address  
> mapping stability

This one I do not yet understand completely to be able solve.  Is the
only case where PCI BAR overlays RAM?  I think this case is best dealt
with by disabling BAR mapping.



> and the duplicated slot management code.

If you look at qemu-kvm, it's even triplicated :) I just would like to
get the code merged, then work at adding more infrastructure to prettify
it.

> Both have  security implications so I think it's important that they
> be addressed.   Otherwise, I'm pretty happy with how things are.

Care suggesting some solutions?

>
>>> Furthermore, vhost reduces a virtual machine's security.  It offers an
>>> impressive performance boost (particularly when dealing with 10gbit+
>>> networking) but for a user that doesn't have such strong networking
>>> performance requirements, I think it's reasonable for them to not want
>>> to make a security trade off.
>>>      
>> It's hard for me to see how it reduces VM security. If it does, it's
>> not by design and will be fixed.
>>    
>
> If you have a bug in vhost-net (would never happen of course) then it's  
> a host-kernel exploit whereas if we have a bug in virtio-net userspace,  
> it's a local user exploit.  We have a pretty robust architecture to deal  
> with local user exploits (qemu can run unprivilieged, SELinux enforces  
> mandatory access control) but a host-kernel can not be protected against.
> 
> I'm not saying that we should never put things in the kernel, but  
> there's definitely a security vs. performance trade off here.
>
> Regards,
>
> Anthony Liguori

Not sure I get the argument completely. Any kernel service with a bug
might be exploited for priveledge escalation. Yes, more kernel code
gives you more attack surface, but given we use rich interfaces such as
ones exposed by kvm, I am not sure by how much.

Also note that vhost net does not take qemu out of the equation for
everything, just for datapath operations.
Anthony Liguori - Feb. 28, 2010, 8:57 p.m.
On 02/28/2010 11:19 AM, Michael S. Tsirkin wrote:
>> Both have  security implications so I think it's important that they
>> be addressed.   Otherwise, I'm pretty happy with how things are.
>>      
> Care suggesting some solutions?
>    

The obvious thing to do would be to use the memory notifier in vhost to 
keep track of whenever something remaps the ring's memory region and if 
that happens, issue an ioctl to vhost to change the location of the 
ring.  Also, you would need to merge the vhost slot management code with 
the KVM slot management code.

I'm sympathetic to your arguments though.  As qemu is today, the above 
is definitely the right thing to do.  But ram is always ram and ram 
always has a fixed (albeit non-linear) mapping within a guest.  We can 
probably be smarter in qemu.

There are areas of MMIO/ROM address space that *sometimes* end up 
behaving like ram, but that's a different case.  The one other case to 
consider is ram hot add/remove in which case, ram may be removed or 
added (but it's location will never change during its lifetime).

Here's what I'll propose, and I'd really like to hear what Paul think 
about it before we start down this path.

I think we should add a new API that's:

void cpu_ram_add(target_phys_addr_t start, ram_addr_t size);

This API would do two things.  It would call qemu_ram_alloc() and 
cpu_register_physical_memory() just as code does today.  It would also 
add this region into a new table.

There would be:

void *cpu_ram_map(target_phys_addr_t start, ram_addr_t *size);
void cpu_ram_unmap(void *mem);

These calls would use this new table to lookup ram addresses.  These 
mappings are valid as long as the guest is executed.  Within the table, 
each region would have a reference count.  When it comes time to do hot 
add/remove, we would wait to remove a region until the reference count 
went to zero to avoid unmapping during DMA.

cpu_ram_add() never gets called with overlapping regions.  We'll modify 
cpu_register_physical_memory() to ensure that a ram mapping is never 
changed after initial registration.

vhost no longer needs to bother keeping the dynamic table up to date so 
it removes all of the slot management code from vhost.  KVM still needs 
the code to handle rom/ram mappings but we can take care of that next.  
virtio-net's userspace code can do the same thing as vhost and only map 
the ring once which should be a big performance improvement.

It also introduces a place to do madvise() reset registrations.

This is definitely appropriate for target-i386.  I suspect it is for 
other architectures too.

Regards,

Anthony Liguori

>>      
>>>> Furthermore, vhost reduces a virtual machine's security.  It offers an
>>>> impressive performance boost (particularly when dealing with 10gbit+
>>>> networking) but for a user that doesn't have such strong networking
>>>> performance requirements, I think it's reasonable for them to not want
>>>> to make a security trade off.
>>>>
>>>>          
>>> It's hard for me to see how it reduces VM security. If it does, it's
>>> not by design and will be fixed.
>>>
>>>        
>> If you have a bug in vhost-net (would never happen of course) then it's
>> a host-kernel exploit whereas if we have a bug in virtio-net userspace,
>> it's a local user exploit.  We have a pretty robust architecture to deal
>> with local user exploits (qemu can run unprivilieged, SELinux enforces
>> mandatory access control) but a host-kernel can not be protected against.
>>
>> I'm not saying that we should never put things in the kernel, but
>> there's definitely a security vs. performance trade off here.
>>
>> Regards,
>>
>> Anthony Liguori
>>      
> Not sure I get the argument completely. Any kernel service with a bug
> might be exploited for priveledge escalation. Yes, more kernel code
> gives you more attack surface, but given we use rich interfaces such as
> ones exposed by kvm, I am not sure by how much.
>
> Also note that vhost net does not take qemu out of the equation for
> everything, just for datapath operations.
>
>
Michael S. Tsirkin - Feb. 28, 2010, 9:01 p.m.
On Sun, Feb 28, 2010 at 02:57:56PM -0600, Anthony Liguori wrote:
> On 02/28/2010 11:19 AM, Michael S. Tsirkin wrote:
>>> Both have  security implications so I think it's important that they
>>> be addressed.   Otherwise, I'm pretty happy with how things are.
>>>      
>> Care suggesting some solutions?
>>    
>
> The obvious thing to do would be to use the memory notifier in vhost to  
> keep track of whenever something remaps the ring's memory region and if  
> that happens, issue an ioctl to vhost to change the location of the  
> ring.

It would be easy to do, but what I wondered about, is what happens in the
guest meanwhile. Which ring address has the correct descriptors: the old
one?  The new one? Both?  This question leads me to the belief that well-behaved
guest will never encounter this.

>  Also, you would need to merge the vhost slot management code with  
> the KVM slot management code.
>
> I'm sympathetic to your arguments though.  As qemu is today, the above  
> is definitely the right thing to do.  But ram is always ram and ram  
> always has a fixed (albeit non-linear) mapping within a guest.  We can  
> probably be smarter in qemu.
>
> There are areas of MMIO/ROM address space that *sometimes* end up  
> behaving like ram, but that's a different case.  The one other case to  
> consider is ram hot add/remove in which case, ram may be removed or  
> added (but it's location will never change during its lifetime).
>
> Here's what I'll propose, and I'd really like to hear what Paul think  
> about it before we start down this path.
>
> I think we should add a new API that's:
>
> void cpu_ram_add(target_phys_addr_t start, ram_addr_t size);
>
> This API would do two things.  It would call qemu_ram_alloc() and  
> cpu_register_physical_memory() just as code does today.  It would also  
> add this region into a new table.
>
> There would be:
>
> void *cpu_ram_map(target_phys_addr_t start, ram_addr_t *size);
> void cpu_ram_unmap(void *mem);
>
> These calls would use this new table to lookup ram addresses.  These  
> mappings are valid as long as the guest is executed.  Within the table,  
> each region would have a reference count.  When it comes time to do hot  
> add/remove, we would wait to remove a region until the reference count  
> went to zero to avoid unmapping during DMA.
>
> cpu_ram_add() never gets called with overlapping regions.  We'll modify  
> cpu_register_physical_memory() to ensure that a ram mapping is never  
> changed after initial registration.
>
> vhost no longer needs to bother keeping the dynamic table up to date so  
> it removes all of the slot management code from vhost.  KVM still needs  
> the code to handle rom/ram mappings but we can take care of that next.   
> virtio-net's userspace code can do the same thing as vhost and only map  
> the ring once which should be a big performance improvement.
>
> It also introduces a place to do madvise() reset registrations.
>
> This is definitely appropriate for target-i386.  I suspect it is for  
> other architectures too.
>
> Regards,
>
> Anthony Liguori
>
>>>      
>>>>> Furthermore, vhost reduces a virtual machine's security.  It offers an
>>>>> impressive performance boost (particularly when dealing with 10gbit+
>>>>> networking) but for a user that doesn't have such strong networking
>>>>> performance requirements, I think it's reasonable for them to not want
>>>>> to make a security trade off.
>>>>>
>>>>>          
>>>> It's hard for me to see how it reduces VM security. If it does, it's
>>>> not by design and will be fixed.
>>>>
>>>>        
>>> If you have a bug in vhost-net (would never happen of course) then it's
>>> a host-kernel exploit whereas if we have a bug in virtio-net userspace,
>>> it's a local user exploit.  We have a pretty robust architecture to deal
>>> with local user exploits (qemu can run unprivilieged, SELinux enforces
>>> mandatory access control) but a host-kernel can not be protected against.
>>>
>>> I'm not saying that we should never put things in the kernel, but
>>> there's definitely a security vs. performance trade off here.
>>>
>>> Regards,
>>>
>>> Anthony Liguori
>>>      
>> Not sure I get the argument completely. Any kernel service with a bug
>> might be exploited for priveledge escalation. Yes, more kernel code
>> gives you more attack surface, but given we use rich interfaces such as
>> ones exposed by kvm, I am not sure by how much.
>>
>> Also note that vhost net does not take qemu out of the equation for
>> everything, just for datapath operations.
>>
>>
Anthony Liguori - Feb. 28, 2010, 10:38 p.m.
On 02/28/2010 03:01 PM, Michael S. Tsirkin wrote:
> On Sun, Feb 28, 2010 at 02:57:56PM -0600, Anthony Liguori wrote:
>    
>> On 02/28/2010 11:19 AM, Michael S. Tsirkin wrote:
>>      
>>>> Both have  security implications so I think it's important that they
>>>> be addressed.   Otherwise, I'm pretty happy with how things are.
>>>>
>>>>          
>>> Care suggesting some solutions?
>>>
>>>        
>> The obvious thing to do would be to use the memory notifier in vhost to
>> keep track of whenever something remaps the ring's memory region and if
>> that happens, issue an ioctl to vhost to change the location of the
>> ring.
>>      
> It would be easy to do, but what I wondered about, is what happens in the
> guest meanwhile. Which ring address has the correct descriptors: the old
> one?  The new one? Both?  This question leads me to the belief that well-behaved
> guest will never encounter this.
>    

This is not a question of well-behaved guests.  It's a question about 
what our behaviour is in the face of a malicious guest.  While I agree 
with you that that behaviour can be undefined, writing to an invalid ram 
location I believe could lead to guest privilege escalation.

I think the two solutions we could implement would be to always use the 
latest mapping (which is what all code does today) or to actively 
prevent ram from being remapped (which is my proposal below).

Regards,

Anthony Liguori
Paul Brook - Feb. 28, 2010, 10:39 p.m.
> I'm sympathetic to your arguments though.  As qemu is today, the above
> is definitely the right thing to do.  But ram is always ram and ram
> always has a fixed (albeit non-linear) mapping within a guest.

I think this assumption is unsafe. There are machines where RAM mappings can 
change. It's not uncommon for a chip select (i.e. physical memory address 
region) to be switchable to several different sources, one of which may be 
RAM.  I'm pretty sure this functionality is present (but not actually 
implemented) on some of the current qemu targets.

I agree that changing RAM mappings under an active DMA is a fairly suspect 
thing to do. However I think we need to avoid cache mappings between separate 
DMA transactions i.e. when the guest can know that no DMA will occur, and 
safely remap things.

I'm also of the opinion that virtio devices should behave the same as any 
other device. i.e. if you put a virtio-net-pci device on a PCI bus behind an 
IOMMU, then it should see the same address space as any other PCI device in 
that location.  Apart from anything else, failure to do this breaks nested 
virtualization.  While qemu doesn't currently implement an IOMMU, the DMA 
interfaces have been designed to allow it.

> void cpu_ram_add(target_phys_addr_t start, ram_addr_t size);

We need to support aliased memory regions. For example the ARM RealView boards 
expose the first 256M RAM at both address 0x0 and 0x70000000. It's also common 
for systems to create aliases by ignoring certain address bits. e.g. each sim 
slot is allocated a fixed 256M region. Populating that slot with a 128M stick 
will cause the contents to be aliased in both the top and bottom halves of 
that region.

Paul
Michael S. Tsirkin - March 1, 2010, 7:27 p.m.
On Sun, Feb 28, 2010 at 10:39:21PM +0000, Paul Brook wrote:
> > I'm sympathetic to your arguments though.  As qemu is today, the above
> > is definitely the right thing to do.  But ram is always ram and ram
> > always has a fixed (albeit non-linear) mapping within a guest.
> 
> I think this assumption is unsafe. There are machines where RAM mappings can 
> change. It's not uncommon for a chip select (i.e. physical memory address 
> region) to be switchable to several different sources, one of which may be 
> RAM.  I'm pretty sure this functionality is present (but not actually 
> implemented) on some of the current qemu targets.
> 
> I agree that changing RAM mappings under an active DMA is a fairly suspect 
> thing to do. However I think we need to avoid cache mappings between separate 
> DMA transactions i.e. when the guest can know that no DMA will occur, and 
> safely remap things.
> 
> I'm also of the opinion that virtio devices should behave the same as any 
> other device. i.e. if you put a virtio-net-pci device on a PCI bus behind an 
> IOMMU, then it should see the same address space as any other PCI device in 
> that location.

It already doesn't. virtio passes physical memory addresses
to device instead of DMA addresses.

> Apart from anything else, failure to do this breaks nested 
> virtualization.

Assigning PV device in nested virtualization? It could work, but not
sure what the point would be.

>  While qemu doesn't currently implement an IOMMU, the DMA 
> interfaces have been designed to allow it.
> 
> > void cpu_ram_add(target_phys_addr_t start, ram_addr_t size);
> 
> We need to support aliased memory regions. For example the ARM RealView boards 
> expose the first 256M RAM at both address 0x0 and 0x70000000. It's also common 
> for systems to create aliases by ignoring certain address bits. e.g. each sim 
> slot is allocated a fixed 256M region. Populating that slot with a 128M stick 
> will cause the contents to be aliased in both the top and bottom halves of 
> that region.
> 
> Paul
Anthony Liguori - March 1, 2010, 9:54 p.m.
On 03/01/2010 01:27 PM, Michael S. Tsirkin wrote:
> On Sun, Feb 28, 2010 at 10:39:21PM +0000, Paul Brook wrote:
>    
>>> I'm sympathetic to your arguments though.  As qemu is today, the above
>>> is definitely the right thing to do.  But ram is always ram and ram
>>> always has a fixed (albeit non-linear) mapping within a guest.
>>>        
>> I think this assumption is unsafe. There are machines where RAM mappings can
>> change. It's not uncommon for a chip select (i.e. physical memory address
>> region) to be switchable to several different sources, one of which may be
>> RAM.  I'm pretty sure this functionality is present (but not actually
>> implemented) on some of the current qemu targets.
>>
>> I agree that changing RAM mappings under an active DMA is a fairly suspect
>> thing to do. However I think we need to avoid cache mappings between separate
>> DMA transactions i.e. when the guest can know that no DMA will occur, and
>> safely remap things.
>>
>> I'm also of the opinion that virtio devices should behave the same as any
>> other device. i.e. if you put a virtio-net-pci device on a PCI bus behind an
>> IOMMU, then it should see the same address space as any other PCI device in
>> that location.
>>      
> It already doesn't. virtio passes physical memory addresses
> to device instead of DMA addresses.
>    

That's technically a bug.

>> Apart from anything else, failure to do this breaks nested
>> virtualization.
>>      
> Assigning PV device in nested virtualization? It could work, but not
> sure what the point would be.
>    

It misses the point really.

vhost-net is not a device model and it shouldn't have to care about 
things like PCI IOMMU.  If we did ever implement a PCI IOMMU, then we 
would perform ring translation (or not use vhost-net).

Regards,

Anthony Liguori

>>   While qemu doesn't currently implement an IOMMU, the DMA
>> interfaces have been designed to allow it.
>>
>>      
>>> void cpu_ram_add(target_phys_addr_t start, ram_addr_t size);
>>>        
>> We need to support aliased memory regions. For example the ARM RealView boards
>> expose the first 256M RAM at both address 0x0 and 0x70000000. It's also common
>> for systems to create aliases by ignoring certain address bits. e.g. each sim
>> slot is allocated a fixed 256M region. Populating that slot with a 128M stick
>> will cause the contents to be aliased in both the top and bottom halves of
>> that region.
>>
>> Paul
>>
Michael S. Tsirkin - March 2, 2010, 9:57 a.m.
On Mon, Mar 01, 2010 at 03:54:00PM -0600, Anthony Liguori wrote:
> On 03/01/2010 01:27 PM, Michael S. Tsirkin wrote:
>> On Sun, Feb 28, 2010 at 10:39:21PM +0000, Paul Brook wrote:
>>    
>>>> I'm sympathetic to your arguments though.  As qemu is today, the above
>>>> is definitely the right thing to do.  But ram is always ram and ram
>>>> always has a fixed (albeit non-linear) mapping within a guest.
>>>>        
>>> I think this assumption is unsafe. There are machines where RAM mappings can
>>> change. It's not uncommon for a chip select (i.e. physical memory address
>>> region) to be switchable to several different sources, one of which may be
>>> RAM.  I'm pretty sure this functionality is present (but not actually
>>> implemented) on some of the current qemu targets.
>>>
>>> I agree that changing RAM mappings under an active DMA is a fairly suspect
>>> thing to do. However I think we need to avoid cache mappings between separate
>>> DMA transactions i.e. when the guest can know that no DMA will occur, and
>>> safely remap things.
>>>
>>> I'm also of the opinion that virtio devices should behave the same as any
>>> other device. i.e. if you put a virtio-net-pci device on a PCI bus behind an
>>> IOMMU, then it should see the same address space as any other PCI device in
>>> that location.
>>>      
>> It already doesn't. virtio passes physical memory addresses
>> to device instead of DMA addresses.
>>    
>
> That's technically a bug.
>
>>> Apart from anything else, failure to do this breaks nested
>>> virtualization.
>>>      
>> Assigning PV device in nested virtualization? It could work, but not
>> sure what the point would be.
>>    
>
> It misses the point really.
>
> vhost-net is not a device model and it shouldn't have to care about  
> things like PCI IOMMU.  If we did ever implement a PCI IOMMU, then we  
> would perform ring translation (or not use vhost-net).
>
> Regards,
>
> Anthony Liguori

Right.

>>>   While qemu doesn't currently implement an IOMMU, the DMA
>>> interfaces have been designed to allow it.
>>>
>>>      
>>>> void cpu_ram_add(target_phys_addr_t start, ram_addr_t size);
>>>>        
>>> We need to support aliased memory regions. For example the ARM RealView boards
>>> expose the first 256M RAM at both address 0x0 and 0x70000000. It's also common
>>> for systems to create aliases by ignoring certain address bits. e.g. each sim
>>> slot is allocated a fixed 256M region. Populating that slot with a 128M stick
>>> will cause the contents to be aliased in both the top and bottom halves of
>>> that region.
>>>
>>> Paul
>>>
Anthony Liguori - March 2, 2010, 2:07 p.m.
On 02/28/2010 04:39 PM, Paul Brook wrote:
>> I'm sympathetic to your arguments though.  As qemu is today, the above
>> is definitely the right thing to do.  But ram is always ram and ram
>> always has a fixed (albeit non-linear) mapping within a guest.
>>      
> I think this assumption is unsafe. There are machines where RAM mappings can
> change. It's not uncommon for a chip select (i.e. physical memory address
> region) to be switchable to several different sources, one of which may be
> RAM.  I'm pretty sure this functionality is present (but not actually
> implemented) on some of the current qemu targets.
>    

But I presume this is more about switching a dim to point at a different 
region in memory.  It's a rare event similar to memory hot plug.

Either way, if there are platforms where we don't treat ram with the new 
ram api, that's okay.

> I agree that changing RAM mappings under an active DMA is a fairly suspect
> thing to do. However I think we need to avoid cache mappings between separate
> DMA transactions i.e. when the guest can know that no DMA will occur, and
> safely remap things.
>    

One thing I like about having a new ram api is it gives us a stronger 
interface than what we have today.  Today, we don't have a strong 
guarantee that mappings won't be changed during a DMA transaction.

With a new api, cpu_physical_memory_map() changes semantics.  It only 
returns pointers for static ram mappings.  Everything else is bounced 
which guarantees that an address can't change during DMA.

> I'm also of the opinion that virtio devices should behave the same as any
> other device. i.e. if you put a virtio-net-pci device on a PCI bus behind an
> IOMMU, then it should see the same address space as any other PCI device in
> that location.  Apart from anything else, failure to do this breaks nested
> virtualization.  While qemu doesn't currently implement an IOMMU, the DMA
> interfaces have been designed to allow it.
>    

Yes, I've been working on that.  virtio is a bit more complicated than a 
normal PCI device because it can be on top of two busses.  It needs an 
additional layer of abstraction to deal with this.

>> void cpu_ram_add(target_phys_addr_t start, ram_addr_t size);
>>      
> We need to support aliased memory regions. For example the ARM RealView boards
> expose the first 256M RAM at both address 0x0 and 0x70000000. It's also common
> for systems to create aliases by ignoring certain address bits. e.g. each sim
> slot is allocated a fixed 256M region. Populating that slot with a 128M stick
> will cause the contents to be aliased in both the top and bottom halves of
> that region.
>    

Okay, I'd prefer to add an explicit aliasing API.  That gives us more 
information to work with.

Regards,

Anthony Liguori

> Paul
>
Paul Brook - March 2, 2010, 2:33 p.m.
> > I think this assumption is unsafe. There are machines where RAM mappings
> > can change. It's not uncommon for a chip select (i.e. physical memory
> > address region) to be switchable to several different sources, one of
> > which may be RAM.  I'm pretty sure this functionality is present (but not
> > actually implemented) on some of the current qemu targets.
> 
> But I presume this is more about switching a dim to point at a different
> region in memory.  It's a rare event similar to memory hot plug.

Approximately that, yes. One use is to start with ROM at address zero, then 
switch to RAM once you've initialised the DRAM controller (using a small 
internal SRAM as a trampoline). 
 
> Either way, if there are platforms where we don't treat ram with the new
> ram api, that's okay.
> 
> > I agree that changing RAM mappings under an active DMA is a fairly
> > suspect thing to do. However I think we need to avoid cache mappings
> > between separate DMA transactions i.e. when the guest can know that no
> > DMA will occur, and safely remap things.
> 
> One thing I like about having a new ram api is it gives us a stronger
> interface than what we have today.  Today, we don't have a strong
> guarantee that mappings won't be changed during a DMA transaction.
> 
> With a new api, cpu_physical_memory_map() changes semantics.  It only
> returns pointers for static ram mappings.  Everything else is bounced
> which guarantees that an address can't change during DMA.

Doesn't this mean that only the initial RAM is directly DMA-able?

While memory hotplug(and unplug) may be an infrequent event, having the 
majority of ram be hotplug seems much more likely.  Giving each guest a small 
base allocation, then hotplug the rest as required/paid for seems an entirely 
reasonable setup.  I use VMs which are normally hosted on a multi-core machine 
with buckets of ram, but may be migrated to a single CPU host with limited ram 
in an emergency.

Paul
Anthony Liguori - March 2, 2010, 2:39 p.m.
On 03/02/2010 08:33 AM, Paul Brook wrote:
>>> I think this assumption is unsafe. There are machines where RAM mappings
>>> can change. It's not uncommon for a chip select (i.e. physical memory
>>> address region) to be switchable to several different sources, one of
>>> which may be RAM.  I'm pretty sure this functionality is present (but not
>>> actually implemented) on some of the current qemu targets.
>>>        
>> But I presume this is more about switching a dim to point at a different
>> region in memory.  It's a rare event similar to memory hot plug.
>>      
> Approximately that, yes. One use is to start with ROM at address zero, then
> switch to RAM once you've initialised the DRAM controller (using a small
> internal SRAM as a trampoline).
>
>    
>> Either way, if there are platforms where we don't treat ram with the new
>> ram api, that's okay.
>>
>>      
>>> I agree that changing RAM mappings under an active DMA is a fairly
>>> suspect thing to do. However I think we need to avoid cache mappings
>>> between separate DMA transactions i.e. when the guest can know that no
>>> DMA will occur, and safely remap things.
>>>        
>> One thing I like about having a new ram api is it gives us a stronger
>> interface than what we have today.  Today, we don't have a strong
>> guarantee that mappings won't be changed during a DMA transaction.
>>
>> With a new api, cpu_physical_memory_map() changes semantics.  It only
>> returns pointers for static ram mappings.  Everything else is bounced
>> which guarantees that an address can't change during DMA.
>>      
> Doesn't this mean that only the initial RAM is directly DMA-able?
>
> While memory hotplug(and unplug) may be an infrequent event, having the
> majority of ram be hotplug seems much more likely.

Hotplug works fine for direct DMA'ing.  map/unmap would maintain a 
reference count on the registered RAM region and hot unplug would not be 
allowed until that reference dropped to zero.  For something like 
virtio, it means that the driver has to be unloaded in the guest before 
you hot unplug the region of memory if it happens to be using that 
region of memory for the ring storage.

The key difference is that these regions are created and destroyed 
rarely and in such a way that the destruction is visible to the guest.

If you compare that to IO memory, we currently flip IO memory's mappings 
dynamically without the guest really being aware (such as the VGA 
optimization).  An API like this wouldn't work for IO memory today 
without some serious thinking about how to model this sort of thing.

Regards,

Anthony Liguori
Paul Brook - March 2, 2010, 2:55 p.m.
> >> With a new api, cpu_physical_memory_map() changes semantics.  It only
> >> returns pointers for static ram mappings.  Everything else is bounced
> >> which guarantees that an address can't change during DMA.
> >
> > Doesn't this mean that only the initial RAM is directly DMA-able?
> >
> > While memory hotplug(and unplug) may be an infrequent event, having the
> > majority of ram be hotplug seems much more likely.
> 
> Hotplug works fine for direct DMA'ing.  map/unmap would maintain a
> reference count on the registered RAM region and hot unplug would not be
> allowed until that reference dropped to zero.  For something like
> virtio, it means that the driver has to be unloaded in the guest before
> you hot unplug the region of memory if it happens to be using that
> region of memory for the ring storage.
> 
> The key difference is that these regions are created and destroyed
> rarely and in such a way that the destruction is visible to the guest.

So you're making ram unmap an asynchronous process, and requiring that the 
address space not be reused until that umap has completed?

Paul
Anthony Liguori - March 2, 2010, 3:33 p.m.
On 03/02/2010 08:55 AM, Paul Brook wrote:
>>>> With a new api, cpu_physical_memory_map() changes semantics.  It only
>>>> returns pointers for static ram mappings.  Everything else is bounced
>>>> which guarantees that an address can't change during DMA.
>>>>          
>>> Doesn't this mean that only the initial RAM is directly DMA-able?
>>>
>>> While memory hotplug(and unplug) may be an infrequent event, having the
>>> majority of ram be hotplug seems much more likely.
>>>        
>> Hotplug works fine for direct DMA'ing.  map/unmap would maintain a
>> reference count on the registered RAM region and hot unplug would not be
>> allowed until that reference dropped to zero.  For something like
>> virtio, it means that the driver has to be unloaded in the guest before
>> you hot unplug the region of memory if it happens to be using that
>> region of memory for the ring storage.
>>
>> The key difference is that these regions are created and destroyed
>> rarely and in such a way that the destruction is visible to the guest.
>>      
> So you're making ram unmap an asynchronous process, and requiring that the
> address space not be reused until that umap has completed?
>    

It technically already would be.  If you've got a pending DMA 
transaction and you try to hot unplug badness will happen.  This is 
something that is certainly exploitable.

Regards,

Anthony Liguori

> Paul
>
Paul Brook - March 2, 2010, 3:53 p.m.
> >> The key difference is that these regions are created and destroyed
> >> rarely and in such a way that the destruction is visible to the guest.
> >
> > So you're making ram unmap an asynchronous process, and requiring that
> > the address space not be reused until that umap has completed?
> 
> It technically already would be.  If you've got a pending DMA
> transaction and you try to hot unplug badness will happen.  This is
> something that is certainly exploitable.

Hmm, I guess we probably want to make this work with all mappings then. DMA to 
a ram backed PCI BAR (e.g. video ram) is certainly feasible.
Technically it's not the unmap that causes badness, it's freeing the 
underlying ram.

For these reasons I'm tempted to push the refcounting down to the ram 
allocation level. This has a couple of nice properties.

Firstly we don't care about dynamic allocation any more. We just say that 
mapping changes may not effect active DMA transactions. If virtio chooses to 
define that the vring DMA transaction starts when the device is enabled and 
ends when disabled, that's fine by me.  This probably requires revisiting the 
memory barrier issues - barriers are pointless if you don't guarantee cache 
coherence (i.e. no bounce buffers).

Secondly, ram deallocation is not guest visible. The guest visible parts 
(memory unmapping) can happen immediately, and we avoid a whole set of 
unplug/replug race conditions. We may want to delay the completion of a 
monitor hotplug command until the actual deallocation occurs, but that's a 
largely separate issue.

Paul
Michael S. Tsirkin - March 2, 2010, 3:56 p.m.
On Tue, Mar 02, 2010 at 03:53:30PM +0000, Paul Brook wrote:
> > >> The key difference is that these regions are created and destroyed
> > >> rarely and in such a way that the destruction is visible to the guest.
> > >
> > > So you're making ram unmap an asynchronous process, and requiring that
> > > the address space not be reused until that umap has completed?
> > 
> > It technically already would be.  If you've got a pending DMA
> > transaction and you try to hot unplug badness will happen.  This is
> > something that is certainly exploitable.
> 
> Hmm, I guess we probably want to make this work with all mappings then. DMA to 
> a ram backed PCI BAR (e.g. video ram) is certainly feasible.

This used to be possible with PCI/PCI-X.  But as far as I know, with PCI
Express, devices can not access each other's BARs.

> Technically it's not the unmap that causes badness, it's freeing the 
> underlying ram.
> 
> For these reasons I'm tempted to push the refcounting down to the ram 
> allocation level. This has a couple of nice properties.
> 
> Firstly we don't care about dynamic allocation any more. We just say that 
> mapping changes may not effect active DMA transactions. If virtio chooses to 
> define that the vring DMA transaction starts when the device is enabled and 
> ends when disabled, that's fine by me.  This probably requires revisiting the 
> memory barrier issues - barriers are pointless if you don't guarantee cache 
> coherence (i.e. no bounce buffers).
> 
> Secondly, ram deallocation is not guest visible. The guest visible parts 
> (memory unmapping) can happen immediately, and we avoid a whole set of 
> unplug/replug race conditions. We may want to delay the completion of a 
> monitor hotplug command until the actual deallocation occurs, but that's a 
> largely separate issue.
> 
> Paul
Anthony Liguori - March 2, 2010, 4:12 p.m.
On 03/02/2010 09:53 AM, Paul Brook wrote:
>>>> The key difference is that these regions are created and destroyed
>>>> rarely and in such a way that the destruction is visible to the guest.
>>>>          
>>> So you're making ram unmap an asynchronous process, and requiring that
>>> the address space not be reused until that umap has completed?
>>>        
>> It technically already would be.  If you've got a pending DMA
>> transaction and you try to hot unplug badness will happen.  This is
>> something that is certainly exploitable.
>>      
> Hmm, I guess we probably want to make this work with all mappings then. DMA to
> a ram backed PCI BAR (e.g. video ram) is certainly feasible.
> Technically it's not the unmap that causes badness, it's freeing the
> underlying ram.
>    

Let's avoid confusing terminology.  We have RAM mappings and then we 
have PCI BARs that are mapped as IO_MEM_RAM.

PCI BARs mapped as IO_MEM_RAM are allocated by the device and live for 
the duration of the device.  If you did something that changed the BAR's 
mapping from IO_MEM_RAM to an actual IO memory type, then you'd continue 
to DMA to the allocated device memory instead of doing MMIO operations.[1]

That's completely accurate and safe.  If you did this to bare metal, I 
expect you'd get very similar results.

This is different from DMA'ing to a RAM region and then removing the RAM 
region while the IO is in flight.  In this case, the mapping disappears 
and you potentially have the guest writing to an invalid host pointer.

[1] I don't think it's useful to support DMA'ing to arbitrary IO_MEM_RAM 
areas.  Instead, I think we should always bounce to this memory.  The 
benefit is that we avoid the complications resulting from PCI hot unplug 
and reference counting.

> For these reasons I'm tempted to push the refcounting down to the ram
> allocation level. This has a couple of nice properties.
>
> Firstly we don't care about dynamic allocation any more. We just say that
> mapping changes may not effect active DMA transactions.

Only if we think it's necessary to support native DMA to arbitrary 
IO_MEM_RAM.  I contend this is never a normal or performance sensitive 
case and it's not worth supporting.

>   If virtio chooses to
> define that the vring DMA transaction starts when the device is enabled and
> ends when disabled, that's fine by me.  This probably requires revisiting the
> memory barrier issues - barriers are pointless if you don't guarantee cache
> coherence (i.e. no bounce buffers).
>
> Secondly, ram deallocation is not guest visible. The guest visible parts
> (memory unmapping) can happen immediately, and we avoid a whole set of
> unplug/replug race conditions. We may want to delay the completion of a
> monitor hotplug command until the actual deallocation occurs, but that's a
> largely separate issue.
>    

You can do the same thing and always bounce IO_MEM_RAM IO regions.  It's 
just a question of whether we think it's worth the effort to do native 
DMA to this type of memory.  I personally don't think it is at least in 
the beginning.

Regards,

Anthony Liguori

> Paul
>
Marcelo Tosatti - March 2, 2010, 4:12 p.m.
On Sun, Feb 28, 2010 at 02:57:56PM -0600, Anthony Liguori wrote:
> On 02/28/2010 11:19 AM, Michael S. Tsirkin wrote:
> >>Both have  security implications so I think it's important that they
> >>be addressed.   Otherwise, I'm pretty happy with how things are.
> >Care suggesting some solutions?
> 
> The obvious thing to do would be to use the memory notifier in vhost
> to keep track of whenever something remaps the ring's memory region
> and if that happens, issue an ioctl to vhost to change the location
> of the ring.  Also, you would need to merge the vhost slot
> management code with the KVM slot management code.

There are no security implications as long as vhost uses the qemu
process mappings.

But your point is valid.

> I'm sympathetic to your arguments though.  As qemu is today, the
> above is definitely the right thing to do.  But ram is always ram
> and ram always has a fixed (albeit non-linear) mapping within a
> guest.  We can probably be smarter in qemu.
> 
> There are areas of MMIO/ROM address space that *sometimes* end up
> behaving like ram, but that's a different case.  The one other case
> to consider is ram hot add/remove in which case, ram may be removed
> or added (but it's location will never change during its lifetime).
> 
> Here's what I'll propose, and I'd really like to hear what Paul
> think about it before we start down this path.
> 
> I think we should add a new API that's:
> 
> void cpu_ram_add(target_phys_addr_t start, ram_addr_t size);
> 
> This API would do two things.  It would call qemu_ram_alloc() and
> cpu_register_physical_memory() just as code does today.  It would
> also add this region into a new table.
> 
> There would be:
> 
> void *cpu_ram_map(target_phys_addr_t start, ram_addr_t *size);
> void cpu_ram_unmap(void *mem);
> 
> These calls would use this new table to lookup ram addresses.  These
> mappings are valid as long as the guest is executed.  Within the
> table, each region would have a reference count.  When it comes time
> to do hot add/remove, we would wait to remove a region until the
> reference count went to zero to avoid unmapping during DMA.
>
> cpu_ram_add() never gets called with overlapping regions.  We'll
> modify cpu_register_physical_memory() to ensure that a ram mapping
> is never changed after initial registration.

What is the difference between your proposal and
cpu_physical_memory_map?

What i'd like to see is binding between cpu_physical_memory_map and qdev 
devices, so that you can use different host memory mappings for device
context and for CPU context (and provide the possibility for, say, map 
a certain memory region as read-only).

> vhost no longer needs to bother keeping the dynamic table up to date
> so it removes all of the slot management code from vhost.  KVM still
> needs the code to handle rom/ram mappings but we can take care of
> that next.  virtio-net's userspace code can do the same thing as
> vhost and only map the ring once which should be a big performance
> improvement.

Pinning the host virtual addresses as you propose reduces flexibility.
See above about different mappings for DMA/CPU contexes.
Marcelo Tosatti - March 2, 2010, 4:21 p.m.
On Tue, Mar 02, 2010 at 10:12:05AM -0600, Anthony Liguori wrote:
> On 03/02/2010 09:53 AM, Paul Brook wrote:
> >>>>The key difference is that these regions are created and destroyed
> >>>>rarely and in such a way that the destruction is visible to the guest.
> >>>So you're making ram unmap an asynchronous process, and requiring that
> >>>the address space not be reused until that umap has completed?
> >>It technically already would be.  If you've got a pending DMA
> >>transaction and you try to hot unplug badness will happen.  This is
> >>something that is certainly exploitable.
> >Hmm, I guess we probably want to make this work with all mappings then. DMA to
> >a ram backed PCI BAR (e.g. video ram) is certainly feasible.
> >Technically it's not the unmap that causes badness, it's freeing the
> >underlying ram.
> 
> Let's avoid confusing terminology.  We have RAM mappings and then we
> have PCI BARs that are mapped as IO_MEM_RAM.
> 
> PCI BARs mapped as IO_MEM_RAM are allocated by the device and live
> for the duration of the device.  If you did something that changed
> the BAR's mapping from IO_MEM_RAM to an actual IO memory type, then
> you'd continue to DMA to the allocated device memory instead of
> doing MMIO operations.[1]
> 
> That's completely accurate and safe.  If you did this to bare metal,
> I expect you'd get very similar results.
> 
> This is different from DMA'ing to a RAM region and then removing the
> RAM region while the IO is in flight.  In this case, the mapping
> disappears and you potentially have the guest writing to an invalid
> host pointer.
> 
> [1] I don't think it's useful to support DMA'ing to arbitrary
> IO_MEM_RAM areas.  Instead, I think we should always bounce to this
> memory.  The benefit is that we avoid the complications resulting
> from PCI hot unplug and reference counting.

Agree. Thus the suggestion to tie cpu_physical_memory_map to qdev
infrastructure.
Anthony Liguori - March 2, 2010, 4:56 p.m.
On 03/02/2010 10:12 AM, Marcelo Tosatti wrote:
> On Sun, Feb 28, 2010 at 02:57:56PM -0600, Anthony Liguori wrote:
>    
>> On 02/28/2010 11:19 AM, Michael S. Tsirkin wrote:
>>      
>>>> Both have  security implications so I think it's important that they
>>>> be addressed.   Otherwise, I'm pretty happy with how things are.
>>>>          
>>> Care suggesting some solutions?
>>>        
>> The obvious thing to do would be to use the memory notifier in vhost
>> to keep track of whenever something remaps the ring's memory region
>> and if that happens, issue an ioctl to vhost to change the location
>> of the ring.  Also, you would need to merge the vhost slot
>> management code with the KVM slot management code.
>>      
> There are no security implications as long as vhost uses the qemu
> process mappings.
>    

There potentially are within a guest.  If a guest can trigger a qemu bug 
that results in qemu writing to a different location than what the guest 
told it to write, a malicious software may use this to escalate it's 
privileges within a guest.

>> cpu_ram_add() never gets called with overlapping regions.  We'll
>> modify cpu_register_physical_memory() to ensure that a ram mapping
>> is never changed after initial registration.
>>      
> What is the difference between your proposal and
> cpu_physical_memory_map?
>    

cpu_physical_memory_map() has the following semantics:

- it always returns a transient mapping
- it may (transparently) bounce
- it may fail to bounce, caller must deal

The new function I'm proposing has the following semantics:

- it always returns a persistent mapping
- it never bounces
- it will only fail if the mapping isn't ram

A caller can use the new function to implement an optimization to force 
the device to only work with real ram.  IOW, this is something we can 
use in virtio, but very little else.  cpu_physical_memory_map can be 
used in more circumstances.

> What i'd like to see is binding between cpu_physical_memory_map and qdev
> devices, so that you can use different host memory mappings for device
> context and for CPU context (and provide the possibility for, say, map
> a certain memory region as read-only).
>    

We really want per-bus mappings.  At the lowest level, we'll have 
sysbus_memory_map() but we'll also have pci_memory_map(), 
virtio_memory_map(), etc.

Nothing should ever call cpu_physical_memory_map() directly.

Regards,

Anthony Liguori
Michael S. Tsirkin - March 2, 2010, 5 p.m.
On Tue, Mar 02, 2010 at 10:56:48AM -0600, Anthony Liguori wrote:
> On 03/02/2010 10:12 AM, Marcelo Tosatti wrote:
>> On Sun, Feb 28, 2010 at 02:57:56PM -0600, Anthony Liguori wrote:
>>    
>>> On 02/28/2010 11:19 AM, Michael S. Tsirkin wrote:
>>>      
>>>>> Both have  security implications so I think it's important that they
>>>>> be addressed.   Otherwise, I'm pretty happy with how things are.
>>>>>          
>>>> Care suggesting some solutions?
>>>>        
>>> The obvious thing to do would be to use the memory notifier in vhost
>>> to keep track of whenever something remaps the ring's memory region
>>> and if that happens, issue an ioctl to vhost to change the location
>>> of the ring.  Also, you would need to merge the vhost slot
>>> management code with the KVM slot management code.
>>>      
>> There are no security implications as long as vhost uses the qemu
>> process mappings.
>>    
>
> There potentially are within a guest.  If a guest can trigger a qemu bug  
> that results in qemu writing to a different location than what the guest  
> told it to write, a malicious software may use this to escalate it's  
> privileges within a guest.

If malicious software has access to hardware that does DMA,
game is likely over :)

>>> cpu_ram_add() never gets called with overlapping regions.  We'll
>>> modify cpu_register_physical_memory() to ensure that a ram mapping
>>> is never changed after initial registration.
>>>      
>> What is the difference between your proposal and
>> cpu_physical_memory_map?
>>    
>
> cpu_physical_memory_map() has the following semantics:
>
> - it always returns a transient mapping
> - it may (transparently) bounce
> - it may fail to bounce, caller must deal
>
> The new function I'm proposing has the following semantics:
>
> - it always returns a persistent mapping
> - it never bounces
> - it will only fail if the mapping isn't ram
>
> A caller can use the new function to implement an optimization to force  
> the device to only work with real ram.  IOW, this is something we can  
> use in virtio, but very little else.  cpu_physical_memory_map can be  
> used in more circumstances.
>
>> What i'd like to see is binding between cpu_physical_memory_map and qdev
>> devices, so that you can use different host memory mappings for device
>> context and for CPU context (and provide the possibility for, say, map
>> a certain memory region as read-only).
>>    
>
> We really want per-bus mappings.  At the lowest level, we'll have  
> sysbus_memory_map() but we'll also have pci_memory_map(),  
> virtio_memory_map(), etc.
>
> Nothing should ever call cpu_physical_memory_map() directly.
>
> Regards,
>
> Anthony Liguori
Marcelo Tosatti - March 2, 2010, 6 p.m.
On Tue, Mar 02, 2010 at 10:56:48AM -0600, Anthony Liguori wrote:
> On 03/02/2010 10:12 AM, Marcelo Tosatti wrote:
> >On Sun, Feb 28, 2010 at 02:57:56PM -0600, Anthony Liguori wrote:
> >>On 02/28/2010 11:19 AM, Michael S. Tsirkin wrote:
> >>>>Both have  security implications so I think it's important that they
> >>>>be addressed.   Otherwise, I'm pretty happy with how things are.
> >>>Care suggesting some solutions?
> >>The obvious thing to do would be to use the memory notifier in vhost
> >>to keep track of whenever something remaps the ring's memory region
> >>and if that happens, issue an ioctl to vhost to change the location
> >>of the ring.  Also, you would need to merge the vhost slot
> >>management code with the KVM slot management code.
> >There are no security implications as long as vhost uses the qemu
> >process mappings.
> 
> There potentially are within a guest.  If a guest can trigger a qemu
> bug that results in qemu writing to a different location than what
> the guest told it to write, a malicious software may use this to
> escalate it's privileges within a guest.
> 
> >>cpu_ram_add() never gets called with overlapping regions.  We'll
> >>modify cpu_register_physical_memory() to ensure that a ram mapping
> >>is never changed after initial registration.
> >What is the difference between your proposal and
> >cpu_physical_memory_map?
> 
> cpu_physical_memory_map() has the following semantics:


> 
> - it always returns a transient mapping
> - it may (transparently) bounce
> - it may fail to bounce, caller must deal
> 
> The new function I'm proposing has the following semantics:

What exactly are the purposes of the new function?

> - it always returns a persistent mapping

For hotplug? What exactly you mean persistent?

> - it never bounces
> - it will only fail if the mapping isn't ram
> 
> A caller can use the new function to implement an optimization to
> force the device to only work with real ram.

To bypass the address translation in exec.c? 

>   IOW, this is something we can use in virtio, but very little else.
> cpu_physical_memory_map can be used in more circumstances.

Does not make much sense to me. The qdev <-> memory map mapping seems
more important. Following your security enhancement drive, you can for
example check whether the region can actually be mapped by the device
and deny otherwise, or do whatever host-side memory protection tricks 
you'd like.

And "cpu_ram_map" seems like the memory is accessed through cpu context, 
while it is really always device context.

> >What i'd like to see is binding between cpu_physical_memory_map and qdev
> >devices, so that you can use different host memory mappings for device
> >context and for CPU context (and provide the possibility for, say, map
> >a certain memory region as read-only).
> 
> We really want per-bus mappings.  At the lowest level, we'll have
> sysbus_memory_map() but we'll also have pci_memory_map(),
> virtio_memory_map(), etc.
>
> Nothing should ever call cpu_physical_memory_map() directly.

Yep.

> 
> Regards,
> 
> Anthony Liguori
>
Anthony Liguori - March 2, 2010, 6:13 p.m.
On 03/02/2010 12:00 PM, Marcelo Tosatti wrote:
>> - it always returns a transient mapping
>> - it may (transparently) bounce
>> - it may fail to bounce, caller must deal
>>
>> The new function I'm proposing has the following semantics:
>>      
> What exactly are the purposes of the new function?
>    

We need an API that can be used to obtain a persistent mapping of a 
given guest physical address.  We don't have that today.  We can 
potentially change cpu_physical_memory_map() to also accommodate this 
use case but that's a second step.

>> - it always returns a persistent mapping
>>      
> For hotplug? What exactly you mean persistent?
>    

Hotplug cannot happen as long as a persistent mapping exists for an 
address within that region.  This is okay.  You cannot have an active 
device driver DMA'ing to a DIMM and then hot unplug it.  The guest is 
responsible for making sure this doesn't happen.

>> - it never bounces
>> - it will only fail if the mapping isn't ram
>>
>> A caller can use the new function to implement an optimization to
>> force the device to only work with real ram.
>>      
> To bypass the address translation in exec.c?
>    

No, the ultimate goal is to convert virtio ring accesses from:

static inline uint32_t vring_desc_len(target_phys_addr_t desc_pa, int i)
{
     target_phys_addr_t pa;
     pa = desc_pa + sizeof(VRingDesc) * i + offsetof(VRingDesc, len);
     return ldl_phys(pa);
}

len = vring_desc_len(vring.desc_pa, i)

To:

len = ldl_w(vring->desc[i].len);

When host == arch, ldl_w is a nop.  Otherwise, it's just a byte swap.  
ldl_phys() today turns into cpu_physical_memory_read() which is very slow.

To support this, we must enforce that when a guest passes us a physical 
address, we can safely obtain a persistent mapping to it.  This is true 
for any ram address.  It's not true for MMIO memory.  We have no way to 
do this with cpu_physical_memory_map().

>>    IOW, this is something we can use in virtio, but very little else.
>> cpu_physical_memory_map can be used in more circumstances.
>>      
> Does not make much sense to me. The qdev<->  memory map mapping seems
> more important. Following your security enhancement drive, you can for
> example check whether the region can actually be mapped by the device
> and deny otherwise, or do whatever host-side memory protection tricks
> you'd like.
>    

It's two independent things.  Part of what makes virtio so complicated 
to convert to proper bus accessors is it's use of 
ldl_phys/stl_phys/etc.  No other device use those functions.  If we 
reduce virtio to just use a map() function, it simplifies the bus 
accessor conversion.

> And "cpu_ram_map" seems like the memory is accessed through cpu context,
> while it is really always device context.
>    

Yes, but that's a separate effort.  In fact, see 
http://wiki.qemu.org/Features/RamAPI vs. 
http://wiki.qemu.org/Features/PCIMemoryAPI

Regards,

Anthony Liguori
Paul Brook - March 2, 2010, 10:41 p.m.
> The new function I'm proposing has the following semantics:
> 
> - it always returns a persistent mapping
> - it never bounces
> - it will only fail if the mapping isn't ram

So you're assuming that virtio rings are in ram that is not hot-pluggable or 
remapable, and the whole region is contiguous?
That sounds like it's likely to come back and bite you. The guest has no idea 
which areas of ram happen to be contiguous on the host.

Paul
Anthony Liguori - March 3, 2010, 2:15 p.m.
On 03/02/2010 04:41 PM, Paul Brook wrote:
>> The new function I'm proposing has the following semantics:
>>
>> - it always returns a persistent mapping
>> - it never bounces
>> - it will only fail if the mapping isn't ram
>>      
> So you're assuming that virtio rings are in ram that is not hot-pluggable

As long as the device is active, yes.  This would be true with bare 
metal.  Memory allocated for the virtio-pci ring is not reclaimable and 
you have to be able to reclaim the entire area of ram covered by a DIMM 
to hot unplug it.  A user would have to unload the virtio-pci module to 
release the memory before hot unplug would be an option.

NB, almost nothing supports memory hot remove because it's very 
difficult for an OS to actually do.

>   or
> remapable,

Yes, it cannot be remapable.

>   and the whole region is contiguous?
>    

Yes, it has to be contiguous.

> That sounds like it's likely to come back and bite you. The guest has no idea
> which areas of ram happen to be contiguous on the host.
>    

Practically speaking, with target-i386 anything that is contiguous in 
guest physical memory is contiguous in the host address space provided 
it's ram.

These assumptions are important.  I have a local branch (that I'll send 
out soon) that implements a ram API and converted virtio to make use of 
it.  I'm seeing a ~50% increase in tx throughput.

If you try to support discontiguous, remapable ram for the virtio ring, 
then you have to do an l1_phys_map lookup for every single ring variable 
access followed by a memory copy.  This ends up costing an awful lot 
practically speaking.

The changes should work equally well with syborg although I don't think 
I can do meaningful performance testing there (I don't actually have a 
syborg image to test).

Regards,

Anthony Liguori

> Paul
>
Paul Brook - March 3, 2010, 2:43 p.m.
> > That sounds like it's likely to come back and bite you. The guest has no
> > idea which areas of ram happen to be contiguous on the host.
> 
> Practically speaking, with target-i386 anything that is contiguous in
> guest physical memory is contiguous in the host address space provided
> it's ram.
> 
> These assumptions are important.  I have a local branch (that I'll send
> out soon) that implements a ram API and converted virtio to make use of
> it.  I'm seeing a ~50% increase in tx throughput.

IMO supporting discontiguous regions is a requirement. target-i386 might get 
away with contiguous memory, because it omits most of the board level details. 
For everything else I'd expect this to be a real problem.

I'm not happy about the not-remapable assumption either.  In my experience 
this is fairly common.  In fact many real x86 machines have this capability 
(to workaround the 4G PCI hole).

By my reading the ppc440_bamboo board fails both your assumptions.
I imagine the KVM-PPC folks would be upset if you decided that virtio no 
longer worked on this board.

This is all somewhat disappointing, given virtio is supposed to be a DMA based 
architecture, and not rely on shared memory semantics.

Paul
Marcelo Tosatti - March 3, 2010, 4:24 p.m.
On Wed, Mar 03, 2010 at 08:15:15AM -0600, Anthony Liguori wrote:
> On 03/02/2010 04:41 PM, Paul Brook wrote:
> >>The new function I'm proposing has the following semantics:
> >>
> >>- it always returns a persistent mapping
> >>- it never bounces
> >>- it will only fail if the mapping isn't ram
> >So you're assuming that virtio rings are in ram that is not hot-pluggable
> 
> As long as the device is active, yes.  This would be true with bare
> metal.  Memory allocated for the virtio-pci ring is not reclaimable
> and you have to be able to reclaim the entire area of ram covered by
> a DIMM to hot unplug it.  A user would have to unload the virtio-pci
> module to release the memory before hot unplug would be an option.
> 
> NB, almost nothing supports memory hot remove because it's very
> difficult for an OS to actually do.
> 
> >  or
> >remapable,
> 
> Yes, it cannot be remapable.
> 
> >  and the whole region is contiguous?
> 
> Yes, it has to be contiguous.
> 
> >That sounds like it's likely to come back and bite you. The guest has no idea
> >which areas of ram happen to be contiguous on the host.
> 
> Practically speaking, with target-i386 anything that is contiguous
> in guest physical memory is contiguous in the host address space
> provided it's ram.
> 
> These assumptions are important.  I have a local branch (that I'll
> send out soon) that implements a ram API and converted virtio to
> make use of it.  I'm seeing a ~50% increase in tx throughput.
> 
> If you try to support discontiguous, remapable ram for the virtio
> ring, then you have to do an l1_phys_map lookup for every single
> ring variable access followed by a memory copy.  This ends up
> costing an awful lot practically speaking.

Speed up the lookup instead?

> 
> The changes should work equally well with syborg although I don't
> think I can do meaningful performance testing there (I don't
> actually have a syborg image to test).
> 
> Regards,
> 
> Anthony Liguori
> 
> >Paul

Patch

diff --git a/net.c b/net.c
index a1bf49f..d1e23f1 100644
--- a/net.c
+++ b/net.c
@@ -973,6 +973,14 @@  static const struct {
                 .name = "vnet_hdr",
                 .type = QEMU_OPT_BOOL,
                 .help = "enable the IFF_VNET_HDR flag on the tap interface"
+            }, {
+                .name = "vhost",
+                .type = QEMU_OPT_BOOL,
+                .help = "enable vhost-net network accelerator",
+            }, {
+                .name = "vhostfd",
+                .type = QEMU_OPT_STRING,
+                .help = "file descriptor of an already opened vhost net device",
             },
 #endif /* _WIN32 */
             { /* end of list */ }
diff --git a/net/tap.c b/net/tap.c
index fc59fd4..65797ef 100644
--- a/net/tap.c
+++ b/net/tap.c
@@ -41,6 +41,8 @@ 
 
 #include "net/tap-linux.h"
 
+#include "hw/vhost_net.h"
+
 /* Maximum GSO packet size (64k) plus plenty of room for
  * the ethernet and virtio_net headers
  */
@@ -57,6 +59,7 @@  typedef struct TAPState {
     unsigned int has_vnet_hdr : 1;
     unsigned int using_vnet_hdr : 1;
     unsigned int has_ufo: 1;
+    struct vhost_net *vhost_net;
 } TAPState;
 
 static int launch_script(const char *setup_script, const char *ifname, int fd);
@@ -252,6 +255,10 @@  static void tap_cleanup(VLANClientState *nc)
 {
     TAPState *s = DO_UPCAST(TAPState, nc, nc);
 
+    if (s->vhost_net) {
+        vhost_net_cleanup(s->vhost_net);
+    }
+
     qemu_purge_queued_packets(nc);
 
     if (s->down_script[0])
@@ -307,6 +314,7 @@  static TAPState *net_tap_fd_init(VLANState *vlan,
     s->has_ufo = tap_probe_has_ufo(s->fd);
     tap_set_offload(&s->nc, 0, 0, 0, 0, 0);
     tap_read_poll(s, 1);
+    s->vhost_net = NULL;
     return s;
 }
 
@@ -456,5 +464,30 @@  int net_init_tap(QemuOpts *opts, Monitor *mon, const char *name, VLANState *vlan
         }
     }
 
+    if (qemu_opt_get_bool(opts, "vhost", !!qemu_opt_get(opts, "vhostfd"))) {
+        int vhostfd, r;
+        if (qemu_opt_get(opts, "vhostfd")) {
+            r = net_handle_fd_param(mon, qemu_opt_get(opts, "vhostfd"));
+            if (r == -1) {
+                return -1;
+            }
+            vhostfd = r;
+        } else {
+            vhostfd = -1;
+        }
+        s->vhost_net = vhost_net_init(&s->nc, vhostfd);
+        if (!s->vhost_net) {
+            qemu_error("vhost-net requested but could not be initialized\n");
+            return -1;
+        }
+    } else if (qemu_opt_get(opts, "vhostfd")) {
+        qemu_error("vhostfd= is not valid without vhost\n");
+        return -1;
+    }
+
+    if (vlan) {
+        vlan->nb_host_devs++;
+    }
+
     return 0;
 }
diff --git a/qemu-options.hx b/qemu-options.hx
index f53922f..1850906 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -879,7 +879,7 @@  DEF("net", HAS_ARG, QEMU_OPTION_net,
     "-net tap[,vlan=n][,name=str],ifname=name\n"
     "                connect the host TAP network interface to VLAN 'n'\n"
 #else
-    "-net tap[,vlan=n][,name=str][,fd=h][,ifname=name][,script=file][,downscript=dfile][,sndbuf=nbytes][,vnet_hdr=on|off]\n"
+    "-net tap[,vlan=n][,name=str][,fd=h][,ifname=name][,script=file][,downscript=dfile][,sndbuf=nbytes][,vnet_hdr=on|off][,vhost=on|off][,vhostfd=h]\n"
     "                connect the host TAP network interface to VLAN 'n' and use the\n"
     "                network scripts 'file' (default=" DEFAULT_NETWORK_SCRIPT ")\n"
     "                and 'dfile' (default=" DEFAULT_NETWORK_DOWN_SCRIPT ")\n"
@@ -889,6 +889,8 @@  DEF("net", HAS_ARG, QEMU_OPTION_net,
     "                default of 'sndbuf=1048576' can be disabled using 'sndbuf=0')\n"
     "                use vnet_hdr=off to avoid enabling the IFF_VNET_HDR tap flag\n"
     "                use vnet_hdr=on to make the lack of IFF_VNET_HDR support an error condition\n"
+    "                use vhost=on to enable experimental in kernel accelerator\n"
+    "                use 'vhostfd=h' to connect to an already opened vhost net device\n"
 #endif
     "-net socket[,vlan=n][,name=str][,fd=h][,listen=[host]:port][,connect=host:port]\n"
     "                connect the vlan 'n' to another VLAN using a socket connection\n"