Patchwork [v2,4/5] backdoor: [softmmu] Add QEMU-side proxy to "libbackdoor.a"

login
register
mail settings
Submitter Lluís Vilanova
Date Dec. 5, 2011, 10:23 p.m.
Message ID <20111205222312.31271.66303.stgit@ginnungagap.bsc.es>
Download mbox | patch
Permalink /patch/129454/
State New
Headers show

Comments

Lluís Vilanova - Dec. 5, 2011, 10:23 p.m.
Uses a virtual device to proxy uses of the backdoor communication channel to the
user-provided code.

Signed-off-by: Lluís Vilanova <vilanova@ac.upc.edu>
---
 Makefile.objs           |    1 
 backdoor/qemu/softmmu.c |  186 +++++++++++++++++++++++++++++++++++++++++++++++
 hw/pci.h                |    1 
 3 files changed, 188 insertions(+), 0 deletions(-)
 create mode 100644 backdoor/qemu/softmmu.c
Anthony Liguori - Dec. 6, 2011, 7:55 p.m.
I really worry about us introducing so many of these one-off paravirtual devices.

I would much prefer that you look at doing this as an extension to the ivshmem 
device as it already has this sort of scope.  You should be able to do this by 
just extending the size of bar 1 and using a well known guest id.

Regards,

Anthony Liguori

On 12/05/2011 04:23 PM, Lluís Vilanova wrote:
> Uses a virtual device to proxy uses of the backdoor communication channel to the
> user-provided code.
>
> Signed-off-by: Lluís Vilanova<vilanova@ac.upc.edu>
> ---
>   Makefile.objs           |    1
>   backdoor/qemu/softmmu.c |  186 +++++++++++++++++++++++++++++++++++++++++++++++
>   hw/pci.h                |    1
>   3 files changed, 188 insertions(+), 0 deletions(-)
>   create mode 100644 backdoor/qemu/softmmu.c
>
> diff --git a/Makefile.objs b/Makefile.objs
> index 9784441..a45ff56 100644
> --- a/Makefile.objs
> +++ b/Makefile.objs
> @@ -401,6 +401,7 @@ $(trace-obj-y): $(GENERATED_HEADERS)
>   # backdoor
>
>   backdoor-nested-$(CONFIG_USER_ONLY) += user.o
> +backdoor-nested-$(CONFIG_SOFTMMU) += softmmu.o
>
>   backdoor-obj-y += $(addprefix backdoor/qemu/, $(backdoor-nested-y))
>
> diff --git a/backdoor/qemu/softmmu.c b/backdoor/qemu/softmmu.c
> new file mode 100644
> index 0000000..9cde59f
> --- /dev/null
> +++ b/backdoor/qemu/softmmu.c
> @@ -0,0 +1,186 @@
> +/*
> + * QEMU-side management of backdoor channels in softmmu emulation.
> + *
> + * Copyright (C) 2011 Lluís Vilanova<vilanova@ac.upc.edu>
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2 or later.
> + * See the COPYING file in the top-level directory.
> + */
> +
> +#include "hw/pci.h"
> +#include "backdoor/qemu/qemu-backdoor.h"
> +
> +
> +#define PAGE_SIZE TARGET_PAGE_SIZE
> +
> +
> +typedef struct BackdoorState
> +{
> +    PCIDevice dev;
> +
> +    uint8_t pages;
> +    uint64_t size;
> +
> +    union
> +    {
> +        uint64_t v;
> +        char     a[8];
> +    } c_size;
> +    union
> +    {
> +        uint64_t v;
> +        uint8_t  a[8];
> +    } c_cmd;
> +
> +    void *data_ptr;
> +    MemoryRegion data;
> +    MemoryRegion control;
> +} BackdoorState;
> +
> +
> +static uint64_t backdoor_control_io_read(void *opaque, target_phys_addr_t addr, unsigned size)
> +{
> +    BackdoorState *s = opaque;
> +
> +    /* c_size already has target endianess */
> +
> +    switch (size) {
> +    case 1:
> +    {
> +        uint8_t *res = (uint8_t*)&s->c_size.a[addr % sizeof(uint64_t)];
> +        return *res;
> +    }
> +    case 2:
> +    {
> +        uint16_t *res = (uint16_t*)&s->c_size.a[addr % sizeof(uint64_t)];
> +        return *res;
> +    }
> +    case 4:
> +    {
> +        uint32_t *res = (uint32_t*)&s->c_size.a[addr % sizeof(uint64_t)];
> +        return *res;
> +    }
> +    case 8:
> +    {
> +        uint64_t *res = (uint64_t*)&s->c_size.a[addr % sizeof(uint64_t)];
> +        return *res;
> +    }
> +    default:
> +        fprintf(stderr, "error: backdoor: Unexpected read of size %d\n", size);
> +        abort();
> +    }
> +}
> +
> +static void backdoor_control_io_write(void *opaque, target_phys_addr_t addr, uint64_t data, unsigned size)
> +{
> +    BackdoorState *s = opaque;
> +
> +    /* c_cmd will have target endianess (left up to the user) */
> +
> +    switch (size) {
> +    case 1:
> +    {
> +        uint8_t *res = (uint8_t*)&s->c_cmd.a[addr % sizeof(uint64_t)];
> +        *res = (uint8_t)data;
> +        break;
> +    }
> +    case 2:
> +    {
> +        uint16_t *res = (uint16_t*)&s->c_cmd.a[addr % sizeof(uint64_t)];
> +        *res = (uint16_t)data;
> +        break;
> +    }
> +    case 4:
> +    {
> +        uint32_t *res = (uint32_t*)&s->c_cmd.a[addr % sizeof(uint64_t)];
> +        *res = (uint32_t)data;
> +        break;
> +    }
> +    case 8:
> +    {
> +        uint64_t *res = (uint64_t*)&s->c_cmd.a[addr % sizeof(uint64_t)];
> +        *res = (uint64_t)data;
> +        break;
> +    }
> +    default:
> +        fprintf(stderr, "error: backdoor: Unexpected write of size %d\n", size);
> +        abort();
> +    }
> +
> +    if ((addr + size) % sizeof(s->c_cmd.v) == 0) {
> +        qemu_backdoor(s->c_cmd.v, s->data_ptr);
> +    }
> +}
> +
> +static const MemoryRegionOps backdoor_control_ops = {
> +    .read = backdoor_control_io_read,
> +    .write = backdoor_control_io_write,
> +    .endianness = DEVICE_NATIVE_ENDIAN,
> +    .impl = {
> +        .min_access_size = 1,
> +        .max_access_size = 8,
> +    },
> +};
> +
> +
> +static int backdoor_init(PCIDevice *dev)
> +{
> +    BackdoorState *s = DO_UPCAST(BackdoorState, dev, dev);
> +
> +    if (s->pages<  1) {
> +        fprintf(stderr, "error: backdoor: "
> +                "the data channel must have one or more pages\n");
> +        return -1;
> +    }
> +    s->size = s->pages * PAGE_SIZE;
> +    s->c_size.v = tswap64(s->size);
> +
> +    pci_set_word(s->dev.config + PCI_COMMAND,
> +                 PCI_COMMAND_IO | PCI_COMMAND_MEMORY);
> +
> +    memory_region_init_io(&s->control,&backdoor_control_ops, s,
> +                          "backdoor.control", PAGE_SIZE);
> +    pci_register_bar(&s->dev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY,&s->control);
> +
> +    memory_region_init_ram(&s->data,&s->dev.qdev,
> +                           "backdoor.data", s->c_size.v);
> +    pci_register_bar(&s->dev, 1, PCI_BASE_ADDRESS_SPACE_MEMORY,&s->data);
> +    s->data_ptr = qemu_get_ram_ptr(s->data.ram_addr);
> +
> +    qemu_backdoor_init(s->c_size.v);
> +
> +    return 0;
> +}
> +
> +static int backdoor_fini(PCIDevice *dev)
> +{
> +    BackdoorState *s = DO_UPCAST(BackdoorState, dev, dev);
> +
> +    memory_region_destroy(&s->data);
> +    memory_region_destroy(&s->control);
> +
> +    return 0;
> +}
> +
> +
> +static PCIDeviceInfo backdoor_info = {
> +    .qdev.name  = "backdoor",
> +    .qdev.desc  = "Backdoor communication channel",
> +    .qdev.size  = sizeof(BackdoorState),
> +    .init       = backdoor_init,
> +    .exit       = backdoor_fini,
> +    .vendor_id  = PCI_VENDOR_ID_REDHAT_QUMRANET,
> +    .device_id  = PCI_DEVICE_ID_BACKDOOR,
> +    .class_id   = PCI_CLASS_MEMORY_RAM,
> +    .qdev.props = (Property[]) {
> +        DEFINE_PROP_UINT8("pages", BackdoorState, pages, 1),
> +        DEFINE_PROP_END_OF_LIST(),
> +    }
> +};
> +
> +static void backdoor_register_device(void)
> +{
> +    pci_qdev_register(&backdoor_info);
> +}
> +
> +device_init(backdoor_register_device)
> diff --git a/hw/pci.h b/hw/pci.h
> index 625e717..e7dc3cb 100644
> --- a/hw/pci.h
> +++ b/hw/pci.h
> @@ -75,6 +75,7 @@
>   #define PCI_DEVICE_ID_VIRTIO_BLOCK       0x1001
>   #define PCI_DEVICE_ID_VIRTIO_BALLOON     0x1002
>   #define PCI_DEVICE_ID_VIRTIO_CONSOLE     0x1003
> +#define PCI_DEVICE_ID_BACKDOOR           0x1004
>
>   #define FMT_PCIBUS                      PRIx64
>
>
>
Lluís Vilanova - Dec. 6, 2011, 10:30 p.m.
Anthony Liguori writes:

> I really worry about us introducing so many of these one-off paravirtual devices.
> I would much prefer that you look at doing this as an extension to the ivshmem
> device as it already has this sort of scope.  You should be able to do this by
> just extending the size of bar 1 and using a well known guest id.

I did in fact look at ivshmem some time ago, and it's true that both use the
same mechanisms; but each device has a completely different purpose. To me it
just seems that extending the control BAR in ivshmem to call the user-provided
backdoor callbacks is just conflating two completely separate devices into a
single one. Besides, I think that the qemu-side of the backdoor is simple enough
to avoid being a maintenance burden.

Another question would be to join both so that the backdoor can be used to
orchestrate operations between multiple VMs through ivshmem's server, but I
really have no time to look into that and don't even know whether it would then
make sense to join both devices.


Thanks,
  Lluis
Anthony Liguori - Dec. 6, 2011, 10:35 p.m.
On 12/06/2011 04:30 PM, Lluís Vilanova wrote:
> Anthony Liguori writes:
>
>> I really worry about us introducing so many of these one-off paravirtual devices.
>> I would much prefer that you look at doing this as an extension to the ivshmem
>> device as it already has this sort of scope.  You should be able to do this by
>> just extending the size of bar 1 and using a well known guest id.
>
> I did in fact look at ivshmem some time ago, and it's true that both use the
> same mechanisms; but each device has a completely different purpose. To me it
> just seems that extending the control BAR in ivshmem to call the user-provided
> backdoor callbacks is just conflating two completely separate devices into a
> single one. Besides, I think that the qemu-side of the backdoor is simple enough
> to avoid being a maintenance burden.

They have the same purpose (which are both vague TBH).  The only reason I'm 
sympathetic to this device is that virtio-serial has such insane semantics.

It's bad enough we already have two "backdoor" mechanisms, adding a third seems 
insane to me.

Regards,

Anthony Liguori

>
> Another question would be to join both so that the backdoor can be used to
> orchestrate operations between multiple VMs through ivshmem's server, but I
> really have no time to look into that and don't even know whether it would then
> make sense to join both devices.
>
>
> Thanks,
>    Lluis
>
Peter Maydell - Dec. 6, 2011, 10:37 p.m.
2011/12/6 Anthony Liguori <anthony@codemonkey.ws>:
> It's bad enough we already have two "backdoor" mechanisms, adding a third
> seems insane to me.

Isn't there yet another one out-of-tree in the android emulator?

-- PMM
Lluís Vilanova - Dec. 6, 2011, 10:39 p.m.
Lluís Vilanova writes:

> Anthony Liguori writes:
>> I really worry about us introducing so many of these one-off paravirtual devices.
>> I would much prefer that you look at doing this as an extension to the ivshmem
>> device as it already has this sort of scope.  You should be able to do this by
>> just extending the size of bar 1 and using a well known guest id.

> I did in fact look at ivshmem some time ago, and it's true that both use the
> same mechanisms; but each device has a completely different purpose. To me it
> just seems that extending the control BAR in ivshmem to call the user-provided
> backdoor callbacks is just conflating two completely separate devices into a
> single one. Besides, I think that the qemu-side of the backdoor is simple enough
> to avoid being a maintenance burden.

> Another question would be to join both so that the backdoor can be used to
> orchestrate operations between multiple VMs through ivshmem's server, but I
> really have no time to look into that and don't even know whether it would then
> make sense to join both devices.

BTW, I think that having the softmmu-side of the backdoor inside ivshmem would
in fact be a simple change. I just want to make sure whether the reason to merge
both is about minimising code maintenance or rather thinking that it would make
more sense to have both as a single pack of functionalities.


Thanks,
  Lluis
Markus Armbruster - Dec. 7, 2011, 8:21 a.m.
Anthony Liguori <anthony@codemonkey.ws> writes:

> On 12/06/2011 04:30 PM, Lluís Vilanova wrote:
>> Anthony Liguori writes:
>>
>>> I really worry about us introducing so many of these one-off paravirtual devices.
>>> I would much prefer that you look at doing this as an extension to the ivshmem
>>> device as it already has this sort of scope.  You should be able to do this by
>>> just extending the size of bar 1 and using a well known guest id.
>>
>> I did in fact look at ivshmem some time ago, and it's true that both use the
>> same mechanisms; but each device has a completely different purpose. To me it
>> just seems that extending the control BAR in ivshmem to call the user-provided
>> backdoor callbacks is just conflating two completely separate devices into a
>> single one. Besides, I think that the qemu-side of the backdoor is simple enough
>> to avoid being a maintenance burden.
>
> They have the same purpose (which are both vague TBH).  The only
> reason I'm sympathetic to this device is that virtio-serial has such
> insane semantics.

Could you summarize what's wrong?  Is it fixable?

[...]
Anthony Liguori - Dec. 7, 2011, 1:49 p.m.
On 12/07/2011 02:21 AM, Markus Armbruster wrote:
> Anthony Liguori<anthony@codemonkey.ws>  writes:
>
>> On 12/06/2011 04:30 PM, Lluís Vilanova wrote:
>>> Anthony Liguori writes:
>>>
>>>> I really worry about us introducing so many of these one-off paravirtual devices.
>>>> I would much prefer that you look at doing this as an extension to the ivshmem
>>>> device as it already has this sort of scope.  You should be able to do this by
>>>> just extending the size of bar 1 and using a well known guest id.
>>>
>>> I did in fact look at ivshmem some time ago, and it's true that both use the
>>> same mechanisms; but each device has a completely different purpose. To me it
>>> just seems that extending the control BAR in ivshmem to call the user-provided
>>> backdoor callbacks is just conflating two completely separate devices into a
>>> single one. Besides, I think that the qemu-side of the backdoor is simple enough
>>> to avoid being a maintenance burden.
>>
>> They have the same purpose (which are both vague TBH).  The only
>> reason I'm sympathetic to this device is that virtio-serial has such
>> insane semantics.
>
> Could you summarize what's wrong?  Is it fixable?

I don't think so as it's part of the userspace ABI now.

Mike, please help me make sure I get this all right.  A normal file/socket has 
the following guest semantics:

1) When a disconnect occurs, you will receive a return of '0' or -EPIPE 
depending on the platform.  The fd is now unusable and you must close/reopen.

2) You can setup SIGIO/SIGPIPE to fire off whenever a file descriptor becomes 
readable/writable.

virtio serial has the following semantics:

1) When a disconnect occurs, if you read() you will receive an -EPIPE.

2) However, if a reconnect occurs before you issue your read(), the read will 
complete with no indication that a disconnect occurred.

3) This makes it impossible to determine whether a disconnect has occurred which 
makes it very hard to reset your protocol stream.  To deal with this, 
virtio-serial can issue a SIGIO signal upon disconnect.

4) Signals are asynchronous, so a reconnect may have occurred by the time you 
get the SIGIO signal.  It's unclear that you can do anything useful with this.

So besides overloading the meaning of SIGIO, there's really no way to figure out 
in the guest when a reconnect has occurred.  To deal with this in qemu-ga, we 
actually only allow 7-bit data transfers and use the 8th bit as an in-band 
message to tell the guest that a reset has occurred.

Regards,

Anthony Liguori

>
> [...]
>
Michael Roth - Dec. 7, 2011, 7:44 p.m.
On 12/07/2011 07:49 AM, Anthony Liguori wrote:
> On 12/07/2011 02:21 AM, Markus Armbruster wrote:
>> Anthony Liguori<anthony@codemonkey.ws> writes:
>>
>>> On 12/06/2011 04:30 PM, Lluís Vilanova wrote:
>>>> Anthony Liguori writes:
>>>>
>>>>> I really worry about us introducing so many of these one-off
>>>>> paravirtual devices.
>>>>> I would much prefer that you look at doing this as an extension to
>>>>> the ivshmem
>>>>> device as it already has this sort of scope. You should be able to
>>>>> do this by
>>>>> just extending the size of bar 1 and using a well known guest id.
>>>>
>>>> I did in fact look at ivshmem some time ago, and it's true that both
>>>> use the
>>>> same mechanisms; but each device has a completely different purpose.
>>>> To me it
>>>> just seems that extending the control BAR in ivshmem to call the
>>>> user-provided
>>>> backdoor callbacks is just conflating two completely separate
>>>> devices into a
>>>> single one. Besides, I think that the qemu-side of the backdoor is
>>>> simple enough
>>>> to avoid being a maintenance burden.
>>>
>>> They have the same purpose (which are both vague TBH). The only
>>> reason I'm sympathetic to this device is that virtio-serial has such
>>> insane semantics.
>>
>> Could you summarize what's wrong? Is it fixable?
>
> I don't think so as it's part of the userspace ABI now.
>
> Mike, please help me make sure I get this all right. A normal
> file/socket has the following guest semantics:
>
> 1) When a disconnect occurs, you will receive a return of '0' or -EPIPE
> depending on the platform. The fd is now unusable and you must
> close/reopen.
>
> 2) You can setup SIGIO/SIGPIPE to fire off whenever a file descriptor
> becomes readable/writable.
>
> virtio serial has the following semantics:
>
> 1) When a disconnect occurs, if you read() you will receive an -EPIPE.
>
> 2) However, if a reconnect occurs before you issue your read(), the read
> will complete with no indication that a disconnect occurred.
>
> 3) This makes it impossible to determine whether a disconnect has
> occurred which makes it very hard to reset your protocol stream. To deal
> with this, virtio-serial can issue a SIGIO signal upon disconnect.
>
> 4) Signals are asynchronous, so a reconnect may have occurred by the
> time you get the SIGIO signal. It's unclear that you can do anything
> useful with this.

That about sums it up. There was a thread about this a while back where 
there was some tentative agreement on a way to fix this by introducing 
QEMU flags that invoke similar semantics to unix sockets:

http://thread.gmane.org/gmane.comp.emulators.qemu/94721/focus=95496

But at this point we'd need to re-visit, since there's a fair number of 
virtio-serial users now. It'd probably need to be something you could 
switch on from the guest via an fcntl() or something.

>
> So besides overloading the meaning of SIGIO, there's really no way to
> figure out in the guest when a reconnect has occurred. To deal with this
> in qemu-ga, we actually only allow 7-bit data transfers and use the 8th
> bit as an in-band message to tell the guest that a reset has occurred.

Yup, it's not perfect though, due to a delayed/spurious response from an 
agent that sent data before it read/handled the reset sequence. We don't 
get that problem with unix sockets since they'd get an -EPIPE and would 
be blocked from sending to a newly opened session.

We try to account for this on the host by following up a reset sequences 
will the guest-sync RPC, which contains a unique ID that the guest echos 
back to us. That way we can throw away stale data on the host until we 
get the intended response. In our case, it's not quite perfect since if 
the agent sent a "{" before getting reset, subsequent transmission of 
the guest-sync response can be lost. We'd need to precede responses to 
guest-sync with a 0xFF as well, so that the host flushes it's rcv 
buffer/parser state...

And, somewhat off-topic, but none of addresses the case where an agent 
hangs on an RPC. This would require some additional handling by the 
agent side where we might have tie some additional action to the 0xFF 
sequence.

Previously this scenario was handled by a hard-coded timeout mechanism 
in the agent, with a seperate thread handling the RPCs, but we've since 
dropped the thread due to potential for memory leaks (with plans to 
re-introduce using a child process).

client-induced resets would be much nicer though, and a reserved byte is 
the best solution we've been able to come up with given the current 
virtio-serial semantics.

>
> Regards,
>
> Anthony Liguori
>
>>
>> [...]
>>
>
Anthony Liguori - Dec. 7, 2011, 7:53 p.m.
On 12/07/2011 01:44 PM, Michael Roth wrote:
> On 12/07/2011 07:49 AM, Anthony Liguori wrote:
>> On 12/07/2011 02:21 AM, Markus Armbruster wrote:
>>> Anthony Liguori<anthony@codemonkey.ws> writes:
>>>
>>>> On 12/06/2011 04:30 PM, Lluís Vilanova wrote:
>>>>> Anthony Liguori writes:
>>>>>
>>>>>> I really worry about us introducing so many of these one-off
>>>>>> paravirtual devices.
>>>>>> I would much prefer that you look at doing this as an extension to
>>>>>> the ivshmem
>>>>>> device as it already has this sort of scope. You should be able to
>>>>>> do this by
>>>>>> just extending the size of bar 1 and using a well known guest id.
>>>>>
>>>>> I did in fact look at ivshmem some time ago, and it's true that both
>>>>> use the
>>>>> same mechanisms; but each device has a completely different purpose.
>>>>> To me it
>>>>> just seems that extending the control BAR in ivshmem to call the
>>>>> user-provided
>>>>> backdoor callbacks is just conflating two completely separate
>>>>> devices into a
>>>>> single one. Besides, I think that the qemu-side of the backdoor is
>>>>> simple enough
>>>>> to avoid being a maintenance burden.
>>>>
>>>> They have the same purpose (which are both vague TBH). The only
>>>> reason I'm sympathetic to this device is that virtio-serial has such
>>>> insane semantics.
>>>
>>> Could you summarize what's wrong? Is it fixable?
>>
>> I don't think so as it's part of the userspace ABI now.
>>
>> Mike, please help me make sure I get this all right. A normal
>> file/socket has the following guest semantics:
>>
>> 1) When a disconnect occurs, you will receive a return of '0' or -EPIPE
>> depending on the platform. The fd is now unusable and you must
>> close/reopen.
>>
>> 2) You can setup SIGIO/SIGPIPE to fire off whenever a file descriptor
>> becomes readable/writable.
>>
>> virtio serial has the following semantics:
>>
>> 1) When a disconnect occurs, if you read() you will receive an -EPIPE.
>>
>> 2) However, if a reconnect occurs before you issue your read(), the read
>> will complete with no indication that a disconnect occurred.
>>
>> 3) This makes it impossible to determine whether a disconnect has
>> occurred which makes it very hard to reset your protocol stream. To deal
>> with this, virtio-serial can issue a SIGIO signal upon disconnect.
>>
>> 4) Signals are asynchronous, so a reconnect may have occurred by the
>> time you get the SIGIO signal. It's unclear that you can do anything
>> useful with this.
>
> That about sums it up. There was a thread about this a while back where there
> was some tentative agreement on a way to fix this by introducing QEMU flags that
> invoke similar semantics to unix sockets:
>
> http://thread.gmane.org/gmane.comp.emulators.qemu/94721/focus=95496
>
> But at this point we'd need to re-visit, since there's a fair number of
> virtio-serial users now. It'd probably need to be something you could switch on
> from the guest via an fcntl() or something.
>
>>
>> So besides overloading the meaning of SIGIO, there's really no way to
>> figure out in the guest when a reconnect has occurred. To deal with this
>> in qemu-ga, we actually only allow 7-bit data transfers and use the 8th
>> bit as an in-band message to tell the guest that a reset has occurred.
>
> Yup, it's not perfect though, due to a delayed/spurious response from an agent
> that sent data before it read/handled the reset sequence. We don't get that
> problem with unix sockets since they'd get an -EPIPE and would be blocked from
> sending to a newly opened session.
>
> We try to account for this on the host by following up a reset sequences will
> the guest-sync RPC, which contains a unique ID that the guest echos back to us.
> That way we can throw away stale data on the host until we get the intended
> response. In our case, it's not quite perfect since if the agent sent a "{"
> before getting reset, subsequent transmission of the guest-sync response can be
> lost. We'd need to precede responses to guest-sync with a 0xFF as well, so that
> the host flushes it's rcv buffer/parser state...
>
> And, somewhat off-topic, but none of addresses the case where an agent hangs on
> an RPC. This would require some additional handling by the agent side where we
> might have tie some additional action to the 0xFF sequence.
>
> Previously this scenario was handled by a hard-coded timeout mechanism in the
> agent, with a seperate thread handling the RPCs, but we've since dropped the
> thread due to potential for memory leaks (with plans to re-introduce using a
> child process).
>
> client-induced resets would be much nicer though, and a reserved byte is the
> best solution we've been able to come up with given the current virtio-serial
> semantics.

Yeah, we really need a "sane reset semantics" flag for virtio-serial that 
provides a guest and host initiated channel close mechanism.

I think you need to do this by using a single ring and using a simple session id 
with an explicit open/close message.  That way there is never ambiguity.

And yes, I can't help but think of Dave Millers comments long ago that any PV 
transport is eventually going to reinvent TCP, poorly.

Regards,

Anthony Liguori

>
>>
>> Regards,
>>
>> Anthony Liguori
>>
>>>
>>> [...]
>>>
>>
>
>
Markus Armbruster - Dec. 8, 2011, 10:11 a.m.
Anthony Liguori <anthony@codemonkey.ws> writes:

> On 12/07/2011 01:44 PM, Michael Roth wrote:
>> On 12/07/2011 07:49 AM, Anthony Liguori wrote:
>>> On 12/07/2011 02:21 AM, Markus Armbruster wrote:
>>>> Anthony Liguori<anthony@codemonkey.ws> writes:
[...]
>>>>> They have the same purpose (which are both vague TBH). The only
>>>>> reason I'm sympathetic to this device is that virtio-serial has such
>>>>> insane semantics.
>>>>
>>>> Could you summarize what's wrong? Is it fixable?
>>>
>>> I don't think so as it's part of the userspace ABI now.
>>>
>>> Mike, please help me make sure I get this all right. A normal
>>> file/socket has the following guest semantics:
>>>
>>> 1) When a disconnect occurs, you will receive a return of '0' or -EPIPE
>>> depending on the platform. The fd is now unusable and you must
>>> close/reopen.
>>>
>>> 2) You can setup SIGIO/SIGPIPE to fire off whenever a file descriptor
>>> becomes readable/writable.
>>>
>>> virtio serial has the following semantics:
>>>
>>> 1) When a disconnect occurs, if you read() you will receive an -EPIPE.
>>>
>>> 2) However, if a reconnect occurs before you issue your read(), the read
>>> will complete with no indication that a disconnect occurred.
>>>
>>> 3) This makes it impossible to determine whether a disconnect has
>>> occurred which makes it very hard to reset your protocol stream. To deal
>>> with this, virtio-serial can issue a SIGIO signal upon disconnect.
>>>
>>> 4) Signals are asynchronous, so a reconnect may have occurred by the
>>> time you get the SIGIO signal. It's unclear that you can do anything
>>> useful with this.
>>
>> That about sums it up. There was a thread about this a while back where there
>> was some tentative agreement on a way to fix this by introducing QEMU flags that
>> invoke similar semantics to unix sockets:
>>
>> http://thread.gmane.org/gmane.comp.emulators.qemu/94721/focus=95496
>>
>> But at this point we'd need to re-visit, since there's a fair number of
>> virtio-serial users now. It'd probably need to be something you could switch on
>> from the guest via an fcntl() or something.
>>
>>>
>>> So besides overloading the meaning of SIGIO, there's really no way to
>>> figure out in the guest when a reconnect has occurred. To deal with this
>>> in qemu-ga, we actually only allow 7-bit data transfers and use the 8th
>>> bit as an in-band message to tell the guest that a reset has occurred.
>>
>> Yup, it's not perfect though, due to a delayed/spurious response from an agent
>> that sent data before it read/handled the reset sequence. We don't get that
>> problem with unix sockets since they'd get an -EPIPE and would be blocked from
>> sending to a newly opened session.
>>
>> We try to account for this on the host by following up a reset sequences will
>> the guest-sync RPC, which contains a unique ID that the guest echos back to us.
>> That way we can throw away stale data on the host until we get the intended
>> response. In our case, it's not quite perfect since if the agent sent a "{"
>> before getting reset, subsequent transmission of the guest-sync response can be
>> lost. We'd need to precede responses to guest-sync with a 0xFF as well, so that
>> the host flushes it's rcv buffer/parser state...
>>
>> And, somewhat off-topic, but none of addresses the case where an agent hangs on
>> an RPC. This would require some additional handling by the agent side where we
>> might have tie some additional action to the 0xFF sequence.
>>
>> Previously this scenario was handled by a hard-coded timeout mechanism in the
>> agent, with a seperate thread handling the RPCs, but we've since dropped the
>> thread due to potential for memory leaks (with plans to re-introduce using a
>> child process).
>>
>> client-induced resets would be much nicer though, and a reserved byte is the
>> best solution we've been able to come up with given the current virtio-serial
>> semantics.
>
> Yeah, we really need a "sane reset semantics" flag for virtio-serial
> that provides a guest and host initiated channel close mechanism.
>
> I think you need to do this by using a single ring and using a simple
> session id with an explicit open/close message.  That way there is
> never ambiguity.

So it is fixable.

> And yes, I can't help but think of Dave Millers comments long ago that
> any PV transport is eventually going to reinvent TCP, poorly.

No doubt then, no doubt now.  But if I remember correctly, we didn't
create virtio-serial because we thought we could do better than TCP/IP.
We thought we need a zero-config communication channel that doesn't
interfere in any way with the guest's networking.  Since the network
folks were unwilling to give us one ("use TCP already"), we looked for
another bare metal thing to imitate, and chose serial lines.

Now, comparing serial lines to TCP/IP makes no sense.  Different layers.

Layering a real network protocol on top of serial line is possible; SLIP
exists.  But as long as we insist on "don't interfere in any way with
the guest's networking", we can't use TCP, and thus are doomed to
reinvent it, poorly.
Anthony Liguori - Dec. 8, 2011, 2:37 p.m.
On 12/08/2011 04:11 AM, Markus Armbruster wrote:
> Anthony Liguori<anthony@codemonkey.ws>  writes:
>> And yes, I can't help but think of Dave Millers comments long ago that
>> any PV transport is eventually going to reinvent TCP, poorly.
>
> No doubt then, no doubt now.  But if I remember correctly, we didn't
> create virtio-serial because we thought we could do better than TCP/IP.
> We thought we need a zero-config communication channel that doesn't
> interfere in any way with the guest's networking.  Since the network
> folks were unwilling to give us one ("use TCP already"), we looked for
> another bare metal thing to imitate, and chose serial lines.
>
> Now, comparing serial lines to TCP/IP makes no sense.  Different layers.

virtio-serial is not a serial line.  It attempts to have connection semantics 
(like a socket) which is what the fundamental problem is.

It probably makes sense to have a virtio-serial2 that is exposed to the guest as 
a tty device and stick strictly to serial semantics.

> Layering a real network protocol on top of serial line is possible; SLIP
> exists.  But as long as we insist on "don't interfere in any way with
> the guest's networking", we can't use TCP, and thus are doomed to
> reinvent it, poorly.

I think we just got too clever.

Regards,

Anthony Liguori

Patch

diff --git a/Makefile.objs b/Makefile.objs
index 9784441..a45ff56 100644
--- a/Makefile.objs
+++ b/Makefile.objs
@@ -401,6 +401,7 @@  $(trace-obj-y): $(GENERATED_HEADERS)
 # backdoor
 
 backdoor-nested-$(CONFIG_USER_ONLY) += user.o
+backdoor-nested-$(CONFIG_SOFTMMU) += softmmu.o
 
 backdoor-obj-y += $(addprefix backdoor/qemu/, $(backdoor-nested-y))
 
diff --git a/backdoor/qemu/softmmu.c b/backdoor/qemu/softmmu.c
new file mode 100644
index 0000000..9cde59f
--- /dev/null
+++ b/backdoor/qemu/softmmu.c
@@ -0,0 +1,186 @@ 
+/*
+ * QEMU-side management of backdoor channels in softmmu emulation.
+ *
+ * Copyright (C) 2011 Lluís Vilanova <vilanova@ac.upc.edu>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "hw/pci.h"
+#include "backdoor/qemu/qemu-backdoor.h"
+
+
+#define PAGE_SIZE TARGET_PAGE_SIZE
+
+
+typedef struct BackdoorState
+{
+    PCIDevice dev;
+
+    uint8_t pages;
+    uint64_t size;
+
+    union
+    {
+        uint64_t v;
+        char     a[8];
+    } c_size;
+    union
+    {
+        uint64_t v;
+        uint8_t  a[8];
+    } c_cmd;
+
+    void *data_ptr;
+    MemoryRegion data;
+    MemoryRegion control;
+} BackdoorState;
+
+
+static uint64_t backdoor_control_io_read(void *opaque, target_phys_addr_t addr, unsigned size)
+{
+    BackdoorState *s = opaque;
+
+    /* c_size already has target endianess */
+
+    switch (size) {
+    case 1:
+    {
+        uint8_t *res = (uint8_t*)&s->c_size.a[addr % sizeof(uint64_t)];
+        return *res;
+    }
+    case 2:
+    {
+        uint16_t *res = (uint16_t*)&s->c_size.a[addr % sizeof(uint64_t)];
+        return *res;
+    }
+    case 4:
+    {
+        uint32_t *res = (uint32_t*)&s->c_size.a[addr % sizeof(uint64_t)];
+        return *res;
+    }
+    case 8:
+    {
+        uint64_t *res = (uint64_t*)&s->c_size.a[addr % sizeof(uint64_t)];
+        return *res;
+    }
+    default:
+        fprintf(stderr, "error: backdoor: Unexpected read of size %d\n", size);
+        abort();
+    }
+}
+
+static void backdoor_control_io_write(void *opaque, target_phys_addr_t addr, uint64_t data, unsigned size)
+{
+    BackdoorState *s = opaque;
+
+    /* c_cmd will have target endianess (left up to the user) */
+
+    switch (size) {
+    case 1:
+    {
+        uint8_t *res = (uint8_t*)&s->c_cmd.a[addr % sizeof(uint64_t)];
+        *res = (uint8_t)data;
+        break;
+    }
+    case 2:
+    {
+        uint16_t *res = (uint16_t*)&s->c_cmd.a[addr % sizeof(uint64_t)];
+        *res = (uint16_t)data;
+        break;
+    }
+    case 4:
+    {
+        uint32_t *res = (uint32_t*)&s->c_cmd.a[addr % sizeof(uint64_t)];
+        *res = (uint32_t)data;
+        break;
+    }
+    case 8:
+    {
+        uint64_t *res = (uint64_t*)&s->c_cmd.a[addr % sizeof(uint64_t)];
+        *res = (uint64_t)data;
+        break;
+    }
+    default:
+        fprintf(stderr, "error: backdoor: Unexpected write of size %d\n", size);
+        abort();
+    }
+
+    if ((addr + size) % sizeof(s->c_cmd.v) == 0) {
+        qemu_backdoor(s->c_cmd.v, s->data_ptr);
+    }
+}
+
+static const MemoryRegionOps backdoor_control_ops = {
+    .read = backdoor_control_io_read,
+    .write = backdoor_control_io_write,
+    .endianness = DEVICE_NATIVE_ENDIAN,
+    .impl = {
+        .min_access_size = 1,
+        .max_access_size = 8,
+    },
+};
+
+
+static int backdoor_init(PCIDevice *dev)
+{
+    BackdoorState *s = DO_UPCAST(BackdoorState, dev, dev);
+
+    if (s->pages < 1) {
+        fprintf(stderr, "error: backdoor: "
+                "the data channel must have one or more pages\n");
+        return -1;
+    }
+    s->size = s->pages * PAGE_SIZE;
+    s->c_size.v = tswap64(s->size);
+
+    pci_set_word(s->dev.config + PCI_COMMAND,
+                 PCI_COMMAND_IO | PCI_COMMAND_MEMORY);
+
+    memory_region_init_io(&s->control, &backdoor_control_ops, s,
+                          "backdoor.control", PAGE_SIZE);
+    pci_register_bar(&s->dev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY, &s->control);
+
+    memory_region_init_ram(&s->data, &s->dev.qdev,
+                           "backdoor.data", s->c_size.v);
+    pci_register_bar(&s->dev, 1, PCI_BASE_ADDRESS_SPACE_MEMORY, &s->data);
+    s->data_ptr = qemu_get_ram_ptr(s->data.ram_addr);
+
+    qemu_backdoor_init(s->c_size.v);
+
+    return 0;
+}
+
+static int backdoor_fini(PCIDevice *dev)
+{
+    BackdoorState *s = DO_UPCAST(BackdoorState, dev, dev);
+
+    memory_region_destroy(&s->data);
+    memory_region_destroy(&s->control);
+
+    return 0;
+}
+
+
+static PCIDeviceInfo backdoor_info = {
+    .qdev.name  = "backdoor",
+    .qdev.desc  = "Backdoor communication channel",
+    .qdev.size  = sizeof(BackdoorState),
+    .init       = backdoor_init,
+    .exit       = backdoor_fini,
+    .vendor_id  = PCI_VENDOR_ID_REDHAT_QUMRANET,
+    .device_id  = PCI_DEVICE_ID_BACKDOOR,
+    .class_id   = PCI_CLASS_MEMORY_RAM,
+    .qdev.props = (Property[]) {
+        DEFINE_PROP_UINT8("pages", BackdoorState, pages, 1),
+        DEFINE_PROP_END_OF_LIST(),
+    }
+};
+
+static void backdoor_register_device(void)
+{
+    pci_qdev_register(&backdoor_info);
+}
+
+device_init(backdoor_register_device)
diff --git a/hw/pci.h b/hw/pci.h
index 625e717..e7dc3cb 100644
--- a/hw/pci.h
+++ b/hw/pci.h
@@ -75,6 +75,7 @@ 
 #define PCI_DEVICE_ID_VIRTIO_BLOCK       0x1001
 #define PCI_DEVICE_ID_VIRTIO_BALLOON     0x1002
 #define PCI_DEVICE_ID_VIRTIO_CONSOLE     0x1003
+#define PCI_DEVICE_ID_BACKDOOR           0x1004
 
 #define FMT_PCIBUS                      PRIx64