diff mbox series

[v7,12/21] multi-process: Connect Proxy Object with device in the remote process

Message ID 20f42fce1b701586a23c9abdb3b53d080845e94a.1593273671.git.elena.ufimtseva@oracle.com
State New
Headers show
Series Initial support for multi-process qemu | expand

Commit Message

Elena Ufimtseva June 27, 2020, 5:09 p.m. UTC
From: Jagannathan Raman <jag.raman@oracle.com>

Send a message to the remote process to connect PCI device with the
corresponding Proxy object in QEMU

Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
Signed-off-by: John G Johnson <john.g.johnson@oracle.com>
Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
---
 hw/i386/remote-msg.c     | 39 +++++++++++++++++++++++++++++++++++++++
 hw/pci/proxy.c           | 28 ++++++++++++++++++++++++++++
 include/hw/pci/proxy.h   |  1 +
 include/io/mpqemu-link.h |  1 +
 io/mpqemu-link.c         |  8 ++++++++
 5 files changed, 77 insertions(+)

Comments

Stefan Hajnoczi July 1, 2020, 9:20 a.m. UTC | #1
On Sat, Jun 27, 2020 at 10:09:34AM -0700, elena.ufimtseva@oracle.com wrote:
> From: Jagannathan Raman <jag.raman@oracle.com>
> 
> Send a message to the remote process to connect PCI device with the
> corresponding Proxy object in QEMU

I thought the protocol was simplified to a 1:1 device:socket model, but
this patch seems to implement an N:1 model?

In a 1:1 model the CONNECT_DEV message is not necessary because each
socket is already associated with a specific remote device (e.g. qemu -M
remote -object mplink,dev=lsi-scsi-1,sockpath=/tmp/lsi-scsi-1.sock).
Connecting to the socket already indicates which device we are talking
to.

The N:1 model will work but it's more complex. There is a main socket
that is used for CONNECT_DEV (anything else?) and we need to worry about
the lifecycle of the per-device sockets that are passed over the main
socket.

> @@ -50,3 +58,34 @@ gboolean mpqemu_process_msg(QIOChannel *ioc, GIOCondition cond,
>  
>      return TRUE;
>  }
> +
> +static void process_connect_dev_msg(MPQemuMsg *msg, QIOChannel *com,
> +                                    Error **errp)
> +{
> +    char *devid = (char *)msg->data2;
> +    QIOChannel *dioc = NULL;
> +    DeviceState *dev = NULL;
> +    MPQemuMsg ret = { 0 };
> +    int rc = 0;
> +
> +    g_assert(devid && (devid[msg->size - 1] == '\0'));

Asserts are not suitable for external input validation since a failure
aborts the program and lets the client cause a denial-of-service. When
there are multiple clients, one misbehaved client shouldn't be able to
kill the server. Please validate devid using an if statement and set
errp on failure.

Can msg->size be 0? If yes, this code accesses before the beginning of
the buffer.

> +
> +    dev = qdev_find_recursive(sysbus_get_default(), devid);
> +    if (!dev || !object_dynamic_cast(OBJECT(dev), TYPE_PCI_DEVICE)) {
> +        rc = 0xff;
> +        goto exit;
> +    }
> +
> +    dioc = qio_channel_new_fd(msg->fds[0], errp);

Missing error handling if qio_channel_new_fd() fails. We need to
close(msg->fds[0]) ourselves in this case.

> +
> +    qio_channel_add_watch(dioc, G_IO_IN | G_IO_HUP, mpqemu_process_msg,
> +                          (void *)dev, NULL);
> +
> +exit:
> +    ret.cmd = RET_MSG;
> +    ret.bytestream = 0;
> +    ret.data1.u64 = rc;
> +    ret.size = sizeof(ret.data1);
> +
> +    mpqemu_msg_send(&ret, com);
> +}
> diff --git a/hw/pci/proxy.c b/hw/pci/proxy.c
> index 6d62399c52..16649ed0ec 100644
> --- a/hw/pci/proxy.c
> +++ b/hw/pci/proxy.c
> @@ -15,10 +15,38 @@
>  #include "io/channel-util.h"
>  #include "hw/qdev-properties.h"
>  #include "monitor/monitor.h"
> +#include "io/mpqemu-link.h"
>  
>  static void proxy_set_socket(PCIProxyDev *pdev, int fd, Error **errp)
>  {
> +    DeviceState *dev = DEVICE(pdev);
> +    MPQemuMsg msg = { 0 };
> +    int fds[2];
> +    Error *local_err = NULL;
> +
>      pdev->com = qio_channel_new_fd(fd, errp);
> +
> +    if (socketpair(AF_UNIX, SOCK_STREAM, 0, fds)) {
> +        error_setg(errp, "Failed to create proxy channel with fd %d", fd);
> +        return;

pdev->com needs to be cleaned up.

> diff --git a/io/mpqemu-link.c b/io/mpqemu-link.c
> index 5887c8c6c0..54df3b254e 100644
> --- a/io/mpqemu-link.c
> +++ b/io/mpqemu-link.c
> @@ -234,6 +234,14 @@ bool mpqemu_msg_valid(MPQemuMsg *msg)
>              return false;
>          }
>          break;
> +    case CONNECT_DEV:
> +        if ((msg->num_fds != 1) ||
> +            (msg->fds[0] == -1) ||
> +            (msg->fds[0] == -1) ||

This line is duplicated.
Jag Raman July 24, 2020, 4:57 p.m. UTC | #2
> On Jul 1, 2020, at 5:20 AM, Stefan Hajnoczi <stefanha@redhat.com> wrote:
> 
> On Sat, Jun 27, 2020 at 10:09:34AM -0700, elena.ufimtseva@oracle.com wrote:
>> From: Jagannathan Raman <jag.raman@oracle.com>
>> 
>> Send a message to the remote process to connect PCI device with the
>> corresponding Proxy object in QEMU
> 
> I thought the protocol was simplified to a 1:1 device:socket model, but
> this patch seems to implement an N:1 model?

Hi Stefan,

Thanks for your feedback on all the patches. We were able to address most
of your feedback in this series, and we are looking forward to sending the next
version out for review.

We wanted to double check with you regarding this patch alone.

The protocol is still a 1:1 device:socket model. It’s not possible for a proxy device
object to send messages to arbitrary devices in the remote.

> 
> In a 1:1 model the CONNECT_DEV message is not necessary because each
> socket is already associated with a specific remote device (e.g. qemu -M
> remote -object mplink,dev=lsi-scsi-1,sockpath=/tmp/lsi-scsi-1.sock).
> Connecting to the socket already indicates which device we are talking
> to.
> 
> The N:1 model will work but it's more complex. There is a main socket
> that is used for CONNECT_DEV (anything else?) and we need to worry about
> the lifecycle of the per-device sockets that are passed over the main
> socket.

The main socket is only used for CONNECT_DEV. The CONNECT_DEV
message sticks a fd to the remote device.

We are using the following command-line in the remote process:
qemu-system-x86_64 -machine remote,fd=4 -device lsi53c895a,id=lsi1 ...

The alternative approach would be to let the orchestrator to assign fds for
each remote device. In this approach, we would specify an ‘fd’ for each
device object like below:
qemu-system-x86_64 -machine remote -device lsi53c895a,id=lsi1,fd=4 …

The alternative approach would entail changes to the DeviceState struct
and qdev_device_add() function, which we thought is not preferable. Could
you please share your thoughts on this?

Thanks!
—
Jag

> 
>> @@ -50,3 +58,34 @@ gboolean mpqemu_process_msg(QIOChannel *ioc, GIOCondition cond,
>> 
>>     return TRUE;
>> }
>> +
>> +static void process_connect_dev_msg(MPQemuMsg *msg, QIOChannel *com,
>> +                                    Error **errp)
>> +{
>> +    char *devid = (char *)msg->data2;
>> +    QIOChannel *dioc = NULL;
>> +    DeviceState *dev = NULL;
>> +    MPQemuMsg ret = { 0 };
>> +    int rc = 0;
>> +
>> +    g_assert(devid && (devid[msg->size - 1] == '\0'));
> 
> Asserts are not suitable for external input validation since a failure
> aborts the program and lets the client cause a denial-of-service. When
> there are multiple clients, one misbehaved client shouldn't be able to
> kill the server. Please validate devid using an if statement and set
> errp on failure.
> 
> Can msg->size be 0? If yes, this code accesses before the beginning of
> the buffer.
> 
>> +
>> +    dev = qdev_find_recursive(sysbus_get_default(), devid);
>> +    if (!dev || !object_dynamic_cast(OBJECT(dev), TYPE_PCI_DEVICE)) {
>> +        rc = 0xff;
>> +        goto exit;
>> +    }
>> +
>> +    dioc = qio_channel_new_fd(msg->fds[0], errp);
> 
> Missing error handling if qio_channel_new_fd() fails. We need to
> close(msg->fds[0]) ourselves in this case.
> 
>> +
>> +    qio_channel_add_watch(dioc, G_IO_IN | G_IO_HUP, mpqemu_process_msg,
>> +                          (void *)dev, NULL);
>> +
>> +exit:
>> +    ret.cmd = RET_MSG;
>> +    ret.bytestream = 0;
>> +    ret.data1.u64 = rc;
>> +    ret.size = sizeof(ret.data1);
>> +
>> +    mpqemu_msg_send(&ret, com);
>> +}
>> diff --git a/hw/pci/proxy.c b/hw/pci/proxy.c
>> index 6d62399c52..16649ed0ec 100644
>> --- a/hw/pci/proxy.c
>> +++ b/hw/pci/proxy.c
>> @@ -15,10 +15,38 @@
>> #include "io/channel-util.h"
>> #include "hw/qdev-properties.h"
>> #include "monitor/monitor.h"
>> +#include "io/mpqemu-link.h"
>> 
>> static void proxy_set_socket(PCIProxyDev *pdev, int fd, Error **errp)
>> {
>> +    DeviceState *dev = DEVICE(pdev);
>> +    MPQemuMsg msg = { 0 };
>> +    int fds[2];
>> +    Error *local_err = NULL;
>> +
>>     pdev->com = qio_channel_new_fd(fd, errp);
>> +
>> +    if (socketpair(AF_UNIX, SOCK_STREAM, 0, fds)) {
>> +        error_setg(errp, "Failed to create proxy channel with fd %d", fd);
>> +        return;
> 
> pdev->com needs to be cleaned up.
> 
>> diff --git a/io/mpqemu-link.c b/io/mpqemu-link.c
>> index 5887c8c6c0..54df3b254e 100644
>> --- a/io/mpqemu-link.c
>> +++ b/io/mpqemu-link.c
>> @@ -234,6 +234,14 @@ bool mpqemu_msg_valid(MPQemuMsg *msg)
>>             return false;
>>         }
>>         break;
>> +    case CONNECT_DEV:
>> +        if ((msg->num_fds != 1) ||
>> +            (msg->fds[0] == -1) ||
>> +            (msg->fds[0] == -1) ||
> 
> This line is duplicated.
Stefan Hajnoczi July 27, 2020, 1:18 p.m. UTC | #3
On Fri, Jul 24, 2020 at 12:57:22PM -0400, Jag Raman wrote:
> > On Jul 1, 2020, at 5:20 AM, Stefan Hajnoczi <stefanha@redhat.com> wrote:
> > On Sat, Jun 27, 2020 at 10:09:34AM -0700, elena.ufimtseva@oracle.com wrote:
> > In a 1:1 model the CONNECT_DEV message is not necessary because each
> > socket is already associated with a specific remote device (e.g. qemu -M
> > remote -object mplink,dev=lsi-scsi-1,sockpath=/tmp/lsi-scsi-1.sock).
> > Connecting to the socket already indicates which device we are talking
> > to.
> > 
> > The N:1 model will work but it's more complex. There is a main socket
> > that is used for CONNECT_DEV (anything else?) and we need to worry about
> > the lifecycle of the per-device sockets that are passed over the main
> > socket.
> 
> The main socket is only used for CONNECT_DEV. The CONNECT_DEV
> message sticks a fd to the remote device.
> 
> We are using the following command-line in the remote process:
> qemu-system-x86_64 -machine remote,fd=4 -device lsi53c895a,id=lsi1 ...
> 
> The alternative approach would be to let the orchestrator to assign fds for
> each remote device. In this approach, we would specify an ‘fd’ for each
> device object like below:
> qemu-system-x86_64 -machine remote -device lsi53c895a,id=lsi1,fd=4 …
> 
> The alternative approach would entail changes to the DeviceState struct
> and qdev_device_add() function, which we thought is not preferable. Could
> you please share your thoughts on this?

I suggest dropping multi-device support for now. It will be implemented
differently with VFIO-over-socket anyway, so it's not worth investing
much time into.

The main socket approach needs authentication support if multiple guests
share a remote device emulation process. Otherwise guest A can access
guest B's devices.

It's simpler if each device has a separate UNIX domain socket. It is not
necessary to modify lsi53c895a in order to do this. Either the socket
can be associated with the remote PCIe port (although I think the
current code implements the older PCI Local Bus instead of PCIe) or a
separate -object mpqlink,device=lsi1,fd=4 object can be defined (I think
that's the syntax I've shared in the past).

For now though, just using the -machine remote,fd=4 approach is fine -
but limited to 1 device.

Stefan
Michael S. Tsirkin July 27, 2020, 1:22 p.m. UTC | #4
On Mon, Jul 27, 2020 at 02:18:29PM +0100, Stefan Hajnoczi wrote:
> I suggest dropping multi-device support for now. It will be implemented
> differently with VFIO-over-socket anyway, so it's not worth investing
> much time into.

I'm not sure I buy the VFIO-over-socket yet. It seems a weirdly QEMU
specific IPC mechanism. However ...

> The main socket approach needs authentication support if multiple guests
> share a remote device emulation process. Otherwise guest A can access
> guest B's devices.
> 
> It's simpler if each device has a separate UNIX domain socket. It is not
> necessary to modify lsi53c895a in order to do this. Either the socket
> can be associated with the remote PCIe port (although I think the
> current code implements the older PCI Local Bus instead of PCIe) or a
> separate -object mpqlink,device=lsi1,fd=4 object can be defined (I think
> that's the syntax I've shared in the past).
> 
> For now though, just using the -machine remote,fd=4 approach is fine -
> but limited to 1 device.
> 
> Stefan

I agree to all of the above. Starting with a single fd is a good idea.
Jag Raman July 31, 2020, 6:31 p.m. UTC | #5
> On Jul 27, 2020, at 9:18 AM, Stefan Hajnoczi <stefanha@redhat.com> wrote:
> 
> On Fri, Jul 24, 2020 at 12:57:22PM -0400, Jag Raman wrote:
>>> On Jul 1, 2020, at 5:20 AM, Stefan Hajnoczi <stefanha@redhat.com> wrote:
>>> On Sat, Jun 27, 2020 at 10:09:34AM -0700, elena.ufimtseva@oracle.com wrote:
>>> In a 1:1 model the CONNECT_DEV message is not necessary because each
>>> socket is already associated with a specific remote device (e.g. qemu -M
>>> remote -object mplink,dev=lsi-scsi-1,sockpath=/tmp/lsi-scsi-1.sock).
>>> Connecting to the socket already indicates which device we are talking
>>> to.
>>> 
>>> The N:1 model will work but it's more complex. There is a main socket
>>> that is used for CONNECT_DEV (anything else?) and we need to worry about
>>> the lifecycle of the per-device sockets that are passed over the main
>>> socket.
>> 
>> The main socket is only used for CONNECT_DEV. The CONNECT_DEV
>> message sticks a fd to the remote device.
>> 
>> We are using the following command-line in the remote process:
>> qemu-system-x86_64 -machine remote,fd=4 -device lsi53c895a,id=lsi1 ...
>> 
>> The alternative approach would be to let the orchestrator to assign fds for
>> each remote device. In this approach, we would specify an ‘fd’ for each
>> device object like below:
>> qemu-system-x86_64 -machine remote -device lsi53c895a,id=lsi1,fd=4 …
>> 
>> The alternative approach would entail changes to the DeviceState struct
>> and qdev_device_add() function, which we thought is not preferable. Could
>> you please share your thoughts on this?
> 
> I suggest dropping multi-device support for now. It will be implemented
> differently with VFIO-over-socket anyway, so it's not worth investing
> much time into.
> 
> The main socket approach needs authentication support if multiple guests
> share a remote device emulation process. Otherwise guest A can access
> guest B's devices.
> 
> It's simpler if each device has a separate UNIX domain socket. It is not
> necessary to modify lsi53c895a in order to do this. Either the socket
> can be associated with the remote PCIe port (although I think the
> current code implements the older PCI Local Bus instead of PCIe) or a
> separate -object mpqlink,device=lsi1,fd=4 object can be defined (I think
> that's the syntax I've shared in the past).
> 
> For now though, just using the -machine remote,fd=4 approach is fine -
> but limited to 1 device.

Hi Stefan,

Thanks for your response.

We didn’t quite get the purpose of the “object” when you proposed the
command-line. But we understand it now.

We also agree to wait for the authentication protocol before enabling
multiple devices in the same remote process. Therefore we have limited
the number of devices to 1.

Even with one device, we still had the problem of attaching the file
descriptor with a device int the remote end. The object idea you proposed
seemed to fit well with connecting the fd with device. We have added this
object in v8, which we just sent out.

We have dropped the main-socket/control-channel as it’s not needed
anymore.

We look forward to your responses to v8.

Thank you very much!

> 
> Stefan
diff mbox series

Patch

diff --git a/hw/i386/remote-msg.c b/hw/i386/remote-msg.c
index 58e24ab2ad..68f50866bb 100644
--- a/hw/i386/remote-msg.c
+++ b/hw/i386/remote-msg.c
@@ -6,6 +6,11 @@ 
 #include "io/mpqemu-link.h"
 #include "qapi/error.h"
 #include "sysemu/runstate.h"
+#include "io/channel-util.h"
+#include "hw/pci/pci.h"
+
+static void process_connect_dev_msg(MPQemuMsg *msg, QIOChannel *com,
+                                    Error **errp);
 
 gboolean mpqemu_process_msg(QIOChannel *ioc, GIOCondition cond,
                             gpointer opaque)
@@ -34,6 +39,9 @@  gboolean mpqemu_process_msg(QIOChannel *ioc, GIOCondition cond,
     }
 
     switch (msg.cmd) {
+    case CONNECT_DEV:
+        process_connect_dev_msg(&msg, ioc, &local_err);
+        break;
     default:
         error_setg(&local_err, "Unknown command (%d) received from proxy \
                    in remote process pid=%d", msg.cmd, getpid());
@@ -50,3 +58,34 @@  gboolean mpqemu_process_msg(QIOChannel *ioc, GIOCondition cond,
 
     return TRUE;
 }
+
+static void process_connect_dev_msg(MPQemuMsg *msg, QIOChannel *com,
+                                    Error **errp)
+{
+    char *devid = (char *)msg->data2;
+    QIOChannel *dioc = NULL;
+    DeviceState *dev = NULL;
+    MPQemuMsg ret = { 0 };
+    int rc = 0;
+
+    g_assert(devid && (devid[msg->size - 1] == '\0'));
+
+    dev = qdev_find_recursive(sysbus_get_default(), devid);
+    if (!dev || !object_dynamic_cast(OBJECT(dev), TYPE_PCI_DEVICE)) {
+        rc = 0xff;
+        goto exit;
+    }
+
+    dioc = qio_channel_new_fd(msg->fds[0], errp);
+
+    qio_channel_add_watch(dioc, G_IO_IN | G_IO_HUP, mpqemu_process_msg,
+                          (void *)dev, NULL);
+
+exit:
+    ret.cmd = RET_MSG;
+    ret.bytestream = 0;
+    ret.data1.u64 = rc;
+    ret.size = sizeof(ret.data1);
+
+    mpqemu_msg_send(&ret, com);
+}
diff --git a/hw/pci/proxy.c b/hw/pci/proxy.c
index 6d62399c52..16649ed0ec 100644
--- a/hw/pci/proxy.c
+++ b/hw/pci/proxy.c
@@ -15,10 +15,38 @@ 
 #include "io/channel-util.h"
 #include "hw/qdev-properties.h"
 #include "monitor/monitor.h"
+#include "io/mpqemu-link.h"
 
 static void proxy_set_socket(PCIProxyDev *pdev, int fd, Error **errp)
 {
+    DeviceState *dev = DEVICE(pdev);
+    MPQemuMsg msg = { 0 };
+    int fds[2];
+    Error *local_err = NULL;
+
     pdev->com = qio_channel_new_fd(fd, errp);
+
+    if (socketpair(AF_UNIX, SOCK_STREAM, 0, fds)) {
+        error_setg(errp, "Failed to create proxy channel with fd %d", fd);
+        return;
+    }
+
+    msg.cmd = CONNECT_DEV;
+    msg.bytestream = 1;
+    msg.data2 = (uint8_t *)dev->id;
+    msg.size = strlen(dev->id) + 1;
+    msg.num_fds = 1;
+    msg.fds[0] = fds[1];
+
+    (void)mpqemu_msg_send_reply_co(&msg, pdev->com, &local_err);
+    if (local_err) {
+        error_setg(errp, "Failed to send DEV_CONNECT to the remote process");
+        close(fds[0]);
+    } else {
+        pdev->dev = qio_channel_new_fd(fds[0], errp);
+    }
+
+    close(fds[1]);
 }
 
 static Property proxy_properties[] = {
diff --git a/include/hw/pci/proxy.h b/include/hw/pci/proxy.h
index c1c7142fa2..72dd7e0944 100644
--- a/include/hw/pci/proxy.h
+++ b/include/hw/pci/proxy.h
@@ -30,6 +30,7 @@  typedef struct PCIProxyDev {
     PCIDevice parent_dev;
     char *fd;
     QIOChannel *com;
+    QIOChannel *dev;
 } PCIProxyDev;
 
 typedef struct PCIProxyDevClass {
diff --git a/include/io/mpqemu-link.h b/include/io/mpqemu-link.h
index c6d2b6bf8b..d620806c17 100644
--- a/include/io/mpqemu-link.h
+++ b/include/io/mpqemu-link.h
@@ -36,6 +36,7 @@ 
 typedef enum {
     INIT = 0,
     SYNC_SYSMEM,
+    CONNECT_DEV,
     RET_MSG,
     MAX = INT_MAX,
 } MPQemuCmd;
diff --git a/io/mpqemu-link.c b/io/mpqemu-link.c
index 5887c8c6c0..54df3b254e 100644
--- a/io/mpqemu-link.c
+++ b/io/mpqemu-link.c
@@ -234,6 +234,14 @@  bool mpqemu_msg_valid(MPQemuMsg *msg)
             return false;
         }
         break;
+    case CONNECT_DEV:
+        if ((msg->num_fds != 1) ||
+            (msg->fds[0] == -1) ||
+            (msg->fds[0] == -1) ||
+            !msg->bytestream) {
+            return false;
+        }
+        break;
     default:
         break;
     }