diff mbox series

[7/7] vhost-user-blk: Implement reconnection during realize

Message ID 20210609154658.350308-8-kwolf@redhat.com
State New
Headers show
Series vhost-user-blk: Implement reconnection during realize | expand

Commit Message

Kevin Wolf June 9, 2021, 3:46 p.m. UTC
Commit dabefdd6 removed code that was supposed to try reconnecting
during .realize(), but actually just crashed and had several design
problems.

This adds the feature back without the crash in simple cases while also
fixing some design problems: Reconnection is now only tried if there was
a problem with the connection and not an error related to the content
(which would fail again the same way in the next attempt). Reconnection
is limited to three attempts (four with the initial attempt) so that we
won't end up in an infinite loop if a problem is permanent. If the
backend restarts three times in the very short time window of device
initialisation, we have bigger problems and erroring out is the right
course of action.

In the case that a connection error occurs and we reconnect, the error
message is printed using error_report_err(), but otherwise ignored.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 hw/block/vhost-user-blk.c | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

Comments

Stefano Garzarella June 10, 2021, 9:23 a.m. UTC | #1
On Wed, Jun 09, 2021 at 05:46:58PM +0200, Kevin Wolf wrote:
>Commit dabefdd6 removed code that was supposed to try reconnecting
>during .realize(), but actually just crashed and had several design
>problems.
>
>This adds the feature back without the crash in simple cases while also
>fixing some design problems: Reconnection is now only tried if there was
>a problem with the connection and not an error related to the content
>(which would fail again the same way in the next attempt). Reconnection
>is limited to three attempts (four with the initial attempt) so that we
>won't end up in an infinite loop if a problem is permanent. If the
>backend restarts three times in the very short time window of device
>initialisation, we have bigger problems and erroring out is the right
>course of action.
>
>In the case that a connection error occurs and we reconnect, the error
>message is printed using error_report_err(), but otherwise ignored.
>
>Signed-off-by: Kevin Wolf <kwolf@redhat.com>
>---
> hw/block/vhost-user-blk.c | 14 +++++++++++++-
> 1 file changed, 13 insertions(+), 1 deletion(-)
>
>diff --git a/hw/block/vhost-user-blk.c b/hw/block/vhost-user-blk.c
>index e49d2e4c83..f75a42bc62 100644
>--- a/hw/block/vhost-user-blk.c
>+++ b/hw/block/vhost-user-blk.c
>@@ -455,8 +455,10 @@ static int vhost_user_blk_realize_connect(VHostUserBlk *s, Error **errp)
>
> static void vhost_user_blk_device_realize(DeviceState *dev, Error **errp)
> {
>+    ERRP_GUARD();
>     VirtIODevice *vdev = VIRTIO_DEVICE(dev);
>     VHostUserBlk *s = VHOST_USER_BLK(vdev);
>+    int retries;
>     int i, ret;
>
>     if (!s->chardev.chr) {
>@@ -498,7 +500,17 @@ static void vhost_user_blk_device_realize(DeviceState *dev, Error **errp)
>     s->inflight = g_new0(struct vhost_inflight, 1);
>     s->vhost_vqs = g_new0(struct vhost_virtqueue, s->num_queues);
>
>-    ret = vhost_user_blk_realize_connect(s, errp);
>+    retries = 3;

Maybe we can add a macro for this with a comment with the information 
provided in this commit message.

Thanks,
Stefano
Raphael Norwitz June 11, 2021, 8:15 p.m. UTC | #2
On Wed, Jun 09, 2021 at 05:46:58PM +0200, Kevin Wolf wrote:
> Commit dabefdd6 removed code that was supposed to try reconnecting
> during .realize(), but actually just crashed and had several design
> problems.
> 
> This adds the feature back without the crash in simple cases while also
> fixing some design problems: Reconnection is now only tried if there was
> a problem with the connection and not an error related to the content
> (which would fail again the same way in the next attempt). Reconnection
> is limited to three attempts (four with the initial attempt) so that we
> won't end up in an infinite loop if a problem is permanent. If the
> backend restarts three times in the very short time window of device
> initialisation, we have bigger problems and erroring out is the right
> course of action.
> 
> In the case that a connection error occurs and we reconnect, the error
> message is printed using error_report_err(), but otherwise ignored.
> 
> Signed-off-by: Kevin Wolf <kwolf@redhat.com>

Reviewed-by: Raphael Norwitz <raphael.norwitz@nutanix.com>

> ---
>  hw/block/vhost-user-blk.c | 14 +++++++++++++-
>  1 file changed, 13 insertions(+), 1 deletion(-)
> 
> diff --git a/hw/block/vhost-user-blk.c b/hw/block/vhost-user-blk.c
> index e49d2e4c83..f75a42bc62 100644
> --- a/hw/block/vhost-user-blk.c
> +++ b/hw/block/vhost-user-blk.c
> @@ -455,8 +455,10 @@ static int vhost_user_blk_realize_connect(VHostUserBlk *s, Error **errp)
>  
>  static void vhost_user_blk_device_realize(DeviceState *dev, Error **errp)
>  {
> +    ERRP_GUARD();
>      VirtIODevice *vdev = VIRTIO_DEVICE(dev);
>      VHostUserBlk *s = VHOST_USER_BLK(vdev);
> +    int retries;
>      int i, ret;
>  
>      if (!s->chardev.chr) {
> @@ -498,7 +500,17 @@ static void vhost_user_blk_device_realize(DeviceState *dev, Error **errp)
>      s->inflight = g_new0(struct vhost_inflight, 1);
>      s->vhost_vqs = g_new0(struct vhost_virtqueue, s->num_queues);
>  
> -    ret = vhost_user_blk_realize_connect(s, errp);
> +    retries = 3;
> +    assert(!*errp);
> +    do {
> +        if (*errp) {
> +            error_prepend(errp, "Reconnecting after error: ");
> +            error_report_err(*errp);
> +            *errp = NULL;
> +        }
> +        ret = vhost_user_blk_realize_connect(s, errp);
> +    } while (ret == -EPROTO && retries--);
> +
>      if (ret < 0) {
>          goto virtio_err;
>      }
> -- 
> 2.30.2
>
diff mbox series

Patch

diff --git a/hw/block/vhost-user-blk.c b/hw/block/vhost-user-blk.c
index e49d2e4c83..f75a42bc62 100644
--- a/hw/block/vhost-user-blk.c
+++ b/hw/block/vhost-user-blk.c
@@ -455,8 +455,10 @@  static int vhost_user_blk_realize_connect(VHostUserBlk *s, Error **errp)
 
 static void vhost_user_blk_device_realize(DeviceState *dev, Error **errp)
 {
+    ERRP_GUARD();
     VirtIODevice *vdev = VIRTIO_DEVICE(dev);
     VHostUserBlk *s = VHOST_USER_BLK(vdev);
+    int retries;
     int i, ret;
 
     if (!s->chardev.chr) {
@@ -498,7 +500,17 @@  static void vhost_user_blk_device_realize(DeviceState *dev, Error **errp)
     s->inflight = g_new0(struct vhost_inflight, 1);
     s->vhost_vqs = g_new0(struct vhost_virtqueue, s->num_queues);
 
-    ret = vhost_user_blk_realize_connect(s, errp);
+    retries = 3;
+    assert(!*errp);
+    do {
+        if (*errp) {
+            error_prepend(errp, "Reconnecting after error: ");
+            error_report_err(*errp);
+            *errp = NULL;
+        }
+        ret = vhost_user_blk_realize_connect(s, errp);
+    } while (ret == -EPROTO && retries--);
+
     if (ret < 0) {
         goto virtio_err;
     }