diff mbox series

[v3,5/5] virtio-gpu: fix v2 migration

Message ID 20240515141557.1277999-6-marcandre.lureau@redhat.com
State New
Headers show
Series Fix "virtio-gpu: fix scanout migration post-load" | expand

Commit Message

Marc-André Lureau May 15, 2024, 2:15 p.m. UTC
From: Marc-André Lureau <marcandre.lureau@redhat.com>

Commit dfcf74fa ("virtio-gpu: fix scanout migration post-load") broke
forward/backward version migration. Versioning of nested VMSD structures
is not straightforward, as the wire format doesn't have nested
structures versions.

Use the previously introduced check_machine_version() function as a
field test to ensure proper saving/loading based on the machine version.
The VMSD.version is irrelevant now.

Fixes: dfcf74fa ("virtio-gpu: fix scanout migration post-load")
Suggested-by: Peter Xu <peterx@redhat.com>
Signed-off-by: Marc-André Lureau <marcandre.lureau@redhat.com>
---
 hw/display/virtio-gpu.c | 21 +++++++++++++--------
 1 file changed, 13 insertions(+), 8 deletions(-)

Comments

Michael S. Tsirkin May 15, 2024, 4:02 p.m. UTC | #1
On Wed, May 15, 2024 at 06:15:56PM +0400, marcandre.lureau@redhat.com wrote:
> From: Marc-André Lureau <marcandre.lureau@redhat.com>
> 
> Commit dfcf74fa ("virtio-gpu: fix scanout migration post-load") broke
> forward/backward version migration. Versioning of nested VMSD structures
> is not straightforward, as the wire format doesn't have nested
> structures versions.
> 
> Use the previously introduced check_machine_version() function as a
> field test to ensure proper saving/loading based on the machine version.
> The VMSD.version is irrelevant now.
> 
> Fixes: dfcf74fa ("virtio-gpu: fix scanout migration post-load")
> Suggested-by: Peter Xu <peterx@redhat.com>
> Signed-off-by: Marc-André Lureau <marcandre.lureau@redhat.com>

I don't get it. Our standard way to do it is:
- add a property (begin name with x- so we don't commit to an API)
- set from compat machinery
- test property value in VMSTATE macros

Big advantage is, it works well with any downstreams
which pick any properties they like.
Why is this not a good fit here?


> ---
>  hw/display/virtio-gpu.c | 21 +++++++++++++--------
>  1 file changed, 13 insertions(+), 8 deletions(-)
> 
> diff --git a/hw/display/virtio-gpu.c b/hw/display/virtio-gpu.c
> index ae831b6b3e..b2d8e5faeb 100644
> --- a/hw/display/virtio-gpu.c
> +++ b/hw/display/virtio-gpu.c
> @@ -20,6 +20,7 @@
>  #include "trace.h"
>  #include "sysemu/dma.h"
>  #include "sysemu/sysemu.h"
> +#include "hw/boards.h"
>  #include "hw/virtio/virtio.h"
>  #include "migration/qemu-file-types.h"
>  #include "hw/virtio/virtio-gpu.h"
> @@ -1166,10 +1167,14 @@ static void virtio_gpu_cursor_bh(void *opaque)
>      virtio_gpu_handle_cursor(&g->parent_obj.parent_obj, g->cursor_vq);
>  }
>  
> +static bool machine_check_9_0(void *opaque, int version)
> +{
> +    return machine_check_version(9, 0);
> +}
> +
>  static const VMStateDescription vmstate_virtio_gpu_scanout = {
>      .name = "virtio-gpu-one-scanout",
> -    .version_id = 2,
> -    .minimum_version_id = 1,
> +    .version_id = 1,
>      .fields = (const VMStateField[]) {
>          VMSTATE_UINT32(resource_id, struct virtio_gpu_scanout),
>          VMSTATE_UINT32(width, struct virtio_gpu_scanout),
> @@ -1181,12 +1186,12 @@ static const VMStateDescription vmstate_virtio_gpu_scanout = {
>          VMSTATE_UINT32(cursor.hot_y, struct virtio_gpu_scanout),
>          VMSTATE_UINT32(cursor.pos.x, struct virtio_gpu_scanout),
>          VMSTATE_UINT32(cursor.pos.y, struct virtio_gpu_scanout),
> -        VMSTATE_UINT32_V(fb.format, struct virtio_gpu_scanout, 2),
> -        VMSTATE_UINT32_V(fb.bytes_pp, struct virtio_gpu_scanout, 2),
> -        VMSTATE_UINT32_V(fb.width, struct virtio_gpu_scanout, 2),
> -        VMSTATE_UINT32_V(fb.height, struct virtio_gpu_scanout, 2),
> -        VMSTATE_UINT32_V(fb.stride, struct virtio_gpu_scanout, 2),
> -        VMSTATE_UINT32_V(fb.offset, struct virtio_gpu_scanout, 2),
> +        VMSTATE_UINT32_TEST(fb.format, struct virtio_gpu_scanout, machine_check_9_0),
> +        VMSTATE_UINT32_TEST(fb.bytes_pp, struct virtio_gpu_scanout, machine_check_9_0),
> +        VMSTATE_UINT32_TEST(fb.width, struct virtio_gpu_scanout, machine_check_9_0),
> +        VMSTATE_UINT32_TEST(fb.height, struct virtio_gpu_scanout, machine_check_9_0),
> +        VMSTATE_UINT32_TEST(fb.stride, struct virtio_gpu_scanout, machine_check_9_0),
> +        VMSTATE_UINT32_TEST(fb.offset, struct virtio_gpu_scanout, machine_check_9_0),
>          VMSTATE_END_OF_LIST()
>      },
>  };
> -- 
> 2.41.0.28.gd7d8841f67
Daniel P. Berrangé May 15, 2024, 4:03 p.m. UTC | #2
On Wed, May 15, 2024 at 06:15:56PM +0400, marcandre.lureau@redhat.com wrote:
> From: Marc-André Lureau <marcandre.lureau@redhat.com>
> 
> Commit dfcf74fa ("virtio-gpu: fix scanout migration post-load") broke
> forward/backward version migration. Versioning of nested VMSD structures
> is not straightforward, as the wire format doesn't have nested
> structures versions.
> 
> Use the previously introduced check_machine_version() function as a
> field test to ensure proper saving/loading based on the machine version.
> The VMSD.version is irrelevant now.
> 
> Fixes: dfcf74fa ("virtio-gpu: fix scanout migration post-load")
> Suggested-by: Peter Xu <peterx@redhat.com>
> Signed-off-by: Marc-André Lureau <marcandre.lureau@redhat.com>
> ---
>  hw/display/virtio-gpu.c | 21 +++++++++++++--------
>  1 file changed, 13 insertions(+), 8 deletions(-)
> 
> diff --git a/hw/display/virtio-gpu.c b/hw/display/virtio-gpu.c
> index ae831b6b3e..b2d8e5faeb 100644
> --- a/hw/display/virtio-gpu.c
> +++ b/hw/display/virtio-gpu.c
> @@ -20,6 +20,7 @@
>  #include "trace.h"
>  #include "sysemu/dma.h"
>  #include "sysemu/sysemu.h"
> +#include "hw/boards.h"
>  #include "hw/virtio/virtio.h"
>  #include "migration/qemu-file-types.h"
>  #include "hw/virtio/virtio-gpu.h"
> @@ -1166,10 +1167,14 @@ static void virtio_gpu_cursor_bh(void *opaque)
>      virtio_gpu_handle_cursor(&g->parent_obj.parent_obj, g->cursor_vq);
>  }
>  
> +static bool machine_check_9_0(void *opaque, int version)
> +{
> +    return machine_check_version(9, 0);
> +}

I think applying version number checks to decide machine type
compatibility is a highly undesirable direction for QEMU to
take.

Machine type compatibility is a difficult problem, but one of
the good aspects about our current solution is that it is
clear what the differences are for each version. We can see
all the compatibility properties/flags/values being set in
one place, in the declaration of each machine's class.

Sprinkling version number checks around the codebase in
arbitrary files will harm visibility of what ABI is expressd
by each machine, and thus is liable to increase the liklihood
of mistakes.

This will negatively impact downstream vendors cherry-picking
patches to their stable branches, as the version number logic
may have incorrect semantics. 

It will also create trouble for downstream vendors who define
their own machines with distinct versioning from upstream, as
there will be confusion over whether a version check is for
the base QEMU version, or the downstream version, and such
code added to the tree is less visible than the machine type
definitions.

Above all, I'm failing to see why there's a compelling reason
for virtio_gpu to diverge from our long standing practice of
adding a named property flag "virtio_scanout_vmstate_fix"
on the machine class, and then setting it in machine types
which need it.


> +
>  static const VMStateDescription vmstate_virtio_gpu_scanout = {
>      .name = "virtio-gpu-one-scanout",
> -    .version_id = 2,
> -    .minimum_version_id = 1,
> +    .version_id = 1,
>      .fields = (const VMStateField[]) {
>          VMSTATE_UINT32(resource_id, struct virtio_gpu_scanout),
>          VMSTATE_UINT32(width, struct virtio_gpu_scanout),
> @@ -1181,12 +1186,12 @@ static const VMStateDescription vmstate_virtio_gpu_scanout = {
>          VMSTATE_UINT32(cursor.hot_y, struct virtio_gpu_scanout),
>          VMSTATE_UINT32(cursor.pos.x, struct virtio_gpu_scanout),
>          VMSTATE_UINT32(cursor.pos.y, struct virtio_gpu_scanout),
> -        VMSTATE_UINT32_V(fb.format, struct virtio_gpu_scanout, 2),
> -        VMSTATE_UINT32_V(fb.bytes_pp, struct virtio_gpu_scanout, 2),
> -        VMSTATE_UINT32_V(fb.width, struct virtio_gpu_scanout, 2),
> -        VMSTATE_UINT32_V(fb.height, struct virtio_gpu_scanout, 2),
> -        VMSTATE_UINT32_V(fb.stride, struct virtio_gpu_scanout, 2),
> -        VMSTATE_UINT32_V(fb.offset, struct virtio_gpu_scanout, 2),
> +        VMSTATE_UINT32_TEST(fb.format, struct virtio_gpu_scanout, machine_check_9_0),
> +        VMSTATE_UINT32_TEST(fb.bytes_pp, struct virtio_gpu_scanout, machine_check_9_0),
> +        VMSTATE_UINT32_TEST(fb.width, struct virtio_gpu_scanout, machine_check_9_0),
> +        VMSTATE_UINT32_TEST(fb.height, struct virtio_gpu_scanout, machine_check_9_0),
> +        VMSTATE_UINT32_TEST(fb.stride, struct virtio_gpu_scanout, machine_check_9_0),
> +        VMSTATE_UINT32_TEST(fb.offset, struct virtio_gpu_scanout, machine_check_9_0),
>          VMSTATE_END_OF_LIST()
>      },
>  };
> -- 
> 2.41.0.28.gd7d8841f67
> 
> 

With regards,
Daniel
Peter Xu May 15, 2024, 4:31 p.m. UTC | #3
On Wed, May 15, 2024 at 12:02:49PM -0400, Michael S. Tsirkin wrote:
> On Wed, May 15, 2024 at 06:15:56PM +0400, marcandre.lureau@redhat.com wrote:
> > From: Marc-André Lureau <marcandre.lureau@redhat.com>
> > 
> > Commit dfcf74fa ("virtio-gpu: fix scanout migration post-load") broke
> > forward/backward version migration. Versioning of nested VMSD structures
> > is not straightforward, as the wire format doesn't have nested
> > structures versions.
> > 
> > Use the previously introduced check_machine_version() function as a
> > field test to ensure proper saving/loading based on the machine version.
> > The VMSD.version is irrelevant now.
> > 
> > Fixes: dfcf74fa ("virtio-gpu: fix scanout migration post-load")
> > Suggested-by: Peter Xu <peterx@redhat.com>
> > Signed-off-by: Marc-André Lureau <marcandre.lureau@redhat.com>
> 
> I don't get it. Our standard way to do it is:
> - add a property (begin name with x- so we don't commit to an API)
> - set from compat machinery
> - test property value in VMSTATE macros
> 
> Big advantage is, it works well with any downstreams
> which pick any properties they like.
> Why is this not a good fit here?

I think it'll simplify upstream to avoid introducing one new field + one
new property for each of such protocol change, which fundamentally are the
same thing.  But it's indeed a good point that such helper can slightly
complicate the backport a bit.. I assume a global replacement of versions
over the helper will be needed after downstream settles on how to map
downstream MCs to upstream's.

Thanks,
Peter Xu May 15, 2024, 5:03 p.m. UTC | #4
On Wed, May 15, 2024 at 05:03:44PM +0100, Daniel P. Berrangé wrote:
> Above all, I'm failing to see why there's a compelling reason
> for virtio_gpu to diverge from our long standing practice of
> adding a named property flag "virtio_scanout_vmstate_fix"
> on the machine class, and then setting it in machine types
> which need it.

The reason to introduce that is definitely avoid introducing fields /
properties in similar cases in which case all the fields may represent the
same thing ("return true if MC is older than xxx version").  Especially
when such change is not bound to a new feature so in which case it won't
make sense to allow user to even control that propoerty, even if we
exported this "x-virtio-scanout-fix" property, but now we must export it
because compat fields require it.

However I think agree that having upstream specific MC versions in VMSD
checks is kind of unwanted.  I think the major problem is we don't have
that extra machine type abstract where we can have simply a number showing
the release of QEMU, then we can map that number to whatever
upstream/downstream machine types.  E.g.:

  Release No.         Upstream version       Downstream version
  50                  9.0                    Y.0
  51                  9.1
  52                  9.2                    Y.1
  ...

Then downstream is not mapping to 9.0/... but the release no.  Then here
instead of hard code upstream MC versions we can already provide similar
helpers like:

  machine_type_newer_than_50()

Then device code can use it without polluting that with upstream MC
versioning.  Downstream will simply work if downstream MCs are mapped
alright to the release no. when rebase.

But I'm not sure whether it'll be even worthwhile.. the majority will still
be that the VMSD change is caused by a new feature, and exporting that
property might in most cases be wanted.

In all cases, for now I agree it's at least easier to stick with the simple
way.

Thanks,
Daniel P. Berrangé May 15, 2024, 5:15 p.m. UTC | #5
On Wed, May 15, 2024 at 11:03:27AM -0600, Peter Xu wrote:
> On Wed, May 15, 2024 at 05:03:44PM +0100, Daniel P. Berrangé wrote:
> > Above all, I'm failing to see why there's a compelling reason
> > for virtio_gpu to diverge from our long standing practice of
> > adding a named property flag "virtio_scanout_vmstate_fix"
> > on the machine class, and then setting it in machine types
> > which need it.
> 
> The reason to introduce that is definitely avoid introducing fields /
> properties in similar cases in which case all the fields may represent the
> same thing ("return true if MC is older than xxx version").  Especially
> when such change is not bound to a new feature so in which case it won't
> make sense to allow user to even control that propoerty, even if we
> exported this "x-virtio-scanout-fix" property, but now we must export it
> because compat fields require it.
> 
> However I think agree that having upstream specific MC versions in VMSD
> checks is kind of unwanted.  I think the major problem is we don't have
> that extra machine type abstract where we can have simply a number showing
> the release of QEMU, then we can map that number to whatever
> upstream/downstream machine types.  E.g.:
> 
>   Release No.         Upstream version       Downstream version
>   50                  9.0                    Y.0
>   51                  9.1
>   52                  9.2                    Y.1
>   ...

Downstream versions do not map cleanly to individual upstream versions
across the whole code base. If we have two distinct features in upstream
version X, each of them may map to a different downstream release. 

This can happen when downstream skips one or more upstream releases.
One feature from the skipped release might be backported to an earlier
downstream release, while other feature might not arrive downstream
until they later rebase. Version based checks are an inherantly
undesirable idea for a situation where there is any backporting taking
place, whether its machine type versions or something else. Named feature
/ flag based checks are always the way to go.


With regards,
Daniel
Michael S. Tsirkin May 15, 2024, 10:02 p.m. UTC | #6
On Wed, May 15, 2024 at 10:31:32AM -0600, Peter Xu wrote:
> On Wed, May 15, 2024 at 12:02:49PM -0400, Michael S. Tsirkin wrote:
> > On Wed, May 15, 2024 at 06:15:56PM +0400, marcandre.lureau@redhat.com wrote:
> > > From: Marc-André Lureau <marcandre.lureau@redhat.com>
> > > 
> > > Commit dfcf74fa ("virtio-gpu: fix scanout migration post-load") broke
> > > forward/backward version migration. Versioning of nested VMSD structures
> > > is not straightforward, as the wire format doesn't have nested
> > > structures versions.
> > > 
> > > Use the previously introduced check_machine_version() function as a
> > > field test to ensure proper saving/loading based on the machine version.
> > > The VMSD.version is irrelevant now.
> > > 
> > > Fixes: dfcf74fa ("virtio-gpu: fix scanout migration post-load")
> > > Suggested-by: Peter Xu <peterx@redhat.com>
> > > Signed-off-by: Marc-André Lureau <marcandre.lureau@redhat.com>
> > 
> > I don't get it. Our standard way to do it is:
> > - add a property (begin name with x- so we don't commit to an API)
> > - set from compat machinery
> > - test property value in VMSTATE macros
> > 
> > Big advantage is, it works well with any downstreams
> > which pick any properties they like.
> > Why is this not a good fit here?
> 
> I think it'll simplify upstream to avoid introducing one new field + one
> new property for each of such protocol change, which fundamentally are the
> same thing.  But it's indeed a good point that such helper can slightly
> complicate the backport a bit.. I assume a global replacement of versions
> over the helper will be needed after downstream settles on how to map
> downstream MCs to upstream's.
> 
> Thanks,

There's nothing special about this specific code. If we want to rework
how machine compat is handled we can do it, but I wouldn't start with
this virtio gpu bug.

It's a big if though, I don't like how this patch works at all.
Peter Xu May 16, 2024, 4:11 a.m. UTC | #7
On Wed, May 15, 2024 at 06:15:48PM +0100, Daniel P. Berrangé wrote:
> On Wed, May 15, 2024 at 11:03:27AM -0600, Peter Xu wrote:
> > On Wed, May 15, 2024 at 05:03:44PM +0100, Daniel P. Berrangé wrote:
> > > Above all, I'm failing to see why there's a compelling reason
> > > for virtio_gpu to diverge from our long standing practice of
> > > adding a named property flag "virtio_scanout_vmstate_fix"
> > > on the machine class, and then setting it in machine types
> > > which need it.
> > 
> > The reason to introduce that is definitely avoid introducing fields /
> > properties in similar cases in which case all the fields may represent the
> > same thing ("return true if MC is older than xxx version").  Especially
> > when such change is not bound to a new feature so in which case it won't
> > make sense to allow user to even control that propoerty, even if we
> > exported this "x-virtio-scanout-fix" property, but now we must export it
> > because compat fields require it.
> > 
> > However I think agree that having upstream specific MC versions in VMSD
> > checks is kind of unwanted.  I think the major problem is we don't have
> > that extra machine type abstract where we can have simply a number showing
> > the release of QEMU, then we can map that number to whatever
> > upstream/downstream machine types.  E.g.:
> > 
> >   Release No.         Upstream version       Downstream version
> >   50                  9.0                    Y.0
> >   51                  9.1
> >   52                  9.2                    Y.1
> >   ...
> 
> Downstream versions do not map cleanly to individual upstream versions
> across the whole code base. If we have two distinct features in upstream
> version X, each of them may map to a different downstream release. 
> 
> This can happen when downstream skips one or more upstream releases.
> One feature from the skipped release might be backported to an earlier
> downstream release, while other feature might not arrive downstream
> until they later rebase. Version based checks are an inherantly
> undesirable idea for a situation where there is any backporting taking
> place, whether its machine type versions or something else. Named feature
> / flag based checks are always the way to go.

I thought this should work better with things like this where we only want
to fix a break in ABI, and none of downstream should special case things
like such fix.. but I agree even with that in mind such case could be so
rare to bother with above scheme.  I could have raised a bad idea I
suppose. :-( Let's stick with the simple until someone has better idea.

Thanks,
diff mbox series

Patch

diff --git a/hw/display/virtio-gpu.c b/hw/display/virtio-gpu.c
index ae831b6b3e..b2d8e5faeb 100644
--- a/hw/display/virtio-gpu.c
+++ b/hw/display/virtio-gpu.c
@@ -20,6 +20,7 @@ 
 #include "trace.h"
 #include "sysemu/dma.h"
 #include "sysemu/sysemu.h"
+#include "hw/boards.h"
 #include "hw/virtio/virtio.h"
 #include "migration/qemu-file-types.h"
 #include "hw/virtio/virtio-gpu.h"
@@ -1166,10 +1167,14 @@  static void virtio_gpu_cursor_bh(void *opaque)
     virtio_gpu_handle_cursor(&g->parent_obj.parent_obj, g->cursor_vq);
 }
 
+static bool machine_check_9_0(void *opaque, int version)
+{
+    return machine_check_version(9, 0);
+}
+
 static const VMStateDescription vmstate_virtio_gpu_scanout = {
     .name = "virtio-gpu-one-scanout",
-    .version_id = 2,
-    .minimum_version_id = 1,
+    .version_id = 1,
     .fields = (const VMStateField[]) {
         VMSTATE_UINT32(resource_id, struct virtio_gpu_scanout),
         VMSTATE_UINT32(width, struct virtio_gpu_scanout),
@@ -1181,12 +1186,12 @@  static const VMStateDescription vmstate_virtio_gpu_scanout = {
         VMSTATE_UINT32(cursor.hot_y, struct virtio_gpu_scanout),
         VMSTATE_UINT32(cursor.pos.x, struct virtio_gpu_scanout),
         VMSTATE_UINT32(cursor.pos.y, struct virtio_gpu_scanout),
-        VMSTATE_UINT32_V(fb.format, struct virtio_gpu_scanout, 2),
-        VMSTATE_UINT32_V(fb.bytes_pp, struct virtio_gpu_scanout, 2),
-        VMSTATE_UINT32_V(fb.width, struct virtio_gpu_scanout, 2),
-        VMSTATE_UINT32_V(fb.height, struct virtio_gpu_scanout, 2),
-        VMSTATE_UINT32_V(fb.stride, struct virtio_gpu_scanout, 2),
-        VMSTATE_UINT32_V(fb.offset, struct virtio_gpu_scanout, 2),
+        VMSTATE_UINT32_TEST(fb.format, struct virtio_gpu_scanout, machine_check_9_0),
+        VMSTATE_UINT32_TEST(fb.bytes_pp, struct virtio_gpu_scanout, machine_check_9_0),
+        VMSTATE_UINT32_TEST(fb.width, struct virtio_gpu_scanout, machine_check_9_0),
+        VMSTATE_UINT32_TEST(fb.height, struct virtio_gpu_scanout, machine_check_9_0),
+        VMSTATE_UINT32_TEST(fb.stride, struct virtio_gpu_scanout, machine_check_9_0),
+        VMSTATE_UINT32_TEST(fb.offset, struct virtio_gpu_scanout, machine_check_9_0),
         VMSTATE_END_OF_LIST()
     },
 };