diff mbox series

[focal:linux-azure,LP:#1894896] hv_netvsc: Fix hibernation for mlx5 VF driver

Message ID 20201027171822.1575137-1-marcelo.cerri@canonical.com
State New
Headers show
Series [focal:linux-azure,LP:#1894896] hv_netvsc: Fix hibernation for mlx5 VF driver | expand

Commit Message

Marcelo Henrique Cerri Oct. 27, 2020, 5:18 p.m. UTC
From: Dexuan Cui <decui@microsoft.com>

BugLink: https://bugs.launchpad.net/bugs/1894896

mlx5_suspend()/resume() keep the network interface, so during hibernation
netvsc_unregister_vf() and netvsc_register_vf() are not called, and hence
netvsc_resume() should call netvsc_vf_changed() to switch the data path
back to the VF after hibernation. Note: after we close and re-open the
vmbus channel of the netvsc NIC in netvsc_suspend() and netvsc_resume(),
the data path is implicitly switched to the netvsc NIC. Similarly,
netvsc_suspend() should not call netvsc_unregister_vf(), otherwise the VF
can no longer be used after hibernation.

For mlx4, since the VF network interafce is explicitly destroyed and
re-created during hibernation (see mlx4_suspend()/resume()), hv_netvsc
already explicitly switches the data path from and to the VF automatically
via netvsc_register_vf() and netvsc_unregister_vf(), so mlx4 doesn't need
this fix. Note: mlx4 can still work with the fix because in
netvsc_suspend()/resume() ndev_ctx->vf_netdev is NULL for mlx4.

Fixes: 0efeea5fb153 ("hv_netvsc: Add the support of hibernation")
Signed-off-by: Dexuan Cui <decui@microsoft.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
(cherry picked from commit 19162fd4063a3211843b997a454b505edb81d5ce)
Signed-off-by: Marcelo Henrique Cerri <marcelo.cerri@canonical.com>
---

Marcelo's comments:

I was able to reproduce the problem with the current version from
focal-updates following the test instructions from the LP bug and I
was also able to confirm the fix solves the problem with a test
kernel:

https://kernel.ubuntu.com/~mhcerri/azure/focal-linux-azure_lp1894896.tgz

5.8 already received the fix via an upstream stable update
(LP:#1894896) and hibernation is not supported in 4.15 linux-azure.

Besides that the fix is a clean cherry-pick from upstream.

---
 drivers/net/hyperv/netvsc_drv.c | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

Comments

Marcelo Henrique Cerri Oct. 27, 2020, 5:25 p.m. UTC | #1
On Tue, Oct 27, 2020 at 02:18:22PM -0300, Marcelo Henrique Cerri wrote:
> From: Dexuan Cui <decui@microsoft.com>
> 
> BugLink: https://bugs.launchpad.net/bugs/1894896
> 
> mlx5_suspend()/resume() keep the network interface, so during hibernation
> netvsc_unregister_vf() and netvsc_register_vf() are not called, and hence
> netvsc_resume() should call netvsc_vf_changed() to switch the data path
> back to the VF after hibernation. Note: after we close and re-open the
> vmbus channel of the netvsc NIC in netvsc_suspend() and netvsc_resume(),
> the data path is implicitly switched to the netvsc NIC. Similarly,
> netvsc_suspend() should not call netvsc_unregister_vf(), otherwise the VF
> can no longer be used after hibernation.
> 
> For mlx4, since the VF network interafce is explicitly destroyed and
> re-created during hibernation (see mlx4_suspend()/resume()), hv_netvsc
> already explicitly switches the data path from and to the VF automatically
> via netvsc_register_vf() and netvsc_unregister_vf(), so mlx4 doesn't need
> this fix. Note: mlx4 can still work with the fix because in
> netvsc_suspend()/resume() ndev_ctx->vf_netdev is NULL for mlx4.
> 
> Fixes: 0efeea5fb153 ("hv_netvsc: Add the support of hibernation")
> Signed-off-by: Dexuan Cui <decui@microsoft.com>
> Signed-off-by: Jakub Kicinski <kuba@kernel.org>
> (cherry picked from commit 19162fd4063a3211843b997a454b505edb81d5ce)
> Signed-off-by: Marcelo Henrique Cerri <marcelo.cerri@canonical.com>
> ---
> 
> Marcelo's comments:
> 
> I was able to reproduce the problem with the current version from
> focal-updates following the test instructions from the LP bug and I
> was also able to confirm the fix solves the problem with a test
> kernel:
> 
> https://kernel.ubuntu.com/~mhcerri/azure/focal-linux-azure_lp1894896.tgz
> 
> 5.8 already received the fix via an upstream stable update
> (LP:#1894896) and hibernation is not supported in 4.15 linux-azure.

Just a small correction, the LP bug for the upstream stable update is
actually LP:#1897550. But that doesn't affect the patch or the commit
message.


> 
> Besides that the fix is a clean cherry-pick from upstream.
> 
> ---
>  drivers/net/hyperv/netvsc_drv.c | 16 +++++++++++-----
>  1 file changed, 11 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
> index 33d9d17ee0e0..be5ede2a6b94 100644
> --- a/drivers/net/hyperv/netvsc_drv.c
> +++ b/drivers/net/hyperv/netvsc_drv.c
> @@ -2551,8 +2551,8 @@ static int netvsc_remove(struct hv_device *dev)
>  static int netvsc_suspend(struct hv_device *dev)
>  {
>  	struct net_device_context *ndev_ctx;
> -	struct net_device *vf_netdev, *net;
>  	struct netvsc_device *nvdev;
> +	struct net_device *net;
>  	int ret;
>  
>  	net = hv_get_drvdata(dev);
> @@ -2568,10 +2568,6 @@ static int netvsc_suspend(struct hv_device *dev)
>  		goto out;
>  	}
>  
> -	vf_netdev = rtnl_dereference(ndev_ctx->vf_netdev);
> -	if (vf_netdev)
> -		netvsc_unregister_vf(vf_netdev);
> -
>  	/* Save the current config info */
>  	ndev_ctx->saved_netvsc_dev_info = netvsc_devinfo_get(nvdev);
>  
> @@ -2587,6 +2583,7 @@ static int netvsc_resume(struct hv_device *dev)
>  	struct net_device *net = hv_get_drvdata(dev);
>  	struct net_device_context *net_device_ctx;
>  	struct netvsc_device_info *device_info;
> +	struct net_device *vf_netdev;
>  	int ret;
>  
>  	rtnl_lock();
> @@ -2599,6 +2596,15 @@ static int netvsc_resume(struct hv_device *dev)
>  	netvsc_devinfo_put(device_info);
>  	net_device_ctx->saved_netvsc_dev_info = NULL;
>  
> +	/* A NIC driver (e.g. mlx5) may keep the VF network interface across
> +	 * hibernation, but here the data path is implicitly switched to the
> +	 * netvsc NIC since the vmbus channel is closed and re-opened, so
> +	 * netvsc_vf_changed() must be used to switch the data path to the VF.
> +	 */
> +	vf_netdev = rtnl_dereference(net_device_ctx->vf_netdev);
> +	if (vf_netdev && netvsc_vf_changed(vf_netdev) != NOTIFY_OK)
> +		ret = -EINVAL;
> +
>  	rtnl_unlock();
>  
>  	return ret;
> -- 
> 2.25.1
>
Stefan Bader Oct. 28, 2020, 8:43 a.m. UTC | #2
On 27.10.20 18:18, Marcelo Henrique Cerri wrote:
> From: Dexuan Cui <decui@microsoft.com>
> 
> BugLink: https://bugs.launchpad.net/bugs/1894896
> 
> mlx5_suspend()/resume() keep the network interface, so during hibernation
> netvsc_unregister_vf() and netvsc_register_vf() are not called, and hence
> netvsc_resume() should call netvsc_vf_changed() to switch the data path
> back to the VF after hibernation. Note: after we close and re-open the
> vmbus channel of the netvsc NIC in netvsc_suspend() and netvsc_resume(),
> the data path is implicitly switched to the netvsc NIC. Similarly,
> netvsc_suspend() should not call netvsc_unregister_vf(), otherwise the VF
> can no longer be used after hibernation.
> 
> For mlx4, since the VF network interafce is explicitly destroyed and
> re-created during hibernation (see mlx4_suspend()/resume()), hv_netvsc
> already explicitly switches the data path from and to the VF automatically
> via netvsc_register_vf() and netvsc_unregister_vf(), so mlx4 doesn't need
> this fix. Note: mlx4 can still work with the fix because in
> netvsc_suspend()/resume() ndev_ctx->vf_netdev is NULL for mlx4.
> 
> Fixes: 0efeea5fb153 ("hv_netvsc: Add the support of hibernation")
> Signed-off-by: Dexuan Cui <decui@microsoft.com>
> Signed-off-by: Jakub Kicinski <kuba@kernel.org>
> (cherry picked from commit 19162fd4063a3211843b997a454b505edb81d5ce)
> Signed-off-by: Marcelo Henrique Cerri <marcelo.cerri@canonical.com>
Acked-by: Stefan Bader <stefan.bader@canonical.com>
> ---

Just two notes not related to the patch itself. One just for whoever applies it
later. Maybe no longer an issue but I remember some issues with the cutoff hints
when there are multiple of them. I would maybe manually delete things from the
downloaded patch...
Second is about the primary tasks on launchpad. This always relates to the
current development series and is a moving target. Right now, there would be no
source for azure there, so that task should be invalid. The reason for not
leaving around any tasks in an open (new, confirmed, triaged, in progress) state
is that this prevents the bug report from considered done. So it still shows up
in bug searches and is counted as "active" bug. And there is already enough of
those.

-Stefan

> 
> Marcelo's comments:
> 
> I was able to reproduce the problem with the current version from
> focal-updates following the test instructions from the LP bug and I
> was also able to confirm the fix solves the problem with a test
> kernel:
> 
> https://kernel.ubuntu.com/~mhcerri/azure/focal-linux-azure_lp1894896.tgz
> 
> 5.8 already received the fix via an upstream stable update
> (LP:#1894896) and hibernation is not supported in 4.15 linux-azure.
> 
> Besides that the fix is a clean cherry-pick from upstream.
> 
> ---
>  drivers/net/hyperv/netvsc_drv.c | 16 +++++++++++-----
>  1 file changed, 11 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
> index 33d9d17ee0e0..be5ede2a6b94 100644
> --- a/drivers/net/hyperv/netvsc_drv.c
> +++ b/drivers/net/hyperv/netvsc_drv.c
> @@ -2551,8 +2551,8 @@ static int netvsc_remove(struct hv_device *dev)
>  static int netvsc_suspend(struct hv_device *dev)
>  {
>  	struct net_device_context *ndev_ctx;
> -	struct net_device *vf_netdev, *net;
>  	struct netvsc_device *nvdev;
> +	struct net_device *net;
>  	int ret;
>  
>  	net = hv_get_drvdata(dev);
> @@ -2568,10 +2568,6 @@ static int netvsc_suspend(struct hv_device *dev)
>  		goto out;
>  	}
>  
> -	vf_netdev = rtnl_dereference(ndev_ctx->vf_netdev);
> -	if (vf_netdev)
> -		netvsc_unregister_vf(vf_netdev);
> -
>  	/* Save the current config info */
>  	ndev_ctx->saved_netvsc_dev_info = netvsc_devinfo_get(nvdev);
>  
> @@ -2587,6 +2583,7 @@ static int netvsc_resume(struct hv_device *dev)
>  	struct net_device *net = hv_get_drvdata(dev);
>  	struct net_device_context *net_device_ctx;
>  	struct netvsc_device_info *device_info;
> +	struct net_device *vf_netdev;
>  	int ret;
>  
>  	rtnl_lock();
> @@ -2599,6 +2596,15 @@ static int netvsc_resume(struct hv_device *dev)
>  	netvsc_devinfo_put(device_info);
>  	net_device_ctx->saved_netvsc_dev_info = NULL;
>  
> +	/* A NIC driver (e.g. mlx5) may keep the VF network interface across
> +	 * hibernation, but here the data path is implicitly switched to the
> +	 * netvsc NIC since the vmbus channel is closed and re-opened, so
> +	 * netvsc_vf_changed() must be used to switch the data path to the VF.
> +	 */
> +	vf_netdev = rtnl_dereference(net_device_ctx->vf_netdev);
> +	if (vf_netdev && netvsc_vf_changed(vf_netdev) != NOTIFY_OK)
> +		ret = -EINVAL;
> +
>  	rtnl_unlock();
>  
>  	return ret;
>
Marcelo Henrique Cerri Oct. 28, 2020, 12:05 p.m. UTC | #3
On Wed, Oct 28, 2020 at 09:43:11AM +0100, Stefan Bader wrote:
> On 27.10.20 18:18, Marcelo Henrique Cerri wrote:
> > From: Dexuan Cui <decui@microsoft.com>
> > 
> > BugLink: https://bugs.launchpad.net/bugs/1894896
> > 
> > mlx5_suspend()/resume() keep the network interface, so during hibernation
> > netvsc_unregister_vf() and netvsc_register_vf() are not called, and hence
> > netvsc_resume() should call netvsc_vf_changed() to switch the data path
> > back to the VF after hibernation. Note: after we close and re-open the
> > vmbus channel of the netvsc NIC in netvsc_suspend() and netvsc_resume(),
> > the data path is implicitly switched to the netvsc NIC. Similarly,
> > netvsc_suspend() should not call netvsc_unregister_vf(), otherwise the VF
> > can no longer be used after hibernation.
> > 
> > For mlx4, since the VF network interafce is explicitly destroyed and
> > re-created during hibernation (see mlx4_suspend()/resume()), hv_netvsc
> > already explicitly switches the data path from and to the VF automatically
> > via netvsc_register_vf() and netvsc_unregister_vf(), so mlx4 doesn't need
> > this fix. Note: mlx4 can still work with the fix because in
> > netvsc_suspend()/resume() ndev_ctx->vf_netdev is NULL for mlx4.
> > 
> > Fixes: 0efeea5fb153 ("hv_netvsc: Add the support of hibernation")
> > Signed-off-by: Dexuan Cui <decui@microsoft.com>
> > Signed-off-by: Jakub Kicinski <kuba@kernel.org>
> > (cherry picked from commit 19162fd4063a3211843b997a454b505edb81d5ce)
> > Signed-off-by: Marcelo Henrique Cerri <marcelo.cerri@canonical.com>
> Acked-by: Stefan Bader <stefan.bader@canonical.com>
> > ---
> 
> Just two notes not related to the patch itself. One just for whoever applies it
> later. Maybe no longer an issue but I remember some issues with the cutoff hints
> when there are multiple of them. I would maybe manually delete things from the
> downloaded patch...
> Second is about the primary tasks on launchpad. This always relates to the
> current development series and is a moving target. Right now, there would be no
> source for azure there, so that task should be invalid. The reason for not
> leaving around any tasks in an open (new, confirmed, triaged, in progress) state
> is that this prevents the bug report from considered done. So it still shows up
> in bug searches and is counted as "active" bug. And there is already enough of
> those.

Thanks for the explanation, Stefan. What issue did you have with the
cutoff hints? Was the comment after the three dashed included to the
commit message?


> 
> -Stefan
> 
> > 
> > Marcelo's comments:
> > 
> > I was able to reproduce the problem with the current version from
> > focal-updates following the test instructions from the LP bug and I
> > was also able to confirm the fix solves the problem with a test
> > kernel:
> > 
> > https://kernel.ubuntu.com/~mhcerri/azure/focal-linux-azure_lp1894896.tgz
> > 
> > 5.8 already received the fix via an upstream stable update
> > (LP:#1894896) and hibernation is not supported in 4.15 linux-azure.
> > 
> > Besides that the fix is a clean cherry-pick from upstream.
> > 
> > ---
> >  drivers/net/hyperv/netvsc_drv.c | 16 +++++++++++-----
> >  1 file changed, 11 insertions(+), 5 deletions(-)
> > 
> > diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
> > index 33d9d17ee0e0..be5ede2a6b94 100644
> > --- a/drivers/net/hyperv/netvsc_drv.c
> > +++ b/drivers/net/hyperv/netvsc_drv.c
> > @@ -2551,8 +2551,8 @@ static int netvsc_remove(struct hv_device *dev)
> >  static int netvsc_suspend(struct hv_device *dev)
> >  {
> >  	struct net_device_context *ndev_ctx;
> > -	struct net_device *vf_netdev, *net;
> >  	struct netvsc_device *nvdev;
> > +	struct net_device *net;
> >  	int ret;
> >  
> >  	net = hv_get_drvdata(dev);
> > @@ -2568,10 +2568,6 @@ static int netvsc_suspend(struct hv_device *dev)
> >  		goto out;
> >  	}
> >  
> > -	vf_netdev = rtnl_dereference(ndev_ctx->vf_netdev);
> > -	if (vf_netdev)
> > -		netvsc_unregister_vf(vf_netdev);
> > -
> >  	/* Save the current config info */
> >  	ndev_ctx->saved_netvsc_dev_info = netvsc_devinfo_get(nvdev);
> >  
> > @@ -2587,6 +2583,7 @@ static int netvsc_resume(struct hv_device *dev)
> >  	struct net_device *net = hv_get_drvdata(dev);
> >  	struct net_device_context *net_device_ctx;
> >  	struct netvsc_device_info *device_info;
> > +	struct net_device *vf_netdev;
> >  	int ret;
> >  
> >  	rtnl_lock();
> > @@ -2599,6 +2596,15 @@ static int netvsc_resume(struct hv_device *dev)
> >  	netvsc_devinfo_put(device_info);
> >  	net_device_ctx->saved_netvsc_dev_info = NULL;
> >  
> > +	/* A NIC driver (e.g. mlx5) may keep the VF network interface across
> > +	 * hibernation, but here the data path is implicitly switched to the
> > +	 * netvsc NIC since the vmbus channel is closed and re-opened, so
> > +	 * netvsc_vf_changed() must be used to switch the data path to the VF.
> > +	 */
> > +	vf_netdev = rtnl_dereference(net_device_ctx->vf_netdev);
> > +	if (vf_netdev && netvsc_vf_changed(vf_netdev) != NOTIFY_OK)
> > +		ret = -EINVAL;
> > +
> >  	rtnl_unlock();
> >  
> >  	return ret;
> > 
> 
>
Stefan Bader Oct. 28, 2020, 2:06 p.m. UTC | #4
On 28.10.20 13:05, Marcelo Henrique Cerri wrote:
> On Wed, Oct 28, 2020 at 09:43:11AM +0100, Stefan Bader wrote:
>> On 27.10.20 18:18, Marcelo Henrique Cerri wrote:
>>> From: Dexuan Cui <decui@microsoft.com>
>>>
>>> BugLink: https://bugs.launchpad.net/bugs/1894896
>>>
>>> mlx5_suspend()/resume() keep the network interface, so during hibernation
>>> netvsc_unregister_vf() and netvsc_register_vf() are not called, and hence
>>> netvsc_resume() should call netvsc_vf_changed() to switch the data path
>>> back to the VF after hibernation. Note: after we close and re-open the
>>> vmbus channel of the netvsc NIC in netvsc_suspend() and netvsc_resume(),
>>> the data path is implicitly switched to the netvsc NIC. Similarly,
>>> netvsc_suspend() should not call netvsc_unregister_vf(), otherwise the VF
>>> can no longer be used after hibernation.
>>>
>>> For mlx4, since the VF network interafce is explicitly destroyed and
>>> re-created during hibernation (see mlx4_suspend()/resume()), hv_netvsc
>>> already explicitly switches the data path from and to the VF automatically
>>> via netvsc_register_vf() and netvsc_unregister_vf(), so mlx4 doesn't need
>>> this fix. Note: mlx4 can still work with the fix because in
>>> netvsc_suspend()/resume() ndev_ctx->vf_netdev is NULL for mlx4.
>>>
>>> Fixes: 0efeea5fb153 ("hv_netvsc: Add the support of hibernation")
>>> Signed-off-by: Dexuan Cui <decui@microsoft.com>
>>> Signed-off-by: Jakub Kicinski <kuba@kernel.org>
>>> (cherry picked from commit 19162fd4063a3211843b997a454b505edb81d5ce)
>>> Signed-off-by: Marcelo Henrique Cerri <marcelo.cerri@canonical.com>
>> Acked-by: Stefan Bader <stefan.bader@canonical.com>
>>> ---
>>
>> Just two notes not related to the patch itself. One just for whoever applies it
>> later. Maybe no longer an issue but I remember some issues with the cutoff hints
>> when there are multiple of them. I would maybe manually delete things from the
>> downloaded patch...
>> Second is about the primary tasks on launchpad. This always relates to the
>> current development series and is a moving target. Right now, there would be no
>> source for azure there, so that task should be invalid. The reason for not
>> leaving around any tasks in an open (new, confirmed, triaged, in progress) state
>> is that this prevents the bug report from considered done. So it still shows up
>> in bug searches and is counted as "active" bug. And there is already enough of
>> those.
> 
> Thanks for the explanation, Stefan. What issue did you have with the
> cutoff hints? Was the comment after the three dashed included to the
> commit message?

Yes, either all or one of them or not at all being happy on git am. But again,
it has been a long while ago. So its just a vague reminder to double check.


> 
> 
>>
>> -Stefan
>>
>>>
>>> Marcelo's comments:
>>>
>>> I was able to reproduce the problem with the current version from
>>> focal-updates following the test instructions from the LP bug and I
>>> was also able to confirm the fix solves the problem with a test
>>> kernel:
>>>
>>> https://kernel.ubuntu.com/~mhcerri/azure/focal-linux-azure_lp1894896.tgz
>>>
>>> 5.8 already received the fix via an upstream stable update
>>> (LP:#1894896) and hibernation is not supported in 4.15 linux-azure.
>>>
>>> Besides that the fix is a clean cherry-pick from upstream.
>>>
>>> ---
>>>  drivers/net/hyperv/netvsc_drv.c | 16 +++++++++++-----
>>>  1 file changed, 11 insertions(+), 5 deletions(-)
>>>
>>> diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
>>> index 33d9d17ee0e0..be5ede2a6b94 100644
>>> --- a/drivers/net/hyperv/netvsc_drv.c
>>> +++ b/drivers/net/hyperv/netvsc_drv.c
>>> @@ -2551,8 +2551,8 @@ static int netvsc_remove(struct hv_device *dev)
>>>  static int netvsc_suspend(struct hv_device *dev)
>>>  {
>>>  	struct net_device_context *ndev_ctx;
>>> -	struct net_device *vf_netdev, *net;
>>>  	struct netvsc_device *nvdev;
>>> +	struct net_device *net;
>>>  	int ret;
>>>  
>>>  	net = hv_get_drvdata(dev);
>>> @@ -2568,10 +2568,6 @@ static int netvsc_suspend(struct hv_device *dev)
>>>  		goto out;
>>>  	}
>>>  
>>> -	vf_netdev = rtnl_dereference(ndev_ctx->vf_netdev);
>>> -	if (vf_netdev)
>>> -		netvsc_unregister_vf(vf_netdev);
>>> -
>>>  	/* Save the current config info */
>>>  	ndev_ctx->saved_netvsc_dev_info = netvsc_devinfo_get(nvdev);
>>>  
>>> @@ -2587,6 +2583,7 @@ static int netvsc_resume(struct hv_device *dev)
>>>  	struct net_device *net = hv_get_drvdata(dev);
>>>  	struct net_device_context *net_device_ctx;
>>>  	struct netvsc_device_info *device_info;
>>> +	struct net_device *vf_netdev;
>>>  	int ret;
>>>  
>>>  	rtnl_lock();
>>> @@ -2599,6 +2596,15 @@ static int netvsc_resume(struct hv_device *dev)
>>>  	netvsc_devinfo_put(device_info);
>>>  	net_device_ctx->saved_netvsc_dev_info = NULL;
>>>  
>>> +	/* A NIC driver (e.g. mlx5) may keep the VF network interface across
>>> +	 * hibernation, but here the data path is implicitly switched to the
>>> +	 * netvsc NIC since the vmbus channel is closed and re-opened, so
>>> +	 * netvsc_vf_changed() must be used to switch the data path to the VF.
>>> +	 */
>>> +	vf_netdev = rtnl_dereference(net_device_ctx->vf_netdev);
>>> +	if (vf_netdev && netvsc_vf_changed(vf_netdev) != NOTIFY_OK)
>>> +		ret = -EINVAL;
>>> +
>>>  	rtnl_unlock();
>>>  
>>>  	return ret;
>>>
>>
>>
> 
> 
> 
>
Ian May Oct. 28, 2020, 9:53 p.m. UTC | #5
LGTM

The first comment of the attached buglink references two additional patches being needed.  Is there a reason those weren't included with this patch?

Thanks
Ian


On 2020-10-27 14:18:22 , Marcelo Henrique Cerri wrote:
> From: Dexuan Cui <decui@microsoft.com>
> 
> BugLink: https://bugs.launchpad.net/bugs/1894896
> 
> mlx5_suspend()/resume() keep the network interface, so during hibernation
> netvsc_unregister_vf() and netvsc_register_vf() are not called, and hence
> netvsc_resume() should call netvsc_vf_changed() to switch the data path
> back to the VF after hibernation. Note: after we close and re-open the
> vmbus channel of the netvsc NIC in netvsc_suspend() and netvsc_resume(),
> the data path is implicitly switched to the netvsc NIC. Similarly,
> netvsc_suspend() should not call netvsc_unregister_vf(), otherwise the VF
> can no longer be used after hibernation.
> 
> For mlx4, since the VF network interafce is explicitly destroyed and
> re-created during hibernation (see mlx4_suspend()/resume()), hv_netvsc
> already explicitly switches the data path from and to the VF automatically
> via netvsc_register_vf() and netvsc_unregister_vf(), so mlx4 doesn't need
> this fix. Note: mlx4 can still work with the fix because in
> netvsc_suspend()/resume() ndev_ctx->vf_netdev is NULL for mlx4.
> 
> Fixes: 0efeea5fb153 ("hv_netvsc: Add the support of hibernation")
> Signed-off-by: Dexuan Cui <decui@microsoft.com>
> Signed-off-by: Jakub Kicinski <kuba@kernel.org>
> (cherry picked from commit 19162fd4063a3211843b997a454b505edb81d5ce)
> Signed-off-by: Marcelo Henrique Cerri <marcelo.cerri@canonical.com>
> ---
> 
> Marcelo's comments:
> 
> I was able to reproduce the problem with the current version from
> focal-updates following the test instructions from the LP bug and I
> was also able to confirm the fix solves the problem with a test
> kernel:
> 
> https://kernel.ubuntu.com/~mhcerri/azure/focal-linux-azure_lp1894896.tgz
> 
> 5.8 already received the fix via an upstream stable update
> (LP:#1894896) and hibernation is not supported in 4.15 linux-azure.
> 
> Besides that the fix is a clean cherry-pick from upstream.
> 
> ---
>  drivers/net/hyperv/netvsc_drv.c | 16 +++++++++++-----
>  1 file changed, 11 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
> index 33d9d17ee0e0..be5ede2a6b94 100644
> --- a/drivers/net/hyperv/netvsc_drv.c
> +++ b/drivers/net/hyperv/netvsc_drv.c
> @@ -2551,8 +2551,8 @@ static int netvsc_remove(struct hv_device *dev)
>  static int netvsc_suspend(struct hv_device *dev)
>  {
>  	struct net_device_context *ndev_ctx;
> -	struct net_device *vf_netdev, *net;
>  	struct netvsc_device *nvdev;
> +	struct net_device *net;
>  	int ret;
>  
>  	net = hv_get_drvdata(dev);
> @@ -2568,10 +2568,6 @@ static int netvsc_suspend(struct hv_device *dev)
>  		goto out;
>  	}
>  
> -	vf_netdev = rtnl_dereference(ndev_ctx->vf_netdev);
> -	if (vf_netdev)
> -		netvsc_unregister_vf(vf_netdev);
> -
>  	/* Save the current config info */
>  	ndev_ctx->saved_netvsc_dev_info = netvsc_devinfo_get(nvdev);
>  
> @@ -2587,6 +2583,7 @@ static int netvsc_resume(struct hv_device *dev)
>  	struct net_device *net = hv_get_drvdata(dev);
>  	struct net_device_context *net_device_ctx;
>  	struct netvsc_device_info *device_info;
> +	struct net_device *vf_netdev;
>  	int ret;
>  
>  	rtnl_lock();
> @@ -2599,6 +2596,15 @@ static int netvsc_resume(struct hv_device *dev)
>  	netvsc_devinfo_put(device_info);
>  	net_device_ctx->saved_netvsc_dev_info = NULL;
>  
> +	/* A NIC driver (e.g. mlx5) may keep the VF network interface across
> +	 * hibernation, but here the data path is implicitly switched to the
> +	 * netvsc NIC since the vmbus channel is closed and re-opened, so
> +	 * netvsc_vf_changed() must be used to switch the data path to the VF.
> +	 */
> +	vf_netdev = rtnl_dereference(net_device_ctx->vf_netdev);
> +	if (vf_netdev && netvsc_vf_changed(vf_netdev) != NOTIFY_OK)
> +		ret = -EINVAL;
> +
>  	rtnl_unlock();
>  
>  	return ret;
> -- 
> 2.25.1
> 
> 
> -- 
> kernel-team mailing list
> kernel-team@lists.ubuntu.com
> https://lists.ubuntu.com/mailman/listinfo/kernel-team
Kleber Sacilotto de Souza Nov. 6, 2020, 4:52 p.m. UTC | #6
On 27.10.20 18:18, Marcelo Henrique Cerri wrote:
> From: Dexuan Cui <decui@microsoft.com>
> 
> BugLink: https://bugs.launchpad.net/bugs/1894896
> 
> mlx5_suspend()/resume() keep the network interface, so during hibernation
> netvsc_unregister_vf() and netvsc_register_vf() are not called, and hence
> netvsc_resume() should call netvsc_vf_changed() to switch the data path
> back to the VF after hibernation. Note: after we close and re-open the
> vmbus channel of the netvsc NIC in netvsc_suspend() and netvsc_resume(),
> the data path is implicitly switched to the netvsc NIC. Similarly,
> netvsc_suspend() should not call netvsc_unregister_vf(), otherwise the VF
> can no longer be used after hibernation.
> 
> For mlx4, since the VF network interafce is explicitly destroyed and
> re-created during hibernation (see mlx4_suspend()/resume()), hv_netvsc
> already explicitly switches the data path from and to the VF automatically
> via netvsc_register_vf() and netvsc_unregister_vf(), so mlx4 doesn't need
> this fix. Note: mlx4 can still work with the fix because in
> netvsc_suspend()/resume() ndev_ctx->vf_netdev is NULL for mlx4.
> 
> Fixes: 0efeea5fb153 ("hv_netvsc: Add the support of hibernation")
> Signed-off-by: Dexuan Cui <decui@microsoft.com>
> Signed-off-by: Jakub Kicinski <kuba@kernel.org>
> (cherry picked from commit 19162fd4063a3211843b997a454b505edb81d5ce)
> Signed-off-by: Marcelo Henrique Cerri <marcelo.cerri@canonical.com>

Applied to focal/linux-azure.

The two additional patches mentioned on comment #1 of the LP bug are
not really needed?

Thanks,
Kleber

> ---
> 
> Marcelo's comments:
> 
> I was able to reproduce the problem with the current version from
> focal-updates following the test instructions from the LP bug and I
> was also able to confirm the fix solves the problem with a test
> kernel:
> 
> https://kernel.ubuntu.com/~mhcerri/azure/focal-linux-azure_lp1894896.tgz
> 
> 5.8 already received the fix via an upstream stable update
> (LP:#1894896) and hibernation is not supported in 4.15 linux-azure.
> 
> Besides that the fix is a clean cherry-pick from upstream.
> 
> ---
>   drivers/net/hyperv/netvsc_drv.c | 16 +++++++++++-----
>   1 file changed, 11 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
> index 33d9d17ee0e0..be5ede2a6b94 100644
> --- a/drivers/net/hyperv/netvsc_drv.c
> +++ b/drivers/net/hyperv/netvsc_drv.c
> @@ -2551,8 +2551,8 @@ static int netvsc_remove(struct hv_device *dev)
>   static int netvsc_suspend(struct hv_device *dev)
>   {
>   	struct net_device_context *ndev_ctx;
> -	struct net_device *vf_netdev, *net;
>   	struct netvsc_device *nvdev;
> +	struct net_device *net;
>   	int ret;
>   
>   	net = hv_get_drvdata(dev);
> @@ -2568,10 +2568,6 @@ static int netvsc_suspend(struct hv_device *dev)
>   		goto out;
>   	}
>   
> -	vf_netdev = rtnl_dereference(ndev_ctx->vf_netdev);
> -	if (vf_netdev)
> -		netvsc_unregister_vf(vf_netdev);
> -
>   	/* Save the current config info */
>   	ndev_ctx->saved_netvsc_dev_info = netvsc_devinfo_get(nvdev);
>   
> @@ -2587,6 +2583,7 @@ static int netvsc_resume(struct hv_device *dev)
>   	struct net_device *net = hv_get_drvdata(dev);
>   	struct net_device_context *net_device_ctx;
>   	struct netvsc_device_info *device_info;
> +	struct net_device *vf_netdev;
>   	int ret;
>   
>   	rtnl_lock();
> @@ -2599,6 +2596,15 @@ static int netvsc_resume(struct hv_device *dev)
>   	netvsc_devinfo_put(device_info);
>   	net_device_ctx->saved_netvsc_dev_info = NULL;
>   
> +	/* A NIC driver (e.g. mlx5) may keep the VF network interface across
> +	 * hibernation, but here the data path is implicitly switched to the
> +	 * netvsc NIC since the vmbus channel is closed and re-opened, so
> +	 * netvsc_vf_changed() must be used to switch the data path to the VF.
> +	 */
> +	vf_netdev = rtnl_dereference(net_device_ctx->vf_netdev);
> +	if (vf_netdev && netvsc_vf_changed(vf_netdev) != NOTIFY_OK)
> +		ret = -EINVAL;
> +
>   	rtnl_unlock();
>   
>   	return ret;
>
diff mbox series

Patch

diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
index 33d9d17ee0e0..be5ede2a6b94 100644
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -2551,8 +2551,8 @@  static int netvsc_remove(struct hv_device *dev)
 static int netvsc_suspend(struct hv_device *dev)
 {
 	struct net_device_context *ndev_ctx;
-	struct net_device *vf_netdev, *net;
 	struct netvsc_device *nvdev;
+	struct net_device *net;
 	int ret;
 
 	net = hv_get_drvdata(dev);
@@ -2568,10 +2568,6 @@  static int netvsc_suspend(struct hv_device *dev)
 		goto out;
 	}
 
-	vf_netdev = rtnl_dereference(ndev_ctx->vf_netdev);
-	if (vf_netdev)
-		netvsc_unregister_vf(vf_netdev);
-
 	/* Save the current config info */
 	ndev_ctx->saved_netvsc_dev_info = netvsc_devinfo_get(nvdev);
 
@@ -2587,6 +2583,7 @@  static int netvsc_resume(struct hv_device *dev)
 	struct net_device *net = hv_get_drvdata(dev);
 	struct net_device_context *net_device_ctx;
 	struct netvsc_device_info *device_info;
+	struct net_device *vf_netdev;
 	int ret;
 
 	rtnl_lock();
@@ -2599,6 +2596,15 @@  static int netvsc_resume(struct hv_device *dev)
 	netvsc_devinfo_put(device_info);
 	net_device_ctx->saved_netvsc_dev_info = NULL;
 
+	/* A NIC driver (e.g. mlx5) may keep the VF network interface across
+	 * hibernation, but here the data path is implicitly switched to the
+	 * netvsc NIC since the vmbus channel is closed and re-opened, so
+	 * netvsc_vf_changed() must be used to switch the data path to the VF.
+	 */
+	vf_netdev = rtnl_dereference(net_device_ctx->vf_netdev);
+	if (vf_netdev && netvsc_vf_changed(vf_netdev) != NOTIFY_OK)
+		ret = -EINVAL;
+
 	rtnl_unlock();
 
 	return ret;