diff mbox series

[v3,net-next,2/2] ionic: add devlink firmware update

Message ID 20200908224812.63434-3-snelson@pensando.io
State Changes Requested
Delegated to: David Miller
Headers show
Series ionic: add devlink dev flash support | expand

Commit Message

Shannon Nelson Sept. 8, 2020, 10:48 p.m. UTC
Add support for firmware update through the devlink interface.
This update copies the firmware object into the device, asks
the current firmware to install it, then asks the firmware to
select the new firmware for the next boot-up.

The install and select steps are launched as asynchronous
requests, which are then followed up with status request
commands.  These status request commands will be answered with
an EAGAIN return value and will try again until the request
has completed or reached the timeout specified.

Signed-off-by: Shannon Nelson <snelson@pensando.io>
---
 drivers/net/ethernet/pensando/ionic/Makefile  |   2 +-
 .../ethernet/pensando/ionic/ionic_devlink.c   |  14 ++
 .../ethernet/pensando/ionic/ionic_devlink.h   |   3 +
 .../net/ethernet/pensando/ionic/ionic_fw.c    | 209 ++++++++++++++++++
 .../net/ethernet/pensando/ionic/ionic_main.c  |  23 +-
 5 files changed, 242 insertions(+), 9 deletions(-)
 create mode 100644 drivers/net/ethernet/pensando/ionic/ionic_fw.c

Comments

Jakub Kicinski Sept. 8, 2020, 11:54 p.m. UTC | #1
On Tue,  8 Sep 2020 15:48:12 -0700 Shannon Nelson wrote:
> +	dl = priv_to_devlink(ionic);
> +	devlink_flash_update_status_notify(dl, label, NULL, 1, timeout);
> +	start_time = jiffies;
> +	end_time = start_time + (timeout * HZ);
> +	do {
> +		mutex_lock(&ionic->dev_cmd_lock);
> +		ionic_dev_cmd_go(&ionic->idev, &cmd);
> +		err = ionic_dev_cmd_wait(ionic, DEVCMD_TIMEOUT);
> +		mutex_unlock(&ionic->dev_cmd_lock);
> +
> +		devlink_flash_update_status_notify(dl, label, NULL,
> +						   (jiffies - start_time) / HZ,
> +						   timeout);

That's not what I meant. I think we can plumb proper timeout parameter
through devlink all the way to user space.

> +	} while (time_before(jiffies, end_time) && (err == -EAGAIN || err == -ETIMEDOUT));
> +
> +	if (err == -EAGAIN || err == -ETIMEDOUT) {
> +		NL_SET_ERR_MSG_MOD(extack, "Firmware wait timed out");
> +		dev_err(ionic->dev, "DEV_CMD firmware wait %s timed out\n", label);
> +	} else if (err) {
> +		NL_SET_ERR_MSG_MOD(extack, "Firmware wait failed");
> +	} else {
> +		devlink_flash_update_status_notify(dl, label, NULL, timeout, timeout);
> +	}


> +		if (offset > next_interval) {
> +			devlink_flash_update_status_notify(dl, "Downloading",
> +							   NULL, offset, fw->size);
> +			next_interval = offset + (fw->size / IONIC_FW_INTERVAL_FRACTION);
> +		}
> +	}
> +	devlink_flash_update_status_notify(dl, "Downloading", NULL, 1, 1);

This one wasn't updated.
Shannon Nelson Sept. 9, 2020, 4:23 p.m. UTC | #2
On 9/8/20 4:54 PM, Jakub Kicinski wrote:
> On Tue,  8 Sep 2020 15:48:12 -0700 Shannon Nelson wrote:
>> +	dl = priv_to_devlink(ionic);
>> +	devlink_flash_update_status_notify(dl, label, NULL, 1, timeout);
>> +	start_time = jiffies;
>> +	end_time = start_time + (timeout * HZ);
>> +	do {
>> +		mutex_lock(&ionic->dev_cmd_lock);
>> +		ionic_dev_cmd_go(&ionic->idev, &cmd);
>> +		err = ionic_dev_cmd_wait(ionic, DEVCMD_TIMEOUT);
>> +		mutex_unlock(&ionic->dev_cmd_lock);
>> +
>> +		devlink_flash_update_status_notify(dl, label, NULL,
>> +						   (jiffies - start_time) / HZ,
>> +						   timeout);
> That's not what I meant. I think we can plumb proper timeout parameter
> through devlink all the way to user space.

Sure, but until that gets worked out, this should suffice.

>
>> +	} while (time_before(jiffies, end_time) && (err == -EAGAIN || err == -ETIMEDOUT));
>> +
>> +	if (err == -EAGAIN || err == -ETIMEDOUT) {
>> +		NL_SET_ERR_MSG_MOD(extack, "Firmware wait timed out");
>> +		dev_err(ionic->dev, "DEV_CMD firmware wait %s timed out\n", label);
>> +	} else if (err) {
>> +		NL_SET_ERR_MSG_MOD(extack, "Firmware wait failed");
>> +	} else {
>> +		devlink_flash_update_status_notify(dl, label, NULL, timeout, timeout);
>> +	}
>
>> +		if (offset > next_interval) {
>> +			devlink_flash_update_status_notify(dl, "Downloading",
>> +							   NULL, offset, fw->size);
>> +			next_interval = offset + (fw->size / IONIC_FW_INTERVAL_FRACTION);
>> +		}
>> +	}
>> +	devlink_flash_update_status_notify(dl, "Downloading", NULL, 1, 1);
> This one wasn't updated.

Yep, missed it.  I'll follow up.

sln
Jakub Kicinski Sept. 9, 2020, 4:44 p.m. UTC | #3
On Wed, 9 Sep 2020 09:23:08 -0700 Shannon Nelson wrote:
> On 9/8/20 4:54 PM, Jakub Kicinski wrote:
> > On Tue,  8 Sep 2020 15:48:12 -0700 Shannon Nelson wrote:  
> >> +	dl = priv_to_devlink(ionic);
> >> +	devlink_flash_update_status_notify(dl, label, NULL, 1, timeout);
> >> +	start_time = jiffies;
> >> +	end_time = start_time + (timeout * HZ);
> >> +	do {
> >> +		mutex_lock(&ionic->dev_cmd_lock);
> >> +		ionic_dev_cmd_go(&ionic->idev, &cmd);
> >> +		err = ionic_dev_cmd_wait(ionic, DEVCMD_TIMEOUT);
> >> +		mutex_unlock(&ionic->dev_cmd_lock);
> >> +
> >> +		devlink_flash_update_status_notify(dl, label, NULL,
> >> +						   (jiffies - start_time) / HZ,
> >> +						   timeout);  
> > That's not what I meant. I think we can plumb proper timeout parameter
> > through devlink all the way to user space.  
> 
> Sure, but until that gets worked out, this should suffice.

I don't understand - what will get worked out?
Shannon Nelson Sept. 9, 2020, 5:58 p.m. UTC | #4
On 9/9/20 9:44 AM, Jakub Kicinski wrote:
> On Wed, 9 Sep 2020 09:23:08 -0700 Shannon Nelson wrote:
>> On 9/8/20 4:54 PM, Jakub Kicinski wrote:
>>> On Tue,  8 Sep 2020 15:48:12 -0700 Shannon Nelson wrote:
>>>> +	dl = priv_to_devlink(ionic);
>>>> +	devlink_flash_update_status_notify(dl, label, NULL, 1, timeout);
>>>> +	start_time = jiffies;
>>>> +	end_time = start_time + (timeout * HZ);
>>>> +	do {
>>>> +		mutex_lock(&ionic->dev_cmd_lock);
>>>> +		ionic_dev_cmd_go(&ionic->idev, &cmd);
>>>> +		err = ionic_dev_cmd_wait(ionic, DEVCMD_TIMEOUT);
>>>> +		mutex_unlock(&ionic->dev_cmd_lock);
>>>> +
>>>> +		devlink_flash_update_status_notify(dl, label, NULL,
>>>> +						   (jiffies - start_time) / HZ,
>>>> +						   timeout);
>>> That's not what I meant. I think we can plumb proper timeout parameter
>>> through devlink all the way to user space.
>> Sure, but until that gets worked out, this should suffice.
> I don't understand - what will get worked out?

I'm suggesting that this implementation using the existing devlink 
logging services should suffice until someone can design, implement, and 
get accepted a different bit of plumbing.  Unfortunately, that's not a 
job that I can get to right now.

sln
Jakub Kicinski Sept. 9, 2020, 7:22 p.m. UTC | #5
On Wed, 9 Sep 2020 10:58:19 -0700 Shannon Nelson wrote:
> On 9/9/20 9:44 AM, Jakub Kicinski wrote:
> > On Wed, 9 Sep 2020 09:23:08 -0700 Shannon Nelson wrote:  
> >> On 9/8/20 4:54 PM, Jakub Kicinski wrote:  
> >>> On Tue,  8 Sep 2020 15:48:12 -0700 Shannon Nelson wrote:  
> >>>> +	dl = priv_to_devlink(ionic);
> >>>> +	devlink_flash_update_status_notify(dl, label, NULL, 1, timeout);
> >>>> +	start_time = jiffies;
> >>>> +	end_time = start_time + (timeout * HZ);
> >>>> +	do {
> >>>> +		mutex_lock(&ionic->dev_cmd_lock);
> >>>> +		ionic_dev_cmd_go(&ionic->idev, &cmd);
> >>>> +		err = ionic_dev_cmd_wait(ionic, DEVCMD_TIMEOUT);
> >>>> +		mutex_unlock(&ionic->dev_cmd_lock);
> >>>> +
> >>>> +		devlink_flash_update_status_notify(dl, label, NULL,
> >>>> +						   (jiffies - start_time) / HZ,
> >>>> +						   timeout);  
> >>> That's not what I meant. I think we can plumb proper timeout parameter
> >>> through devlink all the way to user space.  
> >> Sure, but until that gets worked out, this should suffice.  
> > I don't understand - what will get worked out?  
> 
> I'm suggesting that this implementation using the existing devlink 
> logging services should suffice until someone can design, implement, and 
> get accepted a different bit of plumbing.  Unfortunately, that's not a 
> job that I can get to right now.

This hack is too nasty to be accepted.

So to be clear your options are:
 - plumb the single extra netlink parameter through to devlink
 - wait for someone else to do that for you, before you get firmware
   flashing accepted upstream.

Your "NIC" is quite "special", so you gotta be willing to lay the
groundwork it you want it to be supported upstream.

I already regret acking your weird live reset without proper APIs.
Now Mellanox is doing the plumbing for the exact same feature.
David Miller Sept. 9, 2020, 9:30 p.m. UTC | #6
From: Shannon Nelson <snelson@pensando.io>
Date: Wed, 9 Sep 2020 10:58:19 -0700

> I'm suggesting that this implementation using the existing devlink
> logging services should suffice until someone can design, implement,
> and get accepted a different bit of plumbing.  Unfortunately, that's
> not a job that I can get to right now.

Sorry, this is not how we operate.

If you do things that way you will have to support the "temporary"
solution forever in order to not break user facing interfaces.

That misses the whole point of doing it properly.
Shannon Nelson Sept. 10, 2020, 1:34 a.m. UTC | #7
On 9/9/20 12:22 PM, Jakub Kicinski wrote:
> On Wed, 9 Sep 2020 10:58:19 -0700 Shannon Nelson wrote:
>>
>> I'm suggesting that this implementation using the existing devlink
>> logging services should suffice until someone can design, implement, and
>> get accepted a different bit of plumbing.  Unfortunately, that's not a
>> job that I can get to right now.
> This hack is too nasty to be accepted.

Your comment earlier was

 > I wonder if we can steal a page from systemd's book and display
 > "time until timeout", or whatchamacallit, like systemd does when it's
 > waiting for processes to quit. All drivers have some timeout set on the
 > operation. If users knew the driver sets timeout to n minutes and they
 > see the timer ticking up they'd be less likely to think the command has
 > hanged..

I implemented the loop such that the timeout value was the 100%, and 
each time through the loop the elapsed time value is sent, so the user 
gets to see the % value increasing as the wait goes on, in the same way 
they see the download progress percentage ticking away. This is how I 
approached the stated requirement of user seeing the "timer ticking up", 
using the existing machinery.  This seems to be how 
devlink_flash_update_status_notify() is expected to be used, so I'm a 
little surprised at the critique.

> So to be clear your options are:
>   - plumb the single extra netlink parameter through to devlink
>   - wait for someone else to do that for you, before you get firmware
>     flashing accepted upstream.
>

Since you seem to have something else in mind, a little more detail 
would be helpful.

We currently see devlink updating a percentage, something like:
Downloading:  56%
using backspaces to overwrite the value as the updates are published.

How do you envision the userland interpretation of the timeout ticking?  
Do you want to see something like:
Installing - timeout seconds:  23
as a countdown?

So, maybe a flag parameter that can tell the UI to use the raw value and 
not massage it into a percentage?

Do you see this new netlink parameter to be a boolean switch between the 
percentage and raw, or maybe a bitflag parameter that might end up with 
several bits of context information for userland to interpret?

Are you thinking of a new flags parameter in 
devlink_flash_update_status_notify(), or a new function to service this?

If a new parameter to devlink_flash_update_status_notify(), maybe it is 
time to make a struct for flash update data rather than adding more 
parameters to the function?

Should we add yet another parameter to replace the '%' with some other 
label, so devlink could print something like
Installing - timeout in:  23 secs

Or could we use a 0 value for total to signify using a raw value and not 
need to plumb a new parameter?  Although this might not get along well 
with older devlink utilities.

Thoughts?

sln
Jakub Kicinski Sept. 10, 2020, 5:56 p.m. UTC | #8
On Wed, 9 Sep 2020 18:34:57 -0700 Shannon Nelson wrote:
> On 9/9/20 12:22 PM, Jakub Kicinski wrote:
> > On Wed, 9 Sep 2020 10:58:19 -0700 Shannon Nelson wrote:  
> >>
> >> I'm suggesting that this implementation using the existing devlink
> >> logging services should suffice until someone can design, implement, and
> >> get accepted a different bit of plumbing.  Unfortunately, that's not a
> >> job that I can get to right now.  
> > This hack is too nasty to be accepted.  
> 
> Your comment earlier was
> 
>  > I wonder if we can steal a page from systemd's book and display
>  > "time until timeout", or whatchamacallit, like systemd does when it's
>  > waiting for processes to quit. All drivers have some timeout set on the
>  > operation. If users knew the driver sets timeout to n minutes and they
>  > see the timer ticking up they'd be less likely to think the command has
>  > hanged..  
> 
> I implemented the loop such that the timeout value was the 100%, and 
> each time through the loop the elapsed time value is sent, so the user 
> gets to see the % value increasing as the wait goes on, in the same way 
> they see the download progress percentage ticking away. 

Right but you said that in most cases the value never goes up to 25min,
so user will see the value increment from 0 to say 5% very slowly and
then jump to 100%.

> This is how I approached the stated requirement of user seeing the
> "timer ticking up", using the existing machinery.  This seems to be
> how devlink_flash_update_status_notify() is expected to be used, so
> I'm a little surprised at the critique.

Sorry, I thought the systemd reference would be clear enough, see the
screenshot here:

https://images.app.goo.gl/gz1Uwg6mcHEd3D2m7

Systemd prints something link:

bla bla bla (XXs / YYs)

where XX is the timer ticking up, and YY is the timeout value.

> > So to be clear your options are:
> >   - plumb the single extra netlink parameter through to devlink
> >   - wait for someone else to do that for you, before you get
> > firmware flashing accepted upstream.
> >  
> 
> Since you seem to have something else in mind, a little more detail 
> would be helpful.
> 
> We currently see devlink updating a percentage, something like:
> Downloading:  56%
> using backspaces to overwrite the value as the updates are published.
> 
> How do you envision the userland interpretation of the timeout
> ticking? Do you want to see something like:
> Installing - timeout seconds:  23
> as a countdown?

I was under the impression that the systemd format would be familiar 
to users, hence:

Downloading:  56% (Xm Ys / Zm Vz)

The part in brackets only appearing after a few seconds without a
notification, otherwise the whole thing would get noisy.

> So, maybe a flag parameter that can tell the UI to use the raw value
> and not massage it into a percentage?
> 
> Do you see this new netlink parameter to be a boolean switch between
> the percentage and raw, or maybe a bitflag parameter that might end
> up with several bits of context information for userland to interpret?
> 
> Are you thinking of a new flags parameter in 
> devlink_flash_update_status_notify(), or a new function to service
> this?
> 
> If a new parameter to devlink_flash_update_status_notify(), maybe it
> is time to make a struct for flash update data rather than adding
> more parameters to the function?
> 
> Should we add yet another parameter to replace the '%' with some
> other label, so devlink could print something like
> Installing - timeout in:  23 secs
> 
> Or could we use a 0 value for total to signify using a raw value and
> not need to plumb a new parameter?  Although this might not get along
> well with older devlink utilities.

I was thinking of adding an extra timeout parameter to 
devlink_flash_update_status_notify() - timeout length in seconds.
And an extra netlink attr for that.

We could perhaps make:

static inline void 
devlink_flash_update_status_notify(struct devlink *devlink, const char *status_msg,
				    unsigned long done, unsigned long total)
{
	struct ..._args = {
		.status_msg = status_msg,
		.done = done,
		.total = total,
	}

	__devlink_flash_update_status_notify(devlink, &.._args);
}

IOW drop the component parameter from the normal helper, cause almost
nobody uses that. The add a more full featured __ version, which would
take the arg struct, the struct would include the timeout value.

If the timeout is lower than 15sec drivers will probably have little
value in reporting it, so simplified helper should be nice there to save LOC.

The user space can do the counting up trivially using select(),
or a syscall timeout. The netlink notification would only carry timeout.
(LMK if this is problematic, I haven't looked at the user space part.)
Shannon Nelson Sept. 14, 2020, 10:02 p.m. UTC | #9
On 9/10/20 10:56 AM, Jakub Kicinski wrote:
> On Wed, 9 Sep 2020 18:34:57 -0700 Shannon Nelson wrote:
>> On 9/9/20 12:22 PM, Jakub Kicinski wrote:
>>> On Wed, 9 Sep 2020 10:58:19 -0700 Shannon Nelson wrote:
>>>> I'm suggesting that this implementation using the existing devlink
>>>> logging services should suffice until someone can design, implement, and
>>>> get accepted a different bit of plumbing.  Unfortunately, that's not a
>>>> job that I can get to right now.
>>> This hack is too nasty to be accepted.
>> Your comment earlier was
>>
>>   > I wonder if we can steal a page from systemd's book and display
>>   > "time until timeout", or whatchamacallit, like systemd does when it's
>>   > waiting for processes to quit. All drivers have some timeout set on the
>>   > operation. If users knew the driver sets timeout to n minutes and they
>>   > see the timer ticking up they'd be less likely to think the command has
>>   > hanged..
>>
>> I implemented the loop such that the timeout value was the 100%, and
>> each time through the loop the elapsed time value is sent, so the user
>> gets to see the % value increasing as the wait goes on, in the same way
>> they see the download progress percentage ticking away.
> Right but you said that in most cases the value never goes up to 25min,
> so user will see the value increment from 0 to say 5% very slowly and
> then jump to 100%.
>
>> This is how I approached the stated requirement of user seeing the
>> "timer ticking up", using the existing machinery.  This seems to be
>> how devlink_flash_update_status_notify() is expected to be used, so
>> I'm a little surprised at the critique.
> Sorry, I thought the systemd reference would be clear enough, see the
> screenshot here:
>
> https://images.app.goo.gl/gz1Uwg6mcHEd3D2m7
>
> Systemd prints something link:
>
> bla bla bla (XXs / YYs)
>
> where XX is the timer ticking up, and YY is the timeout value.
>
>>> So to be clear your options are:
>>>    - plumb the single extra netlink parameter through to devlink
>>>    - wait for someone else to do that for you, before you get
>>> firmware flashing accepted upstream.
>>>   
>> Since you seem to have something else in mind, a little more detail
>> would be helpful.
>>
>> We currently see devlink updating a percentage, something like:
>> Downloading:  56%
>> using backspaces to overwrite the value as the updates are published.
>>
>> How do you envision the userland interpretation of the timeout
>> ticking? Do you want to see something like:
>> Installing - timeout seconds:  23
>> as a countdown?
> I was under the impression that the systemd format would be familiar
> to users, hence:
>
> Downloading:  56% (Xm Ys / Zm Vz)
>
> The part in brackets only appearing after a few seconds without a
> notification, otherwise the whole thing would get noisy.
>
>> So, maybe a flag parameter that can tell the UI to use the raw value
>> and not massage it into a percentage?
>>
>> Do you see this new netlink parameter to be a boolean switch between
>> the percentage and raw, or maybe a bitflag parameter that might end
>> up with several bits of context information for userland to interpret?
>>
>> Are you thinking of a new flags parameter in
>> devlink_flash_update_status_notify(), or a new function to service
>> this?
>>
>> If a new parameter to devlink_flash_update_status_notify(), maybe it
>> is time to make a struct for flash update data rather than adding
>> more parameters to the function?
>>
>> Should we add yet another parameter to replace the '%' with some
>> other label, so devlink could print something like
>> Installing - timeout in:  23 secs
>>
>> Or could we use a 0 value for total to signify using a raw value and
>> not need to plumb a new parameter?  Although this might not get along
>> well with older devlink utilities.
> I was thinking of adding an extra timeout parameter to
> devlink_flash_update_status_notify() - timeout length in seconds.
> And an extra netlink attr for that.
>
> We could perhaps make:
>
> static inline void
> devlink_flash_update_status_notify( const char *status_msg,
> 				    unsigned long done, unsigned long total)
> {
> 	struct ..._args = {
> 		.status_msg = status_msg,
> 		.done = done,
> 		.total = total,
> 	}
>
> 	__devlink_flash_update_status_notify(devlink, &.._args);
> }
>
> IOW drop the component parameter from the normal helper, cause almost
> nobody uses that. The add a more full featured __ version, which would
> take the arg struct, the struct would include the timeout value.
>
> If the timeout is lower than 15sec drivers will probably have little
> value in reporting it, so simplified helper should be nice there to save LOC.
>
> The user space can do the counting up trivially using select(),
> or a syscall timeout. The netlink notification would only carry timeout.
> (LMK if this is problematic, I haven't looked at the user space part.)

What if we simplify this idea to adding a timeout variant of the 
devlink_flash_update_begin_notify()?  Perhaps something like
     devlink_flash_update_begin_notify_timeout(struct devlink *devlink, 
unsigned int timeout_seconds)
This can pass up a timeout parameter at the beginning of the flash and 
the userland utility can do what it needs at that point to set up the UI 
display.

I think using a struct internally to devlink.c/h might still have merit, 
but I'm not sure there's a need to mess with the rest of the API just yet.

sln
Jakub Kicinski Sept. 14, 2020, 10:59 p.m. UTC | #10
On Mon, 14 Sep 2020 15:02:18 -0700 Shannon Nelson wrote:
> On 9/10/20 10:56 AM, Jakub Kicinski wrote:
> > On Wed, 9 Sep 2020 18:34:57 -0700 Shannon Nelson wrote:  
> >> On 9/9/20 12:22 PM, Jakub Kicinski wrote:  
> >>> On Wed, 9 Sep 2020 10:58:19 -0700 Shannon Nelson wrote:  
> >>>> I'm suggesting that this implementation using the existing devlink
> >>>> logging services should suffice until someone can design, implement, and
> >>>> get accepted a different bit of plumbing.  Unfortunately, that's not a
> >>>> job that I can get to right now.  
> >>> This hack is too nasty to be accepted.  
> >> Your comment earlier was
> >>  
> >>   > I wonder if we can steal a page from systemd's book and display
> >>   > "time until timeout", or whatchamacallit, like systemd does when it's
> >>   > waiting for processes to quit. All drivers have some timeout set on the
> >>   > operation. If users knew the driver sets timeout to n minutes and they
> >>   > see the timer ticking up they'd be less likely to think the command has
> >>   > hanged..  
> >>
> >> I implemented the loop such that the timeout value was the 100%, and
> >> each time through the loop the elapsed time value is sent, so the user
> >> gets to see the % value increasing as the wait goes on, in the same way
> >> they see the download progress percentage ticking away.  
> > Right but you said that in most cases the value never goes up to 25min,
> > so user will see the value increment from 0 to say 5% very slowly and
> > then jump to 100%.
> >  
> >> This is how I approached the stated requirement of user seeing the
> >> "timer ticking up", using the existing machinery.  This seems to be
> >> how devlink_flash_update_status_notify() is expected to be used, so
> >> I'm a little surprised at the critique.  
> > Sorry, I thought the systemd reference would be clear enough, see the
> > screenshot here:
> >
> > https://images.app.goo.gl/gz1Uwg6mcHEd3D2m7
> >
> > Systemd prints something link:
> >
> > bla bla bla (XXs / YYs)
> >
> > where XX is the timer ticking up, and YY is the timeout value.
> >  
> >>> So to be clear your options are:
> >>>    - plumb the single extra netlink parameter through to devlink
> >>>    - wait for someone else to do that for you, before you get
> >>> firmware flashing accepted upstream.
> >>>     
> >> Since you seem to have something else in mind, a little more detail
> >> would be helpful.
> >>
> >> We currently see devlink updating a percentage, something like:
> >> Downloading:  56%
> >> using backspaces to overwrite the value as the updates are published.
> >>
> >> How do you envision the userland interpretation of the timeout
> >> ticking? Do you want to see something like:
> >> Installing - timeout seconds:  23
> >> as a countdown?  
> > I was under the impression that the systemd format would be familiar
> > to users, hence:
> >
> > Downloading:  56% (Xm Ys / Zm Vz)
> >
> > The part in brackets only appearing after a few seconds without a
> > notification, otherwise the whole thing would get noisy.
> >  
> >> So, maybe a flag parameter that can tell the UI to use the raw value
> >> and not massage it into a percentage?
> >>
> >> Do you see this new netlink parameter to be a boolean switch between
> >> the percentage and raw, or maybe a bitflag parameter that might end
> >> up with several bits of context information for userland to interpret?
> >>
> >> Are you thinking of a new flags parameter in
> >> devlink_flash_update_status_notify(), or a new function to service
> >> this?
> >>
> >> If a new parameter to devlink_flash_update_status_notify(), maybe it
> >> is time to make a struct for flash update data rather than adding
> >> more parameters to the function?
> >>
> >> Should we add yet another parameter to replace the '%' with some
> >> other label, so devlink could print something like
> >> Installing - timeout in:  23 secs
> >>
> >> Or could we use a 0 value for total to signify using a raw value and
> >> not need to plumb a new parameter?  Although this might not get along
> >> well with older devlink utilities.  
> > I was thinking of adding an extra timeout parameter to
> > devlink_flash_update_status_notify() - timeout length in seconds.
> > And an extra netlink attr for that.
> >
> > We could perhaps make:
> >
> > static inline void
> > devlink_flash_update_status_notify( const char *status_msg,
> > 				    unsigned long done, unsigned long total)
> > {
> > 	struct ..._args = {
> > 		.status_msg = status_msg,
> > 		.done = done,
> > 		.total = total,
> > 	}
> >
> > 	__devlink_flash_update_status_notify(devlink, &.._args);
> > }
> >
> > IOW drop the component parameter from the normal helper, cause almost
> > nobody uses that. The add a more full featured __ version, which would
> > take the arg struct, the struct would include the timeout value.
> >
> > If the timeout is lower than 15sec drivers will probably have little
> > value in reporting it, so simplified helper should be nice there to save LOC.
> >
> > The user space can do the counting up trivially using select(),
> > or a syscall timeout. The netlink notification would only carry timeout.
> > (LMK if this is problematic, I haven't looked at the user space part.)  
> 
> What if we simplify this idea to adding a timeout variant of the 
> devlink_flash_update_begin_notify()?  Perhaps something like
>      devlink_flash_update_begin_notify_timeout(struct devlink *devlink, 
> unsigned int timeout_seconds)

+const char *status_msg, I assume?

> This can pass up a timeout parameter at the beginning of the flash and 
> the userland utility can do what it needs at that point to set up the UI 
> display.
> 
> I think using a struct internally to devlink.c/h might still have merit, 
> but I'm not sure there's a need to mess with the rest of the API just yet.

Do you mean that this will set the timeout on the entire operation? 
And then still trigger notifications with
devlink_flash_update_begin_notify()? I was considering that to avoid
the need to add a new param, but it doesn't match the need all that 
well, most of time the timeout is on a portion of the operation (e.g. 
a FW command), not the entire process.

If you mean that we can just provide a simpler helper to the drivers
and internally fill in @done and @total as 0 - that could be a good 
option. I'd still prefer if there was one devlink_nl_flash_update_fill() 
that gets a structure.
Jacob Keller Sept. 14, 2020, 11:15 p.m. UTC | #11
On 9/10/2020 10:56 AM, Jakub Kicinski wrote:
> IOW drop the component parameter from the normal helper, cause almost
> nobody uses that. The add a more full featured __ version, which would
> take the arg struct, the struct would include the timeout value.
> 
I would point out that the ice driver does use it to help indicate which
section of the flash is currently being updated.

i.e.

$ devlink dev flash pci/0000:af:00.0 file firmware.bin
Preparing to flash
[fw.mgmt] Erasing
[fw.mgmt] Erasing done
[fw.mgmt] Flashing 100%
[fw.mgmt] Flashing done 100%
[fw.undi] Erasing
[fw.undi] Erasing done
[fw.undi] Flashing 100%
[fw.undi] Flashing done 100%
[fw.netlist] Erasing
[fw.netlist] Erasing done
[fw.netlist] Flashing 100%
[fw.netlist] Flashing done 100%

I'd like to keep that, as it helps tell which component is currently
being updated. If we drop this, then either I have to manually build
strings which include the component name, or we lose this information on
display.

Thanks,
Jake
Jacob Keller Sept. 14, 2020, 11:19 p.m. UTC | #12
On 9/10/2020 10:56 AM, Jakub Kicinski wrote:
> On Wed, 9 Sep 2020 18:34:57 -0700 Shannon Nelson wrote:
>> On 9/9/20 12:22 PM, Jakub Kicinski wrote:
>>> On Wed, 9 Sep 2020 10:58:19 -0700 Shannon Nelson wrote:  
>>>>
>>>> I'm suggesting that this implementation using the existing devlink
>>>> logging services should suffice until someone can design, implement, and
>>>> get accepted a different bit of plumbing.  Unfortunately, that's not a
>>>> job that I can get to right now.  
>>> This hack is too nasty to be accepted.  
>>
>> Your comment earlier was
>>
>>  > I wonder if we can steal a page from systemd's book and display
>>  > "time until timeout", or whatchamacallit, like systemd does when it's
>>  > waiting for processes to quit. All drivers have some timeout set on the
>>  > operation. If users knew the driver sets timeout to n minutes and they
>>  > see the timer ticking up they'd be less likely to think the command has
>>  > hanged..  
>>
>> I implemented the loop such that the timeout value was the 100%, and 
>> each time through the loop the elapsed time value is sent, so the user 
>> gets to see the % value increasing as the wait goes on, in the same way 
>> they see the download progress percentage ticking away. 
> 
> Right but you said that in most cases the value never goes up to 25min,
> so user will see the value increment from 0 to say 5% very slowly and
> then jump to 100%.
> 
>> This is how I approached the stated requirement of user seeing the
>> "timer ticking up", using the existing machinery.  This seems to be
>> how devlink_flash_update_status_notify() is expected to be used, so
>> I'm a little surprised at the critique.
> 
> Sorry, I thought the systemd reference would be clear enough, see the
> screenshot here:
> 
> https://images.app.goo.gl/gz1Uwg6mcHEd3D2m7
> 
> Systemd prints something link:
> 
> bla bla bla (XXs / YYs)
> 
> where XX is the timer ticking up, and YY is the timeout value.
> 
>>> So to be clear your options are:
>>>   - plumb the single extra netlink parameter through to devlink
>>>   - wait for someone else to do that for you, before you get
>>> firmware flashing accepted upstream.
>>>  
>>
>> Since you seem to have something else in mind, a little more detail 
>> would be helpful.
>>
>> We currently see devlink updating a percentage, something like:
>> Downloading:  56%
>> using backspaces to overwrite the value as the updates are published.
>>
>> How do you envision the userland interpretation of the timeout
>> ticking? Do you want to see something like:
>> Installing - timeout seconds:  23
>> as a countdown?
> 
> I was under the impression that the systemd format would be familiar 
> to users, hence:
> 
> Downloading:  56% (Xm Ys / Zm Vz)

FWIW, I like this approach. I think all we need to implement it is to
send the additional timeout parameter as part of the status notify
command. Then, devlink userspace, if it sees a timeout can choose when
to start displaying the "(Xm Ys / Zm Vs)" portion. Userspace could track
elapsed time changed in its event loop until the message changes.

This would also work for the ice driver, where we indicate that an erase
command could take up to several minutes as well.

Thanks,
Jake
Jakub Kicinski Sept. 14, 2020, 11:36 p.m. UTC | #13
On Mon, 14 Sep 2020 16:15:28 -0700 Jacob Keller wrote:
> On 9/10/2020 10:56 AM, Jakub Kicinski wrote:
> > IOW drop the component parameter from the normal helper, cause almost
> > nobody uses that. The add a more full featured __ version, which would
> > take the arg struct, the struct would include the timeout value.
> >   
> I would point out that the ice driver does use it to help indicate which
> section of the flash is currently being updated.
> 
> i.e.
> 
> $ devlink dev flash pci/0000:af:00.0 file firmware.bin
> Preparing to flash
> [fw.mgmt] Erasing
> [fw.mgmt] Erasing done
> [fw.mgmt] Flashing 100%
> [fw.mgmt] Flashing done 100%
> [fw.undi] Erasing
> [fw.undi] Erasing done
> [fw.undi] Flashing 100%
> [fw.undi] Flashing done 100%
> [fw.netlist] Erasing
> [fw.netlist] Erasing done
> [fw.netlist] Flashing 100%
> [fw.netlist] Flashing done 100%
> 
> I'd like to keep that, as it helps tell which component is currently
> being updated. If we drop this, then either I have to manually build
> strings which include the component name, or we lose this information on
> display.

Thanks for pointing that out. My recollection was that ice and netdevsim
were the only two users, so I thought those could use the full __*
helper and pass an arg struct. But no strong feelings.
Shannon Nelson Sept. 14, 2020, 11:47 p.m. UTC | #14
On 9/14/20 4:36 PM, Jakub Kicinski wrote:
> On Mon, 14 Sep 2020 16:15:28 -0700 Jacob Keller wrote:
>> On 9/10/2020 10:56 AM, Jakub Kicinski wrote:
>>> IOW drop the component parameter from the normal helper, cause almost
>>> nobody uses that. The add a more full featured __ version, which would
>>> take the arg struct, the struct would include the timeout value.
>>>    
>> I would point out that the ice driver does use it to help indicate which
>> section of the flash is currently being updated.
>>
>> i.e.
>>
>> $ devlink dev flash pci/0000:af:00.0 file firmware.bin
>> Preparing to flash
>> [fw.mgmt] Erasing
>> [fw.mgmt] Erasing done
>> [fw.mgmt] Flashing 100%
>> [fw.mgmt] Flashing done 100%
>> [fw.undi] Erasing
>> [fw.undi] Erasing done
>> [fw.undi] Flashing 100%
>> [fw.undi] Flashing done 100%
>> [fw.netlist] Erasing
>> [fw.netlist] Erasing done
>> [fw.netlist] Flashing 100%
>> [fw.netlist] Flashing done 100%
>>
>> I'd like to keep that, as it helps tell which component is currently
>> being updated. If we drop this, then either I have to manually build
>> strings which include the component name, or we lose this information on
>> display.
> Thanks for pointing that out. My recollection was that ice and netdevsim
> were the only two users, so I thought those could use the full __*
> helper and pass an arg struct. But no strong feelings.

Thanks, both.

I'd been going back and forth all morning about whether a simple single 
timeout or a timeout for each "chunk" would be appropriate. I'll try to 
be back in another day or three with an RFC.

sln
Jacob Keller Sept. 15, 2020, 12:52 a.m. UTC | #15
> -----Original Message-----
> From: netdev-owner@vger.kernel.org <netdev-owner@vger.kernel.org> On
> Behalf Of Jakub Kicinski
> Sent: Monday, September 14, 2020 4:36 PM
> To: Keller, Jacob E <jacob.e.keller@intel.com>
> Cc: Shannon Nelson <snelson@pensando.io>; netdev@vger.kernel.org;
> davem@davemloft.net
> Subject: Re: [PATCH v3 net-next 2/2] ionic: add devlink firmware update
> 
> On Mon, 14 Sep 2020 16:15:28 -0700 Jacob Keller wrote:
> > On 9/10/2020 10:56 AM, Jakub Kicinski wrote:
> > > IOW drop the component parameter from the normal helper, cause almost
> > > nobody uses that. The add a more full featured __ version, which would
> > > take the arg struct, the struct would include the timeout value.
> > >
> > I would point out that the ice driver does use it to help indicate which
> > section of the flash is currently being updated.
> >
> > i.e.
> >
> > $ devlink dev flash pci/0000:af:00.0 file firmware.bin
> > Preparing to flash
> > [fw.mgmt] Erasing
> > [fw.mgmt] Erasing done
> > [fw.mgmt] Flashing 100%
> > [fw.mgmt] Flashing done 100%
> > [fw.undi] Erasing
> > [fw.undi] Erasing done
> > [fw.undi] Flashing 100%
> > [fw.undi] Flashing done 100%
> > [fw.netlist] Erasing
> > [fw.netlist] Erasing done
> > [fw.netlist] Flashing 100%
> > [fw.netlist] Flashing done 100%
> >
> > I'd like to keep that, as it helps tell which component is currently
> > being updated. If we drop this, then either I have to manually build
> > strings which include the component name, or we lose this information on
> > display.
> 
> Thanks for pointing that out. My recollection was that ice and netdevsim
> were the only two users, so I thought those could use the full __*
> helper and pass an arg struct. But no strong feelings.

Yea that would work for me :)

Thanks,
Jake
Shannon Nelson Sept. 15, 2020, 1:14 a.m. UTC | #16
On 9/14/20 5:53 PM, Keller, Jacob E wrote:
>
>> -----Original Message-----
>> From: Shannon Nelson <snelson@pensando.io>
>> Sent: Monday, September 14, 2020 4:47 PM
>> To: Jakub Kicinski <kuba@kernel.org>; Keller, Jacob E <jacob.e.keller@intel.com>
>> Cc: netdev@vger.kernel.org; davem@davemloft.net
>> Subject: Re: [PATCH v3 net-next 2/2] ionic: add devlink firmware update
>>
>> On 9/14/20 4:36 PM, Jakub Kicinski wrote:
>>> On Mon, 14 Sep 2020 16:15:28 -0700 Jacob Keller wrote:
>>>> On 9/10/2020 10:56 AM, Jakub Kicinski wrote:
>>>>> IOW drop the component parameter from the normal helper, cause almost
>>>>> nobody uses that. The add a more full featured __ version, which would
>>>>> take the arg struct, the struct would include the timeout value.
>>>>>
>>>> I would point out that the ice driver does use it to help indicate which
>>>> section of the flash is currently being updated.
>>>>
>>>> i.e.
>>>>
>>>> $ devlink dev flash pci/0000:af:00.0 file firmware.bin
>>>> Preparing to flash
>>>> [fw.mgmt] Erasing
>>>> [fw.mgmt] Erasing done
>>>> [fw.mgmt] Flashing 100%
>>>> [fw.mgmt] Flashing done 100%
>>>> [fw.undi] Erasing
>>>> [fw.undi] Erasing done
>>>> [fw.undi] Flashing 100%
>>>> [fw.undi] Flashing done 100%
>>>> [fw.netlist] Erasing
>>>> [fw.netlist] Erasing done
>>>> [fw.netlist] Flashing 100%
>>>> [fw.netlist] Flashing done 100%
>>>>
>>>> I'd like to keep that, as it helps tell which component is currently
>>>> being updated. If we drop this, then either I have to manually build
>>>> strings which include the component name, or we lose this information on
>>>> display.
>>> Thanks for pointing that out. My recollection was that ice and netdevsim
>>> were the only two users, so I thought those could use the full __*
>>> helper and pass an arg struct. But no strong feelings.
>> Thanks, both.
>>
>> I'd been going back and forth all morning about whether a simple single
>> timeout or a timeout for each "chunk" would be appropriate. I'll try to
>> be back in another day or three with an RFC.
>>
>> sln
> For ice, a timeout for each message/chunk makes the most sense, but I could see  a different reasoning when you have multiple steps all bounded by the same timeout
>
> Thanks,
> Jake
>

So now we're beginning to dance around timeout boundaries - how can we 
define the beginning and end of a timeout boundary, and how do they 
relate to the component and label?  Currently, if either the component 
or status_msg changes, the devlink user program does a newline to start 
a new status line.  The done and total values are used from each notify 
message to create a % value displayed, but are not dependent on any 
previous done or total values, so the total doesn't need to be the same 
value from status message to status message, even if the component and 
label remain the same, devlink will just print whatever % gets 
calculated that time.

I'm thinking that the behavior of the timeout value should remain 
separate from the component and status_msg values, such that once given, 
then the userland countdown continues on that timeout.  Each subsequent 
notify, regardless of component or label changes, should continue 
reporting that same timeout value for as long as it applies to the 
action.  If a new timeout value is reported, the countdown starts over.  
This continues until either the countdown finishes or the driver reports 
the flash as completed.  I think this allows is the flexibility for 
multiple steps that Jake alludes to above.  Does this make sense?

What should the userland program do when the timeout expires?  Start 
counting backwards?  Stop waiting?  Do we care to define this at the moment?

sln
Jakub Kicinski Sept. 15, 2020, 3:50 p.m. UTC | #17
On Mon, 14 Sep 2020 18:14:22 -0700 Shannon Nelson wrote:
> So now we're beginning to dance around timeout boundaries - how can we 
> define the beginning and end of a timeout boundary, and how do they 
> relate to the component and label?  Currently, if either the component 
> or status_msg changes, the devlink user program does a newline to start 
> a new status line.  The done and total values are used from each notify 
> message to create a % value displayed, but are not dependent on any 
> previous done or total values, so the total doesn't need to be the same 
> value from status message to status message, even if the component and 
> label remain the same, devlink will just print whatever % gets 
> calculated that time.

I think systemd removes the timeout marking when it moves on to the
next job, and so should devlink when it moves on to the next
component/status_msg.

> I'm thinking that the behavior of the timeout value should remain 
> separate from the component and status_msg values, such that once given, 
> then the userland countdown continues on that timeout.  Each subsequent 
> notify, regardless of component or label changes, should continue 
> reporting that same timeout value for as long as it applies to the 
> action.  If a new timeout value is reported, the countdown starts over.  

What if no timeout exists for the next action? Driver reports 0 to
"clear"?

> This continues until either the countdown finishes or the driver reports 
> the flash as completed.  I think this allows is the flexibility for 
> multiple steps that Jake alludes to above.  Does this make sense?

I disagree. This doesn't match reality/driver behavior and will lead to
timeouts counting to some random value, that's to say the drivers
timeout instant will not match when user space reaches timeout.

The timeout should be per notification, because drivers send a
notification per command, and commands have timeout.

The timeout is only needed if there is no progress to report, i.e.
driver is waiting for something to happen.

> What should the userland program do when the timeout expires?  Start 
> counting backwards?  Stop waiting?  Do we care to define this at the moment?

[component] bla bla X% (timeout reached)
Jacob Keller Sept. 15, 2020, 4:50 p.m. UTC | #18
> -----Original Message-----
> From: Jakub Kicinski <kuba@kernel.org>
> Sent: Tuesday, September 15, 2020 8:51 AM
> To: Shannon Nelson <snelson@pensando.io>
> Cc: Keller, Jacob E <jacob.e.keller@intel.com>; netdev@vger.kernel.org;
> davem@davemloft.net
> Subject: Re: [PATCH v3 net-next 2/2] ionic: add devlink firmware update
> 
> On Mon, 14 Sep 2020 18:14:22 -0700 Shannon Nelson wrote:
> > So now we're beginning to dance around timeout boundaries - how can we
> > define the beginning and end of a timeout boundary, and how do they
> > relate to the component and label?  Currently, if either the component
> > or status_msg changes, the devlink user program does a newline to start
> > a new status line.  The done and total values are used from each notify
> > message to create a % value displayed, but are not dependent on any
> > previous done or total values, so the total doesn't need to be the same
> > value from status message to status message, even if the component and
> > label remain the same, devlink will just print whatever % gets
> > calculated that time.
> 
> I think systemd removes the timeout marking when it moves on to the
> next job, and so should devlink when it moves on to the next
> component/status_msg.
> 
> > I'm thinking that the behavior of the timeout value should remain
> > separate from the component and status_msg values, such that once given,
> > then the userland countdown continues on that timeout.  Each subsequent
> > notify, regardless of component or label changes, should continue
> > reporting that same timeout value for as long as it applies to the
> > action.  If a new timeout value is reported, the countdown starts over.
> 
> What if no timeout exists for the next action? Driver reports 0 to
> "clear"?
> 
> > This continues until either the countdown finishes or the driver reports
> > the flash as completed.  I think this allows is the flexibility for
> > multiple steps that Jake alludes to above.  Does this make sense?
> 
> I disagree. This doesn't match reality/driver behavior and will lead to
> timeouts counting to some random value, that's to say the drivers
> timeout instant will not match when user space reaches timeout.
> 
> The timeout should be per notification, because drivers send a
> notification per command, and commands have timeout.
> 

This is how everything operates today. Just send a new status for every command.

Is that not how your case works?

> The timeout is only needed if there is no progress to report, i.e.
> driver is waiting for something to happen.
> 

Right.

> > What should the userland program do when the timeout expires?  Start
> > counting backwards?  Stop waiting?  Do we care to define this at the moment?
> 
> [component] bla bla X% (timeout reached)

Yep. I don't think userspace should bail or do anything but display here. Basically: the driver will timeout and then end the update process with an error. The timeout value is just a useful display so that users aren't confused why there is no output going on while waiting.

Thanks,
Jake
Shannon Nelson Sept. 15, 2020, 5:20 p.m. UTC | #19
On 9/15/20 9:50 AM, Keller, Jacob E wrote:
>
>> -----Original Message-----
>> From: Jakub Kicinski <kuba@kernel.org>
>> Sent: Tuesday, September 15, 2020 8:51 AM
>> To: Shannon Nelson <snelson@pensando.io>
>> Cc: Keller, Jacob E <jacob.e.keller@intel.com>; netdev@vger.kernel.org;
>> davem@davemloft.net
>> Subject: Re: [PATCH v3 net-next 2/2] ionic: add devlink firmware update
>>
>> On Mon, 14 Sep 2020 18:14:22 -0700 Shannon Nelson wrote:
>>> So now we're beginning to dance around timeout boundaries - how can we
>>> define the beginning and end of a timeout boundary, and how do they
>>> relate to the component and label?  Currently, if either the component
>>> or status_msg changes, the devlink user program does a newline to start
>>> a new status line.  The done and total values are used from each notify
>>> message to create a % value displayed, but are not dependent on any
>>> previous done or total values, so the total doesn't need to be the same
>>> value from status message to status message, even if the component and
>>> label remain the same, devlink will just print whatever % gets
>>> calculated that time.
>> I think systemd removes the timeout marking when it moves on to the
>> next job, and so should devlink when it moves on to the next
>> component/status_msg.

Works for me.  I'll try to note these UI implementation hints somewhere 
useful.

>>
>>> I'm thinking that the behavior of the timeout value should remain
>>> separate from the component and status_msg values, such that once given,
>>> then the userland countdown continues on that timeout.  Each subsequent
>>> notify, regardless of component or label changes, should continue
>>> reporting that same timeout value for as long as it applies to the
>>> action.  If a new timeout value is reported, the countdown starts over.
>> What if no timeout exists for the next action? Driver reports 0 to
>> "clear"?

Yes, that's what I would expect.

>>
>>> This continues until either the countdown finishes or the driver reports
>>> the flash as completed.  I think this allows is the flexibility for
>>> multiple steps that Jake alludes to above.  Does this make sense?
>> I disagree. This doesn't match reality/driver behavior and will lead to
>> timeouts counting to some random value, that's to say the drivers
>> timeout instant will not match when user space reaches timeout.
>>
>> The timeout should be per notification, because drivers send a
>> notification per command, and commands have timeout.
>>
> This is how everything operates today. Just send a new status for every command.
>
> Is that not how your case works?
>
>> The timeout is only needed if there is no progress to report, i.e.
>> driver is waiting for something to happen.
>>
> Right.
>
>>> What should the userland program do when the timeout expires?  Start
>>> counting backwards?  Stop waiting?  Do we care to define this at the moment?
>> [component] bla bla X% (timeout reached)
> Yep. I don't think userspace should bail or do anything but display here. Basically: the driver will timeout and then end the update process with an error. The timeout value is just a useful display so that users aren't confused why there is no output going on while waiting.
>
>

If individual notify messages have a timeout, how can we have a 
progress-percentage reported with a timeout?  This implies to me that 
the timeout is on the component:bla-bla pair, but there are many notify 
messages in order to show the progress in percentage done.  This is why 
I was suggesting that if the timeout and component and status messages 
haven't changed, then we're still working on the same timeout.

sln
Jakub Kicinski Sept. 15, 2020, 5:39 p.m. UTC | #20
On Tue, 15 Sep 2020 10:20:11 -0700 Shannon Nelson wrote:
> >>> What should the userland program do when the timeout expires?  Start
> >>> counting backwards?  Stop waiting?  Do we care to define this at the moment?  
> >> [component] bla bla X% (timeout reached)
> >  
> > Yep. I don't think userspace should bail or do anything but display
> > here. Basically: the driver will timeout and then end the update
> > process with an error. The timeout value is just a useful display
> > so that users aren't confused why there is no output going on while
> > waiting.
> >
> If individual notify messages have a timeout, how can we have a 
> progress-percentage reported with a timeout?  This implies to me that 
> the timeout is on the component:bla-bla pair, but there are many
> notify messages in order to show the progress in percentage done.
> This is why I was suggesting that if the timeout and component and
> status messages haven't changed, then we're still working on the same
> timeout.

My thinking is that the timeout is mostly useful for commands which
can't meaningfully provide the progress percentage, or the percentage
update is at a very high granularity. If percentage updates are reported
often they should usually be sufficient.

We mostly want to make sure user doesn't think the system has hung.
Jacob Keller Sept. 15, 2020, 6:44 p.m. UTC | #21
On 9/15/2020 10:39 AM, Jakub Kicinski wrote:
> On Tue, 15 Sep 2020 10:20:11 -0700 Shannon Nelson wrote:
>>>>> What should the userland program do when the timeout expires?  Start
>>>>> counting backwards?  Stop waiting?  Do we care to define this at the moment?  
>>>> [component] bla bla X% (timeout reached)
>>>  
>>> Yep. I don't think userspace should bail or do anything but display
>>> here. Basically: the driver will timeout and then end the update
>>> process with an error. The timeout value is just a useful display
>>> so that users aren't confused why there is no output going on while
>>> waiting.
>>>
>> If individual notify messages have a timeout, how can we have a 
>> progress-percentage reported with a timeout?  This implies to me that 
>> the timeout is on the component:bla-bla pair, but there are many
>> notify messages in order to show the progress in percentage done.
>> This is why I was suggesting that if the timeout and component and
>> status messages haven't changed, then we're still working on the same
>> timeout.
> 
> My thinking is that the timeout is mostly useful for commands which
> can't meaningfully provide the progress percentage, or the percentage
> update is at a very high granularity. If percentage updates are reported
> often they should usually be sufficient.
> 
> We mostly want to make sure user doesn't think the system has hung.
> 

Exactly how I saw it.

Basically, the timeout should take effect as long as the (component,
msg) pair stays the same.

So if you send percentage reports with the same message and component,
then the timeout stays in effect. Once you start a new message, then the
timeout would be reset.

We could in theory provide both a "timeout" and "time elapsed" field,
which would allow the application to draw the timeout at an abitrary
point. Then it could progress the time as normal if it hasn't received a
new message yet, allowing for consistent screen updates...

But I think that might be overkill. For the cases where we do get some
sort of progress, then the percentage update is usually enough.
Jakub Kicinski Sept. 15, 2020, 7 p.m. UTC | #22
On Tue, 15 Sep 2020 11:44:07 -0700 Jacob Keller wrote:
> Exactly how I saw it.
> 
> Basically, the timeout should take effect as long as the (component,
> msg) pair stays the same.
> 
> So if you send percentage reports with the same message and component,
> then the timeout stays in effect. Once you start a new message, then the
> timeout would be reset.

I don't think I agree with that. As I said that'd make the timeout not
match the reality of what happens in the driver.

Say I have 4 updates (every 25%) each has a timeout of 30 seconds.
If I understand what you're saying correctly you'd set a timeout of 
2 min for the operation. But if first two chunks finish in 10 seconds,
and 3rd one timeouts out the timeout will happen (in the driver) when
the user-visible timer is at (50sec / 2 min). 

I think that each notification should update the timeout. And like
systemd we should not display the timeout counter in the first, say 5
seconds to minimize the display noise.

> We could in theory provide both a "timeout" and "time elapsed" field,
> which would allow the application to draw the timeout at an abitrary
> point. Then it could progress the time as normal if it hasn't received a
> new message yet, allowing for consistent screen updates...

I'm not sure I follow this part.

> But I think that might be overkill. For the cases where we do get some
> sort of progress, then the percentage update is usually enough.
Shannon Nelson Sept. 15, 2020, 10:11 p.m. UTC | #23
On 9/15/20 12:00 PM, Jakub Kicinski wrote:
> On Tue, 15 Sep 2020 11:44:07 -0700 Jacob Keller wrote:
>> Exactly how I saw it.
>>
>> Basically, the timeout should take effect as long as the (component,
>> msg) pair stays the same.
>>
>> So if you send percentage reports with the same message and component,
>> then the timeout stays in effect. Once you start a new message, then the
>> timeout would be reset.
> I don't think I agree with that. As I said that'd make the timeout not
> match the reality of what happens in the driver.

I have an asynchronous FW interaction where the driver sends one FW 
command to start the fw-install and sends several more FW commands to 
check on the status until either it gets a done or error status or too 
many seconds have elapsed.  How would you suggest this gets modeled?

In the model you are suggesting, the driver can only do a single 
status_notify with a timeout before the initial async FW command, then 
no other status_notify messages until the driver gets the done/error 
status, or the time has elapsed, regardless of how long that might 
take.  The user will only see the timeout ticking, but no activity from 
the driver.  This allows the user to see what the deadline is, but 
doesn't reassure them that the process is still moving.

I'm suggesting that the driver might send some intermediate 
status_notify messages in order to assure the user that things are not 
stalled out.  Driving a spinner would be nice, but we don't have that 
concept in this interface, so poking the done/total values could be used 
for that.  In order to not reset the timeout on each of these 
intermediate updates, we pass the same timeout value.

At this point I'm going to try a patchset that implements the basics 
that we already have agreed upon, and this detail can get worked out 
later, as I believe it doesn't change the internal implementation.

sln
Jakub Kicinski Sept. 15, 2020, 10:27 p.m. UTC | #24
On Tue, 15 Sep 2020 15:11:06 -0700 Shannon Nelson wrote:
> On 9/15/20 12:00 PM, Jakub Kicinski wrote:
> > On Tue, 15 Sep 2020 11:44:07 -0700 Jacob Keller wrote:  
> >> Exactly how I saw it.
> >>
> >> Basically, the timeout should take effect as long as the (component,
> >> msg) pair stays the same.
> >>
> >> So if you send percentage reports with the same message and component,
> >> then the timeout stays in effect. Once you start a new message, then the
> >> timeout would be reset.  
> > I don't think I agree with that. As I said that'd make the timeout not
> > match the reality of what happens in the driver.  
> 
> I have an asynchronous FW interaction where the driver sends one FW 
> command to start the fw-install and sends several more FW commands to 
> check on the status until either it gets a done or error status or too 
> many seconds have elapsed.  How would you suggest this gets modeled?

It's still one command. The fact that the driver periodically checks if
its finished is an implementation detail. Drivers which periodically
check if "done bit" in some register got cleared or not also don't send
a notification every time they've done so.

> In the model you are suggesting, the driver can only do a single 
> status_notify with a timeout before the initial async FW command, then 
> no other status_notify messages until the driver gets the done/error 
> status, or the time has elapsed, regardless of how long that might 
> take.  The user will only see the timeout ticking, but no activity from 
> the driver.  This allows the user to see what the deadline is, but 
> doesn't reassure them that the process is still moving.

The timer should be a sufficient indication to the user not to worry,
yet. The worrying starts once the timer expires, then something is up.

> I'm suggesting that the driver might send some intermediate 
> status_notify messages in order to assure the user that things are not 
> stalled out.  Driving a spinner would be nice, but we don't have that 
> concept in this interface, so poking the done/total values could be used 
> for that.  In order to not reset the timeout on each of these 
> intermediate updates, we pass the same timeout value.

What useful status_notify messages can a driver send mid-command?
Timeout tells the user "for this much time you may not see any real
status updates".

> At this point I'm going to try a patchset that implements the basics 
> that we already have agreed upon, and this detail can get worked out 
> later, as I believe it doesn't change the internal implementation.
diff mbox series

Patch

diff --git a/drivers/net/ethernet/pensando/ionic/Makefile b/drivers/net/ethernet/pensando/ionic/Makefile
index 29f304d75261..8d3c2d3cb10d 100644
--- a/drivers/net/ethernet/pensando/ionic/Makefile
+++ b/drivers/net/ethernet/pensando/ionic/Makefile
@@ -5,4 +5,4 @@  obj-$(CONFIG_IONIC) := ionic.o
 
 ionic-y := ionic_main.o ionic_bus_pci.o ionic_devlink.o ionic_dev.o \
 	   ionic_debugfs.o ionic_lif.o ionic_rx_filter.o ionic_ethtool.o \
-	   ionic_txrx.o ionic_stats.o
+	   ionic_txrx.o ionic_stats.o ionic_fw.o
diff --git a/drivers/net/ethernet/pensando/ionic/ionic_devlink.c b/drivers/net/ethernet/pensando/ionic/ionic_devlink.c
index 8d9fb2e19cca..5348f05ebc32 100644
--- a/drivers/net/ethernet/pensando/ionic/ionic_devlink.c
+++ b/drivers/net/ethernet/pensando/ionic/ionic_devlink.c
@@ -9,6 +9,19 @@ 
 #include "ionic_lif.h"
 #include "ionic_devlink.h"
 
+static int ionic_dl_flash_update(struct devlink *dl,
+				 const char *fwname,
+				 const char *component,
+				 struct netlink_ext_ack *extack)
+{
+	struct ionic *ionic = devlink_priv(dl);
+
+	if (component)
+		return -EOPNOTSUPP;
+
+	return ionic_firmware_update(ionic->lif, fwname, extack);
+}
+
 static int ionic_dl_info_get(struct devlink *dl, struct devlink_info_req *req,
 			     struct netlink_ext_ack *extack)
 {
@@ -48,6 +61,7 @@  static int ionic_dl_info_get(struct devlink *dl, struct devlink_info_req *req,
 
 static const struct devlink_ops ionic_dl_ops = {
 	.info_get	= ionic_dl_info_get,
+	.flash_update	= ionic_dl_flash_update,
 };
 
 struct ionic *ionic_devlink_alloc(struct device *dev)
diff --git a/drivers/net/ethernet/pensando/ionic/ionic_devlink.h b/drivers/net/ethernet/pensando/ionic/ionic_devlink.h
index 0690172fc57a..5c01a9e306d8 100644
--- a/drivers/net/ethernet/pensando/ionic/ionic_devlink.h
+++ b/drivers/net/ethernet/pensando/ionic/ionic_devlink.h
@@ -6,6 +6,9 @@ 
 
 #include <net/devlink.h>
 
+int ionic_firmware_update(struct ionic_lif *lif, const char *fw_name,
+			  struct netlink_ext_ack *extack);
+
 struct ionic *ionic_devlink_alloc(struct device *dev);
 void ionic_devlink_free(struct ionic *ionic);
 int ionic_devlink_register(struct ionic *ionic);
diff --git a/drivers/net/ethernet/pensando/ionic/ionic_fw.c b/drivers/net/ethernet/pensando/ionic/ionic_fw.c
new file mode 100644
index 000000000000..c8d699e219d4
--- /dev/null
+++ b/drivers/net/ethernet/pensando/ionic/ionic_fw.c
@@ -0,0 +1,209 @@ 
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 2020 Pensando Systems, Inc */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/firmware.h>
+
+#include "ionic.h"
+#include "ionic_dev.h"
+#include "ionic_lif.h"
+#include "ionic_devlink.h"
+
+/* The worst case wait for the install activity is about 25 minutes when
+ * installing a new CPLD, which is very seldom.  Normal is about 30-35
+ * seconds.  Since the driver can't tell if a CPLD update will happen we
+ * set the timeout for the ugly case.
+ */
+#define IONIC_FW_INSTALL_TIMEOUT	(25 * 60)
+#define IONIC_FW_SELECT_TIMEOUT		30
+
+/* Number of periodic log updates during fw file download */
+#define IONIC_FW_INTERVAL_FRACTION	32
+
+static void ionic_dev_cmd_firmware_download(struct ionic_dev *idev, u64 addr,
+					    u32 offset, u32 length)
+{
+	union ionic_dev_cmd cmd = {
+		.fw_download.opcode = IONIC_CMD_FW_DOWNLOAD,
+		.fw_download.offset = offset,
+		.fw_download.addr = addr,
+		.fw_download.length = length
+	};
+
+	ionic_dev_cmd_go(idev, &cmd);
+}
+
+static void ionic_dev_cmd_firmware_install(struct ionic_dev *idev)
+{
+	union ionic_dev_cmd cmd = {
+		.fw_control.opcode = IONIC_CMD_FW_CONTROL,
+		.fw_control.oper = IONIC_FW_INSTALL_ASYNC
+	};
+
+	ionic_dev_cmd_go(idev, &cmd);
+}
+
+static void ionic_dev_cmd_firmware_activate(struct ionic_dev *idev, u8 slot)
+{
+	union ionic_dev_cmd cmd = {
+		.fw_control.opcode = IONIC_CMD_FW_CONTROL,
+		.fw_control.oper = IONIC_FW_ACTIVATE_ASYNC,
+		.fw_control.slot = slot
+	};
+
+	ionic_dev_cmd_go(idev, &cmd);
+}
+
+static int ionic_fw_status_long_wait(struct ionic *ionic,
+				     const char *label,
+				     unsigned long timeout,
+				     u8 fw_cmd,
+				     struct netlink_ext_ack *extack)
+{
+	union ionic_dev_cmd cmd = {
+		.fw_control.opcode = IONIC_CMD_FW_CONTROL,
+		.fw_control.oper = fw_cmd,
+	};
+	unsigned long start_time;
+	unsigned long end_time;
+	struct devlink *dl;
+	int err;
+
+	dl = priv_to_devlink(ionic);
+	devlink_flash_update_status_notify(dl, label, NULL, 1, timeout);
+	start_time = jiffies;
+	end_time = start_time + (timeout * HZ);
+	do {
+		mutex_lock(&ionic->dev_cmd_lock);
+		ionic_dev_cmd_go(&ionic->idev, &cmd);
+		err = ionic_dev_cmd_wait(ionic, DEVCMD_TIMEOUT);
+		mutex_unlock(&ionic->dev_cmd_lock);
+
+		devlink_flash_update_status_notify(dl, label, NULL,
+						   (jiffies - start_time) / HZ,
+						   timeout);
+	} while (time_before(jiffies, end_time) && (err == -EAGAIN || err == -ETIMEDOUT));
+
+	if (err == -EAGAIN || err == -ETIMEDOUT) {
+		NL_SET_ERR_MSG_MOD(extack, "Firmware wait timed out");
+		dev_err(ionic->dev, "DEV_CMD firmware wait %s timed out\n", label);
+	} else if (err) {
+		NL_SET_ERR_MSG_MOD(extack, "Firmware wait failed");
+	} else {
+		devlink_flash_update_status_notify(dl, label, NULL, timeout, timeout);
+	}
+
+	return err;
+}
+
+int ionic_firmware_update(struct ionic_lif *lif, const char *fw_name,
+			  struct netlink_ext_ack *extack)
+{
+	struct ionic_dev *idev = &lif->ionic->idev;
+	struct net_device *netdev = lif->netdev;
+	struct ionic *ionic = lif->ionic;
+	union ionic_dev_cmd_comp comp;
+	u32 buf_sz, copy_sz, offset;
+	const struct firmware *fw;
+	struct devlink *dl;
+	int next_interval;
+	int err = 0;
+	u8 fw_slot;
+
+	netdev_info(netdev, "Installing firmware %s\n", fw_name);
+
+	dl = priv_to_devlink(ionic);
+	devlink_flash_update_begin_notify(dl);
+	devlink_flash_update_status_notify(dl, "Preparing to flash", NULL, 0, 0);
+
+	err = request_firmware(&fw, fw_name, ionic->dev);
+	if (err) {
+		NL_SET_ERR_MSG_MOD(extack, "Unable to find firmware file");
+		goto err_out;
+	}
+
+	buf_sz = sizeof(idev->dev_cmd_regs->data);
+
+	netdev_dbg(netdev,
+		   "downloading firmware - size %d part_sz %d nparts %lu\n",
+		   (int)fw->size, buf_sz, DIV_ROUND_UP(fw->size, buf_sz));
+
+	devlink_flash_update_status_notify(dl, "Downloading", NULL, 0, fw->size);
+	offset = 0;
+	next_interval = fw->size / IONIC_FW_INTERVAL_FRACTION;
+	while (offset < fw->size) {
+		copy_sz = min_t(unsigned int, buf_sz, fw->size - offset);
+		mutex_lock(&ionic->dev_cmd_lock);
+		memcpy_toio(&idev->dev_cmd_regs->data, fw->data + offset, copy_sz);
+		ionic_dev_cmd_firmware_download(idev,
+						offsetof(union ionic_dev_cmd_regs, data),
+						offset, copy_sz);
+		err = ionic_dev_cmd_wait(ionic, DEVCMD_TIMEOUT);
+		mutex_unlock(&ionic->dev_cmd_lock);
+		if (err) {
+			netdev_err(netdev,
+				   "download failed offset 0x%x addr 0x%lx len 0x%x\n",
+				   offset, offsetof(union ionic_dev_cmd_regs, data),
+				   copy_sz);
+			NL_SET_ERR_MSG_MOD(extack, "Segment download failed");
+			goto err_out;
+		}
+		offset += copy_sz;
+
+		if (offset > next_interval) {
+			devlink_flash_update_status_notify(dl, "Downloading",
+							   NULL, offset, fw->size);
+			next_interval = offset + (fw->size / IONIC_FW_INTERVAL_FRACTION);
+		}
+	}
+	devlink_flash_update_status_notify(dl, "Downloading", NULL, 1, 1);
+
+	devlink_flash_update_status_notify(dl, "Installing", NULL, 0, 2);
+
+	mutex_lock(&ionic->dev_cmd_lock);
+	ionic_dev_cmd_firmware_install(idev);
+	err = ionic_dev_cmd_wait(ionic, DEVCMD_TIMEOUT);
+	ionic_dev_cmd_comp(idev, (union ionic_dev_cmd_comp *)&comp);
+	fw_slot = comp.fw_control.slot;
+	mutex_unlock(&ionic->dev_cmd_lock);
+	if (err) {
+		NL_SET_ERR_MSG_MOD(extack, "Failed to start firmware install");
+		goto err_out;
+	}
+
+	err = ionic_fw_status_long_wait(ionic, "Installing",
+					IONIC_FW_INSTALL_TIMEOUT,
+					IONIC_FW_INSTALL_STATUS,
+					extack);
+	if (err)
+		goto err_out;
+
+	devlink_flash_update_status_notify(dl, "Selecting", NULL, 0, 2);
+
+	mutex_lock(&ionic->dev_cmd_lock);
+	ionic_dev_cmd_firmware_activate(idev, fw_slot);
+	err = ionic_dev_cmd_wait(ionic, DEVCMD_TIMEOUT);
+	mutex_unlock(&ionic->dev_cmd_lock);
+	if (err) {
+		NL_SET_ERR_MSG_MOD(extack, "Failed to start firmware select");
+		goto err_out;
+	}
+
+	err = ionic_fw_status_long_wait(ionic, "Selecting",
+					IONIC_FW_SELECT_TIMEOUT,
+					IONIC_FW_ACTIVATE_STATUS,
+					extack);
+	if (err)
+		goto err_out;
+
+	netdev_info(netdev, "Firmware update completed\n");
+
+err_out:
+	if (err)
+		devlink_flash_update_status_notify(dl, "Flash failed", NULL, 0, 0);
+	release_firmware(fw);
+	devlink_flash_update_end_notify(dl);
+	return err;
+}
diff --git a/drivers/net/ethernet/pensando/ionic/ionic_main.c b/drivers/net/ethernet/pensando/ionic/ionic_main.c
index f1fd9a98ae4a..c0979b9e38fc 100644
--- a/drivers/net/ethernet/pensando/ionic/ionic_main.c
+++ b/drivers/net/ethernet/pensando/ionic/ionic_main.c
@@ -361,17 +361,22 @@  int ionic_dev_cmd_wait(struct ionic *ionic, unsigned long max_seconds)
 	 */
 	max_wait = jiffies + (max_seconds * HZ);
 try_again:
+	opcode = idev->dev_cmd_regs->cmd.cmd.opcode;
 	start_time = jiffies;
 	do {
 		done = ionic_dev_cmd_done(idev);
 		if (done)
 			break;
-		msleep(5);
-		hb = ionic_heartbeat_check(ionic);
+		usleep_range(100, 200);
+
+		/* Don't check the heartbeat on FW_CONTROL commands as they are
+		 * notorious for interrupting the firmware's heartbeat update.
+		 */
+		if (opcode != IONIC_CMD_FW_CONTROL)
+			hb = ionic_heartbeat_check(ionic);
 	} while (!done && !hb && time_before(jiffies, max_wait));
 	duration = jiffies - start_time;
 
-	opcode = idev->dev_cmd_regs->cmd.cmd.opcode;
 	dev_dbg(ionic->dev, "DEVCMD %s (%d) done=%d took %ld secs (%ld jiffies)\n",
 		ionic_opcode_to_str(opcode), opcode,
 		done, duration / HZ, duration);
@@ -395,8 +400,9 @@  int ionic_dev_cmd_wait(struct ionic *ionic, unsigned long max_seconds)
 
 	err = ionic_dev_cmd_status(&ionic->idev);
 	if (err) {
-		if (err == IONIC_RC_EAGAIN && !time_after(jiffies, max_wait)) {
-			dev_err(ionic->dev, "DEV_CMD %s (%d) error, %s (%d) retrying...\n",
+		if (err == IONIC_RC_EAGAIN &&
+		    time_before(jiffies, (max_wait - HZ))) {
+			dev_dbg(ionic->dev, "DEV_CMD %s (%d), %s (%d) retrying...\n",
 				ionic_opcode_to_str(opcode), opcode,
 				ionic_error_to_str(err), err);
 
@@ -406,9 +412,10 @@  int ionic_dev_cmd_wait(struct ionic *ionic, unsigned long max_seconds)
 			goto try_again;
 		}
 
-		dev_err(ionic->dev, "DEV_CMD %s (%d) error, %s (%d) failed\n",
-			ionic_opcode_to_str(opcode), opcode,
-			ionic_error_to_str(err), err);
+		if (!(opcode == IONIC_CMD_FW_CONTROL && err == IONIC_RC_EAGAIN))
+			dev_err(ionic->dev, "DEV_CMD %s (%d) error, %s (%d) failed\n",
+				ionic_opcode_to_str(opcode), opcode,
+				ionic_error_to_str(err), err);
 
 		return ionic_error_to_errno(err);
 	}