diff mbox series

[RFC,v2,03/18] scsi: core: Implement reserved command handling

Message ID 1654770559-101375-4-git-send-email-john.garry@huawei.com
State New
Headers show
Series blk-mq/libata/scsi: SCSI driver tagging improvements | expand

Commit Message

John Garry June 9, 2022, 10:29 a.m. UTC
From: Hannes Reinecke <hare@suse.de>

Quite some drivers are using management commands internally, which
typically use the same hardware tag pool (ie they are being allocated
from the same hardware resources) as the 'normal' I/O commands.
These commands are set aside before allocating the block-mq tag bitmap,
so they'll never show up as busy in the tag map.
The block-layer, OTOH, already has 'reserved_tags' to handle precisely
this situation.
So this patch adds a new field 'nr_reserved_cmds' to the SCSI host
template to instruct the block layer to set aside a tag space for these
management commands by using reserved tags.

Signed-off-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: John Garry <john.garry@huawei.com>
---
 drivers/scsi/hosts.c     |  3 +++
 drivers/scsi/scsi_lib.c  |  6 +++++-
 include/scsi/scsi_host.h | 22 +++++++++++++++++++++-
 3 files changed, 29 insertions(+), 2 deletions(-)

Comments

Damien Le Moal June 13, 2022, 7:01 a.m. UTC | #1
On 6/9/22 19:29, John Garry wrote:
> From: Hannes Reinecke <hare@suse.de>
> 
> Quite some drivers are using management commands internally, which
> typically use the same hardware tag pool (ie they are being allocated
> from the same hardware resources) as the 'normal' I/O commands.
> These commands are set aside before allocating the block-mq tag bitmap,
> so they'll never show up as busy in the tag map.
> The block-layer, OTOH, already has 'reserved_tags' to handle precisely
> this situation.
> So this patch adds a new field 'nr_reserved_cmds' to the SCSI host
> template to instruct the block layer to set aside a tag space for these
> management commands by using reserved tags.
> 
> Signed-off-by: Hannes Reinecke <hare@suse.de>
> Signed-off-by: John Garry <john.garry@huawei.com>
> ---
>  drivers/scsi/hosts.c     |  3 +++
>  drivers/scsi/scsi_lib.c  |  6 +++++-
>  include/scsi/scsi_host.h | 22 +++++++++++++++++++++-
>  3 files changed, 29 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/scsi/hosts.c b/drivers/scsi/hosts.c
> index 8352f90d997d..27296addaf63 100644
> --- a/drivers/scsi/hosts.c
> +++ b/drivers/scsi/hosts.c
> @@ -474,6 +474,9 @@ struct Scsi_Host *scsi_host_alloc(struct scsi_host_template *sht, int privsize)
>  	if (sht->virt_boundary_mask)
>  		shost->virt_boundary_mask = sht->virt_boundary_mask;
>  
> +	if (sht->nr_reserved_cmds)
> +		shost->nr_reserved_cmds = sht->nr_reserved_cmds;
> +
>  	device_initialize(&shost->shost_gendev);
>  	dev_set_name(&shost->shost_gendev, "host%d", shost->host_no);
>  	shost->shost_gendev.bus = &scsi_bus_type;
> diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
> index 6ffc9e4258a8..f6e53c6d913c 100644
> --- a/drivers/scsi/scsi_lib.c
> +++ b/drivers/scsi/scsi_lib.c
> @@ -1974,8 +1974,12 @@ int scsi_mq_setup_tags(struct Scsi_Host *shost)
>  	else
>  		tag_set->ops = &scsi_mq_ops_no_commit;
>  	tag_set->nr_hw_queues = shost->nr_hw_queues ? : 1;
> +
>  	tag_set->nr_maps = shost->nr_maps ? : 1;
> -	tag_set->queue_depth = shost->can_queue;
> +	tag_set->queue_depth =
> +		shost->can_queue + shost->nr_reserved_cmds;
> +	tag_set->reserved_tags = shost->nr_reserved_cmds;
> +
>  	tag_set->cmd_size = cmd_size;
>  	tag_set->numa_node = dev_to_node(shost->dma_dev);
>  	tag_set->flags = BLK_MQ_F_SHOULD_MERGE;
> diff --git a/include/scsi/scsi_host.h b/include/scsi/scsi_host.h
> index 59aef1f178f5..149dcbd4125e 100644
> --- a/include/scsi/scsi_host.h
> +++ b/include/scsi/scsi_host.h
> @@ -366,10 +366,19 @@ struct scsi_host_template {
>  	/*
>  	 * This determines if we will use a non-interrupt driven
>  	 * or an interrupt driven scheme.  It is set to the maximum number
> -	 * of simultaneous commands a single hw queue in HBA will accept.
> +	 * of simultaneous commands a single hw queue in HBA will accept
> +	 * excluding internal commands.
>  	 */
>  	int can_queue;
>  
> +	/*
> +	 * This determines how many commands the HBA will set aside
> +	 * for internal commands. This number will be added to
> +	 * @can_queue to calcumate the maximum number of simultaneous

s/calcumate/calculate

But this is weird. For SATA, can_queue is 32. Having reserved commands,
that number needs to stay the same. We cannot have more than 32 tags.
I think keeping can_queue as the max queue depth with at most
nr_reserved_cmds tags reserved is better.

> +	 * commands sent to the host.
> +	 */
> +	int nr_reserved_cmds;
> +
>  	/*
>  	 * In many instances, especially where disconnect / reconnect are
>  	 * supported, our host also has an ID on the SCSI bus.  If this is
> @@ -602,6 +611,11 @@ struct Scsi_Host {
>  	unsigned short max_cmd_len;
>  
>  	int this_id;
> +
> +	/*
> +	 * Number of commands this host can handle at the same time.
> +	 * This excludes reserved commands as specified by nr_reserved_cmds.
> +	 */
>  	int can_queue;
>  	short cmd_per_lun;
>  	short unsigned int sg_tablesize;
> @@ -620,6 +634,12 @@ struct Scsi_Host {
>  	 */
>  	unsigned nr_hw_queues;
>  	unsigned nr_maps;
> +
> +	/*
> +	 * Number of reserved commands to allocate, if any.
> +	 */
> +	unsigned nr_reserved_cmds;
> +
>  	unsigned active_mode:2;
>  
>  	/*
John Garry June 13, 2022, 8:25 a.m. UTC | #2
On 13/06/2022 08:01, Damien Le Moal wrote:
> On 6/9/22 19:29, John Garry wrote:
>> From: Hannes Reinecke <hare@suse.de>
>>
>> Quite some drivers are using management commands internally, which
>> typically use the same hardware tag pool (ie they are being allocated
>> from the same hardware resources) as the 'normal' I/O commands.
>> These commands are set aside before allocating the block-mq tag bitmap,
>> so they'll never show up as busy in the tag map.
>> The block-layer, OTOH, already has 'reserved_tags' to handle precisely
>> this situation.
>> So this patch adds a new field 'nr_reserved_cmds' to the SCSI host
>> template to instruct the block layer to set aside a tag space for these
>> management commands by using reserved tags.
>>
>> Signed-off-by: Hannes Reinecke <hare@suse.de>
>> Signed-off-by: John Garry <john.garry@huawei.com>
>> ---
>>   drivers/scsi/hosts.c     |  3 +++
>>   drivers/scsi/scsi_lib.c  |  6 +++++-
>>   include/scsi/scsi_host.h | 22 +++++++++++++++++++++-
>>   3 files changed, 29 insertions(+), 2 deletions(-)
>>
>> diff --git a/drivers/scsi/hosts.c b/drivers/scsi/hosts.c
>> index 8352f90d997d..27296addaf63 100644
>> --- a/drivers/scsi/hosts.c
>> +++ b/drivers/scsi/hosts.c
>> @@ -474,6 +474,9 @@ struct Scsi_Host *scsi_host_alloc(struct scsi_host_template *sht, int privsize)
>>   	if (sht->virt_boundary_mask)
>>   		shost->virt_boundary_mask = sht->virt_boundary_mask;
>>   
>> +	if (sht->nr_reserved_cmds)
>> +		shost->nr_reserved_cmds = sht->nr_reserved_cmds;
>> +
>>   	device_initialize(&shost->shost_gendev);
>>   	dev_set_name(&shost->shost_gendev, "host%d", shost->host_no);
>>   	shost->shost_gendev.bus = &scsi_bus_type;
>> diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
>> index 6ffc9e4258a8..f6e53c6d913c 100644
>> --- a/drivers/scsi/scsi_lib.c
>> +++ b/drivers/scsi/scsi_lib.c
>> @@ -1974,8 +1974,12 @@ int scsi_mq_setup_tags(struct Scsi_Host *shost)
>>   	else
>>   		tag_set->ops = &scsi_mq_ops_no_commit;
>>   	tag_set->nr_hw_queues = shost->nr_hw_queues ? : 1;
>> +
>>   	tag_set->nr_maps = shost->nr_maps ? : 1;
>> -	tag_set->queue_depth = shost->can_queue;
>> +	tag_set->queue_depth =
>> +		shost->can_queue + shost->nr_reserved_cmds;
>> +	tag_set->reserved_tags = shost->nr_reserved_cmds;
>> +
>>   	tag_set->cmd_size = cmd_size;
>>   	tag_set->numa_node = dev_to_node(shost->dma_dev);
>>   	tag_set->flags = BLK_MQ_F_SHOULD_MERGE;
>> diff --git a/include/scsi/scsi_host.h b/include/scsi/scsi_host.h
>> index 59aef1f178f5..149dcbd4125e 100644
>> --- a/include/scsi/scsi_host.h
>> +++ b/include/scsi/scsi_host.h
>> @@ -366,10 +366,19 @@ struct scsi_host_template {
>>   	/*
>>   	 * This determines if we will use a non-interrupt driven
>>   	 * or an interrupt driven scheme.  It is set to the maximum number
>> -	 * of simultaneous commands a single hw queue in HBA will accept.
>> +	 * of simultaneous commands a single hw queue in HBA will accept
>> +	 * excluding internal commands.
>>   	 */
>>   	int can_queue;
>>   
>> +	/*
>> +	 * This determines how many commands the HBA will set aside
>> +	 * for internal commands. This number will be added to
>> +	 * @can_queue to calcumate the maximum number of simultaneous
> 

Hi Damien,

> s/calcumate/calculate
> 
> But this is weird. For SATA, can_queue is 32. Having reserved commands,
> that number needs to stay the same. 

It does.

> We cannot have more than 32 tags.

We may have 32 regular tags and 1 reserved tag for SATA.

> I think keeping can_queue as the max queue depth with at most
> nr_reserved_cmds tags reserved is better.

Maybe the wording in the comment can be improved as it originally 
focused on SAS HBAs where there are no special rules for tagset depth or 
how the tagset should be carved up to handle regular and reserved commands.

Thanks,
John

> 
>> +	 * commands sent to the host.
>> +	 */
>> +	int nr_reserved_cmds;
>> +
>>   	/*
>>   	 * In many instances, especially where disconnect / reconnect are
>>   	 * supported, our host also has an ID on the SCSI bus.  If this is
>> @@ -602,6 +611,11 @@ struct Scsi_Host {
>>   	unsigned short max_cmd_len;
>>   
>>   	int this_id;
>> +
>> +	/*
>> +	 * Number of commands this host can handle at the same time.
>> +	 * This excludes reserved commands as specified by nr_reserved_cmds.
>> +	 */
>>   	int can_queue;
>>   	short cmd_per_lun;
>>   	short unsigned int sg_tablesize;
>> @@ -620,6 +634,12 @@ struct Scsi_Host {
>>   	 */
>>   	unsigned nr_hw_queues;
>>   	unsigned nr_maps;
>> +
>> +	/*
>> +	 * Number of reserved commands to allocate, if any.
>> +	 */
>> +	unsigned nr_reserved_cmds;
>> +
>>   	unsigned active_mode:2;
>>   
>>   	/*
> 
>
Damien Le Moal June 13, 2022, 9:06 a.m. UTC | #3
On 6/13/22 17:25, John Garry wrote:
> On 13/06/2022 08:01, Damien Le Moal wrote:
>> On 6/9/22 19:29, John Garry wrote:
>>> From: Hannes Reinecke <hare@suse.de>
>>>
>>> Quite some drivers are using management commands internally, which
>>> typically use the same hardware tag pool (ie they are being allocated
>>> from the same hardware resources) as the 'normal' I/O commands.
>>> These commands are set aside before allocating the block-mq tag bitmap,
>>> so they'll never show up as busy in the tag map.
>>> The block-layer, OTOH, already has 'reserved_tags' to handle precisely
>>> this situation.
>>> So this patch adds a new field 'nr_reserved_cmds' to the SCSI host
>>> template to instruct the block layer to set aside a tag space for these
>>> management commands by using reserved tags.
>>>
>>> Signed-off-by: Hannes Reinecke <hare@suse.de>
>>> Signed-off-by: John Garry <john.garry@huawei.com>
>>> ---
>>>   drivers/scsi/hosts.c     |  3 +++
>>>   drivers/scsi/scsi_lib.c  |  6 +++++-
>>>   include/scsi/scsi_host.h | 22 +++++++++++++++++++++-
>>>   3 files changed, 29 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/drivers/scsi/hosts.c b/drivers/scsi/hosts.c
>>> index 8352f90d997d..27296addaf63 100644
>>> --- a/drivers/scsi/hosts.c
>>> +++ b/drivers/scsi/hosts.c
>>> @@ -474,6 +474,9 @@ struct Scsi_Host *scsi_host_alloc(struct scsi_host_template *sht, int privsize)
>>>   	if (sht->virt_boundary_mask)
>>>   		shost->virt_boundary_mask = sht->virt_boundary_mask;
>>>   
>>> +	if (sht->nr_reserved_cmds)
>>> +		shost->nr_reserved_cmds = sht->nr_reserved_cmds;
>>> +
>>>   	device_initialize(&shost->shost_gendev);
>>>   	dev_set_name(&shost->shost_gendev, "host%d", shost->host_no);
>>>   	shost->shost_gendev.bus = &scsi_bus_type;
>>> diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
>>> index 6ffc9e4258a8..f6e53c6d913c 100644
>>> --- a/drivers/scsi/scsi_lib.c
>>> +++ b/drivers/scsi/scsi_lib.c
>>> @@ -1974,8 +1974,12 @@ int scsi_mq_setup_tags(struct Scsi_Host *shost)
>>>   	else
>>>   		tag_set->ops = &scsi_mq_ops_no_commit;
>>>   	tag_set->nr_hw_queues = shost->nr_hw_queues ? : 1;
>>> +
>>>   	tag_set->nr_maps = shost->nr_maps ? : 1;
>>> -	tag_set->queue_depth = shost->can_queue;
>>> +	tag_set->queue_depth =
>>> +		shost->can_queue + shost->nr_reserved_cmds;
>>> +	tag_set->reserved_tags = shost->nr_reserved_cmds;
>>> +
>>>   	tag_set->cmd_size = cmd_size;
>>>   	tag_set->numa_node = dev_to_node(shost->dma_dev);
>>>   	tag_set->flags = BLK_MQ_F_SHOULD_MERGE;
>>> diff --git a/include/scsi/scsi_host.h b/include/scsi/scsi_host.h
>>> index 59aef1f178f5..149dcbd4125e 100644
>>> --- a/include/scsi/scsi_host.h
>>> +++ b/include/scsi/scsi_host.h
>>> @@ -366,10 +366,19 @@ struct scsi_host_template {
>>>   	/*
>>>   	 * This determines if we will use a non-interrupt driven
>>>   	 * or an interrupt driven scheme.  It is set to the maximum number
>>> -	 * of simultaneous commands a single hw queue in HBA will accept.
>>> +	 * of simultaneous commands a single hw queue in HBA will accept
>>> +	 * excluding internal commands.
>>>   	 */
>>>   	int can_queue;
>>>   
>>> +	/*
>>> +	 * This determines how many commands the HBA will set aside
>>> +	 * for internal commands. This number will be added to
>>> +	 * @can_queue to calcumate the maximum number of simultaneous
>>
> 
> Hi Damien,
> 
>> s/calcumate/calculate
>>
>> But this is weird. For SATA, can_queue is 32. Having reserved commands,
>> that number needs to stay the same. 
> 
> It does.
> 
>> We cannot have more than 32 tags.
> 
> We may have 32 regular tags and 1 reserved tag for SATA.

Right. But that is the messy part though. That extra 1 tag is actually not
a tag since all internal commands are non-NCQ commands that do not need a
tag...

I am working on command duration limits support currently. This feature
set has a new horrendous "improvement": a command can be aborted by the
device if it fails its duration limit, but the abort is done with a good
status + sense data available bit set so that the device queue is not
aborted entirely like with a regular NCQ command error.

For such aborted commands, the command sense data is set to
"COMPLETED/DATA UNAVAILABLE". In this case, the host needs to go read the
new "successful NCQ sense data log" to check that the command sense is
indeed "COMPLETED/DATA UNAVAILABLE". And to go read that log page without
stalling the device queue, we would need an internal NCQ (queuable) command.

Currently, that is not possible to do cleanly as there are no guarantees
we can get a free tag (there is a race between block layer tag allocation
and libata internal tag counting). So a reserved tag for that would be
nice. We would end up with 31 IO tags at most + 1 reserved tag for NCQ
commands + ATA_TAG_INTERNAL for non-NCQ. That last one would be rendered
rather useless. But that also means that we kind-of go back to the days
when Linux showed ATA drives max QD of 31...

I am still struggling with this particular use case and trying to make it
fit with your series. Trying out different things right now.


> 
>> I think keeping can_queue as the max queue depth with at most
>> nr_reserved_cmds tags reserved is better.
> 
> Maybe the wording in the comment can be improved as it originally 
> focused on SAS HBAs where there are no special rules for tagset depth or 
> how the tagset should be carved up to handle regular and reserved commands.

Indeed. And that would be for HBAs that do *not* use libsas/libata.
Otherwise, the NCQ vs non-NCQ reserved tag mess is there.

> 
> Thanks,
> John
> 
>>
>>> +	 * commands sent to the host.
>>> +	 */
>>> +	int nr_reserved_cmds;
>>> +
>>>   	/*
>>>   	 * In many instances, especially where disconnect / reconnect are
>>>   	 * supported, our host also has an ID on the SCSI bus.  If this is
>>> @@ -602,6 +611,11 @@ struct Scsi_Host {
>>>   	unsigned short max_cmd_len;
>>>   
>>>   	int this_id;
>>> +
>>> +	/*
>>> +	 * Number of commands this host can handle at the same time.
>>> +	 * This excludes reserved commands as specified by nr_reserved_cmds.
>>> +	 */
>>>   	int can_queue;
>>>   	short cmd_per_lun;
>>>   	short unsigned int sg_tablesize;
>>> @@ -620,6 +634,12 @@ struct Scsi_Host {
>>>   	 */
>>>   	unsigned nr_hw_queues;
>>>   	unsigned nr_maps;
>>> +
>>> +	/*
>>> +	 * Number of reserved commands to allocate, if any.
>>> +	 */
>>> +	unsigned nr_reserved_cmds;
>>> +
>>>   	unsigned active_mode:2;
>>>   
>>>   	/*
>>
>>
>
John Garry June 13, 2022, 9:34 a.m. UTC | #4
On 13/06/2022 10:06, Damien Le Moal wrote:
>>> We cannot have more than 32 tags.
>> We may have 32 regular tags and 1 reserved tag for SATA.
> Right. But that is the messy part though. That extra 1 tag is actually not
> a tag since all internal commands are non-NCQ commands that do not need a
> tag...

But apart from SATA, libsas LLDDs do need a real tag for the libata 
internal command.

> 
> I am working on command duration limits support currently. This feature
> set has a new horrendous "improvement": a command can be aborted by the
> device if it fails its duration limit, but the abort is done with a good
> status + sense data available bit set so that the device queue is not
> aborted entirely like with a regular NCQ command error.
> 
> For such aborted commands, the command sense data is set to
> "COMPLETED/DATA UNAVAILABLE". In this case, the host needs to go read the
> new "successful NCQ sense data log" to check that the command sense is
> indeed "COMPLETED/DATA UNAVAILABLE". And to go read that log page without
> stalling the device queue, we would need an internal NCQ (queuable) command.
> 
> Currently, that is not possible to do cleanly as there are no guarantees
> we can get a free tag (there is a race between block layer tag allocation
> and libata internal tag counting). So a reserved tag for that would be
> nice. We would end up with 31 IO tags at most + 1 reserved tag for NCQ
> commands + ATA_TAG_INTERNAL for non-NCQ. That last one would be rendered
> rather useless. But that also means that we kind-of go back to the days
> when Linux showed ATA drives max QD of 31...

So must the ATA_TAG_INTERNAL qc always be available for non-NCQ action 
like EH, and that is why you cannot reuse for this internal NCQ 
(queuable) command?

> 
> I am still struggling with this particular use case and trying to make it
> fit with your series. Trying out different things right now.
> 

ok

> 
>>> I think keeping can_queue as the max queue depth with at most
>>> nr_reserved_cmds tags reserved is better.
>> Maybe the wording in the comment can be improved as it originally
>> focused on SAS HBAs where there are no special rules for tagset depth or
>> how the tagset should be carved up to handle regular and reserved commands.
> Indeed. And that would be for HBAs that do*not*  use libsas/libata.
> Otherwise, the NCQ vs non-NCQ reserved tag mess is there.
> 

Thanks,
John
Damien Le Moal June 13, 2022, 9:43 a.m. UTC | #5
On 6/13/22 18:34, John Garry wrote:
> On 13/06/2022 10:06, Damien Le Moal wrote:
>>>> We cannot have more than 32 tags.
>>> We may have 32 regular tags and 1 reserved tag for SATA.
>> Right. But that is the messy part though. That extra 1 tag is actually not
>> a tag since all internal commands are non-NCQ commands that do not need a
>> tag...
> 
> But apart from SATA, libsas LLDDs do need a real tag for the libata 
> internal command.

Yes, but that is really to manage the LLD command descriptor and not for
the device to use in the case of non-NCQ commands. There is not
necessarily a 1:1 equivalence between the HBA command descriptor tag and
ATA command tag.

> 
>>
>> I am working on command duration limits support currently. This feature
>> set has a new horrendous "improvement": a command can be aborted by the
>> device if it fails its duration limit, but the abort is done with a good
>> status + sense data available bit set so that the device queue is not
>> aborted entirely like with a regular NCQ command error.
>>
>> For such aborted commands, the command sense data is set to
>> "COMPLETED/DATA UNAVAILABLE". In this case, the host needs to go read the
>> new "successful NCQ sense data log" to check that the command sense is
>> indeed "COMPLETED/DATA UNAVAILABLE". And to go read that log page without
>> stalling the device queue, we would need an internal NCQ (queuable) command.
>>
>> Currently, that is not possible to do cleanly as there are no guarantees
>> we can get a free tag (there is a race between block layer tag allocation
>> and libata internal tag counting). So a reserved tag for that would be
>> nice. We would end up with 31 IO tags at most + 1 reserved tag for NCQ
>> commands + ATA_TAG_INTERNAL for non-NCQ. That last one would be rendered
>> rather useless. But that also means that we kind-of go back to the days
>> when Linux showed ATA drives max QD of 31...
> 
> So must the ATA_TAG_INTERNAL qc always be available for non-NCQ action 
> like EH, and that is why you cannot reuse for this internal NCQ 
> (queuable) command?

Currently, ATA_TAG_INTERNAL is always used for non-NCQ commands. Seeing a
qc with that tag means it is *not* NCQ.

I am trying to see if I can reuse the tag from one of the commands that
completed with that weird good status/sense data available. The problem
though is that this all needs to be done *before* calling
qc->complete_fn() which will free the tag. So we endup with 2 qcs that
have the same tag, the second one (for the read log command) temporarily
using the tag but still going through the same completion path without the
original command fully completed yet. It is a real mess.

> 
>>
>> I am still struggling with this particular use case and trying to make it
>> fit with your series. Trying out different things right now.
>>
> 
> ok
> 
>>
>>>> I think keeping can_queue as the max queue depth with at most
>>>> nr_reserved_cmds tags reserved is better.
>>> Maybe the wording in the comment can be improved as it originally
>>> focused on SAS HBAs where there are no special rules for tagset depth or
>>> how the tagset should be carved up to handle regular and reserved commands.
>> Indeed. And that would be for HBAs that do*not*  use libsas/libata.
>> Otherwise, the NCQ vs non-NCQ reserved tag mess is there.
>>
> 
> Thanks,
> John
John Garry June 13, 2022, 10:05 a.m. UTC | #6
On 13/06/2022 10:43, Damien Le Moal wrote:
>>> Currently, that is not possible to do cleanly as there are no guarantees
>>> we can get a free tag (there is a race between block layer tag allocation
>>> and libata internal tag counting). So a reserved tag for that would be
>>> nice. We would end up with 31 IO tags at most + 1 reserved tag for NCQ
>>> commands + ATA_TAG_INTERNAL for non-NCQ. That last one would be rendered
>>> rather useless. But that also means that we kind-of go back to the days
>>> when Linux showed ATA drives max QD of 31...
>> So must the ATA_TAG_INTERNAL qc always be available for non-NCQ action
>> like EH, and that is why you cannot reuse for this internal NCQ
>> (queuable) command?
> Currently, ATA_TAG_INTERNAL is always used for non-NCQ commands. Seeing a
> qc with that tag means it is*not*  NCQ.
> 
> I am trying to see if I can reuse the tag from one of the commands that
> completed with that weird good status/sense data available. The problem
> though is that this all needs to be done*before*  calling
> qc->complete_fn() which will free the tag. So we endup with 2 qcs that
> have the same tag, the second one (for the read log command) temporarily
> using the tag but still going through the same completion path without the
> original command fully completed yet. It is a real mess.
> 

Reusing tags seems really messy, but then reserving an NCQ command seems 
wasteful.

Thanks,
John
Bart Van Assche June 14, 2022, 6:20 p.m. UTC | #7
On 6/13/22 00:01, Damien Le Moal wrote:
> On 6/9/22 19:29, John Garry wrote:
>> +	/*
>> +	 * This determines how many commands the HBA will set aside
>> +	 * for internal commands. This number will be added to
>> +	 * @can_queue to calcumate the maximum number of simultaneous
> 
> s/calcumate/calculate
> 
> But this is weird. For SATA, can_queue is 32. Having reserved commands,
> that number needs to stay the same. We cannot have more than 32 tags.
> I think keeping can_queue as the max queue depth with at most
> nr_reserved_cmds tags reserved is better.
> 
>> +	 * commands sent to the host.
>> +	 */
>> +	int nr_reserved_cmds;

+1 for Damien's request. I also prefer to keep can_queue as the maximum
queue depth, whether or not nr_reserved_cmds has been set.

Thanks,

Bart.
Damien Le Moal June 14, 2022, 11:43 p.m. UTC | #8
On 6/15/22 03:20, Bart Van Assche wrote:
> On 6/13/22 00:01, Damien Le Moal wrote:
>> On 6/9/22 19:29, John Garry wrote:
>>> +	/*
>>> +	 * This determines how many commands the HBA will set aside
>>> +	 * for internal commands. This number will be added to
>>> +	 * @can_queue to calcumate the maximum number of simultaneous
>>
>> s/calcumate/calculate
>>
>> But this is weird. For SATA, can_queue is 32. Having reserved commands,
>> that number needs to stay the same. We cannot have more than 32 tags.
>> I think keeping can_queue as the max queue depth with at most
>> nr_reserved_cmds tags reserved is better.
>>
>>> +	 * commands sent to the host.
>>> +	 */
>>> +	int nr_reserved_cmds;
> 
> +1 for Damien's request. I also prefer to keep can_queue as the maximum
> queue depth, whether or not nr_reserved_cmds has been set.

For non SATA drives, I still think that is a good idea. However, for SATA,
we always have the internal tag command that is special. With John's
change, it would have to be reserved but that means we are down to 31 max
QD, so going backward several years... That internal tag for ATA does not
need to be reserved since this command is always used when the drive is
idle and no other NCQ commands are on-going.

So the solution to all this is a likely a little more complicated if we
want to keep ATA max QD to 32.

> 
> Thanks,
> 
> Bart.
John Garry June 15, 2022, 7:35 a.m. UTC | #9
On 15/06/2022 00:43, Damien Le Moal wrote:
> On 6/15/22 03:20, Bart Van Assche wrote:
>> On 6/13/22 00:01, Damien Le Moal wrote:
>>> On 6/9/22 19:29, John Garry wrote:
>>>> +	/*
>>>> +	 * This determines how many commands the HBA will set aside
>>>> +	 * for internal commands. This number will be added to
>>>> +	 * @can_queue to calcumate the maximum number of simultaneous
>>>
>>> s/calcumate/calculate
>>>
>>> But this is weird. For SATA, can_queue is 32. Having reserved commands,
>>> that number needs to stay the same. We cannot have more than 32 tags.
>>> I think keeping can_queue as the max queue depth with at most
>>> nr_reserved_cmds tags reserved is better.
>>>
>>>> +	 * commands sent to the host.
>>>> +	 */
>>>> +	int nr_reserved_cmds;
>>
>> +1 for Damien's request. I also prefer to keep can_queue as the maximum
>> queue depth, whether or not nr_reserved_cmds has been set.
> 
> For non SATA drives, I still think that is a good idea. However, for SATA,
> we always have the internal tag command that is special. With John's
> change, it would have to be reserved but that means we are down to 31 max
> QD,

My intention is to keep regular tag depth at 32 for SATA. We add an 
extra tag as a reserved tag. Indeed, this is called a 'tag', but it's 
just really the placeholder for what will be the ATA_TAG_INTERNAL request.

About how we set scsi_host.can_queue, in this series we set .can_queue 
as max regular tags, and the handling is as follows:

scsi_mq_setup_tags():
tag_set->queue_depth = shost->can_queue + shost->nr_reserved_cmds
tag_set->reserved_tags = shost->nr_reserved_cmds

So we honour the rule that blk_mq_tag_set.queue_depth is the total tag 
depth, including reserved.

Incidentally I think Christoph prefers to keep .can_queue at total max 
tags including reserved:
https://lore.kernel.org/linux-scsi/337339b7-6f4a-a25c-f11c-7f701b42d6a8@suse.de/

> so going backward several years... That internal tag for ATA does not
> need to be reserved since this command is always used when the drive is
> idle and no other NCQ commands are on-going.

So do you mean that ATA_TAG_INTERNAL qc is used for other commands apart 
from internal commands?

> 
> So the solution to all this is a likely a little more complicated if we
> want to keep ATA max QD to 32.
> 

thanks,
John
Damien Le Moal June 16, 2022, 2:47 a.m. UTC | #10
On 6/15/22 16:35, John Garry wrote:
> On 15/06/2022 00:43, Damien Le Moal wrote:
>> On 6/15/22 03:20, Bart Van Assche wrote:
>>> On 6/13/22 00:01, Damien Le Moal wrote:
>>>> On 6/9/22 19:29, John Garry wrote:
>>>>> +    /*
>>>>> +     * This determines how many commands the HBA will set aside
>>>>> +     * for internal commands. This number will be added to
>>>>> +     * @can_queue to calcumate the maximum number of simultaneous
>>>>
>>>> s/calcumate/calculate
>>>>
>>>> But this is weird. For SATA, can_queue is 32. Having reserved commands,
>>>> that number needs to stay the same. We cannot have more than 32 tags.
>>>> I think keeping can_queue as the max queue depth with at most
>>>> nr_reserved_cmds tags reserved is better.
>>>>
>>>>> +     * commands sent to the host.
>>>>> +     */
>>>>> +    int nr_reserved_cmds;
>>>
>>> +1 for Damien's request. I also prefer to keep can_queue as the maximum
>>> queue depth, whether or not nr_reserved_cmds has been set.
>>
>> For non SATA drives, I still think that is a good idea. However, for 
>> SATA,
>> we always have the internal tag command that is special. With John's
>> change, it would have to be reserved but that means we are down to 31 max
>> QD,
> 
> My intention is to keep regular tag depth at 32 for SATA. We add an 
> extra tag as a reserved tag. Indeed, this is called a 'tag', but it's 
> just really the placeholder for what will be the ATA_TAG_INTERNAL request.
> 
> About how we set scsi_host.can_queue, in this series we set .can_queue 
> as max regular tags, and the handling is as follows:
> 
> scsi_mq_setup_tags():
> tag_set->queue_depth = shost->can_queue + shost->nr_reserved_cmds
> tag_set->reserved_tags = shost->nr_reserved_cmds
> 
> So we honour the rule that blk_mq_tag_set.queue_depth is the total tag 
> depth, including reserved.
> 
> Incidentally I think Christoph prefers to keep .can_queue at total max 
> tags including reserved:
> https://lore.kernel.org/linux-scsi/337339b7-6f4a-a25c-f11c-7f701b42d6a8@suse.de/ 
> 
> 
>> so going backward several years... That internal tag for ATA does not
>> need to be reserved since this command is always used when the drive is
>> idle and no other NCQ commands are on-going.
> 
> So do you mean that ATA_TAG_INTERNAL qc is used for other commands apart 
> from internal commands?

No. It is used only for internal commands. What I meant to say is that 
currently, internal commands are issued only on device scan, device 
revalidate and error handling. All of these phases are done with the 
device under EH with the issuing path stopped and all commands 
completed, so no regular commands can be issued. Only internal ones, non 
NCQ, using the ATA_TAG_INTERNAL. So strictly speaking, we should not 
need to reserve that internal tag at all.

> 
>>
>> So the solution to all this is a likely a little more complicated if we
>> want to keep ATA max QD to 32.
>>
> 
> thanks,
> John
John Garry June 16, 2022, 8:24 a.m. UTC | #11
On 16/06/2022 03:47, Damien Le Moal wrote:
>>> so going backward several years... That internal tag for ATA does not
>>> need to be reserved since this command is always used when the drive is
>>> idle and no other NCQ commands are on-going.
>>
>> So do you mean that ATA_TAG_INTERNAL qc is used for other commands 
>> apart from internal commands?
> 
> No. It is used only for internal commands. What I meant to say is that 
> currently, internal commands are issued only on device scan, device 
> revalidate and error handling. All of these phases are done with the 
> device under EH with the issuing path stopped and all commands 
> completed, 

If I want to allocate a request for an ATA internal command then could I 
use 1x from the regular tags? I didn't think that this was possible as I 
thought that all tags may be outstanding when EH kicks in. I need to 
double check it.

Even if it were true, not using a reserved tag for ATA internal command 
makes things more tricky as this command requires special handling for 
scsi blk_mq_ops and there is no easy way to identify the command as 
reserved (to know special handling is required).

>so no regular commands can be issued. Only internal ones, non 
> NCQ, using the ATA_TAG_INTERNAL. So strictly speaking, we should not 
> need to reserve that internal tag at all.
> 

Thanks,
John
Damien Le Moal June 16, 2022, 8:41 a.m. UTC | #12
On 2022/06/16 17:24, John Garry wrote:
> On 16/06/2022 03:47, Damien Le Moal wrote:
>>>> so going backward several years... That internal tag for ATA does not
>>>> need to be reserved since this command is always used when the drive is
>>>> idle and no other NCQ commands are on-going.
>>>
>>> So do you mean that ATA_TAG_INTERNAL qc is used for other commands 
>>> apart from internal commands?
>>
>> No. It is used only for internal commands. What I meant to say is that 
>> currently, internal commands are issued only on device scan, device 
>> revalidate and error handling. All of these phases are done with the 
>> device under EH with the issuing path stopped and all commands 
>> completed, 
> 
> If I want to allocate a request for an ATA internal command then could I 
> use 1x from the regular tags? I didn't think that this was possible as I 
> thought that all tags may be outstanding when EH kicks in. I need to 
> double check it.

When EH kicks in, the drive is in error mode and all commands are back to the
host. From there, you need to get the drive out of error mode with read log 10h
and then internal commands can be issued if needed. Then the aborted commands
that are not in error are restarted.

For the non-error case (revalidate), ap->ops->qc_defer() will make sure that NCQ
and non-NCQ commands are never mixed. Since all internal commands are non-ncq,
when an internal command is issued, there are necessarily no other commands
ongoing, but 32 NCQ commands may be waiting, without any free tag. The internal
command being non-NCQ can still proceed since it does not need a real device tag.

The joy of ATA...

> Even if it were true, not using a reserved tag for ATA internal command 
> makes things more tricky as this command requires special handling for 
> scsi blk_mq_ops and there is no easy way to identify the command as 
> reserved (to know special handling is required).

Yes. Having the ATA_TAG_INTERNAL tag as a reserved tag is fine. But from the
above, you can see that this is not really needed at all to make things work.
The management of ATA_TAG_INTERNAL as a reserve tag is really about getting your
API to simplify the code.

What I am thinking is that with your patches as is, it seems that we can never
actually reserve a real tag for ATA to do internal NCQ commands... We do not
really need that for now though, apart maybe for speeding up device revalidate.
Everytime that one runs, one can see a big spike in read/write IO latencies
because of the queue drain it causes.

And for CDL 0xD policy error handling, I may need a reserved NCQ tag... Still
trying to work out qc/tag reuse for now though.

> 
>> so no regular commands can be issued. Only internal ones, non 
>> NCQ, using the ATA_TAG_INTERNAL. So strictly speaking, we should not 
>> need to reserve that internal tag at all.
>>
> 
> Thanks,
> John
>
Hannes Reinecke June 20, 2022, 6:24 a.m. UTC | #13
On 6/13/22 09:01, Damien Le Moal wrote:
> On 6/9/22 19:29, John Garry wrote:
>> From: Hannes Reinecke <hare@suse.de>
>>
>> Quite some drivers are using management commands internally, which
>> typically use the same hardware tag pool (ie they are being allocated
>> from the same hardware resources) as the 'normal' I/O commands.
>> These commands are set aside before allocating the block-mq tag bitmap,
>> so they'll never show up as busy in the tag map.
>> The block-layer, OTOH, already has 'reserved_tags' to handle precisely
>> this situation.
>> So this patch adds a new field 'nr_reserved_cmds' to the SCSI host
>> template to instruct the block layer to set aside a tag space for these
>> management commands by using reserved tags.
>>
>> Signed-off-by: Hannes Reinecke <hare@suse.de>
>> Signed-off-by: John Garry <john.garry@huawei.com>
>> ---
>>   drivers/scsi/hosts.c     |  3 +++
>>   drivers/scsi/scsi_lib.c  |  6 +++++-
>>   include/scsi/scsi_host.h | 22 +++++++++++++++++++++-
>>   3 files changed, 29 insertions(+), 2 deletions(-)
>>
>> diff --git a/drivers/scsi/hosts.c b/drivers/scsi/hosts.c
>> index 8352f90d997d..27296addaf63 100644
>> --- a/drivers/scsi/hosts.c
>> +++ b/drivers/scsi/hosts.c
>> @@ -474,6 +474,9 @@ struct Scsi_Host *scsi_host_alloc(struct scsi_host_template *sht, int privsize)
>>   	if (sht->virt_boundary_mask)
>>   		shost->virt_boundary_mask = sht->virt_boundary_mask;
>>   
>> +	if (sht->nr_reserved_cmds)
>> +		shost->nr_reserved_cmds = sht->nr_reserved_cmds;
>> +
>>   	device_initialize(&shost->shost_gendev);
>>   	dev_set_name(&shost->shost_gendev, "host%d", shost->host_no);
>>   	shost->shost_gendev.bus = &scsi_bus_type;
>> diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
>> index 6ffc9e4258a8..f6e53c6d913c 100644
>> --- a/drivers/scsi/scsi_lib.c
>> +++ b/drivers/scsi/scsi_lib.c
>> @@ -1974,8 +1974,12 @@ int scsi_mq_setup_tags(struct Scsi_Host *shost)
>>   	else
>>   		tag_set->ops = &scsi_mq_ops_no_commit;
>>   	tag_set->nr_hw_queues = shost->nr_hw_queues ? : 1;
>> +
>>   	tag_set->nr_maps = shost->nr_maps ? : 1;
>> -	tag_set->queue_depth = shost->can_queue;
>> +	tag_set->queue_depth =
>> +		shost->can_queue + shost->nr_reserved_cmds;
>> +	tag_set->reserved_tags = shost->nr_reserved_cmds;
>> +
>>   	tag_set->cmd_size = cmd_size;
>>   	tag_set->numa_node = dev_to_node(shost->dma_dev);
>>   	tag_set->flags = BLK_MQ_F_SHOULD_MERGE;
>> diff --git a/include/scsi/scsi_host.h b/include/scsi/scsi_host.h
>> index 59aef1f178f5..149dcbd4125e 100644
>> --- a/include/scsi/scsi_host.h
>> +++ b/include/scsi/scsi_host.h
>> @@ -366,10 +366,19 @@ struct scsi_host_template {
>>   	/*
>>   	 * This determines if we will use a non-interrupt driven
>>   	 * or an interrupt driven scheme.  It is set to the maximum number
>> -	 * of simultaneous commands a single hw queue in HBA will accept.
>> +	 * of simultaneous commands a single hw queue in HBA will accept
>> +	 * excluding internal commands.
>>   	 */
>>   	int can_queue;
>>   
>> +	/*
>> +	 * This determines how many commands the HBA will set aside
>> +	 * for internal commands. This number will be added to
>> +	 * @can_queue to calcumate the maximum number of simultaneous
> 
> s/calcumate/calculate
> 
> But this is weird. For SATA, can_queue is 32. Having reserved commands,
> that number needs to stay the same. We cannot have more than 32 tags.
> I think keeping can_queue as the max queue depth with at most
> nr_reserved_cmds tags reserved is better.
> 
I had been thinking about this for quite a while, and figured that the 
'reserved' commands model from blk-mq doesn't fit nicely with the SATA 
protocol.

So my idea for SATA is simply _not_ to use reserved tags.
Any TMF functions (or the equivalent thereof) should always be sent as 
non-NCQ commands. And when doing so we're back to QD=1 on SATA anyway, 
so there _must_ be tags available. Consequently the main reason for 
having reserved tags (namely to guarantee that tags are available for 
TMF) doesn't apply here.

Which is why in my initial patchset I've always been referrring to 
'internal' commands, and drivers could select if the 'internal' commands 
are mappend on reserved tags or not.

Cheers,

Hannes
Christoph Hellwig June 20, 2022, 6:28 a.m. UTC | #14
On Mon, Jun 20, 2022 at 08:24:24AM +0200, Hannes Reinecke wrote:
> So my idea for SATA is simply _not_ to use reserved tags.
> Any TMF functions (or the equivalent thereof) should always be sent as 
> non-NCQ commands. And when doing so we're back to QD=1 on SATA anyway, so 
> there _must_ be tags available. Consequently the main reason for having 
> reserved tags (namely to guarantee that tags are available for TMF) doesn't 
> apply here.

At least in the non-elevator case (which includes all passthrough I/O)
request have tags assigned as soon as they are allocated.  So, we
absolutely can have all tags allocated and then need to do a TMF.
Hannes Reinecke June 20, 2022, 6:45 a.m. UTC | #15
On 6/13/22 11:06, Damien Le Moal wrote:
> On 6/13/22 17:25, John Garry wrote:
[ .. ]
>>
>> We may have 32 regular tags and 1 reserved tag for SATA.
> 
> Right. But that is the messy part though. That extra 1 tag is actually not
> a tag since all internal commands are non-NCQ commands that do not need a
> tag...
> 
> I am working on command duration limits support currently. This feature
> set has a new horrendous "improvement": a command can be aborted by the
> device if it fails its duration limit, but the abort is done with a good
> status + sense data available bit set so that the device queue is not
> aborted entirely like with a regular NCQ command error.
> 
> For such aborted commands, the command sense data is set to
> "COMPLETED/DATA UNAVAILABLE". In this case, the host needs to go read the
> new "successful NCQ sense data log" to check that the command sense is
> indeed "COMPLETED/DATA UNAVAILABLE". And to go read that log page without
> stalling the device queue, we would need an internal NCQ (queuable) command.
> 
> Currently, that is not possible to do cleanly as there are no guarantees
> we can get a free tag (there is a race between block layer tag allocation
> and libata internal tag counting). So a reserved tag for that would be
> nice. We would end up with 31 IO tags at most + 1 reserved tag for NCQ
> commands + ATA_TAG_INTERNAL for non-NCQ. That last one would be rendered
> rather useless. But that also means that we kind-of go back to the days
> when Linux showed ATA drives max QD of 31...
> 
> I am still struggling with this particular use case and trying to make it
> fit with your series. Trying out different things right now.
> 
Hmm. Struggling on how that is supposed to work in general.
As we're reading from a log to get the sense information I guess that 
log is organized by tag index. Meaning we have to keep hold of the tag 
which generated that error.
Q1: Can we (re-) use that tag to read the log information?
Q2: What do you do if all 32 command generate such an error?

But really, this sounds no different from the 'classical' request sense 
handling in SCSI ML. Have you considered just run with that an map 
'REQUEST SENSE' on your new NCQ Get Log page command?

Cheers,

Hannes
Hannes Reinecke June 20, 2022, 7:03 a.m. UTC | #16
On 6/20/22 08:28, Christoph Hellwig wrote:
> On Mon, Jun 20, 2022 at 08:24:24AM +0200, Hannes Reinecke wrote:
>> So my idea for SATA is simply _not_ to use reserved tags.
>> Any TMF functions (or the equivalent thereof) should always be sent as
>> non-NCQ commands. And when doing so we're back to QD=1 on SATA anyway, so
>> there _must_ be tags available. Consequently the main reason for having
>> reserved tags (namely to guarantee that tags are available for TMF) doesn't
>> apply here.
> 
> At least in the non-elevator case (which includes all passthrough I/O)
> request have tags assigned as soon as they are allocated.  So, we
> absolutely can have all tags allocated and then need to do a TMF.

SATA internals may come to the rescue here; if there's an error all NCQ 
commands are aborted. So we'll get at least one command tag back.
As for the command duration limits I'm still waiting for clarification
from Damien if we can reuse tags there.

But I do agree it's ugly.

Cheers,

Hannes
Damien Le Moal June 20, 2022, 7:06 a.m. UTC | #17
On 6/20/22 15:45, Hannes Reinecke wrote:
> On 6/13/22 11:06, Damien Le Moal wrote:
>> On 6/13/22 17:25, John Garry wrote:
> [ .. ]
>>>
>>> We may have 32 regular tags and 1 reserved tag for SATA.
>>
>> Right. But that is the messy part though. That extra 1 tag is actually not
>> a tag since all internal commands are non-NCQ commands that do not need a
>> tag...
>>
>> I am working on command duration limits support currently. This feature
>> set has a new horrendous "improvement": a command can be aborted by the
>> device if it fails its duration limit, but the abort is done with a good
>> status + sense data available bit set so that the device queue is not
>> aborted entirely like with a regular NCQ command error.
>>
>> For such aborted commands, the command sense data is set to
>> "COMPLETED/DATA UNAVAILABLE". In this case, the host needs to go read the
>> new "successful NCQ sense data log" to check that the command sense is
>> indeed "COMPLETED/DATA UNAVAILABLE". And to go read that log page without
>> stalling the device queue, we would need an internal NCQ (queuable) command.
>>
>> Currently, that is not possible to do cleanly as there are no guarantees
>> we can get a free tag (there is a race between block layer tag allocation
>> and libata internal tag counting). So a reserved tag for that would be
>> nice. We would end up with 31 IO tags at most + 1 reserved tag for NCQ
>> commands + ATA_TAG_INTERNAL for non-NCQ. That last one would be rendered
>> rather useless. But that also means that we kind-of go back to the days
>> when Linux showed ATA drives max QD of 31...
>>
>> I am still struggling with this particular use case and trying to make it
>> fit with your series. Trying out different things right now.
>>
> Hmm. Struggling on how that is supposed to work in general.

The standard monks defined it as conceptually easy: if a command completes
with success and sense data available bit set, then just read that log
page that has the sense data to check what happened. Very trivial in
principle.
But of course, this is ATA, so a mess in practice because we want to do
that read log with an NCQ command to less impact on the drive performance
than a regular error. Otherwise, if we simply do a regular eh, we end up
with the same impact as a hard command failure. And then we end up with
all these problems with tag reusing and nothing in libata allowing to do
internal ncq commands.

> As we're reading from a log to get the sense information I guess that 
> log is organized by tag index. Meaning we have to keep hold of the tag 
> which generated that error.

Yep. This is a 1024B log which has all the sense information of for all
completed NCQ commands, organized per tag.

> Q1: Can we (re-) use that tag to read the log information?

I thought of that. BUT: if a revalidate or regular eh is ongoing, we need
to delay issuing of the NCQ read log command since eh will prevent issuing
anything (there will be non-ncq commands on-going). Problem here is that
delaying ncq commands means essentially doing a requeue so we need a real
req/scsi req for that. Reusing the tag for a new temporary internal qc is
not enough.

> Q2: What do you do if all 32 command generate such an error?

For that case, I can simply use the internal tag and do a non-ncq read
log. That is actually the easy case !

> But really, this sounds no different from the 'classical' request sense 
> handling in SCSI ML. Have you considered just run with that an map 
> 'REQUEST SENSE' on your new NCQ Get Log page command?

I am exploring the reuse of the scsi EH now. But very messy on libata
side. Still no good solution.

While doing that, I did discover that libata eh is very messy because of
one driver only: scsi ipr. That is the only one that does not have a
->error_handler port operation. And because of that, we are stuck with
lots of "old EH" ata code. So there are always 2 different eh path.
Complete mess. I am trying to see if I can't convert scsi ipr to have a
error_handler port operation, but I cannot test anything as I do not have
the hardware.

> 
> Cheers,
> 
> Hannes
Damien Le Moal June 20, 2022, 7:14 a.m. UTC | #18
On 6/20/22 16:03, Hannes Reinecke wrote:
> On 6/20/22 08:28, Christoph Hellwig wrote:
>> On Mon, Jun 20, 2022 at 08:24:24AM +0200, Hannes Reinecke wrote:
>>> So my idea for SATA is simply _not_ to use reserved tags.
>>> Any TMF functions (or the equivalent thereof) should always be sent as
>>> non-NCQ commands. And when doing so we're back to QD=1 on SATA anyway, so
>>> there _must_ be tags available. Consequently the main reason for having
>>> reserved tags (namely to guarantee that tags are available for TMF) doesn't
>>> apply here.
>>
>> At least in the non-elevator case (which includes all passthrough I/O)
>> request have tags assigned as soon as they are allocated.  So, we
>> absolutely can have all tags allocated and then need to do a TMF.
> 
> SATA internals may come to the rescue here; if there's an error all NCQ 
> commands are aborted. So we'll get at least one command tag back.
> As for the command duration limits I'm still waiting for clarification
> from Damien if we can reuse tags there.

We cannot easily reuse tags as the CDL case is *not* an error. So no queue
abort come with it. If we had the queue abort, things would be easy as
that would essentially be a regular eh.

Simply using scsi_execute_req() does not work since we have no guarantees
we can get a tag: all 32 commands being executed may complete with needing
sense data, and so scsi_execute_req() would hang waiting for a tag. I
could try hacking scsi_execute_req() to reuse a tag instead of allocating
a new one... Calling scsi_execute_req() from libata is really ugly though.

> 
> But I do agree it's ugly.
> 
> Cheers,
> 
> Hannes
Hannes Reinecke June 20, 2022, 8:02 a.m. UTC | #19
On 6/15/22 09:35, John Garry wrote:
> On 15/06/2022 00:43, Damien Le Moal wrote:
>> On 6/15/22 03:20, Bart Van Assche wrote:
>>> On 6/13/22 00:01, Damien Le Moal wrote:
>>>> On 6/9/22 19:29, John Garry wrote:
>>>>> +    /*
>>>>> +     * This determines how many commands the HBA will set aside
>>>>> +     * for internal commands. This number will be added to
>>>>> +     * @can_queue to calcumate the maximum number of simultaneous
>>>>
>>>> s/calcumate/calculate
>>>>
>>>> But this is weird. For SATA, can_queue is 32. Having reserved commands,
>>>> that number needs to stay the same. We cannot have more than 32 tags.
>>>> I think keeping can_queue as the max queue depth with at most
>>>> nr_reserved_cmds tags reserved is better.
>>>>
>>>>> +     * commands sent to the host.
>>>>> +     */
>>>>> +    int nr_reserved_cmds;
>>>
>>> +1 for Damien's request. I also prefer to keep can_queue as the maximum
>>> queue depth, whether or not nr_reserved_cmds has been set.
>>
>> For non SATA drives, I still think that is a good idea. However, for 
>> SATA,
>> we always have the internal tag command that is special. With John's
>> change, it would have to be reserved but that means we are down to 31 max
>> QD,
> 
> My intention is to keep regular tag depth at 32 for SATA. We add an 
> extra tag as a reserved tag. Indeed, this is called a 'tag', but it's 
> just really the placeholder for what will be the ATA_TAG_INTERNAL request.
> 
> About how we set scsi_host.can_queue, in this series we set .can_queue 
> as max regular tags, and the handling is as follows:
> 
> scsi_mq_setup_tags():
> tag_set->queue_depth = shost->can_queue + shost->nr_reserved_cmds
> tag_set->reserved_tags = shost->nr_reserved_cmds
> 
> So we honour the rule that blk_mq_tag_set.queue_depth is the total tag 
> depth, including reserved.
> 
> Incidentally I think Christoph prefers to keep .can_queue at total max 
> tags including reserved:
> https://lore.kernel.org/linux-scsi/337339b7-6f4a-a25c-f11c-7f701b42d6a8@suse.de/ 
> 
> 
>> so going backward several years... That internal tag for ATA does not
>> need to be reserved since this command is always used when the drive is
>> idle and no other NCQ commands are on-going.
> 
> So do you mean that ATA_TAG_INTERNAL qc is used for other commands apart 
> from internal commands?
> 
Well.

The problem is that 'ATA_TAG_INTERNAL' currently is overloaded to
a) signal internal commands
b) 'magic' tag when looking up commands

My proposal would be to separate these use-cases, and use a flag (eg 
ATA_QCFLAG_INTERNAL) to determine internal commands.

The we'll be needing an internal tag-lookup map
(NCQ tag -> blk-mq tag) for ata_qc_from_tag() to retrieve the command 
corresponding to a driver tag.

I guess we'll need that anyway with libsas, as there we're working with 
a budget tag which has no relationship with the NCQ tag whatsoever ...

But with that we can kill 'ATA_TAG_INTERNAL', and let the driver figure 
out how to allocate internal tags.

Cheers,

Hannes
Hannes Reinecke June 20, 2022, 8:27 a.m. UTC | #20
On 6/16/22 10:41, Damien Le Moal wrote:
> On 2022/06/16 17:24, John Garry wrote:
>> On 16/06/2022 03:47, Damien Le Moal wrote:
>>>>> so going backward several years... That internal tag for ATA does not
>>>>> need to be reserved since this command is always used when the drive is
>>>>> idle and no other NCQ commands are on-going.
>>>>
>>>> So do you mean that ATA_TAG_INTERNAL qc is used for other commands
>>>> apart from internal commands?
>>>
>>> No. It is used only for internal commands. What I meant to say is that
>>> currently, internal commands are issued only on device scan, device
>>> revalidate and error handling. All of these phases are done with the
>>> device under EH with the issuing path stopped and all commands
>>> completed,
>>
>> If I want to allocate a request for an ATA internal command then could I
>> use 1x from the regular tags? I didn't think that this was possible as I
>> thought that all tags may be outstanding when EH kicks in. I need to
>> double check it.
> 
> When EH kicks in, the drive is in error mode and all commands are back to the
> host. From there, you need to get the drive out of error mode with read log 10h
> and then internal commands can be issued if needed. Then the aborted commands
> that are not in error are restarted.
> 
> For the non-error case (revalidate), ap->ops->qc_defer() will make sure that NCQ
> and non-NCQ commands are never mixed. Since all internal commands are non-ncq,
> when an internal command is issued, there are necessarily no other commands
> ongoing, but 32 NCQ commands may be waiting, without any free tag. The internal
> command being non-NCQ can still proceed since it does not need a real device tag.
> 
> The joy of ATA...
> 
>> Even if it were true, not using a reserved tag for ATA internal command
>> makes things more tricky as this command requires special handling for
>> scsi blk_mq_ops and there is no easy way to identify the command as
>> reserved (to know special handling is required).
> 
> Yes. Having the ATA_TAG_INTERNAL tag as a reserved tag is fine. But from the
> above, you can see that this is not really needed at all to make things work.
> The management of ATA_TAG_INTERNAL as a reserve tag is really about getting your
> API to simplify the code.
> 
> What I am thinking is that with your patches as is, it seems that we can never
> actually reserve a real tag for ATA to do internal NCQ commands... We do not
> really need that for now though, apart maybe for speeding up device revalidate.
> Everytime that one runs, one can see a big spike in read/write IO latencies
> because of the queue drain it causes.
> 
Hmm. But doesn't that mean the we can reserve one tag, _and_ set the 
queue depth to '32'?
We'll need to fiddle with the tag map on completion (cf my previous 
mail), but then we might need to do that anyway if we separate out 
ATA_QCFLAG_INTERNAL ...

Cheers,

Hannes
Damien Le Moal June 20, 2022, 9:02 a.m. UTC | #21
On 6/20/22 17:27, Hannes Reinecke wrote:
> On 6/16/22 10:41, Damien Le Moal wrote:
>> On 2022/06/16 17:24, John Garry wrote:
>>> On 16/06/2022 03:47, Damien Le Moal wrote:
>>>>>> so going backward several years... That internal tag for ATA does not
>>>>>> need to be reserved since this command is always used when the drive is
>>>>>> idle and no other NCQ commands are on-going.
>>>>>
>>>>> So do you mean that ATA_TAG_INTERNAL qc is used for other commands
>>>>> apart from internal commands?
>>>>
>>>> No. It is used only for internal commands. What I meant to say is that
>>>> currently, internal commands are issued only on device scan, device
>>>> revalidate and error handling. All of these phases are done with the
>>>> device under EH with the issuing path stopped and all commands
>>>> completed,
>>>
>>> If I want to allocate a request for an ATA internal command then could I
>>> use 1x from the regular tags? I didn't think that this was possible as I
>>> thought that all tags may be outstanding when EH kicks in. I need to
>>> double check it.
>>
>> When EH kicks in, the drive is in error mode and all commands are back to the
>> host. From there, you need to get the drive out of error mode with read log 10h
>> and then internal commands can be issued if needed. Then the aborted commands
>> that are not in error are restarted.
>>
>> For the non-error case (revalidate), ap->ops->qc_defer() will make sure that NCQ
>> and non-NCQ commands are never mixed. Since all internal commands are non-ncq,
>> when an internal command is issued, there are necessarily no other commands
>> ongoing, but 32 NCQ commands may be waiting, without any free tag. The internal
>> command being non-NCQ can still proceed since it does not need a real device tag.
>>
>> The joy of ATA...
>>
>>> Even if it were true, not using a reserved tag for ATA internal command
>>> makes things more tricky as this command requires special handling for
>>> scsi blk_mq_ops and there is no easy way to identify the command as
>>> reserved (to know special handling is required).
>>
>> Yes. Having the ATA_TAG_INTERNAL tag as a reserved tag is fine. But from the
>> above, you can see that this is not really needed at all to make things work.
>> The management of ATA_TAG_INTERNAL as a reserve tag is really about getting your
>> API to simplify the code.
>>
>> What I am thinking is that with your patches as is, it seems that we can never
>> actually reserve a real tag for ATA to do internal NCQ commands... We do not
>> really need that for now though, apart maybe for speeding up device revalidate.
>> Everytime that one runs, one can see a big spike in read/write IO latencies
>> because of the queue drain it causes.
>>
> Hmm. But doesn't that mean the we can reserve one tag, _and_ set the 
> queue depth to '32'?
> We'll need to fiddle with the tag map on completion (cf my previous 
> mail), but then we might need to do that anyway if we separate out 
> ATA_QCFLAG_INTERNAL ...

Reserving a tag is not enough. As explained, even if I can get a tag for a
qc, I need a proper req to safely issue an ncq command (because of the
potential need to delay and requeue even if we have a free tag !).

So reserving a tag/req to be able to do NCQ at the cost of max qd being 31
works for that. We could keep max qd at 32 by creating one more "fake" tag
and having a request for it, that is, having the fake tag visible to the
block layer as a reserved tag, as John's series is doing, but for the
reserved tags, we actually need to use an effective tag (qc->hw_tag) when
issuing the commands. And for that, we can reuse the tag of one of the
failed commands.

> 
> Cheers,
> 
> Hannes
Christoph Hellwig June 20, 2022, 9:05 a.m. UTC | #22
On Mon, Jun 20, 2022 at 06:02:30PM +0900, Damien Le Moal wrote:
> So reserving a tag/req to be able to do NCQ at the cost of max qd being 31
> works for that. We could keep max qd at 32 by creating one more "fake" tag
> and having a request for it, that is, having the fake tag visible to the
> block layer as a reserved tag, as John's series is doing, but for the
> reserved tags, we actually need to use an effective tag (qc->hw_tag) when
> issuing the commands. And for that, we can reuse the tag of one of the
> failed commands.

Take a look at the magic flush request in blk-flush.c, which is
preallocated but borrows a tag from the request that wants a pre- or
post-flush.  The logic is rather ugly, but maybe it might actually
become cleaner by generalizing it a bit.
Damien Le Moal June 20, 2022, 11:24 a.m. UTC | #23
On 6/20/22 18:05, Christoph Hellwig wrote:
> On Mon, Jun 20, 2022 at 06:02:30PM +0900, Damien Le Moal wrote:
>> So reserving a tag/req to be able to do NCQ at the cost of max qd being 31
>> works for that. We could keep max qd at 32 by creating one more "fake" tag
>> and having a request for it, that is, having the fake tag visible to the
>> block layer as a reserved tag, as John's series is doing, but for the
>> reserved tags, we actually need to use an effective tag (qc->hw_tag) when
>> issuing the commands. And for that, we can reuse the tag of one of the
>> failed commands.
> 
> Take a look at the magic flush request in blk-flush.c, which is
> preallocated but borrows a tag from the request that wants a pre- or
> post-flush.  The logic is rather ugly, but maybe it might actually
> become cleaner by generalizing it a bit.

Thanks. Will check.
I am also looking at scsi_unjam_host() and scsi_eh_get_sense(). These
reuse a scsi command to do eh operations. So I could use that too, modulo
making it work outside of eh context to keep the command flow intact.
Hannes Reinecke June 20, 2022, 11:56 a.m. UTC | #24
On 6/20/22 13:24, Damien Le Moal wrote:
> On 6/20/22 18:05, Christoph Hellwig wrote:
>> On Mon, Jun 20, 2022 at 06:02:30PM +0900, Damien Le Moal wrote:
>>> So reserving a tag/req to be able to do NCQ at the cost of max qd being 31
>>> works for that. We could keep max qd at 32 by creating one more "fake" tag
>>> and having a request for it, that is, having the fake tag visible to the
>>> block layer as a reserved tag, as John's series is doing, but for the
>>> reserved tags, we actually need to use an effective tag (qc->hw_tag) when
>>> issuing the commands. And for that, we can reuse the tag of one of the
>>> failed commands.
>>
>> Take a look at the magic flush request in blk-flush.c, which is
>> preallocated but borrows a tag from the request that wants a pre- or
>> post-flush.  The logic is rather ugly, but maybe it might actually
>> become cleaner by generalizing it a bit.
> 
> Thanks. Will check.
> I am also looking at scsi_unjam_host() and scsi_eh_get_sense(). These
> reuse a scsi command to do eh operations. So I could use that too, modulo
> making it work outside of eh context to keep the command flow intact.
> 

Tsk. I was hoping to be able to remove it (especially 
scsi_eh_get_sense()), but looks as if we actually do need it.
But it might be not a bad idea to have scsi_eh_get_sense() to run 
independent on the SCSI EH stuff; returning with a sense code is not 
necessary an error, so there are reasons for not always invoking SCSI EH.

Cheers,

Hannes
diff mbox series

Patch

diff --git a/drivers/scsi/hosts.c b/drivers/scsi/hosts.c
index 8352f90d997d..27296addaf63 100644
--- a/drivers/scsi/hosts.c
+++ b/drivers/scsi/hosts.c
@@ -474,6 +474,9 @@  struct Scsi_Host *scsi_host_alloc(struct scsi_host_template *sht, int privsize)
 	if (sht->virt_boundary_mask)
 		shost->virt_boundary_mask = sht->virt_boundary_mask;
 
+	if (sht->nr_reserved_cmds)
+		shost->nr_reserved_cmds = sht->nr_reserved_cmds;
+
 	device_initialize(&shost->shost_gendev);
 	dev_set_name(&shost->shost_gendev, "host%d", shost->host_no);
 	shost->shost_gendev.bus = &scsi_bus_type;
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 6ffc9e4258a8..f6e53c6d913c 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -1974,8 +1974,12 @@  int scsi_mq_setup_tags(struct Scsi_Host *shost)
 	else
 		tag_set->ops = &scsi_mq_ops_no_commit;
 	tag_set->nr_hw_queues = shost->nr_hw_queues ? : 1;
+
 	tag_set->nr_maps = shost->nr_maps ? : 1;
-	tag_set->queue_depth = shost->can_queue;
+	tag_set->queue_depth =
+		shost->can_queue + shost->nr_reserved_cmds;
+	tag_set->reserved_tags = shost->nr_reserved_cmds;
+
 	tag_set->cmd_size = cmd_size;
 	tag_set->numa_node = dev_to_node(shost->dma_dev);
 	tag_set->flags = BLK_MQ_F_SHOULD_MERGE;
diff --git a/include/scsi/scsi_host.h b/include/scsi/scsi_host.h
index 59aef1f178f5..149dcbd4125e 100644
--- a/include/scsi/scsi_host.h
+++ b/include/scsi/scsi_host.h
@@ -366,10 +366,19 @@  struct scsi_host_template {
 	/*
 	 * This determines if we will use a non-interrupt driven
 	 * or an interrupt driven scheme.  It is set to the maximum number
-	 * of simultaneous commands a single hw queue in HBA will accept.
+	 * of simultaneous commands a single hw queue in HBA will accept
+	 * excluding internal commands.
 	 */
 	int can_queue;
 
+	/*
+	 * This determines how many commands the HBA will set aside
+	 * for internal commands. This number will be added to
+	 * @can_queue to calcumate the maximum number of simultaneous
+	 * commands sent to the host.
+	 */
+	int nr_reserved_cmds;
+
 	/*
 	 * In many instances, especially where disconnect / reconnect are
 	 * supported, our host also has an ID on the SCSI bus.  If this is
@@ -602,6 +611,11 @@  struct Scsi_Host {
 	unsigned short max_cmd_len;
 
 	int this_id;
+
+	/*
+	 * Number of commands this host can handle at the same time.
+	 * This excludes reserved commands as specified by nr_reserved_cmds.
+	 */
 	int can_queue;
 	short cmd_per_lun;
 	short unsigned int sg_tablesize;
@@ -620,6 +634,12 @@  struct Scsi_Host {
 	 */
 	unsigned nr_hw_queues;
 	unsigned nr_maps;
+
+	/*
+	 * Number of reserved commands to allocate, if any.
+	 */
+	unsigned nr_reserved_cmds;
+
 	unsigned active_mode:2;
 
 	/*