diff mbox

[GIT,v4,0/2] libsas: eh reworks (ata-eh vs discovery, races, ...)

Message ID 20120110073647.4563.7504.stgit@localhost6.localdomain6
State Not Applicable
Delegated to: David Miller
Headers show

Pull-request

git://git.kernel.org/pub/scm/linux/kernel/git/djbw/isci.git libsas-eh-reworks-v4

Commit Message

Dan Williams Jan. 10, 2012, 7:38 a.m. UTC
The latest version of the patch kit is available here:

  git://git.kernel.org/pub/scm/linux/kernel/git/djbw/isci.git libsas-eh-reworks-v4

Changes since v3: http://marc.info/?l=linux-scsi&m=132581455214789&w=2

1/ null pointer regression smp_execute_task: make sure device is not
   removed from the domain while eh is in flight.  Fix up the logic that
   defers domain_device destruction to its own context.  Folded into:
   "libsas: prevent domain rediscovery competing with ata error handling"
   plus a small prep patch: "libsas: convert dev->gone to flags"

2/ lockdep regression in smp_execute_task: failed to release the lock
   before returning.  Folded into: "libsas: check for 'gone' expanders in
   smp_execute_task()"

3/ unnecessary locking in isci_ata_check_ready().  Folded into: "isci:
   ->lldd_ata_check_ready handler"

4/ two new fixes follow in separate mails
   [PATCH v4 1/2] libsas: pre-clean commands that won the eh vs completion race
   [PATCH v4 2/2] libsas: feed the scsi_block_when_processing_errors() meter

Incremental diff: libsas-eh-reworks-v3..libsas-eh-reworks-v4


The following changes since commit 5c41dc3a79150e93e5d050871a10b761be8281a1:

  [SCSI] lpfc 8.3.28: Update driver version to 8.3.28 (2011-12-15 10:57:45 +0400)

are available in the git repository at:
  git://git.kernel.org/pub/scm/linux/kernel/git/djbw/isci.git libsas-eh-reworks-v4

Dan Williams (36):
      libsas: remove unused ata_task_resp fields
      libsas: kill sas_slave_destroy
      libsas: fix domain_device leak
      libsas: fix leak of dev->sata_dev.identify_[packet_]device
      libsas: replace event locks with atomic bitops
      libsas: convert ha->state to flags
      libsas: introduce sas_drain_work()
      libsas: remove ata_port.lock management duties from lldds
      libsas: convert dev->gone to flags
      libsas: prevent domain rediscovery competing with ata error handling
      libsas: use ->set_dmamode to notify lldds of NCQ parameters
      libsas: kill invocation of scsi_eh_finish_cmd from sas_ata_task_done
      libsas: close error handling vs sas_ata_task_done() race
      libsas: prevent double completion of scmds from eh
      libsas: fix timeout vs completion race
      libsas: let libata handle command timeouts
      libsas: defer SAS_TASK_NEED_DEV_RESET commands to libata
      libsas: use libata-eh-reset for sata rediscovery fis transmit failures
      libsas: perform sas-transport resets in shost->workq context
      libsas: execute transport link resets with libata-eh via host workqueue
      libsas: sas_phy_enable via transport_sas_phy_reset
      libsas: async ata-eh
      libsas: poll for ata device readiness after reset
      libsas: don't mark expanders as gone when a child device is removed
      libsas: check for 'gone' expanders in smp_execute_task()
      libsas: fix sas_find_local_phy(), take phy references
      libsas: don't recover 'gone' devices in sas_ata_hard_reset()
      isci: kill iphy->isci_port lookups
      isci: kill isci_port->status
      isci: fix interpretation of "hard" reset
      isci: stop interpreting ->lldd_lu_reset() as an ata soft-reset
      isci: ->lldd_ata_check_ready handler
      isci: remove bus and reset handlers
      isci: remove IDEV_EH hack to disable "discovery-time" ata resets
      libsas: pre-clean commands that won the eh vs completion race
      libsas: feed the scsi_block_when_processing_errors() meter

Jeff Skirvin (2):
      libsas: Remove redundant phy state notification calls.
      libsas: add mutex for SMP task execution

 Documentation/scsi/libsas.txt       |   15 -
 drivers/ata/libata-eh.c             |    1 +
 drivers/ata/libata.h                |    1 -
 drivers/scsi/aic94xx/aic94xx.h      |    2 +
 drivers/scsi/aic94xx/aic94xx_dev.c  |   38 ++-
 drivers/scsi/aic94xx/aic94xx_init.c |    5 +-
 drivers/scsi/aic94xx/aic94xx_tmf.c  |    9 +-
 drivers/scsi/isci/host.c            |    8 +-
 drivers/scsi/isci/host.h            |   19 +-
 drivers/scsi/isci/init.c            |   13 +-
 drivers/scsi/isci/phy.c             |   18 +-
 drivers/scsi/isci/phy.h             |    1 -
 drivers/scsi/isci/port.c            |  217 ++++++------
 drivers/scsi/isci/port.h            |   11 +-
 drivers/scsi/isci/remote_device.c   |   32 +--
 drivers/scsi/isci/remote_device.h   |    7 +-
 drivers/scsi/isci/request.c         |  198 +----------
 drivers/scsi/isci/request.h         |    9 +-
 drivers/scsi/isci/task.c            |  158 ++-------
 drivers/scsi/isci/task.h            |   40 --
 drivers/scsi/libsas/sas_ata.c       |  692 +++++++++++++++--------------------
 drivers/scsi/libsas/sas_discover.c  |  152 +++++++--
 drivers/scsi/libsas/sas_event.c     |   89 +++++-
 drivers/scsi/libsas/sas_expander.c  |  113 ++++--
 drivers/scsi/libsas/sas_init.c      |  192 +++++++++-
 drivers/scsi/libsas/sas_internal.h  |   73 ++--
 drivers/scsi/libsas/sas_phy.c       |   12 +-
 drivers/scsi/libsas/sas_port.c      |   26 +-
 drivers/scsi/libsas/sas_scsi_host.c |  320 +++++++---------
 drivers/scsi/mvsas/mv_init.c        |    1 -
 drivers/scsi/mvsas/mv_sas.c         |   11 +-
 drivers/scsi/pm8001/pm8001_init.c   |    1 -
 drivers/scsi/pm8001/pm8001_sas.c    |   29 +-
 drivers/scsi/scsi_transport_sas.c   |   59 +++-
 include/linux/libata.h              |    1 +
 include/scsi/libsas.h               |   67 ++--
 include/scsi/sas_ata.h              |   26 +-
 include/scsi/scsi_transport_sas.h   |   12 +-
 38 files changed, 1321 insertions(+), 1357 deletions(-)
--
To unsubscribe from this list: send the line "unsubscribe linux-ide" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Comments

James Bottomley Jan. 10, 2012, 7:29 p.m. UTC | #1
On Mon, 2012-01-09 at 23:38 -0800, Dan Williams wrote:
> The latest version of the patch kit is available here:
> 
>   git://git.kernel.org/pub/scm/linux/kernel/git/djbw/isci.git libsas-eh-reworks-v4
> 
> Changes since v3: http://marc.info/?l=linux-scsi&m=132581455214789&w=2

Just to check I'm not lost here. I think this is currently the only
mergeable set of patches you have because you're reworking patches in
all your other series and will repost ... is this correct?

James


--
To unsubscribe from this list: send the line "unsubscribe linux-ide" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Dan Williams Jan. 10, 2012, 8:18 p.m. UTC | #2
On Tue, Jan 10, 2012 at 11:29 AM, James Bottomley
<James.Bottomley@hansenpartnership.com> wrote:
> On Mon, 2012-01-09 at 23:38 -0800, Dan Williams wrote:
>> The latest version of the patch kit is available here:
>>
>>   git://git.kernel.org/pub/scm/linux/kernel/git/djbw/isci.git libsas-eh-reworks-v4
>>
>> Changes since v3: http://marc.info/?l=linux-scsi&m=132581455214789&w=2
>
> Just to check I'm not lost here. I think this is currently the only
> mergeable set of patches you have because you're reworking patches in
> all your other series and will repost ... is this correct?

There are two mergeable topics in isci.git:
libsas-eh-reworks-v4: mergeable
devel: mergeable, posted as "[GIT PATCH v2 00/15] isci update for 3.3
(part1)" [1].  This contains all the general updates to the driver
that were independent of the libsas churn.  It was rebased to
incorporate Ben's firmware removal patch, but is otherwise idle and
ready to merge.

--
Dan

http://marc.info/?l=linux-scsi&m=132581499114886&w=2
--
To unsubscribe from this list: send the line "unsubscribe linux-ide" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jack Wang Jan. 13, 2012, 12:57 a.m. UTC | #3
Hi Dan,

Thanks for your fix, I do test this with new patchset, this works good for
me.
Only one thing confuse me, kernel sometimes print cmd timed out when disk
attached.
Like :
"
[  312.732468] sd 4:0:11:0: [sdl] command ffff88032c6eaa00 timed out
[  312.753114] sd 4:0:13:0: [sdn] command ffff88032c6eb000 timed out
[  312.753257] sd 4:0:4:0: [sde] command ffff88032903e800 timed out
[  312.753266] sd 4:0:14:0: [sdo] command ffff880329284c00 timed out
[  312.755304] sd 4:0:1:0: [sdb] command ffff8801b4b80600 timed out
[  312.797458] sd 4:0:15:0: [sdp] command ffff880329285900 timed out
"
Although, this is no harm.

Jack
> 
> The latest version of the patch kit is available here:
> 
>   git://git.kernel.org/pub/scm/linux/kernel/git/djbw/isci.git
> libsas-eh-reworks-v4
> 
> Changes since v3: http://marc.info/?l=linux-scsi&m=132581455214789&w=2
> 
> 1/ null pointer regression smp_execute_task: make sure device is not
>    removed from the domain while eh is in flight.  Fix up the logic that
>    defers domain_device destruction to its own context.  Folded into:
>    "libsas: prevent domain rediscovery competing with ata error handling"
>    plus a small prep patch: "libsas: convert dev->gone to flags"
> 
> 2/ lockdep regression in smp_execute_task: failed to release the lock
>    before returning.  Folded into: "libsas: check for 'gone' expanders in
>    smp_execute_task()"
> 
> 3/ unnecessary locking in isci_ata_check_ready().  Folded into: "isci:
>    ->lldd_ata_check_ready handler"
> 
> 4/ two new fixes follow in separate mails
>    [PATCH v4 1/2] libsas: pre-clean commands that won the eh vs completion
race
>    [PATCH v4 2/2] libsas: feed the scsi_block_when_processing_errors()
meter
> 
> Incremental diff: libsas-eh-reworks-v3..libsas-eh-reworks-v4
> 
> diff --git a/drivers/scsi/isci/port.c b/drivers/scsi/isci/port.c
> index e795645..eca9ba0 100644
> --- a/drivers/scsi/isci/port.c
> +++ b/drivers/scsi/isci/port.c
> @@ -1681,10 +1681,7 @@ int isci_ata_check_ready(struct domain_device *dev)
>  	if (test_bit(IPORT_RESET_PENDING, &iport->state))
>  		goto out;
> 
> -	/* snapshot active phy mask */
> -	spin_lock_irqsave(&ihost->scic_lock, flags);
>  	rc = !!iport->active_phy_mask;
> -	spin_unlock_irqrestore(&ihost->scic_lock, flags);
>   out:
>  	isci_put_device(idev);
> 
> diff --git a/drivers/scsi/libsas/sas_ata.c b/drivers/scsi/libsas/sas_ata.c
> index f90fdcf..a062adc 100644
> --- a/drivers/scsi/libsas/sas_ata.c
> +++ b/drivers/scsi/libsas/sas_ata.c
> @@ -195,7 +195,7 @@ static unsigned int sas_ata_qc_issue(struct
ata_queued_cmd
> *qc)
>  	spin_unlock(ap->lock);
> 
>  	/* If the device fell off, no sense in issuing commands */
> -	if (dev->gone)
> +	if (test_bit(SAS_DEV_GONE, &dev->state))
>  		goto out;
> 
>  	task = sas_alloc_task(GFP_ATOMIC);
> @@ -327,7 +327,7 @@ static int sas_ata_hard_reset(struct ata_link *link,
> unsigned int *class,
>  	struct domain_device *dev = ap->private_data;
>  	struct sas_internal *i = dev_to_sas_internal(dev);
> 
> -	if (dev->gone)
> +	if (test_bit(SAS_DEV_GONE, &dev->state))
>  		return -ENODEV;
> 
>  	res = i->dft->lldd_I_T_nexus_reset(dev);
> @@ -663,6 +663,11 @@ static void async_sas_ata_eh(void *data,
async_cookie_t
> cookie)
> 
>  	ata_port_printk(ap, KERN_DEBUG, "sas eh calling libata port error
> handler");
>  	ata_scsi_port_error_handler(ha->core.shost, ap);
> +
> +	/* tell scsi_block_when_processing_errors() waiters that we are
> +	 * still making forward progress
> +	 */
> +	wake_up(&ha->core.shost->host_wait);
>  }
> 
>  void sas_ata_strategy_handler(struct Scsi_Host *shost)
> diff --git a/drivers/scsi/libsas/sas_discover.c
> b/drivers/scsi/libsas/sas_discover.c
> index 0b7555d..dfb0250 100644
> --- a/drivers/scsi/libsas/sas_discover.c
> +++ b/drivers/scsi/libsas/sas_discover.c
> @@ -269,17 +269,22 @@ static void sas_destruct_devices(struct work_struct
> *work)
> 
>  	clear_bit(DISCE_DESTRUCT, &port->disc.pending);
> 
> -	list_for_each_entry_safe(dev, n, &port->destroy_list, dev_list_node)
{
> +	list_for_each_entry_safe(dev, n, &port->destroy_list,
disco_list_node)
> {
> +		list_del_init(&dev->disco_list_node);
> +
>  		sas_remove_children(&dev->rphy->dev);
>  		sas_rphy_delete(dev->rphy);
>  		dev->rphy = NULL;
>  		sas_unregister_common_dev(port, dev);
> +
> +		sas_put_device(dev);
>  	}
>  }
> 
>  void sas_unregister_dev(struct asd_sas_port *port, struct domain_device
> *dev)
>  {
> -	if (!list_empty(&dev->disco_list_node)) {
> +	if (!test_bit(SAS_DEV_DESTROY, &dev->state) &&
> +	    !list_empty(&dev->disco_list_node)) {
>  		/* this rphy never saw sas_rphy_add */
>  		list_del_init(&dev->disco_list_node);
>  		sas_rphy_free(dev->rphy);
> @@ -287,13 +292,9 @@ void sas_unregister_dev(struct asd_sas_port *port,
struct
> domain_device *dev)
>  		sas_unregister_common_dev(port, dev);
>  	}
> 
> -	if (dev->rphy) {
> +	if (dev->rphy && !test_and_set_bit(SAS_DEV_DESTROY, &dev->state)) {
>  		sas_rphy_unlink(dev->rphy);
> -
> -		spin_lock_irq(&port->dev_list_lock);
> -		list_move_tail(&dev->dev_list_node, &port->destroy_list);
> -		spin_unlock_irq(&port->dev_list_lock);
> -
> +		list_move_tail(&dev->disco_list_node, &port->destroy_list);
>  		sas_discover_event(dev->port, DISCE_DESTRUCT);
>  	}
>  }
> diff --git a/drivers/scsi/libsas/sas_expander.c
> b/drivers/scsi/libsas/sas_expander.c
> index e47599b..68a80a0 100644
> --- a/drivers/scsi/libsas/sas_expander.c
> +++ b/drivers/scsi/libsas/sas_expander.c
> @@ -74,8 +74,10 @@ static int smp_execute_task(struct domain_device *dev,
void
> *req, int req_size,
> 
>  	mutex_lock(&dev->ex_dev.cmd_mutex);
>  	for (retry = 0; retry < 3; retry++) {
> -		if (dev->gone)
> -			return -ECOMM;
> +		if (test_bit(SAS_DEV_GONE, &dev->state)) {
> +			res = -ECOMM;
> +			break;
> +		}
> 
>  		task = sas_alloc_task(GFP_KERNEL);
>  		if (!task) {
> @@ -1794,7 +1796,7 @@ static void sas_unregister_ex_tree(struct
asd_sas_port
> *port, struct domain_devi
>  	struct domain_device *child, *n;
> 
>  	list_for_each_entry_safe(child, n, &ex->children, siblings) {
> -		child->gone = 1;
> +		set_bit(SAS_DEV_GONE, &child->state);
>  		if (child->dev_type == EDGE_DEV ||
>  		    child->dev_type == FANOUT_DEV)
>  			sas_unregister_ex_tree(port, child);
> @@ -1815,7 +1817,7 @@ static void sas_unregister_devs_sas_addr(struct
> domain_device *parent,
>  			&ex_dev->children, siblings) {
>  			if (SAS_ADDR(child->sas_addr) ==
>  			    SAS_ADDR(phy->attached_sas_addr)) {
> -				child->gone = 1;
> +				set_bit(SAS_DEV_GONE, &child->state);
>  				if (child->dev_type == EDGE_DEV ||
>  				    child->dev_type == FANOUT_DEV)
>  					sas_unregister_ex_tree(parent->port,
child);
> diff --git a/drivers/scsi/libsas/sas_port.c
> b/drivers/scsi/libsas/sas_port.c
> index 36e2905..31adcd1 100644
> --- a/drivers/scsi/libsas/sas_port.c
> +++ b/drivers/scsi/libsas/sas_port.c
> @@ -168,7 +168,7 @@ void sas_deform_port(struct asd_sas_phy *phy, int
gone)
> 
>  	if (port->num_phys == 1) {
>  		if (dev && gone)
> -			dev->gone = 1;
> +			set_bit(SAS_DEV_GONE, &dev->state);
>  		sas_unregister_domain_devices(port);
>  		sas_port_delete(port->port);
>  		port->port = NULL;
> diff --git a/drivers/scsi/libsas/sas_scsi_host.c
> b/drivers/scsi/libsas/sas_scsi_host.c
> index 59a227d..731c892 100644
> --- a/drivers/scsi/libsas/sas_scsi_host.c
> +++ b/drivers/scsi/libsas/sas_scsi_host.c
> @@ -208,7 +208,7 @@ int sas_queuecommand(struct Scsi_Host *host, struct
> scsi_cmnd *cmd)
>  	int res = 0;
> 
>  	/* If the device fell off, no sense in issuing commands */
> -	if (dev->gone) {
> +	if (test_bit(SAS_DEV_GONE, &dev->state)) {
>  		cmd->result = DID_BAD_TARGET << 16;
>  		goto out_done;
>  	}
> @@ -249,8 +249,8 @@ out_done:
> 
>  static void sas_eh_finish_cmd(struct scsi_cmnd *cmd)
>  {
> -	struct sas_task *task = TO_SAS_TASK(cmd);
>  	struct sas_ha_struct *sas_ha = SHOST_TO_SAS_HA(cmd->device->host);
> +	struct sas_task *task = TO_SAS_TASK(cmd);
> 
>  	/* At this point, we only get called following an actual abort
>  	 * of the task, so we should be guaranteed not to be racing with
> @@ -267,9 +267,9 @@ static void sas_eh_finish_cmd(struct scsi_cmnd *cmd)
> 
>  static void sas_eh_defer_cmd(struct scsi_cmnd *cmd)
>  {
> -	struct sas_task *task = TO_SAS_TASK(cmd);
> -	struct domain_device *dev = task->dev;
> +	struct domain_device *dev = cmd_to_domain_dev(cmd);
>  	struct sas_ha_struct *ha = dev->port->ha;
> +	struct sas_task *task = TO_SAS_TASK(cmd);
> 
>  	if (!dev_is_sata(dev)) {
>  		sas_eh_finish_cmd(cmd);
> @@ -530,8 +530,9 @@ static int sas_eh_handle_sas_errors(struct Scsi_Host
> *shost,
>  	struct sas_internal *i = to_sas_internal(shost->transportt);
>  	unsigned long flags;
>  	struct sas_ha_struct *ha = SHOST_TO_SAS_HA(shost);
> +	LIST_HEAD(done);
> 
> -Again:
> +	/* clean out any commands that won the completion vs eh race */
>  	list_for_each_entry_safe(cmd, n, work_q, eh_entry) {
>  		struct domain_device *dev = cmd_to_domain_dev(cmd);
>  		struct sas_task *task;
> @@ -545,7 +546,12 @@ Again:
>  		spin_unlock_irqrestore(&dev->done_lock, flags);
> 
>  		if (!task)
> -			continue;
> +			list_move_tail(&cmd->eh_entry, &done);
> +	}
> +
> + Again:
> +	list_for_each_entry_safe(cmd, n, work_q, eh_entry) {
> +		struct sas_task *task = TO_SAS_TASK(cmd);
> 
>  		list_del_init(&cmd->eh_entry);
> 
> @@ -649,15 +655,16 @@ Again:
>  			goto clear_q;
>  		}
>  	}
> + out:
> +	list_splice_tail(&done, work_q);
>  	list_splice_tail_init(&ha->eh_ata_q, work_q);
>  	return list_empty(work_q);
> -clear_q:
> +
> + clear_q:
>  	SAS_DPRINTK("--- Exit %s -- clear_q\n", __func__);
>  	list_for_each_entry_safe(cmd, n, work_q, eh_entry)
>  		sas_eh_finish_cmd(cmd);
> -
> -	list_splice_tail_init(&ha->eh_ata_q, work_q);
> -	return list_empty(work_q);
> +	goto out;
>  }
> 
>  void sas_scsi_recover_host(struct Scsi_Host *shost)
> diff --git a/include/scsi/libsas.h b/include/scsi/libsas.h
> index feb61a8..55bab86 100644
> --- a/include/scsi/libsas.h
> +++ b/include/scsi/libsas.h
> @@ -174,7 +174,11 @@ struct sata_device {
>  	struct ata_taskfile tf;
>  };
> 
> -/* ---------- Domain device ---------- */
> +enum {
> +	SAS_DEV_GONE,
> +	SAS_DEV_DESTROY,
> +};
> +
>  struct domain_device {
>  	spinlock_t done_lock;
>          enum sas_dev_type dev_type;
> @@ -191,7 +195,7 @@ struct domain_device {
>  	struct sas_phy *phy;
> 
>          struct list_head dev_list_node;
> -	struct list_head disco_list_node;
> +	struct list_head disco_list_node; /* awaiting probe or destruct */
> 
>          enum sas_protocol    iproto;
>          enum sas_protocol    tproto;
> @@ -209,7 +213,7 @@ struct domain_device {
>          };
> 
>          void *lldd_dev;
> -	int gone;
> +	unsigned long state;
>  	struct kref kref;
>  };
> 
> 
> The following changes since commit
5c41dc3a79150e93e5d050871a10b761be8281a1:
> 
>   [SCSI] lpfc 8.3.28: Update driver version to 8.3.28 (2011-12-15 10:57:45
> +0400)
> 
> are available in the git repository at:
>   git://git.kernel.org/pub/scm/linux/kernel/git/djbw/isci.git
> libsas-eh-reworks-v4
> 
> Dan Williams (36):
>       libsas: remove unused ata_task_resp fields
>       libsas: kill sas_slave_destroy
>       libsas: fix domain_device leak
>       libsas: fix leak of dev->sata_dev.identify_[packet_]device
>       libsas: replace event locks with atomic bitops
>       libsas: convert ha->state to flags
>       libsas: introduce sas_drain_work()
>       libsas: remove ata_port.lock management duties from lldds
>       libsas: convert dev->gone to flags
>       libsas: prevent domain rediscovery competing with ata error handling
>       libsas: use ->set_dmamode to notify lldds of NCQ parameters
>       libsas: kill invocation of scsi_eh_finish_cmd from sas_ata_task_done
>       libsas: close error handling vs sas_ata_task_done() race
>       libsas: prevent double completion of scmds from eh
>       libsas: fix timeout vs completion race
>       libsas: let libata handle command timeouts
>       libsas: defer SAS_TASK_NEED_DEV_RESET commands to libata
>       libsas: use libata-eh-reset for sata rediscovery fis transmit
failures
>       libsas: perform sas-transport resets in shost->workq context
>       libsas: execute transport link resets with libata-eh via host
workqueue
>       libsas: sas_phy_enable via transport_sas_phy_reset
>       libsas: async ata-eh
>       libsas: poll for ata device readiness after reset
>       libsas: don't mark expanders as gone when a child device is removed
>       libsas: check for 'gone' expanders in smp_execute_task()
>       libsas: fix sas_find_local_phy(), take phy references
>       libsas: don't recover 'gone' devices in sas_ata_hard_reset()
>       isci: kill iphy->isci_port lookups
>       isci: kill isci_port->status
>       isci: fix interpretation of "hard" reset
>       isci: stop interpreting ->lldd_lu_reset() as an ata soft-reset
>       isci: ->lldd_ata_check_ready handler
>       isci: remove bus and reset handlers
>       isci: remove IDEV_EH hack to disable "discovery-time" ata resets
>       libsas: pre-clean commands that won the eh vs completion race
>       libsas: feed the scsi_block_when_processing_errors() meter
> 
> Jeff Skirvin (2):
>       libsas: Remove redundant phy state notification calls.
>       libsas: add mutex for SMP task execution
> 
>  Documentation/scsi/libsas.txt       |   15 -
>  drivers/ata/libata-eh.c             |    1 +
>  drivers/ata/libata.h                |    1 -
>  drivers/scsi/aic94xx/aic94xx.h      |    2 +
>  drivers/scsi/aic94xx/aic94xx_dev.c  |   38 ++-
>  drivers/scsi/aic94xx/aic94xx_init.c |    5 +-
>  drivers/scsi/aic94xx/aic94xx_tmf.c  |    9 +-
>  drivers/scsi/isci/host.c            |    8 +-
>  drivers/scsi/isci/host.h            |   19 +-
>  drivers/scsi/isci/init.c            |   13 +-
>  drivers/scsi/isci/phy.c             |   18 +-
>  drivers/scsi/isci/phy.h             |    1 -
>  drivers/scsi/isci/port.c            |  217 ++++++------
>  drivers/scsi/isci/port.h            |   11 +-
>  drivers/scsi/isci/remote_device.c   |   32 +--
>  drivers/scsi/isci/remote_device.h   |    7 +-
>  drivers/scsi/isci/request.c         |  198 +----------
>  drivers/scsi/isci/request.h         |    9 +-
>  drivers/scsi/isci/task.c            |  158 ++-------
>  drivers/scsi/isci/task.h            |   40 --
>  drivers/scsi/libsas/sas_ata.c       |  692
> +++++++++++++++--------------------
>  drivers/scsi/libsas/sas_discover.c  |  152 +++++++--
>  drivers/scsi/libsas/sas_event.c     |   89 +++++-
>  drivers/scsi/libsas/sas_expander.c  |  113 ++++--
>  drivers/scsi/libsas/sas_init.c      |  192 +++++++++-
>  drivers/scsi/libsas/sas_internal.h  |   73 ++--
>  drivers/scsi/libsas/sas_phy.c       |   12 +-
>  drivers/scsi/libsas/sas_port.c      |   26 +-
>  drivers/scsi/libsas/sas_scsi_host.c |  320 +++++++---------
>  drivers/scsi/mvsas/mv_init.c        |    1 -
>  drivers/scsi/mvsas/mv_sas.c         |   11 +-
>  drivers/scsi/pm8001/pm8001_init.c   |    1 -
>  drivers/scsi/pm8001/pm8001_sas.c    |   29 +-
>  drivers/scsi/scsi_transport_sas.c   |   59 +++-
>  include/linux/libata.h              |    1 +
>  include/scsi/libsas.h               |   67 ++--
>  include/scsi/sas_ata.h              |   26 +-
>  include/scsi/scsi_transport_sas.h   |   12 +-
>  38 files changed, 1321 insertions(+), 1357 deletions(-)
> --
> To unsubscribe from this list: send the line "unsubscribe linux-scsi" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

--
To unsubscribe from this list: send the line "unsubscribe linux-ide" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Dan Williams Jan. 13, 2012, 1:21 a.m. UTC | #4
On Thu, Jan 12, 2012 at 4:57 PM, Jack Wang <jack_wang@usish.com> wrote:
> Hi Dan,
>
> Thanks for your fix, I do test this with new patchset, this works good for
> me.
> Only one thing confuse me, kernel sometimes print cmd timed out when disk
> attached.
> Like :
> "
> [  312.732468] sd 4:0:11:0: [sdl] command ffff88032c6eaa00 timed out
> [  312.753114] sd 4:0:13:0: [sdn] command ffff88032c6eb000 timed out
> [  312.753257] sd 4:0:4:0: [sde] command ffff88032903e800 timed out
> [  312.753266] sd 4:0:14:0: [sdo] command ffff880329284c00 timed out
> [  312.755304] sd 4:0:1:0: [sdb] command ffff8801b4b80600 timed out
> [  312.797458] sd 4:0:15:0: [sdp] command ffff880329285900 timed out
> "
> Although, this is no harm.

These were probably timeouts that were happening before but did not
get reported by the old sas_scsi_timed_out().

So, I'm still trying to figure out what to do about the "failure to
transmit signature-fis" case that you sent a patch to address.  I'm
wondering if we need to schedule rediscovery after a longer timeout?
I agree with skipping sas_set_ex_phy(), but for initial discovery it
would be nice to know that we have a device out there that is trying
to connect and hold off ->scan_finished() until libsas has given up on
waiting for that phy to settle.

For example libata will reset 3 times with increasing wait times (10s,
10s, 35s).  On the last attempt it will slow down the phy to give the
ata device a better chance of connecting.  libsas is just doing 3
500ms tries at full speed and then giving up.

Wouldn't mind someone from libata land commenting on how libsas can be
more helpful to struggling sata devices.

"Let libata do it" would be my first choice, but at the point where we
are discovering this condition we don't yet have an ata_port, and
libsas only creates an ata_port *after* receiving a signature-fis.
Maybe we could create an unattached ata_port (i.e. with a stand-in
domain_device), but libsas would need to learn how to attach and probe
the domain_device after the fact.

--
Dan
--
To unsubscribe from this list: send the line "unsubscribe linux-ide" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jack Wang Jan. 13, 2012, 1:37 a.m. UTC | #5
> 
> On Thu, Jan 12, 2012 at 4:57 PM, Jack Wang <jack_wang@usish.com> wrote:
> > Hi Dan,
> >
> > Thanks for your fix, I do test this with new patchset, this works good for
> > me.
> > Only one thing confuse me, kernel sometimes print cmd timed out when disk
> > attached.
> > Like :
> > "
> > [  312.732468] sd 4:0:11:0: [sdl] command ffff88032c6eaa00 timed out
> > [  312.753114] sd 4:0:13:0: [sdn] command ffff88032c6eb000 timed out
> > [  312.753257] sd 4:0:4:0: [sde] command ffff88032903e800 timed out
> > [  312.753266] sd 4:0:14:0: [sdo] command ffff880329284c00 timed out
> > [  312.755304] sd 4:0:1:0: [sdb] command ffff8801b4b80600 timed out
> > [  312.797458] sd 4:0:15:0: [sdp] command ffff880329285900 timed out
> > "
> > Although, this is no harm.
> 
> These were probably timeouts that were happening before but did not
> get reported by the old sas_scsi_timed_out().
> 
> So, I'm still trying to figure out what to do about the "failure to
> transmit signature-fis" case that you sent a patch to address.  I'm
> wondering if we need to schedule rediscovery after a longer timeout?
> I agree with skipping sas_set_ex_phy(), but for initial discovery it
> would be nice to know that we have a device out there that is trying
> to connect and hold off ->scan_finished() until libsas has given up on
> waiting for that phy to settle.
> 
> For example libata will reset 3 times with increasing wait times (10s,
> 10s, 35s).  On the last attempt it will slow down the phy to give the
> ata device a better chance of connecting.  libsas is just doing 3
> 500ms tries at full speed and then giving up.
[Jack Wang] 
Yes, in our internal test I expand the time to 10s, but I'm not sure this will be accepted by mainline kernel.
> 
> Wouldn't mind someone from libata land commenting on how libsas can be
> more helpful to struggling sata devices.
> 
> "Let libata do it" would be my first choice, but at the point where we
> are discovering this condition we don't yet have an ata_port, and
> libsas only creates an ata_port *after* receiving a signature-fis.
> Maybe we could create an unattached ata_port (i.e. with a stand-in
> domain_device), but libsas would need to learn how to attach and probe
> the domain_device after the fact.
> 
> --
> Dan

--
To unsubscribe from this list: send the line "unsubscribe linux-ide" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Dan Williams Jan. 13, 2012, 1:51 a.m. UTC | #6
On Thu, Jan 12, 2012 at 5:37 PM, Jack Wang <jack_wang@usish.com> wrote:
>> For example libata will reset 3 times with increasing wait times (10s,
>> 10s, 35s).  On the last attempt it will slow down the phy to give the
>> ata device a better chance of connecting.  libsas is just doing 3
>> 500ms tries at full speed and then giving up.
> [Jack Wang]
> Yes, in our internal test I expand the time to 10s, but I'm not sure this will be accepted by mainline kernel.

Do you happen to have logs of a passing and failing case, i.e. before
and after adding the 10s time delay change?

I think having libata do it's normal link recovery in this case would
solve both problems of giving enough time for ata device to come up,
and allowing the initial scan to find the device and let the initial
trip through ata eh manage link.  But it would be a pretty big change
from what libsas is doing currently for sata discovery.
--
To unsubscribe from this list: send the line "unsubscribe linux-ide" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
James Bottomley Jan. 16, 2012, 8:40 a.m. UTC | #7
On Tue, 2012-01-10 at 12:18 -0800, Dan Williams wrote:
> On Tue, Jan 10, 2012 at 11:29 AM, James Bottomley
> <James.Bottomley@hansenpartnership.com> wrote:
> > On Mon, 2012-01-09 at 23:38 -0800, Dan Williams wrote:
> >> The latest version of the patch kit is available here:
> >>
> >>   git://git.kernel.org/pub/scm/linux/kernel/git/djbw/isci.git libsas-eh-reworks-v4
> >>
> >> Changes since v3: http://marc.info/?l=linux-scsi&m=132581455214789&w=2
> >
> > Just to check I'm not lost here. I think this is currently the only
> > mergeable set of patches you have because you're reworking patches in
> > all your other series and will repost ... is this correct?
> 
> There are two mergeable topics in isci.git:
> libsas-eh-reworks-v4: mergeable
> devel: mergeable, posted as "[GIT PATCH v2 00/15] isci update for 3.3
> (part1)" [1].  This contains all the general updates to the driver
> that were independent of the libsas churn.  It was rebased to
> incorporate Ben's firmware removal patch, but is otherwise idle and
> ready to merge.

1/2 doesn't apply:

patching file drivers/scsi/libsas/sas_scsi_host.c
Hunk #1 succeeded at 235 with fuzz 2 (offset -14 lines).
Hunk #2 FAILED at 267.
Hunk #3 succeeded at 505 with fuzz 2 (offset -25 lines).
Hunk #4 succeeded at 512 with fuzz 1 (offset -34 lines).
Hunk #5 FAILED at 655.
2 out of 5 hunks FAILED -- saving rejects to file
drivers/scsi/libsas/sas_scsi_host.c.rej

James


--
To unsubscribe from this list: send the line "unsubscribe linux-ide" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/drivers/scsi/isci/port.c b/drivers/scsi/isci/port.c
index e795645..eca9ba0 100644
--- a/drivers/scsi/isci/port.c
+++ b/drivers/scsi/isci/port.c
@@ -1681,10 +1681,7 @@  int isci_ata_check_ready(struct domain_device *dev)
 	if (test_bit(IPORT_RESET_PENDING, &iport->state))
 		goto out;
 
-	/* snapshot active phy mask */
-	spin_lock_irqsave(&ihost->scic_lock, flags);
 	rc = !!iport->active_phy_mask;
-	spin_unlock_irqrestore(&ihost->scic_lock, flags);
  out:
 	isci_put_device(idev);
 
diff --git a/drivers/scsi/libsas/sas_ata.c b/drivers/scsi/libsas/sas_ata.c
index f90fdcf..a062adc 100644
--- a/drivers/scsi/libsas/sas_ata.c
+++ b/drivers/scsi/libsas/sas_ata.c
@@ -195,7 +195,7 @@  static unsigned int sas_ata_qc_issue(struct ata_queued_cmd *qc)
 	spin_unlock(ap->lock);
 
 	/* If the device fell off, no sense in issuing commands */
-	if (dev->gone)
+	if (test_bit(SAS_DEV_GONE, &dev->state))
 		goto out;
 
 	task = sas_alloc_task(GFP_ATOMIC);
@@ -327,7 +327,7 @@  static int sas_ata_hard_reset(struct ata_link *link, unsigned int *class,
 	struct domain_device *dev = ap->private_data;
 	struct sas_internal *i = dev_to_sas_internal(dev);
 
-	if (dev->gone)
+	if (test_bit(SAS_DEV_GONE, &dev->state))
 		return -ENODEV;
 
 	res = i->dft->lldd_I_T_nexus_reset(dev);
@@ -663,6 +663,11 @@  static void async_sas_ata_eh(void *data, async_cookie_t cookie)
 
 	ata_port_printk(ap, KERN_DEBUG, "sas eh calling libata port error handler");
 	ata_scsi_port_error_handler(ha->core.shost, ap);
+
+	/* tell scsi_block_when_processing_errors() waiters that we are
+	 * still making forward progress
+	 */
+	wake_up(&ha->core.shost->host_wait);
 }
 
 void sas_ata_strategy_handler(struct Scsi_Host *shost)
diff --git a/drivers/scsi/libsas/sas_discover.c b/drivers/scsi/libsas/sas_discover.c
index 0b7555d..dfb0250 100644
--- a/drivers/scsi/libsas/sas_discover.c
+++ b/drivers/scsi/libsas/sas_discover.c
@@ -269,17 +269,22 @@  static void sas_destruct_devices(struct work_struct *work)
 
 	clear_bit(DISCE_DESTRUCT, &port->disc.pending);
 
-	list_for_each_entry_safe(dev, n, &port->destroy_list, dev_list_node) {
+	list_for_each_entry_safe(dev, n, &port->destroy_list, disco_list_node) {
+		list_del_init(&dev->disco_list_node);
+
 		sas_remove_children(&dev->rphy->dev);
 		sas_rphy_delete(dev->rphy);
 		dev->rphy = NULL;
 		sas_unregister_common_dev(port, dev);
+
+		sas_put_device(dev);
 	}
 }
 
 void sas_unregister_dev(struct asd_sas_port *port, struct domain_device *dev)
 {
-	if (!list_empty(&dev->disco_list_node)) {
+	if (!test_bit(SAS_DEV_DESTROY, &dev->state) &&
+	    !list_empty(&dev->disco_list_node)) {
 		/* this rphy never saw sas_rphy_add */
 		list_del_init(&dev->disco_list_node);
 		sas_rphy_free(dev->rphy);
@@ -287,13 +292,9 @@  void sas_unregister_dev(struct asd_sas_port *port, struct domain_device *dev)
 		sas_unregister_common_dev(port, dev);
 	}
 
-	if (dev->rphy) {
+	if (dev->rphy && !test_and_set_bit(SAS_DEV_DESTROY, &dev->state)) {
 		sas_rphy_unlink(dev->rphy);
-
-		spin_lock_irq(&port->dev_list_lock);
-		list_move_tail(&dev->dev_list_node, &port->destroy_list);
-		spin_unlock_irq(&port->dev_list_lock);
-
+		list_move_tail(&dev->disco_list_node, &port->destroy_list);
 		sas_discover_event(dev->port, DISCE_DESTRUCT);
 	}
 }
diff --git a/drivers/scsi/libsas/sas_expander.c b/drivers/scsi/libsas/sas_expander.c
index e47599b..68a80a0 100644
--- a/drivers/scsi/libsas/sas_expander.c
+++ b/drivers/scsi/libsas/sas_expander.c
@@ -74,8 +74,10 @@  static int smp_execute_task(struct domain_device *dev, void *req, int req_size,
 
 	mutex_lock(&dev->ex_dev.cmd_mutex);
 	for (retry = 0; retry < 3; retry++) {
-		if (dev->gone)
-			return -ECOMM;
+		if (test_bit(SAS_DEV_GONE, &dev->state)) {
+			res = -ECOMM;
+			break;
+		}
 
 		task = sas_alloc_task(GFP_KERNEL);
 		if (!task) {
@@ -1794,7 +1796,7 @@  static void sas_unregister_ex_tree(struct asd_sas_port *port, struct domain_devi
 	struct domain_device *child, *n;
 
 	list_for_each_entry_safe(child, n, &ex->children, siblings) {
-		child->gone = 1;
+		set_bit(SAS_DEV_GONE, &child->state);
 		if (child->dev_type == EDGE_DEV ||
 		    child->dev_type == FANOUT_DEV)
 			sas_unregister_ex_tree(port, child);
@@ -1815,7 +1817,7 @@  static void sas_unregister_devs_sas_addr(struct domain_device *parent,
 			&ex_dev->children, siblings) {
 			if (SAS_ADDR(child->sas_addr) ==
 			    SAS_ADDR(phy->attached_sas_addr)) {
-				child->gone = 1;
+				set_bit(SAS_DEV_GONE, &child->state);
 				if (child->dev_type == EDGE_DEV ||
 				    child->dev_type == FANOUT_DEV)
 					sas_unregister_ex_tree(parent->port, child);
diff --git a/drivers/scsi/libsas/sas_port.c b/drivers/scsi/libsas/sas_port.c
index 36e2905..31adcd1 100644
--- a/drivers/scsi/libsas/sas_port.c
+++ b/drivers/scsi/libsas/sas_port.c
@@ -168,7 +168,7 @@  void sas_deform_port(struct asd_sas_phy *phy, int gone)
 
 	if (port->num_phys == 1) {
 		if (dev && gone)
-			dev->gone = 1;
+			set_bit(SAS_DEV_GONE, &dev->state);
 		sas_unregister_domain_devices(port);
 		sas_port_delete(port->port);
 		port->port = NULL;
diff --git a/drivers/scsi/libsas/sas_scsi_host.c b/drivers/scsi/libsas/sas_scsi_host.c
index 59a227d..731c892 100644
--- a/drivers/scsi/libsas/sas_scsi_host.c
+++ b/drivers/scsi/libsas/sas_scsi_host.c
@@ -208,7 +208,7 @@  int sas_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *cmd)
 	int res = 0;
 
 	/* If the device fell off, no sense in issuing commands */
-	if (dev->gone) {
+	if (test_bit(SAS_DEV_GONE, &dev->state)) {
 		cmd->result = DID_BAD_TARGET << 16;
 		goto out_done;
 	}
@@ -249,8 +249,8 @@  out_done:
 
 static void sas_eh_finish_cmd(struct scsi_cmnd *cmd)
 {
-	struct sas_task *task = TO_SAS_TASK(cmd);
 	struct sas_ha_struct *sas_ha = SHOST_TO_SAS_HA(cmd->device->host);
+	struct sas_task *task = TO_SAS_TASK(cmd);
 
 	/* At this point, we only get called following an actual abort
 	 * of the task, so we should be guaranteed not to be racing with
@@ -267,9 +267,9 @@  static void sas_eh_finish_cmd(struct scsi_cmnd *cmd)
 
 static void sas_eh_defer_cmd(struct scsi_cmnd *cmd)
 {
-	struct sas_task *task = TO_SAS_TASK(cmd);
-	struct domain_device *dev = task->dev;
+	struct domain_device *dev = cmd_to_domain_dev(cmd);
 	struct sas_ha_struct *ha = dev->port->ha;
+	struct sas_task *task = TO_SAS_TASK(cmd);
 
 	if (!dev_is_sata(dev)) {
 		sas_eh_finish_cmd(cmd);
@@ -530,8 +530,9 @@  static int sas_eh_handle_sas_errors(struct Scsi_Host *shost,
 	struct sas_internal *i = to_sas_internal(shost->transportt);
 	unsigned long flags;
 	struct sas_ha_struct *ha = SHOST_TO_SAS_HA(shost);
+	LIST_HEAD(done);
 
-Again:
+	/* clean out any commands that won the completion vs eh race */
 	list_for_each_entry_safe(cmd, n, work_q, eh_entry) {
 		struct domain_device *dev = cmd_to_domain_dev(cmd);
 		struct sas_task *task;
@@ -545,7 +546,12 @@  Again:
 		spin_unlock_irqrestore(&dev->done_lock, flags);
 
 		if (!task)
-			continue;
+			list_move_tail(&cmd->eh_entry, &done);
+	}
+
+ Again:
+	list_for_each_entry_safe(cmd, n, work_q, eh_entry) {
+		struct sas_task *task = TO_SAS_TASK(cmd);
 
 		list_del_init(&cmd->eh_entry);
 
@@ -649,15 +655,16 @@  Again:
 			goto clear_q;
 		}
 	}
+ out:
+	list_splice_tail(&done, work_q);
 	list_splice_tail_init(&ha->eh_ata_q, work_q);
 	return list_empty(work_q);
-clear_q:
+
+ clear_q:
 	SAS_DPRINTK("--- Exit %s -- clear_q\n", __func__);
 	list_for_each_entry_safe(cmd, n, work_q, eh_entry)
 		sas_eh_finish_cmd(cmd);
-
-	list_splice_tail_init(&ha->eh_ata_q, work_q);
-	return list_empty(work_q);
+	goto out;
 }
 
 void sas_scsi_recover_host(struct Scsi_Host *shost)
diff --git a/include/scsi/libsas.h b/include/scsi/libsas.h
index feb61a8..55bab86 100644
--- a/include/scsi/libsas.h
+++ b/include/scsi/libsas.h
@@ -174,7 +174,11 @@  struct sata_device {
 	struct ata_taskfile tf;
 };
 
-/* ---------- Domain device ---------- */
+enum {
+	SAS_DEV_GONE,
+	SAS_DEV_DESTROY,
+};
+
 struct domain_device {
 	spinlock_t done_lock;
         enum sas_dev_type dev_type;
@@ -191,7 +195,7 @@  struct domain_device {
 	struct sas_phy *phy;
 
         struct list_head dev_list_node;
-	struct list_head disco_list_node;
+	struct list_head disco_list_node; /* awaiting probe or destruct */
 
         enum sas_protocol    iproto;
         enum sas_protocol    tproto;
@@ -209,7 +213,7 @@  struct domain_device {
         };
 
         void *lldd_dev;
-	int gone;
+	unsigned long state;
 	struct kref kref;
 };