[PATCH] libata-eh-fw: update ata_scsi_error() for new EH
authorTejun Heo <htejun@gmail.com>
Mon, 15 May 2006 11:58:12 +0000 (20:58 +0900)
committerTejun Heo <htejun@gmail.com>
Mon, 15 May 2006 11:58:12 +0000 (20:58 +0900)
Update ata_scsi_error() for new EH.  ata_scsi_error() is responsible
for claiming timed out qcs and invoking ->error_handler in safe and
synchronized manner.  As the state of the controller is unknown if a
qc has timed out, the port is frozen in such cases.

Note that ata_scsi_timed_out() isn't used for new EH.  This is because
a timed out qc cannot be claimed by EH without freezing the port and
freezing the port in ata_scsi_timed_out() results in unnecessary
abortion of other active qcs.  ata_scsi_timed_out() can be removed
once all drivers are converted to new EH.

While at it, add 'TODO: kill' comments to old EH functions.

Signed-off-by: Tejun Heo <htejun@gmail.com>
drivers/scsi/libata-eh.c
include/linux/libata.h

index cb4e2b8d32d974d129c89f9e05dbe1eae2daf129..0803231f65779776e96383f4452a09adc16c0f42 100644 (file)
@@ -44,6 +44,8 @@
 
 #include "libata.h"
 
+static void __ata_port_freeze(struct ata_port *ap);
+
 /**
  *     ata_scsi_timed_out - SCSI layer time out callback
  *     @cmd: timed out SCSI command
@@ -55,6 +57,8 @@
  *     from finishing it by setting EH_SCHEDULED and return
  *     EH_NOT_HANDLED.
  *
+ *     TODO: kill this function once old EH is gone.
+ *
  *     LOCKING:
  *     Called from timer context
  *
@@ -67,10 +71,16 @@ enum scsi_eh_timer_return ata_scsi_timed_out(struct scsi_cmnd *cmd)
        struct ata_port *ap = ata_shost_to_port(host);
        unsigned long flags;
        struct ata_queued_cmd *qc;
-       enum scsi_eh_timer_return ret = EH_HANDLED;
+       enum scsi_eh_timer_return ret;
 
        DPRINTK("ENTER\n");
 
+       if (ap->ops->error_handler) {
+               ret = EH_NOT_HANDLED;
+               goto out;
+       }
+
+       ret = EH_HANDLED;
        spin_lock_irqsave(&ap->host_set->lock, flags);
        qc = ata_qc_from_tag(ap, ap->active_tag);
        if (qc) {
@@ -81,6 +91,7 @@ enum scsi_eh_timer_return ata_scsi_timed_out(struct scsi_cmnd *cmd)
        }
        spin_unlock_irqrestore(&ap->host_set->lock, flags);
 
+ out:
        DPRINTK("EXIT, ret=%d\n", ret);
        return ret;
 }
@@ -100,21 +111,132 @@ enum scsi_eh_timer_return ata_scsi_timed_out(struct scsi_cmnd *cmd)
 void ata_scsi_error(struct Scsi_Host *host)
 {
        struct ata_port *ap = ata_shost_to_port(host);
+       spinlock_t *hs_lock = &ap->host_set->lock;
+       int i, repeat_cnt = ATA_EH_MAX_REPEAT;
+       unsigned long flags;
 
        DPRINTK("ENTER\n");
 
-       /* synchronize with IRQ handler and port task */
-       spin_unlock_wait(&ap->host_set->lock);
+       /* synchronize with port task */
        ata_port_flush_task(ap);
 
-       WARN_ON(ata_qc_from_tag(ap, ap->active_tag) == NULL);
+       /* synchronize with host_set lock and sort out timeouts */
+
+       /* For new EH, all qcs are finished in one of three ways -
+        * normal completion, error completion, and SCSI timeout.
+        * Both cmpletions can race against SCSI timeout.  When normal
+        * completion wins, the qc never reaches EH.  When error
+        * completion wins, the qc has ATA_QCFLAG_FAILED set.
+        *
+        * When SCSI timeout wins, things are a bit more complex.
+        * Normal or error completion can occur after the timeout but
+        * before this point.  In such cases, both types of
+        * completions are honored.  A scmd is determined to have
+        * timed out iff its associated qc is active and not failed.
+        */
+       if (ap->ops->error_handler) {
+               struct scsi_cmnd *scmd, *tmp;
+               int nr_timedout = 0;
+
+               spin_lock_irqsave(hs_lock, flags);
+
+               list_for_each_entry_safe(scmd, tmp, &host->eh_cmd_q, eh_entry) {
+                       struct ata_queued_cmd *qc;
+
+                       for (i = 0; i < ATA_MAX_QUEUE; i++) {
+                               qc = __ata_qc_from_tag(ap, i);
+                               if (qc->flags & ATA_QCFLAG_ACTIVE &&
+                                   qc->scsicmd == scmd)
+                                       break;
+                       }
+
+                       if (i < ATA_MAX_QUEUE) {
+                               /* the scmd has an associated qc */
+                               if (!(qc->flags & ATA_QCFLAG_FAILED)) {
+                                       /* which hasn't failed yet, timeout */
+                                       qc->err_mask |= AC_ERR_TIMEOUT;
+                                       qc->flags |= ATA_QCFLAG_FAILED;
+                                       nr_timedout++;
+                               }
+                       } else {
+                               /* Normal completion occurred after
+                                * SCSI timeout but before this point.
+                                * Successfully complete it.
+                                */
+                               scmd->retries = scmd->allowed;
+                               scsi_eh_finish_cmd(scmd, &ap->eh_done_q);
+                       }
+               }
+
+               /* If we have timed out qcs.  They belong to EH from
+                * this point but the state of the controller is
+                * unknown.  Freeze the port to make sure the IRQ
+                * handler doesn't diddle with those qcs.  This must
+                * be done atomically w.r.t. setting QCFLAG_FAILED.
+                */
+               if (nr_timedout)
+                       __ata_port_freeze(ap);
+
+               spin_unlock_irqrestore(hs_lock, flags);
+       } else
+               spin_unlock_wait(hs_lock);
+
+ repeat:
+       /* invoke error handler */
+       if (ap->ops->error_handler) {
+               /* clear EH pending */
+               spin_lock_irqsave(hs_lock, flags);
+               ap->flags &= ~ATA_FLAG_EH_PENDING;
+               spin_unlock_irqrestore(hs_lock, flags);
+
+               /* invoke EH */
+               ap->ops->error_handler(ap);
+
+               /* Exception might have happend after ->error_handler
+                * recovered the port but before this point.  Repeat
+                * EH in such case.
+                */
+               spin_lock_irqsave(hs_lock, flags);
+
+               if (ap->flags & ATA_FLAG_EH_PENDING) {
+                       if (--repeat_cnt) {
+                               ata_port_printk(ap, KERN_INFO,
+                                       "EH pending after completion, "
+                                       "repeating EH (cnt=%d)\n", repeat_cnt);
+                               spin_unlock_irqrestore(hs_lock, flags);
+                               goto repeat;
+                       }
+                       ata_port_printk(ap, KERN_ERR, "EH pending after %d "
+                                       "tries, giving up\n", ATA_EH_MAX_REPEAT);
+               }
 
-       ap->ops->eng_timeout(ap);
+               /* Clear host_eh_scheduled while holding hs_lock such
+                * that if exception occurs after this point but
+                * before EH completion, SCSI midlayer will
+                * re-initiate EH.
+                */
+               host->host_eh_scheduled = 0;
+
+               spin_unlock_irqrestore(hs_lock, flags);
+       } else {
+               WARN_ON(ata_qc_from_tag(ap, ap->active_tag) == NULL);
+               ap->ops->eng_timeout(ap);
+       }
 
+       /* finish or retry handled scmd's and clean up */
        WARN_ON(host->host_failed || !list_empty(&host->eh_cmd_q));
 
        scsi_eh_flush_done_q(&ap->eh_done_q);
 
+       /* clean up */
+       spin_lock_irqsave(hs_lock, flags);
+
+       if (ap->flags & ATA_FLAG_RECOVERED)
+               ata_port_printk(ap, KERN_INFO, "EH complete\n");
+       ap->flags &= ~ATA_FLAG_RECOVERED;
+
+       spin_unlock_irqrestore(hs_lock, flags);
+
        DPRINTK("EXIT\n");
 }
 
@@ -133,6 +255,8 @@ void ata_scsi_error(struct Scsi_Host *host)
  *     an interrupt was not delivered to the driver, even though the
  *     transaction completed successfully.
  *
+ *     TODO: kill this function once old EH is gone.
+ *
  *     LOCKING:
  *     Inherited from SCSI layer (none, can sleep)
  */
@@ -198,6 +322,8 @@ static void ata_qc_timeout(struct ata_queued_cmd *qc)
  *     an interrupt was not delivered to the driver, even though the
  *     transaction completed successfully.
  *
+ *     TODO: kill this function once old EH is gone.
+ *
  *     LOCKING:
  *     Inherited from SCSI layer (none, can sleep)
  */
index 6758b4d374a0389031f41a1d7aa151e9da094822..5ad50163c8ef63534a6f8f67c90d7e75980fa924 100644 (file)
@@ -225,6 +225,9 @@ enum {
        ATA_PORT_PRIMARY        = (1 << 0),
        ATA_PORT_SECONDARY      = (1 << 1),
 
+       /* max repeat if error condition is still set after ->error_handler */
+       ATA_EH_MAX_REPEAT       = 5,
+
        /* how hard are we gonna try to probe/recover devices */
        ATA_PROBE_MAX_TRIES     = 3,
 };