IB/ipath: Improve handling and reporting of parity errors
authorBryan O'Sullivan <bos@pathscale.com>
Thu, 15 Mar 2007 21:45:07 +0000 (14:45 -0700)
committerRoland Dreier <rolandd@cisco.com>
Thu, 19 Apr 2007 03:20:58 +0000 (20:20 -0700)
Mostly cleanup.

Signed-off-by: Dave Olson <dave.olson@qlogic.com>
Signed-off-by: Bryan O'Sullivan <bryan.osullivan@qlogic.com>
Signed-off-by: Roland Dreier <rolandd@cisco.com>
drivers/infiniband/hw/ipath/ipath_driver.c
drivers/infiniband/hw/ipath/ipath_eeprom.c
drivers/infiniband/hw/ipath/ipath_iba6110.c
drivers/infiniband/hw/ipath/ipath_iba6120.c
drivers/infiniband/hw/ipath/ipath_init_chip.c
drivers/infiniband/hw/ipath/ipath_intr.c
drivers/infiniband/hw/ipath/ipath_kernel.h
drivers/infiniband/hw/ipath/ipath_registers.h

index 056e10663289b8ac1c34c21e0c4842c5bfb37445..13b9785e684cdb93dc5db4f107d48096b062e524 100644 (file)
@@ -605,8 +605,9 @@ static void __devexit cleanup_device(struct ipath_devdata *dd)
 
                ipath_cdbg(VERBOSE, "Free shadow page tid array at %p\n",
                           dd->ipath_pageshadow);
-               vfree(dd->ipath_pageshadow);
+               tmpp = dd->ipath_pageshadow;
                dd->ipath_pageshadow = NULL;
+               vfree(tmpp);
        }
 
        /*
index a4019a6b75602c760ef5774ed2216cd33b9d6576..030185f90ee2b6ffe9ef70de2dc61ed4aa9c1c65 100644 (file)
@@ -626,6 +626,10 @@ void ipath_get_eeprom_info(struct ipath_devdata *dd)
        } else
                memcpy(dd->ipath_serial, ifp->if_serial,
                       sizeof ifp->if_serial);
+       if (!strstr(ifp->if_comment, "Tested successfully"))
+               ipath_dev_err(dd, "Board SN %s did not pass functional "
+                       "test: %s\n", dd->ipath_serial,
+                       ifp->if_comment);
 
        ipath_cdbg(VERBOSE, "Initted GUID to %llx from eeprom\n",
                   (unsigned long long) be64_to_cpu(dd->ipath_guid));
index b50436c566383e70010fbff0c0042e8afcaa7130..8e0794d316fb69a9fbe7886447af75ef8c92c821 100644 (file)
@@ -284,6 +284,14 @@ static const struct ipath_cregs ipath_ht_cregs = {
 #define INFINIPATH_EXTS_MEMBIST_ENDTEST     0x0000000000004000
 #define INFINIPATH_EXTS_MEMBIST_CORRECT     0x0000000000008000
 
+
+/* TID entries (memory), HT-only */
+#define INFINIPATH_RT_ADDR_MASK 0xFFFFFFFFFFULL        /* 40 bits valid */
+#define INFINIPATH_RT_VALID 0x8000000000000000ULL
+#define INFINIPATH_RT_ADDR_SHIFT 0
+#define INFINIPATH_RT_BUFSIZE_MASK 0x3FFFULL
+#define INFINIPATH_RT_BUFSIZE_SHIFT 48
+
 /*
  * masks and bits that are different in different chips, or present only
  * in one
@@ -402,6 +410,14 @@ static const struct ipath_hwerror_msgs ipath_6110_hwerror_msgs[] = {
        INFINIPATH_HWE_MSG(SERDESPLLFAILED, "SerDes PLL"),
 };
 
+#define TXE_PIO_PARITY ((INFINIPATH_HWE_TXEMEMPARITYERR_PIOBUF | \
+                       INFINIPATH_HWE_TXEMEMPARITYERR_PIOPBC) \
+                       << INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT)
+#define RXE_EAGER_PARITY (INFINIPATH_HWE_RXEMEMPARITYERR_EAGERTID \
+                         << INFINIPATH_HWE_RXEMEMPARITYERR_SHIFT)
+
+static int ipath_ht_txe_recover(struct ipath_devdata *);
+
 /**
  * ipath_ht_handle_hwerrors - display hardware errors.
  * @dd: the infinipath device
@@ -450,13 +466,12 @@ static void ipath_ht_handle_hwerrors(struct ipath_devdata *dd, char *msg,
 
        /*
         * make sure we get this much out, unless told to be quiet,
+        * it's a parity error we may recover from,
         * or it's occurred within the last 5 seconds
         */
-       if ((hwerrs & ~(dd->ipath_lasthwerror |
-                       ((INFINIPATH_HWE_TXEMEMPARITYERR_PIOBUF |
-                         INFINIPATH_HWE_TXEMEMPARITYERR_PIOPBC)
-                       << INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT))) ||
-           (ipath_debug & __IPATH_VERBDBG))
+       if ((hwerrs & ~(dd->ipath_lasthwerror | TXE_PIO_PARITY |
+               RXE_EAGER_PARITY)) ||
+               (ipath_debug & __IPATH_VERBDBG))
                dev_info(&dd->pcidev->dev, "Hardware error: hwerr=0x%llx "
                         "(cleared)\n", (unsigned long long) hwerrs);
        dd->ipath_lasthwerror |= hwerrs;
@@ -467,7 +482,7 @@ static void ipath_ht_handle_hwerrors(struct ipath_devdata *dd, char *msg,
                              (hwerrs & ~dd->ipath_hwe_bitsextant));
 
        ctrl = ipath_read_kreg32(dd, dd->ipath_kregs->kr_control);
-       if (ctrl & INFINIPATH_C_FREEZEMODE) {
+       if ((ctrl & INFINIPATH_C_FREEZEMODE) && !ipath_diag_inuse) {
                /*
                 * parity errors in send memory are recoverable,
                 * just cancel the send (if indicated in * sendbuffererror),
@@ -476,50 +491,14 @@ static void ipath_ht_handle_hwerrors(struct ipath_devdata *dd, char *msg,
                 * occur if a processor speculative read is done to the PIO
                 * buffer while we are sending a packet, for example.
                 */
-               if (hwerrs & ((INFINIPATH_HWE_TXEMEMPARITYERR_PIOBUF |
-                              INFINIPATH_HWE_TXEMEMPARITYERR_PIOPBC)
-                             << INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT)) {
-                       ipath_stats.sps_txeparity++;
-                       ipath_dbg("Recovering from TXE parity error (%llu), "
-                                 "hwerrstatus=%llx\n",
-                                 (unsigned long long) ipath_stats.sps_txeparity,
-                                 (unsigned long long) hwerrs);
-                       ipath_disarm_senderrbufs(dd);
-                       hwerrs &= ~((INFINIPATH_HWE_TXEMEMPARITYERR_PIOBUF |
-                                    INFINIPATH_HWE_TXEMEMPARITYERR_PIOPBC)
-                                   << INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT);
-                       if (!hwerrs) { /* else leave in freeze mode */
-                               ipath_write_kreg(dd,
-                                                dd->ipath_kregs->kr_control,
-                                                dd->ipath_control);
-                               return;
-                       }
-               }
-               if (hwerrs) {
-                       /*
-                        * if any set that we aren't ignoring; only
-                        * make the complaint once, in case it's stuck
-                        * or recurring, and we get here multiple
-                        * times.
-                        */
-                       if (dd->ipath_flags & IPATH_INITTED) {
-                               ipath_dev_err(dd, "Fatal Hardware Error (freeze "
-                                             "mode), no longer usable, SN %.16s\n",
-                                                 dd->ipath_serial);
-                               isfatal = 1;
-                       }
-                       *dd->ipath_statusp &= ~IPATH_STATUS_IB_READY;
-                       /* mark as having had error */
-                       *dd->ipath_statusp |= IPATH_STATUS_HWERROR;
-                       /*
-                        * mark as not usable, at a minimum until driver
-                        * is reloaded, probably until reboot, since no
-                        * other reset is possible.
-                        */
-                       dd->ipath_flags &= ~IPATH_INITTED;
-               } else {
-                       ipath_dbg("Clearing freezemode on ignored hardware "
-                                 "error\n");
+               if ((hwerrs & TXE_PIO_PARITY) && ipath_ht_txe_recover(dd))
+                       hwerrs &= ~TXE_PIO_PARITY;
+               if (hwerrs & RXE_EAGER_PARITY)
+                       ipath_dev_err(dd, "RXE parity, Eager TID error is not "
+                               "recoverable\n");
+               if (!hwerrs) {
+                       ipath_dbg("Clearing freezemode on ignored or "
+                                 "recovered hardware error\n");
                        ctrl &= ~INFINIPATH_C_FREEZEMODE;
                        ipath_write_kreg(dd, dd->ipath_kregs->kr_control,
                                         ctrl);
@@ -587,7 +566,32 @@ static void ipath_ht_handle_hwerrors(struct ipath_devdata *dd, char *msg,
                                 dd->ipath_hwerrmask);
        }
 
-       ipath_dev_err(dd, "%s hardware error\n", msg);
+       if (hwerrs) {
+               /*
+                * if any set that we aren't ignoring; only
+                * make the complaint once, in case it's stuck
+                * or recurring, and we get here multiple
+                * times.
+                */
+               ipath_dev_err(dd, "%s hardware error\n", msg);
+               if (dd->ipath_flags & IPATH_INITTED) {
+                       ipath_dev_err(dd, "Fatal Hardware Error (freeze "
+                                         "mode), no longer usable, SN %.16s\n",
+                                         dd->ipath_serial);
+                       isfatal = 1;
+               }
+               *dd->ipath_statusp &= ~IPATH_STATUS_IB_READY;
+               /* mark as having had error */
+               *dd->ipath_statusp |= IPATH_STATUS_HWERROR;
+               /*
+                * mark as not usable, at a minimum until driver
+                * is reloaded, probably until reboot, since no
+                * other reset is possible.
+                */
+               dd->ipath_flags &= ~IPATH_INITTED;
+       }
+       else
+               *msg = 0; /* recovered from all of them */
        if (isfatal && !ipath_diag_inuse && dd->ipath_freezemsg)
                /*
                 * for status file; if no trailing brace is copied,
@@ -658,7 +662,8 @@ static int ipath_ht_boardname(struct ipath_devdata *dd, char *name,
        if (n)
                snprintf(name, namelen, "%s", n);
 
-       if (dd->ipath_majrev != 3 || (dd->ipath_minrev < 2 || dd->ipath_minrev > 3)) {
+       if (dd->ipath_majrev != 3 || (dd->ipath_minrev < 2 ||
+               dd->ipath_minrev > 3)) {
                /*
                 * This version of the driver only supports Rev 3.2 and 3.3
                 */
@@ -1163,6 +1168,8 @@ static void ipath_ht_init_hwerrors(struct ipath_devdata *dd)
 
        if (!(extsval & INFINIPATH_EXTS_MEMBIST_ENDTEST))
                ipath_dev_err(dd, "MemBIST did not complete!\n");
+       if (extsval & INFINIPATH_EXTS_MEMBIST_CORRECT)
+               ipath_dbg("MemBIST corrected\n");
 
        ipath_check_htlink(dd);
 
@@ -1366,6 +1373,9 @@ static void ipath_ht_put_tid(struct ipath_devdata *dd,
                             u64 __iomem *tidptr, u32 type,
                             unsigned long pa)
 {
+       if (!dd->ipath_kregbase)
+               return;
+
        if (pa != dd->ipath_tidinvalid) {
                if (unlikely((pa & ~INFINIPATH_RT_ADDR_MASK))) {
                        dev_info(&dd->pcidev->dev,
@@ -1382,10 +1392,10 @@ static void ipath_ht_put_tid(struct ipath_devdata *dd,
                        pa |= lenvalid | INFINIPATH_RT_VALID;
                }
        }
-       if (dd->ipath_kregbase)
-               writeq(pa, tidptr);
+       writeq(pa, tidptr);
 }
 
+
 /**
  * ipath_ht_clear_tid - clear all TID entries for a port, expected and eager
  * @dd: the infinipath device
@@ -1515,7 +1525,7 @@ static int ipath_ht_early_init(struct ipath_devdata *dd)
                         INFINIPATH_S_ABORT);
 
        ipath_get_eeprom_info(dd);
-       if(dd->ipath_boardrev == 5 && dd->ipath_serial[0] == '1' &&
+       if (dd->ipath_boardrev == 5 && dd->ipath_serial[0] == '1' &&
                dd->ipath_serial[1] == '2' && dd->ipath_serial[2] == '8') {
                /*
                 * Later production QHT7040 has same changes as QHT7140, so
@@ -1528,6 +1538,24 @@ static int ipath_ht_early_init(struct ipath_devdata *dd)
        return 0;
 }
 
+
+static int ipath_ht_txe_recover(struct ipath_devdata *dd)
+{
+       /* returns 1 if recovery was started, 0 once the limit is reached */
+       int attempts = ++ipath_stats.sps_txeparity;
+       if (attempts == IPATH_MAX_PARITY_ATTEMPTS) /* complain only once */
+               ipath_dev_err(dd, "Too many attempts to recover from "
+                       "TXE parity, giving up\n");
+       if (attempts >= IPATH_MAX_PARITY_ATTEMPTS)
+               return 0;
+       dev_info(&dd->pcidev->dev,
+               "Recovering from TXE PIO parity error\n");
+       /* rewrite == 1: rewrite the flagged buffers, then disarm them */
+       ipath_disarm_senderrbufs(dd, 1);
+       return 1;
+}
+
+
 /**
  * ipath_init_ht_get_base_info - set chip-specific flags for user code
  * @dd: the infinipath device
index 5c50383880f2baeb604f318038df298f4cdc2243..aa2b519443317d40ae7f7daeec1d9e3bdadaa601 100644 (file)
@@ -321,6 +321,12 @@ static const struct ipath_hwerror_msgs ipath_6120_hwerror_msgs[] = {
        INFINIPATH_HWE_MSG(SERDESPLLFAILED, "SerDes PLL"),
 };
 
+#define TXE_PIO_PARITY ((INFINIPATH_HWE_TXEMEMPARITYERR_PIOBUF | \
+                       INFINIPATH_HWE_TXEMEMPARITYERR_PIOPBC) \
+                       << INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT)
+
+static int ipath_pe_txe_recover(struct ipath_devdata *);
+
 /**
  * ipath_pe_handle_hwerrors - display hardware errors.
  * @dd: the infinipath device
@@ -394,25 +400,8 @@ static void ipath_pe_handle_hwerrors(struct ipath_devdata *dd, char *msg,
                 * occur if a processor speculative read is done to the PIO
                 * buffer while we are sending a packet, for example.
                 */
-               if (hwerrs & ((INFINIPATH_HWE_TXEMEMPARITYERR_PIOBUF |
-                              INFINIPATH_HWE_TXEMEMPARITYERR_PIOPBC)
-                             << INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT)) {
-                       ipath_stats.sps_txeparity++;
-                       ipath_dbg("Recovering from TXE parity error (%llu), "
-                                 "hwerrstatus=%llx\n",
-                                 (unsigned long long) ipath_stats.sps_txeparity,
-                                 (unsigned long long) hwerrs);
-                       ipath_disarm_senderrbufs(dd);
-                       hwerrs &= ~((INFINIPATH_HWE_TXEMEMPARITYERR_PIOBUF |
-                                    INFINIPATH_HWE_TXEMEMPARITYERR_PIOPBC)
-                                   << INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT);
-                       if (!hwerrs) { /* else leave in freeze mode */
-                               ipath_write_kreg(dd,
-                                                dd->ipath_kregs->kr_control,
-                                                dd->ipath_control);
-                           return;
-                       }
-               }
+               if ((hwerrs & TXE_PIO_PARITY) && ipath_pe_txe_recover(dd))
+                       hwerrs &= ~TXE_PIO_PARITY;
                if (hwerrs) {
                        /*
                         * if any set that we aren't ignoring only make the
@@ -581,6 +570,8 @@ static void ipath_pe_init_hwerrors(struct ipath_devdata *dd)
 
        if (!(extsval & INFINIPATH_EXTS_MEMBIST_ENDTEST))
                ipath_dev_err(dd, "MemBIST did not complete!\n");
+       if (extsval & INFINIPATH_EXTS_MEMBIST_FOUND)
+               ipath_dbg("MemBIST corrected\n");
 
        val = ~0ULL;    /* barring bugs, all hwerrors become interrupts, */
 
@@ -1330,6 +1321,35 @@ static void ipath_pe_free_irq(struct ipath_devdata *dd)
        dd->ipath_irq = 0;
 }
 
+/*
+ * On platforms using this chip, and not having ordered WC stores, we
+ * can get TXE parity errors due to speculative reads to the PIO buffers,
+ * and this, due to a chip bug, can result in (many) false parity error
+ * reports.  So it's a debug print on those, and an info print on systems
+ * where the speculative reads don't occur.
+ * Because we can get lots of false errors, we have no upper limit
+ * on recovery attempts on those platforms.
+ */
+static int ipath_pe_txe_recover(struct ipath_devdata *dd)
+{
+       /* false errors are expected with unordered WC: never give up */
+       if (ipath_unordered_wc()) {
+               ipath_dbg("Recovering from TXE PIO parity error\n");
+       } else {
+               int attempts = ++ipath_stats.sps_txeparity;
+               if (attempts == IPATH_MAX_PARITY_ATTEMPTS)
+                       ipath_dev_err(dd, "Too many attempts to recover from "
+                               "TXE parity, giving up\n");
+               if (attempts >= IPATH_MAX_PARITY_ATTEMPTS)
+                       return 0; /* limit reached; reported once above */
+               dev_info(&dd->pcidev->dev,
+                       "Recovering from TXE PIO parity error\n");
+       }
+       /* rewrite == 1: rewrite the flagged buffers, then disarm them */
+       ipath_disarm_senderrbufs(dd, 1);
+       return 1;
+}
+
 /**
  * ipath_init_iba6120_funcs - set up the chip-specific function pointers
  * @dd: the infinipath device
index 1e77b55afe93d6fabf2a05bf8e036e89a75fbb88..72caa9f091f6c0623d158ca93f1a34d8626722d4 100644 (file)
@@ -590,6 +590,10 @@ static int init_housekeeping(struct ipath_devdata *dd,
                goto done;
        }
 
+
+       /* clear diagctrl register, in case diags were running and crashed */
+       ipath_write_kreg (dd, dd->ipath_kregs->kr_hwdiagctrl, 0);
+
        /* clear the initial reset flag, in case first driver load */
        ipath_write_kreg(dd, dd->ipath_kregs->kr_errorclear,
                         INFINIPATH_E_RESET);
index 24853310df1c68eff8295712abf923ae0636bd3e..45d033169c6e5fe765dc402f0981db2cdc5717d6 100644 (file)
 #include "ipath_verbs.h"
 #include "ipath_common.h"
 
+/*
+ * Rewrite PIO buffer pnum to clear a parity error.  Call only in freeze
+ * mode, and cancel the buffer afterwards.  writel() is used so that the
+ * compiler cannot drop or merge the stores to device memory.
+ */
+static void ipath_clrpiobuf(struct ipath_devdata *dd, u32 pnum)
+{
+       u32 __iomem *pbuf;
+       u32 dwcnt; /* dword count to write */
+       if (pnum < dd->ipath_piobcnt2k) {
+               pbuf = (u32 __iomem *) (dd->ipath_pio2kbase + pnum *
+                       dd->ipath_palign);
+               dwcnt = dd->ipath_piosize2k >> 2;
+       }
+       else {
+               pbuf = (u32 __iomem *) (dd->ipath_pio4kbase +
+                       (pnum - dd->ipath_piobcnt2k) * dd->ipath_4kalign);
+               dwcnt = dd->ipath_piosize4k >> 2;
+       }
+       dev_info(&dd->pcidev->dev,
+               "Rewrite PIO buffer %u, to recover from parity error\n",
+               pnum);
+       writel(dwcnt + 1, pbuf); /* no flush required, already in freeze */
+       while (--dwcnt)
+               writel(0, pbuf++); /* NOTE(review): 1st store re-hits dword 0 */
+}
+
 /*
  * Called when we might have an error that is specific to a particular
  * PIO buffer, and may need to cancel that buffer, so it can be re-used.
+ * If rewrite is true, and bits are set in the sendbuffererror registers,
+ * we'll write to the buffer, for error recovery on parity errors.
  */
-void ipath_disarm_senderrbufs(struct ipath_devdata *dd)
+void ipath_disarm_senderrbufs(struct ipath_devdata *dd, int rewrite)
 {
        u32 piobcnt;
        unsigned long sbuf[4];
@@ -74,8 +103,11 @@ void ipath_disarm_senderrbufs(struct ipath_devdata *dd)
                }
 
                for (i = 0; i < piobcnt; i++)
-                       if (test_bit(i, sbuf))
+                       if (test_bit(i, sbuf)) {
+                               if (rewrite)
+                                       ipath_clrpiobuf(dd, i);
                                ipath_disarm_piobufs(dd, i, 1);
+                       }
                dd->ipath_lastcancel = jiffies+3; /* no armlaunch for a bit */
        }
 }
@@ -114,7 +146,7 @@ static u64 handle_e_sum_errs(struct ipath_devdata *dd, ipath_err_t errs)
 {
        u64 ignore_this_time = 0;
 
-       ipath_disarm_senderrbufs(dd);
+       ipath_disarm_senderrbufs(dd, 0);
        if ((errs & E_SUM_LINK_PKTERRS) &&
            !(dd->ipath_flags & IPATH_LINKACTIVE)) {
                /*
index 5428c2619ba9ba2d234693a06eb25377fe712f9f..e900c2593f446b58ed41426c12b5538e629c2758 100644 (file)
@@ -590,7 +590,6 @@ int ipath_enable_wc(struct ipath_devdata *dd);
 void ipath_disable_wc(struct ipath_devdata *dd);
 int ipath_count_units(int *npresentp, int *nupp, u32 *maxportsp);
 void ipath_shutdown_device(struct ipath_devdata *);
-void ipath_disarm_senderrbufs(struct ipath_devdata *);
 
 struct file_operations;
 int ipath_cdev_init(int minor, char *name, const struct file_operations *fops,
@@ -713,6 +712,7 @@ void ipath_init_iba6120_funcs(struct ipath_devdata *);
 void ipath_init_iba6110_funcs(struct ipath_devdata *);
 void ipath_get_eeprom_info(struct ipath_devdata *);
 u64 ipath_snap_cntr(struct ipath_devdata *, ipath_creg);
+void ipath_disarm_senderrbufs(struct ipath_devdata *, int);
 
 /*
  * number of words used for protocol header if not set by ipath_userinit();
@@ -897,6 +897,8 @@ dma_addr_t ipath_map_single(struct pci_dev *, void *, size_t, int);
 
 extern unsigned ipath_debug; /* debugging bit mask */
 
+#define IPATH_MAX_PARITY_ATTEMPTS 10000 /* max times to try recovery */
+
 const char *ipath_get_unit_name(int unit);
 
 extern struct mutex ipath_mutex;
index 6e99eafdfd73efb6b15c352424d4e88f84705063..c182bcd62098cd180e4d73c7a4cecd41a3804ba3 100644 (file)
 #define INFINIPATH_XGXS_RX_POL_SHIFT 19
 #define INFINIPATH_XGXS_RX_POL_MASK 0xfULL
 
-#define INFINIPATH_RT_ADDR_MASK 0xFFFFFFFFFFULL        /* 40 bits valid */
-
-/* TID entries (memory), HT-only */
-#define INFINIPATH_RT_VALID 0x8000000000000000ULL
-#define INFINIPATH_RT_ADDR_SHIFT 0
-#define INFINIPATH_RT_BUFSIZE_MASK 0x3FFF
-#define INFINIPATH_RT_BUFSIZE_SHIFT 48
 
 /*
  * IPATH_PIO_MAXIBHDR is the max IB header size allowed for in our