sfc: Distinguish critical and non-critical over-temperature conditions
author Ben Hutchings <bhutchings@solarflare.com>
Thu, 2 Dec 2010 13:46:24 +0000 (13:46 +0000)
committer David S. Miller <davem@davemloft.net>
Fri, 3 Dec 2010 17:08:03 +0000 (09:08 -0800)
Set both the 'maximum' and critical temperature limits for LM87
hardware monitors on Falcon boards.  Do not shut down a port until the
critical temperature is reached, but warn as soon as the 'maximum'
temperature is reached.

Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
drivers/net/sfc/falcon_boards.c

index cfc29d767588f2aa9212b49d571d3d6bfaf09eb8..86180ee02ec07e50e037367fcf7cb6e33a5bd995 100644 (file)
 #define FALCON_BOARD_SFN4112F 0x52
 
 /* Board temperature is about 15°C above ambient when air flow is
- * limited. */
+ * limited.  The maximum acceptable ambient temperature varies
+ * depending on the PHY specifications but the critical temperature
+ * above which we should shut down to avoid damage is 80°C. */
 #define FALCON_BOARD_TEMP_BIAS 15
+#define FALCON_BOARD_TEMP_CRIT (80 + FALCON_BOARD_TEMP_BIAS)
 
 /* SFC4000 datasheet says: 'The maximum permitted junction temperature
  * is 125°C; the thermal design of the environment for the SFC4000
  * should aim to keep this well below 100°C.' */
+#define FALCON_JUNC_TEMP_MIN   0
 #define FALCON_JUNC_TEMP_MAX   90
+#define FALCON_JUNC_TEMP_CRIT  125
 
 /*****************************************************************************
  * Support for LM87 sensor chip used on several boards
  */
+#define LM87_REG_TEMP_HW_INT_LOCK      0x13
+#define LM87_REG_TEMP_HW_EXT_LOCK      0x14
+#define LM87_REG_TEMP_HW_INT           0x17
+#define LM87_REG_TEMP_HW_EXT           0x18
+#define LM87_REG_TEMP_EXT1             0x26
+#define LM87_REG_TEMP_INT              0x27
 #define LM87_REG_ALARMS1               0x41
 #define LM87_REG_ALARMS2               0x42
 #define LM87_IN_LIMITS(nr, _min, _max)                 \
 
 #if defined(CONFIG_SENSORS_LM87) || defined(CONFIG_SENSORS_LM87_MODULE)
 
+static int efx_poke_lm87(struct i2c_client *client, const u8 *reg_values)
+{
+       while (*reg_values) {
+               u8 reg = *reg_values++;
+               u8 value = *reg_values++;
+               int rc = i2c_smbus_write_byte_data(client, reg, value);
+               if (rc)
+                       return rc;
+       }
+       return 0;
+}
+
+static const u8 falcon_lm87_common_regs[] = {
+       LM87_REG_TEMP_HW_INT_LOCK, FALCON_BOARD_TEMP_CRIT,
+       LM87_REG_TEMP_HW_INT, FALCON_BOARD_TEMP_CRIT,
+       LM87_TEMP_EXT1_LIMITS(FALCON_JUNC_TEMP_MIN, FALCON_JUNC_TEMP_MAX),
+       LM87_REG_TEMP_HW_EXT_LOCK, FALCON_JUNC_TEMP_CRIT,
+       LM87_REG_TEMP_HW_EXT, FALCON_JUNC_TEMP_CRIT,
+       0
+};
+
 static int efx_init_lm87(struct efx_nic *efx, struct i2c_board_info *info,
                         const u8 *reg_values)
 {
@@ -67,13 +99,12 @@ static int efx_init_lm87(struct efx_nic *efx, struct i2c_board_info *info,
        if (!client)
                return -EIO;
 
-       while (*reg_values) {
-               u8 reg = *reg_values++;
-               u8 value = *reg_values++;
-               rc = i2c_smbus_write_byte_data(client, reg, value);
-               if (rc)
-                       goto err;
-       }
+       rc = efx_poke_lm87(client, reg_values);
+       if (rc)
+               goto err;
+       rc = efx_poke_lm87(client, falcon_lm87_common_regs);
+       if (rc)
+               goto err;
 
        board->hwmon_client = client;
        return 0;
@@ -91,36 +122,56 @@ static void efx_fini_lm87(struct efx_nic *efx)
 static int efx_check_lm87(struct efx_nic *efx, unsigned mask)
 {
        struct i2c_client *client = falcon_board(efx)->hwmon_client;
-       s32 alarms1, alarms2;
+       bool temp_crit, elec_fault, is_failure;
+       u16 alarms;
+       s32 reg;
 
        /* If link is up then do not monitor temperature */
        if (EFX_WORKAROUND_7884(efx) && efx->link_state.up)
                return 0;
 
-       alarms1 = i2c_smbus_read_byte_data(client, LM87_REG_ALARMS1);
-       alarms2 = i2c_smbus_read_byte_data(client, LM87_REG_ALARMS2);
-       if (alarms1 < 0)
-               return alarms1;
-       if (alarms2 < 0)
-               return alarms2;
-       alarms1 &= mask;
-       alarms2 &= mask >> 8;
-       if (alarms1 || alarms2) {
+       reg = i2c_smbus_read_byte_data(client, LM87_REG_ALARMS1);
+       if (reg < 0)
+               return reg;
+       alarms = reg;
+       reg = i2c_smbus_read_byte_data(client, LM87_REG_ALARMS2);
+       if (reg < 0)
+               return reg;
+       alarms |= reg << 8;
+       alarms &= mask;
+
+       temp_crit = false;
+       if (alarms & LM87_ALARM_TEMP_INT) {
+               reg = i2c_smbus_read_byte_data(client, LM87_REG_TEMP_INT);
+               if (reg < 0)
+                       return reg;
+               if (reg > FALCON_BOARD_TEMP_CRIT)
+                       temp_crit = true;
+       }
+       if (alarms & LM87_ALARM_TEMP_EXT1) {
+               reg = i2c_smbus_read_byte_data(client, LM87_REG_TEMP_EXT1);
+               if (reg < 0)
+                       return reg;
+               if (reg > FALCON_JUNC_TEMP_CRIT)
+                       temp_crit = true;
+       }
+       elec_fault = alarms & ~(LM87_ALARM_TEMP_INT | LM87_ALARM_TEMP_EXT1);
+       is_failure = temp_crit || elec_fault;
+
+       if (alarms)
                netif_err(efx, hw, efx->net_dev,
-                         "LM87 detected a hardware failure (status %02x:%02x)"
-                         "%s%s%s\n",
-                         alarms1, alarms2,
-                         (alarms1 & LM87_ALARM_TEMP_INT) ?
+                         "LM87 detected a hardware %s (status %02x:%02x)"
+                         "%s%s%s%s\n",
+                         is_failure ? "failure" : "problem",
+                         alarms & 0xff, alarms >> 8,
+                         (alarms & LM87_ALARM_TEMP_INT) ?
                          "; board is overheating" : "",
-                         (alarms1 & LM87_ALARM_TEMP_EXT1) ?
+                         (alarms & LM87_ALARM_TEMP_EXT1) ?
                          "; controller is overheating" : "",
-                         (alarms1 & ~(LM87_ALARM_TEMP_INT | LM87_ALARM_TEMP_EXT1)
-                          || alarms2) ?
-                         "; electrical fault" : "");
-               return -ERANGE;
-       }
+                         temp_crit ? "; reached critical temperature" : "",
+                         elec_fault ? "; electrical fault" : "");
 
-       return 0;
+       return is_failure ? -ERANGE : 0;
 }
 
 #else /* !CONFIG_SENSORS_LM87 */