drm/i915: Convert hangcheck from a timer into a delayed work item
authorChris Wilson <chris@chris-wilson.co.uk>
Mon, 26 Jan 2015 16:03:03 +0000 (18:03 +0200)
committerDaniel Vetter <daniel.vetter@ffwll.ch>
Wed, 28 Jan 2015 16:22:12 +0000 (17:22 +0100)
When run as a timer, i915_hangcheck_elapsed() must adhere to all the
rules of running in a softirq context. This is advantageous to us as we
want to minimise the risk that a driver bug will prevent us from
detecting a hung GPU. However, that is irrelevant if the driver bug
prevents us from resetting and recovering. Still it is prudent not to
rely on mutexes inside the checker, but given the coarseness of
dev->struct_mutex doing so is extremely hard.

Give in and run from a work queue, i.e. outside of softirq.

v2: Use own workqueue to avoid deadlocks (Daniel)
    Cleanup commit msg and add comment to i915_queue_hangcheck() (Chris)

Cc: Jani Nikula <jani.nikula@intel.com>
Cc: Daniel Vetter <dnaiel.vetter@ffwll.chm>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> (v1)
Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
[danvet: Remove accidental kerneldoc comment starter, to appease the 0
day builder.]
Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
drivers/gpu/drm/i915/i915_dma.c
drivers/gpu/drm/i915/i915_drv.c
drivers/gpu/drm/i915/i915_drv.h
drivers/gpu/drm/i915/i915_gem.c
drivers/gpu/drm/i915/i915_irq.c

index 51e8fe5f18135ce50459ca7e281fe7ed32a2198c..6eaf79504b582b977545eaf740b9115c59ce9562 100644 (file)
@@ -790,6 +790,14 @@ int i915_driver_load(struct drm_device *dev, unsigned long flags)
                goto out_freewq;
        }
 
+       dev_priv->gpu_error.hangcheck_wq =
+               alloc_ordered_workqueue("i915-hangcheck", 0);
+       if (dev_priv->gpu_error.hangcheck_wq == NULL) {
+               DRM_ERROR("Failed to create our hangcheck workqueue.\n");
+               ret = -ENOMEM;
+               goto out_freedpwq;
+       }
+
        intel_irq_init(dev_priv);
        intel_uncore_sanitize(dev);
 
@@ -864,6 +872,8 @@ out_gem_unload:
        intel_teardown_gmbus(dev);
        intel_teardown_mchbar(dev);
        pm_qos_remove_request(&dev_priv->pm_qos);
+       destroy_workqueue(dev_priv->gpu_error.hangcheck_wq);
+out_freedpwq:
        destroy_workqueue(dev_priv->dp_wq);
 out_freewq:
        destroy_workqueue(dev_priv->wq);
@@ -934,7 +944,7 @@ int i915_driver_unload(struct drm_device *dev)
        }
 
        /* Free error state after interrupts are fully disabled. */
-       del_timer_sync(&dev_priv->gpu_error.hangcheck_timer);
+       cancel_delayed_work_sync(&dev_priv->gpu_error.hangcheck_work);
        cancel_work_sync(&dev_priv->gpu_error.work);
        i915_destroy_error_state(dev);
 
@@ -960,6 +970,7 @@ int i915_driver_unload(struct drm_device *dev)
 
        destroy_workqueue(dev_priv->dp_wq);
        destroy_workqueue(dev_priv->wq);
+       destroy_workqueue(dev_priv->gpu_error.hangcheck_wq);
        pm_qos_remove_request(&dev_priv->pm_qos);
 
        i915_global_gtt_cleanup(dev);
index 9da4e60bdc7af011115e38d3856038de6375a436..5f50e7033ef73cb6f8bc013e3743a9682954b9c9 100644 (file)
@@ -1402,7 +1402,7 @@ static int intel_runtime_suspend(struct device *device)
                return ret;
        }
 
-       del_timer_sync(&dev_priv->gpu_error.hangcheck_timer);
+       cancel_delayed_work_sync(&dev_priv->gpu_error.hangcheck_work);
        intel_uncore_forcewake_reset(dev, false);
        dev_priv->pm.suspended = true;
 
index 760d239a73f3a3683204645d415f28b5984e4830..7aee7d5d90e25757ca457c836e51dbcba15d501a 100644 (file)
@@ -1345,7 +1345,8 @@ struct i915_gpu_error {
        /* Hang gpu twice in this window and your context gets banned */
 #define DRM_I915_CTX_BAN_PERIOD DIV_ROUND_UP(8*DRM_I915_HANGCHECK_PERIOD, 1000)
 
-       struct timer_list hangcheck_timer;
+       struct workqueue_struct *hangcheck_wq;
+       struct delayed_work hangcheck_work;
 
        /* For reset and error_state handling. */
        spinlock_t lock;
index 9c7c95a8eae4e362aeabf1cdccea83a74db027c8..361d18b5223a41e71d206d9fe997958fba6e0970 100644 (file)
@@ -4608,7 +4608,7 @@ i915_gem_suspend(struct drm_device *dev)
        i915_gem_stop_ringbuffers(dev);
        mutex_unlock(&dev->struct_mutex);
 
-       del_timer_sync(&dev_priv->gpu_error.hangcheck_timer);
+       cancel_delayed_work_sync(&dev_priv->gpu_error.hangcheck_work);
        cancel_delayed_work_sync(&dev_priv->mm.retire_work);
        flush_delayed_work(&dev_priv->mm.idle_work);
 
index 2399eaed2ca370f6cc276ce28f50adefb59f9f7d..23bfe2232b6a830d195a153ac858d0f8f966efcc 100644 (file)
@@ -2974,7 +2974,7 @@ ring_stuck(struct intel_engine_cs *ring, u64 acthd)
        return HANGCHECK_HUNG;
 }
 
-/**
+/*
  * This is called when the chip hasn't reported back with completed
  * batchbuffers in a long time. We keep track per ring seqno progress and
  * if there are no progress, hangcheck score for that ring is increased.
@@ -2982,10 +2982,12 @@ ring_stuck(struct intel_engine_cs *ring, u64 acthd)
  * we kick the ring. If we see no progress on three subsequent calls
  * we assume chip is wedged and try to fix it by resetting the chip.
  */
-static void i915_hangcheck_elapsed(unsigned long data)
+static void i915_hangcheck_elapsed(struct work_struct *work)
 {
-       struct drm_device *dev = (struct drm_device *)data;
-       struct drm_i915_private *dev_priv = dev->dev_private;
+       struct drm_i915_private *dev_priv =
+               container_of(work, typeof(*dev_priv),
+                            gpu_error.hangcheck_work.work);
+       struct drm_device *dev = dev_priv->dev;
        struct intel_engine_cs *ring;
        int i;
        int busy_count = 0, rings_hung = 0;
@@ -3099,17 +3101,18 @@ static void i915_hangcheck_elapsed(unsigned long data)
 
 void i915_queue_hangcheck(struct drm_device *dev)
 {
-       struct drm_i915_private *dev_priv = dev->dev_private;
-       struct timer_list *timer = &dev_priv->gpu_error.hangcheck_timer;
+       struct i915_gpu_error *e = &to_i915(dev)->gpu_error;
 
        if (!i915.enable_hangcheck)
                return;
 
-       /* Don't continually defer the hangcheck, but make sure it is active */
-       if (timer_pending(timer))
-               return;
-       mod_timer(timer,
-                 round_jiffies_up(jiffies + DRM_I915_HANGCHECK_JIFFIES));
+       /* Don't continually defer the hangcheck so that it is always run at
+        * least once after work has been scheduled on any ring. Otherwise,
+        * we will ignore a hung ring if a second ring is kept busy.
+        */
+
+       queue_delayed_work(e->hangcheck_wq, &e->hangcheck_work,
+                          round_jiffies_up_relative(DRM_I915_HANGCHECK_JIFFIES));
 }
 
 static void ibx_irq_reset(struct drm_device *dev)
@@ -4353,9 +4356,8 @@ void intel_irq_init(struct drm_i915_private *dev_priv)
        else
                dev_priv->pm_rps_events = GEN6_PM_RPS_EVENTS;
 
-       setup_timer(&dev_priv->gpu_error.hangcheck_timer,
-                   i915_hangcheck_elapsed,
-                   (unsigned long) dev);
+       INIT_DELAYED_WORK(&dev_priv->gpu_error.hangcheck_work,
+                         i915_hangcheck_elapsed);
        INIT_DELAYED_WORK(&dev_priv->hotplug_reenable_work,
                          intel_hpd_irq_reenable_work);