drm/i915: Only slightly increment hangcheck score if we succesfully kick a ring
authorChris Wilson <chris@chris-wilson.co.uk>
Mon, 10 Jun 2013 10:20:20 +0000 (11:20 +0100)
committerDaniel Vetter <daniel.vetter@ffwll.ch>
Tue, 11 Jun 2013 09:44:00 +0000 (11:44 +0200)
After kicking a ring, it should be free to make progress again and so
should not be accused of being stuck until hangcheck fires once more. In
order to catch a denial-of-service within a batch or across multiple
batches, we still do increment the hangcheck score - just not as
severely so that it takes multiple kicks to fail.

This should address part of Ben's justified criticism of

commit 05407ff889ceebe383aa5907219f86582ef96b72
Author: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Date:   Thu May 30 09:04:29 2013 +0300

    drm/i915: detect hang using per ring hangcheck_score

"There's also another corner case on the kick. If the seqno = 2
(though not stuck), and on the 3rd hangcheck, the ring is stuck, and
we try to kick it... we don't actually try to find out if the kick
helped."

v2: Make sure we catch DoS attempts with batches full of invalid WAITs.
v3: Preserve the ability to detect loops by always charging the ring
    if it is busy on the same request.
v4: Make sure we queue another check if on a new batch

References: https://bugs.freedesktop.org/show_bug.cgi?id=65394
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Cc: Ben Widawsky <ben@bwidawsk.net>
Reviewed-by: Mika Kuoppala <mika.kuoppala@intel.com>
Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
drivers/gpu/drm/i915/i915_irq.c

index c482e8ae58dcf0ddb1d33864e4fd18ef616b7f84..6a757691488e7d711b35f3076fe24a89f48318b5 100644 (file)
@@ -2314,21 +2314,11 @@ ring_last_seqno(struct intel_ring_buffer *ring)
                          struct drm_i915_gem_request, list)->seqno;
 }
 
-static bool i915_hangcheck_ring_idle(struct intel_ring_buffer *ring,
-                                    u32 ring_seqno, bool *err)
-{
-       if (list_empty(&ring->request_list) ||
-           i915_seqno_passed(ring_seqno, ring_last_seqno(ring))) {
-               /* Issue a wake-up to catch stuck h/w. */
-               if (waitqueue_active(&ring->irq_queue)) {
-                       DRM_ERROR("Hangcheck timer elapsed... %s idle\n",
-                                 ring->name);
-                       wake_up_all(&ring->irq_queue);
-                       *err = true;
-               }
-               return true;
-       }
-       return false;
+static bool
+ring_idle(struct intel_ring_buffer *ring, u32 seqno)
+{
+       return (list_empty(&ring->request_list) ||
+               i915_seqno_passed(seqno, ring_last_seqno(ring)));
 }
 
 static bool semaphore_passed(struct intel_ring_buffer *ring)
@@ -2362,16 +2352,26 @@ static bool semaphore_passed(struct intel_ring_buffer *ring)
                                 ioread32(ring->virtual_start+acthd+4)+1);
 }
 
-static bool kick_ring(struct intel_ring_buffer *ring)
+static bool ring_hung(struct intel_ring_buffer *ring)
 {
        struct drm_device *dev = ring->dev;
        struct drm_i915_private *dev_priv = dev->dev_private;
-       u32 tmp = I915_READ_CTL(ring);
+       u32 tmp;
+
+       if (IS_GEN2(dev))
+               return true;
+
+       /* Is the chip hanging on a WAIT_FOR_EVENT?
+        * If so we can simply poke the RB_WAIT bit
+        * and break the hang. This should work on
+        * all but the second generation chipsets.
+        */
+       tmp = I915_READ_CTL(ring);
        if (tmp & RING_WAIT) {
                DRM_ERROR("Kicking stuck wait on %s\n",
                          ring->name);
                I915_WRITE_CTL(ring, tmp);
-               return true;
+               return false;
        }
 
        if (INTEL_INFO(dev)->gen >= 6 &&
@@ -2380,22 +2380,10 @@ static bool kick_ring(struct intel_ring_buffer *ring)
                DRM_ERROR("Kicking stuck semaphore on %s\n",
                          ring->name);
                I915_WRITE_CTL(ring, tmp);
-               return true;
-       }
-       return false;
-}
-
-static bool i915_hangcheck_ring_hung(struct intel_ring_buffer *ring)
-{
-       if (IS_GEN2(ring->dev))
                return false;
+       }
 
-       /* Is the chip hanging on a WAIT_FOR_EVENT?
-        * If so we can simply poke the RB_WAIT bit
-        * and break the hang. This should work on
-        * all but the second generation chipsets.
-        */
-       return !kick_ring(ring);
+       return true;
 }
 
 /**
@@ -2413,45 +2401,63 @@ void i915_hangcheck_elapsed(unsigned long data)
        struct intel_ring_buffer *ring;
        int i;
        int busy_count = 0, rings_hung = 0;
-       bool stuck[I915_NUM_RINGS];
+       bool stuck[I915_NUM_RINGS] = { 0 };
+#define BUSY 1
+#define KICK 5
+#define HUNG 20
+#define FIRE 30
 
        if (!i915_enable_hangcheck)
                return;
 
        for_each_ring(ring, dev_priv, i) {
                u32 seqno, acthd;
-               bool idle, err = false;
+               bool busy = true;
 
                seqno = ring->get_seqno(ring, false);
                acthd = intel_ring_get_active_head(ring);
-               idle = i915_hangcheck_ring_idle(ring, seqno, &err);
-               stuck[i] = ring->hangcheck.acthd == acthd;
-
-               if (idle) {
-                       if (err)
-                               ring->hangcheck.score += 2;
-                       else
-                               ring->hangcheck.score = 0;
-               } else {
-                       busy_count++;
 
-                       if (ring->hangcheck.seqno == seqno) {
-                               ring->hangcheck.score++;
-
-                               /* Kick ring if stuck*/
-                               if (stuck[i])
-                                       i915_hangcheck_ring_hung(ring);
+               if (ring->hangcheck.seqno == seqno) {
+                       if (ring_idle(ring, seqno)) {
+                               if (waitqueue_active(&ring->irq_queue)) {
+                                       /* Issue a wake-up to catch stuck h/w. */
+                                       DRM_ERROR("Hangcheck timer elapsed... %s idle\n",
+                                                 ring->name);
+                                       wake_up_all(&ring->irq_queue);
+                                       ring->hangcheck.score += HUNG;
+                               } else
+                                       busy = false;
                        } else {
-                               ring->hangcheck.score = 0;
+                               int score;
+
+                               stuck[i] = ring->hangcheck.acthd == acthd;
+                               if (stuck[i]) {
+                                       /* Every time we kick the ring, add a
+                                        * small increment to the hangcheck
+                                        * score so that we can catch a
+                                        * batch that is repeatedly kicked.
+                                        */
+                                       score = ring_hung(ring) ? HUNG : KICK;
+                               } else
+                                       score = BUSY;
+
+                               ring->hangcheck.score += score;
                        }
+               } else {
+                       /* Gradually reduce the count so that we catch DoS
+                        * attempts across multiple batches.
+                        */
+                       if (ring->hangcheck.score > 0)
+                               ring->hangcheck.score--;
                }
 
                ring->hangcheck.seqno = seqno;
                ring->hangcheck.acthd = acthd;
+               busy_count += busy;
        }
 
        for_each_ring(ring, dev_priv, i) {
-               if (ring->hangcheck.score > 2) {
+               if (ring->hangcheck.score > FIRE) {
                        rings_hung++;
                        DRM_ERROR("%s: %s on %s 0x%x\n", ring->name,
                                  stuck[i] ? "stuck" : "no progress",