drm/radeon: rework gpu lockup detection and processing
authorChristian König <deathsimple@vodafone.de>
Wed, 2 May 2012 13:11:13 +0000 (15:11 +0200)
committerDave Airlie <airlied@redhat.com>
Thu, 3 May 2012 08:16:12 +0000 (09:16 +0100)
Previously multiple rings could trigger multiple GPU
resets at the same time.

Signed-off-by: Christian König <deathsimple@vodafone.de>
Reviewed-by: Jerome Glisse <jglisse@redhat.com>
Signed-off-by: Dave Airlie <airlied@redhat.com>
drivers/gpu/drm/radeon/radeon.h
drivers/gpu/drm/radeon/radeon_fence.c

index 65855af290a6f827a75b94570607707b5d2d1aeb..35db5bdceda99b70e121d6fc7af61214b6f7320f 100644 (file)
@@ -255,8 +255,7 @@ struct radeon_fence_driver {
        volatile uint32_t               *cpu_addr;
        atomic_t                        seq;
        uint32_t                        last_seq;
-       unsigned long                   last_jiffies;
-       unsigned long                   last_timeout;
+       unsigned long                   last_activity;
        wait_queue_head_t               queue;
        struct list_head                created;
        struct list_head                emitted;
index 36c411fd0fed9dbbbf8a38489d76a2d626b17a88..1a9765aae710eb50c0899ff34baeb8fbab96e9ec 100644 (file)
@@ -74,6 +74,10 @@ int radeon_fence_emit(struct radeon_device *rdev, struct radeon_fence *fence)
        radeon_fence_ring_emit(rdev, fence->ring, fence);
        trace_radeon_fence_emit(rdev->ddev, fence->seq);
        fence->emitted = true;
+       /* are we the first fence on a previously idle ring? */
+       if (list_empty(&rdev->fence_drv[fence->ring].emitted)) {
+               rdev->fence_drv[fence->ring].last_activity = jiffies;
+       }
        list_move_tail(&fence->list, &rdev->fence_drv[fence->ring].emitted);
        write_unlock_irqrestore(&rdev->fence_lock, irq_flags);
        return 0;
@@ -85,34 +89,14 @@ static bool radeon_fence_poll_locked(struct radeon_device *rdev, int ring)
        struct list_head *i, *n;
        uint32_t seq;
        bool wake = false;
-       unsigned long cjiffies;
 
        seq = radeon_fence_read(rdev, ring);
-       if (seq != rdev->fence_drv[ring].last_seq) {
-               rdev->fence_drv[ring].last_seq = seq;
-               rdev->fence_drv[ring].last_jiffies = jiffies;
-               rdev->fence_drv[ring].last_timeout = RADEON_FENCE_JIFFIES_TIMEOUT;
-       } else {
-               cjiffies = jiffies;
-               if (time_after(cjiffies, rdev->fence_drv[ring].last_jiffies)) {
-                       cjiffies -= rdev->fence_drv[ring].last_jiffies;
-                       if (time_after(rdev->fence_drv[ring].last_timeout, cjiffies)) {
-                               /* update the timeout */
-                               rdev->fence_drv[ring].last_timeout -= cjiffies;
-                       } else {
-                               /* the 500ms timeout is elapsed we should test
-                                * for GPU lockup
-                                */
-                               rdev->fence_drv[ring].last_timeout = 1;
-                       }
-               } else {
-                       /* wrap around update last jiffies, we will just wait
-                        * a little longer
-                        */
-                       rdev->fence_drv[ring].last_jiffies = cjiffies;
-               }
+       if (seq == rdev->fence_drv[ring].last_seq)
                return false;
-       }
+
+       rdev->fence_drv[ring].last_seq = seq;
+       rdev->fence_drv[ring].last_activity = jiffies;
+
        n = NULL;
        list_for_each(i, &rdev->fence_drv[ring].emitted) {
                fence = list_entry(i, struct radeon_fence, list);
@@ -207,66 +191,84 @@ int radeon_fence_wait(struct radeon_fence *fence, bool intr)
        struct radeon_device *rdev;
        unsigned long irq_flags, timeout;
        u32 seq;
-       int r;
+       int i, r;
+       bool signaled;
 
        if (fence == NULL) {
                WARN(1, "Querying an invalid fence : %p !\n", fence);
-               return 0;
+               return -EINVAL;
        }
+
        rdev = fence->rdev;
-       if (radeon_fence_signaled(fence)) {
-               return 0;
-       }
-       timeout = rdev->fence_drv[fence->ring].last_timeout;
-retry:
-       /* save current sequence used to check for GPU lockup */
-       seq = rdev->fence_drv[fence->ring].last_seq;
-       trace_radeon_fence_wait_begin(rdev->ddev, seq);
-       if (intr) {
+       signaled = radeon_fence_signaled(fence);
+       while (!signaled) {
+               read_lock_irqsave(&rdev->fence_lock, irq_flags);
+               timeout = jiffies - RADEON_FENCE_JIFFIES_TIMEOUT;
+               if (time_after(rdev->fence_drv[fence->ring].last_activity, timeout)) {
+                       /* the normal case, timeout is somewhere before last_activity */
+                       timeout = rdev->fence_drv[fence->ring].last_activity - timeout;
+               } else {
+                       /* either jiffies wrapped around, or no fence was signaled in the last 500ms
+                        * anyway we will just wait for the minimum amount and then check for a lockup */
+                       timeout = 1;
+               }
+               /* save current sequence value used to check for GPU lockups */
+               seq = rdev->fence_drv[fence->ring].last_seq;
+               read_unlock_irqrestore(&rdev->fence_lock, irq_flags);
+
+               trace_radeon_fence_wait_begin(rdev->ddev, seq);
                radeon_irq_kms_sw_irq_get(rdev, fence->ring);
-               r = wait_event_interruptible_timeout(rdev->fence_drv[fence->ring].queue,
-                               radeon_fence_signaled(fence), timeout);
+               if (intr) {
+                       r = wait_event_interruptible_timeout(
+                               rdev->fence_drv[fence->ring].queue,
+                               (signaled = radeon_fence_signaled(fence)), timeout);
+               } else {
+                       r = wait_event_timeout(
+                               rdev->fence_drv[fence->ring].queue,
+                               (signaled = radeon_fence_signaled(fence)), timeout);
+               }
                radeon_irq_kms_sw_irq_put(rdev, fence->ring);
                if (unlikely(r < 0)) {
                        return r;
                }
-       } else {
-               radeon_irq_kms_sw_irq_get(rdev, fence->ring);
-               r = wait_event_timeout(rdev->fence_drv[fence->ring].queue,
-                        radeon_fence_signaled(fence), timeout);
-               radeon_irq_kms_sw_irq_put(rdev, fence->ring);
-       }
-       trace_radeon_fence_wait_end(rdev->ddev, seq);
-       if (unlikely(!radeon_fence_signaled(fence))) {
-               /* we were interrupted for some reason and fence isn't
-                * isn't signaled yet, resume wait
-                */
-               if (r) {
-                       timeout = r;
-                       goto retry;
-               }
-               /* don't protect read access to rdev->fence_drv[t].last_seq
-                * if we experiencing a lockup the value doesn't change
-                */
-               if (seq == rdev->fence_drv[fence->ring].last_seq &&
-                   radeon_ring_is_lockup(rdev, fence->ring, &rdev->ring[fence->ring])) {
-
-                       /* good news we believe it's a lockup */
-                       printk(KERN_WARNING "GPU lockup (waiting for 0x%08X last fence id 0x%08X)\n",
-                            fence->seq, seq);
-
-                       /* mark the ring as not ready any more */
-                       rdev->ring[fence->ring].ready = false;
-                       r = radeon_gpu_reset(rdev);
-                       if (r)
-                               return r;
+               trace_radeon_fence_wait_end(rdev->ddev, seq);
+
+               if (unlikely(!signaled)) {
+                       /* we were interrupted for some reason and fence
+                        * isn't signaled yet, resume waiting */
+                       if (r) {
+                               continue;
+                       }
+
+                       write_lock_irqsave(&rdev->fence_lock, irq_flags);
+                       /* check if sequence value has changed since last_activity */
+                       if (seq != rdev->fence_drv[fence->ring].last_seq) {
+                               write_unlock_irqrestore(&rdev->fence_lock, irq_flags);
+                               continue;
+                       }
+
+                       /* change sequence value on all rings, so nobody else thinks there is a lockup */
+                       for (i = 0; i < RADEON_NUM_RINGS; ++i)
+                               rdev->fence_drv[i].last_seq -= 0x10000;
+                       write_unlock_irqrestore(&rdev->fence_lock, irq_flags);
+
+                       if (radeon_ring_is_lockup(rdev, fence->ring, &rdev->ring[fence->ring])) {
+
+                               /* good news we believe it's a lockup */
+                               printk(KERN_WARNING "GPU lockup (waiting for 0x%08X last fence id 0x%08X)\n",
+                                    fence->seq, seq);
+
+                               /* mark the ring as not ready any more */
+                               rdev->ring[fence->ring].ready = false;
+                               r = radeon_gpu_reset(rdev);
+                               if (r)
+                                       return r;
+
+                               write_lock_irqsave(&rdev->fence_lock, irq_flags);
+                               rdev->fence_drv[fence->ring].last_activity = jiffies;
+                               write_unlock_irqrestore(&rdev->fence_lock, irq_flags);
+                       }
                }
-               timeout = RADEON_FENCE_JIFFIES_TIMEOUT;
-               write_lock_irqsave(&rdev->fence_lock, irq_flags);
-               rdev->fence_drv[fence->ring].last_timeout = RADEON_FENCE_JIFFIES_TIMEOUT;
-               rdev->fence_drv[fence->ring].last_jiffies = jiffies;
-               write_unlock_irqrestore(&rdev->fence_lock, irq_flags);
-               goto retry;
        }
        return 0;
 }