len = snprintf(buf, sizeof(buf),
"wedged : %d\n",
- atomic_read(&dev_priv->gpu_error.wedged));
+ atomic_read(&dev_priv->gpu_error.reset_counter));
if (len > sizeof(buf))
len = sizeof(buf);
/* Protected by the above dev->gpu_error.lock. */
struct drm_i915_error_state *first_error;
struct work_struct work;
- struct completion completion;
unsigned long last_reset;
- atomic_t wedged;
+ /**
+ * State variable controlling the reset flow
+ *
+ * Upper bits are for the reset counter.
+ *
+ * Lowest bit controls the reset state machine: Set means a reset is in
+ * progress. This state will (presuming we don't have any bugs) decay
+ * into either unset (successful reset) or the special WEDGED value (hw
+ * terminally sour). All waiters on the reset_queue will be woken when
+ * that happens.
+ */
+ atomic_t reset_counter;
+
+ /**
+ * Special values/flags for reset_counter
+ *
+ * Note that the code relies on
+ * I915_WEDGED & I915_RESET_IN_PROGRESS_FLAG
+ * being true.
+ */
+#define I915_RESET_IN_PROGRESS_FLAG 1
+#define I915_WEDGED 0xffffffff
+
+ /**
+ * Waitqueue to signal when the reset has completed. Used by clients
+ * that wait for dev_priv->mm.wedged to settle.
+ */
+ wait_queue_head_t reset_queue;
/* For gpu hang simulation. */
unsigned int stop_rings;
void i915_gem_retire_requests_ring(struct intel_ring_buffer *ring);
int __must_check i915_gem_check_wedge(struct i915_gpu_error *error,
bool interruptible);
+static inline bool i915_reset_in_progress(struct i915_gpu_error *error)
+{
+ return unlikely(atomic_read(&error->reset_counter)
+ & I915_RESET_IN_PROGRESS_FLAG);
+}
+
+static inline bool i915_terminally_wedged(struct i915_gpu_error *error)
+{
+ return atomic_read(&error->reset_counter) == I915_WEDGED;
+}
void i915_gem_reset(struct drm_device *dev);
void i915_gem_clflush_object(struct drm_i915_gem_object *obj);
static int
i915_gem_wait_for_error(struct i915_gpu_error *error)
{
- struct completion *x = &error->completion;
- unsigned long flags;
int ret;
- if (!atomic_read(&error->wedged))
+#define EXIT_COND (!i915_reset_in_progress(error))
+ if (EXIT_COND)
return 0;
+ /* GPU is already declared terminally dead, give up. */
+ if (i915_terminally_wedged(error))
+ return -EIO;
+
/*
* Only wait 10 seconds for the gpu reset to complete to avoid hanging
* userspace. If it takes that long something really bad is going on and
* we should simply try to bail out and fail as gracefully as possible.
*/
- ret = wait_for_completion_interruptible_timeout(x, 10*HZ);
+ ret = wait_event_interruptible_timeout(error->reset_queue,
+ EXIT_COND,
+ 10*HZ);
if (ret == 0) {
DRM_ERROR("Timed out waiting for the gpu reset to complete\n");
return -EIO;
} else if (ret < 0) {
return ret;
}
+#undef EXIT_COND
- if (atomic_read(&error->wedged)) {
- /* GPU is hung, bump the completion count to account for
- * the token we just consumed so that we never hit zero and
- * end up waiting upon a subsequent completion event that
- * will never happen.
- */
- spin_lock_irqsave(&x->wait.lock, flags);
- x->done++;
- spin_unlock_irqrestore(&x->wait.lock, flags);
- }
return 0;
}
i915_gem_check_wedge(struct i915_gpu_error *error,
bool interruptible)
{
- if (atomic_read(&error->wedged)) {
- struct completion *x = &error->completion;
- bool recovery_complete;
- unsigned long flags;
-
- /* Give the error handler a chance to run. */
- spin_lock_irqsave(&x->wait.lock, flags);
- recovery_complete = x->done > 0;
- spin_unlock_irqrestore(&x->wait.lock, flags);
-
+ if (i915_reset_in_progress(error)) {
/* Non-interruptible callers can't handle -EAGAIN, hence return
* -EIO unconditionally for these. */
if (!interruptible)
return -EIO;
- /* Recovery complete, but still wedged means reset failure. */
- if (recovery_complete)
+ /* Recovery complete, but the reset failed ... */
+ if (i915_terminally_wedged(error))
return -EIO;
return -EAGAIN;
#define EXIT_COND \
(i915_seqno_passed(ring->get_seqno(ring, false), seqno) || \
- atomic_read(&dev_priv->gpu_error.wedged))
+ i915_reset_in_progress(&dev_priv->gpu_error))
do {
if (interruptible)
end = wait_event_interruptible_timeout(ring->irq_queue,
/* If this -EIO is due to a gpu hang, give the reset code a
* chance to clean up the mess. Otherwise return the proper
* SIGBUS. */
- if (!atomic_read(&dev_priv->gpu_error.wedged))
+ if (i915_terminally_wedged(&dev_priv->gpu_error))
return VM_FAULT_SIGBUS;
case -EAGAIN:
/* Give the error handler a chance to run and move the
if (drm_core_check_feature(dev, DRIVER_MODESET))
return 0;
- if (atomic_read(&dev_priv->gpu_error.wedged)) {
+ if (i915_reset_in_progress(&dev_priv->gpu_error)) {
DRM_ERROR("Reenabling wedged hardware, good luck\n");
- atomic_set(&dev_priv->gpu_error.wedged, 0);
+ atomic_set(&dev_priv->gpu_error.reset_counter, 0);
}
mutex_lock(&dev->struct_mutex);
INIT_LIST_HEAD(&dev_priv->fence_regs[i].lru_list);
INIT_DELAYED_WORK(&dev_priv->mm.retire_work,
i915_gem_retire_work_handler);
- init_completion(&dev_priv->gpu_error.completion);
+ init_waitqueue_head(&dev_priv->gpu_error.reset_queue);
/* On GEN3 we really need to make sure the ARB C3 LP bit is set */
if (IS_GEN3(dev)) {
*/
static void i915_error_work_func(struct work_struct *work)
{
- drm_i915_private_t *dev_priv = container_of(work, drm_i915_private_t,
- gpu_error.work);
+ struct i915_gpu_error *error = container_of(work, struct i915_gpu_error,
+ work);
+ drm_i915_private_t *dev_priv = container_of(error, drm_i915_private_t,
+ gpu_error);
struct drm_device *dev = dev_priv->dev;
char *error_event[] = { "ERROR=1", NULL };
char *reset_event[] = { "RESET=1", NULL };
kobject_uevent_env(&dev->primary->kdev.kobj, KOBJ_CHANGE, error_event);
- if (atomic_read(&dev_priv->gpu_error.wedged)) {
+ if (i915_reset_in_progress(error)) {
DRM_DEBUG_DRIVER("resetting chip\n");
kobject_uevent_env(&dev->primary->kdev.kobj, KOBJ_CHANGE, reset_event);
+
if (!i915_reset(dev)) {
- atomic_set(&dev_priv->gpu_error.wedged, 0);
+ atomic_set(&error->reset_counter, 0);
kobject_uevent_env(&dev->primary->kdev.kobj, KOBJ_CHANGE, reset_done_event);
+ } else {
+ atomic_set(&error->reset_counter, I915_WEDGED);
}
- complete_all(&dev_priv->gpu_error.completion);
+
+ wake_up_all(&dev_priv->gpu_error.reset_queue);
}
}
i915_report_and_clear_eir(dev);
if (wedged) {
- INIT_COMPLETION(dev_priv->gpu_error.completion);
- atomic_set(&dev_priv->gpu_error.wedged, 1);
+ atomic_set(&dev_priv->gpu_error.reset_counter,
+ I915_RESET_IN_PROGRESS_FLAG);
/*
- * Wakeup waiting processes so they don't hang
+ * Wakeup waiting processes so that the reset work item
+ * doesn't deadlock trying to grab various locks.
*/
for_each_ring(ring, dev_priv, i)
wake_up_all(&ring->irq_queue);
WARN_ON(waitqueue_active(&dev_priv->pending_flip_queue));
wait_event(dev_priv->pending_flip_queue,
- atomic_read(&dev_priv->gpu_error.wedged) ||
+ i915_reset_in_progress(&dev_priv->gpu_error) ||
atomic_read(&obj->pending_flip) == 0);
/* Big Hammer, we also need to ensure that any pending
unsigned long flags;
bool pending;
- if (atomic_read(&dev_priv->gpu_error.wedged))
+ if (i915_reset_in_progress(&dev_priv->gpu_error))
return false;
spin_lock_irqsave(&dev->event_lock, flags);