drm/i915/skl: Implement WaDisableDgMirrorFixInHalfSliceChicken5:skl
diff --git a/drivers/gpu/drm/i915/intel_pm.c b/drivers/gpu/drm/i915/intel_pm.c
index b9edfd426a19716bd3e8ee463851b7c5e1a3a581..4f5dcf545c8989b0e167f865827e0fbb472eb2dc 100644
--- a/drivers/gpu/drm/i915/intel_pm.c
+++ b/drivers/gpu/drm/i915/intel_pm.c
  * i915.i915_enable_fbc parameter
  */
 
+static void gen9_init_clock_gating(struct drm_device *dev)
+{
+       struct drm_i915_private *dev_priv = dev->dev_private;
+
+       /*
+        * WaDisableSDEUnitClockGating:skl
+        * This seems to be a pre-production w/a.
+        */
+       I915_WRITE(GEN8_UCGCTL6, I915_READ(GEN8_UCGCTL6) |
+                  GEN8_SDEUNIT_CLOCK_GATE_DISABLE);
+
+       /*
+        * WaDisableDgMirrorFixInHalfSliceChicken5:skl
+        * This is a pre-production w/a.
+        */
+       I915_WRITE(GEN9_HALF_SLICE_CHICKEN5,
+                  I915_READ(GEN9_HALF_SLICE_CHICKEN5) &
+                  ~GEN9_DG_MIRROR_FIX_ENABLE);
+
+       /* Wa4x4STCOptimizationDisable:skl */
+       I915_WRITE(CACHE_MODE_1,
+                  _MASKED_BIT_ENABLE(GEN8_4x4_STC_OPTIMIZATION_DISABLE));
+}
+
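Aside (not part of the patch): the first two workarounds above use read-modify-write, while the CACHE_MODE_1 write relies on the masked-register convention, so no read is needed. A minimal sketch of the helpers as defined in i915_reg.h around this kernel version (verify against the tree you are on):

	/* Masked registers: the high 16 bits select which low bits the write updates. */
	#define _MASKED_BIT_ENABLE(a)	(((a) << 16) | (a))
	#define _MASKED_BIT_DISABLE(a)	((a) << 16)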
 static void i8xx_disable_fbc(struct drm_device *dev)
 {
        struct drm_i915_private *dev_priv = dev->dev_private;
@@ -872,7 +896,7 @@ void intel_set_memory_cxsr(struct drm_i915_private *dev_priv, bool enable)
  * A value of 5us seems to be a good balance; safe for very low end
  * platforms but not overly aggressive on lower latency configs.
  */
-static const int latency_ns = 5000;
+static const int pessimal_latency_ns = 5000;
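For context, a rough sketch (not part of this patch) of how the legacy single-pipe watermark code consumes this constant; the helper lives further down in this file and its exact body may vary between trees:

	/*
	 * intel_calculate_wm() approximately computes:
	 *   entries = ((clock_khz / 1000) * cpp * latency_ns) / 1000;
	 *   entries = DIV_ROUND_UP(entries, wm->cacheline_size);
	 *   wm      = fifo_size - (entries + wm->guard_size);
	 *
	 * e.g. a 100 MHz dot clock at 4 bytes per pixel with 5000 ns of
	 * latency fetches 100 * 4 * 5 = 2000 bytes, i.e. 32 cachelines of
	 * 64 bytes, leaving wm = fifo_size - (32 + guard_size).
	 */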
 
 static int i9xx_get_fifo_size(struct drm_device *dev, int plane)
 {
@@ -1001,13 +1025,20 @@ static const struct intel_watermark_params i915_wm_info = {
        .guard_size = 2,
        .cacheline_size = I915_FIFO_LINE_SIZE,
 };
-static const struct intel_watermark_params i830_wm_info = {
+static const struct intel_watermark_params i830_a_wm_info = {
        .fifo_size = I855GM_FIFO_SIZE,
        .max_wm = I915_MAX_WM,
        .default_wm = 1,
        .guard_size = 2,
        .cacheline_size = I830_FIFO_LINE_SIZE,
 };
+static const struct intel_watermark_params i830_bc_wm_info = {
+       .fifo_size = I855GM_FIFO_SIZE,
+       .max_wm = I915_MAX_WM/2,
+       .default_wm = 1,
+       .guard_size = 2,
+       .cacheline_size = I830_FIFO_LINE_SIZE,
+};
 static const struct intel_watermark_params i845_wm_info = {
        .fifo_size = I830_FIFO_SIZE,
        .max_wm = I915_MAX_WM,
@@ -1380,14 +1411,14 @@ static void valleyview_update_wm(struct drm_crtc *crtc)
        vlv_update_drain_latency(crtc);
 
        if (g4x_compute_wm0(dev, PIPE_A,
-                           &valleyview_wm_info, latency_ns,
-                           &valleyview_cursor_wm_info, latency_ns,
+                           &valleyview_wm_info, pessimal_latency_ns,
+                           &valleyview_cursor_wm_info, pessimal_latency_ns,
                            &planea_wm, &cursora_wm))
                enabled |= 1 << PIPE_A;
 
        if (g4x_compute_wm0(dev, PIPE_B,
-                           &valleyview_wm_info, latency_ns,
-                           &valleyview_cursor_wm_info, latency_ns,
+                           &valleyview_wm_info, pessimal_latency_ns,
+                           &valleyview_cursor_wm_info, pessimal_latency_ns,
                            &planeb_wm, &cursorb_wm))
                enabled |= 1 << PIPE_B;
 
@@ -1446,20 +1477,20 @@ static void cherryview_update_wm(struct drm_crtc *crtc)
        vlv_update_drain_latency(crtc);
 
        if (g4x_compute_wm0(dev, PIPE_A,
-                           &valleyview_wm_info, latency_ns,
-                           &valleyview_cursor_wm_info, latency_ns,
+                           &valleyview_wm_info, pessimal_latency_ns,
+                           &valleyview_cursor_wm_info, pessimal_latency_ns,
                            &planea_wm, &cursora_wm))
                enabled |= 1 << PIPE_A;
 
        if (g4x_compute_wm0(dev, PIPE_B,
-                           &valleyview_wm_info, latency_ns,
-                           &valleyview_cursor_wm_info, latency_ns,
+                           &valleyview_wm_info, pessimal_latency_ns,
+                           &valleyview_cursor_wm_info, pessimal_latency_ns,
                            &planeb_wm, &cursorb_wm))
                enabled |= 1 << PIPE_B;
 
        if (g4x_compute_wm0(dev, PIPE_C,
-                           &valleyview_wm_info, latency_ns,
-                           &valleyview_cursor_wm_info, latency_ns,
+                           &valleyview_wm_info, pessimal_latency_ns,
+                           &valleyview_cursor_wm_info, pessimal_latency_ns,
                            &planec_wm, &cursorc_wm))
                enabled |= 1 << PIPE_C;
 
@@ -1552,14 +1583,14 @@ static void g4x_update_wm(struct drm_crtc *crtc)
        bool cxsr_enabled;
 
        if (g4x_compute_wm0(dev, PIPE_A,
-                           &g4x_wm_info, latency_ns,
-                           &g4x_cursor_wm_info, latency_ns,
+                           &g4x_wm_info, pessimal_latency_ns,
+                           &g4x_cursor_wm_info, pessimal_latency_ns,
                            &planea_wm, &cursora_wm))
                enabled |= 1 << PIPE_A;
 
        if (g4x_compute_wm0(dev, PIPE_B,
-                           &g4x_wm_info, latency_ns,
-                           &g4x_cursor_wm_info, latency_ns,
+                           &g4x_wm_info, pessimal_latency_ns,
+                           &g4x_cursor_wm_info, pessimal_latency_ns,
                            &planeb_wm, &cursorb_wm))
                enabled |= 1 << PIPE_B;
 
@@ -1689,7 +1720,7 @@ static void i9xx_update_wm(struct drm_crtc *unused_crtc)
        else if (!IS_GEN2(dev))
                wm_info = &i915_wm_info;
        else
-               wm_info = &i830_wm_info;
+               wm_info = &i830_a_wm_info;
 
        fifo_size = dev_priv->display.get_fifo_size(dev, 0);
        crtc = intel_get_crtc_for_plane(dev, 0);
@@ -1702,10 +1733,16 @@ static void i9xx_update_wm(struct drm_crtc *unused_crtc)
                adjusted_mode = &to_intel_crtc(crtc)->config.adjusted_mode;
                planea_wm = intel_calculate_wm(adjusted_mode->crtc_clock,
                                               wm_info, fifo_size, cpp,
-                                              latency_ns);
+                                              pessimal_latency_ns);
                enabled = crtc;
-       } else
+       } else {
                planea_wm = fifo_size - wm_info->guard_size;
+               if (planea_wm > (long)wm_info->max_wm)
+                       planea_wm = wm_info->max_wm;
+       }
+
+       if (IS_GEN2(dev))
+               wm_info = &i830_bc_wm_info;
 
        fifo_size = dev_priv->display.get_fifo_size(dev, 1);
        crtc = intel_get_crtc_for_plane(dev, 1);
@@ -1718,13 +1755,16 @@ static void i9xx_update_wm(struct drm_crtc *unused_crtc)
                adjusted_mode = &to_intel_crtc(crtc)->config.adjusted_mode;
                planeb_wm = intel_calculate_wm(adjusted_mode->crtc_clock,
                                               wm_info, fifo_size, cpp,
-                                              latency_ns);
+                                              pessimal_latency_ns);
                if (enabled == NULL)
                        enabled = crtc;
                else
                        enabled = NULL;
-       } else
+       } else {
                planeb_wm = fifo_size - wm_info->guard_size;
+               if (planeb_wm > (long)wm_info->max_wm)
+                       planeb_wm = wm_info->max_wm;
+       }
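A hedged worked example of why the new clamps are needed, assuming the usual gen2 constants from i915_reg.h (I855GM_FIFO_SIZE = 127, I915_MAX_WM = 0x3f):

	/*
	 * With a plane disabled, fifo_size - wm_info->guard_size can be
	 * e.g. 127 - 2 = 125, which exceeds I915_MAX_WM (0x3f = 63), so the
	 * value is now clamped to wm_info->max_wm before being programmed.
	 * Note also that on gen2 the plane B calculation above switches to
	 * i830_bc_wm_info, whose max_wm is halved (31).
	 */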
 
        DRM_DEBUG_KMS("FIFO watermarks - A: %d, B: %d\n", planea_wm, planeb_wm);
 
@@ -1811,7 +1851,7 @@ static void i845_update_wm(struct drm_crtc *unused_crtc)
        planea_wm = intel_calculate_wm(adjusted_mode->crtc_clock,
                                       &i845_wm_info,
                                       dev_priv->display.get_fifo_size(dev, 0),
-                                      4, latency_ns);
+                                      4, pessimal_latency_ns);
        fwater_lo = I915_READ(FW_BLC) & ~0xfff;
        fwater_lo |= (3<<8) | planea_wm;
 
@@ -2258,7 +2298,6 @@ int ilk_wm_max_level(const struct drm_device *dev)
        else
                return 2;
 }
-
 static void intel_print_wm_latency(struct drm_device *dev,
                                   const char *name,
                                   const uint16_t wm[5])
@@ -3227,6 +3266,9 @@ static void gen6_set_rps_thresholds(struct drm_i915_private *dev_priv, u8 val)
 {
        int new_power;
 
+       if (dev_priv->rps.is_bdw_sw_turbo)
+               return;
+
        new_power = dev_priv->rps.power;
        switch (dev_priv->rps.power) {
        case LOW_POWER:
@@ -3434,8 +3476,11 @@ void gen6_rps_idle(struct drm_i915_private *dev_priv)
                        valleyview_set_rps(dev_priv->dev, dev_priv->rps.min_freq_softlimit);
                else if (IS_VALLEYVIEW(dev))
                        vlv_set_rps_idle(dev_priv);
-               else
+               else if (!dev_priv->rps.is_bdw_sw_turbo ||
+                        atomic_read(&dev_priv->rps.sw_turbo.flip_received)) {
                        gen6_set_rps(dev_priv->dev, dev_priv->rps.min_freq_softlimit);
+               }
+
                dev_priv->rps.last_adj = 0;
        }
        mutex_unlock(&dev_priv->rps.hw_lock);
@@ -3449,8 +3494,11 @@ void gen6_rps_boost(struct drm_i915_private *dev_priv)
        if (dev_priv->rps.enabled) {
                if (IS_VALLEYVIEW(dev))
                        valleyview_set_rps(dev_priv->dev, dev_priv->rps.max_freq_softlimit);
-               else
+               else if (!dev_priv->rps.is_bdw_sw_turbo ||
+                        atomic_read(&dev_priv->rps.sw_turbo.flip_received)) {
                        gen6_set_rps(dev_priv->dev, dev_priv->rps.max_freq_softlimit);
+               }
+
                dev_priv->rps.last_adj = 0;
        }
        mutex_unlock(&dev_priv->rps.hw_lock);
@@ -3469,6 +3517,10 @@ void valleyview_set_rps(struct drm_device *dev, u8 val)
                         dev_priv->rps.cur_freq,
                         vlv_gpu_freq(dev_priv, val), val);
 
+       if (WARN_ONCE(IS_CHERRYVIEW(dev) && (val & 1),
+                     "Odd GPU freq value\n"))
+               val &= ~1;
+
        if (val != dev_priv->rps.cur_freq)
                vlv_punit_write(dev_priv, PUNIT_REG_GPU_FREQ_REQ, val);
 
@@ -3481,21 +3533,26 @@ void valleyview_set_rps(struct drm_device *dev, u8 val)
 static void gen8_disable_rps_interrupts(struct drm_device *dev)
 {
        struct drm_i915_private *dev_priv = dev->dev_private;
+       if (IS_BROADWELL(dev) && dev_priv->rps.is_bdw_sw_turbo) {
+               if (atomic_read(&dev_priv->rps.sw_turbo.flip_received))
+                       del_timer(&dev_priv->rps.sw_turbo.flip_timer);
+               dev_priv->rps.is_bdw_sw_turbo = false;
+       } else {
+               I915_WRITE(GEN6_PMINTRMSK, ~GEN8_PMINTR_REDIRECT_TO_NON_DISP);
+               I915_WRITE(GEN8_GT_IER(2), I915_READ(GEN8_GT_IER(2)) &
+                                          ~dev_priv->pm_rps_events);
+               /* Complete PM interrupt masking here doesn't race with the rps work
+                * item again unmasking PM interrupts because that is using a different
+                * register (GEN8_GT_IMR(2)) to mask PM interrupts. The only risk is in
+                * leaving stale bits in GEN8_GT_IIR(2) and GEN8_GT_IMR(2) which
+                * gen8_enable_rps will clean up. */
 
-       I915_WRITE(GEN6_PMINTRMSK, ~GEN8_PMINTR_REDIRECT_TO_NON_DISP);
-       I915_WRITE(GEN8_GT_IER(2), I915_READ(GEN8_GT_IER(2)) &
-                                  ~dev_priv->pm_rps_events);
-       /* Complete PM interrupt masking here doesn't race with the rps work
-        * item again unmasking PM interrupts because that is using a different
-        * register (GEN8_GT_IMR(2)) to mask PM interrupts. The only risk is in
-        * leaving stale bits in GEN8_GT_IIR(2) and GEN8_GT_IMR(2) which
-        * gen8_enable_rps will clean up. */
+               spin_lock_irq(&dev_priv->irq_lock);
+               dev_priv->rps.pm_iir = 0;
+               spin_unlock_irq(&dev_priv->irq_lock);
 
-       spin_lock_irq(&dev_priv->irq_lock);
-       dev_priv->rps.pm_iir = 0;
-       spin_unlock_irq(&dev_priv->irq_lock);
-
-       I915_WRITE(GEN8_GT_IIR(2), dev_priv->pm_rps_events);
+               I915_WRITE(GEN8_GT_IIR(2), dev_priv->pm_rps_events);
+       }
 }
 
 static void gen6_disable_rps_interrupts(struct drm_device *dev)
@@ -3653,13 +3710,111 @@ static void parse_rp_state_cap(struct drm_i915_private *dev_priv, u32 rp_state_c
                dev_priv->rps.min_freq_softlimit = dev_priv->rps.min_freq;
 }
 
+static void bdw_sw_calculate_freq(struct drm_device *dev,
+               struct intel_rps_bdw_cal *c, u32 *cur_time, u32 *c0)
+{
+       struct drm_i915_private *dev_priv = dev->dev_private;
+       u64 busy = 0;
+       u32 busyness_pct = 0;
+       u32 elapsed_time = 0;
+       u16 new_freq = 0;
+
+       if (!c || !cur_time || !c0)
+               return;
+
+       if (0 == c->last_c0)
+               goto out;
+
+       /* Check Evaluation interval */
+       elapsed_time = *cur_time - c->last_ts;
+       if (elapsed_time < c->eval_interval)
+               return;
+
+       mutex_lock(&dev_priv->rps.hw_lock);
+
+       /*
+        * c0 is counted in units of 32 * 1.28 usec; elapsed_time is in usec.
+        * The full busyness_pct calculation would be
+        *     busy = ((u64)(*c0 - c->last_c0) << 5 << 7) / 100;
+        *     busyness_pct = (u32)(busy * 100 / elapsed_time);
+        * The two steps are folded below to save a division.
+        */
+       busy = (u64)(*c0 - c->last_c0) << 12;
+       do_div(busy, elapsed_time);
+       busyness_pct = (u32)busy;
+
+       if (c->is_up && busyness_pct >= c->it_threshold_pct)
+               new_freq = (u16)dev_priv->rps.cur_freq + 3;
+       if (!c->is_up && busyness_pct <= c->it_threshold_pct)
+               new_freq = (u16)dev_priv->rps.cur_freq - 1;
+
+       /* Adjust to new frequency busyness and compare with threshold */
+       if (0 != new_freq) {
+               if (new_freq > dev_priv->rps.max_freq_softlimit)
+                       new_freq = dev_priv->rps.max_freq_softlimit;
+               else if (new_freq < dev_priv->rps.min_freq_softlimit)
+                       new_freq = dev_priv->rps.min_freq_softlimit;
+
+               gen6_set_rps(dev, new_freq);
+       }
+
+       mutex_unlock(&dev_priv->rps.hw_lock);
+
+out:
+       c->last_c0 = *c0;
+       c->last_ts = *cur_time;
+}
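A quick worked check of the folded formula above (illustrative numbers only):

	/*
	 * One C0 count is 32 * 1.28 us = 40.96 us, so
	 *   busy_us      = delta_c0 * 4096 / 100 = (delta_c0 << 12) / 100
	 *   busyness_pct = busy_us * 100 / elapsed_time
	 *                = (delta_c0 << 12) / elapsed_time
	 * e.g. delta_c0 = 1000 over elapsed_time = 40960 us gives
	 * (1000 << 12) / 40960 = 100, i.e. fully busy for the whole interval.
	 */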
+
+static void gen8_set_frequency_RP0(struct work_struct *work)
+{
+       struct intel_rps_bdw_turbo *p_bdw_turbo =
+                       container_of(work, struct intel_rps_bdw_turbo, work_max_freq);
+       struct intel_gen6_power_mgmt *p_power_mgmt =
+                       container_of(p_bdw_turbo, struct intel_gen6_power_mgmt, sw_turbo);
+       struct drm_i915_private *dev_priv =
+                       container_of(p_power_mgmt, struct drm_i915_private, rps);
+
+       mutex_lock(&dev_priv->rps.hw_lock);
+       gen6_set_rps(dev_priv->dev, dev_priv->rps.rp0_freq);
+       mutex_unlock(&dev_priv->rps.hw_lock);
+}
+
+static void flip_active_timeout_handler(unsigned long var)
+{
+       struct drm_i915_private *dev_priv = (struct drm_i915_private *) var;
+
+       del_timer(&dev_priv->rps.sw_turbo.flip_timer);
+       atomic_set(&dev_priv->rps.sw_turbo.flip_received, false);
+
+       queue_work(dev_priv->wq, &dev_priv->rps.sw_turbo.work_max_freq);
+}
+
+void bdw_software_turbo(struct drm_device *dev)
+{
+       struct drm_i915_private *dev_priv = dev->dev_private;
+
+       u32 current_time = I915_READ(TIMESTAMP_CTR); /* unit in usec */
+       u32 current_c0 = I915_READ(MCHBAR_PCU_C0); /* unit in 32*1.28 usec */
+
+       bdw_sw_calculate_freq(dev, &dev_priv->rps.sw_turbo.up,
+                       &current_time, &current_c0);
+       bdw_sw_calculate_freq(dev, &dev_priv->rps.sw_turbo.down,
+                       &current_time, &current_c0);
+}
+
 static void gen8_enable_rps(struct drm_device *dev)
 {
        struct drm_i915_private *dev_priv = dev->dev_private;
        struct intel_engine_cs *ring;
        uint32_t rc6_mask = 0, rp_state_cap;
+       uint32_t threshold_up_pct, threshold_down_pct;
+       uint32_t ei_up, ei_down; /* up and down evaluation interval */
+       u32 rp_ctl_flag;
        int unused;
 
+       /* Use software Turbo for BDW */
+       dev_priv->rps.is_bdw_sw_turbo = IS_BROADWELL(dev);
+
        /* 1a: Software RC state - RC0 */
        I915_WRITE(GEN6_RC_STATE, 0);
 
@@ -3703,35 +3858,74 @@ static void gen8_enable_rps(struct drm_device *dev)
                   HSW_FREQUENCY(dev_priv->rps.rp1_freq));
        I915_WRITE(GEN6_RC_VIDEO_FREQ,
                   HSW_FREQUENCY(dev_priv->rps.rp1_freq));
-       /* NB: Docs say 1s, and 1000000 - which aren't equivalent */
-       I915_WRITE(GEN6_RP_DOWN_TIMEOUT, 100000000 / 128); /* 1 second timeout */
+       ei_up = 84480; /* 84.48ms */
+       ei_down = 448000; /* 448ms */
+       threshold_up_pct = 90; /* percent busy needed to step the frequency up */
+       threshold_down_pct = 70;
+
+       if (dev_priv->rps.is_bdw_sw_turbo) {
+               dev_priv->rps.sw_turbo.up.it_threshold_pct = threshold_up_pct;
+               dev_priv->rps.sw_turbo.up.eval_interval = ei_up;
+               dev_priv->rps.sw_turbo.up.is_up = true;
+               dev_priv->rps.sw_turbo.up.last_ts = 0;
+               dev_priv->rps.sw_turbo.up.last_c0 = 0;
+
+               dev_priv->rps.sw_turbo.down.it_threshold_pct = threshold_down_pct;
+               dev_priv->rps.sw_turbo.down.eval_interval = ei_down;
+               dev_priv->rps.sw_turbo.down.is_up = false;
+               dev_priv->rps.sw_turbo.down.last_ts = 0;
+               dev_priv->rps.sw_turbo.down.last_c0 = 0;
+
+               /* Start the timer used to track whether a page flip arrives */
+               dev_priv->rps.sw_turbo.timeout = 200*1000; /* in us */
+
+               init_timer(&dev_priv->rps.sw_turbo.flip_timer);
+               dev_priv->rps.sw_turbo.flip_timer.function = flip_active_timeout_handler;
+               dev_priv->rps.sw_turbo.flip_timer.data  = (unsigned long) dev_priv;
+               dev_priv->rps.sw_turbo.flip_timer.expires =
+                       usecs_to_jiffies(dev_priv->rps.sw_turbo.timeout) + jiffies;
+               add_timer(&dev_priv->rps.sw_turbo.flip_timer);
+               INIT_WORK(&dev_priv->rps.sw_turbo.work_max_freq, gen8_set_frequency_RP0);
+
+               atomic_set(&dev_priv->rps.sw_turbo.flip_received, true);
+       } else {
+               /* NB: Docs say 1s, and 1000000 - which aren't equivalent;
+                * program a 1 second timeout. */
+               I915_WRITE(GEN6_RP_DOWN_TIMEOUT, FREQ_1_28_US(1000000));
 
-       /* Docs recommend 900MHz, and 300 MHz respectively */
-       I915_WRITE(GEN6_RP_INTERRUPT_LIMITS,
-                  dev_priv->rps.max_freq_softlimit << 24 |
-                  dev_priv->rps.min_freq_softlimit << 16);
+               /* Docs recommend 900MHz, and 300 MHz respectively */
+               I915_WRITE(GEN6_RP_INTERRUPT_LIMITS,
+                          dev_priv->rps.max_freq_softlimit << 24 |
+                          dev_priv->rps.min_freq_softlimit << 16);
 
-       I915_WRITE(GEN6_RP_UP_THRESHOLD, 7600000 / 128); /* 76ms busyness per EI, 90% */
-       I915_WRITE(GEN6_RP_DOWN_THRESHOLD, 31300000 / 128); /* 313ms busyness per EI, 70%*/
-       I915_WRITE(GEN6_RP_UP_EI, 66000); /* 84.48ms, XXX: random? */
-       I915_WRITE(GEN6_RP_DOWN_EI, 350000); /* 448ms, XXX: random? */
+               I915_WRITE(GEN6_RP_UP_THRESHOLD,
+                       FREQ_1_28_US(ei_up * threshold_up_pct / 100));
+               I915_WRITE(GEN6_RP_DOWN_THRESHOLD,
+                       FREQ_1_28_US(ei_down * threshold_down_pct / 100));
+               I915_WRITE(GEN6_RP_UP_EI,
+                       FREQ_1_28_US(ei_up));
+               I915_WRITE(GEN6_RP_DOWN_EI,
+                       FREQ_1_28_US(ei_down));
 
-       I915_WRITE(GEN6_RP_IDLE_HYSTERSIS, 10);
+               I915_WRITE(GEN6_RP_IDLE_HYSTERSIS, 10);
+       }
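For reference, assuming FREQ_1_28_US() converts microseconds into the 1.28 us units the RP counters tick in (roughly us * 100 / 128; see i915_reg.h in this series), the symbolic values land on or near the magic numbers they replace:

	/*
	 * FREQ_1_28_US(84480)             = 66000   (old GEN6_RP_UP_EI)
	 * FREQ_1_28_US(448000)            = 350000  (old GEN6_RP_DOWN_EI)
	 * FREQ_1_28_US(84480 * 90 / 100)  = 59400   (old: 7600000 / 128 = 59375)
	 * FREQ_1_28_US(448000 * 70 / 100) = 245000  (old: 31300000 / 128 ~= 244531)
	 */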
 
        /* 5: Enable RPS */
-       I915_WRITE(GEN6_RP_CONTROL,
-                  GEN6_RP_MEDIA_TURBO |
-                  GEN6_RP_MEDIA_HW_NORMAL_MODE |
-                  GEN6_RP_MEDIA_IS_GFX |
-                  GEN6_RP_ENABLE |
-                  GEN6_RP_UP_BUSY_AVG |
-                  GEN6_RP_DOWN_IDLE_AVG);
-
-       /* 6: Ring frequency + overclocking (our driver does this later */
-
+       rp_ctl_flag = GEN6_RP_MEDIA_TURBO |
+                     GEN6_RP_MEDIA_HW_NORMAL_MODE |
+                     GEN6_RP_MEDIA_IS_GFX |
+                     GEN6_RP_UP_BUSY_AVG |
+                     GEN6_RP_DOWN_IDLE_AVG;
+       if (!dev_priv->rps.is_bdw_sw_turbo)
+               rp_ctl_flag |= GEN6_RP_ENABLE;
+
+       I915_WRITE(GEN6_RP_CONTROL, rp_ctl_flag);
+
+       /* 6: Ring frequency + overclocking
+        * (our driver does this later) */
        gen6_set_rps(dev, (I915_READ(GEN6_GT_PERF_STATUS) & 0xff00) >> 8);
-
-       gen8_enable_rps_interrupts(dev);
+       if (!dev_priv->rps.is_bdw_sw_turbo)
+               gen8_enable_rps_interrupts(dev);
 
        gen6_gt_force_wake_put(dev_priv, FORCEWAKE_ALL);
 }
@@ -4106,11 +4300,27 @@ static void valleyview_cleanup_pctx(struct drm_device *dev)
 static void valleyview_init_gt_powersave(struct drm_device *dev)
 {
        struct drm_i915_private *dev_priv = dev->dev_private;
+       u32 val;
 
        valleyview_setup_pctx(dev);
 
        mutex_lock(&dev_priv->rps.hw_lock);
 
+       val = vlv_punit_read(dev_priv, PUNIT_REG_GPU_FREQ_STS);
+       switch ((val >> 6) & 3) {
+       case 0:
+       case 1:
+               dev_priv->mem_freq = 800;
+               break;
+       case 2:
+               dev_priv->mem_freq = 1066;
+               break;
+       case 3:
+               dev_priv->mem_freq = 1333;
+               break;
+       }
+       DRM_DEBUG_DRIVER("DDR speed: %d MHz\n", dev_priv->mem_freq);
+
        dev_priv->rps.max_freq = valleyview_rps_max_freq(dev_priv);
        dev_priv->rps.rp0_freq = dev_priv->rps.max_freq;
        DRM_DEBUG_DRIVER("max GPU freq: %d MHz (%u)\n",
@@ -4145,11 +4355,38 @@ static void valleyview_init_gt_powersave(struct drm_device *dev)
 static void cherryview_init_gt_powersave(struct drm_device *dev)
 {
        struct drm_i915_private *dev_priv = dev->dev_private;
+       u32 val;
 
        cherryview_setup_pctx(dev);
 
        mutex_lock(&dev_priv->rps.hw_lock);
 
+       val = vlv_punit_read(dev_priv, CCK_FUSE_REG);
+       switch ((val >> 2) & 0x7) {
+       case 0:
+       case 1:
+               dev_priv->rps.cz_freq = 200;
+               dev_priv->mem_freq = 1600;
+               break;
+       case 2:
+               dev_priv->rps.cz_freq = 267;
+               dev_priv->mem_freq = 1600;
+               break;
+       case 3:
+               dev_priv->rps.cz_freq = 333;
+               dev_priv->mem_freq = 2000;
+               break;
+       case 4:
+               dev_priv->rps.cz_freq = 320;
+               dev_priv->mem_freq = 1600;
+               break;
+       case 5:
+               dev_priv->rps.cz_freq = 400;
+               dev_priv->mem_freq = 1600;
+               break;
+       }
+       DRM_DEBUG_DRIVER("DDR speed: %d MHz\n", dev_priv->mem_freq);
+
        dev_priv->rps.max_freq = cherryview_rps_max_freq(dev_priv);
        dev_priv->rps.rp0_freq = dev_priv->rps.max_freq;
        DRM_DEBUG_DRIVER("max GPU freq: %d MHz (%u)\n",
@@ -4171,6 +4408,12 @@ static void cherryview_init_gt_powersave(struct drm_device *dev)
                         vlv_gpu_freq(dev_priv, dev_priv->rps.min_freq),
                         dev_priv->rps.min_freq);
 
+       WARN_ONCE((dev_priv->rps.max_freq |
+                  dev_priv->rps.efficient_freq |
+                  dev_priv->rps.rp1_freq |
+                  dev_priv->rps.min_freq) & 1,
+                 "Odd GPU freq values\n");
+
        /* Preserve min/max settings in case of re-init */
        if (dev_priv->rps.max_freq_softlimit == 0)
                dev_priv->rps.max_freq_softlimit = dev_priv->rps.max_freq;
@@ -5156,6 +5399,8 @@ static void intel_gen6_powersave_work(struct work_struct *work)
                             rps.delayed_resume_work.work);
        struct drm_device *dev = dev_priv->dev;
 
+       dev_priv->rps.is_bdw_sw_turbo = false;
+
        mutex_lock(&dev_priv->rps.hw_lock);
 
        if (IS_CHERRYVIEW(dev)) {
@@ -5536,37 +5781,12 @@ static void broadwell_init_clock_gating(struct drm_device *dev)
        /* FIXME(BDW): Check all the w/a, some might only apply to
         * pre-production hw. */
 
-       /* WaDisablePartialInstShootdown:bdw */
-       I915_WRITE(GEN8_ROW_CHICKEN,
-                  _MASKED_BIT_ENABLE(PARTIAL_INSTRUCTION_SHOOTDOWN_DISABLE));
-
-       /* WaDisableThreadStallDopClockGating:bdw */
-       /* FIXME: Unclear whether we really need this on production bdw. */
-       I915_WRITE(GEN8_ROW_CHICKEN,
-                  _MASKED_BIT_ENABLE(STALL_DOP_GATING_DISABLE));
 
-       /*
-        * This GEN8_CENTROID_PIXEL_OPT_DIS W/A is only needed for
-        * pre-production hardware
-        */
-       I915_WRITE(HALF_SLICE_CHICKEN3,
-                  _MASKED_BIT_ENABLE(GEN8_CENTROID_PIXEL_OPT_DIS));
-       I915_WRITE(HALF_SLICE_CHICKEN3,
-                  _MASKED_BIT_ENABLE(GEN8_SAMPLER_POWER_BYPASS_DIS));
        I915_WRITE(GAMTARBMODE, _MASKED_BIT_ENABLE(ARB_MODE_BWGTLB_DISABLE));
 
        I915_WRITE(_3D_CHICKEN3,
                   _MASKED_BIT_ENABLE(_3D_CHICKEN_SDE_LIMIT_FIFO_POLY_DEPTH(2)));
 
-       I915_WRITE(COMMON_SLICE_CHICKEN2,
-                  _MASKED_BIT_ENABLE(GEN8_CSC2_SBE_VUE_CACHE_CONSERVATIVE));
-
-       I915_WRITE(GEN7_HALF_SLICE_CHICKEN1,
-                  _MASKED_BIT_ENABLE(GEN7_SINGLE_SUBSCAN_DISPATCH_ENABLE));
-
-       /* WaDisableDopClockGating:bdw May not be needed for production */
-       I915_WRITE(GEN7_ROW_CHICKEN2,
-                  _MASKED_BIT_ENABLE(DOP_CLOCK_GATING_DISABLE));
 
        /* WaSwitchSolVfFArbitrationPriority:bdw */
        I915_WRITE(GAM_ECOCHK, I915_READ(GAM_ECOCHK) | HSW_ECOCHK_ARB_PRIO_SOL);
@@ -5582,31 +5802,12 @@ static void broadwell_init_clock_gating(struct drm_device *dev)
                           BDW_DPRS_MASK_VBLANK_SRD);
        }
 
-       /* Use Force Non-Coherent whenever executing a 3D context. This is a
-        * workaround for for a possible hang in the unlikely event a TLB
-        * invalidation occurs during a PSD flush.
-        */
-       I915_WRITE(HDC_CHICKEN0,
-                  I915_READ(HDC_CHICKEN0) |
-                  _MASKED_BIT_ENABLE(HDC_FORCE_NON_COHERENT));
-
        /* WaVSRefCountFullforceMissDisable:bdw */
        /* WaDSRefCountFullforceMissDisable:bdw */
        I915_WRITE(GEN7_FF_THREAD_MODE,
                   I915_READ(GEN7_FF_THREAD_MODE) &
                   ~(GEN8_FF_DS_REF_CNT_FFME | GEN7_FF_VS_REF_CNT_FFME));
 
-       /*
-        * BSpec recommends 8x4 when MSAA is used,
-        * however in practice 16x4 seems fastest.
-        *
-        * Note that PS/WM thread counts depend on the WIZ hashing
-        * disable bit, which we don't touch here, but it's good
-        * to keep in mind (see 3DSTATE_PS and 3DSTATE_WM).
-        */
-       I915_WRITE(GEN7_GT_MODE,
-                  GEN6_WIZ_HASHING_MASK | GEN6_WIZ_HASHING_16x4);
-
        I915_WRITE(GEN6_RC_SLEEP_PSMI_CONTROL,
                   _MASKED_BIT_ENABLE(GEN8_RC_SEMA_IDLE_MSG_DISABLE));
 
@@ -5614,10 +5815,6 @@ static void broadwell_init_clock_gating(struct drm_device *dev)
        I915_WRITE(GEN8_UCGCTL6, I915_READ(GEN8_UCGCTL6) |
                   GEN8_SDEUNIT_CLOCK_GATE_DISABLE);
 
-       /* Wa4x4STCOptimizationDisable:bdw */
-       I915_WRITE(CACHE_MODE_1,
-                  _MASKED_BIT_ENABLE(GEN8_4x4_STC_OPTIMIZATION_DISABLE));
-
        lpt_init_clock_gating(dev);
 }
 
@@ -5774,24 +5971,6 @@ static void ivybridge_init_clock_gating(struct drm_device *dev)
 static void valleyview_init_clock_gating(struct drm_device *dev)
 {
        struct drm_i915_private *dev_priv = dev->dev_private;
-       u32 val;
-
-       mutex_lock(&dev_priv->rps.hw_lock);
-       val = vlv_punit_read(dev_priv, PUNIT_REG_GPU_FREQ_STS);
-       mutex_unlock(&dev_priv->rps.hw_lock);
-       switch ((val >> 6) & 3) {
-       case 0:
-       case 1:
-               dev_priv->mem_freq = 800;
-               break;
-       case 2:
-               dev_priv->mem_freq = 1066;
-               break;
-       case 3:
-               dev_priv->mem_freq = 1333;
-               break;
-       }
-       DRM_DEBUG_DRIVER("DDR speed: %d MHz", dev_priv->mem_freq);
 
        I915_WRITE(DSPCLK_GATE_D, VRHUNIT_CLOCK_GATE_DISABLE);
 
@@ -5867,48 +6046,11 @@ static void valleyview_init_clock_gating(struct drm_device *dev)
 static void cherryview_init_clock_gating(struct drm_device *dev)
 {
        struct drm_i915_private *dev_priv = dev->dev_private;
-       u32 val;
-
-       mutex_lock(&dev_priv->rps.hw_lock);
-       val = vlv_punit_read(dev_priv, CCK_FUSE_REG);
-       mutex_unlock(&dev_priv->rps.hw_lock);
-       switch ((val >> 2) & 0x7) {
-       case 0:
-       case 1:
-                       dev_priv->rps.cz_freq = CHV_CZ_CLOCK_FREQ_MODE_200;
-                       dev_priv->mem_freq = 1600;
-                       break;
-       case 2:
-                       dev_priv->rps.cz_freq = CHV_CZ_CLOCK_FREQ_MODE_267;
-                       dev_priv->mem_freq = 1600;
-                       break;
-       case 3:
-                       dev_priv->rps.cz_freq = CHV_CZ_CLOCK_FREQ_MODE_333;
-                       dev_priv->mem_freq = 2000;
-                       break;
-       case 4:
-                       dev_priv->rps.cz_freq = CHV_CZ_CLOCK_FREQ_MODE_320;
-                       dev_priv->mem_freq = 1600;
-                       break;
-       case 5:
-                       dev_priv->rps.cz_freq = CHV_CZ_CLOCK_FREQ_MODE_400;
-                       dev_priv->mem_freq = 1600;
-                       break;
-       }
-       DRM_DEBUG_DRIVER("DDR speed: %d MHz", dev_priv->mem_freq);
 
        I915_WRITE(DSPCLK_GATE_D, VRHUNIT_CLOCK_GATE_DISABLE);
 
        I915_WRITE(MI_ARB_VLV, MI_ARB_DISPLAY_TRICKLE_FEED_DISABLE);
 
-       /* WaDisablePartialInstShootdown:chv */
-       I915_WRITE(GEN8_ROW_CHICKEN,
-                  _MASKED_BIT_ENABLE(PARTIAL_INSTRUCTION_SHOOTDOWN_DISABLE));
-
-       /* WaDisableThreadStallDopClockGating:chv */
-       I915_WRITE(GEN8_ROW_CHICKEN,
-                  _MASKED_BIT_ENABLE(STALL_DOP_GATING_DISABLE));
-
        /* WaVSRefCountFullforceMissDisable:chv */
        /* WaDSRefCountFullforceMissDisable:chv */
        I915_WRITE(GEN7_FF_THREAD_MODE,
@@ -5927,10 +6069,6 @@ static void cherryview_init_clock_gating(struct drm_device *dev)
        I915_WRITE(GEN8_UCGCTL6, I915_READ(GEN8_UCGCTL6) |
                   GEN8_SDEUNIT_CLOCK_GATE_DISABLE);
 
-       /* WaDisableSamplerPowerBypass:chv (pre-production hw) */
-       I915_WRITE(HALF_SLICE_CHICKEN3,
-                  _MASKED_BIT_ENABLE(GEN8_SAMPLER_POWER_BYPASS_DIS));
-
        /* WaDisableGunitClockGating:chv (pre-production hw) */
        I915_WRITE(VLV_GUNIT_CLOCK_GATE, I915_READ(VLV_GUNIT_CLOCK_GATE) |
                   GINT_DIS);
@@ -5940,8 +6078,6 @@ static void cherryview_init_clock_gating(struct drm_device *dev)
                   _MASKED_BIT_ENABLE(GEN8_FF_DOP_CLOCK_GATE_DISABLE));
 
        /* WaDisableDopClockGating:chv (pre-production hw) */
-       I915_WRITE(GEN7_ROW_CHICKEN2,
-                  _MASKED_BIT_ENABLE(DOP_CLOCK_GATING_DISABLE));
        I915_WRITE(GEN6_UCGCTL1, I915_READ(GEN6_UCGCTL1) |
                   GEN6_EU_TCUNIT_CLOCK_GATE_DISABLE);
 }
@@ -6026,6 +6162,9 @@ static void gen3_init_clock_gating(struct drm_device *dev)
 
        /* On GEN3 we really need to make sure the ARB C3 LP bit is set */
        I915_WRITE(MI_ARB_STATE, _MASKED_BIT_ENABLE(MI_ARB_C3_LP_WRITE_ENABLE));
+
+       I915_WRITE(MI_ARB_STATE,
+                  _MASKED_BIT_ENABLE(MI_ARB_DISPLAY_TRICKLE_FEED_DISABLE));
 }
 
 static void i85x_init_clock_gating(struct drm_device *dev)
@@ -6037,6 +6176,9 @@ static void i85x_init_clock_gating(struct drm_device *dev)
        /* interrupts should cause a wake up from C3 */
        I915_WRITE(MI_STATE, _MASKED_BIT_ENABLE(MI_AGPBUSY_INT_EN) |
                   _MASKED_BIT_DISABLE(MI_AGPBUSY_830_MODE));
+
+       I915_WRITE(MEM_MODE,
+                  _MASKED_BIT_ENABLE(MEM_DISPLAY_TRICKLE_FEED_DISABLE));
 }
 
 static void i830_init_clock_gating(struct drm_device *dev)
@@ -6044,6 +6186,10 @@ static void i830_init_clock_gating(struct drm_device *dev)
        struct drm_i915_private *dev_priv = dev->dev_private;
 
        I915_WRITE(DSPCLK_GATE_D, OVRUNIT_CLOCK_GATE_DISABLE);
+
+       I915_WRITE(MEM_MODE,
+                  _MASKED_BIT_ENABLE(MEM_DISPLAY_A_TRICKLE_FEED_DISABLE) |
+                  _MASKED_BIT_ENABLE(MEM_DISPLAY_B_TRICKLE_FEED_DISABLE));
 }
 
 void intel_init_clock_gating(struct drm_device *dev)
@@ -6151,7 +6297,7 @@ static void hsw_power_well_post_enable(struct drm_i915_private *dev_priv)
        outb(inb(VGA_MSR_READ), VGA_MSR_WRITE);
        vga_put(dev->pdev, VGA_RSRC_LEGACY_IO);
 
-       if (IS_BROADWELL(dev))
+       if (IS_BROADWELL(dev) || (INTEL_INFO(dev)->gen >= 9))
                gen8_irq_power_well_post_enable(dev_priv);
 }
 
@@ -6346,6 +6492,8 @@ static void vlv_display_power_well_disable(struct drm_i915_private *dev_priv,
        spin_unlock_irq(&dev_priv->irq_lock);
 
        vlv_set_power_well(dev_priv, power_well, false);
+
+       vlv_power_sequencer_reset(dev_priv);
 }
 
 static void vlv_dpio_cmn_power_well_enable(struct drm_i915_private *dev_priv,
@@ -7273,6 +7421,8 @@ void intel_init_pm(struct drm_device *dev)
                        dev_priv->display.init_clock_gating = haswell_init_clock_gating;
                else if (INTEL_INFO(dev)->gen == 8)
                        dev_priv->display.init_clock_gating = broadwell_init_clock_gating;
+               else if (INTEL_INFO(dev)->gen == 9)
+                       dev_priv->display.init_clock_gating = gen9_init_clock_gating;
        } else if (IS_CHERRYVIEW(dev)) {
                dev_priv->display.update_wm = cherryview_update_wm;
                dev_priv->display.update_sprite_wm = valleyview_update_sprite_wm;
@@ -7466,6 +7616,7 @@ static int chv_freq_opcode(struct drm_i915_private *dev_priv, int val)
                return -1;
        }
 
+       /* CHV needs even values */
        opcode = (DIV_ROUND_CLOSEST((val * 2 * mul), dev_priv->rps.cz_freq) * 2);
 
        return opcode;