update hdmi driver: support 480p

[firefly-linux-kernel-4.4.55.git] / kernel / sched.c
diff --git a/kernel/sched.c b/kernel/sched.c

index e8aa036468994fc8870c5a84f2e49f008388f03a..e28e6c520bca8c0560a909bd544d843306456157 100644 (file)
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -542,7 +542,6 @@ struct rq {
         struct load_weight load;
         unsigned long nr_load_updates;
         u64 nr_switches;
-       u64 nr_migrations_in;
  
         struct cfs_rq cfs;
         struct rt_rq rt;
@@ -742,7 +741,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
                 size_t cnt, loff_t *ppos)
  {
         char buf[64];
-       char *cmp = buf;
+       char *cmp;
         int neg = 0;
         int i;
  
@@ -753,6 +752,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
                 return -EFAULT;
  
         buf[cnt] = 0;
+       cmp = strstrip(buf);
  
         if (strncmp(buf, "NO_", 3) == 0) {
                 neg = 1;
@@ -760,9 +760,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
         }
  
         for (i = 0; sched_feat_names[i]; i++) {
-               int len = strlen(sched_feat_names[i]);
-
-               if (strncmp(cmp, sched_feat_names[i], len) == 0) {
+               if (strcmp(cmp, sched_feat_names[i]) == 0) {
                         if (neg)
                                 sysctl_sched_features &= ~(1UL << i);
                         else
@@ -942,6 +940,15 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
  }
  #endif /* __ARCH_WANT_UNLOCKED_CTXSW */
  
+/*
+ * Check whether the task is waking, we use this to synchronize ->cpus_allowed
+ * against ttwu().
+ */
+static inline int task_is_waking(struct task_struct *p)
+{
+       return unlikely(p->state == TASK_WAKING);
+}
+
  /*
   * __task_rq_lock - lock the runqueue a given task resides on.
   * Must be called interrupts disabled.
@@ -949,8 +956,10 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
  static inline struct rq *__task_rq_lock(struct task_struct *p)
         __acquires(rq->lock)
  {
+       struct rq *rq;
+
         for (;;) {
-               struct rq *rq = task_rq(p);
+               rq = task_rq(p);
                 spin_lock(&rq->lock);
                 if (likely(rq == task_rq(p)))
                         return rq;
@@ -1822,6 +1831,20 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
  static void calc_load_account_active(struct rq *this_rq);
  static void update_sysctl(void);
  
+static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
+{
+       set_task_rq(p, cpu);
+#ifdef CONFIG_SMP
+       /*
+        * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
+        * successfuly executed on another CPU. We must ensure that updates of
+        * per-task data have been completed by this moment.
+        */
+       smp_wmb();
+       task_thread_info(p)->cpu = cpu;
+#endif
+}
+
  #include "sched_stats.h"
  #include "sched_idletask.c"
  #include "sched_fair.c"
@@ -1871,13 +1894,14 @@ static void update_avg(u64 *avg, u64 sample)
         *avg += diff >> 3;
  }
  
-static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
+static void
+enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, bool head)
  {
         if (wakeup)
                 p->se.start_runtime = p->se.sum_exec_runtime;
  
         sched_info_queued(p);
-       p->sched_class->enqueue_task(rq, p, wakeup);
+       p->sched_class->enqueue_task(rq, p, wakeup, head);
         p->se.on_rq = 1;
  }
  
@@ -1953,7 +1977,7 @@ static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
         if (task_contributes_to_load(p))
                 rq->nr_uninterruptible--;
  
-       enqueue_task(rq, p, wakeup);
+       enqueue_task(rq, p, wakeup, false);
         inc_nr_running(rq);
  }
  
@@ -1978,20 +2002,6 @@ inline int task_curr(const struct task_struct *p)
         return cpu_curr(task_cpu(p)) == p;
  }
  
-static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
-{
-       set_task_rq(p, cpu);
-#ifdef CONFIG_SMP
-       /*
-        * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
-        * successfuly executed on another CPU. We must ensure that updates of
-        * per-task data have been completed by this moment.
-        */
-       smp_wmb();
-       task_thread_info(p)->cpu = cpu;
-#endif
-}
-
  static inline void check_class_changed(struct rq *rq, struct task_struct *p,
                                        const struct sched_class *prev_class,
                                        int oldprio, int running)
@@ -2018,21 +2028,15 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
   */
  void kthread_bind(struct task_struct *p, unsigned int cpu)
  {
-       struct rq *rq = cpu_rq(cpu);
-       unsigned long flags;
-
         /* Must have done schedule() in kthread() before we set_task_cpu */
         if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE)) {
                 WARN_ON(1);
                 return;
         }
  
-       spin_lock_irqsave(&rq->lock, flags);
-       set_task_cpu(p, cpu);
         p->cpus_allowed = cpumask_of_cpu(cpu);
         p->rt.nr_cpus_allowed = 1;
         p->flags |= PF_THREAD_BOUND;
-       spin_unlock_irqrestore(&rq->lock, flags);
  }
  EXPORT_SYMBOL(kthread_bind);
  
@@ -2070,35 +2074,23 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
  void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
  {
         int old_cpu = task_cpu(p);
-       struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu);
-       struct cfs_rq *old_cfsrq = task_cfs_rq(p),
-                     *new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu);
-       u64 clock_offset;
  
-       clock_offset = old_rq->clock - new_rq->clock;
+#ifdef CONFIG_SCHED_DEBUG
+       /*
+        * We should never call set_task_cpu() on a blocked task,
+        * ttwu() will sort out the placement.
+        */
+       WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
+                       !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE));
+#endif
  
         trace_sched_migrate_task(p, new_cpu);
  
-#ifdef CONFIG_SCHEDSTATS
-       if (p->se.wait_start)
-               p->se.wait_start -= clock_offset;
-       if (p->se.sleep_start)
-               p->se.sleep_start -= clock_offset;
-       if (p->se.block_start)
-               p->se.block_start -= clock_offset;
-#endif
         if (old_cpu != new_cpu) {
                 p->se.nr_migrations++;
-               new_rq->nr_migrations_in++;
-#ifdef CONFIG_SCHEDSTATS
-               if (task_hot(p, old_rq->clock, NULL))
-                       schedstat_inc(p, se.nr_forced2_migrations);
-#endif
                 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS,
                                      1, 1, NULL, 0);
         }
-       p->se.vruntime -= old_cfsrq->min_vruntime -
-                                        new_cfsrq->min_vruntime;
  
         __set_task_cpu(p, new_cpu);
  }
@@ -2331,6 +2323,69 @@ void task_oncpu_function_call(struct task_struct *p,
         preempt_enable();
  }
  
+#ifdef CONFIG_SMP
+/*
+ * ->cpus_allowed is protected by either TASK_WAKING or rq->lock held.
+ */
+static int select_fallback_rq(int cpu, struct task_struct *p)
+{
+       int dest_cpu;
+       const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu));
+
+       /* Look for allowed, online CPU in same node. */
+       for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask)
+               if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
+                       return dest_cpu;
+
+       /* Any allowed, online CPU? */
+       dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_active_mask);
+       if (dest_cpu < nr_cpu_ids)
+               return dest_cpu;
+
+       /* No more Mr. Nice Guy. */
+       if (unlikely(dest_cpu >= nr_cpu_ids)) {
+               dest_cpu = cpuset_cpus_allowed_fallback(p);
+               /*
+                * Don't tell them about moving exiting tasks or
+                * kernel threads (both mm NULL), since they never
+                * leave kernel.
+                */
+               if (p->mm && printk_ratelimit()) {
+                       printk(KERN_INFO "process %d (%s) no "
+                              "longer affine to cpu%d\n",
+                              task_pid_nr(p), p->comm, cpu);
+               }
+       }
+
+       return dest_cpu;
+}
+
+/*
+ * The caller (fork, wakeup) owns TASK_WAKING, ->cpus_allowed is stable.
+ */
+static inline
+int select_task_rq(struct rq *rq, struct task_struct *p, int sd_flags, int wake_flags)
+{
+       int cpu = p->sched_class->select_task_rq(rq, p, sd_flags, wake_flags);
+
+       /*
+        * In order not to call set_task_cpu() on a blocking task we need
+        * to rely on ttwu() to place the task on a valid ->cpus_allowed
+        * cpu.
+        *
+        * Since this is common to all placement strategies, this lives here.
+        *
+        * [ this allows ->select_task() to simply return task_cpu(p) and
+        *   not worry about this generic constraint ]
+        */
+       if (unlikely(!cpumask_test_cpu(cpu, &p->cpus_allowed) ||
+                    !cpu_online(cpu)))
+               cpu = select_fallback_rq(task_cpu(p), p);
+
+       return cpu;
+}
+#endif
+
  /***
   * try_to_wake_up - wake up a thread
   * @p: the to-be-woken-up thread
@@ -2379,22 +2434,34 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
          *
          * First fix up the nr_uninterruptible count:
          */
-       if (task_contributes_to_load(p))
-               rq->nr_uninterruptible--;
+       if (task_contributes_to_load(p)) {
+               if (likely(cpu_online(orig_cpu)))
+                       rq->nr_uninterruptible--;
+               else
+                       this_rq()->nr_uninterruptible--;
+       }
         p->state = TASK_WAKING;
-       task_rq_unlock(rq, &flags);
  
-       cpu = p->sched_class->select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
+       if (p->sched_class->task_waking)
+               p->sched_class->task_waking(rq, p);
+
+       cpu = select_task_rq(rq, p, SD_BALANCE_WAKE, wake_flags);
         if (cpu != orig_cpu)
                 set_task_cpu(p, cpu);
+       __task_rq_unlock(rq);
  
-       rq = task_rq_lock(p, &flags);
-
-       if (rq != orig_rq)
-               update_rq_clock(rq);
+       rq = cpu_rq(cpu);
+       spin_lock(&rq->lock);
+       update_rq_clock(rq);
  
+       /*
+        * We migrated the task without holding either rq->lock, however
+        * since the task is not on the task list itself, nobody else
+        * will try and migrate the task, hence the rq should match the
+        * cpu we just moved it to.
+        */
+       WARN_ON(task_cpu(p) != cpu);
         WARN_ON(p->state != TASK_WAKING);
-       cpu = task_cpu(p);
  
  #ifdef CONFIG_SCHEDSTATS
         schedstat_inc(rq, ttwu_count);
@@ -2447,8 +2514,8 @@ out_running:
  
         p->state = TASK_RUNNING;
  #ifdef CONFIG_SMP
-       if (p->sched_class->task_wake_up)
-               p->sched_class->task_wake_up(rq, p);
+       if (p->sched_class->task_woken)
+               p->sched_class->task_woken(rq, p);
  
         if (unlikely(rq->idle_stamp)) {
                 u64 delta = rq->clock - rq->idle_stamp;
@@ -2528,7 +2595,6 @@ static void __sched_fork(struct task_struct *p)
         p->se.nr_failed_migrations_running      = 0;
         p->se.nr_failed_migrations_hot          = 0;
         p->se.nr_forced_migrations              = 0;
-       p->se.nr_forced2_migrations             = 0;
  
         p->se.nr_wakeups                        = 0;
         p->se.nr_wakeups_sync                   = 0;
@@ -2549,14 +2615,6 @@ static void __sched_fork(struct task_struct *p)
  #ifdef CONFIG_PREEMPT_NOTIFIERS
         INIT_HLIST_HEAD(&p->preempt_notifiers);
  #endif
-
-       /*
-        * We mark the process as running here, but have not actually
-        * inserted it onto the runqueue yet. This guarantees that
-        * nobody will actually run it, and a signal or other external
-        * event cannot wake it up and insert it on the runqueue either.
-        */
-       p->state = TASK_RUNNING;
  }
  
  /*
@@ -2567,6 +2625,12 @@ void sched_fork(struct task_struct *p, int clone_flags)
         int cpu = get_cpu();
  
         __sched_fork(p);
+       /*
+        * We mark the process as running here. This guarantees that
+        * nobody will actually run it, and a signal or other external
+        * event cannot wake it up and insert it on the runqueue either.
+        */
+       p->state = TASK_RUNNING;
  
         /*
          * Revert to default priority/policy on fork if requested.
@@ -2598,9 +2662,9 @@ void sched_fork(struct task_struct *p, int clone_flags)
         if (!rt_prio(p->prio))
                 p->sched_class = &fair_sched_class;
  
-#ifdef CONFIG_SMP
-       cpu = p->sched_class->select_task_rq(p, SD_BALANCE_FORK, 0);
-#endif
+       if (p->sched_class->task_fork)
+               p->sched_class->task_fork(p);
+
         set_task_cpu(p, cpu);
  
  #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
@@ -2630,28 +2694,38 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
  {
         unsigned long flags;
         struct rq *rq;
+       int cpu = get_cpu();
  
+#ifdef CONFIG_SMP
         rq = task_rq_lock(p, &flags);
-       BUG_ON(p->state != TASK_RUNNING);
-       update_rq_clock(rq);
+       p->state = TASK_WAKING;
  
-       if (!p->sched_class->task_new || !current->se.on_rq) {
-               activate_task(rq, p, 0);
-       } else {
-               /*
-                * Let the scheduling class do new task startup
-                * management (if any):
-                */
-               p->sched_class->task_new(rq, p);
-               inc_nr_running(rq);
-       }
+       /*
+        * Fork balancing, do it here and not earlier because:
+        *  - cpus_allowed can change in the fork path
+        *  - any previously selected cpu might disappear through hotplug
+        *
+        * We set TASK_WAKING so that select_task_rq() can drop rq->lock
+        * without people poking at ->cpus_allowed.
+        */
+       cpu = select_task_rq(rq, p, SD_BALANCE_FORK, 0);
+       set_task_cpu(p, cpu);
+
+       p->state = TASK_RUNNING;
+       task_rq_unlock(rq, &flags);
+#endif
+
+       rq = task_rq_lock(p, &flags);
+       update_rq_clock(rq);
+       activate_task(rq, p, 0);
         trace_sched_wakeup_new(rq, p, 1);
         check_preempt_curr(rq, p, WF_FORK);
  #ifdef CONFIG_SMP
-       if (p->sched_class->task_wake_up)
-               p->sched_class->task_wake_up(rq, p);
+       if (p->sched_class->task_woken)
+               p->sched_class->task_woken(rq, p);
  #endif
         task_rq_unlock(rq, &flags);
+       put_cpu();
  }
  
  #ifdef CONFIG_PREEMPT_NOTIFIERS
@@ -3037,15 +3111,6 @@ static void calc_load_account_active(struct rq *this_rq)
         }
  }
  
-/*
- * Externally visible per-cpu scheduler statistics:
- * cpu_nr_migrations(cpu) - number of migrations into that cpu
- */
-u64 cpu_nr_migrations(int cpu)
-{
-       return cpu_rq(cpu)->nr_migrations_in;
-}
-
  /*
   * Update rq->cpu_load[] statistics. This function is usually called every
   * scheduler tick (TICK_NSEC).
@@ -3128,24 +3193,28 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
  }
  
  /*
- * If dest_cpu is allowed for this process, migrate the task to it.
- * This is accomplished by forcing the cpu_allowed mask to only
- * allow dest_cpu, which will force the cpu onto dest_cpu. Then
- * the cpu_allowed mask is restored.
+ * sched_exec - execve() is a valuable balancing opportunity, because at
+ * this point the task has the smallest effective memory and cache footprint.
   */
-static void sched_migrate_task(struct task_struct *p, int dest_cpu)
+void sched_exec(void)
  {
+       struct task_struct *p = current;
         struct migration_req req;
         unsigned long flags;
         struct rq *rq;
+       int dest_cpu;
  
         rq = task_rq_lock(p, &flags);
-       if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)
-           || unlikely(!cpu_active(dest_cpu)))
-               goto out;
+       dest_cpu = p->sched_class->select_task_rq(rq, p, SD_BALANCE_EXEC, 0);
+       if (dest_cpu == smp_processor_id())
+               goto unlock;
  
-       /* force the process onto the specified CPU */
-       if (migrate_task(p, dest_cpu, &req)) {
+       /*
+        * select_task_rq() can race against ->cpus_allowed
+        */
+       if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) &&
+           likely(cpu_active(dest_cpu)) &&
+           migrate_task(p, dest_cpu, &req)) {
                 /* Need to wait for migration thread (might exit: take ref). */
                 struct task_struct *mt = rq->migration_thread;
  
@@ -3157,23 +3226,10 @@ static void sched_migrate_task(struct task_struct *p, int dest_cpu)
  
                 return;
         }
-out:
+unlock:
         task_rq_unlock(rq, &flags);
  }
  
-/*
- * sched_exec - execve() is a valuable balancing opportunity, because at
- * this point the task has the smallest effective memory and cache footprint.
- */
-void sched_exec(void)
-{
-       int new_cpu, this_cpu = get_cpu();
-       new_cpu = current->sched_class->select_task_rq(current, SD_BALANCE_EXEC, 0);
-       put_cpu();
-       if (new_cpu != this_cpu)
-               sched_migrate_task(current, new_cpu);
-}
-
  /*
   * pull_task - move a task from a remote runqueue to the local runqueue.
   * Both runqueues must be locked.
@@ -3621,7 +3677,7 @@ unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
  
  unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
  {
-       unsigned long weight = cpumask_weight(sched_domain_span(sd));
+       unsigned long weight = sd->span_weight;
         unsigned long smt_gain = sd->smt_gain;
  
         smt_gain /= weight;
@@ -3654,7 +3710,7 @@ unsigned long scale_rt_power(int cpu)
  
  static void update_cpu_power(struct sched_domain *sd, int cpu)
  {
-       unsigned long weight = cpumask_weight(sched_domain_span(sd));
+       unsigned long weight = sd->span_weight;
         unsigned long power = SCHED_LOAD_SCALE;
         struct sched_group *sdg = sd->groups;
  
@@ -5284,9 +5340,9 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
         rtime = nsecs_to_cputime(cputime.sum_exec_runtime);
  
         if (total) {
-               u64 temp;
+               u64 temp = rtime;
  
-               temp = (u64)(rtime * cputime.utime);
+               temp *= cputime.utime;
                 do_div(temp, total);
                 utime = (cputime_t)temp;
         } else
@@ -5590,7 +5646,7 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
          * the mutex owner just released it and exited.
          */
         if (probe_kernel_address(&owner->cpu, cpu))
-               goto out;
+               return 0;
  #else
         cpu = owner->cpu;
  #endif
@@ -5600,14 +5656,14 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
          * the cpu field may no longer be valid.
          */
         if (cpu >= nr_cpumask_bits)
-               goto out;
+               return 0;
  
         /*
          * We need to validate that we can do a
          * get_cpu() and that we have the percpu area.
          */
         if (!cpu_online(cpu))
-               goto out;
+               return 0;
  
         rq = cpu_rq(cpu);
  
@@ -5626,7 +5682,7 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
  
                 cpu_relax();
         }
-out:
+
         return 1;
  }
  #endif
@@ -5974,14 +6030,15 @@ EXPORT_SYMBOL(wait_for_completion_killable);
   */
  bool try_wait_for_completion(struct completion *x)
  {
+       unsigned long flags;
         int ret = 1;
  
-       spin_lock_irq(&x->wait.lock);
+       spin_lock_irqsave(&x->wait.lock, flags);
         if (!x->done)
                 ret = 0;
         else
                 x->done--;
-       spin_unlock_irq(&x->wait.lock);
+       spin_unlock_irqrestore(&x->wait.lock, flags);
         return ret;
  }
  EXPORT_SYMBOL(try_wait_for_completion);
@@ -5996,12 +6053,13 @@ EXPORT_SYMBOL(try_wait_for_completion);
   */
  bool completion_done(struct completion *x)
  {
+       unsigned long flags;
         int ret = 1;
  
-       spin_lock_irq(&x->wait.lock);
+       spin_lock_irqsave(&x->wait.lock, flags);
         if (!x->done)
                 ret = 0;
-       spin_unlock_irq(&x->wait.lock);
+       spin_unlock_irqrestore(&x->wait.lock, flags);
         return ret;
  }
  EXPORT_SYMBOL(completion_done);
@@ -6095,7 +6153,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
         if (running)
                 p->sched_class->set_curr_task(rq);
         if (on_rq) {
-               enqueue_task(rq, p, 0);
+               enqueue_task(rq, p, 0, oldprio < prio);
  
                 check_class_changed(rq, p, prev_class, oldprio, running);
         }
@@ -6139,7 +6197,7 @@ void set_user_nice(struct task_struct *p, long nice)
         delta = p->prio - old_prio;
  
         if (on_rq) {
-               enqueue_task(rq, p, 0);
+               enqueue_task(rq, p, 0, false);
                 /*
                  * If the task increased its priority or is running and
                  * lowered its priority, then reschedule its CPU:
@@ -6530,7 +6588,7 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
                 return -EINVAL;
  
         retval = -ESRCH;
-       read_lock(&tasklist_lock);
+       rcu_read_lock();
         p = find_process_by_pid(pid);
         if (p) {
                 retval = security_task_getscheduler(p);
@@ -6538,7 +6596,7 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
                         retval = p->policy
                                 | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);
         }
-       read_unlock(&tasklist_lock);
+       rcu_read_unlock();
         return retval;
  }
  
@@ -6556,7 +6614,7 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
         if (!param || pid < 0)
                 return -EINVAL;
  
-       read_lock(&tasklist_lock);
+       rcu_read_lock();
         p = find_process_by_pid(pid);
         retval = -ESRCH;
         if (!p)
@@ -6567,7 +6625,7 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
                 goto out_unlock;
  
         lp.sched_priority = p->rt_priority;
-       read_unlock(&tasklist_lock);
+       rcu_read_unlock();
  
         /*
          * This one might sleep, we cannot do it with a spinlock held ...
@@ -6577,7 +6635,7 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
         return retval;
  
  out_unlock:
-       read_unlock(&tasklist_lock);
+       rcu_read_unlock();
         return retval;
  }
  
@@ -6588,22 +6646,18 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
         int retval;
  
         get_online_cpus();
-       read_lock(&tasklist_lock);
+       rcu_read_lock();
  
         p = find_process_by_pid(pid);
         if (!p) {
-               read_unlock(&tasklist_lock);
+               rcu_read_unlock();
                 put_online_cpus();
                 return -ESRCH;
         }
  
-       /*
-        * It is not safe to call set_cpus_allowed with the
-        * tasklist_lock held. We will bump the task_struct's
-        * usage count and then drop tasklist_lock.
-        */
+       /* Prevent p going away */
         get_task_struct(p);
-       read_unlock(&tasklist_lock);
+       rcu_read_unlock();
  
         if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
                 retval = -ENOMEM;
@@ -6684,10 +6738,12 @@ SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
  long sched_getaffinity(pid_t pid, struct cpumask *mask)
  {
         struct task_struct *p;
+       unsigned long flags;
+       struct rq *rq;
         int retval;
  
         get_online_cpus();
-       read_lock(&tasklist_lock);
+       rcu_read_lock();
  
         retval = -ESRCH;
         p = find_process_by_pid(pid);
@@ -6698,10 +6754,12 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
         if (retval)
                 goto out_unlock;
  
+       rq = task_rq_lock(p, &flags);
         cpumask_and(mask, &p->cpus_allowed, cpu_online_mask);
+       task_rq_unlock(rq, &flags);
  
  out_unlock:
-       read_unlock(&tasklist_lock);
+       rcu_read_unlock();
         put_online_cpus();
  
         return retval;
@@ -6940,6 +6998,8 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
  {
         struct task_struct *p;
         unsigned int time_slice;
+       unsigned long flags;
+       struct rq *rq;
         int retval;
         struct timespec t;
  
@@ -6947,7 +7007,7 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
                 return -EINVAL;
  
         retval = -ESRCH;
-       read_lock(&tasklist_lock);
+       rcu_read_lock();
         p = find_process_by_pid(pid);
         if (!p)
                 goto out_unlock;
@@ -6956,15 +7016,17 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
         if (retval)
                 goto out_unlock;
  
-       time_slice = p->sched_class->get_rr_interval(p);
+       rq = task_rq_lock(p, &flags);
+       time_slice = p->sched_class->get_rr_interval(rq, p);
+       task_rq_unlock(rq, &flags);
  
-       read_unlock(&tasklist_lock);
+       rcu_read_unlock();
         jiffies_to_timespec(time_slice, &t);
         retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
         return retval;
  
  out_unlock:
-       read_unlock(&tasklist_lock);
+       rcu_read_unlock();
         return retval;
  }
  
@@ -6976,7 +7038,7 @@ void sched_show_task(struct task_struct *p)
         unsigned state;
  
         state = p->state ? __ffs(p->state) + 1 : 0;
-       printk(KERN_INFO "%-13.13s %c", p->comm,
+       printk(KERN_INFO "%-15.15s %c", p->comm,
                 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
  #if BITS_PER_LONG == 32
         if (state == TASK_RUNNING)
@@ -7055,6 +7117,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
         spin_lock_irqsave(&rq->lock, flags);
  
         __sched_fork(idle);
+       idle->state = TASK_RUNNING;
         idle->se.exec_start = sched_clock();
  
         cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu));
@@ -7149,7 +7212,19 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
         struct rq *rq;
         int ret = 0;
  
+       /*
+        * Serialize against TASK_WAKING so that ttwu() and wunt() can
+        * drop the rq->lock and still rely on ->cpus_allowed.
+        */
+again:
+       while (task_is_waking(p))
+               cpu_relax();
         rq = task_rq_lock(p, &flags);
+       if (task_is_waking(p)) {
+               task_rq_unlock(rq, &flags);
+               goto again;
+       }
+
         if (!cpumask_intersects(new_mask, cpu_active_mask)) {
                 ret = -EINVAL;
                 goto out;
@@ -7178,7 +7253,7 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
  
                 get_task_struct(mt);
                 task_rq_unlock(rq, &flags);
-               wake_up_process(rq->migration_thread);
+               wake_up_process(mt);
                 put_task_struct(mt);
                 wait_for_completion(&req.done);
                 tlb_migrate_finish(p->mm);
@@ -7205,7 +7280,7 @@ EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
  static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
  {
         struct rq *rq_dest, *rq_src;
-       int ret = 0, on_rq;
+       int ret = 0;
  
         if (unlikely(!cpu_active(dest_cpu)))
                 return ret;
@@ -7217,19 +7292,17 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
         /* Already moved. */
         if (task_cpu(p) != src_cpu)
                 goto done;
-       /* Waking up, don't get in the way of try_to_wake_up(). */
-       if (p->state == TASK_WAKING)
-               goto fail;
         /* Affinity changed (again). */
         if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
                 goto fail;
  
-       on_rq = p->se.on_rq;
-       if (on_rq)
+       /*
+        * If we're not on a rq, the next wake-up will ensure we're
+        * placed properly.
+        */
+       if (p->se.on_rq) {
                 deactivate_task(rq_src, p, 0);
-
-       set_task_cpu(p, dest_cpu);
-       if (on_rq) {
+               set_task_cpu(p, dest_cpu);
                 activate_task(rq_dest, p, 0);
                 check_preempt_curr(rq_dest, p, 0);
         }
@@ -7308,57 +7381,29 @@ static int migration_thread(void *data)
  }
  
  #ifdef CONFIG_HOTPLUG_CPU
-
-static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu)
-{
-       int ret;
-
-       local_irq_disable();
-       ret = __migrate_task(p, src_cpu, dest_cpu);
-       local_irq_enable();
-       return ret;
-}
-
  /*
   * Figure out where task on dead CPU should go, use force if necessary.
   */
-static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
+void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
  {
-       int dest_cpu;
-       const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(dead_cpu));
-
-again:
-       /* Look for allowed, online CPU in same node. */
-       for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask)
-               if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
-                       goto move;
-
-       /* Any allowed, online CPU? */
-       dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_active_mask);
-       if (dest_cpu < nr_cpu_ids)
-               goto move;
-
-       /* No more Mr. Nice Guy. */
-       if (dest_cpu >= nr_cpu_ids) {
-               cpuset_cpus_allowed_locked(p, &p->cpus_allowed);
-               dest_cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed);
+       struct rq *rq = cpu_rq(dead_cpu);
+       int needs_cpu, uninitialized_var(dest_cpu);
+       unsigned long flags;
  
-               /*
-                * Don't tell them about moving exiting tasks or
-                * kernel threads (both mm NULL), since they never
-                * leave kernel.
-                */
-               if (p->mm && printk_ratelimit()) {
-                       printk(KERN_INFO "process %d (%s) no "
-                              "longer affine to cpu%d\n",
-                              task_pid_nr(p), p->comm, dead_cpu);
-               }
-       }
+       local_irq_save(flags);
  
-move:
-       /* It can have affinity changed while we were choosing. */
-       if (unlikely(!__migrate_task_irq(p, dead_cpu, dest_cpu)))
-               goto again;
+       spin_lock(&rq->lock);
+       needs_cpu = (task_cpu(p) == dead_cpu) && (p->state != TASK_WAKING);
+       if (needs_cpu)
+               dest_cpu = select_fallback_rq(dead_cpu, p);
+       spin_unlock(&rq->lock);
+       /*
+        * It can only fail if we race with set_cpus_allowed(),
+        * in the racer should migrate the task anyway.
+        */
+       if (needs_cpu)
+               __migrate_task(p, dead_cpu, dest_cpu);
+       local_irq_restore(flags);
  }
  
  /*
@@ -7706,10 +7751,9 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
         unsigned long flags;
         struct rq *rq;
  
-       switch (action) {
+       switch (action & ~CPU_TASKS_FROZEN) {
  
         case CPU_UP_PREPARE:
-       case CPU_UP_PREPARE_FROZEN:
                 p = kthread_create(migration_thread, hcpu, "migration/%d", cpu);
                 if (IS_ERR(p))
                         return NOTIFY_BAD;
@@ -7724,7 +7768,6 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
                 break;
  
         case CPU_ONLINE:
-       case CPU_ONLINE_FROZEN:
                 /* Strictly unnecessary, as first user will wake it. */
                 wake_up_process(cpu_rq(cpu)->migration_thread);
  
@@ -7741,7 +7784,6 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
  
  #ifdef CONFIG_HOTPLUG_CPU
         case CPU_UP_CANCELED:
-       case CPU_UP_CANCELED_FROZEN:
                 if (!cpu_rq(cpu)->migration_thread)
                         break;
                 /* Unbind it from offline cpu so it can run. Fall thru. */
@@ -7752,14 +7794,22 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
                 cpu_rq(cpu)->migration_thread = NULL;
                 break;
  
-       case CPU_DEAD:
-       case CPU_DEAD_FROZEN:
-               cpuset_lock(); /* around calls to cpuset_cpus_allowed_lock() */
-               migrate_live_tasks(cpu);
+       case CPU_POST_DEAD:
+               /*
+                * Bring the migration thread down in CPU_POST_DEAD event,
+                * since the timers should have got migrated by now and thus
+                * we should not see a deadlock between trying to kill the
+                * migration thread and the sched_rt_period_timer.
+                */
                 rq = cpu_rq(cpu);
                 kthread_stop(rq->migration_thread);
                 put_task_struct(rq->migration_thread);
                 rq->migration_thread = NULL;
+               break;
+
+       case CPU_DEAD:
+               migrate_live_tasks(cpu);
+               rq = cpu_rq(cpu);
                 /* Idle task back to normal (off runqueue, low prio) */
                 spin_lock_irq(&rq->lock);
                 update_rq_clock(rq);
@@ -7768,7 +7818,6 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
                 rq->idle->sched_class = &idle_sched_class;
                 migrate_dead_tasks(cpu);
                 spin_unlock_irq(&rq->lock);
-               cpuset_unlock();
                 migrate_nr_uninterruptible(rq);
                 BUG_ON(rq->nr_running != 0);
                 calc_global_load_remove(rq);
@@ -7792,7 +7841,6 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
                 break;
  
         case CPU_DYING:
-       case CPU_DYING_FROZEN:
                 /* Update our root-domain */
                 rq = cpu_rq(cpu);
                 spin_lock_irqsave(&rq->lock, flags);
@@ -8112,6 +8160,9 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
         struct rq *rq = cpu_rq(cpu);
         struct sched_domain *tmp;
  
+       for (tmp = sd; tmp; tmp = tmp->parent)
+               tmp->span_weight = cpumask_weight(sched_domain_span(tmp));
+
         /* Remove the sched domains which do not contribute to scheduling. */
         for (tmp = sd; tmp; ) {
                 struct sched_domain *parent = tmp->parent;
@@ -9693,13 +9744,24 @@ static inline int preempt_count_equals(int preempt_offset)
         return (nested == PREEMPT_INATOMIC_BASE + preempt_offset);
  }
  
+static int __might_sleep_init_called;
+int __init __might_sleep_init(void)
+{
+       __might_sleep_init_called = 1;
+       return 0;
+}
+early_initcall(__might_sleep_init);
+
  void __might_sleep(char *file, int line, int preempt_offset)
  {
  #ifdef in_atomic
         static unsigned long prev_jiffy;        /* ratelimiting */
  
         if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) ||
-           system_state != SYSTEM_RUNNING || oops_in_progress)
+           oops_in_progress)
+               return;
+       if (system_state != SYSTEM_RUNNING &&
+           (!__might_sleep_init_called || system_state != SYSTEM_BOOTING))
                 return;
         if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
                 return;
@@ -10099,13 +10161,13 @@ void sched_move_task(struct task_struct *tsk)
  
  #ifdef CONFIG_FAIR_GROUP_SCHED
         if (tsk->sched_class->moved_group)
-               tsk->sched_class->moved_group(tsk);
+               tsk->sched_class->moved_group(tsk, on_rq);
  #endif
  
         if (unlikely(running))
                 tsk->sched_class->set_curr_task(rq);
         if (on_rq)
-               enqueue_task(rq, tsk, 0);
+               enqueue_task(rq, tsk, 0, false);
  
         task_rq_unlock(rq, &flags);
  }
@@ -10519,6 +10581,15 @@ cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
  static int
  cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
  {
+       if ((current != tsk) && (!capable(CAP_SYS_NICE))) {
+               const struct cred *cred = current_cred(), *tcred;
+
+               tcred = __task_cred(tsk);
+
+               if (cred->euid != tcred->uid && cred->euid != tcred->suid)
+                       return -EPERM;
+       }
+
  #ifdef CONFIG_RT_GROUP_SCHED
         if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk))
                 return -EINVAL;
@@ -10876,6 +10947,23 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
         rcu_read_unlock();
  }
  
+/*
+ * When CONFIG_VIRT_CPU_ACCOUNTING is enabled one jiffy can be very large
+ * in cputime_t units. As a result, cpuacct_update_stats calls
+ * percpu_counter_add with values large enough to always overflow the
+ * per cpu batch limit causing bad SMP scalability.
+ *
+ * To fix this we scale percpu_counter_batch by cputime_one_jiffy so we
+ * batch the same amount of time with CONFIG_VIRT_CPU_ACCOUNTING disabled
+ * and enabled. We cap it at INT_MAX which is the largest allowed batch value.
+ */
+#ifdef CONFIG_SMP
+#define CPUACCT_BATCH  \
+       min_t(long, percpu_counter_batch * cputime_one_jiffy, INT_MAX)
+#else
+#define CPUACCT_BATCH  0
+#endif
+
  /*
   * Charge the system/user time to the task's accounting group.
   */
@@ -10883,6 +10971,7 @@ static void cpuacct_update_stats(struct task_struct *tsk,
                 enum cpuacct_stat_index idx, cputime_t val)
  {
         struct cpuacct *ca;
+       int batch = CPUACCT_BATCH;
  
         if (unlikely(!cpuacct_subsys.active))
                 return;
@@ -10891,7 +10980,7 @@ static void cpuacct_update_stats(struct task_struct *tsk,
         ca = task_ca(tsk);
  
         do {
-               percpu_counter_add(&ca->cpustat[idx], val);
+               __percpu_counter_add(&ca->cpustat[idx], val, batch);
                 ca = ca->parent;
         } while (ca);
         rcu_read_unlock();