kernel/sched.c

   1 /*
   2  *  kernel/sched.c
   3  *
   4  *  Kernel scheduler and related syscalls
   5  *
   6  *  Copyright (C) 1991-2002  Linus Torvalds
   7  *
   8  *  1996-12-23  Modified by Dave Grothe to fix bugs in semaphores and
   9  *              make semaphores SMP safe
  10  *  1998-11-19  Implemented schedule_timeout() and related stuff
  11  *              by Andrea Arcangeli
  12  *  2002-01-04  New ultra-scalable O(1) scheduler by Ingo Molnar:
  13  *              hybrid priority-list and round-robin design with
  14  *              an array-switch method of distributing timeslices
  15  *              and per-CPU runqueues.  Cleanups and useful suggestions
  16  *              by Davide Libenzi, preemptible kernel bits by Robert Love.
  17  *  2003-09-03  Interactivity tuning by Con Kolivas.
  18  *  2004-04-02  Scheduler domains code by Nick Piggin
  19  *  2007-04-15  Work begun on replacing all interactivity tuning with a
  20  *              fair scheduling design by Con Kolivas.
  21  *  2007-05-05  Load balancing (smp-nice) and other improvements
  22  *              by Peter Williams
  23  *  2007-05-06  Interactivity improvements to CFS by Mike Galbraith
  24  *  2007-07-01  Group scheduling enhancements by Srivatsa Vaddagiri
  25  *  2007-11-29  RT balancing improvements by Steven Rostedt, Gregory Haskins,
  26  *              Thomas Gleixner, Mike Kravetz
  27  */
  28
  29 #include <linux/mm.h>
  30 #include <linux/module.h>
  31 #include <linux/nmi.h>
  32 #include <linux/init.h>
  33 #include <linux/uaccess.h>
  34 #include <linux/highmem.h>
  35 #include <asm/mmu_context.h>
  36 #include <linux/interrupt.h>
  37 #include <linux/capability.h>
  38 #include <linux/completion.h>
  39 #include <linux/kernel_stat.h>
  40 #include <linux/debug_locks.h>
  41 #include <linux/perf_event.h>
  42 #include <linux/security.h>
  43 #include <linux/notifier.h>
  44 #include <linux/profile.h>
  45 #include <linux/freezer.h>
  46 #include <linux/vmalloc.h>
  47 #include <linux/blkdev.h>
  48 #include <linux/delay.h>
  49 #include <linux/pid_namespace.h>
  50 #include <linux/smp.h>
  51 #include <linux/threads.h>
  52 #include <linux/timer.h>
  53 #include <linux/rcupdate.h>
  54 #include <linux/cpu.h>
  55 #include <linux/cpuset.h>
  56 #include <linux/percpu.h>
  57 #include <linux/proc_fs.h>
  58 #include <linux/seq_file.h>
  59 #include <linux/stop_machine.h>
  60 #include <linux/sysctl.h>
  61 #include <linux/syscalls.h>
  62 #include <linux/times.h>
  63 #include <linux/tsacct_kern.h>
  64 #include <linux/kprobes.h>
  65 #include <linux/delayacct.h>
  66 #include <linux/unistd.h>
  67 #include <linux/pagemap.h>
  68 #include <linux/hrtimer.h>
  69 #include <linux/tick.h>
  70 #include <linux/debugfs.h>
  71 #include <linux/ctype.h>
  72 #include <linux/ftrace.h>
  73 #include <linux/slab.h>
  74
  75 #include <asm/tlb.h>
  76 #include <asm/irq_regs.h>
  77 #include <asm/mutex.h>
  78
  79 #include "sched_cpupri.h"
  80 #include "workqueue_sched.h"
  81 #include "sched_autogroup.h"
  82
  83 #define CREATE_TRACE_POINTS
  84 #include <trace/events/sched.h>
  85
  86 /*
  87  * Convert user-nice values [ -20 ... 0 ... 19 ]
  88  * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
  89  * and back.
  90  */
  91 #define NICE_TO_PRIO(nice)      (MAX_RT_PRIO + (nice) + 20)
  92 #define PRIO_TO_NICE(prio)      ((prio) - MAX_RT_PRIO - 20)
  93 #define TASK_NICE(p)            PRIO_TO_NICE((p)->static_prio)
  94
  95 /*
  96  * 'User priority' is the nice value converted to something we
  97  * can work with better when scaling various scheduler parameters,
  98  * it's a [ 0 ... 39 ] range.
  99  */
 100 #define USER_PRIO(p)            ((p)-MAX_RT_PRIO)
 101 #define TASK_USER_PRIO(p)       USER_PRIO((p)->static_prio)
 102 #define MAX_USER_PRIO           (USER_PRIO(MAX_PRIO))
 103
 104 /*
 105  * Helpers for converting nanosecond timing to jiffy resolution
 106  */
 107 #define NS_TO_JIFFIES(TIME)     ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))
 108
 109 #define NICE_0_LOAD             SCHED_LOAD_SCALE
 110 #define NICE_0_SHIFT            SCHED_LOAD_SHIFT
 111
 112 /*
 113  * These are the 'tuning knobs' of the scheduler:
 114  *
 115  * default timeslice is 100 msecs (used only for SCHED_RR tasks).
 116  * Timeslices get refilled after they expire.
 117  */
 118 #define DEF_TIMESLICE           (100 * HZ / 1000)
 119
 120 /*
 121  * single value that denotes runtime == period, ie unlimited time.
 122  */
 123 #define RUNTIME_INF     ((u64)~0ULL)
 124
 125 static inline int rt_policy(int policy)
 126 {
 127         if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR))
 128                 return 1;
 129         return 0;
 130 }
 131
 132 static inline int task_has_rt_policy(struct task_struct *p)
 133 {
 134         return rt_policy(p->policy);
 135 }
 136
 137 /*
 138  * This is the priority-queue data structure of the RT scheduling class:
 139  */
 140 struct rt_prio_array {
 141         DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */
 142         struct list_head queue[MAX_RT_PRIO];
 143 };
 144
 145 struct rt_bandwidth {
 146         /* nests inside the rq lock: */
 147         raw_spinlock_t          rt_runtime_lock;
 148         ktime_t                 rt_period;
 149         u64                     rt_runtime;
 150         struct hrtimer          rt_period_timer;
 151 };
 152
 153 static struct rt_bandwidth def_rt_bandwidth;
 154
 155 static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
 156
 157 static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
 158 {
 159         struct rt_bandwidth *rt_b =
 160                 container_of(timer, struct rt_bandwidth, rt_period_timer);
 161         ktime_t now;
 162         int overrun;
 163         int idle = 0;
 164
 165         for (;;) {
 166                 now = hrtimer_cb_get_time(timer);
 167                 overrun = hrtimer_forward(timer, now, rt_b->rt_period);
 168
 169                 if (!overrun)
 170                         break;
 171
 172                 idle = do_sched_rt_period_timer(rt_b, overrun);
 173         }
 174
 175         return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
 176 }
 177
 178 static
 179 void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
 180 {
 181         rt_b->rt_period = ns_to_ktime(period);
 182         rt_b->rt_runtime = runtime;
 183
 184         raw_spin_lock_init(&rt_b->rt_runtime_lock);
 185
 186         hrtimer_init(&rt_b->rt_period_timer,
 187                         CLOCK_MONOTONIC, HRTIMER_MODE_REL);
 188         rt_b->rt_period_timer.function = sched_rt_period_timer;
 189 }
 190
 191 static inline int rt_bandwidth_enabled(void)
 192 {
 193         return sysctl_sched_rt_runtime >= 0;
 194 }
 195
 196 static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
 197 {
 198         ktime_t now;
 199
 200         if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
 201                 return;
 202
 203         if (hrtimer_active(&rt_b->rt_period_timer))
 204                 return;
 205
 206         raw_spin_lock(&rt_b->rt_runtime_lock);
 207         for (;;) {
 208                 unsigned long delta;
 209                 ktime_t soft, hard;
 210
 211                 if (hrtimer_active(&rt_b->rt_period_timer))
 212                         break;
 213
 214                 now = hrtimer_cb_get_time(&rt_b->rt_period_timer);
 215                 hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period);
 216
 217                 soft = hrtimer_get_softexpires(&rt_b->rt_period_timer);
 218                 hard = hrtimer_get_expires(&rt_b->rt_period_timer);
 219                 delta = ktime_to_ns(ktime_sub(hard, soft));
 220                 __hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta,
 221                                 HRTIMER_MODE_ABS_PINNED, 0);
 222         }
 223         raw_spin_unlock(&rt_b->rt_runtime_lock);
 224 }
 225
 226 #ifdef CONFIG_RT_GROUP_SCHED
 227 static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
 228 {
 229         hrtimer_cancel(&rt_b->rt_period_timer);
 230 }
 231 #endif
 232
 233 /*
 234  * sched_domains_mutex serializes calls to init_sched_domains,
 235  * detach_destroy_domains and partition_sched_domains.
 236  */
 237 static DEFINE_MUTEX(sched_domains_mutex);
 238
 239 #ifdef CONFIG_CGROUP_SCHED
 240
 241 #include <linux/cgroup.h>
 242
 243 struct cfs_rq;
 244
 245 static LIST_HEAD(task_groups);
 246
 247 /* task group related information */
 248 struct task_group {
 249         struct cgroup_subsys_state css;
 250
 251 #ifdef CONFIG_FAIR_GROUP_SCHED
 252         /* schedulable entities of this group on each cpu */
 253         struct sched_entity **se;
 254         /* runqueue "owned" by this group on each cpu */
 255         struct cfs_rq **cfs_rq;
 256         unsigned long shares;
 257
 258         atomic_t load_weight;
 259 #endif
 260
 261 #ifdef CONFIG_RT_GROUP_SCHED
 262         struct sched_rt_entity **rt_se;
 263         struct rt_rq **rt_rq;
 264
 265         struct rt_bandwidth rt_bandwidth;
 266 #endif
 267
 268         struct rcu_head rcu;
 269         struct list_head list;
 270
 271         struct task_group *parent;
 272         struct list_head siblings;
 273         struct list_head children;
 274
 275 #ifdef CONFIG_SCHED_AUTOGROUP
 276         struct autogroup *autogroup;
 277 #endif
 278 };
 279
 280 /* task_group_lock serializes the addition/removal of task groups */
 281 static DEFINE_SPINLOCK(task_group_lock);
 282
 283 #ifdef CONFIG_FAIR_GROUP_SCHED
 284
 285 # define ROOT_TASK_GROUP_LOAD   NICE_0_LOAD
 286
 287 /*
 288  * A weight of 0 or 1 can cause arithmetics problems.
 289  * A weight of a cfs_rq is the sum of weights of which entities
 290  * are queued on this cfs_rq, so a weight of a entity should not be
 291  * too large, so as the shares value of a task group.
 292  * (The default weight is 1024 - so there's no practical
 293  *  limitation from this.)
 294  */
 295 #define MIN_SHARES      (1UL <<  1)
 296 #define MAX_SHARES      (1UL << 18)
 297
 298 static int root_task_group_load = ROOT_TASK_GROUP_LOAD;
 299 #endif
 300
 301 /* Default task group.
 302  *      Every task in system belong to this group at bootup.
 303  */
 304 struct task_group root_task_group;
 305
 306 #endif  /* CONFIG_CGROUP_SCHED */
 307
 308 /* CFS-related fields in a runqueue */
 309 struct cfs_rq {
 310         struct load_weight load;
 311         unsigned long nr_running;
 312
 313         u64 exec_clock;
 314         u64 min_vruntime;
 315 #ifndef CONFIG_64BIT
 316         u64 min_vruntime_copy;
 317 #endif
 318
 319         struct rb_root tasks_timeline;
 320         struct rb_node *rb_leftmost;
 321
 322         struct list_head tasks;
 323         struct list_head *balance_iterator;
 324
 325         /*
 326          * 'curr' points to currently running entity on this cfs_rq.
 327          * It is set to NULL otherwise (i.e when none are currently running).
 328          */
 329         struct sched_entity *curr, *next, *last, *skip;
 330
 331 #ifdef  CONFIG_SCHED_DEBUG
 332         unsigned int nr_spread_over;
 333 #endif
 334
 335 #ifdef CONFIG_FAIR_GROUP_SCHED
 336         struct rq *rq;  /* cpu runqueue to which this cfs_rq is attached */
 337
 338         /*
 339          * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
 340          * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
 341          * (like users, containers etc.)
 342          *
 343          * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
 344          * list is used during load balance.
 345          */
 346         int on_list;
 347         struct list_head leaf_cfs_rq_list;
 348         struct task_group *tg;  /* group that "owns" this runqueue */
 349
 350 #ifdef CONFIG_SMP
 351         /*
 352          * the part of load.weight contributed by tasks
 353          */
 354         unsigned long task_weight;
 355
 356         /*
 357          *   h_load = weight * f(tg)
 358          *
 359          * Where f(tg) is the recursive weight fraction assigned to
 360          * this group.
 361          */
 362         unsigned long h_load;
 363
 364         /*
 365          * Maintaining per-cpu shares distribution for group scheduling
 366          *
 367          * load_stamp is the last time we updated the load average
 368          * load_last is the last time we updated the load average and saw load
 369          * load_unacc_exec_time is currently unaccounted execution time
 370          */
 371         u64 load_avg;
 372         u64 load_period;
 373         u64 load_stamp, load_last, load_unacc_exec_time;
 374
 375         unsigned long load_contribution;
 376 #endif
 377 #endif
 378 };
 379
 380 /* Real-Time classes' related field in a runqueue: */
 381 struct rt_rq {
 382         struct rt_prio_array active;
 383         unsigned long rt_nr_running;
 384 #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
 385         struct {
 386                 int curr; /* highest queued rt task prio */
 387 #ifdef CONFIG_SMP
 388                 int next; /* next highest */
 389 #endif
 390         } highest_prio;
 391 #endif
 392 #ifdef CONFIG_SMP
 393         unsigned long rt_nr_migratory;
 394         unsigned long rt_nr_total;
 395         int overloaded;
 396         struct plist_head pushable_tasks;
 397 #endif
 398         int rt_throttled;
 399         u64 rt_time;
 400         u64 rt_runtime;
 401         /* Nests inside the rq lock: */
 402         raw_spinlock_t rt_runtime_lock;
 403
 404 #ifdef CONFIG_RT_GROUP_SCHED
 405         unsigned long rt_nr_boosted;
 406
 407         struct rq *rq;
 408         struct list_head leaf_rt_rq_list;
 409         struct task_group *tg;
 410 #endif
 411 };
 412
 413 #ifdef CONFIG_SMP
 414
 415 /*
 416  * We add the notion of a root-domain which will be used to define per-domain
 417  * variables. Each exclusive cpuset essentially defines an island domain by
 418  * fully partitioning the member cpus from any other cpuset. Whenever a new
 419  * exclusive cpuset is created, we also create and attach a new root-domain
 420  * object.
 421  *
 422  */
 423 struct root_domain {
 424         atomic_t refcount;
 425         struct rcu_head rcu;
 426         cpumask_var_t span;
 427         cpumask_var_t online;
 428
 429         /*
 430          * The "RT overload" flag: it gets set if a CPU has more than
 431          * one runnable RT task.
 432          */
 433         cpumask_var_t rto_mask;
 434         atomic_t rto_count;
 435         struct cpupri cpupri;
 436 };
 437
 438 /*
 439  * By default the system creates a single root-domain with all cpus as
 440  * members (mimicking the global state we have today).
 441  */
 442 static struct root_domain def_root_domain;
 443
 444 #endif /* CONFIG_SMP */
 445
 446 /*
 447  * This is the main, per-CPU runqueue data structure.
 448  *
 449  * Locking rule: those places that want to lock multiple runqueues
 450  * (such as the load balancing or the thread migration code), lock
 451  * acquire operations must be ordered by ascending &runqueue.
 452  */
 453 struct rq {
 454         /* runqueue lock: */
 455         raw_spinlock_t lock;
 456
 457         /*
 458          * nr_running and cpu_load should be in the same cacheline because
 459          * remote CPUs use both these fields when doing load calculation.
 460          */
 461         unsigned long nr_running;
 462         #define CPU_LOAD_IDX_MAX 5
 463         unsigned long cpu_load[CPU_LOAD_IDX_MAX];
 464         unsigned long last_load_update_tick;
 465 #ifdef CONFIG_NO_HZ
 466         u64 nohz_stamp;
 467         unsigned char nohz_balance_kick;
 468 #endif
 469         int skip_clock_update;
 470
 471         /* capture load from *all* tasks on this cpu: */
 472         struct load_weight load;
 473         unsigned long nr_load_updates;
 474         u64 nr_switches;
 475
 476         struct cfs_rq cfs;
 477         struct rt_rq rt;
 478
 479 #ifdef CONFIG_FAIR_GROUP_SCHED
 480         /* list of leaf cfs_rq on this cpu: */
 481         struct list_head leaf_cfs_rq_list;
 482 #endif
 483 #ifdef CONFIG_RT_GROUP_SCHED
 484         struct list_head leaf_rt_rq_list;
 485 #endif
 486
 487         /*
 488          * This is part of a global counter where only the total sum
 489          * over all CPUs matters. A task can increase this counter on
 490          * one CPU and if it got migrated afterwards it may decrease
 491          * it on another CPU. Always updated under the runqueue lock:
 492          */
 493         unsigned long nr_uninterruptible;
 494
 495         struct task_struct *curr, *idle, *stop;
 496         unsigned long next_balance;
 497         struct mm_struct *prev_mm;
 498
 499         u64 clock;
 500         u64 clock_task;
 501
 502         atomic_t nr_iowait;
 503
 504 #ifdef CONFIG_SMP
 505         struct root_domain *rd;
 506         struct sched_domain *sd;
 507
 508         unsigned long cpu_power;
 509
 510         unsigned char idle_at_tick;
 511         /* For active balancing */
 512         int post_schedule;
 513         int active_balance;
 514         int push_cpu;
 515         struct cpu_stop_work active_balance_work;
 516         /* cpu of this runqueue: */
 517         int cpu;
 518         int online;
 519
 520         unsigned long avg_load_per_task;
 521
 522         u64 rt_avg;
 523         u64 age_stamp;
 524         u64 idle_stamp;
 525         u64 avg_idle;
 526 #endif
 527
 528 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
 529         u64 prev_irq_time;
 530 #endif
 531
 532         /* calc_load related fields */
 533         unsigned long calc_load_update;
 534         long calc_load_active;
 535
 536 #ifdef CONFIG_SCHED_HRTICK
 537 #ifdef CONFIG_SMP
 538         int hrtick_csd_pending;
 539         struct call_single_data hrtick_csd;
 540 #endif
 541         struct hrtimer hrtick_timer;
 542 #endif
 543
 544 #ifdef CONFIG_SCHEDSTATS
 545         /* latency stats */
 546         struct sched_info rq_sched_info;
 547         unsigned long long rq_cpu_time;
 548         /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
 549
 550         /* sys_sched_yield() stats */
 551         unsigned int yld_count;
 552
 553         /* schedule() stats */
 554         unsigned int sched_switch;
 555         unsigned int sched_count;
 556         unsigned int sched_goidle;
 557
 558         /* try_to_wake_up() stats */
 559         unsigned int ttwu_count;
 560         unsigned int ttwu_local;
 561 #endif
 562
 563 #ifdef CONFIG_SMP
 564         struct task_struct *wake_list;
 565 #endif
 566 };
 567
 568 static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
 569
 570
 571 static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags);
 572
 573 static inline int cpu_of(struct rq *rq)
 574 {
 575 #ifdef CONFIG_SMP
 576         return rq->cpu;
 577 #else
 578         return 0;
 579 #endif
 580 }
 581
 582 #define rcu_dereference_check_sched_domain(p) \
 583         rcu_dereference_check((p), \
 584                               rcu_read_lock_held() || \
 585                               lockdep_is_held(&sched_domains_mutex))
 586
 587 /*
 588  * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
 589  * See detach_destroy_domains: synchronize_sched for details.
 590  *
 591  * The domain tree of any CPU may only be accessed from within
 592  * preempt-disabled sections.
 593  */
 594 #define for_each_domain(cpu, __sd) \
 595         for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)
 596
 597 #define cpu_rq(cpu)             (&per_cpu(runqueues, (cpu)))
 598 #define this_rq()               (&__get_cpu_var(runqueues))
 599 #define task_rq(p)              cpu_rq(task_cpu(p))
 600 #define cpu_curr(cpu)           (cpu_rq(cpu)->curr)
 601 #define raw_rq()                (&__raw_get_cpu_var(runqueues))
 602
 603 #ifdef CONFIG_CGROUP_SCHED
 604
 605 /*
 606  * Return the group to which this tasks belongs.
 607  *
 608  * We cannot use task_subsys_state() and friends because the cgroup
 609  * subsystem changes that value before the cgroup_subsys::attach() method
 610  * is called, therefore we cannot pin it and might observe the wrong value.
 611  *
 612  * The same is true for autogroup's p->signal->autogroup->tg, the autogroup
 613  * core changes this before calling sched_move_task().
 614  *
 615  * Instead we use a 'copy' which is updated from sched_move_task() while
 616  * holding both task_struct::pi_lock and rq::lock.
 617  */
 618 static inline struct task_group *task_group(struct task_struct *p)
 619 {
 620         return p->sched_task_group;
 621 }
 622
 623 /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
 624 static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
 625 {
 626 #ifdef CONFIG_FAIR_GROUP_SCHED
 627         p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
 628         p->se.parent = task_group(p)->se[cpu];
 629 #endif
 630
 631 #ifdef CONFIG_RT_GROUP_SCHED
 632         p->rt.rt_rq  = task_group(p)->rt_rq[cpu];
 633         p->rt.parent = task_group(p)->rt_se[cpu];
 634 #endif
 635 }
 636
 637 #else /* CONFIG_CGROUP_SCHED */
 638
 639 static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
 640 static inline struct task_group *task_group(struct task_struct *p)
 641 {
 642         return NULL;
 643 }
 644
 645 #endif /* CONFIG_CGROUP_SCHED */
 646
 647 static void update_rq_clock_task(struct rq *rq, s64 delta);
 648
 649 static void update_rq_clock(struct rq *rq)
 650 {
 651         s64 delta;
 652
 653         if (rq->skip_clock_update > 0)
 654                 return;
 655
 656         delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
 657         rq->clock += delta;
 658         update_rq_clock_task(rq, delta);
 659 }
 660
 661 /*
 662  * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
 663  */
 664 #ifdef CONFIG_SCHED_DEBUG
 665 # define const_debug __read_mostly
 666 #else
 667 # define const_debug static const
 668 #endif
 669
 670 /**
 671  * runqueue_is_locked - Returns true if the current cpu runqueue is locked
 672  * @cpu: the processor in question.
 673  *
 674  * This interface allows printk to be called with the runqueue lock
 675  * held and know whether or not it is OK to wake up the klogd.
 676  */
 677 int runqueue_is_locked(int cpu)
 678 {
 679         return raw_spin_is_locked(&cpu_rq(cpu)->lock);
 680 }
 681
 682 /*
 683  * Debugging: various feature bits
 684  */
 685
 686 #define SCHED_FEAT(name, enabled)       \
 687         __SCHED_FEAT_##name ,
 688
 689 enum {
 690 #include "sched_features.h"
 691 };
 692
 693 #undef SCHED_FEAT
 694
 695 #define SCHED_FEAT(name, enabled)       \
 696         (1UL << __SCHED_FEAT_##name) * enabled |
 697
 698 const_debug unsigned int sysctl_sched_features =
 699 #include "sched_features.h"
 700         0;
 701
 702 #undef SCHED_FEAT
 703
 704 #ifdef CONFIG_SCHED_DEBUG
 705 #define SCHED_FEAT(name, enabled)       \
 706         #name ,
 707
 708 static __read_mostly char *sched_feat_names[] = {
 709 #include "sched_features.h"
 710         NULL
 711 };
 712
 713 #undef SCHED_FEAT
 714
 715 static int sched_feat_show(struct seq_file *m, void *v)
 716 {
 717         int i;
 718
 719         for (i = 0; sched_feat_names[i]; i++) {
 720                 if (!(sysctl_sched_features & (1UL << i)))
 721                         seq_puts(m, "NO_");
 722                 seq_printf(m, "%s ", sched_feat_names[i]);
 723         }
 724         seq_puts(m, "\n");
 725
 726         return 0;
 727 }
 728
 729 static ssize_t
 730 sched_feat_write(struct file *filp, const char __user *ubuf,
 731                 size_t cnt, loff_t *ppos)
 732 {
 733         char buf[64];
 734         char *cmp;
 735         int neg = 0;
 736         int i;
 737
 738         if (cnt > 63)
 739                 cnt = 63;
 740
 741         if (copy_from_user(&buf, ubuf, cnt))
 742                 return -EFAULT;
 743
 744         buf[cnt] = 0;
 745         cmp = strstrip(buf);
 746
 747         if (strncmp(cmp, "NO_", 3) == 0) {
 748                 neg = 1;
 749                 cmp += 3;
 750         }
 751
 752         for (i = 0; sched_feat_names[i]; i++) {
 753                 if (strcmp(cmp, sched_feat_names[i]) == 0) {
 754                         if (neg)
 755                                 sysctl_sched_features &= ~(1UL << i);
 756                         else
 757                                 sysctl_sched_features |= (1UL << i);
 758                         break;
 759                 }
 760         }
 761
 762         if (!sched_feat_names[i])
 763                 return -EINVAL;
 764
 765         *ppos += cnt;
 766
 767         return cnt;
 768 }
 769
 770 static int sched_feat_open(struct inode *inode, struct file *filp)
 771 {
 772         return single_open(filp, sched_feat_show, NULL);
 773 }
 774
 775 static const struct file_operations sched_feat_fops = {
 776         .open           = sched_feat_open,
 777         .write          = sched_feat_write,
 778         .read           = seq_read,
 779         .llseek         = seq_lseek,
 780         .release        = single_release,
 781 };
 782
 783 static __init int sched_init_debug(void)
 784 {
 785         debugfs_create_file("sched_features", 0644, NULL, NULL,
 786                         &sched_feat_fops);
 787
 788         return 0;
 789 }
 790 late_initcall(sched_init_debug);
 791
 792 #endif
 793
 794 #define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
 795
 796 /*
 797  * Number of tasks to iterate in a single balance run.
 798  * Limited because this is done with IRQs disabled.
 799  */
 800 const_debug unsigned int sysctl_sched_nr_migrate = 32;
 801
 802 /*
 803  * period over which we average the RT time consumption, measured
 804  * in ms.
 805  *
 806  * default: 1s
 807  */
 808 const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;
 809
 810 /*
 811  * period over which we measure -rt task cpu usage in us.
 812  * default: 1s
 813  */
 814 unsigned int sysctl_sched_rt_period = 1000000;
 815
 816 static __read_mostly int scheduler_running;
 817
 818 /*
 819  * part of the period that we allow rt tasks to run in us.
 820  * default: 0.95s
 821  */
 822 int sysctl_sched_rt_runtime = 950000;
 823
 824 static inline u64 global_rt_period(void)
 825 {
 826         return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
 827 }
 828
 829 static inline u64 global_rt_runtime(void)
 830 {
 831         if (sysctl_sched_rt_runtime < 0)
 832                 return RUNTIME_INF;
 833
 834         return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
 835 }
 836
 837 #ifndef prepare_arch_switch
 838 # define prepare_arch_switch(next)      do { } while (0)
 839 #endif
 840 #ifndef finish_arch_switch
 841 # define finish_arch_switch(prev)       do { } while (0)
 842 #endif
 843
 844 static inline int task_current(struct rq *rq, struct task_struct *p)
 845 {
 846         return rq->curr == p;
 847 }
 848
 849 static inline int task_running(struct rq *rq, struct task_struct *p)
 850 {
 851 #ifdef CONFIG_SMP
 852         return p->on_cpu;
 853 #else
 854         return task_current(rq, p);
 855 #endif
 856 }
 857
 858 #ifndef __ARCH_WANT_UNLOCKED_CTXSW
 859 static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
 860 {
 861 #ifdef CONFIG_SMP
 862         /*
 863          * We can optimise this out completely for !SMP, because the
 864          * SMP rebalancing from interrupt is the only thing that cares
 865          * here.
 866          */
 867         next->on_cpu = 1;
 868 #endif
 869 }
 870
 871 static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
 872 {
 873 #ifdef CONFIG_SMP
 874         /*
 875          * After ->on_cpu is cleared, the task can be moved to a different CPU.
 876          * We must ensure this doesn't happen until the switch is completely
 877          * finished.
 878          */
 879         smp_wmb();
 880         prev->on_cpu = 0;
 881 #endif
 882 #ifdef CONFIG_DEBUG_SPINLOCK
 883         /* this is a valid case when another task releases the spinlock */
 884         rq->lock.owner = current;
 885 #endif
 886         /*
 887          * If we are tracking spinlock dependencies then we have to
 888          * fix up the runqueue lock - which gets 'carried over' from
 889          * prev into current:
 890          */
 891         spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
 892
 893         raw_spin_unlock_irq(&rq->lock);
 894 }
 895
 896 #else /* __ARCH_WANT_UNLOCKED_CTXSW */
 897 static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
 898 {
 899 #ifdef CONFIG_SMP
 900         /*
 901          * We can optimise this out completely for !SMP, because the
 902          * SMP rebalancing from interrupt is the only thing that cares
 903          * here.
 904          */
 905         next->on_cpu = 1;
 906 #endif
 907 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
 908         raw_spin_unlock_irq(&rq->lock);
 909 #else
 910         raw_spin_unlock(&rq->lock);
 911 #endif
 912 }
 913
 914 static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
 915 {
 916 #ifdef CONFIG_SMP
 917         /*
 918          * After ->on_cpu is cleared, the task can be moved to a different CPU.
 919          * We must ensure this doesn't happen until the switch is completely
 920          * finished.
 921          */
 922         smp_wmb();
 923         prev->on_cpu = 0;
 924 #endif
 925 #ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
 926         local_irq_enable();
 927 #endif
 928 }
 929 #endif /* __ARCH_WANT_UNLOCKED_CTXSW */
 930
 931 /*
 932  * __task_rq_lock - lock the rq @p resides on.
 933  */
 934 static inline struct rq *__task_rq_lock(struct task_struct *p)
 935         __acquires(rq->lock)
 936 {
 937         struct rq *rq;
 938
 939         lockdep_assert_held(&p->pi_lock);
 940
 941         for (;;) {
 942                 rq = task_rq(p);
 943                 raw_spin_lock(&rq->lock);
 944                 if (likely(rq == task_rq(p)))
 945                         return rq;
 946                 raw_spin_unlock(&rq->lock);
 947         }
 948 }
 949
 950 /*
 951  * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
 952  */
 953 static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
 954         __acquires(p->pi_lock)
 955         __acquires(rq->lock)
 956 {
 957         struct rq *rq;
 958
 959         for (;;) {
 960                 raw_spin_lock_irqsave(&p->pi_lock, *flags);
 961                 rq = task_rq(p);
 962                 raw_spin_lock(&rq->lock);
 963                 if (likely(rq == task_rq(p)))
 964                         return rq;
 965                 raw_spin_unlock(&rq->lock);
 966                 raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
 967         }
 968 }
 969
 970 static void __task_rq_unlock(struct rq *rq)
 971         __releases(rq->lock)
 972 {
 973         raw_spin_unlock(&rq->lock);
 974 }
 975
 976 static inline void
 977 task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)
 978         __releases(rq->lock)
 979         __releases(p->pi_lock)
 980 {
 981         raw_spin_unlock(&rq->lock);
 982         raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
 983 }
 984
 985 /*
 986  * this_rq_lock - lock this runqueue and disable interrupts.
 987  */
 988 static struct rq *this_rq_lock(void)
 989         __acquires(rq->lock)
 990 {
 991         struct rq *rq;
 992
 993         local_irq_disable();
 994         rq = this_rq();
 995         raw_spin_lock(&rq->lock);
 996
 997         return rq;
 998 }
 999
1000 #ifdef CONFIG_SCHED_HRTICK
1001 /*
1002  * Use HR-timers to deliver accurate preemption points.
1003  *
1004  * Its all a bit involved since we cannot program an hrt while holding the
1005  * rq->lock. So what we do is store a state in in rq->hrtick_* and ask for a
1006  * reschedule event.
1007  *
1008  * When we get rescheduled we reprogram the hrtick_timer outside of the
1009  * rq->lock.
1010  */
1011
1012 /*
1013  * Use hrtick when:
1014  *  - enabled by features
1015  *  - hrtimer is actually high res
1016  */
1017 static inline int hrtick_enabled(struct rq *rq)
1018 {
1019         if (!sched_feat(HRTICK))
1020                 return 0;
1021         if (!cpu_active(cpu_of(rq)))
1022                 return 0;
1023         return hrtimer_is_hres_active(&rq->hrtick_timer);
1024 }
1025
1026 static void hrtick_clear(struct rq *rq)
1027 {
1028         if (hrtimer_active(&rq->hrtick_timer))
1029                 hrtimer_cancel(&rq->hrtick_timer);
1030 }
1031
1032 /*
1033  * High-resolution timer tick.
1034  * Runs from hardirq context with interrupts disabled.
1035  */
1036 static enum hrtimer_restart hrtick(struct hrtimer *timer)
1037 {
1038         struct rq *rq = container_of(timer, struct rq, hrtick_timer);
1039
1040         WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
1041
1042         raw_spin_lock(&rq->lock);
1043         update_rq_clock(rq);
1044         rq->curr->sched_class->task_tick(rq, rq->curr, 1);
1045         raw_spin_unlock(&rq->lock);
1046
1047         return HRTIMER_NORESTART;
1048 }
1049
1050 #ifdef CONFIG_SMP
1051 /*
1052  * called from hardirq (IPI) context
1053  */
1054 static void __hrtick_start(void *arg)
1055 {
1056         struct rq *rq = arg;
1057
1058         raw_spin_lock(&rq->lock);
1059         hrtimer_restart(&rq->hrtick_timer);
1060         rq->hrtick_csd_pending = 0;
1061         raw_spin_unlock(&rq->lock);
1062 }
1063
1064 /*
1065  * Called to set the hrtick timer state.
1066  *
1067  * called with rq->lock held and irqs disabled
1068  */
1069 static void hrtick_start(struct rq *rq, u64 delay)
1070 {
1071         struct hrtimer *timer = &rq->hrtick_timer;
1072         ktime_t time = ktime_add_ns(timer->base->get_time(), delay);
1073
1074         hrtimer_set_expires(timer, time);
1075
1076         if (rq == this_rq()) {
1077                 hrtimer_restart(timer);
1078         } else if (!rq->hrtick_csd_pending) {
1079                 __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0);
1080                 rq->hrtick_csd_pending = 1;
1081         }
1082 }
1083
1084 static int
1085 hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu)
1086 {
1087         int cpu = (int)(long)hcpu;
1088
1089         switch (action) {
1090         case CPU_UP_CANCELED:
1091         case CPU_UP_CANCELED_FROZEN:
1092         case CPU_DOWN_PREPARE:
1093         case CPU_DOWN_PREPARE_FROZEN:
1094         case CPU_DEAD:
1095         case CPU_DEAD_FROZEN:
1096                 hrtick_clear(cpu_rq(cpu));
1097                 return NOTIFY_OK;
1098         }
1099
1100         return NOTIFY_DONE;
1101 }
1102
1103 static __init void init_hrtick(void)
1104 {
1105         hotcpu_notifier(hotplug_hrtick, 0);
1106 }
1107 #else
1108 /*
1109  * Called to set the hrtick timer state.
1110  *
1111  * called with rq->lock held and irqs disabled
1112  */
1113 static void hrtick_start(struct rq *rq, u64 delay)
1114 {
1115         __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
1116                         HRTIMER_MODE_REL_PINNED, 0);
1117 }
1118
1119 static inline void init_hrtick(void)
1120 {
1121 }
1122 #endif /* CONFIG_SMP */
1123
1124 static void init_rq_hrtick(struct rq *rq)
1125 {
1126 #ifdef CONFIG_SMP
1127         rq->hrtick_csd_pending = 0;
1128
1129         rq->hrtick_csd.flags = 0;
1130         rq->hrtick_csd.func = __hrtick_start;
1131         rq->hrtick_csd.info = rq;
1132 #endif
1133
1134         hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
1135         rq->hrtick_timer.function = hrtick;
1136 }
1137 #else   /* CONFIG_SCHED_HRTICK */
1138 static inline void hrtick_clear(struct rq *rq)
1139 {
1140 }
1141
1142 static inline void init_rq_hrtick(struct rq *rq)
1143 {
1144 }
1145
1146 static inline void init_hrtick(void)
1147 {
1148 }
1149 #endif  /* CONFIG_SCHED_HRTICK */
1150
1151 /*
1152  * resched_task - mark a task 'to be rescheduled now'.
1153  *
1154  * On UP this means the setting of the need_resched flag, on SMP it
1155  * might also involve a cross-CPU call to trigger the scheduler on
1156  * the target CPU.
1157  */
1158 #ifdef CONFIG_SMP
1159
1160 #ifndef tsk_is_polling
1161 #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
1162 #endif
1163
1164 static void resched_task(struct task_struct *p)
1165 {
1166         int cpu;
1167
1168         assert_raw_spin_locked(&task_rq(p)->lock);
1169
1170         if (test_tsk_need_resched(p))
1171                 return;
1172
1173         set_tsk_need_resched(p);
1174
1175         cpu = task_cpu(p);
1176         if (cpu == smp_processor_id())
1177                 return;
1178
1179         /* NEED_RESCHED must be visible before we test polling */
1180         smp_mb();
1181         if (!tsk_is_polling(p))
1182                 smp_send_reschedule(cpu);
1183 }
1184
1185 static void resched_cpu(int cpu)
1186 {
1187         struct rq *rq = cpu_rq(cpu);
1188         unsigned long flags;
1189
1190         if (!raw_spin_trylock_irqsave(&rq->lock, flags))
1191                 return;
1192         resched_task(cpu_curr(cpu));
1193         raw_spin_unlock_irqrestore(&rq->lock, flags);
1194 }
1195
1196 #ifdef CONFIG_NO_HZ
1197 /*
1198  * In the semi idle case, use the nearest busy cpu for migrating timers
1199  * from an idle cpu.  This is good for power-savings.
1200  *
1201  * We don't do similar optimization for completely idle system, as
1202  * selecting an idle cpu will add more delays to the timers than intended
1203  * (as that cpu's timer base may not be uptodate wrt jiffies etc).
1204  */
1205 int get_nohz_timer_target(void)
1206 {
1207         int cpu = smp_processor_id();
1208         int i;
1209         struct sched_domain *sd;
1210
1211         rcu_read_lock();
1212         for_each_domain(cpu, sd) {
1213                 for_each_cpu(i, sched_domain_span(sd)) {
1214                         if (!idle_cpu(i)) {
1215                                 cpu = i;
1216                                 goto unlock;
1217                         }
1218                 }
1219         }
1220 unlock:
1221         rcu_read_unlock();
1222         return cpu;
1223 }
1224 /*
1225  * When add_timer_on() enqueues a timer into the timer wheel of an
1226  * idle CPU then this timer might expire before the next timer event
1227  * which is scheduled to wake up that CPU. In case of a completely
1228  * idle system the next event might even be infinite time into the
1229  * future. wake_up_idle_cpu() ensures that the CPU is woken up and
1230  * leaves the inner idle loop so the newly added timer is taken into
1231  * account when the CPU goes back to idle and evaluates the timer
1232  * wheel for the next timer event.
1233  */
1234 void wake_up_idle_cpu(int cpu)
1235 {
1236         struct rq *rq = cpu_rq(cpu);
1237
1238         if (cpu == smp_processor_id())
1239                 return;
1240
1241         /*
1242          * This is safe, as this function is called with the timer
1243          * wheel base lock of (cpu) held. When the CPU is on the way
1244          * to idle and has not yet set rq->curr to idle then it will
1245          * be serialized on the timer wheel base lock and take the new
1246          * timer into account automatically.
1247          */
1248         if (rq->curr != rq->idle)
1249                 return;
1250
1251         /*
1252          * We can set TIF_RESCHED on the idle task of the other CPU
1253          * lockless. The worst case is that the other CPU runs the
1254          * idle task through an additional NOOP schedule()
1255          */
1256         set_tsk_need_resched(rq->idle);
1257
1258         /* NEED_RESCHED must be visible before we test polling */
1259         smp_mb();
1260         if (!tsk_is_polling(rq->idle))
1261                 smp_send_reschedule(cpu);
1262 }
1263
1264 #endif /* CONFIG_NO_HZ */
1265
1266 static u64 sched_avg_period(void)
1267 {
1268         return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2;
1269 }
1270
1271 static void sched_avg_update(struct rq *rq)
1272 {
1273         s64 period = sched_avg_period();
1274
1275         while ((s64)(rq->clock - rq->age_stamp) > period) {
1276                 /*
1277                  * Inline assembly required to prevent the compiler
1278                  * optimising this loop into a divmod call.
1279                  * See __iter_div_u64_rem() for another example of this.
1280                  */
1281                 asm("" : "+rm" (rq->age_stamp));
1282                 rq->age_stamp += period;
1283                 rq->rt_avg /= 2;
1284         }
1285 }
1286
1287 static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
1288 {
1289         rq->rt_avg += rt_delta;
1290         sched_avg_update(rq);
1291 }
1292
1293 #else /* !CONFIG_SMP */
1294 static void resched_task(struct task_struct *p)
1295 {
1296         assert_raw_spin_locked(&task_rq(p)->lock);
1297         set_tsk_need_resched(p);
1298 }
1299
1300 static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
1301 {
1302 }
1303
1304 static void sched_avg_update(struct rq *rq)
1305 {
1306 }
1307 #endif /* CONFIG_SMP */
1308
1309 #if BITS_PER_LONG == 32
1310 # define WMULT_CONST    (~0UL)
1311 #else
1312 # define WMULT_CONST    (1UL << 32)
1313 #endif
1314
1315 #define WMULT_SHIFT     32
1316
1317 /*
1318  * Shift right and round:
1319  */
1320 #define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
1321
1322 /*
1323  * delta *= weight / lw
1324  */
1325 static unsigned long
1326 calc_delta_mine(unsigned long delta_exec, unsigned long weight,
1327                 struct load_weight *lw)
1328 {
1329         u64 tmp;
1330
1331         /*
1332          * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched
1333          * entities since MIN_SHARES = 2. Treat weight as 1 if less than
1334          * 2^SCHED_LOAD_RESOLUTION.
1335          */
1336         if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION)))
1337                 tmp = (u64)delta_exec * scale_load_down(weight);
1338         else
1339                 tmp = (u64)delta_exec;
1340
1341         if (!lw->inv_weight) {
1342                 unsigned long w = scale_load_down(lw->weight);
1343
1344                 if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
1345                         lw->inv_weight = 1;
1346                 else if (unlikely(!w))
1347                         lw->inv_weight = WMULT_CONST;
1348                 else
1349                         lw->inv_weight = WMULT_CONST / w;
1350         }
1351
1352         /*
1353          * Check whether we'd overflow the 64-bit multiplication:
1354          */
1355         if (unlikely(tmp > WMULT_CONST))
1356                 tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
1357                         WMULT_SHIFT/2);
1358         else
1359                 tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
1360
1361         return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
1362 }
1363
1364 static inline void update_load_add(struct load_weight *lw, unsigned long inc)
1365 {
1366         lw->weight += inc;
1367         lw->inv_weight = 0;
1368 }
1369
1370 static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
1371 {
1372         lw->weight -= dec;
1373         lw->inv_weight = 0;
1374 }
1375
1376 static inline void update_load_set(struct load_weight *lw, unsigned long w)
1377 {
1378         lw->weight = w;
1379         lw->inv_weight = 0;
1380 }
1381
1382 /*
1383  * To aid in avoiding the subversion of "niceness" due to uneven distribution
1384  * of tasks with abnormal "nice" values across CPUs the contribution that
1385  * each task makes to its run queue's load is weighted according to its
1386  * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a
1387  * scaled version of the new time slice allocation that they receive on time
1388  * slice expiry etc.
1389  */
1390
1391 #define WEIGHT_IDLEPRIO                3
1392 #define WMULT_IDLEPRIO         1431655765
1393
1394 /*
1395  * Nice levels are multiplicative, with a gentle 10% change for every
1396  * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
1397  * nice 1, it will get ~10% less CPU time than another CPU-bound task
1398  * that remained on nice 0.
1399  *
1400  * The "10% effect" is relative and cumulative: from _any_ nice level,
1401  * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
1402  * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25.
1403  * If a task goes up by ~10% and another task goes down by ~10% then
1404  * the relative distance between them is ~25%.)
1405  */
1406 static const int prio_to_weight[40] = {
1407  /* -20 */     88761,     71755,     56483,     46273,     36291,
1408  /* -15 */     29154,     23254,     18705,     14949,     11916,
1409  /* -10 */      9548,      7620,      6100,      4904,      3906,
1410  /*  -5 */      3121,      2501,      1991,      1586,      1277,
1411  /*   0 */      1024,       820,       655,       526,       423,
1412  /*   5 */       335,       272,       215,       172,       137,
1413  /*  10 */       110,        87,        70,        56,        45,
1414  /*  15 */        36,        29,        23,        18,        15,
1415 };
1416
1417 /*
1418  * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated.
1419  *
1420  * In cases where the weight does not change often, we can use the
1421  * precalculated inverse to speed up arithmetics by turning divisions
1422  * into multiplications:
1423  */
1424 static const u32 prio_to_wmult[40] = {
1425  /* -20 */     48388,     59856,     76040,     92818,    118348,
1426  /* -15 */    147320,    184698,    229616,    287308,    360437,
1427  /* -10 */    449829,    563644,    704093,    875809,   1099582,
1428  /*  -5 */   1376151,   1717300,   2157191,   2708050,   3363326,
1429  /*   0 */   4194304,   5237765,   6557202,   8165337,  10153587,
1430  /*   5 */  12820798,  15790321,  19976592,  24970740,  31350126,
1431  /*  10 */  39045157,  49367440,  61356676,  76695844,  95443717,
1432  /*  15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
1433 };
1434
1435 /* Time spent by the tasks of the cpu accounting group executing in ... */
1436 enum cpuacct_stat_index {
1437         CPUACCT_STAT_USER,      /* ... user mode */
1438         CPUACCT_STAT_SYSTEM,    /* ... kernel mode */
1439
1440         CPUACCT_STAT_NSTATS,
1441 };
1442
1443 #ifdef CONFIG_CGROUP_CPUACCT
1444 static void cpuacct_charge(struct task_struct *tsk, u64 cputime);
1445 static void cpuacct_update_stats(struct task_struct *tsk,
1446                 enum cpuacct_stat_index idx, cputime_t val);
1447 #else
1448 static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
1449 static inline void cpuacct_update_stats(struct task_struct *tsk,
1450                 enum cpuacct_stat_index idx, cputime_t val) {}
1451 #endif
1452
1453 static inline void inc_cpu_load(struct rq *rq, unsigned long load)
1454 {
1455         update_load_add(&rq->load, load);
1456 }
1457
1458 static inline void dec_cpu_load(struct rq *rq, unsigned long load)
1459 {
1460         update_load_sub(&rq->load, load);
1461 }
1462
1463 #if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED)
1464 typedef int (*tg_visitor)(struct task_group *, void *);
1465
1466 /*
1467  * Iterate the full tree, calling @down when first entering a node and @up when
1468  * leaving it for the final time.
1469  */
1470 static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
1471 {
1472         struct task_group *parent, *child;
1473         int ret;
1474
1475         rcu_read_lock();
1476         parent = &root_task_group;
1477 down:
1478         ret = (*down)(parent, data);
1479         if (ret)
1480                 goto out_unlock;
1481         list_for_each_entry_rcu(child, &parent->children, siblings) {
1482                 parent = child;
1483                 goto down;
1484
1485 up:
1486                 continue;
1487         }
1488         ret = (*up)(parent, data);
1489         if (ret)
1490                 goto out_unlock;
1491
1492         child = parent;
1493         parent = parent->parent;
1494         if (parent)
1495                 goto up;
1496 out_unlock:
1497         rcu_read_unlock();
1498
1499         return ret;
1500 }
1501
1502 static int tg_nop(struct task_group *tg, void *data)
1503 {
1504         return 0;
1505 }
1506 #endif
1507
1508 #ifdef CONFIG_SMP
1509 /* Used instead of source_load when we know the type == 0 */
1510 static unsigned long weighted_cpuload(const int cpu)
1511 {
1512         return cpu_rq(cpu)->load.weight;
1513 }
1514
1515 /*
1516  * Return a low guess at the load of a migration-source cpu weighted
1517  * according to the scheduling class and "nice" value.
1518  *
1519  * We want to under-estimate the load of migration sources, to
1520  * balance conservatively.
1521  */
1522 static unsigned long source_load(int cpu, int type)
1523 {
1524         struct rq *rq = cpu_rq(cpu);
1525         unsigned long total = weighted_cpuload(cpu);
1526
1527         if (type == 0 || !sched_feat(LB_BIAS))
1528                 return total;
1529
1530         return min(rq->cpu_load[type-1], total);
1531 }
1532
1533 /*
1534  * Return a high guess at the load of a migration-target cpu weighted
1535  * according to the scheduling class and "nice" value.
1536  */
1537 static unsigned long target_load(int cpu, int type)
1538 {
1539         struct rq *rq = cpu_rq(cpu);
1540         unsigned long total = weighted_cpuload(cpu);
1541
1542         if (type == 0 || !sched_feat(LB_BIAS))
1543                 return total;
1544
1545         return max(rq->cpu_load[type-1], total);
1546 }
1547
1548 static unsigned long power_of(int cpu)
1549 {
1550         return cpu_rq(cpu)->cpu_power;
1551 }
1552
1553 static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
1554
1555 static unsigned long cpu_avg_load_per_task(int cpu)
1556 {
1557         struct rq *rq = cpu_rq(cpu);
1558         unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
1559
1560         if (nr_running)
1561                 rq->avg_load_per_task = rq->load.weight / nr_running;
1562         else
1563                 rq->avg_load_per_task = 0;
1564
1565         return rq->avg_load_per_task;
1566 }
1567
1568 #ifdef CONFIG_FAIR_GROUP_SCHED
1569
1570 /*
1571  * Compute the cpu's hierarchical load factor for each task group.
1572  * This needs to be done in a top-down fashion because the load of a child
1573  * group is a fraction of its parents load.
1574  */
1575 static int tg_load_down(struct task_group *tg, void *data)
1576 {
1577         unsigned long load;
1578         long cpu = (long)data;
1579
1580         if (!tg->parent) {
1581                 load = cpu_rq(cpu)->load.weight;
1582         } else {
1583                 load = tg->parent->cfs_rq[cpu]->h_load;
1584                 load *= tg->se[cpu]->load.weight;
1585                 load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
1586         }
1587
1588         tg->cfs_rq[cpu]->h_load = load;
1589
1590         return 0;
1591 }
1592
1593 static void update_h_load(long cpu)
1594 {
1595         walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
1596 }
1597
1598 #endif
1599
1600 #ifdef CONFIG_PREEMPT
1601
1602 static void double_rq_lock(struct rq *rq1, struct rq *rq2);
1603
1604 /*
1605  * fair double_lock_balance: Safely acquires both rq->locks in a fair
1606  * way at the expense of forcing extra atomic operations in all
1607  * invocations.  This assures that the double_lock is acquired using the
1608  * same underlying policy as the spinlock_t on this architecture, which
1609  * reduces latency compared to the unfair variant below.  However, it
1610  * also adds more overhead and therefore may reduce throughput.
1611  */
1612 static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1613         __releases(this_rq->lock)
1614         __acquires(busiest->lock)
1615         __acquires(this_rq->lock)
1616 {
1617         raw_spin_unlock(&this_rq->lock);
1618         double_rq_lock(this_rq, busiest);
1619
1620         return 1;
1621 }
1622
1623 #else
1624 /*
1625  * Unfair double_lock_balance: Optimizes throughput at the expense of
1626  * latency by eliminating extra atomic operations when the locks are
1627  * already in proper order on entry.  This favors lower cpu-ids and will
1628  * grant the double lock to lower cpus over higher ids under contention,
1629  * regardless of entry order into the function.
1630  */
1631 static int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1632         __releases(this_rq->lock)
1633         __acquires(busiest->lock)
1634         __acquires(this_rq->lock)
1635 {
1636         int ret = 0;
1637
1638         if (unlikely(!raw_spin_trylock(&busiest->lock))) {
1639                 if (busiest < this_rq) {
1640                         raw_spin_unlock(&this_rq->lock);
1641                         raw_spin_lock(&busiest->lock);
1642                         raw_spin_lock_nested(&this_rq->lock,
1643                                               SINGLE_DEPTH_NESTING);
1644                         ret = 1;
1645                 } else
1646                         raw_spin_lock_nested(&busiest->lock,
1647                                               SINGLE_DEPTH_NESTING);
1648         }
1649         return ret;
1650 }
1651
1652 #endif /* CONFIG_PREEMPT */
1653
1654 /*
1655  * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
1656  */
1657 static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
1658 {
1659         if (unlikely(!irqs_disabled())) {
1660                 /* printk() doesn't work good under rq->lock */
1661                 raw_spin_unlock(&this_rq->lock);
1662                 BUG_ON(1);
1663         }
1664
1665         return _double_lock_balance(this_rq, busiest);
1666 }
1667
1668 static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
1669         __releases(busiest->lock)
1670 {
1671         raw_spin_unlock(&busiest->lock);
1672         lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
1673 }
1674
1675 /*
1676  * double_rq_lock - safely lock two runqueues
1677  *
1678  * Note this does not disable interrupts like task_rq_lock,
1679  * you need to do so manually before calling.
1680  */
1681 static void double_rq_lock(struct rq *rq1, struct rq *rq2)
1682         __acquires(rq1->lock)
1683         __acquires(rq2->lock)
1684 {
1685         BUG_ON(!irqs_disabled());
1686         if (rq1 == rq2) {
1687                 raw_spin_lock(&rq1->lock);
1688                 __acquire(rq2->lock);   /* Fake it out ;) */
1689         } else {
1690                 if (rq1 < rq2) {
1691                         raw_spin_lock(&rq1->lock);
1692                         raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
1693                 } else {
1694                         raw_spin_lock(&rq2->lock);
1695                         raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
1696                 }
1697         }
1698 }
1699
1700 /*
1701  * double_rq_unlock - safely unlock two runqueues
1702  *
1703  * Note this does not restore interrupts like task_rq_unlock,
1704  * you need to do so manually after calling.
1705  */
1706 static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1707         __releases(rq1->lock)
1708         __releases(rq2->lock)
1709 {
1710         raw_spin_unlock(&rq1->lock);
1711         if (rq1 != rq2)
1712                 raw_spin_unlock(&rq2->lock);
1713         else
1714                 __release(rq2->lock);
1715 }
1716
1717 #else /* CONFIG_SMP */
1718
1719 /*
1720  * double_rq_lock - safely lock two runqueues
1721  *
1722  * Note this does not disable interrupts like task_rq_lock,
1723  * you need to do so manually before calling.
1724  */
1725 static void double_rq_lock(struct rq *rq1, struct rq *rq2)
1726         __acquires(rq1->lock)
1727         __acquires(rq2->lock)
1728 {
1729         BUG_ON(!irqs_disabled());
1730         BUG_ON(rq1 != rq2);
1731         raw_spin_lock(&rq1->lock);
1732         __acquire(rq2->lock);   /* Fake it out ;) */
1733 }
1734
1735 /*
1736  * double_rq_unlock - safely unlock two runqueues
1737  *
1738  * Note this does not restore interrupts like task_rq_unlock,
1739  * you need to do so manually after calling.
1740  */
1741 static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1742         __releases(rq1->lock)
1743         __releases(rq2->lock)
1744 {
1745         BUG_ON(rq1 != rq2);
1746         raw_spin_unlock(&rq1->lock);
1747         __release(rq2->lock);
1748 }
1749
1750 #endif
1751
1752 static void calc_load_account_idle(struct rq *this_rq);
1753 static void update_sysctl(void);
1754 static int get_update_sysctl_factor(void);
1755 static void update_cpu_load(struct rq *this_rq);
1756
1757 static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1758 {
1759         set_task_rq(p, cpu);
1760 #ifdef CONFIG_SMP
1761         /*
1762          * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
1763          * successfuly executed on another CPU. We must ensure that updates of
1764          * per-task data have been completed by this moment.
1765          */
1766         smp_wmb();
1767         task_thread_info(p)->cpu = cpu;
1768 #endif
1769 }
1770
1771 static const struct sched_class rt_sched_class;
1772
1773 #define sched_class_highest (&stop_sched_class)
1774 #define for_each_class(class) \
1775    for (class = sched_class_highest; class; class = class->next)
1776
1777 #include "sched_stats.h"
1778
1779 static void inc_nr_running(struct rq *rq)
1780 {
1781         rq->nr_running++;
1782 }
1783
1784 static void dec_nr_running(struct rq *rq)
1785 {
1786         rq->nr_running--;
1787 }
1788
1789 static void set_load_weight(struct task_struct *p)
1790 {
1791         int prio = p->static_prio - MAX_RT_PRIO;
1792         struct load_weight *load = &p->se.load;
1793
1794         /*
1795          * SCHED_IDLE tasks get minimal weight:
1796          */
1797         if (p->policy == SCHED_IDLE) {
1798                 load->weight = scale_load(WEIGHT_IDLEPRIO);
1799                 load->inv_weight = WMULT_IDLEPRIO;
1800                 return;
1801         }
1802
1803         load->weight = scale_load(prio_to_weight[prio]);
1804         load->inv_weight = prio_to_wmult[prio];
1805 }
1806
1807 static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
1808 {
1809         update_rq_clock(rq);
1810         sched_info_queued(p);
1811         p->sched_class->enqueue_task(rq, p, flags);
1812 }
1813
1814 static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
1815 {
1816         update_rq_clock(rq);
1817         sched_info_dequeued(p);
1818         p->sched_class->dequeue_task(rq, p, flags);
1819 }
1820
1821 /*
1822  * activate_task - move a task to the runqueue.
1823  */
1824 static void activate_task(struct rq *rq, struct task_struct *p, int flags)
1825 {
1826         if (task_contributes_to_load(p))
1827                 rq->nr_uninterruptible--;
1828
1829         enqueue_task(rq, p, flags);
1830         inc_nr_running(rq);
1831 }
1832
1833 /*
1834  * deactivate_task - remove a task from the runqueue.
1835  */
1836 static void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
1837 {
1838         if (task_contributes_to_load(p))
1839                 rq->nr_uninterruptible++;
1840
1841         dequeue_task(rq, p, flags);
1842         dec_nr_running(rq);
1843 }
1844
1845 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
1846
1847 /*
1848  * There are no locks covering percpu hardirq/softirq time.
1849  * They are only modified in account_system_vtime, on corresponding CPU
1850  * with interrupts disabled. So, writes are safe.
1851  * They are read and saved off onto struct rq in update_rq_clock().
1852  * This may result in other CPU reading this CPU's irq time and can
1853  * race with irq/account_system_vtime on this CPU. We would either get old
1854  * or new value with a side effect of accounting a slice of irq time to wrong
1855  * task when irq is in progress while we read rq->clock. That is a worthy
1856  * compromise in place of having locks on each irq in account_system_time.
1857  */
1858 static DEFINE_PER_CPU(u64, cpu_hardirq_time);
1859 static DEFINE_PER_CPU(u64, cpu_softirq_time);
1860
1861 static DEFINE_PER_CPU(u64, irq_start_time);
1862 static int sched_clock_irqtime;
1863
1864 void enable_sched_clock_irqtime(void)
1865 {
1866         sched_clock_irqtime = 1;
1867 }
1868
1869 void disable_sched_clock_irqtime(void)
1870 {
1871         sched_clock_irqtime = 0;
1872 }
1873
1874 #ifndef CONFIG_64BIT
1875 static DEFINE_PER_CPU(seqcount_t, irq_time_seq);
1876
1877 static inline void irq_time_write_begin(void)
1878 {
1879         __this_cpu_inc(irq_time_seq.sequence);
1880         smp_wmb();
1881 }
1882
1883 static inline void irq_time_write_end(void)
1884 {
1885         smp_wmb();
1886         __this_cpu_inc(irq_time_seq.sequence);
1887 }
1888
1889 static inline u64 irq_time_read(int cpu)
1890 {
1891         u64 irq_time;
1892         unsigned seq;
1893
1894         do {
1895                 seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu));
1896                 irq_time = per_cpu(cpu_softirq_time, cpu) +
1897                            per_cpu(cpu_hardirq_time, cpu);
1898         } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq));
1899
1900         return irq_time;
1901 }
1902 #else /* CONFIG_64BIT */
1903 static inline void irq_time_write_begin(void)
1904 {
1905 }
1906
1907 static inline void irq_time_write_end(void)
1908 {
1909 }
1910
1911 static inline u64 irq_time_read(int cpu)
1912 {
1913         return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
1914 }
1915 #endif /* CONFIG_64BIT */
1916
1917 /*
1918  * Called before incrementing preempt_count on {soft,}irq_enter
1919  * and before decrementing preempt_count on {soft,}irq_exit.
1920  */
1921 void account_system_vtime(struct task_struct *curr)
1922 {
1923         unsigned long flags;
1924         s64 delta;
1925         int cpu;
1926
1927         if (!sched_clock_irqtime)
1928                 return;
1929
1930         local_irq_save(flags);
1931
1932         cpu = smp_processor_id();
1933         delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time);
1934         __this_cpu_add(irq_start_time, delta);
1935
1936         irq_time_write_begin();
1937         /*
1938          * We do not account for softirq time from ksoftirqd here.
1939          * We want to continue accounting softirq time to ksoftirqd thread
1940          * in that case, so as not to confuse scheduler with a special task
1941          * that do not consume any time, but still wants to run.
1942          */
1943         if (hardirq_count())
1944                 __this_cpu_add(cpu_hardirq_time, delta);
1945         else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
1946                 __this_cpu_add(cpu_softirq_time, delta);
1947
1948         irq_time_write_end();
1949         local_irq_restore(flags);
1950 }
1951 EXPORT_SYMBOL_GPL(account_system_vtime);
1952
1953 static void update_rq_clock_task(struct rq *rq, s64 delta)
1954 {
1955         s64 irq_delta;
1956
1957         irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
1958
1959         /*
1960          * Since irq_time is only updated on {soft,}irq_exit, we might run into
1961          * this case when a previous update_rq_clock() happened inside a
1962          * {soft,}irq region.
1963          *
1964          * When this happens, we stop ->clock_task and only update the
1965          * prev_irq_time stamp to account for the part that fit, so that a next
1966          * update will consume the rest. This ensures ->clock_task is
1967          * monotonic.
1968          *
1969          * It does however cause some slight miss-attribution of {soft,}irq
1970          * time, a more accurate solution would be to update the irq_time using
1971          * the current rq->clock timestamp, except that would require using
1972          * atomic ops.
1973          */
1974         if (irq_delta > delta)
1975                 irq_delta = delta;
1976
1977         rq->prev_irq_time += irq_delta;
1978         delta -= irq_delta;
1979         rq->clock_task += delta;
1980
1981         if (irq_delta && sched_feat(NONIRQ_POWER))
1982                 sched_rt_avg_update(rq, irq_delta);
1983 }
1984
1985 static int irqtime_account_hi_update(void)
1986 {
1987         struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
1988         unsigned long flags;
1989         u64 latest_ns;
1990         int ret = 0;
1991
1992         local_irq_save(flags);
1993         latest_ns = this_cpu_read(cpu_hardirq_time);
1994         if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->irq))
1995                 ret = 1;
1996         local_irq_restore(flags);
1997         return ret;
1998 }
1999
2000 static int irqtime_account_si_update(void)
2001 {
2002         struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
2003         unsigned long flags;
2004         u64 latest_ns;
2005         int ret = 0;
2006
2007         local_irq_save(flags);
2008         latest_ns = this_cpu_read(cpu_softirq_time);
2009         if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->softirq))
2010                 ret = 1;
2011         local_irq_restore(flags);
2012         return ret;
2013 }
2014
2015 #else /* CONFIG_IRQ_TIME_ACCOUNTING */
2016
2017 #define sched_clock_irqtime     (0)
2018
2019 static void update_rq_clock_task(struct rq *rq, s64 delta)
2020 {
2021         rq->clock_task += delta;
2022 }
2023
2024 #endif /* CONFIG_IRQ_TIME_ACCOUNTING */
2025
2026 #include "sched_idletask.c"
2027 #include "sched_fair.c"
2028 #include "sched_rt.c"
2029 #include "sched_autogroup.c"
2030 #include "sched_stoptask.c"
2031 #ifdef CONFIG_SCHED_DEBUG
2032 # include "sched_debug.c"
2033 #endif
2034
2035 void sched_set_stop_task(int cpu, struct task_struct *stop)
2036 {
2037         struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
2038         struct task_struct *old_stop = cpu_rq(cpu)->stop;
2039
2040         if (stop) {
2041                 /*
2042                  * Make it appear like a SCHED_FIFO task, its something
2043                  * userspace knows about and won't get confused about.
2044                  *
2045                  * Also, it will make PI more or less work without too
2046                  * much confusion -- but then, stop work should not
2047                  * rely on PI working anyway.
2048                  */
2049                 sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);
2050
2051                 stop->sched_class = &stop_sched_class;
2052         }
2053
2054         cpu_rq(cpu)->stop = stop;
2055
2056         if (old_stop) {
2057                 /*
2058                  * Reset it back to a normal scheduling class so that
2059                  * it can die in pieces.
2060                  */
2061                 old_stop->sched_class = &rt_sched_class;
2062         }
2063 }
2064
2065 /*
2066  * __normal_prio - return the priority that is based on the static prio
2067  */
2068 static inline int __normal_prio(struct task_struct *p)
2069 {
2070         return p->static_prio;
2071 }
2072
2073 /*
2074  * Calculate the expected normal priority: i.e. priority
2075  * without taking RT-inheritance into account. Might be
2076  * boosted by interactivity modifiers. Changes upon fork,
2077  * setprio syscalls, and whenever the interactivity
2078  * estimator recalculates.
2079  */
2080 static inline int normal_prio(struct task_struct *p)
2081 {
2082         int prio;
2083
2084         if (task_has_rt_policy(p))
2085                 prio = MAX_RT_PRIO-1 - p->rt_priority;
2086         else
2087                 prio = __normal_prio(p);
2088         return prio;
2089 }
2090
2091 /*
2092  * Calculate the current priority, i.e. the priority
2093  * taken into account by the scheduler. This value might
2094  * be boosted by RT tasks, or might be boosted by
2095  * interactivity modifiers. Will be RT if the task got
2096  * RT-boosted. If not then it returns p->normal_prio.
2097  */
2098 static int effective_prio(struct task_struct *p)
2099 {
2100         p->normal_prio = normal_prio(p);
2101         /*
2102          * If we are RT tasks or we were boosted to RT priority,
2103          * keep the priority unchanged. Otherwise, update priority
2104          * to the normal priority:
2105          */
2106         if (!rt_prio(p->prio))
2107                 return p->normal_prio;
2108         return p->prio;
2109 }
2110
2111 /**
2112  * task_curr - is this task currently executing on a CPU?
2113  * @p: the task in question.
2114  */
2115 inline int task_curr(const struct task_struct *p)
2116 {
2117         return cpu_curr(task_cpu(p)) == p;
2118 }
2119
2120 static inline void check_class_changed(struct rq *rq, struct task_struct *p,
2121                                        const struct sched_class *prev_class,
2122                                        int oldprio)
2123 {
2124         if (prev_class != p->sched_class) {
2125                 if (prev_class->switched_from)
2126                         prev_class->switched_from(rq, p);
2127                 p->sched_class->switched_to(rq, p);
2128         } else if (oldprio != p->prio)
2129                 p->sched_class->prio_changed(rq, p, oldprio);
2130 }
2131
2132 static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
2133 {
2134         const struct sched_class *class;
2135
2136         if (p->sched_class == rq->curr->sched_class) {
2137                 rq->curr->sched_class->check_preempt_curr(rq, p, flags);
2138         } else {
2139                 for_each_class(class) {
2140                         if (class == rq->curr->sched_class)
2141                                 break;
2142                         if (class == p->sched_class) {
2143                                 resched_task(rq->curr);
2144                                 break;
2145                         }
2146                 }
2147         }
2148
2149         /*
2150          * A queue event has occurred, and we're going to schedule.  In
2151          * this case, we can save a useless back to back clock update.
2152          */
2153         if (rq->curr->on_rq && test_tsk_need_resched(rq->curr))
2154                 rq->skip_clock_update = 1;
2155 }
2156
2157 #ifdef CONFIG_SMP
2158 /*
2159  * Is this task likely cache-hot:
2160  */
2161 static int
2162 task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
2163 {
2164         s64 delta;
2165
2166         if (p->sched_class != &fair_sched_class)
2167                 return 0;
2168
2169         if (unlikely(p->policy == SCHED_IDLE))
2170                 return 0;
2171
2172         /*
2173          * Buddy candidates are cache hot:
2174          */
2175         if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running &&
2176                         (&p->se == cfs_rq_of(&p->se)->next ||
2177                          &p->se == cfs_rq_of(&p->se)->last))
2178                 return 1;
2179
2180         if (sysctl_sched_migration_cost == -1)
2181                 return 1;
2182         if (sysctl_sched_migration_cost == 0)
2183                 return 0;
2184
2185         delta = now - p->se.exec_start;
2186
2187         return delta < (s64)sysctl_sched_migration_cost;
2188 }
2189
2190 void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
2191 {
2192 #ifdef CONFIG_SCHED_DEBUG
2193         /*
2194          * We should never call set_task_cpu() on a blocked task,
2195          * ttwu() will sort out the placement.
2196          */
2197         WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
2198                         !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE));
2199
2200 #ifdef CONFIG_LOCKDEP
2201         /*
2202          * The caller should hold either p->pi_lock or rq->lock, when changing
2203          * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks.
2204          *
2205          * sched_move_task() holds both and thus holding either pins the cgroup,
2206          * see task_group().
2207          *
2208          * Furthermore, all task_rq users should acquire both locks, see
2209          * task_rq_lock().
2210          */
2211         WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
2212                                       lockdep_is_held(&task_rq(p)->lock)));
2213 #endif
2214 #endif
2215
2216         trace_sched_migrate_task(p, new_cpu);
2217
2218         if (task_cpu(p) != new_cpu) {
2219                 p->se.nr_migrations++;
2220                 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 1, NULL, 0);
2221         }
2222
2223         __set_task_cpu(p, new_cpu);
2224 }
2225
2226 struct migration_arg {
2227         struct task_struct *task;
2228         int dest_cpu;
2229 };
2230
2231 static int migration_cpu_stop(void *data);
2232
2233 /*
2234  * wait_task_inactive - wait for a thread to unschedule.
2235  *
2236  * If @match_state is nonzero, it's the @p->state value just checked and
2237  * not expected to change.  If it changes, i.e. @p might have woken up,
2238  * then return zero.  When we succeed in waiting for @p to be off its CPU,
2239  * we return a positive number (its total switch count).  If a second call
2240  * a short while later returns the same number, the caller can be sure that
2241  * @p has remained unscheduled the whole time.
2242  *
2243  * The caller must ensure that the task *will* unschedule sometime soon,
2244  * else this function might spin for a *long* time. This function can't
2245  * be called with interrupts off, or it may introduce deadlock with
2246  * smp_call_function() if an IPI is sent by the same process we are
2247  * waiting to become inactive.
2248  */
2249 unsigned long wait_task_inactive(struct task_struct *p, long match_state)
2250 {
2251         unsigned long flags;
2252         int running, on_rq;
2253         unsigned long ncsw;
2254         struct rq *rq;
2255
2256         for (;;) {
2257                 /*
2258                  * We do the initial early heuristics without holding
2259                  * any task-queue locks at all. We'll only try to get
2260                  * the runqueue lock when things look like they will
2261                  * work out!
2262                  */
2263                 rq = task_rq(p);
2264
2265                 /*
2266                  * If the task is actively running on another CPU
2267                  * still, just relax and busy-wait without holding
2268                  * any locks.
2269                  *
2270                  * NOTE! Since we don't hold any locks, it's not
2271                  * even sure that "rq" stays as the right runqueue!
2272                  * But we don't care, since "task_running()" will
2273                  * return false if the runqueue has changed and p
2274                  * is actually now running somewhere else!
2275                  */
2276                 while (task_running(rq, p)) {
2277                         if (match_state && unlikely(p->state != match_state))
2278                                 return 0;
2279                         cpu_relax();
2280                 }
2281
2282                 /*
2283                  * Ok, time to look more closely! We need the rq
2284                  * lock now, to be *sure*. If we're wrong, we'll
2285                  * just go back and repeat.
2286                  */
2287                 rq = task_rq_lock(p, &flags);
2288                 trace_sched_wait_task(p);
2289                 running = task_running(rq, p);
2290                 on_rq = p->on_rq;
2291                 ncsw = 0;
2292                 if (!match_state || p->state == match_state)
2293                         ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
2294                 task_rq_unlock(rq, p, &flags);
2295
2296                 /*
2297                  * If it changed from the expected state, bail out now.
2298                  */
2299                 if (unlikely(!ncsw))
2300                         break;
2301
2302                 /*
2303                  * Was it really running after all now that we
2304                  * checked with the proper locks actually held?
2305                  *
2306                  * Oops. Go back and try again..
2307                  */
2308                 if (unlikely(running)) {
2309                         cpu_relax();
2310                         continue;
2311                 }
2312
2313                 /*
2314                  * It's not enough that it's not actively running,
2315                  * it must be off the runqueue _entirely_, and not
2316                  * preempted!
2317                  *
2318                  * So if it was still runnable (but just not actively
2319                  * running right now), it's preempted, and we should
2320                  * yield - it could be a while.
2321                  */
2322                 if (unlikely(on_rq)) {
2323                         ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ);
2324
2325                         set_current_state(TASK_UNINTERRUPTIBLE);
2326                         schedule_hrtimeout(&to, HRTIMER_MODE_REL);
2327                         continue;
2328                 }
2329
2330                 /*
2331                  * Ahh, all good. It wasn't running, and it wasn't
2332                  * runnable, which means that it will never become
2333                  * running in the future either. We're all done!
2334                  */
2335                 break;
2336         }
2337
2338         return ncsw;
2339 }
2340
2341 /***
2342  * kick_process - kick a running thread to enter/exit the kernel
2343  * @p: the to-be-kicked thread
2344  *
2345  * Cause a process which is running on another CPU to enter
2346  * kernel-mode, without any delay. (to get signals handled.)
2347  *
2348  * NOTE: this function doesn't have to take the runqueue lock,
2349  * because all it wants to ensure is that the remote task enters
2350  * the kernel. If the IPI races and the task has been migrated
2351  * to another CPU then no harm is done and the purpose has been
2352  * achieved as well.
2353  */
2354 void kick_process(struct task_struct *p)
2355 {
2356         int cpu;
2357
2358         preempt_disable();
2359         cpu = task_cpu(p);
2360         if ((cpu != smp_processor_id()) && task_curr(p))
2361                 smp_send_reschedule(cpu);
2362         preempt_enable();
2363 }
2364 EXPORT_SYMBOL_GPL(kick_process);
2365 #endif /* CONFIG_SMP */
2366
2367 #ifdef CONFIG_SMP
2368 /*
2369  * ->cpus_allowed is protected by both rq->lock and p->pi_lock
2370  */
2371 static int select_fallback_rq(int cpu, struct task_struct *p)
2372 {
2373         int dest_cpu;
2374         const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu));
2375
2376         /* Look for allowed, online CPU in same node. */
2377         for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask)
2378                 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
2379                         return dest_cpu;
2380
2381         /* Any allowed, online CPU? */
2382         dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_active_mask);
2383         if (dest_cpu < nr_cpu_ids)
2384                 return dest_cpu;
2385
2386         /* No more Mr. Nice Guy. */
2387         dest_cpu = cpuset_cpus_allowed_fallback(p);
2388         /*
2389          * Don't tell them about moving exiting tasks or
2390          * kernel threads (both mm NULL), since they never
2391          * leave kernel.
2392          */
2393         if (p->mm && printk_ratelimit()) {
2394                 printk(KERN_INFO "process %d (%s) no longer affine to cpu%d\n",
2395                                 task_pid_nr(p), p->comm, cpu);
2396         }
2397
2398         return dest_cpu;
2399 }
2400
2401 /*
2402  * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable.
2403  */
2404 static inline
2405 int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
2406 {
2407         int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags);
2408
2409         /*
2410          * In order not to call set_task_cpu() on a blocking task we need
2411          * to rely on ttwu() to place the task on a valid ->cpus_allowed
2412          * cpu.
2413          *
2414          * Since this is common to all placement strategies, this lives here.
2415          *
2416          * [ this allows ->select_task() to simply return task_cpu(p) and
2417          *   not worry about this generic constraint ]
2418          */
2419         if (unlikely(!cpumask_test_cpu(cpu, &p->cpus_allowed) ||
2420                      !cpu_online(cpu)))
2421                 cpu = select_fallback_rq(task_cpu(p), p);
2422
2423         return cpu;
2424 }
2425
2426 static void update_avg(u64 *avg, u64 sample)
2427 {
2428         s64 diff = sample - *avg;
2429         *avg += diff >> 3;
2430 }
2431 #endif
2432
2433 static void
2434 ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
2435 {
2436 #ifdef CONFIG_SCHEDSTATS
2437         struct rq *rq = this_rq();
2438
2439 #ifdef CONFIG_SMP
2440         int this_cpu = smp_processor_id();
2441
2442         if (cpu == this_cpu) {
2443                 schedstat_inc(rq, ttwu_local);
2444                 schedstat_inc(p, se.statistics.nr_wakeups_local);
2445         } else {
2446                 struct sched_domain *sd;
2447
2448                 schedstat_inc(p, se.statistics.nr_wakeups_remote);
2449                 rcu_read_lock();
2450                 for_each_domain(this_cpu, sd) {
2451                         if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
2452                                 schedstat_inc(sd, ttwu_wake_remote);
2453                                 break;
2454                         }
2455                 }
2456                 rcu_read_unlock();
2457         }
2458
2459         if (wake_flags & WF_MIGRATED)
2460                 schedstat_inc(p, se.statistics.nr_wakeups_migrate);
2461
2462 #endif /* CONFIG_SMP */
2463
2464         schedstat_inc(rq, ttwu_count);
2465         schedstat_inc(p, se.statistics.nr_wakeups);
2466
2467         if (wake_flags & WF_SYNC)
2468                 schedstat_inc(p, se.statistics.nr_wakeups_sync);
2469
2470 #endif /* CONFIG_SCHEDSTATS */
2471 }
2472
2473 static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
2474 {
2475         activate_task(rq, p, en_flags);
2476         p->on_rq = 1;
2477
2478         /* if a worker is waking up, notify workqueue */
2479         if (p->flags & PF_WQ_WORKER)
2480                 wq_worker_waking_up(p, cpu_of(rq));
2481 }
2482
2483 /*
2484  * Mark the task runnable and perform wakeup-preemption.
2485  */
2486 static void
2487 ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
2488 {
2489         trace_sched_wakeup(p, true);
2490         check_preempt_curr(rq, p, wake_flags);
2491
2492         p->state = TASK_RUNNING;
2493 #ifdef CONFIG_SMP
2494         if (p->sched_class->task_woken)
2495                 p->sched_class->task_woken(rq, p);
2496
2497         if (unlikely(rq->idle_stamp)) {
2498                 u64 delta = rq->clock - rq->idle_stamp;
2499                 u64 max = 2*sysctl_sched_migration_cost;
2500
2501                 if (delta > max)
2502                         rq->avg_idle = max;
2503                 else
2504                         update_avg(&rq->avg_idle, delta);
2505                 rq->idle_stamp = 0;
2506         }
2507 #endif
2508 }
2509
2510 static void
2511 ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)
2512 {
2513 #ifdef CONFIG_SMP
2514         if (p->sched_contributes_to_load)
2515                 rq->nr_uninterruptible--;
2516 #endif
2517
2518         ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING);
2519         ttwu_do_wakeup(rq, p, wake_flags);
2520 }
2521
2522 /*
2523  * Called in case the task @p isn't fully descheduled from its runqueue,
2524  * in this case we must do a remote wakeup. Its a 'light' wakeup though,
2525  * since all we need to do is flip p->state to TASK_RUNNING, since
2526  * the task is still ->on_rq.
2527  */
2528 static int ttwu_remote(struct task_struct *p, int wake_flags)
2529 {
2530         struct rq *rq;
2531         int ret = 0;
2532
2533         rq = __task_rq_lock(p);
2534         if (p->on_rq) {
2535                 ttwu_do_wakeup(rq, p, wake_flags);
2536                 ret = 1;
2537         }
2538         __task_rq_unlock(rq);
2539
2540         return ret;
2541 }
2542
2543 #ifdef CONFIG_SMP
2544 static void sched_ttwu_do_pending(struct task_struct *list)
2545 {
2546         struct rq *rq = this_rq();
2547
2548         raw_spin_lock(&rq->lock);
2549
2550         while (list) {
2551                 struct task_struct *p = list;
2552                 list = list->wake_entry;
2553                 ttwu_do_activate(rq, p, 0);
2554         }
2555
2556         raw_spin_unlock(&rq->lock);
2557 }
2558
2559 #ifdef CONFIG_HOTPLUG_CPU
2560
2561 static void sched_ttwu_pending(void)
2562 {
2563         struct rq *rq = this_rq();
2564         struct task_struct *list = xchg(&rq->wake_list, NULL);
2565
2566         if (!list)
2567                 return;
2568
2569         sched_ttwu_do_pending(list);
2570 }
2571
2572 #endif /* CONFIG_HOTPLUG_CPU */
2573
2574 void scheduler_ipi(void)
2575 {
2576         struct rq *rq = this_rq();
2577         struct task_struct *list = xchg(&rq->wake_list, NULL);
2578
2579         if (!list)
2580                 return;
2581
2582         /*
2583          * Not all reschedule IPI handlers call irq_enter/irq_exit, since
2584          * traditionally all their work was done from the interrupt return
2585          * path. Now that we actually do some work, we need to make sure
2586          * we do call them.
2587          *
2588          * Some archs already do call them, luckily irq_enter/exit nest
2589          * properly.
2590          *
2591          * Arguably we should visit all archs and update all handlers,
2592          * however a fair share of IPIs are still resched only so this would
2593          * somewhat pessimize the simple resched case.
2594          */
2595         irq_enter();
2596         sched_ttwu_do_pending(list);
2597         irq_exit();
2598 }
2599
2600 static void ttwu_queue_remote(struct task_struct *p, int cpu)
2601 {
2602         struct rq *rq = cpu_rq(cpu);
2603         struct task_struct *next = rq->wake_list;
2604
2605         for (;;) {
2606                 struct task_struct *old = next;
2607
2608                 p->wake_entry = next;
2609                 next = cmpxchg(&rq->wake_list, old, p);
2610                 if (next == old)
2611                         break;
2612         }
2613
2614         if (!next)
2615                 smp_send_reschedule(cpu);
2616 }
2617
2618 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
2619 static int ttwu_activate_remote(struct task_struct *p, int wake_flags)
2620 {
2621         struct rq *rq;
2622         int ret = 0;
2623
2624         rq = __task_rq_lock(p);
2625         if (p->on_cpu) {
2626                 ttwu_activate(rq, p, ENQUEUE_WAKEUP);
2627                 ttwu_do_wakeup(rq, p, wake_flags);
2628                 ret = 1;
2629         }
2630         __task_rq_unlock(rq);
2631
2632         return ret;
2633
2634 }
2635 #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
2636 #endif /* CONFIG_SMP */
2637
2638 static void ttwu_queue(struct task_struct *p, int cpu)
2639 {
2640         struct rq *rq = cpu_rq(cpu);
2641
2642 #if defined(CONFIG_SMP)
2643         if (sched_feat(TTWU_QUEUE) && cpu != smp_processor_id()) {
2644                 sched_clock_cpu(cpu); /* sync clocks x-cpu */
2645                 ttwu_queue_remote(p, cpu);
2646                 return;
2647         }
2648 #endif
2649
2650         raw_spin_lock(&rq->lock);
2651         ttwu_do_activate(rq, p, 0);
2652         raw_spin_unlock(&rq->lock);
2653 }
2654
2655 /**
2656  * try_to_wake_up - wake up a thread
2657  * @p: the thread to be awakened
2658  * @state: the mask of task states that can be woken
2659  * @wake_flags: wake modifier flags (WF_*)
2660  *
2661  * Put it on the run-queue if it's not already there. The "current"
2662  * thread is always on the run-queue (except when the actual
2663  * re-schedule is in progress), and as such you're allowed to do
2664  * the simpler "current->state = TASK_RUNNING" to mark yourself
2665  * runnable without the overhead of this.
2666  *
2667  * Returns %true if @p was woken up, %false if it was already running
2668  * or @state didn't match @p's state.
2669  */
2670 static int
2671 try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
2672 {
2673         unsigned long flags;
2674         int cpu, success = 0;
2675
2676         smp_wmb();
2677         raw_spin_lock_irqsave(&p->pi_lock, flags);
2678         if (!(p->state & state))
2679                 goto out;
2680
2681         success = 1; /* we're going to change ->state */
2682         cpu = task_cpu(p);
2683
2684         if (p->on_rq && ttwu_remote(p, wake_flags))
2685                 goto stat;
2686
2687 #ifdef CONFIG_SMP
2688         /*
2689          * If the owning (remote) cpu is still in the middle of schedule() with
2690          * this task as prev, wait until its done referencing the task.
2691          */
2692         while (p->on_cpu) {
2693 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
2694                 /*
2695                  * In case the architecture enables interrupts in
2696                  * context_switch(), we cannot busy wait, since that
2697                  * would lead to deadlocks when an interrupt hits and
2698                  * tries to wake up @prev. So bail and do a complete
2699                  * remote wakeup.
2700                  */
2701                 if (ttwu_activate_remote(p, wake_flags))
2702                         goto stat;
2703 #else
2704                 cpu_relax();
2705 #endif
2706         }
2707         /*
2708          * Pairs with the smp_wmb() in finish_lock_switch().
2709          */
2710         smp_rmb();
2711
2712         p->sched_contributes_to_load = !!task_contributes_to_load(p);
2713         p->state = TASK_WAKING;
2714
2715         if (p->sched_class->task_waking)
2716                 p->sched_class->task_waking(p);
2717
2718         cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
2719         if (task_cpu(p) != cpu) {
2720                 wake_flags |= WF_MIGRATED;
2721                 set_task_cpu(p, cpu);
2722         }
2723 #endif /* CONFIG_SMP */
2724
2725         ttwu_queue(p, cpu);
2726 stat:
2727         ttwu_stat(p, cpu, wake_flags);
2728 out:
2729         raw_spin_unlock_irqrestore(&p->pi_lock, flags);
2730
2731         return success;
2732 }
2733
2734 /**
2735  * try_to_wake_up_local - try to wake up a local task with rq lock held
2736  * @p: the thread to be awakened
2737  *
2738  * Put @p on the run-queue if it's not already there. The caller must
2739  * ensure that this_rq() is locked, @p is bound to this_rq() and not
2740  * the current task.
2741  */
2742 static void try_to_wake_up_local(struct task_struct *p)
2743 {
2744         struct rq *rq = task_rq(p);
2745
2746         BUG_ON(rq != this_rq());
2747         BUG_ON(p == current);
2748         lockdep_assert_held(&rq->lock);
2749
2750         if (!raw_spin_trylock(&p->pi_lock)) {
2751                 raw_spin_unlock(&rq->lock);
2752                 raw_spin_lock(&p->pi_lock);
2753                 raw_spin_lock(&rq->lock);
2754         }
2755
2756         if (!(p->state & TASK_NORMAL))
2757                 goto out;
2758
2759         if (!p->on_rq)
2760                 ttwu_activate(rq, p, ENQUEUE_WAKEUP);
2761
2762         ttwu_do_wakeup(rq, p, 0);
2763         ttwu_stat(p, smp_processor_id(), 0);
2764 out:
2765         raw_spin_unlock(&p->pi_lock);
2766 }
2767
2768 /**
2769  * wake_up_process - Wake up a specific process
2770  * @p: The process to be woken up.
2771  *
2772  * Attempt to wake up the nominated process and move it to the set of runnable
2773  * processes.  Returns 1 if the process was woken up, 0 if it was already
2774  * running.
2775  *
2776  * It may be assumed that this function implies a write memory barrier before
2777  * changing the task state if and only if any tasks are woken up.
2778  */
2779 int wake_up_process(struct task_struct *p)
2780 {
2781         return try_to_wake_up(p, TASK_ALL, 0);
2782 }
2783 EXPORT_SYMBOL(wake_up_process);
2784
2785 int wake_up_state(struct task_struct *p, unsigned int state)
2786 {
2787         return try_to_wake_up(p, state, 0);
2788 }
2789
2790 /*
2791  * Perform scheduler related setup for a newly forked process p.
2792  * p is forked by current.
2793  *
2794  * __sched_fork() is basic setup used by init_idle() too:
2795  */
2796 static void __sched_fork(struct task_struct *p)
2797 {
2798         p->on_rq                        = 0;
2799
2800         p->se.on_rq                     = 0;
2801         p->se.exec_start                = 0;
2802         p->se.sum_exec_runtime          = 0;
2803         p->se.prev_sum_exec_runtime     = 0;
2804         p->se.nr_migrations             = 0;
2805         p->se.vruntime                  = 0;
2806         INIT_LIST_HEAD(&p->se.group_node);
2807
2808 #ifdef CONFIG_SCHEDSTATS
2809         memset(&p->se.statistics, 0, sizeof(p->se.statistics));
2810 #endif
2811
2812         INIT_LIST_HEAD(&p->rt.run_list);
2813
2814 #ifdef CONFIG_PREEMPT_NOTIFIERS
2815         INIT_HLIST_HEAD(&p->preempt_notifiers);
2816 #endif
2817 }
2818
2819 /*
2820  * fork()/clone()-time setup:
2821  */
2822 void sched_fork(struct task_struct *p)
2823 {
2824         unsigned long flags;
2825         int cpu = get_cpu();
2826
2827         __sched_fork(p);
2828         /*
2829          * We mark the process as running here. This guarantees that
2830          * nobody will actually run it, and a signal or other external
2831          * event cannot wake it up and insert it on the runqueue either.
2832          */
2833         p->state = TASK_RUNNING;
2834
2835         /*
2836          * Revert to default priority/policy on fork if requested.
2837          */
2838         if (unlikely(p->sched_reset_on_fork)) {
2839                 if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) {
2840                         p->policy = SCHED_NORMAL;
2841                         p->normal_prio = p->static_prio;
2842                 }
2843
2844                 if (PRIO_TO_NICE(p->static_prio) < 0) {
2845                         p->static_prio = NICE_TO_PRIO(0);
2846                         p->normal_prio = p->static_prio;
2847                         set_load_weight(p);
2848                 }
2849
2850                 /*
2851                  * We don't need the reset flag anymore after the fork. It has
2852                  * fulfilled its duty:
2853                  */
2854                 p->sched_reset_on_fork = 0;
2855         }
2856
2857         /*
2858          * Make sure we do not leak PI boosting priority to the child.
2859          */
2860         p->prio = current->normal_prio;
2861
2862         if (!rt_prio(p->prio))
2863                 p->sched_class = &fair_sched_class;
2864
2865         if (p->sched_class->task_fork)
2866                 p->sched_class->task_fork(p);
2867
2868         /*
2869          * The child is not yet in the pid-hash so no cgroup attach races,
2870          * and the cgroup is pinned to this child due to cgroup_fork()
2871          * is ran before sched_fork().
2872          *
2873          * Silence PROVE_RCU.
2874          */
2875         raw_spin_lock_irqsave(&p->pi_lock, flags);
2876         set_task_cpu(p, cpu);
2877         raw_spin_unlock_irqrestore(&p->pi_lock, flags);
2878
2879 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
2880         if (likely(sched_info_on()))
2881                 memset(&p->sched_info, 0, sizeof(p->sched_info));
2882 #endif
2883 #if defined(CONFIG_SMP)
2884         p->on_cpu = 0;
2885 #endif
2886 #ifdef CONFIG_PREEMPT
2887         /* Want to start with kernel preemption disabled. */
2888         task_thread_info(p)->preempt_count = 1;
2889 #endif
2890 #ifdef CONFIG_SMP
2891         plist_node_init(&p->pushable_tasks, MAX_PRIO);
2892 #endif
2893
2894         put_cpu();
2895 }
2896
2897 /*
2898  * wake_up_new_task - wake up a newly created task for the first time.
2899  *
2900  * This function will do some initial scheduler statistics housekeeping
2901  * that must be done for every newly created context, then puts the task
2902  * on the runqueue and wakes it.
2903  */
2904 void wake_up_new_task(struct task_struct *p)
2905 {
2906         unsigned long flags;
2907         struct rq *rq;
2908
2909         raw_spin_lock_irqsave(&p->pi_lock, flags);
2910 #ifdef CONFIG_SMP
2911         /*
2912          * Fork balancing, do it here and not earlier because:
2913          *  - cpus_allowed can change in the fork path
2914          *  - any previously selected cpu might disappear through hotplug
2915          */
2916         set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0));
2917 #endif
2918
2919         rq = __task_rq_lock(p);
2920         activate_task(rq, p, 0);
2921         p->on_rq = 1;
2922         trace_sched_wakeup_new(p, true);
2923         check_preempt_curr(rq, p, WF_FORK);
2924 #ifdef CONFIG_SMP
2925         if (p->sched_class->task_woken)
2926                 p->sched_class->task_woken(rq, p);
2927 #endif
2928         task_rq_unlock(rq, p, &flags);
2929 }
2930
2931 #ifdef CONFIG_PREEMPT_NOTIFIERS
2932
2933 /**
2934  * preempt_notifier_register - tell me when current is being preempted & rescheduled
2935  * @notifier: notifier struct to register
2936  */
2937 void preempt_notifier_register(struct preempt_notifier *notifier)
2938 {
2939         hlist_add_head(&notifier->link, &current->preempt_notifiers);
2940 }
2941 EXPORT_SYMBOL_GPL(preempt_notifier_register);
2942
2943 /**
2944  * preempt_notifier_unregister - no longer interested in preemption notifications
2945  * @notifier: notifier struct to unregister
2946  *
2947  * This is safe to call from within a preemption notifier.
2948  */
2949 void preempt_notifier_unregister(struct preempt_notifier *notifier)
2950 {
2951         hlist_del(&notifier->link);
2952 }
2953 EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
2954
2955 static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
2956 {
2957         struct preempt_notifier *notifier;
2958         struct hlist_node *node;
2959
2960         hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
2961                 notifier->ops->sched_in(notifier, raw_smp_processor_id());
2962 }
2963
2964 static void
2965 fire_sched_out_preempt_notifiers(struct task_struct *curr,
2966                                  struct task_struct *next)
2967 {
2968         struct preempt_notifier *notifier;
2969         struct hlist_node *node;
2970
2971         hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
2972                 notifier->ops->sched_out(notifier, next);
2973 }
2974
2975 #else /* !CONFIG_PREEMPT_NOTIFIERS */
2976
2977 static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
2978 {
2979 }
2980
2981 static void
2982 fire_sched_out_preempt_notifiers(struct task_struct *curr,
2983                                  struct task_struct *next)
2984 {
2985 }
2986
2987 #endif /* CONFIG_PREEMPT_NOTIFIERS */
2988
2989 /**
2990  * prepare_task_switch - prepare to switch tasks
2991  * @rq: the runqueue preparing to switch
2992  * @prev: the current task that is being switched out
2993  * @next: the task we are going to switch to.
2994  *
2995  * This is called with the rq lock held and interrupts off. It must
2996  * be paired with a subsequent finish_task_switch after the context
2997  * switch.
2998  *
2999  * prepare_task_switch sets up locking and calls architecture specific
3000  * hooks.
3001  */
3002 static inline void
3003 prepare_task_switch(struct rq *rq, struct task_struct *prev,
3004                     struct task_struct *next)
3005 {
3006         sched_info_switch(prev, next);
3007         perf_event_task_sched_out(prev, next);
3008         fire_sched_out_preempt_notifiers(prev, next);
3009         prepare_lock_switch(rq, next);
3010         prepare_arch_switch(next);
3011         trace_sched_switch(prev, next);
3012 }
3013
3014 /**
3015  * finish_task_switch - clean up after a task-switch
3016  * @rq: runqueue associated with task-switch
3017  * @prev: the thread we just switched away from.
3018  *
3019  * finish_task_switch must be called after the context switch, paired
3020  * with a prepare_task_switch call before the context switch.
3021  * finish_task_switch will reconcile locking set up by prepare_task_switch,
3022  * and do any other architecture-specific cleanup actions.
3023  *
3024  * Note that we may have delayed dropping an mm in context_switch(). If
3025  * so, we finish that here outside of the runqueue lock. (Doing it
3026  * with the lock held can cause deadlocks; see schedule() for
3027  * details.)
3028  */
3029 static void finish_task_switch(struct rq *rq, struct task_struct *prev)
3030         __releases(rq->lock)
3031 {
3032         struct mm_struct *mm = rq->prev_mm;
3033         long prev_state;
3034
3035         rq->prev_mm = NULL;
3036
3037         /*
3038          * A task struct has one reference for the use as "current".
3039          * If a task dies, then it sets TASK_DEAD in tsk->state and calls
3040          * schedule one last time. The schedule call will never return, and
3041          * the scheduled task must drop that reference.
3042          * The test for TASK_DEAD must occur while the runqueue locks are
3043          * still held, otherwise prev could be scheduled on another cpu, die
3044          * there before we look at prev->state, and then the reference would
3045          * be dropped twice.
3046          *              Manfred Spraul <manfred@colorfullife.com>
3047          */
3048         prev_state = prev->state;
3049         finish_arch_switch(prev);
3050 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
3051         local_irq_disable();
3052 #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
3053         perf_event_task_sched_in(current);
3054 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
3055         local_irq_enable();
3056 #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
3057         finish_lock_switch(rq, prev);
3058
3059         fire_sched_in_preempt_notifiers(current);
3060         if (mm)
3061                 mmdrop(mm);
3062         if (unlikely(prev_state == TASK_DEAD)) {
3063                 /*
3064                  * Remove function-return probe instances associated with this
3065                  * task and put them back on the free list.
3066                  */
3067                 kprobe_flush_task(prev);
3068                 put_task_struct(prev);
3069         }
3070 }
3071
3072 #ifdef CONFIG_SMP
3073
3074 /* assumes rq->lock is held */
3075 static inline void pre_schedule(struct rq *rq, struct task_struct *prev)
3076 {
3077         if (prev->sched_class->pre_schedule)
3078                 prev->sched_class->pre_schedule(rq, prev);
3079 }
3080
3081 /* rq->lock is NOT held, but preemption is disabled */
3082 static inline void post_schedule(struct rq *rq)
3083 {
3084         if (rq->post_schedule) {
3085                 unsigned long flags;
3086
3087                 raw_spin_lock_irqsave(&rq->lock, flags);
3088                 if (rq->curr->sched_class->post_schedule)
3089                         rq->curr->sched_class->post_schedule(rq);
3090                 raw_spin_unlock_irqrestore(&rq->lock, flags);
3091
3092                 rq->post_schedule = 0;
3093         }
3094 }
3095
3096 #else
3097
3098 static inline void pre_schedule(struct rq *rq, struct task_struct *p)
3099 {
3100 }
3101
3102 static inline void post_schedule(struct rq *rq)
3103 {
3104 }
3105
3106 #endif
3107
3108 /**
3109  * schedule_tail - first thing a freshly forked thread must call.
3110  * @prev: the thread we just switched away from.
3111  */
3112 asmlinkage void schedule_tail(struct task_struct *prev)
3113         __releases(rq->lock)
3114 {
3115         struct rq *rq = this_rq();
3116
3117         finish_task_switch(rq, prev);
3118
3119         /*
3120          * FIXME: do we need to worry about rq being invalidated by the
3121          * task_switch?
3122          */
3123         post_schedule(rq);
3124
3125 #ifdef __ARCH_WANT_UNLOCKED_CTXSW
3126         /* In this case, finish_task_switch does not reenable preemption */
3127         preempt_enable();
3128 #endif
3129         if (current->set_child_tid)
3130                 put_user(task_pid_vnr(current), current->set_child_tid);
3131 }
3132
3133 /*
3134  * context_switch - switch to the new MM and the new
3135  * thread's register state.
3136  */
3137 static inline void
3138 context_switch(struct rq *rq, struct task_struct *prev,
3139                struct task_struct *next)
3140 {
3141         struct mm_struct *mm, *oldmm;
3142
3143         prepare_task_switch(rq, prev, next);
3144
3145         mm = next->mm;
3146         oldmm = prev->active_mm;
3147         /*
3148          * For paravirt, this is coupled with an exit in switch_to to
3149          * combine the page table reload and the switch backend into
3150          * one hypercall.
3151          */
3152         arch_start_context_switch(prev);
3153
3154         if (!mm) {
3155                 next->active_mm = oldmm;
3156                 atomic_inc(&oldmm->mm_count);
3157                 enter_lazy_tlb(oldmm, next);
3158         } else
3159                 switch_mm(oldmm, mm, next);
3160
3161         if (!prev->mm) {
3162                 prev->active_mm = NULL;
3163                 rq->prev_mm = oldmm;
3164         }
3165         /*
3166          * Since the runqueue lock will be released by the next
3167          * task (which is an invalid locking op but in the case
3168          * of the scheduler it's an obvious special-case), so we
3169          * do an early lockdep release here:
3170          */
3171 #ifndef __ARCH_WANT_UNLOCKED_CTXSW
3172         spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
3173 #endif
3174
3175         /* Here we just switch the register state and the stack. */
3176         switch_to(prev, next, prev);
3177
3178         barrier();
3179         /*
3180          * this_rq must be evaluated again because prev may have moved
3181          * CPUs since it called schedule(), thus the 'rq' on its stack
3182          * frame will be invalid.
3183          */
3184         finish_task_switch(this_rq(), prev);
3185 }
3186
3187 /*
3188  * nr_running, nr_uninterruptible and nr_context_switches:
3189  *
3190  * externally visible scheduler statistics: current number of runnable
3191  * threads, current number of uninterruptible-sleeping threads, total
3192  * number of context switches performed since bootup.
3193  */
3194 unsigned long nr_running(void)
3195 {
3196         unsigned long i, sum = 0;
3197
3198         for_each_online_cpu(i)
3199                 sum += cpu_rq(i)->nr_running;
3200
3201         return sum;
3202 }
3203
3204 unsigned long nr_uninterruptible(void)
3205 {
3206         unsigned long i, sum = 0;
3207
3208         for_each_possible_cpu(i)
3209                 sum += cpu_rq(i)->nr_uninterruptible;
3210
3211         /*
3212          * Since we read the counters lockless, it might be slightly
3213          * inaccurate. Do not allow it to go below zero though:
3214          */
3215         if (unlikely((long)sum < 0))
3216                 sum = 0;
3217
3218         return sum;
3219 }
3220
3221 unsigned long long nr_context_switches(void)
3222 {
3223         int i;
3224         unsigned long long sum = 0;
3225
3226         for_each_possible_cpu(i)
3227                 sum += cpu_rq(i)->nr_switches;
3228
3229         return sum;
3230 }
3231
3232 unsigned long nr_iowait(void)
3233 {
3234         unsigned long i, sum = 0;
3235
3236         for_each_possible_cpu(i)
3237                 sum += atomic_read(&cpu_rq(i)->nr_iowait);
3238
3239         return sum;
3240 }
3241
3242 unsigned long nr_iowait_cpu(int cpu)
3243 {
3244         struct rq *this = cpu_rq(cpu);
3245         return atomic_read(&this->nr_iowait);
3246 }
3247
3248 unsigned long this_cpu_load(void)
3249 {
3250         struct rq *this = this_rq();
3251         return this->cpu_load[0];
3252 }
3253
3254
3255 /* Variables and functions for calc_load */
3256 static atomic_long_t calc_load_tasks;
3257 static unsigned long calc_load_update;
3258 unsigned long avenrun[3];
3259 EXPORT_SYMBOL(avenrun);
3260
3261 static long calc_load_fold_active(struct rq *this_rq)
3262 {
3263         long nr_active, delta = 0;
3264
3265         nr_active = this_rq->nr_running;
3266         nr_active += (long) this_rq->nr_uninterruptible;
3267
3268         if (nr_active != this_rq->calc_load_active) {
3269                 delta = nr_active - this_rq->calc_load_active;
3270                 this_rq->calc_load_active = nr_active;
3271         }
3272
3273         return delta;
3274 }
3275
3276 static unsigned long
3277 calc_load(unsigned long load, unsigned long exp, unsigned long active)
3278 {
3279         load *= exp;
3280         load += active * (FIXED_1 - exp);
3281         load += 1UL << (FSHIFT - 1);
3282         return load >> FSHIFT;
3283 }
3284
3285 #ifdef CONFIG_NO_HZ
3286 /*
3287  * For NO_HZ we delay the active fold to the next LOAD_FREQ update.
3288  *
3289  * When making the ILB scale, we should try to pull this in as well.
3290  */
3291 static atomic_long_t calc_load_tasks_idle;
3292
3293 static void calc_load_account_idle(struct rq *this_rq)
3294 {
3295         long delta;
3296
3297         delta = calc_load_fold_active(this_rq);
3298         if (delta)
3299                 atomic_long_add(delta, &calc_load_tasks_idle);
3300 }
3301
3302 static long calc_load_fold_idle(void)
3303 {
3304         long delta = 0;
3305
3306         /*
3307          * Its got a race, we don't care...
3308          */
3309         if (atomic_long_read(&calc_load_tasks_idle))
3310                 delta = atomic_long_xchg(&calc_load_tasks_idle, 0);
3311
3312         return delta;
3313 }
3314
3315 /**
3316  * fixed_power_int - compute: x^n, in O(log n) time
3317  *
3318  * @x:         base of the power
3319  * @frac_bits: fractional bits of @x
3320  * @n:         power to raise @x to.
3321  *
3322  * By exploiting the relation between the definition of the natural power
3323  * function: x^n := x*x*...*x (x multiplied by itself for n times), and
3324  * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i,
3325  * (where: n_i \elem {0, 1}, the binary vector representing n),
3326  * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is
3327  * of course trivially computable in O(log_2 n), the length of our binary
3328  * vector.
3329  */
3330 static unsigned long
3331 fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
3332 {
3333         unsigned long result = 1UL << frac_bits;
3334
3335         if (n) for (;;) {
3336                 if (n & 1) {
3337                         result *= x;
3338                         result += 1UL << (frac_bits - 1);
3339                         result >>= frac_bits;
3340                 }
3341                 n >>= 1;
3342                 if (!n)
3343                         break;
3344                 x *= x;
3345                 x += 1UL << (frac_bits - 1);
3346                 x >>= frac_bits;
3347         }
3348
3349         return result;
3350 }
3351
3352 /*
3353  * a1 = a0 * e + a * (1 - e)
3354  *
3355  * a2 = a1 * e + a * (1 - e)
3356  *    = (a0 * e + a * (1 - e)) * e + a * (1 - e)
3357  *    = a0 * e^2 + a * (1 - e) * (1 + e)
3358  *
3359  * a3 = a2 * e + a * (1 - e)
3360  *    = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e)
3361  *    = a0 * e^3 + a * (1 - e) * (1 + e + e^2)
3362  *
3363  *  ...
3364  *
3365  * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1]
3366  *    = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e)
3367  *    = a0 * e^n + a * (1 - e^n)
3368  *
3369  * [1] application of the geometric series:
3370  *
3371  *              n         1 - x^(n+1)
3372  *     S_n := \Sum x^i = -------------
3373  *             i=0          1 - x
3374  */
3375 static unsigned long
3376 calc_load_n(unsigned long load, unsigned long exp,
3377             unsigned long active, unsigned int n)
3378 {
3379
3380         return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
3381 }
3382
3383 /*
3384  * NO_HZ can leave us missing all per-cpu ticks calling
3385  * calc_load_account_active(), but since an idle CPU folds its delta into
3386  * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold
3387  * in the pending idle delta if our idle period crossed a load cycle boundary.
3388  *
3389  * Once we've updated the global active value, we need to apply the exponential
3390  * weights adjusted to the number of cycles missed.
3391  */
3392 static void calc_global_nohz(void)
3393 {
3394         long delta, active, n;
3395
3396         /*
3397          * If we crossed a calc_load_update boundary, make sure to fold
3398          * any pending idle changes, the respective CPUs might have
3399          * missed the tick driven calc_load_account_active() update
3400          * due to NO_HZ.
3401          */
3402         delta = calc_load_fold_idle();
3403         if (delta)
3404                 atomic_long_add(delta, &calc_load_tasks);
3405
3406         /*
3407          * It could be the one fold was all it took, we done!
3408          */
3409         if (time_before(jiffies, calc_load_update + 10))
3410                 return;
3411
3412         /*
3413          * Catch-up, fold however many we are behind still
3414          */
3415         delta = jiffies - calc_load_update - 10;
3416         n = 1 + (delta / LOAD_FREQ);
3417
3418         active = atomic_long_read(&calc_load_tasks);
3419         active = active > 0 ? active * FIXED_1 : 0;
3420
3421         avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
3422         avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
3423         avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
3424
3425         calc_load_update += n * LOAD_FREQ;
3426 }
3427 #else
3428 static void calc_load_account_idle(struct rq *this_rq)
3429 {
3430 }
3431
3432 static inline long calc_load_fold_idle(void)
3433 {
3434         return 0;
3435 }
3436
3437 static void calc_global_nohz(void)
3438 {
3439 }
3440 #endif
3441
3442 /**
3443  * get_avenrun - get the load average array
3444  * @loads:      pointer to dest load array
3445  * @offset:     offset to add
3446  * @shift:      shift count to shift the result left
3447  *
3448  * These values are estimates at best, so no need for locking.
3449  */
3450 void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
3451 {
3452         loads[0] = (avenrun[0] + offset) << shift;
3453         loads[1] = (avenrun[1] + offset) << shift;
3454         loads[2] = (avenrun[2] + offset) << shift;
3455 }
3456
3457 /*
3458  * calc_load - update the avenrun load estimates 10 ticks after the
3459  * CPUs have updated calc_load_tasks.
3460  */
3461 void calc_global_load(unsigned long ticks)
3462 {
3463         long active;
3464
3465         if (time_before(jiffies, calc_load_update + 10))
3466                 return;
3467
3468         active = atomic_long_read(&calc_load_tasks);
3469         active = active > 0 ? active * FIXED_1 : 0;
3470
3471         avenrun[0] = calc_load(avenrun[0], EXP_1, active);
3472         avenrun[1] = calc_load(avenrun[1], EXP_5, active);
3473         avenrun[2] = calc_load(avenrun[2], EXP_15, active);
3474
3475         calc_load_update += LOAD_FREQ;
3476
3477         /*
3478          * Account one period with whatever state we found before
3479          * folding in the nohz state and ageing the entire idle period.
3480          *
3481          * This avoids loosing a sample when we go idle between
3482          * calc_load_account_active() (10 ticks ago) and now and thus
3483          * under-accounting.
3484          */
3485         calc_global_nohz();
3486 }
3487
3488 /*
3489  * Called from update_cpu_load() to periodically update this CPU's
3490  * active count.
3491  */
3492 static void calc_load_account_active(struct rq *this_rq)
3493 {
3494         long delta;
3495
3496         if (time_before(jiffies, this_rq->calc_load_update))
3497                 return;
3498
3499         delta  = calc_load_fold_active(this_rq);
3500         delta += calc_load_fold_idle();
3501         if (delta)
3502                 atomic_long_add(delta, &calc_load_tasks);
3503
3504         this_rq->calc_load_update += LOAD_FREQ;
3505 }
3506
3507 /*
3508  * The exact cpuload at various idx values, calculated at every tick would be
3509  * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
3510  *
3511  * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called
3512  * on nth tick when cpu may be busy, then we have:
3513  * load = ((2^idx - 1) / 2^idx)^(n-1) * load
3514  * load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load
3515  *
3516  * decay_load_missed() below does efficient calculation of
3517  * load = ((2^idx - 1) / 2^idx)^(n-1) * load
3518  * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load
3519  *
3520  * The calculation is approximated on a 128 point scale.
3521  * degrade_zero_ticks is the number of ticks after which load at any
3522  * particular idx is approximated to be zero.
3523  * degrade_factor is a precomputed table, a row for each load idx.
3524  * Each column corresponds to degradation factor for a power of two ticks,
3525  * based on 128 point scale.
3526  * Example:
3527  * row 2, col 3 (=12) says that the degradation at load idx 2 after
3528  * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8).
3529  *
3530  * With this power of 2 load factors, we can degrade the load n times
3531  * by looking at 1 bits in n and doing as many mult/shift instead of
3532  * n mult/shifts needed by the exact degradation.
3533  */
3534 #define DEGRADE_SHIFT           7
3535 static const unsigned char
3536                 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
3537 static const unsigned char
3538                 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
3539                                         {0, 0, 0, 0, 0, 0, 0, 0},
3540                                         {64, 32, 8, 0, 0, 0, 0, 0},
3541                                         {96, 72, 40, 12, 1, 0, 0},
3542                                         {112, 98, 75, 43, 15, 1, 0},
3543                                         {120, 112, 98, 76, 45, 16, 2} };
3544
3545 /*
3546  * Update cpu_load for any missed ticks, due to tickless idle. The backlog
3547  * would be when CPU is idle and so we just decay the old load without
3548  * adding any new load.
3549  */
3550 static unsigned long
3551 decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
3552 {
3553         int j = 0;
3554
3555         if (!missed_updates)
3556                 return load;
3557
3558         if (missed_updates >= degrade_zero_ticks[idx])
3559                 return 0;
3560
3561         if (idx == 1)
3562                 return load >> missed_updates;
3563
3564         while (missed_updates) {
3565                 if (missed_updates % 2)
3566                         load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
3567
3568                 missed_updates >>= 1;
3569                 j++;
3570         }
3571         return load;
3572 }
3573
3574 /*
3575  * Update rq->cpu_load[] statistics. This function is usually called every
3576  * scheduler tick (TICK_NSEC). With tickless idle this will not be called
3577  * every tick. We fix it up based on jiffies.
3578  */
3579 static void update_cpu_load(struct rq *this_rq)
3580 {
3581         unsigned long this_load = this_rq->load.weight;
3582         unsigned long curr_jiffies = jiffies;
3583         unsigned long pending_updates;
3584         int i, scale;
3585
3586         this_rq->nr_load_updates++;
3587
3588         /* Avoid repeated calls on same jiffy, when moving in and out of idle */
3589         if (curr_jiffies == this_rq->last_load_update_tick)
3590                 return;
3591
3592         pending_updates = curr_jiffies - this_rq->last_load_update_tick;
3593         this_rq->last_load_update_tick = curr_jiffies;
3594
3595         /* Update our load: */
3596         this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
3597         for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
3598                 unsigned long old_load, new_load;
3599
3600                 /* scale is effectively 1 << i now, and >> i divides by scale */
3601
3602                 old_load = this_rq->cpu_load[i];
3603                 old_load = decay_load_missed(old_load, pending_updates - 1, i);
3604                 new_load = this_load;
3605                 /*
3606                  * Round up the averaging division if load is increasing. This
3607                  * prevents us from getting stuck on 9 if the load is 10, for
3608                  * example.
3609                  */
3610                 if (new_load > old_load)
3611                         new_load += scale - 1;
3612
3613                 this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
3614         }
3615
3616         sched_avg_update(this_rq);
3617 }
3618
3619 static void update_cpu_load_active(struct rq *this_rq)
3620 {
3621         update_cpu_load(this_rq);
3622
3623         calc_load_account_active(this_rq);
3624 }
3625
3626 #ifdef CONFIG_SMP
3627
3628 /*
3629  * sched_exec - execve() is a valuable balancing opportunity, because at
3630  * this point the task has the smallest effective memory and cache footprint.
3631  */
3632 void sched_exec(void)
3633 {
3634         struct task_struct *p = current;
3635         unsigned long flags;
3636         int dest_cpu;
3637
3638         raw_spin_lock_irqsave(&p->pi_lock, flags);
3639         dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0);
3640         if (dest_cpu == smp_processor_id())
3641                 goto unlock;
3642
3643         if (likely(cpu_active(dest_cpu))) {
3644                 struct migration_arg arg = { p, dest_cpu };
3645
3646                 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
3647                 stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
3648                 return;
3649         }
3650 unlock:
3651         raw_spin_unlock_irqrestore(&p->pi_lock, flags);
3652 }
3653
3654 #endif
3655
3656 DEFINE_PER_CPU(struct kernel_stat, kstat);
3657
3658 EXPORT_PER_CPU_SYMBOL(kstat);
3659
3660 /*
3661  * Return any ns on the sched_clock that have not yet been accounted in
3662  * @p in case that task is currently running.
3663  *
3664  * Called with task_rq_lock() held on @rq.
3665  */
3666 static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
3667 {
3668         u64 ns = 0;
3669
3670         if (task_current(rq, p)) {
3671                 update_rq_clock(rq);
3672                 ns = rq->clock_task - p->se.exec_start;
3673                 if ((s64)ns < 0)
3674                         ns = 0;
3675         }
3676
3677         return ns;
3678 }
3679
3680 unsigned long long task_delta_exec(struct task_struct *p)
3681 {
3682         unsigned long flags;
3683         struct rq *rq;
3684         u64 ns = 0;
3685
3686         rq = task_rq_lock(p, &flags);
3687         ns = do_task_delta_exec(p, rq);
3688         task_rq_unlock(rq, p, &flags);
3689
3690         return ns;
3691 }
3692
3693 /*
3694  * Return accounted runtime for the task.
3695  * In case the task is currently running, return the runtime plus current's
3696  * pending runtime that have not been accounted yet.
3697  */
3698 unsigned long long task_sched_runtime(struct task_struct *p)
3699 {
3700         unsigned long flags;
3701         struct rq *rq;
3702         u64 ns = 0;
3703
3704         rq = task_rq_lock(p, &flags);
3705         ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
3706         task_rq_unlock(rq, p, &flags);
3707
3708         return ns;
3709 }
3710
3711 /*
3712  * Account user cpu time to a process.
3713  * @p: the process that the cpu time gets accounted to
3714  * @cputime: the cpu time spent in user space since the last update
3715  * @cputime_scaled: cputime scaled by cpu frequency
3716  */
3717 void account_user_time(struct task_struct *p, cputime_t cputime,
3718                        cputime_t cputime_scaled)
3719 {
3720         struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3721         cputime64_t tmp;
3722
3723         /* Add user time to process. */
3724         p->utime = cputime_add(p->utime, cputime);
3725         p->utimescaled = cputime_add(p->utimescaled, cputime_scaled);
3726         account_group_user_time(p, cputime);
3727
3728         /* Add user time to cpustat. */
3729         tmp = cputime_to_cputime64(cputime);
3730         if (TASK_NICE(p) > 0)
3731                 cpustat->nice = cputime64_add(cpustat->nice, tmp);
3732         else
3733                 cpustat->user = cputime64_add(cpustat->user, tmp);
3734
3735         cpuacct_update_stats(p, CPUACCT_STAT_USER, cputime);
3736         /* Account for user time used */
3737         acct_update_integrals(p);
3738 }
3739
3740 /*
3741  * Account guest cpu time to a process.
3742  * @p: the process that the cpu time gets accounted to
3743  * @cputime: the cpu time spent in virtual machine since the last update
3744  * @cputime_scaled: cputime scaled by cpu frequency
3745  */
3746 static void account_guest_time(struct task_struct *p, cputime_t cputime,
3747                                cputime_t cputime_scaled)
3748 {
3749         cputime64_t tmp;
3750         struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3751
3752         tmp = cputime_to_cputime64(cputime);
3753
3754         /* Add guest time to process. */
3755         p->utime = cputime_add(p->utime, cputime);
3756         p->utimescaled = cputime_add(p->utimescaled, cputime_scaled);
3757         account_group_user_time(p, cputime);
3758         p->gtime = cputime_add(p->gtime, cputime);
3759
3760         /* Add guest time to cpustat. */
3761         if (TASK_NICE(p) > 0) {
3762                 cpustat->nice = cputime64_add(cpustat->nice, tmp);
3763                 cpustat->guest_nice = cputime64_add(cpustat->guest_nice, tmp);
3764         } else {
3765                 cpustat->user = cputime64_add(cpustat->user, tmp);
3766                 cpustat->guest = cputime64_add(cpustat->guest, tmp);
3767         }
3768 }
3769
3770 /*
3771  * Account system cpu time to a process and desired cpustat field
3772  * @p: the process that the cpu time gets accounted to
3773  * @cputime: the cpu time spent in kernel space since the last update
3774  * @cputime_scaled: cputime scaled by cpu frequency
3775  * @target_cputime64: pointer to cpustat field that has to be updated
3776  */
3777 static inline
3778 void __account_system_time(struct task_struct *p, cputime_t cputime,
3779                         cputime_t cputime_scaled, cputime64_t *target_cputime64)
3780 {
3781         cputime64_t tmp = cputime_to_cputime64(cputime);
3782
3783         /* Add system time to process. */
3784         p->stime = cputime_add(p->stime, cputime);
3785         p->stimescaled = cputime_add(p->stimescaled, cputime_scaled);
3786         account_group_system_time(p, cputime);
3787
3788         /* Add system time to cpustat. */
3789         *target_cputime64 = cputime64_add(*target_cputime64, tmp);
3790         cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);
3791
3792         /* Account for system time used */
3793         acct_update_integrals(p);
3794 }
3795
3796 /*
3797  * Account system cpu time to a process.
3798  * @p: the process that the cpu time gets accounted to
3799  * @hardirq_offset: the offset to subtract from hardirq_count()
3800  * @cputime: the cpu time spent in kernel space since the last update
3801  * @cputime_scaled: cputime scaled by cpu frequency
3802  */
3803 void account_system_time(struct task_struct *p, int hardirq_offset,
3804                          cputime_t cputime, cputime_t cputime_scaled)
3805 {
3806         struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3807         cputime64_t *target_cputime64;
3808
3809         if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
3810                 account_guest_time(p, cputime, cputime_scaled);
3811                 return;
3812         }
3813
3814         if (hardirq_count() - hardirq_offset)
3815                 target_cputime64 = &cpustat->irq;
3816         else if (in_serving_softirq())
3817                 target_cputime64 = &cpustat->softirq;
3818         else
3819                 target_cputime64 = &cpustat->system;
3820
3821         __account_system_time(p, cputime, cputime_scaled, target_cputime64);
3822 }
3823
3824 /*
3825  * Account for involuntary wait time.
3826  * @cputime: the cpu time spent in involuntary wait
3827  */
3828 void account_steal_time(cputime_t cputime)
3829 {
3830         struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3831         cputime64_t cputime64 = cputime_to_cputime64(cputime);
3832
3833         cpustat->steal = cputime64_add(cpustat->steal, cputime64);
3834 }
3835
3836 /*
3837  * Account for idle time.
3838  * @cputime: the cpu time spent in idle wait
3839  */
3840 void account_idle_time(cputime_t cputime)
3841 {
3842         struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3843         cputime64_t cputime64 = cputime_to_cputime64(cputime);
3844         struct rq *rq = this_rq();
3845
3846         if (atomic_read(&rq->nr_iowait) > 0)
3847                 cpustat->iowait = cputime64_add(cpustat->iowait, cputime64);
3848         else
3849                 cpustat->idle = cputime64_add(cpustat->idle, cputime64);
3850 }
3851
3852 #ifndef CONFIG_VIRT_CPU_ACCOUNTING
3853
3854 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
3855 /*
3856  * Account a tick to a process and cpustat
3857  * @p: the process that the cpu time gets accounted to
3858  * @user_tick: is the tick from userspace
3859  * @rq: the pointer to rq
3860  *
3861  * Tick demultiplexing follows the order
3862  * - pending hardirq update
3863  * - pending softirq update
3864  * - user_time
3865  * - idle_time
3866  * - system time
3867  *   - check for guest_time
3868  *   - else account as system_time
3869  *
3870  * Check for hardirq is done both for system and user time as there is
3871  * no timer going off while we are on hardirq and hence we may never get an
3872  * opportunity to update it solely in system time.
3873  * p->stime and friends are only updated on system time and not on irq
3874  * softirq as those do not count in task exec_runtime any more.
3875  */
3876 static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
3877                                                 struct rq *rq)
3878 {
3879         cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
3880         cputime64_t tmp = cputime_to_cputime64(cputime_one_jiffy);
3881         struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3882
3883         if (irqtime_account_hi_update()) {
3884                 cpustat->irq = cputime64_add(cpustat->irq, tmp);
3885         } else if (irqtime_account_si_update()) {
3886                 cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
3887         } else if (this_cpu_ksoftirqd() == p) {
3888                 /*
3889                  * ksoftirqd time do not get accounted in cpu_softirq_time.
3890                  * So, we have to handle it separately here.
3891                  * Also, p->stime needs to be updated for ksoftirqd.
3892                  */
3893                 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
3894                                         &cpustat->softirq);
3895         } else if (user_tick) {
3896                 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
3897         } else if (p == rq->idle) {
3898                 account_idle_time(cputime_one_jiffy);
3899         } else if (p->flags & PF_VCPU) { /* System time or guest time */
3900                 account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled);
3901         } else {
3902                 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
3903                                         &cpustat->system);
3904         }
3905 }
3906
3907 static void irqtime_account_idle_ticks(int ticks)
3908 {
3909         int i;
3910         struct rq *rq = this_rq();
3911
3912         for (i = 0; i < ticks; i++)
3913                 irqtime_account_process_tick(current, 0, rq);
3914 }
3915 #else /* CONFIG_IRQ_TIME_ACCOUNTING */
3916 static void irqtime_account_idle_ticks(int ticks) {}
3917 static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
3918                                                 struct rq *rq) {}
3919 #endif /* CONFIG_IRQ_TIME_ACCOUNTING */
3920
3921 /*
3922  * Account a single tick of cpu time.
3923  * @p: the process that the cpu time gets accounted to
3924  * @user_tick: indicates if the tick is a user or a system tick
3925  */
3926 void account_process_tick(struct task_struct *p, int user_tick)
3927 {
3928         cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
3929         struct rq *rq = this_rq();
3930
3931         if (sched_clock_irqtime) {
3932                 irqtime_account_process_tick(p, user_tick, rq);
3933                 return;
3934         }
3935
3936         if (user_tick)
3937                 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
3938         else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
3939                 account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy,
3940                                     one_jiffy_scaled);
3941         else
3942                 account_idle_time(cputime_one_jiffy);
3943 }
3944
3945 /*
3946  * Account multiple ticks of steal time.
3947  * @p: the process from which the cpu time has been stolen
3948  * @ticks: number of stolen ticks
3949  */
3950 void account_steal_ticks(unsigned long ticks)
3951 {
3952         account_steal_time(jiffies_to_cputime(ticks));
3953 }
3954
3955 /*
3956  * Account multiple ticks of idle time.
3957  * @ticks: number of stolen ticks
3958  */
3959 void account_idle_ticks(unsigned long ticks)
3960 {
3961
3962         if (sched_clock_irqtime) {
3963                 irqtime_account_idle_ticks(ticks);
3964                 return;
3965         }
3966
3967         account_idle_time(jiffies_to_cputime(ticks));
3968 }
3969
3970 #endif
3971
3972 /*
3973  * Use precise platform statistics if available:
3974  */
3975 #ifdef CONFIG_VIRT_CPU_ACCOUNTING
3976 void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
3977 {
3978         *ut = p->utime;
3979         *st = p->stime;
3980 }
3981
3982 void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
3983 {
3984         struct task_cputime cputime;
3985
3986         thread_group_cputime(p, &cputime);
3987
3988         *ut = cputime.utime;
3989         *st = cputime.stime;
3990 }
3991 #else
3992
3993 #ifndef nsecs_to_cputime
3994 # define nsecs_to_cputime(__nsecs)      nsecs_to_jiffies(__nsecs)
3995 #endif
3996
3997 void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
3998 {
3999         cputime_t rtime, utime = p->utime, total = cputime_add(utime, p->stime);
4000
4001         /*
4002          * Use CFS's precise accounting:
4003          */
4004         rtime = nsecs_to_cputime(p->se.sum_exec_runtime);
4005
4006         if (total) {
4007                 u64 temp = rtime;
4008
4009                 temp *= utime;
4010                 do_div(temp, total);
4011                 utime = (cputime_t)temp;
4012         } else
4013                 utime = rtime;
4014
4015         /*
4016          * Compare with previous values, to keep monotonicity:
4017          */
4018         p->prev_utime = max(p->prev_utime, utime);
4019         p->prev_stime = max(p->prev_stime, cputime_sub(rtime, p->prev_utime));
4020
4021         *ut = p->prev_utime;
4022         *st = p->prev_stime;
4023 }
4024
4025 /*
4026  * Must be called with siglock held.
4027  */
4028 void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
4029 {
4030         struct signal_struct *sig = p->signal;
4031         struct task_cputime cputime;
4032         cputime_t rtime, utime, total;
4033
4034         thread_group_cputime(p, &cputime);
4035
4036         total = cputime_add(cputime.utime, cputime.stime);
4037         rtime = nsecs_to_cputime(cputime.sum_exec_runtime);
4038
4039         if (total) {
4040                 u64 temp = rtime;
4041
4042                 temp *= cputime.utime;
4043                 do_div(temp, total);
4044                 utime = (cputime_t)temp;
4045         } else
4046                 utime = rtime;
4047
4048         sig->prev_utime = max(sig->prev_utime, utime);
4049         sig->prev_stime = max(sig->prev_stime,
4050                               cputime_sub(rtime, sig->prev_utime));
4051
4052         *ut = sig->prev_utime;
4053         *st = sig->prev_stime;
4054 }
4055 #endif
4056
4057 /*
4058  * This function gets called by the timer code, with HZ frequency.
4059  * We call it with interrupts disabled.
4060  */
4061 void scheduler_tick(void)
4062 {
4063         int cpu = smp_processor_id();
4064         struct rq *rq = cpu_rq(cpu);
4065         struct task_struct *curr = rq->curr;
4066
4067         sched_clock_tick();
4068
4069         raw_spin_lock(&rq->lock);
4070         update_rq_clock(rq);
4071         update_cpu_load_active(rq);
4072         curr->sched_class->task_tick(rq, curr, 0);
4073         raw_spin_unlock(&rq->lock);
4074
4075         perf_event_task_tick();
4076
4077 #ifdef CONFIG_SMP
4078         rq->idle_at_tick = idle_cpu(cpu);
4079         trigger_load_balance(rq, cpu);
4080 #endif
4081 }
4082
4083 notrace unsigned long get_parent_ip(unsigned long addr)
4084 {
4085         if (in_lock_functions(addr)) {
4086                 addr = CALLER_ADDR2;
4087                 if (in_lock_functions(addr))
4088                         addr = CALLER_ADDR3;
4089         }
4090         return addr;
4091 }
4092
4093 #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
4094                                 defined(CONFIG_PREEMPT_TRACER))
4095
4096 void __kprobes add_preempt_count(int val)
4097 {
4098 #ifdef CONFIG_DEBUG_PREEMPT
4099         /*
4100          * Underflow?
4101          */
4102         if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
4103                 return;
4104 #endif
4105         preempt_count() += val;
4106 #ifdef CONFIG_DEBUG_PREEMPT
4107         /*
4108          * Spinlock count overflowing soon?
4109          */
4110         DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
4111                                 PREEMPT_MASK - 10);
4112 #endif
4113         if (preempt_count() == val)
4114                 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
4115 }
4116 EXPORT_SYMBOL(add_preempt_count);
4117
4118 void __kprobes sub_preempt_count(int val)
4119 {
4120 #ifdef CONFIG_DEBUG_PREEMPT
4121         /*
4122          * Underflow?
4123          */
4124         if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
4125                 return;
4126         /*
4127          * Is the spinlock portion underflowing?
4128          */
4129         if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
4130                         !(preempt_count() & PREEMPT_MASK)))
4131                 return;
4132 #endif
4133
4134         if (preempt_count() == val)
4135                 trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
4136         preempt_count() -= val;
4137 }
4138 EXPORT_SYMBOL(sub_preempt_count);
4139
4140 #endif
4141
4142 /*
4143  * Print scheduling while atomic bug:
4144  */
4145 static noinline void __schedule_bug(struct task_struct *prev)
4146 {
4147         struct pt_regs *regs = get_irq_regs();
4148
4149         printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
4150                 prev->comm, prev->pid, preempt_count());
4151
4152         debug_show_held_locks(prev);
4153         print_modules();
4154         if (irqs_disabled())
4155                 print_irqtrace_events(prev);
4156
4157         if (regs)
4158                 show_regs(regs);
4159         else
4160                 dump_stack();
4161 }
4162
4163 /*
4164  * Various schedule()-time debugging checks and statistics:
4165  */
4166 static inline void schedule_debug(struct task_struct *prev)
4167 {
4168         /*
4169          * Test if we are atomic. Since do_exit() needs to call into
4170          * schedule() atomically, we ignore that path for now.
4171          * Otherwise, whine if we are scheduling when we should not be.
4172          */
4173         if (unlikely(in_atomic_preempt_off() && !prev->exit_state))
4174                 __schedule_bug(prev);
4175
4176         profile_hit(SCHED_PROFILING, __builtin_return_address(0));
4177
4178         schedstat_inc(this_rq(), sched_count);
4179 }
4180
4181 static void put_prev_task(struct rq *rq, struct task_struct *prev)
4182 {
4183         if (prev->on_rq || rq->skip_clock_update < 0)
4184                 update_rq_clock(rq);
4185         prev->sched_class->put_prev_task(rq, prev);
4186 }
4187
4188 /*
4189  * Pick up the highest-prio task:
4190  */
4191 static inline struct task_struct *
4192 pick_next_task(struct rq *rq)
4193 {
4194         const struct sched_class *class;
4195         struct task_struct *p;
4196
4197         /*
4198          * Optimization: we know that if all tasks are in
4199          * the fair class we can call that function directly:
4200          */
4201         if (likely(rq->nr_running == rq->cfs.nr_running)) {
4202                 p = fair_sched_class.pick_next_task(rq);
4203                 if (likely(p))
4204                         return p;
4205         }
4206
4207         for_each_class(class) {
4208                 p = class->pick_next_task(rq);
4209                 if (p)
4210                         return p;
4211         }
4212
4213         BUG(); /* the idle class will always have a runnable task */
4214 }
4215
4216 /*
4217  * __schedule() is the main scheduler function.
4218  */
4219 static void __sched __schedule(void)
4220 {
4221         struct task_struct *prev, *next;
4222         unsigned long *switch_count;
4223         struct rq *rq;
4224         int cpu;
4225
4226 need_resched:
4227         preempt_disable();
4228         cpu = smp_processor_id();
4229         rq = cpu_rq(cpu);
4230         rcu_note_context_switch(cpu);
4231         prev = rq->curr;
4232
4233         schedule_debug(prev);
4234
4235         if (sched_feat(HRTICK))
4236                 hrtick_clear(rq);
4237
4238         raw_spin_lock_irq(&rq->lock);
4239
4240         switch_count = &prev->nivcsw;
4241         if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
4242                 if (unlikely(signal_pending_state(prev->state, prev))) {
4243                         prev->state = TASK_RUNNING;
4244                 } else {
4245                         deactivate_task(rq, prev, DEQUEUE_SLEEP);
4246                         prev->on_rq = 0;
4247
4248                         /*
4249                          * If a worker went to sleep, notify and ask workqueue
4250                          * whether it wants to wake up a task to maintain
4251                          * concurrency.
4252                          */
4253                         if (prev->flags & PF_WQ_WORKER) {
4254                                 struct task_struct *to_wakeup;
4255
4256                                 to_wakeup = wq_worker_sleeping(prev, cpu);
4257                                 if (to_wakeup)
4258                                         try_to_wake_up_local(to_wakeup);
4259                         }
4260                 }
4261                 switch_count = &prev->nvcsw;
4262         }
4263
4264         pre_schedule(rq, prev);
4265
4266         if (unlikely(!rq->nr_running))
4267                 idle_balance(cpu, rq);
4268
4269         put_prev_task(rq, prev);
4270         next = pick_next_task(rq);
4271         clear_tsk_need_resched(prev);
4272         rq->skip_clock_update = 0;
4273
4274         if (likely(prev != next)) {
4275                 rq->nr_switches++;
4276                 rq->curr = next;
4277                 ++*switch_count;
4278
4279                 context_switch(rq, prev, next); /* unlocks the rq */
4280                 /*
4281                  * The context switch have flipped the stack from under us
4282                  * and restored the local variables which were saved when
4283                  * this task called schedule() in the past. prev == current
4284                  * is still correct, but it can be moved to another cpu/rq.
4285                  */
4286                 cpu = smp_processor_id();
4287                 rq = cpu_rq(cpu);
4288         } else
4289                 raw_spin_unlock_irq(&rq->lock);
4290
4291         post_schedule(rq);
4292
4293         preempt_enable_no_resched();
4294         if (need_resched())
4295                 goto need_resched;
4296 }
4297
4298 static inline void sched_submit_work(struct task_struct *tsk)
4299 {
4300         if (!tsk->state)
4301                 return;
4302         /*
4303          * If we are going to sleep and we have plugged IO queued,
4304          * make sure to submit it to avoid deadlocks.
4305          */
4306         if (blk_needs_flush_plug(tsk))
4307                 blk_schedule_flush_plug(tsk);
4308 }
4309
4310 asmlinkage void __sched schedule(void)
4311 {
4312         struct task_struct *tsk = current;
4313
4314         sched_submit_work(tsk);
4315         __schedule();
4316 }
4317 EXPORT_SYMBOL(schedule);
4318
4319 #ifdef CONFIG_MUTEX_SPIN_ON_OWNER
4320
4321 static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
4322 {
4323         bool ret = false;
4324
4325         rcu_read_lock();
4326         if (lock->owner != owner)
4327                 goto fail;
4328
4329         /*
4330          * Ensure we emit the owner->on_cpu, dereference _after_ checking
4331          * lock->owner still matches owner, if that fails, owner might
4332          * point to free()d memory, if it still matches, the rcu_read_lock()
4333          * ensures the memory stays valid.
4334          */
4335         barrier();
4336
4337         ret = owner->on_cpu;
4338 fail:
4339         rcu_read_unlock();
4340
4341         return ret;
4342 }
4343
4344 /*
4345  * Look out! "owner" is an entirely speculative pointer
4346  * access and not reliable.
4347  */
4348 int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
4349 {
4350         if (!sched_feat(OWNER_SPIN))
4351                 return 0;
4352
4353         while (owner_running(lock, owner)) {
4354                 if (need_resched())
4355                         return 0;
4356
4357                 arch_mutex_cpu_relax();
4358         }
4359
4360         /*
4361          * If the owner changed to another task there is likely
4362          * heavy contention, stop spinning.
4363          */
4364         if (lock->owner)
4365                 return 0;
4366
4367         return 1;
4368 }
4369 #endif
4370
4371 #ifdef CONFIG_PREEMPT
4372 /*
4373  * this is the entry point to schedule() from in-kernel preemption
4374  * off of preempt_enable. Kernel preemptions off return from interrupt
4375  * occur there and call schedule directly.
4376  */
4377 asmlinkage void __sched notrace preempt_schedule(void)
4378 {
4379         struct thread_info *ti = current_thread_info();
4380
4381         /*
4382          * If there is a non-zero preempt_count or interrupts are disabled,
4383          * we do not want to preempt the current task. Just return..
4384          */
4385         if (likely(ti->preempt_count || irqs_disabled()))
4386                 return;
4387
4388         do {
4389                 add_preempt_count_notrace(PREEMPT_ACTIVE);
4390                 __schedule();
4391                 sub_preempt_count_notrace(PREEMPT_ACTIVE);
4392
4393                 /*
4394                  * Check again in case we missed a preemption opportunity
4395                  * between schedule and now.
4396                  */
4397                 barrier();
4398         } while (need_resched());
4399 }
4400 EXPORT_SYMBOL(preempt_schedule);
4401
4402 /*
4403  * this is the entry point to schedule() from kernel preemption
4404  * off of irq context.
4405  * Note, that this is called and return with irqs disabled. This will
4406  * protect us against recursive calling from irq.
4407  */
4408 asmlinkage void __sched preempt_schedule_irq(void)
4409 {
4410         struct thread_info *ti = current_thread_info();
4411
4412         /* Catch callers which need to be fixed */
4413         BUG_ON(ti->preempt_count || !irqs_disabled());
4414
4415         do {
4416                 add_preempt_count(PREEMPT_ACTIVE);
4417                 local_irq_enable();
4418                 __schedule();
4419                 local_irq_disable();
4420                 sub_preempt_count(PREEMPT_ACTIVE);
4421
4422                 /*
4423                  * Check again in case we missed a preemption opportunity
4424                  * between schedule and now.
4425                  */
4426                 barrier();
4427         } while (need_resched());
4428 }
4429
4430 #endif /* CONFIG_PREEMPT */
4431
4432 int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
4433                           void *key)
4434 {
4435         return try_to_wake_up(curr->private, mode, wake_flags);
4436 }
4437 EXPORT_SYMBOL(default_wake_function);
4438
4439 /*
4440  * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
4441  * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
4442  * number) then we wake all the non-exclusive tasks and one exclusive task.
4443  *
4444  * There are circumstances in which we can try to wake a task which has already
4445  * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
4446  * zero in this (rare) case, and we handle it by continuing to scan the queue.
4447  */
4448 static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
4449                         int nr_exclusive, int wake_flags, void *key)
4450 {
4451         wait_queue_t *curr, *next;
4452
4453         list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
4454                 unsigned flags = curr->flags;
4455
4456                 if (curr->func(curr, mode, wake_flags, key) &&
4457                                 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
4458                         break;
4459         }
4460 }
4461
4462 /**
4463  * __wake_up - wake up threads blocked on a waitqueue.
4464  * @q: the waitqueue
4465  * @mode: which threads
4466  * @nr_exclusive: how many wake-one or wake-many threads to wake up
4467  * @key: is directly passed to the wakeup function
4468  *
4469  * It may be assumed that this function implies a write memory barrier before
4470  * changing the task state if and only if any tasks are woken up.
4471  */
4472 void __wake_up(wait_queue_head_t *q, unsigned int mode,
4473                         int nr_exclusive, void *key)
4474 {
4475         unsigned long flags;
4476
4477         spin_lock_irqsave(&q->lock, flags);
4478         __wake_up_common(q, mode, nr_exclusive, 0, key);
4479         spin_unlock_irqrestore(&q->lock, flags);
4480 }
4481 EXPORT_SYMBOL(__wake_up);
4482
4483 /*
4484  * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
4485  */
4486 void __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
4487 {
4488         __wake_up_common(q, mode, 1, 0, NULL);
4489 }
4490 EXPORT_SYMBOL_GPL(__wake_up_locked);
4491
4492 void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
4493 {
4494         __wake_up_common(q, mode, 1, 0, key);
4495 }
4496 EXPORT_SYMBOL_GPL(__wake_up_locked_key);
4497
4498 /**
4499  * __wake_up_sync_key - wake up threads blocked on a waitqueue.
4500  * @q: the waitqueue
4501  * @mode: which threads
4502  * @nr_exclusive: how many wake-one or wake-many threads to wake up
4503  * @key: opaque value to be passed to wakeup targets
4504  *
4505  * The sync wakeup differs that the waker knows that it will schedule
4506  * away soon, so while the target thread will be woken up, it will not
4507  * be migrated to another CPU - ie. the two threads are 'synchronized'
4508  * with each other. This can prevent needless bouncing between CPUs.
4509  *
4510  * On UP it can prevent extra preemption.
4511  *
4512  * It may be assumed that this function implies a write memory barrier before
4513  * changing the task state if and only if any tasks are woken up.
4514  */
4515 void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
4516                         int nr_exclusive, void *key)
4517 {
4518         unsigned long flags;
4519         int wake_flags = WF_SYNC;
4520
4521         if (unlikely(!q))
4522                 return;
4523
4524         if (unlikely(!nr_exclusive))
4525                 wake_flags = 0;
4526
4527         spin_lock_irqsave(&q->lock, flags);
4528         __wake_up_common(q, mode, nr_exclusive, wake_flags, key);
4529         spin_unlock_irqrestore(&q->lock, flags);
4530 }
4531 EXPORT_SYMBOL_GPL(__wake_up_sync_key);
4532
4533 /*
4534  * __wake_up_sync - see __wake_up_sync_key()
4535  */
4536 void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
4537 {
4538         __wake_up_sync_key(q, mode, nr_exclusive, NULL);
4539 }
4540 EXPORT_SYMBOL_GPL(__wake_up_sync);      /* For internal use only */
4541
4542 /**
4543  * complete: - signals a single thread waiting on this completion
4544  * @x:  holds the state of this particular completion
4545  *
4546  * This will wake up a single thread waiting on this completion. Threads will be
4547  * awakened in the same order in which they were queued.
4548  *
4549  * See also complete_all(), wait_for_completion() and related routines.
4550  *
4551  * It may be assumed that this function implies a write memory barrier before
4552  * changing the task state if and only if any tasks are woken up.
4553  */
4554 void complete(struct completion *x)
4555 {
4556         unsigned long flags;
4557
4558         spin_lock_irqsave(&x->wait.lock, flags);
4559         x->done++;
4560         __wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL);
4561         spin_unlock_irqrestore(&x->wait.lock, flags);
4562 }
4563 EXPORT_SYMBOL(complete);
4564
4565 /**
4566  * complete_all: - signals all threads waiting on this completion
4567  * @x:  holds the state of this particular completion
4568  *
4569  * This will wake up all threads waiting on this particular completion event.
4570  *
4571  * It may be assumed that this function implies a write memory barrier before
4572  * changing the task state if and only if any tasks are woken up.
4573  */
4574 void complete_all(struct completion *x)
4575 {
4576         unsigned long flags;
4577
4578         spin_lock_irqsave(&x->wait.lock, flags);
4579         x->done += UINT_MAX/2;
4580         __wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL);
4581         spin_unlock_irqrestore(&x->wait.lock, flags);
4582 }
4583 EXPORT_SYMBOL(complete_all);
4584
4585 static inline long __sched
4586 do_wait_for_common(struct completion *x, long timeout, int state)
4587 {
4588         if (!x->done) {
4589                 DECLARE_WAITQUEUE(wait, current);
4590
4591                 __add_wait_queue_tail_exclusive(&x->wait, &wait);
4592                 do {
4593                         if (signal_pending_state(state, current)) {
4594                                 timeout = -ERESTARTSYS;
4595                                 break;
4596                         }
4597                         __set_current_state(state);
4598                         spin_unlock_irq(&x->wait.lock);
4599                         timeout = schedule_timeout(timeout);
4600                         spin_lock_irq(&x->wait.lock);
4601                 } while (!x->done && timeout);
4602                 __remove_wait_queue(&x->wait, &wait);
4603                 if (!x->done)
4604                         return timeout;
4605         }
4606         x->done--;
4607         return timeout ?: 1;
4608 }
4609
4610 static long __sched
4611 wait_for_common(struct completion *x, long timeout, int state)
4612 {
4613         might_sleep();
4614
4615         spin_lock_irq(&x->wait.lock);
4616         timeout = do_wait_for_common(x, timeout, state);
4617         spin_unlock_irq(&x->wait.lock);
4618         return timeout;
4619 }
4620
4621 /**
4622  * wait_for_completion: - waits for completion of a task
4623  * @x:  holds the state of this particular completion
4624  *
4625  * This waits to be signaled for completion of a specific task. It is NOT
4626  * interruptible and there is no timeout.
4627  *
4628  * See also similar routines (i.e. wait_for_completion_timeout()) with timeout
4629  * and interrupt capability. Also see complete().
4630  */
4631 void __sched wait_for_completion(struct completion *x)
4632 {
4633         wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
4634 }
4635 EXPORT_SYMBOL(wait_for_completion);
4636
4637 /**
4638  * wait_for_completion_timeout: - waits for completion of a task (w/timeout)
4639  * @x:  holds the state of this particular completion
4640  * @timeout:  timeout value in jiffies
4641  *
4642  * This waits for either a completion of a specific task to be signaled or for a
4643  * specified timeout to expire. The timeout is in jiffies. It is not
4644  * interruptible.
4645  */
4646 unsigned long __sched
4647 wait_for_completion_timeout(struct completion *x, unsigned long timeout)
4648 {
4649         return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE);
4650 }
4651 EXPORT_SYMBOL(wait_for_completion_timeout);
4652
4653 /**
4654  * wait_for_completion_interruptible: - waits for completion of a task (w/intr)
4655  * @x:  holds the state of this particular completion
4656  *
4657  * This waits for completion of a specific task to be signaled. It is
4658  * interruptible.
4659  */
4660 int __sched wait_for_completion_interruptible(struct completion *x)
4661 {
4662         long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
4663         if (t == -ERESTARTSYS)
4664                 return t;
4665         return 0;
4666 }
4667 EXPORT_SYMBOL(wait_for_completion_interruptible);
4668
4669 /**
4670  * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr))
4671  * @x:  holds the state of this particular completion
4672  * @timeout:  timeout value in jiffies
4673  *
4674  * This waits for either a completion of a specific task to be signaled or for a
4675  * specified timeout to expire. It is interruptible. The timeout is in jiffies.
4676  */
4677 long __sched
4678 wait_for_completion_interruptible_timeout(struct completion *x,
4679                                           unsigned long timeout)
4680 {
4681         return wait_for_common(x, timeout, TASK_INTERRUPTIBLE);
4682 }
4683 EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
4684
4685 /**
4686  * wait_for_completion_killable: - waits for completion of a task (killable)
4687  * @x:  holds the state of this particular completion
4688  *
4689  * This waits to be signaled for completion of a specific task. It can be
4690  * interrupted by a kill signal.
4691  */
4692 int __sched wait_for_completion_killable(struct completion *x)
4693 {
4694         long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
4695         if (t == -ERESTARTSYS)
4696                 return t;
4697         return 0;
4698 }
4699 EXPORT_SYMBOL(wait_for_completion_killable);
4700
4701 /**
4702  * wait_for_completion_killable_timeout: - waits for completion of a task (w/(to,killable))
4703  * @x:  holds the state of this particular completion
4704  * @timeout:  timeout value in jiffies
4705  *
4706  * This waits for either a completion of a specific task to be
4707  * signaled or for a specified timeout to expire. It can be
4708  * interrupted by a kill signal. The timeout is in jiffies.
4709  */
4710 long __sched
4711 wait_for_completion_killable_timeout(struct completion *x,
4712                                      unsigned long timeout)
4713 {
4714         return wait_for_common(x, timeout, TASK_KILLABLE);
4715 }
4716 EXPORT_SYMBOL(wait_for_completion_killable_timeout);
4717
4718 /**
4719  *      try_wait_for_completion - try to decrement a completion without blocking
4720  *      @x:     completion structure
4721  *
4722  *      Returns: 0 if a decrement cannot be done without blocking
4723  *               1 if a decrement succeeded.
4724  *
4725  *      If a completion is being used as a counting completion,
4726  *      attempt to decrement the counter without blocking. This
4727  *      enables us to avoid waiting if the resource the completion
4728  *      is protecting is not available.
4729  */
4730 bool try_wait_for_completion(struct completion *x)
4731 {
4732         unsigned long flags;
4733         int ret = 1;
4734
4735         spin_lock_irqsave(&x->wait.lock, flags);
4736         if (!x->done)
4737                 ret = 0;
4738         else
4739                 x->done--;
4740         spin_unlock_irqrestore(&x->wait.lock, flags);
4741         return ret;
4742 }
4743 EXPORT_SYMBOL(try_wait_for_completion);
4744
4745 /**
4746  *      completion_done - Test to see if a completion has any waiters
4747  *      @x:     completion structure
4748  *
4749  *      Returns: 0 if there are waiters (wait_for_completion() in progress)
4750  *               1 if there are no waiters.
4751  *
4752  */
4753 bool completion_done(struct completion *x)
4754 {
4755         unsigned long flags;
4756         int ret = 1;
4757
4758         spin_lock_irqsave(&x->wait.lock, flags);
4759         if (!x->done)
4760                 ret = 0;
4761         spin_unlock_irqrestore(&x->wait.lock, flags);
4762         return ret;
4763 }
4764 EXPORT_SYMBOL(completion_done);
4765
4766 static long __sched
4767 sleep_on_common(wait_queue_head_t *q, int state, long timeout)
4768 {
4769         unsigned long flags;
4770         wait_queue_t wait;
4771
4772         init_waitqueue_entry(&wait, current);
4773
4774         __set_current_state(state);
4775
4776         spin_lock_irqsave(&q->lock, flags);
4777         __add_wait_queue(q, &wait);
4778         spin_unlock(&q->lock);
4779         timeout = schedule_timeout(timeout);
4780         spin_lock_irq(&q->lock);
4781         __remove_wait_queue(q, &wait);
4782         spin_unlock_irqrestore(&q->lock, flags);
4783
4784         return timeout;
4785 }
4786
4787 void __sched interruptible_sleep_on(wait_queue_head_t *q)
4788 {
4789         sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
4790 }
4791 EXPORT_SYMBOL(interruptible_sleep_on);
4792
4793 long __sched
4794 interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
4795 {
4796         return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout);
4797 }
4798 EXPORT_SYMBOL(interruptible_sleep_on_timeout);
4799
4800 void __sched sleep_on(wait_queue_head_t *q)
4801 {
4802         sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
4803 }
4804 EXPORT_SYMBOL(sleep_on);
4805
4806 long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
4807 {
4808         return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout);
4809 }
4810 EXPORT_SYMBOL(sleep_on_timeout);
4811
4812 #ifdef CONFIG_RT_MUTEXES
4813
4814 /*
4815  * rt_mutex_setprio - set the current priority of a task
4816  * @p: task
4817  * @prio: prio value (kernel-internal form)
4818  *
4819  * This function changes the 'effective' priority of a task. It does
4820  * not touch ->normal_prio like __setscheduler().
4821  *
4822  * Used by the rt_mutex code to implement priority inheritance logic.
4823  */
4824 void rt_mutex_setprio(struct task_struct *p, int prio)
4825 {
4826         int oldprio, on_rq, running;
4827         struct rq *rq;
4828         const struct sched_class *prev_class;
4829
4830         BUG_ON(prio < 0 || prio > MAX_PRIO);
4831
4832         rq = __task_rq_lock(p);
4833
4834         trace_sched_pi_setprio(p, prio);
4835         oldprio = p->prio;
4836         prev_class = p->sched_class;
4837         on_rq = p->on_rq;
4838         running = task_current(rq, p);
4839         if (on_rq)
4840                 dequeue_task(rq, p, 0);
4841         if (running)
4842                 p->sched_class->put_prev_task(rq, p);
4843
4844         if (rt_prio(prio))
4845                 p->sched_class = &rt_sched_class;
4846         else
4847                 p->sched_class = &fair_sched_class;
4848
4849         p->prio = prio;
4850
4851         if (running)
4852                 p->sched_class->set_curr_task(rq);
4853         if (on_rq)
4854                 enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0);
4855
4856         check_class_changed(rq, p, prev_class, oldprio);
4857         __task_rq_unlock(rq);
4858 }
4859
4860 #endif
4861
4862 void set_user_nice(struct task_struct *p, long nice)
4863 {
4864         int old_prio, delta, on_rq;
4865         unsigned long flags;
4866         struct rq *rq;
4867
4868         if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
4869                 return;
4870         /*
4871          * We have to be careful, if called from sys_setpriority(),
4872          * the task might be in the middle of scheduling on another CPU.
4873          */
4874         rq = task_rq_lock(p, &flags);
4875         /*
4876          * The RT priorities are set via sched_setscheduler(), but we still
4877          * allow the 'normal' nice value to be set - but as expected
4878          * it wont have any effect on scheduling until the task is
4879          * SCHED_FIFO/SCHED_RR:
4880          */
4881         if (task_has_rt_policy(p)) {
4882                 p->static_prio = NICE_TO_PRIO(nice);
4883                 goto out_unlock;
4884         }
4885         on_rq = p->on_rq;
4886         if (on_rq)
4887                 dequeue_task(rq, p, 0);
4888
4889         p->static_prio = NICE_TO_PRIO(nice);
4890         set_load_weight(p);
4891         old_prio = p->prio;
4892         p->prio = effective_prio(p);
4893         delta = p->prio - old_prio;
4894
4895         if (on_rq) {
4896                 enqueue_task(rq, p, 0);
4897                 /*
4898                  * If the task increased its priority or is running and
4899                  * lowered its priority, then reschedule its CPU:
4900                  */
4901                 if (delta < 0 || (delta > 0 && task_running(rq, p)))
4902                         resched_task(rq->curr);
4903         }
4904 out_unlock:
4905         task_rq_unlock(rq, p, &flags);
4906 }
4907 EXPORT_SYMBOL(set_user_nice);
4908
4909 /*
4910  * can_nice - check if a task can reduce its nice value
4911  * @p: task
4912  * @nice: nice value
4913  */
4914 int can_nice(const struct task_struct *p, const int nice)
4915 {
4916         /* convert nice value [19,-20] to rlimit style value [1,40] */
4917         int nice_rlim = 20 - nice;
4918
4919         return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
4920                 capable(CAP_SYS_NICE));
4921 }
4922
4923 #ifdef __ARCH_WANT_SYS_NICE
4924
4925 /*
4926  * sys_nice - change the priority of the current process.
4927  * @increment: priority increment
4928  *
4929  * sys_setpriority is a more generic, but much slower function that
4930  * does similar things.
4931  */
4932 SYSCALL_DEFINE1(nice, int, increment)
4933 {
4934         long nice, retval;
4935
4936         /*
4937          * Setpriority might change our priority at the same moment.
4938          * We don't have to worry. Conceptually one call occurs first
4939          * and we have a single winner.
4940          */
4941         if (increment < -40)
4942                 increment = -40;
4943         if (increment > 40)
4944                 increment = 40;
4945
4946         nice = TASK_NICE(current) + increment;
4947         if (nice < -20)
4948                 nice = -20;
4949         if (nice > 19)
4950                 nice = 19;
4951
4952         if (increment < 0 && !can_nice(current, nice))
4953                 return -EPERM;
4954
4955         retval = security_task_setnice(current, nice);
4956         if (retval)
4957                 return retval;
4958
4959         set_user_nice(current, nice);
4960         return 0;
4961 }
4962
4963 #endif
4964
4965 /**
4966  * task_prio - return the priority value of a given task.
4967  * @p: the task in question.
4968  *
4969  * This is the priority value as seen by users in /proc.
4970  * RT tasks are offset by -200. Normal tasks are centered
4971  * around 0, value goes from -16 to +15.
4972  */
4973 int task_prio(const struct task_struct *p)
4974 {
4975         return p->prio - MAX_RT_PRIO;
4976 }
4977
4978 /**
4979  * task_nice - return the nice value of a given task.
4980  * @p: the task in question.
4981  */
4982 int task_nice(const struct task_struct *p)
4983 {
4984         return TASK_NICE(p);
4985 }
4986 EXPORT_SYMBOL(task_nice);
4987
4988 /**
4989  * idle_cpu - is a given cpu idle currently?
4990  * @cpu: the processor in question.
4991  */
4992 int idle_cpu(int cpu)
4993 {
4994         return cpu_curr(cpu) == cpu_rq(cpu)->idle;
4995 }
4996
4997 /**
4998  * idle_task - return the idle task for a given cpu.
4999  * @cpu: the processor in question.
5000  */
5001 struct task_struct *idle_task(int cpu)
5002 {
5003         return cpu_rq(cpu)->idle;
5004 }
5005
5006 /**
5007  * find_process_by_pid - find a process with a matching PID value.
5008  * @pid: the pid in question.
5009  */
5010 static struct task_struct *find_process_by_pid(pid_t pid)
5011 {
5012         return pid ? find_task_by_vpid(pid) : current;
5013 }
5014
5015 /* Actually do priority change: must hold rq lock. */
5016 static void
5017 __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
5018 {
5019         p->policy = policy;
5020         p->rt_priority = prio;
5021         p->normal_prio = normal_prio(p);
5022         /* we are holding p->pi_lock already */
5023         p->prio = rt_mutex_getprio(p);
5024         if (rt_prio(p->prio))
5025                 p->sched_class = &rt_sched_class;
5026         else
5027                 p->sched_class = &fair_sched_class;
5028         set_load_weight(p);
5029 }
5030
5031 /*
5032  * check the target process has a UID that matches the current process's
5033  */
5034 static bool check_same_owner(struct task_struct *p)
5035 {
5036         const struct cred *cred = current_cred(), *pcred;
5037         bool match;
5038
5039         rcu_read_lock();
5040         pcred = __task_cred(p);
5041         if (cred->user->user_ns == pcred->user->user_ns)
5042                 match = (cred->euid == pcred->euid ||
5043                          cred->euid == pcred->uid);
5044         else
5045                 match = false;
5046         rcu_read_unlock();
5047         return match;
5048 }
5049
5050 static int __sched_setscheduler(struct task_struct *p, int policy,
5051                                 const struct sched_param *param, bool user)
5052 {
5053         int retval, oldprio, oldpolicy = -1, on_rq, running;
5054         unsigned long flags;
5055         const struct sched_class *prev_class;
5056         struct rq *rq;
5057         int reset_on_fork;
5058
5059         /* may grab non-irq protected spin_locks */
5060         BUG_ON(in_interrupt());
5061 recheck:
5062         /* double check policy once rq lock held */
5063         if (policy < 0) {
5064                 reset_on_fork = p->sched_reset_on_fork;
5065                 policy = oldpolicy = p->policy;
5066         } else {
5067                 reset_on_fork = !!(policy & SCHED_RESET_ON_FORK);
5068                 policy &= ~SCHED_RESET_ON_FORK;
5069
5070                 if (policy != SCHED_FIFO && policy != SCHED_RR &&
5071                                 policy != SCHED_NORMAL && policy != SCHED_BATCH &&
5072                                 policy != SCHED_IDLE)
5073                         return -EINVAL;
5074         }
5075
5076         /*
5077          * Valid priorities for SCHED_FIFO and SCHED_RR are
5078          * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
5079          * SCHED_BATCH and SCHED_IDLE is 0.
5080          */
5081         if (param->sched_priority < 0 ||
5082             (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
5083             (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
5084                 return -EINVAL;
5085         if (rt_policy(policy) != (param->sched_priority != 0))
5086                 return -EINVAL;
5087
5088         /*
5089          * Allow unprivileged RT tasks to decrease priority:
5090          */
5091         if (user && !capable(CAP_SYS_NICE)) {
5092                 if (rt_policy(policy)) {
5093                         unsigned long rlim_rtprio =
5094                                         task_rlimit(p, RLIMIT_RTPRIO);
5095
5096                         /* can't set/change the rt policy */
5097                         if (policy != p->policy && !rlim_rtprio)
5098                                 return -EPERM;
5099
5100                         /* can't increase priority */
5101                         if (param->sched_priority > p->rt_priority &&
5102                             param->sched_priority > rlim_rtprio)
5103                                 return -EPERM;
5104                 }
5105
5106                 /*
5107                  * Treat SCHED_IDLE as nice 20. Only allow a switch to
5108                  * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
5109                  */
5110                 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) {
5111                         if (!can_nice(p, TASK_NICE(p)))
5112                                 return -EPERM;
5113                 }
5114
5115                 /* can't change other user's priorities */
5116                 if (!check_same_owner(p))
5117                         return -EPERM;
5118
5119                 /* Normal users shall not reset the sched_reset_on_fork flag */
5120                 if (p->sched_reset_on_fork && !reset_on_fork)
5121                         return -EPERM;
5122         }
5123
5124         if (user) {
5125                 retval = security_task_setscheduler(p);
5126                 if (retval)
5127                         return retval;
5128         }
5129
5130         /*
5131          * make sure no PI-waiters arrive (or leave) while we are
5132          * changing the priority of the task:
5133          *
5134          * To be able to change p->policy safely, the appropriate
5135          * runqueue lock must be held.
5136          */
5137         rq = task_rq_lock(p, &flags);
5138
5139         /*
5140          * Changing the policy of the stop threads its a very bad idea
5141          */
5142         if (p == rq->stop) {
5143                 task_rq_unlock(rq, p, &flags);
5144                 return -EINVAL;
5145         }
5146
5147         /*
5148          * If not changing anything there's no need to proceed further:
5149          */
5150         if (unlikely(policy == p->policy && (!rt_policy(policy) ||
5151                         param->sched_priority == p->rt_priority))) {
5152
5153                 __task_rq_unlock(rq);
5154                 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
5155                 return 0;
5156         }
5157
5158 #ifdef CONFIG_RT_GROUP_SCHED
5159         if (user) {
5160                 /*
5161                  * Do not allow realtime tasks into groups that have no runtime
5162                  * assigned.
5163                  */
5164                 if (rt_bandwidth_enabled() && rt_policy(policy) &&
5165                                 task_group(p)->rt_bandwidth.rt_runtime == 0 &&
5166                                 !task_group_is_autogroup(task_group(p))) {
5167                         task_rq_unlock(rq, p, &flags);
5168                         return -EPERM;
5169                 }
5170         }
5171 #endif
5172
5173         /* recheck policy now with rq lock held */
5174         if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
5175                 policy = oldpolicy = -1;
5176                 task_rq_unlock(rq, p, &flags);
5177                 goto recheck;
5178         }
5179         on_rq = p->on_rq;
5180         running = task_current(rq, p);
5181         if (on_rq)
5182                 deactivate_task(rq, p, 0);
5183         if (running)
5184                 p->sched_class->put_prev_task(rq, p);
5185
5186         p->sched_reset_on_fork = reset_on_fork;
5187
5188         oldprio = p->prio;
5189         prev_class = p->sched_class;
5190         __setscheduler(rq, p, policy, param->sched_priority);
5191
5192         if (running)
5193                 p->sched_class->set_curr_task(rq);
5194         if (on_rq)
5195                 activate_task(rq, p, 0);
5196
5197         check_class_changed(rq, p, prev_class, oldprio);
5198         task_rq_unlock(rq, p, &flags);
5199
5200         rt_mutex_adjust_pi(p);
5201
5202         return 0;
5203 }
5204
5205 /**
5206  * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
5207  * @p: the task in question.
5208  * @policy: new policy.
5209  * @param: structure containing the new RT priority.
5210  *
5211  * NOTE that the task may be already dead.
5212  */
5213 int sched_setscheduler(struct task_struct *p, int policy,
5214                        const struct sched_param *param)
5215 {
5216         return __sched_setscheduler(p, policy, param, true);
5217 }
5218 EXPORT_SYMBOL_GPL(sched_setscheduler);
5219
5220 /**
5221  * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
5222  * @p: the task in question.
5223  * @policy: new policy.
5224  * @param: structure containing the new RT priority.
5225  *
5226  * Just like sched_setscheduler, only don't bother checking if the
5227  * current context has permission.  For example, this is needed in
5228  * stop_machine(): we create temporary high priority worker threads,
5229  * but our caller might not have that capability.
5230  */
5231 int sched_setscheduler_nocheck(struct task_struct *p, int policy,
5232                                const struct sched_param *param)
5233 {
5234         return __sched_setscheduler(p, policy, param, false);
5235 }
5236
5237 static int
5238 do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
5239 {
5240         struct sched_param lparam;
5241         struct task_struct *p;
5242         int retval;
5243
5244         if (!param || pid < 0)
5245                 return -EINVAL;
5246         if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
5247                 return -EFAULT;
5248
5249         rcu_read_lock();
5250         retval = -ESRCH;
5251         p = find_process_by_pid(pid);
5252         if (p != NULL)
5253                 retval = sched_setscheduler(p, policy, &lparam);
5254         rcu_read_unlock();
5255
5256         return retval;
5257 }
5258
5259 /**
5260  * sys_sched_setscheduler - set/change the scheduler policy and RT priority
5261  * @pid: the pid in question.
5262  * @policy: new policy.
5263  * @param: structure containing the new RT priority.
5264  */
5265 SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,
5266                 struct sched_param __user *, param)
5267 {
5268         /* negative values for policy are not valid */
5269         if (policy < 0)
5270                 return -EINVAL;
5271
5272         return do_sched_setscheduler(pid, policy, param);
5273 }
5274
5275 /**
5276  * sys_sched_setparam - set/change the RT priority of a thread
5277  * @pid: the pid in question.
5278  * @param: structure containing the new RT priority.
5279  */
5280 SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
5281 {
5282         return do_sched_setscheduler(pid, -1, param);
5283 }
5284
5285 /**
5286  * sys_sched_getscheduler - get the policy (scheduling class) of a thread
5287  * @pid: the pid in question.
5288  */
5289 SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
5290 {
5291         struct task_struct *p;
5292         int retval;
5293
5294         if (pid < 0)
5295                 return -EINVAL;
5296
5297         retval = -ESRCH;
5298         rcu_read_lock();
5299         p = find_process_by_pid(pid);
5300         if (p) {
5301                 retval = security_task_getscheduler(p);
5302                 if (!retval)
5303                         retval = p->policy
5304                                 | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);
5305         }
5306         rcu_read_unlock();
5307         return retval;
5308 }
5309
5310 /**
5311  * sys_sched_getparam - get the RT priority of a thread
5312  * @pid: the pid in question.
5313  * @param: structure containing the RT priority.
5314  */
5315 SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
5316 {
5317         struct sched_param lp;
5318         struct task_struct *p;
5319         int retval;
5320
5321         if (!param || pid < 0)
5322                 return -EINVAL;
5323
5324         rcu_read_lock();
5325         p = find_process_by_pid(pid);
5326         retval = -ESRCH;
5327         if (!p)
5328                 goto out_unlock;
5329
5330         retval = security_task_getscheduler(p);
5331         if (retval)
5332                 goto out_unlock;
5333
5334         lp.sched_priority = p->rt_priority;
5335         rcu_read_unlock();
5336
5337         /*
5338          * This one might sleep, we cannot do it with a spinlock held ...
5339          */
5340         retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
5341
5342         return retval;
5343
5344 out_unlock:
5345         rcu_read_unlock();
5346         return retval;
5347 }
5348
5349 long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
5350 {
5351         cpumask_var_t cpus_allowed, new_mask;
5352         struct task_struct *p;
5353         int retval;
5354
5355         get_online_cpus();
5356         rcu_read_lock();
5357
5358         p = find_process_by_pid(pid);
5359         if (!p) {
5360                 rcu_read_unlock();
5361                 put_online_cpus();
5362                 return -ESRCH;
5363         }
5364
5365         /* Prevent p going away */
5366         get_task_struct(p);
5367         rcu_read_unlock();
5368
5369         if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
5370                 retval = -ENOMEM;
5371                 goto out_put_task;
5372         }
5373         if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
5374                 retval = -ENOMEM;
5375                 goto out_free_cpus_allowed;
5376         }
5377         retval = -EPERM;
5378         if (!check_same_owner(p) && !task_ns_capable(p, CAP_SYS_NICE))
5379                 goto out_unlock;
5380
5381         retval = security_task_setscheduler(p);
5382         if (retval)
5383                 goto out_unlock;
5384
5385         cpuset_cpus_allowed(p, cpus_allowed);
5386         cpumask_and(new_mask, in_mask, cpus_allowed);
5387 again:
5388         retval = set_cpus_allowed_ptr(p, new_mask);
5389
5390         if (!retval) {
5391                 cpuset_cpus_allowed(p, cpus_allowed);
5392                 if (!cpumask_subset(new_mask, cpus_allowed)) {
5393                         /*
5394                          * We must have raced with a concurrent cpuset
5395                          * update. Just reset the cpus_allowed to the
5396                          * cpuset's cpus_allowed
5397                          */
5398                         cpumask_copy(new_mask, cpus_allowed);
5399                         goto again;
5400                 }
5401         }
5402 out_unlock:
5403         free_cpumask_var(new_mask);
5404 out_free_cpus_allowed:
5405         free_cpumask_var(cpus_allowed);
5406 out_put_task:
5407         put_task_struct(p);
5408         put_online_cpus();
5409         return retval;
5410 }
5411
5412 static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
5413                              struct cpumask *new_mask)
5414 {
5415         if (len < cpumask_size())
5416                 cpumask_clear(new_mask);
5417         else if (len > cpumask_size())
5418                 len = cpumask_size();
5419
5420         return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
5421 }
5422
5423 /**
5424  * sys_sched_setaffinity - set the cpu affinity of a process
5425  * @pid: pid of the process
5426  * @len: length in bytes of the bitmask pointed to by user_mask_ptr
5427  * @user_mask_ptr: user-space pointer to the new cpu mask
5428  */
5429 SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
5430                 unsigned long __user *, user_mask_ptr)
5431 {
5432         cpumask_var_t new_mask;
5433         int retval;
5434
5435         if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
5436                 return -ENOMEM;
5437
5438         retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);
5439         if (retval == 0)
5440                 retval = sched_setaffinity(pid, new_mask);
5441         free_cpumask_var(new_mask);
5442         return retval;
5443 }
5444
5445 long sched_getaffinity(pid_t pid, struct cpumask *mask)
5446 {
5447         struct task_struct *p;
5448         unsigned long flags;
5449         int retval;
5450
5451         get_online_cpus();
5452         rcu_read_lock();
5453
5454         retval = -ESRCH;
5455         p = find_process_by_pid(pid);
5456         if (!p)
5457                 goto out_unlock;
5458
5459         retval = security_task_getscheduler(p);
5460         if (retval)
5461                 goto out_unlock;
5462
5463         raw_spin_lock_irqsave(&p->pi_lock, flags);
5464         cpumask_and(mask, &p->cpus_allowed, cpu_online_mask);
5465         raw_spin_unlock_irqrestore(&p->pi_lock, flags);
5466
5467 out_unlock:
5468         rcu_read_unlock();
5469         put_online_cpus();
5470
5471         return retval;
5472 }
5473
5474 /**
5475  * sys_sched_getaffinity - get the cpu affinity of a process
5476  * @pid: pid of the process
5477  * @len: length in bytes of the bitmask pointed to by user_mask_ptr
5478  * @user_mask_ptr: user-space pointer to hold the current cpu mask
5479  */
5480 SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
5481                 unsigned long __user *, user_mask_ptr)
5482 {
5483         int ret;
5484         cpumask_var_t mask;
5485
5486         if ((len * BITS_PER_BYTE) < nr_cpu_ids)
5487                 return -EINVAL;
5488         if (len & (sizeof(unsigned long)-1))
5489                 return -EINVAL;
5490
5491         if (!alloc_cpumask_var(&mask, GFP_KERNEL))
5492                 return -ENOMEM;
5493
5494         ret = sched_getaffinity(pid, mask);
5495         if (ret == 0) {
5496                 size_t retlen = min_t(size_t, len, cpumask_size());
5497
5498                 if (copy_to_user(user_mask_ptr, mask, retlen))
5499                         ret = -EFAULT;
5500                 else
5501                         ret = retlen;
5502         }
5503         free_cpumask_var(mask);
5504
5505         return ret;
5506 }
5507
5508 /**
5509  * sys_sched_yield - yield the current processor to other threads.
5510  *
5511  * This function yields the current CPU to other tasks. If there are no
5512  * other threads running on this CPU then this function will return.
5513  */
5514 SYSCALL_DEFINE0(sched_yield)
5515 {
5516         struct rq *rq = this_rq_lock();
5517
5518         schedstat_inc(rq, yld_count);
5519         current->sched_class->yield_task(rq);
5520
5521         /*
5522          * Since we are going to call schedule() anyway, there's
5523          * no need to preempt or enable interrupts:
5524          */
5525         __release(rq->lock);
5526         spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
5527         do_raw_spin_unlock(&rq->lock);
5528         preempt_enable_no_resched();
5529
5530         schedule();
5531
5532         return 0;
5533 }
5534
5535 static inline int should_resched(void)
5536 {
5537         return need_resched() && !(preempt_count() & PREEMPT_ACTIVE);
5538 }
5539
5540 static void __cond_resched(void)
5541 {
5542         add_preempt_count(PREEMPT_ACTIVE);
5543         __schedule();
5544         sub_preempt_count(PREEMPT_ACTIVE);
5545 }
5546
5547 int __sched _cond_resched(void)
5548 {
5549         if (should_resched()) {
5550                 __cond_resched();
5551                 return 1;
5552         }
5553         return 0;
5554 }
5555 EXPORT_SYMBOL(_cond_resched);
5556
5557 /*
5558  * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
5559  * call schedule, and on return reacquire the lock.
5560  *
5561  * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
5562  * operations here to prevent schedule() from being called twice (once via
5563  * spin_unlock(), once by hand).
5564  */
5565 int __cond_resched_lock(spinlock_t *lock)
5566 {
5567         int resched = should_resched();
5568         int ret = 0;
5569
5570         lockdep_assert_held(lock);
5571
5572         if (spin_needbreak(lock) || resched) {
5573                 spin_unlock(lock);
5574                 if (resched)
5575                         __cond_resched();
5576                 else
5577                         cpu_relax();
5578                 ret = 1;
5579                 spin_lock(lock);
5580         }
5581         return ret;
5582 }
5583 EXPORT_SYMBOL(__cond_resched_lock);
5584
5585 int __sched __cond_resched_softirq(void)
5586 {
5587         BUG_ON(!in_softirq());
5588
5589         if (should_resched()) {
5590                 local_bh_enable();
5591                 __cond_resched();
5592                 local_bh_disable();
5593                 return 1;
5594         }
5595         return 0;
5596 }
5597 EXPORT_SYMBOL(__cond_resched_softirq);
5598
5599 /**
5600  * yield - yield the current processor to other threads.
5601  *
5602  * This is a shortcut for kernel-space yielding - it marks the
5603  * thread runnable and calls sys_sched_yield().
5604  */
5605 void __sched yield(void)
5606 {
5607         set_current_state(TASK_RUNNING);
5608         sys_sched_yield();
5609 }
5610 EXPORT_SYMBOL(yield);
5611
5612 /**
5613  * yield_to - yield the current processor to another thread in
5614  * your thread group, or accelerate that thread toward the
5615  * processor it's on.
5616  * @p: target task
5617  * @preempt: whether task preemption is allowed or not
5618  *
5619  * It's the caller's job to ensure that the target task struct
5620  * can't go away on us before we can do any checks.
5621  *
5622  * Returns true if we indeed boosted the target task.
5623  */
5624 bool __sched yield_to(struct task_struct *p, bool preempt)
5625 {
5626         struct task_struct *curr = current;
5627         struct rq *rq, *p_rq;
5628         unsigned long flags;
5629         bool yielded = 0;
5630
5631         local_irq_save(flags);
5632         rq = this_rq();
5633
5634 again:
5635         p_rq = task_rq(p);
5636         double_rq_lock(rq, p_rq);
5637         while (task_rq(p) != p_rq) {
5638                 double_rq_unlock(rq, p_rq);
5639                 goto again;
5640         }
5641
5642         if (!curr->sched_class->yield_to_task)
5643                 goto out;
5644
5645         if (curr->sched_class != p->sched_class)
5646                 goto out;
5647
5648         if (task_running(p_rq, p) || p->state)
5649                 goto out;
5650
5651         yielded = curr->sched_class->yield_to_task(rq, p, preempt);
5652         if (yielded) {
5653                 schedstat_inc(rq, yld_count);
5654                 /*
5655                  * Make p's CPU reschedule; pick_next_entity takes care of
5656                  * fairness.
5657                  */
5658                 if (preempt && rq != p_rq)
5659                         resched_task(p_rq->curr);
5660         }
5661
5662 out:
5663         double_rq_unlock(rq, p_rq);
5664         local_irq_restore(flags);
5665
5666         if (yielded)
5667                 schedule();
5668
5669         return yielded;
5670 }
5671 EXPORT_SYMBOL_GPL(yield_to);
5672
5673 /*
5674  * This task is about to go to sleep on IO. Increment rq->nr_iowait so
5675  * that process accounting knows that this is a task in IO wait state.
5676  */
5677 void __sched io_schedule(void)
5678 {
5679         struct rq *rq = raw_rq();
5680
5681         delayacct_blkio_start();
5682         atomic_inc(&rq->nr_iowait);
5683         blk_flush_plug(current);
5684         current->in_iowait = 1;
5685         schedule();
5686         current->in_iowait = 0;
5687         atomic_dec(&rq->nr_iowait);
5688         delayacct_blkio_end();
5689 }
5690 EXPORT_SYMBOL(io_schedule);
5691
5692 long __sched io_schedule_timeout(long timeout)
5693 {
5694         struct rq *rq = raw_rq();
5695         long ret;
5696
5697         delayacct_blkio_start();
5698         atomic_inc(&rq->nr_iowait);
5699         blk_flush_plug(current);
5700         current->in_iowait = 1;
5701         ret = schedule_timeout(timeout);
5702         current->in_iowait = 0;
5703         atomic_dec(&rq->nr_iowait);
5704         delayacct_blkio_end();
5705         return ret;
5706 }
5707
5708 /**
5709  * sys_sched_get_priority_max - return maximum RT priority.
5710  * @policy: scheduling class.
5711  *
5712  * this syscall returns the maximum rt_priority that can be used
5713  * by a given scheduling class.
5714  */
5715 SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
5716 {
5717         int ret = -EINVAL;
5718
5719         switch (policy) {
5720         case SCHED_FIFO:
5721         case SCHED_RR:
5722                 ret = MAX_USER_RT_PRIO-1;
5723                 break;
5724         case SCHED_NORMAL:
5725         case SCHED_BATCH:
5726         case SCHED_IDLE:
5727                 ret = 0;
5728                 break;
5729         }
5730         return ret;
5731 }
5732
5733 /**
5734  * sys_sched_get_priority_min - return minimum RT priority.
5735  * @policy: scheduling class.
5736  *
5737  * this syscall returns the minimum rt_priority that can be used
5738  * by a given scheduling class.
5739  */
5740 SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
5741 {
5742         int ret = -EINVAL;
5743
5744         switch (policy) {
5745         case SCHED_FIFO:
5746         case SCHED_RR:
5747                 ret = 1;
5748                 break;
5749         case SCHED_NORMAL:
5750         case SCHED_BATCH:
5751         case SCHED_IDLE:
5752                 ret = 0;
5753         }
5754         return ret;
5755 }
5756
5757 /**
5758  * sys_sched_rr_get_interval - return the default timeslice of a process.
5759  * @pid: pid of the process.
5760  * @interval: userspace pointer to the timeslice value.
5761  *
5762  * this syscall writes the default timeslice value of a given process
5763  * into the user-space timespec buffer. A value of '0' means infinity.
5764  */
5765 SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
5766                 struct timespec __user *, interval)
5767 {
5768         struct task_struct *p;
5769         unsigned int time_slice;
5770         unsigned long flags;
5771         struct rq *rq;
5772         int retval;
5773         struct timespec t;
5774
5775         if (pid < 0)
5776                 return -EINVAL;
5777
5778         retval = -ESRCH;
5779         rcu_read_lock();
5780         p = find_process_by_pid(pid);
5781         if (!p)
5782                 goto out_unlock;
5783
5784         retval = security_task_getscheduler(p);
5785         if (retval)
5786                 goto out_unlock;
5787
5788         rq = task_rq_lock(p, &flags);
5789         time_slice = p->sched_class->get_rr_interval(rq, p);
5790         task_rq_unlock(rq, p, &flags);
5791
5792         rcu_read_unlock();
5793         jiffies_to_timespec(time_slice, &t);
5794         retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
5795         return retval;
5796
5797 out_unlock:
5798         rcu_read_unlock();
5799         return retval;
5800 }
5801
5802 static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
5803
5804 void sched_show_task(struct task_struct *p)
5805 {
5806         unsigned long free = 0;
5807         unsigned state;
5808
5809         state = p->state ? __ffs(p->state) + 1 : 0;
5810         printk(KERN_INFO "%-15.15s %c", p->comm,
5811                 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
5812 #if BITS_PER_LONG == 32
5813         if (state == TASK_RUNNING)
5814                 printk(KERN_CONT " running  ");
5815         else
5816                 printk(KERN_CONT " %08lx ", thread_saved_pc(p));
5817 #else
5818         if (state == TASK_RUNNING)
5819                 printk(KERN_CONT "  running task    ");
5820         else
5821                 printk(KERN_CONT " %016lx ", thread_saved_pc(p));
5822 #endif
5823 #ifdef CONFIG_DEBUG_STACK_USAGE
5824         free = stack_not_used(p);
5825 #endif
5826         printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
5827                 task_pid_nr(p), task_pid_nr(p->real_parent),
5828                 (unsigned long)task_thread_info(p)->flags);
5829
5830         show_stack(p, NULL);
5831 }
5832
5833 void show_state_filter(unsigned long state_filter)
5834 {
5835         struct task_struct *g, *p;
5836
5837 #if BITS_PER_LONG == 32
5838         printk(KERN_INFO
5839                 "  task                PC stack   pid father\n");
5840 #else
5841         printk(KERN_INFO
5842                 "  task                        PC stack   pid father\n");
5843 #endif
5844         read_lock(&tasklist_lock);
5845         do_each_thread(g, p) {
5846                 /*
5847                  * reset the NMI-timeout, listing all files on a slow
5848                  * console might take a lot of time:
5849                  */
5850                 touch_nmi_watchdog();
5851                 if (!state_filter || (p->state & state_filter))
5852                         sched_show_task(p);
5853         } while_each_thread(g, p);
5854
5855         touch_all_softlockup_watchdogs();
5856
5857 #ifdef CONFIG_SCHED_DEBUG
5858         sysrq_sched_debug_show();
5859 #endif
5860         read_unlock(&tasklist_lock);
5861         /*
5862          * Only show locks if all tasks are dumped:
5863          */
5864         if (!state_filter)
5865                 debug_show_all_locks();
5866 }
5867
5868 void __cpuinit init_idle_bootup_task(struct task_struct *idle)
5869 {
5870         idle->sched_class = &idle_sched_class;
5871 }
5872
5873 /**
5874  * init_idle - set up an idle thread for a given CPU
5875  * @idle: task in question
5876  * @cpu: cpu the idle task belongs to
5877  *
5878  * NOTE: this function does not set the idle thread's NEED_RESCHED
5879  * flag, to make booting more robust.
5880  */
5881 void __cpuinit init_idle(struct task_struct *idle, int cpu)
5882 {
5883         struct rq *rq = cpu_rq(cpu);
5884         unsigned long flags;
5885
5886         raw_spin_lock_irqsave(&rq->lock, flags);
5887
5888         __sched_fork(idle);
5889         idle->state = TASK_RUNNING;
5890         idle->se.exec_start = sched_clock();
5891
5892         do_set_cpus_allowed(idle, cpumask_of(cpu));
5893         /*
5894          * We're having a chicken and egg problem, even though we are
5895          * holding rq->lock, the cpu isn't yet set to this cpu so the
5896          * lockdep check in task_group() will fail.
5897          *
5898          * Similar case to sched_fork(). / Alternatively we could
5899          * use task_rq_lock() here and obtain the other rq->lock.
5900          *
5901          * Silence PROVE_RCU
5902          */
5903         rcu_read_lock();
5904         __set_task_cpu(idle, cpu);
5905         rcu_read_unlock();
5906
5907         rq->curr = rq->idle = idle;
5908 #if defined(CONFIG_SMP)
5909         idle->on_cpu = 1;
5910 #endif
5911         raw_spin_unlock_irqrestore(&rq->lock, flags);
5912
5913         /* Set the preempt count _outside_ the spinlocks! */
5914         task_thread_info(idle)->preempt_count = 0;
5915
5916         /*
5917          * The idle tasks have their own, simple scheduling class:
5918          */
5919         idle->sched_class = &idle_sched_class;
5920         ftrace_graph_init_idle_task(idle, cpu);
5921 }
5922
5923 /*
5924  * In a system that switches off the HZ timer nohz_cpu_mask
5925  * indicates which cpus entered this state. This is used
5926  * in the rcu update to wait only for active cpus. For system
5927  * which do not switch off the HZ timer nohz_cpu_mask should
5928  * always be CPU_BITS_NONE.
5929  */
5930 cpumask_var_t nohz_cpu_mask;
5931
5932 /*
5933  * Increase the granularity value when there are more CPUs,
5934  * because with more CPUs the 'effective latency' as visible
5935  * to users decreases. But the relationship is not linear,
5936  * so pick a second-best guess by going with the log2 of the
5937  * number of CPUs.
5938  *
5939  * This idea comes from the SD scheduler of Con Kolivas:
5940  */
5941 static int get_update_sysctl_factor(void)
5942 {
5943         unsigned int cpus = min_t(int, num_online_cpus(), 8);
5944         unsigned int factor;
5945
5946         switch (sysctl_sched_tunable_scaling) {
5947         case SCHED_TUNABLESCALING_NONE:
5948                 factor = 1;
5949                 break;
5950         case SCHED_TUNABLESCALING_LINEAR:
5951                 factor = cpus;
5952                 break;
5953         case SCHED_TUNABLESCALING_LOG:
5954         default:
5955                 factor = 1 + ilog2(cpus);
5956                 break;
5957         }
5958
5959         return factor;
5960 }
5961
5962 static void update_sysctl(void)
5963 {
5964         unsigned int factor = get_update_sysctl_factor();
5965
5966 #define SET_SYSCTL(name) \
5967         (sysctl_##name = (factor) * normalized_sysctl_##name)
5968         SET_SYSCTL(sched_min_granularity);
5969         SET_SYSCTL(sched_latency);
5970         SET_SYSCTL(sched_wakeup_granularity);
5971 #undef SET_SYSCTL
5972 }
5973
5974 static inline void sched_init_granularity(void)
5975 {
5976         update_sysctl();
5977 }
5978
5979 #ifdef CONFIG_SMP
5980 void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
5981 {
5982         if (p->sched_class && p->sched_class->set_cpus_allowed)
5983                 p->sched_class->set_cpus_allowed(p, new_mask);
5984         else {
5985                 cpumask_copy(&p->cpus_allowed, new_mask);
5986                 p->rt.nr_cpus_allowed = cpumask_weight(new_mask);
5987         }
5988 }
5989
5990 /*
5991  * This is how migration works:
5992  *
5993  * 1) we invoke migration_cpu_stop() on the target CPU using
5994  *    stop_one_cpu().
5995  * 2) stopper starts to run (implicitly forcing the migrated thread
5996  *    off the CPU)
5997  * 3) it checks whether the migrated task is still in the wrong runqueue.
5998  * 4) if it's in the wrong runqueue then the migration thread removes
5999  *    it and puts it into the right queue.
6000  * 5) stopper completes and stop_one_cpu() returns and the migration
6001  *    is done.
6002  */
6003
6004 /*
6005  * Change a given task's CPU affinity. Migrate the thread to a
6006  * proper CPU and schedule it away if the CPU it's executing on
6007  * is removed from the allowed bitmask.
6008  *
6009  * NOTE: the caller must have a valid reference to the task, the
6010  * task must not exit() & deallocate itself prematurely. The
6011  * call is not atomic; no spinlocks may be held.
6012  */
6013 int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
6014 {
6015         unsigned long flags;
6016         struct rq *rq;
6017         unsigned int dest_cpu;
6018         int ret = 0;
6019
6020         rq = task_rq_lock(p, &flags);
6021
6022         if (cpumask_equal(&p->cpus_allowed, new_mask))
6023                 goto out;
6024
6025         if (!cpumask_intersects(new_mask, cpu_active_mask)) {
6026                 ret = -EINVAL;
6027                 goto out;
6028         }
6029
6030         if (unlikely((p->flags & PF_THREAD_BOUND) && p != current)) {
6031                 ret = -EINVAL;
6032                 goto out;
6033         }
6034
6035         do_set_cpus_allowed(p, new_mask);
6036
6037         /* Can the task run on the task's current CPU? If so, we're done */
6038         if (cpumask_test_cpu(task_cpu(p), new_mask))
6039                 goto out;
6040
6041         dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
6042         if (p->on_rq) {
6043                 struct migration_arg arg = { p, dest_cpu };
6044                 /* Need help from migration thread: drop lock and wait. */
6045                 task_rq_unlock(rq, p, &flags);
6046                 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
6047                 tlb_migrate_finish(p->mm);
6048                 return 0;
6049         }
6050 out:
6051         task_rq_unlock(rq, p, &flags);
6052
6053         return ret;
6054 }
6055 EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
6056
6057 /*
6058  * Move (not current) task off this cpu, onto dest cpu. We're doing
6059  * this because either it can't run here any more (set_cpus_allowed()
6060  * away from this CPU, or CPU going down), or because we're
6061  * attempting to rebalance this task on exec (sched_exec).
6062  *
6063  * So we race with normal scheduler movements, but that's OK, as long
6064  * as the task is no longer on this CPU.
6065  *
6066  * Returns non-zero if task was successfully migrated.
6067  */
6068 static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
6069 {
6070         struct rq *rq_dest, *rq_src;
6071         int ret = 0;
6072
6073         if (unlikely(!cpu_active(dest_cpu)))
6074                 return ret;
6075
6076         rq_src = cpu_rq(src_cpu);
6077         rq_dest = cpu_rq(dest_cpu);
6078
6079         raw_spin_lock(&p->pi_lock);
6080         double_rq_lock(rq_src, rq_dest);
6081         /* Already moved. */
6082         if (task_cpu(p) != src_cpu)
6083                 goto done;
6084         /* Affinity changed (again). */
6085         if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
6086                 goto fail;
6087
6088         /*
6089          * If we're not on a rq, the next wake-up will ensure we're
6090          * placed properly.
6091          */
6092         if (p->on_rq) {
6093                 deactivate_task(rq_src, p, 0);
6094                 set_task_cpu(p, dest_cpu);
6095                 activate_task(rq_dest, p, 0);
6096                 check_preempt_curr(rq_dest, p, 0);
6097         }
6098 done:
6099         ret = 1;
6100 fail:
6101         double_rq_unlock(rq_src, rq_dest);
6102         raw_spin_unlock(&p->pi_lock);
6103         return ret;
6104 }
6105
6106 /*
6107  * migration_cpu_stop - this will be executed by a highprio stopper thread
6108  * and performs thread migration by bumping thread off CPU then
6109  * 'pushing' onto another runqueue.
6110  */
6111 static int migration_cpu_stop(void *data)
6112 {
6113         struct migration_arg *arg = data;
6114
6115         /*
6116          * The original target cpu might have gone down and we might
6117          * be on another cpu but it doesn't matter.
6118          */
6119         local_irq_disable();
6120         __migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu);
6121         local_irq_enable();
6122         return 0;
6123 }
6124
6125 #ifdef CONFIG_HOTPLUG_CPU
6126
6127 /*
6128  * Ensures that the idle task is using init_mm right before its cpu goes
6129  * offline.
6130  */
6131 void idle_task_exit(void)
6132 {
6133         struct mm_struct *mm = current->active_mm;
6134
6135         BUG_ON(cpu_online(smp_processor_id()));
6136
6137         if (mm != &init_mm)
6138                 switch_mm(mm, &init_mm, current);
6139         mmdrop(mm);
6140 }
6141
6142 /*
6143  * While a dead CPU has no uninterruptible tasks queued at this point,
6144  * it might still have a nonzero ->nr_uninterruptible counter, because
6145  * for performance reasons the counter is not stricly tracking tasks to
6146  * their home CPUs. So we just add the counter to another CPU's counter,
6147  * to keep the global sum constant after CPU-down:
6148  */
6149 static void migrate_nr_uninterruptible(struct rq *rq_src)
6150 {
6151         struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask));
6152
6153         rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
6154         rq_src->nr_uninterruptible = 0;
6155 }
6156
6157 /*
6158  * remove the tasks which were accounted by rq from calc_load_tasks.
6159  */
6160 static void calc_global_load_remove(struct rq *rq)
6161 {
6162         atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
6163         rq->calc_load_active = 0;
6164 }
6165
6166 /*
6167  * Migrate all tasks from the rq, sleeping tasks will be migrated by
6168  * try_to_wake_up()->select_task_rq().
6169  *
6170  * Called with rq->lock held even though we'er in stop_machine() and
6171  * there's no concurrency possible, we hold the required locks anyway
6172  * because of lock validation efforts.
6173  */
6174 static void migrate_tasks(unsigned int dead_cpu)
6175 {
6176         struct rq *rq = cpu_rq(dead_cpu);
6177         struct task_struct *next, *stop = rq->stop;
6178         int dest_cpu;
6179
6180         /*
6181          * Fudge the rq selection such that the below task selection loop
6182          * doesn't get stuck on the currently eligible stop task.
6183          *
6184          * We're currently inside stop_machine() and the rq is either stuck
6185          * in the stop_machine_cpu_stop() loop, or we're executing this code,
6186          * either way we should never end up calling schedule() until we're
6187          * done here.
6188          */
6189         rq->stop = NULL;
6190
6191         for ( ; ; ) {
6192                 /*
6193                  * There's this thread running, bail when that's the only
6194                  * remaining thread.
6195                  */
6196                 if (rq->nr_running == 1)
6197                         break;
6198
6199                 next = pick_next_task(rq);
6200                 BUG_ON(!next);
6201                 next->sched_class->put_prev_task(rq, next);
6202
6203                 /* Find suitable destination for @next, with force if needed. */
6204                 dest_cpu = select_fallback_rq(dead_cpu, next);
6205                 raw_spin_unlock(&rq->lock);
6206
6207                 __migrate_task(next, dead_cpu, dest_cpu);
6208
6209                 raw_spin_lock(&rq->lock);
6210         }
6211
6212         rq->stop = stop;
6213 }
6214
6215 #endif /* CONFIG_HOTPLUG_CPU */
6216
6217 #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
6218
6219 static struct ctl_table sd_ctl_dir[] = {
6220         {
6221                 .procname       = "sched_domain",
6222                 .mode           = 0555,
6223         },
6224         {}
6225 };
6226
6227 static struct ctl_table sd_ctl_root[] = {
6228         {
6229                 .procname       = "kernel",
6230                 .mode           = 0555,
6231                 .child          = sd_ctl_dir,
6232         },
6233         {}
6234 };
6235
6236 static struct ctl_table *sd_alloc_ctl_entry(int n)
6237 {
6238         struct ctl_table *entry =
6239                 kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL);
6240
6241         return entry;
6242 }
6243
6244 static void sd_free_ctl_entry(struct ctl_table **tablep)
6245 {
6246         struct ctl_table *entry;
6247
6248         /*
6249          * In the intermediate directories, both the child directory and
6250          * procname are dynamically allocated and could fail but the mode
6251          * will always be set. In the lowest directory the names are
6252          * static strings and all have proc handlers.
6253          */
6254         for (entry = *tablep; entry->mode; entry++) {
6255                 if (entry->child)
6256                         sd_free_ctl_entry(&entry->child);
6257                 if (entry->proc_handler == NULL)
6258                         kfree(entry->procname);
6259         }
6260
6261         kfree(*tablep);
6262         *tablep = NULL;
6263 }
6264
6265 static void
6266 set_table_entry(struct ctl_table *entry,
6267                 const char *procname, void *data, int maxlen,
6268                 mode_t mode, proc_handler *proc_handler)
6269 {
6270         entry->procname = procname;
6271         entry->data = data;
6272         entry->maxlen = maxlen;
6273         entry->mode = mode;
6274         entry->proc_handler = proc_handler;
6275 }
6276
6277 static struct ctl_table *
6278 sd_alloc_ctl_domain_table(struct sched_domain *sd)
6279 {
6280         struct ctl_table *table = sd_alloc_ctl_entry(13);
6281
6282         if (table == NULL)
6283                 return NULL;
6284
6285         set_table_entry(&table[0], "min_interval", &sd->min_interval,
6286                 sizeof(long), 0644, proc_doulongvec_minmax);
6287         set_table_entry(&table[1], "max_interval", &sd->max_interval,
6288                 sizeof(long), 0644, proc_doulongvec_minmax);
6289         set_table_entry(&table[2], "busy_idx", &sd->busy_idx,
6290                 sizeof(int), 0644, proc_dointvec_minmax);
6291         set_table_entry(&table[3], "idle_idx", &sd->idle_idx,
6292                 sizeof(int), 0644, proc_dointvec_minmax);
6293         set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx,
6294                 sizeof(int), 0644, proc_dointvec_minmax);
6295         set_table_entry(&table[5], "wake_idx", &sd->wake_idx,
6296                 sizeof(int), 0644, proc_dointvec_minmax);
6297         set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx,
6298                 sizeof(int), 0644, proc_dointvec_minmax);
6299         set_table_entry(&table[7], "busy_factor", &sd->busy_factor,
6300                 sizeof(int), 0644, proc_dointvec_minmax);
6301         set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
6302                 sizeof(int), 0644, proc_dointvec_minmax);
6303         set_table_entry(&table[9], "cache_nice_tries",
6304                 &sd->cache_nice_tries,
6305                 sizeof(int), 0644, proc_dointvec_minmax);
6306         set_table_entry(&table[10], "flags", &sd->flags,
6307                 sizeof(int), 0644, proc_dointvec_minmax);
6308         set_table_entry(&table[11], "name", sd->name,
6309                 CORENAME_MAX_SIZE, 0444, proc_dostring);
6310         /* &table[12] is terminator */
6311
6312         return table;
6313 }
6314
6315 static ctl_table *sd_alloc_ctl_cpu_table(int cpu)
6316 {
6317         struct ctl_table *entry, *table;
6318         struct sched_domain *sd;
6319         int domain_num = 0, i;
6320         char buf[32];
6321
6322         for_each_domain(cpu, sd)
6323                 domain_num++;
6324         entry = table = sd_alloc_ctl_entry(domain_num + 1);
6325         if (table == NULL)
6326                 return NULL;
6327
6328         i = 0;
6329         for_each_domain(cpu, sd) {
6330                 snprintf(buf, 32, "domain%d", i);
6331                 entry->procname = kstrdup(buf, GFP_KERNEL);
6332                 entry->mode = 0555;
6333                 entry->child = sd_alloc_ctl_domain_table(sd);
6334                 entry++;
6335                 i++;
6336         }
6337         return table;
6338 }
6339
6340 static struct ctl_table_header *sd_sysctl_header;
6341 static void register_sched_domain_sysctl(void)
6342 {
6343         int i, cpu_num = num_possible_cpus();
6344         struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
6345         char buf[32];
6346
6347         WARN_ON(sd_ctl_dir[0].child);
6348         sd_ctl_dir[0].child = entry;
6349
6350         if (entry == NULL)
6351                 return;
6352
6353         for_each_possible_cpu(i) {
6354                 snprintf(buf, 32, "cpu%d", i);
6355                 entry->procname = kstrdup(buf, GFP_KERNEL);
6356                 entry->mode = 0555;
6357                 entry->child = sd_alloc_ctl_cpu_table(i);
6358                 entry++;
6359         }
6360
6361         WARN_ON(sd_sysctl_header);
6362         sd_sysctl_header = register_sysctl_table(sd_ctl_root);
6363 }
6364
6365 /* may be called multiple times per register */
6366 static void unregister_sched_domain_sysctl(void)
6367 {
6368         if (sd_sysctl_header)
6369                 unregister_sysctl_table(sd_sysctl_header);
6370         sd_sysctl_header = NULL;
6371         if (sd_ctl_dir[0].child)
6372                 sd_free_ctl_entry(&sd_ctl_dir[0].child);
6373 }
6374 #else
6375 static void register_sched_domain_sysctl(void)
6376 {
6377 }
6378 static void unregister_sched_domain_sysctl(void)
6379 {
6380 }
6381 #endif
6382
6383 static void set_rq_online(struct rq *rq)
6384 {
6385         if (!rq->online) {
6386                 const struct sched_class *class;
6387
6388                 cpumask_set_cpu(rq->cpu, rq->rd->online);
6389                 rq->online = 1;
6390
6391                 for_each_class(class) {
6392                         if (class->rq_online)
6393                                 class->rq_online(rq);
6394                 }
6395         }
6396 }
6397
6398 static void set_rq_offline(struct rq *rq)
6399 {
6400         if (rq->online) {
6401                 const struct sched_class *class;
6402
6403                 for_each_class(class) {
6404                         if (class->rq_offline)
6405                                 class->rq_offline(rq);
6406                 }
6407
6408                 cpumask_clear_cpu(rq->cpu, rq->rd->online);
6409                 rq->online = 0;
6410         }
6411 }
6412
6413 /*
6414  * migration_call - callback that gets triggered when a CPU is added.
6415  * Here we can start up the necessary migration thread for the new CPU.
6416  */
6417 static int __cpuinit
6418 migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6419 {
6420         int cpu = (long)hcpu;
6421         unsigned long flags;
6422         struct rq *rq = cpu_rq(cpu);
6423
6424         switch (action & ~CPU_TASKS_FROZEN) {
6425
6426         case CPU_UP_PREPARE:
6427                 rq->calc_load_update = calc_load_update;
6428                 break;
6429
6430         case CPU_ONLINE:
6431                 /* Update our root-domain */
6432                 raw_spin_lock_irqsave(&rq->lock, flags);
6433                 if (rq->rd) {
6434                         BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
6435
6436                         set_rq_online(rq);
6437                 }
6438                 raw_spin_unlock_irqrestore(&rq->lock, flags);
6439                 break;
6440
6441 #ifdef CONFIG_HOTPLUG_CPU
6442         case CPU_DYING:
6443                 sched_ttwu_pending();
6444                 /* Update our root-domain */
6445                 raw_spin_lock_irqsave(&rq->lock, flags);
6446                 if (rq->rd) {
6447                         BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
6448                         set_rq_offline(rq);
6449                 }
6450                 migrate_tasks(cpu);
6451                 BUG_ON(rq->nr_running != 1); /* the migration thread */
6452                 raw_spin_unlock_irqrestore(&rq->lock, flags);
6453
6454                 migrate_nr_uninterruptible(rq);
6455                 calc_global_load_remove(rq);
6456                 break;
6457 #endif
6458         }
6459
6460         update_max_interval();
6461
6462         return NOTIFY_OK;
6463 }
6464
6465 /*
6466  * Register at high priority so that task migration (migrate_all_tasks)
6467  * happens before everything else.  This has to be lower priority than
6468  * the notifier in the perf_event subsystem, though.
6469  */
6470 static struct notifier_block __cpuinitdata migration_notifier = {
6471         .notifier_call = migration_call,
6472         .priority = CPU_PRI_MIGRATION,
6473 };
6474
6475 static int __cpuinit sched_cpu_active(struct notifier_block *nfb,
6476                                       unsigned long action, void *hcpu)
6477 {
6478         switch (action & ~CPU_TASKS_FROZEN) {
6479         case CPU_ONLINE:
6480         case CPU_DOWN_FAILED:
6481                 set_cpu_active((long)hcpu, true);
6482                 return NOTIFY_OK;
6483         default:
6484                 return NOTIFY_DONE;
6485         }
6486 }
6487
6488 static int __cpuinit sched_cpu_inactive(struct notifier_block *nfb,
6489                                         unsigned long action, void *hcpu)
6490 {
6491         switch (action & ~CPU_TASKS_FROZEN) {
6492         case CPU_DOWN_PREPARE:
6493                 set_cpu_active((long)hcpu, false);
6494                 return NOTIFY_OK;
6495         default:
6496                 return NOTIFY_DONE;
6497         }
6498 }
6499
6500 static int __init migration_init(void)
6501 {
6502         void *cpu = (void *)(long)smp_processor_id();
6503         int err;
6504
6505         /* Initialize migration for the boot CPU */
6506         err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
6507         BUG_ON(err == NOTIFY_BAD);
6508         migration_call(&migration_notifier, CPU_ONLINE, cpu);
6509         register_cpu_notifier(&migration_notifier);
6510
6511         /* Register cpu active notifiers */
6512         cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE);
6513         cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE);
6514
6515         return 0;
6516 }
6517 early_initcall(migration_init);
6518 #endif
6519
6520 #ifdef CONFIG_SMP
6521
6522 static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */
6523
6524 #ifdef CONFIG_SCHED_DEBUG
6525
6526 static __read_mostly int sched_domain_debug_enabled;
6527
6528 static int __init sched_domain_debug_setup(char *str)
6529 {
6530         sched_domain_debug_enabled = 1;
6531
6532         return 0;
6533 }
6534 early_param("sched_debug", sched_domain_debug_setup);
6535
6536 static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
6537                                   struct cpumask *groupmask)
6538 {
6539         struct sched_group *group = sd->groups;
6540         char str[256];
6541
6542         cpulist_scnprintf(str, sizeof(str), sched_domain_span(sd));
6543         cpumask_clear(groupmask);
6544
6545         printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
6546
6547         if (!(sd->flags & SD_LOAD_BALANCE)) {
6548                 printk("does not load-balance\n");
6549                 if (sd->parent)
6550                         printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
6551                                         " has parent");
6552                 return -1;
6553         }
6554
6555         printk(KERN_CONT "span %s level %s\n", str, sd->name);
6556
6557         if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
6558                 printk(KERN_ERR "ERROR: domain->span does not contain "
6559                                 "CPU%d\n", cpu);
6560         }
6561         if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) {
6562                 printk(KERN_ERR "ERROR: domain->groups does not contain"
6563                                 " CPU%d\n", cpu);
6564         }
6565
6566         printk(KERN_DEBUG "%*s groups:", level + 1, "");
6567         do {
6568                 if (!group) {
6569                         printk("\n");
6570                         printk(KERN_ERR "ERROR: group is NULL\n");
6571                         break;
6572                 }
6573
6574                 if (!group->sgp->power) {
6575                         printk(KERN_CONT "\n");
6576                         printk(KERN_ERR "ERROR: domain->cpu_power not "
6577                                         "set\n");
6578                         break;
6579                 }
6580
6581                 if (!cpumask_weight(sched_group_cpus(group))) {
6582                         printk(KERN_CONT "\n");
6583                         printk(KERN_ERR "ERROR: empty group\n");
6584                         break;
6585                 }
6586
6587                 if (cpumask_intersects(groupmask, sched_group_cpus(group))) {
6588                         printk(KERN_CONT "\n");
6589                         printk(KERN_ERR "ERROR: repeated CPUs\n");
6590                         break;
6591                 }
6592
6593                 cpumask_or(groupmask, groupmask, sched_group_cpus(group));
6594
6595                 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
6596
6597                 printk(KERN_CONT " %s", str);
6598                 if (group->sgp->power != SCHED_POWER_SCALE) {
6599                         printk(KERN_CONT " (cpu_power = %d)",
6600                                 group->sgp->power);
6601                 }
6602
6603                 group = group->next;
6604         } while (group != sd->groups);
6605         printk(KERN_CONT "\n");
6606
6607         if (!cpumask_equal(sched_domain_span(sd), groupmask))
6608                 printk(KERN_ERR "ERROR: groups don't span domain->span\n");
6609
6610         if (sd->parent &&
6611             !cpumask_subset(groupmask, sched_domain_span(sd->parent)))
6612                 printk(KERN_ERR "ERROR: parent span is not a superset "
6613                         "of domain->span\n");
6614         return 0;
6615 }
6616
6617 static void sched_domain_debug(struct sched_domain *sd, int cpu)
6618 {
6619         int level = 0;
6620
6621         if (!sched_domain_debug_enabled)
6622                 return;
6623
6624         if (!sd) {
6625                 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
6626                 return;
6627         }
6628
6629         printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
6630
6631         for (;;) {
6632                 if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask))
6633                         break;
6634                 level++;
6635                 sd = sd->parent;
6636                 if (!sd)
6637                         break;
6638         }
6639 }
6640 #else /* !CONFIG_SCHED_DEBUG */
6641 # define sched_domain_debug(sd, cpu) do { } while (0)
6642 #endif /* CONFIG_SCHED_DEBUG */
6643
6644 static int sd_degenerate(struct sched_domain *sd)
6645 {
6646         if (cpumask_weight(sched_domain_span(sd)) == 1)
6647                 return 1;
6648
6649         /* Following flags need at least 2 groups */
6650         if (sd->flags & (SD_LOAD_BALANCE |
6651                          SD_BALANCE_NEWIDLE |
6652                          SD_BALANCE_FORK |
6653                          SD_BALANCE_EXEC |
6654                          SD_SHARE_CPUPOWER |
6655                          SD_SHARE_PKG_RESOURCES)) {
6656                 if (sd->groups != sd->groups->next)
6657                         return 0;
6658         }
6659
6660         /* Following flags don't use groups */
6661         if (sd->flags & (SD_WAKE_AFFINE))
6662                 return 0;
6663
6664         return 1;
6665 }
6666
6667 static int
6668 sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
6669 {
6670         unsigned long cflags = sd->flags, pflags = parent->flags;
6671
6672         if (sd_degenerate(parent))
6673                 return 1;
6674
6675         if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
6676                 return 0;
6677
6678         /* Flags needing groups don't count if only 1 group in parent */
6679         if (parent->groups == parent->groups->next) {
6680                 pflags &= ~(SD_LOAD_BALANCE |
6681                                 SD_BALANCE_NEWIDLE |
6682                                 SD_BALANCE_FORK |
6683                                 SD_BALANCE_EXEC |
6684                                 SD_SHARE_CPUPOWER |
6685                                 SD_SHARE_PKG_RESOURCES);
6686                 if (nr_node_ids == 1)
6687                         pflags &= ~SD_SERIALIZE;
6688         }
6689         if (~cflags & pflags)
6690                 return 0;
6691
6692         return 1;
6693 }
6694
6695 static void free_rootdomain(struct rcu_head *rcu)
6696 {
6697         struct root_domain *rd = container_of(rcu, struct root_domain, rcu);
6698
6699         cpupri_cleanup(&rd->cpupri);
6700         free_cpumask_var(rd->rto_mask);
6701         free_cpumask_var(rd->online);
6702         free_cpumask_var(rd->span);
6703         kfree(rd);
6704 }
6705
6706 static void rq_attach_root(struct rq *rq, struct root_domain *rd)
6707 {
6708         struct root_domain *old_rd = NULL;
6709         unsigned long flags;
6710
6711         raw_spin_lock_irqsave(&rq->lock, flags);
6712
6713         if (rq->rd) {
6714                 old_rd = rq->rd;
6715
6716                 if (cpumask_test_cpu(rq->cpu, old_rd->online))
6717                         set_rq_offline(rq);
6718
6719                 cpumask_clear_cpu(rq->cpu, old_rd->span);
6720
6721                 /*
6722                  * If we dont want to free the old_rt yet then
6723                  * set old_rd to NULL to skip the freeing later
6724                  * in this function:
6725                  */
6726                 if (!atomic_dec_and_test(&old_rd->refcount))
6727                         old_rd = NULL;
6728         }
6729
6730         atomic_inc(&rd->refcount);
6731         rq->rd = rd;
6732
6733         cpumask_set_cpu(rq->cpu, rd->span);
6734         if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
6735                 set_rq_online(rq);
6736
6737         raw_spin_unlock_irqrestore(&rq->lock, flags);
6738
6739         if (old_rd)
6740                 call_rcu_sched(&old_rd->rcu, free_rootdomain);
6741 }
6742
6743 static int init_rootdomain(struct root_domain *rd)
6744 {
6745         memset(rd, 0, sizeof(*rd));
6746
6747         if (!alloc_cpumask_var(&rd->span, GFP_KERNEL))
6748                 goto out;
6749         if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
6750                 goto free_span;
6751         if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
6752                 goto free_online;
6753
6754         if (cpupri_init(&rd->cpupri) != 0)
6755                 goto free_rto_mask;
6756         return 0;
6757
6758 free_rto_mask:
6759         free_cpumask_var(rd->rto_mask);
6760 free_online:
6761         free_cpumask_var(rd->online);
6762 free_span:
6763         free_cpumask_var(rd->span);
6764 out:
6765         return -ENOMEM;
6766 }
6767
6768 static void init_defrootdomain(void)
6769 {
6770         init_rootdomain(&def_root_domain);
6771
6772         atomic_set(&def_root_domain.refcount, 1);
6773 }
6774
6775 static struct root_domain *alloc_rootdomain(void)
6776 {
6777         struct root_domain *rd;
6778
6779         rd = kmalloc(sizeof(*rd), GFP_KERNEL);
6780         if (!rd)
6781                 return NULL;
6782
6783         if (init_rootdomain(rd) != 0) {
6784                 kfree(rd);
6785                 return NULL;
6786         }
6787
6788         return rd;
6789 }
6790
6791 static void free_sched_groups(struct sched_group *sg, int free_sgp)
6792 {
6793         struct sched_group *tmp, *first;
6794
6795         if (!sg)
6796                 return;
6797
6798         first = sg;
6799         do {
6800                 tmp = sg->next;
6801
6802                 if (free_sgp && atomic_dec_and_test(&sg->sgp->ref))
6803                         kfree(sg->sgp);
6804
6805                 kfree(sg);
6806                 sg = tmp;
6807         } while (sg != first);
6808 }
6809
6810 static void free_sched_domain(struct rcu_head *rcu)
6811 {
6812         struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
6813
6814         /*
6815          * If its an overlapping domain it has private groups, iterate and
6816          * nuke them all.
6817          */
6818         if (sd->flags & SD_OVERLAP) {
6819                 free_sched_groups(sd->groups, 1);
6820         } else if (atomic_dec_and_test(&sd->groups->ref)) {
6821                 kfree(sd->groups->sgp);
6822                 kfree(sd->groups);
6823         }
6824         kfree(sd);
6825 }
6826
6827 static void destroy_sched_domain(struct sched_domain *sd, int cpu)
6828 {
6829         call_rcu(&sd->rcu, free_sched_domain);
6830 }
6831
6832 static void destroy_sched_domains(struct sched_domain *sd, int cpu)
6833 {
6834         for (; sd; sd = sd->parent)
6835                 destroy_sched_domain(sd, cpu);
6836 }
6837
6838 /*
6839  * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
6840  * hold the hotplug lock.
6841  */
6842 static void
6843 cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
6844 {
6845         struct rq *rq = cpu_rq(cpu);
6846         struct sched_domain *tmp;
6847
6848         /* Remove the sched domains which do not contribute to scheduling. */
6849         for (tmp = sd; tmp; ) {
6850                 struct sched_domain *parent = tmp->parent;
6851                 if (!parent)
6852                         break;
6853
6854                 if (sd_parent_degenerate(tmp, parent)) {
6855                         tmp->parent = parent->parent;
6856                         if (parent->parent)
6857                                 parent->parent->child = tmp;
6858                         destroy_sched_domain(parent, cpu);
6859                 } else
6860                         tmp = tmp->parent;
6861         }
6862
6863         if (sd && sd_degenerate(sd)) {
6864                 tmp = sd;
6865                 sd = sd->parent;
6866                 destroy_sched_domain(tmp, cpu);
6867                 if (sd)
6868                         sd->child = NULL;
6869         }
6870
6871         sched_domain_debug(sd, cpu);
6872
6873         rq_attach_root(rq, rd);
6874         tmp = rq->sd;
6875         rcu_assign_pointer(rq->sd, sd);
6876         destroy_sched_domains(tmp, cpu);
6877 }
6878
6879 /* cpus with isolated domains */
6880 static cpumask_var_t cpu_isolated_map;
6881
6882 /* Setup the mask of cpus configured for isolated domains */
6883 static int __init isolated_cpu_setup(char *str)
6884 {
6885         alloc_bootmem_cpumask_var(&cpu_isolated_map);
6886         cpulist_parse(str, cpu_isolated_map);
6887         return 1;
6888 }
6889
6890 __setup("isolcpus=", isolated_cpu_setup);
6891
6892 #define SD_NODES_PER_DOMAIN 16
6893
6894 #ifdef CONFIG_NUMA
6895
6896 /**
6897  * find_next_best_node - find the next node to include in a sched_domain
6898  * @node: node whose sched_domain we're building
6899  * @used_nodes: nodes already in the sched_domain
6900  *
6901  * Find the next node to include in a given scheduling domain. Simply
6902  * finds the closest node not already in the @used_nodes map.
6903  *
6904  * Should use nodemask_t.
6905  */
6906 static int find_next_best_node(int node, nodemask_t *used_nodes)
6907 {
6908         int i, n, val, min_val, best_node = -1;
6909
6910         min_val = INT_MAX;
6911
6912         for (i = 0; i < nr_node_ids; i++) {
6913                 /* Start at @node */
6914                 n = (node + i) % nr_node_ids;
6915
6916                 if (!nr_cpus_node(n))
6917                         continue;
6918
6919                 /* Skip already used nodes */
6920                 if (node_isset(n, *used_nodes))
6921                         continue;
6922
6923                 /* Simple min distance search */
6924                 val = node_distance(node, n);
6925
6926                 if (val < min_val) {
6927                         min_val = val;
6928                         best_node = n;
6929                 }
6930         }
6931
6932         if (best_node != -1)
6933                 node_set(best_node, *used_nodes);
6934         return best_node;
6935 }
6936
6937 /**
6938  * sched_domain_node_span - get a cpumask for a node's sched_domain
6939  * @node: node whose cpumask we're constructing
6940  * @span: resulting cpumask
6941  *
6942  * Given a node, construct a good cpumask for its sched_domain to span. It
6943  * should be one that prevents unnecessary balancing, but also spreads tasks
6944  * out optimally.
6945  */
6946 static void sched_domain_node_span(int node, struct cpumask *span)
6947 {
6948         nodemask_t used_nodes;
6949         int i;
6950
6951         cpumask_clear(span);
6952         nodes_clear(used_nodes);
6953
6954         cpumask_or(span, span, cpumask_of_node(node));
6955         node_set(node, used_nodes);
6956
6957         for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
6958                 int next_node = find_next_best_node(node, &used_nodes);
6959                 if (next_node < 0)
6960                         break;
6961                 cpumask_or(span, span, cpumask_of_node(next_node));
6962         }
6963 }
6964
6965 static const struct cpumask *cpu_node_mask(int cpu)
6966 {
6967         lockdep_assert_held(&sched_domains_mutex);
6968
6969         sched_domain_node_span(cpu_to_node(cpu), sched_domains_tmpmask);
6970
6971         return sched_domains_tmpmask;
6972 }
6973
6974 static const struct cpumask *cpu_allnodes_mask(int cpu)
6975 {
6976         return cpu_possible_mask;
6977 }
6978 #endif /* CONFIG_NUMA */
6979
6980 static const struct cpumask *cpu_cpu_mask(int cpu)
6981 {
6982         return cpumask_of_node(cpu_to_node(cpu));
6983 }
6984
6985 int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
6986
6987 struct sd_data {
6988         struct sched_domain **__percpu sd;
6989         struct sched_group **__percpu sg;
6990         struct sched_group_power **__percpu sgp;
6991 };
6992
6993 struct s_data {
6994         struct sched_domain ** __percpu sd;
6995         struct root_domain      *rd;
6996 };
6997
6998 enum s_alloc {
6999         sa_rootdomain,
7000         sa_sd,
7001         sa_sd_storage,
7002         sa_none,
7003 };
7004
7005 struct sched_domain_topology_level;
7006
7007 typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu);
7008 typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
7009
7010 #define SDTL_OVERLAP    0x01
7011
7012 struct sched_domain_topology_level {
7013         sched_domain_init_f init;
7014         sched_domain_mask_f mask;
7015         int                 flags;
7016         struct sd_data      data;
7017 };
7018
7019 static int
7020 build_overlap_sched_groups(struct sched_domain *sd, int cpu)
7021 {
7022         struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg;
7023         const struct cpumask *span = sched_domain_span(sd);
7024         struct cpumask *covered = sched_domains_tmpmask;
7025         struct sd_data *sdd = sd->private;
7026         struct sched_domain *child;
7027         int i;
7028
7029         cpumask_clear(covered);
7030
7031         for_each_cpu(i, span) {
7032                 struct cpumask *sg_span;
7033
7034                 if (cpumask_test_cpu(i, covered))
7035                         continue;
7036
7037                 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
7038                                 GFP_KERNEL, cpu_to_node(i));
7039
7040                 if (!sg)
7041                         goto fail;
7042
7043                 sg_span = sched_group_cpus(sg);
7044
7045                 child = *per_cpu_ptr(sdd->sd, i);
7046                 if (child->child) {
7047                         child = child->child;
7048                         cpumask_copy(sg_span, sched_domain_span(child));
7049                 } else
7050                         cpumask_set_cpu(i, sg_span);
7051
7052                 cpumask_or(covered, covered, sg_span);
7053
7054                 sg->sgp = *per_cpu_ptr(sdd->sgp, cpumask_first(sg_span));
7055                 atomic_inc(&sg->sgp->ref);
7056
7057                 if (cpumask_test_cpu(cpu, sg_span))
7058                         groups = sg;
7059
7060                 if (!first)
7061                         first = sg;
7062                 if (last)
7063                         last->next = sg;
7064                 last = sg;
7065                 last->next = first;
7066         }
7067         sd->groups = groups;
7068
7069         return 0;
7070
7071 fail:
7072         free_sched_groups(first, 0);
7073
7074         return -ENOMEM;
7075 }
7076
7077 static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
7078 {
7079         struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
7080         struct sched_domain *child = sd->child;
7081
7082         if (child)
7083                 cpu = cpumask_first(sched_domain_span(child));
7084
7085         if (sg) {
7086                 *sg = *per_cpu_ptr(sdd->sg, cpu);
7087                 (*sg)->sgp = *per_cpu_ptr(sdd->sgp, cpu);
7088                 atomic_set(&(*sg)->sgp->ref, 1); /* for claim_allocations */
7089         }
7090
7091         return cpu;
7092 }
7093
7094 /*
7095  * build_sched_groups will build a circular linked list of the groups
7096  * covered by the given span, and will set each group's ->cpumask correctly,
7097  * and ->cpu_power to 0.
7098  *
7099  * Assumes the sched_domain tree is fully constructed
7100  */
7101 static int
7102 build_sched_groups(struct sched_domain *sd, int cpu)
7103 {
7104         struct sched_group *first = NULL, *last = NULL;
7105         struct sd_data *sdd = sd->private;
7106         const struct cpumask *span = sched_domain_span(sd);
7107         struct cpumask *covered;
7108         int i;
7109
7110         get_group(cpu, sdd, &sd->groups);
7111         atomic_inc(&sd->groups->ref);
7112
7113         if (cpu != cpumask_first(sched_domain_span(sd)))
7114                 return 0;
7115
7116         lockdep_assert_held(&sched_domains_mutex);
7117         covered = sched_domains_tmpmask;
7118
7119         cpumask_clear(covered);
7120
7121         for_each_cpu(i, span) {
7122                 struct sched_group *sg;
7123                 int group = get_group(i, sdd, &sg);
7124                 int j;
7125
7126                 if (cpumask_test_cpu(i, covered))
7127                         continue;
7128
7129                 cpumask_clear(sched_group_cpus(sg));
7130                 sg->sgp->power = 0;
7131
7132                 for_each_cpu(j, span) {
7133                         if (get_group(j, sdd, NULL) != group)
7134                                 continue;
7135
7136                         cpumask_set_cpu(j, covered);
7137                         cpumask_set_cpu(j, sched_group_cpus(sg));
7138                 }
7139
7140                 if (!first)
7141                         first = sg;
7142                 if (last)
7143                         last->next = sg;
7144                 last = sg;
7145         }
7146         last->next = first;
7147
7148         return 0;
7149 }
7150
7151 /*
7152  * Initialize sched groups cpu_power.
7153  *
7154  * cpu_power indicates the capacity of sched group, which is used while
7155  * distributing the load between different sched groups in a sched domain.
7156  * Typically cpu_power for all the groups in a sched domain will be same unless
7157  * there are asymmetries in the topology. If there are asymmetries, group
7158  * having more cpu_power will pickup more load compared to the group having
7159  * less cpu_power.
7160  */
7161 static void init_sched_groups_power(int cpu, struct sched_domain *sd)
7162 {
7163         struct sched_group *sg = sd->groups;
7164
7165         WARN_ON(!sd || !sg);
7166
7167         do {
7168                 sg->group_weight = cpumask_weight(sched_group_cpus(sg));
7169                 sg = sg->next;
7170         } while (sg != sd->groups);
7171
7172         if (cpu != group_first_cpu(sg))
7173                 return;
7174
7175         update_group_power(sd, cpu);
7176 }
7177
7178 /*
7179  * Initializers for schedule domains
7180  * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
7181  */
7182
7183 #ifdef CONFIG_SCHED_DEBUG
7184 # define SD_INIT_NAME(sd, type)         sd->name = #type
7185 #else
7186 # define SD_INIT_NAME(sd, type)         do { } while (0)
7187 #endif
7188
7189 #define SD_INIT_FUNC(type)                                              \
7190 static noinline struct sched_domain *                                   \
7191 sd_init_##type(struct sched_domain_topology_level *tl, int cpu)         \
7192 {                                                                       \
7193         struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);       \
7194         *sd = SD_##type##_INIT;                                         \
7195         SD_INIT_NAME(sd, type);                                         \
7196         sd->private = &tl->data;                                        \
7197         return sd;                                                      \
7198 }
7199
7200 SD_INIT_FUNC(CPU)
7201 #ifdef CONFIG_NUMA
7202  SD_INIT_FUNC(ALLNODES)
7203  SD_INIT_FUNC(NODE)
7204 #endif
7205 #ifdef CONFIG_SCHED_SMT
7206  SD_INIT_FUNC(SIBLING)
7207 #endif
7208 #ifdef CONFIG_SCHED_MC
7209  SD_INIT_FUNC(MC)
7210 #endif
7211 #ifdef CONFIG_SCHED_BOOK
7212  SD_INIT_FUNC(BOOK)
7213 #endif
7214
7215 static int default_relax_domain_level = -1;
7216 int sched_domain_level_max;
7217
7218 static int __init setup_relax_domain_level(char *str)
7219 {
7220         if (kstrtoint(str, 0, &default_relax_domain_level))
7221                 pr_warn("Unable to set relax_domain_level\n");
7222
7223         return 1;
7224 }
7225 __setup("relax_domain_level=", setup_relax_domain_level);
7226
7227 static void set_domain_attribute(struct sched_domain *sd,
7228                                  struct sched_domain_attr *attr)
7229 {
7230         int request;
7231
7232         if (!attr || attr->relax_domain_level < 0) {
7233                 if (default_relax_domain_level < 0)
7234                         return;
7235                 else
7236                         request = default_relax_domain_level;
7237         } else
7238                 request = attr->relax_domain_level;
7239         if (request < sd->level) {
7240                 /* turn off idle balance on this domain */
7241                 sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
7242         } else {
7243                 /* turn on idle balance on this domain */
7244                 sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
7245         }
7246 }
7247
7248 static void __sdt_free(const struct cpumask *cpu_map);
7249 static int __sdt_alloc(const struct cpumask *cpu_map);
7250
7251 static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
7252                                  const struct cpumask *cpu_map)
7253 {
7254         switch (what) {
7255         case sa_rootdomain:
7256                 if (!atomic_read(&d->rd->refcount))
7257                         free_rootdomain(&d->rd->rcu); /* fall through */
7258         case sa_sd:
7259                 free_percpu(d->sd); /* fall through */
7260         case sa_sd_storage:
7261                 __sdt_free(cpu_map); /* fall through */
7262         case sa_none:
7263                 break;
7264         }
7265 }
7266
7267 static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
7268                                                    const struct cpumask *cpu_map)
7269 {
7270         memset(d, 0, sizeof(*d));
7271
7272         if (__sdt_alloc(cpu_map))
7273                 return sa_sd_storage;
7274         d->sd = alloc_percpu(struct sched_domain *);
7275         if (!d->sd)
7276                 return sa_sd_storage;
7277         d->rd = alloc_rootdomain();
7278         if (!d->rd)
7279                 return sa_sd;
7280         return sa_rootdomain;
7281 }
7282
7283 /*
7284  * NULL the sd_data elements we've used to build the sched_domain and
7285  * sched_group structure so that the subsequent __free_domain_allocs()
7286  * will not free the data we're using.
7287  */
7288 static void claim_allocations(int cpu, struct sched_domain *sd)
7289 {
7290         struct sd_data *sdd = sd->private;
7291
7292         WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
7293         *per_cpu_ptr(sdd->sd, cpu) = NULL;
7294
7295         if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
7296                 *per_cpu_ptr(sdd->sg, cpu) = NULL;
7297
7298         if (atomic_read(&(*per_cpu_ptr(sdd->sgp, cpu))->ref))
7299                 *per_cpu_ptr(sdd->sgp, cpu) = NULL;
7300 }
7301
7302 #ifdef CONFIG_SCHED_SMT
7303 static const struct cpumask *cpu_smt_mask(int cpu)
7304 {
7305         return topology_thread_cpumask(cpu);
7306 }
7307 #endif
7308
7309 /*
7310  * Topology list, bottom-up.
7311  */
7312 static struct sched_domain_topology_level default_topology[] = {
7313 #ifdef CONFIG_SCHED_SMT
7314         { sd_init_SIBLING, cpu_smt_mask, },
7315 #endif
7316 #ifdef CONFIG_SCHED_MC
7317         { sd_init_MC, cpu_coregroup_mask, },
7318 #endif
7319 #ifdef CONFIG_SCHED_BOOK
7320         { sd_init_BOOK, cpu_book_mask, },
7321 #endif
7322         { sd_init_CPU, cpu_cpu_mask, },
7323 #ifdef CONFIG_NUMA
7324         { sd_init_NODE, cpu_node_mask, SDTL_OVERLAP, },
7325         { sd_init_ALLNODES, cpu_allnodes_mask, },
7326 #endif
7327         { NULL, },
7328 };
7329
7330 static struct sched_domain_topology_level *sched_domain_topology = default_topology;
7331
7332 static int __sdt_alloc(const struct cpumask *cpu_map)
7333 {
7334         struct sched_domain_topology_level *tl;
7335         int j;
7336
7337         for (tl = sched_domain_topology; tl->init; tl++) {
7338                 struct sd_data *sdd = &tl->data;
7339
7340                 sdd->sd = alloc_percpu(struct sched_domain *);
7341                 if (!sdd->sd)
7342                         return -ENOMEM;
7343
7344                 sdd->sg = alloc_percpu(struct sched_group *);
7345                 if (!sdd->sg)
7346                         return -ENOMEM;
7347
7348                 sdd->sgp = alloc_percpu(struct sched_group_power *);
7349                 if (!sdd->sgp)
7350                         return -ENOMEM;
7351
7352                 for_each_cpu(j, cpu_map) {
7353                         struct sched_domain *sd;
7354                         struct sched_group *sg;
7355                         struct sched_group_power *sgp;
7356
7357                         sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
7358                                         GFP_KERNEL, cpu_to_node(j));
7359                         if (!sd)
7360                                 return -ENOMEM;
7361
7362                         *per_cpu_ptr(sdd->sd, j) = sd;
7363
7364                         sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
7365                                         GFP_KERNEL, cpu_to_node(j));
7366                         if (!sg)
7367                                 return -ENOMEM;
7368
7369                         *per_cpu_ptr(sdd->sg, j) = sg;
7370
7371                         sgp = kzalloc_node(sizeof(struct sched_group_power),
7372                                         GFP_KERNEL, cpu_to_node(j));
7373                         if (!sgp)
7374                                 return -ENOMEM;
7375
7376                         *per_cpu_ptr(sdd->sgp, j) = sgp;
7377                 }
7378         }
7379
7380         return 0;
7381 }
7382
7383 static void __sdt_free(const struct cpumask *cpu_map)
7384 {
7385         struct sched_domain_topology_level *tl;
7386         int j;
7387
7388         for (tl = sched_domain_topology; tl->init; tl++) {
7389                 struct sd_data *sdd = &tl->data;
7390
7391                 for_each_cpu(j, cpu_map) {
7392                         struct sched_domain *sd = *per_cpu_ptr(sdd->sd, j);
7393                         if (sd && (sd->flags & SD_OVERLAP))
7394                                 free_sched_groups(sd->groups, 0);
7395                         kfree(*per_cpu_ptr(sdd->sd, j));
7396                         kfree(*per_cpu_ptr(sdd->sg, j));
7397                         kfree(*per_cpu_ptr(sdd->sgp, j));
7398                 }
7399                 free_percpu(sdd->sd);
7400                 free_percpu(sdd->sg);
7401                 free_percpu(sdd->sgp);
7402         }
7403 }
7404
7405 struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
7406                 struct s_data *d, const struct cpumask *cpu_map,
7407                 struct sched_domain_attr *attr, struct sched_domain *child,
7408                 int cpu)
7409 {
7410         struct sched_domain *sd = tl->init(tl, cpu);
7411         if (!sd)
7412                 return child;
7413
7414         cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
7415         if (child) {
7416                 sd->level = child->level + 1;
7417                 sched_domain_level_max = max(sched_domain_level_max, sd->level);
7418                 child->parent = sd;
7419         }
7420         sd->child = child;
7421         set_domain_attribute(sd, attr);
7422
7423         return sd;
7424 }
7425
7426 /*
7427  * Build sched domains for a given set of cpus and attach the sched domains
7428  * to the individual cpus
7429  */
7430 static int build_sched_domains(const struct cpumask *cpu_map,
7431                                struct sched_domain_attr *attr)
7432 {
7433         enum s_alloc alloc_state = sa_none;
7434         struct sched_domain *sd;
7435         struct s_data d;
7436         int i, ret = -ENOMEM;
7437
7438         alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
7439         if (alloc_state != sa_rootdomain)
7440                 goto error;
7441
7442         /* Set up domains for cpus specified by the cpu_map. */
7443         for_each_cpu(i, cpu_map) {
7444                 struct sched_domain_topology_level *tl;
7445
7446                 sd = NULL;
7447                 for (tl = sched_domain_topology; tl->init; tl++) {
7448                         sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i);
7449                         if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP))
7450                                 sd->flags |= SD_OVERLAP;
7451                         if (cpumask_equal(cpu_map, sched_domain_span(sd)))
7452                                 break;
7453                 }
7454
7455                 while (sd->child)
7456                         sd = sd->child;
7457
7458                 *per_cpu_ptr(d.sd, i) = sd;
7459         }
7460
7461         /* Build the groups for the domains */
7462         for_each_cpu(i, cpu_map) {
7463                 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
7464                         sd->span_weight = cpumask_weight(sched_domain_span(sd));
7465                         if (sd->flags & SD_OVERLAP) {
7466                                 if (build_overlap_sched_groups(sd, i))
7467                                         goto error;
7468                         } else {
7469                                 if (build_sched_groups(sd, i))
7470                                         goto error;
7471                         }
7472                 }
7473         }
7474
7475         /* Calculate CPU power for physical packages and nodes */
7476         for (i = nr_cpumask_bits-1; i >= 0; i--) {
7477                 if (!cpumask_test_cpu(i, cpu_map))
7478                         continue;
7479
7480                 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
7481                         claim_allocations(i, sd);
7482                         init_sched_groups_power(i, sd);
7483                 }
7484         }
7485
7486         /* Attach the domains */
7487         rcu_read_lock();
7488         for_each_cpu(i, cpu_map) {
7489                 sd = *per_cpu_ptr(d.sd, i);
7490                 cpu_attach_domain(sd, d.rd, i);
7491         }
7492         rcu_read_unlock();
7493
7494         ret = 0;
7495 error:
7496         __free_domain_allocs(&d, alloc_state, cpu_map);
7497         return ret;
7498 }
7499
7500 static cpumask_var_t *doms_cur; /* current sched domains */
7501 static int ndoms_cur;           /* number of sched domains in 'doms_cur' */
7502 static struct sched_domain_attr *dattr_cur;
7503                                 /* attribues of custom domains in 'doms_cur' */
7504
7505 /*
7506  * Special case: If a kmalloc of a doms_cur partition (array of
7507  * cpumask) fails, then fallback to a single sched domain,
7508  * as determined by the single cpumask fallback_doms.
7509  */
7510 static cpumask_var_t fallback_doms;
7511
7512 /*
7513  * arch_update_cpu_topology lets virtualized architectures update the
7514  * cpu core maps. It is supposed to return 1 if the topology changed
7515  * or 0 if it stayed the same.
7516  */
7517 int __attribute__((weak)) arch_update_cpu_topology(void)
7518 {
7519         return 0;
7520 }
7521
7522 cpumask_var_t *alloc_sched_domains(unsigned int ndoms)
7523 {
7524         int i;
7525         cpumask_var_t *doms;
7526
7527         doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL);
7528         if (!doms)
7529                 return NULL;
7530         for (i = 0; i < ndoms; i++) {
7531                 if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) {
7532                         free_sched_domains(doms, i);
7533                         return NULL;
7534                 }
7535         }
7536         return doms;
7537 }
7538
7539 void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
7540 {
7541         unsigned int i;
7542         for (i = 0; i < ndoms; i++)
7543                 free_cpumask_var(doms[i]);
7544         kfree(doms);
7545 }
7546
7547 /*
7548  * Set up scheduler domains and groups. Callers must hold the hotplug lock.
7549  * For now this just excludes isolated cpus, but could be used to
7550  * exclude other special cases in the future.
7551  */
7552 static int init_sched_domains(const struct cpumask *cpu_map)
7553 {
7554         int err;
7555
7556         arch_update_cpu_topology();
7557         ndoms_cur = 1;
7558         doms_cur = alloc_sched_domains(ndoms_cur);
7559         if (!doms_cur)
7560                 doms_cur = &fallback_doms;
7561         cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
7562         dattr_cur = NULL;
7563         err = build_sched_domains(doms_cur[0], NULL);
7564         register_sched_domain_sysctl();
7565
7566         return err;
7567 }
7568
7569 /*
7570  * Detach sched domains from a group of cpus specified in cpu_map
7571  * These cpus will now be attached to the NULL domain
7572  */
7573 static void detach_destroy_domains(const struct cpumask *cpu_map)
7574 {
7575         int i;
7576
7577         rcu_read_lock();
7578         for_each_cpu(i, cpu_map)
7579                 cpu_attach_domain(NULL, &def_root_domain, i);
7580         rcu_read_unlock();
7581 }
7582
7583 /* handle null as "default" */
7584 static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
7585                         struct sched_domain_attr *new, int idx_new)
7586 {
7587         struct sched_domain_attr tmp;
7588
7589         /* fast path */
7590         if (!new && !cur)
7591                 return 1;
7592
7593         tmp = SD_ATTR_INIT;
7594         return !memcmp(cur ? (cur + idx_cur) : &tmp,
7595                         new ? (new + idx_new) : &tmp,
7596                         sizeof(struct sched_domain_attr));
7597 }
7598
7599 /*
7600  * Partition sched domains as specified by the 'ndoms_new'
7601  * cpumasks in the array doms_new[] of cpumasks. This compares
7602  * doms_new[] to the current sched domain partitioning, doms_cur[].
7603  * It destroys each deleted domain and builds each new domain.
7604  *
7605  * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'.
7606  * The masks don't intersect (don't overlap.) We should setup one
7607  * sched domain for each mask. CPUs not in any of the cpumasks will
7608  * not be load balanced. If the same cpumask appears both in the
7609  * current 'doms_cur' domains and in the new 'doms_new', we can leave
7610  * it as it is.
7611  *
7612  * The passed in 'doms_new' should be allocated using
7613  * alloc_sched_domains.  This routine takes ownership of it and will
7614  * free_sched_domains it when done with it. If the caller failed the
7615  * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1,
7616  * and partition_sched_domains() will fallback to the single partition
7617  * 'fallback_doms', it also forces the domains to be rebuilt.
7618  *
7619  * If doms_new == NULL it will be replaced with cpu_online_mask.
7620  * ndoms_new == 0 is a special case for destroying existing domains,
7621  * and it will not create the default domain.
7622  *
7623  * Call with hotplug lock held
7624  */
7625 void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
7626                              struct sched_domain_attr *dattr_new)
7627 {
7628         int i, j, n;
7629         int new_topology;
7630
7631         mutex_lock(&sched_domains_mutex);
7632
7633         /* always unregister in case we don't destroy any domains */
7634         unregister_sched_domain_sysctl();
7635
7636         /* Let architecture update cpu core mappings. */
7637         new_topology = arch_update_cpu_topology();
7638
7639         n = doms_new ? ndoms_new : 0;
7640
7641         /* Destroy deleted domains */
7642         for (i = 0; i < ndoms_cur; i++) {
7643                 for (j = 0; j < n && !new_topology; j++) {
7644                         if (cpumask_equal(doms_cur[i], doms_new[j])
7645                             && dattrs_equal(dattr_cur, i, dattr_new, j))
7646                                 goto match1;
7647                 }
7648                 /* no match - a current sched domain not in new doms_new[] */
7649                 detach_destroy_domains(doms_cur[i]);
7650 match1:
7651                 ;
7652         }
7653
7654         if (doms_new == NULL) {
7655                 ndoms_cur = 0;
7656                 doms_new = &fallback_doms;
7657                 cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);
7658                 WARN_ON_ONCE(dattr_new);
7659         }
7660
7661         /* Build new domains */
7662         for (i = 0; i < ndoms_new; i++) {
7663                 for (j = 0; j < ndoms_cur && !new_topology; j++) {
7664                         if (cpumask_equal(doms_new[i], doms_cur[j])
7665                             && dattrs_equal(dattr_new, i, dattr_cur, j))
7666                                 goto match2;
7667                 }
7668                 /* no match - add a new doms_new */
7669                 build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL);
7670 match2:
7671                 ;
7672         }
7673
7674         /* Remember the new sched domains */
7675         if (doms_cur != &fallback_doms)
7676                 free_sched_domains(doms_cur, ndoms_cur);
7677         kfree(dattr_cur);       /* kfree(NULL) is safe */
7678         doms_cur = doms_new;
7679         dattr_cur = dattr_new;
7680         ndoms_cur = ndoms_new;
7681
7682         register_sched_domain_sysctl();
7683
7684         mutex_unlock(&sched_domains_mutex);
7685 }
7686
7687 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
7688 static void reinit_sched_domains(void)
7689 {
7690         get_online_cpus();
7691
7692         /* Destroy domains first to force the rebuild */
7693         partition_sched_domains(0, NULL, NULL);
7694
7695         rebuild_sched_domains();
7696         put_online_cpus();
7697 }
7698
7699 static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
7700 {
7701         unsigned int level = 0;
7702
7703         if (sscanf(buf, "%u", &level) != 1)
7704                 return -EINVAL;
7705
7706         /*
7707          * level is always be positive so don't check for
7708          * level < POWERSAVINGS_BALANCE_NONE which is 0
7709          * What happens on 0 or 1 byte write,
7710          * need to check for count as well?
7711          */
7712
7713         if (level >= MAX_POWERSAVINGS_BALANCE_LEVELS)
7714                 return -EINVAL;
7715
7716         if (smt)
7717                 sched_smt_power_savings = level;
7718         else
7719                 sched_mc_power_savings = level;
7720
7721         reinit_sched_domains();
7722
7723         return count;
7724 }
7725
7726 #ifdef CONFIG_SCHED_MC
7727 static ssize_t sched_mc_power_savings_show(struct sysdev_class *class,
7728                                            struct sysdev_class_attribute *attr,
7729                                            char *page)
7730 {
7731         return sprintf(page, "%u\n", sched_mc_power_savings);
7732 }
7733 static ssize_t sched_mc_power_savings_store(struct sysdev_class *class,
7734                                             struct sysdev_class_attribute *attr,
7735                                             const char *buf, size_t count)
7736 {
7737         return sched_power_savings_store(buf, count, 0);
7738 }
7739 static SYSDEV_CLASS_ATTR(sched_mc_power_savings, 0644,
7740                          sched_mc_power_savings_show,
7741                          sched_mc_power_savings_store);
7742 #endif
7743
7744 #ifdef CONFIG_SCHED_SMT
7745 static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev,
7746                                             struct sysdev_class_attribute *attr,
7747                                             char *page)
7748 {
7749         return sprintf(page, "%u\n", sched_smt_power_savings);
7750 }
7751 static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev,
7752                                              struct sysdev_class_attribute *attr,
7753                                              const char *buf, size_t count)
7754 {
7755         return sched_power_savings_store(buf, count, 1);
7756 }
7757 static SYSDEV_CLASS_ATTR(sched_smt_power_savings, 0644,
7758                    sched_smt_power_savings_show,
7759                    sched_smt_power_savings_store);
7760 #endif
7761
7762 int __init sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
7763 {
7764         int err = 0;
7765
7766 #ifdef CONFIG_SCHED_SMT
7767         if (smt_capable())
7768                 err = sysfs_create_file(&cls->kset.kobj,
7769                                         &attr_sched_smt_power_savings.attr);
7770 #endif
7771 #ifdef CONFIG_SCHED_MC
7772         if (!err && mc_capable())
7773                 err = sysfs_create_file(&cls->kset.kobj,
7774                                         &attr_sched_mc_power_savings.attr);
7775 #endif
7776         return err;
7777 }
7778 #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
7779
7780 /*
7781  * Update cpusets according to cpu_active mask.  If cpusets are
7782  * disabled, cpuset_update_active_cpus() becomes a simple wrapper
7783  * around partition_sched_domains().
7784  */
7785 static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
7786                              void *hcpu)
7787 {
7788         switch (action & ~CPU_TASKS_FROZEN) {
7789         case CPU_ONLINE:
7790         case CPU_DOWN_FAILED:
7791                 cpuset_update_active_cpus();
7792                 return NOTIFY_OK;
7793         default:
7794                 return NOTIFY_DONE;
7795         }
7796 }
7797
7798 static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
7799                                void *hcpu)
7800 {
7801         switch (action & ~CPU_TASKS_FROZEN) {
7802         case CPU_DOWN_PREPARE:
7803                 cpuset_update_active_cpus();
7804                 return NOTIFY_OK;
7805         default:
7806                 return NOTIFY_DONE;
7807         }
7808 }
7809
7810 static int update_runtime(struct notifier_block *nfb,
7811                                 unsigned long action, void *hcpu)
7812 {
7813         int cpu = (int)(long)hcpu;
7814
7815         switch (action) {
7816         case CPU_DOWN_PREPARE:
7817         case CPU_DOWN_PREPARE_FROZEN:
7818                 disable_runtime(cpu_rq(cpu));
7819                 return NOTIFY_OK;
7820
7821         case CPU_DOWN_FAILED:
7822         case CPU_DOWN_FAILED_FROZEN:
7823         case CPU_ONLINE:
7824         case CPU_ONLINE_FROZEN:
7825                 enable_runtime(cpu_rq(cpu));
7826                 return NOTIFY_OK;
7827
7828         default:
7829                 return NOTIFY_DONE;
7830         }
7831 }
7832
7833 void __init sched_init_smp(void)
7834 {
7835         cpumask_var_t non_isolated_cpus;
7836
7837         alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
7838         alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
7839
7840         get_online_cpus();
7841         mutex_lock(&sched_domains_mutex);
7842         init_sched_domains(cpu_active_mask);
7843         cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
7844         if (cpumask_empty(non_isolated_cpus))
7845                 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
7846         mutex_unlock(&sched_domains_mutex);
7847         put_online_cpus();
7848
7849         hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);
7850         hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE);
7851
7852         /* RT runtime code needs to handle some hotplug events */
7853         hotcpu_notifier(update_runtime, 0);
7854
7855         init_hrtick();
7856
7857         /* Move init over to a non-isolated CPU */
7858         if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0)
7859                 BUG();
7860         sched_init_granularity();
7861         free_cpumask_var(non_isolated_cpus);
7862
7863         init_sched_rt_class();
7864 }
7865 #else
7866 void __init sched_init_smp(void)
7867 {
7868         sched_init_granularity();
7869 }
7870 #endif /* CONFIG_SMP */
7871
7872 const_debug unsigned int sysctl_timer_migration = 1;
7873
7874 int in_sched_functions(unsigned long addr)
7875 {
7876         return in_lock_functions(addr) ||
7877                 (addr >= (unsigned long)__sched_text_start
7878                 && addr < (unsigned long)__sched_text_end);
7879 }
7880
7881 static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
7882 {
7883         cfs_rq->tasks_timeline = RB_ROOT;
7884         INIT_LIST_HEAD(&cfs_rq->tasks);
7885 #ifdef CONFIG_FAIR_GROUP_SCHED
7886         cfs_rq->rq = rq;
7887         /* allow initial update_cfs_load() to truncate */
7888 #ifdef CONFIG_SMP
7889         cfs_rq->load_stamp = 1;
7890 #endif
7891 #endif
7892         cfs_rq->min_vruntime = (u64)(-(1LL << 20));
7893 #ifndef CONFIG_64BIT
7894         cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
7895 #endif
7896 }
7897
7898 static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
7899 {
7900         struct rt_prio_array *array;
7901         int i;
7902
7903         array = &rt_rq->active;
7904         for (i = 0; i < MAX_RT_PRIO; i++) {
7905                 INIT_LIST_HEAD(array->queue + i);
7906                 __clear_bit(i, array->bitmap);
7907         }
7908         /* delimiter for bitsearch: */
7909         __set_bit(MAX_RT_PRIO, array->bitmap);
7910
7911 #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
7912         rt_rq->highest_prio.curr = MAX_RT_PRIO;
7913 #ifdef CONFIG_SMP
7914         rt_rq->highest_prio.next = MAX_RT_PRIO;
7915 #endif
7916 #endif
7917 #ifdef CONFIG_SMP
7918         rt_rq->rt_nr_migratory = 0;
7919         rt_rq->overloaded = 0;
7920         plist_head_init_raw(&rt_rq->pushable_tasks, &rq->lock);
7921 #endif
7922
7923         rt_rq->rt_time = 0;
7924         rt_rq->rt_throttled = 0;
7925         rt_rq->rt_runtime = 0;
7926         raw_spin_lock_init(&rt_rq->rt_runtime_lock);
7927
7928 #ifdef CONFIG_RT_GROUP_SCHED
7929         rt_rq->rt_nr_boosted = 0;
7930         rt_rq->rq = rq;
7931 #endif
7932 }
7933
7934 #ifdef CONFIG_FAIR_GROUP_SCHED
7935 static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
7936                                 struct sched_entity *se, int cpu,
7937                                 struct sched_entity *parent)
7938 {
7939         struct rq *rq = cpu_rq(cpu);
7940         tg->cfs_rq[cpu] = cfs_rq;
7941         init_cfs_rq(cfs_rq, rq);
7942         cfs_rq->tg = tg;
7943
7944         tg->se[cpu] = se;
7945         /* se could be NULL for root_task_group */
7946         if (!se)
7947                 return;
7948
7949         if (!parent)
7950                 se->cfs_rq = &rq->cfs;
7951         else
7952                 se->cfs_rq = parent->my_q;
7953
7954         se->my_q = cfs_rq;
7955         update_load_set(&se->load, 0);
7956         se->parent = parent;
7957 }
7958 #endif
7959
7960 #ifdef CONFIG_RT_GROUP_SCHED
7961 static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
7962                 struct sched_rt_entity *rt_se, int cpu,
7963                 struct sched_rt_entity *parent)
7964 {
7965         struct rq *rq = cpu_rq(cpu);
7966
7967         tg->rt_rq[cpu] = rt_rq;
7968         init_rt_rq(rt_rq, rq);
7969         rt_rq->tg = tg;
7970         rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
7971
7972         tg->rt_se[cpu] = rt_se;
7973         if (!rt_se)
7974                 return;
7975
7976         if (!parent)
7977                 rt_se->rt_rq = &rq->rt;
7978         else
7979                 rt_se->rt_rq = parent->my_q;
7980
7981         rt_se->my_q = rt_rq;
7982         rt_se->parent = parent;
7983         INIT_LIST_HEAD(&rt_se->run_list);
7984 }
7985 #endif
7986
7987 void __init sched_init(void)
7988 {
7989         int i, j;
7990         unsigned long alloc_size = 0, ptr;
7991
7992 #ifdef CONFIG_FAIR_GROUP_SCHED
7993         alloc_size += 2 * nr_cpu_ids * sizeof(void **);
7994 #endif
7995 #ifdef CONFIG_RT_GROUP_SCHED
7996         alloc_size += 2 * nr_cpu_ids * sizeof(void **);
7997 #endif
7998 #ifdef CONFIG_CPUMASK_OFFSTACK
7999         alloc_size += num_possible_cpus() * cpumask_size();
8000 #endif
8001         if (alloc_size) {
8002                 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
8003
8004 #ifdef CONFIG_FAIR_GROUP_SCHED
8005                 root_task_group.se = (struct sched_entity **)ptr;
8006                 ptr += nr_cpu_ids * sizeof(void **);
8007
8008                 root_task_group.cfs_rq = (struct cfs_rq **)ptr;
8009                 ptr += nr_cpu_ids * sizeof(void **);
8010
8011 #endif /* CONFIG_FAIR_GROUP_SCHED */
8012 #ifdef CONFIG_RT_GROUP_SCHED
8013                 root_task_group.rt_se = (struct sched_rt_entity **)ptr;
8014                 ptr += nr_cpu_ids * sizeof(void **);
8015
8016                 root_task_group.rt_rq = (struct rt_rq **)ptr;
8017                 ptr += nr_cpu_ids * sizeof(void **);
8018
8019 #endif /* CONFIG_RT_GROUP_SCHED */
8020 #ifdef CONFIG_CPUMASK_OFFSTACK
8021                 for_each_possible_cpu(i) {
8022                         per_cpu(load_balance_tmpmask, i) = (void *)ptr;
8023                         ptr += cpumask_size();
8024                 }
8025 #endif /* CONFIG_CPUMASK_OFFSTACK */
8026         }
8027
8028 #ifdef CONFIG_SMP
8029         init_defrootdomain();
8030 #endif
8031
8032         init_rt_bandwidth(&def_rt_bandwidth,
8033                         global_rt_period(), global_rt_runtime());
8034
8035 #ifdef CONFIG_RT_GROUP_SCHED
8036         init_rt_bandwidth(&root_task_group.rt_bandwidth,
8037                         global_rt_period(), global_rt_runtime());
8038 #endif /* CONFIG_RT_GROUP_SCHED */
8039
8040 #ifdef CONFIG_CGROUP_SCHED
8041         list_add(&root_task_group.list, &task_groups);
8042         INIT_LIST_HEAD(&root_task_group.children);
8043         autogroup_init(&init_task);
8044 #endif /* CONFIG_CGROUP_SCHED */
8045
8046         for_each_possible_cpu(i) {
8047                 struct rq *rq;
8048
8049                 rq = cpu_rq(i);
8050                 raw_spin_lock_init(&rq->lock);
8051                 rq->nr_running = 0;
8052                 rq->calc_load_active = 0;
8053                 rq->calc_load_update = jiffies + LOAD_FREQ;
8054                 init_cfs_rq(&rq->cfs, rq);
8055                 init_rt_rq(&rq->rt, rq);
8056 #ifdef CONFIG_FAIR_GROUP_SCHED
8057                 root_task_group.shares = root_task_group_load;
8058                 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
8059                 /*
8060                  * How much cpu bandwidth does root_task_group get?
8061                  *
8062                  * In case of task-groups formed thr' the cgroup filesystem, it
8063                  * gets 100% of the cpu resources in the system. This overall
8064                  * system cpu resource is divided among the tasks of
8065                  * root_task_group and its child task-groups in a fair manner,
8066                  * based on each entity's (task or task-group's) weight
8067                  * (se->load.weight).
8068                  *
8069                  * In other words, if root_task_group has 10 tasks of weight
8070                  * 1024) and two child groups A0 and A1 (of weight 1024 each),
8071                  * then A0's share of the cpu resource is:
8072                  *
8073                  *      A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
8074                  *
8075                  * We achieve this by letting root_task_group's tasks sit
8076                  * directly in rq->cfs (i.e root_task_group->se[] = NULL).
8077                  */
8078                 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
8079 #endif /* CONFIG_FAIR_GROUP_SCHED */
8080
8081                 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
8082 #ifdef CONFIG_RT_GROUP_SCHED
8083                 INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
8084                 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
8085 #endif
8086
8087                 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
8088                         rq->cpu_load[j] = 0;
8089
8090                 rq->last_load_update_tick = jiffies;
8091
8092 #ifdef CONFIG_SMP
8093                 rq->sd = NULL;
8094                 rq->rd = NULL;
8095                 rq->cpu_power = SCHED_POWER_SCALE;
8096                 rq->post_schedule = 0;
8097                 rq->active_balance = 0;
8098                 rq->next_balance = jiffies;
8099                 rq->push_cpu = 0;
8100                 rq->cpu = i;
8101                 rq->online = 0;
8102                 rq->idle_stamp = 0;
8103                 rq->avg_idle = 2*sysctl_sched_migration_cost;
8104                 rq_attach_root(rq, &def_root_domain);
8105 #ifdef CONFIG_NO_HZ
8106                 rq->nohz_balance_kick = 0;
8107                 init_sched_softirq_csd(&per_cpu(remote_sched_softirq_cb, i));
8108 #endif
8109 #endif
8110                 init_rq_hrtick(rq);
8111                 atomic_set(&rq->nr_iowait, 0);
8112         }
8113
8114         set_load_weight(&init_task);
8115
8116 #ifdef CONFIG_PREEMPT_NOTIFIERS
8117         INIT_HLIST_HEAD(&init_task.preempt_notifiers);
8118 #endif
8119
8120 #ifdef CONFIG_SMP
8121         open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
8122 #endif
8123
8124 #ifdef CONFIG_RT_MUTEXES
8125         plist_head_init_raw(&init_task.pi_waiters, &init_task.pi_lock);
8126 #endif
8127
8128         /*
8129          * The boot idle thread does lazy MMU switching as well:
8130          */
8131         atomic_inc(&init_mm.mm_count);
8132         enter_lazy_tlb(&init_mm, current);
8133
8134         /*
8135          * Make us the idle thread. Technically, schedule() should not be
8136          * called from this thread, however somewhere below it might be,
8137          * but because we are the idle thread, we just pick up running again
8138          * when this runqueue becomes "idle".
8139          */
8140         init_idle(current, smp_processor_id());
8141
8142         calc_load_update = jiffies + LOAD_FREQ;
8143
8144         /*
8145          * During early bootup we pretend to be a normal task:
8146          */
8147         current->sched_class = &fair_sched_class;
8148
8149         /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */
8150         zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT);
8151 #ifdef CONFIG_SMP
8152         zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT);
8153 #ifdef CONFIG_NO_HZ
8154         zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
8155         alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT);
8156         atomic_set(&nohz.load_balancer, nr_cpu_ids);
8157         atomic_set(&nohz.first_pick_cpu, nr_cpu_ids);
8158         atomic_set(&nohz.second_pick_cpu, nr_cpu_ids);
8159 #endif
8160         /* May be allocated at isolcpus cmdline parse time */
8161         if (cpu_isolated_map == NULL)
8162                 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
8163 #endif /* SMP */
8164
8165         scheduler_running = 1;
8166 }
8167
8168 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
8169 static inline int preempt_count_equals(int preempt_offset)
8170 {
8171         int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth();
8172
8173         return (nested == preempt_offset);
8174 }
8175
8176 void __might_sleep(const char *file, int line, int preempt_offset)
8177 {
8178 #ifdef in_atomic
8179         static unsigned long prev_jiffy;        /* ratelimiting */
8180
8181         if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) ||
8182             system_state != SYSTEM_RUNNING || oops_in_progress)
8183                 return;
8184         if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
8185                 return;
8186         prev_jiffy = jiffies;
8187
8188         printk(KERN_ERR
8189                 "BUG: sleeping function called from invalid context at %s:%d\n",
8190                         file, line);
8191         printk(KERN_ERR
8192                 "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
8193                         in_atomic(), irqs_disabled(),
8194                         current->pid, current->comm);
8195
8196         debug_show_held_locks(current);
8197         if (irqs_disabled())
8198                 print_irqtrace_events(current);
8199         dump_stack();
8200 #endif
8201 }
8202 EXPORT_SYMBOL(__might_sleep);
8203 #endif
8204
8205 #ifdef CONFIG_MAGIC_SYSRQ
8206 static void normalize_task(struct rq *rq, struct task_struct *p)
8207 {
8208         const struct sched_class *prev_class = p->sched_class;
8209         int old_prio = p->prio;
8210         int on_rq;
8211
8212         on_rq = p->on_rq;
8213         if (on_rq)
8214                 deactivate_task(rq, p, 0);
8215         __setscheduler(rq, p, SCHED_NORMAL, 0);
8216         if (on_rq) {
8217                 activate_task(rq, p, 0);
8218                 resched_task(rq->curr);
8219         }
8220
8221         check_class_changed(rq, p, prev_class, old_prio);
8222 }
8223
8224 void normalize_rt_tasks(void)
8225 {
8226         struct task_struct *g, *p;
8227         unsigned long flags;
8228         struct rq *rq;
8229
8230         read_lock_irqsave(&tasklist_lock, flags);
8231         do_each_thread(g, p) {
8232                 /*
8233                  * Only normalize user tasks:
8234                  */
8235                 if (!p->mm)
8236                         continue;
8237
8238                 p->se.exec_start                = 0;
8239 #ifdef CONFIG_SCHEDSTATS
8240                 p->se.statistics.wait_start     = 0;
8241                 p->se.statistics.sleep_start    = 0;
8242                 p->se.statistics.block_start    = 0;
8243 #endif
8244
8245                 if (!rt_task(p)) {
8246                         /*
8247                          * Renice negative nice level userspace
8248                          * tasks back to 0:
8249                          */
8250                         if (TASK_NICE(p) < 0 && p->mm)
8251                                 set_user_nice(p, 0);
8252                         continue;
8253                 }
8254
8255                 raw_spin_lock(&p->pi_lock);
8256                 rq = __task_rq_lock(p);
8257
8258                 normalize_task(rq, p);
8259
8260                 __task_rq_unlock(rq);
8261                 raw_spin_unlock(&p->pi_lock);
8262         } while_each_thread(g, p);
8263
8264         read_unlock_irqrestore(&tasklist_lock, flags);
8265 }
8266
8267 #endif /* CONFIG_MAGIC_SYSRQ */
8268
8269 #if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB)
8270 /*
8271  * These functions are only useful for the IA64 MCA handling, or kdb.
8272  *
8273  * They can only be called when the whole system has been
8274  * stopped - every CPU needs to be quiescent, and no scheduling
8275  * activity can take place. Using them for anything else would
8276  * be a serious bug, and as a result, they aren't even visible
8277  * under any other configuration.
8278  */
8279
8280 /**
8281  * curr_task - return the current task for a given cpu.
8282  * @cpu: the processor in question.
8283  *
8284  * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
8285  */
8286 struct task_struct *curr_task(int cpu)
8287 {
8288         return cpu_curr(cpu);
8289 }
8290
8291 #endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */
8292
8293 #ifdef CONFIG_IA64
8294 /**
8295  * set_curr_task - set the current task for a given cpu.
8296  * @cpu: the processor in question.
8297  * @p: the task pointer to set.
8298  *
8299  * Description: This function must only be used when non-maskable interrupts
8300  * are serviced on a separate stack. It allows the architecture to switch the
8301  * notion of the current task on a cpu in a non-blocking manner. This function
8302  * must be called with all CPU's synchronized, and interrupts disabled, the
8303  * and caller must save the original value of the current task (see
8304  * curr_task() above) and restore that value before reenabling interrupts and
8305  * re-starting the system.
8306  *
8307  * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
8308  */
8309 void set_curr_task(int cpu, struct task_struct *p)
8310 {
8311         cpu_curr(cpu) = p;
8312 }
8313
8314 #endif
8315
8316 #ifdef CONFIG_FAIR_GROUP_SCHED
8317 static void free_fair_sched_group(struct task_group *tg)
8318 {
8319         int i;
8320
8321         for_each_possible_cpu(i) {
8322                 if (tg->cfs_rq)
8323                         kfree(tg->cfs_rq[i]);
8324                 if (tg->se)
8325                         kfree(tg->se[i]);
8326         }
8327
8328         kfree(tg->cfs_rq);
8329         kfree(tg->se);
8330 }
8331
8332 static
8333 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8334 {
8335         struct cfs_rq *cfs_rq;
8336         struct sched_entity *se;
8337         int i;
8338
8339         tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
8340         if (!tg->cfs_rq)
8341                 goto err;
8342         tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);
8343         if (!tg->se)
8344                 goto err;
8345
8346         tg->shares = NICE_0_LOAD;
8347
8348         for_each_possible_cpu(i) {
8349                 cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
8350                                       GFP_KERNEL, cpu_to_node(i));
8351                 if (!cfs_rq)
8352                         goto err;
8353
8354                 se = kzalloc_node(sizeof(struct sched_entity),
8355                                   GFP_KERNEL, cpu_to_node(i));
8356                 if (!se)
8357                         goto err_free_rq;
8358
8359                 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
8360         }
8361
8362         return 1;
8363
8364 err_free_rq:
8365         kfree(cfs_rq);
8366 err:
8367         return 0;
8368 }
8369
8370 static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
8371 {
8372         struct rq *rq = cpu_rq(cpu);
8373         unsigned long flags;
8374
8375         /*
8376         * Only empty task groups can be destroyed; so we can speculatively
8377         * check on_list without danger of it being re-added.
8378         */
8379         if (!tg->cfs_rq[cpu]->on_list)
8380                 return;
8381
8382         raw_spin_lock_irqsave(&rq->lock, flags);
8383         list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
8384         raw_spin_unlock_irqrestore(&rq->lock, flags);
8385 }
8386 #else /* !CONFG_FAIR_GROUP_SCHED */
8387 static inline void free_fair_sched_group(struct task_group *tg)
8388 {
8389 }
8390
8391 static inline
8392 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8393 {
8394         return 1;
8395 }
8396
8397 static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
8398 {
8399 }
8400 #endif /* CONFIG_FAIR_GROUP_SCHED */
8401
8402 #ifdef CONFIG_RT_GROUP_SCHED
8403 static void free_rt_sched_group(struct task_group *tg)
8404 {
8405         int i;
8406
8407         destroy_rt_bandwidth(&tg->rt_bandwidth);
8408
8409         for_each_possible_cpu(i) {
8410                 if (tg->rt_rq)
8411                         kfree(tg->rt_rq[i]);
8412                 if (tg->rt_se)
8413                         kfree(tg->rt_se[i]);
8414         }
8415
8416         kfree(tg->rt_rq);
8417         kfree(tg->rt_se);
8418 }
8419
8420 static
8421 int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8422 {
8423         struct rt_rq *rt_rq;
8424         struct sched_rt_entity *rt_se;
8425         int i;
8426
8427         tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL);
8428         if (!tg->rt_rq)
8429                 goto err;
8430         tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL);
8431         if (!tg->rt_se)
8432                 goto err;
8433
8434         init_rt_bandwidth(&tg->rt_bandwidth,
8435                         ktime_to_ns(def_rt_bandwidth.rt_period), 0);
8436
8437         for_each_possible_cpu(i) {
8438                 rt_rq = kzalloc_node(sizeof(struct rt_rq),
8439                                      GFP_KERNEL, cpu_to_node(i));
8440                 if (!rt_rq)
8441                         goto err;
8442
8443                 rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
8444                                      GFP_KERNEL, cpu_to_node(i));
8445                 if (!rt_se)
8446                         goto err_free_rq;
8447
8448                 init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
8449         }
8450
8451         return 1;
8452
8453 err_free_rq:
8454         kfree(rt_rq);
8455 err:
8456         return 0;
8457 }
8458 #else /* !CONFIG_RT_GROUP_SCHED */
8459 static inline void free_rt_sched_group(struct task_group *tg)
8460 {
8461 }
8462
8463 static inline
8464 int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8465 {
8466         return 1;
8467 }
8468 #endif /* CONFIG_RT_GROUP_SCHED */
8469
8470 #ifdef CONFIG_CGROUP_SCHED
8471 static void free_sched_group(struct task_group *tg)
8472 {
8473         free_fair_sched_group(tg);
8474         free_rt_sched_group(tg);
8475         autogroup_free(tg);
8476         kfree(tg);
8477 }
8478
8479 /* allocate runqueue etc for a new task group */
8480 struct task_group *sched_create_group(struct task_group *parent)
8481 {
8482         struct task_group *tg;
8483         unsigned long flags;
8484
8485         tg = kzalloc(sizeof(*tg), GFP_KERNEL);
8486         if (!tg)
8487                 return ERR_PTR(-ENOMEM);
8488
8489         if (!alloc_fair_sched_group(tg, parent))
8490                 goto err;
8491
8492         if (!alloc_rt_sched_group(tg, parent))
8493                 goto err;
8494
8495         spin_lock_irqsave(&task_group_lock, flags);
8496         list_add_rcu(&tg->list, &task_groups);
8497
8498         WARN_ON(!parent); /* root should already exist */
8499
8500         tg->parent = parent;
8501         INIT_LIST_HEAD(&tg->children);
8502         list_add_rcu(&tg->siblings, &parent->children);
8503         spin_unlock_irqrestore(&task_group_lock, flags);
8504
8505         return tg;
8506
8507 err:
8508         free_sched_group(tg);
8509         return ERR_PTR(-ENOMEM);
8510 }
8511
8512 /* rcu callback to free various structures associated with a task group */
8513 static void free_sched_group_rcu(struct rcu_head *rhp)
8514 {
8515         /* now it should be safe to free those cfs_rqs */
8516         free_sched_group(container_of(rhp, struct task_group, rcu));
8517 }
8518
8519 /* Destroy runqueue etc associated with a task group */
8520 void sched_destroy_group(struct task_group *tg)
8521 {
8522         unsigned long flags;
8523         int i;
8524
8525         /* end participation in shares distribution */
8526         for_each_possible_cpu(i)
8527                 unregister_fair_sched_group(tg, i);
8528
8529         spin_lock_irqsave(&task_group_lock, flags);
8530         list_del_rcu(&tg->list);
8531         list_del_rcu(&tg->siblings);
8532         spin_unlock_irqrestore(&task_group_lock, flags);
8533
8534         /* wait for possible concurrent references to cfs_rqs complete */
8535         call_rcu(&tg->rcu, free_sched_group_rcu);
8536 }
8537
8538 /* change task's runqueue when it moves between groups.
8539  *      The caller of this function should have put the task in its new group
8540  *      by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to
8541  *      reflect its new group.
8542  */
8543 void sched_move_task(struct task_struct *tsk)
8544 {
8545         struct task_group *tg;
8546         int on_rq, running;
8547         unsigned long flags;
8548         struct rq *rq;
8549
8550         rq = task_rq_lock(tsk, &flags);
8551
8552         running = task_current(rq, tsk);
8553         on_rq = tsk->on_rq;
8554
8555         if (on_rq)
8556                 dequeue_task(rq, tsk, 0);
8557         if (unlikely(running))
8558                 tsk->sched_class->put_prev_task(rq, tsk);
8559
8560         tg = container_of(task_subsys_state_check(tsk, cpu_cgroup_subsys_id,
8561                                 lockdep_is_held(&tsk->sighand->siglock)),
8562                           struct task_group, css);
8563         tg = autogroup_task_group(tsk, tg);
8564         tsk->sched_task_group = tg;
8565
8566 #ifdef CONFIG_FAIR_GROUP_SCHED
8567         if (tsk->sched_class->task_move_group)
8568                 tsk->sched_class->task_move_group(tsk, on_rq);
8569         else
8570 #endif
8571                 set_task_rq(tsk, task_cpu(tsk));
8572
8573         if (unlikely(running))
8574                 tsk->sched_class->set_curr_task(rq);
8575         if (on_rq)
8576                 enqueue_task(rq, tsk, 0);
8577
8578         task_rq_unlock(rq, tsk, &flags);
8579 }
8580 #endif /* CONFIG_CGROUP_SCHED */
8581
8582 #ifdef CONFIG_FAIR_GROUP_SCHED
8583 static DEFINE_MUTEX(shares_mutex);
8584
8585 int sched_group_set_shares(struct task_group *tg, unsigned long shares)
8586 {
8587         int i;
8588         unsigned long flags;
8589
8590         /*
8591          * We can't change the weight of the root cgroup.
8592          */
8593         if (!tg->se[0])
8594                 return -EINVAL;
8595
8596         shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
8597
8598         mutex_lock(&shares_mutex);
8599         if (tg->shares == shares)
8600                 goto done;
8601
8602         tg->shares = shares;
8603         for_each_possible_cpu(i) {
8604                 struct rq *rq = cpu_rq(i);
8605                 struct sched_entity *se;
8606
8607                 se = tg->se[i];
8608                 /* Propagate contribution to hierarchy */
8609                 raw_spin_lock_irqsave(&rq->lock, flags);
8610                 for_each_sched_entity(se)
8611                         update_cfs_shares(group_cfs_rq(se));
8612                 raw_spin_unlock_irqrestore(&rq->lock, flags);
8613         }
8614
8615 done:
8616         mutex_unlock(&shares_mutex);
8617         return 0;
8618 }
8619
8620 unsigned long sched_group_shares(struct task_group *tg)
8621 {
8622         return tg->shares;
8623 }
8624 #endif
8625
8626 #ifdef CONFIG_RT_GROUP_SCHED
8627 /*
8628  * Ensure that the real time constraints are schedulable.
8629  */
8630 static DEFINE_MUTEX(rt_constraints_mutex);
8631
8632 static unsigned long to_ratio(u64 period, u64 runtime)
8633 {
8634         if (runtime == RUNTIME_INF)
8635                 return 1ULL << 20;
8636
8637         return div64_u64(runtime << 20, period);
8638 }
8639
8640 /* Must be called with tasklist_lock held */
8641 static inline int tg_has_rt_tasks(struct task_group *tg)
8642 {
8643         struct task_struct *g, *p;
8644
8645         do_each_thread(g, p) {
8646                 if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg)
8647                         return 1;
8648         } while_each_thread(g, p);
8649
8650         return 0;
8651 }
8652
8653 struct rt_schedulable_data {
8654         struct task_group *tg;
8655         u64 rt_period;
8656         u64 rt_runtime;
8657 };
8658
8659 static int tg_schedulable(struct task_group *tg, void *data)
8660 {
8661         struct rt_schedulable_data *d = data;
8662         struct task_group *child;
8663         unsigned long total, sum = 0;
8664         u64 period, runtime;
8665
8666         period = ktime_to_ns(tg->rt_bandwidth.rt_period);
8667         runtime = tg->rt_bandwidth.rt_runtime;
8668
8669         if (tg == d->tg) {
8670                 period = d->rt_period;
8671                 runtime = d->rt_runtime;
8672         }
8673
8674         /*
8675          * Cannot have more runtime than the period.
8676          */
8677         if (runtime > period && runtime != RUNTIME_INF)
8678                 return -EINVAL;
8679
8680         /*
8681          * Ensure we don't starve existing RT tasks.
8682          */
8683         if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
8684                 return -EBUSY;
8685
8686         total = to_ratio(period, runtime);
8687
8688         /*
8689          * Nobody can have more than the global setting allows.
8690          */
8691         if (total > to_ratio(global_rt_period(), global_rt_runtime()))
8692                 return -EINVAL;
8693
8694         /*
8695          * The sum of our children's runtime should not exceed our own.
8696          */
8697         list_for_each_entry_rcu(child, &tg->children, siblings) {
8698                 period = ktime_to_ns(child->rt_bandwidth.rt_period);
8699                 runtime = child->rt_bandwidth.rt_runtime;
8700
8701                 if (child == d->tg) {
8702                         period = d->rt_period;
8703                         runtime = d->rt_runtime;
8704                 }
8705
8706                 sum += to_ratio(period, runtime);
8707         }
8708
8709         if (sum > total)
8710                 return -EINVAL;
8711
8712         return 0;
8713 }
8714
8715 static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
8716 {
8717         struct rt_schedulable_data data = {
8718                 .tg = tg,
8719                 .rt_period = period,
8720                 .rt_runtime = runtime,
8721         };
8722
8723         return walk_tg_tree(tg_schedulable, tg_nop, &data);
8724 }
8725
8726 static int tg_set_bandwidth(struct task_group *tg,
8727                 u64 rt_period, u64 rt_runtime)
8728 {
8729         int i, err = 0;
8730
8731         mutex_lock(&rt_constraints_mutex);
8732         read_lock(&tasklist_lock);
8733         err = __rt_schedulable(tg, rt_period, rt_runtime);
8734         if (err)
8735                 goto unlock;
8736
8737         raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
8738         tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
8739         tg->rt_bandwidth.rt_runtime = rt_runtime;
8740
8741         for_each_possible_cpu(i) {
8742                 struct rt_rq *rt_rq = tg->rt_rq[i];
8743
8744                 raw_spin_lock(&rt_rq->rt_runtime_lock);
8745                 rt_rq->rt_runtime = rt_runtime;
8746                 raw_spin_unlock(&rt_rq->rt_runtime_lock);
8747         }
8748         raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
8749 unlock:
8750         read_unlock(&tasklist_lock);
8751         mutex_unlock(&rt_constraints_mutex);
8752
8753         return err;
8754 }
8755
8756 int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
8757 {
8758         u64 rt_runtime, rt_period;
8759
8760         rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
8761         rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
8762         if (rt_runtime_us < 0)
8763                 rt_runtime = RUNTIME_INF;
8764
8765         return tg_set_bandwidth(tg, rt_period, rt_runtime);
8766 }
8767
8768 long sched_group_rt_runtime(struct task_group *tg)
8769 {
8770         u64 rt_runtime_us;
8771
8772         if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF)
8773                 return -1;
8774
8775         rt_runtime_us = tg->rt_bandwidth.rt_runtime;
8776         do_div(rt_runtime_us, NSEC_PER_USEC);
8777         return rt_runtime_us;
8778 }
8779
8780 int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
8781 {
8782         u64 rt_runtime, rt_period;
8783
8784         rt_period = (u64)rt_period_us * NSEC_PER_USEC;
8785         rt_runtime = tg->rt_bandwidth.rt_runtime;
8786
8787         if (rt_period == 0)
8788                 return -EINVAL;
8789
8790         return tg_set_bandwidth(tg, rt_period, rt_runtime);
8791 }
8792
8793 long sched_group_rt_period(struct task_group *tg)
8794 {
8795         u64 rt_period_us;
8796
8797         rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period);
8798         do_div(rt_period_us, NSEC_PER_USEC);
8799         return rt_period_us;
8800 }
8801
8802 static int sched_rt_global_constraints(void)
8803 {
8804         u64 runtime, period;
8805         int ret = 0;
8806
8807         if (sysctl_sched_rt_period <= 0)
8808                 return -EINVAL;
8809
8810         runtime = global_rt_runtime();
8811         period = global_rt_period();
8812
8813         /*
8814          * Sanity check on the sysctl variables.
8815          */
8816         if (runtime > period && runtime != RUNTIME_INF)
8817                 return -EINVAL;
8818
8819         mutex_lock(&rt_constraints_mutex);
8820         read_lock(&tasklist_lock);
8821         ret = __rt_schedulable(NULL, 0, 0);
8822         read_unlock(&tasklist_lock);
8823         mutex_unlock(&rt_constraints_mutex);
8824
8825         return ret;
8826 }
8827
8828 int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
8829 {
8830         /* Don't accept realtime tasks when there is no way for them to run */
8831         if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)
8832                 return 0;
8833
8834         return 1;
8835 }
8836
8837 #else /* !CONFIG_RT_GROUP_SCHED */
8838 static int sched_rt_global_constraints(void)
8839 {
8840         unsigned long flags;
8841         int i;
8842
8843         if (sysctl_sched_rt_period <= 0)
8844                 return -EINVAL;
8845
8846         /*
8847          * There's always some RT tasks in the root group
8848          * -- migration, kstopmachine etc..
8849          */
8850         if (sysctl_sched_rt_runtime == 0)
8851                 return -EBUSY;
8852
8853         raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
8854         for_each_possible_cpu(i) {
8855                 struct rt_rq *rt_rq = &cpu_rq(i)->rt;
8856
8857                 raw_spin_lock(&rt_rq->rt_runtime_lock);
8858                 rt_rq->rt_runtime = global_rt_runtime();
8859                 raw_spin_unlock(&rt_rq->rt_runtime_lock);
8860         }
8861         raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
8862
8863         return 0;
8864 }
8865 #endif /* CONFIG_RT_GROUP_SCHED */
8866
8867 int sched_rt_handler(struct ctl_table *table, int write,
8868                 void __user *buffer, size_t *lenp,
8869                 loff_t *ppos)
8870 {
8871         int ret;
8872         int old_period, old_runtime;
8873         static DEFINE_MUTEX(mutex);
8874
8875         mutex_lock(&mutex);
8876         old_period = sysctl_sched_rt_period;
8877         old_runtime = sysctl_sched_rt_runtime;
8878
8879         ret = proc_dointvec(table, write, buffer, lenp, ppos);
8880
8881         if (!ret && write) {
8882                 ret = sched_rt_global_constraints();
8883                 if (ret) {
8884                         sysctl_sched_rt_period = old_period;
8885                         sysctl_sched_rt_runtime = old_runtime;
8886                 } else {
8887                         def_rt_bandwidth.rt_runtime = global_rt_runtime();
8888                         def_rt_bandwidth.rt_period =
8889                                 ns_to_ktime(global_rt_period());
8890                 }
8891         }
8892         mutex_unlock(&mutex);
8893
8894         return ret;
8895 }
8896
8897 #ifdef CONFIG_CGROUP_SCHED
8898
8899 /* return corresponding task_group object of a cgroup */
8900 static inline struct task_group *cgroup_tg(struct cgroup *cgrp)
8901 {
8902         return container_of(cgroup_subsys_state(cgrp, cpu_cgroup_subsys_id),
8903                             struct task_group, css);
8904 }
8905
8906 static struct cgroup_subsys_state *
8907 cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
8908 {
8909         struct task_group *tg, *parent;
8910
8911         if (!cgrp->parent) {
8912                 /* This is early initialization for the top cgroup */
8913                 return &root_task_group.css;
8914         }
8915
8916         parent = cgroup_tg(cgrp->parent);
8917         tg = sched_create_group(parent);
8918         if (IS_ERR(tg))
8919                 return ERR_PTR(-ENOMEM);
8920
8921         return &tg->css;
8922 }
8923
8924 static void
8925 cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
8926 {
8927         struct task_group *tg = cgroup_tg(cgrp);
8928
8929         sched_destroy_group(tg);
8930 }
8931
8932 static int
8933 cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
8934 {
8935 #ifdef CONFIG_RT_GROUP_SCHED
8936         if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk))
8937                 return -EINVAL;
8938 #else
8939         /* We don't support RT-tasks being in separate groups */
8940         if (tsk->sched_class != &fair_sched_class)
8941                 return -EINVAL;
8942 #endif
8943         return 0;
8944 }
8945
8946 static void
8947 cpu_cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
8948 {
8949         sched_move_task(tsk);
8950 }
8951
8952 static void
8953 cpu_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
8954                 struct cgroup *old_cgrp, struct task_struct *task)
8955 {
8956         /*
8957          * cgroup_exit() is called in the copy_process() failure path.
8958          * Ignore this case since the task hasn't ran yet, this avoids
8959          * trying to poke a half freed task state from generic code.
8960          */
8961         if (!(task->flags & PF_EXITING))
8962                 return;
8963
8964         sched_move_task(task);
8965 }
8966
8967 #ifdef CONFIG_FAIR_GROUP_SCHED
8968 static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype,
8969                                 u64 shareval)
8970 {
8971         return sched_group_set_shares(cgroup_tg(cgrp), scale_load(shareval));
8972 }
8973
8974 static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
8975 {
8976         struct task_group *tg = cgroup_tg(cgrp);
8977
8978         return (u64) scale_load_down(tg->shares);
8979 }
8980 #endif /* CONFIG_FAIR_GROUP_SCHED */
8981
8982 #ifdef CONFIG_RT_GROUP_SCHED
8983 static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft,
8984                                 s64 val)
8985 {
8986         return sched_group_set_rt_runtime(cgroup_tg(cgrp), val);
8987 }
8988
8989 static s64 cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft)
8990 {
8991         return sched_group_rt_runtime(cgroup_tg(cgrp));
8992 }
8993
8994 static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype,
8995                 u64 rt_period_us)
8996 {
8997         return sched_group_set_rt_period(cgroup_tg(cgrp), rt_period_us);
8998 }
8999
9000 static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft)
9001 {
9002         return sched_group_rt_period(cgroup_tg(cgrp));
9003 }
9004 #endif /* CONFIG_RT_GROUP_SCHED */
9005
9006 static struct cftype cpu_files[] = {
9007 #ifdef CONFIG_FAIR_GROUP_SCHED
9008         {
9009                 .name = "shares",
9010                 .read_u64 = cpu_shares_read_u64,
9011                 .write_u64 = cpu_shares_write_u64,
9012         },
9013 #endif
9014 #ifdef CONFIG_RT_GROUP_SCHED
9015         {
9016                 .name = "rt_runtime_us",
9017                 .read_s64 = cpu_rt_runtime_read,
9018                 .write_s64 = cpu_rt_runtime_write,
9019         },
9020         {
9021                 .name = "rt_period_us",
9022                 .read_u64 = cpu_rt_period_read_uint,
9023                 .write_u64 = cpu_rt_period_write_uint,
9024         },
9025 #endif
9026 };
9027
9028 static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont)
9029 {
9030         return cgroup_add_files(cont, ss, cpu_files, ARRAY_SIZE(cpu_files));
9031 }
9032
9033 struct cgroup_subsys cpu_cgroup_subsys = {
9034         .name           = "cpu",
9035         .create         = cpu_cgroup_create,
9036         .destroy        = cpu_cgroup_destroy,
9037         .can_attach_task = cpu_cgroup_can_attach_task,
9038         .attach_task    = cpu_cgroup_attach_task,
9039         .exit           = cpu_cgroup_exit,
9040         .populate       = cpu_cgroup_populate,
9041         .subsys_id      = cpu_cgroup_subsys_id,
9042         .early_init     = 1,
9043 };
9044
9045 #endif  /* CONFIG_CGROUP_SCHED */
9046
9047 #ifdef CONFIG_CGROUP_CPUACCT
9048
9049 /*
9050  * CPU accounting code for task groups.
9051  *
9052  * Based on the work by Paul Menage (menage@google.com) and Balbir Singh
9053  * (balbir@in.ibm.com).
9054  */
9055
9056 /* track cpu usage of a group of tasks and its child groups */
9057 struct cpuacct {
9058         struct cgroup_subsys_state css;
9059         /* cpuusage holds pointer to a u64-type object on every cpu */
9060         u64 __percpu *cpuusage;
9061         struct percpu_counter cpustat[CPUACCT_STAT_NSTATS];
9062         struct cpuacct *parent;
9063 };
9064
9065 struct cgroup_subsys cpuacct_subsys;
9066
9067 /* return cpu accounting group corresponding to this container */
9068 static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp)
9069 {
9070         return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id),
9071                             struct cpuacct, css);
9072 }
9073
9074 /* return cpu accounting group to which this task belongs */
9075 static inline struct cpuacct *task_ca(struct task_struct *tsk)
9076 {
9077         return container_of(task_subsys_state(tsk, cpuacct_subsys_id),
9078                             struct cpuacct, css);
9079 }
9080
9081 /* create a new cpu accounting group */
9082 static struct cgroup_subsys_state *cpuacct_create(
9083         struct cgroup_subsys *ss, struct cgroup *cgrp)
9084 {
9085         struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
9086         int i;
9087
9088         if (!ca)
9089                 goto out;
9090
9091         ca->cpuusage = alloc_percpu(u64);
9092         if (!ca->cpuusage)
9093                 goto out_free_ca;
9094
9095         for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
9096                 if (percpu_counter_init(&ca->cpustat[i], 0))
9097                         goto out_free_counters;
9098
9099         if (cgrp->parent)
9100                 ca->parent = cgroup_ca(cgrp->parent);
9101
9102         return &ca->css;
9103
9104 out_free_counters:
9105         while (--i >= 0)
9106                 percpu_counter_destroy(&ca->cpustat[i]);
9107         free_percpu(ca->cpuusage);
9108 out_free_ca:
9109         kfree(ca);
9110 out:
9111         return ERR_PTR(-ENOMEM);
9112 }
9113
9114 /* destroy an existing cpu accounting group */
9115 static void
9116 cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
9117 {
9118         struct cpuacct *ca = cgroup_ca(cgrp);
9119         int i;
9120
9121         for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
9122                 percpu_counter_destroy(&ca->cpustat[i]);
9123         free_percpu(ca->cpuusage);
9124         kfree(ca);
9125 }
9126
9127 static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
9128 {
9129         u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
9130         u64 data;
9131
9132 #ifndef CONFIG_64BIT
9133         /*
9134          * Take rq->lock to make 64-bit read safe on 32-bit platforms.
9135          */
9136         raw_spin_lock_irq(&cpu_rq(cpu)->lock);
9137         data = *cpuusage;
9138         raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
9139 #else
9140         data = *cpuusage;
9141 #endif
9142
9143         return data;
9144 }
9145
9146 static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
9147 {
9148         u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
9149
9150 #ifndef CONFIG_64BIT
9151         /*
9152          * Take rq->lock to make 64-bit write safe on 32-bit platforms.
9153          */
9154         raw_spin_lock_irq(&cpu_rq(cpu)->lock);
9155         *cpuusage = val;
9156         raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
9157 #else
9158         *cpuusage = val;
9159 #endif
9160 }
9161
9162 /* return total cpu usage (in nanoseconds) of a group */
9163 static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
9164 {
9165         struct cpuacct *ca = cgroup_ca(cgrp);
9166         u64 totalcpuusage = 0;
9167         int i;
9168
9169         for_each_present_cpu(i)
9170                 totalcpuusage += cpuacct_cpuusage_read(ca, i);
9171
9172         return totalcpuusage;
9173 }
9174
9175 static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype,
9176                                                                 u64 reset)
9177 {
9178         struct cpuacct *ca = cgroup_ca(cgrp);
9179         int err = 0;
9180         int i;
9181
9182         if (reset) {
9183                 err = -EINVAL;
9184                 goto out;
9185         }
9186
9187         for_each_present_cpu(i)
9188                 cpuacct_cpuusage_write(ca, i, 0);
9189
9190 out:
9191         return err;
9192 }
9193
9194 static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft,
9195                                    struct seq_file *m)
9196 {
9197         struct cpuacct *ca = cgroup_ca(cgroup);
9198         u64 percpu;
9199         int i;
9200
9201         for_each_present_cpu(i) {
9202                 percpu = cpuacct_cpuusage_read(ca, i);
9203                 seq_printf(m, "%llu ", (unsigned long long) percpu);
9204         }
9205         seq_printf(m, "\n");
9206         return 0;
9207 }
9208
9209 static const char *cpuacct_stat_desc[] = {
9210         [CPUACCT_STAT_USER] = "user",
9211         [CPUACCT_STAT_SYSTEM] = "system",
9212 };
9213
9214 static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft,
9215                 struct cgroup_map_cb *cb)
9216 {
9217         struct cpuacct *ca = cgroup_ca(cgrp);
9218         int i;
9219
9220         for (i = 0; i < CPUACCT_STAT_NSTATS; i++) {
9221                 s64 val = percpu_counter_read(&ca->cpustat[i]);
9222                 val = cputime64_to_clock_t(val);
9223                 cb->fill(cb, cpuacct_stat_desc[i], val);
9224         }
9225         return 0;
9226 }
9227
9228 static struct cftype files[] = {
9229         {
9230                 .name = "usage",
9231                 .read_u64 = cpuusage_read,
9232                 .write_u64 = cpuusage_write,
9233         },
9234         {
9235                 .name = "usage_percpu",
9236                 .read_seq_string = cpuacct_percpu_seq_read,
9237         },
9238         {
9239                 .name = "stat",
9240                 .read_map = cpuacct_stats_show,
9241         },
9242 };
9243
9244 static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
9245 {
9246         return cgroup_add_files(cgrp, ss, files, ARRAY_SIZE(files));
9247 }
9248
9249 /*
9250  * charge this task's execution time to its accounting group.
9251  *
9252  * called with rq->lock held.
9253  */
9254 static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
9255 {
9256         struct cpuacct *ca;
9257         int cpu;
9258
9259         if (unlikely(!cpuacct_subsys.active))
9260                 return;
9261
9262         cpu = task_cpu(tsk);
9263
9264         rcu_read_lock();
9265
9266         ca = task_ca(tsk);
9267
9268         for (; ca; ca = ca->parent) {
9269                 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
9270                 *cpuusage += cputime;
9271         }
9272
9273         rcu_read_unlock();
9274 }
9275
9276 /*
9277  * When CONFIG_VIRT_CPU_ACCOUNTING is enabled one jiffy can be very large
9278  * in cputime_t units. As a result, cpuacct_update_stats calls
9279  * percpu_counter_add with values large enough to always overflow the
9280  * per cpu batch limit causing bad SMP scalability.
9281  *
9282  * To fix this we scale percpu_counter_batch by cputime_one_jiffy so we
9283  * batch the same amount of time with CONFIG_VIRT_CPU_ACCOUNTING disabled
9284  * and enabled. We cap it at INT_MAX which is the largest allowed batch value.
9285  */
9286 #ifdef CONFIG_SMP
9287 #define CPUACCT_BATCH   \
9288         min_t(long, percpu_counter_batch * cputime_one_jiffy, INT_MAX)
9289 #else
9290 #define CPUACCT_BATCH   0
9291 #endif
9292
9293 /*
9294  * Charge the system/user time to the task's accounting group.
9295  */
9296 static void cpuacct_update_stats(struct task_struct *tsk,
9297                 enum cpuacct_stat_index idx, cputime_t val)
9298 {
9299         struct cpuacct *ca;
9300         int batch = CPUACCT_BATCH;
9301
9302         if (unlikely(!cpuacct_subsys.active))
9303                 return;
9304
9305         rcu_read_lock();
9306         ca = task_ca(tsk);
9307
9308         do {
9309                 __percpu_counter_add(&ca->cpustat[idx], val, batch);
9310                 ca = ca->parent;
9311         } while (ca);
9312         rcu_read_unlock();
9313 }
9314
9315 struct cgroup_subsys cpuacct_subsys = {
9316         .name = "cpuacct",
9317         .create = cpuacct_create,
9318         .destroy = cpuacct_destroy,
9319         .populate = cpuacct_populate,
9320         .subsys_id = cpuacct_subsys_id,
9321 };
9322 #endif  /* CONFIG_CGROUP_CPUACCT */
9323