2 * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
4 * Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
6 * Interactivity improvements by Mike Galbraith
7 * (C) 2007 Mike Galbraith <efault@gmx.de>
9 * Various enhancements by Dmitry Adamushko.
10 * (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
12 * Group scheduling enhancements by Srivatsa Vaddagiri
13 * Copyright IBM Corporation, 2007
14 * Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
16 * Scaled math optimizations by Thomas Gleixner
17 * Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
19 * Adaptive scheduling granularity, math enhancements by Peter Zijlstra
20 * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
23 #include <linux/latencytop.h>
24 #include <linux/sched.h>
25 #include <linux/cpumask.h>
26 #include <linux/slab.h>
27 #include <linux/profile.h>
28 #include <linux/interrupt.h>
29 #include <linux/mempolicy.h>
30 #include <linux/migrate.h>
31 #include <linux/task_work.h>
33 #include <trace/events/sched.h>
38 * Targeted preemption latency for CPU-bound tasks:
39 * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
41 * NOTE: this latency value is not the same as the concept of
42 * 'timeslice length' - timeslices in CFS are of variable length
43 * and have no persistent notion like in traditional, time-slice
44 * based scheduling concepts.
46 * (to see the precise effective timeslice length of your workload,
47 * run vmstat and monitor the context-switches (cs) field)
49 unsigned int sysctl_sched_latency = 6000000ULL;
50 unsigned int normalized_sysctl_sched_latency = 6000000ULL;
53 * The initial- and re-scaling of tunables is configurable
54 * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)))
57 * SCHED_TUNABLESCALING_NONE - unscaled, always *1
58 * SCHED_TUNABLESCALING_LOG - scaled logarithmically, *1+ilog(ncpus)
59 * SCHED_TUNABLESCALING_LINEAR - scaled linearly, *ncpus
61 enum sched_tunable_scaling sysctl_sched_tunable_scaling
62 = SCHED_TUNABLESCALING_LOG;
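/*
 * Illustrative example (not part of the scheduler itself): assuming 8
 * online CPUs, the factor applied to the normalized tunables under each
 * policy would be:
 *
 *   SCHED_TUNABLESCALING_NONE:   factor = 1             -> latency  6ms
 *   SCHED_TUNABLESCALING_LOG:    factor = 1 + ilog2(8)  -> latency 24ms
 *   SCHED_TUNABLESCALING_LINEAR: factor = 8             -> latency 48ms
 *
 * using the 6ms normalized default above; see get_update_sysctl_factor()
 * below for how the CPU count is capped before scaling.
 */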
65 * Minimal preemption granularity for CPU-bound tasks:
66 * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
68 unsigned int sysctl_sched_min_granularity = 750000ULL;
69 unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
72 * sched_nr_latency is kept at sysctl_sched_latency / sysctl_sched_min_granularity
74 static unsigned int sched_nr_latency = 8;
77 * After fork, child runs first. If set to 0 (default) then
78 * parent will (try to) run first.
80 unsigned int sysctl_sched_child_runs_first __read_mostly;
83 * SCHED_OTHER wake-up granularity.
84 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
86 * This option delays the preemption effects of decoupled workloads
87 * and reduces their over-scheduling. Synchronous workloads will still
88 * have immediate wakeup/sleep latencies.
90 unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
91 unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
93 const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
96 * The exponential sliding window over which load is averaged for shares distribution and load balancing (default: 10 msec)
100 unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
102 #ifdef CONFIG_CFS_BANDWIDTH
104 * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool
105 * each time a cfs_rq requests quota.
107 * Note: in the case that the slice exceeds the runtime remaining (either due
108 * to consumption or the quota being specified to be smaller than the slice)
109 * we will always only issue the remaining available time.
111 * default: 5 msec, units: microseconds
113 unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
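/*
 * Worked example (illustrative only): with the default 5ms slice, a group
 * given a hypothetical 20ms of quota per period makes at most four trips
 * to the global pool per period. If only 3ms of quota remains when a
 * cfs_rq asks for a slice, just that remaining 3ms is handed out, per the
 * note above.
 */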
117 * Increase the granularity value when there are more CPUs,
118 * because with more CPUs the 'effective latency' as visible
119 * to users decreases. But the relationship is not linear,
120 * so pick a second-best guess by going with the log2 of the number of CPUs.
123 * This idea comes from the SD scheduler of Con Kolivas:
125 static int get_update_sysctl_factor(void)
127 unsigned int cpus = min_t(int, num_online_cpus(), 8);
130 switch (sysctl_sched_tunable_scaling) {
131 case SCHED_TUNABLESCALING_NONE:
134 case SCHED_TUNABLESCALING_LINEAR:
137 case SCHED_TUNABLESCALING_LOG:
139 factor = 1 + ilog2(cpus);
146 static void update_sysctl(void)
148 unsigned int factor = get_update_sysctl_factor();
150 #define SET_SYSCTL(name) \
151 (sysctl_##name = (factor) * normalized_sysctl_##name)
152 SET_SYSCTL(sched_min_granularity);
153 SET_SYSCTL(sched_latency);
154 SET_SYSCTL(sched_wakeup_granularity);
158 void sched_init_granularity(void)
163 #if BITS_PER_LONG == 32
164 # define WMULT_CONST (~0UL)
166 # define WMULT_CONST (1UL << 32)
169 #define WMULT_SHIFT 32
172 * Shift right and round:
174 #define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
177 * delta *= weight / lw
180 calc_delta_mine(unsigned long delta_exec, unsigned long weight,
181 struct load_weight *lw)
186 * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched
187 * entities since MIN_SHARES = 2. Treat weight as 1 if less than
188 * 2^SCHED_LOAD_RESOLUTION.
190 if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION)))
191 tmp = (u64)delta_exec * scale_load_down(weight);
193 tmp = (u64)delta_exec;
195 if (!lw->inv_weight) {
196 unsigned long w = scale_load_down(lw->weight);
198 if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
200 else if (unlikely(!w))
201 lw->inv_weight = WMULT_CONST;
203 lw->inv_weight = WMULT_CONST / w;
207 * Check whether we'd overflow the 64-bit multiplication:
209 if (unlikely(tmp > WMULT_CONST))
210 tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
213 tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
215 return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
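/*
 * Numerical sketch of the fixed-point math above (illustrative; assumes
 * 64-bit, WMULT_CONST = 1<<32, a NICE_0 weight of 1024 and no extra load
 * resolution so that scale_load_down() is an identity): for
 * delta_exec = 1000000ns and a queue weight lw->weight = 2048:
 *
 *   lw->inv_weight = (1<<32) / 2048         = 2097152
 *   tmp            = 1000000 * 1024         = 1024000000
 *   result         = SRR(tmp * 2097152, 32) = 500000
 *
 * i.e. the entity is charged half the wall-clock delta, matching
 * delta_exec * weight / lw->weight.
 */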
219 const struct sched_class fair_sched_class;
221 /**************************************************************
222 * CFS operations on generic schedulable entities:
225 #ifdef CONFIG_FAIR_GROUP_SCHED
227 /* cpu runqueue to which this cfs_rq is attached */
228 static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
233 /* An entity is a task if it doesn't "own" a runqueue */
234 #define entity_is_task(se) (!se->my_q)
236 static inline struct task_struct *task_of(struct sched_entity *se)
238 #ifdef CONFIG_SCHED_DEBUG
239 WARN_ON_ONCE(!entity_is_task(se));
241 return container_of(se, struct task_struct, se);
244 /* Walk up scheduling entities hierarchy */
245 #define for_each_sched_entity(se) \
246 for (; se; se = se->parent)
248 static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
253 /* runqueue on which this entity is (to be) queued */
254 static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
259 /* runqueue "owned" by this group */
260 static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
265 static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
268 static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
270 if (!cfs_rq->on_list) {
272 * Ensure we either appear before our parent (if already
273 * enqueued) or force our parent to appear after us when it is
274 * enqueued. The fact that we always enqueue bottom-up
275 * reduces this to two cases.
277 if (cfs_rq->tg->parent &&
278 cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) {
279 list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
280 &rq_of(cfs_rq)->leaf_cfs_rq_list);
282 list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
283 &rq_of(cfs_rq)->leaf_cfs_rq_list);
287 /* We should have no load, but we need to update last_decay. */
288 update_cfs_rq_blocked_load(cfs_rq, 0);
292 static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
294 if (cfs_rq->on_list) {
295 list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
300 /* Iterate through all leaf cfs_rq's on a runqueue */
301 #define for_each_leaf_cfs_rq(rq, cfs_rq) \
302 list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
304 /* Do the two (enqueued) entities belong to the same group ? */
306 is_same_group(struct sched_entity *se, struct sched_entity *pse)
308 if (se->cfs_rq == pse->cfs_rq)
314 static inline struct sched_entity *parent_entity(struct sched_entity *se)
319 /* return depth at which a sched entity is present in the hierarchy */
320 static inline int depth_se(struct sched_entity *se)
324 for_each_sched_entity(se)
331 find_matching_se(struct sched_entity **se, struct sched_entity **pse)
333 int se_depth, pse_depth;
336 * preemption test can be made between sibling entities who are in the
337 * same cfs_rq, i.e. who have a common parent. Walk up the hierarchy of
338 * both tasks until we find their ancestors who are siblings of a common parent.
342 /* First walk up until both entities are at same depth */
343 se_depth = depth_se(*se);
344 pse_depth = depth_se(*pse);
346 while (se_depth > pse_depth) {
348 *se = parent_entity(*se);
351 while (pse_depth > se_depth) {
353 *pse = parent_entity(*pse);
356 while (!is_same_group(*se, *pse)) {
357 *se = parent_entity(*se);
358 *pse = parent_entity(*pse);
362 #else /* !CONFIG_FAIR_GROUP_SCHED */
364 static inline struct task_struct *task_of(struct sched_entity *se)
366 return container_of(se, struct task_struct, se);
369 static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
371 return container_of(cfs_rq, struct rq, cfs);
374 #define entity_is_task(se) 1
376 #define for_each_sched_entity(se) \
377 for (; se; se = NULL)
379 static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
381 return &task_rq(p)->cfs;
384 static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
386 struct task_struct *p = task_of(se);
387 struct rq *rq = task_rq(p);
392 /* runqueue "owned" by this group */
393 static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
398 static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
402 static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
406 #define for_each_leaf_cfs_rq(rq, cfs_rq) \
407 for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
410 is_same_group(struct sched_entity *se, struct sched_entity *pse)
415 static inline struct sched_entity *parent_entity(struct sched_entity *se)
421 find_matching_se(struct sched_entity **se, struct sched_entity **pse)
425 #endif /* CONFIG_FAIR_GROUP_SCHED */
427 static __always_inline
428 void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec);
430 /**************************************************************
431 * Scheduling class tree data structure manipulation methods:
434 static inline u64 max_vruntime(u64 min_vruntime, u64 vruntime)
436 s64 delta = (s64)(vruntime - min_vruntime);
438 min_vruntime = vruntime;
443 static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
445 s64 delta = (s64)(vruntime - min_vruntime);
447 min_vruntime = vruntime;
452 static inline int entity_before(struct sched_entity *a,
453 struct sched_entity *b)
455 return (s64)(a->vruntime - b->vruntime) < 0;
458 static void update_min_vruntime(struct cfs_rq *cfs_rq)
460 u64 vruntime = cfs_rq->min_vruntime;
463 vruntime = cfs_rq->curr->vruntime;
465 if (cfs_rq->rb_leftmost) {
466 struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,
471 vruntime = se->vruntime;
473 vruntime = min_vruntime(vruntime, se->vruntime);
476 cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
479 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
484 * Enqueue an entity into the rb-tree:
486 static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
488 struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
489 struct rb_node *parent = NULL;
490 struct sched_entity *entry;
494 * Find the right place in the rbtree:
498 entry = rb_entry(parent, struct sched_entity, run_node);
500 * We don't care about collisions. Nodes with
501 * the same key stay together.
503 if (entity_before(se, entry)) {
504 link = &parent->rb_left;
506 link = &parent->rb_right;
512 * Maintain a cache of leftmost tree entries (it is frequently used):
516 cfs_rq->rb_leftmost = &se->run_node;
518 rb_link_node(&se->run_node, parent, link);
519 rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
522 static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
524 if (cfs_rq->rb_leftmost == &se->run_node) {
525 struct rb_node *next_node;
527 next_node = rb_next(&se->run_node);
528 cfs_rq->rb_leftmost = next_node;
531 rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
534 struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
536 struct rb_node *left = cfs_rq->rb_leftmost;
541 return rb_entry(left, struct sched_entity, run_node);
544 static struct sched_entity *__pick_next_entity(struct sched_entity *se)
546 struct rb_node *next = rb_next(&se->run_node);
551 return rb_entry(next, struct sched_entity, run_node);
554 #ifdef CONFIG_SCHED_DEBUG
555 struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
557 struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
562 return rb_entry(last, struct sched_entity, run_node);
565 /**************************************************************
566 * Scheduling class statistics methods:
569 int sched_proc_update_handler(struct ctl_table *table, int write,
570 void __user *buffer, size_t *lenp,
573 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
574 int factor = get_update_sysctl_factor();
579 sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
580 sysctl_sched_min_granularity);
582 #define WRT_SYSCTL(name) \
583 (normalized_sysctl_##name = sysctl_##name / (factor))
584 WRT_SYSCTL(sched_min_granularity);
585 WRT_SYSCTL(sched_latency);
586 WRT_SYSCTL(sched_wakeup_granularity);
596 static inline unsigned long
597 calc_delta_fair(unsigned long delta, struct sched_entity *se)
599 if (unlikely(se->load.weight != NICE_0_LOAD))
600 delta = calc_delta_mine(delta, NICE_0_LOAD, &se->load);
606 * The idea is to set a period in which each task runs once.
608 * When there are too many tasks (sched_nr_latency) we have to stretch
609 * this period because otherwise the slices get too small.
611 * p = (nr <= nl) ? l : l*nr/nl
613 static u64 __sched_period(unsigned long nr_running)
615 u64 period = sysctl_sched_latency;
616 unsigned long nr_latency = sched_nr_latency;
618 if (unlikely(nr_running > nr_latency)) {
619 period = sysctl_sched_min_granularity;
620 period *= nr_running;
627 * We calculate the wall-time slice from the period by taking a part
628 * proportional to the weight.
632 static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
634 u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);
636 for_each_sched_entity(se) {
637 struct load_weight *load;
638 struct load_weight lw;
640 cfs_rq = cfs_rq_of(se);
641 load = &cfs_rq->load;
643 if (unlikely(!se->on_rq)) {
646 update_load_add(&lw, se->load.weight);
649 slice = calc_delta_mine(slice, se->load.weight, load);
655 * We calculate the vruntime slice of a to-be-inserted task.
659 static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
661 return calc_delta_fair(sched_slice(cfs_rq, se), se);
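/*
 * Worked example (illustrative, unscaled defaults): with
 * sysctl_sched_latency = 6ms, sysctl_sched_min_granularity = 0.75ms and
 * sched_nr_latency = 8:
 *
 *    4 runnable tasks -> period = 6ms,               equal-weight slice = 1.5ms
 *   12 runnable tasks -> period = 12 * 0.75ms = 9ms, equal-weight slice = 0.75ms
 *
 * sched_vslice() then re-weights that wall-clock slice by the entity's
 * own weight via calc_delta_fair().
 */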
665 * Update the current task's runtime statistics. Skip current tasks that
666 * are not in our scheduling class.
669 __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
670 unsigned long delta_exec)
672 unsigned long delta_exec_weighted;
674 schedstat_set(curr->statistics.exec_max,
675 max((u64)delta_exec, curr->statistics.exec_max));
677 curr->sum_exec_runtime += delta_exec;
678 schedstat_add(cfs_rq, exec_clock, delta_exec);
679 delta_exec_weighted = calc_delta_fair(delta_exec, curr);
681 curr->vruntime += delta_exec_weighted;
682 update_min_vruntime(cfs_rq);
685 static void update_curr(struct cfs_rq *cfs_rq)
687 struct sched_entity *curr = cfs_rq->curr;
688 u64 now = rq_of(cfs_rq)->clock_task;
689 unsigned long delta_exec;
695 * Get the amount of time the current task was running
696 * since the last time we changed load (this cannot
697 * overflow on 32 bits):
699 delta_exec = (unsigned long)(now - curr->exec_start);
703 __update_curr(cfs_rq, curr, delta_exec);
704 curr->exec_start = now;
706 if (entity_is_task(curr)) {
707 struct task_struct *curtask = task_of(curr);
709 trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
710 cpuacct_charge(curtask, delta_exec);
711 account_group_exec_runtime(curtask, delta_exec);
714 account_cfs_rq_runtime(cfs_rq, delta_exec);
718 update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
720 schedstat_set(se->statistics.wait_start, rq_of(cfs_rq)->clock);
724 * Task is being enqueued - update stats:
726 static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
729 * Are we enqueueing a waiting task? (for current tasks
730 * a dequeue/enqueue event is a NOP)
732 if (se != cfs_rq->curr)
733 update_stats_wait_start(cfs_rq, se);
737 update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
739 schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max,
740 rq_of(cfs_rq)->clock - se->statistics.wait_start));
741 schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1);
742 schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum +
743 rq_of(cfs_rq)->clock - se->statistics.wait_start);
744 #ifdef CONFIG_SCHEDSTATS
745 if (entity_is_task(se)) {
746 trace_sched_stat_wait(task_of(se),
747 rq_of(cfs_rq)->clock - se->statistics.wait_start);
750 schedstat_set(se->statistics.wait_start, 0);
754 update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
757 * Mark the end of the wait period if dequeueing a waiting task:
760 if (se != cfs_rq->curr)
761 update_stats_wait_end(cfs_rq, se);
765 * We are picking a new current task - update its stats:
768 update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
771 * We are starting a new run period:
773 se->exec_start = rq_of(cfs_rq)->clock_task;
776 /**************************************************
777 * Scheduling class queueing methods:
780 #ifdef CONFIG_NUMA_BALANCING
782 * numa task sample period in ms
784 unsigned int sysctl_numa_balancing_scan_period_min = 100;
785 unsigned int sysctl_numa_balancing_scan_period_max = 100*50;
786 unsigned int sysctl_numa_balancing_scan_period_reset = 100*600;
788 /* Portion of address space to scan in MB */
789 unsigned int sysctl_numa_balancing_scan_size = 256;
791 /* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
792 unsigned int sysctl_numa_balancing_scan_delay = 1000;
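/*
 * Illustrative reading of the defaults above: a task becomes eligible for
 * NUMA hinting faults about 1000ms after it starts, then up to 256MB of
 * its address space is marked per scan, with scans at least 100ms apart.
 * When faults keep hitting properly placed pages the period backs off
 * towards 5000ms (100*50), and it is reset roughly every 60s (100*600) to
 * cope with phase changes.
 */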
794 static void task_numa_placement(struct task_struct *p)
796 int seq = ACCESS_ONCE(p->mm->numa_scan_seq);
798 if (p->numa_scan_seq == seq)
800 p->numa_scan_seq = seq;
802 /* FIXME: Scheduling placement policy hints go here */
806 * Got a PROT_NONE fault for a page on @node.
808 void task_numa_fault(int node, int pages, bool migrated)
810 struct task_struct *p = current;
812 if (!sched_feat_numa(NUMA))
815 /* FIXME: Allocate task-specific structure for placement policy here */
818 * If pages are properly placed (did not migrate) then scan slower.
819 * This is reset periodically in case of phase changes
822 p->numa_scan_period = min(sysctl_numa_balancing_scan_period_max,
823 p->numa_scan_period + jiffies_to_msecs(10));
825 task_numa_placement(p);
828 static void reset_ptenuma_scan(struct task_struct *p)
830 ACCESS_ONCE(p->mm->numa_scan_seq)++;
831 p->mm->numa_scan_offset = 0;
835 * The expensive part of numa migration is done from task_work context.
836 * Triggered from task_tick_numa().
838 void task_numa_work(struct callback_head *work)
840 unsigned long migrate, next_scan, now = jiffies;
841 struct task_struct *p = current;
842 struct mm_struct *mm = p->mm;
843 struct vm_area_struct *vma;
844 unsigned long start, end;
847 WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
849 work->next = work; /* protect against double add */
851 * Who cares about NUMA placement when they're dying.
853 * NOTE: make sure not to dereference p->mm before this check,
854 * exit_task_work() happens _after_ exit_mm() so we could be called
855 * without p->mm even though we still had it when we enqueued this work.
858 if (p->flags & PF_EXITING)
862 * We do not care about task placement until a task runs on a node
863 * other than the first one used by the address space. This is
864 * largely because migrations are driven by what CPU the task
865 * is running on. If it's never scheduled on another node, it'll
866 * not migrate so why bother trapping the fault.
868 if (mm->first_nid == NUMA_PTE_SCAN_INIT)
869 mm->first_nid = numa_node_id();
870 if (mm->first_nid != NUMA_PTE_SCAN_ACTIVE) {
871 /* Are we running on a new node yet? */
872 if (numa_node_id() == mm->first_nid &&
873 !sched_feat_numa(NUMA_FORCE))
876 mm->first_nid = NUMA_PTE_SCAN_ACTIVE;
880 * Reset the scan period if enough time has gone by. Objective is that
881 * scanning will be reduced if pages are properly placed. As tasks
882 * can enter different phases this needs to be re-examined. Lacking
883 * proper tracking of reference behaviour, this blunt hammer is used.
885 migrate = mm->numa_next_reset;
886 if (time_after(now, migrate)) {
887 p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
888 next_scan = now + msecs_to_jiffies(sysctl_numa_balancing_scan_period_reset);
889 xchg(&mm->numa_next_reset, next_scan);
893 * Enforce maximal scan/migration frequency.
895 migrate = mm->numa_next_scan;
896 if (time_before(now, migrate))
899 if (p->numa_scan_period == 0)
900 p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
902 next_scan = now + msecs_to_jiffies(p->numa_scan_period);
903 if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
907 * Do not set pte_numa if the current running node is rate-limited.
908 * This loses statistics on the fault but if we are unwilling to
909 * migrate to this node, it is less likely we can do useful work
911 if (migrate_ratelimited(numa_node_id()))
914 start = mm->numa_scan_offset;
915 pages = sysctl_numa_balancing_scan_size;
916 pages <<= 20 - PAGE_SHIFT; /* MB in pages */
920 down_read(&mm->mmap_sem);
921 vma = find_vma(mm, start);
923 reset_ptenuma_scan(p);
927 for (; vma; vma = vma->vm_next) {
928 if (!vma_migratable(vma))
931 /* Skip small VMAs. They are not likely to be of relevance */
932 if (vma->vm_end - vma->vm_start < HPAGE_SIZE)
936 start = max(start, vma->vm_start);
937 end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
938 end = min(end, vma->vm_end);
939 pages -= change_prot_numa(vma, start, end);
944 } while (end != vma->vm_end);
949 * It is possible to reach the end of the VMA list but the last few VMAs are
950 * not guaranteed to be migratable. If they are not, we would find the
951 * !migratable VMA on the next scan but not reset the scanner to the start, so check it now.
955 mm->numa_scan_offset = start;
957 reset_ptenuma_scan(p);
958 up_read(&mm->mmap_sem);
962 * Drive the periodic memory faults..
964 void task_tick_numa(struct rq *rq, struct task_struct *curr)
966 struct callback_head *work = &curr->numa_work;
970 * We don't care about NUMA placement if we don't have memory.
972 if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work)
976 * Using runtime rather than walltime has the dual advantage that
977 * we (mostly) drive the selection from busy threads and that the
978 * task needs to have done some actual work before we bother with NUMA placement.
981 now = curr->se.sum_exec_runtime;
982 period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
984 if (now - curr->node_stamp > period) {
985 if (!curr->node_stamp)
986 curr->numa_scan_period = sysctl_numa_balancing_scan_period_min;
987 curr->node_stamp = now;
989 if (!time_before(jiffies, curr->mm->numa_next_scan)) {
990 init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
991 task_work_add(curr, work, true);
996 static void task_tick_numa(struct rq *rq, struct task_struct *curr)
999 #endif /* CONFIG_NUMA_BALANCING */
1002 account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
1004 update_load_add(&cfs_rq->load, se->load.weight);
1005 if (!parent_entity(se))
1006 update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
1008 if (entity_is_task(se))
1009 list_add(&se->group_node, &rq_of(cfs_rq)->cfs_tasks);
1011 cfs_rq->nr_running++;
1015 account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
1017 update_load_sub(&cfs_rq->load, se->load.weight);
1018 if (!parent_entity(se))
1019 update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
1020 if (entity_is_task(se))
1021 list_del_init(&se->group_node);
1022 cfs_rq->nr_running--;
1025 #ifdef CONFIG_FAIR_GROUP_SCHED
1027 static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
1032 * Use this CPU's actual weight instead of the last load_contribution
1033 * to gain a more accurate current total weight. See
1034 * update_cfs_rq_load_contribution().
1036 tg_weight = atomic64_read(&tg->load_avg);
1037 tg_weight -= cfs_rq->tg_load_contrib;
1038 tg_weight += cfs_rq->load.weight;
1043 static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
1045 long tg_weight, load, shares;
1047 tg_weight = calc_tg_weight(tg, cfs_rq);
1048 load = cfs_rq->load.weight;
1050 shares = (tg->shares * load);
1052 shares /= tg_weight;
1054 if (shares < MIN_SHARES)
1055 shares = MIN_SHARES;
1056 if (shares > tg->shares)
1057 shares = tg->shares;
1061 # else /* CONFIG_SMP */
1062 static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
1066 # endif /* CONFIG_SMP */
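/*
 * Worked example for calc_cfs_shares() above (illustrative): a group with
 * tg->shares = 1024 whose runqueues currently weigh 2048 in total
 * (tg_weight), of which this CPU's cfs_rq contributes load = 512, gets
 *
 *   shares = 1024 * 512 / 2048 = 256
 *
 * for its group entity on this CPU, clamped to [MIN_SHARES, tg->shares].
 */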
1067 static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
1068 unsigned long weight)
1071 /* commit outstanding execution time */
1072 if (cfs_rq->curr == se)
1073 update_curr(cfs_rq);
1074 account_entity_dequeue(cfs_rq, se);
1077 update_load_set(&se->load, weight);
1080 account_entity_enqueue(cfs_rq, se);
1083 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
1085 static void update_cfs_shares(struct cfs_rq *cfs_rq)
1087 struct task_group *tg;
1088 struct sched_entity *se;
1092 se = tg->se[cpu_of(rq_of(cfs_rq))];
1093 if (!se || throttled_hierarchy(cfs_rq))
1096 if (likely(se->load.weight == tg->shares))
1099 shares = calc_cfs_shares(cfs_rq, tg);
1101 reweight_entity(cfs_rq_of(se), se, shares);
1103 #else /* CONFIG_FAIR_GROUP_SCHED */
1104 static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
1107 #endif /* CONFIG_FAIR_GROUP_SCHED */
1109 /* Only depends on SMP, FAIR_GROUP_SCHED may be removed when useful in lb */
1110 #if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)
1112 * We choose a half-life close to 1 scheduling period.
1113 * Note: The tables below are dependent on this value.
1115 #define LOAD_AVG_PERIOD 32
1116 #define LOAD_AVG_MAX 47742 /* maximum possible load avg */
1117 #define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_AVG_MAX */
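/*
 * Rough derivation of the constants above (for illustration): a 32-period
 * half-life means y = 0.5^(1/32) ~= 0.97857, so an always-runnable entity
 * accumulates at most
 *
 *   \Sum_{n>=0} 1024*y^n = 1024 / (1 - y) ~= 47.8k
 *
 * which, with the truncation used in the tables below, lands at
 * LOAD_AVG_MAX = 47742; the series is effectively saturated after about
 * 345 full periods, hence LOAD_AVG_MAX_N.
 */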
1119 /* Precomputed fixed inverse multiplies for multiplication by y^n */
1120 static const u32 runnable_avg_yN_inv[] = {
1121 0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
1122 0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
1123 0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
1124 0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9,
1125 0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80,
1126 0x85aac367, 0x82cd8698,
1130 * Precomputed \Sum y^k { 1<=k<=n }. These are floor(true_value) to prevent
1131 * over-estimates when re-combining.
1133 static const u32 runnable_avg_yN_sum[] = {
1134 0, 1002, 1982, 2941, 3880, 4798, 5697, 6576, 7437, 8279, 9103,
1135 9909,10698,11470,12226,12966,13690,14398,15091,15769,16433,17082,
1136 17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371,
1141 * val * y^n, where y^32 ~= 0.5 (~1 scheduling period)
1143 static __always_inline u64 decay_load(u64 val, u64 n)
1145 unsigned int local_n;
1149 else if (unlikely(n > LOAD_AVG_PERIOD * 63))
1152 /* after bounds checking we can collapse to 32-bit */
1156 * As y^PERIOD = 1/2, we can combine
1157 * y^n = 1/2^(n/PERIOD) * y^(n%PERIOD)
1158 * with a look-up table that covers y^n (n < PERIOD)
1160 * To achieve constant time decay_load.
1162 if (unlikely(local_n >= LOAD_AVG_PERIOD)) {
1163 val >>= local_n / LOAD_AVG_PERIOD;
1164 local_n %= LOAD_AVG_PERIOD;
1167 val *= runnable_avg_yN_inv[local_n];
1168 /* We don't use SRR here since we always want to round down. */
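/*
 * Example of the decomposition above (illustrative): decaying a value of
 * 1024 by n = 36 periods with LOAD_AVG_PERIOD = 32:
 *
 *   val >>= 36 / 32         -> 512   (the 1/2^(n/PERIOD) part)
 *   local_n = 36 % 32 = 4   -> multiply by runnable_avg_yN_inv[4],
 *                              ~0.917 in fixed point, giving ~470
 *
 * i.e. roughly 1024 * 0.5^(36/32).
 */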
1173 * For updates fully spanning n periods, the contribution to runnable
1174 * average will be: \Sum 1024*y^n
1176 * We can compute this reasonably efficiently by combining:
1177 * y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for n <PERIOD}
1179 static u32 __compute_runnable_contrib(u64 n)
1183 if (likely(n <= LOAD_AVG_PERIOD))
1184 return runnable_avg_yN_sum[n];
1185 else if (unlikely(n >= LOAD_AVG_MAX_N))
1186 return LOAD_AVG_MAX;
1188 /* Compute \Sum y^n combining precomputed values for y^i, \Sum y^j */
1190 contrib /= 2; /* y^LOAD_AVG_PERIOD = 1/2 */
1191 contrib += runnable_avg_yN_sum[LOAD_AVG_PERIOD];
1193 n -= LOAD_AVG_PERIOD;
1194 } while (n > LOAD_AVG_PERIOD);
1196 contrib = decay_load(contrib, n);
1197 return contrib + runnable_avg_yN_sum[n];
1201 * We can represent the historical contribution to runnable average as the
1202 * coefficients of a geometric series. To do this we sub-divide our runnable
1203 * history into segments of approximately 1ms (1024us); label the segment that
1204 * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
1206 * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ...
1208 * (now) (~1ms ago) (~2ms ago)
1210 * Let u_i denote the fraction of p_i that the entity was runnable.
1212 * We then designate the fractions u_i as our co-efficients, yielding the
1213 * following representation of historical load:
1214 * u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ...
1216 * We choose y based on the width of a reasonable scheduling period, fixing y^32 = 0.5.
1219 * This means that the contribution to load ~32ms ago (u_32) will be weighted
1220 * approximately half as much as the contribution to load within the last ms (u_0).
1223 * When a period "rolls over" and we have new u_0`, multiplying the previous
1224 * sum again by y is sufficient to update:
1225 * load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
1226 * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
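/*
 * Small worked example (illustrative): an entity runnable for the whole
 * of the current and the two preceding 1024us periods has accumulated
 * roughly 1024 + 1024*y + 1024*y^2 ~= 3000 (y ~= 0.97857), while a period
 * of runnable time from ~32ms ago only counts for about half of a current
 * one, per y^32 = 1/2.
 */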
1228 static __always_inline int __update_entity_runnable_avg(u64 now,
1229 struct sched_avg *sa,
1233 u32 runnable_contrib;
1234 int delta_w, decayed = 0;
1236 delta = now - sa->last_runnable_update;
1238 * This should only happen when time goes backwards, which it
1239 * unfortunately does during sched clock init when we swap over to TSC.
1241 if ((s64)delta < 0) {
1242 sa->last_runnable_update = now;
1247 * Use 1024ns as the unit of measurement since it's a reasonable
1248 * approximation of 1us and fast to compute.
1253 sa->last_runnable_update = now;
1255 /* delta_w is the amount already accumulated against our next period */
1256 delta_w = sa->runnable_avg_period % 1024;
1257 if (delta + delta_w >= 1024) {
1258 /* period roll-over */
1262 * Now that we know we're crossing a period boundary, figure
1263 * out how much from delta we need to complete the current
1264 * period and accrue it.
1266 delta_w = 1024 - delta_w;
1268 sa->runnable_avg_sum += delta_w;
1269 sa->runnable_avg_period += delta_w;
1273 /* Figure out how many additional periods this update spans */
1274 periods = delta / 1024;
1277 sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum,
1279 sa->runnable_avg_period = decay_load(sa->runnable_avg_period,
1282 /* Efficiently calculate \sum (1..n_period) 1024*y^i */
1283 runnable_contrib = __compute_runnable_contrib(periods);
1285 sa->runnable_avg_sum += runnable_contrib;
1286 sa->runnable_avg_period += runnable_contrib;
1289 /* Remainder of delta accrued against u_0` */
1291 sa->runnable_avg_sum += delta;
1292 sa->runnable_avg_period += delta;
1297 /* Synchronize an entity's decay with its parenting cfs_rq. */
1298 static inline u64 __synchronize_entity_decay(struct sched_entity *se)
1300 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1301 u64 decays = atomic64_read(&cfs_rq->decay_counter);
1303 decays -= se->avg.decay_count;
1307 se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays);
1308 se->avg.decay_count = 0;
1313 #ifdef CONFIG_FAIR_GROUP_SCHED
1314 static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
1317 struct task_group *tg = cfs_rq->tg;
1320 tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg;
1321 tg_contrib -= cfs_rq->tg_load_contrib;
1323 if (force_update || abs64(tg_contrib) > cfs_rq->tg_load_contrib / 8) {
1324 atomic64_add(tg_contrib, &tg->load_avg);
1325 cfs_rq->tg_load_contrib += tg_contrib;
1330 * Aggregate cfs_rq runnable averages into an equivalent task_group
1331 * representation for computing load contributions.
1333 static inline void __update_tg_runnable_avg(struct sched_avg *sa,
1334 struct cfs_rq *cfs_rq)
1336 struct task_group *tg = cfs_rq->tg;
1339 /* The fraction of a cpu used by this cfs_rq */
1340 contrib = div_u64(sa->runnable_avg_sum << NICE_0_SHIFT,
1341 sa->runnable_avg_period + 1);
1342 contrib -= cfs_rq->tg_runnable_contrib;
1344 if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) {
1345 atomic_add(contrib, &tg->runnable_avg);
1346 cfs_rq->tg_runnable_contrib += contrib;
1350 static inline void __update_group_entity_contrib(struct sched_entity *se)
1352 struct cfs_rq *cfs_rq = group_cfs_rq(se);
1353 struct task_group *tg = cfs_rq->tg;
1358 contrib = cfs_rq->tg_load_contrib * tg->shares;
1359 se->avg.load_avg_contrib = div64_u64(contrib,
1360 atomic64_read(&tg->load_avg) + 1);
1363 * For group entities we need to compute a correction term in the case
1364 * that they are consuming <1 cpu so that we would contribute the same
1365 * load as a task of equal weight.
1367 * Explicitly co-ordinating this measurement would be expensive, but
1368 * fortunately the sum of each cpus contribution forms a usable
1369 * lower-bound on the true value.
1371 * Consider the aggregate of 2 contributions. Either they are disjoint
1372 * (and the sum represents the true value) or they overlap and we are
1373 * understating by the aggregate of their overlap.
1375 * Extending this to N cpus, for a given overlap, the maximum amount we
1376 * understate is then n_i(n_i+1)/2 * w_i where n_i is the number of
1377 * cpus that overlap for this interval and w_i is the interval width.
1379 * On a small machine; the first term is well-bounded which bounds the
1380 * total error since w_i is a subset of the period. Whereas on a
1381 * larger machine, while this first term can be larger, if w_i is of
1382 * consequential size it is guaranteed to see n_i*w_i quickly converge to
1383 * our upper bound of 1-cpu.
1385 runnable_avg = atomic_read(&tg->runnable_avg);
1386 if (runnable_avg < NICE_0_LOAD) {
1387 se->avg.load_avg_contrib *= runnable_avg;
1388 se->avg.load_avg_contrib >>= NICE_0_SHIFT;
1392 static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
1393 int force_update) {}
1394 static inline void __update_tg_runnable_avg(struct sched_avg *sa,
1395 struct cfs_rq *cfs_rq) {}
1396 static inline void __update_group_entity_contrib(struct sched_entity *se) {}
1399 static inline void __update_task_entity_contrib(struct sched_entity *se)
1403 /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */
1404 contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight);
1405 contrib /= (se->avg.runnable_avg_period + 1);
1406 se->avg.load_avg_contrib = scale_load(contrib);
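/*
 * Worked example (illustrative, assuming a NICE_0 weight of 1024): a task
 * that has been runnable for about half of the observed time has
 * runnable_avg_sum ~= runnable_avg_period / 2, so
 *
 *   contrib = (runnable_avg_sum * 1024) / (runnable_avg_period + 1) ~= 512
 *
 * i.e. it contributes roughly half of its full weight to load_avg.
 */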
1409 /* Compute the current contribution to load_avg by se, return any delta */
1410 static long __update_entity_load_avg_contrib(struct sched_entity *se)
1412 long old_contrib = se->avg.load_avg_contrib;
1414 if (entity_is_task(se)) {
1415 __update_task_entity_contrib(se);
1417 __update_tg_runnable_avg(&se->avg, group_cfs_rq(se));
1418 __update_group_entity_contrib(se);
1421 return se->avg.load_avg_contrib - old_contrib;
1424 static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq,
1427 if (likely(load_contrib < cfs_rq->blocked_load_avg))
1428 cfs_rq->blocked_load_avg -= load_contrib;
1430 cfs_rq->blocked_load_avg = 0;
1433 static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
1435 /* Update a sched_entity's runnable average */
1436 static inline void update_entity_load_avg(struct sched_entity *se,
1439 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1444 * For a group entity we need to use their owned cfs_rq_clock_task() in
1445 * case they are the parent of a throttled hierarchy.
1447 if (entity_is_task(se))
1448 now = cfs_rq_clock_task(cfs_rq);
1450 now = cfs_rq_clock_task(group_cfs_rq(se));
1452 if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq))
1455 contrib_delta = __update_entity_load_avg_contrib(se);
1461 cfs_rq->runnable_load_avg += contrib_delta;
1463 subtract_blocked_load_contrib(cfs_rq, -contrib_delta);
1467 * Decay the load contributed by all blocked children and account this so that
1468 * their contribution may be appropriately discounted when they wake up.
1470 static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)
1472 u64 now = cfs_rq_clock_task(cfs_rq) >> 20;
1475 decays = now - cfs_rq->last_decay;
1476 if (!decays && !force_update)
1479 if (atomic64_read(&cfs_rq->removed_load)) {
1480 u64 removed_load = atomic64_xchg(&cfs_rq->removed_load, 0);
1481 subtract_blocked_load_contrib(cfs_rq, removed_load);
1485 cfs_rq->blocked_load_avg = decay_load(cfs_rq->blocked_load_avg,
1487 atomic64_add(decays, &cfs_rq->decay_counter);
1488 cfs_rq->last_decay = now;
1491 __update_cfs_rq_tg_load_contrib(cfs_rq, force_update);
1494 static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
1496 __update_entity_runnable_avg(rq->clock_task, &rq->avg, runnable);
1497 __update_tg_runnable_avg(&rq->avg, &rq->cfs);
1500 /* Add the load generated by se into cfs_rq's child load-average */
1501 static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
1502 struct sched_entity *se,
1506 * We track migrations using entity decay_count <= 0, on a wake-up
1507 * migration we use a negative decay count to track the remote decays
1508 * accumulated while sleeping.
1510 if (unlikely(se->avg.decay_count <= 0)) {
1511 se->avg.last_runnable_update = rq_of(cfs_rq)->clock_task;
1512 if (se->avg.decay_count) {
1514 * In a wake-up migration we have to approximate the
1515 * time sleeping. This is because we can't synchronize
1516 * clock_task between the two cpus, and it is not
1517 * guaranteed to be read-safe. Instead, we can
1518 * approximate this using our carried decays, which are
1519 * explicitly atomically readable.
1521 se->avg.last_runnable_update -= (-se->avg.decay_count)
1523 update_entity_load_avg(se, 0);
1524 /* Indicate that we're now synchronized and on-rq */
1525 se->avg.decay_count = 0;
1529 __synchronize_entity_decay(se);
1532 /* migrated tasks did not contribute to our blocked load */
1534 subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib);
1535 update_entity_load_avg(se, 0);
1538 cfs_rq->runnable_load_avg += se->avg.load_avg_contrib;
1539 /* we force update consideration on load-balancer moves */
1540 update_cfs_rq_blocked_load(cfs_rq, !wakeup);
1544 * Remove se's load from this cfs_rq child load-average. If the entity is
1545 * transitioning to a blocked state we track its projected decay using blocked_load_avg.
1548 static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
1549 struct sched_entity *se,
1552 update_entity_load_avg(se, 1);
1553 /* we force update consideration on load-balancer moves */
1554 update_cfs_rq_blocked_load(cfs_rq, !sleep);
1556 cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib;
1558 cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;
1559 se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
1560 } /* migrations, e.g. sleep=0 leave decay_count == 0 */
1563 static inline void update_entity_load_avg(struct sched_entity *se,
1564 int update_cfs_rq) {}
1565 static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {}
1566 static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
1567 struct sched_entity *se,
1569 static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
1570 struct sched_entity *se,
1572 static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
1573 int force_update) {}
1576 static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
1578 #ifdef CONFIG_SCHEDSTATS
1579 struct task_struct *tsk = NULL;
1581 if (entity_is_task(se))
1584 if (se->statistics.sleep_start) {
1585 u64 delta = rq_of(cfs_rq)->clock - se->statistics.sleep_start;
1590 if (unlikely(delta > se->statistics.sleep_max))
1591 se->statistics.sleep_max = delta;
1593 se->statistics.sleep_start = 0;
1594 se->statistics.sum_sleep_runtime += delta;
1597 account_scheduler_latency(tsk, delta >> 10, 1);
1598 trace_sched_stat_sleep(tsk, delta);
1601 if (se->statistics.block_start) {
1602 u64 delta = rq_of(cfs_rq)->clock - se->statistics.block_start;
1607 if (unlikely(delta > se->statistics.block_max))
1608 se->statistics.block_max = delta;
1610 se->statistics.block_start = 0;
1611 se->statistics.sum_sleep_runtime += delta;
1614 if (tsk->in_iowait) {
1615 se->statistics.iowait_sum += delta;
1616 se->statistics.iowait_count++;
1617 trace_sched_stat_iowait(tsk, delta);
1620 trace_sched_stat_blocked(tsk, delta);
1623 * Blocking time is in units of nanosecs, so shift by
1624 * 20 to get a milliseconds-range estimation of the
1625 * amount of time that the task spent sleeping:
1627 if (unlikely(prof_on == SLEEP_PROFILING)) {
1628 profile_hits(SLEEP_PROFILING,
1629 (void *)get_wchan(tsk),
1632 account_scheduler_latency(tsk, delta >> 10, 0);
1638 static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
1640 #ifdef CONFIG_SCHED_DEBUG
1641 s64 d = se->vruntime - cfs_rq->min_vruntime;
1646 if (d > 3*sysctl_sched_latency)
1647 schedstat_inc(cfs_rq, nr_spread_over);
1652 place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
1654 u64 vruntime = cfs_rq->min_vruntime;
1657 * The 'current' period is already promised to the current tasks,
1658 * however the extra weight of the new task will slow them down a
1659 * little, place the new task so that it fits in the slot that
1660 * stays open at the end.
1662 if (initial && sched_feat(START_DEBIT))
1663 vruntime += sched_vslice(cfs_rq, se);
1665 /* sleeps up to a single latency don't count. */
1667 unsigned long thresh = sysctl_sched_latency;
1670 * Halve their sleep time's effect, to allow
1671 * for a gentler effect of sleepers:
1673 if (sched_feat(GENTLE_FAIR_SLEEPERS))
1679 /* ensure we never gain time by being placed backwards. */
1680 vruntime = max_vruntime(se->vruntime, vruntime);
1682 se->vruntime = vruntime;
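/*
 * For illustration (not normative): with the 6ms default latency and
 * GENTLE_FAIR_SLEEPERS, a task waking from a long sleep is placed about
 * 3ms of vruntime behind min_vruntime, giving it a modest head start
 * without letting it starve currently queued tasks. A freshly forked task
 * under START_DEBIT is instead placed one vslice ahead of min_vruntime,
 * so it queues behind the tasks already promised the current period.
 */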
1685 static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
1688 enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
1691 * Update the normalized vruntime before updating min_vruntime
1692 * through calling update_curr().
1694 if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))
1695 se->vruntime += cfs_rq->min_vruntime;
1698 * Update run-time statistics of the 'current'.
1700 update_curr(cfs_rq);
1701 enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP);
1702 account_entity_enqueue(cfs_rq, se);
1703 update_cfs_shares(cfs_rq);
1705 if (flags & ENQUEUE_WAKEUP) {
1706 place_entity(cfs_rq, se, 0);
1707 enqueue_sleeper(cfs_rq, se);
1710 update_stats_enqueue(cfs_rq, se);
1711 check_spread(cfs_rq, se);
1712 if (se != cfs_rq->curr)
1713 __enqueue_entity(cfs_rq, se);
1716 if (cfs_rq->nr_running == 1) {
1717 list_add_leaf_cfs_rq(cfs_rq);
1718 check_enqueue_throttle(cfs_rq);
1722 static void __clear_buddies_last(struct sched_entity *se)
1724 for_each_sched_entity(se) {
1725 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1726 if (cfs_rq->last == se)
1727 cfs_rq->last = NULL;
1733 static void __clear_buddies_next(struct sched_entity *se)
1735 for_each_sched_entity(se) {
1736 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1737 if (cfs_rq->next == se)
1738 cfs_rq->next = NULL;
1744 static void __clear_buddies_skip(struct sched_entity *se)
1746 for_each_sched_entity(se) {
1747 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1748 if (cfs_rq->skip == se)
1749 cfs_rq->skip = NULL;
1755 static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
1757 if (cfs_rq->last == se)
1758 __clear_buddies_last(se);
1760 if (cfs_rq->next == se)
1761 __clear_buddies_next(se);
1763 if (cfs_rq->skip == se)
1764 __clear_buddies_skip(se);
1767 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
1770 dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
1773 * Update run-time statistics of the 'current'.
1775 update_curr(cfs_rq);
1776 dequeue_entity_load_avg(cfs_rq, se, flags & DEQUEUE_SLEEP);
1778 update_stats_dequeue(cfs_rq, se);
1779 if (flags & DEQUEUE_SLEEP) {
1780 #ifdef CONFIG_SCHEDSTATS
1781 if (entity_is_task(se)) {
1782 struct task_struct *tsk = task_of(se);
1784 if (tsk->state & TASK_INTERRUPTIBLE)
1785 se->statistics.sleep_start = rq_of(cfs_rq)->clock;
1786 if (tsk->state & TASK_UNINTERRUPTIBLE)
1787 se->statistics.block_start = rq_of(cfs_rq)->clock;
1792 clear_buddies(cfs_rq, se);
1794 if (se != cfs_rq->curr)
1795 __dequeue_entity(cfs_rq, se);
1797 account_entity_dequeue(cfs_rq, se);
1800 * Normalize the entity after updating the min_vruntime because the
1801 * update can refer to the ->curr item and we need to reflect this
1802 * movement in our normalized position.
1804 if (!(flags & DEQUEUE_SLEEP))
1805 se->vruntime -= cfs_rq->min_vruntime;
1807 /* return excess runtime on last dequeue */
1808 return_cfs_rq_runtime(cfs_rq);
1810 update_min_vruntime(cfs_rq);
1811 update_cfs_shares(cfs_rq);
1815 * Preempt the current task with a newly woken task if needed:
1818 check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
1820 unsigned long ideal_runtime, delta_exec;
1821 struct sched_entity *se;
1824 ideal_runtime = sched_slice(cfs_rq, curr);
1825 delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
1826 if (delta_exec > ideal_runtime) {
1827 resched_task(rq_of(cfs_rq)->curr);
1829 * The current task ran long enough, ensure it doesn't get
1830 * re-elected due to buddy favours.
1832 clear_buddies(cfs_rq, curr);
1837 * Ensure that a task that missed wakeup preemption by a
1838 * narrow margin doesn't have to wait for a full slice.
1839 * This also mitigates buddy induced latencies under load.
1841 if (delta_exec < sysctl_sched_min_granularity)
1844 se = __pick_first_entity(cfs_rq);
1845 delta = curr->vruntime - se->vruntime;
1850 if (delta > ideal_runtime)
1851 resched_task(rq_of(cfs_rq)->curr);
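/*
 * Example of the checks above (illustrative): with an ideal_runtime of
 * 1.5ms, a task that has run 2ms since being picked is rescheduled
 * outright. One that has run only 0.5ms is left alone (below the 0.75ms
 * minimum granularity default), even if the leftmost entity trails it in
 * vruntime; in between, it is preempted only once it leads the leftmost
 * task by more than a full ideal_runtime of vruntime.
 */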
1855 set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
1857 /* 'current' is not kept within the tree. */
1860 * Any task has to be enqueued before it gets to execute on
1861 * a CPU. So account for the time it spent waiting on the runqueue.
1864 update_stats_wait_end(cfs_rq, se);
1865 __dequeue_entity(cfs_rq, se);
1868 update_stats_curr_start(cfs_rq, se);
1870 #ifdef CONFIG_SCHEDSTATS
1872 * Track our maximum slice length, if the CPU's load is at
1873 * least twice that of our own weight (i.e. don't track it
1874 * when there are only lesser-weight tasks around):
1876 if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
1877 se->statistics.slice_max = max(se->statistics.slice_max,
1878 se->sum_exec_runtime - se->prev_sum_exec_runtime);
1881 se->prev_sum_exec_runtime = se->sum_exec_runtime;
1885 wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
1888 * Pick the next process, keeping these things in mind, in this order:
1889 * 1) keep things fair between processes/task groups
1890 * 2) pick the "next" process, since someone really wants that to run
1891 * 3) pick the "last" process, for cache locality
1892 * 4) do not run the "skip" process, if something else is available
1894 static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
1896 struct sched_entity *se = __pick_first_entity(cfs_rq);
1897 struct sched_entity *left = se;
1900 * Avoid running the skip buddy, if running something else can
1901 * be done without getting too unfair.
1903 if (cfs_rq->skip == se) {
1904 struct sched_entity *second = __pick_next_entity(se);
1905 if (second && wakeup_preempt_entity(second, left) < 1)
1910 * Prefer last buddy, try to return the CPU to a preempted task.
1912 if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
1916 * Someone really wants this to run. If it's not unfair, run it.
1918 if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
1921 clear_buddies(cfs_rq, se);
1926 static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
1928 static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
1931 * If still on the runqueue then deactivate_task()
1932 * was not called and update_curr() has to be done:
1935 update_curr(cfs_rq);
1937 /* throttle cfs_rqs exceeding runtime */
1938 check_cfs_rq_runtime(cfs_rq);
1940 check_spread(cfs_rq, prev);
1942 update_stats_wait_start(cfs_rq, prev);
1943 /* Put 'current' back into the tree. */
1944 __enqueue_entity(cfs_rq, prev);
1945 /* in !on_rq case, update occurred at dequeue */
1946 update_entity_load_avg(prev, 1);
1948 cfs_rq->curr = NULL;
1952 entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
1955 * Update run-time statistics of the 'current'.
1957 update_curr(cfs_rq);
1960 * Ensure that runnable average is periodically updated.
1962 update_entity_load_avg(curr, 1);
1963 update_cfs_rq_blocked_load(cfs_rq, 1);
1965 #ifdef CONFIG_SCHED_HRTICK
1967 * queued ticks are scheduled to match the slice, so don't bother
1968 * validating it and just reschedule.
1971 resched_task(rq_of(cfs_rq)->curr);
1975 * don't let the period tick interfere with the hrtick preemption
1977 if (!sched_feat(DOUBLE_TICK) &&
1978 hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
1982 if (cfs_rq->nr_running > 1)
1983 check_preempt_tick(cfs_rq, curr);
1987 /**************************************************
1988 * CFS bandwidth control machinery
1991 #ifdef CONFIG_CFS_BANDWIDTH
1993 #ifdef HAVE_JUMP_LABEL
1994 static struct static_key __cfs_bandwidth_used;
1996 static inline bool cfs_bandwidth_used(void)
1998 return static_key_false(&__cfs_bandwidth_used);
2001 void account_cfs_bandwidth_used(int enabled, int was_enabled)
2003 /* only need to count groups transitioning between enabled/!enabled */
2004 if (enabled && !was_enabled)
2005 static_key_slow_inc(&__cfs_bandwidth_used);
2006 else if (!enabled && was_enabled)
2007 static_key_slow_dec(&__cfs_bandwidth_used);
2009 #else /* HAVE_JUMP_LABEL */
2010 static bool cfs_bandwidth_used(void)
2015 void account_cfs_bandwidth_used(int enabled, int was_enabled) {}
2016 #endif /* HAVE_JUMP_LABEL */
2019 * default period for cfs group bandwidth.
2020 * default: 0.1s, units: nanoseconds
2022 static inline u64 default_cfs_period(void)
2024 return 100000000ULL;
2027 static inline u64 sched_cfs_bandwidth_slice(void)
2029 return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;
2033 * Replenish runtime according to assigned quota and update expiration time.
2034 * We use sched_clock_cpu directly instead of rq->clock to avoid adding
2035 * additional synchronization around rq->lock.
2037 * requires cfs_b->lock
2039 void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
2043 if (cfs_b->quota == RUNTIME_INF)
2046 now = sched_clock_cpu(smp_processor_id());
2047 cfs_b->runtime = cfs_b->quota;
2048 cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
2051 static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
2053 return &tg->cfs_bandwidth;
2056 /* rq->clock_task normalized against any time this cfs_rq has spent throttled */
2057 static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
2059 if (unlikely(cfs_rq->throttle_count))
2060 return cfs_rq->throttled_clock_task;
2062 return rq_of(cfs_rq)->clock_task - cfs_rq->throttled_clock_task_time;
2065 /* returns 0 on failure to allocate runtime */
2066 static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
2068 struct task_group *tg = cfs_rq->tg;
2069 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
2070 u64 amount = 0, min_amount, expires;
2072 /* note: this is a positive sum as runtime_remaining <= 0 */
2073 min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
2075 raw_spin_lock(&cfs_b->lock);
2076 if (cfs_b->quota == RUNTIME_INF)
2077 amount = min_amount;
2080 * If the bandwidth pool has become inactive, then at least one
2081 * period must have elapsed since the last consumption.
2082 * Refresh the global state and ensure bandwidth timer becomes
2085 if (!cfs_b->timer_active) {
2086 __refill_cfs_bandwidth_runtime(cfs_b);
2087 __start_cfs_bandwidth(cfs_b);
2090 if (cfs_b->runtime > 0) {
2091 amount = min(cfs_b->runtime, min_amount);
2092 cfs_b->runtime -= amount;
2096 expires = cfs_b->runtime_expires;
2097 raw_spin_unlock(&cfs_b->lock);
2099 cfs_rq->runtime_remaining += amount;
2101 * we may have advanced our local expiration to account for allowed
2102 * spread between our sched_clock and the one on which runtime was
2105 if ((s64)(expires - cfs_rq->runtime_expires) > 0)
2106 cfs_rq->runtime_expires = expires;
2108 return cfs_rq->runtime_remaining > 0;
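/*
 * Illustrative example: with the default 5ms slice, a cfs_rq that has
 * overdrawn its local pool by 2ms (runtime_remaining = -2ms) requests
 * min_amount = 5ms - (-2ms) = 7ms, so it repays the debt and regains a
 * full slice of headroom. If the global pool only has 1ms left, it
 * receives just that, remains non-positive, and the caller will end up
 * throttling the hierarchy.
 */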
2112 * Note: This depends on the synchronization provided by sched_clock and the
2113 * fact that rq->clock snapshots this value.
2115 static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
2117 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
2118 struct rq *rq = rq_of(cfs_rq);
2120 /* if the deadline is ahead of our clock, nothing to do */
2121 if (likely((s64)(rq->clock - cfs_rq->runtime_expires) < 0))
2124 if (cfs_rq->runtime_remaining < 0)
2128 * If the local deadline has passed we have to consider the
2129 * possibility that our sched_clock is 'fast' and the global deadline
2130 * has not truly expired.
2132 * Fortunately we can determine whether this is the case by checking
2133 * whether the global deadline has advanced.
2136 if ((s64)(cfs_rq->runtime_expires - cfs_b->runtime_expires) >= 0) {
2137 /* extend local deadline, drift is bounded above by 2 ticks */
2138 cfs_rq->runtime_expires += TICK_NSEC;
2140 /* global deadline is ahead, expiration has passed */
2141 cfs_rq->runtime_remaining = 0;
2145 static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
2146 unsigned long delta_exec)
2148 /* dock delta_exec before expiring quota (as it could span periods) */
2149 cfs_rq->runtime_remaining -= delta_exec;
2150 expire_cfs_rq_runtime(cfs_rq);
2152 if (likely(cfs_rq->runtime_remaining > 0))
2156 * if we're unable to extend our runtime we resched so that the active
2157 * hierarchy can be throttled
2159 if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
2160 resched_task(rq_of(cfs_rq)->curr);
2163 static __always_inline
2164 void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec)
2166 if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
2169 __account_cfs_rq_runtime(cfs_rq, delta_exec);
2172 static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
2174 return cfs_bandwidth_used() && cfs_rq->throttled;
2177 /* check whether cfs_rq, or any parent, is throttled */
2178 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
2180 return cfs_bandwidth_used() && cfs_rq->throttle_count;
2184 * Ensure that neither of the group entities corresponding to src_cpu or
2185 * dest_cpu are members of a throttled hierarchy when performing group
2186 * load-balance operations.
2188 static inline int throttled_lb_pair(struct task_group *tg,
2189 int src_cpu, int dest_cpu)
2191 struct cfs_rq *src_cfs_rq, *dest_cfs_rq;
2193 src_cfs_rq = tg->cfs_rq[src_cpu];
2194 dest_cfs_rq = tg->cfs_rq[dest_cpu];
2196 return throttled_hierarchy(src_cfs_rq) ||
2197 throttled_hierarchy(dest_cfs_rq);
2200 /* updated child weight may affect parent so we have to do this bottom up */
2201 static int tg_unthrottle_up(struct task_group *tg, void *data)
2203 struct rq *rq = data;
2204 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
2206 cfs_rq->throttle_count--;
2208 if (!cfs_rq->throttle_count) {
2209 /* adjust cfs_rq_clock_task() */
2210 cfs_rq->throttled_clock_task_time += rq->clock_task -
2211 cfs_rq->throttled_clock_task;
2218 static int tg_throttle_down(struct task_group *tg, void *data)
2220 struct rq *rq = data;
2221 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
2223 /* group is entering throttled state, stop time */
2224 if (!cfs_rq->throttle_count)
2225 cfs_rq->throttled_clock_task = rq->clock_task;
2226 cfs_rq->throttle_count++;
2231 static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
2233 struct rq *rq = rq_of(cfs_rq);
2234 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
2235 struct sched_entity *se;
2236 long task_delta, dequeue = 1;
2238 se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
2240 /* freeze hierarchy runnable averages while throttled */
2242 walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
2245 task_delta = cfs_rq->h_nr_running;
2246 for_each_sched_entity(se) {
2247 struct cfs_rq *qcfs_rq = cfs_rq_of(se);
2248 /* throttled entity or throttle-on-deactivate */
2253 dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
2254 qcfs_rq->h_nr_running -= task_delta;
2256 if (qcfs_rq->load.weight)
2261 rq->nr_running -= task_delta;
2263 cfs_rq->throttled = 1;
2264 cfs_rq->throttled_clock = rq->clock;
2265 raw_spin_lock(&cfs_b->lock);
2266 list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
2267 raw_spin_unlock(&cfs_b->lock);
2270 void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
2272 struct rq *rq = rq_of(cfs_rq);
2273 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
2274 struct sched_entity *se;
2278 se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
2280 cfs_rq->throttled = 0;
2281 raw_spin_lock(&cfs_b->lock);
2282 cfs_b->throttled_time += rq->clock - cfs_rq->throttled_clock;
2283 list_del_rcu(&cfs_rq->throttled_list);
2284 raw_spin_unlock(&cfs_b->lock);
2286 update_rq_clock(rq);
2287 /* update hierarchical throttle state */
2288 walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
2290 if (!cfs_rq->load.weight)
2293 task_delta = cfs_rq->h_nr_running;
2294 for_each_sched_entity(se) {
2298 cfs_rq = cfs_rq_of(se);
2300 enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
2301 cfs_rq->h_nr_running += task_delta;
2303 if (cfs_rq_throttled(cfs_rq))
2308 rq->nr_running += task_delta;
2310 /* determine whether we need to wake up potentially idle cpu */
2311 if (rq->curr == rq->idle && rq->cfs.nr_running)
2312 resched_task(rq->curr);
2315 static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
2316 u64 remaining, u64 expires)
2318 struct cfs_rq *cfs_rq;
2319 u64 runtime = remaining;
2322 list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
2324 struct rq *rq = rq_of(cfs_rq);
2326 raw_spin_lock(&rq->lock);
2327 if (!cfs_rq_throttled(cfs_rq))
2330 runtime = -cfs_rq->runtime_remaining + 1;
2331 if (runtime > remaining)
2332 runtime = remaining;
2333 remaining -= runtime;
2335 cfs_rq->runtime_remaining += runtime;
2336 cfs_rq->runtime_expires = expires;
2338 /* we check whether we're throttled above */
2339 if (cfs_rq->runtime_remaining > 0)
2340 unthrottle_cfs_rq(cfs_rq);
2343 raw_spin_unlock(&rq->lock);
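/*
 * Worked example of the distribution loop above (illustrative numbers):
 * with remaining = 10ms and two throttled cfs_rqs sitting at
 * runtime_remaining = -3ms and -6ms, the first is topped up with 3ms + 1ns
 * and the second with 6ms + 1ns, i.e. each receives just enough to go
 * positive and be unthrottled; roughly 1ms of the original 10ms is left
 * over and handed back to the caller.
 */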
2354 * Responsible for refilling a task_group's bandwidth and unthrottling its
2355 * cfs_rqs as appropriate. If there has been no activity within the last
2356 * period the timer is deactivated until scheduling resumes; cfs_b->idle is
2357 * used to track this state.
2359 static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
2361 u64 runtime, runtime_expires;
2362 int idle = 1, throttled;
2364 raw_spin_lock(&cfs_b->lock);
2365 /* no need to continue the timer with no bandwidth constraint */
2366 if (cfs_b->quota == RUNTIME_INF)
2369 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
2370 /* idle depends on !throttled (for the case of a large deficit) */
2371 idle = cfs_b->idle && !throttled;
2372 cfs_b->nr_periods += overrun;
2374 /* if we're going inactive then everything else can be deferred */
2378 __refill_cfs_bandwidth_runtime(cfs_b);
2381 /* mark as potentially idle for the upcoming period */
2386 /* account preceding periods in which throttling occurred */
2387 cfs_b->nr_throttled += overrun;
2390 * There are throttled entities so we must first use the new bandwidth
2391 * to unthrottle them before making it generally available. This
2392 * ensures that all existing debts will be paid before a new cfs_rq is
2395 runtime = cfs_b->runtime;
2396 runtime_expires = cfs_b->runtime_expires;
2400 * This check is repeated as we are holding onto the new bandwidth
2401 * while we unthrottle. This can potentially race with an unthrottled
2402 * group trying to acquire new bandwidth from the global pool.
2404 while (throttled && runtime > 0) {
2405 raw_spin_unlock(&cfs_b->lock);
2406 /* we can't nest cfs_b->lock while distributing bandwidth */
2407 runtime = distribute_cfs_runtime(cfs_b, runtime,
2409 raw_spin_lock(&cfs_b->lock);
2411 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
2414 /* return (any) remaining runtime */
2415 cfs_b->runtime = runtime;
2417 * While we are ensured activity in the period following an
2418 * unthrottle, this also covers the case in which the new bandwidth is
2419 * insufficient to cover the existing bandwidth deficit. (Forcing the
2420 * timer to remain active while there are any throttled entities.)
2425 cfs_b->timer_active = 0;
2426 raw_spin_unlock(&cfs_b->lock);
2431 /* a cfs_rq won't donate quota below this amount */
2432 static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;
2433 /* minimum remaining period time to redistribute slack quota */
2434 static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
2435 /* how long we wait to gather additional slack before distributing */
2436 static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
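/*
 * How the three constants above interact (illustrative): a dequeueing
 * cfs_rq keeps min_cfs_rq_runtime (1ms) for itself and donates only the
 * excess; the slack timer then waits cfs_bandwidth_slack_period (5ms) to
 * batch up donations, is not armed at all if a quota refresh is due within
 * slack_period + min_bandwidth_expiration, and the redistribution itself
 * is skipped when the period timer would refresh quota within
 * min_bandwidth_expiration (2ms) anyway.
 */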
2438 /* are we near the end of the current quota period? */
2439 static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
2441 struct hrtimer *refresh_timer = &cfs_b->period_timer;
2444 /* if the call-back is running a quota refresh is already occurring */
2445 if (hrtimer_callback_running(refresh_timer))
2448 /* is a quota refresh about to occur? */
2449 remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
2450 if (remaining < min_expire)
2456 static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
2458 u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;
2460 /* if there's a quota refresh soon don't bother with slack */
2461 if (runtime_refresh_within(cfs_b, min_left))
2464 start_bandwidth_timer(&cfs_b->slack_timer,
2465 ns_to_ktime(cfs_bandwidth_slack_period));
2468 /* we know any runtime found here is valid as update_curr() precedes return */
2469 static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
2471 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
2472 s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;
2474 if (slack_runtime <= 0)
2477 raw_spin_lock(&cfs_b->lock);
2478 if (cfs_b->quota != RUNTIME_INF &&
2479 cfs_rq->runtime_expires == cfs_b->runtime_expires) {
2480 cfs_b->runtime += slack_runtime;
2482 /* we are under rq->lock, defer unthrottling using a timer */
2483 if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&
2484 !list_empty(&cfs_b->throttled_cfs_rq))
2485 start_cfs_slack_bandwidth(cfs_b);
2487 raw_spin_unlock(&cfs_b->lock);
2489 /* even if it's not valid for return we don't want to try again */
2490 cfs_rq->runtime_remaining -= slack_runtime;
2493 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
2495 if (!cfs_bandwidth_used())
2498 if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
2501 __return_cfs_rq_runtime(cfs_rq);
2505 * This is done with a timer (instead of inline with bandwidth return) since
2506 * it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs.
2508 static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
2510 u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
2513 /* confirm we're still not at a refresh boundary */
2514 if (runtime_refresh_within(cfs_b, min_bandwidth_expiration))
2517 raw_spin_lock(&cfs_b->lock);
2518 if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) {
2519 runtime = cfs_b->runtime;
2522 expires = cfs_b->runtime_expires;
2523 raw_spin_unlock(&cfs_b->lock);
2528 runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
2530 raw_spin_lock(&cfs_b->lock);
2531 if (expires == cfs_b->runtime_expires)
2532 cfs_b->runtime = runtime;
2533 raw_spin_unlock(&cfs_b->lock);
2537 * When a group wakes up we want to make sure that its quota is not already
2538 * expired/exceeded, otherwise it may be allowed to steal additional ticks of
2539 * runtime as update_curr() throttling cannot trigger until it's on-rq.
2541 static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
2543 if (!cfs_bandwidth_used())
2546 /* an active group must be handled by the update_curr()->put() path */
2547 if (!cfs_rq->runtime_enabled || cfs_rq->curr)
2550 /* ensure the group is not already throttled */
2551 if (cfs_rq_throttled(cfs_rq))
2554 /* update runtime allocation */
2555 account_cfs_rq_runtime(cfs_rq, 0);
2556 if (cfs_rq->runtime_remaining <= 0)
2557 throttle_cfs_rq(cfs_rq);
2560 /* conditionally throttle active cfs_rq's from put_prev_entity() */
2561 static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
2563 if (!cfs_bandwidth_used())
2566 if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
2570 * it's possible for a throttled entity to be forced into a running
2571 * state (e.g. set_curr_task), in this case we're finished.
2573 if (cfs_rq_throttled(cfs_rq))
2576 throttle_cfs_rq(cfs_rq);
2579 static inline u64 default_cfs_period(void);
2580 static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun);
2581 static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b);
2583 static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
2585 struct cfs_bandwidth *cfs_b =
2586 container_of(timer, struct cfs_bandwidth, slack_timer);
2587 do_sched_cfs_slack_timer(cfs_b);
2589 return HRTIMER_NORESTART;
2592 static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
2594 struct cfs_bandwidth *cfs_b =
2595 container_of(timer, struct cfs_bandwidth, period_timer);
2601 now = hrtimer_cb_get_time(timer);
2602 overrun = hrtimer_forward(timer, now, cfs_b->period);
2607 idle = do_sched_cfs_period_timer(cfs_b, overrun);
2610 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
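/*
 * Example of the overrun handling above (hypothetical timing): if the
 * timer callback runs 2.5 periods after its scheduled expiry,
 * hrtimer_forward() advances the expiry to the next period boundary in the
 * future and returns overrun = 3; do_sched_cfs_period_timer() then refills
 * the quota once while crediting all three elapsed periods to
 * cfs_b->nr_periods (and to nr_throttled if anything was throttled).
 */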
2613 void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
2615 raw_spin_lock_init(&cfs_b->lock);
2617 cfs_b->quota = RUNTIME_INF;
2618 cfs_b->period = ns_to_ktime(default_cfs_period());
2620 INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
2621 hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
2622 cfs_b->period_timer.function = sched_cfs_period_timer;
2623 hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
2624 cfs_b->slack_timer.function = sched_cfs_slack_timer;
2627 static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
2629 cfs_rq->runtime_enabled = 0;
2630 INIT_LIST_HEAD(&cfs_rq->throttled_list);
2633 /* requires cfs_b->lock, may release to reprogram timer */
2634 void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
2637 * The timer may be active because we're trying to set a new bandwidth
2638 * period or because we're racing with the tear-down path
2639 * (timer_active==0 becomes visible before the hrtimer call-back
2640 * terminates). In either case we ensure that it's re-programmed
2642 while (unlikely(hrtimer_active(&cfs_b->period_timer))) {
2643 raw_spin_unlock(&cfs_b->lock);
2644 /* ensure cfs_b->lock is available while we wait */
2645 hrtimer_cancel(&cfs_b->period_timer);
2647 raw_spin_lock(&cfs_b->lock);
2648 /* if someone else restarted the timer then we're done */
2649 if (cfs_b->timer_active)
2653 cfs_b->timer_active = 1;
2654 start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period);
2657 static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
2659 hrtimer_cancel(&cfs_b->period_timer);
2660 hrtimer_cancel(&cfs_b->slack_timer);
2663 static void unthrottle_offline_cfs_rqs(struct rq *rq)
2665 struct cfs_rq *cfs_rq;
2667 for_each_leaf_cfs_rq(rq, cfs_rq) {
2668 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
2670 if (!cfs_rq->runtime_enabled)
2674 * clock_task is not advancing so we just need to make sure
2675 * there's some valid quota amount
2677 cfs_rq->runtime_remaining = cfs_b->quota;
2678 if (cfs_rq_throttled(cfs_rq))
2679 unthrottle_cfs_rq(cfs_rq);
2683 #else /* CONFIG_CFS_BANDWIDTH */
2684 static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
2686 return rq_of(cfs_rq)->clock_task;
2689 static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
2690 unsigned long delta_exec) {}
2691 static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
2692 static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
2693 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
2695 static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
2700 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
2705 static inline int throttled_lb_pair(struct task_group *tg,
2706 int src_cpu, int dest_cpu)
2711 void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
2713 #ifdef CONFIG_FAIR_GROUP_SCHED
2714 static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
2717 static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
2721 static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
2722 static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
2724 #endif /* CONFIG_CFS_BANDWIDTH */
2726 /**************************************************
2727 * CFS operations on tasks:
2730 #ifdef CONFIG_SCHED_HRTICK
2731 static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
2733 struct sched_entity *se = &p->se;
2734 struct cfs_rq *cfs_rq = cfs_rq_of(se);
2736 WARN_ON(task_rq(p) != rq);
2738 if (cfs_rq->nr_running > 1) {
2739 u64 slice = sched_slice(cfs_rq, se);
2740 u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
2741 s64 delta = slice - ran;
2750 * Don't schedule slices shorter than 10000ns, that just
2751 * doesn't make sense. Rely on vruntime for fairness.
2754 delta = max_t(s64, 10000LL, delta);
2756 hrtick_start(rq, delta);
2761 * called from enqueue/dequeue and updates the hrtick when the
2762 * current task is from our class and nr_running is low enough
2765 static void hrtick_update(struct rq *rq)
2767 struct task_struct *curr = rq->curr;
2769 if (!hrtick_enabled(rq) || curr->sched_class != &fair_sched_class)
2772 if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
2773 hrtick_start_fair(rq, curr);
2775 #else /* !CONFIG_SCHED_HRTICK */
2777 hrtick_start_fair(struct rq *rq, struct task_struct *p)
2781 static inline void hrtick_update(struct rq *rq)
2787 * The enqueue_task method is called before nr_running is
2788 * increased. Here we update the fair scheduling stats and
2789 * then put the task into the rbtree:
2792 enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
2794 struct cfs_rq *cfs_rq;
2795 struct sched_entity *se = &p->se;
2797 for_each_sched_entity(se) {
2800 cfs_rq = cfs_rq_of(se);
2801 enqueue_entity(cfs_rq, se, flags);
2804 * end evaluation on encountering a throttled cfs_rq
2806 * note: in the case of encountering a throttled cfs_rq we will
2807 * post the final h_nr_running increment below.
2809 if (cfs_rq_throttled(cfs_rq))
2811 cfs_rq->h_nr_running++;
2813 flags = ENQUEUE_WAKEUP;
2816 for_each_sched_entity(se) {
2817 cfs_rq = cfs_rq_of(se);
2818 cfs_rq->h_nr_running++;
2820 if (cfs_rq_throttled(cfs_rq))
2823 update_cfs_shares(cfs_rq);
2824 update_entity_load_avg(se, 1);
2828 update_rq_runnable_avg(rq, rq->nr_running);
2834 static void set_next_buddy(struct sched_entity *se);
2837 * The dequeue_task method is called before nr_running is
2838 * decreased. We remove the task from the rbtree and
2839 * update the fair scheduling stats:
2841 static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
2843 struct cfs_rq *cfs_rq;
2844 struct sched_entity *se = &p->se;
2845 int task_sleep = flags & DEQUEUE_SLEEP;
2847 for_each_sched_entity(se) {
2848 cfs_rq = cfs_rq_of(se);
2849 dequeue_entity(cfs_rq, se, flags);
2852 * end evaluation on encountering a throttled cfs_rq
2854 * note: in the case of encountering a throttled cfs_rq we will
2855 * post the final h_nr_running decrement below.
2857 if (cfs_rq_throttled(cfs_rq))
2859 cfs_rq->h_nr_running--;
2861 /* Don't dequeue parent if it has other entities besides us */
2862 if (cfs_rq->load.weight) {
2864 * Bias pick_next to pick a task from this cfs_rq, as
2865 * p is sleeping when it is within its sched_slice.
2867 if (task_sleep && parent_entity(se))
2868 set_next_buddy(parent_entity(se));
2870 /* avoid re-evaluating load for this entity */
2871 se = parent_entity(se);
2874 flags |= DEQUEUE_SLEEP;
2877 for_each_sched_entity(se) {
2878 cfs_rq = cfs_rq_of(se);
2879 cfs_rq->h_nr_running--;
2881 if (cfs_rq_throttled(cfs_rq))
2884 update_cfs_shares(cfs_rq);
2885 update_entity_load_avg(se, 1);
2890 update_rq_runnable_avg(rq, 1);
2896 /* Used instead of source_load when we know the type == 0 */
2897 static unsigned long weighted_cpuload(const int cpu)
2899 return cpu_rq(cpu)->load.weight;
2903 * Return a low guess at the load of a migration-source cpu weighted
2904 * according to the scheduling class and "nice" value.
2906 * We want to under-estimate the load of migration sources, to
2907 * balance conservatively.
2909 static unsigned long source_load(int cpu, int type)
2911 struct rq *rq = cpu_rq(cpu);
2912 unsigned long total = weighted_cpuload(cpu);
2914 if (type == 0 || !sched_feat(LB_BIAS))
2917 return min(rq->cpu_load[type-1], total);
2921 * Return a high guess at the load of a migration-target cpu weighted
2922 * according to the scheduling class and "nice" value.
2924 static unsigned long target_load(int cpu, int type)
2926 struct rq *rq = cpu_rq(cpu);
2927 unsigned long total = weighted_cpuload(cpu);
2929 if (type == 0 || !sched_feat(LB_BIAS))
2932 return max(rq->cpu_load[type-1], total);
2935 static unsigned long power_of(int cpu)
2937 return cpu_rq(cpu)->cpu_power;
2940 static unsigned long cpu_avg_load_per_task(int cpu)
2942 struct rq *rq = cpu_rq(cpu);
2943 unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
2946 return rq->load.weight / nr_running;
2952 static void task_waking_fair(struct task_struct *p)
2954 struct sched_entity *se = &p->se;
2955 struct cfs_rq *cfs_rq = cfs_rq_of(se);
2958 #ifndef CONFIG_64BIT
2959 u64 min_vruntime_copy;
2962 min_vruntime_copy = cfs_rq->min_vruntime_copy;
2964 min_vruntime = cfs_rq->min_vruntime;
2965 } while (min_vruntime != min_vruntime_copy);
2967 min_vruntime = cfs_rq->min_vruntime;
2970 se->vruntime -= min_vruntime;
2973 #ifdef CONFIG_FAIR_GROUP_SCHED
2975 * effective_load() calculates the load change as seen from the root_task_group
2977 * Adding load to a group doesn't make a group heavier, but can cause movement
2978 * of group shares between cpus. Assuming the shares were perfectly aligned one
2979 * can calculate the shift in shares.
2981 * Calculate the effective load difference if @wl is added (subtracted) to @tg
2982 * on this @cpu and results in a total addition (subtraction) of @wg to the
2983 * total group weight.
2985 * Given a runqueue weight distribution (rw_i) we can compute a shares
2986 * distribution (s_i) using:
2988 * s_i = rw_i / \Sum rw_j (1)
2990 * Suppose we have 4 CPUs and our @tg is a direct child of the root group and
2991 * has 7 equal weight tasks, distributed as below (rw_i), with the resulting
2992 * shares distribution (s_i):
2994 * rw_i = { 2, 4, 1, 0 }
2995 * s_i = { 2/7, 4/7, 1/7, 0 }
2997 * As per wake_affine() we're interested in the load of two CPUs (the CPU the
2998 * task used to run on and the CPU the waker is running on), we need to
2999 * compute the effect of waking a task on either CPU and, in case of a sync
3000 * wakeup, compute the effect of the current task going to sleep.
3002 * So for a change of @wl to the local @cpu with an overall group weight change
3003 * of @wg we can compute the new shares distribution (s'_i) using:
3005 * s'_i = (rw_i + @wl) / (@wg + \Sum rw_j) (2)
3007 * Suppose we're interested in CPUs 0 and 1, and want to compute the load
3008 * differences in waking a task to CPU 0. The additional task changes the
3009 * weight and shares distributions like:
3011 * rw'_i = { 3, 4, 1, 0 }
3012 * s'_i = { 3/8, 4/8, 1/8, 0 }
3014 * We can then compute the difference in effective weight by using:
3016 * dw_i = S * (s'_i - s_i) (3)
3018 * Where 'S' is the group weight as seen by its parent.
3020 * Therefore the effective change in loads on CPU 0 would be 5/56 (3/8 - 2/7)
3021 * times the weight of the group. The effect on CPU 1 would be -4/56 (4/8 -
3022 * 4/7) times the weight of the group.
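/*
 * Arithmetic check of the example above: 3/8 - 2/7 = 21/56 - 16/56 = 5/56,
 * and 4/8 - 4/7 = 28/56 - 32/56 = -4/56, matching the stated
 * per-group-weight deltas for CPU 0 and CPU 1.
 */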
3024 static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
3026 struct sched_entity *se = tg->se[cpu];
3028 if (!tg->parent) /* the trivial, non-cgroup case */
3031 for_each_sched_entity(se) {
3037 * W = @wg + \Sum rw_j
3039 W = wg + calc_tg_weight(tg, se->my_q);
3044 w = se->my_q->load.weight + wl;
3047 * wl = S * s'_i; see (2)
3050 wl = (w * tg->shares) / W;
3055 * Per the above, wl is the new se->load.weight value; since
3056 * those are clipped to [MIN_SHARES, ...) do so now. See
3057 * calc_cfs_shares().
3059 if (wl < MIN_SHARES)
3063 * wl = dw_i = S * (s'_i - s_i); see (3)
3065 wl -= se->load.weight;
3068 * Recursively apply this logic to all parent groups to compute
3069 * the final effective load change on the root group. Since
3070 * only the @tg group gets extra weight, all parent groups can
3071 * only redistribute existing shares. @wl is the shift in shares
3072 * resulting from this level per the above.
3081 static inline unsigned long effective_load(struct task_group *tg, int cpu,
3082 unsigned long wl, unsigned long wg)
3089 static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
3091 s64 this_load, load;
3092 int idx, this_cpu, prev_cpu;
3093 unsigned long tl_per_task;
3094 struct task_group *tg;
3095 unsigned long weight;
3099 this_cpu = smp_processor_id();
3100 prev_cpu = task_cpu(p);
3101 load = source_load(prev_cpu, idx);
3102 this_load = target_load(this_cpu, idx);
3105 * If sync wakeup then subtract the (maximum possible)
3106 * effect of the currently running task from the load
3107 * of the current CPU:
3110 tg = task_group(current);
3111 weight = current->se.load.weight;
3113 this_load += effective_load(tg, this_cpu, -weight, -weight);
3114 load += effective_load(tg, prev_cpu, 0, -weight);
3118 weight = p->se.load.weight;
3121 * In low-load situations, where prev_cpu is idle and this_cpu is idle
3122 * due to the sync cause above having dropped this_load to 0, we'll
3123 * always have an imbalance, but there's really nothing you can do
3124 * about that, so that's good too.
3126 * Otherwise check if either cpus are near enough in load to allow this
3127 * task to be woken on this_cpu.
3129 if (this_load > 0) {
3130 s64 this_eff_load, prev_eff_load;
3132 this_eff_load = 100;
3133 this_eff_load *= power_of(prev_cpu);
3134 this_eff_load *= this_load +
3135 effective_load(tg, this_cpu, weight, weight);
3137 prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
3138 prev_eff_load *= power_of(this_cpu);
3139 prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);
3141 balanced = this_eff_load <= prev_eff_load;
3146 * If the currently running task will sleep within
3147 * a reasonable amount of time then attract this newly
3150 if (sync && balanced)
3153 schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);
3154 tl_per_task = cpu_avg_load_per_task(this_cpu);
3157 (this_load <= load &&
3158 this_load + target_load(prev_cpu, idx) <= tl_per_task)) {
3160 * This domain has SD_WAKE_AFFINE and
3161 * p is cache cold in this domain, and
3162 * there is no bad imbalance.
3164 schedstat_inc(sd, ttwu_move_affine);
3165 schedstat_inc(p, se.statistics.nr_wakeups_affine);
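/*
 * Worked example of the effective-load comparison above (made-up numbers):
 * with sd->imbalance_pct = 125, this_eff_load is weighted by 100 while
 * prev_eff_load is weighted by 100 + (125 - 100) / 2 = 112.  Assuming
 * equal cpu_power, the wakeup is still treated as "balanced" (and may be
 * pulled to this_cpu) even when this_cpu's adjusted load exceeds
 * prev_cpu's by up to ~12%.
 */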
3173 * find_idlest_group finds and returns the least busy CPU group within the domain.
3176 static struct sched_group *
3177 find_idlest_group(struct sched_domain *sd, struct task_struct *p,
3178 int this_cpu, int load_idx)
3180 struct sched_group *idlest = NULL, *group = sd->groups;
3181 unsigned long min_load = ULONG_MAX, this_load = 0;
3182 int imbalance = 100 + (sd->imbalance_pct-100)/2;
3185 unsigned long load, avg_load;
3189 /* Skip over this group if it has no CPUs allowed */
3190 if (!cpumask_intersects(sched_group_cpus(group),
3191 tsk_cpus_allowed(p)))
3194 local_group = cpumask_test_cpu(this_cpu,
3195 sched_group_cpus(group));
3197 /* Tally up the load of all CPUs in the group */
3200 for_each_cpu(i, sched_group_cpus(group)) {
3201 /* Bias balancing toward cpus of our domain */
3203 load = source_load(i, load_idx);
3205 load = target_load(i, load_idx);
3210 /* Adjust by relative CPU power of the group */
3211 avg_load = (avg_load * SCHED_POWER_SCALE) / group->sgp->power;
3214 this_load = avg_load;
3215 } else if (avg_load < min_load) {
3216 min_load = avg_load;
3219 } while (group = group->next, group != sd->groups);
3221 if (!idlest || 100*this_load < imbalance*min_load)
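/*
 * Illustrative numbers for the check above: with imbalance_pct = 125,
 * imbalance = 100 + (125 - 100) / 2 = 112, so a remote group is only
 * returned as idlest when 100 * this_load >= 112 * min_load, i.e. the
 * local group carries at least ~12% more load than the best candidate.
 */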
3227 * find_idlest_cpu - find the idlest cpu among the cpus in group.
3230 find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
3232 unsigned long load, min_load = ULONG_MAX;
3236 /* Traverse only the allowed CPUs */
3237 for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
3238 load = weighted_cpuload(i);
3240 if (load < min_load || (load == min_load && i == this_cpu)) {
3250 * Try and locate an idle CPU in the sched_domain.
3252 static int select_idle_sibling(struct task_struct *p, int target)
3254 int cpu = smp_processor_id();
3255 int prev_cpu = task_cpu(p);
3256 struct sched_domain *sd;
3257 struct sched_group *sg;
3261 * If the task is going to be woken-up on this cpu and if it is
3262 * already idle, then it is the right target.
3264 if (target == cpu && idle_cpu(cpu))
3268 * If the task is going to be woken-up on the cpu where it previously
3269 * ran and if it is currently idle, then it is the right target.
3271 if (target == prev_cpu && idle_cpu(prev_cpu))
3275 * Otherwise, iterate the domains and find an eligible idle cpu.
3277 sd = rcu_dereference(per_cpu(sd_llc, target));
3278 for_each_lower_domain(sd) {
3281 if (!cpumask_intersects(sched_group_cpus(sg),
3282 tsk_cpus_allowed(p)))
3285 for_each_cpu(i, sched_group_cpus(sg)) {
3290 target = cpumask_first_and(sched_group_cpus(sg),
3291 tsk_cpus_allowed(p));
3295 } while (sg != sd->groups);
3302 * sched_balance_self: balance the current task (running on cpu) in domains
3303 * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and SD_BALANCE_EXEC.
3306 * Balance, ie. select the least loaded group.
3308 * Returns the target CPU number, or the same CPU if no balancing is needed.
3310 * preempt must be disabled.
3313 select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
3315 struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
3316 int cpu = smp_processor_id();
3317 int prev_cpu = task_cpu(p);
3319 int want_affine = 0;
3320 int sync = wake_flags & WF_SYNC;
3322 if (p->nr_cpus_allowed == 1)
3325 if (sd_flag & SD_BALANCE_WAKE) {
3326 if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
3332 for_each_domain(cpu, tmp) {
3333 if (!(tmp->flags & SD_LOAD_BALANCE))
3337 * If both cpu and prev_cpu are part of this domain,
3338 * cpu is a valid SD_WAKE_AFFINE target.
3340 if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
3341 cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
3346 if (tmp->flags & sd_flag)
3351 if (cpu != prev_cpu && wake_affine(affine_sd, p, sync))
3354 new_cpu = select_idle_sibling(p, prev_cpu);
3359 int load_idx = sd->forkexec_idx;
3360 struct sched_group *group;
3363 if (!(sd->flags & sd_flag)) {
3368 if (sd_flag & SD_BALANCE_WAKE)
3369 load_idx = sd->wake_idx;
3371 group = find_idlest_group(sd, p, cpu, load_idx);
3377 new_cpu = find_idlest_cpu(group, p, cpu);
3378 if (new_cpu == -1 || new_cpu == cpu) {
3379 /* Now try balancing at a lower domain level of cpu */
3384 /* Now try balancing at a lower domain level of new_cpu */
3386 weight = sd->span_weight;
3388 for_each_domain(cpu, tmp) {
3389 if (weight <= tmp->span_weight)
3391 if (tmp->flags & sd_flag)
3394 /* while loop will break here if sd == NULL */
3403 * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be
3404 * removed when useful for applications beyond shares distribution (e.g.
3407 #ifdef CONFIG_FAIR_GROUP_SCHED
3409 * Called immediately before a task is migrated to a new cpu; task_cpu(p) and
3410 * cfs_rq_of(p) references at time of call are still valid and identify the
3411 * previous cpu. However, the caller only guarantees p->pi_lock is held; no
3412 * other assumptions, including the state of rq->lock, should be made.
3415 migrate_task_rq_fair(struct task_struct *p, int next_cpu)
3417 struct sched_entity *se = &p->se;
3418 struct cfs_rq *cfs_rq = cfs_rq_of(se);
3421 * Load tracking: accumulate removed load so that it can be processed
3422 * when we next update owning cfs_rq under rq->lock. Tasks contribute
3423 * to blocked load iff they have a positive decay-count. It can never
3424 * be negative here since on-rq tasks have decay-count == 0.
3426 if (se->avg.decay_count) {
3427 se->avg.decay_count = -__synchronize_entity_decay(se);
3428 atomic64_add(se->avg.load_avg_contrib, &cfs_rq->removed_load);
3432 #endif /* CONFIG_SMP */
3434 static unsigned long
3435 wakeup_gran(struct sched_entity *curr, struct sched_entity *se)
3437 unsigned long gran = sysctl_sched_wakeup_granularity;
3440 * Since it's curr that is running now, convert the gran from real-time
3441 * to virtual-time in its units.
3443 * By using 'se' instead of 'curr' we penalize light tasks, so
3444 * they get preempted easier. That is, if 'se' < 'curr' then
3445 * the resulting gran will be larger, therefore penalizing the
3446 * lighter, if otoh 'se' > 'curr' then the resulting gran will
3447 * be smaller, again penalizing the lighter task.
3449 * This is especially important for buddies when the leftmost
3450 * task is higher priority than the buddy.
3452 return calc_delta_fair(gran, se);
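/*
 * Example of the scaling above (weights taken from the default weight
 * table): with the default 1ms wakeup granularity, a nice-0 'se' (weight
 * 1024) keeps gran at ~1ms, while a nice+5 'se' (weight 335) has
 * calc_delta_fair() inflate gran to roughly 1024/335 ~= 3ms of virtual
 * time, making it harder for the lighter task to preempt, exactly as the
 * comment above describes.
 */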
3456 * Should 'se' preempt 'curr'.
3470 wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
3472 s64 gran, vdiff = curr->vruntime - se->vruntime;
3477 gran = wakeup_gran(curr, se);
3484 static void set_last_buddy(struct sched_entity *se)
3486 if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
3489 for_each_sched_entity(se)
3490 cfs_rq_of(se)->last = se;
3493 static void set_next_buddy(struct sched_entity *se)
3495 if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
3498 for_each_sched_entity(se)
3499 cfs_rq_of(se)->next = se;
3502 static void set_skip_buddy(struct sched_entity *se)
3504 for_each_sched_entity(se)
3505 cfs_rq_of(se)->skip = se;
3509 * Preempt the current task with a newly woken task if needed:
3511 static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
3513 struct task_struct *curr = rq->curr;
3514 struct sched_entity *se = &curr->se, *pse = &p->se;
3515 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
3516 int scale = cfs_rq->nr_running >= sched_nr_latency;
3517 int next_buddy_marked = 0;
3519 if (unlikely(se == pse))
3523 * This is possible from callers such as move_task(), in which we
3524 * unconditionally check_preempt_curr() after an enqueue (which may have
3525 * led to a throttle). This both saves work and prevents false
3526 * next-buddy nomination below.
3528 if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
3531 if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
3532 set_next_buddy(pse);
3533 next_buddy_marked = 1;
3537 * We can come here with TIF_NEED_RESCHED already set from new task
3540 * Note: this also catches the edge-case of curr being in a throttled
3541 * group (e.g. via set_curr_task), since update_curr() (in the
3542 * enqueue of curr) will have resulted in resched being set. This
3543 * prevents us from potentially nominating it as a false LAST_BUDDY
3546 if (test_tsk_need_resched(curr))
3549 /* Idle tasks are by definition preempted by non-idle tasks. */
3550 if (unlikely(curr->policy == SCHED_IDLE) &&
3551 likely(p->policy != SCHED_IDLE))
3555 * Batch and idle tasks do not preempt non-idle tasks (their preemption
3556 * is driven by the tick):
3558 if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION))
3561 find_matching_se(&se, &pse);
3562 update_curr(cfs_rq_of(se));
3564 if (wakeup_preempt_entity(se, pse) == 1) {
3566 * Bias pick_next to pick the sched entity that is
3567 * triggering this preemption.
3569 if (!next_buddy_marked)
3570 set_next_buddy(pse);
3579 * Only set the backward buddy when the current task is still
3580 * on the rq. This can happen when a wakeup gets interleaved
3581 * with schedule on the ->pre_schedule() or idle_balance()
3582 * point, either of which can drop the rq lock.
3584 * Also, during early boot the idle thread is in the fair class,
3585 * for obvious reasons it's a bad idea to schedule back to it.
3587 if (unlikely(!se->on_rq || curr == rq->idle))
3590 if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
3594 static struct task_struct *pick_next_task_fair(struct rq *rq)
3596 struct task_struct *p;
3597 struct cfs_rq *cfs_rq = &rq->cfs;
3598 struct sched_entity *se;
3600 if (!cfs_rq->nr_running)
3604 se = pick_next_entity(cfs_rq);
3605 set_next_entity(cfs_rq, se);
3606 cfs_rq = group_cfs_rq(se);
3610 if (hrtick_enabled(rq))
3611 hrtick_start_fair(rq, p);
3617 * Account for a descheduled task:
3619 static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
3621 struct sched_entity *se = &prev->se;
3622 struct cfs_rq *cfs_rq;
3624 for_each_sched_entity(se) {
3625 cfs_rq = cfs_rq_of(se);
3626 put_prev_entity(cfs_rq, se);
3631 * sched_yield() is very simple
3633 * The magic of dealing with the ->skip buddy is in pick_next_entity.
3635 static void yield_task_fair(struct rq *rq)
3637 struct task_struct *curr = rq->curr;
3638 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
3639 struct sched_entity *se = &curr->se;
3642 * Are we the only task in the tree?
3644 if (unlikely(rq->nr_running == 1))
3647 clear_buddies(cfs_rq, se);
3649 if (curr->policy != SCHED_BATCH) {
3650 update_rq_clock(rq);
3652 * Update run-time statistics of the 'current'.
3654 update_curr(cfs_rq);
3656 * Tell update_rq_clock() that we've just updated,
3657 * so we don't do microscopic update in schedule()
3658 * and double the fastpath cost.
3660 rq->skip_clock_update = 1;
3666 static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt)
3668 struct sched_entity *se = &p->se;
3670 /* throttled hierarchies are not runnable */
3671 if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se)))
3674 /* Tell the scheduler that we'd really like pse to run next. */
3677 yield_task_fair(rq);
3683 /**************************************************
3684 * Fair scheduling class load-balancing methods.
3688 * The purpose of load-balancing is to achieve the same basic fairness the
3689 * per-cpu scheduler provides, namely provide a proportional amount of compute
3690 * time to each task. This is expressed in the following equation:
3692 * W_i,n/P_i == W_j,n/P_j for all i,j (1)
3694 * Where W_i,n is the n-th weight average for cpu i. The instantaneous weight
3695 * W_i,0 is defined as:
3697 * W_i,0 = \Sum_j w_i,j (2)
3699 * Where w_i,j is the weight of the j-th runnable task on cpu i. This weight
3700 * is derived from the nice value as per prio_to_weight[].
3702 * The weight average is an exponential decay average of the instantaneous
3705 * W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0 (3)
3707 * P_i is the cpu power (or compute capacity) of cpu i, typically it is the
3708 * fraction of 'recent' time available for SCHED_OTHER task execution. But it
3709 * can also include other factors [XXX].
3711 * To achieve this balance we define a measure of imbalance which follows
3712 * directly from (1):
3714 * imb_i,j = max{ avg(W/P), W_i/P_i } - min{ avg(W/P), W_j/P_j } (4)
3716 * We then move tasks around to minimize the imbalance. In the continuous
3717 * function space it is obvious this converges, in the discrete case we get
3718 * a few fun cases generally called infeasible weight scenarios.
3721 * - infeasible weights;
3722 * - local vs global optima in the discrete case. ]
3727 * In order to solve the imbalance equation (4), and avoid the obvious O(n^2)
3728 * for all i,j solution, we create a tree of cpus that follows the hardware
3729 * topology where each level pairs two lower groups (or better). This results
3730 * in O(log n) layers. Furthermore we reduce the number of cpus going up the
3731 * tree to only the first of the previous level and we decrease the frequency
3732 * of load-balance at each level inv. proportional to the number of cpus in
3738 *   \Sum_{i = 0}^{log_2 n}  (1/2^i) * (n/2^i) * 2^i  =  O(n)           (5)
3740 *       where, at level i, 1/2^i is the frequency of load-balancing,
3741 *       n/2^i is the number of cpus doing load-balance, and 2^i is the
3743 *       size of each group; the sum runs over all levels of the tree.
3745 * Coupled with a limit on how many tasks we can migrate every balance pass,
3746 * this makes (5) the runtime complexity of the balancer.
3748 * An important property here is that each CPU is still (indirectly) connected
3749 * to every other cpu in at most O(log n) steps:
3751 * The adjacency matrix of the resulting graph is given by:
3754 *   A_i,j = \Union_{k = 0}^{log_2 n} (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1)   (6)
3757 * And you'll find that:
3759 * A^(log_2 n)_i,j != 0 for all i,j (7)
3761 * Showing there's indeed a path between every cpu in at most O(log n) steps.
3762 * The task movement gives a factor of O(m), giving a convergence complexity
3765 * O(nm log n), n := nr_cpus, m := nr_tasks (8)
3770 * In order to avoid CPUs going idle while there's still work to do, new idle
3771 * balancing is more aggressive and has the newly idle cpu iterate up the domain
3772 * tree itself instead of relying on other CPUs to bring it work.
3774 * This adds some complexity to both (5) and (8) but it reduces the total idle
3782 * Cgroups make a horror show out of (2), instead of a simple sum we get:
3785 *   W_i,0 = \Sum_j \Prod_k w_k * (s_k,i / S_k)                          (9)
3790 * s_k,i = \Sum_j w_i,j,k and S_k = \Sum_i s_k,i (10)
3792 * w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on cpu i.
3794 * The big problem is S_k, it's a global sum needed to compute a local (W_i)
3797 * [XXX write more on how we solve this.. _after_ merging pjt's patches that
3798 * rewrite all of this once again.]
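/*
 * Worked example of (4) with made-up numbers: two cpus with W_0/P_0 = 6
 * and W_1/P_1 = 2 give avg(W/P) = 4, so imb_0,1 = max(4, 6) - min(4, 2)
 * = 6 - 2 = 4; moving weight from cpu 0 towards cpu 1 until both sit at
 * the average drives the imbalance to 0, which is what the move_tasks() /
 * find_busiest_group() machinery below approximates in discrete steps.
 */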
3801 static unsigned long __read_mostly max_load_balance_interval = HZ/10;
3803 #define LBF_ALL_PINNED 0x01
3804 #define LBF_NEED_BREAK 0x02
3805 #define LBF_SOME_PINNED 0x04
3808 struct sched_domain *sd;
3816 struct cpumask *dst_grpmask;
3818 enum cpu_idle_type idle;
3820 /* The set of CPUs under consideration for load-balancing */
3821 struct cpumask *cpus;
3826 unsigned int loop_break;
3827 unsigned int loop_max;
3831 * move_task - move a task from one runqueue to another runqueue.
3832 * Both runqueues must be locked.
3834 static void move_task(struct task_struct *p, struct lb_env *env)
3836 deactivate_task(env->src_rq, p, 0);
3837 set_task_cpu(p, env->dst_cpu);
3838 activate_task(env->dst_rq, p, 0);
3839 check_preempt_curr(env->dst_rq, p, 0);
3843 * Is this task likely cache-hot:
3846 task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
3850 if (p->sched_class != &fair_sched_class)
3853 if (unlikely(p->policy == SCHED_IDLE))
3857 * Buddy candidates are cache hot:
3859 if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running &&
3860 (&p->se == cfs_rq_of(&p->se)->next ||
3861 &p->se == cfs_rq_of(&p->se)->last))
3864 if (sysctl_sched_migration_cost == -1)
3866 if (sysctl_sched_migration_cost == 0)
3869 delta = now - p->se.exec_start;
3871 return delta < (s64)sysctl_sched_migration_cost;
3875 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
3878 int can_migrate_task(struct task_struct *p, struct lb_env *env)
3880 int tsk_cache_hot = 0;
3882 * We do not migrate tasks that are:
3883 * 1) running (obviously), or
3884 * 2) cannot be migrated to this CPU due to cpus_allowed, or
3885 * 3) are cache-hot on their current CPU.
3887 if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
3890 schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
3893 * Remember if this task can be migrated to any other cpu in
3894 * our sched_group. We may want to revisit it if we couldn't
3895 * meet load balance goals by pulling other tasks on src_cpu.
3897 * Also avoid computing new_dst_cpu if we have already computed
3898 * one in current iteration.
3900 if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED))
3903 new_dst_cpu = cpumask_first_and(env->dst_grpmask,
3904 tsk_cpus_allowed(p));
3905 if (new_dst_cpu < nr_cpu_ids) {
3906 env->flags |= LBF_SOME_PINNED;
3907 env->new_dst_cpu = new_dst_cpu;
3912 /* Record that we found at least one task that could run on dst_cpu */
3913 env->flags &= ~LBF_ALL_PINNED;
3915 if (task_running(env->src_rq, p)) {
3916 schedstat_inc(p, se.statistics.nr_failed_migrations_running);
3921 * Aggressive migration if:
3922 * 1) task is cache cold, or
3923 * 2) too many balance attempts have failed.
3926 tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd);
3927 if (!tsk_cache_hot ||
3928 env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
3929 #ifdef CONFIG_SCHEDSTATS
3930 if (tsk_cache_hot) {
3931 schedstat_inc(env->sd, lb_hot_gained[env->idle]);
3932 schedstat_inc(p, se.statistics.nr_forced_migrations);
3938 if (tsk_cache_hot) {
3939 schedstat_inc(p, se.statistics.nr_failed_migrations_hot);
3946 * move_one_task tries to move exactly one task from busiest to this_rq, as
3947 * part of active balancing operations within "domain".
3948 * Returns 1 if successful and 0 otherwise.
3950 * Called with both runqueues locked.
3952 static int move_one_task(struct lb_env *env)
3954 struct task_struct *p, *n;
3956 list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
3957 if (throttled_lb_pair(task_group(p), env->src_rq->cpu, env->dst_cpu))
3960 if (!can_migrate_task(p, env))
3965 * Right now, this is only the second place move_task()
3966 * is called, so we can safely collect move_task()
3967 * stats here rather than inside move_task().
3969 schedstat_inc(env->sd, lb_gained[env->idle]);
3975 static unsigned long task_h_load(struct task_struct *p);
3977 static const unsigned int sched_nr_migrate_break = 32;
3980 * move_tasks tries to move up to imbalance weighted load from busiest to
3981 * this_rq, as part of a balancing operation within domain "sd".
3982 * Returns 1 if successful and 0 otherwise.
3984 * Called with both runqueues locked.
3986 static int move_tasks(struct lb_env *env)
3988 struct list_head *tasks = &env->src_rq->cfs_tasks;
3989 struct task_struct *p;
3993 if (env->imbalance <= 0)
3996 while (!list_empty(tasks)) {
3997 p = list_first_entry(tasks, struct task_struct, se.group_node);
4000 /* We've more or less seen every task there is, call it quits */
4001 if (env->loop > env->loop_max)
4004 /* take a breather every nr_migrate tasks */
4005 if (env->loop > env->loop_break) {
4006 env->loop_break += sched_nr_migrate_break;
4007 env->flags |= LBF_NEED_BREAK;
4011 if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
4014 load = task_h_load(p);
4016 if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
4019 if ((load / 2) > env->imbalance)
4022 if (!can_migrate_task(p, env))
4027 env->imbalance -= load;
4029 #ifdef CONFIG_PREEMPT
4031 * NEWIDLE balancing is a source of latency, so preemptible
4032 * kernels will stop after the first task is pulled to minimize
4033 * the critical section.
4035 if (env->idle == CPU_NEWLY_IDLE)
4040 * We only want to steal up to the prescribed amount of
4043 if (env->imbalance <= 0)
4048 list_move_tail(&p->se.group_node, tasks);
4052 * Right now, this is one of only two places move_task() is called,
4053 * so we can safely collect move_task() stats here rather than
4054 * inside move_task().
4056 schedstat_add(env->sd, lb_gained[env->idle], pulled);
4061 #ifdef CONFIG_FAIR_GROUP_SCHED
4063 * update tg->load_weight by folding this cpu's load_avg
4065 static void __update_blocked_averages_cpu(struct task_group *tg, int cpu)
4067 struct sched_entity *se = tg->se[cpu];
4068 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu];
4070 /* throttled entities do not contribute to load */
4071 if (throttled_hierarchy(cfs_rq))
4074 update_cfs_rq_blocked_load(cfs_rq, 1);
4077 update_entity_load_avg(se, 1);
4079 * We pivot on our runnable average having decayed to zero for
4080 * list removal. This generally implies that all our children
4081 * have also been removed (modulo rounding error or bandwidth
4082 * control); however, such cases are rare and we can fix these
4085 * TODO: fix up out-of-order children on enqueue.
4087 if (!se->avg.runnable_avg_sum && !cfs_rq->nr_running)
4088 list_del_leaf_cfs_rq(cfs_rq);
4090 struct rq *rq = rq_of(cfs_rq);
4091 update_rq_runnable_avg(rq, rq->nr_running);
4095 static void update_blocked_averages(int cpu)
4097 struct rq *rq = cpu_rq(cpu);
4098 struct cfs_rq *cfs_rq;
4099 unsigned long flags;
4101 raw_spin_lock_irqsave(&rq->lock, flags);
4102 update_rq_clock(rq);
4104 * Iterates the task_group tree in a bottom up fashion, see
4105 * list_add_leaf_cfs_rq() for details.
4107 for_each_leaf_cfs_rq(rq, cfs_rq) {
4109 * Note: We may want to consider periodically releasing
4110 * rq->lock about these updates so that creating many task
4111 * groups does not result in continually extending hold time.
4113 __update_blocked_averages_cpu(cfs_rq->tg, rq->cpu);
4116 raw_spin_unlock_irqrestore(&rq->lock, flags);
4120 * Compute the cpu's hierarchical load factor for each task group.
4121 * This needs to be done in a top-down fashion because the load of a child
4122 * group is a fraction of its parents load.
4124 static int tg_load_down(struct task_group *tg, void *data)
4127 long cpu = (long)data;
4130 load = cpu_rq(cpu)->load.weight;
4132 load = tg->parent->cfs_rq[cpu]->h_load;
4133 load *= tg->se[cpu]->load.weight;
4134 load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
4137 tg->cfs_rq[cpu]->h_load = load;
4142 static void update_h_load(long cpu)
4144 struct rq *rq = cpu_rq(cpu);
4145 unsigned long now = jiffies;
4147 if (rq->h_load_throttle == now)
4150 rq->h_load_throttle = now;
4153 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
4157 static unsigned long task_h_load(struct task_struct *p)
4159 struct cfs_rq *cfs_rq = task_cfs_rq(p);
4162 load = p->se.load.weight;
4163 load = div_u64(load * cfs_rq->h_load, cfs_rq->load.weight + 1);
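/*
 * Illustrative numbers for the h_load composition above: if the task's
 * group contributes h_load = 512 at the root, the group's cfs_rq has a
 * total load.weight of 2048 and the task itself weighs 1024, the task's
 * hierarchical load is 1024 * 512 / (2048 + 1) ~= 256, i.e. half of the
 * group's root-level share, matching the task's half of the group weight.
 */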
4168 static inline void update_blocked_averages(int cpu)
4172 static inline void update_h_load(long cpu)
4176 static unsigned long task_h_load(struct task_struct *p)
4178 return p->se.load.weight;
4182 /********** Helpers for find_busiest_group ************************/
4184 * sd_lb_stats - Structure to store the statistics of a sched_domain
4185 * during load balancing.
4187 struct sd_lb_stats {
4188 struct sched_group *busiest; /* Busiest group in this sd */
4189 struct sched_group *this; /* Local group in this sd */
4190 unsigned long total_load; /* Total load of all groups in sd */
4191 unsigned long total_pwr; /* Total power of all groups in sd */
4192 unsigned long avg_load; /* Average load across all groups in sd */
4194 /** Statistics of this group */
4195 unsigned long this_load;
4196 unsigned long this_load_per_task;
4197 unsigned long this_nr_running;
4198 unsigned long this_has_capacity;
4199 unsigned int this_idle_cpus;
4201 /* Statistics of the busiest group */
4202 unsigned int busiest_idle_cpus;
4203 unsigned long max_load;
4204 unsigned long busiest_load_per_task;
4205 unsigned long busiest_nr_running;
4206 unsigned long busiest_group_capacity;
4207 unsigned long busiest_has_capacity;
4208 unsigned int busiest_group_weight;
4210 int group_imb; /* Is there imbalance in this sd */
4214 * sg_lb_stats - stats of a sched_group required for load_balancing
4216 struct sg_lb_stats {
4217 unsigned long avg_load; /*Avg load across the CPUs of the group */
4218 unsigned long group_load; /* Total load over the CPUs of the group */
4219 unsigned long sum_nr_running; /* Nr tasks running in the group */
4220 unsigned long sum_weighted_load; /* Weighted load of group's tasks */
4221 unsigned long group_capacity;
4222 unsigned long idle_cpus;
4223 unsigned long group_weight;
4224 int group_imb; /* Is there an imbalance in the group ? */
4225 int group_has_capacity; /* Is there extra capacity in the group? */
4229 * get_sd_load_idx - Obtain the load index for a given sched domain.
4230 * @sd: The sched_domain whose load_idx is to be obtained.
4231 * @idle: The idle status of the CPU for whose sd load_idx is obtained.
4233 static inline int get_sd_load_idx(struct sched_domain *sd,
4234 enum cpu_idle_type idle)
4240 load_idx = sd->busy_idx;
4243 case CPU_NEWLY_IDLE:
4244 load_idx = sd->newidle_idx;
4247 load_idx = sd->idle_idx;
4254 unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
4256 return SCHED_POWER_SCALE;
4259 unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
4261 return default_scale_freq_power(sd, cpu);
4264 unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
4266 unsigned long weight = sd->span_weight;
4267 unsigned long smt_gain = sd->smt_gain;
4274 unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
4276 return default_scale_smt_power(sd, cpu);
4279 unsigned long scale_rt_power(int cpu)
4281 struct rq *rq = cpu_rq(cpu);
4282 u64 total, available, age_stamp, avg;
4285 * Since we're reading these variables without serialization make sure
4286 * we read them once before doing sanity checks on them.
4288 age_stamp = ACCESS_ONCE(rq->age_stamp);
4289 avg = ACCESS_ONCE(rq->rt_avg);
4291 total = sched_avg_period() + (rq->clock - age_stamp);
4293 if (unlikely(total < avg)) {
4294 /* Ensures that power won't end up being negative */
4297 available = total - avg;
4300 if (unlikely((s64)total < SCHED_POWER_SCALE))
4301 total = SCHED_POWER_SCALE;
4303 total >>= SCHED_POWER_SHIFT;
4305 return div_u64(available, total);
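/*
 * Example (hypothetical): if RT tasks and IRQs consumed ~25% of the recent
 * averaging window, available/total ~= 3/4 and, with total pre-shifted by
 * SCHED_POWER_SHIFT, the function returns ~768 out of SCHED_POWER_SCALE
 * (1024), which update_cpu_power() below folds into cpu_power.
 */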
4308 static void update_cpu_power(struct sched_domain *sd, int cpu)
4310 unsigned long weight = sd->span_weight;
4311 unsigned long power = SCHED_POWER_SCALE;
4312 struct sched_group *sdg = sd->groups;
4314 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
4315 if (sched_feat(ARCH_POWER))
4316 power *= arch_scale_smt_power(sd, cpu);
4318 power *= default_scale_smt_power(sd, cpu);
4320 power >>= SCHED_POWER_SHIFT;
4323 sdg->sgp->power_orig = power;
4325 if (sched_feat(ARCH_POWER))
4326 power *= arch_scale_freq_power(sd, cpu);
4328 power *= default_scale_freq_power(sd, cpu);
4330 power >>= SCHED_POWER_SHIFT;
4332 power *= scale_rt_power(cpu);
4333 power >>= SCHED_POWER_SHIFT;
4338 cpu_rq(cpu)->cpu_power = power;
4339 sdg->sgp->power = power;
4342 void update_group_power(struct sched_domain *sd, int cpu)
4344 struct sched_domain *child = sd->child;
4345 struct sched_group *group, *sdg = sd->groups;
4346 unsigned long power;
4347 unsigned long interval;
4349 interval = msecs_to_jiffies(sd->balance_interval);
4350 interval = clamp(interval, 1UL, max_load_balance_interval);
4351 sdg->sgp->next_update = jiffies + interval;
4354 update_cpu_power(sd, cpu);
4360 if (child->flags & SD_OVERLAP) {
4362 * SD_OVERLAP domains cannot assume that child groups
4363 * span the current group.
4366 for_each_cpu(cpu, sched_group_cpus(sdg))
4367 power += power_of(cpu);
4370 * !SD_OVERLAP domains can assume that child groups
4371 * span the current group.
4374 group = child->groups;
4376 power += group->sgp->power;
4377 group = group->next;
4378 } while (group != child->groups);
4381 sdg->sgp->power_orig = sdg->sgp->power = power;
4385 * Try and fix up capacity for tiny siblings, this is needed when
4386 * things like SD_ASYM_PACKING need f_b_g to select another sibling
4387 * which on its own isn't powerful enough.
4389 * See update_sd_pick_busiest() and check_asym_packing().
4392 fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
4395 * Only siblings can have significantly less than SCHED_POWER_SCALE
4397 if (!(sd->flags & SD_SHARE_CPUPOWER))
4401 * If ~90% of the cpu_power is still there, we're good.
4403 if (group->sgp->power * 32 > group->sgp->power_orig * 29)
4410 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
4411 * @env: The load balancing environment.
4412 * @group: sched_group whose statistics are to be updated.
4413 * @load_idx: Load index of sched_domain of this_cpu for load calc.
4414 * @local_group: Does group contain this_cpu.
4415 * @balance: Should we balance.
4416 * @sgs: variable to hold the statistics for this group.
4418 static inline void update_sg_lb_stats(struct lb_env *env,
4419 struct sched_group *group, int load_idx,
4420 int local_group, int *balance, struct sg_lb_stats *sgs)
4422 unsigned long nr_running, max_nr_running, min_nr_running;
4423 unsigned long load, max_cpu_load, min_cpu_load;
4424 unsigned int balance_cpu = -1, first_idle_cpu = 0;
4425 unsigned long avg_load_per_task = 0;
4429 balance_cpu = group_balance_cpu(group);
4431 /* Tally up the load of all CPUs in the group */
4433 min_cpu_load = ~0UL;
4435 min_nr_running = ~0UL;
4437 for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
4438 struct rq *rq = cpu_rq(i);
4440 nr_running = rq->nr_running;
4442 /* Bias balancing toward cpus of our domain */
4444 if (idle_cpu(i) && !first_idle_cpu &&
4445 cpumask_test_cpu(i, sched_group_mask(group))) {
4450 load = target_load(i, load_idx);
4452 load = source_load(i, load_idx);
4453 if (load > max_cpu_load)
4454 max_cpu_load = load;
4455 if (min_cpu_load > load)
4456 min_cpu_load = load;
4458 if (nr_running > max_nr_running)
4459 max_nr_running = nr_running;
4460 if (min_nr_running > nr_running)
4461 min_nr_running = nr_running;
4464 sgs->group_load += load;
4465 sgs->sum_nr_running += nr_running;
4466 sgs->sum_weighted_load += weighted_cpuload(i);
4472 * First idle cpu or the first cpu(busiest) in this sched group
4473 * is eligible for doing load balancing at this and above
4474 * domains. In the newly idle case, we will allow all the cpus
4475 * to do the newly idle load balance.
4478 if (env->idle != CPU_NEWLY_IDLE) {
4479 if (balance_cpu != env->dst_cpu) {
4483 update_group_power(env->sd, env->dst_cpu);
4484 } else if (time_after_eq(jiffies, group->sgp->next_update))
4485 update_group_power(env->sd, env->dst_cpu);
4488 /* Adjust by relative CPU power of the group */
4489 sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / group->sgp->power;
4492 * Consider the group unbalanced when the imbalance is larger
4493 * than the average weight of a task.
4495 * APZ: with cgroup the avg task weight can vary wildly and
4496 * might not be a suitable number - should we keep a
4497 * normalized nr_running number somewhere that negates
4500 if (sgs->sum_nr_running)
4501 avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
4503 if ((max_cpu_load - min_cpu_load) >= avg_load_per_task &&
4504 (max_nr_running - min_nr_running) > 1)
4507 sgs->group_capacity = DIV_ROUND_CLOSEST(group->sgp->power,
4509 if (!sgs->group_capacity)
4510 sgs->group_capacity = fix_small_capacity(env->sd, group);
4511 sgs->group_weight = group->group_weight;
4513 if (sgs->group_capacity > sgs->sum_nr_running)
4514 sgs->group_has_capacity = 1;
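/*
 * Illustration of the scaling above (made-up numbers): a 2-cpu group with
 * group_load = 3072 and sgp->power = 2048 (two full-power cpus) yields
 * avg_load = 3072 * 1024 / 2048 = 1536, directly comparable with any other
 * group's avg_load regardless of its size; group_capacity rounds
 * power / SCHED_POWER_SCALE to 2, so the same group reports spare capacity
 * only while sum_nr_running < 2.
 */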
4518 * update_sd_pick_busiest - return 1 on busiest group
4519 * @env: The load balancing environment.
4520 * @sds: sched_domain statistics
4521 * @sg: sched_group candidate to be checked for being the busiest
4522 * @sgs: sched_group statistics
4524 * Determine if @sg is a busier group than the previously selected
4527 static bool update_sd_pick_busiest(struct lb_env *env,
4528 struct sd_lb_stats *sds,
4529 struct sched_group *sg,
4530 struct sg_lb_stats *sgs)
4532 if (sgs->avg_load <= sds->max_load)
4535 if (sgs->sum_nr_running > sgs->group_capacity)
4542 * ASYM_PACKING needs to move all the work to the lowest
4543 * numbered CPUs in the group, therefore mark all groups
4544 * higher than ourself as busy.
4546 if ((env->sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running &&
4547 env->dst_cpu < group_first_cpu(sg)) {
4551 if (group_first_cpu(sds->busiest) > group_first_cpu(sg))
4559 * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
4560 * @env: The load balancing environment.
4561 * @balance: Should we balance.
4562 * @sds: variable to hold the statistics for this sched_domain.
4564 static inline void update_sd_lb_stats(struct lb_env *env,
4565 int *balance, struct sd_lb_stats *sds)
4567 struct sched_domain *child = env->sd->child;
4568 struct sched_group *sg = env->sd->groups;
4569 struct sg_lb_stats sgs;
4570 int load_idx, prefer_sibling = 0;
4572 if (child && child->flags & SD_PREFER_SIBLING)
4575 load_idx = get_sd_load_idx(env->sd, env->idle);
4580 local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg));
4581 memset(&sgs, 0, sizeof(sgs));
4582 update_sg_lb_stats(env, sg, load_idx, local_group, balance, &sgs);
4584 if (local_group && !(*balance))
4587 sds->total_load += sgs.group_load;
4588 sds->total_pwr += sg->sgp->power;
4591 * In case the child domain prefers tasks go to siblings
4592 * first, lower the sg capacity to one so that we'll try
4593 * and move all the excess tasks away. We lower the capacity
4594 * of a group only if the local group has the capacity to fit
4595 * these excess tasks, i.e. nr_running < group_capacity. The
4596 * extra check prevents the case where you always pull from the
4597 * heaviest group when it is already under-utilized (possible
4598 * when a large-weight task outweighs the rest of the tasks on the system).
4600 if (prefer_sibling && !local_group && sds->this_has_capacity)
4601 sgs.group_capacity = min(sgs.group_capacity, 1UL);
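/*
 * Sketch of the effect: with SD_PREFER_SIBLING set in the child domain, a
 * remote group that would normally report capacity 2 is clamped to 1, so any
 * second task on it counts as excess and tends to get pulled towards the
 * local group - provided the local group actually has room to take it
 * (this_has_capacity).
 */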
4604 sds->this_load = sgs.avg_load;
4606 sds->this_nr_running = sgs.sum_nr_running;
4607 sds->this_load_per_task = sgs.sum_weighted_load;
4608 sds->this_has_capacity = sgs.group_has_capacity;
4609 sds->this_idle_cpus = sgs.idle_cpus;
4610 } else if (update_sd_pick_busiest(env, sds, sg, &sgs)) {
4611 sds->max_load = sgs.avg_load;
4613 sds->busiest_nr_running = sgs.sum_nr_running;
4614 sds->busiest_idle_cpus = sgs.idle_cpus;
4615 sds->busiest_group_capacity = sgs.group_capacity;
4616 sds->busiest_load_per_task = sgs.sum_weighted_load;
4617 sds->busiest_has_capacity = sgs.group_has_capacity;
4618 sds->busiest_group_weight = sgs.group_weight;
4619 sds->group_imb = sgs.group_imb;
4623 } while (sg != env->sd->groups);
4627 * check_asym_packing - Check to see if the group is packed into the
4630 * This is primarily intended to be used at the sibling level. Some
4631 * cores like POWER7 prefer to use lower numbered SMT threads. In the
4632 * case of POWER7, it can move to lower SMT modes only when higher
4633 * threads are idle. When in lower SMT modes, the threads will
4634 * perform better since they share fewer core resources. Hence when we
4635 * have idle threads, we want them to be the higher ones.
4637 * This packing function is run on idle threads. It checks to see if
4638 * the busiest CPU in this domain (core in the P7 case) has a higher
4639 * CPU number than the packing function is being run on. Here we are
4640 * assuming a lower CPU number is equivalent to a lower SMT thread number.
4643 * Returns 1 when packing is required and a task should be moved to
4644 * this CPU. The amount of the imbalance is returned in *imbalance.
4646 * @env: The load balancing environment.
4647 * @sds: Statistics of the sched_domain which is to be packed
4649 static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
4653 if (!(env->sd->flags & SD_ASYM_PACKING))
4659 busiest_cpu = group_first_cpu(sds->busiest);
4660 if (env->dst_cpu > busiest_cpu)
4663 env->imbalance = DIV_ROUND_CLOSEST(
4664 sds->max_load * sds->busiest->sgp->power, SCHED_POWER_SCALE);
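/*
 * Worked example (assuming SCHED_POWER_SCALE == 1024): if the busiest group
 * has max_load == 2048 and sgp->power == 512, the imbalance becomes
 * DIV_ROUND_CLOSEST(2048 * 512, 1024) == 1024, i.e. the power scaling that
 * was applied when avg_load was computed is undone and the value is back in
 * plain weighted-load units.
 */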
4670 * fix_small_imbalance - Calculate the minor imbalance that exists
4671 * amongst the groups of a sched_domain, during load balancing.
4673 * @env: The load balancing environment.
4674 * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
4677 void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
4679 unsigned long tmp, pwr_now = 0, pwr_move = 0;
4680 unsigned int imbn = 2;
4681 unsigned long scaled_busy_load_per_task;
4683 if (sds->this_nr_running) {
4684 sds->this_load_per_task /= sds->this_nr_running;
4685 if (sds->busiest_load_per_task >
4686 sds->this_load_per_task)
4689 sds->this_load_per_task =
4690 cpu_avg_load_per_task(env->dst_cpu);
4693 scaled_busy_load_per_task = sds->busiest_load_per_task
4694 * SCHED_POWER_SCALE;
4695 scaled_busy_load_per_task /= sds->busiest->sgp->power;
4697 if (sds->max_load - sds->this_load + scaled_busy_load_per_task >=
4698 (scaled_busy_load_per_task * imbn)) {
4699 env->imbalance = sds->busiest_load_per_task;
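/*
 * Example of this branch (a sketch using the default NICE_0 weight of 1024
 * and SCHED_POWER_SCALE == 1024): one task on the busiest cpu (max_load ==
 * busiest_load_per_task == 1024), an idle destination (this_load == 0,
 * this_nr_running == 0) and equal group powers give
 * 1024 - 0 + 1024 >= 2 * 1024, so the imbalance is set to one task's load
 * and that task can be moved even though the load averages alone are small.
 */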
4704 * OK, we don't have enough imbalance to justify moving tasks,
4705 * however we may be able to increase total CPU power used by moving them.
4709 pwr_now += sds->busiest->sgp->power *
4710 min(sds->busiest_load_per_task, sds->max_load);
4711 pwr_now += sds->this->sgp->power *
4712 min(sds->this_load_per_task, sds->this_load);
4713 pwr_now /= SCHED_POWER_SCALE;
4715 /* Amount of load we'd subtract */
4716 tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) /
4717 sds->busiest->sgp->power;
4718 if (sds->max_load > tmp)
4719 pwr_move += sds->busiest->sgp->power *
4720 min(sds->busiest_load_per_task, sds->max_load - tmp);
4722 /* Amount of load we'd add */
4723 if (sds->max_load * sds->busiest->sgp->power <
4724 sds->busiest_load_per_task * SCHED_POWER_SCALE)
4725 tmp = (sds->max_load * sds->busiest->sgp->power) /
4726 sds->this->sgp->power;
4728 tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) /
4729 sds->this->sgp->power;
4730 pwr_move += sds->this->sgp->power *
4731 min(sds->this_load_per_task, sds->this_load + tmp);
4732 pwr_move /= SCHED_POWER_SCALE;
4734 /* Move if we gain throughput */
4735 if (pwr_move > pwr_now)
4736 env->imbalance = sds->busiest_load_per_task;
4740 * calculate_imbalance - Calculate the amount of imbalance present within the
4741 * groups of a given sched_domain during load balance.
4742 * @env: load balance environment
4743 * @sds: statistics of the sched_domain whose imbalance is to be calculated.
4745 static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
4747 unsigned long max_pull, load_above_capacity = ~0UL;
4749 sds->busiest_load_per_task /= sds->busiest_nr_running;
4750 if (sds->group_imb) {
4751 sds->busiest_load_per_task =
4752 min(sds->busiest_load_per_task, sds->avg_load);
4756 * In the presence of smp nice balancing, certain scenarios can have
4757 * max load less than avg load (as we skip the groups at or below
4758 * their cpu_power while calculating max_load).
4760 if (sds->max_load < sds->avg_load) {
4762 return fix_small_imbalance(env, sds);
4765 if (!sds->group_imb) {
4767 * Don't want to pull so many tasks that a group would go idle.
4769 load_above_capacity = (sds->busiest_nr_running -
4770 sds->busiest_group_capacity);
4772 load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE);
4774 load_above_capacity /= sds->busiest->sgp->power;
4778 * We're trying to get all the cpus to the average_load, so we don't
4779 * want to push ourselves above the average load, nor do we wish to
4780 * reduce the max loaded cpu below the average load. At the same time,
4781 * we also don't want to reduce the group load below the group capacity
4782 * (so that we can implement power-savings policies etc). Thus we look
4783 * for the minimum possible imbalance.
4784 * Be careful of negative numbers as they'll appear as very large values
4785 * with unsigned longs.
4787 max_pull = min(sds->max_load - sds->avg_load, load_above_capacity);
4789 /* How much load to actually move to equalise the imbalance */
4790 env->imbalance = min(max_pull * sds->busiest->sgp->power,
4791 (sds->avg_load - sds->this_load) * sds->this->sgp->power)
4792 / SCHED_POWER_SCALE;
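/*
 * Worked example (assuming the NICE_0 weight of 1024 and SCHED_LOAD_SCALE ==
 * SCHED_POWER_SCALE == 1024): two groups of power 1024 each, three tasks on
 * one and none on the other. Then max_load == 3072, this_load == 0 and
 * avg_load == 1536; load_above_capacity == (3 - 1) * 1024 == 2048, so
 * max_pull == min(3072 - 1536, 2048) == 1536 and the resulting imbalance is
 * min(1536, 1536) == 1536 - roughly a task and a half, enough to move at
 * least one task.
 */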
4795 * if *imbalance is less than the average load per runnable task
4796 * there is no guarantee that any tasks will be moved, so we'll have
4797 * a think about bumping its value to force at least one task to be moved.
4800 if (env->imbalance < sds->busiest_load_per_task)
4801 return fix_small_imbalance(env, sds);
4805 /******* find_busiest_group() helpers end here *********************/
4808 * find_busiest_group - Returns the busiest group within the sched_domain
4809 * if there is an imbalance. If there isn't an imbalance, and
4810 * the user has opted for power-savings, it returns a group whose
4811 * CPUs can be put to idle by rebalancing those tasks elsewhere, if
4812 * such a group exists.
4814 * Also calculates the amount of weighted load which should be moved
4815 * to restore balance.
4817 * @env: The load balancing environment.
4818 * @balance: Pointer to a variable indicating if this_cpu
4819 * is the appropriate cpu to perform load balancing at this_level.
4821 * Returns: - the busiest group if imbalance exists.
4822 * - If no imbalance and user has opted for power-savings balance,
4823 * return the least loaded group whose CPUs can be
4824 * put to idle by rebalancing its tasks onto our group.
4826 static struct sched_group *
4827 find_busiest_group(struct lb_env *env, int *balance)
4829 struct sd_lb_stats sds;
4831 memset(&sds, 0, sizeof(sds));
4834 * Compute the various statistics relevant for load balancing at this level.
4837 update_sd_lb_stats(env, balance, &sds);
4840 * this_cpu is not the appropriate cpu to perform load balancing at this level.
4846 if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) &&
4847 check_asym_packing(env, &sds))
4850 /* There is no busy sibling group to pull tasks from */
4851 if (!sds.busiest || sds.busiest_nr_running == 0)
4854 sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_pwr;
4857 * If the busiest group is imbalanced, the below checks don't
4858 * work because they assume all things are equal, which typically
4859 * isn't true due to cpus_allowed constraints and the like.
4864 /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
4865 if (env->idle == CPU_NEWLY_IDLE && sds.this_has_capacity &&
4866 !sds.busiest_has_capacity)
4870 * If the local group is more busy than the selected busiest group
4871 * don't try to pull any tasks.
4873 if (sds.this_load >= sds.max_load)
4877 * Don't pull any tasks if this group is already above the domain average load.
4880 if (sds.this_load >= sds.avg_load)
4883 if (env->idle == CPU_IDLE) {
4885 * This cpu is idle. If the busiest group doesn't have more
4886 * tasks than the number of available cpus and there is no
4887 * imbalance between this group and the busiest group
4888 * wrt idle cpus, it is balanced.
4890 if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) &&
4891 sds.busiest_nr_running <= sds.busiest_group_weight)
4895 * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
4896 * imbalance_pct to be conservative.
4898 if (100 * sds.max_load <= env->sd->imbalance_pct * sds.this_load)
4903 /* Looks like there is an imbalance. Compute it */
4904 calculate_imbalance(env, &sds);
4914 * find_busiest_queue - find the busiest runqueue among the cpus in group.
4916 static struct rq *find_busiest_queue(struct lb_env *env,
4917 struct sched_group *group)
4919 struct rq *busiest = NULL, *rq;
4920 unsigned long max_load = 0;
4923 for_each_cpu(i, sched_group_cpus(group)) {
4924 unsigned long power = power_of(i);
4925 unsigned long capacity = DIV_ROUND_CLOSEST(power,
4930 capacity = fix_small_capacity(env->sd, group);
4932 if (!cpumask_test_cpu(i, env->cpus))
4936 wl = weighted_cpuload(i);
4939 * When comparing with imbalance, use weighted_cpuload()
4940 * which is not scaled with the cpu power.
4942 if (capacity && rq->nr_running == 1 && wl > env->imbalance)
4946 * For the load comparisons with the other cpus, consider
4947 * the weighted_cpuload() scaled with the cpu power, so that
4948 * the load can be moved away from the cpu that is potentially
4949 * running at a lower capacity.
4951 wl = (wl * SCHED_POWER_SCALE) / power;
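/*
 * Example of why the scaling matters (assuming SCHED_POWER_SCALE == 1024):
 * two cpus both carrying a weighted load of 1024, one with power 1024 and
 * one with power 512, scale to 1024 and 2048 respectively - the
 * lower-capacity cpu correctly looks busier and is preferred as the source
 * of the pull.
 */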
4953 if (wl > max_load) {
4963 * Max backoff if we encounter pinned tasks. The value is fairly
4964 * arbitrary, as long as it is large enough.
4966 #define MAX_PINNED_INTERVAL 512
4968 /* Working cpumask for load_balance and load_balance_newidle. */
4969 DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
4971 static int need_active_balance(struct lb_env *env)
4973 struct sched_domain *sd = env->sd;
4975 if (env->idle == CPU_NEWLY_IDLE) {
4978 * ASYM_PACKING needs to force migrate tasks from busy but
4979 * higher numbered CPUs in order to pack all tasks in the
4980 * lowest numbered CPUs.
4982 if ((sd->flags & SD_ASYM_PACKING) && env->src_cpu > env->dst_cpu)
4986 return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
4989 static int active_load_balance_cpu_stop(void *data);
4992 * Check this_cpu to ensure it is balanced within domain. Attempt to move
4993 * tasks if there is an imbalance.
4995 static int load_balance(int this_cpu, struct rq *this_rq,
4996 struct sched_domain *sd, enum cpu_idle_type idle,
4999 int ld_moved, cur_ld_moved, active_balance = 0;
5000 int lb_iterations, max_lb_iterations;
5001 struct sched_group *group;
5003 unsigned long flags;
5004 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
5006 struct lb_env env = {
5008 .dst_cpu = this_cpu,
5010 .dst_grpmask = sched_group_cpus(sd->groups),
5012 .loop_break = sched_nr_migrate_break,
5016 cpumask_copy(cpus, cpu_active_mask);
5017 max_lb_iterations = cpumask_weight(env.dst_grpmask);
5019 schedstat_inc(sd, lb_count[idle]);
5022 group = find_busiest_group(&env, balance);
5028 schedstat_inc(sd, lb_nobusyg[idle]);
5032 busiest = find_busiest_queue(&env, group);
5034 schedstat_inc(sd, lb_nobusyq[idle]);
5038 BUG_ON(busiest == env.dst_rq);
5040 schedstat_add(sd, lb_imbalance[idle], env.imbalance);
5044 if (busiest->nr_running > 1) {
5046 * Attempt to move tasks. If find_busiest_group has found
5047 * an imbalance but busiest->nr_running <= 1, the group is
5048 * still unbalanced. ld_moved simply stays zero, so it is
5049 * correctly treated as an imbalance.
5051 env.flags |= LBF_ALL_PINNED;
5052 env.src_cpu = busiest->cpu;
5053 env.src_rq = busiest;
5054 env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
5056 update_h_load(env.src_cpu);
5058 local_irq_save(flags);
5059 double_rq_lock(env.dst_rq, busiest);
5062 * cur_ld_moved - load moved in current iteration
5063 * ld_moved - cumulative load moved across iterations
5065 cur_ld_moved = move_tasks(&env);
5066 ld_moved += cur_ld_moved;
5067 double_rq_unlock(env.dst_rq, busiest);
5068 local_irq_restore(flags);
5070 if (env.flags & LBF_NEED_BREAK) {
5071 env.flags &= ~LBF_NEED_BREAK;
5076 * some other cpu did the load balance for us.
5078 if (cur_ld_moved && env.dst_cpu != smp_processor_id())
5079 resched_cpu(env.dst_cpu);
5082 * Revisit (affine) tasks on src_cpu that couldn't be moved to
5083 * us and move them to an alternate dst_cpu in our sched_group
5084 * where they can run. The upper limit on how many times we
5085 * iterate on the same src_cpu depends on the number of cpus in our sched_group.
5088 * This changes load balance semantics a bit on who can move
5089 * load to a given_cpu. In addition to the given_cpu itself
5090 * (or an ilb_cpu acting on its behalf where given_cpu is
5091 * nohz-idle), we now have balance_cpu in a position to move
5092 * load to given_cpu. In rare situations, this may cause
5093 * conflicts (balance_cpu and given_cpu/ilb_cpu deciding
5094 * _independently_ and at the _same_ time to move some load to
5095 * given_cpu) causing excess load to be moved to given_cpu.
5096 * This however should not happen often in practice and
5097 * moreover subsequent load balance cycles should correct the
5098 * excess load moved.
5100 if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0 &&
5101 lb_iterations++ < max_lb_iterations) {
5103 env.dst_rq = cpu_rq(env.new_dst_cpu);
5104 env.dst_cpu = env.new_dst_cpu;
5105 env.flags &= ~LBF_SOME_PINNED;
5107 env.loop_break = sched_nr_migrate_break;
5109 * Go back to "more_balance" rather than "redo" since we
5110 * need to continue with the same src_cpu.
5115 /* All tasks on this runqueue were pinned by CPU affinity */
5116 if (unlikely(env.flags & LBF_ALL_PINNED)) {
5117 cpumask_clear_cpu(cpu_of(busiest), cpus);
5118 if (!cpumask_empty(cpus)) {
5120 env.loop_break = sched_nr_migrate_break;
5128 schedstat_inc(sd, lb_failed[idle]);
5130 * Increment the failure counter only on periodic balance.
5131 * We do not want newidle balance, which can be very
5132 * frequent, to pollute the failure counter, causing
5133 * excessive cache_hot migrations and active balances.
5135 if (idle != CPU_NEWLY_IDLE)
5136 sd->nr_balance_failed++;
5138 if (need_active_balance(&env)) {
5139 raw_spin_lock_irqsave(&busiest->lock, flags);
5141 /* don't kick the active_load_balance_cpu_stop,
5142 * if the curr task on busiest cpu can't be moved to this_cpu.
5145 if (!cpumask_test_cpu(this_cpu,
5146 tsk_cpus_allowed(busiest->curr))) {
5147 raw_spin_unlock_irqrestore(&busiest->lock,
5149 env.flags |= LBF_ALL_PINNED;
5150 goto out_one_pinned;
5154 * ->active_balance synchronizes accesses to
5155 * ->active_balance_work. Once set, it's cleared
5156 * only after active load balance is finished.
5158 if (!busiest->active_balance) {
5159 busiest->active_balance = 1;
5160 busiest->push_cpu = this_cpu;
5163 raw_spin_unlock_irqrestore(&busiest->lock, flags);
5165 if (active_balance) {
5166 stop_one_cpu_nowait(cpu_of(busiest),
5167 active_load_balance_cpu_stop, busiest,
5168 &busiest->active_balance_work);
5172 * We've kicked active balancing, reset the failure counter.
5175 sd->nr_balance_failed = sd->cache_nice_tries+1;
5178 sd->nr_balance_failed = 0;
5180 if (likely(!active_balance)) {
5181 /* We were unbalanced, so reset the balancing interval */
5182 sd->balance_interval = sd->min_interval;
5185 * If we've begun active balancing, start to back off. This
5186 * case may not be covered by the all_pinned logic if there
5187 * is only 1 task on the busy runqueue (because we don't call move_tasks).
5190 if (sd->balance_interval < sd->max_interval)
5191 sd->balance_interval *= 2;
5197 schedstat_inc(sd, lb_balanced[idle]);
5199 sd->nr_balance_failed = 0;
5202 /* tune up the balancing interval */
5203 if (((env.flags & LBF_ALL_PINNED) &&
5204 sd->balance_interval < MAX_PINNED_INTERVAL) ||
5205 (sd->balance_interval < sd->max_interval))
5206 sd->balance_interval *= 2;
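/*
 * Illustrative back-off (numbers purely hypothetical): a domain with a
 * min_interval of 8ms that keeps failing to move anything doubles to 16, 32,
 * 64ms ... until it hits max_interval, or MAX_PINNED_INTERVAL when everything
 * on the busiest runqueue turned out to be pinned. A later successful
 * balance resets the interval back to min_interval.
 */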
5214 * idle_balance is called by schedule() if this_cpu is about to become
5215 * idle. Attempts to pull tasks from other CPUs.
5217 void idle_balance(int this_cpu, struct rq *this_rq)
5219 struct sched_domain *sd;
5220 int pulled_task = 0;
5221 unsigned long next_balance = jiffies + HZ;
5223 this_rq->idle_stamp = this_rq->clock;
5225 if (this_rq->avg_idle < sysctl_sched_migration_cost)
5228 update_rq_runnable_avg(this_rq, 1);
5231 * Drop the rq->lock, but keep IRQ/preempt disabled.
5233 raw_spin_unlock(&this_rq->lock);
5235 update_blocked_averages(this_cpu);
5237 for_each_domain(this_cpu, sd) {
5238 unsigned long interval;
5241 if (!(sd->flags & SD_LOAD_BALANCE))
5244 if (sd->flags & SD_BALANCE_NEWIDLE) {
5245 /* If we've pulled tasks over stop searching: */
5246 pulled_task = load_balance(this_cpu, this_rq,
5247 sd, CPU_NEWLY_IDLE, &balance);
5250 interval = msecs_to_jiffies(sd->balance_interval);
5251 if (time_after(next_balance, sd->last_balance + interval))
5252 next_balance = sd->last_balance + interval;
5254 this_rq->idle_stamp = 0;
5260 raw_spin_lock(&this_rq->lock);
5262 if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
5264 * We are going idle. next_balance may be set based on
5265 * a busy processor. So reset next_balance.
5267 this_rq->next_balance = next_balance;
5272 * active_load_balance_cpu_stop is run by cpu stopper. It pushes
5273 * running tasks off the busiest CPU onto idle CPUs. It requires at
5274 * least 1 task to be running on each physical CPU where possible, and
5275 * avoids physical / logical imbalances.
5277 static int active_load_balance_cpu_stop(void *data)
5279 struct rq *busiest_rq = data;
5280 int busiest_cpu = cpu_of(busiest_rq);
5281 int target_cpu = busiest_rq->push_cpu;
5282 struct rq *target_rq = cpu_rq(target_cpu);
5283 struct sched_domain *sd;
5285 raw_spin_lock_irq(&busiest_rq->lock);
5287 /* make sure the requested cpu hasn't gone down in the meantime */
5288 if (unlikely(busiest_cpu != smp_processor_id() ||
5289 !busiest_rq->active_balance))
5292 /* Is there any task to move? */
5293 if (busiest_rq->nr_running <= 1)
5297 * This condition is "impossible"; if it occurs
5298 * we need to fix it. Originally reported by
5299 * Bjorn Helgaas on a 128-cpu setup.
5301 BUG_ON(busiest_rq == target_rq);
5303 /* move a task from busiest_rq to target_rq */
5304 double_lock_balance(busiest_rq, target_rq);
5306 /* Search for an sd spanning us and the target CPU. */
5308 for_each_domain(target_cpu, sd) {
5309 if ((sd->flags & SD_LOAD_BALANCE) &&
5310 cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
5315 struct lb_env env = {
5317 .dst_cpu = target_cpu,
5318 .dst_rq = target_rq,
5319 .src_cpu = busiest_rq->cpu,
5320 .src_rq = busiest_rq,
5324 schedstat_inc(sd, alb_count);
5326 if (move_one_task(&env))
5327 schedstat_inc(sd, alb_pushed);
5329 schedstat_inc(sd, alb_failed);
5332 double_unlock_balance(busiest_rq, target_rq);
5334 busiest_rq->active_balance = 0;
5335 raw_spin_unlock_irq(&busiest_rq->lock);
5341 * idle load balancing details
5342 * - When one of the busy CPUs notices that there may be an idle rebalancing
5343 * needed, it will kick the idle load balancer, which then does idle
5344 * load balancing for all the idle CPUs.
5347 cpumask_var_t idle_cpus_mask;
5349 unsigned long next_balance; /* in jiffy units */
5350 } nohz ____cacheline_aligned;
5352 static inline int find_new_ilb(int call_cpu)
5354 int ilb = cpumask_first(nohz.idle_cpus_mask);
5356 if (ilb < nr_cpu_ids && idle_cpu(ilb))
5363 * Kick a CPU to do the nohz balancing, if it is time for it. We pick the
5364 * nohz_load_balancer CPU (if there is one), otherwise fall back to any idle
5365 * CPU (if there is one).
5367 static void nohz_balancer_kick(int cpu)
5371 nohz.next_balance++;
5373 ilb_cpu = find_new_ilb(cpu);
5375 if (ilb_cpu >= nr_cpu_ids)
5378 if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu)))
5381 * Use smp_send_reschedule() instead of resched_cpu().
5382 * This way we generate a sched IPI on the target cpu which
5383 * is idle. And the softirq performing nohz idle load balance
5384 * will be run before returning from the IPI.
5386 smp_send_reschedule(ilb_cpu);
5390 static inline void nohz_balance_exit_idle(int cpu)
5392 if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
5393 cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
5394 atomic_dec(&nohz.nr_cpus);
5395 clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
5399 static inline void set_cpu_sd_state_busy(void)
5401 struct sched_domain *sd;
5402 int cpu = smp_processor_id();
5404 if (!test_bit(NOHZ_IDLE, nohz_flags(cpu)))
5406 clear_bit(NOHZ_IDLE, nohz_flags(cpu));
5409 for_each_domain(cpu, sd)
5410 atomic_inc(&sd->groups->sgp->nr_busy_cpus);
5414 void set_cpu_sd_state_idle(void)
5416 struct sched_domain *sd;
5417 int cpu = smp_processor_id();
5419 if (test_bit(NOHZ_IDLE, nohz_flags(cpu)))
5421 set_bit(NOHZ_IDLE, nohz_flags(cpu));
5424 for_each_domain(cpu, sd)
5425 atomic_dec(&sd->groups->sgp->nr_busy_cpus);
5430 * This routine will record that the cpu is going idle with tick stopped.
5431 * This info will be used in performing idle load balancing in the future.
5433 void nohz_balance_enter_idle(int cpu)
5436 * If this cpu is going down, then nothing needs to be done.
5438 if (!cpu_active(cpu))
5441 if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
5444 cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
5445 atomic_inc(&nohz.nr_cpus);
5446 set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
5449 static int __cpuinit sched_ilb_notifier(struct notifier_block *nfb,
5450 unsigned long action, void *hcpu)
5452 switch (action & ~CPU_TASKS_FROZEN) {
5454 nohz_balance_exit_idle(smp_processor_id());
5462 static DEFINE_SPINLOCK(balancing);
5465 * Scale the max load_balance interval with the number of CPUs in the system.
5466 * This trades load-balance latency on larger machines for less cross talk.
5468 void update_max_interval(void)
5470 max_load_balance_interval = HZ*num_online_cpus()/10;
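/*
 * Example: with HZ == 1000 and 16 online cpus this caps the balance interval
 * at 1600 jiffies, i.e. about 1.6 seconds; a 4-cpu box is capped at roughly
 * 0.4 seconds instead.
 */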
5474 * It checks each scheduling domain to see if it is due to be balanced,
5475 * and initiates a balancing operation if so.
5477 * Balancing parameters are set up in arch_init_sched_domains.
5479 static void rebalance_domains(int cpu, enum cpu_idle_type idle)
5482 struct rq *rq = cpu_rq(cpu);
5483 unsigned long interval;
5484 struct sched_domain *sd;
5485 /* Earliest time when we have to do rebalance again */
5486 unsigned long next_balance = jiffies + 60*HZ;
5487 int update_next_balance = 0;
5490 update_blocked_averages(cpu);
5493 for_each_domain(cpu, sd) {
5494 if (!(sd->flags & SD_LOAD_BALANCE))
5497 interval = sd->balance_interval;
5498 if (idle != CPU_IDLE)
5499 interval *= sd->busy_factor;
5501 /* scale ms to jiffies */
5502 interval = msecs_to_jiffies(interval);
5503 interval = clamp(interval, 1UL, max_load_balance_interval);
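/*
 * Example (the busy_factor value here is purely illustrative): a domain with
 * a 64ms balance_interval and a busy_factor of 32 rebalances every 64ms when
 * idle but only every 2048ms when busy, and the clamp above keeps the result
 * within [1 jiffy, max_load_balance_interval].
 */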
5505 need_serialize = sd->flags & SD_SERIALIZE;
5507 if (need_serialize) {
5508 if (!spin_trylock(&balancing))
5512 if (time_after_eq(jiffies, sd->last_balance + interval)) {
5513 if (load_balance(cpu, rq, sd, idle, &balance)) {
5515 * We've pulled tasks over, so we are no longer idle.
5518 idle = CPU_NOT_IDLE;
5520 sd->last_balance = jiffies;
5523 spin_unlock(&balancing);
5525 if (time_after(next_balance, sd->last_balance + interval)) {
5526 next_balance = sd->last_balance + interval;
5527 update_next_balance = 1;
5531 * Stop the load balance at this level. There is another
5532 * CPU in our sched group which is doing load balancing more actively.
5541 * next_balance will be updated only when there is a need.
5542 * When the cpu is attached to null domain for ex, it will not be
5545 if (likely(update_next_balance))
5546 rq->next_balance = next_balance;
5551 * In the CONFIG_NO_HZ case, the idle balance kickee will do the
5552 * rebalancing for all the cpus for which scheduler ticks are stopped.
5554 static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
5556 struct rq *this_rq = cpu_rq(this_cpu);
5560 if (idle != CPU_IDLE ||
5561 !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)))
5564 for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
5565 if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
5569 * If this cpu gets work to do, stop the load balancing
5570 * work being done for other cpus. Next load
5571 * balancing owner will pick it up.
5576 rq = cpu_rq(balance_cpu);
5578 raw_spin_lock_irq(&rq->lock);
5579 update_rq_clock(rq);
5580 update_idle_cpu_load(rq);
5581 raw_spin_unlock_irq(&rq->lock);
5583 rebalance_domains(balance_cpu, CPU_IDLE);
5585 if (time_after(this_rq->next_balance, rq->next_balance))
5586 this_rq->next_balance = rq->next_balance;
5588 nohz.next_balance = this_rq->next_balance;
5590 clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));
5594 * Current heuristic for kicking the idle load balancer in the presence
5595 * of an idle cpu in the system:
5596 * - This rq has more than one task.
5597 * - At any scheduler domain level, this cpu's scheduler group has multiple
5598 * busy cpus exceeding the group's power.
5599 * - For SD_ASYM_PACKING, if the lower numbered cpus in the scheduler
5600 * domain span are idle.
5602 static inline int nohz_kick_needed(struct rq *rq, int cpu)
5604 unsigned long now = jiffies;
5605 struct sched_domain *sd;
5607 if (unlikely(idle_cpu(cpu)))
5611 * We may have recently been in ticked or tickless idle mode. At the first
5612 * busy tick after returning from idle, we will update the busy stats.
5614 set_cpu_sd_state_busy();
5615 nohz_balance_exit_idle(cpu);
5618 * None are in tickless mode and hence there is no need for NOHZ idle load balancing.
5621 if (likely(!atomic_read(&nohz.nr_cpus)))
5624 if (time_before(now, nohz.next_balance))
5627 if (rq->nr_running >= 2)
5631 for_each_domain(cpu, sd) {
5632 struct sched_group *sg = sd->groups;
5633 struct sched_group_power *sgp = sg->sgp;
5634 int nr_busy = atomic_read(&sgp->nr_busy_cpus);
5636 if (sd->flags & SD_SHARE_PKG_RESOURCES && nr_busy > 1)
5637 goto need_kick_unlock;
5639 if (sd->flags & SD_ASYM_PACKING && nr_busy != sg->group_weight
5640 && (cpumask_first_and(nohz.idle_cpus_mask,
5641 sched_domain_span(sd)) < cpu))
5642 goto need_kick_unlock;
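/*
 * Sketch of the SD_ASYM_PACKING case above: on an SMT domain where lower
 * thread numbers are preferred (e.g. POWER7), a busy higher-numbered thread
 * that sees an idle lower-numbered cpu in the domain span kicks the idle
 * load balancer so the work can be packed down onto the lower thread.
 */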
5644 if (!(sd->flags & (SD_SHARE_PKG_RESOURCES | SD_ASYM_PACKING)))
5656 static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { }
5660 * run_rebalance_domains is triggered when needed from the scheduler tick.
5661 * Also triggered for nohz idle balancing (with nohz_balancing_kick set).
5663 static void run_rebalance_domains(struct softirq_action *h)
5665 int this_cpu = smp_processor_id();
5666 struct rq *this_rq = cpu_rq(this_cpu);
5667 enum cpu_idle_type idle = this_rq->idle_balance ?
5668 CPU_IDLE : CPU_NOT_IDLE;
5670 rebalance_domains(this_cpu, idle);
5673 * If this cpu has a pending nohz_balance_kick, then do the
5674 * balancing on behalf of the other idle cpus whose ticks are stopped.
5677 nohz_idle_balance(this_cpu, idle);
5680 static inline int on_null_domain(int cpu)
5682 return !rcu_dereference_sched(cpu_rq(cpu)->sd);
5686 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
5688 void trigger_load_balance(struct rq *rq, int cpu)
5690 /* Don't need to rebalance while attached to NULL domain */
5691 if (time_after_eq(jiffies, rq->next_balance) &&
5692 likely(!on_null_domain(cpu)))
5693 raise_softirq(SCHED_SOFTIRQ);
5695 if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu)))
5696 nohz_balancer_kick(cpu);
5700 static void rq_online_fair(struct rq *rq)
5705 static void rq_offline_fair(struct rq *rq)
5709 /* Ensure any throttled groups are reachable by pick_next_task */
5710 unthrottle_offline_cfs_rqs(rq);
5713 #endif /* CONFIG_SMP */
5716 * scheduler tick hitting a task of our scheduling class:
5718 static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
5720 struct cfs_rq *cfs_rq;
5721 struct sched_entity *se = &curr->se;
5723 for_each_sched_entity(se) {
5724 cfs_rq = cfs_rq_of(se);
5725 entity_tick(cfs_rq, se, queued);
5728 if (sched_feat_numa(NUMA))
5729 task_tick_numa(rq, curr);
5731 update_rq_runnable_avg(rq, 1);
5735 * called on fork with the child task as argument from the parent's context
5736 * - child not yet on the tasklist
5737 * - preemption disabled
5739 static void task_fork_fair(struct task_struct *p)
5741 struct cfs_rq *cfs_rq;
5742 struct sched_entity *se = &p->se, *curr;
5743 int this_cpu = smp_processor_id();
5744 struct rq *rq = this_rq();
5745 unsigned long flags;
5747 raw_spin_lock_irqsave(&rq->lock, flags);
5749 update_rq_clock(rq);
5751 cfs_rq = task_cfs_rq(current);
5752 curr = cfs_rq->curr;
5754 if (unlikely(task_cpu(p) != this_cpu)) {
5756 __set_task_cpu(p, this_cpu);
5760 update_curr(cfs_rq);
5763 se->vruntime = curr->vruntime;
5764 place_entity(cfs_rq, se, 1);
5766 if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
5768 * Upon rescheduling, sched_class::put_prev_task() will place
5769 * 'current' within the tree based on its new key value.
5771 swap(curr->vruntime, se->vruntime);
5772 resched_task(rq->curr);
5775 se->vruntime -= cfs_rq->min_vruntime;
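/*
 * Making the child's vruntime relative to this cfs_rq's min_vruntime (a
 * sketch of the convention used throughout this file): the child may be
 * placed on a different cpu in wake_up_new_task(), and enqueueing there adds
 * that runqueue's min_vruntime back, so the child keeps its intended lag
 * regardless of which cpu it lands on.
 */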
5777 raw_spin_unlock_irqrestore(&rq->lock, flags);
5781 * Priority of the task has changed. Check to see if we preempt the current task.
5785 prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
5791 * Reschedule if we are currently running on this runqueue and
5792 * our priority decreased, or if we are not currently running on
5793 * this runqueue and our priority is higher than the current's.
5795 if (rq->curr == p) {
5796 if (p->prio > oldprio)
5797 resched_task(rq->curr);
5799 check_preempt_curr(rq, p, 0);
5802 static void switched_from_fair(struct rq *rq, struct task_struct *p)
5804 struct sched_entity *se = &p->se;
5805 struct cfs_rq *cfs_rq = cfs_rq_of(se);
5808 * Ensure the task's vruntime is normalized, so that when it's
5809 * switched back to the fair class the enqueue_entity(.flags=0) will
5810 * do the right thing.
5812 * If it was on_rq, then the dequeue_entity(.flags=0) will already
5813 * have normalized the vruntime; if it was !on_rq, then only when
5814 * the task is sleeping will it still have non-normalized vruntime.
5816 if (!se->on_rq && p->state != TASK_RUNNING) {
5818 * Fix up our vruntime so that the current sleep doesn't
5819 * cause 'unlimited' sleep bonus.
5821 place_entity(cfs_rq, se, 0);
5822 se->vruntime -= cfs_rq->min_vruntime;
5825 #if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
5827 * Remove our load from contribution when we leave sched_fair
5828 * and ensure we don't carry in an old decay_count if we switch back.
5831 if (p->se.avg.decay_count) {
5832 struct cfs_rq *cfs_rq = cfs_rq_of(&p->se);
5833 __synchronize_entity_decay(&p->se);
5834 subtract_blocked_load_contrib(cfs_rq,
5835 p->se.avg.load_avg_contrib);
5841 * We switched to the sched_fair class.
5843 static void switched_to_fair(struct rq *rq, struct task_struct *p)
5849 * We were most likely switched from sched_rt, so
5850 * kick off the schedule if running, otherwise just see
5851 * if we can still preempt the current task.
5854 resched_task(rq->curr);
5856 check_preempt_curr(rq, p, 0);
5859 /* Account for a task changing its policy or group.
5861 * This routine is mostly called to set cfs_rq->curr field when a task
5862 * migrates between groups/classes.
5864 static void set_curr_task_fair(struct rq *rq)
5866 struct sched_entity *se = &rq->curr->se;
5868 for_each_sched_entity(se) {
5869 struct cfs_rq *cfs_rq = cfs_rq_of(se);
5871 set_next_entity(cfs_rq, se);
5872 /* ensure bandwidth has been allocated on our new cfs_rq */
5873 account_cfs_rq_runtime(cfs_rq, 0);
5877 void init_cfs_rq(struct cfs_rq *cfs_rq)
5879 cfs_rq->tasks_timeline = RB_ROOT;
5880 cfs_rq->min_vruntime = (u64)(-(1LL << 20));
5881 #ifndef CONFIG_64BIT
5882 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
5884 #if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
5885 atomic64_set(&cfs_rq->decay_counter, 1);
5886 atomic64_set(&cfs_rq->removed_load, 0);
5890 #ifdef CONFIG_FAIR_GROUP_SCHED
5891 static void task_move_group_fair(struct task_struct *p, int on_rq)
5893 struct cfs_rq *cfs_rq;
5895 * If the task was not on the rq at the time of this cgroup movement
5896 * it must have been asleep; sleeping tasks keep their ->vruntime
5897 * absolute on their old rq until wakeup (needed for the fair sleeper
5898 * bonus in place_entity()).
5900 * If it was on the rq, we've just 'preempted' it, which does convert
5901 * ->vruntime to a relative base.
5903 * Make sure both cases convert their relative position when migrating
5904 * to another cgroup's rq. This does somewhat interfere with the
5905 * fair sleeper stuff for the first placement, but who cares.
5908 * When !on_rq, vruntime of the task has usually NOT been normalized.
5909 * But there are some cases where it has already been normalized:
5911 * - Moving a forked child which is waiting for being woken up by
5912 * wake_up_new_task().
5913 * - Moving a task which has been woken up by try_to_wake_up() and
5914 * waiting for actually being woken up by sched_ttwu_pending().
5916 * To prevent boost or penalty in the new cfs_rq caused by delta
5917 * min_vruntime between the two cfs_rqs, we skip vruntime adjustment.
5919 if (!on_rq && (!p->se.sum_exec_runtime || p->state == TASK_WAKING))
5923 p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime;
5924 set_task_rq(p, task_cpu(p));
5926 cfs_rq = cfs_rq_of(&p->se);
5927 p->se.vruntime += cfs_rq->min_vruntime;
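/*
 * Numeric sketch of the adjustment above (values hypothetical): if the
 * task's vruntime was 50ms on a cfs_rq whose min_vruntime is 40ms, the
 * subtraction leaves a +10ms offset, and adding the new cfs_rq's
 * min_vruntime (say 200ms) re-bases it to 210ms - the task keeps the same
 * 10ms of relative lag instead of inheriting a huge boost or penalty from
 * the difference in absolute min_vruntime between the two runqueues.
 */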
5930 * migrate_task_rq_fair() will have removed our previous
5931 * contribution, but we must synchronize for ongoing future decay.
5934 p->se.avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
5935 cfs_rq->blocked_load_avg += p->se.avg.load_avg_contrib;
5940 void free_fair_sched_group(struct task_group *tg)
5944 destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
5946 for_each_possible_cpu(i) {
5948 kfree(tg->cfs_rq[i]);
5957 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
5959 struct cfs_rq *cfs_rq;
5960 struct sched_entity *se;
5963 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
5966 tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);
5970 tg->shares = NICE_0_LOAD;
5972 init_cfs_bandwidth(tg_cfs_bandwidth(tg));
5974 for_each_possible_cpu(i) {
5975 cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
5976 GFP_KERNEL, cpu_to_node(i));
5980 se = kzalloc_node(sizeof(struct sched_entity),
5981 GFP_KERNEL, cpu_to_node(i));
5985 init_cfs_rq(cfs_rq);
5986 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
5997 void unregister_fair_sched_group(struct task_group *tg, int cpu)
5999 struct rq *rq = cpu_rq(cpu);
6000 unsigned long flags;
6003 * Only empty task groups can be destroyed, so we can speculatively
6004 * check on_list without danger of it being re-added.
6006 if (!tg->cfs_rq[cpu]->on_list)
6009 raw_spin_lock_irqsave(&rq->lock, flags);
6010 list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
6011 raw_spin_unlock_irqrestore(&rq->lock, flags);
6014 void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
6015 struct sched_entity *se, int cpu,
6016 struct sched_entity *parent)
6018 struct rq *rq = cpu_rq(cpu);
6022 init_cfs_rq_runtime(cfs_rq);
6024 tg->cfs_rq[cpu] = cfs_rq;
6027 /* se could be NULL for root_task_group */
6032 se->cfs_rq = &rq->cfs;
6034 se->cfs_rq = parent->my_q;
6037 update_load_set(&se->load, 0);
6038 se->parent = parent;
6041 static DEFINE_MUTEX(shares_mutex);
6043 int sched_group_set_shares(struct task_group *tg, unsigned long shares)
6046 unsigned long flags;
6049 * We can't change the weight of the root cgroup.
6054 shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
6056 mutex_lock(&shares_mutex);
6057 if (tg->shares == shares)
6060 tg->shares = shares;
6061 for_each_possible_cpu(i) {
6062 struct rq *rq = cpu_rq(i);
6063 struct sched_entity *se;
6066 /* Propagate contribution to hierarchy */
6067 raw_spin_lock_irqsave(&rq->lock, flags);
6068 for_each_sched_entity(se)
6069 update_cfs_shares(group_cfs_rq(se));
6070 raw_spin_unlock_irqrestore(&rq->lock, flags);
6074 mutex_unlock(&shares_mutex);
6077 #else /* CONFIG_FAIR_GROUP_SCHED */
6079 void free_fair_sched_group(struct task_group *tg) { }
6081 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
6086 void unregister_fair_sched_group(struct task_group *tg, int cpu) { }
6088 #endif /* CONFIG_FAIR_GROUP_SCHED */
6091 static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
6093 struct sched_entity *se = &task->se;
6094 unsigned int rr_interval = 0;
6097 * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise idle runqueue:
6100 if (rq->cfs.load.weight)
6101 rr_interval = NS_TO_JIFFIES(sched_slice(&rq->cfs, se));
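/*
 * A rough sketch of what is reported here: sched_slice() returns the task's
 * weight-proportional share of the current scheduling period, so with N
 * runnable tasks of equal weight each would report roughly period/N. The
 * value is only an estimate, since CFS does not run tasks on fixed
 * timeslices.
 */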
6107 * All the scheduling class methods:
6109 const struct sched_class fair_sched_class = {
6110 .next = &idle_sched_class,
6111 .enqueue_task = enqueue_task_fair,
6112 .dequeue_task = dequeue_task_fair,
6113 .yield_task = yield_task_fair,
6114 .yield_to_task = yield_to_task_fair,
6116 .check_preempt_curr = check_preempt_wakeup,
6118 .pick_next_task = pick_next_task_fair,
6119 .put_prev_task = put_prev_task_fair,
6122 .select_task_rq = select_task_rq_fair,
6123 #ifdef CONFIG_FAIR_GROUP_SCHED
6124 .migrate_task_rq = migrate_task_rq_fair,
6126 .rq_online = rq_online_fair,
6127 .rq_offline = rq_offline_fair,
6129 .task_waking = task_waking_fair,
6132 .set_curr_task = set_curr_task_fair,
6133 .task_tick = task_tick_fair,
6134 .task_fork = task_fork_fair,
6136 .prio_changed = prio_changed_fair,
6137 .switched_from = switched_from_fair,
6138 .switched_to = switched_to_fair,
6140 .get_rr_interval = get_rr_interval_fair,
6142 #ifdef CONFIG_FAIR_GROUP_SCHED
6143 .task_move_group = task_move_group_fair,
6147 #ifdef CONFIG_SCHED_DEBUG
6148 void print_cfs_stats(struct seq_file *m, int cpu)
6150 struct cfs_rq *cfs_rq;
6153 for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
6154 print_cfs_rq(m, cpu, cfs_rq);
6159 __init void init_sched_fair_class(void)
6162 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
6165 nohz.next_balance = jiffies;
6166 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
6167 cpu_notifier(sched_ilb_notifier, 0);