kernel/sched/fair.c

   1 /*
   2  * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
   3  *
   4  *  Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
   5  *
   6  *  Interactivity improvements by Mike Galbraith
   7  *  (C) 2007 Mike Galbraith <efault@gmx.de>
   8  *
   9  *  Various enhancements by Dmitry Adamushko.
  10  *  (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
  11  *
  12  *  Group scheduling enhancements by Srivatsa Vaddagiri
  13  *  Copyright IBM Corporation, 2007
  14  *  Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
  15  *
  16  *  Scaled math optimizations by Thomas Gleixner
  17  *  Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
  18  *
  19  *  Adaptive scheduling granularity, math enhancements by Peter Zijlstra
  20  *  Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
  21  */
  22
  23 #include <linux/latencytop.h>
  24 #include <linux/sched.h>
  25 #include <linux/cpumask.h>
  26 #include <linux/slab.h>
  27 #include <linux/profile.h>
  28 #include <linux/interrupt.h>
  29 #include <linux/mempolicy.h>
  30 #include <linux/migrate.h>
  31 #include <linux/task_work.h>
  32
  33 #include <trace/events/sched.h>
  34 #ifdef CONFIG_HMP_VARIABLE_SCALE
  35 #include <linux/sysfs.h>
  36 #include <linux/vmalloc.h>
  37 #ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
  38 /* Include cpufreq header to add a notifier so that cpu frequency
  39  * scaling can track the current CPU frequency
  40  */
  41 #include <linux/cpufreq.h>
  42 #endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */
  43 #endif /* CONFIG_HMP_VARIABLE_SCALE */
  44
  45 #include "sched.h"
  46
  47
  48 /*
  49  * Targeted preemption latency for CPU-bound tasks:
  50  * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
  51  *
  52  * NOTE: this latency value is not the same as the concept of
  53  * 'timeslice length' - timeslices in CFS are of variable length
  54  * and have no persistent notion like in traditional, time-slice
  55  * based scheduling concepts.
  56  *
  57  * (to see the precise effective timeslice length of your workload,
  58  *  run vmstat and monitor the context-switches (cs) field)
  59  */
  60 unsigned int sysctl_sched_latency = 6000000ULL;
  61 unsigned int normalized_sysctl_sched_latency = 6000000ULL;
  62
  63 /*
  64  * The initial- and re-scaling of tunables is configurable
  65  * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus))
  66  *
  67  * Options are:
  68  * SCHED_TUNABLESCALING_NONE - unscaled, always *1
  69  * SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus)
  70  * SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus
  71  */
  72 enum sched_tunable_scaling sysctl_sched_tunable_scaling
  73         = SCHED_TUNABLESCALING_LOG;
  74
  75 /*
  76  * Minimal preemption granularity for CPU-bound tasks:
  77  * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
  78  */
  79 unsigned int sysctl_sched_min_granularity = 750000ULL;
  80 unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
  81
  82 /*
  83  * is kept at sysctl_sched_latency / sysctl_sched_min_granularity
  84  */
  85 static unsigned int sched_nr_latency = 8;
  86
  87 /*
  88  * After fork, child runs first. If set to 0 (default) then
  89  * parent will (try to) run first.
  90  */
  91 unsigned int sysctl_sched_child_runs_first __read_mostly;
  92
  93 /*
  94  * SCHED_OTHER wake-up granularity.
  95  * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
  96  *
  97  * This option delays the preemption effects of decoupled workloads
  98  * and reduces their over-scheduling. Synchronous workloads will still
  99  * have immediate wakeup/sleep latencies.
 100  */
 101 unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
 102 unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
 103
 104 const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
 105
 106 /*
 107  * The exponential sliding  window over which load is averaged for shares
 108  * distribution.
 109  * (default: 10msec)
 110  */
 111 unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
 112
 113 #ifdef CONFIG_CFS_BANDWIDTH
 114 /*
 115  * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool
 116  * each time a cfs_rq requests quota.
 117  *
 118  * Note: in the case that the slice exceeds the runtime remaining (either due
 119  * to consumption or the quota being specified to be smaller than the slice)
 120  * we will always only issue the remaining available time.
 121  *
 122  * default: 5 msec, units: microseconds
 123   */
 124 unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
 125 #endif
 126
 127 /*
 128  * Increase the granularity value when there are more CPUs,
 129  * because with more CPUs the 'effective latency' as visible
 130  * to users decreases. But the relationship is not linear,
 131  * so pick a second-best guess by going with the log2 of the
 132  * number of CPUs.
 133  *
 134  * This idea comes from the SD scheduler of Con Kolivas:
 135  */
 136 static int get_update_sysctl_factor(void)
 137 {
 138         unsigned int cpus = min_t(int, num_online_cpus(), 8);
 139         unsigned int factor;
 140
 141         switch (sysctl_sched_tunable_scaling) {
 142         case SCHED_TUNABLESCALING_NONE:
 143                 factor = 1;
 144                 break;
 145         case SCHED_TUNABLESCALING_LINEAR:
 146                 factor = cpus;
 147                 break;
 148         case SCHED_TUNABLESCALING_LOG:
 149         default:
 150                 factor = 1 + ilog2(cpus);
 151                 break;
 152         }
 153
 154         return factor;
 155 }
 156
 157 static void update_sysctl(void)
 158 {
 159         unsigned int factor = get_update_sysctl_factor();
 160
 161 #define SET_SYSCTL(name) \
 162         (sysctl_##name = (factor) * normalized_sysctl_##name)
 163         SET_SYSCTL(sched_min_granularity);
 164         SET_SYSCTL(sched_latency);
 165         SET_SYSCTL(sched_wakeup_granularity);
 166 #undef SET_SYSCTL
 167 }
 168
 169 void sched_init_granularity(void)
 170 {
 171         update_sysctl();
 172 }
 173
 174 #if BITS_PER_LONG == 32
 175 # define WMULT_CONST    (~0UL)
 176 #else
 177 # define WMULT_CONST    (1UL << 32)
 178 #endif
 179
 180 #define WMULT_SHIFT     32
 181
 182 /*
 183  * Shift right and round:
 184  */
 185 #define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
 186
 187 /*
 188  * delta *= weight / lw
 189  */
 190 static unsigned long
 191 calc_delta_mine(unsigned long delta_exec, unsigned long weight,
 192                 struct load_weight *lw)
 193 {
 194         u64 tmp;
 195
 196         /*
 197          * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched
 198          * entities since MIN_SHARES = 2. Treat weight as 1 if less than
 199          * 2^SCHED_LOAD_RESOLUTION.
 200          */
 201         if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION)))
 202                 tmp = (u64)delta_exec * scale_load_down(weight);
 203         else
 204                 tmp = (u64)delta_exec;
 205
 206         if (!lw->inv_weight) {
 207                 unsigned long w = scale_load_down(lw->weight);
 208
 209                 if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
 210                         lw->inv_weight = 1;
 211                 else if (unlikely(!w))
 212                         lw->inv_weight = WMULT_CONST;
 213                 else
 214                         lw->inv_weight = WMULT_CONST / w;
 215         }
 216
 217         /*
 218          * Check whether we'd overflow the 64-bit multiplication:
 219          */
 220         if (unlikely(tmp > WMULT_CONST))
 221                 tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
 222                         WMULT_SHIFT/2);
 223         else
 224                 tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
 225
 226         return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
 227 }
 228
 229
 230 const struct sched_class fair_sched_class;
 231
 232 /**************************************************************
 233  * CFS operations on generic schedulable entities:
 234  */
 235
 236 #ifdef CONFIG_FAIR_GROUP_SCHED
 237
 238 /* cpu runqueue to which this cfs_rq is attached */
 239 static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
 240 {
 241         return cfs_rq->rq;
 242 }
 243
 244 /* An entity is a task if it doesn't "own" a runqueue */
 245 #define entity_is_task(se)      (!se->my_q)
 246
 247 static inline struct task_struct *task_of(struct sched_entity *se)
 248 {
 249 #ifdef CONFIG_SCHED_DEBUG
 250         WARN_ON_ONCE(!entity_is_task(se));
 251 #endif
 252         return container_of(se, struct task_struct, se);
 253 }
 254
 255 /* Walk up scheduling entities hierarchy */
 256 #define for_each_sched_entity(se) \
 257                 for (; se; se = se->parent)
 258
 259 static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
 260 {
 261         return p->se.cfs_rq;
 262 }
 263
 264 /* runqueue on which this entity is (to be) queued */
 265 static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
 266 {
 267         return se->cfs_rq;
 268 }
 269
 270 /* runqueue "owned" by this group */
 271 static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
 272 {
 273         return grp->my_q;
 274 }
 275
 276 static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
 277                                        int force_update);
 278
 279 static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 280 {
 281         if (!cfs_rq->on_list) {
 282                 /*
 283                  * Ensure we either appear before our parent (if already
 284                  * enqueued) or force our parent to appear after us when it is
 285                  * enqueued.  The fact that we always enqueue bottom-up
 286                  * reduces this to two cases.
 287                  */
 288                 if (cfs_rq->tg->parent &&
 289                     cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) {
 290                         list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
 291                                 &rq_of(cfs_rq)->leaf_cfs_rq_list);
 292                 } else {
 293                         list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
 294                                 &rq_of(cfs_rq)->leaf_cfs_rq_list);
 295                 }
 296
 297                 cfs_rq->on_list = 1;
 298                 /* We should have no load, but we need to update last_decay. */
 299                 update_cfs_rq_blocked_load(cfs_rq, 0);
 300         }
 301 }
 302
 303 static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 304 {
 305         if (cfs_rq->on_list) {
 306                 list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
 307                 cfs_rq->on_list = 0;
 308         }
 309 }
 310
 311 /* Iterate thr' all leaf cfs_rq's on a runqueue */
 312 #define for_each_leaf_cfs_rq(rq, cfs_rq) \
 313         list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
 314
 315 /* Do the two (enqueued) entities belong to the same group ? */
 316 static inline int
 317 is_same_group(struct sched_entity *se, struct sched_entity *pse)
 318 {
 319         if (se->cfs_rq == pse->cfs_rq)
 320                 return 1;
 321
 322         return 0;
 323 }
 324
 325 static inline struct sched_entity *parent_entity(struct sched_entity *se)
 326 {
 327         return se->parent;
 328 }
 329
 330 /* return depth at which a sched entity is present in the hierarchy */
 331 static inline int depth_se(struct sched_entity *se)
 332 {
 333         int depth = 0;
 334
 335         for_each_sched_entity(se)
 336                 depth++;
 337
 338         return depth;
 339 }
 340
 341 static void
 342 find_matching_se(struct sched_entity **se, struct sched_entity **pse)
 343 {
 344         int se_depth, pse_depth;
 345
 346         /*
 347          * preemption test can be made between sibling entities who are in the
 348          * same cfs_rq i.e who have a common parent. Walk up the hierarchy of
 349          * both tasks until we find their ancestors who are siblings of common
 350          * parent.
 351          */
 352
 353         /* First walk up until both entities are at same depth */
 354         se_depth = depth_se(*se);
 355         pse_depth = depth_se(*pse);
 356
 357         while (se_depth > pse_depth) {
 358                 se_depth--;
 359                 *se = parent_entity(*se);
 360         }
 361
 362         while (pse_depth > se_depth) {
 363                 pse_depth--;
 364                 *pse = parent_entity(*pse);
 365         }
 366
 367         while (!is_same_group(*se, *pse)) {
 368                 *se = parent_entity(*se);
 369                 *pse = parent_entity(*pse);
 370         }
 371 }
 372
 373 #else   /* !CONFIG_FAIR_GROUP_SCHED */
 374
 375 static inline struct task_struct *task_of(struct sched_entity *se)
 376 {
 377         return container_of(se, struct task_struct, se);
 378 }
 379
 380 static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
 381 {
 382         return container_of(cfs_rq, struct rq, cfs);
 383 }
 384
 385 #define entity_is_task(se)      1
 386
 387 #define for_each_sched_entity(se) \
 388                 for (; se; se = NULL)
 389
 390 static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
 391 {
 392         return &task_rq(p)->cfs;
 393 }
 394
 395 static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
 396 {
 397         struct task_struct *p = task_of(se);
 398         struct rq *rq = task_rq(p);
 399
 400         return &rq->cfs;
 401 }
 402
 403 /* runqueue "owned" by this group */
 404 static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
 405 {
 406         return NULL;
 407 }
 408
 409 static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 410 {
 411 }
 412
 413 static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 414 {
 415 }
 416
 417 #define for_each_leaf_cfs_rq(rq, cfs_rq) \
 418                 for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
 419
 420 static inline int
 421 is_same_group(struct sched_entity *se, struct sched_entity *pse)
 422 {
 423         return 1;
 424 }
 425
 426 static inline struct sched_entity *parent_entity(struct sched_entity *se)
 427 {
 428         return NULL;
 429 }
 430
 431 static inline void
 432 find_matching_se(struct sched_entity **se, struct sched_entity **pse)
 433 {
 434 }
 435
 436 #endif  /* CONFIG_FAIR_GROUP_SCHED */
 437
 438 static __always_inline
 439 void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec);
 440
 441 /**************************************************************
 442  * Scheduling class tree data structure manipulation methods:
 443  */
 444
 445 static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime)
 446 {
 447         s64 delta = (s64)(vruntime - max_vruntime);
 448         if (delta > 0)
 449                 max_vruntime = vruntime;
 450
 451         return max_vruntime;
 452 }
 453
 454 static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
 455 {
 456         s64 delta = (s64)(vruntime - min_vruntime);
 457         if (delta < 0)
 458                 min_vruntime = vruntime;
 459
 460         return min_vruntime;
 461 }
 462
 463 static inline int entity_before(struct sched_entity *a,
 464                                 struct sched_entity *b)
 465 {
 466         return (s64)(a->vruntime - b->vruntime) < 0;
 467 }
 468
 469 static void update_min_vruntime(struct cfs_rq *cfs_rq)
 470 {
 471         u64 vruntime = cfs_rq->min_vruntime;
 472
 473         if (cfs_rq->curr)
 474                 vruntime = cfs_rq->curr->vruntime;
 475
 476         if (cfs_rq->rb_leftmost) {
 477                 struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,
 478                                                    struct sched_entity,
 479                                                    run_node);
 480
 481                 if (!cfs_rq->curr)
 482                         vruntime = se->vruntime;
 483                 else
 484                         vruntime = min_vruntime(vruntime, se->vruntime);
 485         }
 486
 487         /* ensure we never gain time by being placed backwards. */
 488         cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
 489 #ifndef CONFIG_64BIT
 490         smp_wmb();
 491         cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
 492 #endif
 493 }
 494
 495 /*
 496  * Enqueue an entity into the rb-tree:
 497  */
 498 static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 499 {
 500         struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
 501         struct rb_node *parent = NULL;
 502         struct sched_entity *entry;
 503         int leftmost = 1;
 504
 505         /*
 506          * Find the right place in the rbtree:
 507          */
 508         while (*link) {
 509                 parent = *link;
 510                 entry = rb_entry(parent, struct sched_entity, run_node);
 511                 /*
 512                  * We dont care about collisions. Nodes with
 513                  * the same key stay together.
 514                  */
 515                 if (entity_before(se, entry)) {
 516                         link = &parent->rb_left;
 517                 } else {
 518                         link = &parent->rb_right;
 519                         leftmost = 0;
 520                 }
 521         }
 522
 523         /*
 524          * Maintain a cache of leftmost tree entries (it is frequently
 525          * used):
 526          */
 527         if (leftmost)
 528                 cfs_rq->rb_leftmost = &se->run_node;
 529
 530         rb_link_node(&se->run_node, parent, link);
 531         rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
 532 }
 533
 534 static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 535 {
 536         if (cfs_rq->rb_leftmost == &se->run_node) {
 537                 struct rb_node *next_node;
 538
 539                 next_node = rb_next(&se->run_node);
 540                 cfs_rq->rb_leftmost = next_node;
 541         }
 542
 543         rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
 544 }
 545
 546 struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
 547 {
 548         struct rb_node *left = cfs_rq->rb_leftmost;
 549
 550         if (!left)
 551                 return NULL;
 552
 553         return rb_entry(left, struct sched_entity, run_node);
 554 }
 555
 556 static struct sched_entity *__pick_next_entity(struct sched_entity *se)
 557 {
 558         struct rb_node *next = rb_next(&se->run_node);
 559
 560         if (!next)
 561                 return NULL;
 562
 563         return rb_entry(next, struct sched_entity, run_node);
 564 }
 565
 566 #ifdef CONFIG_SCHED_DEBUG
 567 struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
 568 {
 569         struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
 570
 571         if (!last)
 572                 return NULL;
 573
 574         return rb_entry(last, struct sched_entity, run_node);
 575 }
 576
 577 /**************************************************************
 578  * Scheduling class statistics methods:
 579  */
 580
 581 int sched_proc_update_handler(struct ctl_table *table, int write,
 582                 void __user *buffer, size_t *lenp,
 583                 loff_t *ppos)
 584 {
 585         int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
 586         int factor = get_update_sysctl_factor();
 587
 588         if (ret || !write)
 589                 return ret;
 590
 591         sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
 592                                         sysctl_sched_min_granularity);
 593
 594 #define WRT_SYSCTL(name) \
 595         (normalized_sysctl_##name = sysctl_##name / (factor))
 596         WRT_SYSCTL(sched_min_granularity);
 597         WRT_SYSCTL(sched_latency);
 598         WRT_SYSCTL(sched_wakeup_granularity);
 599 #undef WRT_SYSCTL
 600
 601         return 0;
 602 }
 603 #endif
 604
 605 /*
 606  * delta /= w
 607  */
 608 static inline unsigned long
 609 calc_delta_fair(unsigned long delta, struct sched_entity *se)
 610 {
 611         if (unlikely(se->load.weight != NICE_0_LOAD))
 612                 delta = calc_delta_mine(delta, NICE_0_LOAD, &se->load);
 613
 614         return delta;
 615 }
 616
 617 /*
 618  * The idea is to set a period in which each task runs once.
 619  *
 620  * When there are too many tasks (sched_nr_latency) we have to stretch
 621  * this period because otherwise the slices get too small.
 622  *
 623  * p = (nr <= nl) ? l : l*nr/nl
 624  */
 625 static u64 __sched_period(unsigned long nr_running)
 626 {
 627         u64 period = sysctl_sched_latency;
 628         unsigned long nr_latency = sched_nr_latency;
 629
 630         if (unlikely(nr_running > nr_latency)) {
 631                 period = sysctl_sched_min_granularity;
 632                 period *= nr_running;
 633         }
 634
 635         return period;
 636 }
 637
 638 /*
 639  * We calculate the wall-time slice from the period by taking a part
 640  * proportional to the weight.
 641  *
 642  * s = p*P[w/rw]
 643  */
 644 static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
 645 {
 646         u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);
 647
 648         for_each_sched_entity(se) {
 649                 struct load_weight *load;
 650                 struct load_weight lw;
 651
 652                 cfs_rq = cfs_rq_of(se);
 653                 load = &cfs_rq->load;
 654
 655                 if (unlikely(!se->on_rq)) {
 656                         lw = cfs_rq->load;
 657
 658                         update_load_add(&lw, se->load.weight);
 659                         load = &lw;
 660                 }
 661                 slice = calc_delta_mine(slice, se->load.weight, load);
 662         }
 663         return slice;
 664 }
 665
 666 /*
 667  * We calculate the vruntime slice of a to-be-inserted task.
 668  *
 669  * vs = s/w
 670  */
 671 static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
 672 {
 673         return calc_delta_fair(sched_slice(cfs_rq, se), se);
 674 }
 675
 676 /*
 677  * Update the current task's runtime statistics. Skip current tasks that
 678  * are not in our scheduling class.
 679  */
 680 static inline void
 681 __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
 682               unsigned long delta_exec)
 683 {
 684         unsigned long delta_exec_weighted;
 685
 686         schedstat_set(curr->statistics.exec_max,
 687                       max((u64)delta_exec, curr->statistics.exec_max));
 688
 689         curr->sum_exec_runtime += delta_exec;
 690         schedstat_add(cfs_rq, exec_clock, delta_exec);
 691         delta_exec_weighted = calc_delta_fair(delta_exec, curr);
 692
 693         curr->vruntime += delta_exec_weighted;
 694         update_min_vruntime(cfs_rq);
 695 }
 696
 697 static void update_curr(struct cfs_rq *cfs_rq)
 698 {
 699         struct sched_entity *curr = cfs_rq->curr;
 700         u64 now = rq_of(cfs_rq)->clock_task;
 701         unsigned long delta_exec;
 702
 703         if (unlikely(!curr))
 704                 return;
 705
 706         /*
 707          * Get the amount of time the current task was running
 708          * since the last time we changed load (this cannot
 709          * overflow on 32 bits):
 710          */
 711         delta_exec = (unsigned long)(now - curr->exec_start);
 712         if (!delta_exec)
 713                 return;
 714
 715         __update_curr(cfs_rq, curr, delta_exec);
 716         curr->exec_start = now;
 717
 718         if (entity_is_task(curr)) {
 719                 struct task_struct *curtask = task_of(curr);
 720
 721                 trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
 722                 cpuacct_charge(curtask, delta_exec);
 723                 account_group_exec_runtime(curtask, delta_exec);
 724         }
 725
 726         account_cfs_rq_runtime(cfs_rq, delta_exec);
 727 }
 728
 729 static inline void
 730 update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
 731 {
 732         schedstat_set(se->statistics.wait_start, rq_of(cfs_rq)->clock);
 733 }
 734
 735 /*
 736  * Task is being enqueued - update stats:
 737  */
 738 static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 739 {
 740         /*
 741          * Are we enqueueing a waiting task? (for current tasks
 742          * a dequeue/enqueue event is a NOP)
 743          */
 744         if (se != cfs_rq->curr)
 745                 update_stats_wait_start(cfs_rq, se);
 746 }
 747
 748 static void
 749 update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
 750 {
 751         schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max,
 752                         rq_of(cfs_rq)->clock - se->statistics.wait_start));
 753         schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1);
 754         schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum +
 755                         rq_of(cfs_rq)->clock - se->statistics.wait_start);
 756 #ifdef CONFIG_SCHEDSTATS
 757         if (entity_is_task(se)) {
 758                 trace_sched_stat_wait(task_of(se),
 759                         rq_of(cfs_rq)->clock - se->statistics.wait_start);
 760         }
 761 #endif
 762         schedstat_set(se->statistics.wait_start, 0);
 763 }
 764
 765 static inline void
 766 update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 767 {
 768         /*
 769          * Mark the end of the wait period if dequeueing a
 770          * waiting task:
 771          */
 772         if (se != cfs_rq->curr)
 773                 update_stats_wait_end(cfs_rq, se);
 774 }
 775
 776 /*
 777  * We are picking a new current task - update its stats:
 778  */
 779 static inline void
 780 update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
 781 {
 782         /*
 783          * We are starting a new run period:
 784          */
 785         se->exec_start = rq_of(cfs_rq)->clock_task;
 786 }
 787
 788 /**************************************************
 789  * Scheduling class queueing methods:
 790  */
 791
 792 #ifdef CONFIG_NUMA_BALANCING
 793 /*
 794  * numa task sample period in ms
 795  */
 796 unsigned int sysctl_numa_balancing_scan_period_min = 100;
 797 unsigned int sysctl_numa_balancing_scan_period_max = 100*50;
 798 unsigned int sysctl_numa_balancing_scan_period_reset = 100*600;
 799
 800 /* Portion of address space to scan in MB */
 801 unsigned int sysctl_numa_balancing_scan_size = 256;
 802
 803 /* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
 804 unsigned int sysctl_numa_balancing_scan_delay = 1000;
 805
 806 static void task_numa_placement(struct task_struct *p)
 807 {
 808         int seq;
 809
 810         if (!p->mm)     /* for example, ksmd faulting in a user's mm */
 811                 return;
 812         seq = ACCESS_ONCE(p->mm->numa_scan_seq);
 813         if (p->numa_scan_seq == seq)
 814                 return;
 815         p->numa_scan_seq = seq;
 816
 817         /* FIXME: Scheduling placement policy hints go here */
 818 }
 819
 820 /*
 821  * Got a PROT_NONE fault for a page on @node.
 822  */
 823 void task_numa_fault(int node, int pages, bool migrated)
 824 {
 825         struct task_struct *p = current;
 826
 827         if (!sched_feat_numa(NUMA))
 828                 return;
 829
 830         /* FIXME: Allocate task-specific structure for placement policy here */
 831
 832         /*
 833          * If pages are properly placed (did not migrate) then scan slower.
 834          * This is reset periodically in case of phase changes
 835          */
 836         if (!migrated)
 837                 p->numa_scan_period = min(sysctl_numa_balancing_scan_period_max,
 838                         p->numa_scan_period + jiffies_to_msecs(10));
 839
 840         task_numa_placement(p);
 841 }
 842
 843 static void reset_ptenuma_scan(struct task_struct *p)
 844 {
 845         ACCESS_ONCE(p->mm->numa_scan_seq)++;
 846         p->mm->numa_scan_offset = 0;
 847 }
 848
 849 /*
 850  * The expensive part of numa migration is done from task_work context.
 851  * Triggered from task_tick_numa().
 852  */
 853 void task_numa_work(struct callback_head *work)
 854 {
 855         unsigned long migrate, next_scan, now = jiffies;
 856         struct task_struct *p = current;
 857         struct mm_struct *mm = p->mm;
 858         struct vm_area_struct *vma;
 859         unsigned long start, end;
 860         long pages;
 861
 862         WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
 863
 864         work->next = work; /* protect against double add */
 865         /*
 866          * Who cares about NUMA placement when they're dying.
 867          *
 868          * NOTE: make sure not to dereference p->mm before this check,
 869          * exit_task_work() happens _after_ exit_mm() so we could be called
 870          * without p->mm even though we still had it when we enqueued this
 871          * work.
 872          */
 873         if (p->flags & PF_EXITING)
 874                 return;
 875
 876         /*
 877          * We do not care about task placement until a task runs on a node
 878          * other than the first one used by the address space. This is
 879          * largely because migrations are driven by what CPU the task
 880          * is running on. If it's never scheduled on another node, it'll
 881          * not migrate so why bother trapping the fault.
 882          */
 883         if (mm->first_nid == NUMA_PTE_SCAN_INIT)
 884                 mm->first_nid = numa_node_id();
 885         if (mm->first_nid != NUMA_PTE_SCAN_ACTIVE) {
 886                 /* Are we running on a new node yet? */
 887                 if (numa_node_id() == mm->first_nid &&
 888                     !sched_feat_numa(NUMA_FORCE))
 889                         return;
 890
 891                 mm->first_nid = NUMA_PTE_SCAN_ACTIVE;
 892         }
 893
 894         /*
 895          * Reset the scan period if enough time has gone by. Objective is that
 896          * scanning will be reduced if pages are properly placed. As tasks
 897          * can enter different phases this needs to be re-examined. Lacking
 898          * proper tracking of reference behaviour, this blunt hammer is used.
 899          */
 900         migrate = mm->numa_next_reset;
 901         if (time_after(now, migrate)) {
 902                 p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
 903                 next_scan = now + msecs_to_jiffies(sysctl_numa_balancing_scan_period_reset);
 904                 xchg(&mm->numa_next_reset, next_scan);
 905         }
 906
 907         /*
 908          * Enforce maximal scan/migration frequency..
 909          */
 910         migrate = mm->numa_next_scan;
 911         if (time_before(now, migrate))
 912                 return;
 913
 914         if (p->numa_scan_period == 0)
 915                 p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
 916
 917         next_scan = now + msecs_to_jiffies(p->numa_scan_period);
 918         if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
 919                 return;
 920
 921         /*
 922          * Do not set pte_numa if the current running node is rate-limited.
 923          * This loses statistics on the fault but if we are unwilling to
 924          * migrate to this node, it is less likely we can do useful work
 925          */
 926         if (migrate_ratelimited(numa_node_id()))
 927                 return;
 928
 929         start = mm->numa_scan_offset;
 930         pages = sysctl_numa_balancing_scan_size;
 931         pages <<= 20 - PAGE_SHIFT; /* MB in pages */
 932         if (!pages)
 933                 return;
 934
 935         down_read(&mm->mmap_sem);
 936         vma = find_vma(mm, start);
 937         if (!vma) {
 938                 reset_ptenuma_scan(p);
 939                 start = 0;
 940                 vma = mm->mmap;
 941         }
 942         for (; vma; vma = vma->vm_next) {
 943                 if (!vma_migratable(vma))
 944                         continue;
 945
 946                 /* Skip small VMAs. They are not likely to be of relevance */
 947                 if (vma->vm_end - vma->vm_start < HPAGE_SIZE)
 948                         continue;
 949
 950                 do {
 951                         start = max(start, vma->vm_start);
 952                         end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
 953                         end = min(end, vma->vm_end);
 954                         pages -= change_prot_numa(vma, start, end);
 955
 956                         start = end;
 957                         if (pages <= 0)
 958                                 goto out;
 959                 } while (end != vma->vm_end);
 960         }
 961
 962 out:
 963         /*
 964          * It is possible to reach the end of the VMA list but the last few VMAs are
 965          * not guaranteed to the vma_migratable. If they are not, we would find the
 966          * !migratable VMA on the next scan but not reset the scanner to the start
 967          * so check it now.
 968          */
 969         if (vma)
 970                 mm->numa_scan_offset = start;
 971         else
 972                 reset_ptenuma_scan(p);
 973         up_read(&mm->mmap_sem);
 974 }
 975
 976 /*
 977  * Drive the periodic memory faults..
 978  */
 979 void task_tick_numa(struct rq *rq, struct task_struct *curr)
 980 {
 981         struct callback_head *work = &curr->numa_work;
 982         u64 period, now;
 983
 984         /*
 985          * We don't care about NUMA placement if we don't have memory.
 986          */
 987         if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work)
 988                 return;
 989
 990         /*
 991          * Using runtime rather than walltime has the dual advantage that
 992          * we (mostly) drive the selection from busy threads and that the
 993          * task needs to have done some actual work before we bother with
 994          * NUMA placement.
 995          */
 996         now = curr->se.sum_exec_runtime;
 997         period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
 998
 999         if (now - curr->node_stamp > period) {
1000                 if (!curr->node_stamp)
1001                         curr->numa_scan_period = sysctl_numa_balancing_scan_period_min;
1002                 curr->node_stamp = now;
1003
1004                 if (!time_before(jiffies, curr->mm->numa_next_scan)) {
1005                         init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
1006                         task_work_add(curr, work, true);
1007                 }
1008         }
1009 }
1010 #else
1011 static void task_tick_numa(struct rq *rq, struct task_struct *curr)
1012 {
1013 }
1014 #endif /* CONFIG_NUMA_BALANCING */
1015
1016 static void
1017 account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
1018 {
1019         update_load_add(&cfs_rq->load, se->load.weight);
1020         if (!parent_entity(se))
1021                 update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
1022 #ifdef CONFIG_SMP
1023         if (entity_is_task(se))
1024                 list_add(&se->group_node, &rq_of(cfs_rq)->cfs_tasks);
1025 #endif
1026         cfs_rq->nr_running++;
1027 }
1028
1029 static void
1030 account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
1031 {
1032         update_load_sub(&cfs_rq->load, se->load.weight);
1033         if (!parent_entity(se))
1034                 update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
1035         if (entity_is_task(se))
1036                 list_del_init(&se->group_node);
1037         cfs_rq->nr_running--;
1038 }
1039
1040 #ifdef CONFIG_FAIR_GROUP_SCHED
1041 # ifdef CONFIG_SMP
1042 static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
1043 {
1044         long tg_weight;
1045
1046         /*
1047          * Use this CPU's actual weight instead of the last load_contribution
1048          * to gain a more accurate current total weight. See
1049          * update_cfs_rq_load_contribution().
1050          */
1051         tg_weight = atomic64_read(&tg->load_avg);
1052         tg_weight -= cfs_rq->tg_load_contrib;
1053         tg_weight += cfs_rq->load.weight;
1054
1055         return tg_weight;
1056 }
1057
1058 static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
1059 {
1060         long tg_weight, load, shares;
1061
1062         tg_weight = calc_tg_weight(tg, cfs_rq);
1063         load = cfs_rq->load.weight;
1064
1065         shares = (tg->shares * load);
1066         if (tg_weight)
1067                 shares /= tg_weight;
1068
1069         if (shares < MIN_SHARES)
1070                 shares = MIN_SHARES;
1071         if (shares > tg->shares)
1072                 shares = tg->shares;
1073
1074         return shares;
1075 }
1076 # else /* CONFIG_SMP */
1077 static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
1078 {
1079         return tg->shares;
1080 }
1081 # endif /* CONFIG_SMP */
1082 static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
1083                             unsigned long weight)
1084 {
1085         if (se->on_rq) {
1086                 /* commit outstanding execution time */
1087                 if (cfs_rq->curr == se)
1088                         update_curr(cfs_rq);
1089                 account_entity_dequeue(cfs_rq, se);
1090         }
1091
1092         update_load_set(&se->load, weight);
1093
1094         if (se->on_rq)
1095                 account_entity_enqueue(cfs_rq, se);
1096 }
1097
1098 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
1099
1100 static void update_cfs_shares(struct cfs_rq *cfs_rq)
1101 {
1102         struct task_group *tg;
1103         struct sched_entity *se;
1104         long shares;
1105
1106         tg = cfs_rq->tg;
1107         se = tg->se[cpu_of(rq_of(cfs_rq))];
1108         if (!se || throttled_hierarchy(cfs_rq))
1109                 return;
1110 #ifndef CONFIG_SMP
1111         if (likely(se->load.weight == tg->shares))
1112                 return;
1113 #endif
1114         shares = calc_cfs_shares(cfs_rq, tg);
1115
1116         reweight_entity(cfs_rq_of(se), se, shares);
1117 }
1118 #else /* CONFIG_FAIR_GROUP_SCHED */
1119 static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
1120 {
1121 }
1122 #endif /* CONFIG_FAIR_GROUP_SCHED */
1123
1124 /* Only depends on SMP, FAIR_GROUP_SCHED may be removed when useful in lb */
1125 #if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)
1126 /*
1127  * We choose a half-life close to 1 scheduling period.
1128  * Note: The tables below are dependent on this value.
1129  */
1130 #define LOAD_AVG_PERIOD 32
1131 #define LOAD_AVG_MAX 47742 /* maximum possible load avg */
1132 #define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_MAX_AVG */
1133
1134 /* Precomputed fixed inverse multiplies for multiplication by y^n */
1135 static const u32 runnable_avg_yN_inv[] = {
1136         0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
1137         0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
1138         0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
1139         0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9,
1140         0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80,
1141         0x85aac367, 0x82cd8698,
1142 };
1143
1144 /*
1145  * Precomputed \Sum y^k { 1<=k<=n }.  These are floor(true_value) to prevent
1146  * over-estimates when re-combining.
1147  */
1148 static const u32 runnable_avg_yN_sum[] = {
1149             0, 1002, 1982, 2941, 3880, 4798, 5697, 6576, 7437, 8279, 9103,
1150          9909,10698,11470,12226,12966,13690,14398,15091,15769,16433,17082,
1151         17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371,
1152 };
1153
1154 /*
1155  * Approximate:
1156  *   val * y^n,    where y^32 ~= 0.5 (~1 scheduling period)
1157  */
1158 static __always_inline u64 decay_load(u64 val, u64 n)
1159 {
1160         unsigned int local_n;
1161
1162         if (!n)
1163                 return val;
1164         else if (unlikely(n > LOAD_AVG_PERIOD * 63))
1165                 return 0;
1166
1167         /* after bounds checking we can collapse to 32-bit */
1168         local_n = n;
1169
1170         /*
1171          * As y^PERIOD = 1/2, we can combine
1172          *    y^n = 1/2^(n/PERIOD) * k^(n%PERIOD)
1173          * With a look-up table which covers k^n (n<PERIOD)
1174          *
1175          * To achieve constant time decay_load.
1176          */
1177         if (unlikely(local_n >= LOAD_AVG_PERIOD)) {
1178                 val >>= local_n / LOAD_AVG_PERIOD;
1179                 local_n %= LOAD_AVG_PERIOD;
1180         }
1181
1182         val *= runnable_avg_yN_inv[local_n];
1183         /* We don't use SRR here since we always want to round down. */
1184         return val >> 32;
1185 }
1186
1187 /*
1188  * For updates fully spanning n periods, the contribution to runnable
1189  * average will be: \Sum 1024*y^n
1190  *
1191  * We can compute this reasonably efficiently by combining:
1192  *   y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for  n <PERIOD}
1193  */
1194 static u32 __compute_runnable_contrib(u64 n)
1195 {
1196         u32 contrib = 0;
1197
1198         if (likely(n <= LOAD_AVG_PERIOD))
1199                 return runnable_avg_yN_sum[n];
1200         else if (unlikely(n >= LOAD_AVG_MAX_N))
1201                 return LOAD_AVG_MAX;
1202
1203         /* Compute \Sum k^n combining precomputed values for k^i, \Sum k^j */
1204         do {
1205                 contrib /= 2; /* y^LOAD_AVG_PERIOD = 1/2 */
1206                 contrib += runnable_avg_yN_sum[LOAD_AVG_PERIOD];
1207
1208                 n -= LOAD_AVG_PERIOD;
1209         } while (n > LOAD_AVG_PERIOD);
1210
1211         contrib = decay_load(contrib, n);
1212         return contrib + runnable_avg_yN_sum[n];
1213 }
1214
1215 #ifdef CONFIG_HMP_VARIABLE_SCALE
1216
1217 #define HMP_VARIABLE_SCALE_SHIFT 16ULL
1218 struct hmp_global_attr {
1219         struct attribute attr;
1220         ssize_t (*show)(struct kobject *kobj,
1221                         struct attribute *attr, char *buf);
1222         ssize_t (*store)(struct kobject *a, struct attribute *b,
1223                         const char *c, size_t count);
1224         int *value;
1225         int (*to_sysfs)(int);
1226         int (*from_sysfs)(int);
1227 };
1228
1229 #define HMP_DATA_SYSFS_MAX 8
1230
1231 struct hmp_data_struct {
1232 #ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
1233         int freqinvar_load_scale_enabled;
1234 #endif
1235         int multiplier; /* used to scale the time delta */
1236         struct attribute_group attr_group;
1237         struct attribute *attributes[HMP_DATA_SYSFS_MAX + 1];
1238         struct hmp_global_attr attr[HMP_DATA_SYSFS_MAX];
1239 } hmp_data;
1240
1241 static u64 hmp_variable_scale_convert(u64 delta);
1242 #ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
1243 /* Frequency-Invariant Load Modification:
1244  * Loads are calculated as in PJT's patch however we also scale the current
1245  * contribution in line with the frequency of the CPU that the task was
1246  * executed on.
1247  * In this version, we use a simple linear scale derived from the maximum
1248  * frequency reported by CPUFreq. As an example:
1249  *
1250  * Consider that we ran a task for 100% of the previous interval.
1251  *
1252  * Our CPU was under asynchronous frequency control through one of the
1253  * CPUFreq governors.
1254  *
1255  * The CPUFreq governor reports that it is able to scale the CPU between
1256  * 500MHz and 1GHz.
1257  *
1258  * During the period, the CPU was running at 1GHz.
1259  *
1260  * In this case, our load contribution for that period is calculated as
1261  * 1 * (number_of_active_microseconds)
1262  *
1263  * This results in our task being able to accumulate maximum load as normal.
1264  *
1265  *
1266  * Consider now that our CPU was executing at 500MHz.
1267  *
1268  * We now scale the load contribution such that it is calculated as
1269  * 0.5 * (number_of_active_microseconds)
1270  *
1271  * Our task can only record 50% maximum load during this period.
1272  *
1273  * This represents the task consuming 50% of the CPU's *possible* compute
1274  * capacity. However the task did consume 100% of the CPU's *available*
1275  * compute capacity which is the value seen by the CPUFreq governor and
1276  * user-side CPU Utilization tools.
1277  *
1278  * Restricting tracked load to be scaled by the CPU's frequency accurately
1279  * represents the consumption of possible compute capacity and allows the
1280  * HMP migration's simple threshold migration strategy to interact more
1281  * predictably with CPUFreq's asynchronous compute capacity changes.
1282  */
1283 #define SCHED_FREQSCALE_SHIFT 10
1284 struct cpufreq_extents {
1285         u32 curr_scale;
1286         u32 min;
1287         u32 max;
1288         u32 flags;
1289 };
1290 /* Flag set when the governor in use only allows one frequency.
1291  * Disables scaling.
1292  */
1293 #define SCHED_LOAD_FREQINVAR_SINGLEFREQ 0x01
1294
1295 static struct cpufreq_extents freq_scale[CONFIG_NR_CPUS];
1296 #endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */
1297 #endif /* CONFIG_HMP_VARIABLE_SCALE */
1298
1299 /* We can represent the historical contribution to runnable average as the
1300  * coefficients of a geometric series.  To do this we sub-divide our runnable
1301  * history into segments of approximately 1ms (1024us); label the segment that
1302  * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
1303  *
1304  * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ...
1305  *      p0            p1           p2
1306  *     (now)       (~1ms ago)  (~2ms ago)
1307  *
1308  * Let u_i denote the fraction of p_i that the entity was runnable.
1309  *
1310  * We then designate the fractions u_i as our co-efficients, yielding the
1311  * following representation of historical load:
1312  *   u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ...
1313  *
1314  * We choose y based on the with of a reasonably scheduling period, fixing:
1315  *   y^32 = 0.5
1316  *
1317  * This means that the contribution to load ~32ms ago (u_32) will be weighted
1318  * approximately half as much as the contribution to load within the last ms
1319  * (u_0).
1320  *
1321  * When a period "rolls over" and we have new u_0`, multiplying the previous
1322  * sum again by y is sufficient to update:
1323  *   load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
1324  *            = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
1325  */
1326 static __always_inline int __update_entity_runnable_avg(u64 now,
1327                                                         struct sched_avg *sa,
1328                                                         int runnable,
1329                                                         int running,
1330                                                         int cpu)
1331 {
1332         u64 delta, periods;
1333         u32 runnable_contrib;
1334         int delta_w, decayed = 0;
1335 #ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
1336         u64 scaled_delta;
1337         u32 scaled_runnable_contrib;
1338         int scaled_delta_w;
1339         u32 curr_scale = 1024;
1340 #endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */
1341
1342         delta = now - sa->last_runnable_update;
1343 #ifdef CONFIG_HMP_VARIABLE_SCALE
1344         delta = hmp_variable_scale_convert(delta);
1345 #endif
1346         /*
1347          * This should only happen when time goes backwards, which it
1348          * unfortunately does during sched clock init when we swap over to TSC.
1349          */
1350         if ((s64)delta < 0) {
1351                 sa->last_runnable_update = now;
1352                 return 0;
1353         }
1354
1355         /*
1356          * Use 1024ns as the unit of measurement since it's a reasonable
1357          * approximation of 1us and fast to compute.
1358          */
1359         delta >>= 10;
1360         if (!delta)
1361                 return 0;
1362         sa->last_runnable_update = now;
1363
1364 #ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
1365         /* retrieve scale factor for load */
1366         if (hmp_data.freqinvar_load_scale_enabled)
1367                 curr_scale = freq_scale[cpu].curr_scale;
1368 #endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */
1369
1370         /* delta_w is the amount already accumulated against our next period */
1371         delta_w = sa->runnable_avg_period % 1024;
1372         if (delta + delta_w >= 1024) {
1373                 /* period roll-over */
1374                 decayed = 1;
1375
1376                 /*
1377                  * Now that we know we're crossing a period boundary, figure
1378                  * out how much from delta we need to complete the current
1379                  * period and accrue it.
1380                  */
1381                 delta_w = 1024 - delta_w;
1382                 /* scale runnable time if necessary */
1383 #ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
1384                 scaled_delta_w = (delta_w * curr_scale)
1385                                 >> SCHED_FREQSCALE_SHIFT;
1386                 if (runnable)
1387                         sa->runnable_avg_sum += scaled_delta_w;
1388                 if (running)
1389                         sa->usage_avg_sum += scaled_delta_w;
1390 #else
1391                 if (runnable)
1392                         sa->runnable_avg_sum += delta_w;
1393                 if (running)
1394                         sa->usage_avg_sum += delta_w;
1395 #endif /* #ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */
1396                 sa->runnable_avg_period += delta_w;
1397
1398                 delta -= delta_w;
1399
1400                 /* Figure out how many additional periods this update spans */
1401                 periods = delta / 1024;
1402                 delta %= 1024;
1403                 /* decay the load we have accumulated so far */
1404                 sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum,
1405                                                   periods + 1);
1406                 sa->runnable_avg_period = decay_load(sa->runnable_avg_period,
1407                                                      periods + 1);
1408                 sa->usage_avg_sum = decay_load(sa->usage_avg_sum, periods + 1);
1409                 /* add the contribution from this period */
1410                 /* Efficiently calculate \sum (1..n_period) 1024*y^i */
1411                 runnable_contrib = __compute_runnable_contrib(periods);
1412                 /* Apply load scaling if necessary.
1413                  * Note that multiplying the whole series is same as
1414                  * multiplying all terms
1415                  */
1416 #ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
1417                 scaled_runnable_contrib = (runnable_contrib * curr_scale)
1418                                 >> SCHED_FREQSCALE_SHIFT;
1419                 if (runnable)
1420                         sa->runnable_avg_sum += scaled_runnable_contrib;
1421                 if (running)
1422                         sa->usage_avg_sum += scaled_runnable_contrib;
1423 #else
1424                 if (runnable)
1425                         sa->runnable_avg_sum += runnable_contrib;
1426                 if (running)
1427                         sa->usage_avg_sum += runnable_contrib;
1428 #endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */
1429                 sa->runnable_avg_period += runnable_contrib;
1430         }
1431
1432         /* Remainder of delta accrued against u_0` */
1433         /* scale if necessary */
1434 #ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
1435         scaled_delta = ((delta * curr_scale) >> SCHED_FREQSCALE_SHIFT);
1436         if (runnable)
1437                 sa->runnable_avg_sum += scaled_delta;
1438         if (running)
1439                 sa->usage_avg_sum += scaled_delta;
1440 #else
1441         if (runnable)
1442                 sa->runnable_avg_sum += delta;
1443         if (running)
1444                 sa->usage_avg_sum += delta;
1445 #endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */
1446         sa->runnable_avg_period += delta;
1447
1448         return decayed;
1449 }
1450
1451 /* Synchronize an entity's decay with its parenting cfs_rq.*/
1452 static inline u64 __synchronize_entity_decay(struct sched_entity *se)
1453 {
1454         struct cfs_rq *cfs_rq = cfs_rq_of(se);
1455         u64 decays = atomic64_read(&cfs_rq->decay_counter);
1456
1457         decays -= se->avg.decay_count;
1458         if (!decays)
1459                 return 0;
1460
1461         se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays);
1462         se->avg.decay_count = 0;
1463
1464         return decays;
1465 }
1466
1467 #ifdef CONFIG_FAIR_GROUP_SCHED
1468 static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
1469                                                  int force_update)
1470 {
1471         struct task_group *tg = cfs_rq->tg;
1472         s64 tg_contrib;
1473
1474         tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg;
1475         tg_contrib -= cfs_rq->tg_load_contrib;
1476
1477         if (force_update || abs64(tg_contrib) > cfs_rq->tg_load_contrib / 8) {
1478                 atomic64_add(tg_contrib, &tg->load_avg);
1479                 cfs_rq->tg_load_contrib += tg_contrib;
1480         }
1481 }
1482
1483 /*
1484  * Aggregate cfs_rq runnable averages into an equivalent task_group
1485  * representation for computing load contributions.
1486  */
1487 static inline void __update_tg_runnable_avg(struct sched_avg *sa,
1488                                                   struct cfs_rq *cfs_rq)
1489 {
1490         struct task_group *tg = cfs_rq->tg;
1491         long contrib, usage_contrib;
1492
1493         /* The fraction of a cpu used by this cfs_rq */
1494         contrib = div_u64(sa->runnable_avg_sum << NICE_0_SHIFT,
1495                           sa->runnable_avg_period + 1);
1496         contrib -= cfs_rq->tg_runnable_contrib;
1497
1498         usage_contrib = div_u64(sa->usage_avg_sum << NICE_0_SHIFT,
1499                                 sa->runnable_avg_period + 1);
1500         usage_contrib -= cfs_rq->tg_usage_contrib;
1501
1502         /*
1503          * contrib/usage at this point represent deltas, only update if they
1504          * are substantive.
1505          */
1506         if ((abs(contrib) > cfs_rq->tg_runnable_contrib / 64) ||
1507             (abs(usage_contrib) > cfs_rq->tg_usage_contrib / 64)) {
1508                 atomic_add(contrib, &tg->runnable_avg);
1509                 cfs_rq->tg_runnable_contrib += contrib;
1510
1511                 atomic_add(usage_contrib, &tg->usage_avg);
1512                 cfs_rq->tg_usage_contrib += usage_contrib;
1513         }
1514 }
1515
1516 static inline void __update_group_entity_contrib(struct sched_entity *se)
1517 {
1518         struct cfs_rq *cfs_rq = group_cfs_rq(se);
1519         struct task_group *tg = cfs_rq->tg;
1520         int runnable_avg;
1521
1522         u64 contrib;
1523
1524         contrib = cfs_rq->tg_load_contrib * tg->shares;
1525         se->avg.load_avg_contrib = div64_u64(contrib,
1526                                              atomic64_read(&tg->load_avg) + 1);
1527
1528         /*
1529          * For group entities we need to compute a correction term in the case
1530          * that they are consuming <1 cpu so that we would contribute the same
1531          * load as a task of equal weight.
1532          *
1533          * Explicitly co-ordinating this measurement would be expensive, but
1534          * fortunately the sum of each cpus contribution forms a usable
1535          * lower-bound on the true value.
1536          *
1537          * Consider the aggregate of 2 contributions.  Either they are disjoint
1538          * (and the sum represents true value) or they are disjoint and we are
1539          * understating by the aggregate of their overlap.
1540          *
1541          * Extending this to N cpus, for a given overlap, the maximum amount we
1542          * understand is then n_i(n_i+1)/2 * w_i where n_i is the number of
1543          * cpus that overlap for this interval and w_i is the interval width.
1544          *
1545          * On a small machine; the first term is well-bounded which bounds the
1546          * total error since w_i is a subset of the period.  Whereas on a
1547          * larger machine, while this first term can be larger, if w_i is the
1548          * of consequential size guaranteed to see n_i*w_i quickly converge to
1549          * our upper bound of 1-cpu.
1550          */
1551         runnable_avg = atomic_read(&tg->runnable_avg);
1552         if (runnable_avg < NICE_0_LOAD) {
1553                 se->avg.load_avg_contrib *= runnable_avg;
1554                 se->avg.load_avg_contrib >>= NICE_0_SHIFT;
1555         }
1556 }
1557 #else
1558 static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
1559                                                  int force_update) {}
1560 static inline void __update_tg_runnable_avg(struct sched_avg *sa,
1561                                                   struct cfs_rq *cfs_rq) {}
1562 static inline void __update_group_entity_contrib(struct sched_entity *se) {}
1563 #endif
1564
1565 static inline void __update_task_entity_contrib(struct sched_entity *se)
1566 {
1567         u32 contrib;
1568
1569         /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */
1570         contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight);
1571         contrib /= (se->avg.runnable_avg_period + 1);
1572         se->avg.load_avg_contrib = scale_load(contrib);
1573         trace_sched_task_load_contrib(task_of(se), se->avg.load_avg_contrib);
1574         contrib = se->avg.runnable_avg_sum * scale_load_down(NICE_0_LOAD);
1575         contrib /= (se->avg.runnable_avg_period + 1);
1576         se->avg.load_avg_ratio = scale_load(contrib);
1577         trace_sched_task_runnable_ratio(task_of(se), se->avg.load_avg_ratio);
1578 }
1579
1580 /* Compute the current contribution to load_avg by se, return any delta */
1581 static long __update_entity_load_avg_contrib(struct sched_entity *se, long *ratio)
1582 {
1583         long old_contrib = se->avg.load_avg_contrib;
1584         long old_ratio   = se->avg.load_avg_ratio;
1585
1586         if (entity_is_task(se)) {
1587                 __update_task_entity_contrib(se);
1588         } else {
1589                 __update_tg_runnable_avg(&se->avg, group_cfs_rq(se));
1590                 __update_group_entity_contrib(se);
1591         }
1592
1593         if (ratio)
1594                 *ratio = se->avg.load_avg_ratio - old_ratio;
1595         return se->avg.load_avg_contrib - old_contrib;
1596 }
1597
1598 static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq,
1599                                                  long load_contrib)
1600 {
1601         if (likely(load_contrib < cfs_rq->blocked_load_avg))
1602                 cfs_rq->blocked_load_avg -= load_contrib;
1603         else
1604                 cfs_rq->blocked_load_avg = 0;
1605 }
1606
1607 static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
1608
1609 /* Update a sched_entity's runnable average */
1610 static inline void update_entity_load_avg(struct sched_entity *se,
1611                                           int update_cfs_rq)
1612 {
1613         struct cfs_rq *cfs_rq = cfs_rq_of(se);
1614         long contrib_delta, ratio_delta;
1615         u64 now;
1616         int cpu = -1;   /* not used in normal case */
1617
1618 #ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
1619         cpu = cfs_rq->rq->cpu;
1620 #endif
1621         /*
1622          * For a group entity we need to use their owned cfs_rq_clock_task() in
1623          * case they are the parent of a throttled hierarchy.
1624          */
1625         if (entity_is_task(se))
1626                 now = cfs_rq_clock_task(cfs_rq);
1627         else
1628                 now = cfs_rq_clock_task(group_cfs_rq(se));
1629
1630         if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq,
1631                         cfs_rq->curr == se, cpu))
1632                 return;
1633
1634         contrib_delta = __update_entity_load_avg_contrib(se, &ratio_delta);
1635
1636         if (!update_cfs_rq)
1637                 return;
1638
1639         if (se->on_rq) {
1640                 cfs_rq->runnable_load_avg += contrib_delta;
1641                 rq_of(cfs_rq)->avg.load_avg_ratio += ratio_delta;
1642         } else {
1643                 subtract_blocked_load_contrib(cfs_rq, -contrib_delta);
1644         }
1645 }
1646
1647 /*
1648  * Decay the load contributed by all blocked children and account this so that
1649  * their contribution may appropriately discounted when they wake up.
1650  */
1651 static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)
1652 {
1653         u64 now = cfs_rq_clock_task(cfs_rq) >> 20;
1654         u64 decays;
1655
1656         decays = now - cfs_rq->last_decay;
1657         if (!decays && !force_update)
1658                 return;
1659
1660         if (atomic64_read(&cfs_rq->removed_load)) {
1661                 u64 removed_load = atomic64_xchg(&cfs_rq->removed_load, 0);
1662                 subtract_blocked_load_contrib(cfs_rq, removed_load);
1663         }
1664
1665         if (decays) {
1666                 cfs_rq->blocked_load_avg = decay_load(cfs_rq->blocked_load_avg,
1667                                                       decays);
1668                 atomic64_add(decays, &cfs_rq->decay_counter);
1669                 cfs_rq->last_decay = now;
1670         }
1671
1672         __update_cfs_rq_tg_load_contrib(cfs_rq, force_update);
1673 }
1674
1675 static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
1676 {
1677         int cpu = -1;   /* not used in normal case */
1678
1679 #ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
1680         cpu = rq->cpu;
1681 #endif
1682         __update_entity_runnable_avg(rq->clock_task, &rq->avg, runnable,
1683                                      runnable, cpu);
1684         __update_tg_runnable_avg(&rq->avg, &rq->cfs);
1685         trace_sched_rq_runnable_ratio(cpu_of(rq), rq->avg.load_avg_ratio);
1686         trace_sched_rq_runnable_load(cpu_of(rq), rq->cfs.runnable_load_avg);
1687         trace_sched_rq_nr_running(cpu_of(rq), rq->nr_running, rq->nr_iowait.counter);
1688 }
1689
1690 /* Add the load generated by se into cfs_rq's child load-average */
1691 static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
1692                                                   struct sched_entity *se,
1693                                                   int wakeup)
1694 {
1695         /*
1696          * We track migrations using entity decay_count <= 0, on a wake-up
1697          * migration we use a negative decay count to track the remote decays
1698          * accumulated while sleeping.
1699          */
1700         if (unlikely(se->avg.decay_count <= 0)) {
1701                 se->avg.last_runnable_update = rq_of(cfs_rq)->clock_task;
1702                 if (se->avg.decay_count) {
1703                         /*
1704                          * In a wake-up migration we have to approximate the
1705                          * time sleeping.  This is because we can't synchronize
1706                          * clock_task between the two cpus, and it is not
1707                          * guaranteed to be read-safe.  Instead, we can
1708                          * approximate this using our carried decays, which are
1709                          * explicitly atomically readable.
1710                          */
1711                         se->avg.last_runnable_update -= (-se->avg.decay_count)
1712                                                         << 20;
1713                         update_entity_load_avg(se, 0);
1714                         /* Indicate that we're now synchronized and on-rq */
1715                         se->avg.decay_count = 0;
1716                 }
1717                 wakeup = 0;
1718         } else {
1719                 __synchronize_entity_decay(se);
1720         }
1721
1722         /* migrated tasks did not contribute to our blocked load */
1723         if (wakeup) {
1724                 subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib);
1725                 update_entity_load_avg(se, 0);
1726         }
1727
1728         cfs_rq->runnable_load_avg += se->avg.load_avg_contrib;
1729         rq_of(cfs_rq)->avg.load_avg_ratio += se->avg.load_avg_ratio;
1730
1731         /* we force update consideration on load-balancer moves */
1732         update_cfs_rq_blocked_load(cfs_rq, !wakeup);
1733 }
1734
1735 /*
1736  * Remove se's load from this cfs_rq child load-average, if the entity is
1737  * transitioning to a blocked state we track its projected decay using
1738  * blocked_load_avg.
1739  */
1740 static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
1741                                                   struct sched_entity *se,
1742                                                   int sleep)
1743 {
1744         update_entity_load_avg(se, 1);
1745         /* we force update consideration on load-balancer moves */
1746         update_cfs_rq_blocked_load(cfs_rq, !sleep);
1747
1748         cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib;
1749         rq_of(cfs_rq)->avg.load_avg_ratio -= se->avg.load_avg_ratio;
1750
1751         if (sleep) {
1752                 cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;
1753                 se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
1754         } /* migrations, e.g. sleep=0 leave decay_count == 0 */
1755 }
1756
1757 /*
1758  * Update the rq's load with the elapsed running time before entering
1759  * idle. if the last scheduled task is not a CFS task, idle_enter will
1760  * be the only way to update the runnable statistic.
1761  */
1762 void idle_enter_fair(struct rq *this_rq)
1763 {
1764         update_rq_runnable_avg(this_rq, 1);
1765 }
1766
1767 /*
1768  * Update the rq's load with the elapsed idle time before a task is
1769  * scheduled. if the newly scheduled task is not a CFS task, idle_exit will
1770  * be the only way to update the runnable statistic.
1771  */
1772 void idle_exit_fair(struct rq *this_rq)
1773 {
1774         update_rq_runnable_avg(this_rq, 0);
1775 }
1776
1777 #else
1778 static inline void update_entity_load_avg(struct sched_entity *se,
1779                                           int update_cfs_rq) {}
1780 static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {}
1781 static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
1782                                            struct sched_entity *se,
1783                                            int wakeup) {}
1784 static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
1785                                            struct sched_entity *se,
1786                                            int sleep) {}
1787 static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
1788                                               int force_update) {}
1789 #endif
1790
1791 static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
1792 {
1793 #ifdef CONFIG_SCHEDSTATS
1794         struct task_struct *tsk = NULL;
1795
1796         if (entity_is_task(se))
1797                 tsk = task_of(se);
1798
1799         if (se->statistics.sleep_start) {
1800                 u64 delta = rq_of(cfs_rq)->clock - se->statistics.sleep_start;
1801
1802                 if ((s64)delta < 0)
1803                         delta = 0;
1804
1805                 if (unlikely(delta > se->statistics.sleep_max))
1806                         se->statistics.sleep_max = delta;
1807
1808                 se->statistics.sleep_start = 0;
1809                 se->statistics.sum_sleep_runtime += delta;
1810
1811                 if (tsk) {
1812                         account_scheduler_latency(tsk, delta >> 10, 1);
1813                         trace_sched_stat_sleep(tsk, delta);
1814                 }
1815         }
1816         if (se->statistics.block_start) {
1817                 u64 delta = rq_of(cfs_rq)->clock - se->statistics.block_start;
1818
1819                 if ((s64)delta < 0)
1820                         delta = 0;
1821
1822                 if (unlikely(delta > se->statistics.block_max))
1823                         se->statistics.block_max = delta;
1824
1825                 se->statistics.block_start = 0;
1826                 se->statistics.sum_sleep_runtime += delta;
1827
1828                 if (tsk) {
1829                         if (tsk->in_iowait) {
1830                                 se->statistics.iowait_sum += delta;
1831                                 se->statistics.iowait_count++;
1832                                 trace_sched_stat_iowait(tsk, delta);
1833                         }
1834
1835                         trace_sched_stat_blocked(tsk, delta);
1836
1837                         /*
1838                          * Blocking time is in units of nanosecs, so shift by
1839                          * 20 to get a milliseconds-range estimation of the
1840                          * amount of time that the task spent sleeping:
1841                          */
1842                         if (unlikely(prof_on == SLEEP_PROFILING)) {
1843                                 profile_hits(SLEEP_PROFILING,
1844                                                 (void *)get_wchan(tsk),
1845                                                 delta >> 20);
1846                         }
1847                         account_scheduler_latency(tsk, delta >> 10, 0);
1848                 }
1849         }
1850 #endif
1851 }
1852
1853 static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
1854 {
1855 #ifdef CONFIG_SCHED_DEBUG
1856         s64 d = se->vruntime - cfs_rq->min_vruntime;
1857
1858         if (d < 0)
1859                 d = -d;
1860
1861         if (d > 3*sysctl_sched_latency)
1862                 schedstat_inc(cfs_rq, nr_spread_over);
1863 #endif
1864 }
1865
1866 static void
1867 place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
1868 {
1869         u64 vruntime = cfs_rq->min_vruntime;
1870
1871         /*
1872          * The 'current' period is already promised to the current tasks,
1873          * however the extra weight of the new task will slow them down a
1874          * little, place the new task so that it fits in the slot that
1875          * stays open at the end.
1876          */
1877         if (initial && sched_feat(START_DEBIT))
1878                 vruntime += sched_vslice(cfs_rq, se);
1879
1880         /* sleeps up to a single latency don't count. */
1881         if (!initial) {
1882                 unsigned long thresh = sysctl_sched_latency;
1883
1884                 /*
1885                  * Halve their sleep time's effect, to allow
1886                  * for a gentler effect of sleepers:
1887                  */
1888                 if (sched_feat(GENTLE_FAIR_SLEEPERS))
1889                         thresh >>= 1;
1890
1891                 vruntime -= thresh;
1892         }
1893
1894         /* ensure we never gain time by being placed backwards. */
1895         se->vruntime = max_vruntime(se->vruntime, vruntime);
1896 }
1897
1898 static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
1899
1900 static void
1901 enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
1902 {
1903         /*
1904          * Update the normalized vruntime before updating min_vruntime
1905          * through callig update_curr().
1906          */
1907         if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))
1908                 se->vruntime += cfs_rq->min_vruntime;
1909
1910         /*
1911          * Update run-time statistics of the 'current'.
1912          */
1913         update_curr(cfs_rq);
1914         enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP);
1915         account_entity_enqueue(cfs_rq, se);
1916         update_cfs_shares(cfs_rq);
1917
1918         if (flags & ENQUEUE_WAKEUP) {
1919                 place_entity(cfs_rq, se, 0);
1920                 enqueue_sleeper(cfs_rq, se);
1921         }
1922
1923         update_stats_enqueue(cfs_rq, se);
1924         check_spread(cfs_rq, se);
1925         if (se != cfs_rq->curr)
1926                 __enqueue_entity(cfs_rq, se);
1927         se->on_rq = 1;
1928
1929         if (cfs_rq->nr_running == 1) {
1930                 list_add_leaf_cfs_rq(cfs_rq);
1931                 check_enqueue_throttle(cfs_rq);
1932         }
1933 }
1934
1935 static void __clear_buddies_last(struct sched_entity *se)
1936 {
1937         for_each_sched_entity(se) {
1938                 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1939                 if (cfs_rq->last == se)
1940                         cfs_rq->last = NULL;
1941                 else
1942                         break;
1943         }
1944 }
1945
1946 static void __clear_buddies_next(struct sched_entity *se)
1947 {
1948         for_each_sched_entity(se) {
1949                 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1950                 if (cfs_rq->next == se)
1951                         cfs_rq->next = NULL;
1952                 else
1953                         break;
1954         }
1955 }
1956
1957 static void __clear_buddies_skip(struct sched_entity *se)
1958 {
1959         for_each_sched_entity(se) {
1960                 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1961                 if (cfs_rq->skip == se)
1962                         cfs_rq->skip = NULL;
1963                 else
1964                         break;
1965         }
1966 }
1967
1968 static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
1969 {
1970         if (cfs_rq->last == se)
1971                 __clear_buddies_last(se);
1972
1973         if (cfs_rq->next == se)
1974                 __clear_buddies_next(se);
1975
1976         if (cfs_rq->skip == se)
1977                 __clear_buddies_skip(se);
1978 }
1979
1980 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
1981
1982 static void
1983 dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
1984 {
1985         /*
1986          * Update run-time statistics of the 'current'.
1987          */
1988         update_curr(cfs_rq);
1989         dequeue_entity_load_avg(cfs_rq, se, flags & DEQUEUE_SLEEP);
1990
1991         update_stats_dequeue(cfs_rq, se);
1992         if (flags & DEQUEUE_SLEEP) {
1993 #ifdef CONFIG_SCHEDSTATS
1994                 if (entity_is_task(se)) {
1995                         struct task_struct *tsk = task_of(se);
1996
1997                         if (tsk->state & TASK_INTERRUPTIBLE)
1998                                 se->statistics.sleep_start = rq_of(cfs_rq)->clock;
1999                         if (tsk->state & TASK_UNINTERRUPTIBLE)
2000                                 se->statistics.block_start = rq_of(cfs_rq)->clock;
2001                 }
2002 #endif
2003         }
2004
2005         clear_buddies(cfs_rq, se);
2006
2007         if (se != cfs_rq->curr)
2008                 __dequeue_entity(cfs_rq, se);
2009         se->on_rq = 0;
2010         account_entity_dequeue(cfs_rq, se);
2011
2012         /*
2013          * Normalize the entity after updating the min_vruntime because the
2014          * update can refer to the ->curr item and we need to reflect this
2015          * movement in our normalized position.
2016          */
2017         if (!(flags & DEQUEUE_SLEEP))
2018                 se->vruntime -= cfs_rq->min_vruntime;
2019
2020         /* return excess runtime on last dequeue */
2021         return_cfs_rq_runtime(cfs_rq);
2022
2023         update_min_vruntime(cfs_rq);
2024         update_cfs_shares(cfs_rq);
2025 }
2026
2027 /*
2028  * Preempt the current task with a newly woken task if needed:
2029  */
2030 static void
2031 check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
2032 {
2033         unsigned long ideal_runtime, delta_exec;
2034         struct sched_entity *se;
2035         s64 delta;
2036
2037         ideal_runtime = sched_slice(cfs_rq, curr);
2038         delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
2039         if (delta_exec > ideal_runtime) {
2040                 resched_task(rq_of(cfs_rq)->curr);
2041                 /*
2042                  * The current task ran long enough, ensure it doesn't get
2043                  * re-elected due to buddy favours.
2044                  */
2045                 clear_buddies(cfs_rq, curr);
2046                 return;
2047         }
2048
2049         /*
2050          * Ensure that a task that missed wakeup preemption by a
2051          * narrow margin doesn't have to wait for a full slice.
2052          * This also mitigates buddy induced latencies under load.
2053          */
2054         if (delta_exec < sysctl_sched_min_granularity)
2055                 return;
2056
2057         se = __pick_first_entity(cfs_rq);
2058         delta = curr->vruntime - se->vruntime;
2059
2060         if (delta < 0)
2061                 return;
2062
2063         if (delta > ideal_runtime)
2064                 resched_task(rq_of(cfs_rq)->curr);
2065 }
2066
2067 static void
2068 set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
2069 {
2070         /* 'current' is not kept within the tree. */
2071         if (se->on_rq) {
2072                 /*
2073                  * Any task has to be enqueued before it get to execute on
2074                  * a CPU. So account for the time it spent waiting on the
2075                  * runqueue.
2076                  */
2077                 update_stats_wait_end(cfs_rq, se);
2078                 __dequeue_entity(cfs_rq, se);
2079                 update_entity_load_avg(se, 1);
2080         }
2081
2082         update_stats_curr_start(cfs_rq, se);
2083         cfs_rq->curr = se;
2084 #ifdef CONFIG_SCHEDSTATS
2085         /*
2086          * Track our maximum slice length, if the CPU's load is at
2087          * least twice that of our own weight (i.e. dont track it
2088          * when there are only lesser-weight tasks around):
2089          */
2090         if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
2091                 se->statistics.slice_max = max(se->statistics.slice_max,
2092                         se->sum_exec_runtime - se->prev_sum_exec_runtime);
2093         }
2094 #endif
2095         se->prev_sum_exec_runtime = se->sum_exec_runtime;
2096 }
2097
2098 static int
2099 wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
2100
2101 /*
2102  * Pick the next process, keeping these things in mind, in this order:
2103  * 1) keep things fair between processes/task groups
2104  * 2) pick the "next" process, since someone really wants that to run
2105  * 3) pick the "last" process, for cache locality
2106  * 4) do not run the "skip" process, if something else is available
2107  */
2108 static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
2109 {
2110         struct sched_entity *se = __pick_first_entity(cfs_rq);
2111         struct sched_entity *left = se;
2112
2113         /*
2114          * Avoid running the skip buddy, if running something else can
2115          * be done without getting too unfair.
2116          */
2117         if (cfs_rq->skip == se) {
2118                 struct sched_entity *second = __pick_next_entity(se);
2119                 if (second && wakeup_preempt_entity(second, left) < 1)
2120                         se = second;
2121         }
2122
2123         /*
2124          * Prefer last buddy, try to return the CPU to a preempted task.
2125          */
2126         if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
2127                 se = cfs_rq->last;
2128
2129         /*
2130          * Someone really wants this to run. If it's not unfair, run it.
2131          */
2132         if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
2133                 se = cfs_rq->next;
2134
2135         clear_buddies(cfs_rq, se);
2136
2137         return se;
2138 }
2139
2140 static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
2141
2142 static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
2143 {
2144         /*
2145          * If still on the runqueue then deactivate_task()
2146          * was not called and update_curr() has to be done:
2147          */
2148         if (prev->on_rq)
2149                 update_curr(cfs_rq);
2150
2151         /* throttle cfs_rqs exceeding runtime */
2152         check_cfs_rq_runtime(cfs_rq);
2153
2154         check_spread(cfs_rq, prev);
2155         if (prev->on_rq) {
2156                 update_stats_wait_start(cfs_rq, prev);
2157                 /* Put 'current' back into the tree. */
2158                 __enqueue_entity(cfs_rq, prev);
2159                 /* in !on_rq case, update occurred at dequeue */
2160                 update_entity_load_avg(prev, 1);
2161         }
2162         cfs_rq->curr = NULL;
2163 }
2164
2165 static void
2166 entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
2167 {
2168         /*
2169          * Update run-time statistics of the 'current'.
2170          */
2171         update_curr(cfs_rq);
2172
2173         /*
2174          * Ensure that runnable average is periodically updated.
2175          */
2176         update_entity_load_avg(curr, 1);
2177         update_cfs_rq_blocked_load(cfs_rq, 1);
2178
2179 #ifdef CONFIG_SCHED_HRTICK
2180         /*
2181          * queued ticks are scheduled to match the slice, so don't bother
2182          * validating it and just reschedule.
2183          */
2184         if (queued) {
2185                 resched_task(rq_of(cfs_rq)->curr);
2186                 return;
2187         }
2188         /*
2189          * don't let the period tick interfere with the hrtick preemption
2190          */
2191         if (!sched_feat(DOUBLE_TICK) &&
2192                         hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
2193                 return;
2194 #endif
2195
2196         if (cfs_rq->nr_running > 1)
2197                 check_preempt_tick(cfs_rq, curr);
2198 }
2199
2200
2201 /**************************************************
2202  * CFS bandwidth control machinery
2203  */
2204
2205 #ifdef CONFIG_CFS_BANDWIDTH
2206
2207 #ifdef HAVE_JUMP_LABEL
2208 static struct static_key __cfs_bandwidth_used;
2209
2210 static inline bool cfs_bandwidth_used(void)
2211 {
2212         return static_key_false(&__cfs_bandwidth_used);
2213 }
2214
2215 void account_cfs_bandwidth_used(int enabled, int was_enabled)
2216 {
2217         /* only need to count groups transitioning between enabled/!enabled */
2218         if (enabled && !was_enabled)
2219                 static_key_slow_inc(&__cfs_bandwidth_used);
2220         else if (!enabled && was_enabled)
2221                 static_key_slow_dec(&__cfs_bandwidth_used);
2222 }
2223 #else /* HAVE_JUMP_LABEL */
2224 static bool cfs_bandwidth_used(void)
2225 {
2226         return true;
2227 }
2228
2229 void account_cfs_bandwidth_used(int enabled, int was_enabled) {}
2230 #endif /* HAVE_JUMP_LABEL */
2231
2232 /*
2233  * default period for cfs group bandwidth.
2234  * default: 0.1s, units: nanoseconds
2235  */
2236 static inline u64 default_cfs_period(void)
2237 {
2238         return 100000000ULL;
2239 }
2240
2241 static inline u64 sched_cfs_bandwidth_slice(void)
2242 {
2243         return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;
2244 }
2245
2246 /*
2247  * Replenish runtime according to assigned quota and update expiration time.
2248  * We use sched_clock_cpu directly instead of rq->clock to avoid adding
2249  * additional synchronization around rq->lock.
2250  *
2251  * requires cfs_b->lock
2252  */
2253 void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
2254 {
2255         u64 now;
2256
2257         if (cfs_b->quota == RUNTIME_INF)
2258                 return;
2259
2260         now = sched_clock_cpu(smp_processor_id());
2261         cfs_b->runtime = cfs_b->quota;
2262         cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
2263 }
2264
2265 static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
2266 {
2267         return &tg->cfs_bandwidth;
2268 }
2269
2270 /* rq->task_clock normalized against any time this cfs_rq has spent throttled */
2271 static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
2272 {
2273         if (unlikely(cfs_rq->throttle_count))
2274                 return cfs_rq->throttled_clock_task;
2275
2276         return rq_of(cfs_rq)->clock_task - cfs_rq->throttled_clock_task_time;
2277 }
2278
2279 /* returns 0 on failure to allocate runtime */
2280 static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
2281 {
2282         struct task_group *tg = cfs_rq->tg;
2283         struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
2284         u64 amount = 0, min_amount, expires;
2285
2286         /* note: this is a positive sum as runtime_remaining <= 0 */
2287         min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
2288
2289         raw_spin_lock(&cfs_b->lock);
2290         if (cfs_b->quota == RUNTIME_INF)
2291                 amount = min_amount;
2292         else {
2293                 /*
2294                  * If the bandwidth pool has become inactive, then at least one
2295                  * period must have elapsed since the last consumption.
2296                  * Refresh the global state and ensure bandwidth timer becomes
2297                  * active.
2298                  */
2299                 if (!cfs_b->timer_active) {
2300                         __refill_cfs_bandwidth_runtime(cfs_b);
2301                         __start_cfs_bandwidth(cfs_b);
2302                 }
2303
2304                 if (cfs_b->runtime > 0) {
2305                         amount = min(cfs_b->runtime, min_amount);
2306                         cfs_b->runtime -= amount;
2307                         cfs_b->idle = 0;
2308                 }
2309         }
2310         expires = cfs_b->runtime_expires;
2311         raw_spin_unlock(&cfs_b->lock);
2312
2313         cfs_rq->runtime_remaining += amount;
2314         /*
2315          * we may have advanced our local expiration to account for allowed
2316          * spread between our sched_clock and the one on which runtime was
2317          * issued.
2318          */
2319         if ((s64)(expires - cfs_rq->runtime_expires) > 0)
2320                 cfs_rq->runtime_expires = expires;
2321
2322         return cfs_rq->runtime_remaining > 0;
2323 }
2324
2325 /*
2326  * Note: This depends on the synchronization provided by sched_clock and the
2327  * fact that rq->clock snapshots this value.
2328  */
2329 static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
2330 {
2331         struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
2332         struct rq *rq = rq_of(cfs_rq);
2333
2334         /* if the deadline is ahead of our clock, nothing to do */
2335         if (likely((s64)(rq->clock - cfs_rq->runtime_expires) < 0))
2336                 return;
2337
2338         if (cfs_rq->runtime_remaining < 0)
2339                 return;
2340
2341         /*
2342          * If the local deadline has passed we have to consider the
2343          * possibility that our sched_clock is 'fast' and the global deadline
2344          * has not truly expired.
2345          *
2346          * Fortunately we can check determine whether this the case by checking
2347          * whether the global deadline has advanced.
2348          */
2349
2350         if ((s64)(cfs_rq->runtime_expires - cfs_b->runtime_expires) >= 0) {
2351                 /* extend local deadline, drift is bounded above by 2 ticks */
2352                 cfs_rq->runtime_expires += TICK_NSEC;
2353         } else {
2354                 /* global deadline is ahead, expiration has passed */
2355                 cfs_rq->runtime_remaining = 0;
2356         }
2357 }
2358
2359 static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
2360                                      unsigned long delta_exec)
2361 {
2362         /* dock delta_exec before expiring quota (as it could span periods) */
2363         cfs_rq->runtime_remaining -= delta_exec;
2364         expire_cfs_rq_runtime(cfs_rq);
2365
2366         if (likely(cfs_rq->runtime_remaining > 0))
2367                 return;
2368
2369         /*
2370          * if we're unable to extend our runtime we resched so that the active
2371          * hierarchy can be throttled
2372          */
2373         if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
2374                 resched_task(rq_of(cfs_rq)->curr);
2375 }
2376
2377 static __always_inline
2378 void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec)
2379 {
2380         if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
2381                 return;
2382
2383         __account_cfs_rq_runtime(cfs_rq, delta_exec);
2384 }
2385
2386 static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
2387 {
2388         return cfs_bandwidth_used() && cfs_rq->throttled;
2389 }
2390
2391 /* check whether cfs_rq, or any parent, is throttled */
2392 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
2393 {
2394         return cfs_bandwidth_used() && cfs_rq->throttle_count;
2395 }
2396
2397 /*
2398  * Ensure that neither of the group entities corresponding to src_cpu or
2399  * dest_cpu are members of a throttled hierarchy when performing group
2400  * load-balance operations.
2401  */
2402 static inline int throttled_lb_pair(struct task_group *tg,
2403                                     int src_cpu, int dest_cpu)
2404 {
2405         struct cfs_rq *src_cfs_rq, *dest_cfs_rq;
2406
2407         src_cfs_rq = tg->cfs_rq[src_cpu];
2408         dest_cfs_rq = tg->cfs_rq[dest_cpu];
2409
2410         return throttled_hierarchy(src_cfs_rq) ||
2411                throttled_hierarchy(dest_cfs_rq);
2412 }
2413
2414 /* updated child weight may affect parent so we have to do this bottom up */
2415 static int tg_unthrottle_up(struct task_group *tg, void *data)
2416 {
2417         struct rq *rq = data;
2418         struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
2419
2420         cfs_rq->throttle_count--;
2421 #ifdef CONFIG_SMP
2422         if (!cfs_rq->throttle_count) {
2423                 /* adjust cfs_rq_clock_task() */
2424                 cfs_rq->throttled_clock_task_time += rq->clock_task -
2425                                              cfs_rq->throttled_clock_task;
2426         }
2427 #endif
2428
2429         return 0;
2430 }
2431
2432 static int tg_throttle_down(struct task_group *tg, void *data)
2433 {
2434         struct rq *rq = data;
2435         struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
2436
2437         /* group is entering throttled state, stop time */
2438         if (!cfs_rq->throttle_count)
2439                 cfs_rq->throttled_clock_task = rq->clock_task;
2440         cfs_rq->throttle_count++;
2441
2442         return 0;
2443 }
2444
2445 static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
2446 {
2447         struct rq *rq = rq_of(cfs_rq);
2448         struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
2449         struct sched_entity *se;
2450         long task_delta, dequeue = 1;
2451
2452         se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
2453
2454         /* freeze hierarchy runnable averages while throttled */
2455         rcu_read_lock();
2456         walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
2457         rcu_read_unlock();
2458
2459         task_delta = cfs_rq->h_nr_running;
2460         for_each_sched_entity(se) {
2461                 struct cfs_rq *qcfs_rq = cfs_rq_of(se);
2462                 /* throttled entity or throttle-on-deactivate */
2463                 if (!se->on_rq)
2464                         break;
2465
2466                 if (dequeue)
2467                         dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
2468                 qcfs_rq->h_nr_running -= task_delta;
2469
2470                 if (qcfs_rq->load.weight)
2471                         dequeue = 0;
2472         }
2473
2474         if (!se)
2475                 rq->nr_running -= task_delta;
2476
2477         cfs_rq->throttled = 1;
2478         cfs_rq->throttled_clock = rq->clock;
2479         raw_spin_lock(&cfs_b->lock);
2480         list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
2481         raw_spin_unlock(&cfs_b->lock);
2482 }
2483
2484 void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
2485 {
2486         struct rq *rq = rq_of(cfs_rq);
2487         struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
2488         struct sched_entity *se;
2489         int enqueue = 1;
2490         long task_delta;
2491
2492         se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
2493
2494         cfs_rq->throttled = 0;
2495         raw_spin_lock(&cfs_b->lock);
2496         cfs_b->throttled_time += rq->clock - cfs_rq->throttled_clock;
2497         list_del_rcu(&cfs_rq->throttled_list);
2498         raw_spin_unlock(&cfs_b->lock);
2499
2500         update_rq_clock(rq);
2501         /* update hierarchical throttle state */
2502         walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
2503
2504         if (!cfs_rq->load.weight)
2505                 return;
2506
2507         task_delta = cfs_rq->h_nr_running;
2508         for_each_sched_entity(se) {
2509                 if (se->on_rq)
2510                         enqueue = 0;
2511
2512                 cfs_rq = cfs_rq_of(se);
2513                 if (enqueue)
2514                         enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
2515                 cfs_rq->h_nr_running += task_delta;
2516
2517                 if (cfs_rq_throttled(cfs_rq))
2518                         break;
2519         }
2520
2521         if (!se)
2522                 rq->nr_running += task_delta;
2523
2524         /* determine whether we need to wake up potentially idle cpu */
2525         if (rq->curr == rq->idle && rq->cfs.nr_running)
2526                 resched_task(rq->curr);
2527 }
2528
2529 static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
2530                 u64 remaining, u64 expires)
2531 {
2532         struct cfs_rq *cfs_rq;
2533         u64 runtime = remaining;
2534
2535         rcu_read_lock();
2536         list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
2537                                 throttled_list) {
2538                 struct rq *rq = rq_of(cfs_rq);
2539
2540                 raw_spin_lock(&rq->lock);
2541                 if (!cfs_rq_throttled(cfs_rq))
2542                         goto next;
2543
2544                 runtime = -cfs_rq->runtime_remaining + 1;
2545                 if (runtime > remaining)
2546                         runtime = remaining;
2547                 remaining -= runtime;
2548
2549                 cfs_rq->runtime_remaining += runtime;
2550                 cfs_rq->runtime_expires = expires;
2551
2552                 /* we check whether we're throttled above */
2553                 if (cfs_rq->runtime_remaining > 0)
2554                         unthrottle_cfs_rq(cfs_rq);
2555
2556 next:
2557                 raw_spin_unlock(&rq->lock);
2558
2559                 if (!remaining)
2560                         break;
2561         }
2562         rcu_read_unlock();
2563
2564         return remaining;
2565 }
2566
2567 /*
2568  * Responsible for refilling a task_group's bandwidth and unthrottling its
2569  * cfs_rqs as appropriate. If there has been no activity within the last
2570  * period the timer is deactivated until scheduling resumes; cfs_b->idle is
2571  * used to track this state.
2572  */
2573 static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
2574 {
2575         u64 runtime, runtime_expires;
2576         int idle = 1, throttled;
2577
2578         raw_spin_lock(&cfs_b->lock);
2579         /* no need to continue the timer with no bandwidth constraint */
2580         if (cfs_b->quota == RUNTIME_INF)
2581                 goto out_unlock;
2582
2583         throttled = !list_empty(&cfs_b->throttled_cfs_rq);
2584         /* idle depends on !throttled (for the case of a large deficit) */
2585         idle = cfs_b->idle && !throttled;
2586         cfs_b->nr_periods += overrun;
2587
2588         /* if we're going inactive then everything else can be deferred */
2589         if (idle)
2590                 goto out_unlock;
2591
2592         __refill_cfs_bandwidth_runtime(cfs_b);
2593
2594         if (!throttled) {
2595                 /* mark as potentially idle for the upcoming period */
2596                 cfs_b->idle = 1;
2597                 goto out_unlock;
2598         }
2599
2600         /* account preceding periods in which throttling occurred */
2601         cfs_b->nr_throttled += overrun;
2602
2603         /*
2604          * There are throttled entities so we must first use the new bandwidth
2605          * to unthrottle them before making it generally available.  This
2606          * ensures that all existing debts will be paid before a new cfs_rq is
2607          * allowed to run.
2608          */
2609         runtime = cfs_b->runtime;
2610         runtime_expires = cfs_b->runtime_expires;
2611         cfs_b->runtime = 0;
2612
2613         /*
2614          * This check is repeated as we are holding onto the new bandwidth
2615          * while we unthrottle.  This can potentially race with an unthrottled
2616          * group trying to acquire new bandwidth from the global pool.
2617          */
2618         while (throttled && runtime > 0) {
2619                 raw_spin_unlock(&cfs_b->lock);
2620                 /* we can't nest cfs_b->lock while distributing bandwidth */
2621                 runtime = distribute_cfs_runtime(cfs_b, runtime,
2622                                                  runtime_expires);
2623                 raw_spin_lock(&cfs_b->lock);
2624
2625                 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
2626         }
2627
2628         /* return (any) remaining runtime */
2629         cfs_b->runtime = runtime;
2630         /*
2631          * While we are ensured activity in the period following an
2632          * unthrottle, this also covers the case in which the new bandwidth is
2633          * insufficient to cover the existing bandwidth deficit.  (Forcing the
2634          * timer to remain active while there are any throttled entities.)
2635          */
2636         cfs_b->idle = 0;
2637 out_unlock:
2638         if (idle)
2639                 cfs_b->timer_active = 0;
2640         raw_spin_unlock(&cfs_b->lock);
2641
2642         return idle;
2643 }
2644
2645 /* a cfs_rq won't donate quota below this amount */
2646 static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;
2647 /* minimum remaining period time to redistribute slack quota */
2648 static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
2649 /* how long we wait to gather additional slack before distributing */
2650 static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
2651
2652 /* are we near the end of the current quota period? */
2653 static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
2654 {
2655         struct hrtimer *refresh_timer = &cfs_b->period_timer;
2656         u64 remaining;
2657
2658         /* if the call-back is running a quota refresh is already occurring */
2659         if (hrtimer_callback_running(refresh_timer))
2660                 return 1;
2661
2662         /* is a quota refresh about to occur? */
2663         remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
2664         if (remaining < min_expire)
2665                 return 1;
2666
2667         return 0;
2668 }
2669
2670 static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
2671 {
2672         u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;
2673
2674         /* if there's a quota refresh soon don't bother with slack */
2675         if (runtime_refresh_within(cfs_b, min_left))
2676                 return;
2677
2678         start_bandwidth_timer(&cfs_b->slack_timer,
2679                                 ns_to_ktime(cfs_bandwidth_slack_period));
2680 }
2681
2682 /* we know any runtime found here is valid as update_curr() precedes return */
2683 static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
2684 {
2685         struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
2686         s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;
2687
2688         if (slack_runtime <= 0)
2689                 return;
2690
2691         raw_spin_lock(&cfs_b->lock);
2692         if (cfs_b->quota != RUNTIME_INF &&
2693             cfs_rq->runtime_expires == cfs_b->runtime_expires) {
2694                 cfs_b->runtime += slack_runtime;
2695
2696                 /* we are under rq->lock, defer unthrottling using a timer */
2697                 if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&
2698                     !list_empty(&cfs_b->throttled_cfs_rq))
2699                         start_cfs_slack_bandwidth(cfs_b);
2700         }
2701         raw_spin_unlock(&cfs_b->lock);
2702
2703         /* even if it's not valid for return we don't want to try again */
2704         cfs_rq->runtime_remaining -= slack_runtime;
2705 }
2706
2707 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
2708 {
2709         if (!cfs_bandwidth_used())
2710                 return;
2711
2712         if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
2713                 return;
2714
2715         __return_cfs_rq_runtime(cfs_rq);
2716 }
2717
2718 /*
2719  * This is done with a timer (instead of inline with bandwidth return) since
2720  * it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs.
2721  */
2722 static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
2723 {
2724         u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
2725         u64 expires;
2726
2727         /* confirm we're still not at a refresh boundary */
2728         if (runtime_refresh_within(cfs_b, min_bandwidth_expiration))
2729                 return;
2730
2731         raw_spin_lock(&cfs_b->lock);
2732         if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) {
2733                 runtime = cfs_b->runtime;
2734                 cfs_b->runtime = 0;
2735         }
2736         expires = cfs_b->runtime_expires;
2737         raw_spin_unlock(&cfs_b->lock);
2738
2739         if (!runtime)
2740                 return;
2741
2742         runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
2743
2744         raw_spin_lock(&cfs_b->lock);
2745         if (expires == cfs_b->runtime_expires)
2746                 cfs_b->runtime = runtime;
2747         raw_spin_unlock(&cfs_b->lock);
2748 }
2749
2750 /*
2751  * When a group wakes up we want to make sure that its quota is not already
2752  * expired/exceeded, otherwise it may be allowed to steal additional ticks of
2753  * runtime as update_curr() throttling can not not trigger until it's on-rq.
2754  */
2755 static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
2756 {
2757         if (!cfs_bandwidth_used())
2758                 return;
2759
2760         /* an active group must be handled by the update_curr()->put() path */
2761         if (!cfs_rq->runtime_enabled || cfs_rq->curr)
2762                 return;
2763
2764         /* ensure the group is not already throttled */
2765         if (cfs_rq_throttled(cfs_rq))
2766                 return;
2767
2768         /* update runtime allocation */
2769         account_cfs_rq_runtime(cfs_rq, 0);
2770         if (cfs_rq->runtime_remaining <= 0)
2771                 throttle_cfs_rq(cfs_rq);
2772 }
2773
2774 /* conditionally throttle active cfs_rq's from put_prev_entity() */
2775 static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
2776 {
2777         if (!cfs_bandwidth_used())
2778                 return;
2779
2780         if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
2781                 return;
2782
2783         /*
2784          * it's possible for a throttled entity to be forced into a running
2785          * state (e.g. set_curr_task), in this case we're finished.
2786          */
2787         if (cfs_rq_throttled(cfs_rq))
2788                 return;
2789
2790         throttle_cfs_rq(cfs_rq);
2791 }
2792
2793 static inline u64 default_cfs_period(void);
2794 static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun);
2795 static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b);
2796
2797 static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
2798 {
2799         struct cfs_bandwidth *cfs_b =
2800                 container_of(timer, struct cfs_bandwidth, slack_timer);
2801         do_sched_cfs_slack_timer(cfs_b);
2802
2803         return HRTIMER_NORESTART;
2804 }
2805
2806 static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
2807 {
2808         struct cfs_bandwidth *cfs_b =
2809                 container_of(timer, struct cfs_bandwidth, period_timer);
2810         ktime_t now;
2811         int overrun;
2812         int idle = 0;
2813
2814         for (;;) {
2815                 now = hrtimer_cb_get_time(timer);
2816                 overrun = hrtimer_forward(timer, now, cfs_b->period);
2817
2818                 if (!overrun)
2819                         break;
2820
2821                 idle = do_sched_cfs_period_timer(cfs_b, overrun);
2822         }
2823
2824         return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
2825 }
2826
2827 void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
2828 {
2829         raw_spin_lock_init(&cfs_b->lock);
2830         cfs_b->runtime = 0;
2831         cfs_b->quota = RUNTIME_INF;
2832         cfs_b->period = ns_to_ktime(default_cfs_period());
2833
2834         INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
2835         hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
2836         cfs_b->period_timer.function = sched_cfs_period_timer;
2837         hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
2838         cfs_b->slack_timer.function = sched_cfs_slack_timer;
2839 }
2840
2841 static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
2842 {
2843         cfs_rq->runtime_enabled = 0;
2844         INIT_LIST_HEAD(&cfs_rq->throttled_list);
2845 }
2846
2847 /* requires cfs_b->lock, may release to reprogram timer */
2848 void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
2849 {
2850         /*
2851          * The timer may be active because we're trying to set a new bandwidth
2852          * period or because we're racing with the tear-down path
2853          * (timer_active==0 becomes visible before the hrtimer call-back
2854          * terminates).  In either case we ensure that it's re-programmed
2855          */
2856         while (unlikely(hrtimer_active(&cfs_b->period_timer))) {
2857                 raw_spin_unlock(&cfs_b->lock);
2858                 /* ensure cfs_b->lock is available while we wait */
2859                 hrtimer_cancel(&cfs_b->period_timer);
2860
2861                 raw_spin_lock(&cfs_b->lock);
2862                 /* if someone else restarted the timer then we're done */
2863                 if (cfs_b->timer_active)
2864                         return;
2865         }
2866
2867         cfs_b->timer_active = 1;
2868         start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period);
2869 }
2870
2871 static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
2872 {
2873         hrtimer_cancel(&cfs_b->period_timer);
2874         hrtimer_cancel(&cfs_b->slack_timer);
2875 }
2876
2877 static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
2878 {
2879         struct cfs_rq *cfs_rq;
2880
2881         for_each_leaf_cfs_rq(rq, cfs_rq) {
2882                 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
2883
2884                 if (!cfs_rq->runtime_enabled)
2885                         continue;
2886
2887                 /*
2888                  * clock_task is not advancing so we just need to make sure
2889                  * there's some valid quota amount
2890                  */
2891                 cfs_rq->runtime_remaining = cfs_b->quota;
2892                 if (cfs_rq_throttled(cfs_rq))
2893                         unthrottle_cfs_rq(cfs_rq);
2894         }
2895 }
2896
2897 #else /* CONFIG_CFS_BANDWIDTH */
2898 static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
2899 {
2900         return rq_of(cfs_rq)->clock_task;
2901 }
2902
2903 static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
2904                                      unsigned long delta_exec) {}
2905 static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
2906 static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
2907 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
2908
2909 static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
2910 {
2911         return 0;
2912 }
2913
2914 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
2915 {
2916         return 0;
2917 }
2918
2919 static inline int throttled_lb_pair(struct task_group *tg,
2920                                     int src_cpu, int dest_cpu)
2921 {
2922         return 0;
2923 }
2924
2925 void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
2926
2927 #ifdef CONFIG_FAIR_GROUP_SCHED
2928 static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
2929 #endif
2930
2931 static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
2932 {
2933         return NULL;
2934 }
2935 static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
2936 static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
2937
2938 #endif /* CONFIG_CFS_BANDWIDTH */
2939
2940 /**************************************************
2941  * CFS operations on tasks:
2942  */
2943
2944 #ifdef CONFIG_SCHED_HRTICK
2945 static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
2946 {
2947         struct sched_entity *se = &p->se;
2948         struct cfs_rq *cfs_rq = cfs_rq_of(se);
2949
2950         WARN_ON(task_rq(p) != rq);
2951
2952         if (cfs_rq->nr_running > 1) {
2953                 u64 slice = sched_slice(cfs_rq, se);
2954                 u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
2955                 s64 delta = slice - ran;
2956
2957                 if (delta < 0) {
2958                         if (rq->curr == p)
2959                                 resched_task(p);
2960                         return;
2961                 }
2962
2963                 /*
2964                  * Don't schedule slices shorter than 10000ns, that just
2965                  * doesn't make sense. Rely on vruntime for fairness.
2966                  */
2967                 if (rq->curr != p)
2968                         delta = max_t(s64, 10000LL, delta);
2969
2970                 hrtick_start(rq, delta);
2971         }
2972 }
2973
2974 /*
2975  * called from enqueue/dequeue and updates the hrtick when the
2976  * current task is from our class and nr_running is low enough
2977  * to matter.
2978  */
2979 static void hrtick_update(struct rq *rq)
2980 {
2981         struct task_struct *curr = rq->curr;
2982
2983         if (!hrtick_enabled(rq) || curr->sched_class != &fair_sched_class)
2984                 return;
2985
2986         if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
2987                 hrtick_start_fair(rq, curr);
2988 }
2989 #else /* !CONFIG_SCHED_HRTICK */
2990 static inline void
2991 hrtick_start_fair(struct rq *rq, struct task_struct *p)
2992 {
2993 }
2994
2995 static inline void hrtick_update(struct rq *rq)
2996 {
2997 }
2998 #endif
2999
3000 /*
3001  * The enqueue_task method is called before nr_running is
3002  * increased. Here we update the fair scheduling stats and
3003  * then put the task into the rbtree:
3004  */
3005 static void
3006 enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
3007 {
3008         struct cfs_rq *cfs_rq;
3009         struct sched_entity *se = &p->se;
3010
3011         for_each_sched_entity(se) {
3012                 if (se->on_rq)
3013                         break;
3014                 cfs_rq = cfs_rq_of(se);
3015                 enqueue_entity(cfs_rq, se, flags);
3016
3017                 /*
3018                  * end evaluation on encountering a throttled cfs_rq
3019                  *
3020                  * note: in the case of encountering a throttled cfs_rq we will
3021                  * post the final h_nr_running increment below.
3022                 */
3023                 if (cfs_rq_throttled(cfs_rq))
3024                         break;
3025                 cfs_rq->h_nr_running++;
3026
3027                 flags = ENQUEUE_WAKEUP;
3028         }
3029
3030         for_each_sched_entity(se) {
3031                 cfs_rq = cfs_rq_of(se);
3032                 cfs_rq->h_nr_running++;
3033
3034                 if (cfs_rq_throttled(cfs_rq))
3035                         break;
3036
3037                 update_cfs_shares(cfs_rq);
3038                 update_entity_load_avg(se, 1);
3039         }
3040
3041         if (!se) {
3042                 update_rq_runnable_avg(rq, rq->nr_running);
3043                 inc_nr_running(rq);
3044         }
3045         hrtick_update(rq);
3046 }
3047
3048 static void set_next_buddy(struct sched_entity *se);
3049
3050 /*
3051  * The dequeue_task method is called before nr_running is
3052  * decreased. We remove the task from the rbtree and
3053  * update the fair scheduling stats:
3054  */
3055 static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
3056 {
3057         struct cfs_rq *cfs_rq;
3058         struct sched_entity *se = &p->se;
3059         int task_sleep = flags & DEQUEUE_SLEEP;
3060
3061         for_each_sched_entity(se) {
3062                 cfs_rq = cfs_rq_of(se);
3063                 dequeue_entity(cfs_rq, se, flags);
3064
3065                 /*
3066                  * end evaluation on encountering a throttled cfs_rq
3067                  *
3068                  * note: in the case of encountering a throttled cfs_rq we will
3069                  * post the final h_nr_running decrement below.
3070                 */
3071                 if (cfs_rq_throttled(cfs_rq))
3072                         break;
3073                 cfs_rq->h_nr_running--;
3074
3075                 /* Don't dequeue parent if it has other entities besides us */
3076                 if (cfs_rq->load.weight) {
3077                         /*
3078                          * Bias pick_next to pick a task from this cfs_rq, as
3079                          * p is sleeping when it is within its sched_slice.
3080                          */
3081                         if (task_sleep && parent_entity(se))
3082                                 set_next_buddy(parent_entity(se));
3083
3084                         /* avoid re-evaluating load for this entity */
3085                         se = parent_entity(se);
3086                         break;
3087                 }
3088                 flags |= DEQUEUE_SLEEP;
3089         }
3090
3091         for_each_sched_entity(se) {
3092                 cfs_rq = cfs_rq_of(se);
3093                 cfs_rq->h_nr_running--;
3094
3095                 if (cfs_rq_throttled(cfs_rq))
3096                         break;
3097
3098                 update_cfs_shares(cfs_rq);
3099                 update_entity_load_avg(se, 1);
3100         }
3101
3102         if (!se) {
3103                 dec_nr_running(rq);
3104                 update_rq_runnable_avg(rq, 1);
3105         }
3106         hrtick_update(rq);
3107 }
3108
3109 #ifdef CONFIG_SMP
3110 /* Used instead of source_load when we know the type == 0 */
3111 static unsigned long weighted_cpuload(const int cpu)
3112 {
3113         return cpu_rq(cpu)->load.weight;
3114 }
3115
3116 /*
3117  * Return a low guess at the load of a migration-source cpu weighted
3118  * according to the scheduling class and "nice" value.
3119  *
3120  * We want to under-estimate the load of migration sources, to
3121  * balance conservatively.
3122  */
3123 static unsigned long source_load(int cpu, int type)
3124 {
3125         struct rq *rq = cpu_rq(cpu);
3126         unsigned long total = weighted_cpuload(cpu);
3127
3128         if (type == 0 || !sched_feat(LB_BIAS))
3129                 return total;
3130
3131         return min(rq->cpu_load[type-1], total);
3132 }
3133
3134 /*
3135  * Return a high guess at the load of a migration-target cpu weighted
3136  * according to the scheduling class and "nice" value.
3137  */
3138 static unsigned long target_load(int cpu, int type)
3139 {
3140         struct rq *rq = cpu_rq(cpu);
3141         unsigned long total = weighted_cpuload(cpu);
3142
3143         if (type == 0 || !sched_feat(LB_BIAS))
3144                 return total;
3145
3146         return max(rq->cpu_load[type-1], total);
3147 }
3148
3149 static unsigned long power_of(int cpu)
3150 {
3151         return cpu_rq(cpu)->cpu_power;
3152 }
3153
3154 static unsigned long cpu_avg_load_per_task(int cpu)
3155 {
3156         struct rq *rq = cpu_rq(cpu);
3157         unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
3158
3159         if (nr_running)
3160                 return rq->load.weight / nr_running;
3161
3162         return 0;
3163 }
3164
3165
3166 static void task_waking_fair(struct task_struct *p)
3167 {
3168         struct sched_entity *se = &p->se;
3169         struct cfs_rq *cfs_rq = cfs_rq_of(se);
3170         u64 min_vruntime;
3171
3172 #ifndef CONFIG_64BIT
3173         u64 min_vruntime_copy;
3174
3175         do {
3176                 min_vruntime_copy = cfs_rq->min_vruntime_copy;
3177                 smp_rmb();
3178                 min_vruntime = cfs_rq->min_vruntime;
3179         } while (min_vruntime != min_vruntime_copy);
3180 #else
3181         min_vruntime = cfs_rq->min_vruntime;
3182 #endif
3183
3184         se->vruntime -= min_vruntime;
3185 }
3186
3187 #ifdef CONFIG_FAIR_GROUP_SCHED
3188 /*
3189  * effective_load() calculates the load change as seen from the root_task_group
3190  *
3191  * Adding load to a group doesn't make a group heavier, but can cause movement
3192  * of group shares between cpus. Assuming the shares were perfectly aligned one
3193  * can calculate the shift in shares.
3194  *
3195  * Calculate the effective load difference if @wl is added (subtracted) to @tg
3196  * on this @cpu and results in a total addition (subtraction) of @wg to the
3197  * total group weight.
3198  *
3199  * Given a runqueue weight distribution (rw_i) we can compute a shares
3200  * distribution (s_i) using:
3201  *
3202  *   s_i = rw_i / \Sum rw_j                                             (1)
3203  *
3204  * Suppose we have 4 CPUs and our @tg is a direct child of the root group and
3205  * has 7 equal weight tasks, distributed as below (rw_i), with the resulting
3206  * shares distribution (s_i):
3207  *
3208  *   rw_i = {   2,   4,   1,   0 }
3209  *   s_i  = { 2/7, 4/7, 1/7,   0 }
3210  *
3211  * As per wake_affine() we're interested in the load of two CPUs (the CPU the
3212  * task used to run on and the CPU the waker is running on), we need to
3213  * compute the effect of waking a task on either CPU and, in case of a sync
3214  * wakeup, compute the effect of the current task going to sleep.
3215  *
3216  * So for a change of @wl to the local @cpu with an overall group weight change
3217  * of @wl we can compute the new shares distribution (s'_i) using:
3218  *
3219  *   s'_i = (rw_i + @wl) / (@wg + \Sum rw_j)                            (2)
3220  *
3221  * Suppose we're interested in CPUs 0 and 1, and want to compute the load
3222  * differences in waking a task to CPU 0. The additional task changes the
3223  * weight and shares distributions like:
3224  *
3225  *   rw'_i = {   3,   4,   1,   0 }
3226  *   s'_i  = { 3/8, 4/8, 1/8,   0 }
3227  *
3228  * We can then compute the difference in effective weight by using:
3229  *
3230  *   dw_i = S * (s'_i - s_i)                                            (3)
3231  *
3232  * Where 'S' is the group weight as seen by its parent.
3233  *
3234  * Therefore the effective change in loads on CPU 0 would be 5/56 (3/8 - 2/7)
3235  * times the weight of the group. The effect on CPU 1 would be -4/56 (4/8 -
3236  * 4/7) times the weight of the group.
3237  */
3238 static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
3239 {
3240         struct sched_entity *se = tg->se[cpu];
3241
3242         if (!tg->parent)        /* the trivial, non-cgroup case */
3243                 return wl;
3244
3245         for_each_sched_entity(se) {
3246                 long w, W;
3247
3248                 tg = se->my_q->tg;
3249
3250                 /*
3251                  * W = @wg + \Sum rw_j
3252                  */
3253                 W = wg + calc_tg_weight(tg, se->my_q);
3254
3255                 /*
3256                  * w = rw_i + @wl
3257                  */
3258                 w = se->my_q->load.weight + wl;
3259
3260                 /*
3261                  * wl = S * s'_i; see (2)
3262                  */
3263                 if (W > 0 && w < W)
3264                         wl = (w * tg->shares) / W;
3265                 else
3266                         wl = tg->shares;
3267
3268                 /*
3269                  * Per the above, wl is the new se->load.weight value; since
3270                  * those are clipped to [MIN_SHARES, ...) do so now. See
3271                  * calc_cfs_shares().
3272                  */
3273                 if (wl < MIN_SHARES)
3274                         wl = MIN_SHARES;
3275
3276                 /*
3277                  * wl = dw_i = S * (s'_i - s_i); see (3)
3278                  */
3279                 wl -= se->load.weight;
3280
3281                 /*
3282                  * Recursively apply this logic to all parent groups to compute
3283                  * the final effective load change on the root group. Since
3284                  * only the @tg group gets extra weight, all parent groups can
3285                  * only redistribute existing shares. @wl is the shift in shares
3286                  * resulting from this level per the above.
3287                  */
3288                 wg = 0;
3289         }
3290
3291         return wl;
3292 }
3293 #else
3294
3295 static inline unsigned long effective_load(struct task_group *tg, int cpu,
3296                 unsigned long wl, unsigned long wg)
3297 {
3298         return wl;
3299 }
3300
3301 #endif
3302
3303 static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
3304 {
3305         s64 this_load, load;
3306         int idx, this_cpu, prev_cpu;
3307         unsigned long tl_per_task;
3308         struct task_group *tg;
3309         unsigned long weight;
3310         int balanced;
3311
3312         idx       = sd->wake_idx;
3313         this_cpu  = smp_processor_id();
3314         prev_cpu  = task_cpu(p);
3315         load      = source_load(prev_cpu, idx);
3316         this_load = target_load(this_cpu, idx);
3317
3318         /*
3319          * If sync wakeup then subtract the (maximum possible)
3320          * effect of the currently running task from the load
3321          * of the current CPU:
3322          */
3323         if (sync) {
3324                 tg = task_group(current);
3325                 weight = current->se.load.weight;
3326
3327                 this_load += effective_load(tg, this_cpu, -weight, -weight);
3328                 load += effective_load(tg, prev_cpu, 0, -weight);
3329         }
3330
3331         tg = task_group(p);
3332         weight = p->se.load.weight;
3333
3334         /*
3335          * In low-load situations, where prev_cpu is idle and this_cpu is idle
3336          * due to the sync cause above having dropped this_load to 0, we'll
3337          * always have an imbalance, but there's really nothing you can do
3338          * about that, so that's good too.
3339          *
3340          * Otherwise check if either cpus are near enough in load to allow this
3341          * task to be woken on this_cpu.
3342          */
3343         if (this_load > 0) {
3344                 s64 this_eff_load, prev_eff_load;
3345
3346                 this_eff_load = 100;
3347                 this_eff_load *= power_of(prev_cpu);
3348                 this_eff_load *= this_load +
3349                         effective_load(tg, this_cpu, weight, weight);
3350
3351                 prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
3352                 prev_eff_load *= power_of(this_cpu);
3353                 prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);
3354
3355                 balanced = this_eff_load <= prev_eff_load;
3356         } else
3357                 balanced = true;
3358
3359         /*
3360          * If the currently running task will sleep within
3361          * a reasonable amount of time then attract this newly
3362          * woken task:
3363          */
3364         if (sync && balanced)
3365                 return 1;
3366
3367         schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);
3368         tl_per_task = cpu_avg_load_per_task(this_cpu);
3369
3370         if (balanced ||
3371             (this_load <= load &&
3372              this_load + target_load(prev_cpu, idx) <= tl_per_task)) {
3373                 /*
3374                  * This domain has SD_WAKE_AFFINE and
3375                  * p is cache cold in this domain, and
3376                  * there is no bad imbalance.
3377                  */
3378                 schedstat_inc(sd, ttwu_move_affine);
3379                 schedstat_inc(p, se.statistics.nr_wakeups_affine);
3380
3381                 return 1;
3382         }
3383         return 0;
3384 }
3385
3386 /*
3387  * find_idlest_group finds and returns the least busy CPU group within the
3388  * domain.
3389  */
3390 static struct sched_group *
3391 find_idlest_group(struct sched_domain *sd, struct task_struct *p,
3392                   int this_cpu, int load_idx)
3393 {
3394         struct sched_group *idlest = NULL, *group = sd->groups;
3395         unsigned long min_load = ULONG_MAX, this_load = 0;
3396         int imbalance = 100 + (sd->imbalance_pct-100)/2;
3397
3398         do {
3399                 unsigned long load, avg_load;
3400                 int local_group;
3401                 int i;
3402
3403                 /* Skip over this group if it has no CPUs allowed */
3404                 if (!cpumask_intersects(sched_group_cpus(group),
3405                                         tsk_cpus_allowed(p)))
3406                         continue;
3407
3408                 local_group = cpumask_test_cpu(this_cpu,
3409                                                sched_group_cpus(group));
3410
3411                 /* Tally up the load of all CPUs in the group */
3412                 avg_load = 0;
3413
3414                 for_each_cpu(i, sched_group_cpus(group)) {
3415                         /* Bias balancing toward cpus of our domain */
3416                         if (local_group)
3417                                 load = source_load(i, load_idx);
3418                         else
3419                                 load = target_load(i, load_idx);
3420
3421                         avg_load += load;
3422                 }
3423
3424                 /* Adjust by relative CPU power of the group */
3425                 avg_load = (avg_load * SCHED_POWER_SCALE) / group->sgp->power;
3426
3427                 if (local_group) {
3428                         this_load = avg_load;
3429                 } else if (avg_load < min_load) {
3430                         min_load = avg_load;
3431                         idlest = group;
3432                 }
3433         } while (group = group->next, group != sd->groups);
3434
3435         if (!idlest || 100*this_load < imbalance*min_load)
3436                 return NULL;
3437         return idlest;
3438 }
3439
3440 /*
3441  * find_idlest_cpu - find the idlest cpu among the cpus in group.
3442  */
3443 static int
3444 find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
3445 {
3446         unsigned long load, min_load = ULONG_MAX;
3447         int idlest = -1;
3448         int i;
3449
3450         /* Traverse only the allowed CPUs */
3451         for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
3452                 load = weighted_cpuload(i);
3453
3454                 if (load < min_load || (load == min_load && i == this_cpu)) {
3455                         min_load = load;
3456                         idlest = i;
3457                 }
3458         }
3459
3460         return idlest;
3461 }
3462
3463 /*
3464  * Try and locate an idle CPU in the sched_domain.
3465  */
3466 static int select_idle_sibling(struct task_struct *p, int target)
3467 {
3468         struct sched_domain *sd;
3469         struct sched_group *sg;
3470         int i = task_cpu(p);
3471
3472         if (idle_cpu(target))
3473                 return target;
3474
3475         /*
3476          * If the prevous cpu is cache affine and idle, don't be stupid.
3477          */
3478         if (i != target && cpus_share_cache(i, target) && idle_cpu(i))
3479                 return i;
3480
3481         /*
3482          * Otherwise, iterate the domains and find an elegible idle cpu.
3483          */
3484         sd = rcu_dereference(per_cpu(sd_llc, target));
3485         for_each_lower_domain(sd) {
3486                 sg = sd->groups;
3487                 do {
3488                         if (!cpumask_intersects(sched_group_cpus(sg),
3489                                                 tsk_cpus_allowed(p)))
3490                                 goto next;
3491
3492                         for_each_cpu(i, sched_group_cpus(sg)) {
3493                                 if (i == target || !idle_cpu(i))
3494                                         goto next;
3495                         }
3496
3497                         target = cpumask_first_and(sched_group_cpus(sg),
3498                                         tsk_cpus_allowed(p));
3499                         goto done;
3500 next:
3501                         sg = sg->next;
3502                 } while (sg != sd->groups);
3503         }
3504 done:
3505         return target;
3506 }
3507
3508 #ifdef CONFIG_SCHED_HMP
3509 /*
3510  * Heterogenous multiprocessor (HMP) optimizations
3511  *
3512  * The cpu types are distinguished using a list of hmp_domains
3513  * which each represent one cpu type using a cpumask.
3514  * The list is assumed ordered by compute capacity with the
3515  * fastest domain first.
3516  */
3517 DEFINE_PER_CPU(struct hmp_domain *, hmp_cpu_domain);
3518 static const int hmp_max_tasks = 5;
3519
3520 extern void __init arch_get_hmp_domains(struct list_head *hmp_domains_list);
3521
3522 /* Setup hmp_domains */
3523 static int __init hmp_cpu_mask_setup(void)
3524 {
3525         char buf[64];
3526         struct hmp_domain *domain;
3527         struct list_head *pos;
3528         int dc, cpu;
3529
3530         pr_debug("Initializing HMP scheduler:\n");
3531
3532         /* Initialize hmp_domains using platform code */
3533         arch_get_hmp_domains(&hmp_domains);
3534         if (list_empty(&hmp_domains)) {
3535                 pr_debug("HMP domain list is empty!\n");
3536                 return 0;
3537         }
3538
3539         /* Print hmp_domains */
3540         dc = 0;
3541         list_for_each(pos, &hmp_domains) {
3542                 domain = list_entry(pos, struct hmp_domain, hmp_domains);
3543                 cpulist_scnprintf(buf, 64, &domain->possible_cpus);
3544                 pr_debug("  HMP domain %d: %s\n", dc, buf);
3545
3546                 for_each_cpu_mask(cpu, domain->possible_cpus) {
3547                         per_cpu(hmp_cpu_domain, cpu) = domain;
3548                 }
3549                 dc++;
3550         }
3551
3552         return 1;
3553 }
3554
3555 static struct hmp_domain *hmp_get_hmp_domain_for_cpu(int cpu)
3556 {
3557         struct hmp_domain *domain;
3558         struct list_head *pos;
3559
3560         list_for_each(pos, &hmp_domains) {
3561                 domain = list_entry(pos, struct hmp_domain, hmp_domains);
3562                 if(cpumask_test_cpu(cpu, &domain->possible_cpus))
3563                         return domain;
3564         }
3565         return NULL;
3566 }
3567
3568 static void hmp_online_cpu(int cpu)
3569 {
3570         struct hmp_domain *domain = hmp_get_hmp_domain_for_cpu(cpu);
3571
3572         if(domain)
3573                 cpumask_set_cpu(cpu, &domain->cpus);
3574 }
3575
3576 static void hmp_offline_cpu(int cpu)
3577 {
3578         struct hmp_domain *domain = hmp_get_hmp_domain_for_cpu(cpu);
3579
3580         if(domain)
3581                 cpumask_clear_cpu(cpu, &domain->cpus);
3582 }
3583 /*
3584  * Needed to determine heaviest tasks etc.
3585  */
3586 static inline unsigned int hmp_cpu_is_fastest(int cpu);
3587 static inline unsigned int hmp_cpu_is_slowest(int cpu);
3588 static inline struct hmp_domain *hmp_slower_domain(int cpu);
3589 static inline struct hmp_domain *hmp_faster_domain(int cpu);
3590
3591 /* must hold runqueue lock for queue se is currently on */
3592 static struct sched_entity *hmp_get_heaviest_task(
3593                                 struct sched_entity *se, int migrate_up)
3594 {
3595         int num_tasks = hmp_max_tasks;
3596         struct sched_entity *max_se = se;
3597         unsigned long int max_ratio = se->avg.load_avg_ratio;
3598         const struct cpumask *hmp_target_mask = NULL;
3599
3600         if (migrate_up) {
3601                 struct hmp_domain *hmp;
3602                 if (hmp_cpu_is_fastest(cpu_of(se->cfs_rq->rq)))
3603                         return max_se;
3604
3605                 hmp = hmp_faster_domain(cpu_of(se->cfs_rq->rq));
3606                 hmp_target_mask = &hmp->cpus;
3607         }
3608         /* The currently running task is not on the runqueue */
3609         se = __pick_first_entity(cfs_rq_of(se));
3610
3611         while (num_tasks && se) {
3612                 if (entity_is_task(se) &&
3613                         (se->avg.load_avg_ratio > max_ratio &&
3614                          hmp_target_mask &&
3615                          cpumask_intersects(hmp_target_mask,
3616                                 tsk_cpus_allowed(task_of(se))))) {
3617                         max_se = se;
3618                         max_ratio = se->avg.load_avg_ratio;
3619                 }
3620                 se = __pick_next_entity(se);
3621                 num_tasks--;
3622         }
3623         return max_se;
3624 }
3625
3626 static struct sched_entity *hmp_get_lightest_task(
3627                                 struct sched_entity *se, int migrate_down)
3628 {
3629         int num_tasks = hmp_max_tasks;
3630         struct sched_entity *min_se = se;
3631         unsigned long int min_ratio = se->avg.load_avg_ratio;
3632         const struct cpumask *hmp_target_mask = NULL;
3633
3634         if (migrate_down) {
3635                 struct hmp_domain *hmp;
3636                 if (hmp_cpu_is_slowest(cpu_of(se->cfs_rq->rq)))
3637                         return min_se;
3638                 hmp = hmp_slower_domain(cpu_of(se->cfs_rq->rq));
3639                 hmp_target_mask = &hmp->cpus;
3640         }
3641         /* The currently running task is not on the runqueue */
3642         se = __pick_first_entity(cfs_rq_of(se));
3643
3644         while (num_tasks && se) {
3645                 if (entity_is_task(se) &&
3646                         (se->avg.load_avg_ratio < min_ratio &&
3647                         hmp_target_mask &&
3648                                 cpumask_intersects(hmp_target_mask,
3649                                 tsk_cpus_allowed(task_of(se))))) {
3650                         min_se = se;
3651                         min_ratio = se->avg.load_avg_ratio;
3652                 }
3653                 se = __pick_next_entity(se);
3654                 num_tasks--;
3655         }
3656         return min_se;
3657 }
3658
3659 /*
3660  * Migration thresholds should be in the range [0..1023]
3661  * hmp_up_threshold: min. load required for migrating tasks to a faster cpu
3662  * hmp_down_threshold: max. load allowed for tasks migrating to a slower cpu
3663  *
3664  * hmp_up_prio: Only up migrate task with high priority (<hmp_up_prio)
3665  * hmp_next_up_threshold: Delay before next up migration (1024 ~= 1 ms)
3666  * hmp_next_down_threshold: Delay before next down migration (1024 ~= 1 ms)
3667  *
3668  * Small Task Packing:
3669  * We can choose to fill the littlest CPUs in an HMP system rather than
3670  * the typical spreading mechanic. This behavior is controllable using
3671  * two variables.
3672  * hmp_packing_enabled: runtime control over pack/spread
3673  * hmp_full_threshold: Consider a CPU with this much unweighted load full
3674  */
3675 unsigned int hmp_up_threshold = 700;
3676 unsigned int hmp_down_threshold = 512;
3677 #ifdef CONFIG_SCHED_HMP_PRIO_FILTER
3678 unsigned int hmp_up_prio = NICE_TO_PRIO(CONFIG_SCHED_HMP_PRIO_FILTER_VAL);
3679 #endif
3680 unsigned int hmp_next_up_threshold = 4096;
3681 unsigned int hmp_next_down_threshold = 4096;
3682
3683 #ifdef CONFIG_SCHED_HMP_LITTLE_PACKING
3684 unsigned int hmp_packing_enabled = 1;
3685 #ifndef CONFIG_ARCH_VEXPRESS_TC2
3686 unsigned int hmp_full_threshold = (NICE_0_LOAD * 9) / 8;
3687 #else
3688 /* TC2 has a sharp consumption curve @ around 800Mhz, so
3689    we aim to spread the load around that frequency. */
3690 unsigned int hmp_full_threshold = 650;  /*  80% of the 800Mhz freq * NICE_0_LOAD */
3691 #endif
3692 #endif
3693
3694 static unsigned int hmp_up_migration(int cpu, int *target_cpu, struct sched_entity *se);
3695 static unsigned int hmp_down_migration(int cpu, struct sched_entity *se);
3696 static inline unsigned int hmp_domain_min_load(struct hmp_domain *hmpd,
3697                                                 int *min_cpu, struct cpumask *affinity);
3698
3699 static inline struct hmp_domain *hmp_smallest_domain(void)
3700 {
3701         return list_entry(hmp_domains.prev, struct hmp_domain, hmp_domains);
3702 }
3703
3704 /* Check if cpu is in fastest hmp_domain */
3705 static inline unsigned int hmp_cpu_is_fastest(int cpu)
3706 {
3707         struct list_head *pos;
3708
3709         pos = &hmp_cpu_domain(cpu)->hmp_domains;
3710         return pos == hmp_domains.next;
3711 }
3712
3713 /* Check if cpu is in slowest hmp_domain */
3714 static inline unsigned int hmp_cpu_is_slowest(int cpu)
3715 {
3716         struct list_head *pos;
3717
3718         pos = &hmp_cpu_domain(cpu)->hmp_domains;
3719         return list_is_last(pos, &hmp_domains);
3720 }
3721
3722 /* Next (slower) hmp_domain relative to cpu */
3723 static inline struct hmp_domain *hmp_slower_domain(int cpu)
3724 {
3725         struct list_head *pos;
3726
3727         pos = &hmp_cpu_domain(cpu)->hmp_domains;
3728         return list_entry(pos->next, struct hmp_domain, hmp_domains);
3729 }
3730
3731 /* Previous (faster) hmp_domain relative to cpu */
3732 static inline struct hmp_domain *hmp_faster_domain(int cpu)
3733 {
3734         struct list_head *pos;
3735
3736         pos = &hmp_cpu_domain(cpu)->hmp_domains;
3737         return list_entry(pos->prev, struct hmp_domain, hmp_domains);
3738 }
3739
3740 /*
3741  * Selects a cpu in previous (faster) hmp_domain
3742  */
3743 static inline unsigned int hmp_select_faster_cpu(struct task_struct *tsk,
3744                                                         int cpu)
3745 {
3746         int lowest_cpu=NR_CPUS;
3747         __always_unused int lowest_ratio;
3748         struct hmp_domain *hmp;
3749
3750         if (hmp_cpu_is_fastest(cpu))
3751                 hmp = hmp_cpu_domain(cpu);
3752         else
3753                 hmp = hmp_faster_domain(cpu);
3754
3755         lowest_ratio = hmp_domain_min_load(hmp, &lowest_cpu,
3756                         tsk_cpus_allowed(tsk));
3757
3758         return lowest_cpu;
3759 }
3760
3761 /*
3762  * Selects a cpu in next (slower) hmp_domain
3763  * Note that cpumask_any_and() returns the first cpu in the cpumask
3764  */
3765 static inline unsigned int hmp_select_slower_cpu(struct task_struct *tsk,
3766                                                         int cpu)
3767 {
3768         int lowest_cpu=NR_CPUS;
3769         struct hmp_domain *hmp;
3770         __always_unused int lowest_ratio;
3771
3772         if (hmp_cpu_is_slowest(cpu))
3773                 hmp = hmp_cpu_domain(cpu);
3774         else
3775                 hmp = hmp_slower_domain(cpu);
3776
3777         lowest_ratio = hmp_domain_min_load(hmp, &lowest_cpu,
3778                         tsk_cpus_allowed(tsk));
3779
3780         return lowest_cpu;
3781 }
3782 #ifdef CONFIG_SCHED_HMP_LITTLE_PACKING
3783 /*
3784  * Select the 'best' candidate little CPU to wake up on.
3785  * Implements a packing strategy which examines CPU in
3786  * logical CPU order, and selects the first which will
3787  * have at least 10% capacity available, according to
3788  * both tracked load of the runqueue and the task.
3789  */
3790 static inline unsigned int hmp_best_little_cpu(struct task_struct *tsk,
3791                 int cpu) {
3792         int tmp_cpu;
3793         unsigned long estimated_load;
3794         struct hmp_domain *hmp;
3795         struct sched_avg *avg;
3796         struct cpumask allowed_hmp_cpus;
3797
3798         if(!hmp_packing_enabled ||
3799                         tsk->se.avg.load_avg_ratio > ((NICE_0_LOAD * 90)/100))
3800                 return hmp_select_slower_cpu(tsk, cpu);
3801
3802         if (hmp_cpu_is_slowest(cpu))
3803                 hmp = hmp_cpu_domain(cpu);
3804         else
3805                 hmp = hmp_slower_domain(cpu);
3806
3807         /* respect affinity */
3808         cpumask_and(&allowed_hmp_cpus, &hmp->cpus,
3809                         tsk_cpus_allowed(tsk));
3810
3811         for_each_cpu_mask(tmp_cpu, allowed_hmp_cpus) {
3812                 avg = &cpu_rq(tmp_cpu)->avg;
3813                 /* estimate new rq load if we add this task */
3814                 estimated_load = avg->load_avg_ratio +
3815                                 tsk->se.avg.load_avg_ratio;
3816                 if (estimated_load <= hmp_full_threshold) {
3817                         cpu = tmp_cpu;
3818                         break;
3819                 }
3820         }
3821         /* if no match was found, the task uses the initial value */
3822         return cpu;
3823 }
3824 #endif
3825 static inline void hmp_next_up_delay(struct sched_entity *se, int cpu)
3826 {
3827         /* hack - always use clock from first online CPU */
3828         u64 now = cpu_rq(cpumask_first(cpu_online_mask))->clock_task;
3829         se->avg.hmp_last_up_migration = now;
3830         se->avg.hmp_last_down_migration = 0;
3831         cpu_rq(cpu)->avg.hmp_last_up_migration = now;
3832         cpu_rq(cpu)->avg.hmp_last_down_migration = 0;
3833 }
3834
3835 static inline void hmp_next_down_delay(struct sched_entity *se, int cpu)
3836 {
3837         /* hack - always use clock from first online CPU */
3838         u64 now = cpu_rq(cpumask_first(cpu_online_mask))->clock_task;
3839         se->avg.hmp_last_down_migration = now;
3840         se->avg.hmp_last_up_migration = 0;
3841         cpu_rq(cpu)->avg.hmp_last_down_migration = now;
3842         cpu_rq(cpu)->avg.hmp_last_up_migration = 0;
3843 }
3844
3845 #ifdef CONFIG_HMP_VARIABLE_SCALE
3846 /*
3847  * Heterogenous multiprocessor (HMP) optimizations
3848  *
3849  * These functions allow to change the growing speed of the load_avg_ratio
3850  * by default it goes from 0 to 0.5 in LOAD_AVG_PERIOD = 32ms
3851  * This can now be changed with /sys/kernel/hmp/load_avg_period_ms.
3852  *
3853  * These functions also allow to change the up and down threshold of HMP
3854  * using /sys/kernel/hmp/{up,down}_threshold.
3855  * Both must be between 0 and 1023. The threshold that is compared
3856  * to the load_avg_ratio is up_threshold/1024 and down_threshold/1024.
3857  *
3858  * For instance, if load_avg_period = 64 and up_threshold = 512, an idle
3859  * task with a load of 0 will reach the threshold after 64ms of busy loop.
3860  *
3861  * Changing load_avg_periods_ms has the same effect than changing the
3862  * default scaling factor Y=1002/1024 in the load_avg_ratio computation to
3863  * (1002/1024.0)^(LOAD_AVG_PERIOD/load_avg_period_ms), but the last one
3864  * could trigger overflows.
3865  * For instance, with Y = 1023/1024 in __update_task_entity_contrib()
3866  * "contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight);"
3867  * could be overflowed for a weight > 2^12 even is the load_avg_contrib
3868  * should still be a 32bits result. This would not happen by multiplicating
3869  * delta time by 1/22 and setting load_avg_period_ms = 706.
3870  */
3871
3872 /*
3873  * By scaling the delta time it end-up increasing or decrease the
3874  * growing speed of the per entity load_avg_ratio
3875  * The scale factor hmp_data.multiplier is a fixed point
3876  * number: (32-HMP_VARIABLE_SCALE_SHIFT).HMP_VARIABLE_SCALE_SHIFT
3877  */
3878 static u64 hmp_variable_scale_convert(u64 delta)
3879 {
3880         u64 high = delta >> 32ULL;
3881         u64 low = delta & 0xffffffffULL;
3882         low *= hmp_data.multiplier;
3883         high *= hmp_data.multiplier;
3884         return (low >> HMP_VARIABLE_SCALE_SHIFT)
3885                         + (high << (32ULL - HMP_VARIABLE_SCALE_SHIFT));
3886 }
3887
3888 static ssize_t hmp_show(struct kobject *kobj,
3889                                 struct attribute *attr, char *buf)
3890 {
3891         ssize_t ret = 0;
3892         struct hmp_global_attr *hmp_attr =
3893                 container_of(attr, struct hmp_global_attr, attr);
3894         int temp = *(hmp_attr->value);
3895         if (hmp_attr->to_sysfs != NULL)
3896                 temp = hmp_attr->to_sysfs(temp);
3897         ret = sprintf(buf, "%d\n", temp);
3898         return ret;
3899 }
3900
3901 static ssize_t hmp_store(struct kobject *a, struct attribute *attr,
3902                                 const char *buf, size_t count)
3903 {
3904         int temp;
3905         ssize_t ret = count;
3906         struct hmp_global_attr *hmp_attr =
3907                 container_of(attr, struct hmp_global_attr, attr);
3908         char *str = vmalloc(count + 1);
3909         if (str == NULL)
3910                 return -ENOMEM;
3911         memcpy(str, buf, count);
3912         str[count] = 0;
3913         if (sscanf(str, "%d", &temp) < 1)
3914                 ret = -EINVAL;
3915         else {
3916                 if (hmp_attr->from_sysfs != NULL)
3917                         temp = hmp_attr->from_sysfs(temp);
3918                 if (temp < 0)
3919                         ret = -EINVAL;
3920                 else
3921                         *(hmp_attr->value) = temp;
3922         }
3923         vfree(str);
3924         return ret;
3925 }
3926
3927 static int hmp_period_tofrom_sysfs(int value)
3928 {
3929         return (LOAD_AVG_PERIOD << HMP_VARIABLE_SCALE_SHIFT) / value;
3930 }
3931
3932 /* max value for threshold is 1024 */
3933 static int hmp_theshold_from_sysfs(int value)
3934 {
3935         if (value > 1024)
3936                 return -1;
3937         return value;
3938 }
3939 #ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
3940 /* freqinvar control is only 0,1 off/on */
3941 static int hmp_freqinvar_from_sysfs(int value)
3942 {
3943         if (value < 0 || value > 1)
3944                 return -1;
3945         return value;
3946 }
3947 #endif
3948 #ifdef CONFIG_SCHED_HMP_LITTLE_PACKING
3949 /* packing value must be non-negative */
3950 static int hmp_packing_from_sysfs(int value)
3951 {
3952         if (value < 0)
3953                 return -1;
3954         return value;
3955 }
3956 #endif
3957 static void hmp_attr_add(
3958         const char *name,
3959         int *value,
3960         int (*to_sysfs)(int),
3961         int (*from_sysfs)(int))
3962 {
3963         int i = 0;
3964         while (hmp_data.attributes[i] != NULL) {
3965                 i++;
3966                 if (i >= HMP_DATA_SYSFS_MAX)
3967                         return;
3968         }
3969         hmp_data.attr[i].attr.mode = 0644;
3970         hmp_data.attr[i].show = hmp_show;
3971         hmp_data.attr[i].store = hmp_store;
3972         hmp_data.attr[i].attr.name = name;
3973         hmp_data.attr[i].value = value;
3974         hmp_data.attr[i].to_sysfs = to_sysfs;
3975         hmp_data.attr[i].from_sysfs = from_sysfs;
3976         hmp_data.attributes[i] = &hmp_data.attr[i].attr;
3977         hmp_data.attributes[i + 1] = NULL;
3978 }
3979
3980 static int hmp_attr_init(void)
3981 {
3982         int ret;
3983         memset(&hmp_data, sizeof(hmp_data), 0);
3984         /* by default load_avg_period_ms == LOAD_AVG_PERIOD
3985          * meaning no change
3986          */
3987         hmp_data.multiplier = hmp_period_tofrom_sysfs(LOAD_AVG_PERIOD);
3988
3989         hmp_attr_add("load_avg_period_ms",
3990                 &hmp_data.multiplier,
3991                 hmp_period_tofrom_sysfs,
3992                 hmp_period_tofrom_sysfs);
3993         hmp_attr_add("up_threshold",
3994                 &hmp_up_threshold,
3995                 NULL,
3996                 hmp_theshold_from_sysfs);
3997         hmp_attr_add("down_threshold",
3998                 &hmp_down_threshold,
3999                 NULL,
4000                 hmp_theshold_from_sysfs);
4001 #ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
4002         /* default frequency-invariant scaling ON */
4003         hmp_data.freqinvar_load_scale_enabled = 1;
4004         hmp_attr_add("frequency_invariant_load_scale",
4005                 &hmp_data.freqinvar_load_scale_enabled,
4006                 NULL,
4007                 hmp_freqinvar_from_sysfs);
4008 #endif
4009 #ifdef CONFIG_SCHED_HMP_LITTLE_PACKING
4010         hmp_attr_add("packing_enable",
4011                 &hmp_packing_enabled,
4012                 NULL,
4013                 hmp_freqinvar_from_sysfs);
4014         hmp_attr_add("packing_limit",
4015                 &hmp_full_threshold,
4016                 NULL,
4017                 hmp_packing_from_sysfs);
4018 #endif
4019         hmp_data.attr_group.name = "hmp";
4020         hmp_data.attr_group.attrs = hmp_data.attributes;
4021         ret = sysfs_create_group(kernel_kobj,
4022                 &hmp_data.attr_group);
4023         return 0;
4024 }
4025 late_initcall(hmp_attr_init);
4026 #endif /* CONFIG_HMP_VARIABLE_SCALE */
4027 /*
4028  * return the load of the lowest-loaded CPU in a given HMP domain
4029  * min_cpu optionally points to an int to receive the CPU.
4030  * affinity optionally points to a cpumask containing the
4031  * CPUs to be considered. note:
4032  *   + min_cpu = NR_CPUS only if no CPUs are in the set of
4033  *     affinity && hmp_domain cpus
4034  *   + min_cpu will always otherwise equal one of the CPUs in
4035  *     the hmp domain
4036  *   + when more than one CPU has the same load, the one which
4037  *     is least-recently-disturbed by an HMP migration will be
4038  *     selected
4039  *   + if all CPUs are equally loaded or idle and the times are
4040  *     all the same, the first in the set will be used
4041  *   + if affinity is not set, cpu_online_mask is used
4042  */
4043 static inline unsigned int hmp_domain_min_load(struct hmp_domain *hmpd,
4044                                                 int *min_cpu, struct cpumask *affinity)
4045 {
4046         int cpu;
4047         int min_cpu_runnable_temp = NR_CPUS;
4048         u64 min_target_last_migration = ULLONG_MAX;
4049         u64 curr_last_migration;
4050         unsigned long min_runnable_load = INT_MAX;
4051         unsigned long contrib;
4052         struct sched_avg *avg;
4053         struct cpumask temp_cpumask;
4054         /*
4055          * only look at CPUs allowed if specified,
4056          * otherwise look at all online CPUs in the
4057          * right HMP domain
4058          */
4059         cpumask_and(&temp_cpumask, &hmpd->cpus, affinity ? affinity : cpu_online_mask);
4060
4061         for_each_cpu_mask(cpu, temp_cpumask) {
4062                 avg = &cpu_rq(cpu)->avg;
4063                 /* used for both up and down migration */
4064                 curr_last_migration = avg->hmp_last_up_migration ?
4065                         avg->hmp_last_up_migration : avg->hmp_last_down_migration;
4066
4067                 contrib = avg->load_avg_ratio;
4068                 /*
4069                  * Consider a runqueue completely busy if there is any load
4070                  * on it. Definitely not the best for overall fairness, but
4071                  * does well in typical Android use cases.
4072                  */
4073                 if (contrib)
4074                         contrib = 1023;
4075
4076                 if ((contrib < min_runnable_load) ||
4077                         (contrib == min_runnable_load &&
4078                          curr_last_migration < min_target_last_migration)) {
4079                         /*
4080                          * if the load is the same target the CPU with
4081                          * the longest time since a migration.
4082                          * This is to spread migration load between
4083                          * members of a domain more evenly when the
4084                          * domain is fully loaded
4085                          */
4086                         min_runnable_load = contrib;
4087                         min_cpu_runnable_temp = cpu;
4088                         min_target_last_migration = curr_last_migration;
4089                 }
4090         }
4091
4092         if (min_cpu)
4093                 *min_cpu = min_cpu_runnable_temp;
4094
4095         return min_runnable_load;
4096 }
4097
4098 /*
4099  * Calculate the task starvation
4100  * This is the ratio of actually running time vs. runnable time.
4101  * If the two are equal the task is getting the cpu time it needs or
4102  * it is alone on the cpu and the cpu is fully utilized.
4103  */
4104 static inline unsigned int hmp_task_starvation(struct sched_entity *se)
4105 {
4106         u32 starvation;
4107
4108         starvation = se->avg.usage_avg_sum * scale_load_down(NICE_0_LOAD);
4109         starvation /= (se->avg.runnable_avg_sum + 1);
4110
4111         return scale_load(starvation);
4112 }
4113
4114 static inline unsigned int hmp_offload_down(int cpu, struct sched_entity *se)
4115 {
4116         int min_usage;
4117         int dest_cpu = NR_CPUS;
4118
4119         if (hmp_cpu_is_slowest(cpu))
4120                 return NR_CPUS;
4121
4122         /* Is there an idle CPU in the current domain */
4123         min_usage = hmp_domain_min_load(hmp_cpu_domain(cpu), NULL, NULL);
4124         if (min_usage == 0) {
4125                 trace_sched_hmp_offload_abort(cpu, min_usage, "load");
4126                 return NR_CPUS;
4127         }
4128
4129         /* Is the task alone on the cpu? */
4130         if (cpu_rq(cpu)->cfs.h_nr_running < 2) {
4131                 trace_sched_hmp_offload_abort(cpu,
4132                         cpu_rq(cpu)->cfs.h_nr_running, "nr_running");
4133                 return NR_CPUS;
4134         }
4135
4136         /* Is the task actually starving? */
4137         /* >=25% ratio running/runnable = starving */
4138         if (hmp_task_starvation(se) > 768) {
4139                 trace_sched_hmp_offload_abort(cpu, hmp_task_starvation(se),
4140                         "starvation");
4141                 return NR_CPUS;
4142         }
4143
4144         /* Does the slower domain have any idle CPUs? */
4145         min_usage = hmp_domain_min_load(hmp_slower_domain(cpu), &dest_cpu,
4146                         tsk_cpus_allowed(task_of(se)));
4147
4148         if (min_usage == 0) {
4149                 trace_sched_hmp_offload_succeed(cpu, dest_cpu);
4150                 return dest_cpu;
4151         } else
4152                 trace_sched_hmp_offload_abort(cpu,min_usage,"slowdomain");
4153         return NR_CPUS;
4154 }
4155 #endif /* CONFIG_SCHED_HMP */
4156
4157 /*
4158  * sched_balance_self: balance the current task (running on cpu) in domains
4159  * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
4160  * SD_BALANCE_EXEC.
4161  *
4162  * Balance, ie. select the least loaded group.
4163  *
4164  * Returns the target CPU number, or the same CPU if no balancing is needed.
4165  *
4166  * preempt must be disabled.
4167  */
4168 static int
4169 select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
4170 {
4171         struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
4172         int cpu = smp_processor_id();
4173         int prev_cpu = task_cpu(p);
4174         int new_cpu = cpu;
4175         int want_affine = 0;
4176         int sync = wake_flags & WF_SYNC;
4177
4178         if (p->nr_cpus_allowed == 1)
4179                 return prev_cpu;
4180
4181 #ifdef CONFIG_SCHED_HMP
4182         /* always put non-kernel forking tasks on a big domain */
4183         if (p->mm && (sd_flag & SD_BALANCE_FORK)) {
4184                 new_cpu = hmp_select_faster_cpu(p, prev_cpu);
4185                 if (new_cpu != NR_CPUS) {
4186                         hmp_next_up_delay(&p->se, new_cpu);
4187                         return new_cpu;
4188                 }
4189                 /* failed to perform HMP fork balance, use normal balance */
4190                 new_cpu = cpu;
4191         }
4192 #endif
4193
4194         if (sd_flag & SD_BALANCE_WAKE) {
4195                 if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
4196                         want_affine = 1;
4197                 new_cpu = prev_cpu;
4198         }
4199
4200         rcu_read_lock();
4201         for_each_domain(cpu, tmp) {
4202                 if (!(tmp->flags & SD_LOAD_BALANCE))
4203                         continue;
4204
4205                 /*
4206                  * If both cpu and prev_cpu are part of this domain,
4207                  * cpu is a valid SD_WAKE_AFFINE target.
4208                  */
4209                 if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
4210                     cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
4211                         affine_sd = tmp;
4212                         break;
4213                 }
4214
4215                 if (tmp->flags & sd_flag)
4216                         sd = tmp;
4217         }
4218
4219         if (affine_sd) {
4220                 if (cpu != prev_cpu && wake_affine(affine_sd, p, sync))
4221                         prev_cpu = cpu;
4222
4223                 new_cpu = select_idle_sibling(p, prev_cpu);
4224                 goto unlock;
4225         }
4226
4227         while (sd) {
4228                 int load_idx = sd->forkexec_idx;
4229                 struct sched_group *group;
4230                 int weight;
4231
4232                 if (!(sd->flags & sd_flag)) {
4233                         sd = sd->child;
4234                         continue;
4235                 }
4236
4237                 if (sd_flag & SD_BALANCE_WAKE)
4238                         load_idx = sd->wake_idx;
4239
4240                 group = find_idlest_group(sd, p, cpu, load_idx);
4241                 if (!group) {
4242                         sd = sd->child;
4243                         continue;
4244                 }
4245
4246                 new_cpu = find_idlest_cpu(group, p, cpu);
4247                 if (new_cpu == -1 || new_cpu == cpu) {
4248                         /* Now try balancing at a lower domain level of cpu */
4249                         sd = sd->child;
4250                         continue;
4251                 }
4252
4253                 /* Now try balancing at a lower domain level of new_cpu */
4254                 cpu = new_cpu;
4255                 weight = sd->span_weight;
4256                 sd = NULL;
4257                 for_each_domain(cpu, tmp) {
4258                         if (weight <= tmp->span_weight)
4259                                 break;
4260                         if (tmp->flags & sd_flag)
4261                                 sd = tmp;
4262                 }
4263                 /* while loop will break here if sd == NULL */
4264         }
4265 unlock:
4266         rcu_read_unlock();
4267
4268 #ifdef CONFIG_SCHED_HMP
4269         prev_cpu = task_cpu(p);
4270
4271         if (hmp_up_migration(prev_cpu, &new_cpu, &p->se)) {
4272                 hmp_next_up_delay(&p->se, new_cpu);
4273                 trace_sched_hmp_migrate(p, new_cpu, HMP_MIGRATE_WAKEUP);
4274                 return new_cpu;
4275         }
4276         if (hmp_down_migration(prev_cpu, &p->se)) {
4277 #ifdef CONFIG_SCHED_HMP_LITTLE_PACKING
4278                 new_cpu = hmp_best_little_cpu(p, prev_cpu);
4279 #else
4280                 new_cpu = hmp_select_slower_cpu(p, prev_cpu);
4281 #endif
4282                 if (new_cpu != prev_cpu) {
4283                         hmp_next_down_delay(&p->se, new_cpu);
4284                         trace_sched_hmp_migrate(p, new_cpu, HMP_MIGRATE_WAKEUP);
4285                         return new_cpu;
4286                 }
4287         }
4288         /* Make sure that the task stays in its previous hmp domain */
4289         if (!cpumask_test_cpu(new_cpu, &hmp_cpu_domain(prev_cpu)->cpus))
4290                 return prev_cpu;
4291 #endif
4292
4293         return new_cpu;
4294 }
4295
4296 /*
4297  * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be
4298  * removed when useful for applications beyond shares distribution (e.g.
4299  * load-balance).
4300  */
4301 #ifdef CONFIG_FAIR_GROUP_SCHED
4302 /*
4303  * Called immediately before a task is migrated to a new cpu; task_cpu(p) and
4304  * cfs_rq_of(p) references at time of call are still valid and identify the
4305  * previous cpu.  However, the caller only guarantees p->pi_lock is held; no
4306  * other assumptions, including the state of rq->lock, should be made.
4307  */
4308 static void
4309 migrate_task_rq_fair(struct task_struct *p, int next_cpu)
4310 {
4311         struct sched_entity *se = &p->se;
4312         struct cfs_rq *cfs_rq = cfs_rq_of(se);
4313
4314         /*
4315          * Load tracking: accumulate removed load so that it can be processed
4316          * when we next update owning cfs_rq under rq->lock.  Tasks contribute
4317          * to blocked load iff they have a positive decay-count.  It can never
4318          * be negative here since on-rq tasks have decay-count == 0.
4319          */
4320         if (se->avg.decay_count) {
4321                 se->avg.decay_count = -__synchronize_entity_decay(se);
4322                 atomic64_add(se->avg.load_avg_contrib, &cfs_rq->removed_load);
4323         }
4324 }
4325 #endif
4326 #endif /* CONFIG_SMP */
4327
4328 static unsigned long
4329 wakeup_gran(struct sched_entity *curr, struct sched_entity *se)
4330 {
4331         unsigned long gran = sysctl_sched_wakeup_granularity;
4332
4333         /*
4334          * Since its curr running now, convert the gran from real-time
4335          * to virtual-time in his units.
4336          *
4337          * By using 'se' instead of 'curr' we penalize light tasks, so
4338          * they get preempted easier. That is, if 'se' < 'curr' then
4339          * the resulting gran will be larger, therefore penalizing the
4340          * lighter, if otoh 'se' > 'curr' then the resulting gran will
4341          * be smaller, again penalizing the lighter task.
4342          *
4343          * This is especially important for buddies when the leftmost
4344          * task is higher priority than the buddy.
4345          */
4346         return calc_delta_fair(gran, se);
4347 }
4348
4349 /*
4350  * Should 'se' preempt 'curr'.
4351  *
4352  *             |s1
4353  *        |s2
4354  *   |s3
4355  *         g
4356  *      |<--->|c
4357  *
4358  *  w(c, s1) = -1
4359  *  w(c, s2) =  0
4360  *  w(c, s3) =  1
4361  *
4362  */
4363 static int
4364 wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
4365 {
4366         s64 gran, vdiff = curr->vruntime - se->vruntime;
4367
4368         if (vdiff <= 0)
4369                 return -1;
4370
4371         gran = wakeup_gran(curr, se);
4372         if (vdiff > gran)
4373                 return 1;
4374
4375         return 0;
4376 }
4377
4378 static void set_last_buddy(struct sched_entity *se)
4379 {
4380         if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
4381                 return;
4382
4383         for_each_sched_entity(se)
4384                 cfs_rq_of(se)->last = se;
4385 }
4386
4387 static void set_next_buddy(struct sched_entity *se)
4388 {
4389         if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
4390                 return;
4391
4392         for_each_sched_entity(se)
4393                 cfs_rq_of(se)->next = se;
4394 }
4395
4396 static void set_skip_buddy(struct sched_entity *se)
4397 {
4398         for_each_sched_entity(se)
4399                 cfs_rq_of(se)->skip = se;
4400 }
4401
4402 /*
4403  * Preempt the current task with a newly woken task if needed:
4404  */
4405 static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
4406 {
4407         struct task_struct *curr = rq->curr;
4408         struct sched_entity *se = &curr->se, *pse = &p->se;
4409         struct cfs_rq *cfs_rq = task_cfs_rq(curr);
4410         int scale = cfs_rq->nr_running >= sched_nr_latency;
4411         int next_buddy_marked = 0;
4412
4413         if (unlikely(se == pse))
4414                 return;
4415
4416         /*
4417          * This is possible from callers such as move_task(), in which we
4418          * unconditionally check_prempt_curr() after an enqueue (which may have
4419          * lead to a throttle).  This both saves work and prevents false
4420          * next-buddy nomination below.
4421          */
4422         if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
4423                 return;
4424
4425         if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
4426                 set_next_buddy(pse);
4427                 next_buddy_marked = 1;
4428         }
4429
4430         /*
4431          * We can come here with TIF_NEED_RESCHED already set from new task
4432          * wake up path.
4433          *
4434          * Note: this also catches the edge-case of curr being in a throttled
4435          * group (e.g. via set_curr_task), since update_curr() (in the
4436          * enqueue of curr) will have resulted in resched being set.  This
4437          * prevents us from potentially nominating it as a false LAST_BUDDY
4438          * below.
4439          */
4440         if (test_tsk_need_resched(curr))
4441                 return;
4442
4443         /* Idle tasks are by definition preempted by non-idle tasks. */
4444         if (unlikely(curr->policy == SCHED_IDLE) &&
4445             likely(p->policy != SCHED_IDLE))
4446                 goto preempt;
4447
4448         /*
4449          * Batch and idle tasks do not preempt non-idle tasks (their preemption
4450          * is driven by the tick):
4451          */
4452         if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION))
4453                 return;
4454
4455         find_matching_se(&se, &pse);
4456         update_curr(cfs_rq_of(se));
4457         BUG_ON(!pse);
4458         if (wakeup_preempt_entity(se, pse) == 1) {
4459                 /*
4460                  * Bias pick_next to pick the sched entity that is
4461                  * triggering this preemption.
4462                  */
4463                 if (!next_buddy_marked)
4464                         set_next_buddy(pse);
4465                 goto preempt;
4466         }
4467
4468         return;
4469
4470 preempt:
4471         resched_task(curr);
4472         /*
4473          * Only set the backward buddy when the current task is still
4474          * on the rq. This can happen when a wakeup gets interleaved
4475          * with schedule on the ->pre_schedule() or idle_balance()
4476          * point, either of which can * drop the rq lock.
4477          *
4478          * Also, during early boot the idle thread is in the fair class,
4479          * for obvious reasons its a bad idea to schedule back to it.
4480          */
4481         if (unlikely(!se->on_rq || curr == rq->idle))
4482                 return;
4483
4484         if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
4485                 set_last_buddy(se);
4486 }
4487
4488 static struct task_struct *pick_next_task_fair(struct rq *rq)
4489 {
4490         struct task_struct *p;
4491         struct cfs_rq *cfs_rq = &rq->cfs;
4492         struct sched_entity *se;
4493
4494         if (!cfs_rq->nr_running)
4495                 return NULL;
4496
4497         do {
4498                 se = pick_next_entity(cfs_rq);
4499                 set_next_entity(cfs_rq, se);
4500                 cfs_rq = group_cfs_rq(se);
4501         } while (cfs_rq);
4502
4503         p = task_of(se);
4504         if (hrtick_enabled(rq))
4505                 hrtick_start_fair(rq, p);
4506
4507         return p;
4508 }
4509
4510 /*
4511  * Account for a descheduled task:
4512  */
4513 static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
4514 {
4515         struct sched_entity *se = &prev->se;
4516         struct cfs_rq *cfs_rq;
4517
4518         for_each_sched_entity(se) {
4519                 cfs_rq = cfs_rq_of(se);
4520                 put_prev_entity(cfs_rq, se);
4521         }
4522 }
4523
4524 /*
4525  * sched_yield() is very simple
4526  *
4527  * The magic of dealing with the ->skip buddy is in pick_next_entity.
4528  */
4529 static void yield_task_fair(struct rq *rq)
4530 {
4531         struct task_struct *curr = rq->curr;
4532         struct cfs_rq *cfs_rq = task_cfs_rq(curr);
4533         struct sched_entity *se = &curr->se;
4534
4535         /*
4536          * Are we the only task in the tree?
4537          */
4538         if (unlikely(rq->nr_running == 1))
4539                 return;
4540
4541         clear_buddies(cfs_rq, se);
4542
4543         if (curr->policy != SCHED_BATCH) {
4544                 update_rq_clock(rq);
4545                 /*
4546                  * Update run-time statistics of the 'current'.
4547                  */
4548                 update_curr(cfs_rq);
4549                 /*
4550                  * Tell update_rq_clock() that we've just updated,
4551                  * so we don't do microscopic update in schedule()
4552                  * and double the fastpath cost.
4553                  */
4554                  rq->skip_clock_update = 1;
4555         }
4556
4557         set_skip_buddy(se);
4558 }
4559
4560 static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt)
4561 {
4562         struct sched_entity *se = &p->se;
4563
4564         /* throttled hierarchies are not runnable */
4565         if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se)))
4566                 return false;
4567
4568         /* Tell the scheduler that we'd really like pse to run next. */
4569         set_next_buddy(se);
4570
4571         yield_task_fair(rq);
4572
4573         return true;
4574 }
4575
4576 #ifdef CONFIG_SMP
4577 /**************************************************
4578  * Fair scheduling class load-balancing methods.
4579  *
4580  * BASICS
4581  *
4582  * The purpose of load-balancing is to achieve the same basic fairness the
4583  * per-cpu scheduler provides, namely provide a proportional amount of compute
4584  * time to each task. This is expressed in the following equation:
4585  *
4586  *   W_i,n/P_i == W_j,n/P_j for all i,j                               (1)
4587  *
4588  * Where W_i,n is the n-th weight average for cpu i. The instantaneous weight
4589  * W_i,0 is defined as:
4590  *
4591  *   W_i,0 = \Sum_j w_i,j                                             (2)
4592  *
4593  * Where w_i,j is the weight of the j-th runnable task on cpu i. This weight
4594  * is derived from the nice value as per prio_to_weight[].
4595  *
4596  * The weight average is an exponential decay average of the instantaneous
4597  * weight:
4598  *
4599  *   W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0               (3)
4600  *
4601  * P_i is the cpu power (or compute capacity) of cpu i, typically it is the
4602  * fraction of 'recent' time available for SCHED_OTHER task execution. But it
4603  * can also include other factors [XXX].
4604  *
4605  * To achieve this balance we define a measure of imbalance which follows
4606  * directly from (1):
4607  *
4608  *   imb_i,j = max{ avg(W/P), W_i/P_i } - min{ avg(W/P), W_j/P_j }    (4)
4609  *
4610  * We them move tasks around to minimize the imbalance. In the continuous
4611  * function space it is obvious this converges, in the discrete case we get
4612  * a few fun cases generally called infeasible weight scenarios.
4613  *
4614  * [XXX expand on:
4615  *     - infeasible weights;
4616  *     - local vs global optima in the discrete case. ]
4617  *
4618  *
4619  * SCHED DOMAINS
4620  *
4621  * In order to solve the imbalance equation (4), and avoid the obvious O(n^2)
4622  * for all i,j solution, we create a tree of cpus that follows the hardware
4623  * topology where each level pairs two lower groups (or better). This results
4624  * in O(log n) layers. Furthermore we reduce the number of cpus going up the
4625  * tree to only the first of the previous level and we decrease the frequency
4626  * of load-balance at each level inv. proportional to the number of cpus in
4627  * the groups.
4628  *
4629  * This yields:
4630  *
4631  *     log_2 n     1     n
4632  *   \Sum       { --- * --- * 2^i } = O(n)                            (5)
4633  *     i = 0      2^i   2^i
4634  *                               `- size of each group
4635  *         |         |     `- number of cpus doing load-balance
4636  *         |         `- freq
4637  *         `- sum over all levels
4638  *
4639  * Coupled with a limit on how many tasks we can migrate every balance pass,
4640  * this makes (5) the runtime complexity of the balancer.
4641  *
4642  * An important property here is that each CPU is still (indirectly) connected
4643  * to every other cpu in at most O(log n) steps:
4644  *
4645  * The adjacency matrix of the resulting graph is given by:
4646  *
4647  *             log_2 n
4648  *   A_i,j = \Union     (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1)  (6)
4649  *             k = 0
4650  *
4651  * And you'll find that:
4652  *
4653  *   A^(log_2 n)_i,j != 0  for all i,j                                (7)
4654  *
4655  * Showing there's indeed a path between every cpu in at most O(log n) steps.
4656  * The task movement gives a factor of O(m), giving a convergence complexity
4657  * of:
4658  *
4659  *   O(nm log n),  n := nr_cpus, m := nr_tasks                        (8)
4660  *
4661  *
4662  * WORK CONSERVING
4663  *
4664  * In order to avoid CPUs going idle while there's still work to do, new idle
4665  * balancing is more aggressive and has the newly idle cpu iterate up the domain
4666  * tree itself instead of relying on other CPUs to bring it work.
4667  *
4668  * This adds some complexity to both (5) and (8) but it reduces the total idle
4669  * time.
4670  *
4671  * [XXX more?]
4672  *
4673  *
4674  * CGROUPS
4675  *
4676  * Cgroups make a horror show out of (2), instead of a simple sum we get:
4677  *
4678  *                                s_k,i
4679  *   W_i,0 = \Sum_j \Prod_k w_k * -----                               (9)
4680  *                                 S_k
4681  *
4682  * Where
4683  *
4684  *   s_k,i = \Sum_j w_i,j,k  and  S_k = \Sum_i s_k,i                 (10)
4685  *
4686  * w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on cpu i.
4687  *
4688  * The big problem is S_k, its a global sum needed to compute a local (W_i)
4689  * property.
4690  *
4691  * [XXX write more on how we solve this.. _after_ merging pjt's patches that
4692  *      rewrite all of this once again.]
4693  */
4694
4695 static unsigned long __read_mostly max_load_balance_interval = HZ/10;
4696
4697 #define LBF_ALL_PINNED  0x01
4698 #define LBF_NEED_BREAK  0x02
4699 #define LBF_SOME_PINNED 0x04
4700
4701 struct lb_env {
4702         struct sched_domain     *sd;
4703
4704         struct rq               *src_rq;
4705         int                     src_cpu;
4706
4707         int                     dst_cpu;
4708         struct rq               *dst_rq;
4709
4710         struct cpumask          *dst_grpmask;
4711         int                     new_dst_cpu;
4712         enum cpu_idle_type      idle;
4713         long                    imbalance;
4714         /* The set of CPUs under consideration for load-balancing */
4715         struct cpumask          *cpus;
4716
4717         unsigned int            flags;
4718
4719         unsigned int            loop;
4720         unsigned int            loop_break;
4721         unsigned int            loop_max;
4722 };
4723
4724 /*
4725  * move_task - move a task from one runqueue to another runqueue.
4726  * Both runqueues must be locked.
4727  */
4728 static void move_task(struct task_struct *p, struct lb_env *env)
4729 {
4730         deactivate_task(env->src_rq, p, 0);
4731         set_task_cpu(p, env->dst_cpu);
4732         activate_task(env->dst_rq, p, 0);
4733         check_preempt_curr(env->dst_rq, p, 0);
4734 }
4735
4736 /*
4737  * Is this task likely cache-hot:
4738  */
4739 static int
4740 task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
4741 {
4742         s64 delta;
4743
4744         if (p->sched_class != &fair_sched_class)
4745                 return 0;
4746
4747         if (unlikely(p->policy == SCHED_IDLE))
4748                 return 0;
4749
4750         /*
4751          * Buddy candidates are cache hot:
4752          */
4753         if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running &&
4754                         (&p->se == cfs_rq_of(&p->se)->next ||
4755                          &p->se == cfs_rq_of(&p->se)->last))
4756                 return 1;
4757
4758         if (sysctl_sched_migration_cost == -1)
4759                 return 1;
4760         if (sysctl_sched_migration_cost == 0)
4761                 return 0;
4762
4763         delta = now - p->se.exec_start;
4764
4765         return delta < (s64)sysctl_sched_migration_cost;
4766 }
4767
4768 /*
4769  * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
4770  */
4771 static
4772 int can_migrate_task(struct task_struct *p, struct lb_env *env)
4773 {
4774         int tsk_cache_hot = 0;
4775         /*
4776          * We do not migrate tasks that are:
4777          * 1) throttled_lb_pair, or
4778          * 2) cannot be migrated to this CPU due to cpus_allowed, or
4779          * 3) running (obviously), or
4780          * 4) are cache-hot on their current CPU.
4781          */
4782         if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
4783                 return 0;
4784
4785         if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
4786                 int cpu;
4787
4788                 schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
4789
4790                 /*
4791                  * Remember if this task can be migrated to any other cpu in
4792                  * our sched_group. We may want to revisit it if we couldn't
4793                  * meet load balance goals by pulling other tasks on src_cpu.
4794                  *
4795                  * Also avoid computing new_dst_cpu if we have already computed
4796                  * one in current iteration.
4797                  */
4798                 if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED))
4799                         return 0;
4800
4801                 /* Prevent to re-select dst_cpu via env's cpus */
4802                 for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
4803                         if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) {
4804                                 env->flags |= LBF_SOME_PINNED;
4805                                 env->new_dst_cpu = cpu;
4806                                 break;
4807                         }
4808                 }
4809
4810                 return 0;
4811         }
4812
4813         /* Record that we found atleast one task that could run on dst_cpu */
4814         env->flags &= ~LBF_ALL_PINNED;
4815
4816         if (task_running(env->src_rq, p)) {
4817                 schedstat_inc(p, se.statistics.nr_failed_migrations_running);
4818                 return 0;
4819         }
4820
4821         /*
4822          * Aggressive migration if:
4823          * 1) task is cache cold, or
4824          * 2) too many balance attempts have failed.
4825          */
4826         tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd);
4827         if (!tsk_cache_hot ||
4828                 env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
4829
4830                 if (tsk_cache_hot) {
4831                         schedstat_inc(env->sd, lb_hot_gained[env->idle]);
4832                         schedstat_inc(p, se.statistics.nr_forced_migrations);
4833                 }
4834
4835                 return 1;
4836         }
4837
4838         schedstat_inc(p, se.statistics.nr_failed_migrations_hot);
4839         return 0;
4840 }
4841
4842 /*
4843  * move_one_task tries to move exactly one task from busiest to this_rq, as
4844  * part of active balancing operations within "domain".
4845  * Returns 1 if successful and 0 otherwise.
4846  *
4847  * Called with both runqueues locked.
4848  */
4849 static int move_one_task(struct lb_env *env)
4850 {
4851         struct task_struct *p, *n;
4852
4853         list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
4854                 if (!can_migrate_task(p, env))
4855                         continue;
4856
4857                 move_task(p, env);
4858                 /*
4859                  * Right now, this is only the second place move_task()
4860                  * is called, so we can safely collect move_task()
4861                  * stats here rather than inside move_task().
4862                  */
4863                 schedstat_inc(env->sd, lb_gained[env->idle]);
4864                 return 1;
4865         }
4866         return 0;
4867 }
4868
4869 static unsigned long task_h_load(struct task_struct *p);
4870
4871 static const unsigned int sched_nr_migrate_break = 32;
4872
4873 /*
4874  * move_tasks tries to move up to imbalance weighted load from busiest to
4875  * this_rq, as part of a balancing operation within domain "sd".
4876  * Returns 1 if successful and 0 otherwise.
4877  *
4878  * Called with both runqueues locked.
4879  */
4880 static int move_tasks(struct lb_env *env)
4881 {
4882         struct list_head *tasks = &env->src_rq->cfs_tasks;
4883         struct task_struct *p;
4884         unsigned long load;
4885         int pulled = 0;
4886
4887         if (env->imbalance <= 0)
4888                 return 0;
4889
4890         while (!list_empty(tasks)) {
4891                 p = list_first_entry(tasks, struct task_struct, se.group_node);
4892
4893                 env->loop++;
4894                 /* We've more or less seen every task there is, call it quits */
4895                 if (env->loop > env->loop_max)
4896                         break;
4897
4898                 /* take a breather every nr_migrate tasks */
4899                 if (env->loop > env->loop_break) {
4900                         env->loop_break += sched_nr_migrate_break;
4901                         env->flags |= LBF_NEED_BREAK;
4902                         break;
4903                 }
4904
4905                 if (!can_migrate_task(p, env))
4906                         goto next;
4907
4908                 load = task_h_load(p);
4909
4910                 if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
4911                         goto next;
4912
4913                 if ((load / 2) > env->imbalance)
4914                         goto next;
4915
4916                 move_task(p, env);
4917                 pulled++;
4918                 env->imbalance -= load;
4919
4920 #ifdef CONFIG_PREEMPT
4921                 /*
4922                  * NEWIDLE balancing is a source of latency, so preemptible
4923                  * kernels will stop after the first task is pulled to minimize
4924                  * the critical section.
4925                  */
4926                 if (env->idle == CPU_NEWLY_IDLE)
4927                         break;
4928 #endif
4929
4930                 /*
4931                  * We only want to steal up to the prescribed amount of
4932                  * weighted load.
4933                  */
4934                 if (env->imbalance <= 0)
4935                         break;
4936
4937                 continue;
4938 next:
4939                 list_move_tail(&p->se.group_node, tasks);
4940         }
4941
4942         /*
4943          * Right now, this is one of only two places move_task() is called,
4944          * so we can safely collect move_task() stats here rather than
4945          * inside move_task().
4946          */
4947         schedstat_add(env->sd, lb_gained[env->idle], pulled);
4948
4949         return pulled;
4950 }
4951
4952 #ifdef CONFIG_FAIR_GROUP_SCHED
4953 /*
4954  * update tg->load_weight by folding this cpu's load_avg
4955  */
4956 static void __update_blocked_averages_cpu(struct task_group *tg, int cpu)
4957 {
4958         struct sched_entity *se = tg->se[cpu];
4959         struct cfs_rq *cfs_rq = tg->cfs_rq[cpu];
4960
4961         /* throttled entities do not contribute to load */
4962         if (throttled_hierarchy(cfs_rq))
4963                 return;
4964
4965         update_cfs_rq_blocked_load(cfs_rq, 1);
4966
4967         if (se) {
4968                 update_entity_load_avg(se, 1);
4969                 /*
4970                  * We pivot on our runnable average having decayed to zero for
4971                  * list removal.  This generally implies that all our children
4972                  * have also been removed (modulo rounding error or bandwidth
4973                  * control); however, such cases are rare and we can fix these
4974                  * at enqueue.
4975                  *
4976                  * TODO: fix up out-of-order children on enqueue.
4977                  */
4978                 if (!se->avg.runnable_avg_sum && !cfs_rq->nr_running)
4979                         list_del_leaf_cfs_rq(cfs_rq);
4980         } else {
4981                 struct rq *rq = rq_of(cfs_rq);
4982                 update_rq_runnable_avg(rq, rq->nr_running);
4983         }
4984 }
4985
4986 static void update_blocked_averages(int cpu)
4987 {
4988         struct rq *rq = cpu_rq(cpu);
4989         struct cfs_rq *cfs_rq;
4990         unsigned long flags;
4991
4992         raw_spin_lock_irqsave(&rq->lock, flags);
4993         update_rq_clock(rq);
4994         /*
4995          * Iterates the task_group tree in a bottom up fashion, see
4996          * list_add_leaf_cfs_rq() for details.
4997          */
4998         for_each_leaf_cfs_rq(rq, cfs_rq) {
4999                 /*
5000                  * Note: We may want to consider periodically releasing
5001                  * rq->lock about these updates so that creating many task
5002                  * groups does not result in continually extending hold time.
5003                  */
5004                 __update_blocked_averages_cpu(cfs_rq->tg, rq->cpu);
5005         }
5006
5007         raw_spin_unlock_irqrestore(&rq->lock, flags);
5008 }
5009
5010 /*
5011  * Compute the cpu's hierarchical load factor for each task group.
5012  * This needs to be done in a top-down fashion because the load of a child
5013  * group is a fraction of its parents load.
5014  */
5015 static int tg_load_down(struct task_group *tg, void *data)
5016 {
5017         unsigned long load;
5018         long cpu = (long)data;
5019
5020         if (!tg->parent) {
5021                 load = cpu_rq(cpu)->load.weight;
5022         } else {
5023                 load = tg->parent->cfs_rq[cpu]->h_load;
5024                 load *= tg->se[cpu]->load.weight;
5025                 load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
5026         }
5027
5028         tg->cfs_rq[cpu]->h_load = load;
5029
5030         return 0;
5031 }
5032
5033 static void update_h_load(long cpu)
5034 {
5035         struct rq *rq = cpu_rq(cpu);
5036         unsigned long now = jiffies;
5037
5038         if (rq->h_load_throttle == now)
5039                 return;
5040
5041         rq->h_load_throttle = now;
5042
5043         rcu_read_lock();
5044         walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
5045         rcu_read_unlock();
5046 }
5047
5048 static unsigned long task_h_load(struct task_struct *p)
5049 {
5050         struct cfs_rq *cfs_rq = task_cfs_rq(p);
5051         unsigned long load;
5052
5053         load = p->se.load.weight;
5054         load = div_u64(load * cfs_rq->h_load, cfs_rq->load.weight + 1);
5055
5056         return load;
5057 }
5058 #else
5059 static inline void update_blocked_averages(int cpu)
5060 {
5061 }
5062
5063 static inline void update_h_load(long cpu)
5064 {
5065 }
5066
5067 static unsigned long task_h_load(struct task_struct *p)
5068 {
5069         return p->se.load.weight;
5070 }
5071 #endif
5072
5073 /********** Helpers for find_busiest_group ************************/
5074 /*
5075  * sd_lb_stats - Structure to store the statistics of a sched_domain
5076  *              during load balancing.
5077  */
5078 struct sd_lb_stats {
5079         struct sched_group *busiest; /* Busiest group in this sd */
5080         struct sched_group *this;  /* Local group in this sd */
5081         unsigned long total_load;  /* Total load of all groups in sd */
5082         unsigned long total_pwr;   /*   Total power of all groups in sd */
5083         unsigned long avg_load;    /* Average load across all groups in sd */
5084
5085         /** Statistics of this group */
5086         unsigned long this_load;
5087         unsigned long this_load_per_task;
5088         unsigned long this_nr_running;
5089         unsigned long this_has_capacity;
5090         unsigned int  this_idle_cpus;
5091
5092         /* Statistics of the busiest group */
5093         unsigned int  busiest_idle_cpus;
5094         unsigned long max_load;
5095         unsigned long busiest_load_per_task;
5096         unsigned long busiest_nr_running;
5097         unsigned long busiest_group_capacity;
5098         unsigned long busiest_has_capacity;
5099         unsigned int  busiest_group_weight;
5100
5101         int group_imb; /* Is there imbalance in this sd */
5102 };
5103
5104 /*
5105  * sg_lb_stats - stats of a sched_group required for load_balancing
5106  */
5107 struct sg_lb_stats {
5108         unsigned long avg_load; /*Avg load across the CPUs of the group */
5109         unsigned long group_load; /* Total load over the CPUs of the group */
5110         unsigned long sum_nr_running; /* Nr tasks running in the group */
5111         unsigned long sum_weighted_load; /* Weighted load of group's tasks */
5112         unsigned long group_capacity;
5113         unsigned long idle_cpus;
5114         unsigned long group_weight;
5115         int group_imb; /* Is there an imbalance in the group ? */
5116         int group_has_capacity; /* Is there extra capacity in the group? */
5117 };
5118
5119 /**
5120  * get_sd_load_idx - Obtain the load index for a given sched domain.
5121  * @sd: The sched_domain whose load_idx is to be obtained.
5122  * @idle: The Idle status of the CPU for whose sd load_icx is obtained.
5123  */
5124 static inline int get_sd_load_idx(struct sched_domain *sd,
5125                                         enum cpu_idle_type idle)
5126 {
5127         int load_idx;
5128
5129         switch (idle) {
5130         case CPU_NOT_IDLE:
5131                 load_idx = sd->busy_idx;
5132                 break;
5133
5134         case CPU_NEWLY_IDLE:
5135                 load_idx = sd->newidle_idx;
5136                 break;
5137         default:
5138                 load_idx = sd->idle_idx;
5139                 break;
5140         }
5141
5142         return load_idx;
5143 }
5144
5145 static unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
5146 {
5147         return SCHED_POWER_SCALE;
5148 }
5149
5150 unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
5151 {
5152         return default_scale_freq_power(sd, cpu);
5153 }
5154
5155 static unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
5156 {
5157         unsigned long weight = sd->span_weight;
5158         unsigned long smt_gain = sd->smt_gain;
5159
5160         smt_gain /= weight;
5161
5162         return smt_gain;
5163 }
5164
5165 unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
5166 {
5167         return default_scale_smt_power(sd, cpu);
5168 }
5169
5170 static unsigned long scale_rt_power(int cpu)
5171 {
5172         struct rq *rq = cpu_rq(cpu);
5173         u64 total, available, age_stamp, avg;
5174
5175         /*
5176          * Since we're reading these variables without serialization make sure
5177          * we read them once before doing sanity checks on them.
5178          */
5179         age_stamp = ACCESS_ONCE(rq->age_stamp);
5180         avg = ACCESS_ONCE(rq->rt_avg);
5181
5182         total = sched_avg_period() + (rq->clock - age_stamp);
5183
5184         if (unlikely(total < avg)) {
5185                 /* Ensures that power won't end up being negative */
5186                 available = 0;
5187         } else {
5188                 available = total - avg;
5189         }
5190
5191         if (unlikely((s64)total < SCHED_POWER_SCALE))
5192                 total = SCHED_POWER_SCALE;
5193
5194         total >>= SCHED_POWER_SHIFT;
5195
5196         return div_u64(available, total);
5197 }
5198
5199 static void update_cpu_power(struct sched_domain *sd, int cpu)
5200 {
5201         unsigned long weight = sd->span_weight;
5202         unsigned long power = SCHED_POWER_SCALE;
5203         struct sched_group *sdg = sd->groups;
5204
5205         if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
5206                 if (sched_feat(ARCH_POWER))
5207                         power *= arch_scale_smt_power(sd, cpu);
5208                 else
5209                         power *= default_scale_smt_power(sd, cpu);
5210
5211                 power >>= SCHED_POWER_SHIFT;
5212         }
5213
5214         sdg->sgp->power_orig = power;
5215
5216         if (sched_feat(ARCH_POWER))
5217                 power *= arch_scale_freq_power(sd, cpu);
5218         else
5219                 power *= default_scale_freq_power(sd, cpu);
5220
5221         power >>= SCHED_POWER_SHIFT;
5222
5223         power *= scale_rt_power(cpu);
5224         power >>= SCHED_POWER_SHIFT;
5225
5226         if (!power)
5227                 power = 1;
5228
5229         cpu_rq(cpu)->cpu_power = power;
5230         sdg->sgp->power = power;
5231 }
5232
5233 void update_group_power(struct sched_domain *sd, int cpu)
5234 {
5235         struct sched_domain *child = sd->child;
5236         struct sched_group *group, *sdg = sd->groups;
5237         unsigned long power;
5238         unsigned long interval;
5239
5240         interval = msecs_to_jiffies(sd->balance_interval);
5241         interval = clamp(interval, 1UL, max_load_balance_interval);
5242         sdg->sgp->next_update = jiffies + interval;
5243
5244         if (!child) {
5245                 update_cpu_power(sd, cpu);
5246                 return;
5247         }
5248
5249         power = 0;
5250
5251         if (child->flags & SD_OVERLAP) {
5252                 /*
5253                  * SD_OVERLAP domains cannot assume that child groups
5254                  * span the current group.
5255                  */
5256
5257                 for_each_cpu(cpu, sched_group_cpus(sdg))
5258                         power += power_of(cpu);
5259         } else  {
5260                 /*
5261                  * !SD_OVERLAP domains can assume that child groups
5262                  * span the current group.
5263                  */
5264
5265                 group = child->groups;
5266                 do {
5267                         power += group->sgp->power;
5268                         group = group->next;
5269                 } while (group != child->groups);
5270         }
5271
5272         sdg->sgp->power_orig = sdg->sgp->power = power;
5273 }
5274
5275 /*
5276  * Try and fix up capacity for tiny siblings, this is needed when
5277  * things like SD_ASYM_PACKING need f_b_g to select another sibling
5278  * which on its own isn't powerful enough.
5279  *
5280  * See update_sd_pick_busiest() and check_asym_packing().
5281  */
5282 static inline int
5283 fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
5284 {
5285         /*
5286          * Only siblings can have significantly less than SCHED_POWER_SCALE
5287          */
5288         if (!(sd->flags & SD_SHARE_CPUPOWER))
5289                 return 0;
5290
5291         /*
5292          * If ~90% of the cpu_power is still there, we're good.
5293          */
5294         if (group->sgp->power * 32 > group->sgp->power_orig * 29)
5295                 return 1;
5296
5297         return 0;
5298 }
5299
5300 /**
5301  * update_sg_lb_stats - Update sched_group's statistics for load balancing.
5302  * @env: The load balancing environment.
5303  * @group: sched_group whose statistics are to be updated.
5304  * @load_idx: Load index of sched_domain of this_cpu for load calc.
5305  * @local_group: Does group contain this_cpu.
5306  * @balance: Should we balance.
5307  * @sgs: variable to hold the statistics for this group.
5308  */
5309 static inline void update_sg_lb_stats(struct lb_env *env,
5310                         struct sched_group *group, int load_idx,
5311                         int local_group, int *balance, struct sg_lb_stats *sgs)
5312 {
5313         unsigned long nr_running, max_nr_running, min_nr_running;
5314         unsigned long load, max_cpu_load, min_cpu_load;
5315         unsigned int balance_cpu = -1, first_idle_cpu = 0;
5316         unsigned long avg_load_per_task = 0;
5317         int i;
5318
5319         if (local_group)
5320                 balance_cpu = group_balance_cpu(group);
5321
5322         /* Tally up the load of all CPUs in the group */
5323         max_cpu_load = 0;
5324         min_cpu_load = ~0UL;
5325         max_nr_running = 0;
5326         min_nr_running = ~0UL;
5327
5328         for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
5329                 struct rq *rq = cpu_rq(i);
5330
5331                 nr_running = rq->nr_running;
5332
5333                 /* Bias balancing toward cpus of our domain */
5334                 if (local_group) {
5335                         if (idle_cpu(i) && !first_idle_cpu &&
5336                                         cpumask_test_cpu(i, sched_group_mask(group))) {
5337                                 first_idle_cpu = 1;
5338                                 balance_cpu = i;
5339                         }
5340
5341                         load = target_load(i, load_idx);
5342                 } else {
5343                         load = source_load(i, load_idx);
5344                         if (load > max_cpu_load)
5345                                 max_cpu_load = load;
5346                         if (min_cpu_load > load)
5347                                 min_cpu_load = load;
5348
5349                         if (nr_running > max_nr_running)
5350                                 max_nr_running = nr_running;
5351                         if (min_nr_running > nr_running)
5352                                 min_nr_running = nr_running;
5353                 }
5354
5355                 sgs->group_load += load;
5356                 sgs->sum_nr_running += nr_running;
5357                 sgs->sum_weighted_load += weighted_cpuload(i);
5358                 if (idle_cpu(i))
5359                         sgs->idle_cpus++;
5360         }
5361
5362         /*
5363          * First idle cpu or the first cpu(busiest) in this sched group
5364          * is eligible for doing load balancing at this and above
5365          * domains. In the newly idle case, we will allow all the cpu's
5366          * to do the newly idle load balance.
5367          */
5368         if (local_group) {
5369                 if (env->idle != CPU_NEWLY_IDLE) {
5370                         if (balance_cpu != env->dst_cpu) {
5371                                 *balance = 0;
5372                                 return;
5373                         }
5374                         update_group_power(env->sd, env->dst_cpu);
5375                 } else if (time_after_eq(jiffies, group->sgp->next_update))
5376                         update_group_power(env->sd, env->dst_cpu);
5377         }
5378
5379         /* Adjust by relative CPU power of the group */
5380         sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / group->sgp->power;
5381
5382         /*
5383          * Consider the group unbalanced when the imbalance is larger
5384          * than the average weight of a task.
5385          *
5386          * APZ: with cgroup the avg task weight can vary wildly and
5387          *      might not be a suitable number - should we keep a
5388          *      normalized nr_running number somewhere that negates
5389          *      the hierarchy?
5390          */
5391         if (sgs->sum_nr_running)
5392                 avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
5393
5394         if ((max_cpu_load - min_cpu_load) >= avg_load_per_task &&
5395             (max_nr_running - min_nr_running) > 1)
5396                 sgs->group_imb = 1;
5397
5398         sgs->group_capacity = DIV_ROUND_CLOSEST(group->sgp->power,
5399                                                 SCHED_POWER_SCALE);
5400         if (!sgs->group_capacity)
5401                 sgs->group_capacity = fix_small_capacity(env->sd, group);
5402         sgs->group_weight = group->group_weight;
5403
5404         if (sgs->group_capacity > sgs->sum_nr_running)
5405                 sgs->group_has_capacity = 1;
5406 }
5407
5408 /**
5409  * update_sd_pick_busiest - return 1 on busiest group
5410  * @env: The load balancing environment.
5411  * @sds: sched_domain statistics
5412  * @sg: sched_group candidate to be checked for being the busiest
5413  * @sgs: sched_group statistics
5414  *
5415  * Determine if @sg is a busier group than the previously selected
5416  * busiest group.
5417  */
5418 static bool update_sd_pick_busiest(struct lb_env *env,
5419                                    struct sd_lb_stats *sds,
5420                                    struct sched_group *sg,
5421                                    struct sg_lb_stats *sgs)
5422 {
5423         if (sgs->avg_load <= sds->max_load)
5424                 return false;
5425
5426         if (sgs->sum_nr_running > sgs->group_capacity)
5427                 return true;
5428
5429         if (sgs->group_imb)
5430                 return true;
5431
5432         /*
5433          * ASYM_PACKING needs to move all the work to the lowest
5434          * numbered CPUs in the group, therefore mark all groups
5435          * higher than ourself as busy.
5436          */
5437         if ((env->sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running &&
5438             env->dst_cpu < group_first_cpu(sg)) {
5439                 if (!sds->busiest)
5440                         return true;
5441
5442                 if (group_first_cpu(sds->busiest) > group_first_cpu(sg))
5443                         return true;
5444         }
5445
5446         return false;
5447 }
5448
5449 /**
5450  * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
5451  * @env: The load balancing environment.
5452  * @balance: Should we balance.
5453  * @sds: variable to hold the statistics for this sched_domain.
5454  */
5455 static inline void update_sd_lb_stats(struct lb_env *env,
5456                                         int *balance, struct sd_lb_stats *sds)
5457 {
5458         struct sched_domain *child = env->sd->child;
5459         struct sched_group *sg = env->sd->groups;
5460         struct sg_lb_stats sgs;
5461         int load_idx, prefer_sibling = 0;
5462
5463         if (child && child->flags & SD_PREFER_SIBLING)
5464                 prefer_sibling = 1;
5465
5466         load_idx = get_sd_load_idx(env->sd, env->idle);
5467
5468         do {
5469                 int local_group;
5470
5471                 local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg));
5472                 memset(&sgs, 0, sizeof(sgs));
5473                 update_sg_lb_stats(env, sg, load_idx, local_group, balance, &sgs);
5474
5475                 if (local_group && !(*balance))
5476                         return;
5477
5478                 sds->total_load += sgs.group_load;
5479                 sds->total_pwr += sg->sgp->power;
5480
5481                 /*
5482                  * In case the child domain prefers tasks go to siblings
5483                  * first, lower the sg capacity to one so that we'll try
5484                  * and move all the excess tasks away. We lower the capacity
5485                  * of a group only if the local group has the capacity to fit
5486                  * these excess tasks, i.e. nr_running < group_capacity. The
5487                  * extra check prevents the case where you always pull from the
5488                  * heaviest group when it is already under-utilized (possible
5489                  * with a large weight task outweighs the tasks on the system).
5490                  */
5491                 if (prefer_sibling && !local_group && sds->this_has_capacity)
5492                         sgs.group_capacity = min(sgs.group_capacity, 1UL);
5493
5494                 if (local_group) {
5495                         sds->this_load = sgs.avg_load;
5496                         sds->this = sg;
5497                         sds->this_nr_running = sgs.sum_nr_running;
5498                         sds->this_load_per_task = sgs.sum_weighted_load;
5499                         sds->this_has_capacity = sgs.group_has_capacity;
5500                         sds->this_idle_cpus = sgs.idle_cpus;
5501                 } else if (update_sd_pick_busiest(env, sds, sg, &sgs)) {
5502                         sds->max_load = sgs.avg_load;
5503                         sds->busiest = sg;
5504                         sds->busiest_nr_running = sgs.sum_nr_running;
5505                         sds->busiest_idle_cpus = sgs.idle_cpus;
5506                         sds->busiest_group_capacity = sgs.group_capacity;
5507                         sds->busiest_load_per_task = sgs.sum_weighted_load;
5508                         sds->busiest_has_capacity = sgs.group_has_capacity;
5509                         sds->busiest_group_weight = sgs.group_weight;
5510                         sds->group_imb = sgs.group_imb;
5511                 }
5512
5513                 sg = sg->next;
5514         } while (sg != env->sd->groups);
5515 }
5516
5517 /**
5518  * check_asym_packing - Check to see if the group is packed into the
5519  *                      sched doman.
5520  *
5521  * This is primarily intended to used at the sibling level.  Some
5522  * cores like POWER7 prefer to use lower numbered SMT threads.  In the
5523  * case of POWER7, it can move to lower SMT modes only when higher
5524  * threads are idle.  When in lower SMT modes, the threads will
5525  * perform better since they share less core resources.  Hence when we
5526  * have idle threads, we want them to be the higher ones.
5527  *
5528  * This packing function is run on idle threads.  It checks to see if
5529  * the busiest CPU in this domain (core in the P7 case) has a higher
5530  * CPU number than the packing function is being run on.  Here we are
5531  * assuming lower CPU number will be equivalent to lower a SMT thread
5532  * number.
5533  *
5534  * Returns 1 when packing is required and a task should be moved to
5535  * this CPU.  The amount of the imbalance is returned in *imbalance.
5536  *
5537  * @env: The load balancing environment.
5538  * @sds: Statistics of the sched_domain which is to be packed
5539  */
5540 static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
5541 {
5542         int busiest_cpu;
5543
5544         if (!(env->sd->flags & SD_ASYM_PACKING))
5545                 return 0;
5546
5547         if (!sds->busiest)
5548                 return 0;
5549
5550         busiest_cpu = group_first_cpu(sds->busiest);
5551         if (env->dst_cpu > busiest_cpu)
5552                 return 0;
5553
5554         env->imbalance = DIV_ROUND_CLOSEST(
5555                 sds->max_load * sds->busiest->sgp->power, SCHED_POWER_SCALE);
5556
5557         return 1;
5558 }
5559
5560 /**
5561  * fix_small_imbalance - Calculate the minor imbalance that exists
5562  *                      amongst the groups of a sched_domain, during
5563  *                      load balancing.
5564  * @env: The load balancing environment.
5565  * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
5566  */
5567 static inline
5568 void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
5569 {
5570         unsigned long tmp, pwr_now = 0, pwr_move = 0;
5571         unsigned int imbn = 2;
5572         unsigned long scaled_busy_load_per_task;
5573
5574         if (sds->this_nr_running) {
5575                 sds->this_load_per_task /= sds->this_nr_running;
5576                 if (sds->busiest_load_per_task >
5577                                 sds->this_load_per_task)
5578                         imbn = 1;
5579         } else {
5580                 sds->this_load_per_task =
5581                         cpu_avg_load_per_task(env->dst_cpu);
5582         }
5583
5584         scaled_busy_load_per_task = sds->busiest_load_per_task
5585                                          * SCHED_POWER_SCALE;
5586         scaled_busy_load_per_task /= sds->busiest->sgp->power;
5587
5588         if (sds->max_load - sds->this_load + scaled_busy_load_per_task >=
5589                         (scaled_busy_load_per_task * imbn)) {
5590                 env->imbalance = sds->busiest_load_per_task;
5591                 return;
5592         }
5593
5594         /*
5595          * OK, we don't have enough imbalance to justify moving tasks,
5596          * however we may be able to increase total CPU power used by
5597          * moving them.
5598          */
5599
5600         pwr_now += sds->busiest->sgp->power *
5601                         min(sds->busiest_load_per_task, sds->max_load);
5602         pwr_now += sds->this->sgp->power *
5603                         min(sds->this_load_per_task, sds->this_load);
5604         pwr_now /= SCHED_POWER_SCALE;
5605
5606         /* Amount of load we'd subtract */
5607         tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) /
5608                 sds->busiest->sgp->power;
5609         if (sds->max_load > tmp)
5610                 pwr_move += sds->busiest->sgp->power *
5611                         min(sds->busiest_load_per_task, sds->max_load - tmp);
5612
5613         /* Amount of load we'd add */
5614         if (sds->max_load * sds->busiest->sgp->power <
5615                 sds->busiest_load_per_task * SCHED_POWER_SCALE)
5616                 tmp = (sds->max_load * sds->busiest->sgp->power) /
5617                         sds->this->sgp->power;
5618         else
5619                 tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) /
5620                         sds->this->sgp->power;
5621         pwr_move += sds->this->sgp->power *
5622                         min(sds->this_load_per_task, sds->this_load + tmp);
5623         pwr_move /= SCHED_POWER_SCALE;
5624
5625         /* Move if we gain throughput */
5626         if (pwr_move > pwr_now)
5627                 env->imbalance = sds->busiest_load_per_task;
5628 }
5629
5630 /**
5631  * calculate_imbalance - Calculate the amount of imbalance present within the
5632  *                       groups of a given sched_domain during load balance.
5633  * @env: load balance environment
5634  * @sds: statistics of the sched_domain whose imbalance is to be calculated.
5635  */
5636 static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
5637 {
5638         unsigned long max_pull, load_above_capacity = ~0UL;
5639
5640         sds->busiest_load_per_task /= sds->busiest_nr_running;
5641         if (sds->group_imb) {
5642                 sds->busiest_load_per_task =
5643                         min(sds->busiest_load_per_task, sds->avg_load);
5644         }
5645
5646         /*
5647          * In the presence of smp nice balancing, certain scenarios can have
5648          * max load less than avg load(as we skip the groups at or below
5649          * its cpu_power, while calculating max_load..)
5650          */
5651         if (sds->max_load < sds->avg_load) {
5652                 env->imbalance = 0;
5653                 return fix_small_imbalance(env, sds);
5654         }
5655
5656         if (!sds->group_imb) {
5657                 /*
5658                  * Don't want to pull so many tasks that a group would go idle.
5659                  */
5660                 load_above_capacity = (sds->busiest_nr_running -
5661                                                 sds->busiest_group_capacity);
5662
5663                 load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE);
5664
5665                 load_above_capacity /= sds->busiest->sgp->power;
5666         }
5667
5668         /*
5669          * We're trying to get all the cpus to the average_load, so we don't
5670          * want to push ourselves above the average load, nor do we wish to
5671          * reduce the max loaded cpu below the average load. At the same time,
5672          * we also don't want to reduce the group load below the group capacity
5673          * (so that we can implement power-savings policies etc). Thus we look
5674          * for the minimum possible imbalance.
5675          * Be careful of negative numbers as they'll appear as very large values
5676          * with unsigned longs.
5677          */
5678         max_pull = min(sds->max_load - sds->avg_load, load_above_capacity);
5679
5680         /* How much load to actually move to equalise the imbalance */
5681         env->imbalance = min(max_pull * sds->busiest->sgp->power,
5682                 (sds->avg_load - sds->this_load) * sds->this->sgp->power)
5683                         / SCHED_POWER_SCALE;
5684
5685         /*
5686          * if *imbalance is less than the average load per runnable task
5687          * there is no guarantee that any tasks will be moved so we'll have
5688          * a think about bumping its value to force at least one task to be
5689          * moved
5690          */
5691         if (env->imbalance < sds->busiest_load_per_task)
5692                 return fix_small_imbalance(env, sds);
5693
5694 }
5695
5696 /******* find_busiest_group() helpers end here *********************/
5697
5698 /**
5699  * find_busiest_group - Returns the busiest group within the sched_domain
5700  * if there is an imbalance. If there isn't an imbalance, and
5701  * the user has opted for power-savings, it returns a group whose
5702  * CPUs can be put to idle by rebalancing those tasks elsewhere, if
5703  * such a group exists.
5704  *
5705  * Also calculates the amount of weighted load which should be moved
5706  * to restore balance.
5707  *
5708  * @env: The load balancing environment.
5709  * @balance: Pointer to a variable indicating if this_cpu
5710  *      is the appropriate cpu to perform load balancing at this_level.
5711  *
5712  * Returns:     - the busiest group if imbalance exists.
5713  *              - If no imbalance and user has opted for power-savings balance,
5714  *                 return the least loaded group whose CPUs can be
5715  *                 put to idle by rebalancing its tasks onto our group.
5716  */
5717 static struct sched_group *
5718 find_busiest_group(struct lb_env *env, int *balance)
5719 {
5720         struct sd_lb_stats sds;
5721
5722         memset(&sds, 0, sizeof(sds));
5723
5724         /*
5725          * Compute the various statistics relavent for load balancing at
5726          * this level.
5727          */
5728         update_sd_lb_stats(env, balance, &sds);
5729
5730         /*
5731          * this_cpu is not the appropriate cpu to perform load balancing at
5732          * this level.
5733          */
5734         if (!(*balance))
5735                 goto ret;
5736
5737         if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) &&
5738             check_asym_packing(env, &sds))
5739                 return sds.busiest;
5740
5741         /* There is no busy sibling group to pull tasks from */
5742         if (!sds.busiest || sds.busiest_nr_running == 0)
5743                 goto out_balanced;
5744
5745         sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_pwr;
5746
5747         /*
5748          * If the busiest group is imbalanced the below checks don't
5749          * work because they assumes all things are equal, which typically
5750          * isn't true due to cpus_allowed constraints and the like.
5751          */
5752         if (sds.group_imb)
5753                 goto force_balance;
5754
5755         /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
5756         if (env->idle == CPU_NEWLY_IDLE && sds.this_has_capacity &&
5757                         !sds.busiest_has_capacity)
5758                 goto force_balance;
5759
5760         /*
5761          * If the local group is more busy than the selected busiest group
5762          * don't try and pull any tasks.
5763          */
5764         if (sds.this_load >= sds.max_load)
5765                 goto out_balanced;
5766
5767         /*
5768          * Don't pull any tasks if this group is already above the domain
5769          * average load.
5770          */
5771         if (sds.this_load >= sds.avg_load)
5772                 goto out_balanced;
5773
5774         if (env->idle == CPU_IDLE) {
5775                 /*
5776                  * This cpu is idle. If the busiest group load doesn't
5777                  * have more tasks than the number of available cpu's and
5778                  * there is no imbalance between this and busiest group
5779                  * wrt to idle cpu's, it is balanced.
5780                  */
5781                 if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) &&
5782                     sds.busiest_nr_running <= sds.busiest_group_weight)
5783                         goto out_balanced;
5784         } else {
5785                 /*
5786                  * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
5787                  * imbalance_pct to be conservative.
5788                  */
5789                 if (100 * sds.max_load <= env->sd->imbalance_pct * sds.this_load)
5790                         goto out_balanced;
5791         }
5792
5793 force_balance:
5794         /* Looks like there is an imbalance. Compute it */
5795         calculate_imbalance(env, &sds);
5796         return sds.busiest;
5797
5798 out_balanced:
5799 ret:
5800         env->imbalance = 0;
5801         return NULL;
5802 }
5803
5804 /*
5805  * find_busiest_queue - find the busiest runqueue among the cpus in group.
5806  */
5807 static struct rq *find_busiest_queue(struct lb_env *env,
5808                                      struct sched_group *group)
5809 {
5810         struct rq *busiest = NULL, *rq;
5811         unsigned long max_load = 0;
5812         int i;
5813
5814         for_each_cpu(i, sched_group_cpus(group)) {
5815                 unsigned long power = power_of(i);
5816                 unsigned long capacity = DIV_ROUND_CLOSEST(power,
5817                                                            SCHED_POWER_SCALE);
5818                 unsigned long wl;
5819
5820                 if (!capacity)
5821                         capacity = fix_small_capacity(env->sd, group);
5822
5823                 if (!cpumask_test_cpu(i, env->cpus))
5824                         continue;
5825
5826                 rq = cpu_rq(i);
5827                 wl = weighted_cpuload(i);
5828
5829                 /*
5830                  * When comparing with imbalance, use weighted_cpuload()
5831                  * which is not scaled with the cpu power.
5832                  */
5833                 if (capacity && rq->nr_running == 1 && wl > env->imbalance)
5834                         continue;
5835
5836                 /*
5837                  * For the load comparisons with the other cpu's, consider
5838                  * the weighted_cpuload() scaled with the cpu power, so that
5839                  * the load can be moved away from the cpu that is potentially
5840                  * running at a lower capacity.
5841                  */
5842                 wl = (wl * SCHED_POWER_SCALE) / power;
5843
5844                 if (wl > max_load) {
5845                         max_load = wl;
5846                         busiest = rq;
5847                 }
5848         }
5849
5850         return busiest;
5851 }
5852
5853 /*
5854  * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
5855  * so long as it is large enough.
5856  */
5857 #define MAX_PINNED_INTERVAL     512
5858
5859 /* Working cpumask for load_balance and load_balance_newidle. */
5860 DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
5861
5862 static int need_active_balance(struct lb_env *env)
5863 {
5864         struct sched_domain *sd = env->sd;
5865
5866         if (env->idle == CPU_NEWLY_IDLE) {
5867
5868                 /*
5869                  * ASYM_PACKING needs to force migrate tasks from busy but
5870                  * higher numbered CPUs in order to pack all tasks in the
5871                  * lowest numbered CPUs.
5872                  */
5873                 if ((sd->flags & SD_ASYM_PACKING) && env->src_cpu > env->dst_cpu)
5874                         return 1;
5875         }
5876
5877         return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
5878 }
5879
5880 static int active_load_balance_cpu_stop(void *data);
5881
5882 /*
5883  * Check this_cpu to ensure it is balanced within domain. Attempt to move
5884  * tasks if there is an imbalance.
5885  */
5886 static int load_balance(int this_cpu, struct rq *this_rq,
5887                         struct sched_domain *sd, enum cpu_idle_type idle,
5888                         int *balance)
5889 {
5890         int ld_moved, cur_ld_moved, active_balance = 0;
5891         struct sched_group *group;
5892         struct rq *busiest;
5893         unsigned long flags;
5894         struct cpumask *cpus = __get_cpu_var(load_balance_mask);
5895
5896         struct lb_env env = {
5897                 .sd             = sd,
5898                 .dst_cpu        = this_cpu,
5899                 .dst_rq         = this_rq,
5900                 .dst_grpmask    = sched_group_cpus(sd->groups),
5901                 .idle           = idle,
5902                 .loop_break     = sched_nr_migrate_break,
5903                 .cpus           = cpus,
5904         };
5905
5906         /*
5907          * For NEWLY_IDLE load_balancing, we don't need to consider
5908          * other cpus in our group
5909          */
5910         if (idle == CPU_NEWLY_IDLE)
5911                 env.dst_grpmask = NULL;
5912
5913         cpumask_copy(cpus, cpu_active_mask);
5914
5915         schedstat_inc(sd, lb_count[idle]);
5916
5917 redo:
5918         group = find_busiest_group(&env, balance);
5919
5920         if (*balance == 0)
5921                 goto out_balanced;
5922
5923         if (!group) {
5924                 schedstat_inc(sd, lb_nobusyg[idle]);
5925                 goto out_balanced;
5926         }
5927
5928         busiest = find_busiest_queue(&env, group);
5929         if (!busiest) {
5930                 schedstat_inc(sd, lb_nobusyq[idle]);
5931                 goto out_balanced;
5932         }
5933
5934         BUG_ON(busiest == env.dst_rq);
5935
5936         schedstat_add(sd, lb_imbalance[idle], env.imbalance);
5937
5938         ld_moved = 0;
5939         if (busiest->nr_running > 1) {
5940                 /*
5941                  * Attempt to move tasks. If find_busiest_group has found
5942                  * an imbalance but busiest->nr_running <= 1, the group is
5943                  * still unbalanced. ld_moved simply stays zero, so it is
5944                  * correctly treated as an imbalance.
5945                  */
5946                 env.flags |= LBF_ALL_PINNED;
5947                 env.src_cpu   = busiest->cpu;
5948                 env.src_rq    = busiest;
5949                 env.loop_max  = min(sysctl_sched_nr_migrate, busiest->nr_running);
5950
5951                 update_h_load(env.src_cpu);
5952 more_balance:
5953                 local_irq_save(flags);
5954                 double_rq_lock(env.dst_rq, busiest);
5955
5956                 /*
5957                  * cur_ld_moved - load moved in current iteration
5958                  * ld_moved     - cumulative load moved across iterations
5959                  */
5960                 cur_ld_moved = move_tasks(&env);
5961                 ld_moved += cur_ld_moved;
5962                 double_rq_unlock(env.dst_rq, busiest);
5963                 local_irq_restore(flags);
5964
5965                 /*
5966                  * some other cpu did the load balance for us.
5967                  */
5968                 if (cur_ld_moved && env.dst_cpu != smp_processor_id())
5969                         resched_cpu(env.dst_cpu);
5970
5971                 if (env.flags & LBF_NEED_BREAK) {
5972                         env.flags &= ~LBF_NEED_BREAK;
5973                         goto more_balance;
5974                 }
5975
5976                 /*
5977                  * Revisit (affine) tasks on src_cpu that couldn't be moved to
5978                  * us and move them to an alternate dst_cpu in our sched_group
5979                  * where they can run. The upper limit on how many times we
5980                  * iterate on same src_cpu is dependent on number of cpus in our
5981                  * sched_group.
5982                  *
5983                  * This changes load balance semantics a bit on who can move
5984                  * load to a given_cpu. In addition to the given_cpu itself
5985                  * (or a ilb_cpu acting on its behalf where given_cpu is
5986                  * nohz-idle), we now have balance_cpu in a position to move
5987                  * load to given_cpu. In rare situations, this may cause
5988                  * conflicts (balance_cpu and given_cpu/ilb_cpu deciding
5989                  * _independently_ and at _same_ time to move some load to
5990                  * given_cpu) causing exceess load to be moved to given_cpu.
5991                  * This however should not happen so much in practice and
5992                  * moreover subsequent load balance cycles should correct the
5993                  * excess load moved.
5994                  */
5995                 if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) {
5996
5997                         env.dst_rq       = cpu_rq(env.new_dst_cpu);
5998                         env.dst_cpu      = env.new_dst_cpu;
5999                         env.flags       &= ~LBF_SOME_PINNED;
6000                         env.loop         = 0;
6001                         env.loop_break   = sched_nr_migrate_break;
6002
6003                         /* Prevent to re-select dst_cpu via env's cpus */
6004                         cpumask_clear_cpu(env.dst_cpu, env.cpus);
6005
6006                         /*
6007                          * Go back to "more_balance" rather than "redo" since we
6008                          * need to continue with same src_cpu.
6009                          */
6010                         goto more_balance;
6011                 }
6012
6013                 /* All tasks on this runqueue were pinned by CPU affinity */
6014                 if (unlikely(env.flags & LBF_ALL_PINNED)) {
6015                         cpumask_clear_cpu(cpu_of(busiest), cpus);
6016                         if (!cpumask_empty(cpus)) {
6017                                 env.loop = 0;
6018                                 env.loop_break = sched_nr_migrate_break;
6019                                 goto redo;
6020                         }
6021                         goto out_balanced;
6022                 }
6023         }
6024
6025         if (!ld_moved) {
6026                 schedstat_inc(sd, lb_failed[idle]);
6027                 /*
6028                  * Increment the failure counter only on periodic balance.
6029                  * We do not want newidle balance, which can be very
6030                  * frequent, pollute the failure counter causing
6031                  * excessive cache_hot migrations and active balances.
6032                  */
6033                 if (idle != CPU_NEWLY_IDLE)
6034                         sd->nr_balance_failed++;
6035
6036                 if (need_active_balance(&env)) {
6037                         raw_spin_lock_irqsave(&busiest->lock, flags);
6038
6039                         /* don't kick the active_load_balance_cpu_stop,
6040                          * if the curr task on busiest cpu can't be
6041                          * moved to this_cpu
6042                          */
6043                         if (!cpumask_test_cpu(this_cpu,
6044                                         tsk_cpus_allowed(busiest->curr))) {
6045                                 raw_spin_unlock_irqrestore(&busiest->lock,
6046                                                             flags);
6047                                 env.flags |= LBF_ALL_PINNED;
6048                                 goto out_one_pinned;
6049                         }
6050
6051                         /*
6052                          * ->active_balance synchronizes accesses to
6053                          * ->active_balance_work.  Once set, it's cleared
6054                          * only after active load balance is finished.
6055                          */
6056                         if (!busiest->active_balance) {
6057                                 busiest->active_balance = 1;
6058                                 busiest->push_cpu = this_cpu;
6059                                 active_balance = 1;
6060                         }
6061                         raw_spin_unlock_irqrestore(&busiest->lock, flags);
6062
6063                         if (active_balance) {
6064                                 stop_one_cpu_nowait(cpu_of(busiest),
6065                                         active_load_balance_cpu_stop, busiest,
6066                                         &busiest->active_balance_work);
6067                         }
6068
6069                         /*
6070                          * We've kicked active balancing, reset the failure
6071                          * counter.
6072                          */
6073                         sd->nr_balance_failed = sd->cache_nice_tries+1;
6074                 }
6075         } else
6076                 sd->nr_balance_failed = 0;
6077
6078         if (likely(!active_balance)) {
6079                 /* We were unbalanced, so reset the balancing interval */
6080                 sd->balance_interval = sd->min_interval;
6081         } else {
6082                 /*
6083                  * If we've begun active balancing, start to back off. This
6084                  * case may not be covered by the all_pinned logic if there
6085                  * is only 1 task on the busy runqueue (because we don't call
6086                  * move_tasks).
6087                  */
6088                 if (sd->balance_interval < sd->max_interval)
6089                         sd->balance_interval *= 2;
6090         }
6091
6092         goto out;
6093
6094 out_balanced:
6095         schedstat_inc(sd, lb_balanced[idle]);
6096
6097         sd->nr_balance_failed = 0;
6098
6099 out_one_pinned:
6100         /* tune up the balancing interval */
6101         if (((env.flags & LBF_ALL_PINNED) &&
6102                         sd->balance_interval < MAX_PINNED_INTERVAL) ||
6103                         (sd->balance_interval < sd->max_interval))
6104                 sd->balance_interval *= 2;
6105
6106         ld_moved = 0;
6107 out:
6108         return ld_moved;
6109 }
6110 #ifdef CONFIG_SCHED_HMP
6111 static unsigned int hmp_idle_pull(int this_cpu);
6112 #endif
6113 /*
6114  * idle_balance is called by schedule() if this_cpu is about to become
6115  * idle. Attempts to pull tasks from other CPUs.
6116  */
6117 void idle_balance(int this_cpu, struct rq *this_rq)
6118 {
6119         struct sched_domain *sd;
6120         int pulled_task = 0;
6121         unsigned long next_balance = jiffies + HZ;
6122
6123         this_rq->idle_stamp = this_rq->clock;
6124
6125         if (this_rq->avg_idle < sysctl_sched_migration_cost)
6126                 return;
6127
6128         /*
6129          * Drop the rq->lock, but keep IRQ/preempt disabled.
6130          */
6131         raw_spin_unlock(&this_rq->lock);
6132
6133         update_blocked_averages(this_cpu);
6134         rcu_read_lock();
6135         for_each_domain(this_cpu, sd) {
6136                 unsigned long interval;
6137                 int balance = 1;
6138
6139                 if (!(sd->flags & SD_LOAD_BALANCE))
6140                         continue;
6141
6142                 if (sd->flags & SD_BALANCE_NEWIDLE) {
6143                         /* If we've pulled tasks over stop searching: */
6144                         pulled_task = load_balance(this_cpu, this_rq,
6145                                                    sd, CPU_NEWLY_IDLE, &balance);
6146                 }
6147
6148                 interval = msecs_to_jiffies(sd->balance_interval);
6149                 if (time_after(next_balance, sd->last_balance + interval))
6150                         next_balance = sd->last_balance + interval;
6151                 if (pulled_task) {
6152                         this_rq->idle_stamp = 0;
6153                         break;
6154                 }
6155         }
6156         rcu_read_unlock();
6157 #ifdef CONFIG_SCHED_HMP
6158         if (!pulled_task)
6159                 pulled_task = hmp_idle_pull(this_cpu);
6160 #endif
6161         raw_spin_lock(&this_rq->lock);
6162
6163         if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
6164                 /*
6165                  * We are going idle. next_balance may be set based on
6166                  * a busy processor. So reset next_balance.
6167                  */
6168                 this_rq->next_balance = next_balance;
6169         }
6170 }
6171
6172 /*
6173  * active_load_balance_cpu_stop is run by cpu stopper. It pushes
6174  * running tasks off the busiest CPU onto idle CPUs. It requires at
6175  * least 1 task to be running on each physical CPU where possible, and
6176  * avoids physical / logical imbalances.
6177  */
6178 static int active_load_balance_cpu_stop(void *data)
6179 {
6180         struct rq *busiest_rq = data;
6181         int busiest_cpu = cpu_of(busiest_rq);
6182         int target_cpu = busiest_rq->push_cpu;
6183         struct rq *target_rq = cpu_rq(target_cpu);
6184         struct sched_domain *sd;
6185
6186         raw_spin_lock_irq(&busiest_rq->lock);
6187
6188         /* make sure the requested cpu hasn't gone down in the meantime */
6189         if (unlikely(busiest_cpu != smp_processor_id() ||
6190                      !busiest_rq->active_balance))
6191                 goto out_unlock;
6192
6193         /* Is there any task to move? */
6194         if (busiest_rq->nr_running <= 1)
6195                 goto out_unlock;
6196
6197         /*
6198          * This condition is "impossible", if it occurs
6199          * we need to fix it. Originally reported by
6200          * Bjorn Helgaas on a 128-cpu setup.
6201          */
6202         BUG_ON(busiest_rq == target_rq);
6203
6204         /* move a task from busiest_rq to target_rq */
6205         double_lock_balance(busiest_rq, target_rq);
6206
6207         /* Search for an sd spanning us and the target CPU. */
6208         rcu_read_lock();
6209         for_each_domain(target_cpu, sd) {
6210                 if ((sd->flags & SD_LOAD_BALANCE) &&
6211                     cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
6212                                 break;
6213         }
6214
6215         if (likely(sd)) {
6216                 struct lb_env env = {
6217                         .sd             = sd,
6218                         .dst_cpu        = target_cpu,
6219                         .dst_rq         = target_rq,
6220                         .src_cpu        = busiest_rq->cpu,
6221                         .src_rq         = busiest_rq,
6222                         .idle           = CPU_IDLE,
6223                 };
6224
6225                 schedstat_inc(sd, alb_count);
6226
6227                 if (move_one_task(&env))
6228                         schedstat_inc(sd, alb_pushed);
6229                 else
6230                         schedstat_inc(sd, alb_failed);
6231         }
6232         rcu_read_unlock();
6233         double_unlock_balance(busiest_rq, target_rq);
6234 out_unlock:
6235         busiest_rq->active_balance = 0;
6236         raw_spin_unlock_irq(&busiest_rq->lock);
6237         return 0;
6238 }
6239
6240 #ifdef CONFIG_NO_HZ_COMMON
6241 /*
6242  * idle load balancing details
6243  * - When one of the busy CPUs notice that there may be an idle rebalancing
6244  *   needed, they will kick the idle load balancer, which then does idle
6245  *   load balancing for all the idle CPUs.
6246  */
6247 static struct {
6248         cpumask_var_t idle_cpus_mask;
6249         atomic_t nr_cpus;
6250         unsigned long next_balance;     /* in jiffy units */
6251 } nohz ____cacheline_aligned;
6252
6253 #ifdef CONFIG_SCHED_HMP_LITTLE_PACKING
6254 /*
6255  * Decide if the tasks on the busy CPUs in the
6256  * littlest domain would benefit from an idle balance
6257  */
6258 static int hmp_packing_ilb_needed(int cpu)
6259 {
6260         struct hmp_domain *hmp;
6261         /* always allow ilb on non-slowest domain */
6262         if (!hmp_cpu_is_slowest(cpu))
6263                 return 1;
6264
6265         hmp = hmp_cpu_domain(cpu);
6266         for_each_cpu_and(cpu, &hmp->cpus, nohz.idle_cpus_mask) {
6267                 /* only idle balance if a CPU is loaded over threshold */
6268                 if (cpu_rq(cpu)->avg.load_avg_ratio > hmp_full_threshold)
6269                         return 1;
6270         }
6271         return 0;
6272 }
6273 #endif
6274
6275 static inline int find_new_ilb(int call_cpu)
6276 {
6277         int ilb = cpumask_first(nohz.idle_cpus_mask);
6278 #ifdef CONFIG_SCHED_HMP
6279         int ilb_needed = 1;
6280
6281         /* restrict nohz balancing to occur in the same hmp domain */
6282         ilb = cpumask_first_and(nohz.idle_cpus_mask,
6283                         &((struct hmp_domain *)hmp_cpu_domain(call_cpu))->cpus);
6284
6285 #ifdef CONFIG_SCHED_HMP_LITTLE_PACKING
6286         if (ilb < nr_cpu_ids)
6287                 ilb_needed = hmp_packing_ilb_needed(ilb);
6288 #endif
6289
6290         if (ilb_needed && ilb < nr_cpu_ids && idle_cpu(ilb))
6291                 return ilb;
6292 #else
6293         if (ilb < nr_cpu_ids && idle_cpu(ilb))
6294                 return ilb;
6295 #endif
6296
6297         return nr_cpu_ids;
6298 }
6299
6300 /*
6301  * Kick a CPU to do the nohz balancing, if it is time for it. We pick the
6302  * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle
6303  * CPU (if there is one).
6304  */
6305 static void nohz_balancer_kick(int cpu)
6306 {
6307         int ilb_cpu;
6308
6309         nohz.next_balance++;
6310
6311         ilb_cpu = find_new_ilb(cpu);
6312
6313         if (ilb_cpu >= nr_cpu_ids)
6314                 return;
6315
6316         if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu)))
6317                 return;
6318         /*
6319          * Use smp_send_reschedule() instead of resched_cpu().
6320          * This way we generate a sched IPI on the target cpu which
6321          * is idle. And the softirq performing nohz idle load balance
6322          * will be run before returning from the IPI.
6323          */
6324         smp_send_reschedule(ilb_cpu);
6325         return;
6326 }
6327
6328 static inline void nohz_balance_exit_idle(int cpu)
6329 {
6330         if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
6331                 cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
6332                 atomic_dec(&nohz.nr_cpus);
6333                 clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
6334         }
6335 }
6336
6337 static inline void set_cpu_sd_state_busy(void)
6338 {
6339         struct sched_domain *sd;
6340         int cpu = smp_processor_id();
6341
6342         rcu_read_lock();
6343         sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd);
6344
6345         if (!sd || !sd->nohz_idle)
6346                 goto unlock;
6347         sd->nohz_idle = 0;
6348
6349         for (; sd; sd = sd->parent)
6350                 atomic_inc(&sd->groups->sgp->nr_busy_cpus);
6351 unlock:
6352         rcu_read_unlock();
6353 }
6354
6355 void set_cpu_sd_state_idle(void)
6356 {
6357         struct sched_domain *sd;
6358         int cpu = smp_processor_id();
6359
6360         rcu_read_lock();
6361         sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd);
6362
6363         if (!sd || sd->nohz_idle)
6364                 goto unlock;
6365         sd->nohz_idle = 1;
6366
6367         for (; sd; sd = sd->parent)
6368                 atomic_dec(&sd->groups->sgp->nr_busy_cpus);
6369 unlock:
6370         rcu_read_unlock();
6371 }
6372
6373 /*
6374  * This routine will record that the cpu is going idle with tick stopped.
6375  * This info will be used in performing idle load balancing in the future.
6376  */
6377 void nohz_balance_enter_idle(int cpu)
6378 {
6379         /*
6380          * If this cpu is going down, then nothing needs to be done.
6381          */
6382         if (!cpu_active(cpu))
6383                 return;
6384
6385         if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
6386                 return;
6387
6388         cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
6389         atomic_inc(&nohz.nr_cpus);
6390         set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
6391 }
6392
6393 static int __cpuinit sched_ilb_notifier(struct notifier_block *nfb,
6394                                         unsigned long action, void *hcpu)
6395 {
6396         switch (action & ~CPU_TASKS_FROZEN) {
6397         case CPU_DYING:
6398                 nohz_balance_exit_idle(smp_processor_id());
6399                 return NOTIFY_OK;
6400         default:
6401                 return NOTIFY_DONE;
6402         }
6403 }
6404 #endif
6405
6406 static DEFINE_SPINLOCK(balancing);
6407
6408 /*
6409  * Scale the max load_balance interval with the number of CPUs in the system.
6410  * This trades load-balance latency on larger machines for less cross talk.
6411  */
6412 void update_max_interval(void)
6413 {
6414         max_load_balance_interval = HZ*num_online_cpus()/10;
6415 }
6416
6417 /*
6418  * It checks each scheduling domain to see if it is due to be balanced,
6419  * and initiates a balancing operation if so.
6420  *
6421  * Balancing parameters are set up in init_sched_domains.
6422  */
6423 static void rebalance_domains(int cpu, enum cpu_idle_type idle)
6424 {
6425         int balance = 1;
6426         struct rq *rq = cpu_rq(cpu);
6427         unsigned long interval;
6428         struct sched_domain *sd;
6429         /* Earliest time when we have to do rebalance again */
6430         unsigned long next_balance = jiffies + 60*HZ;
6431         int update_next_balance = 0;
6432         int need_serialize;
6433
6434         update_blocked_averages(cpu);
6435
6436         rcu_read_lock();
6437         for_each_domain(cpu, sd) {
6438                 if (!(sd->flags & SD_LOAD_BALANCE))
6439                         continue;
6440
6441                 interval = sd->balance_interval;
6442                 if (idle != CPU_IDLE)
6443                         interval *= sd->busy_factor;
6444
6445                 /* scale ms to jiffies */
6446                 interval = msecs_to_jiffies(interval);
6447                 interval = clamp(interval, 1UL, max_load_balance_interval);
6448
6449                 need_serialize = sd->flags & SD_SERIALIZE;
6450
6451                 if (need_serialize) {
6452                         if (!spin_trylock(&balancing))
6453                                 goto out;
6454                 }
6455
6456                 if (time_after_eq(jiffies, sd->last_balance + interval)) {
6457                         if (load_balance(cpu, rq, sd, idle, &balance)) {
6458                                 /*
6459                                  * The LBF_SOME_PINNED logic could have changed
6460                                  * env->dst_cpu, so we can't know our idle
6461                                  * state even if we migrated tasks. Update it.
6462                                  */
6463                                 idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
6464                         }
6465                         sd->last_balance = jiffies;
6466                 }
6467                 if (need_serialize)
6468                         spin_unlock(&balancing);
6469 out:
6470                 if (time_after(next_balance, sd->last_balance + interval)) {
6471                         next_balance = sd->last_balance + interval;
6472                         update_next_balance = 1;
6473                 }
6474
6475                 /*
6476                  * Stop the load balance at this level. There is another
6477                  * CPU in our sched group which is doing load balancing more
6478                  * actively.
6479                  */
6480                 if (!balance)
6481                         break;
6482         }
6483         rcu_read_unlock();
6484
6485         /*
6486          * next_balance will be updated only when there is a need.
6487          * When the cpu is attached to null domain for ex, it will not be
6488          * updated.
6489          */
6490         if (likely(update_next_balance))
6491                 rq->next_balance = next_balance;
6492 }
6493
6494 #ifdef CONFIG_NO_HZ_COMMON
6495 /*
6496  * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
6497  * rebalancing for all the cpus for whom scheduler ticks are stopped.
6498  */
6499 static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
6500 {
6501         struct rq *this_rq = cpu_rq(this_cpu);
6502         struct rq *rq;
6503         int balance_cpu;
6504
6505         if (idle != CPU_IDLE ||
6506             !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)))
6507                 goto end;
6508
6509         for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
6510                 if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
6511                         continue;
6512
6513                 /*
6514                  * If this cpu gets work to do, stop the load balancing
6515                  * work being done for other cpus. Next load
6516                  * balancing owner will pick it up.
6517                  */
6518                 if (need_resched())
6519                         break;
6520
6521                 rq = cpu_rq(balance_cpu);
6522
6523                 raw_spin_lock_irq(&rq->lock);
6524                 update_rq_clock(rq);
6525                 update_idle_cpu_load(rq);
6526                 raw_spin_unlock_irq(&rq->lock);
6527
6528                 rebalance_domains(balance_cpu, CPU_IDLE);
6529
6530                 if (time_after(this_rq->next_balance, rq->next_balance))
6531                         this_rq->next_balance = rq->next_balance;
6532         }
6533         nohz.next_balance = this_rq->next_balance;
6534 end:
6535         clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));
6536 }
6537
6538 /*
6539  * Current heuristic for kicking the idle load balancer in the presence
6540  * of an idle cpu is the system.
6541  *   - This rq has more than one task.
6542  *   - At any scheduler domain level, this cpu's scheduler group has multiple
6543  *     busy cpu's exceeding the group's power.
6544  *   - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
6545  *     domain span are idle.
6546  */
6547 static inline int nohz_kick_needed(struct rq *rq, int cpu)
6548 {
6549         unsigned long now = jiffies;
6550         struct sched_domain *sd;
6551
6552         if (unlikely(idle_cpu(cpu)))
6553                 return 0;
6554
6555        /*
6556         * We may be recently in ticked or tickless idle mode. At the first
6557         * busy tick after returning from idle, we will update the busy stats.
6558         */
6559         set_cpu_sd_state_busy();
6560         nohz_balance_exit_idle(cpu);
6561
6562         /*
6563          * None are in tickless mode and hence no need for NOHZ idle load
6564          * balancing.
6565          */
6566         if (likely(!atomic_read(&nohz.nr_cpus)))
6567                 return 0;
6568
6569         if (time_before(now, nohz.next_balance))
6570                 return 0;
6571
6572 #ifdef CONFIG_SCHED_HMP
6573         /*
6574          * Bail out if there are no nohz CPUs in our
6575          * HMP domain, since we will move tasks between
6576          * domains through wakeup and force balancing
6577          * as necessary based upon task load.
6578          */
6579         if (cpumask_first_and(nohz.idle_cpus_mask,
6580                         &((struct hmp_domain *)hmp_cpu_domain(cpu))->cpus) >= nr_cpu_ids)
6581                 return 0;
6582 #endif
6583
6584         if (rq->nr_running >= 2)
6585                 goto need_kick;
6586
6587         rcu_read_lock();
6588         for_each_domain(cpu, sd) {
6589                 struct sched_group *sg = sd->groups;
6590                 struct sched_group_power *sgp = sg->sgp;
6591                 int nr_busy = atomic_read(&sgp->nr_busy_cpus);
6592
6593                 if (sd->flags & SD_SHARE_PKG_RESOURCES && nr_busy > 1)
6594                         goto need_kick_unlock;
6595
6596                 if (sd->flags & SD_ASYM_PACKING && nr_busy != sg->group_weight
6597                     && (cpumask_first_and(nohz.idle_cpus_mask,
6598                                           sched_domain_span(sd)) < cpu))
6599                         goto need_kick_unlock;
6600
6601                 if (!(sd->flags & (SD_SHARE_PKG_RESOURCES | SD_ASYM_PACKING)))
6602                         break;
6603         }
6604         rcu_read_unlock();
6605         return 0;
6606
6607 need_kick_unlock:
6608         rcu_read_unlock();
6609 need_kick:
6610         return 1;
6611 }
6612 #else
6613 static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { }
6614 #endif
6615
6616 #ifdef CONFIG_SCHED_HMP
6617 /* Check if task should migrate to a faster cpu */
6618 static unsigned int hmp_up_migration(int cpu, int *target_cpu, struct sched_entity *se)
6619 {
6620         struct task_struct *p = task_of(se);
6621         int temp_target_cpu;
6622         u64 now;
6623
6624         if (hmp_cpu_is_fastest(cpu))
6625                 return 0;
6626
6627 #ifdef CONFIG_SCHED_HMP_PRIO_FILTER
6628         /* Filter by task priority */
6629         if (p->prio >= hmp_up_prio)
6630                 return 0;
6631 #endif
6632         if (se->avg.load_avg_ratio < hmp_up_threshold)
6633                 return 0;
6634
6635         /* Let the task load settle before doing another up migration */
6636         /* hack - always use clock from first online CPU */
6637         now = cpu_rq(cpumask_first(cpu_online_mask))->clock_task;
6638         if (((now - se->avg.hmp_last_up_migration) >> 10)
6639                                         < hmp_next_up_threshold)
6640                 return 0;
6641
6642         /* hmp_domain_min_load only returns 0 for an
6643          * idle CPU or 1023 for any partly-busy one.
6644          * Be explicit about requirement for an idle CPU.
6645          */
6646         if (hmp_domain_min_load(hmp_faster_domain(cpu), &temp_target_cpu,
6647                         tsk_cpus_allowed(p)) == 0 && temp_target_cpu != NR_CPUS) {
6648                 if(target_cpu)
6649                         *target_cpu = temp_target_cpu;
6650                 return 1;
6651         }
6652         return 0;
6653 }
6654
6655 /* Check if task should migrate to a slower cpu */
6656 static unsigned int hmp_down_migration(int cpu, struct sched_entity *se)
6657 {
6658         struct task_struct *p = task_of(se);
6659         u64 now;
6660
6661         if (hmp_cpu_is_slowest(cpu)) {
6662 #ifdef CONFIG_SCHED_HMP_LITTLE_PACKING
6663                 if(hmp_packing_enabled)
6664                         return 1;
6665                 else
6666 #endif
6667                 return 0;
6668         }
6669
6670 #ifdef CONFIG_SCHED_HMP_PRIO_FILTER
6671         /* Filter by task priority */
6672         if ((p->prio >= hmp_up_prio) &&
6673                 cpumask_intersects(&hmp_slower_domain(cpu)->cpus,
6674                                         tsk_cpus_allowed(p))) {
6675                 return 1;
6676         }
6677 #endif
6678
6679         /* Let the task load settle before doing another down migration */
6680         /* hack - always use clock from first online CPU */
6681         now = cpu_rq(cpumask_first(cpu_online_mask))->clock_task;
6682         if (((now - se->avg.hmp_last_down_migration) >> 10)
6683                                         < hmp_next_down_threshold)
6684                 return 0;
6685
6686         if (cpumask_intersects(&hmp_slower_domain(cpu)->cpus,
6687                                         tsk_cpus_allowed(p))
6688                 && se->avg.load_avg_ratio < hmp_down_threshold) {
6689                 return 1;
6690         }
6691         return 0;
6692 }
6693
6694 /*
6695  * hmp_can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
6696  * Ideally this function should be merged with can_migrate_task() to avoid
6697  * redundant code.
6698  */
6699 static int hmp_can_migrate_task(struct task_struct *p, struct lb_env *env)
6700 {
6701         int tsk_cache_hot = 0;
6702
6703         /*
6704          * We do not migrate tasks that are:
6705          * 1) running (obviously), or
6706          * 2) cannot be migrated to this CPU due to cpus_allowed
6707          */
6708         if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
6709                 schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
6710                 return 0;
6711         }
6712         env->flags &= ~LBF_ALL_PINNED;
6713
6714         if (task_running(env->src_rq, p)) {
6715                 schedstat_inc(p, se.statistics.nr_failed_migrations_running);
6716                 return 0;
6717         }
6718
6719         /*
6720          * Aggressive migration if:
6721          * 1) task is cache cold, or
6722          * 2) too many balance attempts have failed.
6723          */
6724
6725         tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd);
6726         if (!tsk_cache_hot ||
6727                 env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
6728 #ifdef CONFIG_SCHEDSTATS
6729                 if (tsk_cache_hot) {
6730                         schedstat_inc(env->sd, lb_hot_gained[env->idle]);
6731                         schedstat_inc(p, se.statistics.nr_forced_migrations);
6732                 }
6733 #endif
6734                 return 1;
6735         }
6736
6737         return 1;
6738 }
6739
6740 /*
6741  * move_specific_task tries to move a specific task.
6742  * Returns 1 if successful and 0 otherwise.
6743  * Called with both runqueues locked.
6744  */
6745 static int move_specific_task(struct lb_env *env, struct task_struct *pm)
6746 {
6747         struct task_struct *p, *n;
6748
6749         list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
6750         if (throttled_lb_pair(task_group(p), env->src_rq->cpu,
6751                                 env->dst_cpu))
6752                 continue;
6753
6754                 if (!hmp_can_migrate_task(p, env))
6755                         continue;
6756                 /* Check if we found the right task */
6757                 if (p != pm)
6758                         continue;
6759
6760                 move_task(p, env);
6761                 /*
6762                  * Right now, this is only the third place move_task()
6763                  * is called, so we can safely collect move_task()
6764                  * stats here rather than inside move_task().
6765                  */
6766                 schedstat_inc(env->sd, lb_gained[env->idle]);
6767                 return 1;
6768         }
6769         return 0;
6770 }
6771
6772 /*
6773  * hmp_active_task_migration_cpu_stop is run by cpu stopper and used to
6774  * migrate a specific task from one runqueue to another.
6775  * hmp_force_up_migration uses this to push a currently running task
6776  * off a runqueue.
6777  * Based on active_load_balance_stop_cpu and can potentially be merged.
6778  */
6779 static int hmp_active_task_migration_cpu_stop(void *data)
6780 {
6781         struct rq *busiest_rq = data;
6782         struct task_struct *p = busiest_rq->migrate_task;
6783         int busiest_cpu = cpu_of(busiest_rq);
6784         int target_cpu = busiest_rq->push_cpu;
6785         struct rq *target_rq = cpu_rq(target_cpu);
6786         struct sched_domain *sd;
6787
6788         raw_spin_lock_irq(&busiest_rq->lock);
6789         /* make sure the requested cpu hasn't gone down in the meantime */
6790         if (unlikely(busiest_cpu != smp_processor_id() ||
6791                 !busiest_rq->active_balance)) {
6792                 goto out_unlock;
6793         }
6794         /* Is there any task to move? */
6795         if (busiest_rq->nr_running <= 1)
6796                 goto out_unlock;
6797         /* Task has migrated meanwhile, abort forced migration */
6798         if (task_rq(p) != busiest_rq)
6799                 goto out_unlock;
6800         /*
6801          * This condition is "impossible", if it occurs
6802          * we need to fix it. Originally reported by
6803          * Bjorn Helgaas on a 128-cpu setup.
6804          */
6805         BUG_ON(busiest_rq == target_rq);
6806
6807         /* move a task from busiest_rq to target_rq */
6808         double_lock_balance(busiest_rq, target_rq);
6809
6810         /* Search for an sd spanning us and the target CPU. */
6811         rcu_read_lock();
6812         for_each_domain(target_cpu, sd) {
6813                 if (cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
6814                         break;
6815         }
6816
6817         if (likely(sd)) {
6818                 struct lb_env env = {
6819                         .sd             = sd,
6820                         .dst_cpu        = target_cpu,
6821                         .dst_rq         = target_rq,
6822                         .src_cpu        = busiest_rq->cpu,
6823                         .src_rq         = busiest_rq,
6824                         .idle           = CPU_IDLE,
6825                 };
6826
6827                 schedstat_inc(sd, alb_count);
6828
6829                 if (move_specific_task(&env, p))
6830                         schedstat_inc(sd, alb_pushed);
6831                 else
6832                         schedstat_inc(sd, alb_failed);
6833         }
6834         rcu_read_unlock();
6835         double_unlock_balance(busiest_rq, target_rq);
6836 out_unlock:
6837         put_task_struct(p);
6838         busiest_rq->active_balance = 0;
6839         raw_spin_unlock_irq(&busiest_rq->lock);
6840         return 0;
6841 }
6842
6843 /*
6844  * hmp_idle_pull_cpu_stop is run by cpu stopper and used to
6845  * migrate a specific task from one runqueue to another.
6846  * hmp_idle_pull uses this to push a currently running task
6847  * off a runqueue to a faster CPU.
6848  * Locking is slightly different than usual.
6849  * Based on active_load_balance_stop_cpu and can potentially be merged.
6850  */
6851 static int hmp_idle_pull_cpu_stop(void *data)
6852 {
6853         struct rq *busiest_rq = data;
6854         struct task_struct *p = busiest_rq->migrate_task;
6855         int busiest_cpu = cpu_of(busiest_rq);
6856         int target_cpu = busiest_rq->push_cpu;
6857         struct rq *target_rq = cpu_rq(target_cpu);
6858         struct sched_domain *sd;
6859
6860         raw_spin_lock_irq(&busiest_rq->lock);
6861
6862         /* make sure the requested cpu hasn't gone down in the meantime */
6863         if (unlikely(busiest_cpu != smp_processor_id() ||
6864                 !busiest_rq->active_balance))
6865                 goto out_unlock;
6866
6867         /* Is there any task to move? */
6868         if (busiest_rq->nr_running <= 1)
6869                 goto out_unlock;
6870
6871         /* Task has migrated meanwhile, abort forced migration */
6872         if (task_rq(p) != busiest_rq)
6873                 goto out_unlock;
6874
6875         /*
6876          * This condition is "impossible", if it occurs
6877          * we need to fix it. Originally reported by
6878          * Bjorn Helgaas on a 128-cpu setup.
6879          */
6880         BUG_ON(busiest_rq == target_rq);
6881
6882         /* move a task from busiest_rq to target_rq */
6883         double_lock_balance(busiest_rq, target_rq);
6884
6885         /* Search for an sd spanning us and the target CPU. */
6886         rcu_read_lock();
6887         for_each_domain(target_cpu, sd) {
6888                 if (cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
6889                         break;
6890         }
6891         if (likely(sd)) {
6892                 struct lb_env env = {
6893                         .sd             = sd,
6894                         .dst_cpu        = target_cpu,
6895                         .dst_rq         = target_rq,
6896                         .src_cpu        = busiest_rq->cpu,
6897                         .src_rq         = busiest_rq,
6898                         .idle           = CPU_IDLE,
6899                 };
6900
6901                 schedstat_inc(sd, alb_count);
6902
6903                 if (move_specific_task(&env, p))
6904                         schedstat_inc(sd, alb_pushed);
6905                 else
6906                         schedstat_inc(sd, alb_failed);
6907         }
6908         rcu_read_unlock();
6909         double_unlock_balance(busiest_rq, target_rq);
6910 out_unlock:
6911         put_task_struct(p);
6912         busiest_rq->active_balance = 0;
6913         raw_spin_unlock_irq(&busiest_rq->lock);
6914         return 0;
6915 }
6916
6917 static DEFINE_SPINLOCK(hmp_force_migration);
6918
6919 /*
6920  * hmp_force_up_migration checks runqueues for tasks that need to
6921  * be actively migrated to a faster cpu.
6922  */
6923 static void hmp_force_up_migration(int this_cpu)
6924 {
6925         int cpu, target_cpu;
6926         struct sched_entity *curr, *orig;
6927         struct rq *target;
6928         unsigned long flags;
6929         unsigned int force;
6930         struct task_struct *p;
6931
6932         if (!spin_trylock(&hmp_force_migration))
6933                 return;
6934         for_each_online_cpu(cpu) {
6935                 force = 0;
6936                 target = cpu_rq(cpu);
6937                 raw_spin_lock_irqsave(&target->lock, flags);
6938                 curr = target->cfs.curr;
6939                 if (!curr) {
6940                         raw_spin_unlock_irqrestore(&target->lock, flags);
6941                         continue;
6942                 }
6943                 if (!entity_is_task(curr)) {
6944                         struct cfs_rq *cfs_rq;
6945
6946                         cfs_rq = group_cfs_rq(curr);
6947                         while (cfs_rq) {
6948                                 curr = cfs_rq->curr;
6949                                 cfs_rq = group_cfs_rq(curr);
6950                         }
6951                 }
6952                 orig = curr;
6953                 curr = hmp_get_heaviest_task(curr, 1);
6954                 p = task_of(curr);
6955                 if (hmp_up_migration(cpu, &target_cpu, curr)) {
6956                         if (!target->active_balance) {
6957                                 get_task_struct(p);
6958                                 target->active_balance = 1;
6959                                 target->push_cpu = target_cpu;
6960                                 target->migrate_task = p;
6961                                 force = 1;
6962                                 trace_sched_hmp_migrate(p, target->push_cpu, HMP_MIGRATE_FORCE);
6963                                 hmp_next_up_delay(&p->se, target->push_cpu);
6964                         }
6965                 }
6966                 if (!force && !target->active_balance) {
6967                         /*
6968                          * For now we just check the currently running task.
6969                          * Selecting the lightest task for offloading will
6970                          * require extensive book keeping.
6971                          */
6972                         curr = hmp_get_lightest_task(orig, 1);
6973                         p = task_of(curr);
6974                         target->push_cpu = hmp_offload_down(cpu, curr);
6975                         if (target->push_cpu < NR_CPUS) {
6976                                 get_task_struct(p);
6977                                 target->active_balance = 1;
6978                                 target->migrate_task = p;
6979                                 force = 1;
6980                                 trace_sched_hmp_migrate(p, target->push_cpu, HMP_MIGRATE_OFFLOAD);
6981                                 hmp_next_down_delay(&p->se, target->push_cpu);
6982                         }
6983                 }
6984                 raw_spin_unlock_irqrestore(&target->lock, flags);
6985                 if (force)
6986                         stop_one_cpu_nowait(cpu_of(target),
6987                                 hmp_active_task_migration_cpu_stop,
6988                                 target, &target->active_balance_work);
6989         }
6990         spin_unlock(&hmp_force_migration);
6991 }
6992 /*
6993  * hmp_idle_pull looks at little domain runqueues to see
6994  * if a task should be pulled.
6995  *
6996  * Reuses hmp_force_migration spinlock.
6997  *
6998  */
6999 static unsigned int hmp_idle_pull(int this_cpu)
7000 {
7001         int cpu;
7002         struct sched_entity *curr, *orig;
7003         struct hmp_domain *hmp_domain = NULL;
7004         struct rq *target, *rq;
7005         unsigned long flags, ratio = 0;
7006         unsigned int force = 0;
7007         struct task_struct *p = NULL;
7008
7009         if (!hmp_cpu_is_slowest(this_cpu))
7010                 hmp_domain = hmp_slower_domain(this_cpu);
7011         if (!hmp_domain)
7012                 return 0;
7013
7014         if (!spin_trylock(&hmp_force_migration))
7015                 return 0;
7016
7017         /* first select a task */
7018         for_each_cpu(cpu, &hmp_domain->cpus) {
7019                 rq = cpu_rq(cpu);
7020                 raw_spin_lock_irqsave(&rq->lock, flags);
7021                 curr = rq->cfs.curr;
7022                 if (!curr) {
7023                         raw_spin_unlock_irqrestore(&rq->lock, flags);
7024                         continue;
7025                 }
7026                 if (!entity_is_task(curr)) {
7027                         struct cfs_rq *cfs_rq;
7028
7029                         cfs_rq = group_cfs_rq(curr);
7030                         while (cfs_rq) {
7031                                 curr = cfs_rq->curr;
7032                                 if (!entity_is_task(curr))
7033                                         cfs_rq = group_cfs_rq(curr);
7034                                 else
7035                                         cfs_rq = NULL;
7036                         }
7037                 }
7038                 orig = curr;
7039                 curr = hmp_get_heaviest_task(curr, 1);
7040                 if (curr->avg.load_avg_ratio > hmp_up_threshold &&
7041                         curr->avg.load_avg_ratio > ratio) {
7042                         p = task_of(curr);
7043                         target = rq;
7044                         ratio = curr->avg.load_avg_ratio;
7045                 }
7046                 raw_spin_unlock_irqrestore(&rq->lock, flags);
7047         }
7048
7049         if (!p)
7050                 goto done;
7051
7052         /* now we have a candidate */
7053         raw_spin_lock_irqsave(&target->lock, flags);
7054         if (!target->active_balance && task_rq(p) == target) {
7055                 get_task_struct(p);
7056                 target->active_balance = 1;
7057                 target->push_cpu = this_cpu;
7058                 target->migrate_task = p;
7059                 force = 1;
7060                 trace_sched_hmp_migrate(p, target->push_cpu, HMP_MIGRATE_IDLE_PULL);
7061                 hmp_next_up_delay(&p->se, target->push_cpu);
7062         }
7063         raw_spin_unlock_irqrestore(&target->lock, flags);
7064         if (force) {
7065                 stop_one_cpu_nowait(cpu_of(target),
7066                         hmp_idle_pull_cpu_stop,
7067                         target, &target->active_balance_work);
7068         }
7069 done:
7070         spin_unlock(&hmp_force_migration);
7071         return force;
7072 }
7073 #else
7074 static void hmp_force_up_migration(int this_cpu) { }
7075 #endif /* CONFIG_SCHED_HMP */
7076
7077 /*
7078  * run_rebalance_domains is triggered when needed from the scheduler tick.
7079  * Also triggered for nohz idle balancing (with nohz_balancing_kick set).
7080  */
7081 static void run_rebalance_domains(struct softirq_action *h)
7082 {
7083         int this_cpu = smp_processor_id();
7084         struct rq *this_rq = cpu_rq(this_cpu);
7085         enum cpu_idle_type idle = this_rq->idle_balance ?
7086                                                 CPU_IDLE : CPU_NOT_IDLE;
7087
7088         hmp_force_up_migration(this_cpu);
7089
7090         rebalance_domains(this_cpu, idle);
7091
7092         /*
7093          * If this cpu has a pending nohz_balance_kick, then do the
7094          * balancing on behalf of the other idle cpus whose ticks are
7095          * stopped.
7096          */
7097         nohz_idle_balance(this_cpu, idle);
7098 }
7099
7100 static inline int on_null_domain(int cpu)
7101 {
7102         return !rcu_dereference_sched(cpu_rq(cpu)->sd);
7103 }
7104
7105 /*
7106  * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
7107  */
7108 void trigger_load_balance(struct rq *rq, int cpu)
7109 {
7110         /* Don't need to rebalance while attached to NULL domain */
7111         if (time_after_eq(jiffies, rq->next_balance) &&
7112             likely(!on_null_domain(cpu)))
7113                 raise_softirq(SCHED_SOFTIRQ);
7114 #ifdef CONFIG_NO_HZ_COMMON
7115         if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu)))
7116                 nohz_balancer_kick(cpu);
7117 #endif
7118 }
7119
7120 static void rq_online_fair(struct rq *rq)
7121 {
7122 #ifdef CONFIG_SCHED_HMP
7123         hmp_online_cpu(rq->cpu);
7124 #endif
7125         update_sysctl();
7126 }
7127
7128 static void rq_offline_fair(struct rq *rq)
7129 {
7130 #ifdef CONFIG_SCHED_HMP
7131         hmp_offline_cpu(rq->cpu);
7132 #endif
7133         update_sysctl();
7134
7135         /* Ensure any throttled groups are reachable by pick_next_task */
7136         unthrottle_offline_cfs_rqs(rq);
7137 }
7138
7139 #endif /* CONFIG_SMP */
7140
7141 /*
7142  * scheduler tick hitting a task of our scheduling class:
7143  */
7144 static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
7145 {
7146         struct cfs_rq *cfs_rq;
7147         struct sched_entity *se = &curr->se;
7148
7149         for_each_sched_entity(se) {
7150                 cfs_rq = cfs_rq_of(se);
7151                 entity_tick(cfs_rq, se, queued);
7152         }
7153
7154         if (sched_feat_numa(NUMA))
7155                 task_tick_numa(rq, curr);
7156
7157         update_rq_runnable_avg(rq, 1);
7158 }
7159
7160 /*
7161  * called on fork with the child task as argument from the parent's context
7162  *  - child not yet on the tasklist
7163  *  - preemption disabled
7164  */
7165 static void task_fork_fair(struct task_struct *p)
7166 {
7167         struct cfs_rq *cfs_rq;
7168         struct sched_entity *se = &p->se, *curr;
7169         int this_cpu = smp_processor_id();
7170         struct rq *rq = this_rq();
7171         unsigned long flags;
7172
7173         raw_spin_lock_irqsave(&rq->lock, flags);
7174
7175         update_rq_clock(rq);
7176
7177         cfs_rq = task_cfs_rq(current);
7178         curr = cfs_rq->curr;
7179
7180         if (unlikely(task_cpu(p) != this_cpu)) {
7181                 rcu_read_lock();
7182                 __set_task_cpu(p, this_cpu);
7183                 rcu_read_unlock();
7184         }
7185
7186         update_curr(cfs_rq);
7187
7188         if (curr)
7189                 se->vruntime = curr->vruntime;
7190         place_entity(cfs_rq, se, 1);
7191
7192         if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
7193                 /*
7194                  * Upon rescheduling, sched_class::put_prev_task() will place
7195                  * 'current' within the tree based on its new key value.
7196                  */
7197                 swap(curr->vruntime, se->vruntime);
7198                 resched_task(rq->curr);
7199         }
7200
7201         se->vruntime -= cfs_rq->min_vruntime;
7202
7203         raw_spin_unlock_irqrestore(&rq->lock, flags);
7204 }
7205
7206 /*
7207  * Priority of the task has changed. Check to see if we preempt
7208  * the current task.
7209  */
7210 static void
7211 prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
7212 {
7213         if (!p->se.on_rq)
7214                 return;
7215
7216         /*
7217          * Reschedule if we are currently running on this runqueue and
7218          * our priority decreased, or if we are not currently running on
7219          * this runqueue and our priority is higher than the current's
7220          */
7221         if (rq->curr == p) {
7222                 if (p->prio > oldprio)
7223                         resched_task(rq->curr);
7224         } else
7225                 check_preempt_curr(rq, p, 0);
7226 }
7227
7228 static void switched_from_fair(struct rq *rq, struct task_struct *p)
7229 {
7230         struct sched_entity *se = &p->se;
7231         struct cfs_rq *cfs_rq = cfs_rq_of(se);
7232
7233         /*
7234          * Ensure the task's vruntime is normalized, so that when its
7235          * switched back to the fair class the enqueue_entity(.flags=0) will
7236          * do the right thing.
7237          *
7238          * If it was on_rq, then the dequeue_entity(.flags=0) will already
7239          * have normalized the vruntime, if it was !on_rq, then only when
7240          * the task is sleeping will it still have non-normalized vruntime.
7241          */
7242         if (!se->on_rq && p->state != TASK_RUNNING) {
7243                 /*
7244                  * Fix up our vruntime so that the current sleep doesn't
7245                  * cause 'unlimited' sleep bonus.
7246                  */
7247                 place_entity(cfs_rq, se, 0);
7248                 se->vruntime -= cfs_rq->min_vruntime;
7249         }
7250
7251 #if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
7252         /*
7253         * Remove our load from contribution when we leave sched_fair
7254         * and ensure we don't carry in an old decay_count if we
7255         * switch back.
7256         */
7257         if (p->se.avg.decay_count) {
7258                 struct cfs_rq *cfs_rq = cfs_rq_of(&p->se);
7259                 __synchronize_entity_decay(&p->se);
7260                 subtract_blocked_load_contrib(cfs_rq,
7261                                 p->se.avg.load_avg_contrib);
7262         }
7263 #endif
7264 }
7265
7266 /*
7267  * We switched to the sched_fair class.
7268  */
7269 static void switched_to_fair(struct rq *rq, struct task_struct *p)
7270 {
7271         if (!p->se.on_rq)
7272                 return;
7273
7274         /*
7275          * We were most likely switched from sched_rt, so
7276          * kick off the schedule if running, otherwise just see
7277          * if we can still preempt the current task.
7278          */
7279         if (rq->curr == p)
7280                 resched_task(rq->curr);
7281         else
7282                 check_preempt_curr(rq, p, 0);
7283 }
7284
7285 /* Account for a task changing its policy or group.
7286  *
7287  * This routine is mostly called to set cfs_rq->curr field when a task
7288  * migrates between groups/classes.
7289  */
7290 static void set_curr_task_fair(struct rq *rq)
7291 {
7292         struct sched_entity *se = &rq->curr->se;
7293
7294         for_each_sched_entity(se) {
7295                 struct cfs_rq *cfs_rq = cfs_rq_of(se);
7296
7297                 set_next_entity(cfs_rq, se);
7298                 /* ensure bandwidth has been allocated on our new cfs_rq */
7299                 account_cfs_rq_runtime(cfs_rq, 0);
7300         }
7301 }
7302
7303 void init_cfs_rq(struct cfs_rq *cfs_rq)
7304 {
7305         cfs_rq->tasks_timeline = RB_ROOT;
7306         cfs_rq->min_vruntime = (u64)(-(1LL << 20));
7307 #ifndef CONFIG_64BIT
7308         cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
7309 #endif
7310 #if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
7311         atomic64_set(&cfs_rq->decay_counter, 1);
7312         atomic64_set(&cfs_rq->removed_load, 0);
7313 #endif
7314 }
7315
7316 #ifdef CONFIG_FAIR_GROUP_SCHED
7317 static void task_move_group_fair(struct task_struct *p, int on_rq)
7318 {
7319         struct cfs_rq *cfs_rq;
7320         /*
7321          * If the task was not on the rq at the time of this cgroup movement
7322          * it must have been asleep, sleeping tasks keep their ->vruntime
7323          * absolute on their old rq until wakeup (needed for the fair sleeper
7324          * bonus in place_entity()).
7325          *
7326          * If it was on the rq, we've just 'preempted' it, which does convert
7327          * ->vruntime to a relative base.
7328          *
7329          * Make sure both cases convert their relative position when migrating
7330          * to another cgroup's rq. This does somewhat interfere with the
7331          * fair sleeper stuff for the first placement, but who cares.
7332          */
7333         /*
7334          * When !on_rq, vruntime of the task has usually NOT been normalized.
7335          * But there are some cases where it has already been normalized:
7336          *
7337          * - Moving a forked child which is waiting for being woken up by
7338          *   wake_up_new_task().
7339          * - Moving a task which has been woken up by try_to_wake_up() and
7340          *   waiting for actually being woken up by sched_ttwu_pending().
7341          *
7342          * To prevent boost or penalty in the new cfs_rq caused by delta
7343          * min_vruntime between the two cfs_rqs, we skip vruntime adjustment.
7344          */
7345         if (!on_rq && (!p->se.sum_exec_runtime || p->state == TASK_WAKING))
7346                 on_rq = 1;
7347
7348         if (!on_rq)
7349                 p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime;
7350         set_task_rq(p, task_cpu(p));
7351         if (!on_rq) {
7352                 cfs_rq = cfs_rq_of(&p->se);
7353                 p->se.vruntime += cfs_rq->min_vruntime;
7354 #ifdef CONFIG_SMP
7355                 /*
7356                  * migrate_task_rq_fair() will have removed our previous
7357                  * contribution, but we must synchronize for ongoing future
7358                  * decay.
7359                  */
7360                 p->se.avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
7361                 cfs_rq->blocked_load_avg += p->se.avg.load_avg_contrib;
7362 #endif
7363         }
7364 }
7365
7366 void free_fair_sched_group(struct task_group *tg)
7367 {
7368         int i;
7369
7370         destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
7371
7372         for_each_possible_cpu(i) {
7373                 if (tg->cfs_rq)
7374                         kfree(tg->cfs_rq[i]);
7375                 if (tg->se)
7376                         kfree(tg->se[i]);
7377         }
7378
7379         kfree(tg->cfs_rq);
7380         kfree(tg->se);
7381 }
7382
7383 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
7384 {
7385         struct cfs_rq *cfs_rq;
7386         struct sched_entity *se;
7387         int i;
7388
7389         tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
7390         if (!tg->cfs_rq)
7391                 goto err;
7392         tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);
7393         if (!tg->se)
7394                 goto err;
7395
7396         tg->shares = NICE_0_LOAD;
7397
7398         init_cfs_bandwidth(tg_cfs_bandwidth(tg));
7399
7400         for_each_possible_cpu(i) {
7401                 cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
7402                                       GFP_KERNEL, cpu_to_node(i));
7403                 if (!cfs_rq)
7404                         goto err;
7405
7406                 se = kzalloc_node(sizeof(struct sched_entity),
7407                                   GFP_KERNEL, cpu_to_node(i));
7408                 if (!se)
7409                         goto err_free_rq;
7410
7411                 init_cfs_rq(cfs_rq);
7412                 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
7413         }
7414
7415         return 1;
7416
7417 err_free_rq:
7418         kfree(cfs_rq);
7419 err:
7420         return 0;
7421 }
7422
7423 void unregister_fair_sched_group(struct task_group *tg, int cpu)
7424 {
7425         struct rq *rq = cpu_rq(cpu);
7426         unsigned long flags;
7427
7428         /*
7429         * Only empty task groups can be destroyed; so we can speculatively
7430         * check on_list without danger of it being re-added.
7431         */
7432         if (!tg->cfs_rq[cpu]->on_list)
7433                 return;
7434
7435         raw_spin_lock_irqsave(&rq->lock, flags);
7436         list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
7437         raw_spin_unlock_irqrestore(&rq->lock, flags);
7438 }
7439
7440 void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
7441                         struct sched_entity *se, int cpu,
7442                         struct sched_entity *parent)
7443 {
7444         struct rq *rq = cpu_rq(cpu);
7445
7446         cfs_rq->tg = tg;
7447         cfs_rq->rq = rq;
7448         init_cfs_rq_runtime(cfs_rq);
7449
7450         tg->cfs_rq[cpu] = cfs_rq;
7451         tg->se[cpu] = se;
7452
7453         /* se could be NULL for root_task_group */
7454         if (!se)
7455                 return;
7456
7457         if (!parent)
7458                 se->cfs_rq = &rq->cfs;
7459         else
7460                 se->cfs_rq = parent->my_q;
7461
7462         se->my_q = cfs_rq;
7463         update_load_set(&se->load, 0);
7464         se->parent = parent;
7465 }
7466
7467 static DEFINE_MUTEX(shares_mutex);
7468
7469 int sched_group_set_shares(struct task_group *tg, unsigned long shares)
7470 {
7471         int i;
7472         unsigned long flags;
7473
7474         /*
7475          * We can't change the weight of the root cgroup.
7476          */
7477         if (!tg->se[0])
7478                 return -EINVAL;
7479
7480         shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
7481
7482         mutex_lock(&shares_mutex);
7483         if (tg->shares == shares)
7484                 goto done;
7485
7486         tg->shares = shares;
7487         for_each_possible_cpu(i) {
7488                 struct rq *rq = cpu_rq(i);
7489                 struct sched_entity *se;
7490
7491                 se = tg->se[i];
7492                 /* Propagate contribution to hierarchy */
7493                 raw_spin_lock_irqsave(&rq->lock, flags);
7494                 for_each_sched_entity(se)
7495                         update_cfs_shares(group_cfs_rq(se));
7496                 raw_spin_unlock_irqrestore(&rq->lock, flags);
7497         }
7498
7499 done:
7500         mutex_unlock(&shares_mutex);
7501         return 0;
7502 }
7503 #else /* CONFIG_FAIR_GROUP_SCHED */
7504
7505 void free_fair_sched_group(struct task_group *tg) { }
7506
7507 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
7508 {
7509         return 1;
7510 }
7511
7512 void unregister_fair_sched_group(struct task_group *tg, int cpu) { }
7513
7514 #endif /* CONFIG_FAIR_GROUP_SCHED */
7515
7516
7517 static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
7518 {
7519         struct sched_entity *se = &task->se;
7520         unsigned int rr_interval = 0;
7521
7522         /*
7523          * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise
7524          * idle runqueue:
7525          */
7526         if (rq->cfs.load.weight)
7527                 rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se));
7528
7529         return rr_interval;
7530 }
7531
7532 /*
7533  * All the scheduling class methods:
7534  */
7535 const struct sched_class fair_sched_class = {
7536         .next                   = &idle_sched_class,
7537         .enqueue_task           = enqueue_task_fair,
7538         .dequeue_task           = dequeue_task_fair,
7539         .yield_task             = yield_task_fair,
7540         .yield_to_task          = yield_to_task_fair,
7541
7542         .check_preempt_curr     = check_preempt_wakeup,
7543
7544         .pick_next_task         = pick_next_task_fair,
7545         .put_prev_task          = put_prev_task_fair,
7546
7547 #ifdef CONFIG_SMP
7548         .select_task_rq         = select_task_rq_fair,
7549 #ifdef CONFIG_FAIR_GROUP_SCHED
7550         .migrate_task_rq        = migrate_task_rq_fair,
7551 #endif
7552         .rq_online              = rq_online_fair,
7553         .rq_offline             = rq_offline_fair,
7554
7555         .task_waking            = task_waking_fair,
7556 #endif
7557
7558         .set_curr_task          = set_curr_task_fair,
7559         .task_tick              = task_tick_fair,
7560         .task_fork              = task_fork_fair,
7561
7562         .prio_changed           = prio_changed_fair,
7563         .switched_from          = switched_from_fair,
7564         .switched_to            = switched_to_fair,
7565
7566         .get_rr_interval        = get_rr_interval_fair,
7567
7568 #ifdef CONFIG_FAIR_GROUP_SCHED
7569         .task_move_group        = task_move_group_fair,
7570 #endif
7571 };
7572
7573 #ifdef CONFIG_SCHED_DEBUG
7574 void print_cfs_stats(struct seq_file *m, int cpu)
7575 {
7576         struct cfs_rq *cfs_rq;
7577
7578         rcu_read_lock();
7579         for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
7580                 print_cfs_rq(m, cpu, cfs_rq);
7581         rcu_read_unlock();
7582 }
7583 #endif
7584
7585 __init void init_sched_fair_class(void)
7586 {
7587 #ifdef CONFIG_SMP
7588         open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
7589
7590 #ifdef CONFIG_NO_HZ_COMMON
7591         nohz.next_balance = jiffies;
7592         zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
7593         cpu_notifier(sched_ilb_notifier, 0);
7594 #endif
7595
7596 #ifdef CONFIG_SCHED_HMP
7597         hmp_cpu_mask_setup();
7598 #endif
7599 #endif /* SMP */
7600
7601 }
7602
7603 #ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
7604 static u32 cpufreq_calc_scale(u32 min, u32 max, u32 curr)
7605 {
7606         u32 result = curr / max;
7607         return result;
7608 }
7609
7610 /* Called when the CPU Frequency is changed.
7611  * Once for each CPU.
7612  */
7613 static int cpufreq_callback(struct notifier_block *nb,
7614                                         unsigned long val, void *data)
7615 {
7616         struct cpufreq_freqs *freq = data;
7617         int cpu = freq->cpu;
7618         struct cpufreq_extents *extents;
7619
7620         if (freq->flags & CPUFREQ_CONST_LOOPS)
7621                 return NOTIFY_OK;
7622
7623         if (val != CPUFREQ_POSTCHANGE)
7624                 return NOTIFY_OK;
7625
7626         /* if dynamic load scale is disabled, set the load scale to 1.0 */
7627         if (!hmp_data.freqinvar_load_scale_enabled) {
7628                 freq_scale[cpu].curr_scale = 1024;
7629                 return NOTIFY_OK;
7630         }
7631
7632         extents = &freq_scale[cpu];
7633         if (extents->flags & SCHED_LOAD_FREQINVAR_SINGLEFREQ) {
7634                 /* If our governor was recognised as a single-freq governor,
7635                  * use 1.0
7636                  */
7637                 extents->curr_scale = 1024;
7638         } else {
7639                 extents->curr_scale = cpufreq_calc_scale(extents->min,
7640                                 extents->max, freq->new);
7641         }
7642
7643         return NOTIFY_OK;
7644 }
7645
7646 /* Called when the CPUFreq governor is changed.
7647  * Only called for the CPUs which are actually changed by the
7648  * userspace.
7649  */
7650 static int cpufreq_policy_callback(struct notifier_block *nb,
7651                                        unsigned long event, void *data)
7652 {
7653         struct cpufreq_policy *policy = data;
7654         struct cpufreq_extents *extents;
7655         int cpu, singleFreq = 0;
7656         static const char performance_governor[] = "performance";
7657         static const char powersave_governor[] = "powersave";
7658
7659         if (event == CPUFREQ_START)
7660                 return 0;
7661
7662         if (event != CPUFREQ_INCOMPATIBLE)
7663                 return 0;
7664
7665         /* CPUFreq governors do not accurately report the range of
7666          * CPU Frequencies they will choose from.
7667          * We recognise performance and powersave governors as
7668          * single-frequency only.
7669          */
7670         if (!strncmp(policy->governor->name, performance_governor,
7671                         strlen(performance_governor)) ||
7672                 !strncmp(policy->governor->name, powersave_governor,
7673                                 strlen(powersave_governor)))
7674                 singleFreq = 1;
7675
7676         /* Make sure that all CPUs impacted by this policy are
7677          * updated since we will only get a notification when the
7678          * user explicitly changes the policy on a CPU.
7679          */
7680         for_each_cpu(cpu, policy->cpus) {
7681                 extents = &freq_scale[cpu];
7682                 extents->max = policy->max >> SCHED_FREQSCALE_SHIFT;
7683                 extents->min = policy->min >> SCHED_FREQSCALE_SHIFT;
7684                 if (!hmp_data.freqinvar_load_scale_enabled) {
7685                         extents->curr_scale = 1024;
7686                 } else if (singleFreq) {
7687                         extents->flags |= SCHED_LOAD_FREQINVAR_SINGLEFREQ;
7688                         extents->curr_scale = 1024;
7689                 } else {
7690                         extents->flags &= ~SCHED_LOAD_FREQINVAR_SINGLEFREQ;
7691                         extents->curr_scale = cpufreq_calc_scale(extents->min,
7692                                         extents->max, policy->cur);
7693                 }
7694         }
7695
7696         return 0;
7697 }
7698
7699 static struct notifier_block cpufreq_notifier = {
7700         .notifier_call  = cpufreq_callback,
7701 };
7702 static struct notifier_block cpufreq_policy_notifier = {
7703         .notifier_call  = cpufreq_policy_callback,
7704 };
7705
7706 static int __init register_sched_cpufreq_notifier(void)
7707 {
7708         int ret = 0;
7709
7710         /* init safe defaults since there are no policies at registration */
7711         for (ret = 0; ret < CONFIG_NR_CPUS; ret++) {
7712                 /* safe defaults */
7713                 freq_scale[ret].max = 1024;
7714                 freq_scale[ret].min = 1024;
7715                 freq_scale[ret].curr_scale = 1024;
7716         }
7717
7718         pr_info("sched: registering cpufreq notifiers for scale-invariant loads\n");
7719         ret = cpufreq_register_notifier(&cpufreq_policy_notifier,
7720                         CPUFREQ_POLICY_NOTIFIER);
7721
7722         if (ret != -EINVAL)
7723                 ret = cpufreq_register_notifier(&cpufreq_notifier,
7724                         CPUFREQ_TRANSITION_NOTIFIER);
7725
7726         return ret;
7727 }
7728
7729 core_initcall(register_sched_cpufreq_notifier);
7730 #endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */