kernel/sched/fair.c

   1 /*
   2  * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
   3  *
   4  *  Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
   5  *
   6  *  Interactivity improvements by Mike Galbraith
   7  *  (C) 2007 Mike Galbraith <efault@gmx.de>
   8  *
   9  *  Various enhancements by Dmitry Adamushko.
  10  *  (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
  11  *
  12  *  Group scheduling enhancements by Srivatsa Vaddagiri
  13  *  Copyright IBM Corporation, 2007
  14  *  Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
  15  *
  16  *  Scaled math optimizations by Thomas Gleixner
  17  *  Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
  18  *
  19  *  Adaptive scheduling granularity, math enhancements by Peter Zijlstra
  20  *  Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
  21  */
  22
  23 #include <linux/latencytop.h>
  24 #include <linux/sched.h>
  25 #include <linux/cpumask.h>
  26 #include <linux/cpuidle.h>
  27 #include <linux/slab.h>
  28 #include <linux/profile.h>
  29 #include <linux/interrupt.h>
  30 #include <linux/mempolicy.h>
  31 #include <linux/migrate.h>
  32 #include <linux/task_work.h>
  33
  34 #include <trace/events/sched.h>
  35
  36 #include "sched.h"
  37
  38 /*
  39  * Targeted preemption latency for CPU-bound tasks:
  40  * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
  41  *
  42  * NOTE: this latency value is not the same as the concept of
  43  * 'timeslice length' - timeslices in CFS are of variable length
  44  * and have no persistent notion like in traditional, time-slice
  45  * based scheduling concepts.
  46  *
  47  * (to see the precise effective timeslice length of your workload,
  48  *  run vmstat and monitor the context-switches (cs) field)
  49  */
  50 unsigned int sysctl_sched_latency = 6000000ULL;
  51 unsigned int normalized_sysctl_sched_latency = 6000000ULL;
  52
  53 /*
  54  * The initial- and re-scaling of tunables is configurable
  55  * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus))
  56  *
  57  * Options are:
  58  * SCHED_TUNABLESCALING_NONE - unscaled, always *1
  59  * SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus)
  60  * SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus
  61  */
  62 enum sched_tunable_scaling sysctl_sched_tunable_scaling
  63         = SCHED_TUNABLESCALING_LOG;
  64
  65 /*
  66  * Minimal preemption granularity for CPU-bound tasks:
  67  * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
  68  */
  69 unsigned int sysctl_sched_min_granularity = 750000ULL;
  70 unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
  71
  72 /*
  73  * is kept at sysctl_sched_latency / sysctl_sched_min_granularity
  74  */
  75 static unsigned int sched_nr_latency = 8;
  76
  77 /*
  78  * After fork, child runs first. If set to 0 (default) then
  79  * parent will (try to) run first.
  80  */
  81 unsigned int sysctl_sched_child_runs_first __read_mostly;
  82
  83 /*
  84  * SCHED_OTHER wake-up granularity.
  85  * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
  86  *
  87  * This option delays the preemption effects of decoupled workloads
  88  * and reduces their over-scheduling. Synchronous workloads will still
  89  * have immediate wakeup/sleep latencies.
  90  */
  91 unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
  92 unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
  93
  94 const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
  95
  96 /*
  97  * The exponential sliding  window over which load is averaged for shares
  98  * distribution.
  99  * (default: 10msec)
 100  */
 101 unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
 102
 103 #ifdef CONFIG_CFS_BANDWIDTH
 104 /*
 105  * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool
 106  * each time a cfs_rq requests quota.
 107  *
 108  * Note: in the case that the slice exceeds the runtime remaining (either due
 109  * to consumption or the quota being specified to be smaller than the slice)
 110  * we will always only issue the remaining available time.
 111  *
 112  * default: 5 msec, units: microseconds
 113   */
 114 unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
 115 #endif
 116
 117 static inline void update_load_add(struct load_weight *lw, unsigned long inc)
 118 {
 119         lw->weight += inc;
 120         lw->inv_weight = 0;
 121 }
 122
 123 static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
 124 {
 125         lw->weight -= dec;
 126         lw->inv_weight = 0;
 127 }
 128
 129 static inline void update_load_set(struct load_weight *lw, unsigned long w)
 130 {
 131         lw->weight = w;
 132         lw->inv_weight = 0;
 133 }
 134
 135 /*
 136  * Increase the granularity value when there are more CPUs,
 137  * because with more CPUs the 'effective latency' as visible
 138  * to users decreases. But the relationship is not linear,
 139  * so pick a second-best guess by going with the log2 of the
 140  * number of CPUs.
 141  *
 142  * This idea comes from the SD scheduler of Con Kolivas:
 143  */
 144 static unsigned int get_update_sysctl_factor(void)
 145 {
 146         unsigned int cpus = min_t(unsigned int, num_online_cpus(), 8);
 147         unsigned int factor;
 148
 149         switch (sysctl_sched_tunable_scaling) {
 150         case SCHED_TUNABLESCALING_NONE:
 151                 factor = 1;
 152                 break;
 153         case SCHED_TUNABLESCALING_LINEAR:
 154                 factor = cpus;
 155                 break;
 156         case SCHED_TUNABLESCALING_LOG:
 157         default:
 158                 factor = 1 + ilog2(cpus);
 159                 break;
 160         }
 161
 162         return factor;
 163 }
 164
 165 static void update_sysctl(void)
 166 {
 167         unsigned int factor = get_update_sysctl_factor();
 168
 169 #define SET_SYSCTL(name) \
 170         (sysctl_##name = (factor) * normalized_sysctl_##name)
 171         SET_SYSCTL(sched_min_granularity);
 172         SET_SYSCTL(sched_latency);
 173         SET_SYSCTL(sched_wakeup_granularity);
 174 #undef SET_SYSCTL
 175 }
 176
 177 void sched_init_granularity(void)
 178 {
 179         update_sysctl();
 180 }
 181
 182 #define WMULT_CONST     (~0U)
 183 #define WMULT_SHIFT     32
 184
 185 static void __update_inv_weight(struct load_weight *lw)
 186 {
 187         unsigned long w;
 188
 189         if (likely(lw->inv_weight))
 190                 return;
 191
 192         w = scale_load_down(lw->weight);
 193
 194         if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
 195                 lw->inv_weight = 1;
 196         else if (unlikely(!w))
 197                 lw->inv_weight = WMULT_CONST;
 198         else
 199                 lw->inv_weight = WMULT_CONST / w;
 200 }
 201
 202 /*
 203  * delta_exec * weight / lw.weight
 204  *   OR
 205  * (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT
 206  *
 207  * Either weight := NICE_0_LOAD and lw \e prio_to_wmult[], in which case
 208  * we're guaranteed shift stays positive because inv_weight is guaranteed to
 209  * fit 32 bits, and NICE_0_LOAD gives another 10 bits; therefore shift >= 22.
 210  *
 211  * Or, weight =< lw.weight (because lw.weight is the runqueue weight), thus
 212  * weight/lw.weight <= 1, and therefore our shift will also be positive.
 213  */
 214 static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)
 215 {
 216         u64 fact = scale_load_down(weight);
 217         int shift = WMULT_SHIFT;
 218
 219         __update_inv_weight(lw);
 220
 221         if (unlikely(fact >> 32)) {
 222                 while (fact >> 32) {
 223                         fact >>= 1;
 224                         shift--;
 225                 }
 226         }
 227
 228         /* hint to use a 32x32->64 mul */
 229         fact = (u64)(u32)fact * lw->inv_weight;
 230
 231         while (fact >> 32) {
 232                 fact >>= 1;
 233                 shift--;
 234         }
 235
 236         return mul_u64_u32_shr(delta_exec, fact, shift);
 237 }
 238
 239
 240 const struct sched_class fair_sched_class;
 241
 242 /**************************************************************
 243  * CFS operations on generic schedulable entities:
 244  */
 245
 246 #ifdef CONFIG_FAIR_GROUP_SCHED
 247
 248 /* cpu runqueue to which this cfs_rq is attached */
 249 static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
 250 {
 251         return cfs_rq->rq;
 252 }
 253
 254 /* An entity is a task if it doesn't "own" a runqueue */
 255 #define entity_is_task(se)      (!se->my_q)
 256
 257 static inline struct task_struct *task_of(struct sched_entity *se)
 258 {
 259 #ifdef CONFIG_SCHED_DEBUG
 260         WARN_ON_ONCE(!entity_is_task(se));
 261 #endif
 262         return container_of(se, struct task_struct, se);
 263 }
 264
 265 /* Walk up scheduling entities hierarchy */
 266 #define for_each_sched_entity(se) \
 267                 for (; se; se = se->parent)
 268
 269 static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
 270 {
 271         return p->se.cfs_rq;
 272 }
 273
 274 /* runqueue on which this entity is (to be) queued */
 275 static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
 276 {
 277         return se->cfs_rq;
 278 }
 279
 280 /* runqueue "owned" by this group */
 281 static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
 282 {
 283         return grp->my_q;
 284 }
 285
 286 static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 287 {
 288         if (!cfs_rq->on_list) {
 289                 /*
 290                  * Ensure we either appear before our parent (if already
 291                  * enqueued) or force our parent to appear after us when it is
 292                  * enqueued.  The fact that we always enqueue bottom-up
 293                  * reduces this to two cases.
 294                  */
 295                 if (cfs_rq->tg->parent &&
 296                     cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) {
 297                         list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
 298                                 &rq_of(cfs_rq)->leaf_cfs_rq_list);
 299                 } else {
 300                         list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
 301                                 &rq_of(cfs_rq)->leaf_cfs_rq_list);
 302                 }
 303
 304                 cfs_rq->on_list = 1;
 305         }
 306 }
 307
 308 static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 309 {
 310         if (cfs_rq->on_list) {
 311                 list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
 312                 cfs_rq->on_list = 0;
 313         }
 314 }
 315
 316 /* Iterate thr' all leaf cfs_rq's on a runqueue */
 317 #define for_each_leaf_cfs_rq(rq, cfs_rq) \
 318         list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
 319
 320 /* Do the two (enqueued) entities belong to the same group ? */
 321 static inline struct cfs_rq *
 322 is_same_group(struct sched_entity *se, struct sched_entity *pse)
 323 {
 324         if (se->cfs_rq == pse->cfs_rq)
 325                 return se->cfs_rq;
 326
 327         return NULL;
 328 }
 329
 330 static inline struct sched_entity *parent_entity(struct sched_entity *se)
 331 {
 332         return se->parent;
 333 }
 334
 335 static void
 336 find_matching_se(struct sched_entity **se, struct sched_entity **pse)
 337 {
 338         int se_depth, pse_depth;
 339
 340         /*
 341          * preemption test can be made between sibling entities who are in the
 342          * same cfs_rq i.e who have a common parent. Walk up the hierarchy of
 343          * both tasks until we find their ancestors who are siblings of common
 344          * parent.
 345          */
 346
 347         /* First walk up until both entities are at same depth */
 348         se_depth = (*se)->depth;
 349         pse_depth = (*pse)->depth;
 350
 351         while (se_depth > pse_depth) {
 352                 se_depth--;
 353                 *se = parent_entity(*se);
 354         }
 355
 356         while (pse_depth > se_depth) {
 357                 pse_depth--;
 358                 *pse = parent_entity(*pse);
 359         }
 360
 361         while (!is_same_group(*se, *pse)) {
 362                 *se = parent_entity(*se);
 363                 *pse = parent_entity(*pse);
 364         }
 365 }
 366
 367 #else   /* !CONFIG_FAIR_GROUP_SCHED */
 368
 369 static inline struct task_struct *task_of(struct sched_entity *se)
 370 {
 371         return container_of(se, struct task_struct, se);
 372 }
 373
 374 static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
 375 {
 376         return container_of(cfs_rq, struct rq, cfs);
 377 }
 378
 379 #define entity_is_task(se)      1
 380
 381 #define for_each_sched_entity(se) \
 382                 for (; se; se = NULL)
 383
 384 static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
 385 {
 386         return &task_rq(p)->cfs;
 387 }
 388
 389 static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
 390 {
 391         struct task_struct *p = task_of(se);
 392         struct rq *rq = task_rq(p);
 393
 394         return &rq->cfs;
 395 }
 396
 397 /* runqueue "owned" by this group */
 398 static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
 399 {
 400         return NULL;
 401 }
 402
 403 static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 404 {
 405 }
 406
 407 static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 408 {
 409 }
 410
 411 #define for_each_leaf_cfs_rq(rq, cfs_rq) \
 412                 for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
 413
 414 static inline struct sched_entity *parent_entity(struct sched_entity *se)
 415 {
 416         return NULL;
 417 }
 418
 419 static inline void
 420 find_matching_se(struct sched_entity **se, struct sched_entity **pse)
 421 {
 422 }
 423
 424 #endif  /* CONFIG_FAIR_GROUP_SCHED */
 425
 426 static __always_inline
 427 void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
 428
 429 /**************************************************************
 430  * Scheduling class tree data structure manipulation methods:
 431  */
 432
 433 static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime)
 434 {
 435         s64 delta = (s64)(vruntime - max_vruntime);
 436         if (delta > 0)
 437                 max_vruntime = vruntime;
 438
 439         return max_vruntime;
 440 }
 441
 442 static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
 443 {
 444         s64 delta = (s64)(vruntime - min_vruntime);
 445         if (delta < 0)
 446                 min_vruntime = vruntime;
 447
 448         return min_vruntime;
 449 }
 450
 451 static inline int entity_before(struct sched_entity *a,
 452                                 struct sched_entity *b)
 453 {
 454         return (s64)(a->vruntime - b->vruntime) < 0;
 455 }
 456
 457 static void update_min_vruntime(struct cfs_rq *cfs_rq)
 458 {
 459         u64 vruntime = cfs_rq->min_vruntime;
 460
 461         if (cfs_rq->curr)
 462                 vruntime = cfs_rq->curr->vruntime;
 463
 464         if (cfs_rq->rb_leftmost) {
 465                 struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,
 466                                                    struct sched_entity,
 467                                                    run_node);
 468
 469                 if (!cfs_rq->curr)
 470                         vruntime = se->vruntime;
 471                 else
 472                         vruntime = min_vruntime(vruntime, se->vruntime);
 473         }
 474
 475         /* ensure we never gain time by being placed backwards. */
 476         cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
 477 #ifndef CONFIG_64BIT
 478         smp_wmb();
 479         cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
 480 #endif
 481 }
 482
 483 /*
 484  * Enqueue an entity into the rb-tree:
 485  */
 486 static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 487 {
 488         struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
 489         struct rb_node *parent = NULL;
 490         struct sched_entity *entry;
 491         int leftmost = 1;
 492
 493         /*
 494          * Find the right place in the rbtree:
 495          */
 496         while (*link) {
 497                 parent = *link;
 498                 entry = rb_entry(parent, struct sched_entity, run_node);
 499                 /*
 500                  * We dont care about collisions. Nodes with
 501                  * the same key stay together.
 502                  */
 503                 if (entity_before(se, entry)) {
 504                         link = &parent->rb_left;
 505                 } else {
 506                         link = &parent->rb_right;
 507                         leftmost = 0;
 508                 }
 509         }
 510
 511         /*
 512          * Maintain a cache of leftmost tree entries (it is frequently
 513          * used):
 514          */
 515         if (leftmost)
 516                 cfs_rq->rb_leftmost = &se->run_node;
 517
 518         rb_link_node(&se->run_node, parent, link);
 519         rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
 520 }
 521
 522 static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 523 {
 524         if (cfs_rq->rb_leftmost == &se->run_node) {
 525                 struct rb_node *next_node;
 526
 527                 next_node = rb_next(&se->run_node);
 528                 cfs_rq->rb_leftmost = next_node;
 529         }
 530
 531         rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
 532 }
 533
 534 struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
 535 {
 536         struct rb_node *left = cfs_rq->rb_leftmost;
 537
 538         if (!left)
 539                 return NULL;
 540
 541         return rb_entry(left, struct sched_entity, run_node);
 542 }
 543
 544 static struct sched_entity *__pick_next_entity(struct sched_entity *se)
 545 {
 546         struct rb_node *next = rb_next(&se->run_node);
 547
 548         if (!next)
 549                 return NULL;
 550
 551         return rb_entry(next, struct sched_entity, run_node);
 552 }
 553
 554 #ifdef CONFIG_SCHED_DEBUG
 555 struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
 556 {
 557         struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
 558
 559         if (!last)
 560                 return NULL;
 561
 562         return rb_entry(last, struct sched_entity, run_node);
 563 }
 564
 565 /**************************************************************
 566  * Scheduling class statistics methods:
 567  */
 568
 569 int sched_proc_update_handler(struct ctl_table *table, int write,
 570                 void __user *buffer, size_t *lenp,
 571                 loff_t *ppos)
 572 {
 573         int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
 574         unsigned int factor = get_update_sysctl_factor();
 575
 576         if (ret || !write)
 577                 return ret;
 578
 579         sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
 580                                         sysctl_sched_min_granularity);
 581
 582 #define WRT_SYSCTL(name) \
 583         (normalized_sysctl_##name = sysctl_##name / (factor))
 584         WRT_SYSCTL(sched_min_granularity);
 585         WRT_SYSCTL(sched_latency);
 586         WRT_SYSCTL(sched_wakeup_granularity);
 587 #undef WRT_SYSCTL
 588
 589         return 0;
 590 }
 591 #endif
 592
 593 /*
 594  * delta /= w
 595  */
 596 static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
 597 {
 598         if (unlikely(se->load.weight != NICE_0_LOAD))
 599                 delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
 600
 601         return delta;
 602 }
 603
 604 /*
 605  * The idea is to set a period in which each task runs once.
 606  *
 607  * When there are too many tasks (sched_nr_latency) we have to stretch
 608  * this period because otherwise the slices get too small.
 609  *
 610  * p = (nr <= nl) ? l : l*nr/nl
 611  */
 612 static u64 __sched_period(unsigned long nr_running)
 613 {
 614         if (unlikely(nr_running > sched_nr_latency))
 615                 return nr_running * sysctl_sched_min_granularity;
 616         else
 617                 return sysctl_sched_latency;
 618 }
 619
 620 /*
 621  * We calculate the wall-time slice from the period by taking a part
 622  * proportional to the weight.
 623  *
 624  * s = p*P[w/rw]
 625  */
 626 static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
 627 {
 628         u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);
 629
 630         for_each_sched_entity(se) {
 631                 struct load_weight *load;
 632                 struct load_weight lw;
 633
 634                 cfs_rq = cfs_rq_of(se);
 635                 load = &cfs_rq->load;
 636
 637                 if (unlikely(!se->on_rq)) {
 638                         lw = cfs_rq->load;
 639
 640                         update_load_add(&lw, se->load.weight);
 641                         load = &lw;
 642                 }
 643                 slice = __calc_delta(slice, se->load.weight, load);
 644         }
 645         return slice;
 646 }
 647
 648 /*
 649  * We calculate the vruntime slice of a to-be-inserted task.
 650  *
 651  * vs = s/w
 652  */
 653 static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
 654 {
 655         return calc_delta_fair(sched_slice(cfs_rq, se), se);
 656 }
 657
 658 #ifdef CONFIG_SMP
 659 static int select_idle_sibling(struct task_struct *p, int cpu);
 660 static unsigned long task_h_load(struct task_struct *p);
 661
 662 /*
 663  * We choose a half-life close to 1 scheduling period.
 664  * Note: The tables runnable_avg_yN_inv and runnable_avg_yN_sum are
 665  * dependent on this value.
 666  */
 667 #define LOAD_AVG_PERIOD 32
 668 #define LOAD_AVG_MAX 47742 /* maximum possible load avg */
 669 #define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_AVG_MAX */
 670
 671 /* Give new sched_entity start runnable values to heavy its load in infant time */
 672 void init_entity_runnable_average(struct sched_entity *se)
 673 {
 674         struct sched_avg *sa = &se->avg;
 675
 676         sa->last_update_time = 0;
 677         /*
 678          * sched_avg's period_contrib should be strictly less then 1024, so
 679          * we give it 1023 to make sure it is almost a period (1024us), and
 680          * will definitely be update (after enqueue).
 681          */
 682         sa->period_contrib = 1023;
 683         sa->load_avg = scale_load_down(se->load.weight);
 684         sa->load_sum = sa->load_avg * LOAD_AVG_MAX;
 685         sa->util_avg = scale_load_down(SCHED_LOAD_SCALE);
 686         sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
 687         /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
 688 }
 689
 690 #else
 691 void init_entity_runnable_average(struct sched_entity *se)
 692 {
 693 }
 694 #endif
 695
 696 /*
 697  * Update the current task's runtime statistics.
 698  */
 699 static void update_curr(struct cfs_rq *cfs_rq)
 700 {
 701         struct sched_entity *curr = cfs_rq->curr;
 702         u64 now = rq_clock_task(rq_of(cfs_rq));
 703         u64 delta_exec;
 704
 705         if (unlikely(!curr))
 706                 return;
 707
 708         delta_exec = now - curr->exec_start;
 709         if (unlikely((s64)delta_exec <= 0))
 710                 return;
 711
 712         curr->exec_start = now;
 713
 714         schedstat_set(curr->statistics.exec_max,
 715                       max(delta_exec, curr->statistics.exec_max));
 716
 717         curr->sum_exec_runtime += delta_exec;
 718         schedstat_add(cfs_rq, exec_clock, delta_exec);
 719
 720         curr->vruntime += calc_delta_fair(delta_exec, curr);
 721         update_min_vruntime(cfs_rq);
 722
 723         if (entity_is_task(curr)) {
 724                 struct task_struct *curtask = task_of(curr);
 725
 726                 trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
 727                 cpuacct_charge(curtask, delta_exec);
 728                 account_group_exec_runtime(curtask, delta_exec);
 729         }
 730
 731         account_cfs_rq_runtime(cfs_rq, delta_exec);
 732 }
 733
 734 static void update_curr_fair(struct rq *rq)
 735 {
 736         update_curr(cfs_rq_of(&rq->curr->se));
 737 }
 738
 739 static inline void
 740 update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
 741 {
 742         schedstat_set(se->statistics.wait_start, rq_clock(rq_of(cfs_rq)));
 743 }
 744
 745 /*
 746  * Task is being enqueued - update stats:
 747  */
 748 static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 749 {
 750         /*
 751          * Are we enqueueing a waiting task? (for current tasks
 752          * a dequeue/enqueue event is a NOP)
 753          */
 754         if (se != cfs_rq->curr)
 755                 update_stats_wait_start(cfs_rq, se);
 756 }
 757
 758 static void
 759 update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
 760 {
 761         schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max,
 762                         rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start));
 763         schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1);
 764         schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum +
 765                         rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);
 766 #ifdef CONFIG_SCHEDSTATS
 767         if (entity_is_task(se)) {
 768                 trace_sched_stat_wait(task_of(se),
 769                         rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);
 770         }
 771 #endif
 772         schedstat_set(se->statistics.wait_start, 0);
 773 }
 774
 775 static inline void
 776 update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 777 {
 778         /*
 779          * Mark the end of the wait period if dequeueing a
 780          * waiting task:
 781          */
 782         if (se != cfs_rq->curr)
 783                 update_stats_wait_end(cfs_rq, se);
 784 }
 785
 786 /*
 787  * We are picking a new current task - update its stats:
 788  */
 789 static inline void
 790 update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
 791 {
 792         /*
 793          * We are starting a new run period:
 794          */
 795         se->exec_start = rq_clock_task(rq_of(cfs_rq));
 796 }
 797
 798 /**************************************************
 799  * Scheduling class queueing methods:
 800  */
 801
 802 #ifdef CONFIG_NUMA_BALANCING
 803 /*
 804  * Approximate time to scan a full NUMA task in ms. The task scan period is
 805  * calculated based on the tasks virtual memory size and
 806  * numa_balancing_scan_size.
 807  */
 808 unsigned int sysctl_numa_balancing_scan_period_min = 1000;
 809 unsigned int sysctl_numa_balancing_scan_period_max = 60000;
 810
 811 /* Portion of address space to scan in MB */
 812 unsigned int sysctl_numa_balancing_scan_size = 256;
 813
 814 /* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
 815 unsigned int sysctl_numa_balancing_scan_delay = 1000;
 816
 817 static unsigned int task_nr_scan_windows(struct task_struct *p)
 818 {
 819         unsigned long rss = 0;
 820         unsigned long nr_scan_pages;
 821
 822         /*
 823          * Calculations based on RSS as non-present and empty pages are skipped
 824          * by the PTE scanner and NUMA hinting faults should be trapped based
 825          * on resident pages
 826          */
 827         nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT);
 828         rss = get_mm_rss(p->mm);
 829         if (!rss)
 830                 rss = nr_scan_pages;
 831
 832         rss = round_up(rss, nr_scan_pages);
 833         return rss / nr_scan_pages;
 834 }
 835
 836 /* For sanitys sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */
 837 #define MAX_SCAN_WINDOW 2560
 838
 839 static unsigned int task_scan_min(struct task_struct *p)
 840 {
 841         unsigned int scan_size = READ_ONCE(sysctl_numa_balancing_scan_size);
 842         unsigned int scan, floor;
 843         unsigned int windows = 1;
 844
 845         if (scan_size < MAX_SCAN_WINDOW)
 846                 windows = MAX_SCAN_WINDOW / scan_size;
 847         floor = 1000 / windows;
 848
 849         scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p);
 850         return max_t(unsigned int, floor, scan);
 851 }
 852
 853 static unsigned int task_scan_max(struct task_struct *p)
 854 {
 855         unsigned int smin = task_scan_min(p);
 856         unsigned int smax;
 857
 858         /* Watch for min being lower than max due to floor calculations */
 859         smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);
 860         return max(smin, smax);
 861 }
 862
 863 static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
 864 {
 865         rq->nr_numa_running += (p->numa_preferred_nid != -1);
 866         rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));
 867 }
 868
 869 static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
 870 {
 871         rq->nr_numa_running -= (p->numa_preferred_nid != -1);
 872         rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
 873 }
 874
 875 struct numa_group {
 876         atomic_t refcount;
 877
 878         spinlock_t lock; /* nr_tasks, tasks */
 879         int nr_tasks;
 880         pid_t gid;
 881
 882         struct rcu_head rcu;
 883         nodemask_t active_nodes;
 884         unsigned long total_faults;
 885         /*
 886          * Faults_cpu is used to decide whether memory should move
 887          * towards the CPU. As a consequence, these stats are weighted
 888          * more by CPU use than by memory faults.
 889          */
 890         unsigned long *faults_cpu;
 891         unsigned long faults[0];
 892 };
 893
 894 /* Shared or private faults. */
 895 #define NR_NUMA_HINT_FAULT_TYPES 2
 896
 897 /* Memory and CPU locality */
 898 #define NR_NUMA_HINT_FAULT_STATS (NR_NUMA_HINT_FAULT_TYPES * 2)
 899
 900 /* Averaged statistics, and temporary buffers. */
 901 #define NR_NUMA_HINT_FAULT_BUCKETS (NR_NUMA_HINT_FAULT_STATS * 2)
 902
 903 pid_t task_numa_group_id(struct task_struct *p)
 904 {
 905         return p->numa_group ? p->numa_group->gid : 0;
 906 }
 907
 908 /*
 909  * The averaged statistics, shared & private, memory & cpu,
 910  * occupy the first half of the array. The second half of the
 911  * array is for current counters, which are averaged into the
 912  * first set by task_numa_placement.
 913  */
 914 static inline int task_faults_idx(enum numa_faults_stats s, int nid, int priv)
 915 {
 916         return NR_NUMA_HINT_FAULT_TYPES * (s * nr_node_ids + nid) + priv;
 917 }
 918
 919 static inline unsigned long task_faults(struct task_struct *p, int nid)
 920 {
 921         if (!p->numa_faults)
 922                 return 0;
 923
 924         return p->numa_faults[task_faults_idx(NUMA_MEM, nid, 0)] +
 925                 p->numa_faults[task_faults_idx(NUMA_MEM, nid, 1)];
 926 }
 927
 928 static inline unsigned long group_faults(struct task_struct *p, int nid)
 929 {
 930         if (!p->numa_group)
 931                 return 0;
 932
 933         return p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 0)] +
 934                 p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 1)];
 935 }
 936
 937 static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
 938 {
 939         return group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 0)] +
 940                 group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)];
 941 }
 942
 943 /* Handle placement on systems where not all nodes are directly connected. */
 944 static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
 945                                         int maxdist, bool task)
 946 {
 947         unsigned long score = 0;
 948         int node;
 949
 950         /*
 951          * All nodes are directly connected, and the same distance
 952          * from each other. No need for fancy placement algorithms.
 953          */
 954         if (sched_numa_topology_type == NUMA_DIRECT)
 955                 return 0;
 956
 957         /*
 958          * This code is called for each node, introducing N^2 complexity,
 959          * which should be ok given the number of nodes rarely exceeds 8.
 960          */
 961         for_each_online_node(node) {
 962                 unsigned long faults;
 963                 int dist = node_distance(nid, node);
 964
 965                 /*
 966                  * The furthest away nodes in the system are not interesting
 967                  * for placement; nid was already counted.
 968                  */
 969                 if (dist == sched_max_numa_distance || node == nid)
 970                         continue;
 971
 972                 /*
 973                  * On systems with a backplane NUMA topology, compare groups
 974                  * of nodes, and move tasks towards the group with the most
 975                  * memory accesses. When comparing two nodes at distance
 976                  * "hoplimit", only nodes closer by than "hoplimit" are part
 977                  * of each group. Skip other nodes.
 978                  */
 979                 if (sched_numa_topology_type == NUMA_BACKPLANE &&
 980                                         dist > maxdist)
 981                         continue;
 982
 983                 /* Add up the faults from nearby nodes. */
 984                 if (task)
 985                         faults = task_faults(p, node);
 986                 else
 987                         faults = group_faults(p, node);
 988
 989                 /*
 990                  * On systems with a glueless mesh NUMA topology, there are
 991                  * no fixed "groups of nodes". Instead, nodes that are not
 992                  * directly connected bounce traffic through intermediate
 993                  * nodes; a numa_group can occupy any set of nodes.
 994                  * The further away a node is, the less the faults count.
 995                  * This seems to result in good task placement.
 996                  */
 997                 if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
 998                         faults *= (sched_max_numa_distance - dist);
 999                         faults /= (sched_max_numa_distance - LOCAL_DISTANCE);
1000                 }
1001
1002                 score += faults;
1003         }
1004
1005         return score;
1006 }
1007
1008 /*
1009  * These return the fraction of accesses done by a particular task, or
1010  * task group, on a particular numa node.  The group weight is given a
1011  * larger multiplier, in order to group tasks together that are almost
1012  * evenly spread out between numa nodes.
1013  */
1014 static inline unsigned long task_weight(struct task_struct *p, int nid,
1015                                         int dist)
1016 {
1017         unsigned long faults, total_faults;
1018
1019         if (!p->numa_faults)
1020                 return 0;
1021
1022         total_faults = p->total_numa_faults;
1023
1024         if (!total_faults)
1025                 return 0;
1026
1027         faults = task_faults(p, nid);
1028         faults += score_nearby_nodes(p, nid, dist, true);
1029
1030         return 1000 * faults / total_faults;
1031 }
1032
1033 static inline unsigned long group_weight(struct task_struct *p, int nid,
1034                                          int dist)
1035 {
1036         unsigned long faults, total_faults;
1037
1038         if (!p->numa_group)
1039                 return 0;
1040
1041         total_faults = p->numa_group->total_faults;
1042
1043         if (!total_faults)
1044                 return 0;
1045
1046         faults = group_faults(p, nid);
1047         faults += score_nearby_nodes(p, nid, dist, false);
1048
1049         return 1000 * faults / total_faults;
1050 }
1051
1052 bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
1053                                 int src_nid, int dst_cpu)
1054 {
1055         struct numa_group *ng = p->numa_group;
1056         int dst_nid = cpu_to_node(dst_cpu);
1057         int last_cpupid, this_cpupid;
1058
1059         this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
1060
1061         /*
1062          * Multi-stage node selection is used in conjunction with a periodic
1063          * migration fault to build a temporal task<->page relation. By using
1064          * a two-stage filter we remove short/unlikely relations.
1065          *
1066          * Using P(p) ~ n_p / n_t as per frequentist probability, we can equate
1067          * a task's usage of a particular page (n_p) per total usage of this
1068          * page (n_t) (in a given time-span) to a probability.
1069          *
1070          * Our periodic faults will sample this probability and getting the
1071          * same result twice in a row, given these samples are fully
1072          * independent, is then given by P(n)^2, provided our sample period
1073          * is sufficiently short compared to the usage pattern.
1074          *
1075          * This quadric squishes small probabilities, making it less likely we
1076          * act on an unlikely task<->page relation.
1077          */
1078         last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
1079         if (!cpupid_pid_unset(last_cpupid) &&
1080                                 cpupid_to_nid(last_cpupid) != dst_nid)
1081                 return false;
1082
1083         /* Always allow migrate on private faults */
1084         if (cpupid_match_pid(p, last_cpupid))
1085                 return true;
1086
1087         /* A shared fault, but p->numa_group has not been set up yet. */
1088         if (!ng)
1089                 return true;
1090
1091         /*
1092          * Do not migrate if the destination is not a node that
1093          * is actively used by this numa group.
1094          */
1095         if (!node_isset(dst_nid, ng->active_nodes))
1096                 return false;
1097
1098         /*
1099          * Source is a node that is not actively used by this
1100          * numa group, while the destination is. Migrate.
1101          */
1102         if (!node_isset(src_nid, ng->active_nodes))
1103                 return true;
1104
1105         /*
1106          * Both source and destination are nodes in active
1107          * use by this numa group. Maximize memory bandwidth
1108          * by migrating from more heavily used groups, to less
1109          * heavily used ones, spreading the load around.
1110          * Use a 1/4 hysteresis to avoid spurious page movement.
1111          */
1112         return group_faults(p, dst_nid) < (group_faults(p, src_nid) * 3 / 4);
1113 }
1114
1115 static unsigned long weighted_cpuload(const int cpu);
1116 static unsigned long source_load(int cpu, int type);
1117 static unsigned long target_load(int cpu, int type);
1118 static unsigned long capacity_of(int cpu);
1119 static long effective_load(struct task_group *tg, int cpu, long wl, long wg);
1120
1121 /* Cached statistics for all CPUs within a node */
1122 struct numa_stats {
1123         unsigned long nr_running;
1124         unsigned long load;
1125
1126         /* Total compute capacity of CPUs on a node */
1127         unsigned long compute_capacity;
1128
1129         /* Approximate capacity in terms of runnable tasks on a node */
1130         unsigned long task_capacity;
1131         int has_free_capacity;
1132 };
1133
1134 /*
1135  * XXX borrowed from update_sg_lb_stats
1136  */
1137 static void update_numa_stats(struct numa_stats *ns, int nid)
1138 {
1139         int smt, cpu, cpus = 0;
1140         unsigned long capacity;
1141
1142         memset(ns, 0, sizeof(*ns));
1143         for_each_cpu(cpu, cpumask_of_node(nid)) {
1144                 struct rq *rq = cpu_rq(cpu);
1145
1146                 ns->nr_running += rq->nr_running;
1147                 ns->load += weighted_cpuload(cpu);
1148                 ns->compute_capacity += capacity_of(cpu);
1149
1150                 cpus++;
1151         }
1152
1153         /*
1154          * If we raced with hotplug and there are no CPUs left in our mask
1155          * the @ns structure is NULL'ed and task_numa_compare() will
1156          * not find this node attractive.
1157          *
1158          * We'll either bail at !has_free_capacity, or we'll detect a huge
1159          * imbalance and bail there.
1160          */
1161         if (!cpus)
1162                 return;
1163
1164         /* smt := ceil(cpus / capacity), assumes: 1 < smt_power < 2 */
1165         smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, ns->compute_capacity);
1166         capacity = cpus / smt; /* cores */
1167
1168         ns->task_capacity = min_t(unsigned, capacity,
1169                 DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE));
1170         ns->has_free_capacity = (ns->nr_running < ns->task_capacity);
1171 }
1172
1173 struct task_numa_env {
1174         struct task_struct *p;
1175
1176         int src_cpu, src_nid;
1177         int dst_cpu, dst_nid;
1178
1179         struct numa_stats src_stats, dst_stats;
1180
1181         int imbalance_pct;
1182         int dist;
1183
1184         struct task_struct *best_task;
1185         long best_imp;
1186         int best_cpu;
1187 };
1188
1189 static void task_numa_assign(struct task_numa_env *env,
1190                              struct task_struct *p, long imp)
1191 {
1192         if (env->best_task)
1193                 put_task_struct(env->best_task);
1194
1195         env->best_task = p;
1196         env->best_imp = imp;
1197         env->best_cpu = env->dst_cpu;
1198 }
1199
1200 static bool load_too_imbalanced(long src_load, long dst_load,
1201                                 struct task_numa_env *env)
1202 {
1203         long imb, old_imb;
1204         long orig_src_load, orig_dst_load;
1205         long src_capacity, dst_capacity;
1206
1207         /*
1208          * The load is corrected for the CPU capacity available on each node.
1209          *
1210          * src_load        dst_load
1211          * ------------ vs ---------
1212          * src_capacity    dst_capacity
1213          */
1214         src_capacity = env->src_stats.compute_capacity;
1215         dst_capacity = env->dst_stats.compute_capacity;
1216
1217         /* We care about the slope of the imbalance, not the direction. */
1218         if (dst_load < src_load)
1219                 swap(dst_load, src_load);
1220
1221         /* Is the difference below the threshold? */
1222         imb = dst_load * src_capacity * 100 -
1223               src_load * dst_capacity * env->imbalance_pct;
1224         if (imb <= 0)
1225                 return false;
1226
1227         /*
1228          * The imbalance is above the allowed threshold.
1229          * Compare it with the old imbalance.
1230          */
1231         orig_src_load = env->src_stats.load;
1232         orig_dst_load = env->dst_stats.load;
1233
1234         if (orig_dst_load < orig_src_load)
1235                 swap(orig_dst_load, orig_src_load);
1236
1237         old_imb = orig_dst_load * src_capacity * 100 -
1238                   orig_src_load * dst_capacity * env->imbalance_pct;
1239
1240         /* Would this change make things worse? */
1241         return (imb > old_imb);
1242 }
1243
1244 /*
1245  * This checks if the overall compute and NUMA accesses of the system would
1246  * be improved if the source tasks was migrated to the target dst_cpu taking
1247  * into account that it might be best if task running on the dst_cpu should
1248  * be exchanged with the source task
1249  */
1250 static void task_numa_compare(struct task_numa_env *env,
1251                               long taskimp, long groupimp)
1252 {
1253         struct rq *src_rq = cpu_rq(env->src_cpu);
1254         struct rq *dst_rq = cpu_rq(env->dst_cpu);
1255         struct task_struct *cur;
1256         long src_load, dst_load;
1257         long load;
1258         long imp = env->p->numa_group ? groupimp : taskimp;
1259         long moveimp = imp;
1260         int dist = env->dist;
1261         bool assigned = false;
1262
1263         rcu_read_lock();
1264
1265         raw_spin_lock_irq(&dst_rq->lock);
1266         cur = dst_rq->curr;
1267         /*
1268          * No need to move the exiting task or idle task.
1269          */
1270         if ((cur->flags & PF_EXITING) || is_idle_task(cur))
1271                 cur = NULL;
1272         else {
1273                 /*
1274                  * The task_struct must be protected here to protect the
1275                  * p->numa_faults access in the task_weight since the
1276                  * numa_faults could already be freed in the following path:
1277                  * finish_task_switch()
1278                  *     --> put_task_struct()
1279                  *         --> __put_task_struct()
1280                  *             --> task_numa_free()
1281                  */
1282                 get_task_struct(cur);
1283         }
1284
1285         raw_spin_unlock_irq(&dst_rq->lock);
1286
1287         /*
1288          * Because we have preemption enabled we can get migrated around and
1289          * end try selecting ourselves (current == env->p) as a swap candidate.
1290          */
1291         if (cur == env->p)
1292                 goto unlock;
1293
1294         /*
1295          * "imp" is the fault differential for the source task between the
1296          * source and destination node. Calculate the total differential for
1297          * the source task and potential destination task. The more negative
1298          * the value is, the more rmeote accesses that would be expected to
1299          * be incurred if the tasks were swapped.
1300          */
1301         if (cur) {
1302                 /* Skip this swap candidate if cannot move to the source cpu */
1303                 if (!cpumask_test_cpu(env->src_cpu, tsk_cpus_allowed(cur)))
1304                         goto unlock;
1305
1306                 /*
1307                  * If dst and source tasks are in the same NUMA group, or not
1308                  * in any group then look only at task weights.
1309                  */
1310                 if (cur->numa_group == env->p->numa_group) {
1311                         imp = taskimp + task_weight(cur, env->src_nid, dist) -
1312                               task_weight(cur, env->dst_nid, dist);
1313                         /*
1314                          * Add some hysteresis to prevent swapping the
1315                          * tasks within a group over tiny differences.
1316                          */
1317                         if (cur->numa_group)
1318                                 imp -= imp/16;
1319                 } else {
1320                         /*
1321                          * Compare the group weights. If a task is all by
1322                          * itself (not part of a group), use the task weight
1323                          * instead.
1324                          */
1325                         if (cur->numa_group)
1326                                 imp += group_weight(cur, env->src_nid, dist) -
1327                                        group_weight(cur, env->dst_nid, dist);
1328                         else
1329                                 imp += task_weight(cur, env->src_nid, dist) -
1330                                        task_weight(cur, env->dst_nid, dist);
1331                 }
1332         }
1333
1334         if (imp <= env->best_imp && moveimp <= env->best_imp)
1335                 goto unlock;
1336
1337         if (!cur) {
1338                 /* Is there capacity at our destination? */
1339                 if (env->src_stats.nr_running <= env->src_stats.task_capacity &&
1340                     !env->dst_stats.has_free_capacity)
1341                         goto unlock;
1342
1343                 goto balance;
1344         }
1345
1346         /* Balance doesn't matter much if we're running a task per cpu */
1347         if (imp > env->best_imp && src_rq->nr_running == 1 &&
1348                         dst_rq->nr_running == 1)
1349                 goto assign;
1350
1351         /*
1352          * In the overloaded case, try and keep the load balanced.
1353          */
1354 balance:
1355         load = task_h_load(env->p);
1356         dst_load = env->dst_stats.load + load;
1357         src_load = env->src_stats.load - load;
1358
1359         if (moveimp > imp && moveimp > env->best_imp) {
1360                 /*
1361                  * If the improvement from just moving env->p direction is
1362                  * better than swapping tasks around, check if a move is
1363                  * possible. Store a slightly smaller score than moveimp,
1364                  * so an actually idle CPU will win.
1365                  */
1366                 if (!load_too_imbalanced(src_load, dst_load, env)) {
1367                         imp = moveimp - 1;
1368                         put_task_struct(cur);
1369                         cur = NULL;
1370                         goto assign;
1371                 }
1372         }
1373
1374         if (imp <= env->best_imp)
1375                 goto unlock;
1376
1377         if (cur) {
1378                 load = task_h_load(cur);
1379                 dst_load -= load;
1380                 src_load += load;
1381         }
1382
1383         if (load_too_imbalanced(src_load, dst_load, env))
1384                 goto unlock;
1385
1386         /*
1387          * One idle CPU per node is evaluated for a task numa move.
1388          * Call select_idle_sibling to maybe find a better one.
1389          */
1390         if (!cur)
1391                 env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu);
1392
1393 assign:
1394         assigned = true;
1395         task_numa_assign(env, cur, imp);
1396 unlock:
1397         rcu_read_unlock();
1398         /*
1399          * The dst_rq->curr isn't assigned. The protection for task_struct is
1400          * finished.
1401          */
1402         if (cur && !assigned)
1403                 put_task_struct(cur);
1404 }
1405
1406 static void task_numa_find_cpu(struct task_numa_env *env,
1407                                 long taskimp, long groupimp)
1408 {
1409         int cpu;
1410
1411         for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
1412                 /* Skip this CPU if the source task cannot migrate */
1413                 if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(env->p)))
1414                         continue;
1415
1416                 env->dst_cpu = cpu;
1417                 task_numa_compare(env, taskimp, groupimp);
1418         }
1419 }
1420
1421 /* Only move tasks to a NUMA node less busy than the current node. */
1422 static bool numa_has_capacity(struct task_numa_env *env)
1423 {
1424         struct numa_stats *src = &env->src_stats;
1425         struct numa_stats *dst = &env->dst_stats;
1426
1427         if (src->has_free_capacity && !dst->has_free_capacity)
1428                 return false;
1429
1430         /*
1431          * Only consider a task move if the source has a higher load
1432          * than the destination, corrected for CPU capacity on each node.
1433          *
1434          *      src->load                dst->load
1435          * --------------------- vs ---------------------
1436          * src->compute_capacity    dst->compute_capacity
1437          */
1438         if (src->load * dst->compute_capacity * env->imbalance_pct >
1439
1440             dst->load * src->compute_capacity * 100)
1441                 return true;
1442
1443         return false;
1444 }
1445
1446 static int task_numa_migrate(struct task_struct *p)
1447 {
1448         struct task_numa_env env = {
1449                 .p = p,
1450
1451                 .src_cpu = task_cpu(p),
1452                 .src_nid = task_node(p),
1453
1454                 .imbalance_pct = 112,
1455
1456                 .best_task = NULL,
1457                 .best_imp = 0,
1458                 .best_cpu = -1
1459         };
1460         struct sched_domain *sd;
1461         unsigned long taskweight, groupweight;
1462         int nid, ret, dist;
1463         long taskimp, groupimp;
1464
1465         /*
1466          * Pick the lowest SD_NUMA domain, as that would have the smallest
1467          * imbalance and would be the first to start moving tasks about.
1468          *
1469          * And we want to avoid any moving of tasks about, as that would create
1470          * random movement of tasks -- counter the numa conditions we're trying
1471          * to satisfy here.
1472          */
1473         rcu_read_lock();
1474         sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
1475         if (sd)
1476                 env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
1477         rcu_read_unlock();
1478
1479         /*
1480          * Cpusets can break the scheduler domain tree into smaller
1481          * balance domains, some of which do not cross NUMA boundaries.
1482          * Tasks that are "trapped" in such domains cannot be migrated
1483          * elsewhere, so there is no point in (re)trying.
1484          */
1485         if (unlikely(!sd)) {
1486                 p->numa_preferred_nid = task_node(p);
1487                 return -EINVAL;
1488         }
1489
1490         env.dst_nid = p->numa_preferred_nid;
1491         dist = env.dist = node_distance(env.src_nid, env.dst_nid);
1492         taskweight = task_weight(p, env.src_nid, dist);
1493         groupweight = group_weight(p, env.src_nid, dist);
1494         update_numa_stats(&env.src_stats, env.src_nid);
1495         taskimp = task_weight(p, env.dst_nid, dist) - taskweight;
1496         groupimp = group_weight(p, env.dst_nid, dist) - groupweight;
1497         update_numa_stats(&env.dst_stats, env.dst_nid);
1498
1499         /* Try to find a spot on the preferred nid. */
1500         if (numa_has_capacity(&env))
1501                 task_numa_find_cpu(&env, taskimp, groupimp);
1502
1503         /*
1504          * Look at other nodes in these cases:
1505          * - there is no space available on the preferred_nid
1506          * - the task is part of a numa_group that is interleaved across
1507          *   multiple NUMA nodes; in order to better consolidate the group,
1508          *   we need to check other locations.
1509          */
1510         if (env.best_cpu == -1 || (p->numa_group &&
1511                         nodes_weight(p->numa_group->active_nodes) > 1)) {
1512                 for_each_online_node(nid) {
1513                         if (nid == env.src_nid || nid == p->numa_preferred_nid)
1514                                 continue;
1515
1516                         dist = node_distance(env.src_nid, env.dst_nid);
1517                         if (sched_numa_topology_type == NUMA_BACKPLANE &&
1518                                                 dist != env.dist) {
1519                                 taskweight = task_weight(p, env.src_nid, dist);
1520                                 groupweight = group_weight(p, env.src_nid, dist);
1521                         }
1522
1523                         /* Only consider nodes where both task and groups benefit */
1524                         taskimp = task_weight(p, nid, dist) - taskweight;
1525                         groupimp = group_weight(p, nid, dist) - groupweight;
1526                         if (taskimp < 0 && groupimp < 0)
1527                                 continue;
1528
1529                         env.dist = dist;
1530                         env.dst_nid = nid;
1531                         update_numa_stats(&env.dst_stats, env.dst_nid);
1532                         if (numa_has_capacity(&env))
1533                                 task_numa_find_cpu(&env, taskimp, groupimp);
1534                 }
1535         }
1536
1537         /*
1538          * If the task is part of a workload that spans multiple NUMA nodes,
1539          * and is migrating into one of the workload's active nodes, remember
1540          * this node as the task's preferred numa node, so the workload can
1541          * settle down.
1542          * A task that migrated to a second choice node will be better off
1543          * trying for a better one later. Do not set the preferred node here.
1544          */
1545         if (p->numa_group) {
1546                 if (env.best_cpu == -1)
1547                         nid = env.src_nid;
1548                 else
1549                         nid = env.dst_nid;
1550
1551                 if (node_isset(nid, p->numa_group->active_nodes))
1552                         sched_setnuma(p, env.dst_nid);
1553         }
1554
1555         /* No better CPU than the current one was found. */
1556         if (env.best_cpu == -1)
1557                 return -EAGAIN;
1558
1559         /*
1560          * Reset the scan period if the task is being rescheduled on an
1561          * alternative node to recheck if the tasks is now properly placed.
1562          */
1563         p->numa_scan_period = task_scan_min(p);
1564
1565         if (env.best_task == NULL) {
1566                 ret = migrate_task_to(p, env.best_cpu);
1567                 if (ret != 0)
1568                         trace_sched_stick_numa(p, env.src_cpu, env.best_cpu);
1569                 return ret;
1570         }
1571
1572         ret = migrate_swap(p, env.best_task);
1573         if (ret != 0)
1574                 trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task));
1575         put_task_struct(env.best_task);
1576         return ret;
1577 }
1578
1579 /* Attempt to migrate a task to a CPU on the preferred node. */
1580 static void numa_migrate_preferred(struct task_struct *p)
1581 {
1582         unsigned long interval = HZ;
1583
1584         /* This task has no NUMA fault statistics yet */
1585         if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults))
1586                 return;
1587
1588         /* Periodically retry migrating the task to the preferred node */
1589         interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16);
1590         p->numa_migrate_retry = jiffies + interval;
1591
1592         /* Success if task is already running on preferred CPU */
1593         if (task_node(p) == p->numa_preferred_nid)
1594                 return;
1595
1596         /* Otherwise, try migrate to a CPU on the preferred node */
1597         task_numa_migrate(p);
1598 }
1599
1600 /*
1601  * Find the nodes on which the workload is actively running. We do this by
1602  * tracking the nodes from which NUMA hinting faults are triggered. This can
1603  * be different from the set of nodes where the workload's memory is currently
1604  * located.
1605  *
1606  * The bitmask is used to make smarter decisions on when to do NUMA page
1607  * migrations, To prevent flip-flopping, and excessive page migrations, nodes
1608  * are added when they cause over 6/16 of the maximum number of faults, but
1609  * only removed when they drop below 3/16.
1610  */
1611 static void update_numa_active_node_mask(struct numa_group *numa_group)
1612 {
1613         unsigned long faults, max_faults = 0;
1614         int nid;
1615
1616         for_each_online_node(nid) {
1617                 faults = group_faults_cpu(numa_group, nid);
1618                 if (faults > max_faults)
1619                         max_faults = faults;
1620         }
1621
1622         for_each_online_node(nid) {
1623                 faults = group_faults_cpu(numa_group, nid);
1624                 if (!node_isset(nid, numa_group->active_nodes)) {
1625                         if (faults > max_faults * 6 / 16)
1626                                 node_set(nid, numa_group->active_nodes);
1627                 } else if (faults < max_faults * 3 / 16)
1628                         node_clear(nid, numa_group->active_nodes);
1629         }
1630 }
1631
1632 /*
1633  * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS
1634  * increments. The more local the fault statistics are, the higher the scan
1635  * period will be for the next scan window. If local/(local+remote) ratio is
1636  * below NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS)
1637  * the scan period will decrease. Aim for 70% local accesses.
1638  */
1639 #define NUMA_PERIOD_SLOTS 10
1640 #define NUMA_PERIOD_THRESHOLD 7
1641
1642 /*
1643  * Increase the scan period (slow down scanning) if the majority of
1644  * our memory is already on our local node, or if the majority of
1645  * the page accesses are shared with other processes.
1646  * Otherwise, decrease the scan period.
1647  */
1648 static void update_task_scan_period(struct task_struct *p,
1649                         unsigned long shared, unsigned long private)
1650 {
1651         unsigned int period_slot;
1652         int ratio;
1653         int diff;
1654
1655         unsigned long remote = p->numa_faults_locality[0];
1656         unsigned long local = p->numa_faults_locality[1];
1657
1658         /*
1659          * If there were no record hinting faults then either the task is
1660          * completely idle or all activity is areas that are not of interest
1661          * to automatic numa balancing. Related to that, if there were failed
1662          * migration then it implies we are migrating too quickly or the local
1663          * node is overloaded. In either case, scan slower
1664          */
1665         if (local + shared == 0 || p->numa_faults_locality[2]) {
1666                 p->numa_scan_period = min(p->numa_scan_period_max,
1667                         p->numa_scan_period << 1);
1668
1669                 p->mm->numa_next_scan = jiffies +
1670                         msecs_to_jiffies(p->numa_scan_period);
1671
1672                 return;
1673         }
1674
1675         /*
1676          * Prepare to scale scan period relative to the current period.
1677          *       == NUMA_PERIOD_THRESHOLD scan period stays the same
1678          *       <  NUMA_PERIOD_THRESHOLD scan period decreases (scan faster)
1679          *       >= NUMA_PERIOD_THRESHOLD scan period increases (scan slower)
1680          */
1681         period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS);
1682         ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
1683         if (ratio >= NUMA_PERIOD_THRESHOLD) {
1684                 int slot = ratio - NUMA_PERIOD_THRESHOLD;
1685                 if (!slot)
1686                         slot = 1;
1687                 diff = slot * period_slot;
1688         } else {
1689                 diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
1690
1691                 /*
1692                  * Scale scan rate increases based on sharing. There is an
1693                  * inverse relationship between the degree of sharing and
1694                  * the adjustment made to the scanning period. Broadly
1695                  * speaking the intent is that there is little point
1696                  * scanning faster if shared accesses dominate as it may
1697                  * simply bounce migrations uselessly
1698                  */
1699                 ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared + 1));
1700                 diff = (diff * ratio) / NUMA_PERIOD_SLOTS;
1701         }
1702
1703         p->numa_scan_period = clamp(p->numa_scan_period + diff,
1704                         task_scan_min(p), task_scan_max(p));
1705         memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
1706 }
1707
1708 /*
1709  * Get the fraction of time the task has been running since the last
1710  * NUMA placement cycle. The scheduler keeps similar statistics, but
1711  * decays those on a 32ms period, which is orders of magnitude off
1712  * from the dozens-of-seconds NUMA balancing period. Use the scheduler
1713  * stats only if the task is so new there are no NUMA statistics yet.
1714  */
1715 static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
1716 {
1717         u64 runtime, delta, now;
1718         /* Use the start of this time slice to avoid calculations. */
1719         now = p->se.exec_start;
1720         runtime = p->se.sum_exec_runtime;
1721
1722         if (p->last_task_numa_placement) {
1723                 delta = runtime - p->last_sum_exec_runtime;
1724                 *period = now - p->last_task_numa_placement;
1725         } else {
1726                 delta = p->se.avg.load_sum / p->se.load.weight;
1727                 *period = LOAD_AVG_MAX;
1728         }
1729
1730         p->last_sum_exec_runtime = runtime;
1731         p->last_task_numa_placement = now;
1732
1733         return delta;
1734 }
1735
1736 /*
1737  * Determine the preferred nid for a task in a numa_group. This needs to
1738  * be done in a way that produces consistent results with group_weight,
1739  * otherwise workloads might not converge.
1740  */
1741 static int preferred_group_nid(struct task_struct *p, int nid)
1742 {
1743         nodemask_t nodes;
1744         int dist;
1745
1746         /* Direct connections between all NUMA nodes. */
1747         if (sched_numa_topology_type == NUMA_DIRECT)
1748                 return nid;
1749
1750         /*
1751          * On a system with glueless mesh NUMA topology, group_weight
1752          * scores nodes according to the number of NUMA hinting faults on
1753          * both the node itself, and on nearby nodes.
1754          */
1755         if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
1756                 unsigned long score, max_score = 0;
1757                 int node, max_node = nid;
1758
1759                 dist = sched_max_numa_distance;
1760
1761                 for_each_online_node(node) {
1762                         score = group_weight(p, node, dist);
1763                         if (score > max_score) {
1764                                 max_score = score;
1765                                 max_node = node;
1766                         }
1767                 }
1768                 return max_node;
1769         }
1770
1771         /*
1772          * Finding the preferred nid in a system with NUMA backplane
1773          * interconnect topology is more involved. The goal is to locate
1774          * tasks from numa_groups near each other in the system, and
1775          * untangle workloads from different sides of the system. This requires
1776          * searching down the hierarchy of node groups, recursively searching
1777          * inside the highest scoring group of nodes. The nodemask tricks
1778          * keep the complexity of the search down.
1779          */
1780         nodes = node_online_map;
1781         for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) {
1782                 unsigned long max_faults = 0;
1783                 nodemask_t max_group = NODE_MASK_NONE;
1784                 int a, b;
1785
1786                 /* Are there nodes at this distance from each other? */
1787                 if (!find_numa_distance(dist))
1788                         continue;
1789
1790                 for_each_node_mask(a, nodes) {
1791                         unsigned long faults = 0;
1792                         nodemask_t this_group;
1793                         nodes_clear(this_group);
1794
1795                         /* Sum group's NUMA faults; includes a==b case. */
1796                         for_each_node_mask(b, nodes) {
1797                                 if (node_distance(a, b) < dist) {
1798                                         faults += group_faults(p, b);
1799                                         node_set(b, this_group);
1800                                         node_clear(b, nodes);
1801                                 }
1802                         }
1803
1804                         /* Remember the top group. */
1805                         if (faults > max_faults) {
1806                                 max_faults = faults;
1807                                 max_group = this_group;
1808                                 /*
1809                                  * subtle: at the smallest distance there is
1810                                  * just one node left in each "group", the
1811                                  * winner is the preferred nid.
1812                                  */
1813                                 nid = a;
1814                         }
1815                 }
1816                 /* Next round, evaluate the nodes within max_group. */
1817                 if (!max_faults)
1818                         break;
1819                 nodes = max_group;
1820         }
1821         return nid;
1822 }
1823
1824 static void task_numa_placement(struct task_struct *p)
1825 {
1826         int seq, nid, max_nid = -1, max_group_nid = -1;
1827         unsigned long max_faults = 0, max_group_faults = 0;
1828         unsigned long fault_types[2] = { 0, 0 };
1829         unsigned long total_faults;
1830         u64 runtime, period;
1831         spinlock_t *group_lock = NULL;
1832
1833         /*
1834          * The p->mm->numa_scan_seq field gets updated without
1835          * exclusive access. Use READ_ONCE() here to ensure
1836          * that the field is read in a single access:
1837          */
1838         seq = READ_ONCE(p->mm->numa_scan_seq);
1839         if (p->numa_scan_seq == seq)
1840                 return;
1841         p->numa_scan_seq = seq;
1842         p->numa_scan_period_max = task_scan_max(p);
1843
1844         total_faults = p->numa_faults_locality[0] +
1845                        p->numa_faults_locality[1];
1846         runtime = numa_get_avg_runtime(p, &period);
1847
1848         /* If the task is part of a group prevent parallel updates to group stats */
1849         if (p->numa_group) {
1850                 group_lock = &p->numa_group->lock;
1851                 spin_lock_irq(group_lock);
1852         }
1853
1854         /* Find the node with the highest number of faults */
1855         for_each_online_node(nid) {
1856                 /* Keep track of the offsets in numa_faults array */
1857                 int mem_idx, membuf_idx, cpu_idx, cpubuf_idx;
1858                 unsigned long faults = 0, group_faults = 0;
1859                 int priv;
1860
1861                 for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) {
1862                         long diff, f_diff, f_weight;
1863
1864                         mem_idx = task_faults_idx(NUMA_MEM, nid, priv);
1865                         membuf_idx = task_faults_idx(NUMA_MEMBUF, nid, priv);
1866                         cpu_idx = task_faults_idx(NUMA_CPU, nid, priv);
1867                         cpubuf_idx = task_faults_idx(NUMA_CPUBUF, nid, priv);
1868
1869                         /* Decay existing window, copy faults since last scan */
1870                         diff = p->numa_faults[membuf_idx] - p->numa_faults[mem_idx] / 2;
1871                         fault_types[priv] += p->numa_faults[membuf_idx];
1872                         p->numa_faults[membuf_idx] = 0;
1873
1874                         /*
1875                          * Normalize the faults_from, so all tasks in a group
1876                          * count according to CPU use, instead of by the raw
1877                          * number of faults. Tasks with little runtime have
1878                          * little over-all impact on throughput, and thus their
1879                          * faults are less important.
1880                          */
1881                         f_weight = div64_u64(runtime << 16, period + 1);
1882                         f_weight = (f_weight * p->numa_faults[cpubuf_idx]) /
1883                                    (total_faults + 1);
1884                         f_diff = f_weight - p->numa_faults[cpu_idx] / 2;
1885                         p->numa_faults[cpubuf_idx] = 0;
1886
1887                         p->numa_faults[mem_idx] += diff;
1888                         p->numa_faults[cpu_idx] += f_diff;
1889                         faults += p->numa_faults[mem_idx];
1890                         p->total_numa_faults += diff;
1891                         if (p->numa_group) {
1892                                 /*
1893                                  * safe because we can only change our own group
1894                                  *
1895                                  * mem_idx represents the offset for a given
1896                                  * nid and priv in a specific region because it
1897                                  * is at the beginning of the numa_faults array.
1898                                  */
1899                                 p->numa_group->faults[mem_idx] += diff;
1900                                 p->numa_group->faults_cpu[mem_idx] += f_diff;
1901                                 p->numa_group->total_faults += diff;
1902                                 group_faults += p->numa_group->faults[mem_idx];
1903                         }
1904                 }
1905
1906                 if (faults > max_faults) {
1907                         max_faults = faults;
1908                         max_nid = nid;
1909                 }
1910
1911                 if (group_faults > max_group_faults) {
1912                         max_group_faults = group_faults;
1913                         max_group_nid = nid;
1914                 }
1915         }
1916
1917         update_task_scan_period(p, fault_types[0], fault_types[1]);
1918
1919         if (p->numa_group) {
1920                 update_numa_active_node_mask(p->numa_group);
1921                 spin_unlock_irq(group_lock);
1922                 max_nid = preferred_group_nid(p, max_group_nid);
1923         }
1924
1925         if (max_faults) {
1926                 /* Set the new preferred node */
1927                 if (max_nid != p->numa_preferred_nid)
1928                         sched_setnuma(p, max_nid);
1929
1930                 if (task_node(p) != p->numa_preferred_nid)
1931                         numa_migrate_preferred(p);
1932         }
1933 }
1934
1935 static inline int get_numa_group(struct numa_group *grp)
1936 {
1937         return atomic_inc_not_zero(&grp->refcount);
1938 }
1939
1940 static inline void put_numa_group(struct numa_group *grp)
1941 {
1942         if (atomic_dec_and_test(&grp->refcount))
1943                 kfree_rcu(grp, rcu);
1944 }
1945
1946 static void task_numa_group(struct task_struct *p, int cpupid, int flags,
1947                         int *priv)
1948 {
1949         struct numa_group *grp, *my_grp;
1950         struct task_struct *tsk;
1951         bool join = false;
1952         int cpu = cpupid_to_cpu(cpupid);
1953         int i;
1954
1955         if (unlikely(!p->numa_group)) {
1956                 unsigned int size = sizeof(struct numa_group) +
1957                                     4*nr_node_ids*sizeof(unsigned long);
1958
1959                 grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
1960                 if (!grp)
1961                         return;
1962
1963                 atomic_set(&grp->refcount, 1);
1964                 spin_lock_init(&grp->lock);
1965                 grp->gid = p->pid;
1966                 /* Second half of the array tracks nids where faults happen */
1967                 grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES *
1968                                                 nr_node_ids;
1969
1970                 node_set(task_node(current), grp->active_nodes);
1971
1972                 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
1973                         grp->faults[i] = p->numa_faults[i];
1974
1975                 grp->total_faults = p->total_numa_faults;
1976
1977                 grp->nr_tasks++;
1978                 rcu_assign_pointer(p->numa_group, grp);
1979         }
1980
1981         rcu_read_lock();
1982         tsk = READ_ONCE(cpu_rq(cpu)->curr);
1983
1984         if (!cpupid_match_pid(tsk, cpupid))
1985                 goto no_join;
1986
1987         grp = rcu_dereference(tsk->numa_group);
1988         if (!grp)
1989                 goto no_join;
1990
1991         my_grp = p->numa_group;
1992         if (grp == my_grp)
1993                 goto no_join;
1994
1995         /*
1996          * Only join the other group if its bigger; if we're the bigger group,
1997          * the other task will join us.
1998          */
1999         if (my_grp->nr_tasks > grp->nr_tasks)
2000                 goto no_join;
2001
2002         /*
2003          * Tie-break on the grp address.
2004          */
2005         if (my_grp->nr_tasks == grp->nr_tasks && my_grp > grp)
2006                 goto no_join;
2007
2008         /* Always join threads in the same process. */
2009         if (tsk->mm == current->mm)
2010                 join = true;
2011
2012         /* Simple filter to avoid false positives due to PID collisions */
2013         if (flags & TNF_SHARED)
2014                 join = true;
2015
2016         /* Update priv based on whether false sharing was detected */
2017         *priv = !join;
2018
2019         if (join && !get_numa_group(grp))
2020                 goto no_join;
2021
2022         rcu_read_unlock();
2023
2024         if (!join)
2025                 return;
2026
2027         BUG_ON(irqs_disabled());
2028         double_lock_irq(&my_grp->lock, &grp->lock);
2029
2030         for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) {
2031                 my_grp->faults[i] -= p->numa_faults[i];
2032                 grp->faults[i] += p->numa_faults[i];
2033         }
2034         my_grp->total_faults -= p->total_numa_faults;
2035         grp->total_faults += p->total_numa_faults;
2036
2037         my_grp->nr_tasks--;
2038         grp->nr_tasks++;
2039
2040         spin_unlock(&my_grp->lock);
2041         spin_unlock_irq(&grp->lock);
2042
2043         rcu_assign_pointer(p->numa_group, grp);
2044
2045         put_numa_group(my_grp);
2046         return;
2047
2048 no_join:
2049         rcu_read_unlock();
2050         return;
2051 }
2052
2053 void task_numa_free(struct task_struct *p)
2054 {
2055         struct numa_group *grp = p->numa_group;
2056         void *numa_faults = p->numa_faults;
2057         unsigned long flags;
2058         int i;
2059
2060         if (grp) {
2061                 spin_lock_irqsave(&grp->lock, flags);
2062                 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
2063                         grp->faults[i] -= p->numa_faults[i];
2064                 grp->total_faults -= p->total_numa_faults;
2065
2066                 grp->nr_tasks--;
2067                 spin_unlock_irqrestore(&grp->lock, flags);
2068                 RCU_INIT_POINTER(p->numa_group, NULL);
2069                 put_numa_group(grp);
2070         }
2071
2072         p->numa_faults = NULL;
2073         kfree(numa_faults);
2074 }
2075
2076 /*
2077  * Got a PROT_NONE fault for a page on @node.
2078  */
2079 void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
2080 {
2081         struct task_struct *p = current;
2082         bool migrated = flags & TNF_MIGRATED;
2083         int cpu_node = task_node(current);
2084         int local = !!(flags & TNF_FAULT_LOCAL);
2085         int priv;
2086
2087         if (!static_branch_likely(&sched_numa_balancing))
2088                 return;
2089
2090         /* for example, ksmd faulting in a user's mm */
2091         if (!p->mm)
2092                 return;
2093
2094         /* Allocate buffer to track faults on a per-node basis */
2095         if (unlikely(!p->numa_faults)) {
2096                 int size = sizeof(*p->numa_faults) *
2097                            NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids;
2098
2099                 p->numa_faults = kzalloc(size, GFP_KERNEL|__GFP_NOWARN);
2100                 if (!p->numa_faults)
2101                         return;
2102
2103                 p->total_numa_faults = 0;
2104                 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
2105         }
2106
2107         /*
2108          * First accesses are treated as private, otherwise consider accesses
2109          * to be private if the accessing pid has not changed
2110          */
2111         if (unlikely(last_cpupid == (-1 & LAST_CPUPID_MASK))) {
2112                 priv = 1;
2113         } else {
2114                 priv = cpupid_match_pid(p, last_cpupid);
2115                 if (!priv && !(flags & TNF_NO_GROUP))
2116                         task_numa_group(p, last_cpupid, flags, &priv);
2117         }
2118
2119         /*
2120          * If a workload spans multiple NUMA nodes, a shared fault that
2121          * occurs wholly within the set of nodes that the workload is
2122          * actively using should be counted as local. This allows the
2123          * scan rate to slow down when a workload has settled down.
2124          */
2125         if (!priv && !local && p->numa_group &&
2126                         node_isset(cpu_node, p->numa_group->active_nodes) &&
2127                         node_isset(mem_node, p->numa_group->active_nodes))
2128                 local = 1;
2129
2130         task_numa_placement(p);
2131
2132         /*
2133          * Retry task to preferred node migration periodically, in case it
2134          * case it previously failed, or the scheduler moved us.
2135          */
2136         if (time_after(jiffies, p->numa_migrate_retry))
2137                 numa_migrate_preferred(p);
2138
2139         if (migrated)
2140                 p->numa_pages_migrated += pages;
2141         if (flags & TNF_MIGRATE_FAIL)
2142                 p->numa_faults_locality[2] += pages;
2143
2144         p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages;
2145         p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages;
2146         p->numa_faults_locality[local] += pages;
2147 }
2148
2149 static void reset_ptenuma_scan(struct task_struct *p)
2150 {
2151         /*
2152          * We only did a read acquisition of the mmap sem, so
2153          * p->mm->numa_scan_seq is written to without exclusive access
2154          * and the update is not guaranteed to be atomic. That's not
2155          * much of an issue though, since this is just used for
2156          * statistical sampling. Use READ_ONCE/WRITE_ONCE, which are not
2157          * expensive, to avoid any form of compiler optimizations:
2158          */
2159         WRITE_ONCE(p->mm->numa_scan_seq, READ_ONCE(p->mm->numa_scan_seq) + 1);
2160         p->mm->numa_scan_offset = 0;
2161 }
2162
2163 /*
2164  * The expensive part of numa migration is done from task_work context.
2165  * Triggered from task_tick_numa().
2166  */
2167 void task_numa_work(struct callback_head *work)
2168 {
2169         unsigned long migrate, next_scan, now = jiffies;
2170         struct task_struct *p = current;
2171         struct mm_struct *mm = p->mm;
2172         struct vm_area_struct *vma;
2173         unsigned long start, end;
2174         unsigned long nr_pte_updates = 0;
2175         long pages, virtpages;
2176
2177         WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
2178
2179         work->next = work; /* protect against double add */
2180         /*
2181          * Who cares about NUMA placement when they're dying.
2182          *
2183          * NOTE: make sure not to dereference p->mm before this check,
2184          * exit_task_work() happens _after_ exit_mm() so we could be called
2185          * without p->mm even though we still had it when we enqueued this
2186          * work.
2187          */
2188         if (p->flags & PF_EXITING)
2189                 return;
2190
2191         if (!mm->numa_next_scan) {
2192                 mm->numa_next_scan = now +
2193                         msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
2194         }
2195
2196         /*
2197          * Enforce maximal scan/migration frequency..
2198          */
2199         migrate = mm->numa_next_scan;
2200         if (time_before(now, migrate))
2201                 return;
2202
2203         if (p->numa_scan_period == 0) {
2204                 p->numa_scan_period_max = task_scan_max(p);
2205                 p->numa_scan_period = task_scan_min(p);
2206         }
2207
2208         next_scan = now + msecs_to_jiffies(p->numa_scan_period);
2209         if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
2210                 return;
2211
2212         /*
2213          * Delay this task enough that another task of this mm will likely win
2214          * the next time around.
2215          */
2216         p->node_stamp += 2 * TICK_NSEC;
2217
2218         start = mm->numa_scan_offset;
2219         pages = sysctl_numa_balancing_scan_size;
2220         pages <<= 20 - PAGE_SHIFT; /* MB in pages */
2221         virtpages = pages * 8;     /* Scan up to this much virtual space */
2222         if (!pages)
2223                 return;
2224
2225
2226         down_read(&mm->mmap_sem);
2227         vma = find_vma(mm, start);
2228         if (!vma) {
2229                 reset_ptenuma_scan(p);
2230                 start = 0;
2231                 vma = mm->mmap;
2232         }
2233         for (; vma; vma = vma->vm_next) {
2234                 if (!vma_migratable(vma) || !vma_policy_mof(vma) ||
2235                         is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) {
2236                         continue;
2237                 }
2238
2239                 /*
2240                  * Shared library pages mapped by multiple processes are not
2241                  * migrated as it is expected they are cache replicated. Avoid
2242                  * hinting faults in read-only file-backed mappings or the vdso
2243                  * as migrating the pages will be of marginal benefit.
2244                  */
2245                 if (!vma->vm_mm ||
2246                     (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ)))
2247                         continue;
2248
2249                 /*
2250                  * Skip inaccessible VMAs to avoid any confusion between
2251                  * PROT_NONE and NUMA hinting ptes
2252                  */
2253                 if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
2254                         continue;
2255
2256                 do {
2257                         start = max(start, vma->vm_start);
2258                         end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
2259                         end = min(end, vma->vm_end);
2260                         nr_pte_updates = change_prot_numa(vma, start, end);
2261
2262                         /*
2263                          * Try to scan sysctl_numa_balancing_size worth of
2264                          * hpages that have at least one present PTE that
2265                          * is not already pte-numa. If the VMA contains
2266                          * areas that are unused or already full of prot_numa
2267                          * PTEs, scan up to virtpages, to skip through those
2268                          * areas faster.
2269                          */
2270                         if (nr_pte_updates)
2271                                 pages -= (end - start) >> PAGE_SHIFT;
2272                         virtpages -= (end - start) >> PAGE_SHIFT;
2273
2274                         start = end;
2275                         if (pages <= 0 || virtpages <= 0)
2276                                 goto out;
2277
2278                         cond_resched();
2279                 } while (end != vma->vm_end);
2280         }
2281
2282 out:
2283         /*
2284          * It is possible to reach the end of the VMA list but the last few
2285          * VMAs are not guaranteed to the vma_migratable. If they are not, we
2286          * would find the !migratable VMA on the next scan but not reset the
2287          * scanner to the start so check it now.
2288          */
2289         if (vma)
2290                 mm->numa_scan_offset = start;
2291         else
2292                 reset_ptenuma_scan(p);
2293         up_read(&mm->mmap_sem);
2294 }
2295
2296 /*
2297  * Drive the periodic memory faults..
2298  */
2299 void task_tick_numa(struct rq *rq, struct task_struct *curr)
2300 {
2301         struct callback_head *work = &curr->numa_work;
2302         u64 period, now;
2303
2304         /*
2305          * We don't care about NUMA placement if we don't have memory.
2306          */
2307         if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work)
2308                 return;
2309
2310         /*
2311          * Using runtime rather than walltime has the dual advantage that
2312          * we (mostly) drive the selection from busy threads and that the
2313          * task needs to have done some actual work before we bother with
2314          * NUMA placement.
2315          */
2316         now = curr->se.sum_exec_runtime;
2317         period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
2318
2319         if (now > curr->node_stamp + period) {
2320                 if (!curr->node_stamp)
2321                         curr->numa_scan_period = task_scan_min(curr);
2322                 curr->node_stamp += period;
2323
2324                 if (!time_before(jiffies, curr->mm->numa_next_scan)) {
2325                         init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
2326                         task_work_add(curr, work, true);
2327                 }
2328         }
2329 }
2330 #else
2331 static void task_tick_numa(struct rq *rq, struct task_struct *curr)
2332 {
2333 }
2334
2335 static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p)
2336 {
2337 }
2338
2339 static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)
2340 {
2341 }
2342 #endif /* CONFIG_NUMA_BALANCING */
2343
2344 static void
2345 account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
2346 {
2347         update_load_add(&cfs_rq->load, se->load.weight);
2348         if (!parent_entity(se))
2349                 update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
2350 #ifdef CONFIG_SMP
2351         if (entity_is_task(se)) {
2352                 struct rq *rq = rq_of(cfs_rq);
2353
2354                 account_numa_enqueue(rq, task_of(se));
2355                 list_add(&se->group_node, &rq->cfs_tasks);
2356         }
2357 #endif
2358         cfs_rq->nr_running++;
2359 }
2360
2361 static void
2362 account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
2363 {
2364         update_load_sub(&cfs_rq->load, se->load.weight);
2365         if (!parent_entity(se))
2366                 update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
2367         if (entity_is_task(se)) {
2368                 account_numa_dequeue(rq_of(cfs_rq), task_of(se));
2369                 list_del_init(&se->group_node);
2370         }
2371         cfs_rq->nr_running--;
2372 }
2373
2374 #ifdef CONFIG_FAIR_GROUP_SCHED
2375 # ifdef CONFIG_SMP
2376 static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
2377 {
2378         long tg_weight;
2379
2380         /*
2381          * Use this CPU's real-time load instead of the last load contribution
2382          * as the updating of the contribution is delayed, and we will use the
2383          * the real-time load to calc the share. See update_tg_load_avg().
2384          */
2385         tg_weight = atomic_long_read(&tg->load_avg);
2386         tg_weight -= cfs_rq->tg_load_avg_contrib;
2387         tg_weight += cfs_rq->load.weight;
2388
2389         return tg_weight;
2390 }
2391
2392 static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
2393 {
2394         long tg_weight, load, shares;
2395
2396         tg_weight = calc_tg_weight(tg, cfs_rq);
2397         load = cfs_rq->load.weight;
2398
2399         shares = (tg->shares * load);
2400         if (tg_weight)
2401                 shares /= tg_weight;
2402
2403         if (shares < MIN_SHARES)
2404                 shares = MIN_SHARES;
2405         if (shares > tg->shares)
2406                 shares = tg->shares;
2407
2408         return shares;
2409 }
2410 # else /* CONFIG_SMP */
2411 static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
2412 {
2413         return tg->shares;
2414 }
2415 # endif /* CONFIG_SMP */
2416 static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
2417                             unsigned long weight)
2418 {
2419         if (se->on_rq) {
2420                 /* commit outstanding execution time */
2421                 if (cfs_rq->curr == se)
2422                         update_curr(cfs_rq);
2423                 account_entity_dequeue(cfs_rq, se);
2424         }
2425
2426         update_load_set(&se->load, weight);
2427
2428         if (se->on_rq)
2429                 account_entity_enqueue(cfs_rq, se);
2430 }
2431
2432 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
2433
2434 static void update_cfs_shares(struct cfs_rq *cfs_rq)
2435 {
2436         struct task_group *tg;
2437         struct sched_entity *se;
2438         long shares;
2439
2440         tg = cfs_rq->tg;
2441         se = tg->se[cpu_of(rq_of(cfs_rq))];
2442         if (!se || throttled_hierarchy(cfs_rq))
2443                 return;
2444 #ifndef CONFIG_SMP
2445         if (likely(se->load.weight == tg->shares))
2446                 return;
2447 #endif
2448         shares = calc_cfs_shares(cfs_rq, tg);
2449
2450         reweight_entity(cfs_rq_of(se), se, shares);
2451 }
2452 #else /* CONFIG_FAIR_GROUP_SCHED */
2453 static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
2454 {
2455 }
2456 #endif /* CONFIG_FAIR_GROUP_SCHED */
2457
2458 #ifdef CONFIG_SMP
2459 /* Precomputed fixed inverse multiplies for multiplication by y^n */
2460 static const u32 runnable_avg_yN_inv[] = {
2461         0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
2462         0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
2463         0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
2464         0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9,
2465         0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80,
2466         0x85aac367, 0x82cd8698,
2467 };
2468
2469 /*
2470  * Precomputed \Sum y^k { 1<=k<=n }.  These are floor(true_value) to prevent
2471  * over-estimates when re-combining.
2472  */
2473 static const u32 runnable_avg_yN_sum[] = {
2474             0, 1002, 1982, 2941, 3880, 4798, 5697, 6576, 7437, 8279, 9103,
2475          9909,10698,11470,12226,12966,13690,14398,15091,15769,16433,17082,
2476         17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371,
2477 };
2478
2479 /*
2480  * Approximate:
2481  *   val * y^n,    where y^32 ~= 0.5 (~1 scheduling period)
2482  */
2483 static __always_inline u64 decay_load(u64 val, u64 n)
2484 {
2485         unsigned int local_n;
2486
2487         if (!n)
2488                 return val;
2489         else if (unlikely(n > LOAD_AVG_PERIOD * 63))
2490                 return 0;
2491
2492         /* after bounds checking we can collapse to 32-bit */
2493         local_n = n;
2494
2495         /*
2496          * As y^PERIOD = 1/2, we can combine
2497          *    y^n = 1/2^(n/PERIOD) * y^(n%PERIOD)
2498          * With a look-up table which covers y^n (n<PERIOD)
2499          *
2500          * To achieve constant time decay_load.
2501          */
2502         if (unlikely(local_n >= LOAD_AVG_PERIOD)) {
2503                 val >>= local_n / LOAD_AVG_PERIOD;
2504                 local_n %= LOAD_AVG_PERIOD;
2505         }
2506
2507         val = mul_u64_u32_shr(val, runnable_avg_yN_inv[local_n], 32);
2508         return val;
2509 }
2510
2511 /*
2512  * For updates fully spanning n periods, the contribution to runnable
2513  * average will be: \Sum 1024*y^n
2514  *
2515  * We can compute this reasonably efficiently by combining:
2516  *   y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for  n <PERIOD}
2517  */
2518 static u32 __compute_runnable_contrib(u64 n)
2519 {
2520         u32 contrib = 0;
2521
2522         if (likely(n <= LOAD_AVG_PERIOD))
2523                 return runnable_avg_yN_sum[n];
2524         else if (unlikely(n >= LOAD_AVG_MAX_N))
2525                 return LOAD_AVG_MAX;
2526
2527         /* Compute \Sum k^n combining precomputed values for k^i, \Sum k^j */
2528         do {
2529                 contrib /= 2; /* y^LOAD_AVG_PERIOD = 1/2 */
2530                 contrib += runnable_avg_yN_sum[LOAD_AVG_PERIOD];
2531
2532                 n -= LOAD_AVG_PERIOD;
2533         } while (n > LOAD_AVG_PERIOD);
2534
2535         contrib = decay_load(contrib, n);
2536         return contrib + runnable_avg_yN_sum[n];
2537 }
2538
2539 #if (SCHED_LOAD_SHIFT - SCHED_LOAD_RESOLUTION) != 10 || SCHED_CAPACITY_SHIFT != 10
2540 #error "load tracking assumes 2^10 as unit"
2541 #endif
2542
2543 #define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
2544
2545 /*
2546  * We can represent the historical contribution to runnable average as the
2547  * coefficients of a geometric series.  To do this we sub-divide our runnable
2548  * history into segments of approximately 1ms (1024us); label the segment that
2549  * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
2550  *
2551  * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ...
2552  *      p0            p1           p2
2553  *     (now)       (~1ms ago)  (~2ms ago)
2554  *
2555  * Let u_i denote the fraction of p_i that the entity was runnable.
2556  *
2557  * We then designate the fractions u_i as our co-efficients, yielding the
2558  * following representation of historical load:
2559  *   u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ...
2560  *
2561  * We choose y based on the with of a reasonably scheduling period, fixing:
2562  *   y^32 = 0.5
2563  *
2564  * This means that the contribution to load ~32ms ago (u_32) will be weighted
2565  * approximately half as much as the contribution to load within the last ms
2566  * (u_0).
2567  *
2568  * When a period "rolls over" and we have new u_0`, multiplying the previous
2569  * sum again by y is sufficient to update:
2570  *   load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
2571  *            = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
2572  */
2573 static __always_inline int
2574 __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
2575                   unsigned long weight, int running, struct cfs_rq *cfs_rq)
2576 {
2577         u64 delta, scaled_delta, periods;
2578         u32 contrib;
2579         unsigned int delta_w, scaled_delta_w, decayed = 0;
2580         unsigned long scale_freq, scale_cpu;
2581
2582         delta = now - sa->last_update_time;
2583         /*
2584          * This should only happen when time goes backwards, which it
2585          * unfortunately does during sched clock init when we swap over to TSC.
2586          */
2587         if ((s64)delta < 0) {
2588                 sa->last_update_time = now;
2589                 return 0;
2590         }
2591
2592         /*
2593          * Use 1024ns as the unit of measurement since it's a reasonable
2594          * approximation of 1us and fast to compute.
2595          */
2596         delta >>= 10;
2597         if (!delta)
2598                 return 0;
2599         sa->last_update_time = now;
2600
2601         scale_freq = arch_scale_freq_capacity(NULL, cpu);
2602         scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
2603
2604         /* delta_w is the amount already accumulated against our next period */
2605         delta_w = sa->period_contrib;
2606         if (delta + delta_w >= 1024) {
2607                 decayed = 1;
2608
2609                 /* how much left for next period will start over, we don't know yet */
2610                 sa->period_contrib = 0;
2611
2612                 /*
2613                  * Now that we know we're crossing a period boundary, figure
2614                  * out how much from delta we need to complete the current
2615                  * period and accrue it.
2616                  */
2617                 delta_w = 1024 - delta_w;
2618                 scaled_delta_w = cap_scale(delta_w, scale_freq);
2619                 if (weight) {
2620                         sa->load_sum += weight * scaled_delta_w;
2621                         if (cfs_rq) {
2622                                 cfs_rq->runnable_load_sum +=
2623                                                 weight * scaled_delta_w;
2624                         }
2625                 }
2626                 if (running)
2627                         sa->util_sum += scaled_delta_w * scale_cpu;
2628
2629                 delta -= delta_w;
2630
2631                 /* Figure out how many additional periods this update spans */
2632                 periods = delta / 1024;
2633                 delta %= 1024;
2634
2635                 sa->load_sum = decay_load(sa->load_sum, periods + 1);
2636                 if (cfs_rq) {
2637                         cfs_rq->runnable_load_sum =
2638                                 decay_load(cfs_rq->runnable_load_sum, periods + 1);
2639                 }
2640                 sa->util_sum = decay_load((u64)(sa->util_sum), periods + 1);
2641
2642                 /* Efficiently calculate \sum (1..n_period) 1024*y^i */
2643                 contrib = __compute_runnable_contrib(periods);
2644                 contrib = cap_scale(contrib, scale_freq);
2645                 if (weight) {
2646                         sa->load_sum += weight * contrib;
2647                         if (cfs_rq)
2648                                 cfs_rq->runnable_load_sum += weight * contrib;
2649                 }
2650                 if (running)
2651                         sa->util_sum += contrib * scale_cpu;
2652         }
2653
2654         /* Remainder of delta accrued against u_0` */
2655         scaled_delta = cap_scale(delta, scale_freq);
2656         if (weight) {
2657                 sa->load_sum += weight * scaled_delta;
2658                 if (cfs_rq)
2659                         cfs_rq->runnable_load_sum += weight * scaled_delta;
2660         }
2661         if (running)
2662                 sa->util_sum += scaled_delta * scale_cpu;
2663
2664         sa->period_contrib += delta;
2665
2666         if (decayed) {
2667                 sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX);
2668                 if (cfs_rq) {
2669                         cfs_rq->runnable_load_avg =
2670                                 div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX);
2671                 }
2672                 sa->util_avg = sa->util_sum / LOAD_AVG_MAX;
2673         }
2674
2675         return decayed;
2676 }
2677
2678 #ifdef CONFIG_FAIR_GROUP_SCHED
2679 /*
2680  * Updating tg's load_avg is necessary before update_cfs_share (which is done)
2681  * and effective_load (which is not done because it is too costly).
2682  */
2683 static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
2684 {
2685         long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;
2686
2687         if (force || abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
2688                 atomic_long_add(delta, &cfs_rq->tg->load_avg);
2689                 cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg;
2690         }
2691 }
2692
2693 #else /* CONFIG_FAIR_GROUP_SCHED */
2694 static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
2695 #endif /* CONFIG_FAIR_GROUP_SCHED */
2696
2697 static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
2698
2699 /*
2700  * Unsigned subtract and clamp on underflow.
2701  *
2702  * Explicitly do a load-store to ensure the intermediate value never hits
2703  * memory. This allows lockless observations without ever seeing the negative
2704  * values.
2705  */
2706 #define sub_positive(_ptr, _val) do {                           \
2707         typeof(_ptr) ptr = (_ptr);                              \
2708         typeof(*ptr) val = (_val);                              \
2709         typeof(*ptr) res, var = READ_ONCE(*ptr);                \
2710         res = var - val;                                        \
2711         if (res > var)                                          \
2712                 res = 0;                                        \
2713         WRITE_ONCE(*ptr, res);                                  \
2714 } while (0)
2715
2716 /* Group cfs_rq's load_avg is used for task_h_load and update_cfs_share */
2717 static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
2718 {
2719         struct sched_avg *sa = &cfs_rq->avg;
2720         int decayed, removed = 0;
2721
2722         if (atomic_long_read(&cfs_rq->removed_load_avg)) {
2723                 s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
2724                 sub_positive(&sa->load_avg, r);
2725                 sub_positive(&sa->load_sum, r * LOAD_AVG_MAX);
2726                 removed = 1;
2727         }
2728
2729         if (atomic_long_read(&cfs_rq->removed_util_avg)) {
2730                 long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0);
2731                 sub_positive(&sa->util_avg, r);
2732                 sub_positive(&sa->util_sum, r * LOAD_AVG_MAX);
2733         }
2734
2735         decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
2736                 scale_load_down(cfs_rq->load.weight), cfs_rq->curr != NULL, cfs_rq);
2737
2738 #ifndef CONFIG_64BIT
2739         smp_wmb();
2740         cfs_rq->load_last_update_time_copy = sa->last_update_time;
2741 #endif
2742
2743         return decayed || removed;
2744 }
2745
2746 /* Update task and its cfs_rq load average */
2747 static inline void update_load_avg(struct sched_entity *se, int update_tg)
2748 {
2749         struct cfs_rq *cfs_rq = cfs_rq_of(se);
2750         u64 now = cfs_rq_clock_task(cfs_rq);
2751         int cpu = cpu_of(rq_of(cfs_rq));
2752
2753         /*
2754          * Track task load average for carrying it to new CPU after migrated, and
2755          * track group sched_entity load average for task_h_load calc in migration
2756          */
2757         __update_load_avg(now, cpu, &se->avg,
2758                           se->on_rq * scale_load_down(se->load.weight),
2759                           cfs_rq->curr == se, NULL);
2760
2761         if (update_cfs_rq_load_avg(now, cfs_rq) && update_tg)
2762                 update_tg_load_avg(cfs_rq, 0);
2763 }
2764
2765 static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
2766 {
2767         if (!sched_feat(ATTACH_AGE_LOAD))
2768                 goto skip_aging;
2769
2770         /*
2771          * If we got migrated (either between CPUs or between cgroups) we'll
2772          * have aged the average right before clearing @last_update_time.
2773          */
2774         if (se->avg.last_update_time) {
2775                 __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
2776                                   &se->avg, 0, 0, NULL);
2777
2778                 /*
2779                  * XXX: we could have just aged the entire load away if we've been
2780                  * absent from the fair class for too long.
2781                  */
2782         }
2783
2784 skip_aging:
2785         se->avg.last_update_time = cfs_rq->avg.last_update_time;
2786         cfs_rq->avg.load_avg += se->avg.load_avg;
2787         cfs_rq->avg.load_sum += se->avg.load_sum;
2788         cfs_rq->avg.util_avg += se->avg.util_avg;
2789         cfs_rq->avg.util_sum += se->avg.util_sum;
2790 }
2791
2792 static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
2793 {
2794         __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
2795                           &se->avg, se->on_rq * scale_load_down(se->load.weight),
2796                           cfs_rq->curr == se, NULL);
2797
2798         sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
2799         sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum);
2800         sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
2801         sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
2802 }
2803
2804 /* Add the load generated by se into cfs_rq's load average */
2805 static inline void
2806 enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
2807 {
2808         struct sched_avg *sa = &se->avg;
2809         u64 now = cfs_rq_clock_task(cfs_rq);
2810         int migrated, decayed;
2811
2812         migrated = !sa->last_update_time;
2813         if (!migrated) {
2814                 __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
2815                         se->on_rq * scale_load_down(se->load.weight),
2816                         cfs_rq->curr == se, NULL);
2817         }
2818
2819         decayed = update_cfs_rq_load_avg(now, cfs_rq);
2820
2821         cfs_rq->runnable_load_avg += sa->load_avg;
2822         cfs_rq->runnable_load_sum += sa->load_sum;
2823
2824         if (migrated)
2825                 attach_entity_load_avg(cfs_rq, se);
2826
2827         if (decayed || migrated)
2828                 update_tg_load_avg(cfs_rq, 0);
2829 }
2830
2831 /* Remove the runnable load generated by se from cfs_rq's runnable load average */
2832 static inline void
2833 dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
2834 {
2835         update_load_avg(se, 1);
2836
2837         cfs_rq->runnable_load_avg =
2838                 max_t(long, cfs_rq->runnable_load_avg - se->avg.load_avg, 0);
2839         cfs_rq->runnable_load_sum =
2840                 max_t(s64,  cfs_rq->runnable_load_sum - se->avg.load_sum, 0);
2841 }
2842
2843 /*
2844  * Task first catches up with cfs_rq, and then subtract
2845  * itself from the cfs_rq (task must be off the queue now).
2846  */
2847 void remove_entity_load_avg(struct sched_entity *se)
2848 {
2849         struct cfs_rq *cfs_rq = cfs_rq_of(se);
2850         u64 last_update_time;
2851
2852 #ifndef CONFIG_64BIT
2853         u64 last_update_time_copy;
2854
2855         do {
2856                 last_update_time_copy = cfs_rq->load_last_update_time_copy;
2857                 smp_rmb();
2858                 last_update_time = cfs_rq->avg.last_update_time;
2859         } while (last_update_time != last_update_time_copy);
2860 #else
2861         last_update_time = cfs_rq->avg.last_update_time;
2862 #endif
2863
2864         __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL);
2865         atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg);
2866         atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg);
2867 }
2868
2869 /*
2870  * Update the rq's load with the elapsed running time before entering
2871  * idle. if the last scheduled task is not a CFS task, idle_enter will
2872  * be the only way to update the runnable statistic.
2873  */
2874 void idle_enter_fair(struct rq *this_rq)
2875 {
2876 }
2877
2878 /*
2879  * Update the rq's load with the elapsed idle time before a task is
2880  * scheduled. if the newly scheduled task is not a CFS task, idle_exit will
2881  * be the only way to update the runnable statistic.
2882  */
2883 void idle_exit_fair(struct rq *this_rq)
2884 {
2885 }
2886
2887 static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq)
2888 {
2889         return cfs_rq->runnable_load_avg;
2890 }
2891
2892 static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
2893 {
2894         return cfs_rq->avg.load_avg;
2895 }
2896
2897 static int idle_balance(struct rq *this_rq);
2898
2899 #else /* CONFIG_SMP */
2900
2901 static inline void update_load_avg(struct sched_entity *se, int update_tg) {}
2902 static inline void
2903 enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
2904 static inline void
2905 dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
2906 static inline void remove_entity_load_avg(struct sched_entity *se) {}
2907
2908 static inline void
2909 attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
2910 static inline void
2911 detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
2912
2913 static inline int idle_balance(struct rq *rq)
2914 {
2915         return 0;
2916 }
2917
2918 #endif /* CONFIG_SMP */
2919
2920 static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
2921 {
2922 #ifdef CONFIG_SCHEDSTATS
2923         struct task_struct *tsk = NULL;
2924
2925         if (entity_is_task(se))
2926                 tsk = task_of(se);
2927
2928         if (se->statistics.sleep_start) {
2929                 u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.sleep_start;
2930
2931                 if ((s64)delta < 0)
2932                         delta = 0;
2933
2934                 if (unlikely(delta > se->statistics.sleep_max))
2935                         se->statistics.sleep_max = delta;
2936
2937                 se->statistics.sleep_start = 0;
2938                 se->statistics.sum_sleep_runtime += delta;
2939
2940                 if (tsk) {
2941                         account_scheduler_latency(tsk, delta >> 10, 1);
2942                         trace_sched_stat_sleep(tsk, delta);
2943                 }
2944         }
2945         if (se->statistics.block_start) {
2946                 u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.block_start;
2947
2948                 if ((s64)delta < 0)
2949                         delta = 0;
2950
2951                 if (unlikely(delta > se->statistics.block_max))
2952                         se->statistics.block_max = delta;
2953
2954                 se->statistics.block_start = 0;
2955                 se->statistics.sum_sleep_runtime += delta;
2956
2957                 if (tsk) {
2958                         if (tsk->in_iowait) {
2959                                 se->statistics.iowait_sum += delta;
2960                                 se->statistics.iowait_count++;
2961                                 trace_sched_stat_iowait(tsk, delta);
2962                         }
2963
2964                         trace_sched_stat_blocked(tsk, delta);
2965
2966                         /*
2967                          * Blocking time is in units of nanosecs, so shift by
2968                          * 20 to get a milliseconds-range estimation of the
2969                          * amount of time that the task spent sleeping:
2970                          */
2971                         if (unlikely(prof_on == SLEEP_PROFILING)) {
2972                                 profile_hits(SLEEP_PROFILING,
2973                                                 (void *)get_wchan(tsk),
2974                                                 delta >> 20);
2975                         }
2976                         account_scheduler_latency(tsk, delta >> 10, 0);
2977                 }
2978         }
2979 #endif
2980 }
2981
2982 static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
2983 {
2984 #ifdef CONFIG_SCHED_DEBUG
2985         s64 d = se->vruntime - cfs_rq->min_vruntime;
2986
2987         if (d < 0)
2988                 d = -d;
2989
2990         if (d > 3*sysctl_sched_latency)
2991                 schedstat_inc(cfs_rq, nr_spread_over);
2992 #endif
2993 }
2994
2995 static void
2996 place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
2997 {
2998         u64 vruntime = cfs_rq->min_vruntime;
2999
3000         /*
3001          * The 'current' period is already promised to the current tasks,
3002          * however the extra weight of the new task will slow them down a
3003          * little, place the new task so that it fits in the slot that
3004          * stays open at the end.
3005          */
3006         if (initial && sched_feat(START_DEBIT))
3007                 vruntime += sched_vslice(cfs_rq, se);
3008
3009         /* sleeps up to a single latency don't count. */
3010         if (!initial) {
3011                 unsigned long thresh = sysctl_sched_latency;
3012
3013                 /*
3014                  * Halve their sleep time's effect, to allow
3015                  * for a gentler effect of sleepers:
3016                  */
3017                 if (sched_feat(GENTLE_FAIR_SLEEPERS))
3018                         thresh >>= 1;
3019
3020                 vruntime -= thresh;
3021         }
3022
3023         /* ensure we never gain time by being placed backwards. */
3024         se->vruntime = max_vruntime(se->vruntime, vruntime);
3025 }
3026
3027 static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
3028
3029 static void
3030 enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
3031 {
3032         /*
3033          * Update the normalized vruntime before updating min_vruntime
3034          * through calling update_curr().
3035          */
3036         if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))
3037                 se->vruntime += cfs_rq->min_vruntime;
3038
3039         /*
3040          * Update run-time statistics of the 'current'.
3041          */
3042         update_curr(cfs_rq);
3043         enqueue_entity_load_avg(cfs_rq, se);
3044         account_entity_enqueue(cfs_rq, se);
3045         update_cfs_shares(cfs_rq);
3046
3047         if (flags & ENQUEUE_WAKEUP) {
3048                 place_entity(cfs_rq, se, 0);
3049                 enqueue_sleeper(cfs_rq, se);
3050         }
3051
3052         update_stats_enqueue(cfs_rq, se);
3053         check_spread(cfs_rq, se);
3054         if (se != cfs_rq->curr)
3055                 __enqueue_entity(cfs_rq, se);
3056         se->on_rq = 1;
3057
3058         if (cfs_rq->nr_running == 1) {
3059                 list_add_leaf_cfs_rq(cfs_rq);
3060                 check_enqueue_throttle(cfs_rq);
3061         }
3062 }
3063
3064 static void __clear_buddies_last(struct sched_entity *se)
3065 {
3066         for_each_sched_entity(se) {
3067                 struct cfs_rq *cfs_rq = cfs_rq_of(se);
3068                 if (cfs_rq->last != se)
3069                         break;
3070
3071                 cfs_rq->last = NULL;
3072         }
3073 }
3074
3075 static void __clear_buddies_next(struct sched_entity *se)
3076 {
3077         for_each_sched_entity(se) {
3078                 struct cfs_rq *cfs_rq = cfs_rq_of(se);
3079                 if (cfs_rq->next != se)
3080                         break;
3081
3082                 cfs_rq->next = NULL;
3083         }
3084 }
3085
3086 static void __clear_buddies_skip(struct sched_entity *se)
3087 {
3088         for_each_sched_entity(se) {
3089                 struct cfs_rq *cfs_rq = cfs_rq_of(se);
3090                 if (cfs_rq->skip != se)
3091                         break;
3092
3093                 cfs_rq->skip = NULL;
3094         }
3095 }
3096
3097 static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
3098 {
3099         if (cfs_rq->last == se)
3100                 __clear_buddies_last(se);
3101
3102         if (cfs_rq->next == se)
3103                 __clear_buddies_next(se);
3104
3105         if (cfs_rq->skip == se)
3106                 __clear_buddies_skip(se);
3107 }
3108
3109 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
3110
3111 static void
3112 dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
3113 {
3114         /*
3115          * Update run-time statistics of the 'current'.
3116          */
3117         update_curr(cfs_rq);
3118         dequeue_entity_load_avg(cfs_rq, se);
3119
3120         update_stats_dequeue(cfs_rq, se);
3121         if (flags & DEQUEUE_SLEEP) {
3122 #ifdef CONFIG_SCHEDSTATS
3123                 if (entity_is_task(se)) {
3124                         struct task_struct *tsk = task_of(se);
3125
3126                         if (tsk->state & TASK_INTERRUPTIBLE)
3127                                 se->statistics.sleep_start = rq_clock(rq_of(cfs_rq));
3128                         if (tsk->state & TASK_UNINTERRUPTIBLE)
3129                                 se->statistics.block_start = rq_clock(rq_of(cfs_rq));
3130                 }
3131 #endif
3132         }
3133
3134         clear_buddies(cfs_rq, se);
3135
3136         if (se != cfs_rq->curr)
3137                 __dequeue_entity(cfs_rq, se);
3138         se->on_rq = 0;
3139         account_entity_dequeue(cfs_rq, se);
3140
3141         /*
3142          * Normalize the entity after updating the min_vruntime because the
3143          * update can refer to the ->curr item and we need to reflect this
3144          * movement in our normalized position.
3145          */
3146         if (!(flags & DEQUEUE_SLEEP))
3147                 se->vruntime -= cfs_rq->min_vruntime;
3148
3149         /* return excess runtime on last dequeue */
3150         return_cfs_rq_runtime(cfs_rq);
3151
3152         update_min_vruntime(cfs_rq);
3153         update_cfs_shares(cfs_rq);
3154 }
3155
3156 /*
3157  * Preempt the current task with a newly woken task if needed:
3158  */
3159 static void
3160 check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
3161 {
3162         unsigned long ideal_runtime, delta_exec;
3163         struct sched_entity *se;
3164         s64 delta;
3165
3166         ideal_runtime = sched_slice(cfs_rq, curr);
3167         delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
3168         if (delta_exec > ideal_runtime) {
3169                 resched_curr(rq_of(cfs_rq));
3170                 /*
3171                  * The current task ran long enough, ensure it doesn't get
3172                  * re-elected due to buddy favours.
3173                  */
3174                 clear_buddies(cfs_rq, curr);
3175                 return;
3176         }
3177
3178         /*
3179          * Ensure that a task that missed wakeup preemption by a
3180          * narrow margin doesn't have to wait for a full slice.
3181          * This also mitigates buddy induced latencies under load.
3182          */
3183         if (delta_exec < sysctl_sched_min_granularity)
3184                 return;
3185
3186         se = __pick_first_entity(cfs_rq);
3187         delta = curr->vruntime - se->vruntime;
3188
3189         if (delta < 0)
3190                 return;
3191
3192         if (delta > ideal_runtime)
3193                 resched_curr(rq_of(cfs_rq));
3194 }
3195
3196 static void
3197 set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
3198 {
3199         /* 'current' is not kept within the tree. */
3200         if (se->on_rq) {
3201                 /*
3202                  * Any task has to be enqueued before it get to execute on
3203                  * a CPU. So account for the time it spent waiting on the
3204                  * runqueue.
3205                  */
3206                 update_stats_wait_end(cfs_rq, se);
3207                 __dequeue_entity(cfs_rq, se);
3208                 update_load_avg(se, 1);
3209         }
3210
3211         update_stats_curr_start(cfs_rq, se);
3212         cfs_rq->curr = se;
3213 #ifdef CONFIG_SCHEDSTATS
3214         /*
3215          * Track our maximum slice length, if the CPU's load is at
3216          * least twice that of our own weight (i.e. dont track it
3217          * when there are only lesser-weight tasks around):
3218          */
3219         if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
3220                 se->statistics.slice_max = max(se->statistics.slice_max,
3221                         se->sum_exec_runtime - se->prev_sum_exec_runtime);
3222         }
3223 #endif
3224         se->prev_sum_exec_runtime = se->sum_exec_runtime;
3225 }
3226
3227 static int
3228 wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
3229
3230 /*
3231  * Pick the next process, keeping these things in mind, in this order:
3232  * 1) keep things fair between processes/task groups
3233  * 2) pick the "next" process, since someone really wants that to run
3234  * 3) pick the "last" process, for cache locality
3235  * 4) do not run the "skip" process, if something else is available
3236  */
3237 static struct sched_entity *
3238 pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
3239 {
3240         struct sched_entity *left = __pick_first_entity(cfs_rq);
3241         struct sched_entity *se;
3242
3243         /*
3244          * If curr is set we have to see if its left of the leftmost entity
3245          * still in the tree, provided there was anything in the tree at all.
3246          */
3247         if (!left || (curr && entity_before(curr, left)))
3248                 left = curr;
3249
3250         se = left; /* ideally we run the leftmost entity */
3251
3252         /*
3253          * Avoid running the skip buddy, if running something else can
3254          * be done without getting too unfair.
3255          */
3256         if (cfs_rq->skip == se) {
3257                 struct sched_entity *second;
3258
3259                 if (se == curr) {
3260                         second = __pick_first_entity(cfs_rq);
3261                 } else {
3262                         second = __pick_next_entity(se);
3263                         if (!second || (curr && entity_before(curr, second)))
3264                                 second = curr;
3265                 }
3266
3267                 if (second && wakeup_preempt_entity(second, left) < 1)
3268                         se = second;
3269         }
3270
3271         /*
3272          * Prefer last buddy, try to return the CPU to a preempted task.
3273          */
3274         if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
3275                 se = cfs_rq->last;
3276
3277         /*
3278          * Someone really wants this to run. If it's not unfair, run it.
3279          */
3280         if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
3281                 se = cfs_rq->next;
3282
3283         clear_buddies(cfs_rq, se);
3284
3285         return se;
3286 }
3287
3288 static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
3289
3290 static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
3291 {
3292         /*
3293          * If still on the runqueue then deactivate_task()
3294          * was not called and update_curr() has to be done:
3295          */
3296         if (prev->on_rq)
3297                 update_curr(cfs_rq);
3298
3299         /* throttle cfs_rqs exceeding runtime */
3300         check_cfs_rq_runtime(cfs_rq);
3301
3302         check_spread(cfs_rq, prev);
3303         if (prev->on_rq) {
3304                 update_stats_wait_start(cfs_rq, prev);
3305                 /* Put 'current' back into the tree. */
3306                 __enqueue_entity(cfs_rq, prev);
3307                 /* in !on_rq case, update occurred at dequeue */
3308                 update_load_avg(prev, 0);
3309         }
3310         cfs_rq->curr = NULL;
3311 }
3312
3313 static void
3314 entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
3315 {
3316         /*
3317          * Update run-time statistics of the 'current'.
3318          */
3319         update_curr(cfs_rq);
3320
3321         /*
3322          * Ensure that runnable average is periodically updated.
3323          */
3324         update_load_avg(curr, 1);
3325         update_cfs_shares(cfs_rq);
3326
3327 #ifdef CONFIG_SCHED_HRTICK
3328         /*
3329          * queued ticks are scheduled to match the slice, so don't bother
3330          * validating it and just reschedule.
3331          */
3332         if (queued) {
3333                 resched_curr(rq_of(cfs_rq));
3334                 return;
3335         }
3336         /*
3337          * don't let the period tick interfere with the hrtick preemption
3338          */
3339         if (!sched_feat(DOUBLE_TICK) &&
3340                         hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
3341                 return;
3342 #endif
3343
3344         if (cfs_rq->nr_running > 1)
3345                 check_preempt_tick(cfs_rq, curr);
3346 }
3347
3348
3349 /**************************************************
3350  * CFS bandwidth control machinery
3351  */
3352
3353 #ifdef CONFIG_CFS_BANDWIDTH
3354
3355 #ifdef HAVE_JUMP_LABEL
3356 static struct static_key __cfs_bandwidth_used;
3357
3358 static inline bool cfs_bandwidth_used(void)
3359 {
3360         return static_key_false(&__cfs_bandwidth_used);
3361 }
3362
3363 void cfs_bandwidth_usage_inc(void)
3364 {
3365         static_key_slow_inc(&__cfs_bandwidth_used);
3366 }
3367
3368 void cfs_bandwidth_usage_dec(void)
3369 {
3370         static_key_slow_dec(&__cfs_bandwidth_used);
3371 }
3372 #else /* HAVE_JUMP_LABEL */
3373 static bool cfs_bandwidth_used(void)
3374 {
3375         return true;
3376 }
3377
3378 void cfs_bandwidth_usage_inc(void) {}
3379 void cfs_bandwidth_usage_dec(void) {}
3380 #endif /* HAVE_JUMP_LABEL */
3381
3382 /*
3383  * default period for cfs group bandwidth.
3384  * default: 0.1s, units: nanoseconds
3385  */
3386 static inline u64 default_cfs_period(void)
3387 {
3388         return 100000000ULL;
3389 }
3390
3391 static inline u64 sched_cfs_bandwidth_slice(void)
3392 {
3393         return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;
3394 }
3395
3396 /*
3397  * Replenish runtime according to assigned quota and update expiration time.
3398  * We use sched_clock_cpu directly instead of rq->clock to avoid adding
3399  * additional synchronization around rq->lock.
3400  *
3401  * requires cfs_b->lock
3402  */
3403 void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
3404 {
3405         u64 now;
3406
3407         if (cfs_b->quota == RUNTIME_INF)
3408                 return;
3409
3410         now = sched_clock_cpu(smp_processor_id());
3411         cfs_b->runtime = cfs_b->quota;
3412         cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
3413 }
3414
3415 static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
3416 {
3417         return &tg->cfs_bandwidth;
3418 }
3419
3420 /* rq->task_clock normalized against any time this cfs_rq has spent throttled */
3421 static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
3422 {
3423         if (unlikely(cfs_rq->throttle_count))
3424                 return cfs_rq->throttled_clock_task;
3425
3426         return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time;
3427 }
3428
3429 /* returns 0 on failure to allocate runtime */
3430 static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
3431 {
3432         struct task_group *tg = cfs_rq->tg;
3433         struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
3434         u64 amount = 0, min_amount, expires;
3435
3436         /* note: this is a positive sum as runtime_remaining <= 0 */
3437         min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
3438
3439         raw_spin_lock(&cfs_b->lock);
3440         if (cfs_b->quota == RUNTIME_INF)
3441                 amount = min_amount;
3442         else {
3443                 start_cfs_bandwidth(cfs_b);
3444
3445                 if (cfs_b->runtime > 0) {
3446                         amount = min(cfs_b->runtime, min_amount);
3447                         cfs_b->runtime -= amount;
3448                         cfs_b->idle = 0;
3449                 }
3450         }
3451         expires = cfs_b->runtime_expires;
3452         raw_spin_unlock(&cfs_b->lock);
3453
3454         cfs_rq->runtime_remaining += amount;
3455         /*
3456          * we may have advanced our local expiration to account for allowed
3457          * spread between our sched_clock and the one on which runtime was
3458          * issued.
3459          */
3460         if ((s64)(expires - cfs_rq->runtime_expires) > 0)
3461                 cfs_rq->runtime_expires = expires;
3462
3463         return cfs_rq->runtime_remaining > 0;
3464 }
3465
3466 /*
3467  * Note: This depends on the synchronization provided by sched_clock and the
3468  * fact that rq->clock snapshots this value.
3469  */
3470 static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
3471 {
3472         struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
3473
3474         /* if the deadline is ahead of our clock, nothing to do */
3475         if (likely((s64)(rq_clock(rq_of(cfs_rq)) - cfs_rq->runtime_expires) < 0))
3476                 return;
3477
3478         if (cfs_rq->runtime_remaining < 0)
3479                 return;
3480
3481         /*
3482          * If the local deadline has passed we have to consider the
3483          * possibility that our sched_clock is 'fast' and the global deadline
3484          * has not truly expired.
3485          *
3486          * Fortunately we can check determine whether this the case by checking
3487          * whether the global deadline has advanced. It is valid to compare
3488          * cfs_b->runtime_expires without any locks since we only care about
3489          * exact equality, so a partial write will still work.
3490          */
3491
3492         if (cfs_rq->runtime_expires != cfs_b->runtime_expires) {
3493                 /* extend local deadline, drift is bounded above by 2 ticks */
3494                 cfs_rq->runtime_expires += TICK_NSEC;
3495         } else {
3496                 /* global deadline is ahead, expiration has passed */
3497                 cfs_rq->runtime_remaining = 0;
3498         }
3499 }
3500
3501 static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
3502 {
3503         /* dock delta_exec before expiring quota (as it could span periods) */
3504         cfs_rq->runtime_remaining -= delta_exec;
3505         expire_cfs_rq_runtime(cfs_rq);
3506
3507         if (likely(cfs_rq->runtime_remaining > 0))
3508                 return;
3509
3510         /*
3511          * if we're unable to extend our runtime we resched so that the active
3512          * hierarchy can be throttled
3513          */
3514         if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
3515                 resched_curr(rq_of(cfs_rq));
3516 }
3517
3518 static __always_inline
3519 void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
3520 {
3521         if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
3522                 return;
3523
3524         __account_cfs_rq_runtime(cfs_rq, delta_exec);
3525 }
3526
3527 static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
3528 {
3529         return cfs_bandwidth_used() && cfs_rq->throttled;
3530 }
3531
3532 /* check whether cfs_rq, or any parent, is throttled */
3533 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
3534 {
3535         return cfs_bandwidth_used() && cfs_rq->throttle_count;
3536 }
3537
3538 /*
3539  * Ensure that neither of the group entities corresponding to src_cpu or
3540  * dest_cpu are members of a throttled hierarchy when performing group
3541  * load-balance operations.
3542  */
3543 static inline int throttled_lb_pair(struct task_group *tg,
3544                                     int src_cpu, int dest_cpu)
3545 {
3546         struct cfs_rq *src_cfs_rq, *dest_cfs_rq;
3547
3548         src_cfs_rq = tg->cfs_rq[src_cpu];
3549         dest_cfs_rq = tg->cfs_rq[dest_cpu];
3550
3551         return throttled_hierarchy(src_cfs_rq) ||
3552                throttled_hierarchy(dest_cfs_rq);
3553 }
3554
3555 /* updated child weight may affect parent so we have to do this bottom up */
3556 static int tg_unthrottle_up(struct task_group *tg, void *data)
3557 {
3558         struct rq *rq = data;
3559         struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
3560
3561         cfs_rq->throttle_count--;
3562 #ifdef CONFIG_SMP
3563         if (!cfs_rq->throttle_count) {
3564                 /* adjust cfs_rq_clock_task() */
3565                 cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
3566                                              cfs_rq->throttled_clock_task;
3567         }
3568 #endif
3569
3570         return 0;
3571 }
3572
3573 static int tg_throttle_down(struct task_group *tg, void *data)
3574 {
3575         struct rq *rq = data;
3576         struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
3577
3578         /* group is entering throttled state, stop time */
3579         if (!cfs_rq->throttle_count)
3580                 cfs_rq->throttled_clock_task = rq_clock_task(rq);
3581         cfs_rq->throttle_count++;
3582
3583         return 0;
3584 }
3585
3586 static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
3587 {
3588         struct rq *rq = rq_of(cfs_rq);
3589         struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
3590         struct sched_entity *se;
3591         long task_delta, dequeue = 1;
3592         bool empty;
3593
3594         se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
3595
3596         /* freeze hierarchy runnable averages while throttled */
3597         rcu_read_lock();
3598         walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
3599         rcu_read_unlock();
3600
3601         task_delta = cfs_rq->h_nr_running;
3602         for_each_sched_entity(se) {
3603                 struct cfs_rq *qcfs_rq = cfs_rq_of(se);
3604                 /* throttled entity or throttle-on-deactivate */
3605                 if (!se->on_rq)
3606                         break;
3607
3608                 if (dequeue)
3609                         dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
3610                 qcfs_rq->h_nr_running -= task_delta;
3611
3612                 if (qcfs_rq->load.weight)
3613                         dequeue = 0;
3614         }
3615
3616         if (!se)
3617                 sub_nr_running(rq, task_delta);
3618
3619         cfs_rq->throttled = 1;
3620         cfs_rq->throttled_clock = rq_clock(rq);
3621         raw_spin_lock(&cfs_b->lock);
3622         empty = list_empty(&cfs_b->throttled_cfs_rq);
3623
3624         /*
3625          * Add to the _head_ of the list, so that an already-started
3626          * distribute_cfs_runtime will not see us
3627          */
3628         list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
3629
3630         /*
3631          * If we're the first throttled task, make sure the bandwidth
3632          * timer is running.
3633          */
3634         if (empty)
3635                 start_cfs_bandwidth(cfs_b);
3636
3637         raw_spin_unlock(&cfs_b->lock);
3638 }
3639
3640 void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
3641 {
3642         struct rq *rq = rq_of(cfs_rq);
3643         struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
3644         struct sched_entity *se;
3645         int enqueue = 1;
3646         long task_delta;
3647
3648         se = cfs_rq->tg->se[cpu_of(rq)];
3649
3650         cfs_rq->throttled = 0;
3651
3652         update_rq_clock(rq);
3653
3654         raw_spin_lock(&cfs_b->lock);
3655         cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock;
3656         list_del_rcu(&cfs_rq->throttled_list);
3657         raw_spin_unlock(&cfs_b->lock);
3658
3659         /* update hierarchical throttle state */
3660         walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
3661
3662         if (!cfs_rq->load.weight)
3663                 return;
3664
3665         task_delta = cfs_rq->h_nr_running;
3666         for_each_sched_entity(se) {
3667                 if (se->on_rq)
3668                         enqueue = 0;
3669
3670                 cfs_rq = cfs_rq_of(se);
3671                 if (enqueue)
3672                         enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
3673                 cfs_rq->h_nr_running += task_delta;
3674
3675                 if (cfs_rq_throttled(cfs_rq))
3676                         break;
3677         }
3678
3679         if (!se)
3680                 add_nr_running(rq, task_delta);
3681
3682         /* determine whether we need to wake up potentially idle cpu */
3683         if (rq->curr == rq->idle && rq->cfs.nr_running)
3684                 resched_curr(rq);
3685 }
3686
3687 static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
3688                 u64 remaining, u64 expires)
3689 {
3690         struct cfs_rq *cfs_rq;
3691         u64 runtime;
3692         u64 starting_runtime = remaining;
3693
3694         rcu_read_lock();
3695         list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
3696                                 throttled_list) {
3697                 struct rq *rq = rq_of(cfs_rq);
3698
3699                 raw_spin_lock(&rq->lock);
3700                 if (!cfs_rq_throttled(cfs_rq))
3701                         goto next;
3702
3703                 runtime = -cfs_rq->runtime_remaining + 1;
3704                 if (runtime > remaining)
3705                         runtime = remaining;
3706                 remaining -= runtime;
3707
3708                 cfs_rq->runtime_remaining += runtime;
3709                 cfs_rq->runtime_expires = expires;
3710
3711                 /* we check whether we're throttled above */
3712                 if (cfs_rq->runtime_remaining > 0)
3713                         unthrottle_cfs_rq(cfs_rq);
3714
3715 next:
3716                 raw_spin_unlock(&rq->lock);
3717
3718                 if (!remaining)
3719                         break;
3720         }
3721         rcu_read_unlock();
3722
3723         return starting_runtime - remaining;
3724 }
3725
3726 /*
3727  * Responsible for refilling a task_group's bandwidth and unthrottling its
3728  * cfs_rqs as appropriate. If there has been no activity within the last
3729  * period the timer is deactivated until scheduling resumes; cfs_b->idle is
3730  * used to track this state.
3731  */
3732 static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
3733 {
3734         u64 runtime, runtime_expires;
3735         int throttled;
3736
3737         /* no need to continue the timer with no bandwidth constraint */
3738         if (cfs_b->quota == RUNTIME_INF)
3739                 goto out_deactivate;
3740
3741         throttled = !list_empty(&cfs_b->throttled_cfs_rq);
3742         cfs_b->nr_periods += overrun;
3743
3744         /*
3745          * idle depends on !throttled (for the case of a large deficit), and if
3746          * we're going inactive then everything else can be deferred
3747          */
3748         if (cfs_b->idle && !throttled)
3749                 goto out_deactivate;
3750
3751         __refill_cfs_bandwidth_runtime(cfs_b);
3752
3753         if (!throttled) {
3754                 /* mark as potentially idle for the upcoming period */
3755                 cfs_b->idle = 1;
3756                 return 0;
3757         }
3758
3759         /* account preceding periods in which throttling occurred */
3760         cfs_b->nr_throttled += overrun;
3761
3762         runtime_expires = cfs_b->runtime_expires;
3763
3764         /*
3765          * This check is repeated as we are holding onto the new bandwidth while
3766          * we unthrottle. This can potentially race with an unthrottled group
3767          * trying to acquire new bandwidth from the global pool. This can result
3768          * in us over-using our runtime if it is all used during this loop, but
3769          * only by limited amounts in that extreme case.
3770          */
3771         while (throttled && cfs_b->runtime > 0) {
3772                 runtime = cfs_b->runtime;
3773                 raw_spin_unlock(&cfs_b->lock);
3774                 /* we can't nest cfs_b->lock while distributing bandwidth */
3775                 runtime = distribute_cfs_runtime(cfs_b, runtime,
3776                                                  runtime_expires);
3777                 raw_spin_lock(&cfs_b->lock);
3778
3779                 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
3780
3781                 cfs_b->runtime -= min(runtime, cfs_b->runtime);
3782         }
3783
3784         /*
3785          * While we are ensured activity in the period following an
3786          * unthrottle, this also covers the case in which the new bandwidth is
3787          * insufficient to cover the existing bandwidth deficit.  (Forcing the
3788          * timer to remain active while there are any throttled entities.)
3789          */
3790         cfs_b->idle = 0;
3791
3792         return 0;
3793
3794 out_deactivate:
3795         return 1;
3796 }
3797
3798 /* a cfs_rq won't donate quota below this amount */
3799 static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;
3800 /* minimum remaining period time to redistribute slack quota */
3801 static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
3802 /* how long we wait to gather additional slack before distributing */
3803 static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
3804
3805 /*
3806  * Are we near the end of the current quota period?
3807  *
3808  * Requires cfs_b->lock for hrtimer_expires_remaining to be safe against the
3809  * hrtimer base being cleared by hrtimer_start. In the case of
3810  * migrate_hrtimers, base is never cleared, so we are fine.
3811  */
3812 static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
3813 {
3814         struct hrtimer *refresh_timer = &cfs_b->period_timer;
3815         u64 remaining;
3816
3817         /* if the call-back is running a quota refresh is already occurring */
3818         if (hrtimer_callback_running(refresh_timer))
3819                 return 1;
3820
3821         /* is a quota refresh about to occur? */
3822         remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
3823         if (remaining < min_expire)
3824                 return 1;
3825
3826         return 0;
3827 }
3828
3829 static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
3830 {
3831         u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;
3832
3833         /* if there's a quota refresh soon don't bother with slack */
3834         if (runtime_refresh_within(cfs_b, min_left))
3835                 return;
3836
3837         hrtimer_start(&cfs_b->slack_timer,
3838                         ns_to_ktime(cfs_bandwidth_slack_period),
3839                         HRTIMER_MODE_REL);
3840 }
3841
3842 /* we know any runtime found here is valid as update_curr() precedes return */
3843 static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
3844 {
3845         struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
3846         s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;
3847
3848         if (slack_runtime <= 0)
3849                 return;
3850
3851         raw_spin_lock(&cfs_b->lock);
3852         if (cfs_b->quota != RUNTIME_INF &&
3853             cfs_rq->runtime_expires == cfs_b->runtime_expires) {
3854                 cfs_b->runtime += slack_runtime;
3855
3856                 /* we are under rq->lock, defer unthrottling using a timer */
3857                 if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&
3858                     !list_empty(&cfs_b->throttled_cfs_rq))
3859                         start_cfs_slack_bandwidth(cfs_b);
3860         }
3861         raw_spin_unlock(&cfs_b->lock);
3862
3863         /* even if it's not valid for return we don't want to try again */
3864         cfs_rq->runtime_remaining -= slack_runtime;
3865 }
3866
3867 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
3868 {
3869         if (!cfs_bandwidth_used())
3870                 return;
3871
3872         if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
3873                 return;
3874
3875         __return_cfs_rq_runtime(cfs_rq);
3876 }
3877
3878 /*
3879  * This is done with a timer (instead of inline with bandwidth return) since
3880  * it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs.
3881  */
3882 static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
3883 {
3884         u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
3885         u64 expires;
3886
3887         /* confirm we're still not at a refresh boundary */
3888         raw_spin_lock(&cfs_b->lock);
3889         if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
3890                 raw_spin_unlock(&cfs_b->lock);
3891                 return;
3892         }
3893
3894         if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice)
3895                 runtime = cfs_b->runtime;
3896
3897         expires = cfs_b->runtime_expires;
3898         raw_spin_unlock(&cfs_b->lock);
3899
3900         if (!runtime)
3901                 return;
3902
3903         runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
3904
3905         raw_spin_lock(&cfs_b->lock);
3906         if (expires == cfs_b->runtime_expires)
3907                 cfs_b->runtime -= min(runtime, cfs_b->runtime);
3908         raw_spin_unlock(&cfs_b->lock);
3909 }
3910
3911 /*
3912  * When a group wakes up we want to make sure that its quota is not already
3913  * expired/exceeded, otherwise it may be allowed to steal additional ticks of
3914  * runtime as update_curr() throttling can not not trigger until it's on-rq.
3915  */
3916 static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
3917 {
3918         if (!cfs_bandwidth_used())
3919                 return;
3920
3921         /* an active group must be handled by the update_curr()->put() path */
3922         if (!cfs_rq->runtime_enabled || cfs_rq->curr)
3923                 return;
3924
3925         /* ensure the group is not already throttled */
3926         if (cfs_rq_throttled(cfs_rq))
3927                 return;
3928
3929         /* update runtime allocation */
3930         account_cfs_rq_runtime(cfs_rq, 0);
3931         if (cfs_rq->runtime_remaining <= 0)
3932                 throttle_cfs_rq(cfs_rq);
3933 }
3934
3935 /* conditionally throttle active cfs_rq's from put_prev_entity() */
3936 static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
3937 {
3938         if (!cfs_bandwidth_used())
3939                 return false;
3940
3941         if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
3942                 return false;
3943
3944         /*
3945          * it's possible for a throttled entity to be forced into a running
3946          * state (e.g. set_curr_task), in this case we're finished.
3947          */
3948         if (cfs_rq_throttled(cfs_rq))
3949                 return true;
3950
3951         throttle_cfs_rq(cfs_rq);
3952         return true;
3953 }
3954
3955 static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
3956 {
3957         struct cfs_bandwidth *cfs_b =
3958                 container_of(timer, struct cfs_bandwidth, slack_timer);
3959
3960         do_sched_cfs_slack_timer(cfs_b);
3961
3962         return HRTIMER_NORESTART;
3963 }
3964
3965 static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
3966 {
3967         struct cfs_bandwidth *cfs_b =
3968                 container_of(timer, struct cfs_bandwidth, period_timer);
3969         int overrun;
3970         int idle = 0;
3971
3972         raw_spin_lock(&cfs_b->lock);
3973         for (;;) {
3974                 overrun = hrtimer_forward_now(timer, cfs_b->period);
3975                 if (!overrun)
3976                         break;
3977
3978                 idle = do_sched_cfs_period_timer(cfs_b, overrun);
3979         }
3980         if (idle)
3981                 cfs_b->period_active = 0;
3982         raw_spin_unlock(&cfs_b->lock);
3983
3984         return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
3985 }
3986
3987 void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
3988 {
3989         raw_spin_lock_init(&cfs_b->lock);
3990         cfs_b->runtime = 0;
3991         cfs_b->quota = RUNTIME_INF;
3992         cfs_b->period = ns_to_ktime(default_cfs_period());
3993
3994         INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
3995         hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
3996         cfs_b->period_timer.function = sched_cfs_period_timer;
3997         hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
3998         cfs_b->slack_timer.function = sched_cfs_slack_timer;
3999 }
4000
4001 static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
4002 {
4003         cfs_rq->runtime_enabled = 0;
4004         INIT_LIST_HEAD(&cfs_rq->throttled_list);
4005 }
4006
4007 void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
4008 {
4009         lockdep_assert_held(&cfs_b->lock);
4010
4011         if (!cfs_b->period_active) {
4012                 cfs_b->period_active = 1;
4013                 hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period);
4014                 hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED);
4015         }
4016 }
4017
4018 static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
4019 {
4020         /* init_cfs_bandwidth() was not called */
4021         if (!cfs_b->throttled_cfs_rq.next)
4022                 return;
4023
4024         hrtimer_cancel(&cfs_b->period_timer);
4025         hrtimer_cancel(&cfs_b->slack_timer);
4026 }
4027
4028 static void __maybe_unused update_runtime_enabled(struct rq *rq)
4029 {
4030         struct cfs_rq *cfs_rq;
4031
4032         for_each_leaf_cfs_rq(rq, cfs_rq) {
4033                 struct cfs_bandwidth *cfs_b = &cfs_rq->tg->cfs_bandwidth;
4034
4035                 raw_spin_lock(&cfs_b->lock);
4036                 cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF;
4037                 raw_spin_unlock(&cfs_b->lock);
4038         }
4039 }
4040
4041 static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
4042 {
4043         struct cfs_rq *cfs_rq;
4044
4045         for_each_leaf_cfs_rq(rq, cfs_rq) {
4046                 if (!cfs_rq->runtime_enabled)
4047                         continue;
4048
4049                 /*
4050                  * clock_task is not advancing so we just need to make sure
4051                  * there's some valid quota amount
4052                  */
4053                 cfs_rq->runtime_remaining = 1;
4054                 /*
4055                  * Offline rq is schedulable till cpu is completely disabled
4056                  * in take_cpu_down(), so we prevent new cfs throttling here.
4057                  */
4058                 cfs_rq->runtime_enabled = 0;
4059
4060                 if (cfs_rq_throttled(cfs_rq))
4061                         unthrottle_cfs_rq(cfs_rq);
4062         }
4063 }
4064
4065 #else /* CONFIG_CFS_BANDWIDTH */
4066 static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
4067 {
4068         return rq_clock_task(rq_of(cfs_rq));
4069 }
4070
4071 static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
4072 static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
4073 static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
4074 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
4075
4076 static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
4077 {
4078         return 0;
4079 }
4080
4081 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
4082 {
4083         return 0;
4084 }
4085
4086 static inline int throttled_lb_pair(struct task_group *tg,
4087                                     int src_cpu, int dest_cpu)
4088 {
4089         return 0;
4090 }
4091
4092 void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
4093
4094 #ifdef CONFIG_FAIR_GROUP_SCHED
4095 static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
4096 #endif
4097
4098 static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
4099 {
4100         return NULL;
4101 }
4102 static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
4103 static inline void update_runtime_enabled(struct rq *rq) {}
4104 static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
4105
4106 #endif /* CONFIG_CFS_BANDWIDTH */
4107
4108 /**************************************************
4109  * CFS operations on tasks:
4110  */
4111
4112 #ifdef CONFIG_SCHED_HRTICK
4113 static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
4114 {
4115         struct sched_entity *se = &p->se;
4116         struct cfs_rq *cfs_rq = cfs_rq_of(se);
4117
4118         WARN_ON(task_rq(p) != rq);
4119
4120         if (cfs_rq->nr_running > 1) {
4121                 u64 slice = sched_slice(cfs_rq, se);
4122                 u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
4123                 s64 delta = slice - ran;
4124
4125                 if (delta < 0) {
4126                         if (rq->curr == p)
4127                                 resched_curr(rq);
4128                         return;
4129                 }
4130                 hrtick_start(rq, delta);
4131         }
4132 }
4133
4134 /*
4135  * called from enqueue/dequeue and updates the hrtick when the
4136  * current task is from our class and nr_running is low enough
4137  * to matter.
4138  */
4139 static void hrtick_update(struct rq *rq)
4140 {
4141         struct task_struct *curr = rq->curr;
4142
4143         if (!hrtick_enabled(rq) || curr->sched_class != &fair_sched_class)
4144                 return;
4145
4146         if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
4147                 hrtick_start_fair(rq, curr);
4148 }
4149 #else /* !CONFIG_SCHED_HRTICK */
4150 static inline void
4151 hrtick_start_fair(struct rq *rq, struct task_struct *p)
4152 {
4153 }
4154
4155 static inline void hrtick_update(struct rq *rq)
4156 {
4157 }
4158 #endif
4159
4160 /*
4161  * The enqueue_task method is called before nr_running is
4162  * increased. Here we update the fair scheduling stats and
4163  * then put the task into the rbtree:
4164  */
4165 static void
4166 enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
4167 {
4168         struct cfs_rq *cfs_rq;
4169         struct sched_entity *se = &p->se;
4170
4171         for_each_sched_entity(se) {
4172                 if (se->on_rq)
4173                         break;
4174                 cfs_rq = cfs_rq_of(se);
4175                 enqueue_entity(cfs_rq, se, flags);
4176
4177                 /*
4178                  * end evaluation on encountering a throttled cfs_rq
4179                  *
4180                  * note: in the case of encountering a throttled cfs_rq we will
4181                  * post the final h_nr_running increment below.
4182                 */
4183                 if (cfs_rq_throttled(cfs_rq))
4184                         break;
4185                 cfs_rq->h_nr_running++;
4186
4187                 flags = ENQUEUE_WAKEUP;
4188         }
4189
4190         for_each_sched_entity(se) {
4191                 cfs_rq = cfs_rq_of(se);
4192                 cfs_rq->h_nr_running++;
4193
4194                 if (cfs_rq_throttled(cfs_rq))
4195                         break;
4196
4197                 update_load_avg(se, 1);
4198                 update_cfs_shares(cfs_rq);
4199         }
4200
4201         if (!se)
4202                 add_nr_running(rq, 1);
4203
4204         hrtick_update(rq);
4205 }
4206
4207 static void set_next_buddy(struct sched_entity *se);
4208
4209 /*
4210  * The dequeue_task method is called before nr_running is
4211  * decreased. We remove the task from the rbtree and
4212  * update the fair scheduling stats:
4213  */
4214 static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
4215 {
4216         struct cfs_rq *cfs_rq;
4217         struct sched_entity *se = &p->se;
4218         int task_sleep = flags & DEQUEUE_SLEEP;
4219
4220         for_each_sched_entity(se) {
4221                 cfs_rq = cfs_rq_of(se);
4222                 dequeue_entity(cfs_rq, se, flags);
4223
4224                 /*
4225                  * end evaluation on encountering a throttled cfs_rq
4226                  *
4227                  * note: in the case of encountering a throttled cfs_rq we will
4228                  * post the final h_nr_running decrement below.
4229                 */
4230                 if (cfs_rq_throttled(cfs_rq))
4231                         break;
4232                 cfs_rq->h_nr_running--;
4233
4234                 /* Don't dequeue parent if it has other entities besides us */
4235                 if (cfs_rq->load.weight) {
4236                         /*
4237                          * Bias pick_next to pick a task from this cfs_rq, as
4238                          * p is sleeping when it is within its sched_slice.
4239                          */
4240                         if (task_sleep && parent_entity(se))
4241                                 set_next_buddy(parent_entity(se));
4242
4243                         /* avoid re-evaluating load for this entity */
4244                         se = parent_entity(se);
4245                         break;
4246                 }
4247                 flags |= DEQUEUE_SLEEP;
4248         }
4249
4250         for_each_sched_entity(se) {
4251                 cfs_rq = cfs_rq_of(se);
4252                 cfs_rq->h_nr_running--;
4253
4254                 if (cfs_rq_throttled(cfs_rq))
4255                         break;
4256
4257                 update_load_avg(se, 1);
4258                 update_cfs_shares(cfs_rq);
4259         }
4260
4261         if (!se)
4262                 sub_nr_running(rq, 1);
4263
4264         hrtick_update(rq);
4265 }
4266
4267 #ifdef CONFIG_SMP
4268
4269 /*
4270  * per rq 'load' arrray crap; XXX kill this.
4271  */
4272
4273 /*
4274  * The exact cpuload at various idx values, calculated at every tick would be
4275  * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
4276  *
4277  * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called
4278  * on nth tick when cpu may be busy, then we have:
4279  * load = ((2^idx - 1) / 2^idx)^(n-1) * load
4280  * load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load
4281  *
4282  * decay_load_missed() below does efficient calculation of
4283  * load = ((2^idx - 1) / 2^idx)^(n-1) * load
4284  * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load
4285  *
4286  * The calculation is approximated on a 128 point scale.
4287  * degrade_zero_ticks is the number of ticks after which load at any
4288  * particular idx is approximated to be zero.
4289  * degrade_factor is a precomputed table, a row for each load idx.
4290  * Each column corresponds to degradation factor for a power of two ticks,
4291  * based on 128 point scale.
4292  * Example:
4293  * row 2, col 3 (=12) says that the degradation at load idx 2 after
4294  * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8).
4295  *
4296  * With this power of 2 load factors, we can degrade the load n times
4297  * by looking at 1 bits in n and doing as many mult/shift instead of
4298  * n mult/shifts needed by the exact degradation.
4299  */
4300 #define DEGRADE_SHIFT           7
4301 static const unsigned char
4302                 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
4303 static const unsigned char
4304                 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
4305                                         {0, 0, 0, 0, 0, 0, 0, 0},
4306                                         {64, 32, 8, 0, 0, 0, 0, 0},
4307                                         {96, 72, 40, 12, 1, 0, 0},
4308                                         {112, 98, 75, 43, 15, 1, 0},
4309                                         {120, 112, 98, 76, 45, 16, 2} };
4310
4311 /*
4312  * Update cpu_load for any missed ticks, due to tickless idle. The backlog
4313  * would be when CPU is idle and so we just decay the old load without
4314  * adding any new load.
4315  */
4316 static unsigned long
4317 decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
4318 {
4319         int j = 0;
4320
4321         if (!missed_updates)
4322                 return load;
4323
4324         if (missed_updates >= degrade_zero_ticks[idx])
4325                 return 0;
4326
4327         if (idx == 1)
4328                 return load >> missed_updates;
4329
4330         while (missed_updates) {
4331                 if (missed_updates % 2)
4332                         load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
4333
4334                 missed_updates >>= 1;
4335                 j++;
4336         }
4337         return load;
4338 }
4339
4340 /*
4341  * Update rq->cpu_load[] statistics. This function is usually called every
4342  * scheduler tick (TICK_NSEC). With tickless idle this will not be called
4343  * every tick. We fix it up based on jiffies.
4344  */
4345 static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
4346                               unsigned long pending_updates)
4347 {
4348         int i, scale;
4349
4350         this_rq->nr_load_updates++;
4351
4352         /* Update our load: */
4353         this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
4354         for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
4355                 unsigned long old_load, new_load;
4356
4357                 /* scale is effectively 1 << i now, and >> i divides by scale */
4358
4359                 old_load = this_rq->cpu_load[i];
4360                 old_load = decay_load_missed(old_load, pending_updates - 1, i);
4361                 new_load = this_load;
4362                 /*
4363                  * Round up the averaging division if load is increasing. This
4364                  * prevents us from getting stuck on 9 if the load is 10, for
4365                  * example.
4366                  */
4367                 if (new_load > old_load)
4368                         new_load += scale - 1;
4369
4370                 this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
4371         }
4372
4373         sched_avg_update(this_rq);
4374 }
4375
4376 /* Used instead of source_load when we know the type == 0 */
4377 static unsigned long weighted_cpuload(const int cpu)
4378 {
4379         return cfs_rq_runnable_load_avg(&cpu_rq(cpu)->cfs);
4380 }
4381
4382 #ifdef CONFIG_NO_HZ_COMMON
4383 /*
4384  * There is no sane way to deal with nohz on smp when using jiffies because the
4385  * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
4386  * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
4387  *
4388  * Therefore we cannot use the delta approach from the regular tick since that
4389  * would seriously skew the load calculation. However we'll make do for those
4390  * updates happening while idle (nohz_idle_balance) or coming out of idle
4391  * (tick_nohz_idle_exit).
4392  *
4393  * This means we might still be one tick off for nohz periods.
4394  */
4395
4396 /*
4397  * Called from nohz_idle_balance() to update the load ratings before doing the
4398  * idle balance.
4399  */
4400 static void update_idle_cpu_load(struct rq *this_rq)
4401 {
4402         unsigned long curr_jiffies = READ_ONCE(jiffies);
4403         unsigned long load = weighted_cpuload(cpu_of(this_rq));
4404         unsigned long pending_updates;
4405
4406         /*
4407          * bail if there's load or we're actually up-to-date.
4408          */
4409         if (load || curr_jiffies == this_rq->last_load_update_tick)
4410                 return;
4411
4412         pending_updates = curr_jiffies - this_rq->last_load_update_tick;
4413         this_rq->last_load_update_tick = curr_jiffies;
4414
4415         __update_cpu_load(this_rq, load, pending_updates);
4416 }
4417
4418 /*
4419  * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed.
4420  */
4421 void update_cpu_load_nohz(void)
4422 {
4423         struct rq *this_rq = this_rq();
4424         unsigned long curr_jiffies = READ_ONCE(jiffies);
4425         unsigned long pending_updates;
4426
4427         if (curr_jiffies == this_rq->last_load_update_tick)
4428                 return;
4429
4430         raw_spin_lock(&this_rq->lock);
4431         pending_updates = curr_jiffies - this_rq->last_load_update_tick;
4432         if (pending_updates) {
4433                 this_rq->last_load_update_tick = curr_jiffies;
4434                 /*
4435                  * We were idle, this means load 0, the current load might be
4436                  * !0 due to remote wakeups and the sort.
4437                  */
4438                 __update_cpu_load(this_rq, 0, pending_updates);
4439         }
4440         raw_spin_unlock(&this_rq->lock);
4441 }
4442 #endif /* CONFIG_NO_HZ */
4443
4444 /*
4445  * Called from scheduler_tick()
4446  */
4447 void update_cpu_load_active(struct rq *this_rq)
4448 {
4449         unsigned long load = weighted_cpuload(cpu_of(this_rq));
4450         /*
4451          * See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
4452          */
4453         this_rq->last_load_update_tick = jiffies;
4454         __update_cpu_load(this_rq, load, 1);
4455 }
4456
4457 /*
4458  * Return a low guess at the load of a migration-source cpu weighted
4459  * according to the scheduling class and "nice" value.
4460  *
4461  * We want to under-estimate the load of migration sources, to
4462  * balance conservatively.
4463  */
4464 static unsigned long source_load(int cpu, int type)
4465 {
4466         struct rq *rq = cpu_rq(cpu);
4467         unsigned long total = weighted_cpuload(cpu);
4468
4469         if (type == 0 || !sched_feat(LB_BIAS))
4470                 return total;
4471
4472         return min(rq->cpu_load[type-1], total);
4473 }
4474
4475 /*
4476  * Return a high guess at the load of a migration-target cpu weighted
4477  * according to the scheduling class and "nice" value.
4478  */
4479 static unsigned long target_load(int cpu, int type)
4480 {
4481         struct rq *rq = cpu_rq(cpu);
4482         unsigned long total = weighted_cpuload(cpu);
4483
4484         if (type == 0 || !sched_feat(LB_BIAS))
4485                 return total;
4486
4487         return max(rq->cpu_load[type-1], total);
4488 }
4489
4490 static unsigned long capacity_of(int cpu)
4491 {
4492         return cpu_rq(cpu)->cpu_capacity;
4493 }
4494
4495 static unsigned long capacity_orig_of(int cpu)
4496 {
4497         return cpu_rq(cpu)->cpu_capacity_orig;
4498 }
4499
4500 static unsigned long cpu_avg_load_per_task(int cpu)
4501 {
4502         struct rq *rq = cpu_rq(cpu);
4503         unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running);
4504         unsigned long load_avg = weighted_cpuload(cpu);
4505
4506         if (nr_running)
4507                 return load_avg / nr_running;
4508
4509         return 0;
4510 }
4511
4512 static void record_wakee(struct task_struct *p)
4513 {
4514         /*
4515          * Rough decay (wiping) for cost saving, don't worry
4516          * about the boundary, really active task won't care
4517          * about the loss.
4518          */
4519         if (time_after(jiffies, current->wakee_flip_decay_ts + HZ)) {
4520                 current->wakee_flips >>= 1;
4521                 current->wakee_flip_decay_ts = jiffies;
4522         }
4523
4524         if (current->last_wakee != p) {
4525                 current->last_wakee = p;
4526                 current->wakee_flips++;
4527         }
4528 }
4529
4530 static void task_waking_fair(struct task_struct *p)
4531 {
4532         struct sched_entity *se = &p->se;
4533         struct cfs_rq *cfs_rq = cfs_rq_of(se);
4534         u64 min_vruntime;
4535
4536 #ifndef CONFIG_64BIT
4537         u64 min_vruntime_copy;
4538
4539         do {
4540                 min_vruntime_copy = cfs_rq->min_vruntime_copy;
4541                 smp_rmb();
4542                 min_vruntime = cfs_rq->min_vruntime;
4543         } while (min_vruntime != min_vruntime_copy);
4544 #else
4545         min_vruntime = cfs_rq->min_vruntime;
4546 #endif
4547
4548         se->vruntime -= min_vruntime;
4549         record_wakee(p);
4550 }
4551
4552 #ifdef CONFIG_FAIR_GROUP_SCHED
4553 /*
4554  * effective_load() calculates the load change as seen from the root_task_group
4555  *
4556  * Adding load to a group doesn't make a group heavier, but can cause movement
4557  * of group shares between cpus. Assuming the shares were perfectly aligned one
4558  * can calculate the shift in shares.
4559  *
4560  * Calculate the effective load difference if @wl is added (subtracted) to @tg
4561  * on this @cpu and results in a total addition (subtraction) of @wg to the
4562  * total group weight.
4563  *
4564  * Given a runqueue weight distribution (rw_i) we can compute a shares
4565  * distribution (s_i) using:
4566  *
4567  *   s_i = rw_i / \Sum rw_j                                             (1)
4568  *
4569  * Suppose we have 4 CPUs and our @tg is a direct child of the root group and
4570  * has 7 equal weight tasks, distributed as below (rw_i), with the resulting
4571  * shares distribution (s_i):
4572  *
4573  *   rw_i = {   2,   4,   1,   0 }
4574  *   s_i  = { 2/7, 4/7, 1/7,   0 }
4575  *
4576  * As per wake_affine() we're interested in the load of two CPUs (the CPU the
4577  * task used to run on and the CPU the waker is running on), we need to
4578  * compute the effect of waking a task on either CPU and, in case of a sync
4579  * wakeup, compute the effect of the current task going to sleep.
4580  *
4581  * So for a change of @wl to the local @cpu with an overall group weight change
4582  * of @wl we can compute the new shares distribution (s'_i) using:
4583  *
4584  *   s'_i = (rw_i + @wl) / (@wg + \Sum rw_j)                            (2)
4585  *
4586  * Suppose we're interested in CPUs 0 and 1, and want to compute the load
4587  * differences in waking a task to CPU 0. The additional task changes the
4588  * weight and shares distributions like:
4589  *
4590  *   rw'_i = {   3,   4,   1,   0 }
4591  *   s'_i  = { 3/8, 4/8, 1/8,   0 }
4592  *
4593  * We can then compute the difference in effective weight by using:
4594  *
4595  *   dw_i = S * (s'_i - s_i)                                            (3)
4596  *
4597  * Where 'S' is the group weight as seen by its parent.
4598  *
4599  * Therefore the effective change in loads on CPU 0 would be 5/56 (3/8 - 2/7)
4600  * times the weight of the group. The effect on CPU 1 would be -4/56 (4/8 -
4601  * 4/7) times the weight of the group.
4602  */
4603 static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
4604 {
4605         struct sched_entity *se = tg->se[cpu];
4606
4607         if (!tg->parent)        /* the trivial, non-cgroup case */
4608                 return wl;
4609
4610         for_each_sched_entity(se) {
4611                 struct cfs_rq *cfs_rq = se->my_q;
4612                 long W, w = cfs_rq_load_avg(cfs_rq);
4613
4614                 tg = cfs_rq->tg;
4615
4616                 /*
4617                  * W = @wg + \Sum rw_j
4618                  */
4619                 W = wg + atomic_long_read(&tg->load_avg);
4620
4621                 /* Ensure \Sum rw_j >= rw_i */
4622                 W -= cfs_rq->tg_load_avg_contrib;
4623                 W += w;
4624
4625                 /*
4626                  * w = rw_i + @wl
4627                  */
4628                 w += wl;
4629
4630                 /*
4631                  * wl = S * s'_i; see (2)
4632                  */
4633                 if (W > 0 && w < W)
4634                         wl = (w * (long)tg->shares) / W;
4635                 else
4636                         wl = tg->shares;
4637
4638                 /*
4639                  * Per the above, wl is the new se->load.weight value; since
4640                  * those are clipped to [MIN_SHARES, ...) do so now. See
4641                  * calc_cfs_shares().
4642                  */
4643                 if (wl < MIN_SHARES)
4644                         wl = MIN_SHARES;
4645
4646                 /*
4647                  * wl = dw_i = S * (s'_i - s_i); see (3)
4648                  */
4649                 wl -= se->avg.load_avg;
4650
4651                 /*
4652                  * Recursively apply this logic to all parent groups to compute
4653                  * the final effective load change on the root group. Since
4654                  * only the @tg group gets extra weight, all parent groups can
4655                  * only redistribute existing shares. @wl is the shift in shares
4656                  * resulting from this level per the above.
4657                  */
4658                 wg = 0;
4659         }
4660
4661         return wl;
4662 }
4663 #else
4664
4665 static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
4666 {
4667         return wl;
4668 }
4669
4670 #endif
4671
4672 /*
4673  * Detect M:N waker/wakee relationships via a switching-frequency heuristic.
4674  * A waker of many should wake a different task than the one last awakened
4675  * at a frequency roughly N times higher than one of its wakees.  In order
4676  * to determine whether we should let the load spread vs consolodating to
4677  * shared cache, we look for a minimum 'flip' frequency of llc_size in one
4678  * partner, and a factor of lls_size higher frequency in the other.  With
4679  * both conditions met, we can be relatively sure that the relationship is
4680  * non-monogamous, with partner count exceeding socket size.  Waker/wakee
4681  * being client/server, worker/dispatcher, interrupt source or whatever is
4682  * irrelevant, spread criteria is apparent partner count exceeds socket size.
4683  */
4684 static int wake_wide(struct task_struct *p)
4685 {
4686         unsigned int master = current->wakee_flips;
4687         unsigned int slave = p->wakee_flips;
4688         int factor = this_cpu_read(sd_llc_size);
4689
4690         if (master < slave)
4691                 swap(master, slave);
4692         if (slave < factor || master < slave * factor)
4693                 return 0;
4694         return 1;
4695 }
4696
4697 static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
4698 {
4699         s64 this_load, load;
4700         s64 this_eff_load, prev_eff_load;
4701         int idx, this_cpu, prev_cpu;
4702         struct task_group *tg;
4703         unsigned long weight;
4704         int balanced;
4705
4706         idx       = sd->wake_idx;
4707         this_cpu  = smp_processor_id();
4708         prev_cpu  = task_cpu(p);
4709         load      = source_load(prev_cpu, idx);
4710         this_load = target_load(this_cpu, idx);
4711
4712         /*
4713          * If sync wakeup then subtract the (maximum possible)
4714          * effect of the currently running task from the load
4715          * of the current CPU:
4716          */
4717         if (sync) {
4718                 tg = task_group(current);
4719                 weight = current->se.avg.load_avg;
4720
4721                 this_load += effective_load(tg, this_cpu, -weight, -weight);
4722                 load += effective_load(tg, prev_cpu, 0, -weight);
4723         }
4724
4725         tg = task_group(p);
4726         weight = p->se.avg.load_avg;
4727
4728         /*
4729          * In low-load situations, where prev_cpu is idle and this_cpu is idle
4730          * due to the sync cause above having dropped this_load to 0, we'll
4731          * always have an imbalance, but there's really nothing you can do
4732          * about that, so that's good too.
4733          *
4734          * Otherwise check if either cpus are near enough in load to allow this
4735          * task to be woken on this_cpu.
4736          */
4737         this_eff_load = 100;
4738         this_eff_load *= capacity_of(prev_cpu);
4739
4740         prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
4741         prev_eff_load *= capacity_of(this_cpu);
4742
4743         if (this_load > 0) {
4744                 this_eff_load *= this_load +
4745                         effective_load(tg, this_cpu, weight, weight);
4746
4747                 prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);
4748         }
4749
4750         balanced = this_eff_load <= prev_eff_load;
4751
4752         schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);
4753
4754         if (!balanced)
4755                 return 0;
4756
4757         schedstat_inc(sd, ttwu_move_affine);
4758         schedstat_inc(p, se.statistics.nr_wakeups_affine);
4759
4760         return 1;
4761 }
4762
4763 /*
4764  * find_idlest_group finds and returns the least busy CPU group within the
4765  * domain.
4766  */
4767 static struct sched_group *
4768 find_idlest_group(struct sched_domain *sd, struct task_struct *p,
4769                   int this_cpu, int sd_flag)
4770 {
4771         struct sched_group *idlest = NULL, *group = sd->groups;
4772         unsigned long min_load = ULONG_MAX, this_load = 0;
4773         int load_idx = sd->forkexec_idx;
4774         int imbalance = 100 + (sd->imbalance_pct-100)/2;
4775
4776         if (sd_flag & SD_BALANCE_WAKE)
4777                 load_idx = sd->wake_idx;
4778
4779         do {
4780                 unsigned long load, avg_load;
4781                 int local_group;
4782                 int i;
4783
4784                 /* Skip over this group if it has no CPUs allowed */
4785                 if (!cpumask_intersects(sched_group_cpus(group),
4786                                         tsk_cpus_allowed(p)))
4787                         continue;
4788
4789                 local_group = cpumask_test_cpu(this_cpu,
4790                                                sched_group_cpus(group));
4791
4792                 /* Tally up the load of all CPUs in the group */
4793                 avg_load = 0;
4794
4795                 for_each_cpu(i, sched_group_cpus(group)) {
4796                         /* Bias balancing toward cpus of our domain */
4797                         if (local_group)
4798                                 load = source_load(i, load_idx);
4799                         else
4800                                 load = target_load(i, load_idx);
4801
4802                         avg_load += load;
4803                 }
4804
4805                 /* Adjust by relative CPU capacity of the group */
4806                 avg_load = (avg_load * SCHED_CAPACITY_SCALE) / group->sgc->capacity;
4807
4808                 if (local_group) {
4809                         this_load = avg_load;
4810                 } else if (avg_load < min_load) {
4811                         min_load = avg_load;
4812                         idlest = group;
4813                 }
4814         } while (group = group->next, group != sd->groups);
4815
4816         if (!idlest || 100*this_load < imbalance*min_load)
4817                 return NULL;
4818         return idlest;
4819 }
4820
4821 /*
4822  * find_idlest_cpu - find the idlest cpu among the cpus in group.
4823  */
4824 static int
4825 find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
4826 {
4827         unsigned long load, min_load = ULONG_MAX;
4828         unsigned int min_exit_latency = UINT_MAX;
4829         u64 latest_idle_timestamp = 0;
4830         int least_loaded_cpu = this_cpu;
4831         int shallowest_idle_cpu = -1;
4832         int i;
4833
4834         /* Traverse only the allowed CPUs */
4835         for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
4836                 if (idle_cpu(i)) {
4837                         struct rq *rq = cpu_rq(i);
4838                         struct cpuidle_state *idle = idle_get_state(rq);
4839                         if (idle && idle->exit_latency < min_exit_latency) {
4840                                 /*
4841                                  * We give priority to a CPU whose idle state
4842                                  * has the smallest exit latency irrespective
4843                                  * of any idle timestamp.
4844                                  */
4845                                 min_exit_latency = idle->exit_latency;
4846                                 latest_idle_timestamp = rq->idle_stamp;
4847                                 shallowest_idle_cpu = i;
4848                         } else if ((!idle || idle->exit_latency == min_exit_latency) &&
4849                                    rq->idle_stamp > latest_idle_timestamp) {
4850                                 /*
4851                                  * If equal or no active idle state, then
4852                                  * the most recently idled CPU might have
4853                                  * a warmer cache.
4854                                  */
4855                                 latest_idle_timestamp = rq->idle_stamp;
4856                                 shallowest_idle_cpu = i;
4857                         }
4858                 } else if (shallowest_idle_cpu == -1) {
4859                         load = weighted_cpuload(i);
4860                         if (load < min_load || (load == min_load && i == this_cpu)) {
4861                                 min_load = load;
4862                                 least_loaded_cpu = i;
4863                         }
4864                 }
4865         }
4866
4867         return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
4868 }
4869
4870 /*
4871  * Try and locate an idle CPU in the sched_domain.
4872  */
4873 static int select_idle_sibling(struct task_struct *p, int target)
4874 {
4875         struct sched_domain *sd;
4876         struct sched_group *sg;
4877         int i = task_cpu(p);
4878
4879         if (idle_cpu(target))
4880                 return target;
4881
4882         /*
4883          * If the prevous cpu is cache affine and idle, don't be stupid.
4884          */
4885         if (i != target && cpus_share_cache(i, target) && idle_cpu(i))
4886                 return i;
4887
4888         /*
4889          * Otherwise, iterate the domains and find an elegible idle cpu.
4890          */
4891         sd = rcu_dereference(per_cpu(sd_llc, target));
4892         for_each_lower_domain(sd) {
4893                 sg = sd->groups;
4894                 do {
4895                         if (!cpumask_intersects(sched_group_cpus(sg),
4896                                                 tsk_cpus_allowed(p)))
4897                                 goto next;
4898
4899                         for_each_cpu(i, sched_group_cpus(sg)) {
4900                                 if (i == target || !idle_cpu(i))
4901                                         goto next;
4902                         }
4903
4904                         target = cpumask_first_and(sched_group_cpus(sg),
4905                                         tsk_cpus_allowed(p));
4906                         goto done;
4907 next:
4908                         sg = sg->next;
4909                 } while (sg != sd->groups);
4910         }
4911 done:
4912         return target;
4913 }
4914
4915 /*
4916  * cpu_util returns the amount of capacity of a CPU that is used by CFS
4917  * tasks. The unit of the return value must be the one of capacity so we can
4918  * compare the utilization with the capacity of the CPU that is available for
4919  * CFS task (ie cpu_capacity).
4920  *
4921  * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the
4922  * recent utilization of currently non-runnable tasks on a CPU. It represents
4923  * the amount of utilization of a CPU in the range [0..capacity_orig] where
4924  * capacity_orig is the cpu_capacity available at the highest frequency
4925  * (arch_scale_freq_capacity()).
4926  * The utilization of a CPU converges towards a sum equal to or less than the
4927  * current capacity (capacity_curr <= capacity_orig) of the CPU because it is
4928  * the running time on this CPU scaled by capacity_curr.
4929  *
4930  * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even
4931  * higher than capacity_orig because of unfortunate rounding in
4932  * cfs.avg.util_avg or just after migrating tasks and new task wakeups until
4933  * the average stabilizes with the new running time. We need to check that the
4934  * utilization stays within the range of [0..capacity_orig] and cap it if
4935  * necessary. Without utilization capping, a group could be seen as overloaded
4936  * (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of
4937  * available capacity. We allow utilization to overshoot capacity_curr (but not
4938  * capacity_orig) as it useful for predicting the capacity required after task
4939  * migrations (scheduler-driven DVFS).
4940  */
4941 static int cpu_util(int cpu)
4942 {
4943         unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg;
4944         unsigned long capacity = capacity_orig_of(cpu);
4945
4946         return (util >= capacity) ? capacity : util;
4947 }
4948
4949 /*
4950  * select_task_rq_fair: Select target runqueue for the waking task in domains
4951  * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
4952  * SD_BALANCE_FORK, or SD_BALANCE_EXEC.
4953  *
4954  * Balances load by selecting the idlest cpu in the idlest group, or under
4955  * certain conditions an idle sibling cpu if the domain has SD_WAKE_AFFINE set.
4956  *
4957  * Returns the target cpu number.
4958  *
4959  * preempt must be disabled.
4960  */
4961 static int
4962 select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags)
4963 {
4964         struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
4965         int cpu = smp_processor_id();
4966         int new_cpu = prev_cpu;
4967         int want_affine = 0;
4968         int sync = wake_flags & WF_SYNC;
4969
4970         if (sd_flag & SD_BALANCE_WAKE)
4971                 want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
4972
4973         rcu_read_lock();
4974         for_each_domain(cpu, tmp) {
4975                 if (!(tmp->flags & SD_LOAD_BALANCE))
4976                         break;
4977
4978                 /*
4979                  * If both cpu and prev_cpu are part of this domain,
4980                  * cpu is a valid SD_WAKE_AFFINE target.
4981                  */
4982                 if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
4983                     cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
4984                         affine_sd = tmp;
4985                         break;
4986                 }
4987
4988                 if (tmp->flags & sd_flag)
4989                         sd = tmp;
4990                 else if (!want_affine)
4991                         break;
4992         }
4993
4994         if (affine_sd) {
4995                 sd = NULL; /* Prefer wake_affine over balance flags */
4996                 if (cpu != prev_cpu && wake_affine(affine_sd, p, sync))
4997                         new_cpu = cpu;
4998         }
4999
5000         if (!sd) {
5001                 if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
5002                         new_cpu = select_idle_sibling(p, new_cpu);
5003
5004         } else while (sd) {
5005                 struct sched_group *group;
5006                 int weight;
5007
5008                 if (!(sd->flags & sd_flag)) {
5009                         sd = sd->child;
5010                         continue;
5011                 }
5012
5013                 group = find_idlest_group(sd, p, cpu, sd_flag);
5014                 if (!group) {
5015                         sd = sd->child;
5016                         continue;
5017                 }
5018
5019                 new_cpu = find_idlest_cpu(group, p, cpu);
5020                 if (new_cpu == -1 || new_cpu == cpu) {
5021                         /* Now try balancing at a lower domain level of cpu */
5022                         sd = sd->child;
5023                         continue;
5024                 }
5025
5026                 /* Now try balancing at a lower domain level of new_cpu */
5027                 cpu = new_cpu;
5028                 weight = sd->span_weight;
5029                 sd = NULL;
5030                 for_each_domain(cpu, tmp) {
5031                         if (weight <= tmp->span_weight)
5032                                 break;
5033                         if (tmp->flags & sd_flag)
5034                                 sd = tmp;
5035                 }
5036                 /* while loop will break here if sd == NULL */
5037         }
5038         rcu_read_unlock();
5039
5040         return new_cpu;
5041 }
5042
5043 /*
5044  * Called immediately before a task is migrated to a new cpu; task_cpu(p) and
5045  * cfs_rq_of(p) references at time of call are still valid and identify the
5046  * previous cpu.  However, the caller only guarantees p->pi_lock is held; no
5047  * other assumptions, including the state of rq->lock, should be made.
5048  */
5049 static void migrate_task_rq_fair(struct task_struct *p)
5050 {
5051         /*
5052          * We are supposed to update the task to "current" time, then its up to date
5053          * and ready to go to new CPU/cfs_rq. But we have difficulty in getting
5054          * what current time is, so simply throw away the out-of-date time. This
5055          * will result in the wakee task is less decayed, but giving the wakee more
5056          * load sounds not bad.
5057          */
5058         remove_entity_load_avg(&p->se);
5059
5060         /* Tell new CPU we are migrated */
5061         p->se.avg.last_update_time = 0;
5062
5063         /* We have migrated, no longer consider this task hot */
5064         p->se.exec_start = 0;
5065 }
5066
5067 static void task_dead_fair(struct task_struct *p)
5068 {
5069         remove_entity_load_avg(&p->se);
5070 }
5071 #endif /* CONFIG_SMP */
5072
5073 static unsigned long
5074 wakeup_gran(struct sched_entity *curr, struct sched_entity *se)
5075 {
5076         unsigned long gran = sysctl_sched_wakeup_granularity;
5077
5078         /*
5079          * Since its curr running now, convert the gran from real-time
5080          * to virtual-time in his units.
5081          *
5082          * By using 'se' instead of 'curr' we penalize light tasks, so
5083          * they get preempted easier. That is, if 'se' < 'curr' then
5084          * the resulting gran will be larger, therefore penalizing the
5085          * lighter, if otoh 'se' > 'curr' then the resulting gran will
5086          * be smaller, again penalizing the lighter task.
5087          *
5088          * This is especially important for buddies when the leftmost
5089          * task is higher priority than the buddy.
5090          */
5091         return calc_delta_fair(gran, se);
5092 }
5093
5094 /*
5095  * Should 'se' preempt 'curr'.
5096  *
5097  *             |s1
5098  *        |s2
5099  *   |s3
5100  *         g
5101  *      |<--->|c
5102  *
5103  *  w(c, s1) = -1
5104  *  w(c, s2) =  0
5105  *  w(c, s3) =  1
5106  *
5107  */
5108 static int
5109 wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
5110 {
5111         s64 gran, vdiff = curr->vruntime - se->vruntime;
5112
5113         if (vdiff <= 0)
5114                 return -1;
5115
5116         gran = wakeup_gran(curr, se);
5117         if (vdiff > gran)
5118                 return 1;
5119
5120         return 0;
5121 }
5122
5123 static void set_last_buddy(struct sched_entity *se)
5124 {
5125         if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
5126                 return;
5127
5128         for_each_sched_entity(se)
5129                 cfs_rq_of(se)->last = se;
5130 }
5131
5132 static void set_next_buddy(struct sched_entity *se)
5133 {
5134         if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
5135                 return;
5136
5137         for_each_sched_entity(se)
5138                 cfs_rq_of(se)->next = se;
5139 }
5140
5141 static void set_skip_buddy(struct sched_entity *se)
5142 {
5143         for_each_sched_entity(se)
5144                 cfs_rq_of(se)->skip = se;
5145 }
5146
5147 /*
5148  * Preempt the current task with a newly woken task if needed:
5149  */
5150 static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
5151 {
5152         struct task_struct *curr = rq->curr;
5153         struct sched_entity *se = &curr->se, *pse = &p->se;
5154         struct cfs_rq *cfs_rq = task_cfs_rq(curr);
5155         int scale = cfs_rq->nr_running >= sched_nr_latency;
5156         int next_buddy_marked = 0;
5157
5158         if (unlikely(se == pse))
5159                 return;
5160
5161         /*
5162          * This is possible from callers such as attach_tasks(), in which we
5163          * unconditionally check_prempt_curr() after an enqueue (which may have
5164          * lead to a throttle).  This both saves work and prevents false
5165          * next-buddy nomination below.
5166          */
5167         if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
5168                 return;
5169
5170         if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
5171                 set_next_buddy(pse);
5172                 next_buddy_marked = 1;
5173         }
5174
5175         /*
5176          * We can come here with TIF_NEED_RESCHED already set from new task
5177          * wake up path.
5178          *
5179          * Note: this also catches the edge-case of curr being in a throttled
5180          * group (e.g. via set_curr_task), since update_curr() (in the
5181          * enqueue of curr) will have resulted in resched being set.  This
5182          * prevents us from potentially nominating it as a false LAST_BUDDY
5183          * below.
5184          */
5185         if (test_tsk_need_resched(curr))
5186                 return;
5187
5188         /* Idle tasks are by definition preempted by non-idle tasks. */
5189         if (unlikely(curr->policy == SCHED_IDLE) &&
5190             likely(p->policy != SCHED_IDLE))
5191                 goto preempt;
5192
5193         /*
5194          * Batch and idle tasks do not preempt non-idle tasks (their preemption
5195          * is driven by the tick):
5196          */
5197         if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION))
5198                 return;
5199
5200         find_matching_se(&se, &pse);
5201         update_curr(cfs_rq_of(se));
5202         BUG_ON(!pse);
5203         if (wakeup_preempt_entity(se, pse) == 1) {
5204                 /*
5205                  * Bias pick_next to pick the sched entity that is
5206                  * triggering this preemption.
5207                  */
5208                 if (!next_buddy_marked)
5209                         set_next_buddy(pse);
5210                 goto preempt;
5211         }
5212
5213         return;
5214
5215 preempt:
5216         resched_curr(rq);
5217         /*
5218          * Only set the backward buddy when the current task is still
5219          * on the rq. This can happen when a wakeup gets interleaved
5220          * with schedule on the ->pre_schedule() or idle_balance()
5221          * point, either of which can * drop the rq lock.
5222          *
5223          * Also, during early boot the idle thread is in the fair class,
5224          * for obvious reasons its a bad idea to schedule back to it.
5225          */
5226         if (unlikely(!se->on_rq || curr == rq->idle))
5227                 return;
5228
5229         if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
5230                 set_last_buddy(se);
5231 }
5232
5233 static struct task_struct *
5234 pick_next_task_fair(struct rq *rq, struct task_struct *prev)
5235 {
5236         struct cfs_rq *cfs_rq = &rq->cfs;
5237         struct sched_entity *se;
5238         struct task_struct *p;
5239         int new_tasks;
5240
5241 again:
5242 #ifdef CONFIG_FAIR_GROUP_SCHED
5243         if (!cfs_rq->nr_running)
5244                 goto idle;
5245
5246         if (prev->sched_class != &fair_sched_class)
5247                 goto simple;
5248
5249         /*
5250          * Because of the set_next_buddy() in dequeue_task_fair() it is rather
5251          * likely that a next task is from the same cgroup as the current.
5252          *
5253          * Therefore attempt to avoid putting and setting the entire cgroup
5254          * hierarchy, only change the part that actually changes.
5255          */
5256
5257         do {
5258                 struct sched_entity *curr = cfs_rq->curr;
5259
5260                 /*
5261                  * Since we got here without doing put_prev_entity() we also
5262                  * have to consider cfs_rq->curr. If it is still a runnable
5263                  * entity, update_curr() will update its vruntime, otherwise
5264                  * forget we've ever seen it.
5265                  */
5266                 if (curr) {
5267                         if (curr->on_rq)
5268                                 update_curr(cfs_rq);
5269                         else
5270                                 curr = NULL;
5271
5272                         /*
5273                          * This call to check_cfs_rq_runtime() will do the
5274                          * throttle and dequeue its entity in the parent(s).
5275                          * Therefore the 'simple' nr_running test will indeed
5276                          * be correct.
5277                          */
5278                         if (unlikely(check_cfs_rq_runtime(cfs_rq)))
5279                                 goto simple;
5280                 }
5281
5282                 se = pick_next_entity(cfs_rq, curr);
5283                 cfs_rq = group_cfs_rq(se);
5284         } while (cfs_rq);
5285
5286         p = task_of(se);
5287
5288         /*
5289          * Since we haven't yet done put_prev_entity and if the selected task
5290          * is a different task than we started out with, try and touch the
5291          * least amount of cfs_rqs.
5292          */
5293         if (prev != p) {
5294                 struct sched_entity *pse = &prev->se;
5295
5296                 while (!(cfs_rq = is_same_group(se, pse))) {
5297                         int se_depth = se->depth;
5298                         int pse_depth = pse->depth;
5299
5300                         if (se_depth <= pse_depth) {
5301                                 put_prev_entity(cfs_rq_of(pse), pse);
5302                                 pse = parent_entity(pse);
5303                         }
5304                         if (se_depth >= pse_depth) {
5305                                 set_next_entity(cfs_rq_of(se), se);
5306                                 se = parent_entity(se);
5307                         }
5308                 }
5309
5310                 put_prev_entity(cfs_rq, pse);
5311                 set_next_entity(cfs_rq, se);
5312         }
5313
5314         if (hrtick_enabled(rq))
5315                 hrtick_start_fair(rq, p);
5316
5317         return p;
5318 simple:
5319         cfs_rq = &rq->cfs;
5320 #endif
5321
5322         if (!cfs_rq->nr_running)
5323                 goto idle;
5324
5325         put_prev_task(rq, prev);
5326
5327         do {
5328                 se = pick_next_entity(cfs_rq, NULL);
5329                 set_next_entity(cfs_rq, se);
5330                 cfs_rq = group_cfs_rq(se);
5331         } while (cfs_rq);
5332
5333         p = task_of(se);
5334
5335         if (hrtick_enabled(rq))
5336                 hrtick_start_fair(rq, p);
5337
5338         return p;
5339
5340 idle:
5341         /*
5342          * This is OK, because current is on_cpu, which avoids it being picked
5343          * for load-balance and preemption/IRQs are still disabled avoiding
5344          * further scheduler activity on it and we're being very careful to
5345          * re-start the picking loop.
5346          */
5347         lockdep_unpin_lock(&rq->lock);
5348         new_tasks = idle_balance(rq);
5349         lockdep_pin_lock(&rq->lock);
5350         /*
5351          * Because idle_balance() releases (and re-acquires) rq->lock, it is
5352          * possible for any higher priority task to appear. In that case we
5353          * must re-start the pick_next_entity() loop.
5354          */
5355         if (new_tasks < 0)
5356                 return RETRY_TASK;
5357
5358         if (new_tasks > 0)
5359                 goto again;
5360
5361         return NULL;
5362 }
5363
5364 /*
5365  * Account for a descheduled task:
5366  */
5367 static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
5368 {
5369         struct sched_entity *se = &prev->se;
5370         struct cfs_rq *cfs_rq;
5371
5372         for_each_sched_entity(se) {
5373                 cfs_rq = cfs_rq_of(se);
5374                 put_prev_entity(cfs_rq, se);
5375         }
5376 }
5377
5378 /*
5379  * sched_yield() is very simple
5380  *
5381  * The magic of dealing with the ->skip buddy is in pick_next_entity.
5382  */
5383 static void yield_task_fair(struct rq *rq)
5384 {
5385         struct task_struct *curr = rq->curr;
5386         struct cfs_rq *cfs_rq = task_cfs_rq(curr);
5387         struct sched_entity *se = &curr->se;
5388
5389         /*
5390          * Are we the only task in the tree?
5391          */
5392         if (unlikely(rq->nr_running == 1))
5393                 return;
5394
5395         clear_buddies(cfs_rq, se);
5396
5397         if (curr->policy != SCHED_BATCH) {
5398                 update_rq_clock(rq);
5399                 /*
5400                  * Update run-time statistics of the 'current'.
5401                  */
5402                 update_curr(cfs_rq);
5403                 /*
5404                  * Tell update_rq_clock() that we've just updated,
5405                  * so we don't do microscopic update in schedule()
5406                  * and double the fastpath cost.
5407                  */
5408                 rq_clock_skip_update(rq, true);
5409         }
5410
5411         set_skip_buddy(se);
5412 }
5413
5414 static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt)
5415 {
5416         struct sched_entity *se = &p->se;
5417
5418         /* throttled hierarchies are not runnable */
5419         if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se)))
5420                 return false;
5421
5422         /* Tell the scheduler that we'd really like pse to run next. */
5423         set_next_buddy(se);
5424
5425         yield_task_fair(rq);
5426
5427         return true;
5428 }
5429
5430 #ifdef CONFIG_SMP
5431 /**************************************************
5432  * Fair scheduling class load-balancing methods.
5433  *
5434  * BASICS
5435  *
5436  * The purpose of load-balancing is to achieve the same basic fairness the
5437  * per-cpu scheduler provides, namely provide a proportional amount of compute
5438  * time to each task. This is expressed in the following equation:
5439  *
5440  *   W_i,n/P_i == W_j,n/P_j for all i,j                               (1)
5441  *
5442  * Where W_i,n is the n-th weight average for cpu i. The instantaneous weight
5443  * W_i,0 is defined as:
5444  *
5445  *   W_i,0 = \Sum_j w_i,j                                             (2)
5446  *
5447  * Where w_i,j is the weight of the j-th runnable task on cpu i. This weight
5448  * is derived from the nice value as per prio_to_weight[].
5449  *
5450  * The weight average is an exponential decay average of the instantaneous
5451  * weight:
5452  *
5453  *   W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0               (3)
5454  *
5455  * C_i is the compute capacity of cpu i, typically it is the
5456  * fraction of 'recent' time available for SCHED_OTHER task execution. But it
5457  * can also include other factors [XXX].
5458  *
5459  * To achieve this balance we define a measure of imbalance which follows
5460  * directly from (1):
5461  *
5462  *   imb_i,j = max{ avg(W/C), W_i/C_i } - min{ avg(W/C), W_j/C_j }    (4)
5463  *
5464  * We them move tasks around to minimize the imbalance. In the continuous
5465  * function space it is obvious this converges, in the discrete case we get
5466  * a few fun cases generally called infeasible weight scenarios.
5467  *
5468  * [XXX expand on:
5469  *     - infeasible weights;
5470  *     - local vs global optima in the discrete case. ]
5471  *
5472  *
5473  * SCHED DOMAINS
5474  *
5475  * In order to solve the imbalance equation (4), and avoid the obvious O(n^2)
5476  * for all i,j solution, we create a tree of cpus that follows the hardware
5477  * topology where each level pairs two lower groups (or better). This results
5478  * in O(log n) layers. Furthermore we reduce the number of cpus going up the
5479  * tree to only the first of the previous level and we decrease the frequency
5480  * of load-balance at each level inv. proportional to the number of cpus in
5481  * the groups.
5482  *
5483  * This yields:
5484  *
5485  *     log_2 n     1     n
5486  *   \Sum       { --- * --- * 2^i } = O(n)                            (5)
5487  *     i = 0      2^i   2^i
5488  *                               `- size of each group
5489  *         |         |     `- number of cpus doing load-balance
5490  *         |         `- freq
5491  *         `- sum over all levels
5492  *
5493  * Coupled with a limit on how many tasks we can migrate every balance pass,
5494  * this makes (5) the runtime complexity of the balancer.
5495  *
5496  * An important property here is that each CPU is still (indirectly) connected
5497  * to every other cpu in at most O(log n) steps:
5498  *
5499  * The adjacency matrix of the resulting graph is given by:
5500  *
5501  *             log_2 n
5502  *   A_i,j = \Union     (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1)  (6)
5503  *             k = 0
5504  *
5505  * And you'll find that:
5506  *
5507  *   A^(log_2 n)_i,j != 0  for all i,j                                (7)
5508  *
5509  * Showing there's indeed a path between every cpu in at most O(log n) steps.
5510  * The task movement gives a factor of O(m), giving a convergence complexity
5511  * of:
5512  *
5513  *   O(nm log n),  n := nr_cpus, m := nr_tasks                        (8)
5514  *
5515  *
5516  * WORK CONSERVING
5517  *
5518  * In order to avoid CPUs going idle while there's still work to do, new idle
5519  * balancing is more aggressive and has the newly idle cpu iterate up the domain
5520  * tree itself instead of relying on other CPUs to bring it work.
5521  *
5522  * This adds some complexity to both (5) and (8) but it reduces the total idle
5523  * time.
5524  *
5525  * [XXX more?]
5526  *
5527  *
5528  * CGROUPS
5529  *
5530  * Cgroups make a horror show out of (2), instead of a simple sum we get:
5531  *
5532  *                                s_k,i
5533  *   W_i,0 = \Sum_j \Prod_k w_k * -----                               (9)
5534  *                                 S_k
5535  *
5536  * Where
5537  *
5538  *   s_k,i = \Sum_j w_i,j,k  and  S_k = \Sum_i s_k,i                 (10)
5539  *
5540  * w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on cpu i.
5541  *
5542  * The big problem is S_k, its a global sum needed to compute a local (W_i)
5543  * property.
5544  *
5545  * [XXX write more on how we solve this.. _after_ merging pjt's patches that
5546  *      rewrite all of this once again.]
5547  */
5548
5549 static unsigned long __read_mostly max_load_balance_interval = HZ/10;
5550
5551 enum fbq_type { regular, remote, all };
5552
5553 #define LBF_ALL_PINNED  0x01
5554 #define LBF_NEED_BREAK  0x02
5555 #define LBF_DST_PINNED  0x04
5556 #define LBF_SOME_PINNED 0x08
5557
5558 struct lb_env {
5559         struct sched_domain     *sd;
5560
5561         struct rq               *src_rq;
5562         int                     src_cpu;
5563
5564         int                     dst_cpu;
5565         struct rq               *dst_rq;
5566
5567         struct cpumask          *dst_grpmask;
5568         int                     new_dst_cpu;
5569         enum cpu_idle_type      idle;
5570         long                    imbalance;
5571         /* The set of CPUs under consideration for load-balancing */
5572         struct cpumask          *cpus;
5573
5574         unsigned int            flags;
5575
5576         unsigned int            loop;
5577         unsigned int            loop_break;
5578         unsigned int            loop_max;
5579
5580         enum fbq_type           fbq_type;
5581         struct list_head        tasks;
5582 };
5583
5584 /*
5585  * Is this task likely cache-hot:
5586  */
5587 static int task_hot(struct task_struct *p, struct lb_env *env)
5588 {
5589         s64 delta;
5590
5591         lockdep_assert_held(&env->src_rq->lock);
5592
5593         if (p->sched_class != &fair_sched_class)
5594                 return 0;
5595
5596         if (unlikely(p->policy == SCHED_IDLE))
5597                 return 0;
5598
5599         /*
5600          * Buddy candidates are cache hot:
5601          */
5602         if (sched_feat(CACHE_HOT_BUDDY) && env->dst_rq->nr_running &&
5603                         (&p->se == cfs_rq_of(&p->se)->next ||
5604                          &p->se == cfs_rq_of(&p->se)->last))
5605                 return 1;
5606
5607         if (sysctl_sched_migration_cost == -1)
5608                 return 1;
5609         if (sysctl_sched_migration_cost == 0)
5610                 return 0;
5611
5612         delta = rq_clock_task(env->src_rq) - p->se.exec_start;
5613
5614         return delta < (s64)sysctl_sched_migration_cost;
5615 }
5616
5617 #ifdef CONFIG_NUMA_BALANCING
5618 /*
5619  * Returns 1, if task migration degrades locality
5620  * Returns 0, if task migration improves locality i.e migration preferred.
5621  * Returns -1, if task migration is not affected by locality.
5622  */
5623 static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
5624 {
5625         struct numa_group *numa_group = rcu_dereference(p->numa_group);
5626         unsigned long src_faults, dst_faults;
5627         int src_nid, dst_nid;
5628
5629         if (!static_branch_likely(&sched_numa_balancing))
5630                 return -1;
5631
5632         if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
5633                 return -1;
5634
5635         src_nid = cpu_to_node(env->src_cpu);
5636         dst_nid = cpu_to_node(env->dst_cpu);
5637
5638         if (src_nid == dst_nid)
5639                 return -1;
5640
5641         /* Migrating away from the preferred node is always bad. */
5642         if (src_nid == p->numa_preferred_nid) {
5643                 if (env->src_rq->nr_running > env->src_rq->nr_preferred_running)
5644                         return 1;
5645                 else
5646                         return -1;
5647         }
5648
5649         /* Encourage migration to the preferred node. */
5650         if (dst_nid == p->numa_preferred_nid)
5651                 return 0;
5652
5653         if (numa_group) {
5654                 src_faults = group_faults(p, src_nid);
5655                 dst_faults = group_faults(p, dst_nid);
5656         } else {
5657                 src_faults = task_faults(p, src_nid);
5658                 dst_faults = task_faults(p, dst_nid);
5659         }
5660
5661         return dst_faults < src_faults;
5662 }
5663
5664 #else
5665 static inline int migrate_degrades_locality(struct task_struct *p,
5666                                              struct lb_env *env)
5667 {
5668         return -1;
5669 }
5670 #endif
5671
5672 /*
5673  * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
5674  */
5675 static
5676 int can_migrate_task(struct task_struct *p, struct lb_env *env)
5677 {
5678         int tsk_cache_hot;
5679
5680         lockdep_assert_held(&env->src_rq->lock);
5681
5682         /*
5683          * We do not migrate tasks that are:
5684          * 1) throttled_lb_pair, or
5685          * 2) cannot be migrated to this CPU due to cpus_allowed, or
5686          * 3) running (obviously), or
5687          * 4) are cache-hot on their current CPU.
5688          */
5689         if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
5690                 return 0;
5691
5692         if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
5693                 int cpu;
5694
5695                 schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
5696
5697                 env->flags |= LBF_SOME_PINNED;
5698
5699                 /*
5700                  * Remember if this task can be migrated to any other cpu in
5701                  * our sched_group. We may want to revisit it if we couldn't
5702                  * meet load balance goals by pulling other tasks on src_cpu.
5703                  *
5704                  * Also avoid computing new_dst_cpu if we have already computed
5705                  * one in current iteration.
5706                  */
5707                 if (!env->dst_grpmask || (env->flags & LBF_DST_PINNED))
5708                         return 0;
5709
5710                 /* Prevent to re-select dst_cpu via env's cpus */
5711                 for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
5712                         if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) {
5713                                 env->flags |= LBF_DST_PINNED;
5714                                 env->new_dst_cpu = cpu;
5715                                 break;
5716                         }
5717                 }
5718
5719                 return 0;
5720         }
5721
5722         /* Record that we found atleast one task that could run on dst_cpu */
5723         env->flags &= ~LBF_ALL_PINNED;
5724
5725         if (task_running(env->src_rq, p)) {
5726                 schedstat_inc(p, se.statistics.nr_failed_migrations_running);
5727                 return 0;
5728         }
5729
5730         /*
5731          * Aggressive migration if:
5732          * 1) destination numa is preferred
5733          * 2) task is cache cold, or
5734          * 3) too many balance attempts have failed.
5735          */
5736         tsk_cache_hot = migrate_degrades_locality(p, env);
5737         if (tsk_cache_hot == -1)
5738                 tsk_cache_hot = task_hot(p, env);
5739
5740         if (tsk_cache_hot <= 0 ||
5741             env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
5742                 if (tsk_cache_hot == 1) {
5743                         schedstat_inc(env->sd, lb_hot_gained[env->idle]);
5744                         schedstat_inc(p, se.statistics.nr_forced_migrations);
5745                 }
5746                 return 1;
5747         }
5748
5749         schedstat_inc(p, se.statistics.nr_failed_migrations_hot);
5750         return 0;
5751 }
5752
5753 /*
5754  * detach_task() -- detach the task for the migration specified in env
5755  */
5756 static void detach_task(struct task_struct *p, struct lb_env *env)
5757 {
5758         lockdep_assert_held(&env->src_rq->lock);
5759
5760         deactivate_task(env->src_rq, p, 0);
5761         p->on_rq = TASK_ON_RQ_MIGRATING;
5762         set_task_cpu(p, env->dst_cpu);
5763 }
5764
5765 /*
5766  * detach_one_task() -- tries to dequeue exactly one task from env->src_rq, as
5767  * part of active balancing operations within "domain".
5768  *
5769  * Returns a task if successful and NULL otherwise.
5770  */
5771 static struct task_struct *detach_one_task(struct lb_env *env)
5772 {
5773         struct task_struct *p, *n;
5774
5775         lockdep_assert_held(&env->src_rq->lock);
5776
5777         list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
5778                 if (!can_migrate_task(p, env))
5779                         continue;
5780
5781                 detach_task(p, env);
5782
5783                 /*
5784                  * Right now, this is only the second place where
5785                  * lb_gained[env->idle] is updated (other is detach_tasks)
5786                  * so we can safely collect stats here rather than
5787                  * inside detach_tasks().
5788                  */
5789                 schedstat_inc(env->sd, lb_gained[env->idle]);
5790                 return p;
5791         }
5792         return NULL;
5793 }
5794
5795 static const unsigned int sched_nr_migrate_break = 32;
5796
5797 /*
5798  * detach_tasks() -- tries to detach up to imbalance weighted load from
5799  * busiest_rq, as part of a balancing operation within domain "sd".
5800  *
5801  * Returns number of detached tasks if successful and 0 otherwise.
5802  */
5803 static int detach_tasks(struct lb_env *env)
5804 {
5805         struct list_head *tasks = &env->src_rq->cfs_tasks;
5806         struct task_struct *p;
5807         unsigned long load;
5808         int detached = 0;
5809
5810         lockdep_assert_held(&env->src_rq->lock);
5811
5812         if (env->imbalance <= 0)
5813                 return 0;
5814
5815         while (!list_empty(tasks)) {
5816                 /*
5817                  * We don't want to steal all, otherwise we may be treated likewise,
5818                  * which could at worst lead to a livelock crash.
5819                  */
5820                 if (env->idle != CPU_NOT_IDLE && env->src_rq->nr_running <= 1)
5821                         break;
5822
5823                 p = list_first_entry(tasks, struct task_struct, se.group_node);
5824
5825                 env->loop++;
5826                 /* We've more or less seen every task there is, call it quits */
5827                 if (env->loop > env->loop_max)
5828                         break;
5829
5830                 /* take a breather every nr_migrate tasks */
5831                 if (env->loop > env->loop_break) {
5832                         env->loop_break += sched_nr_migrate_break;
5833                         env->flags |= LBF_NEED_BREAK;
5834                         break;
5835                 }
5836
5837                 if (!can_migrate_task(p, env))
5838                         goto next;
5839
5840                 load = task_h_load(p);
5841
5842                 if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
5843                         goto next;
5844
5845                 if ((load / 2) > env->imbalance)
5846                         goto next;
5847
5848                 detach_task(p, env);
5849                 list_add(&p->se.group_node, &env->tasks);
5850
5851                 detached++;
5852                 env->imbalance -= load;
5853
5854 #ifdef CONFIG_PREEMPT
5855                 /*
5856                  * NEWIDLE balancing is a source of latency, so preemptible
5857                  * kernels will stop after the first task is detached to minimize
5858                  * the critical section.
5859                  */
5860                 if (env->idle == CPU_NEWLY_IDLE)
5861                         break;
5862 #endif
5863
5864                 /*
5865                  * We only want to steal up to the prescribed amount of
5866                  * weighted load.
5867                  */
5868                 if (env->imbalance <= 0)
5869                         break;
5870
5871                 continue;
5872 next:
5873                 list_move_tail(&p->se.group_node, tasks);
5874         }
5875
5876         /*
5877          * Right now, this is one of only two places we collect this stat
5878          * so we can safely collect detach_one_task() stats here rather
5879          * than inside detach_one_task().
5880          */
5881         schedstat_add(env->sd, lb_gained[env->idle], detached);
5882
5883         return detached;
5884 }
5885
5886 /*
5887  * attach_task() -- attach the task detached by detach_task() to its new rq.
5888  */
5889 static void attach_task(struct rq *rq, struct task_struct *p)
5890 {
5891         lockdep_assert_held(&rq->lock);
5892
5893         BUG_ON(task_rq(p) != rq);
5894         p->on_rq = TASK_ON_RQ_QUEUED;
5895         activate_task(rq, p, 0);
5896         check_preempt_curr(rq, p, 0);
5897 }
5898
5899 /*
5900  * attach_one_task() -- attaches the task returned from detach_one_task() to
5901  * its new rq.
5902  */
5903 static void attach_one_task(struct rq *rq, struct task_struct *p)
5904 {
5905         raw_spin_lock(&rq->lock);
5906         attach_task(rq, p);
5907         raw_spin_unlock(&rq->lock);
5908 }
5909
5910 /*
5911  * attach_tasks() -- attaches all tasks detached by detach_tasks() to their
5912  * new rq.
5913  */
5914 static void attach_tasks(struct lb_env *env)
5915 {
5916         struct list_head *tasks = &env->tasks;
5917         struct task_struct *p;
5918
5919         raw_spin_lock(&env->dst_rq->lock);
5920
5921         while (!list_empty(tasks)) {
5922                 p = list_first_entry(tasks, struct task_struct, se.group_node);
5923                 list_del_init(&p->se.group_node);
5924
5925                 attach_task(env->dst_rq, p);
5926         }
5927
5928         raw_spin_unlock(&env->dst_rq->lock);
5929 }
5930
5931 #ifdef CONFIG_FAIR_GROUP_SCHED
5932 static void update_blocked_averages(int cpu)
5933 {
5934         struct rq *rq = cpu_rq(cpu);
5935         struct cfs_rq *cfs_rq;
5936         unsigned long flags;
5937
5938         raw_spin_lock_irqsave(&rq->lock, flags);
5939         update_rq_clock(rq);
5940
5941         /*
5942          * Iterates the task_group tree in a bottom up fashion, see
5943          * list_add_leaf_cfs_rq() for details.
5944          */
5945         for_each_leaf_cfs_rq(rq, cfs_rq) {
5946                 /* throttled entities do not contribute to load */
5947                 if (throttled_hierarchy(cfs_rq))
5948                         continue;
5949
5950                 if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq))
5951                         update_tg_load_avg(cfs_rq, 0);
5952         }
5953         raw_spin_unlock_irqrestore(&rq->lock, flags);
5954 }
5955
5956 /*
5957  * Compute the hierarchical load factor for cfs_rq and all its ascendants.
5958  * This needs to be done in a top-down fashion because the load of a child
5959  * group is a fraction of its parents load.
5960  */
5961 static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
5962 {
5963         struct rq *rq = rq_of(cfs_rq);
5964         struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];
5965         unsigned long now = jiffies;
5966         unsigned long load;
5967
5968         if (cfs_rq->last_h_load_update == now)
5969                 return;
5970
5971         cfs_rq->h_load_next = NULL;
5972         for_each_sched_entity(se) {
5973                 cfs_rq = cfs_rq_of(se);
5974                 cfs_rq->h_load_next = se;
5975                 if (cfs_rq->last_h_load_update == now)
5976                         break;
5977         }
5978
5979         if (!se) {
5980                 cfs_rq->h_load = cfs_rq_load_avg(cfs_rq);
5981                 cfs_rq->last_h_load_update = now;
5982         }
5983
5984         while ((se = cfs_rq->h_load_next) != NULL) {
5985                 load = cfs_rq->h_load;
5986                 load = div64_ul(load * se->avg.load_avg,
5987                         cfs_rq_load_avg(cfs_rq) + 1);
5988                 cfs_rq = group_cfs_rq(se);
5989                 cfs_rq->h_load = load;
5990                 cfs_rq->last_h_load_update = now;
5991         }
5992 }
5993
5994 static unsigned long task_h_load(struct task_struct *p)
5995 {
5996         struct cfs_rq *cfs_rq = task_cfs_rq(p);
5997
5998         update_cfs_rq_h_load(cfs_rq);
5999         return div64_ul(p->se.avg.load_avg * cfs_rq->h_load,
6000                         cfs_rq_load_avg(cfs_rq) + 1);
6001 }
6002 #else
6003 static inline void update_blocked_averages(int cpu)
6004 {
6005         struct rq *rq = cpu_rq(cpu);
6006         struct cfs_rq *cfs_rq = &rq->cfs;
6007         unsigned long flags;
6008
6009         raw_spin_lock_irqsave(&rq->lock, flags);
6010         update_rq_clock(rq);
6011         update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq);
6012         raw_spin_unlock_irqrestore(&rq->lock, flags);
6013 }
6014
6015 static unsigned long task_h_load(struct task_struct *p)
6016 {
6017         return p->se.avg.load_avg;
6018 }
6019 #endif
6020
6021 /********** Helpers for find_busiest_group ************************/
6022
6023 enum group_type {
6024         group_other = 0,
6025         group_imbalanced,
6026         group_overloaded,
6027 };
6028
6029 /*
6030  * sg_lb_stats - stats of a sched_group required for load_balancing
6031  */
6032 struct sg_lb_stats {
6033         unsigned long avg_load; /*Avg load across the CPUs of the group */
6034         unsigned long group_load; /* Total load over the CPUs of the group */
6035         unsigned long sum_weighted_load; /* Weighted load of group's tasks */
6036         unsigned long load_per_task;
6037         unsigned long group_capacity;
6038         unsigned long group_util; /* Total utilization of the group */
6039         unsigned int sum_nr_running; /* Nr tasks running in the group */
6040         unsigned int idle_cpus;
6041         unsigned int group_weight;
6042         enum group_type group_type;
6043         int group_no_capacity;
6044 #ifdef CONFIG_NUMA_BALANCING
6045         unsigned int nr_numa_running;
6046         unsigned int nr_preferred_running;
6047 #endif
6048 };
6049
6050 /*
6051  * sd_lb_stats - Structure to store the statistics of a sched_domain
6052  *               during load balancing.
6053  */
6054 struct sd_lb_stats {
6055         struct sched_group *busiest;    /* Busiest group in this sd */
6056         struct sched_group *local;      /* Local group in this sd */
6057         unsigned long total_load;       /* Total load of all groups in sd */
6058         unsigned long total_capacity;   /* Total capacity of all groups in sd */
6059         unsigned long avg_load; /* Average load across all groups in sd */
6060
6061         struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */
6062         struct sg_lb_stats local_stat;  /* Statistics of the local group */
6063 };
6064
6065 static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
6066 {
6067         /*
6068          * Skimp on the clearing to avoid duplicate work. We can avoid clearing
6069          * local_stat because update_sg_lb_stats() does a full clear/assignment.
6070          * We must however clear busiest_stat::avg_load because
6071          * update_sd_pick_busiest() reads this before assignment.
6072          */
6073         *sds = (struct sd_lb_stats){
6074                 .busiest = NULL,
6075                 .local = NULL,
6076                 .total_load = 0UL,
6077                 .total_capacity = 0UL,
6078                 .busiest_stat = {
6079                         .avg_load = 0UL,
6080                         .sum_nr_running = 0,
6081                         .group_type = group_other,
6082                 },
6083         };
6084 }
6085
6086 /**
6087  * get_sd_load_idx - Obtain the load index for a given sched domain.
6088  * @sd: The sched_domain whose load_idx is to be obtained.
6089  * @idle: The idle status of the CPU for whose sd load_idx is obtained.
6090  *
6091  * Return: The load index.
6092  */
6093 static inline int get_sd_load_idx(struct sched_domain *sd,
6094                                         enum cpu_idle_type idle)
6095 {
6096         int load_idx;
6097
6098         switch (idle) {
6099         case CPU_NOT_IDLE:
6100                 load_idx = sd->busy_idx;
6101                 break;
6102
6103         case CPU_NEWLY_IDLE:
6104                 load_idx = sd->newidle_idx;
6105                 break;
6106         default:
6107                 load_idx = sd->idle_idx;
6108                 break;
6109         }
6110
6111         return load_idx;
6112 }
6113
6114 static unsigned long scale_rt_capacity(int cpu)
6115 {
6116         struct rq *rq = cpu_rq(cpu);
6117         u64 total, used, age_stamp, avg;
6118         s64 delta;
6119
6120         /*
6121          * Since we're reading these variables without serialization make sure
6122          * we read them once before doing sanity checks on them.
6123          */
6124         age_stamp = READ_ONCE(rq->age_stamp);
6125         avg = READ_ONCE(rq->rt_avg);
6126         delta = __rq_clock_broken(rq) - age_stamp;
6127
6128         if (unlikely(delta < 0))
6129                 delta = 0;
6130
6131         total = sched_avg_period() + delta;
6132
6133         used = div_u64(avg, total);
6134
6135         if (likely(used < SCHED_CAPACITY_SCALE))
6136                 return SCHED_CAPACITY_SCALE - used;
6137
6138         return 1;
6139 }
6140
6141 static void update_cpu_capacity(struct sched_domain *sd, int cpu)
6142 {
6143         unsigned long capacity = arch_scale_cpu_capacity(sd, cpu);
6144         struct sched_group *sdg = sd->groups;
6145
6146         cpu_rq(cpu)->cpu_capacity_orig = capacity;
6147
6148         capacity *= scale_rt_capacity(cpu);
6149         capacity >>= SCHED_CAPACITY_SHIFT;
6150
6151         if (!capacity)
6152                 capacity = 1;
6153
6154         cpu_rq(cpu)->cpu_capacity = capacity;
6155         sdg->sgc->capacity = capacity;
6156 }
6157
6158 void update_group_capacity(struct sched_domain *sd, int cpu)
6159 {
6160         struct sched_domain *child = sd->child;
6161         struct sched_group *group, *sdg = sd->groups;
6162         unsigned long capacity;
6163         unsigned long interval;
6164
6165         interval = msecs_to_jiffies(sd->balance_interval);
6166         interval = clamp(interval, 1UL, max_load_balance_interval);
6167         sdg->sgc->next_update = jiffies + interval;
6168
6169         if (!child) {
6170                 update_cpu_capacity(sd, cpu);
6171                 return;
6172         }
6173
6174         capacity = 0;
6175
6176         if (child->flags & SD_OVERLAP) {
6177                 /*
6178                  * SD_OVERLAP domains cannot assume that child groups
6179                  * span the current group.
6180                  */
6181
6182                 for_each_cpu(cpu, sched_group_cpus(sdg)) {
6183                         struct sched_group_capacity *sgc;
6184                         struct rq *rq = cpu_rq(cpu);
6185
6186                         /*
6187                          * build_sched_domains() -> init_sched_groups_capacity()
6188                          * gets here before we've attached the domains to the
6189                          * runqueues.
6190                          *
6191                          * Use capacity_of(), which is set irrespective of domains
6192                          * in update_cpu_capacity().
6193                          *
6194                          * This avoids capacity from being 0 and
6195                          * causing divide-by-zero issues on boot.
6196                          */
6197                         if (unlikely(!rq->sd)) {
6198                                 capacity += capacity_of(cpu);
6199                                 continue;
6200                         }
6201
6202                         sgc = rq->sd->groups->sgc;
6203                         capacity += sgc->capacity;
6204                 }
6205         } else  {
6206                 /*
6207                  * !SD_OVERLAP domains can assume that child groups
6208                  * span the current group.
6209                  */
6210
6211                 group = child->groups;
6212                 do {
6213                         capacity += group->sgc->capacity;
6214                         group = group->next;
6215                 } while (group != child->groups);
6216         }
6217
6218         sdg->sgc->capacity = capacity;
6219 }
6220
6221 /*
6222  * Check whether the capacity of the rq has been noticeably reduced by side
6223  * activity. The imbalance_pct is used for the threshold.
6224  * Return true is the capacity is reduced
6225  */
6226 static inline int
6227 check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
6228 {
6229         return ((rq->cpu_capacity * sd->imbalance_pct) <
6230                                 (rq->cpu_capacity_orig * 100));
6231 }
6232
6233 /*
6234  * Group imbalance indicates (and tries to solve) the problem where balancing
6235  * groups is inadequate due to tsk_cpus_allowed() constraints.
6236  *
6237  * Imagine a situation of two groups of 4 cpus each and 4 tasks each with a
6238  * cpumask covering 1 cpu of the first group and 3 cpus of the second group.
6239  * Something like:
6240  *
6241  *      { 0 1 2 3 } { 4 5 6 7 }
6242  *              *     * * *
6243  *
6244  * If we were to balance group-wise we'd place two tasks in the first group and
6245  * two tasks in the second group. Clearly this is undesired as it will overload
6246  * cpu 3 and leave one of the cpus in the second group unused.
6247  *
6248  * The current solution to this issue is detecting the skew in the first group
6249  * by noticing the lower domain failed to reach balance and had difficulty
6250  * moving tasks due to affinity constraints.
6251  *
6252  * When this is so detected; this group becomes a candidate for busiest; see
6253  * update_sd_pick_busiest(). And calculate_imbalance() and
6254  * find_busiest_group() avoid some of the usual balance conditions to allow it
6255  * to create an effective group imbalance.
6256  *
6257  * This is a somewhat tricky proposition since the next run might not find the
6258  * group imbalance and decide the groups need to be balanced again. A most
6259  * subtle and fragile situation.
6260  */
6261
6262 static inline int sg_imbalanced(struct sched_group *group)
6263 {
6264         return group->sgc->imbalance;
6265 }
6266
6267 /*
6268  * group_has_capacity returns true if the group has spare capacity that could
6269  * be used by some tasks.
6270  * We consider that a group has spare capacity if the  * number of task is
6271  * smaller than the number of CPUs or if the utilization is lower than the
6272  * available capacity for CFS tasks.
6273  * For the latter, we use a threshold to stabilize the state, to take into
6274  * account the variance of the tasks' load and to return true if the available
6275  * capacity in meaningful for the load balancer.
6276  * As an example, an available capacity of 1% can appear but it doesn't make
6277  * any benefit for the load balance.
6278  */
6279 static inline bool
6280 group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs)
6281 {
6282         if (sgs->sum_nr_running < sgs->group_weight)
6283                 return true;
6284
6285         if ((sgs->group_capacity * 100) >
6286                         (sgs->group_util * env->sd->imbalance_pct))
6287                 return true;
6288
6289         return false;
6290 }
6291
6292 /*
6293  *  group_is_overloaded returns true if the group has more tasks than it can
6294  *  handle.
6295  *  group_is_overloaded is not equals to !group_has_capacity because a group
6296  *  with the exact right number of tasks, has no more spare capacity but is not
6297  *  overloaded so both group_has_capacity and group_is_overloaded return
6298  *  false.
6299  */
6300 static inline bool
6301 group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs)
6302 {
6303         if (sgs->sum_nr_running <= sgs->group_weight)
6304                 return false;
6305
6306         if ((sgs->group_capacity * 100) <
6307                         (sgs->group_util * env->sd->imbalance_pct))
6308                 return true;
6309
6310         return false;
6311 }
6312
6313 static inline enum
6314 group_type group_classify(struct sched_group *group,
6315                           struct sg_lb_stats *sgs)
6316 {
6317         if (sgs->group_no_capacity)
6318                 return group_overloaded;
6319
6320         if (sg_imbalanced(group))
6321                 return group_imbalanced;
6322
6323         return group_other;
6324 }
6325
6326 /**
6327  * update_sg_lb_stats - Update sched_group's statistics for load balancing.
6328  * @env: The load balancing environment.
6329  * @group: sched_group whose statistics are to be updated.
6330  * @load_idx: Load index of sched_domain of this_cpu for load calc.
6331  * @local_group: Does group contain this_cpu.
6332  * @sgs: variable to hold the statistics for this group.
6333  * @overload: Indicate more than one runnable task for any CPU.
6334  */
6335 static inline void update_sg_lb_stats(struct lb_env *env,
6336                         struct sched_group *group, int load_idx,
6337                         int local_group, struct sg_lb_stats *sgs,
6338                         bool *overload)
6339 {
6340         unsigned long load;
6341         int i;
6342
6343         memset(sgs, 0, sizeof(*sgs));
6344
6345         for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
6346                 struct rq *rq = cpu_rq(i);
6347
6348                 /* Bias balancing toward cpus of our domain */
6349                 if (local_group)
6350                         load = target_load(i, load_idx);
6351                 else
6352                         load = source_load(i, load_idx);
6353
6354                 sgs->group_load += load;
6355                 sgs->group_util += cpu_util(i);
6356                 sgs->sum_nr_running += rq->cfs.h_nr_running;
6357
6358                 if (rq->nr_running > 1)
6359                         *overload = true;
6360
6361 #ifdef CONFIG_NUMA_BALANCING
6362                 sgs->nr_numa_running += rq->nr_numa_running;
6363                 sgs->nr_preferred_running += rq->nr_preferred_running;
6364 #endif
6365                 sgs->sum_weighted_load += weighted_cpuload(i);
6366                 if (idle_cpu(i))
6367                         sgs->idle_cpus++;
6368         }
6369
6370         /* Adjust by relative CPU capacity of the group */
6371         sgs->group_capacity = group->sgc->capacity;
6372         sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity;
6373
6374         if (sgs->sum_nr_running)
6375                 sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
6376
6377         sgs->group_weight = group->group_weight;
6378
6379         sgs->group_no_capacity = group_is_overloaded(env, sgs);
6380         sgs->group_type = group_classify(group, sgs);
6381 }
6382
6383 /**
6384  * update_sd_pick_busiest - return 1 on busiest group
6385  * @env: The load balancing environment.
6386  * @sds: sched_domain statistics
6387  * @sg: sched_group candidate to be checked for being the busiest
6388  * @sgs: sched_group statistics
6389  *
6390  * Determine if @sg is a busier group than the previously selected
6391  * busiest group.
6392  *
6393  * Return: %true if @sg is a busier group than the previously selected
6394  * busiest group. %false otherwise.
6395  */
6396 static bool update_sd_pick_busiest(struct lb_env *env,
6397                                    struct sd_lb_stats *sds,
6398                                    struct sched_group *sg,
6399                                    struct sg_lb_stats *sgs)
6400 {
6401         struct sg_lb_stats *busiest = &sds->busiest_stat;
6402
6403         if (sgs->group_type > busiest->group_type)
6404                 return true;
6405
6406         if (sgs->group_type < busiest->group_type)
6407                 return false;
6408
6409         if (sgs->avg_load <= busiest->avg_load)
6410                 return false;
6411
6412         /* This is the busiest node in its class. */
6413         if (!(env->sd->flags & SD_ASYM_PACKING))
6414                 return true;
6415
6416         /*
6417          * ASYM_PACKING needs to move all the work to the lowest
6418          * numbered CPUs in the group, therefore mark all groups
6419          * higher than ourself as busy.
6420          */
6421         if (sgs->sum_nr_running && env->dst_cpu < group_first_cpu(sg)) {
6422                 if (!sds->busiest)
6423                         return true;
6424
6425                 if (group_first_cpu(sds->busiest) > group_first_cpu(sg))
6426                         return true;
6427         }
6428
6429         return false;
6430 }
6431
6432 #ifdef CONFIG_NUMA_BALANCING
6433 static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
6434 {
6435         if (sgs->sum_nr_running > sgs->nr_numa_running)
6436                 return regular;
6437         if (sgs->sum_nr_running > sgs->nr_preferred_running)
6438                 return remote;
6439         return all;
6440 }
6441
6442 static inline enum fbq_type fbq_classify_rq(struct rq *rq)
6443 {
6444         if (rq->nr_running > rq->nr_numa_running)
6445                 return regular;
6446         if (rq->nr_running > rq->nr_preferred_running)
6447                 return remote;
6448         return all;
6449 }
6450 #else
6451 static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
6452 {
6453         return all;
6454 }
6455
6456 static inline enum fbq_type fbq_classify_rq(struct rq *rq)
6457 {
6458         return regular;
6459 }
6460 #endif /* CONFIG_NUMA_BALANCING */
6461
6462 /**
6463  * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
6464  * @env: The load balancing environment.
6465  * @sds: variable to hold the statistics for this sched_domain.
6466  */
6467 static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
6468 {
6469         struct sched_domain *child = env->sd->child;
6470         struct sched_group *sg = env->sd->groups;
6471         struct sg_lb_stats tmp_sgs;
6472         int load_idx, prefer_sibling = 0;
6473         bool overload = false;
6474
6475         if (child && child->flags & SD_PREFER_SIBLING)
6476                 prefer_sibling = 1;
6477
6478         load_idx = get_sd_load_idx(env->sd, env->idle);
6479
6480         do {
6481                 struct sg_lb_stats *sgs = &tmp_sgs;
6482                 int local_group;
6483
6484                 local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg));
6485                 if (local_group) {
6486                         sds->local = sg;
6487                         sgs = &sds->local_stat;
6488
6489                         if (env->idle != CPU_NEWLY_IDLE ||
6490                             time_after_eq(jiffies, sg->sgc->next_update))
6491                                 update_group_capacity(env->sd, env->dst_cpu);
6492                 }
6493
6494                 update_sg_lb_stats(env, sg, load_idx, local_group, sgs,
6495                                                 &overload);
6496
6497                 if (local_group)
6498                         goto next_group;
6499
6500                 /*
6501                  * In case the child domain prefers tasks go to siblings
6502                  * first, lower the sg capacity so that we'll try
6503                  * and move all the excess tasks away. We lower the capacity
6504                  * of a group only if the local group has the capacity to fit
6505                  * these excess tasks. The extra check prevents the case where
6506                  * you always pull from the heaviest group when it is already
6507                  * under-utilized (possible with a large weight task outweighs
6508                  * the tasks on the system).
6509                  */
6510                 if (prefer_sibling && sds->local &&
6511                     group_has_capacity(env, &sds->local_stat) &&
6512                     (sgs->sum_nr_running > 1)) {
6513                         sgs->group_no_capacity = 1;
6514                         sgs->group_type = group_classify(sg, sgs);
6515                 }
6516
6517                 if (update_sd_pick_busiest(env, sds, sg, sgs)) {
6518                         sds->busiest = sg;
6519                         sds->busiest_stat = *sgs;
6520                 }
6521
6522 next_group:
6523                 /* Now, start updating sd_lb_stats */
6524                 sds->total_load += sgs->group_load;
6525                 sds->total_capacity += sgs->group_capacity;
6526
6527                 sg = sg->next;
6528         } while (sg != env->sd->groups);
6529
6530         if (env->sd->flags & SD_NUMA)
6531                 env->fbq_type = fbq_classify_group(&sds->busiest_stat);
6532
6533         if (!env->sd->parent) {
6534                 /* update overload indicator if we are at root domain */
6535                 if (env->dst_rq->rd->overload != overload)
6536                         env->dst_rq->rd->overload = overload;
6537         }
6538
6539 }
6540
6541 /**
6542  * check_asym_packing - Check to see if the group is packed into the
6543  *                      sched doman.
6544  *
6545  * This is primarily intended to used at the sibling level.  Some
6546  * cores like POWER7 prefer to use lower numbered SMT threads.  In the
6547  * case of POWER7, it can move to lower SMT modes only when higher
6548  * threads are idle.  When in lower SMT modes, the threads will
6549  * perform better since they share less core resources.  Hence when we
6550  * have idle threads, we want them to be the higher ones.
6551  *
6552  * This packing function is run on idle threads.  It checks to see if
6553  * the busiest CPU in this domain (core in the P7 case) has a higher
6554  * CPU number than the packing function is being run on.  Here we are
6555  * assuming lower CPU number will be equivalent to lower a SMT thread
6556  * number.
6557  *
6558  * Return: 1 when packing is required and a task should be moved to
6559  * this CPU.  The amount of the imbalance is returned in *imbalance.
6560  *
6561  * @env: The load balancing environment.
6562  * @sds: Statistics of the sched_domain which is to be packed
6563  */
6564 static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
6565 {
6566         int busiest_cpu;
6567
6568         if (!(env->sd->flags & SD_ASYM_PACKING))
6569                 return 0;
6570
6571         if (!sds->busiest)
6572                 return 0;
6573
6574         busiest_cpu = group_first_cpu(sds->busiest);
6575         if (env->dst_cpu > busiest_cpu)
6576                 return 0;
6577
6578         env->imbalance = DIV_ROUND_CLOSEST(
6579                 sds->busiest_stat.avg_load * sds->busiest_stat.group_capacity,
6580                 SCHED_CAPACITY_SCALE);
6581
6582         return 1;
6583 }
6584
6585 /**
6586  * fix_small_imbalance - Calculate the minor imbalance that exists
6587  *                      amongst the groups of a sched_domain, during
6588  *                      load balancing.
6589  * @env: The load balancing environment.
6590  * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
6591  */
6592 static inline
6593 void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
6594 {
6595         unsigned long tmp, capa_now = 0, capa_move = 0;
6596         unsigned int imbn = 2;
6597         unsigned long scaled_busy_load_per_task;
6598         struct sg_lb_stats *local, *busiest;
6599
6600         local = &sds->local_stat;
6601         busiest = &sds->busiest_stat;
6602
6603         if (!local->sum_nr_running)
6604                 local->load_per_task = cpu_avg_load_per_task(env->dst_cpu);
6605         else if (busiest->load_per_task > local->load_per_task)
6606                 imbn = 1;
6607
6608         scaled_busy_load_per_task =
6609                 (busiest->load_per_task * SCHED_CAPACITY_SCALE) /
6610                 busiest->group_capacity;
6611
6612         if (busiest->avg_load + scaled_busy_load_per_task >=
6613             local->avg_load + (scaled_busy_load_per_task * imbn)) {
6614                 env->imbalance = busiest->load_per_task;
6615                 return;
6616         }
6617
6618         /*
6619          * OK, we don't have enough imbalance to justify moving tasks,
6620          * however we may be able to increase total CPU capacity used by
6621          * moving them.
6622          */
6623
6624         capa_now += busiest->group_capacity *
6625                         min(busiest->load_per_task, busiest->avg_load);
6626         capa_now += local->group_capacity *
6627                         min(local->load_per_task, local->avg_load);
6628         capa_now /= SCHED_CAPACITY_SCALE;
6629
6630         /* Amount of load we'd subtract */
6631         if (busiest->avg_load > scaled_busy_load_per_task) {
6632                 capa_move += busiest->group_capacity *
6633                             min(busiest->load_per_task,
6634                                 busiest->avg_load - scaled_busy_load_per_task);
6635         }
6636
6637         /* Amount of load we'd add */
6638         if (busiest->avg_load * busiest->group_capacity <
6639             busiest->load_per_task * SCHED_CAPACITY_SCALE) {
6640                 tmp = (busiest->avg_load * busiest->group_capacity) /
6641                       local->group_capacity;
6642         } else {
6643                 tmp = (busiest->load_per_task * SCHED_CAPACITY_SCALE) /
6644                       local->group_capacity;
6645         }
6646         capa_move += local->group_capacity *
6647                     min(local->load_per_task, local->avg_load + tmp);
6648         capa_move /= SCHED_CAPACITY_SCALE;
6649
6650         /* Move if we gain throughput */
6651         if (capa_move > capa_now)
6652                 env->imbalance = busiest->load_per_task;
6653 }
6654
6655 /**
6656  * calculate_imbalance - Calculate the amount of imbalance present within the
6657  *                       groups of a given sched_domain during load balance.
6658  * @env: load balance environment
6659  * @sds: statistics of the sched_domain whose imbalance is to be calculated.
6660  */
6661 static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
6662 {
6663         unsigned long max_pull, load_above_capacity = ~0UL;
6664         struct sg_lb_stats *local, *busiest;
6665
6666         local = &sds->local_stat;
6667         busiest = &sds->busiest_stat;
6668
6669         if (busiest->group_type == group_imbalanced) {
6670                 /*
6671                  * In the group_imb case we cannot rely on group-wide averages
6672                  * to ensure cpu-load equilibrium, look at wider averages. XXX
6673                  */
6674                 busiest->load_per_task =
6675                         min(busiest->load_per_task, sds->avg_load);
6676         }
6677
6678         /*
6679          * In the presence of smp nice balancing, certain scenarios can have
6680          * max load less than avg load(as we skip the groups at or below
6681          * its cpu_capacity, while calculating max_load..)
6682          */
6683         if (busiest->avg_load <= sds->avg_load ||
6684             local->avg_load >= sds->avg_load) {
6685                 env->imbalance = 0;
6686                 return fix_small_imbalance(env, sds);
6687         }
6688
6689         /*
6690          * If there aren't any idle cpus, avoid creating some.
6691          */
6692         if (busiest->group_type == group_overloaded &&
6693             local->group_type   == group_overloaded) {
6694                 load_above_capacity = busiest->sum_nr_running *
6695                                         SCHED_LOAD_SCALE;
6696                 if (load_above_capacity > busiest->group_capacity)
6697                         load_above_capacity -= busiest->group_capacity;
6698                 else
6699                         load_above_capacity = ~0UL;
6700         }
6701
6702         /*
6703          * We're trying to get all the cpus to the average_load, so we don't
6704          * want to push ourselves above the average load, nor do we wish to
6705          * reduce the max loaded cpu below the average load. At the same time,
6706          * we also don't want to reduce the group load below the group capacity
6707          * (so that we can implement power-savings policies etc). Thus we look
6708          * for the minimum possible imbalance.
6709          */
6710         max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity);
6711
6712         /* How much load to actually move to equalise the imbalance */
6713         env->imbalance = min(
6714                 max_pull * busiest->group_capacity,
6715                 (sds->avg_load - local->avg_load) * local->group_capacity
6716         ) / SCHED_CAPACITY_SCALE;
6717
6718         /*
6719          * if *imbalance is less than the average load per runnable task
6720          * there is no guarantee that any tasks will be moved so we'll have
6721          * a think about bumping its value to force at least one task to be
6722          * moved
6723          */
6724         if (env->imbalance < busiest->load_per_task)
6725                 return fix_small_imbalance(env, sds);
6726 }
6727
6728 /******* find_busiest_group() helpers end here *********************/
6729
6730 /**
6731  * find_busiest_group - Returns the busiest group within the sched_domain
6732  * if there is an imbalance. If there isn't an imbalance, and
6733  * the user has opted for power-savings, it returns a group whose
6734  * CPUs can be put to idle by rebalancing those tasks elsewhere, if
6735  * such a group exists.
6736  *
6737  * Also calculates the amount of weighted load which should be moved
6738  * to restore balance.
6739  *
6740  * @env: The load balancing environment.
6741  *
6742  * Return:      - The busiest group if imbalance exists.
6743  *              - If no imbalance and user has opted for power-savings balance,
6744  *                 return the least loaded group whose CPUs can be
6745  *                 put to idle by rebalancing its tasks onto our group.
6746  */
6747 static struct sched_group *find_busiest_group(struct lb_env *env)
6748 {
6749         struct sg_lb_stats *local, *busiest;
6750         struct sd_lb_stats sds;
6751
6752         init_sd_lb_stats(&sds);
6753
6754         /*
6755          * Compute the various statistics relavent for load balancing at
6756          * this level.
6757          */
6758         update_sd_lb_stats(env, &sds);
6759         local = &sds.local_stat;
6760         busiest = &sds.busiest_stat;
6761
6762         /* ASYM feature bypasses nice load balance check */
6763         if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) &&
6764             check_asym_packing(env, &sds))
6765                 return sds.busiest;
6766
6767         /* There is no busy sibling group to pull tasks from */
6768         if (!sds.busiest || busiest->sum_nr_running == 0)
6769                 goto out_balanced;
6770
6771         sds.avg_load = (SCHED_CAPACITY_SCALE * sds.total_load)
6772                                                 / sds.total_capacity;
6773
6774         /*
6775          * If the busiest group is imbalanced the below checks don't
6776          * work because they assume all things are equal, which typically
6777          * isn't true due to cpus_allowed constraints and the like.
6778          */
6779         if (busiest->group_type == group_imbalanced)
6780                 goto force_balance;
6781
6782         /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
6783         if (env->idle == CPU_NEWLY_IDLE && group_has_capacity(env, local) &&
6784             busiest->group_no_capacity)
6785                 goto force_balance;
6786
6787         /*
6788          * If the local group is busier than the selected busiest group
6789          * don't try and pull any tasks.
6790          */
6791         if (local->avg_load >= busiest->avg_load)
6792                 goto out_balanced;
6793
6794         /*
6795          * Don't pull any tasks if this group is already above the domain
6796          * average load.
6797          */
6798         if (local->avg_load >= sds.avg_load)
6799                 goto out_balanced;
6800
6801         if (env->idle == CPU_IDLE) {
6802                 /*
6803                  * This cpu is idle. If the busiest group is not overloaded
6804                  * and there is no imbalance between this and busiest group
6805                  * wrt idle cpus, it is balanced. The imbalance becomes
6806                  * significant if the diff is greater than 1 otherwise we
6807                  * might end up to just move the imbalance on another group
6808                  */
6809                 if ((busiest->group_type != group_overloaded) &&
6810                                 (local->idle_cpus <= (busiest->idle_cpus + 1)))
6811                         goto out_balanced;
6812         } else {
6813                 /*
6814                  * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
6815                  * imbalance_pct to be conservative.
6816                  */
6817                 if (100 * busiest->avg_load <=
6818                                 env->sd->imbalance_pct * local->avg_load)
6819                         goto out_balanced;
6820         }
6821
6822 force_balance:
6823         /* Looks like there is an imbalance. Compute it */
6824         calculate_imbalance(env, &sds);
6825         return sds.busiest;
6826
6827 out_balanced:
6828         env->imbalance = 0;
6829         return NULL;
6830 }
6831
6832 /*
6833  * find_busiest_queue - find the busiest runqueue among the cpus in group.
6834  */
6835 static struct rq *find_busiest_queue(struct lb_env *env,
6836                                      struct sched_group *group)
6837 {
6838         struct rq *busiest = NULL, *rq;
6839         unsigned long busiest_load = 0, busiest_capacity = 1;
6840         int i;
6841
6842         for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
6843                 unsigned long capacity, wl;
6844                 enum fbq_type rt;
6845
6846                 rq = cpu_rq(i);
6847                 rt = fbq_classify_rq(rq);
6848
6849                 /*
6850                  * We classify groups/runqueues into three groups:
6851                  *  - regular: there are !numa tasks
6852                  *  - remote:  there are numa tasks that run on the 'wrong' node
6853                  *  - all:     there is no distinction
6854                  *
6855                  * In order to avoid migrating ideally placed numa tasks,
6856                  * ignore those when there's better options.
6857                  *
6858                  * If we ignore the actual busiest queue to migrate another
6859                  * task, the next balance pass can still reduce the busiest
6860                  * queue by moving tasks around inside the node.
6861                  *
6862                  * If we cannot move enough load due to this classification
6863                  * the next pass will adjust the group classification and
6864                  * allow migration of more tasks.
6865                  *
6866                  * Both cases only affect the total convergence complexity.
6867                  */
6868                 if (rt > env->fbq_type)
6869                         continue;
6870
6871                 capacity = capacity_of(i);
6872
6873                 wl = weighted_cpuload(i);
6874
6875                 /*
6876                  * When comparing with imbalance, use weighted_cpuload()
6877                  * which is not scaled with the cpu capacity.
6878                  */
6879
6880                 if (rq->nr_running == 1 && wl > env->imbalance &&
6881                     !check_cpu_capacity(rq, env->sd))
6882                         continue;
6883
6884                 /*
6885                  * For the load comparisons with the other cpu's, consider
6886                  * the weighted_cpuload() scaled with the cpu capacity, so
6887                  * that the load can be moved away from the cpu that is
6888                  * potentially running at a lower capacity.
6889                  *
6890                  * Thus we're looking for max(wl_i / capacity_i), crosswise
6891                  * multiplication to rid ourselves of the division works out
6892                  * to: wl_i * capacity_j > wl_j * capacity_i;  where j is
6893                  * our previous maximum.
6894                  */
6895                 if (wl * busiest_capacity > busiest_load * capacity) {
6896                         busiest_load = wl;
6897                         busiest_capacity = capacity;
6898                         busiest = rq;
6899                 }
6900         }
6901
6902         return busiest;
6903 }
6904
6905 /*
6906  * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
6907  * so long as it is large enough.
6908  */
6909 #define MAX_PINNED_INTERVAL     512
6910
6911 /* Working cpumask for load_balance and load_balance_newidle. */
6912 DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
6913
6914 static int need_active_balance(struct lb_env *env)
6915 {
6916         struct sched_domain *sd = env->sd;
6917
6918         if (env->idle == CPU_NEWLY_IDLE) {
6919
6920                 /*
6921                  * ASYM_PACKING needs to force migrate tasks from busy but
6922                  * higher numbered CPUs in order to pack all tasks in the
6923                  * lowest numbered CPUs.
6924                  */
6925                 if ((sd->flags & SD_ASYM_PACKING) && env->src_cpu > env->dst_cpu)
6926                         return 1;
6927         }
6928
6929         /*
6930          * The dst_cpu is idle and the src_cpu CPU has only 1 CFS task.
6931          * It's worth migrating the task if the src_cpu's capacity is reduced
6932          * because of other sched_class or IRQs if more capacity stays
6933          * available on dst_cpu.
6934          */
6935         if ((env->idle != CPU_NOT_IDLE) &&
6936             (env->src_rq->cfs.h_nr_running == 1)) {
6937                 if ((check_cpu_capacity(env->src_rq, sd)) &&
6938                     (capacity_of(env->src_cpu)*sd->imbalance_pct < capacity_of(env->dst_cpu)*100))
6939                         return 1;
6940         }
6941
6942         return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
6943 }
6944
6945 static int active_load_balance_cpu_stop(void *data);
6946
6947 static int should_we_balance(struct lb_env *env)
6948 {
6949         struct sched_group *sg = env->sd->groups;
6950         struct cpumask *sg_cpus, *sg_mask;
6951         int cpu, balance_cpu = -1;
6952
6953         /*
6954          * In the newly idle case, we will allow all the cpu's
6955          * to do the newly idle load balance.
6956          */
6957         if (env->idle == CPU_NEWLY_IDLE)
6958                 return 1;
6959
6960         sg_cpus = sched_group_cpus(sg);
6961         sg_mask = sched_group_mask(sg);
6962         /* Try to find first idle cpu */
6963         for_each_cpu_and(cpu, sg_cpus, env->cpus) {
6964                 if (!cpumask_test_cpu(cpu, sg_mask) || !idle_cpu(cpu))
6965                         continue;
6966
6967                 balance_cpu = cpu;
6968                 break;
6969         }
6970
6971         if (balance_cpu == -1)
6972                 balance_cpu = group_balance_cpu(sg);
6973
6974         /*
6975          * First idle cpu or the first cpu(busiest) in this sched group
6976          * is eligible for doing load balancing at this and above domains.
6977          */
6978         return balance_cpu == env->dst_cpu;
6979 }
6980
6981 /*
6982  * Check this_cpu to ensure it is balanced within domain. Attempt to move
6983  * tasks if there is an imbalance.
6984  */
6985 static int load_balance(int this_cpu, struct rq *this_rq,
6986                         struct sched_domain *sd, enum cpu_idle_type idle,
6987                         int *continue_balancing)
6988 {
6989         int ld_moved, cur_ld_moved, active_balance = 0;
6990         struct sched_domain *sd_parent = sd->parent;
6991         struct sched_group *group;
6992         struct rq *busiest;
6993         unsigned long flags;
6994         struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask);
6995
6996         struct lb_env env = {
6997                 .sd             = sd,
6998                 .dst_cpu        = this_cpu,
6999                 .dst_rq         = this_rq,
7000                 .dst_grpmask    = sched_group_cpus(sd->groups),
7001                 .idle           = idle,
7002                 .loop_break     = sched_nr_migrate_break,
7003                 .cpus           = cpus,
7004                 .fbq_type       = all,
7005                 .tasks          = LIST_HEAD_INIT(env.tasks),
7006         };
7007
7008         /*
7009          * For NEWLY_IDLE load_balancing, we don't need to consider
7010          * other cpus in our group
7011          */
7012         if (idle == CPU_NEWLY_IDLE)
7013                 env.dst_grpmask = NULL;
7014
7015         cpumask_copy(cpus, cpu_active_mask);
7016
7017         schedstat_inc(sd, lb_count[idle]);
7018
7019 redo:
7020         if (!should_we_balance(&env)) {
7021                 *continue_balancing = 0;
7022                 goto out_balanced;
7023         }
7024
7025         group = find_busiest_group(&env);
7026         if (!group) {
7027                 schedstat_inc(sd, lb_nobusyg[idle]);
7028                 goto out_balanced;
7029         }
7030
7031         busiest = find_busiest_queue(&env, group);
7032         if (!busiest) {
7033                 schedstat_inc(sd, lb_nobusyq[idle]);
7034                 goto out_balanced;
7035         }
7036
7037         BUG_ON(busiest == env.dst_rq);
7038
7039         schedstat_add(sd, lb_imbalance[idle], env.imbalance);
7040
7041         env.src_cpu = busiest->cpu;
7042         env.src_rq = busiest;
7043
7044         ld_moved = 0;
7045         if (busiest->nr_running > 1) {
7046                 /*
7047                  * Attempt to move tasks. If find_busiest_group has found
7048                  * an imbalance but busiest->nr_running <= 1, the group is
7049                  * still unbalanced. ld_moved simply stays zero, so it is
7050                  * correctly treated as an imbalance.
7051                  */
7052                 env.flags |= LBF_ALL_PINNED;
7053                 env.loop_max  = min(sysctl_sched_nr_migrate, busiest->nr_running);
7054
7055 more_balance:
7056                 raw_spin_lock_irqsave(&busiest->lock, flags);
7057
7058                 /*
7059                  * cur_ld_moved - load moved in current iteration
7060                  * ld_moved     - cumulative load moved across iterations
7061                  */
7062                 cur_ld_moved = detach_tasks(&env);
7063
7064                 /*
7065                  * We've detached some tasks from busiest_rq. Every
7066                  * task is masked "TASK_ON_RQ_MIGRATING", so we can safely
7067                  * unlock busiest->lock, and we are able to be sure
7068                  * that nobody can manipulate the tasks in parallel.
7069                  * See task_rq_lock() family for the details.
7070                  */
7071
7072                 raw_spin_unlock(&busiest->lock);
7073
7074                 if (cur_ld_moved) {
7075                         attach_tasks(&env);
7076                         ld_moved += cur_ld_moved;
7077                 }
7078
7079                 local_irq_restore(flags);
7080
7081                 if (env.flags & LBF_NEED_BREAK) {
7082                         env.flags &= ~LBF_NEED_BREAK;
7083                         goto more_balance;
7084                 }
7085
7086                 /*
7087                  * Revisit (affine) tasks on src_cpu that couldn't be moved to
7088                  * us and move them to an alternate dst_cpu in our sched_group
7089                  * where they can run. The upper limit on how many times we
7090                  * iterate on same src_cpu is dependent on number of cpus in our
7091                  * sched_group.
7092                  *
7093                  * This changes load balance semantics a bit on who can move
7094                  * load to a given_cpu. In addition to the given_cpu itself
7095                  * (or a ilb_cpu acting on its behalf where given_cpu is
7096                  * nohz-idle), we now have balance_cpu in a position to move
7097                  * load to given_cpu. In rare situations, this may cause
7098                  * conflicts (balance_cpu and given_cpu/ilb_cpu deciding
7099                  * _independently_ and at _same_ time to move some load to
7100                  * given_cpu) causing exceess load to be moved to given_cpu.
7101                  * This however should not happen so much in practice and
7102                  * moreover subsequent load balance cycles should correct the
7103                  * excess load moved.
7104                  */
7105                 if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {
7106
7107                         /* Prevent to re-select dst_cpu via env's cpus */
7108                         cpumask_clear_cpu(env.dst_cpu, env.cpus);
7109
7110                         env.dst_rq       = cpu_rq(env.new_dst_cpu);
7111                         env.dst_cpu      = env.new_dst_cpu;
7112                         env.flags       &= ~LBF_DST_PINNED;
7113                         env.loop         = 0;
7114                         env.loop_break   = sched_nr_migrate_break;
7115
7116                         /*
7117                          * Go back to "more_balance" rather than "redo" since we
7118                          * need to continue with same src_cpu.
7119                          */
7120                         goto more_balance;
7121                 }
7122
7123                 /*
7124                  * We failed to reach balance because of affinity.
7125                  */
7126                 if (sd_parent) {
7127                         int *group_imbalance = &sd_parent->groups->sgc->imbalance;
7128
7129                         if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0)
7130                                 *group_imbalance = 1;
7131                 }
7132
7133                 /* All tasks on this runqueue were pinned by CPU affinity */
7134                 if (unlikely(env.flags & LBF_ALL_PINNED)) {
7135                         cpumask_clear_cpu(cpu_of(busiest), cpus);
7136                         if (!cpumask_empty(cpus)) {
7137                                 env.loop = 0;
7138                                 env.loop_break = sched_nr_migrate_break;
7139                                 goto redo;
7140                         }
7141                         goto out_all_pinned;
7142                 }
7143         }
7144
7145         if (!ld_moved) {
7146                 schedstat_inc(sd, lb_failed[idle]);
7147                 /*
7148                  * Increment the failure counter only on periodic balance.
7149                  * We do not want newidle balance, which can be very
7150                  * frequent, pollute the failure counter causing
7151                  * excessive cache_hot migrations and active balances.
7152                  */
7153                 if (idle != CPU_NEWLY_IDLE)
7154                         sd->nr_balance_failed++;
7155
7156                 if (need_active_balance(&env)) {
7157                         raw_spin_lock_irqsave(&busiest->lock, flags);
7158
7159                         /* don't kick the active_load_balance_cpu_stop,
7160                          * if the curr task on busiest cpu can't be
7161                          * moved to this_cpu
7162                          */
7163                         if (!cpumask_test_cpu(this_cpu,
7164                                         tsk_cpus_allowed(busiest->curr))) {
7165                                 raw_spin_unlock_irqrestore(&busiest->lock,
7166                                                             flags);
7167                                 env.flags |= LBF_ALL_PINNED;
7168                                 goto out_one_pinned;
7169                         }
7170
7171                         /*
7172                          * ->active_balance synchronizes accesses to
7173                          * ->active_balance_work.  Once set, it's cleared
7174                          * only after active load balance is finished.
7175                          */
7176                         if (!busiest->active_balance) {
7177                                 busiest->active_balance = 1;
7178                                 busiest->push_cpu = this_cpu;
7179                                 active_balance = 1;
7180                         }
7181                         raw_spin_unlock_irqrestore(&busiest->lock, flags);
7182
7183                         if (active_balance) {
7184                                 stop_one_cpu_nowait(cpu_of(busiest),
7185                                         active_load_balance_cpu_stop, busiest,
7186                                         &busiest->active_balance_work);
7187                         }
7188
7189                         /*
7190                          * We've kicked active balancing, reset the failure
7191                          * counter.
7192                          */
7193                         sd->nr_balance_failed = sd->cache_nice_tries+1;
7194                 }
7195         } else
7196                 sd->nr_balance_failed = 0;
7197
7198         if (likely(!active_balance)) {
7199                 /* We were unbalanced, so reset the balancing interval */
7200                 sd->balance_interval = sd->min_interval;
7201         } else {
7202                 /*
7203                  * If we've begun active balancing, start to back off. This
7204                  * case may not be covered by the all_pinned logic if there
7205                  * is only 1 task on the busy runqueue (because we don't call
7206                  * detach_tasks).
7207                  */
7208                 if (sd->balance_interval < sd->max_interval)
7209                         sd->balance_interval *= 2;
7210         }
7211
7212         goto out;
7213
7214 out_balanced:
7215         /*
7216          * We reach balance although we may have faced some affinity
7217          * constraints. Clear the imbalance flag if it was set.
7218          */
7219         if (sd_parent) {
7220                 int *group_imbalance = &sd_parent->groups->sgc->imbalance;
7221
7222                 if (*group_imbalance)
7223                         *group_imbalance = 0;
7224         }
7225
7226 out_all_pinned:
7227         /*
7228          * We reach balance because all tasks are pinned at this level so
7229          * we can't migrate them. Let the imbalance flag set so parent level
7230          * can try to migrate them.
7231          */
7232         schedstat_inc(sd, lb_balanced[idle]);
7233
7234         sd->nr_balance_failed = 0;
7235
7236 out_one_pinned:
7237         /* tune up the balancing interval */
7238         if (((env.flags & LBF_ALL_PINNED) &&
7239                         sd->balance_interval < MAX_PINNED_INTERVAL) ||
7240                         (sd->balance_interval < sd->max_interval))
7241                 sd->balance_interval *= 2;
7242
7243         ld_moved = 0;
7244 out:
7245         return ld_moved;
7246 }
7247
7248 static inline unsigned long
7249 get_sd_balance_interval(struct sched_domain *sd, int cpu_busy)
7250 {
7251         unsigned long interval = sd->balance_interval;
7252
7253         if (cpu_busy)
7254                 interval *= sd->busy_factor;
7255
7256         /* scale ms to jiffies */
7257         interval = msecs_to_jiffies(interval);
7258         interval = clamp(interval, 1UL, max_load_balance_interval);
7259
7260         return interval;
7261 }
7262
7263 static inline void
7264 update_next_balance(struct sched_domain *sd, int cpu_busy, unsigned long *next_balance)
7265 {
7266         unsigned long interval, next;
7267
7268         interval = get_sd_balance_interval(sd, cpu_busy);
7269         next = sd->last_balance + interval;
7270
7271         if (time_after(*next_balance, next))
7272                 *next_balance = next;
7273 }
7274
7275 /*
7276  * idle_balance is called by schedule() if this_cpu is about to become
7277  * idle. Attempts to pull tasks from other CPUs.
7278  */
7279 static int idle_balance(struct rq *this_rq)
7280 {
7281         unsigned long next_balance = jiffies + HZ;
7282         int this_cpu = this_rq->cpu;
7283         struct sched_domain *sd;
7284         int pulled_task = 0;
7285         u64 curr_cost = 0;
7286
7287         idle_enter_fair(this_rq);
7288
7289         /*
7290          * We must set idle_stamp _before_ calling idle_balance(), such that we
7291          * measure the duration of idle_balance() as idle time.
7292          */
7293         this_rq->idle_stamp = rq_clock(this_rq);
7294
7295         if (this_rq->avg_idle < sysctl_sched_migration_cost ||
7296             !this_rq->rd->overload) {
7297                 rcu_read_lock();
7298                 sd = rcu_dereference_check_sched_domain(this_rq->sd);
7299                 if (sd)
7300                         update_next_balance(sd, 0, &next_balance);
7301                 rcu_read_unlock();
7302
7303                 goto out;
7304         }
7305
7306         raw_spin_unlock(&this_rq->lock);
7307
7308         update_blocked_averages(this_cpu);
7309         rcu_read_lock();
7310         for_each_domain(this_cpu, sd) {
7311                 int continue_balancing = 1;
7312                 u64 t0, domain_cost;
7313
7314                 if (!(sd->flags & SD_LOAD_BALANCE))
7315                         continue;
7316
7317                 if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) {
7318                         update_next_balance(sd, 0, &next_balance);
7319                         break;
7320                 }
7321
7322                 if (sd->flags & SD_BALANCE_NEWIDLE) {
7323                         t0 = sched_clock_cpu(this_cpu);
7324
7325                         pulled_task = load_balance(this_cpu, this_rq,
7326                                                    sd, CPU_NEWLY_IDLE,
7327                                                    &continue_balancing);
7328
7329                         domain_cost = sched_clock_cpu(this_cpu) - t0;
7330                         if (domain_cost > sd->max_newidle_lb_cost)
7331                                 sd->max_newidle_lb_cost = domain_cost;
7332
7333                         curr_cost += domain_cost;
7334                 }
7335
7336                 update_next_balance(sd, 0, &next_balance);
7337
7338                 /*
7339                  * Stop searching for tasks to pull if there are
7340                  * now runnable tasks on this rq.
7341                  */
7342                 if (pulled_task || this_rq->nr_running > 0)
7343                         break;
7344         }
7345         rcu_read_unlock();
7346
7347         raw_spin_lock(&this_rq->lock);
7348
7349         if (curr_cost > this_rq->max_idle_balance_cost)
7350                 this_rq->max_idle_balance_cost = curr_cost;
7351
7352         /*
7353          * While browsing the domains, we released the rq lock, a task could
7354          * have been enqueued in the meantime. Since we're not going idle,
7355          * pretend we pulled a task.
7356          */
7357         if (this_rq->cfs.h_nr_running && !pulled_task)
7358                 pulled_task = 1;
7359
7360 out:
7361         /* Move the next balance forward */
7362         if (time_after(this_rq->next_balance, next_balance))
7363                 this_rq->next_balance = next_balance;
7364
7365         /* Is there a task of a high priority class? */
7366         if (this_rq->nr_running != this_rq->cfs.h_nr_running)
7367                 pulled_task = -1;
7368
7369         if (pulled_task) {
7370                 idle_exit_fair(this_rq);
7371                 this_rq->idle_stamp = 0;
7372         }
7373
7374         return pulled_task;
7375 }
7376
7377 /*
7378  * active_load_balance_cpu_stop is run by cpu stopper. It pushes
7379  * running tasks off the busiest CPU onto idle CPUs. It requires at
7380  * least 1 task to be running on each physical CPU where possible, and
7381  * avoids physical / logical imbalances.
7382  */
7383 static int active_load_balance_cpu_stop(void *data)
7384 {
7385         struct rq *busiest_rq = data;
7386         int busiest_cpu = cpu_of(busiest_rq);
7387         int target_cpu = busiest_rq->push_cpu;
7388         struct rq *target_rq = cpu_rq(target_cpu);
7389         struct sched_domain *sd;
7390         struct task_struct *p = NULL;
7391
7392         raw_spin_lock_irq(&busiest_rq->lock);
7393
7394         /* make sure the requested cpu hasn't gone down in the meantime */
7395         if (unlikely(busiest_cpu != smp_processor_id() ||
7396                      !busiest_rq->active_balance))
7397                 goto out_unlock;
7398
7399         /* Is there any task to move? */
7400         if (busiest_rq->nr_running <= 1)
7401                 goto out_unlock;
7402
7403         /*
7404          * This condition is "impossible", if it occurs
7405          * we need to fix it. Originally reported by
7406          * Bjorn Helgaas on a 128-cpu setup.
7407          */
7408         BUG_ON(busiest_rq == target_rq);
7409
7410         /* Search for an sd spanning us and the target CPU. */
7411         rcu_read_lock();
7412         for_each_domain(target_cpu, sd) {
7413                 if ((sd->flags & SD_LOAD_BALANCE) &&
7414                     cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
7415                                 break;
7416         }
7417
7418         if (likely(sd)) {
7419                 struct lb_env env = {
7420                         .sd             = sd,
7421                         .dst_cpu        = target_cpu,
7422                         .dst_rq         = target_rq,
7423                         .src_cpu        = busiest_rq->cpu,
7424                         .src_rq         = busiest_rq,
7425                         .idle           = CPU_IDLE,
7426                 };
7427
7428                 schedstat_inc(sd, alb_count);
7429
7430                 p = detach_one_task(&env);
7431                 if (p)
7432                         schedstat_inc(sd, alb_pushed);
7433                 else
7434                         schedstat_inc(sd, alb_failed);
7435         }
7436         rcu_read_unlock();
7437 out_unlock:
7438         busiest_rq->active_balance = 0;
7439         raw_spin_unlock(&busiest_rq->lock);
7440
7441         if (p)
7442                 attach_one_task(target_rq, p);
7443
7444         local_irq_enable();
7445
7446         return 0;
7447 }
7448
7449 static inline int on_null_domain(struct rq *rq)
7450 {
7451         return unlikely(!rcu_dereference_sched(rq->sd));
7452 }
7453
7454 #ifdef CONFIG_NO_HZ_COMMON
7455 /*
7456  * idle load balancing details
7457  * - When one of the busy CPUs notice that there may be an idle rebalancing
7458  *   needed, they will kick the idle load balancer, which then does idle
7459  *   load balancing for all the idle CPUs.
7460  */
7461 static struct {
7462         cpumask_var_t idle_cpus_mask;
7463         atomic_t nr_cpus;
7464         unsigned long next_balance;     /* in jiffy units */
7465 } nohz ____cacheline_aligned;
7466
7467 static inline int find_new_ilb(void)
7468 {
7469         int ilb = cpumask_first(nohz.idle_cpus_mask);
7470
7471         if (ilb < nr_cpu_ids && idle_cpu(ilb))
7472                 return ilb;
7473
7474         return nr_cpu_ids;
7475 }
7476
7477 /*
7478  * Kick a CPU to do the nohz balancing, if it is time for it. We pick the
7479  * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle
7480  * CPU (if there is one).
7481  */
7482 static void nohz_balancer_kick(void)
7483 {
7484         int ilb_cpu;
7485
7486         nohz.next_balance++;
7487
7488         ilb_cpu = find_new_ilb();
7489
7490         if (ilb_cpu >= nr_cpu_ids)
7491                 return;
7492
7493         if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu)))
7494                 return;
7495         /*
7496          * Use smp_send_reschedule() instead of resched_cpu().
7497          * This way we generate a sched IPI on the target cpu which
7498          * is idle. And the softirq performing nohz idle load balance
7499          * will be run before returning from the IPI.
7500          */
7501         smp_send_reschedule(ilb_cpu);
7502         return;
7503 }
7504
7505 static inline void nohz_balance_exit_idle(int cpu)
7506 {
7507         if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
7508                 /*
7509                  * Completely isolated CPUs don't ever set, so we must test.
7510                  */
7511                 if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) {
7512                         cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
7513                         atomic_dec(&nohz.nr_cpus);
7514                 }
7515                 clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
7516         }
7517 }
7518
7519 static inline void set_cpu_sd_state_busy(void)
7520 {
7521         struct sched_domain *sd;
7522         int cpu = smp_processor_id();
7523
7524         rcu_read_lock();
7525         sd = rcu_dereference(per_cpu(sd_busy, cpu));
7526
7527         if (!sd || !sd->nohz_idle)
7528                 goto unlock;
7529         sd->nohz_idle = 0;
7530
7531         atomic_inc(&sd->groups->sgc->nr_busy_cpus);
7532 unlock:
7533         rcu_read_unlock();
7534 }
7535
7536 void set_cpu_sd_state_idle(void)
7537 {
7538         struct sched_domain *sd;
7539         int cpu = smp_processor_id();
7540
7541         rcu_read_lock();
7542         sd = rcu_dereference(per_cpu(sd_busy, cpu));
7543
7544         if (!sd || sd->nohz_idle)
7545                 goto unlock;
7546         sd->nohz_idle = 1;
7547
7548         atomic_dec(&sd->groups->sgc->nr_busy_cpus);
7549 unlock:
7550         rcu_read_unlock();
7551 }
7552
7553 /*
7554  * This routine will record that the cpu is going idle with tick stopped.
7555  * This info will be used in performing idle load balancing in the future.
7556  */
7557 void nohz_balance_enter_idle(int cpu)
7558 {
7559         /*
7560          * If this cpu is going down, then nothing needs to be done.
7561          */
7562         if (!cpu_active(cpu))
7563                 return;
7564
7565         if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
7566                 return;
7567
7568         /*
7569          * If we're a completely isolated CPU, we don't play.
7570          */
7571         if (on_null_domain(cpu_rq(cpu)))
7572                 return;
7573
7574         cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
7575         atomic_inc(&nohz.nr_cpus);
7576         set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
7577 }
7578
7579 static int sched_ilb_notifier(struct notifier_block *nfb,
7580                                         unsigned long action, void *hcpu)
7581 {
7582         switch (action & ~CPU_TASKS_FROZEN) {
7583         case CPU_DYING:
7584                 nohz_balance_exit_idle(smp_processor_id());
7585                 return NOTIFY_OK;
7586         default:
7587                 return NOTIFY_DONE;
7588         }
7589 }
7590 #endif
7591
7592 static DEFINE_SPINLOCK(balancing);
7593
7594 /*
7595  * Scale the max load_balance interval with the number of CPUs in the system.
7596  * This trades load-balance latency on larger machines for less cross talk.
7597  */
7598 void update_max_interval(void)
7599 {
7600         max_load_balance_interval = HZ*num_online_cpus()/10;
7601 }
7602
7603 /*
7604  * It checks each scheduling domain to see if it is due to be balanced,
7605  * and initiates a balancing operation if so.
7606  *
7607  * Balancing parameters are set up in init_sched_domains.
7608  */
7609 static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
7610 {
7611         int continue_balancing = 1;
7612         int cpu = rq->cpu;
7613         unsigned long interval;
7614         struct sched_domain *sd;
7615         /* Earliest time when we have to do rebalance again */
7616         unsigned long next_balance = jiffies + 60*HZ;
7617         int update_next_balance = 0;
7618         int need_serialize, need_decay = 0;
7619         u64 max_cost = 0;
7620
7621         update_blocked_averages(cpu);
7622
7623         rcu_read_lock();
7624         for_each_domain(cpu, sd) {
7625                 /*
7626                  * Decay the newidle max times here because this is a regular
7627                  * visit to all the domains. Decay ~1% per second.
7628                  */
7629                 if (time_after(jiffies, sd->next_decay_max_lb_cost)) {
7630                         sd->max_newidle_lb_cost =
7631                                 (sd->max_newidle_lb_cost * 253) / 256;
7632                         sd->next_decay_max_lb_cost = jiffies + HZ;
7633                         need_decay = 1;
7634                 }
7635                 max_cost += sd->max_newidle_lb_cost;
7636
7637                 if (!(sd->flags & SD_LOAD_BALANCE))
7638                         continue;
7639
7640                 /*
7641                  * Stop the load balance at this level. There is another
7642                  * CPU in our sched group which is doing load balancing more
7643                  * actively.
7644                  */
7645                 if (!continue_balancing) {
7646                         if (need_decay)
7647                                 continue;
7648                         break;
7649                 }
7650
7651                 interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
7652
7653                 need_serialize = sd->flags & SD_SERIALIZE;
7654                 if (need_serialize) {
7655                         if (!spin_trylock(&balancing))
7656                                 goto out;
7657                 }
7658
7659                 if (time_after_eq(jiffies, sd->last_balance + interval)) {
7660                         if (load_balance(cpu, rq, sd, idle, &continue_balancing)) {
7661                                 /*
7662                                  * The LBF_DST_PINNED logic could have changed
7663                                  * env->dst_cpu, so we can't know our idle
7664                                  * state even if we migrated tasks. Update it.
7665                                  */
7666                                 idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
7667                         }
7668                         sd->last_balance = jiffies;
7669                         interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
7670                 }
7671                 if (need_serialize)
7672                         spin_unlock(&balancing);
7673 out:
7674                 if (time_after(next_balance, sd->last_balance + interval)) {
7675                         next_balance = sd->last_balance + interval;
7676                         update_next_balance = 1;
7677                 }
7678         }
7679         if (need_decay) {
7680                 /*
7681                  * Ensure the rq-wide value also decays but keep it at a
7682                  * reasonable floor to avoid funnies with rq->avg_idle.
7683                  */
7684                 rq->max_idle_balance_cost =
7685                         max((u64)sysctl_sched_migration_cost, max_cost);
7686         }
7687         rcu_read_unlock();
7688
7689         /*
7690          * next_balance will be updated only when there is a need.
7691          * When the cpu is attached to null domain for ex, it will not be
7692          * updated.
7693          */
7694         if (likely(update_next_balance)) {
7695                 rq->next_balance = next_balance;
7696
7697 #ifdef CONFIG_NO_HZ_COMMON
7698                 /*
7699                  * If this CPU has been elected to perform the nohz idle
7700                  * balance. Other idle CPUs have already rebalanced with
7701                  * nohz_idle_balance() and nohz.next_balance has been
7702                  * updated accordingly. This CPU is now running the idle load
7703                  * balance for itself and we need to update the
7704                  * nohz.next_balance accordingly.
7705                  */
7706                 if ((idle == CPU_IDLE) && time_after(nohz.next_balance, rq->next_balance))
7707                         nohz.next_balance = rq->next_balance;
7708 #endif
7709         }
7710 }
7711
7712 #ifdef CONFIG_NO_HZ_COMMON
7713 /*
7714  * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
7715  * rebalancing for all the cpus for whom scheduler ticks are stopped.
7716  */
7717 static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
7718 {
7719         int this_cpu = this_rq->cpu;
7720         struct rq *rq;
7721         int balance_cpu;
7722         /* Earliest time when we have to do rebalance again */
7723         unsigned long next_balance = jiffies + 60*HZ;
7724         int update_next_balance = 0;
7725
7726         if (idle != CPU_IDLE ||
7727             !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)))
7728                 goto end;
7729
7730         for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
7731                 if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
7732                         continue;
7733
7734                 /*
7735                  * If this cpu gets work to do, stop the load balancing
7736                  * work being done for other cpus. Next load
7737                  * balancing owner will pick it up.
7738                  */
7739                 if (need_resched())
7740                         break;
7741
7742                 rq = cpu_rq(balance_cpu);
7743
7744                 /*
7745                  * If time for next balance is due,
7746                  * do the balance.
7747                  */
7748                 if (time_after_eq(jiffies, rq->next_balance)) {
7749                         raw_spin_lock_irq(&rq->lock);
7750                         update_rq_clock(rq);
7751                         update_idle_cpu_load(rq);
7752                         raw_spin_unlock_irq(&rq->lock);
7753                         rebalance_domains(rq, CPU_IDLE);
7754                 }
7755
7756                 if (time_after(next_balance, rq->next_balance)) {
7757                         next_balance = rq->next_balance;
7758                         update_next_balance = 1;
7759                 }
7760         }
7761
7762         /*
7763          * next_balance will be updated only when there is a need.
7764          * When the CPU is attached to null domain for ex, it will not be
7765          * updated.
7766          */
7767         if (likely(update_next_balance))
7768                 nohz.next_balance = next_balance;
7769 end:
7770         clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));
7771 }
7772
7773 /*
7774  * Current heuristic for kicking the idle load balancer in the presence
7775  * of an idle cpu in the system.
7776  *   - This rq has more than one task.
7777  *   - This rq has at least one CFS task and the capacity of the CPU is
7778  *     significantly reduced because of RT tasks or IRQs.
7779  *   - At parent of LLC scheduler domain level, this cpu's scheduler group has
7780  *     multiple busy cpu.
7781  *   - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
7782  *     domain span are idle.
7783  */
7784 static inline bool nohz_kick_needed(struct rq *rq)
7785 {
7786         unsigned long now = jiffies;
7787         struct sched_domain *sd;
7788         struct sched_group_capacity *sgc;
7789         int nr_busy, cpu = rq->cpu;
7790         bool kick = false;
7791
7792         if (unlikely(rq->idle_balance))
7793                 return false;
7794
7795        /*
7796         * We may be recently in ticked or tickless idle mode. At the first
7797         * busy tick after returning from idle, we will update the busy stats.
7798         */
7799         set_cpu_sd_state_busy();
7800         nohz_balance_exit_idle(cpu);
7801
7802         /*
7803          * None are in tickless mode and hence no need for NOHZ idle load
7804          * balancing.
7805          */
7806         if (likely(!atomic_read(&nohz.nr_cpus)))
7807                 return false;
7808
7809         if (time_before(now, nohz.next_balance))
7810                 return false;
7811
7812         if (rq->nr_running >= 2)
7813                 return true;
7814
7815         rcu_read_lock();
7816         sd = rcu_dereference(per_cpu(sd_busy, cpu));
7817         if (sd) {
7818                 sgc = sd->groups->sgc;
7819                 nr_busy = atomic_read(&sgc->nr_busy_cpus);
7820
7821                 if (nr_busy > 1) {
7822                         kick = true;
7823                         goto unlock;
7824                 }
7825
7826         }
7827
7828         sd = rcu_dereference(rq->sd);
7829         if (sd) {
7830                 if ((rq->cfs.h_nr_running >= 1) &&
7831                                 check_cpu_capacity(rq, sd)) {
7832                         kick = true;
7833                         goto unlock;
7834                 }
7835         }
7836
7837         sd = rcu_dereference(per_cpu(sd_asym, cpu));
7838         if (sd && (cpumask_first_and(nohz.idle_cpus_mask,
7839                                   sched_domain_span(sd)) < cpu)) {
7840                 kick = true;
7841                 goto unlock;
7842         }
7843
7844 unlock:
7845         rcu_read_unlock();
7846         return kick;
7847 }
7848 #else
7849 static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { }
7850 #endif
7851
7852 /*
7853  * run_rebalance_domains is triggered when needed from the scheduler tick.
7854  * Also triggered for nohz idle balancing (with nohz_balancing_kick set).
7855  */
7856 static void run_rebalance_domains(struct softirq_action *h)
7857 {
7858         struct rq *this_rq = this_rq();
7859         enum cpu_idle_type idle = this_rq->idle_balance ?
7860                                                 CPU_IDLE : CPU_NOT_IDLE;
7861
7862         /*
7863          * If this cpu has a pending nohz_balance_kick, then do the
7864          * balancing on behalf of the other idle cpus whose ticks are
7865          * stopped. Do nohz_idle_balance *before* rebalance_domains to
7866          * give the idle cpus a chance to load balance. Else we may
7867          * load balance only within the local sched_domain hierarchy
7868          * and abort nohz_idle_balance altogether if we pull some load.
7869          */
7870         nohz_idle_balance(this_rq, idle);
7871         rebalance_domains(this_rq, idle);
7872 }
7873
7874 /*
7875  * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
7876  */
7877 void trigger_load_balance(struct rq *rq)
7878 {
7879         /* Don't need to rebalance while attached to NULL domain */
7880         if (unlikely(on_null_domain(rq)))
7881                 return;
7882
7883         if (time_after_eq(jiffies, rq->next_balance))
7884                 raise_softirq(SCHED_SOFTIRQ);
7885 #ifdef CONFIG_NO_HZ_COMMON
7886         if (nohz_kick_needed(rq))
7887                 nohz_balancer_kick();
7888 #endif
7889 }
7890
7891 static void rq_online_fair(struct rq *rq)
7892 {
7893         update_sysctl();
7894
7895         update_runtime_enabled(rq);
7896 }
7897
7898 static void rq_offline_fair(struct rq *rq)
7899 {
7900         update_sysctl();
7901
7902         /* Ensure any throttled groups are reachable by pick_next_task */
7903         unthrottle_offline_cfs_rqs(rq);
7904 }
7905
7906 #endif /* CONFIG_SMP */
7907
7908 /*
7909  * scheduler tick hitting a task of our scheduling class:
7910  */
7911 static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
7912 {
7913         struct cfs_rq *cfs_rq;
7914         struct sched_entity *se = &curr->se;
7915
7916         for_each_sched_entity(se) {
7917                 cfs_rq = cfs_rq_of(se);
7918                 entity_tick(cfs_rq, se, queued);
7919         }
7920
7921         if (static_branch_unlikely(&sched_numa_balancing))
7922                 task_tick_numa(rq, curr);
7923 }
7924
7925 /*
7926  * called on fork with the child task as argument from the parent's context
7927  *  - child not yet on the tasklist
7928  *  - preemption disabled
7929  */
7930 static void task_fork_fair(struct task_struct *p)
7931 {
7932         struct cfs_rq *cfs_rq;
7933         struct sched_entity *se = &p->se, *curr;
7934         int this_cpu = smp_processor_id();
7935         struct rq *rq = this_rq();
7936         unsigned long flags;
7937
7938         raw_spin_lock_irqsave(&rq->lock, flags);
7939
7940         update_rq_clock(rq);
7941
7942         cfs_rq = task_cfs_rq(current);
7943         curr = cfs_rq->curr;
7944
7945         /*
7946          * Not only the cpu but also the task_group of the parent might have
7947          * been changed after parent->se.parent,cfs_rq were copied to
7948          * child->se.parent,cfs_rq. So call __set_task_cpu() to make those
7949          * of child point to valid ones.
7950          */
7951         rcu_read_lock();
7952         __set_task_cpu(p, this_cpu);
7953         rcu_read_unlock();
7954
7955         update_curr(cfs_rq);
7956
7957         if (curr)
7958                 se->vruntime = curr->vruntime;
7959         place_entity(cfs_rq, se, 1);
7960
7961         if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
7962                 /*
7963                  * Upon rescheduling, sched_class::put_prev_task() will place
7964                  * 'current' within the tree based on its new key value.
7965                  */
7966                 swap(curr->vruntime, se->vruntime);
7967                 resched_curr(rq);
7968         }
7969
7970         se->vruntime -= cfs_rq->min_vruntime;
7971
7972         raw_spin_unlock_irqrestore(&rq->lock, flags);
7973 }
7974
7975 /*
7976  * Priority of the task has changed. Check to see if we preempt
7977  * the current task.
7978  */
7979 static void
7980 prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
7981 {
7982         if (!task_on_rq_queued(p))
7983                 return;
7984
7985         /*
7986          * Reschedule if we are currently running on this runqueue and
7987          * our priority decreased, or if we are not currently running on
7988          * this runqueue and our priority is higher than the current's
7989          */
7990         if (rq->curr == p) {
7991                 if (p->prio > oldprio)
7992                         resched_curr(rq);
7993         } else
7994                 check_preempt_curr(rq, p, 0);
7995 }
7996
7997 static inline bool vruntime_normalized(struct task_struct *p)
7998 {
7999         struct sched_entity *se = &p->se;
8000
8001         /*
8002          * In both the TASK_ON_RQ_QUEUED and TASK_ON_RQ_MIGRATING cases,
8003          * the dequeue_entity(.flags=0) will already have normalized the
8004          * vruntime.
8005          */
8006         if (p->on_rq)
8007                 return true;
8008
8009         /*
8010          * When !on_rq, vruntime of the task has usually NOT been normalized.
8011          * But there are some cases where it has already been normalized:
8012          *
8013          * - A forked child which is waiting for being woken up by
8014          *   wake_up_new_task().
8015          * - A task which has been woken up by try_to_wake_up() and
8016          *   waiting for actually being woken up by sched_ttwu_pending().
8017          */
8018         if (!se->sum_exec_runtime || p->state == TASK_WAKING)
8019                 return true;
8020
8021         return false;
8022 }
8023
8024 static void detach_task_cfs_rq(struct task_struct *p)
8025 {
8026         struct sched_entity *se = &p->se;
8027         struct cfs_rq *cfs_rq = cfs_rq_of(se);
8028
8029         if (!vruntime_normalized(p)) {
8030                 /*
8031                  * Fix up our vruntime so that the current sleep doesn't
8032                  * cause 'unlimited' sleep bonus.
8033                  */
8034                 place_entity(cfs_rq, se, 0);
8035                 se->vruntime -= cfs_rq->min_vruntime;
8036         }
8037
8038         /* Catch up with the cfs_rq and remove our load when we leave */
8039         detach_entity_load_avg(cfs_rq, se);
8040 }
8041
8042 static void attach_task_cfs_rq(struct task_struct *p)
8043 {
8044         struct sched_entity *se = &p->se;
8045         struct cfs_rq *cfs_rq = cfs_rq_of(se);
8046
8047 #ifdef CONFIG_FAIR_GROUP_SCHED
8048         /*
8049          * Since the real-depth could have been changed (only FAIR
8050          * class maintain depth value), reset depth properly.
8051          */
8052         se->depth = se->parent ? se->parent->depth + 1 : 0;
8053 #endif
8054
8055         /* Synchronize task with its cfs_rq */
8056         attach_entity_load_avg(cfs_rq, se);
8057
8058         if (!vruntime_normalized(p))
8059                 se->vruntime += cfs_rq->min_vruntime;
8060 }
8061
8062 static void switched_from_fair(struct rq *rq, struct task_struct *p)
8063 {
8064         detach_task_cfs_rq(p);
8065 }
8066
8067 static void switched_to_fair(struct rq *rq, struct task_struct *p)
8068 {
8069         attach_task_cfs_rq(p);
8070
8071         if (task_on_rq_queued(p)) {
8072                 /*
8073                  * We were most likely switched from sched_rt, so
8074                  * kick off the schedule if running, otherwise just see
8075                  * if we can still preempt the current task.
8076                  */
8077                 if (rq->curr == p)
8078                         resched_curr(rq);
8079                 else
8080                         check_preempt_curr(rq, p, 0);
8081         }
8082 }
8083
8084 /* Account for a task changing its policy or group.
8085  *
8086  * This routine is mostly called to set cfs_rq->curr field when a task
8087  * migrates between groups/classes.
8088  */
8089 static void set_curr_task_fair(struct rq *rq)
8090 {
8091         struct sched_entity *se = &rq->curr->se;
8092
8093         for_each_sched_entity(se) {
8094                 struct cfs_rq *cfs_rq = cfs_rq_of(se);
8095
8096                 set_next_entity(cfs_rq, se);
8097                 /* ensure bandwidth has been allocated on our new cfs_rq */
8098                 account_cfs_rq_runtime(cfs_rq, 0);
8099         }
8100 }
8101
8102 void init_cfs_rq(struct cfs_rq *cfs_rq)
8103 {
8104         cfs_rq->tasks_timeline = RB_ROOT;
8105         cfs_rq->min_vruntime = (u64)(-(1LL << 20));
8106 #ifndef CONFIG_64BIT
8107         cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
8108 #endif
8109 #ifdef CONFIG_SMP
8110         atomic_long_set(&cfs_rq->removed_load_avg, 0);
8111         atomic_long_set(&cfs_rq->removed_util_avg, 0);
8112 #endif
8113 }
8114
8115 #ifdef CONFIG_FAIR_GROUP_SCHED
8116 static void task_move_group_fair(struct task_struct *p)
8117 {
8118         detach_task_cfs_rq(p);
8119         set_task_rq(p, task_cpu(p));
8120
8121 #ifdef CONFIG_SMP
8122         /* Tell se's cfs_rq has been changed -- migrated */
8123         p->se.avg.last_update_time = 0;
8124 #endif
8125         attach_task_cfs_rq(p);
8126 }
8127
8128 void free_fair_sched_group(struct task_group *tg)
8129 {
8130         int i;
8131
8132         destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
8133
8134         for_each_possible_cpu(i) {
8135                 if (tg->cfs_rq)
8136                         kfree(tg->cfs_rq[i]);
8137                 if (tg->se) {
8138                         if (tg->se[i])
8139                                 remove_entity_load_avg(tg->se[i]);
8140                         kfree(tg->se[i]);
8141                 }
8142         }
8143
8144         kfree(tg->cfs_rq);
8145         kfree(tg->se);
8146 }
8147
8148 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8149 {
8150         struct cfs_rq *cfs_rq;
8151         struct sched_entity *se;
8152         int i;
8153
8154         tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
8155         if (!tg->cfs_rq)
8156                 goto err;
8157         tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);
8158         if (!tg->se)
8159                 goto err;
8160
8161         tg->shares = NICE_0_LOAD;
8162
8163         init_cfs_bandwidth(tg_cfs_bandwidth(tg));
8164
8165         for_each_possible_cpu(i) {
8166                 cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
8167                                       GFP_KERNEL, cpu_to_node(i));
8168                 if (!cfs_rq)
8169                         goto err;
8170
8171                 se = kzalloc_node(sizeof(struct sched_entity),
8172                                   GFP_KERNEL, cpu_to_node(i));
8173                 if (!se)
8174                         goto err_free_rq;
8175
8176                 init_cfs_rq(cfs_rq);
8177                 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
8178                 init_entity_runnable_average(se);
8179         }
8180
8181         return 1;
8182
8183 err_free_rq:
8184         kfree(cfs_rq);
8185 err:
8186         return 0;
8187 }
8188
8189 void unregister_fair_sched_group(struct task_group *tg, int cpu)
8190 {
8191         struct rq *rq = cpu_rq(cpu);
8192         unsigned long flags;
8193
8194         /*
8195         * Only empty task groups can be destroyed; so we can speculatively
8196         * check on_list without danger of it being re-added.
8197         */
8198         if (!tg->cfs_rq[cpu]->on_list)
8199                 return;
8200
8201         raw_spin_lock_irqsave(&rq->lock, flags);
8202         list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
8203         raw_spin_unlock_irqrestore(&rq->lock, flags);
8204 }
8205
8206 void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
8207                         struct sched_entity *se, int cpu,
8208                         struct sched_entity *parent)
8209 {
8210         struct rq *rq = cpu_rq(cpu);
8211
8212         cfs_rq->tg = tg;
8213         cfs_rq->rq = rq;
8214         init_cfs_rq_runtime(cfs_rq);
8215
8216         tg->cfs_rq[cpu] = cfs_rq;
8217         tg->se[cpu] = se;
8218
8219         /* se could be NULL for root_task_group */
8220         if (!se)
8221                 return;
8222
8223         if (!parent) {
8224                 se->cfs_rq = &rq->cfs;
8225                 se->depth = 0;
8226         } else {
8227                 se->cfs_rq = parent->my_q;
8228                 se->depth = parent->depth + 1;
8229         }
8230
8231         se->my_q = cfs_rq;
8232         /* guarantee group entities always have weight */
8233         update_load_set(&se->load, NICE_0_LOAD);
8234         se->parent = parent;
8235 }
8236
8237 static DEFINE_MUTEX(shares_mutex);
8238
8239 int sched_group_set_shares(struct task_group *tg, unsigned long shares)
8240 {
8241         int i;
8242         unsigned long flags;
8243
8244         /*
8245          * We can't change the weight of the root cgroup.
8246          */
8247         if (!tg->se[0])
8248                 return -EINVAL;
8249
8250         shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
8251
8252         mutex_lock(&shares_mutex);
8253         if (tg->shares == shares)
8254                 goto done;
8255
8256         tg->shares = shares;
8257         for_each_possible_cpu(i) {
8258                 struct rq *rq = cpu_rq(i);
8259                 struct sched_entity *se;
8260
8261                 se = tg->se[i];
8262                 /* Propagate contribution to hierarchy */
8263                 raw_spin_lock_irqsave(&rq->lock, flags);
8264
8265                 /* Possible calls to update_curr() need rq clock */
8266                 update_rq_clock(rq);
8267                 for_each_sched_entity(se)
8268                         update_cfs_shares(group_cfs_rq(se));
8269                 raw_spin_unlock_irqrestore(&rq->lock, flags);
8270         }
8271
8272 done:
8273         mutex_unlock(&shares_mutex);
8274         return 0;
8275 }
8276 #else /* CONFIG_FAIR_GROUP_SCHED */
8277
8278 void free_fair_sched_group(struct task_group *tg) { }
8279
8280 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8281 {
8282         return 1;
8283 }
8284
8285 void unregister_fair_sched_group(struct task_group *tg, int cpu) { }
8286
8287 #endif /* CONFIG_FAIR_GROUP_SCHED */
8288
8289
8290 static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
8291 {
8292         struct sched_entity *se = &task->se;
8293         unsigned int rr_interval = 0;
8294
8295         /*
8296          * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise
8297          * idle runqueue:
8298          */
8299         if (rq->cfs.load.weight)
8300                 rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se));
8301
8302         return rr_interval;
8303 }
8304
8305 /*
8306  * All the scheduling class methods:
8307  */
8308 const struct sched_class fair_sched_class = {
8309         .next                   = &idle_sched_class,
8310         .enqueue_task           = enqueue_task_fair,
8311         .dequeue_task           = dequeue_task_fair,
8312         .yield_task             = yield_task_fair,
8313         .yield_to_task          = yield_to_task_fair,
8314
8315         .check_preempt_curr     = check_preempt_wakeup,
8316
8317         .pick_next_task         = pick_next_task_fair,
8318         .put_prev_task          = put_prev_task_fair,
8319
8320 #ifdef CONFIG_SMP
8321         .select_task_rq         = select_task_rq_fair,
8322         .migrate_task_rq        = migrate_task_rq_fair,
8323
8324         .rq_online              = rq_online_fair,
8325         .rq_offline             = rq_offline_fair,
8326
8327         .task_waking            = task_waking_fair,
8328         .task_dead              = task_dead_fair,
8329         .set_cpus_allowed       = set_cpus_allowed_common,
8330 #endif
8331
8332         .set_curr_task          = set_curr_task_fair,
8333         .task_tick              = task_tick_fair,
8334         .task_fork              = task_fork_fair,
8335
8336         .prio_changed           = prio_changed_fair,
8337         .switched_from          = switched_from_fair,
8338         .switched_to            = switched_to_fair,
8339
8340         .get_rr_interval        = get_rr_interval_fair,
8341
8342         .update_curr            = update_curr_fair,
8343
8344 #ifdef CONFIG_FAIR_GROUP_SCHED
8345         .task_move_group        = task_move_group_fair,
8346 #endif
8347 };
8348
8349 #ifdef CONFIG_SCHED_DEBUG
8350 void print_cfs_stats(struct seq_file *m, int cpu)
8351 {
8352         struct cfs_rq *cfs_rq;
8353
8354         rcu_read_lock();
8355         for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
8356                 print_cfs_rq(m, cpu, cfs_rq);
8357         rcu_read_unlock();
8358 }
8359
8360 #ifdef CONFIG_NUMA_BALANCING
8361 void show_numa_stats(struct task_struct *p, struct seq_file *m)
8362 {
8363         int node;
8364         unsigned long tsf = 0, tpf = 0, gsf = 0, gpf = 0;
8365
8366         for_each_online_node(node) {
8367                 if (p->numa_faults) {
8368                         tsf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 0)];
8369                         tpf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 1)];
8370                 }
8371                 if (p->numa_group) {
8372                         gsf = p->numa_group->faults[task_faults_idx(NUMA_MEM, node, 0)],
8373                         gpf = p->numa_group->faults[task_faults_idx(NUMA_MEM, node, 1)];
8374                 }
8375                 print_numa_stats(m, node, tsf, tpf, gsf, gpf);
8376         }
8377 }
8378 #endif /* CONFIG_NUMA_BALANCING */
8379 #endif /* CONFIG_SCHED_DEBUG */
8380
8381 __init void init_sched_fair_class(void)
8382 {
8383 #ifdef CONFIG_SMP
8384         open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
8385
8386 #ifdef CONFIG_NO_HZ_COMMON
8387         nohz.next_balance = jiffies;
8388         zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
8389         cpu_notifier(sched_ilb_notifier, 0);
8390 #endif
8391 #endif /* SMP */
8392
8393 }