4 * Kernel scheduler and related syscalls
6 * Copyright (C) 1991-2002 Linus Torvalds
8 * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and
9 * make semaphores SMP safe
10 * 1998-11-19 Implemented schedule_timeout() and related stuff
12 * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar:
13 * hybrid priority-list and round-robin design with
14 * an array-switch method of distributing timeslices
15 * and per-CPU runqueues. Cleanups and useful suggestions
16 * by Davide Libenzi, preemptible kernel bits by Robert Love.
17 * 2003-09-03 Interactivity tuning by Con Kolivas.
18 * 2004-04-02 Scheduler domains code by Nick Piggin
22 #include <linux/module.h>
23 #include <linux/nmi.h>
24 #include <linux/init.h>
25 #include <asm/uaccess.h>
26 #include <linux/highmem.h>
27 #include <linux/smp_lock.h>
28 #include <asm/mmu_context.h>
29 #include <linux/interrupt.h>
30 #include <linux/capability.h>
31 #include <linux/completion.h>
32 #include <linux/kernel_stat.h>
33 #include <linux/debug_locks.h>
34 #include <linux/security.h>
35 #include <linux/notifier.h>
36 #include <linux/profile.h>
37 #include <linux/freezer.h>
38 #include <linux/vmalloc.h>
39 #include <linux/blkdev.h>
40 #include <linux/delay.h>
41 #include <linux/smp.h>
42 #include <linux/threads.h>
43 #include <linux/timer.h>
44 #include <linux/rcupdate.h>
45 #include <linux/cpu.h>
46 #include <linux/cpuset.h>
47 #include <linux/percpu.h>
48 #include <linux/kthread.h>
49 #include <linux/seq_file.h>
50 #include <linux/syscalls.h>
51 #include <linux/times.h>
52 #include <linux/tsacct_kern.h>
53 #include <linux/kprobes.h>
54 #include <linux/delayacct.h>
55 #include <linux/reciprocal_div.h>
58 #include <asm/unistd.h>
61 * Scheduler clock - returns current time in nanosec units.
62 * This is the default implementation.
63 * Architectures and sub-architectures can override this.
65 unsigned long long __attribute__((weak)) sched_clock(void)
67 return (unsigned long long)jiffies * (1000000000 / HZ);
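/*
 * (Editor's illustrative note, not part of the original file:) with the
 * common HZ=1000 this fallback advances in 1,000,000 ns steps, i.e. it only
 * has jiffy (1 ms) resolution; with HZ=250 each step is 4,000,000 ns.
 * Architectures with a fine-grained cycle counter override sched_clock()
 * to provide genuine nanosecond resolution.
 */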
71 * CPU frequency is/was unstable - start anew by setting prev_clock_raw:
73 void sched_clock_unstable_event(void)
78 * Convert user-nice values [ -20 ... 0 ... 19 ]
79 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
82 #define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
83 #define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)
84 #define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio)
87 * 'User priority' is the nice value converted to something we
88 * can work with better when scaling various scheduler parameters,
89 * it's a [ 0 ... 39 ] range.
91 #define USER_PRIO(p) ((p)-MAX_RT_PRIO)
92 #define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
93 #define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
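/*
 * (Editor's worked example, assuming the usual MAX_RT_PRIO == 100 and
 * MAX_PRIO == 140:) nice -20 maps to static priority 100, nice 0 to 120 and
 * nice +19 to 139; the corresponding 'user priorities' are 0, 20 and 39, so
 * MAX_USER_PRIO evaluates to 40.
 */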
96 * Some helpers for converting nanosecond timing to jiffy resolution
98 #define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ))
99 #define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ))
101 #define NICE_0_LOAD SCHED_LOAD_SCALE
102 #define NICE_0_SHIFT SCHED_LOAD_SHIFT
105 * These are the 'tuning knobs' of the scheduler:
107 * Minimum timeslice is 5 msecs (or 1 jiffy, whichever is larger),
108 * default timeslice is 100 msecs, maximum timeslice is 800 msecs.
109 * Timeslices get refilled after they expire.
111 #define MIN_TIMESLICE max(5 * HZ / 1000, 1)
112 #define DEF_TIMESLICE (100 * HZ / 1000)
113 #define ON_RUNQUEUE_WEIGHT 30
114 #define CHILD_PENALTY 95
115 #define PARENT_PENALTY 100
116 #define EXIT_WEIGHT 3
117 #define PRIO_BONUS_RATIO 25
118 #define MAX_BONUS (MAX_USER_PRIO * PRIO_BONUS_RATIO / 100)
119 #define INTERACTIVE_DELTA 2
120 #define MAX_SLEEP_AVG (DEF_TIMESLICE * MAX_BONUS)
121 #define STARVATION_LIMIT (MAX_SLEEP_AVG)
122 #define NS_MAX_SLEEP_AVG (JIFFIES_TO_NS(MAX_SLEEP_AVG))
125 * If a task is 'interactive' then we reinsert it in the active
126 * array after it has expired its current timeslice. (it will not
127 * continue to run immediately, it will still roundrobin with
128 * other interactive tasks.)
130 * This part scales the interactivity limit depending on niceness.
132 * We scale it linearly, offset by the INTERACTIVE_DELTA delta.
133 * Here are a few examples of different nice levels:
135 * TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0]
136 * TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0]
137 * TASK_INTERACTIVE( 0): [1,1,1,1,0,0,0,0,0,0,0]
138 * TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0]
139 * TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0]
141 * (the X axis represents the possible -5 ... 0 ... +5 dynamic
142 * priority range a task can explore, a value of '1' means the
143 * task is rated interactive.)
145 * I.e. nice +19 tasks can never get 'interactive' enough to be
146 * reinserted into the active array. And only heavily CPU-hogging nice -20
147 * tasks will be expired. Default nice 0 tasks are somewhere in between;
148 * it takes some effort for them to get interactive, but it's not too hard.
152 #define CURRENT_BONUS(p) \
153 (NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \
154 MAX_SLEEP_AVG)
156 #define GRANULARITY (10 * HZ / 1000 ? : 1)
158 #ifdef CONFIG_SMP
159 #define TIMESLICE_GRANULARITY(p) (GRANULARITY * \
160 (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)) * \
161 num_online_cpus())
162 #else
163 #define TIMESLICE_GRANULARITY(p) (GRANULARITY * \
164 (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)))
165 #endif
167 #define SCALE(v1,v1_max,v2_max) \
168 (v1) * (v2_max) / (v1_max)
170 #define DELTA(p) \
171 (SCALE(TASK_NICE(p) + 20, 40, MAX_BONUS) - 20 * MAX_BONUS / 40 + \
172 INTERACTIVE_DELTA)
174 #define TASK_INTERACTIVE(p) \
175 ((p)->prio <= (p)->static_prio - DELTA(p))
177 #define INTERACTIVE_SLEEP(p) \
178 (JIFFIES_TO_NS(MAX_SLEEP_AVG * \
179 (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1))
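/*
 * (Editor's worked example:) with PRIO_BONUS_RATIO == 25 and
 * MAX_USER_PRIO == 40, MAX_BONUS is 10 and MAX_SLEEP_AVG is ten default
 * timeslices (1000 ms). For a nice 0 task, DELTA() is
 * SCALE(20,40,10) - 20*10/40 + 2 = 5 - 5 + 2 = 2, so TASK_INTERACTIVE()
 * requires prio <= static_prio - 2, i.e. a dynamic-priority bonus of at
 * least 2 - exactly the TASK_INTERACTIVE(0) row in the table above. For
 * nice +19, DELTA() is 9 - 5 + 2 = 6, which can never be met within the
 * -5..+5 bonus range, hence the all-zero row.
 */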
181 #define TASK_PREEMPTS_CURR(p, rq) \
182 ((p)->prio < (rq)->curr->prio)
184 #define SCALE_PRIO(x, prio) \
185 max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE)
187 static unsigned int static_prio_timeslice(int static_prio)
189 if (static_prio < NICE_TO_PRIO(0))
190 return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio);
192 return SCALE_PRIO(DEF_TIMESLICE, static_prio);
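/*
 * (Editor's worked example, assuming MAX_PRIO == 140 and
 * MAX_USER_PRIO == 40:) a nice -20 task (static_prio 100) gets
 * SCALE_PRIO(4 * DEF_TIMESLICE, 100) = 400ms * 40/20 = 800 ms, a nice 0
 * task (static_prio 120) gets 100ms * 20/20 = 100 ms, and a nice +19 task
 * (static_prio 139) gets 100ms * 1/20 = 5 ms - the [800ms ... 100ms ... 5ms]
 * range quoted for task_timeslice() below.
 */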
197 * Divide a load by a sched group cpu_power : (load / sg->__cpu_power)
198 * Since cpu_power is a 'constant', we can use a reciprocal divide.
200 static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load)
202 return reciprocal_divide(load, sg->reciprocal_cpu_power);
206 * Each time a sched group cpu_power is changed,
207 * we must compute its reciprocal value
209 static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val)
211 sg->__cpu_power += val;
212 sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power);
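/*
 * (Editor's illustrative sketch, assuming the usual 2^32-scaled fixed-point
 * reciprocal of <linux/reciprocal_div.h>:) the divide in sg_div_cpu_power()
 * is really a multiply plus shift,
 *
 *	recip = (2^32 + power - 1) / power;		(done once, above)
 *	load / power  ~=  ((u64)load * recip) >> 32;
 *
 * which is why reciprocal_cpu_power must be recomputed whenever __cpu_power
 * changes, and why the hot path costs one multiplication instead of an
 * integer division.
 */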
217 * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ]
218 * to time slice values: [800ms ... 100ms ... 5ms]
220 * The higher a thread's priority, the bigger timeslices
221 * it gets during one round of execution. But even the lowest
222 * priority thread gets MIN_TIMESLICE worth of execution time.
225 static inline unsigned int task_timeslice(struct task_struct *p)
227 return static_prio_timeslice(p->static_prio);
230 static inline int rt_policy(int policy)
232 if (unlikely(policy == SCHED_FIFO) || unlikely(policy == SCHED_RR))
237 static inline int task_has_rt_policy(struct task_struct *p)
239 return rt_policy(p->policy);
243 * This is the priority-queue data structure of the RT scheduling class:
245 struct rt_prio_array {
246 DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */
247 struct list_head queue[MAX_RT_PRIO];
251 struct load_weight load;
252 u64 load_update_start, load_update_last;
253 unsigned long delta_fair, delta_exec, delta_stat;
256 /* CFS-related fields in a runqueue */
258 struct load_weight load;
259 unsigned long nr_running;
265 unsigned long wait_runtime_overruns, wait_runtime_underruns;
267 struct rb_root tasks_timeline;
268 struct rb_node *rb_leftmost;
269 struct rb_node *rb_load_balance_curr;
270 #ifdef CONFIG_FAIR_GROUP_SCHED
271 /* 'curr' points to currently running entity on this cfs_rq.
272 * It is set to NULL otherwise (i.e when none are currently running).
274 struct sched_entity *curr;
275 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
277 /* leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
278 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
279 * (like users, containers etc.)
281 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
282 * list is used during load balance.
284 struct list_head leaf_cfs_rq_list; /* Better name : task_cfs_rq_list? */
288 /* Real-Time classes' related field in a runqueue: */
290 struct rt_prio_array active;
291 int rt_load_balance_idx;
292 struct list_head *rt_load_balance_head, *rt_load_balance_curr;
296 * The prio-array type of the old scheduler:
299 unsigned int nr_active;
300 DECLARE_BITMAP(bitmap, MAX_PRIO+1); /* include 1 bit for delimiter */
301 struct list_head queue[MAX_PRIO];
305 * This is the main, per-CPU runqueue data structure.
307 * Locking rule: in those places that want to lock multiple runqueues
308 * (such as the load balancing or the thread migration code), lock
309 * acquisitions must be ordered by ascending runqueue address.
312 spinlock_t lock; /* runqueue lock */
315 * nr_running and cpu_load should be in the same cacheline because
316 * remote CPUs use both these fields when doing load calculation.
318 unsigned long nr_running;
319 unsigned long raw_weighted_load;
320 #define CPU_LOAD_IDX_MAX 5
321 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
322 unsigned char idle_at_tick;
324 unsigned char in_nohz_recently;
326 struct load_stat ls; /* capture load from *all* tasks on this cpu */
327 unsigned long nr_load_updates;
331 #ifdef CONFIG_FAIR_GROUP_SCHED
332 struct list_head leaf_cfs_rq_list; /* list of leaf cfs_rq on this cpu */
337 * This is part of a global counter where only the total sum
338 * over all CPUs matters. A task can increase this counter on
339 * one CPU and if it got migrated afterwards it may decrease
340 * it on another CPU. Always updated under the runqueue lock:
342 unsigned long nr_uninterruptible;
344 unsigned long expired_timestamp;
345 unsigned long long most_recent_timestamp;
347 struct task_struct *curr, *idle;
348 unsigned long next_balance;
349 struct mm_struct *prev_mm;
351 struct prio_array *active, *expired, arrays[2];
352 int best_expired_prio;
354 u64 clock, prev_clock_raw;
357 unsigned int clock_warps, clock_overflows;
358 unsigned int clock_unstable_events;
360 struct sched_class *load_balance_class;
365 struct sched_domain *sd;
367 /* For active balancing */
370 int cpu; /* cpu of this runqueue */
372 struct task_struct *migration_thread;
373 struct list_head migration_queue;
376 #ifdef CONFIG_SCHEDSTATS
378 struct sched_info rq_sched_info;
380 /* sys_sched_yield() stats */
381 unsigned long yld_exp_empty;
382 unsigned long yld_act_empty;
383 unsigned long yld_both_empty;
384 unsigned long yld_cnt;
386 /* schedule() stats */
387 unsigned long sched_switch;
388 unsigned long sched_cnt;
389 unsigned long sched_goidle;
391 /* try_to_wake_up() stats */
392 unsigned long ttwu_cnt;
393 unsigned long ttwu_local;
395 struct lock_class_key rq_lock_key;
398 static DEFINE_PER_CPU(struct rq, runqueues) ____cacheline_aligned_in_smp;
399 static DEFINE_MUTEX(sched_hotcpu_mutex);
401 static inline void check_preempt_curr(struct rq *rq, struct task_struct *p)
403 rq->curr->sched_class->check_preempt_curr(rq, p);
406 static inline int cpu_of(struct rq *rq)
416 * Per-runqueue clock, as finegrained as the platform can give us:
418 static unsigned long long __rq_clock(struct rq *rq)
420 u64 prev_raw = rq->prev_clock_raw;
421 u64 now = sched_clock();
422 s64 delta = now - prev_raw;
423 u64 clock = rq->clock;
426 * Protect against sched_clock() occasionally going backwards:
428 if (unlikely(delta < 0)) {
433 * Catch too large forward jumps too:
435 if (unlikely(delta > 2*TICK_NSEC)) {
437 rq->clock_overflows++;
439 if (unlikely(delta > rq->clock_max_delta))
440 rq->clock_max_delta = delta;
445 rq->prev_clock_raw = now;
451 static inline unsigned long long rq_clock(struct rq *rq)
453 int this_cpu = smp_processor_id();
455 if (this_cpu == cpu_of(rq))
456 return __rq_clock(rq);
462 * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
463 * See detach_destroy_domains: synchronize_sched for details.
465 * The domain tree of any CPU may only be accessed from within
466 * preempt-disabled sections.
468 #define for_each_domain(cpu, __sd) \
469 for (__sd = rcu_dereference(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)
471 #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
472 #define this_rq() (&__get_cpu_var(runqueues))
473 #define task_rq(p) cpu_rq(task_cpu(p))
474 #define cpu_curr(cpu) (cpu_rq(cpu)->curr)
476 #ifdef CONFIG_FAIR_GROUP_SCHED
477 /* Change a task's ->cfs_rq if it moves across CPUs */
478 static inline void set_task_cfs_rq(struct task_struct *p)
480 p->se.cfs_rq = &task_rq(p)->cfs;
483 static inline void set_task_cfs_rq(struct task_struct *p)
488 #ifndef prepare_arch_switch
489 # define prepare_arch_switch(next) do { } while (0)
491 #ifndef finish_arch_switch
492 # define finish_arch_switch(prev) do { } while (0)
495 #ifndef __ARCH_WANT_UNLOCKED_CTXSW
496 static inline int task_running(struct rq *rq, struct task_struct *p)
498 return rq->curr == p;
501 static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
505 static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
507 #ifdef CONFIG_DEBUG_SPINLOCK
508 /* this is a valid case when another task releases the spinlock */
509 rq->lock.owner = current;
512 * If we are tracking spinlock dependencies then we have to
513 * fix up the runqueue lock - which gets 'carried over' from
516 spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
518 spin_unlock_irq(&rq->lock);
521 #else /* __ARCH_WANT_UNLOCKED_CTXSW */
522 static inline int task_running(struct rq *rq, struct task_struct *p)
527 return rq->curr == p;
531 static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
535 * We can optimise this out completely for !SMP, because the
536 * SMP rebalancing from interrupt is the only thing that cares
541 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
542 spin_unlock_irq(&rq->lock);
544 spin_unlock(&rq->lock);
548 static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
552 * After ->oncpu is cleared, the task can be moved to a different CPU.
553 * We must ensure this doesn't happen until the switch is completely
554 * finished.
559 #ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
563 #endif /* __ARCH_WANT_UNLOCKED_CTXSW */
566 * __task_rq_lock - lock the runqueue a given task resides on.
567 * Must be called with interrupts disabled.
569 static inline struct rq *__task_rq_lock(struct task_struct *p)
576 spin_lock(&rq->lock);
577 if (unlikely(rq != task_rq(p))) {
578 spin_unlock(&rq->lock);
579 goto repeat_lock_task;
585 * task_rq_lock - lock the runqueue a given task resides on and disable
586 * interrupts. Note the ordering: we can safely lookup the task_rq without
587 * explicitly disabling preemption.
589 static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
595 local_irq_save(*flags);
597 spin_lock(&rq->lock);
598 if (unlikely(rq != task_rq(p))) {
599 spin_unlock_irqrestore(&rq->lock, *flags);
600 goto repeat_lock_task;
605 static inline void __task_rq_unlock(struct rq *rq)
608 spin_unlock(&rq->lock);
611 static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
614 spin_unlock_irqrestore(&rq->lock, *flags);
618 * this_rq_lock - lock this runqueue and disable interrupts.
620 static inline struct rq *this_rq_lock(void)
627 spin_lock(&rq->lock);
633 * resched_task - mark a task 'to be rescheduled now'.
635 * On UP this means the setting of the need_resched flag, on SMP it
636 * might also involve a cross-CPU call to trigger the scheduler on
637 * the target CPU.
641 #ifndef tsk_is_polling
642 #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
645 static void resched_task(struct task_struct *p)
649 assert_spin_locked(&task_rq(p)->lock);
651 if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED)))
654 set_tsk_thread_flag(p, TIF_NEED_RESCHED);
657 if (cpu == smp_processor_id())
660 /* NEED_RESCHED must be visible before we test polling */
662 if (!tsk_is_polling(p))
663 smp_send_reschedule(cpu);
666 static void resched_cpu(int cpu)
668 struct rq *rq = cpu_rq(cpu);
671 if (!spin_trylock_irqsave(&rq->lock, flags))
673 resched_task(cpu_curr(cpu));
674 spin_unlock_irqrestore(&rq->lock, flags);
677 static inline void resched_task(struct task_struct *p)
679 assert_spin_locked(&task_rq(p)->lock);
680 set_tsk_need_resched(p);
684 static u64 div64_likely32(u64 dividend, unsigned long divisor)
686 #if BITS_PER_LONG == 32
687 if (likely(dividend <= 0xffffffffULL))
688 return (u32)dividend / divisor;
689 do_div(dividend, divisor);
693 return dividend / divisor;
697 #if BITS_PER_LONG == 32
698 # define WMULT_CONST (~0UL)
700 # define WMULT_CONST (1UL << 32)
703 #define WMULT_SHIFT 32
705 static inline unsigned long
706 calc_delta_mine(unsigned long delta_exec, unsigned long weight,
707 struct load_weight *lw)
711 if (unlikely(!lw->inv_weight))
712 lw->inv_weight = WMULT_CONST / lw->weight;
714 tmp = (u64)delta_exec * weight;
716 * Check whether we'd overflow the 64-bit multiplication:
718 if (unlikely(tmp > WMULT_CONST)) {
719 tmp = ((tmp >> WMULT_SHIFT/2) * lw->inv_weight)
722 tmp = (tmp * lw->inv_weight) >> WMULT_SHIFT;
725 return (unsigned long)min(tmp, (u64)sysctl_sched_runtime_limit);
728 static inline unsigned long
729 calc_delta_fair(unsigned long delta_exec, struct load_weight *lw)
731 return calc_delta_mine(delta_exec, NICE_0_LOAD, lw);
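/*
 * (Editor's worked example:) calc_delta_mine() computes
 * delta_exec * weight / lw->weight, using the precomputed
 * inv_weight ~= 2^32 / lw->weight so that the division becomes a multiply
 * plus a 32-bit shift. E.g. for a nice 0 entity (NICE_0_LOAD == 1024) on a
 * runqueue whose total lw->weight is 2048, calc_delta_fair(10ms, lw) ~= 5ms:
 * the fair clock advances at half wall-clock speed. The result is clamped
 * to sysctl_sched_runtime_limit.
 */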
734 static void update_load_add(struct load_weight *lw, unsigned long inc)
740 static void update_load_sub(struct load_weight *lw, unsigned long dec)
746 static void __update_curr_load(struct rq *rq, struct load_stat *ls)
748 if (rq->curr != rq->idle && ls->load.weight) {
749 ls->delta_exec += ls->delta_stat;
750 ls->delta_fair += calc_delta_fair(ls->delta_stat, &ls->load);
756 * Update delta_exec, delta_fair fields for rq.
758 * delta_fair clock advances at a rate inversely proportional to
759 * total load (rq->ls.load.weight) on the runqueue, while
760 * delta_exec advances at the same rate as wall-clock (provided
763 * delta_exec / delta_fair is a measure of the (smoothened) load on this
764 * runqueue over any given interval. This (smoothened) load is used
765 * during load balance.
767 * This function is called /before/ updating rq->ls.load
768 * and when switching tasks.
770 static void update_curr_load(struct rq *rq, u64 now)
772 struct load_stat *ls = &rq->ls;
775 start = ls->load_update_start;
776 ls->load_update_start = now;
777 ls->delta_stat += now - start;
779 * Stagger updates to ls->delta_fair. Very frequent updates
782 if (ls->delta_stat >= sysctl_sched_stat_granularity)
783 __update_curr_load(rq, ls);
787 * To aid in avoiding the subversion of "niceness" due to uneven distribution
788 * of tasks with abnormal "nice" values across CPUs the contribution that
789 * each task makes to its run queue's load is weighted according to its
790 * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a
791 * scaled version of the new time slice allocation that they receive on time
792 * slice expiry etc.
796 * Assume: static_prio_timeslice(NICE_TO_PRIO(0)) == DEF_TIMESLICE
797 * If static_prio_timeslice() is ever changed to break this assumption then
798 * this code will need modification
800 #define TIME_SLICE_NICE_ZERO DEF_TIMESLICE
801 #define load_weight(lp) \
802 (((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO)
803 #define PRIO_TO_LOAD_WEIGHT(prio) \
804 load_weight(static_prio_timeslice(prio))
805 #define RTPRIO_TO_LOAD_WEIGHT(rp) \
806 (PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + load_weight(rp))
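/*
 * (Editor's worked example, assuming SCHED_LOAD_SCALE == 1024:) a nice 0
 * task has a DEF_TIMESLICE slice, so PRIO_TO_LOAD_WEIGHT(NICE_TO_PRIO(0))
 * is exactly SCHED_LOAD_SCALE; a nice -20 task (800 ms slice) weighs
 * 8 * SCHED_LOAD_SCALE; and RTPRIO_TO_LOAD_WEIGHT() starts from that
 * maximum and adds load_weight(rt_priority), so a realtime task always
 * outweighs any nice level.
 */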
808 #define WEIGHT_IDLEPRIO 2
809 #define WMULT_IDLEPRIO (1 << 31)
812 * Nice levels are multiplicative, with a gentle 10% change for every
813 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
814 * nice 1, it will get ~10% less CPU time than another CPU-bound task
815 * that remained on nice 0.
817 * The "10% effect" is relative and cumulative: from _any_ nice level,
818 * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
819 * it's +10% CPU usage.
821 static const int prio_to_weight[40] = {
822 /* -20 */ 88818, 71054, 56843, 45475, 36380, 29104, 23283, 18626, 14901, 11921,
823 /* -10 */ 9537, 7629, 6103, 4883, 3906, 3125, 2500, 2000, 1600, 1280,
824 /* 0 */ NICE_0_LOAD /* 1024 */,
825 /* 1 */ 819, 655, 524, 419, 336, 268, 215, 172, 137,
826 /* 10 */ 110, 87, 70, 56, 45, 36, 29, 23, 18, 15,
829 static const u32 prio_to_wmult[40] = {
830 48356, 60446, 75558, 94446, 118058, 147573,
831 184467, 230589, 288233, 360285, 450347,
832 562979, 703746, 879575, 1099582, 1374389,
833 1717986, 2147483, 2684354, 3355443, 4194304,
834 5244160, 6557201, 8196502, 10250518, 12782640,
835 16025997, 19976592, 24970740, 31350126, 39045157,
836 49367440, 61356675, 76695844, 95443717, 119304647,
837 148102320, 186737708, 238609294, 286331153,
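/*
 * (Editor's worked example:) adjacent prio_to_weight[] entries differ by a
 * factor of ~1.25 (1024 vs 819), so two CPU-bound tasks at nice 0 and nice 1
 * split the CPU roughly 55%/45% - the ~10% shift per nice level described
 * above. prio_to_wmult[i] is ~2^32 / prio_to_weight[i]
 * (1024 * 4194304 == 2^32), precomputed so that calc_delta_mine() can
 * multiply and shift instead of divide.
 */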
841 inc_load(struct rq *rq, const struct task_struct *p, u64 now)
843 update_curr_load(rq, now);
844 update_load_add(&rq->ls.load, p->se.load.weight);
848 dec_load(struct rq *rq, const struct task_struct *p, u64 now)
850 update_curr_load(rq, now);
851 update_load_sub(&rq->ls.load, p->se.load.weight);
854 static inline void inc_nr_running(struct task_struct *p, struct rq *rq, u64 now)
857 inc_load(rq, p, now);
860 static inline void dec_nr_running(struct task_struct *p, struct rq *rq, u64 now)
863 dec_load(rq, p, now);
866 static void activate_task(struct rq *rq, struct task_struct *p, int wakeup);
869 * runqueue iterator, to support SMP load-balancing between different
870 * scheduling classes, without having to expose their internal data
871 * structures to the load-balancing proper:
875 struct task_struct *(*start)(void *);
876 struct task_struct *(*next)(void *);
879 static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
880 unsigned long max_nr_move, unsigned long max_load_move,
881 struct sched_domain *sd, enum cpu_idle_type idle,
882 int *all_pinned, unsigned long *load_moved,
883 int this_best_prio, int best_prio, int best_prio_seen,
884 struct rq_iterator *iterator);
886 #include "sched_stats.h"
887 #include "sched_rt.c"
888 #include "sched_fair.c"
889 #include "sched_idletask.c"
890 #ifdef CONFIG_SCHED_DEBUG
891 # include "sched_debug.c"
894 #define sched_class_highest (&rt_sched_class)
896 static void set_load_weight(struct task_struct *p)
898 task_rq(p)->cfs.wait_runtime -= p->se.wait_runtime;
899 p->se.wait_runtime = 0;
901 if (task_has_rt_policy(p)) {
902 p->se.load.weight = prio_to_weight[0] * 2;
903 p->se.load.inv_weight = prio_to_wmult[0] >> 1;
908 * SCHED_IDLE tasks get minimal weight:
910 if (p->policy == SCHED_IDLE) {
911 p->se.load.weight = WEIGHT_IDLEPRIO;
912 p->se.load.inv_weight = WMULT_IDLEPRIO;
916 p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO];
917 p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO];
921 enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, u64 now)
923 sched_info_queued(p);
924 p->sched_class->enqueue_task(rq, p, wakeup, now);
929 dequeue_task(struct rq *rq, struct task_struct *p, int sleep, u64 now)
931 p->sched_class->dequeue_task(rq, p, sleep, now);
936 * __normal_prio - return the priority that is based on the static prio
938 static inline int __normal_prio(struct task_struct *p)
940 return p->static_prio;
944 * Calculate the expected normal priority: i.e. priority
945 * without taking RT-inheritance into account. Might be
946 * boosted by interactivity modifiers. Changes upon fork,
947 * setprio syscalls, and whenever the interactivity
948 * estimator recalculates.
950 static inline int normal_prio(struct task_struct *p)
954 if (task_has_rt_policy(p))
955 prio = MAX_RT_PRIO-1 - p->rt_priority;
957 prio = __normal_prio(p);
962 * Calculate the current priority, i.e. the priority
963 * taken into account by the scheduler. This value might
964 * be boosted by RT tasks, or might be boosted by
965 * interactivity modifiers. Will be RT if the task got
966 * RT-boosted. If not then it returns p->normal_prio.
968 static int effective_prio(struct task_struct *p)
970 p->normal_prio = normal_prio(p);
972 * If we are RT tasks or we were boosted to RT priority,
973 * keep the priority unchanged. Otherwise, update priority
974 * to the normal priority:
976 if (!rt_prio(p->prio))
977 return p->normal_prio;
982 * activate_task - move a task to the runqueue.
984 static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
986 u64 now = rq_clock(rq);
988 if (p->state == TASK_UNINTERRUPTIBLE)
989 rq->nr_uninterruptible--;
991 enqueue_task(rq, p, wakeup, now);
992 inc_nr_running(p, rq, now);
996 * activate_idle_task - move idle task to the _front_ of runqueue.
998 static inline void activate_idle_task(struct task_struct *p, struct rq *rq)
1000 u64 now = rq_clock(rq);
1002 if (p->state == TASK_UNINTERRUPTIBLE)
1003 rq->nr_uninterruptible--;
1005 enqueue_task(rq, p, 0, now);
1006 inc_nr_running(p, rq, now);
1010 * deactivate_task - remove a task from the runqueue.
1012 static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
1014 u64 now = rq_clock(rq);
1016 if (p->state == TASK_UNINTERRUPTIBLE)
1017 rq->nr_uninterruptible++;
1019 dequeue_task(rq, p, sleep, now);
1020 dec_nr_running(p, rq, now);
1024 * task_curr - is this task currently executing on a CPU?
1025 * @p: the task in question.
1027 inline int task_curr(const struct task_struct *p)
1029 return cpu_curr(task_cpu(p)) == p;
1032 /* Used instead of source_load when we know the type == 0 */
1033 unsigned long weighted_cpuload(const int cpu)
1035 return cpu_rq(cpu)->ls.load.weight;
1038 static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1041 task_thread_info(p)->cpu = cpu;
1048 void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
1050 int old_cpu = task_cpu(p);
1051 struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu);
1052 u64 clock_offset, fair_clock_offset;
1054 clock_offset = old_rq->clock - new_rq->clock;
1055 fair_clock_offset = old_rq->cfs.fair_clock -
1056 new_rq->cfs.fair_clock;
1057 if (p->se.wait_start)
1058 p->se.wait_start -= clock_offset;
1059 if (p->se.wait_start_fair)
1060 p->se.wait_start_fair -= fair_clock_offset;
1061 if (p->se.sleep_start)
1062 p->se.sleep_start -= clock_offset;
1063 if (p->se.block_start)
1064 p->se.block_start -= clock_offset;
1065 if (p->se.sleep_start_fair)
1066 p->se.sleep_start_fair -= fair_clock_offset;
1068 __set_task_cpu(p, new_cpu);
1071 struct migration_req {
1072 struct list_head list;
1074 struct task_struct *task;
1077 struct completion done;
1081 * The task's runqueue lock must be held.
1082 * Returns true if you have to wait for migration thread.
1085 migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
1087 struct rq *rq = task_rq(p);
1090 * If the task is not on a runqueue (and not running), then
1091 * it is sufficient to simply update the task's cpu field.
1093 if (!p->se.on_rq && !task_running(rq, p)) {
1094 set_task_cpu(p, dest_cpu);
1098 init_completion(&req->done);
1100 req->dest_cpu = dest_cpu;
1101 list_add(&req->list, &rq->migration_queue);
1107 * wait_task_inactive - wait for a thread to unschedule.
1109 * The caller must ensure that the task *will* unschedule sometime soon,
1110 * else this function might spin for a *long* time. This function can't
1111 * be called with interrupts off, or it may introduce deadlock with
1112 * smp_call_function() if an IPI is sent by the same process we are
1113 * waiting to become inactive.
1115 void wait_task_inactive(struct task_struct *p)
1117 unsigned long flags;
1123 * We do the initial early heuristics without holding
1124 * any task-queue locks at all. We'll only try to get
1125 * the runqueue lock when things look like they will
1131 * If the task is actively running on another CPU
1132 * still, just relax and busy-wait without holding
1135 * NOTE! Since we don't hold any locks, it's not
1136 * even sure that "rq" stays as the right runqueue!
1137 * But we don't care, since "task_running()" will
1138 * return false if the runqueue has changed and p
1139 * is actually now running somewhere else!
1141 while (task_running(rq, p))
1145 * Ok, time to look more closely! We need the rq
1146 * lock now, to be *sure*. If we're wrong, we'll
1147 * just go back and repeat.
1149 rq = task_rq_lock(p, &flags);
1150 running = task_running(rq, p);
1151 on_rq = p->se.on_rq;
1152 task_rq_unlock(rq, &flags);
1155 * Was it really running after all now that we
1156 * checked with the proper locks actually held?
1158 * Oops. Go back and try again..
1160 if (unlikely(running)) {
1166 * It's not enough that it's not actively running,
1167 * it must be off the runqueue _entirely_, and not
1170 * So if it was still runnable (but just not actively
1171 * running right now), it's preempted, and we should
1172 * yield - it could be a while.
1174 if (unlikely(on_rq)) {
1180 * Ahh, all good. It wasn't running, and it wasn't
1181 * runnable, which means that it will never become
1182 * running in the future either. We're all done!
1187 * kick_process - kick a running thread to enter/exit the kernel
1188 * @p: the to-be-kicked thread
1190 * Cause a process which is running on another CPU to enter
1191 * kernel-mode, without any delay. (to get signals handled.)
1193 * NOTE: this function doesn't have to take the runqueue lock,
1194 * because all it wants to ensure is that the remote task enters
1195 * the kernel. If the IPI races and the task has been migrated
1196 * to another CPU then no harm is done and the purpose has been
1199 void kick_process(struct task_struct *p)
1205 if ((cpu != smp_processor_id()) && task_curr(p))
1206 smp_send_reschedule(cpu);
1211 * Return a low guess at the load of a migration-source cpu weighted
1212 * according to the scheduling class and "nice" value.
1214 * We want to under-estimate the load of migration sources, to
1215 * balance conservatively.
1217 static inline unsigned long source_load(int cpu, int type)
1219 struct rq *rq = cpu_rq(cpu);
1220 unsigned long total = weighted_cpuload(cpu);
1225 return min(rq->cpu_load[type-1], total);
1229 * Return a high guess at the load of a migration-target cpu weighted
1230 * according to the scheduling class and "nice" value.
1232 static inline unsigned long target_load(int cpu, int type)
1234 struct rq *rq = cpu_rq(cpu);
1235 unsigned long total = weighted_cpuload(cpu);
1240 return max(rq->cpu_load[type-1], total);
1244 * Return the average load per task on the cpu's run queue
1246 static inline unsigned long cpu_avg_load_per_task(int cpu)
1248 struct rq *rq = cpu_rq(cpu);
1249 unsigned long total = weighted_cpuload(cpu);
1250 unsigned long n = rq->nr_running;
1252 return n ? total / n : SCHED_LOAD_SCALE;
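/*
 * (Editor's worked example:) suppose a cpu's decayed history
 * cpu_load[type-1] is 2048 while its instantaneous weighted_cpuload() is
 * 3072. As a migration source it is credited with only
 * min(2048, 3072) == 2048, while as a migration target it is charged
 * max(2048, 3072) == 3072 - both biases make the balancer conservative
 * about moving tasks.
 */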
1256 * find_idlest_group finds and returns the least busy CPU group within the
1259 static struct sched_group *
1260 find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
1262 struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups;
1263 unsigned long min_load = ULONG_MAX, this_load = 0;
1264 int load_idx = sd->forkexec_idx;
1265 int imbalance = 100 + (sd->imbalance_pct-100)/2;
1268 unsigned long load, avg_load;
1272 /* Skip over this group if it has no CPUs allowed */
1273 if (!cpus_intersects(group->cpumask, p->cpus_allowed))
1276 local_group = cpu_isset(this_cpu, group->cpumask);
1278 /* Tally up the load of all CPUs in the group */
1281 for_each_cpu_mask(i, group->cpumask) {
1282 /* Bias balancing toward cpus of our domain */
1284 load = source_load(i, load_idx);
1286 load = target_load(i, load_idx);
1291 /* Adjust by relative CPU power of the group */
1292 avg_load = sg_div_cpu_power(group,
1293 avg_load * SCHED_LOAD_SCALE);
1296 this_load = avg_load;
1298 } else if (avg_load < min_load) {
1299 min_load = avg_load;
1303 group = group->next;
1304 } while (group != sd->groups);
1306 if (!idlest || 100*this_load < imbalance*min_load)
1312 * find_idlest_cpu - find the idlest cpu among the cpus in group.
1315 find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
1318 unsigned long load, min_load = ULONG_MAX;
1322 /* Traverse only the allowed CPUs */
1323 cpus_and(tmp, group->cpumask, p->cpus_allowed);
1325 for_each_cpu_mask(i, tmp) {
1326 load = weighted_cpuload(i);
1328 if (load < min_load || (load == min_load && i == this_cpu)) {
1338 * sched_balance_self: balance the current task (running on cpu) in domains
1339 * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
1342 * Balance, ie. select the least loaded group.
1344 * Returns the target CPU number, or the same CPU if no balancing is needed.
1346 * preempt must be disabled.
1348 static int sched_balance_self(int cpu, int flag)
1350 struct task_struct *t = current;
1351 struct sched_domain *tmp, *sd = NULL;
1353 for_each_domain(cpu, tmp) {
1355 * If power savings logic is enabled for a domain, stop there.
1357 if (tmp->flags & SD_POWERSAVINGS_BALANCE)
1359 if (tmp->flags & flag)
1365 struct sched_group *group;
1366 int new_cpu, weight;
1368 if (!(sd->flags & flag)) {
1374 group = find_idlest_group(sd, t, cpu);
1380 new_cpu = find_idlest_cpu(group, t, cpu);
1381 if (new_cpu == -1 || new_cpu == cpu) {
1382 /* Now try balancing at a lower domain level of cpu */
1387 /* Now try balancing at a lower domain level of new_cpu */
1390 weight = cpus_weight(span);
1391 for_each_domain(cpu, tmp) {
1392 if (weight <= cpus_weight(tmp->span))
1394 if (tmp->flags & flag)
1397 /* while loop will break here if sd == NULL */
1403 #endif /* CONFIG_SMP */
1406 * wake_idle() will wake a task on an idle cpu if task->cpu is
1407 * not idle and an idle cpu is available. The span of cpus to
1408 * search starts with cpus closest then further out as needed,
1409 * so we always favor a closer, idle cpu.
1411 * Returns the CPU we should wake onto.
1413 #if defined(ARCH_HAS_SCHED_WAKE_IDLE)
1414 static int wake_idle(int cpu, struct task_struct *p)
1417 struct sched_domain *sd;
1421 * If it is idle, then it is the best cpu to run this task.
1423 * This cpu is also the best, if it has more than one task already.
1424 * Siblings must also be busy (in most cases) as they didn't already
1425 * pick up the extra load from this cpu and hence we need not check
1426 * sibling runqueue info. This will avoid the checks and cache miss
1427 * penalties associated with that.
1429 if (idle_cpu(cpu) || cpu_rq(cpu)->nr_running > 1)
1432 for_each_domain(cpu, sd) {
1433 if (sd->flags & SD_WAKE_IDLE) {
1434 cpus_and(tmp, sd->span, p->cpus_allowed);
1435 for_each_cpu_mask(i, tmp) {
1446 static inline int wake_idle(int cpu, struct task_struct *p)
1453 * try_to_wake_up - wake up a thread
1454 * @p: the to-be-woken-up thread
1455 * @state: the mask of task states that can be woken
1456 * @sync: do a synchronous wakeup?
1458 * Put it on the run-queue if it's not already there. The "current"
1459 * thread is always on the run-queue (except when the actual
1460 * re-schedule is in progress), and as such you're allowed to do
1461 * the simpler "current->state = TASK_RUNNING" to mark yourself
1462 * runnable without the overhead of this.
1464 * returns failure only if the task is already active.
1466 static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
1468 int cpu, this_cpu, success = 0;
1469 unsigned long flags;
1473 struct sched_domain *sd, *this_sd = NULL;
1474 unsigned long load, this_load;
1478 rq = task_rq_lock(p, &flags);
1479 old_state = p->state;
1480 if (!(old_state & state))
1487 this_cpu = smp_processor_id();
1490 if (unlikely(task_running(rq, p)))
1495 schedstat_inc(rq, ttwu_cnt);
1496 if (cpu == this_cpu) {
1497 schedstat_inc(rq, ttwu_local);
1501 for_each_domain(this_cpu, sd) {
1502 if (cpu_isset(cpu, sd->span)) {
1503 schedstat_inc(sd, ttwu_wake_remote);
1509 if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
1513 * Check for affine wakeup and passive balancing possibilities.
1516 int idx = this_sd->wake_idx;
1517 unsigned int imbalance;
1519 imbalance = 100 + (this_sd->imbalance_pct - 100) / 2;
1521 load = source_load(cpu, idx);
1522 this_load = target_load(this_cpu, idx);
1524 new_cpu = this_cpu; /* Wake to this CPU if we can */
1526 if (this_sd->flags & SD_WAKE_AFFINE) {
1527 unsigned long tl = this_load;
1528 unsigned long tl_per_task;
1530 tl_per_task = cpu_avg_load_per_task(this_cpu);
1533 * If sync wakeup then subtract the (maximum possible)
1534 * effect of the currently running task from the load
1535 * of the current CPU:
1538 tl -= current->se.load.weight;
1541 tl + target_load(cpu, idx) <= tl_per_task) ||
1542 100*(tl + p->se.load.weight) <= imbalance*load) {
1544 * This domain has SD_WAKE_AFFINE and
1545 * p is cache cold in this domain, and
1546 * there is no bad imbalance.
1548 schedstat_inc(this_sd, ttwu_move_affine);
1554 * Start passive balancing when half the imbalance_pct
1557 if (this_sd->flags & SD_WAKE_BALANCE) {
1558 if (imbalance*this_load <= 100*load) {
1559 schedstat_inc(this_sd, ttwu_move_balance);
1565 new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */
1567 new_cpu = wake_idle(new_cpu, p);
1568 if (new_cpu != cpu) {
1569 set_task_cpu(p, new_cpu);
1570 task_rq_unlock(rq, &flags);
1571 /* might preempt at this point */
1572 rq = task_rq_lock(p, &flags);
1573 old_state = p->state;
1574 if (!(old_state & state))
1579 this_cpu = smp_processor_id();
1584 #endif /* CONFIG_SMP */
1585 activate_task(rq, p, 1);
1587 * Sync wakeups (i.e. those types of wakeups where the waker
1588 * has indicated that it will leave the CPU in short order)
1589 * don't trigger a preemption, if the woken up task will run on
1590 * this cpu. (in this case the 'I will reschedule' promise of
1591 * the waker guarantees that the freshly woken up task is going
1592 * to be considered on this CPU.)
1594 if (!sync || cpu != this_cpu)
1595 check_preempt_curr(rq, p);
1599 p->state = TASK_RUNNING;
1601 task_rq_unlock(rq, &flags);
1606 int fastcall wake_up_process(struct task_struct *p)
1608 return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED |
1609 TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0);
1611 EXPORT_SYMBOL(wake_up_process);
1613 int fastcall wake_up_state(struct task_struct *p, unsigned int state)
1615 return try_to_wake_up(p, state, 0);
1619 * Perform scheduler related setup for a newly forked process p.
1620 * p is forked by current.
1622 * __sched_fork() is basic setup used by init_idle() too:
1624 static void __sched_fork(struct task_struct *p)
1626 p->se.wait_start_fair = 0;
1627 p->se.wait_start = 0;
1628 p->se.exec_start = 0;
1629 p->se.sum_exec_runtime = 0;
1630 p->se.delta_exec = 0;
1631 p->se.delta_fair_run = 0;
1632 p->se.delta_fair_sleep = 0;
1633 p->se.wait_runtime = 0;
1634 p->se.sum_wait_runtime = 0;
1635 p->se.sum_sleep_runtime = 0;
1636 p->se.sleep_start = 0;
1637 p->se.sleep_start_fair = 0;
1638 p->se.block_start = 0;
1639 p->se.sleep_max = 0;
1640 p->se.block_max = 0;
1643 p->se.wait_runtime_overruns = 0;
1644 p->se.wait_runtime_underruns = 0;
1646 INIT_LIST_HEAD(&p->run_list);
1650 * We mark the process as running here, but have not actually
1651 * inserted it onto the runqueue yet. This guarantees that
1652 * nobody will actually run it, and a signal or other external
1653 * event cannot wake it up and insert it on the runqueue either.
1655 p->state = TASK_RUNNING;
1659 * fork()/clone()-time setup:
1661 void sched_fork(struct task_struct *p, int clone_flags)
1663 int cpu = get_cpu();
1668 cpu = sched_balance_self(cpu, SD_BALANCE_FORK);
1670 __set_task_cpu(p, cpu);
1673 * Make sure we do not leak PI boosting priority to the child:
1675 p->prio = current->normal_prio;
1677 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
1678 if (likely(sched_info_on()))
1679 memset(&p->sched_info, 0, sizeof(p->sched_info));
1681 #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
1684 #ifdef CONFIG_PREEMPT
1685 /* Want to start with kernel preemption disabled. */
1686 task_thread_info(p)->preempt_count = 1;
1692 * After fork, child runs first. (default) If set to 0 then
1693 * parent will (try to) run first.
1695 unsigned int __read_mostly sysctl_sched_child_runs_first = 1;
1698 * wake_up_new_task - wake up a newly created task for the first time.
1700 * This function will do some initial scheduler statistics housekeeping
1701 * that must be done for every newly created context, then puts the task
1702 * on the runqueue and wakes it.
1704 void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
1706 unsigned long flags;
1710 rq = task_rq_lock(p, &flags);
1711 BUG_ON(p->state != TASK_RUNNING);
1712 this_cpu = smp_processor_id(); /* parent's CPU */
1714 p->prio = effective_prio(p);
1716 if (!sysctl_sched_child_runs_first || (clone_flags & CLONE_VM) ||
1717 task_cpu(p) != this_cpu || !current->se.on_rq) {
1718 activate_task(rq, p, 0);
1721 * Let the scheduling class do new task startup
1722 * management (if any):
1724 p->sched_class->task_new(rq, p);
1726 check_preempt_curr(rq, p);
1727 task_rq_unlock(rq, &flags);
1731 * prepare_task_switch - prepare to switch tasks
1732 * @rq: the runqueue preparing to switch
1733 * @next: the task we are going to switch to.
1735 * This is called with the rq lock held and interrupts off. It must
1736 * be paired with a subsequent finish_task_switch after the context
1739 * prepare_task_switch sets up locking and calls architecture specific
1742 static inline void prepare_task_switch(struct rq *rq, struct task_struct *next)
1744 prepare_lock_switch(rq, next);
1745 prepare_arch_switch(next);
1749 * finish_task_switch - clean up after a task-switch
1750 * @rq: runqueue associated with task-switch
1751 * @prev: the thread we just switched away from.
1753 * finish_task_switch must be called after the context switch, paired
1754 * with a prepare_task_switch call before the context switch.
1755 * finish_task_switch will reconcile locking set up by prepare_task_switch,
1756 * and do any other architecture-specific cleanup actions.
1758 * Note that we may have delayed dropping an mm in context_switch(). If
1759 * so, we finish that here outside of the runqueue lock. (Doing it
1760 * with the lock held can cause deadlocks; see schedule() for
1763 static inline void finish_task_switch(struct rq *rq, struct task_struct *prev)
1764 __releases(rq->lock)
1766 struct mm_struct *mm = rq->prev_mm;
1772 * A task struct has one reference for the use as "current".
1773 * If a task dies, then it sets TASK_DEAD in tsk->state and calls
1774 * schedule one last time. The schedule call will never return, and
1775 * the scheduled task must drop that reference.
1776 * The test for TASK_DEAD must occur while the runqueue locks are
1777 * still held, otherwise prev could be scheduled on another cpu, die
1778 * there before we look at prev->state, and then the reference would
1780 * Manfred Spraul <manfred@colorfullife.com>
1782 prev_state = prev->state;
1783 finish_arch_switch(prev);
1784 finish_lock_switch(rq, prev);
1787 if (unlikely(prev_state == TASK_DEAD)) {
1789 * Remove function-return probe instances associated with this
1790 * task and put them back on the free list.
1792 kprobe_flush_task(prev);
1793 put_task_struct(prev);
1798 * schedule_tail - first thing a freshly forked thread must call.
1799 * @prev: the thread we just switched away from.
1801 asmlinkage void schedule_tail(struct task_struct *prev)
1802 __releases(rq->lock)
1804 struct rq *rq = this_rq();
1806 finish_task_switch(rq, prev);
1807 #ifdef __ARCH_WANT_UNLOCKED_CTXSW
1808 /* In this case, finish_task_switch does not reenable preemption */
1811 if (current->set_child_tid)
1812 put_user(current->pid, current->set_child_tid);
1816 * context_switch - switch to the new MM and the new
1817 * thread's register state.
1820 context_switch(struct rq *rq, struct task_struct *prev,
1821 struct task_struct *next)
1823 struct mm_struct *mm, *oldmm;
1825 prepare_task_switch(rq, next);
1827 oldmm = prev->active_mm;
1829 * For paravirt, this is coupled with an exit in switch_to to
1830 * combine the page table reload and the switch backend into
1833 arch_enter_lazy_cpu_mode();
1835 if (unlikely(!mm)) {
1836 next->active_mm = oldmm;
1837 atomic_inc(&oldmm->mm_count);
1838 enter_lazy_tlb(oldmm, next);
1840 switch_mm(oldmm, mm, next);
1842 if (unlikely(!prev->mm)) {
1843 prev->active_mm = NULL;
1844 rq->prev_mm = oldmm;
1847 * The runqueue lock will be released by the next
1848 * task (which is an invalid locking op but in the case
1849 * of the scheduler it's an obvious special-case), so we
1850 * do an early lockdep release here:
1852 #ifndef __ARCH_WANT_UNLOCKED_CTXSW
1853 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
1856 /* Here we just switch the register state and the stack. */
1857 switch_to(prev, next, prev);
1861 * this_rq must be evaluated again because prev may have moved
1862 * CPUs since it called schedule(), thus the 'rq' on its stack
1863 * frame will be invalid.
1865 finish_task_switch(this_rq(), prev);
1869 * nr_running, nr_uninterruptible and nr_context_switches:
1871 * externally visible scheduler statistics: current number of runnable
1872 * threads, current number of uninterruptible-sleeping threads, total
1873 * number of context switches performed since bootup.
1875 unsigned long nr_running(void)
1877 unsigned long i, sum = 0;
1879 for_each_online_cpu(i)
1880 sum += cpu_rq(i)->nr_running;
1885 unsigned long nr_uninterruptible(void)
1887 unsigned long i, sum = 0;
1889 for_each_possible_cpu(i)
1890 sum += cpu_rq(i)->nr_uninterruptible;
1893 * Since we read the counters lockless, it might be slightly
1894 * inaccurate. Do not allow it to go below zero though:
1896 if (unlikely((long)sum < 0))
1902 unsigned long long nr_context_switches(void)
1905 unsigned long long sum = 0;
1907 for_each_possible_cpu(i)
1908 sum += cpu_rq(i)->nr_switches;
1913 unsigned long nr_iowait(void)
1915 unsigned long i, sum = 0;
1917 for_each_possible_cpu(i)
1918 sum += atomic_read(&cpu_rq(i)->nr_iowait);
1923 unsigned long nr_active(void)
1925 unsigned long i, running = 0, uninterruptible = 0;
1927 for_each_online_cpu(i) {
1928 running += cpu_rq(i)->nr_running;
1929 uninterruptible += cpu_rq(i)->nr_uninterruptible;
1932 if (unlikely((long)uninterruptible < 0))
1933 uninterruptible = 0;
1935 return running + uninterruptible;
1939 * Update rq->cpu_load[] statistics. This function is usually called every
1940 * scheduler tick (TICK_NSEC).
1942 static void update_cpu_load(struct rq *this_rq)
1944 u64 fair_delta64, exec_delta64, idle_delta64, sample_interval64, tmp64;
1945 unsigned long total_load = this_rq->ls.load.weight;
1946 unsigned long this_load = total_load;
1947 struct load_stat *ls = &this_rq->ls;
1948 u64 now = __rq_clock(this_rq);
1951 this_rq->nr_load_updates++;
1952 if (unlikely(!(sysctl_sched_features & SCHED_FEAT_PRECISE_CPU_LOAD)))
1955 /* Update delta_fair/delta_exec fields first */
1956 update_curr_load(this_rq, now);
1958 fair_delta64 = ls->delta_fair + 1;
1961 exec_delta64 = ls->delta_exec + 1;
1964 sample_interval64 = now - ls->load_update_last;
1965 ls->load_update_last = now;
1967 if ((s64)sample_interval64 < (s64)TICK_NSEC)
1968 sample_interval64 = TICK_NSEC;
1970 if (exec_delta64 > sample_interval64)
1971 exec_delta64 = sample_interval64;
1973 idle_delta64 = sample_interval64 - exec_delta64;
1975 tmp64 = div64_64(SCHED_LOAD_SCALE * exec_delta64, fair_delta64);
1976 tmp64 = div64_64(tmp64 * exec_delta64, sample_interval64);
1978 this_load = (unsigned long)tmp64;
1982 /* Update our load: */
1983 for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
1984 unsigned long old_load, new_load;
1986 /* scale is effectively 1 << i now, and >> i divides by scale */
1988 old_load = this_rq->cpu_load[i];
1989 new_load = this_load;
1991 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
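/*
 * (Editor's illustrative note:) this is an exponential moving average per
 * index: cpu_load[0] tracks the instantaneous load, while e.g. i == 2
 * computes (3*old + new)/4. Starting from 0 with a constant new_load of
 * 1024, cpu_load[2] converges as 256, 448, 592, ... toward 1024 - larger
 * indices react more slowly and are the ones selected via sd->busy_idx,
 * newidle_idx etc. in the balancing code below.
 */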
1998 * double_rq_lock - safely lock two runqueues
2000 * Note this does not disable interrupts like task_rq_lock,
2001 * you need to do so manually before calling.
2003 static void double_rq_lock(struct rq *rq1, struct rq *rq2)
2004 __acquires(rq1->lock)
2005 __acquires(rq2->lock)
2007 BUG_ON(!irqs_disabled());
2009 spin_lock(&rq1->lock);
2010 __acquire(rq2->lock); /* Fake it out ;) */
2013 spin_lock(&rq1->lock);
2014 spin_lock(&rq2->lock);
2016 spin_lock(&rq2->lock);
2017 spin_lock(&rq1->lock);
2023 * double_rq_unlock - safely unlock two runqueues
2025 * Note this does not restore interrupts like task_rq_unlock,
2026 * you need to do so manually after calling.
2028 static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
2029 __releases(rq1->lock)
2030 __releases(rq2->lock)
2032 spin_unlock(&rq1->lock);
2034 spin_unlock(&rq2->lock);
2036 __release(rq2->lock);
2040 * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
2042 static void double_lock_balance(struct rq *this_rq, struct rq *busiest)
2043 __releases(this_rq->lock)
2044 __acquires(busiest->lock)
2045 __acquires(this_rq->lock)
2047 if (unlikely(!irqs_disabled())) {
2048 /* printk() doesn't work well under rq->lock */
2049 spin_unlock(&this_rq->lock);
2052 if (unlikely(!spin_trylock(&busiest->lock))) {
2053 if (busiest < this_rq) {
2054 spin_unlock(&this_rq->lock);
2055 spin_lock(&busiest->lock);
2056 spin_lock(&this_rq->lock);
2058 spin_lock(&busiest->lock);
2063 * If dest_cpu is allowed for this process, migrate the task to it.
2064 * This is accomplished by forcing the cpus_allowed mask to only
2065 * allow dest_cpu, which will force the task onto dest_cpu. Then
2066 * the cpus_allowed mask is restored.
2068 static void sched_migrate_task(struct task_struct *p, int dest_cpu)
2070 struct migration_req req;
2071 unsigned long flags;
2074 rq = task_rq_lock(p, &flags);
2075 if (!cpu_isset(dest_cpu, p->cpus_allowed)
2076 || unlikely(cpu_is_offline(dest_cpu)))
2079 /* force the process onto the specified CPU */
2080 if (migrate_task(p, dest_cpu, &req)) {
2081 /* Need to wait for migration thread (might exit: take ref). */
2082 struct task_struct *mt = rq->migration_thread;
2084 get_task_struct(mt);
2085 task_rq_unlock(rq, &flags);
2086 wake_up_process(mt);
2087 put_task_struct(mt);
2088 wait_for_completion(&req.done);
2093 task_rq_unlock(rq, &flags);
2097 * sched_exec - execve() is a valuable balancing opportunity, because at
2098 * this point the task has the smallest effective memory and cache footprint.
2100 void sched_exec(void)
2102 int new_cpu, this_cpu = get_cpu();
2103 new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC);
2105 if (new_cpu != this_cpu)
2106 sched_migrate_task(current, new_cpu);
2110 * pull_task - move a task from a remote runqueue to the local runqueue.
2111 * Both runqueues must be locked.
2113 static void pull_task(struct rq *src_rq, struct task_struct *p,
2114 struct rq *this_rq, int this_cpu)
2116 deactivate_task(src_rq, p, 0);
2117 set_task_cpu(p, this_cpu);
2118 activate_task(this_rq, p, 0);
2120 * Note that idle threads have a prio of MAX_PRIO, for this test
2121 * to be always true for them.
2123 check_preempt_curr(this_rq, p);
2127 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
2130 int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
2131 struct sched_domain *sd, enum cpu_idle_type idle,
2135 * We do not migrate tasks that are:
2136 * 1) running (obviously), or
2137 * 2) cannot be migrated to this CPU due to cpus_allowed, or
2138 * 3) are cache-hot on their current CPU.
2140 if (!cpu_isset(this_cpu, p->cpus_allowed))
2144 if (task_running(rq, p))
2148 * Aggressive migration if too many balance attempts have failed:
2150 if (sd->nr_balance_failed > sd->cache_nice_tries)
2156 static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2157 unsigned long max_nr_move, unsigned long max_load_move,
2158 struct sched_domain *sd, enum cpu_idle_type idle,
2159 int *all_pinned, unsigned long *load_moved,
2160 int this_best_prio, int best_prio, int best_prio_seen,
2161 struct rq_iterator *iterator)
2163 int pulled = 0, pinned = 0, skip_for_load;
2164 struct task_struct *p;
2165 long rem_load_move = max_load_move;
2167 if (max_nr_move == 0 || max_load_move == 0)
2173 * Start the load-balancing iterator:
2175 p = iterator->start(iterator->arg);
2180 * To help distribute high priority tasks across CPUs we don't
2181 * skip a task if it will be the highest priority task (i.e. smallest
2182 * prio value) on its new queue regardless of its load weight
2184 skip_for_load = (p->se.load.weight >> 1) > rem_load_move +
2185 SCHED_LOAD_SCALE_FUZZ;
2186 if (skip_for_load && p->prio < this_best_prio)
2187 skip_for_load = !best_prio_seen && p->prio == best_prio;
2188 if (skip_for_load ||
2189 !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
2191 best_prio_seen |= p->prio == best_prio;
2192 p = iterator->next(iterator->arg);
2196 pull_task(busiest, p, this_rq, this_cpu);
2198 rem_load_move -= p->se.load.weight;
2201 * We only want to steal up to the prescribed number of tasks
2202 * and the prescribed amount of weighted load.
2204 if (pulled < max_nr_move && rem_load_move > 0) {
2205 if (p->prio < this_best_prio)
2206 this_best_prio = p->prio;
2207 p = iterator->next(iterator->arg);
2212 * Right now, this is the only place pull_task() is called,
2213 * so we can safely collect pull_task() stats here rather than
2214 * inside pull_task().
2216 schedstat_add(sd, lb_gained[idle], pulled);
2219 *all_pinned = pinned;
2220 *load_moved = max_load_move - rem_load_move;
2225 * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted
2226 * load from busiest to this_rq, as part of a balancing operation within
2227 * "domain". Returns the number of tasks moved.
2229 * Called with both runqueues locked.
2231 static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2232 unsigned long max_nr_move, unsigned long max_load_move,
2233 struct sched_domain *sd, enum cpu_idle_type idle,
2236 struct sched_class *class = sched_class_highest;
2237 unsigned long load_moved, total_nr_moved = 0, nr_moved;
2238 long rem_load_move = max_load_move;
2241 nr_moved = class->load_balance(this_rq, this_cpu, busiest,
2242 max_nr_move, (unsigned long)rem_load_move,
2243 sd, idle, all_pinned, &load_moved);
2244 total_nr_moved += nr_moved;
2245 max_nr_move -= nr_moved;
2246 rem_load_move -= load_moved;
2247 class = class->next;
2248 } while (class && max_nr_move && rem_load_move > 0);
2250 return total_nr_moved;
2254 * find_busiest_group finds and returns the busiest CPU group within the
2255 * domain. It calculates and returns the amount of weighted load which
2256 * should be moved to restore balance via the imbalance parameter.
2258 static struct sched_group *
2259 find_busiest_group(struct sched_domain *sd, int this_cpu,
2260 unsigned long *imbalance, enum cpu_idle_type idle,
2261 int *sd_idle, cpumask_t *cpus, int *balance)
2263 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
2264 unsigned long max_load, avg_load, total_load, this_load, total_pwr;
2265 unsigned long max_pull;
2266 unsigned long busiest_load_per_task, busiest_nr_running;
2267 unsigned long this_load_per_task, this_nr_running;
2269 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2270 int power_savings_balance = 1;
2271 unsigned long leader_nr_running = 0, min_load_per_task = 0;
2272 unsigned long min_nr_running = ULONG_MAX;
2273 struct sched_group *group_min = NULL, *group_leader = NULL;
2276 max_load = this_load = total_load = total_pwr = 0;
2277 busiest_load_per_task = busiest_nr_running = 0;
2278 this_load_per_task = this_nr_running = 0;
2279 if (idle == CPU_NOT_IDLE)
2280 load_idx = sd->busy_idx;
2281 else if (idle == CPU_NEWLY_IDLE)
2282 load_idx = sd->newidle_idx;
2284 load_idx = sd->idle_idx;
2287 unsigned long load, group_capacity;
2290 unsigned int balance_cpu = -1, first_idle_cpu = 0;
2291 unsigned long sum_nr_running, sum_weighted_load;
2293 local_group = cpu_isset(this_cpu, group->cpumask);
2296 balance_cpu = first_cpu(group->cpumask);
2298 /* Tally up the load of all CPUs in the group */
2299 sum_weighted_load = sum_nr_running = avg_load = 0;
2301 for_each_cpu_mask(i, group->cpumask) {
2304 if (!cpu_isset(i, *cpus))
2309 if (*sd_idle && !idle_cpu(i))
2312 /* Bias balancing toward cpus of our domain */
2314 if (idle_cpu(i) && !first_idle_cpu) {
2319 load = target_load(i, load_idx);
2321 load = source_load(i, load_idx);
2324 sum_nr_running += rq->nr_running;
2325 sum_weighted_load += weighted_cpuload(i);
2329 * First idle cpu or the first cpu(busiest) in this sched group
2330 * is eligible for doing load balancing at this and above
2333 if (local_group && balance_cpu != this_cpu && balance) {
2338 total_load += avg_load;
2339 total_pwr += group->__cpu_power;
2341 /* Adjust by relative CPU power of the group */
2342 avg_load = sg_div_cpu_power(group,
2343 avg_load * SCHED_LOAD_SCALE);
2345 group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
2348 this_load = avg_load;
2350 this_nr_running = sum_nr_running;
2351 this_load_per_task = sum_weighted_load;
2352 } else if (avg_load > max_load &&
2353 sum_nr_running > group_capacity) {
2354 max_load = avg_load;
2356 busiest_nr_running = sum_nr_running;
2357 busiest_load_per_task = sum_weighted_load;
2360 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2362 * Busy processors will not participate in power savings
2365 if (idle == CPU_NOT_IDLE ||
2366 !(sd->flags & SD_POWERSAVINGS_BALANCE))
2370 * If the local group is idle or completely loaded
2371 * no need to do power savings balance at this domain
2373 if (local_group && (this_nr_running >= group_capacity ||
2375 power_savings_balance = 0;
2378 * If a group is already running at full capacity or idle,
2379 * don't include that group in power savings calculations
2381 if (!power_savings_balance || sum_nr_running >= group_capacity
2386 * Calculate the group which has the least non-idle load.
2387 * This is the group from where we need to pick up the load
2390 if ((sum_nr_running < min_nr_running) ||
2391 (sum_nr_running == min_nr_running &&
2392 first_cpu(group->cpumask) <
2393 first_cpu(group_min->cpumask))) {
2395 min_nr_running = sum_nr_running;
2396 min_load_per_task = sum_weighted_load /
2401 * Calculate the group which is nearly at its
2402 * capacity but still has some room to pick up some load
2403 * from other groups and save more power
2405 if (sum_nr_running <= group_capacity - 1) {
2406 if (sum_nr_running > leader_nr_running ||
2407 (sum_nr_running == leader_nr_running &&
2408 first_cpu(group->cpumask) >
2409 first_cpu(group_leader->cpumask))) {
2410 group_leader = group;
2411 leader_nr_running = sum_nr_running;
2416 group = group->next;
2417 } while (group != sd->groups);
2419 if (!busiest || this_load >= max_load || busiest_nr_running == 0)
2422 avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
2424 if (this_load >= avg_load ||
2425 100*max_load <= sd->imbalance_pct*this_load)
2428 busiest_load_per_task /= busiest_nr_running;
2430 * We're trying to get all the cpus to the average_load, so we don't
2431 * want to push ourselves above the average load, nor do we wish to
2432 * reduce the max loaded cpu below the average load, as either of these
2433 * actions would just result in more rebalancing later, and ping-pong
2434 * tasks around. Thus we look for the minimum possible imbalance.
2435 * Negative imbalances (*we* are more loaded than anyone else) will
2436 * be counted as no imbalance for these purposes -- we can't fix that
2437 * by pulling tasks to us. Be careful of negative numbers as they'll
2438 * appear as very large values with unsigned longs.
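 *
 * Illustrative example (made-up numbers, groups of equal cpu power and
 * ignoring the per-task cap in max_pull below): with avg_load = 4,
 * max_load = 6 and this_load = 2, at most max_load - avg_load = 2 units
 * of weighted load can be pulled without pushing the busiest group
 * below the average, and at most avg_load - this_load = 2 units without
 * lifting ourselves above it; the imbalance is the smaller of the two.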
2440 if (max_load <= busiest_load_per_task)
2444 * In the presence of smp nice balancing, certain scenarios can have
2445 * max load less than avg load (as we skip the groups at or below
2446 * their cpu_power while calculating max_load)
2448 if (max_load < avg_load) {
2450 goto small_imbalance;
2453 /* Don't want to pull so many tasks that a group would go idle */
2454 max_pull = min(max_load - avg_load, max_load - busiest_load_per_task);
2456 /* How much load to actually move to equalise the imbalance */
2457 *imbalance = min(max_pull * busiest->__cpu_power,
2458 (avg_load - this_load) * this->__cpu_power)
2462 * If *imbalance is less than the average load per runnable task,
2463 * there is no guarantee that any tasks will be moved, so we may
2464 * want to bump its value to force at least one task to be
2467 if (*imbalance + SCHED_LOAD_SCALE_FUZZ < busiest_load_per_task/2) {
2468 unsigned long tmp, pwr_now, pwr_move;
2472 pwr_move = pwr_now = 0;
2474 if (this_nr_running) {
2475 this_load_per_task /= this_nr_running;
2476 if (busiest_load_per_task > this_load_per_task)
2479 this_load_per_task = SCHED_LOAD_SCALE;
2481 if (max_load - this_load + SCHED_LOAD_SCALE_FUZZ >=
2482 busiest_load_per_task * imbn) {
2483 *imbalance = busiest_load_per_task;
2488 * OK, we don't have enough imbalance to justify moving tasks,
2489 * however we may be able to increase total CPU power used by
2493 pwr_now += busiest->__cpu_power *
2494 min(busiest_load_per_task, max_load);
2495 pwr_now += this->__cpu_power *
2496 min(this_load_per_task, this_load);
2497 pwr_now /= SCHED_LOAD_SCALE;
2499 /* Amount of load we'd subtract */
2500 tmp = sg_div_cpu_power(busiest,
2501 busiest_load_per_task * SCHED_LOAD_SCALE);
2503 pwr_move += busiest->__cpu_power *
2504 min(busiest_load_per_task, max_load - tmp);
2506 /* Amount of load we'd add */
2507 if (max_load * busiest->__cpu_power <
2508 busiest_load_per_task * SCHED_LOAD_SCALE)
2509 tmp = sg_div_cpu_power(this,
2510 max_load * busiest->__cpu_power);
2512 tmp = sg_div_cpu_power(this,
2513 busiest_load_per_task * SCHED_LOAD_SCALE);
2514 pwr_move += this->__cpu_power *
2515 min(this_load_per_task, this_load + tmp);
2516 pwr_move /= SCHED_LOAD_SCALE;
2518 /* Move if we gain throughput */
2519 if (pwr_move <= pwr_now)
2522 *imbalance = busiest_load_per_task;
2528 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2529 if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
2532 if (this == group_leader && group_leader != group_min) {
2533 *imbalance = min_load_per_task;
2543 * find_busiest_queue - find the busiest runqueue among the cpus in group.
2546 find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
2547 unsigned long imbalance, cpumask_t *cpus)
2549 struct rq *busiest = NULL, *rq;
2550 unsigned long max_load = 0;
2553 for_each_cpu_mask(i, group->cpumask) {
2556 if (!cpu_isset(i, *cpus))
2560 wl = weighted_cpuload(i);
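/*
 * Skip runqueues whose single runnable task is heavier than the
 * requested imbalance; pulling it would overshoot the target.
 */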
2562 if (rq->nr_running == 1 && wl > imbalance)
2565 if (wl > max_load) {
2575 * Max backoff if we encounter pinned tasks. The value is pretty
2576 * arbitrary; anything large enough will do.
2578 #define MAX_PINNED_INTERVAL 512
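/*
 * Cap the number of tasks we try to move so that at least one task is
 * left on the source runqueue (used with busiest->nr_running below).
 */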
2580 static inline unsigned long minus_1_or_zero(unsigned long n)
2582 return n > 0 ? n - 1 : 0;
2586 * Check this_cpu to ensure it is balanced within domain. Attempt to move
2587 * tasks if there is an imbalance.
2589 static int load_balance(int this_cpu, struct rq *this_rq,
2590 struct sched_domain *sd, enum cpu_idle_type idle,
2593 int nr_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
2594 struct sched_group *group;
2595 unsigned long imbalance;
2597 cpumask_t cpus = CPU_MASK_ALL;
2598 unsigned long flags;
2601 * When power savings policy is enabled for the parent domain, idle
2602 * sibling can pick up load irrespective of busy siblings. In this case,
2603 * let the state of idle sibling percolate up as CPU_IDLE, instead of
2604 * portraying it as CPU_NOT_IDLE.
2606 if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
2607 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2610 schedstat_inc(sd, lb_cnt[idle]);
2613 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
2620 schedstat_inc(sd, lb_nobusyg[idle]);
2624 busiest = find_busiest_queue(group, idle, imbalance, &cpus);
2626 schedstat_inc(sd, lb_nobusyq[idle]);
2630 BUG_ON(busiest == this_rq);
2632 schedstat_add(sd, lb_imbalance[idle], imbalance);
2635 if (busiest->nr_running > 1) {
2637 * Attempt to move tasks. If find_busiest_group has found
2638 * an imbalance but busiest->nr_running <= 1, the group is
2639 * still unbalanced. nr_moved simply stays zero, so it is
2640 * correctly treated as an imbalance.
2642 local_irq_save(flags);
2643 double_rq_lock(this_rq, busiest);
2644 nr_moved = move_tasks(this_rq, this_cpu, busiest,
2645 minus_1_or_zero(busiest->nr_running),
2646 imbalance, sd, idle, &all_pinned);
2647 double_rq_unlock(this_rq, busiest);
2648 local_irq_restore(flags);
2651 * some other cpu did the load balance for us.
2653 if (nr_moved && this_cpu != smp_processor_id())
2654 resched_cpu(this_cpu);
2656 /* All tasks on this runqueue were pinned by CPU affinity */
2657 if (unlikely(all_pinned)) {
2658 cpu_clear(cpu_of(busiest), cpus);
2659 if (!cpus_empty(cpus))
2666 schedstat_inc(sd, lb_failed[idle]);
2667 sd->nr_balance_failed++;
2669 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
2671 spin_lock_irqsave(&busiest->lock, flags);
2673 /* don't kick the migration_thread, if the curr
2674 * task on busiest cpu can't be moved to this_cpu
2676 if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) {
2677 spin_unlock_irqrestore(&busiest->lock, flags);
2679 goto out_one_pinned;
2682 if (!busiest->active_balance) {
2683 busiest->active_balance = 1;
2684 busiest->push_cpu = this_cpu;
2687 spin_unlock_irqrestore(&busiest->lock, flags);
2689 wake_up_process(busiest->migration_thread);
2692 * We've kicked active balancing, reset the failure
2695 sd->nr_balance_failed = sd->cache_nice_tries+1;
2698 sd->nr_balance_failed = 0;
2700 if (likely(!active_balance)) {
2701 /* We were unbalanced, so reset the balancing interval */
2702 sd->balance_interval = sd->min_interval;
2705 * If we've begun active balancing, start to back off. This
2706 * case may not be covered by the all_pinned logic if there
2707 * is only 1 task on the busy runqueue (because we don't call
2710 if (sd->balance_interval < sd->max_interval)
2711 sd->balance_interval *= 2;
2714 if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2715 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2720 schedstat_inc(sd, lb_balanced[idle]);
2722 sd->nr_balance_failed = 0;
2725 /* tune up the balancing interval */
2726 if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
2727 (sd->balance_interval < sd->max_interval))
2728 sd->balance_interval *= 2;
2730 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2731 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2737 * Check this_cpu to ensure it is balanced within domain. Attempt to move
2738 * tasks if there is an imbalance.
2740 * Called from schedule when this_rq is about to become idle (CPU_NEWLY_IDLE).
2741 * this_rq is locked.
2744 load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
2746 struct sched_group *group;
2747 struct rq *busiest = NULL;
2748 unsigned long imbalance;
2751 cpumask_t cpus = CPU_MASK_ALL;
2754 * When power savings policy is enabled for the parent domain, idle
2755 * sibling can pick up load irrespective of busy siblings. In this case,
2756 * let the state of idle sibling percolate up as IDLE, instead of
2757 * portraying it as CPU_NOT_IDLE.
2759 if (sd->flags & SD_SHARE_CPUPOWER &&
2760 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2763 schedstat_inc(sd, lb_cnt[CPU_NEWLY_IDLE]);
2765 group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE,
2766 &sd_idle, &cpus, NULL);
2768 schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]);
2772 busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance,
2775 schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]);
2779 BUG_ON(busiest == this_rq);
2781 schedstat_add(sd, lb_imbalance[CPU_NEWLY_IDLE], imbalance);
2784 if (busiest->nr_running > 1) {
2785 /* Attempt to move tasks */
2786 double_lock_balance(this_rq, busiest);
2787 nr_moved = move_tasks(this_rq, this_cpu, busiest,
2788 minus_1_or_zero(busiest->nr_running),
2789 imbalance, sd, CPU_NEWLY_IDLE, NULL);
2790 spin_unlock(&busiest->lock);
2793 cpu_clear(cpu_of(busiest), cpus);
2794 if (!cpus_empty(cpus))
2800 schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]);
2801 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2802 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2805 sd->nr_balance_failed = 0;
2810 schedstat_inc(sd, lb_balanced[CPU_NEWLY_IDLE]);
2811 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2812 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2814 sd->nr_balance_failed = 0;
2820 * idle_balance is called by schedule() if this_cpu is about to become
2821 * idle. Attempts to pull tasks from other CPUs.
2823 static void idle_balance(int this_cpu, struct rq *this_rq)
2825 struct sched_domain *sd;
2826 int pulled_task = -1;
2827 unsigned long next_balance = jiffies + HZ;
2829 for_each_domain(this_cpu, sd) {
2830 unsigned long interval;
2832 if (!(sd->flags & SD_LOAD_BALANCE))
2835 if (sd->flags & SD_BALANCE_NEWIDLE)
2836 /* If we've pulled tasks over stop searching: */
2837 pulled_task = load_balance_newidle(this_cpu,
2840 interval = msecs_to_jiffies(sd->balance_interval);
2841 if (time_after(next_balance, sd->last_balance + interval))
2842 next_balance = sd->last_balance + interval;
2846 if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
2848 * We are going idle. next_balance may be set based on
2849 * a busy processor. So reset next_balance.
2851 this_rq->next_balance = next_balance;
2856 * active_load_balance is run by migration threads. It pushes running tasks
2857 * off the busiest CPU onto idle CPUs. It requires at least 1 task to be
2858 * running on each physical CPU where possible, and avoids physical /
2859 * logical imbalances.
2861 * Called with busiest_rq locked.
2863 static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
2865 int target_cpu = busiest_rq->push_cpu;
2866 struct sched_domain *sd;
2867 struct rq *target_rq;
2869 /* Is there any task to move? */
2870 if (busiest_rq->nr_running <= 1)
2873 target_rq = cpu_rq(target_cpu);
2876 * This condition is "impossible"; if it occurs
2877 * we need to fix it. Originally reported by
2878 * Bjorn Helgaas on a 128-cpu setup.
2880 BUG_ON(busiest_rq == target_rq);
2882 /* move a task from busiest_rq to target_rq */
2883 double_lock_balance(busiest_rq, target_rq);
2885 /* Search for an sd spanning us and the target CPU. */
2886 for_each_domain(target_cpu, sd) {
2887 if ((sd->flags & SD_LOAD_BALANCE) &&
2888 cpu_isset(busiest_cpu, sd->span))
2893 schedstat_inc(sd, alb_cnt);
2895 if (move_tasks(target_rq, target_cpu, busiest_rq, 1,
2896 RTPRIO_TO_LOAD_WEIGHT(100), sd, CPU_IDLE,
2898 schedstat_inc(sd, alb_pushed);
2900 schedstat_inc(sd, alb_failed);
2902 spin_unlock(&target_rq->lock);
2907 atomic_t load_balancer;
2909 } nohz ____cacheline_aligned = {
2910 .load_balancer = ATOMIC_INIT(-1),
2911 .cpu_mask = CPU_MASK_NONE,
2915 * This routine will try to nominate the ilb (idle load balancing)
2916 * owner among the cpus whose ticks are stopped. ilb owner will do the idle
2917 * load balancing on behalf of all those cpus. If all the cpus in the system
2918 * go into this tickless mode, then there will be no ilb owner (as there is
2919 * no need for one) and all the cpus will sleep till the next wakeup event
2922 * For the ilb owner, the tick is not stopped and is used for idle
2923 * load balancing. The ilb owner will still be part of
2926 * While stopping the tick, this cpu will become the ilb owner if there
2927 * is no other owner, and will remain the owner until this cpu becomes
2928 * busy or until all cpus in the system stop their ticks, at which point
2929 * there is no need for an ilb owner.
2931 * When the ilb owner becomes busy, it nominates another owner during the
2932 * next busy scheduler_tick().
2934 int select_nohz_load_balancer(int stop_tick)
2936 int cpu = smp_processor_id();
2939 cpu_set(cpu, nohz.cpu_mask);
2940 cpu_rq(cpu)->in_nohz_recently = 1;
2943 * If we are going offline and still the leader, give up!
2945 if (cpu_is_offline(cpu) &&
2946 atomic_read(&nohz.load_balancer) == cpu) {
2947 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
2952 /* time for ilb owner also to sleep */
2953 if (cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
2954 if (atomic_read(&nohz.load_balancer) == cpu)
2955 atomic_set(&nohz.load_balancer, -1);
2959 if (atomic_read(&nohz.load_balancer) == -1) {
2960 /* make me the ilb owner */
2961 if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
2963 } else if (atomic_read(&nohz.load_balancer) == cpu)
2966 if (!cpu_isset(cpu, nohz.cpu_mask))
2969 cpu_clear(cpu, nohz.cpu_mask);
2971 if (atomic_read(&nohz.load_balancer) == cpu)
2972 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
2979 static DEFINE_SPINLOCK(balancing);
2982 * It checks each scheduling domain to see if it is due to be balanced,
2983 * and initiates a balancing operation if so.
2985 * Balancing parameters are set up in arch_init_sched_domains.
2987 static inline void rebalance_domains(int cpu, enum cpu_idle_type idle)
2990 struct rq *rq = cpu_rq(cpu);
2991 unsigned long interval;
2992 struct sched_domain *sd;
2993 /* Earliest time when we have to do rebalance again */
2994 unsigned long next_balance = jiffies + 60*HZ;
2996 for_each_domain(cpu, sd) {
2997 if (!(sd->flags & SD_LOAD_BALANCE))
3000 interval = sd->balance_interval;
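/* A non-idle cpu balances less often: stretch the interval by busy_factor. */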
3001 if (idle != CPU_IDLE)
3002 interval *= sd->busy_factor;
3004 /* scale ms to jiffies */
3005 interval = msecs_to_jiffies(interval);
3006 if (unlikely(!interval))
3008 if (interval > HZ*NR_CPUS/10)
3009 interval = HZ*NR_CPUS/10;
3012 if (sd->flags & SD_SERIALIZE) {
3013 if (!spin_trylock(&balancing))
3017 if (time_after_eq(jiffies, sd->last_balance + interval)) {
3018 if (load_balance(cpu, rq, sd, idle, &balance)) {
3020 * We've pulled tasks over so either we're no
3021 * longer idle, or one of our SMT siblings is
3024 idle = CPU_NOT_IDLE;
3026 sd->last_balance = jiffies;
3028 if (sd->flags & SD_SERIALIZE)
3029 spin_unlock(&balancing);
3031 if (time_after(next_balance, sd->last_balance + interval))
3032 next_balance = sd->last_balance + interval;
3035 * Stop the load balance at this level. There is another
3036 * CPU in our sched group which is doing load balancing more
3042 rq->next_balance = next_balance;
3046 * run_rebalance_domains is triggered when needed from the scheduler tick.
3047 * In CONFIG_NO_HZ case, the idle load balance owner will do the
3048 * rebalancing for all the cpus for whom scheduler ticks are stopped.
3050 static void run_rebalance_domains(struct softirq_action *h)
3052 int this_cpu = smp_processor_id();
3053 struct rq *this_rq = cpu_rq(this_cpu);
3054 enum cpu_idle_type idle = this_rq->idle_at_tick ?
3055 CPU_IDLE : CPU_NOT_IDLE;
3057 rebalance_domains(this_cpu, idle);
3061 * If this cpu is the owner for idle load balancing, then do the
3062 * balancing on behalf of the other idle cpus whose ticks are
3065 if (this_rq->idle_at_tick &&
3066 atomic_read(&nohz.load_balancer) == this_cpu) {
3067 cpumask_t cpus = nohz.cpu_mask;
3071 cpu_clear(this_cpu, cpus);
3072 for_each_cpu_mask(balance_cpu, cpus) {
3074 * If this cpu gets work to do, stop the load balancing
3075 * work being done for other cpus. Next load
3076 * balancing owner will pick it up.
3081 rebalance_domains(balance_cpu, CPU_IDLE);
3083 rq = cpu_rq(balance_cpu);
3084 if (time_after(this_rq->next_balance, rq->next_balance))
3085 this_rq->next_balance = rq->next_balance;
3092 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
3094 * In case of CONFIG_NO_HZ, this is the place where we nominate a new
3095 * idle load balancing owner or decide to stop the periodic load balancing,
3096 * if the whole system is idle.
3098 static inline void trigger_load_balance(struct rq *rq, int cpu)
3102 * If we were in the nohz mode recently and busy at the current
3103 * scheduler tick, then check if we need to nominate new idle
3106 if (rq->in_nohz_recently && !rq->idle_at_tick) {
3107 rq->in_nohz_recently = 0;
3109 if (atomic_read(&nohz.load_balancer) == cpu) {
3110 cpu_clear(cpu, nohz.cpu_mask);
3111 atomic_set(&nohz.load_balancer, -1);
3114 if (atomic_read(&nohz.load_balancer) == -1) {
3116 * simple selection for now: Nominate the
3117 * first cpu in the nohz list to be the next
3120 * TBD: Traverse the sched domains and nominate
3121 * the nearest cpu in the nohz.cpu_mask.
3123 int ilb = first_cpu(nohz.cpu_mask);
3131 * If this cpu is idle and doing idle load balancing for all the
3132 * cpus with ticks stopped, is it time for that to stop?
3134 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
3135 cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
3141 * If this cpu is idle and the idle load balancing is done by
3142 * someone else, then there is no need to raise the SCHED_SOFTIRQ
3144 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
3145 cpu_isset(cpu, nohz.cpu_mask))
3148 if (time_after_eq(jiffies, rq->next_balance))
3149 raise_softirq(SCHED_SOFTIRQ);
3152 #else /* CONFIG_SMP */
3155 * on UP we do not need to balance between CPUs:
3157 static inline void idle_balance(int cpu, struct rq *rq)
3161 /* Avoid "used but not defined" warning on UP */
3162 static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
3163 unsigned long max_nr_move, unsigned long max_load_move,
3164 struct sched_domain *sd, enum cpu_idle_type idle,
3165 int *all_pinned, unsigned long *load_moved,
3166 int this_best_prio, int best_prio, int best_prio_seen,
3167 struct rq_iterator *iterator)
3176 DEFINE_PER_CPU(struct kernel_stat, kstat);
3178 EXPORT_PER_CPU_SYMBOL(kstat);
3181 * Return p->sum_exec_runtime plus any more ns on the sched_clock
3182 * that have not yet been banked in case the task is currently running.
3184 unsigned long long task_sched_runtime(struct task_struct *p)
3186 unsigned long flags;
3190 rq = task_rq_lock(p, &flags);
3191 ns = p->se.sum_exec_runtime;
3192 if (rq->curr == p) {
3193 delta_exec = rq_clock(rq) - p->se.exec_start;
3194 if ((s64)delta_exec > 0)
3197 task_rq_unlock(rq, &flags);
3203 * Account user cpu time to a process.
3204 * @p: the process that the cpu time gets accounted to
3206 * @cputime: the cpu time spent in user space since the last update
3208 void account_user_time(struct task_struct *p, cputime_t cputime)
3210 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3213 p->utime = cputime_add(p->utime, cputime);
3215 /* Add user time to cpustat. */
3216 tmp = cputime_to_cputime64(cputime);
3217 if (TASK_NICE(p) > 0)
3218 cpustat->nice = cputime64_add(cpustat->nice, tmp);
3220 cpustat->user = cputime64_add(cpustat->user, tmp);
3224 * Account system cpu time to a process.
3225 * @p: the process that the cpu time gets accounted to
3226 * @hardirq_offset: the offset to subtract from hardirq_count()
3227 * @cputime: the cpu time spent in kernel space since the last update
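 *
 * The time is charged to hardirq, softirq, system, iowait or idle time,
 * in that order of precedence (see the checks below).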
3229 void account_system_time(struct task_struct *p, int hardirq_offset,
3232 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3233 struct rq *rq = this_rq();
3236 p->stime = cputime_add(p->stime, cputime);
3238 /* Add system time to cpustat. */
3239 tmp = cputime_to_cputime64(cputime);
3240 if (hardirq_count() - hardirq_offset)
3241 cpustat->irq = cputime64_add(cpustat->irq, tmp);
3242 else if (softirq_count())
3243 cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
3244 else if (p != rq->idle)
3245 cpustat->system = cputime64_add(cpustat->system, tmp);
3246 else if (atomic_read(&rq->nr_iowait) > 0)
3247 cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
3249 cpustat->idle = cputime64_add(cpustat->idle, tmp);
3250 /* Account for system time used */
3251 acct_update_integrals(p);
3255 * Account for involuntary wait time.
3256 * @p: the process from which the cpu time has been stolen
3257 * @steal: the cpu time spent in involuntary wait
3259 void account_steal_time(struct task_struct *p, cputime_t steal)
3261 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3262 cputime64_t tmp = cputime_to_cputime64(steal);
3263 struct rq *rq = this_rq();
3265 if (p == rq->idle) {
3266 p->stime = cputime_add(p->stime, steal);
3267 if (atomic_read(&rq->nr_iowait) > 0)
3268 cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
3270 cpustat->idle = cputime64_add(cpustat->idle, tmp);
3272 cpustat->steal = cputime64_add(cpustat->steal, tmp);
3276 * This function gets called by the timer code, with HZ frequency.
3277 * We call it with interrupts disabled.
3279 * It also gets called by the fork code, when changing the parent's
3282 void scheduler_tick(void)
3284 int cpu = smp_processor_id();
3285 struct rq *rq = cpu_rq(cpu);
3286 struct task_struct *curr = rq->curr;
3288 spin_lock(&rq->lock);
3289 if (curr != rq->idle) /* FIXME: needed? */
3290 curr->sched_class->task_tick(rq, curr);
3291 update_cpu_load(rq);
3292 spin_unlock(&rq->lock);
3295 rq->idle_at_tick = idle_cpu(cpu);
3296 trigger_load_balance(rq, cpu);
3300 #if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT)
3302 void fastcall add_preempt_count(int val)
3307 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
3309 preempt_count() += val;
3311 * Spinlock count overflowing soon?
3313 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
3316 EXPORT_SYMBOL(add_preempt_count);
3318 void fastcall sub_preempt_count(int val)
3323 if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
3326 * Is the spinlock portion underflowing?
3328 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
3329 !(preempt_count() & PREEMPT_MASK)))
3332 preempt_count() -= val;
3334 EXPORT_SYMBOL(sub_preempt_count);
3339 * Print scheduling while atomic bug:
3341 static noinline void __schedule_bug(struct task_struct *prev)
3343 printk(KERN_ERR "BUG: scheduling while atomic: %s/0x%08x/%d\n",
3344 prev->comm, preempt_count(), prev->pid);
3345 debug_show_held_locks(prev);
3346 if (irqs_disabled())
3347 print_irqtrace_events(prev);
3352 * Various schedule()-time debugging checks and statistics:
3354 static inline void schedule_debug(struct task_struct *prev)
3357 * Test if we are atomic. Since do_exit() needs to call into
3358 * schedule() atomically, we ignore that path for now.
3359 * Otherwise, whine if we are scheduling when we should not be.
3361 if (unlikely(in_atomic_preempt_off()) && unlikely(!prev->exit_state))
3362 __schedule_bug(prev);
3364 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
3366 schedstat_inc(this_rq(), sched_cnt);
3370 * Pick up the highest-prio task:
3372 static inline struct task_struct *
3373 pick_next_task(struct rq *rq, struct task_struct *prev, u64 now)
3375 struct sched_class *class;
3376 struct task_struct *p;
3379 * Optimization: we know that if all tasks are in
3380 * the fair class we can call that function directly:
3382 if (likely(rq->nr_running == rq->cfs.nr_running)) {
3383 p = fair_sched_class.pick_next_task(rq, now);
3388 class = sched_class_highest;
3390 p = class->pick_next_task(rq, now);
3394 * Will never be NULL as the idle class always
3395 * returns a non-NULL p:
3397 class = class->next;
3402 * schedule() is the main scheduler function.
3404 asmlinkage void __sched schedule(void)
3406 struct task_struct *prev, *next;
3414 cpu = smp_processor_id();
3418 switch_count = &prev->nivcsw;
3420 release_kernel_lock(prev);
3421 need_resched_nonpreemptible:
3423 schedule_debug(prev);
3425 spin_lock_irq(&rq->lock);
3426 clear_tsk_need_resched(prev);
3428 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
3429 if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
3430 unlikely(signal_pending(prev)))) {
3431 prev->state = TASK_RUNNING;
3433 deactivate_task(rq, prev, 1);
3435 switch_count = &prev->nvcsw;
3438 if (unlikely(!rq->nr_running))
3439 idle_balance(cpu, rq);
3441 now = __rq_clock(rq);
3442 prev->sched_class->put_prev_task(rq, prev, now);
3443 next = pick_next_task(rq, prev, now);
3445 sched_info_switch(prev, next);
3447 if (likely(prev != next)) {
3452 context_switch(rq, prev, next); /* unlocks the rq */
3454 spin_unlock_irq(&rq->lock);
3456 if (unlikely(reacquire_kernel_lock(current) < 0)) {
3457 cpu = smp_processor_id();
3459 goto need_resched_nonpreemptible;
3461 preempt_enable_no_resched();
3462 if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
3465 EXPORT_SYMBOL(schedule);
3467 #ifdef CONFIG_PREEMPT
3469 * this is the entry point to schedule() from in-kernel preemption
3470 * off of preempt_enable. Kernel preemptions off of a return from
3471 * interrupt are handled by preempt_schedule_irq(), which calls schedule() directly.
3473 asmlinkage void __sched preempt_schedule(void)
3475 struct thread_info *ti = current_thread_info();
3476 #ifdef CONFIG_PREEMPT_BKL
3477 struct task_struct *task = current;
3478 int saved_lock_depth;
3481 * If there is a non-zero preempt_count or interrupts are disabled,
3482 * we do not want to preempt the current task. Just return..
3484 if (likely(ti->preempt_count || irqs_disabled()))
3488 add_preempt_count(PREEMPT_ACTIVE);
3490 * We keep the big kernel semaphore locked, but we
3491 * clear ->lock_depth so that schedule() doesn't
3492 * auto-release the semaphore:
3494 #ifdef CONFIG_PREEMPT_BKL
3495 saved_lock_depth = task->lock_depth;
3496 task->lock_depth = -1;
3499 #ifdef CONFIG_PREEMPT_BKL
3500 task->lock_depth = saved_lock_depth;
3502 sub_preempt_count(PREEMPT_ACTIVE);
3504 /* we could miss a preemption opportunity between schedule and now */
3506 if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
3509 EXPORT_SYMBOL(preempt_schedule);
3512 * this is the entry point to schedule() from kernel preemption
3513 * off of irq context.
3514 * Note that this is called and returns with irqs disabled. This
3515 * protects us against recursive calling from irq context.
3517 asmlinkage void __sched preempt_schedule_irq(void)
3519 struct thread_info *ti = current_thread_info();
3520 #ifdef CONFIG_PREEMPT_BKL
3521 struct task_struct *task = current;
3522 int saved_lock_depth;
3524 /* Catch callers which need to be fixed */
3525 BUG_ON(ti->preempt_count || !irqs_disabled());
3528 add_preempt_count(PREEMPT_ACTIVE);
3530 * We keep the big kernel semaphore locked, but we
3531 * clear ->lock_depth so that schedule() doesn't
3532 * auto-release the semaphore:
3534 #ifdef CONFIG_PREEMPT_BKL
3535 saved_lock_depth = task->lock_depth;
3536 task->lock_depth = -1;
3540 local_irq_disable();
3541 #ifdef CONFIG_PREEMPT_BKL
3542 task->lock_depth = saved_lock_depth;
3544 sub_preempt_count(PREEMPT_ACTIVE);
3546 /* we could miss a preemption opportunity between schedule and now */
3548 if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
3552 #endif /* CONFIG_PREEMPT */
3554 int default_wake_function(wait_queue_t *curr, unsigned mode, int sync,
3557 return try_to_wake_up(curr->private, mode, sync);
3559 EXPORT_SYMBOL(default_wake_function);
3562 * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
3563 * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
3564 * number) then we wake all the non-exclusive tasks and one exclusive task.
3566 * There are circumstances in which we can try to wake a task which has already
3567 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
3568 * zero in this (rare) case, and we handle it by continuing to scan the queue.
3570 static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
3571 int nr_exclusive, int sync, void *key)
3573 struct list_head *tmp, *next;
3575 list_for_each_safe(tmp, next, &q->task_list) {
3576 wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list);
3577 unsigned flags = curr->flags;
3579 if (curr->func(curr, mode, sync, key) &&
3580 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
3586 * __wake_up - wake up threads blocked on a waitqueue.
3588 * @mode: which threads
3589 * @nr_exclusive: how many wake-one or wake-many threads to wake up
3590 * @key: is directly passed to the wakeup function
3592 void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode,
3593 int nr_exclusive, void *key)
3595 unsigned long flags;
3597 spin_lock_irqsave(&q->lock, flags);
3598 __wake_up_common(q, mode, nr_exclusive, 0, key);
3599 spin_unlock_irqrestore(&q->lock, flags);
3601 EXPORT_SYMBOL(__wake_up);
3604 * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
3606 void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
3608 __wake_up_common(q, mode, 1, 0, NULL);
3612 * __wake_up_sync - wake up threads blocked on a waitqueue.
3614 * @mode: which threads
3615 * @nr_exclusive: how many wake-one or wake-many threads to wake up
3617 * The sync wakeup differs in that the waker knows that it will schedule
3618 * away soon, so while the target thread will be woken up, it will not
3619 * be migrated to another CPU - i.e. the two threads are 'synchronized'
3620 * with each other. This can prevent needless bouncing between CPUs.
3622 * On UP it can prevent extra preemption.
3625 __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
3627 unsigned long flags;
3633 if (unlikely(!nr_exclusive))
3636 spin_lock_irqsave(&q->lock, flags);
3637 __wake_up_common(q, mode, nr_exclusive, sync, NULL);
3638 spin_unlock_irqrestore(&q->lock, flags);
3640 EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
3642 void fastcall complete(struct completion *x)
3644 unsigned long flags;
3646 spin_lock_irqsave(&x->wait.lock, flags);
3648 __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
3650 spin_unlock_irqrestore(&x->wait.lock, flags);
3652 EXPORT_SYMBOL(complete);
3654 void fastcall complete_all(struct completion *x)
3656 unsigned long flags;
3658 spin_lock_irqsave(&x->wait.lock, flags);
3659 x->done += UINT_MAX/2;
3660 __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
3662 spin_unlock_irqrestore(&x->wait.lock, flags);
3664 EXPORT_SYMBOL(complete_all);
3666 void fastcall __sched wait_for_completion(struct completion *x)
3670 spin_lock_irq(&x->wait.lock);
3672 DECLARE_WAITQUEUE(wait, current);
3674 wait.flags |= WQ_FLAG_EXCLUSIVE;
3675 __add_wait_queue_tail(&x->wait, &wait);
3677 __set_current_state(TASK_UNINTERRUPTIBLE);
3678 spin_unlock_irq(&x->wait.lock);
3680 spin_lock_irq(&x->wait.lock);
3682 __remove_wait_queue(&x->wait, &wait);
3685 spin_unlock_irq(&x->wait.lock);
3687 EXPORT_SYMBOL(wait_for_completion);
3689 unsigned long fastcall __sched
3690 wait_for_completion_timeout(struct completion *x, unsigned long timeout)
3694 spin_lock_irq(&x->wait.lock);
3696 DECLARE_WAITQUEUE(wait, current);
3698 wait.flags |= WQ_FLAG_EXCLUSIVE;
3699 __add_wait_queue_tail(&x->wait, &wait);
3701 __set_current_state(TASK_UNINTERRUPTIBLE);
3702 spin_unlock_irq(&x->wait.lock);
3703 timeout = schedule_timeout(timeout);
3704 spin_lock_irq(&x->wait.lock);
3706 __remove_wait_queue(&x->wait, &wait);
3710 __remove_wait_queue(&x->wait, &wait);
3714 spin_unlock_irq(&x->wait.lock);
3717 EXPORT_SYMBOL(wait_for_completion_timeout);
3719 int fastcall __sched wait_for_completion_interruptible(struct completion *x)
3725 spin_lock_irq(&x->wait.lock);
3727 DECLARE_WAITQUEUE(wait, current);
3729 wait.flags |= WQ_FLAG_EXCLUSIVE;
3730 __add_wait_queue_tail(&x->wait, &wait);
3732 if (signal_pending(current)) {
3734 __remove_wait_queue(&x->wait, &wait);
3737 __set_current_state(TASK_INTERRUPTIBLE);
3738 spin_unlock_irq(&x->wait.lock);
3740 spin_lock_irq(&x->wait.lock);
3742 __remove_wait_queue(&x->wait, &wait);
3746 spin_unlock_irq(&x->wait.lock);
3750 EXPORT_SYMBOL(wait_for_completion_interruptible);
3752 unsigned long fastcall __sched
3753 wait_for_completion_interruptible_timeout(struct completion *x,
3754 unsigned long timeout)
3758 spin_lock_irq(&x->wait.lock);
3760 DECLARE_WAITQUEUE(wait, current);
3762 wait.flags |= WQ_FLAG_EXCLUSIVE;
3763 __add_wait_queue_tail(&x->wait, &wait);
3765 if (signal_pending(current)) {
3766 timeout = -ERESTARTSYS;
3767 __remove_wait_queue(&x->wait, &wait);
3770 __set_current_state(TASK_INTERRUPTIBLE);
3771 spin_unlock_irq(&x->wait.lock);
3772 timeout = schedule_timeout(timeout);
3773 spin_lock_irq(&x->wait.lock);
3775 __remove_wait_queue(&x->wait, &wait);
3779 __remove_wait_queue(&x->wait, &wait);
3783 spin_unlock_irq(&x->wait.lock);
3786 EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
3789 #define SLEEP_ON_VAR \
3790 unsigned long flags; \
3791 wait_queue_t wait; \
3792 init_waitqueue_entry(&wait, current);
3794 #define SLEEP_ON_HEAD \
3795 spin_lock_irqsave(&q->lock,flags); \
3796 __add_wait_queue(q, &wait); \
3797 spin_unlock(&q->lock);
3799 #define SLEEP_ON_TAIL \
3800 spin_lock_irq(&q->lock); \
3801 __remove_wait_queue(q, &wait); \
3802 spin_unlock_irqrestore(&q->lock, flags);
3804 void fastcall __sched interruptible_sleep_on(wait_queue_head_t *q)
3808 current->state = TASK_INTERRUPTIBLE;
3814 EXPORT_SYMBOL(interruptible_sleep_on);
3816 long fastcall __sched
3817 interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
3821 current->state = TASK_INTERRUPTIBLE;
3824 timeout = schedule_timeout(timeout);
3829 EXPORT_SYMBOL(interruptible_sleep_on_timeout);
3831 void fastcall __sched sleep_on(wait_queue_head_t *q)
3835 current->state = TASK_UNINTERRUPTIBLE;
3841 EXPORT_SYMBOL(sleep_on);
3843 long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
3847 current->state = TASK_UNINTERRUPTIBLE;
3850 timeout = schedule_timeout(timeout);
3856 EXPORT_SYMBOL(sleep_on_timeout);
3858 #ifdef CONFIG_RT_MUTEXES
3861 * rt_mutex_setprio - set the current priority of a task
3863 * @prio: prio value (kernel-internal form)
3865 * This function changes the 'effective' priority of a task. It does
3866 * not touch ->normal_prio like __setscheduler().
3868 * Used by the rt_mutex code to implement priority inheritance logic.
3870 void rt_mutex_setprio(struct task_struct *p, int prio)
3872 unsigned long flags;
3877 BUG_ON(prio < 0 || prio > MAX_PRIO);
3879 rq = task_rq_lock(p, &flags);
3883 on_rq = p->se.on_rq;
3885 dequeue_task(rq, p, 0, now);
3888 p->sched_class = &rt_sched_class;
3890 p->sched_class = &fair_sched_class;
3895 enqueue_task(rq, p, 0, now);
3897 * Reschedule if we are currently running on this runqueue and
3898 * our priority decreased, or if we are not currently running on
3899 * this runqueue and our priority is higher than the current's
3901 if (task_running(rq, p)) {
3902 if (p->prio > oldprio)
3903 resched_task(rq->curr);
3905 check_preempt_curr(rq, p);
3908 task_rq_unlock(rq, &flags);
3913 void set_user_nice(struct task_struct *p, long nice)
3915 int old_prio, delta, on_rq;
3916 unsigned long flags;
3920 if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
3923 * We have to be careful: if called from sys_setpriority(),
3924 * the task might be in the middle of scheduling on another CPU.
3926 rq = task_rq_lock(p, &flags);
3929 * The RT priorities are set via sched_setscheduler(), but we still
3930 * allow the 'normal' nice value to be set - but as expected
3931 * it won't have any effect on scheduling as long as the task is
3932 * SCHED_FIFO/SCHED_RR:
3934 if (task_has_rt_policy(p)) {
3935 p->static_prio = NICE_TO_PRIO(nice);
3938 on_rq = p->se.on_rq;
3940 dequeue_task(rq, p, 0, now);
3941 dec_load(rq, p, now);
3944 p->static_prio = NICE_TO_PRIO(nice);
3947 p->prio = effective_prio(p);
3948 delta = p->prio - old_prio;
3951 enqueue_task(rq, p, 0, now);
3952 inc_load(rq, p, now);
3954 * If the task increased its priority or is running and
3955 * lowered its priority, then reschedule its CPU:
3957 if (delta < 0 || (delta > 0 && task_running(rq, p)))
3958 resched_task(rq->curr);
3961 task_rq_unlock(rq, &flags);
3963 EXPORT_SYMBOL(set_user_nice);
3966 * can_nice - check if a task can reduce its nice value
3970 int can_nice(const struct task_struct *p, const int nice)
3972 /* convert nice value [19,-20] to rlimit style value [1,40] */
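/* e.g. nice 19 maps to 1, nice 0 to 20 and nice -20 to 40 */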
3973 int nice_rlim = 20 - nice;
3975 return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur ||
3976 capable(CAP_SYS_NICE));
3979 #ifdef __ARCH_WANT_SYS_NICE
3982 * sys_nice - change the priority of the current process.
3983 * @increment: priority increment
3985 * sys_setpriority is a more generic, but much slower function that
3986 * does similar things.
3988 asmlinkage long sys_nice(int increment)
3993 * Setpriority might change our priority at the same moment.
3994 * We don't have to worry. Conceptually one call occurs first
3995 * and we have a single winner.
3997 if (increment < -40)
4002 nice = PRIO_TO_NICE(current->static_prio) + increment;
4008 if (increment < 0 && !can_nice(current, nice))
4011 retval = security_task_setnice(current, nice);
4015 set_user_nice(current, nice);
4022 * task_prio - return the priority value of a given task.
4023 * @p: the task in question.
4025 * This is the priority value as seen by users in /proc.
4026 * RT tasks are offset by -200. Normal tasks are centered
4027 * around 0, value goes from -16 to +15.
4029 int task_prio(const struct task_struct *p)
4031 return p->prio - MAX_RT_PRIO;
4035 * task_nice - return the nice value of a given task.
4036 * @p: the task in question.
4038 int task_nice(const struct task_struct *p)
4040 return TASK_NICE(p);
4042 EXPORT_SYMBOL_GPL(task_nice);
4045 * idle_cpu - is a given cpu idle currently?
4046 * @cpu: the processor in question.
4048 int idle_cpu(int cpu)
4050 return cpu_curr(cpu) == cpu_rq(cpu)->idle;
4054 * idle_task - return the idle task for a given cpu.
4055 * @cpu: the processor in question.
4057 struct task_struct *idle_task(int cpu)
4059 return cpu_rq(cpu)->idle;
4063 * find_process_by_pid - find a process with a matching PID value.
4064 * @pid: the pid in question.
4066 static inline struct task_struct *find_process_by_pid(pid_t pid)
4068 return pid ? find_task_by_pid(pid) : current;
4071 /* Actually do priority change: must hold rq lock. */
4073 __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
4075 BUG_ON(p->se.on_rq);
4078 switch (p->policy) {
4082 p->sched_class = &fair_sched_class;
4086 p->sched_class = &rt_sched_class;
4090 p->rt_priority = prio;
4091 p->normal_prio = normal_prio(p);
4092 /* we are holding p->pi_lock already */
4093 p->prio = rt_mutex_getprio(p);
4098 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
4099 * @p: the task in question.
4100 * @policy: new policy.
4101 * @param: structure containing the new RT priority.
4103 * NOTE that the task may already be dead.
4105 int sched_setscheduler(struct task_struct *p, int policy,
4106 struct sched_param *param)
4108 int retval, oldprio, oldpolicy = -1, on_rq;
4109 unsigned long flags;
4112 /* may grab non-irq protected spin_locks */
4113 BUG_ON(in_interrupt());
4115 /* double check policy once rq lock held */
4117 policy = oldpolicy = p->policy;
4118 else if (policy != SCHED_FIFO && policy != SCHED_RR &&
4119 policy != SCHED_NORMAL && policy != SCHED_BATCH &&
4120 policy != SCHED_IDLE)
4123 * Valid priorities for SCHED_FIFO and SCHED_RR are
4124 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
4125 * SCHED_BATCH and SCHED_IDLE is 0.
4127 if (param->sched_priority < 0 ||
4128 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
4129 (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
4131 if (rt_policy(policy) != (param->sched_priority != 0))
4135 * Allow unprivileged RT tasks to decrease priority:
4137 if (!capable(CAP_SYS_NICE)) {
4138 if (rt_policy(policy)) {
4139 unsigned long rlim_rtprio;
4141 if (!lock_task_sighand(p, &flags))
4143 rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur;
4144 unlock_task_sighand(p, &flags);
4146 /* can't set/change the rt policy */
4147 if (policy != p->policy && !rlim_rtprio)
4150 /* can't increase priority */
4151 if (param->sched_priority > p->rt_priority &&
4152 param->sched_priority > rlim_rtprio)
4156 * Like positive nice levels, don't allow tasks to
4157 * move out of SCHED_IDLE either:
4159 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE)
4162 /* can't change other user's priorities */
4163 if ((current->euid != p->euid) &&
4164 (current->euid != p->uid))
4168 retval = security_task_setscheduler(p, policy, param);
4172 * make sure no PI-waiters arrive (or leave) while we are
4173 * changing the priority of the task:
4175 spin_lock_irqsave(&p->pi_lock, flags);
4177 * To be able to change p->policy safely, the appropriate
4178 * runqueue lock must be held.
4180 rq = __task_rq_lock(p);
4181 /* recheck policy now with rq lock held */
4182 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
4183 policy = oldpolicy = -1;
4184 __task_rq_unlock(rq);
4185 spin_unlock_irqrestore(&p->pi_lock, flags);
4188 on_rq = p->se.on_rq;
4190 deactivate_task(rq, p, 0);
4192 __setscheduler(rq, p, policy, param->sched_priority);
4194 activate_task(rq, p, 0);
4196 * Reschedule if we are currently running on this runqueue and
4197 * our priority decreased, or if we are not currently running on
4198 * this runqueue and our priority is higher than the current's
4200 if (task_running(rq, p)) {
4201 if (p->prio > oldprio)
4202 resched_task(rq->curr);
4204 check_preempt_curr(rq, p);
4207 __task_rq_unlock(rq);
4208 spin_unlock_irqrestore(&p->pi_lock, flags);
4210 rt_mutex_adjust_pi(p);
4214 EXPORT_SYMBOL_GPL(sched_setscheduler);
4217 do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
4219 struct sched_param lparam;
4220 struct task_struct *p;
4223 if (!param || pid < 0)
4225 if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
4230 p = find_process_by_pid(pid);
4232 retval = sched_setscheduler(p, policy, &lparam);
4239 * sys_sched_setscheduler - set/change the scheduler policy and RT priority
4240 * @pid: the pid in question.
4241 * @policy: new policy.
4242 * @param: structure containing the new RT priority.
4244 asmlinkage long sys_sched_setscheduler(pid_t pid, int policy,
4245 struct sched_param __user *param)
4247 /* negative values for policy are not valid */
4251 return do_sched_setscheduler(pid, policy, param);
4255 * sys_sched_setparam - set/change the RT priority of a thread
4256 * @pid: the pid in question.
4257 * @param: structure containing the new RT priority.
4259 asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param)
4261 return do_sched_setscheduler(pid, -1, param);
4265 * sys_sched_getscheduler - get the policy (scheduling class) of a thread
4266 * @pid: the pid in question.
4268 asmlinkage long sys_sched_getscheduler(pid_t pid)
4270 struct task_struct *p;
4271 int retval = -EINVAL;
4277 read_lock(&tasklist_lock);
4278 p = find_process_by_pid(pid);
4280 retval = security_task_getscheduler(p);
4284 read_unlock(&tasklist_lock);
4291 * sys_sched_getparam - get the RT priority of a thread
4292 * @pid: the pid in question.
4293 * @param: structure containing the RT priority.
4295 asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param)
4297 struct sched_param lp;
4298 struct task_struct *p;
4299 int retval = -EINVAL;
4301 if (!param || pid < 0)
4304 read_lock(&tasklist_lock);
4305 p = find_process_by_pid(pid);
4310 retval = security_task_getscheduler(p);
4314 lp.sched_priority = p->rt_priority;
4315 read_unlock(&tasklist_lock);
4318 * This one might sleep; we cannot do it with a spinlock held ...
4320 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
4326 read_unlock(&tasklist_lock);
4330 long sched_setaffinity(pid_t pid, cpumask_t new_mask)
4332 cpumask_t cpus_allowed;
4333 struct task_struct *p;
4336 mutex_lock(&sched_hotcpu_mutex);
4337 read_lock(&tasklist_lock);
4339 p = find_process_by_pid(pid);
4341 read_unlock(&tasklist_lock);
4342 mutex_unlock(&sched_hotcpu_mutex);
4347 * It is not safe to call set_cpus_allowed with the
4348 * tasklist_lock held. We will bump the task_struct's
4349 * usage count and then drop tasklist_lock.
4352 read_unlock(&tasklist_lock);
4355 if ((current->euid != p->euid) && (current->euid != p->uid) &&
4356 !capable(CAP_SYS_NICE))
4359 retval = security_task_setscheduler(p, 0, NULL);
4363 cpus_allowed = cpuset_cpus_allowed(p);
4364 cpus_and(new_mask, new_mask, cpus_allowed);
4365 retval = set_cpus_allowed(p, new_mask);
4369 mutex_unlock(&sched_hotcpu_mutex);
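/*
 * Copy a cpu mask from user space, zero-filling it if the user buffer
 * is shorter than a kernel cpumask_t and truncating it if it is longer.
 */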
4373 static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
4374 cpumask_t *new_mask)
4376 if (len < sizeof(cpumask_t)) {
4377 memset(new_mask, 0, sizeof(cpumask_t));
4378 } else if (len > sizeof(cpumask_t)) {
4379 len = sizeof(cpumask_t);
4381 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
4385 * sys_sched_setaffinity - set the cpu affinity of a process
4386 * @pid: pid of the process
4387 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
4388 * @user_mask_ptr: user-space pointer to the new cpu mask
4390 asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
4391 unsigned long __user *user_mask_ptr)
4396 retval = get_user_cpu_mask(user_mask_ptr, len, &new_mask);
4400 return sched_setaffinity(pid, new_mask);
4404 * Represents all cpus present in the system.
4405 * In systems capable of hotplug, this map could dynamically grow
4406 * as new cpus are detected in the system via any platform-specific
4407 * method, such as ACPI.
4410 cpumask_t cpu_present_map __read_mostly;
4411 EXPORT_SYMBOL(cpu_present_map);
4414 cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL;
4415 EXPORT_SYMBOL(cpu_online_map);
4417 cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL;
4418 EXPORT_SYMBOL(cpu_possible_map);
4421 long sched_getaffinity(pid_t pid, cpumask_t *mask)
4423 struct task_struct *p;
4426 mutex_lock(&sched_hotcpu_mutex);
4427 read_lock(&tasklist_lock);
4430 p = find_process_by_pid(pid);
4434 retval = security_task_getscheduler(p);
4438 cpus_and(*mask, p->cpus_allowed, cpu_online_map);
4441 read_unlock(&tasklist_lock);
4442 mutex_unlock(&sched_hotcpu_mutex);
4450 * sys_sched_getaffinity - get the cpu affinity of a process
4451 * @pid: pid of the process
4452 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
4453 * @user_mask_ptr: user-space pointer to hold the current cpu mask
4455 asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len,
4456 unsigned long __user *user_mask_ptr)
4461 if (len < sizeof(cpumask_t))
4464 ret = sched_getaffinity(pid, &mask);
4468 if (copy_to_user(user_mask_ptr, &mask, sizeof(cpumask_t)))
4471 return sizeof(cpumask_t);
4475 * sys_sched_yield - yield the current processor to other threads.
4477 * This function yields the current CPU to other tasks. If there are no
4478 * other threads running on this CPU then this function will return.
4480 asmlinkage long sys_sched_yield(void)
4482 struct rq *rq = this_rq_lock();
4484 schedstat_inc(rq, yld_cnt);
4485 if (unlikely(rq->nr_running == 1))
4486 schedstat_inc(rq, yld_act_empty);
4488 current->sched_class->yield_task(rq, current);
4491 * Since we are going to call schedule() anyway, there's
4492 * no need to preempt or enable interrupts:
4494 __release(rq->lock);
4495 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
4496 _raw_spin_unlock(&rq->lock);
4497 preempt_enable_no_resched();
4504 static void __cond_resched(void)
4506 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
4507 __might_sleep(__FILE__, __LINE__);
4510 * The BKS might be reacquired before we have dropped
4511 * PREEMPT_ACTIVE, which could trigger a second
4512 * cond_resched() call.
4515 add_preempt_count(PREEMPT_ACTIVE);
4517 sub_preempt_count(PREEMPT_ACTIVE);
4518 } while (need_resched());
4521 int __sched cond_resched(void)
4523 if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) &&
4524 system_state == SYSTEM_RUNNING) {
4530 EXPORT_SYMBOL(cond_resched);
4533 * cond_resched_lock() - if a reschedule is pending, drop the given lock,
4534 * call schedule, and on return reacquire the lock.
4536 * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
4537 * operations here to prevent schedule() from being called twice (once via
4538 * spin_unlock(), once by hand).
4540 int cond_resched_lock(spinlock_t *lock)
4544 if (need_lockbreak(lock)) {
4550 if (need_resched() && system_state == SYSTEM_RUNNING) {
4551 spin_release(&lock->dep_map, 1, _THIS_IP_);
4552 _raw_spin_unlock(lock);
4553 preempt_enable_no_resched();
4560 EXPORT_SYMBOL(cond_resched_lock);
4562 int __sched cond_resched_softirq(void)
4564 BUG_ON(!in_softirq());
4566 if (need_resched() && system_state == SYSTEM_RUNNING) {
4574 EXPORT_SYMBOL(cond_resched_softirq);
4577 * yield - yield the current processor to other threads.
4579 * This is a shortcut for kernel-space yielding - it marks the
4580 * thread runnable and calls sys_sched_yield().
4582 void __sched yield(void)
4584 set_current_state(TASK_RUNNING);
4587 EXPORT_SYMBOL(yield);
4590 * This task is about to go to sleep on IO. Increment rq->nr_iowait so
4591 * that process accounting knows that this is a task in IO wait state.
4593 * But don't do that if it is a deliberate, throttling IO wait (this task
4594 * has set its backing_dev_info: the queue against which it should throttle)
4596 void __sched io_schedule(void)
4598 struct rq *rq = &__raw_get_cpu_var(runqueues);
4600 delayacct_blkio_start();
4601 atomic_inc(&rq->nr_iowait);
4603 atomic_dec(&rq->nr_iowait);
4604 delayacct_blkio_end();
4606 EXPORT_SYMBOL(io_schedule);
4608 long __sched io_schedule_timeout(long timeout)
4610 struct rq *rq = &__raw_get_cpu_var(runqueues);
4613 delayacct_blkio_start();
4614 atomic_inc(&rq->nr_iowait);
4615 ret = schedule_timeout(timeout);
4616 atomic_dec(&rq->nr_iowait);
4617 delayacct_blkio_end();
4622 * sys_sched_get_priority_max - return maximum RT priority.
4623 * @policy: scheduling class.
4625 * this syscall returns the maximum rt_priority that can be used
4626 * by a given scheduling class.
4628 asmlinkage long sys_sched_get_priority_max(int policy)
4635 ret = MAX_USER_RT_PRIO-1;
4647 * sys_sched_get_priority_min - return minimum RT priority.
4648 * @policy: scheduling class.
4650 * this syscall returns the minimum rt_priority that can be used
4651 * by a given scheduling class.
4653 asmlinkage long sys_sched_get_priority_min(int policy)
4671 * sys_sched_rr_get_interval - return the default timeslice of a process.
4672 * @pid: pid of the process.
4673 * @interval: userspace pointer to the timeslice value.
4675 * this syscall writes the default timeslice value of a given process
4676 * into the user-space timespec buffer. A value of '0' means infinity.
4679 long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval)
4681 struct task_struct *p;
4682 int retval = -EINVAL;
4689 read_lock(&tasklist_lock);
4690 p = find_process_by_pid(pid);
4694 retval = security_task_getscheduler(p);
4698 jiffies_to_timespec(p->policy == SCHED_FIFO ?
4699 0 : static_prio_timeslice(p->static_prio), &t);
4700 read_unlock(&tasklist_lock);
4701 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
4705 read_unlock(&tasklist_lock);
4709 static const char stat_nam[] = "RSDTtZX";
4711 static void show_task(struct task_struct *p)
4713 unsigned long free = 0;
4716 state = p->state ? __ffs(p->state) + 1 : 0;
4717 printk("%-13.13s %c", p->comm,
4718 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
4719 #if (BITS_PER_LONG == 32)
4720 if (state == TASK_RUNNING)
4721 printk(" running ");
4723 printk(" %08lX ", thread_saved_pc(p));
4725 if (state == TASK_RUNNING)
4726 printk(" running task ");
4728 printk(" %016lx ", thread_saved_pc(p));
4730 #ifdef CONFIG_DEBUG_STACK_USAGE
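/* Estimate how much stack space is still unused. */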
4732 unsigned long *n = end_of_stack(p);
4735 free = (unsigned long)n - (unsigned long)end_of_stack(p);
4738 printk("%5lu %5d %6d", free, p->pid, p->parent->pid);
4740 printk(" (L-TLB)\n");
4742 printk(" (NOTLB)\n");
4744 if (state != TASK_RUNNING)
4745 show_stack(p, NULL);
4748 void show_state_filter(unsigned long state_filter)
4750 struct task_struct *g, *p;
4752 #if (BITS_PER_LONG == 32)
4755 printk(" task PC stack pid father child younger older\n");
4759 printk(" task PC stack pid father child younger older\n");
4761 read_lock(&tasklist_lock);
4762 do_each_thread(g, p) {
4764 * reset the NMI-timeout, listing all tasks on a slow
4765 * console might take a lot of time:
4767 touch_nmi_watchdog();
4768 if (!state_filter || (p->state & state_filter))
4770 } while_each_thread(g, p);
4772 touch_all_softlockup_watchdogs();
4774 #ifdef CONFIG_SCHED_DEBUG
4775 sysrq_sched_debug_show();
4777 read_unlock(&tasklist_lock);
4779 * Only show locks if all tasks are dumped:
4781 if (state_filter == -1)
4782 debug_show_all_locks();
4785 void __cpuinit init_idle_bootup_task(struct task_struct *idle)
4787 idle->sched_class = &idle_sched_class;
4791 * init_idle - set up an idle thread for a given CPU
4792 * @idle: task in question
4793 * @cpu: cpu the idle task belongs to
4795 * NOTE: this function does not set the idle thread's NEED_RESCHED
4796 * flag, to make booting more robust.
4798 void __cpuinit init_idle(struct task_struct *idle, int cpu)
4800 struct rq *rq = cpu_rq(cpu);
4801 unsigned long flags;
4804 idle->se.exec_start = sched_clock();
4806 idle->prio = idle->normal_prio = MAX_PRIO;
4807 idle->cpus_allowed = cpumask_of_cpu(cpu);
4808 __set_task_cpu(idle, cpu);
4810 spin_lock_irqsave(&rq->lock, flags);
4811 rq->curr = rq->idle = idle;
4812 #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
4815 spin_unlock_irqrestore(&rq->lock, flags);
4817 /* Set the preempt count _outside_ the spinlocks! */
4818 #if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_BKL)
4819 task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0);
4821 task_thread_info(idle)->preempt_count = 0;
4824 * The idle tasks have their own, simple scheduling class:
4826 idle->sched_class = &idle_sched_class;
4830 * In a system that switches off the HZ timer nohz_cpu_mask
4831 * indicates which cpus entered this state. This is used
4832 * in the rcu update to wait only for active cpus. For systems
4833 * which do not switch off the HZ timer nohz_cpu_mask should
4834 * always be CPU_MASK_NONE.
4836 cpumask_t nohz_cpu_mask = CPU_MASK_NONE;
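/*
 * Illustrative sketch only: an architecture's idle loop would typically mark
 * itself in nohz_cpu_mask before stopping the tick and clear the bit again
 * when the tick is restarted, e.g.:
 *
 *	cpu_set(smp_processor_id(), nohz_cpu_mask);
 *	... stop the HZ timer, wait for an interrupt ...
 *	cpu_clear(smp_processor_id(), nohz_cpu_mask);
 */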
4839 * Increase the granularity value when there are more CPUs,
4840 * because with more CPUs the 'effective latency' as visible
4841 * to users decreases. But the relationship is not linear,
4842 * so pick a second-best guess by going with the log2 of the number of CPUs.
4845 * This idea comes from the SD scheduler of Con Kolivas:
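/*
 * Worked example: with 4 CPUs online, factor = 1 + ilog2(4) = 3, so the
 * default granularity is tripled (and then clipped to the 10 ms limit in
 * the function below, if necessary).
 */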
4847 static inline void sched_init_granularity(void)
4849 unsigned int factor = 1 + ilog2(num_online_cpus());
4850 const unsigned long gran_limit = 10000000;
4852 sysctl_sched_granularity *= factor;
4853 if (sysctl_sched_granularity > gran_limit)
4854 sysctl_sched_granularity = gran_limit;
4856 sysctl_sched_runtime_limit = sysctl_sched_granularity * 4;
4857 sysctl_sched_wakeup_granularity = sysctl_sched_granularity / 2;
4862 * This is how migration works:
4864 * 1) we queue a struct migration_req structure in the source CPU's
4865 * runqueue and wake up that CPU's migration thread.
4866 * 2) we down() the locked semaphore => thread blocks.
4867 * 3) migration thread wakes up (implicitly it forces the migrated
4868 * thread off the CPU)
4869 * 4) it gets the migration request and checks whether the migrated
4870 * task is still in the wrong runqueue.
4871 * 5) if it's in the wrong runqueue then the migration thread removes
4872 * it and puts it into the right queue.
4873 * 6) migration thread up()s the semaphore.
4874 * 7) we wake up and the migration is done.
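/*
 * In code, a requester follows the pattern sketched below; this is only an
 * illustration of the steps above, mirroring set_cpus_allowed() further
 * down, not an additional helper:
 *
 *	struct migration_req req;
 *
 *	rq = task_rq_lock(p, &flags);
 *	if (migrate_task(p, dest_cpu, &req)) {
 *		task_rq_unlock(rq, &flags);
 *		wake_up_process(rq->migration_thread);
 *		wait_for_completion(&req.done);
 *	}
 */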
4878 * Change a given task's CPU affinity. Migrate the thread to a
4879 * proper CPU and schedule it away if the CPU it's executing on
4880 * is removed from the allowed bitmask.
4882 * NOTE: the caller must have a valid reference to the task, the
4883 * task must not exit() & deallocate itself prematurely. The
4884 * call is not atomic; no spinlocks may be held.
4886 int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask)
4888 struct migration_req req;
4889 unsigned long flags;
4893 rq = task_rq_lock(p, &flags);
4894 if (!cpus_intersects(new_mask, cpu_online_map)) {
4899 p->cpus_allowed = new_mask;
4900 /* Can the task run on the task's current CPU? If so, we're done */
4901 if (cpu_isset(task_cpu(p), new_mask))
4904 if (migrate_task(p, any_online_cpu(new_mask), &req)) {
4905 /* Need help from migration thread: drop lock and wait. */
4906 task_rq_unlock(rq, &flags);
4907 wake_up_process(rq->migration_thread);
4908 wait_for_completion(&req.done);
4909 tlb_migrate_finish(p->mm);
4913 task_rq_unlock(rq, &flags);
4917 EXPORT_SYMBOL_GPL(set_cpus_allowed);
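/*
 * Minimal usage sketch (illustrative only): pin the current task to CPU 0.
 * The caller must be able to sleep and must hold no spinlocks, as noted in
 * the comment above.
 *
 *	cpumask_t mask = cpumask_of_cpu(0);
 *
 *	if (set_cpus_allowed(current, mask) < 0)
 *		printk(KERN_WARNING "could not move task to CPU 0\n");
 */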
4920 * Move (not current) task off this cpu, onto dest cpu. We're doing
4921 * this because either it can't run here any more (set_cpus_allowed()
4922 * away from this CPU, or CPU going down), or because we're
4923 * attempting to rebalance this task on exec (sched_exec).
4925 * So we race with normal scheduler movements, but that's OK, as long
4926 * as the task is no longer on this CPU.
4928 * Returns non-zero if task was successfully migrated.
4930 static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
4932 struct rq *rq_dest, *rq_src;
4935 if (unlikely(cpu_is_offline(dest_cpu)))
4938 rq_src = cpu_rq(src_cpu);
4939 rq_dest = cpu_rq(dest_cpu);
4941 double_rq_lock(rq_src, rq_dest);
4942 /* Already moved. */
4943 if (task_cpu(p) != src_cpu)
4945 /* Affinity changed (again). */
4946 if (!cpu_isset(dest_cpu, p->cpus_allowed))
4949 on_rq = p->se.on_rq;
4951 deactivate_task(rq_src, p, 0);
4952 set_task_cpu(p, dest_cpu);
4954 activate_task(rq_dest, p, 0);
4955 check_preempt_curr(rq_dest, p);
4959 double_rq_unlock(rq_src, rq_dest);
4964 * migration_thread - this is a highprio system thread that performs
4965 * thread migration by bumping thread off CPU then 'pushing' onto another CPU.
4968 static int migration_thread(void *data)
4970 int cpu = (long)data;
4974 BUG_ON(rq->migration_thread != current);
4976 set_current_state(TASK_INTERRUPTIBLE);
4977 while (!kthread_should_stop()) {
4978 struct migration_req *req;
4979 struct list_head *head;
4983 spin_lock_irq(&rq->lock);
4985 if (cpu_is_offline(cpu)) {
4986 spin_unlock_irq(&rq->lock);
4990 if (rq->active_balance) {
4991 active_load_balance(rq, cpu);
4992 rq->active_balance = 0;
4995 head = &rq->migration_queue;
4997 if (list_empty(head)) {
4998 spin_unlock_irq(&rq->lock);
5000 set_current_state(TASK_INTERRUPTIBLE);
5003 req = list_entry(head->next, struct migration_req, list);
5004 list_del_init(head->next);
5006 spin_unlock(&rq->lock);
5007 __migrate_task(req->task, cpu, req->dest_cpu);
5010 complete(&req->done);
5012 __set_current_state(TASK_RUNNING);
5016 /* Wait for kthread_stop */
5017 set_current_state(TASK_INTERRUPTIBLE);
5018 while (!kthread_should_stop()) {
5020 set_current_state(TASK_INTERRUPTIBLE);
5022 __set_current_state(TASK_RUNNING);
5026 #ifdef CONFIG_HOTPLUG_CPU
5028 * Figure out where task on dead CPU should go, use force if necessary.
5029 * NOTE: interrupts should be disabled by the caller
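/*
 * Descriptive note: the fallback order in the function below is a
 * still-online CPU on the dead CPU's node that is in p->cpus_allowed, then
 * any online CPU in p->cpus_allowed, and finally (after widening the
 * affinity mask with cpus_setall()) any online CPU at all.
 */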
5031 static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
5033 unsigned long flags;
5040 mask = node_to_cpumask(cpu_to_node(dead_cpu));
5041 cpus_and(mask, mask, p->cpus_allowed);
5042 dest_cpu = any_online_cpu(mask);
5044 /* On any allowed CPU? */
5045 if (dest_cpu == NR_CPUS)
5046 dest_cpu = any_online_cpu(p->cpus_allowed);
5048 /* No more Mr. Nice Guy. */
5049 if (dest_cpu == NR_CPUS) {
5050 rq = task_rq_lock(p, &flags);
5051 cpus_setall(p->cpus_allowed);
5052 dest_cpu = any_online_cpu(p->cpus_allowed);
5053 task_rq_unlock(rq, &flags);
5056 * Don't tell them about moving exiting tasks or
5057 * kernel threads (both mm NULL), since they never leave the kernel.
5060 if (p->mm && printk_ratelimit())
5061 printk(KERN_INFO "process %d (%s) no "
5062 "longer affine to cpu%d\n",
5063 p->pid, p->comm, dead_cpu);
5065 if (!__migrate_task(p, dead_cpu, dest_cpu))
5070 * While a dead CPU has no uninterruptible tasks queued at this point,
5071 * it might still have a nonzero ->nr_uninterruptible counter, because
5072 * for performance reasons the counter is not strictly tracking tasks to
5073 * their home CPUs. So we just add the counter to another CPU's counter,
5074 * to keep the global sum constant after CPU-down:
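/*
 * For example: if the dying CPU's runqueue still shows
 * ->nr_uninterruptible == 3, those 3 are added to an arbitrary online CPU's
 * counter and the source counter is zeroed, so the system-wide sum is
 * unchanged.
 */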
5076 static void migrate_nr_uninterruptible(struct rq *rq_src)
5078 struct rq *rq_dest = cpu_rq(any_online_cpu(CPU_MASK_ALL));
5079 unsigned long flags;
5081 local_irq_save(flags);
5082 double_rq_lock(rq_src, rq_dest);
5083 rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
5084 rq_src->nr_uninterruptible = 0;
5085 double_rq_unlock(rq_src, rq_dest);
5086 local_irq_restore(flags);
5089 /* Run through task list and migrate tasks from the dead cpu. */
5090 static void migrate_live_tasks(int src_cpu)
5092 struct task_struct *p, *t;
5094 write_lock_irq(&tasklist_lock);
5096 do_each_thread(t, p) {
5100 if (task_cpu(p) == src_cpu)
5101 move_task_off_dead_cpu(src_cpu, p);
5102 } while_each_thread(t, p);
5104 write_unlock_irq(&tasklist_lock);
5108 * Schedules idle task to be the next runnable task on current CPU.
5109 * It does so by boosting its priority to highest possible and adding it to
5110 * the _front_ of the runqueue. Used by CPU offline code.
5112 void sched_idle_next(void)
5114 int this_cpu = smp_processor_id();
5115 struct rq *rq = cpu_rq(this_cpu);
5116 struct task_struct *p = rq->idle;
5117 unsigned long flags;
5119 /* cpu has to be offline */
5120 BUG_ON(cpu_online(this_cpu));
5123 * Strictly not necessary since rest of the CPUs are stopped by now
5124 * and interrupts disabled on the current cpu.
5126 spin_lock_irqsave(&rq->lock, flags);
5128 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
5130 /* Add idle task to the _front_ of its priority queue: */
5131 activate_idle_task(p, rq);
5133 spin_unlock_irqrestore(&rq->lock, flags);
5137 * Ensures that the idle task is using init_mm right before its cpu goes offline.
5140 void idle_task_exit(void)
5142 struct mm_struct *mm = current->active_mm;
5144 BUG_ON(cpu_online(smp_processor_id()));
5147 switch_mm(mm, &init_mm, current);
5151 /* called under rq->lock with disabled interrupts */
5152 static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
5154 struct rq *rq = cpu_rq(dead_cpu);
5156 /* Must be exiting, otherwise would be on tasklist. */
5157 BUG_ON(p->exit_state != EXIT_ZOMBIE && p->exit_state != EXIT_DEAD);
5159 /* Cannot have done final schedule yet: would have vanished. */
5160 BUG_ON(p->state == TASK_DEAD);
5165 * Drop lock around migration; if someone else moves it,
5166 * that's OK. No task can be added to this CPU, so iteration is fine.
5168 * NOTE: interrupts should be left disabled --dev@
5170 spin_unlock(&rq->lock);
5171 move_task_off_dead_cpu(dead_cpu, p);
5172 spin_lock(&rq->lock);
5177 /* release_task() removes task from tasklist, so we won't find dead tasks. */
5178 static void migrate_dead_tasks(unsigned int dead_cpu)
5180 struct rq *rq = cpu_rq(dead_cpu);
5181 struct task_struct *next;
5184 if (!rq->nr_running)
5186 next = pick_next_task(rq, rq->curr, rq_clock(rq));
5189 migrate_dead(dead_cpu, next);
5192 #endif /* CONFIG_HOTPLUG_CPU */
5195 * migration_call - callback that gets triggered when a CPU is added.
5196 * Here we can start up the necessary migration thread for the new CPU.
5198 static int __cpuinit
5199 migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5201 struct task_struct *p;
5202 int cpu = (long)hcpu;
5203 unsigned long flags;
5207 case CPU_LOCK_ACQUIRE:
5208 mutex_lock(&sched_hotcpu_mutex);
5211 case CPU_UP_PREPARE:
5212 case CPU_UP_PREPARE_FROZEN:
5213 p = kthread_create(migration_thread, hcpu, "migration/%d", cpu);
5216 p->flags |= PF_NOFREEZE;
5217 kthread_bind(p, cpu);
5218 /* Must be high prio: stop_machine expects to yield to it. */
5219 rq = task_rq_lock(p, &flags);
5220 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
5221 task_rq_unlock(rq, &flags);
5222 cpu_rq(cpu)->migration_thread = p;
5226 case CPU_ONLINE_FROZEN:
5227 /* Strictly unnecessary, as first user will wake it. */
5228 wake_up_process(cpu_rq(cpu)->migration_thread);
5231 #ifdef CONFIG_HOTPLUG_CPU
5232 case CPU_UP_CANCELED:
5233 case CPU_UP_CANCELED_FROZEN:
5234 if (!cpu_rq(cpu)->migration_thread)
5236 /* Unbind it from offline cpu so it can run. Fall thru. */
5237 kthread_bind(cpu_rq(cpu)->migration_thread,
5238 any_online_cpu(cpu_online_map));
5239 kthread_stop(cpu_rq(cpu)->migration_thread);
5240 cpu_rq(cpu)->migration_thread = NULL;
5244 case CPU_DEAD_FROZEN:
5245 migrate_live_tasks(cpu);
5247 kthread_stop(rq->migration_thread);
5248 rq->migration_thread = NULL;
5249 /* Idle task back to normal (off runqueue, low prio) */
5250 rq = task_rq_lock(rq->idle, &flags);
5251 deactivate_task(rq, rq->idle, 0);
5252 rq->idle->static_prio = MAX_PRIO;
5253 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
5254 rq->idle->sched_class = &idle_sched_class;
5255 migrate_dead_tasks(cpu);
5256 task_rq_unlock(rq, &flags);
5257 migrate_nr_uninterruptible(rq);
5258 BUG_ON(rq->nr_running != 0);
5260 /* No need to migrate the tasks: it was best-effort if
5261 * they didn't take sched_hotcpu_mutex. Just wake up
5262 * the requestors. */
5263 spin_lock_irq(&rq->lock);
5264 while (!list_empty(&rq->migration_queue)) {
5265 struct migration_req *req;
5267 req = list_entry(rq->migration_queue.next,
5268 struct migration_req, list);
5269 list_del_init(&req->list);
5270 complete(&req->done);
5272 spin_unlock_irq(&rq->lock);
5275 case CPU_LOCK_RELEASE:
5276 mutex_unlock(&sched_hotcpu_mutex);
5282 /* Register at highest priority so that task migration (migrate_all_tasks)
5283 * happens before everything else.
5285 static struct notifier_block __cpuinitdata migration_notifier = {
5286 .notifier_call = migration_call,
5290 int __init migration_init(void)
5292 void *cpu = (void *)(long)smp_processor_id();
5295 /* Start one for the boot CPU: */
5296 err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
5297 BUG_ON(err == NOTIFY_BAD);
5298 migration_call(&migration_notifier, CPU_ONLINE, cpu);
5299 register_cpu_notifier(&migration_notifier);
5307 /* Number of possible processor ids */
5308 int nr_cpu_ids __read_mostly = NR_CPUS;
5309 EXPORT_SYMBOL(nr_cpu_ids);
5311 #undef SCHED_DOMAIN_DEBUG
5312 #ifdef SCHED_DOMAIN_DEBUG
5313 static void sched_domain_debug(struct sched_domain *sd, int cpu)
5318 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
5322 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
5327 struct sched_group *group = sd->groups;
5328 cpumask_t groupmask;
5330 cpumask_scnprintf(str, NR_CPUS, sd->span);
5331 cpus_clear(groupmask);
5334 for (i = 0; i < level + 1; i++)
5336 printk("domain %d: ", level);
5338 if (!(sd->flags & SD_LOAD_BALANCE)) {
5339 printk("does not load-balance\n");
5341 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
5346 printk("span %s\n", str);
5348 if (!cpu_isset(cpu, sd->span))
5349 printk(KERN_ERR "ERROR: domain->span does not contain "
5351 if (!cpu_isset(cpu, group->cpumask))
5352 printk(KERN_ERR "ERROR: domain->groups does not contain"
5356 for (i = 0; i < level + 2; i++)
5362 printk(KERN_ERR "ERROR: group is NULL\n");
5366 if (!group->__cpu_power) {
5368 printk(KERN_ERR "ERROR: domain->cpu_power not "
5372 if (!cpus_weight(group->cpumask)) {
5374 printk(KERN_ERR "ERROR: empty group\n");
5377 if (cpus_intersects(groupmask, group->cpumask)) {
5379 printk(KERN_ERR "ERROR: repeated CPUs\n");
5382 cpus_or(groupmask, groupmask, group->cpumask);
5384 cpumask_scnprintf(str, NR_CPUS, group->cpumask);
5387 group = group->next;
5388 } while (group != sd->groups);
5391 if (!cpus_equal(sd->span, groupmask))
5392 printk(KERN_ERR "ERROR: groups don't span "
5400 if (!cpus_subset(groupmask, sd->span))
5401 printk(KERN_ERR "ERROR: parent span is not a superset "
5402 "of domain->span\n");
5407 # define sched_domain_debug(sd, cpu) do { } while (0)
5410 static int sd_degenerate(struct sched_domain *sd)
5412 if (cpus_weight(sd->span) == 1)
5415 /* Following flags need at least 2 groups */
5416 if (sd->flags & (SD_LOAD_BALANCE |
5417 SD_BALANCE_NEWIDLE |
5421 SD_SHARE_PKG_RESOURCES)) {
5422 if (sd->groups != sd->groups->next)
5426 /* Following flags don't use groups */
5427 if (sd->flags & (SD_WAKE_IDLE |
5436 sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
5438 unsigned long cflags = sd->flags, pflags = parent->flags;
5440 if (sd_degenerate(parent))
5443 if (!cpus_equal(sd->span, parent->span))
5446 /* Does parent contain flags not in child? */
5447 /* WAKE_BALANCE is a subset of WAKE_AFFINE */
5448 if (cflags & SD_WAKE_AFFINE)
5449 pflags &= ~SD_WAKE_BALANCE;
5450 /* Flags needing groups don't count if only 1 group in parent */
5451 if (parent->groups == parent->groups->next) {
5452 pflags &= ~(SD_LOAD_BALANCE |
5453 SD_BALANCE_NEWIDLE |
5457 SD_SHARE_PKG_RESOURCES);
5459 if (~cflags & pflags)
5466 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
5467 * hold the hotplug lock.
5469 static void cpu_attach_domain(struct sched_domain *sd, int cpu)
5471 struct rq *rq = cpu_rq(cpu);
5472 struct sched_domain *tmp;
5474 /* Remove the sched domains which do not contribute to scheduling. */
5475 for (tmp = sd; tmp; tmp = tmp->parent) {
5476 struct sched_domain *parent = tmp->parent;
5479 if (sd_parent_degenerate(tmp, parent)) {
5480 tmp->parent = parent->parent;
5482 parent->parent->child = tmp;
5486 if (sd && sd_degenerate(sd)) {
5492 sched_domain_debug(sd, cpu);
5494 rcu_assign_pointer(rq->sd, sd);
5497 /* cpus with isolated domains */
5498 static cpumask_t cpu_isolated_map = CPU_MASK_NONE;
5500 /* Setup the mask of cpus configured for isolated domains */
5501 static int __init isolated_cpu_setup(char *str)
5503 int ints[NR_CPUS], i;
5505 str = get_options(str, ARRAY_SIZE(ints), ints);
5506 cpus_clear(cpu_isolated_map);
5507 for (i = 1; i <= ints[0]; i++)
5508 if (ints[i] < NR_CPUS)
5509 cpu_set(ints[i], cpu_isolated_map);
5513 __setup ("isolcpus=", isolated_cpu_setup);
5516 * init_sched_build_groups takes the cpumask we wish to span, and a pointer
5517 * to a function which identifies what group (along with the sched group) a CPU
5518 * belongs to. The return value of group_fn must be >= 0 and < NR_CPUS
5519 * (due to the fact that we keep track of groups covered with a cpumask_t).
5521 * init_sched_build_groups will build a circular linked list of the groups
5522 * covered by the given span, and will set each group's ->cpumask correctly,
5523 * and ->cpu_power to 0.
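/*
 * For instance, the physical-package pass further below calls this as
 *
 *	init_sched_build_groups(nodemask, cpu_map, &cpu_to_phys_group);
 *
 * where cpu_to_phys_group() maps a CPU to its package-level group index
 * and (when requested) returns the matching sched_group via **sg.
 */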
5526 init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map,
5527 int (*group_fn)(int cpu, const cpumask_t *cpu_map,
5528 struct sched_group **sg))
5530 struct sched_group *first = NULL, *last = NULL;
5531 cpumask_t covered = CPU_MASK_NONE;
5534 for_each_cpu_mask(i, span) {
5535 struct sched_group *sg;
5536 int group = group_fn(i, cpu_map, &sg);
5539 if (cpu_isset(i, covered))
5542 sg->cpumask = CPU_MASK_NONE;
5543 sg->__cpu_power = 0;
5545 for_each_cpu_mask(j, span) {
5546 if (group_fn(j, cpu_map, NULL) != group)
5549 cpu_set(j, covered);
5550 cpu_set(j, sg->cpumask);
5561 #define SD_NODES_PER_DOMAIN 16
5566 * find_next_best_node - find the next node to include in a sched_domain
5567 * @node: node whose sched_domain we're building
5568 * @used_nodes: nodes already in the sched_domain
5570 * Find the next node to include in a given scheduling domain. Simply
5571 * finds the closest node not already in the @used_nodes map.
5573 * Should use nodemask_t.
5575 static int find_next_best_node(int node, unsigned long *used_nodes)
5577 int i, n, val, min_val, best_node = 0;
5581 for (i = 0; i < MAX_NUMNODES; i++) {
5582 /* Start at @node */
5583 n = (node + i) % MAX_NUMNODES;
5585 if (!nr_cpus_node(n))
5588 /* Skip already used nodes */
5589 if (test_bit(n, used_nodes))
5592 /* Simple min distance search */
5593 val = node_distance(node, n);
5595 if (val < min_val) {
5601 set_bit(best_node, used_nodes);
5606 * sched_domain_node_span - get a cpumask for a node's sched_domain
5607 * @node: node whose cpumask we're constructing
5608 * @size: number of nodes to include in this span
5610 * Given a node, construct a good cpumask for its sched_domain to span. It
5611 * should be one that prevents unnecessary balancing, but also spreads tasks
5614 static cpumask_t sched_domain_node_span(int node)
5616 DECLARE_BITMAP(used_nodes, MAX_NUMNODES);
5617 cpumask_t span, nodemask;
5621 bitmap_zero(used_nodes, MAX_NUMNODES);
5623 nodemask = node_to_cpumask(node);
5624 cpus_or(span, span, nodemask);
5625 set_bit(node, used_nodes);
5627 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
5628 int next_node = find_next_best_node(node, used_nodes);
5630 nodemask = node_to_cpumask(next_node);
5631 cpus_or(span, span, nodemask);
5638 int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
5641 * SMT sched-domains:
5643 #ifdef CONFIG_SCHED_SMT
5644 static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
5645 static DEFINE_PER_CPU(struct sched_group, sched_group_cpus);
5647 static int cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map,
5648 struct sched_group **sg)
5651 *sg = &per_cpu(sched_group_cpus, cpu);
5657 * multi-core sched-domains:
5659 #ifdef CONFIG_SCHED_MC
5660 static DEFINE_PER_CPU(struct sched_domain, core_domains);
5661 static DEFINE_PER_CPU(struct sched_group, sched_group_core);
5664 #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
5665 static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map,
5666 struct sched_group **sg)
5669 cpumask_t mask = cpu_sibling_map[cpu];
5670 cpus_and(mask, mask, *cpu_map);
5671 group = first_cpu(mask);
5673 *sg = &per_cpu(sched_group_core, group);
5676 #elif defined(CONFIG_SCHED_MC)
5677 static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map,
5678 struct sched_group **sg)
5681 *sg = &per_cpu(sched_group_core, cpu);
5686 static DEFINE_PER_CPU(struct sched_domain, phys_domains);
5687 static DEFINE_PER_CPU(struct sched_group, sched_group_phys);
5689 static int cpu_to_phys_group(int cpu, const cpumask_t *cpu_map,
5690 struct sched_group **sg)
5693 #ifdef CONFIG_SCHED_MC
5694 cpumask_t mask = cpu_coregroup_map(cpu);
5695 cpus_and(mask, mask, *cpu_map);
5696 group = first_cpu(mask);
5697 #elif defined(CONFIG_SCHED_SMT)
5698 cpumask_t mask = cpu_sibling_map[cpu];
5699 cpus_and(mask, mask, *cpu_map);
5700 group = first_cpu(mask);
5705 *sg = &per_cpu(sched_group_phys, group);
5711 * The init_sched_build_groups can't handle what we want to do with node
5712 * groups, so roll our own. Now each node has its own list of groups which
5713 * gets dynamically allocated.
5715 static DEFINE_PER_CPU(struct sched_domain, node_domains);
5716 static struct sched_group **sched_group_nodes_bycpu[NR_CPUS];
5718 static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
5719 static DEFINE_PER_CPU(struct sched_group, sched_group_allnodes);
5721 static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map,
5722 struct sched_group **sg)
5724 cpumask_t nodemask = node_to_cpumask(cpu_to_node(cpu));
5727 cpus_and(nodemask, nodemask, *cpu_map);
5728 group = first_cpu(nodemask);
5731 *sg = &per_cpu(sched_group_allnodes, group);
5735 static void init_numa_sched_groups_power(struct sched_group *group_head)
5737 struct sched_group *sg = group_head;
5743 for_each_cpu_mask(j, sg->cpumask) {
5744 struct sched_domain *sd;
5746 sd = &per_cpu(phys_domains, j);
5747 if (j != first_cpu(sd->groups->cpumask)) {
5749 * Only add "power" once for each
5755 sg_inc_cpu_power(sg, sd->groups->__cpu_power);
5758 if (sg != group_head)
5764 /* Free memory allocated for various sched_group structures */
5765 static void free_sched_groups(const cpumask_t *cpu_map)
5769 for_each_cpu_mask(cpu, *cpu_map) {
5770 struct sched_group **sched_group_nodes
5771 = sched_group_nodes_bycpu[cpu];
5773 if (!sched_group_nodes)
5776 for (i = 0; i < MAX_NUMNODES; i++) {
5777 cpumask_t nodemask = node_to_cpumask(i);
5778 struct sched_group *oldsg, *sg = sched_group_nodes[i];
5780 cpus_and(nodemask, nodemask, *cpu_map);
5781 if (cpus_empty(nodemask))
5791 if (oldsg != sched_group_nodes[i])
5794 kfree(sched_group_nodes);
5795 sched_group_nodes_bycpu[cpu] = NULL;
5799 static void free_sched_groups(const cpumask_t *cpu_map)
5805 * Initialize sched groups cpu_power.
5807 * cpu_power indicates the capacity of a sched group, which is used while
5808 * distributing the load between different sched groups in a sched domain.
5809 * Typically cpu_power for all the groups in a sched domain will be the same unless
5810 * there are asymmetries in the topology. If there are asymmetries, group
5811 * having more cpu_power will pick up more load compared to the group having less cpu_power.
5814 * cpu_power will be a multiple of SCHED_LOAD_SCALE. This multiple represents
5815 * the maximum number of tasks a group can handle in the presence of other idle
5816 * or lightly loaded groups in the same sched domain.
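/*
 * For example: if a domain's child groups do not share CPU power or package
 * resources, the loop in the function below simply sums their __cpu_power,
 * so a group covering two such children ends up at 2 * SCHED_LOAD_SCALE.
 * If the child groups do share resources (SMT siblings, cores sharing a
 * cache) and no powersavings policy is in effect, the group power is set to
 * a single SCHED_LOAD_SCALE instead, sizing the group for one task.
 */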
5818 static void init_sched_groups_power(int cpu, struct sched_domain *sd)
5820 struct sched_domain *child;
5821 struct sched_group *group;
5823 WARN_ON(!sd || !sd->groups);
5825 if (cpu != first_cpu(sd->groups->cpumask))
5830 sd->groups->__cpu_power = 0;
5833 * For perf policy, if the groups in child domain share resources
5834 * (for example cores sharing some portions of the cache hierarchy
5835 * or SMT), then set this domain groups cpu_power such that each group
5836 * can handle only one task, when there are other idle groups in the
5837 * same sched domain.
5839 if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) &&
5841 (SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) {
5842 sg_inc_cpu_power(sd->groups, SCHED_LOAD_SCALE);
5847 * add the cpu_power of each child group to this group's cpu_power
5849 group = child->groups;
5851 sg_inc_cpu_power(sd->groups, group->__cpu_power);
5852 group = group->next;
5853 } while (group != child->groups);
5857 * Build sched domains for a given set of cpus and attach the sched domains
5858 * to the individual cpus
5860 static int build_sched_domains(const cpumask_t *cpu_map)
5864 struct sched_group **sched_group_nodes = NULL;
5865 int sd_allnodes = 0;
5868 * Allocate the per-node list of sched groups
5870 sched_group_nodes = kzalloc(sizeof(struct sched_group *)*MAX_NUMNODES,
5872 if (!sched_group_nodes) {
5873 printk(KERN_WARNING "Can not alloc sched group node list\n");
5876 sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes;
5880 * Set up domains for cpus specified by the cpu_map.
5882 for_each_cpu_mask(i, *cpu_map) {
5883 struct sched_domain *sd = NULL, *p;
5884 cpumask_t nodemask = node_to_cpumask(cpu_to_node(i));
5886 cpus_and(nodemask, nodemask, *cpu_map);
5889 if (cpus_weight(*cpu_map) >
5890 SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) {
5891 sd = &per_cpu(allnodes_domains, i);
5892 *sd = SD_ALLNODES_INIT;
5893 sd->span = *cpu_map;
5894 cpu_to_allnodes_group(i, cpu_map, &sd->groups);
5900 sd = &per_cpu(node_domains, i);
5902 sd->span = sched_domain_node_span(cpu_to_node(i));
5906 cpus_and(sd->span, sd->span, *cpu_map);
5910 sd = &per_cpu(phys_domains, i);
5912 sd->span = nodemask;
5916 cpu_to_phys_group(i, cpu_map, &sd->groups);
5918 #ifdef CONFIG_SCHED_MC
5920 sd = &per_cpu(core_domains, i);
5922 sd->span = cpu_coregroup_map(i);
5923 cpus_and(sd->span, sd->span, *cpu_map);
5926 cpu_to_core_group(i, cpu_map, &sd->groups);
5929 #ifdef CONFIG_SCHED_SMT
5931 sd = &per_cpu(cpu_domains, i);
5932 *sd = SD_SIBLING_INIT;
5933 sd->span = cpu_sibling_map[i];
5934 cpus_and(sd->span, sd->span, *cpu_map);
5937 cpu_to_cpu_group(i, cpu_map, &sd->groups);
5941 #ifdef CONFIG_SCHED_SMT
5942 /* Set up CPU (sibling) groups */
5943 for_each_cpu_mask(i, *cpu_map) {
5944 cpumask_t this_sibling_map = cpu_sibling_map[i];
5945 cpus_and(this_sibling_map, this_sibling_map, *cpu_map);
5946 if (i != first_cpu(this_sibling_map))
5949 init_sched_build_groups(this_sibling_map, cpu_map,
5954 #ifdef CONFIG_SCHED_MC
5955 /* Set up multi-core groups */
5956 for_each_cpu_mask(i, *cpu_map) {
5957 cpumask_t this_core_map = cpu_coregroup_map(i);
5958 cpus_and(this_core_map, this_core_map, *cpu_map);
5959 if (i != first_cpu(this_core_map))
5961 init_sched_build_groups(this_core_map, cpu_map,
5962 &cpu_to_core_group);
5966 /* Set up physical groups */
5967 for (i = 0; i < MAX_NUMNODES; i++) {
5968 cpumask_t nodemask = node_to_cpumask(i);
5970 cpus_and(nodemask, nodemask, *cpu_map);
5971 if (cpus_empty(nodemask))
5974 init_sched_build_groups(nodemask, cpu_map, &cpu_to_phys_group);
5978 /* Set up node groups */
5980 init_sched_build_groups(*cpu_map, cpu_map,
5981 &cpu_to_allnodes_group);
5983 for (i = 0; i < MAX_NUMNODES; i++) {
5984 /* Set up node groups */
5985 struct sched_group *sg, *prev;
5986 cpumask_t nodemask = node_to_cpumask(i);
5987 cpumask_t domainspan;
5988 cpumask_t covered = CPU_MASK_NONE;
5991 cpus_and(nodemask, nodemask, *cpu_map);
5992 if (cpus_empty(nodemask)) {
5993 sched_group_nodes[i] = NULL;
5997 domainspan = sched_domain_node_span(i);
5998 cpus_and(domainspan, domainspan, *cpu_map);
6000 sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i);
6002 printk(KERN_WARNING "Can not alloc domain group for "
6006 sched_group_nodes[i] = sg;
6007 for_each_cpu_mask(j, nodemask) {
6008 struct sched_domain *sd;
6009 sd = &per_cpu(node_domains, j);
6012 sg->__cpu_power = 0;
6013 sg->cpumask = nodemask;
6015 cpus_or(covered, covered, nodemask);
6018 for (j = 0; j < MAX_NUMNODES; j++) {
6019 cpumask_t tmp, notcovered;
6020 int n = (i + j) % MAX_NUMNODES;
6022 cpus_complement(notcovered, covered);
6023 cpus_and(tmp, notcovered, *cpu_map);
6024 cpus_and(tmp, tmp, domainspan);
6025 if (cpus_empty(tmp))
6028 nodemask = node_to_cpumask(n);
6029 cpus_and(tmp, tmp, nodemask);
6030 if (cpus_empty(tmp))
6033 sg = kmalloc_node(sizeof(struct sched_group),
6037 "Can not alloc domain group for node %d\n", j);
6040 sg->__cpu_power = 0;
6042 sg->next = prev->next;
6043 cpus_or(covered, covered, tmp);
6050 /* Calculate CPU power for physical packages and nodes */
6051 #ifdef CONFIG_SCHED_SMT
6052 for_each_cpu_mask(i, *cpu_map) {
6053 struct sched_domain *sd = &per_cpu(cpu_domains, i);
6055 init_sched_groups_power(i, sd);
6058 #ifdef CONFIG_SCHED_MC
6059 for_each_cpu_mask(i, *cpu_map) {
6060 struct sched_domain *sd = &per_cpu(core_domains, i);
6062 init_sched_groups_power(i, sd);
6066 for_each_cpu_mask(i, *cpu_map) {
6067 struct sched_domain *sd = &per_cpu(phys_domains, i);
6069 init_sched_groups_power(i, sd);
6073 for (i = 0; i < MAX_NUMNODES; i++)
6074 init_numa_sched_groups_power(sched_group_nodes[i]);
6077 struct sched_group *sg;
6079 cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg);
6080 init_numa_sched_groups_power(sg);
6084 /* Attach the domains */
6085 for_each_cpu_mask(i, *cpu_map) {
6086 struct sched_domain *sd;
6087 #ifdef CONFIG_SCHED_SMT
6088 sd = &per_cpu(cpu_domains, i);
6089 #elif defined(CONFIG_SCHED_MC)
6090 sd = &per_cpu(core_domains, i);
6092 sd = &per_cpu(phys_domains, i);
6094 cpu_attach_domain(sd, i);
6101 free_sched_groups(cpu_map);
6106 * Set up scheduler domains and groups. Callers must hold the hotplug lock.
6108 static int arch_init_sched_domains(const cpumask_t *cpu_map)
6110 cpumask_t cpu_default_map;
6114 * Setup mask for cpus without special case scheduling requirements.
6115 * For now this just excludes isolated cpus, but could be used to
6116 * exclude other special cases in the future.
6118 cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map);
6120 err = build_sched_domains(&cpu_default_map);
6125 static void arch_destroy_sched_domains(const cpumask_t *cpu_map)
6127 free_sched_groups(cpu_map);
6131 * Detach sched domains from a group of cpus specified in cpu_map
6132 * These cpus will now be attached to the NULL domain
6134 static void detach_destroy_domains(const cpumask_t *cpu_map)
6138 for_each_cpu_mask(i, *cpu_map)
6139 cpu_attach_domain(NULL, i);
6140 synchronize_sched();
6141 arch_destroy_sched_domains(cpu_map);
6145 * Partition sched domains as specified by the cpumasks below.
6146 * This attaches all cpus from the cpumasks to the NULL domain,
6147 * waits for an RCU quiescent period, recalculates sched
6148 * domain information and then attaches them back to the
6149 * correct sched domains.
6150 * Call with the hotplug lock held.
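/*
 * Illustrative call sequence (the two masks here are placeholders, and the
 * hotplug lock must already be held as noted above):
 *
 *	cpumask_t set_a, set_b;
 *
 *	... fill set_a and set_b ...
 *	partition_sched_domains(&set_a, &set_b);
 *
 * Each non-empty partition then gets its own independent sched-domain tree.
 */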
6152 int partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2)
6154 cpumask_t change_map;
6157 cpus_and(*partition1, *partition1, cpu_online_map);
6158 cpus_and(*partition2, *partition2, cpu_online_map);
6159 cpus_or(change_map, *partition1, *partition2);
6161 /* Detach sched domains from all of the affected cpus */
6162 detach_destroy_domains(&change_map);
6163 if (!cpus_empty(*partition1))
6164 err = build_sched_domains(partition1);
6165 if (!err && !cpus_empty(*partition2))
6166 err = build_sched_domains(partition2);
6171 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
6172 int arch_reinit_sched_domains(void)
6176 mutex_lock(&sched_hotcpu_mutex);
6177 detach_destroy_domains(&cpu_online_map);
6178 err = arch_init_sched_domains(&cpu_online_map);
6179 mutex_unlock(&sched_hotcpu_mutex);
6184 static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
6188 if (buf[0] != '0' && buf[0] != '1')
6192 sched_smt_power_savings = (buf[0] == '1');
6194 sched_mc_power_savings = (buf[0] == '1');
6196 ret = arch_reinit_sched_domains();
6198 return ret ? ret : count;
6201 int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
6205 #ifdef CONFIG_SCHED_SMT
6207 err = sysfs_create_file(&cls->kset.kobj,
6208 &attr_sched_smt_power_savings.attr);
6210 #ifdef CONFIG_SCHED_MC
6211 if (!err && mc_capable())
6212 err = sysfs_create_file(&cls->kset.kobj,
6213 &attr_sched_mc_power_savings.attr);
6219 #ifdef CONFIG_SCHED_MC
6220 static ssize_t sched_mc_power_savings_show(struct sys_device *dev, char *page)
6222 return sprintf(page, "%u\n", sched_mc_power_savings);
6224 static ssize_t sched_mc_power_savings_store(struct sys_device *dev,
6225 const char *buf, size_t count)
6227 return sched_power_savings_store(buf, count, 0);
6229 SYSDEV_ATTR(sched_mc_power_savings, 0644, sched_mc_power_savings_show,
6230 sched_mc_power_savings_store);
6233 #ifdef CONFIG_SCHED_SMT
6234 static ssize_t sched_smt_power_savings_show(struct sys_device *dev, char *page)
6236 return sprintf(page, "%u\n", sched_smt_power_savings);
6238 static ssize_t sched_smt_power_savings_store(struct sys_device *dev,
6239 const char *buf, size_t count)
6241 return sched_power_savings_store(buf, count, 1);
6243 SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show,
6244 sched_smt_power_savings_store);
6248 * Force a reinitialization of the sched domains hierarchy. The domains
6249 * and groups cannot be updated in place without racing with the balancing
6250 * code, so we temporarily attach all running cpus to the NULL domain
6251 * which will prevent rebalancing while the sched domains are recalculated.
6253 static int update_sched_domains(struct notifier_block *nfb,
6254 unsigned long action, void *hcpu)
6257 case CPU_UP_PREPARE:
6258 case CPU_UP_PREPARE_FROZEN:
6259 case CPU_DOWN_PREPARE:
6260 case CPU_DOWN_PREPARE_FROZEN:
6261 detach_destroy_domains(&cpu_online_map);
6264 case CPU_UP_CANCELED:
6265 case CPU_UP_CANCELED_FROZEN:
6266 case CPU_DOWN_FAILED:
6267 case CPU_DOWN_FAILED_FROZEN:
6269 case CPU_ONLINE_FROZEN:
6271 case CPU_DEAD_FROZEN:
6273 * Fall through and re-initialise the domains.
6280 /* The hotplug lock is already held by cpu_up/cpu_down */
6281 arch_init_sched_domains(&cpu_online_map);
6286 void __init sched_init_smp(void)
6288 cpumask_t non_isolated_cpus;
6290 mutex_lock(&sched_hotcpu_mutex);
6291 arch_init_sched_domains(&cpu_online_map);
6292 cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map);
6293 if (cpus_empty(non_isolated_cpus))
6294 cpu_set(smp_processor_id(), non_isolated_cpus);
6295 mutex_unlock(&sched_hotcpu_mutex);
6296 /* XXX: Theoretical race here - CPU may be hotplugged now */
6297 hotcpu_notifier(update_sched_domains, 0);
6299 /* Move init over to a non-isolated CPU */
6300 if (set_cpus_allowed(current, non_isolated_cpus) < 0)
6302 sched_init_granularity();
6305 void __init sched_init_smp(void)
6307 sched_init_granularity();
6309 #endif /* CONFIG_SMP */
6311 int in_sched_functions(unsigned long addr)
6313 /* Linker adds these: start and end of __sched functions */
6314 extern char __sched_text_start[], __sched_text_end[];
6316 return in_lock_functions(addr) ||
6317 (addr >= (unsigned long)__sched_text_start
6318 && addr < (unsigned long)__sched_text_end);
6321 static inline void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
6323 cfs_rq->tasks_timeline = RB_ROOT;
6324 cfs_rq->fair_clock = 1;
6325 #ifdef CONFIG_FAIR_GROUP_SCHED
6330 void __init sched_init(void)
6332 u64 now = sched_clock();
6333 int highest_cpu = 0;
6337 * Link up the scheduling class hierarchy:
6339 rt_sched_class.next = &fair_sched_class;
6340 fair_sched_class.next = &idle_sched_class;
6341 idle_sched_class.next = NULL;
6343 for_each_possible_cpu(i) {
6344 struct rt_prio_array *array;
6348 spin_lock_init(&rq->lock);
6349 lockdep_set_class(&rq->lock, &rq->rq_lock_key);
6352 init_cfs_rq(&rq->cfs, rq);
6353 #ifdef CONFIG_FAIR_GROUP_SCHED
6354 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
6355 list_add(&rq->cfs.leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
6357 rq->ls.load_update_last = now;
6358 rq->ls.load_update_start = now;
6360 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
6361 rq->cpu_load[j] = 0;
6364 rq->active_balance = 0;
6365 rq->next_balance = jiffies;
6368 rq->migration_thread = NULL;
6369 INIT_LIST_HEAD(&rq->migration_queue);
6371 atomic_set(&rq->nr_iowait, 0);
6373 array = &rq->rt.active;
6374 for (j = 0; j < MAX_RT_PRIO; j++) {
6375 INIT_LIST_HEAD(array->queue + j);
6376 __clear_bit(j, array->bitmap);
6379 /* delimiter for bitsearch: */
6380 __set_bit(MAX_RT_PRIO, array->bitmap);
6383 set_load_weight(&init_task);
6386 nr_cpu_ids = highest_cpu + 1;
6387 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL);
6390 #ifdef CONFIG_RT_MUTEXES
6391 plist_head_init(&init_task.pi_waiters, &init_task.pi_lock);
6395 * The boot idle thread does lazy MMU switching as well:
6397 atomic_inc(&init_mm.mm_count);
6398 enter_lazy_tlb(&init_mm, current);
6401 * Make us the idle thread. Technically, schedule() should not be
6402 * called from this thread, however somewhere below it might be,
6403 * but because we are the idle thread, we just pick up running again
6404 * when this runqueue becomes "idle".
6406 init_idle(current, smp_processor_id());
6408 * During early bootup we pretend to be a normal task:
6410 current->sched_class = &fair_sched_class;
6413 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
6414 void __might_sleep(char *file, int line)
6417 static unsigned long prev_jiffy; /* ratelimiting */
6419 if ((in_atomic() || irqs_disabled()) &&
6420 system_state == SYSTEM_RUNNING && !oops_in_progress) {
6421 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
6423 prev_jiffy = jiffies;
6424 printk(KERN_ERR "BUG: sleeping function called from invalid"
6425 " context at %s:%d\n", file, line);
6426 printk("in_atomic():%d, irqs_disabled():%d\n",
6427 in_atomic(), irqs_disabled());
6428 debug_show_held_locks(current);
6429 if (irqs_disabled())
6430 print_irqtrace_events(current);
6435 EXPORT_SYMBOL(__might_sleep);
6438 #ifdef CONFIG_MAGIC_SYSRQ
6439 void normalize_rt_tasks(void)
6441 struct task_struct *g, *p;
6442 unsigned long flags;
6446 read_lock_irq(&tasklist_lock);
6447 do_each_thread(g, p) {
6449 p->se.wait_runtime = 0;
6450 p->se.wait_start_fair = 0;
6451 p->se.wait_start = 0;
6452 p->se.exec_start = 0;
6453 p->se.sleep_start = 0;
6454 p->se.sleep_start_fair = 0;
6455 p->se.block_start = 0;
6456 task_rq(p)->cfs.fair_clock = 0;
6457 task_rq(p)->clock = 0;
6461 * Renice negative nice level userspace
6464 if (TASK_NICE(p) < 0 && p->mm)
6465 set_user_nice(p, 0);
6469 spin_lock_irqsave(&p->pi_lock, flags);
6470 rq = __task_rq_lock(p);
6473 * Do not touch the migration thread:
6475 if (p == rq->migration_thread)
6479 on_rq = p->se.on_rq;
6481 deactivate_task(task_rq(p), p, 0);
6482 __setscheduler(rq, p, SCHED_NORMAL, 0);
6484 activate_task(task_rq(p), p, 0);
6485 resched_task(rq->curr);
6490 __task_rq_unlock(rq);
6491 spin_unlock_irqrestore(&p->pi_lock, flags);
6492 } while_each_thread(g, p);
6494 read_unlock_irq(&tasklist_lock);
6497 #endif /* CONFIG_MAGIC_SYSRQ */
6501 * These functions are only useful for the IA64 MCA handling.
6503 * They can only be called when the whole system has been
6504 * stopped - every CPU needs to be quiescent, and no scheduling
6505 * activity can take place. Using them for anything else would
6506 * be a serious bug, and as a result, they aren't even visible
6507 * under any other configuration.
6511 * curr_task - return the current task for a given cpu.
6512 * @cpu: the processor in question.
6514 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
6516 struct task_struct *curr_task(int cpu)
6518 return cpu_curr(cpu);
6522 * set_curr_task - set the current task for a given cpu.
6523 * @cpu: the processor in question.
6524 * @p: the task pointer to set.
6526 * Description: This function must only be used when non-maskable interrupts
6527 * are serviced on a separate stack. It allows the architecture to switch the
6528 * notion of the current task on a cpu in a non-blocking manner. This function
6529 * must be called with all CPUs synchronized, and interrupts disabled; the
6530 * caller must save the original value of the current task (see
6531 * curr_task() above) and restore that value before reenabling interrupts and
6532 * re-starting the system.
6534 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
6536 void set_curr_task(int cpu, struct task_struct *p)