#include <linux/cgroup.h>
#include <linux/err.h>
#include <linux/kernel.h>
#include <linux/percpu.h>
#include <linux/printk.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

#include <trace/events/sched.h>

#include "sched.h"
#include "tune.h"
#ifdef CONFIG_CGROUP_SCHEDTUNE
static bool schedtune_initialized = false;
#endif

unsigned int sysctl_sched_cfs_boost __read_mostly;

extern struct target_nrg schedtune_target_nrg;

/* Performance Boost region (B) threshold params */
static int perf_boost_idx;

/* Performance Constraint region (C) threshold params */
static int perf_constrain_idx;
/*
 * Performance-Energy (P-E) Space thresholds constants
 */
struct threshold_params {
	int nrg_gain;
	int cap_gain;
};

/*
 * System specific P-E space thresholds constants
 */
static struct threshold_params
threshold_gains[] = {
	{ 0, 5 }, /*   < 10% */
	{ 1, 5 }, /*   < 20% */
	{ 2, 5 }, /*   < 30% */
	{ 3, 5 }, /*   < 40% */
	{ 4, 5 }, /*   < 50% */
	{ 5, 4 }, /*   < 60% */
	{ 5, 3 }, /*   < 70% */
	{ 5, 2 }, /*   < 80% */
	{ 5, 1 }, /*   < 90% */
	{ 5, 0 }  /* <= 100% */
};
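/*
 * The table is indexed in 10% boost bands: boost_write() and the sysctl
 * handler below derive the row as clamp(boost_pct, 0, 99) / 10, so for
 * example a 35% boost selects row 3 while a 100% boost still maps to
 * the last row.
 */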
static int
__schedtune_accept_deltas(int nrg_delta, int cap_delta,
			  int perf_boost_idx, int perf_constrain_idx)
{
	int payoff = -INT_MAX;
	int gain_idx = -1;

	/* Performance Boost (B) region */
	if (nrg_delta >= 0 && cap_delta > 0)
		gain_idx = perf_boost_idx;
	/* Performance Constraint (C) region */
	else if (nrg_delta < 0 && cap_delta <= 0)
		gain_idx = perf_constrain_idx;

	/* Default: reject schedule candidate */
	if (gain_idx == -1)
		return payoff;
	/*
	 * Evaluate "Performance Boost" vs "Energy Increase"
	 *
	 * - Performance Boost (B) region
	 *
	 *   Condition: nrg_delta >= 0 && cap_delta > 0
	 *   Payoff criteria:
	 *     cap_gain / nrg_gain  < cap_delta / nrg_delta, i.e.
	 *     cap_gain * nrg_delta < cap_delta * nrg_gain
	 *   Note that since nrg_gain is positive and nrg_delta is
	 *   non-negative, the inequality does not flip. Thus:
	 *
	 *     payoff = (cap_delta * nrg_gain) - (cap_gain * nrg_delta)
	 *
	 * - Performance Constraint (C) region
	 *
	 *   Condition: nrg_delta < 0 && cap_delta <= 0
	 *   Payoff criteria:
	 *     cap_gain / nrg_gain  > cap_delta / nrg_delta, i.e.
	 *     cap_gain * nrg_delta < cap_delta * nrg_gain
	 *   Note that since nrg_gain > 0 while nrg_delta < 0, the
	 *   inequality flips. Thus:
	 *
	 *     payoff = (cap_delta * nrg_gain) - (cap_gain * nrg_delta)
	 *
	 * This means that, given the same positive {cap,nrg}_gain pair for
	 * both the B and C regions, we can use the same payoff formula,
	 * where a positive value represents the accept condition.
	 */
	payoff = cap_delta * threshold_gains[gain_idx].nrg_gain;
	payoff -= nrg_delta * threshold_gains[gain_idx].cap_gain;
	trace_sched_tune_filter(
		nrg_delta, cap_delta,
		threshold_gains[gain_idx].nrg_gain,
		threshold_gains[gain_idx].cap_gain,
		payoff, gain_idx);

	return payoff;
}
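/*
 * Worked example (values are illustrative only): with a 60% boost,
 * gain_idx = 6 and threshold_gains[6] = { 5, 3 }, i.e. nrg_gain = 5
 * and cap_gain = 3. A candidate with cap_delta = 100 and
 * nrg_delta = 120 yields payoff = 100 * 5 - 120 * 3 = 140 > 0 and is
 * accepted; with nrg_delta = 200 the payoff is
 * 100 * 5 - 200 * 3 = -100 and the candidate is rejected.
 */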
#ifdef CONFIG_CGROUP_SCHEDTUNE

/*
 * EAS scheduler tunables for task groups.
 */

/* SchedTune tunables for a group of tasks */
struct schedtune {
	/* SchedTune CGroup subsystem */
	struct cgroup_subsys_state css;

	/* Boost group allocated ID */
	int idx;

	/* Boost value for tasks on that SchedTune CGroup */
	int boost;

	/* Performance Boost (B) region threshold params */
	int perf_boost_idx;

	/* Performance Constraint (C) region threshold params */
	int perf_constrain_idx;

	/*
	 * Hint to bias scheduling of tasks on that SchedTune CGroup
	 * towards idle CPUs
	 */
	int prefer_idle;
};
static inline struct schedtune *css_st(struct cgroup_subsys_state *css)
{
	return css ? container_of(css, struct schedtune, css) : NULL;
}

static inline struct schedtune *task_schedtune(struct task_struct *tsk)
{
	return css_st(task_css(tsk, schedtune_cgrp_id));
}

static inline struct schedtune *parent_st(struct schedtune *st)
{
	return css_st(st->css.parent);
}
/*
 * SchedTune root control group
 * The root control group is used to define a system-wide boosting tuning,
 * which is applied to all tasks in the system.
 * Task specific boost tuning could be specified by creating and
 * configuring a child control group under the root one.
 * By default, system-wide boosting is disabled, i.e. no boosting is applied
 * to tasks which are not in a child control group.
 */
static struct schedtune
root_schedtune = {
	.boost			= 0,
	.perf_boost_idx		= 0,
	.perf_constrain_idx	= 0,
	.prefer_idle		= 0,
};
int
schedtune_accept_deltas(int nrg_delta, int cap_delta,
			struct task_struct *task)
{
	struct schedtune *ct;
	int perf_boost_idx;
	int perf_constrain_idx;

	/* Optimal (O) region */
	if (nrg_delta < 0 && cap_delta > 0) {
		trace_sched_tune_filter(nrg_delta, cap_delta, 0, 0, 1, 0);
		return INT_MAX;
	}

	/* Suboptimal (S) region */
	if (nrg_delta > 0 && cap_delta < 0) {
		trace_sched_tune_filter(nrg_delta, cap_delta, 0, 0, -1, 5);
		return -INT_MAX;
	}

	/* Get task specific perf Boost/Constraints indexes */
	rcu_read_lock();
	ct = task_schedtune(task);
	perf_boost_idx = ct->perf_boost_idx;
	perf_constrain_idx = ct->perf_constrain_idx;
	rcu_read_unlock();

	return __schedtune_accept_deltas(nrg_delta, cap_delta,
			perf_boost_idx, perf_constrain_idx);
}
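/*
 * Illustrative example: nrg_delta = -10 with cap_delta = +20 (less
 * energy, more capacity) lands in the Optimal (O) region and is
 * accepted unconditionally, while nrg_delta = +10 with cap_delta = -20
 * is Suboptimal (S) and always rejected. Only candidates where energy
 * and capacity move in the same direction reach the payoff evaluation
 * in __schedtune_accept_deltas().
 */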
/*
 * Maximum number of boost groups to support
 * When per-task boosting is used we still allow only a limited number of
 * boost groups, for two main reasons:
 * 1. on a real system we usually have only a few classes of workloads which
 *    make sense to boost with different values (e.g. background vs foreground
 *    tasks, interactive vs low-priority tasks)
 * 2. a limited number allows for a simpler and more memory/time efficient
 *    implementation, especially for the computation of the per-CPU boost
 *    value
 */
#define BOOSTGROUPS_COUNT 4

/* Array of configured boostgroups */
static struct schedtune *allocated_group[BOOSTGROUPS_COUNT] = {
	&root_schedtune,
	NULL,
};
/*
 * SchedTune boost groups
 * Keep track of all the boost groups which impact a CPU, for example when a
 * CPU has two RUNNABLE tasks belonging to two different boost groups and thus
 * likely with different boost values.
 * Since on each system we expect only a limited number of boost groups, here
 * we use a simple array to keep track of the metrics required to compute the
 * maximum per-CPU boosting value.
 */
struct boost_groups {
	/* Maximum boost value for all RUNNABLE tasks on a CPU */
	int boost_max;
	struct {
		/* The boost for tasks on that boost group */
		int boost;
		/* Count of RUNNABLE tasks on that boost group */
		unsigned tasks;
	} group[BOOSTGROUPS_COUNT];
	/* CPU's boost group locking */
	raw_spinlock_t lock;
};

/* Boost groups affecting each CPU in the system */
DEFINE_PER_CPU(struct boost_groups, cpu_boost_groups);
static void
schedtune_cpu_update(int cpu)
{
	struct boost_groups *bg;
	int boost_max;
	int idx;

	bg = &per_cpu(cpu_boost_groups, cpu);

	/* The root boost group is always active */
	boost_max = bg->group[0].boost;
	for (idx = 1; idx < BOOSTGROUPS_COUNT; ++idx) {
		/*
		 * A boost group affects a CPU only if it has
		 * RUNNABLE tasks on that CPU
		 */
		if (bg->group[idx].tasks == 0)
			continue;

		boost_max = max(boost_max, bg->group[idx].boost);
	}

	/*
	 * Ensure boost_max is non-negative when all cgroup boost values
	 * are negative. Avoids under-accounting of CPU capacity which may
	 * cause task stacking and frequency spikes.
	 */
	boost_max = max(boost_max, 0);
	bg->boost_max = boost_max;
}
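/*
 * Illustrative example: if the root group boost is -30 and only group 2
 * (boost -20) has RUNNABLE tasks on this CPU, the scan yields -20 and
 * the final clamping publishes boost_max = 0; if group 1 (boost 10)
 * also had a RUNNABLE task, boost_max would be 10.
 */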
static int
schedtune_boostgroup_update(int idx, int boost)
{
	struct boost_groups *bg;
	int cur_boost_max;
	int old_boost;
	int cpu;

	/* Update per CPU boost groups */
	for_each_possible_cpu(cpu) {
		bg = &per_cpu(cpu_boost_groups, cpu);

		/*
		 * Keep track of current boost values to compute the per CPU
		 * maximum only when it has been affected by the new value of
		 * the updated boost group
		 */
		cur_boost_max = bg->boost_max;
		old_boost = bg->group[idx].boost;

		/* Update the boost value of this boost group */
		bg->group[idx].boost = boost;

		/* Check if this update increases the current max */
		if (boost > cur_boost_max && bg->group[idx].tasks) {
			bg->boost_max = boost;
			trace_sched_tune_boostgroup_update(cpu, 1, bg->boost_max);
			continue;
		}
		/* Check if this update has decreased the current max */
		if (cur_boost_max == old_boost && old_boost > boost) {
			schedtune_cpu_update(cpu);
			trace_sched_tune_boostgroup_update(cpu, -1, bg->boost_max);
			continue;
		}

		trace_sched_tune_boostgroup_update(cpu, 0, bg->boost_max);
	}

	return 0;
}
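/*
 * Illustrative example: if group 2 is lowered from boost 50 to 10 on a
 * CPU whose boost_max == 50 was contributed by group 2, the cheap
 * "new max" path does not apply and schedtune_cpu_update() rescans the
 * remaining groups to find the new maximum.
 */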
#define ENQUEUE_TASK  1
#define DEQUEUE_TASK -1

static inline void
schedtune_tasks_update(struct task_struct *p, int cpu, int idx, int task_count)
{
	struct boost_groups *bg = &per_cpu(cpu_boost_groups, cpu);
	int tasks = bg->group[idx].tasks + task_count;

	/* Update boosted tasks count while avoiding making it negative */
	bg->group[idx].tasks = max(0, tasks);

	trace_sched_tune_tasks_update(p, cpu, tasks, idx,
			bg->group[idx].boost, bg->boost_max);

	/* Boost group activation or deactivation on that RQ */
	if (tasks == 1 || tasks == 0)
		schedtune_cpu_update(cpu);
}
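/*
 * Example: the first ENQUEUE_TASK for a group on a CPU moves its task
 * count from 0 to 1 and activates the group, triggering a boost_max
 * refresh; the last DEQUEUE_TASK moves it back to 0 and deactivates it
 * the same way.
 */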
/*
 * NOTE: This function must be called while holding the lock on the CPU RQ
 */
void schedtune_enqueue_task(struct task_struct *p, int cpu)
{
	struct boost_groups *bg = &per_cpu(cpu_boost_groups, cpu);
	unsigned long irq_flags;
	struct schedtune *st;
	int idx;

	if (unlikely(!schedtune_initialized))
		return;

	/*
	 * When a task is marked PF_EXITING by do_exit() it's going to be
	 * dequeued and enqueued multiple times in the exit path.
	 * Thus we avoid any further update, since we do not want to change
	 * CPU boosting while the task is exiting.
	 */
	if (p->flags & PF_EXITING)
		return;

	/*
	 * Boost group accounting is protected by a per-cpu lock and requires
	 * interrupts to be disabled to avoid race conditions, for example on
	 * do_exit()::cgroup_exit() and task migration.
	 */
	raw_spin_lock_irqsave(&bg->lock, irq_flags);
	rcu_read_lock();

	st = task_schedtune(p);
	idx = st->idx;

	schedtune_tasks_update(p, cpu, idx, ENQUEUE_TASK);

	rcu_read_unlock();
	raw_spin_unlock_irqrestore(&bg->lock, irq_flags);
}
int schedtune_can_attach(struct cgroup_taskset *tset)
{
	struct task_struct *task;
	struct cgroup_subsys_state *css;
	struct boost_groups *bg;
	unsigned long irq_flags;
	unsigned int cpu;
	struct rq *rq;
	int src_bg; /* Source boost group index */
	int dst_bg; /* Destination boost group index */
	int tasks;

	if (unlikely(!schedtune_initialized))
		return 0;

	cgroup_taskset_for_each(task, css, tset) {

		/*
		 * Lock the CPU's RQ the task is enqueued to avoid race
		 * conditions with migration code while the task is being
		 * accounted
		 */
		rq = lock_rq_of(task, &irq_flags);

		if (!task->on_rq) {
			unlock_rq_of(rq, task, &irq_flags);
			continue;
		}

		/*
		 * Boost group accounting is protected by a per-cpu lock and
		 * requires interrupts to be disabled to avoid race conditions,
		 * for example on do_exit()::cgroup_exit() and task migration.
		 */
		cpu = cpu_of(rq);
		bg = &per_cpu(cpu_boost_groups, cpu);
		raw_spin_lock(&bg->lock);

		dst_bg = css_st(css)->idx;
		src_bg = task_schedtune(task)->idx;

		/*
		 * Current task is not changing boostgroup, which can
		 * happen when the new hierarchy is in use.
		 */
		if (unlikely(dst_bg == src_bg)) {
			raw_spin_unlock(&bg->lock);
			unlock_rq_of(rq, task, &irq_flags);
			continue;
		}

		/*
		 * This is the case of a RUNNABLE task which is switching its
		 * current boost group.
		 */

		/* Move task from src to dst boost group */
		tasks = bg->group[src_bg].tasks - 1;
		bg->group[src_bg].tasks = max(0, tasks);
		bg->group[dst_bg].tasks += 1;

		raw_spin_unlock(&bg->lock);
		unlock_rq_of(rq, task, &irq_flags);

		/* Update CPU boost group */
		if (bg->group[src_bg].tasks == 0 || bg->group[dst_bg].tasks == 1)
			schedtune_cpu_update(task_cpu(task));
	}

	return 0;
}
void schedtune_cancel_attach(struct cgroup_taskset *tset)
{
	/*
	 * This can happen only if the SchedTune controller is mounted with
	 * other hierarchies and one of them fails. Since usually SchedTune is
	 * mounted on its own hierarchy, for the time being we do not implement
	 * a proper rollback mechanism.
	 */
	WARN(1, "SchedTune cancel attach not implemented");
}
/*
 * NOTE: This function must be called while holding the lock on the CPU RQ
 */
void schedtune_dequeue_task(struct task_struct *p, int cpu)
{
	struct boost_groups *bg = &per_cpu(cpu_boost_groups, cpu);
	unsigned long irq_flags;
	struct schedtune *st;
	int idx;

	if (unlikely(!schedtune_initialized))
		return;

	/*
	 * When a task is marked PF_EXITING by do_exit() it's going to be
	 * dequeued and enqueued multiple times in the exit path.
	 * Thus we avoid any further update, since we do not want to change
	 * CPU boosting while the task is exiting.
	 * The last dequeue is already enforced by the do_exit() code path
	 * via schedtune_exit_task().
	 */
	if (p->flags & PF_EXITING)
		return;

	/*
	 * Boost group accounting is protected by a per-cpu lock and requires
	 * interrupts to be disabled to avoid race conditions, for example on
	 * do_exit()::cgroup_exit() and task migration.
	 */
	raw_spin_lock_irqsave(&bg->lock, irq_flags);
	rcu_read_lock();

	st = task_schedtune(p);
	idx = st->idx;

	schedtune_tasks_update(p, cpu, idx, DEQUEUE_TASK);

	rcu_read_unlock();
	raw_spin_unlock_irqrestore(&bg->lock, irq_flags);
}
void schedtune_exit_task(struct task_struct *tsk)
{
	struct schedtune *st;
	unsigned long irq_flags;
	unsigned int cpu;
	struct rq *rq;
	int idx;

	if (unlikely(!schedtune_initialized))
		return;

	rq = lock_rq_of(tsk, &irq_flags);
	rcu_read_lock();

	cpu = cpu_of(rq);
	st = task_schedtune(tsk);
	idx = st->idx;
	schedtune_tasks_update(tsk, cpu, idx, DEQUEUE_TASK);

	rcu_read_unlock();
	unlock_rq_of(rq, tsk, &irq_flags);
}
int schedtune_cpu_boost(int cpu)
{
	struct boost_groups *bg;

	bg = &per_cpu(cpu_boost_groups, cpu);
	return bg->boost_max;
}

int schedtune_task_boost(struct task_struct *p)
{
	struct schedtune *st;
	int task_boost;

	/* Get task boost value */
	rcu_read_lock();
	st = task_schedtune(p);
	task_boost = st->boost;
	rcu_read_unlock();

	return task_boost;
}

int schedtune_prefer_idle(struct task_struct *p)
{
	struct schedtune *st;
	int prefer_idle;

	/* Get prefer_idle value */
	rcu_read_lock();
	st = task_schedtune(p);
	prefer_idle = st->prefer_idle;
	rcu_read_unlock();

	return prefer_idle;
}
static u64
prefer_idle_read(struct cgroup_subsys_state *css, struct cftype *cft)
{
	struct schedtune *st = css_st(css);

	return st->prefer_idle;
}

static int
prefer_idle_write(struct cgroup_subsys_state *css, struct cftype *cft,
		  u64 prefer_idle)
{
	struct schedtune *st = css_st(css);
	st->prefer_idle = prefer_idle;

	return 0;
}
static s64
boost_read(struct cgroup_subsys_state *css, struct cftype *cft)
{
	struct schedtune *st = css_st(css);

	return st->boost;
}

static int
boost_write(struct cgroup_subsys_state *css, struct cftype *cft,
	    s64 boost)
{
	struct schedtune *st = css_st(css);
	unsigned threshold_idx;
	int boost_pct;

	if (boost < -100 || boost > 100)
		return -EINVAL;
	boost_pct = boost;

	/*
	 * Update threshold params for Performance Boost (B)
	 * and Performance Constraint (C) regions.
	 * The current implementation uses the same cuts for both
	 * regions.
	 */
	threshold_idx = clamp(boost_pct, 0, 99) / 10;
	st->perf_boost_idx = threshold_idx;
	st->perf_constrain_idx = threshold_idx;

	st->boost = boost;
	if (css == &root_schedtune.css) {
		sysctl_sched_cfs_boost = boost;
		perf_boost_idx = threshold_idx;
		perf_constrain_idx = threshold_idx;
	}

	/* Update CPU boost */
	schedtune_boostgroup_update(st->idx, st->boost);

	trace_sched_tune_config(st->boost);

	return 0;
}
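/*
 * Usage sketch (the /dev/stune mount point is the common Android
 * convention, not something enforced here): writing
 *
 *   echo 60 > /dev/stune/foreground/schedtune.boost
 *
 * invokes boost_write() with boost = 60, which maps to
 * threshold_idx = clamp(60, 0, 99) / 10 = 6 and selects that row of
 * threshold_gains[] for both the B and C regions of the group.
 */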
static struct cftype files[] = {
	{
		.name = "boost",
		.read_s64 = boost_read,
		.write_s64 = boost_write,
	},
	{
		.name = "prefer_idle",
		.read_u64 = prefer_idle_read,
		.write_u64 = prefer_idle_write,
	},
	{ }	/* terminate */
};
static int
schedtune_boostgroup_init(struct schedtune *st)
{
	struct boost_groups *bg;
	int cpu;

	/* Keep track of allocated boost groups */
	allocated_group[st->idx] = st;

	/* Initialize the per CPU boost groups */
	for_each_possible_cpu(cpu) {
		bg = &per_cpu(cpu_boost_groups, cpu);
		bg->group[st->idx].boost = 0;
		bg->group[st->idx].tasks = 0;
	}

	return 0;
}
static struct cgroup_subsys_state *
schedtune_css_alloc(struct cgroup_subsys_state *parent_css)
{
	struct schedtune *st;
	int idx;

	if (!parent_css)
		return &root_schedtune.css;

	/* Allow only single level hierarchies */
	if (parent_css != &root_schedtune.css) {
		pr_err("Nested SchedTune boosting groups not allowed\n");
		return ERR_PTR(-ENOMEM);
	}

	/* Allow only a limited number of boosting groups */
	for (idx = 1; idx < BOOSTGROUPS_COUNT; ++idx)
		if (!allocated_group[idx])
			break;
	if (idx == BOOSTGROUPS_COUNT) {
		pr_err("Trying to create more than %d SchedTune boosting groups\n",
		       BOOSTGROUPS_COUNT);
		return ERR_PTR(-ENOSPC);
	}

	/* Allocate a new boost group */
	st = kzalloc(sizeof(*st), GFP_KERNEL);
	if (!st)
		goto out;

	/* Initialize per CPUs boost group support */
	st->idx = idx;
	if (schedtune_boostgroup_init(st))
		goto release;

	return &st->css;

release:
	kfree(st);
out:
	return ERR_PTR(-ENOMEM);
}
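/*
 * Usage sketch (the /dev/stune mount point is the common Android
 * convention): "mkdir /dev/stune/foreground" invokes
 * schedtune_css_alloc() and claims the first free slot in
 * allocated_group[]; since the root group occupies slot 0 of the
 * BOOSTGROUPS_COUNT == 4 slots, a fourth child group fails with
 * -ENOSPC.
 */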
static void
schedtune_boostgroup_release(struct schedtune *st)
{
	/* Reset this boost group */
	schedtune_boostgroup_update(st->idx, 0);

	/* Keep track of allocated boost groups */
	allocated_group[st->idx] = NULL;
}

static void
schedtune_css_free(struct cgroup_subsys_state *css)
{
	struct schedtune *st = css_st(css);

	schedtune_boostgroup_release(st);
	kfree(st);
}
struct cgroup_subsys schedtune_cgrp_subsys = {
	.css_alloc	= schedtune_css_alloc,
	.css_free	= schedtune_css_free,
	.can_attach	= schedtune_can_attach,
	.cancel_attach	= schedtune_cancel_attach,
	.legacy_cftypes	= files,
	.early_init	= 1,
};
static inline void
schedtune_init_cgroups(void)
{
	struct boost_groups *bg;
	int cpu;

	/* Initialize the per CPU boost groups */
	for_each_possible_cpu(cpu) {
		bg = &per_cpu(cpu_boost_groups, cpu);
		memset(bg, 0, sizeof(struct boost_groups));
	}

	pr_info("schedtune: configured to support %d boost groups\n",
		BOOSTGROUPS_COUNT);

	schedtune_initialized = true;
}
#else /* CONFIG_CGROUP_SCHEDTUNE */

int
schedtune_accept_deltas(int nrg_delta, int cap_delta,
			struct task_struct *task)
{
	/* Optimal (O) region */
	if (nrg_delta < 0 && cap_delta > 0) {
		trace_sched_tune_filter(nrg_delta, cap_delta, 0, 0, 1, 0);
		return INT_MAX;
	}

	/* Suboptimal (S) region */
	if (nrg_delta > 0 && cap_delta < 0) {
		trace_sched_tune_filter(nrg_delta, cap_delta, 0, 0, -1, 5);
		return -INT_MAX;
	}

	return __schedtune_accept_deltas(nrg_delta, cap_delta,
			perf_boost_idx, perf_constrain_idx);
}

#endif /* CONFIG_CGROUP_SCHEDTUNE */
int
sysctl_sched_cfs_boost_handler(struct ctl_table *table, int write,
			       void __user *buffer, size_t *lenp,
			       loff_t *ppos)
{
	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
	unsigned threshold_idx;
	int boost_pct;

	if (ret || !write)
		return ret;

	if (sysctl_sched_cfs_boost < -100 || sysctl_sched_cfs_boost > 100)
		return -EINVAL;
	boost_pct = sysctl_sched_cfs_boost;

	/*
	 * Update threshold params for Performance Boost (B)
	 * and Performance Constraint (C) regions.
	 * The current implementation uses the same cuts for both
	 * regions.
	 */
	threshold_idx = clamp(boost_pct, 0, 99) / 10;
	perf_boost_idx = threshold_idx;
	perf_constrain_idx = threshold_idx;

	return 0;
}
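/*
 * Usage sketch (assuming the companion kernel/sysctl.c entry registers
 * this handler for a "sched_cfs_boost" knob, as in the AOSP trees):
 *
 *   echo 25 > /proc/sys/kernel/sched_cfs_boost
 *
 * sets a 25% global boost and selects threshold_gains[2] for both the
 * B and C regions.
 */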
#ifdef CONFIG_SCHED_DEBUG
static void
schedtune_test_nrg(unsigned long delta_pwr)
{
	unsigned long test_delta_pwr;
	unsigned long test_norm_pwr;
	int idx;

	/*
	 * Check normalization constants using some constant system
	 * energy values
	 */
	pr_info("schedtune: verify normalization constants...\n");
	for (idx = 0; idx < 6; ++idx) {
		test_delta_pwr = delta_pwr >> idx;

		/* Normalize on max energy for target platform */
		test_norm_pwr = reciprocal_divide(
			test_delta_pwr << SCHED_LOAD_SHIFT,
			schedtune_target_nrg.rdiv);

		pr_info("schedtune: max_pwr/2^%d: %4lu => norm_pwr: %5lu\n",
			idx, test_delta_pwr, test_norm_pwr);
	}
}
#else
#define schedtune_test_nrg(delta_pwr)
#endif
/*
 * Compute the min/max power consumption of a cluster and all its CPUs
 */
static void
schedtune_add_cluster_nrg(
		struct sched_domain *sd,
		struct sched_group *sg,
		struct target_nrg *ste)
{
	struct sched_domain *sd2;
	struct sched_group *sg2;

	struct cpumask *cluster_cpus;
	char str[32];

	unsigned long min_pwr;
	unsigned long max_pwr;
	int cpu;

	/* Get Cluster energy using EM data for the first CPU */
	cluster_cpus = sched_group_cpus(sg);
	snprintf(str, 32, "CLUSTER[%*pbl]",
		 cpumask_pr_args(cluster_cpus));

	min_pwr = sg->sge->idle_states[sg->sge->nr_idle_states - 1].power;
	max_pwr = sg->sge->cap_states[sg->sge->nr_cap_states - 1].power;
	pr_info("schedtune: %-17s min_pwr: %5lu max_pwr: %5lu\n",
		str, min_pwr, max_pwr);

	/*
	 * Keep track of this cluster's energy in the computation of the
	 * overall system energy
	 */
	ste->min_power += min_pwr;
	ste->max_power += max_pwr;

	/* Get CPU energy using EM data for each CPU in the group */
	for_each_cpu(cpu, cluster_cpus) {
		/* Get a SD view for the specific CPU */
		for_each_domain(cpu, sd2) {
			/* Get the CPU group */
			sg2 = sd2->groups;
			min_pwr = sg2->sge->idle_states[sg2->sge->nr_idle_states - 1].power;
			max_pwr = sg2->sge->cap_states[sg2->sge->nr_cap_states - 1].power;

			ste->min_power += min_pwr;
			ste->max_power += max_pwr;

			snprintf(str, 32, "CPU[%d]", cpu);
			pr_info("schedtune: %-17s min_pwr: %5lu max_pwr: %5lu\n",
				str, min_pwr, max_pwr);

			/*
			 * Assume we have EM data only at the CPU and
			 * the upper CLUSTER level
			 */
			BUG_ON(!cpumask_equal(
				sched_group_cpus(sg),
				sched_group_cpus(sd2->parent->groups)
				));
			break;
		}
	}
}
/*
 * Initialize the constants required to compute normalized energy.
 * The values of these constants depend on the EM data for the specific
 * target system and topology.
 * Thus, this function is expected to be called by the code
 * that binds the EM to the topology information.
 */
static int
schedtune_init(void)
{
	struct target_nrg *ste = &schedtune_target_nrg;
	unsigned long delta_pwr = 0;
	struct sched_domain *sd;
	struct sched_group *sg;

	pr_info("schedtune: init normalization constants...\n");
	ste->max_power = 0;
	ste->min_power = 0;

	rcu_read_lock();

	/*
	 * When EAS is in use, we always have a pointer to the highest SD
	 * which provides EM data.
	 */
	sd = rcu_dereference(per_cpu(sd_ea, cpumask_first(cpu_online_mask)));
	if (!sd) {
		pr_info("schedtune: no energy model data\n");
		goto nodata;
	}

	sg = sd->groups;
	do {
		schedtune_add_cluster_nrg(sd, sg, ste);
	} while (sg = sg->next, sg != sd->groups);

	rcu_read_unlock();

	pr_info("schedtune: %-17s min_pwr: %5lu max_pwr: %5lu\n",
		"SYSTEM", ste->min_power, ste->max_power);

	/* Compute normalization constants */
	delta_pwr = ste->max_power - ste->min_power;
	ste->rdiv = reciprocal_value(delta_pwr);
	pr_info("schedtune: using normalization constants mul: %u sh1: %u sh2: %u\n",
		ste->rdiv.m, ste->rdiv.sh1, ste->rdiv.sh2);
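	/*
	 * Illustrative note: with these constants a relative energy delta
	 * e in [0..delta_pwr] is normalized as
	 *
	 *   norm(e) = (e << SCHED_LOAD_SHIFT) / delta_pwr
	 *
	 * with the division carried out by reciprocal_divide() against the
	 * precomputed ste->rdiv, avoiding a runtime division in hot paths
	 * (schedtune_test_nrg() above exercises exactly this computation).
	 */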
	schedtune_test_nrg(delta_pwr);

#ifdef CONFIG_CGROUP_SCHEDTUNE
	schedtune_init_cgroups();
#else
	pr_info("schedtune: configured to support global boosting only\n");
#endif

	return 0;

nodata:
	rcu_read_unlock();
	return -EINVAL;
}
postcore_initcall(schedtune_init);