ARM: Experimental Frequency-Invariant Load Scaling Patch
authorChris Redpath <chris.redpath@arm.com>
Fri, 16 Nov 2012 10:03:00 +0000 (10:03 +0000)
committerJon Medhurst <tixy@linaro.org>
Wed, 17 Jul 2013 10:12:27 +0000 (11:12 +0100)
Evaluation Patch to investigate using load as a representation of the
amount of POTENTIAL cpu compute capacity used rather than a representation
of the CURRENT cpu compute capacity.

If CPUFreq is enabled, scales load in accordance with frequency.

Powersave/performance CPUFreq governors are detected and scaling is
disabled while these governors are in use. This is because when a
single-frequency governor is in use, potential CPU capacity is static.

So long as the governors and CPUFreq subsystem correctly report the
frequencies available, the scaling should self tune.

Adds an additional file to sysfs to allow this feature to be disabled
for experimentation.

/sys/kernel/hmp/frequency_invariant_load_scale

write 0 to disable, 1 to enable.

Signed-off-by: Chris Redpath <chris.redpath@arm.com>
arch/arm/Kconfig
kernel/sched/fair.c

index f0a5a0787ea62e6c68e9c2df4d75012aedd9d97e..bc79c57d78c4185be265c27e4c132eb5ec827fbc 100644 (file)
@@ -1564,6 +1564,21 @@ config HMP_VARIABLE_SCALE
                (1002/1024)^(LOAD_AVG_PERIOD/load_avg_period_ms)
          but it remove intermadiate overflows in computation.
 
+config HMP_FREQUENCY_INVARIANT_SCALE
+       bool "(EXPERIMENTAL) Frequency-Invariant Tracked Load for HMP"
+       depends on HMP_VARIABLE_SCALE && CPU_FREQ
+       help
+         Scales the current load contribution in line with the frequency
+         of the CPU that the task was executed on.
+         In this version, we use a simple linear scale derived from the
+         maximum frequency reported by CPUFreq.
+         Restricting tracked load to be scaled by the CPU's frequency
+         represents the consumption of possible compute capacity
+         (rather than consumption of actual instantaneous capacity as
+         normal) and allows the HMP migration's simple threshold
+         migration strategy to interact more predictably with CPUFreq's
+         asynchronous compute capacity changes.
+
 config HAVE_ARM_SCU
        bool
        help
index f86399bb8fe7d68333c5c7de7d4463ac248f7e42..202bebe3dab8bf240293e3099e147563503bf053 100644 (file)
 #ifdef CONFIG_HMP_VARIABLE_SCALE
 #include <linux/sysfs.h>
 #include <linux/vmalloc.h>
-#endif
+#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
+/* Include cpufreq header to add a notifier so that cpu frequency
+ * scaling can track the current CPU frequency
+ */
+#include <linux/cpufreq.h>
+#endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */
+#endif /* CONFIG_HMP_VARIABLE_SCALE */
 
 #include "sched.h"
 
+
 /*
  * Targeted preemption latency for CPU-bound tasks:
  * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
@@ -1206,8 +1213,93 @@ static u32 __compute_runnable_contrib(u64 n)
 }
 
 #ifdef CONFIG_HMP_VARIABLE_SCALE
-static u64 hmp_variable_scale_convert(u64 delta);
+
+#define HMP_VARIABLE_SCALE_SHIFT 16ULL
+struct hmp_global_attr {
+       struct attribute attr;
+       ssize_t (*show)(struct kobject *kobj,
+                       struct attribute *attr, char *buf);
+       ssize_t (*store)(struct kobject *a, struct attribute *b,
+                       const char *c, size_t count);
+       int *value;
+       int (*to_sysfs)(int);
+       int (*from_sysfs)(int);
+};
+
+#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
+#define HMP_DATA_SYSFS_MAX 4
+#else
+#define HMP_DATA_SYSFS_MAX 3
+#endif
+
+struct hmp_data_struct {
+#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
+       int freqinvar_load_scale_enabled;
 #endif
+       int multiplier; /* used to scale the time delta */
+       struct attribute_group attr_group;
+       struct attribute *attributes[HMP_DATA_SYSFS_MAX + 1];
+       struct hmp_global_attr attr[HMP_DATA_SYSFS_MAX];
+} hmp_data;
+
+static u64 hmp_variable_scale_convert(u64 delta);
+#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
+/* Frequency-Invariant Load Modification:
+ * Loads are calculated as in PJT's patch however we also scale the current
+ * contribution in line with the frequency of the CPU that the task was
+ * executed on.
+ * In this version, we use a simple linear scale derived from the maximum
+ * frequency reported by CPUFreq. As an example:
+ *
+ * Consider that we ran a task for 100% of the previous interval.
+ *
+ * Our CPU was under asynchronous frequency control through one of the
+ * CPUFreq governors.
+ *
+ * The CPUFreq governor reports that it is able to scale the CPU between
+ * 500MHz and 1GHz.
+ *
+ * During the period, the CPU was running at 1GHz.
+ *
+ * In this case, our load contribution for that period is calculated as
+ * 1 * (number_of_active_microseconds)
+ *
+ * This results in our task being able to accumulate maximum load as normal.
+ *
+ *
+ * Consider now that our CPU was executing at 500MHz.
+ *
+ * We now scale the load contribution such that it is calculated as
+ * 0.5 * (number_of_active_microseconds)
+ *
+ * Our task can only record 50% maximum load during this period.
+ *
+ * This represents the task consuming 50% of the CPU's *possible* compute
+ * capacity. However the task did consume 100% of the CPU's *available*
+ * compute capacity which is the value seen by the CPUFreq governor and
+ * user-side CPU Utilization tools.
+ *
+ * Restricting tracked load to be scaled by the CPU's frequency accurately
+ * represents the consumption of possible compute capacity and allows the
+ * HMP migration's simple threshold migration strategy to interact more
+ * predictably with CPUFreq's asynchronous compute capacity changes.
+ */
+#define SCHED_FREQSCALE_SHIFT 10
+struct cpufreq_extents {
+       u32 curr_scale;
+       u32 min;
+       u32 max;
+       u32 flags;
+};
+/* Flag set when the governor in use only allows one frequency.
+ * Disables scaling.
+ */
+#define SCHED_LOAD_FREQINVAR_SINGLEFREQ 0x01
+
+static struct cpufreq_extents freq_scale[CONFIG_NR_CPUS];
+#endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */
+#endif /* CONFIG_HMP_VARIABLE_SCALE */
+
 /* We can represent the historical contribution to runnable average as the
  * coefficients of a geometric series.  To do this we sub-divide our runnable
  * history into segments of approximately 1ms (1024us); label the segment that
@@ -1238,11 +1330,18 @@ static u64 hmp_variable_scale_convert(u64 delta);
 static __always_inline int __update_entity_runnable_avg(u64 now,
                                                        struct sched_avg *sa,
                                                        int runnable,
-                                                       int running)
+                                                       int running,
+                                                       int cpu)
 {
        u64 delta, periods;
        u32 runnable_contrib;
        int delta_w, decayed = 0;
+#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
+       u64 scaled_delta;
+       u32 scaled_runnable_contrib;
+       int scaled_delta_w;
+       u32 curr_scale = 1024;
+#endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */
 
        delta = now - sa->last_runnable_update;
 #ifdef CONFIG_HMP_VARIABLE_SCALE
@@ -1266,6 +1365,12 @@ static __always_inline int __update_entity_runnable_avg(u64 now,
                return 0;
        sa->last_runnable_update = now;
 
+#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
+       /* retrieve scale factor for load */
+       if (hmp_data.freqinvar_load_scale_enabled)
+               curr_scale = freq_scale[cpu].curr_scale;
+#endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */
+
        /* delta_w is the amount already accumulated against our next period */
        delta_w = sa->runnable_avg_period % 1024;
        if (delta + delta_w >= 1024) {
@@ -1278,10 +1383,20 @@ static __always_inline int __update_entity_runnable_avg(u64 now,
                 * period and accrue it.
                 */
                delta_w = 1024 - delta_w;
+               /* scale runnable time if necessary */
+#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
+               scaled_delta_w = (delta_w * curr_scale)
+                               >> SCHED_FREQSCALE_SHIFT;
+               if (runnable)
+                       sa->runnable_avg_sum += scaled_delta_w;
+               if (running)
+                       sa->usage_avg_sum += scaled_delta_w;
+#else
                if (runnable)
                        sa->runnable_avg_sum += delta_w;
                if (running)
                        sa->usage_avg_sum += delta_w;
+#endif /* #ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */
                sa->runnable_avg_period += delta_w;
 
                delta -= delta_w;
@@ -1289,27 +1404,49 @@ static __always_inline int __update_entity_runnable_avg(u64 now,
                /* Figure out how many additional periods this update spans */
                periods = delta / 1024;
                delta %= 1024;
-
+               /* decay the load we have accumulated so far */
                sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum,
                                                  periods + 1);
                sa->runnable_avg_period = decay_load(sa->runnable_avg_period,
                                                     periods + 1);
                sa->usage_avg_sum = decay_load(sa->usage_avg_sum, periods + 1);
-
+               /* add the contribution from this period */
                /* Efficiently calculate \sum (1..n_period) 1024*y^i */
                runnable_contrib = __compute_runnable_contrib(periods);
+               /* Apply load scaling if necessary.
+                * Note that multiplying the whole series is same as
+                * multiplying all terms
+                */
+#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
+               scaled_runnable_contrib = (runnable_contrib * curr_scale)
+                               >> SCHED_FREQSCALE_SHIFT;
+               if (runnable)
+                       sa->runnable_avg_sum += scaled_runnable_contrib;
+               if (running)
+                       sa->usage_avg_sum += scaled_runnable_contrib;
+#else
                if (runnable)
                        sa->runnable_avg_sum += runnable_contrib;
                if (running)
                        sa->usage_avg_sum += runnable_contrib;
+#endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */
                sa->runnable_avg_period += runnable_contrib;
        }
 
        /* Remainder of delta accrued against u_0` */
+       /* scale if necessary */
+#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
+       scaled_delta = ((delta * curr_scale) >> SCHED_FREQSCALE_SHIFT);
+       if (runnable)
+               sa->runnable_avg_sum += scaled_delta;
+       if (running)
+               sa->usage_avg_sum += scaled_delta;
+#else
        if (runnable)
                sa->runnable_avg_sum += delta;
        if (running)
                sa->usage_avg_sum += delta;
+#endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */
        sa->runnable_avg_period += delta;
 
        return decayed;
@@ -1488,7 +1625,7 @@ static inline void update_entity_load_avg(struct sched_entity *se,
                now = cfs_rq_clock_task(group_cfs_rq(se));
 
        if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq,
-                                         cfs_rq->curr == se))
+                       cfs_rq->curr == se, se->cfs_rq->rq->cpu))
                return;
 
        contrib_delta = __update_entity_load_avg_contrib(se);
@@ -1534,7 +1671,7 @@ static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
 {
        u32 contrib;
        __update_entity_runnable_avg(rq->clock_task, &rq->avg, runnable,
-                                    runnable);
+                                    runnable, rq->cpu);
        __update_tg_runnable_avg(&rq->avg, &rq->cfs);
        contrib = rq->avg.runnable_avg_sum * scale_load_down(1024);
        contrib /= (rq->avg.runnable_avg_period + 1);
@@ -3554,27 +3691,6 @@ static inline void hmp_next_down_delay(struct sched_entity *se, int cpu)
  * delta time by 1/22 and setting load_avg_period_ms = 706.
  */
 
-#define HMP_VARIABLE_SCALE_SHIFT 16ULL
-struct hmp_global_attr {
-       struct attribute attr;
-       ssize_t (*show)(struct kobject *kobj,
-                       struct attribute *attr, char *buf);
-       ssize_t (*store)(struct kobject *a, struct attribute *b,
-                       const char *c, size_t count);
-       int *value;
-       int (*to_sysfs)(int);
-       int (*from_sysfs)(int);
-};
-
-#define HMP_DATA_SYSFS_MAX 3
-
-struct hmp_data_struct {
-       int multiplier; /* used to scale the time delta */
-       struct attribute_group attr_group;
-       struct attribute *attributes[HMP_DATA_SYSFS_MAX + 1];
-       struct hmp_global_attr attr[HMP_DATA_SYSFS_MAX];
-} hmp_data;
-
 /*
  * By scaling the delta time it end-up increasing or decrease the
  * growing speed of the per entity load_avg_ratio
@@ -3642,7 +3758,15 @@ static int hmp_theshold_from_sysfs(int value)
                return -1;
        return value;
 }
-
+#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
+/* freqinvar control is only 0,1 off/on */
+static int hmp_freqinvar_from_sysfs(int value)
+{
+       if (value < 0 || value > 1)
+               return -1;
+       return value;
+}
+#endif
 static void hmp_attr_add(
        const char *name,
        int *value,
@@ -3687,7 +3811,14 @@ static int hmp_attr_init(void)
                &hmp_down_threshold,
                NULL,
                hmp_theshold_from_sysfs);
-
+#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
+       /* default frequency-invariant scaling ON */
+       hmp_data.freqinvar_load_scale_enabled = 1;
+       hmp_attr_add("frequency_invariant_load_scale",
+               &hmp_data.freqinvar_load_scale_enabled,
+               NULL,
+               hmp_freqinvar_from_sysfs);
+#endif
        hmp_data.attr_group.name = "hmp";
        hmp_data.attr_group.attrs = hmp_data.attributes;
        ret = sysfs_create_group(kernel_kobj,
@@ -6878,3 +7009,132 @@ __init void init_sched_fair_class(void)
 #endif /* SMP */
 
 }
+
+#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
+static u32 cpufreq_calc_scale(u32 min, u32 max, u32 curr)
+{
+       u32 result = curr / max;
+       return result;
+}
+
+/* Called when the CPU Frequency is changed.
+ * Once for each CPU.
+ */
+static int cpufreq_callback(struct notifier_block *nb,
+                                       unsigned long val, void *data)
+{
+       struct cpufreq_freqs *freq = data;
+       int cpu = freq->cpu;
+       struct cpufreq_extents *extents;
+
+       if (freq->flags & CPUFREQ_CONST_LOOPS)
+               return NOTIFY_OK;
+
+       if (val != CPUFREQ_POSTCHANGE)
+               return NOTIFY_OK;
+
+       /* if dynamic load scale is disabled, set the load scale to 1.0 */
+       if (!hmp_data.freqinvar_load_scale_enabled) {
+               freq_scale[cpu].curr_scale = 1024;
+               return NOTIFY_OK;
+       }
+
+       extents = &freq_scale[cpu];
+       if (extents->flags & SCHED_LOAD_FREQINVAR_SINGLEFREQ) {
+               /* If our governor was recognised as a single-freq governor,
+                * use 1.0
+                */
+               extents->curr_scale = 1024;
+       } else {
+               extents->curr_scale = cpufreq_calc_scale(extents->min,
+                               extents->max, freq->new);
+       }
+
+       return NOTIFY_OK;
+}
+
+/* Called when the CPUFreq governor is changed.
+ * Only called for the CPUs which are actually changed by the
+ * userspace.
+ */
+static int cpufreq_policy_callback(struct notifier_block *nb,
+                                      unsigned long event, void *data)
+{
+       struct cpufreq_policy *policy = data;
+       struct cpufreq_extents *extents;
+       int cpu, singleFreq = 0;
+       static const char performance_governor[] = "performance";
+       static const char powersave_governor[] = "powersave";
+
+       if (event == CPUFREQ_START)
+               return 0;
+
+       if (event != CPUFREQ_INCOMPATIBLE)
+               return 0;
+
+       /* CPUFreq governors do not accurately report the range of
+        * CPU Frequencies they will choose from.
+        * We recognise performance and powersave governors as
+        * single-frequency only.
+        */
+       if (!strncmp(policy->governor->name, performance_governor,
+                       strlen(performance_governor)) ||
+               !strncmp(policy->governor->name, powersave_governor,
+                               strlen(powersave_governor)))
+               singleFreq = 1;
+
+       /* Make sure that all CPUs impacted by this policy are
+        * updated since we will only get a notification when the
+        * user explicitly changes the policy on a CPU.
+        */
+       for_each_cpu(cpu, policy->cpus) {
+               extents = &freq_scale[cpu];
+               extents->max = policy->max >> SCHED_FREQSCALE_SHIFT;
+               extents->min = policy->min >> SCHED_FREQSCALE_SHIFT;
+               if (!hmp_data.freqinvar_load_scale_enabled) {
+                       extents->curr_scale = 1024;
+               } else if (singleFreq) {
+                       extents->flags |= SCHED_LOAD_FREQINVAR_SINGLEFREQ;
+                       extents->curr_scale = 1024;
+               } else {
+                       extents->flags &= ~SCHED_LOAD_FREQINVAR_SINGLEFREQ;
+                       extents->curr_scale = cpufreq_calc_scale(extents->min,
+                                       extents->max, policy->cur);
+               }
+       }
+
+       return 0;
+}
+
+static struct notifier_block cpufreq_notifier = {
+       .notifier_call  = cpufreq_callback,
+};
+static struct notifier_block cpufreq_policy_notifier = {
+       .notifier_call  = cpufreq_policy_callback,
+};
+
+static int __init register_sched_cpufreq_notifier(void)
+{
+       int ret = 0;
+
+       /* init safe defaults since there are no policies at registration */
+       for (ret = 0; ret < CONFIG_NR_CPUS; ret++) {
+               /* safe defaults */
+               freq_scale[ret].max = 1024;
+               freq_scale[ret].min = 1024;
+               freq_scale[ret].curr_scale = 1024;
+       }
+
+       pr_info("sched: registering cpufreq notifiers for scale-invariant loads\n");
+       ret = cpufreq_register_notifier(&cpufreq_policy_notifier,
+                       CPUFREQ_POLICY_NOTIFIER);
+
+       if (ret != -EINVAL)
+               ret = cpufreq_register_notifier(&cpufreq_notifier,
+                       CPUFREQ_TRANSITION_NOTIFIER);
+
+       return ret;
+}
+
+core_initcall(register_sched_cpufreq_notifier);
+#endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */