sched: backport cpufreq hooks from 4.9-rc4

author Steve Muckle <smuckle@linaro.org>

Fri, 11 Nov 2016 22:04:43 +0000 (14:04 -0800)

committer Amit Pundir <amit.pundir@linaro.org>

Wed, 21 Jun 2017 11:04:04 +0000 (16:34 +0530)
author Steve Muckle <smuckle@linaro.org>
Fri, 11 Nov 2016 22:04:43 +0000 (14:04 -0800)
committer Amit Pundir <amit.pundir@linaro.org>
Wed, 21 Jun 2017 11:04:04 +0000 (16:34 +0530)
diff --git a/include/linux/sched.h b/include/linux/sched.h

index 79f70e5bae08971f1bfc869fdbcfcb98c321506d..bc23d15244db6c2ddc2e55e77741a69b709928dd 100644 (file)
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -3291,4 +3291,21 @@ static inline unsigned long rlimit_max(unsigned int limit)
         return task_rlimit_max(current, limit);
  }
  
+#define SCHED_CPUFREQ_RT        (1U << 0)
+#define SCHED_CPUFREQ_DL        (1U << 1)
+#define SCHED_CPUFREQ_IOWAIT    (1U << 2)
+
+#define SCHED_CPUFREQ_RT_DL     (SCHED_CPUFREQ_RT | SCHED_CPUFREQ_DL)
+
+#ifdef CONFIG_CPU_FREQ
+struct update_util_data {
+       void (*func)(struct update_util_data *data, u64 time, unsigned int flags);
+};
+
+void cpufreq_add_update_util_hook(int cpu, struct update_util_data *data,
+                       void (*func)(struct update_util_data *data, u64 time,
+                                    unsigned int flags));
+void cpufreq_remove_update_util_hook(int cpu);
+#endif /* CONFIG_CPU_FREQ */
+
  #endif
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile

index 623ce4bde0d5123a3960087e7f6c9bb6e6b11c25..f2d49474aab1251bc417384b1258dbf2f8e04d56 100644 (file)
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -21,4 +21,5 @@ obj-$(CONFIG_SCHEDSTATS) += stats.o
  obj-$(CONFIG_SCHED_DEBUG) += debug.o
  obj-$(CONFIG_SCHED_TUNE) += tune.o
  obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o
+obj-$(CONFIG_CPU_FREQ) += cpufreq.o
  obj-$(CONFIG_CPU_FREQ_GOV_SCHED) += cpufreq_sched.o
diff --git a/kernel/sched/cpufreq.c b/kernel/sched/cpufreq.c

new file mode 100644 (file)

index 0000000..dbc5144
--- /dev/null
+++ b/kernel/sched/cpufreq.c
@@ -0,0 +1,63 @@
+/*
+ * Scheduler code and data structures related to cpufreq.
+ *
+ * Copyright (C) 2016, Intel Corporation
+ * Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include "sched.h"
+
+DEFINE_PER_CPU(struct update_util_data *, cpufreq_update_util_data);
+
+/**
+ * cpufreq_add_update_util_hook - Populate the CPU's update_util_data pointer.
+ * @cpu: The CPU to set the pointer for.
+ * @data: New pointer value.
+ * @func: Callback function to set for the CPU.
+ *
+ * Set and publish the update_util_data pointer for the given CPU.
+ *
+ * The update_util_data pointer of @cpu is set to @data and the callback
+ * function pointer in the target struct update_util_data is set to @func.
+ * That function will be called by cpufreq_update_util() from RCU-sched
+ * read-side critical sections, so it must not sleep.  @data will always be
+ * passed to it as the first argument which allows the function to get to the
+ * target update_util_data structure and its container.
+ *
+ * The update_util_data pointer of @cpu must be NULL when this function is
+ * called or it will WARN() and return with no effect.
+ */
+void cpufreq_add_update_util_hook(int cpu, struct update_util_data *data,
+                       void (*func)(struct update_util_data *data, u64 time,
+                                    unsigned int flags))
+{
+       if (WARN_ON(!data || !func))
+               return;
+
+       if (WARN_ON(per_cpu(cpufreq_update_util_data, cpu)))
+               return;
+
+       data->func = func;
+       rcu_assign_pointer(per_cpu(cpufreq_update_util_data, cpu), data);
+}
+EXPORT_SYMBOL_GPL(cpufreq_add_update_util_hook);
+
+/**
+ * cpufreq_remove_update_util_hook - Clear the CPU's update_util_data pointer.
+ * @cpu: The CPU to clear the pointer for.
+ *
+ * Clear the update_util_data pointer for the given CPU.
+ *
+ * Callers must use RCU-sched callbacks to free any memory that might be
+ * accessed via the old update_util_data pointer or invoke synchronize_sched()
+ * right after this function to avoid use-after-free.
+ */
+void cpufreq_remove_update_util_hook(int cpu)
+{
+       rcu_assign_pointer(per_cpu(cpufreq_update_util_data, cpu), NULL);
+}
+EXPORT_SYMBOL_GPL(cpufreq_remove_update_util_hook);
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c

index f10b1cb255b25e94a525c320f60d37f50bcc5874..9ec7f19b5020c29de7effae31e7625de8a2c2cfb 100644 (file)
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -753,6 +753,9 @@ static void update_curr_dl(struct rq *rq)
         if (unlikely((s64)delta_exec <= 0))
                 return;
  
+       /* kick cpufreq (see the comment in kernel/sched/sched.h). */
+       cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_DL);
+
         schedstat_set(curr->se.statistics.exec_max,
                       max(curr->se.statistics.exec_max, delta_exec));
  
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c

index 6f353de3f3902aebb5c5097bcfc182c22bbab1c7..baba2d34c34e1d4dc860e16dc4c462d4b9714d83 100644 (file)
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2711,6 +2711,29 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
  static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
  #endif /* CONFIG_FAIR_GROUP_SCHED */
  
+static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
+{
+        if (&this_rq()->cfs == cfs_rq) {
+                /*
+                 * There are a few boundary cases this might miss but it should
+                 * get called often enough that that should (hopefully) not be
+                 * a real problem -- added to that it only calls on the local
+                 * CPU, so if we enqueue remotely we'll miss an update, but
+                 * the next tick/schedule should update.
+                 *
+                 * It will not get called when we go idle, because the idle
+                 * thread is a different class (!fair), nor will the utilization
+                 * number include things like RT tasks.
+                 *
+                 * As is, the util number is not freq-invariant (we'd have to
+                 * implement arch_scale_freq_capacity() for that).
+                 *
+                 * See cpu_util().
+                 */
+                cpufreq_update_util(rq_of(cfs_rq), 0);
+        }
+}
+
  static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
  
  /*
@@ -2731,10 +2754,11 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
  } while (0)
  
  /* Group cfs_rq's load_avg is used for task_h_load and update_cfs_share */
-static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
+static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq,
+                                        bool update_freq)
  {
         struct sched_avg *sa = &cfs_rq->avg;
-       int decayed, removed = 0;
+       int decayed, removed = 0, removed_util = 0;
  
         if (atomic_long_read(&cfs_rq->removed_load_avg)) {
                 s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
@@ -2747,6 +2771,7 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
                 long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0);
                 sub_positive(&sa->util_avg, r);
                 sub_positive(&sa->util_sum, r * LOAD_AVG_MAX);
+               removed_util = 1;
         }
  
         decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
@@ -2761,6 +2786,9 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
         if (cfs_rq == &rq_of(cfs_rq)->cfs)
                 trace_sched_load_avg_cpu(cpu_of(rq_of(cfs_rq)), cfs_rq);
  
+       if (update_freq && (decayed || removed_util))
+               cfs_rq_util_change(cfs_rq);
+
         return decayed || removed;
  }
  
@@ -2779,7 +2807,7 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg)
                           se->on_rq * scale_load_down(se->load.weight),
                           cfs_rq->curr == se, NULL);
  
-       if (update_cfs_rq_load_avg(now, cfs_rq) && update_tg)
+       if (update_cfs_rq_load_avg(now, cfs_rq, true) && update_tg)
                 update_tg_load_avg(cfs_rq, 0);
  
         if (entity_is_task(se))
@@ -2811,6 +2839,8 @@ skip_aging:
         cfs_rq->avg.load_sum += se->avg.load_sum;
         cfs_rq->avg.util_avg += se->avg.util_avg;
         cfs_rq->avg.util_sum += se->avg.util_sum;
+
+       cfs_rq_util_change(cfs_rq);
  }
  
  static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -2823,6 +2853,8 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
         sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum);
         sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
         sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
+
+       cfs_rq_util_change(cfs_rq);
  }
  
  /* Add the load generated by se into cfs_rq's load average */
@@ -2840,7 +2872,7 @@ enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
                         cfs_rq->curr == se, NULL);
         }
  
-       decayed = update_cfs_rq_load_avg(now, cfs_rq);
+       decayed = update_cfs_rq_load_avg(now, cfs_rq, !migrated);
  
         cfs_rq->runnable_load_avg += sa->load_avg;
         cfs_rq->runnable_load_sum += sa->load_sum;
@@ -2940,7 +2972,11 @@ static int idle_balance(struct rq *this_rq);
  
  #else /* CONFIG_SMP */
  
-static inline void update_load_avg(struct sched_entity *se, int update_tg) {}
+static inline void update_load_avg(struct sched_entity *se, int update_tg)
+{
+       cpufreq_update_util(rq_of(cfs_rq_of(se)), 0);
+}
+
  static inline void
  enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
  static inline void
@@ -4257,6 +4293,14 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
         int task_wakeup = flags & ENQUEUE_WAKEUP;
  #endif
  
+       /*
+        * If in_iowait is set, the code below may not trigger any cpufreq
+        * utilization updates, so do it here explicitly with the IOWAIT flag
+        * passed.
+        */
+       if (p->in_iowait)
+               cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_IOWAIT);
+
         for_each_sched_entity(se) {
                 if (se->on_rq)
                         break;
@@ -6923,7 +6967,8 @@ static void update_blocked_averages(int cpu)
                 if (throttled_hierarchy(cfs_rq))
                         continue;
  
-               if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq))
+               if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq,
+                                          true))
                         update_tg_load_avg(cfs_rq, 0);
         }
         raw_spin_unlock_irqrestore(&rq->lock, flags);
@@ -6984,7 +7029,7 @@ static inline void update_blocked_averages(int cpu)
  
         raw_spin_lock_irqsave(&rq->lock, flags);
         update_rq_clock(rq);
-       update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq);
+       update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true);
         raw_spin_unlock_irqrestore(&rq->lock, flags);
  }
  
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c

index 541b8494450e9152add97e3eea8d36f78e200c33..6bb51d62dca4f36af2f541a180338f6cc21c988d 100644 (file)
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1002,6 +1002,9 @@ static void update_curr_rt(struct rq *rq)
         if (unlikely((s64)delta_exec <= 0))
                 return;
  
+       /* Kick cpufreq (see the comment in kernel/sched/sched.h). */
+       cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_RT);
+
         schedstat_set(curr->se.statistics.exec_max,
                       max(curr->se.statistics.exec_max, delta_exec));
  
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h

index 0386a2cdb0089f6638322952f619a8672ac5ffe6..237b0bcdd30479dcf8ccf54f3301125faf2db4cd 100644 (file)
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2038,3 +2038,55 @@ static inline void account_reset_rq(struct rq *rq)
         rq->prev_steal_time_rq = 0;
  #endif
  }
+
+#ifdef CONFIG_CPU_FREQ
+DECLARE_PER_CPU(struct update_util_data *, cpufreq_update_util_data);
+
+/**
+ * cpufreq_update_util - Take a note about CPU utilization changes.
+ * @rq: Runqueue to carry out the update for.
+ * @flags: Update reason flags.
+ *
+ * This function is called by the scheduler on the CPU whose utilization is
+ * being updated.
+ *
+ * It can only be called from RCU-sched read-side critical sections.
+ *
+ * The way cpufreq is currently arranged requires it to evaluate the CPU
+ * performance state (frequency/voltage) on a regular basis to prevent it from
+ * being stuck in a completely inadequate performance level for too long.
+ * That is not guaranteed to happen if the updates are only triggered from CFS,
+ * though, because they may not be coming in if RT or deadline tasks are active
+ * all the time (or there are RT and DL tasks only).
+ *
+ * As a workaround for that issue, this function is called by the RT and DL
+ * sched classes to trigger extra cpufreq updates to prevent it from stalling,
+ * but that really is a band-aid.  Going forward it should be replaced with
+ * solutions targeted more specifically at RT and DL tasks.
+ */
+static inline void cpufreq_update_util(struct rq *rq, unsigned int flags)
+{
+        struct update_util_data *data;
+
+        data = rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data));
+        if (data)
+                data->func(data, rq_clock(rq), flags);
+}
+
+static inline void cpufreq_update_this_cpu(struct rq *rq, unsigned int flags)
+{
+        if (cpu_of(rq) == smp_processor_id())
+                cpufreq_update_util(rq, flags);
+}
+#else
+static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {}
+static inline void cpufreq_update_this_cpu(struct rq *rq, unsigned int flags) {}
+#endif /* CONFIG_CPU_FREQ */
+
+#ifdef arch_scale_freq_capacity
+#ifndef arch_scale_freq_invariant
+#define arch_scale_freq_invariant()     (true)
+#endif
+#else /* arch_scale_freq_capacity */
+#define arch_scale_freq_invariant()     (false)
+#endif
author	Steve Muckle <smuckle@linaro.org>
	Fri, 11 Nov 2016 22:04:43 +0000 (14:04 -0800)
committer	Amit Pundir <amit.pundir@linaro.org>
	Wed, 21 Jun 2017 11:04:04 +0000 (16:34 +0530)
include/linux/sched.h		patch \| blob \| history
kernel/sched/Makefile		patch \| blob \| history
kernel/sched/cpufreq.c	[new file with mode: 0644]	patch \| blob
kernel/sched/deadline.c		patch \| blob \| history
kernel/sched/fair.c		patch \| blob \| history
kernel/sched/rt.c		patch \| blob \| history
kernel/sched/sched.h		patch \| blob \| history