sched: introduce se->vruntime
authorIngo Molnar <mingo@elte.hu>
Mon, 15 Oct 2007 15:00:04 +0000 (17:00 +0200)
committerIngo Molnar <mingo@elte.hu>
Mon, 15 Oct 2007 15:00:04 +0000 (17:00 +0200)
introduce se->vruntime as a sum of weighted delta-exec's, and use that
as the key into the tree.

the idea to use absolute virtual time as the basic metric of scheduling
has been first raised by William Lee Irwin, advanced by Tong Li and first
prototyped by Roman Zippel in the "Really Fair Scheduler" (RFS) patchset.

also see:

   http://lkml.org/lkml/2007/9/2/76

for a simpler variant of this patch.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Mike Galbraith <efault@gmx.de>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
include/linux/sched.h
kernel/sched.c
kernel/sched_fair.c

index 3c38a5040e8f30f7c028a836f9d0e4b9d5c0a8dc..5e5c457fba86466cae44e55f9b730073a6f34971 100644 (file)
@@ -902,6 +902,7 @@ struct sched_entity {
 
        u64                     exec_start;
        u64                     sum_exec_runtime;
+       u64                     vruntime;
        u64                     prev_sum_exec_runtime;
        u64                     wait_start_fair;
        u64                     sleep_start_fair;
index 992a1fae72a71e1798a2aa355839b76659437ca1..8f80ebafacc124fb76dd26e367315c071729d4b9 100644 (file)
@@ -182,6 +182,7 @@ struct cfs_rq {
 
        s64 fair_clock;
        u64 exec_clock;
+       u64 min_vruntime;
        s64 wait_runtime;
        u64 sleeper_bonus;
        unsigned long wait_runtime_overruns, wait_runtime_underruns;
index b46f8078e78f86c39b1edd34fa7ae1915fb6c4a5..a2af09cb6a70384ff58ad7ac00dd6722e35f54ea 100644 (file)
@@ -92,14 +92,16 @@ unsigned int sysctl_sched_runtime_limit __read_mostly;
  */
 enum {
        SCHED_FEAT_FAIR_SLEEPERS        = 1,
-       SCHED_FEAT_SLEEPER_AVG          = 2,
-       SCHED_FEAT_SLEEPER_LOAD_AVG     = 4,
-       SCHED_FEAT_START_DEBIT          = 8,
-       SCHED_FEAT_SKIP_INITIAL         = 16,
+       SCHED_FEAT_NEW_FAIR_SLEEPERS    = 2,
+       SCHED_FEAT_SLEEPER_AVG          = 4,
+       SCHED_FEAT_SLEEPER_LOAD_AVG     = 8,
+       SCHED_FEAT_START_DEBIT          = 16,
+       SCHED_FEAT_SKIP_INITIAL         = 32,
 };
 
 const_debug unsigned int sysctl_sched_features =
-               SCHED_FEAT_FAIR_SLEEPERS        *1 |
+               SCHED_FEAT_FAIR_SLEEPERS        *0 |
+               SCHED_FEAT_NEW_FAIR_SLEEPERS    *1 |
                SCHED_FEAT_SLEEPER_AVG          *0 |
                SCHED_FEAT_SLEEPER_LOAD_AVG     *1 |
                SCHED_FEAT_START_DEBIT          *1 |
@@ -145,6 +147,19 @@ static inline struct task_struct *task_of(struct sched_entity *se)
  * Scheduling class tree data structure manipulation methods:
  */
 
+static inline void
+set_leftmost(struct cfs_rq *cfs_rq, struct rb_node *leftmost)
+{
+       struct sched_entity *se;
+
+       cfs_rq->rb_leftmost = leftmost;
+       if (leftmost) {
+               se = rb_entry(leftmost, struct sched_entity, run_node);
+               cfs_rq->min_vruntime = max(se->vruntime,
+                                               cfs_rq->min_vruntime);
+       }
+}
+
 /*
  * Enqueue an entity into the rb-tree:
  */
@@ -180,7 +195,7 @@ __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
         * used):
         */
        if (leftmost)
-               cfs_rq->rb_leftmost = &se->run_node;
+               set_leftmost(cfs_rq, &se->run_node);
 
        rb_link_node(&se->run_node, parent, link);
        rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
@@ -195,7 +210,8 @@ static void
 __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
        if (cfs_rq->rb_leftmost == &se->run_node)
-               cfs_rq->rb_leftmost = rb_next(&se->run_node);
+               set_leftmost(cfs_rq, rb_next(&se->run_node));
+
        rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
        update_load_sub(&cfs_rq->load, se->load.weight);
        cfs_rq->nr_running--;
@@ -336,7 +352,7 @@ static inline void
 __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
              unsigned long delta_exec)
 {
-       unsigned long delta, delta_fair, delta_mine;
+       unsigned long delta, delta_fair, delta_mine, delta_exec_weighted;
        struct load_weight *lw = &cfs_rq->load;
        unsigned long load = lw->weight;
 
@@ -344,6 +360,12 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
 
        curr->sum_exec_runtime += delta_exec;
        cfs_rq->exec_clock += delta_exec;
+       delta_exec_weighted = delta_exec;
+       if (unlikely(curr->load.weight != NICE_0_LOAD)) {
+               delta_exec_weighted = calc_delta_fair(delta_exec_weighted,
+                                                       &curr->load);
+       }
+       curr->vruntime += delta_exec_weighted;
 
        if (unlikely(!load))
                return;
@@ -413,8 +435,6 @@ calc_weighted(unsigned long delta, struct sched_entity *se)
  */
 static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-       s64 key;
-
        /*
         * Are we enqueueing a waiting task? (for current tasks
         * a dequeue/enqueue event is a NOP)
@@ -424,28 +444,7 @@ static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
        /*
         * Update the key:
         */
-       key = cfs_rq->fair_clock;
-
-       /*
-        * Optimize the common nice 0 case:
-        */
-       if (likely(se->load.weight == NICE_0_LOAD)) {
-               key -= se->wait_runtime;
-       } else {
-               u64 tmp;
-
-               if (se->wait_runtime < 0) {
-                       tmp = -se->wait_runtime;
-                       key += (tmp * se->load.inv_weight) >>
-                                       (WMULT_SHIFT - NICE_0_SHIFT);
-               } else {
-                       tmp = se->wait_runtime;
-                       key -= (tmp * se->load.inv_weight) >>
-                                       (WMULT_SHIFT - NICE_0_SHIFT);
-               }
-       }
-
-       se->fair_key = key;
+       se->fair_key = se->vruntime;
 }
 
 /*
@@ -615,8 +614,22 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup)
         */
        update_curr(cfs_rq);
 
-       if (wakeup)
+       if (wakeup) {
+               u64 min_runtime, latency;
+
+               min_runtime = cfs_rq->min_vruntime;
+               min_runtime += sysctl_sched_latency/2;
+
+               if (sched_feat(NEW_FAIR_SLEEPERS)) {
+                       latency = calc_weighted(sysctl_sched_latency, se);
+                       if (min_runtime > latency)
+                               min_runtime -= latency;
+               }
+
+               se->vruntime = max(se->vruntime, min_runtime);
+
                enqueue_sleeper(cfs_rq, se);
+       }
 
        update_stats_enqueue(cfs_rq, se);
        __enqueue_entity(cfs_rq, se);
@@ -1155,6 +1168,8 @@ static void task_new_fair(struct rq *rq, struct task_struct *p)
        if (sched_feat(START_DEBIT))
                se->wait_runtime = -(sched_granularity(cfs_rq) / 2);
 
+       se->vruntime = cfs_rq->min_vruntime;
+       update_stats_enqueue(cfs_rq, se);
        __enqueue_entity(cfs_rq, se);
        resched_task(rq->curr);
 }