diff options
Diffstat (limited to 'kernel/sched.c')
-rw-r--r-- | kernel/sched.c | 786 |
1 files changed, 481 insertions, 305 deletions
diff --git a/kernel/sched.c b/kernel/sched.c index 3c11ae0a948d..a675fd64a036 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -542,7 +542,6 @@ struct rq { struct load_weight load; unsigned long nr_load_updates; u64 nr_switches; - u64 nr_migrations_in; struct cfs_rq cfs; struct rt_rq rt; @@ -591,6 +590,8 @@ struct rq { u64 rt_avg; u64 age_stamp; + u64 idle_stamp; + u64 avg_idle; #endif /* calc_load related fields */ @@ -814,6 +815,7 @@ const_debug unsigned int sysctl_sched_nr_migrate = 32; * default: 0.25ms */ unsigned int sysctl_sched_shares_ratelimit = 250000; +unsigned int normalized_sysctl_sched_shares_ratelimit = 250000; /* * Inject some fuzzyness into changing the per-cpu group shares @@ -940,14 +942,25 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ /* + * Check whether the task is waking, we use this to synchronize ->cpus_allowed + * against ttwu(). + */ +static inline int task_is_waking(struct task_struct *p) +{ + return unlikely(p->state == TASK_WAKING); +} + +/* * __task_rq_lock - lock the runqueue a given task resides on. * Must be called interrupts disabled. */ static inline struct rq *__task_rq_lock(struct task_struct *p) __acquires(rq->lock) { + struct rq *rq; + for (;;) { - struct rq *rq = task_rq(p); + rq = task_rq(p); spin_lock(&rq->lock); if (likely(rq == task_rq(p))) return rq; @@ -1258,6 +1271,12 @@ static void sched_avg_update(struct rq *rq) s64 period = sched_avg_period(); while ((s64)(rq->clock - rq->age_stamp) > period) { + /* + * Inline assembly required to prevent the compiler + * optimising this loop into a divmod call. + * See __iter_div_u64_rem() for another example of this. + */ + asm("" : "+rm" (rq->age_stamp)); rq->age_stamp += period; rq->rt_avg /= 2; } @@ -1614,7 +1633,7 @@ static void update_group_shares_cpu(struct task_group *tg, int cpu, */ static int tg_shares_up(struct task_group *tg, void *data) { - unsigned long weight, rq_weight = 0, shares = 0; + unsigned long weight, rq_weight = 0, sum_weight = 0, shares = 0; unsigned long *usd_rq_weight; struct sched_domain *sd = data; unsigned long flags; @@ -1630,6 +1649,7 @@ static int tg_shares_up(struct task_group *tg, void *data) weight = tg->cfs_rq[i]->load.weight; usd_rq_weight[i] = weight; + rq_weight += weight; /* * If there are currently no tasks on the cpu pretend there * is one of average load so that when a new task gets to @@ -1638,10 +1658,13 @@ static int tg_shares_up(struct task_group *tg, void *data) if (!weight) weight = NICE_0_LOAD; - rq_weight += weight; + sum_weight += weight; shares += tg->cfs_rq[i]->shares; } + if (!rq_weight) + rq_weight = sum_weight; + if ((!shares && rq_weight) || shares > tg->shares) shares = tg->shares; @@ -1708,9 +1731,6 @@ static void update_shares_locked(struct rq *rq, struct sched_domain *sd) static void update_h_load(long cpu) { - if (root_task_group_empty()) - return; - walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); } @@ -1810,6 +1830,21 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) #endif static void calc_load_account_active(struct rq *this_rq); +static void update_sysctl(void); + +static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) +{ + set_task_rq(p, cpu); +#ifdef CONFIG_SMP + /* + * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be + * successfuly executed on another CPU. We must ensure that updates of + * per-task data have been completed by this moment. + */ + smp_wmb(); + task_thread_info(p)->cpu = cpu; +#endif +} #include "sched_stats.h" #include "sched_idletask.c" @@ -1860,13 +1895,14 @@ static void update_avg(u64 *avg, u64 sample) *avg += diff >> 3; } -static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup) +static void +enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, bool head) { if (wakeup) p->se.start_runtime = p->se.sum_exec_runtime; sched_info_queued(p); - p->sched_class->enqueue_task(rq, p, wakeup); + p->sched_class->enqueue_task(rq, p, wakeup, head); p->se.on_rq = 1; } @@ -1942,7 +1978,7 @@ static void activate_task(struct rq *rq, struct task_struct *p, int wakeup) if (task_contributes_to_load(p)) rq->nr_uninterruptible--; - enqueue_task(rq, p, wakeup); + enqueue_task(rq, p, wakeup, false); inc_nr_running(rq); } @@ -1967,20 +2003,6 @@ inline int task_curr(const struct task_struct *p) return cpu_curr(task_cpu(p)) == p; } -static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) -{ - set_task_rq(p, cpu); -#ifdef CONFIG_SMP - /* - * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be - * successfuly executed on another CPU. We must ensure that updates of - * per-task data have been completed by this moment. - */ - smp_wmb(); - task_thread_info(p)->cpu = cpu; -#endif -} - static inline void check_class_changed(struct rq *rq, struct task_struct *p, const struct sched_class *prev_class, int oldprio, int running) @@ -2007,21 +2029,15 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p, */ void kthread_bind(struct task_struct *p, unsigned int cpu) { - struct rq *rq = cpu_rq(cpu); - unsigned long flags; - /* Must have done schedule() in kthread() before we set_task_cpu */ if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE)) { WARN_ON(1); return; } - spin_lock_irqsave(&rq->lock, flags); - set_task_cpu(p, cpu); p->cpus_allowed = cpumask_of_cpu(cpu); p->rt.nr_cpus_allowed = 1; p->flags |= PF_THREAD_BOUND; - spin_unlock_irqrestore(&rq->lock, flags); } EXPORT_SYMBOL(kthread_bind); @@ -2034,6 +2050,9 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) { s64 delta; + if (p->sched_class != &fair_sched_class) + return 0; + /* * Buddy candidates are cache hot: */ @@ -2042,9 +2061,6 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) &p->se == cfs_rq_of(&p->se)->last)) return 1; - if (p->sched_class != &fair_sched_class) - return 0; - if (sysctl_sched_migration_cost == -1) return 1; if (sysctl_sched_migration_cost == 0) @@ -2059,35 +2075,23 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) void set_task_cpu(struct task_struct *p, unsigned int new_cpu) { int old_cpu = task_cpu(p); - struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu); - struct cfs_rq *old_cfsrq = task_cfs_rq(p), - *new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu); - u64 clock_offset; - clock_offset = old_rq->clock - new_rq->clock; +#ifdef CONFIG_SCHED_DEBUG + /* + * We should never call set_task_cpu() on a blocked task, + * ttwu() will sort out the placement. + */ + WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && + !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE)); +#endif trace_sched_migrate_task(p, new_cpu); -#ifdef CONFIG_SCHEDSTATS - if (p->se.wait_start) - p->se.wait_start -= clock_offset; - if (p->se.sleep_start) - p->se.sleep_start -= clock_offset; - if (p->se.block_start) - p->se.block_start -= clock_offset; -#endif if (old_cpu != new_cpu) { p->se.nr_migrations++; - new_rq->nr_migrations_in++; -#ifdef CONFIG_SCHEDSTATS - if (task_hot(p, old_rq->clock, NULL)) - schedstat_inc(p, se.nr_forced2_migrations); -#endif perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 1, NULL, 0); } - p->se.vruntime -= old_cfsrq->min_vruntime - - new_cfsrq->min_vruntime; __set_task_cpu(p, new_cpu); } @@ -2112,12 +2116,10 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req) /* * If the task is not on a runqueue (and not running), then - * it is sufficient to simply update the task's cpu field. + * the next wake-up will properly place the task. */ - if (!p->se.on_rq && !task_running(rq, p)) { - set_task_cpu(p, dest_cpu); + if (!p->se.on_rq && !task_running(rq, p)) return 0; - } init_completion(&req->done); req->task = p; @@ -2322,6 +2324,69 @@ void task_oncpu_function_call(struct task_struct *p, preempt_enable(); } +#ifdef CONFIG_SMP +/* + * ->cpus_allowed is protected by either TASK_WAKING or rq->lock held. + */ +static int select_fallback_rq(int cpu, struct task_struct *p) +{ + int dest_cpu; + const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu)); + + /* Look for allowed, online CPU in same node. */ + for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask) + if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) + return dest_cpu; + + /* Any allowed, online CPU? */ + dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_active_mask); + if (dest_cpu < nr_cpu_ids) + return dest_cpu; + + /* No more Mr. Nice Guy. */ + if (unlikely(dest_cpu >= nr_cpu_ids)) { + dest_cpu = cpuset_cpus_allowed_fallback(p); + /* + * Don't tell them about moving exiting tasks or + * kernel threads (both mm NULL), since they never + * leave kernel. + */ + if (p->mm && printk_ratelimit()) { + printk(KERN_INFO "process %d (%s) no " + "longer affine to cpu%d\n", + task_pid_nr(p), p->comm, cpu); + } + } + + return dest_cpu; +} + +/* + * The caller (fork, wakeup) owns TASK_WAKING, ->cpus_allowed is stable. + */ +static inline +int select_task_rq(struct rq *rq, struct task_struct *p, int sd_flags, int wake_flags) +{ + int cpu = p->sched_class->select_task_rq(rq, p, sd_flags, wake_flags); + + /* + * In order not to call set_task_cpu() on a blocking task we need + * to rely on ttwu() to place the task on a valid ->cpus_allowed + * cpu. + * + * Since this is common to all placement strategies, this lives here. + * + * [ this allows ->select_task() to simply return task_cpu(p) and + * not worry about this generic constraint ] + */ + if (unlikely(!cpumask_test_cpu(cpu, &p->cpus_allowed) || + !cpu_online(cpu))) + cpu = select_fallback_rq(task_cpu(p), p); + + return cpu; +} +#endif + /*** * try_to_wake_up - wake up a thread * @p: the to-be-woken-up thread @@ -2370,22 +2435,34 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, * * First fix up the nr_uninterruptible count: */ - if (task_contributes_to_load(p)) - rq->nr_uninterruptible--; + if (task_contributes_to_load(p)) { + if (likely(cpu_online(orig_cpu))) + rq->nr_uninterruptible--; + else + this_rq()->nr_uninterruptible--; + } p->state = TASK_WAKING; - task_rq_unlock(rq, &flags); - cpu = p->sched_class->select_task_rq(p, SD_BALANCE_WAKE, wake_flags); + if (p->sched_class->task_waking) + p->sched_class->task_waking(rq, p); + + cpu = select_task_rq(rq, p, SD_BALANCE_WAKE, wake_flags); if (cpu != orig_cpu) set_task_cpu(p, cpu); + __task_rq_unlock(rq); - rq = task_rq_lock(p, &flags); - - if (rq != orig_rq) - update_rq_clock(rq); + rq = cpu_rq(cpu); + spin_lock(&rq->lock); + update_rq_clock(rq); + /* + * We migrated the task without holding either rq->lock, however + * since the task is not on the task list itself, nobody else + * will try and migrate the task, hence the rq should match the + * cpu we just moved it to. + */ + WARN_ON(task_cpu(p) != cpu); WARN_ON(p->state != TASK_WAKING); - cpu = task_cpu(p); #ifdef CONFIG_SCHEDSTATS schedstat_inc(rq, ttwu_count); @@ -2438,8 +2515,19 @@ out_running: p->state = TASK_RUNNING; #ifdef CONFIG_SMP - if (p->sched_class->task_wake_up) - p->sched_class->task_wake_up(rq, p); + if (p->sched_class->task_woken) + p->sched_class->task_woken(rq, p); + + if (unlikely(rq->idle_stamp)) { + u64 delta = rq->clock - rq->idle_stamp; + u64 max = 2*sysctl_sched_migration_cost; + + if (delta > max) + rq->avg_idle = max; + else + update_avg(&rq->avg_idle, delta); + rq->idle_stamp = 0; + } #endif out: task_rq_unlock(rq, &flags); @@ -2508,7 +2596,6 @@ static void __sched_fork(struct task_struct *p) p->se.nr_failed_migrations_running = 0; p->se.nr_failed_migrations_hot = 0; p->se.nr_forced_migrations = 0; - p->se.nr_forced2_migrations = 0; p->se.nr_wakeups = 0; p->se.nr_wakeups_sync = 0; @@ -2529,14 +2616,6 @@ static void __sched_fork(struct task_struct *p) #ifdef CONFIG_PREEMPT_NOTIFIERS INIT_HLIST_HEAD(&p->preempt_notifiers); #endif - - /* - * We mark the process as running here, but have not actually - * inserted it onto the runqueue yet. This guarantees that - * nobody will actually run it, and a signal or other external - * event cannot wake it up and insert it on the runqueue either. - */ - p->state = TASK_RUNNING; } /* @@ -2547,6 +2626,12 @@ void sched_fork(struct task_struct *p, int clone_flags) int cpu = get_cpu(); __sched_fork(p); + /* + * We mark the process as running here. This guarantees that + * nobody will actually run it, and a signal or other external + * event cannot wake it up and insert it on the runqueue either. + */ + p->state = TASK_RUNNING; /* * Revert to default priority/policy on fork if requested. @@ -2578,9 +2663,9 @@ void sched_fork(struct task_struct *p, int clone_flags) if (!rt_prio(p->prio)) p->sched_class = &fair_sched_class; -#ifdef CONFIG_SMP - cpu = p->sched_class->select_task_rq(p, SD_BALANCE_FORK, 0); -#endif + if (p->sched_class->task_fork) + p->sched_class->task_fork(p); + set_task_cpu(p, cpu); #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) @@ -2610,28 +2695,38 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) { unsigned long flags; struct rq *rq; + int cpu = get_cpu(); +#ifdef CONFIG_SMP rq = task_rq_lock(p, &flags); - BUG_ON(p->state != TASK_RUNNING); - update_rq_clock(rq); + p->state = TASK_WAKING; - if (!p->sched_class->task_new || !current->se.on_rq) { - activate_task(rq, p, 0); - } else { - /* - * Let the scheduling class do new task startup - * management (if any): - */ - p->sched_class->task_new(rq, p); - inc_nr_running(rq); - } + /* + * Fork balancing, do it here and not earlier because: + * - cpus_allowed can change in the fork path + * - any previously selected cpu might disappear through hotplug + * + * We set TASK_WAKING so that select_task_rq() can drop rq->lock + * without people poking at ->cpus_allowed. + */ + cpu = select_task_rq(rq, p, SD_BALANCE_FORK, 0); + set_task_cpu(p, cpu); + + p->state = TASK_RUNNING; + task_rq_unlock(rq, &flags); +#endif + + rq = task_rq_lock(p, &flags); + update_rq_clock(rq); + activate_task(rq, p, 0); trace_sched_wakeup_new(rq, p, 1); check_preempt_curr(rq, p, WF_FORK); #ifdef CONFIG_SMP - if (p->sched_class->task_wake_up) - p->sched_class->task_wake_up(rq, p); + if (p->sched_class->task_woken) + p->sched_class->task_woken(rq, p); #endif task_rq_unlock(rq, &flags); + put_cpu(); } #ifdef CONFIG_PREEMPT_NOTIFIERS @@ -3018,15 +3113,6 @@ static void calc_load_account_active(struct rq *this_rq) } /* - * Externally visible per-cpu scheduler statistics: - * cpu_nr_migrations(cpu) - number of migrations into that cpu - */ -u64 cpu_nr_migrations(int cpu) -{ - return cpu_rq(cpu)->nr_migrations_in; -} - -/* * Update rq->cpu_load[] statistics. This function is usually called every * scheduler tick (TICK_NSEC). */ @@ -3108,24 +3194,28 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2) } /* - * If dest_cpu is allowed for this process, migrate the task to it. - * This is accomplished by forcing the cpu_allowed mask to only - * allow dest_cpu, which will force the cpu onto dest_cpu. Then - * the cpu_allowed mask is restored. + * sched_exec - execve() is a valuable balancing opportunity, because at + * this point the task has the smallest effective memory and cache footprint. */ -static void sched_migrate_task(struct task_struct *p, int dest_cpu) +void sched_exec(void) { + struct task_struct *p = current; struct migration_req req; unsigned long flags; struct rq *rq; + int dest_cpu; rq = task_rq_lock(p, &flags); - if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed) - || unlikely(!cpu_active(dest_cpu))) - goto out; + dest_cpu = p->sched_class->select_task_rq(rq, p, SD_BALANCE_EXEC, 0); + if (dest_cpu == smp_processor_id()) + goto unlock; - /* force the process onto the specified CPU */ - if (migrate_task(p, dest_cpu, &req)) { + /* + * select_task_rq() can race against ->cpus_allowed + */ + if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) && + likely(cpu_active(dest_cpu)) && + migrate_task(p, dest_cpu, &req)) { /* Need to wait for migration thread (might exit: take ref). */ struct task_struct *mt = rq->migration_thread; @@ -3137,24 +3227,11 @@ static void sched_migrate_task(struct task_struct *p, int dest_cpu) return; } -out: +unlock: task_rq_unlock(rq, &flags); } /* - * sched_exec - execve() is a valuable balancing opportunity, because at - * this point the task has the smallest effective memory and cache footprint. - */ -void sched_exec(void) -{ - int new_cpu, this_cpu = get_cpu(); - new_cpu = current->sched_class->select_task_rq(current, SD_BALANCE_EXEC, 0); - put_cpu(); - if (new_cpu != this_cpu) - sched_migrate_task(current, new_cpu); -} - -/* * pull_task - move a task from a remote runqueue to the local runqueue. * Both runqueues must be locked. */ @@ -3164,10 +3241,6 @@ static void pull_task(struct rq *src_rq, struct task_struct *p, deactivate_task(src_rq, p, 0); set_task_cpu(p, this_cpu); activate_task(this_rq, p, 0); - /* - * Note that idle threads have a prio of MAX_PRIO, for this test - * to be always true for them. - */ check_preempt_curr(this_rq, p, 0); } @@ -3391,6 +3464,7 @@ struct sd_lb_stats { unsigned long max_load; unsigned long busiest_load_per_task; unsigned long busiest_nr_running; + unsigned long busiest_group_capacity; int group_imb; /* Is there imbalance in this sd */ #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) @@ -3604,7 +3678,7 @@ unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu) unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu) { - unsigned long weight = cpumask_weight(sched_domain_span(sd)); + unsigned long weight = sd->span_weight; unsigned long smt_gain = sd->smt_gain; smt_gain /= weight; @@ -3637,7 +3711,7 @@ unsigned long scale_rt_power(int cpu) static void update_cpu_power(struct sched_domain *sd, int cpu) { - unsigned long weight = cpumask_weight(sched_domain_span(sd)); + unsigned long weight = sd->span_weight; unsigned long power = SCHED_LOAD_SCALE; struct sched_group *sdg = sd->groups; @@ -3710,8 +3784,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, unsigned long load, max_cpu_load, min_cpu_load; int i; unsigned int balance_cpu = -1, first_idle_cpu = 0; - unsigned long sum_avg_load_per_task; - unsigned long avg_load_per_task; + unsigned long avg_load_per_task = 0; if (local_group) { balance_cpu = group_first_cpu(group); @@ -3720,7 +3793,6 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, } /* Tally up the load of all CPUs in the group */ - sum_avg_load_per_task = avg_load_per_task = 0; max_cpu_load = 0; min_cpu_load = ~0UL; @@ -3750,7 +3822,6 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, sgs->sum_nr_running += rq->nr_running; sgs->sum_weighted_load += weighted_cpuload(i); - sum_avg_load_per_task += cpu_avg_load_per_task(i); } /* @@ -3768,7 +3839,6 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, /* Adjust by relative CPU power of the group */ sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power; - /* * Consider the group unbalanced when the imbalance is larger * than the average weight of two tasks. @@ -3778,8 +3848,8 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, * normalized nr_running number somewhere that negates * the hierarchy? */ - avg_load_per_task = (sum_avg_load_per_task * SCHED_LOAD_SCALE) / - group->cpu_power; + if (sgs->sum_nr_running) + avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task) sgs->group_imb = 1; @@ -3848,6 +3918,7 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, sds->max_load = sgs.avg_load; sds->busiest = group; sds->busiest_nr_running = sgs.sum_nr_running; + sds->busiest_group_capacity = sgs.group_capacity; sds->busiest_load_per_task = sgs.sum_weighted_load; sds->group_imb = sgs.group_imb; } @@ -3870,6 +3941,7 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds, { unsigned long tmp, pwr_now = 0, pwr_move = 0; unsigned int imbn = 2; + unsigned long scaled_busy_load_per_task; if (sds->this_nr_running) { sds->this_load_per_task /= sds->this_nr_running; @@ -3880,8 +3952,12 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds, sds->this_load_per_task = cpu_avg_load_per_task(this_cpu); - if (sds->max_load - sds->this_load + sds->busiest_load_per_task >= - sds->busiest_load_per_task * imbn) { + scaled_busy_load_per_task = sds->busiest_load_per_task + * SCHED_LOAD_SCALE; + scaled_busy_load_per_task /= sds->busiest->cpu_power; + + if (sds->max_load - sds->this_load + scaled_busy_load_per_task >= + (scaled_busy_load_per_task * imbn)) { *imbalance = sds->busiest_load_per_task; return; } @@ -3932,7 +4008,14 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds, static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, unsigned long *imbalance) { - unsigned long max_pull; + unsigned long max_pull, load_above_capacity = ~0UL; + + sds->busiest_load_per_task /= sds->busiest_nr_running; + if (sds->group_imb) { + sds->busiest_load_per_task = + min(sds->busiest_load_per_task, sds->avg_load); + } + /* * In the presence of smp nice balancing, certain scenarios can have * max load less than avg load(as we skip the groups at or below @@ -3943,9 +4026,29 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, return fix_small_imbalance(sds, this_cpu, imbalance); } - /* Don't want to pull so many tasks that a group would go idle */ - max_pull = min(sds->max_load - sds->avg_load, - sds->max_load - sds->busiest_load_per_task); + if (!sds->group_imb) { + /* + * Don't want to pull so many tasks that a group would go idle. + */ + load_above_capacity = (sds->busiest_nr_running - + sds->busiest_group_capacity); + + load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_LOAD_SCALE); + + load_above_capacity /= sds->busiest->cpu_power; + } + + /* + * We're trying to get all the cpus to the average_load, so we don't + * want to push ourselves above the average load, nor do we wish to + * reduce the max loaded cpu below the average load. At the same time, + * we also don't want to reduce the group load below the group capacity + * (so that we can implement power-savings policies etc). Thus we look + * for the minimum possible imbalance. + * Be careful of negative numbers as they'll appear as very large values + * with unsigned longs. + */ + max_pull = min(sds->max_load - sds->avg_load, load_above_capacity); /* How much load to actually move to equalise the imbalance */ *imbalance = min(max_pull * sds->busiest->cpu_power, @@ -4013,7 +4116,6 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, * 4) This group is more busy than the avg busieness at this * sched_domain. * 5) The imbalance is within the specified limit. - * 6) Any rebalance would lead to ping-pong */ if (balance && !(*balance)) goto ret; @@ -4032,25 +4134,6 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load) goto out_balanced; - sds.busiest_load_per_task /= sds.busiest_nr_running; - if (sds.group_imb) - sds.busiest_load_per_task = - min(sds.busiest_load_per_task, sds.avg_load); - - /* - * We're trying to get all the cpus to the average_load, so we don't - * want to push ourselves above the average load, nor do we wish to - * reduce the max loaded cpu below the average load, as either of these - * actions would just result in more rebalancing later, and ping-pong - * tasks around. Thus we look for the minimum possible imbalance. - * Negative imbalances (*we* are more loaded than anyone else) will - * be counted as no imbalance for these purposes -- we can't fix that - * by pulling tasks to us. Be careful of negative numbers as they'll - * appear as very large values with unsigned longs. - */ - if (sds.max_load <= sds.busiest_load_per_task) - goto out_balanced; - /* Looks like there is an imbalance. Compute it */ calculate_imbalance(&sds, this_cpu, imbalance); return sds.busiest; @@ -4087,12 +4170,23 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, continue; rq = cpu_rq(i); - wl = weighted_cpuload(i) * SCHED_LOAD_SCALE; - wl /= power; + wl = weighted_cpuload(i); + /* + * When comparing with imbalance, use weighted_cpuload() + * which is not scaled with the cpu power. + */ if (capacity && rq->nr_running == 1 && wl > imbalance) continue; + /* + * For the load comparisons with the other cpu's, consider + * the weighted_cpuload() scaled with the cpu power, so that + * the load can be moved away from the cpu that is potentially + * running at a lower capacity. + */ + wl = (wl * SCHED_LOAD_SCALE) / power; + if (wl > max_load) { max_load = wl; busiest = rq; @@ -4126,7 +4220,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, unsigned long flags; struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); - cpumask_setall(cpus); + cpumask_copy(cpus, cpu_active_mask); /* * When power savings policy is enabled for the parent domain, idle @@ -4289,7 +4383,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd) int all_pinned = 0; struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); - cpumask_setall(cpus); + cpumask_copy(cpus, cpu_active_mask); /* * When power savings policy is enabled for the parent domain, idle @@ -4429,6 +4523,11 @@ static void idle_balance(int this_cpu, struct rq *this_rq) int pulled_task = 0; unsigned long next_balance = jiffies + HZ; + this_rq->idle_stamp = this_rq->clock; + + if (this_rq->avg_idle < sysctl_sched_migration_cost) + return; + for_each_domain(this_cpu, sd) { unsigned long interval; @@ -4443,8 +4542,10 @@ static void idle_balance(int this_cpu, struct rq *this_rq) interval = msecs_to_jiffies(sd->balance_interval); if (time_after(next_balance, sd->last_balance + interval)) next_balance = sd->last_balance + interval; - if (pulled_task) + if (pulled_task) { + this_rq->idle_stamp = 0; break; + } } if (pulled_task || time_after(jiffies, this_rq->next_balance)) { /* @@ -4679,7 +4780,7 @@ int select_nohz_load_balancer(int stop_tick) cpumask_set_cpu(cpu, nohz.cpu_mask); /* time for ilb owner also to sleep */ - if (cpumask_weight(nohz.cpu_mask) == num_online_cpus()) { + if (cpumask_weight(nohz.cpu_mask) == num_active_cpus()) { if (atomic_read(&nohz.load_balancer) == cpu) atomic_set(&nohz.load_balancer, -1); return 0; @@ -5171,45 +5272,90 @@ cputime_t task_stime(struct task_struct *p) { return p->stime; } + +void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) +{ + struct task_cputime cputime; + + thread_group_cputime(p, &cputime); + + *ut = cputime.utime; + *st = cputime.stime; +} #else + +#ifndef nsecs_to_cputime +# define nsecs_to_cputime(__nsecs) \ + msecs_to_cputime(div_u64((__nsecs), NSEC_PER_MSEC)) +#endif + cputime_t task_utime(struct task_struct *p) { - clock_t utime = cputime_to_clock_t(p->utime), - total = utime + cputime_to_clock_t(p->stime); + cputime_t utime = p->utime, total = utime + p->stime; u64 temp; /* * Use CFS's precise accounting: */ - temp = (u64)nsec_to_clock_t(p->se.sum_exec_runtime); + temp = (u64)nsecs_to_cputime(p->se.sum_exec_runtime); if (total) { temp *= utime; do_div(temp, total); } - utime = (clock_t)temp; + utime = (cputime_t)temp; - p->prev_utime = max(p->prev_utime, clock_t_to_cputime(utime)); + p->prev_utime = max(p->prev_utime, utime); return p->prev_utime; } cputime_t task_stime(struct task_struct *p) { - clock_t stime; + cputime_t stime; /* * Use CFS's precise accounting. (we subtract utime from * the total, to make sure the total observed by userspace * grows monotonically - apps rely on that): */ - stime = nsec_to_clock_t(p->se.sum_exec_runtime) - - cputime_to_clock_t(task_utime(p)); + stime = nsecs_to_cputime(p->se.sum_exec_runtime) - task_utime(p); if (stime >= 0) - p->prev_stime = max(p->prev_stime, clock_t_to_cputime(stime)); + p->prev_stime = max(p->prev_stime, stime); return p->prev_stime; } + +/* + * Must be called with siglock held. + */ +void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) +{ + struct signal_struct *sig = p->signal; + struct task_cputime cputime; + cputime_t rtime, utime, total; + + thread_group_cputime(p, &cputime); + + total = cputime_add(cputime.utime, cputime.stime); + rtime = nsecs_to_cputime(cputime.sum_exec_runtime); + + if (total) { + u64 temp = rtime; + + temp *= cputime.utime; + do_div(temp, total); + utime = (cputime_t)temp; + } else + utime = rtime; + + sig->prev_utime = max(sig->prev_utime, utime); + sig->prev_stime = max(sig->prev_stime, + cputime_sub(rtime, sig->prev_utime)); + + *ut = sig->prev_utime; + *st = sig->prev_stime; +} #endif inline cputime_t task_gtime(struct task_struct *p) @@ -5501,7 +5647,7 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner) * the mutex owner just released it and exited. */ if (probe_kernel_address(&owner->cpu, cpu)) - goto out; + return 0; #else cpu = owner->cpu; #endif @@ -5511,14 +5657,14 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner) * the cpu field may no longer be valid. */ if (cpu >= nr_cpumask_bits) - goto out; + return 0; /* * We need to validate that we can do a * get_cpu() and that we have the percpu area. */ if (!cpu_online(cpu)) - goto out; + return 0; rq = cpu_rq(cpu); @@ -5537,7 +5683,7 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner) cpu_relax(); } -out: + return 1; } #endif @@ -5885,14 +6031,15 @@ EXPORT_SYMBOL(wait_for_completion_killable); */ bool try_wait_for_completion(struct completion *x) { + unsigned long flags; int ret = 1; - spin_lock_irq(&x->wait.lock); + spin_lock_irqsave(&x->wait.lock, flags); if (!x->done) ret = 0; else x->done--; - spin_unlock_irq(&x->wait.lock); + spin_unlock_irqrestore(&x->wait.lock, flags); return ret; } EXPORT_SYMBOL(try_wait_for_completion); @@ -5907,12 +6054,13 @@ EXPORT_SYMBOL(try_wait_for_completion); */ bool completion_done(struct completion *x) { + unsigned long flags; int ret = 1; - spin_lock_irq(&x->wait.lock); + spin_lock_irqsave(&x->wait.lock, flags); if (!x->done) ret = 0; - spin_unlock_irq(&x->wait.lock); + spin_unlock_irqrestore(&x->wait.lock, flags); return ret; } EXPORT_SYMBOL(completion_done); @@ -5980,7 +6128,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) unsigned long flags; int oldprio, on_rq, running; struct rq *rq; - const struct sched_class *prev_class = p->sched_class; + const struct sched_class *prev_class; BUG_ON(prio < 0 || prio > MAX_PRIO); @@ -5988,6 +6136,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) update_rq_clock(rq); oldprio = p->prio; + prev_class = p->sched_class; on_rq = p->se.on_rq; running = task_current(rq, p); if (on_rq) @@ -6005,7 +6154,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) if (running) p->sched_class->set_curr_task(rq); if (on_rq) { - enqueue_task(rq, p, 0); + enqueue_task(rq, p, 0, oldprio < prio); check_class_changed(rq, p, prev_class, oldprio, running); } @@ -6049,7 +6198,7 @@ void set_user_nice(struct task_struct *p, long nice) delta = p->prio - old_prio; if (on_rq) { - enqueue_task(rq, p, 0); + enqueue_task(rq, p, 0, false); /* * If the task increased its priority or is running and * lowered its priority, then reschedule its CPU: @@ -6215,7 +6364,7 @@ static int __sched_setscheduler(struct task_struct *p, int policy, { int retval, oldprio, oldpolicy = -1, on_rq, running; unsigned long flags; - const struct sched_class *prev_class = p->sched_class; + const struct sched_class *prev_class; struct rq *rq; int reset_on_fork; @@ -6329,6 +6478,7 @@ recheck: p->sched_reset_on_fork = reset_on_fork; oldprio = p->prio; + prev_class = p->sched_class; __setscheduler(rq, p, policy, param->sched_priority); if (running) @@ -6439,7 +6589,7 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) return -EINVAL; retval = -ESRCH; - read_lock(&tasklist_lock); + rcu_read_lock(); p = find_process_by_pid(pid); if (p) { retval = security_task_getscheduler(p); @@ -6447,7 +6597,7 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) retval = p->policy | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0); } - read_unlock(&tasklist_lock); + rcu_read_unlock(); return retval; } @@ -6465,7 +6615,7 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) if (!param || pid < 0) return -EINVAL; - read_lock(&tasklist_lock); + rcu_read_lock(); p = find_process_by_pid(pid); retval = -ESRCH; if (!p) @@ -6476,7 +6626,7 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) goto out_unlock; lp.sched_priority = p->rt_priority; - read_unlock(&tasklist_lock); + rcu_read_unlock(); /* * This one might sleep, we cannot do it with a spinlock held ... @@ -6486,7 +6636,7 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) return retval; out_unlock: - read_unlock(&tasklist_lock); + rcu_read_unlock(); return retval; } @@ -6497,22 +6647,18 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) int retval; get_online_cpus(); - read_lock(&tasklist_lock); + rcu_read_lock(); p = find_process_by_pid(pid); if (!p) { - read_unlock(&tasklist_lock); + rcu_read_unlock(); put_online_cpus(); return -ESRCH; } - /* - * It is not safe to call set_cpus_allowed with the - * tasklist_lock held. We will bump the task_struct's - * usage count and then drop tasklist_lock. - */ + /* Prevent p going away */ get_task_struct(p); - read_unlock(&tasklist_lock); + rcu_read_unlock(); if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { retval = -ENOMEM; @@ -6593,10 +6739,12 @@ SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len, long sched_getaffinity(pid_t pid, struct cpumask *mask) { struct task_struct *p; + unsigned long flags; + struct rq *rq; int retval; get_online_cpus(); - read_lock(&tasklist_lock); + rcu_read_lock(); retval = -ESRCH; p = find_process_by_pid(pid); @@ -6607,10 +6755,12 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask) if (retval) goto out_unlock; + rq = task_rq_lock(p, &flags); cpumask_and(mask, &p->cpus_allowed, cpu_online_mask); + task_rq_unlock(rq, &flags); out_unlock: - read_unlock(&tasklist_lock); + rcu_read_unlock(); put_online_cpus(); return retval; @@ -6628,7 +6778,9 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, int ret; cpumask_var_t mask; - if (len < cpumask_size()) + if ((len * BITS_PER_BYTE) < nr_cpu_ids) + return -EINVAL; + if (len & (sizeof(unsigned long)-1)) return -EINVAL; if (!alloc_cpumask_var(&mask, GFP_KERNEL)) @@ -6636,10 +6788,12 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, ret = sched_getaffinity(pid, mask); if (ret == 0) { - if (copy_to_user(user_mask_ptr, mask, cpumask_size())) + size_t retlen = min_t(size_t, len, cpumask_size()); + + if (copy_to_user(user_mask_ptr, mask, retlen)) ret = -EFAULT; else - ret = cpumask_size(); + ret = retlen; } free_cpumask_var(mask); @@ -6845,6 +6999,8 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, { struct task_struct *p; unsigned int time_slice; + unsigned long flags; + struct rq *rq; int retval; struct timespec t; @@ -6852,7 +7008,7 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, return -EINVAL; retval = -ESRCH; - read_lock(&tasklist_lock); + rcu_read_lock(); p = find_process_by_pid(pid); if (!p) goto out_unlock; @@ -6861,15 +7017,17 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, if (retval) goto out_unlock; - time_slice = p->sched_class->get_rr_interval(p); + rq = task_rq_lock(p, &flags); + time_slice = p->sched_class->get_rr_interval(rq, p); + task_rq_unlock(rq, &flags); - read_unlock(&tasklist_lock); + rcu_read_unlock(); jiffies_to_timespec(time_slice, &t); retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; return retval; out_unlock: - read_unlock(&tasklist_lock); + rcu_read_unlock(); return retval; } @@ -6960,9 +7118,9 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) spin_lock_irqsave(&rq->lock, flags); __sched_fork(idle); + idle->state = TASK_RUNNING; idle->se.exec_start = sched_clock(); - idle->prio = idle->normal_prio = MAX_PRIO; cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu)); __set_task_cpu(idle, cpu); @@ -7003,22 +7161,23 @@ cpumask_var_t nohz_cpu_mask; * * This idea comes from the SD scheduler of Con Kolivas: */ -static inline void sched_init_granularity(void) +static void update_sysctl(void) { - unsigned int factor = 1 + ilog2(num_online_cpus()); - const unsigned long limit = 200000000; - - sysctl_sched_min_granularity *= factor; - if (sysctl_sched_min_granularity > limit) - sysctl_sched_min_granularity = limit; + unsigned int cpus = min(num_online_cpus(), 8U); + unsigned int factor = 1 + ilog2(cpus); - sysctl_sched_latency *= factor; - if (sysctl_sched_latency > limit) - sysctl_sched_latency = limit; - - sysctl_sched_wakeup_granularity *= factor; +#define SET_SYSCTL(name) \ + (sysctl_##name = (factor) * normalized_sysctl_##name) + SET_SYSCTL(sched_min_granularity); + SET_SYSCTL(sched_latency); + SET_SYSCTL(sched_wakeup_granularity); + SET_SYSCTL(sched_shares_ratelimit); +#undef SET_SYSCTL +} - sysctl_sched_shares_ratelimit *= factor; +static inline void sched_init_granularity(void) +{ + update_sysctl(); } #ifdef CONFIG_SMP @@ -7054,8 +7213,20 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) struct rq *rq; int ret = 0; + /* + * Serialize against TASK_WAKING so that ttwu() and wunt() can + * drop the rq->lock and still rely on ->cpus_allowed. + */ +again: + while (task_is_waking(p)) + cpu_relax(); rq = task_rq_lock(p, &flags); - if (!cpumask_intersects(new_mask, cpu_online_mask)) { + if (task_is_waking(p)) { + task_rq_unlock(rq, &flags); + goto again; + } + + if (!cpumask_intersects(new_mask, cpu_active_mask)) { ret = -EINVAL; goto out; } @@ -7077,13 +7248,13 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) if (cpumask_test_cpu(task_cpu(p), new_mask)) goto out; - if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) { + if (migrate_task(p, cpumask_any_and(cpu_active_mask, new_mask), &req)) { /* Need help from migration thread: drop lock and wait. */ struct task_struct *mt = rq->migration_thread; get_task_struct(mt); task_rq_unlock(rq, &flags); - wake_up_process(rq->migration_thread); + wake_up_process(mt); put_task_struct(mt); wait_for_completion(&req.done); tlb_migrate_finish(p->mm); @@ -7110,7 +7281,7 @@ EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) { struct rq *rq_dest, *rq_src; - int ret = 0, on_rq; + int ret = 0; if (unlikely(!cpu_active(dest_cpu))) return ret; @@ -7126,12 +7297,13 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) goto fail; - on_rq = p->se.on_rq; - if (on_rq) + /* + * If we're not on a rq, the next wake-up will ensure we're + * placed properly. + */ + if (p->se.on_rq) { deactivate_task(rq_src, p, 0); - - set_task_cpu(p, dest_cpu); - if (on_rq) { + set_task_cpu(p, dest_cpu); activate_task(rq_dest, p, 0); check_preempt_curr(rq_dest, p, 0); } @@ -7210,57 +7382,29 @@ static int migration_thread(void *data) } #ifdef CONFIG_HOTPLUG_CPU - -static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu) -{ - int ret; - - local_irq_disable(); - ret = __migrate_task(p, src_cpu, dest_cpu); - local_irq_enable(); - return ret; -} - /* * Figure out where task on dead CPU should go, use force if necessary. */ -static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) +void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) { - int dest_cpu; - const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(dead_cpu)); - -again: - /* Look for allowed, online CPU in same node. */ - for_each_cpu_and(dest_cpu, nodemask, cpu_online_mask) - if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) - goto move; - - /* Any allowed, online CPU? */ - dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_online_mask); - if (dest_cpu < nr_cpu_ids) - goto move; - - /* No more Mr. Nice Guy. */ - if (dest_cpu >= nr_cpu_ids) { - cpuset_cpus_allowed_locked(p, &p->cpus_allowed); - dest_cpu = cpumask_any_and(cpu_online_mask, &p->cpus_allowed); + struct rq *rq = cpu_rq(dead_cpu); + int needs_cpu, uninitialized_var(dest_cpu); + unsigned long flags; - /* - * Don't tell them about moving exiting tasks or - * kernel threads (both mm NULL), since they never - * leave kernel. - */ - if (p->mm && printk_ratelimit()) { - printk(KERN_INFO "process %d (%s) no " - "longer affine to cpu%d\n", - task_pid_nr(p), p->comm, dead_cpu); - } - } + local_irq_save(flags); -move: - /* It can have affinity changed while we were choosing. */ - if (unlikely(!__migrate_task_irq(p, dead_cpu, dest_cpu))) - goto again; + spin_lock(&rq->lock); + needs_cpu = (task_cpu(p) == dead_cpu) && (p->state != TASK_WAKING); + if (needs_cpu) + dest_cpu = select_fallback_rq(dead_cpu, p); + spin_unlock(&rq->lock); + /* + * It can only fail if we race with set_cpus_allowed(), + * in the racer should migrate the task anyway. + */ + if (needs_cpu) + __migrate_task(p, dead_cpu, dest_cpu); + local_irq_restore(flags); } /* @@ -7272,7 +7416,7 @@ move: */ static void migrate_nr_uninterruptible(struct rq *rq_src) { - struct rq *rq_dest = cpu_rq(cpumask_any(cpu_online_mask)); + struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask)); unsigned long flags; local_irq_save(flags); @@ -7526,7 +7670,7 @@ static ctl_table *sd_alloc_ctl_cpu_table(int cpu) static struct ctl_table_header *sd_sysctl_header; static void register_sched_domain_sysctl(void) { - int i, cpu_num = num_online_cpus(); + int i, cpu_num = num_possible_cpus(); struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); char buf[32]; @@ -7536,7 +7680,7 @@ static void register_sched_domain_sysctl(void) if (entry == NULL) return; - for_each_online_cpu(i) { + for_each_possible_cpu(i) { snprintf(buf, 32, "cpu%d", i); entry->procname = kstrdup(buf, GFP_KERNEL); entry->mode = 0555; @@ -7608,10 +7752,9 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) unsigned long flags; struct rq *rq; - switch (action) { + switch (action & ~CPU_TASKS_FROZEN) { case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: p = kthread_create(migration_thread, hcpu, "migration/%d", cpu); if (IS_ERR(p)) return NOTIFY_BAD; @@ -7626,7 +7769,6 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) break; case CPU_ONLINE: - case CPU_ONLINE_FROZEN: /* Strictly unnecessary, as first user will wake it. */ wake_up_process(cpu_rq(cpu)->migration_thread); @@ -7643,7 +7785,6 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) #ifdef CONFIG_HOTPLUG_CPU case CPU_UP_CANCELED: - case CPU_UP_CANCELED_FROZEN: if (!cpu_rq(cpu)->migration_thread) break; /* Unbind it from offline cpu so it can run. Fall thru. */ @@ -7654,24 +7795,30 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) cpu_rq(cpu)->migration_thread = NULL; break; - case CPU_DEAD: - case CPU_DEAD_FROZEN: - cpuset_lock(); /* around calls to cpuset_cpus_allowed_lock() */ - migrate_live_tasks(cpu); + case CPU_POST_DEAD: + /* + * Bring the migration thread down in CPU_POST_DEAD event, + * since the timers should have got migrated by now and thus + * we should not see a deadlock between trying to kill the + * migration thread and the sched_rt_period_timer. + */ rq = cpu_rq(cpu); kthread_stop(rq->migration_thread); put_task_struct(rq->migration_thread); rq->migration_thread = NULL; + break; + + case CPU_DEAD: + migrate_live_tasks(cpu); + rq = cpu_rq(cpu); /* Idle task back to normal (off runqueue, low prio) */ spin_lock_irq(&rq->lock); update_rq_clock(rq); deactivate_task(rq, rq->idle, 0); - rq->idle->static_prio = MAX_PRIO; __setscheduler(rq, rq->idle, SCHED_NORMAL, 0); rq->idle->sched_class = &idle_sched_class; migrate_dead_tasks(cpu); spin_unlock_irq(&rq->lock); - cpuset_unlock(); migrate_nr_uninterruptible(rq); BUG_ON(rq->nr_running != 0); calc_global_load_remove(rq); @@ -7695,7 +7842,6 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) break; case CPU_DYING: - case CPU_DYING_FROZEN: /* Update our root-domain */ rq = cpu_rq(cpu); spin_lock_irqsave(&rq->lock, flags); @@ -7905,6 +8051,8 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) static void free_rootdomain(struct root_domain *rd) { + synchronize_sched(); + cpupri_cleanup(&rd->cpupri); free_cpumask_var(rd->rto_mask); @@ -8013,6 +8161,9 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) struct rq *rq = cpu_rq(cpu); struct sched_domain *tmp; + for (tmp = sd; tmp; tmp = tmp->parent) + tmp->span_weight = cpumask_weight(sched_domain_span(tmp)); + /* Remove the sched domains which do not contribute to scheduling. */ for (tmp = sd; tmp; ) { struct sched_domain *parent = tmp->parent; @@ -8045,6 +8196,7 @@ static cpumask_var_t cpu_isolated_map; /* Setup the mask of cpus configured for isolated domains */ static int __init isolated_cpu_setup(char *str) { + alloc_bootmem_cpumask_var(&cpu_isolated_map); cpulist_parse(str, cpu_isolated_map); return 1; } @@ -9022,7 +9174,7 @@ match1: if (doms_new == NULL) { ndoms_cur = 0; doms_new = fallback_doms; - cpumask_andnot(&doms_new[0], cpu_online_mask, cpu_isolated_map); + cpumask_andnot(&doms_new[0], cpu_active_mask, cpu_isolated_map); WARN_ON_ONCE(dattr_new); } @@ -9153,8 +9305,10 @@ static int update_sched_domains(struct notifier_block *nfb, switch (action) { case CPU_ONLINE: case CPU_ONLINE_FROZEN: - case CPU_DEAD: - case CPU_DEAD_FROZEN: + case CPU_DOWN_PREPARE: + case CPU_DOWN_PREPARE_FROZEN: + case CPU_DOWN_FAILED: + case CPU_DOWN_FAILED_FROZEN: partition_sched_domains(1, NULL, NULL); return NOTIFY_OK; @@ -9201,7 +9355,7 @@ void __init sched_init_smp(void) #endif get_online_cpus(); mutex_lock(&sched_domains_mutex); - arch_init_sched_domains(cpu_online_mask); + arch_init_sched_domains(cpu_active_mask); cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); if (cpumask_empty(non_isolated_cpus)) cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); @@ -9522,6 +9676,8 @@ void __init sched_init(void) rq->cpu = i; rq->online = 0; rq->migration_thread = NULL; + rq->idle_stamp = 0; + rq->avg_idle = 2*sysctl_sched_migration_cost; INIT_LIST_HEAD(&rq->migration_queue); rq_attach_root(rq, &def_root_domain); #endif @@ -9571,7 +9727,9 @@ void __init sched_init(void) zalloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT); alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT); #endif - zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); + /* May be allocated at isolcpus cmdline parse time */ + if (cpu_isolated_map == NULL) + zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); #endif /* SMP */ perf_event_init(); @@ -9993,13 +10151,13 @@ void sched_move_task(struct task_struct *tsk) #ifdef CONFIG_FAIR_GROUP_SCHED if (tsk->sched_class->moved_group) - tsk->sched_class->moved_group(tsk); + tsk->sched_class->moved_group(tsk, on_rq); #endif if (unlikely(running)) tsk->sched_class->set_curr_task(rq); if (on_rq) - enqueue_task(rq, tsk, 0); + enqueue_task(rq, tsk, 0, false); task_rq_unlock(rq, &flags); } @@ -10771,12 +10929,30 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime) } /* + * When CONFIG_VIRT_CPU_ACCOUNTING is enabled one jiffy can be very large + * in cputime_t units. As a result, cpuacct_update_stats calls + * percpu_counter_add with values large enough to always overflow the + * per cpu batch limit causing bad SMP scalability. + * + * To fix this we scale percpu_counter_batch by cputime_one_jiffy so we + * batch the same amount of time with CONFIG_VIRT_CPU_ACCOUNTING disabled + * and enabled. We cap it at INT_MAX which is the largest allowed batch value. + */ +#ifdef CONFIG_SMP +#define CPUACCT_BATCH \ + min_t(long, percpu_counter_batch * cputime_one_jiffy, INT_MAX) +#else +#define CPUACCT_BATCH 0 +#endif + +/* * Charge the system/user time to the task's accounting group. */ static void cpuacct_update_stats(struct task_struct *tsk, enum cpuacct_stat_index idx, cputime_t val) { struct cpuacct *ca; + int batch = CPUACCT_BATCH; if (unlikely(!cpuacct_subsys.active)) return; @@ -10785,7 +10961,7 @@ static void cpuacct_update_stats(struct task_struct *tsk, ca = task_ca(tsk); do { - percpu_counter_add(&ca->cpustat[idx], val); + __percpu_counter_add(&ca->cpustat[idx], val, batch); ca = ca->parent; } while (ca); rcu_read_unlock(); |