From 7539a3b3d1f892dd97eaf094134d7de55c13befe Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sun, 13 Dec 2009 00:07:30 +0100 Subject: sched: Make wakeup side and atomic variants of completion API irq safe Alan Stern noticed that all the wakeup side (and atomic) variants of the completion APIs should be irq safe, but the newly introduced completion_done() and try_wait_for_completion() aren't. The use of the irq unsafe variants in IRQ contexts can cause crashes/hangs. Fix the problem by making them use spin_lock_irqsave() and spin_lock_irqrestore(). Reported-by: Alan Stern Signed-off-by: Rafael J. Wysocki Cc: Linus Torvalds Cc: Zhang Rui Cc: pm list Cc: Peter Zijlstra Cc: David Chinner Cc: Lachlan McIlroy LKML-Reference: <200912130007.30541.rjw@sisk.pl> Signed-off-by: Ingo Molnar --- kernel/sched.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index ff39cadf621e..8b3532f262d7 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5908,14 +5908,15 @@ EXPORT_SYMBOL(wait_for_completion_killable); */ bool try_wait_for_completion(struct completion *x) { + unsigned long flags; int ret = 1; - spin_lock_irq(&x->wait.lock); + spin_lock_irqsave(&x->wait.lock, flags); if (!x->done) ret = 0; else x->done--; - spin_unlock_irq(&x->wait.lock); + spin_unlock_irqrestore(&x->wait.lock, flags); return ret; } EXPORT_SYMBOL(try_wait_for_completion); @@ -5930,12 +5931,13 @@ EXPORT_SYMBOL(try_wait_for_completion); */ bool completion_done(struct completion *x) { + unsigned long flags; int ret = 1; - spin_lock_irq(&x->wait.lock); + spin_lock_irqsave(&x->wait.lock, flags); if (!x->done) ret = 0; - spin_unlock_irq(&x->wait.lock); + spin_unlock_irqrestore(&x->wait.lock, flags); return ret; } EXPORT_SYMBOL(completion_done); -- cgit v1.2.3 From 663997d417330a59a566452f52cfa04c8ffd190b Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Sat, 12 Dec 2009 13:57:27 -0800 Subject: sched: Use pr_fmt() and pr_() - Convert printk(KERN_ to pr_ (not KERN_DEBUG) - Add #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - Coalesce long format strings - Add missing \n to "ERROR: !SD_LOAD_BALANCE domain has parent" Signed-off-by: Joe Perches Cc: Peter Zijlstra LKML-Reference: <1260655047.2637.7.camel@Joe-Laptop.home> Signed-off-by: Ingo Molnar --- kernel/sched.c | 94 ++++++++++++++++++++++--------------------------- kernel/sched_idletask.c | 2 +- 2 files changed, 43 insertions(+), 53 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 8b3532f262d7..258c73c6a2f3 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -26,6 +26,8 @@ * Thomas Gleixner, Mike Kravetz */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -5337,8 +5339,8 @@ static noinline void __schedule_bug(struct task_struct *prev) { struct pt_regs *regs = get_irq_regs(); - printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", - prev->comm, prev->pid, preempt_count()); + pr_err("BUG: scheduling while atomic: %s/%d/0x%08x\n", + prev->comm, prev->pid, preempt_count()); debug_show_held_locks(prev); print_modules(); @@ -6906,23 +6908,23 @@ void sched_show_task(struct task_struct *p) unsigned state; state = p->state ? __ffs(p->state) + 1 : 0; - printk(KERN_INFO "%-13.13s %c", p->comm, + pr_info("%-13.13s %c", p->comm, state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); #if BITS_PER_LONG == 32 if (state == TASK_RUNNING) - printk(KERN_CONT " running "); + pr_cont(" running "); else - printk(KERN_CONT " %08lx ", thread_saved_pc(p)); + pr_cont(" %08lx ", thread_saved_pc(p)); #else if (state == TASK_RUNNING) - printk(KERN_CONT " running task "); + pr_cont(" running task "); else - printk(KERN_CONT " %016lx ", thread_saved_pc(p)); + pr_cont(" %016lx ", thread_saved_pc(p)); #endif #ifdef CONFIG_DEBUG_STACK_USAGE free = stack_not_used(p); #endif - printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, + pr_cont("%5lu %5d %6d 0x%08lx\n", free, task_pid_nr(p), task_pid_nr(p->real_parent), (unsigned long)task_thread_info(p)->flags); @@ -6934,11 +6936,9 @@ void show_state_filter(unsigned long state_filter) struct task_struct *g, *p; #if BITS_PER_LONG == 32 - printk(KERN_INFO - " task PC stack pid father\n"); + pr_info(" task PC stack pid father\n"); #else - printk(KERN_INFO - " task PC stack pid father\n"); + pr_info(" task PC stack pid father\n"); #endif read_lock(&tasklist_lock); do_each_thread(g, p) { @@ -7296,9 +7296,8 @@ again: * leave kernel. */ if (p->mm && printk_ratelimit()) { - printk(KERN_INFO "process %d (%s) no " - "longer affine to cpu%d\n", - task_pid_nr(p), p->comm, dead_cpu); + pr_info("process %d (%s) no longer affine to cpu%d\n", + task_pid_nr(p), p->comm, dead_cpu); } } @@ -7805,48 +7804,44 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, printk(KERN_DEBUG "%*s domain %d: ", level, "", level); if (!(sd->flags & SD_LOAD_BALANCE)) { - printk("does not load-balance\n"); + pr_cont("does not load-balance\n"); if (sd->parent) - printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain" - " has parent"); + pr_err("ERROR: !SD_LOAD_BALANCE domain has parent\n"); return -1; } - printk(KERN_CONT "span %s level %s\n", str, sd->name); + pr_cont("span %s level %s\n", str, sd->name); if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) { - printk(KERN_ERR "ERROR: domain->span does not contain " - "CPU%d\n", cpu); + pr_err("ERROR: domain->span does not contain CPU%d\n", cpu); } if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) { - printk(KERN_ERR "ERROR: domain->groups does not contain" - " CPU%d\n", cpu); + pr_err("ERROR: domain->groups does not contain CPU%d\n", cpu); } printk(KERN_DEBUG "%*s groups:", level + 1, ""); do { if (!group) { - printk("\n"); - printk(KERN_ERR "ERROR: group is NULL\n"); + pr_cont("\n"); + pr_err("ERROR: group is NULL\n"); break; } if (!group->cpu_power) { - printk(KERN_CONT "\n"); - printk(KERN_ERR "ERROR: domain->cpu_power not " - "set\n"); + pr_cont("\n"); + pr_err("ERROR: domain->cpu_power not set\n"); break; } if (!cpumask_weight(sched_group_cpus(group))) { - printk(KERN_CONT "\n"); - printk(KERN_ERR "ERROR: empty group\n"); + pr_cont("\n"); + pr_err("ERROR: empty group\n"); break; } if (cpumask_intersects(groupmask, sched_group_cpus(group))) { - printk(KERN_CONT "\n"); - printk(KERN_ERR "ERROR: repeated CPUs\n"); + pr_cont("\n"); + pr_err("ERROR: repeated CPUs\n"); break; } @@ -7854,23 +7849,21 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); - printk(KERN_CONT " %s", str); + pr_cont(" %s", str); if (group->cpu_power != SCHED_LOAD_SCALE) { - printk(KERN_CONT " (cpu_power = %d)", - group->cpu_power); + pr_cont(" (cpu_power = %d)", group->cpu_power); } group = group->next; } while (group != sd->groups); - printk(KERN_CONT "\n"); + pr_cont("\n"); if (!cpumask_equal(sched_domain_span(sd), groupmask)) - printk(KERN_ERR "ERROR: groups don't span domain->span\n"); + pr_err("ERROR: groups don't span domain->span\n"); if (sd->parent && !cpumask_subset(groupmask, sched_domain_span(sd->parent))) - printk(KERN_ERR "ERROR: parent span is not a superset " - "of domain->span\n"); + pr_err("ERROR: parent span is not a superset of domain->span\n"); return 0; } @@ -8426,8 +8419,7 @@ static int build_numa_sched_groups(struct s_data *d, sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), GFP_KERNEL, num); if (!sg) { - printk(KERN_WARNING "Can not alloc domain group for node %d\n", - num); + pr_warning("Can not alloc domain group for node %d\n", num); return -ENOMEM; } d->sched_group_nodes[num] = sg; @@ -8456,8 +8448,8 @@ static int build_numa_sched_groups(struct s_data *d, sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), GFP_KERNEL, num); if (!sg) { - printk(KERN_WARNING - "Can not alloc domain group for node %d\n", j); + pr_warning("Can not alloc domain group for node %d\n", + j); return -ENOMEM; } sg->cpu_power = 0; @@ -8685,7 +8677,7 @@ static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, d->sched_group_nodes = kcalloc(nr_node_ids, sizeof(struct sched_group *), GFP_KERNEL); if (!d->sched_group_nodes) { - printk(KERN_WARNING "Can not alloc sched group node list\n"); + pr_warning("Can not alloc sched group node list\n"); return sa_notcovered; } sched_group_nodes_bycpu[cpumask_first(cpu_map)] = d->sched_group_nodes; @@ -8702,7 +8694,7 @@ static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, return sa_send_covered; d->rd = alloc_rootdomain(); if (!d->rd) { - printk(KERN_WARNING "Cannot alloc root domain\n"); + pr_warning("Cannot alloc root domain\n"); return sa_tmpmask; } return sa_rootdomain; @@ -9684,13 +9676,11 @@ void __might_sleep(char *file, int line, int preempt_offset) return; prev_jiffy = jiffies; - printk(KERN_ERR - "BUG: sleeping function called from invalid context at %s:%d\n", - file, line); - printk(KERN_ERR - "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", - in_atomic(), irqs_disabled(), - current->pid, current->comm); + pr_err("BUG: sleeping function called from invalid context at %s:%d\n", + file, line); + pr_err("in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", + in_atomic(), irqs_disabled(), + current->pid, current->comm); debug_show_held_locks(current); if (irqs_disabled()) diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c index 33d5384a73a8..b810e22772d5 100644 --- a/kernel/sched_idletask.c +++ b/kernel/sched_idletask.c @@ -35,7 +35,7 @@ static void dequeue_task_idle(struct rq *rq, struct task_struct *p, int sleep) { spin_unlock_irq(&rq->lock); - printk(KERN_ERR "bad: scheduling from the idle thread!\n"); + pr_err("bad: scheduling from the idle thread!\n"); dump_stack(); spin_lock_irq(&rq->lock); } -- cgit v1.2.3 From 5fe85be081edf0ac92d83f9c39e0ab5c1371eb82 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 9 Dec 2009 10:14:58 +0000 Subject: sched: Use rcu in sys_sched_getscheduler/sys_sched_getparam() read_lock(&tasklist_lock) does not protect sys_sched_getscheduler and sys_sched_getparam() against a concurrent update of the policy or scheduler parameters as do_sched_setscheduler() does not take the tasklist_lock. The accessed integers can be retrieved w/o locking and are snapshots anyway. Using rcu_read_lock() to protect find_task_by_vpid() and prevent the task struct from going away is not changing the above situation. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra LKML-Reference: <20091209100706.753790977@linutronix.de> Signed-off-by: Ingo Molnar --- kernel/sched.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 258c73c6a2f3..1782beed2fa7 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6458,7 +6458,7 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) return -EINVAL; retval = -ESRCH; - read_lock(&tasklist_lock); + rcu_read_lock(); p = find_process_by_pid(pid); if (p) { retval = security_task_getscheduler(p); @@ -6466,7 +6466,7 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) retval = p->policy | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0); } - read_unlock(&tasklist_lock); + rcu_read_unlock(); return retval; } @@ -6484,7 +6484,7 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) if (!param || pid < 0) return -EINVAL; - read_lock(&tasklist_lock); + rcu_read_lock(); p = find_process_by_pid(pid); retval = -ESRCH; if (!p) @@ -6495,7 +6495,7 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) goto out_unlock; lp.sched_priority = p->rt_priority; - read_unlock(&tasklist_lock); + rcu_read_unlock(); /* * This one might sleep, we cannot do it with a spinlock held ... @@ -6505,7 +6505,7 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) return retval; out_unlock: - read_unlock(&tasklist_lock); + rcu_read_unlock(); return retval; } -- cgit v1.2.3 From 23f5d142519621b16cf2b378cf8adf4dcf01a616 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 9 Dec 2009 10:15:01 +0000 Subject: sched: Use rcu in sched_get/set_affinity() tasklist_lock is held read locked to protect the find_task_by_vpid() call and to prevent the task going away. sched_setaffinity acquires a task struct ref and drops tasklist lock right away. The access to the cpus_allowed mask is protected by rq->lock. rcu_read_lock() provides the same protection here. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra LKML-Reference: <20091209100706.789059966@linutronix.de> Signed-off-by: Ingo Molnar --- kernel/sched.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 1782beed2fa7..79893123325c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6516,22 +6516,18 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) int retval; get_online_cpus(); - read_lock(&tasklist_lock); + rcu_read_lock(); p = find_process_by_pid(pid); if (!p) { - read_unlock(&tasklist_lock); + rcu_read_unlock(); put_online_cpus(); return -ESRCH; } - /* - * It is not safe to call set_cpus_allowed with the - * tasklist_lock held. We will bump the task_struct's - * usage count and then drop tasklist_lock. - */ + /* Prevent p going away */ get_task_struct(p); - read_unlock(&tasklist_lock); + rcu_read_unlock(); if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { retval = -ENOMEM; @@ -6617,7 +6613,7 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask) int retval; get_online_cpus(); - read_lock(&tasklist_lock); + rcu_read_lock(); retval = -ESRCH; p = find_process_by_pid(pid); @@ -6633,7 +6629,7 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask) task_rq_unlock(rq, &flags); out_unlock: - read_unlock(&tasklist_lock); + rcu_read_unlock(); put_online_cpus(); return retval; -- cgit v1.2.3 From 1a551ae715825bb2a2107a2dd68de024a1fa4e32 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 9 Dec 2009 10:15:11 +0000 Subject: sched: Use rcu in sched_get_rr_param() read_lock(&tasklist_lock) does not protect sys_sched_get_rr_param() against a concurrent update of the policy or scheduler parameters as do_sched_scheduler() does not take the tasklist_lock. The access to task->sched_class->get_rr_interval is protected by task_rq_lock(task). Use rcu_read_lock() to protect find_task_by_vpid() and prevent the task struct from going away. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra LKML-Reference: <20091209100706.862897167@linutronix.de> Signed-off-by: Ingo Molnar --- kernel/sched.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 79893123325c..db5c26692dd5 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6873,7 +6873,7 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, return -EINVAL; retval = -ESRCH; - read_lock(&tasklist_lock); + rcu_read_lock(); p = find_process_by_pid(pid); if (!p) goto out_unlock; @@ -6886,13 +6886,13 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, time_slice = p->sched_class->get_rr_interval(rq, p); task_rq_unlock(rq, &flags); - read_unlock(&tasklist_lock); + rcu_read_unlock(); jiffies_to_timespec(time_slice, &t); retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; return retval; out_unlock: - read_unlock(&tasklist_lock); + rcu_read_unlock(); return retval; } -- cgit v1.2.3 From b9f8fcd55bbdb037e5332dbdb7b494f0b70861ac Mon Sep 17 00:00:00 2001 From: David Miller Date: Sun, 13 Dec 2009 18:25:02 -0800 Subject: sched: Fix cpu_clock() in NMIs, on !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK Relax stable-sched-clock architectures to not save/disable/restore hardirqs in cpu_clock(). The background is that I was trying to resolve a sparc64 perf issue when I discovered this problem. On sparc64 I implement pseudo NMIs by simply running the kernel at IRQ level 14 when local_irq_disable() is called, this allows performance counter events to still come in at IRQ level 15. This doesn't work if any code in an NMI handler does local_irq_save() or local_irq_disable() since the "disable" will kick us back to cpu IRQ level 14 thus letting NMIs back in and we recurse. The only path which that does that in the perf event IRQ handling path is the code supporting frequency based events. It uses cpu_clock(). cpu_clock() simply invokes sched_clock() with IRQs disabled. And that's a fundamental bug all on it's own, particularly for the HAVE_UNSTABLE_SCHED_CLOCK case. NMIs can thus get into the sched_clock() code interrupting the local IRQ disable code sections of it. Furthermore, for the not-HAVE_UNSTABLE_SCHED_CLOCK case, the IRQ disabling done by cpu_clock() is just pure overhead and completely unnecessary. So the core problem is that sched_clock() is not NMI safe, but we are invoking it from NMI contexts in the perf events code (via cpu_clock()). A less important issue is the overhead of IRQ disabling when it isn't necessary in cpu_clock(). CONFIG_HAVE_UNSTABLE_SCHED_CLOCK architectures are not affected by this patch. Signed-off-by: David S. Miller Acked-by: Peter Zijlstra Cc: Mike Galbraith LKML-Reference: <20091213.182502.215092085.davem@davemloft.net> Signed-off-by: Ingo Molnar --- kernel/sched_clock.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index 479ce5682d7c..5b496132c28a 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched_clock.c @@ -236,6 +236,18 @@ void sched_clock_idle_wakeup_event(u64 delta_ns) } EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); +unsigned long long cpu_clock(int cpu) +{ + unsigned long long clock; + unsigned long flags; + + local_irq_save(flags); + clock = sched_clock_cpu(cpu); + local_irq_restore(flags); + + return clock; +} + #else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ void sched_clock_init(void) @@ -251,17 +263,12 @@ u64 sched_clock_cpu(int cpu) return sched_clock(); } -#endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ unsigned long long cpu_clock(int cpu) { - unsigned long long clock; - unsigned long flags; + return sched_clock_cpu(cpu); +} - local_irq_save(flags); - clock = sched_clock_cpu(cpu); - local_irq_restore(flags); +#endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ - return clock; -} EXPORT_SYMBOL_GPL(cpu_clock); -- cgit v1.2.3 From 9ee349ad6d326df3633d43f54202427295999c47 Mon Sep 17 00:00:00 2001 From: Xiaotian Feng Date: Wed, 16 Dec 2009 18:04:32 +0100 Subject: sched: Fix set_cpu_active() in cpu_down() Sachin found cpu hotplug test failures on powerpc, which made the kernel hang on his POWER box. The problem is that we fail to re-activate a cpu when a hot-unplug fails. Fix this by moving the de-activation into _cpu_down after doing the initial checks. Remove the synchronize_sched() calls and rely on those implied by rebuilding the sched domains using the new mask. Reported-by: Sachin Sant Signed-off-by: Xiaotian Feng Tested-by: Sachin Sant Signed-off-by: Peter Zijlstra Cc: Mike Galbraith LKML-Reference: <20091216170517.500272612@chello.nl> Signed-off-by: Ingo Molnar --- kernel/cpu.c | 24 +++--------------------- 1 file changed, 3 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 291ac586f37f..1c8ddd6ee940 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -209,6 +209,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) return -ENOMEM; cpu_hotplug_begin(); + set_cpu_active(cpu, false); err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls); if (err == NOTIFY_BAD) { @@ -280,18 +281,6 @@ int __ref cpu_down(unsigned int cpu) goto out; } - set_cpu_active(cpu, false); - - /* - * Make sure the all cpus did the reschedule and are not - * using stale version of the cpu_active_mask. - * This is not strictly necessary becuase stop_machine() - * that we run down the line already provides the required - * synchronization. But it's really a side effect and we do not - * want to depend on the innards of the stop_machine here. - */ - synchronize_sched(); - err = _cpu_down(cpu, 0); out: @@ -382,19 +371,12 @@ int disable_nonboot_cpus(void) return error; cpu_maps_update_begin(); first_cpu = cpumask_first(cpu_online_mask); - /* We take down all of the non-boot CPUs in one shot to avoid races + /* + * We take down all of the non-boot CPUs in one shot to avoid races * with the userspace trying to use the CPU hotplug at the same time */ cpumask_clear(frozen_cpus); - for_each_online_cpu(cpu) { - if (cpu == first_cpu) - continue; - set_cpu_active(cpu, false); - } - - synchronize_sched(); - printk("Disabling non-boot CPUs ...\n"); for_each_online_cpu(cpu) { if (cpu == first_cpu) -- cgit v1.2.3 From e6c8fba7771563b2f3dfb96a78f36ec17e15bdf0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 16 Dec 2009 18:04:33 +0100 Subject: sched: Fix task_hot() test order Make sure not to access sched_fair fields before verifying it is indeed a sched_fair task. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith CC: stable@kernel.org LKML-Reference: <20091216170517.577998058@chello.nl> Signed-off-by: Ingo Molnar --- kernel/sched.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 9c30858b6463..1d8ca25dd6fb 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2046,6 +2046,9 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) { s64 delta; + if (p->sched_class != &fair_sched_class) + return 0; + /* * Buddy candidates are cache hot: */ @@ -2054,9 +2057,6 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) &p->se == cfs_rq_of(&p->se)->last)) return 1; - if (p->sched_class != &fair_sched_class) - return 0; - if (sysctl_sched_migration_cost == -1) return 1; if (sysctl_sched_migration_cost == 0) -- cgit v1.2.3 From e4f4288842ee12747e10c354d72be7d424c0b627 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 16 Dec 2009 18:04:34 +0100 Subject: sched: Select_task_rq_fair() must honour SD_LOAD_BALANCE We should skip !SD_LOAD_BALANCE domains. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith LKML-Reference: <20091216170517.653578430@chello.nl> CC: stable@kernel.org Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 5bedf6e3ebf3..ec1d2715620c 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1429,6 +1429,9 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag } for_each_domain(cpu, tmp) { + if (!(tmp->flags & SD_LOAD_BALANCE)) + continue; + /* * If power savings logic is enabled for a domain, see if we * are not overloaded, if so, don't balance wider. -- cgit v1.2.3 From 06b83b5fbea273672822b6ee93e16781046553ec Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 16 Dec 2009 18:04:35 +0100 Subject: sched: Use TASK_WAKING for fork wakups For later convenience use TASK_WAKING for fresh tasks. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith LKML-Reference: <20091216170517.732561278@chello.nl> Signed-off-by: Ingo Molnar --- kernel/sched.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 1d8ca25dd6fb..1672823aabfe 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2540,14 +2540,6 @@ static void __sched_fork(struct task_struct *p) #ifdef CONFIG_PREEMPT_NOTIFIERS INIT_HLIST_HEAD(&p->preempt_notifiers); #endif - - /* - * We mark the process as running here, but have not actually - * inserted it onto the runqueue yet. This guarantees that - * nobody will actually run it, and a signal or other external - * event cannot wake it up and insert it on the runqueue either. - */ - p->state = TASK_RUNNING; } /* @@ -2558,6 +2550,12 @@ void sched_fork(struct task_struct *p, int clone_flags) int cpu = get_cpu(); __sched_fork(p); + /* + * We mark the process as waking here. This guarantees that + * nobody will actually run it, and a signal or other external + * event cannot wake it up and insert it on the runqueue either. + */ + p->state = TASK_WAKING; /* * Revert to default priority/policy on fork if requested. @@ -2626,7 +2624,8 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) struct rq *rq; rq = task_rq_lock(p, &flags); - BUG_ON(p->state != TASK_RUNNING); + BUG_ON(p->state != TASK_WAKING); + p->state = TASK_RUNNING; update_rq_clock(rq); activate_task(rq, p, 0); trace_sched_wakeup_new(rq, p, 1); @@ -6984,6 +6983,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) raw_spin_lock_irqsave(&rq->lock, flags); __sched_fork(idle); + idle->state = TASK_RUNNING; idle->se.exec_start = sched_clock(); cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu)); -- cgit v1.2.3 From e2912009fb7b715728311b0d8fe327a1432b3f79 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 16 Dec 2009 18:04:36 +0100 Subject: sched: Ensure set_task_cpu() is never called on blocked tasks In order to clean up the set_task_cpu() rq dependencies we need to ensure it is never called on blocked tasks because such usage does not pair with consistent rq->lock usage. This puts the migration burden on ttwu(). Furthermore we need to close a race against changing ->cpus_allowed, since select_task_rq() runs with only preemption disabled. For sched_fork() this is safe because the child isn't in the tasklist yet, for wakeup we fix this by synchronizing set_cpus_allowed_ptr() against TASK_WAKING, which leaves sched_exec to be a problem This also closes a hole in (6ad4c1888 sched: Fix balance vs hotplug race) where ->select_task_rq() doesn't validate the result against the sched_domain/root_domain. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith LKML-Reference: <20091216170517.807938893@chello.nl> Signed-off-by: Ingo Molnar --- kernel/sched.c | 85 +++++++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 66 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 1672823aabfe..33d7965f63f0 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2018,22 +2018,15 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p, */ void kthread_bind(struct task_struct *p, unsigned int cpu) { - struct rq *rq = cpu_rq(cpu); - unsigned long flags; - /* Must have done schedule() in kthread() before we set_task_cpu */ if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE)) { WARN_ON(1); return; } - raw_spin_lock_irqsave(&rq->lock, flags); - update_rq_clock(rq); - set_task_cpu(p, cpu); p->cpus_allowed = cpumask_of_cpu(cpu); p->rt.nr_cpus_allowed = 1; p->flags |= PF_THREAD_BOUND; - raw_spin_unlock_irqrestore(&rq->lock, flags); } EXPORT_SYMBOL(kthread_bind); @@ -2074,6 +2067,14 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) struct cfs_rq *old_cfsrq = task_cfs_rq(p), *new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu); +#ifdef CONFIG_SCHED_DEBUG + /* + * We should never call set_task_cpu() on a blocked task, + * ttwu() will sort out the placement. + */ + WARN_ON(p->state != TASK_RUNNING && p->state != TASK_WAKING); +#endif + trace_sched_migrate_task(p, new_cpu); if (old_cpu != new_cpu) { @@ -2107,13 +2108,10 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req) /* * If the task is not on a runqueue (and not running), then - * it is sufficient to simply update the task's cpu field. + * the next wake-up will properly place the task. */ - if (!p->se.on_rq && !task_running(rq, p)) { - update_rq_clock(rq); - set_task_cpu(p, dest_cpu); + if (!p->se.on_rq && !task_running(rq, p)) return 0; - } init_completion(&req->done); req->task = p; @@ -2319,10 +2317,42 @@ void task_oncpu_function_call(struct task_struct *p, } #ifdef CONFIG_SMP +/* + * Called from: + * + * - fork, @p is stable because it isn't on the tasklist yet + * + * - exec, @p is unstable XXX + * + * - wake-up, we serialize ->cpus_allowed against TASK_WAKING so + * we should be good. + */ static inline int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags) { - return p->sched_class->select_task_rq(p, sd_flags, wake_flags); + int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags); + + /* + * In order not to call set_task_cpu() on a blocking task we need + * to rely on ttwu() to place the task on a valid ->cpus_allowed + * cpu. + * + * Since this is common to all placement strategies, this lives here. + * + * [ this allows ->select_task() to simply return task_cpu(p) and + * not worry about this generic constraint ] + */ + if (unlikely(!cpumask_test_cpu(cpu, &p->cpus_allowed) || + !cpu_active(cpu))) { + + cpu = cpumask_any_and(&p->cpus_allowed, cpu_active_mask); + /* + * XXX: race against hot-plug modifying cpu_active_mask + */ + BUG_ON(cpu >= nr_cpu_ids); + } + + return cpu; } #endif @@ -7098,7 +7128,23 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) struct rq *rq; int ret = 0; + /* + * Since we rely on wake-ups to migrate sleeping tasks, don't change + * the ->cpus_allowed mask from under waking tasks, which would be + * possible when we change rq->lock in ttwu(), so synchronize against + * TASK_WAKING to avoid that. + */ +again: + while (p->state == TASK_WAKING) + cpu_relax(); + rq = task_rq_lock(p, &flags); + + if (p->state == TASK_WAKING) { + task_rq_unlock(rq, &flags); + goto again; + } + if (!cpumask_intersects(new_mask, cpu_active_mask)) { ret = -EINVAL; goto out; @@ -7154,7 +7200,7 @@ EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) { struct rq *rq_dest, *rq_src; - int ret = 0, on_rq; + int ret = 0; if (unlikely(!cpu_active(dest_cpu))) return ret; @@ -7170,12 +7216,13 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) goto fail; - on_rq = p->se.on_rq; - if (on_rq) + /* + * If we're not on a rq, the next wake-up will ensure we're + * placed properly. + */ + if (p->se.on_rq) { deactivate_task(rq_src, p, 0); - - set_task_cpu(p, dest_cpu); - if (on_rq) { + set_task_cpu(p, dest_cpu); activate_task(rq_dest, p, 0); check_preempt_curr(rq_dest, p, 0); } -- cgit v1.2.3 From 3802290628348674985d14914f9bfee7b9084548 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 16 Dec 2009 18:04:37 +0100 Subject: sched: Fix sched_exec() balancing Since we access ->cpus_allowed without holding rq->lock we need a retry loop to validate the result, this comes for near free when we merge sched_migrate_task() into sched_exec() since that already does the needed check. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith LKML-Reference: <20091216170517.884743662@chello.nl> Signed-off-by: Ingo Molnar --- kernel/sched.c | 45 +++++++++++++++++++++++---------------------- 1 file changed, 23 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 33d7965f63f0..63e55ac242d1 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2322,7 +2322,7 @@ void task_oncpu_function_call(struct task_struct *p, * * - fork, @p is stable because it isn't on the tasklist yet * - * - exec, @p is unstable XXX + * - exec, @p is unstable, retry loop * * - wake-up, we serialize ->cpus_allowed against TASK_WAKING so * we should be good. @@ -3132,21 +3132,36 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2) } /* - * If dest_cpu is allowed for this process, migrate the task to it. - * This is accomplished by forcing the cpu_allowed mask to only - * allow dest_cpu, which will force the cpu onto dest_cpu. Then - * the cpu_allowed mask is restored. + * sched_exec - execve() is a valuable balancing opportunity, because at + * this point the task has the smallest effective memory and cache footprint. */ -static void sched_migrate_task(struct task_struct *p, int dest_cpu) +void sched_exec(void) { + struct task_struct *p = current; struct migration_req req; + int dest_cpu, this_cpu; unsigned long flags; struct rq *rq; +again: + this_cpu = get_cpu(); + dest_cpu = select_task_rq(p, SD_BALANCE_EXEC, 0); + if (dest_cpu == this_cpu) { + put_cpu(); + return; + } + rq = task_rq_lock(p, &flags); + put_cpu(); + + /* + * select_task_rq() can race against ->cpus_allowed + */ if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed) - || unlikely(!cpu_active(dest_cpu))) - goto out; + || unlikely(!cpu_active(dest_cpu))) { + task_rq_unlock(rq, &flags); + goto again; + } /* force the process onto the specified CPU */ if (migrate_task(p, dest_cpu, &req)) { @@ -3161,23 +3176,9 @@ static void sched_migrate_task(struct task_struct *p, int dest_cpu) return; } -out: task_rq_unlock(rq, &flags); } -/* - * sched_exec - execve() is a valuable balancing opportunity, because at - * this point the task has the smallest effective memory and cache footprint. - */ -void sched_exec(void) -{ - int new_cpu, this_cpu = get_cpu(); - new_cpu = select_task_rq(current, SD_BALANCE_EXEC, 0); - put_cpu(); - if (new_cpu != this_cpu) - sched_migrate_task(current, new_cpu); -} - /* * pull_task - move a task from a remote runqueue to the local runqueue. * Both runqueues must be locked. -- cgit v1.2.3 From 5da9a0fb673a0ea0a093862f95f6b89b3390c31e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 16 Dec 2009 18:04:38 +0100 Subject: sched: Fix select_task_rq() vs hotplug issues Since select_task_rq() is now responsible for guaranteeing ->cpus_allowed and cpu_active_mask, we need to verify this. select_task_rq_rt() can blindly return smp_processor_id()/task_cpu() without checking the valid masks, select_task_rq_fair() can do the same in the rare case that all SD_flags are disabled. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith LKML-Reference: <20091216170517.961475466@chello.nl> Signed-off-by: Ingo Molnar --- kernel/sched.c | 75 +++++++++++++++++++++++++++++++--------------------------- 1 file changed, 40 insertions(+), 35 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 63e55ac242d1..cc40bdadee7a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2317,6 +2317,43 @@ void task_oncpu_function_call(struct task_struct *p, } #ifdef CONFIG_SMP +static int select_fallback_rq(int cpu, struct task_struct *p) +{ + int dest_cpu; + const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu)); + + /* Look for allowed, online CPU in same node. */ + for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask) + if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) + return dest_cpu; + + /* Any allowed, online CPU? */ + dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_active_mask); + if (dest_cpu < nr_cpu_ids) + return dest_cpu; + + /* No more Mr. Nice Guy. */ + if (dest_cpu >= nr_cpu_ids) { + rcu_read_lock(); + cpuset_cpus_allowed_locked(p, &p->cpus_allowed); + rcu_read_unlock(); + dest_cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed); + + /* + * Don't tell them about moving exiting tasks or + * kernel threads (both mm NULL), since they never + * leave kernel. + */ + if (p->mm && printk_ratelimit()) { + printk(KERN_INFO "process %d (%s) no " + "longer affine to cpu%d\n", + task_pid_nr(p), p->comm, cpu); + } + } + + return dest_cpu; +} + /* * Called from: * @@ -2343,14 +2380,8 @@ int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags) * not worry about this generic constraint ] */ if (unlikely(!cpumask_test_cpu(cpu, &p->cpus_allowed) || - !cpu_active(cpu))) { - - cpu = cpumask_any_and(&p->cpus_allowed, cpu_active_mask); - /* - * XXX: race against hot-plug modifying cpu_active_mask - */ - BUG_ON(cpu >= nr_cpu_ids); - } + !cpu_active(cpu))) + cpu = select_fallback_rq(task_cpu(p), p); return cpu; } @@ -7319,36 +7350,10 @@ static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu) static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) { int dest_cpu; - const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(dead_cpu)); again: - /* Look for allowed, online CPU in same node. */ - for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask) - if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) - goto move; - - /* Any allowed, online CPU? */ - dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_active_mask); - if (dest_cpu < nr_cpu_ids) - goto move; - - /* No more Mr. Nice Guy. */ - if (dest_cpu >= nr_cpu_ids) { - cpuset_cpus_allowed_locked(p, &p->cpus_allowed); - dest_cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed); - - /* - * Don't tell them about moving exiting tasks or - * kernel threads (both mm NULL), since they never - * leave kernel. - */ - if (p->mm && printk_ratelimit()) { - pr_info("process %d (%s) no longer affine to cpu%d\n", - task_pid_nr(p), p->comm, dead_cpu); - } - } + dest_cpu = select_fallback_rq(dead_cpu, p); -move: /* It can have affinity changed while we were choosing. */ if (unlikely(!__migrate_task_irq(p, dead_cpu, dest_cpu))) goto again; -- cgit v1.2.3 From 881232b70b195768a71cd74ff4b4e8ab9502997b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 16 Dec 2009 18:04:39 +0100 Subject: sched: Move kthread_bind() back to kthread.c Since kthread_bind() lost its dependencies on sched.c, move it back where it came from. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith LKML-Reference: <20091216170518.039524041@chello.nl> Signed-off-by: Ingo Molnar --- kernel/kthread.c | 23 +++++++++++++++++++++++ kernel/sched.c | 26 -------------------------- 2 files changed, 23 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/kthread.c b/kernel/kthread.c index ab7ae57773e1..fbb6222fe7e0 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -149,6 +149,29 @@ struct task_struct *kthread_create(int (*threadfn)(void *data), } EXPORT_SYMBOL(kthread_create); +/** + * kthread_bind - bind a just-created kthread to a cpu. + * @p: thread created by kthread_create(). + * @cpu: cpu (might not be online, must be possible) for @k to run on. + * + * Description: This function is equivalent to set_cpus_allowed(), + * except that @cpu doesn't need to be online, and the thread must be + * stopped (i.e., just returned from kthread_create()). + */ +void kthread_bind(struct task_struct *p, unsigned int cpu) +{ + /* Must have done schedule() in kthread() before we set_task_cpu */ + if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE)) { + WARN_ON(1); + return; + } + + p->cpus_allowed = cpumask_of_cpu(cpu); + p->rt.nr_cpus_allowed = 1; + p->flags |= PF_THREAD_BOUND; +} +EXPORT_SYMBOL(kthread_bind); + /** * kthread_stop - stop a thread created by kthread_create(). * @k: thread created by kthread_create(). diff --git a/kernel/sched.c b/kernel/sched.c index cc40bdadee7a..297dc441ff96 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2004,32 +2004,6 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p, p->sched_class->prio_changed(rq, p, oldprio, running); } -/** - * kthread_bind - bind a just-created kthread to a cpu. - * @p: thread created by kthread_create(). - * @cpu: cpu (might not be online, must be possible) for @k to run on. - * - * Description: This function is equivalent to set_cpus_allowed(), - * except that @cpu doesn't need to be online, and the thread must be - * stopped (i.e., just returned from kthread_create()). - * - * Function lives here instead of kthread.c because it messes with - * scheduler internals which require locking. - */ -void kthread_bind(struct task_struct *p, unsigned int cpu) -{ - /* Must have done schedule() in kthread() before we set_task_cpu */ - if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE)) { - WARN_ON(1); - return; - } - - p->cpus_allowed = cpumask_of_cpu(cpu); - p->rt.nr_cpus_allowed = 1; - p->flags |= PF_THREAD_BOUND; -} -EXPORT_SYMBOL(kthread_bind); - #ifdef CONFIG_SMP /* * Is this task likely cache-hot: -- cgit v1.2.3 From efbbd05a595343a413964ad85a2ad359b7b7efbd Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 16 Dec 2009 18:04:40 +0100 Subject: sched: Add pre and post wakeup hooks As will be apparent in the next patch, we need a pre wakeup hook for sched_fair task migration, hence rename the post wakeup hook and one pre wakeup. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith LKML-Reference: <20091216170518.114746117@chello.nl> Signed-off-by: Ingo Molnar --- kernel/sched.c | 12 ++++++++---- kernel/sched_rt.c | 4 ++-- 2 files changed, 10 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 297dc441ff96..6c571bdd5658 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2412,6 +2412,10 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, if (task_contributes_to_load(p)) rq->nr_uninterruptible--; p->state = TASK_WAKING; + + if (p->sched_class->task_waking) + p->sched_class->task_waking(rq, p); + __task_rq_unlock(rq); cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags); @@ -2475,8 +2479,8 @@ out_running: p->state = TASK_RUNNING; #ifdef CONFIG_SMP - if (p->sched_class->task_wake_up) - p->sched_class->task_wake_up(rq, p); + if (p->sched_class->task_woken) + p->sched_class->task_woken(rq, p); if (unlikely(rq->idle_stamp)) { u64 delta = rq->clock - rq->idle_stamp; @@ -2666,8 +2670,8 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) trace_sched_wakeup_new(rq, p, 1); check_preempt_curr(rq, p, WF_FORK); #ifdef CONFIG_SMP - if (p->sched_class->task_wake_up) - p->sched_class->task_wake_up(rq, p); + if (p->sched_class->task_woken) + p->sched_class->task_woken(rq, p); #endif task_rq_unlock(rq, &flags); } diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index d2ea2828164e..f48328ac216f 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -1472,7 +1472,7 @@ static void post_schedule_rt(struct rq *rq) * If we are not running and we are not going to reschedule soon, we should * try to push tasks away now */ -static void task_wake_up_rt(struct rq *rq, struct task_struct *p) +static void task_woken_rt(struct rq *rq, struct task_struct *p) { if (!task_running(rq, p) && !test_tsk_need_resched(rq->curr) && @@ -1753,7 +1753,7 @@ static const struct sched_class rt_sched_class = { .rq_offline = rq_offline_rt, .pre_schedule = pre_schedule_rt, .post_schedule = post_schedule_rt, - .task_wake_up = task_wake_up_rt, + .task_woken = task_woken_rt, .switched_from = switched_from_rt, #endif -- cgit v1.2.3 From 88ec22d3edb72b261f8628226cd543589a6d5e1b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 16 Dec 2009 18:04:41 +0100 Subject: sched: Remove the cfs_rq dependency from set_task_cpu() In order to remove the cfs_rq dependency from set_task_cpu() we need to ensure the task is cfs_rq invariant for all callsites. The simple approach is to substract cfs_rq->min_vruntime from se->vruntime on dequeue, and add cfs_rq->min_vruntime on enqueue. However, this has the downside of breaking FAIR_SLEEPERS since we loose the old vruntime as we only maintain the relative position. To solve this, we observe that we only migrate runnable tasks, we do this using deactivate_task(.sleep=0) and activate_task(.wakeup=0), therefore we can restrain the min_vruntime invariance to that state. The only other case is wakeup balancing, since we want to maintain the old vruntime we cannot make it relative on dequeue, but since we don't migrate inactive tasks, we can do so right before we activate it again. This is where we need the new pre-wakeup hook, we need to call this while still holding the old rq->lock. We could fold it into ->select_task_rq(), but since that has multiple callsites and would obfuscate the locking requirements, that seems like a fudge. This leaves the fork() case, simply make sure that ->task_fork() leaves the ->vruntime in a relative state. This covers all cases where set_task_cpu() gets called, and ensures it sees a relative vruntime. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith LKML-Reference: <20091216170518.191697025@chello.nl> Signed-off-by: Ingo Molnar --- kernel/sched.c | 6 +----- kernel/sched_fair.c | 50 ++++++++++++++++++++++++++++++++++++++++++++------ 2 files changed, 45 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 6c571bdd5658..f92ce63edfff 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2038,8 +2038,6 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) void set_task_cpu(struct task_struct *p, unsigned int new_cpu) { int old_cpu = task_cpu(p); - struct cfs_rq *old_cfsrq = task_cfs_rq(p), - *new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu); #ifdef CONFIG_SCHED_DEBUG /* @@ -2056,8 +2054,6 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 1, NULL, 0); } - p->se.vruntime -= old_cfsrq->min_vruntime - - new_cfsrq->min_vruntime; __set_task_cpu(p, new_cpu); } @@ -10102,7 +10098,7 @@ void sched_move_task(struct task_struct *tsk) #ifdef CONFIG_FAIR_GROUP_SCHED if (tsk->sched_class->moved_group) - tsk->sched_class->moved_group(tsk); + tsk->sched_class->moved_group(tsk, on_rq); #endif if (unlikely(running)) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index ec1d2715620c..42ac3c9f66f6 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -510,6 +510,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, curr->sum_exec_runtime += delta_exec; schedstat_add(cfs_rq, exec_clock, delta_exec); delta_exec_weighted = calc_delta_fair(delta_exec, curr); + curr->vruntime += delta_exec_weighted; update_min_vruntime(cfs_rq); } @@ -765,16 +766,26 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) se->vruntime = vruntime; } +#define ENQUEUE_WAKEUP 1 +#define ENQUEUE_MIGRATE 2 + static void -enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup) +enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { + /* + * Update the normalized vruntime before updating min_vruntime + * through callig update_curr(). + */ + if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATE)) + se->vruntime += cfs_rq->min_vruntime; + /* * Update run-time statistics of the 'current'. */ update_curr(cfs_rq); account_entity_enqueue(cfs_rq, se); - if (wakeup) { + if (flags & ENQUEUE_WAKEUP) { place_entity(cfs_rq, se, 0); enqueue_sleeper(cfs_rq, se); } @@ -828,6 +839,14 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) __dequeue_entity(cfs_rq, se); account_entity_dequeue(cfs_rq, se); update_min_vruntime(cfs_rq); + + /* + * Normalize the entity after updating the min_vruntime because the + * update can refer to the ->curr item and we need to reflect this + * movement in our normalized position. + */ + if (!sleep) + se->vruntime -= cfs_rq->min_vruntime; } /* @@ -1038,13 +1057,19 @@ static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup) { struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se; + int flags = 0; + + if (wakeup) + flags |= ENQUEUE_WAKEUP; + if (p->state == TASK_WAKING) + flags |= ENQUEUE_MIGRATE; for_each_sched_entity(se) { if (se->on_rq) break; cfs_rq = cfs_rq_of(se); - enqueue_entity(cfs_rq, se, wakeup); - wakeup = 1; + enqueue_entity(cfs_rq, se, flags); + flags = ENQUEUE_WAKEUP; } hrtick_update(rq); @@ -1120,6 +1145,14 @@ static void yield_task_fair(struct rq *rq) #ifdef CONFIG_SMP +static void task_waking_fair(struct rq *rq, struct task_struct *p) +{ + struct sched_entity *se = &p->se; + struct cfs_rq *cfs_rq = cfs_rq_of(se); + + se->vruntime -= cfs_rq->min_vruntime; +} + #ifdef CONFIG_FAIR_GROUP_SCHED /* * effective_load() calculates the load change as seen from the root_task_group @@ -1978,6 +2011,8 @@ static void task_fork_fair(struct task_struct *p) resched_task(rq->curr); } + se->vruntime -= cfs_rq->min_vruntime; + raw_spin_unlock_irqrestore(&rq->lock, flags); } @@ -2031,12 +2066,13 @@ static void set_curr_task_fair(struct rq *rq) } #ifdef CONFIG_FAIR_GROUP_SCHED -static void moved_group_fair(struct task_struct *p) +static void moved_group_fair(struct task_struct *p, int on_rq) { struct cfs_rq *cfs_rq = task_cfs_rq(p); update_curr(cfs_rq); - place_entity(cfs_rq, &p->se, 1); + if (!on_rq) + place_entity(cfs_rq, &p->se, 1); } #endif @@ -2076,6 +2112,8 @@ static const struct sched_class fair_sched_class = { .move_one_task = move_one_task_fair, .rq_online = rq_online_fair, .rq_offline = rq_offline_fair, + + .task_waking = task_waking_fair, #endif .set_curr_task = set_curr_task_fair, -- cgit v1.2.3 From 738d2be4301007f054541c5c4bf7fb6a361c9b3a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 16 Dec 2009 18:04:42 +0100 Subject: sched: Simplify set_task_cpu() Rearrange code a bit now that its a simpler function. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith LKML-Reference: <20091216170518.269101883@chello.nl> Signed-off-by: Ingo Molnar --- kernel/sched.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index f92ce63edfff..8a2bfd37ab4f 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2034,11 +2034,8 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) return delta < (s64)sysctl_sched_migration_cost; } - void set_task_cpu(struct task_struct *p, unsigned int new_cpu) { - int old_cpu = task_cpu(p); - #ifdef CONFIG_SCHED_DEBUG /* * We should never call set_task_cpu() on a blocked task, @@ -2049,11 +2046,11 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) trace_sched_migrate_task(p, new_cpu); - if (old_cpu != new_cpu) { - p->se.nr_migrations++; - perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, - 1, 1, NULL, 0); - } + if (task_cpu(p) == new_cpu) + return; + + p->se.nr_migrations++; + perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 1, NULL, 0); __set_task_cpu(p, new_cpu); } -- cgit v1.2.3 From 416eb39556a03d1c7e52b0791e9052ccd71db241 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 17 Dec 2009 06:05:49 +0100 Subject: sched: Make warning less noisy Cc: Peter Zijlstra Cc: Mike Galbraith LKML-Reference: <20091216170517.807938893@chello.nl> Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 8a2bfd37ab4f..af7dfa74e6bb 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2041,7 +2041,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) * We should never call set_task_cpu() on a blocked task, * ttwu() will sort out the placement. */ - WARN_ON(p->state != TASK_RUNNING && p->state != TASK_WAKING); + WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING); #endif trace_sched_migrate_task(p, new_cpu); -- cgit v1.2.3 From 234da7bcdc7aaa935846534c3b726dbc79a9cdd5 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 16 Dec 2009 20:21:05 +0100 Subject: sched: Teach might_sleep() about preemptible RCU In practice, it is harmless to voluntarily sleep in a rcu_read_lock() section if we are running under preempt rcu, but it is illegal if we build a kernel running non-preemptable rcu. Currently, might_sleep() doesn't notice sleepable operations under rcu_read_lock() sections if we are running under preemptable rcu because preempt_count() is left untouched after rcu_read_lock() in this case. But we want developers who test their changes under such config to notice the "sleeping while atomic" issues. So we add rcu_read_lock_nesting to prempt_count() in might_sleep() checks. [ v2: Handle rcu-tiny ] Signed-off-by: Frederic Weisbecker Reviewed-by: Paul E. McKenney Cc: Peter Zijlstra LKML-Reference: <1260991265-8451-1-git-send-regression-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index af7dfa74e6bb..7be88a7be047 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -9682,7 +9682,7 @@ void __init sched_init(void) #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP static inline int preempt_count_equals(int preempt_offset) { - int nested = preempt_count() & ~PREEMPT_ACTIVE; + int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth(); return (nested == PREEMPT_INATOMIC_BASE + preempt_offset); } -- cgit v1.2.3 From 077614ee1e93245a3b9a4e1213659405dbeb0ba6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 17 Dec 2009 13:16:31 +0100 Subject: sched: Fix broken assertion There's a preemption race in the set_task_cpu() debug check in that when we get preempted after setting task->state we'd still be on the rq proper, but fail the test. Check for preempted tasks, since those are always on the RQ. Signed-off-by: Peter Zijlstra LKML-Reference: <20091217121830.137155561@chello.nl> Signed-off-by: Ingo Molnar --- kernel/sched.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 7be88a7be047..720df108a2d6 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2041,7 +2041,8 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) * We should never call set_task_cpu() on a blocked task, * ttwu() will sort out the placement. */ - WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING); + WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && + !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE)); #endif trace_sched_migrate_task(p, new_cpu); -- cgit v1.2.3