From 72b1e22dfcad1daca6906148fd956ffe404bb0bc Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Thu, 10 Jul 2008 11:16:45 -0700 Subject: x64, x2apic/intr-remap: generic irq migration support from process context Generic infrastructure for migrating the irq from the process context in the presence of CONFIG_GENERIC_PENDING_IRQ. This will be used later for migrating irq in the presence of interrupt-remapping. Signed-off-by: Suresh Siddha Cc: akpm@linux-foundation.org Cc: arjan@linux.intel.com Cc: andi@firstfloor.org Cc: ebiederm@xmission.com Cc: jbarnes@virtuousgeek.org Cc: steiner@sgi.com Signed-off-by: Ingo Molnar --- kernel/irq/manage.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 46d6611a33bb..628b5572a7c2 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -87,7 +87,14 @@ int irq_set_affinity(unsigned int irq, cpumask_t cpumask) set_balance_irq_affinity(irq, cpumask); #ifdef CONFIG_GENERIC_PENDING_IRQ - set_pending_irq(irq, cpumask); + if (desc->status & IRQ_MOVE_PCNTXT) { + unsigned long flags; + + spin_lock_irqsave(&desc->lock, flags); + desc->chip->set_affinity(irq, cpumask); + spin_unlock_irqrestore(&desc->lock, flags); + } else + set_pending_irq(irq, cpumask); #else desc->affinity = cpumask; desc->chip->set_affinity(irq, cpumask); -- cgit v1.2.3 From 3cac97cbb14aed00d83eb33d4613b0fe3aaea863 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Sun, 6 Jul 2008 17:23:55 +0800 Subject: rcu classic: simplify the next pending batch Use a batch number (rcp->pending) instead of a flag (rcp->next_pending). rcu_start_batch() needed to change that flag, so mb()s were needed for memory-access safety; but (after this patch is applied) rcu_start_batch() does not change the batch number (rcp->pending): rcp->pending is managed by __rcu_process_callbacks() only, and the troublesome mb()s are eliminated. The code also becomes simpler and clearer. Signed-off-by: Lai Jiangshan Cc: "Paul E. McKenney" Cc: Dipankar Sarma Cc: Gautham Shenoy Cc: Dhaval Giani Cc: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/rcuclassic.c | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c index 16eeeaa9d618..03726eb95193 100644 --- a/kernel/rcuclassic.c +++ b/kernel/rcuclassic.c @@ -60,12 +60,14 @@ EXPORT_SYMBOL_GPL(rcu_lock_map); static struct rcu_ctrlblk rcu_ctrlblk = { .cur = -300, .completed = -300, + .pending = -300, .lock = __SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock), .cpumask = CPU_MASK_NONE, }; static struct rcu_ctrlblk rcu_bh_ctrlblk = { .cur = -300, .completed = -300, + .pending = -300, .lock = __SPIN_LOCK_UNLOCKED(&rcu_bh_ctrlblk.lock), .cpumask = CPU_MASK_NONE, }; @@ -276,14 +278,8 @@ static void rcu_do_batch(struct rcu_data *rdp) */ static void rcu_start_batch(struct rcu_ctrlblk *rcp) { - if (rcp->next_pending && + if (rcp->cur != rcp->pending && rcp->completed == rcp->cur) { - rcp->next_pending = 0; - /* - * next_pending == 0 must be visible in - * __rcu_process_callbacks() before it can see new value of cur.
- */ - smp_wmb(); rcp->cur++; /* @@ -441,16 +437,14 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp, /* determine batch number */ rdp->batch = rcp->cur + 1; - /* see the comment and corresponding wmb() in - * the rcu_start_batch() - */ - smp_rmb(); - if (!rcp->next_pending) { + if (rcu_batch_after(rdp->batch, rcp->pending)) { /* and start it/schedule start if it's a new batch */ spin_lock(&rcp->lock); - rcp->next_pending = 1; - rcu_start_batch(rcp); + if (rcu_batch_after(rdp->batch, rcp->pending)) { + rcp->pending = rdp->batch; + rcu_start_batch(rcp); + } spin_unlock(&rcp->lock); } } -- cgit v1.2.3 From 5127bed588a2f8f3a1f732de2a8a190b7df5dce3 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Sun, 6 Jul 2008 17:23:59 +0800 Subject: rcu classic: new algorithm for callbacks-processing(v2) This is v2; it differs slightly from the v1 I had sent to lkml: it uses ACCESS_ONCE, and it uses rcu_batch_after()/rcu_batch_before() for batch # comparison. rcutorture test result (hotplugs: do cpu-online/offline once per second): No CONFIG_NO_HZ: OK, 12 hours. No CONFIG_NO_HZ, hotplugs: OK, 12 hours. CONFIG_NO_HZ=y: OK, 24 hours. CONFIG_NO_HZ=y, hotplugs: Failed. (It also failed without my patch applied; exactly the same bug occurred, http://lkml.org/lkml/2008/7/3/24) v1's email thread: http://lkml.org/lkml/2008/6/2/539 v1's description: The code/algorithm of the current callbacks-processing implementation is very efficient and clever, but while studying it I found a disadvantage: in multi-CPU systems, when a new RCU callback is queued (call_rcu[_bh]), the current implementation very likely invokes it only after the grace period for the batch with batch number rcp->cur+2 has completed. Actually, the callback could be invoked as soon as the grace period for the batch with batch number rcp->cur+1 has completed. The delayed invocation means that the latency of synchronize_rcu() is extended. More importantly, the callbacks usually free memory, and that work is delayed too; the reclaimer needs memory to be freed as soon as possible when little memory is left. A very simple way to solve this problem: add a field (struct rcu_head::batch) to record the batch number for the RCU callback. When a new RCU callback is queued, we determine its batch number (head->batch = rcp->cur+1), and when we process callbacks we move it to rdp->donelist if we find that head->batch <= rcp->completed. This simple change reduces the wait time for invocation a lot (from about 2.5 grace periods to about 1.5 grace periods on average in multi-CPU systems). That is the algorithm, but my implementation does not add any field to struct rcu_head: we only need to remember the last 2 batches and their batch numbers, because those 2 batches contain all entries whose grace period has not yet completed. So we use a special linked list rather than adding a field. Please see the comment of struct rcu_data. Signed-off-by: Lai Jiangshan Cc: "Paul E.
McKenney" Cc: Dipankar Sarma Cc: Gautham Shenoy Cc: Dhaval Giani Cc: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/rcuclassic.c | 157 ++++++++++++++++++++++++++++++++-------------------- 1 file changed, 97 insertions(+), 60 deletions(-) (limited to 'kernel') diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c index 03726eb95193..d3553ee55f64 100644 --- a/kernel/rcuclassic.c +++ b/kernel/rcuclassic.c @@ -120,6 +120,43 @@ static inline void force_quiescent_state(struct rcu_data *rdp, } #endif +static void __call_rcu(struct rcu_head *head, struct rcu_ctrlblk *rcp, + struct rcu_data *rdp) +{ + long batch; + smp_mb(); /* reads the most recently updated value of rcu->cur. */ + + /* + * Determine the batch number of this callback. + * + * Using ACCESS_ONCE to avoid the following error when gcc eliminates + * local variable "batch" and emits codes like this: + * 1) rdp->batch = rcp->cur + 1 # gets old value + * ...... + * 2)rcu_batch_after(rcp->cur + 1, rdp->batch) # gets new value + * then [*nxttail[0], *nxttail[1]) may contain callbacks + * that batch# = rdp->batch, see the comment of struct rcu_data. + */ + batch = ACCESS_ONCE(rcp->cur) + 1; + + if (rdp->nxtlist && rcu_batch_after(batch, rdp->batch)) { + /* process callbacks */ + rdp->nxttail[0] = rdp->nxttail[1]; + rdp->nxttail[1] = rdp->nxttail[2]; + if (rcu_batch_after(batch - 1, rdp->batch)) + rdp->nxttail[0] = rdp->nxttail[2]; + } + + rdp->batch = batch; + *rdp->nxttail[2] = head; + rdp->nxttail[2] = &head->next; + + if (unlikely(++rdp->qlen > qhimark)) { + rdp->blimit = INT_MAX; + force_quiescent_state(rdp, &rcu_ctrlblk); + } +} + /** * call_rcu - Queue an RCU callback for invocation after a grace period. * @head: structure to be used for queueing the RCU updates. @@ -135,18 +172,11 @@ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) { unsigned long flags; - struct rcu_data *rdp; head->func = func; head->next = NULL; local_irq_save(flags); - rdp = &__get_cpu_var(rcu_data); - *rdp->nxttail = head; - rdp->nxttail = &head->next; - if (unlikely(++rdp->qlen > qhimark)) { - rdp->blimit = INT_MAX; - force_quiescent_state(rdp, &rcu_ctrlblk); - } + __call_rcu(head, &rcu_ctrlblk, &__get_cpu_var(rcu_data)); local_irq_restore(flags); } EXPORT_SYMBOL_GPL(call_rcu); @@ -171,20 +201,11 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) { unsigned long flags; - struct rcu_data *rdp; head->func = func; head->next = NULL; local_irq_save(flags); - rdp = &__get_cpu_var(rcu_bh_data); - *rdp->nxttail = head; - rdp->nxttail = &head->next; - - if (unlikely(++rdp->qlen > qhimark)) { - rdp->blimit = INT_MAX; - force_quiescent_state(rdp, &rcu_bh_ctrlblk); - } - + __call_rcu(head, &rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data)); local_irq_restore(flags); } EXPORT_SYMBOL_GPL(call_rcu_bh); @@ -213,12 +234,6 @@ EXPORT_SYMBOL_GPL(rcu_batches_completed_bh); static inline void raise_rcu_softirq(void) { raise_softirq(RCU_SOFTIRQ); - /* - * The smp_mb() here is required to ensure that this cpu's - * __rcu_process_callbacks() reads the most recently updated - * value of rcu->cur. - */ - smp_mb(); } /* @@ -360,13 +375,15 @@ static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp, * which is dead and hence not processing interrupts. 
*/ static void rcu_move_batch(struct rcu_data *this_rdp, struct rcu_head *list, - struct rcu_head **tail) + struct rcu_head **tail, long batch) { - local_irq_disable(); - *this_rdp->nxttail = list; - if (list) - this_rdp->nxttail = tail; - local_irq_enable(); + if (list) { + local_irq_disable(); + this_rdp->batch = batch; + *this_rdp->nxttail[2] = list; + this_rdp->nxttail[2] = tail; + local_irq_enable(); + } } static void __rcu_offline_cpu(struct rcu_data *this_rdp, @@ -380,9 +397,9 @@ static void __rcu_offline_cpu(struct rcu_data *this_rdp, if (rcp->cur != rcp->completed) cpu_quiet(rdp->cpu, rcp); spin_unlock_bh(&rcp->lock); - rcu_move_batch(this_rdp, rdp->donelist, rdp->donetail); - rcu_move_batch(this_rdp, rdp->curlist, rdp->curtail); - rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail); + /* spin_lock implies smp_mb() */ + rcu_move_batch(this_rdp, rdp->donelist, rdp->donetail, rcp->cur + 1); + rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail[2], rcp->cur + 1); local_irq_disable(); this_rdp->qlen += rdp->qlen; @@ -416,27 +433,37 @@ static void rcu_offline_cpu(int cpu) static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) { - if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch)) { - *rdp->donetail = rdp->curlist; - rdp->donetail = rdp->curtail; - rdp->curlist = NULL; - rdp->curtail = &rdp->curlist; - } - - if (rdp->nxtlist && !rdp->curlist) { + if (rdp->nxtlist) { local_irq_disable(); - rdp->curlist = rdp->nxtlist; - rdp->curtail = rdp->nxttail; - rdp->nxtlist = NULL; - rdp->nxttail = &rdp->nxtlist; - local_irq_enable(); /* - * start the next batch of callbacks + * move the other grace-period-completed entries to + * [rdp->nxtlist, *rdp->nxttail[0]) temporarily + */ + if (!rcu_batch_before(rcp->completed, rdp->batch)) + rdp->nxttail[0] = rdp->nxttail[1] = rdp->nxttail[2]; + else if (!rcu_batch_before(rcp->completed, rdp->batch - 1)) + rdp->nxttail[0] = rdp->nxttail[1]; + + /* + * the grace period for entries in + * [rdp->nxtlist, *rdp->nxttail[0]) has completed and + * move these entries to donelist */ + if (rdp->nxttail[0] != &rdp->nxtlist) { + *rdp->donetail = rdp->nxtlist; + rdp->donetail = rdp->nxttail[0]; + rdp->nxtlist = *rdp->nxttail[0]; + *rdp->donetail = NULL; + + if (rdp->nxttail[1] == rdp->nxttail[0]) + rdp->nxttail[1] = &rdp->nxtlist; + if (rdp->nxttail[2] == rdp->nxttail[0]) + rdp->nxttail[2] = &rdp->nxtlist; + rdp->nxttail[0] = &rdp->nxtlist; + } - /* determine batch number */ - rdp->batch = rcp->cur + 1; + local_irq_enable(); if (rcu_batch_after(rdp->batch, rcp->pending)) { /* and start it/schedule start if it's a new batch */ @@ -462,15 +489,26 @@ static void rcu_process_callbacks(struct softirq_action *unused) static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) { - /* This cpu has pending rcu entries and the grace period - * for them has completed. - */ - if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch)) - return 1; + if (rdp->nxtlist) { + /* + * This cpu has pending rcu entries and the grace period + * for them has completed. 
+ */ + if (!rcu_batch_before(rcp->completed, rdp->batch)) + return 1; + if (!rcu_batch_before(rcp->completed, rdp->batch - 1) && + rdp->nxttail[0] != rdp->nxttail[1]) + return 1; + if (rdp->nxttail[0] != &rdp->nxtlist) + return 1; - /* This cpu has no pending entries, but there are new entries */ - if (!rdp->curlist && rdp->nxtlist) - return 1; + /* + * This cpu has pending rcu entries and the new batch + * for then hasn't been started nor scheduled start + */ + if (rcu_batch_after(rdp->batch, rcp->pending)) + return 1; + } /* This cpu has finished callbacks to invoke */ if (rdp->donelist) @@ -506,7 +544,7 @@ int rcu_needs_cpu(int cpu) struct rcu_data *rdp = &per_cpu(rcu_data, cpu); struct rcu_data *rdp_bh = &per_cpu(rcu_bh_data, cpu); - return (!!rdp->curlist || !!rdp_bh->curlist || rcu_pending(cpu)); + return !!rdp->nxtlist || !!rdp_bh->nxtlist || rcu_pending(cpu); } void rcu_check_callbacks(int cpu, int user) @@ -553,8 +591,7 @@ static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp, struct rcu_data *rdp) { memset(rdp, 0, sizeof(*rdp)); - rdp->curtail = &rdp->curlist; - rdp->nxttail = &rdp->nxtlist; + rdp->nxttail[0] = rdp->nxttail[1] = rdp->nxttail[2] = &rdp->nxtlist; rdp->donetail = &rdp->donelist; rdp->quiescbatch = rcp->completed; rdp->qs_pending = 0; -- cgit v1.2.3 From 67182ae1c42206e516f7efb292b745e826497b24 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 10 Aug 2008 18:35:38 -0700 Subject: rcu, debug: detect stalled grace periods this is a diagnostic patch for Classic RCU. The approach is to record a timestamp at the beginning of the grace period (in rcu_start_batch()), then have rcu_check_callbacks() complain if: 1. it is running on a CPU that has holding up grace periods for a long time (say one second). This will identify the culprit assuming that the culprit has not disabled hardware irqs, instruction execution, or some such. 2. it is running on a CPU that is not holding up grace periods, but grace periods have been held up for an even longer time (say two seconds). It is enabled via the default-off CONFIG_DEBUG_RCU_STALL kernel parameter. Rather than exponential backoff, it backs off to once per 30 seconds. My feeling upon thinking on it was that if you have stalled RCU grace periods for that long, a few extra printk() messages are probably the least of your worries... Signed-off-by: Paul E. McKenney Cc: Peter Zijlstra Cc: Yinghai Lu Cc: David Witbrodt Signed-off-by: Ingo Molnar --- kernel/rcuclassic.c | 80 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 80 insertions(+) (limited to 'kernel') diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c index d4271146a9bd..d7ec731de75c 100644 --- a/kernel/rcuclassic.c +++ b/kernel/rcuclassic.c @@ -47,6 +47,7 @@ #include #include #include +#include #ifdef CONFIG_DEBUG_LOCK_ALLOC static struct lock_class_key rcu_lock_key; @@ -286,6 +287,81 @@ static void rcu_do_batch(struct rcu_data *rdp) * rcu_check_quiescent_state calls rcu_start_batch(0) to start the next grace * period (if necessary). */ + +#ifdef CONFIG_DEBUG_RCU_STALL + +static inline void record_gp_check_time(struct rcu_ctrlblk *rcp) +{ + rcp->gp_check = get_seconds() + 3; +} +static void print_other_cpu_stall(struct rcu_ctrlblk *rcp) +{ + int cpu; + long delta; + + /* Only let one CPU complain about others per time interval. 
*/ + + spin_lock(&rcp->lock); + delta = get_seconds() - rcp->gp_check; + if (delta < 2L || + cpus_empty(rcp->cpumask)) { + spin_unlock(&rcp->lock); + return; + rcp->gp_check = get_seconds() + 30; + } + spin_unlock(&rcp->lock); + + /* OK, time to rat on our buddy... */ + + printk(KERN_ERR "RCU detected CPU stalls:"); + for_each_cpu_mask(cpu, rcp->cpumask) + printk(" %d", cpu); + printk(" (detected by %d, t=%lu/%lu)\n", + smp_processor_id(), get_seconds(), rcp->gp_check); +} +static void print_cpu_stall(struct rcu_ctrlblk *rcp) +{ + printk(KERN_ERR "RCU detected CPU %d stall (t=%lu/%lu)\n", + smp_processor_id(), get_seconds(), rcp->gp_check); + dump_stack(); + spin_lock(&rcp->lock); + if ((long)(get_seconds() - rcp->gp_check) >= 0L) + rcp->gp_check = get_seconds() + 30; + spin_unlock(&rcp->lock); +} +static inline void check_cpu_stall(struct rcu_ctrlblk *rcp, + struct rcu_data *rdp) +{ + long delta; + + delta = get_seconds() - rcp->gp_check; + if (cpu_isset(smp_processor_id(), rcp->cpumask) && delta >= 0L) { + + /* We haven't checked in, so go dump stack. */ + + print_cpu_stall(rcp); + + } else if (!cpus_empty(rcp->cpumask) && delta >= 2L) { + + /* They had two seconds to dump stack, so complain. */ + + print_other_cpu_stall(rcp); + + } +} + +#else /* #ifdef CONFIG_DEBUG_RCU_STALL */ + +static inline void record_gp_check_time(struct rcu_ctrlblk *rcp) +{ +} +static inline void check_cpu_stall(struct rcu_ctrlblk *rcp, + struct rcu_data *rdp) +{ +} + +#endif /* #else #ifdef CONFIG_DEBUG_RCU_STALL */ + /* * Register a new batch of callbacks, and start it up if there is currently no * active batch and the batch to be registered has not already occurred. @@ -296,6 +372,7 @@ static void rcu_start_batch(struct rcu_ctrlblk *rcp) if (rcp->cur != rcp->pending && rcp->completed == rcp->cur) { rcp->cur++; + record_gp_check_time(rcp); /* * Accessing nohz_cpu_mask before incrementing rcp->cur needs a @@ -489,6 +566,9 @@ static void rcu_process_callbacks(struct softirq_action *unused) static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) { + /* Check for CPU stalls, if enabled. */ + check_cpu_stall(rcp, rdp); + if (rdp->nxtlist) { /* * This cpu has pending rcu entries and the grace period -- cgit v1.2.3 From 78635fc739b1254f3e0362ac430edbdd2cff01dc Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 11 Aug 2008 13:34:15 +0200 Subject: rcu, debug: detect stalled grace periods, cleanups small cleanups. 
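For reference, a minimal standalone sketch of the timing rule the stall detector introduced above implements: a deadline is recorded when a grace period starts, a CPU that is holding up the grace period complains about itself once the deadline passes, other CPUs complain only after a further two-second margin, and each report backs the deadline off by 30 seconds. Helper names, units and the plain-C harness below are illustrative assumptions, not the kernel code itself.

#include <stdio.h>

/* Illustrative sketch only: helper names, units and values are made up. */
static unsigned long gp_check;			/* stall deadline, in seconds */

static void record_gp_start(unsigned long now_sec)
{
	gp_check = now_sec + 3;			/* give the grace period 3s of slack */
}

/* Returns 0 (no report), 1 (this CPU is the culprit) or 2 (another CPU is). */
static int check_stall(unsigned long now_sec, int this_cpu_holds_gp)
{
	long delta = (long)(now_sec - gp_check);

	if (this_cpu_holds_gp && delta >= 0L) {
		gp_check = now_sec + 30;	/* back off to one report per 30s */
		return 1;
	}
	if (!this_cpu_holds_gp && delta >= 2L) {
		gp_check = now_sec + 30;
		return 2;
	}
	return 0;
}

int main(void)
{
	int a, b, c;

	record_gp_start(100);
	a = check_stall(102, 1);	/* before the deadline: no report */
	b = check_stall(104, 1);	/* past it and holding the GP: self-report */
	c = check_stall(140, 0);	/* 2s extra margin elapsed: report the others */
	printf("%d %d %d\n", a, b, c);	/* prints: 0 1 2 */
	return 0;
}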
Signed-off-by: Ingo Molnar --- kernel/rcuclassic.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c index d7ec731de75c..56b8712fe5aa 100644 --- a/kernel/rcuclassic.c +++ b/kernel/rcuclassic.c @@ -294,6 +294,7 @@ static inline void record_gp_check_time(struct rcu_ctrlblk *rcp) { rcp->gp_check = get_seconds() + 3; } + static void print_other_cpu_stall(struct rcu_ctrlblk *rcp) { int cpu; @@ -303,11 +304,9 @@ static void print_other_cpu_stall(struct rcu_ctrlblk *rcp) spin_lock(&rcp->lock); delta = get_seconds() - rcp->gp_check; - if (delta < 2L || - cpus_empty(rcp->cpumask)) { + if (delta < 2L || cpus_empty(rcp->cpumask)) { spin_unlock(&rcp->lock); return; - rcp->gp_check = get_seconds() + 30; } spin_unlock(&rcp->lock); @@ -319,6 +318,7 @@ static void print_other_cpu_stall(struct rcu_ctrlblk *rcp) printk(" (detected by %d, t=%lu/%lu)\n", smp_processor_id(), get_seconds(), rcp->gp_check); } + static void print_cpu_stall(struct rcu_ctrlblk *rcp) { printk(KERN_ERR "RCU detected CPU %d stall (t=%lu/%lu)\n", @@ -329,8 +329,8 @@ static void print_cpu_stall(struct rcu_ctrlblk *rcp) rcp->gp_check = get_seconds() + 30; spin_unlock(&rcp->lock); } -static inline void check_cpu_stall(struct rcu_ctrlblk *rcp, - struct rcu_data *rdp) + +static void check_cpu_stall(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) { long delta; @@ -341,12 +341,11 @@ static inline void check_cpu_stall(struct rcu_ctrlblk *rcp, print_cpu_stall(rcp); - } else if (!cpus_empty(rcp->cpumask) && delta >= 2L) { - - /* They had two seconds to dump stack, so complain. */ - - print_other_cpu_stall(rcp); - + } else { + if (!cpus_empty(rcp->cpumask) && delta >= 2L) { + /* They had two seconds to dump stack, so complain. */ + print_other_cpu_stall(rcp); + } } } @@ -355,8 +354,9 @@ static inline void check_cpu_stall(struct rcu_ctrlblk *rcp, static inline void record_gp_check_time(struct rcu_ctrlblk *rcp) { } -static inline void check_cpu_stall(struct rcu_ctrlblk *rcp, - struct rcu_data *rdp) + +static inline void +check_cpu_stall(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) { } -- cgit v1.2.3 From 293a17ebc944c958e24e6ffbd1d5a49abdbf489e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 12 Aug 2008 17:25:03 -0700 Subject: rcu: prevent console flood when one CPU sees another AWOL via RCU One small change needed to keep from flooding the console when one CPU notices that another is AWOL. Unless I am missing something subtle. Otherwise the cleanups look good! Signed-off-by: Paul E. McKenney Signed-off-by: Ingo Molnar --- kernel/rcuclassic.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c index 56b8712fe5aa..dab2676d9d72 100644 --- a/kernel/rcuclassic.c +++ b/kernel/rcuclassic.c @@ -308,6 +308,7 @@ static void print_other_cpu_stall(struct rcu_ctrlblk *rcp) spin_unlock(&rcp->lock); return; } + rcp->gp_check = get_seconds() + 30; spin_unlock(&rcp->lock); /* OK, time to rat on our buddy... */ -- cgit v1.2.3 From 1f7b94cd3d564901f9e04a8bc5832ae7bfd690a0 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 5 Aug 2008 09:21:44 -0700 Subject: rcu: classic RCU locking and memory-barrier cleanups This patch simplifies the locking and memory-barrier usage in the Classic RCU grace-period-detection mechanism, incorporating Lai Jiangshan's feedback from the earlier version (http://lkml.org/lkml/2008/8/1/400 and http://lkml.org/lkml/2008/8/3/43). 
Passed 10 hours of rcutorture concurrent with CPUs being put online and taken offline on a 128-hardware-thread Power machine. My apologies to whoever in the Eastern Hemisphere was planning to use this machine over the Western Hemisphere night, but it was sitting idle and... So this is ready for tip/core/rcu. This patch is in preparation for moving to a hierarchical algorithm to allow the very large SMP machines -- requested by some people at OLS, and there seem to have been a few recent patches in the 4096-CPU direction as well. The general idea is to move to a much more conservative concurrency design, then apply a hierarchy to reduce contention on the global lock by a few orders of magnitude (larger machines would see greater reductions). The reason for taking a conservative approach is that this code isn't on any fast path. Prototype in progress. This patch is against the linux-tip git tree (tip/core/rcu). If you wish to test this against 2.6.26, use the following set of patches: http://www.rdrop.com/users/paulmck/patches/2.6.26-ljsimp-1.patch http://www.rdrop.com/users/paulmck/patches/2.6.26-ljsimpfix-3.patch The first patch combines commits 5127bed588a2f8f3a1f732de2a8a190b7df5dce3 and 3cac97cbb14aed00d83eb33d4613b0fe3aaea863 from Lai Jiangshan , and the second patch contains my changes. Signed-off-by: Paul E. McKenney Signed-off-by: Ingo Molnar --- kernel/rcuclassic.c | 51 +++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 41 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c index dab2676d9d72..5de126630b10 100644 --- a/kernel/rcuclassic.c +++ b/kernel/rcuclassic.c @@ -87,6 +87,7 @@ static void force_quiescent_state(struct rcu_data *rdp, int cpu; cpumask_t cpumask; set_need_resched(); + spin_lock(&rcp->lock); if (unlikely(!rcp->signaled)) { rcp->signaled = 1; /* @@ -112,6 +113,7 @@ static void force_quiescent_state(struct rcu_data *rdp, for_each_cpu_mask_nr(cpu, cpumask) smp_send_reschedule(cpu); } + spin_unlock(&rcp->lock); } #else static inline void force_quiescent_state(struct rcu_data *rdp, @@ -125,7 +127,9 @@ static void __call_rcu(struct rcu_head *head, struct rcu_ctrlblk *rcp, struct rcu_data *rdp) { long batch; - smp_mb(); /* reads the most recently updated value of rcu->cur. */ + + head->next = NULL; + smp_mb(); /* Read of rcu->cur must happen after any change by caller. */ /* * Determine the batch number of this callback. 
@@ -175,7 +179,6 @@ void call_rcu(struct rcu_head *head, unsigned long flags; head->func = func; - head->next = NULL; local_irq_save(flags); __call_rcu(head, &rcu_ctrlblk, &__get_cpu_var(rcu_data)); local_irq_restore(flags); @@ -204,7 +207,6 @@ void call_rcu_bh(struct rcu_head *head, unsigned long flags; head->func = func; - head->next = NULL; local_irq_save(flags); __call_rcu(head, &rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data)); local_irq_restore(flags); @@ -467,17 +469,17 @@ static void rcu_move_batch(struct rcu_data *this_rdp, struct rcu_head *list, static void __rcu_offline_cpu(struct rcu_data *this_rdp, struct rcu_ctrlblk *rcp, struct rcu_data *rdp) { - /* if the cpu going offline owns the grace period + /* + * if the cpu going offline owns the grace period * we can block indefinitely waiting for it, so flush * it here */ spin_lock_bh(&rcp->lock); if (rcp->cur != rcp->completed) cpu_quiet(rdp->cpu, rcp); - spin_unlock_bh(&rcp->lock); - /* spin_lock implies smp_mb() */ rcu_move_batch(this_rdp, rdp->donelist, rdp->donetail, rcp->cur + 1); rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail[2], rcp->cur + 1); + spin_unlock_bh(&rcp->lock); local_irq_disable(); this_rdp->qlen += rdp->qlen; @@ -511,16 +513,19 @@ static void rcu_offline_cpu(int cpu) static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) { + long completed_snap; + if (rdp->nxtlist) { local_irq_disable(); + completed_snap = ACCESS_ONCE(rcp->completed); /* * move the other grace-period-completed entries to * [rdp->nxtlist, *rdp->nxttail[0]) temporarily */ - if (!rcu_batch_before(rcp->completed, rdp->batch)) + if (!rcu_batch_before(completed_snap, rdp->batch)) rdp->nxttail[0] = rdp->nxttail[1] = rdp->nxttail[2]; - else if (!rcu_batch_before(rcp->completed, rdp->batch - 1)) + else if (!rcu_batch_before(completed_snap, rdp->batch - 1)) rdp->nxttail[0] = rdp->nxttail[1]; /* @@ -561,8 +566,24 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp, static void rcu_process_callbacks(struct softirq_action *unused) { + /* + * Memory references from any prior RCU read-side critical sections + * executed by the interrupted code must be see before any RCU + * grace-period manupulations below. + */ + + smp_mb(); /* See above block comment. */ + __rcu_process_callbacks(&rcu_ctrlblk, &__get_cpu_var(rcu_data)); __rcu_process_callbacks(&rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data)); + + /* + * Memory references from any later RCU read-side critical sections + * executed by the interrupted code must be see after any RCU + * grace-period manupulations above. + */ + + smp_mb(); /* See above block comment. */ } static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) @@ -571,13 +592,15 @@ static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) check_cpu_stall(rcp, rdp); if (rdp->nxtlist) { + long completed_snap = ACCESS_ONCE(rcp->completed); + /* * This cpu has pending rcu entries and the grace period * for them has completed. */ - if (!rcu_batch_before(rcp->completed, rdp->batch)) + if (!rcu_batch_before(completed_snap, rdp->batch)) return 1; - if (!rcu_batch_before(rcp->completed, rdp->batch - 1) && + if (!rcu_batch_before(completed_snap, rdp->batch - 1) && rdp->nxttail[0] != rdp->nxttail[1]) return 1; if (rdp->nxttail[0] != &rdp->nxtlist) @@ -628,6 +651,12 @@ int rcu_needs_cpu(int cpu) return !!rdp->nxtlist || !!rdp_bh->nxtlist || rcu_pending(cpu); } +/* + * Top-level function driving RCU grace-period detection, normally + * invoked from the scheduler-clock interrupt. 
This function simply + * increments counters that are read only from softirq by this same + * CPU, so there are no memory barriers required. + */ void rcu_check_callbacks(int cpu, int user) { if (user || @@ -671,6 +700,7 @@ void rcu_check_callbacks(int cpu, int user) static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp, struct rcu_data *rdp) { + spin_lock(&rcp->lock); memset(rdp, 0, sizeof(*rdp)); rdp->nxttail[0] = rdp->nxttail[1] = rdp->nxttail[2] = &rdp->nxtlist; rdp->donetail = &rdp->donelist; @@ -678,6 +708,7 @@ static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp, rdp->qs_pending = 0; rdp->cpu = cpu; rdp->blimit = blimit; + spin_unlock(&rcp->lock); } static void __cpuinit rcu_online_cpu(int cpu) -- cgit v1.2.3 From 5802294f1b1895ee19a3d0ae72805da453afb9de Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 30 Jul 2008 14:20:55 -0400 Subject: rcu: trace fix possible mem-leak In the initialization of the RCU trace module, if rcupreempt_debugfs_init() fails, we never free the the trace buffer. This patch frees the trace buffer in case the debugfs fails. Signed-off-by: Steven Rostedt Reviewed-by: "Paul E. McKenney" Signed-off-by: Ingo Molnar --- kernel/rcupreempt_trace.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcupreempt_trace.c b/kernel/rcupreempt_trace.c index 5edf82c34bbc..35c2d3360ecf 100644 --- a/kernel/rcupreempt_trace.c +++ b/kernel/rcupreempt_trace.c @@ -308,11 +308,16 @@ out: static int __init rcupreempt_trace_init(void) { + int ret; + mutex_init(&rcupreempt_trace_mutex); rcupreempt_trace_buf = kmalloc(RCUPREEMPT_TRACE_BUF_SIZE, GFP_KERNEL); if (!rcupreempt_trace_buf) return 1; - return rcupreempt_debugfs_init(); + ret = rcupreempt_debugfs_init(); + if (ret) + kfree(rcupreempt_trace_buf); + return ret; } static void __exit rcupreempt_trace_cleanup(void) -- cgit v1.2.3 From cd95851785bcfe95fdf73689e8ecb5a1c5959231 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 17 Aug 2008 07:37:15 -0700 Subject: rcu: fix classic RCU locking cleanup lockdep problem On Fri, Aug 15, 2008 at 04:24:30PM +0200, Ingo Molnar wrote: > > Paul, > > one of your two recent RCU patches caused this lockdep splat in -tip > testing: > > -------------------> > Brought up 2 CPUs > Total of 2 processors activated (6850.87 BogoMIPS). > PM: Adding info for No Bus:platform > khelper used greatest stack depth: 3124 bytes left > > ================================= > [ INFO: inconsistent lock state ] > 2.6.27-rc3-tip #1 > --------------------------------- > inconsistent {softirq-on-W} -> {in-softirq-W} usage. > ksoftirqd/0/4 [HC0[0]:SC1[1]:HE1:SE0] takes: > (&rcu_ctrlblk.lock){-+..}, at: [] __rcu_process_callbacks+0x1ac/0x1f0 > {softirq-on-W} state was registered at: > [] __lock_acquire+0x3f4/0x5b0 > [] lock_acquire+0x89/0xc0 > [] _spin_lock+0x3b/0x70 > [] rcu_init_percpu_data+0x29/0x80 > [] rcu_cpu_notify+0xaf/0xd0 > [] notifier_call_chain+0x2d/0x60 > [] __raw_notifier_call_chain+0x1e/0x30 > [] _cpu_up+0x79/0x110 > [] cpu_up+0x4d/0x70 > [] kernel_init+0xb1/0x200 > [] kernel_thread_helper+0x7/0x10 > [] 0xffffffff > irq event stamp: 14 > hardirqs last enabled at (14): [] trace_hardirqs_on+0xb/0x10 > hardirqs last disabled at (13): [] trace_hardirqs_off+0xb/0x10 > softirqs last enabled at (0): [] copy_process+0x276/0x1190 > softirqs last disabled at (11): [] call_on_stack+0x1a/0x30 > > other info that might help us debug this: > no locks held by ksoftirqd/0/4. 
> > stack backtrace: > Pid: 4, comm: ksoftirqd/0 Not tainted 2.6.27-rc3-tip #1 > [] print_usage_bug+0x16c/0x1b0 > [] mark_lock+0xa75/0xb10 > [] ? sched_clock+0x15/0x30 > [] __lock_acquire+0x3ad/0x5b0 > [] lock_acquire+0x89/0xc0 > [] ? __rcu_process_callbacks+0x1ac/0x1f0 > [] _spin_lock+0x3b/0x70 > [] ? __rcu_process_callbacks+0x1ac/0x1f0 > [] __rcu_process_callbacks+0x1ac/0x1f0 > [] rcu_process_callbacks+0x26/0x50 > [] __do_softirq+0x95/0x120 > [] ? __do_softirq+0x0/0x120 > [] call_on_stack+0x1a/0x30 > [] ? ksoftirqd+0x96/0x110 > [] ? ksoftirqd+0x0/0x110 > [] ? kthread+0x47/0x80 > [] ? kthread+0x0/0x80 > [] ? kernel_thread_helper+0x7/0x10 > ======================= > calling init_cpufreq_transition_notifier_list+0x0/0x20 > initcall init_cpufreq_transition_notifier_list+0x0/0x20 returned 0 after 0 msecs > calling net_ns_init+0x0/0x190 > net_namespace: 676 bytes > initcall net_ns_init+0x0/0x190 returned 0 after 0 msecs > calling cpufreq_tsc+0x0/0x20 > initcall cpufreq_tsc+0x0/0x20 returned 0 after 0 msecs > calling reboot_init+0x0/0x20 > initcall reboot_init+0x0/0x20 returned 0 after 0 msecs > calling print_banner+0x0/0x10 > Booting paravirtualized kernel on bare hardware > > <----------------------- > > my guess is on: > > commit 1f7b94cd3d564901f9e04a8bc5832ae7bfd690a0 > Author: Paul E. McKenney > Date: Tue Aug 5 09:21:44 2008 -0700 > > rcu: classic RCU locking and memory-barrier cleanups > > Ingo Fixes a problem detected by lockdep in which rcu->lock was acquired both in irq context and in process context, but without disabling from process context. Signed-off-by: Paul E. McKenney Signed-off-by: Ingo Molnar --- kernel/rcuclassic.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c index 5de126630b10..fb1f1cc45142 100644 --- a/kernel/rcuclassic.c +++ b/kernel/rcuclassic.c @@ -700,7 +700,9 @@ void rcu_check_callbacks(int cpu, int user) static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp, struct rcu_data *rdp) { - spin_lock(&rcp->lock); + long flags; + + spin_lock_irqsave(&rcp->lock, flags); memset(rdp, 0, sizeof(*rdp)); rdp->nxttail[0] = rdp->nxttail[1] = rdp->nxttail[2] = &rdp->nxtlist; rdp->donetail = &rdp->donelist; @@ -708,7 +710,7 @@ static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp, rdp->qs_pending = 0; rdp->cpu = cpu; rdp->blimit = blimit; - spin_unlock(&rcp->lock); + spin_unlock_irqrestore(&rcp->lock, flags); } static void __cpuinit rcu_online_cpu(int cpu) -- cgit v1.2.3 From ded00a56e99555c3f4000ef3eebfd5fe0d574565 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 17 Aug 2008 12:50:36 -0700 Subject: rcu: remove redundant ACCESS_ONCE definition from rcupreempt.c Remove the redundant definition of ACCESS_ONCE() from rcupreempt.c in favor of the one in compiler.h. Also merge the comment header from rcupreempt.c's definition into that in compiler.h. Signed-off-by: Paul E. McKenney Signed-off-by: Ingo Molnar --- kernel/rcupreempt.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'kernel') diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c index 27827931ca0d..ca4bbbe04aa4 100644 --- a/kernel/rcupreempt.c +++ b/kernel/rcupreempt.c @@ -58,14 +58,6 @@ #include #include -/* - * Macro that prevents the compiler from reordering accesses, but does - * absolutely -nothing- to prevent CPUs from reordering. This is used - * only to mediate communication between mainline code and hardware - * interrupt and NMI handlers. 
- */ -#define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x)) - /* * PREEMPT_RCU data structures. */ -- cgit v1.2.3 From eff9b713ee3540ddab862095aaf4b1511a6758bc Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 18 Aug 2008 17:51:08 -0700 Subject: rcu: fix locking cleanup fallout Given that the rcp->lock is now acquired from call_rcu(), which can be invoked from irq-disable regions, all acquisitions need to disable irqs. The following patch fixes this. Although I don't have any reason to believe that this is the cause of Yinghai's oops, it does need to be fixed. Signed-off-by: Paul E. McKenney Cc: Yinghai Lu Signed-off-by: Ingo Molnar --- kernel/rcuclassic.c | 36 ++++++++++++++++++++++-------------- 1 file changed, 22 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c index fb1f1cc45142..c6b6cf55f3e2 100644 --- a/kernel/rcuclassic.c +++ b/kernel/rcuclassic.c @@ -86,8 +86,10 @@ static void force_quiescent_state(struct rcu_data *rdp, { int cpu; cpumask_t cpumask; + unsigned long flags; + set_need_resched(); - spin_lock(&rcp->lock); + spin_lock_irqsave(&rcp->lock, flags); if (unlikely(!rcp->signaled)) { rcp->signaled = 1; /* @@ -113,7 +115,7 @@ static void force_quiescent_state(struct rcu_data *rdp, for_each_cpu_mask_nr(cpu, cpumask) smp_send_reschedule(cpu); } - spin_unlock(&rcp->lock); + spin_unlock_irqrestore(&rcp->lock, flags); } #else static inline void force_quiescent_state(struct rcu_data *rdp, @@ -301,17 +303,18 @@ static void print_other_cpu_stall(struct rcu_ctrlblk *rcp) { int cpu; long delta; + unsigned long flags; /* Only let one CPU complain about others per time interval. */ - spin_lock(&rcp->lock); + spin_lock_irqsave(&rcp->lock, flags); delta = get_seconds() - rcp->gp_check; if (delta < 2L || cpus_empty(rcp->cpumask)) { spin_unlock(&rcp->lock); return; } rcp->gp_check = get_seconds() + 30; - spin_unlock(&rcp->lock); + spin_unlock_irqrestore(&rcp->lock, flags); /* OK, time to rat on our buddy... */ @@ -324,13 +327,15 @@ static void print_other_cpu_stall(struct rcu_ctrlblk *rcp) static void print_cpu_stall(struct rcu_ctrlblk *rcp) { + unsigned long flags; + printk(KERN_ERR "RCU detected CPU %d stall (t=%lu/%lu)\n", smp_processor_id(), get_seconds(), rcp->gp_check); dump_stack(); - spin_lock(&rcp->lock); + spin_lock_irqsave(&rcp->lock, flags); if ((long)(get_seconds() - rcp->gp_check) >= 0L) rcp->gp_check = get_seconds() + 30; - spin_unlock(&rcp->lock); + spin_unlock_irqrestore(&rcp->lock, flags); } static void check_cpu_stall(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) @@ -413,6 +418,8 @@ static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp) static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) { + unsigned long flags; + if (rdp->quiescbatch != rcp->cur) { /* start new grace period: */ rdp->qs_pending = 1; @@ -436,7 +443,7 @@ static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp, return; rdp->qs_pending = 0; - spin_lock(&rcp->lock); + spin_lock_irqsave(&rcp->lock, flags); /* * rdp->quiescbatch/rcp->cur and the cpu bitmap can come out of sync * during cpu startup. Ignore the quiescent state. 
@@ -444,7 +451,7 @@ static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp, if (likely(rdp->quiescbatch == rcp->cur)) cpu_quiet(rdp->cpu, rcp); - spin_unlock(&rcp->lock); + spin_unlock_irqrestore(&rcp->lock, flags); } @@ -469,21 +476,22 @@ static void rcu_move_batch(struct rcu_data *this_rdp, struct rcu_head *list, static void __rcu_offline_cpu(struct rcu_data *this_rdp, struct rcu_ctrlblk *rcp, struct rcu_data *rdp) { + unsigned long flags; + /* * if the cpu going offline owns the grace period * we can block indefinitely waiting for it, so flush * it here */ - spin_lock_bh(&rcp->lock); + spin_lock_irqsave(&rcp->lock, flags); if (rcp->cur != rcp->completed) cpu_quiet(rdp->cpu, rcp); rcu_move_batch(this_rdp, rdp->donelist, rdp->donetail, rcp->cur + 1); rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail[2], rcp->cur + 1); - spin_unlock_bh(&rcp->lock); + spin_unlock(&rcp->lock); - local_irq_disable(); this_rdp->qlen += rdp->qlen; - local_irq_enable(); + local_irq_restore(flags); } static void rcu_offline_cpu(int cpu) @@ -550,12 +558,12 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp, if (rcu_batch_after(rdp->batch, rcp->pending)) { /* and start it/schedule start if it's a new batch */ - spin_lock(&rcp->lock); + spin_lock_irqsave(&rcp->lock, flags); if (rcu_batch_after(rdp->batch, rcp->pending)) { rcp->pending = rdp->batch; rcu_start_batch(rcp); } - spin_unlock(&rcp->lock); + spin_unlock_irqrestore(&rcp->lock, flags); } } -- cgit v1.2.3 From 0c925d79234fe77589d8ff3861f9f8bb9e7fc3f6 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Mon, 18 Aug 2008 21:49:51 -0700 Subject: rcuclassic: fix compilation NG fix: CC kernel/rcuclassic.o kernel/rcuclassic.c: In function '__rcu_process_callbacks': kernel/rcuclassic.c:561: error: 'flags' undeclared (first use in this function) kernel/rcuclassic.c:561: error: (Each undeclared identifier is reported only once kernel/rcuclassic.c:561: error: for each function it appears in.) Declare missing variable flags. 
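As an aside to the locking fixes above, the general rule this part of the series keeps converging on is worth spelling out: once a lock can be taken from interrupt (or interrupts-disabled) context, every process-context acquisition must use the irq-saving variant, otherwise an interrupt arriving while the lock is held can deadlock the same CPU. A hedged sketch of that pattern in kernel style; the lock and counter names are hypothetical and not taken from these patches.

#include <linux/spinlock.h>

/* Hypothetical example state, not from the patches in this series. */
static DEFINE_SPINLOCK(example_lock);
static unsigned long example_events;

/*
 * Caller guarantees interrupts are already disabled here (e.g. an
 * interrupt handler or an irqs-off region), so the plain variant
 * cannot be interrupted on this CPU while the lock is held.
 */
static void example_irq_path(void)
{
	spin_lock(&example_lock);
	example_events++;
	spin_unlock(&example_lock);
}

/*
 * Runs in process context: interrupts must be disabled while holding
 * the lock, otherwise example_irq_path() on this CPU could spin on a
 * lock that its own interrupted task already holds.
 */
static void example_process_path(void)
{
	unsigned long flags;

	spin_lock_irqsave(&example_lock, flags);
	example_events++;
	spin_unlock_irqrestore(&example_lock, flags);
}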
Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- kernel/rcuclassic.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c index c6b6cf55f3e2..01e761a6b38c 100644 --- a/kernel/rcuclassic.c +++ b/kernel/rcuclassic.c @@ -557,6 +557,8 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp, local_irq_enable(); if (rcu_batch_after(rdp->batch, rcp->pending)) { + unsigned long flags; + /* and start it/schedule start if it's a new batch */ spin_lock_irqsave(&rcp->lock, flags); if (rcu_batch_after(rdp->batch, rcp->pending)) { -- cgit v1.2.3 From af4491e51632d01fbc2b856ffa9ebcd4b38db68c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 19 Aug 2008 12:33:02 +0200 Subject: sched: rt-bandwidth for user grouping interface rt_runtime is a signed value Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/user.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/user.c b/kernel/user.c index 865ecf57a096..39d6159fae43 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -169,7 +169,7 @@ static ssize_t cpu_rt_runtime_show(struct kobject *kobj, { struct user_struct *up = container_of(kobj, struct user_struct, kobj); - return sprintf(buf, "%lu\n", sched_group_rt_runtime(up->tg)); + return sprintf(buf, "%ld\n", sched_group_rt_runtime(up->tg)); } static ssize_t cpu_rt_runtime_store(struct kobject *kobj, @@ -180,7 +180,7 @@ static ssize_t cpu_rt_runtime_store(struct kobject *kobj, unsigned long rt_runtime; int rc; - sscanf(buf, "%lu", &rt_runtime); + sscanf(buf, "%ld", &rt_runtime); rc = sched_group_set_rt_runtime(up->tg, rt_runtime); -- cgit v1.2.3 From 6f0d5c390e4206dcb3804a5072a048fdb7d2b428 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 19 Aug 2008 12:33:03 +0200 Subject: sched: rt-bandwidth accounting fix It fixes an accounting bug where we would continue accumulating runtime even though the bandwidth control is disabled. This would lead to very long throttle periods once bandwidth control gets turned on again. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched_rt.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 998ba54b4543..77340b04a538 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -438,9 +438,6 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) { u64 runtime = sched_rt_runtime(rt_rq); - if (runtime == RUNTIME_INF) - return 0; - if (rt_rq->rt_throttled) return rt_rq_throttled(rt_rq); @@ -491,9 +488,11 @@ static void update_curr_rt(struct rq *rq) rt_rq = rt_rq_of_se(rt_se); spin_lock(&rt_rq->rt_runtime_lock); - rt_rq->rt_time += delta_exec; - if (sched_rt_runtime_exceeded(rt_rq)) - resched_task(curr); + if (sched_rt_runtime(rt_rq) != RUNTIME_INF) { + rt_rq->rt_time += delta_exec; + if (sched_rt_runtime_exceeded(rt_rq)) + resched_task(curr); + } spin_unlock(&rt_rq->rt_runtime_lock); } } -- cgit v1.2.3 From 0b148fa04852859972abbf848177b92daeef138a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 19 Aug 2008 12:33:04 +0200 Subject: sched: rt-bandwidth group disable fixes More extensive disable of bandwidth control. It allows sysctl_sched_rt_runtime to disable full group bandwidth control. 
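A minimal sketch of the gating this adds: one global knob (exposed as /proc/sys/kernel/sched_rt_runtime_us, where writing -1 means "no limit") short-circuits the per-group throttling checks. rt_bandwidth_enabled() mirrors the helper added in the diff below; everything else here is a simplified stand-in for the scheduler, not the real code.

#include <limits.h>
#include <stdio.h>

#define RUNTIME_INF	LLONG_MAX	/* stand-in for the kernel's RUNTIME_INF */

/* Global knob, i.e. /proc/sys/kernel/sched_rt_runtime_us; -1 disables throttling. */
static long long sysctl_sched_rt_runtime = 950000;

static int rt_bandwidth_enabled(void)
{
	return sysctl_sched_rt_runtime >= 0;
}

/* Simplified per-group check: with the knob negative, no group is ever throttled. */
static int rt_runtime_exceeded(long long rt_time, long long rt_runtime)
{
	if (!rt_bandwidth_enabled() || rt_runtime == RUNTIME_INF)
		return 0;
	return rt_time > rt_runtime;
}

int main(void)
{
	printf("%d\n", rt_runtime_exceeded(1000000, 950000));	/* 1: over budget */
	sysctl_sched_rt_runtime = -1;	/* like: echo -1 > /proc/sys/kernel/sched_rt_runtime_us */
	printf("%d\n", rt_runtime_exceeded(1000000, 950000));	/* 0: accounting disabled */
	return 0;
}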
Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 9 ++++++++- kernel/sched_rt.c | 5 ++++- 2 files changed, 12 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 9a1ddb84e26d..c1bee5fb8154 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -204,11 +204,13 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime) rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; } +static inline int rt_bandwidth_enabled(void); + static void start_rt_bandwidth(struct rt_bandwidth *rt_b) { ktime_t now; - if (rt_b->rt_runtime == RUNTIME_INF) + if (rt_bandwidth_enabled() && rt_b->rt_runtime == RUNTIME_INF) return; if (hrtimer_active(&rt_b->rt_period_timer)) @@ -839,6 +841,11 @@ static inline u64 global_rt_runtime(void) return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; } +static inline int rt_bandwidth_enabled(void) +{ + return sysctl_sched_rt_runtime >= 0; +} + #ifndef prepare_arch_switch # define prepare_arch_switch(next) do { } while (0) #endif diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 77340b04a538..94daace5ee15 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -386,7 +386,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) int i, idle = 1; cpumask_t span; - if (rt_b->rt_runtime == RUNTIME_INF) + if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) return 1; span = sched_rt_period_mask(); @@ -484,6 +484,9 @@ static void update_curr_rt(struct rq *rq) curr->se.exec_start = rq->clock; cpuacct_charge(curr, delta_exec); + if (!rt_bandwidth_enabled()) + return; + for_each_sched_rt_entity(rt_se) { rt_rq = rt_rq_of_se(rt_se); -- cgit v1.2.3 From eb755805f21bd5ded84026e167b7a90887ac42e5 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 19 Aug 2008 12:33:05 +0200 Subject: sched: extract walk_tg_tree() Extract walk_tg_tree() and make it a little more generic so we can use it in the schedulablity test. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 79 ++++++++++++++++++++++++++++++++++------------------------ 1 file changed, 46 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index c1bee5fb8154..8c019a19d052 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1387,38 +1387,24 @@ static inline void dec_cpu_load(struct rq *rq, unsigned long load) update_load_sub(&rq->load, load); } -#ifdef CONFIG_SMP -static unsigned long source_load(int cpu, int type); -static unsigned long target_load(int cpu, int type); -static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); - -static unsigned long cpu_avg_load_per_task(int cpu) -{ - struct rq *rq = cpu_rq(cpu); - - if (rq->nr_running) - rq->avg_load_per_task = rq->load.weight / rq->nr_running; - - return rq->avg_load_per_task; -} - -#ifdef CONFIG_FAIR_GROUP_SCHED - -typedef void (*tg_visitor)(struct task_group *, int, struct sched_domain *); +#if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) +typedef int (*tg_visitor)(struct task_group *, void *); /* * Iterate the full tree, calling @down when first entering a node and @up when * leaving it for the final time. 
*/ -static void -walk_tg_tree(tg_visitor down, tg_visitor up, int cpu, struct sched_domain *sd) +static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data) { struct task_group *parent, *child; + int ret; rcu_read_lock(); parent = &root_task_group; down: - (*down)(parent, cpu, sd); + ret = (*down)(parent, data); + if (ret) + goto out_unlock; list_for_each_entry_rcu(child, &parent->children, siblings) { parent = child; goto down; @@ -1426,14 +1412,42 @@ down: up: continue; } - (*up)(parent, cpu, sd); + ret = (*up)(parent, data); + if (ret) + goto out_unlock; child = parent; parent = parent->parent; if (parent) goto up; +out_unlock: rcu_read_unlock(); + + return ret; +} + +static int tg_nop(struct task_group *tg, void *data) +{ + return 0; } +#endif + +#ifdef CONFIG_SMP +static unsigned long source_load(int cpu, int type); +static unsigned long target_load(int cpu, int type); +static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); + +static unsigned long cpu_avg_load_per_task(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + + if (rq->nr_running) + rq->avg_load_per_task = rq->load.weight / rq->nr_running; + + return rq->avg_load_per_task; +} + +#ifdef CONFIG_FAIR_GROUP_SCHED static void __set_se_shares(struct sched_entity *se, unsigned long shares); @@ -1493,11 +1507,11 @@ __update_group_shares_cpu(struct task_group *tg, int cpu, * This needs to be done in a bottom-up fashion because the rq weight of a * parent group depends on the shares of its child groups. */ -static void -tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd) +static int tg_shares_up(struct task_group *tg, void *data) { unsigned long rq_weight = 0; unsigned long shares = 0; + struct sched_domain *sd = data; int i; for_each_cpu_mask(i, sd->span) { @@ -1522,6 +1536,8 @@ tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd) __update_group_shares_cpu(tg, i, shares, rq_weight); spin_unlock_irqrestore(&rq->lock, flags); } + + return 0; } /* @@ -1529,10 +1545,10 @@ tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd) * This needs to be done in a top-down fashion because the load of a child * group is a fraction of its parents load. 
*/ -static void -tg_load_down(struct task_group *tg, int cpu, struct sched_domain *sd) +static int tg_load_down(struct task_group *tg, void *data) { unsigned long load; + long cpu = (long)data; if (!tg->parent) { load = cpu_rq(cpu)->load.weight; @@ -1543,11 +1559,8 @@ tg_load_down(struct task_group *tg, int cpu, struct sched_domain *sd) } tg->cfs_rq[cpu]->h_load = load; -} -static void -tg_nop(struct task_group *tg, int cpu, struct sched_domain *sd) -{ + return 0; } static void update_shares(struct sched_domain *sd) @@ -1557,7 +1570,7 @@ static void update_shares(struct sched_domain *sd) if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { sd->last_update = now; - walk_tg_tree(tg_nop, tg_shares_up, 0, sd); + walk_tg_tree(tg_nop, tg_shares_up, sd); } } @@ -1568,9 +1581,9 @@ static void update_shares_locked(struct rq *rq, struct sched_domain *sd) spin_lock(&rq->lock); } -static void update_h_load(int cpu) +static void update_h_load(long cpu) { - walk_tg_tree(tg_load_down, tg_nop, cpu, NULL); + walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); } #else -- cgit v1.2.3 From 9a7e0b180da21885988d47558671cf580279f9d6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 19 Aug 2008 12:33:06 +0200 Subject: sched: rt-bandwidth fixes The last patch allows sysctl_sched_rt_runtime to disable bandwidth accounting for the group scheduler - however it doesn't deal with sched_setscheduler(), which will keep tasks out of groups that have no assigned runtime. If we relax this, we get into the situation where RT tasks can get into a group when we disable bandwidth control, and then starve them by enabling it again. Rework the schedulability code to check for this condition and fail to turn on bandwidth control with -EBUSY when this situation is found. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 125 +++++++++++++++++++++++++++++---------------------------- 1 file changed, 63 insertions(+), 62 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 8c019a19d052..e41bdae2778d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -300,9 +300,9 @@ static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; #endif /* CONFIG_RT_GROUP_SCHED */ -#else /* !CONFIG_FAIR_GROUP_SCHED */ +#else /* !CONFIG_USER_SCHED */ #define root_task_group init_task_group -#endif /* CONFIG_FAIR_GROUP_SCHED */ +#endif /* CONFIG_USER_SCHED */ /* task_group_lock serializes add/remove of task groups and also changes to * a task group's cpu shares. @@ -1387,7 +1387,7 @@ static inline void dec_cpu_load(struct rq *rq, unsigned long load) update_load_sub(&rq->load, load); } -#if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) +#if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(SCHED_RT_GROUP_SCHED) typedef int (*tg_visitor)(struct task_group *, void *); /* @@ -5082,7 +5082,8 @@ recheck: * Do not allow realtime tasks into groups that have no runtime * assigned. 
*/ - if (rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0) + if (rt_bandwidth_enabled() && rt_policy(policy) && + task_group(p)->rt_bandwidth.rt_runtime == 0) return -EPERM; #endif @@ -8707,73 +8708,77 @@ static DEFINE_MUTEX(rt_constraints_mutex); static unsigned long to_ratio(u64 period, u64 runtime) { if (runtime == RUNTIME_INF) - return 1ULL << 16; + return 1ULL << 20; - return div64_u64(runtime << 16, period); + return div64_u64(runtime << 20, period); } -#ifdef CONFIG_CGROUP_SCHED -static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) +/* Must be called with tasklist_lock held */ +static inline int tg_has_rt_tasks(struct task_group *tg) { - struct task_group *tgi, *parent = tg->parent; - unsigned long total = 0; + struct task_struct *g, *p; - if (!parent) { - if (global_rt_period() < period) - return 0; + do_each_thread(g, p) { + if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg) + return 1; + } while_each_thread(g, p); - return to_ratio(period, runtime) < - to_ratio(global_rt_period(), global_rt_runtime()); - } + return 0; +} - if (ktime_to_ns(parent->rt_bandwidth.rt_period) < period) - return 0; +struct rt_schedulable_data { + struct task_group *tg; + u64 rt_period; + u64 rt_runtime; +}; - rcu_read_lock(); - list_for_each_entry_rcu(tgi, &parent->children, siblings) { - if (tgi == tg) - continue; +static int tg_schedulable(struct task_group *tg, void *data) +{ + struct rt_schedulable_data *d = data; + struct task_group *child; + unsigned long total, sum = 0; + u64 period, runtime; + + period = ktime_to_ns(tg->rt_bandwidth.rt_period); + runtime = tg->rt_bandwidth.rt_runtime; - total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period), - tgi->rt_bandwidth.rt_runtime); + if (tg == d->tg) { + period = d->rt_period; + runtime = d->rt_runtime; } - rcu_read_unlock(); - return total + to_ratio(period, runtime) <= - to_ratio(ktime_to_ns(parent->rt_bandwidth.rt_period), - parent->rt_bandwidth.rt_runtime); -} -#elif defined CONFIG_USER_SCHED -static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) -{ - struct task_group *tgi; - unsigned long total = 0; - unsigned long global_ratio = - to_ratio(global_rt_period(), global_rt_runtime()); + if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg)) + return -EBUSY; - rcu_read_lock(); - list_for_each_entry_rcu(tgi, &task_groups, list) { - if (tgi == tg) - continue; + total = to_ratio(period, runtime); - total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period), - tgi->rt_bandwidth.rt_runtime); + list_for_each_entry_rcu(child, &tg->children, siblings) { + period = ktime_to_ns(child->rt_bandwidth.rt_period); + runtime = child->rt_bandwidth.rt_runtime; + + if (child == d->tg) { + period = d->rt_period; + runtime = d->rt_runtime; + } + + sum += to_ratio(period, runtime); } - rcu_read_unlock(); - return total + to_ratio(period, runtime) < global_ratio; + if (sum > total) + return -EINVAL; + + return 0; } -#endif -/* Must be called with tasklist_lock held */ -static inline int tg_has_rt_tasks(struct task_group *tg) +static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) { - struct task_struct *g, *p; - do_each_thread(g, p) { - if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg) - return 1; - } while_each_thread(g, p); - return 0; + struct rt_schedulable_data data = { + .tg = tg, + .rt_period = period, + .rt_runtime = runtime, + }; + + return walk_tg_tree(tg_schedulable, tg_nop, &data); } static int tg_set_bandwidth(struct task_group *tg, @@ -8783,14 +8788,9 @@ static int 
tg_set_bandwidth(struct task_group *tg, mutex_lock(&rt_constraints_mutex); read_lock(&tasklist_lock); - if (rt_runtime == 0 && tg_has_rt_tasks(tg)) { - err = -EBUSY; + err = __rt_schedulable(tg, rt_period, rt_runtime); + if (err) goto unlock; - } - if (!__rt_schedulable(tg, rt_period, rt_runtime)) { - err = -EINVAL; - goto unlock; - } spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock); tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period); @@ -8867,8 +8867,9 @@ static int sched_rt_global_constraints(void) rt_runtime = tg->rt_bandwidth.rt_runtime; mutex_lock(&rt_constraints_mutex); - if (!__rt_schedulable(tg, rt_period, rt_runtime)) - ret = -EINVAL; + read_lock(&tasklist_lock); + ret = __rt_schedulable(tg, rt_period, rt_runtime); + read_unlock(&tasklist_lock); mutex_unlock(&rt_constraints_mutex); return ret; -- cgit v1.2.3 From 275a89bdd3868af3008852594d2e169eaf69441b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 21 Aug 2008 06:14:55 -0700 Subject: rcu: use irq-safe locks Some earlier tip/core/rcu patches caused RCU to incorrectly enable irqs too early in boot. This caused Yinghai's repeated-kexec testing to hit oopses, presumably due to so that device interrupts left over from the prior kernel instance (which would oops the newly booting kernel before it got a chance to reset said devices). This patch therefore converts all the local_irq_disable()s in rcuclassic.c to local_irq_save(). Besides, I never did like local_irq_disable() anyway. ;-) Signed-off-by: Paul E. McKenney Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- kernel/rcuclassic.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c index 01e761a6b38c..3f6918966bda 100644 --- a/kernel/rcuclassic.c +++ b/kernel/rcuclassic.c @@ -247,6 +247,7 @@ static inline void raise_rcu_softirq(void) */ static void rcu_do_batch(struct rcu_data *rdp) { + unsigned long flags; struct rcu_head *next, *list; int count = 0; @@ -261,9 +262,9 @@ static void rcu_do_batch(struct rcu_data *rdp) } rdp->donelist = list; - local_irq_disable(); + local_irq_save(flags); rdp->qlen -= count; - local_irq_enable(); + local_irq_restore(flags); if (rdp->blimit == INT_MAX && rdp->qlen <= qlowmark) rdp->blimit = blimit; @@ -464,12 +465,14 @@ static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp, static void rcu_move_batch(struct rcu_data *this_rdp, struct rcu_head *list, struct rcu_head **tail, long batch) { + unsigned long flags; + if (list) { - local_irq_disable(); + local_irq_save(flags); this_rdp->batch = batch; *this_rdp->nxttail[2] = list; this_rdp->nxttail[2] = tail; - local_irq_enable(); + local_irq_restore(flags); } } @@ -521,10 +524,11 @@ static void rcu_offline_cpu(int cpu) static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) { + unsigned long flags; long completed_snap; if (rdp->nxtlist) { - local_irq_disable(); + local_irq_save(flags); completed_snap = ACCESS_ONCE(rcp->completed); /* @@ -554,7 +558,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp, rdp->nxttail[0] = &rdp->nxtlist; } - local_irq_enable(); + local_irq_restore(flags); if (rcu_batch_after(rdp->batch, rcp->pending)) { unsigned long flags; -- cgit v1.2.3 From a38409fbb5181324fb73b359189a72e02b6c7030 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 20 Aug 2008 12:16:09 +0200 Subject: dma-coherent: export dma_[alloc|release]_from_coherent methods fixes modular builds: ERROR: "dma_alloc_from_coherent" 
[sound/core/snd-page-alloc.ko] undefined! ERROR: "dma_release_from_coherent" [sound/core/snd-page-alloc.ko] undefined! Signed-off-by: Ingo Molnar --- kernel/dma-coherent.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/dma-coherent.c b/kernel/dma-coherent.c index c1d4d5b4c61c..f013a0c2e111 100644 --- a/kernel/dma-coherent.c +++ b/kernel/dma-coherent.c @@ -124,6 +124,7 @@ int dma_alloc_from_coherent(struct device *dev, ssize_t size, } return (mem != NULL); } +EXPORT_SYMBOL(dma_alloc_from_coherent); /** * dma_release_from_coherent() - try to free the memory allocated from per-device coherent memory pool @@ -151,3 +152,4 @@ int dma_release_from_coherent(struct device *dev, int order, void *vaddr) } return 0; } +EXPORT_SYMBOL(dma_release_from_coherent); -- cgit v1.2.3 From 94d3d8247de22c5b0624aa00616ceca459498e55 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 20 Aug 2008 16:54:41 -0700 Subject: sched: do_wait_for_common: use signal_pending_state() Change do_wait_for_common() to use signal_pending_state() instead of open coding. Signed-off-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/sched.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index d601fb0406ca..da7c5d23cc03 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4599,10 +4599,7 @@ do_wait_for_common(struct completion *x, long timeout, int state) wait.flags |= WQ_FLAG_EXCLUSIVE; __add_wait_queue_tail(&x->wait, &wait); do { - if ((state == TASK_INTERRUPTIBLE && - signal_pending(current)) || - (state == TASK_KILLABLE && - fatal_signal_pending(current))) { + if (signal_pending_state(state, current)) { timeout = -ERESTARTSYS; break; } -- cgit v1.2.3 From f31e11d87a5d7601636710195891ba462ad99f11 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 20 Aug 2008 16:54:44 -0700 Subject: wait_task_inactive(): don't consider task->nivcsw If wait_task_inactive() returns success the task was deactivated. In that case schedule() always increments ->nvcsw which alone can be used as a "generation counter". If the next call returns the same number, we can be sure that the task was unscheduled. Otherwise, because we know that .on_rq == 0 again, ->nvcsw should have been changed in between. Q: perhaps it is better to do "ncsw = (p->nvcsw << 1) | 1" ? This decreases the possibility of "was it unscheduled" false positive when ->nvcsw == 0. Signed-off-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/sched.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index da7c5d23cc03..908670aa215a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1921,11 +1921,8 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) running = task_running(rq, p); on_rq = p->se.on_rq; ncsw = 0; - if (!match_state || p->state == match_state) { - ncsw = p->nivcsw + p->nvcsw; - if (unlikely(!ncsw)) - ncsw = 1; - } + if (!match_state || p->state == match_state) + ncsw = p->nvcsw ?: 1; task_rq_unlock(rq, &flags); /* -- cgit v1.2.3 From 93dcf55f828b035fc93fc19eb03c1390e1e6d570 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 20 Aug 2008 16:54:44 -0700 Subject: wait_task_inactive: "improve" the returned value for ->nvcsw == 0 wait_task_inactive() returns 1 when p->nvcsw == 0 || p->nvcsw == 1. 
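To make the collision concrete before the fix below, here is a small user-space sketch (illustrative only, not kernel code; the loop variable stands in for p->nvcsw, and LONG_MIN is the MSB that this patch ORs in):

#include <limits.h>
#include <stdio.h>

int main(void)
{
	unsigned long nvcsw;

	for (nvcsw = 0; nvcsw <= 1; nvcsw++)
		printf("nvcsw=%lu  old=%lu  new=%#lx\n",
		       nvcsw,
		       nvcsw ?: 1UL,		/* 0 and 1 both encode to 1    */
		       nvcsw | LONG_MIN);	/* MSB set, values stay apart  */
	return 0;
}

With the old encoding the first two counter values are indistinguishable; with the new encoding the value is never zero and a false match would need the counter to wrap.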
This means that two subsequent calls can return the same number while the task was scheduled in between. Change the code to return "nvcsw | LONG_MIN" instead of "nvcsw ?: 1", now the overlap always needs LONG_MAX schedules. Signed-off-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 908670aa215a..6a43c8942b05 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1922,7 +1922,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) on_rq = p->se.on_rq; ncsw = 0; if (!match_state || p->state == match_state) - ncsw = p->nvcsw ?: 1; + ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ task_rq_unlock(rq, &flags); /* -- cgit v1.2.3 From 65eb3dc609dec17deea48dcd4de2e549d29a9824 Mon Sep 17 00:00:00 2001 From: Kevin Diggs Date: Tue, 26 Aug 2008 10:26:54 +0200 Subject: sched: add kernel doc for the completion, fix kernel-doc-nano-HOWTO.txt This patch adds kernel doc for the completion feature. An error in the split-man.pl PERL snippet in kernel-doc-nano-HOWTO.txt is also fixed. Signed-off-by: Kevin Diggs Signed-off-by: Ingo Molnar --- kernel/sched.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 29e2ec0bd831..93f5ea08be97 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4565,6 +4565,15 @@ __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) } EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ +/** + * complete: - signals a single thread waiting on this completion + * @x: holds the state of this particular completion + * + * This will wake up a single thread waiting on this completion. Threads will be + * awakened in the same order in which they were queued. + * + * See also complete_all(), wait_for_completion() and related routines. + */ void complete(struct completion *x) { unsigned long flags; @@ -4576,6 +4585,12 @@ void complete(struct completion *x) } EXPORT_SYMBOL(complete); +/** + * complete_all: - signals all threads waiting on this completion + * @x: holds the state of this particular completion + * + * This will wake up all threads waiting on this particular completion event. + */ void complete_all(struct completion *x) { unsigned long flags; @@ -4624,12 +4639,31 @@ wait_for_common(struct completion *x, long timeout, int state) return timeout; } +/** + * wait_for_completion: - waits for completion of a task + * @x: holds the state of this particular completion + * + * This waits to be signaled for completion of a specific task. It is NOT + * interruptible and there is no timeout. + * + * See also similar routines (i.e. wait_for_completion_timeout()) with timeout + * and interrupt capability. Also see complete(). + */ void __sched wait_for_completion(struct completion *x) { wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); } EXPORT_SYMBOL(wait_for_completion); +/** + * wait_for_completion_timeout: - waits for completion of a task (w/timeout) + * @x: holds the state of this particular completion + * @timeout: timeout value in jiffies + * + * This waits for either a completion of a specific task to be signaled or for a + * specified timeout to expire. The timeout is in jiffies. It is not + * interruptible. 
+ */ unsigned long __sched wait_for_completion_timeout(struct completion *x, unsigned long timeout) { @@ -4637,6 +4671,13 @@ wait_for_completion_timeout(struct completion *x, unsigned long timeout) } EXPORT_SYMBOL(wait_for_completion_timeout); +/** + * wait_for_completion_interruptible: - waits for completion of a task (w/intr) + * @x: holds the state of this particular completion + * + * This waits for completion of a specific task to be signaled. It is + * interruptible. + */ int __sched wait_for_completion_interruptible(struct completion *x) { long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE); @@ -4646,6 +4687,14 @@ int __sched wait_for_completion_interruptible(struct completion *x) } EXPORT_SYMBOL(wait_for_completion_interruptible); +/** + * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr)) + * @x: holds the state of this particular completion + * @timeout: timeout value in jiffies + * + * This waits for either a completion of a specific task to be signaled or for a + * specified timeout to expire. It is interruptible. The timeout is in jiffies. + */ unsigned long __sched wait_for_completion_interruptible_timeout(struct completion *x, unsigned long timeout) @@ -4654,6 +4703,13 @@ wait_for_completion_interruptible_timeout(struct completion *x, } EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); +/** + * wait_for_completion_killable: - waits for completion of a task (killable) + * @x: holds the state of this particular completion + * + * This waits to be signaled for completion of a specific task. It can be + * interrupted by a kill signal. + */ int __sched wait_for_completion_killable(struct completion *x) { long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE); -- cgit v1.2.3 From 0cd418ddb1ee88df7d16d5df06cb2da68eceb9e4 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Mon, 18 Aug 2008 14:39:21 -0700 Subject: rcuclassic: fix compiler warning CC kernel/rcuclassic.o kernel/rcuclassic.c: In function 'rcu_init_percpu_data': kernel/rcuclassic.c:705: warning: comparison of distinct pointer types lacks a cast kernel/rcuclassic.c:713: warning: comparison of distinct pointer types lacks a cast flags should be unsigned long. Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- kernel/rcuclassic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c index 3f6918966bda..743cf0550ff4 100644 --- a/kernel/rcuclassic.c +++ b/kernel/rcuclassic.c @@ -714,7 +714,7 @@ void rcu_check_callbacks(int cpu, int user) static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp, struct rcu_data *rdp) { - long flags; + unsigned long flags; spin_lock_irqsave(&rcp->lock, flags); memset(rdp, 0, sizeof(*rdp)); -- cgit v1.2.3 From aec0a5142cb52aaa152d962d84a838e25d520742 Mon Sep 17 00:00:00 2001 From: Bharata B Rao Date: Thu, 28 Aug 2008 14:42:49 +0530 Subject: sched: call resched_task() conditionally from new task wake up path - During wake up of a new task, task_new_fair() can do a resched_task() on the current task. Later in the code path, check_preempt_curr() also ends up doing the same, which can be avoided. Check if TIF_NEED_RESCHED is already set for the current task. - task_new_fair() does a resched_task() on the current task unconditionally. This can be done only in case when child runs before the parent. So this is a small speedup. 
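As a usage note for the completion kernel-doc added a couple of patches above, here is a minimal module-style sketch (the worker thread, its delay and the five second timeout are invented for the example, not taken from any patch in this series):

#include <linux/module.h>
#include <linux/completion.h>
#include <linux/kthread.h>
#include <linux/delay.h>
#include <linux/errno.h>

static DECLARE_COMPLETION(setup_done);

static int worker_fn(void *unused)
{
	msleep(100);			/* stand-in for real initialisation work */
	complete(&setup_done);		/* wakes exactly one waiter, in queue order */
	return 0;
}

static int __init completion_example_init(void)
{
	kthread_run(worker_fn, NULL, "completion-example");	/* error handling omitted */

	/* wait_for_completion_timeout() returns 0 on timeout,
	 * otherwise the number of jiffies left */
	if (!wait_for_completion_timeout(&setup_done, 5 * HZ))
		return -ETIMEDOUT;
	return 0;
}
module_init(completion_example_init);
MODULE_LICENSE("GPL");

complete_all() would instead release every thread waiting on the same completion at once.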
Signed-off-by: Bharata B Rao Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index fb8994c6d4bb..8264bb5dbd51 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1348,6 +1348,13 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) if (unlikely(se == pse)) return; + /* + * We can come here with TIF_NEED_RESCHED already set from new task + * wake up path. + */ + if (test_tsk_need_resched(curr)) + return; + cfs_rq_of(pse)->next = pse; /* @@ -1620,10 +1627,10 @@ static void task_new_fair(struct rq *rq, struct task_struct *p) * 'current' within the tree based on its new key value. */ swap(curr->vruntime, se->vruntime); + resched_task(rq->curr); } enqueue_task_fair(rq, p, 0); - resched_task(rq->curr); } /* -- cgit v1.2.3 From 29cbef4869bf288256ab76c7dc674cb132b35de2 Mon Sep 17 00:00:00 2001 From: Joe Korty Date: Wed, 27 Aug 2008 11:21:39 -0400 Subject: make might_sleep() display the oopsing process Expand might_sleep's printk to indicate the oopsing process. Signed-off-by: Joe Korty Signed-off-by: Ingo Molnar --- kernel/sched.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 93f5ea08be97..6e283dc7679a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8233,8 +8233,8 @@ void __might_sleep(char *file, int line) prev_jiffy = jiffies; printk(KERN_ERR "BUG: sleeping function called from invalid" " context at %s:%d\n", file, line); - printk("in_atomic():%d, irqs_disabled():%d\n", - in_atomic(), irqs_disabled()); + printk("in_atomic():%d, irqs_disabled():%d, pid: %d, name: %s\n", + in_atomic(), irqs_disabled(), current->pid, current->comm); debug_show_held_locks(current); if (irqs_disabled()) print_irqtrace_events(current); -- cgit v1.2.3 From aef745fca016aea45adae5c98e8698904dd8ad51 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 28 Aug 2008 11:34:43 +0200 Subject: sched: clean up __might_sleep() add KERN_ to the printout and clean up the flow a bit. 
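For context on the report that this and the next patch touch: it fires when a function annotated with might_sleep() is reached from atomic context, with the sleep-in-atomic debug checks enabled. A deliberately buggy, purely illustrative sketch of such a caller (the lock names are invented):

#include <linux/spinlock.h>
#include <linux/mutex.h>

static DEFINE_SPINLOCK(demo_lock);
static DEFINE_MUTEX(demo_mutex);

static void buggy_path(void)
{
	spin_lock(&demo_lock);		/* atomic context from here on */
	/*
	 * mutex_lock() starts with might_sleep(), so this is where the
	 * "BUG: sleeping function called from invalid context" report,
	 * now including the pid and comm of the caller, gets printed.
	 */
	mutex_lock(&demo_mutex);
	mutex_unlock(&demo_mutex);
	spin_unlock(&demo_lock);
}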
Signed-off-by: Ingo Molnar --- kernel/sched.c | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 6e283dc7679a..b112caaa400a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8226,20 +8226,25 @@ void __might_sleep(char *file, int line) #ifdef in_atomic static unsigned long prev_jiffy; /* ratelimiting */ - if ((in_atomic() || irqs_disabled()) && - system_state == SYSTEM_RUNNING && !oops_in_progress) { - if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) - return; - prev_jiffy = jiffies; - printk(KERN_ERR "BUG: sleeping function called from invalid" - " context at %s:%d\n", file, line); - printk("in_atomic():%d, irqs_disabled():%d, pid: %d, name: %s\n", - in_atomic(), irqs_disabled(), current->pid, current->comm); - debug_show_held_locks(current); - if (irqs_disabled()) - print_irqtrace_events(current); - dump_stack(); - } + if ((!in_atomic() && !irqs_disabled()) || + system_state != SYSTEM_RUNNING || oops_in_progress) + return; + if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) + return; + prev_jiffy = jiffies; + + printk(KERN_ERR + "BUG: sleeping function called from invalid context at %s:%d\n", + file, line); + printk(KERN_ERR + "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", + in_atomic(), irqs_disabled(), + current->pid, current->comm); + + debug_show_held_locks(current); + if (irqs_disabled()) + print_irqtrace_events(current); + dump_stack(); #endif } EXPORT_SYMBOL(__might_sleep); -- cgit v1.2.3 From 7940ca3605b77f20cc6e9852e4ca6f2d725b5653 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 19 Aug 2008 13:40:47 +0200 Subject: sched: extract walk_tg_tree(), fix fix: kernel/sched.c: In function '__rt_schedulable': kernel/sched.c:8771: error: implicit declaration of function 'walk_tg_tree' kernel/sched.c:8771: error: 'tg_nop' undeclared (first use in this function) kernel/sched.c:8771: error: (Each undeclared identifier is reported only once kernel/sched.c:8771: error: for each function it appears in.) 
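For readers following the __rt_schedulable() rework earlier in this series: walk_tg_tree() applies a "down" visitor to every task group from the root towards the leaves and an "up" visitor on the way back, aborting the walk as soon as a visitor returns non-zero (which is how tg_schedulable() propagates -EBUSY/-EINVAL), with tg_nop() as the do-nothing visitor. A hedged sketch of another caller inside kernel/sched.c (the counting callback and its data struct are invented for illustration):

struct tg_counter {
	int nr_groups;
};

static int tg_count_one(struct task_group *tg, void *data)
{
	struct tg_counter *c = data;

	c->nr_groups++;
	return 0;			/* non-zero would abort the walk */
}

static int count_task_groups(void)
{
	struct tg_counter c = { .nr_groups = 0 };

	/* count every group on the way down, do nothing on the way up */
	walk_tg_tree(tg_count_one, tg_nop, &c);
	return c.nr_groups;
}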
Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index e41bdae2778d..703f56d5db5f 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1387,7 +1387,7 @@ static inline void dec_cpu_load(struct rq *rq, unsigned long load) update_load_sub(&rq->load, load); } -#if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(SCHED_RT_GROUP_SCHED) +#if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED) typedef int (*tg_visitor)(struct task_group *, void *); /* -- cgit v1.2.3 From 268364a0f48aee2f851f9d1ef8a6cda0f3039ef1 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 4 Sep 2008 21:02:44 +0200 Subject: IO resources: add reserve_region_with_split() add reserve_region_with_split() to not lose e820 reserved entries if they overlap with existing IO regions: with test case by extend 0xe0000000 - 0xeffffff to 0xdd800000 - we get: e0000000-efffffff : PCI MMCONFIG 0 e0000000-efffffff : reserved and in /proc/iomem we get: found conflict for reserved [dd800000, efffffff], try to reserve with split __reserve_region_with_split: (PCI Bus #80) [dd000000, ddffffff], res: (reserved) [dd800000, efffffff] __reserve_region_with_split: (PCI Bus #00) [de000000, dfffffff], res: (reserved) [de000000, efffffff] initcall pci_subsys_init+0x0/0x121 returned 0 after 381 msecs in dmesg various fixes and improvements suggested by Linus. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- kernel/resource.c | 68 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index 03d796c1b2e9..414d6fc9131e 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -516,6 +516,74 @@ int adjust_resource(struct resource *res, resource_size_t start, resource_size_t return result; } +static void __init __reserve_region_with_split(struct resource *root, + resource_size_t start, resource_size_t end, + const char *name) +{ + struct resource *parent = root; + struct resource *conflict; + struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL); + + if (!res) + return; + + res->name = name; + res->start = start; + res->end = end; + res->flags = IORESOURCE_BUSY; + + for (;;) { + conflict = __request_resource(parent, res); + if (!conflict) + break; + if (conflict != parent) { + parent = conflict; + if (!(conflict->flags & IORESOURCE_BUSY)) + continue; + } + + /* Uhhuh, that didn't work out.. 
*/ + kfree(res); + res = NULL; + break; + } + + if (!res) { + printk(KERN_DEBUG " __reserve_region_with_split: (%s) [%llx, %llx], res: (%s) [%llx, %llx]\n", + conflict->name, conflict->start, conflict->end, + name, start, end); + + /* failed, split and try again */ + + /* conflict coverred whole area */ + if (conflict->start <= start && conflict->end >= end) + return; + + if (conflict->start > start) + __reserve_region_with_split(root, start, conflict->start-1, name); + if (!(conflict->flags & IORESOURCE_BUSY)) { + resource_size_t common_start, common_end; + + common_start = max(conflict->start, start); + common_end = min(conflict->end, end); + if (common_start < common_end) + __reserve_region_with_split(root, common_start, common_end, name); + } + if (conflict->end < end) + __reserve_region_with_split(root, conflict->end+1, end, name); + } + +} + +void reserve_region_with_split(struct resource *root, + resource_size_t start, resource_size_t end, + const char *name) +{ + write_lock(&resource_lock); + __reserve_region_with_split(root, start, end, name); + write_unlock(&resource_lock); +} + EXPORT_SYMBOL(adjust_resource); /** -- cgit v1.2.3 From c8bfff6dd4d41834f4952cbc49e28e31906a6188 Mon Sep 17 00:00:00 2001 From: Krzysztof Helt Date: Fri, 5 Sep 2008 23:46:19 +0200 Subject: sched: compilation fix with gcc 3.4.6 I found that 2.6.27-rc5-mm1 does not compile with gcc 3.4.6. The error is: CC kernel/sched.o kernel/sched.c: In function `start_rt_bandwidth': kernel/sched.c:208: sorry, unimplemented: inlining failed in call to 'rt_bandwidth_enabled': function body not available kernel/sched.c:214: sorry, unimplemented: called from here make[1]: *** [kernel/sched.o] Error 1 make: *** [kernel] Error 2 It seems that the gcc 3.4.6 requires full inline definition before first usage. The patch below fixes the compilation problem. Signed-off-by: Krzysztof Helt (if needed> Signed-off-by: Ingo Molnar --- kernel/sched.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 703f56d5db5f..4de2bfb28c58 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -204,7 +204,10 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime) rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; } -static inline int rt_bandwidth_enabled(void); +static inline int rt_bandwidth_enabled(void) +{ + return sysctl_sched_rt_runtime >= 0; +} static void start_rt_bandwidth(struct rt_bandwidth *rt_b) { @@ -841,11 +844,6 @@ static inline u64 global_rt_runtime(void) return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; } -static inline int rt_bandwidth_enabled(void) -{ - return sysctl_sched_rt_runtime >= 0; -} - #ifndef prepare_arch_switch # define prepare_arch_switch(next) do { } while (0) #endif -- cgit v1.2.3 From 38736f475071b80b66be28af7b44c854073699cc Mon Sep 17 00:00:00 2001 From: Gautham R Shenoy Date: Sat, 6 Sep 2008 14:50:23 +0530 Subject: sched: fix __load_balance_iterator() for cfq with only one task The __load_balance_iterator() returns a NULL when there's only one sched_entity which is a task. It is caused by the following code-path. /* Skip over entities that are not tasks */ do { se = list_entry(next, struct sched_entity, group_node); next = next->next; } while (next != &cfs_rq->tasks && !entity_is_task(se)); if (next == &cfs_rq->tasks) return NULL; ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ This will return NULL even when se is a task. 
As a side-effect, there was a regression in sched_mc behavior since 2.6.25, since iter_move_one_task() when it calls load_balance_start_fair(), would not get any tasks to move! Fix this by checking if the last entity was a task or not. Signed-off-by: Gautham R Shenoy Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 8264bb5dbd51..a10ac0bcee64 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1458,7 +1458,7 @@ __load_balance_iterator(struct cfs_rq *cfs_rq, struct list_head *next) next = next->next; } while (next != &cfs_rq->tasks && !entity_is_task(se)); - if (next == &cfs_rq->tasks) + if (next == &cfs_rq->tasks && !entity_is_task(se)) return NULL; cfs_rq->balance_iterator = next; -- cgit v1.2.3 From 3ba35573ad9a149a3af19625b502679283382f6b Mon Sep 17 00:00:00 2001 From: Manfred Spraul Date: Sun, 31 Aug 2008 19:58:49 +0200 Subject: kernel/cpu.c: Move the CPU_DYING notifiers When a cpu is taken offline, the CPU_DYING notifiers are called on the dying cpu. According to , the cpu should be "not running any task, not handling interrupts, soon dead". For the current implementation, this is not true: - __cpu_disable can fail. If it fails, then the cpu will remain alive and happy. - At least on x86, __cpu_disable() briefly enables the local interrupts to handle any outstanding interrupts. What about moving CPU_DYING down a few lines, behind the __cpu_disable() line? There are only two CPU_DYING handlers in the kernel right now: one in kvm, one in the scheduler. Both should work with the patch applied [and: I'm not sure if either one handles a failing __cpu_disable()] The patch survives simple offlining a cpu. kvm untested due to lack of a test setup. Signed-off-By: Manfred Spraul Signed-off-by: Ingo Molnar --- kernel/cpu.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index f17e9854c246..9e7ebde1331c 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -199,13 +199,14 @@ static int __ref take_cpu_down(void *_param) struct take_cpu_down_param *param = _param; int err; - raw_notifier_call_chain(&cpu_chain, CPU_DYING | param->mod, - param->hcpu); /* Ensure this CPU doesn't handle any more interrupts. */ err = __cpu_disable(); if (err < 0) return err; + raw_notifier_call_chain(&cpu_chain, CPU_DYING | param->mod, + param->hcpu); + /* Force idle task to run as soon as we yield: it should immediately notice cpu is offline and die quickly. */ sched_idle_next(); -- cgit v1.2.3 From e545a6140b698b2494daf0b32107bdcc5e901390 Mon Sep 17 00:00:00 2001 From: Manfred Spraul Date: Sun, 7 Sep 2008 16:57:22 +0200 Subject: kernel/cpu.c: create a CPU_STARTING cpu_chain notifier Right now, there is no notifier that is called on a new cpu, before the new cpu begins processing interrupts/softirqs. Various kernel function would need that notification, e.g. kvm works around by calling smp_call_function_single(), rcu polls cpu_online_map. The patch adds a CPU_STARTING notification. It also adds a helper function that sends the message to all cpu_chain handlers. Tested on x86-64. All other archs are untested. Especially on sparc, I'm not sure if I got it right. 
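Per the description above, the new hook is meant to be called from each architecture's secondary-CPU bring-up path, after the CPU can run C code but before it enables interrupts and before the boot CPU returns from __cpu_up(). A rough, hypothetical sketch of such a call site, assuming the matching declaration lands in linux/cpu.h (the arch_* helper and the function name are invented; only notify_cpu_starting() comes from this patch):

#include <linux/cpu.h>
#include <linux/irqflags.h>

/* Hypothetical secondary-CPU entry point in architecture code. */
void __cpuinit start_secondary_cpu(unsigned int cpu)
{
	arch_init_this_cpu(cpu);	/* placeholder for per-cpu arch setup */

	notify_cpu_starting(cpu);	/* CPU_STARTING notifiers run here,
					 * with interrupts still disabled */

	local_irq_enable();
	cpu_idle();			/* arch idle loop, does not return */
}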
Signed-off-by: Manfred Spraul Signed-off-by: Ingo Molnar --- kernel/cpu.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index f17e9854c246..dc45f2459efb 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -453,6 +453,25 @@ out: } #endif /* CONFIG_PM_SLEEP_SMP */ +/** + * notify_cpu_starting(cpu) - call the CPU_STARTING notifiers + * @cpu: cpu that just started + * + * This function calls the cpu_chain notifiers with CPU_STARTING. + * It must be called by the arch code on the new cpu, before the new cpu + * enables interrupts and before the "boot" cpu returns from __cpu_up(). + */ +void notify_cpu_starting(unsigned int cpu) +{ + unsigned long val = CPU_STARTING; + +#ifdef CONFIG_PM_SLEEP_SMP + if (cpu_isset(cpu, frozen_cpus)) + val = CPU_STARTING_FROZEN; +#endif /* CONFIG_PM_SLEEP_SMP */ + raw_notifier_call_chain(&cpu_chain, val, (void *)(long)cpu); +} + #endif /* CONFIG_SMP */ /* -- cgit v1.2.3 From e75b986af7881ed8d8ccb1ed154045ed17cfebd0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 9 Sep 2008 21:38:57 +0200 Subject: clockevents: remove WARN_ON which was used to gather information The issue of the endless reprogramming loop due to a too small min_delta_ns was fixed with the previous updates of the clock events code, but we had no information about the spread of this problem. I added a WARN_ON to get automated information via kerneloops.org and to get some direct reports, which allowed me to analyse the affected machines. The WARN_ON has served its purpose and would be annoying for a release kernel. Remove it and just keep the information about the increase of the min_delta_ns value. Signed-off-by: Thomas Gleixner --- kernel/time/tick-oneshot.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c index 2e35501e61dd..2e8de678e767 100644 --- a/kernel/time/tick-oneshot.c +++ b/kernel/time/tick-oneshot.c @@ -43,19 +43,17 @@ int tick_dev_program_event(struct clock_event_device *dev, ktime_t expires, * and emit a warning. */ if (++i > 2) { - printk(KERN_WARNING "CE: __tick_program_event of %s is " - "stuck %llx %llx\n", dev->name ? dev->name : "?", - now.tv64, expires.tv64); - printk(KERN_WARNING - "CE: increasing min_delta_ns %ld to %ld nsec\n", - dev->min_delta_ns, dev->min_delta_ns << 1); - WARN_ON(1); - - /* Double the min. delta and try again */ + /* Increase the min. delta and try again */ if (!dev->min_delta_ns) dev->min_delta_ns = 5000; else - dev->min_delta_ns <<= 1; + dev->min_delta_ns += dev->min_delta_ns >> 1; + + printk(KERN_WARNING + "CE: %s increasing min_delta_ns to %lu nsec\n", + dev->name ? dev->name : "?", + dev->min_delta_ns << 1); + i = 0; } -- cgit v1.2.3 From baf25731e54d06eb13dc4eda78c6dc7da4ce84e8 Mon Sep 17 00:00:00 2001 From: "Zhang, Yanmin" Date: Tue, 9 Sep 2008 11:26:33 +0800 Subject: sched: fix 2.6.27-rc5 couldn't boot on tulsa machine randomly On my tulsa x86-64 machine, kernel 2.6.25-rc5 couldn't boot randomly. Basically, function __enable_runtime forgets to reset rt_rq->rt_throttled to 0. When every cpu is up, per-cpu migration_thread is created and it runs very fast, sometimes to mark the corresponding rt_rq->rt_throttled to 1 very quickly. After all cpus are up, with below calling chain: sched_init_smp => arch_init_sched_domains => build_sched_domains => ... => cpu_attach_domain => rq_attach_root => set_rq_online => ... 
=> _enable_runtime _enable_runtime is called against every rt_rq again, so rt_rq->rt_time is reset to 0, but rt_rq->rt_throttled might be still 1. Later on function do_sched_rt_period_timer couldn't reset it, and all RT tasks couldn't be scheduled to run on that cpu. here is RT task migration_thread which is woken up when a task is migrated to another cpu. Below patch fixes it against 2.6.27-rc5. Signed-off-by: Zhang Yanmin Signed-off-by: Ingo Molnar --- kernel/sched_rt.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 552310798dad..1113157b2058 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -350,6 +350,7 @@ static void __enable_runtime(struct rq *rq) spin_lock(&rt_rq->rt_runtime_lock); rt_rq->rt_runtime = rt_b->rt_runtime; rt_rq->rt_time = 0; + rt_rq->rt_throttled = 0; spin_unlock(&rt_rq->rt_runtime_lock); spin_unlock(&rt_b->rt_runtime_lock); } -- cgit v1.2.3 From ec5d498991e87c74730509508b25c3959192b7e7 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Wed, 10 Sep 2008 17:00:19 -0700 Subject: sched: fix deadlock in setting scheduler parameter to zero Andrei Gusev wrote: > I played witch scheduler settings. After doing something like: > echo -n 1000000 >sched_rt_period_us > > command is locked. I found in kernel.log: > > Sep 11 00:39:34 zaratustra > Sep 11 00:39:34 zaratustra Pid: 4495, comm: bash Tainted: G W > (2.6.26.3 #12) > Sep 11 00:39:34 zaratustra EIP: 0060:[] EFLAGS: 00210246 CPU: 0 > Sep 11 00:39:34 zaratustra EIP is at div64_u64+0x57/0x80 > Sep 11 00:39:34 zaratustra EAX: 0000389f EBX: 00000000 ECX: 00000000 > EDX: 00000000 > Sep 11 00:39:34 zaratustra ESI: d9800000 EDI: d9800000 EBP: 0000389f > ESP: ea7a6edc > Sep 11 00:39:34 zaratustra DS: 007b ES: 007b FS: 0000 GS: 0033 SS: 0068 > Sep 11 00:39:34 zaratustra Process bash (pid: 4495, ti=ea7a6000 > task=ea744000 task.ti=ea7a6000) > Sep 11 00:39:34 zaratustra Stack: 00000000 000003e8 d9800000 0000389f > c0119042 00000000 00000000 00000001 > Sep 11 00:39:34 zaratustra 00000000 00000000 ea7a6f54 00010000 00000000 > c04d2e80 00000001 000e7ef0 > Sep 11 00:39:34 zaratustra c01191a3 00000000 00000000 ea7a6fa0 00000001 > ffffffff c04d2e80 ea5b2480 > Sep 11 00:39:34 zaratustra Call Trace: > Sep 11 00:39:34 zaratustra [] __rt_schedulable+0x52/0x130 > Sep 11 00:39:34 zaratustra [] sched_rt_handler+0x83/0x120 > Sep 11 00:39:34 zaratustra [] proc_sys_call_handler+0xb6/0xd0 > Sep 11 00:39:34 zaratustra [] proc_sys_write+0x0/0x20 > Sep 11 00:39:34 zaratustra [] proc_sys_write+0x19/0x20 > Sep 11 00:39:34 zaratustra [] vfs_write+0xa8/0x140 > Sep 11 00:39:34 zaratustra [] sys_write+0x41/0x80 > Sep 11 00:39:34 zaratustra [] sysenter_past_esp+0x6a/0x91 > Sep 11 00:39:34 zaratustra ======================= > Sep 11 00:39:34 zaratustra Code: c8 41 0f ad f3 d3 ee f6 c1 20 0f 45 de > 31 f6 0f ad ef d3 ed f6 c1 20 0f 45 fd 0f 45 ee 31 c9 39 eb 89 fe 89 ea > 77 08 89 e8 31 d2 f3 89 c1 89 f0 8b 7c 24 08 f7 f3 8b 74 24 04 89 > ca 8b 1c 24 > Sep 11 00:39:34 zaratustra EIP: [] div64_u64+0x57/0x80 SS:ESP > 0068:ea7a6edc > Sep 11 00:39:34 zaratustra ---[ end trace 4eaa2a86a8e2da22 ]--- fix the boundary condition. sysctl_sched_rt_period=0 makes exception at to_ratio(). 
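To connect the oops to the fix: to_ratio(), as reworked earlier in this series, computes div64_u64(runtime << 20, period), so a zero rt_period coming in from the sysctl reaches div64_u64() with a zero divisor; rejecting sysctl_sched_rt_period <= 0 up front keeps that from happening. A user-space mirror of the computation, illustrative only (the 950000/1000000 values in the comment are the usual 95% rt-bandwidth defaults):

#include <stdint.h>
#include <stdio.h>

#define RUNTIME_INF	(~0ULL)

/* User-space mirror of the kernel's to_ratio(): runtime/period as a
 * 20-bit fixed point fraction, where 1 << 20 means a ratio of 1.0. */
static uint64_t to_ratio(uint64_t period, uint64_t runtime)
{
	if (runtime == RUNTIME_INF)
		return 1ULL << 20;

	/* period == 0 is exactly the zero divisor the fix guards against */
	return (runtime << 20) / period;
}

int main(void)
{
	/* e.g. 950000us runtime per 1000000us period */
	printf("ratio = %llu (1.0 == %llu)\n",
	       (unsigned long long)to_ratio(1000000, 950000),
	       (unsigned long long)(1ULL << 20));
	return 0;
}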
Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- kernel/sched.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index cc1f81b50b82..98890807375b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8909,6 +8909,9 @@ static int sched_rt_global_constraints(void) u64 rt_runtime, rt_period; int ret = 0; + if (sysctl_sched_rt_period <= 0) + return -EINVAL; + rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period); rt_runtime = tg->rt_bandwidth.rt_runtime; @@ -8925,6 +8928,9 @@ static int sched_rt_global_constraints(void) unsigned long flags; int i; + if (sysctl_sched_rt_period <= 0) + return -EINVAL; + spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); for_each_possible_cpu(i) { struct rt_rq *rt_rq = &cpu_rq(i)->rt; -- cgit v1.2.3 From 2344abbcbdb82140050e8be29d3d55e4f6fe860b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 16 Sep 2008 11:32:50 -0700 Subject: clockevents: make device shutdown robust The device shut down does not cleanup the next_event variable of the clock event device. So when the device is reactivated the possible stale next_event value can prevent the device to be reprogrammed as it claims to wait on a event already. This is the root cause of the resurfacing suspend/resume problem, where systems need key press to come back to life. Fix this by setting next_event to KTIME_MAX when the device is shut down. Use a separate function for shutdown which takes care of that and only keep the direct set mode call in the broadcast code, where we can not touch the next_event value. Signed-off-by: Thomas Gleixner --- kernel/time/clockevents.c | 12 +++++++++++- kernel/time/tick-broadcast.c | 9 ++++----- kernel/time/tick-common.c | 4 ++-- kernel/time/tick-internal.h | 2 ++ 4 files changed, 19 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 1876b526c778..f8d968063cea 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -71,6 +71,16 @@ void clockevents_set_mode(struct clock_event_device *dev, } } +/** + * clockevents_shutdown - shutdown the device and clear next_event + * @dev: device to shutdown + */ +void clockevents_shutdown(struct clock_event_device *dev) +{ + clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN); + dev->next_event.tv64 = KTIME_MAX; +} + /** * clockevents_program_event - Reprogram the clock event device. 
* @expires: absolute expiry time (monotonic clock) @@ -206,7 +216,7 @@ void clockevents_exchange_device(struct clock_event_device *old, if (new) { BUG_ON(new->mode != CLOCK_EVT_MODE_UNUSED); - clockevents_set_mode(new, CLOCK_EVT_MODE_SHUTDOWN); + clockevents_shutdown(new); } local_irq_restore(flags); } diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 2f5a38294bf9..f1f3eee28113 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -236,8 +236,7 @@ static void tick_do_broadcast_on_off(void *why) if (!cpu_isset(cpu, tick_broadcast_mask)) { cpu_set(cpu, tick_broadcast_mask); if (td->mode == TICKDEV_MODE_PERIODIC) - clockevents_set_mode(dev, - CLOCK_EVT_MODE_SHUTDOWN); + clockevents_shutdown(dev); } if (*reason == CLOCK_EVT_NOTIFY_BROADCAST_FORCE) tick_broadcast_force = 1; @@ -254,7 +253,7 @@ static void tick_do_broadcast_on_off(void *why) if (cpus_empty(tick_broadcast_mask)) { if (!bc_stopped) - clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN); + clockevents_shutdown(bc); } else if (bc_stopped) { if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) tick_broadcast_start_periodic(bc); @@ -306,7 +305,7 @@ void tick_shutdown_broadcast(unsigned int *cpup) if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) { if (bc && cpus_empty(tick_broadcast_mask)) - clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN); + clockevents_shutdown(bc); } spin_unlock_irqrestore(&tick_broadcast_lock, flags); @@ -321,7 +320,7 @@ void tick_suspend_broadcast(void) bc = tick_broadcast_device.evtdev; if (bc) - clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN); + clockevents_shutdown(bc); spin_unlock_irqrestore(&tick_broadcast_lock, flags); } diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index c4777193d567..019315ebf9de 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -249,7 +249,7 @@ static int tick_check_new_device(struct clock_event_device *newdev) * not give it back to the clockevents layer ! */ if (tick_is_broadcast_device(curdev)) { - clockevents_set_mode(curdev, CLOCK_EVT_MODE_SHUTDOWN); + clockevents_shutdown(curdev); curdev = NULL; } clockevents_exchange_device(curdev, newdev); @@ -311,7 +311,7 @@ static void tick_suspend(void) unsigned long flags; spin_lock_irqsave(&tick_device_lock, flags); - clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_SHUTDOWN); + clockevents_shutdown(td->evtdev); spin_unlock_irqrestore(&tick_device_lock, flags); } diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index 0ffc2918ea6f..6e9db9734aa6 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -10,6 +10,8 @@ extern int tick_do_timer_cpu __read_mostly; extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast); extern void tick_handle_periodic(struct clock_event_device *dev); +extern void clockevents_shutdown(struct clock_event_device *dev); + /* * NO_HZ / high resolution timer shared code */ -- cgit v1.2.3 From 15afe09bf496ae10c989e1a375a6b5da7bd3e16e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 20 Sep 2008 23:38:02 +0200 Subject: sched: wakeup preempt when small overlap Lin Ming reported a 10% OLTP regression against 2.6.27-rc4. The difference seems to come from different preemption agressiveness, which affects the cache footprint of the workload and its effective cache trashing. Aggresively preempt a task if its avg overlap is very small, this should avoid the task going to sleep and find it still running when we schedule back to it - saving a wakeup. 
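Pulled out of check_preempt_wakeup() for readability, the new heuristic amounts to the following predicate; this is a restatement of the diff below with the reasoning spelled out in comments, not a separate implementation:

/*
 * Preempt on a synchronous wakeup (the waker has hinted it is about to
 * sleep) when both the current task and the woken task historically run
 * only very briefly around such wakeups, i.e. their average overlap is
 * below the migration-cost threshold. The woken task is then likely to
 * finish quickly and hand the CPU straight back, saving a sleep/wakeup
 * round trip.
 */
static int wakeup_overlap_preempt(struct sched_entity *se,
				  struct sched_entity *pse, int sync)
{
	return sched_feat(WAKEUP_OVERLAP) && sync &&
	       se->avg_overlap < sysctl_sched_migration_cost &&
	       pse->avg_overlap < sysctl_sched_migration_cost;
}

As a follow-up patch further below shows, the heuristic is guarded by the WAKEUP_OVERLAP sched_feat bit and ends up disabled by default after the sysbench numbers.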
Reported-by: Lin Ming Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 12 ++++++------ kernel/sched_fair.c | 13 ++++++++++--- kernel/sched_features.h | 1 + kernel/sched_idletask.c | 6 +++--- kernel/sched_rt.c | 2 +- 5 files changed, 21 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 0d8905a1b8ca..ad9d39b021f8 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -604,9 +604,9 @@ struct rq { static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); -static inline void check_preempt_curr(struct rq *rq, struct task_struct *p) +static inline void check_preempt_curr(struct rq *rq, struct task_struct *p, int sync) { - rq->curr->sched_class->check_preempt_curr(rq, p); + rq->curr->sched_class->check_preempt_curr(rq, p, sync); } static inline int cpu_of(struct rq *rq) @@ -2282,7 +2282,7 @@ out_running: trace_mark(kernel_sched_wakeup, "pid %d state %ld ## rq %p task %p rq->curr %p", p->pid, p->state, rq, p, rq->curr); - check_preempt_curr(rq, p); + check_preempt_curr(rq, p, sync); p->state = TASK_RUNNING; #ifdef CONFIG_SMP @@ -2417,7 +2417,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) trace_mark(kernel_sched_wakeup_new, "pid %d state %ld ## rq %p task %p rq->curr %p", p->pid, p->state, rq, p, rq->curr); - check_preempt_curr(rq, p); + check_preempt_curr(rq, p, 0); #ifdef CONFIG_SMP if (p->sched_class->task_wake_up) p->sched_class->task_wake_up(rq, p); @@ -2877,7 +2877,7 @@ static void pull_task(struct rq *src_rq, struct task_struct *p, * Note that idle threads have a prio of MAX_PRIO, for this test * to be always true for them. */ - check_preempt_curr(this_rq, p); + check_preempt_curr(this_rq, p, 0); } /* @@ -6007,7 +6007,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) set_task_cpu(p, dest_cpu); if (on_rq) { activate_task(rq_dest, p, 0); - check_preempt_curr(rq_dest, p); + check_preempt_curr(rq_dest, p, 0); } done: ret = 1; diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index a10ac0bcee64..7328383690f1 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1331,7 +1331,7 @@ static inline int depth_se(struct sched_entity *se) /* * Preempt the current task with a newly woken task if needed: */ -static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) +static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync) { struct task_struct *curr = rq->curr; struct cfs_rq *cfs_rq = task_cfs_rq(curr); @@ -1367,6 +1367,13 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) if (!sched_feat(WAKEUP_PREEMPT)) return; + if (sched_feat(WAKEUP_OVERLAP) && sync && + se->avg_overlap < sysctl_sched_migration_cost && + pse->avg_overlap < sysctl_sched_migration_cost) { + resched_task(curr); + return; + } + /* * preemption test can be made between sibling entities who are in the * same cfs_rq i.e who have a common parent. Walk up the hierarchy of @@ -1649,7 +1656,7 @@ static void prio_changed_fair(struct rq *rq, struct task_struct *p, if (p->prio > oldprio) resched_task(rq->curr); } else - check_preempt_curr(rq, p); + check_preempt_curr(rq, p, 0); } /* @@ -1666,7 +1673,7 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p, if (running) resched_task(rq->curr); else - check_preempt_curr(rq, p); + check_preempt_curr(rq, p, 0); } /* Account for a task changing its policy or group. 
diff --git a/kernel/sched_features.h b/kernel/sched_features.h index 9353ca78154e..bf027a7accf8 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h @@ -11,3 +11,4 @@ SCHED_FEAT(ASYM_GRAN, 1) SCHED_FEAT(LB_BIAS, 1) SCHED_FEAT(LB_WAKEUP_UPDATE, 1) SCHED_FEAT(ASYM_EFF_LOAD, 1) +SCHED_FEAT(WAKEUP_OVERLAP, 1) diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c index 3a4f92dbbe66..dec4ccabe2f5 100644 --- a/kernel/sched_idletask.c +++ b/kernel/sched_idletask.c @@ -14,7 +14,7 @@ static int select_task_rq_idle(struct task_struct *p, int sync) /* * Idle tasks are unconditionally rescheduled: */ -static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p) +static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int sync) { resched_task(rq->idle); } @@ -76,7 +76,7 @@ static void switched_to_idle(struct rq *rq, struct task_struct *p, if (running) resched_task(rq->curr); else - check_preempt_curr(rq, p); + check_preempt_curr(rq, p, 0); } static void prio_changed_idle(struct rq *rq, struct task_struct *p, @@ -93,7 +93,7 @@ static void prio_changed_idle(struct rq *rq, struct task_struct *p, if (p->prio > oldprio) resched_task(rq->curr); } else - check_preempt_curr(rq, p); + check_preempt_curr(rq, p, 0); } /* diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 552310798dad..6d2d0a5d030b 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -783,7 +783,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) /* * Preempt the current task with a newly woken task if needed: */ -static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p) +static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int sync) { if (p->prio < rq->curr->prio) { resched_task(rq->curr); -- cgit v1.2.3 From f681bbd656b01439be904250a1581ca9c27505a1 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 22 Sep 2008 16:29:00 +0200 Subject: sched: turn off WAKEUP_OVERLAP WAKEUP_OVERLAP is not a winner on a 16way box, running psql+sysbench: .27-rc7-NO_WAKEUP_OVERLAP .27-rc7-WAKEUP_OVERLAP ------------------------------------------------- 1: 694 811 +14.39% 2: 1454 1427 -1.86% 4: 3017 3070 +1.70% 8: 5694 5808 +1.96% 16: 10592 10612 +0.19% 32: 9693 9647 -0.48% 64: 8507 8262 -2.97% 128: 8402 7087 -18.55% 256: 8419 5124 -64.30% 512: 7990 3671 -117.62% ------------------------------------------------- SUM: 64466 55524 -16.11% ... so turn it off by default. Signed-off-by: Ingo Molnar --- kernel/sched_features.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched_features.h b/kernel/sched_features.h index bf027a7accf8..7c9e8f4a049f 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h @@ -11,4 +11,4 @@ SCHED_FEAT(ASYM_GRAN, 1) SCHED_FEAT(LB_BIAS, 1) SCHED_FEAT(LB_WAKEUP_UPDATE, 1) SCHED_FEAT(ASYM_EFF_LOAD, 1) -SCHED_FEAT(WAKEUP_OVERLAP, 1) +SCHED_FEAT(WAKEUP_OVERLAP, 0) -- cgit v1.2.3 From caea8a03702c147e8ae90da0801e7ba8297b1d46 Mon Sep 17 00:00:00 2001 From: Chris Friesen Date: Mon, 22 Sep 2008 11:06:09 -0600 Subject: sched: fix list traversal to use _rcu variant load_balance_fair() calls rcu_read_lock() but then traverses the list using the regular list traversal routine. This patch converts the list traversal to use the _rcu version. 
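For reference, the _rcu list iterator pairs with an RCU read-side critical section like this (a generic sketch; the element type and the list are invented for the example):

#include <linux/list.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>

struct demo_item {
	struct list_head list;
	int value;
};

static LIST_HEAD(demo_list);

static int demo_sum(void)
{
	struct demo_item *it;
	int sum = 0;

	rcu_read_lock();
	/* _rcu variant: safe against concurrent list_add_rcu()/list_del_rcu() */
	list_for_each_entry_rcu(it, &demo_list, list)
		sum += it->value;
	rcu_read_unlock();

	return sum;
}

Updaters would use list_add_rcu()/list_del_rcu() and defer freeing with call_rcu() or synchronize_rcu().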
Signed-off-by: Chris Friesen Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 7328383690f1..3b89aa6594a9 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1521,7 +1521,7 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, rcu_read_lock(); update_h_load(busiest_cpu); - list_for_each_entry(tg, &task_groups, list) { + list_for_each_entry_rcu(tg, &task_groups, list) { struct cfs_rq *busiest_cfs_rq = tg->cfs_rq[busiest_cpu]; unsigned long busiest_h_load = busiest_cfs_rq->h_load; unsigned long busiest_weight = busiest_cfs_rq->load.weight; -- cgit v1.2.3 From fa748203175de7c08f2df80e5a0eeca40329b5e2 Mon Sep 17 00:00:00 2001 From: Rakib Mullick Date: Mon, 22 Sep 2008 14:55:45 -0700 Subject: sched: fix init_hrtick() section mismatch warning LD kernel/built-in.o WARNING: kernel/built-in.o(.text+0x326): Section mismatch in reference from the function init_hrtick() to the variable .cpuinit.data:hotplug_hrtick_nb.8 The function init_hrtick() references the variable __cpuinitdata hotplug_hrtick_nb.8. This is often because init_hrtick lacks a __cpuinitdata annotation or the annotation of hotplug_hrtick_nb.8 is wrong. Signed-off-by: Md.Rakib H. Mullick Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 98890807375b..13dd2db9fb2d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1087,7 +1087,7 @@ hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu) return NOTIFY_DONE; } -static void init_hrtick(void) +static __init void init_hrtick(void) { hotcpu_notifier(hotplug_hrtick, 0); } -- cgit v1.2.3 From 006c75f146e58e080d2b2725a6664f71886e112b Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 22 Sep 2008 14:55:46 -0700 Subject: sched: clarify ifdef tangle - Add some comments to try to make the ifdef puzzle a bit clearer - Explicitly inline one of the three init_hrtick() implementations. Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/sched.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index ad9d39b021f8..927c9307cd00 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1102,7 +1102,7 @@ static void hrtick_start(struct rq *rq, u64 delay) hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL); } -static void init_hrtick(void) +static inline void init_hrtick(void) { } #endif /* CONFIG_SMP */ @@ -1121,7 +1121,7 @@ static void init_rq_hrtick(struct rq *rq) rq->hrtick_timer.function = hrtick; rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; } -#else +#else /* CONFIG_SCHED_HRTICK */ static inline void hrtick_clear(struct rq *rq) { } @@ -1133,7 +1133,7 @@ static inline void init_rq_hrtick(struct rq *rq) static inline void init_hrtick(void) { } -#endif +#endif /* CONFIG_SCHED_HRTICK */ /* * resched_task - mark a task 'to be rescheduled now'. -- cgit v1.2.3 From 3a72dc8eb5a7122fff439a22bd22486a4fff505c Mon Sep 17 00:00:00 2001 From: Harvey Harrison Date: Mon, 22 Sep 2008 14:55:46 -0700 Subject: rcu: fix sparse shadowed variable warning kernel/rcuclassic.c:564:18: warning: symbol 'flags' shadows an earlier one kernel/rcuclassic.c:527:16: originally declared here Signed-off-by: Harvey Harrison Acked-by: Paul E. 
McKenney Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/rcuclassic.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c index 743cf0550ff4..ed15128ca2c9 100644 --- a/kernel/rcuclassic.c +++ b/kernel/rcuclassic.c @@ -561,15 +561,15 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp, local_irq_restore(flags); if (rcu_batch_after(rdp->batch, rcp->pending)) { - unsigned long flags; + unsigned long flags2; /* and start it/schedule start if it's a new batch */ - spin_lock_irqsave(&rcp->lock, flags); + spin_lock_irqsave(&rcp->lock, flags2); if (rcu_batch_after(rdp->batch, rcp->pending)) { rcp->pending = rdp->batch; rcu_start_batch(rcp); } - spin_unlock_irqrestore(&rcp->lock, flags); + spin_unlock_irqrestore(&rcp->lock, flags2); } } -- cgit v1.2.3 From 6441402b1f173fa38e561d3cee7c01c32e5281ad Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 22 Sep 2008 18:46:37 +0200 Subject: clockevents: prevent cpu online to interfere with nohz Impact: rare hang which can be triggered on CPU online. tick_do_timer_cpu keeps track of the CPU which updates jiffies via do_timer. The value -1 is used to signal, that currently no CPU is doing this. There are two cases, where the variable can have this state: boot: necessary for systems where the boot cpu id can be != 0 nohz long idle sleep: When the CPU which did the jiffies update last goes into a long idle sleep it drops the update jiffies duty so another CPU which is not idle can pick it up and keep jiffies going. Using the same value for both situations is wrong, as the CPU online code can see the -1 state when the timer of the newly onlined CPU is setup. The setup for a newly onlined CPU goes through periodic mode and can pick up the do_timer duty without being aware of the nohz / highres mode of the already running system. Use two separate states and make them constants to avoid magic numbers confusion. Signed-off-by: Thomas Gleixner --- kernel/time/tick-common.c | 7 ++++--- kernel/time/tick-internal.h | 4 ++++ kernel/time/tick-sched.c | 8 ++++---- 3 files changed, 12 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 019315ebf9de..b523d095decf 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -33,7 +33,7 @@ DEFINE_PER_CPU(struct tick_device, tick_cpu_device); */ ktime_t tick_next_period; ktime_t tick_period; -int tick_do_timer_cpu __read_mostly = -1; +int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT; DEFINE_SPINLOCK(tick_device_lock); /* @@ -148,7 +148,7 @@ static void tick_setup_device(struct tick_device *td, * If no cpu took the do_timer update, assign it to * this cpu: */ - if (tick_do_timer_cpu == -1) { + if (tick_do_timer_cpu == TICK_DO_TIMER_BOOT) { tick_do_timer_cpu = cpu; tick_next_period = ktime_get(); tick_period = ktime_set(0, NSEC_PER_SEC / HZ); @@ -300,7 +300,8 @@ static void tick_shutdown(unsigned int *cpup) if (*cpup == tick_do_timer_cpu) { int cpu = first_cpu(cpu_online_map); - tick_do_timer_cpu = (cpu != NR_CPUS) ? cpu : -1; + tick_do_timer_cpu = (cpu != NR_CPUS) ? 
cpu : + TICK_DO_TIMER_NONE; } spin_unlock_irqrestore(&tick_device_lock, flags); } diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index 6e9db9734aa6..e18014fadf95 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -1,6 +1,10 @@ /* * tick internal variable and functions used by low/high res code */ + +#define TICK_DO_TIMER_NONE -1 +#define TICK_DO_TIMER_BOOT -2 + DECLARE_PER_CPU(struct tick_device, tick_cpu_device); extern spinlock_t tick_device_lock; extern ktime_t tick_next_period; diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index a87b0468568b..31a14e8caac1 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -221,7 +221,7 @@ void tick_nohz_stop_sched_tick(int inidle) */ if (unlikely(!cpu_online(cpu))) { if (cpu == tick_do_timer_cpu) - tick_do_timer_cpu = -1; + tick_do_timer_cpu = TICK_DO_TIMER_NONE; } if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) @@ -303,7 +303,7 @@ void tick_nohz_stop_sched_tick(int inidle) * invoked. */ if (cpu == tick_do_timer_cpu) - tick_do_timer_cpu = -1; + tick_do_timer_cpu = TICK_DO_TIMER_NONE; ts->idle_sleeps++; @@ -468,7 +468,7 @@ static void tick_nohz_handler(struct clock_event_device *dev) * this duty, then the jiffies update is still serialized by * xtime_lock. */ - if (unlikely(tick_do_timer_cpu == -1)) + if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)) tick_do_timer_cpu = cpu; /* Check, if the jiffies need an update */ @@ -570,7 +570,7 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) * this duty, then the jiffies update is still serialized by * xtime_lock. */ - if (unlikely(tick_do_timer_cpu == -1)) + if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)) tick_do_timer_cpu = cpu; #endif -- cgit v1.2.3 From 49d670fb8dd62d3ed4e3ed2513538ea65b051aed Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 22 Sep 2008 18:56:01 +0200 Subject: clockevents: prevent stale tick_next_period for onlining CPUs Impact: possible hang on CPU onlining in timer one shot mode. The tick_next_period variable is only used during boot on nohz/highres enabled systems, but for CPU onlining it needs to be maintained when the per cpu clock events device operates in one shot mode. Signed-off-by: Thomas Gleixner --- kernel/time/tick-sched.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 31a14e8caac1..39019b3f7621 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -75,6 +75,9 @@ static void tick_do_update_jiffies64(ktime_t now) incr * ticks); } do_timer(++ticks); + + /* Keep the tick_next_period variable up to date */ + tick_next_period = ktime_add(last_jiffies_update, tick_period); } write_sequnlock(&xtime_lock); } -- cgit v1.2.3 From 302745699c1b675b5d2a1af87271de10e4d96b6a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 22 Sep 2008 19:02:25 +0200 Subject: clockevents: check broadcast device not tick device Impact: Possible hang on CPU online observed on AMD C1E machines. The broadcast setup code looks at the mode of the tick device to determine whether it needs to be shut down or setup. This is wrong when the broadcast mode is set to one shot already. This can happen when a CPU is brought online as it goes through the periodic setup first. The problem went unnoticed as sane systems do not call into that code before the switch to one shot for the clock event device happens. 
The AMD C1E idle routine switches over immediately and thereby shuts down the just setup device before the first interrupt happens. Signed-off-by: Thomas Gleixner --- kernel/time/tick-broadcast.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index f1f3eee28113..e2b66d1c8ca5 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -235,7 +235,7 @@ static void tick_do_broadcast_on_off(void *why) case CLOCK_EVT_NOTIFY_BROADCAST_FORCE: if (!cpu_isset(cpu, tick_broadcast_mask)) { cpu_set(cpu, tick_broadcast_mask); - if (td->mode == TICKDEV_MODE_PERIODIC) + if (bc->mode == TICKDEV_MODE_PERIODIC) clockevents_shutdown(dev); } if (*reason == CLOCK_EVT_NOTIFY_BROADCAST_FORCE) @@ -245,7 +245,7 @@ static void tick_do_broadcast_on_off(void *why) if (!tick_broadcast_force && cpu_isset(cpu, tick_broadcast_mask)) { cpu_clear(cpu, tick_broadcast_mask); - if (td->mode == TICKDEV_MODE_PERIODIC) + if (bc->mode == TICKDEV_MODE_PERIODIC) tick_setup_periodic(dev, 0); } break; -- cgit v1.2.3 From 27ce4cb4a0c7cf59b9a9952266883862f2e4c99f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 22 Sep 2008 19:04:02 +0200 Subject: clockevents: prevent mode mismatch on cpu online Impact: timer hang on CPU online observed on AMD C1E systems When a CPU is brought online then the broadcast machinery can be in the one shot state already. Check this and setup the timer device of the new CPU in one shot mode so the broadcast code can pick up the next_event value correctly. Another AMD C1E oddity, as we switch to broadcast immediately and not after the full bring up via the ACPI cpu idle code. Signed-off-by: Thomas Gleixner --- kernel/time/tick-broadcast.c | 8 ++++++++ kernel/time/tick-common.c | 3 ++- kernel/time/tick-internal.h | 2 ++ 3 files changed, 12 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index e2b66d1c8ca5..bd7034542399 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -575,4 +575,12 @@ void tick_shutdown_broadcast_oneshot(unsigned int *cpup) spin_unlock_irqrestore(&tick_broadcast_lock, flags); } +/* + * Check, whether the broadcast device is in one shot mode + */ +int tick_broadcast_oneshot_active(void) +{ + return tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT; +} + #endif diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index b523d095decf..df12434b43ca 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -109,7 +109,8 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast) if (!tick_device_is_functional(dev)) return; - if (dev->features & CLOCK_EVT_FEAT_PERIODIC) { + if ((dev->features & CLOCK_EVT_FEAT_PERIODIC) && + !tick_broadcast_oneshot_active()) { clockevents_set_mode(dev, CLOCK_EVT_MODE_PERIODIC); } else { unsigned long seq; diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index e18014fadf95..55c3f4be6077 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -35,6 +35,7 @@ extern void tick_broadcast_oneshot_control(unsigned long reason); extern void tick_broadcast_switch_to_oneshot(void); extern void tick_shutdown_broadcast_oneshot(unsigned int *cpup); extern int tick_resume_broadcast_oneshot(struct clock_event_device *bc); +extern int tick_broadcast_oneshot_active(void); # else /* BROADCAST */ static inline void tick_broadcast_setup_oneshot(struct 
clock_event_device *bc) { @@ -43,6 +44,7 @@ static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) static inline void tick_broadcast_oneshot_control(unsigned long reason) { } static inline void tick_broadcast_switch_to_oneshot(void) { } static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { } +static inline int tick_broadcast_oneshot_active(void) { return 0; } # endif /* !BROADCAST */ #else /* !ONESHOT */ -- cgit v1.2.3 From f8e256c687eb53850685747757c8d75e58756e15 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 23 Sep 2008 13:00:57 +0200 Subject: timers: fix build error in !oneshot case MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit kernel/time/tick-common.c: In function ‘tick_setup_periodic’: kernel/time/tick-common.c:113: error: implicit declaration of function ‘tick_broadcast_oneshot_active’ Signed-off-by: Ingo Molnar --- kernel/time/tick-internal.h | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index 55c3f4be6077..469248782c23 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -74,6 +74,7 @@ static inline int tick_resume_broadcast_oneshot(struct clock_event_device *bc) { return 0; } +static inline int tick_broadcast_oneshot_active(void) { return 0; } #endif /* !TICK_ONESHOT */ /* -- cgit v1.2.3 From 695698500912c4479ddf4723e492de3970ff8530 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 23 Sep 2008 14:54:23 +0200 Subject: sched: rework wakeup preemption Rework the wakeup preemption to work on real runtime instead of the virtual runtime. This greatly simplifies the code. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 133 ++-------------------------------------------------- 1 file changed, 4 insertions(+), 129 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 3b89aa6594a9..c20899763457 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -408,64 +408,6 @@ static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se) return __sched_period(nr_running); } -/* - * The goal of calc_delta_asym() is to be asymmetrically around NICE_0_LOAD, in - * that it favours >=0 over <0. - * - * -20 | - * | - * 0 --------+------- - * .' - * 19 .' - * - */ -static unsigned long -calc_delta_asym(unsigned long delta, struct sched_entity *se) -{ - struct load_weight lw = { - .weight = NICE_0_LOAD, - .inv_weight = 1UL << (WMULT_SHIFT-NICE_0_SHIFT) - }; - - for_each_sched_entity(se) { - struct load_weight *se_lw = &se->load; - unsigned long rw = cfs_rq_of(se)->load.weight; - -#ifdef CONFIG_FAIR_SCHED_GROUP - struct cfs_rq *cfs_rq = se->my_q; - struct task_group *tg = NULL - - if (cfs_rq) - tg = cfs_rq->tg; - - if (tg && tg->shares < NICE_0_LOAD) { - /* - * scale shares to what it would have been had - * tg->weight been NICE_0_LOAD: - * - * weight = 1024 * shares / tg->weight - */ - lw.weight *= se->load.weight; - lw.weight /= tg->shares; - - lw.inv_weight = 0; - - se_lw = &lw; - rw += lw.weight - se->load.weight; - } else -#endif - - if (se->load.weight < NICE_0_LOAD) { - se_lw = &lw; - rw += NICE_0_LOAD - se->load.weight; - } - - delta = calc_delta_mine(delta, rw, se_lw); - } - - return delta; -} - /* * Update the current task's runtime statistics. Skip current tasks that * are not in our scheduling class. @@ -1281,53 +1223,11 @@ static unsigned long wakeup_gran(struct sched_entity *se) * + nice tasks. 
*/ if (sched_feat(ASYM_GRAN)) - gran = calc_delta_asym(sysctl_sched_wakeup_granularity, se); - else - gran = calc_delta_fair(sysctl_sched_wakeup_granularity, se); + gran = calc_delta_mine(gran, NICE_0_LOAD, &se->load); return gran; } -/* - * Should 'se' preempt 'curr'. - * - * |s1 - * |s2 - * |s3 - * g - * |<--->|c - * - * w(c, s1) = -1 - * w(c, s2) = 0 - * w(c, s3) = 1 - * - */ -static int -wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se) -{ - s64 gran, vdiff = curr->vruntime - se->vruntime; - - if (vdiff < 0) - return -1; - - gran = wakeup_gran(curr); - if (vdiff > gran) - return 1; - - return 0; -} - -/* return depth at which a sched entity is present in the hierarchy */ -static inline int depth_se(struct sched_entity *se) -{ - int depth = 0; - - for_each_sched_entity(se) - depth++; - - return depth; -} - /* * Preempt the current task with a newly woken task if needed: */ @@ -1336,7 +1236,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync) struct task_struct *curr = rq->curr; struct cfs_rq *cfs_rq = task_cfs_rq(curr); struct sched_entity *se = &curr->se, *pse = &p->se; - int se_depth, pse_depth; + s64 delta_exec; if (unlikely(rt_prio(p->prio))) { update_rq_clock(rq); @@ -1374,33 +1274,8 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync) return; } - /* - * preemption test can be made between sibling entities who are in the - * same cfs_rq i.e who have a common parent. Walk up the hierarchy of - * both tasks until we find their ancestors who are siblings of common - * parent. - */ - - /* First walk up until both entities are at same depth */ - se_depth = depth_se(se); - pse_depth = depth_se(pse); - - while (se_depth > pse_depth) { - se_depth--; - se = parent_entity(se); - } - - while (pse_depth > se_depth) { - pse_depth--; - pse = parent_entity(pse); - } - - while (!is_same_group(se, pse)) { - se = parent_entity(se); - pse = parent_entity(pse); - } - - if (wakeup_preempt_entity(se, pse) == 1) + delta_exec = se->sum_exec_runtime - se->prev_sum_exec_runtime; + if (delta_exec > wakeup_gran(pse)) resched_task(curr); } -- cgit v1.2.3 From 940959e93949e839c14f8ddc3b9b0e34a2ab6e29 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 23 Sep 2008 15:33:42 +0200 Subject: sched: fixlet for group load balance We should not only correct the increment for the initial group, but should be consistent and do so for all the groups we encounter. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index c20899763457..0c59da7e3120 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1027,7 +1027,6 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) { struct sched_entity *se = tg->se[cpu]; - long more_w; if (!tg->parent) return wl; @@ -1039,18 +1038,17 @@ static long effective_load(struct task_group *tg, int cpu, if (!wl && sched_feat(ASYM_EFF_LOAD)) return wl; - /* - * Instead of using this increment, also add the difference - * between when the shares were last updated and now. - */ - more_w = se->my_q->load.weight - se->my_q->rq_weight; - wl += more_w; - wg += more_w; - for_each_sched_entity(se) { -#define D(n) (likely(n) ? (n) : 1) - long S, rw, s, a, b; + long more_w; + + /* + * Instead of using this increment, also add the difference + * between when the shares were last updated and now. 
+ */ + more_w = se->my_q->load.weight - se->my_q->rq_weight; + wl += more_w; + wg += more_w; S = se->my_q->tg->shares; s = se->my_q->shares; @@ -1059,7 +1057,11 @@ static long effective_load(struct task_group *tg, int cpu, a = S*(rw + wl); b = S*rw + s*wg; - wl = s*(a-b)/D(b); + wl = s*(a-b); + + if (likely(b)) + wl /= b; + /* * Assume the group is already running and will * thus already be accounted for in the weight. @@ -1068,7 +1070,6 @@ static long effective_load(struct task_group *tg, int cpu, * alter the group weight. */ wg = 0; -#undef D } return wl; -- cgit v1.2.3 From 78333cdd0e472180743d35988e576d6ecc6f6ddb Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 23 Sep 2008 15:33:43 +0200 Subject: sched: add some comments to the bandwidth code Hopefully clarify some of this code a little. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched_rt.c | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) (limited to 'kernel') diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 2e228bd5395e..d570a8cc4fcd 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -231,6 +231,9 @@ static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq) #endif /* CONFIG_RT_GROUP_SCHED */ #ifdef CONFIG_SMP +/* + * We ran out of runtime, see if we can borrow some from our neighbours. + */ static int do_balance_runtime(struct rt_rq *rt_rq) { struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); @@ -250,9 +253,18 @@ static int do_balance_runtime(struct rt_rq *rt_rq) continue; spin_lock(&iter->rt_runtime_lock); + /* + * Either all rqs have inf runtime and there's nothing to steal + * or __disable_runtime() below sets a specific rq to inf to + * indicate its been disabled and disalow stealing. + */ if (iter->rt_runtime == RUNTIME_INF) goto next; + /* + * From runqueues with spare time, take 1/n part of their + * spare time, but no more than our period. + */ diff = iter->rt_runtime - iter->rt_time; if (diff > 0) { diff = div_u64((u64)diff, weight); @@ -274,6 +286,9 @@ next: return more; } +/* + * Ensure this RQ takes back all the runtime it lend to its neighbours. + */ static void __disable_runtime(struct rq *rq) { struct root_domain *rd = rq->rd; @@ -289,17 +304,33 @@ static void __disable_runtime(struct rq *rq) spin_lock(&rt_b->rt_runtime_lock); spin_lock(&rt_rq->rt_runtime_lock); + /* + * Either we're all inf and nobody needs to borrow, or we're + * already disabled and thus have nothing to do, or we have + * exactly the right amount of runtime to take out. + */ if (rt_rq->rt_runtime == RUNTIME_INF || rt_rq->rt_runtime == rt_b->rt_runtime) goto balanced; spin_unlock(&rt_rq->rt_runtime_lock); + /* + * Calculate the difference between what we started out with + * and what we current have, that's the amount of runtime + * we lend and now have to reclaim. + */ want = rt_b->rt_runtime - rt_rq->rt_runtime; + /* + * Greedy reclaim, take back as much as we can. + */ for_each_cpu_mask(i, rd->span) { struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i); s64 diff; + /* + * Can't reclaim from ourselves or disabled runqueues. + */ if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF) continue; @@ -319,8 +350,16 @@ static void __disable_runtime(struct rq *rq) } spin_lock(&rt_rq->rt_runtime_lock); + /* + * We cannot be left wanting - that would mean some runtime + * leaked out of the system. + */ BUG_ON(want); balanced: + /* + * Disable all the borrow logic by pretending we have inf + * runtime - in which case borrowing doesn't make sense. 
+ */ rt_rq->rt_runtime = RUNTIME_INF; spin_unlock(&rt_rq->rt_runtime_lock); spin_unlock(&rt_b->rt_runtime_lock); @@ -343,6 +382,9 @@ static void __enable_runtime(struct rq *rq) if (unlikely(!scheduler_running)) return; + /* + * Reset each runqueue's bandwidth settings + */ for_each_leaf_rt_rq(rt_rq, rq) { struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); -- cgit v1.2.3 From 4653f803e6e0d970ffeac0efd2c01743eb6c5228 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 23 Sep 2008 15:33:44 +0200 Subject: sched: more sanity checks on the bandwidth settings While playing around with it, I noticed we missed some sanity checks. Also add some comments while we're there. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 33 ++++++++++++++++++++++++++++----- 1 file changed, 28 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 669c49aa57f0..e1299de1765e 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8866,11 +8866,29 @@ static int tg_schedulable(struct task_group *tg, void *data) runtime = d->rt_runtime; } + /* + * Cannot have more runtime than the period. + */ + if (runtime > period && runtime != RUNTIME_INF) + return -EINVAL; + + /* + * Ensure we don't starve existing RT tasks. + */ if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg)) return -EBUSY; total = to_ratio(period, runtime); + /* + * Nobody can have more than the global setting allows. + */ + if (total > to_ratio(global_rt_period(), global_rt_runtime())) + return -EINVAL; + + /* + * The sum of our children's runtime should not exceed our own. + */ list_for_each_entry_rcu(child, &tg->children, siblings) { period = ktime_to_ns(child->rt_bandwidth.rt_period); runtime = child->rt_bandwidth.rt_runtime; @@ -8978,19 +8996,24 @@ long sched_group_rt_period(struct task_group *tg) static int sched_rt_global_constraints(void) { - struct task_group *tg = &root_task_group; - u64 rt_runtime, rt_period; + u64 runtime, period; int ret = 0; if (sysctl_sched_rt_period <= 0) return -EINVAL; - rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period); - rt_runtime = tg->rt_bandwidth.rt_runtime; + runtime = global_rt_runtime(); + period = global_rt_period(); + + /* + * Sanity check on the sysctl variables. + */ + if (runtime > period && runtime != RUNTIME_INF) + return -EINVAL; mutex_lock(&rt_constraints_mutex); read_lock(&tasklist_lock); - ret = __rt_schedulable(tg, rt_period, rt_runtime); + ret = __rt_schedulable(NULL, 0, 0); read_unlock(&tasklist_lock); mutex_unlock(&rt_constraints_mutex); -- cgit v1.2.3 From 57fdc26d4a734a3e00c6b2fc0e1e40ff8da4dc31 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 23 Sep 2008 15:33:45 +0200 Subject: sched: fixup buddy selection We should set the buddy even though we might already have the TIF_RESCHED flag set. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 0c59da7e3120..e3f3c10f7033 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1249,6 +1249,8 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync) if (unlikely(se == pse)) return; + cfs_rq_of(pse)->next = pse; + /* * We can come here with TIF_NEED_RESCHED already set from new task * wake up path. 
@@ -1256,8 +1258,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync) if (test_tsk_need_resched(curr)) return; - cfs_rq_of(pse)->next = pse; - /* * Batch tasks do not preempt (their preemption is driven by * the tick): -- cgit v1.2.3 From f9092f358bc2ec5367621478811f046f82873376 Mon Sep 17 00:00:00 2001 From: Jonathan Steel Date: Mon, 22 Sep 2008 13:57:45 -0700 Subject: kexec: fix segmentation fault in kimage_add_entry A segmentation fault can occur in kimage_add_entry in kexec.c when loading a kernel image into memory. The fault occurs because a page is requested by calling kimage_alloc_page with gfp_mask GFP_KERNEL and the function may actually return a page with gfp_mask GFP_HIGHUSER. The high mem page is returned because it was swapped with the kernel page due to the kernel page being a page that will shortly be copied to. This patch ensures that kimage_alloc_page returns a page that was created with the correct gfp flags. I have verified the change and fixed the whitespace damage of the original patch. Jonathan did a great job of tracking this down after he hit the problem. -- Eric Signed-off-by: Jonathan Steel Signed-off-by: Eric W. Biederman Acked-by: Simon Horman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index 59f3f0df35d4..aef265325cd3 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -753,8 +753,14 @@ static struct page *kimage_alloc_page(struct kimage *image, *old = addr | (*old & ~PAGE_MASK); /* The old page I have found cannot be a - * destination page, so return it. + * destination page, so return it if it's + * gfp_flags honor the ones passed in. */ + if (!(gfp_mask & __GFP_HIGHMEM) && + PageHighMem(old_page)) { + kimage_free_pages(old_page); + continue; + } addr = old_addr; page = old_page; break; -- cgit v1.2.3 From b87f17242da6b2ac6db2d179b2f93fb84cff2fbe Mon Sep 17 00:00:00 2001 From: Bharata B Rao Date: Thu, 25 Sep 2008 09:53:54 +0530 Subject: sched: maintain only task entities in cfs_rq->tasks list cfs_rq->tasks list is used by the load balancer to iterate over all the tasks. Currently it holds all the entities (both task and group entities) because of which there is a need to check for group entities explicitly during load balancing. This patch changes the cfs_rq->tasks list to hold only task entities. 
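A quick illustration of why restricting the list to task entities simplifies the load-balance iterator: when a list may contain entries the walker does not care about, every step needs a skip loop; when only the interesting entries are ever linked, the walk collapses to a plain next-pointer step. The user-space sketch below is only an illustration with invented types and names, not the scheduler code.

/* Illustration only: a mixed list forces a skip loop, a homogeneous
 * list does not.  Types and names are invented for this example. */
#include <stdio.h>
#include <stddef.h>

struct entity {
    const char *name;
    int is_task;                 /* 0 for "group" entities */
    struct entity *next;
};

/* Old style: the list may contain group entities, so skip them. */
static struct entity *next_task_mixed(struct entity *e)
{
    while (e && !e->is_task)
        e = e->next;
    return e;
}

int main(void)
{
    struct entity g  = { "group", 0, NULL };
    struct entity t2 = { "task2", 1, NULL };
    struct entity t1 = { "task1", 1, NULL };

    /* Mixed list: task1 -> group -> task2 */
    t1.next = &g; g.next = &t2;
    for (struct entity *e = next_task_mixed(&t1); e;
         e = next_task_mixed(e->next))
        printf("mixed walk:       %s\n", e->name);

    /* Homogeneous list: only tasks are linked, no skipping needed. */
    t1.next = &t2; t2.next = NULL;
    for (struct entity *e = &t1; e; e = e->next)
        printf("homogeneous walk: %s\n", e->name);

    return 0;
}

The do/while removed from __load_balance_iterator() in the diff below is exactly the skip loop of the first walk.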
Signed-off-by: Bharata B Rao Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 26 +++++++++----------------- 1 file changed, 9 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index e3f3c10f7033..95c1295ad26d 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -528,11 +528,12 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) update_load_add(&cfs_rq->load, se->load.weight); if (!parent_entity(se)) inc_cpu_load(rq_of(cfs_rq), se->load.weight); - if (entity_is_task(se)) + if (entity_is_task(se)) { add_cfs_task_weight(cfs_rq, se->load.weight); + list_add(&se->group_node, &cfs_rq->tasks); + } cfs_rq->nr_running++; se->on_rq = 1; - list_add(&se->group_node, &cfs_rq->tasks); } static void @@ -541,11 +542,12 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) update_load_sub(&cfs_rq->load, se->load.weight); if (!parent_entity(se)) dec_cpu_load(rq_of(cfs_rq), se->load.weight); - if (entity_is_task(se)) + if (entity_is_task(se)) { add_cfs_task_weight(cfs_rq, -se->load.weight); + list_del_init(&se->group_node); + } cfs_rq->nr_running--; se->on_rq = 0; - list_del_init(&se->group_node); } static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) @@ -1335,19 +1337,9 @@ __load_balance_iterator(struct cfs_rq *cfs_rq, struct list_head *next) if (next == &cfs_rq->tasks) return NULL; - /* Skip over entities that are not tasks */ - do { - se = list_entry(next, struct sched_entity, group_node); - next = next->next; - } while (next != &cfs_rq->tasks && !entity_is_task(se)); - - if (next == &cfs_rq->tasks && !entity_is_task(se)) - return NULL; - - cfs_rq->balance_iterator = next; - - if (entity_is_task(se)) - p = task_of(se); + se = list_entry(next, struct sched_entity, group_node); + p = task_of(se); + cfs_rq->balance_iterator = next->next; return p; } -- cgit v1.2.3 From 18d6522b86d21a04c8ac1ea79747e2e434a956d9 Mon Sep 17 00:00:00 2001 From: Atsuo Igarashi Date: Fri, 26 Sep 2008 10:36:41 -0500 Subject: kgdb: could not write to the last of valid memory with kgdb On the ARM architecture, kgdb will crash the kernel if the last byte of valid memory is written due to a flush_icache_range flushing beyond the memory boundary. Signed-off-by: Atsuo Igarashi Signed-off-by: Jason Wessel --- kernel/kgdb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kgdb.c b/kernel/kgdb.c index eaa21fc9ad1d..949806ab67de 100644 --- a/kernel/kgdb.c +++ b/kernel/kgdb.c @@ -488,7 +488,7 @@ static int write_mem_msg(int binary) if (err) return err; if (CACHE_FLUSH_IS_SAFE) - flush_icache_range(addr, addr + length + 1); + flush_icache_range(addr, addr + length); return 0; } -- cgit v1.2.3 From d7161a65341556bacb5e6654e133803f46f51063 Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Fri, 26 Sep 2008 10:36:41 -0500 Subject: kgdb, x86, arm, mips, powerpc: ignore user space single stepping On the x86 arch, user space single step exceptions should be ignored if they occur in the kernel space, such as ptrace stepping through a system call. First check if it is kgdb that is executing a single step, then ensure it is not an accidental traversal into the user space, while in kgdb, any other time the TIF_SINGLESTEP is set, kgdb should ignore the exception. On x86, arm, mips and powerpc, the kgdb_contthread usage was inconsistent with the way single stepping is implemented in the kgdb core. 
The arch specific stub should always set the kgdb_cpu_doing_single_step correctly if it is single stepping. This allows kgdb to correctly process an instruction steps if ptrace happens to be requesting an instruction step over a system call. Signed-off-by: Jason Wessel --- kernel/kgdb.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/kgdb.c b/kernel/kgdb.c index 949806ab67de..25d955dbb989 100644 --- a/kernel/kgdb.c +++ b/kernel/kgdb.c @@ -1462,7 +1462,7 @@ acquirelock: * Get the passive CPU lock which will hold all the non-primary * CPU in a spin state while the debugger is active */ - if (!kgdb_single_step || !kgdb_contthread) { + if (!kgdb_single_step) { for (i = 0; i < NR_CPUS; i++) atomic_set(&passive_cpu_wait[i], 1); } @@ -1475,7 +1475,7 @@ acquirelock: #ifdef CONFIG_SMP /* Signal the other CPUs to enter kgdb_wait() */ - if ((!kgdb_single_step || !kgdb_contthread) && kgdb_do_roundup) + if ((!kgdb_single_step) && kgdb_do_roundup) kgdb_roundup_cpus(flags); #endif @@ -1494,7 +1494,7 @@ acquirelock: kgdb_post_primary_code(ks->linux_regs, ks->ex_vector, ks->err_code); kgdb_deactivate_sw_breakpoints(); kgdb_single_step = 0; - kgdb_contthread = NULL; + kgdb_contthread = current; exception_level = 0; /* Talk to debugger with gdbserial protocol */ @@ -1508,7 +1508,7 @@ acquirelock: kgdb_info[ks->cpu].task = NULL; atomic_set(&cpu_in_kgdb[ks->cpu], 0); - if (!kgdb_single_step || !kgdb_contthread) { + if (!kgdb_single_step) { for (i = NR_CPUS-1; i >= 0; i--) atomic_set(&passive_cpu_wait[i], 0); /* -- cgit v1.2.3 From 7659e349672bb0d378ef8d7d62bae4c53d2bdd18 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 29 Sep 2008 14:06:45 +0200 Subject: hrtimer: migrate pending list on cpu offline Impact: hrtimers which are on the pending list are not migrated at cpu offline and can be stale forever Add the pending list migration when CONFIG_HIGH_RES_TIMERS is enabled Signed-off-by: Thomas Gleixner --- kernel/hrtimer.c | 34 +++++++++++++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index b8e4dce80a74..580bc66ae136 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -1610,10 +1610,36 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, } } +#ifdef CONFIG_HIGH_RES_TIMERS +static int migrate_hrtimer_pending(struct hrtimer_cpu_base *old_base, + struct hrtimer_cpu_base *new_base) +{ + struct hrtimer *timer; + int raise = 0; + + while (!list_empty(&old_base->cb_pending)) { + timer = list_entry(old_base->cb_pending.next, + struct hrtimer, cb_entry); + + __remove_hrtimer(timer, timer->base, HRTIMER_STATE_PENDING, 0); + timer->base = &new_base->clock_base[timer->base->index]; + list_add_tail(&timer->cb_entry, &new_base->cb_pending); + raise = 1; + } + return raise; +} +#else +static int migrate_hrtimer_pending(struct hrtimer_cpu_base *old_base, + struct hrtimer_cpu_base *new_base) +{ + return 0; +} +#endif + static void migrate_hrtimers(int cpu) { struct hrtimer_cpu_base *old_base, *new_base; - int i; + int i, raise = 0; BUG_ON(cpu_online(cpu)); old_base = &per_cpu(hrtimer_bases, cpu); @@ -1630,10 +1656,16 @@ static void migrate_hrtimers(int cpu) &new_base->clock_base[i]); } + if (migrate_hrtimer_pending(old_base, new_base)) + raise = 1; + spin_unlock(&old_base->lock); spin_unlock(&new_base->lock); local_irq_enable(); put_cpu_var(hrtimer_bases); + + if (raise) + hrtimer_raise_softirq(); } #endif /* CONFIG_HOTPLUG_CPU */ -- cgit v1.2.3 
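The detail worth noting in the hrtimer migration patch above is that migrate_hrtimer_pending() reports back whether anything was actually moved, so migrate_hrtimers() raises the softirq only when the destination CPU really gained pending work. A stripped-down user-space sketch of that shape follows; the names and the single-linked queues are invented for the example and merely stand in for the kernel's list_head machinery.

/* User-space illustration: drain one queue into another and tell the
 * caller whether the destination now has work.  Not hrtimer code. */
#include <stdio.h>

struct item {
    int id;
    struct item *next;
};

struct queue {
    struct item *head;
};

/* Move every entry from @src to @dst; return 1 if anything moved. */
static int migrate_pending(struct queue *dst, struct queue *src)
{
    int moved = 0;

    while (src->head) {
        struct item *it = src->head;

        src->head = it->next;        /* unlink from the dead queue */
        it->next = dst->head;        /* relink on the live queue   */
        dst->head = it;
        moved = 1;
    }
    return moved;
}

int main(void)
{
    struct item a = { 1, NULL }, b = { 2, &a };
    struct queue dead = { &b }, live = { NULL };

    if (migrate_pending(&live, &dead))
        printf("work moved: kick the consumer (raise the softirq)\n");

    for (struct item *it = live.head; it; it = it->next)
        printf("pending item %d now on the live queue\n", it->id);
    return 0;
}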
From 41e1022eae71707f1ce6801a746f70b1e57b7567 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 29 Sep 2008 14:09:39 +0200 Subject: hrtimer: fix migration of CB_IRQSAFE_NO_SOFTIRQ hrtimers Impact: Stale timers after a CPU went offline. commit 37bb6cb4097e29ffee970065b74499cbf10603a3 hrtimer: unlock hrtimer_wakeup changed the hrtimer sleeper callback mode to CB_IRQSAFE_NO_SOFTIRQ due to locking problems. A result of this change is that when enqueue is called for an already expired hrtimer the callback function is not longer called directly from the enqueue code. The normal callers have been fixed in the code, but the migration code which moves hrtimers from a dead CPU to a live CPU was not made aware of this. This can be fixed by checking the timer state after the call to enqueue in the migration code. Signed-off-by: Thomas Gleixner --- kernel/hrtimer.c | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 580bc66ae136..ac2f6d6d4868 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -1591,11 +1591,12 @@ static void __cpuinit init_hrtimers_cpu(int cpu) #ifdef CONFIG_HOTPLUG_CPU -static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, +static int migrate_hrtimer_list(struct hrtimer_clock_base *old_base, struct hrtimer_clock_base *new_base) { struct hrtimer *timer; struct rb_node *node; + int raise = 0; while ((node = rb_first(&old_base->active))) { timer = rb_entry(node, struct hrtimer, node); @@ -1607,7 +1608,27 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, * Enqueue the timer. Allow reprogramming of the event device */ enqueue_hrtimer(timer, new_base, 1); + +#ifdef CONFIG_HIGH_RES_TIMERS + /* + * Happens with high res enabled when the timer was + * already expired and the callback mode is + * HRTIMER_CB_IRQSAFE_NO_SOFTIRQ + * (hrtimer_sleeper). The enqueue code does not move + * them to the soft irq pending list for + * performance/latency reasons, but in the migration + * state, we need to do that otherwise we end up with + * a stale timer. + */ + if (timer->state == HRTIMER_STATE_INACTIVE) { + timer->state = HRTIMER_STATE_PENDING; + list_add_tail(&timer->cb_entry, + &new_base->cpu_base->cb_pending); + raise = 1; + } +#endif } + return raise; } #ifdef CONFIG_HIGH_RES_TIMERS @@ -1652,8 +1673,9 @@ static void migrate_hrtimers(int cpu) spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { - migrate_hrtimer_list(&old_base->clock_base[i], - &new_base->clock_base[i]); + if (migrate_hrtimer_list(&old_base->clock_base[i], + &new_base->clock_base[i])) + raise = 1; } if (migrate_hrtimer_pending(old_base, new_base)) -- cgit v1.2.3 From b00c1a99e7758f794923c61e5cd55268d61c9469 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 29 Sep 2008 15:44:46 +0200 Subject: hrtimer: mark migration state Impact: during migration active hrtimers can be seen as inactive The migration code removes the hrtimers from the queues of the dead CPU and sets the state temporary to INACTIVE. The enqueue code sets it to ACTIVE/PENDING again. Prevent that the wrong state can be seen by using a separate migration state bit. 
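Conceptually, the fix is to give a timer that is in flight between CPUs its own transient state instead of temporarily reusing INACTIVE, so a concurrent reader never concludes that the timer has vanished. The sketch below models that idea with an invented enum and helper names; the real code uses hrtimer state bits, and this is only an illustration.

/* Conceptual sketch: a dedicated "being migrated" state keeps a
 * concurrent observer from mistaking the object for inactive.
 * Names and states are invented for the example. */
#include <stdio.h>

enum state { ST_INACTIVE, ST_PENDING, ST_ACTIVE, ST_MIGRATE };

struct object {
    enum state state;
};

/* Observers must treat "migrating" as still live. */
static int object_is_active(const struct object *o)
{
    return o->state != ST_INACTIVE;
}

static void begin_migration(struct object *o)
{
    /* Wrong: setting ST_INACTIVE here would make the object look
     * dead while it is merely changing queues. */
    o->state = ST_MIGRATE;
}

static void finish_migration(struct object *o)
{
    o->state = ST_PENDING;       /* re-queued on the new CPU */
}

int main(void)
{
    struct object o = { ST_ACTIVE };

    begin_migration(&o);
    printf("during migration, active? %s\n",
           object_is_active(&o) ? "yes" : "no (bug!)");
    finish_migration(&o);
    printf("after migration, state = %d\n", o.state);
    return 0;
}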
Signed-off-by: Thomas Gleixner --- kernel/hrtimer.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index ac2f6d6d4868..ace723dd1e52 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -1602,7 +1602,13 @@ static int migrate_hrtimer_list(struct hrtimer_clock_base *old_base, timer = rb_entry(node, struct hrtimer, node); BUG_ON(hrtimer_callback_running(timer)); debug_hrtimer_deactivate(timer); - __remove_hrtimer(timer, old_base, HRTIMER_STATE_INACTIVE, 0); + + /* + * Mark it as STATE_MIGRATE not INACTIVE otherwise the + * timer could be seen as !active and just vanish away + * under us on another CPU + */ + __remove_hrtimer(timer, old_base, HRTIMER_STATE_MIGRATE, 0); timer->base = new_base; /* * Enqueue the timer. Allow reprogramming of the event device @@ -1620,13 +1626,15 @@ static int migrate_hrtimer_list(struct hrtimer_clock_base *old_base, * state, we need to do that otherwise we end up with * a stale timer. */ - if (timer->state == HRTIMER_STATE_INACTIVE) { + if (timer->state == HRTIMER_STATE_MIGRATE) { timer->state = HRTIMER_STATE_PENDING; list_add_tail(&timer->cb_entry, &new_base->cpu_base->cb_pending); raise = 1; } #endif + /* Clear the migration state bit */ + timer->state &= ~HRTIMER_STATE_MIGRATE; } return raise; } -- cgit v1.2.3 From ccc7dadf736639da86f3e0c86832c11a66fc8221 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 29 Sep 2008 15:47:42 +0200 Subject: hrtimer: prevent migration of per CPU hrtimers Impact: per CPU hrtimers can be migrated from a dead CPU The hrtimer code has no knowledge about per CPU timers, but we need to prevent the migration of such timers and warn when such a timer is active at migration time. Explicitely mark the timers as per CPU and use a more understandable mode descriptor for the interrupts safe unlocked callback mode, which is used by hrtimer_sleeper and the scheduler code. Signed-off-by: Thomas Gleixner --- kernel/hrtimer.c | 37 +++++++++++++++++++++++++------------ kernel/sched.c | 4 ++-- kernel/time/tick-sched.c | 2 +- kernel/trace/trace_sysprof.c | 2 +- 4 files changed, 29 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index ace723dd1e52..cdec83e722fa 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -672,13 +672,14 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, */ BUG_ON(timer->function(timer) != HRTIMER_NORESTART); return 1; - case HRTIMER_CB_IRQSAFE_NO_SOFTIRQ: + case HRTIMER_CB_IRQSAFE_PERCPU: + case HRTIMER_CB_IRQSAFE_UNLOCKED: /* * This is solely for the sched tick emulation with * dynamic tick support to ensure that we do not * restart the tick right on the edge and end up with * the tick timer in the softirq ! The calling site - * takes care of this. + * takes care of this. Also used for hrtimer sleeper ! */ debug_hrtimer_deactivate(timer); return 1; @@ -1245,7 +1246,8 @@ static void __run_hrtimer(struct hrtimer *timer) timer_stats_account_hrtimer(timer); fn = timer->function; - if (timer->cb_mode == HRTIMER_CB_IRQSAFE_NO_SOFTIRQ) { + if (timer->cb_mode == HRTIMER_CB_IRQSAFE_PERCPU || + timer->cb_mode == HRTIMER_CB_IRQSAFE_UNLOCKED) { /* * Used for scheduler timers, avoid lock inversion with * rq->lock and tasklist_lock. 
@@ -1452,7 +1454,7 @@ void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task) sl->timer.function = hrtimer_wakeup; sl->task = task; #ifdef CONFIG_HIGH_RES_TIMERS - sl->timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; + sl->timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED; #endif } @@ -1592,7 +1594,7 @@ static void __cpuinit init_hrtimers_cpu(int cpu) #ifdef CONFIG_HOTPLUG_CPU static int migrate_hrtimer_list(struct hrtimer_clock_base *old_base, - struct hrtimer_clock_base *new_base) + struct hrtimer_clock_base *new_base, int dcpu) { struct hrtimer *timer; struct rb_node *node; @@ -1603,6 +1605,18 @@ static int migrate_hrtimer_list(struct hrtimer_clock_base *old_base, BUG_ON(hrtimer_callback_running(timer)); debug_hrtimer_deactivate(timer); + /* + * Should not happen. Per CPU timers should be + * canceled _before_ the migration code is called + */ + if (timer->cb_mode == HRTIMER_CB_IRQSAFE_PERCPU) { + __remove_hrtimer(timer, old_base, + HRTIMER_STATE_INACTIVE, 0); + WARN(1, "hrtimer (%p %p)active but cpu %d dead\n", + timer, timer->function, dcpu); + continue; + } + /* * Mark it as STATE_MIGRATE not INACTIVE otherwise the * timer could be seen as !active and just vanish away @@ -1619,12 +1633,11 @@ static int migrate_hrtimer_list(struct hrtimer_clock_base *old_base, /* * Happens with high res enabled when the timer was * already expired and the callback mode is - * HRTIMER_CB_IRQSAFE_NO_SOFTIRQ - * (hrtimer_sleeper). The enqueue code does not move - * them to the soft irq pending list for - * performance/latency reasons, but in the migration - * state, we need to do that otherwise we end up with - * a stale timer. + * HRTIMER_CB_IRQSAFE_UNLOCKED (hrtimer_sleeper). The + * enqueue code does not move them to the soft irq + * pending list for performance/latency reasons, but + * in the migration state, we need to do that + * otherwise we end up with a stale timer. 
*/ if (timer->state == HRTIMER_STATE_MIGRATE) { timer->state = HRTIMER_STATE_PENDING; @@ -1682,7 +1695,7 @@ static void migrate_hrtimers(int cpu) for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { if (migrate_hrtimer_list(&old_base->clock_base[i], - &new_base->clock_base[i])) + &new_base->clock_base[i], cpu)) raise = 1; } diff --git a/kernel/sched.c b/kernel/sched.c index 13dd2db9fb2d..ad1962dc0aa2 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -201,7 +201,7 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime) hrtimer_init(&rt_b->rt_period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); rt_b->rt_period_timer.function = sched_rt_period_timer; - rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; + rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED; } static void start_rt_bandwidth(struct rt_bandwidth *rt_b) @@ -1119,7 +1119,7 @@ static void init_rq_hrtick(struct rq *rq) hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); rq->hrtick_timer.function = hrtick; - rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; + rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU; } #else static inline void hrtick_clear(struct rq *rq) diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 39019b3f7621..cb02324bdb88 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -625,7 +625,7 @@ void tick_setup_sched_timer(void) */ hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); ts->sched_timer.function = tick_sched_timer; - ts->sched_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; + ts->sched_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU; /* Get the next period (per cpu) */ ts->sched_timer.expires = tick_init_jiffy_update(); diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c index bb948e52ce20..db58fb66a135 100644 --- a/kernel/trace/trace_sysprof.c +++ b/kernel/trace/trace_sysprof.c @@ -202,7 +202,7 @@ static void start_stack_timer(int cpu) hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); hrtimer->function = stack_trace_timer_fn; - hrtimer->cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; + hrtimer->cb_mode = HRTIMER_CB_IRQSAFE_PERCPU; hrtimer_start(hrtimer, ns_to_ktime(sample_period), HRTIMER_MODE_REL); } -- cgit v1.2.3 From 31a78f23bac0069004e69f98808b6988baccb6b6 Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Sun, 28 Sep 2008 23:09:31 +0100 Subject: mm owner: fix race between swapoff and exit There's a race between mm->owner assignment and swapoff, more easily seen when task slab poisoning is turned on. The condition occurs when try_to_unuse() runs in parallel with an exiting task. A similar race can occur with callers of get_task_mm(), such as /proc// or ptrace or page migration. CPU0 CPU1 try_to_unuse looks at mm = task0->mm increments mm->mm_users task 0 exits mm->owner needs to be updated, but no new owner is found (mm_users > 1, but no other task has task->mm = task0->mm) mm_update_next_owner() leaves mmput(mm) decrements mm->mm_users task0 freed dereferencing mm->owner fails The fix is to notify the subsystem via mm_owner_changed callback(), if no new owner is found, by specifying the new task as NULL. Jiri Slaby: mm->owner was set to NULL prior to calling cgroup_mm_owner_callbacks(), but must be set after that, so as not to pass NULL as old owner causing oops. Daisuke Nishimura: mm_update_next_owner() may set mm->owner to NULL, but mem_cgroup_from_task() and its callers need to take account of this situation to avoid oops. 
Hugh Dickins: Lockdep warning and hang below exec_mmap() when testing these patches. exit_mm() up_reads mmap_sem before calling mm_update_next_owner(), so exec_mmap() now needs to do the same. And with that repositioning, there's now no point in mm_need_new_owner() allowing for NULL mm. Reported-by: Hugh Dickins Signed-off-by: Balbir Singh Signed-off-by: Jiri Slaby Signed-off-by: Daisuke Nishimura Signed-off-by: Hugh Dickins Cc: KAMEZAWA Hiroyuki Cc: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 5 +++-- kernel/exit.c | 12 ++++++++++-- 2 files changed, 13 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 13932abde159..a0123d75ec9a 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2738,14 +2738,15 @@ void cgroup_fork_callbacks(struct task_struct *child) */ void cgroup_mm_owner_callbacks(struct task_struct *old, struct task_struct *new) { - struct cgroup *oldcgrp, *newcgrp; + struct cgroup *oldcgrp, *newcgrp = NULL; if (need_mm_owner_callback) { int i; for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; oldcgrp = task_cgroup(old, ss->subsys_id); - newcgrp = task_cgroup(new, ss->subsys_id); + if (new) + newcgrp = task_cgroup(new, ss->subsys_id); if (oldcgrp == newcgrp) continue; if (ss->mm_owner_changed) diff --git a/kernel/exit.c b/kernel/exit.c index 16395644a98f..85a83c831856 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -583,8 +583,6 @@ mm_need_new_owner(struct mm_struct *mm, struct task_struct *p) * If there are other users of the mm and the owner (us) is exiting * we need to find a new owner to take on the responsibility. */ - if (!mm) - return 0; if (atomic_read(&mm->mm_users) <= 1) return 0; if (mm->owner != p) @@ -627,6 +625,16 @@ retry: } while_each_thread(g, c); read_unlock(&tasklist_lock); + /* + * We found no owner yet mm_users > 1: this implies that we are + * most likely racing with swapoff (try_to_unuse()) or /proc or + * ptrace or page migration (get_task_mm()). Mark owner as NULL, + * so that subsystems can understand the callback and take action. + */ + down_write(&mm->mmap_sem); + cgroup_mm_owner_callbacks(mm->owner, NULL); + mm->owner = NULL; + up_write(&mm->mmap_sem); return; assign_new_owner: -- cgit v1.2.3 From 64b9e0294d24a4204232e13e01630b0690e48d61 Mon Sep 17 00:00:00 2001 From: "Amit K. Arora" Date: Tue, 30 Sep 2008 17:15:39 +0530 Subject: sched: minor optimizations in wake_affine and select_task_rq_fair This patch does following: o Removes unused variable and argument "rq". o Optimizes one of the "if" conditions in wake_affine() - i.e. if "balanced" is true, we need not do rest of the calculations in the condition. o If this cpu is same as the previous cpu (on which woken up task was running when it went to sleep), no need to call wake_affine at all. 
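To make the second point above concrete: because C's || operator short-circuits, testing the already-computed balanced flag first means the heavier load comparison is evaluated only when it can still change the outcome. The following stand-alone sketch uses invented stand-in functions rather than the scheduler's real load calculations.

/* Illustration of short-circuit ordering.  expensive_check() is a
 * stand-in for the target_load()/tl_per_task comparison. */
#include <stdio.h>

static int calls;

static int expensive_check(void)
{
    calls++;                     /* count how often we pay the cost */
    return 0;
}

static int should_pull(int balanced)
{
    /* old order:  expensive_check() || balanced  -> always pays     */
    /* new order:  balanced || expensive_check()  -> pays only when  */
    /*             balanced is false                                 */
    return balanced || expensive_check();
}

int main(void)
{
    should_pull(1);
    printf("expensive_check() calls with balanced=1: %d\n", calls);
    should_pull(0);
    printf("expensive_check() calls after balanced=0: %d\n", calls);
    return 0;
}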
Signed-off-by: Amit K Arora Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 95c1295ad26d..fcbe850a5a90 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1088,7 +1088,7 @@ static inline unsigned long effective_load(struct task_group *tg, int cpu, #endif static int -wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq, +wake_affine(struct sched_domain *this_sd, struct rq *this_rq, struct task_struct *p, int prev_cpu, int this_cpu, int sync, int idx, unsigned long load, unsigned long this_load, unsigned int imbalance) @@ -1136,8 +1136,8 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq, schedstat_inc(p, se.nr_wakeups_affine_attempts); tl_per_task = cpu_avg_load_per_task(this_cpu); - if ((tl <= load && tl + target_load(prev_cpu, idx) <= tl_per_task) || - balanced) { + if (balanced || (tl <= load && tl + target_load(prev_cpu, idx) <= + tl_per_task)) { /* * This domain has SD_WAKE_AFFINE and * p is cache cold in this domain, and @@ -1156,16 +1156,17 @@ static int select_task_rq_fair(struct task_struct *p, int sync) struct sched_domain *sd, *this_sd = NULL; int prev_cpu, this_cpu, new_cpu; unsigned long load, this_load; - struct rq *rq, *this_rq; + struct rq *this_rq; unsigned int imbalance; int idx; prev_cpu = task_cpu(p); - rq = task_rq(p); this_cpu = smp_processor_id(); this_rq = cpu_rq(this_cpu); new_cpu = prev_cpu; + if (prev_cpu == this_cpu) + goto out; /* * 'this_sd' is the first domain that both * this_cpu and prev_cpu are present in: @@ -1193,13 +1194,10 @@ static int select_task_rq_fair(struct task_struct *p, int sync) load = source_load(prev_cpu, idx); this_load = target_load(this_cpu, idx); - if (wake_affine(rq, this_sd, this_rq, p, prev_cpu, this_cpu, sync, idx, + if (wake_affine(this_sd, this_rq, p, prev_cpu, this_cpu, sync, idx, load, this_load, imbalance)) return this_cpu; - if (prev_cpu == this_cpu) - goto out; - /* * Start passive balancing when half the imbalance_pct * limit is reached. -- cgit v1.2.3 From aa94fbd5ccd840c8ab26d02439ec799b03a72547 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 2 Oct 2008 14:50:14 -0700 Subject: fix error-path NULL deref in alloc_posix_timer() Found by static checker (http://repo.or.cz/w/smatch.git). Signed-off-by: Dan Carpenter Acked-by: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/posix-timers.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index e36d5798cbff..5131e5471169 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -441,7 +441,7 @@ static struct k_itimer * alloc_posix_timer(void) return tmr; if (unlikely(!(tmr->sigq = sigqueue_alloc()))) { kmem_cache_free(posix_timers_cache, tmr); - tmr = NULL; + return NULL; } memset(&tmr->sigq->info, 0, sizeof(siginfo_t)); return tmr; -- cgit v1.2.3 From 2133b5d7ff531bc15a923db4a6a50bf96c561be9 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 2 Oct 2008 16:06:39 -0700 Subject: rcu: RCU-based detection of stalled CPUs for Classic RCU This patch adds stalled-CPU detection to Classic RCU. This capability is enabled by a new config variable CONFIG_RCU_CPU_STALL_DETECTOR, which defaults disabled. 
This is a debugging feature to detect infinite loops in kernel code, not something that non-kernel-hackers would be expected to care about. This feature can detect looping CPUs in !PREEMPT builds and looping CPUs with preemption disabled in PREEMPT builds. This is essentially a port of this functionality from the treercu patch, replacing the stall debug patch that is already in tip/core/rcu (commit 67182ae1c4). The changes from the patch in tip/core/rcu include making the config variable name match that in treercu, changing from seconds to jiffies to avoid spurious warnings, and printing a boot message when this feature is enabled. Signed-off-by: Paul E. McKenney Signed-off-by: Ingo Molnar --- kernel/rcuclassic.c | 166 +++++++++++++++++++++++++++------------------------- 1 file changed, 86 insertions(+), 80 deletions(-) (limited to 'kernel') diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c index ed15128ca2c9..0d07e6e51578 100644 --- a/kernel/rcuclassic.c +++ b/kernel/rcuclassic.c @@ -164,6 +164,87 @@ static void __call_rcu(struct rcu_head *head, struct rcu_ctrlblk *rcp, } } +#ifdef CONFIG_RCU_CPU_STALL_DETECTOR + +static void record_gp_stall_check_time(struct rcu_ctrlblk *rcp) +{ + rcp->gp_start = jiffies; + rcp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_CHECK; +} + +static void print_other_cpu_stall(struct rcu_ctrlblk *rcp) +{ + int cpu; + long delta; + unsigned long flags; + + /* Only let one CPU complain about others per time interval. */ + + spin_lock_irqsave(&rcp->lock, flags); + delta = jiffies - rcp->jiffies_stall; + if (delta < 2 || rcp->cur != rcp->completed) { + spin_unlock_irqrestore(&rcp->lock, flags); + return; + } + rcp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK; + spin_unlock_irqrestore(&rcp->lock, flags); + + /* OK, time to rat on our buddy... */ + + printk(KERN_ERR "RCU detected CPU stalls:"); + for_each_possible_cpu(cpu) { + if (cpu_isset(cpu, rcp->cpumask)) + printk(" %d", cpu); + } + printk(" (detected by %d, t=%ld jiffies)\n", + smp_processor_id(), (long)(jiffies - rcp->gp_start)); +} + +static void print_cpu_stall(struct rcu_ctrlblk *rcp) +{ + unsigned long flags; + + printk(KERN_ERR "RCU detected CPU %d stall (t=%lu/%lu jiffies)\n", + smp_processor_id(), jiffies, + jiffies - rcp->gp_start); + dump_stack(); + spin_lock_irqsave(&rcp->lock, flags); + if ((long)(jiffies - rcp->jiffies_stall) >= 0) + rcp->jiffies_stall = + jiffies + RCU_SECONDS_TILL_STALL_RECHECK; + spin_unlock_irqrestore(&rcp->lock, flags); + set_need_resched(); /* kick ourselves to get things going. */ +} + +static void check_cpu_stall(struct rcu_ctrlblk *rcp) +{ + long delta; + + delta = jiffies - rcp->jiffies_stall; + if (cpu_isset(smp_processor_id(), rcp->cpumask) && delta >= 0) { + + /* We haven't checked in, so go dump stack. */ + print_cpu_stall(rcp); + + } else if (rcp->cur != rcp->completed && delta >= 2) { + + /* They had two seconds to dump stack, so complain. */ + print_other_cpu_stall(rcp); + } +} + +#else /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ + +static void record_gp_stall_check_time(struct rcu_ctrlblk *rcp) +{ +} + +static void check_cpu_stall(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) +{ +} + +#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ + /** * call_rcu - Queue an RCU callback for invocation after a grace period. * @head: structure to be used for queueing the RCU updates. @@ -293,84 +374,6 @@ static void rcu_do_batch(struct rcu_data *rdp) * period (if necessary). 
*/ -#ifdef CONFIG_DEBUG_RCU_STALL - -static inline void record_gp_check_time(struct rcu_ctrlblk *rcp) -{ - rcp->gp_check = get_seconds() + 3; -} - -static void print_other_cpu_stall(struct rcu_ctrlblk *rcp) -{ - int cpu; - long delta; - unsigned long flags; - - /* Only let one CPU complain about others per time interval. */ - - spin_lock_irqsave(&rcp->lock, flags); - delta = get_seconds() - rcp->gp_check; - if (delta < 2L || cpus_empty(rcp->cpumask)) { - spin_unlock(&rcp->lock); - return; - } - rcp->gp_check = get_seconds() + 30; - spin_unlock_irqrestore(&rcp->lock, flags); - - /* OK, time to rat on our buddy... */ - - printk(KERN_ERR "RCU detected CPU stalls:"); - for_each_cpu_mask(cpu, rcp->cpumask) - printk(" %d", cpu); - printk(" (detected by %d, t=%lu/%lu)\n", - smp_processor_id(), get_seconds(), rcp->gp_check); -} - -static void print_cpu_stall(struct rcu_ctrlblk *rcp) -{ - unsigned long flags; - - printk(KERN_ERR "RCU detected CPU %d stall (t=%lu/%lu)\n", - smp_processor_id(), get_seconds(), rcp->gp_check); - dump_stack(); - spin_lock_irqsave(&rcp->lock, flags); - if ((long)(get_seconds() - rcp->gp_check) >= 0L) - rcp->gp_check = get_seconds() + 30; - spin_unlock_irqrestore(&rcp->lock, flags); -} - -static void check_cpu_stall(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) -{ - long delta; - - delta = get_seconds() - rcp->gp_check; - if (cpu_isset(smp_processor_id(), rcp->cpumask) && delta >= 0L) { - - /* We haven't checked in, so go dump stack. */ - - print_cpu_stall(rcp); - - } else { - if (!cpus_empty(rcp->cpumask) && delta >= 2L) { - /* They had two seconds to dump stack, so complain. */ - print_other_cpu_stall(rcp); - } - } -} - -#else /* #ifdef CONFIG_DEBUG_RCU_STALL */ - -static inline void record_gp_check_time(struct rcu_ctrlblk *rcp) -{ -} - -static inline void -check_cpu_stall(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) -{ -} - -#endif /* #else #ifdef CONFIG_DEBUG_RCU_STALL */ - /* * Register a new batch of callbacks, and start it up if there is currently no * active batch and the batch to be registered has not already occurred. @@ -381,7 +384,7 @@ static void rcu_start_batch(struct rcu_ctrlblk *rcp) if (rcp->cur != rcp->pending && rcp->completed == rcp->cur) { rcp->cur++; - record_gp_check_time(rcp); + record_gp_stall_check_time(rcp); /* * Accessing nohz_cpu_mask before incrementing rcp->cur needs a @@ -603,7 +606,7 @@ static void rcu_process_callbacks(struct softirq_action *unused) static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) { /* Check for CPU stalls, if enabled. 
*/ - check_cpu_stall(rcp, rdp); + check_cpu_stall(rcp); if (rdp->nxtlist) { long completed_snap = ACCESS_ONCE(rcp->completed); @@ -769,6 +772,9 @@ static struct notifier_block __cpuinitdata rcu_nb = { */ void __init __rcu_init(void) { +#ifdef CONFIG_RCU_CPU_STALL_DETECTOR + printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n"); +#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, (void *)(long)smp_processor_id()); /* Register notifier for non-boot CPUs */ -- cgit v1.2.3 From 2ec2b482b10a1ed3493c224f1893cddd3d33833b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 3 Oct 2008 10:41:00 +0200 Subject: rcu: RCU-based detection of stalled CPUs for Classic RCU, fix fix the !CONFIG_RCU_CPU_STALL_DETECTOR path: kernel/rcuclassic.c: In function '__rcu_pending': kernel/rcuclassic.c:609: error: too few arguments to function 'check_cpu_stall' Signed-off-by: Ingo Molnar --- kernel/rcuclassic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c index 0d07e6e51578..37f72e551542 100644 --- a/kernel/rcuclassic.c +++ b/kernel/rcuclassic.c @@ -239,7 +239,7 @@ static void record_gp_stall_check_time(struct rcu_ctrlblk *rcp) { } -static void check_cpu_stall(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) +static inline void check_cpu_stall(struct rcu_ctrlblk *rcp) { } -- cgit v1.2.3 From d294eb83d8d39a29f01dad391f15fc3a29aa04f9 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 3 Oct 2008 12:10:10 +0200 Subject: cpusets: scan_for_empty_cpusets(), cpuset doesn't seem to be so const MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This fixes a warning on latest -tip: kernel/cpuset.c: Dans la fonction «scan_for_empty_cpusets» : kernel/cpuset.c:1932: attention : passing argument 1 of «list_add_tail» discards qualifiers from pointer target type Actually the struct cpuset *root passed in parameter to scan_for_empty_cpusets is not supposed to be const since an entry is added on the tail of its list. Just correct the qualifier. Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar --- kernel/cpuset.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 827cd9adccb2..eab7bd6628e0 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1921,7 +1921,7 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs) * that has tasks along with an empty 'mems'. But if we did see such * a cpuset, we'd handle it just like we do if its 'cpus' was empty. */ -static void scan_for_empty_cpusets(const struct cpuset *root) +static void scan_for_empty_cpusets(struct cpuset *root) { LIST_HEAD(queue); struct cpuset *cp; /* scans cpusets being updated */ -- cgit v1.2.3 From 07454bfff151d2465ada809bbaddf3548cc1097c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 4 Oct 2008 10:51:07 +0200 Subject: clockevents: check broadcast tick device not the clock events device Impact: jiffies increment too fast. Hugh Dickins noted that with NOHZ=n and HIGHRES=n jiffies get incremented too fast. The reason is a wrong check in the broadcast enter/exit code, which keeps the local apic timer in periodic mode when the switch happens. 
Signed-off-by: Thomas Gleixner --- kernel/time/tick-broadcast.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index bd7034542399..cb01cd8f919b 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -235,7 +235,8 @@ static void tick_do_broadcast_on_off(void *why) case CLOCK_EVT_NOTIFY_BROADCAST_FORCE: if (!cpu_isset(cpu, tick_broadcast_mask)) { cpu_set(cpu, tick_broadcast_mask); - if (bc->mode == TICKDEV_MODE_PERIODIC) + if (tick_broadcast_device.mode == + TICKDEV_MODE_PERIODIC) clockevents_shutdown(dev); } if (*reason == CLOCK_EVT_NOTIFY_BROADCAST_FORCE) @@ -245,7 +246,8 @@ static void tick_do_broadcast_on_off(void *why) if (!tick_broadcast_force && cpu_isset(cpu, tick_broadcast_mask)) { cpu_clear(cpu, tick_broadcast_mask); - if (bc->mode == TICKDEV_MODE_PERIODIC) + if (tick_broadcast_device.mode == + TICKDEV_MODE_PERIODIC) tick_setup_periodic(dev, 0); } break; -- cgit v1.2.3 From f6121f4f8708195e88cbdf8dd8d171b226b3f858 Mon Sep 17 00:00:00 2001 From: Dario Faggioli Date: Fri, 3 Oct 2008 17:40:46 +0200 Subject: sched_rt.c: resch needed in rt_rq_enqueue() for the root rt_rq While working on the new version of the code for SCHED_SPORADIC I noticed something strange in the present throttling mechanism. More specifically in the throttling timer handler in sched_rt.c (do_sched_rt_period_timer()) and in rt_rq_enqueue(). The problem is that, when unthrottling a runqueue, rt_rq_enqueue() only asks for rescheduling if the runqueue has a sched_entity associated to it (i.e., rt_rq->rt_se != NULL). Now, if the runqueue is the root rq (which has a rt_se = NULL) rescheduling does not take place, and it is delayed to some undefined instant in the future. This imply some random bandwidth usage by the RT tasks under throttling. For instance, setting rt_runtime_us/rt_period_us = 950ms/1000ms an RT task will get less than 95%. In our tests we got something varying between 70% to 95%. Using smaller time values, e.g., 95ms/100ms, things are even worse, and I can see values also going down to 20-25%!! The tests we performed are simply running 'yes' as a SCHED_FIFO task, and checking the CPU usage with top, but we can investigate thoroughly if you think it is needed. Things go much better, for us, with the attached patch... Don't know if it is the best approach, but it solved the issue for us. 
Signed-off-by: Dario Faggioli Signed-off-by: Michael Trimarchi Acked-by: Peter Zijlstra Cc: Signed-off-by: Ingo Molnar --- kernel/sched_rt.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index d570a8cc4fcd..cdf5740ab03e 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -102,12 +102,12 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se); static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) { + struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr; struct sched_rt_entity *rt_se = rt_rq->rt_se; - if (rt_se && !on_rt_rq(rt_se) && rt_rq->rt_nr_running) { - struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr; - - enqueue_rt_entity(rt_se); + if (rt_rq->rt_nr_running) { + if (rt_se && !on_rt_rq(rt_se)) + enqueue_rt_entity(rt_se); if (rt_rq->highest_prio < curr->prio) resched_task(curr); } -- cgit v1.2.3 From 34b3ede2353604ec9861c1d900b2a835ff85de47 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 6 Oct 2008 09:27:00 +0800 Subject: sched: remove redundant code in cpu_cgroup_create() css will be initialized by cgroup core. Signed-off-by: Li Zefan Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 2caedc47e764..9715f4ce6cfe 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -9088,7 +9088,6 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp) if (!cgrp->parent) { /* This is early initialization for the top cgroup */ - init_task_group.css.cgroup = cgrp; return &init_task_group.css; } @@ -9097,9 +9096,6 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp) if (IS_ERR(tg)) return ERR_PTR(-ENOMEM); - /* Bind the cgroup to task_group object we just created */ - tg->css.cgroup = cgrp; - return &tg->css; } -- cgit v1.2.3 From cc1e0f4f7ad95a9eb81e1904cb16068af226180d Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Mon, 6 Oct 2008 13:50:59 -0500 Subject: kgdb: call touch_softlockup_watchdog on resume The softlockup watchdog needs to be touched when resuming the from the kgdb stopped state to avoid the printk that a CPU is stuck if the debugger was active for longer than the softlockup threshold. 
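The reasoning behind the fix is easiest to see with a toy model of a soft-lockup watchdog: it compares the current time against the last time the CPU checked in, so after an intentional long stop in the debugger the timestamp has to be refreshed before normal execution resumes, or the next check fires a false positive. The sketch below is a user-space illustration only; the names and the two-second threshold are invented, and it is not the kernel's softlockup code.

/* Toy watchdog: warns if the CPU has not "checked in" recently. */
#include <stdio.h>
#include <time.h>
#include <unistd.h>

#define THRESHOLD_SECS 2

static time_t last_touch;

static void watchdog_touch(void)
{
    last_touch = time(NULL);
}

static void watchdog_check(const char *when)
{
    if (time(NULL) - last_touch > THRESHOLD_SECS)
        printf("%s: BUG: soft lockup detected (false positive here)\n", when);
    else
        printf("%s: watchdog happy\n", when);
}

int main(void)
{
    watchdog_touch();
    sleep(3);                    /* stand-in for time spent in the debugger */

    watchdog_check("without touch");   /* complains about the long pause */
    watchdog_touch();                  /* what the patch adds on resume  */
    watchdog_check("after touch");
    return 0;
}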
Signed-off-by: Jason Wessel --- kernel/kgdb.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/kgdb.c b/kernel/kgdb.c index 25d955dbb989..e4dcfb2272a4 100644 --- a/kernel/kgdb.c +++ b/kernel/kgdb.c @@ -590,6 +590,7 @@ static void kgdb_wait(struct pt_regs *regs) /* Signal the primary CPU that we are done: */ atomic_set(&cpu_in_kgdb[cpu], 0); + touch_softlockup_watchdog(); clocksource_touch_watchdog(); local_irq_restore(flags); } @@ -1432,6 +1433,7 @@ acquirelock: atomic_read(&kgdb_cpu_doing_single_step) != cpu) { atomic_set(&kgdb_active, -1); + touch_softlockup_watchdog(); clocksource_touch_watchdog(); local_irq_restore(flags); @@ -1524,6 +1526,7 @@ acquirelock: kgdb_restore: /* Free kgdb_active */ atomic_set(&kgdb_active, -1); + touch_softlockup_watchdog(); clocksource_touch_watchdog(); local_irq_restore(flags); -- cgit v1.2.3 From 2fb7635c4cea310992a39580133099dd99ad151c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 8 Oct 2008 09:16:04 +0200 Subject: sched: sync wakeups vs avg_overlap While looking at the code I wondered why we always do: sync && avg_overlap < migration_cost Which is a bit odd, since the overlap test was meant to detect sync wakeups so using it to specialize sync wakeups doesn't make much sense. Hence change the code to do: sync || avg_overlap < migration_cost Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index fcbe850a5a90..18fd17172eb6 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1103,6 +1103,11 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq, if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS)) return 0; + if (!sync && sched_feat(SYNC_WAKEUPS) && + curr->se.avg_overlap < sysctl_sched_migration_cost && + p->se.avg_overlap < sysctl_sched_migration_cost) + sync = 1; + /* * If sync wakeup then subtract the (maximum possible) * effect of the currently running task from the load @@ -1127,11 +1132,8 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq, * a reasonable amount of time then attract this newly * woken task: */ - if (sync && balanced) { - if (curr->se.avg_overlap < sysctl_sched_migration_cost && - p->se.avg_overlap < sysctl_sched_migration_cost) - return 1; - } + if (sync && balanced) + return 1; schedstat_inc(p, se.nr_wakeups_affine_attempts); tl_per_task = cpu_avg_load_per_task(this_cpu); @@ -1268,9 +1270,9 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync) if (!sched_feat(WAKEUP_PREEMPT)) return; - if (sched_feat(WAKEUP_OVERLAP) && sync && - se->avg_overlap < sysctl_sched_migration_cost && - pse->avg_overlap < sysctl_sched_migration_cost) { + if (sched_feat(WAKEUP_OVERLAP) && (sync || + (se->avg_overlap < sysctl_sched_migration_cost && + pse->avg_overlap < sysctl_sched_migration_cost))) { resched_task(curr); return; } -- cgit v1.2.3 From a5d8c3483a6e19aca95ef6a2c5890e33bfa5b293 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 9 Oct 2008 11:35:51 +0200 Subject: sched debug: add name to sched_domain sysctl entries add /proc/sys/kernel/sched_domain/cpu0/domain0/name, to make it easier to see which specific scheduler domain remained at that entry. Since we process the scheduler domain tree and simplify it, it's not always immediately clear during debugging which domain came from where. depends on CONFIG_SCHED_DEBUG=y. 
Signed-off-by: Ingo Molnar --- kernel/sched.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 9715f4ce6cfe..6f230596bd0c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6351,7 +6351,7 @@ set_table_entry(struct ctl_table *entry, static struct ctl_table * sd_alloc_ctl_domain_table(struct sched_domain *sd) { - struct ctl_table *table = sd_alloc_ctl_entry(12); + struct ctl_table *table = sd_alloc_ctl_entry(13); if (table == NULL) return NULL; @@ -6379,7 +6379,9 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd) sizeof(int), 0644, proc_dointvec_minmax); set_table_entry(&table[10], "flags", &sd->flags, sizeof(int), 0644, proc_dointvec_minmax); - /* &table[11] is terminator */ + set_table_entry(&table[11], "name", sd->name, + CORENAME_MAX_SIZE, 0444, proc_dostring); + /* &table[12] is terminator */ return table; } @@ -7263,13 +7265,21 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) * Non-inlined to reduce accumulated stack pressure in build_sched_domains() */ +#ifdef CONFIG_SCHED_DEBUG +# define SD_INIT_NAME(sd, type) sd->name = #type +#else +# define SD_INIT_NAME(sd, type) do { } while (0) +#endif + #define SD_INIT(sd, type) sd_init_##type(sd) + #define SD_INIT_FUNC(type) \ static noinline void sd_init_##type(struct sched_domain *sd) \ { \ memset(sd, 0, sizeof(*sd)); \ *sd = SD_##type##_INIT; \ sd->level = SD_LV_##type; \ + SD_INIT_NAME(sd, type); \ } SD_INIT_FUNC(CPU) -- cgit v1.2.3 From 8083e4ad970e4eb567e31037060cdd4ba346f0c0 Mon Sep 17 00:00:00 2001 From: "venkatesh.pallipadi@intel.com" Date: Mon, 4 Aug 2008 11:59:11 -0700 Subject: [CPUFREQ][5/6] cpufreq: Changes to get_cpu_idle_time_us(), used by ondemand governor export get_cpu_idle_time_us() for it to be used in ondemand governor. Last update time can be current time when the CPU is currently non-idle, accounting for the busy time since last idle. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Dave Jones --- kernel/time/tick-sched.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index cb02324bdb88..a4d219398167 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -20,6 +20,7 @@ #include #include #include +#include #include @@ -190,9 +191,17 @@ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time) { struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); - *last_update_time = ktime_to_us(ts->idle_lastupdate); + if (!tick_nohz_enabled) + return -1; + + if (ts->idle_active) + *last_update_time = ktime_to_us(ts->idle_lastupdate); + else + *last_update_time = ktime_to_us(ktime_get()); + return ktime_to_us(ts->idle_sleeptime); } +EXPORT_SYMBOL_GPL(get_cpu_idle_time_us); /** * tick_nohz_stop_sched_tick - stop the idle tick from the idle task -- cgit v1.2.3
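For context on how a governor consumes this interface: it samples cumulative idle time and wall-clock time twice and derives the busy fraction of the interval from the two deltas. The sketch below shows only the shape of that calculation in user space; the sample numbers and helper names are invented, and in the kernel the idle figure would come from get_cpu_idle_time_us() with the wall figure taken from its last_update_time argument.

/* User-space sketch of the busy-fraction calculation a governor
 * performs between two samples.  All values here are made up. */
#include <stdio.h>

struct sample {
    unsigned long long idle_us;   /* cumulative idle time       */
    unsigned long long wall_us;   /* cumulative wall-clock time */
};

/* Busy percentage over the interval between two samples. */
static unsigned int busy_percent(struct sample prev, struct sample cur)
{
    unsigned long long wall = cur.wall_us - prev.wall_us;
    unsigned long long idle = cur.idle_us - prev.idle_us;

    if (wall == 0 || idle > wall)
        return 0;                /* no interval or skewed clocks: treat as idle */
    return (unsigned int)((wall - idle) * 100 / wall);
}

int main(void)
{
    struct sample prev = { 1000000, 2000000 };   /* made-up numbers        */
    struct sample cur  = { 1200000, 3000000 };   /* 200ms idle out of 1s   */

    printf("cpu was ~%u%% busy over the interval\n",
           busy_percent(prev, cur));
    return 0;
}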