Fix longstanding load balancing bug in the scheduler

The scheduler will stop load balancing if the most busy processor contains processes pinned via processor affinity. The scheduler currently only does one search for busiest cpu. If it cannot pull any tasks away from the busiest cpu because they were pinned then the scheduler goes into a corner and sulks leaving the idle processors idle. F.e. If you have processor 0 busy running four tasks pinned via taskset, there are none on processor 1 and one just started two processes on processor 2 then the scheduler will not move one of the two processes away from processor 2. This patch fixes that issue by forcing the scheduler to come out of its corner and retrying the load balancing by considering other processors for load balancing. This patch was originally developed by John Hawkes and discussed at http://marc.theaimsgroup.com/?l=linux-kernel&m=113901368523205&w=2. I have removed extraneous material and gone back to equipping struct rq with the cpu the queue is associated with since this makes the patch much easier and it is likely that others in the future will have the same difficulty of figuring out which processor owns which runqueue. The overhead added through these patches is a single word on the stack if the kernel is configured to support 32 cpus or less (32 bit). For 32 bit environments the maximum number of cpus that can be configued is 255 which would result in the use of 32 bytes additional on the stack. On IA64 up to 1k cpus can be configured which will result in the use of 128 additional bytes on the stack. The maximum additional cache footprint is one cacheline. Typically memory use will be much less than a cacheline and the additional cpumask will be placed on the stack in a cacheline that already contains other local variable. Signed-off-by: Christoph Lameter <clameter@sgi.com> Signed-off-by: Adrian Bunk <bunk@stusta.de>
author: Christoph Lameter <clameter@sgi.com> 2006-11-11 12:13:53 +0100
committer: Adrian Bunk <bunk@stusta.de> 2006-11-11 12:13:53 +0100
commit: ef147950bee58ecfa672433badf679608fb90668 (patch)
tree: 499fda5911ff48ac9ef93d0fddfe1d20eaa796c9
parent: f9a198cc962f1347564493d406789b79cab4294b (diff)
1 files changed, 31 insertions, 7 deletions
diff --git a/kernel/sched.c b/kernel/sched.c
index 4e7efac7b1ec..d1707c5f2880 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1924,7 +1924,8 @@ out:
  */
 static struct sched_group *
 find_busiest_group(struct sched_domain *sd, int this_cpu,
-		   unsigned long *imbalance, enum idle_type idle, int *sd_idle)
+		   unsigned long *imbalance, enum idle_type idle, int *sd_idle,
+		   cpumask_t *cpus)
 {
 	struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
 	unsigned long max_load, avg_load, total_load, this_load, total_pwr;
@@ -1950,6 +1951,9 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 		avg_load = 0;
 
 		for_each_cpu_mask(i, group->cpumask) {
+			if (!cpu_isset(i, *cpus))
+				continue;
+
 			if (*sd_idle && !idle_cpu(i))
 				*sd_idle = 0;
 
@@ -2063,13 +2067,16 @@ out_balanced:
  * find_busiest_queue - find the busiest runqueue among the cpus in group.
  */
 static runqueue_t *find_busiest_queue(struct sched_group *group,
-	enum idle_type idle)
+	enum idle_type idle, cpumask_t *cpus)
 {
 	unsigned long load, max_load = 0;
 	runqueue_t *busiest = NULL;
 	int i;
 
 	for_each_cpu_mask(i, group->cpumask) {
+		if (!cpu_isset(i, *cpus))
+			continue;
+
 		load = source_load(i, 0);
 
 		if (load > max_load) {
@@ -2102,19 +2109,22 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
 	int nr_moved, all_pinned = 0;
 	int active_balance = 0;
 	int sd_idle = 0;
+	cpumask_t cpus = CPU_MASK_ALL;
 
 	if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER)
 		sd_idle = 1;
 
 	schedstat_inc(sd, lb_cnt[idle]);
 
-	group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle);
+redo:
+	group = find_busiest_group(sd, this_cpu, &imbalance, idle,
+			&sd_idle, &cpus);
 	if (!group) {
 		schedstat_inc(sd, lb_nobusyg[idle]);
 		goto out_balanced;
 	}
 
-	busiest = find_busiest_queue(group, idle);
+	busiest = find_busiest_queue(group, idle, &cpus);
 	if (!busiest) {
 		schedstat_inc(sd, lb_nobusyq[idle]);
 		goto out_balanced;
@@ -2138,8 +2148,12 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
 		double_rq_unlock(this_rq, busiest);
 
 		/* All tasks on this runqueue were pinned by CPU affinity */
-		if (unlikely(all_pinned))
+		if (unlikely(all_pinned)) {
+			cpu_clear(busiest->cpu, cpus);
+			if (!cpus_empty(cpus))
+				goto redo;
 			goto out_balanced;
+		}
 	}
 
 	if (!nr_moved) {
@@ -2226,18 +2240,21 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
 	unsigned long imbalance;
 	int nr_moved = 0;
 	int sd_idle = 0;
+	cpumask_t cpus = CPU_MASK_ALL;
 
 	if (sd->flags & SD_SHARE_CPUPOWER)
 		sd_idle = 1;
 
 	schedstat_inc(sd, lb_cnt[NEWLY_IDLE]);
-	group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE, &sd_idle);
+redo:
+	group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE,
+			&sd_idle, &cpus);
 	if (!group) {
 		schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]);
 		goto out_balanced;
 	}
 
-	busiest = find_busiest_queue(group, NEWLY_IDLE);
+	busiest = find_busiest_queue(group, NEWLY_IDLE, &cpus);
 	if (!busiest) {
 		schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]);
 		goto out_balanced;
@@ -2254,6 +2271,12 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
 		nr_moved = move_tasks(this_rq, this_cpu, busiest,
 					imbalance, sd, NEWLY_IDLE, NULL);
 		spin_unlock(&busiest->lock);
+
+		if (!nr_moved) {
+			cpu_clear(busiest->cpu, cpus);
+			if (!cpus_empty(cpus))
+				goto redo;
+		}
 	}
 
 	if (!nr_moved) {
@@ -6037,6 +6060,7 @@ void __init sched_init(void)
 			rq->cpu_load[j] = 0;
 		rq->active_balance = 0;
 		rq->push_cpu = 0;
+		rq->cpu = i;
 		rq->migration_thread = NULL;
 		INIT_LIST_HEAD(&rq->migration_queue);
 		rq->cpu = i;
author	Christoph Lameter <clameter@sgi.com>	2006-11-11 12:13:53 +0100
committer	Adrian Bunk <bunk@stusta.de>	2006-11-11 12:13:53 +0100
commit	ef147950bee58ecfa672433badf679608fb90668 (patch)
tree	499fda5911ff48ac9ef93d0fddfe1d20eaa796c9
parent	f9a198cc962f1347564493d406789b79cab4294b (diff)