---

 25-akpm/include/linux/sched.h |    6 
 25-akpm/kernel/sched.c        |  364 ++++++++++++++++++------------------------
 2 files changed, 166 insertions(+), 204 deletions(-)

diff -puN include/linux/sched.h~sched-ingo include/linux/sched.h
--- 25/include/linux/sched.h~sched-ingo	2004-03-26 12:26:21.472049240 -0800
+++ 25-akpm/include/linux/sched.h	2004-03-26 12:26:21.476048632 -0800
@@ -601,7 +601,7 @@ struct sched_domain {
 	.busy_factor		= 64,			\
 	.imbalance_pct		= 125,			\
 	.cache_hot_time		= (5*1000000/2),	\
-	.cache_nice_tries	= 1,			\
+	.cache_nice_tries	= 2,			\
 	.per_cpu_gain		= 100,			\
 	.flags			= SD_BALANCE_NEWIDLE	\
 				| SD_WAKE_AFFINE,	\
@@ -621,7 +621,7 @@ struct sched_domain {
 	.busy_factor		= 8,			\
 	.imbalance_pct		= 125,			\
 	.cache_hot_time		= (10*1000000),		\
-	.cache_nice_tries	= 1,			\
+	.cache_nice_tries	= 3,			\
 	.per_cpu_gain		= 100,			\
 	.flags			= SD_BALANCE_EXEC,	\
 	.last_balance		= jiffies,		\
@@ -647,7 +647,7 @@ static inline int set_cpus_allowed(task_
 
 extern unsigned long long sched_clock(void);
 
-#ifdef CONFIG_NUMA
+#ifdef CONFIG_SMP
 extern void sched_balance_exec(void);
 #else
 #define sched_balance_exec()   {}
diff -puN kernel/sched.c~sched-ingo kernel/sched.c
--- 25/kernel/sched.c~sched-ingo	2004-03-26 12:26:21.473049088 -0800
+++ 25-akpm/kernel/sched.c	2004-03-26 12:26:21.483047568 -0800
@@ -185,6 +185,8 @@ static unsigned int task_timeslice(task_
 	return BASE_TIMESLICE(p);
 }
 
+#define task_hot(p, now, sd) \
+		((now) - (p)->timestamp < (sd)->cache_hot_time)
 /*
  * These are the runqueue data structures:
  */
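
The task_hot() predicate above is the cache-hotness test used by the rest of this patch: a task counts as cache-hot if it last ran less than the domain's cache_hot_time ago. For illustration only (not part of the patch), a stand-alone user-space model with made-up types and nanosecond values:

	#include <stdio.h>

	/* minimal stand-ins for the kernel's task_t and struct sched_domain */
	struct task   { unsigned long long timestamp; };       /* last-ran time, ns */
	struct domain { unsigned long long cache_hot_time; };  /* hotness window, ns */

	#define task_hot(p, now, sd) \
			((now) - (p)->timestamp < (sd)->cache_hot_time)

	int main(void)
	{
		struct domain sd = { .cache_hot_time = 10000000ULL };  /* 10ms */
		struct task p   = { .timestamp = 42000000ULL };

		printf("%d\n", task_hot(&p, 45000000ULL, &sd));  /* ran 3ms ago: hot (1) */
		printf("%d\n", task_hot(&p, 57000000ULL, &sd));  /* ran 15ms ago: cold (0) */
		return 0;
	}
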
@@ -209,14 +211,7 @@ struct prio_array {
 struct runqueue {
 	spinlock_t lock;
 
-	/*
-	 * nr_running and cpu_load should be in the same cacheline because
-	 * remote CPUs use both these fields when doing load calculation.
-	 */
 	unsigned long nr_running;
-#ifdef CONFIG_SMP
-	unsigned long cpu_load;
-#endif
 	unsigned long long nr_switches;
 	unsigned long expired_timestamp, nr_uninterruptible;
 	unsigned long long timestamp_last_tick;
@@ -318,6 +313,21 @@ static void enqueue_task(struct task_str
 	p->array = array;
 }
 
+#ifdef CONFIG_SMP
+/*
+ * Used by the migration code - we pull tasks from the head of the
+ * remote queue so we want these tasks to show up at the head of the
+ * local queue:
+ */
+static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array)
+{
+	list_add(&p->run_list, array->queue + p->prio);
+	__set_bit(p->prio, array->bitmap);
+	array->nr_active++;
+	p->array = array;
+}
+#endif
+
 /*
  * effective_prio - return the priority that is based on the static
  * priority but is modified by bonuses/penalties.
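
enqueue_task_head() differs from enqueue_task() only in using list_add() instead of list_add_tail(), so a migrated task lands at the front of its priority list on the destination CPU. A self-contained sketch of that head-vs-tail distinction, using a simplified circular doubly-linked list rather than the kernel's <linux/list.h> (illustrative only):

	#include <stdio.h>

	struct node { int id; struct node *prev, *next; };

	static void list_init(struct node *head) { head->prev = head->next = head; }

	/* insert right after the head -- what enqueue_task_head() does */
	static void add_head(struct node *n, struct node *head)
	{
		n->next = head->next; n->prev = head;
		head->next->prev = n; head->next = n;
	}

	/* insert right before the head, i.e. at the tail -- what enqueue_task() does */
	static void add_tail(struct node *n, struct node *head)
	{
		n->prev = head->prev; n->next = head;
		head->prev->next = n; head->prev = n;
	}

	int main(void)
	{
		struct node head, a = { .id = 1 }, b = { .id = 2 }, c = { .id = 3 };
		struct node *p;

		list_init(&head);
		add_tail(&a, &head);	/* normal enqueue:  1     */
		add_tail(&b, &head);	/* normal enqueue:  1 2   */
		add_head(&c, &head);	/* migrated task:   3 1 2 */

		for (p = head.next; p != &head; p = p->next)
			printf("%d ", p->id);
		printf("\n");
		return 0;
	}
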
@@ -606,33 +616,22 @@ EXPORT_SYMBOL_GPL(kick_process);
 /*
  * Return a low guess at the load of cpu.
  */
-static inline unsigned long get_low_cpu_load(int cpu)
+static inline unsigned long cpu_load(int cpu)
 {
-	runqueue_t *rq = cpu_rq(cpu);
-	unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
-
-	return min(rq->cpu_load, load_now);
-}
-
-static inline unsigned long get_high_cpu_load(int cpu)
-{
-	runqueue_t *rq = cpu_rq(cpu);
-	unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
-
-	return max(rq->cpu_load, load_now);
+	return cpu_rq(cpu)->nr_running * SCHED_LOAD_SCALE;
 }
 
 #endif
 
 /*
- * wake_idle() is useful especially on SMT architectures to wake a
- * task onto an idle sibling if we would otherwise wake it onto a
- * busy sibling.
+ * wake_idle() can be used on SMT architectures to wake a task onto
+ * an idle sibling if 'cpu' is not idle.
  *
- * Returns the CPU we should wake onto.
+ * Returns 'cpu' if 'cpu' is idle or no siblings of 'cpu' are idle,
+ * otherwise returns an idle sibling.
  */
 #if defined(ARCH_HAS_SCHED_WAKE_IDLE)
-static int wake_idle(int cpu, task_t *p)
+static inline int wake_idle(int cpu, task_t *p)
 {
 	cpumask_t tmp;
 	struct sched_domain *sd;
@@ -649,7 +648,6 @@ static int wake_idle(int cpu, task_t *p)
 	for_each_cpu_mask(i, tmp) {
 		if (!cpu_isset(i, p->cpus_allowed))
 			continue;
-
 		if (idle_cpu(i))
 			return i;
 	}
@@ -685,17 +683,16 @@ static int try_to_wake_up(task_t * p, un
 	runqueue_t *rq;
 	int cpu, this_cpu;
 #ifdef CONFIG_SMP
+	int new_cpu;
 	unsigned long long now;
 	unsigned long load, this_load;
 	struct sched_domain *sd;
-	int new_cpu;
 #endif
 
 	rq = task_rq_lock(p, &flags);
 	old_state = p->state;
 	if (!(old_state & state))
 		goto out;
-
 	if (p->array)
 		goto out_running;
 
@@ -703,37 +700,41 @@ static int try_to_wake_up(task_t * p, un
 	this_cpu = smp_processor_id();
 
 #ifdef CONFIG_SMP
-	if (unlikely(task_running(rq, p) || cpu_is_offline(this_cpu)))
-		goto out_activate;
-
-	new_cpu = this_cpu; /* Wake to this CPU if we can */
-
-	if (cpu == this_cpu || unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
-		goto out_set_cpu;
+	new_cpu = this_cpu;
+	sd = cpu_sched_domain(this_cpu);
+	now = sched_clock();
 
-	/* Passive load balancing */
-	load = get_low_cpu_load(cpu);
-	this_load = get_high_cpu_load(this_cpu) + SCHED_LOAD_SCALE;
-	if (load > this_load)
+	if (cpu == this_cpu || unlikely(cpu_is_offline(this_cpu)))
 		goto out_set_cpu;
+	if (task_running(rq, p))
+		goto out_activate;
 
-	now = sched_clock();
+	/*
+	 * Passive load balancing, migrate the task if:
+	 *
+	 * - remote load is higher than local load, and
+	 * - task is woken up by another task
+	 * - or task is woken up from an irq handler and task is cache-cold.
+	 */
+	load = cpu_load(cpu);
+	this_load = cpu_load(this_cpu);
 
+	if (load > this_load && (!in_interrupt() || !task_hot(p, now, sd)))
+		goto out_set_cpu;
 	/*
 	 * Migrate the task to the waking domain.
-	 * Do not violate hard affinity.
+	 * Do not violate soft affinity.
 	 */
 	for_each_domain(this_cpu, sd) {
 		if (!(sd->flags & SD_WAKE_AFFINE))
 			break;
-		if (now - p->timestamp < sd->cache_hot_time)
+		if (task_hot(p, now, sd))
 			break;
-
 		if (cpu_isset(cpu, sd->span))
 			goto out_set_cpu;
 	}
 
-	new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */
+	new_cpu = cpu;
 out_set_cpu:
 	new_cpu = wake_idle(new_cpu, p);
 	if (new_cpu != cpu && cpu_isset(new_cpu, p->cpus_allowed)) {
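
The reworked wakeup path above boils down to one passive-balancing decision: pull the woken task over to the waking CPU when the remote runqueue is busier, unless the wakeup comes from an irq handler and the task is still cache-hot on its old CPU. A condensed user-space model of just that test, with the loads and flags passed in as plain parameters (illustrative, not the kernel code):

	#include <stdbool.h>
	#include <stdio.h>

	static bool pull_to_waker(unsigned long remote_load, unsigned long local_load,
				  bool in_irq, bool cache_hot)
	{
		if (remote_load <= local_load)
			return false;			/* old CPU no busier: leave the task there */
		return !in_irq || !cache_hot;		/* task-context wakeup, or cold task */
	}

	int main(void)
	{
		printf("%d\n", pull_to_waker(3, 1, false, true));	/* 1: woken by a task */
		printf("%d\n", pull_to_waker(3, 1, true,  true));	/* 0: irq wakeup, still hot */
		printf("%d\n", pull_to_waker(3, 1, true,  false));	/* 1: irq wakeup, cache-cold */
		return 0;
	}
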
@@ -748,10 +749,8 @@ repeat_lock_task:
 	old_state = p->state;
 	if (!(old_state & state))
 		goto out;
-
 	if (p->array)
 		goto out_running;
-
 	this_cpu = smp_processor_id();
 	cpu = task_cpu(p);
 
@@ -769,7 +768,7 @@ out_activate:
 	/*
 	 * Sync wakeups (i.e. those types of wakeups where the waker
 	 * has indicated that it will leave the CPU in short order)
-	 * don't trigger a preemption, if the woken up task will run on
+	 * don't trigger a preemption, if the woken up task will run on
 	 * this cpu. (in this case the 'I will reschedule' promise of
 	 * the waker guarantees that the freshly woken up task is going
 	 * to be considered on this CPU.)
@@ -1102,7 +1101,6 @@ enum idle_type
 };
 
 #ifdef CONFIG_SMP
-#ifdef CONFIG_NUMA
 /*
  * If dest_cpu is allowed for this process, migrate the task to it.
  * This is accomplished by forcing the cpu_allowed mask to only
@@ -1112,8 +1110,8 @@ enum idle_type
 static void sched_migrate_task(task_t *p, int dest_cpu)
 {
 	migration_req_t req;
-	runqueue_t *rq;
 	unsigned long flags;
+	runqueue_t *rq;
 
 	lock_cpu_hotplug();
 	rq = task_rq_lock(p, &flags);
@@ -1148,19 +1146,19 @@ out:
  */
 static int sched_best_cpu(struct task_struct *p, struct sched_domain *sd)
 {
+	int i = 0, min_load, this_cpu, best_cpu;
 	cpumask_t tmp;
-	int i, min_load, this_cpu, best_cpu;
 
 	best_cpu = this_cpu = task_cpu(p);
-	min_load = INT_MAX;
+
+	/* subtract the currently running task's load effect: */
+	min_load = cpu_load(this_cpu) - SCHED_LOAD_SCALE;
 
 	cpus_and(tmp, sd->span, cpu_online_map);
+	cpu_clear(this_cpu, tmp);
+
 	for_each_cpu_mask(i, tmp) {
-		unsigned long load;
-		if (i == this_cpu)
-			load = get_low_cpu_load(i);
-		else
-			load = get_high_cpu_load(i) + SCHED_LOAD_SCALE;
+		unsigned long load = cpu_load(i);
 
 		if (min_load > load) {
 			best_cpu = i;
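
sched_best_cpu() now seeds the search with the local CPU's load minus one SCHED_LOAD_SCALE unit (the exec'ing task's own contribution) and then keeps the least-loaded other CPU in the domain. The same scan, modeled over a plain array of per-CPU loads (array indices stand in for the cpumask walk; illustrative only):

	#include <stdio.h>

	#define SCHED_LOAD_SCALE 128UL	/* illustrative scale factor */

	static int best_cpu_for_exec(const unsigned long *load, int nr_cpus, int this_cpu)
	{
		/* subtract the exec'ing task's own load from the local CPU */
		unsigned long min_load = load[this_cpu] - SCHED_LOAD_SCALE;
		int best = this_cpu, i;

		for (i = 0; i < nr_cpus; i++) {
			if (i == this_cpu)
				continue;
			if (load[i] < min_load) {
				min_load = load[i];
				best = i;
			}
		}
		return best;
	}

	int main(void)
	{
		/* 3 runnable tasks on CPU0, 1 on CPU1, 2 on CPU2 */
		unsigned long load[] = { 3 * SCHED_LOAD_SCALE, 1 * SCHED_LOAD_SCALE,
					 2 * SCHED_LOAD_SCALE };

		printf("best CPU: %d\n", best_cpu_for_exec(load, 3, 0));	/* 1 */
		return 0;
	}
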
@@ -1172,28 +1170,30 @@ static int sched_best_cpu(struct task_st
 
 /*
  * sched_balance_exec(): find the highest-level, exec-balance-capable
- * domain and try to migrate the task to the least loaded CPU.
+ * domain and try to migrate the current task to the least loaded CPU.
  *
  * execve() is a valuable balancing opportunity, because at this point
- * the task has the smallest effective memory and cache footprint.
+ * the task has the smallest effective cache footprint - a completely new
+ * process image is being created, so almost all of the currently existing
+ * cache footprint is irrelevant. So we attempt to balance this task as
+ * broadly as possible, without considering migration costs, which
+ * otherwise affect all other types of task migrations.
  */
 void sched_balance_exec(void)
 {
 	struct sched_domain *sd, *best_sd = NULL;
-	int new_cpu;
-	int this_cpu = get_cpu();
+	int new_cpu, this_cpu = get_cpu();
 
-	/* Prefer the current CPU if there's only this task running */
+	/* Prefer the current CPU if there's only this task running: */
 	if (this_rq()->nr_running <= 1)
 		goto out;
 
-	for_each_domain(this_cpu, sd) {
+	for_each_domain(this_cpu, sd)
 		if (sd->flags & SD_BALANCE_EXEC)
 			best_sd = sd;
-	}
 
 	if (best_sd) {
-		new_cpu = sched_best_cpu(current, sd);
+		new_cpu = sched_best_cpu(current, best_sd);
 		if (new_cpu != this_cpu) {
 			put_cpu();
 			sched_migrate_task(current, new_cpu);
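
Because the loop keeps overwriting best_sd, sched_balance_exec() ends up with the highest (widest) domain that has SD_BALANCE_EXEC set. A tiny sketch of that selection over a parent-linked domain chain (flag value and types are made up for illustration):

	#include <stdio.h>
	#include <stddef.h>

	#define SD_BALANCE_EXEC 0x04	/* illustrative flag value, not the kernel's */

	struct domain { int flags; struct domain *parent; };

	/* walk upwards, remembering the last (highest) level allowing exec balancing */
	static struct domain *highest_exec_domain(struct domain *sd)
	{
		struct domain *best = NULL;

		for (; sd; sd = sd->parent)
			if (sd->flags & SD_BALANCE_EXEC)
				best = sd;
		return best;
	}

	int main(void)
	{
		struct domain node = { .flags = SD_BALANCE_EXEC, .parent = NULL };
		struct domain cpu  = { .flags = 0,               .parent = &node };

		printf("%s\n", highest_exec_domain(&cpu) == &node ? "node level" : "none");
		return 0;
	}
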
@@ -1203,7 +1203,6 @@ void sched_balance_exec(void)
 out:
 	put_cpu();
 }
-#endif /* CONFIG_NUMA */
 
 /*
  * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
@@ -1226,13 +1225,13 @@ static void double_lock_balance(runqueue
  */
 static inline
 void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p,
-		runqueue_t *this_rq, prio_array_t *this_array, int this_cpu)
+	       runqueue_t *this_rq, prio_array_t *this_array, int this_cpu)
 {
 	dequeue_task(p, src_array);
-	this_rq->nr_running--;
+	src_rq->nr_running--;
 	set_task_cpu(p, this_cpu);
 	this_rq->nr_running++;
-	enqueue_task(p, this_array);
+	enqueue_task_head(p, this_array);
 	p->timestamp = sched_clock() -
 				(src_rq->timestamp_last_tick - p->timestamp);
 	/*
@@ -1248,7 +1247,7 @@ void pull_task(runqueue_t *src_rq, prio_
  */
 static inline
 int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
-		struct sched_domain *sd, enum idle_type idle)
+		     struct sched_domain *sd, enum idle_type idle)
 {
 	/*
 	 * We do not migrate tasks that are:
@@ -1261,15 +1260,19 @@ int can_migrate_task(task_t *p, runqueue
 	if (!cpu_isset(this_cpu, p->cpus_allowed))
 		return 0;
 
-	/* Aggressive migration if we've failed balancing */
-	if (idle == NEWLY_IDLE ||
-			sd->nr_balance_failed < sd->cache_nice_tries) {
-		if ((rq->timestamp_last_tick - p->timestamp)
-						< sd->cache_hot_time)
-			return 0;
-	}
+	if (!task_hot(p, rq->timestamp_last_tick, sd))
+		return 1;
 
-	return 1;
+	/* Aggressive migration if newly idle or we've failed balancing */
+	if (idle == NEWLY_IDLE)
+		return 1;
+	if (idle == IDLE && (sd->flags & SD_BALANCE_NEWIDLE))
+		return 1;
+	if (sd->nr_balance_failed >= sd->cache_nice_tries)
+		return 1;
+
+	/* abort the search: */
+	return -1;
 }
 
 /*
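
can_migrate_task() is now a tri-state: 1 means migrate, 0 means skip this task but keep scanning, and -1 tells move_tasks() to give up on the current priority level (tasks queued behind a still-hot one at the same priority generally ran even more recently, so they will be hot as well). A condensed model of the verdict, with the kernel-side checks flattened into booleans (illustrative; the IDLE/SD_BALANCE_NEWIDLE case is folded into the aggressive path):

	#include <stdio.h>

	enum verdict { ABORT_LEVEL = -1, SKIP_TASK = 0, MIGRATE_OK = 1 };

	static int migrate_verdict(int running, int allowed, int hot,
				   int aggressive, int failures, int nice_tries)
	{
		if (running || !allowed)
			return SKIP_TASK;	/* never movable: try the next task */
		if (!hot)
			return MIGRATE_OK;	/* cache-cold: always fine to move */
		if (aggressive || failures >= nice_tries)
			return MIGRATE_OK;	/* newly idle, or balancing keeps failing */
		return ABORT_LEVEL;		/* hot task at the head: stop scanning this prio */
	}

	int main(void)
	{
		printf("%d\n", migrate_verdict(0, 1, 1, 0, 0, 2));	/* -1 */
		printf("%d\n", migrate_verdict(0, 1, 1, 1, 0, 2));	/*  1 */
		printf("%d\n", migrate_verdict(1, 1, 0, 0, 0, 2));	/*  0 */
		return 0;
	}
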
@@ -1280,30 +1283,24 @@ int can_migrate_task(task_t *p, runqueue
  * Called with both runqueues locked.
  */
 static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest,
-			unsigned long max_nr_move, struct sched_domain *domain,
-			enum idle_type idle)
+		      unsigned long max_nr_move, struct sched_domain *sd,
+		      enum idle_type idle)
 {
-	int idx;
-	int pulled = 0;
 	prio_array_t *array, *dst_array;
 	struct list_head *head, *curr;
+	int ret, idx, pulled = 0;
 	task_t *tmp;
 
 	if (max_nr_move <= 0 || busiest->nr_running <= 1)
 		goto out;
 
-	/*
-	 * We first consider expired tasks. Those will likely not be
-	 * executed in the near future, and they are most likely to
-	 * be cache-cold, thus switching CPUs has the least effect
-	 * on them.
-	 */
-	if (busiest->expired->nr_active) {
-		array = busiest->expired;
-		dst_array = this_rq->expired;
-	} else {
+	/* We first consider active tasks. */
+	if (busiest->active->nr_active) {
 		array = busiest->active;
 		dst_array = this_rq->active;
+	} else {
+		array = busiest->expired;
+		dst_array = this_rq->expired;
 	}
 
 new_array:
@@ -1315,22 +1312,27 @@ skip_bitmap:
 	else
 		idx = find_next_bit(array->bitmap, MAX_PRIO, idx);
 	if (idx >= MAX_PRIO) {
-		if (array == busiest->expired && busiest->active->nr_active) {
-			array = busiest->active;
-			dst_array = this_rq->active;
+		if (array == busiest->active && busiest->expired->nr_active) {
+			array = busiest->expired;
+			dst_array = this_rq->expired;
 			goto new_array;
 		}
 		goto out;
 	}
 
 	head = array->queue + idx;
-	curr = head->prev;
+	curr = head->next;
 skip_queue:
 	tmp = list_entry(curr, task_t, run_list);
 
-	curr = curr->prev;
+	curr = curr->next;
 
-	if (!can_migrate_task(tmp, busiest, this_cpu, domain, idle)) {
+	ret = can_migrate_task(tmp, busiest, this_cpu, sd, idle);
+	if (ret == -1) {
+		idx++;
+		goto skip_bitmap;
+	}
+	if (!ret) {
 		if (curr != head)
 			goto skip_queue;
 		idx++;
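
move_tasks() also flips its scan order: it starts with the active array and walks each priority list from the head (the longest-waiting, typically coldest task) towards the tail, instead of tail-first from the expired array, and the -1 verdict above aborts the rest of that priority level. A condensed model of one level's scan, with verdicts[] standing in for successive can_migrate_task() results (illustrative only):

	#include <stdio.h>

	static int pull_from_level(const int *verdicts, int nr_tasks, int max_nr_move)
	{
		int i, pulled = 0;

		for (i = 0; i < nr_tasks && pulled < max_nr_move; i++) {
			if (verdicts[i] == -1)
				break;		/* hot task: abandon this priority level */
			if (verdicts[i] == 0)
				continue;	/* pinned or running: try the next task */
			pulled++;		/* pull_task() would move this one */
		}
		return pulled;
	}

	int main(void)
	{
		int level[] = { 1, 0, 1, -1, 1 };

		/* the scan stops at the -1, so only two tasks get pulled */
		printf("pulled %d\n", pull_from_level(level, 5, 8));
		return 0;
	}
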
@@ -1353,46 +1355,30 @@ out:
 /*
  * find_busiest_group finds and returns the busiest CPU group within the
  * domain. It calculates and returns the number of tasks which should be
- * moved to restore balance via the imbalance parameter.
+ * moved to restore balance, via the imbalance parameter.
  */
 static struct sched_group *
 find_busiest_group(struct sched_domain *sd, int this_cpu,
-		unsigned long *imbalance, enum idle_type idle)
+		   unsigned long *imbalance, enum idle_type idle)
 {
 	struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
-	unsigned long max_load, avg_load, total_load, this_load, total_pwr;
+	unsigned long max_load, avg_load, total_load, this_load;
+	unsigned int total_pwr;
 
-	max_load = this_load = total_load = total_pwr = 0;
+	max_load = this_load = total_load = 0;
+	total_pwr = 0;
 
 	do {
 		cpumask_t tmp;
-		unsigned long load;
-		int local_group;
-		int i, nr_cpus = 0;
-
-		local_group = cpu_isset(this_cpu, group->cpumask);
+		int i;
 
 		/* Tally up the load of all CPUs in the group */
-		avg_load = 0;
 		cpus_and(tmp, group->cpumask, cpu_online_map);
-		if (cpus_empty(tmp)) {
-			WARN_ON(1);
-			goto out_balanced;
-		}
+		WARN_ON(cpus_empty(tmp));
 
-		for_each_cpu_mask(i, tmp) {
-			/* Bias balancing toward cpus of our domain */
-			if (local_group) {
-				load = get_high_cpu_load(i);
-			} else
-				load = get_low_cpu_load(i);
-
-			nr_cpus++;
-			avg_load += load;
-		}
-
-		if (!nr_cpus)
-			goto nextgroup;
+		avg_load = 0;
+		for_each_cpu_mask(i, tmp)
+			avg_load += cpu_load(i);
 
 		total_load += avg_load;
 		total_pwr += group->cpu_power;
@@ -1400,7 +1386,7 @@ find_busiest_group(struct sched_domain *
 		/* Adjust by relative CPU power of the group */
 		avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
 
-		if (local_group) {
+		if (cpu_isset(this_cpu, group->cpumask)) {
 			this_load = avg_load;
 			this = group;
 			goto nextgroup;
@@ -1437,37 +1423,9 @@ nextgroup:
 	 */
 	*imbalance = (min(max_load - avg_load, avg_load - this_load) + 1) / 2;
 
-	if (*imbalance <= SCHED_LOAD_SCALE/2) {
-		unsigned long pwr_now = 0, pwr_move = 0;
-		unsigned long tmp;
-
-		/*
-		 * OK, we don't have enough imbalance to justify moving tasks,
-		 * however we may be able to increase total CPU power used by
-		 * moving them.
-		 */
+	if (*imbalance <= SCHED_LOAD_SCALE/2)
+		goto out_balanced;
 
-		pwr_now += busiest->cpu_power*min(SCHED_LOAD_SCALE, max_load);
-		pwr_now += this->cpu_power*min(SCHED_LOAD_SCALE, this_load);
-		pwr_now /= SCHED_LOAD_SCALE;
-
-		/* Amount of load we'd subtract */
-		tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/busiest->cpu_power;
-		if (max_load > tmp)
-			pwr_move += busiest->cpu_power*min(SCHED_LOAD_SCALE,
-							max_load - tmp);
-
-		/* Amount of load we'd add */
-		tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/this->cpu_power;
-		pwr_move += this->cpu_power*min(this->cpu_power, this_load + tmp);
-		pwr_move /= SCHED_LOAD_SCALE;
-
-		/* Move if we gain another 8th of a CPU worth of throughput */
-		if (pwr_move < pwr_now + SCHED_LOAD_SCALE / 8)
-			goto out_balanced;
-		*imbalance = 1;
-		return busiest;
-	}
 
 	/* How many tasks to actually move to equalise the imbalance */
 	*imbalance = (*imbalance * min(busiest->cpu_power, this->cpu_power))
@@ -1492,14 +1450,15 @@ out_balanced:
  */
 static runqueue_t *find_busiest_queue(struct sched_group *group)
 {
-	cpumask_t tmp;
 	unsigned long load, max_load = 0;
 	runqueue_t *busiest = NULL;
+	cpumask_t tmp;
 	int i;
 
 	cpus_and(tmp, group->cpumask, cpu_online_map);
+
 	for_each_cpu_mask(i, tmp) {
-		load = get_low_cpu_load(i);
+		load = cpu_load(i);
 
 		if (load >= max_load) {
 			max_load = load;
@@ -1520,8 +1479,8 @@ static int load_balance(int this_cpu, ru
 			struct sched_domain *sd, enum idle_type idle)
 {
 	struct sched_group *group;
-	runqueue_t *busiest;
 	unsigned long imbalance;
+	runqueue_t *busiest;
 	int nr_moved;
 
 	spin_lock(&this_rq->lock);
@@ -1529,26 +1488,19 @@ static int load_balance(int this_cpu, ru
 	group = find_busiest_group(sd, this_cpu, &imbalance, idle);
 	if (!group)
 		goto out_balanced;
-
 	busiest = find_busiest_queue(group);
-	if (!busiest)
-		goto out_balanced;
-	if (unlikely(busiest == this_rq)) {
-		WARN_ON(1);
+	if (!busiest || busiest == this_rq)
 		goto out_balanced;
-	}
 
 	/* Attempt to move tasks */
 	double_lock_balance(this_rq, busiest);
-
 	nr_moved = move_tasks(this_rq, this_cpu, busiest, imbalance, sd, idle);
 	spin_unlock(&this_rq->lock);
 	spin_unlock(&busiest->lock);
 
 	if (!nr_moved) {
 		sd->nr_balance_failed++;
-
-		if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
+		if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries)) {
 			int wake = 0;
 
 			spin_lock(&busiest->lock);
@@ -1560,17 +1512,16 @@ static int load_balance(int this_cpu, ru
 			spin_unlock(&busiest->lock);
 			if (wake)
 				wake_up_process(busiest->migration_thread);
-
 			/*
-			 * We've kicked active balancing, reset the failure
-			 * counter.
+			 * We've kicked active balancing, reset the
+			 * failure counter:
 			 */
-			sd->nr_balance_failed = sd->cache_nice_tries;
+			sd->nr_balance_failed = 0;
 		}
 	} else
 		sd->nr_balance_failed = 0;
 
-	/* We were unbalanced, so reset the balancing interval */
+	/* reset the balancing interval: */
 	sd->balance_interval = sd->min_interval;
 
 	return nr_moved;
@@ -1578,7 +1529,7 @@ static int load_balance(int this_cpu, ru
 out_balanced:
 	spin_unlock(&this_rq->lock);
 
-	/* tune up the balancing interval */
+	/* tune up the balancing interval: */
 	if (sd->balance_interval < sd->max_interval)
 		sd->balance_interval *= 2;
 
@@ -1631,14 +1582,11 @@ static inline void idle_balance(int this
 	if (unlikely(cpu_is_offline(this_cpu)))
 		return;
 
-	for_each_domain(this_cpu, sd) {
-		if (sd->flags & SD_BALANCE_NEWIDLE) {
-			if (load_balance_newidle(this_cpu, this_rq, sd)) {
+	for_each_domain(this_cpu, sd)
+		if (sd->flags & SD_BALANCE_NEWIDLE)
+			if (load_balance_newidle(this_cpu, this_rq, sd))
 				/* We've pulled tasks over so stop searching */
 				break;
-			}
-		}
-	}
 }
 
 /*
@@ -1651,19 +1599,18 @@ static inline void idle_balance(int this
  */
 static void active_load_balance(runqueue_t *busiest, int busiest_cpu)
 {
-	struct sched_domain *sd;
 	struct sched_group *group, *busy_group;
+	struct sched_domain *sd;
 	int i;
 
 	if (busiest->nr_running <= 1)
 		return;
 
-	for_each_domain(busiest_cpu, sd) {
+	for_each_domain(busiest_cpu, sd)
 		if (cpu_isset(busiest->push_cpu, sd->span))
 			break;
-	}
 
-	if (!sd->parent || !cpu_isset(busiest->push_cpu, sd->span)) {
+	if (!sd->parent && !cpu_isset(busiest->push_cpu, sd->span)) {
 		WARN_ON(1);
 		return;
 	}
@@ -1689,7 +1636,7 @@ static void active_load_balance(runqueue
  			push_cpu = i;
  			nr++;
  		}
- 		if (nr == 0)
+ 		if (!nr)
  			goto next_group;
 
 		rq = cpu_rq(push_cpu);
@@ -1713,21 +1660,16 @@ next_group:
 /* Don't have all balancing operations going off at once */
 #define CPU_OFFSET(cpu) (HZ * cpu / NR_CPUS)
 
-static void rebalance_tick(int this_cpu, runqueue_t *this_rq,
-						enum idle_type idle)
+static void
+rebalance_tick(int this_cpu, runqueue_t *this_rq, enum idle_type idle)
 {
-	unsigned long old_load, this_load;
 	unsigned long j = jiffies + CPU_OFFSET(this_cpu);
 	struct sched_domain *sd;
 
 	if (unlikely(cpu_is_offline(this_cpu)))
 		return;
 
-	/* Update our load */
-	old_load = this_rq->cpu_load;
-	this_load = this_rq->nr_running * SCHED_LOAD_SCALE;
-	this_rq->cpu_load = (old_load + this_load) / 2;
-
+	/* Run through all this CPU's domains */
 	for_each_domain(this_cpu, sd) {
 		unsigned long interval = sd->balance_interval;
 
@@ -1736,7 +1678,7 @@ static void rebalance_tick(int this_cpu,
 
 		/* scale ms to jiffies */
 		interval = MSEC_TO_JIFFIES(interval);
-		if (unlikely(interval == 0))
+		if (unlikely(!interval))
 			interval = 1;
 
 		if (j - sd->last_balance >= interval) {
@@ -1755,7 +1697,7 @@ static void rebalance_tick(int this_cpu,
 static inline void rebalance_tick(int cpu, runqueue_t *rq, enum idle_type idle)
 {
 }
-static inline void idle_balance(int cpu, runqueue_t *rq)
+static inline void idle_balance(int this_cpu, runqueue_t *this_rq)
 {
 }
 #endif
@@ -3404,10 +3346,10 @@ static void __init arch_init_sched_domai
 
 	/* Set up groups */
 	for (i = 0; i < MAX_NUMNODES; i++) {
-		cpumask_t tmp = node_to_cpumask(i);
-		cpumask_t nodemask;
 		struct sched_group *first_cpu = NULL, *last_cpu = NULL;
 		struct sched_group *node = &sched_group_nodes[i];
+		cpumask_t tmp = node_to_cpumask(i);
+		cpumask_t nodemask;
 		int j;
 
 		cpus_and(nodemask, tmp, cpu_possible_map);
@@ -3529,12 +3471,12 @@ void sched_domain_debug(void)
 				printk(" ");
 			printk("groups:");
 			do {
-				if (group == NULL) {
+				if (!group) {
 					printk(" ERROR: NULL");
 					break;
 				}
 
-				if (cpus_weight(group->cpumask) == 0)
+				if (!cpus_weight(group->cpumask))
 					printk(" ERROR empty group:");
 
 				cpus_and(tmp, groupmask, group->cpumask);
@@ -3588,9 +3530,29 @@ void __init sched_init(void)
 	for (i = 0; i < NR_CPUS; i++) {
 		prio_array_t *array;
 #ifdef CONFIG_SMP
-		struct sched_domain *domain;
-		domain = cpu_sched_domain(i);
-		memset(domain, 0, sizeof(struct sched_domain));
+		static struct sched_group __initdata sched_group_init[NR_CPUS];
+		struct sched_domain *sd;
+		struct sched_group *group;
+
+		/*
+		 * Create isolated, 1-CPU, no-balancing domains to avoid
+		 * special-cases during early bootup. Once topology info
+		 * is available later in the bootup, the architecture
+		 * sets up an optimal domain-hierarchy, in the
+		 * arch_init_sched_domains() function.
+		 */
+		sd = cpu_sched_domain(i);
+		memset(sd, 0, sizeof(struct sched_domain));
+		cpus_clear(sd->span);
+		cpu_set(i, sd->span);
+
+		group = sched_group_init + i;
+		group->next = group;
+		cpus_clear(group->cpumask);
+		cpu_set(i, group->cpumask);
+		group->cpu_power = SCHED_LOAD_SCALE;
+
+		sd->groups = group;
 #endif
 
 		rq = cpu_rq(i);

_