diff -urN numa-sched-ref/arch/alpha/config.in numa-sched/arch/alpha/config.in
--- numa-sched-ref/arch/alpha/config.in	Thu Apr 12 20:03:04 2001
+++ numa-sched/arch/alpha/config.in	Thu Apr 12 20:03:26 2001
@@ -208,6 +208,9 @@
    bool 'Discontiguous Memory Support' CONFIG_DISCONTIGMEM
    if [ "$CONFIG_DISCONTIGMEM" = "y" ]; then
       bool ' NUMA Support' CONFIG_NUMA
+      if [ "$CONFIG_NUMA" = "y" ]; then	
+	bool '  NUMA Scheduler Support' CONFIG_NUMA_SCHED
+      fi
    fi
 fi
 
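With this hunk the new option only shows up on Alpha once its parents are selected: CONFIG_NUMA_SCHED depends on CONFIG_NUMA, which in turn depends on CONFIG_DISCONTIGMEM. On a machine configured with all three, the resulting .config fragment would simply read:

CONFIG_DISCONTIGMEM=y
CONFIG_NUMA=y
CONFIG_NUMA_SCHED=y
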
diff -urN numa-sched-ref/include/asm-alpha/mmzone.h numa-sched/include/asm-alpha/mmzone.h
--- numa-sched-ref/include/asm-alpha/mmzone.h	Thu Apr 12 20:03:04 2001
+++ numa-sched/include/asm-alpha/mmzone.h	Thu Apr 12 20:03:26 2001
@@ -21,7 +21,7 @@
 #ifdef NOTYET
 	kern_vars_t	kern_vars;
 #endif
-#if defined(CONFIG_NUMA) && defined(CONFIG_NUMA_SCHED)
+#ifdef CONFIG_NUMA_SCHED
 	struct numa_schedule_data schedule_data;
 #endif
 } plat_pg_data_t;
diff -urN numa-sched-ref/include/asm-alpha/timex.h numa-sched/include/asm-alpha/timex.h
--- numa-sched-ref/include/asm-alpha/timex.h	Tue Dec 29 22:56:15 1998
+++ numa-sched/include/asm-alpha/timex.h	Thu Apr 12 20:03:26 2001
@@ -27,4 +27,8 @@
 	return ret;
 }
 
+typedef long last_schedule_t;
+#define get_last_schedule() ({ jiffies; })
+#define last_schedule_before(a, b)	time_before(a, b)
+
 #endif
diff -urN numa-sched-ref/include/asm-i386/timex.h numa-sched/include/asm-i386/timex.h
--- numa-sched-ref/include/asm-i386/timex.h	Tue Apr 10 00:40:42 2001
+++ numa-sched/include/asm-i386/timex.h	Thu Apr 12 20:03:26 2001
@@ -45,4 +45,8 @@
 #endif
 }
 
+typedef cycles_t last_schedule_t;
+#define get_last_schedule() ({ get_cycles(); })
+#define last_schedule_before(a, b)	({ (a) < (b); })
+
 #endif
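The two timex.h hunks above define an arch-specific stamp of the last time a CPU scheduled: on Alpha last_schedule_t is jiffies and ordering goes through the wrap-safe time_before(), while i386 keeps the raw cycle counter and a plain comparison. Since a jiffies-based stamp has no spare bit pattern that can mean "no idle CPU found yet", the reschedule_idle() changes further down replace the old (cycles_t)-1 sentinel with an explicit found_idle flag. A minimal user-space sketch of that selection idiom; the array, the idle mask and the time_before-style macro here are illustrative stand-ins, not code from the patch:

#include <stdio.h>

typedef long last_schedule_t;			/* jiffies-style stamp, may wrap */
/* same test as the kernel's time_before(a, b): true if a is earlier than b */
#define last_schedule_before(a, b)	((long)(a) - (long)(b) < 0)

int main(void)
{
	/* fake per-CPU data: when each CPU last scheduled, and which ones are idle */
	last_schedule_t last_schedule[4] = { 100, 90, 150, 95 };
	int idle[4] = { 0, 1, 1, 0 };
	last_schedule_t oldest_idle = 0;
	int cpu, target = -1, found_idle = 0;

	for (cpu = 0; cpu < 4; cpu++) {
		if (!idle[cpu])
			continue;
		/* no sentinel: the flag records whether oldest_idle holds a real stamp */
		if (!found_idle || last_schedule_before(last_schedule[cpu], oldest_idle)) {
			oldest_idle = last_schedule[cpu];
			target = cpu;
			found_idle = 1;
		}
	}
	printf("longest-idle cpu: %d\n", target);	/* prints 1 */
	return 0;
}

CPU 1 wins because it carries the oldest stamp, i.e. it has been idle the longest and should have the least active cache context worth preserving.
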
diff -urN numa-sched-ref/include/linux/numa_sched.h numa-sched/include/linux/numa_sched.h
--- numa-sched-ref/include/linux/numa_sched.h	Thu Jan  1 01:00:00 1970
+++ numa-sched/include/linux/numa_sched.h	Thu Apr 12 20:03:26 2001
@@ -0,0 +1,57 @@
+/*
+ *  linux/include/linux/numa_sched.h
+ *
+ *  NUMA based scheduler
+ */
+
+#ifndef _LINUX_NUMA_SCHED_H
+#define _LINUX_NUMA_SCHED_H
+
+#ifdef CONFIG_NUMA_SCHED
+#include <linux/cache.h>
+#include <linux/list.h>
+#include <linux/threads.h>
+#include <asm/timex.h>
+
+struct numa_per_cpu_schedule_data {
+	struct task_struct * curr;
+	last_schedule_t last_schedule;
+};
+
+struct numa_schedule_data {
+	struct numa_per_cpu_schedule_data per_cpu[NR_CPUS] ____cacheline_aligned;
+	struct list_head runqueue_head;
+	int nr_running, nr_threads;
+};
+
+#define numa_nr_running_inc() do { NODE_SCHEDULE_DATA(numa_node_id())->nr_running++; } while(0)
+#define numa_nr_running_dec() do { NODE_SCHEDULE_DATA(numa_node_id())->nr_running--; } while(0)
+#define numa_nr_running(nid) (NODE_SCHEDULE_DATA(nid)->nr_running)
+
+#define numa_nr_threads_inc() do { NODE_SCHEDULE_DATA(numa_node_id())->nr_threads++; } while(0)
+#define numa_nr_threads_dec() do { NODE_SCHEDULE_DATA(numa_node_id())->nr_threads--; } while(0)
+#define numa_nr_threads(nid) (NODE_SCHEDULE_DATA(nid)->nr_threads)
+
+extern void numa_set_node_affinity(struct task_struct *);
+
+#define cpu_curr(cpu) (NODE_SCHEDULE_DATA(cputonode(cpu))->per_cpu[(cpu)].curr)
+#define last_schedule(cpu) (NODE_SCHEDULE_DATA(cputonode(cpu))->per_cpu[(cpu)].last_schedule)
+
+#define numa_runqueue_head(x) (&NODE_SCHEDULE_DATA(x)->runqueue_head)
+
+#else /* CONFIG_NUMA_SCHED */
+
+#define numa_nr_running_inc() do { } while(0)
+#define numa_nr_running_dec() do { } while(0)
+#define numa_nr_threads_inc() do { } while(0)
+#define numa_nr_threads_dec() do { } while(0)
+
+#define numa_set_node_affinity(x) do { } while(0)
+
+#define cpu_curr(cpu) (aligned_data[(cpu)].schedule_data.curr)
+#define last_schedule(cpu) (aligned_data[(cpu)].schedule_data.last_schedule)
+
+#define numa_runqueue_head(x) (&runqueue_head)
+#endif /* CONFIG_NUMA_SCHED */
+
+#endif /* _LINUX_NUMA_SCHED_H */
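numa_sched.h keeps both configurations source-compatible: with CONFIG_NUMA_SCHED the accessors resolve to the struct numa_schedule_data that the mmzone.h hunk above embeds in each node's plat_pg_data_t (NODE_SCHEDULE_DATA(), numa_node_id() and cputonode() are provided by the arch headers, not by this patch); without it they collapse onto the single global runqueue_head and the per-CPU aligned_data in kernel/sched.c. A small user-space model of the per-node bookkeeping, with simplified stand-ins for the arch macros (the real numa_nr_running_inc() takes no argument and uses numa_node_id() implicitly):

#include <stdio.h>

#define NR_NODES	2
#define NR_CPUS		4

struct node_sched {			/* models struct numa_schedule_data */
	int nr_running;
	int nr_threads;
};
static struct node_sched node_data[NR_NODES];

/* stand-ins for the arch-provided macros */
#define NODE_SCHEDULE_DATA(nid)	(&node_data[(nid)])
#define cputonode(cpu)		((cpu) / (NR_CPUS / NR_NODES))

/* node-local counters; the real code updates them under runqueue_lock,
 * which is why a plain increment is sufficient */
#define numa_nr_running_inc(nid)	(NODE_SCHEDULE_DATA(nid)->nr_running++)
#define numa_nr_running(nid)		(NODE_SCHEDULE_DATA(nid)->nr_running)

int main(void)
{
	int cpu;

	/* pretend one runnable task is homed on each CPU's node */
	for (cpu = 0; cpu < NR_CPUS; cpu++)
		numa_nr_running_inc(cputonode(cpu));

	printf("node 0: %d running, node 1: %d running\n",
	       numa_nr_running(0), numa_nr_running(1));
	return 0;
}

Keeping nr_running per node is what lets a heuristic like node_cpu_load() compare node load without walking any runqueue.
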
diff -urN numa-sched-ref/include/linux/sched.h numa-sched/include/linux/sched.h
--- numa-sched-ref/include/linux/sched.h	Thu Apr 12 16:54:02 2001
+++ numa-sched/include/linux/sched.h	Thu Apr 12 20:03:26 2001
@@ -26,6 +26,7 @@
 #include <linux/signal.h>
 #include <linux/securebits.h>
 #include <linux/fs_struct.h>
+#include <linux/numa_sched.h>
 
 /*
  * cloning flags:
@@ -304,7 +305,7 @@
 	long nice;
 	unsigned long policy;
 	struct mm_struct *mm;
-	int has_cpu, processor;
+	int has_cpu, processor, nid;
 	unsigned long cpus_allowed;
 	/*
 	 * (only the 'next' pointer fits into the cacheline, but
@@ -452,7 +453,7 @@
     policy:		SCHED_OTHER,					\
     mm:			NULL,						\
     active_mm:		&init_mm,					\
-    cpus_allowed:	-1,						\
+    cpus_allowed:	-1UL,						\
     run_list:		LIST_HEAD_INIT(tsk.run_list),			\
     next_task:		&tsk,						\
     prev_task:		&tsk,						\
@@ -760,6 +761,30 @@
 extern void FASTCALL(add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t * wait));
 extern void FASTCALL(remove_wait_queue(wait_queue_head_t *q, wait_queue_t * wait));
 
+#define nr_running_inc()			\
+do {						\
+	numa_nr_running_inc();			\
+	nr_running++;				\
+} while (0)
+
+#define nr_running_dec()			\
+do {						\
+	numa_nr_running_dec();			\
+	nr_running--;				\
+} while (0)
+
+#define nr_threads_inc()			\
+do {						\
+	numa_nr_threads_inc();			\
+	nr_threads++;				\
+} while (0)
+
+#define nr_threads_dec()			\
+do {						\
+	numa_nr_threads_dec();			\
+	nr_threads--;				\
+} while (0)
+
 #define __wait_event(wq, condition) 					\
 do {									\
 	wait_queue_t __wait;						\
@@ -840,29 +865,29 @@
 #define next_thread(p) \
 	list_entry((p)->thread_group.next, struct task_struct, thread_group)
 
-static inline void del_from_runqueue(struct task_struct * p)
-{
-	nr_running--;
-	p->sleep_time = jiffies;
-	list_del(&p->run_list);
-	p->run_list.next = NULL;
-}
+#define del_from_runqueue(p)			\
+do {						\
+	nr_running_dec();			\
+	(p)->sleep_time = jiffies;		\
+	list_del(&(p)->run_list);		\
+	(p)->run_list.next = NULL;		\
+} while(0)
 
 static inline int task_on_runqueue(struct task_struct *p)
 {
 	return (p->run_list.next != NULL);
 }
 
-static inline void unhash_process(struct task_struct *p)
-{
-	if (task_on_runqueue(p)) BUG();
-	write_lock_irq(&tasklist_lock);
-	nr_threads--;
-	unhash_pid(p);
-	REMOVE_LINKS(p);
-	list_del(&p->thread_group);
-	write_unlock_irq(&tasklist_lock);
-}
+#define unhash_process(p)			\
+do {						\
+	if (task_on_runqueue(p)) BUG();		\
+	write_lock_irq(&tasklist_lock);		\
+	nr_threads_dec();			\
+	unhash_pid(p);				\
+	REMOVE_LINKS(p);			\
+	list_del(&(p)->thread_group);		\
+	write_unlock_irq(&tasklist_lock);	\
+} while(0)
 
 /* Protects ->fs, ->files, ->mm, and synchronises with wait4().  Nests inside tasklist_lock */
 static inline void task_lock(struct task_struct *p)
diff -urN numa-sched-ref/kernel/fork.c numa-sched/kernel/fork.c
--- numa-sched-ref/kernel/fork.c	Thu Apr 12 16:54:02 2001
+++ numa-sched/kernel/fork.c	Thu Apr 12 20:03:26 2001
@@ -630,7 +630,6 @@
 	{
 		int i;
 		p->has_cpu = 0;
-		p->processor = current->processor;
 		/* ?? should we just memset this ?? */
 		for(i = 0; i < smp_num_cpus; i++)
 			p->per_cpu_utime[i] = p->per_cpu_stime[i] = 0;
@@ -692,7 +691,7 @@
 	}
 	SET_LINKS(p);
 	hash_pid(p);
-	nr_threads++;
+	nr_threads_inc();
 	write_unlock_irq(&tasklist_lock);
 
 	if (p->ptrace & PT_PTRACED)
diff -urN numa-sched-ref/kernel/sched.c numa-sched/kernel/sched.c
--- numa-sched-ref/kernel/sched.c	Thu Apr 12 16:54:02 2001
+++ numa-sched/kernel/sched.c	Thu Apr 12 20:03:53 2001
@@ -10,6 +10,7 @@
  *  1998-11-19	Implemented schedule_timeout() and related stuff
  *		by Andrea Arcangeli
  *  1998-12-28  Implemented better SMP scheduling by Ingo Molnar
+ *  2001-01-29	first NUMA scheduler attempt by Andrea Arcangeli, SuSE
  */
 
 /*
@@ -88,6 +89,8 @@
 spinlock_t runqueue_lock __cacheline_aligned = SPIN_LOCK_UNLOCKED;  /* inner */
 rwlock_t tasklist_lock __cacheline_aligned = RW_LOCK_UNLOCKED;	/* outer */
 
+#ifndef CONFIG_NUMA_SCHED
+
 static LIST_HEAD(runqueue_head);
 
 /*
@@ -97,13 +100,33 @@
 static union {
 	struct schedule_data {
 		struct task_struct * curr;
-		cycles_t last_schedule;
+		last_schedule_t last_schedule;
 	} schedule_data;
 	char __pad [SMP_CACHE_BYTES];
 } aligned_data [NR_CPUS] __cacheline_aligned = { {{&init_task,0}}};
 
-#define cpu_curr(cpu) aligned_data[(cpu)].schedule_data.curr
-#define last_schedule(cpu) aligned_data[(cpu)].schedule_data.last_schedule
+#define init_numa_schedule_data() do { } while(0)
+
+#else /* CONFIG_NUMA_SCHED */
+
+static void __init init_numa_schedule_data(void)
+{
+	int i;
+
+	for (i = 0; i < numnodes; i++) {
+		INIT_LIST_HEAD(&NODE_SCHEDULE_DATA(i)->runqueue_head);
+		NODE_SCHEDULE_DATA(i)->nr_running = 0;
+		NODE_SCHEDULE_DATA(i)->nr_threads = 0;
+	}
+}
+
+/* very dumb heuristic, doesn't take into account memory pressure yet */
+static inline int node_cpu_load(int nid)
+{
+	return numa_nr_running(nid);
+}
+
+#endif /* CONFIG_NUMA_SCHED */
 
 struct kernel_stat kstat;
 
@@ -111,7 +134,7 @@
 
 #define idle_task(cpu) (init_tasks[cpu_number_map(cpu)])
 #define can_schedule(p,cpu) ((!(p)->has_cpu) && \
-				((p)->cpus_allowed & (1 << cpu)))
+				((p)->cpus_allowed & (1UL << cpu)))
 
 #else
 
@@ -209,8 +232,8 @@
 #ifdef CONFIG_SMP
 	int this_cpu = smp_processor_id();
 	struct task_struct *tsk, *target_tsk;
-	int cpu, best_cpu, i, max_prio;
-	cycles_t oldest_idle;
+	int cpu, best_cpu, i, max_prio, found_idle;
+	last_schedule_t oldest_idle;
 
 	/*
 	 * shortcut if the woken up task's last CPU is
@@ -220,7 +243,7 @@
 	if (can_schedule(p, best_cpu)) {
 		tsk = idle_task(best_cpu);
 		if (cpu_curr(best_cpu) == tsk) {
-			int need_resched;
+			long need_resched;
 send_now_idle:
 			/*
 			 * If need_resched == -1 then we can skip sending
@@ -242,13 +265,17 @@
 	 * one will have the least active cache context.) Also find
 	 * the executing process which has the least priority.
 	 */
-	oldest_idle = (cycles_t) -1;
 	target_tsk = NULL;
 	max_prio = 1;
+	found_idle = 0;
 
 	for (i = 0; i < smp_num_cpus; i++) {
 		cpu = cpu_logical_map(i);
-		if (!can_schedule(p, cpu))
+		if (
+#ifdef CONFIG_NUMA_SCHED
+		    cputonode(cpu) != p->nid ||
+#endif
+		    !can_schedule(p, cpu))
 			continue;
 		tsk = cpu_curr(cpu);
 		/*
@@ -257,12 +284,13 @@
 		 * a problem.
 		 */
 		if (tsk == idle_task(cpu)) {
-			if (last_schedule(cpu) < oldest_idle) {
+			if (!found_idle || last_schedule_before(last_schedule(cpu), oldest_idle)) {
 				oldest_idle = last_schedule(cpu);
 				target_tsk = tsk;
+				found_idle = 1;
 			}
 		} else {
-			if (oldest_idle == -1ULL) {
+			if (!found_idle) {
 				int prio = preemption_goodness(tsk, p, cpu);
 
 				if (prio > max_prio) {
@@ -272,15 +300,34 @@
 			}
 		}
 	}
-	tsk = target_tsk;
-	if (tsk) {
-		if (oldest_idle != -1ULL) {
-			best_cpu = tsk->processor;
+
+#ifdef CONFIG_NUMA_SCHED
+	if (!target_tsk)
+		/* Make sure to use the idle cpus in the other nodes */
+		for (i = 0; i < smp_num_cpus; i++) {
+			cpu = cpu_logical_map(i);
+			if (cputonode(cpu) == p->nid || !can_schedule(p, cpu))
+				continue;
+			tsk = cpu_curr(cpu);
+			if (tsk == idle_task(cpu)) {
+				if (!found_idle || last_schedule_before(last_schedule(cpu), oldest_idle)) {
+					oldest_idle = last_schedule(cpu);
+					target_tsk = tsk;
+					found_idle = 1;
+					p->nid = cputonode(cpu);
+				}
+			}
+		}
+#endif
+
+	if (target_tsk) {
+		if (found_idle) {
+			best_cpu = target_tsk->processor;
 			goto send_now_idle;
 		}
-		tsk->need_resched = 1;
-		if (tsk->processor != this_cpu)
-			smp_send_reschedule(tsk->processor);
+		target_tsk->need_resched = 1;
+		if (target_tsk->processor != this_cpu)
+			smp_send_reschedule(target_tsk->processor);
 	}
 	return;
 		
@@ -304,20 +351,20 @@
  */
 static inline void add_to_runqueue(struct task_struct * p)
 {
-	list_add(&p->run_list, &runqueue_head);
-	nr_running++;
+	list_add(&p->run_list, numa_runqueue_head(p->nid));
+	nr_running_inc();
 }
 
 static inline void move_last_runqueue(struct task_struct * p)
 {
 	list_del(&p->run_list);
-	list_add_tail(&p->run_list, &runqueue_head);
+	list_add_tail(&p->run_list, numa_runqueue_head(p->nid));
 }
 
 static inline void move_first_runqueue(struct task_struct * p)
 {
 	list_del(&p->run_list);
-	list_add(&p->run_list, &runqueue_head);
+	list_add(&p->run_list, numa_runqueue_head(p->nid));
 }
 
 /*
@@ -340,9 +387,9 @@
 	p->state = TASK_RUNNING;
 	if (task_on_runqueue(p))
 		goto out;
-	add_to_runqueue(p);
 	if (!synchronous || !(p->cpus_allowed & (1 << smp_processor_id())))
 		reschedule_idle(p);
+	add_to_runqueue(p);
 	success = 1;
 out:
 	spin_unlock_irqrestore(&runqueue_lock, flags);
@@ -528,10 +575,12 @@
  */
 asmlinkage void schedule(void)
 {
-	struct schedule_data * sched_data;
 	struct task_struct *prev, *next, *p;
 	struct list_head *tmp;
 	int this_cpu, c;
+#ifdef CONFIG_NUMA_SCHED
+	int recalculate_all;
+#endif
 
 	if (!current->active_mm) BUG();
 need_resched_back:
@@ -548,12 +597,6 @@
 		goto handle_softirq;
 handle_softirq_back:
 
-	/*
-	 * 'sched_data' is protected by the fact that we can run
-	 * only one process per CPU.
-	 */
-	sched_data = & aligned_data[this_cpu].schedule_data;
-
 	spin_lock_irq(&runqueue_lock);
 
 	/* move an exhausted RR process to be last.. */
@@ -587,7 +630,7 @@
 		goto still_running;
 
 still_running_back:
-	list_for_each(tmp, &runqueue_head) {
+	list_for_each(tmp, numa_runqueue_head(numa_node_id())) {
 		p = list_entry(tmp, struct task_struct, run_list);
 		if (can_schedule(p, this_cpu)) {
 			int weight = goodness(p, this_cpu, prev->active_mm);
@@ -596,6 +639,27 @@
 		}
 	}
 
+#ifdef CONFIG_NUMA_SCHED
+	recalculate_all = 0;
+	if (c < 0) {
+		int nid;
+
+		recalculate_all = 1;
+		for (nid = 0; nid < numnodes; nid++) {
+			if (nid == numa_node_id())
+				continue;
+			list_for_each(tmp, numa_runqueue_head(nid)) {
+				p = list_entry(tmp, struct task_struct, run_list);
+				if (can_schedule(p, this_cpu)) {
+					int weight = goodness(p, this_cpu, prev->active_mm);
+					if (weight > c)
+						c = weight, next = p;
+				}
+			}
+		}
+	}
+#endif
+
 	/* Do we need to re-calculate counters? */
 	if (!c)
 		goto recalculate;
@@ -604,10 +668,16 @@
 	 * switching to the next task, save this fact in
 	 * sched_data.
 	 */
-	sched_data->curr = next;
+	cpu_curr(this_cpu) = next;
 #ifdef CONFIG_SMP
  	next->has_cpu = 1;
 	next->processor = this_cpu;
+#ifdef CONFIG_NUMA_SCHED
+	if (next != idle_task(this_cpu) && next->nid != numa_node_id()) {
+		next->nid = numa_node_id();
+		move_last_runqueue(next);
+	}
+#endif
 #endif
 	spin_unlock_irq(&runqueue_lock);
 
@@ -622,7 +692,7 @@
 	 * and it's approximate, so we do not have to maintain
 	 * it while holding the runqueue spinlock.
  	 */
- 	sched_data->last_schedule = get_cycles();
+ 	last_schedule(this_cpu) = get_last_schedule();
 
 	/*
 	 * We drop the scheduler lock early (it's a global spinlock),
@@ -681,8 +751,13 @@
 		struct task_struct *p;
 		spin_unlock_irq(&runqueue_lock);
 		read_lock(&tasklist_lock);
-		for_each_task(p)
+		for_each_task(p) {
+#ifdef CONFIG_NUMA_SCHED
+			if (!recalculate_all && p->nid != numa_node_id())
+				continue;
+#endif
 			p->counter = (p->counter >> 1) + NICE_TO_TICKS(p->nice);
+		}
 		read_unlock(&tasklist_lock);
 		spin_lock_irq(&runqueue_lock);
 	}
@@ -1094,7 +1169,7 @@
 
 	// Substract non-idle processes running on other CPUs.
 	for (i = 0; i < smp_num_cpus; i++)
-		if (aligned_data[i].schedule_data.curr != idle_task(i))
+		if (cpu_curr(i) != idle_task(i))
 			nr_pending--;
 #else
 	// on UP this process is on the runqueue as well
@@ -1288,16 +1363,15 @@
 
 void __init init_idle(void)
 {
-	struct schedule_data * sched_data;
-	sched_data = &aligned_data[smp_processor_id()].schedule_data;
+	int cpu = smp_processor_id();
 
 	if (current != &init_task && task_on_runqueue(current)) {
 		printk("UGH! (%d:%d) was on the runqueue, removing.\n",
 			smp_processor_id(), current->pid);
 		del_from_runqueue(current);
 	}
-	sched_data->curr = current;
-	sched_data->last_schedule = get_cycles();
+	cpu_curr(cpu) = current;
+	last_schedule(cpu) = get_last_schedule();
 }
 
 extern void init_timervecs (void);
@@ -1327,4 +1401,6 @@
 	 */
 	atomic_inc(&init_mm.mm_count);
 	enter_lazy_tlb(&init_mm, current, cpu);
+
+	init_numa_schedule_data();
 }
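
Taken together, the sched.c changes make CPU selection node-aware in two places: reschedule_idle() first looks for an idle CPU (or a preemptible task) only inside p->nid's node and falls back to idle CPUs of other nodes only when that fails, while schedule() walks the local node's runqueue first and steals from remote nodes only when nothing there beats the idle task (c < 0), re-homing whatever it picks (next->nid = numa_node_id()). A compact user-space model of that two-pass pick; goodness() values and the runqueue lists are reduced to plain integers here, so this is an illustration of the control flow rather than the patch's code:

#include <stdio.h>

#define NR_NODES 2

/* stand-in runqueues: each node holds the goodness of its runnable tasks,
 * 0 terminates the list (the real code walks struct list_head runqueue_head) */
static int runqueue[NR_NODES][4] = {
	{ 0 },			/* node 0: nothing runnable locally */
	{ 7, 3, 0 },		/* node 1: two runnable tasks */
};

/* returns the node the chosen task came from, -1 means keep running idle */
static int pick_next(int this_node)
{
	int c = -1000, next = -1;	/* -1000 mirrors the idle task's weight */
	int nid, i;

	/* first pass: only this node's runqueue */
	for (i = 0; runqueue[this_node][i]; i++)
		if (runqueue[this_node][i] > c)
			c = runqueue[this_node][i], next = this_node;

	/* second pass (CONFIG_NUMA_SCHED): steal from the other nodes only if
	 * the local queue offered nothing better than the idle task */
	if (c < 0)
		for (nid = 0; nid < NR_NODES; nid++) {
			if (nid == this_node)
				continue;
			for (i = 0; runqueue[nid][i]; i++)
				if (runqueue[nid][i] > c)
					c = runqueue[nid][i], next = nid;
		}
	return next;
}

int main(void)
{
	printf("cpu on node 0 pulled its task from node %d\n", pick_next(0));
	printf("cpu on node 1 picked its task from node %d\n", pick_next(1));
	return 0;
}

The counter-recalculation path (c == 0) and the actual re-homing and requeueing of the stolen task are left out of the sketch.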