From: Rusty Russell <rusty@rustcorp.com.au>

Ingo recently accidentally broke CPU hotplug by enabling preemption around
release_task(), which can be called on the current task if the parent isn't
interested.

The problem is that the task can be preempted and then the CPU can go down:
the task is not in the task list any more, so it won't get migrated after
the CPU goes down.  It stays on the down CPU, which triggers a BUG_ON.

We have had previous problems with tasks releasing themselves: oprofile has
a comment about it, and we had the case of trying to deliver SIGXCPU in the
timer tick to the current task which had called release_task().  This patch
defers the self-reaping to finish_task_switch(): an exiting task that nobody
will wait for is marked PF_SELFREAP, and release_task() is called on it only
after it has been switched out.  So there's never a running task which isn't
in the task list, except idle threads.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au> (authored)
Signed-off-by: Andrew Morton <akpm@osdl.org>
---

 25-akpm/include/linux/sched.h |    1 +
 25-akpm/kernel/exit.c         |   28 +++++++---------------------
 25-akpm/kernel/sched.c        |    5 ++++-
 3 files changed, 12 insertions(+), 22 deletions(-)

diff -puN include/linux/sched.h~dont-sleep-after-were-out-of-task-list include/linux/sched.h
--- 25/include/linux/sched.h~dont-sleep-after-were-out-of-task-list	2004-08-17 23:56:04.674166560 -0700
+++ 25-akpm/include/linux/sched.h	2004-08-17 23:56:04.681165496 -0700
@@ -565,6 +565,7 @@ do { if (atomic_dec_and_test(&(tsk)->usa
 #define PF_STARTING	0x00000002	/* being created */
 #define PF_EXITING	0x00000004	/* getting shut down */
 #define PF_DEAD		0x00000008	/* Dead */
+#define PF_SELFREAP	0x00000010	/* Never a zombie, must be released */
 #define PF_FORKNOEXEC	0x00000040	/* forked but didn't exec */
 #define PF_SUPERPRIV	0x00000100	/* used super-user privileges */
 #define PF_DUMPCORE	0x00000200	/* dumped core */
diff -puN kernel/exit.c~dont-sleep-after-were-out-of-task-list kernel/exit.c
--- 25/kernel/exit.c~dont-sleep-after-were-out-of-task-list	2004-08-17 23:56:04.675166408 -0700
+++ 25-akpm/kernel/exit.c	2004-08-17 23:56:04.682165344 -0700
@@ -755,8 +755,8 @@ static void exit_notify(struct task_stru
 	state = TASK_ZOMBIE;
 	if (tsk->exit_signal == -1 && tsk->ptrace == 0)
 		state = TASK_DEAD;
-	else
-		tsk->state = state;
+	tsk->state = state;
+
 	/*
 	 * Clear these here so that update_process_times() won't try to deliver
 	 * itimer, profile or rlimit signals to this task while it is in late exit.
@@ -765,14 +765,6 @@ static void exit_notify(struct task_stru
 	tsk->it_prof_value = 0;
 	tsk->rlim[RLIMIT_CPU].rlim_cur = RLIM_INFINITY;
 
-	/*
-	 * Get a reference to it so that we can set the state
-	 * as the last step. The state-setting only matters if the
-	 * current task is releasing itself, to trigger the final
-	 * put_task_struct() in finish_task_switch(). (thread self-reap)
-	 */
-	get_task_struct(tsk);
-
 	write_unlock_irq(&tasklist_lock);
 
 	list_for_each_safe(_p, _n, &ptrace_dead) {
@@ -781,18 +773,12 @@ static void exit_notify(struct task_stru
 		release_task(t);
 	}
 
-	/* If the process is dead, release it - nobody will wait for it */
-	if (state == TASK_DEAD) {
-		release_task(tsk);
-		write_lock_irq(&tasklist_lock);
-		tsk->state = state;
-		_raw_write_unlock(&tasklist_lock);
-		local_irq_enable();
-	} else
-		preempt_disable();
-
+	preempt_disable();
+	/* PF_DEAD says drop ref after we schedule. */
 	tsk->flags |= PF_DEAD;
-	put_task_struct(tsk);
+	/* PF_SELFREAP says there's no parent to wait4() for us. */
+	if (state == TASK_DEAD)
+		tsk->flags |= PF_SELFREAP;
 }
 
 asmlinkage NORET_TYPE void do_exit(long code)
diff -puN kernel/sched.c~dont-sleep-after-were-out-of-task-list kernel/sched.c
--- 25/kernel/sched.c~dont-sleep-after-were-out-of-task-list	2004-08-17 23:56:04.677166104 -0700
+++ 25-akpm/kernel/sched.c	2004-08-17 23:56:04.685164888 -0700
@@ -1484,8 +1484,11 @@ static void finish_task_switch(task_t *p
 	finish_arch_switch(rq, prev);
 	if (mm)
 		mmdrop(mm);
-	if (unlikely(prev_task_flags & PF_DEAD))
+	if (unlikely(prev_task_flags & PF_DEAD)) {
+		if (prev_task_flags & PF_SELFREAP)
+			release_task(prev);
 		put_task_struct(prev);
+	}
 }
 
 /**
_