sched: Fix fork vs hotplug vs cpuset namespaces There are a number of issues: 1) TASK_WAKING vs cgroup_clone (cpusets) copy_process(): sched_fork() child->state = TASK_WAKING; /* waiting for wake_up_new_task() */ if (current->nsproxy != p->nsproxy) ns_cgroup_clone() cgroup_clone() mutex_lock(inode->i_mutex) mutex_lock(cgroup_mutex) cgroup_attach_task() ss->can_attach() ss->attach() [ -> cpuset_attach() ] cpuset_attach_task() set_cpus_allowed_ptr(); while (child->state == TASK_WAKING) cpu_relax(); will deadlock the system. 2) cgroup_clone (cpusets) vs copy_process So even if the above would work we still have: copy_process(): if (current->nsproxy != p->nsproxy) ns_cgroup_clone() cgroup_clone() mutex_lock(inode->i_mutex) mutex_lock(cgroup_mutex) cgroup_attach_task() ss->can_attach() ss->attach() [ -> cpuset_attach() ] cpuset_attach_task() set_cpus_allowed_ptr(); ... p->cpus_allowed = current->cpus_allowed over-writing the modified cpus_allowed. 3) fork() vs hotplug if we unplug the child's cpu after the sanity check when the child gets attached to the task_list but before wake_up_new_task() shit will meet with fan. Solve all these issues by moving fork cpu selection into wake_up_new_task(). Reported-by: Serge E. Hallyn <serue@us.ibm.com> Tested-by: Serge E. Hallyn <serue@us.ibm.com> Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl> LKML-Reference: <1264106190.4283.1314.camel@laptop> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

commit: fabf318e5e4bda0aca2b0d617b191884fda62703 [log] [tgz]
author: Peter Zijlstra <a.p.zijlstra@chello.nl> Thu Jan 21 21:04:57 2010 +0100
committer: Thomas Gleixner <tglx@linutronix.de> Thu Jan 21 23:25:31 2010 +0100
tree: 651b2ee4fb8f393d2fe93f133a5ec6129cb7a8e8
parent: 6d558c3ac9b6508d26fd5cadccce51fc9d726b1c [diff] [blame]
diff --git a/kernel/sched.c b/kernel/sched.c
index 4508fe7..3a8fb30 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c

@@ -2320,14 +2320,12 @@
 }
 
 /*
- * Called from:
+ * Gets called from 3 sites (exec, fork, wakeup), since it is called without
+ * holding rq->lock we need to ensure ->cpus_allowed is stable, this is done
+ * by:
  *
- *  - fork, @p is stable because it isn't on the tasklist yet
- *
- *  - exec, @p is unstable, retry loop
- *
- *  - wake-up, we serialize ->cpus_allowed against TASK_WAKING so
- *             we should be good.
+ *  exec:           is unstable, retry loop
+ *  fork & wake-up: serialize ->cpus_allowed against TASK_WAKING
  */
 static inline
 int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
@@ -2620,9 +2618,6 @@
 	if (p->sched_class->task_fork)
 		p->sched_class->task_fork(p);
 
-#ifdef CONFIG_SMP
-	cpu = select_task_rq(p, SD_BALANCE_FORK, 0);
-#endif
 	set_task_cpu(p, cpu);
 
 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
@@ -2652,6 +2647,21 @@
 {
 	unsigned long flags;
 	struct rq *rq;
+	int cpu = get_cpu();
+
+#ifdef CONFIG_SMP
+	/*
+	 * Fork balancing, do it here and not earlier because:
+	 *  - cpus_allowed can change in the fork path
+	 *  - any previously selected cpu might disappear through hotplug
+	 *
+	 * We still have TASK_WAKING but PF_STARTING is gone now, meaning
+	 * ->cpus_allowed is stable, we have preemption disabled, meaning
+	 * cpu_online_mask is stable.
+	 */
+	cpu = select_task_rq(p, SD_BALANCE_FORK, 0);
+	set_task_cpu(p, cpu);
+#endif
 
 	rq = task_rq_lock(p, &flags);
 	BUG_ON(p->state != TASK_WAKING);
@@ -2665,6 +2675,7 @@
 		p->sched_class->task_woken(rq, p);
 #endif
 	task_rq_unlock(rq, &flags);
+	put_cpu();
 }
 
 #ifdef CONFIG_PREEMPT_NOTIFIERS
@@ -7139,14 +7150,18 @@
 	 * the ->cpus_allowed mask from under waking tasks, which would be
 	 * possible when we change rq->lock in ttwu(), so synchronize against
 	 * TASK_WAKING to avoid that.
+	 *
+	 * Make an exception for freshly cloned tasks, since cpuset namespaces
+	 * might move the task about, we have to validate the target in
+	 * wake_up_new_task() anyway since the cpu might have gone away.
 	 */
 again:
-	while (p->state == TASK_WAKING)
+	while (p->state == TASK_WAKING && !(p->flags & PF_STARTING))
 		cpu_relax();
 
 	rq = task_rq_lock(p, &flags);
 
-	if (p->state == TASK_WAKING) {
+	if (p->state == TASK_WAKING && !(p->flags & PF_STARTING)) {
 		task_rq_unlock(rq, &flags);
 		goto again;
 	}
commit	fabf318e5e4bda0aca2b0d617b191884fda62703	[log] [tgz]
author	Peter Zijlstra <a.p.zijlstra@chello.nl>	Thu Jan 21 21:04:57 2010 +0100
committer	Thomas Gleixner <tglx@linutronix.de>	Thu Jan 21 23:25:31 2010 +0100
tree	651b2ee4fb8f393d2fe93f133a5ec6129cb7a8e8
parent	6d558c3ac9b6508d26fd5cadccce51fc9d726b1c [diff] [blame]