summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorBalbir Singh <balbir@linux.vnet.ibm.com>2008-10-05 17:43:37 +0100
committerGreg Kroah-Hartman <gregkh@suse.de>2008-10-08 20:23:12 -0700
commit553d7dd7336a3c1f3dd12085b5c42451c17225e1 (patch)
tree842a73e602b6f49613b6518a001f1a2c5556e4ef
parenteb07718d62cfd8da699a8127110fbb9fa5a18663 (diff)
mm owner: fix race between swapoff and exit
[Here's a backport of 2.6.27-rc8's 31a78f23bac0069004e69f98808b6988baccb6b6 to 2.6.26 or 2.6.26.5: I wouldn't trouble -stable for the (root only) swapoff case which uncovered the bug, but the /proc/<pid>/<mmstats> case is open to all, so I think worth plugging in the next 2.6.26-stable. - Hugh] There's a race between mm->owner assignment and swapoff, more easily seen when task slab poisoning is turned on. The condition occurs when try_to_unuse() runs in parallel with an exiting task. A similar race can occur with callers of get_task_mm(), such as /proc/<pid>/<mmstats> or ptrace or page migration. CPU0 CPU1 try_to_unuse looks at mm = task0->mm increments mm->mm_users task 0 exits mm->owner needs to be updated, but no new owner is found (mm_users > 1, but no other task has task->mm = task0->mm) mm_update_next_owner() leaves mmput(mm) decrements mm->mm_users task0 freed dereferencing mm->owner fails The fix is to notify the subsystem via mm_owner_changed callback(), if no new owner is found, by specifying the new task as NULL. Jiri Slaby: mm->owner was set to NULL prior to calling cgroup_mm_owner_callbacks(), but must be set after that, so as not to pass NULL as old owner causing oops. Daisuke Nishimura: mm_update_next_owner() may set mm->owner to NULL, but mem_cgroup_from_task() and its callers need to take account of this situation to avoid oops. Hugh Dickins: Lockdep warning and hang below exec_mmap() when testing these patches. exit_mm() up_reads mmap_sem before calling mm_update_next_owner(), so exec_mmap() now needs to do the same. And with that repositioning, there's now no point in mm_need_new_owner() allowing for NULL mm. Reported-by: Hugh Dickins <hugh@veritas.com> Signed-off-by: Balbir Singh <balbir@linux.vnet.ibm.com> Signed-off-by: Jiri Slaby <jirislaby@gmail.com> Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> Signed-off-by: Hugh Dickins <hugh@veritas.com> Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Paul Menage <menage@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
-rw-r--r--fs/exec.c2
-rw-r--r--kernel/cgroup.c5
-rw-r--r--kernel/exit.c12
-rw-r--r--mm/memcontrol.c13
4 files changed, 27 insertions, 5 deletions
diff --git a/fs/exec.c b/fs/exec.c
index fd9234379e8d..85e99483eba0 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -740,11 +740,11 @@ static int exec_mmap(struct mm_struct *mm)
tsk->active_mm = mm;
activate_mm(active_mm, mm);
task_unlock(tsk);
- mm_update_next_owner(old_mm);
arch_pick_mmap_layout(mm);
if (old_mm) {
up_read(&old_mm->mmap_sem);
BUG_ON(active_mm != old_mm);
+ mm_update_next_owner(old_mm);
mmput(old_mm);
return 0;
}
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 15ac0e1e4f4d..d53caaa49be6 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2761,14 +2761,15 @@ void cgroup_fork_callbacks(struct task_struct *child)
*/
void cgroup_mm_owner_callbacks(struct task_struct *old, struct task_struct *new)
{
- struct cgroup *oldcgrp, *newcgrp;
+ struct cgroup *oldcgrp, *newcgrp = NULL;
if (need_mm_owner_callback) {
int i;
for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
struct cgroup_subsys *ss = subsys[i];
oldcgrp = task_cgroup(old, ss->subsys_id);
- newcgrp = task_cgroup(new, ss->subsys_id);
+ if (new)
+ newcgrp = task_cgroup(new, ss->subsys_id);
if (oldcgrp == newcgrp)
continue;
if (ss->mm_owner_changed)
diff --git a/kernel/exit.c b/kernel/exit.c
index 3bb2b8310ad3..f68b081124bf 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -577,8 +577,6 @@ mm_need_new_owner(struct mm_struct *mm, struct task_struct *p)
* If there are other users of the mm and the owner (us) is exiting
* we need to find a new owner to take on the responsibility.
*/
- if (!mm)
- return 0;
if (atomic_read(&mm->mm_users) <= 1)
return 0;
if (mm->owner != p)
@@ -621,6 +619,16 @@ retry:
} while_each_thread(g, c);
read_unlock(&tasklist_lock);
+ /*
+ * We found no owner yet mm_users > 1: this implies that we are
+ * most likely racing with swapoff (try_to_unuse()) or /proc or
+ * ptrace or page migration (get_task_mm()). Mark owner as NULL,
+ * so that subsystems can understand the callback and take action.
+ */
+ down_write(&mm->mmap_sem);
+ cgroup_mm_owner_callbacks(mm->owner, NULL);
+ mm->owner = NULL;
+ up_write(&mm->mmap_sem);
return;
assign_new_owner:
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e46451e1d9b7..ed1cfb12bc95 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -250,6 +250,14 @@ static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
{
+ /*
+ * mm_update_next_owner() may clear mm->owner to NULL
+ * if it races with swapoff, page migration, etc.
+ * So this can be called with p == NULL.
+ */
+ if (unlikely(!p))
+ return NULL;
+
return container_of(task_subsys_state(p, mem_cgroup_subsys_id),
struct mem_cgroup, css);
}
@@ -574,6 +582,11 @@ retry:
rcu_read_lock();
mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
+ if (unlikely(!mem)) {
+ rcu_read_unlock();
+ kmem_cache_free(page_cgroup_cache, pc);
+ return 0;
+ }
/*
* For every charge from the cgroup, increment reference count
*/