From 99ee4ca746dda71326db7645463b4075ac1d665c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 3 Mar 2010 17:50:17 -0800 Subject: rcu: Suppress __mpol_dup() false positive from RCU lockdep Common code is used during task creation and after the task has started running. RCU protection is not needed during task creation because no other CPU has access to the under-construction task. Provide the RCU protection anyway to suppress the false positive, as there does not appear to be a good way for the common code to recognize that the task is only accessible to the CPU creating it. Signed-off-by: Paul E. McKenney Cc: Paul Menage Cc: laijs@cn.fujitsu.com Cc: dipankar@in.ibm.com Cc: mathieu.desnoyers@polymtl.ca Cc: josh@joshtriplett.org Cc: dvhltc@us.ibm.com Cc: niv@us.ibm.com Cc: peterz@infradead.org Cc: rostedt@goodmis.org Cc: Valdis.Kletnieks@vt.edu Cc: dhowells@redhat.com LKML-Reference: <1267667418-32233-2-git-send-email-paulmck@linux.vnet.ibm.com> Signed-off-by: Ingo Molnar --- mm/mempolicy.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'mm/mempolicy.c') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 290fb5bf0440..3cec080faa23 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1730,10 +1730,12 @@ struct mempolicy *__mpol_dup(struct mempolicy *old) if (!new) return ERR_PTR(-ENOMEM); + rcu_read_lock(); if (current_cpuset_is_being_rebound()) { nodemask_t mems = cpuset_mems_allowed(current); mpol_rebind_policy(old, &mems); } + rcu_read_unlock(); *new = *old; atomic_set(&new->refcnt, 1); return new; -- cgit v1.2.3 From 9d8cebd4bcd7c3878462fdfda34bbcdeb4df7ef4 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Fri, 5 Mar 2010 13:41:57 -0800 Subject: mm: fix mbind vma merge problem Strangely, current mbind() doesn't merge vma with neighbor vma although it's possible. Unfortunately, many vma can reduce performance... This patch fixes it. reproduced program ---------------------------------------------------------------- #include #include #include #include #include #include #include static unsigned long pagesize; int main(int argc, char** argv) { void* addr; int ch; int node; struct bitmask *nmask = numa_allocate_nodemask(); int err; int node_set = 0; char buf[128]; while ((ch = getopt(argc, argv, "n:")) != -1){ switch (ch){ case 'n': node = strtol(optarg, NULL, 0); numa_bitmask_setbit(nmask, node); node_set = 1; break; default: ; } } argc -= optind; argv += optind; if (!node_set) numa_bitmask_setbit(nmask, 0); pagesize = getpagesize(); addr = mmap(NULL, pagesize*3, PROT_READ|PROT_WRITE, MAP_ANON|MAP_PRIVATE, 0, 0); if (addr == MAP_FAILED) perror("mmap "), exit(1); fprintf(stderr, "pid = %d \n" "addr = %p\n", getpid(), addr); /* make page populate */ memset(addr, 0, pagesize*3); /* first mbind */ err = mbind(addr+pagesize, pagesize, MPOL_BIND, nmask->maskp, nmask->size, MPOL_MF_MOVE_ALL); if (err) error("mbind1 "); /* second mbind */ err = mbind(addr, pagesize*3, MPOL_DEFAULT, NULL, 0, 0); if (err) error("mbind2 "); sprintf(buf, "cat /proc/%d/maps", getpid()); system(buf); return 0; } ---------------------------------------------------------------- result without this patch addr = 0x7fe26ef09000 [snip] 7fe26ef09000-7fe26ef0a000 rw-p 00000000 00:00 0 7fe26ef0a000-7fe26ef0b000 rw-p 00000000 00:00 0 7fe26ef0b000-7fe26ef0c000 rw-p 00000000 00:00 0 7fe26ef0c000-7fe26ef0d000 rw-p 00000000 00:00 0 => 0x7fe26ef09000-0x7fe26ef0c000 have three vmas. result with this patch addr = 0x7fc9ebc76000 [snip] 7fc9ebc76000-7fc9ebc7a000 rw-p 00000000 00:00 0 7fffbe690000-7fffbe6a5000 rw-p 00000000 00:00 0 [stack] => 0x7fc9ebc76000-0x7fc9ebc7a000 have only one vma. [minchan.kim@gmail.com: fix file offset passed to vma_merge()] Signed-off-by: KOSAKI Motohiro Reviewed-by: Christoph Lameter Cc: Nick Piggin Cc: Hugh Dickins Cc: Andrea Arcangeli Cc: Mel Gorman Cc: Lee Schermerhorn Signed-off-by: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 52 +++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 39 insertions(+), 13 deletions(-) (limited to 'mm/mempolicy.c') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 290fb5bf0440..44dd9d1521ec 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -563,24 +563,50 @@ static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new) } /* Step 2: apply policy to a range and do splits. */ -static int mbind_range(struct vm_area_struct *vma, unsigned long start, - unsigned long end, struct mempolicy *new) +static int mbind_range(struct mm_struct *mm, unsigned long start, + unsigned long end, struct mempolicy *new_pol) { struct vm_area_struct *next; - int err; + struct vm_area_struct *prev; + struct vm_area_struct *vma; + int err = 0; + pgoff_t pgoff; + unsigned long vmstart; + unsigned long vmend; - err = 0; - for (; vma && vma->vm_start < end; vma = next) { + vma = find_vma_prev(mm, start, &prev); + if (!vma || vma->vm_start > start) + return -EFAULT; + + for (; vma && vma->vm_start < end; prev = vma, vma = next) { next = vma->vm_next; - if (vma->vm_start < start) - err = split_vma(vma->vm_mm, vma, start, 1); - if (!err && vma->vm_end > end) - err = split_vma(vma->vm_mm, vma, end, 0); - if (!err) - err = policy_vma(vma, new); + vmstart = max(start, vma->vm_start); + vmend = min(end, vma->vm_end); + + pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); + prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags, + vma->anon_vma, vma->vm_file, pgoff, new_pol); + if (prev) { + vma = prev; + next = vma->vm_next; + continue; + } + if (vma->vm_start != vmstart) { + err = split_vma(vma->vm_mm, vma, vmstart, 1); + if (err) + goto out; + } + if (vma->vm_end != vmend) { + err = split_vma(vma->vm_mm, vma, vmend, 0); + if (err) + goto out; + } + err = policy_vma(vma, new_pol); if (err) - break; + goto out; } + + out: return err; } @@ -1047,7 +1073,7 @@ static long do_mbind(unsigned long start, unsigned long len, if (!IS_ERR(vma)) { int nr_failed = 0; - err = mbind_range(vma, start, end, new); + err = mbind_range(mm, start, end, new); if (!list_empty(&pagelist)) nr_failed = migrate_pages(&pagelist, new_vma_page, -- cgit v1.2.3 From da0aa138944311e6745a00ac3d88f03e8d9a46c4 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Fri, 5 Mar 2010 13:41:59 -0800 Subject: mm/mempolicy.c: fix indentation of the comments of do_migrate_pages Currently, do_migrate_pages() have very long comment and this is not indent properly. I often misunderstand it is function starting commnents and confused it. this patch fixes it. note: this patch doesn't break 80 column rule. I guess original author intended this indentaion, but an accident corrupted it. Signed-off-by: KOSAKI Motohiro Reviewed-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 60 +++++++++++++++++++++++++++++----------------------------- 1 file changed, 30 insertions(+), 30 deletions(-) (limited to 'mm/mempolicy.c') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 44dd9d1521ec..bda230e52acd 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -888,36 +888,36 @@ int do_migrate_pages(struct mm_struct *mm, if (err) goto out; -/* - * Find a 'source' bit set in 'tmp' whose corresponding 'dest' - * bit in 'to' is not also set in 'tmp'. Clear the found 'source' - * bit in 'tmp', and return that pair for migration. - * The pair of nodemasks 'to' and 'from' define the map. - * - * If no pair of bits is found that way, fallback to picking some - * pair of 'source' and 'dest' bits that are not the same. If the - * 'source' and 'dest' bits are the same, this represents a node - * that will be migrating to itself, so no pages need move. - * - * If no bits are left in 'tmp', or if all remaining bits left - * in 'tmp' correspond to the same bit in 'to', return false - * (nothing left to migrate). - * - * This lets us pick a pair of nodes to migrate between, such that - * if possible the dest node is not already occupied by some other - * source node, minimizing the risk of overloading the memory on a - * node that would happen if we migrated incoming memory to a node - * before migrating outgoing memory source that same node. - * - * A single scan of tmp is sufficient. As we go, we remember the - * most recent pair that moved (s != d). If we find a pair - * that not only moved, but what's better, moved to an empty slot - * (d is not set in tmp), then we break out then, with that pair. - * Otherwise when we finish scannng from_tmp, we at least have the - * most recent pair that moved. If we get all the way through - * the scan of tmp without finding any node that moved, much less - * moved to an empty node, then there is nothing left worth migrating. - */ + /* + * Find a 'source' bit set in 'tmp' whose corresponding 'dest' + * bit in 'to' is not also set in 'tmp'. Clear the found 'source' + * bit in 'tmp', and return that pair for migration. + * The pair of nodemasks 'to' and 'from' define the map. + * + * If no pair of bits is found that way, fallback to picking some + * pair of 'source' and 'dest' bits that are not the same. If the + * 'source' and 'dest' bits are the same, this represents a node + * that will be migrating to itself, so no pages need move. + * + * If no bits are left in 'tmp', or if all remaining bits left + * in 'tmp' correspond to the same bit in 'to', return false + * (nothing left to migrate). + * + * This lets us pick a pair of nodes to migrate between, such that + * if possible the dest node is not already occupied by some other + * source node, minimizing the risk of overloading the memory on a + * node that would happen if we migrated incoming memory to a node + * before migrating outgoing memory source that same node. + * + * A single scan of tmp is sufficient. As we go, we remember the + * most recent pair that moved (s != d). If we find a pair + * that not only moved, but what's better, moved to an empty slot + * (d is not set in tmp), then we break out then, with that pair. + * Otherwise when we finish scannng from_tmp, we at least have the + * most recent pair that moved. If we get all the way through + * the scan of tmp without finding any node that moved, much less + * moved to an empty node, then there is nothing left worth migrating. + */ tmp = *from_nodes; while (!nodes_empty(tmp)) { -- cgit v1.2.3 From 413b43deab8377819aba1dbad2abf0c15d59b491 Mon Sep 17 00:00:00 2001 From: Ravikiran G Thirumalai Date: Tue, 23 Mar 2010 13:35:28 -0700 Subject: tmpfs: fix oops on mounts with mpol=default Fix an 'oops' when a tmpfs mount point is mounted with the mpol=default mempolicy. Upon remounting a tmpfs mount point with 'mpol=default' option, the mount code crashed with a null pointer dereference. The initial problem report was on 2.6.27, but the problem exists in mainline 2.6.34-rc as well. On examining the code, we see that mpol_new returns NULL if default mempolicy was requested. This 'NULL' mempolicy is accessed to store the node mask resulting in oops. The following patch fixes it. Signed-off-by: Ravikiran Thirumalai Signed-off-by: KOSAKI Motohiro Cc: Christoph Lameter Cc: Mel Gorman Acked-by: Lee Schermerhorn Cc: Hugh Dickins Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'mm/mempolicy.c') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 643f66e10187..745ce90308a6 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2215,10 +2215,15 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) goto out; mode = MPOL_PREFERRED; break; - + case MPOL_DEFAULT: + /* + * Insist on a empty nodelist + */ + if (!nodelist) + err = 0; + goto out; /* * case MPOL_BIND: mpol_new() enforces non-empty nodemask. - * case MPOL_DEFAULT: mpol_new() enforces empty nodemask, ignores flags. */ } -- cgit v1.2.3 From d69b2e63e9172afb4d07c305601b79a55509ac4c Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Tue, 23 Mar 2010 13:35:30 -0700 Subject: tmpfs: mpol=bind:0 don't cause mount error. Currently, following mount operation cause mount error. % mount -t tmpfs -ompol=bind:0 none /tmp Because commit 71fe804b6d5 (mempolicy: use struct mempolicy pointer in shmem_sb_info) corrupted MPOL_BIND parse code. This patch restore the needed one. Signed-off-by: KOSAKI Motohiro Cc: Ravikiran Thirumalai Cc: Christoph Lameter Cc: Mel Gorman Acked-by: Lee Schermerhorn Cc: Hugh Dickins Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'mm/mempolicy.c') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 745ce90308a6..10db44f95749 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2222,9 +2222,13 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) if (!nodelist) err = 0; goto out; - /* - * case MPOL_BIND: mpol_new() enforces non-empty nodemask. - */ + case MPOL_BIND: + /* + * Insist on a nodelist + */ + if (!nodelist) + goto out; + err = 0; } mode_flags = 0; -- cgit v1.2.3 From 12821f5fb942e795f8009ece14bde868893bd811 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Tue, 23 Mar 2010 13:35:31 -0700 Subject: tmpfs: handle MPOL_LOCAL mount option properly commit 71fe804b6d5 (mempolicy: use struct mempolicy pointer in shmem_sb_info) added mpol=local mount option. but its feature is broken since it was born. because such code always return 1 (i.e. mount failure). This patch fixes it. Signed-off-by: KOSAKI Motohiro Cc: Ravikiran Thirumalai Cc: Christoph Lameter Cc: Mel Gorman Acked-by: Lee Schermerhorn Cc: Hugh Dickins Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm/mempolicy.c') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 10db44f95749..fb71790398f0 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2214,6 +2214,7 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) if (nodelist) goto out; mode = MPOL_PREFERRED; + err = 0; break; case MPOL_DEFAULT: /* -- cgit v1.2.3 From 926f2ae04f183098cf9a30521776fb2759c8afeb Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Tue, 23 Mar 2010 13:35:32 -0700 Subject: tmpfs: cleanup mpol_parse_str() mpol_parse_str() made lots 'err' variable related bug. Because it is ugly and reviewing unfriendly. This patch simplifies it. Signed-off-by: KOSAKI Motohiro Cc: Ravikiran Thirumalai Cc: Christoph Lameter Cc: Mel Gorman Acked-by: Lee Schermerhorn Cc: Hugh Dickins Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'mm/mempolicy.c') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index fb71790398f0..6cdfa1df57f6 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2195,8 +2195,8 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) char *rest = nodelist; while (isdigit(*rest)) rest++; - if (!*rest) - err = 0; + if (*rest) + goto out; } break; case MPOL_INTERLEAVE: @@ -2205,7 +2205,6 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) */ if (!nodelist) nodes = node_states[N_HIGH_MEMORY]; - err = 0; break; case MPOL_LOCAL: /* @@ -2214,7 +2213,6 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) if (nodelist) goto out; mode = MPOL_PREFERRED; - err = 0; break; case MPOL_DEFAULT: /* @@ -2229,7 +2227,6 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) */ if (!nodelist) goto out; - err = 0; } mode_flags = 0; @@ -2243,13 +2240,14 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) else if (!strcmp(flags, "relative")) mode_flags |= MPOL_F_RELATIVE_NODES; else - err = 1; + goto out; } new = mpol_new(mode, mode_flags, &nodes); if (IS_ERR(new)) - err = 1; - else { + goto out; + + { int ret; NODEMASK_SCRATCH(scratch); if (scratch) { @@ -2260,13 +2258,15 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) ret = -ENOMEM; NODEMASK_SCRATCH_FREE(scratch); if (ret) { - err = 1; mpol_put(new); - } else if (no_context) { - /* save for contextualization */ - new->w.user_nodemask = nodes; + goto out; } } + err = 0; + if (no_context) { + /* save for contextualization */ + new->w.user_nodemask = nodes; + } out: /* Restore string for error message */ -- cgit v1.2.3 From c6b6ef8bb05af632889c5536513b9f4004961f73 Mon Sep 17 00:00:00 2001 From: Lee Schermerhorn Date: Tue, 23 Mar 2010 13:35:41 -0700 Subject: mempolicy: fix get_mempolicy() for relative and static nodes Discovered while testing other mempolicy changes: get_mempolicy() does not handle static/relative mode flags correctly. Return the value that the user specified so that it can be restored via set_mempolicy() if desired. Signed-off-by: Lee Schermerhorn Cc: Hugh Dickins Cc: Ravikiran Thirumalai Cc: KOSAKI Motohiro Cc: Christoph Lameter Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'mm/mempolicy.c') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 6cdfa1df57f6..8034abd3a135 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -806,9 +806,13 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, err = 0; if (nmask) { - task_lock(current); - get_policy_nodemask(pol, nmask); - task_unlock(current); + if (mpol_store_user_nodemask(pol)) { + *nmask = pol->w.user_nodemask; + } else { + task_lock(current); + get_policy_nodemask(pol, nmask); + task_unlock(current); + } } out: -- cgit v1.2.3