diff options
author | Sachin Nikam <snikam@nvidia.com> | 2013-03-06 19:12:15 +0530 |
---|---|---|
committer | Sachin Nikam <snikam@nvidia.com> | 2013-03-06 19:31:07 +0530 |
commit | d3dd6b4227fe3e59df0a55f679ec196e4ccb1e3a (patch) | |
tree | 0137a07ecab729a19cea68b671c88def5bf83a2d /mm | |
parent | bb36790494c382ec26250f351c8cea76613f0f02 (diff) | |
parent | 2713e2797a78a0fdda765bc5ad7fed41e94818ba (diff) |
Merge branch 'linux-3.4.35' into rel-17
Bug 1243631
Change-Id: I915826047b2e20f0ad0a7d75df295c6cbf6e5b0a
Diffstat (limited to 'mm')
-rw-r--r-- | mm/bootmem.c | 8 | ||||
-rw-r--r-- | mm/compaction.c | 6 | ||||
-rw-r--r-- | mm/dmapool.c | 31 | ||||
-rw-r--r-- | mm/fadvise.c | 18 | ||||
-rw-r--r-- | mm/huge_memory.c | 3 | ||||
-rw-r--r-- | mm/hugetlb.c | 3 | ||||
-rw-r--r-- | mm/memblock.c | 24 | ||||
-rw-r--r-- | mm/memcontrol.c | 21 | ||||
-rw-r--r-- | mm/memory-failure.c | 8 | ||||
-rw-r--r-- | mm/memory.c | 23 | ||||
-rw-r--r-- | mm/memory_hotplug.c | 16 | ||||
-rw-r--r-- | mm/mempolicy.c | 225 | ||||
-rw-r--r-- | mm/mmu_notifier.c | 147 | ||||
-rw-r--r-- | mm/page-writeback.c | 25 | ||||
-rw-r--r-- | mm/page_alloc.c | 19 | ||||
-rw-r--r-- | mm/rmap.c | 20 | ||||
-rw-r--r-- | mm/shmem.c | 44 | ||||
-rw-r--r-- | mm/slab.c | 6 | ||||
-rw-r--r-- | mm/sparse.c | 10 | ||||
-rw-r--r-- | mm/truncate.c | 3 | ||||
-rw-r--r-- | mm/vmscan.c | 8 |
21 files changed, 403 insertions, 265 deletions
diff --git a/mm/bootmem.c b/mm/bootmem.c index 0131170c9d54..53cf62b186b6 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -766,13 +766,17 @@ void * __init alloc_bootmem_section(unsigned long size, unsigned long section_nr) { bootmem_data_t *bdata; - unsigned long pfn, goal; + unsigned long pfn, goal, limit; pfn = section_nr_to_pfn(section_nr); goal = pfn << PAGE_SHIFT; + limit = section_nr_to_pfn(section_nr + 1) << PAGE_SHIFT; bdata = &bootmem_node_data[early_pfn_to_nid(pfn)]; - return alloc_bootmem_core(bdata, size, SMP_CACHE_BYTES, goal, 0); + if (goal + size > limit) + limit = 0; + + return alloc_bootmem_core(bdata, size, SMP_CACHE_BYTES, goal, limit); } #endif diff --git a/mm/compaction.c b/mm/compaction.c index eede98104e9f..10f9bf96f7f0 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -840,7 +840,7 @@ static int compact_node(int nid) } /* Compact all nodes in the system */ -static int compact_nodes(void) +static void compact_nodes(void) { int nid; @@ -849,8 +849,6 @@ static int compact_nodes(void) for_each_online_node(nid) compact_node(nid); - - return COMPACT_COMPLETE; } /* The written value is actually unused, all memory is compacted */ @@ -861,7 +859,7 @@ int sysctl_compaction_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { if (write) - return compact_nodes(); + compact_nodes(); return 0; } diff --git a/mm/dmapool.c b/mm/dmapool.c index c5ab33bca0a8..da1b0f0b8709 100644 --- a/mm/dmapool.c +++ b/mm/dmapool.c @@ -50,7 +50,6 @@ struct dma_pool { /* the pool */ size_t allocation; size_t boundary; char name[32]; - wait_queue_head_t waitq; struct list_head pools; }; @@ -62,8 +61,6 @@ struct dma_page { /* cacheable header for 'allocation' bytes */ unsigned int offset; }; -#define POOL_TIMEOUT_JIFFIES ((100 /* msec */ * HZ) / 1000) - static DEFINE_MUTEX(pools_lock); static ssize_t @@ -172,7 +169,6 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev, retval->size = size; retval->boundary = boundary; retval->allocation = allocation; - init_waitqueue_head(&retval->waitq); if (dev) { int ret; @@ -227,7 +223,6 @@ static struct dma_page *pool_alloc_page(struct dma_pool *pool, gfp_t mem_flags) memset(page->vaddr, POOL_POISON_FREED, pool->allocation); #endif pool_initialise_page(pool, page); - list_add(&page->page_list, &pool->page_list); page->in_use = 0; page->offset = 0; } else { @@ -315,30 +310,21 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags, might_sleep_if(mem_flags & __GFP_WAIT); spin_lock_irqsave(&pool->lock, flags); - restart: list_for_each_entry(page, &pool->page_list, page_list) { if (page->offset < pool->allocation) goto ready; } - page = pool_alloc_page(pool, GFP_ATOMIC); - if (!page) { - if (mem_flags & __GFP_WAIT) { - DECLARE_WAITQUEUE(wait, current); - __set_current_state(TASK_UNINTERRUPTIBLE); - __add_wait_queue(&pool->waitq, &wait); - spin_unlock_irqrestore(&pool->lock, flags); + /* pool_alloc_page() might sleep, so temporarily drop &pool->lock */ + spin_unlock_irqrestore(&pool->lock, flags); - schedule_timeout(POOL_TIMEOUT_JIFFIES); + page = pool_alloc_page(pool, mem_flags); + if (!page) + return NULL; - spin_lock_irqsave(&pool->lock, flags); - __remove_wait_queue(&pool->waitq, &wait); - goto restart; - } - retval = NULL; - goto done; - } + spin_lock_irqsave(&pool->lock, flags); + list_add(&page->page_list, &pool->page_list); ready: page->in_use++; offset = page->offset; @@ -348,7 +334,6 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags, #ifdef DMAPOOL_DEBUG memset(retval, POOL_POISON_ALLOCATED, pool->size); #endif - done: spin_unlock_irqrestore(&pool->lock, flags); return retval; } @@ -435,8 +420,6 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma) page->in_use--; *(int *)vaddr = page->offset; page->offset = offset; - if (waitqueue_active(&pool->waitq)) - wake_up_locked(&pool->waitq); /* * Resist a temptation to do * if (!is_page_busy(page)) pool_free_page(pool, page); diff --git a/mm/fadvise.c b/mm/fadvise.c index 469491e0af79..dcb98722a22f 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c @@ -17,6 +17,7 @@ #include <linux/fadvise.h> #include <linux/writeback.h> #include <linux/syscalls.h> +#include <linux/swap.h> #include <asm/unistd.h> @@ -124,9 +125,22 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice) start_index = (offset+(PAGE_CACHE_SIZE-1)) >> PAGE_CACHE_SHIFT; end_index = (endbyte >> PAGE_CACHE_SHIFT); - if (end_index >= start_index) - invalidate_mapping_pages(mapping, start_index, + if (end_index >= start_index) { + unsigned long count = invalidate_mapping_pages(mapping, + start_index, end_index); + + /* + * If fewer pages were invalidated than expected then + * it is possible that some of the pages were on + * a per-cpu pagevec for a remote CPU. Drain all + * pagevecs and try again. + */ + if (count < (end_index - start_index + 1)) { + lru_add_drain_all(); + invalidate_mapping_pages(mapping, start_index, end_index); + } + } break; default: ret = -EINVAL; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index f0e5306eeb55..caf15b6fa753 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -950,6 +950,8 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, count_vm_event(THP_FAULT_FALLBACK); ret = do_huge_pmd_wp_page_fallback(mm, vma, address, pmd, orig_pmd, page, haddr); + if (ret & VM_FAULT_OOM) + split_huge_page(page); put_page(page); goto out; } @@ -957,6 +959,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) { put_page(new_page); + split_huge_page(page); put_page(page); ret |= VM_FAULT_OOM; goto out; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index a799df59d310..c384e0901f9c 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2431,7 +2431,8 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, * from page cache lookup which is in HPAGE_SIZE units. */ address = address & huge_page_mask(h); - pgoff = vma_hugecache_offset(h, vma, address); + pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + + vma->vm_pgoff; mapping = vma->vm_file->f_dentry->d_inode->i_mapping; /* diff --git a/mm/memblock.c b/mm/memblock.c index 280d3d7835d6..11e5bd174f3c 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -908,6 +908,30 @@ int __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t si return memblock_overlaps_region(&memblock.reserved, base, size) >= 0; } +void __init_memblock memblock_trim_memory(phys_addr_t align) +{ + int i; + phys_addr_t start, end, orig_start, orig_end; + struct memblock_type *mem = &memblock.memory; + + for (i = 0; i < mem->cnt; i++) { + orig_start = mem->regions[i].base; + orig_end = mem->regions[i].base + mem->regions[i].size; + start = round_up(orig_start, align); + end = round_down(orig_end, align); + + if (start == orig_start && end == orig_end) + continue; + + if (start < end) { + mem->regions[i].base = start; + mem->regions[i].size = end - start; + } else { + memblock_remove_region(mem, i); + i--; + } + } +} void __init_memblock memblock_set_current_limit(phys_addr_t limit) { diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 7685d4a0b3ce..81c275b3afed 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1489,17 +1489,26 @@ static int mem_cgroup_count_children(struct mem_cgroup *memcg) u64 mem_cgroup_get_limit(struct mem_cgroup *memcg) { u64 limit; - u64 memsw; limit = res_counter_read_u64(&memcg->res, RES_LIMIT); - limit += total_swap_pages << PAGE_SHIFT; - memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT); /* - * If memsw is finite and limits the amount of swap space available - * to this memcg, return that limit. + * Do not consider swap space if we cannot swap due to swappiness */ - return min(limit, memsw); + if (mem_cgroup_swappiness(memcg)) { + u64 memsw; + + limit += total_swap_pages << PAGE_SHIFT; + memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT); + + /* + * If memsw is finite and limits the amount of swap space + * available to this memcg, return that limit. + */ + limit = min(limit, memsw); + } + + return limit; } static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg, diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 9299eea77f72..b68c8f66e4d3 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1481,9 +1481,17 @@ int soft_offline_page(struct page *page, int flags) { int ret; unsigned long pfn = page_to_pfn(page); + struct page *hpage = compound_trans_head(page); if (PageHuge(page)) return soft_offline_huge_page(page, flags); + if (PageTransHuge(hpage)) { + if (PageAnon(hpage) && unlikely(split_huge_page(hpage))) { + pr_info("soft offline: %#lx: failed to split THP\n", + pfn); + return -EBUSY; + } + } ret = get_any_page(page, pfn, flags); if (ret < 0) diff --git a/mm/memory.c b/mm/memory.c index 6105f475fa86..2f42aab50e9b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -182,10 +182,14 @@ static int tlb_next_batch(struct mmu_gather *tlb) return 1; } + if (tlb->batch_count == MAX_GATHER_BATCH_COUNT) + return 0; + batch = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0); if (!batch) return 0; + tlb->batch_count++; batch->next = NULL; batch->nr = 0; batch->max = MAX_GATHER_BATCH; @@ -212,6 +216,7 @@ void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, bool fullmm) tlb->local.nr = 0; tlb->local.max = ARRAY_SIZE(tlb->__pages); tlb->active = &tlb->local; + tlb->batch_count = 0; #ifdef CONFIG_HAVE_RCU_TABLE_FREE tlb->batch = NULL; @@ -3489,6 +3494,7 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (unlikely(is_vm_hugetlb_page(vma))) return hugetlb_fault(mm, vma, address, flags); +retry: pgd = pgd_offset(mm, address); pud = pud_alloc(mm, pgd, address); if (!pud) @@ -3502,13 +3508,24 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, pmd, flags); } else { pmd_t orig_pmd = *pmd; + int ret; + barrier(); if (pmd_trans_huge(orig_pmd)) { if (flags & FAULT_FLAG_WRITE && !pmd_write(orig_pmd) && - !pmd_trans_splitting(orig_pmd)) - return do_huge_pmd_wp_page(mm, vma, address, - pmd, orig_pmd); + !pmd_trans_splitting(orig_pmd)) { + ret = do_huge_pmd_wp_page(mm, vma, address, pmd, + orig_pmd); + /* + * If COW results in an oom, the huge pmd will + * have been split, so retry the fault on the + * pte for a smaller charge. + */ + if (unlikely(ret & VM_FAULT_OOM)) + goto retry; + return ret; + } return 0; } } diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index fc898cb4fe8f..77ad30613a3d 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -127,9 +127,6 @@ static void register_page_bootmem_info_section(unsigned long start_pfn) struct mem_section *ms; struct page *page, *memmap; - if (!pfn_valid(start_pfn)) - return; - section_nr = pfn_to_section_nr(start_pfn); ms = __nr_to_section(section_nr); @@ -188,9 +185,16 @@ void register_page_bootmem_info_node(struct pglist_data *pgdat) end_pfn = pfn + pgdat->node_spanned_pages; /* register_section info */ - for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) - register_page_bootmem_info_section(pfn); - + for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) { + /* + * Some platforms can assign the same pfn to multiple nodes - on + * node0 as well as nodeN. To avoid registering a pfn against + * multiple nodes we check that this pfn does not already + * reside in some other node. + */ + if (pfn_valid(pfn) && (pfn_to_nid(pfn) == node)) + register_page_bootmem_info_section(pfn); + } } #endif /* !CONFIG_SPARSEMEM_VMEMMAP */ diff --git a/mm/mempolicy.c b/mm/mempolicy.c index bf5b485ecd31..82f1b027ba1c 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -607,6 +607,42 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end, return first; } +/* + * Apply policy to a single VMA + * This must be called with the mmap_sem held for writing. + */ +static int vma_replace_policy(struct vm_area_struct *vma, + struct mempolicy *pol) +{ + int err; + struct mempolicy *old; + struct mempolicy *new; + + pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n", + vma->vm_start, vma->vm_end, vma->vm_pgoff, + vma->vm_ops, vma->vm_file, + vma->vm_ops ? vma->vm_ops->set_policy : NULL); + + new = mpol_dup(pol); + if (IS_ERR(new)) + return PTR_ERR(new); + + if (vma->vm_ops && vma->vm_ops->set_policy) { + err = vma->vm_ops->set_policy(vma, new); + if (err) + goto err_out; + } + + old = vma->vm_policy; + vma->vm_policy = new; /* protected by mmap_sem */ + mpol_put(old); + + return 0; + err_out: + mpol_put(new); + return err; +} + /* Step 2: apply policy to a range and do splits. */ static int mbind_range(struct mm_struct *mm, unsigned long start, unsigned long end, struct mempolicy *new_pol) @@ -655,23 +691,9 @@ static int mbind_range(struct mm_struct *mm, unsigned long start, if (err) goto out; } - - /* - * Apply policy to a single VMA. The reference counting of - * policy for vma_policy linkages has already been handled by - * vma_merge and split_vma as necessary. If this is a shared - * policy then ->set_policy will increment the reference count - * for an sp node. - */ - pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n", - vma->vm_start, vma->vm_end, vma->vm_pgoff, - vma->vm_ops, vma->vm_file, - vma->vm_ops ? vma->vm_ops->set_policy : NULL); - if (vma->vm_ops && vma->vm_ops->set_policy) { - err = vma->vm_ops->set_policy(vma, new_pol); - if (err) - goto out; - } + err = vma_replace_policy(vma, new_pol); + if (err) + goto out; } out: @@ -1510,8 +1532,18 @@ struct mempolicy *get_vma_policy(struct task_struct *task, addr); if (vpol) pol = vpol; - } else if (vma->vm_policy) + } else if (vma->vm_policy) { pol = vma->vm_policy; + + /* + * shmem_alloc_page() passes MPOL_F_SHARED policy with + * a pseudo vma whose vma->vm_ops=NULL. Take a reference + * count on these policies which will be dropped by + * mpol_cond_put() later + */ + if (mpol_needs_cond_ref(pol)) + mpol_get(pol); + } } if (!pol) pol = &default_policy; @@ -1977,28 +2009,6 @@ struct mempolicy *__mpol_dup(struct mempolicy *old) return new; } -/* - * If *frompol needs [has] an extra ref, copy *frompol to *tompol , - * eliminate the * MPOL_F_* flags that require conditional ref and - * [NOTE!!!] drop the extra ref. Not safe to reference *frompol directly - * after return. Use the returned value. - * - * Allows use of a mempolicy for, e.g., multiple allocations with a single - * policy lookup, even if the policy needs/has extra ref on lookup. - * shmem_readahead needs this. - */ -struct mempolicy *__mpol_cond_copy(struct mempolicy *tompol, - struct mempolicy *frompol) -{ - if (!mpol_needs_cond_ref(frompol)) - return frompol; - - *tompol = *frompol; - tompol->flags &= ~MPOL_F_SHARED; /* copy doesn't need unref */ - __mpol_put(frompol); - return tompol; -} - /* Slow path of a mempolicy comparison */ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b) { @@ -2035,7 +2045,7 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b) */ /* lookup first element intersecting start-end */ -/* Caller holds sp->lock */ +/* Caller holds sp->mutex */ static struct sp_node * sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end) { @@ -2099,36 +2109,50 @@ mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx) if (!sp->root.rb_node) return NULL; - spin_lock(&sp->lock); + mutex_lock(&sp->mutex); sn = sp_lookup(sp, idx, idx+1); if (sn) { mpol_get(sn->policy); pol = sn->policy; } - spin_unlock(&sp->lock); + mutex_unlock(&sp->mutex); return pol; } +static void sp_free(struct sp_node *n) +{ + mpol_put(n->policy); + kmem_cache_free(sn_cache, n); +} + static void sp_delete(struct shared_policy *sp, struct sp_node *n) { pr_debug("deleting %lx-l%lx\n", n->start, n->end); rb_erase(&n->nd, &sp->root); - mpol_put(n->policy); - kmem_cache_free(sn_cache, n); + sp_free(n); } static struct sp_node *sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol) { - struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL); + struct sp_node *n; + struct mempolicy *newpol; + n = kmem_cache_alloc(sn_cache, GFP_KERNEL); if (!n) return NULL; + + newpol = mpol_dup(pol); + if (IS_ERR(newpol)) { + kmem_cache_free(sn_cache, n); + return NULL; + } + newpol->flags |= MPOL_F_SHARED; + n->start = start; n->end = end; - mpol_get(pol); - pol->flags |= MPOL_F_SHARED; /* for unref */ - n->policy = pol; + n->policy = newpol; + return n; } @@ -2136,10 +2160,10 @@ static struct sp_node *sp_alloc(unsigned long start, unsigned long end, static int shared_policy_replace(struct shared_policy *sp, unsigned long start, unsigned long end, struct sp_node *new) { - struct sp_node *n, *new2 = NULL; + struct sp_node *n; + int ret = 0; -restart: - spin_lock(&sp->lock); + mutex_lock(&sp->mutex); n = sp_lookup(sp, start, end); /* Take care of old policies in the same range. */ while (n && n->start < end) { @@ -2152,16 +2176,14 @@ restart: } else { /* Old policy spanning whole new range. */ if (n->end > end) { + struct sp_node *new2; + new2 = sp_alloc(end, n->end, n->policy); if (!new2) { - spin_unlock(&sp->lock); - new2 = sp_alloc(end, n->end, n->policy); - if (!new2) - return -ENOMEM; - goto restart; + ret = -ENOMEM; + goto out; } n->end = start; sp_insert(sp, new2); - new2 = NULL; break; } else n->end = start; @@ -2172,12 +2194,9 @@ restart: } if (new) sp_insert(sp, new); - spin_unlock(&sp->lock); - if (new2) { - mpol_put(new2->policy); - kmem_cache_free(sn_cache, new2); - } - return 0; +out: + mutex_unlock(&sp->mutex); + return ret; } /** @@ -2195,7 +2214,7 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) int ret; sp->root = RB_ROOT; /* empty tree == default mempolicy */ - spin_lock_init(&sp->lock); + mutex_init(&sp->mutex); if (mpol) { struct vm_area_struct pvma; @@ -2249,7 +2268,7 @@ int mpol_set_shared_policy(struct shared_policy *info, } err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new); if (err && new) - kmem_cache_free(sn_cache, new); + sp_free(new); return err; } @@ -2261,16 +2280,14 @@ void mpol_free_shared_policy(struct shared_policy *p) if (!p->root.rb_node) return; - spin_lock(&p->lock); + mutex_lock(&p->mutex); next = rb_first(&p->root); while (next) { n = rb_entry(next, struct sp_node, nd); next = rb_next(&n->nd); - rb_erase(&n->nd, &p->root); - mpol_put(n->policy); - kmem_cache_free(sn_cache, n); + sp_delete(p, n); } - spin_unlock(&p->lock); + mutex_unlock(&p->mutex); } /* assumes fs == KERNEL_DS */ @@ -2327,8 +2344,7 @@ void numa_default_policy(void) */ /* - * "local" is pseudo-policy: MPOL_PREFERRED with MPOL_F_LOCAL flag - * Used only for mpol_parse_str() and mpol_to_str() + * "local" is implemented internally by MPOL_PREFERRED with MPOL_F_LOCAL flag. */ #define MPOL_LOCAL MPOL_MAX static const char * const policy_modes[] = @@ -2343,28 +2359,21 @@ static const char * const policy_modes[] = #ifdef CONFIG_TMPFS /** - * mpol_parse_str - parse string to mempolicy + * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option. * @str: string containing mempolicy to parse * @mpol: pointer to struct mempolicy pointer, returned on success. - * @no_context: flag whether to "contextualize" the mempolicy + * @unused: redundant argument, to be removed later. * * Format of input: * <mode>[=<flags>][:<nodelist>] * - * if @no_context is true, save the input nodemask in w.user_nodemask in - * the returned mempolicy. This will be used to "clone" the mempolicy in - * a specific context [cpuset] at a later time. Used to parse tmpfs mpol - * mount option. Note that if 'static' or 'relative' mode flags were - * specified, the input nodemask will already have been saved. Saving - * it again is redundant, but safe. - * * On success, returns 0, else 1 */ -int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) +int mpol_parse_str(char *str, struct mempolicy **mpol, int unused) { struct mempolicy *new = NULL; unsigned short mode; - unsigned short uninitialized_var(mode_flags); + unsigned short mode_flags; nodemask_t nodes; char *nodelist = strchr(str, ':'); char *flags = strchr(str, '='); @@ -2452,24 +2461,23 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) if (IS_ERR(new)) goto out; - if (no_context) { - /* save for contextualization */ - new->w.user_nodemask = nodes; - } else { - int ret; - NODEMASK_SCRATCH(scratch); - if (scratch) { - task_lock(current); - ret = mpol_set_nodemask(new, &nodes, scratch); - task_unlock(current); - } else - ret = -ENOMEM; - NODEMASK_SCRATCH_FREE(scratch); - if (ret) { - mpol_put(new); - goto out; - } - } + /* + * Save nodes for mpol_to_str() to show the tmpfs mount options + * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo. + */ + if (mode != MPOL_PREFERRED) + new->v.nodes = nodes; + else if (nodelist) + new->v.preferred_node = first_node(nodes); + else + new->flags |= MPOL_F_LOCAL; + + /* + * Save nodes for contextualization: this will be used to "clone" + * the mempolicy in a specific context [cpuset] at a later time. + */ + new->w.user_nodemask = nodes; + err = 0; out: @@ -2489,13 +2497,13 @@ out: * @buffer: to contain formatted mempolicy string * @maxlen: length of @buffer * @pol: pointer to mempolicy to be formatted - * @no_context: "context free" mempolicy - use nodemask in w.user_nodemask + * @unused: redundant argument, to be removed later. * * Convert a mempolicy into a string. * Returns the number of characters in buffer (if positive) * or an error (negative) */ -int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context) +int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int unused) { char *p = buffer; int l; @@ -2521,7 +2529,7 @@ int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context) case MPOL_PREFERRED: nodes_clear(nodes); if (flags & MPOL_F_LOCAL) - mode = MPOL_LOCAL; /* pseudo-policy */ + mode = MPOL_LOCAL; else node_set(pol->v.preferred_node, nodes); break; @@ -2529,14 +2537,11 @@ int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context) case MPOL_BIND: /* Fall through */ case MPOL_INTERLEAVE: - if (no_context) - nodes = pol->w.user_nodemask; - else - nodes = pol->v.nodes; + nodes = pol->v.nodes; break; default: - BUG(); + return -EINVAL; } l = strlen(policy_modes[mode]); diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 862b60822d9f..8d1ca2de4080 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -14,10 +14,14 @@ #include <linux/export.h> #include <linux/mm.h> #include <linux/err.h> +#include <linux/srcu.h> #include <linux/rcupdate.h> #include <linux/sched.h> #include <linux/slab.h> +/* global SRCU for all MMs */ +static struct srcu_struct srcu; + /* * This function can't run concurrently against mmu_notifier_register * because mm->mm_users > 0 during mmu_notifier_register and exit_mmap @@ -25,58 +29,61 @@ * in parallel despite there being no task using this mm any more, * through the vmas outside of the exit_mmap context, such as with * vmtruncate. This serializes against mmu_notifier_unregister with - * the mmu_notifier_mm->lock in addition to RCU and it serializes - * against the other mmu notifiers with RCU. struct mmu_notifier_mm + * the mmu_notifier_mm->lock in addition to SRCU and it serializes + * against the other mmu notifiers with SRCU. struct mmu_notifier_mm * can't go away from under us as exit_mmap holds an mm_count pin * itself. */ void __mmu_notifier_release(struct mm_struct *mm) { struct mmu_notifier *mn; - struct hlist_node *n; + int id; /* - * RCU here will block mmu_notifier_unregister until - * ->release returns. + * srcu_read_lock() here will block synchronize_srcu() in + * mmu_notifier_unregister() until all registered + * ->release() callouts this function makes have + * returned. */ - rcu_read_lock(); - hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) - /* - * if ->release runs before mmu_notifier_unregister it - * must be handled as it's the only way for the driver - * to flush all existing sptes and stop the driver - * from establishing any more sptes before all the - * pages in the mm are freed. - */ - if (mn->ops->release) - mn->ops->release(mn, mm); - rcu_read_unlock(); - + id = srcu_read_lock(&srcu); spin_lock(&mm->mmu_notifier_mm->lock); while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) { mn = hlist_entry(mm->mmu_notifier_mm->list.first, struct mmu_notifier, hlist); + /* - * We arrived before mmu_notifier_unregister so - * mmu_notifier_unregister will do nothing other than - * to wait ->release to finish and - * mmu_notifier_unregister to return. + * Unlink. This will prevent mmu_notifier_unregister() + * from also making the ->release() callout. */ hlist_del_init_rcu(&mn->hlist); + spin_unlock(&mm->mmu_notifier_mm->lock); + + /* + * Clear sptes. (see 'release' description in mmu_notifier.h) + */ + if (mn->ops->release) + mn->ops->release(mn, mm); + + spin_lock(&mm->mmu_notifier_mm->lock); } spin_unlock(&mm->mmu_notifier_mm->lock); /* - * synchronize_rcu here prevents mmu_notifier_release to - * return to exit_mmap (which would proceed freeing all pages - * in the mm) until the ->release method returns, if it was - * invoked by mmu_notifier_unregister. - * - * The mmu_notifier_mm can't go away from under us because one - * mm_count is hold by exit_mmap. + * All callouts to ->release() which we have done are complete. + * Allow synchronize_srcu() in mmu_notifier_unregister() to complete + */ + srcu_read_unlock(&srcu, id); + + /* + * mmu_notifier_unregister() may have unlinked a notifier and may + * still be calling out to it. Additionally, other notifiers + * may have been active via vmtruncate() et. al. Block here + * to ensure that all notifier callouts for this mm have been + * completed and the sptes are really cleaned up before returning + * to exit_mmap(). */ - synchronize_rcu(); + synchronize_srcu(&srcu); } /* @@ -89,14 +96,14 @@ int __mmu_notifier_clear_flush_young(struct mm_struct *mm, { struct mmu_notifier *mn; struct hlist_node *n; - int young = 0; + int young = 0, id; - rcu_read_lock(); + id = srcu_read_lock(&srcu); hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { if (mn->ops->clear_flush_young) young |= mn->ops->clear_flush_young(mn, mm, address); } - rcu_read_unlock(); + srcu_read_unlock(&srcu, id); return young; } @@ -106,9 +113,9 @@ int __mmu_notifier_test_young(struct mm_struct *mm, { struct mmu_notifier *mn; struct hlist_node *n; - int young = 0; + int young = 0, id; - rcu_read_lock(); + id = srcu_read_lock(&srcu); hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { if (mn->ops->test_young) { young = mn->ops->test_young(mn, mm, address); @@ -116,7 +123,7 @@ int __mmu_notifier_test_young(struct mm_struct *mm, break; } } - rcu_read_unlock(); + srcu_read_unlock(&srcu, id); return young; } @@ -126,8 +133,9 @@ void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address, { struct mmu_notifier *mn; struct hlist_node *n; + int id; - rcu_read_lock(); + id = srcu_read_lock(&srcu); hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { if (mn->ops->change_pte) mn->ops->change_pte(mn, mm, address, pte); @@ -138,7 +146,7 @@ void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address, else if (mn->ops->invalidate_page) mn->ops->invalidate_page(mn, mm, address); } - rcu_read_unlock(); + srcu_read_unlock(&srcu, id); } void __mmu_notifier_invalidate_page(struct mm_struct *mm, @@ -146,13 +154,14 @@ void __mmu_notifier_invalidate_page(struct mm_struct *mm, { struct mmu_notifier *mn; struct hlist_node *n; + int id; - rcu_read_lock(); + id = srcu_read_lock(&srcu); hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { if (mn->ops->invalidate_page) mn->ops->invalidate_page(mn, mm, address); } - rcu_read_unlock(); + srcu_read_unlock(&srcu, id); } void __mmu_notifier_invalidate_range_start(struct mm_struct *mm, @@ -160,13 +169,14 @@ void __mmu_notifier_invalidate_range_start(struct mm_struct *mm, { struct mmu_notifier *mn; struct hlist_node *n; + int id; - rcu_read_lock(); + id = srcu_read_lock(&srcu); hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { if (mn->ops->invalidate_range_start) mn->ops->invalidate_range_start(mn, mm, start, end); } - rcu_read_unlock(); + srcu_read_unlock(&srcu, id); } void __mmu_notifier_invalidate_range_end(struct mm_struct *mm, @@ -174,13 +184,14 @@ void __mmu_notifier_invalidate_range_end(struct mm_struct *mm, { struct mmu_notifier *mn; struct hlist_node *n; + int id; - rcu_read_lock(); + id = srcu_read_lock(&srcu); hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { if (mn->ops->invalidate_range_end) mn->ops->invalidate_range_end(mn, mm, start, end); } - rcu_read_unlock(); + srcu_read_unlock(&srcu, id); } static int do_mmu_notifier_register(struct mmu_notifier *mn, @@ -192,6 +203,12 @@ static int do_mmu_notifier_register(struct mmu_notifier *mn, BUG_ON(atomic_read(&mm->mm_users) <= 0); + /* + * Verify that mmu_notifier_init() already run and the global srcu is + * initialized. + */ + BUG_ON(!srcu.per_cpu_ref); + ret = -ENOMEM; mmu_notifier_mm = kmalloc(sizeof(struct mmu_notifier_mm), GFP_KERNEL); if (unlikely(!mmu_notifier_mm)) @@ -274,8 +291,8 @@ void __mmu_notifier_mm_destroy(struct mm_struct *mm) /* * This releases the mm_count pin automatically and frees the mm * structure if it was the last user of it. It serializes against - * running mmu notifiers with RCU and against mmu_notifier_unregister - * with the unregister lock + RCU. All sptes must be dropped before + * running mmu notifiers with SRCU and against mmu_notifier_unregister + * with the unregister lock + SRCU. All sptes must be dropped before * calling mmu_notifier_unregister. ->release or any other notifier * method may be invoked concurrently with mmu_notifier_unregister, * and only after mmu_notifier_unregister returned we're guaranteed @@ -285,35 +302,43 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm) { BUG_ON(atomic_read(&mm->mm_count) <= 0); + spin_lock(&mm->mmu_notifier_mm->lock); if (!hlist_unhashed(&mn->hlist)) { - /* - * RCU here will force exit_mmap to wait ->release to finish - * before freeing the pages. - */ - rcu_read_lock(); + int id; /* - * exit_mmap will block in mmu_notifier_release to - * guarantee ->release is called before freeing the - * pages. + * Ensure we synchronize up with __mmu_notifier_release(). */ + id = srcu_read_lock(&srcu); + + hlist_del_rcu(&mn->hlist); + spin_unlock(&mm->mmu_notifier_mm->lock); + if (mn->ops->release) mn->ops->release(mn, mm); - rcu_read_unlock(); - spin_lock(&mm->mmu_notifier_mm->lock); - hlist_del_rcu(&mn->hlist); + /* + * Allow __mmu_notifier_release() to complete. + */ + srcu_read_unlock(&srcu, id); + } else spin_unlock(&mm->mmu_notifier_mm->lock); - } /* - * Wait any running method to finish, of course including - * ->release if it was run by mmu_notifier_relase instead of us. + * Wait for any running method to finish, including ->release() if it + * was run by __mmu_notifier_release() instead of us. */ - synchronize_rcu(); + synchronize_srcu(&srcu); BUG_ON(atomic_read(&mm->mm_count) <= 0); mmdrop(mm); } EXPORT_SYMBOL_GPL(mmu_notifier_unregister); + +static int __init mmu_notifier_init(void) +{ + return init_srcu_struct(&srcu); +} + +module_init(mmu_notifier_init); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 26adea8ca2e7..bc8465f579a8 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -187,6 +187,18 @@ static unsigned long highmem_dirtyable_memory(unsigned long total) zone_reclaimable_pages(z) - z->dirty_balance_reserve; } /* + * Unreclaimable memory (kernel memory or anonymous memory + * without swap) can bring down the dirtyable pages below + * the zone's dirty balance reserve and the above calculation + * will underflow. However we still want to add in nodes + * which are below threshold (negative values) to get a more + * accurate calculation but make sure that the total never + * underflows. + */ + if ((long)x < 0) + x = 0; + + /* * Make sure that the number of highmem pages is never larger * than the number of the total dirtyable memory. This can only * occur in very strange VM situations but we want to make sure @@ -208,8 +220,8 @@ unsigned long global_dirtyable_memory(void) { unsigned long x; - x = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages() - - dirty_balance_reserve; + x = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages(); + x -= min(x, dirty_balance_reserve); if (!vm_highmem_is_dirtyable) x -= highmem_dirtyable_memory(x); @@ -276,9 +288,12 @@ static unsigned long zone_dirtyable_memory(struct zone *zone) * highmem zone can hold its share of dirty pages, so we don't * care about vm_highmem_is_dirtyable here. */ - return zone_page_state(zone, NR_FREE_PAGES) + - zone_reclaimable_pages(zone) - - zone->dirty_balance_reserve; + unsigned long nr_pages = zone_page_state(zone, NR_FREE_PAGES) + + zone_reclaimable_pages(zone); + + /* don't allow this to underflow */ + nr_pages -= min(nr_pages, zone->dirty_balance_reserve); + return nr_pages; } /** diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 0f20f1c354ef..ef868e299932 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -581,7 +581,7 @@ static inline void __free_one_page(struct page *page, combined_idx = buddy_idx & page_idx; higher_page = page + (combined_idx - page_idx); buddy_idx = __find_buddy_index(combined_idx, order + 1); - higher_buddy = page + (buddy_idx - combined_idx); + higher_buddy = higher_page + (buddy_idx - combined_idx); if (page_is_buddy(higher_page, higher_buddy, order + 1)) { list_add_tail(&page->lru, &zone->free_area[order].free_list[migratetype]); @@ -4273,10 +4273,11 @@ static void __meminit calculate_node_totalpages(struct pglist_data *pgdat, * round what is now in bits to nearest long in bits, then return it in * bytes. */ -static unsigned long __init usemap_size(unsigned long zonesize) +static unsigned long __init usemap_size(unsigned long zone_start_pfn, unsigned long zonesize) { unsigned long usemapsize; + zonesize += zone_start_pfn & (pageblock_nr_pages-1); usemapsize = roundup(zonesize, pageblock_nr_pages); usemapsize = usemapsize >> pageblock_order; usemapsize *= NR_PAGEBLOCK_BITS; @@ -4286,17 +4287,19 @@ static unsigned long __init usemap_size(unsigned long zonesize) } static void __init setup_usemap(struct pglist_data *pgdat, - struct zone *zone, unsigned long zonesize) + struct zone *zone, + unsigned long zone_start_pfn, + unsigned long zonesize) { - unsigned long usemapsize = usemap_size(zonesize); + unsigned long usemapsize = usemap_size(zone_start_pfn, zonesize); zone->pageblock_flags = NULL; if (usemapsize) zone->pageblock_flags = alloc_bootmem_node_nopanic(pgdat, usemapsize); } #else -static inline void setup_usemap(struct pglist_data *pgdat, - struct zone *zone, unsigned long zonesize) {} +static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone, + unsigned long zone_start_pfn, unsigned long zonesize) {} #endif /* CONFIG_SPARSEMEM */ #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE @@ -4424,7 +4427,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, continue; set_pageblock_order(pageblock_default_order()); - setup_usemap(pgdat, zone, size); + setup_usemap(pgdat, zone, zone_start_pfn, size); ret = init_currently_empty_zone(zone, zone_start_pfn, size, MEMMAP_EARLY); BUG_ON(ret); @@ -5409,7 +5412,7 @@ static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn) pfn &= (PAGES_PER_SECTION-1); return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; #else - pfn = pfn - zone->zone_start_pfn; + pfn = pfn - round_down(zone->zone_start_pfn, pageblock_nr_pages); return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; #endif /* CONFIG_SPARSEMEM */ } diff --git a/mm/rmap.c b/mm/rmap.c index 5b5ad584ffb7..bfca52c96999 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -56,6 +56,7 @@ #include <linux/mmu_notifier.h> #include <linux/migrate.h> #include <linux/hugetlb.h> +#include <linux/backing-dev.h> #include <asm/tlbflush.h> @@ -977,11 +978,8 @@ int page_mkclean(struct page *page) if (page_mapped(page)) { struct address_space *mapping = page_mapping(page); - if (mapping) { + if (mapping) ret = page_mkclean_file(mapping, page); - if (page_test_and_clear_dirty(page_to_pfn(page), 1)) - ret = 1; - } } return ret; @@ -1167,6 +1165,7 @@ void page_add_file_rmap(struct page *page) */ void page_remove_rmap(struct page *page) { + struct address_space *mapping = page_mapping(page); bool anon = PageAnon(page); bool locked; unsigned long flags; @@ -1189,8 +1188,19 @@ void page_remove_rmap(struct page *page) * this if the page is anon, so about to be freed; but perhaps * not if it's in swapcache - there might be another pte slot * containing the swap entry, but page not yet written to swap. + * + * And we can skip it on file pages, so long as the filesystem + * participates in dirty tracking; but need to catch shm and tmpfs + * and ramfs pages which have been modified since creation by read + * fault. + * + * Note that mapping must be decided above, before decrementing + * mapcount (which luckily provides a barrier): once page is unmapped, + * it could be truncated and page->mapping reset to NULL at any moment. + * Note also that we are relying on page_mapping(page) to set mapping + * to &swapper_space when PageSwapCache(page). */ - if ((!anon || PageSwapCache(page)) && + if (mapping && !mapping_cap_account_dirty(mapping) && page_test_and_clear_dirty(page_to_pfn(page), 1)) set_page_dirty(page); /* diff --git a/mm/shmem.c b/mm/shmem.c index 2910f0d58c46..448956691e6b 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -595,7 +595,7 @@ static void shmem_evict_inode(struct inode *inode) kfree(xattr->name); kfree(xattr); } - BUG_ON(inode->i_blocks); + WARN_ON(inode->i_blocks); shmem_free_inode(inode->i_sb); end_writeback(inode); } @@ -798,24 +798,28 @@ static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo) static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp, struct shmem_inode_info *info, pgoff_t index) { - struct mempolicy mpol, *spol; struct vm_area_struct pvma; - - spol = mpol_cond_copy(&mpol, - mpol_shared_policy_lookup(&info->policy, index)); + struct page *page; /* Create a pseudo vma that just contains the policy */ pvma.vm_start = 0; pvma.vm_pgoff = index; pvma.vm_ops = NULL; - pvma.vm_policy = spol; - return swapin_readahead(swap, gfp, &pvma, 0); + pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index); + + page = swapin_readahead(swap, gfp, &pvma, 0); + + /* Drop reference taken by mpol_shared_policy_lookup() */ + mpol_cond_put(pvma.vm_policy); + + return page; } static struct page *shmem_alloc_page(gfp_t gfp, struct shmem_inode_info *info, pgoff_t index) { struct vm_area_struct pvma; + struct page *page; /* Create a pseudo vma that just contains the policy */ pvma.vm_start = 0; @@ -823,10 +827,12 @@ static struct page *shmem_alloc_page(gfp_t gfp, pvma.vm_ops = NULL; pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index); - /* - * alloc_page_vma() will drop the shared policy reference - */ - return alloc_page_vma(gfp, &pvma, 0); + page = alloc_page_vma(gfp, &pvma, 0); + + /* Drop reference taken by mpol_shared_policy_lookup() */ + mpol_cond_put(pvma.vm_policy); + + return page; } #else /* !CONFIG_NUMA */ #ifdef CONFIG_TMPFS @@ -2018,12 +2024,14 @@ static struct dentry *shmem_fh_to_dentry(struct super_block *sb, { struct inode *inode; struct dentry *dentry = NULL; - u64 inum = fid->raw[2]; - inum = (inum << 32) | fid->raw[1]; + u64 inum; if (fh_len < 3) return NULL; + inum = fid->raw[2]; + inum = (inum << 32) | fid->raw[1]; + inode = ilookup5(sb, (unsigned long)(inum + fid->raw[0]), shmem_match, fid->raw); if (inode) { @@ -2169,6 +2177,7 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data) unsigned long inodes; int error = -EINVAL; + config.mpol = NULL; if (shmem_parse_options(data, &config, true)) return error; @@ -2193,8 +2202,13 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data) sbinfo->max_inodes = config.max_inodes; sbinfo->free_inodes = config.max_inodes - inodes; - mpol_put(sbinfo->mpol); - sbinfo->mpol = config.mpol; /* transfers initial ref */ + /* + * Preserve previous mempolicy unless mpol remount option was specified. + */ + if (config.mpol) { + mpol_put(sbinfo->mpol); + sbinfo->mpol = config.mpol; /* transfers initial ref */ + } out: spin_unlock(&sbinfo->stat_lock); return error; diff --git a/mm/slab.c b/mm/slab.c index e901a36e2520..da2bb689a008 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1685,9 +1685,6 @@ void __init kmem_cache_init_late(void) g_cpucache_up = LATE; - /* Annotate slab for lockdep -- annotate the malloc caches */ - init_lock_keys(); - /* 6) resize the head arrays to their final sizes */ mutex_lock(&cache_chain_mutex); list_for_each_entry(cachep, &cache_chain, next) @@ -1695,6 +1692,9 @@ void __init kmem_cache_init_late(void) BUG(); mutex_unlock(&cache_chain_mutex); + /* Annotate slab for lockdep -- annotate the malloc caches */ + init_lock_keys(); + /* Done! */ g_cpucache_up = FULL; diff --git a/mm/sparse.c b/mm/sparse.c index a8bc7d364deb..290dba25a7ed 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -619,7 +619,7 @@ static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages) { return; /* XXX: Not implemented yet */ } -static void free_map_bootmem(struct page *page, unsigned long nr_pages) +static void free_map_bootmem(struct page *memmap, unsigned long nr_pages) { } #else @@ -660,10 +660,11 @@ static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages) get_order(sizeof(struct page) * nr_pages)); } -static void free_map_bootmem(struct page *page, unsigned long nr_pages) +static void free_map_bootmem(struct page *memmap, unsigned long nr_pages) { unsigned long maps_section_nr, removing_section_nr, i; unsigned long magic; + struct page *page = virt_to_page(memmap); for (i = 0; i < nr_pages; i++, page++) { magic = (unsigned long) page->lru.next; @@ -712,13 +713,10 @@ static void free_section_usemap(struct page *memmap, unsigned long *usemap) */ if (memmap) { - struct page *memmap_page; - memmap_page = virt_to_page(memmap); - nr_pages = PAGE_ALIGN(PAGES_PER_SECTION * sizeof(struct page)) >> PAGE_SHIFT; - free_map_bootmem(memmap_page, nr_pages); + free_map_bootmem(memmap, nr_pages); } } diff --git a/mm/truncate.c b/mm/truncate.c index 61a183b89df6..4224627695ba 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -394,11 +394,12 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page) if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL)) return 0; + clear_page_mlock(page); + spin_lock_irq(&mapping->tree_lock); if (PageDirty(page)) goto failed; - clear_page_mlock(page); BUG_ON(page_has_private(page)); __delete_from_page_cache(page); spin_unlock_irq(&mapping->tree_lock); diff --git a/mm/vmscan.c b/mm/vmscan.c index be5bc0af2e76..e6ca5051c021 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1983,10 +1983,10 @@ static void get_scan_count(struct mem_cgroup_zone *mz, struct scan_control *sc, * proportional to the fraction of recently scanned pages on * each list that were recently referenced and in active use. */ - ap = (anon_prio + 1) * (reclaim_stat->recent_scanned[0] + 1); + ap = anon_prio * (reclaim_stat->recent_scanned[0] + 1); ap /= reclaim_stat->recent_rotated[0] + 1; - fp = (file_prio + 1) * (reclaim_stat->recent_scanned[1] + 1); + fp = file_prio * (reclaim_stat->recent_scanned[1] + 1); fp /= reclaim_stat->recent_rotated[1] + 1; spin_unlock_irq(&mz->zone->lru_lock); @@ -1999,7 +1999,7 @@ out: unsigned long scan; scan = zone_nr_lru_pages(mz, lru); - if (priority || noswap) { + if (priority || noswap || !vmscan_swappiness(mz, sc)) { scan >>= priority; if (!scan && force_scan) scan = SWAP_CLUSTER_MAX; @@ -3128,6 +3128,8 @@ static int kswapd(void *p) &balanced_classzone_idx); } } + + current->reclaim_state = NULL; return 0; } |