diff options
author | Andy Voltz <andy.voltz@timesys.com> | 2010-11-15 16:00:38 -0500 |
---|---|---|
committer | Andy Voltz <andy.voltz@timesys.com> | 2010-11-17 11:30:34 -0500 |
commit | 2e0ae4bf137e282e825b27b02eb46f86c59d004e (patch) | |
tree | 7c2534acd757ca414f02c5b983ee67eb2b2493c6 /mm | |
parent | 22763c5cf3690a681551162c15d34d935308c8d7 (diff) |
Bump Linux kernel to 2.6.32.25
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 1 | ||||
-rw-r--r-- | mm/bounce.c | 2 | ||||
-rw-r--r-- | mm/fadvise.c | 10 | ||||
-rw-r--r-- | mm/filemap.c | 113 | ||||
-rw-r--r-- | mm/hugetlb.c | 9 | ||||
-rw-r--r-- | mm/internal.h | 3 | ||||
-rw-r--r-- | mm/ksm.c | 14 | ||||
-rw-r--r-- | mm/memcontrol.c | 24 | ||||
-rw-r--r-- | mm/memory-failure.c | 21 | ||||
-rw-r--r-- | mm/memory.c | 66 | ||||
-rw-r--r-- | mm/memory_hotplug.c | 16 | ||||
-rw-r--r-- | mm/mempolicy.c | 40 | ||||
-rw-r--r-- | mm/migrate.c | 3 | ||||
-rw-r--r-- | mm/mincore.c | 37 | ||||
-rw-r--r-- | mm/mlock.c | 17 | ||||
-rw-r--r-- | mm/mmap.c | 66 | ||||
-rw-r--r-- | mm/mmzone.c | 21 | ||||
-rw-r--r-- | mm/mremap.c | 241 | ||||
-rw-r--r-- | mm/nommu.c | 7 | ||||
-rw-r--r-- | mm/oom_kill.c | 4 | ||||
-rw-r--r-- | mm/page_alloc.c | 40 | ||||
-rw-r--r-- | mm/pagewalk.c | 16 | ||||
-rw-r--r-- | mm/percpu.c | 2 | ||||
-rw-r--r-- | mm/readahead.c | 18 | ||||
-rw-r--r-- | mm/slab.c | 10 | ||||
-rw-r--r-- | mm/swapfile.c | 6 | ||||
-rw-r--r-- | mm/truncate.c | 30 | ||||
-rw-r--r-- | mm/util.c | 44 | ||||
-rw-r--r-- | mm/vmalloc.c | 115 | ||||
-rw-r--r-- | mm/vmscan.c | 71 | ||||
-rw-r--r-- | mm/vmstat.c | 15 |
31 files changed, 778 insertions, 304 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index 44cf6f0a3a6d..2c19c0bac10e 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -227,6 +227,7 @@ config KSM config DEFAULT_MMAP_MIN_ADDR int "Low address space to protect from user allocation" + depends on MMU default 4096 help This is the portion of low virtual memory which should be protected diff --git a/mm/bounce.c b/mm/bounce.c index a2b76a588e34..1d5fa0870ca9 100644 --- a/mm/bounce.c +++ b/mm/bounce.c @@ -115,8 +115,8 @@ static void copy_to_high_bio_irq(struct bio *to, struct bio *from) */ vfrom = page_address(fromvec->bv_page) + tovec->bv_offset; - flush_dcache_page(tovec->bv_page); bounce_copy_vec(tovec, vfrom); + flush_dcache_page(tovec->bv_page); } } diff --git a/mm/fadvise.c b/mm/fadvise.c index e43359214f6f..8d723c9e8b75 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c @@ -77,12 +77,20 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice) switch (advice) { case POSIX_FADV_NORMAL: file->f_ra.ra_pages = bdi->ra_pages; + spin_lock(&file->f_lock); + file->f_mode &= ~FMODE_RANDOM; + spin_unlock(&file->f_lock); break; case POSIX_FADV_RANDOM: - file->f_ra.ra_pages = 0; + spin_lock(&file->f_lock); + file->f_mode |= FMODE_RANDOM; + spin_unlock(&file->f_lock); break; case POSIX_FADV_SEQUENTIAL: file->f_ra.ra_pages = bdi->ra_pages * 2; + spin_lock(&file->f_lock); + file->f_mode &= ~FMODE_RANDOM; + spin_unlock(&file->f_lock); break; case POSIX_FADV_WILLNEED: if (!mapping->a_ops->readpage) { diff --git a/mm/filemap.c b/mm/filemap.c index ef169f37156d..46e3f8a11b2a 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -462,7 +462,7 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping, /* * Splice_read and readahead add shmem/tmpfs pages into the page cache * before shmem_readpage has a chance to mark them as SwapBacked: they - * need to go on the active_anon lru below, and mem_cgroup_cache_charge + * need to go on the anon lru below, and mem_cgroup_cache_charge * (called in add_to_page_cache) needs to know where they're going too. */ if (mapping_cap_swap_backed(mapping)) @@ -473,7 +473,7 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping, if (page_is_file_cache(page)) lru_cache_add_file(page); else - lru_cache_add_active_anon(page); + lru_cache_add_anon(page); } return ret; } @@ -1120,6 +1120,12 @@ page_not_up_to_date_locked: } readpage: + /* + * A previous I/O error may have been due to temporary + * failures, eg. multipath errors. + * PG_error will be set again if readpage fails. + */ + ClearPageError(page); /* Start the actual read. The read will unlock the page. */ error = mapping->a_ops->readpage(filp, page); @@ -1655,14 +1661,15 @@ EXPORT_SYMBOL(generic_file_readonly_mmap); static struct page *__read_cache_page(struct address_space *mapping, pgoff_t index, int (*filler)(void *,struct page*), - void *data) + void *data, + gfp_t gfp) { struct page *page; int err; repeat: page = find_get_page(mapping, index); if (!page) { - page = page_cache_alloc_cold(mapping); + page = __page_cache_alloc(gfp | __GFP_COLD); if (!page) return ERR_PTR(-ENOMEM); err = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL); @@ -1682,31 +1689,18 @@ repeat: return page; } -/** - * read_cache_page_async - read into page cache, fill it if needed - * @mapping: the page's address_space - * @index: the page index - * @filler: function to perform the read - * @data: destination for read data - * - * Same as read_cache_page, but don't wait for page to become unlocked - * after submitting it to the filler. - * - * Read into the page cache. If a page already exists, and PageUptodate() is - * not set, try to fill the page but don't wait for it to become unlocked. - * - * If the page does not get brought uptodate, return -EIO. - */ -struct page *read_cache_page_async(struct address_space *mapping, +static struct page *do_read_cache_page(struct address_space *mapping, pgoff_t index, int (*filler)(void *,struct page*), - void *data) + void *data, + gfp_t gfp) + { struct page *page; int err; retry: - page = __read_cache_page(mapping, index, filler, data); + page = __read_cache_page(mapping, index, filler, data, gfp); if (IS_ERR(page)) return page; if (PageUptodate(page)) @@ -1731,8 +1725,67 @@ out: mark_page_accessed(page); return page; } + +/** + * read_cache_page_async - read into page cache, fill it if needed + * @mapping: the page's address_space + * @index: the page index + * @filler: function to perform the read + * @data: destination for read data + * + * Same as read_cache_page, but don't wait for page to become unlocked + * after submitting it to the filler. + * + * Read into the page cache. If a page already exists, and PageUptodate() is + * not set, try to fill the page but don't wait for it to become unlocked. + * + * If the page does not get brought uptodate, return -EIO. + */ +struct page *read_cache_page_async(struct address_space *mapping, + pgoff_t index, + int (*filler)(void *,struct page*), + void *data) +{ + return do_read_cache_page(mapping, index, filler, data, mapping_gfp_mask(mapping)); +} EXPORT_SYMBOL(read_cache_page_async); +static struct page *wait_on_page_read(struct page *page) +{ + if (!IS_ERR(page)) { + wait_on_page_locked(page); + if (!PageUptodate(page)) { + page_cache_release(page); + page = ERR_PTR(-EIO); + } + } + return page; +} + +/** + * read_cache_page_gfp - read into page cache, using specified page allocation flags. + * @mapping: the page's address_space + * @index: the page index + * @gfp: the page allocator flags to use if allocating + * + * This is the same as "read_mapping_page(mapping, index, NULL)", but with + * any new page allocations done using the specified allocation flags. Note + * that the Radix tree operations will still use GFP_KERNEL, so you can't + * expect to do this atomically or anything like that - but you can pass in + * other page requirements. + * + * If the page does not get brought uptodate, return -EIO. + */ +struct page *read_cache_page_gfp(struct address_space *mapping, + pgoff_t index, + gfp_t gfp) +{ + filler_t *filler = (filler_t *)mapping->a_ops->readpage; + + return wait_on_page_read(do_read_cache_page(mapping, index, filler, NULL, gfp)); +} +EXPORT_SYMBOL(read_cache_page_gfp); + /** * read_cache_page - read into page cache, fill it if needed * @mapping: the page's address_space @@ -1750,18 +1803,7 @@ struct page *read_cache_page(struct address_space *mapping, int (*filler)(void *,struct page*), void *data) { - struct page *page; - - page = read_cache_page_async(mapping, index, filler, data); - if (IS_ERR(page)) - goto out; - wait_on_page_locked(page); - if (!PageUptodate(page)) { - page_cache_release(page); - page = ERR_PTR(-EIO); - } - out: - return page; + return wait_on_page_read(read_cache_page_async(mapping, index, filler, data)); } EXPORT_SYMBOL(read_cache_page); @@ -2217,6 +2259,9 @@ again: if (unlikely(status)) break; + if (mapping_writably_mapped(mapping)) + flush_dcache_page(page); + pagefault_disable(); copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes); pagefault_enable(); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 5d7601b02874..f5a106eeb2bd 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -401,7 +401,7 @@ static void clear_huge_page(struct page *page, { int i; - if (unlikely(sz > MAX_ORDER_NR_PAGES)) { + if (unlikely(sz/PAGE_SIZE > MAX_ORDER_NR_PAGES)) { clear_gigantic_page(page, addr, sz); return; } @@ -545,6 +545,7 @@ static void free_huge_page(struct page *page) mapping = (struct address_space *) page_private(page); set_page_private(page, 0); + page->mapping = NULL; BUG_ON(page_count(page)); INIT_LIST_HEAD(&page->lru); @@ -1007,7 +1008,7 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, page = alloc_buddy_huge_page(h, vma, addr); if (!page) { hugetlb_put_quota(inode->i_mapping, chg); - return ERR_PTR(-VM_FAULT_OOM); + return ERR_PTR(-VM_FAULT_SIGBUS); } } @@ -2095,8 +2096,10 @@ retry: spin_lock(&inode->i_lock); inode->i_blocks += blocks_per_huge_page(h); spin_unlock(&inode->i_lock); - } else + } else { lock_page(page); + page->mapping = HUGETLB_POISON; + } } /* diff --git a/mm/internal.h b/mm/internal.h index 22ec8d2b0fb8..17bc0df273bb 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -107,9 +107,10 @@ static inline int is_mlocked_vma(struct vm_area_struct *vma, struct page *page) } /* - * must be called with vma's mmap_sem held for read, and page locked. + * must be called with vma's mmap_sem held for read or write, and page locked. */ extern void mlock_vma_page(struct page *page); +extern void munlock_vma_page(struct page *page); /* * Clear the page's PageMlocked(). This can be useful in a situation where @@ -34,6 +34,7 @@ #include <linux/ksm.h> #include <asm/tlbflush.h> +#include "internal.h" /* * A few notes about the KSM scanning process, @@ -767,15 +768,14 @@ static int try_to_merge_one_page(struct vm_area_struct *vma, * ptes are necessarily already write-protected. But in either * case, we need to lock and check page_count is not raised. */ - if (write_protect_page(vma, oldpage, &orig_pte)) { - unlock_page(oldpage); - goto out_putpage; - } - unlock_page(oldpage); - - if (pages_identical(oldpage, newpage)) + if (write_protect_page(vma, oldpage, &orig_pte) == 0 && + pages_identical(oldpage, newpage)) err = replace_page(vma, oldpage, newpage, orig_pte); + if ((vma->vm_flags & VM_LOCKED) && !err) + munlock_vma_page(oldpage); + + unlock_page(oldpage); out_putpage: put_page(oldpage); put_page(newpage); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index f99f5991d6bb..ba9a0aaa4b74 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -758,7 +758,13 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) task_unlock(task); if (!curr) return 0; - if (curr->use_hierarchy) + /* + * We should check use_hierarchy of "mem" not "curr". Because checking + * use_hierarchy of "curr" here make this function true if hierarchy is + * enabled in "curr" and "curr" is a child of "mem" in *cgroup* + * hierarchy(even if use_hierarchy is disabled in "mem"). + */ + if (mem->use_hierarchy) ret = css_is_ancestor(&curr->css, &mem->css); else ret = (curr == mem); @@ -2002,12 +2008,12 @@ int mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr) } unlock_page_cgroup(pc); + *ptr = mem; if (mem) { - ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false, + ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, ptr, false, page); css_put(&mem->css); } - *ptr = mem; return ret; } @@ -2375,7 +2381,7 @@ static int mem_cgroup_force_empty(struct mem_cgroup *mem, bool free_all) if (free_all) goto try_to_free; move_account: - while (mem->res.usage > 0) { + do { ret = -EBUSY; if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children)) goto out; @@ -2402,8 +2408,8 @@ move_account: if (ret == -ENOMEM) goto try_to_free; cond_resched(); - } - ret = 0; + /* "ret" should also be checked to ensure all lists are empty. */ + } while (mem->res.usage > 0 || ret); out: css_put(&mem->css); return ret; @@ -2436,10 +2442,7 @@ try_to_free: } lru_add_drain(); /* try move_account...there may be some *locked* pages. */ - if (mem->res.usage) - goto move_account; - ret = 0; - goto out; + goto move_account; } int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event) @@ -2541,6 +2544,7 @@ static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) val += idx_val; mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_SWAPOUT, &idx_val); + val += idx_val; val <<= PAGE_SHIFT; } else val = res_counter_read_u64(&mem->memsw, name); diff --git a/mm/memory-failure.c b/mm/memory-failure.c index dacc64183874..8aeba5341ccd 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -589,7 +589,6 @@ static struct page_state { { lru|dirty, lru|dirty, "LRU", me_pagecache_dirty }, { lru|dirty, lru, "clean LRU", me_pagecache_clean }, - { swapbacked, swapbacked, "anonymous", me_pagecache_clean }, /* * Catchall entry: must be at end. @@ -638,7 +637,7 @@ static int page_action(struct page_state *ps, struct page *p, * Do all that is necessary to remove user space mappings. Unmap * the pages and send SIGBUS to the processes if the data was dirty. */ -static void hwpoison_user_mappings(struct page *p, unsigned long pfn, +static int hwpoison_user_mappings(struct page *p, unsigned long pfn, int trapno) { enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS; @@ -648,15 +647,18 @@ static void hwpoison_user_mappings(struct page *p, unsigned long pfn, int i; int kill = 1; - if (PageReserved(p) || PageCompound(p) || PageSlab(p) || PageKsm(p)) - return; + if (PageReserved(p) || PageSlab(p)) + return SWAP_SUCCESS; /* * This check implies we don't kill processes if their pages * are in the swap cache early. Those are always late kills. */ if (!page_mapped(p)) - return; + return SWAP_SUCCESS; + + if (PageCompound(p) || PageKsm(p)) + return SWAP_FAIL; if (PageSwapCache(p)) { printk(KERN_ERR @@ -718,6 +720,8 @@ static void hwpoison_user_mappings(struct page *p, unsigned long pfn, */ kill_procs_ao(&tokill, !!PageDirty(p), trapno, ret != SWAP_SUCCESS, pfn); + + return ret; } int __memory_failure(unsigned long pfn, int trapno, int ref) @@ -787,8 +791,13 @@ int __memory_failure(unsigned long pfn, int trapno, int ref) /* * Now take care of user space mappings. + * Abort on fail: __remove_from_page_cache() assumes unmapped page. */ - hwpoison_user_mappings(p, pfn, trapno); + if (hwpoison_user_mappings(p, pfn, trapno) != SWAP_SUCCESS) { + printk(KERN_ERR "MCE %#lx: cannot unmap page, give up\n", pfn); + res = -EBUSY; + goto out; + } /* * Torn down by someone else? diff --git a/mm/memory.c b/mm/memory.c index 6ab19dd4a199..53c1da0d04a6 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1282,10 +1282,20 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, return i ? : -EFAULT; } if (pages) { - struct page *page = vm_normal_page(gate_vma, start, *pte); + struct page *page; + + page = vm_normal_page(gate_vma, start, *pte); + if (!page) { + if (!(gup_flags & FOLL_DUMP) && + is_zero_pfn(pte_pfn(*pte))) + page = pte_page(*pte); + else { + pte_unmap(pte); + return i ? : -EFAULT; + } + } pages[i] = page; - if (page) - get_page(page); + get_page(page); } pte_unmap(pte); if (vmas) @@ -2514,7 +2524,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, ret = VM_FAULT_HWPOISON; } else { print_bad_pte(vma, address, orig_pte, NULL); - ret = VM_FAULT_OOM; + ret = VM_FAULT_SIGBUS; } goto out; } @@ -2620,6 +2630,40 @@ out_release: } /* + * This is like a special single-page "expand_{down|up}wards()", + * except we must first make sure that 'address{-|+}PAGE_SIZE' + * doesn't hit another vma. + */ +static inline int check_stack_guard_page(struct vm_area_struct *vma, unsigned long address) +{ + address &= PAGE_MASK; + if ((vma->vm_flags & VM_GROWSDOWN) && address == vma->vm_start) { + struct vm_area_struct *prev = vma->vm_prev; + + /* + * Is there a mapping abutting this one below? + * + * That's only ok if it's the same stack mapping + * that has gotten split.. + */ + if (prev && prev->vm_end == address) + return prev->vm_flags & VM_GROWSDOWN ? 0 : -ENOMEM; + + expand_stack(vma, address - PAGE_SIZE); + } + if ((vma->vm_flags & VM_GROWSUP) && address + PAGE_SIZE == vma->vm_end) { + struct vm_area_struct *next = vma->vm_next; + + /* As VM_GROWSDOWN but s/below/above/ */ + if (next && next->vm_start == address + PAGE_SIZE) + return next->vm_flags & VM_GROWSUP ? 0 : -ENOMEM; + + expand_upwards(vma, address + PAGE_SIZE); + } + return 0; +} + +/* * We enter with non-exclusive mmap_sem (to exclude vma changes, * but allow concurrent faults), and pte mapped but not yet locked. * We return with mmap_sem still held, but pte unmapped and unlocked. @@ -2632,19 +2676,23 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, spinlock_t *ptl; pte_t entry; + pte_unmap(page_table); + + /* Check if we need to add a guard page to the stack */ + if (check_stack_guard_page(vma, address) < 0) + return VM_FAULT_SIGBUS; + + /* Use the zero-page for reads */ if (!(flags & FAULT_FLAG_WRITE)) { entry = pte_mkspecial(pfn_pte(my_zero_pfn(address), vma->vm_page_prot)); - ptl = pte_lockptr(mm, pmd); - spin_lock(ptl); + page_table = pte_offset_map_lock(mm, pmd, address, &ptl); if (!pte_none(*page_table)) goto unlock; goto setpte; } /* Allocate our own private page. */ - pte_unmap(page_table); - if (unlikely(anon_vma_prepare(vma))) goto oom; page = alloc_zeroed_user_highpage_movable(vma, address); @@ -2910,7 +2958,7 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, * Page table corrupted: show pte and kill process. */ print_bad_pte(vma, address, orig_pte, NULL); - return VM_FAULT_OOM; + return VM_FAULT_SIGBUS; } pgoff = pte_to_pgoff(orig_pte); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 2047465cd27c..6d27a5b625fa 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -551,19 +551,19 @@ static inline int pageblock_free(struct page *page) /* Return the start of the next active pageblock after a given page */ static struct page *next_active_pageblock(struct page *page) { - int pageblocks_stride; - /* Ensure the starting page is pageblock-aligned */ BUG_ON(page_to_pfn(page) & (pageblock_nr_pages - 1)); - /* Move forward by at least 1 * pageblock_nr_pages */ - pageblocks_stride = 1; - /* If the entire pageblock is free, move to the end of free page */ - if (pageblock_free(page)) - pageblocks_stride += page_order(page) - pageblock_order; + if (pageblock_free(page)) { + int order; + /* be careful. we don't have locks, page_order can be changed.*/ + order = page_order(page); + if ((order < MAX_ORDER) && (order >= pageblock_order)) + return page + (1 << order); + } - return page + (pageblocks_stride * pageblock_nr_pages); + return page + pageblock_nr_pages; } /* Checks if this range of memory is likely to be hot-removable. */ diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 4545d5944243..f29d8d70884d 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2122,8 +2122,8 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) char *rest = nodelist; while (isdigit(*rest)) rest++; - if (!*rest) - err = 0; + if (*rest) + goto out; } break; case MPOL_INTERLEAVE: @@ -2132,7 +2132,6 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) */ if (!nodelist) nodes = node_states[N_HIGH_MEMORY]; - err = 0; break; case MPOL_LOCAL: /* @@ -2142,11 +2141,19 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) goto out; mode = MPOL_PREFERRED; break; - - /* - * case MPOL_BIND: mpol_new() enforces non-empty nodemask. - * case MPOL_DEFAULT: mpol_new() enforces empty nodemask, ignores flags. - */ + case MPOL_DEFAULT: + /* + * Insist on a empty nodelist + */ + if (!nodelist) + err = 0; + goto out; + case MPOL_BIND: + /* + * Insist on a nodelist + */ + if (!nodelist) + goto out; } mode_flags = 0; @@ -2160,13 +2167,14 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) else if (!strcmp(flags, "relative")) mode_flags |= MPOL_F_RELATIVE_NODES; else - err = 1; + goto out; } new = mpol_new(mode, mode_flags, &nodes); if (IS_ERR(new)) - err = 1; - else { + goto out; + + { int ret; NODEMASK_SCRATCH(scratch); if (scratch) { @@ -2177,13 +2185,15 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) ret = -ENOMEM; NODEMASK_SCRATCH_FREE(scratch); if (ret) { - err = 1; mpol_put(new); - } else if (no_context) { - /* save for contextualization */ - new->w.user_nodemask = nodes; + goto out; } } + err = 0; + if (no_context) { + /* save for contextualization */ + new->w.user_nodemask = nodes; + } out: /* Restore string for error message */ diff --git a/mm/migrate.c b/mm/migrate.c index 7dbcb22316d2..0e39f94b2d9c 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -953,6 +953,9 @@ static int do_pages_move(struct mm_struct *mm, struct task_struct *task, goto out_pm; err = -ENODEV; + if (node < 0 || node >= MAX_NUMNODES) + goto out_pm; + if (!node_state(node, N_HIGH_MEMORY)) goto out_pm; diff --git a/mm/mincore.c b/mm/mincore.c index 8cb508f84ea4..7a3436ef39eb 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -14,6 +14,7 @@ #include <linux/syscalls.h> #include <linux/swap.h> #include <linux/swapops.h> +#include <linux/hugetlb.h> #include <asm/uaccess.h> #include <asm/pgtable.h> @@ -72,6 +73,42 @@ static long do_mincore(unsigned long addr, unsigned char *vec, unsigned long pag if (!vma || addr < vma->vm_start) return -ENOMEM; +#ifdef CONFIG_HUGETLB_PAGE + if (is_vm_hugetlb_page(vma)) { + struct hstate *h; + unsigned long nr_huge; + unsigned char present; + + i = 0; + nr = min(pages, (vma->vm_end - addr) >> PAGE_SHIFT); + h = hstate_vma(vma); + nr_huge = ((addr + pages * PAGE_SIZE - 1) >> huge_page_shift(h)) + - (addr >> huge_page_shift(h)) + 1; + nr_huge = min(nr_huge, + (vma->vm_end - addr) >> huge_page_shift(h)); + while (1) { + /* hugepage always in RAM for now, + * but generally it needs to be check */ + ptep = huge_pte_offset(current->mm, + addr & huge_page_mask(h)); + present = !!(ptep && + !huge_pte_none(huge_ptep_get(ptep))); + while (1) { + vec[i++] = present; + addr += PAGE_SIZE; + /* reach buffer limit */ + if (i == nr) + return nr; + /* check hugepage border */ + if (!((addr & ~huge_page_mask(h)) + >> PAGE_SHIFT)) + break; + } + } + return nr; + } +#endif + /* * Calculate how many pages there are left in the last level of the * PTE array for our address. diff --git a/mm/mlock.c b/mm/mlock.c index bd6f0e466f6c..2d846cfe3990 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -99,14 +99,14 @@ void mlock_vma_page(struct page *page) * not get another chance to clear PageMlocked. If we successfully * isolate the page and try_to_munlock() detects other VM_LOCKED vmas * mapping the page, it will restore the PageMlocked state, unless the page - * is mapped in a non-linear vma. So, we go ahead and SetPageMlocked(), + * is mapped in a non-linear vma. So, we go ahead and ClearPageMlocked(), * perhaps redundantly. * If we lose the isolation race, and the page is mapped by other VM_LOCKED * vmas, we'll detect this in vmscan--via try_to_munlock() or try_to_unmap() * either of which will restore the PageMlocked state by calling * mlock_vma_page() above, if it can grab the vma's mmap sem. */ -static void munlock_vma_page(struct page *page) +void munlock_vma_page(struct page *page) { BUG_ON(!PageLocked(page)); @@ -138,6 +138,13 @@ static void munlock_vma_page(struct page *page) } } +static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long addr) +{ + return (vma->vm_flags & VM_GROWSDOWN) && + (vma->vm_start == addr) && + !vma_stack_continue(vma->vm_prev, addr); +} + /** * __mlock_vma_pages_range() - mlock a range of pages in the vma. * @vma: target vma @@ -170,6 +177,12 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma, if (vma->vm_flags & VM_WRITE) gup_flags |= FOLL_WRITE; + /* We don't try to access the guard page of a stack vma */ + if (stack_guard_page(vma, start)) { + addr += PAGE_SIZE; + nr_pages--; + } + while (nr_pages > 0) { int i; diff --git a/mm/mmap.c b/mm/mmap.c index 73f5e4b64010..866a66669037 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -389,17 +389,23 @@ static inline void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma, struct vm_area_struct *prev, struct rb_node *rb_parent) { + struct vm_area_struct *next; + + vma->vm_prev = prev; if (prev) { - vma->vm_next = prev->vm_next; + next = prev->vm_next; prev->vm_next = vma; } else { mm->mmap = vma; if (rb_parent) - vma->vm_next = rb_entry(rb_parent, + next = rb_entry(rb_parent, struct vm_area_struct, vm_rb); else - vma->vm_next = NULL; + next = NULL; } + vma->vm_next = next; + if (next) + next->vm_prev = vma; } void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma, @@ -487,7 +493,11 @@ static inline void __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma, struct vm_area_struct *prev) { - prev->vm_next = vma->vm_next; + struct vm_area_struct *next = vma->vm_next; + + prev->vm_next = next; + if (next) + next->vm_prev = prev; rb_erase(&vma->vm_rb, &mm->mm_rb); if (mm->mmap_cache == vma) mm->mmap_cache = prev; @@ -932,13 +942,9 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, if (!(flags & MAP_FIXED)) addr = round_hint_to_min(addr); - error = arch_mmap_check(addr, len, flags); - if (error) - return error; - /* Careful about overflows.. */ len = PAGE_ALIGN(len); - if (!len || len > TASK_SIZE) + if (!len) return -ENOMEM; /* offset overflow? */ @@ -949,24 +955,6 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, if (mm->map_count > sysctl_max_map_count) return -ENOMEM; - if (flags & MAP_HUGETLB) { - struct user_struct *user = NULL; - if (file) - return -EINVAL; - - /* - * VM_NORESERVE is used because the reservations will be - * taken when vm_ops->mmap() is called - * A dummy user value is used because we are not locking - * memory so no accounting is necessary - */ - len = ALIGN(len, huge_page_size(&default_hstate)); - file = hugetlb_file_setup(HUGETLB_ANON_FILE, len, VM_NORESERVE, - &user, HUGETLB_ANONHUGE_INODE); - if (IS_ERR(file)) - return PTR_ERR(file); - } - /* Obtain the address to map to. we verify (or select) it and ensure * that it represents a valid section of the address space. */ @@ -1459,6 +1447,14 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long (*get_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); + unsigned long error = arch_mmap_check(addr, len, flags); + if (error) + return error; + + /* Careful about overflows.. */ + if (len > TASK_SIZE) + return -ENOMEM; + get_area = current->mm->get_unmapped_area; if (file && file->f_op && file->f_op->get_unmapped_area) get_area = file->f_op->get_unmapped_area; @@ -1604,9 +1600,6 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns * PA-RISC uses this for its stack; IA64 for its Register Backing Store. * vma is the last one with address > vma->vm_end. Have to extend vma. */ -#ifndef CONFIG_IA64 -static -#endif int expand_upwards(struct vm_area_struct *vma, unsigned long address) { int error; @@ -1812,6 +1805,7 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr; insertion_point = (prev ? &prev->vm_next : &mm->mmap); + vma->vm_prev = NULL; do { rb_erase(&vma->vm_rb, &mm->mm_rb); mm->map_count--; @@ -1819,6 +1813,8 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma, vma = vma->vm_next; } while (vma && vma->vm_start < end); *insertion_point = vma; + if (vma) + vma->vm_prev = prev; tail_vma->vm_next = NULL; if (mm->unmap_area == arch_unmap_area) addr = prev ? prev->vm_end : mm->mmap_base; @@ -2003,20 +1999,14 @@ unsigned long do_brk(unsigned long addr, unsigned long len) if (!len) return addr; - if ((addr + len) > TASK_SIZE || (addr + len) < addr) - return -EINVAL; - - if (is_hugepage_only_range(mm, addr, len)) - return -EINVAL; - error = security_file_mmap(NULL, 0, 0, 0, addr, 1); if (error) return error; flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; - error = arch_mmap_check(addr, len, flags); - if (error) + error = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED); + if (error & ~PAGE_MASK) return error; /* diff --git a/mm/mmzone.c b/mm/mmzone.c index f5b7d1760213..e35bfb82c855 100644 --- a/mm/mmzone.c +++ b/mm/mmzone.c @@ -87,3 +87,24 @@ int memmap_valid_within(unsigned long pfn, return 1; } #endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */ + +#ifdef CONFIG_SMP +/* Called when a more accurate view of NR_FREE_PAGES is needed */ +unsigned long zone_nr_free_pages(struct zone *zone) +{ + unsigned long nr_free_pages = zone_page_state(zone, NR_FREE_PAGES); + + /* + * While kswapd is awake, it is considered the zone is under some + * memory pressure. Under pressure, there is a risk that + * per-cpu-counter-drift will allow the min watermark to be breached + * potentially causing a live-lock. While kswapd is awake and + * free pages are low, get a better estimate for free pages + */ + if (nr_free_pages < zone->percpu_drift_mark && + !waitqueue_active(&zone->zone_pgdat->kswapd_wait)) + return zone_page_state_snapshot(zone, NR_FREE_PAGES); + + return nr_free_pages; +} +#endif /* CONFIG_SMP */ diff --git a/mm/mremap.c b/mm/mremap.c index 97bff2547719..845190898d59 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -261,6 +261,137 @@ static unsigned long move_vma(struct vm_area_struct *vma, return new_addr; } +static struct vm_area_struct *vma_to_resize(unsigned long addr, + unsigned long old_len, unsigned long new_len, unsigned long *p) +{ + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma = find_vma(mm, addr); + + if (!vma || vma->vm_start > addr) + goto Efault; + + if (is_vm_hugetlb_page(vma)) + goto Einval; + + /* We can't remap across vm area boundaries */ + if (old_len > vma->vm_end - addr) + goto Efault; + + if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)) { + if (new_len > old_len) + goto Efault; + } + + if (vma->vm_flags & VM_LOCKED) { + unsigned long locked, lock_limit; + locked = mm->locked_vm << PAGE_SHIFT; + lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; + locked += new_len - old_len; + if (locked > lock_limit && !capable(CAP_IPC_LOCK)) + goto Eagain; + } + + if (!may_expand_vm(mm, (new_len - old_len) >> PAGE_SHIFT)) + goto Enomem; + + if (vma->vm_flags & VM_ACCOUNT) { + unsigned long charged = (new_len - old_len) >> PAGE_SHIFT; + if (security_vm_enough_memory(charged)) + goto Efault; + *p = charged; + } + + return vma; + +Efault: /* very odd choice for most of the cases, but... */ + return ERR_PTR(-EFAULT); +Einval: + return ERR_PTR(-EINVAL); +Enomem: + return ERR_PTR(-ENOMEM); +Eagain: + return ERR_PTR(-EAGAIN); +} + +static unsigned long mremap_to(unsigned long addr, + unsigned long old_len, unsigned long new_addr, + unsigned long new_len) +{ + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + unsigned long ret = -EINVAL; + unsigned long charged = 0; + unsigned long map_flags; + + if (new_addr & ~PAGE_MASK) + goto out; + + if (new_len > TASK_SIZE || new_addr > TASK_SIZE - new_len) + goto out; + + /* Check if the location we're moving into overlaps the + * old location at all, and fail if it does. + */ + if ((new_addr <= addr) && (new_addr+new_len) > addr) + goto out; + + if ((addr <= new_addr) && (addr+old_len) > new_addr) + goto out; + + ret = security_file_mmap(NULL, 0, 0, 0, new_addr, 1); + if (ret) + goto out; + + ret = do_munmap(mm, new_addr, new_len); + if (ret) + goto out; + + if (old_len >= new_len) { + ret = do_munmap(mm, addr+new_len, old_len - new_len); + if (ret && old_len != new_len) + goto out; + old_len = new_len; + } + + vma = vma_to_resize(addr, old_len, new_len, &charged); + if (IS_ERR(vma)) { + ret = PTR_ERR(vma); + goto out; + } + + map_flags = MAP_FIXED; + if (vma->vm_flags & VM_MAYSHARE) + map_flags |= MAP_SHARED; + + ret = get_unmapped_area(vma->vm_file, new_addr, new_len, vma->vm_pgoff + + ((addr - vma->vm_start) >> PAGE_SHIFT), + map_flags); + if (ret & ~PAGE_MASK) + goto out1; + + ret = move_vma(vma, addr, old_len, new_len, new_addr); + if (!(ret & ~PAGE_MASK)) + goto out; +out1: + vm_unacct_memory(charged); + +out: + return ret; +} + +static int vma_expandable(struct vm_area_struct *vma, unsigned long delta) +{ + unsigned long end = vma->vm_end + delta; + if (end < vma->vm_end) /* overflow */ + return 0; + if (vma->vm_next && vma->vm_next->vm_start < end) /* intersection */ + return 0; + if (get_unmapped_area(NULL, vma->vm_start, end - vma->vm_start, + 0, MAP_FIXED) & ~PAGE_MASK) + return 0; + return 1; +} + /* * Expand (or shrink) an existing mapping, potentially moving it at the * same time (controlled by the MREMAP_MAYMOVE flag and available VM space) @@ -294,32 +425,10 @@ unsigned long do_mremap(unsigned long addr, if (!new_len) goto out; - /* new_addr is only valid if MREMAP_FIXED is specified */ if (flags & MREMAP_FIXED) { - if (new_addr & ~PAGE_MASK) - goto out; - if (!(flags & MREMAP_MAYMOVE)) - goto out; - - if (new_len > TASK_SIZE || new_addr > TASK_SIZE - new_len) - goto out; - - /* Check if the location we're moving into overlaps the - * old location at all, and fail if it does. - */ - if ((new_addr <= addr) && (new_addr+new_len) > addr) - goto out; - - if ((addr <= new_addr) && (addr+old_len) > new_addr) - goto out; - - ret = security_file_mmap(NULL, 0, 0, 0, new_addr, 1); - if (ret) - goto out; - - ret = do_munmap(mm, new_addr, new_len); - if (ret) - goto out; + if (flags & MREMAP_MAYMOVE) + ret = mremap_to(addr, old_len, new_addr, new_len); + goto out; } /* @@ -332,60 +441,23 @@ unsigned long do_mremap(unsigned long addr, if (ret && old_len != new_len) goto out; ret = addr; - if (!(flags & MREMAP_FIXED) || (new_addr == addr)) - goto out; - old_len = new_len; + goto out; } /* - * Ok, we need to grow.. or relocate. + * Ok, we need to grow.. */ - ret = -EFAULT; - vma = find_vma(mm, addr); - if (!vma || vma->vm_start > addr) - goto out; - if (is_vm_hugetlb_page(vma)) { - ret = -EINVAL; - goto out; - } - /* We can't remap across vm area boundaries */ - if (old_len > vma->vm_end - addr) - goto out; - if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)) { - if (new_len > old_len) - goto out; - } - if (vma->vm_flags & VM_LOCKED) { - unsigned long locked, lock_limit; - locked = mm->locked_vm << PAGE_SHIFT; - lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; - locked += new_len - old_len; - ret = -EAGAIN; - if (locked > lock_limit && !capable(CAP_IPC_LOCK)) - goto out; - } - if (!may_expand_vm(mm, (new_len - old_len) >> PAGE_SHIFT)) { - ret = -ENOMEM; + vma = vma_to_resize(addr, old_len, new_len, &charged); + if (IS_ERR(vma)) { + ret = PTR_ERR(vma); goto out; } - if (vma->vm_flags & VM_ACCOUNT) { - charged = (new_len - old_len) >> PAGE_SHIFT; - if (security_vm_enough_memory(charged)) - goto out_nc; - } - /* old_len exactly to the end of the area.. - * And we're not relocating the area. */ - if (old_len == vma->vm_end - addr && - !((flags & MREMAP_FIXED) && (addr != new_addr)) && - (old_len != new_len || !(flags & MREMAP_MAYMOVE))) { - unsigned long max_addr = TASK_SIZE; - if (vma->vm_next) - max_addr = vma->vm_next->vm_start; + if (old_len == vma->vm_end - addr) { /* can we just expand the current mapping? */ - if (max_addr - addr >= new_len) { + if (vma_expandable(vma, new_len - old_len)) { int pages = (new_len - old_len) >> PAGE_SHIFT; vma_adjust(vma, vma->vm_start, @@ -409,28 +481,27 @@ unsigned long do_mremap(unsigned long addr, */ ret = -ENOMEM; if (flags & MREMAP_MAYMOVE) { - if (!(flags & MREMAP_FIXED)) { - unsigned long map_flags = 0; - if (vma->vm_flags & VM_MAYSHARE) - map_flags |= MAP_SHARED; - - new_addr = get_unmapped_area(vma->vm_file, 0, new_len, - vma->vm_pgoff, map_flags); - if (new_addr & ~PAGE_MASK) { - ret = new_addr; - goto out; - } - - ret = security_file_mmap(NULL, 0, 0, 0, new_addr, 1); - if (ret) - goto out; + unsigned long map_flags = 0; + if (vma->vm_flags & VM_MAYSHARE) + map_flags |= MAP_SHARED; + + new_addr = get_unmapped_area(vma->vm_file, 0, new_len, + vma->vm_pgoff + + ((addr - vma->vm_start) >> PAGE_SHIFT), + map_flags); + if (new_addr & ~PAGE_MASK) { + ret = new_addr; + goto out; } + + ret = security_file_mmap(NULL, 0, 0, 0, new_addr, 1); + if (ret) + goto out; ret = move_vma(vma, addr, old_len, new_len, new_addr); } out: if (ret & ~PAGE_MASK) vm_unacct_memory(charged); -out_nc: return ret; } diff --git a/mm/nommu.c b/mm/nommu.c index 9876fa0c3ad3..ebb3154d84e4 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -608,7 +608,7 @@ static void protect_vma(struct vm_area_struct *vma, unsigned long flags) */ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) { - struct vm_area_struct *pvma, **pp; + struct vm_area_struct *pvma, **pp, *next; struct address_space *mapping; struct rb_node **p, *parent; @@ -668,8 +668,11 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) break; } - vma->vm_next = *pp; + next = *pp; *pp = vma; + vma->vm_next = next; + if (next) + next->vm_prev = vma; } /* diff --git a/mm/oom_kill.c b/mm/oom_kill.c index ea2147dabba6..83cd9bb55092 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -404,7 +404,7 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, cpuset_print_task_mems_allowed(current); task_unlock(current); dump_stack(); - mem_cgroup_print_oom_info(mem, current); + mem_cgroup_print_oom_info(mem, p); show_mem(); if (sysctl_oom_dump_tasks) dump_tasks(mem); @@ -426,6 +426,8 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, list_for_each_entry(c, &p->children, sibling) { if (c->mm == p->mm) continue; + if (mem && !task_in_mem_cgroup(c, mem)) + continue; if (!oom_kill_task(c)) return 0; } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2bc2ac63f41e..902e5fc8db23 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -531,13 +531,13 @@ static void free_pcppages_bulk(struct zone *zone, int count, { int migratetype = 0; int batch_free = 0; + int to_free = count; spin_lock(&zone->lock); zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE); zone->pages_scanned = 0; - __mod_zone_page_state(zone, NR_FREE_PAGES, count); - while (count) { + while (to_free) { struct page *page; struct list_head *list; @@ -559,10 +559,12 @@ static void free_pcppages_bulk(struct zone *zone, int count, page = list_entry(list->prev, struct page, lru); /* must delete as __free_one_page list manipulates */ list_del(&page->lru); - __free_one_page(page, zone, 0, migratetype); - trace_mm_page_pcpu_drain(page, 0, migratetype); - } while (--count && --batch_free && !list_empty(list)); + /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ + __free_one_page(page, zone, 0, page_private(page)); + trace_mm_page_pcpu_drain(page, 0, page_private(page)); + } while (--to_free && --batch_free && !list_empty(list)); } + __mod_zone_page_state(zone, NR_FREE_PAGES, count); spin_unlock(&zone->lock); } @@ -573,8 +575,8 @@ static void free_one_page(struct zone *zone, struct page *page, int order, zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE); zone->pages_scanned = 0; - __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order); __free_one_page(page, zone, order, migratetype); + __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order); spin_unlock(&zone->lock); } @@ -1225,10 +1227,10 @@ again: } spin_lock_irqsave(&zone->lock, flags); page = __rmqueue(zone, order, migratetype); - __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order)); spin_unlock(&zone->lock); if (!page) goto failed; + __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order)); } __count_zone_vm_events(PGALLOC, zone, 1 << order); @@ -1364,7 +1366,7 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark, { /* free_pages my go negative - that's OK */ long min = mark; - long free_pages = zone_page_state(z, NR_FREE_PAGES) - (1 << order) + 1; + long free_pages = zone_nr_free_pages(z) - (1 << order) + 1; int o; if (alloc_flags & ALLOC_HIGH) @@ -1680,6 +1682,7 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, struct page *page = NULL; struct reclaim_state reclaim_state; struct task_struct *p = current; + bool drained = false; cond_resched(); @@ -1698,14 +1701,25 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, cond_resched(); - if (order != 0) - drain_all_pages(); + if (unlikely(!(*did_some_progress))) + return NULL; - if (likely(*did_some_progress)) - page = get_page_from_freelist(gfp_mask, nodemask, order, +retry: + page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, high_zoneidx, alloc_flags, preferred_zone, migratetype); + + /* + * If an allocation failed after direct reclaim, it could be because + * pages are pinned on the per-cpu lists. Drain them and try again + */ + if (!page && !drained) { + drain_all_pages(); + drained = true; + goto retry; + } + return page; } @@ -2237,7 +2251,7 @@ void show_free_areas(void) " all_unreclaimable? %s" "\n", zone->name, - K(zone_page_state(zone, NR_FREE_PAGES)), + K(zone_nr_free_pages(zone)), K(min_wmark_pages(zone)), K(low_wmark_pages(zone)), K(high_wmark_pages(zone)), diff --git a/mm/pagewalk.c b/mm/pagewalk.c index d5878bed7841..a286915e23ef 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -1,6 +1,7 @@ #include <linux/mm.h> #include <linux/highmem.h> #include <linux/sched.h> +#include <linux/hugetlb.h> static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) @@ -107,6 +108,7 @@ int walk_page_range(unsigned long addr, unsigned long end, pgd_t *pgd; unsigned long next; int err = 0; + struct vm_area_struct *vma; if (addr >= end) return err; @@ -117,11 +119,22 @@ int walk_page_range(unsigned long addr, unsigned long end, pgd = pgd_offset(walk->mm, addr); do { next = pgd_addr_end(addr, end); + + /* skip hugetlb vma to avoid hugepage PMD being cleared + * in pmd_none_or_clear_bad(). */ + vma = find_vma(walk->mm, addr); + if (vma && is_vm_hugetlb_page(vma)) { + if (vma->vm_end < next) + next = vma->vm_end; + continue; + } + if (pgd_none_or_clear_bad(pgd)) { if (walk->pte_hole) err = walk->pte_hole(addr, next, walk); if (err) break; + pgd++; continue; } if (walk->pgd_entry) @@ -131,7 +144,8 @@ int walk_page_range(unsigned long addr, unsigned long end, err = walk_pud_range(pgd, addr, next, walk); if (err) break; - } while (pgd++, addr = next, addr != end); + pgd++; + } while (addr = next, addr != end); return err; } diff --git a/mm/percpu.c b/mm/percpu.c index 5adfc268b408..3bfd6e251f19 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1702,9 +1702,9 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, if (pcpu_first_unit_cpu == NR_CPUS) pcpu_first_unit_cpu = cpu; + pcpu_last_unit_cpu = cpu; } } - pcpu_last_unit_cpu = cpu; pcpu_nr_units = unit; for_each_possible_cpu(cpu) diff --git a/mm/readahead.c b/mm/readahead.c index aa1aa2345235..fe1a069fb595 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -501,6 +501,12 @@ void page_cache_sync_readahead(struct address_space *mapping, if (!ra->ra_pages) return; + /* be dumb */ + if (filp && (filp->f_mode & FMODE_RANDOM)) { + force_page_cache_readahead(mapping, filp, offset, req_size); + return; + } + /* do read-ahead */ ondemand_readahead(mapping, ra, filp, false, offset, req_size); } @@ -547,5 +553,17 @@ page_cache_async_readahead(struct address_space *mapping, /* do read-ahead */ ondemand_readahead(mapping, ra, filp, true, offset, req_size); + +#ifdef CONFIG_BLOCK + /* + * Normally the current page is !uptodate and lock_page() will be + * immediately called to implicitly unplug the device. However this + * is not always true for RAID conifgurations, where data arrives + * not strictly in their submission order. In this case we need to + * explicitly kick off the IO. + */ + if (PageUptodate(page)) + blk_run_backing_dev(mapping->backing_dev_info, NULL); +#endif } EXPORT_SYMBOL_GPL(page_cache_async_readahead); diff --git a/mm/slab.c b/mm/slab.c index 7dfa481c96ba..c8d466a4e9b7 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -971,13 +971,11 @@ static struct array_cache **alloc_alien_cache(int node, int limit, gfp_t gfp) if (limit > 1) limit = 12; - ac_ptr = kmalloc_node(memsize, gfp, node); + ac_ptr = kzalloc_node(memsize, gfp, node); if (ac_ptr) { for_each_node(i) { - if (i == node || !node_online(i)) { - ac_ptr[i] = NULL; + if (i == node || !node_online(i)) continue; - } ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d, gfp); if (!ac_ptr[i]) { for (i--; i >= 0; i--) @@ -2251,8 +2249,8 @@ kmem_cache_create (const char *name, size_t size, size_t align, } #if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC) if (size >= malloc_sizes[INDEX_L3 + 1].cs_size - && cachep->obj_size > cache_line_size() && size < PAGE_SIZE) { - cachep->obj_offset += PAGE_SIZE - size; + && cachep->obj_size > cache_line_size() && ALIGN(size, align) < PAGE_SIZE) { + cachep->obj_offset += PAGE_SIZE - ALIGN(size, align); size = PAGE_SIZE; } #endif diff --git a/mm/swapfile.c b/mm/swapfile.c index 9c590eef7912..270e136349a0 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -330,8 +330,10 @@ checks: if (offset > si->highest_bit) scan_base = offset = si->lowest_bit; - /* reuse swap entry of cache-only swap if not busy. */ - if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { + /* reuse swap entry of cache-only swap if not hibernation. */ + if (vm_swap_full() + && cache == SWAP_CACHE + && si->swap_map[offset] == SWAP_HAS_CACHE) { int swap_was_freed; spin_unlock(&swap_lock); swap_was_freed = __try_to_reclaim_swap(si, offset); diff --git a/mm/truncate.c b/mm/truncate.c index 450cebdabfc0..258bda7ec31d 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -516,22 +516,20 @@ EXPORT_SYMBOL_GPL(invalidate_inode_pages2); */ void truncate_pagecache(struct inode *inode, loff_t old, loff_t new) { - if (new < old) { - struct address_space *mapping = inode->i_mapping; - - /* - * unmap_mapping_range is called twice, first simply for - * efficiency so that truncate_inode_pages does fewer - * single-page unmaps. However after this first call, and - * before truncate_inode_pages finishes, it is possible for - * private pages to be COWed, which remain after - * truncate_inode_pages finishes, hence the second - * unmap_mapping_range call must be made for correctness. - */ - unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1); - truncate_inode_pages(mapping, new); - unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1); - } + struct address_space *mapping = inode->i_mapping; + + /* + * unmap_mapping_range is called twice, first simply for + * efficiency so that truncate_inode_pages does fewer + * single-page unmaps. However after this first call, and + * before truncate_inode_pages finishes, it is possible for + * private pages to be COWed, which remain after + * truncate_inode_pages finishes, hence the second + * unmap_mapping_range call must be made for correctness. + */ + unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1); + truncate_inode_pages(mapping, new); + unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1); } EXPORT_SYMBOL(truncate_pagecache); diff --git a/mm/util.c b/mm/util.c index 7c35ad95f927..b377ce430803 100644 --- a/mm/util.c +++ b/mm/util.c @@ -4,6 +4,10 @@ #include <linux/module.h> #include <linux/err.h> #include <linux/sched.h> +#include <linux/hugetlb.h> +#include <linux/syscalls.h> +#include <linux/mman.h> +#include <linux/file.h> #include <asm/uaccess.h> #define CREATE_TRACE_POINTS @@ -268,6 +272,46 @@ int __attribute__((weak)) get_user_pages_fast(unsigned long start, } EXPORT_SYMBOL_GPL(get_user_pages_fast); +SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, + unsigned long, prot, unsigned long, flags, + unsigned long, fd, unsigned long, pgoff) +{ + struct file * file = NULL; + unsigned long retval = -EBADF; + + if (!(flags & MAP_ANONYMOUS)) { + if (unlikely(flags & MAP_HUGETLB)) + return -EINVAL; + file = fget(fd); + if (!file) + goto out; + } else if (flags & MAP_HUGETLB) { + struct user_struct *user = NULL; + /* + * VM_NORESERVE is used because the reservations will be + * taken when vm_ops->mmap() is called + * A dummy user value is used because we are not locking + * memory so no accounting is necessary + */ + len = ALIGN(len, huge_page_size(&default_hstate)); + file = hugetlb_file_setup(HUGETLB_ANON_FILE, len, VM_NORESERVE, + &user, HUGETLB_ANONHUGE_INODE); + if (IS_ERR(file)) + return PTR_ERR(file); + } + + flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); + + down_write(¤t->mm->mmap_sem); + retval = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); + up_write(¤t->mm->mmap_sem); + + if (file) + fput(file); +out: + return retval; +} + /* Tracepoints definitions. */ EXPORT_TRACEPOINT_SYMBOL(kmalloc); EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc); diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 0f551a4a44cd..c2287313b985 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -509,6 +509,9 @@ static unsigned long lazy_max_pages(void) static atomic_t vmap_lazy_nr = ATOMIC_INIT(0); +/* for per-CPU blocks */ +static void purge_fragmented_blocks_allcpus(void); + /* * Purges all lazily-freed vmap areas. * @@ -539,6 +542,9 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end, } else spin_lock(&purge_lock); + if (sync) + purge_fragmented_blocks_allcpus(); + rcu_read_lock(); list_for_each_entry_rcu(va, &vmap_area_list, list) { if (va->flags & VM_LAZY_FREE) { @@ -555,10 +561,8 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end, } rcu_read_unlock(); - if (nr) { - BUG_ON(nr > atomic_read(&vmap_lazy_nr)); + if (nr) atomic_sub(nr, &vmap_lazy_nr); - } if (nr || force_flush) flush_tlb_kernel_range(*start, *end); @@ -669,8 +673,6 @@ static bool vmap_initialized __read_mostly = false; struct vmap_block_queue { spinlock_t lock; struct list_head free; - struct list_head dirty; - unsigned int nr_dirty; }; struct vmap_block { @@ -680,10 +682,9 @@ struct vmap_block { unsigned long free, dirty; DECLARE_BITMAP(alloc_map, VMAP_BBMAP_BITS); DECLARE_BITMAP(dirty_map, VMAP_BBMAP_BITS); - union { - struct list_head free_list; - struct rcu_head rcu_head; - }; + struct list_head free_list; + struct rcu_head rcu_head; + struct list_head purge; }; /* Queue of free and dirty vmap blocks, for allocation and flushing purposes */ @@ -759,7 +760,7 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask) vbq = &get_cpu_var(vmap_block_queue); vb->vbq = vbq; spin_lock(&vbq->lock); - list_add(&vb->free_list, &vbq->free); + list_add_rcu(&vb->free_list, &vbq->free); spin_unlock(&vbq->lock); put_cpu_var(vmap_cpu_blocks); @@ -778,8 +779,6 @@ static void free_vmap_block(struct vmap_block *vb) struct vmap_block *tmp; unsigned long vb_idx; - BUG_ON(!list_empty(&vb->free_list)); - vb_idx = addr_to_vb_idx(vb->va->va_start); spin_lock(&vmap_block_tree_lock); tmp = radix_tree_delete(&vmap_block_tree, vb_idx); @@ -790,12 +789,61 @@ static void free_vmap_block(struct vmap_block *vb) call_rcu(&vb->rcu_head, rcu_free_vb); } +static void purge_fragmented_blocks(int cpu) +{ + LIST_HEAD(purge); + struct vmap_block *vb; + struct vmap_block *n_vb; + struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu); + + rcu_read_lock(); + list_for_each_entry_rcu(vb, &vbq->free, free_list) { + + if (!(vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS)) + continue; + + spin_lock(&vb->lock); + if (vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS) { + vb->free = 0; /* prevent further allocs after releasing lock */ + vb->dirty = VMAP_BBMAP_BITS; /* prevent purging it again */ + bitmap_fill(vb->alloc_map, VMAP_BBMAP_BITS); + bitmap_fill(vb->dirty_map, VMAP_BBMAP_BITS); + spin_lock(&vbq->lock); + list_del_rcu(&vb->free_list); + spin_unlock(&vbq->lock); + spin_unlock(&vb->lock); + list_add_tail(&vb->purge, &purge); + } else + spin_unlock(&vb->lock); + } + rcu_read_unlock(); + + list_for_each_entry_safe(vb, n_vb, &purge, purge) { + list_del(&vb->purge); + free_vmap_block(vb); + } +} + +static void purge_fragmented_blocks_thiscpu(void) +{ + purge_fragmented_blocks(smp_processor_id()); +} + +static void purge_fragmented_blocks_allcpus(void) +{ + int cpu; + + for_each_possible_cpu(cpu) + purge_fragmented_blocks(cpu); +} + static void *vb_alloc(unsigned long size, gfp_t gfp_mask) { struct vmap_block_queue *vbq; struct vmap_block *vb; unsigned long addr = 0; unsigned int order; + int purge = 0; BUG_ON(size & ~PAGE_MASK); BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC); @@ -808,24 +856,37 @@ again: int i; spin_lock(&vb->lock); + if (vb->free < 1UL << order) + goto next; i = bitmap_find_free_region(vb->alloc_map, VMAP_BBMAP_BITS, order); - if (i >= 0) { - addr = vb->va->va_start + (i << PAGE_SHIFT); - BUG_ON(addr_to_vb_idx(addr) != - addr_to_vb_idx(vb->va->va_start)); - vb->free -= 1UL << order; - if (vb->free == 0) { - spin_lock(&vbq->lock); - list_del_init(&vb->free_list); - spin_unlock(&vbq->lock); + if (i < 0) { + if (vb->free + vb->dirty == VMAP_BBMAP_BITS) { + /* fragmented and no outstanding allocations */ + BUG_ON(vb->dirty != VMAP_BBMAP_BITS); + purge = 1; } - spin_unlock(&vb->lock); - break; + goto next; } + addr = vb->va->va_start + (i << PAGE_SHIFT); + BUG_ON(addr_to_vb_idx(addr) != + addr_to_vb_idx(vb->va->va_start)); + vb->free -= 1UL << order; + if (vb->free == 0) { + spin_lock(&vbq->lock); + list_del_rcu(&vb->free_list); + spin_unlock(&vbq->lock); + } + spin_unlock(&vb->lock); + break; +next: spin_unlock(&vb->lock); } + + if (purge) + purge_fragmented_blocks_thiscpu(); + put_cpu_var(vmap_cpu_blocks); rcu_read_unlock(); @@ -862,11 +923,11 @@ static void vb_free(const void *addr, unsigned long size) BUG_ON(!vb); spin_lock(&vb->lock); - bitmap_allocate_region(vb->dirty_map, offset >> PAGE_SHIFT, order); + BUG_ON(bitmap_allocate_region(vb->dirty_map, offset >> PAGE_SHIFT, order)); vb->dirty += 1UL << order; if (vb->dirty == VMAP_BBMAP_BITS) { - BUG_ON(vb->free || !list_empty(&vb->free_list)); + BUG_ON(vb->free); spin_unlock(&vb->lock); free_vmap_block(vb); } else @@ -1035,8 +1096,6 @@ void __init vmalloc_init(void) vbq = &per_cpu(vmap_block_queue, i); spin_lock_init(&vbq->lock); INIT_LIST_HEAD(&vbq->free); - INIT_LIST_HEAD(&vbq->dirty); - vbq->nr_dirty = 0; } /* Import existing vmlist entries. */ @@ -1993,6 +2052,7 @@ void free_vm_area(struct vm_struct *area) } EXPORT_SYMBOL_GPL(free_vm_area); +#ifndef CONFIG_HAVE_LEGACY_PER_CPU_AREA static struct vmap_area *node_to_va(struct rb_node *n) { return n ? rb_entry(n, struct vmap_area, rb_node) : NULL; @@ -2257,6 +2317,7 @@ err_free: kfree(vms); return NULL; } +#endif /** * pcpu_free_vm_areas - free vmalloc areas for percpu allocator diff --git a/mm/vmscan.c b/mm/vmscan.c index 777af57fd8c8..4649929401f8 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1083,6 +1083,48 @@ static int too_many_isolated(struct zone *zone, int file, } /* + * Returns true if the caller should wait to clean dirty/writeback pages. + * + * If we are direct reclaiming for contiguous pages and we do not reclaim + * everything in the list, try again and wait for writeback IO to complete. + * This will stall high-order allocations noticeably. Only do that when really + * need to free the pages under high memory pressure. + */ +static inline bool should_reclaim_stall(unsigned long nr_taken, + unsigned long nr_freed, + int priority, + int lumpy_reclaim, + struct scan_control *sc) +{ + int lumpy_stall_priority; + + /* kswapd should not stall on sync IO */ + if (current_is_kswapd()) + return false; + + /* Only stall on lumpy reclaim */ + if (!lumpy_reclaim) + return false; + + /* If we have relaimed everything on the isolated list, no stall */ + if (nr_freed == nr_taken) + return false; + + /* + * For high-order allocations, there are two stall thresholds. + * High-cost allocations stall immediately where as lower + * order allocations such as stacks require the scanning + * priority to be much higher before stalling. + */ + if (sc->order > PAGE_ALLOC_COSTLY_ORDER) + lumpy_stall_priority = DEF_PRIORITY; + else + lumpy_stall_priority = DEF_PRIORITY / 3; + + return priority <= lumpy_stall_priority; +} + +/* * shrink_inactive_list() is a helper for shrink_zone(). It returns the number * of reclaimed pages */ @@ -1176,14 +1218,9 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, nr_scanned += nr_scan; nr_freed = shrink_page_list(&page_list, sc, PAGEOUT_IO_ASYNC); - /* - * If we are direct reclaiming for contiguous pages and we do - * not reclaim everything in the list, try again and wait - * for IO to complete. This will stall high-order allocations - * but that should be acceptable to the caller - */ - if (nr_freed < nr_taken && !current_is_kswapd() && - lumpy_reclaim) { + /* Check if we should syncronously wait for writeback */ + if (should_reclaim_stall(nr_taken, nr_freed, priority, + lumpy_reclaim, sc)) { congestion_wait(BLK_RW_ASYNC, HZ/10); /* @@ -1464,20 +1501,26 @@ static int inactive_file_is_low(struct zone *zone, struct scan_control *sc) return low; } +static int inactive_list_is_low(struct zone *zone, struct scan_control *sc, + int file) +{ + if (file) + return inactive_file_is_low(zone, sc); + else + return inactive_anon_is_low(zone, sc); +} + static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, struct zone *zone, struct scan_control *sc, int priority) { int file = is_file_lru(lru); - if (lru == LRU_ACTIVE_FILE && inactive_file_is_low(zone, sc)) { - shrink_active_list(nr_to_scan, zone, sc, priority, file); + if (is_active_lru(lru)) { + if (inactive_list_is_low(zone, sc, file)) + shrink_active_list(nr_to_scan, zone, sc, priority, file); return 0; } - if (lru == LRU_ACTIVE_ANON && inactive_anon_is_low(zone, sc)) { - shrink_active_list(nr_to_scan, zone, sc, priority, file); - return 0; - } return shrink_inactive_list(nr_to_scan, zone, sc, priority, file); } diff --git a/mm/vmstat.c b/mm/vmstat.c index c81321f9feec..42d76c65e9f2 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -136,10 +136,23 @@ static void refresh_zone_stat_thresholds(void) int threshold; for_each_populated_zone(zone) { + unsigned long max_drift, tolerate_drift; + threshold = calculate_threshold(zone); for_each_online_cpu(cpu) zone_pcp(zone, cpu)->stat_threshold = threshold; + + /* + * Only set percpu_drift_mark if there is a danger that + * NR_FREE_PAGES reports the low watermark is ok when in fact + * the min watermark could be breached by an allocation + */ + tolerate_drift = low_wmark_pages(zone) - min_wmark_pages(zone); + max_drift = num_online_cpus() * threshold; + if (max_drift > tolerate_drift) + zone->percpu_drift_mark = high_wmark_pages(zone) + + max_drift; } } @@ -715,7 +728,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, "\n scanned %lu" "\n spanned %lu" "\n present %lu", - zone_page_state(zone, NR_FREE_PAGES), + zone_nr_free_pages(zone), min_wmark_pages(zone), low_wmark_pages(zone), high_wmark_pages(zone), |