summaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
authorArve Hjønnevåg <arve@android.com>2010-03-10 16:38:33 -0800
committerArve Hjønnevåg <arve@android.com>2010-03-10 16:38:33 -0800
commit67078ecae3edb5b4657dcb9f67f744ecccd18a97 (patch)
tree90333318a7229fd977881577e0d19fd8e7e26423 /mm
parentdc136618cb23810dd3d4adfcb836b289cd528b4f (diff)
parent7f5e918e62cbc9ac27c2f47d3c3dd4b86f67ff0e (diff)
Merge commit 'v2.6.32.9' into android-2.6.32
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig1
-rw-r--r--mm/filemap.c103
-rw-r--r--mm/internal.h3
-rw-r--r--mm/ksm.c14
-rw-r--r--mm/memcontrol.c20
-rw-r--r--mm/memory.c4
-rw-r--r--mm/migrate.c3
-rw-r--r--mm/mincore.c37
-rw-r--r--mm/mlock.c4
-rw-r--r--mm/mmap.c42
-rw-r--r--mm/mremap.c241
-rw-r--r--mm/oom_kill.c2
-rw-r--r--mm/page_alloc.c7
-rw-r--r--mm/pagewalk.c16
-rw-r--r--mm/truncate.c30
-rw-r--r--mm/util.c44
-rw-r--r--mm/vmalloc.c111
-rw-r--r--mm/vmscan.c18
18 files changed, 481 insertions, 219 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 44cf6f0a3a6d..2c19c0bac10e 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -227,6 +227,7 @@ config KSM
config DEFAULT_MMAP_MIN_ADDR
int "Low address space to protect from user allocation"
+ depends on MMU
default 4096
help
This is the portion of low virtual memory which should be protected
diff --git a/mm/filemap.c b/mm/filemap.c
index ef169f37156d..8e96c9076f80 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1655,14 +1655,15 @@ EXPORT_SYMBOL(generic_file_readonly_mmap);
static struct page *__read_cache_page(struct address_space *mapping,
pgoff_t index,
int (*filler)(void *,struct page*),
- void *data)
+ void *data,
+ gfp_t gfp)
{
struct page *page;
int err;
repeat:
page = find_get_page(mapping, index);
if (!page) {
- page = page_cache_alloc_cold(mapping);
+ page = __page_cache_alloc(gfp | __GFP_COLD);
if (!page)
return ERR_PTR(-ENOMEM);
err = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL);
@@ -1682,31 +1683,18 @@ repeat:
return page;
}
-/**
- * read_cache_page_async - read into page cache, fill it if needed
- * @mapping: the page's address_space
- * @index: the page index
- * @filler: function to perform the read
- * @data: destination for read data
- *
- * Same as read_cache_page, but don't wait for page to become unlocked
- * after submitting it to the filler.
- *
- * Read into the page cache. If a page already exists, and PageUptodate() is
- * not set, try to fill the page but don't wait for it to become unlocked.
- *
- * If the page does not get brought uptodate, return -EIO.
- */
-struct page *read_cache_page_async(struct address_space *mapping,
+static struct page *do_read_cache_page(struct address_space *mapping,
pgoff_t index,
int (*filler)(void *,struct page*),
- void *data)
+ void *data,
+ gfp_t gfp)
+
{
struct page *page;
int err;
retry:
- page = __read_cache_page(mapping, index, filler, data);
+ page = __read_cache_page(mapping, index, filler, data, gfp);
if (IS_ERR(page))
return page;
if (PageUptodate(page))
@@ -1731,8 +1719,67 @@ out:
mark_page_accessed(page);
return page;
}
+
+/**
+ * read_cache_page_async - read into page cache, fill it if needed
+ * @mapping: the page's address_space
+ * @index: the page index
+ * @filler: function to perform the read
+ * @data: destination for read data
+ *
+ * Same as read_cache_page, but don't wait for page to become unlocked
+ * after submitting it to the filler.
+ *
+ * Read into the page cache. If a page already exists, and PageUptodate() is
+ * not set, try to fill the page but don't wait for it to become unlocked.
+ *
+ * If the page does not get brought uptodate, return -EIO.
+ */
+struct page *read_cache_page_async(struct address_space *mapping,
+ pgoff_t index,
+ int (*filler)(void *,struct page*),
+ void *data)
+{
+ return do_read_cache_page(mapping, index, filler, data, mapping_gfp_mask(mapping));
+}
EXPORT_SYMBOL(read_cache_page_async);
+static struct page *wait_on_page_read(struct page *page)
+{
+ if (!IS_ERR(page)) {
+ wait_on_page_locked(page);
+ if (!PageUptodate(page)) {
+ page_cache_release(page);
+ page = ERR_PTR(-EIO);
+ }
+ }
+ return page;
+}
+
+/**
+ * read_cache_page_gfp - read into page cache, using specified page allocation flags.
+ * @mapping: the page's address_space
+ * @index: the page index
+ * @gfp: the page allocator flags to use if allocating
+ *
+ * This is the same as "read_mapping_page(mapping, index, NULL)", but with
+ * any new page allocations done using the specified allocation flags. Note
+ * that the Radix tree operations will still use GFP_KERNEL, so you can't
+ * expect to do this atomically or anything like that - but you can pass in
+ * other page requirements.
+ *
+ * If the page does not get brought uptodate, return -EIO.
+ */
+struct page *read_cache_page_gfp(struct address_space *mapping,
+ pgoff_t index,
+ gfp_t gfp)
+{
+ filler_t *filler = (filler_t *)mapping->a_ops->readpage;
+
+ return wait_on_page_read(do_read_cache_page(mapping, index, filler, NULL, gfp));
+}
+EXPORT_SYMBOL(read_cache_page_gfp);
+
/**
* read_cache_page - read into page cache, fill it if needed
* @mapping: the page's address_space
@@ -1750,18 +1797,7 @@ struct page *read_cache_page(struct address_space *mapping,
int (*filler)(void *,struct page*),
void *data)
{
- struct page *page;
-
- page = read_cache_page_async(mapping, index, filler, data);
- if (IS_ERR(page))
- goto out;
- wait_on_page_locked(page);
- if (!PageUptodate(page)) {
- page_cache_release(page);
- page = ERR_PTR(-EIO);
- }
- out:
- return page;
+ return wait_on_page_read(read_cache_page_async(mapping, index, filler, data));
}
EXPORT_SYMBOL(read_cache_page);
@@ -2217,6 +2253,9 @@ again:
if (unlikely(status))
break;
+ if (mapping_writably_mapped(mapping))
+ flush_dcache_page(page);
+
pagefault_disable();
copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
pagefault_enable();
diff --git a/mm/internal.h b/mm/internal.h
index 22ec8d2b0fb8..17bc0df273bb 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -107,9 +107,10 @@ static inline int is_mlocked_vma(struct vm_area_struct *vma, struct page *page)
}
/*
- * must be called with vma's mmap_sem held for read, and page locked.
+ * must be called with vma's mmap_sem held for read or write, and page locked.
*/
extern void mlock_vma_page(struct page *page);
+extern void munlock_vma_page(struct page *page);
/*
* Clear the page's PageMlocked(). This can be useful in a situation where
diff --git a/mm/ksm.c b/mm/ksm.c
index 5575f8628fef..e9501f833374 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -34,6 +34,7 @@
#include <linux/ksm.h>
#include <asm/tlbflush.h>
+#include "internal.h"
/*
* A few notes about the KSM scanning process,
@@ -767,15 +768,14 @@ static int try_to_merge_one_page(struct vm_area_struct *vma,
* ptes are necessarily already write-protected. But in either
* case, we need to lock and check page_count is not raised.
*/
- if (write_protect_page(vma, oldpage, &orig_pte)) {
- unlock_page(oldpage);
- goto out_putpage;
- }
- unlock_page(oldpage);
-
- if (pages_identical(oldpage, newpage))
+ if (write_protect_page(vma, oldpage, &orig_pte) == 0 &&
+ pages_identical(oldpage, newpage))
err = replace_page(vma, oldpage, newpage, orig_pte);
+ if ((vma->vm_flags & VM_LOCKED) && !err)
+ munlock_vma_page(oldpage);
+
+ unlock_page(oldpage);
out_putpage:
put_page(oldpage);
put_page(newpage);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index f99f5991d6bb..66035bf92612 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -758,7 +758,13 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
task_unlock(task);
if (!curr)
return 0;
- if (curr->use_hierarchy)
+ /*
+ * We should check use_hierarchy of "mem" not "curr". Because checking
+ * use_hierarchy of "curr" here make this function true if hierarchy is
+ * enabled in "curr" and "curr" is a child of "mem" in *cgroup*
+ * hierarchy(even if use_hierarchy is disabled in "mem").
+ */
+ if (mem->use_hierarchy)
ret = css_is_ancestor(&curr->css, &mem->css);
else
ret = (curr == mem);
@@ -2375,7 +2381,7 @@ static int mem_cgroup_force_empty(struct mem_cgroup *mem, bool free_all)
if (free_all)
goto try_to_free;
move_account:
- while (mem->res.usage > 0) {
+ do {
ret = -EBUSY;
if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children))
goto out;
@@ -2402,8 +2408,8 @@ move_account:
if (ret == -ENOMEM)
goto try_to_free;
cond_resched();
- }
- ret = 0;
+ /* "ret" should also be checked to ensure all lists are empty. */
+ } while (mem->res.usage > 0 || ret);
out:
css_put(&mem->css);
return ret;
@@ -2436,10 +2442,7 @@ try_to_free:
}
lru_add_drain();
/* try move_account...there may be some *locked* pages. */
- if (mem->res.usage)
- goto move_account;
- ret = 0;
- goto out;
+ goto move_account;
}
int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event)
@@ -2541,6 +2544,7 @@ static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
val += idx_val;
mem_cgroup_get_recursive_idx_stat(mem,
MEM_CGROUP_STAT_SWAPOUT, &idx_val);
+ val += idx_val;
val <<= PAGE_SHIFT;
} else
val = res_counter_read_u64(&mem->memsw, name);
diff --git a/mm/memory.c b/mm/memory.c
index 6ab19dd4a199..4e5945588c73 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2514,7 +2514,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
ret = VM_FAULT_HWPOISON;
} else {
print_bad_pte(vma, address, orig_pte, NULL);
- ret = VM_FAULT_OOM;
+ ret = VM_FAULT_SIGBUS;
}
goto out;
}
@@ -2910,7 +2910,7 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
* Page table corrupted: show pte and kill process.
*/
print_bad_pte(vma, address, orig_pte, NULL);
- return VM_FAULT_OOM;
+ return VM_FAULT_SIGBUS;
}
pgoff = pte_to_pgoff(orig_pte);
diff --git a/mm/migrate.c b/mm/migrate.c
index 7dbcb22316d2..0e39f94b2d9c 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -953,6 +953,9 @@ static int do_pages_move(struct mm_struct *mm, struct task_struct *task,
goto out_pm;
err = -ENODEV;
+ if (node < 0 || node >= MAX_NUMNODES)
+ goto out_pm;
+
if (!node_state(node, N_HIGH_MEMORY))
goto out_pm;
diff --git a/mm/mincore.c b/mm/mincore.c
index 8cb508f84ea4..7a3436ef39eb 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -14,6 +14,7 @@
#include <linux/syscalls.h>
#include <linux/swap.h>
#include <linux/swapops.h>
+#include <linux/hugetlb.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
@@ -72,6 +73,42 @@ static long do_mincore(unsigned long addr, unsigned char *vec, unsigned long pag
if (!vma || addr < vma->vm_start)
return -ENOMEM;
+#ifdef CONFIG_HUGETLB_PAGE
+ if (is_vm_hugetlb_page(vma)) {
+ struct hstate *h;
+ unsigned long nr_huge;
+ unsigned char present;
+
+ i = 0;
+ nr = min(pages, (vma->vm_end - addr) >> PAGE_SHIFT);
+ h = hstate_vma(vma);
+ nr_huge = ((addr + pages * PAGE_SIZE - 1) >> huge_page_shift(h))
+ - (addr >> huge_page_shift(h)) + 1;
+ nr_huge = min(nr_huge,
+ (vma->vm_end - addr) >> huge_page_shift(h));
+ while (1) {
+ /* hugepage always in RAM for now,
+ * but generally it needs to be check */
+ ptep = huge_pte_offset(current->mm,
+ addr & huge_page_mask(h));
+ present = !!(ptep &&
+ !huge_pte_none(huge_ptep_get(ptep)));
+ while (1) {
+ vec[i++] = present;
+ addr += PAGE_SIZE;
+ /* reach buffer limit */
+ if (i == nr)
+ return nr;
+ /* check hugepage border */
+ if (!((addr & ~huge_page_mask(h))
+ >> PAGE_SHIFT))
+ break;
+ }
+ }
+ return nr;
+ }
+#endif
+
/*
* Calculate how many pages there are left in the last level of the
* PTE array for our address.
diff --git a/mm/mlock.c b/mm/mlock.c
index bd6f0e466f6c..2e05c97b04f5 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -99,14 +99,14 @@ void mlock_vma_page(struct page *page)
* not get another chance to clear PageMlocked. If we successfully
* isolate the page and try_to_munlock() detects other VM_LOCKED vmas
* mapping the page, it will restore the PageMlocked state, unless the page
- * is mapped in a non-linear vma. So, we go ahead and SetPageMlocked(),
+ * is mapped in a non-linear vma. So, we go ahead and ClearPageMlocked(),
* perhaps redundantly.
* If we lose the isolation race, and the page is mapped by other VM_LOCKED
* vmas, we'll detect this in vmscan--via try_to_munlock() or try_to_unmap()
* either of which will restore the PageMlocked state by calling
* mlock_vma_page() above, if it can grab the vma's mmap sem.
*/
-static void munlock_vma_page(struct page *page)
+void munlock_vma_page(struct page *page)
{
BUG_ON(!PageLocked(page));
diff --git a/mm/mmap.c b/mm/mmap.c
index 73f5e4b64010..ae197468b352 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -932,13 +932,9 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
if (!(flags & MAP_FIXED))
addr = round_hint_to_min(addr);
- error = arch_mmap_check(addr, len, flags);
- if (error)
- return error;
-
/* Careful about overflows.. */
len = PAGE_ALIGN(len);
- if (!len || len > TASK_SIZE)
+ if (!len)
return -ENOMEM;
/* offset overflow? */
@@ -949,24 +945,6 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
if (mm->map_count > sysctl_max_map_count)
return -ENOMEM;
- if (flags & MAP_HUGETLB) {
- struct user_struct *user = NULL;
- if (file)
- return -EINVAL;
-
- /*
- * VM_NORESERVE is used because the reservations will be
- * taken when vm_ops->mmap() is called
- * A dummy user value is used because we are not locking
- * memory so no accounting is necessary
- */
- len = ALIGN(len, huge_page_size(&default_hstate));
- file = hugetlb_file_setup(HUGETLB_ANON_FILE, len, VM_NORESERVE,
- &user, HUGETLB_ANONHUGE_INODE);
- if (IS_ERR(file))
- return PTR_ERR(file);
- }
-
/* Obtain the address to map to. we verify (or select) it and ensure
* that it represents a valid section of the address space.
*/
@@ -1459,6 +1437,14 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
unsigned long (*get_area)(struct file *, unsigned long,
unsigned long, unsigned long, unsigned long);
+ unsigned long error = arch_mmap_check(addr, len, flags);
+ if (error)
+ return error;
+
+ /* Careful about overflows.. */
+ if (len > TASK_SIZE)
+ return -ENOMEM;
+
get_area = current->mm->get_unmapped_area;
if (file && file->f_op && file->f_op->get_unmapped_area)
get_area = file->f_op->get_unmapped_area;
@@ -2003,20 +1989,14 @@ unsigned long do_brk(unsigned long addr, unsigned long len)
if (!len)
return addr;
- if ((addr + len) > TASK_SIZE || (addr + len) < addr)
- return -EINVAL;
-
- if (is_hugepage_only_range(mm, addr, len))
- return -EINVAL;
-
error = security_file_mmap(NULL, 0, 0, 0, addr, 1);
if (error)
return error;
flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
- error = arch_mmap_check(addr, len, flags);
- if (error)
+ error = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED);
+ if (error & ~PAGE_MASK)
return error;
/*
diff --git a/mm/mremap.c b/mm/mremap.c
index 97bff2547719..845190898d59 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -261,6 +261,137 @@ static unsigned long move_vma(struct vm_area_struct *vma,
return new_addr;
}
+static struct vm_area_struct *vma_to_resize(unsigned long addr,
+ unsigned long old_len, unsigned long new_len, unsigned long *p)
+{
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma = find_vma(mm, addr);
+
+ if (!vma || vma->vm_start > addr)
+ goto Efault;
+
+ if (is_vm_hugetlb_page(vma))
+ goto Einval;
+
+ /* We can't remap across vm area boundaries */
+ if (old_len > vma->vm_end - addr)
+ goto Efault;
+
+ if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)) {
+ if (new_len > old_len)
+ goto Efault;
+ }
+
+ if (vma->vm_flags & VM_LOCKED) {
+ unsigned long locked, lock_limit;
+ locked = mm->locked_vm << PAGE_SHIFT;
+ lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
+ locked += new_len - old_len;
+ if (locked > lock_limit && !capable(CAP_IPC_LOCK))
+ goto Eagain;
+ }
+
+ if (!may_expand_vm(mm, (new_len - old_len) >> PAGE_SHIFT))
+ goto Enomem;
+
+ if (vma->vm_flags & VM_ACCOUNT) {
+ unsigned long charged = (new_len - old_len) >> PAGE_SHIFT;
+ if (security_vm_enough_memory(charged))
+ goto Efault;
+ *p = charged;
+ }
+
+ return vma;
+
+Efault: /* very odd choice for most of the cases, but... */
+ return ERR_PTR(-EFAULT);
+Einval:
+ return ERR_PTR(-EINVAL);
+Enomem:
+ return ERR_PTR(-ENOMEM);
+Eagain:
+ return ERR_PTR(-EAGAIN);
+}
+
+static unsigned long mremap_to(unsigned long addr,
+ unsigned long old_len, unsigned long new_addr,
+ unsigned long new_len)
+{
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma;
+ unsigned long ret = -EINVAL;
+ unsigned long charged = 0;
+ unsigned long map_flags;
+
+ if (new_addr & ~PAGE_MASK)
+ goto out;
+
+ if (new_len > TASK_SIZE || new_addr > TASK_SIZE - new_len)
+ goto out;
+
+ /* Check if the location we're moving into overlaps the
+ * old location at all, and fail if it does.
+ */
+ if ((new_addr <= addr) && (new_addr+new_len) > addr)
+ goto out;
+
+ if ((addr <= new_addr) && (addr+old_len) > new_addr)
+ goto out;
+
+ ret = security_file_mmap(NULL, 0, 0, 0, new_addr, 1);
+ if (ret)
+ goto out;
+
+ ret = do_munmap(mm, new_addr, new_len);
+ if (ret)
+ goto out;
+
+ if (old_len >= new_len) {
+ ret = do_munmap(mm, addr+new_len, old_len - new_len);
+ if (ret && old_len != new_len)
+ goto out;
+ old_len = new_len;
+ }
+
+ vma = vma_to_resize(addr, old_len, new_len, &charged);
+ if (IS_ERR(vma)) {
+ ret = PTR_ERR(vma);
+ goto out;
+ }
+
+ map_flags = MAP_FIXED;
+ if (vma->vm_flags & VM_MAYSHARE)
+ map_flags |= MAP_SHARED;
+
+ ret = get_unmapped_area(vma->vm_file, new_addr, new_len, vma->vm_pgoff +
+ ((addr - vma->vm_start) >> PAGE_SHIFT),
+ map_flags);
+ if (ret & ~PAGE_MASK)
+ goto out1;
+
+ ret = move_vma(vma, addr, old_len, new_len, new_addr);
+ if (!(ret & ~PAGE_MASK))
+ goto out;
+out1:
+ vm_unacct_memory(charged);
+
+out:
+ return ret;
+}
+
+static int vma_expandable(struct vm_area_struct *vma, unsigned long delta)
+{
+ unsigned long end = vma->vm_end + delta;
+ if (end < vma->vm_end) /* overflow */
+ return 0;
+ if (vma->vm_next && vma->vm_next->vm_start < end) /* intersection */
+ return 0;
+ if (get_unmapped_area(NULL, vma->vm_start, end - vma->vm_start,
+ 0, MAP_FIXED) & ~PAGE_MASK)
+ return 0;
+ return 1;
+}
+
/*
* Expand (or shrink) an existing mapping, potentially moving it at the
* same time (controlled by the MREMAP_MAYMOVE flag and available VM space)
@@ -294,32 +425,10 @@ unsigned long do_mremap(unsigned long addr,
if (!new_len)
goto out;
- /* new_addr is only valid if MREMAP_FIXED is specified */
if (flags & MREMAP_FIXED) {
- if (new_addr & ~PAGE_MASK)
- goto out;
- if (!(flags & MREMAP_MAYMOVE))
- goto out;
-
- if (new_len > TASK_SIZE || new_addr > TASK_SIZE - new_len)
- goto out;
-
- /* Check if the location we're moving into overlaps the
- * old location at all, and fail if it does.
- */
- if ((new_addr <= addr) && (new_addr+new_len) > addr)
- goto out;
-
- if ((addr <= new_addr) && (addr+old_len) > new_addr)
- goto out;
-
- ret = security_file_mmap(NULL, 0, 0, 0, new_addr, 1);
- if (ret)
- goto out;
-
- ret = do_munmap(mm, new_addr, new_len);
- if (ret)
- goto out;
+ if (flags & MREMAP_MAYMOVE)
+ ret = mremap_to(addr, old_len, new_addr, new_len);
+ goto out;
}
/*
@@ -332,60 +441,23 @@ unsigned long do_mremap(unsigned long addr,
if (ret && old_len != new_len)
goto out;
ret = addr;
- if (!(flags & MREMAP_FIXED) || (new_addr == addr))
- goto out;
- old_len = new_len;
+ goto out;
}
/*
- * Ok, we need to grow.. or relocate.
+ * Ok, we need to grow..
*/
- ret = -EFAULT;
- vma = find_vma(mm, addr);
- if (!vma || vma->vm_start > addr)
- goto out;
- if (is_vm_hugetlb_page(vma)) {
- ret = -EINVAL;
- goto out;
- }
- /* We can't remap across vm area boundaries */
- if (old_len > vma->vm_end - addr)
- goto out;
- if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)) {
- if (new_len > old_len)
- goto out;
- }
- if (vma->vm_flags & VM_LOCKED) {
- unsigned long locked, lock_limit;
- locked = mm->locked_vm << PAGE_SHIFT;
- lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
- locked += new_len - old_len;
- ret = -EAGAIN;
- if (locked > lock_limit && !capable(CAP_IPC_LOCK))
- goto out;
- }
- if (!may_expand_vm(mm, (new_len - old_len) >> PAGE_SHIFT)) {
- ret = -ENOMEM;
+ vma = vma_to_resize(addr, old_len, new_len, &charged);
+ if (IS_ERR(vma)) {
+ ret = PTR_ERR(vma);
goto out;
}
- if (vma->vm_flags & VM_ACCOUNT) {
- charged = (new_len - old_len) >> PAGE_SHIFT;
- if (security_vm_enough_memory(charged))
- goto out_nc;
- }
-
/* old_len exactly to the end of the area..
- * And we're not relocating the area.
*/
- if (old_len == vma->vm_end - addr &&
- !((flags & MREMAP_FIXED) && (addr != new_addr)) &&
- (old_len != new_len || !(flags & MREMAP_MAYMOVE))) {
- unsigned long max_addr = TASK_SIZE;
- if (vma->vm_next)
- max_addr = vma->vm_next->vm_start;
+ if (old_len == vma->vm_end - addr) {
/* can we just expand the current mapping? */
- if (max_addr - addr >= new_len) {
+ if (vma_expandable(vma, new_len - old_len)) {
int pages = (new_len - old_len) >> PAGE_SHIFT;
vma_adjust(vma, vma->vm_start,
@@ -409,28 +481,27 @@ unsigned long do_mremap(unsigned long addr,
*/
ret = -ENOMEM;
if (flags & MREMAP_MAYMOVE) {
- if (!(flags & MREMAP_FIXED)) {
- unsigned long map_flags = 0;
- if (vma->vm_flags & VM_MAYSHARE)
- map_flags |= MAP_SHARED;
-
- new_addr = get_unmapped_area(vma->vm_file, 0, new_len,
- vma->vm_pgoff, map_flags);
- if (new_addr & ~PAGE_MASK) {
- ret = new_addr;
- goto out;
- }
-
- ret = security_file_mmap(NULL, 0, 0, 0, new_addr, 1);
- if (ret)
- goto out;
+ unsigned long map_flags = 0;
+ if (vma->vm_flags & VM_MAYSHARE)
+ map_flags |= MAP_SHARED;
+
+ new_addr = get_unmapped_area(vma->vm_file, 0, new_len,
+ vma->vm_pgoff +
+ ((addr - vma->vm_start) >> PAGE_SHIFT),
+ map_flags);
+ if (new_addr & ~PAGE_MASK) {
+ ret = new_addr;
+ goto out;
}
+
+ ret = security_file_mmap(NULL, 0, 0, 0, new_addr, 1);
+ if (ret)
+ goto out;
ret = move_vma(vma, addr, old_len, new_len, new_addr);
}
out:
if (ret & ~PAGE_MASK)
vm_unacct_memory(charged);
-out_nc:
return ret;
}
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index ea2147dabba6..9092b43f07fa 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -404,7 +404,7 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
cpuset_print_task_mems_allowed(current);
task_unlock(current);
dump_stack();
- mem_cgroup_print_oom_info(mem, current);
+ mem_cgroup_print_oom_info(mem, p);
show_mem();
if (sysctl_oom_dump_tasks)
dump_tasks(mem);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index cfaa29bbc733..7d4406f07d97 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -560,8 +560,9 @@ static void free_pcppages_bulk(struct zone *zone, int count,
page = list_entry(list->prev, struct page, lru);
/* must delete as __free_one_page list manipulates */
list_del(&page->lru);
- __free_one_page(page, zone, 0, migratetype);
- trace_mm_page_pcpu_drain(page, 0, migratetype);
+ /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */
+ __free_one_page(page, zone, 0, page_private(page));
+ trace_mm_page_pcpu_drain(page, 0, page_private(page));
} while (--count && --batch_free && !list_empty(list));
}
spin_unlock(&zone->lock);
@@ -1226,10 +1227,10 @@ again:
}
spin_lock_irqsave(&zone->lock, flags);
page = __rmqueue(zone, order, migratetype);
- __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order));
spin_unlock(&zone->lock);
if (!page)
goto failed;
+ __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order));
}
__count_zone_vm_events(PGALLOC, zone, 1 << order);
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index d5878bed7841..a286915e23ef 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -1,6 +1,7 @@
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/sched.h>
+#include <linux/hugetlb.h>
static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
struct mm_walk *walk)
@@ -107,6 +108,7 @@ int walk_page_range(unsigned long addr, unsigned long end,
pgd_t *pgd;
unsigned long next;
int err = 0;
+ struct vm_area_struct *vma;
if (addr >= end)
return err;
@@ -117,11 +119,22 @@ int walk_page_range(unsigned long addr, unsigned long end,
pgd = pgd_offset(walk->mm, addr);
do {
next = pgd_addr_end(addr, end);
+
+ /* skip hugetlb vma to avoid hugepage PMD being cleared
+ * in pmd_none_or_clear_bad(). */
+ vma = find_vma(walk->mm, addr);
+ if (vma && is_vm_hugetlb_page(vma)) {
+ if (vma->vm_end < next)
+ next = vma->vm_end;
+ continue;
+ }
+
if (pgd_none_or_clear_bad(pgd)) {
if (walk->pte_hole)
err = walk->pte_hole(addr, next, walk);
if (err)
break;
+ pgd++;
continue;
}
if (walk->pgd_entry)
@@ -131,7 +144,8 @@ int walk_page_range(unsigned long addr, unsigned long end,
err = walk_pud_range(pgd, addr, next, walk);
if (err)
break;
- } while (pgd++, addr = next, addr != end);
+ pgd++;
+ } while (addr = next, addr != end);
return err;
}
diff --git a/mm/truncate.c b/mm/truncate.c
index 450cebdabfc0..258bda7ec31d 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -516,22 +516,20 @@ EXPORT_SYMBOL_GPL(invalidate_inode_pages2);
*/
void truncate_pagecache(struct inode *inode, loff_t old, loff_t new)
{
- if (new < old) {
- struct address_space *mapping = inode->i_mapping;
-
- /*
- * unmap_mapping_range is called twice, first simply for
- * efficiency so that truncate_inode_pages does fewer
- * single-page unmaps. However after this first call, and
- * before truncate_inode_pages finishes, it is possible for
- * private pages to be COWed, which remain after
- * truncate_inode_pages finishes, hence the second
- * unmap_mapping_range call must be made for correctness.
- */
- unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1);
- truncate_inode_pages(mapping, new);
- unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1);
- }
+ struct address_space *mapping = inode->i_mapping;
+
+ /*
+ * unmap_mapping_range is called twice, first simply for
+ * efficiency so that truncate_inode_pages does fewer
+ * single-page unmaps. However after this first call, and
+ * before truncate_inode_pages finishes, it is possible for
+ * private pages to be COWed, which remain after
+ * truncate_inode_pages finishes, hence the second
+ * unmap_mapping_range call must be made for correctness.
+ */
+ unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1);
+ truncate_inode_pages(mapping, new);
+ unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1);
}
EXPORT_SYMBOL(truncate_pagecache);
diff --git a/mm/util.c b/mm/util.c
index 7c35ad95f927..b377ce430803 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -4,6 +4,10 @@
#include <linux/module.h>
#include <linux/err.h>
#include <linux/sched.h>
+#include <linux/hugetlb.h>
+#include <linux/syscalls.h>
+#include <linux/mman.h>
+#include <linux/file.h>
#include <asm/uaccess.h>
#define CREATE_TRACE_POINTS
@@ -268,6 +272,46 @@ int __attribute__((weak)) get_user_pages_fast(unsigned long start,
}
EXPORT_SYMBOL_GPL(get_user_pages_fast);
+SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
+ unsigned long, prot, unsigned long, flags,
+ unsigned long, fd, unsigned long, pgoff)
+{
+ struct file * file = NULL;
+ unsigned long retval = -EBADF;
+
+ if (!(flags & MAP_ANONYMOUS)) {
+ if (unlikely(flags & MAP_HUGETLB))
+ return -EINVAL;
+ file = fget(fd);
+ if (!file)
+ goto out;
+ } else if (flags & MAP_HUGETLB) {
+ struct user_struct *user = NULL;
+ /*
+ * VM_NORESERVE is used because the reservations will be
+ * taken when vm_ops->mmap() is called
+ * A dummy user value is used because we are not locking
+ * memory so no accounting is necessary
+ */
+ len = ALIGN(len, huge_page_size(&default_hstate));
+ file = hugetlb_file_setup(HUGETLB_ANON_FILE, len, VM_NORESERVE,
+ &user, HUGETLB_ANONHUGE_INODE);
+ if (IS_ERR(file))
+ return PTR_ERR(file);
+ }
+
+ flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
+
+ down_write(&current->mm->mmap_sem);
+ retval = do_mmap_pgoff(file, addr, len, prot, flags, pgoff);
+ up_write(&current->mm->mmap_sem);
+
+ if (file)
+ fput(file);
+out:
+ return retval;
+}
+
/* Tracepoints definitions. */
EXPORT_TRACEPOINT_SYMBOL(kmalloc);
EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index c85d75406c81..c2287313b985 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -509,6 +509,9 @@ static unsigned long lazy_max_pages(void)
static atomic_t vmap_lazy_nr = ATOMIC_INIT(0);
+/* for per-CPU blocks */
+static void purge_fragmented_blocks_allcpus(void);
+
/*
* Purges all lazily-freed vmap areas.
*
@@ -539,6 +542,9 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end,
} else
spin_lock(&purge_lock);
+ if (sync)
+ purge_fragmented_blocks_allcpus();
+
rcu_read_lock();
list_for_each_entry_rcu(va, &vmap_area_list, list) {
if (va->flags & VM_LAZY_FREE) {
@@ -667,8 +673,6 @@ static bool vmap_initialized __read_mostly = false;
struct vmap_block_queue {
spinlock_t lock;
struct list_head free;
- struct list_head dirty;
- unsigned int nr_dirty;
};
struct vmap_block {
@@ -678,10 +682,9 @@ struct vmap_block {
unsigned long free, dirty;
DECLARE_BITMAP(alloc_map, VMAP_BBMAP_BITS);
DECLARE_BITMAP(dirty_map, VMAP_BBMAP_BITS);
- union {
- struct list_head free_list;
- struct rcu_head rcu_head;
- };
+ struct list_head free_list;
+ struct rcu_head rcu_head;
+ struct list_head purge;
};
/* Queue of free and dirty vmap blocks, for allocation and flushing purposes */
@@ -757,7 +760,7 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask)
vbq = &get_cpu_var(vmap_block_queue);
vb->vbq = vbq;
spin_lock(&vbq->lock);
- list_add(&vb->free_list, &vbq->free);
+ list_add_rcu(&vb->free_list, &vbq->free);
spin_unlock(&vbq->lock);
put_cpu_var(vmap_cpu_blocks);
@@ -776,8 +779,6 @@ static void free_vmap_block(struct vmap_block *vb)
struct vmap_block *tmp;
unsigned long vb_idx;
- BUG_ON(!list_empty(&vb->free_list));
-
vb_idx = addr_to_vb_idx(vb->va->va_start);
spin_lock(&vmap_block_tree_lock);
tmp = radix_tree_delete(&vmap_block_tree, vb_idx);
@@ -788,12 +789,61 @@ static void free_vmap_block(struct vmap_block *vb)
call_rcu(&vb->rcu_head, rcu_free_vb);
}
+static void purge_fragmented_blocks(int cpu)
+{
+ LIST_HEAD(purge);
+ struct vmap_block *vb;
+ struct vmap_block *n_vb;
+ struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu);
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(vb, &vbq->free, free_list) {
+
+ if (!(vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS))
+ continue;
+
+ spin_lock(&vb->lock);
+ if (vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS) {
+ vb->free = 0; /* prevent further allocs after releasing lock */
+ vb->dirty = VMAP_BBMAP_BITS; /* prevent purging it again */
+ bitmap_fill(vb->alloc_map, VMAP_BBMAP_BITS);
+ bitmap_fill(vb->dirty_map, VMAP_BBMAP_BITS);
+ spin_lock(&vbq->lock);
+ list_del_rcu(&vb->free_list);
+ spin_unlock(&vbq->lock);
+ spin_unlock(&vb->lock);
+ list_add_tail(&vb->purge, &purge);
+ } else
+ spin_unlock(&vb->lock);
+ }
+ rcu_read_unlock();
+
+ list_for_each_entry_safe(vb, n_vb, &purge, purge) {
+ list_del(&vb->purge);
+ free_vmap_block(vb);
+ }
+}
+
+static void purge_fragmented_blocks_thiscpu(void)
+{
+ purge_fragmented_blocks(smp_processor_id());
+}
+
+static void purge_fragmented_blocks_allcpus(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ purge_fragmented_blocks(cpu);
+}
+
static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
{
struct vmap_block_queue *vbq;
struct vmap_block *vb;
unsigned long addr = 0;
unsigned int order;
+ int purge = 0;
BUG_ON(size & ~PAGE_MASK);
BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
@@ -806,24 +856,37 @@ again:
int i;
spin_lock(&vb->lock);
+ if (vb->free < 1UL << order)
+ goto next;
i = bitmap_find_free_region(vb->alloc_map,
VMAP_BBMAP_BITS, order);
- if (i >= 0) {
- addr = vb->va->va_start + (i << PAGE_SHIFT);
- BUG_ON(addr_to_vb_idx(addr) !=
- addr_to_vb_idx(vb->va->va_start));
- vb->free -= 1UL << order;
- if (vb->free == 0) {
- spin_lock(&vbq->lock);
- list_del_init(&vb->free_list);
- spin_unlock(&vbq->lock);
+ if (i < 0) {
+ if (vb->free + vb->dirty == VMAP_BBMAP_BITS) {
+ /* fragmented and no outstanding allocations */
+ BUG_ON(vb->dirty != VMAP_BBMAP_BITS);
+ purge = 1;
}
- spin_unlock(&vb->lock);
- break;
+ goto next;
+ }
+ addr = vb->va->va_start + (i << PAGE_SHIFT);
+ BUG_ON(addr_to_vb_idx(addr) !=
+ addr_to_vb_idx(vb->va->va_start));
+ vb->free -= 1UL << order;
+ if (vb->free == 0) {
+ spin_lock(&vbq->lock);
+ list_del_rcu(&vb->free_list);
+ spin_unlock(&vbq->lock);
}
spin_unlock(&vb->lock);
+ break;
+next:
+ spin_unlock(&vb->lock);
}
+
+ if (purge)
+ purge_fragmented_blocks_thiscpu();
+
put_cpu_var(vmap_cpu_blocks);
rcu_read_unlock();
@@ -860,11 +923,11 @@ static void vb_free(const void *addr, unsigned long size)
BUG_ON(!vb);
spin_lock(&vb->lock);
- bitmap_allocate_region(vb->dirty_map, offset >> PAGE_SHIFT, order);
+ BUG_ON(bitmap_allocate_region(vb->dirty_map, offset >> PAGE_SHIFT, order));
vb->dirty += 1UL << order;
if (vb->dirty == VMAP_BBMAP_BITS) {
- BUG_ON(vb->free || !list_empty(&vb->free_list));
+ BUG_ON(vb->free);
spin_unlock(&vb->lock);
free_vmap_block(vb);
} else
@@ -1033,8 +1096,6 @@ void __init vmalloc_init(void)
vbq = &per_cpu(vmap_block_queue, i);
spin_lock_init(&vbq->lock);
INIT_LIST_HEAD(&vbq->free);
- INIT_LIST_HEAD(&vbq->dirty);
- vbq->nr_dirty = 0;
}
/* Import existing vmlist entries. */
@@ -1991,6 +2052,7 @@ void free_vm_area(struct vm_struct *area)
}
EXPORT_SYMBOL_GPL(free_vm_area);
+#ifndef CONFIG_HAVE_LEGACY_PER_CPU_AREA
static struct vmap_area *node_to_va(struct rb_node *n)
{
return n ? rb_entry(n, struct vmap_area, rb_node) : NULL;
@@ -2255,6 +2317,7 @@ err_free:
kfree(vms);
return NULL;
}
+#endif
/**
* pcpu_free_vm_areas - free vmalloc areas for percpu allocator
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 777af57fd8c8..692807f2228b 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1464,20 +1464,26 @@ static int inactive_file_is_low(struct zone *zone, struct scan_control *sc)
return low;
}
+static int inactive_list_is_low(struct zone *zone, struct scan_control *sc,
+ int file)
+{
+ if (file)
+ return inactive_file_is_low(zone, sc);
+ else
+ return inactive_anon_is_low(zone, sc);
+}
+
static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
struct zone *zone, struct scan_control *sc, int priority)
{
int file = is_file_lru(lru);
- if (lru == LRU_ACTIVE_FILE && inactive_file_is_low(zone, sc)) {
- shrink_active_list(nr_to_scan, zone, sc, priority, file);
+ if (is_active_lru(lru)) {
+ if (inactive_list_is_low(zone, sc, file))
+ shrink_active_list(nr_to_scan, zone, sc, priority, file);
return 0;
}
- if (lru == LRU_ACTIVE_ANON && inactive_anon_is_low(zone, sc)) {
- shrink_active_list(nr_to_scan, zone, sc, priority, file);
- return 0;
- }
return shrink_inactive_list(nr_to_scan, zone, sc, priority, file);
}