summaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
authorAlex Gonzalez <alex.gonzalez@digi.com>2012-01-19 13:54:23 +0100
committerAlex Gonzalez <alex.gonzalez@digi.com>2012-01-19 13:54:23 +0100
commit802699c91a967767fc94759f7a3e5e82d8269245 (patch)
treec8b714dd25edd333efbbf8bb1eb6c3d379084cc4 /mm
parentf135e68daa6745fd3dbb285e6161ae2758c4027f (diff)
parent675f7660ffb0e1880011f6b3c4f9ac241491e3cd (diff)
Merge commit 'v2.6.35.14' into del-5.8/main
Conflicts: arch/arm/plat-mxc/include/mach/gpio.h arch/x86/kernel/cpu/mtrr/main.c drivers/mmc/core/core.c drivers/net/smsc911x.c fs/proc/task_mmu.c include/linux/pm_runtime.h mm/memory.c mm/mlock.c Signed-off-by: Alex Gonzalez <alex.gonzalez@digi.com>
Diffstat (limited to 'mm')
-rw-r--r--mm/backing-dev.c3
-rw-r--r--mm/bounce.c2
-rw-r--r--mm/compaction.c7
-rw-r--r--mm/filemap.c29
-rw-r--r--mm/hugetlb.c12
-rw-r--r--mm/internal.h2
-rw-r--r--mm/kmemleak.c7
-rw-r--r--mm/ksm.c15
-rw-r--r--mm/memory.c142
-rw-r--r--mm/memory_hotplug.c18
-rw-r--r--mm/mempolicy.c2
-rw-r--r--mm/migrate.c2
-rw-r--r--mm/mlock.c15
-rw-r--r--mm/mmap.c41
-rw-r--r--mm/mprotect.c2
-rw-r--r--mm/mremap.c15
-rw-r--r--mm/nommu.c17
-rw-r--r--mm/page-writeback.c26
-rw-r--r--mm/page_alloc.c85
-rw-r--r--mm/percpu.c2
-rw-r--r--mm/shmem.c1
-rw-r--r--mm/slab.c12
-rw-r--r--mm/swapfile.c29
-rw-r--r--mm/vmalloc.c9
-rw-r--r--mm/vmscan.c68
-rw-r--r--mm/vmstat.c79
26 files changed, 482 insertions, 160 deletions
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 123bcef13e51..3f2b8d037c98 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -29,6 +29,7 @@ EXPORT_SYMBOL_GPL(default_backing_dev_info);
struct backing_dev_info noop_backing_dev_info = {
.name = "noop",
+ .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
};
EXPORT_SYMBOL_GPL(noop_backing_dev_info);
@@ -254,6 +255,7 @@ static int __init default_bdi_init(void)
err = bdi_init(&default_backing_dev_info);
if (!err)
bdi_register(&default_backing_dev_info, NULL, "default");
+ err = bdi_init(&noop_backing_dev_info);
return err;
}
@@ -644,6 +646,7 @@ static void bdi_prune_sb(struct backing_dev_info *bdi)
void bdi_unregister(struct backing_dev_info *bdi)
{
if (bdi->dev) {
+ bdi_set_min_ratio(bdi, 0);
bdi_prune_sb(bdi);
if (!bdi_cap_flush_forker(bdi))
diff --git a/mm/bounce.c b/mm/bounce.c
index 13b6dad1eed2..1481de68184b 100644
--- a/mm/bounce.c
+++ b/mm/bounce.c
@@ -116,8 +116,8 @@ static void copy_to_high_bio_irq(struct bio *to, struct bio *from)
*/
vfrom = page_address(fromvec->bv_page) + tovec->bv_offset;
- flush_dcache_page(tovec->bv_page);
bounce_copy_vec(tovec, vfrom);
+ flush_dcache_page(tovec->bv_page);
}
}
diff --git a/mm/compaction.c b/mm/compaction.c
index 94cce51b0b35..4d709ee59013 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -214,15 +214,16 @@ static void acct_isolated(struct zone *zone, struct compact_control *cc)
/* Similar to reclaim, but different enough that they don't share logic */
static bool too_many_isolated(struct zone *zone)
{
-
- unsigned long inactive, isolated;
+ unsigned long active, inactive, isolated;
inactive = zone_page_state(zone, NR_INACTIVE_FILE) +
zone_page_state(zone, NR_INACTIVE_ANON);
+ active = zone_page_state(zone, NR_ACTIVE_FILE) +
+ zone_page_state(zone, NR_ACTIVE_ANON);
isolated = zone_page_state(zone, NR_ISOLATED_FILE) +
zone_page_state(zone, NR_ISOLATED_ANON);
- return isolated > inactive;
+ return isolated > (inactive + active) / 2;
}
/*
diff --git a/mm/filemap.c b/mm/filemap.c
index 20e5642e9f9f..183d2d439b28 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -631,7 +631,9 @@ repeat:
pagep = radix_tree_lookup_slot(&mapping->page_tree, offset);
if (pagep) {
page = radix_tree_deref_slot(pagep);
- if (unlikely(!page || page == RADIX_TREE_RETRY))
+ if (unlikely(!page))
+ goto out;
+ if (radix_tree_deref_retry(page))
goto repeat;
if (!page_cache_get_speculative(page))
@@ -647,6 +649,7 @@ repeat:
goto repeat;
}
}
+out:
rcu_read_unlock();
return page;
@@ -764,12 +767,11 @@ repeat:
page = radix_tree_deref_slot((void **)pages[i]);
if (unlikely(!page))
continue;
- /*
- * this can only trigger if nr_found == 1, making livelock
- * a non issue.
- */
- if (unlikely(page == RADIX_TREE_RETRY))
+ if (radix_tree_deref_retry(page)) {
+ if (ret)
+ start = pages[ret-1]->index;
goto restart;
+ }
if (!page_cache_get_speculative(page))
goto repeat;
@@ -817,11 +819,7 @@ repeat:
page = radix_tree_deref_slot((void **)pages[i]);
if (unlikely(!page))
continue;
- /*
- * this can only trigger if nr_found == 1, making livelock
- * a non issue.
- */
- if (unlikely(page == RADIX_TREE_RETRY))
+ if (radix_tree_deref_retry(page))
goto restart;
if (page->mapping == NULL || page->index != index)
@@ -874,11 +872,7 @@ repeat:
page = radix_tree_deref_slot((void **)pages[i]);
if (unlikely(!page))
continue;
- /*
- * this can only trigger if nr_found == 1, making livelock
- * a non issue.
- */
- if (unlikely(page == RADIX_TREE_RETRY))
+ if (radix_tree_deref_retry(page))
goto restart;
if (!page_cache_get_speculative(page))
@@ -1016,6 +1010,9 @@ find_page:
goto page_not_up_to_date;
if (!trylock_page(page))
goto page_not_up_to_date;
+ /* Did it get truncated before we got the lock? */
+ if (!page->mapping)
+ goto page_not_up_to_date_locked;
if (!mapping->a_ops->is_partially_uptodate(page,
desc, offset))
goto page_not_up_to_date_locked;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 54d42b009dbe..ffede4d34c69 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1030,10 +1030,10 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
*/
chg = vma_needs_reservation(h, vma, addr);
if (chg < 0)
- return ERR_PTR(chg);
+ return ERR_PTR(-VM_FAULT_OOM);
if (chg)
if (hugetlb_get_quota(inode->i_mapping, chg))
- return ERR_PTR(-ENOSPC);
+ return ERR_PTR(-VM_FAULT_SIGBUS);
spin_lock(&hugetlb_lock);
page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve);
@@ -1109,6 +1109,14 @@ static void __init gather_bootmem_prealloc(void)
WARN_ON(page_count(page) != 1);
prep_compound_huge_page(page, h->order);
prep_new_huge_page(h, page, page_to_nid(page));
+ /*
+ * If we had gigantic hugepages allocated at boot time, we need
+ * to restore the 'stolen' pages to totalram_pages in order to
+ * fix confusing memory reports from free(1) and another
+ * side-effects, like CommitLimit going negative.
+ */
+ if (h->order > (MAX_ORDER - 1))
+ totalram_pages += 1 << h->order;
}
}
diff --git a/mm/internal.h b/mm/internal.h
index 6a697bb97fc5..dedb0aff673f 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -62,7 +62,7 @@ extern bool is_free_buddy_page(struct page *page);
*/
static inline unsigned long page_order(struct page *page)
{
- VM_BUG_ON(!PageBuddy(page));
+ /* PageBuddy() must be checked by the caller */
return page_private(page);
}
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 2c0d032ac898..49c94b53a6c6 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -1368,9 +1368,12 @@ static void *kmemleak_seq_next(struct seq_file *seq, void *v, loff_t *pos)
++(*pos);
list_for_each_continue_rcu(n, &object_list) {
- next_obj = list_entry(n, struct kmemleak_object, object_list);
- if (get_object(next_obj))
+ struct kmemleak_object *obj =
+ list_entry(n, struct kmemleak_object, object_list);
+ if (get_object(obj)) {
+ next_obj = obj;
break;
+ }
}
put_object(prev_obj);
diff --git a/mm/ksm.c b/mm/ksm.c
index 6c3e99b4ae7c..2db786a576ac 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -731,7 +731,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
if (!ptep)
goto out;
- if (pte_write(*ptep)) {
+ if (pte_write(*ptep) || pte_dirty(*ptep)) {
pte_t entry;
swapped = PageSwapCache(page);
@@ -754,7 +754,9 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
set_pte_at(mm, addr, ptep, entry);
goto out_unlock;
}
- entry = pte_wrprotect(entry);
+ if (pte_dirty(entry))
+ set_page_dirty(page);
+ entry = pte_mkclean(pte_wrprotect(entry));
set_pte_at_notify(mm, addr, ptep, entry);
}
*orig_pte = *ptep;
@@ -1270,6 +1272,12 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page)
slot = list_entry(slot->mm_list.next, struct mm_slot, mm_list);
ksm_scan.mm_slot = slot;
spin_unlock(&ksm_mmlist_lock);
+ /*
+ * Although we tested list_empty() above, a racing __ksm_exit
+ * of the last mm on the list may have removed it since then.
+ */
+ if (slot == &ksm_mm_head)
+ return NULL;
next_mm:
ksm_scan.address = 0;
ksm_scan.rmap_list = &slot->rmap_list;
@@ -1523,8 +1531,6 @@ struct page *ksm_does_need_to_copy(struct page *page,
{
struct page *new_page;
- unlock_page(page); /* any racers will COW it, not modify it */
-
new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
if (new_page) {
copy_user_highpage(new_page, page, address, vma);
@@ -1540,7 +1546,6 @@ struct page *ksm_does_need_to_copy(struct page *page,
add_page_to_unevictable_list(new_page);
}
- page_cache_release(page);
return new_page;
}
diff --git a/mm/memory.c b/mm/memory.c
index c43447382fec..d2f0c08adf82 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1353,7 +1353,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET));
- /*
+ /*
* Require read or write permissions.
* If FOLL_FORCE is set, we only require the "MAY" flags.
*/
@@ -1497,7 +1497,63 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
return i;
}
-/**
+/*
+ * fixup_user_fault() - manually resolve a user page fault
+ * @tsk: the task_struct to use for page fault accounting, or
+ * NULL if faults are not to be recorded.
+ * @mm: mm_struct of target mm
+ * @address: user address
+ * @fault_flags:flags to pass down to handle_mm_fault()
+ *
+ * This is meant to be called in the specific scenario where for locking reasons
+ * we try to access user memory in atomic context (within a pagefault_disable()
+ * section), this returns -EFAULT, and we want to resolve the user fault before
+ * trying again.
+ *
+ * Typically this is meant to be used by the futex code.
+ *
+ * The main difference with get_user_pages() is that this function will
+ * unconditionally call handle_mm_fault() which will in turn perform all the
+ * necessary SW fixup of the dirty and young bits in the PTE, while
+ * handle_mm_fault() only guarantees to update these in the struct page.
+ *
+ * This is important for some architectures where those bits also gate the
+ * access permission to the page because they are maintained in software. On
+ * such architectures, gup() will not be enough to make a subsequent access
+ * succeed.
+ *
+ * This should be called with the mm_sem held for read.
+ */
+int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
+ unsigned long address, unsigned int fault_flags)
+{
+ struct vm_area_struct *vma;
+ int ret;
+
+ vma = find_extend_vma(mm, address);
+ if (!vma || address < vma->vm_start)
+ return -EFAULT;
+
+ ret = handle_mm_fault(mm, vma, address, fault_flags);
+ if (ret & VM_FAULT_ERROR) {
+ if (ret & VM_FAULT_OOM)
+ return -ENOMEM;
+ if (ret & VM_FAULT_HWPOISON)
+ return -EFAULT;
+ if (ret & VM_FAULT_SIGBUS)
+ return -EFAULT;
+ BUG();
+ }
+ if (tsk) {
+ if (ret & VM_FAULT_MAJOR)
+ tsk->maj_flt++;
+ else
+ tsk->min_flt++;
+ }
+ return 0;
+}
+
+/*
* get_user_pages() - pin user pages in memory
* @tsk: task_struct of target task
* @mm: mm_struct of target mm
@@ -2573,6 +2629,7 @@ void unmap_mapping_range(struct address_space *mapping,
details.last_index = ULONG_MAX;
details.i_mmap_lock = &mapping->i_mmap_lock;
+ mutex_lock(&mapping->unmap_mutex);
spin_lock(&mapping->i_mmap_lock);
/* Protect against endless unmapping loops */
@@ -2589,6 +2646,7 @@ void unmap_mapping_range(struct address_space *mapping,
if (unlikely(!list_empty(&mapping->i_mmap_nonlinear)))
unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details);
spin_unlock(&mapping->i_mmap_lock);
+ mutex_unlock(&mapping->unmap_mutex);
}
EXPORT_SYMBOL(unmap_mapping_range);
@@ -2626,7 +2684,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned int flags, pte_t orig_pte)
{
spinlock_t *ptl;
- struct page *page;
+ struct page *page, *swapcache = NULL;
swp_entry_t entry;
pte_t pte;
struct mem_cgroup *ptr = NULL;
@@ -2681,10 +2739,25 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
lock_page(page);
delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
- page = ksm_might_need_to_copy(page, vma, address);
- if (!page) {
- ret = VM_FAULT_OOM;
- goto out;
+ /*
+ * Make sure try_to_free_swap or reuse_swap_page or swapoff did not
+ * release the swapcache from under us. The page pin, and pte_same
+ * test below, are not enough to exclude that. Even if it is still
+ * swapcache, we need to check that the page's swap has not changed.
+ */
+ if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val))
+ goto out_page;
+
+ if (ksm_might_need_to_copy(page, vma, address)) {
+ swapcache = page;
+ page = ksm_does_need_to_copy(page, vma, address);
+
+ if (unlikely(!page)) {
+ ret = VM_FAULT_OOM;
+ page = swapcache;
+ swapcache = NULL;
+ goto out_page;
+ }
}
if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) {
@@ -2735,6 +2808,18 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
try_to_free_swap(page);
unlock_page(page);
+ if (swapcache) {
+ /*
+ * Hold the lock to avoid the swap entry to be reused
+ * until we take the PT lock for the pte_same() check
+ * (to avoid false positives from pte_same). For
+ * further safety release the lock after the swap_free
+ * so that the swap count won't change under a
+ * parallel locked swapcache.
+ */
+ unlock_page(swapcache);
+ page_cache_release(swapcache);
+ }
if (flags & FAULT_FLAG_WRITE) {
ret |= do_wp_page(mm, vma, address, page_table, pmd, ptl, pte);
@@ -2756,25 +2841,43 @@ out_page:
unlock_page(page);
out_release:
page_cache_release(page);
+ if (swapcache) {
+ unlock_page(swapcache);
+ page_cache_release(swapcache);
+ }
return ret;
}
/*
- * This is like a special single-page "expand_downwards()",
- * except we must first make sure that 'address-PAGE_SIZE'
+ * This is like a special single-page "expand_{down|up}wards()",
+ * except we must first make sure that 'address{-|+}PAGE_SIZE'
* doesn't hit another vma.
- *
- * The "find_vma()" will do the right thing even if we wrap
*/
static inline int check_stack_guard_page(struct vm_area_struct *vma, unsigned long address)
{
address &= PAGE_MASK;
if ((vma->vm_flags & VM_GROWSDOWN) && address == vma->vm_start) {
- address -= PAGE_SIZE;
- if (find_vma(vma->vm_mm, address) != vma)
- return -ENOMEM;
+ struct vm_area_struct *prev = vma->vm_prev;
- expand_stack(vma, address);
+ /*
+ * Is there a mapping abutting this one below?
+ *
+ * That's only ok if it's the same stack mapping
+ * that has gotten split..
+ */
+ if (prev && prev->vm_end == address)
+ return prev->vm_flags & VM_GROWSDOWN ? 0 : -ENOMEM;
+
+ expand_stack(vma, address - PAGE_SIZE);
+ }
+ if ((vma->vm_flags & VM_GROWSUP) && address + PAGE_SIZE == vma->vm_end) {
+ struct vm_area_struct *next = vma->vm_next;
+
+ /* As VM_GROWSDOWN but s/below/above/ */
+ if (next && next->vm_start == address + PAGE_SIZE)
+ return next->vm_flags & VM_GROWSUP ? 0 : -ENOMEM;
+
+ expand_upwards(vma, address + PAGE_SIZE);
}
return 0;
}
@@ -2792,22 +2895,23 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
spinlock_t *ptl;
pte_t entry;
+ pte_unmap(page_table);
+
+ /* Check if we need to add a guard page to the stack */
if (check_stack_guard_page(vma, address) < 0)
return VM_FAULT_SIGBUS;
+ /* Use the zero-page for reads */
if (!(flags & FAULT_FLAG_WRITE)) {
entry = pte_mkspecial(pfn_pte(my_zero_pfn(address),
vma->vm_page_prot));
- ptl = pte_lockptr(mm, pmd);
- spin_lock(ptl);
+ page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
if (!pte_none(*page_table))
goto unlock;
goto setpte;
}
/* Allocate our own private page. */
- pte_unmap(page_table);
-
if (unlikely(anon_vma_prepare(vma)))
goto oom;
page = alloc_zeroed_user_highpage_movable(vma, address);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index a4cfcdc00455..6345dfe78d2c 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -584,19 +584,19 @@ static inline int pageblock_free(struct page *page)
/* Return the start of the next active pageblock after a given page */
static struct page *next_active_pageblock(struct page *page)
{
- int pageblocks_stride;
-
/* Ensure the starting page is pageblock-aligned */
BUG_ON(page_to_pfn(page) & (pageblock_nr_pages - 1));
- /* Move forward by at least 1 * pageblock_nr_pages */
- pageblocks_stride = 1;
-
/* If the entire pageblock is free, move to the end of free page */
- if (pageblock_free(page))
- pageblocks_stride += page_order(page) - pageblock_order;
+ if (pageblock_free(page)) {
+ int order;
+ /* be careful. we don't have locks, page_order can be changed.*/
+ order = page_order(page);
+ if ((order < MAX_ORDER) && (order >= pageblock_order))
+ return page + (1 << order);
+ }
- return page + (pageblocks_stride * pageblock_nr_pages);
+ return page + pageblock_nr_pages;
}
/* Checks if this range of memory is likely to be hot-removable. */
@@ -659,7 +659,7 @@ static int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn)
* Scanning pfn is much easier than scanning lru list.
* Scan pfn from start to end and Find LRU page.
*/
-int scan_lru_pages(unsigned long start, unsigned long end)
+unsigned long scan_lru_pages(unsigned long start, unsigned long end)
{
unsigned long pfn;
struct page *page;
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 5bc0a96beb51..407cda2d753f 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1576,7 +1576,7 @@ unsigned slab_node(struct mempolicy *policy)
(void)first_zones_zonelist(zonelist, highest_zoneidx,
&policy->v.nodes,
&zone);
- return zone->node;
+ return zone ? zone->node : numa_node_id();
}
default:
diff --git a/mm/migrate.c b/mm/migrate.c
index 4205b1d6049e..ca71d064bce7 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -266,7 +266,7 @@ static int migrate_page_move_mapping(struct address_space *mapping,
*/
__dec_zone_page_state(page, NR_FILE_PAGES);
__inc_zone_page_state(newpage, NR_FILE_PAGES);
- if (PageSwapBacked(page)) {
+ if (!PageSwapCache(page) && PageSwapBacked(page)) {
__dec_zone_page_state(page, NR_SHMEM);
__inc_zone_page_state(newpage, NR_SHMEM);
}
diff --git a/mm/mlock.c b/mm/mlock.c
index 49e5e4cb8232..b70919ce4f72 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -135,6 +135,13 @@ void munlock_vma_page(struct page *page)
}
}
+static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long addr)
+{
+ return (vma->vm_flags & VM_GROWSDOWN) &&
+ (vma->vm_start == addr) &&
+ !vma_stack_continue(vma->vm_prev, addr);
+}
+
/**
* __mlock_vma_pages_range() - mlock a range of pages in the vma.
* @vma: target vma
@@ -168,11 +175,9 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma,
gup_flags |= FOLL_WRITE;
/* We don't try to access the guard page of a stack vma */
- if (vma->vm_flags & VM_GROWSDOWN) {
- if (start == vma->vm_start) {
- start += PAGE_SIZE;
- nr_pages--;
- }
+ if (stack_guard_page(vma, start)) {
+ addr += PAGE_SIZE;
+ nr_pages--;
}
while (nr_pages > 0) {
diff --git a/mm/mmap.c b/mm/mmap.c
index 456ec6f27889..073b39d32bc4 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -388,17 +388,23 @@ static inline void
__vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
struct vm_area_struct *prev, struct rb_node *rb_parent)
{
+ struct vm_area_struct *next;
+
+ vma->vm_prev = prev;
if (prev) {
- vma->vm_next = prev->vm_next;
+ next = prev->vm_next;
prev->vm_next = vma;
} else {
mm->mmap = vma;
if (rb_parent)
- vma->vm_next = rb_entry(rb_parent,
+ next = rb_entry(rb_parent,
struct vm_area_struct, vm_rb);
else
- vma->vm_next = NULL;
+ next = NULL;
}
+ vma->vm_next = next;
+ if (next)
+ next->vm_prev = vma;
}
void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
@@ -485,7 +491,11 @@ static inline void
__vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma,
struct vm_area_struct *prev)
{
- prev->vm_next = vma->vm_next;
+ struct vm_area_struct *next = vma->vm_next;
+
+ prev->vm_next = next;
+ if (next)
+ next->vm_prev = prev;
rb_erase(&vma->vm_rb, &mm->mm_rb);
if (mm->mmap_cache == vma)
mm->mmap_cache = prev;
@@ -1694,9 +1704,6 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns
* PA-RISC uses this for its stack; IA64 for its Register Backing Store.
* vma is the last one with address > vma->vm_end. Have to extend vma.
*/
-#ifndef CONFIG_IA64
-static
-#endif
int expand_upwards(struct vm_area_struct *vma, unsigned long address)
{
int error;
@@ -1900,6 +1907,7 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long addr;
insertion_point = (prev ? &prev->vm_next : &mm->mmap);
+ vma->vm_prev = NULL;
do {
rb_erase(&vma->vm_rb, &mm->mm_rb);
mm->map_count--;
@@ -1907,6 +1915,8 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
vma = vma->vm_next;
} while (vma && vma->vm_start < end);
*insertion_point = vma;
+ if (vma)
+ vma->vm_prev = prev;
tail_vma->vm_next = NULL;
if (mm->unmap_area == arch_unmap_area)
addr = prev ? prev->vm_end : mm->mmap_base;
@@ -1984,6 +1994,7 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
removed_exe_file_vma(mm);
fput(new->vm_file);
}
+ unlink_anon_vmas(new);
out_free_mpol:
mpol_put(pol);
out_free_vma:
@@ -2433,6 +2444,7 @@ int install_special_mapping(struct mm_struct *mm,
unsigned long addr, unsigned long len,
unsigned long vm_flags, struct page **pages)
{
+ int ret;
struct vm_area_struct *vma;
vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
@@ -2450,16 +2462,23 @@ int install_special_mapping(struct mm_struct *mm,
vma->vm_ops = &special_mapping_vmops;
vma->vm_private_data = pages;
- if (unlikely(insert_vm_struct(mm, vma))) {
- kmem_cache_free(vm_area_cachep, vma);
- return -ENOMEM;
- }
+ ret = security_file_mmap(NULL, 0, 0, 0, vma->vm_start, 1);
+ if (ret)
+ goto out;
+
+ ret = insert_vm_struct(mm, vma);
+ if (ret)
+ goto out;
mm->total_vm += len >> PAGE_SHIFT;
perf_event_mmap(vma);
return 0;
+
+out:
+ kmem_cache_free(vm_area_cachep, vma);
+ return ret;
}
static DEFINE_MUTEX(mm_all_locks_mutex);
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 2d1bf7cf8851..4c5133873097 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -211,6 +211,7 @@ success:
mmu_notifier_invalidate_range_end(mm, start, end);
vm_stat_account(mm, oldflags, vma->vm_file, -nrpages);
vm_stat_account(mm, newflags, vma->vm_file, nrpages);
+ perf_event_mmap(vma);
return 0;
fail:
@@ -299,7 +300,6 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
error = mprotect_fixup(vma, &prev, nstart, tmp, newflags);
if (error)
goto out;
- perf_event_mmap(vma);
nstart = tmp;
if (nstart < prev->vm_end)
diff --git a/mm/mremap.c b/mm/mremap.c
index cde56ee51ef7..97de5aef9431 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -91,9 +91,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
*/
mapping = vma->vm_file->f_mapping;
spin_lock(&mapping->i_mmap_lock);
- if (new_vma->vm_truncate_count &&
- new_vma->vm_truncate_count != vma->vm_truncate_count)
- new_vma->vm_truncate_count = 0;
+ new_vma->vm_truncate_count = 0;
}
/*
@@ -276,9 +274,16 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,
if (old_len > vma->vm_end - addr)
goto Efault;
- if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)) {
- if (new_len > old_len)
+ /* Need to be careful about a growing mapping */
+ if (new_len > old_len) {
+ unsigned long pgoff;
+
+ if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP))
goto Efault;
+ pgoff = (addr - vma->vm_start) >> PAGE_SHIFT;
+ pgoff += vma->vm_pgoff;
+ if (pgoff + (new_len >> PAGE_SHIFT) < pgoff)
+ goto Einval;
}
if (vma->vm_flags & VM_LOCKED) {
diff --git a/mm/nommu.c b/mm/nommu.c
index b76f3ee0abe0..fbdb2e766758 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -609,7 +609,7 @@ static void protect_vma(struct vm_area_struct *vma, unsigned long flags)
*/
static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
{
- struct vm_area_struct *pvma, **pp;
+ struct vm_area_struct *pvma, **pp, *next;
struct address_space *mapping;
struct rb_node **p, *parent;
@@ -669,8 +669,11 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
break;
}
- vma->vm_next = *pp;
+ next = *pp;
*pp = vma;
+ vma->vm_next = next;
+ if (next)
+ next->vm_prev = vma;
}
/*
@@ -1670,6 +1673,7 @@ void exit_mmap(struct mm_struct *mm)
mm->mmap = vma->vm_next;
delete_vma_from_mm(vma);
delete_vma(mm, vma);
+ cond_resched();
}
kleave("");
@@ -1743,10 +1747,13 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
return NULL;
}
-int remap_pfn_range(struct vm_area_struct *vma, unsigned long from,
- unsigned long to, unsigned long size, pgprot_t prot)
+int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
+ unsigned long pfn, unsigned long size, pgprot_t prot)
{
- vma->vm_start = vma->vm_pgoff << PAGE_SHIFT;
+ if (addr != (pfn << PAGE_SHIFT))
+ return -EINVAL;
+
+ vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP;
return 0;
}
EXPORT_SYMBOL(remap_pfn_range);
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 37498ef61548..582cba18ae02 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -949,22 +949,16 @@ continue_unlock:
}
}
- if (wbc->nr_to_write > 0) {
- if (--wbc->nr_to_write == 0 &&
- wbc->sync_mode == WB_SYNC_NONE) {
- /*
- * We stop writing back only if we are
- * not doing integrity sync. In case of
- * integrity sync we have to keep going
- * because someone may be concurrently
- * dirtying pages, and we might have
- * synced a lot of newly appeared dirty
- * pages, but have not synced all of the
- * old dirty pages.
- */
- done = 1;
- break;
- }
+ /*
+ * We stop writing back only if we are not doing
+ * integrity sync. In case of integrity sync we have to
+ * keep going until we have written all the pages
+ * we tagged for writeback prior to entering this loop.
+ */
+ if (--wbc->nr_to_write <= 0 &&
+ wbc->sync_mode == WB_SYNC_NONE) {
+ done = 1;
+ break;
}
}
pagevec_release(&pvec);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 9bd339eb04c6..98c699d9136f 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -103,19 +103,24 @@ gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
* only be modified with pm_mutex held, unless the suspend/hibernate code is
* guaranteed not to run in parallel with that modification).
*/
-void set_gfp_allowed_mask(gfp_t mask)
+
+static gfp_t saved_gfp_mask;
+
+void pm_restore_gfp_mask(void)
{
WARN_ON(!mutex_is_locked(&pm_mutex));
- gfp_allowed_mask = mask;
+ if (saved_gfp_mask) {
+ gfp_allowed_mask = saved_gfp_mask;
+ saved_gfp_mask = 0;
+ }
}
-gfp_t clear_gfp_allowed_mask(gfp_t mask)
+void pm_restrict_gfp_mask(void)
{
- gfp_t ret = gfp_allowed_mask;
-
WARN_ON(!mutex_is_locked(&pm_mutex));
- gfp_allowed_mask &= ~mask;
- return ret;
+ WARN_ON(saved_gfp_mask);
+ saved_gfp_mask = gfp_allowed_mask;
+ gfp_allowed_mask &= ~GFP_IOFS;
}
#endif /* CONFIG_PM_SLEEP */
@@ -530,7 +535,7 @@ static inline void __free_one_page(struct page *page,
* so it's less likely to be used soon and more likely to be merged
* as a higher order page
*/
- if ((order < MAX_ORDER-1) && pfn_valid_within(page_to_pfn(buddy))) {
+ if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) {
struct page *higher_page, *higher_buddy;
combined_idx = __find_combined_index(page_idx, order);
higher_page = page + combined_idx - page_idx;
@@ -588,13 +593,13 @@ static void free_pcppages_bulk(struct zone *zone, int count,
{
int migratetype = 0;
int batch_free = 0;
+ int to_free = count;
spin_lock(&zone->lock);
zone->all_unreclaimable = 0;
zone->pages_scanned = 0;
- __mod_zone_page_state(zone, NR_FREE_PAGES, count);
- while (count) {
+ while (to_free) {
struct page *page;
struct list_head *list;
@@ -619,8 +624,9 @@ static void free_pcppages_bulk(struct zone *zone, int count,
/* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */
__free_one_page(page, zone, 0, page_private(page));
trace_mm_page_pcpu_drain(page, 0, page_private(page));
- } while (--count && --batch_free && !list_empty(list));
+ } while (--to_free && --batch_free && !list_empty(list));
}
+ __mod_zone_page_state(zone, NR_FREE_PAGES, count);
spin_unlock(&zone->lock);
}
@@ -631,8 +637,8 @@ static void free_one_page(struct zone *zone, struct page *page, int order,
zone->all_unreclaimable = 0;
zone->pages_scanned = 0;
- __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order);
__free_one_page(page, zone, order, migratetype);
+ __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order);
spin_unlock(&zone->lock);
}
@@ -1453,24 +1459,24 @@ static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
#endif /* CONFIG_FAIL_PAGE_ALLOC */
/*
- * Return 1 if free pages are above 'mark'. This takes into account the order
+ * Return true if free pages are above 'mark'. This takes into account the order
* of the allocation.
*/
-int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
- int classzone_idx, int alloc_flags)
+static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
+ int classzone_idx, int alloc_flags, long free_pages)
{
/* free_pages my go negative - that's OK */
long min = mark;
- long free_pages = zone_page_state(z, NR_FREE_PAGES) - (1 << order) + 1;
int o;
+ free_pages -= (1 << order) + 1;
if (alloc_flags & ALLOC_HIGH)
min -= min / 2;
if (alloc_flags & ALLOC_HARDER)
min -= min / 4;
if (free_pages <= min + z->lowmem_reserve[classzone_idx])
- return 0;
+ return false;
for (o = 0; o < order; o++) {
/* At the next order, this order's pages become unavailable */
free_pages -= z->free_area[o].nr_free << o;
@@ -1479,9 +1485,28 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
min >>= 1;
if (free_pages <= min)
- return 0;
+ return false;
}
- return 1;
+ return true;
+}
+
+bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
+ int classzone_idx, int alloc_flags)
+{
+ return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
+ zone_page_state(z, NR_FREE_PAGES));
+}
+
+bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
+ int classzone_idx, int alloc_flags)
+{
+ long free_pages = zone_page_state(z, NR_FREE_PAGES);
+
+ if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
+ free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
+
+ return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
+ free_pages);
}
#ifdef CONFIG_NUMA
@@ -1843,6 +1868,7 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
struct page *page = NULL;
struct reclaim_state reclaim_state;
struct task_struct *p = current;
+ bool drained = false;
cond_resched();
@@ -1861,14 +1887,25 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
cond_resched();
- if (order != 0)
- drain_all_pages();
+ if (unlikely(!(*did_some_progress)))
+ return NULL;
- if (likely(*did_some_progress))
- page = get_page_from_freelist(gfp_mask, nodemask, order,
+retry:
+ page = get_page_from_freelist(gfp_mask, nodemask, order,
zonelist, high_zoneidx,
alloc_flags, preferred_zone,
migratetype);
+
+ /*
+ * If an allocation failed after direct reclaim, it could be because
+ * pages are pinned on the per-cpu lists. Drain them and try again
+ */
+ if (!page && !drained) {
+ drain_all_pages();
+ drained = true;
+ goto retry;
+ }
+
return page;
}
@@ -1990,6 +2027,7 @@ restart:
*/
alloc_flags = gfp_to_alloc_flags(gfp_mask);
+rebalance:
/* This is the last chance, in general, before the goto nopage. */
page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS,
@@ -1997,7 +2035,6 @@ restart:
if (page)
goto got_pg;
-rebalance:
/* Allocate without watermarks if the context allows */
if (alloc_flags & ALLOC_NO_WATERMARKS) {
page = __alloc_pages_high_priority(gfp_mask, order,
diff --git a/mm/percpu.c b/mm/percpu.c
index 6470e7710231..0a09fff3d05d 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1413,9 +1413,9 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
if (pcpu_first_unit_cpu == NR_CPUS)
pcpu_first_unit_cpu = cpu;
+ pcpu_last_unit_cpu = cpu;
}
}
- pcpu_last_unit_cpu = cpu;
pcpu_nr_units = unit;
for_each_possible_cpu(cpu)
diff --git a/mm/shmem.c b/mm/shmem.c
index f65f84062db5..676eed337809 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2756,5 +2756,6 @@ int shmem_zero_setup(struct vm_area_struct *vma)
fput(vma->vm_file);
vma->vm_file = file;
vma->vm_ops = &shmem_vm_ops;
+ vma->vm_flags |= VM_CAN_NONLINEAR;
return 0;
}
diff --git a/mm/slab.c b/mm/slab.c
index e49f8f46f46d..91e691bf9a0e 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -2289,8 +2289,8 @@ kmem_cache_create (const char *name, size_t size, size_t align,
if (ralign < align) {
ralign = align;
}
- /* disable debug if not aligning with REDZONE_ALIGN */
- if (ralign & (__alignof__(unsigned long long) - 1))
+ /* disable debug if necessary */
+ if (ralign > __alignof__(unsigned long long))
flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
/*
* 4) Store it.
@@ -2316,8 +2316,8 @@ kmem_cache_create (const char *name, size_t size, size_t align,
*/
if (flags & SLAB_RED_ZONE) {
/* add space for red zone words */
- cachep->obj_offset += align;
- size += align + sizeof(unsigned long long);
+ cachep->obj_offset += sizeof(unsigned long long);
+ size += 2 * sizeof(unsigned long long);
}
if (flags & SLAB_STORE_USER) {
/* user store requires one word storage behind the end of
@@ -2331,8 +2331,8 @@ kmem_cache_create (const char *name, size_t size, size_t align,
}
#if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC)
if (size >= malloc_sizes[INDEX_L3 + 1].cs_size
- && cachep->obj_size > cache_line_size() && size < PAGE_SIZE) {
- cachep->obj_offset += PAGE_SIZE - size;
+ && cachep->obj_size > cache_line_size() && ALIGN(size, align) < PAGE_SIZE) {
+ cachep->obj_offset += PAGE_SIZE - ALIGN(size, align);
size = PAGE_SIZE;
}
#endif
diff --git a/mm/swapfile.c b/mm/swapfile.c
index f08d165871b3..c731d1456495 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -139,8 +139,7 @@ static int discard_swap(struct swap_info_struct *si)
nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9);
if (nr_blocks) {
err = blkdev_issue_discard(si->bdev, start_block,
- nr_blocks, GFP_KERNEL,
- BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
+ nr_blocks, GFP_KERNEL, BLKDEV_IFL_WAIT);
if (err)
return err;
cond_resched();
@@ -151,8 +150,7 @@ static int discard_swap(struct swap_info_struct *si)
nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9);
err = blkdev_issue_discard(si->bdev, start_block,
- nr_blocks, GFP_KERNEL,
- BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
+ nr_blocks, GFP_KERNEL, BLKDEV_IFL_WAIT);
if (err)
break;
@@ -191,8 +189,7 @@ static void discard_swap_cluster(struct swap_info_struct *si,
start_block <<= PAGE_SHIFT - 9;
nr_blocks <<= PAGE_SHIFT - 9;
if (blkdev_issue_discard(si->bdev, start_block,
- nr_blocks, GFP_NOIO, BLKDEV_IFL_WAIT |
- BLKDEV_IFL_BARRIER))
+ nr_blocks, GFP_NOIO, BLKDEV_IFL_WAIT))
break;
}
@@ -688,6 +685,24 @@ int try_to_free_swap(struct page *page)
if (page_swapcount(page))
return 0;
+ /*
+ * Once hibernation has begun to create its image of memory,
+ * there's a danger that one of the calls to try_to_free_swap()
+ * - most probably a call from __try_to_reclaim_swap() while
+ * hibernation is allocating its own swap pages for the image,
+ * but conceivably even a call from memory reclaim - will free
+ * the swap from a page which has already been recorded in the
+ * image as a clean swapcache page, and then reuse its swap for
+ * another page of the image. On waking from hibernation, the
+ * original page might be freed under memory pressure, then
+ * later read back in from swap, now with the wrong data.
+ *
+ * Hibernation clears bits from gfp_allowed_mask to prevent
+ * memory reclaim from writing to disk, so check that here.
+ */
+ if (!(gfp_allowed_mask & __GFP_IO))
+ return 0;
+
delete_from_swap_cache(page);
SetPageDirty(page);
return 1;
@@ -2034,7 +2049,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
p->flags |= SWP_SOLIDSTATE;
p->cluster_next = 1 + (random32() % p->highest_bit);
}
- if (discard_swap(p) == 0)
+ if (discard_swap(p) == 0 && (swap_flags & SWAP_FLAG_DISCARD))
p->flags |= SWP_DISCARDABLE;
}
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index ae007462b7f6..20a402ce7bd2 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -513,6 +513,15 @@ static atomic_t vmap_lazy_nr = ATOMIC_INIT(0);
static void purge_fragmented_blocks_allcpus(void);
/*
+ * called before a call to iounmap() if the caller wants vm_area_struct's
+ * immediately freed.
+ */
+void set_iounmap_nonlazy(void)
+{
+ atomic_set(&vmap_lazy_nr, lazy_max_pages()+1);
+}
+
+/*
* Purges all lazily-freed vmap areas.
*
* If sync is 0 then don't purge if there is already a purge in progress.
diff --git a/mm/vmscan.c b/mm/vmscan.c
index b94fe1b3da43..22e56769e006 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1726,13 +1726,12 @@ static void shrink_zone(int priority, struct zone *zone,
* If a zone is deemed to be full of pinned pages then just give it a light
* scan then give up on it.
*/
-static bool shrink_zones(int priority, struct zonelist *zonelist,
+static void shrink_zones(int priority, struct zonelist *zonelist,
struct scan_control *sc)
{
enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask);
struct zoneref *z;
struct zone *zone;
- bool all_unreclaimable = true;
for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx,
sc->nodemask) {
@@ -1759,8 +1758,38 @@ static bool shrink_zones(int priority, struct zonelist *zonelist,
}
shrink_zone(priority, zone, sc);
- all_unreclaimable = false;
}
+}
+
+static bool zone_reclaimable(struct zone *zone)
+{
+ return zone->pages_scanned < zone_reclaimable_pages(zone) * 6;
+}
+
+/*
+ * As hibernation is going on, kswapd is freezed so that it can't mark
+ * the zone into all_unreclaimable. It can't handle OOM during hibernation.
+ * So let's check zone's unreclaimable in direct reclaim as well as kswapd.
+ */
+static bool all_unreclaimable(struct zonelist *zonelist,
+ struct scan_control *sc)
+{
+ struct zoneref *z;
+ struct zone *zone;
+ bool all_unreclaimable = true;
+
+ for_each_zone_zonelist_nodemask(zone, z, zonelist,
+ gfp_zone(sc->gfp_mask), sc->nodemask) {
+ if (!populated_zone(zone))
+ continue;
+ if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
+ continue;
+ if (zone_reclaimable(zone)) {
+ all_unreclaimable = false;
+ break;
+ }
+ }
+
return all_unreclaimable;
}
@@ -1784,7 +1813,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
struct scan_control *sc)
{
int priority;
- bool all_unreclaimable;
unsigned long total_scanned = 0;
struct reclaim_state *reclaim_state = current->reclaim_state;
unsigned long lru_pages = 0;
@@ -1815,7 +1843,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
sc->nr_scanned = 0;
if (!priority)
disable_swap_token();
- all_unreclaimable = shrink_zones(priority, zonelist, sc);
+ shrink_zones(priority, zonelist, sc);
/*
* Don't shrink slabs when reclaiming memory from
* over limit cgroups
@@ -1879,7 +1907,7 @@ out:
return sc->nr_reclaimed;
/* top priority shrink_zones still had more to do? don't OOM, then */
- if (scanning_global_lru(sc) && !all_unreclaimable)
+ if (scanning_global_lru(sc) && !all_unreclaimable(zonelist, sc))
return 1;
return 0;
@@ -1979,7 +2007,7 @@ static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining)
if (zone->all_unreclaimable)
continue;
- if (!zone_watermark_ok(zone, order, high_wmark_pages(zone),
+ if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone),
0, 0))
return 1;
}
@@ -2076,7 +2104,7 @@ loop_again:
shrink_active_list(SWAP_CLUSTER_MAX, zone,
&sc, priority, 0);
- if (!zone_watermark_ok(zone, order,
+ if (!zone_watermark_ok_safe(zone, order,
high_wmark_pages(zone), 0, 0)) {
end_zone = i;
break;
@@ -2127,7 +2155,7 @@ loop_again:
* We put equal pressure on every zone, unless one
* zone has way too many pages free already.
*/
- if (!zone_watermark_ok(zone, order,
+ if (!zone_watermark_ok_safe(zone, order,
8*high_wmark_pages(zone), end_zone, 0))
shrink_zone(priority, zone, &sc);
reclaim_state->reclaimed_slab = 0;
@@ -2137,8 +2165,7 @@ loop_again:
total_scanned += sc.nr_scanned;
if (zone->all_unreclaimable)
continue;
- if (nr_slab == 0 &&
- zone->pages_scanned >= (zone_reclaimable_pages(zone) * 6))
+ if (nr_slab == 0 && !zone_reclaimable(zone))
zone->all_unreclaimable = 1;
/*
* If we've done a decent amount of scanning and
@@ -2149,7 +2176,7 @@ loop_again:
total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2)
sc.may_writepage = 1;
- if (!zone_watermark_ok(zone, order,
+ if (!zone_watermark_ok_safe(zone, order,
high_wmark_pages(zone), end_zone, 0)) {
all_zones_ok = 0;
/*
@@ -2157,7 +2184,7 @@ loop_again:
* means that we have a GFP_ATOMIC allocation
* failure risk. Hurry up!
*/
- if (!zone_watermark_ok(zone, order,
+ if (!zone_watermark_ok_safe(zone, order,
min_wmark_pages(zone), end_zone, 0))
has_under_min_watermark_zone = 1;
}
@@ -2299,9 +2326,11 @@ static int kswapd(void *p)
* premature sleep. If not, then go fully
* to sleep until explicitly woken up
*/
- if (!sleeping_prematurely(pgdat, order, remaining))
+ if (!sleeping_prematurely(pgdat, order, remaining)) {
+ restore_pgdat_percpu_threshold(pgdat);
schedule();
- else {
+ reduce_pgdat_percpu_threshold(pgdat);
+ } else {
if (remaining)
count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
else
@@ -2337,15 +2366,16 @@ void wakeup_kswapd(struct zone *zone, int order)
if (!populated_zone(zone))
return;
- pgdat = zone->zone_pgdat;
- if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0))
+ if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
return;
+ pgdat = zone->zone_pgdat;
if (pgdat->kswapd_max_order < order)
pgdat->kswapd_max_order = order;
- if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
- return;
if (!waitqueue_active(&pgdat->kswapd_wait))
return;
+ if (zone_watermark_ok_safe(zone, order, low_wmark_pages(zone), 0, 0))
+ return;
+
wake_up_interruptible(&pgdat->kswapd_wait);
}
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 7759941d4e77..41dc8cd96a68 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -81,6 +81,30 @@ EXPORT_SYMBOL(vm_stat);
#ifdef CONFIG_SMP
+static int calculate_pressure_threshold(struct zone *zone)
+{
+ int threshold;
+ int watermark_distance;
+
+ /*
+ * As vmstats are not up to date, there is drift between the estimated
+ * and real values. For high thresholds and a high number of CPUs, it
+ * is possible for the min watermark to be breached while the estimated
+ * value looks fine. The pressure threshold is a reduced value such
+ * that even the maximum amount of drift will not accidentally breach
+ * the min watermark
+ */
+ watermark_distance = low_wmark_pages(zone) - min_wmark_pages(zone);
+ threshold = max(1, (int)(watermark_distance / num_online_cpus()));
+
+ /*
+ * Maximum threshold is 125
+ */
+ threshold = min(125, threshold);
+
+ return threshold;
+}
+
static int calculate_threshold(struct zone *zone)
{
int threshold;
@@ -138,14 +162,69 @@ static void refresh_zone_stat_thresholds(void)
int threshold;
for_each_populated_zone(zone) {
+ unsigned long max_drift, tolerate_drift;
+
threshold = calculate_threshold(zone);
for_each_online_cpu(cpu)
per_cpu_ptr(zone->pageset, cpu)->stat_threshold
= threshold;
+
+ /*
+ * Only set percpu_drift_mark if there is a danger that
+ * NR_FREE_PAGES reports the low watermark is ok when in fact
+ * the min watermark could be breached by an allocation
+ */
+ tolerate_drift = low_wmark_pages(zone) - min_wmark_pages(zone);
+ max_drift = num_online_cpus() * threshold;
+ if (max_drift > tolerate_drift)
+ zone->percpu_drift_mark = high_wmark_pages(zone) +
+ max_drift;
}
}
+void reduce_pgdat_percpu_threshold(pg_data_t *pgdat)
+{
+ struct zone *zone;
+ int cpu;
+ int threshold;
+ int i;
+
+ get_online_cpus();
+ for (i = 0; i < pgdat->nr_zones; i++) {
+ zone = &pgdat->node_zones[i];
+ if (!zone->percpu_drift_mark)
+ continue;
+
+ threshold = calculate_pressure_threshold(zone);
+ for_each_online_cpu(cpu)
+ per_cpu_ptr(zone->pageset, cpu)->stat_threshold
+ = threshold;
+ }
+ put_online_cpus();
+}
+
+void restore_pgdat_percpu_threshold(pg_data_t *pgdat)
+{
+ struct zone *zone;
+ int cpu;
+ int threshold;
+ int i;
+
+ get_online_cpus();
+ for (i = 0; i < pgdat->nr_zones; i++) {
+ zone = &pgdat->node_zones[i];
+ if (!zone->percpu_drift_mark)
+ continue;
+
+ threshold = calculate_threshold(zone);
+ for_each_online_cpu(cpu)
+ per_cpu_ptr(zone->pageset, cpu)->stat_threshold
+ = threshold;
+ }
+ put_online_cpus();
+}
+
/*
* For use when we know that interrupts are disabled.
*/