Diffstat (limited to 'mm')
-rw-r--r--  mm/gup.c              3
-rw-r--r--  mm/hugetlb.c         14
-rw-r--r--  mm/memory-failure.c  17
-rw-r--r--  mm/memory.c          43
-rw-r--r--  mm/memory_hotplug.c  22
-rw-r--r--  mm/mempolicy.c        6
-rw-r--r--  mm/migrate.c         18
-rw-r--r--  mm/mmap.c             7
-rw-r--r--  mm/oom_kill.c        12
-rw-r--r--  mm/page-writeback.c  35
-rw-r--r--  mm/page_alloc.c      18
-rw-r--r--  mm/shmem.c           12
-rw-r--r--  mm/slab.c             6
-rw-r--r--  mm/util.c             2
-rw-r--r--  mm/vmalloc.c          2
-rw-r--r--  mm/zsmalloc.c        11
16 files changed, 158 insertions, 70 deletions
diff --git a/mm/gup.c b/mm/gup.c
index d71da7216c6e..99c2f10188c0 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1423,7 +1423,8 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
if (pmd_none(pmd))
return 0;
- if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd))) {
+ if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd) ||
+ pmd_devmap(pmd))) {
/*
* NUMA hinting faults need to be handled in the GUP
* slowpath for accounting purposes and so that they
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 3e50fcfe6ad8..8b682da98d95 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3579,7 +3579,6 @@ retry_avoidcopy:
copy_user_huge_page(new_page, old_page, address, vma,
pages_per_huge_page(h));
__SetPageUptodate(new_page);
- set_page_huge_active(new_page);
mmun_start = address & huge_page_mask(h);
mmun_end = mmun_start + huge_page_size(h);
@@ -3601,6 +3600,7 @@ retry_avoidcopy:
make_huge_pte(vma, new_page, 1));
page_remove_rmap(old_page, true);
hugepage_add_new_anon_rmap(new_page, vma, address);
+ set_page_huge_active(new_page);
/* Make the old page be freed below */
new_page = old_page;
}
@@ -3683,6 +3683,7 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
struct page *page;
pte_t new_pte;
spinlock_t *ptl;
+ bool new_page = false;
/*
* Currently, we are forced to kill the process in the event the
@@ -3716,7 +3717,7 @@ retry:
}
clear_huge_page(page, address, pages_per_huge_page(h));
__SetPageUptodate(page);
- set_page_huge_active(page);
+ new_page = true;
if (vma->vm_flags & VM_MAYSHARE) {
int err = huge_add_to_page_cache(page, mapping, idx);
@@ -3788,6 +3789,15 @@ retry:
}
spin_unlock(ptl);
+
+ /*
+ * Only make newly allocated pages active. Existing pages found
+ * in the pagecache could be !page_huge_active() if they have been
+ * isolated for migration.
+ */
+ if (new_page)
+ set_page_huge_active(page);
+
unlock_page(page);
out:
return ret;
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 851efb004857..d6524dce43b2 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -336,7 +336,8 @@ static void kill_procs(struct list_head *to_kill, int forcekill, int trapno,
if (fail || tk->addr_valid == 0) {
pr_err("Memory failure: %#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n",
pfn, tk->tsk->comm, tk->tsk->pid);
- force_sig(SIGKILL, tk->tsk);
+ do_send_sig_info(SIGKILL, SEND_SIG_PRIV,
+ tk->tsk, PIDTYPE_PID);
}
/*
@@ -1704,19 +1705,17 @@ static int soft_offline_in_use_page(struct page *page, int flags)
struct page *hpage = compound_head(page);
if (!PageHuge(page) && PageTransHuge(hpage)) {
- lock_page(hpage);
- if (!PageAnon(hpage) || unlikely(split_huge_page(hpage))) {
- unlock_page(hpage);
- if (!PageAnon(hpage))
+ lock_page(page);
+ if (!PageAnon(page) || unlikely(split_huge_page(page))) {
+ unlock_page(page);
+ if (!PageAnon(page))
pr_info("soft offline: %#lx: non anonymous thp\n", page_to_pfn(page));
else
pr_info("soft offline: %#lx: thp split failed\n", page_to_pfn(page));
- put_hwpoison_page(hpage);
+ put_hwpoison_page(page);
return -EBUSY;
}
- unlock_page(hpage);
- get_hwpoison_page(page);
- put_hwpoison_page(hpage);
+ unlock_page(page);
}
if (PageHuge(page))
diff --git a/mm/memory.c b/mm/memory.c
index f3fef1df7402..47248dc0b9e1 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2823,6 +2823,28 @@ static int __do_fault(struct fault_env *fe, pgoff_t pgoff,
struct vm_fault vmf;
int ret;
+ /*
+ * Preallocate pte before we take page_lock because this might lead to
+ * deadlocks for memcg reclaim which waits for pages under writeback:
+ *                                lock_page(A)
+ *                                SetPageWriteback(A)
+ *                                unlock_page(A)
+ * lock_page(B)
+ *                                lock_page(B)
+ * pte_alloc_pne
+ *   shrink_page_list
+ *     wait_on_page_writeback(A)
+ *                                SetPageWriteback(B)
+ *                                unlock_page(B)
+ *                                # flush A, B to clear the writeback
+ */
+ if (pmd_none(*fe->pmd) && !fe->prealloc_pte) {
+ fe->prealloc_pte = pte_alloc_one(vma->vm_mm, fe->address);
+ if (!fe->prealloc_pte)
+ return VM_FAULT_OOM;
+ smp_wmb(); /* See comment in __pte_alloc() */
+ }
+
vmf.virtual_address = (void __user *)(fe->address & PAGE_MASK);
vmf.pgoff = pgoff;
vmf.flags = fe->flags;
@@ -3307,15 +3329,24 @@ static int do_fault(struct fault_env *fe)
{
struct vm_area_struct *vma = fe->vma;
pgoff_t pgoff = linear_page_index(vma, fe->address);
+ int ret;
/* The VMA was not fully populated on mmap() or missing VM_DONTEXPAND */
if (!vma->vm_ops->fault)
- return VM_FAULT_SIGBUS;
- if (!(fe->flags & FAULT_FLAG_WRITE))
- return do_read_fault(fe, pgoff);
- if (!(vma->vm_flags & VM_SHARED))
- return do_cow_fault(fe, pgoff);
- return do_shared_fault(fe, pgoff);
+ ret = VM_FAULT_SIGBUS;
+ else if (!(fe->flags & FAULT_FLAG_WRITE))
+ ret = do_read_fault(fe, pgoff);
+ else if (!(vma->vm_flags & VM_SHARED))
+ ret = do_cow_fault(fe, pgoff);
+ else
+ ret = do_shared_fault(fe, pgoff);
+
+ /* preallocated pagetable is unused: free it */
+ if (fe->prealloc_pte) {
+ pte_free(vma->vm_mm, fe->prealloc_pte);
+ fe->prealloc_pte = 0;
+ }
+ return ret;
}
static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
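Not part of the patch: the do_fault() rework above is essentially a single-exit refactor, folding the early returns into an if/else chain so that a preallocated resource can be released on every path. A minimal user-space sketch of that pattern follows; struct ctx and handle() are invented names standing in for fault_env and do_fault().

/* Minimal model of the single-exit cleanup pattern used in do_fault(). */
#include <stdlib.h>

struct ctx {
	void *prealloc;   /* stands in for fe->prealloc_pte */
	int flags;
};

static int handle(struct ctx *c)
{
	int ret;

	if (!c)                 /* stands in for the missing ->fault case */
		ret = -1;
	else if (c->flags & 1)
		ret = 1;        /* read-fault-like path */
	else
		ret = 2;        /* write-fault-like path */

	/* Cleanup runs on every path, not only on the last return. */
	if (c && c->prealloc) {
		free(c->prealloc);
		c->prealloc = NULL;
	}
	return ret;
}

int main(void)
{
	struct ctx c = { .prealloc = malloc(16), .flags = 1 };

	return handle(&c) == 1 ? 0 : 1;
}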
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 0f962cc3f1bf..b4c8d7b9ab82 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -34,6 +34,7 @@
#include <linux/memblock.h>
#include <linux/bootmem.h>
#include <linux/compaction.h>
+#include <linux/rmap.h>
#include <asm/tlbflush.h>
@@ -1470,7 +1471,8 @@ static struct page *next_active_pageblock(struct page *page)
bool is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages)
{
struct page *page = pfn_to_page(start_pfn);
- struct page *end_page = page + nr_pages;
+ unsigned long end_pfn = min(start_pfn + nr_pages, zone_end_pfn(page_zone(page)));
+ struct page *end_page = pfn_to_page(end_pfn);
/* Check the starting page of each pageblock within the range */
for (; page < end_page; page = next_active_pageblock(page)) {
@@ -1510,6 +1512,9 @@ int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn,
i++;
if (i == MAX_ORDER_NR_PAGES || pfn + i >= end_pfn)
continue;
+ /* Check if we got outside of the zone */
+ if (zone && !zone_spans_pfn(zone, pfn + i))
+ return 0;
page = pfn_to_page(pfn + i);
if (zone && page_zone(page) != zone)
return 0;
@@ -1617,6 +1622,21 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
continue;
}
+ /*
+ * HWPoison pages have elevated reference counts so the migration would
+ * fail on them. It also doesn't make any sense to migrate them in the
+ * first place. Still try to unmap such a page in case it is still mapped
+ * (e.g. current hwpoison implementation doesn't unmap KSM pages but keep
+ * the unmap as the catch all safety net).
+ */
+ if (PageHWPoison(page)) {
+ if (WARN_ON(PageLRU(page)))
+ isolate_lru_page(page);
+ if (page_mapped(page))
+ try_to_unmap(page, TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS);
+ continue;
+ }
+
if (!get_page_unless_zero(page))
continue;
/*
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index e21d9b44247b..593b74bed59b 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1327,7 +1327,7 @@ static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
nodemask_t *nodes)
{
unsigned long copy = ALIGN(maxnode-1, 64) / 8;
- const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
+ unsigned int nbytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long);
if (copy > nbytes) {
if (copy > PAGE_SIZE)
@@ -1488,7 +1488,7 @@ SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
int uninitialized_var(pval);
nodemask_t nodes;
- if (nmask != NULL && maxnode < MAX_NUMNODES)
+ if (nmask != NULL && maxnode < nr_node_ids)
return -EINVAL;
err = do_get_mempolicy(&pval, &nodes, addr, flags);
@@ -1517,7 +1517,7 @@ COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
unsigned long nr_bits, alloc_size;
DECLARE_BITMAP(bm, MAX_NUMNODES);
- nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
+ nr_bits = min_t(unsigned long, maxnode-1, nr_node_ids);
alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
if (nmask)
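Not part of the patch: a small user-space sketch of the sizing change above. Only nr_node_ids node IDs are possible at runtime, so a node mask copy needs BITS_TO_LONGS(nr_node_ids) longs rather than BITS_TO_LONGS(MAX_NUMNODES). The numeric values below are made-up examples, not taken from any particular configuration.

/* Illustrative only: compare mask sizes by MAX_NUMNODES vs nr_node_ids. */
#include <stdio.h>

#define BITS_PER_LONG    (8 * sizeof(long))
#define BITS_TO_LONGS(n) (((n) + BITS_PER_LONG - 1) / BITS_PER_LONG)

int main(void)
{
	unsigned long max_numnodes = 1024; /* hypothetical compile-time maximum */
	unsigned long nr_node_ids = 4;     /* nodes actually possible on this box */

	printf("bytes sized by MAX_NUMNODES: %lu\n",
	       (unsigned long)(BITS_TO_LONGS(max_numnodes) * sizeof(long)));
	printf("bytes sized by nr_node_ids:  %lu\n",
	       (unsigned long)(BITS_TO_LONGS(nr_node_ids) * sizeof(long)));
	return 0;
}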
diff --git a/mm/migrate.c b/mm/migrate.c
index 821623fc7091..b810ac1359f0 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1044,10 +1044,13 @@ out:
* If migration is successful, decrease refcount of the newpage
* which will not free the page because new page owner increased
* refcounter. As well, if it is LRU page, add the page to LRU
- * list in here.
+ * list in here. Use the old state of the isolated source page to
+ * determine if we migrated a LRU page. newpage was already unlocked
+ * and possibly modified by its owner - don't rely on the page
+ * state.
*/
if (rc == MIGRATEPAGE_SUCCESS) {
- if (unlikely(__PageMovable(newpage)))
+ if (unlikely(!is_lru))
put_page(newpage);
else
putback_lru_page(newpage);
@@ -1231,6 +1234,16 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
lock_page(hpage);
}
+ /*
+ * Check for pages which are in the process of being freed. Without
+ * page_mapping() set, hugetlbfs specific move page routine will not
+ * be called and we could leak usage counts for subpools.
+ */
+ if (page_private(hpage) && !page_mapping(hpage)) {
+ rc = -EBUSY;
+ goto out_unlock;
+ }
+
if (PageAnon(hpage))
anon_vma = page_get_anon_vma(hpage);
@@ -1262,6 +1275,7 @@ put_anon:
set_page_owner_migrate_reason(new_hpage, reason);
}
+out_unlock:
unlock_page(hpage);
out:
if (rc != -EAGAIN)
diff --git a/mm/mmap.c b/mm/mmap.c
index 283755645d17..3f2314ad6acd 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2345,12 +2345,11 @@ int expand_downwards(struct vm_area_struct *vma,
struct mm_struct *mm = vma->vm_mm;
struct vm_area_struct *prev;
unsigned long gap_addr;
- int error;
+ int error = 0;
address &= PAGE_MASK;
- error = security_mmap_addr(address);
- if (error)
- return error;
+ if (address < mmap_min_addr)
+ return -EPERM;
/* Enforce stack_guard_gap */
gap_addr = address - stack_guard_gap;
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 4a184157cc3d..d514eebad905 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -626,8 +626,8 @@ static void wake_oom_reaper(struct task_struct *tsk)
if (!oom_reaper_th)
return;
- /* tsk is already queued? */
- if (tsk == oom_reaper_list || tsk->oom_reaper_list)
+ /* mm is already queued? */
+ if (test_and_set_bit(MMF_OOM_REAP_QUEUED, &tsk->signal->oom_mm->flags))
return;
get_task_struct(tsk);
@@ -861,6 +861,13 @@ static void oom_kill_process(struct oom_control *oc, const char *message)
* still freeing memory.
*/
read_lock(&tasklist_lock);
+
+ /*
+ * The task 'p' might have already exited before reaching here. The
+ * put_task_struct() will free task_struct 'p' while the loop still try
+ * to access the field of 'p', so, get an extra reference.
+ */
+ get_task_struct(p);
for_each_thread(p, t) {
list_for_each_entry(child, &t->children, sibling) {
unsigned int child_points;
@@ -880,6 +887,7 @@ static void oom_kill_process(struct oom_control *oc, const char *message)
}
}
}
+ put_task_struct(p);
read_unlock(&tasklist_lock);
p = find_lock_task_mm(victim);
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 807236aed275..281a46aeae61 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2148,6 +2148,7 @@ int write_cache_pages(struct address_space *mapping,
{
int ret = 0;
int done = 0;
+ int error;
struct pagevec pvec;
int nr_pages;
pgoff_t uninitialized_var(writeback_index);
@@ -2244,25 +2245,31 @@ continue_unlock:
goto continue_unlock;
trace_wbc_writepage(wbc, inode_to_bdi(mapping->host));
- ret = (*writepage)(page, wbc, data);
- if (unlikely(ret)) {
- if (ret == AOP_WRITEPAGE_ACTIVATE) {
+ error = (*writepage)(page, wbc, data);
+ if (unlikely(error)) {
+ /*
+ * Handle errors according to the type of
+ * writeback. There's no need to continue for
+ * background writeback. Just push done_index
+ * past this page so media errors won't choke
+ * writeout for the entire file. For integrity
+ * writeback, we must process the entire dirty
+ * set regardless of errors because the fs may
+ * still have state to clear for each page. In
+ * that case we continue processing and return
+ * the first error.
+ */
+ if (error == AOP_WRITEPAGE_ACTIVATE) {
unlock_page(page);
- ret = 0;
- } else {
- /*
- * done_index is set past this page,
- * so media errors will not choke
- * background writeout for the entire
- * file. This has consequences for
- * range_cyclic semantics (ie. it may
- * not be suitable for data integrity
- * writeout).
- */
+ error = 0;
+ } else if (wbc->sync_mode != WB_SYNC_ALL) {
+ ret = error;
done_index = page->index + 1;
done = 1;
break;
}
+ if (!ret)
+ ret = error;
}
/*
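Not part of the patch: a self-contained user-space model of the error policy the new comment describes — background writeback stops at the first error, while integrity writeback keeps writing and reports the first error seen. The names write_one() and MODE_INTEGRITY are invented for illustration.

/* Simplified model of the write_cache_pages() error policy above. */
#include <errno.h>
#include <stdio.h>

enum wb_mode { MODE_BACKGROUND, MODE_INTEGRITY };

/* Pretend per-page writer: fails on pages 2 and 4. */
static int write_one(int page)
{
	return (page == 2 || page == 4) ? -EIO : 0;
}

static int writeback(enum wb_mode mode, int npages, int *written)
{
	int ret = 0;

	*written = 0;
	for (int page = 0; page < npages; page++) {
		int error = write_one(page);

		if (!error) {
			(*written)++;
			continue;
		}
		if (mode != MODE_INTEGRITY) {
			/* Background writeback: stop here, report this error. */
			ret = error;
			break;
		}
		/* Integrity writeback: keep going, remember the first error. */
		if (!ret)
			ret = error;
	}
	return ret;
}

int main(void)
{
	int written, ret;

	ret = writeback(MODE_BACKGROUND, 6, &written);
	printf("background: ret=%d, pages_written=%d\n", ret, written);

	ret = writeback(MODE_INTEGRITY, 6, &written);
	printf("integrity:  ret=%d, pages_written=%d\n", ret, written);
	return 0;
}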
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 28240ce475d6..05f141e39ac1 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3530,8 +3530,6 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
enum compact_result compact_result;
int compaction_retries;
int no_progress_loops;
- unsigned long alloc_start = jiffies;
- unsigned int stall_timeout = 10 * HZ;
unsigned int cpuset_mems_cookie;
/*
@@ -3704,14 +3702,6 @@ retry:
if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_REPEAT))
goto nopage;
- /* Make sure we know about allocations which stall for too long */
- if (time_after(jiffies, alloc_start + stall_timeout)) {
- warn_alloc(gfp_mask,
- "page allocation stalls for %ums, order:%u",
- jiffies_to_msecs(jiffies-alloc_start), order);
- stall_timeout += 10 * HZ;
- }
-
if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags,
did_some_progress > 0, &no_progress_loops))
goto retry;
@@ -3965,11 +3955,11 @@ refill:
/* Even if we own the page, we do not use atomic_set().
* This would break get_page_unless_zero() users.
*/
- page_ref_add(page, size - 1);
+ page_ref_add(page, size);
/* reset page count bias and offset to start of new frag */
nc->pfmemalloc = page_is_pfmemalloc(page);
- nc->pagecnt_bias = size;
+ nc->pagecnt_bias = size + 1;
nc->offset = size;
}
@@ -3985,10 +3975,10 @@ refill:
size = nc->size;
#endif
/* OK, page count is 0, we can safely set it */
- set_page_count(page, size);
+ set_page_count(page, size + 1);
/* reset page count bias and offset to start of new frag */
- nc->pagecnt_bias = size;
+ nc->pagecnt_bias = size + 1;
offset = size - fragsz;
}
diff --git a/mm/shmem.c b/mm/shmem.c
index 9b17bd4cbc5e..944242491059 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2896,16 +2896,20 @@ static int shmem_create(struct inode *dir, struct dentry *dentry, umode_t mode,
static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
{
struct inode *inode = d_inode(old_dentry);
- int ret;
+ int ret = 0;
/*
* No ordinary (disk based) filesystem counts links as inodes;
* but each new link needs a new dentry, pinning lowmem, and
* tmpfs dentries cannot be pruned until they are unlinked.
+ * But if an O_TMPFILE file is linked into the tmpfs, the
+ * first link must skip that, to get the accounting right.
*/
- ret = shmem_reserve_inode(inode->i_sb);
- if (ret)
- goto out;
+ if (inode->i_nlink) {
+ ret = shmem_reserve_inode(inode->i_sb);
+ if (ret)
+ goto out;
+ }
dir->i_size += BOGO_DIRENT_SIZE;
inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode);
diff --git a/mm/slab.c b/mm/slab.c
index 263dcda6897b..354a09deecff 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -682,8 +682,10 @@ static struct alien_cache *__alloc_alien_cache(int node, int entries,
struct alien_cache *alc = NULL;
alc = kmalloc_node(memsize, gfp, node);
- init_arraycache(&alc->ac, entries, batch);
- spin_lock_init(&alc->lock);
+ if (alc) {
+ init_arraycache(&alc->ac, entries, batch);
+ spin_lock_init(&alc->lock);
+ }
return alc;
}
diff --git a/mm/util.c b/mm/util.c
index 8c755d05d4e6..07f467206186 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -389,7 +389,7 @@ bool page_mapped(struct page *page)
return true;
if (PageHuge(page))
return false;
- for (i = 0; i < hpage_nr_pages(page); i++) {
+ for (i = 0; i < (1 << compound_order(page)); i++) {
if (atomic_read(&page[i]._mapcount) >= 0)
return true;
}
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index fa598162dbf0..e6aa073f01df 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -2191,7 +2191,7 @@ int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr,
if (!(area->flags & VM_USERMAP))
return -EINVAL;
- if (kaddr + size > area->addr + area->size)
+ if (kaddr + size > area->addr + get_vm_area_size(area))
return -EINVAL;
do {
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index d3548c48369f..cf15851a7d2f 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -473,7 +473,7 @@ static bool is_zspage_isolated(struct zspage *zspage)
return zspage->isolated;
}
-static int is_first_page(struct page *page)
+static __maybe_unused int is_first_page(struct page *page)
{
return PagePrivate(page);
}
@@ -558,20 +558,23 @@ static int get_size_class_index(int size)
return min(zs_size_classes - 1, idx);
}
+/* type can be of enum type zs_stat_type or fullness_group */
static inline void zs_stat_inc(struct size_class *class,
- enum zs_stat_type type, unsigned long cnt)
+ int type, unsigned long cnt)
{
class->stats.objs[type] += cnt;
}
+/* type can be of enum type zs_stat_type or fullness_group */
static inline void zs_stat_dec(struct size_class *class,
- enum zs_stat_type type, unsigned long cnt)
+ int type, unsigned long cnt)
{
class->stats.objs[type] -= cnt;
}
+/* type can be of enum type zs_stat_type or fullness_group */
static inline unsigned long zs_stat_get(struct size_class *class,
- enum zs_stat_type type)
+ int type)
{
return class->stats.objs[type];
}