summaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/backing-dev.c1
-rw-r--r--mm/cma.c4
-rw-r--r--mm/gup.c48
-rw-r--r--mm/hmm.c131
-rw-r--r--mm/huge_memory.c72
-rw-r--r--mm/hugetlb.c57
-rw-r--r--mm/kmemleak.c18
-rw-r--r--mm/list_lru.c8
-rw-r--r--mm/memcontrol.c59
-rw-r--r--mm/memory-failure.c17
-rw-r--r--mm/memory.c48
-rw-r--r--mm/memory_hotplug.c82
-rw-r--r--mm/mempolicy.c48
-rw-r--r--mm/migrate.c29
-rw-r--r--mm/mincore.c23
-rw-r--r--mm/mmap.c14
-rw-r--r--mm/oom_kill.c15
-rw-r--r--mm/page-writeback.c35
-rw-r--r--mm/page_alloc.c54
-rw-r--r--mm/page_ext.c5
-rw-r--r--mm/page_poison.c4
-rw-r--r--mm/percpu-km.c5
-rw-r--r--mm/percpu.c8
-rw-r--r--mm/shmem.c12
-rw-r--r--mm/slab.c28
-rw-r--r--mm/slab.h3
-rw-r--r--mm/slab_common.c2
-rw-r--r--mm/slub.c5
-rw-r--r--mm/sparse.c2
-rw-r--r--mm/swap.c17
-rw-r--r--mm/swapfile.c57
-rw-r--r--mm/usercopy.c9
-rw-r--r--mm/userfaultfd.c3
-rw-r--r--mm/util.c2
-rw-r--r--mm/vmalloc.c8
-rw-r--r--mm/vmscan.c45
-rw-r--r--mm/vmstat.c9
37 files changed, 612 insertions, 375 deletions
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 8a8bb8796c6c..72e6d0c55cfa 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -689,6 +689,7 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi)
INIT_RADIX_TREE(&bdi->cgwb_tree, GFP_ATOMIC);
bdi->cgwb_congested_tree = RB_ROOT;
mutex_init(&bdi->cgwb_release_mutex);
+ init_rwsem(&bdi->wb_switch_rwsem);
ret = wb_init(&bdi->wb, bdi, 1, GFP_KERNEL);
if (!ret) {
diff --git a/mm/cma.c b/mm/cma.c
index 4cb76121a3ab..bfe9f5397165 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -353,12 +353,14 @@ int __init cma_declare_contiguous(phys_addr_t base,
ret = cma_init_reserved_mem(base, size, order_per_bit, name, res_cma);
if (ret)
- goto err;
+ goto free_mem;
pr_info("Reserved %ld MiB at %pa\n", (unsigned long)size / SZ_1M,
&base);
return 0;
+free_mem:
+ memblock_free(base, size);
err:
pr_err("Failed to reserve %ld MiB\n", (unsigned long)size / SZ_1M);
return ret;
diff --git a/mm/gup.c b/mm/gup.c
index 1abc8b4afff6..caadd31714a5 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -153,7 +153,10 @@ retry:
}
if (flags & FOLL_GET) {
- get_page(page);
+ if (unlikely(!try_get_page(page))) {
+ page = ERR_PTR(-ENOMEM);
+ goto out;
+ }
/* drop the pgmap reference now that we hold the page */
if (pgmap) {
@@ -296,7 +299,10 @@ retry_locked:
if (pmd_trans_unstable(pmd))
ret = -EBUSY;
} else {
- get_page(page);
+ if (unlikely(!try_get_page(page))) {
+ spin_unlock(ptl);
+ return ERR_PTR(-ENOMEM);
+ }
spin_unlock(ptl);
lock_page(page);
ret = split_huge_page(page);
@@ -480,7 +486,10 @@ static int get_gate_page(struct mm_struct *mm, unsigned long address,
if (is_device_public_page(*page))
goto unmap;
}
- get_page(*page);
+ if (unlikely(!try_get_page(*page))) {
+ ret = -ENOMEM;
+ goto unmap;
+ }
out:
ret = 0;
unmap:
@@ -1368,6 +1377,20 @@ static void undo_dev_pagemap(int *nr, int nr_start, struct page **pages)
}
}
+/*
+ * Return the compund head page with ref appropriately incremented,
+ * or NULL if that failed.
+ */
+static inline struct page *try_get_compound_head(struct page *page, int refs)
+{
+ struct page *head = compound_head(page);
+ if (WARN_ON_ONCE(page_ref_count(head) < 0))
+ return NULL;
+ if (unlikely(!page_cache_add_speculative(head, refs)))
+ return NULL;
+ return head;
+}
+
#ifdef CONFIG_ARCH_HAS_PTE_SPECIAL
static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
int write, struct page **pages, int *nr)
@@ -1402,9 +1425,9 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
page = pte_page(pte);
- head = compound_head(page);
- if (!page_cache_get_speculative(head))
+ head = try_get_compound_head(page, 1);
+ if (!head)
goto pte_unmap;
if (unlikely(pte_val(pte) != pte_val(*ptep))) {
@@ -1543,8 +1566,8 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
refs++;
} while (addr += PAGE_SIZE, addr != end);
- head = compound_head(pmd_page(orig));
- if (!page_cache_add_speculative(head, refs)) {
+ head = try_get_compound_head(pmd_page(orig), refs);
+ if (!head) {
*nr -= refs;
return 0;
}
@@ -1581,8 +1604,8 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
refs++;
} while (addr += PAGE_SIZE, addr != end);
- head = compound_head(pud_page(orig));
- if (!page_cache_add_speculative(head, refs)) {
+ head = try_get_compound_head(pud_page(orig), refs);
+ if (!head) {
*nr -= refs;
return 0;
}
@@ -1618,8 +1641,8 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr,
refs++;
} while (addr += PAGE_SIZE, addr != end);
- head = compound_head(pgd_page(orig));
- if (!page_cache_add_speculative(head, refs)) {
+ head = try_get_compound_head(pgd_page(orig), refs);
+ if (!head) {
*nr -= refs;
return 0;
}
@@ -1649,7 +1672,8 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
if (!pmd_present(pmd))
return 0;
- if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd))) {
+ if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd) ||
+ pmd_devmap(pmd))) {
/*
* NUMA hinting faults need to be handled in the GUP
* slowpath for accounting purposes and so that they
diff --git a/mm/hmm.c b/mm/hmm.c
index 90193a7fabce..57f0d2a4ff34 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -945,7 +945,6 @@ static void hmm_devmem_ref_exit(void *data)
devmem = container_of(ref, struct hmm_devmem, ref);
percpu_ref_exit(ref);
- devm_remove_action(devmem->device, &hmm_devmem_ref_exit, data);
}
static void hmm_devmem_ref_kill(void *data)
@@ -956,7 +955,6 @@ static void hmm_devmem_ref_kill(void *data)
devmem = container_of(ref, struct hmm_devmem, ref);
percpu_ref_kill(ref);
wait_for_completion(&devmem->completion);
- devm_remove_action(devmem->device, &hmm_devmem_ref_kill, data);
}
static int hmm_devmem_fault(struct vm_area_struct *vma,
@@ -994,7 +992,7 @@ static void hmm_devmem_radix_release(struct resource *resource)
mutex_unlock(&hmm_devmem_lock);
}
-static void hmm_devmem_release(struct device *dev, void *data)
+static void hmm_devmem_release(void *data)
{
struct hmm_devmem *devmem = data;
struct resource *resource = devmem->resource;
@@ -1002,11 +1000,6 @@ static void hmm_devmem_release(struct device *dev, void *data)
struct zone *zone;
struct page *page;
- if (percpu_ref_tryget_live(&devmem->ref)) {
- dev_WARN(dev, "%s: page mapping is still live!\n", __func__);
- percpu_ref_put(&devmem->ref);
- }
-
/* pages are dead and unused, undo the arch mapping */
start_pfn = (resource->start & ~(PA_SECTION_SIZE - 1)) >> PAGE_SHIFT;
npages = ALIGN(resource_size(resource), PA_SECTION_SIZE) >> PAGE_SHIFT;
@@ -1130,19 +1123,6 @@ error:
return ret;
}
-static int hmm_devmem_match(struct device *dev, void *data, void *match_data)
-{
- struct hmm_devmem *devmem = data;
-
- return devmem->resource == match_data;
-}
-
-static void hmm_devmem_pages_remove(struct hmm_devmem *devmem)
-{
- devres_release(devmem->device, &hmm_devmem_release,
- &hmm_devmem_match, devmem->resource);
-}
-
/*
* hmm_devmem_add() - hotplug ZONE_DEVICE memory for device memory
*
@@ -1170,8 +1150,7 @@ struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops,
dev_pagemap_get_ops();
- devmem = devres_alloc_node(&hmm_devmem_release, sizeof(*devmem),
- GFP_KERNEL, dev_to_node(device));
+ devmem = devm_kzalloc(device, sizeof(*devmem), GFP_KERNEL);
if (!devmem)
return ERR_PTR(-ENOMEM);
@@ -1185,11 +1164,11 @@ struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops,
ret = percpu_ref_init(&devmem->ref, &hmm_devmem_ref_release,
0, GFP_KERNEL);
if (ret)
- goto error_percpu_ref;
+ return ERR_PTR(ret);
- ret = devm_add_action(device, hmm_devmem_ref_exit, &devmem->ref);
+ ret = devm_add_action_or_reset(device, hmm_devmem_ref_exit, &devmem->ref);
if (ret)
- goto error_devm_add_action;
+ return ERR_PTR(ret);
size = ALIGN(size, PA_SECTION_SIZE);
addr = min((unsigned long)iomem_resource.end,
@@ -1209,16 +1188,12 @@ struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops,
devmem->resource = devm_request_mem_region(device, addr, size,
dev_name(device));
- if (!devmem->resource) {
- ret = -ENOMEM;
- goto error_no_resource;
- }
+ if (!devmem->resource)
+ return ERR_PTR(-ENOMEM);
break;
}
- if (!devmem->resource) {
- ret = -ERANGE;
- goto error_no_resource;
- }
+ if (!devmem->resource)
+ return ERR_PTR(-ERANGE);
devmem->resource->desc = IORES_DESC_DEVICE_PRIVATE_MEMORY;
devmem->pfn_first = devmem->resource->start >> PAGE_SHIFT;
@@ -1227,30 +1202,15 @@ struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops,
ret = hmm_devmem_pages_create(devmem);
if (ret)
- goto error_pages;
-
- devres_add(device, devmem);
+ return ERR_PTR(ret);
- ret = devm_add_action(device, hmm_devmem_ref_kill, &devmem->ref);
- if (ret) {
- hmm_devmem_remove(devmem);
+ ret = devm_add_action_or_reset(device, hmm_devmem_release, devmem);
+ if (ret)
return ERR_PTR(ret);
- }
return devmem;
-
-error_pages:
- devm_release_mem_region(device, devmem->resource->start,
- resource_size(devmem->resource));
-error_no_resource:
-error_devm_add_action:
- hmm_devmem_ref_kill(&devmem->ref);
- hmm_devmem_ref_exit(&devmem->ref);
-error_percpu_ref:
- devres_free(devmem);
- return ERR_PTR(ret);
}
-EXPORT_SYMBOL(hmm_devmem_add);
+EXPORT_SYMBOL_GPL(hmm_devmem_add);
struct hmm_devmem *hmm_devmem_add_resource(const struct hmm_devmem_ops *ops,
struct device *device,
@@ -1264,8 +1224,7 @@ struct hmm_devmem *hmm_devmem_add_resource(const struct hmm_devmem_ops *ops,
dev_pagemap_get_ops();
- devmem = devres_alloc_node(&hmm_devmem_release, sizeof(*devmem),
- GFP_KERNEL, dev_to_node(device));
+ devmem = devm_kzalloc(device, sizeof(*devmem), GFP_KERNEL);
if (!devmem)
return ERR_PTR(-ENOMEM);
@@ -1279,12 +1238,12 @@ struct hmm_devmem *hmm_devmem_add_resource(const struct hmm_devmem_ops *ops,
ret = percpu_ref_init(&devmem->ref, &hmm_devmem_ref_release,
0, GFP_KERNEL);
if (ret)
- goto error_percpu_ref;
+ return ERR_PTR(ret);
- ret = devm_add_action(device, hmm_devmem_ref_exit, &devmem->ref);
+ ret = devm_add_action_or_reset(device, hmm_devmem_ref_exit,
+ &devmem->ref);
if (ret)
- goto error_devm_add_action;
-
+ return ERR_PTR(ret);
devmem->pfn_first = devmem->resource->start >> PAGE_SHIFT;
devmem->pfn_last = devmem->pfn_first +
@@ -1292,58 +1251,20 @@ struct hmm_devmem *hmm_devmem_add_resource(const struct hmm_devmem_ops *ops,
ret = hmm_devmem_pages_create(devmem);
if (ret)
- goto error_devm_add_action;
+ return ERR_PTR(ret);
- devres_add(device, devmem);
+ ret = devm_add_action_or_reset(device, hmm_devmem_release, devmem);
+ if (ret)
+ return ERR_PTR(ret);
- ret = devm_add_action(device, hmm_devmem_ref_kill, &devmem->ref);
- if (ret) {
- hmm_devmem_remove(devmem);
+ ret = devm_add_action_or_reset(device, hmm_devmem_ref_kill,
+ &devmem->ref);
+ if (ret)
return ERR_PTR(ret);
- }
return devmem;
-
-error_devm_add_action:
- hmm_devmem_ref_kill(&devmem->ref);
- hmm_devmem_ref_exit(&devmem->ref);
-error_percpu_ref:
- devres_free(devmem);
- return ERR_PTR(ret);
-}
-EXPORT_SYMBOL(hmm_devmem_add_resource);
-
-/*
- * hmm_devmem_remove() - remove device memory (kill and free ZONE_DEVICE)
- *
- * @devmem: hmm_devmem struct use to track and manage the ZONE_DEVICE memory
- *
- * This will hot-unplug memory that was hotplugged by hmm_devmem_add on behalf
- * of the device driver. It will free struct page and remove the resource that
- * reserved the physical address range for this device memory.
- */
-void hmm_devmem_remove(struct hmm_devmem *devmem)
-{
- resource_size_t start, size;
- struct device *device;
- bool cdm = false;
-
- if (!devmem)
- return;
-
- device = devmem->device;
- start = devmem->resource->start;
- size = resource_size(devmem->resource);
-
- cdm = devmem->resource->desc == IORES_DESC_DEVICE_PUBLIC_MEMORY;
- hmm_devmem_ref_kill(&devmem->ref);
- hmm_devmem_ref_exit(&devmem->ref);
- hmm_devmem_pages_remove(devmem);
-
- if (!cdm)
- devm_release_mem_region(device, start, size);
}
-EXPORT_SYMBOL(hmm_devmem_remove);
+EXPORT_SYMBOL_GPL(hmm_devmem_add_resource);
/*
* A device driver that wants to handle multiple devices memory through a
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 15310f14c25e..6fad1864ba03 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -734,6 +734,21 @@ static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
spinlock_t *ptl;
ptl = pmd_lock(mm, pmd);
+ if (!pmd_none(*pmd)) {
+ if (write) {
+ if (pmd_pfn(*pmd) != pfn_t_to_pfn(pfn)) {
+ WARN_ON_ONCE(!is_huge_zero_pmd(*pmd));
+ goto out_unlock;
+ }
+ entry = pmd_mkyoung(*pmd);
+ entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
+ if (pmdp_set_access_flags(vma, addr, pmd, entry, 1))
+ update_mmu_cache_pmd(vma, addr, pmd);
+ }
+
+ goto out_unlock;
+ }
+
entry = pmd_mkhuge(pfn_t_pmd(pfn, prot));
if (pfn_t_devmap(pfn))
entry = pmd_mkdevmap(entry);
@@ -745,18 +760,25 @@ static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
if (pgtable) {
pgtable_trans_huge_deposit(mm, pmd, pgtable);
mm_inc_nr_ptes(mm);
+ pgtable = NULL;
}
set_pmd_at(mm, addr, pmd, entry);
update_mmu_cache_pmd(vma, addr, pmd);
+
+out_unlock:
spin_unlock(ptl);
+ if (pgtable)
+ pte_free(mm, pgtable);
}
-vm_fault_t vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
- pmd_t *pmd, pfn_t pfn, bool write)
+vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write)
{
+ unsigned long addr = vmf->address & PMD_MASK;
+ struct vm_area_struct *vma = vmf->vma;
pgprot_t pgprot = vma->vm_page_prot;
pgtable_t pgtable = NULL;
+
/*
* If we had pmd_special, we could avoid all these restrictions,
* but we need to be consistent with PTEs and architectures that
@@ -779,7 +801,7 @@ vm_fault_t vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
track_pfn_insert(vma, &pgprot, pfn);
- insert_pfn_pmd(vma, addr, pmd, pfn, pgprot, write, pgtable);
+ insert_pfn_pmd(vma, addr, vmf->pmd, pfn, pgprot, write, pgtable);
return VM_FAULT_NOPAGE;
}
EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd);
@@ -800,6 +822,20 @@ static void insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr,
spinlock_t *ptl;
ptl = pud_lock(mm, pud);
+ if (!pud_none(*pud)) {
+ if (write) {
+ if (pud_pfn(*pud) != pfn_t_to_pfn(pfn)) {
+ WARN_ON_ONCE(!is_huge_zero_pud(*pud));
+ goto out_unlock;
+ }
+ entry = pud_mkyoung(*pud);
+ entry = maybe_pud_mkwrite(pud_mkdirty(entry), vma);
+ if (pudp_set_access_flags(vma, addr, pud, entry, 1))
+ update_mmu_cache_pud(vma, addr, pud);
+ }
+ goto out_unlock;
+ }
+
entry = pud_mkhuge(pfn_t_pud(pfn, prot));
if (pfn_t_devmap(pfn))
entry = pud_mkdevmap(entry);
@@ -809,13 +845,17 @@ static void insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr,
}
set_pud_at(mm, addr, pud, entry);
update_mmu_cache_pud(vma, addr, pud);
+
+out_unlock:
spin_unlock(ptl);
}
-vm_fault_t vmf_insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr,
- pud_t *pud, pfn_t pfn, bool write)
+vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write)
{
+ unsigned long addr = vmf->address & PUD_MASK;
+ struct vm_area_struct *vma = vmf->vma;
pgprot_t pgprot = vma->vm_page_prot;
+
/*
* If we had pud_special, we could avoid all these restrictions,
* but we need to be consistent with PTEs and architectures that
@@ -832,7 +872,7 @@ vm_fault_t vmf_insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr,
track_pfn_insert(vma, &pgprot, pfn);
- insert_pfn_pud(vma, addr, pud, pfn, pgprot, write);
+ insert_pfn_pud(vma, addr, vmf->pud, pfn, pgprot, write);
return VM_FAULT_NOPAGE;
}
EXPORT_SYMBOL_GPL(vmf_insert_pfn_pud);
@@ -2127,23 +2167,25 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
*/
old_pmd = pmdp_invalidate(vma, haddr, pmd);
-#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
pmd_migration = is_pmd_migration_entry(old_pmd);
- if (pmd_migration) {
+ if (unlikely(pmd_migration)) {
swp_entry_t entry;
entry = pmd_to_swp_entry(old_pmd);
page = pfn_to_page(swp_offset(entry));
- } else
-#endif
+ write = is_write_migration_entry(entry);
+ young = false;
+ soft_dirty = pmd_swp_soft_dirty(old_pmd);
+ } else {
page = pmd_page(old_pmd);
+ if (pmd_dirty(old_pmd))
+ SetPageDirty(page);
+ write = pmd_write(old_pmd);
+ young = pmd_young(old_pmd);
+ soft_dirty = pmd_soft_dirty(old_pmd);
+ }
VM_BUG_ON_PAGE(!page_count(page), page);
page_ref_add(page, HPAGE_PMD_NR - 1);
- if (pmd_dirty(old_pmd))
- SetPageDirty(page);
- write = pmd_write(old_pmd);
- young = pmd_young(old_pmd);
- soft_dirty = pmd_soft_dirty(old_pmd);
/*
* Withdraw the table only after we mark the pmd entry invalid.
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 309fb8c969af..0bbb033d7d8c 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1572,8 +1572,9 @@ static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask,
*/
if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) {
SetPageHugeTemporary(page);
+ spin_unlock(&hugetlb_lock);
put_page(page);
- page = NULL;
+ return NULL;
} else {
h->surplus_huge_pages++;
h->surplus_huge_pages_node[page_to_nid(page)]++;
@@ -3624,7 +3625,6 @@ retry_avoidcopy:
copy_user_huge_page(new_page, old_page, address, vma,
pages_per_huge_page(h));
__SetPageUptodate(new_page);
- set_page_huge_active(new_page);
mmun_start = haddr;
mmun_end = mmun_start + huge_page_size(h);
@@ -3646,6 +3646,7 @@ retry_avoidcopy:
make_huge_pte(vma, new_page, 1));
page_remove_rmap(old_page, true);
hugepage_add_new_anon_rmap(new_page, vma, haddr);
+ set_page_huge_active(new_page);
/* Make the old page be freed below */
new_page = old_page;
}
@@ -3730,6 +3731,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
pte_t new_pte;
spinlock_t *ptl;
unsigned long haddr = address & huge_page_mask(h);
+ bool new_page = false;
/*
* Currently, we are forced to kill the process in the event the
@@ -3776,8 +3778,7 @@ retry:
* handling userfault. Reacquire after handling
* fault to make calling code simpler.
*/
- hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping,
- idx, haddr);
+ hash = hugetlb_fault_mutex_hash(h, mapping, idx, haddr);
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
ret = handle_userfault(&vmf, VM_UFFD_MISSING);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
@@ -3791,7 +3792,7 @@ retry:
}
clear_huge_page(page, address, pages_per_huge_page(h));
__SetPageUptodate(page);
- set_page_huge_active(page);
+ new_page = true;
if (vma->vm_flags & VM_MAYSHARE) {
int err = huge_add_to_page_cache(page, mapping, idx);
@@ -3862,6 +3863,15 @@ retry:
}
spin_unlock(ptl);
+
+ /*
+ * Only make newly allocated pages active. Existing pages found
+ * in the pagecache could be !page_huge_active() if they have been
+ * isolated for migration.
+ */
+ if (new_page)
+ set_page_huge_active(page);
+
unlock_page(page);
out:
return ret;
@@ -3876,21 +3886,14 @@ backout_unlocked:
}
#ifdef CONFIG_SMP
-u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
- struct vm_area_struct *vma,
- struct address_space *mapping,
+u32 hugetlb_fault_mutex_hash(struct hstate *h, struct address_space *mapping,
pgoff_t idx, unsigned long address)
{
unsigned long key[2];
u32 hash;
- if (vma->vm_flags & VM_SHARED) {
- key[0] = (unsigned long) mapping;
- key[1] = idx;
- } else {
- key[0] = (unsigned long) mm;
- key[1] = address >> huge_page_shift(h);
- }
+ key[0] = (unsigned long) mapping;
+ key[1] = idx;
hash = jhash2((u32 *)&key, sizeof(key)/sizeof(u32), 0);
@@ -3901,9 +3904,7 @@ u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
* For uniprocesor systems we always use a single mutex, so just
* return 0 and avoid the hashing overhead.
*/
-u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
- struct vm_area_struct *vma,
- struct address_space *mapping,
+u32 hugetlb_fault_mutex_hash(struct hstate *h, struct address_space *mapping,
pgoff_t idx, unsigned long address)
{
return 0;
@@ -3948,7 +3949,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
* get spurious allocation failures if two CPUs race to instantiate
* the same page in the page cache.
*/
- hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, idx, haddr);
+ hash = hugetlb_fault_mutex_hash(h, mapping, idx, haddr);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
entry = huge_ptep_get(ptep);
@@ -4096,7 +4097,6 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
* the set_pte_at() write.
*/
__SetPageUptodate(page);
- set_page_huge_active(page);
mapping = dst_vma->vm_file->f_mapping;
idx = vma_hugecache_offset(h, dst_vma, dst_addr);
@@ -4164,6 +4164,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
update_mmu_cache(dst_vma, dst_addr, dst_pte);
spin_unlock(ptl);
+ set_page_huge_active(page);
if (vm_shared)
unlock_page(page);
ret = 0;
@@ -4269,7 +4270,8 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
break;
}
if (ret & VM_FAULT_RETRY) {
- if (nonblocking)
+ if (nonblocking &&
+ !(fault_flags & FAULT_FLAG_RETRY_NOWAIT))
*nonblocking = 0;
*nr_pages = 0;
/*
@@ -4288,6 +4290,19 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT;
page = pte_page(huge_ptep_get(pte));
+
+ /*
+ * Instead of doing 'try_get_page()' below in the same_page
+ * loop, just check the count once here.
+ */
+ if (unlikely(page_count(page) <= 0)) {
+ if (pages) {
+ spin_unlock(ptl);
+ remainder = 0;
+ err = -ENOMEM;
+ break;
+ }
+ }
same_page:
if (pages) {
pages[i] = mem_map_offset(page, pfn_offset);
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index b68a3d0d075f..0ed549045074 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -1373,6 +1373,7 @@ static void scan_block(void *_start, void *_end,
/*
* Scan a large memory block in MAX_SCAN_SIZE chunks to reduce the latency.
*/
+#ifdef CONFIG_SMP
static void scan_large_block(void *start, void *end)
{
void *next;
@@ -1384,6 +1385,7 @@ static void scan_large_block(void *start, void *end)
cond_resched();
}
}
+#endif
/*
* Scan a memory block corresponding to a kmemleak_object. A condition is
@@ -1501,11 +1503,6 @@ static void kmemleak_scan(void)
}
rcu_read_unlock();
- /* data/bss scanning */
- scan_large_block(_sdata, _edata);
- scan_large_block(__bss_start, __bss_stop);
- scan_large_block(__start_ro_after_init, __end_ro_after_init);
-
#ifdef CONFIG_SMP
/* per-cpu sections scanning */
for_each_possible_cpu(i)
@@ -2036,6 +2033,17 @@ void __init kmemleak_init(void)
}
local_irq_restore(flags);
+ /* register the data/bss sections */
+ create_object((unsigned long)_sdata, _edata - _sdata,
+ KMEMLEAK_GREY, GFP_ATOMIC);
+ create_object((unsigned long)__bss_start, __bss_stop - __bss_start,
+ KMEMLEAK_GREY, GFP_ATOMIC);
+ /* only register .data..ro_after_init if not within .data */
+ if (__start_ro_after_init < _sdata || __end_ro_after_init > _edata)
+ create_object((unsigned long)__start_ro_after_init,
+ __end_ro_after_init - __start_ro_after_init,
+ KMEMLEAK_GREY, GFP_ATOMIC);
+
/*
* This is the point where tracking allocations is safe. Automatic
* scanning is started during the late initcall. Add the early logged
diff --git a/mm/list_lru.c b/mm/list_lru.c
index 5b30625fd365..f0a15d32b959 100644
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -37,11 +37,7 @@ static int lru_shrinker_id(struct list_lru *lru)
static inline bool list_lru_memcg_aware(struct list_lru *lru)
{
- /*
- * This needs node 0 to be always present, even
- * in the systems supporting sparse numa ids.
- */
- return !!lru->node[0].memcg_lrus;
+ return lru->memcg_aware;
}
static inline struct list_lru_one *
@@ -451,6 +447,8 @@ static int memcg_init_list_lru(struct list_lru *lru, bool memcg_aware)
{
int i;
+ lru->memcg_aware = memcg_aware;
+
if (!memcg_aware)
return 0;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 808a780d6500..d0f245d80f93 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -251,6 +251,12 @@ enum res_type {
iter != NULL; \
iter = mem_cgroup_iter(NULL, iter, NULL))
+static inline bool should_force_charge(void)
+{
+ return tsk_is_oom_victim(current) || fatal_signal_pending(current) ||
+ (current->flags & PF_EXITING);
+}
+
/* Some nice accessors for the vmpressure. */
struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
{
@@ -1385,8 +1391,13 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
};
bool ret;
- mutex_lock(&oom_lock);
- ret = out_of_memory(&oc);
+ if (mutex_lock_killable(&oom_lock))
+ return true;
+ /*
+ * A few threads which were not waiting at mutex_lock_killable() can
+ * fail to bail out. Therefore, check again after holding oom_lock.
+ */
+ ret = should_force_charge() || out_of_memory(&oc);
mutex_unlock(&oom_lock);
return ret;
}
@@ -1669,6 +1680,9 @@ enum oom_status {
static enum oom_status mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
{
+ enum oom_status ret;
+ bool locked;
+
if (order > PAGE_ALLOC_COSTLY_ORDER)
return OOM_SKIPPED;
@@ -1701,10 +1715,23 @@ static enum oom_status mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int
return OOM_ASYNC;
}
+ mem_cgroup_mark_under_oom(memcg);
+
+ locked = mem_cgroup_oom_trylock(memcg);
+
+ if (locked)
+ mem_cgroup_oom_notify(memcg);
+
+ mem_cgroup_unmark_under_oom(memcg);
if (mem_cgroup_out_of_memory(memcg, mask, order))
- return OOM_SUCCESS;
+ ret = OOM_SUCCESS;
+ else
+ ret = OOM_FAILED;
- return OOM_FAILED;
+ if (locked)
+ mem_cgroup_oom_unlock(memcg);
+
+ return ret;
}
/**
@@ -2187,9 +2214,7 @@ retry:
* bypass the last charges so that they can exit quickly and
* free their memory.
*/
- if (unlikely(tsk_is_oom_victim(current) ||
- fatal_signal_pending(current) ||
- current->flags & PF_EXITING))
+ if (unlikely(should_force_charge()))
goto force;
/*
@@ -3875,6 +3900,22 @@ struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb)
return &memcg->cgwb_domain;
}
+/*
+ * idx can be of type enum memcg_stat_item or node_stat_item.
+ * Keep in sync with memcg_exact_page().
+ */
+static unsigned long memcg_exact_page_state(struct mem_cgroup *memcg, int idx)
+{
+ long x = atomic_long_read(&memcg->stat[idx]);
+ int cpu;
+
+ for_each_online_cpu(cpu)
+ x += per_cpu_ptr(memcg->stat_cpu, cpu)->count[idx];
+ if (x < 0)
+ x = 0;
+ return x;
+}
+
/**
* mem_cgroup_wb_stats - retrieve writeback related stats from its memcg
* @wb: bdi_writeback in question
@@ -3900,10 +3941,10 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
struct mem_cgroup *parent;
- *pdirty = memcg_page_state(memcg, NR_FILE_DIRTY);
+ *pdirty = memcg_exact_page_state(memcg, NR_FILE_DIRTY);
/* this should eventually include NR_UNSTABLE_NFS */
- *pwriteback = memcg_page_state(memcg, NR_WRITEBACK);
+ *pwriteback = memcg_exact_page_state(memcg, NR_WRITEBACK);
*pfilepages = mem_cgroup_nr_lru_pages(memcg, (1 << LRU_INACTIVE_FILE) |
(1 << LRU_ACTIVE_FILE));
*pheadroom = PAGE_COUNTER_MAX;
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 0cd3de3550f0..6edc6db5ec1b 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -372,7 +372,8 @@ static void kill_procs(struct list_head *to_kill, int forcekill, bool fail,
if (fail || tk->addr_valid == 0) {
pr_err("Memory failure: %#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n",
pfn, tk->tsk->comm, tk->tsk->pid);
- force_sig(SIGKILL, tk->tsk);
+ do_send_sig_info(SIGKILL, SEND_SIG_PRIV,
+ tk->tsk, PIDTYPE_PID);
}
/*
@@ -1822,19 +1823,17 @@ static int soft_offline_in_use_page(struct page *page, int flags)
struct page *hpage = compound_head(page);
if (!PageHuge(page) && PageTransHuge(hpage)) {
- lock_page(hpage);
- if (!PageAnon(hpage) || unlikely(split_huge_page(hpage))) {
- unlock_page(hpage);
- if (!PageAnon(hpage))
+ lock_page(page);
+ if (!PageAnon(page) || unlikely(split_huge_page(page))) {
+ unlock_page(page);
+ if (!PageAnon(page))
pr_info("soft offline: %#lx: non anonymous thp\n", page_to_pfn(page));
else
pr_info("soft offline: %#lx: thp split failed\n", page_to_pfn(page));
- put_hwpoison_page(hpage);
+ put_hwpoison_page(page);
return -EBUSY;
}
- unlock_page(hpage);
- get_hwpoison_page(page);
- put_hwpoison_page(hpage);
+ unlock_page(page);
}
/*
diff --git a/mm/memory.c b/mm/memory.c
index 5c5df53dbdf9..e0010cb870e0 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1787,14 +1787,21 @@ static int insert_pfn(struct vm_area_struct *vma, unsigned long addr,
* in may not match the PFN we have mapped if the
* mapped PFN is a writeable COW page. In the mkwrite
* case we are creating a writable PTE for a shared
- * mapping and we expect the PFNs to match.
+ * mapping and we expect the PFNs to match. If they
+ * don't match, we are likely racing with block
+ * allocation and mapping invalidation so just skip the
+ * update.
*/
- if (WARN_ON_ONCE(pte_pfn(*pte) != pfn_t_to_pfn(pfn)))
+ if (pte_pfn(*pte) != pfn_t_to_pfn(pfn)) {
+ WARN_ON_ONCE(!is_zero_pfn(pte_pfn(*pte)));
goto out_unlock;
- entry = *pte;
- goto out_mkwrite;
- } else
- goto out_unlock;
+ }
+ entry = pte_mkyoung(*pte);
+ entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+ if (ptep_set_access_flags(vma, addr, pte, entry, 1))
+ update_mmu_cache(vma, addr, pte);
+ }
+ goto out_unlock;
}
/* Ok, finally just insert the thing.. */
@@ -1803,7 +1810,6 @@ static int insert_pfn(struct vm_area_struct *vma, unsigned long addr,
else
entry = pte_mkspecial(pfn_t_pte(pfn, prot));
-out_mkwrite:
if (mkwrite) {
entry = pte_mkyoung(entry);
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
@@ -3237,6 +3243,29 @@ static vm_fault_t __do_fault(struct vm_fault *vmf)
struct vm_area_struct *vma = vmf->vma;
vm_fault_t ret;
+ /*
+ * Preallocate pte before we take page_lock because this might lead to
+ * deadlocks for memcg reclaim which waits for pages under writeback:
+ * lock_page(A)
+ * SetPageWriteback(A)
+ * unlock_page(A)
+ * lock_page(B)
+ * lock_page(B)
+ * pte_alloc_pne
+ * shrink_page_list
+ * wait_on_page_writeback(A)
+ * SetPageWriteback(B)
+ * unlock_page(B)
+ * # flush A, B to clear the writeback
+ */
+ if (pmd_none(*vmf->pmd) && !vmf->prealloc_pte) {
+ vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm,
+ vmf->address);
+ if (!vmf->prealloc_pte)
+ return VM_FAULT_OOM;
+ smp_wmb(); /* See comment in __pte_alloc() */
+ }
+
ret = vma->vm_ops->fault(vmf);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY |
VM_FAULT_DONE_COW)))
@@ -3739,10 +3768,13 @@ static vm_fault_t do_shared_fault(struct vm_fault *vmf)
* but allow concurrent faults).
* The mmap_sem may have been released depending on flags and our
* return value. See filemap_fault() and __lock_page_or_retry().
+ * If mmap_sem is released, vma may become invalid (for example
+ * by other thread calling munmap()).
*/
static vm_fault_t do_fault(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
+ struct mm_struct *vm_mm = vma->vm_mm;
vm_fault_t ret;
/*
@@ -3783,7 +3815,7 @@ static vm_fault_t do_fault(struct vm_fault *vmf)
/* preallocated pagetable is unused: free it */
if (vmf->prealloc_pte) {
- pte_free(vma->vm_mm, vmf->prealloc_pte);
+ pte_free(vm_mm, vmf->prealloc_pte);
vmf->prealloc_pte = NULL;
}
return ret;
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index f3f919728f5c..af6735562215 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -35,6 +35,7 @@
#include <linux/memblock.h>
#include <linux/bootmem.h>
#include <linux/compaction.h>
+#include <linux/rmap.h>
#include <asm/tlbflush.h>
@@ -900,6 +901,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
*/
mem = find_memory_block(__pfn_to_section(pfn));
nid = mem->nid;
+ put_device(&mem->dev);
/* associate pfn range with the zone */
zone = move_pfn_range(online_type, nid, pfn, nr_pages);
@@ -1212,11 +1214,13 @@ static inline int pageblock_free(struct page *page)
return PageBuddy(page) && page_order(page) >= pageblock_order;
}
-/* Return the start of the next active pageblock after a given page */
-static struct page *next_active_pageblock(struct page *page)
+/* Return the pfn of the start of the next active pageblock after a given pfn */
+static unsigned long next_active_pageblock(unsigned long pfn)
{
+ struct page *page = pfn_to_page(pfn);
+
/* Ensure the starting page is pageblock-aligned */
- BUG_ON(page_to_pfn(page) & (pageblock_nr_pages - 1));
+ BUG_ON(pfn & (pageblock_nr_pages - 1));
/* If the entire pageblock is free, move to the end of free page */
if (pageblock_free(page)) {
@@ -1224,16 +1228,16 @@ static struct page *next_active_pageblock(struct page *page)
/* be careful. we don't have locks, page_order can be changed.*/
order = page_order(page);
if ((order < MAX_ORDER) && (order >= pageblock_order))
- return page + (1 << order);
+ return pfn + (1 << order);
}
- return page + pageblock_nr_pages;
+ return pfn + pageblock_nr_pages;
}
-static bool is_pageblock_removable_nolock(struct page *page)
+static bool is_pageblock_removable_nolock(unsigned long pfn)
{
+ struct page *page = pfn_to_page(pfn);
struct zone *zone;
- unsigned long pfn;
/*
* We have to be careful here because we are iterating over memory
@@ -1256,12 +1260,14 @@ static bool is_pageblock_removable_nolock(struct page *page)
/* Checks if this range of memory is likely to be hot-removable. */
bool is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages)
{
- struct page *page = pfn_to_page(start_pfn);
- struct page *end_page = page + nr_pages;
+ unsigned long end_pfn, pfn;
+
+ end_pfn = min(start_pfn + nr_pages,
+ zone_end_pfn(page_zone(pfn_to_page(start_pfn))));
/* Check the starting page of each pageblock within the range */
- for (; page < end_page; page = next_active_pageblock(page)) {
- if (!is_pageblock_removable_nolock(page))
+ for (pfn = start_pfn; pfn < end_pfn; pfn = next_active_pageblock(pfn)) {
+ if (!is_pageblock_removable_nolock(pfn))
return false;
cond_resched();
}
@@ -1297,6 +1303,9 @@ int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn,
i++;
if (i == MAX_ORDER_NR_PAGES || pfn + i >= end_pfn)
continue;
+ /* Check if we got outside of the zone */
+ if (zone && !zone_spans_pfn(zone, pfn + i))
+ return 0;
page = pfn_to_page(pfn + i);
if (zone && page_zone(page) != zone)
return 0;
@@ -1325,23 +1334,27 @@ int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn,
static unsigned long scan_movable_pages(unsigned long start, unsigned long end)
{
unsigned long pfn;
- struct page *page;
+
for (pfn = start; pfn < end; pfn++) {
- if (pfn_valid(pfn)) {
- page = pfn_to_page(pfn);
- if (PageLRU(page))
- return pfn;
- if (__PageMovable(page))
- return pfn;
- if (PageHuge(page)) {
- if (hugepage_migration_supported(page_hstate(page)) &&
- page_huge_active(page))
- return pfn;
- else
- pfn = round_up(pfn + 1,
- 1 << compound_order(page)) - 1;
- }
- }
+ struct page *page, *head;
+ unsigned long skip;
+
+ if (!pfn_valid(pfn))
+ continue;
+ page = pfn_to_page(pfn);
+ if (PageLRU(page))
+ return pfn;
+ if (__PageMovable(page))
+ return pfn;
+
+ if (!PageHuge(page))
+ continue;
+ head = compound_head(page);
+ if (hugepage_migration_supported(page_hstate(head)) &&
+ page_huge_active(head))
+ return pfn;
+ skip = (1 << compound_order(head)) - (page - head);
+ pfn += skip - 1;
}
return 0;
}
@@ -1393,6 +1406,21 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
pfn = page_to_pfn(compound_head(page))
+ hpage_nr_pages(page) - 1;
+ /*
+ * HWPoison pages have elevated reference counts so the migration would
+ * fail on them. It also doesn't make any sense to migrate them in the
+ * first place. Still try to unmap such a page in case it is still mapped
+ * (e.g. current hwpoison implementation doesn't unmap KSM pages but keep
+ * the unmap as the catch all safety net).
+ */
+ if (PageHWPoison(page)) {
+ if (WARN_ON(PageLRU(page)))
+ isolate_lru_page(page);
+ if (page_mapped(page))
+ try_to_unmap(page, TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS);
+ continue;
+ }
+
if (!get_page_unless_zero(page))
continue;
/*
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 149b6f4cf023..360b24bc69e5 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -350,7 +350,7 @@ static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
{
if (!pol)
return;
- if (!mpol_store_user_nodemask(pol) &&
+ if (!mpol_store_user_nodemask(pol) && !(pol->flags & MPOL_F_LOCAL) &&
nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
return;
@@ -428,6 +428,13 @@ static inline bool queue_pages_required(struct page *page,
return node_isset(nid, *qp->nmask) == !(flags & MPOL_MF_INVERT);
}
+/*
+ * queue_pages_pmd() has three possible return values:
+ * 1 - pages are placed on the right node or queued successfully.
+ * 0 - THP was split.
+ * -EIO - is migration entry or MPOL_MF_STRICT was specified and an existing
+ * page was already on a node that does not follow the policy.
+ */
static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr,
unsigned long end, struct mm_walk *walk)
{
@@ -437,7 +444,7 @@ static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr,
unsigned long flags;
if (unlikely(is_pmd_migration_entry(*pmd))) {
- ret = 1;
+ ret = -EIO;
goto unlock;
}
page = pmd_page(*pmd);
@@ -454,8 +461,15 @@ static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr,
ret = 1;
flags = qp->flags;
/* go to thp migration */
- if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
+ if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
+ if (!vma_migratable(walk->vma)) {
+ ret = -EIO;
+ goto unlock;
+ }
+
migrate_page_add(page, qp->pagelist, flags);
+ } else
+ ret = -EIO;
unlock:
spin_unlock(ptl);
out:
@@ -480,8 +494,10 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
ptl = pmd_trans_huge_lock(pmd, vma);
if (ptl) {
ret = queue_pages_pmd(pmd, ptl, addr, end, walk);
- if (ret)
+ if (ret > 0)
return 0;
+ else if (ret < 0)
+ return ret;
}
if (pmd_trans_unstable(pmd))
@@ -502,11 +518,16 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
continue;
if (!queue_pages_required(page, qp))
continue;
- migrate_page_add(page, qp->pagelist, flags);
+ if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
+ if (!vma_migratable(vma))
+ break;
+ migrate_page_add(page, qp->pagelist, flags);
+ } else
+ break;
}
pte_unmap_unlock(pte - 1, ptl);
cond_resched();
- return 0;
+ return addr != end ? -EIO : 0;
}
static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask,
@@ -576,7 +597,12 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end,
unsigned long endvma = vma->vm_end;
unsigned long flags = qp->flags;
- if (!vma_migratable(vma))
+ /*
+ * Need check MPOL_MF_STRICT to return -EIO if possible
+ * regardless of vma_migratable
+ */
+ if (!vma_migratable(vma) &&
+ !(flags & MPOL_MF_STRICT))
return 1;
if (endvma > end)
@@ -603,7 +629,7 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end,
}
/* queue pages from current vma */
- if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
+ if (flags & MPOL_MF_VALID)
return 0;
return 1;
}
@@ -1300,7 +1326,7 @@ static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
nodemask_t *nodes)
{
unsigned long copy = ALIGN(maxnode-1, 64) / 8;
- const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
+ unsigned int nbytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long);
if (copy > nbytes) {
if (copy > PAGE_SIZE)
@@ -1477,7 +1503,7 @@ static int kernel_get_mempolicy(int __user *policy,
int uninitialized_var(pval);
nodemask_t nodes;
- if (nmask != NULL && maxnode < MAX_NUMNODES)
+ if (nmask != NULL && maxnode < nr_node_ids)
return -EINVAL;
err = do_get_mempolicy(&pval, &nodes, addr, flags);
@@ -1513,7 +1539,7 @@ COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
unsigned long nr_bits, alloc_size;
DECLARE_BITMAP(bm, MAX_NUMNODES);
- nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
+ nr_bits = min_t(unsigned long, maxnode-1, nr_node_ids);
alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
if (nmask)
diff --git a/mm/migrate.c b/mm/migrate.c
index 84381b55b2bd..b2ea7d1e6f24 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -248,10 +248,8 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma,
pte = swp_entry_to_pte(entry);
} else if (is_device_public_page(new)) {
pte = pte_mkdevmap(pte);
- flush_dcache_page(new);
}
- } else
- flush_dcache_page(new);
+ }
#ifdef CONFIG_HUGETLB_PAGE
if (PageHuge(new)) {
@@ -983,6 +981,13 @@ static int move_to_new_page(struct page *newpage, struct page *page,
*/
if (!PageMappingFlags(page))
page->mapping = NULL;
+
+ if (unlikely(is_zone_device_page(newpage))) {
+ if (is_device_public_page(newpage))
+ flush_dcache_page(newpage);
+ } else
+ flush_dcache_page(newpage);
+
}
out:
return rc;
@@ -1118,10 +1123,13 @@ out:
* If migration is successful, decrease refcount of the newpage
* which will not free the page because new page owner increased
* refcounter. As well, if it is LRU page, add the page to LRU
- * list in here.
+ * list in here. Use the old state of the isolated source page to
+ * determine if we migrated a LRU page. newpage was already unlocked
+ * and possibly modified by its owner - don't rely on the page
+ * state.
*/
if (rc == MIGRATEPAGE_SUCCESS) {
- if (unlikely(__PageMovable(newpage)))
+ if (unlikely(!is_lru))
put_page(newpage);
else
putback_lru_page(newpage);
@@ -1300,6 +1308,16 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
lock_page(hpage);
}
+ /*
+ * Check for pages which are in the process of being freed. Without
+ * page_mapping() set, hugetlbfs specific move page routine will not
+ * be called and we could leak usage counts for subpools.
+ */
+ if (page_private(hpage) && !page_mapping(hpage)) {
+ rc = -EBUSY;
+ goto out_unlock;
+ }
+
if (PageAnon(hpage))
anon_vma = page_get_anon_vma(hpage);
@@ -1330,6 +1348,7 @@ put_anon:
put_new_page = NULL;
}
+out_unlock:
unlock_page(hpage);
out:
if (rc != -EAGAIN)
diff --git a/mm/mincore.c b/mm/mincore.c
index fc37afe226e6..2732c8c0764c 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -169,6 +169,22 @@ out:
return 0;
}
+static inline bool can_do_mincore(struct vm_area_struct *vma)
+{
+ if (vma_is_anonymous(vma))
+ return true;
+ if (!vma->vm_file)
+ return false;
+ /*
+ * Reveal pagecache information only for non-anonymous mappings that
+ * correspond to the files the calling process could (if tried) open
+ * for writing; otherwise we'd be including shared non-exclusive
+ * mappings, which opens a side channel.
+ */
+ return inode_owner_or_capable(file_inode(vma->vm_file)) ||
+ inode_permission(file_inode(vma->vm_file), MAY_WRITE) == 0;
+}
+
/*
* Do a chunk of "sys_mincore()". We've already checked
* all the arguments, we hold the mmap semaphore: we should
@@ -189,8 +205,13 @@ static long do_mincore(unsigned long addr, unsigned long pages, unsigned char *v
vma = find_vma(current->mm, addr);
if (!vma || addr < vma->vm_start)
return -ENOMEM;
- mincore_walk.mm = vma->vm_mm;
end = min(vma->vm_end, addr + (pages << PAGE_SHIFT));
+ if (!can_do_mincore(vma)) {
+ unsigned long pages = DIV_ROUND_UP(end - addr, PAGE_SIZE);
+ memset(vec, 1, pages);
+ return pages;
+ }
+ mincore_walk.mm = vma->vm_mm;
err = walk_page_range(addr, end, &mincore_walk);
if (err < 0)
return err;
diff --git a/mm/mmap.c b/mm/mmap.c
index f7cd9cb966c0..1480880ff814 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -45,6 +45,7 @@
#include <linux/moduleparam.h>
#include <linux/pkeys.h>
#include <linux/oom.h>
+#include <linux/sched/mm.h>
#include <linux/uaccess.h>
#include <asm/cacheflush.h>
@@ -2391,12 +2392,11 @@ int expand_downwards(struct vm_area_struct *vma,
{
struct mm_struct *mm = vma->vm_mm;
struct vm_area_struct *prev;
- int error;
+ int error = 0;
address &= PAGE_MASK;
- error = security_mmap_addr(address);
- if (error)
- return error;
+ if (address < mmap_min_addr)
+ return -EPERM;
/* Enforce stack_guard_gap */
prev = vma->vm_prev;
@@ -2492,7 +2492,8 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr)
vma = find_vma_prev(mm, addr, &prev);
if (vma && (vma->vm_start <= addr))
return vma;
- if (!prev || expand_stack(prev, addr))
+ /* don't alter vm_end if the coredump is running */
+ if (!prev || !mmget_still_valid(mm) || expand_stack(prev, addr))
return NULL;
if (prev->vm_flags & VM_LOCKED)
populate_vma_page_range(prev, addr, prev->vm_end, NULL);
@@ -2518,6 +2519,9 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr)
return vma;
if (!(vma->vm_flags & VM_GROWSDOWN))
return NULL;
+ /* don't alter vm_start if the coredump is running */
+ if (!mmget_still_valid(mm))
+ return NULL;
start = vma->vm_start;
if (expand_stack(vma, addr))
return NULL;
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index f10aa5360616..dbddb7a409dd 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -634,8 +634,8 @@ static int oom_reaper(void *unused)
static void wake_oom_reaper(struct task_struct *tsk)
{
- /* tsk is already queued? */
- if (tsk == oom_reaper_list || tsk->oom_reaper_list)
+ /* mm is already queued? */
+ if (test_and_set_bit(MMF_OOM_REAP_QUEUED, &tsk->signal->oom_mm->flags))
return;
get_task_struct(tsk);
@@ -915,7 +915,8 @@ static void __oom_kill_process(struct task_struct *victim)
*/
static int oom_kill_memcg_member(struct task_struct *task, void *unused)
{
- if (task->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
+ if (task->signal->oom_score_adj != OOM_SCORE_ADJ_MIN &&
+ !is_global_init(task)) {
get_task_struct(task);
__oom_kill_process(task);
}
@@ -962,6 +963,13 @@ static void oom_kill_process(struct oom_control *oc, const char *message)
* still freeing memory.
*/
read_lock(&tasklist_lock);
+
+ /*
+ * The task 'p' might have already exited before reaching here. The
+ * put_task_struct() will free task_struct 'p' while the loop still try
+ * to access the field of 'p', so, get an extra reference.
+ */
+ get_task_struct(p);
for_each_thread(p, t) {
list_for_each_entry(child, &t->children, sibling) {
unsigned int child_points;
@@ -981,6 +989,7 @@ static void oom_kill_process(struct oom_control *oc, const char *message)
}
}
}
+ put_task_struct(p);
read_unlock(&tasklist_lock);
/*
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 84ae9bf5858a..ea4fd3af3b4b 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2156,6 +2156,7 @@ int write_cache_pages(struct address_space *mapping,
{
int ret = 0;
int done = 0;
+ int error;
struct pagevec pvec;
int nr_pages;
pgoff_t uninitialized_var(writeback_index);
@@ -2236,25 +2237,31 @@ continue_unlock:
goto continue_unlock;
trace_wbc_writepage(wbc, inode_to_bdi(mapping->host));
- ret = (*writepage)(page, wbc, data);
- if (unlikely(ret)) {
- if (ret == AOP_WRITEPAGE_ACTIVATE) {
+ error = (*writepage)(page, wbc, data);
+ if (unlikely(error)) {
+ /*
+ * Handle errors according to the type of
+ * writeback. There's no need to continue for
+ * background writeback. Just push done_index
+ * past this page so media errors won't choke
+ * writeout for the entire file. For integrity
+ * writeback, we must process the entire dirty
+ * set regardless of errors because the fs may
+ * still have state to clear for each page. In
+ * that case we continue processing and return
+ * the first error.
+ */
+ if (error == AOP_WRITEPAGE_ACTIVATE) {
unlock_page(page);
- ret = 0;
- } else {
- /*
- * done_index is set past this page,
- * so media errors will not choke
- * background writeout for the entire
- * file. This has consequences for
- * range_cyclic semantics (ie. it may
- * not be suitable for data integrity
- * writeout).
- */
+ error = 0;
+ } else if (wbc->sync_mode != WB_SYNC_ALL) {
+ ret = error;
done_index = page->index + 1;
done = 1;
break;
}
+ if (!ret)
+ ret = error;
}
/*
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 004020752217..a01c15fdb723 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -307,6 +307,32 @@ static DEFINE_LOCAL_IRQ_LOCK(pa_lock);
int page_group_by_mobility_disabled __read_mostly;
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
+/*
+ * During boot we initialize deferred pages on-demand, as needed, but once
+ * page_alloc_init_late() has finished, the deferred pages are all initialized,
+ * and we can permanently disable that path.
+ */
+static DEFINE_STATIC_KEY_TRUE(deferred_pages);
+
+/*
+ * Calling kasan_free_pages() only after deferred memory initialization
+ * has completed. Poisoning pages during deferred memory init will greatly
+ * lengthen the process and cause problem in large memory systems as the
+ * deferred pages initialization is done with interrupt disabled.
+ *
+ * Assuming that there will be no reference to those newly initialized
+ * pages before they are ever allocated, this should have no effect on
+ * KASAN memory tracking as the poison will be properly inserted at page
+ * allocation time. The only corner case is when pages are allocated by
+ * on-demand allocation and then freed again before the deferred pages
+ * initialization is done, but this is not likely to happen.
+ */
+static inline void kasan_free_nondeferred_pages(struct page *page, int order)
+{
+ if (!static_branch_unlikely(&deferred_pages))
+ kasan_free_pages(page, order);
+}
+
/* Returns true if the struct page for the pfn is uninitialised */
static inline bool __meminit early_page_uninitialised(unsigned long pfn)
{
@@ -339,6 +365,8 @@ static inline bool update_defer_init(pg_data_t *pgdat,
return true;
}
#else
+#define kasan_free_nondeferred_pages(p, o) kasan_free_pages(p, o)
+
static inline bool early_page_uninitialised(unsigned long pfn)
{
return false;
@@ -1043,7 +1071,7 @@ static __always_inline bool free_pages_prepare(struct page *page,
arch_free_page(page, order);
kernel_poison_pages(page, 1 << order, 0);
kernel_map_pages(page, 1 << order, 0);
- kasan_free_pages(page, order);
+ kasan_free_nondeferred_pages(page, order);
return true;
}
@@ -1629,13 +1657,6 @@ static int __init deferred_init_memmap(void *data)
}
/*
- * During boot we initialize deferred pages on-demand, as needed, but once
- * page_alloc_init_late() has finished, the deferred pages are all initialized,
- * and we can permanently disable that path.
- */
-static DEFINE_STATIC_KEY_TRUE(deferred_pages);
-
-/*
* If this zone has deferred pages, try to grow it by initializing enough
* deferred pages to satisfy the allocation specified by order, rounded up to
* the nearest PAGES_PER_SECTION boundary. So we're adding memory in increments
@@ -1936,8 +1957,8 @@ inline void post_alloc_hook(struct page *page, unsigned int order,
arch_alloc_page(page, order);
kernel_map_pages(page, 1 << order, 1);
- kernel_poison_pages(page, 1 << order, 1);
kasan_alloc_pages(page, order);
+ kernel_poison_pages(page, 1 << order, 1);
set_page_owner(page, order, gfp_flags);
}
@@ -4596,11 +4617,11 @@ refill:
/* Even if we own the page, we do not use atomic_set().
* This would break get_page_unless_zero() users.
*/
- page_ref_add(page, size - 1);
+ page_ref_add(page, size);
/* reset page count bias and offset to start of new frag */
nc->pfmemalloc = page_is_pfmemalloc(page);
- nc->pagecnt_bias = size;
+ nc->pagecnt_bias = size + 1;
nc->offset = size;
}
@@ -4616,10 +4637,10 @@ refill:
size = nc->size;
#endif
/* OK, page count is 0, we can safely set it */
- set_page_count(page, size);
+ set_page_count(page, size + 1);
/* reset page count bias and offset to start of new frag */
- nc->pagecnt_bias = size;
+ nc->pagecnt_bias = size + 1;
offset = size - fragsz;
}
@@ -7790,11 +7811,14 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
* handle each tail page individually in migration.
*/
if (PageHuge(page)) {
+ struct page *head = compound_head(page);
+ unsigned int skip_pages;
- if (!hugepage_migration_supported(page_hstate(page)))
+ if (!hugepage_migration_supported(page_hstate(head)))
goto unmovable;
- iter = round_up(iter + 1, 1<<compound_order(page)) - 1;
+ skip_pages = (1 << compound_order(head)) - (page - head);
+ iter += skip_pages - 1;
continue;
}
diff --git a/mm/page_ext.c b/mm/page_ext.c
index a9826da84ccb..aad120123688 100644
--- a/mm/page_ext.c
+++ b/mm/page_ext.c
@@ -273,6 +273,7 @@ static void free_page_ext(void *addr)
table_size = get_entry_size() * PAGES_PER_SECTION;
BUG_ON(PageReserved(page));
+ kmemleak_free(addr);
free_pages_exact(addr, table_size);
}
}
@@ -398,10 +399,8 @@ void __init page_ext_init(void)
* We know some arch can have a nodes layout such as
* -------------pfn-------------->
* N0 | N1 | N2 | N0 | N1 | N2|....
- *
- * Take into account DEFERRED_STRUCT_PAGE_INIT.
*/
- if (early_pfn_to_nid(pfn) != nid)
+ if (pfn_to_nid(pfn) != nid)
continue;
if (init_section_page_ext(pfn, nid))
goto oom;
diff --git a/mm/page_poison.c b/mm/page_poison.c
index aa2b3d34e8ea..6cfa8e7d7213 100644
--- a/mm/page_poison.c
+++ b/mm/page_poison.c
@@ -6,6 +6,7 @@
#include <linux/page_ext.h>
#include <linux/poison.h>
#include <linux/ratelimit.h>
+#include <linux/kasan.h>
static bool want_page_poisoning __read_mostly;
@@ -34,7 +35,10 @@ static void poison_page(struct page *page)
{
void *addr = kmap_atomic(page);
+ /* KASAN still think the page is in-use, so skip it. */
+ kasan_disable_current();
memset(addr, PAGE_POISON, PAGE_SIZE);
+ kasan_enable_current();
kunmap_atomic(addr);
}
diff --git a/mm/percpu-km.c b/mm/percpu-km.c
index 38de70ab1a0d..0f643dc2dc65 100644
--- a/mm/percpu-km.c
+++ b/mm/percpu-km.c
@@ -50,6 +50,7 @@ static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp)
const int nr_pages = pcpu_group_sizes[0] >> PAGE_SHIFT;
struct pcpu_chunk *chunk;
struct page *pages;
+ unsigned long flags;
int i;
chunk = pcpu_alloc_chunk(gfp);
@@ -68,9 +69,9 @@ static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp)
chunk->data = pages;
chunk->base_addr = page_address(pages) - pcpu_group_offsets[0];
- spin_lock_irq(&pcpu_lock);
+ spin_lock_irqsave(&pcpu_lock, flags);
pcpu_chunk_populated(chunk, 0, nr_pages, false);
- spin_unlock_irq(&pcpu_lock);
+ spin_unlock_irqrestore(&pcpu_lock, flags);
pcpu_stats_chunk_alloc();
trace_percpu_create_chunk(chunk->base_addr);
diff --git a/mm/percpu.c b/mm/percpu.c
index 4b90682623e9..41e58f3d8fbf 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -2529,8 +2529,8 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
ai->groups[group].base_offset = areas[group] - base;
}
- pr_info("Embedded %zu pages/cpu @%p s%zu r%zu d%zu u%zu\n",
- PFN_DOWN(size_sum), base, ai->static_size, ai->reserved_size,
+ pr_info("Embedded %zu pages/cpu s%zu r%zu d%zu u%zu\n",
+ PFN_DOWN(size_sum), ai->static_size, ai->reserved_size,
ai->dyn_size, ai->unit_size);
rc = pcpu_setup_first_chunk(ai, base);
@@ -2651,8 +2651,8 @@ int __init pcpu_page_first_chunk(size_t reserved_size,
}
/* we're ready, commit */
- pr_info("%d %s pages/cpu @%p s%zu r%zu d%zu\n",
- unit_pages, psize_str, vm.addr, ai->static_size,
+ pr_info("%d %s pages/cpu s%zu r%zu d%zu\n",
+ unit_pages, psize_str, ai->static_size,
ai->reserved_size, ai->dyn_size);
rc = pcpu_setup_first_chunk(ai, vm.addr);
diff --git a/mm/shmem.c b/mm/shmem.c
index b6cf0e8e685b..3c8742655756 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2895,16 +2895,20 @@ static int shmem_create(struct inode *dir, struct dentry *dentry, umode_t mode,
static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
{
struct inode *inode = d_inode(old_dentry);
- int ret;
+ int ret = 0;
/*
* No ordinary (disk based) filesystem counts links as inodes;
* but each new link needs a new dentry, pinning lowmem, and
* tmpfs dentries cannot be pruned until they are unlinked.
+ * But if an O_TMPFILE file is linked into the tmpfs, the
+ * first link must skip that, to get the accounting right.
*/
- ret = shmem_reserve_inode(inode->i_sb);
- if (ret)
- goto out;
+ if (inode->i_nlink) {
+ ret = shmem_reserve_inode(inode->i_sb);
+ if (ret)
+ goto out;
+ }
dir->i_size += BOGO_DIRENT_SIZE;
inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode);
diff --git a/mm/slab.c b/mm/slab.c
index 19f6bb643d5a..8ccc092fcd39 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -563,14 +563,6 @@ static void start_cpu_timer(int cpu)
static void init_arraycache(struct array_cache *ac, int limit, int batch)
{
- /*
- * The array_cache structures contain pointers to free object.
- * However, when such objects are allocated or transferred to another
- * cache the pointers are not cleared and they could be counted as
- * valid references during a kmemleak scan. Therefore, kmemleak must
- * not scan such objects.
- */
- kmemleak_no_scan(ac);
if (ac) {
ac->avail = 0;
ac->limit = limit;
@@ -586,6 +578,14 @@ static struct array_cache *alloc_arraycache(int node, int entries,
struct array_cache *ac = NULL;
ac = kmalloc_node(memsize, gfp, node);
+ /*
+ * The array_cache structures contain pointers to free object.
+ * However, when such objects are allocated or transferred to another
+ * cache the pointers are not cleared and they could be counted as
+ * valid references during a kmemleak scan. Therefore, kmemleak must
+ * not scan such objects.
+ */
+ kmemleak_no_scan(ac);
init_arraycache(ac, entries, batchcount);
return ac;
}
@@ -679,8 +679,11 @@ static struct alien_cache *__alloc_alien_cache(int node, int entries,
struct alien_cache *alc = NULL;
alc = kmalloc_node(memsize, gfp, node);
- init_arraycache(&alc->ac, entries, batch);
- spin_lock_init(&alc->lock);
+ if (alc) {
+ kmemleak_no_scan(alc);
+ init_arraycache(&alc->ac, entries, batch);
+ spin_lock_init(&alc->lock);
+ }
return alc;
}
@@ -2122,6 +2125,8 @@ done:
cachep->allocflags = __GFP_COMP;
if (flags & SLAB_CACHE_DMA)
cachep->allocflags |= GFP_DMA;
+ if (flags & SLAB_CACHE_DMA32)
+ cachep->allocflags |= GFP_DMA32;
if (flags & SLAB_RECLAIM_ACCOUNT)
cachep->allocflags |= __GFP_RECLAIMABLE;
cachep->size = size;
@@ -4300,7 +4305,8 @@ static void show_symbol(struct seq_file *m, unsigned long address)
static int leaks_show(struct seq_file *m, void *p)
{
- struct kmem_cache *cachep = list_entry(p, struct kmem_cache, list);
+ struct kmem_cache *cachep = list_entry(p, struct kmem_cache,
+ root_caches_node);
struct page *page;
struct kmem_cache_node *n;
const char *name;
diff --git a/mm/slab.h b/mm/slab.h
index e904021207f1..d6b01d61f768 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -127,7 +127,8 @@ static inline slab_flags_t kmem_cache_flags(unsigned int object_size,
/* Legal flag mask for kmem_cache_create(), for various configurations */
-#define SLAB_CORE_FLAGS (SLAB_HWCACHE_ALIGN | SLAB_CACHE_DMA | SLAB_PANIC | \
+#define SLAB_CORE_FLAGS (SLAB_HWCACHE_ALIGN | SLAB_CACHE_DMA | \
+ SLAB_CACHE_DMA32 | SLAB_PANIC | \
SLAB_TYPESAFE_BY_RCU | SLAB_DEBUG_OBJECTS )
#if defined(CONFIG_DEBUG_SLAB)
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 3a7ac4f15194..4d3c2e76d1ba 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -53,7 +53,7 @@ static DECLARE_WORK(slab_caches_to_rcu_destroy_work,
SLAB_FAILSLAB | SLAB_KASAN)
#define SLAB_MERGE_SAME (SLAB_RECLAIM_ACCOUNT | SLAB_CACHE_DMA | \
- SLAB_ACCOUNT)
+ SLAB_CACHE_DMA32 | SLAB_ACCOUNT)
/*
* Merge control. If this is set then no merging of slab caches will occur.
diff --git a/mm/slub.c b/mm/slub.c
index bcc176415952..2240b51a0549 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -3604,6 +3604,9 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
if (s->flags & SLAB_CACHE_DMA)
s->allocflags |= GFP_DMA;
+ if (s->flags & SLAB_CACHE_DMA32)
+ s->allocflags |= GFP_DMA32;
+
if (s->flags & SLAB_RECLAIM_ACCOUNT)
s->allocflags |= __GFP_RECLAIMABLE;
@@ -5710,6 +5713,8 @@ static char *create_unique_id(struct kmem_cache *s)
*/
if (s->flags & SLAB_CACHE_DMA)
*p++ = 'd';
+ if (s->flags & SLAB_CACHE_DMA32)
+ *p++ = 'D';
if (s->flags & SLAB_RECLAIM_ACCOUNT)
*p++ = 'a';
if (s->flags & SLAB_CONSISTENCY_CHECKS)
diff --git a/mm/sparse.c b/mm/sparse.c
index 10b07eea9a6e..45950a074bdb 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -196,7 +196,7 @@ static inline int next_present_section_nr(int section_nr)
}
#define for_each_present_section_nr(start, section_nr) \
for (section_nr = next_present_section_nr(start-1); \
- ((section_nr >= 0) && \
+ ((section_nr != -1) && \
(section_nr <= __highest_present_section_nr)); \
section_nr = next_present_section_nr(section_nr))
diff --git a/mm/swap.c b/mm/swap.c
index 9217027671c8..0457927d3f0c 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -325,11 +325,6 @@ static inline void activate_page_drain(int cpu)
{
}
-static bool need_activate_page_drain(int cpu)
-{
- return false;
-}
-
void activate_page(struct page *page)
{
struct zone *zone = page_zone(page);
@@ -666,6 +661,8 @@ void lru_add_drain(void)
local_unlock_cpu(swapvec_lock);
}
+#ifdef CONFIG_SMP
+
#ifdef CONFIG_PREEMPT_RT_BASE
static inline void remote_lru_add_drain(int cpu, struct cpumask *has_work)
{
@@ -676,13 +673,13 @@ static inline void remote_lru_add_drain(int cpu, struct cpumask *has_work)
#else
+static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work);
+
static void lru_add_drain_per_cpu(struct work_struct *dummy)
{
lru_add_drain();
}
-static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work);
-
static inline void remote_lru_add_drain(int cpu, struct cpumask *has_work)
{
struct work_struct *work = &per_cpu(lru_add_drain_work, cpu);
@@ -733,6 +730,12 @@ void lru_add_drain_all(void)
mutex_unlock(&lock);
}
+#else
+void lru_add_drain_all(void)
+{
+ lru_add_drain();
+}
+#endif
/**
* release_pages - batched put_page()
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 8810a6d7d67f..0047dcaf9369 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -98,6 +98,15 @@ static atomic_t proc_poll_event = ATOMIC_INIT(0);
atomic_t nr_rotate_swap = ATOMIC_INIT(0);
+static struct swap_info_struct *swap_type_to_swap_info(int type)
+{
+ if (type >= READ_ONCE(nr_swapfiles))
+ return NULL;
+
+ smp_rmb(); /* Pairs with smp_wmb in alloc_swap_info. */
+ return READ_ONCE(swap_info[type]);
+}
+
static inline unsigned char swap_count(unsigned char ent)
{
return ent & ~SWAP_HAS_CACHE; /* may include COUNT_CONTINUED flag */
@@ -1030,12 +1039,14 @@ noswap:
/* The only caller of this function is now suspend routine */
swp_entry_t get_swap_page_of_type(int type)
{
- struct swap_info_struct *si;
+ struct swap_info_struct *si = swap_type_to_swap_info(type);
pgoff_t offset;
- si = swap_info[type];
+ if (!si)
+ goto fail;
+
spin_lock(&si->lock);
- if (si && (si->flags & SWP_WRITEOK)) {
+ if (si->flags & SWP_WRITEOK) {
atomic_long_dec(&nr_swap_pages);
/* This is called for allocating swap entry, not cache */
offset = scan_swap_map(si, 1);
@@ -1046,6 +1057,7 @@ swp_entry_t get_swap_page_of_type(int type)
atomic_long_inc(&nr_swap_pages);
}
spin_unlock(&si->lock);
+fail:
return (swp_entry_t) {0};
}
@@ -1057,9 +1069,9 @@ static struct swap_info_struct *__swap_info_get(swp_entry_t entry)
if (!entry.val)
goto out;
type = swp_type(entry);
- if (type >= nr_swapfiles)
+ p = swap_type_to_swap_info(type);
+ if (!p)
goto bad_nofile;
- p = swap_info[type];
if (!(p->flags & SWP_USED))
goto bad_device;
offset = swp_offset(entry);
@@ -1708,10 +1720,9 @@ int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p)
sector_t swapdev_block(int type, pgoff_t offset)
{
struct block_device *bdev;
+ struct swap_info_struct *si = swap_type_to_swap_info(type);
- if ((unsigned int)type >= nr_swapfiles)
- return 0;
- if (!(swap_info[type]->flags & SWP_WRITEOK))
+ if (!si || !(si->flags & SWP_WRITEOK))
return 0;
return map_swap_entry(swp_entry(type, offset), &bdev);
}
@@ -2208,7 +2219,8 @@ int try_to_unuse(unsigned int type, bool frontswap,
*/
if (PageSwapCache(page) &&
likely(page_private(page) == entry.val) &&
- !page_swapped(page))
+ (!PageTransCompound(page) ||
+ !swap_page_trans_huge_swapped(si, entry)))
delete_from_swap_cache(compound_head(page));
/*
@@ -2268,7 +2280,7 @@ static sector_t map_swap_entry(swp_entry_t entry, struct block_device **bdev)
struct swap_extent *se;
pgoff_t offset;
- sis = swap_info[swp_type(entry)];
+ sis = swp_swap_info(entry);
*bdev = sis->bdev;
offset = swp_offset(entry);
@@ -2706,9 +2718,7 @@ static void *swap_start(struct seq_file *swap, loff_t *pos)
if (!l)
return SEQ_START_TOKEN;
- for (type = 0; type < nr_swapfiles; type++) {
- smp_rmb(); /* read nr_swapfiles before swap_info[type] */
- si = swap_info[type];
+ for (type = 0; (si = swap_type_to_swap_info(type)); type++) {
if (!(si->flags & SWP_USED) || !si->swap_map)
continue;
if (!--l)
@@ -2728,9 +2738,7 @@ static void *swap_next(struct seq_file *swap, void *v, loff_t *pos)
else
type = si->type + 1;
- for (; type < nr_swapfiles; type++) {
- smp_rmb(); /* read nr_swapfiles before swap_info[type] */
- si = swap_info[type];
+ for (; (si = swap_type_to_swap_info(type)); type++) {
if (!(si->flags & SWP_USED) || !si->swap_map)
continue;
++*pos;
@@ -2819,8 +2827,9 @@ static struct swap_info_struct *alloc_swap_info(void)
struct swap_info_struct *p;
unsigned int type;
int i;
+ int size = sizeof(*p) + nr_node_ids * sizeof(struct plist_node);
- p = kvzalloc(sizeof(*p), GFP_KERNEL);
+ p = kvzalloc(size, GFP_KERNEL);
if (!p)
return ERR_PTR(-ENOMEM);
@@ -2836,14 +2845,14 @@ static struct swap_info_struct *alloc_swap_info(void)
}
if (type >= nr_swapfiles) {
p->type = type;
- swap_info[type] = p;
+ WRITE_ONCE(swap_info[type], p);
/*
* Write swap_info[type] before nr_swapfiles, in case a
* racing procfs swap_start() or swap_next() is reading them.
* (We never shrink nr_swapfiles, we never free this entry.)
*/
smp_wmb();
- nr_swapfiles++;
+ WRITE_ONCE(nr_swapfiles, nr_swapfiles + 1);
} else {
kvfree(p);
p = swap_info[type];
@@ -3363,7 +3372,7 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
{
struct swap_info_struct *p;
struct swap_cluster_info *ci;
- unsigned long offset, type;
+ unsigned long offset;
unsigned char count;
unsigned char has_cache;
int err = -EINVAL;
@@ -3371,10 +3380,10 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
if (non_swap_entry(entry))
goto out;
- type = swp_type(entry);
- if (type >= nr_swapfiles)
+ p = swp_swap_info(entry);
+ if (!p)
goto bad_file;
- p = swap_info[type];
+
offset = swp_offset(entry);
if (unlikely(offset >= p->max))
goto out;
@@ -3471,7 +3480,7 @@ int swapcache_prepare(swp_entry_t entry)
struct swap_info_struct *swp_swap_info(swp_entry_t entry)
{
- return swap_info[swp_type(entry)];
+ return swap_type_to_swap_info(swp_type(entry));
}
struct swap_info_struct *page_swap_info(struct page *page)
diff --git a/mm/usercopy.c b/mm/usercopy.c
index 852eb4e53f06..14faadcedd06 100644
--- a/mm/usercopy.c
+++ b/mm/usercopy.c
@@ -247,7 +247,8 @@ static DEFINE_STATIC_KEY_FALSE_RO(bypass_usercopy_checks);
/*
* Validates that the given object is:
* - not bogus address
- * - known-safe heap or stack object
+ * - fully contained by stack (or stack frame, when available)
+ * - fully within SLAB object (or object whitelist area, when available)
* - not in kernel text
*/
void __check_object_size(const void *ptr, unsigned long n, bool to_user)
@@ -262,9 +263,6 @@ void __check_object_size(const void *ptr, unsigned long n, bool to_user)
/* Check for invalid addresses. */
check_bogus_address((const unsigned long)ptr, n, to_user);
- /* Check for bad heap object. */
- check_heap_object(ptr, n, to_user);
-
/* Check for bad stack object. */
switch (check_stack_object(ptr, n)) {
case NOT_STACK:
@@ -282,6 +280,9 @@ void __check_object_size(const void *ptr, unsigned long n, bool to_user)
usercopy_abort("process stack", NULL, to_user, 0, n);
}
+ /* Check for bad heap object. */
+ check_heap_object(ptr, n, to_user);
+
/* Check for object in kernel to avoid text exposure. */
check_kernel_text_object((const unsigned long)ptr, n, to_user);
}
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 458acda96f20..7529d3fcc899 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -271,8 +271,7 @@ retry:
*/
idx = linear_page_index(dst_vma, dst_addr);
mapping = dst_vma->vm_file->f_mapping;
- hash = hugetlb_fault_mutex_hash(h, dst_mm, dst_vma, mapping,
- idx, dst_addr);
+ hash = hugetlb_fault_mutex_hash(h, mapping, idx, dst_addr);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
err = -ENOMEM;
diff --git a/mm/util.c b/mm/util.c
index 9e3ebd2ef65f..6a24a1025d77 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -485,7 +485,7 @@ bool page_mapped(struct page *page)
return true;
if (PageHuge(page))
return false;
- for (i = 0; i < hpage_nr_pages(page); i++) {
+ for (i = 0; i < (1 << compound_order(page)); i++) {
if (atomic_read(&page[i]._mapcount) >= 0)
return true;
}
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 495245727706..5c6939cc28b7 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -498,7 +498,11 @@ nocache:
}
found:
- if (addr + size > vend)
+ /*
+ * Check also calculated address against the vstart,
+ * because it can be 0 because of big align request.
+ */
+ if (addr + size > vend || addr < vstart)
goto overflow;
va->va_start = addr;
@@ -2247,7 +2251,7 @@ int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr,
if (!(area->flags & VM_USERMAP))
return -EINVAL;
- if (kaddr + size > area->addr + area->size)
+ if (kaddr + size > area->addr + get_vm_area_size(area))
return -EINVAL;
do {
diff --git a/mm/vmscan.c b/mm/vmscan.c
index c5ef7240cbcb..ee545d1e9894 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -477,16 +477,6 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
delta *= 4;
do_div(delta, shrinker->seeks);
- /*
- * Make sure we apply some minimal pressure on default priority
- * even on small cgroups. Stale objects are not only consuming memory
- * by themselves, but can also hold a reference to a dying cgroup,
- * preventing it from being reclaimed. A dying cgroup with all
- * corresponding structures like per-cpu stats and kmem caches
- * can be really big, so it may lead to a significant waste of memory.
- */
- delta = max_t(unsigned long long, delta, min(freeable, batch_size));
-
total_scan += delta;
if (total_scan < 0) {
pr_err("shrink_slab: %pF negative objects to delete nr=%ld\n",
@@ -2200,7 +2190,6 @@ static void shrink_active_list(unsigned long nr_to_scan,
* 10TB 320 32GB
*/
static bool inactive_list_is_low(struct lruvec *lruvec, bool file,
- struct mem_cgroup *memcg,
struct scan_control *sc, bool actual_reclaim)
{
enum lru_list active_lru = file * LRU_FILE + LRU_ACTIVE;
@@ -2221,16 +2210,12 @@ static bool inactive_list_is_low(struct lruvec *lruvec, bool file,
inactive = lruvec_lru_size(lruvec, inactive_lru, sc->reclaim_idx);
active = lruvec_lru_size(lruvec, active_lru, sc->reclaim_idx);
- if (memcg)
- refaults = memcg_page_state(memcg, WORKINGSET_ACTIVATE);
- else
- refaults = node_page_state(pgdat, WORKINGSET_ACTIVATE);
-
/*
* When refaults are being observed, it means a new workingset
* is being established. Disable active list protection to get
* rid of the stale workingset quickly.
*/
+ refaults = lruvec_page_state(lruvec, WORKINGSET_ACTIVATE);
if (file && actual_reclaim && lruvec->refaults != refaults) {
inactive_ratio = 0;
} else {
@@ -2251,12 +2236,10 @@ static bool inactive_list_is_low(struct lruvec *lruvec, bool file,
}
static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
- struct lruvec *lruvec, struct mem_cgroup *memcg,
- struct scan_control *sc)
+ struct lruvec *lruvec, struct scan_control *sc)
{
if (is_active_lru(lru)) {
- if (inactive_list_is_low(lruvec, is_file_lru(lru),
- memcg, sc, true))
+ if (inactive_list_is_low(lruvec, is_file_lru(lru), sc, true))
shrink_active_list(nr_to_scan, lruvec, sc, lru);
return 0;
}
@@ -2356,7 +2339,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
* anonymous pages on the LRU in eligible zones.
* Otherwise, the small LRU gets thrashed.
*/
- if (!inactive_list_is_low(lruvec, false, memcg, sc, false) &&
+ if (!inactive_list_is_low(lruvec, false, sc, false) &&
lruvec_lru_size(lruvec, LRU_INACTIVE_ANON, sc->reclaim_idx)
>> sc->priority) {
scan_balance = SCAN_ANON;
@@ -2374,7 +2357,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
* lruvec even if it has plenty of old anonymous pages unless the
* system is under heavy pressure.
*/
- if (!inactive_list_is_low(lruvec, true, memcg, sc, false) &&
+ if (!inactive_list_is_low(lruvec, true, sc, false) &&
lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, sc->reclaim_idx) >> sc->priority) {
scan_balance = SCAN_FILE;
goto out;
@@ -2456,9 +2439,11 @@ out:
/*
* Scan types proportional to swappiness and
* their relative recent reclaim efficiency.
+ * Make sure we don't miss the last page
+ * because of a round-off error.
*/
- scan = div64_u64(scan * fraction[file],
- denominator);
+ scan = DIV64_U64_ROUND_UP(scan * fraction[file],
+ denominator);
break;
case SCAN_FILE:
case SCAN_ANON:
@@ -2525,7 +2510,7 @@ static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memc
nr[lru] -= nr_to_scan;
nr_reclaimed += shrink_list(lru, nr_to_scan,
- lruvec, memcg, sc);
+ lruvec, sc);
}
}
@@ -2592,7 +2577,7 @@ static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memc
* Even if we did not try to evict anon pages at all, we want to
* rebalance the anon lru active/inactive ratio.
*/
- if (inactive_list_is_low(lruvec, false, memcg, sc, true))
+ if (inactive_list_is_low(lruvec, false, sc, true))
shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
sc, LRU_ACTIVE_ANON);
}
@@ -2990,12 +2975,8 @@ static void snapshot_refaults(struct mem_cgroup *root_memcg, pg_data_t *pgdat)
unsigned long refaults;
struct lruvec *lruvec;
- if (memcg)
- refaults = memcg_page_state(memcg, WORKINGSET_ACTIVATE);
- else
- refaults = node_page_state(pgdat, WORKINGSET_ACTIVATE);
-
lruvec = mem_cgroup_lruvec(pgdat, memcg);
+ refaults = lruvec_page_state(lruvec, WORKINGSET_ACTIVATE);
lruvec->refaults = refaults;
} while ((memcg = mem_cgroup_iter(root_memcg, memcg, NULL)));
}
@@ -3352,7 +3333,7 @@ static void age_active_anon(struct pglist_data *pgdat,
do {
struct lruvec *lruvec = mem_cgroup_lruvec(pgdat, memcg);
- if (inactive_list_is_low(lruvec, false, memcg, sc, true))
+ if (inactive_list_is_low(lruvec, false, sc, true))
shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
sc, LRU_ACTIVE_ANON);
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 2cfaa5075c72..0cd11c5e3999 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1284,13 +1284,8 @@ const char * const vmstat_text[] = {
#endif
#endif /* CONFIG_MEMORY_BALLOON */
#ifdef CONFIG_DEBUG_TLBFLUSH
-#ifdef CONFIG_SMP
"nr_tlb_remote_flush",
"nr_tlb_remote_flush_received",
-#else
- "", /* nr_tlb_remote_flush */
- "", /* nr_tlb_remote_flush_received */
-#endif /* CONFIG_SMP */
"nr_tlb_local_flush_all",
"nr_tlb_local_flush_one",
#endif /* CONFIG_DEBUG_TLBFLUSH */
@@ -1559,6 +1554,10 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
if (is_zone_first_populated(pgdat, zone)) {
seq_printf(m, "\n per-node stats");
for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
+ /* Skip hidden vmstat items. */
+ if (*vmstat_text[i + NR_VM_ZONE_STAT_ITEMS +
+ NR_VM_NUMA_STAT_ITEMS] == '\0')
+ continue;
seq_printf(m, "\n %-12s %lu",
vmstat_text[i + NR_VM_ZONE_STAT_ITEMS +
NR_VM_NUMA_STAT_ITEMS],