summaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
authorMarcel Ziswiler <marcel.ziswiler@toradex.com>2019-12-18 22:52:20 +0100
committerMarcel Ziswiler <marcel.ziswiler@toradex.com>2019-12-18 22:52:20 +0100
commit1ddf624b0b268fdc0b80b1de618b98f8d117afea (patch)
tree3d3218332bcb34cb0afa01d6ad996058a3dbcb77 /mm
parent6b774eec1f9d3064e9b33634dfa99d5666d0a73a (diff)
parentbfb9e5c03076a446b1f4f6a523ddc8d723c907a6 (diff)
Merge tag 'v4.14.159' into 4.14-2.0.x-imx
This is the 4.14.159 stable release Conflicts: arch/arm/Kconfig.debug arch/arm/boot/dts/imx7s.dtsi arch/arm/mach-imx/cpuidle-imx6sx.c drivers/crypto/caam/caamalg.c drivers/crypto/mxs-dcp.c drivers/dma/imx-sdma.c drivers/input/keyboard/imx_keypad.c drivers/net/can/flexcan.c drivers/net/can/rx-offload.c drivers/net/wireless/ath/ath10k/pci.c drivers/pci/dwc/pci-imx6.c drivers/spi/spi-fsl-lpspi.c drivers/usb/dwc3/gadget.c
Diffstat (limited to 'mm')
-rw-r--r--mm/cma.c13
-rw-r--r--mm/compaction.c35
-rw-r--r--mm/filemap.c25
-rw-r--r--mm/gup.c12
-rw-r--r--mm/huge_memory.c4
-rw-r--r--mm/hugetlb.c5
-rw-r--r--mm/hugetlb_cgroup.c2
-rw-r--r--mm/internal.h10
-rw-r--r--mm/khugepaged.c3
-rw-r--r--mm/kmemleak.c2
-rw-r--r--mm/ksm.c14
-rw-r--r--mm/list_lru.c2
-rw-r--r--mm/memcontrol.c51
-rw-r--r--mm/memory_hotplug.c125
-rw-r--r--mm/mempolicy.c2
-rw-r--r--mm/mlock.c4
-rw-r--r--mm/mmu_notifier.c2
-rw-r--r--mm/oom_kill.c5
-rw-r--r--mm/page-writeback.c33
-rw-r--r--mm/page_idle.c4
-rw-r--r--mm/page_owner.c5
-rw-r--r--mm/shmem.c22
-rw-r--r--mm/slub.c13
-rw-r--r--mm/usercopy.c10
-rw-r--r--mm/vmalloc.c9
-rw-r--r--mm/vmscan.c35
-rw-r--r--mm/vmstat.c9
-rw-r--r--mm/zsmalloc.c80
28 files changed, 345 insertions, 191 deletions
diff --git a/mm/cma.c b/mm/cma.c
index 56761e40d191..c4a34c813d47 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -277,6 +277,12 @@ int __init cma_declare_contiguous(phys_addr_t base,
*/
alignment = max(alignment, (phys_addr_t)PAGE_SIZE <<
max_t(unsigned long, MAX_ORDER - 1, pageblock_order));
+ if (fixed && base & (alignment - 1)) {
+ ret = -EINVAL;
+ pr_err("Region at %pa must be aligned to %pa bytes\n",
+ &base, &alignment);
+ goto err;
+ }
base = ALIGN(base, alignment);
size = ALIGN(size, alignment);
limit &= ~(alignment - 1);
@@ -307,6 +313,13 @@ int __init cma_declare_contiguous(phys_addr_t base,
if (limit == 0 || limit > memblock_end)
limit = memblock_end;
+ if (base + size > limit) {
+ ret = -EINVAL;
+ pr_err("Size (%pa) of region at %pa exceeds limit (%pa)\n",
+ &size, &base, &limit);
+ goto err;
+ }
+
/* Reserve memory */
if (fixed) {
if (memblock_is_region_reserved(base, size) ||
diff --git a/mm/compaction.c b/mm/compaction.c
index 85395dc6eb13..eb8e7f5d3a08 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -1517,6 +1517,17 @@ static enum compact_result compact_zone(struct zone *zone, struct compact_contro
unsigned long end_pfn = zone_end_pfn(zone);
const bool sync = cc->mode != MIGRATE_ASYNC;
+ /*
+ * These counters track activities during zone compaction. Initialize
+ * them before compacting a new zone.
+ */
+ cc->total_migrate_scanned = 0;
+ cc->total_free_scanned = 0;
+ cc->nr_migratepages = 0;
+ cc->nr_freepages = 0;
+ INIT_LIST_HEAD(&cc->freepages);
+ INIT_LIST_HEAD(&cc->migratepages);
+
cc->migratetype = gfpflags_to_migratetype(cc->gfp_mask);
ret = compaction_suitable(zone, cc->order, cc->alloc_flags,
cc->classzone_idx);
@@ -1680,10 +1691,6 @@ static enum compact_result compact_zone_order(struct zone *zone, int order,
{
enum compact_result ret;
struct compact_control cc = {
- .nr_freepages = 0,
- .nr_migratepages = 0,
- .total_migrate_scanned = 0,
- .total_free_scanned = 0,
.order = order,
.gfp_mask = gfp_mask,
.zone = zone,
@@ -1696,8 +1703,6 @@ static enum compact_result compact_zone_order(struct zone *zone, int order,
.ignore_skip_hint = (prio == MIN_COMPACT_PRIORITY),
.ignore_block_suitable = (prio == MIN_COMPACT_PRIORITY)
};
- INIT_LIST_HEAD(&cc.freepages);
- INIT_LIST_HEAD(&cc.migratepages);
ret = compact_zone(zone, &cc);
@@ -1796,8 +1801,6 @@ static void compact_node(int nid)
struct zone *zone;
struct compact_control cc = {
.order = -1,
- .total_migrate_scanned = 0,
- .total_free_scanned = 0,
.mode = MIGRATE_SYNC,
.ignore_skip_hint = true,
.whole_zone = true,
@@ -1811,11 +1814,7 @@ static void compact_node(int nid)
if (!populated_zone(zone))
continue;
- cc.nr_freepages = 0;
- cc.nr_migratepages = 0;
cc.zone = zone;
- INIT_LIST_HEAD(&cc.freepages);
- INIT_LIST_HEAD(&cc.migratepages);
compact_zone(zone, &cc);
@@ -1924,8 +1923,6 @@ static void kcompactd_do_work(pg_data_t *pgdat)
struct zone *zone;
struct compact_control cc = {
.order = pgdat->kcompactd_max_order,
- .total_migrate_scanned = 0,
- .total_free_scanned = 0,
.classzone_idx = pgdat->kcompactd_classzone_idx,
.mode = MIGRATE_SYNC_LIGHT,
.ignore_skip_hint = true,
@@ -1950,16 +1947,10 @@ static void kcompactd_do_work(pg_data_t *pgdat)
COMPACT_CONTINUE)
continue;
- cc.nr_freepages = 0;
- cc.nr_migratepages = 0;
- cc.total_migrate_scanned = 0;
- cc.total_free_scanned = 0;
- cc.zone = zone;
- INIT_LIST_HEAD(&cc.freepages);
- INIT_LIST_HEAD(&cc.migratepages);
-
if (kthread_should_stop())
return;
+
+ cc.zone = zone;
status = compact_zone(zone, &cc);
if (status == COMPACT_SUCCESS) {
diff --git a/mm/filemap.c b/mm/filemap.c
index e2e738cc08b1..a30dbf93de99 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -338,7 +338,8 @@ int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
.range_end = end,
};
- if (!mapping_cap_writeback_dirty(mapping))
+ if (!mapping_cap_writeback_dirty(mapping) ||
+ !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
return 0;
wbc_attach_fdatawrite_inode(&wbc, mapping->host);
@@ -464,6 +465,28 @@ int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte,
EXPORT_SYMBOL(filemap_fdatawait_range);
/**
+ * filemap_fdatawait_range_keep_errors - wait for writeback to complete
+ * @mapping: address space structure to wait for
+ * @start_byte: offset in bytes where the range starts
+ * @end_byte: offset in bytes where the range ends (inclusive)
+ *
+ * Walk the list of under-writeback pages of the given address space in the
+ * given range and wait for all of them. Unlike filemap_fdatawait_range(),
+ * this function does not clear error status of the address space.
+ *
+ * Use this function if callers don't handle errors themselves. Expected
+ * call sites are system-wide / filesystem-wide data flushers: e.g. sync(2),
+ * fsfreeze(8)
+ */
+int filemap_fdatawait_range_keep_errors(struct address_space *mapping,
+ loff_t start_byte, loff_t end_byte)
+{
+ __filemap_fdatawait_range(mapping, start_byte, end_byte);
+ return filemap_check_and_keep_errors(mapping);
+}
+EXPORT_SYMBOL(filemap_fdatawait_range_keep_errors);
+
+/**
* file_fdatawait_range - wait for writeback to complete
* @file: file pointing to address space structure to wait for
* @start_byte: offset in bytes where the range starts
diff --git a/mm/gup.c b/mm/gup.c
index babcbd6d99c3..12b9626b1a9e 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -442,11 +442,14 @@ static int get_gate_page(struct mm_struct *mm, unsigned long address,
pgd = pgd_offset_k(address);
else
pgd = pgd_offset_gate(mm, address);
- BUG_ON(pgd_none(*pgd));
+ if (pgd_none(*pgd))
+ return -EFAULT;
p4d = p4d_offset(pgd, address);
- BUG_ON(p4d_none(*p4d));
+ if (p4d_none(*p4d))
+ return -EFAULT;
pud = pud_offset(p4d, address);
- BUG_ON(pud_none(*pud));
+ if (pud_none(*pud))
+ return -EFAULT;
pmd = pmd_offset(pud, address);
if (!pmd_present(*pmd))
return -EFAULT;
@@ -1364,7 +1367,8 @@ static inline pte_t gup_get_pte(pte_t *ptep)
}
#endif
-static void undo_dev_pagemap(int *nr, int nr_start, struct page **pages)
+static void __maybe_unused undo_dev_pagemap(int *nr, int nr_start,
+ struct page **pages)
{
while ((*nr) - nr_start) {
struct page *page = pages[--(*nr)];
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 930f2aa3bb4d..1adc2e6c50f9 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -33,6 +33,7 @@
#include <linux/page_idle.h>
#include <linux/shmem_fs.h>
#include <linux/oom.h>
+#include <linux/page_owner.h>
#include <asm/tlb.h>
#include <asm/pgalloc.h>
@@ -2387,6 +2388,9 @@ static void __split_huge_page(struct page *page, struct list_head *list,
}
ClearPageCompound(head);
+
+ split_page_owner(head, HPAGE_PMD_ORDER);
+
/* See comment in __split_huge_page_tail() */
if (PageAnon(head)) {
/* Additional pin to radix tree of swap cache */
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 8ca0075a5464..310656b4ede6 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1081,11 +1081,10 @@ static bool pfn_range_valid_gigantic(struct zone *z,
struct page *page;
for (i = start_pfn; i < end_pfn; i++) {
- if (!pfn_valid(i))
+ page = pfn_to_online_page(i);
+ if (!page)
return false;
- page = pfn_to_page(i);
-
if (page_zone(page) != z)
return false;
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c
index eec1150125b9..e430e04997ee 100644
--- a/mm/hugetlb_cgroup.c
+++ b/mm/hugetlb_cgroup.c
@@ -196,7 +196,7 @@ int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
again:
rcu_read_lock();
h_cg = hugetlb_cgroup_from_task(current);
- if (!css_tryget_online(&h_cg->css)) {
+ if (!css_tryget(&h_cg->css)) {
rcu_read_unlock();
goto again;
}
diff --git a/mm/internal.h b/mm/internal.h
index 1df011f62480..a182506242c4 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -455,6 +455,16 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn,
#define NODE_RECLAIM_SOME 0
#define NODE_RECLAIM_SUCCESS 1
+#ifdef CONFIG_NUMA
+extern int node_reclaim(struct pglist_data *, gfp_t, unsigned int);
+#else
+static inline int node_reclaim(struct pglist_data *pgdat, gfp_t mask,
+ unsigned int order)
+{
+ return NODE_RECLAIM_NOSCAN;
+}
+#endif
+
extern int hwpoison_filter(struct page *p);
extern u32 hwpoison_filter_dev_major;
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index d27a73737f1a..acf66c5dc9bd 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1006,6 +1006,9 @@ static void collapse_huge_page(struct mm_struct *mm,
* handled by the anon_vma lock + PG_lock.
*/
down_write(&mm->mmap_sem);
+ result = SCAN_ANY_PROCESS;
+ if (!mmget_still_valid(mm))
+ goto out;
result = hugepage_vma_revalidate(mm, address, &vma);
if (result)
goto out;
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 71ba1c7f8892..d779181bed4d 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -577,7 +577,7 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size,
if (in_irq()) {
object->pid = 0;
strncpy(object->comm, "hardirq", sizeof(object->comm));
- } else if (in_softirq()) {
+ } else if (in_serving_softirq()) {
object->pid = 0;
strncpy(object->comm, "softirq", sizeof(object->comm));
} else {
diff --git a/mm/ksm.c b/mm/ksm.c
index f50cc573815f..764486ffcd16 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -849,13 +849,13 @@ static int remove_stable_node(struct stable_node *stable_node)
return 0;
}
- if (WARN_ON_ONCE(page_mapped(page))) {
- /*
- * This should not happen: but if it does, just refuse to let
- * merge_across_nodes be switched - there is no need to panic.
- */
- err = -EBUSY;
- } else {
+ /*
+ * Page could be still mapped if this races with __mmput() running in
+ * between ksm_exit() and exit_mmap(). Just refuse to let
+ * merge_across_nodes/max_page_sharing be switched.
+ */
+ err = -EBUSY;
+ if (!page_mapped(page)) {
/*
* The stable node did not yet appear stale to get_ksm_page(),
* since that allows for an unmapped ksm page to be recognized
diff --git a/mm/list_lru.c b/mm/list_lru.c
index dc19694d0e92..d67348afd1dc 100644
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -313,7 +313,7 @@ static int __memcg_init_list_lru_node(struct list_lru_memcg *memcg_lrus,
}
return 0;
fail:
- __memcg_destroy_list_lru_node(memcg_lrus, begin, i - 1);
+ __memcg_destroy_list_lru_node(memcg_lrus, begin, i);
return -ENOMEM;
}
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 6a9a7e1066ef..326525a97c47 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -725,7 +725,7 @@ static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
if (unlikely(!memcg))
memcg = root_mem_cgroup;
}
- } while (!css_tryget_online(&memcg->css));
+ } while (!css_tryget(&memcg->css));
rcu_read_unlock();
return memcg;
}
@@ -871,26 +871,45 @@ void mem_cgroup_iter_break(struct mem_cgroup *root,
css_put(&prev->css);
}
-static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
+static void __invalidate_reclaim_iterators(struct mem_cgroup *from,
+ struct mem_cgroup *dead_memcg)
{
- struct mem_cgroup *memcg = dead_memcg;
struct mem_cgroup_reclaim_iter *iter;
struct mem_cgroup_per_node *mz;
int nid;
int i;
- for (; memcg; memcg = parent_mem_cgroup(memcg)) {
- for_each_node(nid) {
- mz = mem_cgroup_nodeinfo(memcg, nid);
- for (i = 0; i <= DEF_PRIORITY; i++) {
- iter = &mz->iter[i];
- cmpxchg(&iter->position,
- dead_memcg, NULL);
- }
+ for_each_node(nid) {
+ mz = mem_cgroup_nodeinfo(from, nid);
+ for (i = 0; i <= DEF_PRIORITY; i++) {
+ iter = &mz->iter[i];
+ cmpxchg(&iter->position,
+ dead_memcg, NULL);
}
}
}
+static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
+{
+ struct mem_cgroup *memcg = dead_memcg;
+ struct mem_cgroup *last;
+
+ do {
+ __invalidate_reclaim_iterators(memcg, dead_memcg);
+ last = memcg;
+ } while ((memcg = parent_mem_cgroup(memcg)));
+
+ /*
+ * When cgruop1 non-hierarchy mode is used,
+ * parent_mem_cgroup() does not walk all the way up to the
+ * cgroup root (root_mem_cgroup). So we have to handle
+ * dead_memcg from cgroup root separately.
+ */
+ if (last != root_mem_cgroup)
+ __invalidate_reclaim_iterators(root_mem_cgroup,
+ dead_memcg);
+}
+
/*
* Iteration constructs for visiting all cgroups (under a tree). If
* loops are exited prematurely (break), mem_cgroup_iter_break() must
@@ -2333,6 +2352,16 @@ int memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) &&
!page_counter_try_charge(&memcg->kmem, nr_pages, &counter)) {
+
+ /*
+ * Enforce __GFP_NOFAIL allocation because callers are not
+ * prepared to see failures and likely do not have any failure
+ * handling code.
+ */
+ if (gfp & __GFP_NOFAIL) {
+ page_counter_charge(&memcg->kmem, nr_pages);
+ return 0;
+ }
cancel_charge(memcg, nr_pages);
return -ENOMEM;
}
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index c9d3a49bd4e2..2d6626ab29d1 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -343,12 +343,8 @@ static unsigned long find_smallest_section_pfn(int nid, struct zone *zone,
unsigned long start_pfn,
unsigned long end_pfn)
{
- struct mem_section *ms;
-
for (; start_pfn < end_pfn; start_pfn += PAGES_PER_SECTION) {
- ms = __pfn_to_section(start_pfn);
-
- if (unlikely(!valid_section(ms)))
+ if (unlikely(!pfn_to_online_page(start_pfn)))
continue;
if (unlikely(pfn_to_nid(start_pfn) != nid))
@@ -368,15 +364,12 @@ static unsigned long find_biggest_section_pfn(int nid, struct zone *zone,
unsigned long start_pfn,
unsigned long end_pfn)
{
- struct mem_section *ms;
unsigned long pfn;
/* pfn is the end pfn of a memory section. */
pfn = end_pfn - 1;
for (; pfn >= start_pfn; pfn -= PAGES_PER_SECTION) {
- ms = __pfn_to_section(pfn);
-
- if (unlikely(!valid_section(ms)))
+ if (unlikely(!pfn_to_online_page(pfn)))
continue;
if (unlikely(pfn_to_nid(pfn) != nid))
@@ -398,7 +391,6 @@ static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
unsigned long z = zone_end_pfn(zone); /* zone_end_pfn namespace clash */
unsigned long zone_end_pfn = z;
unsigned long pfn;
- struct mem_section *ms;
int nid = zone_to_nid(zone);
zone_span_writelock(zone);
@@ -436,9 +428,7 @@ static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
*/
pfn = zone_start_pfn;
for (; pfn < zone_end_pfn; pfn += PAGES_PER_SECTION) {
- ms = __pfn_to_section(pfn);
-
- if (unlikely(!valid_section(ms)))
+ if (unlikely(!pfn_to_online_page(pfn)))
continue;
if (page_zone(pfn_to_page(pfn)) != zone)
@@ -459,70 +449,33 @@ static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
zone_span_writeunlock(zone);
}
-static void shrink_pgdat_span(struct pglist_data *pgdat,
- unsigned long start_pfn, unsigned long end_pfn)
+static void update_pgdat_span(struct pglist_data *pgdat)
{
- unsigned long pgdat_start_pfn = pgdat->node_start_pfn;
- unsigned long p = pgdat_end_pfn(pgdat); /* pgdat_end_pfn namespace clash */
- unsigned long pgdat_end_pfn = p;
- unsigned long pfn;
- struct mem_section *ms;
- int nid = pgdat->node_id;
-
- if (pgdat_start_pfn == start_pfn) {
- /*
- * If the section is smallest section in the pgdat, it need
- * shrink pgdat->node_start_pfn and pgdat->node_spanned_pages.
- * In this case, we find second smallest valid mem_section
- * for shrinking zone.
- */
- pfn = find_smallest_section_pfn(nid, NULL, end_pfn,
- pgdat_end_pfn);
- if (pfn) {
- pgdat->node_start_pfn = pfn;
- pgdat->node_spanned_pages = pgdat_end_pfn - pfn;
- }
- } else if (pgdat_end_pfn == end_pfn) {
- /*
- * If the section is biggest section in the pgdat, it need
- * shrink pgdat->node_spanned_pages.
- * In this case, we find second biggest valid mem_section for
- * shrinking zone.
- */
- pfn = find_biggest_section_pfn(nid, NULL, pgdat_start_pfn,
- start_pfn);
- if (pfn)
- pgdat->node_spanned_pages = pfn - pgdat_start_pfn + 1;
- }
-
- /*
- * If the section is not biggest or smallest mem_section in the pgdat,
- * it only creates a hole in the pgdat. So in this case, we need not
- * change the pgdat.
- * But perhaps, the pgdat has only hole data. Thus it check the pgdat
- * has only hole or not.
- */
- pfn = pgdat_start_pfn;
- for (; pfn < pgdat_end_pfn; pfn += PAGES_PER_SECTION) {
- ms = __pfn_to_section(pfn);
+ unsigned long node_start_pfn = 0, node_end_pfn = 0;
+ struct zone *zone;
- if (unlikely(!valid_section(ms)))
- continue;
+ for (zone = pgdat->node_zones;
+ zone < pgdat->node_zones + MAX_NR_ZONES; zone++) {
+ unsigned long zone_end_pfn = zone->zone_start_pfn +
+ zone->spanned_pages;
- if (pfn_to_nid(pfn) != nid)
+ /* No need to lock the zones, they can't change. */
+ if (!zone->spanned_pages)
continue;
-
- /* If the section is current section, it continues the loop */
- if (start_pfn == pfn)
+ if (!node_end_pfn) {
+ node_start_pfn = zone->zone_start_pfn;
+ node_end_pfn = zone_end_pfn;
continue;
+ }
- /* If we find valid section, we have nothing to do */
- return;
+ if (zone_end_pfn > node_end_pfn)
+ node_end_pfn = zone_end_pfn;
+ if (zone->zone_start_pfn < node_start_pfn)
+ node_start_pfn = zone->zone_start_pfn;
}
- /* The pgdat has no valid section */
- pgdat->node_start_pfn = 0;
- pgdat->node_spanned_pages = 0;
+ pgdat->node_start_pfn = node_start_pfn;
+ pgdat->node_spanned_pages = node_end_pfn - node_start_pfn;
}
static void __remove_zone(struct zone *zone, unsigned long start_pfn)
@@ -531,9 +484,19 @@ static void __remove_zone(struct zone *zone, unsigned long start_pfn)
int nr_pages = PAGES_PER_SECTION;
unsigned long flags;
+#ifdef CONFIG_ZONE_DEVICE
+ /*
+ * Zone shrinking code cannot properly deal with ZONE_DEVICE. So
+ * we will not try to shrink the zones - which is okay as
+ * set_zone_contiguous() cannot deal with ZONE_DEVICE either way.
+ */
+ if (zone_idx(zone) == ZONE_DEVICE)
+ return;
+#endif
+
pgdat_resize_lock(zone->zone_pgdat, &flags);
shrink_zone_span(zone, start_pfn, start_pfn + nr_pages);
- shrink_pgdat_span(pgdat, start_pfn, start_pfn + nr_pages);
+ update_pgdat_span(pgdat);
pgdat_resize_unlock(zone->zone_pgdat, &flags);
}
@@ -1110,7 +1073,12 @@ static int online_memory_block(struct memory_block *mem, void *arg)
return device_online(&mem->dev);
}
-/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
+/*
+ * NOTE: The caller must call lock_device_hotplug() to serialize hotplug
+ * and online/offline operations (triggered e.g. by sysfs).
+ *
+ * we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG
+ */
int __ref add_memory_resource(int nid, struct resource *res, bool online)
{
u64 start, size;
@@ -1203,9 +1171,9 @@ out:
mem_hotplug_done();
return ret;
}
-EXPORT_SYMBOL_GPL(add_memory_resource);
-int __ref add_memory(int nid, u64 start, u64 size)
+/* requires device_hotplug_lock, see add_memory_resource() */
+int __ref __add_memory(int nid, u64 start, u64 size)
{
struct resource *res;
int ret;
@@ -1219,6 +1187,17 @@ int __ref add_memory(int nid, u64 start, u64 size)
release_memory_resource(res);
return ret;
}
+
+int add_memory(int nid, u64 start, u64 size)
+{
+ int rc;
+
+ lock_device_hotplug();
+ rc = __add_memory(nid, start, size);
+ unlock_device_hotplug();
+
+ return rc;
+}
EXPORT_SYMBOL_GPL(add_memory);
#ifdef CONFIG_MEMORY_HOTREMOVE
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 6ca0225335eb..a37cfa88669e 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -305,7 +305,7 @@ static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
else {
nodes_remap(tmp, pol->v.nodes,pol->w.cpuset_mems_allowed,
*nodes);
- pol->w.cpuset_mems_allowed = tmp;
+ pol->w.cpuset_mems_allowed = *nodes;
}
if (nodes_empty(tmp))
diff --git a/mm/mlock.c b/mm/mlock.c
index 46af369c13e5..1f9ee86672e8 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -629,11 +629,11 @@ static int apply_vma_lock_flags(unsigned long start, size_t len,
* is also counted.
* Return value: previously mlocked page counts
*/
-static int count_mm_mlocked_page_nr(struct mm_struct *mm,
+static unsigned long count_mm_mlocked_page_nr(struct mm_struct *mm,
unsigned long start, size_t len)
{
struct vm_area_struct *vma;
- int count = 0;
+ unsigned long count = 0;
if (mm == NULL)
mm = current->mm;
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 314285284e6e..70d0efb06374 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -267,7 +267,7 @@ static int do_mmu_notifier_register(struct mmu_notifier *mn,
* thanks to mm_take_all_locks().
*/
spin_lock(&mm->mmu_notifier_mm->lock);
- hlist_add_head(&mn->hlist, &mm->mmu_notifier_mm->list);
+ hlist_add_head_rcu(&mn->hlist, &mm->mmu_notifier_mm->list);
spin_unlock(&mm->mmu_notifier_mm->lock);
mm_drop_all_locks(mm);
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index fe0aac2348e5..7a5c0b229c6a 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -1050,9 +1050,10 @@ bool out_of_memory(struct oom_control *oc)
* The OOM killer does not compensate for IO-less reclaim.
* pagefault_out_of_memory lost its gfp context so we have to
* make sure exclude 0 mask - all other users should have at least
- * ___GFP_DIRECT_RECLAIM to get here.
+ * ___GFP_DIRECT_RECLAIM to get here. But mem_cgroup_oom() has to
+ * invoke the OOM killer even if it is a GFP_NOFS allocation.
*/
- if (oc->gfp_mask && !(oc->gfp_mask & __GFP_FS))
+ if (oc->gfp_mask && !(oc->gfp_mask & __GFP_FS) && !is_memcg_oom(oc))
return true;
/*
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index e001de5ac50c..a40c075fd8f1 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2150,6 +2150,13 @@ EXPORT_SYMBOL(tag_pages_for_writeback);
* not miss some pages (e.g., because some other process has cleared TOWRITE
* tag we set). The rule we follow is that TOWRITE tag can be cleared only
* by the process clearing the DIRTY tag (and submitting the page for IO).
+ *
+ * To avoid deadlocks between range_cyclic writeback and callers that hold
+ * pages in PageWriteback to aggregate IO until write_cache_pages() returns,
+ * we do not loop back to the start of the file. Doing so causes a page
+ * lock/page writeback access order inversion - we should only ever lock
+ * multiple pages in ascending page->index order, and looping back to the start
+ * of the file violates that rule and causes deadlocks.
*/
int write_cache_pages(struct address_space *mapping,
struct writeback_control *wbc, writepage_t writepage,
@@ -2164,7 +2171,6 @@ int write_cache_pages(struct address_space *mapping,
pgoff_t index;
pgoff_t end; /* Inclusive */
pgoff_t done_index;
- int cycled;
int range_whole = 0;
int tag;
@@ -2172,23 +2178,17 @@ int write_cache_pages(struct address_space *mapping,
if (wbc->range_cyclic) {
writeback_index = mapping->writeback_index; /* prev offset */
index = writeback_index;
- if (index == 0)
- cycled = 1;
- else
- cycled = 0;
end = -1;
} else {
index = wbc->range_start >> PAGE_SHIFT;
end = wbc->range_end >> PAGE_SHIFT;
if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
range_whole = 1;
- cycled = 1; /* ignore range_cyclic tests */
}
if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
tag = PAGECACHE_TAG_TOWRITE;
else
tag = PAGECACHE_TAG_DIRTY;
-retry:
if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
tag_pages_for_writeback(mapping, index, end);
done_index = index;
@@ -2296,17 +2296,14 @@ continue_unlock:
pagevec_release(&pvec);
cond_resched();
}
- if (!cycled && !done) {
- /*
- * range_cyclic:
- * We hit the last page and there is more work to be done: wrap
- * back to the start of the file
- */
- cycled = 1;
- index = 0;
- end = writeback_index - 1;
- goto retry;
- }
+
+ /*
+ * If we hit the last page and there is more work to be done: wrap
+ * back the index back to the start of the file for the next
+ * time we are called.
+ */
+ if (wbc->range_cyclic && !done)
+ done_index = 0;
if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
mapping->writeback_index = done_index;
diff --git a/mm/page_idle.c b/mm/page_idle.c
index e412a63b2b74..504684181827 100644
--- a/mm/page_idle.c
+++ b/mm/page_idle.c
@@ -136,7 +136,7 @@ static ssize_t page_idle_bitmap_read(struct file *file, struct kobject *kobj,
end_pfn = pfn + count * BITS_PER_BYTE;
if (end_pfn > max_pfn)
- end_pfn = ALIGN(max_pfn, BITMAP_CHUNK_BITS);
+ end_pfn = max_pfn;
for (; pfn < end_pfn; pfn++) {
bit = pfn % BITMAP_CHUNK_BITS;
@@ -181,7 +181,7 @@ static ssize_t page_idle_bitmap_write(struct file *file, struct kobject *kobj,
end_pfn = pfn + count * BITS_PER_BYTE;
if (end_pfn > max_pfn)
- end_pfn = ALIGN(max_pfn, BITMAP_CHUNK_BITS);
+ end_pfn = max_pfn;
for (; pfn < end_pfn; pfn++) {
bit = pfn % BITMAP_CHUNK_BITS;
diff --git a/mm/page_owner.c b/mm/page_owner.c
index a71fe4c623ef..6ac05a6ff2d1 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -273,7 +273,8 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m,
* not matter as the mixed block count will still be correct
*/
for (; pfn < end_pfn; ) {
- if (!pfn_valid(pfn)) {
+ page = pfn_to_online_page(pfn);
+ if (!page) {
pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES);
continue;
}
@@ -281,13 +282,13 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m,
block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
block_end_pfn = min(block_end_pfn, end_pfn);
- page = pfn_to_page(pfn);
pageblock_mt = get_pageblock_migratetype(page);
for (; pfn < block_end_pfn; pfn++) {
if (!pfn_valid_within(pfn))
continue;
+ /* The pageblock is online, no need to recheck. */
page = pfn_to_page(pfn);
if (page_zone(page) != zone)
diff --git a/mm/shmem.c b/mm/shmem.c
index 037e2ee9ccac..69106c600692 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2657,31 +2657,33 @@ static void shmem_tag_pins(struct address_space *mapping)
void **slot;
pgoff_t start;
struct page *page;
+ unsigned int tagged = 0;
lru_add_drain();
start = 0;
- rcu_read_lock();
+ spin_lock_irq(&mapping->tree_lock);
radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
- page = radix_tree_deref_slot(slot);
+ page = radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
if (!page || radix_tree_exception(page)) {
if (radix_tree_deref_retry(page)) {
slot = radix_tree_iter_retry(&iter);
continue;
}
} else if (page_count(page) - page_mapcount(page) > 1) {
- spin_lock_irq(&mapping->tree_lock);
radix_tree_tag_set(&mapping->page_tree, iter.index,
SHMEM_TAG_PINNED);
- spin_unlock_irq(&mapping->tree_lock);
}
- if (need_resched()) {
- slot = radix_tree_iter_resume(slot, &iter);
- cond_resched_rcu();
- }
+ if (++tagged % 1024)
+ continue;
+
+ slot = radix_tree_iter_resume(slot, &iter);
+ spin_unlock_irq(&mapping->tree_lock);
+ cond_resched();
+ spin_lock_irq(&mapping->tree_lock);
}
- rcu_read_unlock();
+ spin_unlock_irq(&mapping->tree_lock);
}
/*
@@ -2893,7 +2895,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
}
shmem_falloc.waitq = &shmem_falloc_waitq;
- shmem_falloc.start = unmap_start >> PAGE_SHIFT;
+ shmem_falloc.start = (u64)unmap_start >> PAGE_SHIFT;
shmem_falloc.next = (unmap_end + 1) >> PAGE_SHIFT;
spin_lock(&inode->i_lock);
inode->i_private = &shmem_falloc;
diff --git a/mm/slub.c b/mm/slub.c
index 220d42e592ef..07aeb129f3f8 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -4790,7 +4790,17 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
}
}
- get_online_mems();
+ /*
+ * It is impossible to take "mem_hotplug_lock" here with "kernfs_mutex"
+ * already held which will conflict with an existing lock order:
+ *
+ * mem_hotplug_lock->slab_mutex->kernfs_mutex
+ *
+ * We don't really need mem_hotplug_lock (to hold off
+ * slab_mem_going_offline_callback) here because slab's memory hot
+ * unplug code doesn't destroy the kmem_cache->node[] data.
+ */
+
#ifdef CONFIG_SLUB_DEBUG
if (flags & SO_ALL) {
struct kmem_cache_node *n;
@@ -4831,7 +4841,6 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
x += sprintf(buf + x, " N%d=%lu",
node, nodes[node]);
#endif
- put_online_mems();
kfree(nodes);
return x + sprintf(buf + x, "\n");
}
diff --git a/mm/usercopy.c b/mm/usercopy.c
index a9852b24715d..f8d74e09f8e4 100644
--- a/mm/usercopy.c
+++ b/mm/usercopy.c
@@ -15,6 +15,7 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/mm.h>
+#include <linux/highmem.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/sched/task.h>
@@ -121,7 +122,7 @@ static inline const char *check_kernel_text_object(const void *ptr,
static inline const char *check_bogus_address(const void *ptr, unsigned long n)
{
/* Reject if object wraps past end of memory. */
- if ((unsigned long)ptr + n < (unsigned long)ptr)
+ if ((unsigned long)ptr + (n - 1) < (unsigned long)ptr)
return "<wrapped address>";
/* Reject if NULL or ZERO-allocation. */
@@ -203,7 +204,12 @@ static inline const char *check_heap_object(const void *ptr, unsigned long n,
if (!virt_addr_valid(ptr))
return NULL;
- page = virt_to_head_page(ptr);
+ /*
+ * When CONFIG_HIGHMEM=y, kmap_to_page() will give either the
+ * highmem page or fallback to virt_to_page(). The following
+ * is effectively a highmem-aware virt_to_head_page().
+ */
+ page = compound_head(kmap_to_page((void *)ptr));
/* Check slab allocator for flags and size. */
if (PageSlab(page))
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 6c906f6f16cc..0b8852d80f44 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1766,6 +1766,12 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
return NULL;
/*
+ * First make sure the mappings are removed from all page-tables
+ * before they are freed.
+ */
+ vmalloc_sync_all();
+
+ /*
* In this function, newly allocated vm_struct has VM_UNINITIALIZED
* flag. It means that vm_struct is not fully initialized.
* Now, it is fully initialized, so remove this flag here.
@@ -2314,6 +2320,9 @@ EXPORT_SYMBOL(remap_vmalloc_range);
/*
* Implement a stub for vmalloc_sync_all() if the architecture chose not to
* have one.
+ *
+ * The purpose of this function is to make sure the vmalloc area
+ * mappings are identical in all page-tables in the system.
*/
void __weak vmalloc_sync_all(void)
{
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 7b140c967bca..0cc3c1eb15f5 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1393,7 +1393,7 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
list_for_each_entry_safe(page, next, page_list, lru) {
if (page_is_file_cache(page) && !PageDirty(page) &&
- !__PageMovable(page)) {
+ !__PageMovable(page) && !PageUnevictable(page)) {
ClearPageActive(page);
list_move(&page->lru, &clean_pages);
}
@@ -2120,7 +2120,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
* 10TB 320 32GB
*/
static bool inactive_list_is_low(struct lruvec *lruvec, bool file,
- struct scan_control *sc, bool actual_reclaim)
+ struct scan_control *sc, bool trace)
{
enum lru_list active_lru = file * LRU_FILE + LRU_ACTIVE;
struct pglist_data *pgdat = lruvec_pgdat(lruvec);
@@ -2146,7 +2146,7 @@ static bool inactive_list_is_low(struct lruvec *lruvec, bool file,
* rid of the stale workingset quickly.
*/
refaults = lruvec_page_state(lruvec, WORKINGSET_ACTIVATE);
- if (file && actual_reclaim && lruvec->refaults != refaults) {
+ if (file && lruvec->refaults != refaults) {
inactive_ratio = 0;
} else {
gb = (inactive + active) >> (30 - PAGE_SHIFT);
@@ -2156,7 +2156,7 @@ static bool inactive_list_is_low(struct lruvec *lruvec, bool file,
inactive_ratio = 1;
}
- if (actual_reclaim)
+ if (trace)
trace_mm_vmscan_inactive_list_is_low(pgdat->node_id, sc->reclaim_idx,
lruvec_lru_size(lruvec, inactive_lru, MAX_NR_ZONES), inactive,
lruvec_lru_size(lruvec, active_lru, MAX_NR_ZONES), active,
@@ -3439,19 +3439,18 @@ out:
}
/*
- * pgdat->kswapd_classzone_idx is the highest zone index that a recent
- * allocation request woke kswapd for. When kswapd has not woken recently,
- * the value is MAX_NR_ZONES which is not a valid index. This compares a
- * given classzone and returns it or the highest classzone index kswapd
- * was recently woke for.
+ * The pgdat->kswapd_classzone_idx is used to pass the highest zone index to be
+ * reclaimed by kswapd from the waker. If the value is MAX_NR_ZONES which is not
+ * a valid index then either kswapd runs for first time or kswapd couldn't sleep
+ * after previous reclaim attempt (node is still unbalanced). In that case
+ * return the zone index of the previous kswapd reclaim cycle.
*/
static enum zone_type kswapd_classzone_idx(pg_data_t *pgdat,
- enum zone_type classzone_idx)
+ enum zone_type prev_classzone_idx)
{
if (pgdat->kswapd_classzone_idx == MAX_NR_ZONES)
- return classzone_idx;
-
- return max(pgdat->kswapd_classzone_idx, classzone_idx);
+ return prev_classzone_idx;
+ return pgdat->kswapd_classzone_idx;
}
static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_order,
@@ -3592,7 +3591,7 @@ kswapd_try_sleep:
/* Read the new order and classzone_idx */
alloc_order = reclaim_order = pgdat->kswapd_order;
- classzone_idx = kswapd_classzone_idx(pgdat, 0);
+ classzone_idx = kswapd_classzone_idx(pgdat, classzone_idx);
pgdat->kswapd_order = 0;
pgdat->kswapd_classzone_idx = MAX_NR_ZONES;
@@ -3643,8 +3642,12 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
if (!cpuset_zone_allowed(zone, GFP_KERNEL | __GFP_HARDWALL))
return;
pgdat = zone->zone_pgdat;
- pgdat->kswapd_classzone_idx = kswapd_classzone_idx(pgdat,
- classzone_idx);
+
+ if (pgdat->kswapd_classzone_idx == MAX_NR_ZONES)
+ pgdat->kswapd_classzone_idx = classzone_idx;
+ else
+ pgdat->kswapd_classzone_idx = max(pgdat->kswapd_classzone_idx,
+ classzone_idx);
pgdat->kswapd_order = max(pgdat->kswapd_order, order);
if (!waitqueue_active(&pgdat->kswapd_wait))
return;
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 28c45c26f901..e2197b03da57 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1805,12 +1805,13 @@ static bool need_update(int cpu)
/*
* The fast way of checking if there are any vmstat diffs.
- * This works because the diffs are byte sized items.
*/
- if (memchr_inv(p->vm_stat_diff, 0, NR_VM_ZONE_STAT_ITEMS))
+ if (memchr_inv(p->vm_stat_diff, 0, NR_VM_ZONE_STAT_ITEMS *
+ sizeof(p->vm_stat_diff[0])))
return true;
#ifdef CONFIG_NUMA
- if (memchr_inv(p->vm_numa_stat_diff, 0, NR_VM_NUMA_STAT_ITEMS))
+ if (memchr_inv(p->vm_numa_stat_diff, 0, NR_VM_NUMA_STAT_ITEMS *
+ sizeof(p->vm_numa_stat_diff[0])))
return true;
#endif
}
@@ -1951,7 +1952,7 @@ void __init init_mm_internals(void)
#endif
#ifdef CONFIG_PROC_FS
proc_create("buddyinfo", 0444, NULL, &buddyinfo_file_operations);
- proc_create("pagetypeinfo", 0444, NULL, &pagetypeinfo_file_operations);
+ proc_create("pagetypeinfo", 0400, NULL, &pagetypeinfo_file_operations);
proc_create("vmstat", 0444, NULL, &vmstat_file_operations);
proc_create("zoneinfo", 0444, NULL, &zoneinfo_file_operations);
#endif
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 685049a9048d..3197de2a3896 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -52,6 +52,7 @@
#include <linux/zpool.h>
#include <linux/mount.h>
#include <linux/migrate.h>
+#include <linux/wait.h>
#include <linux/pagemap.h>
#define ZSPAGE_MAGIC 0x58
@@ -267,6 +268,10 @@ struct zs_pool {
#ifdef CONFIG_COMPACTION
struct inode *inode;
struct work_struct free_work;
+ /* A wait queue for when migration races with async_free_zspage() */
+ struct wait_queue_head migration_wait;
+ atomic_long_t isolated_pages;
+ bool destroying;
#endif
};
@@ -1878,6 +1883,31 @@ static void dec_zspage_isolation(struct zspage *zspage)
zspage->isolated--;
}
+static void putback_zspage_deferred(struct zs_pool *pool,
+ struct size_class *class,
+ struct zspage *zspage)
+{
+ enum fullness_group fg;
+
+ fg = putback_zspage(class, zspage);
+ if (fg == ZS_EMPTY)
+ schedule_work(&pool->free_work);
+
+}
+
+static inline void zs_pool_dec_isolated(struct zs_pool *pool)
+{
+ VM_BUG_ON(atomic_long_read(&pool->isolated_pages) <= 0);
+ atomic_long_dec(&pool->isolated_pages);
+ /*
+ * There's no possibility of racing, since wait_for_isolated_drain()
+ * checks the isolated count under &class->lock after enqueuing
+ * on migration_wait.
+ */
+ if (atomic_long_read(&pool->isolated_pages) == 0 && pool->destroying)
+ wake_up_all(&pool->migration_wait);
+}
+
static void replace_sub_page(struct size_class *class, struct zspage *zspage,
struct page *newpage, struct page *oldpage)
{
@@ -1947,6 +1977,7 @@ bool zs_page_isolate(struct page *page, isolate_mode_t mode)
*/
if (!list_empty(&zspage->list) && !is_zspage_isolated(zspage)) {
get_zspage_mapping(zspage, &class_idx, &fullness);
+ atomic_long_inc(&pool->isolated_pages);
remove_zspage(class, zspage, fullness);
}
@@ -2046,8 +2077,16 @@ int zs_page_migrate(struct address_space *mapping, struct page *newpage,
* Page migration is done so let's putback isolated zspage to
* the list if @page is final isolated subpage in the zspage.
*/
- if (!is_zspage_isolated(zspage))
- putback_zspage(class, zspage);
+ if (!is_zspage_isolated(zspage)) {
+ /*
+ * We cannot race with zs_destroy_pool() here because we wait
+ * for isolation to hit zero before we start destroying.
+ * Also, we ensure that everyone can see pool->destroying before
+ * we start waiting.
+ */
+ putback_zspage_deferred(pool, class, zspage);
+ zs_pool_dec_isolated(pool);
+ }
reset_page(page);
put_page(page);
@@ -2093,13 +2132,12 @@ void zs_page_putback(struct page *page)
spin_lock(&class->lock);
dec_zspage_isolation(zspage);
if (!is_zspage_isolated(zspage)) {
- fg = putback_zspage(class, zspage);
/*
* Due to page_lock, we cannot free zspage immediately
* so let's defer.
*/
- if (fg == ZS_EMPTY)
- schedule_work(&pool->free_work);
+ putback_zspage_deferred(pool, class, zspage);
+ zs_pool_dec_isolated(pool);
}
spin_unlock(&class->lock);
}
@@ -2123,8 +2161,36 @@ static int zs_register_migration(struct zs_pool *pool)
return 0;
}
+static bool pool_isolated_are_drained(struct zs_pool *pool)
+{
+ return atomic_long_read(&pool->isolated_pages) == 0;
+}
+
+/* Function for resolving migration */
+static void wait_for_isolated_drain(struct zs_pool *pool)
+{
+
+ /*
+ * We're in the process of destroying the pool, so there are no
+ * active allocations. zs_page_isolate() fails for completely free
+ * zspages, so we need only wait for the zs_pool's isolated
+ * count to hit zero.
+ */
+ wait_event(pool->migration_wait,
+ pool_isolated_are_drained(pool));
+}
+
static void zs_unregister_migration(struct zs_pool *pool)
{
+ pool->destroying = true;
+ /*
+ * We need a memory barrier here to ensure global visibility of
+ * pool->destroying. Thus pool->isolated pages will either be 0 in which
+ * case we don't care, or it will be > 0 and pool->destroying will
+ * ensure that we wake up once isolation hits 0.
+ */
+ smp_mb();
+ wait_for_isolated_drain(pool); /* This can block */
flush_work(&pool->free_work);
iput(pool->inode);
}
@@ -2365,6 +2431,10 @@ struct zs_pool *zs_create_pool(const char *name)
if (!pool->name)
goto err;
+#ifdef CONFIG_COMPACTION
+ init_waitqueue_head(&pool->migration_wait);
+#endif
+
if (create_cache(pool))
goto err;