diff options
author | Marcel Ziswiler <marcel.ziswiler@toradex.com> | 2020-05-21 00:54:36 +0200 |
---|---|---|
committer | Marcel Ziswiler <marcel.ziswiler@toradex.com> | 2020-05-21 00:54:36 +0200 |
commit | 135d39dcedcedd1f44ea0bba52f15ac5922c114f (patch) | |
tree | 12379fc8ef7489eaca1c7245f8bd6af74c619a8f /mm | |
parent | 187764bd111b27783b6d68ffb3b3dbb3a9bafd38 (diff) | |
parent | 1279cd128bba968ebe0a2df7f7ae38bae90250ef (diff) |
Merge remote-tracking branch 'remotes/fslc/4.9-2.3.x-imx' into toradex_4.9-2.3.x-imx-next
Conflicts:
sound/soc/codecs/sgtl5000.c
sound/soc/fsl/imx-sgtl5000.c
Diffstat (limited to 'mm')
-rw-r--r-- | mm/backing-dev.c | 1 | ||||
-rw-r--r-- | mm/cma.c | 21 | ||||
-rw-r--r-- | mm/cma_debug.c | 2 | ||||
-rw-r--r-- | mm/filemap.c | 3 | ||||
-rw-r--r-- | mm/gup.c | 54 | ||||
-rw-r--r-- | mm/huge_memory.c | 6 | ||||
-rw-r--r-- | mm/hugetlb.c | 56 | ||||
-rw-r--r-- | mm/hugetlb_cgroup.c | 2 | ||||
-rw-r--r-- | mm/internal.h | 10 | ||||
-rw-r--r-- | mm/kasan/kasan.c | 9 | ||||
-rw-r--r-- | mm/kasan/kasan_init.c | 15 | ||||
-rw-r--r-- | mm/kasan/report.c | 1 | ||||
-rw-r--r-- | mm/khugepaged.c | 3 | ||||
-rw-r--r-- | mm/kmemleak.c | 2 | ||||
-rw-r--r-- | mm/ksm.c | 14 | ||||
-rw-r--r-- | mm/list_lru.c | 10 | ||||
-rw-r--r-- | mm/memcontrol.c | 65 | ||||
-rw-r--r-- | mm/memory_hotplug.c | 22 | ||||
-rw-r--r-- | mm/mempolicy.c | 30 | ||||
-rw-r--r-- | mm/mincore.c | 23 | ||||
-rw-r--r-- | mm/mlock.c | 4 | ||||
-rw-r--r-- | mm/mmap.c | 12 | ||||
-rw-r--r-- | mm/mmu_notifier.c | 2 | ||||
-rw-r--r-- | mm/nommu.c | 10 | ||||
-rw-r--r-- | mm/page-writeback.c | 37 | ||||
-rw-r--r-- | mm/page_alloc.c | 14 | ||||
-rw-r--r-- | mm/page_ext.c | 1 | ||||
-rw-r--r-- | mm/page_idle.c | 4 | ||||
-rw-r--r-- | mm/percpu.c | 8 | ||||
-rw-r--r-- | mm/shmem.c | 22 | ||||
-rw-r--r-- | mm/slab.c | 23 | ||||
-rw-r--r-- | mm/slub.c | 54 | ||||
-rw-r--r-- | mm/usercopy.c | 10 | ||||
-rw-r--r-- | mm/vmalloc.c | 24 | ||||
-rw-r--r-- | mm/vmstat.c | 7 | ||||
-rw-r--r-- | mm/zsmalloc.c | 85 |
36 files changed, 482 insertions, 184 deletions
diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 6ff2d7744223..113b7d317079 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -669,6 +669,7 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi) INIT_RADIX_TREE(&bdi->cgwb_tree, GFP_ATOMIC); bdi->cgwb_congested_tree = RB_ROOT; atomic_set(&bdi->usage_cnt, 1); + init_rwsem(&bdi->wb_switch_rwsem); ret = wb_init(&bdi->wb, bdi, 1, GFP_KERNEL); if (!ret) { @@ -100,8 +100,10 @@ static int __init cma_activate_area(struct cma *cma) cma->bitmap = kzalloc(bitmap_size, GFP_KERNEL); - if (!cma->bitmap) + if (!cma->bitmap) { + cma->count = 0; return -ENOMEM; + } WARN_ON_ONCE(!pfn_valid(pfn)); zone = page_zone(pfn_to_page(pfn)); @@ -266,6 +268,12 @@ int __init cma_declare_contiguous(phys_addr_t base, */ alignment = max(alignment, (phys_addr_t)PAGE_SIZE << max_t(unsigned long, MAX_ORDER - 1, pageblock_order)); + if (fixed && base & (alignment - 1)) { + ret = -EINVAL; + pr_err("Region at %pa must be aligned to %pa bytes\n", + &base, &alignment); + goto err; + } base = ALIGN(base, alignment); size = ALIGN(size, alignment); limit &= ~(alignment - 1); @@ -296,6 +304,13 @@ int __init cma_declare_contiguous(phys_addr_t base, if (limit == 0 || limit > memblock_end) limit = memblock_end; + if (base + size > limit) { + ret = -EINVAL; + pr_err("Size (%pa) of region at %pa exceeds limit (%pa)\n", + &size, &base, &limit); + goto err; + } + /* Reserve memory */ if (fixed) { if (memblock_is_region_reserved(base, size) || @@ -339,12 +354,14 @@ int __init cma_declare_contiguous(phys_addr_t base, ret = cma_init_reserved_mem(base, size, order_per_bit, res_cma); if (ret) - goto err; + goto free_mem; pr_info("Reserved %ld MiB at %pa\n", (unsigned long)size / SZ_1M, &base); return 0; +free_mem: + memblock_free(base, size); err: pr_err("Failed to reserve %ld MiB\n", (unsigned long)size / SZ_1M); return ret; diff --git a/mm/cma_debug.c b/mm/cma_debug.c index f8e4b60db167..da50dab56b70 100644 --- a/mm/cma_debug.c +++ b/mm/cma_debug.c @@ -57,7 +57,7 @@ static int cma_maxchunk_get(void *data, u64 *val) mutex_lock(&cma->lock); for (;;) { start = find_next_zero_bit(cma->bitmap, bitmap_maxno, end); - if (start >= cma->count) + if (start >= bitmap_maxno) break; end = find_next_bit(cma->bitmap, bitmap_maxno, start); maxchunk = max(end - start, maxchunk); diff --git a/mm/filemap.c b/mm/filemap.c index 6d2f561d517c..b046d8f147e2 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -383,7 +383,8 @@ int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start, .range_end = end, }; - if (!mapping_cap_writeback_dirty(mapping)) + if (!mapping_cap_writeback_dirty(mapping) || + !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) return 0; wbc_attach_fdatawrite_inode(&wbc, mapping->host); @@ -153,7 +153,10 @@ retry: } if (flags & FOLL_GET) { - get_page(page); + if (unlikely(!try_get_page(page))) { + page = ERR_PTR(-ENOMEM); + goto out; + } /* drop the pgmap reference now that we hold the page */ if (pgmap) { @@ -292,7 +295,10 @@ struct page *follow_page_mask(struct vm_area_struct *vma, if (pmd_trans_unstable(pmd)) ret = -EBUSY; } else { - get_page(page); + if (unlikely(!try_get_page(page))) { + spin_unlock(ptl); + return ERR_PTR(-ENOMEM); + } spin_unlock(ptl); lock_page(page); ret = split_huge_page(page); @@ -348,7 +354,10 @@ static int get_gate_page(struct mm_struct *mm, unsigned long address, goto unmap; *page = pte_page(*pte); } - get_page(*page); + if (unlikely(!try_get_page(*page))) { + ret = -ENOMEM; + goto unmap; + } out: ret = 0; unmap: @@ -1231,6 +1240,20 @@ struct page *get_dump_page(unsigned long addr) */ #ifdef CONFIG_HAVE_GENERIC_RCU_GUP +/* + * Return the compund head page with ref appropriately incremented, + * or NULL if that failed. + */ +static inline struct page *try_get_compound_head(struct page *page, int refs) +{ + struct page *head = compound_head(page); + if (WARN_ON_ONCE(page_ref_count(head) < 0)) + return NULL; + if (unlikely(!page_cache_add_speculative(head, refs))) + return NULL; + return head; +} + #ifdef __HAVE_ARCH_PTE_SPECIAL static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) @@ -1263,9 +1286,9 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, VM_BUG_ON(!pfn_valid(pte_pfn(pte))); page = pte_page(pte); - head = compound_head(page); - if (!page_cache_get_speculative(head)) + head = try_get_compound_head(page, 1); + if (!head) goto pte_unmap; if (unlikely(pte_val(pte) != pte_val(*ptep))) { @@ -1313,17 +1336,16 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, return 0; refs = 0; - head = pmd_page(orig); - page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT); + page = pmd_page(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT); do { - VM_BUG_ON_PAGE(compound_head(page) != head, page); pages[*nr] = page; (*nr)++; page++; refs++; } while (addr += PAGE_SIZE, addr != end); - if (!page_cache_add_speculative(head, refs)) { + head = try_get_compound_head(pmd_page(orig), refs); + if (!head) { *nr -= refs; return 0; } @@ -1348,17 +1370,16 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr, return 0; refs = 0; - head = pud_page(orig); - page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT); + page = pud_page(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT); do { - VM_BUG_ON_PAGE(compound_head(page) != head, page); pages[*nr] = page; (*nr)++; page++; refs++; } while (addr += PAGE_SIZE, addr != end); - if (!page_cache_add_speculative(head, refs)) { + head = try_get_compound_head(pud_page(orig), refs); + if (!head) { *nr -= refs; return 0; } @@ -1384,17 +1405,16 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr, return 0; refs = 0; - head = pgd_page(orig); - page = head + ((addr & ~PGDIR_MASK) >> PAGE_SHIFT); + page = pgd_page(orig) + ((addr & ~PGDIR_MASK) >> PAGE_SHIFT); do { - VM_BUG_ON_PAGE(compound_head(page) != head, page); pages[*nr] = page; (*nr)++; page++; refs++; } while (addr += PAGE_SIZE, addr != end); - if (!page_cache_add_speculative(head, refs)) { + head = try_get_compound_head(pgd_page(orig), refs); + if (!head) { *nr -= refs; return 0; } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 7ea8da990b9d..5fbd77d52602 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -30,6 +30,7 @@ #include <linux/userfaultfd_k.h> #include <linux/page_idle.h> #include <linux/shmem_fs.h> +#include <linux/page_owner.h> #include <asm/tlb.h> #include <asm/pgalloc.h> @@ -1950,6 +1951,9 @@ static void __split_huge_page(struct page *page, struct list_head *list, } ClearPageCompound(head); + + split_page_owner(head, HPAGE_PMD_ORDER); + /* See comment in __split_huge_page_tail() */ if (PageAnon(head)) { page_ref_inc(head); @@ -2091,7 +2095,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) unsigned long flags; pgoff_t end; - VM_BUG_ON_PAGE(is_huge_zero_page(page), page); + VM_BUG_ON_PAGE(is_huge_zero_page(head), head); VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(!PageSwapBacked(page), page); VM_BUG_ON_PAGE(!PageCompound(page), page); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 8b682da98d95..9914da93069e 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1247,12 +1247,23 @@ void free_huge_page(struct page *page) ClearPagePrivate(page); /* - * A return code of zero implies that the subpool will be under its - * minimum size if the reservation is not restored after page is free. - * Therefore, force restore_reserve operation. + * If PagePrivate() was set on page, page allocation consumed a + * reservation. If the page was associated with a subpool, there + * would have been a page reserved in the subpool before allocation + * via hugepage_subpool_get_pages(). Since we are 'restoring' the + * reservtion, do not call hugepage_subpool_put_pages() as this will + * remove the reserved page from the subpool. */ - if (hugepage_subpool_put_pages(spool, 1) == 0) - restore_reserve = true; + if (!restore_reserve) { + /* + * A return code of zero implies that the subpool will be + * under its minimum size if the reservation is not restored + * after page is free. Therefore, force restore_reserve + * operation. + */ + if (hugepage_subpool_put_pages(spool, 1) == 0) + restore_reserve = true; + } spin_lock(&hugetlb_lock); clear_page_huge_active(page); @@ -3812,21 +3823,14 @@ backout_unlocked: } #ifdef CONFIG_SMP -u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm, - struct vm_area_struct *vma, - struct address_space *mapping, +u32 hugetlb_fault_mutex_hash(struct hstate *h, struct address_space *mapping, pgoff_t idx, unsigned long address) { unsigned long key[2]; u32 hash; - if (vma->vm_flags & VM_SHARED) { - key[0] = (unsigned long) mapping; - key[1] = idx; - } else { - key[0] = (unsigned long) mm; - key[1] = address >> huge_page_shift(h); - } + key[0] = (unsigned long) mapping; + key[1] = idx; hash = jhash2((u32 *)&key, sizeof(key)/sizeof(u32), 0); @@ -3837,9 +3841,7 @@ u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm, * For uniprocesor systems we always use a single mutex, so just * return 0 and avoid the hashing overhead. */ -u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm, - struct vm_area_struct *vma, - struct address_space *mapping, +u32 hugetlb_fault_mutex_hash(struct hstate *h, struct address_space *mapping, pgoff_t idx, unsigned long address) { return 0; @@ -3885,7 +3887,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, * get spurious allocation failures if two CPUs race to instantiate * the same page in the page cache. */ - hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, idx, address); + hash = hugetlb_fault_mutex_hash(h, mapping, idx, address); mutex_lock(&hugetlb_fault_mutex_table[hash]); entry = huge_ptep_get(ptep); @@ -3993,6 +3995,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long vaddr = *position; unsigned long remainder = *nr_pages; struct hstate *h = hstate_vma(vma); + int err = -EFAULT; while (vaddr < vma->vm_end && remainder) { pte_t *pte; @@ -4064,6 +4067,19 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT; page = pte_page(huge_ptep_get(pte)); + + /* + * Instead of doing 'try_get_page()' below in the same_page + * loop, just check the count once here. + */ + if (unlikely(page_count(page) <= 0)) { + if (pages) { + spin_unlock(ptl); + remainder = 0; + err = -ENOMEM; + break; + } + } same_page: if (pages) { pages[i] = mem_map_offset(page, pfn_offset); @@ -4090,7 +4106,7 @@ same_page: *nr_pages = remainder; *position = vaddr; - return i ? i : -EFAULT; + return i ? i : err; } #ifndef __HAVE_ARCH_FLUSH_HUGETLB_TLB_RANGE diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index eec1150125b9..e430e04997ee 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -196,7 +196,7 @@ int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages, again: rcu_read_lock(); h_cg = hugetlb_cgroup_from_task(current); - if (!css_tryget_online(&h_cg->css)) { + if (!css_tryget(&h_cg->css)) { rcu_read_unlock(); goto again; } diff --git a/mm/internal.h b/mm/internal.h index 3e2d01694747..f6df7cb8cbc0 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -442,6 +442,16 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn, #define NODE_RECLAIM_SOME 0 #define NODE_RECLAIM_SUCCESS 1 +#ifdef CONFIG_NUMA +extern int node_reclaim(struct pglist_data *, gfp_t, unsigned int); +#else +static inline int node_reclaim(struct pglist_data *pgdat, gfp_t mask, + unsigned int order) +{ + return NODE_RECLAIM_NOSCAN; +} +#endif + extern int hwpoison_filter(struct page *p); extern u32 hwpoison_filter_dev_major; diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index 4ce386c44cf1..1169c1fe941f 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c @@ -80,7 +80,14 @@ void kasan_unpoison_task_stack(struct task_struct *task) /* Unpoison the stack for the current task beyond a watermark sp value. */ asmlinkage void kasan_unpoison_task_stack_below(const void *watermark) { - __kasan_unpoison_stack(current, watermark); + /* + * Calculate the task stack base address. Avoid using 'current' + * because this function is called by early resume code which hasn't + * yet set up the percpu register (%gs). + */ + void *base = (void *)((unsigned long)watermark & ~(THREAD_SIZE - 1)); + + kasan_unpoison_shadow(base, watermark - base); } /* diff --git a/mm/kasan/kasan_init.c b/mm/kasan/kasan_init.c index 3f9a41cf0ac6..31238dad85fb 100644 --- a/mm/kasan/kasan_init.c +++ b/mm/kasan/kasan_init.c @@ -15,6 +15,7 @@ #include <linux/kasan.h> #include <linux/kernel.h> #include <linux/memblock.h> +#include <linux/mm.h> #include <linux/pfn.h> #include <asm/page.h> @@ -49,7 +50,7 @@ static void __init zero_pte_populate(pmd_t *pmd, unsigned long addr, pte_t *pte = pte_offset_kernel(pmd, addr); pte_t zero_pte; - zero_pte = pfn_pte(PFN_DOWN(__pa(kasan_zero_page)), PAGE_KERNEL); + zero_pte = pfn_pte(PFN_DOWN(__pa_symbol(kasan_zero_page)), PAGE_KERNEL); zero_pte = pte_wrprotect(zero_pte); while (addr + PAGE_SIZE <= end) { @@ -69,7 +70,7 @@ static void __init zero_pmd_populate(pud_t *pud, unsigned long addr, next = pmd_addr_end(addr, end); if (IS_ALIGNED(addr, PMD_SIZE) && end - addr >= PMD_SIZE) { - pmd_populate_kernel(&init_mm, pmd, kasan_zero_pte); + pmd_populate_kernel(&init_mm, pmd, lm_alias(kasan_zero_pte)); continue; } @@ -92,9 +93,9 @@ static void __init zero_pud_populate(pgd_t *pgd, unsigned long addr, if (IS_ALIGNED(addr, PUD_SIZE) && end - addr >= PUD_SIZE) { pmd_t *pmd; - pud_populate(&init_mm, pud, kasan_zero_pmd); + pud_populate(&init_mm, pud, lm_alias(kasan_zero_pmd)); pmd = pmd_offset(pud, addr); - pmd_populate_kernel(&init_mm, pmd, kasan_zero_pte); + pmd_populate_kernel(&init_mm, pmd, lm_alias(kasan_zero_pte)); continue; } @@ -135,11 +136,11 @@ void __init kasan_populate_zero_shadow(const void *shadow_start, * puds,pmds, so pgd_populate(), pud_populate() * is noops. */ - pgd_populate(&init_mm, pgd, kasan_zero_pud); + pgd_populate(&init_mm, pgd, lm_alias(kasan_zero_pud)); pud = pud_offset(pgd, addr); - pud_populate(&init_mm, pud, kasan_zero_pmd); + pud_populate(&init_mm, pud, lm_alias(kasan_zero_pmd)); pmd = pmd_offset(pud, addr); - pmd_populate_kernel(&init_mm, pmd, kasan_zero_pte); + pmd_populate_kernel(&init_mm, pmd, lm_alias(kasan_zero_pte)); continue; } diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 8ca412aebcf1..c505ac5b2d46 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -302,6 +302,7 @@ void kasan_report(unsigned long addr, size_t size, disable_trace_on_warning(); info.access_addr = (void *)addr; + info.first_bad_addr = (void *)addr; info.access_size = size; info.is_write = is_write; info.ip = ip; diff --git a/mm/khugepaged.c b/mm/khugepaged.c index e0cfc3a54b6a..8217ee5d66ef 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1004,6 +1004,9 @@ static void collapse_huge_page(struct mm_struct *mm, * handled by the anon_vma lock + PG_lock. */ down_write(&mm->mmap_sem); + result = SCAN_ANY_PROCESS; + if (!mmget_still_valid(mm)) + goto out; result = hugepage_vma_revalidate(mm, address, &vma); if (result) goto out; diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 9e66449ed91f..d05133b37b17 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -569,7 +569,7 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size, if (in_irq()) { object->pid = 0; strncpy(object->comm, "hardirq", sizeof(object->comm)); - } else if (in_softirq()) { + } else if (in_serving_softirq()) { object->pid = 0; strncpy(object->comm, "softirq", sizeof(object->comm)); } else { @@ -710,13 +710,13 @@ static int remove_stable_node(struct stable_node *stable_node) return 0; } - if (WARN_ON_ONCE(page_mapped(page))) { - /* - * This should not happen: but if it does, just refuse to let - * merge_across_nodes be switched - there is no need to panic. - */ - err = -EBUSY; - } else { + /* + * Page could be still mapped if this races with __mmput() running in + * between ksm_exit() and exit_mmap(). Just refuse to let + * merge_across_nodes/max_page_sharing be switched. + */ + err = -EBUSY; + if (!page_mapped(page)) { /* * The stable node did not yet appear stale to get_ksm_page(), * since that allows for an unmapped ksm page to be recognized diff --git a/mm/list_lru.c b/mm/list_lru.c index 7a40fa2be858..16361c989af9 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -42,11 +42,7 @@ static void list_lru_unregister(struct list_lru *lru) #if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) static inline bool list_lru_memcg_aware(struct list_lru *lru) { - /* - * This needs node 0 to be always present, even - * in the systems supporting sparse numa ids. - */ - return !!lru->node[0].memcg_lrus; + return lru->memcg_aware; } static inline struct list_lru_one * @@ -317,7 +313,7 @@ static int __memcg_init_list_lru_node(struct list_lru_memcg *memcg_lrus, } return 0; fail: - __memcg_destroy_list_lru_node(memcg_lrus, begin, i - 1); + __memcg_destroy_list_lru_node(memcg_lrus, begin, i); return -ENOMEM; } @@ -389,6 +385,8 @@ static int memcg_init_list_lru(struct list_lru *lru, bool memcg_aware) { int i; + lru->memcg_aware = memcg_aware; + if (!memcg_aware) return 0; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 86a6b331b964..2f7f934bf435 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -741,7 +741,7 @@ static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm) if (unlikely(!memcg)) memcg = root_mem_cgroup; } - } while (!css_tryget_online(&memcg->css)); + } while (!css_tryget(&memcg->css)); rcu_read_unlock(); return memcg; } @@ -887,26 +887,45 @@ void mem_cgroup_iter_break(struct mem_cgroup *root, css_put(&prev->css); } -static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg) +static void __invalidate_reclaim_iterators(struct mem_cgroup *from, + struct mem_cgroup *dead_memcg) { - struct mem_cgroup *memcg = dead_memcg; struct mem_cgroup_reclaim_iter *iter; struct mem_cgroup_per_node *mz; int nid; int i; - for (; memcg; memcg = parent_mem_cgroup(memcg)) { - for_each_node(nid) { - mz = mem_cgroup_nodeinfo(memcg, nid); - for (i = 0; i <= DEF_PRIORITY; i++) { - iter = &mz->iter[i]; - cmpxchg(&iter->position, - dead_memcg, NULL); - } + for_each_node(nid) { + mz = mem_cgroup_nodeinfo(from, nid); + for (i = 0; i <= DEF_PRIORITY; i++) { + iter = &mz->iter[i]; + cmpxchg(&iter->position, + dead_memcg, NULL); } } } +static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg) +{ + struct mem_cgroup *memcg = dead_memcg; + struct mem_cgroup *last; + + do { + __invalidate_reclaim_iterators(memcg, dead_memcg); + last = memcg; + } while ((memcg = parent_mem_cgroup(memcg))); + + /* + * When cgruop1 non-hierarchy mode is used, + * parent_mem_cgroup() does not walk all the way up to the + * cgroup root (root_mem_cgroup). So we have to handle + * dead_memcg from cgroup root separately. + */ + if (last != root_mem_cgroup) + __invalidate_reclaim_iterators(root_mem_cgroup, + dead_memcg); +} + /* * Iteration constructs for visiting all cgroups (under a tree). If * loops are exited prematurely (break), mem_cgroup_iter_break() must @@ -2306,6 +2325,16 @@ int memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order, if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !page_counter_try_charge(&memcg->kmem, nr_pages, &counter)) { + + /* + * Enforce __GFP_NOFAIL allocation because callers are not + * prepared to see failures and likely do not have any failure + * handling code. + */ + if (gfp & __GFP_NOFAIL) { + page_counter_charge(&memcg->kmem, nr_pages); + return 0; + } cancel_charge(memcg, nr_pages); return -ENOMEM; } @@ -3452,7 +3481,7 @@ static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, struct mem_cgroup_thresholds *thresholds; struct mem_cgroup_threshold_ary *new; unsigned long usage; - int i, j, size; + int i, j, size, entries; mutex_lock(&memcg->thresholds_lock); @@ -3472,14 +3501,20 @@ static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, __mem_cgroup_threshold(memcg, type == _MEMSWAP); /* Calculate new number of threshold */ - size = 0; + size = entries = 0; for (i = 0; i < thresholds->primary->size; i++) { if (thresholds->primary->entries[i].eventfd != eventfd) size++; + else + entries++; } new = thresholds->spare; + /* If no items related to eventfd have been cleared, nothing to do */ + if (!entries) + goto unlock; + /* Set thresholds array to NULL if we don't have thresholds */ if (!size) { kfree(new); @@ -5697,6 +5732,10 @@ void mem_cgroup_sk_alloc(struct sock *sk) return; } + /* Do not associate the sock with unrelated interrupted task's memcg. */ + if (in_interrupt()) + return; + rcu_read_lock(); memcg = mem_cgroup_from_task(current); if (memcg == root_mem_cgroup) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index b4c8d7b9ab82..449999657c0b 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1340,7 +1340,12 @@ static int online_memory_block(struct memory_block *mem, void *arg) return memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE); } -/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */ +/* + * NOTE: The caller must call lock_device_hotplug() to serialize hotplug + * and online/offline operations (triggered e.g. by sysfs). + * + * we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG + */ int __ref add_memory_resource(int nid, struct resource *res, bool online) { u64 start, size; @@ -1418,9 +1423,9 @@ out: mem_hotplug_done(); return ret; } -EXPORT_SYMBOL_GPL(add_memory_resource); -int __ref add_memory(int nid, u64 start, u64 size) +/* requires device_hotplug_lock, see add_memory_resource() */ +int __ref __add_memory(int nid, u64 start, u64 size) { struct resource *res; int ret; @@ -1434,6 +1439,17 @@ int __ref add_memory(int nid, u64 start, u64 size) release_memory_resource(res); return ret; } + +int add_memory(int nid, u64 start, u64 size) +{ + int rc; + + lock_device_hotplug(); + rc = __add_memory(nid, start, size); + unlock_device_hotplug(); + + return rc; +} EXPORT_SYMBOL_GPL(add_memory); #ifdef CONFIG_MEMORY_HOTREMOVE diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 593b74bed59b..a2be65bf5d8c 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -547,11 +547,16 @@ retry: goto retry; } - migrate_page_add(page, qp->pagelist, flags); + if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) { + if (!vma_migratable(vma)) + break; + migrate_page_add(page, qp->pagelist, flags); + } else + break; } pte_unmap_unlock(pte - 1, ptl); cond_resched(); - return 0; + return addr != end ? -EIO : 0; } static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask, @@ -623,7 +628,12 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end, unsigned long endvma = vma->vm_end; unsigned long flags = qp->flags; - if (!vma_migratable(vma)) + /* + * Need check MPOL_MF_STRICT to return -EIO if possible + * regardless of vma_migratable + */ + if (!vma_migratable(vma) && + !(flags & MPOL_MF_STRICT)) return 1; if (endvma > end) @@ -650,7 +660,7 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end, } /* queue pages from current vma */ - if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) + if (flags & MPOL_MF_VALID) return 0; return 1; } @@ -2734,6 +2744,9 @@ int mpol_parse_str(char *str, struct mempolicy **mpol) char *flags = strchr(str, '='); int err = 1; + if (flags) + *flags++ = '\0'; /* terminate mode string */ + if (nodelist) { /* NUL-terminate mode or flags string */ *nodelist++ = '\0'; @@ -2744,9 +2757,6 @@ int mpol_parse_str(char *str, struct mempolicy **mpol) } else nodes_clear(nodes); - if (flags) - *flags++ = '\0'; /* terminate mode string */ - for (mode = 0; mode < MPOL_MAX; mode++) { if (!strcmp(str, policy_modes[mode])) { break; @@ -2758,7 +2768,9 @@ int mpol_parse_str(char *str, struct mempolicy **mpol) switch (mode) { case MPOL_PREFERRED: /* - * Insist on a nodelist of one node only + * Insist on a nodelist of one node only, although later + * we use first_node(nodes) to grab a single node, so here + * nodelist (or nodes) cannot be empty. */ if (nodelist) { char *rest = nodelist; @@ -2766,6 +2778,8 @@ int mpol_parse_str(char *str, struct mempolicy **mpol) rest++; if (*rest) goto out; + if (nodes_empty(nodes)) + goto out; } break; case MPOL_INTERLEAVE: diff --git a/mm/mincore.c b/mm/mincore.c index bfb866435478..3b6a883d0926 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -167,6 +167,22 @@ out: return 0; } +static inline bool can_do_mincore(struct vm_area_struct *vma) +{ + if (vma_is_anonymous(vma)) + return true; + if (!vma->vm_file) + return false; + /* + * Reveal pagecache information only for non-anonymous mappings that + * correspond to the files the calling process could (if tried) open + * for writing; otherwise we'd be including shared non-exclusive + * mappings, which opens a side channel. + */ + return inode_owner_or_capable(file_inode(vma->vm_file)) || + inode_permission(file_inode(vma->vm_file), MAY_WRITE) == 0; +} + /* * Do a chunk of "sys_mincore()". We've already checked * all the arguments, we hold the mmap semaphore: we should @@ -187,8 +203,13 @@ static long do_mincore(unsigned long addr, unsigned long pages, unsigned char *v vma = find_vma(current->mm, addr); if (!vma || addr < vma->vm_start) return -ENOMEM; - mincore_walk.mm = vma->vm_mm; end = min(vma->vm_end, addr + (pages << PAGE_SHIFT)); + if (!can_do_mincore(vma)) { + unsigned long pages = DIV_ROUND_UP(end - addr, PAGE_SIZE); + memset(vec, 1, pages); + return pages; + } + mincore_walk.mm = vma->vm_mm; err = walk_page_range(addr, end, &mincore_walk); if (err < 0) return err; diff --git a/mm/mlock.c b/mm/mlock.c index f0505692a5f4..3e7fe404bfb8 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -630,11 +630,11 @@ static int apply_vma_lock_flags(unsigned long start, size_t len, * is also counted. * Return value: previously mlocked page counts */ -static int count_mm_mlocked_page_nr(struct mm_struct *mm, +static unsigned long count_mm_mlocked_page_nr(struct mm_struct *mm, unsigned long start, size_t len) { struct vm_area_struct *vma; - int count = 0; + unsigned long count = 0; if (mm == NULL) mm = current->mm; diff --git a/mm/mmap.c b/mm/mmap.c index 3f2314ad6acd..d221266d100f 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -87,12 +87,6 @@ static void unmap_region(struct mm_struct *mm, * MAP_PRIVATE r: (no) no r: (yes) yes r: (no) yes r: (no) yes * w: (no) no w: (no) no w: (copy) copy w: (no) no * x: (no) no x: (no) yes x: (no) yes x: (yes) yes - * - * On arm64, PROT_EXEC has the following behaviour for both MAP_SHARED and - * MAP_PRIVATE: - * r: (no) no - * w: (no) no - * x: (yes) yes */ pgprot_t protection_map[16] = { __P000, __P001, __P010, __P011, __P100, __P101, __P110, __P111, @@ -2448,7 +2442,8 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr) vma = find_vma_prev(mm, addr, &prev); if (vma && (vma->vm_start <= addr)) return vma; - if (!prev || expand_stack(prev, addr)) + /* don't alter vm_end if the coredump is running */ + if (!prev || !mmget_still_valid(mm) || expand_stack(prev, addr)) return NULL; if (prev->vm_flags & VM_LOCKED) populate_vma_page_range(prev, addr, prev->vm_end, NULL); @@ -2474,6 +2469,9 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr) return vma; if (!(vma->vm_flags & VM_GROWSDOWN)) return NULL; + /* don't alter vm_start if the coredump is running */ + if (!mmget_still_valid(mm)) + return NULL; start = vma->vm_start; if (expand_stack(vma, addr)) return NULL; diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index f4259e496f83..7a66e37efb4d 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -286,7 +286,7 @@ static int do_mmu_notifier_register(struct mmu_notifier *mn, * thanks to mm_take_all_locks(). */ spin_lock(&mm->mmu_notifier_mm->lock); - hlist_add_head(&mn->hlist, &mm->mmu_notifier_mm->list); + hlist_add_head_rcu(&mn->hlist, &mm->mmu_notifier_mm->list); spin_unlock(&mm->mmu_notifier_mm->lock); mm_drop_all_locks(mm); diff --git a/mm/nommu.c b/mm/nommu.c index 44265e00b701..b40ec74f364c 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -445,10 +445,14 @@ void vm_unmap_aliases(void) EXPORT_SYMBOL_GPL(vm_unmap_aliases); /* - * Implement a stub for vmalloc_sync_all() if the architecture chose not to - * have one. + * Implement a stub for vmalloc_sync_[un]mapping() if the architecture + * chose not to have one. */ -void __weak vmalloc_sync_all(void) +void __weak vmalloc_sync_mappings(void) +{ +} + +void __weak vmalloc_sync_unmappings(void) { } diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 281a46aeae61..462c778b9fb5 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -200,11 +200,11 @@ static void wb_min_max_ratio(struct bdi_writeback *wb, if (this_bw < tot_bw) { if (min) { min *= this_bw; - do_div(min, tot_bw); + min = div64_ul(min, tot_bw); } if (max < 100) { max *= this_bw; - do_div(max, tot_bw); + max = div64_ul(max, tot_bw); } } @@ -2141,6 +2141,13 @@ EXPORT_SYMBOL(tag_pages_for_writeback); * not miss some pages (e.g., because some other process has cleared TOWRITE * tag we set). The rule we follow is that TOWRITE tag can be cleared only * by the process clearing the DIRTY tag (and submitting the page for IO). + * + * To avoid deadlocks between range_cyclic writeback and callers that hold + * pages in PageWriteback to aggregate IO until write_cache_pages() returns, + * we do not loop back to the start of the file. Doing so causes a page + * lock/page writeback access order inversion - we should only ever lock + * multiple pages in ascending page->index order, and looping back to the start + * of the file violates that rule and causes deadlocks. */ int write_cache_pages(struct address_space *mapping, struct writeback_control *wbc, writepage_t writepage, @@ -2155,7 +2162,6 @@ int write_cache_pages(struct address_space *mapping, pgoff_t index; pgoff_t end; /* Inclusive */ pgoff_t done_index; - int cycled; int range_whole = 0; int tag; @@ -2163,23 +2169,17 @@ int write_cache_pages(struct address_space *mapping, if (wbc->range_cyclic) { writeback_index = mapping->writeback_index; /* prev offset */ index = writeback_index; - if (index == 0) - cycled = 1; - else - cycled = 0; end = -1; } else { index = wbc->range_start >> PAGE_SHIFT; end = wbc->range_end >> PAGE_SHIFT; if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) range_whole = 1; - cycled = 1; /* ignore range_cyclic tests */ } if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) tag = PAGECACHE_TAG_TOWRITE; else tag = PAGECACHE_TAG_DIRTY; -retry: if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) tag_pages_for_writeback(mapping, index, end); done_index = index; @@ -2287,17 +2287,14 @@ continue_unlock: pagevec_release(&pvec); cond_resched(); } - if (!cycled && !done) { - /* - * range_cyclic: - * We hit the last page and there is more work to be done: wrap - * back to the start of the file - */ - cycled = 1; - index = 0; - end = writeback_index - 1; - goto retry; - } + + /* + * If we hit the last page and there is more work to be done: wrap + * back the index back to the start of the file for the next + * time we are called. + */ + if (wbc->range_cyclic && !done) + done_index = 0; if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) mapping->writeback_index = done_index; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 05f141e39ac1..ef710e387862 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3955,11 +3955,11 @@ refill: /* Even if we own the page, we do not use atomic_set(). * This would break get_page_unless_zero() users. */ - page_ref_add(page, size); + page_ref_add(page, PAGE_FRAG_CACHE_MAX_SIZE); /* reset page count bias and offset to start of new frag */ nc->pfmemalloc = page_is_pfmemalloc(page); - nc->pagecnt_bias = size + 1; + nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1; nc->offset = size; } @@ -3975,10 +3975,10 @@ refill: size = nc->size; #endif /* OK, page count is 0, we can safely set it */ - set_page_count(page, size + 1); + set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + 1); /* reset page count bias and offset to start of new frag */ - nc->pagecnt_bias = size + 1; + nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1; offset = size - fragsz; } @@ -5491,13 +5491,15 @@ static unsigned long __meminit zone_spanned_pages_in_node(int nid, unsigned long *zone_end_pfn, unsigned long *ignored) { + unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type]; + unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type]; /* When hotadd a new node from cpu_up(), the node should be empty */ if (!node_start_pfn && !node_end_pfn) return 0; /* Get the start and end of the zone */ - *zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type]; - *zone_end_pfn = arch_zone_highest_possible_pfn[zone_type]; + *zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high); + *zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high); adjust_zone_range_for_zone_movable(nid, zone_type, node_start_pfn, node_end_pfn, zone_start_pfn, zone_end_pfn); diff --git a/mm/page_ext.c b/mm/page_ext.c index 121dcffc4ec1..a7be1c7a79f6 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -286,6 +286,7 @@ static void free_page_ext(void *addr) table_size = get_entry_size() * PAGES_PER_SECTION; BUG_ON(PageReserved(page)); + kmemleak_free(addr); free_pages_exact(addr, table_size); } } diff --git a/mm/page_idle.c b/mm/page_idle.c index ae11aa914e55..ded173d6c5b5 100644 --- a/mm/page_idle.c +++ b/mm/page_idle.c @@ -131,7 +131,7 @@ static ssize_t page_idle_bitmap_read(struct file *file, struct kobject *kobj, end_pfn = pfn + count * BITS_PER_BYTE; if (end_pfn > max_pfn) - end_pfn = ALIGN(max_pfn, BITMAP_CHUNK_BITS); + end_pfn = max_pfn; for (; pfn < end_pfn; pfn++) { bit = pfn % BITMAP_CHUNK_BITS; @@ -176,7 +176,7 @@ static ssize_t page_idle_bitmap_write(struct file *file, struct kobject *kobj, end_pfn = pfn + count * BITS_PER_BYTE; if (end_pfn > max_pfn) - end_pfn = ALIGN(max_pfn, BITMAP_CHUNK_BITS); + end_pfn = max_pfn; for (; pfn < end_pfn; pfn++) { bit = pfn % BITMAP_CHUNK_BITS; diff --git a/mm/percpu.c b/mm/percpu.c index 3794cfc88689..0462a2a00f05 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -2048,8 +2048,8 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size, ai->groups[group].base_offset = areas[group] - base; } - pr_info("Embedded %zu pages/cpu @%p s%zu r%zu d%zu u%zu\n", - PFN_DOWN(size_sum), base, ai->static_size, ai->reserved_size, + pr_info("Embedded %zu pages/cpu s%zu r%zu d%zu u%zu\n", + PFN_DOWN(size_sum), ai->static_size, ai->reserved_size, ai->dyn_size, ai->unit_size); rc = pcpu_setup_first_chunk(ai, base); @@ -2162,8 +2162,8 @@ int __init pcpu_page_first_chunk(size_t reserved_size, } /* we're ready, commit */ - pr_info("%d %s pages/cpu @%p s%zu r%zu d%zu\n", - unit_pages, psize_str, vm.addr, ai->static_size, + pr_info("%d %s pages/cpu s%zu r%zu d%zu\n", + unit_pages, psize_str, ai->static_size, ai->reserved_size, ai->dyn_size); rc = pcpu_setup_first_chunk(ai, vm.addr); diff --git a/mm/shmem.c b/mm/shmem.c index 944242491059..90ccbb35458b 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2457,31 +2457,33 @@ static void shmem_tag_pins(struct address_space *mapping) void **slot; pgoff_t start; struct page *page; + unsigned int tagged = 0; lru_add_drain(); start = 0; - rcu_read_lock(); + spin_lock_irq(&mapping->tree_lock); radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { - page = radix_tree_deref_slot(slot); + page = radix_tree_deref_slot_protected(slot, &mapping->tree_lock); if (!page || radix_tree_exception(page)) { if (radix_tree_deref_retry(page)) { slot = radix_tree_iter_retry(&iter); continue; } } else if (page_count(page) - page_mapcount(page) > 1) { - spin_lock_irq(&mapping->tree_lock); radix_tree_tag_set(&mapping->page_tree, iter.index, SHMEM_TAG_PINNED); - spin_unlock_irq(&mapping->tree_lock); } - if (need_resched()) { - cond_resched_rcu(); - slot = radix_tree_iter_next(&iter); - } + if (++tagged % 1024) + continue; + + slot = radix_tree_iter_next(&iter); + spin_unlock_irq(&mapping->tree_lock); + cond_resched(); + spin_lock_irq(&mapping->tree_lock); } - rcu_read_unlock(); + spin_unlock_irq(&mapping->tree_lock); } /* @@ -2693,7 +2695,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, } shmem_falloc.waitq = &shmem_falloc_waitq; - shmem_falloc.start = unmap_start >> PAGE_SHIFT; + shmem_falloc.start = (u64)unmap_start >> PAGE_SHIFT; shmem_falloc.next = (unmap_end + 1) >> PAGE_SHIFT; spin_lock(&inode->i_lock); inode->i_private = &shmem_falloc; diff --git a/mm/slab.c b/mm/slab.c index 354a09deecff..9547f02b4af9 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -566,14 +566,6 @@ static void start_cpu_timer(int cpu) static void init_arraycache(struct array_cache *ac, int limit, int batch) { - /* - * The array_cache structures contain pointers to free object. - * However, when such objects are allocated or transferred to another - * cache the pointers are not cleared and they could be counted as - * valid references during a kmemleak scan. Therefore, kmemleak must - * not scan such objects. - */ - kmemleak_no_scan(ac); if (ac) { ac->avail = 0; ac->limit = limit; @@ -589,6 +581,14 @@ static struct array_cache *alloc_arraycache(int node, int entries, struct array_cache *ac = NULL; ac = kmalloc_node(memsize, gfp, node); + /* + * The array_cache structures contain pointers to free object. + * However, when such objects are allocated or transferred to another + * cache the pointers are not cleared and they could be counted as + * valid references during a kmemleak scan. Therefore, kmemleak must + * not scan such objects. + */ + kmemleak_no_scan(ac); init_arraycache(ac, entries, batchcount); return ac; } @@ -683,6 +683,7 @@ static struct alien_cache *__alloc_alien_cache(int node, int entries, alc = kmalloc_node(memsize, gfp, node); if (alc) { + kmemleak_no_scan(alc); init_arraycache(&alc->ac, entries, batch); spin_lock_init(&alc->lock); } @@ -4364,8 +4365,12 @@ static int leaks_show(struct seq_file *m, void *p) * whole processing. */ do { - set_store_user_clean(cachep); drain_cpu_caches(cachep); + /* + * drain_cpu_caches() could make kmemleak_object and + * debug_objects_cache dirty, so reset afterwards. + */ + set_store_user_clean(cachep); x[1] = 0; diff --git a/mm/slub.c b/mm/slub.c index 131dee87a67c..9b44423f1cf0 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1909,8 +1909,6 @@ static void *get_partial(struct kmem_cache *s, gfp_t flags, int node, if (node == NUMA_NO_NODE) searchnode = numa_mem_id(); - else if (!node_present_pages(node)) - searchnode = node_to_mem_node(node); object = get_partial_node(s, get_node(s, searchnode), c, flags); if (object || node != NUMA_NO_NODE) @@ -2506,17 +2504,27 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, struct page *page; page = c->page; - if (!page) + if (!page) { + /* + * if the node is not online or has no normal memory, just + * ignore the node constraint + */ + if (unlikely(node != NUMA_NO_NODE && + !node_state(node, N_NORMAL_MEMORY))) + node = NUMA_NO_NODE; goto new_slab; + } redo: if (unlikely(!node_match(page, node))) { - int searchnode = node; - - if (node != NUMA_NO_NODE && !node_present_pages(node)) - searchnode = node_to_mem_node(node); - - if (unlikely(!node_match(page, searchnode))) { + /* + * same as above but node_match() being false already + * implies node != NUMA_NO_NODE + */ + if (!node_state(node, N_NORMAL_MEMORY)) { + node = NUMA_NO_NODE; + goto redo; + } else { stat(s, ALLOC_NODE_MISMATCH); deactivate_slab(s, page, c->freelist); c->page = NULL; @@ -2935,11 +2943,13 @@ redo: barrier(); if (likely(page == c->page)) { - set_freepointer(s, tail_obj, c->freelist); + void **freelist = READ_ONCE(c->freelist); + + set_freepointer(s, tail_obj, freelist); if (unlikely(!this_cpu_cmpxchg_double( s->cpu_slab->freelist, s->cpu_slab->tid, - c->freelist, tid, + freelist, tid, head, next_tid(tid)))) { note_cmpxchg_failure("slab_free", s, tid); @@ -3115,6 +3125,15 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, if (unlikely(!object)) { /* + * We may have removed an object from c->freelist using + * the fastpath in the previous iteration; in that case, + * c->tid has not been bumped yet. + * Since ___slab_alloc() may reenable interrupts while + * allocating memory, we should bump c->tid now. + */ + c->tid = next_tid(c->tid); + + /* * Invoking slow path likely have side-effect * of re-populating per CPU c->freelist */ @@ -4718,7 +4737,17 @@ static ssize_t show_slab_objects(struct kmem_cache *s, } } - get_online_mems(); + /* + * It is impossible to take "mem_hotplug_lock" here with "kernfs_mutex" + * already held which will conflict with an existing lock order: + * + * mem_hotplug_lock->slab_mutex->kernfs_mutex + * + * We don't really need mem_hotplug_lock (to hold off + * slab_mem_going_offline_callback) here because slab's memory hot + * unplug code doesn't destroy the kmem_cache->node[] data. + */ + #ifdef CONFIG_SLUB_DEBUG if (flags & SO_ALL) { struct kmem_cache_node *n; @@ -4759,7 +4788,6 @@ static ssize_t show_slab_objects(struct kmem_cache *s, x += sprintf(buf + x, " N%d=%lu", node, nodes[node]); #endif - put_online_mems(); kfree(nodes); return x + sprintf(buf + x, "\n"); } diff --git a/mm/usercopy.c b/mm/usercopy.c index 3c8da0af9695..c2de343baad4 100644 --- a/mm/usercopy.c +++ b/mm/usercopy.c @@ -15,6 +15,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/mm.h> +#include <linux/highmem.h> #include <linux/slab.h> #include <asm/sections.h> @@ -124,7 +125,7 @@ static inline const char *check_kernel_text_object(const void *ptr, static inline const char *check_bogus_address(const void *ptr, unsigned long n) { /* Reject if object wraps past end of memory. */ - if ((unsigned long)ptr + n < (unsigned long)ptr) + if ((unsigned long)ptr + (n - 1) < (unsigned long)ptr) return "<wrapped address>"; /* Reject if NULL or ZERO-allocation. */ @@ -217,7 +218,12 @@ static inline const char *check_heap_object(const void *ptr, unsigned long n, if (!virt_addr_valid(ptr)) return NULL; - page = virt_to_head_page(ptr); + /* + * When CONFIG_HIGHMEM=y, kmap_to_page() will give either the + * highmem page or fallback to virt_to_page(). The following + * is effectively a highmem-aware virt_to_head_page(). + */ + page = compound_head(kmap_to_page((void *)ptr)); /* Check slab allocator for flags and size. */ if (PageSlab(page)) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index e6aa073f01df..153deec1df35 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -459,7 +459,11 @@ nocache: } found: - if (addr + size > vend) + /* + * Check also calculated address against the vstart, + * because it can be 0 because of big align request. + */ + if (addr + size > vend || addr < vstart) goto overflow; va->va_start = addr; @@ -1704,6 +1708,12 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, return NULL; /* + * First make sure the mappings are removed from all page-tables + * before they are freed. + */ + vmalloc_sync_unmappings(); + + /* * In this function, newly allocated vm_struct has VM_UNINITIALIZED * flag. It means that vm_struct is not fully initialized. * Now, it is fully initialized, so remove this flag here. @@ -2237,13 +2247,19 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, EXPORT_SYMBOL(remap_vmalloc_range); /* - * Implement a stub for vmalloc_sync_all() if the architecture chose not to - * have one. + * Implement stubs for vmalloc_sync_[un]mappings () if the architecture chose + * not to have one. + * + * The purpose of this function is to make sure the vmalloc area + * mappings are identical in all page-tables in the system. */ -void __weak vmalloc_sync_all(void) +void __weak vmalloc_sync_mappings(void) { } +void __weak vmalloc_sync_unmappings(void) +{ +} static int f(pte_t *pte, pgtable_t table, unsigned long addr, void *data) { diff --git a/mm/vmstat.c b/mm/vmstat.c index 5e6a4d76659d..e60435d556e3 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1075,13 +1075,8 @@ const char * const vmstat_text[] = { #endif #endif /* CONFIG_MEMORY_BALLOON */ #ifdef CONFIG_DEBUG_TLBFLUSH -#ifdef CONFIG_SMP "nr_tlb_remote_flush", "nr_tlb_remote_flush_received", -#else - "", /* nr_tlb_remote_flush */ - "", /* nr_tlb_remote_flush_received */ -#endif /* CONFIG_SMP */ "nr_tlb_local_flush_all", "nr_tlb_local_flush_one", #endif /* CONFIG_DEBUG_TLBFLUSH */ @@ -1799,7 +1794,7 @@ static int __init setup_vmstat(void) #endif #ifdef CONFIG_PROC_FS proc_create("buddyinfo", S_IRUGO, NULL, &fragmentation_file_operations); - proc_create("pagetypeinfo", S_IRUGO, NULL, &pagetypeinfo_file_ops); + proc_create("pagetypeinfo", 0400, NULL, &pagetypeinfo_file_ops); proc_create("vmstat", S_IRUGO, NULL, &proc_vmstat_file_operations); proc_create("zoneinfo", S_IRUGO, NULL, &proc_zoneinfo_file_operations); #endif diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index cf15851a7d2f..e4cca3f5331e 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -52,6 +52,7 @@ #include <linux/zpool.h> #include <linux/mount.h> #include <linux/migrate.h> +#include <linux/wait.h> #include <linux/pagemap.h> #define ZSPAGE_MAGIC 0x58 @@ -265,6 +266,10 @@ struct zs_pool { #ifdef CONFIG_COMPACTION struct inode *inode; struct work_struct free_work; + /* A wait queue for when migration races with async_free_zspage() */ + wait_queue_head_t migration_wait; + atomic_long_t isolated_pages; + bool destroying; #endif }; @@ -1939,6 +1944,31 @@ static void dec_zspage_isolation(struct zspage *zspage) zspage->isolated--; } +static void putback_zspage_deferred(struct zs_pool *pool, + struct size_class *class, + struct zspage *zspage) +{ + enum fullness_group fg; + + fg = putback_zspage(class, zspage); + if (fg == ZS_EMPTY) + schedule_work(&pool->free_work); + +} + +static inline void zs_pool_dec_isolated(struct zs_pool *pool) +{ + VM_BUG_ON(atomic_long_read(&pool->isolated_pages) <= 0); + atomic_long_dec(&pool->isolated_pages); + /* + * There's no possibility of racing, since wait_for_isolated_drain() + * checks the isolated count under &class->lock after enqueuing + * on migration_wait. + */ + if (atomic_long_read(&pool->isolated_pages) == 0 && pool->destroying) + wake_up_all(&pool->migration_wait); +} + static void replace_sub_page(struct size_class *class, struct zspage *zspage, struct page *newpage, struct page *oldpage) { @@ -2008,6 +2038,7 @@ bool zs_page_isolate(struct page *page, isolate_mode_t mode) */ if (!list_empty(&zspage->list) && !is_zspage_isolated(zspage)) { get_zspage_mapping(zspage, &class_idx, &fullness); + atomic_long_inc(&pool->isolated_pages); remove_zspage(class, zspage, fullness); } @@ -2096,8 +2127,21 @@ int zs_page_migrate(struct address_space *mapping, struct page *newpage, * Page migration is done so let's putback isolated zspage to * the list if @page is final isolated subpage in the zspage. */ - if (!is_zspage_isolated(zspage)) - putback_zspage(class, zspage); + if (!is_zspage_isolated(zspage)) { + /* + * We cannot race with zs_destroy_pool() here because we wait + * for isolation to hit zero before we start destroying. + * Also, we ensure that everyone can see pool->destroying before + * we start waiting. + */ + putback_zspage_deferred(pool, class, zspage); + zs_pool_dec_isolated(pool); + } + + if (page_zone(newpage) != page_zone(page)) { + dec_zone_page_state(page, NR_ZSPAGES); + inc_zone_page_state(newpage, NR_ZSPAGES); + } reset_page(page); put_page(page); @@ -2144,13 +2188,12 @@ void zs_page_putback(struct page *page) spin_lock(&class->lock); dec_zspage_isolation(zspage); if (!is_zspage_isolated(zspage)) { - fg = putback_zspage(class, zspage); /* * Due to page_lock, we cannot free zspage immediately * so let's defer. */ - if (fg == ZS_EMPTY) - schedule_work(&pool->free_work); + putback_zspage_deferred(pool, class, zspage); + zs_pool_dec_isolated(pool); } spin_unlock(&class->lock); } @@ -2174,8 +2217,36 @@ static int zs_register_migration(struct zs_pool *pool) return 0; } +static bool pool_isolated_are_drained(struct zs_pool *pool) +{ + return atomic_long_read(&pool->isolated_pages) == 0; +} + +/* Function for resolving migration */ +static void wait_for_isolated_drain(struct zs_pool *pool) +{ + + /* + * We're in the process of destroying the pool, so there are no + * active allocations. zs_page_isolate() fails for completely free + * zspages, so we need only wait for the zs_pool's isolated + * count to hit zero. + */ + wait_event(pool->migration_wait, + pool_isolated_are_drained(pool)); +} + static void zs_unregister_migration(struct zs_pool *pool) { + pool->destroying = true; + /* + * We need a memory barrier here to ensure global visibility of + * pool->destroying. Thus pool->isolated pages will either be 0 in which + * case we don't care, or it will be > 0 and pool->destroying will + * ensure that we wake up once isolation hits 0. + */ + smp_mb(); + wait_for_isolated_drain(pool); /* This can block */ flush_work(&pool->free_work); iput(pool->inode); } @@ -2422,6 +2493,10 @@ struct zs_pool *zs_create_pool(const char *name) if (!pool->name) goto err; +#ifdef CONFIG_COMPACTION + init_waitqueue_head(&pool->migration_wait); +#endif + if (create_cache(pool)) goto err; |