diff options
author | Deepak Nibade <dnibade@nvidia.com> | 2013-08-14 16:23:39 +0530 |
---|---|---|
committer | Deepak Nibade <dnibade@nvidia.com> | 2013-08-14 16:23:39 +0530 |
commit | 9282699d7fd7954f11d59254e568e5d4bfbbe71a (patch) | |
tree | 4d945260bc8eb63b0db0423ad55ad7021eb5d4ac /mm | |
parent | 933d6b11dbd7fda89ac094321d0cd9992afb5592 (diff) | |
parent | 67e6589a34ea5360b00869aaaec4a844c29cf713 (diff) |
Merge branch 'linux-3.4.57' into rel-17
Bug 1348440
Change-Id: If25c49f027dc2a69642f7ed4733e965962b2a5a2
Signed-off-by: Deepak Nibade <dnibade@nvidia.com>
Diffstat (limited to 'mm')
-rw-r--r-- | mm/huge_memory.c | 7 | ||||
-rw-r--r-- | mm/hugetlb.c | 22 | ||||
-rw-r--r-- | mm/migrate.c | 25 | ||||
-rw-r--r-- | mm/mmap.c | 12 | ||||
-rw-r--r-- | mm/mmu_notifier.c | 80 | ||||
-rw-r--r-- | mm/nommu.c | 2 | ||||
-rw-r--r-- | mm/page_alloc.c | 4 | ||||
-rw-r--r-- | mm/pagewalk.c | 70 | ||||
-rw-r--r-- | mm/process_vm_access.c | 8 | ||||
-rw-r--r-- | mm/swap_state.c | 18 |
10 files changed, 149 insertions, 99 deletions
diff --git a/mm/huge_memory.c b/mm/huge_memory.c index caf15b6fa753..ef99c15fa36c 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1949,7 +1949,12 @@ static void collapse_huge_page(struct mm_struct *mm, pte_unmap(pte); spin_lock(&mm->page_table_lock); BUG_ON(!pmd_none(*pmd)); - set_pmd_at(mm, address, pmd, _pmd); + /* + * We can only use set_pmd_at when establishing + * hugepmds and never for establishing regular pmds that + * points to regular pagetables. Use pmd_populate for that + */ + pmd_populate(mm, pmd, pmd_pgtable(_pmd)); spin_unlock(&mm->page_table_lock); anon_vma_unlock(vma->anon_vma); goto out; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index c384e0901f9c..a69243982a08 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2096,8 +2096,12 @@ int hugetlb_report_node_meminfo(int nid, char *buf) /* Return the number pages of memory we physically have, in PAGE_SIZE units. */ unsigned long hugetlb_total_pages(void) { - struct hstate *h = &default_hstate; - return h->nr_huge_pages * pages_per_huge_page(h); + struct hstate *h; + unsigned long nr_total_pages = 0; + + for_each_hstate(h) + nr_total_pages += h->nr_huge_pages * pages_per_huge_page(h); + return nr_total_pages; } static int hugetlb_acct_memory(struct hstate *h, long delta) @@ -2764,7 +2768,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (ptep) { entry = huge_ptep_get(ptep); if (unlikely(is_hugetlb_entry_migration(entry))) { - migration_entry_wait(mm, (pmd_t *)ptep, address); + migration_entry_wait_huge(mm, ptep); return 0; } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) return VM_FAULT_HWPOISON_LARGE | @@ -2902,7 +2906,17 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, break; } - if (absent || + /* + * We need call hugetlb_fault for both hugepages under migration + * (in which case hugetlb_fault waits for the migration,) and + * hwpoisoned hugepages (in which case we need to prevent the + * caller from accessing to them.) In order to do this, we use + * here is_swap_pte instead of is_hugetlb_entry_migration and + * is_hugetlb_entry_hwpoisoned. This is because it simply covers + * both cases, and because we can't follow correct pages + * directly from any kind of swap entries. + */ + if (absent || is_swap_pte(huge_ptep_get(pte)) || ((flags & FOLL_WRITE) && !pte_write(huge_ptep_get(pte)))) { int ret; diff --git a/mm/migrate.c b/mm/migrate.c index 11072383ae12..5f588b1f2f6d 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -145,7 +145,7 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma, if (PageHuge(new)) pte = pte_mkhuge(pte); #endif - flush_cache_page(vma, addr, pte_pfn(pte)); + flush_dcache_page(new); set_pte_at(mm, addr, ptep, pte); if (PageHuge(new)) { @@ -180,15 +180,14 @@ static void remove_migration_ptes(struct page *old, struct page *new) * get to the page and wait until migration is finished. * When we return from this function the fault will be retried. */ -void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, - unsigned long address) +static void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep, + spinlock_t *ptl) { - pte_t *ptep, pte; - spinlock_t *ptl; + pte_t pte; swp_entry_t entry; struct page *page; - ptep = pte_offset_map_lock(mm, pmd, address, &ptl); + spin_lock(ptl); pte = *ptep; if (!is_swap_pte(pte)) goto out; @@ -216,6 +215,20 @@ out: pte_unmap_unlock(ptep, ptl); } +void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, + unsigned long address) +{ + spinlock_t *ptl = pte_lockptr(mm, pmd); + pte_t *ptep = pte_offset_map(pmd, address); + __migration_entry_wait(mm, ptep, ptl); +} + +void migration_entry_wait_huge(struct mm_struct *mm, pte_t *pte) +{ + spinlock_t *ptl = &(mm)->page_table_lock; + __migration_entry_wait(mm, pte, ptl); +} + #ifdef CONFIG_BLOCK /* Returns true if all buffers are successfully locked */ static bool buffer_migrate_lock_buffers(struct buffer_head *head, diff --git a/mm/mmap.c b/mm/mmap.c index 922de71fc759..ed884dd9674b 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1130,15 +1130,19 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, file = fget(fd); if (!file) goto out; + if (is_file_hugepages(file)) + len = ALIGN(len, huge_page_size(hstate_file(file))); } else if (flags & MAP_HUGETLB) { struct user_struct *user = NULL; + + len = ALIGN(len, huge_page_size(&default_hstate)); /* * VM_NORESERVE is used because the reservations will be * taken when vm_ops->mmap() is called * A dummy user value is used because we are not locking * memory so no accounting is necessary */ - file = hugetlb_file_setup(HUGETLB_ANON_FILE, addr, len, + file = hugetlb_file_setup(HUGETLB_ANON_FILE, len, VM_NORESERVE, &user, HUGETLB_ANONHUGE_INODE); if (IS_ERR(file)) @@ -1619,7 +1623,7 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) if (mm) { /* Check the cache first. */ /* (Cache hit rate is typically around 35%.) */ - vma = mm->mmap_cache; + vma = ACCESS_ONCE(mm->mmap_cache); if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) { struct rb_node * rb_node; @@ -1920,7 +1924,7 @@ static void unmap_region(struct mm_struct *mm, unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL); vm_unacct_memory(nr_accounted); free_pgtables(&tlb, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS, - next ? next->vm_start : TASK_SIZE); + next ? next->vm_start : USER_PGTABLES_CEILING); tlb_finish_mmu(&tlb, start, end); } @@ -2308,7 +2312,7 @@ void exit_mmap(struct mm_struct *mm) unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL); vm_unacct_memory(nr_accounted); - free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, TASK_SIZE); + free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING); tlb_finish_mmu(&tlb, 0, -1); /* diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 8d1ca2de4080..c1f947b4f61d 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -37,51 +37,48 @@ static struct srcu_struct srcu; void __mmu_notifier_release(struct mm_struct *mm) { struct mmu_notifier *mn; + struct hlist_node *node; int id; /* - * srcu_read_lock() here will block synchronize_srcu() in - * mmu_notifier_unregister() until all registered - * ->release() callouts this function makes have - * returned. + * SRCU here will block mmu_notifier_unregister until + * ->release returns. */ id = srcu_read_lock(&srcu); + hlist_for_each_entry_rcu(mn, node, &mm->mmu_notifier_mm->list, hlist) + /* + * If ->release runs before mmu_notifier_unregister it must be + * handled, as it's the only way for the driver to flush all + * existing sptes and stop the driver from establishing any more + * sptes before all the pages in the mm are freed. + */ + if (mn->ops->release) + mn->ops->release(mn, mm); + srcu_read_unlock(&srcu, id); + spin_lock(&mm->mmu_notifier_mm->lock); while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) { mn = hlist_entry(mm->mmu_notifier_mm->list.first, struct mmu_notifier, hlist); - /* - * Unlink. This will prevent mmu_notifier_unregister() - * from also making the ->release() callout. + * We arrived before mmu_notifier_unregister so + * mmu_notifier_unregister will do nothing other than to wait + * for ->release to finish and for mmu_notifier_unregister to + * return. */ hlist_del_init_rcu(&mn->hlist); - spin_unlock(&mm->mmu_notifier_mm->lock); - - /* - * Clear sptes. (see 'release' description in mmu_notifier.h) - */ - if (mn->ops->release) - mn->ops->release(mn, mm); - - spin_lock(&mm->mmu_notifier_mm->lock); } spin_unlock(&mm->mmu_notifier_mm->lock); /* - * All callouts to ->release() which we have done are complete. - * Allow synchronize_srcu() in mmu_notifier_unregister() to complete - */ - srcu_read_unlock(&srcu, id); - - /* - * mmu_notifier_unregister() may have unlinked a notifier and may - * still be calling out to it. Additionally, other notifiers - * may have been active via vmtruncate() et. al. Block here - * to ensure that all notifier callouts for this mm have been - * completed and the sptes are really cleaned up before returning - * to exit_mmap(). + * synchronize_srcu here prevents mmu_notifier_release from returning to + * exit_mmap (which would proceed with freeing all pages in the mm) + * until the ->release method returns, if it was invoked by + * mmu_notifier_unregister. + * + * The mmu_notifier_mm can't go away from under us because one mm_count + * is held by exit_mmap. */ synchronize_srcu(&srcu); } @@ -302,31 +299,34 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm) { BUG_ON(atomic_read(&mm->mm_count) <= 0); - spin_lock(&mm->mmu_notifier_mm->lock); if (!hlist_unhashed(&mn->hlist)) { + /* + * SRCU here will force exit_mmap to wait for ->release to + * finish before freeing the pages. + */ int id; + id = srcu_read_lock(&srcu); /* - * Ensure we synchronize up with __mmu_notifier_release(). + * exit_mmap will block in mmu_notifier_release to guarantee + * that ->release is called before freeing the pages. */ - id = srcu_read_lock(&srcu); - - hlist_del_rcu(&mn->hlist); - spin_unlock(&mm->mmu_notifier_mm->lock); - if (mn->ops->release) mn->ops->release(mn, mm); + srcu_read_unlock(&srcu, id); + spin_lock(&mm->mmu_notifier_mm->lock); /* - * Allow __mmu_notifier_release() to complete. + * Can not use list_del_rcu() since __mmu_notifier_release + * can delete it before we hold the lock. */ - srcu_read_unlock(&srcu, id); - } else + hlist_del_init_rcu(&mn->hlist); spin_unlock(&mm->mmu_notifier_mm->lock); + } /* - * Wait for any running method to finish, including ->release() if it - * was run by __mmu_notifier_release() instead of us. + * Wait for any running method to finish, of course including + * ->release if it was run by mmu_notifier_relase instead of us. */ synchronize_srcu(&srcu); diff --git a/mm/nommu.c b/mm/nommu.c index bb8f4f004a82..b0956e39fd1c 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -807,7 +807,7 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) struct vm_area_struct *vma; /* check the cache first */ - vma = mm->mmap_cache; + vma = ACCESS_ONCE(mm->mmap_cache); if (vma && vma->vm_start <= addr && vma->vm_end > addr) return vma; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ef868e299932..16568c9755c5 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5910,6 +5910,10 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) zone->free_area[order].nr_free--; __mod_zone_page_state(zone, NR_FREE_PAGES, - (1UL << order)); +#ifdef CONFIG_HIGHMEM + if (PageHighMem(page)) + totalhigh_pages -= 1 << order; +#endif for (i = 0; i < (1 << order); i++) SetPageReserved((page+i)); pfn += (1 << order); diff --git a/mm/pagewalk.c b/mm/pagewalk.c index aa9701e12714..1090e772a26d 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -127,28 +127,7 @@ static int walk_hugetlb_range(struct vm_area_struct *vma, return 0; } -static struct vm_area_struct* hugetlb_vma(unsigned long addr, struct mm_walk *walk) -{ - struct vm_area_struct *vma; - - /* We don't need vma lookup at all. */ - if (!walk->hugetlb_entry) - return NULL; - - VM_BUG_ON(!rwsem_is_locked(&walk->mm->mmap_sem)); - vma = find_vma(walk->mm, addr); - if (vma && vma->vm_start <= addr && is_vm_hugetlb_page(vma)) - return vma; - - return NULL; -} - #else /* CONFIG_HUGETLB_PAGE */ -static struct vm_area_struct* hugetlb_vma(unsigned long addr, struct mm_walk *walk) -{ - return NULL; -} - static int walk_hugetlb_range(struct vm_area_struct *vma, unsigned long addr, unsigned long end, struct mm_walk *walk) @@ -199,30 +178,53 @@ int walk_page_range(unsigned long addr, unsigned long end, if (!walk->mm) return -EINVAL; + VM_BUG_ON(!rwsem_is_locked(&walk->mm->mmap_sem)); + pgd = pgd_offset(walk->mm, addr); do { - struct vm_area_struct *vma; + struct vm_area_struct *vma = NULL; next = pgd_addr_end(addr, end); /* - * handle hugetlb vma individually because pagetable walk for - * the hugetlb page is dependent on the architecture and - * we can't handled it in the same manner as non-huge pages. + * This function was not intended to be vma based. + * But there are vma special cases to be handled: + * - hugetlb vma's + * - VM_PFNMAP vma's */ - vma = hugetlb_vma(addr, walk); + vma = find_vma(walk->mm, addr); if (vma) { - if (vma->vm_end < next) + /* + * There are no page structures backing a VM_PFNMAP + * range, so do not allow split_huge_page_pmd(). + */ + if ((vma->vm_start <= addr) && + (vma->vm_flags & VM_PFNMAP)) { next = vma->vm_end; + pgd = pgd_offset(walk->mm, next); + continue; + } /* - * Hugepage is very tightly coupled with vma, so - * walk through hugetlb entries within a given vma. + * Handle hugetlb vma individually because pagetable + * walk for the hugetlb page is dependent on the + * architecture and we can't handled it in the same + * manner as non-huge pages. */ - err = walk_hugetlb_range(vma, addr, next, walk); - if (err) - break; - pgd = pgd_offset(walk->mm, next); - continue; + if (walk->hugetlb_entry && (vma->vm_start <= addr) && + is_vm_hugetlb_page(vma)) { + if (vma->vm_end < next) + next = vma->vm_end; + /* + * Hugepage is very tightly coupled with vma, + * so walk through hugetlb entries within a + * given vma. + */ + err = walk_hugetlb_range(vma, addr, next, walk); + if (err) + break; + pgd = pgd_offset(walk->mm, next); + continue; + } } if (pgd_none_or_clear_bad(pgd)) { diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c index c20ff48994c2..b63e84a7fa88 100644 --- a/mm/process_vm_access.c +++ b/mm/process_vm_access.c @@ -429,12 +429,6 @@ compat_process_vm_rw(compat_pid_t pid, if (flags != 0) return -EINVAL; - if (!access_ok(VERIFY_READ, lvec, liovcnt * sizeof(*lvec))) - goto out; - - if (!access_ok(VERIFY_READ, rvec, riovcnt * sizeof(*rvec))) - goto out; - if (vm_write) rc = compat_rw_copy_check_uvector(WRITE, lvec, liovcnt, UIO_FASTIOV, iovstack_l, @@ -459,8 +453,6 @@ free_iovecs: kfree(iov_r); if (iov_l != iovstack_l) kfree(iov_l); - -out: return rc; } diff --git a/mm/swap_state.c b/mm/swap_state.c index 4c5ff7f284d9..1fa92206829a 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -313,8 +313,24 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, * Swap entry may have been freed since our caller observed it. */ err = swapcache_prepare(entry); - if (err == -EEXIST) { /* seems racy */ + if (err == -EEXIST) { radix_tree_preload_end(); + /* + * We might race against get_swap_page() and stumble + * across a SWAP_HAS_CACHE swap_map entry whose page + * has not been brought into the swapcache yet, while + * the other end is scheduled away waiting on discard + * I/O completion at scan_swap_map(). + * + * In order to avoid turning this transitory state + * into a permanent loop around this -EEXIST case + * if !CONFIG_PREEMPT and the I/O completion happens + * to be waiting on the CPU waitqueue where we are now + * busy looping, we just conditionally invoke the + * scheduler here, if there are some more important + * tasks to run. + */ + cond_resched(); continue; } if (err) { /* swp entry is obsolete ? */ |