summaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
authorDeepak Nibade <dnibade@nvidia.com>2013-08-14 16:23:39 +0530
committerDeepak Nibade <dnibade@nvidia.com>2013-08-14 16:23:39 +0530
commit9282699d7fd7954f11d59254e568e5d4bfbbe71a (patch)
tree4d945260bc8eb63b0db0423ad55ad7021eb5d4ac /mm
parent933d6b11dbd7fda89ac094321d0cd9992afb5592 (diff)
parent67e6589a34ea5360b00869aaaec4a844c29cf713 (diff)
Merge branch 'linux-3.4.57' into rel-17
Bug 1348440 Change-Id: If25c49f027dc2a69642f7ed4733e965962b2a5a2 Signed-off-by: Deepak Nibade <dnibade@nvidia.com>
Diffstat (limited to 'mm')
-rw-r--r--mm/huge_memory.c7
-rw-r--r--mm/hugetlb.c22
-rw-r--r--mm/migrate.c25
-rw-r--r--mm/mmap.c12
-rw-r--r--mm/mmu_notifier.c80
-rw-r--r--mm/nommu.c2
-rw-r--r--mm/page_alloc.c4
-rw-r--r--mm/pagewalk.c70
-rw-r--r--mm/process_vm_access.c8
-rw-r--r--mm/swap_state.c18
10 files changed, 149 insertions, 99 deletions
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index caf15b6fa753..ef99c15fa36c 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1949,7 +1949,12 @@ static void collapse_huge_page(struct mm_struct *mm,
pte_unmap(pte);
spin_lock(&mm->page_table_lock);
BUG_ON(!pmd_none(*pmd));
- set_pmd_at(mm, address, pmd, _pmd);
+ /*
+ * We can only use set_pmd_at when establishing
+ * hugepmds and never for establishing regular pmds that
+ * points to regular pagetables. Use pmd_populate for that
+ */
+ pmd_populate(mm, pmd, pmd_pgtable(_pmd));
spin_unlock(&mm->page_table_lock);
anon_vma_unlock(vma->anon_vma);
goto out;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index c384e0901f9c..a69243982a08 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2096,8 +2096,12 @@ int hugetlb_report_node_meminfo(int nid, char *buf)
/* Return the number pages of memory we physically have, in PAGE_SIZE units. */
unsigned long hugetlb_total_pages(void)
{
- struct hstate *h = &default_hstate;
- return h->nr_huge_pages * pages_per_huge_page(h);
+ struct hstate *h;
+ unsigned long nr_total_pages = 0;
+
+ for_each_hstate(h)
+ nr_total_pages += h->nr_huge_pages * pages_per_huge_page(h);
+ return nr_total_pages;
}
static int hugetlb_acct_memory(struct hstate *h, long delta)
@@ -2764,7 +2768,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
if (ptep) {
entry = huge_ptep_get(ptep);
if (unlikely(is_hugetlb_entry_migration(entry))) {
- migration_entry_wait(mm, (pmd_t *)ptep, address);
+ migration_entry_wait_huge(mm, ptep);
return 0;
} else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
return VM_FAULT_HWPOISON_LARGE |
@@ -2902,7 +2906,17 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
break;
}
- if (absent ||
+ /*
+ * We need call hugetlb_fault for both hugepages under migration
+ * (in which case hugetlb_fault waits for the migration,) and
+ * hwpoisoned hugepages (in which case we need to prevent the
+ * caller from accessing to them.) In order to do this, we use
+ * here is_swap_pte instead of is_hugetlb_entry_migration and
+ * is_hugetlb_entry_hwpoisoned. This is because it simply covers
+ * both cases, and because we can't follow correct pages
+ * directly from any kind of swap entries.
+ */
+ if (absent || is_swap_pte(huge_ptep_get(pte)) ||
((flags & FOLL_WRITE) && !pte_write(huge_ptep_get(pte)))) {
int ret;
diff --git a/mm/migrate.c b/mm/migrate.c
index 11072383ae12..5f588b1f2f6d 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -145,7 +145,7 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
if (PageHuge(new))
pte = pte_mkhuge(pte);
#endif
- flush_cache_page(vma, addr, pte_pfn(pte));
+ flush_dcache_page(new);
set_pte_at(mm, addr, ptep, pte);
if (PageHuge(new)) {
@@ -180,15 +180,14 @@ static void remove_migration_ptes(struct page *old, struct page *new)
* get to the page and wait until migration is finished.
* When we return from this function the fault will be retried.
*/
-void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
- unsigned long address)
+static void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
+ spinlock_t *ptl)
{
- pte_t *ptep, pte;
- spinlock_t *ptl;
+ pte_t pte;
swp_entry_t entry;
struct page *page;
- ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
+ spin_lock(ptl);
pte = *ptep;
if (!is_swap_pte(pte))
goto out;
@@ -216,6 +215,20 @@ out:
pte_unmap_unlock(ptep, ptl);
}
+void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
+ unsigned long address)
+{
+ spinlock_t *ptl = pte_lockptr(mm, pmd);
+ pte_t *ptep = pte_offset_map(pmd, address);
+ __migration_entry_wait(mm, ptep, ptl);
+}
+
+void migration_entry_wait_huge(struct mm_struct *mm, pte_t *pte)
+{
+ spinlock_t *ptl = &(mm)->page_table_lock;
+ __migration_entry_wait(mm, pte, ptl);
+}
+
#ifdef CONFIG_BLOCK
/* Returns true if all buffers are successfully locked */
static bool buffer_migrate_lock_buffers(struct buffer_head *head,
diff --git a/mm/mmap.c b/mm/mmap.c
index 922de71fc759..ed884dd9674b 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1130,15 +1130,19 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
file = fget(fd);
if (!file)
goto out;
+ if (is_file_hugepages(file))
+ len = ALIGN(len, huge_page_size(hstate_file(file)));
} else if (flags & MAP_HUGETLB) {
struct user_struct *user = NULL;
+
+ len = ALIGN(len, huge_page_size(&default_hstate));
/*
* VM_NORESERVE is used because the reservations will be
* taken when vm_ops->mmap() is called
* A dummy user value is used because we are not locking
* memory so no accounting is necessary
*/
- file = hugetlb_file_setup(HUGETLB_ANON_FILE, addr, len,
+ file = hugetlb_file_setup(HUGETLB_ANON_FILE, len,
VM_NORESERVE, &user,
HUGETLB_ANONHUGE_INODE);
if (IS_ERR(file))
@@ -1619,7 +1623,7 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
if (mm) {
/* Check the cache first. */
/* (Cache hit rate is typically around 35%.) */
- vma = mm->mmap_cache;
+ vma = ACCESS_ONCE(mm->mmap_cache);
if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) {
struct rb_node * rb_node;
@@ -1920,7 +1924,7 @@ static void unmap_region(struct mm_struct *mm,
unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL);
vm_unacct_memory(nr_accounted);
free_pgtables(&tlb, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
- next ? next->vm_start : TASK_SIZE);
+ next ? next->vm_start : USER_PGTABLES_CEILING);
tlb_finish_mmu(&tlb, start, end);
}
@@ -2308,7 +2312,7 @@ void exit_mmap(struct mm_struct *mm)
unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL);
vm_unacct_memory(nr_accounted);
- free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, TASK_SIZE);
+ free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING);
tlb_finish_mmu(&tlb, 0, -1);
/*
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 8d1ca2de4080..c1f947b4f61d 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -37,51 +37,48 @@ static struct srcu_struct srcu;
void __mmu_notifier_release(struct mm_struct *mm)
{
struct mmu_notifier *mn;
+ struct hlist_node *node;
int id;
/*
- * srcu_read_lock() here will block synchronize_srcu() in
- * mmu_notifier_unregister() until all registered
- * ->release() callouts this function makes have
- * returned.
+ * SRCU here will block mmu_notifier_unregister until
+ * ->release returns.
*/
id = srcu_read_lock(&srcu);
+ hlist_for_each_entry_rcu(mn, node, &mm->mmu_notifier_mm->list, hlist)
+ /*
+ * If ->release runs before mmu_notifier_unregister it must be
+ * handled, as it's the only way for the driver to flush all
+ * existing sptes and stop the driver from establishing any more
+ * sptes before all the pages in the mm are freed.
+ */
+ if (mn->ops->release)
+ mn->ops->release(mn, mm);
+ srcu_read_unlock(&srcu, id);
+
spin_lock(&mm->mmu_notifier_mm->lock);
while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) {
mn = hlist_entry(mm->mmu_notifier_mm->list.first,
struct mmu_notifier,
hlist);
-
/*
- * Unlink. This will prevent mmu_notifier_unregister()
- * from also making the ->release() callout.
+ * We arrived before mmu_notifier_unregister so
+ * mmu_notifier_unregister will do nothing other than to wait
+ * for ->release to finish and for mmu_notifier_unregister to
+ * return.
*/
hlist_del_init_rcu(&mn->hlist);
- spin_unlock(&mm->mmu_notifier_mm->lock);
-
- /*
- * Clear sptes. (see 'release' description in mmu_notifier.h)
- */
- if (mn->ops->release)
- mn->ops->release(mn, mm);
-
- spin_lock(&mm->mmu_notifier_mm->lock);
}
spin_unlock(&mm->mmu_notifier_mm->lock);
/*
- * All callouts to ->release() which we have done are complete.
- * Allow synchronize_srcu() in mmu_notifier_unregister() to complete
- */
- srcu_read_unlock(&srcu, id);
-
- /*
- * mmu_notifier_unregister() may have unlinked a notifier and may
- * still be calling out to it. Additionally, other notifiers
- * may have been active via vmtruncate() et. al. Block here
- * to ensure that all notifier callouts for this mm have been
- * completed and the sptes are really cleaned up before returning
- * to exit_mmap().
+ * synchronize_srcu here prevents mmu_notifier_release from returning to
+ * exit_mmap (which would proceed with freeing all pages in the mm)
+ * until the ->release method returns, if it was invoked by
+ * mmu_notifier_unregister.
+ *
+ * The mmu_notifier_mm can't go away from under us because one mm_count
+ * is held by exit_mmap.
*/
synchronize_srcu(&srcu);
}
@@ -302,31 +299,34 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
{
BUG_ON(atomic_read(&mm->mm_count) <= 0);
- spin_lock(&mm->mmu_notifier_mm->lock);
if (!hlist_unhashed(&mn->hlist)) {
+ /*
+ * SRCU here will force exit_mmap to wait for ->release to
+ * finish before freeing the pages.
+ */
int id;
+ id = srcu_read_lock(&srcu);
/*
- * Ensure we synchronize up with __mmu_notifier_release().
+ * exit_mmap will block in mmu_notifier_release to guarantee
+ * that ->release is called before freeing the pages.
*/
- id = srcu_read_lock(&srcu);
-
- hlist_del_rcu(&mn->hlist);
- spin_unlock(&mm->mmu_notifier_mm->lock);
-
if (mn->ops->release)
mn->ops->release(mn, mm);
+ srcu_read_unlock(&srcu, id);
+ spin_lock(&mm->mmu_notifier_mm->lock);
/*
- * Allow __mmu_notifier_release() to complete.
+ * Can not use list_del_rcu() since __mmu_notifier_release
+ * can delete it before we hold the lock.
*/
- srcu_read_unlock(&srcu, id);
- } else
+ hlist_del_init_rcu(&mn->hlist);
spin_unlock(&mm->mmu_notifier_mm->lock);
+ }
/*
- * Wait for any running method to finish, including ->release() if it
- * was run by __mmu_notifier_release() instead of us.
+ * Wait for any running method to finish, of course including
+ * ->release if it was run by mmu_notifier_relase instead of us.
*/
synchronize_srcu(&srcu);
diff --git a/mm/nommu.c b/mm/nommu.c
index bb8f4f004a82..b0956e39fd1c 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -807,7 +807,7 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
struct vm_area_struct *vma;
/* check the cache first */
- vma = mm->mmap_cache;
+ vma = ACCESS_ONCE(mm->mmap_cache);
if (vma && vma->vm_start <= addr && vma->vm_end > addr)
return vma;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index ef868e299932..16568c9755c5 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5910,6 +5910,10 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
zone->free_area[order].nr_free--;
__mod_zone_page_state(zone, NR_FREE_PAGES,
- (1UL << order));
+#ifdef CONFIG_HIGHMEM
+ if (PageHighMem(page))
+ totalhigh_pages -= 1 << order;
+#endif
for (i = 0; i < (1 << order); i++)
SetPageReserved((page+i));
pfn += (1 << order);
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index aa9701e12714..1090e772a26d 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -127,28 +127,7 @@ static int walk_hugetlb_range(struct vm_area_struct *vma,
return 0;
}
-static struct vm_area_struct* hugetlb_vma(unsigned long addr, struct mm_walk *walk)
-{
- struct vm_area_struct *vma;
-
- /* We don't need vma lookup at all. */
- if (!walk->hugetlb_entry)
- return NULL;
-
- VM_BUG_ON(!rwsem_is_locked(&walk->mm->mmap_sem));
- vma = find_vma(walk->mm, addr);
- if (vma && vma->vm_start <= addr && is_vm_hugetlb_page(vma))
- return vma;
-
- return NULL;
-}
-
#else /* CONFIG_HUGETLB_PAGE */
-static struct vm_area_struct* hugetlb_vma(unsigned long addr, struct mm_walk *walk)
-{
- return NULL;
-}
-
static int walk_hugetlb_range(struct vm_area_struct *vma,
unsigned long addr, unsigned long end,
struct mm_walk *walk)
@@ -199,30 +178,53 @@ int walk_page_range(unsigned long addr, unsigned long end,
if (!walk->mm)
return -EINVAL;
+ VM_BUG_ON(!rwsem_is_locked(&walk->mm->mmap_sem));
+
pgd = pgd_offset(walk->mm, addr);
do {
- struct vm_area_struct *vma;
+ struct vm_area_struct *vma = NULL;
next = pgd_addr_end(addr, end);
/*
- * handle hugetlb vma individually because pagetable walk for
- * the hugetlb page is dependent on the architecture and
- * we can't handled it in the same manner as non-huge pages.
+ * This function was not intended to be vma based.
+ * But there are vma special cases to be handled:
+ * - hugetlb vma's
+ * - VM_PFNMAP vma's
*/
- vma = hugetlb_vma(addr, walk);
+ vma = find_vma(walk->mm, addr);
if (vma) {
- if (vma->vm_end < next)
+ /*
+ * There are no page structures backing a VM_PFNMAP
+ * range, so do not allow split_huge_page_pmd().
+ */
+ if ((vma->vm_start <= addr) &&
+ (vma->vm_flags & VM_PFNMAP)) {
next = vma->vm_end;
+ pgd = pgd_offset(walk->mm, next);
+ continue;
+ }
/*
- * Hugepage is very tightly coupled with vma, so
- * walk through hugetlb entries within a given vma.
+ * Handle hugetlb vma individually because pagetable
+ * walk for the hugetlb page is dependent on the
+ * architecture and we can't handled it in the same
+ * manner as non-huge pages.
*/
- err = walk_hugetlb_range(vma, addr, next, walk);
- if (err)
- break;
- pgd = pgd_offset(walk->mm, next);
- continue;
+ if (walk->hugetlb_entry && (vma->vm_start <= addr) &&
+ is_vm_hugetlb_page(vma)) {
+ if (vma->vm_end < next)
+ next = vma->vm_end;
+ /*
+ * Hugepage is very tightly coupled with vma,
+ * so walk through hugetlb entries within a
+ * given vma.
+ */
+ err = walk_hugetlb_range(vma, addr, next, walk);
+ if (err)
+ break;
+ pgd = pgd_offset(walk->mm, next);
+ continue;
+ }
}
if (pgd_none_or_clear_bad(pgd)) {
diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c
index c20ff48994c2..b63e84a7fa88 100644
--- a/mm/process_vm_access.c
+++ b/mm/process_vm_access.c
@@ -429,12 +429,6 @@ compat_process_vm_rw(compat_pid_t pid,
if (flags != 0)
return -EINVAL;
- if (!access_ok(VERIFY_READ, lvec, liovcnt * sizeof(*lvec)))
- goto out;
-
- if (!access_ok(VERIFY_READ, rvec, riovcnt * sizeof(*rvec)))
- goto out;
-
if (vm_write)
rc = compat_rw_copy_check_uvector(WRITE, lvec, liovcnt,
UIO_FASTIOV, iovstack_l,
@@ -459,8 +453,6 @@ free_iovecs:
kfree(iov_r);
if (iov_l != iovstack_l)
kfree(iov_l);
-
-out:
return rc;
}
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 4c5ff7f284d9..1fa92206829a 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -313,8 +313,24 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
* Swap entry may have been freed since our caller observed it.
*/
err = swapcache_prepare(entry);
- if (err == -EEXIST) { /* seems racy */
+ if (err == -EEXIST) {
radix_tree_preload_end();
+ /*
+ * We might race against get_swap_page() and stumble
+ * across a SWAP_HAS_CACHE swap_map entry whose page
+ * has not been brought into the swapcache yet, while
+ * the other end is scheduled away waiting on discard
+ * I/O completion at scan_swap_map().
+ *
+ * In order to avoid turning this transitory state
+ * into a permanent loop around this -EEXIST case
+ * if !CONFIG_PREEMPT and the I/O completion happens
+ * to be waiting on the CPU waitqueue where we are now
+ * busy looping, we just conditionally invoke the
+ * scheduler here, if there are some more important
+ * tasks to run.
+ */
+ cond_resched();
continue;
}
if (err) { /* swp entry is obsolete ? */