summaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
authorSachin Nikam <snikam@nvidia.com>2013-03-06 19:12:15 +0530
committerSachin Nikam <snikam@nvidia.com>2013-03-06 19:31:07 +0530
commitd3dd6b4227fe3e59df0a55f679ec196e4ccb1e3a (patch)
tree0137a07ecab729a19cea68b671c88def5bf83a2d /mm
parentbb36790494c382ec26250f351c8cea76613f0f02 (diff)
parent2713e2797a78a0fdda765bc5ad7fed41e94818ba (diff)
Merge branch 'linux-3.4.35' into rel-17
Bug 1243631 Change-Id: I915826047b2e20f0ad0a7d75df295c6cbf6e5b0a
Diffstat (limited to 'mm')
-rw-r--r--mm/bootmem.c8
-rw-r--r--mm/compaction.c6
-rw-r--r--mm/dmapool.c31
-rw-r--r--mm/fadvise.c18
-rw-r--r--mm/huge_memory.c3
-rw-r--r--mm/hugetlb.c3
-rw-r--r--mm/memblock.c24
-rw-r--r--mm/memcontrol.c21
-rw-r--r--mm/memory-failure.c8
-rw-r--r--mm/memory.c23
-rw-r--r--mm/memory_hotplug.c16
-rw-r--r--mm/mempolicy.c225
-rw-r--r--mm/mmu_notifier.c147
-rw-r--r--mm/page-writeback.c25
-rw-r--r--mm/page_alloc.c19
-rw-r--r--mm/rmap.c20
-rw-r--r--mm/shmem.c44
-rw-r--r--mm/slab.c6
-rw-r--r--mm/sparse.c10
-rw-r--r--mm/truncate.c3
-rw-r--r--mm/vmscan.c8
21 files changed, 403 insertions, 265 deletions
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 0131170c9d54..53cf62b186b6 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -766,13 +766,17 @@ void * __init alloc_bootmem_section(unsigned long size,
unsigned long section_nr)
{
bootmem_data_t *bdata;
- unsigned long pfn, goal;
+ unsigned long pfn, goal, limit;
pfn = section_nr_to_pfn(section_nr);
goal = pfn << PAGE_SHIFT;
+ limit = section_nr_to_pfn(section_nr + 1) << PAGE_SHIFT;
bdata = &bootmem_node_data[early_pfn_to_nid(pfn)];
- return alloc_bootmem_core(bdata, size, SMP_CACHE_BYTES, goal, 0);
+ if (goal + size > limit)
+ limit = 0;
+
+ return alloc_bootmem_core(bdata, size, SMP_CACHE_BYTES, goal, limit);
}
#endif
diff --git a/mm/compaction.c b/mm/compaction.c
index eede98104e9f..10f9bf96f7f0 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -840,7 +840,7 @@ static int compact_node(int nid)
}
/* Compact all nodes in the system */
-static int compact_nodes(void)
+static void compact_nodes(void)
{
int nid;
@@ -849,8 +849,6 @@ static int compact_nodes(void)
for_each_online_node(nid)
compact_node(nid);
-
- return COMPACT_COMPLETE;
}
/* The written value is actually unused, all memory is compacted */
@@ -861,7 +859,7 @@ int sysctl_compaction_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *length, loff_t *ppos)
{
if (write)
- return compact_nodes();
+ compact_nodes();
return 0;
}
diff --git a/mm/dmapool.c b/mm/dmapool.c
index c5ab33bca0a8..da1b0f0b8709 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -50,7 +50,6 @@ struct dma_pool { /* the pool */
size_t allocation;
size_t boundary;
char name[32];
- wait_queue_head_t waitq;
struct list_head pools;
};
@@ -62,8 +61,6 @@ struct dma_page { /* cacheable header for 'allocation' bytes */
unsigned int offset;
};
-#define POOL_TIMEOUT_JIFFIES ((100 /* msec */ * HZ) / 1000)
-
static DEFINE_MUTEX(pools_lock);
static ssize_t
@@ -172,7 +169,6 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev,
retval->size = size;
retval->boundary = boundary;
retval->allocation = allocation;
- init_waitqueue_head(&retval->waitq);
if (dev) {
int ret;
@@ -227,7 +223,6 @@ static struct dma_page *pool_alloc_page(struct dma_pool *pool, gfp_t mem_flags)
memset(page->vaddr, POOL_POISON_FREED, pool->allocation);
#endif
pool_initialise_page(pool, page);
- list_add(&page->page_list, &pool->page_list);
page->in_use = 0;
page->offset = 0;
} else {
@@ -315,30 +310,21 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags,
might_sleep_if(mem_flags & __GFP_WAIT);
spin_lock_irqsave(&pool->lock, flags);
- restart:
list_for_each_entry(page, &pool->page_list, page_list) {
if (page->offset < pool->allocation)
goto ready;
}
- page = pool_alloc_page(pool, GFP_ATOMIC);
- if (!page) {
- if (mem_flags & __GFP_WAIT) {
- DECLARE_WAITQUEUE(wait, current);
- __set_current_state(TASK_UNINTERRUPTIBLE);
- __add_wait_queue(&pool->waitq, &wait);
- spin_unlock_irqrestore(&pool->lock, flags);
+ /* pool_alloc_page() might sleep, so temporarily drop &pool->lock */
+ spin_unlock_irqrestore(&pool->lock, flags);
- schedule_timeout(POOL_TIMEOUT_JIFFIES);
+ page = pool_alloc_page(pool, mem_flags);
+ if (!page)
+ return NULL;
- spin_lock_irqsave(&pool->lock, flags);
- __remove_wait_queue(&pool->waitq, &wait);
- goto restart;
- }
- retval = NULL;
- goto done;
- }
+ spin_lock_irqsave(&pool->lock, flags);
+ list_add(&page->page_list, &pool->page_list);
ready:
page->in_use++;
offset = page->offset;
@@ -348,7 +334,6 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags,
#ifdef DMAPOOL_DEBUG
memset(retval, POOL_POISON_ALLOCATED, pool->size);
#endif
- done:
spin_unlock_irqrestore(&pool->lock, flags);
return retval;
}
@@ -435,8 +420,6 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma)
page->in_use--;
*(int *)vaddr = page->offset;
page->offset = offset;
- if (waitqueue_active(&pool->waitq))
- wake_up_locked(&pool->waitq);
/*
* Resist a temptation to do
* if (!is_page_busy(page)) pool_free_page(pool, page);
diff --git a/mm/fadvise.c b/mm/fadvise.c
index 469491e0af79..dcb98722a22f 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -17,6 +17,7 @@
#include <linux/fadvise.h>
#include <linux/writeback.h>
#include <linux/syscalls.h>
+#include <linux/swap.h>
#include <asm/unistd.h>
@@ -124,9 +125,22 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice)
start_index = (offset+(PAGE_CACHE_SIZE-1)) >> PAGE_CACHE_SHIFT;
end_index = (endbyte >> PAGE_CACHE_SHIFT);
- if (end_index >= start_index)
- invalidate_mapping_pages(mapping, start_index,
+ if (end_index >= start_index) {
+ unsigned long count = invalidate_mapping_pages(mapping,
+ start_index, end_index);
+
+ /*
+ * If fewer pages were invalidated than expected then
+ * it is possible that some of the pages were on
+ * a per-cpu pagevec for a remote CPU. Drain all
+ * pagevecs and try again.
+ */
+ if (count < (end_index - start_index + 1)) {
+ lru_add_drain_all();
+ invalidate_mapping_pages(mapping, start_index,
end_index);
+ }
+ }
break;
default:
ret = -EINVAL;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index f0e5306eeb55..caf15b6fa753 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -950,6 +950,8 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
count_vm_event(THP_FAULT_FALLBACK);
ret = do_huge_pmd_wp_page_fallback(mm, vma, address,
pmd, orig_pmd, page, haddr);
+ if (ret & VM_FAULT_OOM)
+ split_huge_page(page);
put_page(page);
goto out;
}
@@ -957,6 +959,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
put_page(new_page);
+ split_huge_page(page);
put_page(page);
ret |= VM_FAULT_OOM;
goto out;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index a799df59d310..c384e0901f9c 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2431,7 +2431,8 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
* from page cache lookup which is in HPAGE_SIZE units.
*/
address = address & huge_page_mask(h);
- pgoff = vma_hugecache_offset(h, vma, address);
+ pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) +
+ vma->vm_pgoff;
mapping = vma->vm_file->f_dentry->d_inode->i_mapping;
/*
diff --git a/mm/memblock.c b/mm/memblock.c
index 280d3d7835d6..11e5bd174f3c 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -908,6 +908,30 @@ int __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t si
return memblock_overlaps_region(&memblock.reserved, base, size) >= 0;
}
+void __init_memblock memblock_trim_memory(phys_addr_t align)
+{
+ int i;
+ phys_addr_t start, end, orig_start, orig_end;
+ struct memblock_type *mem = &memblock.memory;
+
+ for (i = 0; i < mem->cnt; i++) {
+ orig_start = mem->regions[i].base;
+ orig_end = mem->regions[i].base + mem->regions[i].size;
+ start = round_up(orig_start, align);
+ end = round_down(orig_end, align);
+
+ if (start == orig_start && end == orig_end)
+ continue;
+
+ if (start < end) {
+ mem->regions[i].base = start;
+ mem->regions[i].size = end - start;
+ } else {
+ memblock_remove_region(mem, i);
+ i--;
+ }
+ }
+}
void __init_memblock memblock_set_current_limit(phys_addr_t limit)
{
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 7685d4a0b3ce..81c275b3afed 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1489,17 +1489,26 @@ static int mem_cgroup_count_children(struct mem_cgroup *memcg)
u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
{
u64 limit;
- u64 memsw;
limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
- limit += total_swap_pages << PAGE_SHIFT;
- memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
/*
- * If memsw is finite and limits the amount of swap space available
- * to this memcg, return that limit.
+ * Do not consider swap space if we cannot swap due to swappiness
*/
- return min(limit, memsw);
+ if (mem_cgroup_swappiness(memcg)) {
+ u64 memsw;
+
+ limit += total_swap_pages << PAGE_SHIFT;
+ memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
+
+ /*
+ * If memsw is finite and limits the amount of swap space
+ * available to this memcg, return that limit.
+ */
+ limit = min(limit, memsw);
+ }
+
+ return limit;
}
static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg,
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 9299eea77f72..b68c8f66e4d3 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1481,9 +1481,17 @@ int soft_offline_page(struct page *page, int flags)
{
int ret;
unsigned long pfn = page_to_pfn(page);
+ struct page *hpage = compound_trans_head(page);
if (PageHuge(page))
return soft_offline_huge_page(page, flags);
+ if (PageTransHuge(hpage)) {
+ if (PageAnon(hpage) && unlikely(split_huge_page(hpage))) {
+ pr_info("soft offline: %#lx: failed to split THP\n",
+ pfn);
+ return -EBUSY;
+ }
+ }
ret = get_any_page(page, pfn, flags);
if (ret < 0)
diff --git a/mm/memory.c b/mm/memory.c
index 6105f475fa86..2f42aab50e9b 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -182,10 +182,14 @@ static int tlb_next_batch(struct mmu_gather *tlb)
return 1;
}
+ if (tlb->batch_count == MAX_GATHER_BATCH_COUNT)
+ return 0;
+
batch = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0);
if (!batch)
return 0;
+ tlb->batch_count++;
batch->next = NULL;
batch->nr = 0;
batch->max = MAX_GATHER_BATCH;
@@ -212,6 +216,7 @@ void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, bool fullmm)
tlb->local.nr = 0;
tlb->local.max = ARRAY_SIZE(tlb->__pages);
tlb->active = &tlb->local;
+ tlb->batch_count = 0;
#ifdef CONFIG_HAVE_RCU_TABLE_FREE
tlb->batch = NULL;
@@ -3489,6 +3494,7 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
if (unlikely(is_vm_hugetlb_page(vma)))
return hugetlb_fault(mm, vma, address, flags);
+retry:
pgd = pgd_offset(mm, address);
pud = pud_alloc(mm, pgd, address);
if (!pud)
@@ -3502,13 +3508,24 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
pmd, flags);
} else {
pmd_t orig_pmd = *pmd;
+ int ret;
+
barrier();
if (pmd_trans_huge(orig_pmd)) {
if (flags & FAULT_FLAG_WRITE &&
!pmd_write(orig_pmd) &&
- !pmd_trans_splitting(orig_pmd))
- return do_huge_pmd_wp_page(mm, vma, address,
- pmd, orig_pmd);
+ !pmd_trans_splitting(orig_pmd)) {
+ ret = do_huge_pmd_wp_page(mm, vma, address, pmd,
+ orig_pmd);
+ /*
+ * If COW results in an oom, the huge pmd will
+ * have been split, so retry the fault on the
+ * pte for a smaller charge.
+ */
+ if (unlikely(ret & VM_FAULT_OOM))
+ goto retry;
+ return ret;
+ }
return 0;
}
}
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index fc898cb4fe8f..77ad30613a3d 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -127,9 +127,6 @@ static void register_page_bootmem_info_section(unsigned long start_pfn)
struct mem_section *ms;
struct page *page, *memmap;
- if (!pfn_valid(start_pfn))
- return;
-
section_nr = pfn_to_section_nr(start_pfn);
ms = __nr_to_section(section_nr);
@@ -188,9 +185,16 @@ void register_page_bootmem_info_node(struct pglist_data *pgdat)
end_pfn = pfn + pgdat->node_spanned_pages;
/* register_section info */
- for (; pfn < end_pfn; pfn += PAGES_PER_SECTION)
- register_page_bootmem_info_section(pfn);
-
+ for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
+ /*
+ * Some platforms can assign the same pfn to multiple nodes - on
+ * node0 as well as nodeN. To avoid registering a pfn against
+ * multiple nodes we check that this pfn does not already
+ * reside in some other node.
+ */
+ if (pfn_valid(pfn) && (pfn_to_nid(pfn) == node))
+ register_page_bootmem_info_section(pfn);
+ }
}
#endif /* !CONFIG_SPARSEMEM_VMEMMAP */
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index bf5b485ecd31..82f1b027ba1c 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -607,6 +607,42 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
return first;
}
+/*
+ * Apply policy to a single VMA
+ * This must be called with the mmap_sem held for writing.
+ */
+static int vma_replace_policy(struct vm_area_struct *vma,
+ struct mempolicy *pol)
+{
+ int err;
+ struct mempolicy *old;
+ struct mempolicy *new;
+
+ pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
+ vma->vm_start, vma->vm_end, vma->vm_pgoff,
+ vma->vm_ops, vma->vm_file,
+ vma->vm_ops ? vma->vm_ops->set_policy : NULL);
+
+ new = mpol_dup(pol);
+ if (IS_ERR(new))
+ return PTR_ERR(new);
+
+ if (vma->vm_ops && vma->vm_ops->set_policy) {
+ err = vma->vm_ops->set_policy(vma, new);
+ if (err)
+ goto err_out;
+ }
+
+ old = vma->vm_policy;
+ vma->vm_policy = new; /* protected by mmap_sem */
+ mpol_put(old);
+
+ return 0;
+ err_out:
+ mpol_put(new);
+ return err;
+}
+
/* Step 2: apply policy to a range and do splits. */
static int mbind_range(struct mm_struct *mm, unsigned long start,
unsigned long end, struct mempolicy *new_pol)
@@ -655,23 +691,9 @@ static int mbind_range(struct mm_struct *mm, unsigned long start,
if (err)
goto out;
}
-
- /*
- * Apply policy to a single VMA. The reference counting of
- * policy for vma_policy linkages has already been handled by
- * vma_merge and split_vma as necessary. If this is a shared
- * policy then ->set_policy will increment the reference count
- * for an sp node.
- */
- pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
- vma->vm_start, vma->vm_end, vma->vm_pgoff,
- vma->vm_ops, vma->vm_file,
- vma->vm_ops ? vma->vm_ops->set_policy : NULL);
- if (vma->vm_ops && vma->vm_ops->set_policy) {
- err = vma->vm_ops->set_policy(vma, new_pol);
- if (err)
- goto out;
- }
+ err = vma_replace_policy(vma, new_pol);
+ if (err)
+ goto out;
}
out:
@@ -1510,8 +1532,18 @@ struct mempolicy *get_vma_policy(struct task_struct *task,
addr);
if (vpol)
pol = vpol;
- } else if (vma->vm_policy)
+ } else if (vma->vm_policy) {
pol = vma->vm_policy;
+
+ /*
+ * shmem_alloc_page() passes MPOL_F_SHARED policy with
+ * a pseudo vma whose vma->vm_ops=NULL. Take a reference
+ * count on these policies which will be dropped by
+ * mpol_cond_put() later
+ */
+ if (mpol_needs_cond_ref(pol))
+ mpol_get(pol);
+ }
}
if (!pol)
pol = &default_policy;
@@ -1977,28 +2009,6 @@ struct mempolicy *__mpol_dup(struct mempolicy *old)
return new;
}
-/*
- * If *frompol needs [has] an extra ref, copy *frompol to *tompol ,
- * eliminate the * MPOL_F_* flags that require conditional ref and
- * [NOTE!!!] drop the extra ref. Not safe to reference *frompol directly
- * after return. Use the returned value.
- *
- * Allows use of a mempolicy for, e.g., multiple allocations with a single
- * policy lookup, even if the policy needs/has extra ref on lookup.
- * shmem_readahead needs this.
- */
-struct mempolicy *__mpol_cond_copy(struct mempolicy *tompol,
- struct mempolicy *frompol)
-{
- if (!mpol_needs_cond_ref(frompol))
- return frompol;
-
- *tompol = *frompol;
- tompol->flags &= ~MPOL_F_SHARED; /* copy doesn't need unref */
- __mpol_put(frompol);
- return tompol;
-}
-
/* Slow path of a mempolicy comparison */
bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
{
@@ -2035,7 +2045,7 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
*/
/* lookup first element intersecting start-end */
-/* Caller holds sp->lock */
+/* Caller holds sp->mutex */
static struct sp_node *
sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
{
@@ -2099,36 +2109,50 @@ mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
if (!sp->root.rb_node)
return NULL;
- spin_lock(&sp->lock);
+ mutex_lock(&sp->mutex);
sn = sp_lookup(sp, idx, idx+1);
if (sn) {
mpol_get(sn->policy);
pol = sn->policy;
}
- spin_unlock(&sp->lock);
+ mutex_unlock(&sp->mutex);
return pol;
}
+static void sp_free(struct sp_node *n)
+{
+ mpol_put(n->policy);
+ kmem_cache_free(sn_cache, n);
+}
+
static void sp_delete(struct shared_policy *sp, struct sp_node *n)
{
pr_debug("deleting %lx-l%lx\n", n->start, n->end);
rb_erase(&n->nd, &sp->root);
- mpol_put(n->policy);
- kmem_cache_free(sn_cache, n);
+ sp_free(n);
}
static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
struct mempolicy *pol)
{
- struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
+ struct sp_node *n;
+ struct mempolicy *newpol;
+ n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
if (!n)
return NULL;
+
+ newpol = mpol_dup(pol);
+ if (IS_ERR(newpol)) {
+ kmem_cache_free(sn_cache, n);
+ return NULL;
+ }
+ newpol->flags |= MPOL_F_SHARED;
+
n->start = start;
n->end = end;
- mpol_get(pol);
- pol->flags |= MPOL_F_SHARED; /* for unref */
- n->policy = pol;
+ n->policy = newpol;
+
return n;
}
@@ -2136,10 +2160,10 @@ static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
unsigned long end, struct sp_node *new)
{
- struct sp_node *n, *new2 = NULL;
+ struct sp_node *n;
+ int ret = 0;
-restart:
- spin_lock(&sp->lock);
+ mutex_lock(&sp->mutex);
n = sp_lookup(sp, start, end);
/* Take care of old policies in the same range. */
while (n && n->start < end) {
@@ -2152,16 +2176,14 @@ restart:
} else {
/* Old policy spanning whole new range. */
if (n->end > end) {
+ struct sp_node *new2;
+ new2 = sp_alloc(end, n->end, n->policy);
if (!new2) {
- spin_unlock(&sp->lock);
- new2 = sp_alloc(end, n->end, n->policy);
- if (!new2)
- return -ENOMEM;
- goto restart;
+ ret = -ENOMEM;
+ goto out;
}
n->end = start;
sp_insert(sp, new2);
- new2 = NULL;
break;
} else
n->end = start;
@@ -2172,12 +2194,9 @@ restart:
}
if (new)
sp_insert(sp, new);
- spin_unlock(&sp->lock);
- if (new2) {
- mpol_put(new2->policy);
- kmem_cache_free(sn_cache, new2);
- }
- return 0;
+out:
+ mutex_unlock(&sp->mutex);
+ return ret;
}
/**
@@ -2195,7 +2214,7 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
int ret;
sp->root = RB_ROOT; /* empty tree == default mempolicy */
- spin_lock_init(&sp->lock);
+ mutex_init(&sp->mutex);
if (mpol) {
struct vm_area_struct pvma;
@@ -2249,7 +2268,7 @@ int mpol_set_shared_policy(struct shared_policy *info,
}
err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
if (err && new)
- kmem_cache_free(sn_cache, new);
+ sp_free(new);
return err;
}
@@ -2261,16 +2280,14 @@ void mpol_free_shared_policy(struct shared_policy *p)
if (!p->root.rb_node)
return;
- spin_lock(&p->lock);
+ mutex_lock(&p->mutex);
next = rb_first(&p->root);
while (next) {
n = rb_entry(next, struct sp_node, nd);
next = rb_next(&n->nd);
- rb_erase(&n->nd, &p->root);
- mpol_put(n->policy);
- kmem_cache_free(sn_cache, n);
+ sp_delete(p, n);
}
- spin_unlock(&p->lock);
+ mutex_unlock(&p->mutex);
}
/* assumes fs == KERNEL_DS */
@@ -2327,8 +2344,7 @@ void numa_default_policy(void)
*/
/*
- * "local" is pseudo-policy: MPOL_PREFERRED with MPOL_F_LOCAL flag
- * Used only for mpol_parse_str() and mpol_to_str()
+ * "local" is implemented internally by MPOL_PREFERRED with MPOL_F_LOCAL flag.
*/
#define MPOL_LOCAL MPOL_MAX
static const char * const policy_modes[] =
@@ -2343,28 +2359,21 @@ static const char * const policy_modes[] =
#ifdef CONFIG_TMPFS
/**
- * mpol_parse_str - parse string to mempolicy
+ * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
* @str: string containing mempolicy to parse
* @mpol: pointer to struct mempolicy pointer, returned on success.
- * @no_context: flag whether to "contextualize" the mempolicy
+ * @unused: redundant argument, to be removed later.
*
* Format of input:
* <mode>[=<flags>][:<nodelist>]
*
- * if @no_context is true, save the input nodemask in w.user_nodemask in
- * the returned mempolicy. This will be used to "clone" the mempolicy in
- * a specific context [cpuset] at a later time. Used to parse tmpfs mpol
- * mount option. Note that if 'static' or 'relative' mode flags were
- * specified, the input nodemask will already have been saved. Saving
- * it again is redundant, but safe.
- *
* On success, returns 0, else 1
*/
-int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
+int mpol_parse_str(char *str, struct mempolicy **mpol, int unused)
{
struct mempolicy *new = NULL;
unsigned short mode;
- unsigned short uninitialized_var(mode_flags);
+ unsigned short mode_flags;
nodemask_t nodes;
char *nodelist = strchr(str, ':');
char *flags = strchr(str, '=');
@@ -2452,24 +2461,23 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
if (IS_ERR(new))
goto out;
- if (no_context) {
- /* save for contextualization */
- new->w.user_nodemask = nodes;
- } else {
- int ret;
- NODEMASK_SCRATCH(scratch);
- if (scratch) {
- task_lock(current);
- ret = mpol_set_nodemask(new, &nodes, scratch);
- task_unlock(current);
- } else
- ret = -ENOMEM;
- NODEMASK_SCRATCH_FREE(scratch);
- if (ret) {
- mpol_put(new);
- goto out;
- }
- }
+ /*
+ * Save nodes for mpol_to_str() to show the tmpfs mount options
+ * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
+ */
+ if (mode != MPOL_PREFERRED)
+ new->v.nodes = nodes;
+ else if (nodelist)
+ new->v.preferred_node = first_node(nodes);
+ else
+ new->flags |= MPOL_F_LOCAL;
+
+ /*
+ * Save nodes for contextualization: this will be used to "clone"
+ * the mempolicy in a specific context [cpuset] at a later time.
+ */
+ new->w.user_nodemask = nodes;
+
err = 0;
out:
@@ -2489,13 +2497,13 @@ out:
* @buffer: to contain formatted mempolicy string
* @maxlen: length of @buffer
* @pol: pointer to mempolicy to be formatted
- * @no_context: "context free" mempolicy - use nodemask in w.user_nodemask
+ * @unused: redundant argument, to be removed later.
*
* Convert a mempolicy into a string.
* Returns the number of characters in buffer (if positive)
* or an error (negative)
*/
-int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context)
+int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int unused)
{
char *p = buffer;
int l;
@@ -2521,7 +2529,7 @@ int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context)
case MPOL_PREFERRED:
nodes_clear(nodes);
if (flags & MPOL_F_LOCAL)
- mode = MPOL_LOCAL; /* pseudo-policy */
+ mode = MPOL_LOCAL;
else
node_set(pol->v.preferred_node, nodes);
break;
@@ -2529,14 +2537,11 @@ int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context)
case MPOL_BIND:
/* Fall through */
case MPOL_INTERLEAVE:
- if (no_context)
- nodes = pol->w.user_nodemask;
- else
- nodes = pol->v.nodes;
+ nodes = pol->v.nodes;
break;
default:
- BUG();
+ return -EINVAL;
}
l = strlen(policy_modes[mode]);
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 862b60822d9f..8d1ca2de4080 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -14,10 +14,14 @@
#include <linux/export.h>
#include <linux/mm.h>
#include <linux/err.h>
+#include <linux/srcu.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/slab.h>
+/* global SRCU for all MMs */
+static struct srcu_struct srcu;
+
/*
* This function can't run concurrently against mmu_notifier_register
* because mm->mm_users > 0 during mmu_notifier_register and exit_mmap
@@ -25,58 +29,61 @@
* in parallel despite there being no task using this mm any more,
* through the vmas outside of the exit_mmap context, such as with
* vmtruncate. This serializes against mmu_notifier_unregister with
- * the mmu_notifier_mm->lock in addition to RCU and it serializes
- * against the other mmu notifiers with RCU. struct mmu_notifier_mm
+ * the mmu_notifier_mm->lock in addition to SRCU and it serializes
+ * against the other mmu notifiers with SRCU. struct mmu_notifier_mm
* can't go away from under us as exit_mmap holds an mm_count pin
* itself.
*/
void __mmu_notifier_release(struct mm_struct *mm)
{
struct mmu_notifier *mn;
- struct hlist_node *n;
+ int id;
/*
- * RCU here will block mmu_notifier_unregister until
- * ->release returns.
+ * srcu_read_lock() here will block synchronize_srcu() in
+ * mmu_notifier_unregister() until all registered
+ * ->release() callouts this function makes have
+ * returned.
*/
- rcu_read_lock();
- hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist)
- /*
- * if ->release runs before mmu_notifier_unregister it
- * must be handled as it's the only way for the driver
- * to flush all existing sptes and stop the driver
- * from establishing any more sptes before all the
- * pages in the mm are freed.
- */
- if (mn->ops->release)
- mn->ops->release(mn, mm);
- rcu_read_unlock();
-
+ id = srcu_read_lock(&srcu);
spin_lock(&mm->mmu_notifier_mm->lock);
while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) {
mn = hlist_entry(mm->mmu_notifier_mm->list.first,
struct mmu_notifier,
hlist);
+
/*
- * We arrived before mmu_notifier_unregister so
- * mmu_notifier_unregister will do nothing other than
- * to wait ->release to finish and
- * mmu_notifier_unregister to return.
+ * Unlink. This will prevent mmu_notifier_unregister()
+ * from also making the ->release() callout.
*/
hlist_del_init_rcu(&mn->hlist);
+ spin_unlock(&mm->mmu_notifier_mm->lock);
+
+ /*
+ * Clear sptes. (see 'release' description in mmu_notifier.h)
+ */
+ if (mn->ops->release)
+ mn->ops->release(mn, mm);
+
+ spin_lock(&mm->mmu_notifier_mm->lock);
}
spin_unlock(&mm->mmu_notifier_mm->lock);
/*
- * synchronize_rcu here prevents mmu_notifier_release to
- * return to exit_mmap (which would proceed freeing all pages
- * in the mm) until the ->release method returns, if it was
- * invoked by mmu_notifier_unregister.
- *
- * The mmu_notifier_mm can't go away from under us because one
- * mm_count is hold by exit_mmap.
+ * All callouts to ->release() which we have done are complete.
+ * Allow synchronize_srcu() in mmu_notifier_unregister() to complete
+ */
+ srcu_read_unlock(&srcu, id);
+
+ /*
+ * mmu_notifier_unregister() may have unlinked a notifier and may
+ * still be calling out to it. Additionally, other notifiers
+ * may have been active via vmtruncate() et. al. Block here
+ * to ensure that all notifier callouts for this mm have been
+ * completed and the sptes are really cleaned up before returning
+ * to exit_mmap().
*/
- synchronize_rcu();
+ synchronize_srcu(&srcu);
}
/*
@@ -89,14 +96,14 @@ int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
{
struct mmu_notifier *mn;
struct hlist_node *n;
- int young = 0;
+ int young = 0, id;
- rcu_read_lock();
+ id = srcu_read_lock(&srcu);
hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
if (mn->ops->clear_flush_young)
young |= mn->ops->clear_flush_young(mn, mm, address);
}
- rcu_read_unlock();
+ srcu_read_unlock(&srcu, id);
return young;
}
@@ -106,9 +113,9 @@ int __mmu_notifier_test_young(struct mm_struct *mm,
{
struct mmu_notifier *mn;
struct hlist_node *n;
- int young = 0;
+ int young = 0, id;
- rcu_read_lock();
+ id = srcu_read_lock(&srcu);
hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
if (mn->ops->test_young) {
young = mn->ops->test_young(mn, mm, address);
@@ -116,7 +123,7 @@ int __mmu_notifier_test_young(struct mm_struct *mm,
break;
}
}
- rcu_read_unlock();
+ srcu_read_unlock(&srcu, id);
return young;
}
@@ -126,8 +133,9 @@ void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address,
{
struct mmu_notifier *mn;
struct hlist_node *n;
+ int id;
- rcu_read_lock();
+ id = srcu_read_lock(&srcu);
hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
if (mn->ops->change_pte)
mn->ops->change_pte(mn, mm, address, pte);
@@ -138,7 +146,7 @@ void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address,
else if (mn->ops->invalidate_page)
mn->ops->invalidate_page(mn, mm, address);
}
- rcu_read_unlock();
+ srcu_read_unlock(&srcu, id);
}
void __mmu_notifier_invalidate_page(struct mm_struct *mm,
@@ -146,13 +154,14 @@ void __mmu_notifier_invalidate_page(struct mm_struct *mm,
{
struct mmu_notifier *mn;
struct hlist_node *n;
+ int id;
- rcu_read_lock();
+ id = srcu_read_lock(&srcu);
hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
if (mn->ops->invalidate_page)
mn->ops->invalidate_page(mn, mm, address);
}
- rcu_read_unlock();
+ srcu_read_unlock(&srcu, id);
}
void __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
@@ -160,13 +169,14 @@ void __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
{
struct mmu_notifier *mn;
struct hlist_node *n;
+ int id;
- rcu_read_lock();
+ id = srcu_read_lock(&srcu);
hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
if (mn->ops->invalidate_range_start)
mn->ops->invalidate_range_start(mn, mm, start, end);
}
- rcu_read_unlock();
+ srcu_read_unlock(&srcu, id);
}
void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
@@ -174,13 +184,14 @@ void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
{
struct mmu_notifier *mn;
struct hlist_node *n;
+ int id;
- rcu_read_lock();
+ id = srcu_read_lock(&srcu);
hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
if (mn->ops->invalidate_range_end)
mn->ops->invalidate_range_end(mn, mm, start, end);
}
- rcu_read_unlock();
+ srcu_read_unlock(&srcu, id);
}
static int do_mmu_notifier_register(struct mmu_notifier *mn,
@@ -192,6 +203,12 @@ static int do_mmu_notifier_register(struct mmu_notifier *mn,
BUG_ON(atomic_read(&mm->mm_users) <= 0);
+ /*
+ * Verify that mmu_notifier_init() already run and the global srcu is
+ * initialized.
+ */
+ BUG_ON(!srcu.per_cpu_ref);
+
ret = -ENOMEM;
mmu_notifier_mm = kmalloc(sizeof(struct mmu_notifier_mm), GFP_KERNEL);
if (unlikely(!mmu_notifier_mm))
@@ -274,8 +291,8 @@ void __mmu_notifier_mm_destroy(struct mm_struct *mm)
/*
* This releases the mm_count pin automatically and frees the mm
* structure if it was the last user of it. It serializes against
- * running mmu notifiers with RCU and against mmu_notifier_unregister
- * with the unregister lock + RCU. All sptes must be dropped before
+ * running mmu notifiers with SRCU and against mmu_notifier_unregister
+ * with the unregister lock + SRCU. All sptes must be dropped before
* calling mmu_notifier_unregister. ->release or any other notifier
* method may be invoked concurrently with mmu_notifier_unregister,
* and only after mmu_notifier_unregister returned we're guaranteed
@@ -285,35 +302,43 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
{
BUG_ON(atomic_read(&mm->mm_count) <= 0);
+ spin_lock(&mm->mmu_notifier_mm->lock);
if (!hlist_unhashed(&mn->hlist)) {
- /*
- * RCU here will force exit_mmap to wait ->release to finish
- * before freeing the pages.
- */
- rcu_read_lock();
+ int id;
/*
- * exit_mmap will block in mmu_notifier_release to
- * guarantee ->release is called before freeing the
- * pages.
+ * Ensure we synchronize up with __mmu_notifier_release().
*/
+ id = srcu_read_lock(&srcu);
+
+ hlist_del_rcu(&mn->hlist);
+ spin_unlock(&mm->mmu_notifier_mm->lock);
+
if (mn->ops->release)
mn->ops->release(mn, mm);
- rcu_read_unlock();
- spin_lock(&mm->mmu_notifier_mm->lock);
- hlist_del_rcu(&mn->hlist);
+ /*
+ * Allow __mmu_notifier_release() to complete.
+ */
+ srcu_read_unlock(&srcu, id);
+ } else
spin_unlock(&mm->mmu_notifier_mm->lock);
- }
/*
- * Wait any running method to finish, of course including
- * ->release if it was run by mmu_notifier_relase instead of us.
+ * Wait for any running method to finish, including ->release() if it
+ * was run by __mmu_notifier_release() instead of us.
*/
- synchronize_rcu();
+ synchronize_srcu(&srcu);
BUG_ON(atomic_read(&mm->mm_count) <= 0);
mmdrop(mm);
}
EXPORT_SYMBOL_GPL(mmu_notifier_unregister);
+
+static int __init mmu_notifier_init(void)
+{
+ return init_srcu_struct(&srcu);
+}
+
+module_init(mmu_notifier_init);
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 26adea8ca2e7..bc8465f579a8 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -187,6 +187,18 @@ static unsigned long highmem_dirtyable_memory(unsigned long total)
zone_reclaimable_pages(z) - z->dirty_balance_reserve;
}
/*
+ * Unreclaimable memory (kernel memory or anonymous memory
+ * without swap) can bring down the dirtyable pages below
+ * the zone's dirty balance reserve and the above calculation
+ * will underflow. However we still want to add in nodes
+ * which are below threshold (negative values) to get a more
+ * accurate calculation but make sure that the total never
+ * underflows.
+ */
+ if ((long)x < 0)
+ x = 0;
+
+ /*
* Make sure that the number of highmem pages is never larger
* than the number of the total dirtyable memory. This can only
* occur in very strange VM situations but we want to make sure
@@ -208,8 +220,8 @@ unsigned long global_dirtyable_memory(void)
{
unsigned long x;
- x = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages() -
- dirty_balance_reserve;
+ x = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages();
+ x -= min(x, dirty_balance_reserve);
if (!vm_highmem_is_dirtyable)
x -= highmem_dirtyable_memory(x);
@@ -276,9 +288,12 @@ static unsigned long zone_dirtyable_memory(struct zone *zone)
* highmem zone can hold its share of dirty pages, so we don't
* care about vm_highmem_is_dirtyable here.
*/
- return zone_page_state(zone, NR_FREE_PAGES) +
- zone_reclaimable_pages(zone) -
- zone->dirty_balance_reserve;
+ unsigned long nr_pages = zone_page_state(zone, NR_FREE_PAGES) +
+ zone_reclaimable_pages(zone);
+
+ /* don't allow this to underflow */
+ nr_pages -= min(nr_pages, zone->dirty_balance_reserve);
+ return nr_pages;
}
/**
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 0f20f1c354ef..ef868e299932 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -581,7 +581,7 @@ static inline void __free_one_page(struct page *page,
combined_idx = buddy_idx & page_idx;
higher_page = page + (combined_idx - page_idx);
buddy_idx = __find_buddy_index(combined_idx, order + 1);
- higher_buddy = page + (buddy_idx - combined_idx);
+ higher_buddy = higher_page + (buddy_idx - combined_idx);
if (page_is_buddy(higher_page, higher_buddy, order + 1)) {
list_add_tail(&page->lru,
&zone->free_area[order].free_list[migratetype]);
@@ -4273,10 +4273,11 @@ static void __meminit calculate_node_totalpages(struct pglist_data *pgdat,
* round what is now in bits to nearest long in bits, then return it in
* bytes.
*/
-static unsigned long __init usemap_size(unsigned long zonesize)
+static unsigned long __init usemap_size(unsigned long zone_start_pfn, unsigned long zonesize)
{
unsigned long usemapsize;
+ zonesize += zone_start_pfn & (pageblock_nr_pages-1);
usemapsize = roundup(zonesize, pageblock_nr_pages);
usemapsize = usemapsize >> pageblock_order;
usemapsize *= NR_PAGEBLOCK_BITS;
@@ -4286,17 +4287,19 @@ static unsigned long __init usemap_size(unsigned long zonesize)
}
static void __init setup_usemap(struct pglist_data *pgdat,
- struct zone *zone, unsigned long zonesize)
+ struct zone *zone,
+ unsigned long zone_start_pfn,
+ unsigned long zonesize)
{
- unsigned long usemapsize = usemap_size(zonesize);
+ unsigned long usemapsize = usemap_size(zone_start_pfn, zonesize);
zone->pageblock_flags = NULL;
if (usemapsize)
zone->pageblock_flags = alloc_bootmem_node_nopanic(pgdat,
usemapsize);
}
#else
-static inline void setup_usemap(struct pglist_data *pgdat,
- struct zone *zone, unsigned long zonesize) {}
+static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone,
+ unsigned long zone_start_pfn, unsigned long zonesize) {}
#endif /* CONFIG_SPARSEMEM */
#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
@@ -4424,7 +4427,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
continue;
set_pageblock_order(pageblock_default_order());
- setup_usemap(pgdat, zone, size);
+ setup_usemap(pgdat, zone, zone_start_pfn, size);
ret = init_currently_empty_zone(zone, zone_start_pfn,
size, MEMMAP_EARLY);
BUG_ON(ret);
@@ -5409,7 +5412,7 @@ static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn)
pfn &= (PAGES_PER_SECTION-1);
return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
#else
- pfn = pfn - zone->zone_start_pfn;
+ pfn = pfn - round_down(zone->zone_start_pfn, pageblock_nr_pages);
return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
#endif /* CONFIG_SPARSEMEM */
}
diff --git a/mm/rmap.c b/mm/rmap.c
index 5b5ad584ffb7..bfca52c96999 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -56,6 +56,7 @@
#include <linux/mmu_notifier.h>
#include <linux/migrate.h>
#include <linux/hugetlb.h>
+#include <linux/backing-dev.h>
#include <asm/tlbflush.h>
@@ -977,11 +978,8 @@ int page_mkclean(struct page *page)
if (page_mapped(page)) {
struct address_space *mapping = page_mapping(page);
- if (mapping) {
+ if (mapping)
ret = page_mkclean_file(mapping, page);
- if (page_test_and_clear_dirty(page_to_pfn(page), 1))
- ret = 1;
- }
}
return ret;
@@ -1167,6 +1165,7 @@ void page_add_file_rmap(struct page *page)
*/
void page_remove_rmap(struct page *page)
{
+ struct address_space *mapping = page_mapping(page);
bool anon = PageAnon(page);
bool locked;
unsigned long flags;
@@ -1189,8 +1188,19 @@ void page_remove_rmap(struct page *page)
* this if the page is anon, so about to be freed; but perhaps
* not if it's in swapcache - there might be another pte slot
* containing the swap entry, but page not yet written to swap.
+ *
+ * And we can skip it on file pages, so long as the filesystem
+ * participates in dirty tracking; but need to catch shm and tmpfs
+ * and ramfs pages which have been modified since creation by read
+ * fault.
+ *
+ * Note that mapping must be decided above, before decrementing
+ * mapcount (which luckily provides a barrier): once page is unmapped,
+ * it could be truncated and page->mapping reset to NULL at any moment.
+ * Note also that we are relying on page_mapping(page) to set mapping
+ * to &swapper_space when PageSwapCache(page).
*/
- if ((!anon || PageSwapCache(page)) &&
+ if (mapping && !mapping_cap_account_dirty(mapping) &&
page_test_and_clear_dirty(page_to_pfn(page), 1))
set_page_dirty(page);
/*
diff --git a/mm/shmem.c b/mm/shmem.c
index 2910f0d58c46..448956691e6b 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -595,7 +595,7 @@ static void shmem_evict_inode(struct inode *inode)
kfree(xattr->name);
kfree(xattr);
}
- BUG_ON(inode->i_blocks);
+ WARN_ON(inode->i_blocks);
shmem_free_inode(inode->i_sb);
end_writeback(inode);
}
@@ -798,24 +798,28 @@ static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
struct shmem_inode_info *info, pgoff_t index)
{
- struct mempolicy mpol, *spol;
struct vm_area_struct pvma;
-
- spol = mpol_cond_copy(&mpol,
- mpol_shared_policy_lookup(&info->policy, index));
+ struct page *page;
/* Create a pseudo vma that just contains the policy */
pvma.vm_start = 0;
pvma.vm_pgoff = index;
pvma.vm_ops = NULL;
- pvma.vm_policy = spol;
- return swapin_readahead(swap, gfp, &pvma, 0);
+ pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index);
+
+ page = swapin_readahead(swap, gfp, &pvma, 0);
+
+ /* Drop reference taken by mpol_shared_policy_lookup() */
+ mpol_cond_put(pvma.vm_policy);
+
+ return page;
}
static struct page *shmem_alloc_page(gfp_t gfp,
struct shmem_inode_info *info, pgoff_t index)
{
struct vm_area_struct pvma;
+ struct page *page;
/* Create a pseudo vma that just contains the policy */
pvma.vm_start = 0;
@@ -823,10 +827,12 @@ static struct page *shmem_alloc_page(gfp_t gfp,
pvma.vm_ops = NULL;
pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index);
- /*
- * alloc_page_vma() will drop the shared policy reference
- */
- return alloc_page_vma(gfp, &pvma, 0);
+ page = alloc_page_vma(gfp, &pvma, 0);
+
+ /* Drop reference taken by mpol_shared_policy_lookup() */
+ mpol_cond_put(pvma.vm_policy);
+
+ return page;
}
#else /* !CONFIG_NUMA */
#ifdef CONFIG_TMPFS
@@ -2018,12 +2024,14 @@ static struct dentry *shmem_fh_to_dentry(struct super_block *sb,
{
struct inode *inode;
struct dentry *dentry = NULL;
- u64 inum = fid->raw[2];
- inum = (inum << 32) | fid->raw[1];
+ u64 inum;
if (fh_len < 3)
return NULL;
+ inum = fid->raw[2];
+ inum = (inum << 32) | fid->raw[1];
+
inode = ilookup5(sb, (unsigned long)(inum + fid->raw[0]),
shmem_match, fid->raw);
if (inode) {
@@ -2169,6 +2177,7 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
unsigned long inodes;
int error = -EINVAL;
+ config.mpol = NULL;
if (shmem_parse_options(data, &config, true))
return error;
@@ -2193,8 +2202,13 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
sbinfo->max_inodes = config.max_inodes;
sbinfo->free_inodes = config.max_inodes - inodes;
- mpol_put(sbinfo->mpol);
- sbinfo->mpol = config.mpol; /* transfers initial ref */
+ /*
+ * Preserve previous mempolicy unless mpol remount option was specified.
+ */
+ if (config.mpol) {
+ mpol_put(sbinfo->mpol);
+ sbinfo->mpol = config.mpol; /* transfers initial ref */
+ }
out:
spin_unlock(&sbinfo->stat_lock);
return error;
diff --git a/mm/slab.c b/mm/slab.c
index e901a36e2520..da2bb689a008 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1685,9 +1685,6 @@ void __init kmem_cache_init_late(void)
g_cpucache_up = LATE;
- /* Annotate slab for lockdep -- annotate the malloc caches */
- init_lock_keys();
-
/* 6) resize the head arrays to their final sizes */
mutex_lock(&cache_chain_mutex);
list_for_each_entry(cachep, &cache_chain, next)
@@ -1695,6 +1692,9 @@ void __init kmem_cache_init_late(void)
BUG();
mutex_unlock(&cache_chain_mutex);
+ /* Annotate slab for lockdep -- annotate the malloc caches */
+ init_lock_keys();
+
/* Done! */
g_cpucache_up = FULL;
diff --git a/mm/sparse.c b/mm/sparse.c
index a8bc7d364deb..290dba25a7ed 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -619,7 +619,7 @@ static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages)
{
return; /* XXX: Not implemented yet */
}
-static void free_map_bootmem(struct page *page, unsigned long nr_pages)
+static void free_map_bootmem(struct page *memmap, unsigned long nr_pages)
{
}
#else
@@ -660,10 +660,11 @@ static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages)
get_order(sizeof(struct page) * nr_pages));
}
-static void free_map_bootmem(struct page *page, unsigned long nr_pages)
+static void free_map_bootmem(struct page *memmap, unsigned long nr_pages)
{
unsigned long maps_section_nr, removing_section_nr, i;
unsigned long magic;
+ struct page *page = virt_to_page(memmap);
for (i = 0; i < nr_pages; i++, page++) {
magic = (unsigned long) page->lru.next;
@@ -712,13 +713,10 @@ static void free_section_usemap(struct page *memmap, unsigned long *usemap)
*/
if (memmap) {
- struct page *memmap_page;
- memmap_page = virt_to_page(memmap);
-
nr_pages = PAGE_ALIGN(PAGES_PER_SECTION * sizeof(struct page))
>> PAGE_SHIFT;
- free_map_bootmem(memmap_page, nr_pages);
+ free_map_bootmem(memmap, nr_pages);
}
}
diff --git a/mm/truncate.c b/mm/truncate.c
index 61a183b89df6..4224627695ba 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -394,11 +394,12 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page)
if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL))
return 0;
+ clear_page_mlock(page);
+
spin_lock_irq(&mapping->tree_lock);
if (PageDirty(page))
goto failed;
- clear_page_mlock(page);
BUG_ON(page_has_private(page));
__delete_from_page_cache(page);
spin_unlock_irq(&mapping->tree_lock);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index be5bc0af2e76..e6ca5051c021 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1983,10 +1983,10 @@ static void get_scan_count(struct mem_cgroup_zone *mz, struct scan_control *sc,
* proportional to the fraction of recently scanned pages on
* each list that were recently referenced and in active use.
*/
- ap = (anon_prio + 1) * (reclaim_stat->recent_scanned[0] + 1);
+ ap = anon_prio * (reclaim_stat->recent_scanned[0] + 1);
ap /= reclaim_stat->recent_rotated[0] + 1;
- fp = (file_prio + 1) * (reclaim_stat->recent_scanned[1] + 1);
+ fp = file_prio * (reclaim_stat->recent_scanned[1] + 1);
fp /= reclaim_stat->recent_rotated[1] + 1;
spin_unlock_irq(&mz->zone->lru_lock);
@@ -1999,7 +1999,7 @@ out:
unsigned long scan;
scan = zone_nr_lru_pages(mz, lru);
- if (priority || noswap) {
+ if (priority || noswap || !vmscan_swappiness(mz, sc)) {
scan >>= priority;
if (!scan && force_scan)
scan = SWAP_CLUSTER_MAX;
@@ -3128,6 +3128,8 @@ static int kswapd(void *p)
&balanced_classzone_idx);
}
}
+
+ current->reclaim_state = NULL;
return 0;
}