From 1e01979c8f502ac13e3cdece4f38712c5944e6e8 Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Tue, 12 Jul 2011 09:45:34 +0200
Subject: x86, numa: Implement pfn -> nid mapping granularity check

SPARSEMEM w/o VMEMMAP and DISCONTIGMEM, both used only on 32bit, use a
sections array to map pfn to nid, which is limited in granularity.  If
NUMA nodes are laid out such that the mapping cannot be accurate, boot
will fail, triggering BUG_ON() in mminit_verify_page_links().

On 32bit, the granularity is 512MiB w/ PAE and SPARSEMEM.  This seems
to have been granular enough until commit 2706a0bf7b (x86, NUMA: Enable
CONFIG_AMD_NUMA on 32bit too).  Apparently, there is a machine which
aligns NUMA nodes to 128MiB and has only AMD NUMA but not SRAT.  This
led to the following BUG_ON().

  On node 0 totalpages: 2096615
    DMA zone: 32 pages used for memmap
    DMA zone: 0 pages reserved
    DMA zone: 3927 pages, LIFO batch:0
    Normal zone: 1740 pages used for memmap
    Normal zone: 220978 pages, LIFO batch:31
    HighMem zone: 16405 pages used for memmap
    HighMem zone: 1853533 pages, LIFO batch:31
  BUG: Int 6: CR2 (null)
       EDI (null)  ESI 00000002  EBP 00000002  ESP c1543ecc
       EBX f2400000  EDX 00000006  ECX (null)  EAX 00000001
       err (null)  EIP c16209aa   CS 00000060  flg 00010002
  Stack: f2400000 00220000 f7200800 c1620613 00220000 01000000 04400000 00238000
         (null) f7200000 00000002 f7200b58 f7200800 c1620929 000375fe (null)
         f7200b80 c16395f0 00200a02 f7200a80 (null) 000375fe 00000002 (null)
  Pid: 0, comm: swapper Not tainted 2.6.39-rc5-00181-g2706a0b #17
  Call Trace:
   [] ? early_fault+0x2e/0x2e
   [] ? mminit_verify_page_links+0x12/0x42
   [] ? memmap_init_zone+0xaf/0x10c
   [] ? free_area_init_node+0x2b9/0x2e3
   [] ? free_area_init_nodes+0x3f2/0x451
   [] ? paging_init+0x112/0x118
   [] ? setup_arch+0x791/0x82f
   [] ? start_kernel+0x6a/0x257

This patch implements node_map_pfn_alignment(), which determines the
maximum internode alignment, and updates numa_register_memblks() to
reject the NUMA configuration if that alignment exceeds the pfn -> nid
mapping granularity of the memory model as determined by
PAGES_PER_SECTION.

This makes the problematic machine boot w/ flatmem by rejecting the
NUMA config and provides protection against crazy NUMA configurations.

Signed-off-by: Tejun Heo
Link: http://lkml.kernel.org/r/20110712074534.GB2872@htj.dyndns.org
LKML-Reference: <20110628174613.GP478@escobedo.osrc.amd.com>
Reported-and-Tested-by: Hans Rosenfeld
Cc: Conny Seidel
Signed-off-by: H. Peter Anvin
---
 include/linux/mm.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux/mm.h')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 9670f71d7be9..c70a326b8f26 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1313,6 +1313,7 @@ extern void remove_active_range(unsigned int nid, unsigned long start_pfn,
                                         unsigned long end_pfn);
 extern void remove_all_active_ranges(void);
 void sort_node_map(void);
+unsigned long node_map_pfn_alignment(void);
 unsigned long __absent_pages_in_range(int nid, unsigned long start_pfn,
                                                 unsigned long end_pfn);
 extern unsigned long absent_pages_in_range(unsigned long start_pfn,
--
cgit v1.2.3
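For illustration, here is a small stand-alone C sketch of the check described
above. It is not the kernel code added by this patch (only the declaration of
node_map_pfn_alignment() is visible in the mm.h diff); the struct mem_range
type, the helper name pfn_alignment(), the PAGES_PER_SECTION value and the
sample two-node layout are assumptions made for the example. The idea it
demonstrates: find the coarsest power-of-two page alignment that still
separates adjacent nodes, and reject the NUMA configuration if that alignment
is finer than one section.

/*
 * Illustrative only: a user-space model of the granularity check, not the
 * kernel implementation.  Types, values and the memory map are made up.
 */
#include <stdio.h>

struct mem_range { unsigned long start_pfn, end_pfn; int nid; };

/* 512MiB sections with 4KiB pages, as on 32bit PAE SPARSEMEM */
#define PAGES_PER_SECTION (1UL << 17)

/* Coarsest power-of-two page alignment that still separates every node pair. */
static unsigned long pfn_alignment(const struct mem_range *map, int n)
{
    unsigned long accl_mask = 0, last_end = 0;
    int last_nid = -1, i;

    for (i = 0; i < n; i++) {
        unsigned long start = map[i].start_pfn, mask, lowbit;

        if (!start || last_nid < 0 || last_nid == map[i].nid) {
            last_nid = map[i].nid;
            last_end = map[i].end_pfn;
            continue;
        }

        /*
         * Start with a mask pin-pointing the node's start pfn and widen it
         * while the coarser mask still separates this node's start from
         * the previous node's end.
         */
        lowbit = start & (~start + 1);
        mask = ~(lowbit - 1);
        while (mask && last_end <= (start & (mask << 1)))
            mask <<= 1;

        accl_mask |= mask;          /* accumulate all internode masks */
        last_nid = map[i].nid;
        last_end = map[i].end_pfn;
    }
    return ~accl_mask + 1;          /* convert mask to pages; 0 = no constraint */
}

int main(void)
{
    /* two nodes aligned to 128MiB (32768 4KiB pages), as on the machine above */
    struct mem_range map[] = {
        { 0,     32768, 0 },
        { 32768, 65536, 1 },
    };
    unsigned long align = pfn_alignment(map, 2);

    if (align && align < PAGES_PER_SECTION)
        printf("reject NUMA config: alignment %lu < %lu pages per section\n",
               align, PAGES_PER_SECTION);
    else
        printf("alignment of %lu pages is representable\n", align);
    return 0;
}

Running this takes the "reject" branch, mirroring what the patch does for the
128MiB-aligned machine in the log above: the NUMA config is refused and the
machine boots with a flat (non-NUMA) memory layout instead.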
From e9299f5058595a655c3b207cda9635e28b9197e6 Mon Sep 17 00:00:00 2001
From: Dave Chinner
Date: Fri, 8 Jul 2011 14:14:37 +1000
Subject: vmscan: add customisable shrinker batch size

For shrinkers that have their own cond_resched* calls, having
shrink_slab break the work down into small batches is not particularly
efficient.  Add a custom batch size field to struct shrinker so that
shrinkers can use a larger batch size if they desire.

A value of zero (uninitialised) means "use the default", so behaviour
is unchanged by this patch.

Signed-off-by: Dave Chinner
Signed-off-by: Al Viro
---
 include/linux/mm.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux/mm.h')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 9670f71d7be9..9b9777ac726d 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1150,6 +1150,7 @@ struct shrink_control {
 struct shrinker {
         int (*shrink)(struct shrinker *, struct shrink_control *sc);
         int seeks;      /* seeks to recreate an obj */
+        long batch;     /* reclaim batch size, 0 = default */
 
         /* These are for internal use */
         struct list_head list;
--
cgit v1.2.3
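To show what the new field buys, here is a toy user-space model of a reclaim
loop that honours a per-shrinker batch size. Only the batch member and the
zero-means-default rule come from the patch; SHRINK_BATCH mirrors the kernel's
default of 128 objects, while shrink_one(), toy_shrink() and the scan counts
are invented for the example. The real logic lives in shrink_slab() and is
considerably more involved.

/*
 * Toy model: how a reclaim loop could honour shrinker->batch.  The struct
 * layouts are trimmed copies of the header; everything else is invented.
 */
#include <stdio.h>

#define SHRINK_BATCH 128    /* historical default chunk size */

struct shrink_control { unsigned long nr_to_scan; };

struct shrinker {
    int (*shrink)(struct shrinker *, struct shrink_control *sc);
    int seeks;      /* seeks to recreate an obj */
    long batch;     /* reclaim batch size, 0 = default */
};

static int calls;   /* how many times the callback was invoked */

static int toy_shrink(struct shrinker *s, struct shrink_control *sc)
{
    (void)s;
    (void)sc;
    calls++;        /* pretend we scanned sc->nr_to_scan objects */
    return 0;
}

static void shrink_one(struct shrinker *s, long total_scan)
{
    /* zero (uninitialised) means "use the default" */
    long batch_size = s->batch ? s->batch : SHRINK_BATCH;

    while (total_scan >= batch_size) {
        struct shrink_control sc = { .nr_to_scan = (unsigned long)batch_size };

        s->shrink(s, &sc);
        total_scan -= batch_size;
    }
}

int main(void)
{
    struct shrinker dflt = { .shrink = toy_shrink, .seeks = 2, .batch = 0 };
    struct shrinker big  = { .shrink = toy_shrink, .seeks = 2, .batch = 1024 };

    shrink_one(&dflt, 4096);
    printf("default batch: %d callback invocations\n", calls);   /* 32 */

    calls = 0;
    shrink_one(&big, 4096);
    printf("batch = 1024 : %d callback invocations\n", calls);   /* 4 */
    return 0;
}

A shrinker that can cheaply process many objects per callback, and that does
its own cond_resched* calls as the commit message describes, can therefore
request work in much larger chunks and cut per-call overhead, while existing
shrinkers that leave batch at zero keep their old behaviour.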
From b0d40c92adafde7c2d81203ce7c1c69275f41140 Mon Sep 17 00:00:00 2001
From: Dave Chinner
Date: Fri, 8 Jul 2011 14:14:42 +1000
Subject: superblock: introduce per-sb cache shrinker infrastructure

With context based shrinkers, we can implement a per-superblock
shrinker that shrinks the caches attached to the superblock. We
currently have global shrinkers for the inode and dentry caches that
split up into per-superblock operations via a coarse proportioning
method that does not batch very well.

The global shrinkers also have a dependency - dentries pin inodes -
so we have to be very careful about how we register the global
shrinkers so that the implicit call order is always correct.

With a per-sb shrinker callout, we can encode this dependency directly
into the per-sb shrinker, hence avoiding the need for strictly
ordering shrinker registrations. We also have no need for any
proportioning code because the shrinker subsystem already provides
this functionality across all shrinkers.

Allowing the shrinker to operate on a single superblock at a time
means that we do fewer superblock list traversals and less locking,
and reclaim should batch more effectively. This should result in less
CPU overhead for reclaim and potentially faster reclaim of items from
each filesystem.

Signed-off-by: Dave Chinner
Signed-off-by: Al Viro
---
 include/linux/mm.h | 40 +---------------------------------------
 1 file changed, 1 insertion(+), 39 deletions(-)

(limited to 'include/linux/mm.h')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 9b9777ac726d..e3a1a9eec0b1 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -15,6 +15,7 @@
 #include 
 #include 
 #include 
+#include 
 
 struct mempolicy;
 struct anon_vma;
@@ -1121,45 +1122,6 @@ static inline void sync_mm_rss(struct task_struct *task, struct mm_struct *mm)
 }
 #endif
 
-/*
- * This struct is used to pass information from page reclaim to the shrinkers.
- * We consolidate the values for easier extention later.
- */
-struct shrink_control {
-        gfp_t gfp_mask;
-
-        /* How many slab objects shrinker() should scan and try to reclaim */
-        unsigned long nr_to_scan;
-};
-
-/*
- * A callback you can register to apply pressure to ageable caches.
- *
- * 'sc' is passed shrink_control which includes a count 'nr_to_scan'
- * and a 'gfpmask'.  It should look through the least-recently-used
- * 'nr_to_scan' entries and attempt to free them up.  It should return
- * the number of objects which remain in the cache.  If it returns -1, it means
- * it cannot do any scanning at this time (eg. there is a risk of deadlock).
- *
- * The 'gfpmask' refers to the allocation we are currently trying to
- * fulfil.
- *
- * Note that 'shrink' will be passed nr_to_scan == 0 when the VM is
- * querying the cache size, so a fastpath for that case is appropriate.
- */
-struct shrinker {
-        int (*shrink)(struct shrinker *, struct shrink_control *sc);
-        int seeks;      /* seeks to recreate an obj */
-        long batch;     /* reclaim batch size, 0 = default */
-
-        /* These are for internal use */
-        struct list_head list;
-        long nr;        /* objs pending delete */
-};
-#define DEFAULT_SEEKS 2 /* A good number if you don't know better. */
-extern void register_shrinker(struct shrinker *);
-extern void unregister_shrinker(struct shrinker *);
-
 int vma_wants_writenotify(struct vm_area_struct *vma);
 
 extern pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
--
cgit v1.2.3
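The mm.h hunk above only shows the shrinker definitions leaving this header;
the interesting part of the commit is the per-superblock callback it enables.
Below is a toy user-space sketch of how such a callback can encode the
"dentries pin inodes" dependency by pruning dentries before inodes within a
single invocation. Every name and number, and the even split of the scan
count, are simplifications invented for the example; this is not the VFS-side
implementation of the commit.

/*
 * Toy model of a per-superblock shrinker callback.  One shrinker is embedded
 * in each superblock, and its callback prunes dentries first so that inodes
 * they pinned can be reclaimed in the same pass.
 */
#include <stddef.h>
#include <stdio.h>

struct shrink_control { unsigned long nr_to_scan; };

struct shrinker {
    int (*shrink)(struct shrinker *, struct shrink_control *sc);
    int seeks;
    long batch;
};

struct super_block {
    long nr_dentry_unused;      /* unused dentries, may pin inodes */
    long nr_inode_unused;       /* inodes freeable once unpinned */
    struct shrinker s_shrink;   /* one shrinker embedded per sb */
};

/* container_of()-style helper: recover the sb from its embedded shrinker */
#define sb_of(shrinkp) \
    ((struct super_block *)((char *)(shrinkp) - offsetof(struct super_block, s_shrink)))

static int sb_shrink(struct shrinker *shrink, struct shrink_control *sc)
{
    struct super_block *sb = sb_of(shrink);
    long scan = (long)sc->nr_to_scan;
    /* split the scan evenly here for simplicity; dentries go first ... */
    long dentries = scan / 2, inodes = scan - scan / 2;

    if (dentries > sb->nr_dentry_unused)
        dentries = sb->nr_dentry_unused;
    sb->nr_dentry_unused -= dentries;

    /* ... so inodes they pinned can be reclaimed in the same pass */
    if (inodes > sb->nr_inode_unused)
        inodes = sb->nr_inode_unused;
    sb->nr_inode_unused -= inodes;

    return (int)(sb->nr_dentry_unused + sb->nr_inode_unused);
}

int main(void)
{
    struct super_block sb = {
        .nr_dentry_unused = 1000,
        .nr_inode_unused  =  800,
        .s_shrink = { .shrink = sb_shrink, .seeks = 2, .batch = 1024 },
    };
    struct shrink_control sc = { .nr_to_scan = 1024 };
    int remaining = sb.s_shrink.shrink(&sb.s_shrink, &sc);

    printf("after one pass: %ld dentries, %ld inodes unused (%d total)\n",
           sb.nr_dentry_unused, sb.nr_inode_unused, remaining);
    return 0;
}

Because the ordering now lives inside one callback per superblock, the dentry
and inode caches no longer rely on carefully ordered global shrinker
registrations, and each invocation works on a single superblock, which is what
lets reclaim batch more effectively as the commit message argues.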