From ddd588b5dd55f14320379961e47683db4e4c1d90 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 22 Mar 2011 16:30:46 -0700 Subject: oom: suppress nodes that are not allowed from meminfo on oom kill The oom killer is extremely verbose for machines with a large number of cpus and/or nodes. This verbosity can often be harmful if it causes other important messages to be scrolled from the kernel log and incurs a signicant time delay, specifically for kernels with CONFIG_NODES_SHIFT > 8. This patch causes only memory information to be displayed for nodes that are allowed by current's cpuset when dumping the VM state. Information for all other nodes is irrelevant to the oom condition; we don't care if there's an abundance of memory elsewhere if we can't access it. This only affects the behavior of dumping memory information when an oom is triggered. Other dumps, such as for sysrq+m, still display the unfiltered form when using the existing show_mem() interface. Additionally, the per-cpu pageset statistics are extremely verbose in oom killer output, so it is now suppressed. This removes nodes_weight(current->mems_allowed) * (1 + nr_cpus) lines from the oom killer output. Callers may use __show_mem(SHOW_MEM_FILTER_NODES) to filter disallowed nodes. Signed-off-by: David Rientjes Cc: Mel Gorman Cc: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index 581703d86fbd..1f82adc85352 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -859,7 +859,14 @@ extern void pagefault_out_of_memory(void); #define offset_in_page(p) ((unsigned long)(p) & ~PAGE_MASK) +/* + * Flags passed to __show_mem() and __show_free_areas() to suppress output in + * various contexts. + */ +#define SHOW_MEM_FILTER_NODES (0x0001u) /* filter disallowed nodes */ + extern void show_free_areas(void); +extern void __show_free_areas(unsigned int flags); int shmem_lock(struct file *file, int lock, struct user_struct *user); struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags); @@ -1348,6 +1355,7 @@ extern void calculate_zone_inactive_ratio(struct zone *zone); extern void mem_init(void); extern void __init mmap_init(void); extern void show_mem(void); +extern void __show_mem(unsigned int flags); extern void si_meminfo(struct sysinfo * val); extern void si_meminfo_node(struct sysinfo *val, int nid); extern int after_bootmem; -- cgit v1.2.3 From 318b275fbca1ab9ec0862de71420e0e92c3d1aa7 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Tue, 22 Mar 2011 16:30:51 -0700 Subject: mm: allow GUP to fail instead of waiting on a page GUP user may want to try to acquire a reference to a page if it is already in memory, but not if IO, to bring it in, is needed. For example KVM may tell vcpu to schedule another guest process if current one is trying to access swapped out page. Meanwhile, the page will be swapped in and the guest process, that depends on it, will be able to run again. This patch adds FAULT_FLAG_RETRY_NOWAIT (suggested by Linus) and FOLL_NOWAIT follow_page flags. FAULT_FLAG_RETRY_NOWAIT, when used in conjunction with VM_FAULT_ALLOW_RETRY, indicates to handle_mm_fault that it shouldn't drop mmap_sem and wait on a page, but return VM_FAULT_RETRY instead. [akpm@linux-foundation.org: improve FOLL_NOWAIT comment] Signed-off-by: Gleb Natapov Cc: Linus Torvalds Cc: Hugh Dickins Acked-by: Rik van Riel Cc: Michel Lespinasse Cc: Avi Kivity Cc: Marcelo Tosatti Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index 1f82adc85352..901435e3a9a9 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -151,6 +151,7 @@ extern pgprot_t protection_map[16]; #define FAULT_FLAG_NONLINEAR 0x02 /* Fault was via a nonlinear mapping */ #define FAULT_FLAG_MKWRITE 0x04 /* Fault was mkwrite of existing pte */ #define FAULT_FLAG_ALLOW_RETRY 0x08 /* Retry fault if blocking */ +#define FAULT_FLAG_RETRY_NOWAIT 0x10 /* Don't drop mmap_sem and wait when retrying */ /* * This interface is used by x86 PAT code to identify a pfn mapping that is @@ -1545,6 +1546,8 @@ struct page *follow_page(struct vm_area_struct *, unsigned long address, #define FOLL_GET 0x04 /* do get_page on page */ #define FOLL_DUMP 0x08 /* give error on hole if it would be zero */ #define FOLL_FORCE 0x10 /* get_user_pages read/write w/o permission */ +#define FOLL_NOWAIT 0x20 /* if a disk transfer is needed, start the IO + * and return without waiting upon it */ #define FOLL_MLOCK 0x40 /* mark page as mlocked */ #define FOLL_SPLIT 0x80 /* don't return transhuge pages, split them */ #define FOLL_HWPOISON 0x100 /* check page is hwpoisoned */ -- cgit v1.2.3 From 033193275b3ffcfe7f3fde7b569f3d207f6cd6a0 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Tue, 22 Mar 2011 16:32:56 -0700 Subject: pagewalk: only split huge pages when necessary Right now, if a mm_walk has either ->pte_entry or ->pmd_entry set, it will unconditionally split any transparent huge pages it runs in to. In practice, that means that anyone doing a cat /proc/$pid/smaps will unconditionally break down every huge page in the process and depend on khugepaged to re-collapse it later. This is fairly suboptimal. This patch changes that behavior. It teaches each ->pmd_entry handler (there are five) that they must break down the THPs themselves. Also, the _generic_ code will never break down a THP unless a ->pte_entry handler is actually set. This means that the ->pmd_entry handlers can now choose to deal with THPs without breaking them down. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Dave Hansen Acked-by: Mel Gorman Acked-by: David Rientjes Reviewed-by: Eric B Munson Tested-by: Eric B Munson Cc: Michael J Wolf Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Matt Mackall Cc: Jeremy Fitzhardinge Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index 901435e3a9a9..294104e0891d 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -914,6 +914,9 @@ unsigned long unmap_vmas(struct mmu_gather **tlb, * @pgd_entry: if set, called for each non-empty PGD (top-level) entry * @pud_entry: if set, called for each non-empty PUD (2nd-level) entry * @pmd_entry: if set, called for each non-empty PMD (3rd-level) entry + * this handler is required to be able to handle + * pmd_trans_huge() pmds. They may simply choose to + * split_huge_page() instead of handling it explicitly. * @pte_entry: if set, called for each non-empty PTE (4th-level) entry * @pte_hole: if set, called for each hole at all levels * @hugetlb_entry: if set, called for each hugetlb entry -- cgit v1.2.3 From 31db58b3ab432f72ea76be58b12e6ffaf627d5db Mon Sep 17 00:00:00 2001 From: Stephen Wilson Date: Sun, 13 Mar 2011 15:49:15 -0400 Subject: mm: arch: make get_gate_vma take an mm_struct instead of a task_struct Morally, the presence of a gate vma is more an attribute of a particular mm than a particular task. Moreover, dropping the dependency on task_struct will help make both existing and future operations on mm's more flexible and convenient. Signed-off-by: Stephen Wilson Reviewed-by: Michel Lespinasse Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Signed-off-by: Al Viro --- include/linux/mm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index 581703d86fbd..18b4a6358ab4 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1578,7 +1578,7 @@ static inline bool kernel_page_present(struct page *page) { return true; } #endif /* CONFIG_HIBERNATION */ #endif -extern struct vm_area_struct *get_gate_vma(struct task_struct *tsk); +extern struct vm_area_struct *get_gate_vma(struct mm_struct *mm); #ifdef __HAVE_ARCH_GATE_AREA int in_gate_area_no_task(unsigned long addr); int in_gate_area(struct task_struct *task, unsigned long addr); -- cgit v1.2.3 From 83b964bbf82eb13a8f31bb49ca420787fe01f7a6 Mon Sep 17 00:00:00 2001 From: Stephen Wilson Date: Sun, 13 Mar 2011 15:49:16 -0400 Subject: mm: arch: make in_gate_area take an mm_struct instead of a task_struct Morally, the question of whether an address lies in a gate vma should be asked with respect to an mm, not a particular task. Moreover, dropping the dependency on task_struct will help make existing and future operations on mm's more flexible and convenient. Signed-off-by: Stephen Wilson Reviewed-by: Michel Lespinasse Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Signed-off-by: Al Viro --- include/linux/mm.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index 18b4a6358ab4..5c6d916cd302 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1581,10 +1581,10 @@ static inline bool kernel_page_present(struct page *page) { return true; } extern struct vm_area_struct *get_gate_vma(struct mm_struct *mm); #ifdef __HAVE_ARCH_GATE_AREA int in_gate_area_no_task(unsigned long addr); -int in_gate_area(struct task_struct *task, unsigned long addr); +int in_gate_area(struct mm_struct *mm, unsigned long addr); #else int in_gate_area_no_task(unsigned long addr); -#define in_gate_area(task, addr) ({(void)task; in_gate_area_no_task(addr);}) +#define in_gate_area(mm, addr) ({(void)mm; in_gate_area_no_task(addr);}) #endif /* __HAVE_ARCH_GATE_AREA */ int drop_caches_sysctl_handler(struct ctl_table *, int, -- cgit v1.2.3 From cae5d39032acf26c265f6b1dc73d7ce6ff4bc387 Mon Sep 17 00:00:00 2001 From: Stephen Wilson Date: Sun, 13 Mar 2011 15:49:17 -0400 Subject: mm: arch: rename in_gate_area_no_task to in_gate_area_no_mm Now that gate vma's are referenced with respect to a particular mm and not a particular task it only makes sense to propagate the change to this predicate as well. Signed-off-by: Stephen Wilson Reviewed-by: Michel Lespinasse Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Signed-off-by: Al Viro --- include/linux/mm.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index 5c6d916cd302..9d6efefdde50 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1580,11 +1580,11 @@ static inline bool kernel_page_present(struct page *page) { return true; } extern struct vm_area_struct *get_gate_vma(struct mm_struct *mm); #ifdef __HAVE_ARCH_GATE_AREA -int in_gate_area_no_task(unsigned long addr); +int in_gate_area_no_mm(unsigned long addr); int in_gate_area(struct mm_struct *mm, unsigned long addr); #else -int in_gate_area_no_task(unsigned long addr); -#define in_gate_area(mm, addr) ({(void)mm; in_gate_area_no_task(addr);}) +int in_gate_area_no_mm(unsigned long addr); +#define in_gate_area(mm, addr) ({(void)mm; in_gate_area_no_mm(addr);}) #endif /* __HAVE_ARCH_GATE_AREA */ int drop_caches_sysctl_handler(struct ctl_table *, int, -- cgit v1.2.3 From 5ddd36b9c59887c6416e21daf984fbdd9b1818df Mon Sep 17 00:00:00 2001 From: Stephen Wilson Date: Sun, 13 Mar 2011 15:49:20 -0400 Subject: mm: implement access_remote_vm Provide an alternative to access_process_vm that allows the caller to obtain a reference to the supplied mm_struct. Signed-off-by: Stephen Wilson Signed-off-by: Al Viro --- include/linux/mm.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index 9d6efefdde50..60011d26bffc 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -971,6 +971,8 @@ static inline int handle_mm_fault(struct mm_struct *mm, extern int make_pages_present(unsigned long addr, unsigned long end); extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write); +extern int access_remote_vm(struct mm_struct *mm, unsigned long addr, + void *buf, int len, int write); int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, int len, unsigned int foll_flags, -- cgit v1.2.3 From b2b755b5f10eb32fbdc73a9907c07006b17f714b Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Thu, 24 Mar 2011 15:18:15 -0700 Subject: lib, arch: add filter argument to show_mem and fix private implementations Commit ddd588b5dd55 ("oom: suppress nodes that are not allowed from meminfo on oom kill") moved lib/show_mem.o out of lib/lib.a, which resulted in build warnings on all architectures that implement their own versions of show_mem(): lib/lib.a(show_mem.o): In function `show_mem': show_mem.c:(.text+0x1f4): multiple definition of `show_mem' arch/sparc/mm/built-in.o:(.text+0xd70): first defined here The fix is to remove __show_mem() and add its argument to show_mem() in all implementations to prevent this breakage. Architectures that implement their own show_mem() actually don't do anything with the argument yet, but they could be made to filter nodes that aren't allowed in the current context in the future just like the generic implementation. Reported-by: Stephen Rothwell Reported-by: James Bottomley Suggested-by: Andrew Morton Signed-off-by: David Rientjes Signed-off-by: Linus Torvalds --- include/linux/mm.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index f9535b2c9558..7606d7db96c9 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -861,7 +861,7 @@ extern void pagefault_out_of_memory(void); #define offset_in_page(p) ((unsigned long)(p) & ~PAGE_MASK) /* - * Flags passed to __show_mem() and __show_free_areas() to suppress output in + * Flags passed to show_mem() and __show_free_areas() to suppress output in * various contexts. */ #define SHOW_MEM_FILTER_NODES (0x0001u) /* filter disallowed nodes */ @@ -1360,8 +1360,7 @@ extern void setup_per_zone_wmarks(void); extern void calculate_zone_inactive_ratio(struct zone *zone); extern void mem_init(void); extern void __init mmap_init(void); -extern void show_mem(void); -extern void __show_mem(unsigned int flags); +extern void show_mem(unsigned int flags); extern void si_meminfo(struct sysinfo * val); extern void si_meminfo_node(struct sysinfo *val, int nid); extern int after_bootmem; -- cgit v1.2.3 From 25985edcedea6396277003854657b5f3cb31a628 Mon Sep 17 00:00:00 2001 From: Lucas De Marchi Date: Wed, 30 Mar 2011 22:57:33 -0300 Subject: Fix common misspellings Fixes generated by 'codespell' and manually reviewed. Signed-off-by: Lucas De Marchi --- include/linux/mm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index 7606d7db96c9..692dbae6ffa7 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -608,7 +608,7 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) #endif /* - * Define the bit shifts to access each section. For non-existant + * Define the bit shifts to access each section. For non-existent * sections we define the shift as 0; that plus a 0 mask ensures * the compiler will optimise away reference to them. */ -- cgit v1.2.3