From 76cdd58e558669366adfaded436fda01b30cce3e Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 14 May 2008 16:05:52 -0700 Subject: memory_hotplug: always initialize pageblock bitmap Trying to online a new memory section that was added via memory hotplug sometimes results in crashes when the new pages are added via __free_page. Reason for that is that the pageblock bitmap isn't initialized and hence contains random stuff. That means that get_pageblock_migratetype() returns also random stuff and therefore list_add(&page->lru, &zone->free_area[order].free_list[migratetype]); in __free_one_page() tries to do a list_add to something that isn't even necessarily a list. This happens since 86051ca5eaf5e560113ec7673462804c54284456 ("mm: fix usemap initialization") which makes sure that the pageblock bitmap gets only initialized for pages present in a zone. Unfortunately for hot-added memory the zones "grow" after the memmap and the pageblock memmap have been initialized. Which means that the new pages have an unitialized bitmap. To solve this the calls to grow_zone_span() and grow_pgdat_span() are moved to __add_zone() just before the initialization happens. The patch also moves the two functions since __add_zone() is the only caller and I didn't want to add a forward declaration. Signed-off-by: Heiko Carstens Cc: Andy Whitcroft Cc: Dave Hansen Cc: Gerald Schaefer Cc: KAMEZAWA Hiroyuki Cc: Yasunori Goto Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index bdd5c432c426..63835579323a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2862,8 +2862,6 @@ __meminit int init_currently_empty_zone(struct zone *zone, zone->zone_start_pfn = zone_start_pfn; - memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn); - zone_init_free_lists(zone); return 0; @@ -3433,6 +3431,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, ret = init_currently_empty_zone(zone, zone_start_pfn, size, MEMMAP_EARLY); BUG_ON(ret); + memmap_init(size, nid, j, zone_start_pfn); zone_start_pfn += size; } } -- cgit v1.2.3 From f7232154198f928fc25f420d6190468212a7632a Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Fri, 23 May 2008 13:04:21 -0700 Subject: mm: don't drop a partial page in a zone's memory map size In a zone's present pages number, account for all pages occupied by the memory map, including a partial. Signed-off-by: Johannes Weiner Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 63835579323a..035300299f94 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3378,7 +3378,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, * is used by this zone for memmap. This affects the watermark * and per-cpu initialisations */ - memmap_pages = (size * sizeof(struct page)) >> PAGE_SHIFT; + memmap_pages = + PAGE_ALIGN(size * sizeof(struct page)) >> PAGE_SHIFT; if (realsize >= memmap_pages) { realsize -= memmap_pages; printk(KERN_DEBUG -- cgit v1.2.3 From 7eb54824b76793dd86afb54f182ef9aa64b3a45a Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Fri, 23 May 2008 13:04:50 -0700 Subject: zonelists: handle a node zonelist with no applicable entries When booting 2.6.26-rc3 on a multi-node x86_32 numa system we are seeing panics when trying node local allocations: BUG: unable to handle kernel NULL pointer dereference at 0000034c IP: [] get_page_from_freelist+0x4a/0x18e *pdpt = 00000000013a7001 *pde = 0000000000000000 Oops: 0000 [#1] SMP Modules linked in: Pid: 0, comm: swapper Not tainted (2.6.26-rc3-00003-g5abc28d #82) EIP: 0060:[] EFLAGS: 00010282 CPU: 0 EIP is at get_page_from_freelist+0x4a/0x18e EAX: c1371ed8 EBX: 00000000 ECX: 00000000 EDX: 00000000 ESI: f7801180 EDI: 00000000 EBP: 00000000 ESP: c1371ec0 DS: 007b ES: 007b FS: 00d8 GS: 0000 SS: 0068 Process swapper (pid: 0, ti=c1370000 task=c12f5b40 task.ti=c1370000) Stack: 00000000 00000000 00000000 00000000 000612d0 000412d0 00000000 000412d0 f7801180 f7c0101c f7c01018 c10426e4 f7c01018 00000001 00000044 00000000 00000001 c12f5b40 00000001 00000010 00000000 000412d0 00000286 000412d0 Call Trace: [] __alloc_pages_internal+0x99/0x378 [] __alloc_pages+0x7/0x9 [] kmem_getpages+0x66/0xef [] cache_grow+0x8f/0x123 [] ____cache_alloc_node+0xb9/0xe4 [] kmem_cache_alloc_node+0x92/0xd2 [] setup_cpu_cache+0xaf/0x177 [] kmem_cache_create+0x2c8/0x353 [] kmem_cache_init+0x1ce/0x3ad [] start_kernel+0x178/0x1ee This occurs when we are scanning the zonelists looking for a ZONE_NORMAL page. In this system there is only ZONE_DMA and ZONE_NORMAL memory on node 0, all other nodes are mapped above 4GB physical. Here is a dump of the zonelists from this system: zonelists pgdat=c1400000 0: c14006c0:2 f7c006c0:2 f7e006c0:2 c1400360:1 c1400000:0 1: c14006c0:2 c1400360:1 c1400000:0 zonelists pgdat=f7c00000 0: f7c006c0:2 f7e006c0:2 c14006c0:2 c1400360:1 c1400000:0 1: f7c006c0:2 zonelists pgdat=f7e00000 0: f7e006c0:2 c14006c0:2 f7c006c0:2 c1400360:1 c1400000:0 1: f7e006c0:2 When performing a node local allocation we call get_page_from_freelist() looking for a page. It in turn calls first_zones_zonelist() which returns a preferred_zone. Where there are no applicable zones this will be NULL. However we use this unconditionally, leading to this panic. Where there are no applicable zones there is no possibility of a successful allocation, so simply fail the allocation. Signed-off-by: Andy Whitcroft Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 035300299f94..7f4c66ff65b7 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1396,6 +1396,9 @@ get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order, (void)first_zones_zonelist(zonelist, high_zoneidx, nodemask, &preferred_zone); + if (!preferred_zone) + return NULL; + classzone_idx = zone_idx(preferred_zone); zonelist_scan: -- cgit v1.2.3 From cd94b9dbfa300fc42e45f230010623fc08d59563 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Fri, 23 May 2008 13:04:52 -0700 Subject: memory hotplug: fix early allocation handling Trying to add memory via add_memory() from within an initcall function results in bootmem alloc of 163840 bytes failed! Kernel panic - not syncing: Out of memory This is caused by zone_wait_table_init() which uses system_state to decide if it should use the bootmem allocator or not. When initcalls are handled the system_state is still SYSTEM_BOOTING but the bootmem allocator doesn't work anymore. So the allocation will fail. To fix this use slab_is_available() instead as indicator like we do it everywhere else. [akpm@linux-foundation.org: coding-style fix] Reviewed-by: Andy Whitcroft Cc: Dave Hansen Cc: Gerald Schaefer Cc: KAMEZAWA Hiroyuki Acked-by: Yasunori Goto Signed-off-by: Heiko Carstens Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7f4c66ff65b7..8e83f02cd2d3 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2807,7 +2807,7 @@ int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) alloc_size = zone->wait_table_hash_nr_entries * sizeof(wait_queue_head_t); - if (system_state == SYSTEM_BOOTING) { + if (!slab_is_available()) { zone->wait_table = (wait_queue_head_t *) alloc_bootmem_node(pgdat, alloc_size); } else { -- cgit v1.2.3 From dfa7e20cc0d1a7a620def4dce97de1ae5375f99b Mon Sep 17 00:00:00 2001 From: Russ Anderson Date: Mon, 9 Jun 2008 11:18:45 -0500 Subject: mm: Minor clean-up of page flags in mm/page_alloc.c Minor source code cleanup of page flags in mm/page_alloc.c. Move the definition of the groups of bits to page-flags.h. The purpose of this clean up is that the next patch will conditionally add a page flag to the groups. Doing that in a header file is cleaner than adding #ifdefs to the C code. Signed-off-by: Russ Anderson Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 34 +++------------------------------- 1 file changed, 3 insertions(+), 31 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8e83f02cd2d3..2f552955a02f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -237,16 +237,7 @@ static void bad_page(struct page *page) printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n" KERN_EMERG "Backtrace:\n"); dump_stack(); - page->flags &= ~(1 << PG_lru | - 1 << PG_private | - 1 << PG_locked | - 1 << PG_active | - 1 << PG_dirty | - 1 << PG_reclaim | - 1 << PG_slab | - 1 << PG_swapcache | - 1 << PG_writeback | - 1 << PG_buddy ); + page->flags &= ~PAGE_FLAGS_CLEAR_WHEN_BAD; set_page_count(page, 0); reset_page_mapcount(page); page->mapping = NULL; @@ -463,16 +454,7 @@ static inline int free_pages_check(struct page *page) (page->mapping != NULL) | (page_get_page_cgroup(page) != NULL) | (page_count(page) != 0) | - (page->flags & ( - 1 << PG_lru | - 1 << PG_private | - 1 << PG_locked | - 1 << PG_active | - 1 << PG_slab | - 1 << PG_swapcache | - 1 << PG_writeback | - 1 << PG_reserved | - 1 << PG_buddy )))) + (page->flags & PAGE_FLAGS_CHECK_AT_FREE))) bad_page(page); if (PageDirty(page)) __ClearPageDirty(page); @@ -616,17 +598,7 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) (page->mapping != NULL) | (page_get_page_cgroup(page) != NULL) | (page_count(page) != 0) | - (page->flags & ( - 1 << PG_lru | - 1 << PG_private | - 1 << PG_locked | - 1 << PG_active | - 1 << PG_dirty | - 1 << PG_slab | - 1 << PG_swapcache | - 1 << PG_writeback | - 1 << PG_reserved | - 1 << PG_buddy )))) + (page->flags & PAGE_FLAGS_CHECK_AT_PREP))) bad_page(page); /* -- cgit v1.2.3