summaryrefslogtreecommitdiff
path: root/mm/page_alloc.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r--mm/page_alloc.c163
1 files changed, 117 insertions, 46 deletions
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 4e8985acdab8..1dbcf8888f14 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1370,21 +1370,12 @@ failed:
#ifdef CONFIG_FAIL_PAGE_ALLOC
-static struct fail_page_alloc_attr {
+static struct {
struct fault_attr attr;
u32 ignore_gfp_highmem;
u32 ignore_gfp_wait;
u32 min_order;
-
-#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
-
- struct dentry *ignore_gfp_highmem_file;
- struct dentry *ignore_gfp_wait_file;
- struct dentry *min_order_file;
-
-#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
-
} fail_page_alloc = {
.attr = FAULT_ATTR_INITIALIZER,
.ignore_gfp_wait = 1,
@@ -1424,30 +1415,24 @@ static int __init fail_page_alloc_debugfs(void)
"fail_page_alloc");
if (err)
return err;
- dir = fail_page_alloc.attr.dentries.dir;
-
- fail_page_alloc.ignore_gfp_wait_file =
- debugfs_create_bool("ignore-gfp-wait", mode, dir,
- &fail_page_alloc.ignore_gfp_wait);
-
- fail_page_alloc.ignore_gfp_highmem_file =
- debugfs_create_bool("ignore-gfp-highmem", mode, dir,
- &fail_page_alloc.ignore_gfp_highmem);
- fail_page_alloc.min_order_file =
- debugfs_create_u32("min-order", mode, dir,
- &fail_page_alloc.min_order);
-
- if (!fail_page_alloc.ignore_gfp_wait_file ||
- !fail_page_alloc.ignore_gfp_highmem_file ||
- !fail_page_alloc.min_order_file) {
- err = -ENOMEM;
- debugfs_remove(fail_page_alloc.ignore_gfp_wait_file);
- debugfs_remove(fail_page_alloc.ignore_gfp_highmem_file);
- debugfs_remove(fail_page_alloc.min_order_file);
- cleanup_fault_attr_dentries(&fail_page_alloc.attr);
- }
- return err;
+ dir = fail_page_alloc.attr.dir;
+
+ if (!debugfs_create_bool("ignore-gfp-wait", mode, dir,
+ &fail_page_alloc.ignore_gfp_wait))
+ goto fail;
+ if (!debugfs_create_bool("ignore-gfp-highmem", mode, dir,
+ &fail_page_alloc.ignore_gfp_highmem))
+ goto fail;
+ if (!debugfs_create_u32("min-order", mode, dir,
+ &fail_page_alloc.min_order))
+ goto fail;
+
+ return 0;
+fail:
+ cleanup_fault_attr_dentries(&fail_page_alloc.attr);
+
+ return -ENOMEM;
}
late_initcall(fail_page_alloc_debugfs);
@@ -1616,6 +1601,21 @@ static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
set_bit(i, zlc->fullzones);
}
+/*
+ * clear all zones full, called after direct reclaim makes progress so that
+ * a zone that was recently full is not skipped over for up to a second
+ */
+static void zlc_clear_zones_full(struct zonelist *zonelist)
+{
+ struct zonelist_cache *zlc; /* cached zonelist speedup info */
+
+ zlc = zonelist->zlcache_ptr;
+ if (!zlc)
+ return;
+
+ bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
+}
+
#else /* CONFIG_NUMA */
static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
@@ -1632,6 +1632,10 @@ static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z,
static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
{
}
+
+static void zlc_clear_zones_full(struct zonelist *zonelist)
+{
+}
#endif /* CONFIG_NUMA */
/*
@@ -1664,7 +1668,7 @@ zonelist_scan:
continue;
if ((alloc_flags & ALLOC_CPUSET) &&
!cpuset_zone_allowed_softwall(zone, gfp_mask))
- goto try_next_zone;
+ continue;
BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
@@ -1676,17 +1680,36 @@ zonelist_scan:
classzone_idx, alloc_flags))
goto try_this_zone;
+ if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) {
+ /*
+ * we do zlc_setup if there are multiple nodes
+ * and before considering the first zone allowed
+ * by the cpuset.
+ */
+ allowednodes = zlc_setup(zonelist, alloc_flags);
+ zlc_active = 1;
+ did_zlc_setup = 1;
+ }
+
if (zone_reclaim_mode == 0)
goto this_zone_full;
+ /*
+ * As we may have just activated ZLC, check if the first
+ * eligible zone has failed zone_reclaim recently.
+ */
+ if (NUMA_BUILD && zlc_active &&
+ !zlc_zone_worth_trying(zonelist, z, allowednodes))
+ continue;
+
ret = zone_reclaim(zone, gfp_mask, order);
switch (ret) {
case ZONE_RECLAIM_NOSCAN:
/* did not scan */
- goto try_next_zone;
+ continue;
case ZONE_RECLAIM_FULL:
/* scanned but unreclaimable */
- goto this_zone_full;
+ continue;
default:
/* did we reclaim enough */
if (!zone_watermark_ok(zone, order, mark,
@@ -1703,16 +1726,6 @@ try_this_zone:
this_zone_full:
if (NUMA_BUILD)
zlc_mark_zone_full(zonelist, z);
-try_next_zone:
- if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) {
- /*
- * we do zlc_setup after the first zone is tried but only
- * if there are multiple nodes make it worthwhile
- */
- allowednodes = zlc_setup(zonelist, alloc_flags);
- zlc_active = 1;
- did_zlc_setup = 1;
- }
}
if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) {
@@ -1954,6 +1967,10 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
if (unlikely(!(*did_some_progress)))
return NULL;
+ /* After successful reclaim, reconsider all zones for allocation */
+ if (NUMA_BUILD)
+ zlc_clear_zones_full(zonelist);
+
retry:
page = get_page_from_freelist(gfp_mask, nodemask, order,
zonelist, high_zoneidx,
@@ -4585,6 +4602,60 @@ void __init sort_node_map(void)
cmp_node_active_region, NULL);
}
+/**
+ * node_map_pfn_alignment - determine the maximum internode alignment
+ *
+ * This function should be called after node map is populated and sorted.
+ * It calculates the maximum power of two alignment which can distinguish
+ * all the nodes.
+ *
+ * For example, if all nodes are 1GiB and aligned to 1GiB, the return value
+ * would indicate 1GiB alignment with (1 << (30 - PAGE_SHIFT)). If the
+ * nodes are shifted by 256MiB, 256MiB. Note that if only the last node is
+ * shifted, 1GiB is enough and this function will indicate so.
+ *
+ * This is used to test whether pfn -> nid mapping of the chosen memory
+ * model has fine enough granularity to avoid incorrect mapping for the
+ * populated node map.
+ *
+ * Returns the determined alignment in pfn's. 0 if there is no alignment
+ * requirement (single node).
+ */
+unsigned long __init node_map_pfn_alignment(void)
+{
+ unsigned long accl_mask = 0, last_end = 0;
+ int last_nid = -1;
+ int i;
+
+ for_each_active_range_index_in_nid(i, MAX_NUMNODES) {
+ int nid = early_node_map[i].nid;
+ unsigned long start = early_node_map[i].start_pfn;
+ unsigned long end = early_node_map[i].end_pfn;
+ unsigned long mask;
+
+ if (!start || last_nid < 0 || last_nid == nid) {
+ last_nid = nid;
+ last_end = end;
+ continue;
+ }
+
+ /*
+ * Start with a mask granular enough to pin-point to the
+ * start pfn and tick off bits one-by-one until it becomes
+ * too coarse to separate the current node from the last.
+ */
+ mask = ~((1 << __ffs(start)) - 1);
+ while (mask && last_end <= (start & (mask << 1)))
+ mask <<= 1;
+
+ /* accumulate all internode masks */
+ accl_mask |= mask;
+ }
+
+ /* convert mask to number of pages */
+ return ~accl_mask + 1;
+}
+
/* Find the lowest pfn for a node */
static unsigned long __init find_min_pfn_for_node(int nid)
{