Diffstat (limited to 'arch/x86_64/mm')
-rw-r--r--  arch/x86_64/mm/fault.c       |  55
-rw-r--r--  arch/x86_64/mm/init.c        |  48
-rw-r--r--  arch/x86_64/mm/k8topology.c  |  13
-rw-r--r--  arch/x86_64/mm/numa.c        |  15
-rw-r--r--  arch/x86_64/mm/pageattr.c    |  24
-rw-r--r--  arch/x86_64/mm/srat.c        |  97
6 files changed, 156 insertions(+), 96 deletions(-)
diff --git a/arch/x86_64/mm/fault.c b/arch/x86_64/mm/fault.c
index 635e58d443d7..54816adb8e93 100644
--- a/arch/x86_64/mm/fault.c
+++ b/arch/x86_64/mm/fault.c
@@ -159,7 +159,7 @@ void dump_pagetable(unsigned long address)
pmd_t *pmd;
pte_t *pte;
- asm("movq %%cr3,%0" : "=r" (pgd));
+ pgd = (pgd_t *)read_cr3();
pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK);
pgd += pgd_index(address);
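For reference, read_cr2() and read_cr3() are thin wrappers around the same MOV-from-control-register instructions the open-coded asm used; at this point in the tree they lived in include/asm-x86_64/system.h and looked roughly like this (a sketch, not the verbatim header):

	static inline unsigned long read_cr2(void)
	{
		unsigned long cr2;
		asm volatile("movq %%cr2,%0" : "=r" (cr2));
		return cr2;
	}

	static inline unsigned long read_cr3(void)
	{
		unsigned long cr3;
		asm volatile("movq %%cr3,%0" : "=r" (cr3));
		return cr3;
	}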
@@ -221,16 +221,6 @@ static int is_errata93(struct pt_regs *regs, unsigned long address)
return 0;
}
-int unhandled_signal(struct task_struct *tsk, int sig)
-{
- if (is_init(tsk))
- return 1;
- if (tsk->ptrace & PT_PTRACED)
- return 0;
- return (tsk->sighand->action[sig-1].sa.sa_handler == SIG_IGN) ||
- (tsk->sighand->action[sig-1].sa.sa_handler == SIG_DFL);
-}
-
static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
unsigned long error_code)
{
@@ -301,8 +291,8 @@ static int vmalloc_fault(unsigned long address)
return 0;
}
-int page_fault_trace = 0;
-int exception_trace = 1;
+static int page_fault_trace;
+int show_unhandled_signals = 1;
/*
* This routine handles page faults. It determines the address,
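The unhandled_signal() helper deleted above is not gone: this series consolidates an identical copy into generic code so i386 and x86-64 share it, and renames the exception_trace toggle to the cross-arch show_unhandled_signals that gates its callers. The shared helper reads roughly like the removed one (sketch; the generic copy lives in kernel/signal.c):

	int unhandled_signal(struct task_struct *tsk, int sig)
	{
		if (is_init(tsk))
			return 1;
		if (tsk->ptrace & PT_PTRACED)
			return 0;
		return (tsk->sighand->action[sig-1].sa.sa_handler == SIG_IGN) ||
			(tsk->sighand->action[sig-1].sa.sa_handler == SIG_DFL);
	}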
@@ -317,7 +307,7 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
struct vm_area_struct * vma;
unsigned long address;
const struct exception_table_entry *fixup;
- int write;
+ int write, fault;
unsigned long flags;
siginfo_t info;
@@ -326,7 +316,7 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
prefetchw(&mm->mmap_sem);
/* get the address */
- __asm__("movq %%cr2,%0":"=r" (address));
+ address = read_cr2();
info.si_code = SEGV_MAPERR;
@@ -384,6 +374,13 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
if (unlikely(in_atomic() || !mm))
goto bad_area_nosemaphore;
+ /*
+ * User-mode registers count as a user access even for any
+ * potential system fault or CPU buglet.
+ */
+ if (user_mode_vm(regs))
+ error_code |= PF_USER;
+
again:
/* When running in the kernel we expect faults to occur only to
* addresses in user space. All other faults represent errors in the
@@ -450,19 +447,18 @@ good_area:
* make sure we exit gracefully rather than endlessly redo
* the fault.
*/
- switch (handle_mm_fault(mm, vma, address, write)) {
- case VM_FAULT_MINOR:
- tsk->min_flt++;
- break;
- case VM_FAULT_MAJOR:
- tsk->maj_flt++;
- break;
- case VM_FAULT_SIGBUS:
- goto do_sigbus;
- default:
- goto out_of_memory;
+ fault = handle_mm_fault(mm, vma, address, write);
+ if (unlikely(fault & VM_FAULT_ERROR)) {
+ if (fault & VM_FAULT_OOM)
+ goto out_of_memory;
+ else if (fault & VM_FAULT_SIGBUS)
+ goto do_sigbus;
+ BUG();
}
-
+ if (fault & VM_FAULT_MAJOR)
+ tsk->maj_flt++;
+ else
+ tsk->min_flt++;
up_read(&mm->mmap_sem);
return;
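handle_mm_fault() now returns a bitmask rather than an enumeration, which is why the switch above becomes flag tests. Roughly, from include/linux/mm.h of this series (a sketch; exact values are illustrative):

	#define VM_FAULT_OOM	0x0001
	#define VM_FAULT_SIGBUS	0x0002
	#define VM_FAULT_MAJOR	0x0004
	#define VM_FAULT_WRITE	0x0008	/* special case for get_user_pages() */

	#define VM_FAULT_ERROR	(VM_FAULT_OOM | VM_FAULT_SIGBUS)

Testing VM_FAULT_ERROR first keeps the fast path (minor/major fault accounting) free of error-handling branches.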
@@ -495,7 +491,8 @@ bad_area_nosemaphore:
(address >> 32))
return;
- if (exception_trace && unhandled_signal(tsk, SIGSEGV)) {
+ if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
+ printk_ratelimit()) {
printk(
"%s%s[%d]: segfault at %016lx rip %016lx rsp %016lx error %lx\n",
tsk->pid > 1 ? KERN_INFO : KERN_EMERG,
@@ -569,7 +566,7 @@ out_of_memory:
}
printk("VM: killing process %s\n", tsk->comm);
if (error_code & 4)
- do_exit(SIGKILL);
+ do_group_exit(SIGKILL);
goto no_context;
do_sigbus:
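One more behavioral fix in this file: the out_of_memory path switches from do_exit() to do_group_exit(). do_exit() terminates only the faulting thread, while do_group_exit() first takes down every other thread in the group, which is the right semantics when OOM-killing a multithreaded process (reference prototype only):

	/* kernel/exit.c: kills the whole thread group, then exits the caller. */
	void do_group_exit(int exit_code);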
diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c
index 9a0e98accf04..458893b376f8 100644
--- a/arch/x86_64/mm/init.c
+++ b/arch/x86_64/mm/init.c
@@ -383,7 +383,7 @@ void __meminit init_memory_mapping(unsigned long start, unsigned long end)
}
if (!after_bootmem)
- asm volatile("movq %%cr4,%0" : "=r" (mmu_cr4_features));
+ mmu_cr4_features = read_cr4();
__flush_tlb_all();
}
@@ -697,41 +697,6 @@ int kern_addr_valid(unsigned long addr)
return pfn_valid(pte_pfn(*pte));
}
-#ifdef CONFIG_SYSCTL
-#include <linux/sysctl.h>
-
-extern int exception_trace, page_fault_trace;
-
-static ctl_table debug_table2[] = {
- {
- .ctl_name = 99,
- .procname = "exception-trace",
- .data = &exception_trace,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec
- },
- {}
-};
-
-static ctl_table debug_root_table2[] = {
- {
- .ctl_name = CTL_DEBUG,
- .procname = "debug",
- .mode = 0555,
- .child = debug_table2
- },
- {}
-};
-
-static __init int x8664_sysctl_init(void)
-{
- register_sysctl_table(debug_root_table2);
- return 0;
-}
-__initcall(x8664_sysctl_init);
-#endif
-
/* A pseudo VMA to allow ptrace access for the vsyscall page. This only
covers the 64bit vsyscall page now. 32bit has a real VMA now and does
not need special handling anymore. */
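The private debug_table2 removed above becomes redundant because the same knob is now registered from generic code against the renamed show_unhandled_signals. The replacement entry in kernel/sysctl.c looks roughly like this (a sketch, assuming this series wires it into the generic debug table):

	{
		.ctl_name	= CTL_UNNUMBERED,
		.procname	= "exception-trace",
		.data		= &show_unhandled_signals,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec
	},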
@@ -769,8 +734,17 @@ int in_gate_area_no_task(unsigned long addr)
return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END);
}
-void *alloc_bootmem_high_node(pg_data_t *pgdat, unsigned long size)
+void * __init alloc_bootmem_high_node(pg_data_t *pgdat, unsigned long size)
{
return __alloc_bootmem_core(pgdat->bdata, size,
SMP_CACHE_BYTES, (4UL*1024*1024*1024), 0);
}
+
+const char *arch_vma_name(struct vm_area_struct *vma)
+{
+ if (vma->vm_mm && vma->vm_start == (long)vma->vm_mm->context.vdso)
+ return "[vdso]";
+ if (vma == &gate_vma)
+ return "[vsyscall]";
+ return NULL;
+}
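arch_vma_name() is the hook /proc/<pid>/maps uses to label special mappings that have no backing file. The generic consumer looks roughly like this (sketch of the fs/proc/task_mmu.c side):

	const char *name = arch_vma_name(vma);
	if (name)
		seq_puts(m, name);	/* prints "[vdso]" or "[vsyscall]" */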
diff --git a/arch/x86_64/mm/k8topology.c b/arch/x86_64/mm/k8topology.c
index f983c75825d0..a96006f7ae0c 100644
--- a/arch/x86_64/mm/k8topology.c
+++ b/arch/x86_64/mm/k8topology.c
@@ -44,12 +44,12 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
{
unsigned long prevbase;
struct bootnode nodes[8];
- int nodeid, i, nb;
+ int nodeid, i, j, nb;
unsigned char nodeids[8];
int found = 0;
u32 reg;
unsigned numnodes;
- unsigned dualcore = 0;
+ unsigned num_cores;
if (!early_pci_allowed())
return -1;
@@ -60,6 +60,9 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
printk(KERN_INFO "Scanning NUMA topology in Northbridge %d\n", nb);
+ num_cores = (cpuid_ecx(0x80000008) & 0xff) + 1;
+ printk(KERN_INFO "CPU has %d cores\n", num_cores);
+
reg = read_pci_config(0, nb, 0, 0x60);
numnodes = ((reg >> 4) & 0xF) + 1;
if (numnodes <= 1)
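On AMD K8, CPUID leaf 0x80000008 reports NC, the number of cores per package minus one, in ECX[7:0], so the core count no longer has to be inferred from northbridge register 0xe8 (which was undefined before E stepping). cpuid_ecx() is the usual <asm/processor.h> helper; a sketch:

	static inline unsigned int cpuid_ecx(unsigned int op)
	{
		unsigned int eax, ebx, ecx, edx;

		asm volatile("cpuid"
			     : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
			     : "0" (op));
		return ecx;
	}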
@@ -73,8 +76,6 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
unsigned long base,limit;
u32 nodeid;
- /* Undefined before E stepping, but hopefully 0 */
- dualcore |= ((read_pci_config(0, nb, 3, 0xe8) >> 12) & 3) == 1;
base = read_pci_config(0, nb, 1, 0x40 + i*8);
limit = read_pci_config(0, nb, 1, 0x44 + i*8);
@@ -170,8 +171,8 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
for (i = 0; i < 8; i++) {
if (nodes[i].start != nodes[i].end) {
nodeid = nodeids[i];
- apicid_to_node[nodeid << dualcore] = i;
- apicid_to_node[(nodeid << dualcore) + dualcore] = i;
+ for (j = 0; j < num_cores; j++)
+ apicid_to_node[(nodeid * num_cores) + j] = i;
setup_node_bootmem(i, nodes[i].start, nodes[i].end);
}
}
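A quick worked example of the new mapping: with num_cores = 2, node 1's CPUs carry APIC IDs 1*2+0 = 2 and 1*2+1 = 3, so apicid_to_node[2] and apicid_to_node[3] both point at node 1. The old shift-based code could express that only for one or two cores; the multiply-and-loop form also covers quad-core parts.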
diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c
index 51548947ad3b..6da235522269 100644
--- a/arch/x86_64/mm/numa.c
+++ b/arch/x86_64/mm/numa.c
@@ -273,9 +273,6 @@ void __init numa_init_array(void)
#ifdef CONFIG_NUMA_EMU
/* Numa emulation */
-#define E820_ADDR_HOLE_SIZE(start, end) \
- (e820_hole_size((start) >> PAGE_SHIFT, (end) >> PAGE_SHIFT) << \
- PAGE_SHIFT)
char *cmdline __initdata;
/*
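The deleted E820_ADDR_HOLE_SIZE wrapper existed only to convert byte addresses to PFNs and back; the series changes e820_hole_size() itself to work in byte addresses, so the callers below use it directly. The new interface is roughly (a sketch, assuming the matching arch/x86_64/kernel/e820.c change from this series):

	/* Bytes in [start, end) not covered by E820 RAM ranges. */
	unsigned long __init e820_hole_size(unsigned long start,
					    unsigned long end);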
@@ -319,7 +316,7 @@ static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr,
return -1;
if (num_nodes > MAX_NUMNODES)
num_nodes = MAX_NUMNODES;
- size = (max_addr - *addr - E820_ADDR_HOLE_SIZE(*addr, max_addr)) /
+ size = (max_addr - *addr - e820_hole_size(*addr, max_addr)) /
num_nodes;
/*
* Calculate the number of big nodes that can be allocated as a result
@@ -347,7 +344,7 @@ static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr,
if (i == num_nodes + node_start - 1)
end = max_addr;
else
- while (end - *addr - E820_ADDR_HOLE_SIZE(*addr, end) <
+ while (end - *addr - e820_hole_size(*addr, end) <
size) {
end += FAKE_NODE_MIN_SIZE;
if (end > max_addr) {
@@ -476,18 +473,22 @@ out:
/*
* We need to vacate all active ranges that may have been registered by
- * SRAT.
+ * SRAT and set acpi_numa to -1 so that srat_disabled() always returns
+ * true. NUMA emulation has succeeded so we will not scan ACPI nodes.
*/
remove_all_active_ranges();
+#ifdef CONFIG_ACPI_NUMA
+ acpi_numa = -1;
+#endif
for_each_node_mask(i, node_possible_map) {
e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT,
nodes[i].end >> PAGE_SHIFT);
setup_node_bootmem(i, nodes[i].start, nodes[i].end);
}
+ acpi_fake_nodes(nodes, num_nodes);
numa_init_array();
return 0;
}
-#undef E820_ADDR_HOLE_SIZE
#endif /* CONFIG_NUMA_EMU */
void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
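Setting acpi_numa to -1 works because the SRAT code keys off its sign; srat_disabled() reads roughly like this (sketch of the arch/x86_64/mm/srat.c helper):

	static inline int srat_disabled(void)
	{
		return numa_off || acpi_numa < 0;
	}

With that set, acpi_scan_nodes() bails out immediately and the emulated layout cannot be overwritten by real SRAT data.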
diff --git a/arch/x86_64/mm/pageattr.c b/arch/x86_64/mm/pageattr.c
index 9148f4a4cec6..10b9809ce821 100644
--- a/arch/x86_64/mm/pageattr.c
+++ b/arch/x86_64/mm/pageattr.c
@@ -13,7 +13,7 @@
#include <asm/tlbflush.h>
#include <asm/io.h>
-static inline pte_t *lookup_address(unsigned long address)
+pte_t *lookup_address(unsigned long address)
{
pgd_t *pgd = pgd_offset_k(address);
pud_t *pud;
@@ -74,14 +74,13 @@ static void flush_kernel_map(void *arg)
struct page *pg;
/* When clflush is available always use it because it is
- much cheaper than WBINVD. Disable clflush for now because
- the high level code is not ready yet */
+ much cheaper than WBINVD. */
+ /* clflush is still broken. Disable for now. */
if (1 || !cpu_has_clflush)
asm volatile("wbinvd" ::: "memory");
else list_for_each_entry(pg, l, lru) {
void *adr = page_address(pg);
- if (cpu_has_clflush)
- cache_flush_page(adr);
+ cache_flush_page(adr);
}
__flush_tlb_all();
}
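For reference, cache_flush_page() walks the page in cacheline-sized steps issuing CLFLUSH; something like this (a sketch, assuming the x86-64 headers of this era):

	static inline void cache_flush_page(void *adr)
	{
		int i;

		for (i = 0; i < PAGE_SIZE; i += boot_cpu_data.x86_clflush_size)
			asm volatile("clflush (%0)" :: "r" (adr + i));
	}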
@@ -95,7 +94,8 @@ static LIST_HEAD(deferred_pages); /* protected by init_mm.mmap_sem */
static inline void save_page(struct page *fpage)
{
- list_add(&fpage->lru, &deferred_pages);
+ if (!test_and_set_bit(PG_arch_1, &fpage->flags))
+ list_add(&fpage->lru, &deferred_pages);
}
/*
@@ -129,9 +129,12 @@ __change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot,
pte_t *kpte;
struct page *kpte_page;
pgprot_t ref_prot2;
+
kpte = lookup_address(address);
if (!kpte) return 0;
kpte_page = virt_to_page(((unsigned long)kpte) & PAGE_MASK);
+ BUG_ON(PageLRU(kpte_page));
+ BUG_ON(PageCompound(kpte_page));
if (pgprot_val(prot) != pgprot_val(ref_prot)) {
if (!pte_huge(*kpte)) {
set_pte(kpte, pfn_pte(pfn, prot));
@@ -159,10 +162,9 @@ __change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot,
/* on x86-64 the direct mapping set at boot is not using 4k pages */
BUG_ON(PageReserved(kpte_page));
- if (page_private(kpte_page) == 0) {
- save_page(kpte_page);
+ save_page(kpte_page);
+ if (page_private(kpte_page) == 0)
revert_page(address, ref_prot);
- }
return 0;
}
@@ -234,6 +236,10 @@ void global_flush_tlb(void)
flush_map(&l);
list_for_each_entry_safe(pg, next, &l, lru) {
+ list_del(&pg->lru);
+ clear_bit(PG_arch_1, &pg->flags);
+ if (page_private(pg) != 0)
+ continue;
ClearPagePrivate(pg);
__free_page(pg);
}
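Taken together, PG_arch_1 now acts as a "queued" flag for kpte pages: save_page() queues a page at most once no matter how many ptes on it change, and global_flush_tlb() unqueues it, clears the flag, and frees it only once its split-page reference count (page_private) has dropped to zero. In outline:

	/*
	 * __change_page_attr():
	 *	save_page(kpte_page);		queued once via PG_arch_1
	 *	if (page_private() == 0)
	 *		revert_page(...);	last 4k pte gone: restore 2M pte
	 *
	 * global_flush_tlb():
	 *	list_del() + clear_bit(PG_arch_1);	may be re-queued later
	 *	if (page_private() != 0) continue;	still holds split ptes
	 *	ClearPagePrivate() + __free_page()
	 */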
diff --git a/arch/x86_64/mm/srat.c b/arch/x86_64/mm/srat.c
index 1e76bb0a7277..acdf03e19146 100644
--- a/arch/x86_64/mm/srat.c
+++ b/arch/x86_64/mm/srat.c
@@ -106,9 +106,9 @@ static __init int slit_valid(struct acpi_table_slit *slit)
for (j = 0; j < d; j++) {
u8 val = slit->entry[d*i + j];
if (i == j) {
- if (val != 10)
+ if (val != LOCAL_DISTANCE)
return 0;
- } else if (val <= 10)
+ } else if (val <= LOCAL_DISTANCE)
return 0;
}
}
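LOCAL_DISTANCE and REMOTE_DISTANCE replace the bare 10s and 20s. Per the ACPI specification, SLIT entries are normalized so that a node's distance to itself is 10; the named constants come from include/linux/topology.h:

	#define LOCAL_DISTANCE		10
	#define REMOTE_DISTANCE		20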
@@ -350,7 +350,7 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
/* Sanity check to catch more bad SRATs (they are amazingly common).
Make sure the PXMs cover all memory. */
-static int nodes_cover_memory(void)
+static int __init nodes_cover_memory(const struct bootnode *nodes)
{
int i;
unsigned long pxmram, e820ram;
@@ -394,6 +394,9 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
{
int i;
+ if (acpi_numa <= 0)
+ return -1;
+
/* First clean up the node list */
for (i = 0; i < MAX_NUMNODES; i++) {
cutoff_node(i, start, end);
@@ -403,10 +406,7 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
}
}
- if (acpi_numa <= 0)
- return -1;
-
- if (!nodes_cover_memory()) {
+ if (!nodes_cover_memory(nodes)) {
bad_srat();
return -1;
}
@@ -440,6 +440,86 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
return 0;
}
+#ifdef CONFIG_NUMA_EMU
+static int __init find_node_by_addr(unsigned long addr)
+{
+ int ret = NUMA_NO_NODE;
+ int i;
+
+ for_each_node_mask(i, nodes_parsed) {
+ /*
+ * Find the real node that this emulated node appears on. For
+ * the sake of simplicity, we only use a real node's starting
+ * address to determine which emulated node it appears on.
+ */
+ if (addr >= nodes[i].start && addr < nodes[i].end) {
+ ret = i;
+ break;
+ }
+ }
+ return ret;
+}
+
+/*
+ * In NUMA emulation, we need to setup proximity domain (_PXM) to node ID
+ * mappings that respect the real ACPI topology but reflect our emulated
+ * environment. For each emulated node, we find which real node it appears on
+ * and create PXM to NID mappings for those fake nodes which mirror that
+ * locality. SLIT will now represent the correct distances between emulated
+ * nodes as a result of the real topology.
+ */
+void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes)
+{
+ int i, j;
+ int fake_node_to_pxm_map[MAX_NUMNODES] = {
+ [0 ... MAX_NUMNODES-1] = PXM_INVAL
+ };
+ unsigned char fake_apicid_to_node[MAX_LOCAL_APIC] = {
+ [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
+ };
+
+ printk(KERN_INFO "Faking PXM affinity for fake nodes on real "
+ "topology.\n");
+ for (i = 0; i < num_nodes; i++) {
+ int nid, pxm;
+
+ nid = find_node_by_addr(fake_nodes[i].start);
+ if (nid == NUMA_NO_NODE)
+ continue;
+ pxm = node_to_pxm(nid);
+ if (pxm == PXM_INVAL)
+ continue;
+ fake_node_to_pxm_map[i] = pxm;
+ /*
+ * For each apicid_to_node mapping that exists for this real
+ * node, it must now point to the fake node ID.
+ */
+ for (j = 0; j < MAX_LOCAL_APIC; j++)
+ if (apicid_to_node[j] == nid)
+ fake_apicid_to_node[j] = i;
+ }
+ for (i = 0; i < num_nodes; i++)
+ __acpi_map_pxm_to_node(fake_node_to_pxm_map[i], i);
+ memcpy(apicid_to_node, fake_apicid_to_node, sizeof(apicid_to_node));
+
+ nodes_clear(nodes_parsed);
+ for (i = 0; i < num_nodes; i++)
+ if (fake_nodes[i].start != fake_nodes[i].end)
+ node_set(i, nodes_parsed);
+ WARN_ON(!nodes_cover_memory(fake_nodes));
+}
+
+static int null_slit_node_compare(int a, int b)
+{
+ return node_to_pxm(a) == node_to_pxm(b);
+}
+#else
+static int null_slit_node_compare(int a, int b)
+{
+ return a == b;
+}
+#endif /* CONFIG_NUMA_EMU */
+
void __init srat_reserve_add_area(int nodeid)
{
if (found_add_area && nodes_add[nodeid].end) {
@@ -464,7 +544,8 @@ int __node_distance(int a, int b)
int index;
if (!acpi_slit)
- return a == b ? 10 : 20;
+ return null_slit_node_compare(a, b) ? LOCAL_DISTANCE :
+ REMOTE_DISTANCE;
index = acpi_slit->locality_count * node_to_pxm(a);
return acpi_slit->entry[index + node_to_pxm(b)];
}
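__node_distance() is what this architecture wires up behind the generic node_distance() macro consumed by the scheduler and page allocator; roughly (sketch of include/asm-x86_64/topology.h):

	#define node_distance(a, b)	__node_distance(a, b)

With a null SLIT, node_distance() now still reports LOCAL_DISTANCE for emulated nodes that share a proximity domain, thanks to null_slit_node_compare().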