Diffstat (limited to 'arch/x86_64/mm')
-rw-r--r--  arch/x86_64/mm/fault.c       | 55
-rw-r--r--  arch/x86_64/mm/init.c        | 48
-rw-r--r--  arch/x86_64/mm/k8topology.c  | 13
-rw-r--r--  arch/x86_64/mm/numa.c        | 15
-rw-r--r--  arch/x86_64/mm/pageattr.c    | 24
-rw-r--r--  arch/x86_64/mm/srat.c        | 97
6 files changed, 156 insertions(+), 96 deletions(-)
diff --git a/arch/x86_64/mm/fault.c b/arch/x86_64/mm/fault.c
index 635e58d443d7..54816adb8e93 100644
--- a/arch/x86_64/mm/fault.c
+++ b/arch/x86_64/mm/fault.c
@@ -159,7 +159,7 @@ void dump_pagetable(unsigned long address)
 	pmd_t *pmd;
 	pte_t *pte;

-	asm("movq %%cr3,%0" : "=r" (pgd));
+	pgd = (pgd_t *)read_cr3();

 	pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK);
 	pgd += pgd_index(address);
@@ -221,16 +221,6 @@ static int is_errata93(struct pt_regs *regs, unsigned long address)
 	return 0;
 }

-int unhandled_signal(struct task_struct *tsk, int sig)
-{
-	if (is_init(tsk))
-		return 1;
-	if (tsk->ptrace & PT_PTRACED)
-		return 0;
-	return (tsk->sighand->action[sig-1].sa.sa_handler == SIG_IGN) ||
-		(tsk->sighand->action[sig-1].sa.sa_handler == SIG_DFL);
-}
-
 static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
 				 unsigned long error_code)
 {
@@ -301,8 +291,8 @@ static int vmalloc_fault(unsigned long address)
 	return 0;
 }

-int page_fault_trace = 0;
-int exception_trace = 1;
+static int page_fault_trace;
+int show_unhandled_signals = 1;

 /*
  * This routine handles page faults.  It determines the address,
@@ -317,7 +307,7 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
 	struct vm_area_struct * vma;
 	unsigned long address;
 	const struct exception_table_entry *fixup;
-	int write;
+	int write, fault;
 	unsigned long flags;
 	siginfo_t info;

@@ -326,7 +316,7 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
 	prefetchw(&mm->mmap_sem);

 	/* get the address */
-	__asm__("movq %%cr2,%0":"=r" (address));
+	address = read_cr2();

 	info.si_code = SEGV_MAPERR;

@@ -384,6 +374,13 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
 	if (unlikely(in_atomic() || !mm))
 		goto bad_area_nosemaphore;

+	/*
+	 * User-mode registers count as a user access even for any
+	 * potential system fault or CPU buglet.
+	 */
+	if (user_mode_vm(regs))
+		error_code |= PF_USER;
+
  again:
 	/* When running in the kernel we expect faults to occur only to
 	 * addresses in user space.  All other faults represent errors in the
@@ -450,19 +447,18 @@ good_area:
 	 * make sure we exit gracefully rather than endlessly redo
 	 * the fault.
 	 */
-	switch (handle_mm_fault(mm, vma, address, write)) {
-	case VM_FAULT_MINOR:
-		tsk->min_flt++;
-		break;
-	case VM_FAULT_MAJOR:
-		tsk->maj_flt++;
-		break;
-	case VM_FAULT_SIGBUS:
-		goto do_sigbus;
-	default:
-		goto out_of_memory;
+	fault = handle_mm_fault(mm, vma, address, write);
+	if (unlikely(fault & VM_FAULT_ERROR)) {
+		if (fault & VM_FAULT_OOM)
+			goto out_of_memory;
+		else if (fault & VM_FAULT_SIGBUS)
+			goto do_sigbus;
+		BUG();
 	}
-
+	if (fault & VM_FAULT_MAJOR)
+		tsk->maj_flt++;
+	else
+		tsk->min_flt++;
 	up_read(&mm->mmap_sem);
 	return;

@@ -495,7 +491,8 @@ bad_area_nosemaphore:
 		    (address >> 32))
 			return;

-		if (exception_trace && unhandled_signal(tsk, SIGSEGV)) {
+		if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
+		    printk_ratelimit()) {
 			printk(
 		       "%s%s[%d]: segfault at %016lx rip %016lx rsp %016lx error %lx\n",
 					tsk->pid > 1 ? KERN_INFO : KERN_EMERG,
@@ -569,7 +566,7 @@ out_of_memory:
 	}
 	printk("VM: killing process %s\n", tsk->comm);
 	if (error_code & 4)
-		do_exit(SIGKILL);
+		do_group_exit(SIGKILL);
 	goto no_context;

 do_sigbus:
diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c
index 9a0e98accf04..458893b376f8 100644
--- a/arch/x86_64/mm/init.c
+++ b/arch/x86_64/mm/init.c
@@ -383,7 +383,7 @@ void __meminit init_memory_mapping(unsigned long start, unsigned long end)
 	}

 	if (!after_bootmem)
-		asm volatile("movq %%cr4,%0" : "=r" (mmu_cr4_features));
+		mmu_cr4_features = read_cr4();
 	__flush_tlb_all();
 }

@@ -697,41 +697,6 @@ int kern_addr_valid(unsigned long addr)
 	return pfn_valid(pte_pfn(*pte));
 }

-#ifdef CONFIG_SYSCTL
-#include <linux/sysctl.h>
-
-extern int exception_trace, page_fault_trace;
-
-static ctl_table debug_table2[] = {
-	{
-		.ctl_name = 99,
-		.procname = "exception-trace",
-		.data = &exception_trace,
-		.maxlen = sizeof(int),
-		.mode = 0644,
-		.proc_handler = proc_dointvec
-	},
-	{}
-};
-
-static ctl_table debug_root_table2[] = {
-	{
-		.ctl_name = CTL_DEBUG,
-		.procname = "debug",
-		.mode = 0555,
-		.child = debug_table2
-	},
-	{}
-};
-
-static __init int x8664_sysctl_init(void)
-{
-	register_sysctl_table(debug_root_table2);
-	return 0;
-}
-__initcall(x8664_sysctl_init);
-#endif
-
 /* A pseudo VMA to allow ptrace access for the vsyscall page.  This only
    covers the 64bit vsyscall page now. 32bit has a real VMA now and does
    not need special handling anymore. */
@@ -769,8 +734,17 @@ int in_gate_area_no_task(unsigned long addr)
 	return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END);
 }

-void *alloc_bootmem_high_node(pg_data_t *pgdat, unsigned long size)
+void * __init alloc_bootmem_high_node(pg_data_t *pgdat, unsigned long size)
 {
 	return __alloc_bootmem_core(pgdat->bdata, size, SMP_CACHE_BYTES,
 			(4UL*1024*1024*1024), 0);
 }
+
+const char *arch_vma_name(struct vm_area_struct *vma)
+{
+	if (vma->vm_mm && vma->vm_start == (long)vma->vm_mm->context.vdso)
+		return "[vdso]";
+	if (vma == &gate_vma)
+		return "[vsyscall]";
+	return NULL;
+}
diff --git a/arch/x86_64/mm/k8topology.c b/arch/x86_64/mm/k8topology.c
index f983c75825d0..a96006f7ae0c 100644
--- a/arch/x86_64/mm/k8topology.c
+++ b/arch/x86_64/mm/k8topology.c
@@ -44,12 +44,12 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
 {
 	unsigned long prevbase;
 	struct bootnode nodes[8];
-	int nodeid, i, nb;
+	int nodeid, i, j, nb;
 	unsigned char nodeids[8];
 	int found = 0;
 	u32 reg;
 	unsigned numnodes;
-	unsigned dualcore = 0;
+	unsigned num_cores;

 	if (!early_pci_allowed())
 		return -1;
@@ -60,6 +60,9 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)

 	printk(KERN_INFO "Scanning NUMA topology in Northbridge %d\n", nb);

+	num_cores = (cpuid_ecx(0x80000008) & 0xff) + 1;
+	printk(KERN_INFO "CPU has %d num_cores\n", num_cores);
+
 	reg = read_pci_config(0, nb, 0, 0x60);
 	numnodes = ((reg >> 4) & 0xF) + 1;
 	if (numnodes <= 1)
@@ -73,8 +76,6 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
 		unsigned long base,limit;
 		u32 nodeid;

-		/* Undefined before E stepping, but hopefully 0 */
-		dualcore |= ((read_pci_config(0, nb, 3, 0xe8) >> 12) & 3) == 1;
 		base = read_pci_config(0, nb, 1, 0x40 + i*8);
 		limit = read_pci_config(0, nb, 1, 0x44 + i*8);

@@ -170,8 +171,8 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
 	for (i = 0; i < 8; i++) {
 		if (nodes[i].start != nodes[i].end) {
 			nodeid = nodeids[i];
-			apicid_to_node[nodeid << dualcore] = i;
-			apicid_to_node[(nodeid << dualcore) + dualcore] = i;
+			for (j = 0; j < num_cores; j++)
+				apicid_to_node[(nodeid * num_cores) + j] = i;
 			setup_node_bootmem(i, nodes[i].start, nodes[i].end);
 		}
 	}
diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c
index 51548947ad3b..6da235522269 100644
--- a/arch/x86_64/mm/numa.c
+++ b/arch/x86_64/mm/numa.c
@@ -273,9 +273,6 @@ void __init numa_init_array(void)

 #ifdef CONFIG_NUMA_EMU
 /* Numa emulation */
-#define E820_ADDR_HOLE_SIZE(start, end) \
-	(e820_hole_size((start) >> PAGE_SHIFT, (end) >> PAGE_SHIFT) << \
-		PAGE_SHIFT)
 char *cmdline __initdata;

 /*
@@ -319,7 +316,7 @@ static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr,
 		return -1;
 	if (num_nodes > MAX_NUMNODES)
 		num_nodes = MAX_NUMNODES;
-	size = (max_addr - *addr - E820_ADDR_HOLE_SIZE(*addr, max_addr)) /
+	size = (max_addr - *addr - e820_hole_size(*addr, max_addr)) /
 		num_nodes;
 	/*
 	 * Calculate the number of big nodes that can be allocated as a result
@@ -347,7 +344,7 @@ static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr,
 		if (i == num_nodes + node_start - 1)
 			end = max_addr;
 		else
-			while (end - *addr - E820_ADDR_HOLE_SIZE(*addr, end) <
+			while (end - *addr - e820_hole_size(*addr, end) <
 			       size) {
 				end += FAKE_NODE_MIN_SIZE;
 				if (end > max_addr) {
@@ -476,18 +473,22 @@ out:

 	/*
 	 * We need to vacate all active ranges that may have been registered by
-	 * SRAT.
+	 * SRAT and set acpi_numa to -1 so that srat_disabled() always returns
+	 * true.  NUMA emulation has succeeded so we will not scan ACPI nodes.
 	 */
 	remove_all_active_ranges();
+#ifdef CONFIG_ACPI_NUMA
+	acpi_numa = -1;
+#endif
 	for_each_node_mask(i, node_possible_map) {
 		e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT,
 						nodes[i].end >> PAGE_SHIFT);
 		setup_node_bootmem(i, nodes[i].start, nodes[i].end);
 	}
+	acpi_fake_nodes(nodes, num_nodes);
 	numa_init_array();
 	return 0;
 }
-#undef E820_ADDR_HOLE_SIZE
 #endif /* CONFIG_NUMA_EMU */

 void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
diff --git a/arch/x86_64/mm/pageattr.c b/arch/x86_64/mm/pageattr.c
index 9148f4a4cec6..10b9809ce821 100644
--- a/arch/x86_64/mm/pageattr.c
+++ b/arch/x86_64/mm/pageattr.c
@@ -13,7 +13,7 @@
 #include <asm/tlbflush.h>
 #include <asm/io.h>

-static inline pte_t *lookup_address(unsigned long address)
+pte_t *lookup_address(unsigned long address)
 {
 	pgd_t *pgd = pgd_offset_k(address);
 	pud_t *pud;
@@ -74,14 +74,13 @@ static void flush_kernel_map(void *arg)
 	struct page *pg;

 	/* When clflush is available always use it because it is
-	   much cheaper than WBINVD. Disable clflush for now because
-	   the high level code is not ready yet */
+	   much cheaper than WBINVD. */
+	/* clflush is still broken. Disable for now. */
 	if (1 || !cpu_has_clflush)
 		asm volatile("wbinvd" ::: "memory");
 	else list_for_each_entry(pg, l, lru) {
 		void *adr = page_address(pg);
-		if (cpu_has_clflush)
-			cache_flush_page(adr);
+		cache_flush_page(adr);
 	}
 	__flush_tlb_all();
 }
@@ -95,7 +94,8 @@ static LIST_HEAD(deferred_pages); /* protected by init_mm.mmap_sem */

 static inline void save_page(struct page *fpage)
 {
-	list_add(&fpage->lru, &deferred_pages);
+	if (!test_and_set_bit(PG_arch_1, &fpage->flags))
+		list_add(&fpage->lru, &deferred_pages);
 }

 /*
@@ -129,9 +129,12 @@ __change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot,
 	pte_t *kpte;
 	struct page *kpte_page;
 	pgprot_t ref_prot2;
+
 	kpte = lookup_address(address);
 	if (!kpte) return 0;
 	kpte_page = virt_to_page(((unsigned long)kpte) & PAGE_MASK);
+	BUG_ON(PageLRU(kpte_page));
+	BUG_ON(PageCompound(kpte_page));
 	if (pgprot_val(prot) != pgprot_val(ref_prot)) {
 		if (!pte_huge(*kpte)) {
 			set_pte(kpte, pfn_pte(pfn, prot));
@@ -159,10 +162,9 @@ __change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot,
 	/* on x86-64 the direct mapping set at boot is not using 4k pages */
 	BUG_ON(PageReserved(kpte_page));

-	if (page_private(kpte_page) == 0) {
-		save_page(kpte_page);
+	save_page(kpte_page);
+	if (page_private(kpte_page) == 0)
 		revert_page(address, ref_prot);
-	}
 	return 0;
 }

@@ -234,6 +236,10 @@ void global_flush_tlb(void)
 	flush_map(&l);

 	list_for_each_entry_safe(pg, next, &l, lru) {
+		list_del(&pg->lru);
+		clear_bit(PG_arch_1, &pg->flags);
+		if (page_private(pg) != 0)
+			continue;
 		ClearPagePrivate(pg);
 		__free_page(pg);
 	}
diff --git a/arch/x86_64/mm/srat.c b/arch/x86_64/mm/srat.c
index 1e76bb0a7277..acdf03e19146 100644
--- a/arch/x86_64/mm/srat.c
+++ b/arch/x86_64/mm/srat.c
@@ -106,9 +106,9 @@ static __init int slit_valid(struct acpi_table_slit *slit)
 		for (j = 0; j < d; j++) {
 			u8 val = slit->entry[d*i + j];
 			if (i == j) {
-				if (val != 10)
+				if (val != LOCAL_DISTANCE)
 					return 0;
-			} else if (val <= 10)
+			} else if (val <= LOCAL_DISTANCE)
 				return 0;
 		}
 	}
@@ -350,7 +350,7 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)

 /* Sanity check to catch more bad SRATs (they are amazingly common).
    Make sure the PXMs cover all memory. */
-static int nodes_cover_memory(void)
+static int __init nodes_cover_memory(const struct bootnode *nodes)
 {
 	int i;
 	unsigned long pxmram, e820ram;
@@ -394,6 +394,9 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
 {
 	int i;

+	if (acpi_numa <= 0)
+		return -1;
+
 	/* First clean up the node list */
 	for (i = 0; i < MAX_NUMNODES; i++) {
 		cutoff_node(i, start, end);
@@ -403,10 +406,7 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
 		}
 	}

-	if (acpi_numa <= 0)
-		return -1;
-
-	if (!nodes_cover_memory()) {
+	if (!nodes_cover_memory(nodes)) {
 		bad_srat();
 		return -1;
 	}
@@ -440,6 +440,86 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
 	return 0;
 }

+#ifdef CONFIG_NUMA_EMU
+static int __init find_node_by_addr(unsigned long addr)
+{
+	int ret = NUMA_NO_NODE;
+	int i;
+
+	for_each_node_mask(i, nodes_parsed) {
+		/*
+		 * Find the real node that this emulated node appears on.  For
+		 * the sake of simplicity, we only use a real node's starting
+		 * address to determine which emulated node it appears on.
+		 */
+		if (addr >= nodes[i].start && addr < nodes[i].end) {
+			ret = i;
+			break;
+		}
+	}
+	return i;
+}
+
+/*
+ * In NUMA emulation, we need to setup proximity domain (_PXM) to node ID
+ * mappings that respect the real ACPI topology but reflect our emulated
+ * environment.  For each emulated node, we find which real node it appears on
+ * and create PXM to NID mappings for those fake nodes which mirror that
+ * locality.  SLIT will now represent the correct distances between emulated
+ * nodes as a result of the real topology.
+ */
+void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes)
+{
+	int i, j;
+	int fake_node_to_pxm_map[MAX_NUMNODES] = {
+		[0 ... MAX_NUMNODES-1] = PXM_INVAL
+	};
+	unsigned char fake_apicid_to_node[MAX_LOCAL_APIC] = {
+		[0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
+	};
+
+	printk(KERN_INFO "Faking PXM affinity for fake nodes on real "
+			 "topology.\n");
+	for (i = 0; i < num_nodes; i++) {
+		int nid, pxm;
+
+		nid = find_node_by_addr(fake_nodes[i].start);
+		if (nid == NUMA_NO_NODE)
+			continue;
+		pxm = node_to_pxm(nid);
+		if (pxm == PXM_INVAL)
+			continue;
+		fake_node_to_pxm_map[i] = pxm;
+		/*
+		 * For each apicid_to_node mapping that exists for this real
+		 * node, it must now point to the fake node ID.
+		 */
+		for (j = 0; j < MAX_LOCAL_APIC; j++)
+			if (apicid_to_node[j] == nid)
+				fake_apicid_to_node[j] = i;
+	}
+	for (i = 0; i < num_nodes; i++)
+		__acpi_map_pxm_to_node(fake_node_to_pxm_map[i], i);
+	memcpy(apicid_to_node, fake_apicid_to_node, sizeof(apicid_to_node));
+
+	nodes_clear(nodes_parsed);
+	for (i = 0; i < num_nodes; i++)
+		if (fake_nodes[i].start != fake_nodes[i].end)
+			node_set(i, nodes_parsed);
+	WARN_ON(!nodes_cover_memory(fake_nodes));
+}
+
+static int null_slit_node_compare(int a, int b)
+{
+	return node_to_pxm(a) == node_to_pxm(b);
+}
+#else
+static int null_slit_node_compare(int a, int b)
+{
+	return a == b;
+}
+#endif /* CONFIG_NUMA_EMU */
+
 void __init srat_reserve_add_area(int nodeid)
 {
 	if (found_add_area && nodes_add[nodeid].end) {
@@ -464,7 +544,8 @@ int __node_distance(int a, int b)
 	int index;

 	if (!acpi_slit)
-		return a == b ? 10 : 20;
+		return null_slit_node_compare(a, b) ? LOCAL_DISTANCE :
+						      REMOTE_DISTANCE;
 	index = acpi_slit->locality_count * node_to_pxm(a);
 	return acpi_slit->entry[index + node_to_pxm(b)];
 }