diff options
author | Alex Gonzalez <alex.gonzalez@digi.com> | 2012-01-19 13:54:23 +0100 |
---|---|---|
committer | Alex Gonzalez <alex.gonzalez@digi.com> | 2012-01-19 13:54:23 +0100 |
commit | 802699c91a967767fc94759f7a3e5e82d8269245 (patch) | |
tree | c8b714dd25edd333efbbf8bb1eb6c3d379084cc4 /arch/x86 | |
parent | f135e68daa6745fd3dbb285e6161ae2758c4027f (diff) | |
parent | 675f7660ffb0e1880011f6b3c4f9ac241491e3cd (diff) |
Merge commit 'v2.6.35.14' into del-5.8/main
Conflicts:
arch/arm/plat-mxc/include/mach/gpio.h
arch/x86/kernel/cpu/mtrr/main.c
drivers/mmc/core/core.c
drivers/net/smsc911x.c
fs/proc/task_mmu.c
include/linux/pm_runtime.h
mm/memory.c
mm/mlock.c
Signed-off-by: Alex Gonzalez <alex.gonzalez@digi.com>
Diffstat (limited to 'arch/x86')
97 files changed, 1268 insertions, 651 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index dcb0593b4a66..d2b82109a3fa 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -247,6 +247,11 @@ config ARCH_HWEIGHT_CFLAGS config KTIME_SCALAR def_bool X86_32 + +config ARCH_CPU_PROBE_RELEASE + def_bool y + depends on HOTPLUG_CPU + source "init/Kconfig" source "kernel/Kconfig.freezer" @@ -792,6 +797,17 @@ config SCHED_MC making when dealing with multi-core CPU chips at a cost of slightly increased overhead in some places. If unsure say N here. +config IRQ_TIME_ACCOUNTING + bool "Fine granularity task level IRQ time accounting" + default n + ---help--- + Select this option to enable fine granularity task irq time + accounting. This is done by reading a timestamp on each + transitions between softirq and hardirq state, so there can be a + small performance impact. + + If in doubt, say N here. + source "kernel/Kconfig.preempt" config X86_UP_APIC diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index e790bc1fbfa3..4f5f71e03ce1 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -50,7 +50,12 @@ /* * Reload arg registers from stack in case ptrace changed them. * We don't reload %eax because syscall_trace_enter() returned - * the value it wants us to use in the table lookup. + * the %rax value we should see. Instead, we just truncate that + * value to 32 bits again as we did on entry from user mode. + * If it's a new value set by user_regset during entry tracing, + * this matches the normal truncation of the user-mode value. + * If it's -1 to make us punt the syscall, then (u32)-1 is still + * an appropriately invalid value. */ .macro LOAD_ARGS32 offset, _r9=0 .if \_r9 @@ -60,6 +65,7 @@ movl \offset+48(%rsp),%edx movl \offset+56(%rsp),%esi movl \offset+64(%rsp),%edi + movl %eax,%eax /* zero extension */ .endm .macro CFI_STARTPROC32 simple @@ -153,7 +159,7 @@ ENTRY(ia32_sysenter_target) testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10) CFI_REMEMBER_STATE jnz sysenter_tracesys - cmpl $(IA32_NR_syscalls-1),%eax + cmpq $(IA32_NR_syscalls-1),%rax ja ia32_badsys sysenter_do_call: IA32_ARG_FIXUP @@ -195,7 +201,7 @@ sysexit_from_sys_call: movl $AUDIT_ARCH_I386,%edi /* 1st arg: audit arch */ call audit_syscall_entry movl RAX-ARGOFFSET(%rsp),%eax /* reload syscall number */ - cmpl $(IA32_NR_syscalls-1),%eax + cmpq $(IA32_NR_syscalls-1),%rax ja ia32_badsys movl %ebx,%edi /* reload 1st syscall arg */ movl RCX-ARGOFFSET(%rsp),%esi /* reload 2nd syscall arg */ @@ -248,7 +254,7 @@ sysenter_tracesys: call syscall_trace_enter LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */ RESTORE_REST - cmpl $(IA32_NR_syscalls-1),%eax + cmpq $(IA32_NR_syscalls-1),%rax ja int_ret_from_sys_call /* sysenter_tracesys has set RAX(%rsp) */ jmp sysenter_do_call CFI_ENDPROC @@ -314,7 +320,7 @@ ENTRY(ia32_cstar_target) testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10) CFI_REMEMBER_STATE jnz cstar_tracesys - cmpl $IA32_NR_syscalls-1,%eax + cmpq $IA32_NR_syscalls-1,%rax ja ia32_badsys cstar_do_call: IA32_ARG_FIXUP 1 @@ -367,7 +373,7 @@ cstar_tracesys: LOAD_ARGS32 ARGOFFSET, 1 /* reload args from stack in case ptrace changed it */ RESTORE_REST xchgl %ebp,%r9d - cmpl $(IA32_NR_syscalls-1),%eax + cmpq $(IA32_NR_syscalls-1),%rax ja int_ret_from_sys_call /* cstar_tracesys has set RAX(%rsp) */ jmp cstar_do_call END(ia32_cstar_target) @@ -425,7 +431,7 @@ ENTRY(ia32_syscall) orl $TS_COMPAT,TI_status(%r10) testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10) jnz ia32_tracesys - cmpl $(IA32_NR_syscalls-1),%eax + cmpq $(IA32_NR_syscalls-1),%rax ja ia32_badsys ia32_do_call: IA32_ARG_FIXUP @@ -444,7 +450,7 @@ ia32_tracesys: call syscall_trace_enter LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */ RESTORE_REST - cmpl $(IA32_NR_syscalls-1),%eax + cmpq $(IA32_NR_syscalls-1),%rax ja int_ret_from_sys_call /* ia32_tracesys has set RAX(%rsp) */ jmp ia32_do_call END(ia32_syscall) diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h index aa2c39d968fc..27d837979e80 100644 --- a/arch/x86/include/asm/acpi.h +++ b/arch/x86/include/asm/acpi.h @@ -88,6 +88,7 @@ extern int acpi_disabled; extern int acpi_pci_disabled; extern int acpi_skip_timer_override; extern int acpi_use_timer_override; +extern int acpi_fix_pin2_polarity; extern u8 acpi_sci_flags; extern int acpi_sci_override_gsi; diff --git a/arch/x86/include/asm/amd_iommu_proto.h b/arch/x86/include/asm/amd_iommu_proto.h index d2544f1d705d..cb030374b90a 100644 --- a/arch/x86/include/asm/amd_iommu_proto.h +++ b/arch/x86/include/asm/amd_iommu_proto.h @@ -38,4 +38,10 @@ static inline void amd_iommu_stats_init(void) { } #endif /* !CONFIG_AMD_IOMMU_STATS */ +static inline bool is_rd890_iommu(struct pci_dev *pdev) +{ + return (pdev->vendor == PCI_VENDOR_ID_ATI) && + (pdev->device == PCI_DEVICE_ID_RD890_IOMMU); +} + #endif /* _ASM_X86_AMD_IOMMU_PROTO_H */ diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h index 7014e88bc779..08616180deaf 100644 --- a/arch/x86/include/asm/amd_iommu_types.h +++ b/arch/x86/include/asm/amd_iommu_types.h @@ -368,6 +368,9 @@ struct amd_iommu { /* capabilities of that IOMMU read from ACPI */ u32 cap; + /* flags read from acpi table */ + u8 acpi_flags; + /* * Capability pointer. There could be more than one IOMMU per PCI * device function if there are more than one AMD IOMMU capability @@ -411,6 +414,15 @@ struct amd_iommu { /* default dma_ops domain for that IOMMU */ struct dma_ops_domain *default_dom; + + /* + * This array is required to work around a potential BIOS bug. + * The BIOS may miss to restore parts of the PCI configuration + * space when the system resumes from S3. The result is that the + * IOMMU does not execute commands anymore which leads to system + * failure. + */ + u32 cache_cfg[4]; }; /* diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h index 7fe3b3060f08..49d7c96791dc 100644 --- a/arch/x86/include/asm/apicdef.h +++ b/arch/x86/include/asm/apicdef.h @@ -78,6 +78,7 @@ #define APIC_DEST_LOGICAL 0x00800 #define APIC_DEST_PHYSICAL 0x00000 #define APIC_DM_FIXED 0x00000 +#define APIC_DM_FIXED_MASK 0x00700 #define APIC_DM_LOWEST 0x00100 #define APIC_DM_SMI 0x00200 #define APIC_DM_REMRD 0x00300 diff --git a/arch/x86/include/asm/cmpxchg_32.h b/arch/x86/include/asm/cmpxchg_32.h index 8859e12dd3cf..20955ea7bc12 100644 --- a/arch/x86/include/asm/cmpxchg_32.h +++ b/arch/x86/include/asm/cmpxchg_32.h @@ -27,20 +27,20 @@ struct __xchg_dummy { switch (size) { \ case 1: \ asm volatile("xchgb %b0,%1" \ - : "=q" (__x) \ - : "m" (*__xg(ptr)), "0" (__x) \ + : "=q" (__x), "+m" (*__xg(ptr)) \ + : "0" (__x) \ : "memory"); \ break; \ case 2: \ asm volatile("xchgw %w0,%1" \ - : "=r" (__x) \ - : "m" (*__xg(ptr)), "0" (__x) \ + : "=r" (__x), "+m" (*__xg(ptr)) \ + : "0" (__x) \ : "memory"); \ break; \ case 4: \ asm volatile("xchgl %0,%1" \ - : "=r" (__x) \ - : "m" (*__xg(ptr)), "0" (__x) \ + : "=r" (__x), "+m" (*__xg(ptr)) \ + : "0" (__x) \ : "memory"); \ break; \ default: \ @@ -53,60 +53,33 @@ struct __xchg_dummy { __xchg((v), (ptr), sizeof(*ptr)) /* - * The semantics of XCHGCMP8B are a bit strange, this is why - * there is a loop and the loading of %%eax and %%edx has to - * be inside. This inlines well in most cases, the cached - * cost is around ~38 cycles. (in the future we might want - * to do an SIMD/3DNOW!/MMX/FPU 64-bit store here, but that - * might have an implicit FPU-save as a cost, so it's not - * clear which path to go.) + * CMPXCHG8B only writes to the target if we had the previous + * value in registers, otherwise it acts as a read and gives us the + * "new previous" value. That is why there is a loop. Preloading + * EDX:EAX is a performance optimization: in the common case it means + * we need only one locked operation. * - * cmpxchg8b must be used with the lock prefix here to allow - * the instruction to be executed atomically, see page 3-102 - * of the instruction set reference 24319102.pdf. We need - * the reader side to see the coherent 64bit value. + * A SIMD/3DNOW!/MMX/FPU 64-bit store here would require at the very + * least an FPU save and/or %cr0.ts manipulation. + * + * cmpxchg8b must be used with the lock prefix here to allow the + * instruction to be executed atomically. We need to have the reader + * side to see the coherent 64bit value. */ -static inline void __set_64bit(unsigned long long *ptr, - unsigned int low, unsigned int high) +static inline void set_64bit(volatile u64 *ptr, u64 value) { + u32 low = value; + u32 high = value >> 32; + u64 prev = *ptr; + asm volatile("\n1:\t" - "movl (%0), %%eax\n\t" - "movl 4(%0), %%edx\n\t" - LOCK_PREFIX "cmpxchg8b (%0)\n\t" + LOCK_PREFIX "cmpxchg8b %0\n\t" "jnz 1b" - : /* no outputs */ - : "D"(ptr), - "b"(low), - "c"(high) - : "ax", "dx", "memory"); -} - -static inline void __set_64bit_constant(unsigned long long *ptr, - unsigned long long value) -{ - __set_64bit(ptr, (unsigned int)value, (unsigned int)(value >> 32)); -} - -#define ll_low(x) *(((unsigned int *)&(x)) + 0) -#define ll_high(x) *(((unsigned int *)&(x)) + 1) - -static inline void __set_64bit_var(unsigned long long *ptr, - unsigned long long value) -{ - __set_64bit(ptr, ll_low(value), ll_high(value)); + : "=m" (*ptr), "+A" (prev) + : "b" (low), "c" (high) + : "memory"); } -#define set_64bit(ptr, value) \ - (__builtin_constant_p((value)) \ - ? __set_64bit_constant((ptr), (value)) \ - : __set_64bit_var((ptr), (value))) - -#define _set_64bit(ptr, value) \ - (__builtin_constant_p(value) \ - ? __set_64bit(ptr, (unsigned int)(value), \ - (unsigned int)((value) >> 32)) \ - : __set_64bit(ptr, ll_low((value)), ll_high((value)))) - extern void __cmpxchg_wrong_size(void); /* @@ -121,21 +94,21 @@ extern void __cmpxchg_wrong_size(void); __typeof__(*(ptr)) __new = (new); \ switch (size) { \ case 1: \ - asm volatile(lock "cmpxchgb %b1,%2" \ - : "=a"(__ret) \ - : "q"(__new), "m"(*__xg(ptr)), "0"(__old) \ + asm volatile(lock "cmpxchgb %b2,%1" \ + : "=a" (__ret), "+m" (*__xg(ptr)) \ + : "q" (__new), "0" (__old) \ : "memory"); \ break; \ case 2: \ - asm volatile(lock "cmpxchgw %w1,%2" \ - : "=a"(__ret) \ - : "r"(__new), "m"(*__xg(ptr)), "0"(__old) \ + asm volatile(lock "cmpxchgw %w2,%1" \ + : "=a" (__ret), "+m" (*__xg(ptr)) \ + : "r" (__new), "0" (__old) \ : "memory"); \ break; \ case 4: \ - asm volatile(lock "cmpxchgl %1,%2" \ - : "=a"(__ret) \ - : "r"(__new), "m"(*__xg(ptr)), "0"(__old) \ + asm volatile(lock "cmpxchgl %2,%1" \ + : "=a" (__ret), "+m" (*__xg(ptr)) \ + : "r" (__new), "0" (__old) \ : "memory"); \ break; \ default: \ @@ -180,12 +153,12 @@ static inline unsigned long long __cmpxchg64(volatile void *ptr, unsigned long long new) { unsigned long long prev; - asm volatile(LOCK_PREFIX "cmpxchg8b %3" - : "=A"(prev) - : "b"((unsigned long)new), - "c"((unsigned long)(new >> 32)), - "m"(*__xg(ptr)), - "0"(old) + asm volatile(LOCK_PREFIX "cmpxchg8b %1" + : "=A" (prev), + "+m" (*__xg(ptr)) + : "b" ((unsigned long)new), + "c" ((unsigned long)(new >> 32)), + "0" (old) : "memory"); return prev; } @@ -195,12 +168,12 @@ static inline unsigned long long __cmpxchg64_local(volatile void *ptr, unsigned long long new) { unsigned long long prev; - asm volatile("cmpxchg8b %3" - : "=A"(prev) - : "b"((unsigned long)new), - "c"((unsigned long)(new >> 32)), - "m"(*__xg(ptr)), - "0"(old) + asm volatile("cmpxchg8b %1" + : "=A" (prev), + "+m" (*__xg(ptr)) + : "b" ((unsigned long)new), + "c" ((unsigned long)(new >> 32)), + "0" (old) : "memory"); return prev; } diff --git a/arch/x86/include/asm/cmpxchg_64.h b/arch/x86/include/asm/cmpxchg_64.h index 485ae415faec..9596e7c61960 100644 --- a/arch/x86/include/asm/cmpxchg_64.h +++ b/arch/x86/include/asm/cmpxchg_64.h @@ -5,13 +5,11 @@ #define __xg(x) ((volatile long *)(x)) -static inline void set_64bit(volatile unsigned long *ptr, unsigned long val) +static inline void set_64bit(volatile u64 *ptr, u64 val) { *ptr = val; } -#define _set_64bit set_64bit - extern void __xchg_wrong_size(void); extern void __cmpxchg_wrong_size(void); @@ -26,26 +24,26 @@ extern void __cmpxchg_wrong_size(void); switch (size) { \ case 1: \ asm volatile("xchgb %b0,%1" \ - : "=q" (__x) \ - : "m" (*__xg(ptr)), "0" (__x) \ + : "=q" (__x), "+m" (*__xg(ptr)) \ + : "0" (__x) \ : "memory"); \ break; \ case 2: \ asm volatile("xchgw %w0,%1" \ - : "=r" (__x) \ - : "m" (*__xg(ptr)), "0" (__x) \ + : "=r" (__x), "+m" (*__xg(ptr)) \ + : "0" (__x) \ : "memory"); \ break; \ case 4: \ asm volatile("xchgl %k0,%1" \ - : "=r" (__x) \ - : "m" (*__xg(ptr)), "0" (__x) \ + : "=r" (__x), "+m" (*__xg(ptr)) \ + : "0" (__x) \ : "memory"); \ break; \ case 8: \ asm volatile("xchgq %0,%1" \ - : "=r" (__x) \ - : "m" (*__xg(ptr)), "0" (__x) \ + : "=r" (__x), "+m" (*__xg(ptr)) \ + : "0" (__x) \ : "memory"); \ break; \ default: \ @@ -71,27 +69,27 @@ extern void __cmpxchg_wrong_size(void); __typeof__(*(ptr)) __new = (new); \ switch (size) { \ case 1: \ - asm volatile(lock "cmpxchgb %b1,%2" \ - : "=a"(__ret) \ - : "q"(__new), "m"(*__xg(ptr)), "0"(__old) \ + asm volatile(lock "cmpxchgb %b2,%1" \ + : "=a" (__ret), "+m" (*__xg(ptr)) \ + : "q" (__new), "0" (__old) \ : "memory"); \ break; \ case 2: \ - asm volatile(lock "cmpxchgw %w1,%2" \ - : "=a"(__ret) \ - : "r"(__new), "m"(*__xg(ptr)), "0"(__old) \ + asm volatile(lock "cmpxchgw %w2,%1" \ + : "=a" (__ret), "+m" (*__xg(ptr)) \ + : "r" (__new), "0" (__old) \ : "memory"); \ break; \ case 4: \ - asm volatile(lock "cmpxchgl %k1,%2" \ - : "=a"(__ret) \ - : "r"(__new), "m"(*__xg(ptr)), "0"(__old) \ + asm volatile(lock "cmpxchgl %k2,%1" \ + : "=a" (__ret), "+m" (*__xg(ptr)) \ + : "r" (__new), "0" (__old) \ : "memory"); \ break; \ case 8: \ - asm volatile(lock "cmpxchgq %1,%2" \ - : "=a"(__ret) \ - : "r"(__new), "m"(*__xg(ptr)), "0"(__old) \ + asm volatile(lock "cmpxchgq %2,%1" \ + : "=a" (__ret), "+m" (*__xg(ptr)) \ + : "r" (__new), "0" (__old) \ : "memory"); \ break; \ default: \ diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h index 306160e58b48..1d9cd27c2920 100644 --- a/arch/x86/include/asm/compat.h +++ b/arch/x86/include/asm/compat.h @@ -205,7 +205,7 @@ static inline compat_uptr_t ptr_to_compat(void __user *uptr) return (u32)(unsigned long)uptr; } -static inline void __user *compat_alloc_user_space(long len) +static inline void __user *arch_compat_alloc_user_space(long len) { struct pt_regs *regs = task_pt_regs(current); return (void __user *)regs->sp - len; diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 468145914389..429e4f485746 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -150,7 +150,7 @@ #define X86_FEATURE_3DNOWPREFETCH (6*32+ 8) /* 3DNow prefetch instructions */ #define X86_FEATURE_OSVW (6*32+ 9) /* OS Visible Workaround */ #define X86_FEATURE_IBS (6*32+10) /* Instruction Based Sampling */ -#define X86_FEATURE_SSE5 (6*32+11) /* SSE-5 */ +#define X86_FEATURE_XOP (6*32+11) /* extended AVX instructions */ #define X86_FEATURE_SKINIT (6*32+12) /* SKINIT/STGI instructions */ #define X86_FEATURE_WDT (6*32+13) /* Watchdog timer */ #define X86_FEATURE_NODEID_MSR (6*32+19) /* NodeId MSR */ @@ -162,6 +162,7 @@ #define X86_FEATURE_IDA (7*32+ 0) /* Intel Dynamic Acceleration */ #define X86_FEATURE_ARAT (7*32+ 1) /* Always Running APIC Timer */ #define X86_FEATURE_CPB (7*32+ 2) /* AMD Core Performance Boost */ +#define X86_FEATURE_EPB (7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */ /* Virtualization flags: Linux defined */ #define X86_FEATURE_TPR_SHADOW (8*32+ 0) /* Intel TPR Shadow */ diff --git a/arch/x86/include/asm/hpet.h b/arch/x86/include/asm/hpet.h index 004e6e25e913..1d5c08a1bdfd 100644 --- a/arch/x86/include/asm/hpet.h +++ b/arch/x86/include/asm/hpet.h @@ -68,7 +68,6 @@ extern unsigned long force_hpet_address; extern u8 hpet_blockid; extern int hpet_force_user; extern u8 hpet_msi_disable; -extern u8 hpet_readback_cmp; extern int is_hpet_enabled(void); extern int hpet_enable(void); extern void hpet_disable(void); diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index 30a3e9776123..6a45ec41ec26 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -206,6 +206,7 @@ static inline void __iomem *ioremap(resource_size_t offset, unsigned long size) extern void iounmap(volatile void __iomem *addr); +extern void set_iounmap_nonlazy(void); #ifdef __KERNEL__ diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 0b2729bf2070..50de7b4b2db1 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -143,7 +143,15 @@ struct x86_emulate_ops { struct operand { enum { OP_REG, OP_MEM, OP_IMM, OP_NONE } type; unsigned int bytes; - unsigned long val, orig_val, *ptr; + union { + unsigned long orig_val; + u64 orig_val64; + }; + unsigned long *ptr; + union { + unsigned long val; + u64 val64; + }; }; struct fetch_cache { diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 76f5483cffec..535c3738ef6b 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -77,7 +77,7 @@ #define KVM_NUM_MMU_PAGES (1 << KVM_MMU_HASH_SHIFT) #define KVM_MIN_FREE_MMU_PAGES 5 #define KVM_REFILL_PAGES 25 -#define KVM_MAX_CPUID_ENTRIES 40 +#define KVM_MAX_CPUID_ENTRIES 80 #define KVM_NR_FIXED_MTRR_REGION 88 #define KVM_NR_VAR_MTRR 8 @@ -673,20 +673,6 @@ static inline struct kvm_mmu_page *page_header(hpa_t shadow_page) return (struct kvm_mmu_page *)page_private(page); } -static inline u16 kvm_read_fs(void) -{ - u16 seg; - asm("mov %%fs, %0" : "=g"(seg)); - return seg; -} - -static inline u16 kvm_read_gs(void) -{ - u16 seg; - asm("mov %%gs, %0" : "=g"(seg)); - return seg; -} - static inline u16 kvm_read_ldt(void) { u16 ldt; @@ -694,16 +680,6 @@ static inline u16 kvm_read_ldt(void) return ldt; } -static inline void kvm_load_fs(u16 sel) -{ - asm("mov %0, %%fs" : : "rm"(sel)); -} - -static inline void kvm_load_gs(u16 sel) -{ - asm("mov %0, %%gs" : : "rm"(sel)); -} - static inline void kvm_load_ldt(u16 sel) { asm("lldt %0" : : "rm"(sel)); diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index 4a2d4e0c18d9..8b5393ec1080 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -36,8 +36,6 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, unsigned cpu = smp_processor_id(); if (likely(prev != next)) { - /* stop flush ipis for the previous mm */ - cpumask_clear_cpu(cpu, mm_cpumask(prev)); #ifdef CONFIG_SMP percpu_write(cpu_tlbstate.state, TLBSTATE_OK); percpu_write(cpu_tlbstate.active_mm, next); @@ -47,6 +45,9 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, /* Re-load page tables */ load_cr3(next->pgd); + /* stop flush ipis for the previous mm */ + cpumask_clear_cpu(cpu, mm_cpumask(prev)); + /* * load the LDT, if the LDT is different: */ diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 8c7ae4318629..692c73bdc3fe 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -85,11 +85,15 @@ #define MSR_IA32_MC0_ADDR 0x00000402 #define MSR_IA32_MC0_MISC 0x00000403 +#define MSR_AMD64_MC0_MASK 0xc0010044 + #define MSR_IA32_MCx_CTL(x) (MSR_IA32_MC0_CTL + 4*(x)) #define MSR_IA32_MCx_STATUS(x) (MSR_IA32_MC0_STATUS + 4*(x)) #define MSR_IA32_MCx_ADDR(x) (MSR_IA32_MC0_ADDR + 4*(x)) #define MSR_IA32_MCx_MISC(x) (MSR_IA32_MC0_MISC + 4*(x)) +#define MSR_AMD64_MCx_MASK(x) (MSR_AMD64_MC0_MASK + (x)) + /* These are consecutive and not in the normal 4er MCE bank block */ #define MSR_IA32_MC0_CTL2 0x00000280 #define MSR_IA32_MCx_CTL2(x) (MSR_IA32_MC0_CTL2 + (x)) @@ -227,6 +231,11 @@ #define THERM_INT_LOW_ENABLE (1 << 0) #define THERM_INT_HIGH_ENABLE (1 << 1) +#define MSR_IA32_ENERGY_PERF_BIAS 0x000001b0 +#define ENERGY_PERF_BIAS_PERFORMANCE 0 +#define ENERGY_PERF_BIAS_NORMAL 6 +#define ENERGY_PERF_BIAS_POWERSWAVE 15 + #define MSR_IA32_THERM_STATUS 0x0000019c #define THERM_STATUS_PROCHOT (1 << 0) @@ -239,6 +248,8 @@ #define MSR_IA32_TEMPERATURE_TARGET 0x000001a2 +#define MSR_IA32_ENERGY_PERF_BIAS 0x000001b0 + /* MISC_ENABLE bits: architectural */ #define MSR_IA32_MISC_ENABLE_FAST_STRING (1ULL << 0) #define MSR_IA32_MISC_ENABLE_TCC (1ULL << 1) diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h index 177b0165ea01..33927d283c92 100644 --- a/arch/x86/include/asm/pgtable-3level.h +++ b/arch/x86/include/asm/pgtable-3level.h @@ -69,8 +69,6 @@ static inline void native_pmd_clear(pmd_t *pmd) static inline void pud_clear(pud_t *pudp) { - unsigned long pgd; - set_pud(pudp, __pud(0)); /* @@ -79,13 +77,10 @@ static inline void pud_clear(pud_t *pudp) * section 8.1: in PAE mode we explicitly have to flush the * TLB via cr3 if the top-level pgd is changed... * - * Make sure the pud entry we're updating is within the - * current pgd to avoid unnecessary TLB flushes. + * Currently all places where pud_clear() is called either have + * flush_tlb_mm() followed or don't need TLB flush (x86_64 code or + * pud_clear_bad()), so we don't need TLB flush here. */ - pgd = read_cr3(); - if (__pa(pudp) >= pgd && __pa(pudp) < - (pgd + sizeof(pgd_t)*PTRS_PER_PGD)) - write_cr3(pgd); } #ifdef CONFIG_SMP diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h index 2984a25ff383..f686f49e8b7b 100644 --- a/arch/x86/include/asm/pgtable_32.h +++ b/arch/x86/include/asm/pgtable_32.h @@ -26,6 +26,7 @@ struct mm_struct; struct vm_area_struct; extern pgd_t swapper_pg_dir[1024]; +extern pgd_t trampoline_pg_dir[1024]; static inline void pgtable_cache_init(void) { } static inline void check_pgt_cache(void) { } diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 7e5c6a60b8ee..2fe362c72eb0 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -763,29 +763,6 @@ extern unsigned long boot_option_idle_override; extern unsigned long idle_halt; extern unsigned long idle_nomwait; -/* - * on systems with caches, caches must be flashed as the absolute - * last instruction before going into a suspended halt. Otherwise, - * dirty data can linger in the cache and become stale on resume, - * leading to strange errors. - * - * perform a variety of operations to guarantee that the compiler - * will not reorder instructions. wbinvd itself is serializing - * so the processor will not reorder. - * - * Systems without cache can just go into halt. - */ -static inline void wbinvd_halt(void) -{ - mb(); - /* check for clflush to determine if wbinvd is legal */ - if (cpu_has_clflush) - asm volatile("cli; wbinvd; 1: hlt; jmp 1b" : : : "memory"); - else - while (1) - halt(); -} - extern void enable_sep_cpu(void); extern int sysenter_setup(void); @@ -1025,4 +1002,23 @@ unsigned long calc_aperfmperf_ratio(struct aperfmperf *old, return ratio; } +/* + * AMD errata checking + */ +#ifdef CONFIG_CPU_SUP_AMD +extern const int amd_erratum_400[]; +extern bool cpu_has_amd_erratum(const int *); + +#define AMD_LEGACY_ERRATUM(...) { -1, __VA_ARGS__, 0 } +#define AMD_OSVW_ERRATUM(osvw_id, ...) { osvw_id, __VA_ARGS__, 0 } +#define AMD_MODEL_RANGE(f, m_start, s_start, m_end, s_end) \ + ((f << 24) | (m_start << 16) | (s_start << 12) | (m_end << 4) | (s_end)) +#define AMD_MODEL_RANGE_FAMILY(range) (((range) >> 24) & 0xff) +#define AMD_MODEL_RANGE_START(range) (((range) >> 12) & 0xfff) +#define AMD_MODEL_RANGE_END(range) ((range) & 0xfff) + +#else +#define cpu_has_amd_erratum(x) (false) +#endif /* CONFIG_CPU_SUP_AMD */ + #endif /* _ASM_X86_PROCESSOR_H */ diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h index cd02f324aa6b..6226870d11cd 100644 --- a/arch/x86/include/asm/pvclock.h +++ b/arch/x86/include/asm/pvclock.h @@ -11,5 +11,6 @@ unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src); void pvclock_read_wallclock(struct pvclock_wall_clock *wall, struct pvclock_vcpu_time_info *vcpu, struct timespec *ts); +void pvclock_resume(void); #endif /* _ASM_X86_PVCLOCK_H */ diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index 4cfc90824068..4c2f63c7fc1b 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -50,7 +50,7 @@ struct smp_ops { void (*smp_prepare_cpus)(unsigned max_cpus); void (*smp_cpus_done)(unsigned max_cpus); - void (*smp_send_stop)(void); + void (*stop_other_cpus)(int wait); void (*smp_send_reschedule)(int cpu); int (*cpu_up)(unsigned cpu); @@ -73,7 +73,12 @@ extern struct smp_ops smp_ops; static inline void smp_send_stop(void) { - smp_ops.smp_send_stop(); + smp_ops.stop_other_cpus(0); +} + +static inline void stop_other_cpus(void) +{ + smp_ops.stop_other_cpus(1); } static inline void smp_prepare_boot_cpu(void) diff --git a/arch/x86/include/asm/smpboot_hooks.h b/arch/x86/include/asm/smpboot_hooks.h index 1def60114906..cfdc6c88c9d5 100644 --- a/arch/x86/include/asm/smpboot_hooks.h +++ b/arch/x86/include/asm/smpboot_hooks.h @@ -34,7 +34,7 @@ static inline void smpboot_restore_warm_reset_vector(void) */ CMOS_WRITE(0, 0xf); - *((volatile long *)phys_to_virt(apic->trampoline_phys_low)) = 0; + *((volatile u32 *)phys_to_virt(apic->trampoline_phys_low)) = 0; } static inline void __init smpboot_setup_io_apic(void) diff --git a/arch/x86/include/asm/trampoline.h b/arch/x86/include/asm/trampoline.h index cb507bb05d79..4dde797c0578 100644 --- a/arch/x86/include/asm/trampoline.h +++ b/arch/x86/include/asm/trampoline.h @@ -13,14 +13,17 @@ extern unsigned char *trampoline_base; extern unsigned long init_rsp; extern unsigned long initial_code; +extern unsigned long initial_page_table; extern unsigned long initial_gs; #define TRAMPOLINE_SIZE roundup(trampoline_end - trampoline_data, PAGE_SIZE) extern unsigned long setup_trampoline(void); +extern void __init setup_trampoline_page_table(void); extern void __init reserve_trampoline_memory(void); #else -static inline void reserve_trampoline_memory(void) {}; +static inline void setup_trampoline_page_table(void) {} +static inline void reserve_trampoline_memory(void) {} #endif /* CONFIG_X86_TRAMPOLINE */ #endif /* __ASSEMBLY__ */ diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h index c0427295e8f5..1ca132fc0d03 100644 --- a/arch/x86/include/asm/tsc.h +++ b/arch/x86/include/asm/tsc.h @@ -59,5 +59,7 @@ extern void check_tsc_sync_source(int cpu); extern void check_tsc_sync_target(void); extern int notsc_setup(char *); +extern void save_sched_clock_state(void); +extern void restore_sched_clock_state(void); #endif /* _ASM_X86_TSC_H */ diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index abd3e0ea762a..99f0ad753f32 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -42,7 +42,7 @@ * Returns 0 if the range is valid, nonzero otherwise. * * This is equivalent to the following test: - * (u33)addr + (u33)size >= (u33)current->addr_limit.seg (u65 for x86_64) + * (u33)addr + (u33)size > (u33)current->addr_limit.seg (u65 for x86_64) * * This needs 33-bit (65-bit for x86_64) arithmetic. We have a carry... */ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index e77b22083721..0357c514fc7f 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -11,6 +11,8 @@ ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_tsc.o = -pg CFLAGS_REMOVE_rtc.o = -pg CFLAGS_REMOVE_paravirt-spinlocks.o = -pg +CFLAGS_REMOVE_pvclock.o = -pg +CFLAGS_REMOVE_kvmclock.o = -pg CFLAGS_REMOVE_ftrace.o = -pg CFLAGS_REMOVE_early_printk.o = -pg endif diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index c05872aa3ce0..510bc6db1bd3 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -72,6 +72,7 @@ u8 acpi_sci_flags __initdata; int acpi_sci_override_gsi __initdata; int acpi_skip_timer_override __initdata; int acpi_use_timer_override __initdata; +int acpi_fix_pin2_polarity __initdata; #ifdef CONFIG_X86_LOCAL_APIC static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE; @@ -410,10 +411,15 @@ acpi_parse_int_src_ovr(struct acpi_subtable_header * header, return 0; } - if (acpi_skip_timer_override && - intsrc->source_irq == 0 && intsrc->global_irq == 2) { - printk(PREFIX "BIOS IRQ0 pin2 override ignored.\n"); - return 0; + if (intsrc->source_irq == 0 && intsrc->global_irq == 2) { + if (acpi_skip_timer_override) { + printk(PREFIX "BIOS IRQ0 pin2 override ignored.\n"); + return 0; + } + if (acpi_fix_pin2_polarity && (intsrc->inti_flags & ACPI_MADT_POLARITY_MASK)) { + intsrc->inti_flags &= ~ACPI_MADT_POLARITY_MASK; + printk(PREFIX "BIOS IRQ0 pin2 override: forcing polarity to high active.\n"); + } } mp_override_legacy_irq(intsrc->source_irq, diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 0d20286d78c6..e26ac4035754 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -25,6 +25,7 @@ #include <linux/dma-mapping.h> #include <linux/iommu-helper.h> #include <linux/iommu.h> +#include <asm/msidef.h> #include <asm/proto.h> #include <asm/iommu.h> #include <asm/gart.h> @@ -1048,7 +1049,7 @@ static int alloc_new_range(struct dma_ops_domain *dma_dom, { int index = dma_dom->aperture_size >> APERTURE_RANGE_SHIFT; struct amd_iommu *iommu; - unsigned long i; + unsigned long i, old_size; #ifdef CONFIG_IOMMU_STRESS populate = false; @@ -1084,8 +1085,21 @@ static int alloc_new_range(struct dma_ops_domain *dma_dom, } } + old_size = dma_dom->aperture_size; dma_dom->aperture_size += APERTURE_RANGE_SIZE; + /* Reserve address range used for MSI messages */ + if (old_size < MSI_ADDR_BASE_LO && + dma_dom->aperture_size > MSI_ADDR_BASE_LO) { + unsigned long spage; + int pages; + + pages = iommu_num_pages(MSI_ADDR_BASE_LO, 0x10000, PAGE_SIZE); + spage = MSI_ADDR_BASE_LO >> PAGE_SHIFT; + + dma_ops_reserve_addresses(dma_dom, spage, pages); + } + /* Intialize the exclusion range if necessary */ for_each_iommu(iommu) { if (iommu->exclusion_start && @@ -1953,6 +1967,7 @@ static void __unmap_single(struct dma_ops_domain *dma_dom, size_t size, int dir) { + dma_addr_t flush_addr; dma_addr_t i, start; unsigned int pages; @@ -1960,6 +1975,7 @@ static void __unmap_single(struct dma_ops_domain *dma_dom, (dma_addr + size > dma_dom->aperture_size)) return; + flush_addr = dma_addr; pages = iommu_num_pages(dma_addr, size, PAGE_SIZE); dma_addr &= PAGE_MASK; start = dma_addr; @@ -1974,7 +1990,7 @@ static void __unmap_single(struct dma_ops_domain *dma_dom, dma_ops_free_addresses(dma_dom, dma_addr, pages); if (amd_iommu_unmap_flush || dma_dom->need_flush) { - iommu_flush_pages(&dma_dom->domain, dma_addr, size); + iommu_flush_pages(&dma_dom->domain, flush_addr, size); dma_dom->need_flush = false; } } diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 3cc63e2b8dd4..82cbee9547c7 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -632,6 +632,13 @@ static void __init init_iommu_from_pci(struct amd_iommu *iommu) iommu->last_device = calc_devid(MMIO_GET_BUS(range), MMIO_GET_LD(range)); iommu->evt_msi_num = MMIO_MSI_NUM(misc); + + if (is_rd890_iommu(iommu->dev)) { + pci_read_config_dword(iommu->dev, 0xf0, &iommu->cache_cfg[0]); + pci_read_config_dword(iommu->dev, 0xf4, &iommu->cache_cfg[1]); + pci_read_config_dword(iommu->dev, 0xf8, &iommu->cache_cfg[2]); + pci_read_config_dword(iommu->dev, 0xfc, &iommu->cache_cfg[3]); + } } /* @@ -643,35 +650,15 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu, { u8 *p = (u8 *)h; u8 *end = p, flags = 0; - u16 dev_i, devid = 0, devid_start = 0, devid_to = 0; - u32 ext_flags = 0; + u16 devid = 0, devid_start = 0, devid_to = 0; + u32 dev_i, ext_flags = 0; bool alias = false; struct ivhd_entry *e; /* - * First set the recommended feature enable bits from ACPI - * into the IOMMU control registers + * First save the recommended feature enable bits from ACPI */ - h->flags & IVHD_FLAG_HT_TUN_EN_MASK ? - iommu_feature_enable(iommu, CONTROL_HT_TUN_EN) : - iommu_feature_disable(iommu, CONTROL_HT_TUN_EN); - - h->flags & IVHD_FLAG_PASSPW_EN_MASK ? - iommu_feature_enable(iommu, CONTROL_PASSPW_EN) : - iommu_feature_disable(iommu, CONTROL_PASSPW_EN); - - h->flags & IVHD_FLAG_RESPASSPW_EN_MASK ? - iommu_feature_enable(iommu, CONTROL_RESPASSPW_EN) : - iommu_feature_disable(iommu, CONTROL_RESPASSPW_EN); - - h->flags & IVHD_FLAG_ISOC_EN_MASK ? - iommu_feature_enable(iommu, CONTROL_ISOC_EN) : - iommu_feature_disable(iommu, CONTROL_ISOC_EN); - - /* - * make IOMMU memory accesses cache coherent - */ - iommu_feature_enable(iommu, CONTROL_COHERENT_EN); + iommu->acpi_flags = h->flags; /* * Done. Now parse the device entries @@ -819,7 +806,7 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu, /* Initializes the device->iommu mapping for the driver */ static int __init init_iommu_devices(struct amd_iommu *iommu) { - u16 i; + u32 i; for (i = iommu->first_device; i <= iommu->last_device; ++i) set_iommu_for_device(iommu, i); @@ -1108,7 +1095,7 @@ static int __init init_memory_definitions(struct acpi_table_header *table) */ static void init_device_table(void) { - u16 devid; + u32 devid; for (devid = 0; devid <= amd_iommu_last_bdf; ++devid) { set_dev_entry_bit(devid, DEV_ENTRY_VALID); @@ -1116,6 +1103,40 @@ static void init_device_table(void) } } +static void iommu_init_flags(struct amd_iommu *iommu) +{ + iommu->acpi_flags & IVHD_FLAG_HT_TUN_EN_MASK ? + iommu_feature_enable(iommu, CONTROL_HT_TUN_EN) : + iommu_feature_disable(iommu, CONTROL_HT_TUN_EN); + + iommu->acpi_flags & IVHD_FLAG_PASSPW_EN_MASK ? + iommu_feature_enable(iommu, CONTROL_PASSPW_EN) : + iommu_feature_disable(iommu, CONTROL_PASSPW_EN); + + iommu->acpi_flags & IVHD_FLAG_RESPASSPW_EN_MASK ? + iommu_feature_enable(iommu, CONTROL_RESPASSPW_EN) : + iommu_feature_disable(iommu, CONTROL_RESPASSPW_EN); + + iommu->acpi_flags & IVHD_FLAG_ISOC_EN_MASK ? + iommu_feature_enable(iommu, CONTROL_ISOC_EN) : + iommu_feature_disable(iommu, CONTROL_ISOC_EN); + + /* + * make IOMMU memory accesses cache coherent + */ + iommu_feature_enable(iommu, CONTROL_COHERENT_EN); +} + +static void iommu_apply_quirks(struct amd_iommu *iommu) +{ + if (is_rd890_iommu(iommu->dev)) { + pci_write_config_dword(iommu->dev, 0xf0, iommu->cache_cfg[0]); + pci_write_config_dword(iommu->dev, 0xf4, iommu->cache_cfg[1]); + pci_write_config_dword(iommu->dev, 0xf8, iommu->cache_cfg[2]); + pci_write_config_dword(iommu->dev, 0xfc, iommu->cache_cfg[3]); + } +} + /* * This function finally enables all IOMMUs found in the system after * they have been initialized @@ -1126,6 +1147,8 @@ static void enable_iommus(void) for_each_iommu(iommu) { iommu_disable(iommu); + iommu_apply_quirks(iommu); + iommu_init_flags(iommu); iommu_set_device_table(iommu); iommu_enable_command_buffer(iommu); iommu_enable_event_buffer(iommu); diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index a96489ee6cab..6583884a83fb 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1340,6 +1340,14 @@ void __cpuinit end_local_APIC_setup(void) setup_apic_nmi_watchdog(NULL); apic_pm_activate(); + + /* + * Now that local APIC setup is completed for BP, configure the fault + * handling for interrupt remapping. + */ + if (!smp_processor_id() && intr_remapping_enabled) + enable_drhd_fault_handling(); + } #ifdef CONFIG_X86_X2APIC @@ -1606,7 +1614,7 @@ void __init init_apic_mappings(void) * acpi lapic path already maps that address in * acpi_register_lapic_address() */ - if (!acpi_lapic) + if (!acpi_lapic && !smp_found_config) set_fixmap_nocache(FIX_APIC_BASE, apic_phys); apic_printk(APIC_VERBOSE, "mapped APIC to %08lx (%08lx)\n", diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index e41ed24ab26d..4d90327853b7 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -306,14 +306,19 @@ void arch_init_copy_chip_data(struct irq_desc *old_desc, old_cfg = old_desc->chip_data; - memcpy(cfg, old_cfg, sizeof(struct irq_cfg)); + cfg->vector = old_cfg->vector; + cfg->move_in_progress = old_cfg->move_in_progress; + cpumask_copy(cfg->domain, old_cfg->domain); + cpumask_copy(cfg->old_domain, old_cfg->old_domain); init_copy_irq_2_pin(old_cfg, cfg, node); } -static void free_irq_cfg(struct irq_cfg *old_cfg) +static void free_irq_cfg(struct irq_cfg *cfg) { - kfree(old_cfg); + free_cpumask_var(cfg->domain); + free_cpumask_var(cfg->old_domain); + kfree(cfg); } void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc) @@ -1392,6 +1397,7 @@ int setup_ioapic_entry(int apic_id, int irq, irte.dlvry_mode = apic->irq_delivery_mode; irte.vector = vector; irte.dest_id = IRTE_DEST(destination); + irte.redir_hint = 1; /* Set source-id of interrupt request */ set_ioapic_sid(&irte, apic_id); @@ -1728,6 +1734,8 @@ __apicdebuginit(void) print_IO_APIC(void) struct irq_pin_list *entry; cfg = desc->chip_data; + if (!cfg) + continue; entry = cfg->irq_2_pin; if (!entry) continue; @@ -3341,6 +3349,7 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, irte.dlvry_mode = apic->irq_delivery_mode; irte.vector = cfg->vector; irte.dest_id = IRTE_DEST(dest); + irte.redir_hint = 1; /* Set source-id of interrupt request */ if (pdev) @@ -3397,7 +3406,7 @@ static int set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) cfg = desc->chip_data; - read_msi_msg_desc(desc, &msg); + get_cached_msi_msg_desc(desc, &msg); msg.data &= ~MSI_DATA_VECTOR_MASK; msg.data |= MSI_DATA_VECTOR(cfg->vector); @@ -3617,6 +3626,7 @@ static int dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask) msg.data |= MSI_DATA_VECTOR(cfg->vector); msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; msg.address_lo |= MSI_ADDR_DEST_ID(dest); + msg.address_hi = MSI_ADDR_BASE_HI | MSI_ADDR_EXT_DEST_ID(dest); dmar_msi_write(irq, &msg); diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c index 83e9be4778e2..fac49a845064 100644 --- a/arch/x86/kernel/apic/probe_64.c +++ b/arch/x86/kernel/apic/probe_64.c @@ -76,13 +76,6 @@ void __init default_setup_apic_routing(void) /* need to update phys_pkg_id */ apic->phys_pkg_id = apicid_phys_pkg_id; } - - /* - * Now that apic routing model is selected, configure the - * fault handling for intr remapping. - */ - if (intr_remapping_enabled) - enable_drhd_fault_handling(); } /* Same for both flat and physical. */ diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c index 10fa5684a662..7369b4c2c55a 100644 --- a/arch/x86/kernel/cpu/addon_cpuid_features.c +++ b/arch/x86/kernel/cpu/addon_cpuid_features.c @@ -33,6 +33,7 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c) { X86_FEATURE_IDA, CR_EAX, 1, 0x00000006 }, { X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006 }, { X86_FEATURE_APERFMPERF, CR_ECX, 0, 0x00000006 }, + { X86_FEATURE_EPB, CR_ECX, 3, 0x00000006 }, { X86_FEATURE_CPB, CR_EDX, 9, 0x80000007 }, { X86_FEATURE_NPT, CR_EDX, 0, 0x8000000a }, { X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a }, diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index e485825130d2..29dfd22ab642 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -305,8 +305,7 @@ static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c) /* use socket ID also for last level cache */ per_cpu(cpu_llc_id, cpu) = c->phys_proc_id; /* fixup topology information on multi-node processors */ - if ((c->x86 == 0x10) && (c->x86_model == 9)) - amd_fixup_dcm(c); + amd_fixup_dcm(c); #endif } @@ -565,6 +564,35 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) } } #endif + + /* + * Family 0x12 and above processors have APIC timer + * running in deep C states. + */ + if (c->x86 > 0x11) + set_cpu_cap(c, X86_FEATURE_ARAT); + + /* + * Disable GART TLB Walk Errors on Fam10h. We do this here + * because this is always needed when GART is enabled, even in a + * kernel which has no MCE support built in. + */ + if (c->x86 == 0x10) { + /* + * BIOS should disable GartTlbWlk Errors themself. If + * it doesn't do it here as suggested by the BKDG. + * + * Fixes: https://bugzilla.kernel.org/show_bug.cgi?id=33012 + */ + u64 mask; + int err; + + err = rdmsrl_safe(MSR_AMD64_MCx_MASK(4), &mask); + if (err == 0) { + mask |= (1 << 10); + checking_wrmsrl(MSR_AMD64_MCx_MASK(4), mask); + } + } } #ifdef CONFIG_X86_32 @@ -609,3 +637,68 @@ static const struct cpu_dev __cpuinitconst amd_cpu_dev = { }; cpu_dev_register(amd_cpu_dev); + +/* + * AMD errata checking + * + * Errata are defined as arrays of ints using the AMD_LEGACY_ERRATUM() or + * AMD_OSVW_ERRATUM() macros. The latter is intended for newer errata that + * have an OSVW id assigned, which it takes as first argument. Both take a + * variable number of family-specific model-stepping ranges created by + * AMD_MODEL_RANGE(). Each erratum also has to be declared as extern const + * int[] in arch/x86/include/asm/processor.h. + * + * Example: + * + * const int amd_erratum_319[] = + * AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0x4, 0x2), + * AMD_MODEL_RANGE(0x10, 0x8, 0x0, 0x8, 0x0), + * AMD_MODEL_RANGE(0x10, 0x9, 0x0, 0x9, 0x0)); + */ + +const int amd_erratum_400[] = + AMD_OSVW_ERRATUM(1, AMD_MODEL_RANGE(0xf, 0x41, 0x2, 0xff, 0xf), + AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0xff, 0xf)); + + +bool cpu_has_amd_erratum(const int *erratum) +{ + struct cpuinfo_x86 *cpu = ¤t_cpu_data; + int osvw_id = *erratum++; + u32 range; + u32 ms; + + /* + * If called early enough that current_cpu_data hasn't been initialized + * yet, fall back to boot_cpu_data. + */ + if (cpu->x86 == 0) + cpu = &boot_cpu_data; + + if (cpu->x86_vendor != X86_VENDOR_AMD) + return false; + + if (osvw_id >= 0 && osvw_id < 65536 && + cpu_has(cpu, X86_FEATURE_OSVW)) { + u64 osvw_len; + + rdmsrl(MSR_AMD64_OSVW_ID_LENGTH, osvw_len); + if (osvw_id < osvw_len) { + u64 osvw_bits; + + rdmsrl(MSR_AMD64_OSVW_STATUS + (osvw_id >> 6), + osvw_bits); + return osvw_bits & (1ULL << (osvw_id & 0x3f)); + } + } + + /* OSVW unavailable or ID unknown, match family-model-stepping range */ + ms = (cpu->x86_model << 4) | cpu->x86_mask; + while ((range = *erratum++)) + if ((cpu->x86 == AMD_MODEL_RANGE_FAMILY(range)) && + (ms >= AMD_MODEL_RANGE_START(range)) && + (ms <= AMD_MODEL_RANGE_END(range))) + return true; + + return false; +} diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 68e4a6f2211e..d938871e3d83 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -537,7 +537,7 @@ void __cpuinit cpu_detect(struct cpuinfo_x86 *c) } } -static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c) +void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c) { u32 tfms, xlvl; u32 ebx; @@ -576,6 +576,7 @@ static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c) if (c->extended_cpuid_level >= 0x80000007) c->x86_power = cpuid_edx(0x80000007); + init_scattered_cpuid_features(c); } static void __cpuinit identify_cpu_without_cpuid(struct cpuinfo_x86 *c) @@ -731,7 +732,6 @@ static void __cpuinit generic_identify(struct cpuinfo_x86 *c) get_model_name(c); /* Default name */ - init_scattered_cpuid_features(c); detect_nopl(c); } diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index 3624e8a0f71b..f668bb1f7d43 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -33,5 +33,6 @@ extern const struct cpu_dev *const __x86_cpu_dev_start[], *const __x86_cpu_dev_end[]; extern void cpu_detect_cache_sizes(struct cpuinfo_x86 *c); +extern void get_cpu_cap(struct cpuinfo_x86 *c); #endif diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c index 1d3cddaa40ee..5384b0418428 100644 --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -704,6 +704,7 @@ static int acpi_cpufreq_cpu_exit(struct cpufreq_policy *policy) per_cpu(acfreq_data, policy->cpu) = NULL; acpi_processor_unregister_performance(data->acpi_data, policy->cpu); + kfree(data->freq_table); kfree(data); } diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 85f69cdeae10..8e82a525b80c 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -39,6 +39,7 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c) misc_enable &= ~MSR_IA32_MISC_ENABLE_LIMIT_CPUID; wrmsrl(MSR_IA32_MISC_ENABLE, misc_enable); c->cpuid_level = cpuid_eax(0); + get_cpu_cap(c); } } @@ -449,6 +450,24 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) if (cpu_has(c, X86_FEATURE_VMX)) detect_vmx_virtcap(c); + + /* + * Initialize MSR_IA32_ENERGY_PERF_BIAS if BIOS did not. + * x86_energy_perf_policy(8) is available to change it at run-time + */ + if (cpu_has(c, X86_FEATURE_EPB)) { + u64 epb; + + rdmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb); + if ((epb & 0xF) == 0) { + printk_once(KERN_WARNING, "x86: updated energy_perf_bias" + " to 'normal' from 'performance'\n" + "You can view and update epb via utility," + " such as x86_energy_perf_policy(8)\n"); + epb = (epb & ~0xF) | ENERGY_PERF_BIAS_NORMAL; + wrmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb); + } + } } #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 224392d8fe8c..1bdce3410b5e 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -141,6 +141,7 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) address = (low & MASK_BLKPTR_LO) >> 21; if (!address) break; + address += MCG_XBLK_ADDR; } else ++address; @@ -148,12 +149,8 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) if (rdmsr_safe(address, &low, &high)) break; - if (!(high & MASK_VALID_HI)) { - if (block) - continue; - else - break; - } + if (!(high & MASK_VALID_HI)) + continue; if (!(high & MASK_CNTP_HI) || (high & MASK_LOCKED_HI)) @@ -472,6 +469,7 @@ recurse: out_free: if (b) { kobject_put(&b->kobj); + list_del(&b->miscj); kfree(b); } return err; diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index e1a0a3bf9716..22100c649e74 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -293,18 +293,20 @@ void intel_init_thermal(struct cpuinfo_x86 *c) */ rdmsr(MSR_IA32_MISC_ENABLE, l, h); + h = lvtthmr_init; /* * The initial value of thermal LVT entries on all APs always reads * 0x10000 because APs are woken up by BSP issuing INIT-SIPI-SIPI * sequence to them and LVT registers are reset to 0s except for * the mask bits which are set to 1s when APs receive INIT IPI. - * Always restore the value that BIOS has programmed on AP based on - * BSP's info we saved since BIOS is always setting the same value - * for all threads/cores + * If BIOS takes over the thermal interrupt and sets its interrupt + * delivery mode to SMI (not fixed), it restores the value that the + * BIOS has programmed on AP based on BSP's info we saved since BIOS + * is always setting the same value for all threads/cores. */ - apic_write(APIC_LVTTHMR, lvtthmr_init); + if ((h & APIC_DM_FIXED_MASK) != APIC_DM_FIXED) + apic_write(APIC_LVTTHMR, lvtthmr_init); - h = lvtthmr_init; if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) { printk(KERN_DEBUG diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c index 06130b52f012..a67038401129 100644 --- a/arch/x86/kernel/cpu/mtrr/cleanup.c +++ b/arch/x86/kernel/cpu/mtrr/cleanup.c @@ -827,7 +827,7 @@ int __init amd_special_default_mtrr(void) if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) return 0; - if (boot_cpu_data.x86 < 0xf || boot_cpu_data.x86 > 0x11) + if (boot_cpu_data.x86 < 0xf) return 0; /* In case some hypervisor doesn't pass SYSCFG through: */ if (rdmsr_safe(MSR_K8_SYSCFG, &l, &h) < 0) diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 01c0f3ee6cc3..675ac513dbe2 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -247,6 +247,25 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ unsigned long flags; int cpu; +#ifdef CONFIG_SMP + /* + * If this cpu is not yet active, we are in the cpu online path. There + * can be no stop_machine() in parallel, as stop machine ensures this + * by using get_online_cpus(). We can skip taking the stop_cpus_mutex, + * as we don't need it and also we can't afford to block while waiting + * for the mutex. + * + * If this cpu is active, we need to prevent stop_machine() happening + * in parallel by taking the stop cpus mutex. + * + * Also, this is called in the context of cpu online path or in the + * context where cpu hotplug is prevented. So checking the active status + * of the raw_smp_processor_id() is safe. + */ + if (cpu_active(raw_smp_processor_id())) + mutex_lock(&stop_cpus_mutex); +#endif + preempt_disable(); data.smp_reg = reg; @@ -292,14 +311,24 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ /* * HACK! - * We use this same function to initialize the mtrrs on boot. - * The state of the boot cpu's mtrrs has been saved, and we want - * to replicate across all the APs. - * If we're doing that @reg is set to something special... + * + * We use this same function to initialize the mtrrs during boot, + * resume, runtime cpu online and on an explicit request to set a + * specific MTRR. + * + * During boot or suspend, the state of the boot cpu's mtrrs has been + * saved, and we want to replicate that across all the cpus that come + * online (either at the end of boot or resume or during a runtime cpu + * online). If we're doing that, @reg is set to something special and on + * this cpu we still do mtrr_if->set_all(). During boot/resume, this + * is unnecessary if at this point we are still on the cpu that started + * the boot/resume sequence. But there is no guarantee that we are still + * on the same cpu. So we do mtrr_if->set_all() on this cpu aswell to be + * sure that we are in sync with everyone else. */ if (reg != ~0U) mtrr_if->set(reg, base, size, type); - else if (!mtrr_aps_delayed_init) + else mtrr_if->set_all(); /* Wait for the others */ @@ -319,6 +348,11 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ local_irq_restore(flags); preempt_enable(); + +#ifdef CONFIG_SMP + if (cpu_active(raw_smp_processor_id())) + mutex_unlock(&stop_cpus_mutex); +#endif } /** @@ -793,13 +827,21 @@ void set_mtrr_aps_delayed_init(void) } /* - * MTRR initialization for all AP's + * Delayed MTRR initialization for all AP's */ void mtrr_aps_init(void) { if (!use_intel()) return; + /* + * Check if someone has requested the delay of AP MTRR initialization, + * by doing set_mtrr_aps_delayed_init(), prior to this point. If not, + * then we are done. + */ + if (!mtrr_aps_delayed_init) + return; + set_mtrr(~0U, 0, 0, 0); mtrr_aps_delayed_init = false; } diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c index c2897b7b4a3b..46d58448c3af 100644 --- a/arch/x86/kernel/cpu/perf_event_amd.c +++ b/arch/x86/kernel/cpu/perf_event_amd.c @@ -52,7 +52,7 @@ static __initconst const u64 amd_hw_cache_event_ids [ C(DTLB) ] = { [ C(OP_READ) ] = { [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */ - [ C(RESULT_MISS) ] = 0x0046, /* L1 DTLB and L2 DLTB Miss */ + [ C(RESULT_MISS) ] = 0x0746, /* L1_DTLB_AND_L2_DLTB_MISS.ALL */ }, [ C(OP_WRITE) ] = { [ C(RESULT_ACCESS) ] = 0, @@ -66,7 +66,7 @@ static __initconst const u64 amd_hw_cache_event_ids [ C(ITLB) ] = { [ C(OP_READ) ] = { [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction fecthes */ - [ C(RESULT_MISS) ] = 0x0085, /* Instr. fetch ITLB misses */ + [ C(RESULT_MISS) ] = 0x0385, /* L1_ITLB_AND_L2_ITLB_MISS.ALL */ }, [ C(OP_WRITE) ] = { [ C(RESULT_ACCESS) ] = -1, diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 214ac860ebe0..d8d86d014008 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -491,33 +491,78 @@ static void intel_pmu_enable_all(int added) * Intel Errata AAP53 (model 30) * Intel Errata BD53 (model 44) * - * These chips need to be 'reset' when adding counters by programming - * the magic three (non counting) events 0x4300D2, 0x4300B1 and 0x4300B5 - * either in sequence on the same PMC or on different PMCs. + * The official story: + * These chips need to be 'reset' when adding counters by programming the + * magic three (non-counting) events 0x4300B5, 0x4300D2, and 0x4300B1 either + * in sequence on the same PMC or on different PMCs. + * + * In practise it appears some of these events do in fact count, and + * we need to programm all 4 events. */ -static void intel_pmu_nhm_enable_all(int added) +static void intel_pmu_nhm_workaround(void) { - if (added) { - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); - int i; + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + static const unsigned long nhm_magic[4] = { + 0x4300B5, + 0x4300D2, + 0x4300B1, + 0x4300B1 + }; + struct perf_event *event; + int i; + + /* + * The Errata requires below steps: + * 1) Clear MSR_IA32_PEBS_ENABLE and MSR_CORE_PERF_GLOBAL_CTRL; + * 2) Configure 4 PERFEVTSELx with the magic events and clear + * the corresponding PMCx; + * 3) set bit0~bit3 of MSR_CORE_PERF_GLOBAL_CTRL; + * 4) Clear MSR_CORE_PERF_GLOBAL_CTRL; + * 5) Clear 4 pairs of ERFEVTSELx and PMCx; + */ + + /* + * The real steps we choose are a little different from above. + * A) To reduce MSR operations, we don't run step 1) as they + * are already cleared before this function is called; + * B) Call x86_perf_event_update to save PMCx before configuring + * PERFEVTSELx with magic number; + * C) With step 5), we do clear only when the PERFEVTSELx is + * not used currently. + * D) Call x86_perf_event_set_period to restore PMCx; + */ - wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + 0, 0x4300D2); - wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + 1, 0x4300B1); - wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + 2, 0x4300B5); + /* We always operate 4 pairs of PERF Counters */ + for (i = 0; i < 4; i++) { + event = cpuc->events[i]; + if (event) + x86_perf_event_update(event); + } - wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0x3); - wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0x0); + for (i = 0; i < 4; i++) { + wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + i, nhm_magic[i]); + wrmsrl(MSR_ARCH_PERFMON_PERFCTR0 + i, 0x0); + } - for (i = 0; i < 3; i++) { - struct perf_event *event = cpuc->events[i]; + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0xf); + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0x0); - if (!event) - continue; + for (i = 0; i < 4; i++) { + event = cpuc->events[i]; + if (event) { + x86_perf_event_set_period(event); __x86_pmu_enable_event(&event->hw, - ARCH_PERFMON_EVENTSEL_ENABLE); - } + ARCH_PERFMON_EVENTSEL_ENABLE); + } else + wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + i, 0x0); } +} + +static void intel_pmu_nhm_enable_all(int added) +{ + if (added) + intel_pmu_nhm_workaround(); intel_pmu_enable_all(added); } diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c index ae85d69644d1..a187365d98b4 100644 --- a/arch/x86/kernel/cpu/perf_event_p4.c +++ b/arch/x86/kernel/cpu/perf_event_p4.c @@ -457,6 +457,8 @@ static int p4_hw_config(struct perf_event *event) event->hw.config |= event->attr.config & (p4_config_pack_escr(P4_ESCR_MASK_HT) | p4_config_pack_cccr(P4_CCCR_MASK_HT)); + + event->hw.config &= ~P4_CCCR_FORCE_OVF; } rc = x86_setup_perfctr(event); @@ -581,6 +583,7 @@ static int p4_pmu_handle_irq(struct pt_regs *regs) cpuc = &__get_cpu_var(cpu_hw_events); for (idx = 0; idx < x86_pmu.num_counters; idx++) { + int overflow; if (!test_bit(idx, cpuc->active_mask)) continue; @@ -591,12 +594,14 @@ static int p4_pmu_handle_irq(struct pt_regs *regs) WARN_ON_ONCE(hwc->idx != idx); /* it might be unflagged overflow */ - handled = p4_pmu_clear_cccr_ovf(hwc); + overflow = p4_pmu_clear_cccr_ovf(hwc); val = x86_perf_event_update(event); - if (!handled && (val & (1ULL << (x86_pmu.cntval_bits - 1)))) + if (!overflow && (val & (1ULL << (x86_pmu.cntval_bits - 1)))) continue; + handled += overflow; + /* event overflow for sure */ data.period = event->hw.last_period; @@ -612,7 +617,7 @@ static int p4_pmu_handle_irq(struct pt_regs *regs) inc_irq_stat(apic_perf_irqs); } - return handled; + return handled > 0; } /* diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c index b9d1ff588445..ce9c6c28de77 100644 --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c @@ -23,6 +23,7 @@ #include <linux/dmi.h> #include <linux/module.h> +#include <linux/jiffies.h> #include <asm/div64.h> #include <asm/x86_init.h> #include <asm/hypervisor.h> @@ -51,7 +52,7 @@ static inline int __vmware_platform(void) static unsigned long vmware_get_tsc_khz(void) { - uint64_t tsc_hz; + uint64_t tsc_hz, lpj; uint32_t eax, ebx, ecx, edx; VMWARE_PORT(GETHZ, eax, ebx, ecx, edx); @@ -62,6 +63,13 @@ static unsigned long vmware_get_tsc_khz(void) printk(KERN_INFO "TSC freq read from hypervisor : %lu.%03lu MHz\n", (unsigned long) tsc_hz / 1000, (unsigned long) tsc_hz % 1000); + + if (!preset_lpj) { + lpj = ((u64)tsc_hz * 1000); + do_div(lpj, HZ); + preset_lpj = lpj; + } + return tsc_hz; } diff --git a/arch/x86/kernel/crash_dump_64.c b/arch/x86/kernel/crash_dump_64.c index 045b36cada65..994828899e09 100644 --- a/arch/x86/kernel/crash_dump_64.c +++ b/arch/x86/kernel/crash_dump_64.c @@ -34,7 +34,7 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf, if (!csize) return 0; - vaddr = ioremap(pfn << PAGE_SHIFT, PAGE_SIZE); + vaddr = ioremap_cache(pfn << PAGE_SHIFT, PAGE_SIZE); if (!vaddr) return -ENOMEM; @@ -46,6 +46,7 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf, } else memcpy(buf, vaddr + offset, csize); + set_iounmap_nonlazy(); iounmap(vaddr); return csize; } diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 0d6fc71bedb1..28b09af9775c 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -980,15 +980,21 @@ static int __init parse_memopt(char *p) if (!p) return -EINVAL; -#ifdef CONFIG_X86_32 if (!strcmp(p, "nopentium")) { +#ifdef CONFIG_X86_32 setup_clear_cpu_cap(X86_FEATURE_PSE); return 0; - } +#else + printk(KERN_WARNING "mem=nopentium ignored! (only supported on x86_32)\n"); + return -EINVAL; #endif + } userdef = 1; mem_size = memparse(p, &p); + /* don't remove all of memory when handling "mem={invalid}" param */ + if (mem_size == 0) + return -EINVAL; e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1); return 0; diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index e5cc7e82e60d..f67a33c7f415 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -18,7 +18,6 @@ #include <asm/apic.h> #include <asm/iommu.h> #include <asm/gart.h> -#include <asm/hpet.h> static void __init fix_hypertransport_config(int num, int slot, int func) { @@ -146,15 +145,10 @@ static void __init ati_bugs(int num, int slot, int func) static u32 __init ati_sbx00_rev(int num, int slot, int func) { - u32 old, d; + u32 d; - d = read_pci_config(num, slot, func, 0x70); - old = d; - d &= ~(1<<8); - write_pci_config(num, slot, func, 0x70, d); d = read_pci_config(num, slot, func, 0x8); d &= 0xff; - write_pci_config(num, slot, func, 0x70, old); return d; } @@ -163,11 +157,19 @@ static void __init ati_bugs_contd(int num, int slot, int func) { u32 d, rev; - if (acpi_use_timer_override) + rev = ati_sbx00_rev(num, slot, func); + if (rev >= 0x40) + acpi_fix_pin2_polarity = 1; + + /* + * SB600: revisions 0x11, 0x12, 0x13, 0x14, ... + * SB700: revisions 0x39, 0x3a, ... + * SB800: revisions 0x40, 0x41, ... + */ + if (rev >= 0x39) return; - rev = ati_sbx00_rev(num, slot, func); - if (rev > 0x13) + if (acpi_use_timer_override) return; /* check for IRQ0 interrupt swap */ @@ -192,21 +194,6 @@ static void __init ati_bugs_contd(int num, int slot, int func) } #endif -/* - * Force the read back of the CMP register in hpet_next_event() - * to work around the problem that the CMP register write seems to be - * delayed. See hpet_next_event() for details. - * - * We do this on all SMBUS incarnations for now until we have more - * information about the affected chipsets. - */ -static void __init ati_hpet_bugs(int num, int slot, int func) -{ -#ifdef CONFIG_HPET_TIMER - hpet_readback_cmp = 1; -#endif -} - #define QFLAG_APPLY_ONCE 0x1 #define QFLAG_APPLIED 0x2 #define QFLAG_DONE (QFLAG_APPLY_ONCE|QFLAG_APPLIED) @@ -236,8 +223,6 @@ static struct chipset early_qrk[] __initdata = { PCI_CLASS_SERIAL_SMBUS, PCI_ANY_ID, 0, ati_bugs }, { PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_SBX00_SMBUS, PCI_CLASS_SERIAL_SMBUS, PCI_ANY_ID, 0, ati_bugs_contd }, - { PCI_VENDOR_ID_ATI, PCI_ANY_ID, - PCI_CLASS_SERIAL_SMBUS, PCI_ANY_ID, 0, ati_hpet_bugs }, {} }; diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 4db7c4d12ffa..2642cf9911f8 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1268,7 +1268,7 @@ ENTRY(xen_do_hypervisor_callback) # do_hypervisor_callback(struct *pt_regs) decl PER_CPU_VAR(irq_count) jmp error_exit CFI_ENDPROC -END(do_hypervisor_callback) +END(xen_do_hypervisor_callback) /* * Hypervisor uses this for application faults while it executes. diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 37c3d4b17d85..75e398199643 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -328,7 +328,7 @@ ENTRY(startup_32_smp) /* * Enable paging */ - movl $pa(swapper_pg_dir),%eax + movl pa(initial_page_table), %eax movl %eax,%cr3 /* set the page table pointer.. */ movl %cr0,%eax orl $X86_CR0_PG,%eax @@ -608,6 +608,8 @@ ignore_int: .align 4 ENTRY(initial_code) .long i386_start_kernel +ENTRY(initial_page_table) + .long pa(swapper_pg_dir) /* * BSS section @@ -623,6 +625,10 @@ ENTRY(swapper_pg_dir) #endif swapper_pg_fixmap: .fill 1024,4,0 +#ifdef CONFIG_X86_TRAMPOLINE +ENTRY(trampoline_pg_dir) + .fill 1024,4,0 +#endif ENTRY(empty_zero_page) .fill 4096,1,0 diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index ba390d731175..917c66fca56d 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -36,7 +36,6 @@ unsigned long hpet_address; u8 hpet_blockid; /* OS timer block num */ u8 hpet_msi_disable; -u8 hpet_readback_cmp; #ifdef CONFIG_PCI_MSI static unsigned long hpet_num_timers; @@ -396,23 +395,27 @@ static int hpet_next_event(unsigned long delta, * at that point and we would wait for the next hpet interrupt * forever. We found out that reading the CMP register back * forces the transfer so we can rely on the comparison with - * the counter register below. + * the counter register below. If the read back from the + * compare register does not match the value we programmed + * then we might have a real hardware problem. We can not do + * much about it here, but at least alert the user/admin with + * a prominent warning. * - * That works fine on those ATI chipsets, but on newer Intel - * chipsets (ICH9...) this triggers due to an erratum: Reading - * the comparator immediately following a write is returning - * the old value. + * An erratum on some chipsets (ICH9,..), results in + * comparator read immediately following a write returning old + * value. Workaround for this is to read this value second + * time, when first read returns old value. * - * We restrict the read back to the affected ATI chipsets (set - * by quirks) and also run it with hpet=verbose for debugging - * purposes. + * In fact the write to the comparator register is delayed up + * to two HPET cycles so the workaround we tried to restrict + * the readback to those known to be borked ATI chipsets + * failed miserably. So we give up on optimizations forever + * and penalize all HPET incarnations unconditionally. */ - if (hpet_readback_cmp || hpet_verbose) { - u32 cmp = hpet_readl(HPET_Tn_CMP(timer)); - - if (cmp != cnt) + if (unlikely((u32)hpet_readl(HPET_Tn_CMP(timer)) != cnt)) { + if (hpet_readl(HPET_Tn_CMP(timer)) != cnt) printk_once(KERN_WARNING - "hpet: compare register read back failed.\n"); + "hpet: compare register read back failed.\n"); } return (s32)(hpet_readl(HPET_COUNTER) - cnt) >= 0 ? -ETIME : 0; @@ -504,7 +507,7 @@ static int hpet_assign_irq(struct hpet_dev *dev) { unsigned int irq; - irq = create_irq(); + irq = create_irq_nr(0, -1); if (!irq) return -EINVAL; diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index a8f1b803d2fd..f3654702099a 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c @@ -421,6 +421,10 @@ static int __kprobes hw_breakpoint_handler(struct die_args *args) dr6_p = (unsigned long *)ERR_PTR(args->err); dr6 = *dr6_p; + /* If it's a single step, TRAP bits are random */ + if (dr6 & DR_STEP) + return NOTIFY_DONE; + /* Do an early return if no trap bits are set in DR6 */ if ((dr6 & DR_TRAP_BITS) == 0) return NOTIFY_DONE; diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index e1af7c055c7d..67381a227d67 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -66,7 +66,6 @@ struct microcode_amd { unsigned int mpb[0]; }; -#define UCODE_MAX_SIZE 2048 #define UCODE_CONTAINER_SECTION_HDR 8 #define UCODE_CONTAINER_HEADER_SIZE 12 @@ -125,6 +124,37 @@ static int get_matching_microcode(int cpu, void *mc, int rev) return 1; } +static unsigned int verify_ucode_size(int cpu, const u8 *buf, unsigned int size) +{ + struct cpuinfo_x86 *c = &cpu_data(cpu); + unsigned int max_size, actual_size; + +#define F1XH_MPB_MAX_SIZE 2048 +#define F14H_MPB_MAX_SIZE 1824 +#define F15H_MPB_MAX_SIZE 4096 + + switch (c->x86) { + case 0x14: + max_size = F14H_MPB_MAX_SIZE; + break; + case 0x15: + max_size = F15H_MPB_MAX_SIZE; + break; + default: + max_size = F1XH_MPB_MAX_SIZE; + break; + } + + actual_size = buf[4] + (buf[5] << 8); + + if (actual_size > size || actual_size > max_size) { + pr_err("section size mismatch\n"); + return 0; + } + + return actual_size; +} + static int apply_microcode_amd(int cpu) { u32 rev, dummy; @@ -162,11 +192,11 @@ static int get_ucode_data(void *to, const u8 *from, size_t n) } static void * -get_next_ucode(const u8 *buf, unsigned int size, unsigned int *mc_size) +get_next_ucode(int cpu, const u8 *buf, unsigned int size, unsigned int *mc_size) { - unsigned int total_size; + unsigned int actual_size = 0; u8 section_hdr[UCODE_CONTAINER_SECTION_HDR]; - void *mc; + void *mc = NULL; if (get_ucode_data(section_hdr, buf, UCODE_CONTAINER_SECTION_HDR)) return NULL; @@ -176,23 +206,18 @@ get_next_ucode(const u8 *buf, unsigned int size, unsigned int *mc_size) return NULL; } - total_size = (unsigned long) (section_hdr[4] + (section_hdr[5] << 8)); - - if (total_size > size || total_size > UCODE_MAX_SIZE) { - pr_err("error: size mismatch\n"); + actual_size = verify_ucode_size(cpu, buf, size); + if (!actual_size) return NULL; - } - mc = vmalloc(UCODE_MAX_SIZE); - if (mc) { - memset(mc, 0, UCODE_MAX_SIZE); - if (get_ucode_data(mc, buf + UCODE_CONTAINER_SECTION_HDR, - total_size)) { - vfree(mc); - mc = NULL; - } else - *mc_size = total_size + UCODE_CONTAINER_SECTION_HDR; - } + mc = vmalloc(actual_size); + if (!mc) + return NULL; + + memset(mc, 0, actual_size); + get_ucode_data(mc, buf + UCODE_CONTAINER_SECTION_HDR, actual_size); + *mc_size = actual_size + UCODE_CONTAINER_SECTION_HDR; + return mc; } @@ -258,7 +283,7 @@ generic_load_microcode(int cpu, const u8 *data, size_t size) unsigned int uninitialized_var(mc_size); struct microcode_header_amd *mc_header; - mc = get_next_ucode(ucode_ptr, leftover, &mc_size); + mc = get_next_ucode(cpu, ucode_ptr, leftover, &mc_size); if (!mc) break; diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c index 356170262a93..2573689bda77 100644 --- a/arch/x86/kernel/microcode_intel.c +++ b/arch/x86/kernel/microcode_intel.c @@ -364,8 +364,7 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size, /* For performance reasons, reuse mc area when possible */ if (!mc || mc_size > curr_mc_size) { - if (mc) - vfree(mc); + vfree(mc); mc = vmalloc(mc_size); if (!mc) break; @@ -374,13 +373,11 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size, if (get_ucode_data(mc, ucode_ptr, mc_size) || microcode_sanity_check(mc) < 0) { - vfree(mc); break; } if (get_matching_microcode(&uci->cpu_sig, mc, new_rev)) { - if (new_mc) - vfree(new_mc); + vfree(new_mc); new_rev = mc_header.rev; new_mc = mc; mc = NULL; /* trigger new vmalloc */ @@ -390,12 +387,10 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size, leftover -= mc_size; } - if (mc) - vfree(mc); + vfree(mc); if (leftover) { - if (new_mc) - vfree(new_mc); + vfree(new_mc); state = UCODE_ERROR; goto out; } @@ -405,8 +400,7 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size, goto out; } - if (uci->mc) - vfree(uci->mc); + vfree(uci->mc); uci->mc = (struct microcode_intel *)new_mc; pr_debug("CPU%d found a matching microcode update with version 0x%x (current=0x%x)\n", diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index d86dbf7e54be..d7b6f7fb4fec 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -274,6 +274,18 @@ static void __init smp_dump_mptable(struct mpc_table *mpc, unsigned char *mpt) void __init default_smp_read_mpc_oem(struct mpc_table *mpc) { } +static void __init smp_register_lapic_address(unsigned long address) +{ + mp_lapic_addr = address; + + set_fixmap_nocache(FIX_APIC_BASE, address); + if (boot_cpu_physical_apicid == -1U) { + boot_cpu_physical_apicid = read_apic_id(); + apic_version[boot_cpu_physical_apicid] = + GET_APIC_VERSION(apic_read(APIC_LVR)); + } +} + static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early) { char str[16]; @@ -295,6 +307,10 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early) if (early) return 1; + /* Initialize the lapic mapping */ + if (!acpi_lapic) + smp_register_lapic_address(mpc->lapic); + if (mpc->oemptr) x86_init.mpparse.smp_read_mpc_oem(mpc); diff --git a/arch/x86/kernel/olpc.c b/arch/x86/kernel/olpc.c index 8297160c41b3..a23b38252b85 100644 --- a/arch/x86/kernel/olpc.c +++ b/arch/x86/kernel/olpc.c @@ -117,6 +117,7 @@ int olpc_ec_cmd(unsigned char cmd, unsigned char *inbuf, size_t inlen, unsigned long flags; int ret = -EIO; int i; + int restarts = 0; spin_lock_irqsave(&ec_lock, flags); @@ -173,7 +174,9 @@ restart: if (wait_on_obf(0x6c, 1)) { printk(KERN_ERR "olpc-ec: timeout waiting for" " EC to provide data!\n"); - goto restart; + if (restarts++ < 10) + goto restart; + goto err; } outbuf[i] = inb(0x68); printk(KERN_DEBUG "olpc-ec: received 0x%x\n", diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index 0f7f130caa67..870e069863a4 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -80,6 +80,9 @@ static u32 gart_unmapped_entry; #define AGPEXTERN #endif +/* GART can only remap to physical addresses < 1TB */ +#define GART_MAX_PHYS_ADDR (1ULL << 40) + /* backdoor interface to AGP driver */ AGPEXTERN int agp_memory_reserved; AGPEXTERN __u32 *agp_gatt_table; @@ -211,9 +214,13 @@ static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem, size_t size, int dir, unsigned long align_mask) { unsigned long npages = iommu_num_pages(phys_mem, size, PAGE_SIZE); - unsigned long iommu_page = alloc_iommu(dev, npages, align_mask); + unsigned long iommu_page; int i; + if (unlikely(phys_mem + size > GART_MAX_PHYS_ADDR)) + return bad_dma_addr; + + iommu_page = alloc_iommu(dev, npages, align_mask); if (iommu_page == -1) { if (!nonforced_iommu(dev, phys_mem, size)) return phys_mem; diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index e7e35219b32f..553b02f13094 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -525,42 +525,6 @@ static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c) return (edx & MWAIT_EDX_C1); } -/* - * Check for AMD CPUs, where APIC timer interrupt does not wake up CPU from C1e. - * For more information see - * - Erratum #400 for NPT family 0xf and family 0x10 CPUs - * - Erratum #365 for family 0x11 (not affected because C1e not in use) - */ -static int __cpuinit check_c1e_idle(const struct cpuinfo_x86 *c) -{ - u64 val; - if (c->x86_vendor != X86_VENDOR_AMD) - goto no_c1e_idle; - - /* Family 0x0f models < rev F do not have C1E */ - if (c->x86 == 0x0F && c->x86_model >= 0x40) - return 1; - - if (c->x86 == 0x10) { - /* - * check OSVW bit for CPUs that are not affected - * by erratum #400 - */ - if (cpu_has(c, X86_FEATURE_OSVW)) { - rdmsrl(MSR_AMD64_OSVW_ID_LENGTH, val); - if (val >= 2) { - rdmsrl(MSR_AMD64_OSVW_STATUS, val); - if (!(val & BIT(1))) - goto no_c1e_idle; - } - } - return 1; - } - -no_c1e_idle: - return 0; -} - static cpumask_var_t c1e_mask; static int c1e_detected; @@ -638,7 +602,8 @@ void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) */ printk(KERN_INFO "using mwait in idle threads.\n"); pm_idle = mwait_idle; - } else if (check_c1e_idle(c)) { + } else if (cpu_has_amd_erratum(amd_erratum_400)) { + /* E400: APIC timer interrupt does not wake up CPU from C1e */ printk(KERN_INFO "using C1E aware idle routine\n"); pm_idle = c1e_idle; } else diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 8d128783af47..a3d0dc59067b 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -245,7 +245,6 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp) { set_user_gs(regs, 0); regs->fs = 0; - set_fs(USER_DS); regs->ds = __USER_DS; regs->es = __USER_DS; regs->ss = __USER_DS; diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 3c2422a99f1f..9cb414381d44 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -337,7 +337,6 @@ start_thread_common(struct pt_regs *regs, unsigned long new_ip, regs->cs = _cs; regs->ss = _ss; regs->flags = X86_EFLAGS_IF; - set_fs(USER_DS); /* * Free the old FP and other extended state */ diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c index 239427ca02af..a4f07c1cfc87 100644 --- a/arch/x86/kernel/pvclock.c +++ b/arch/x86/kernel/pvclock.c @@ -120,6 +120,11 @@ unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src) static atomic64_t last_value = ATOMIC64_INIT(0); +void pvclock_resume(void) +{ + atomic64_set(&last_value, 0); +} + cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src) { struct pvclock_shadow_time shadow; diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index e3af342fe83a..807e3c98bbbf 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -477,6 +477,22 @@ static struct dmi_system_id __initdata pci_reboot_dmi_table[] = { DMI_MATCH(DMI_PRODUCT_NAME, "iMac9,1"), }, }, + { /* Handle problems with rebooting on the Latitude E5420. */ + .callback = set_pci_reboot, + .ident = "Dell Latitude E5420", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E5420"), + }, + }, + { /* Handle problems with rebooting on the Latitude E6420. */ + .callback = set_pci_reboot, + .ident = "Dell Latitude E6420", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E6420"), + }, + }, { } }; @@ -641,7 +657,7 @@ void native_machine_shutdown(void) /* O.K Now that I'm on the appropriate processor, * stop all of the others. */ - smp_send_stop(); + stop_other_cpus(); #endif lapic_shutdown(); diff --git a/arch/x86/kernel/relocate_kernel_32.S b/arch/x86/kernel/relocate_kernel_32.S index 41235531b11c..36818f8ec2be 100644 --- a/arch/x86/kernel/relocate_kernel_32.S +++ b/arch/x86/kernel/relocate_kernel_32.S @@ -97,6 +97,8 @@ relocate_kernel: ret identity_mapped: + /* set return address to 0 if not preserving context */ + pushl $0 /* store the start address on the stack */ pushl %edx diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S index 4de8f5b3d476..7a6f3b3be3cf 100644 --- a/arch/x86/kernel/relocate_kernel_64.S +++ b/arch/x86/kernel/relocate_kernel_64.S @@ -100,6 +100,8 @@ relocate_kernel: ret identity_mapped: + /* set return address to 0 if not preserving context */ + pushq $0 /* store the start address on the stack */ pushq %rdx diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index b4ae4acbd031..6600cfde6b7c 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1008,6 +1008,8 @@ void __init setup_arch(char **cmdline_p) paging_init(); x86_init.paging.pagetable_setup_done(swapper_pg_dir); + setup_trampoline_page_table(); + tboot_probe(); #ifdef CONFIG_X86_64 diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index d801210945d6..513deac7228d 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -159,10 +159,10 @@ asmlinkage void smp_reboot_interrupt(void) irq_exit(); } -static void native_smp_send_stop(void) +static void native_stop_other_cpus(int wait) { unsigned long flags; - unsigned long wait; + unsigned long timeout; if (reboot_force) return; @@ -179,9 +179,12 @@ static void native_smp_send_stop(void) if (num_online_cpus() > 1) { apic->send_IPI_allbutself(REBOOT_VECTOR); - /* Don't wait longer than a second */ - wait = USEC_PER_SEC; - while (num_online_cpus() > 1 && wait--) + /* + * Don't wait longer than a second if the caller + * didn't ask us to wait. + */ + timeout = USEC_PER_SEC; + while (num_online_cpus() > 1 && (wait || timeout--)) udelay(1); } @@ -227,7 +230,7 @@ struct smp_ops smp_ops = { .smp_prepare_cpus = native_smp_prepare_cpus, .smp_cpus_done = native_smp_cpus_done, - .smp_send_stop = native_smp_send_stop, + .stop_other_cpus = native_stop_other_cpus, .smp_send_reschedule = native_smp_send_reschedule, .cpu_up = native_cpu_up, diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 11015fd1abbc..40eb0f94e8c5 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -73,7 +73,6 @@ #ifdef CONFIG_X86_32 u8 apicid_2_node[MAX_APICID]; -static int low_mappings; #endif /* State of each CPU */ @@ -91,6 +90,25 @@ DEFINE_PER_CPU(int, cpu_state) = { 0 }; static DEFINE_PER_CPU(struct task_struct *, idle_thread_array); #define get_idle_for_cpu(x) (per_cpu(idle_thread_array, x)) #define set_idle_for_cpu(x, p) (per_cpu(idle_thread_array, x) = (p)) + +/* + * We need this for trampoline_base protection from concurrent accesses when + * off- and onlining cores wildly. + */ +static DEFINE_MUTEX(x86_cpu_hotplug_driver_mutex); + +void cpu_hotplug_driver_lock() +{ + mutex_lock(&x86_cpu_hotplug_driver_mutex); +} + +void cpu_hotplug_driver_unlock() +{ + mutex_unlock(&x86_cpu_hotplug_driver_mutex); +} + +ssize_t arch_cpu_probe(const char *buf, size_t count) { return -1; } +ssize_t arch_cpu_release(const char *buf, size_t count) { return -1; } #else static struct task_struct *idle_thread_array[NR_CPUS] __cpuinitdata ; #define get_idle_for_cpu(x) (idle_thread_array[(x)]) @@ -281,6 +299,18 @@ notrace static void __cpuinit start_secondary(void *unused) * fragile that we want to limit the things done here to the * most necessary things. */ + +#ifdef CONFIG_X86_32 + /* + * Switch away from the trampoline page-table + * + * Do this before cpu_init() because it needs to access per-cpu + * data which may not be mapped in the trampoline page-table. + */ + load_cr3(swapper_pg_dir); + __flush_tlb_all(); +#endif + vmi_bringup(); cpu_init(); preempt_disable(); @@ -299,12 +329,6 @@ notrace static void __cpuinit start_secondary(void *unused) legacy_pic->chip->unmask(0); } -#ifdef CONFIG_X86_32 - while (low_mappings) - cpu_relax(); - __flush_tlb_all(); -#endif - /* This must be done before setting cpu_online_mask */ set_cpu_sibling_map(raw_smp_processor_id()); wmb(); @@ -754,6 +778,7 @@ do_rest: #ifdef CONFIG_X86_32 /* Stack for startup_32 can be just as for start_secondary onwards */ irq_ctx_init(cpu); + initial_page_table = __pa(&trampoline_pg_dir); #else clear_tsk_thread_flag(c_idle.idle, TIF_FORK); initial_gs = per_cpu_offset(cpu); @@ -901,20 +926,8 @@ int __cpuinit native_cpu_up(unsigned int cpu) per_cpu(cpu_state, cpu) = CPU_UP_PREPARE; -#ifdef CONFIG_X86_32 - /* init low mem mapping */ - clone_pgd_range(swapper_pg_dir, swapper_pg_dir + KERNEL_PGD_BOUNDARY, - min_t(unsigned long, KERNEL_PGD_PTRS, KERNEL_PGD_BOUNDARY)); - flush_tlb_all(); - low_mappings = 1; - err = do_boot_cpu(apicid, cpu); - zap_low_mappings(false); - low_mappings = 0; -#else - err = do_boot_cpu(apicid, cpu); -#endif if (err) { pr_debug("do_boot_cpu failed %d\n", err); return -EIO; @@ -1374,11 +1387,94 @@ void play_dead_common(void) local_irq_disable(); } +#define MWAIT_SUBSTATE_MASK 0xf +#define MWAIT_SUBSTATE_SIZE 4 + +#define CPUID_MWAIT_LEAF 5 +#define CPUID5_ECX_EXTENSIONS_SUPPORTED 0x1 + +/* + * We need to flush the caches before going to sleep, lest we have + * dirty data in our caches when we come back up. + */ +static inline void mwait_play_dead(void) +{ + unsigned int eax, ebx, ecx, edx; + unsigned int highest_cstate = 0; + unsigned int highest_subcstate = 0; + int i; + void *mwait_ptr; + + if (!cpu_has(¤t_cpu_data, X86_FEATURE_MWAIT)) + return; + if (!cpu_has(¤t_cpu_data, X86_FEATURE_CLFLSH)) + return; + if (current_cpu_data.cpuid_level < CPUID_MWAIT_LEAF) + return; + + eax = CPUID_MWAIT_LEAF; + ecx = 0; + native_cpuid(&eax, &ebx, &ecx, &edx); + + /* + * eax will be 0 if EDX enumeration is not valid. + * Initialized below to cstate, sub_cstate value when EDX is valid. + */ + if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED)) { + eax = 0; + } else { + edx >>= MWAIT_SUBSTATE_SIZE; + for (i = 0; i < 7 && edx; i++, edx >>= MWAIT_SUBSTATE_SIZE) { + if (edx & MWAIT_SUBSTATE_MASK) { + highest_cstate = i; + highest_subcstate = edx & MWAIT_SUBSTATE_MASK; + } + } + eax = (highest_cstate << MWAIT_SUBSTATE_SIZE) | + (highest_subcstate - 1); + } + + /* + * This should be a memory location in a cache line which is + * unlikely to be touched by other processors. The actual + * content is immaterial as it is not actually modified in any way. + */ + mwait_ptr = ¤t_thread_info()->flags; + + wbinvd(); + + while (1) { + /* + * The CLFLUSH is a workaround for erratum AAI65 for + * the Xeon 7400 series. It's not clear it is actually + * needed, but it should be harmless in either case. + * The WBINVD is insufficient due to the spurious-wakeup + * case where we return around the loop. + */ + clflush(mwait_ptr); + __monitor(mwait_ptr, 0, 0); + mb(); + __mwait(eax, 0); + } +} + +static inline void hlt_play_dead(void) +{ + if (current_cpu_data.x86 >= 4) + wbinvd(); + + while (1) { + native_halt(); + } +} + void native_play_dead(void) { play_dead_common(); tboot_shutdown(TB_SHUTDOWN_WFS); - wbinvd_halt(); + + mwait_play_dead(); /* Only returns on failure */ + hlt_play_dead(); } #else /* ... !CONFIG_HOTPLUG_CPU */ diff --git a/arch/x86/kernel/trampoline.c b/arch/x86/kernel/trampoline.c index c652ef62742d..e2a595257390 100644 --- a/arch/x86/kernel/trampoline.c +++ b/arch/x86/kernel/trampoline.c @@ -1,6 +1,7 @@ #include <linux/io.h> #include <asm/trampoline.h> +#include <asm/pgtable.h> #include <asm/e820.h> #if defined(CONFIG_X86_64) && defined(CONFIG_ACPI_SLEEP) @@ -37,3 +38,19 @@ unsigned long __trampinit setup_trampoline(void) memcpy(trampoline_base, trampoline_data, TRAMPOLINE_SIZE); return virt_to_phys(trampoline_base); } + +void __init setup_trampoline_page_table(void) +{ +#ifdef CONFIG_X86_32 + /* Copy kernel address range */ + clone_pgd_range(trampoline_pg_dir + KERNEL_PGD_BOUNDARY, + swapper_pg_dir + KERNEL_PGD_BOUNDARY, + KERNEL_PGD_PTRS); + + /* Initialize low mappings */ + clone_pgd_range(trampoline_pg_dir, + swapper_pg_dir + KERNEL_PGD_BOUNDARY, + min_t(unsigned long, KERNEL_PGD_PTRS, + KERNEL_PGD_BOUNDARY)); +#endif +} diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 725ef4d17cd5..4d0f3ed34c41 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -568,6 +568,7 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) if (regs->flags & X86_VM_MASK) { handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1); + preempt_conditional_cli(regs); return; } diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 9faf91ae1841..97cdbe8b732c 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -104,10 +104,14 @@ int __init notsc_setup(char *str) __setup("notsc", notsc_setup); +static int no_sched_irq_time; + static int __init tsc_setup(char *str) { if (!strcmp(str, "reliable")) tsc_clocksource_reliable = 1; + if (!strncmp(str, "noirqtime", 9)) + no_sched_irq_time = 1; return 1; } @@ -626,6 +630,44 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu) local_irq_restore(flags); } +static unsigned long long cyc2ns_suspend; + +void save_sched_clock_state(void) +{ + if (!sched_clock_stable) + return; + + cyc2ns_suspend = sched_clock(); +} + +/* + * Even on processors with invariant TSC, TSC gets reset in some the + * ACPI system sleep states. And in some systems BIOS seem to reinit TSC to + * arbitrary value (still sync'd across cpu's) during resume from such sleep + * states. To cope up with this, recompute the cyc2ns_offset for each cpu so + * that sched_clock() continues from the point where it was left off during + * suspend. + */ +void restore_sched_clock_state(void) +{ + unsigned long long offset; + unsigned long flags; + int cpu; + + if (!sched_clock_stable) + return; + + local_irq_save(flags); + + __get_cpu_var(cyc2ns_offset) = 0; + offset = cyc2ns_suspend - sched_clock(); + + for_each_possible_cpu(cpu) + per_cpu(cyc2ns_offset, cpu) = offset; + + local_irq_restore(flags); +} + #ifdef CONFIG_CPU_FREQ /* Frequency scaling support. Adjust the TSC based timer when the cpu frequency @@ -764,6 +806,7 @@ void mark_tsc_unstable(char *reason) if (!tsc_unstable) { tsc_unstable = 1; sched_clock_stable = 0; + disable_sched_clock_irqtime(); printk(KERN_INFO "Marking TSC unstable due to %s\n", reason); /* Change only the rating, when not registered */ if (clocksource_tsc.mult) @@ -952,6 +995,9 @@ void __init tsc_init(void) /* now allow native_sched_clock() to use rdtsc */ tsc_disabled = 0; + if (!no_sched_irq_time) + enable_sched_clock_irqtime(); + lpj = ((u64)tsc_khz * 1000); do_div(lpj, HZ); lpj_fine = lpj; diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 5ffb5622f793..61fb98519622 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -551,8 +551,14 @@ cannot_handle: int handle_vm86_trap(struct kernel_vm86_regs *regs, long error_code, int trapno) { if (VMPI.is_vm86pus) { - if ((trapno == 3) || (trapno == 1)) - return_to_32bit(regs, VM86_TRAP + (trapno << 8)); + if ((trapno == 3) || (trapno == 1)) { + KVM86->regs32->ax = VM86_TRAP + (trapno << 8); + /* setting this flag forces the code in entry_32.S to + call save_v86_state() and change the stack pointer + to KVM86->regs32 */ + set_thread_flag(TIF_IRET); + return 0; + } do_int(regs, trapno, (unsigned char __user *) (regs->pt.ss << 4), SP(regs)); return 0; } diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index 37e68fc5e24a..aa8bf4fcf725 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -305,7 +305,8 @@ void __cpuinit xsave_init(void) */ static void __init setup_xstate_init(void) { - init_xstate_buf = alloc_bootmem(xstate_size); + init_xstate_buf = alloc_bootmem_align(xstate_size, + __alignof__(struct xsave_struct)); init_xstate_buf->i387.mxcsr = MXCSR_DEFAULT; } diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 5ac0bb465ed6..582c8fcad2e1 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -345,10 +345,10 @@ static u32 group_table[] = { DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, 0, 0, 0, 0, [Group4*8] = - ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM, + ByteOp | DstMem | SrcNone | ModRM | Lock, ByteOp | DstMem | SrcNone | ModRM | Lock, 0, 0, 0, 0, 0, 0, [Group5*8] = - DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, + DstMem | SrcNone | ModRM | Lock, DstMem | SrcNone | ModRM | Lock, SrcMem | ModRM | Stack, 0, SrcMem | ModRM | Stack, SrcMem | ModRM | Src2Mem16 | ImplicitOps, SrcMem | ModRM | Stack, 0, @@ -1712,17 +1712,16 @@ static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) { struct decode_cache *c = &ctxt->decode; - u64 old = c->dst.orig_val; + u64 old = c->dst.orig_val64; if (((u32) (old >> 0) != (u32) c->regs[VCPU_REGS_RAX]) || ((u32) (old >> 32) != (u32) c->regs[VCPU_REGS_RDX])) { - c->regs[VCPU_REGS_RAX] = (u32) (old >> 0); c->regs[VCPU_REGS_RDX] = (u32) (old >> 32); ctxt->eflags &= ~EFLG_ZF; } else { - c->dst.val = ((u64)c->regs[VCPU_REGS_RCX] << 32) | - (u32) c->regs[VCPU_REGS_RBX]; + c->dst.val64 = ((u64)c->regs[VCPU_REGS_RCX] << 32) | + (u32) c->regs[VCPU_REGS_RBX]; ctxt->eflags |= EFLG_ZF; } @@ -2535,7 +2534,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) ctxt->vcpu); if (rc != X86EMUL_CONTINUE) goto done; - c->src.orig_val = c->src.val; + c->src.orig_val64 = c->src.val64; } if (c->src2.type == OP_MEM) { diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 93825ff3338f..8a3324668cac 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -553,6 +553,8 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm) s->irq_request_opaque = kvm; s->pics[0].pics_state = s; s->pics[1].pics_state = s; + s->pics[0].isr_ack = 0xff; + s->pics[1].isr_ack = 0xff; /* * Initialize PIO device diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index cd1f362f413d..4227b1aed869 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -45,7 +45,6 @@ struct kvm_kpic_state { u8 irr; /* interrupt request register */ u8 imr; /* interrupt mask register */ u8 isr; /* interrupt service register */ - u8 isr_ack; /* interrupt ack detection */ u8 priority_add; /* highest irq priority */ u8 irq_base; u8 read_reg_select; @@ -58,6 +57,7 @@ struct kvm_kpic_state { u8 init4; /* true if 4 byte init */ u8 elcr; /* PIIX edge/trigger selection */ u8 elcr_mask; + u8 isr_ack; /* interrupt ack detection */ struct kvm_pic *pics_state; }; diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index b1ed0a1a5913..92b6ca4fb54d 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -799,8 +799,12 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, ret = handler(kvm, &memslot->rmap[gfn_offset], data); for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) { - int idx = gfn_offset; - idx /= KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL + j); + unsigned long idx; + int nr; + + nr = KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL+j); + idx = (memslot->base_gfn+gfn_offset) / nr - + memslot->base_gfn / nr; ret |= handler(kvm, &memslot->lpage_info[j][idx].rmap_pde, data); diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 2331bdc2b549..e34452b92928 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -324,8 +324,32 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, break; } - if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) - continue; + if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) { + struct kvm_mmu_page *child; + unsigned direct_access; + + if (level != gw->level) + continue; + + /* + * For the direct sp, if the guest pte's dirty bit + * changed form clean to dirty, it will corrupt the + * sp's access: allow writable in the read-only sp, + * so we should update the spte at this point to get + * a new sp with the correct access. + */ + direct_access = gw->pt_access & gw->pte_access; + if (!is_dirty_gpte(gw->ptes[gw->level - 1])) + direct_access &= ~ACC_WRITE_MASK; + + child = page_header(*sptep & PT64_BASE_ADDR_MASK); + if (child->role.access == direct_access) + continue; + + mmu_page_remove_parent_pte(child, sptep); + __set_spte(sptep, shadow_trap_nonpresent_pte); + kvm_flush_remote_tlbs(vcpu->kvm); + } if (is_large_pte(*sptep)) { rmap_remove(vcpu->kvm, sptep); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index ce438e0fdd26..d103e15c64f3 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -87,6 +87,14 @@ struct nested_state { /* A VMEXIT is required but not yet emulated */ bool exit_required; + /* + * If we vmexit during an instruction emulation we need this to restore + * the l1 guest rip after the emulation + */ + unsigned long vmexit_rip; + unsigned long vmexit_rsp; + unsigned long vmexit_rax; + /* cache for intercepts of the guest */ u16 intercept_cr_read; u16 intercept_cr_write; @@ -766,7 +774,6 @@ static void init_vmcb(struct vcpu_svm *svm) control->iopm_base_pa = iopm_base; control->msrpm_base_pa = __pa(svm->msrpm); - control->tsc_offset = 0; control->int_ctl = V_INTR_MASKING_MASK; init_seg(&save->es); @@ -902,6 +909,7 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT; svm->asid_generation = 0; init_vmcb(svm); + svm->vmcb->control.tsc_offset = 0-native_read_tsc(); fx_init(&svm->vcpu); svm->vcpu.arch.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; @@ -1201,8 +1209,12 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) if (old == new) { /* cr0 write with ts and mp unchanged */ svm->vmcb->control.exit_code = SVM_EXIT_CR0_SEL_WRITE; - if (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE) + if (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE) { + svm->nested.vmexit_rip = kvm_rip_read(vcpu); + svm->nested.vmexit_rsp = kvm_register_read(vcpu, VCPU_REGS_RSP); + svm->nested.vmexit_rax = kvm_register_read(vcpu, VCPU_REGS_RAX); return; + } } } @@ -2398,6 +2410,23 @@ static int emulate_on_interception(struct vcpu_svm *svm) return 1; } +static int cr0_write_interception(struct vcpu_svm *svm) +{ + struct kvm_vcpu *vcpu = &svm->vcpu; + int r; + + r = emulate_instruction(&svm->vcpu, 0, 0, 0); + + if (svm->nested.vmexit_rip) { + kvm_register_write(vcpu, VCPU_REGS_RIP, svm->nested.vmexit_rip); + kvm_register_write(vcpu, VCPU_REGS_RSP, svm->nested.vmexit_rsp); + kvm_register_write(vcpu, VCPU_REGS_RAX, svm->nested.vmexit_rax); + svm->nested.vmexit_rip = 0; + } + + return r == EMULATE_DONE; +} + static int cr8_write_interception(struct vcpu_svm *svm) { struct kvm_run *kvm_run = svm->vcpu.run; @@ -2671,7 +2700,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = { [SVM_EXIT_READ_CR4] = emulate_on_interception, [SVM_EXIT_READ_CR8] = emulate_on_interception, [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception, - [SVM_EXIT_WRITE_CR0] = emulate_on_interception, + [SVM_EXIT_WRITE_CR0] = cr0_write_interception, [SVM_EXIT_WRITE_CR3] = emulate_on_interception, [SVM_EXIT_WRITE_CR4] = emulate_on_interception, [SVM_EXIT_WRITE_CR8] = cr8_write_interception, @@ -3067,8 +3096,8 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) sync_lapic_to_cr8(vcpu); save_host_msrs(vcpu); - fs_selector = kvm_read_fs(); - gs_selector = kvm_read_gs(); + savesegment(fs, fs_selector); + savesegment(gs, gs_selector); ldt_selector = kvm_read_ldt(); svm->vmcb->save.cr2 = vcpu->arch.cr2; /* required for live migration with NPT */ @@ -3155,10 +3184,15 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp; vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip; - kvm_load_fs(fs_selector); - kvm_load_gs(gs_selector); - kvm_load_ldt(ldt_selector); load_host_msrs(vcpu); + kvm_load_ldt(ldt_selector); + loadsegment(fs, fs_selector); +#ifdef CONFIG_X86_64 + load_gs_index(gs_selector); + wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gs); +#else + loadsegment(gs, gs_selector); +#endif reload_tss(vcpu); @@ -3253,6 +3287,10 @@ static void svm_cpuid_update(struct kvm_vcpu *vcpu) static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) { switch (func) { + case 0x80000001: + if (nested) + entry->ecx |= (1 << 2); /* Set SVM bit */ + break; case 0x8000000A: entry->eax = 1; /* SVM revision 1 */ entry->ebx = 8; /* Lets support 8 ASIDs in case we add proper diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index ee03679efe78..f1abe23da5d3 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -177,6 +177,7 @@ static u64 construct_eptp(unsigned long root_hpa); static DEFINE_PER_CPU(struct vmcs *, vmxarea); static DEFINE_PER_CPU(struct vmcs *, current_vmcs); static DEFINE_PER_CPU(struct list_head, vcpus_on_cpu); +static DEFINE_PER_CPU(struct desc_ptr, host_gdt); static unsigned long *vmx_io_bitmap_a; static unsigned long *vmx_io_bitmap_b; @@ -745,7 +746,7 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu) */ vmx->host_state.ldt_sel = kvm_read_ldt(); vmx->host_state.gs_ldt_reload_needed = vmx->host_state.ldt_sel; - vmx->host_state.fs_sel = kvm_read_fs(); + savesegment(fs, vmx->host_state.fs_sel); if (!(vmx->host_state.fs_sel & 7)) { vmcs_write16(HOST_FS_SELECTOR, vmx->host_state.fs_sel); vmx->host_state.fs_reload_needed = 0; @@ -753,7 +754,7 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu) vmcs_write16(HOST_FS_SELECTOR, 0); vmx->host_state.fs_reload_needed = 1; } - vmx->host_state.gs_sel = kvm_read_gs(); + savesegment(gs, vmx->host_state.gs_sel); if (!(vmx->host_state.gs_sel & 7)) vmcs_write16(HOST_GS_SELECTOR, vmx->host_state.gs_sel); else { @@ -770,10 +771,9 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu) #endif #ifdef CONFIG_X86_64 - if (is_long_mode(&vmx->vcpu)) { - rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base); + rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base); + if (is_long_mode(&vmx->vcpu)) wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); - } #endif for (i = 0; i < vmx->save_nmsrs; ++i) kvm_set_shared_msr(vmx->guest_msrs[i].index, @@ -783,35 +783,30 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu) static void __vmx_load_host_state(struct vcpu_vmx *vmx) { - unsigned long flags; - if (!vmx->host_state.loaded) return; ++vmx->vcpu.stat.host_state_reload; vmx->host_state.loaded = 0; - if (vmx->host_state.fs_reload_needed) - kvm_load_fs(vmx->host_state.fs_sel); +#ifdef CONFIG_X86_64 + if (is_long_mode(&vmx->vcpu)) + rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); +#endif if (vmx->host_state.gs_ldt_reload_needed) { kvm_load_ldt(vmx->host_state.ldt_sel); - /* - * If we have to reload gs, we must take care to - * preserve our gs base. - */ - local_irq_save(flags); - kvm_load_gs(vmx->host_state.gs_sel); #ifdef CONFIG_X86_64 - wrmsrl(MSR_GS_BASE, vmcs_readl(HOST_GS_BASE)); + load_gs_index(vmx->host_state.gs_sel); +#else + loadsegment(gs, vmx->host_state.gs_sel); #endif - local_irq_restore(flags); } + if (vmx->host_state.fs_reload_needed) + loadsegment(fs, vmx->host_state.fs_sel); reload_tss(); #ifdef CONFIG_X86_64 - if (is_long_mode(&vmx->vcpu)) { - rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); - wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base); - } + wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base); #endif + load_gdt(&__get_cpu_var(host_gdt)); } static void vmx_load_host_state(struct vcpu_vmx *vmx) @@ -1314,6 +1309,8 @@ static int hardware_enable(void *garbage) ept_sync_global(); + store_gdt(&__get_cpu_var(host_gdt)); + return 0; } @@ -2514,8 +2511,8 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */ vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */ - vmcs_write16(HOST_FS_SELECTOR, kvm_read_fs()); /* 22.2.4 */ - vmcs_write16(HOST_GS_SELECTOR, kvm_read_gs()); /* 22.2.4 */ + vmcs_write16(HOST_FS_SELECTOR, 0); /* 22.2.4 */ + vmcs_write16(HOST_GS_SELECTOR, 0); /* 22.2.4 */ vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ #ifdef CONFIG_X86_64 rdmsrl(MSR_FS_BASE, a); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 7fa89c39c64f..eee5cdd29bc2 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1910,9 +1910,9 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, 0 /* Reserved, XSAVE, OSXSAVE */; /* cpuid 0x80000001.ecx */ const u32 kvm_supported_word6_x86_features = - F(LAHF_LM) | F(CMP_LEGACY) | F(SVM) | 0 /* ExtApicSpace */ | + F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ | F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) | - F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(SSE5) | + F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(XOP) | 0 /* SKINIT */ | 0 /* WDT */; /* all calls to cpuid_count() should be made on the same cpu */ @@ -2220,6 +2220,7 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, !kvm_exception_is_soft(vcpu->arch.exception.nr); events->exception.nr = vcpu->arch.exception.nr; events->exception.has_error_code = vcpu->arch.exception.has_error_code; + events->exception.pad = 0; events->exception.error_code = vcpu->arch.exception.error_code; events->interrupt.injected = @@ -2233,13 +2234,14 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, events->nmi.injected = vcpu->arch.nmi_injected; events->nmi.pending = vcpu->arch.nmi_pending; events->nmi.masked = kvm_x86_ops->get_nmi_mask(vcpu); + events->nmi.pad = 0; events->sipi_vector = vcpu->arch.sipi_vector; events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING | KVM_VCPUEVENT_VALID_SIPI_VECTOR | KVM_VCPUEVENT_VALID_SHADOW); - + memset(&events->reserved, 0, sizeof(events->reserved)); vcpu_put(vcpu); } @@ -2289,6 +2291,7 @@ static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu, dbgregs->dr6 = vcpu->arch.dr6; dbgregs->dr7 = vcpu->arch.dr7; dbgregs->flags = 0; + memset(&dbgregs->reserved, 0, sizeof(dbgregs->reserved)); vcpu_put(vcpu); } @@ -2756,6 +2759,7 @@ static int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps) sizeof(ps->channels)); ps->flags = kvm->arch.vpit->pit_state.flags; mutex_unlock(&kvm->arch.vpit->pit_state.lock); + memset(&ps->reserved, 0, sizeof(ps->reserved)); return r; } @@ -2825,10 +2829,6 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, if (is_dirty) { struct kvm_memslots *slots, *old_slots; - spin_lock(&kvm->mmu_lock); - kvm_mmu_slot_remove_write_access(kvm, log->slot); - spin_unlock(&kvm->mmu_lock); - slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); if (!slots) goto out_free; @@ -2841,6 +2841,11 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, synchronize_srcu_expedited(&kvm->srcu); dirty_bitmap = old_slots->memslots[log->slot].dirty_bitmap; kfree(old_slots); + + spin_lock(&kvm->mmu_lock); + kvm_mmu_slot_remove_write_access(kvm, log->slot); + spin_unlock(&kvm->mmu_lock); + } r = 0; @@ -3152,6 +3157,7 @@ long kvm_arch_vm_ioctl(struct file *filp, now_ns = timespec_to_ns(&now); user_ns.clock = kvm->arch.kvmclock_offset + now_ns; user_ns.flags = 0; + memset(&user_ns.pad, 0, sizeof(user_ns.pad)); r = -EFAULT; if (copy_to_user(argp, &user_ns, sizeof(user_ns))) @@ -5438,6 +5444,11 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, int user_alloc) { int npages = memslot->npages; + int map_flags = MAP_PRIVATE | MAP_ANONYMOUS; + + /* Prevent internal slot pages from being moved by fork()/COW. */ + if (memslot->id >= KVM_MEMORY_SLOTS) + map_flags = MAP_SHARED | MAP_ANONYMOUS; /*To keep backward compatibility with older userspace, *x86 needs to hanlde !user_alloc case. @@ -5450,7 +5461,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, userspace_addr = do_mmap(NULL, 0, npages * PAGE_SIZE, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, + map_flags, 0); up_write(¤t->mm->mmap_sem); diff --git a/arch/x86/lib/atomic64_386_32.S b/arch/x86/lib/atomic64_386_32.S index 4a5979aa6883..2cda60a06e65 100644 --- a/arch/x86/lib/atomic64_386_32.S +++ b/arch/x86/lib/atomic64_386_32.S @@ -25,150 +25,172 @@ CFI_ADJUST_CFA_OFFSET -4 .endm -.macro BEGIN func reg -$v = \reg - -ENTRY(atomic64_\func\()_386) - CFI_STARTPROC - LOCK $v - -.macro RETURN - UNLOCK $v +#define BEGIN(op) \ +.macro endp; \ + CFI_ENDPROC; \ +ENDPROC(atomic64_##op##_386); \ +.purgem endp; \ +.endm; \ +ENTRY(atomic64_##op##_386); \ + CFI_STARTPROC; \ + LOCK v; + +#define ENDP endp + +#define RET \ + UNLOCK v; \ ret -.endm - -.macro END_ - CFI_ENDPROC -ENDPROC(atomic64_\func\()_386) -.purgem RETURN -.purgem END_ -.purgem END -.endm - -.macro END -RETURN -END_ -.endm -.endm -BEGIN read %ecx - movl ($v), %eax - movl 4($v), %edx -END - -BEGIN set %esi - movl %ebx, ($v) - movl %ecx, 4($v) -END - -BEGIN xchg %esi - movl ($v), %eax - movl 4($v), %edx - movl %ebx, ($v) - movl %ecx, 4($v) -END - -BEGIN add %ecx - addl %eax, ($v) - adcl %edx, 4($v) -END - -BEGIN add_return %ecx - addl ($v), %eax - adcl 4($v), %edx - movl %eax, ($v) - movl %edx, 4($v) -END - -BEGIN sub %ecx - subl %eax, ($v) - sbbl %edx, 4($v) -END - -BEGIN sub_return %ecx +#define RET_ENDP \ + RET; \ + ENDP + +#define v %ecx +BEGIN(read) + movl (v), %eax + movl 4(v), %edx +RET_ENDP +#undef v + +#define v %esi +BEGIN(set) + movl %ebx, (v) + movl %ecx, 4(v) +RET_ENDP +#undef v + +#define v %esi +BEGIN(xchg) + movl (v), %eax + movl 4(v), %edx + movl %ebx, (v) + movl %ecx, 4(v) +RET_ENDP +#undef v + +#define v %ecx +BEGIN(add) + addl %eax, (v) + adcl %edx, 4(v) +RET_ENDP +#undef v + +#define v %ecx +BEGIN(add_return) + addl (v), %eax + adcl 4(v), %edx + movl %eax, (v) + movl %edx, 4(v) +RET_ENDP +#undef v + +#define v %ecx +BEGIN(sub) + subl %eax, (v) + sbbl %edx, 4(v) +RET_ENDP +#undef v + +#define v %ecx +BEGIN(sub_return) negl %edx negl %eax sbbl $0, %edx - addl ($v), %eax - adcl 4($v), %edx - movl %eax, ($v) - movl %edx, 4($v) -END - -BEGIN inc %esi - addl $1, ($v) - adcl $0, 4($v) -END - -BEGIN inc_return %esi - movl ($v), %eax - movl 4($v), %edx + addl (v), %eax + adcl 4(v), %edx + movl %eax, (v) + movl %edx, 4(v) +RET_ENDP +#undef v + +#define v %esi +BEGIN(inc) + addl $1, (v) + adcl $0, 4(v) +RET_ENDP +#undef v + +#define v %esi +BEGIN(inc_return) + movl (v), %eax + movl 4(v), %edx addl $1, %eax adcl $0, %edx - movl %eax, ($v) - movl %edx, 4($v) -END - -BEGIN dec %esi - subl $1, ($v) - sbbl $0, 4($v) -END - -BEGIN dec_return %esi - movl ($v), %eax - movl 4($v), %edx + movl %eax, (v) + movl %edx, 4(v) +RET_ENDP +#undef v + +#define v %esi +BEGIN(dec) + subl $1, (v) + sbbl $0, 4(v) +RET_ENDP +#undef v + +#define v %esi +BEGIN(dec_return) + movl (v), %eax + movl 4(v), %edx subl $1, %eax sbbl $0, %edx - movl %eax, ($v) - movl %edx, 4($v) -END + movl %eax, (v) + movl %edx, 4(v) +RET_ENDP +#undef v -BEGIN add_unless %ecx +#define v %ecx +BEGIN(add_unless) addl %eax, %esi adcl %edx, %edi - addl ($v), %eax - adcl 4($v), %edx + addl (v), %eax + adcl 4(v), %edx cmpl %eax, %esi je 3f 1: - movl %eax, ($v) - movl %edx, 4($v) + movl %eax, (v) + movl %edx, 4(v) movl $1, %eax 2: -RETURN + RET 3: cmpl %edx, %edi jne 1b xorl %eax, %eax jmp 2b -END_ +ENDP +#undef v -BEGIN inc_not_zero %esi - movl ($v), %eax - movl 4($v), %edx +#define v %esi +BEGIN(inc_not_zero) + movl (v), %eax + movl 4(v), %edx testl %eax, %eax je 3f 1: addl $1, %eax adcl $0, %edx - movl %eax, ($v) - movl %edx, 4($v) + movl %eax, (v) + movl %edx, 4(v) movl $1, %eax 2: -RETURN + RET 3: testl %edx, %edx jne 1b jmp 2b -END_ +ENDP +#undef v -BEGIN dec_if_positive %esi - movl ($v), %eax - movl 4($v), %edx +#define v %esi +BEGIN(dec_if_positive) + movl (v), %eax + movl 4(v), %edx subl $1, %eax sbbl $0, %edx js 1f - movl %eax, ($v) - movl %edx, 4($v) + movl %eax, (v) + movl %edx, 4(v) 1: -END +RET_ENDP +#undef v diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S index 71100c98e337..a4899ae3975f 100644 --- a/arch/x86/lib/copy_user_64.S +++ b/arch/x86/lib/copy_user_64.S @@ -72,7 +72,7 @@ ENTRY(_copy_to_user) addq %rdx,%rcx jc bad_to_user cmpq TI_addr_limit(%rax),%rcx - jae bad_to_user + ja bad_to_user ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string CFI_ENDPROC ENDPROC(_copy_to_user) @@ -85,7 +85,7 @@ ENTRY(_copy_from_user) addq %rdx,%rcx jc bad_from_user cmpq TI_addr_limit(%rax),%rcx - jae bad_from_user + ja bad_from_user ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string CFI_ENDPROC ENDPROC(_copy_from_user) diff --git a/arch/x86/lib/semaphore_32.S b/arch/x86/lib/semaphore_32.S index 648fe4741782..f35eec78a68e 100644 --- a/arch/x86/lib/semaphore_32.S +++ b/arch/x86/lib/semaphore_32.S @@ -36,7 +36,7 @@ */ #ifdef CONFIG_SMP ENTRY(__write_lock_failed) - CFI_STARTPROC simple + CFI_STARTPROC FRAME 2: LOCK_PREFIX addl $ RW_LOCK_BIAS,(%eax) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index f62777940dfb..544ed251a40c 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -802,8 +802,10 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, up_read(&mm->mmap_sem); /* Kernel mode? Handle exceptions or die: */ - if (!(error_code & PF_USER)) + if (!(error_code & PF_USER)) { no_context(regs, error_code, address); + return; + } /* User-space => ok to do another page fault: */ if (is_prefetch(regs, error_code, address)) @@ -829,6 +831,13 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code, unsigned long address, unsigned int fault) { if (fault & VM_FAULT_OOM) { + /* Kernel mode? Handle exceptions or die: */ + if (!(error_code & PF_USER)) { + up_read(¤t->mm->mmap_sem); + no_context(regs, error_code, address); + return; + } + out_of_memory(regs, error_code, address); } else { if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON)) diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 5c4ee422590e..2a72049068e7 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -160,8 +160,7 @@ void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd) * section 8.1: in PAE mode we explicitly have to flush the * TLB via cr3 if the top-level pgd is changed... */ - if (mm == current->active_mm) - write_cr3(read_cr3()); + flush_tlb_mm(mm); } #else /* !CONFIG_X86_PAE */ diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c index f9897f7a9ef1..9c0d0d399c30 100644 --- a/arch/x86/mm/srat_64.c +++ b/arch/x86/mm/srat_64.c @@ -420,9 +420,11 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end) return -1; } - for_each_node_mask(i, nodes_parsed) - e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT, - nodes[i].end >> PAGE_SHIFT); + for (i = 0; i < num_node_memblks; i++) + e820_register_active_regions(memblk_nodeid[i], + node_memblk_range[i].start >> PAGE_SHIFT, + node_memblk_range[i].end >> PAGE_SHIFT); + /* for out of order entries in SRAT */ sort_node_map(); if (!nodes_cover_memory(nodes)) { diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index b28d2f1253bb..f1575c9a2572 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -568,8 +568,13 @@ static int __init init_sysfs(void) int error; error = sysdev_class_register(&oprofile_sysclass); - if (!error) - error = sysdev_register(&device_oprofile); + if (error) + return error; + + error = sysdev_register(&device_oprofile); + if (error) + sysdev_class_unregister(&oprofile_sysclass); + return error; } @@ -580,8 +585,10 @@ static void exit_sysfs(void) } #else -#define init_sysfs() do { } while (0) -#define exit_sysfs() do { } while (0) + +static inline int init_sysfs(void) { return 0; } +static inline void exit_sysfs(void) { } + #endif /* CONFIG_PM */ static int __init p4_init(char **cpu_type) @@ -634,6 +641,18 @@ static int __init ppro_init(char **cpu_type) if (force_arch_perfmon && cpu_has_arch_perfmon) return 0; + /* + * Documentation on identifying Intel processors by CPU family + * and model can be found in the Intel Software Developer's + * Manuals (SDM): + * + * http://www.intel.com/products/processor/manuals/ + * + * As of May 2010 the documentation for this was in the: + * "Intel 64 and IA-32 Architectures Software Developer's + * Manual Volume 3B: System Programming Guide", "Table B-1 + * CPUID Signature Values of DisplayFamily_DisplayModel". + */ switch (cpu_model) { case 0 ... 2: *cpu_type = "i386/ppro"; @@ -652,15 +671,19 @@ static int __init ppro_init(char **cpu_type) case 14: *cpu_type = "i386/core"; break; - case 15: case 23: + case 0x0f: + case 0x16: + case 0x17: + case 0x1d: *cpu_type = "i386/core_2"; break; + case 0x1a: + case 0x1e: case 0x2e: - case 26: spec = &op_arch_perfmon_spec; *cpu_type = "i386/core_i7"; break; - case 28: + case 0x1c: *cpu_type = "i386/atom"; break; default: @@ -682,6 +705,8 @@ int __init op_nmi_init(struct oprofile_operations *ops) char *cpu_type = NULL; int ret = 0; + using_nmi = 0; + if (!cpu_has_apic) return -ENODEV; @@ -761,7 +786,10 @@ int __init op_nmi_init(struct oprofile_operations *ops) mux_init(ops); - init_sysfs(); + ret = init_sysfs(); + if (ret) + return ret; + using_nmi = 1; printk(KERN_INFO "oprofile: using NMI interrupt.\n"); return 0; diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index b67a6b5aa8d4..42623310c968 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -484,21 +484,29 @@ static int __init_ibs_nmi(void) return 0; } -/* initialize the APIC for the IBS interrupts if available */ +/* + * check and reserve APIC extended interrupt LVT offset for IBS if + * available + * + * init_ibs() preforms implicitly cpu-local operations, so pin this + * thread to its current CPU + */ + static void init_ibs(void) { - ibs_caps = get_ibs_caps(); + preempt_disable(); + ibs_caps = get_ibs_caps(); if (!ibs_caps) - return; + goto out; - if (__init_ibs_nmi()) { + if (__init_ibs_nmi() < 0) ibs_caps = 0; - return; - } + else + printk(KERN_INFO "oprofile: AMD IBS detected (0x%08x)\n", ibs_caps); - printk(KERN_INFO "oprofile: AMD IBS detected (0x%08x)\n", - (unsigned)ibs_caps); +out: + preempt_enable(); } static int (*create_arch_files)(struct super_block *sb, struct dentry *root); diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index 2ec04c424a62..15466c096ba5 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -34,6 +34,15 @@ static const struct dmi_system_id pci_use_crs_table[] __initconst = { DMI_MATCH(DMI_PRODUCT_NAME, "x3800"), }, }, + /* https://bugzilla.kernel.org/show_bug.cgi?id=16007 */ + /* 2006 AMD HT/VIA system with two host bridges */ + { + .callback = set_use_crs, + .ident = "ASRock ALiveSATA2-GLAN", + .matches = { + DMI_MATCH(DMI_PRODUCT_NAME, "ALiveSATA2-GLAN"), + }, + }, {} }; diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index 1290ba54b350..9c57cb1b33f1 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c @@ -113,6 +113,7 @@ static void __save_processor_state(struct saved_context *ctxt) void save_processor_state(void) { __save_processor_state(&saved_context); + save_sched_clock_state(); } #ifdef CONFIG_X86_32 EXPORT_SYMBOL(save_processor_state); @@ -229,6 +230,7 @@ static void __restore_processor_state(struct saved_context *ctxt) void restore_processor_state(void) { __restore_processor_state(&saved_context); + restore_sched_clock_state(); } #ifdef CONFIG_X86_32 EXPORT_SYMBOL(restore_processor_state); diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile index 6b4ffedb93c9..dd78ef687c5e 100644 --- a/arch/x86/vdso/Makefile +++ b/arch/x86/vdso/Makefile @@ -25,7 +25,7 @@ targets += vdso.so vdso.so.dbg vdso.lds $(vobjs-y) export CPPFLAGS_vdso.lds += -P -C -VDSO_LDFLAGS_vdso.lds = -m elf_x86_64 -Wl,-soname=linux-vdso.so.1 \ +VDSO_LDFLAGS_vdso.lds = -m64 -Wl,-soname=linux-vdso.so.1 \ -Wl,-z,max-page-size=4096 -Wl,-z,common-page-size=4096 $(obj)/vdso.o: $(src)/vdso.S $(obj)/vdso.so @@ -69,7 +69,7 @@ vdso32.so-$(VDSO32-y) += sysenter vdso32-images = $(vdso32.so-y:%=vdso32-%.so) CPPFLAGS_vdso32.lds = $(CPPFLAGS_vdso.lds) -VDSO_LDFLAGS_vdso32.lds = -m elf_i386 -Wl,-soname=linux-gate.so.1 +VDSO_LDFLAGS_vdso32.lds = -m32 -Wl,-soname=linux-gate.so.1 # This makes sure the $(obj) subdirectory exists even though vdso32/ # is not a kbuild sub-make subdirectory. diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 65d8d79b46a8..25d787c17ad8 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -927,7 +927,7 @@ static const struct pv_init_ops xen_init_ops __initdata = { }; static const struct pv_time_ops xen_time_ops __initdata = { - .sched_clock = xen_sched_clock, + .sched_clock = xen_clocksource_read, }; static const struct pv_cpu_ops xen_cpu_ops __initdata = { @@ -1000,10 +1000,6 @@ static void xen_reboot(int reason) { struct sched_shutdown r = { .reason = reason }; -#ifdef CONFIG_SMP - smp_send_stop(); -#endif - if (HYPERVISOR_sched_op(SCHEDOP_shutdown, &r)) BUG(); } diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 914f04695ce5..96cdf7806972 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1142,7 +1142,7 @@ static void drop_other_mm_ref(void *info) active_mm = percpu_read(cpu_tlbstate.active_mm); - if (active_mm == mm) + if (active_mm == mm && percpu_read(cpu_tlbstate.state) != TLBSTATE_OK) leave_mm(smp_processor_id()); /* If this cpu still has a stale cr3 reference, then make sure @@ -1641,8 +1641,10 @@ static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn) for (pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) { pte_t pte; +#ifdef CONFIG_X86_32 if (pfn > max_pfn_mapped) max_pfn_mapped = pfn; +#endif if (!pte_none(pte_page[pteidx])) continue; @@ -1687,6 +1689,12 @@ __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, pud_t *l3; pmd_t *l2; + /* max_pfn_mapped is the last pfn mapped in the initial memory + * mappings. Considering that on Xen after the kernel mappings we + * have the mappings of some pages that don't exist in pfn space, we + * set max_pfn_mapped to the last real pfn mapped. */ + max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->mfn_list)); + /* Zap identity mapping */ init_level4_pgt[0] = __pgd(0); diff --git a/arch/x86/xen/multicalls.c b/arch/x86/xen/multicalls.c index 8bff7e7c290b..1b2b73ff0a6e 100644 --- a/arch/x86/xen/multicalls.c +++ b/arch/x86/xen/multicalls.c @@ -189,10 +189,10 @@ struct multicall_space __xen_mc_entry(size_t args) unsigned argidx = roundup(b->argidx, sizeof(u64)); BUG_ON(preemptible()); - BUG_ON(b->argidx > MC_ARGS); + BUG_ON(b->argidx >= MC_ARGS); if (b->mcidx == MC_BATCH || - (argidx + args) > MC_ARGS) { + (argidx + args) >= MC_ARGS) { mc_stats_flush(b->mcidx == MC_BATCH ? FL_SLOTS : FL_ARGS); xen_mc_flush(); argidx = roundup(b->argidx, sizeof(u64)); @@ -206,7 +206,7 @@ struct multicall_space __xen_mc_entry(size_t args) ret.args = &b->args[argidx]; b->argidx = argidx + args; - BUG_ON(b->argidx > MC_ARGS); + BUG_ON(b->argidx >= MC_ARGS); return ret; } @@ -216,7 +216,7 @@ struct multicall_space xen_mc_extend_args(unsigned long op, size_t size) struct multicall_space ret = { NULL, NULL }; BUG_ON(preemptible()); - BUG_ON(b->argidx > MC_ARGS); + BUG_ON(b->argidx >= MC_ARGS); if (b->mcidx == 0) return ret; @@ -224,14 +224,14 @@ struct multicall_space xen_mc_extend_args(unsigned long op, size_t size) if (b->entries[b->mcidx - 1].op != op) return ret; - if ((b->argidx + size) > MC_ARGS) + if ((b->argidx + size) >= MC_ARGS) return ret; ret.mc = &b->entries[b->mcidx - 1]; ret.args = &b->args[b->argidx]; b->argidx += size; - BUG_ON(b->argidx > MC_ARGS); + BUG_ON(b->argidx >= MC_ARGS); return ret; } diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index a29693fd3138..d2dfbf500fc8 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -398,9 +398,9 @@ static void stop_self(void *v) BUG(); } -static void xen_smp_send_stop(void) +static void xen_stop_other_cpus(int wait) { - smp_call_function(stop_self, NULL, 0); + smp_call_function(stop_self, NULL, wait); } static void xen_smp_send_reschedule(int cpu) @@ -468,7 +468,7 @@ static const struct smp_ops xen_smp_ops __initdata = { .cpu_disable = xen_cpu_disable, .play_dead = xen_play_dead, - .smp_send_stop = xen_smp_send_stop, + .stop_other_cpus = xen_stop_other_cpus, .smp_send_reschedule = xen_smp_send_reschedule, .send_call_func_ipi = xen_smp_send_call_function_ipi, diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index b3c6c59ed302..41eb583adc8d 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -155,45 +155,6 @@ static void do_stolen_accounting(void) account_idle_ticks(ticks); } -/* - * Xen sched_clock implementation. Returns the number of unstolen - * nanoseconds, which is nanoseconds the VCPU spent in RUNNING+BLOCKED - * states. - */ -unsigned long long xen_sched_clock(void) -{ - struct vcpu_runstate_info state; - cycle_t now; - u64 ret; - s64 offset; - - /* - * Ideally sched_clock should be called on a per-cpu basis - * anyway, so preempt should already be disabled, but that's - * not current practice at the moment. - */ - preempt_disable(); - - now = xen_clocksource_read(); - - get_runstate_snapshot(&state); - - WARN_ON(state.state != RUNSTATE_running); - - offset = now - state.state_entry_time; - if (offset < 0) - offset = 0; - - ret = state.time[RUNSTATE_blocked] + - state.time[RUNSTATE_running] + - offset; - - preempt_enable(); - - return ret; -} - - /* Get the TSC speed from Xen */ unsigned long xen_tsc_khz(void) { @@ -464,6 +425,8 @@ void xen_timer_resume(void) { int cpu; + pvclock_resume(); + if (xen_clockevent != &xen_vcpuop_clockevent) return; |