diff options
Diffstat (limited to 'arch/x86')
42 files changed, 406 insertions, 304 deletions
diff --git a/arch/x86/Makefile b/arch/x86/Makefile index f408babdf746..b5226a009973 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -11,6 +11,16 @@ else KBUILD_DEFCONFIG := $(ARCH)_defconfig endif +# For gcc stack alignment is specified with -mpreferred-stack-boundary, +# clang has the option -mstack-alignment for that purpose. +ifneq ($(call cc-option, -mpreferred-stack-boundary=4),) + cc_stack_align4 := -mpreferred-stack-boundary=2 + cc_stack_align8 := -mpreferred-stack-boundary=3 +else ifneq ($(call cc-option, -mstack-alignment=16),) + cc_stack_align4 := -mstack-alignment=4 + cc_stack_align8 := -mstack-alignment=8 +endif + # How to compile the 16-bit code. Note we always compile for -march=i386; # that way we can complain to the user if the CPU is insufficient. # @@ -24,10 +34,11 @@ REALMODE_CFLAGS := $(M16_CFLAGS) -g -Os -D__KERNEL__ \ -DDISABLE_BRANCH_PROFILING \ -Wall -Wstrict-prototypes -march=i386 -mregparm=3 \ -fno-strict-aliasing -fomit-frame-pointer -fno-pic \ - -mno-mmx -mno-sse \ - $(call cc-option, -ffreestanding) \ - $(call cc-option, -fno-stack-protector) \ - $(call cc-option, -mpreferred-stack-boundary=2) + -mno-mmx -mno-sse + +REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), -ffreestanding) +REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), -fno-stack-protector) +REALMODE_CFLAGS += $(call __cc-option, $(CC), $(REALMODE_CFLAGS), $(cc_stack_align4)) export REALMODE_CFLAGS # BITS is used as extension for files which are available in a 32 bit @@ -64,8 +75,10 @@ ifeq ($(CONFIG_X86_32),y) # with nonstandard options KBUILD_CFLAGS += -fno-pic - # prevent gcc from keeping the stack 16 byte aligned - KBUILD_CFLAGS += $(call cc-option,-mpreferred-stack-boundary=2) + # Align the stack to the register width instead of using the default + # alignment of 16 bytes. This reduces stack usage and the number of + # alignment instructions. + KBUILD_CFLAGS += $(call cc-option,$(cc_stack_align4)) # Disable unit-at-a-time mode on pre-gcc-4.0 compilers, it makes gcc use # a lot more stack due to the lack of sharing of stacklots: @@ -88,17 +101,23 @@ else KBUILD_CFLAGS += -m64 # Align jump targets to 1 byte, not the default 16 bytes: - KBUILD_CFLAGS += -falign-jumps=1 + KBUILD_CFLAGS += $(call cc-option,-falign-jumps=1) # Pack loops tightly as well: - KBUILD_CFLAGS += -falign-loops=1 + KBUILD_CFLAGS += $(call cc-option,-falign-loops=1) # Don't autogenerate traditional x87 instructions KBUILD_CFLAGS += $(call cc-option,-mno-80387) KBUILD_CFLAGS += $(call cc-option,-mno-fp-ret-in-387) - # Use -mpreferred-stack-boundary=3 if supported. - KBUILD_CFLAGS += $(call cc-option,-mpreferred-stack-boundary=3) + # By default gcc and clang use a stack alignment of 16 bytes for x86. + # However the standard kernel entry on x86-64 leaves the stack on an + # 8-byte boundary. If the compiler isn't informed about the actual + # alignment it will generate extra alignment instructions for the + # default alignment which keep the stack *mis*aligned. + # Furthermore an alignment to the register width reduces stack usage + # and the number of alignment instructions. + KBUILD_CFLAGS += $(call cc-option,$(cc_stack_align8)) # Use -mskip-rax-setup if supported. KBUILD_CFLAGS += $(call cc-option,-mskip-rax-setup) diff --git a/arch/x86/boot/string.c b/arch/x86/boot/string.c index 9e240fcba784..08dfce02362c 100644 --- a/arch/x86/boot/string.c +++ b/arch/x86/boot/string.c @@ -16,6 +16,15 @@ #include "ctype.h" #include "string.h" +/* + * Undef these macros so that the functions that we provide + * here will have the correct names regardless of how string.h + * may have chosen to #define them. + */ +#undef memcpy +#undef memset +#undef memcmp + int memcmp(const void *s1, const void *s2, size_t len) { bool diff; diff --git a/arch/x86/boot/tools/build.c b/arch/x86/boot/tools/build.c index 0702d2531bc7..039c4a66aca4 100644 --- a/arch/x86/boot/tools/build.c +++ b/arch/x86/boot/tools/build.c @@ -390,6 +390,13 @@ int main(int argc, char ** argv) die("Unable to mmap '%s': %m", argv[2]); /* Number of 16-byte paragraphs, including space for a 4-byte CRC */ sys_size = (sz + 15 + 4) / 16; +#ifdef CONFIG_EFI_STUB + /* + * COFF requires minimum 32-byte alignment of sections, and + * adding a signature is problematic without that alignment. + */ + sys_size = (sys_size + 1) & ~1; +#endif /* Patch the setup code with the appropriate size parameters */ buf[0x1f1] = setup_sectors-1; diff --git a/arch/x86/crypto/aes_ctrby8_avx-x86_64.S b/arch/x86/crypto/aes_ctrby8_avx-x86_64.S index a916c4a61165..5f6a5af9c489 100644 --- a/arch/x86/crypto/aes_ctrby8_avx-x86_64.S +++ b/arch/x86/crypto/aes_ctrby8_avx-x86_64.S @@ -65,7 +65,6 @@ #include <linux/linkage.h> #include <asm/inst.h> -#define CONCAT(a,b) a##b #define VMOVDQ vmovdqu #define xdata0 %xmm0 @@ -92,8 +91,6 @@ #define num_bytes %r8 #define tmp %r10 -#define DDQ(i) CONCAT(ddq_add_,i) -#define XMM(i) CONCAT(%xmm, i) #define DDQ_DATA 0 #define XDATA 1 #define KEY_128 1 @@ -131,12 +128,12 @@ ddq_add_8: /* generate a unique variable for ddq_add_x */ .macro setddq n - var_ddq_add = DDQ(\n) + var_ddq_add = ddq_add_\n .endm /* generate a unique variable for xmm register */ .macro setxdata n - var_xdata = XMM(\n) + var_xdata = %xmm\n .endm /* club the numeric 'id' to the symbol 'name' */ diff --git a/arch/x86/crypto/crc32c-intel_glue.c b/arch/x86/crypto/crc32c-intel_glue.c index dd1958436591..5773e1161072 100644 --- a/arch/x86/crypto/crc32c-intel_glue.c +++ b/arch/x86/crypto/crc32c-intel_glue.c @@ -48,21 +48,13 @@ #ifdef CONFIG_X86_64 /* * use carryless multiply version of crc32c when buffer - * size is >= 512 (when eager fpu is enabled) or - * >= 1024 (when eager fpu is disabled) to account + * size is >= 512 to account * for fpu state save/restore overhead. */ -#define CRC32C_PCL_BREAKEVEN_EAGERFPU 512 -#define CRC32C_PCL_BREAKEVEN_NOEAGERFPU 1024 +#define CRC32C_PCL_BREAKEVEN 512 asmlinkage unsigned int crc_pcl(const u8 *buffer, int len, unsigned int crc_init); -static int crc32c_pcl_breakeven = CRC32C_PCL_BREAKEVEN_EAGERFPU; -#define set_pcl_breakeven_point() \ -do { \ - if (!use_eager_fpu()) \ - crc32c_pcl_breakeven = CRC32C_PCL_BREAKEVEN_NOEAGERFPU; \ -} while (0) #endif /* CONFIG_X86_64 */ static u32 crc32c_intel_le_hw_byte(u32 crc, unsigned char const *data, size_t length) @@ -185,7 +177,7 @@ static int crc32c_pcl_intel_update(struct shash_desc *desc, const u8 *data, * use faster PCL version if datasize is large enough to * overcome kernel fpu state save/restore overhead */ - if (len >= crc32c_pcl_breakeven && irq_fpu_usable()) { + if (len >= CRC32C_PCL_BREAKEVEN && irq_fpu_usable()) { kernel_fpu_begin(); *crcp = crc_pcl(data, len, *crcp); kernel_fpu_end(); @@ -197,7 +189,7 @@ static int crc32c_pcl_intel_update(struct shash_desc *desc, const u8 *data, static int __crc32c_pcl_intel_finup(u32 *crcp, const u8 *data, unsigned int len, u8 *out) { - if (len >= crc32c_pcl_breakeven && irq_fpu_usable()) { + if (len >= CRC32C_PCL_BREAKEVEN && irq_fpu_usable()) { kernel_fpu_begin(); *(__le32 *)out = ~cpu_to_le32(crc_pcl(data, len, *crcp)); kernel_fpu_end(); @@ -257,7 +249,6 @@ static int __init crc32c_intel_mod_init(void) alg.update = crc32c_pcl_intel_update; alg.finup = crc32c_pcl_intel_finup; alg.digest = crc32c_pcl_intel_digest; - set_pcl_breakeven_point(); } #endif return crypto_register_shash(&alg); diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 76c1d85e749b..870e941c1947 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -91,7 +91,7 @@ ENDPROC(native_usergs_sysret64) .endm .macro TRACE_IRQS_IRETQ_DEBUG - bt $9, EFLAGS(%rsp) /* interrupts off? */ + btl $9, EFLAGS(%rsp) /* interrupts off? */ jnc 1f TRACE_IRQS_ON_DEBUG 1: @@ -485,7 +485,7 @@ retint_kernel: #ifdef CONFIG_PREEMPT /* Interrupts are off */ /* Check if we need preemption */ - bt $9, EFLAGS(%rsp) /* were interrupts off? */ + btl $9, EFLAGS(%rsp) /* were interrupts off? */ jnc 1f 0: cmpl $0, PER_CPU_VAR(__preempt_count) jnz 1f diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c index 02223cb4bcfd..1e967099ae51 100644 --- a/arch/x86/entry/vdso/vclock_gettime.c +++ b/arch/x86/entry/vdso/vclock_gettime.c @@ -37,8 +37,9 @@ extern u8 pvclock_page notrace static long vdso_fallback_gettime(long clock, struct timespec *ts) { long ret; - asm("syscall" : "=a" (ret) : - "0" (__NR_clock_gettime), "D" (clock), "S" (ts) : "memory"); + asm ("syscall" : "=a" (ret), "=m" (*ts) : + "0" (__NR_clock_gettime), "D" (clock), "S" (ts) : + "memory", "rcx", "r11"); return ret; } @@ -46,8 +47,9 @@ notrace static long vdso_fallback_gtod(struct timeval *tv, struct timezone *tz) { long ret; - asm("syscall" : "=a" (ret) : - "0" (__NR_gettimeofday), "D" (tv), "S" (tz) : "memory"); + asm ("syscall" : "=a" (ret), "=m" (*tv), "=m" (*tz) : + "0" (__NR_gettimeofday), "D" (tv), "S" (tz) : + "memory", "rcx", "r11"); return ret; } @@ -58,13 +60,13 @@ notrace static long vdso_fallback_gettime(long clock, struct timespec *ts) { long ret; - asm( + asm ( "mov %%ebx, %%edx \n" - "mov %2, %%ebx \n" + "mov %[clock], %%ebx \n" "call __kernel_vsyscall \n" "mov %%edx, %%ebx \n" - : "=a" (ret) - : "0" (__NR_clock_gettime), "g" (clock), "c" (ts) + : "=a" (ret), "=m" (*ts) + : "0" (__NR_clock_gettime), [clock] "g" (clock), "c" (ts) : "memory", "edx"); return ret; } @@ -73,13 +75,13 @@ notrace static long vdso_fallback_gtod(struct timeval *tv, struct timezone *tz) { long ret; - asm( + asm ( "mov %%ebx, %%edx \n" - "mov %2, %%ebx \n" + "mov %[tv], %%ebx \n" "call __kernel_vsyscall \n" "mov %%edx, %%ebx \n" - : "=a" (ret) - : "0" (__NR_gettimeofday), "g" (tv), "c" (tz) + : "=a" (ret), "=m" (*tv), "=m" (*tz) + : "0" (__NR_gettimeofday), [tv] "g" (tv), "c" (tz) : "memory", "edx"); return ret; } diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 655a65eaf105..cadf99923600 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -437,26 +437,6 @@ int x86_setup_perfctr(struct perf_event *event) if (config == -1LL) return -EINVAL; - /* - * Branch tracing: - */ - if (attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS && - !attr->freq && hwc->sample_period == 1) { - /* BTS is not supported by this architecture. */ - if (!x86_pmu.bts_active) - return -EOPNOTSUPP; - - /* BTS is currently only allowed for user-mode. */ - if (!attr->exclude_kernel) - return -EOPNOTSUPP; - - /* disallow bts if conflicting events are present */ - if (x86_add_exclusive(x86_lbr_exclusive_lbr)) - return -EBUSY; - - event->destroy = hw_perf_lbr_event_destroy; - } - hwc->config |= config; return 0; diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 815039327932..4f8560774082 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2198,16 +2198,7 @@ done: static struct event_constraint * intel_bts_constraints(struct perf_event *event) { - struct hw_perf_event *hwc = &event->hw; - unsigned int hw_event, bts_event; - - if (event->attr.freq) - return NULL; - - hw_event = hwc->config & INTEL_ARCH_EVENT_MASK; - bts_event = x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS); - - if (unlikely(hw_event == bts_event && hwc->sample_period == 1)) + if (unlikely(intel_pmu_has_bts(event))) return &bts_constraint; return NULL; @@ -2822,6 +2813,39 @@ static unsigned long intel_pmu_free_running_flags(struct perf_event *event) return flags; } +static int intel_pmu_bts_config(struct perf_event *event) +{ + struct perf_event_attr *attr = &event->attr; + + if (unlikely(intel_pmu_has_bts(event))) { + /* BTS is not supported by this architecture. */ + if (!x86_pmu.bts_active) + return -EOPNOTSUPP; + + /* BTS is currently only allowed for user-mode. */ + if (!attr->exclude_kernel) + return -EOPNOTSUPP; + + /* disallow bts if conflicting events are present */ + if (x86_add_exclusive(x86_lbr_exclusive_lbr)) + return -EBUSY; + + event->destroy = hw_perf_lbr_event_destroy; + } + + return 0; +} + +static int core_pmu_hw_config(struct perf_event *event) +{ + int ret = x86_pmu_hw_config(event); + + if (ret) + return ret; + + return intel_pmu_bts_config(event); +} + static int intel_pmu_hw_config(struct perf_event *event) { int ret = x86_pmu_hw_config(event); @@ -2829,6 +2853,10 @@ static int intel_pmu_hw_config(struct perf_event *event) if (ret) return ret; + ret = intel_pmu_bts_config(event); + if (ret) + return ret; + if (event->attr.precise_ip) { if (!event->attr.freq) { event->hw.flags |= PERF_X86_EVENT_AUTO_RELOAD; @@ -2848,7 +2876,7 @@ static int intel_pmu_hw_config(struct perf_event *event) /* * BTS is set up earlier in this path, so don't account twice */ - if (!intel_pmu_has_bts(event)) { + if (!unlikely(intel_pmu_has_bts(event))) { /* disallow lbr if conflicting events are present */ if (x86_add_exclusive(x86_lbr_exclusive_lbr)) return -EBUSY; @@ -3265,7 +3293,7 @@ static __initconst const struct x86_pmu core_pmu = { .enable_all = core_pmu_enable_all, .enable = core_pmu_enable_event, .disable = x86_pmu_disable_event, - .hw_config = x86_pmu_hw_config, + .hw_config = core_pmu_hw_config, .schedule_events = x86_schedule_events, .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, .perfctr = MSR_ARCH_PERFMON_PERFCTR0, diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index 5d103a87e984..10c1a5c448e5 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -342,7 +342,7 @@ static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx) mask = x86_pmu.lbr_nr - 1; tos = task_ctx->tos; - for (i = 0; i < tos; i++) { + for (i = 0; i < task_ctx->valid_lbrs; i++) { lbr_idx = (tos - i) & mask; wrlbr_from(lbr_idx, task_ctx->lbr_from[i]); wrlbr_to (lbr_idx, task_ctx->lbr_to[i]); @@ -350,6 +350,15 @@ static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx) if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO) wrmsrl(MSR_LBR_INFO_0 + lbr_idx, task_ctx->lbr_info[i]); } + + for (; i < x86_pmu.lbr_nr; i++) { + lbr_idx = (tos - i) & mask; + wrlbr_from(lbr_idx, 0); + wrlbr_to(lbr_idx, 0); + if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO) + wrmsrl(MSR_LBR_INFO_0 + lbr_idx, 0); + } + wrmsrl(x86_pmu.lbr_tos, tos); task_ctx->lbr_stack_state = LBR_NONE; } @@ -357,7 +366,7 @@ static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx) static void __intel_pmu_lbr_save(struct x86_perf_task_context *task_ctx) { unsigned lbr_idx, mask; - u64 tos; + u64 tos, from; int i; if (task_ctx->lbr_callstack_users == 0) { @@ -367,13 +376,17 @@ static void __intel_pmu_lbr_save(struct x86_perf_task_context *task_ctx) mask = x86_pmu.lbr_nr - 1; tos = intel_pmu_lbr_tos(); - for (i = 0; i < tos; i++) { + for (i = 0; i < x86_pmu.lbr_nr; i++) { lbr_idx = (tos - i) & mask; - task_ctx->lbr_from[i] = rdlbr_from(lbr_idx); + from = rdlbr_from(lbr_idx); + if (!from) + break; + task_ctx->lbr_from[i] = from; task_ctx->lbr_to[i] = rdlbr_to(lbr_idx); if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO) rdmsrl(MSR_LBR_INFO_0 + lbr_idx, task_ctx->lbr_info[i]); } + task_ctx->valid_lbrs = i; task_ctx->tos = tos; task_ctx->lbr_stack_state = LBR_VALID; } @@ -522,7 +535,7 @@ static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc) */ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) { - bool need_info = false; + bool need_info = false, call_stack = false; unsigned long mask = x86_pmu.lbr_nr - 1; int lbr_format = x86_pmu.intel_cap.lbr_format; u64 tos = intel_pmu_lbr_tos(); @@ -533,7 +546,7 @@ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) if (cpuc->lbr_sel) { need_info = !(cpuc->lbr_sel->config & LBR_NO_INFO); if (cpuc->lbr_sel->config & LBR_CALL_STACK) - num = tos; + call_stack = true; } for (i = 0; i < num; i++) { @@ -546,6 +559,13 @@ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) from = rdlbr_from(lbr_idx); to = rdlbr_to(lbr_idx); + /* + * Read LBR call stack entries + * until invalid entry (0s) is detected. + */ + if (call_stack && !from) + break; + if (lbr_format == LBR_FORMAT_INFO && need_info) { u64 info; @@ -1175,4 +1195,8 @@ void intel_pmu_lbr_init_knl(void) x86_pmu.lbr_sel_mask = LBR_SEL_MASK; x86_pmu.lbr_sel_map = snb_lbr_sel_map; + + /* Knights Landing does have MISPREDICT bit */ + if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_LIP) + x86_pmu.intel_cap.lbr_format = LBR_FORMAT_EIP_FLAGS; } diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c index a3dcc12bef4a..8c700069060b 100644 --- a/arch/x86/events/intel/uncore_snb.c +++ b/arch/x86/events/intel/uncore_snb.c @@ -14,6 +14,25 @@ #define PCI_DEVICE_ID_INTEL_SKL_HQ_IMC 0x1910 #define PCI_DEVICE_ID_INTEL_SKL_SD_IMC 0x190f #define PCI_DEVICE_ID_INTEL_SKL_SQ_IMC 0x191f +#define PCI_DEVICE_ID_INTEL_KBL_Y_IMC 0x590c +#define PCI_DEVICE_ID_INTEL_KBL_U_IMC 0x5904 +#define PCI_DEVICE_ID_INTEL_KBL_UQ_IMC 0x5914 +#define PCI_DEVICE_ID_INTEL_KBL_SD_IMC 0x590f +#define PCI_DEVICE_ID_INTEL_KBL_SQ_IMC 0x591f +#define PCI_DEVICE_ID_INTEL_CFL_2U_IMC 0x3ecc +#define PCI_DEVICE_ID_INTEL_CFL_4U_IMC 0x3ed0 +#define PCI_DEVICE_ID_INTEL_CFL_4H_IMC 0x3e10 +#define PCI_DEVICE_ID_INTEL_CFL_6H_IMC 0x3ec4 +#define PCI_DEVICE_ID_INTEL_CFL_2S_D_IMC 0x3e0f +#define PCI_DEVICE_ID_INTEL_CFL_4S_D_IMC 0x3e1f +#define PCI_DEVICE_ID_INTEL_CFL_6S_D_IMC 0x3ec2 +#define PCI_DEVICE_ID_INTEL_CFL_8S_D_IMC 0x3e30 +#define PCI_DEVICE_ID_INTEL_CFL_4S_W_IMC 0x3e18 +#define PCI_DEVICE_ID_INTEL_CFL_6S_W_IMC 0x3ec6 +#define PCI_DEVICE_ID_INTEL_CFL_8S_W_IMC 0x3e31 +#define PCI_DEVICE_ID_INTEL_CFL_4S_S_IMC 0x3e33 +#define PCI_DEVICE_ID_INTEL_CFL_6S_S_IMC 0x3eca +#define PCI_DEVICE_ID_INTEL_CFL_8S_S_IMC 0x3e32 /* SNB event control */ #define SNB_UNC_CTL_EV_SEL_MASK 0x000000ff @@ -631,7 +650,82 @@ static const struct pci_device_id skl_uncore_pci_ids[] = { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SKL_SQ_IMC), .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), }, - + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_KBL_Y_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_KBL_U_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_KBL_UQ_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_KBL_SD_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_KBL_SQ_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_2U_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_4U_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_4H_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_6H_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_2S_D_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_4S_D_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_6S_D_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_8S_D_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_4S_W_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_6S_W_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_8S_W_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_4S_S_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_6S_S_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CFL_8S_S_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, { /* end: all zeroes */ }, }; @@ -680,6 +774,25 @@ static const struct imc_uncore_pci_dev desktop_imc_pci_ids[] = { IMC_DEV(SKL_HQ_IMC, &skl_uncore_pci_driver), /* 6th Gen Core H Quad Core */ IMC_DEV(SKL_SD_IMC, &skl_uncore_pci_driver), /* 6th Gen Core S Dual Core */ IMC_DEV(SKL_SQ_IMC, &skl_uncore_pci_driver), /* 6th Gen Core S Quad Core */ + IMC_DEV(KBL_Y_IMC, &skl_uncore_pci_driver), /* 7th Gen Core Y */ + IMC_DEV(KBL_U_IMC, &skl_uncore_pci_driver), /* 7th Gen Core U */ + IMC_DEV(KBL_UQ_IMC, &skl_uncore_pci_driver), /* 7th Gen Core U Quad Core */ + IMC_DEV(KBL_SD_IMC, &skl_uncore_pci_driver), /* 7th Gen Core S Dual Core */ + IMC_DEV(KBL_SQ_IMC, &skl_uncore_pci_driver), /* 7th Gen Core S Quad Core */ + IMC_DEV(CFL_2U_IMC, &skl_uncore_pci_driver), /* 8th Gen Core U 2 Cores */ + IMC_DEV(CFL_4U_IMC, &skl_uncore_pci_driver), /* 8th Gen Core U 4 Cores */ + IMC_DEV(CFL_4H_IMC, &skl_uncore_pci_driver), /* 8th Gen Core H 4 Cores */ + IMC_DEV(CFL_6H_IMC, &skl_uncore_pci_driver), /* 8th Gen Core H 6 Cores */ + IMC_DEV(CFL_2S_D_IMC, &skl_uncore_pci_driver), /* 8th Gen Core S 2 Cores Desktop */ + IMC_DEV(CFL_4S_D_IMC, &skl_uncore_pci_driver), /* 8th Gen Core S 4 Cores Desktop */ + IMC_DEV(CFL_6S_D_IMC, &skl_uncore_pci_driver), /* 8th Gen Core S 6 Cores Desktop */ + IMC_DEV(CFL_8S_D_IMC, &skl_uncore_pci_driver), /* 8th Gen Core S 8 Cores Desktop */ + IMC_DEV(CFL_4S_W_IMC, &skl_uncore_pci_driver), /* 8th Gen Core S 4 Cores Work Station */ + IMC_DEV(CFL_6S_W_IMC, &skl_uncore_pci_driver), /* 8th Gen Core S 6 Cores Work Station */ + IMC_DEV(CFL_8S_W_IMC, &skl_uncore_pci_driver), /* 8th Gen Core S 8 Cores Work Station */ + IMC_DEV(CFL_4S_S_IMC, &skl_uncore_pci_driver), /* 8th Gen Core S 4 Cores Server */ + IMC_DEV(CFL_6S_S_IMC, &skl_uncore_pci_driver), /* 8th Gen Core S 6 Cores Server */ + IMC_DEV(CFL_8S_S_IMC, &skl_uncore_pci_driver), /* 8th Gen Core S 8 Cores Server */ { /* end marker */ } }; diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index 6bc36944a8c1..8c2a9fa0caf3 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -3767,16 +3767,16 @@ static const struct pci_device_id skx_uncore_pci_ids[] = { .driver_data = UNCORE_PCI_DEV_FULL_DATA(21, 5, SKX_PCI_UNCORE_M2PCIE, 3), }, { /* M3UPI0 Link 0 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x204C), - .driver_data = UNCORE_PCI_DEV_FULL_DATA(18, 0, SKX_PCI_UNCORE_M3UPI, 0), + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x204D), + .driver_data = UNCORE_PCI_DEV_FULL_DATA(18, 1, SKX_PCI_UNCORE_M3UPI, 0), }, { /* M3UPI0 Link 1 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x204D), - .driver_data = UNCORE_PCI_DEV_FULL_DATA(18, 1, SKX_PCI_UNCORE_M3UPI, 1), + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x204E), + .driver_data = UNCORE_PCI_DEV_FULL_DATA(18, 2, SKX_PCI_UNCORE_M3UPI, 1), }, { /* M3UPI1 Link 2 */ - PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x204C), - .driver_data = UNCORE_PCI_DEV_FULL_DATA(18, 4, SKX_PCI_UNCORE_M3UPI, 2), + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x204D), + .driver_data = UNCORE_PCI_DEV_FULL_DATA(18, 5, SKX_PCI_UNCORE_M3UPI, 2), }, { /* end: all zeroes */ } }; diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index f3563179290b..7ace39c51ff7 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -633,6 +633,7 @@ struct x86_perf_task_context { u64 lbr_to[MAX_LBR_ENTRIES]; u64 lbr_info[MAX_LBR_ENTRIES]; int tos; + int valid_lbrs; int lbr_callstack_users; int lbr_stack_state; }; @@ -834,11 +835,16 @@ static inline int amd_pmu_init(void) static inline bool intel_pmu_has_bts(struct perf_event *event) { - if (event->attr.config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS && - !event->attr.freq && event->hw.sample_period == 1) - return true; + struct hw_perf_event *hwc = &event->hw; + unsigned int hw_event, bts_event; + + if (event->attr.freq) + return false; + + hw_event = hwc->config & INTEL_ARCH_EVENT_MASK; + bts_event = x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS); - return false; + return hw_event == bts_event && hwc->sample_period == 1; } int intel_pmu_save_and_restart(struct perf_event *event); diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index fbc1474960e3..c56c24347f15 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -104,7 +104,6 @@ #define X86_FEATURE_EXTD_APICID ( 3*32+26) /* has extended APICID (8 bits) */ #define X86_FEATURE_AMD_DCM ( 3*32+27) /* multi-node processor */ #define X86_FEATURE_APERFMPERF ( 3*32+28) /* APERFMPERF */ -/* free, was #define X86_FEATURE_EAGER_FPU ( 3*32+29) * "eagerfpu" Non lazy FPU restore */ #define X86_FEATURE_NONSTOP_TSC_S3 ( 3*32+30) /* TSC doesn't stop in S3 state */ /* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */ @@ -214,6 +213,7 @@ #define X86_FEATURE_STIBP ( 7*32+27) /* Single Thread Indirect Branch Predictors */ #define X86_FEATURE_ZEN ( 7*32+28) /* "" CPU is AMD family 0x17 (Zen) */ #define X86_FEATURE_L1TF_PTEINV ( 7*32+29) /* "" L1TF workaround PTE inversion */ +#define X86_FEATURE_IBRS_ENHANCED ( 7*32+30) /* Enhanced IBRS */ /* Virtualization flags: Linux defined, word 8 */ #define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */ diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 8852e3afa1ad..499d6ed0e376 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -60,11 +60,6 @@ extern u64 fpu__get_supported_xfeatures_mask(void); /* * FPU related CPU feature flag helper routines: */ -static __always_inline __pure bool use_eager_fpu(void) -{ - return true; -} - static __always_inline __pure bool use_xsaveopt(void) { return static_cpu_has(X86_FEATURE_XSAVEOPT); @@ -501,24 +496,6 @@ static inline int fpu_want_lazy_restore(struct fpu *fpu, unsigned int cpu) } -/* - * Wrap lazy FPU TS handling in a 'hw fpregs activation/deactivation' - * idiom, which is then paired with the sw-flag (fpregs_active) later on: - */ - -static inline void __fpregs_activate_hw(void) -{ - if (!use_eager_fpu()) - clts(); -} - -static inline void __fpregs_deactivate_hw(void) -{ - if (!use_eager_fpu()) - stts(); -} - -/* Must be paired with an 'stts' (fpregs_deactivate_hw()) after! */ static inline void __fpregs_deactivate(struct fpu *fpu) { WARN_ON_FPU(!fpu->fpregs_active); @@ -528,7 +505,6 @@ static inline void __fpregs_deactivate(struct fpu *fpu) trace_x86_fpu_regs_deactivated(fpu); } -/* Must be paired with a 'clts' (fpregs_activate_hw()) before! */ static inline void __fpregs_activate(struct fpu *fpu) { WARN_ON_FPU(fpu->fpregs_active); @@ -554,22 +530,17 @@ static inline int fpregs_active(void) } /* - * Encapsulate the CR0.TS handling together with the - * software flag. - * * These generally need preemption protection to work, * do try to avoid using these on their own. */ static inline void fpregs_activate(struct fpu *fpu) { - __fpregs_activate_hw(); __fpregs_activate(fpu); } static inline void fpregs_deactivate(struct fpu *fpu) { __fpregs_deactivate(fpu); - __fpregs_deactivate_hw(); } /* @@ -596,8 +567,7 @@ switch_fpu_prepare(struct fpu *old_fpu, struct fpu *new_fpu, int cpu) * or if the past 5 consecutive context-switches used math. */ fpu.preload = static_cpu_has(X86_FEATURE_FPU) && - new_fpu->fpstate_active && - (use_eager_fpu() || new_fpu->counter > 5); + new_fpu->fpstate_active; if (old_fpu->fpregs_active) { if (!copy_fpregs_to_fpstate(old_fpu)) @@ -611,18 +581,13 @@ switch_fpu_prepare(struct fpu *old_fpu, struct fpu *new_fpu, int cpu) /* Don't change CR0.TS if we just switch! */ if (fpu.preload) { - new_fpu->counter++; __fpregs_activate(new_fpu); trace_x86_fpu_regs_activated(new_fpu); prefetch(&new_fpu->state); - } else { - __fpregs_deactivate_hw(); } } else { - old_fpu->counter = 0; old_fpu->last_cpu = -1; if (fpu.preload) { - new_fpu->counter++; if (fpu_want_lazy_restore(new_fpu, cpu)) fpu.preload = 0; else diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h index 48df486b02f9..3c80f5b9c09d 100644 --- a/arch/x86/include/asm/fpu/types.h +++ b/arch/x86/include/asm/fpu/types.h @@ -322,17 +322,6 @@ struct fpu { unsigned char fpregs_active; /* - * @counter: - * - * This counter contains the number of consecutive context switches - * during which the FPU stays used. If this is over a threshold, the - * lazy FPU restore logic becomes eager, to save the trap overhead. - * This is an unsigned char so that after 256 iterations the counter - * wraps and the context switch behavior turns lazy again; this is to - * deal with bursty apps that only use the FPU for a short time: - */ - unsigned char counter; - /* * @state: * * In-memory copy of all FPU registers that we save/restore @@ -340,29 +329,6 @@ struct fpu { * the registers in the FPU are more recent than this state * copy. If the task context-switches away then they get * saved here and represent the FPU state. - * - * After context switches there may be a (short) time period - * during which the in-FPU hardware registers are unchanged - * and still perfectly match this state, if the tasks - * scheduled afterwards are not using the FPU. - * - * This is the 'lazy restore' window of optimization, which - * we track though 'fpu_fpregs_owner_ctx' and 'fpu->last_cpu'. - * - * We detect whether a subsequent task uses the FPU via setting - * CR0::TS to 1, which causes any FPU use to raise a #NM fault. - * - * During this window, if the task gets scheduled again, we - * might be able to skip having to do a restore from this - * memory buffer to the hardware registers - at the cost of - * incurring the overhead of #NM fault traps. - * - * Note that on modern CPUs that support the XSAVEOPT (or other - * optimized XSAVE instructions), we don't use #NM traps anymore, - * as the hardware can track whether FPU registers need saving - * or not. On such CPUs we activate the non-lazy ('eagerfpu') - * logic, which unconditionally saves/restores all FPU state - * across context switches. (if FPU state exists.) */ union fpregs_state state; /* diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index 8b38df98548e..1b4132161c1f 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -215,6 +215,7 @@ enum spectre_v2_mitigation { SPECTRE_V2_RETPOLINE_GENERIC, SPECTRE_V2_RETPOLINE_AMD, SPECTRE_V2_IBRS, + SPECTRE_V2_IBRS_ENHANCED, }; /* The Speculative Store Bypass disable variants */ diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index 84f58de08c2b..2cb5d0f13641 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -184,22 +184,22 @@ do { \ typeof(var) pfo_ret__; \ switch (sizeof(var)) { \ case 1: \ - asm(op "b "__percpu_arg(1)",%0" \ + asm volatile(op "b "__percpu_arg(1)",%0"\ : "=q" (pfo_ret__) \ : "m" (var)); \ break; \ case 2: \ - asm(op "w "__percpu_arg(1)",%0" \ + asm volatile(op "w "__percpu_arg(1)",%0"\ : "=r" (pfo_ret__) \ : "m" (var)); \ break; \ case 4: \ - asm(op "l "__percpu_arg(1)",%0" \ + asm volatile(op "l "__percpu_arg(1)",%0"\ : "=r" (pfo_ret__) \ : "m" (var)); \ break; \ case 8: \ - asm(op "q "__percpu_arg(1)",%0" \ + asm volatile(op "q "__percpu_arg(1)",%0"\ : "=r" (pfo_ret__) \ : "m" (var)); \ break; \ diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index dfdb7e21ba56..93d089c9509a 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -134,7 +134,7 @@ */ #define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT | \ _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY | \ - _PAGE_SOFT_DIRTY) + _PAGE_SOFT_DIRTY | _PAGE_DEVMAP) #define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE) /* The ASID is the lower 12 bits of CR3 */ diff --git a/arch/x86/include/asm/suspend_64.h b/arch/x86/include/asm/suspend_64.h index 6136a18152af..2bd96b4df140 100644 --- a/arch/x86/include/asm/suspend_64.h +++ b/arch/x86/include/asm/suspend_64.h @@ -42,8 +42,7 @@ struct saved_context { set_debugreg((thread)->debugreg##register, register) /* routines for saving/restoring kernel state */ -extern int acpi_save_state_mem(void); -extern char core_restore_code; -extern char restore_registers; +extern char core_restore_code[]; +extern char restore_registers[]; #endif /* _ASM_X86_SUSPEND_64_H */ diff --git a/arch/x86/include/asm/trace/fpu.h b/arch/x86/include/asm/trace/fpu.h index 9217ab1f5bf6..342e59789fcd 100644 --- a/arch/x86/include/asm/trace/fpu.h +++ b/arch/x86/include/asm/trace/fpu.h @@ -14,7 +14,6 @@ DECLARE_EVENT_CLASS(x86_fpu, __field(struct fpu *, fpu) __field(bool, fpregs_active) __field(bool, fpstate_active) - __field(int, counter) __field(u64, xfeatures) __field(u64, xcomp_bv) ), @@ -23,17 +22,15 @@ DECLARE_EVENT_CLASS(x86_fpu, __entry->fpu = fpu; __entry->fpregs_active = fpu->fpregs_active; __entry->fpstate_active = fpu->fpstate_active; - __entry->counter = fpu->counter; if (boot_cpu_has(X86_FEATURE_OSXSAVE)) { __entry->xfeatures = fpu->state.xsave.header.xfeatures; __entry->xcomp_bv = fpu->state.xsave.header.xcomp_bv; } ), - TP_printk("x86/fpu: %p fpregs_active: %d fpstate_active: %d counter: %d xfeatures: %llx xcomp_bv: %llx", + TP_printk("x86/fpu: %p fpregs_active: %d fpstate_active: %d xfeatures: %llx xcomp_bv: %llx", __entry->fpu, __entry->fpregs_active, __entry->fpstate_active, - __entry->counter, __entry->xfeatures, __entry->xcomp_bv ) diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index 739c0c594022..1bb90fafcdc3 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -356,5 +356,6 @@ struct kvm_sync_regs { #define KVM_X86_QUIRK_LINT0_REENABLED (1 << 0) #define KVM_X86_QUIRK_CD_NW_CLEARED (1 << 1) +#define KVM_X86_QUIRK_LAPIC_MMIO_HOLE (1 << 2) #endif /* _ASM_X86_KVM_H */ diff --git a/arch/x86/kernel/check.c b/arch/x86/kernel/check.c index 145863d4d343..a8b215865636 100644 --- a/arch/x86/kernel/check.c +++ b/arch/x86/kernel/check.c @@ -30,6 +30,11 @@ static __init int set_corruption_check(char *arg) ssize_t ret; unsigned long val; + if (!arg) { + pr_err("memory_corruption_check config string not provided\n"); + return -EINVAL; + } + ret = kstrtoul(arg, 10, &val); if (ret) return ret; @@ -44,6 +49,11 @@ static __init int set_corruption_check_period(char *arg) ssize_t ret; unsigned long val; + if (!arg) { + pr_err("memory_corruption_check_period config string not provided\n"); + return -EINVAL; + } + ret = kstrtoul(arg, 10, &val); if (ret) return ret; @@ -58,6 +68,11 @@ static __init int set_corruption_check_size(char *arg) char *end; unsigned size; + if (!arg) { + pr_err("memory_corruption_check_size config string not provided\n"); + return -EINVAL; + } + size = memparse(arg, &end); if (*end == '\0') diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 8103adacbc83..6221166e3fca 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -139,6 +139,7 @@ static const char *spectre_v2_strings[] = { [SPECTRE_V2_RETPOLINE_MINIMAL_AMD] = "Vulnerable: Minimal AMD ASM retpoline", [SPECTRE_V2_RETPOLINE_GENERIC] = "Mitigation: Full generic retpoline", [SPECTRE_V2_RETPOLINE_AMD] = "Mitigation: Full AMD retpoline", + [SPECTRE_V2_IBRS_ENHANCED] = "Mitigation: Enhanced IBRS", }; #undef pr_fmt @@ -340,6 +341,13 @@ static void __init spectre_v2_select_mitigation(void) case SPECTRE_V2_CMD_FORCE: case SPECTRE_V2_CMD_AUTO: + if (boot_cpu_has(X86_FEATURE_IBRS_ENHANCED)) { + mode = SPECTRE_V2_IBRS_ENHANCED; + /* Force it so VMEXIT will restore correctly */ + x86_spec_ctrl_base |= SPEC_CTRL_IBRS; + wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base); + goto specv2_set_mode; + } if (IS_ENABLED(CONFIG_RETPOLINE)) goto retpoline_auto; break; @@ -377,6 +385,7 @@ retpoline_auto: setup_force_cpu_cap(X86_FEATURE_RETPOLINE); } +specv2_set_mode: spectre_v2_enabled = mode; pr_info("%s\n", spectre_v2_strings[mode]); @@ -399,9 +408,16 @@ retpoline_auto: /* * Retpoline means the kernel is safe because it has no indirect - * branches. But firmware isn't, so use IBRS to protect that. + * branches. Enhanced IBRS protects firmware too, so, enable restricted + * speculation around firmware calls only when Enhanced IBRS isn't + * supported. + * + * Use "mode" to check Enhanced IBRS instead of boot_cpu_has(), because + * the user might select retpoline on the kernel command line and if + * the CPU supports Enhanced IBRS, kernel might un-intentionally not + * enable IBRS around firmware calls. */ - if (boot_cpu_has(X86_FEATURE_IBRS)) { + if (boot_cpu_has(X86_FEATURE_IBRS) && mode != SPECTRE_V2_IBRS_ENHANCED) { setup_force_cpu_cap(X86_FEATURE_USE_IBRS_FW); pr_info("Enabling Restricted Speculation for firmware calls\n"); } diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index dc0850bb74be..3c01610c5ba9 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -959,6 +959,9 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) setup_force_cpu_bug(X86_BUG_SPECTRE_V1); setup_force_cpu_bug(X86_BUG_SPECTRE_V2); + if (ia32_cap & ARCH_CAP_IBRS_ALL) + setup_force_cpu_cap(X86_FEATURE_IBRS_ENHANCED); + if (x86_match_cpu(cpu_no_meltdown)) return; diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c index 455d8ada9b9a..d39cfb2c6b63 100644 --- a/arch/x86/kernel/cpu/cyrix.c +++ b/arch/x86/kernel/cpu/cyrix.c @@ -253,6 +253,7 @@ static void init_cyrix(struct cpuinfo_x86 *c) break; case 4: /* MediaGX/GXm or Geode GXM/GXLV/GX1 */ + case 11: /* GX1 with inverted Device ID */ #ifdef CONFIG_PCI { u32 vendor, device; diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 430c095cfa0e..fc965118d2e6 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -59,27 +59,9 @@ static bool kernel_fpu_disabled(void) return this_cpu_read(in_kernel_fpu); } -/* - * Were we in an interrupt that interrupted kernel mode? - * - * On others, we can do a kernel_fpu_begin/end() pair *ONLY* if that - * pair does nothing at all: the thread must not have fpu (so - * that we don't try to save the FPU state), and TS must - * be set (so that the clts/stts pair does nothing that is - * visible in the interrupted kernel thread). - * - * Except for the eagerfpu case when we return true; in the likely case - * the thread has FPU but we are not going to set/clear TS. - */ static bool interrupted_kernel_fpu_idle(void) { - if (kernel_fpu_disabled()) - return false; - - if (use_eager_fpu()) - return true; - - return !current->thread.fpu.fpregs_active && (read_cr0() & X86_CR0_TS); + return !kernel_fpu_disabled(); } /* @@ -127,7 +109,6 @@ void __kernel_fpu_begin(void) copy_fpregs_to_fpstate(fpu); } else { this_cpu_write(fpu_fpregs_owner_ctx, NULL); - __fpregs_activate_hw(); } } EXPORT_SYMBOL(__kernel_fpu_begin); @@ -138,8 +119,6 @@ void __kernel_fpu_end(void) if (fpu->fpregs_active) copy_kernel_to_fpregs(&fpu->state); - else - __fpregs_deactivate_hw(); kernel_fpu_enable(); } @@ -201,10 +180,7 @@ void fpu__save(struct fpu *fpu) trace_x86_fpu_before_save(fpu); if (fpu->fpregs_active) { if (!copy_fpregs_to_fpstate(fpu)) { - if (use_eager_fpu()) - copy_kernel_to_fpregs(&fpu->state); - else - fpregs_deactivate(fpu); + copy_kernel_to_fpregs(&fpu->state); } } trace_x86_fpu_after_save(fpu); @@ -249,7 +225,6 @@ EXPORT_SYMBOL_GPL(fpstate_init); int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu) { - dst_fpu->counter = 0; dst_fpu->fpregs_active = 0; dst_fpu->last_cpu = -1; @@ -262,8 +237,7 @@ int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu) * Don't let 'init optimized' areas of the XSAVE area * leak into the child task: */ - if (use_eager_fpu()) - memset(&dst_fpu->state.xsave, 0, fpu_kernel_xstate_size); + memset(&dst_fpu->state.xsave, 0, fpu_kernel_xstate_size); /* * Save current FPU registers directly into the child @@ -285,10 +259,7 @@ int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu) memcpy(&src_fpu->state, &dst_fpu->state, fpu_kernel_xstate_size); - if (use_eager_fpu()) - copy_kernel_to_fpregs(&src_fpu->state); - else - fpregs_deactivate(src_fpu); + copy_kernel_to_fpregs(&src_fpu->state); } preempt_enable(); @@ -461,7 +432,6 @@ void fpu__restore(struct fpu *fpu) trace_x86_fpu_before_restore(fpu); fpregs_activate(fpu); copy_kernel_to_fpregs(&fpu->state); - fpu->counter++; trace_x86_fpu_after_restore(fpu); kernel_fpu_enable(); } @@ -479,7 +449,6 @@ EXPORT_SYMBOL_GPL(fpu__restore); void fpu__drop(struct fpu *fpu) { preempt_disable(); - fpu->counter = 0; if (fpu->fpregs_active) { /* Ignore delayed exceptions from user space */ diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 3ec0d2d64601..ae52ef05d098 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -309,7 +309,6 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) * thread's fpu state, reconstruct fxstate from the fsave * header. Sanitize the copied state etc. */ - struct fpu *fpu = &tsk->thread.fpu; struct user_i387_ia32_struct env; int err = 0; @@ -344,11 +343,9 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) } fpu->fpstate_active = 1; - if (use_eager_fpu()) { - preempt_disable(); - fpu__restore(fpu); - preempt_enable(); - } + preempt_disable(); + fpu__restore(fpu); + preempt_enable(); return err; } else { diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index abfbb61b18b8..e9d7f461b7fa 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -890,15 +890,6 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, */ if (!boot_cpu_has(X86_FEATURE_OSPKE)) return -EINVAL; - /* - * For most XSAVE components, this would be an arduous task: - * brining fpstate up to date with fpregs, updating fpstate, - * then re-populating fpregs. But, for components that are - * never lazily managed, we can just access the fpregs - * directly. PKRU is never managed lazily, so we can just - * manipulate it directly. Make sure it stays that way. - */ - WARN_ON_ONCE(!use_eager_fpu()); /* Set the bits we need in PKRU: */ if (init_val & PKEY_DISABLE_ACCESS) diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 29d465627919..bf9552bebb3c 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -90,7 +90,7 @@ unsigned paravirt_patch_call(void *insnbuf, if (len < 5) { #ifdef CONFIG_RETPOLINE - WARN_ONCE("Failing to patch indirect CALL in %ps\n", (void *)addr); + WARN_ONCE(1, "Failing to patch indirect CALL in %ps\n", (void *)addr); #endif return len; /* call too long for patch site */ } @@ -110,7 +110,7 @@ unsigned paravirt_patch_jmp(void *insnbuf, const void *target, if (len < 5) { #ifdef CONFIG_RETPOLINE - WARN_ONCE("Failing to patch indirect JMP in %ps\n", (void *)addr); + WARN_ONCE(1, "Failing to patch indirect JMP in %ps\n", (void *)addr); #endif return len; /* call too long for patch site */ } diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c index f8a0518d2810..89d1190b9d94 100644 --- a/arch/x86/kernel/time.c +++ b/arch/x86/kernel/time.c @@ -24,7 +24,7 @@ #include <asm/time.h> #ifdef CONFIG_X86_64 -__visible volatile unsigned long jiffies __cacheline_aligned = INITIAL_JIFFIES; +__visible volatile unsigned long jiffies __cacheline_aligned_in_smp = INITIAL_JIFFIES; #endif unsigned long profile_pc(struct pt_regs *regs) diff --git a/arch/x86/kernel/tsc_msr.c b/arch/x86/kernel/tsc_msr.c index 0fe720d64fef..3f818ce985c0 100644 --- a/arch/x86/kernel/tsc_msr.c +++ b/arch/x86/kernel/tsc_msr.c @@ -12,6 +12,7 @@ #include <asm/setup.h> #include <asm/apic.h> #include <asm/param.h> +#include <asm/tsc.h> #define MAX_NUM_FREQS 9 diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 7e5119c1d15c..c17d3893ae60 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -16,7 +16,6 @@ #include <linux/export.h> #include <linux/vmalloc.h> #include <linux/uaccess.h> -#include <asm/fpu/internal.h> /* For use_eager_fpu. Ugh! */ #include <asm/user.h> #include <asm/fpu/xstate.h> #include "cpuid.h" @@ -114,8 +113,7 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu) if (best && (best->eax & (F(XSAVES) | F(XSAVEC)))) best->ebx = xstate_required_size(vcpu->arch.xcr0, true); - if (use_eager_fpu()) - kvm_x86_ops->fpu_activate(vcpu); + kvm_x86_ops->fpu_activate(vcpu); /* * The existing code assumes virtual address is 48-bit in the canonical diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index a8a86be8cf15..69a81a7daa24 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1220,9 +1220,8 @@ EXPORT_SYMBOL_GPL(kvm_lapic_reg_read); static int apic_mmio_in_range(struct kvm_lapic *apic, gpa_t addr) { - return kvm_apic_hw_enabled(apic) && - addr >= apic->base_address && - addr < apic->base_address + LAPIC_MMIO_LENGTH; + return addr >= apic->base_address && + addr < apic->base_address + LAPIC_MMIO_LENGTH; } static int apic_mmio_read(struct kvm_vcpu *vcpu, struct kvm_io_device *this, @@ -1234,6 +1233,15 @@ static int apic_mmio_read(struct kvm_vcpu *vcpu, struct kvm_io_device *this, if (!apic_mmio_in_range(apic, address)) return -EOPNOTSUPP; + if (!kvm_apic_hw_enabled(apic) || apic_x2apic_mode(apic)) { + if (!kvm_check_has_quirk(vcpu->kvm, + KVM_X86_QUIRK_LAPIC_MMIO_HOLE)) + return -EOPNOTSUPP; + + memset(data, 0xff, len); + return 0; + } + kvm_lapic_reg_read(apic, offset, len, data); return 0; @@ -1646,6 +1654,14 @@ static int apic_mmio_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this, if (!apic_mmio_in_range(apic, address)) return -EOPNOTSUPP; + if (!kvm_apic_hw_enabled(apic) || apic_x2apic_mode(apic)) { + if (!kvm_check_has_quirk(vcpu->kvm, + KVM_X86_QUIRK_LAPIC_MMIO_HOLE)) + return -EOPNOTSUPP; + + return 0; + } + /* * APIC register must be aligned on 128-bits boundary. * 32/64/128 bits registers must be accessed thru 32 bits. diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 8a4d6bc8fed0..676edfc19a95 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -4297,9 +4297,9 @@ static bool need_remote_flush(u64 old, u64 new) } static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa, - const u8 *new, int *bytes) + int *bytes) { - u64 gentry; + u64 gentry = 0; int r; /* @@ -4311,22 +4311,12 @@ static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa, /* Handle a 32-bit guest writing two halves of a 64-bit gpte */ *gpa &= ~(gpa_t)7; *bytes = 8; - r = kvm_vcpu_read_guest(vcpu, *gpa, &gentry, 8); - if (r) - gentry = 0; - new = (const u8 *)&gentry; } - switch (*bytes) { - case 4: - gentry = *(const u32 *)new; - break; - case 8: - gentry = *(const u64 *)new; - break; - default: - gentry = 0; - break; + if (*bytes == 4 || *bytes == 8) { + r = kvm_vcpu_read_guest_atomic(vcpu, *gpa, &gentry, *bytes); + if (r) + gentry = 0; } return gentry; @@ -4437,8 +4427,6 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes); - gentry = mmu_pte_write_fetch_gpte(vcpu, &gpa, new, &bytes); - /* * No need to care whether allocation memory is successful * or not since pte prefetch is skiped if it does not have @@ -4447,6 +4435,9 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, mmu_topup_memory_caches(vcpu); spin_lock(&vcpu->kvm->mmu_lock); + + gentry = mmu_pte_write_fetch_gpte(vcpu, &gpa, &bytes); + ++vcpu->kvm->stat.mmu_pte_write; kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 5f44d63a9d69..fa1b0e3c8a06 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1333,20 +1333,23 @@ static u64 *avic_get_physical_id_entry(struct kvm_vcpu *vcpu, int index) static int avic_init_access_page(struct kvm_vcpu *vcpu) { struct kvm *kvm = vcpu->kvm; - int ret; + int ret = 0; + mutex_lock(&kvm->slots_lock); if (kvm->arch.apic_access_page_done) - return 0; + goto out; - ret = x86_set_memory_region(kvm, - APIC_ACCESS_PAGE_PRIVATE_MEMSLOT, - APIC_DEFAULT_PHYS_BASE, - PAGE_SIZE); + ret = __x86_set_memory_region(kvm, + APIC_ACCESS_PAGE_PRIVATE_MEMSLOT, + APIC_DEFAULT_PHYS_BASE, + PAGE_SIZE); if (ret) - return ret; + goto out; kvm->arch.apic_access_page_done = true; - return 0; +out: + mutex_unlock(&kvm->slots_lock); + return ret; } static int avic_init_backing_page(struct kvm_vcpu *vcpu) @@ -1672,21 +1675,31 @@ out: return ERR_PTR(err); } +static void svm_clear_current_vmcb(struct vmcb *vmcb) +{ + int i; + + for_each_online_cpu(i) + cmpxchg(&per_cpu(svm_data, i)->current_vmcb, vmcb, NULL); +} + static void svm_free_vcpu(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); + /* + * The vmcb page can be recycled, causing a false negative in + * svm_vcpu_load(). So, ensure that no logical CPU has this + * vmcb page recorded as its current vmcb. + */ + svm_clear_current_vmcb(svm->vmcb); + __free_page(pfn_to_page(svm->vmcb_pa >> PAGE_SHIFT)); __free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER); __free_page(virt_to_page(svm->nested.hsave)); __free_pages(virt_to_page(svm->nested.msrpm), MSRPM_ALLOC_ORDER); kvm_vcpu_uninit(vcpu); kmem_cache_free(kvm_vcpu_cache, svm); - /* - * The vmcb page can be recycled, causing a false negative in - * svm_vcpu_load(). So do a full IBPB now. - */ - indirect_branch_prediction_barrier(); } static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 203d42340fc1..27d13b870e07 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6661,7 +6661,8 @@ static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu) else { if (vcpu->arch.apicv_active) kvm_x86_ops->sync_pir_to_irr(vcpu); - kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors); + if (ioapic_in_kernel(vcpu->kvm)) + kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors); } bitmap_or((ulong *)eoi_exit_bitmap, vcpu->arch.ioapic_handled_vectors, vcpu_to_synic(vcpu)->vec_bitmap, 256); @@ -7631,16 +7632,6 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) copy_fpregs_to_fpstate(&vcpu->arch.guest_fpu); __kernel_fpu_end(); ++vcpu->stat.fpu_reload; - /* - * If using eager FPU mode, or if the guest is a frequent user - * of the FPU, just leave the FPU active for next time. - * Every 255 times fpu_counter rolls over to 0; a guest that uses - * the FPU in bursts will revert to loading it on demand. - */ - if (!use_eager_fpu()) { - if (++vcpu->fpu_counter < 5) - kvm_make_request(KVM_REQ_DEACTIVATE_FPU, vcpu); - } trace_kvm_fpu(0); } diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c index a8f90ce3dedf..dc6d99017f3f 100644 --- a/arch/x86/mm/numa_emulation.c +++ b/arch/x86/mm/numa_emulation.c @@ -60,7 +60,7 @@ static int __init emu_setup_memblk(struct numa_meminfo *ei, eb->nid = nid; if (emu_nid_to_phys[nid] == NUMA_NO_NODE) - emu_nid_to_phys[nid] = nid; + emu_nid_to_phys[nid] = pb->nid; pb->start += size; if (pb->start >= pb->end) { diff --git a/arch/x86/mm/pkeys.c b/arch/x86/mm/pkeys.c index 0bbec041c003..e2d2b3cd4276 100644 --- a/arch/x86/mm/pkeys.c +++ b/arch/x86/mm/pkeys.c @@ -142,8 +142,7 @@ u32 init_pkru_value = PKRU_AD_KEY( 1) | PKRU_AD_KEY( 2) | PKRU_AD_KEY( 3) | * Called from the FPU code when creating a fresh set of FPU * registers. This is called from a very specific context where * we know the FPU regstiers are safe for use and we can use PKRU - * directly. The fact that PKRU is only available when we are - * using eagerfpu mode makes this possible. + * directly. */ void copy_init_pkru_to_fpregs(void) { diff --git a/arch/x86/platform/olpc/olpc-xo1-rtc.c b/arch/x86/platform/olpc/olpc-xo1-rtc.c index a2b4efddd61a..8e7ddd7e313a 100644 --- a/arch/x86/platform/olpc/olpc-xo1-rtc.c +++ b/arch/x86/platform/olpc/olpc-xo1-rtc.c @@ -16,6 +16,7 @@ #include <asm/msr.h> #include <asm/olpc.h> +#include <asm/x86_init.h> static void rtc_wake_on(struct device *dev) { @@ -75,6 +76,8 @@ static int __init xo1_rtc_init(void) if (r) return r; + x86_platform.legacy.rtc = 0; + device_init_wakeup(&xo1_rtc_device.dev, 1); return 0; } diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c index 0cb1dd461529..fef485b789ca 100644 --- a/arch/x86/power/hibernate_64.c +++ b/arch/x86/power/hibernate_64.c @@ -126,7 +126,7 @@ static int relocate_restore_code(void) if (!relocated_restore_code) return -ENOMEM; - memcpy((void *)relocated_restore_code, &core_restore_code, PAGE_SIZE); + memcpy((void *)relocated_restore_code, core_restore_code, PAGE_SIZE); /* Make the page containing the relocated code executable */ pgd = (pgd_t *)__va(read_cr3()) + pgd_index(relocated_restore_code); @@ -197,8 +197,8 @@ int arch_hibernation_header_save(void *addr, unsigned int max_size) if (max_size < sizeof(struct restore_data_record)) return -EOVERFLOW; - rdr->jump_address = (unsigned long)&restore_registers; - rdr->jump_address_phys = __pa_symbol(&restore_registers); + rdr->jump_address = (unsigned long)restore_registers; + rdr->jump_address_phys = __pa_symbol(restore_registers); rdr->cr3 = restore_cr3; rdr->magic = RESTORE_MAGIC; return 0; diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c index 3d6e0064cbfc..8d2c6f071dcc 100644 --- a/arch/x86/xen/spinlock.c +++ b/arch/x86/xen/spinlock.c @@ -8,6 +8,7 @@ #include <linux/log2.h> #include <linux/gfp.h> #include <linux/slab.h> +#include <linux/atomic.h> #include <asm/paravirt.h> @@ -19,6 +20,7 @@ static DEFINE_PER_CPU(int, lock_kicker_irq) = -1; static DEFINE_PER_CPU(char *, irq_name); +static DEFINE_PER_CPU(atomic_t, xen_qlock_wait_nest); static bool xen_pvspin = true; #include <asm/qspinlock.h> @@ -40,33 +42,24 @@ static void xen_qlock_kick(int cpu) static void xen_qlock_wait(u8 *byte, u8 val) { int irq = __this_cpu_read(lock_kicker_irq); + atomic_t *nest_cnt = this_cpu_ptr(&xen_qlock_wait_nest); /* If kicker interrupts not initialized yet, just spin */ - if (irq == -1) + if (irq == -1 || in_nmi()) return; - /* clear pending */ - xen_clear_irq_pending(irq); - barrier(); - - /* - * We check the byte value after clearing pending IRQ to make sure - * that we won't miss a wakeup event because of the clearing. - * - * The sync_clear_bit() call in xen_clear_irq_pending() is atomic. - * So it is effectively a memory barrier for x86. - */ - if (READ_ONCE(*byte) != val) - return; + /* Detect reentry. */ + atomic_inc(nest_cnt); - /* - * If an interrupt happens here, it will leave the wakeup irq - * pending, which will cause xen_poll_irq() to return - * immediately. - */ + /* If irq pending already and no nested call clear it. */ + if (atomic_read(nest_cnt) == 1 && xen_test_irq_pending(irq)) { + xen_clear_irq_pending(irq); + } else if (READ_ONCE(*byte) == val) { + /* Block until irq becomes pending (or a spurious wakeup) */ + xen_poll_irq(irq); + } - /* Block until irq becomes pending (or perhaps a spurious wakeup) */ - xen_poll_irq(irq); + atomic_dec(nest_cnt); } static irqreturn_t dummy_handler(int irq, void *dev_id) |